Spaces:

aditya-me13
/

cams-pollution-dashboard

Sleeping

File size: 13,666 Bytes

import os
import cdsapi
import zipfile
import logging
import pandas as pd

from pathlib import Path
from datetime import datetime, timedelta
from constants import DOWNLOAD_FOLDER

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class CAMSDownloader:
    def __init__(self, download_dir=DOWNLOAD_FOLDER):
        """
        Initialize CAMS downloader
        
        Parameters:
        download_dir (str): Directory to store downloaded files
        """
        self.download_dir = Path(download_dir)
        self.download_dir.mkdir(exist_ok=True)
        
        # Create subdirectories
        self.extracted_dir = self.download_dir / "extracted"
        self.extracted_dir.mkdir(exist_ok=True)
        
        self.client = None
        self._init_client()
    
    def _init_client(self):
        """Initialize CDS API client"""
        # Try to get the credentials from environment variables
        try:
            cdsapi_url = os.getenv('CDSAPI_URL')
            cdsapi_key = os.getenv('CDSAPI_KEY')
            
            if cdsapi_url and cdsapi_key:
                self.client = cdsapi.Client(key=cdsapi_key, url=cdsapi_url)
                logger.info("✅ CDS API client initialized from environment variables")
                return
            
            # Try to read from .cdsapirc file
            cdsapirc_path = Path.cwd() / ".cdsapirc"
            if not cdsapirc_path.exists():
                cdsapirc_path = Path.home() / ".cdsapirc"
            
            if cdsapirc_path.exists():
                with open(cdsapirc_path, 'r') as f:
                    lines = f.readlines()
                
                url = None
                key = None
                for line in lines:
                    line = line.strip()
                    if line.startswith('url:'):
                        url = line.split(':', 1)[1].strip()
                    elif line.startswith('key:'):
                        key = line.split(':', 1)[1].strip()
                
                if url and key:
                    self.client = cdsapi.Client(key=key, url=url)
                    logger.info("✅ CDS API client initialized from .cdsapirc file")
                    return
                else:
                    raise ValueError("Could not parse URL or key from .cdsapirc file")
            
            # Last resort: Try default initialization
            self.client = cdsapi.Client()
            logger.info("✅ CDS API client initialized with default settings")
                
        except Exception as e:
            logger.warning(f"⚠️ Could not initialize CDS API client: {str(e)}")
            logger.warning("Please ensure you have:\n"
                            "1. Created an account at https://cds.climate.copernicus.eu/\n"
                            "2. Set CDSAPI_URL and CDSAPI_KEY environment variables\n"
                            "3. Or created a .cdsapirc file in your home directory")
            self.client = None
    
    def is_client_ready(self):
        """Check if CDS API client is ready"""
        return self.client is not None
    
    def download_cams_data(self, date_str, variables=None, pressure_levels=None):
        """Download CAMS atmospheric composition data for a specific date"""
        if not self.is_client_ready():
            raise Exception("CDS API client not initialized. Please check your credentials.")
        
        try:
            target_date = pd.to_datetime(date_str)
            date_str = target_date.strftime('%Y-%m-%d')
        except:
            raise ValueError(f"Invalid date format: {date_str}. Use YYYY-MM-DD format.")
        
        filename = f"{date_str}-cams.nc.zip"
        filepath = self.download_dir / filename
        
        if filepath.exists():
            logger.info(f"✅ Data for {date_str} already exists: {filename}")
            return str(filepath)
        
        if variables is None:
            variables = [
                # Meteorological surface-level variables:
                "10m_u_component_of_wind",
                "10m_v_component_of_wind", 
                "2m_temperature",
                "mean_sea_level_pressure",
                # Pollution surface-level variables:
                "particulate_matter_1um",
                "particulate_matter_2.5um", 
                "particulate_matter_10um",
                "total_column_carbon_monoxide",
                "total_column_nitrogen_monoxide",
                "total_column_nitrogen_dioxide",
                "total_column_ozone",
                "total_column_sulphur_dioxide",
                # Meteorological atmospheric variables:
                "u_component_of_wind",
                "v_component_of_wind",
                "temperature", 
                "geopotential",
                "specific_humidity",
                # Pollution atmospheric variables:
                "carbon_monoxide",
                "nitrogen_dioxide",
                "nitrogen_monoxide", 
                "ozone",
                "sulphur_dioxide",
            ]
        
        if pressure_levels is None:
            pressure_levels = [
                "50", "100", "150", "200", "250", "300", "400",
                "500", "600", "700", "850", "925", "1000",
            ]
        
        logger.info(f"🔄 Downloading CAMS data for {date_str}...")
        logger.info(f"Variables: {len(variables)} selected")
        logger.info(f"Pressure levels: {len(pressure_levels)} levels")
        
        try:
            logger.info("📡 Requesting data from CAMS API...")
            self.client.retrieve(
                "cams-global-atmospheric-composition-forecasts",
                {
                    "type": "forecast",
                    "leadtime_hour": "0",
                    "variable": variables,
                    "pressure_level": pressure_levels,
                    "date": date_str,
                    "time": ["00:00", "12:00"],
                    "format": "netcdf_zip",
                },
                str(filepath),
            )
            
            # Validate the downloaded file
            if filepath.exists():
                file_size = filepath.stat().st_size
                logger.info(f"📁 Downloaded file size: {file_size / 1024 / 1024:.2f} MB")
                
                if file_size < 10000:
                    logger.warning(f"⚠️ Downloaded file is very small ({file_size} bytes)")
                    with open(filepath, 'rb') as f:
                        header = f.read(200)
                        if b'error' in header.lower() or b'html' in header.lower():
                            filepath.unlink()
                            raise Exception("CAMS API returned an error response instead of data")
                
                logger.info(f"✅ Successfully downloaded: {filename}")
                return str(filepath)
            else:
                raise Exception("Download completed but file was not created")
            
        except Exception as e:
            if filepath.exists():
                logger.warning(f"🗑️ Cleaning up failed download: {filepath}")
                filepath.unlink()
            raise Exception(f"Error downloading CAMS data: {str(e)}")
    
    def extract_cams_files(self, zip_path):
        """Extract surface and atmospheric data from CAMS ZIP file"""
        zip_path = Path(zip_path)
        if not zip_path.exists():
            raise FileNotFoundError(f"ZIP file not found: {zip_path}")
        
        try:
            file_size = zip_path.stat().st_size
            if file_size < 1000:
                logger.warning(f"⚠️ Downloaded file is too small ({file_size} bytes), likely an error response")
                with open(zip_path, 'rb') as f:
                    header = f.read(100)
                    if b'html' in header.lower() or b'error' in header.lower():
                        raise Exception("Downloaded file appears to be an HTML error page, not ZIP data")
                
            if not zipfile.is_zipfile(zip_path):
                logger.error(f"❌ File is not a valid ZIP file: {zip_path}")
                with open(zip_path, 'r', errors='ignore') as f:
                    first_lines = f.read(200)
                    logger.debug(f"File contents preview: {first_lines[:100]}...")
                raise Exception(f"Downloaded file is not a valid ZIP archive. File size: {file_size} bytes")
                
        except Exception as e:
            raise Exception(f"Error validating ZIP file: {str(e)}")
        
        date_str = zip_path.stem.replace("-cams.nc", "")
        surface_path = self.extracted_dir / f"{date_str}-cams-surface.nc"
        atmospheric_path = self.extracted_dir / f"{date_str}-cams-atmospheric.nc"
        
        extracted_files = {}
        
        try:
            with zipfile.ZipFile(zip_path, "r") as zf:
                zip_contents = zf.namelist()
                
                surface_file = next((f for f in zip_contents if 'sfc' in f.lower() or f.endswith('_sfc.nc')), None)
                if surface_file and not surface_path.exists():
                    with open(surface_path, "wb") as f:
                        f.write(zf.read(surface_file))
                    logger.info(f"✅ Extracted surface data: {surface_path.name}")
                    extracted_files['surface'] = str(surface_path)
                elif surface_path.exists():
                    extracted_files['surface'] = str(surface_path)
                
                atmospheric_file = next((f for f in zip_contents if 'plev' in f.lower() or f.endswith('_plev.nc')), None)
                if atmospheric_file and not atmospheric_path.exists():
                    with open(atmospheric_path, "wb") as f:
                        f.write(zf.read(atmospheric_file))
                    logger.info(f"✅ Extracted atmospheric data: {atmospheric_path.name}")
                    extracted_files['atmospheric'] = str(atmospheric_path)
                elif atmospheric_path.exists():
                    extracted_files['atmospheric'] = str(atmospheric_path)
                
                if not extracted_files:
                    nc_files = [f for f in zip_contents if f.endswith('.nc')]
                    for nc_file in nc_files:
                        output_path = self.extracted_dir / nc_file
                        if not output_path.exists():
                            with open(output_path, "wb") as f:
                                f.write(zf.read(nc_file))
                            extracted_files[nc_file] = str(output_path)
        
        except Exception as e:
            raise Exception(f"Error extracting ZIP file: {str(e)}")
        
        if not extracted_files:
            raise Exception("No NetCDF files found in ZIP archive")
        
        return extracted_files
    
    def list_downloaded_files(self):
        """List all downloaded CAMS files"""
        downloaded_files = []
        
        for zip_file in self.download_dir.glob("*-cams.nc.zip"):
            date_str = zip_file.stem.replace("-cams.nc", "")
            file_info = {
                'date': date_str,
                'zip_path': str(zip_file),
                'size_mb': zip_file.stat().st_size / (1024 * 1024),
                'downloaded': zip_file.stat().st_mtime
            }
            downloaded_files.append(file_info)
        
        downloaded_files.sort(key=lambda x: x['date'], reverse=True)
        return downloaded_files
    
    def cleanup_old_files(self, days_old=7):
        """Clean up downloaded files older than specified days"""
        try:
            cutoff_date = datetime.now() - timedelta(days=days_old)
            deleted_count = 0
            
            for zip_file in self.download_dir.glob("*-cams.nc.zip"):
                if datetime.fromtimestamp(zip_file.stat().st_mtime) < cutoff_date:
                    zip_file.unlink()
                    deleted_count += 1
            
            for nc_file in self.extracted_dir.glob("*.nc"):
                if datetime.fromtimestamp(nc_file.stat().st_mtime) < cutoff_date:
                    nc_file.unlink()
                    deleted_count += 1
            
            logger.info(f"🧹 Cleaned up {deleted_count} old files")
            return deleted_count
            
        except Exception as e:
            logger.error(f"Error during cleanup: {str(e)}")
            return 0


def test_cams_downloader():
    """Test function for CAMS downloader"""
    logger.info("Testing CAMS downloader...")
    downloader = CAMSDownloader()
    
    if not downloader.is_client_ready():
        logger.error("❌ CDS API client not ready. Please check your credentials.")
        return False
    
    test_date = (datetime.now() - timedelta(days=600)).strftime('%Y-%m-%d')
    logger.info(f"Testing download for date: {test_date}")
    logger.warning("⚠️ This may take several minutes for the first download...")
    
    try:
        zip_path = downloader.download_cams_data(test_date)
        logger.info(f"✅ Download successful: {zip_path}")
        
        extracted_files = downloader.extract_cams_files(zip_path)
        logger.info(f"✅ Extraction successful: {len(extracted_files)} files")
        
        downloaded = downloader.list_downloaded_files()
        logger.info(f"✅ Found {len(downloaded)} downloaded files")
        
        return True
        
    except Exception as e:
        logger.error(f"❌ Test failed: {str(e)}")
        return False


if __name__ == "__main__":
    test_cams_downloader()