File size: 13,666 Bytes
808378f
cfea739
 
808378f
 
 
cfea739
 
808378f
 
 
 
cfea739
 
808378f
cfea739
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
808378f
cfea739
f89b28b
 
 
 
 
808378f
f89b28b
 
808378f
cfea739
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
808378f
f89b28b
cfea739
 
f89b28b
 
 
808378f
cfea739
 
808378f
 
 
 
 
cfea739
 
 
 
 
 
 
808378f
cfea739
 
 
 
 
 
 
 
 
 
 
 
 
808378f
cfea739
 
 
 
808378f
cfea739
 
 
 
808378f
cfea739
 
 
 
 
 
 
 
808378f
cfea739
 
 
 
 
808378f
cfea739
 
 
 
 
 
 
 
 
 
 
 
 
808378f
 
 
cfea739
 
808378f
cfea739
 
 
 
 
 
 
 
808378f
cfea739
 
 
 
 
610152e
 
 
808378f
610152e
808378f
 
610152e
 
 
 
 
 
808378f
610152e
 
 
cfea739
 
 
808378f
cfea739
 
 
 
808378f
cfea739
 
 
 
610152e
 
808378f
 
610152e
 
 
 
 
 
808378f
610152e
 
808378f
610152e
 
 
808378f
610152e
cfea739
 
 
 
 
 
 
 
 
 
808378f
cfea739
 
 
808378f
cfea739
 
 
 
808378f
cfea739
 
 
808378f
cfea739
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
808378f
 
cfea739
 
 
808378f
cfea739
 
 
 
 
 
 
 
 
 
808378f
cfea739
 
 
808378f
cfea739
 
 
 
 
808378f
cfea739
 
 
808378f
cfea739
 
 
808378f
 
cfea739
 
 
808378f
cfea739
 
808378f
cfea739
 
808378f
cfea739
 
 
 
808378f
cfea739
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
import logging
import os
import shutil
import zipfile
from datetime import datetime, timedelta
from pathlib import Path

import cdsapi
import pandas as pd

from constants import DOWNLOAD_FOLDER

# Configure the root logger at import time so INFO-level progress messages are
# shown; basicConfig() is a no-op when the root logger already has handlers.
logging.basicConfig(level=logging.INFO)
# Module-level logger used throughout this module.
logger = logging.getLogger(__name__)

class CAMSDownloader:
    """Download, unpack and manage CAMS forecast archives fetched via the CDS API.

    Archives are stored in ``download_dir`` as ``<YYYY-MM-DD>-cams.nc.zip`` and
    their NetCDF members are unpacked into ``download_dir / "extracted"``.
    """

    # Dataset identifier passed to the CDS API.
    _DATASET = "cams-global-atmospheric-composition-forecasts"

    # Variables requested when the caller does not supply any.
    _DEFAULT_VARIABLES = (
        # Meteorological surface-level variables:
        "10m_u_component_of_wind",
        "10m_v_component_of_wind",
        "2m_temperature",
        "mean_sea_level_pressure",
        # Pollution surface-level variables:
        "particulate_matter_1um",
        "particulate_matter_2.5um",
        "particulate_matter_10um",
        "total_column_carbon_monoxide",
        "total_column_nitrogen_monoxide",
        "total_column_nitrogen_dioxide",
        "total_column_ozone",
        "total_column_sulphur_dioxide",
        # Meteorological atmospheric variables:
        "u_component_of_wind",
        "v_component_of_wind",
        "temperature",
        "geopotential",
        "specific_humidity",
        # Pollution atmospheric variables:
        "carbon_monoxide",
        "nitrogen_dioxide",
        "nitrogen_monoxide",
        "ozone",
        "sulphur_dioxide",
    )

    # Pressure levels requested when the caller does not supply any.
    _DEFAULT_PRESSURE_LEVELS = (
        "50", "100", "150", "200", "250", "300", "400",
        "500", "600", "700", "850", "925", "1000",
    )

    def __init__(self, download_dir=DOWNLOAD_FOLDER):
        """
        Initialize CAMS downloader

        Creates the download directory (including missing parents) and its
        ``extracted`` subdirectory, then tries to build a CDS API client.

        Parameters:
        download_dir (str): Directory to store downloaded files
        """
        self.download_dir = Path(download_dir)
        # parents=True so a nested, not-yet-existing download folder works too.
        self.download_dir.mkdir(parents=True, exist_ok=True)

        # Create subdirectories
        self.extracted_dir = self.download_dir / "extracted"
        self.extracted_dir.mkdir(exist_ok=True)

        self.client = None
        self._init_client()

    @staticmethod
    def _read_cdsapirc(path):
        """Return the ``(url, key)`` pair found in a ``.cdsapirc`` file.

        Either element is ``None`` when the matching ``url:`` / ``key:`` line
        is absent.
        """
        url = None
        key = None
        with open(path, 'r') as f:
            for line in f:
                line = line.strip()
                # Split only on the first colon: the URL itself contains colons.
                if line.startswith('url:'):
                    url = line.split(':', 1)[1].strip()
                elif line.startswith('key:'):
                    key = line.split(':', 1)[1].strip()
        return url, key

    def _init_client(self):
        """Initialize CDS API client

        Credentials are looked up in this order: the ``CDSAPI_URL`` and
        ``CDSAPI_KEY`` environment variables, a ``.cdsapirc`` file in the
        current directory, a ``.cdsapirc`` file in the home directory, and
        finally ``cdsapi.Client()`` with no arguments. Any failure is logged
        and leaves ``self.client`` as ``None`` instead of raising.
        """
        try:
            cdsapi_url = os.getenv('CDSAPI_URL')
            cdsapi_key = os.getenv('CDSAPI_KEY')

            if cdsapi_url and cdsapi_key:
                self.client = cdsapi.Client(key=cdsapi_key, url=cdsapi_url)
                logger.info("✅ CDS API client initialized from environment variables")
                return

            # Try to read from .cdsapirc file
            cdsapirc_path = Path.cwd() / ".cdsapirc"
            if not cdsapirc_path.exists():
                cdsapirc_path = Path.home() / ".cdsapirc"

            if cdsapirc_path.exists():
                url, key = self._read_cdsapirc(cdsapirc_path)
                if not (url and key):
                    raise ValueError("Could not parse URL or key from .cdsapirc file")
                self.client = cdsapi.Client(key=key, url=url)
                logger.info("✅ CDS API client initialized from .cdsapirc file")
                return

            # Last resort: Try default initialization
            self.client = cdsapi.Client()
            logger.info("✅ CDS API client initialized with default settings")

        except Exception as e:
            # Deliberately best-effort: callers check is_client_ready().
            logger.warning("⚠️ Could not initialize CDS API client: %s", e)
            logger.warning("Please ensure you have:\n"
                           "1. Created an account at https://cds.climate.copernicus.eu/\n"
                           "2. Set CDSAPI_URL and CDSAPI_KEY environment variables\n"
                           "3. Or created a .cdsapirc file in your home directory")
            self.client = None

    def is_client_ready(self):
        """Check if CDS API client is ready"""
        return self.client is not None

    def download_cams_data(self, date_str, variables=None, pressure_levels=None):
        """Download CAMS atmospheric composition data for a specific date

        Parameters:
        date_str: Date to download; anything ``pandas.to_datetime`` accepts
        variables (list): Variable names to request (defaults to a built-in set)
        pressure_levels (list): Pressure levels to request (defaults to a built-in set)

        Returns:
        str: Path of the ZIP archive; an existing archive for the date is reused

        Raises:
        ValueError: If ``date_str`` cannot be parsed as a date
        Exception: If the client is not initialized or the download fails
        """
        if not self.is_client_ready():
            raise Exception("CDS API client not initialized. Please check your credentials.")

        try:
            normalized_date = pd.to_datetime(date_str).strftime('%Y-%m-%d')
        except Exception as exc:
            # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
            raise ValueError(f"Invalid date format: {date_str}. Use YYYY-MM-DD format.") from exc
        date_str = normalized_date

        filename = f"{date_str}-cams.nc.zip"
        filepath = self.download_dir / filename

        if filepath.exists():
            logger.info("✅ Data for %s already exists: %s", date_str, filename)
            return str(filepath)

        if variables is None:
            variables = list(self._DEFAULT_VARIABLES)
        if pressure_levels is None:
            pressure_levels = list(self._DEFAULT_PRESSURE_LEVELS)

        logger.info("🔄 Downloading CAMS data for %s...", date_str)
        logger.info("Variables: %d selected", len(variables))
        logger.info("Pressure levels: %d levels", len(pressure_levels))

        # Download to a temporary name and rename only after validation, so an
        # interrupted transfer can never be mistaken for a cached archive.
        partial_path = filepath.with_name(filename + ".part")

        try:
            logger.info("📡 Requesting data from CAMS API...")
            self.client.retrieve(
                self._DATASET,
                {
                    "type": "forecast",
                    "leadtime_hour": "0",
                    "variable": variables,
                    "pressure_level": pressure_levels,
                    "date": date_str,
                    "time": ["00:00", "12:00"],
                    "format": "netcdf_zip",
                },
                str(partial_path),
            )

            # Validate the downloaded file
            if not partial_path.exists():
                raise Exception("Download completed but file was not created")

            file_size = partial_path.stat().st_size
            logger.info("📁 Downloaded file size: %.2f MB", file_size / 1024 / 1024)

            # A tiny payload may be an error page rather than data.
            if file_size < 10000:
                logger.warning("⚠️ Downloaded file is very small (%d bytes)", file_size)
                with open(partial_path, 'rb') as f:
                    header = f.read(200).lower()
                if b'error' in header or b'html' in header:
                    raise Exception("CAMS API returned an error response instead of data")

            partial_path.replace(filepath)

        except Exception as e:
            raise Exception(f"Error downloading CAMS data: {str(e)}") from e
        finally:
            # Still present only when something went wrong (including Ctrl-C).
            if partial_path.exists():
                logger.warning("🗑️ Cleaning up failed download: %s", partial_path)
                partial_path.unlink()

        logger.info("✅ Successfully downloaded: %s", filename)
        return str(filepath)

    @staticmethod
    def _validate_zip(zip_path):
        """Raise ``Exception`` unless ``zip_path`` is a readable ZIP archive."""
        try:
            file_size = zip_path.stat().st_size
            if file_size < 1000:
                logger.warning(
                    "⚠️ Downloaded file is too small (%d bytes), likely an error response",
                    file_size,
                )
                with open(zip_path, 'rb') as f:
                    header = f.read(100).lower()
                if b'html' in header or b'error' in header:
                    raise Exception("Downloaded file appears to be an HTML error page, not ZIP data")

            if not zipfile.is_zipfile(zip_path):
                logger.error("❌ File is not a valid ZIP file: %s", zip_path)
                with open(zip_path, 'r', errors='ignore') as f:
                    preview = f.read(200)
                logger.debug("File contents preview: %s...", preview[:100])
                raise Exception(f"Downloaded file is not a valid ZIP archive. File size: {file_size} bytes")

        except Exception as e:
            raise Exception(f"Error validating ZIP file: {str(e)}") from e

    @staticmethod
    def _extract_member(archive, member, target):
        """Stream one archive member to ``target`` via a temporary file.

        The rename happens only after the copy succeeded, so ``target`` never
        holds a partially written file.
        """
        tmp_path = target.with_name(target.name + ".part")
        try:
            with archive.open(member) as src, open(tmp_path, "wb") as dst:
                shutil.copyfileobj(src, dst)
            tmp_path.replace(target)
        finally:
            if tmp_path.exists():
                tmp_path.unlink()

    def extract_cams_files(self, zip_path):
        """Extract surface and atmospheric data from CAMS ZIP file

        Members whose name contains ``sfc`` / ``plev`` are written to
        ``<date>-cams-surface.nc`` / ``<date>-cams-atmospheric.nc``. If neither
        is found, every ``.nc`` member is extracted under its own base name.
        Files that already exist are reused rather than extracted again.

        Parameters:
        zip_path (str or Path): Path of the downloaded ZIP archive

        Returns:
        dict: Maps ``'surface'`` / ``'atmospheric'`` (or, in the fallback case,
              the archive member name) to the extracted file path

        Raises:
        FileNotFoundError: If ``zip_path`` does not exist
        Exception: If the archive is invalid, extraction fails, or it contains
                   no NetCDF files
        """
        zip_path = Path(zip_path)
        if not zip_path.exists():
            raise FileNotFoundError(f"ZIP file not found: {zip_path}")

        self._validate_zip(zip_path)

        date_str = zip_path.stem.replace("-cams.nc", "")
        # label -> (substring identifying the member, output path)
        targets = {
            'surface': ('sfc', self.extracted_dir / f"{date_str}-cams-surface.nc"),
            'atmospheric': ('plev', self.extracted_dir / f"{date_str}-cams-atmospheric.nc"),
        }

        extracted_files = {}

        try:
            with zipfile.ZipFile(zip_path, "r") as zf:
                zip_contents = zf.namelist()

                for label, (marker, target) in targets.items():
                    if not target.exists():
                        member = next((m for m in zip_contents if marker in m.lower()), None)
                        if member is None:
                            continue
                        self._extract_member(zf, member, target)
                        logger.info("✅ Extracted %s data: %s", label, target.name)
                    extracted_files[label] = str(target)

                if not extracted_files:
                    for nc_file in zip_contents:
                        if not nc_file.endswith('.nc'):
                            continue
                        # Use only the base name: member names come from the
                        # archive and may hold directories or "../" segments.
                        safe_name = os.path.basename(nc_file.replace("\\", "/"))
                        output_path = self.extracted_dir / safe_name
                        if not output_path.exists():
                            self._extract_member(zf, nc_file, output_path)
                        # Register existing files too, so repeated calls succeed.
                        extracted_files[nc_file] = str(output_path)

        except Exception as e:
            raise Exception(f"Error extracting ZIP file: {str(e)}") from e

        if not extracted_files:
            raise Exception("No NetCDF files found in ZIP archive")

        return extracted_files

    def list_downloaded_files(self):
        """List all downloaded CAMS files

        Returns:
        list: One dict per ``*-cams.nc.zip`` archive with keys ``date``,
              ``zip_path``, ``size_mb`` and ``downloaded`` (modification
              timestamp), sorted by date, newest first
        """
        downloaded_files = []

        for zip_file in self.download_dir.glob("*-cams.nc.zip"):
            stats = zip_file.stat()
            downloaded_files.append({
                'date': zip_file.stem.replace("-cams.nc", ""),
                'zip_path': str(zip_file),
                'size_mb': stats.st_size / (1024 * 1024),
                'downloaded': stats.st_mtime,
            })

        downloaded_files.sort(key=lambda info: info['date'], reverse=True)
        return downloaded_files

    def cleanup_old_files(self, days_old=7):
        """Clean up downloaded files older than specified days

        Removes ZIP archives and extracted ``.nc`` files whose modification
        time is more than ``days_old`` days in the past. A file that cannot be
        removed is logged and skipped.

        Parameters:
        days_old (int): Age threshold in days

        Returns:
        int: Number of files actually deleted
        """
        deleted_count = 0
        try:
            cutoff_date = datetime.now() - timedelta(days=days_old)
            candidates = [
                *self.download_dir.glob("*-cams.nc.zip"),
                *self.extracted_dir.glob("*.nc"),
            ]

            for path in candidates:
                try:
                    if datetime.fromtimestamp(path.stat().st_mtime) < cutoff_date:
                        path.unlink()
                        deleted_count += 1
                except OSError as e:
                    # One undeletable file must not abort the whole cleanup.
                    logger.error("Error during cleanup: %s", e)

            logger.info("🧹 Cleaned up %d old files", deleted_count)

        except Exception as e:
            logger.error("Error during cleanup: %s", e)

        return deleted_count


def test_cams_downloader(days_ago=600):
    """Smoke-test the CAMS downloader end to end.

    Builds a downloader, downloads the archive for the date ``days_ago`` days
    before today, extracts it and lists the downloaded files.

    Parameters:
    days_ago (int): How many days before today the test date lies

    Returns:
    bool: True if every step succeeded, False if the client is not ready or
          any step raised
    """
    logger.info("Testing CAMS downloader...")
    downloader = CAMSDownloader()

    if not downloader.is_client_ready():
        logger.error("❌ CDS API client not ready. Please check your credentials.")
        return False

    test_date = (datetime.now() - timedelta(days=days_ago)).strftime('%Y-%m-%d')
    logger.info("Testing download for date: %s", test_date)
    logger.warning("⚠️ This may take several minutes for the first download...")

    try:
        zip_path = downloader.download_cams_data(test_date)
        logger.info("✅ Download successful: %s", zip_path)

        extracted_files = downloader.extract_cams_files(zip_path)
        logger.info("✅ Extraction successful: %d files", len(extracted_files))

        downloaded = downloader.list_downloaded_files()
        logger.info("✅ Found %d downloaded files", len(downloaded))

    except Exception as e:
        # logger.exception keeps the traceback, which is what a diagnostic
        # smoke test is for.
        logger.exception("❌ Test failed: %s", e)
        return False

    return True


if __name__ == "__main__":
    # Report the smoke-test outcome through the exit status so shells and CI
    # can detect a failure (0 = success, 1 = failure).
    raise SystemExit(0 if test_cams_downloader() else 1)