Update app.py
app.py
CHANGED
@@ -216,30 +216,42 @@ def create_treemap(treemap_data, count_by, title=None):
 
 def download_with_progress(url, progress=None):
     """Download a file with progress tracking"""
-    ...
-    return data.getvalue()
+    try:
+        response = requests.get(url, stream=True)
+        total_size = int(response.headers.get('content-length', 0))
+        block_size = 1024  # 1 Kibibyte
+        data = BytesIO()
+
+        if total_size == 0:
+            # If content length is unknown, we can't show accurate progress
+            if progress is not None:
+                progress(0, "Starting download...")
+
+            for chunk in response.iter_content(block_size):
+                data.write(chunk)
+                if progress is not None:
+                    progress(0, f"Downloading... (unknown size)")
+        else:
+            downloaded = 0
+            for chunk in response.iter_content(block_size):
+                downloaded += len(chunk)
+                data.write(chunk)
+                if progress is not None:
+                    percent = int(100 * downloaded / total_size)
+                    progress(percent / 100, f"Downloading... {percent}% ({downloaded//(1024*1024)}MB/{total_size//(1024*1024)}MB)")
+
+        return data.getvalue()
+    except Exception as e:
+        print(f"Error in download_with_progress: {e}")
+        raise
+
+def update_progress(progress_obj, value, description):
+    """Safely update progress with error handling"""
+    try:
+        if progress_obj is not None:
+            progress_obj(value, description)
+    except Exception as e:
+        print(f"Error updating progress: {e}")
 
 def download_and_process_models(progress=None):
     """Download and process the models data from HuggingFace dataset with progress tracking"""
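For orientation, a minimal sketch of how the new helpers can be driven. The progress argument is any callable that accepts (value, description); print_progress below is a hypothetical stand-in, and requests/BytesIO are assumed to already be imported at the top of app.py:

    def print_progress(value, description):
        # Hypothetical callback, used in place of a real progress bar
        print(f"[{value:.0%}] {description}")

    url = "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/models.parquet"
    content = download_with_progress(url, progress=print_progress)
    print(f"Fetched {len(content) / (1024 * 1024):.1f} MB")

update_progress routes through the same callable but swallows any exception the callback raises, so a broken progress bar cannot abort a download that is otherwise succeeding.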
@@ -250,8 +262,7 @@ def download_and_process_models(progress=None):
 
     # Check if we have cached data
    if os.path.exists('data/processed_models.parquet'):
-        ...
-        progress(1.0, "Loading from cache...")
+        update_progress(progress, 1.0, "Loading from cache...")
         print("Loading models from cache...")
         df = pd.read_parquet('data/processed_models.parquet')
         return df
@@ -259,65 +270,65 @@ def download_and_process_models(progress=None):
         # URL to the models.parquet file
         url = "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/models.parquet"
 
-        ...
-        progress(0.0, "Starting download...")
+        update_progress(progress, 0.0, "Starting download...")
         print(f"Downloading models data from {url}...")
 
-        ...
+        try:
+            # Download with progress tracking
+            file_content = download_with_progress(url, progress)
+
+            update_progress(progress, 0.9, "Parsing parquet file...")
+
+            # Read the parquet file
+            table = pq.read_table(BytesIO(file_content))
+            df = table.to_pandas()
+
+            print(f"Downloaded {len(df)} models")
+
+            update_progress(progress, 0.95, "Processing data...")
+
+            # Process the safetensors column if it's a string (JSON)
+            if 'safetensors' in df.columns:
+                def parse_safetensors(val):
+                    if isinstance(val, str):
+                        try:
+                            return json.loads(val)
+                        except:
+                            return None
+                    return val
+
+                df['safetensors'] = df['safetensors'].apply(parse_safetensors)
+
+            # Process the tags column if needed
+            if 'tags' in df.columns and len(df) > 0 and not isinstance(df['tags'].iloc[0], list):
+                def parse_tags(val):
+                    if isinstance(val, str):
+                        try:
+                            return json.loads(val)
+                        except:
+                            return []
+                    return val if isinstance(val, list) else []
+
+                df['tags'] = df['tags'].apply(parse_tags)
+
+            # Cache the processed data
+            update_progress(progress, 0.98, "Saving to cache...")
+            df.to_parquet('data/processed_models.parquet')
+
+            update_progress(progress, 1.0, "Data ready!")
+
+            return df
+
+        except Exception as download_error:
+            print(f"Download failed: {download_error}")
+            update_progress(progress, 0.5, "Download failed, generating sample data...")
+            return create_sample_data(progress)
 
     except Exception as e:
         print(f"Error downloading or processing data: {e}")
-        ...
-        progress(1.0, "Using sample data (download failed)")
+        update_progress(progress, 1.0, "Using sample data (error occurred)")
         # Return sample data for testing if real data unavailable
-        return create_sample_data()
+        return create_sample_data(progress)
 
 def create_sample_data(progress=None):
     """Create sample data for testing when real data is unavailable"""
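The UI wiring is outside this diff, but the (value, description) call pattern matches how Gradio's gr.Progress objects are invoked, so download_and_process_models can plausibly be hooked up along these lines. This is a sketch under that assumption; the component names are hypothetical:

    import gradio as gr

    def load_models(progress=gr.Progress()):
        # Gradio injects a Progress instance; it is callable as progress(value, desc)
        df = download_and_process_models(progress)
        return f"Loaded {len(df):,} models"

    with gr.Blocks() as demo:
        status = gr.Textbox(label="Status")
        gr.Button("Load models").click(load_models, outputs=status)

    demo.launch()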