Update app.py
app.py
CHANGED
@@ -216,30 +216,42 @@ def create_treemap(treemap_data, count_by, title=None):
 
 def download_with_progress(url, progress=None):
     """Download a file with progress tracking"""
-    ...
-    return data.getvalue()
+    try:
+        response = requests.get(url, stream=True)
+        total_size = int(response.headers.get('content-length', 0))
+        block_size = 1024  # 1 Kibibyte
+        data = BytesIO()
+
+        if total_size == 0:
+            # If content length is unknown, we can't show accurate progress
+            if progress is not None:
+                progress(0, "Starting download...")
+
+            for chunk in response.iter_content(block_size):
+                data.write(chunk)
+                if progress is not None:
+                    progress(0, f"Downloading... (unknown size)")
+        else:
+            downloaded = 0
+            for chunk in response.iter_content(block_size):
+                downloaded += len(chunk)
+                data.write(chunk)
+                if progress is not None:
+                    percent = int(100 * downloaded / total_size)
+                    progress(percent / 100, f"Downloading... {percent}% ({downloaded//(1024*1024)}MB/{total_size//(1024*1024)}MB)")
+
+        return data.getvalue()
+    except Exception as e:
+        print(f"Error in download_with_progress: {e}")
+        raise
+
+def update_progress(progress_obj, value, description):
+    """Safely update progress with error handling"""
+    try:
+        if progress_obj is not None:
+            progress_obj(value, description)
+    except Exception as e:
+        print(f"Error updating progress: {e}")
 
 def download_and_process_models(progress=None):
     """Download and process the models data from HuggingFace dataset with progress tracking"""
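For orientation, a minimal sketch of how the new helpers can be driven. The progress argument is any callable that accepts (value, description); print_progress below is a hypothetical stand-in, and requests/BytesIO are assumed to already be imported at the top of app.py:

    def print_progress(value, description):
        # Hypothetical callback, used in place of a real progress bar
        print(f"[{value:.0%}] {description}")

    url = "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/models.parquet"
    content = download_with_progress(url, progress=print_progress)
    print(f"Fetched {len(content) / (1024 * 1024):.1f} MB")

update_progress routes through the same callable but swallows any exception the callback raises, so a broken progress bar cannot abort a download that is otherwise succeeding.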
@@ -250,8 +262,7 @@ def download_and_process_models(progress=None):
 
     # Check if we have cached data
    if os.path.exists('data/processed_models.parquet'):
-        ...
-        progress(1.0, "Loading from cache...")
+        update_progress(progress, 1.0, "Loading from cache...")
         print("Loading models from cache...")
         df = pd.read_parquet('data/processed_models.parquet')
         return df
@@ -259,65 +270,65 @@ def download_and_process_models(progress=None):
         # URL to the models.parquet file
         url = "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/models.parquet"
 
-        ...
-        progress(0.0, "Starting download...")
+        update_progress(progress, 0.0, "Starting download...")
         print(f"Downloading models data from {url}...")
 
-        ...
+        try:
+            # Download with progress tracking
+            file_content = download_with_progress(url, progress)
+
+            update_progress(progress, 0.9, "Parsing parquet file...")
+
+            # Read the parquet file
+            table = pq.read_table(BytesIO(file_content))
+            df = table.to_pandas()
+
+            print(f"Downloaded {len(df)} models")
+
+            update_progress(progress, 0.95, "Processing data...")
+
+            # Process the safetensors column if it's a string (JSON)
+            if 'safetensors' in df.columns:
+                def parse_safetensors(val):
+                    if isinstance(val, str):
+                        try:
+                            return json.loads(val)
+                        except:
+                            return None
+                    return val
+
+                df['safetensors'] = df['safetensors'].apply(parse_safetensors)
+
+            # Process the tags column if needed
+            if 'tags' in df.columns and len(df) > 0 and not isinstance(df['tags'].iloc[0], list):
+                def parse_tags(val):
+                    if isinstance(val, str):
+                        try:
+                            return json.loads(val)
+                        except:
+                            return []
+                    return val if isinstance(val, list) else []
+
+                df['tags'] = df['tags'].apply(parse_tags)
+
+            # Cache the processed data
+            update_progress(progress, 0.98, "Saving to cache...")
+            df.to_parquet('data/processed_models.parquet')
+
+            update_progress(progress, 1.0, "Data ready!")
+
+            return df
+
+        except Exception as download_error:
+            print(f"Download failed: {download_error}")
+            update_progress(progress, 0.5, "Download failed, generating sample data...")
+            return create_sample_data(progress)
 
     except Exception as e:
         print(f"Error downloading or processing data: {e}")
-        ...
-        progress(1.0, "Using sample data (download failed)")
+        update_progress(progress, 1.0, "Using sample data (error occurred)")
         # Return sample data for testing if real data unavailable
-        return create_sample_data()
+        return create_sample_data(progress)
 
 def create_sample_data(progress=None):
     """Create sample data for testing when real data is unavailable"""
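The UI wiring is outside this diff, but the (value, description) call pattern matches how Gradio's gr.Progress objects are invoked, so download_and_process_models can plausibly be hooked up along these lines. This is a sketch under that assumption; the component names are hypothetical:

    import gradio as gr

    def load_models(progress=gr.Progress()):
        # Gradio injects a Progress instance; it is callable as progress(value, desc)
        df = download_and_process_models(progress)
        return f"Loaded {len(df):,} models"

    with gr.Blocks() as demo:
        status = gr.Textbox(label="Status")
        gr.Button("Load models").click(load_models, outputs=status)

    demo.launch()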