Spaces:

cstr
/

conceptnet_db

Sleeping

App Files Community

cstr committed on Nov 6

Commit

13a2324

verified ·

1 Parent(s): 1e767d5

Update app.py

Browse files

Files changed (1) hide show

app.py +121 -120

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import pandas as pd
 from huggingface_hub import hf_hub_download, snapshot_download
 import os
 import traceback
 from pathlib import Path
 # --- 1. Download and Cache the Database with Indices ---
@@ -52,6 +53,23 @@ except Exception as e:
 # --- 2. Database Helper Functions ---
 def get_db_connection():
     """
     Creates a new read-only connection to the SQLite database with optimizations.
@@ -65,7 +83,6 @@ def get_db_connection():
             ("PRAGMA query_only = ON", True),
             ("PRAGMA temp_store = MEMORY", True),
             ("PRAGMA cache_size = -128000", True),      # 128MB cache
-            ("PRAGMA page_size = 8192", False),         # Can't change on existing DB
             ("PRAGMA mmap_size = 2147483648", True),    # 2GB memory-mapped I/O
             ("PRAGMA synchronous = OFF", True),
             ("PRAGMA locking_mode = NORMAL", True),
@@ -76,7 +93,7 @@ def get_db_connection():
             try:
                 conn.execute(pragma)
             except sqlite3.OperationalError as e:
-                if critical:
                     print(f"Warning: Could not apply {pragma}: {e}")
         return conn
@@ -128,12 +145,6 @@ def verify_indices():
             if fts:
                 print(f"✅ Full-Text Search enabled: {[f[0] for f in fts]}")
-            # Check for FTS tables
-            cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name LIKE '%fts%'")
-            fts_tables = cursor.fetchall()
-            if fts_tables:
-                print(f"✅ FTS tables found: {[t[0] for t in fts_tables]}")
             # Check database stats
             cursor.execute("PRAGMA page_size")
             page_size = cursor.fetchone()[0]
@@ -158,62 +169,68 @@ def get_schema_info():
     schema_md = "# 📚 Database Schema\n\n"
     try:
-        with get_db_connection() as conn:
-            cursor = conn.cursor()
-            # Get database stats
-            cursor.execute("SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()")
-            db_size = cursor.fetchone()[0]
-            schema_md += f"**Database Size:** {db_size / 1024 / 1024 / 1024:.2f} GB\n\n"
-            cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';")
-            tables = cursor.fetchall()
-            if not tables:
-                return "Could not find any tables in the database."
-            for table in tables:
-                table_name = table[0]
-                # Get row count
-                try:
-                    cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
-                    row_count = cursor.fetchone()[0]
-                    schema_md += f"## Table: `{table_name}` ({row_count:,} rows)\n\n"
-                except:
-                    schema_md += f"## Table: `{table_name}`\n\n"
-                schema_md += "### Columns\n\n"
-                schema_md += "| Column Name | Data Type | Not Null | Primary Key |\n"
-                schema_md += "|:------------|:----------|:---------|:------------|\n"
-                cursor.execute(f"PRAGMA table_info({table_name});")
-                columns = cursor.fetchall()
-                for col in columns:
-                    name, dtype, notnull, pk = col[1], col[2], col[3], col[5]
-                    schema_md += f"| `{name}` | `{dtype}` | {'✓' if notnull else '✗'} | {'✓' if pk else '✗'} |\n"
-                # Show indices with details
-                cursor.execute(f"PRAGMA index_list({table_name});")
-                indices = cursor.fetchall()
-                if indices:
-                    schema_md += f"\n### Indices ({len(indices)})\n\n"
-                    for idx in indices:
-                        idx_name, unique, origin = idx[1], idx[2], idx[3]
-                        # Get indexed columns
-                        cursor.execute(f"PRAGMA index_info({idx_name});")
-                        idx_cols = cursor.fetchall()
-                        cols = [col[2] for col in idx_cols if col[2]]
-                        unique_badge = "🔒 UNIQUE" if unique else "📑 INDEX"
-                        schema_md += f"- **{idx_name}** {unique_badge}\n"
-                        schema_md += f"  - Columns: `{', '.join(cols) if cols else 'id'}`\n"
-                        schema_md += f"  - Origin: {origin}\n"
-                schema_md += "\n---\n\n"
-            return schema_md
     except Exception as e:
         print(f"Error in get_schema_info: {e}")
         traceback.print_exc()
@@ -305,15 +322,6 @@ def run_raw_query(sql_query):
     try:
         with get_db_connection() as conn:
-            # Show query plan for debugging
-            try:
-                explain_result = conn.execute("EXPLAIN QUERY PLAN " + sql_query).fetchall()
-                print("Query Plan:")
-                for row in explain_result:
-                    print(f"  {row}")
-            except:
-                pass
             df = pd.read_sql_query(sql_query, conn)
         if df.empty:
@@ -328,6 +336,7 @@ def run_raw_query(sql_query):
 def get_semantic_profile(word, lang='en'):
     """
     HIGHLY OPTIMIZED: Single query with UNION ALL for all relations at once.
     """
     if not word:
         return "⚠️ Please enter a word."
@@ -336,7 +345,6 @@ def get_semantic_profile(word, lang='en'):
     like_path = f"/c/{lang}/{word}%"
     print(f"Getting semantic profile for: {like_path}")
-    # Most important relations for semantic understanding
     relations_to_check = [
         "/r/IsA", "/r/PartOf", "/r/HasA", "/r/UsedFor", "/r/CapableOf",
         "/r/Causes", "/r/HasProperty", "/r/Synonym", "/r/Antonym",
@@ -348,12 +356,12 @@ def get_semantic_profile(word, lang='en'):
     try:
         with get_db_connection() as conn:
-            # MEGA-OPTIMIZED: Single UNION ALL query for all relations
             union_parts = []
             union_params = []
             for rel in relations_to_check:
-                # Outgoing edges (word as subject)
                 union_parts.append("""
                     SELECT
                         ? as rel_label,
@@ -366,12 +374,10 @@ def get_semantic_profile(word, lang='en'):
                     INNER JOIN node en ON e.end_id = en.id
                     INNER JOIN relation r ON e.rel_id = r.id
                     WHERE s.id LIKE ? AND r.label = ?
-                    ORDER BY e.weight DESC
-                    LIMIT 7
                 """)
                 union_params.extend([rel, like_path, rel])
-                # Incoming edges (word as object)
                 union_parts.append("""
                     SELECT
                         ? as rel_label,
@@ -384,13 +390,11 @@ def get_semantic_profile(word, lang='en'):
                     INNER JOIN node en ON e.end_id = en.id
                     INNER JOIN relation r ON e.rel_id = r.id
                     WHERE en.id LIKE ? AND r.label = ?
-                    ORDER BY e.weight DESC
-                    LIMIT 7
                 """)
                 union_params.extend([rel, like_path, rel])
-            # Execute the mega-query
-            full_query = " UNION ALL ".join(union_parts)
             print(f"Executing optimized semantic profile query...")
             cursor = conn.execute(full_query, union_params)
@@ -409,49 +413,42 @@ This could mean:
 **Tip:** Use the Query Builder to search manually."""
-            # Group and format results
-            current_rel = None
-            rel_results = []
-            total_relations = 0
             for rel_label, direction, target_id, target_label, weight in results:
-                if rel_label != current_rel:
-                    if current_rel is not None:
-                        # Write previous relation
-                        output_md += f"## {current_rel}\n\n"
-                        if rel_results:
-                            for line in rel_results:
-                                output_md += line
-                            total_relations += len(rel_results)
-                        else:
-                            output_md += "*No results*\n"
-                        output_md += "\n"
-                    current_rel = rel_label
-                    rel_results = []
-                # Format output
-                weight_str = f"{weight:.3f}"
-                if direction == 'out':
-                    rel_results.append(
-                        f"- **{word}** {rel_label} → *{target_label}* "
-                        f"`[{weight_str}]`\n"
-                    )
-                else:
-                    rel_results.append(
-                        f"- *{target_label}* {rel_label} → **{word}** "
-                        f"`[{weight_str}]`\n"
-                    )
-            # Write last relation
-            if current_rel is not None:
-                output_md += f"## {current_rel}\n\n"
-                if rel_results:
-                    for line in rel_results:
-                        output_md += line
-                    total_relations += len(rel_results)
-                else:
                     output_md += "*No results*\n"
                 output_md += "\n"
             output_md += "---\n"
@@ -467,6 +464,9 @@ This could mean:
 # --- 3. Build the Gradio UI ---
 # Verify indices on startup
 verify_indices()
@@ -479,7 +479,8 @@ with gr.Blocks(title="ConceptNet SQLite Explorer", theme=gr.themes.Soft()) as de
     gr.Markdown(
         f"**Database:** `{os.path.basename(DB_PATH)}` ({db_size_gb:.2f} GB) | "
-        f"**Status:** {index_status}"
     )
     gr.Markdown("*Explore semantic relationships in ConceptNet with optimized indexed queries*")

 from huggingface_hub import hf_hub_download, snapshot_download
 import os
 import traceback
+import shutil
 from pathlib import Path
 # --- 1. Download and Cache the Database with Indices ---
 # --- 2. Database Helper Functions ---
+def check_disk_space():
+    """Print a disk-usage report for "/" and return the free space in bytes.
+
+    Calls ``shutil.disk_usage("/")`` and prints the total, used and free
+    amounts, each divided by 2**30 and labelled "GB". Also prints a warning
+    line when less than 5 * 2**30 bytes are free, otherwise a confirmation.
+
+    Returns:
+        The ``free`` value reported by ``shutil.disk_usage("/")`` (bytes).
+    """
+    total, used, free = shutil.disk_usage("/")
+    print(f"\n=== Disk Space Check ===")
+    # Values are converted with 2**30 bytes per unit before being printed as "GB".
+    print(f"Total: {total / (2**30):.2f} GB")
+    print(f"Used: {used / (2**30):.2f} GB")
+    print(f"Free: {free / (2**30):.2f} GB")
+    # Fixed threshold of 5 * 2**30 bytes; low space only prints a warning, nothing is raised.
+    if free < 5 * (2**30):
+        print("⚠️  WARNING: Less than 5GB free!")
+    else:
+        print("✅ Sufficient disk space")
+    print("========================\n")
+    return free
 def get_db_connection():
     """
     Creates a new read-only connection to the SQLite database with optimizations.
             ("PRAGMA query_only = ON", True),
             ("PRAGMA temp_store = MEMORY", True),
             ("PRAGMA cache_size = -128000", True),      # 128MB cache
             ("PRAGMA mmap_size = 2147483648", True),    # 2GB memory-mapped I/O
             ("PRAGMA synchronous = OFF", True),
             ("PRAGMA locking_mode = NORMAL", True),
             try:
                 conn.execute(pragma)
             except sqlite3.OperationalError as e:
+                if critical and "journal_mode" not in pragma:
                     print(f"Warning: Could not apply {pragma}: {e}")
         return conn
             if fts:
                 print(f"✅ Full-Text Search enabled: {[f[0] for f in fts]}")
             # Check database stats
             cursor.execute("PRAGMA page_size")
             page_size = cursor.fetchone()[0]
     schema_md = "# 📚 Database Schema\n\n"
     try:
+        conn = get_db_connection()
+        if not conn:
+            return "❌ Could not connect to database"
+        cursor = conn.cursor()
+        # Get database stats
+        cursor.execute("SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()")
+        db_size = cursor.fetchone()[0]
+        schema_md += f"**Database Size:** {db_size / 1024 / 1024 / 1024:.2f} GB\n\n"
+        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';")
+        tables = cursor.fetchall()
+        if not tables:
+            conn.close()
+            return "Could not find any tables in the database."
+        for table in tables:
+            table_name = table[0]
+            # Get row count (with timeout protection)
+            try:
+                cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
+                row_count = cursor.fetchone()[0]
+                schema_md += f"## Table: `{table_name}` ({row_count:,} rows)\n\n"
+            except:
+                schema_md += f"## Table: `{table_name}`\n\n"
+            schema_md += "### Columns\n\n"
+            schema_md += "| Column Name | Data Type | Not Null | Primary Key |\n"
+            schema_md += "|:------------|:----------|:---------|:------------|\n"
+            cursor.execute(f"PRAGMA table_info({table_name});")
+            columns = cursor.fetchall()
+            for col in columns:
+                name, dtype, notnull, pk = col[1], col[2], col[3], col[5]
+                schema_md += f"| `{name}` | `{dtype}` | {'✓' if notnull else '✗'} | {'✓' if pk else '✗'} |\n"
+            # Show indices with details
+            cursor.execute(f"PRAGMA index_list({table_name});")
+            indices = cursor.fetchall()
+            if indices:
+                schema_md += f"\n### Indices ({len(indices)})\n\n"
+                for idx in indices:
+                    idx_name, unique, origin = idx[1], idx[2], idx[3]
+                    # Get indexed columns
+                    cursor.execute(f"PRAGMA index_info({idx_name});")
+                    idx_cols = cursor.fetchall()
+                    cols = [col[2] for col in idx_cols if col[2]]
+                    unique_badge = "🔒 UNIQUE" if unique else "📑 INDEX"
+                    schema_md += f"- **{idx_name}** {unique_badge}\n"
+                    schema_md += f"  - Columns: `{', '.join(cols) if cols else 'id'}`\n"
+                    schema_md += f"  - Origin: {origin}\n"
+            schema_md += "\n---\n\n"
+        conn.close()
+        return schema_md
     except Exception as e:
         print(f"Error in get_schema_info: {e}")
         traceback.print_exc()
     try:
         with get_db_connection() as conn:
             df = pd.read_sql_query(sql_query, conn)
         if df.empty:
 def get_semantic_profile(word, lang='en'):
     """
     HIGHLY OPTIMIZED: Single query with UNION ALL for all relations at once.
+    FIXED: ORDER BY placed correctly after all UNION ALL clauses.
     """
     if not word:
         return "⚠️ Please enter a word."
     like_path = f"/c/{lang}/{word}%"
     print(f"Getting semantic profile for: {like_path}")
     relations_to_check = [
         "/r/IsA", "/r/PartOf", "/r/HasA", "/r/UsedFor", "/r/CapableOf",
         "/r/Causes", "/r/HasProperty", "/r/Synonym", "/r/Antonym",
     try:
         with get_db_connection() as conn:
+            # FIXED: ORDER BY goes AFTER all UNION ALL clauses
             union_parts = []
             union_params = []
             for rel in relations_to_check:
+                # Outgoing edges
                 union_parts.append("""
                     SELECT
                         ? as rel_label,
                     INNER JOIN node en ON e.end_id = en.id
                     INNER JOIN relation r ON e.rel_id = r.id
                     WHERE s.id LIKE ? AND r.label = ?
                 """)
                 union_params.extend([rel, like_path, rel])
+                # Incoming edges
                 union_parts.append("""
                     SELECT
                         ? as rel_label,
                     INNER JOIN node en ON e.end_id = en.id
                     INNER JOIN relation r ON e.rel_id = r.id
                     WHERE en.id LIKE ? AND r.label = ?
                 """)
                 union_params.extend([rel, like_path, rel])
+            # Combine all parts with UNION ALL, then ORDER BY at the very end
+            full_query = " UNION ALL ".join(union_parts) + " ORDER BY rel_label, weight DESC"
             print(f"Executing optimized semantic profile query...")
             cursor = conn.execute(full_query, union_params)
 **Tip:** Use the Query Builder to search manually."""
+            # Group and format results (limit to top 7 per relation per direction)
+            results_by_rel = {}
             for rel_label, direction, target_id, target_label, weight in results:
+                key = (rel_label, direction)
+                if key not in results_by_rel:
+                    results_by_rel[key] = []
+                if len(results_by_rel[key]) < 7:  # Limit to 7 per relation per direction
+                    results_by_rel[key].append((target_id, target_label, weight))
+            # Format output
+            total_relations = 0
+            for rel in relations_to_check:
+                output_md += f"## {rel}\n\n"
+                has_results = False
+                # Outgoing
+                out_key = (rel, 'out')
+                if out_key in results_by_rel and results_by_rel[out_key]:
+                    for target_id, target_label, weight in results_by_rel[out_key]:
+                        output_md += f"- **{word}** {rel} → *{target_label}* `[{weight:.3f}]`\n"
+                        has_results = True
+                        total_relations += 1
+                # Incoming
+                in_key = (rel, 'in')
+                if in_key in results_by_rel and results_by_rel[in_key]:
+                    for target_id, target_label, weight in results_by_rel[in_key]:
+                        output_md += f"- *{target_label}* {rel} → **{word}** `[{weight:.3f}]`\n"
+                        has_results = True
+                        total_relations += 1
+                if not has_results:
                     output_md += "*No results*\n"
                 output_md += "\n"
             output_md += "---\n"
 # --- 3. Build the Gradio UI ---
+# Check disk space first
+free_space = check_disk_space()
 # Verify indices on startup
 verify_indices()
     gr.Markdown(
         f"**Database:** `{os.path.basename(DB_PATH)}` ({db_size_gb:.2f} GB) | "
+        f"**Status:** {index_status} | "
+        f"**Free Disk:** {free_space / (2**30):.2f} GB"
     )
     gr.Markdown("*Explore semantic relationships in ConceptNet with optimized indexed queries*")