Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import sqlite3 | |
| import pandas as pd | |
| from huggingface_hub import hf_hub_download, snapshot_download | |
| import os | |
| import time | |
| from pathlib import Path | |
| # ===== CONFIGURATION ===== | |
| TARGET_LANGUAGES = ['de'] | |
| # ========================= | |
| print(f"π Filtering to: {', '.join([l.upper() for l in TARGET_LANGUAGES])}") | |
| # Download database | |
| REPO_ID = "ysenarath/conceptnet-sqlite" | |
| DB_FILENAME = "data/conceptnet-v5.7.0.db" | |
| DB_PATH = hf_hub_download(repo_id=REPO_ID, filename=DB_FILENAME, repo_type="dataset") | |
| print(f"Database: {DB_PATH}") | |
| try: | |
| CACHE_DIR = snapshot_download( | |
| repo_id=REPO_ID, | |
| repo_type="dataset", | |
| allow_patterns=["data/conceptnet-v5.7.0-index/*"] | |
| ) | |
| INDEX_PATH = os.path.join(CACHE_DIR, "data/conceptnet-v5.7.0-index") | |
| if os.path.exists(INDEX_PATH): | |
| print(f"Index files: {len(list(Path(INDEX_PATH).glob('*.ldb')))}") | |
| except: | |
| INDEX_PATH = None | |
| def get_db_connection(): | |
| """Create optimized connection""" | |
| db_uri = f"file:{DB_PATH}?mode=ro" | |
| conn = sqlite3.connect(db_uri, uri=True, check_same_thread=False) | |
| conn.execute("PRAGMA query_only = ON") | |
| conn.execute("PRAGMA cache_size = -256000") | |
| conn.execute("PRAGMA mmap_size = 4294967296") | |
| conn.execute("PRAGMA temp_store = MEMORY") | |
| return conn | |
| def get_semantic_profile_fast(word, lang='de'): | |
| """ | |
| FAST VERSION: Query node table first (has index!), then use exact ID matches. | |
| This avoids full table scan on edge table. | |
| """ | |
| if not word: | |
| return "β οΈ Please enter a word." | |
| if lang not in TARGET_LANGUAGES: | |
| return f"β οΈ Language '{lang}' not available." | |
| word = word.strip().lower().replace(' ', '_') | |
| like_path = f"/c/{lang}/{word}%" | |
| print(f"\n{'='*60}") | |
| print(f"Semantic Profile: {word} ({lang})") | |
| print(f"{'='*60}") | |
| relations = [ | |
| "/r/IsA", "/r/PartOf", "/r/HasA", "/r/UsedFor", "/r/CapableOf", | |
| "/r/Causes", "/r/HasProperty", "/r/Synonym", "/r/Antonym", | |
| "/r/AtLocation", "/r/RelatedTo" | |
| ] | |
| output_md = f"# π§ Semantic Profile: '{word}'\n\n" | |
| try: | |
| with get_db_connection() as conn: | |
| cursor = conn.cursor() | |
| # STEP 1: Find matching nodes (FAST - uses index on node.id) | |
| print(f"Step 1: Finding nodes matching '{like_path}'...") | |
| start = time.time() | |
| cursor.execute("SELECT id, label FROM node WHERE id LIKE ?", (like_path,)) | |
| matching_nodes = cursor.fetchall() | |
| elapsed = time.time() - start | |
| print(f" Found {len(matching_nodes)} nodes in {elapsed:.3f}s") | |
| if not matching_nodes: | |
| return f"# π§ Semantic Profile: '{word}'\n\nβ οΈ **No nodes found**\n\nTry checking spelling or use a more common word." | |
| # Get the primary node ID (first match) | |
| node_ids = [n[0] for n in matching_nodes] | |
| primary_id = node_ids[0] | |
| print(f" Primary ID: {primary_id}") | |
| for node_id, label in matching_nodes[:3]: | |
| output_md += f"**Node:** `{node_id}` ({label})\n" | |
| output_md += "\n" | |
| total_relations = 0 | |
| # STEP 2: For each relation, query with EXACT ID match (uses PK index!) | |
| for rel in relations: | |
| print(f"\nStep 2: Querying {rel}...") | |
| output_md += f"## {rel}\n\n" | |
| has_results = False | |
| # Outgoing edges - FAST because we use exact start_id match | |
| start = time.time() | |
| # Use IN with explicit node IDs - much faster than LIKE on edge table | |
| placeholders = ','.join(['?'] * len(node_ids)) | |
| query_out = f""" | |
| SELECT en.label, e.weight | |
| FROM edge e | |
| JOIN node en ON e.end_id = en.id | |
| JOIN relation r ON e.rel_id = r.id | |
| WHERE e.start_id IN ({placeholders}) AND r.label = ? | |
| ORDER BY e.weight DESC | |
| LIMIT 5 | |
| """ | |
| cursor.execute(query_out, node_ids + [rel]) | |
| out_results = cursor.fetchall() | |
| elapsed = time.time() - start | |
| print(f" Outgoing: {len(out_results)} results in {elapsed:.3f}s") | |
| for label, weight in out_results: | |
| output_md += f"- **{word}** {rel} β *{label}* `[{weight:.3f}]`\n" | |
| has_results = True | |
| total_relations += 1 | |
| # Incoming edges | |
| start = time.time() | |
| query_in = f""" | |
| SELECT s.label, e.weight | |
| FROM edge e | |
| JOIN node s ON e.start_id = s.id | |
| JOIN relation r ON e.rel_id = r.id | |
| WHERE e.end_id IN ({placeholders}) AND r.label = ? | |
| ORDER BY e.weight DESC | |
| LIMIT 5 | |
| """ | |
| cursor.execute(query_in, node_ids + [rel]) | |
| in_results = cursor.fetchall() | |
| elapsed = time.time() - start | |
| print(f" Incoming: {len(in_results)} results in {elapsed:.3f}s") | |
| for label, weight in in_results: | |
| output_md += f"- *{label}* {rel} β **{word}** `[{weight:.3f}]`\n" | |
| has_results = True | |
| total_relations += 1 | |
| if not has_results: | |
| output_md += "*No results*\n" | |
| output_md += "\n" | |
| output_md += "---\n" | |
| output_md += f"**Total relations found:** {total_relations}\n" | |
| print(f"\nβ Complete: {total_relations} relations") | |
| print("="*60 + "\n") | |
| return output_md | |
| except Exception as e: | |
| print(f"ERROR: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| return f"**β Error:**\n\n```\n{e}\n```" | |
| def run_query_fast(start_node, relation, end_node, limit): | |
| """ | |
| FAST VERSION: Get node IDs first, then use exact matches. | |
| """ | |
| print(f"\n{'='*60}") | |
| print(f"Query: start={start_node}, rel={relation}, end={end_node}") | |
| print(f"{'='*60}") | |
| try: | |
| with get_db_connection() as conn: | |
| cursor = conn.cursor() | |
| start_ids = [] | |
| end_ids = [] | |
| # Step 1: Get start node IDs (if specified) | |
| if start_node: | |
| pattern = start_node if '%' in start_node else f"%{start_node}%" | |
| cursor.execute("SELECT id FROM node WHERE id LIKE ? LIMIT 100", (pattern,)) | |
| start_ids = [row[0] for row in cursor.fetchall()] | |
| print(f" Start nodes: {len(start_ids)}") | |
| if not start_ids: | |
| return pd.DataFrame(), f"No nodes found matching '{start_node}'" | |
| # Step 2: Get end node IDs (if specified) | |
| if end_node: | |
| pattern = end_node if '%' in end_node else f"%{end_node}%" | |
| cursor.execute("SELECT id FROM node WHERE id LIKE ? LIMIT 100", (pattern,)) | |
| end_ids = [row[0] for row in cursor.fetchall()] | |
| print(f" End nodes: {len(end_ids)}") | |
| if not end_ids: | |
| return pd.DataFrame(), f"No nodes found matching '{end_node}'" | |
| # Step 3: Query edges with exact ID matches | |
| query = """ | |
| SELECT | |
| e.id, | |
| s.id, | |
| r.label, | |
| en.id, | |
| e.weight, | |
| s.label, | |
| en.label | |
| FROM edge e | |
| JOIN relation r ON e.rel_id = r.id | |
| JOIN node s ON e.start_id = s.id | |
| JOIN node en ON e.end_id = en.id | |
| WHERE 1=1 | |
| """ | |
| params = [] | |
| # Add language filter with IN clause for speed | |
| lang_ids_query = " OR ".join([f"s.id LIKE '/c/{lang}/%' OR en.id LIKE '/c/{lang}/%'" for lang in TARGET_LANGUAGES]) | |
| query += f" AND ({lang_ids_query})" | |
| if start_ids: | |
| placeholders = ','.join(['?'] * len(start_ids)) | |
| query += f" AND e.start_id IN ({placeholders})" | |
| params.extend(start_ids) | |
| if relation: | |
| if '%' in relation: | |
| query += " AND r.label LIKE ?" | |
| else: | |
| query += " AND r.label = ?" | |
| params.append(relation) | |
| if end_ids: | |
| placeholders = ','.join(['?'] * len(end_ids)) | |
| query += f" AND e.end_id IN ({placeholders})" | |
| params.extend(end_ids) | |
| query += " ORDER BY e.weight DESC LIMIT ?" | |
| params.append(limit) | |
| print(f" Executing query with {len(params)} params...") | |
| start_time = time.time() | |
| df = pd.read_sql_query(query, conn, params=params) | |
| elapsed = time.time() - start_time | |
| print(f" β {len(df)} results in {elapsed:.2f}s") | |
| print("="*60 + "\n") | |
| if df.empty: | |
| return pd.DataFrame(), f"No results found ({elapsed:.2f}s)" | |
| df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label'] | |
| return df, f"β {len(df)} results in {elapsed:.2f}s" | |
| except Exception as e: | |
| print(f"ERROR: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| return pd.DataFrame(), f"**β Error:**\n\n```\n{e}\n```" | |
| def run_raw_query(sql_query): | |
| """Execute raw SQL""" | |
| if not sql_query.strip().upper().startswith("SELECT"): | |
| return pd.DataFrame(), "Only SELECT allowed" | |
| try: | |
| with get_db_connection() as conn: | |
| start = time.time() | |
| df = pd.read_sql_query(sql_query, conn) | |
| elapsed = time.time() - start | |
| return df, f"β {len(df)} rows in {elapsed:.2f}s" | |
| except Exception as e: | |
| return pd.DataFrame(), f"Error: {e}" | |
| def get_schema_info(): | |
| """Get schema""" | |
| with get_db_connection() as conn: | |
| cursor = conn.cursor() | |
| md = "# π Schema\n\n" | |
| md += "β οΈ **CRITICAL:** Edge table has NO indices on start_id/end_id!\n\n" | |
| md += "This means LIKE queries on edge table do full table scans (34M rows).\n\n" | |
| md += "**Workaround:** Query node table first (has index), then use exact ID matches.\n\n" | |
| cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'") | |
| tables = cursor.fetchall() | |
| for table_name, in tables: | |
| cursor.execute(f"SELECT COUNT(*) FROM {table_name}") | |
| count = cursor.fetchone()[0] | |
| md += f"## {table_name} ({count:,} rows)\n\n" | |
| cursor.execute(f"PRAGMA table_info({table_name})") | |
| cols = cursor.fetchall() | |
| md += "| Column | Type |\n|:--|:--|\n" | |
| for col in cols: | |
| md += f"| `{col[1]}` | `{col[2]}` |\n" | |
| cursor.execute(f"PRAGMA index_list({table_name})") | |
| indices = cursor.fetchall() | |
| if indices: | |
| md += f"\n**Indices:** {len(indices)}\n" | |
| for idx in indices: | |
| cursor.execute(f"PRAGMA index_info({idx[1]})") | |
| idx_cols = cursor.fetchall() | |
| cols_str = ', '.join([c[2] for c in idx_cols if c[2]]) or 'PRIMARY KEY' | |
| md += f"- {idx[1]}: {cols_str}\n" | |
| md += "\n---\n\n" | |
| return md | |
| # Test on startup | |
| print("\nπ§ͺ TESTING DATABASE...") | |
| with get_db_connection() as conn: | |
| cursor = conn.cursor() | |
| # Test 1: Node query (should be fast - has index) | |
| start = time.time() | |
| cursor.execute("SELECT COUNT(*) FROM node WHERE id LIKE '/c/de/%'") | |
| de_count = cursor.fetchone()[0] | |
| elapsed = time.time() - start | |
| print(f"β DE nodes: {de_count:,} ({elapsed:.3f}s)") | |
| # Test 2: Get specific node | |
| cursor.execute("SELECT id FROM node WHERE id LIKE '/c/de/hund%' LIMIT 1") | |
| hund_id = cursor.fetchone() | |
| if hund_id: | |
| print(f"β Found 'hund': {hund_id[0]}") | |
| # Test 3: Query edges with exact ID (should be fast) | |
| start = time.time() | |
| cursor.execute(""" | |
| SELECT COUNT(*) FROM edge | |
| WHERE start_id = ? OR end_id = ? | |
| """, (hund_id[0], hund_id[0])) | |
| edge_count = cursor.fetchone()[0] | |
| elapsed = time.time() - start | |
| print(f"β Edges for 'hund': {edge_count} ({elapsed:.3f}s)") | |
| print("\nπ Starting app...\n") | |
| # Gradio UI | |
| with gr.Blocks(title="ConceptNet Explorer (FAST)", theme=gr.themes.Soft()) as demo: | |
| gr.Markdown("# π§ ConceptNet Explorer (Optimized for Missing Indices)") | |
| gr.Markdown( | |
| f"**Language:** {', '.join([l.upper() for l in TARGET_LANGUAGES])} | " | |
| "**Strategy:** Query nodes first (indexed), then exact edge matches" | |
| ) | |
| with gr.Tabs(): | |
| with gr.TabItem("π Semantic Profile"): | |
| gr.Markdown("**Fast semantic profile using indexed node queries**") | |
| with gr.Row(): | |
| word_input = gr.Textbox(label="Word", placeholder="hund", value="hund") | |
| lang_input = gr.Dropdown(choices=TARGET_LANGUAGES, value=TARGET_LANGUAGES[0], label="Language") | |
| semantic_btn = gr.Button("π Get Profile", variant="primary", size="lg") | |
| semantic_output = gr.Markdown("*Click to start...*") | |
| with gr.TabItem("β‘ Query Builder"): | |
| gr.Markdown("**Fast queries using node lookup β exact edge matches**") | |
| with gr.Row(): | |
| start_input = gr.Textbox(label="Start Node", placeholder="hund", value="") | |
| rel_input = gr.Textbox(label="Relation", placeholder="IsA", value="") | |
| end_input = gr.Textbox(label="End Node", placeholder="tier", value="") | |
| limit_slider = gr.Slider(label="Limit", minimum=1, maximum=100, value=20, step=1) | |
| query_btn = gr.Button("βΆοΈ Run Query", variant="primary", size="lg") | |
| status_output = gr.Markdown("*Ready...*") | |
| results_output = gr.DataFrame(label="Results") | |
| with gr.TabItem("π» Raw SQL"): | |
| raw_sql_input = gr.Textbox( | |
| label="SQL Query", | |
| value="SELECT * FROM node WHERE id LIKE '/c/de/hund%' LIMIT 10", | |
| lines=3 | |
| ) | |
| raw_btn = gr.Button("βΆοΈ Execute") | |
| raw_status = gr.Markdown() | |
| raw_results = gr.DataFrame() | |
| with gr.TabItem("π Schema"): | |
| schema_btn = gr.Button("π Load Schema") | |
| schema_output = gr.Markdown("*Click to load...*") | |
| gr.Markdown("---\n**Optimization:** Avoids slow LIKE queries on edge table by querying indexed node table first") | |
| # Connect functions | |
| semantic_btn.click(get_semantic_profile_fast, [word_input, lang_input], semantic_output) | |
| query_btn.click(run_query_fast, [start_input, rel_input, end_input, limit_slider], [results_output, status_output]) | |
| raw_btn.click(run_raw_query, raw_sql_input, [raw_results, raw_status]) | |
| schema_btn.click(get_schema_info, None, schema_output) | |
| if __name__ == "__main__": | |
| demo.launch(ssr_mode=False) |