conceptnet_db / app.py
cstr's picture
Update app.py
09241e4 verified
raw
history blame
16 kB
import gradio as gr
import sqlite3
import pandas as pd
from huggingface_hub import hf_hub_download, snapshot_download
import os
import time
from pathlib import Path
# ===== CONFIGURATION =====
# Languages whose concepts the UI exposes (ISO 639-1 codes).
TARGET_LANGUAGES = ['de']
# =========================
print(f"🌍 Filtering to: {', '.join([l.upper() for l in TARGET_LANGUAGES])}")
# Download database
# hf_hub_download caches the file in the HF cache dir and returns the
# local path; subsequent starts reuse the cached copy.
REPO_ID = "ysenarath/conceptnet-sqlite"
DB_FILENAME = "data/conceptnet-v5.7.0.db"
DB_PATH = hf_hub_download(repo_id=REPO_ID, filename=DB_FILENAME, repo_type="dataset")
print(f"Database: {DB_PATH}")
# Optionally fetch the auxiliary index files. The app works without
# them, so a failure here must not abort startup — but it should be
# visible in the logs instead of being silently swallowed.
try:
    CACHE_DIR = snapshot_download(
        repo_id=REPO_ID,
        repo_type="dataset",
        allow_patterns=["data/conceptnet-v5.7.0-index/*"]
    )
    INDEX_PATH = os.path.join(CACHE_DIR, "data/conceptnet-v5.7.0-index")
    if os.path.exists(INDEX_PATH):
        print(f"Index files: {len(list(Path(INDEX_PATH).glob('*.ldb')))}")
except Exception as e:
    # FIX: was a bare `except:` that also hid the failure reason.
    print(f"Index download skipped: {e}")
    INDEX_PATH = None
def get_db_connection():
    """Open a read-only SQLite connection tuned for this large database.

    Opens DB_PATH via a `file:...?mode=ro` URI and applies pragmas:
    query_only (belt-and-braces read-only), a ~256 MB page cache,
    a 4 GB mmap window, and in-memory temp storage.
    """
    conn = sqlite3.connect(
        f"file:{DB_PATH}?mode=ro", uri=True, check_same_thread=False
    )
    for pragma in (
        "PRAGMA query_only = ON",
        "PRAGMA cache_size = -256000",
        "PRAGMA mmap_size = 4294967296",
        "PRAGMA temp_store = MEMORY",
    ):
        conn.execute(pragma)
    return conn
def get_semantic_profile_fast(word, lang='de'):
    """
    Build a Markdown semantic profile of *word* from ConceptNet.

    The edge table has no indices on start_id/end_id, so LIKE filters
    on it would trigger full scans (~34M rows). Instead we resolve the
    matching node IDs first (the node table IS indexed), then query
    edges with exact ``IN (...)`` matches on those IDs.

    Args:
        word: Word to profile; trimmed, lowercased, spaces -> '_'.
        lang: Language code; must be one of TARGET_LANGUAGES.

    Returns:
        A Markdown string: the profile, a warning for bad input, or an
        error report if the query fails.
    """
    if not word:
        return "⚠️ Please enter a word."
    if lang not in TARGET_LANGUAGES:
        return f"⚠️ Language '{lang}' not available."
    word = word.strip().lower().replace(' ', '_')
    like_path = f"/c/{lang}/{word}%"
    print(f"\n{'='*60}")
    print(f"Semantic Profile: {word} ({lang})")
    print(f"{'='*60}")
    relations = [
        "/r/IsA", "/r/PartOf", "/r/HasA", "/r/UsedFor", "/r/CapableOf",
        "/r/Causes", "/r/HasProperty", "/r/Synonym", "/r/Antonym",
        "/r/AtLocation", "/r/RelatedTo"
    ]
    output_md = f"# 🧠 Semantic Profile: '{word}'\n\n"
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        # STEP 1: Find matching nodes (FAST - uses index on node.id)
        print(f"Step 1: Finding nodes matching '{like_path}'...")
        start = time.time()
        cursor.execute("SELECT id, label FROM node WHERE id LIKE ?", (like_path,))
        matching_nodes = cursor.fetchall()
        elapsed = time.time() - start
        print(f"  Found {len(matching_nodes)} nodes in {elapsed:.3f}s")
        if not matching_nodes:
            return f"# 🧠 Semantic Profile: '{word}'\n\n⚠️ **No nodes found**\n\nTry checking spelling or use a more common word."
        # Get the primary node ID (first match)
        node_ids = [n[0] for n in matching_nodes]
        primary_id = node_ids[0]
        print(f"  Primary ID: {primary_id}")
        for node_id, label in matching_nodes[:3]:
            output_md += f"**Node:** `{node_id}` ({label})\n"
        output_md += "\n"
        total_relations = 0
        # Placeholder list is loop-invariant; build it once.
        placeholders = ','.join(['?'] * len(node_ids))
        # STEP 2: For each relation, query with EXACT ID match (uses PK index!)
        for rel in relations:
            print(f"\nStep 2: Querying {rel}...")
            output_md += f"## {rel}\n\n"
            has_results = False
            # Outgoing edges - fast because of the exact start_id match.
            start = time.time()
            query_out = f"""
                SELECT en.label, e.weight
                FROM edge e
                JOIN node en ON e.end_id = en.id
                JOIN relation r ON e.rel_id = r.id
                WHERE e.start_id IN ({placeholders}) AND r.label = ?
                ORDER BY e.weight DESC
                LIMIT 5
            """
            cursor.execute(query_out, node_ids + [rel])
            out_results = cursor.fetchall()
            elapsed = time.time() - start
            print(f"  Outgoing: {len(out_results)} results in {elapsed:.3f}s")
            for label, weight in out_results:
                output_md += f"- **{word}** {rel} → *{label}* `[{weight:.3f}]`\n"
                has_results = True
                total_relations += 1
            # Incoming edges - exact end_id match.
            start = time.time()
            query_in = f"""
                SELECT s.label, e.weight
                FROM edge e
                JOIN node s ON e.start_id = s.id
                JOIN relation r ON e.rel_id = r.id
                WHERE e.end_id IN ({placeholders}) AND r.label = ?
                ORDER BY e.weight DESC
                LIMIT 5
            """
            cursor.execute(query_in, node_ids + [rel])
            in_results = cursor.fetchall()
            elapsed = time.time() - start
            print(f"  Incoming: {len(in_results)} results in {elapsed:.3f}s")
            for label, weight in in_results:
                output_md += f"- *{label}* {rel} → **{word}** `[{weight:.3f}]`\n"
                has_results = True
                total_relations += 1
            if not has_results:
                output_md += "*No results*\n"
            output_md += "\n"
        output_md += "---\n"
        output_md += f"**Total relations found:** {total_relations}\n"
        print(f"\n✅ Complete: {total_relations} relations")
        print("="*60 + "\n")
        return output_md
    except Exception as e:
        print(f"ERROR: {e}")
        import traceback
        traceback.print_exc()
        return f"**❌ Error:**\n\n```\n{e}\n```"
    finally:
        # FIX: `with conn:` only manages the transaction for sqlite3 and
        # never closes the connection, leaking one connection per call.
        if conn is not None:
            conn.close()
def run_query_fast(start_node, relation, end_node, limit):
    """
    Run a structured ConceptNet query and return (DataFrame, status).

    Resolves free-text node filters to concrete node IDs first (capped
    at 100 each; the node table is indexed), then filters the edge
    table with exact ``IN (...)`` matches, avoiding full scans on the
    unindexed start_id/end_id columns.

    Args:
        start_node: Substring (or explicit LIKE pattern containing '%')
            matched against start-node IDs; empty = no filter.
        relation: Relation label, exact (e.g. "/r/IsA") or LIKE pattern.
        end_node: Substring or LIKE pattern for end-node IDs.
        limit: Maximum number of edges to return.

    Returns:
        Tuple of (pandas DataFrame of edges, human-readable status).
        On failure the DataFrame is empty and the status holds the error.
    """
    print(f"\n{'='*60}")
    print(f"Query: start={start_node}, rel={relation}, end={end_node}")
    print(f"{'='*60}")
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        start_ids = []
        end_ids = []
        # Step 1: Get start node IDs (if specified)
        if start_node:
            pattern = start_node if '%' in start_node else f"%{start_node}%"
            cursor.execute("SELECT id FROM node WHERE id LIKE ? LIMIT 100", (pattern,))
            start_ids = [row[0] for row in cursor.fetchall()]
            print(f"  Start nodes: {len(start_ids)}")
            if not start_ids:
                return pd.DataFrame(), f"No nodes found matching '{start_node}'"
        # Step 2: Get end node IDs (if specified)
        if end_node:
            pattern = end_node if '%' in end_node else f"%{end_node}%"
            cursor.execute("SELECT id FROM node WHERE id LIKE ? LIMIT 100", (pattern,))
            end_ids = [row[0] for row in cursor.fetchall()]
            print(f"  End nodes: {len(end_ids)}")
            if not end_ids:
                return pd.DataFrame(), f"No nodes found matching '{end_node}'"
        # Step 3: Query edges with exact ID matches
        query = """
            SELECT
                e.id,
                s.id,
                r.label,
                en.id,
                e.weight,
                s.label,
                en.label
            FROM edge e
            JOIN relation r ON e.rel_id = r.id
            JOIN node s ON e.start_id = s.id
            JOIN node en ON e.end_id = en.id
            WHERE 1=1
        """
        params = []
        # Restrict either endpoint to the configured languages. The lang
        # codes come from our own constant (not user input), so inlining
        # them into the SQL string is injection-safe.
        lang_ids_query = " OR ".join([f"s.id LIKE '/c/{lang}/%' OR en.id LIKE '/c/{lang}/%'" for lang in TARGET_LANGUAGES])
        query += f" AND ({lang_ids_query})"
        if start_ids:
            placeholders = ','.join(['?'] * len(start_ids))
            query += f" AND e.start_id IN ({placeholders})"
            params.extend(start_ids)
        if relation:
            # A '%' means the user supplied their own LIKE pattern.
            if '%' in relation:
                query += " AND r.label LIKE ?"
            else:
                query += " AND r.label = ?"
            params.append(relation)
        if end_ids:
            placeholders = ','.join(['?'] * len(end_ids))
            query += f" AND e.end_id IN ({placeholders})"
            params.extend(end_ids)
        query += " ORDER BY e.weight DESC LIMIT ?"
        params.append(limit)
        print(f"  Executing query with {len(params)} params...")
        start_time = time.time()
        df = pd.read_sql_query(query, conn, params=params)
        elapsed = time.time() - start_time
        print(f"  ✅ {len(df)} results in {elapsed:.2f}s")
        print("="*60 + "\n")
        if df.empty:
            return pd.DataFrame(), f"No results found ({elapsed:.2f}s)"
        df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label']
        return df, f"✅ {len(df)} results in {elapsed:.2f}s"
    except Exception as e:
        print(f"ERROR: {e}")
        import traceback
        traceback.print_exc()
        return pd.DataFrame(), f"**❌ Error:**\n\n```\n{e}\n```"
    finally:
        # FIX: sqlite3 connections are not closed by `with`; close
        # explicitly so each query does not leak a connection.
        if conn is not None:
            conn.close()
def run_raw_query(sql_query):
    """
    Execute a user-supplied read-only SQL query.

    Only statements starting with SELECT are accepted; combined with the
    read-only connection (mode=ro + PRAGMA query_only) this keeps the
    raw-SQL tab safe against writes. Note the simple prefix check also
    rejects CTEs ("WITH ... SELECT") — a deliberate trade-off.

    Returns:
        Tuple of (pandas DataFrame, status string).
    """
    if not sql_query.strip().upper().startswith("SELECT"):
        return pd.DataFrame(), "Only SELECT allowed"
    conn = None
    try:
        conn = get_db_connection()
        start = time.time()
        df = pd.read_sql_query(sql_query, conn)
        elapsed = time.time() - start
        return df, f"✅ {len(df)} rows in {elapsed:.2f}s"
    except Exception as e:
        return pd.DataFrame(), f"Error: {e}"
    finally:
        # FIX: the original `with conn:` never closed the connection.
        if conn is not None:
            conn.close()
def get_schema_info():
    """
    Build a Markdown report of the database schema.

    For each user table: row count, column names/types, and indices.
    Table names are interpolated into SQL strings, which is safe here
    because they come from sqlite_master, not from user input.
    """
    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        md = "# 📚 Schema\n\n"
        md += "⚠️ **CRITICAL:** Edge table has NO indices on start_id/end_id!\n\n"
        md += "This means LIKE queries on edge table do full table scans (34M rows).\n\n"
        md += "**Workaround:** Query node table first (has index), then use exact ID matches.\n\n"
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
        tables = cursor.fetchall()
        for table_name, in tables:
            cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
            count = cursor.fetchone()[0]
            md += f"## {table_name} ({count:,} rows)\n\n"
            cursor.execute(f"PRAGMA table_info({table_name})")
            cols = cursor.fetchall()
            md += "| Column | Type |\n|:--|:--|\n"
            for col in cols:
                # table_info row: (cid, name, type, notnull, dflt, pk)
                md += f"| `{col[1]}` | `{col[2]}` |\n"
            cursor.execute(f"PRAGMA index_list({table_name})")
            indices = cursor.fetchall()
            if indices:
                md += f"\n**Indices:** {len(indices)}\n"
                for idx in indices:
                    # index_list row: (seq, name, unique, origin, partial)
                    cursor.execute(f"PRAGMA index_info({idx[1]})")
                    idx_cols = cursor.fetchall()
                    # column name is None for expression/rowid members,
                    # hence the fallback label.
                    cols_str = ', '.join([c[2] for c in idx_cols if c[2]]) or 'PRIMARY KEY'
                    md += f"- {idx[1]}: {cols_str}\n"
            md += "\n---\n\n"
        return md
    finally:
        # FIX: the connection was previously never closed.
        conn.close()
# Test on startup: verify the database is reachable and that the
# indexed access pattern (node lookup, then exact edge match) is fast
# before the UI starts, so problems surface in the Space logs.
print("\n🧪 TESTING DATABASE...")
conn = get_db_connection()
try:
    cursor = conn.cursor()
    # Test 1: Node query (should be fast - node.id has an index)
    start = time.time()
    cursor.execute("SELECT COUNT(*) FROM node WHERE id LIKE '/c/de/%'")
    de_count = cursor.fetchone()[0]
    elapsed = time.time() - start
    print(f"✅ DE nodes: {de_count:,} ({elapsed:.3f}s)")
    # Test 2: Get specific node
    cursor.execute("SELECT id FROM node WHERE id LIKE '/c/de/hund%' LIMIT 1")
    hund_id = cursor.fetchone()
    if hund_id:
        print(f"✅ Found 'hund': {hund_id[0]}")
        # Test 3: Query edges with exact ID (should be fast)
        start = time.time()
        cursor.execute("""
            SELECT COUNT(*) FROM edge
            WHERE start_id = ? OR end_id = ?
        """, (hund_id[0], hund_id[0]))
        edge_count = cursor.fetchone()[0]
        elapsed = time.time() - start
        print(f"✅ Edges for 'hund': {edge_count} ({elapsed:.3f}s)")
finally:
    # FIX: close the startup connection; `with` alone does not close it.
    conn.close()
print("\n🚀 Starting app...\n")
# Gradio UI
# Layout: four tabs - semantic profile, structured query builder, raw
# SQL, and schema inspector. Event handlers are wired at the bottom of
# the Blocks context.
with gr.Blocks(title="ConceptNet Explorer (FAST)", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 ConceptNet Explorer (Optimized for Missing Indices)")
    gr.Markdown(
        f"**Language:** {', '.join([l.upper() for l in TARGET_LANGUAGES])} | "
        "**Strategy:** Query nodes first (indexed), then exact edge matches"
    )
    with gr.Tabs():
        # Tab 1: Markdown profile of a single word.
        with gr.TabItem("πŸ” Semantic Profile"):
            gr.Markdown("**Fast semantic profile using indexed node queries**")
            with gr.Row():
                word_input = gr.Textbox(label="Word", placeholder="hund", value="hund")
                lang_input = gr.Dropdown(choices=TARGET_LANGUAGES, value=TARGET_LANGUAGES[0], label="Language")
            semantic_btn = gr.Button("πŸ” Get Profile", variant="primary", size="lg")
            semantic_output = gr.Markdown("*Click to start...*")
        # Tab 2: structured start/relation/end query builder.
        with gr.TabItem("⚑ Query Builder"):
            gr.Markdown("**Fast queries using node lookup β†’ exact edge matches**")
            with gr.Row():
                start_input = gr.Textbox(label="Start Node", placeholder="hund", value="")
                rel_input = gr.Textbox(label="Relation", placeholder="IsA", value="")
                end_input = gr.Textbox(label="End Node", placeholder="tier", value="")
            limit_slider = gr.Slider(label="Limit", minimum=1, maximum=100, value=20, step=1)
            query_btn = gr.Button("β–Ά️ Run Query", variant="primary", size="lg")
            status_output = gr.Markdown("*Ready...*")
            results_output = gr.DataFrame(label="Results")
        # Tab 3: free-form SELECT-only SQL.
        with gr.TabItem("πŸ’» Raw SQL"):
            raw_sql_input = gr.Textbox(
                label="SQL Query",
                value="SELECT * FROM node WHERE id LIKE '/c/de/hund%' LIMIT 10",
                lines=3
            )
            raw_btn = gr.Button("β–Ά️ Execute")
            raw_status = gr.Markdown()
            raw_results = gr.DataFrame()
        # Tab 4: on-demand schema/index report.
        with gr.TabItem("πŸ“Š Schema"):
            schema_btn = gr.Button("πŸ“Š Load Schema")
            schema_output = gr.Markdown("*Click to load...*")
    gr.Markdown("---\n**Optimization:** Avoids slow LIKE queries on edge table by querying indexed node table first")
    # Connect functions
    # Each click handler maps the tab's inputs to one of the query
    # functions defined above.
    semantic_btn.click(get_semantic_profile_fast, [word_input, lang_input], semantic_output)
    query_btn.click(run_query_fast, [start_input, rel_input, end_input, limit_slider], [results_output, status_output])
    raw_btn.click(run_raw_query, raw_sql_input, [raw_results, raw_status])
    schema_btn.click(get_schema_info, None, schema_output)
# Launch only when run as a script (Spaces executes this module directly).
if __name__ == "__main__":
    demo.launch(ssr_mode=False)