conceptnet_db / app.py
cstr's picture
Update app.py
09241e4 verified
raw
history blame
16 kB
import gradio as gr
import sqlite3
import pandas as pd
from huggingface_hub import hf_hub_download, snapshot_download
import os
import time
from pathlib import Path
# ===== CONFIGURATION =====
# Languages whose concepts the UI exposes (ISO 639-1 codes).
TARGET_LANGUAGES = ['de']
# =========================
print(f"🌍 Filtering to: {', '.join([l.upper() for l in TARGET_LANGUAGES])}")
# Download database
# hf_hub_download caches the file in the HF cache dir and returns the
# local path; subsequent starts reuse the cached copy.
REPO_ID = "ysenarath/conceptnet-sqlite"
DB_FILENAME = "data/conceptnet-v5.7.0.db"
DB_PATH = hf_hub_download(repo_id=REPO_ID, filename=DB_FILENAME, repo_type="dataset")
print(f"Database: {DB_PATH}")
# Optionally fetch the auxiliary index files. The app works without
# them, so a failure here must not abort startup — but it should be
# visible in the logs instead of being silently swallowed.
try:
    CACHE_DIR = snapshot_download(
        repo_id=REPO_ID,
        repo_type="dataset",
        allow_patterns=["data/conceptnet-v5.7.0-index/*"]
    )
    INDEX_PATH = os.path.join(CACHE_DIR, "data/conceptnet-v5.7.0-index")
    if os.path.exists(INDEX_PATH):
        print(f"Index files: {len(list(Path(INDEX_PATH).glob('*.ldb')))}")
except Exception as e:
    # FIX: was a bare `except:` that also hid the failure reason.
    print(f"Index download skipped: {e}")
    INDEX_PATH = None
def get_db_connection():
    """Open a read-only SQLite connection tuned for this large database.

    Opens DB_PATH via a `file:...?mode=ro` URI and applies pragmas:
    query_only (belt-and-braces read-only), a ~256 MB page cache,
    a 4 GB mmap window, and in-memory temp storage.
    """
    conn = sqlite3.connect(
        f"file:{DB_PATH}?mode=ro", uri=True, check_same_thread=False
    )
    for pragma in (
        "PRAGMA query_only = ON",
        "PRAGMA cache_size = -256000",
        "PRAGMA mmap_size = 4294967296",
        "PRAGMA temp_store = MEMORY",
    ):
        conn.execute(pragma)
    return conn
def get_semantic_profile_fast(word, lang='de'):
    """
    Build a Markdown semantic profile of *word* from ConceptNet.

    The edge table has no indices on start_id/end_id, so LIKE filters
    on it would trigger full scans (~34M rows). Instead we resolve the
    matching node IDs first (the node table IS indexed), then query
    edges with exact ``IN (...)`` matches on those IDs.

    Args:
        word: Word to profile; trimmed, lowercased, spaces -> '_'.
        lang: Language code; must be one of TARGET_LANGUAGES.

    Returns:
        A Markdown string: the profile, a warning for bad input, or an
        error report if the query fails.
    """
    if not word:
        return "⚠️ Please enter a word."
    if lang not in TARGET_LANGUAGES:
        return f"⚠️ Language '{lang}' not available."
    word = word.strip().lower().replace(' ', '_')
    like_path = f"/c/{lang}/{word}%"
    print(f"\n{'='*60}")
    print(f"Semantic Profile: {word} ({lang})")
    print(f"{'='*60}")
    relations = [
        "/r/IsA", "/r/PartOf", "/r/HasA", "/r/UsedFor", "/r/CapableOf",
        "/r/Causes", "/r/HasProperty", "/r/Synonym", "/r/Antonym",
        "/r/AtLocation", "/r/RelatedTo"
    ]
    output_md = f"# 🧠 Semantic Profile: '{word}'\n\n"
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        # STEP 1: Find matching nodes (FAST - uses index on node.id)
        print(f"Step 1: Finding nodes matching '{like_path}'...")
        start = time.time()
        cursor.execute("SELECT id, label FROM node WHERE id LIKE ?", (like_path,))
        matching_nodes = cursor.fetchall()
        elapsed = time.time() - start
        print(f"  Found {len(matching_nodes)} nodes in {elapsed:.3f}s")
        if not matching_nodes:
            return f"# 🧠 Semantic Profile: '{word}'\n\n⚠️ **No nodes found**\n\nTry checking spelling or use a more common word."
        # Get the primary node ID (first match)
        node_ids = [n[0] for n in matching_nodes]
        primary_id = node_ids[0]
        print(f"  Primary ID: {primary_id}")
        for node_id, label in matching_nodes[:3]:
            output_md += f"**Node:** `{node_id}` ({label})\n"
        output_md += "\n"
        total_relations = 0
        # Placeholder list is loop-invariant; build it once.
        placeholders = ','.join(['?'] * len(node_ids))
        # STEP 2: For each relation, query with EXACT ID match (uses PK index!)
        for rel in relations:
            print(f"\nStep 2: Querying {rel}...")
            output_md += f"## {rel}\n\n"
            has_results = False
            # Outgoing edges - fast because of the exact start_id match.
            start = time.time()
            query_out = f"""
                SELECT en.label, e.weight
                FROM edge e
                JOIN node en ON e.end_id = en.id
                JOIN relation r ON e.rel_id = r.id
                WHERE e.start_id IN ({placeholders}) AND r.label = ?
                ORDER BY e.weight DESC
                LIMIT 5
            """
            cursor.execute(query_out, node_ids + [rel])
            out_results = cursor.fetchall()
            elapsed = time.time() - start
            print(f"  Outgoing: {len(out_results)} results in {elapsed:.3f}s")
            for label, weight in out_results:
                output_md += f"- **{word}** {rel} → *{label}* `[{weight:.3f}]`\n"
                has_results = True
                total_relations += 1
            # Incoming edges - exact end_id match.
            start = time.time()
            query_in = f"""
                SELECT s.label, e.weight
                FROM edge e
                JOIN node s ON e.start_id = s.id
                JOIN relation r ON e.rel_id = r.id
                WHERE e.end_id IN ({placeholders}) AND r.label = ?
                ORDER BY e.weight DESC
                LIMIT 5
            """
            cursor.execute(query_in, node_ids + [rel])
            in_results = cursor.fetchall()
            elapsed = time.time() - start
            print(f"  Incoming: {len(in_results)} results in {elapsed:.3f}s")
            for label, weight in in_results:
                output_md += f"- *{label}* {rel} → **{word}** `[{weight:.3f}]`\n"
                has_results = True
                total_relations += 1
            if not has_results:
                output_md += "*No results*\n"
            output_md += "\n"
        output_md += "---\n"
        output_md += f"**Total relations found:** {total_relations}\n"
        print(f"\n✅ Complete: {total_relations} relations")
        print("="*60 + "\n")
        return output_md
    except Exception as e:
        print(f"ERROR: {e}")
        import traceback
        traceback.print_exc()
        return f"**❌ Error:**\n\n```\n{e}\n```"
    finally:
        # FIX: `with conn:` only manages the transaction for sqlite3 and
        # never closes the connection, leaking one connection per call.
        if conn is not None:
            conn.close()
def run_query_fast(start_node, relation, end_node, limit):
    """
    Run a structured ConceptNet query and return (DataFrame, status).

    Resolves free-text node filters to concrete node IDs first (capped
    at 100 each; the node table is indexed), then filters the edge
    table with exact ``IN (...)`` matches, avoiding full scans on the
    unindexed start_id/end_id columns.

    Args:
        start_node: Substring (or explicit LIKE pattern containing '%')
            matched against start-node IDs; empty = no filter.
        relation: Relation label, exact (e.g. "/r/IsA") or LIKE pattern.
        end_node: Substring or LIKE pattern for end-node IDs.
        limit: Maximum number of edges to return.

    Returns:
        Tuple of (pandas DataFrame of edges, human-readable status).
        On failure the DataFrame is empty and the status holds the error.
    """
    print(f"\n{'='*60}")
    print(f"Query: start={start_node}, rel={relation}, end={end_node}")
    print(f"{'='*60}")
    conn = None
    try:
        conn = get_db_connection()
        cursor = conn.cursor()
        start_ids = []
        end_ids = []
        # Step 1: Get start node IDs (if specified)
        if start_node:
            pattern = start_node if '%' in start_node else f"%{start_node}%"
            cursor.execute("SELECT id FROM node WHERE id LIKE ? LIMIT 100", (pattern,))
            start_ids = [row[0] for row in cursor.fetchall()]
            print(f"  Start nodes: {len(start_ids)}")
            if not start_ids:
                return pd.DataFrame(), f"No nodes found matching '{start_node}'"
        # Step 2: Get end node IDs (if specified)
        if end_node:
            pattern = end_node if '%' in end_node else f"%{end_node}%"
            cursor.execute("SELECT id FROM node WHERE id LIKE ? LIMIT 100", (pattern,))
            end_ids = [row[0] for row in cursor.fetchall()]
            print(f"  End nodes: {len(end_ids)}")
            if not end_ids:
                return pd.DataFrame(), f"No nodes found matching '{end_node}'"
        # Step 3: Query edges with exact ID matches
        query = """
            SELECT
                e.id,
                s.id,
                r.label,
                en.id,
                e.weight,
                s.label,
                en.label
            FROM edge e
            JOIN relation r ON e.rel_id = r.id
            JOIN node s ON e.start_id = s.id
            JOIN node en ON e.end_id = en.id
            WHERE 1=1
        """
        params = []
        # Restrict either endpoint to the configured languages. The lang
        # codes come from our own constant (not user input), so inlining
        # them into the SQL string is injection-safe.
        lang_ids_query = " OR ".join([f"s.id LIKE '/c/{lang}/%' OR en.id LIKE '/c/{lang}/%'" for lang in TARGET_LANGUAGES])
        query += f" AND ({lang_ids_query})"
        if start_ids:
            placeholders = ','.join(['?'] * len(start_ids))
            query += f" AND e.start_id IN ({placeholders})"
            params.extend(start_ids)
        if relation:
            # A '%' means the user supplied their own LIKE pattern.
            if '%' in relation:
                query += " AND r.label LIKE ?"
            else:
                query += " AND r.label = ?"
            params.append(relation)
        if end_ids:
            placeholders = ','.join(['?'] * len(end_ids))
            query += f" AND e.end_id IN ({placeholders})"
            params.extend(end_ids)
        query += " ORDER BY e.weight DESC LIMIT ?"
        params.append(limit)
        print(f"  Executing query with {len(params)} params...")
        start_time = time.time()
        df = pd.read_sql_query(query, conn, params=params)
        elapsed = time.time() - start_time
        print(f"  ✅ {len(df)} results in {elapsed:.2f}s")
        print("="*60 + "\n")
        if df.empty:
            return pd.DataFrame(), f"No results found ({elapsed:.2f}s)"
        df.columns = ['edge_id', 'start_id', 'relation', 'end_id', 'weight', 'start_label', 'end_label']
        return df, f"✅ {len(df)} results in {elapsed:.2f}s"
    except Exception as e:
        print(f"ERROR: {e}")
        import traceback
        traceback.print_exc()
        return pd.DataFrame(), f"**❌ Error:**\n\n```\n{e}\n```"
    finally:
        # FIX: sqlite3 connections are not closed by `with`; close
        # explicitly so each query does not leak a connection.
        if conn is not None:
            conn.close()
def run_raw_query(sql_query):
    """
    Execute a user-supplied read-only SQL query.

    Only statements starting with SELECT are accepted; combined with the
    read-only connection (mode=ro + PRAGMA query_only) this keeps the
    raw-SQL tab safe against writes. Note the simple prefix check also
    rejects CTEs ("WITH ... SELECT") — a deliberate trade-off.

    Returns:
        Tuple of (pandas DataFrame, status string).
    """
    if not sql_query.strip().upper().startswith("SELECT"):
        return pd.DataFrame(), "Only SELECT allowed"
    conn = None
    try:
        conn = get_db_connection()
        start = time.time()
        df = pd.read_sql_query(sql_query, conn)
        elapsed = time.time() - start
        return df, f"✅ {len(df)} rows in {elapsed:.2f}s"
    except Exception as e:
        return pd.DataFrame(), f"Error: {e}"
    finally:
        # FIX: the original `with conn:` never closed the connection.
        if conn is not None:
            conn.close()
def get_schema_info():
    """
    Build a Markdown report of the database schema.

    For each user table: row count, column names/types, and indices.
    Table names are interpolated into SQL strings, which is safe here
    because they come from sqlite_master, not from user input.
    """
    conn = get_db_connection()
    try:
        cursor = conn.cursor()
        md = "# 📚 Schema\n\n"
        md += "⚠️ **CRITICAL:** Edge table has NO indices on start_id/end_id!\n\n"
        md += "This means LIKE queries on edge table do full table scans (34M rows).\n\n"
        md += "**Workaround:** Query node table first (has index), then use exact ID matches.\n\n"
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
        tables = cursor.fetchall()
        for table_name, in tables:
            cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
            count = cursor.fetchone()[0]
            md += f"## {table_name} ({count:,} rows)\n\n"
            cursor.execute(f"PRAGMA table_info({table_name})")
            cols = cursor.fetchall()
            md += "| Column | Type |\n|:--|:--|\n"
            for col in cols:
                # table_info row: (cid, name, type, notnull, dflt, pk)
                md += f"| `{col[1]}` | `{col[2]}` |\n"
            cursor.execute(f"PRAGMA index_list({table_name})")
            indices = cursor.fetchall()
            if indices:
                md += f"\n**Indices:** {len(indices)}\n"
                for idx in indices:
                    # index_list row: (seq, name, unique, origin, partial)
                    cursor.execute(f"PRAGMA index_info({idx[1]})")
                    idx_cols = cursor.fetchall()
                    # column name is None for expression/rowid members,
                    # hence the fallback label.
                    cols_str = ', '.join([c[2] for c in idx_cols if c[2]]) or 'PRIMARY KEY'
                    md += f"- {idx[1]}: {cols_str}\n"
            md += "\n---\n\n"
        return md
    finally:
        # FIX: the connection was previously never closed.
        conn.close()
# Test on startup: verify the database is reachable and that the
# indexed access pattern (node lookup, then exact edge match) is fast
# before the UI starts, so problems surface in the Space logs.
print("\n🧪 TESTING DATABASE...")
conn = get_db_connection()
try:
    cursor = conn.cursor()
    # Test 1: Node query (should be fast - node.id has an index)
    start = time.time()
    cursor.execute("SELECT COUNT(*) FROM node WHERE id LIKE '/c/de/%'")
    de_count = cursor.fetchone()[0]
    elapsed = time.time() - start
    print(f"✅ DE nodes: {de_count:,} ({elapsed:.3f}s)")
    # Test 2: Get specific node
    cursor.execute("SELECT id FROM node WHERE id LIKE '/c/de/hund%' LIMIT 1")
    hund_id = cursor.fetchone()
    if hund_id:
        print(f"✅ Found 'hund': {hund_id[0]}")
        # Test 3: Query edges with exact ID (should be fast)
        start = time.time()
        cursor.execute("""
            SELECT COUNT(*) FROM edge
            WHERE start_id = ? OR end_id = ?
        """, (hund_id[0], hund_id[0]))
        edge_count = cursor.fetchone()[0]
        elapsed = time.time() - start
        print(f"✅ Edges for 'hund': {edge_count} ({elapsed:.3f}s)")
finally:
    # FIX: close the startup connection; `with` alone does not close it.
    conn.close()
print("\n🚀 Starting app...\n")
# Gradio UI
# Layout: four tabs - semantic profile, structured query builder, raw
# SQL, and schema inspector. Event handlers are wired at the bottom of
# the Blocks context.
with gr.Blocks(title="ConceptNet Explorer (FAST)", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 ConceptNet Explorer (Optimized for Missing Indices)")
    gr.Markdown(
        f"**Language:** {', '.join([l.upper() for l in TARGET_LANGUAGES])} | "
        "**Strategy:** Query nodes first (indexed), then exact edge matches"
    )
    with gr.Tabs():
        # Tab 1: Markdown profile of a single word.
        with gr.TabItem("πŸ” Semantic Profile"):
            gr.Markdown("**Fast semantic profile using indexed node queries**")
            with gr.Row():
                word_input = gr.Textbox(label="Word", placeholder="hund", value="hund")
                lang_input = gr.Dropdown(choices=TARGET_LANGUAGES, value=TARGET_LANGUAGES[0], label="Language")
            semantic_btn = gr.Button("πŸ” Get Profile", variant="primary", size="lg")
            semantic_output = gr.Markdown("*Click to start...*")
        # Tab 2: structured start/relation/end query builder.
        with gr.TabItem("⚑ Query Builder"):
            gr.Markdown("**Fast queries using node lookup β†’ exact edge matches**")
            with gr.Row():
                start_input = gr.Textbox(label="Start Node", placeholder="hund", value="")
                rel_input = gr.Textbox(label="Relation", placeholder="IsA", value="")
                end_input = gr.Textbox(label="End Node", placeholder="tier", value="")
            limit_slider = gr.Slider(label="Limit", minimum=1, maximum=100, value=20, step=1)
            query_btn = gr.Button("β–Ά️ Run Query", variant="primary", size="lg")
            status_output = gr.Markdown("*Ready...*")
            results_output = gr.DataFrame(label="Results")
        # Tab 3: free-form SELECT-only SQL.
        with gr.TabItem("πŸ’» Raw SQL"):
            raw_sql_input = gr.Textbox(
                label="SQL Query",
                value="SELECT * FROM node WHERE id LIKE '/c/de/hund%' LIMIT 10",
                lines=3
            )
            raw_btn = gr.Button("β–Ά️ Execute")
            raw_status = gr.Markdown()
            raw_results = gr.DataFrame()
        # Tab 4: on-demand schema/index report.
        with gr.TabItem("πŸ“Š Schema"):
            schema_btn = gr.Button("πŸ“Š Load Schema")
            schema_output = gr.Markdown("*Click to load...*")
    gr.Markdown("---\n**Optimization:** Avoids slow LIKE queries on edge table by querying indexed node table first")
    # Connect functions
    # Each click handler maps the tab's inputs to one of the query
    # functions defined above.
    semantic_btn.click(get_semantic_profile_fast, [word_input, lang_input], semantic_output)
    query_btn.click(run_query_fast, [start_input, rel_input, end_input, limit_slider], [results_output, status_output])
    raw_btn.click(run_raw_query, raw_sql_input, [raw_results, raw_status])
    schema_btn.click(get_schema_info, None, schema_output)
# Launch only when run as a script (Spaces executes this module directly).
if __name__ == "__main__":
    demo.launch(ssr_mode=False)