cstr committed on
Commit
1e767d5
·
verified ·
1 Parent(s): 7ff8eef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -47
app.py CHANGED
@@ -35,7 +35,6 @@ except Exception as e:
35
 
36
  # Download the entire index folder for better performance
37
  try:
38
- # Use snapshot_download to get the entire data directory with indices
39
  CACHE_DIR = snapshot_download(
40
  repo_id=REPO_ID,
41
  repo_type="dataset",
@@ -44,7 +43,6 @@ try:
44
  INDEX_PATH = os.path.join(CACHE_DIR, INDEX_FOLDER)
45
  print(f"Index files downloaded to: {INDEX_PATH}")
46
 
47
- # Count index files
48
  if os.path.exists(INDEX_PATH):
49
  index_files = list(Path(INDEX_PATH).glob("*.ldb"))
50
  print(f"Found {len(index_files)} index files (.ldb)")
@@ -64,22 +62,22 @@ def get_db_connection():
64
 
65
  # Apply PRAGMA optimizations for read performance
66
  pragmas = [
67
- "PRAGMA query_only = ON", # Read-only mode
68
- "PRAGMA temp_store = MEMORY", # Use memory for temp tables
69
- "PRAGMA cache_size = -128000", # 128MB cache (negative = KB)
70
- "PRAGMA page_size = 8192", # Larger page size for better I/O
71
- "PRAGMA mmap_size = 2147483648", # 2GB memory-mapped I/O
72
- "PRAGMA synchronous = OFF", # Safe for read-only
73
- "PRAGMA journal_mode = OFF", # No journal needed for read-only
74
- "PRAGMA locking_mode = NORMAL", # Allow multiple readers
75
- "PRAGMA threads = 4", # Use multiple threads
76
  ]
77
 
78
- for pragma in pragmas:
79
  try:
80
  conn.execute(pragma)
81
  except sqlite3.OperationalError as e:
82
- print(f"Warning: Could not apply {pragma}: {e}")
 
83
 
84
  return conn
85
  except Exception as e:
@@ -117,9 +115,9 @@ def verify_indices():
117
  cursor.execute(f"PRAGMA index_info({idx_name})")
118
  idx_cols = cursor.fetchall()
119
 
120
- cols = [col[2] for col in idx_cols]
121
  unique_str = "UNIQUE" if unique else "NON-UNIQUE"
122
- print(f" β”œβ”€ {idx_name} ({unique_str}) on columns: {', '.join(cols)}")
123
  total_indices += 1
124
 
125
  print(f"\nβœ… Total indices found: {total_indices}")
@@ -130,7 +128,13 @@ def verify_indices():
130
  if fts:
131
  print(f"βœ… Full-Text Search enabled: {[f[0] for f in fts]}")
132
 
133
- # Check database page size and cache
 
 
 
 
 
 
134
  cursor.execute("PRAGMA page_size")
135
  page_size = cursor.fetchone()[0]
136
  cursor.execute("PRAGMA cache_size")
@@ -138,7 +142,6 @@ def verify_indices():
138
  print(f"\nπŸ“ˆ Page size: {page_size} bytes")
139
  print(f"πŸ“ˆ Cache size: {abs(cache_size)} KB" if cache_size < 0 else f"πŸ“ˆ Cache size: {cache_size} pages")
140
 
141
- # Get database size
142
  cursor.execute("SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()")
143
  db_size = cursor.fetchone()[0]
144
  print(f"πŸ“¦ Database size: {db_size / 1024 / 1024 / 1024:.2f} GB")
@@ -205,7 +208,7 @@ def get_schema_info():
205
 
206
  unique_badge = "πŸ”’ UNIQUE" if unique else "πŸ“‘ INDEX"
207
  schema_md += f"- **{idx_name}** {unique_badge}\n"
208
- schema_md += f" - Columns: `{', '.join(cols) if cols else 'N/A'}`\n"
209
  schema_md += f" - Origin: {origin}\n"
210
 
211
  schema_md += "\n---\n\n"
@@ -244,13 +247,12 @@ def run_query(start_node, relation, end_node, limit):
244
  params = []
245
 
246
  try:
247
- # Build WHERE conditions leveraging indices
248
  if start_node:
249
  if "%" in start_node:
250
  where_conditions.append("s.id LIKE ?")
251
  params.append(start_node)
252
  else:
253
- # Exact match or prefix match
254
  where_conditions.append("s.id LIKE ?")
255
  params.append(f"%{start_node}%")
256
 
@@ -259,7 +261,6 @@ def run_query(start_node, relation, end_node, limit):
259
  where_conditions.append("r.label LIKE ?")
260
  params.append(relation)
261
  else:
262
- # Exact match is faster
263
  where_conditions.append("r.label = ?")
264
  params.append(relation)
265
 
@@ -274,24 +275,12 @@ def run_query(start_node, relation, end_node, limit):
274
  if where_conditions:
275
  query += " WHERE " + " AND ".join(where_conditions)
276
 
277
- # Order by weight to get most relevant results first
278
  query += " ORDER BY e.weight DESC LIMIT ?"
279
  params.append(limit)
280
 
281
  print(f"Executing SQL with {len(params)} parameters")
282
 
283
  with get_db_connection() as conn:
284
- # Use EXPLAIN QUERY PLAN to verify index usage (for debugging)
285
- explain_query = "EXPLAIN QUERY PLAN " + query
286
- try:
287
- explain_result = conn.execute(explain_query, params).fetchall()
288
- print("Query Plan:")
289
- for row in explain_result:
290
- print(f" {row}")
291
- except:
292
- pass
293
-
294
- # Execute actual query
295
  df = pd.read_sql_query(query, conn, params=params)
296
 
297
  if df.empty:
@@ -316,7 +305,7 @@ def run_raw_query(sql_query):
316
 
317
  try:
318
  with get_db_connection() as conn:
319
- # Show query plan
320
  try:
321
  explain_result = conn.execute("EXPLAIN QUERY PLAN " + sql_query).fetchall()
322
  print("Query Plan:")
@@ -339,7 +328,6 @@ def run_raw_query(sql_query):
339
  def get_semantic_profile(word, lang='en'):
340
  """
341
  HIGHLY OPTIMIZED: Single query with UNION ALL for all relations at once.
342
- Uses indexed columns for maximum speed.
343
  """
344
  if not word:
345
  return "⚠️ Please enter a word."
@@ -374,7 +362,6 @@ def get_semantic_profile(word, lang='en'):
374
  en.label as target_label,
375
  e.weight as weight
376
  FROM edge e
377
- INDEXED BY (SELECT name FROM pragma_index_list('edge') LIMIT 1)
378
  INNER JOIN node s ON e.start_id = s.id
379
  INNER JOIN node en ON e.end_id = en.id
380
  INNER JOIN relation r ON e.rel_id = r.id
@@ -485,9 +472,14 @@ verify_indices()
485
 
486
  with gr.Blocks(title="ConceptNet SQLite Explorer", theme=gr.themes.Soft()) as demo:
487
  gr.Markdown("# 🧠 ConceptNet SQLite Explorer")
 
 
 
 
 
488
  gr.Markdown(
489
- f"**Database:** `{os.path.basename(DB_PATH)}` ({os.path.getsize(DB_PATH) / 1024 / 1024 / 1024:.2f} GB) | "
490
- f"**Status:** {'βœ… Indices Loaded' if INDEX_PATH and os.path.exists(INDEX_PATH) else '⚠️ No Index Cache'}"
491
  )
492
  gr.Markdown("*Explore semantic relationships in ConceptNet with optimized indexed queries*")
493
 
@@ -585,7 +577,7 @@ with gr.Blocks(title="ConceptNet SQLite Explorer", theme=gr.themes.Soft()) as de
585
 
586
  raw_sql_input = gr.Textbox(
587
  label="SQL Query",
588
- placeholder="SELECT s.label, r.label, e.label FROM edge e JOIN node s ON e.start_id = s.id JOIN relation r ON e.rel_id = r.id JOIN node e ON e.end_id = e.id WHERE s.label = 'dog' LIMIT 10",
589
  lines=6,
590
  info="Write SELECT query"
591
  )
@@ -602,11 +594,11 @@ with gr.Blocks(title="ConceptNet SQLite Explorer", theme=gr.themes.Soft()) as de
602
  "```\n\n"
603
  "**Find strongest connections:**\n"
604
  "```sql\n"
605
- "SELECT s.label, r.label, e.label, edge.weight\n"
606
- "FROM edge \n"
607
- "JOIN node s ON edge.start_id = s.id\n"
608
- "JOIN relation r ON edge.rel_id = r.id\n"
609
- "JOIN node e ON edge.end_id = e.id\n"
610
  "ORDER BY weight DESC LIMIT 20\n"
611
  "```\n\n"
612
  "**Check index usage:**\n"
@@ -630,10 +622,11 @@ with gr.Blocks(title="ConceptNet SQLite Explorer", theme=gr.themes.Soft()) as de
630
  schema_output = gr.Markdown("*Click button to load schema...*")
631
 
632
  gr.Markdown("---")
 
633
  gr.Markdown(
634
- "πŸ’‘ **Performance:** Queries use database indices for fast lookups. "
635
- "Exact matches are faster than wildcards. "
636
- f"{'βœ… Index files loaded from HuggingFace cache.' if INDEX_PATH else '⚠️ Running without index cache - queries may be slower.'}"
637
  )
638
 
639
  # Connect UI to functions
 
35
 
36
  # Download the entire index folder for better performance
37
  try:
 
38
  CACHE_DIR = snapshot_download(
39
  repo_id=REPO_ID,
40
  repo_type="dataset",
 
43
  INDEX_PATH = os.path.join(CACHE_DIR, INDEX_FOLDER)
44
  print(f"Index files downloaded to: {INDEX_PATH}")
45
 
 
46
  if os.path.exists(INDEX_PATH):
47
  index_files = list(Path(INDEX_PATH).glob("*.ldb"))
48
  print(f"Found {len(index_files)} index files (.ldb)")
 
62
 
63
  # Apply PRAGMA optimizations for read performance
64
  pragmas = [
65
+ ("PRAGMA query_only = ON", True),
66
+ ("PRAGMA temp_store = MEMORY", True),
67
+ ("PRAGMA cache_size = -128000", True), # 128MB cache
68
+ ("PRAGMA page_size = 8192", False), # Can't change on existing DB
69
+ ("PRAGMA mmap_size = 2147483648", True), # 2GB memory-mapped I/O
70
+ ("PRAGMA synchronous = OFF", True),
71
+ ("PRAGMA locking_mode = NORMAL", True),
72
+ ("PRAGMA threads = 4", True),
 
73
  ]
74
 
75
+ for pragma, critical in pragmas:
76
  try:
77
  conn.execute(pragma)
78
  except sqlite3.OperationalError as e:
79
+ if critical:
80
+ print(f"Warning: Could not apply {pragma}: {e}")
81
 
82
  return conn
83
  except Exception as e:
 
115
  cursor.execute(f"PRAGMA index_info({idx_name})")
116
  idx_cols = cursor.fetchall()
117
 
118
+ cols = [col[2] for col in idx_cols if col[2]]
119
  unique_str = "UNIQUE" if unique else "NON-UNIQUE"
120
+ print(f" β”œβ”€ {idx_name} ({unique_str}) on columns: {', '.join(cols) if cols else 'id'}")
121
  total_indices += 1
122
 
123
  print(f"\nβœ… Total indices found: {total_indices}")
 
128
  if fts:
129
  print(f"βœ… Full-Text Search enabled: {[f[0] for f in fts]}")
130
 
131
+ # Check for FTS tables
132
+ cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name LIKE '%fts%'")
133
+ fts_tables = cursor.fetchall()
134
+ if fts_tables:
135
+ print(f"βœ… FTS tables found: {[t[0] for t in fts_tables]}")
136
+
137
+ # Check database stats
138
  cursor.execute("PRAGMA page_size")
139
  page_size = cursor.fetchone()[0]
140
  cursor.execute("PRAGMA cache_size")
 
142
  print(f"\nπŸ“ˆ Page size: {page_size} bytes")
143
  print(f"πŸ“ˆ Cache size: {abs(cache_size)} KB" if cache_size < 0 else f"πŸ“ˆ Cache size: {cache_size} pages")
144
 
 
145
  cursor.execute("SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()")
146
  db_size = cursor.fetchone()[0]
147
  print(f"πŸ“¦ Database size: {db_size / 1024 / 1024 / 1024:.2f} GB")
 
208
 
209
  unique_badge = "πŸ”’ UNIQUE" if unique else "πŸ“‘ INDEX"
210
  schema_md += f"- **{idx_name}** {unique_badge}\n"
211
+ schema_md += f" - Columns: `{', '.join(cols) if cols else 'id'}`\n"
212
  schema_md += f" - Origin: {origin}\n"
213
 
214
  schema_md += "\n---\n\n"
 
247
  params = []
248
 
249
  try:
250
+ # Build WHERE conditions
251
  if start_node:
252
  if "%" in start_node:
253
  where_conditions.append("s.id LIKE ?")
254
  params.append(start_node)
255
  else:
 
256
  where_conditions.append("s.id LIKE ?")
257
  params.append(f"%{start_node}%")
258
 
 
261
  where_conditions.append("r.label LIKE ?")
262
  params.append(relation)
263
  else:
 
264
  where_conditions.append("r.label = ?")
265
  params.append(relation)
266
 
 
275
  if where_conditions:
276
  query += " WHERE " + " AND ".join(where_conditions)
277
 
 
278
  query += " ORDER BY e.weight DESC LIMIT ?"
279
  params.append(limit)
280
 
281
  print(f"Executing SQL with {len(params)} parameters")
282
 
283
  with get_db_connection() as conn:
 
 
 
 
 
 
 
 
 
 
 
284
  df = pd.read_sql_query(query, conn, params=params)
285
 
286
  if df.empty:
 
305
 
306
  try:
307
  with get_db_connection() as conn:
308
+ # Show query plan for debugging
309
  try:
310
  explain_result = conn.execute("EXPLAIN QUERY PLAN " + sql_query).fetchall()
311
  print("Query Plan:")
 
328
  def get_semantic_profile(word, lang='en'):
329
  """
330
  HIGHLY OPTIMIZED: Single query with UNION ALL for all relations at once.
 
331
  """
332
  if not word:
333
  return "⚠️ Please enter a word."
 
362
  en.label as target_label,
363
  e.weight as weight
364
  FROM edge e
 
365
  INNER JOIN node s ON e.start_id = s.id
366
  INNER JOIN node en ON e.end_id = en.id
367
  INNER JOIN relation r ON e.rel_id = r.id
 
472
 
473
  with gr.Blocks(title="ConceptNet SQLite Explorer", theme=gr.themes.Soft()) as demo:
474
  gr.Markdown("# 🧠 ConceptNet SQLite Explorer")
475
+
476
+ # Status info
477
+ db_size_gb = os.path.getsize(DB_PATH) / 1024 / 1024 / 1024 if os.path.exists(DB_PATH) else 0
478
+ index_status = 'βœ… Indices Loaded' if INDEX_PATH and os.path.exists(INDEX_PATH) else '⚠️ No Index Cache'
479
+
480
  gr.Markdown(
481
+ f"**Database:** `{os.path.basename(DB_PATH)}` ({db_size_gb:.2f} GB) | "
482
+ f"**Status:** {index_status}"
483
  )
484
  gr.Markdown("*Explore semantic relationships in ConceptNet with optimized indexed queries*")
485
 
 
577
 
578
  raw_sql_input = gr.Textbox(
579
  label="SQL Query",
580
+ placeholder="SELECT s.label, r.label, en.label FROM edge e JOIN node s ON e.start_id = s.id JOIN relation r ON e.rel_id = r.id JOIN node en ON e.end_id = en.id WHERE s.label = 'dog' LIMIT 10",
581
  lines=6,
582
  info="Write SELECT query"
583
  )
 
594
  "```\n\n"
595
  "**Find strongest connections:**\n"
596
  "```sql\n"
597
+ "SELECT s.label, r.label, en.label, e.weight\n"
598
+ "FROM edge e\n"
599
+ "JOIN node s ON e.start_id = s.id\n"
600
+ "JOIN relation r ON e.rel_id = r.id\n"
601
+ "JOIN node en ON e.end_id = en.id\n"
602
  "ORDER BY weight DESC LIMIT 20\n"
603
  "```\n\n"
604
  "**Check index usage:**\n"
 
622
  schema_output = gr.Markdown("*Click button to load schema...*")
623
 
624
  gr.Markdown("---")
625
+ index_count = len(list(Path(INDEX_PATH).glob("*.ldb"))) if INDEX_PATH and os.path.exists(INDEX_PATH) else 0
626
  gr.Markdown(
627
+ f"πŸ’‘ **Performance:** Queries use database indices for fast lookups. "
628
+ f"Exact matches are faster than wildcards. "
629
+ f"{'βœ… ' + str(index_count) + ' index files loaded.' if index_count > 0 else '⚠️ Running without index cache.'}"
630
  )
631
 
632
  # Connect UI to functions