cstr committed on
Commit
13a2324
·
verified ·
1 Parent(s): 1e767d5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +121 -120
app.py CHANGED
@@ -4,6 +4,7 @@ import pandas as pd
4
  from huggingface_hub import hf_hub_download, snapshot_download
5
  import os
6
  import traceback
 
7
  from pathlib import Path
8
 
9
  # --- 1. Download and Cache the Database with Indices ---
@@ -52,6 +53,23 @@ except Exception as e:
52
 
53
  # --- 2. Database Helper Functions ---
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  def get_db_connection():
56
  """
57
  Creates a new read-only connection to the SQLite database with optimizations.
@@ -65,7 +83,6 @@ def get_db_connection():
65
  ("PRAGMA query_only = ON", True),
66
  ("PRAGMA temp_store = MEMORY", True),
67
  ("PRAGMA cache_size = -128000", True), # 128MB cache
68
- ("PRAGMA page_size = 8192", False), # Can't change on existing DB
69
  ("PRAGMA mmap_size = 2147483648", True), # 2GB memory-mapped I/O
70
  ("PRAGMA synchronous = OFF", True),
71
  ("PRAGMA locking_mode = NORMAL", True),
@@ -76,7 +93,7 @@ def get_db_connection():
76
  try:
77
  conn.execute(pragma)
78
  except sqlite3.OperationalError as e:
79
- if critical:
80
  print(f"Warning: Could not apply {pragma}: {e}")
81
 
82
  return conn
@@ -128,12 +145,6 @@ def verify_indices():
128
  if fts:
129
  print(f"βœ… Full-Text Search enabled: {[f[0] for f in fts]}")
130
 
131
- # Check for FTS tables
132
- cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name LIKE '%fts%'")
133
- fts_tables = cursor.fetchall()
134
- if fts_tables:
135
- print(f"βœ… FTS tables found: {[t[0] for t in fts_tables]}")
136
-
137
  # Check database stats
138
  cursor.execute("PRAGMA page_size")
139
  page_size = cursor.fetchone()[0]
@@ -158,62 +169,68 @@ def get_schema_info():
158
  schema_md = "# πŸ“š Database Schema\n\n"
159
 
160
  try:
161
- with get_db_connection() as conn:
162
- cursor = conn.cursor()
 
163
 
164
- # Get database stats
165
- cursor.execute("SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()")
166
- db_size = cursor.fetchone()[0]
167
- schema_md += f"**Database Size:** {db_size / 1024 / 1024 / 1024:.2f} GB\n\n"
168
-
169
- cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';")
170
- tables = cursor.fetchall()
171
-
172
- if not tables:
173
- return "Could not find any tables in the database."
 
 
 
174
 
175
- for table in tables:
176
- table_name = table[0]
177
-
178
- # Get row count
179
- try:
180
- cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
181
- row_count = cursor.fetchone()[0]
182
- schema_md += f"## Table: `{table_name}` ({row_count:,} rows)\n\n"
183
- except:
184
- schema_md += f"## Table: `{table_name}`\n\n"
185
-
186
- schema_md += "### Columns\n\n"
187
- schema_md += "| Column Name | Data Type | Not Null | Primary Key |\n"
188
- schema_md += "|:------------|:----------|:---------|:------------|\n"
189
-
190
- cursor.execute(f"PRAGMA table_info({table_name});")
191
- columns = cursor.fetchall()
192
- for col in columns:
193
- name, dtype, notnull, pk = col[1], col[2], col[3], col[5]
194
- schema_md += f"| `{name}` | `{dtype}` | {'βœ“' if notnull else 'βœ—'} | {'βœ“' if pk else 'βœ—'} |\n"
195
-
196
- # Show indices with details
197
- cursor.execute(f"PRAGMA index_list({table_name});")
198
- indices = cursor.fetchall()
199
- if indices:
200
- schema_md += f"\n### Indices ({len(indices)})\n\n"
201
- for idx in indices:
202
- idx_name, unique, origin = idx[1], idx[2], idx[3]
203
-
204
- # Get indexed columns
205
- cursor.execute(f"PRAGMA index_info({idx_name});")
206
- idx_cols = cursor.fetchall()
207
- cols = [col[2] for col in idx_cols if col[2]]
208
-
209
- unique_badge = "πŸ”’ UNIQUE" if unique else "πŸ“‘ INDEX"
210
- schema_md += f"- **{idx_name}** {unique_badge}\n"
211
- schema_md += f" - Columns: `{', '.join(cols) if cols else 'id'}`\n"
212
- schema_md += f" - Origin: {origin}\n"
213
-
214
- schema_md += "\n---\n\n"
215
 
216
- return schema_md
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  except Exception as e:
218
  print(f"Error in get_schema_info: {e}")
219
  traceback.print_exc()
@@ -305,15 +322,6 @@ def run_raw_query(sql_query):
305
 
306
  try:
307
  with get_db_connection() as conn:
308
- # Show query plan for debugging
309
- try:
310
- explain_result = conn.execute("EXPLAIN QUERY PLAN " + sql_query).fetchall()
311
- print("Query Plan:")
312
- for row in explain_result:
313
- print(f" {row}")
314
- except:
315
- pass
316
-
317
  df = pd.read_sql_query(sql_query, conn)
318
 
319
  if df.empty:
@@ -328,6 +336,7 @@ def run_raw_query(sql_query):
328
  def get_semantic_profile(word, lang='en'):
329
  """
330
  HIGHLY OPTIMIZED: Single query with UNION ALL for all relations at once.
 
331
  """
332
  if not word:
333
  return "⚠️ Please enter a word."
@@ -336,7 +345,6 @@ def get_semantic_profile(word, lang='en'):
336
  like_path = f"/c/{lang}/{word}%"
337
  print(f"Getting semantic profile for: {like_path}")
338
 
339
- # Most important relations for semantic understanding
340
  relations_to_check = [
341
  "/r/IsA", "/r/PartOf", "/r/HasA", "/r/UsedFor", "/r/CapableOf",
342
  "/r/Causes", "/r/HasProperty", "/r/Synonym", "/r/Antonym",
@@ -348,12 +356,12 @@ def get_semantic_profile(word, lang='en'):
348
 
349
  try:
350
  with get_db_connection() as conn:
351
- # MEGA-OPTIMIZED: Single UNION ALL query for all relations
352
  union_parts = []
353
  union_params = []
354
 
355
  for rel in relations_to_check:
356
- # Outgoing edges (word as subject)
357
  union_parts.append("""
358
  SELECT
359
  ? as rel_label,
@@ -366,12 +374,10 @@ def get_semantic_profile(word, lang='en'):
366
  INNER JOIN node en ON e.end_id = en.id
367
  INNER JOIN relation r ON e.rel_id = r.id
368
  WHERE s.id LIKE ? AND r.label = ?
369
- ORDER BY e.weight DESC
370
- LIMIT 7
371
  """)
372
  union_params.extend([rel, like_path, rel])
373
 
374
- # Incoming edges (word as object)
375
  union_parts.append("""
376
  SELECT
377
  ? as rel_label,
@@ -384,13 +390,11 @@ def get_semantic_profile(word, lang='en'):
384
  INNER JOIN node en ON e.end_id = en.id
385
  INNER JOIN relation r ON e.rel_id = r.id
386
  WHERE en.id LIKE ? AND r.label = ?
387
- ORDER BY e.weight DESC
388
- LIMIT 7
389
  """)
390
  union_params.extend([rel, like_path, rel])
391
 
392
- # Execute the mega-query
393
- full_query = " UNION ALL ".join(union_parts)
394
 
395
  print(f"Executing optimized semantic profile query...")
396
  cursor = conn.execute(full_query, union_params)
@@ -409,49 +413,42 @@ This could mean:
409
 
410
  **Tip:** Use the Query Builder to search manually."""
411
 
412
- # Group and format results
413
- current_rel = None
414
- rel_results = []
415
- total_relations = 0
416
 
417
  for rel_label, direction, target_id, target_label, weight in results:
418
- if rel_label != current_rel:
419
- if current_rel is not None:
420
- # Write previous relation
421
- output_md += f"## {current_rel}\n\n"
422
- if rel_results:
423
- for line in rel_results:
424
- output_md += line
425
- total_relations += len(rel_results)
426
- else:
427
- output_md += "*No results*\n"
428
- output_md += "\n"
429
-
430
- current_rel = rel_label
431
- rel_results = []
432
-
433
- # Format output
434
- weight_str = f"{weight:.3f}"
435
- if direction == 'out':
436
- rel_results.append(
437
- f"- **{word}** {rel_label} β†’ *{target_label}* "
438
- f"`[{weight_str}]`\n"
439
- )
440
- else:
441
- rel_results.append(
442
- f"- *{target_label}* {rel_label} β†’ **{word}** "
443
- f"`[{weight_str}]`\n"
444
- )
445
 
446
- # Write last relation
447
- if current_rel is not None:
448
- output_md += f"## {current_rel}\n\n"
449
- if rel_results:
450
- for line in rel_results:
451
- output_md += line
452
- total_relations += len(rel_results)
453
- else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
454
  output_md += "*No results*\n"
 
455
  output_md += "\n"
456
 
457
  output_md += "---\n"
@@ -467,6 +464,9 @@ This could mean:
467
 
468
  # --- 3. Build the Gradio UI ---
469
 
 
 
 
470
  # Verify indices on startup
471
  verify_indices()
472
 
@@ -479,7 +479,8 @@ with gr.Blocks(title="ConceptNet SQLite Explorer", theme=gr.themes.Soft()) as de
479
 
480
  gr.Markdown(
481
  f"**Database:** `{os.path.basename(DB_PATH)}` ({db_size_gb:.2f} GB) | "
482
- f"**Status:** {index_status}"
 
483
  )
484
  gr.Markdown("*Explore semantic relationships in ConceptNet with optimized indexed queries*")
485
 
 
4
  from huggingface_hub import hf_hub_download, snapshot_download
5
  import os
6
  import traceback
7
+ import shutil
8
  from pathlib import Path
9
 
10
  # --- 1. Download and Cache the Database with Indices ---
 
53
 
54
  # --- 2. Database Helper Functions ---
55
 
56
def check_disk_space(path="/", min_free_gb=5):
    """Print a disk-usage report for *path* and return the free space.

    Args:
        path: Any path on the filesystem to inspect. Defaults to "/",
            which is what the original hard-coded.
        min_free_gb: Threshold (in units of 2**30 bytes) below which a
            warning is printed instead of the "sufficient" message.
            Defaults to 5, matching the original hard-coded limit.

    Returns:
        The number of free bytes reported by ``shutil.disk_usage``.

    Raises:
        OSError: Propagated from ``shutil.disk_usage`` when *path* cannot
            be inspected (e.g. it does not exist).
    """
    # The report divides by 2**30 but labels the result "GB"; the labels
    # are kept as-is so the printed output stays unchanged.
    gib = 2**30
    total, used, free = shutil.disk_usage(path)

    print("\n=== Disk Space Check ===")
    print(f"Total: {total / gib:.2f} GB")
    print(f"Used: {used / gib:.2f} GB")
    print(f"Free: {free / gib:.2f} GB")

    if free < min_free_gb * gib:
        print(f"⚠️ WARNING: Less than {min_free_gb}GB free!")
    else:
        print("✅ Sufficient disk space")
    print("========================\n")

    return free
72
+
73
  def get_db_connection():
74
  """
75
  Creates a new read-only connection to the SQLite database with optimizations.
 
83
  ("PRAGMA query_only = ON", True),
84
  ("PRAGMA temp_store = MEMORY", True),
85
  ("PRAGMA cache_size = -128000", True), # 128MB cache
 
86
  ("PRAGMA mmap_size = 2147483648", True), # 2GB memory-mapped I/O
87
  ("PRAGMA synchronous = OFF", True),
88
  ("PRAGMA locking_mode = NORMAL", True),
 
93
  try:
94
  conn.execute(pragma)
95
  except sqlite3.OperationalError as e:
96
+ if critical and "journal_mode" not in pragma:
97
  print(f"Warning: Could not apply {pragma}: {e}")
98
 
99
  return conn
 
145
  if fts:
146
  print(f"βœ… Full-Text Search enabled: {[f[0] for f in fts]}")
147
 
 
 
 
 
 
 
148
  # Check database stats
149
  cursor.execute("PRAGMA page_size")
150
  page_size = cursor.fetchone()[0]
 
169
  schema_md = "# πŸ“š Database Schema\n\n"
170
 
171
  try:
172
+ conn = get_db_connection()
173
+ if not conn:
174
+ return "❌ Could not connect to database"
175
 
176
+ cursor = conn.cursor()
177
+
178
+ # Get database stats
179
+ cursor.execute("SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()")
180
+ db_size = cursor.fetchone()[0]
181
+ schema_md += f"**Database Size:** {db_size / 1024 / 1024 / 1024:.2f} GB\n\n"
182
+
183
+ cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%';")
184
+ tables = cursor.fetchall()
185
+
186
+ if not tables:
187
+ conn.close()
188
+ return "Could not find any tables in the database."
189
 
190
+ for table in tables:
191
+ table_name = table[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
 
193
+ # Get row count (with timeout protection)
194
+ try:
195
+ cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
196
+ row_count = cursor.fetchone()[0]
197
+ schema_md += f"## Table: `{table_name}` ({row_count:,} rows)\n\n"
198
+ except:
199
+ schema_md += f"## Table: `{table_name}`\n\n"
200
+
201
+ schema_md += "### Columns\n\n"
202
+ schema_md += "| Column Name | Data Type | Not Null | Primary Key |\n"
203
+ schema_md += "|:------------|:----------|:---------|:------------|\n"
204
+
205
+ cursor.execute(f"PRAGMA table_info({table_name});")
206
+ columns = cursor.fetchall()
207
+ for col in columns:
208
+ name, dtype, notnull, pk = col[1], col[2], col[3], col[5]
209
+ schema_md += f"| `{name}` | `{dtype}` | {'βœ“' if notnull else 'βœ—'} | {'βœ“' if pk else 'βœ—'} |\n"
210
+
211
+ # Show indices with details
212
+ cursor.execute(f"PRAGMA index_list({table_name});")
213
+ indices = cursor.fetchall()
214
+ if indices:
215
+ schema_md += f"\n### Indices ({len(indices)})\n\n"
216
+ for idx in indices:
217
+ idx_name, unique, origin = idx[1], idx[2], idx[3]
218
+
219
+ # Get indexed columns
220
+ cursor.execute(f"PRAGMA index_info({idx_name});")
221
+ idx_cols = cursor.fetchall()
222
+ cols = [col[2] for col in idx_cols if col[2]]
223
+
224
+ unique_badge = "πŸ”’ UNIQUE" if unique else "πŸ“‘ INDEX"
225
+ schema_md += f"- **{idx_name}** {unique_badge}\n"
226
+ schema_md += f" - Columns: `{', '.join(cols) if cols else 'id'}`\n"
227
+ schema_md += f" - Origin: {origin}\n"
228
+
229
+ schema_md += "\n---\n\n"
230
+
231
+ conn.close()
232
+ return schema_md
233
+
234
  except Exception as e:
235
  print(f"Error in get_schema_info: {e}")
236
  traceback.print_exc()
 
322
 
323
  try:
324
  with get_db_connection() as conn:
 
 
 
 
 
 
 
 
 
325
  df = pd.read_sql_query(sql_query, conn)
326
 
327
  if df.empty:
 
336
  def get_semantic_profile(word, lang='en'):
337
  """
338
  HIGHLY OPTIMIZED: Single query with UNION ALL for all relations at once.
339
+ FIXED: ORDER BY placed correctly after all UNION ALL clauses.
340
  """
341
  if not word:
342
  return "⚠️ Please enter a word."
 
345
  like_path = f"/c/{lang}/{word}%"
346
  print(f"Getting semantic profile for: {like_path}")
347
 
 
348
  relations_to_check = [
349
  "/r/IsA", "/r/PartOf", "/r/HasA", "/r/UsedFor", "/r/CapableOf",
350
  "/r/Causes", "/r/HasProperty", "/r/Synonym", "/r/Antonym",
 
356
 
357
  try:
358
  with get_db_connection() as conn:
359
+ # FIXED: ORDER BY goes AFTER all UNION ALL clauses
360
  union_parts = []
361
  union_params = []
362
 
363
  for rel in relations_to_check:
364
+ # Outgoing edges
365
  union_parts.append("""
366
  SELECT
367
  ? as rel_label,
 
374
  INNER JOIN node en ON e.end_id = en.id
375
  INNER JOIN relation r ON e.rel_id = r.id
376
  WHERE s.id LIKE ? AND r.label = ?
 
 
377
  """)
378
  union_params.extend([rel, like_path, rel])
379
 
380
+ # Incoming edges
381
  union_parts.append("""
382
  SELECT
383
  ? as rel_label,
 
390
  INNER JOIN node en ON e.end_id = en.id
391
  INNER JOIN relation r ON e.rel_id = r.id
392
  WHERE en.id LIKE ? AND r.label = ?
 
 
393
  """)
394
  union_params.extend([rel, like_path, rel])
395
 
396
+ # Combine all parts with UNION ALL, then ORDER BY at the very end
397
+ full_query = " UNION ALL ".join(union_parts) + " ORDER BY rel_label, weight DESC"
398
 
399
  print(f"Executing optimized semantic profile query...")
400
  cursor = conn.execute(full_query, union_params)
 
413
 
414
  **Tip:** Use the Query Builder to search manually."""
415
 
416
+ # Group and format results (limit to top 7 per relation per direction)
417
+ results_by_rel = {}
 
 
418
 
419
  for rel_label, direction, target_id, target_label, weight in results:
420
+ key = (rel_label, direction)
421
+ if key not in results_by_rel:
422
+ results_by_rel[key] = []
423
+ if len(results_by_rel[key]) < 7: # Limit to 7 per relation per direction
424
+ results_by_rel[key].append((target_id, target_label, weight))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
425
 
426
+ # Format output
427
+ total_relations = 0
428
+ for rel in relations_to_check:
429
+ output_md += f"## {rel}\n\n"
430
+
431
+ has_results = False
432
+
433
+ # Outgoing
434
+ out_key = (rel, 'out')
435
+ if out_key in results_by_rel and results_by_rel[out_key]:
436
+ for target_id, target_label, weight in results_by_rel[out_key]:
437
+ output_md += f"- **{word}** {rel} β†’ *{target_label}* `[{weight:.3f}]`\n"
438
+ has_results = True
439
+ total_relations += 1
440
+
441
+ # Incoming
442
+ in_key = (rel, 'in')
443
+ if in_key in results_by_rel and results_by_rel[in_key]:
444
+ for target_id, target_label, weight in results_by_rel[in_key]:
445
+ output_md += f"- *{target_label}* {rel} β†’ **{word}** `[{weight:.3f}]`\n"
446
+ has_results = True
447
+ total_relations += 1
448
+
449
+ if not has_results:
450
  output_md += "*No results*\n"
451
+
452
  output_md += "\n"
453
 
454
  output_md += "---\n"
 
464
 
465
  # --- 3. Build the Gradio UI ---
466
 
467
+ # Check disk space first
468
+ free_space = check_disk_space()
469
+
470
  # Verify indices on startup
471
  verify_indices()
472
 
 
479
 
480
  gr.Markdown(
481
  f"**Database:** `{os.path.basename(DB_PATH)}` ({db_size_gb:.2f} GB) | "
482
+ f"**Status:** {index_status} | "
483
+ f"**Free Disk:** {free_space / (2**30):.2f} GB"
484
  )
485
  gr.Markdown("*Explore semantic relationships in ConceptNet with optimized indexed queries*")
486