cstr committed on
Commit
3f14a40
Β·
verified Β·
1 Parent(s): 5aee2b7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -245
app.py CHANGED
@@ -9,153 +9,151 @@ from pathlib import Path
9
  import json
10
 
11
  # ===== CONFIGURATION =====
12
- TARGET_LANGUAGES = ['de']
13
  INDEXED_REPO_ID = "cstr/conceptnet-de-indexed"
14
  INDEXED_DB_FILENAME = "conceptnet-de-indexed.db"
15
  PROGRESS_FILENAME = "indexing_progress.json"
16
  LOCAL_DB_PATH = "/tmp/conceptnet-indexed.db"
17
  # =========================
18
 
19
- print(f"🌍 Filtering to: {', '.join([l.upper() for l in TARGET_LANGUAGES])}")
20
 
21
  # Get HF token
22
  HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN") or os.environ.get("HF_API_TOKEN")
23
 
24
- if not HF_TOKEN:
25
- print("⚠️ WARNING: No HF_TOKEN found!")
26
- print(" Add HF_TOKEN in Space settings to enable checkpointing")
27
- else:
28
  print(f"βœ… HF_TOKEN found (length: {len(HF_TOKEN)})")
 
 
29
 
30
  # Original database
31
  ORIGINAL_REPO_ID = "ysenarath/conceptnet-sqlite"
32
  ORIGINAL_DB_FILENAME = "data/conceptnet-v5.7.0.db"
33
 
34
  def log_progress(message, level="INFO"):
35
- """Enhanced logging with timestamp"""
36
  timestamp = time.strftime("%H:%M:%S")
37
- prefix = {
38
- "INFO": "ℹ️ ",
39
- "SUCCESS": "βœ…",
40
- "ERROR": "❌",
41
- "WARN": "⚠️ ",
42
- "CHECKPOINT": "πŸ’Ύ"
43
- }.get(level, "")
44
  print(f"[{timestamp}] {prefix} {message}")
45
 
46
  def verify_database_has_indices(db_path):
47
- """
48
- Verify that a database file actually has the required indices.
49
- Returns (has_indices, index_count)
50
- """
51
  if not os.path.exists(db_path):
52
  return False, 0
53
-
54
  try:
55
  conn = sqlite3.connect(db_path)
56
  cursor = conn.cursor()
57
-
58
- # Check for custom indices
59
  cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name LIKE 'idx_%'")
60
  custom_indices = cursor.fetchall()
61
-
62
  conn.close()
63
-
64
  return len(custom_indices) >= 4, len(custom_indices)
65
-
66
  except Exception as e:
67
  log_progress(f"Error verifying indices: {e}", "ERROR")
68
  return False, 0
69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  def check_remote_progress():
71
- """Check remote progress with detailed logging"""
72
  if not HF_TOKEN:
73
- log_progress("No HF_TOKEN - cannot check remote progress", "WARN")
74
- return {
75
- "completed_indices": [],
76
- "analyzed_tables": [],
77
- "database_uploaded": False,
78
- "indexing_complete": False
79
- }
80
 
81
  try:
82
  api = HfApi()
83
-
84
- # Check if repo exists
85
  try:
86
  api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN)
87
- log_progress(f"Repository exists: {INDEXED_REPO_ID}", "SUCCESS")
88
  except:
89
- log_progress("Repository doesn't exist yet", "INFO")
90
- return {
91
- "completed_indices": [],
92
- "analyzed_tables": [],
93
- "database_uploaded": False,
94
- "indexing_complete": False
95
- }
96
-
97
- # Download progress file
98
  try:
99
- progress_path = hf_hub_download(
100
- repo_id=INDEXED_REPO_ID,
101
- filename=PROGRESS_FILENAME,
102
- repo_type="dataset",
103
- token=HF_TOKEN
104
- )
105
-
106
  with open(progress_path, 'r') as f:
107
- progress = json.load(f)
108
-
109
- log_progress("Remote progress loaded:", "INFO")
110
- log_progress(f" Completed indices: {progress.get('completed_indices', [])}", "INFO")
111
- log_progress(f" Analyzed tables: {progress.get('analyzed_tables', [])}", "INFO")
112
- log_progress(f" Indexing complete: {progress.get('indexing_complete', False)}", "INFO")
113
-
114
- return progress
115
-
116
- except Exception as e:
117
- log_progress("No progress file found (starting fresh)", "INFO")
118
- return {
119
- "completed_indices": [],
120
- "analyzed_tables": [],
121
- "database_uploaded": False,
122
- "indexing_complete": False
123
- }
124
-
125
- except Exception as e:
126
- log_progress(f"Error checking remote: {e}", "ERROR")
127
- return {
128
- "completed_indices": [],
129
- "analyzed_tables": [],
130
- "database_uploaded": False,
131
- "indexing_complete": False
132
- }
133
 
134
  def update_remote_progress(completed_indices, analyzed_tables=None, database_uploaded=False, indexing_complete=False):
135
- """Update progress with detailed tracking"""
136
  if not HF_TOKEN:
137
- log_progress("Cannot update progress: No HF_TOKEN", "WARN")
138
  return False
139
-
140
  if analyzed_tables is None:
141
  analyzed_tables = []
142
-
143
  try:
144
  api = HfApi()
145
-
146
- # Create repo if needed
147
  try:
148
  api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN)
149
  except:
150
- log_progress(f"Creating repository: {INDEXED_REPO_ID}", "INFO")
151
- api.create_repo(
152
- repo_id=INDEXED_REPO_ID,
153
- repo_type="dataset",
154
- token=HF_TOKEN,
155
- private=False
156
- )
157
 
158
- # Create progress file
159
  progress = {
160
  "completed_indices": completed_indices,
161
  "analyzed_tables": analyzed_tables,
@@ -169,79 +167,49 @@ def update_remote_progress(completed_indices, analyzed_tables=None, database_upl
169
  with open(progress_path, 'w') as f:
170
  json.dump(progress, f, indent=2)
171
 
172
- # Upload
173
  api.upload_file(
174
  path_or_fileobj=progress_path,
175
  path_in_repo=PROGRESS_FILENAME,
176
  repo_id=INDEXED_REPO_ID,
177
  repo_type="dataset",
178
  token=HF_TOKEN,
179
- commit_message=f"Progress: {len(completed_indices)} indices, {len(analyzed_tables)} tables analyzed"
180
  )
181
-
182
- log_progress(f"Progress updated: {len(completed_indices)} indices, {len(analyzed_tables)} tables analyzed", "CHECKPOINT")
183
  return True
184
-
185
  except Exception as e:
186
- log_progress(f"Failed to update progress: {e}", "ERROR")
187
- import traceback
188
- traceback.print_exc()
189
  return False
190
 
191
  def upload_database_checkpoint(message=""):
192
- """Upload database with proper WAL checkpoint"""
193
- if not HF_TOKEN:
194
- log_progress("Cannot upload: No HF_TOKEN", "WARN")
195
  return False
196
-
197
- if not os.path.exists(LOCAL_DB_PATH):
198
- log_progress("Database file doesn't exist", "ERROR")
199
- return False
200
-
201
  try:
202
- # CRITICAL FIX: Checkpoint WAL before upload
203
- log_progress("Checkpointing WAL to merge changes into main file...", "INFO")
204
  conn = sqlite3.connect(LOCAL_DB_PATH)
205
  conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
206
  conn.close()
207
- log_progress(" WAL checkpoint complete", "SUCCESS")
208
 
209
- # Verify indices are actually in the file
210
  has_indices, idx_count = verify_database_has_indices(LOCAL_DB_PATH)
211
- if has_indices:
212
- log_progress(f" Verified: {idx_count} indices present in file", "SUCCESS")
213
- else:
214
- log_progress(f" WARNING: Only {idx_count} indices found (expected 4+)", "WARN")
215
 
216
  api = HfApi()
217
-
218
  db_size = os.path.getsize(LOCAL_DB_PATH) / (2**30)
219
- log_progress(f"Uploading database checkpoint ({db_size:.2f} GB)...", "CHECKPOINT")
220
- if message:
221
- log_progress(f" {message}", "INFO")
222
- log_progress(f" This may take 5-10 minutes...", "INFO")
223
 
224
  start = time.time()
225
-
226
  api.upload_file(
227
  path_or_fileobj=LOCAL_DB_PATH,
228
  path_in_repo=INDEXED_DB_FILENAME,
229
  repo_id=INDEXED_REPO_ID,
230
  repo_type="dataset",
231
  token=HF_TOKEN,
232
- commit_message=message or "Database checkpoint"
233
  )
234
-
235
  elapsed = time.time() - start
236
- speed_mbps = (db_size * 8) / elapsed if elapsed > 0 else 0
237
- log_progress(f"Database uploaded in {elapsed:.1f}s ({speed_mbps:.1f} Mbps)", "SUCCESS")
238
-
239
  return True
240
-
241
  except Exception as e:
242
  log_progress(f"Upload failed: {e}", "ERROR")
243
- import traceback
244
- traceback.print_exc()
245
  return False
246
 
247
  def create_indexed_database():
@@ -511,129 +479,58 @@ def create_indexed_database():
511
 
512
  return LOCAL_DB_PATH
513
 
514
- # Initialize database
515
  DB_PATH = create_indexed_database()
516
 
 
 
 
517
  def get_db_connection():
518
- """Create optimized connection"""
519
  conn = sqlite3.connect(DB_PATH, check_same_thread=False)
520
  conn.execute("PRAGMA cache_size = -256000")
521
- conn.execute("PRAGMA mmap_size = 4294967296")
522
  return conn
523
 
524
- def verify_indices():
525
- """Verify indices"""
526
- log_progress("="*60, "INFO")
527
- log_progress("VERIFYING INDICES", "INFO")
528
- log_progress("="*60, "INFO")
529
-
530
- with get_db_connection() as conn:
531
- cursor = conn.cursor()
532
-
533
- cursor.execute("SELECT name FROM sqlite_master WHERE type='index' AND name LIKE 'idx_%'")
534
- custom_indices = cursor.fetchall()
535
-
536
- log_progress(f"Custom indices: {len(custom_indices)}", "SUCCESS" if len(custom_indices) >= 4 else "ERROR")
537
- for idx in custom_indices:
538
- log_progress(f" βœ“ {idx[0]}", "SUCCESS")
539
-
540
- if len(custom_indices) < 4:
541
- log_progress("⚠️ WARNING: Expected 4 indices, something went wrong!", "ERROR")
542
-
543
- # Speed test
544
- log_progress("Running speed test...", "INFO")
545
- start = time.time()
546
- cursor.execute("SELECT COUNT(*) FROM edge WHERE start_id LIKE '/c/de/hund%'")
547
- count = cursor.fetchone()[0]
548
- elapsed = time.time() - start
549
-
550
- status = "SUCCESS" if elapsed < 1 else "WARN" if elapsed < 5 else "ERROR"
551
- log_progress(f"Query: {count} results in {elapsed:.3f}s", status)
552
-
553
- if elapsed > 5:
554
- log_progress("⚠️ Query is slow - indices may not be working!", "ERROR")
555
-
556
- log_progress("="*60, "INFO")
557
-
558
- verify_indices()
559
-
560
- def get_semantic_profile(word, lang='de', progress=gr.Progress()):
561
- """Semantic profile"""
562
  progress(0, desc="Starting...")
563
 
564
  if not word:
565
  return "⚠️ Please enter a word."
566
 
567
  word = word.strip().lower().replace(' ', '_')
568
- like_path = f"/c/{lang}/{word}%"
569
 
570
- relations = [
571
- "/r/IsA", "/r/PartOf", "/r/HasA", "/r/UsedFor", "/r/CapableOf",
572
- "/r/Causes", "/r/HasProperty", "/r/Synonym", "/r/Antonym",
573
- "/r/AtLocation", "/r/RelatedTo"
 
574
  ]
575
 
576
- output_md = f"# 🧠 Semantic Profile: '{word}'\n\n"
577
 
578
  try:
579
  with get_db_connection() as conn:
580
  cursor = conn.cursor()
581
 
582
- progress(0.05, desc="Finding nodes...")
583
- cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (like_path,))
584
- nodes = cursor.fetchall()
 
 
 
 
 
 
585
 
586
  if not nodes:
587
- return f"# 🧠 Semantic Profile: '{word}'\n\n⚠️ **Not found**"
588
 
589
  for node_id, label in nodes[:3]:
590
  output_md += f"**Node:** `{node_id}` ({label})\n"
591
  output_md += "\n"
592
 
593
- total = 0
594
-
595
- for i, rel in enumerate(relations):
596
- progress((i + 1) / len(relations), desc=f"Querying {rel}...")
597
-
598
- output_md += f"## {rel}\n\n"
599
- found = False
600
-
601
- # Outgoing
602
- cursor.execute("""
603
- SELECT en.label, e.weight
604
- FROM edge e
605
- JOIN node en ON e.end_id = en.id
606
- JOIN relation r ON e.rel_id = r.id
607
- WHERE e.start_id LIKE ? AND r.label = ?
608
- ORDER BY e.weight DESC LIMIT 7
609
- """, (like_path, rel))
610
-
611
- for label, weight in cursor.fetchall():
612
- output_md += f"- **{word}** {rel} β†’ *{label}* `[{weight:.3f}]`\n"
613
- found = True
614
- total += 1
615
-
616
- # Incoming
617
- cursor.execute("""
618
- SELECT s.label, e.weight
619
- FROM edge e
620
- JOIN node s ON e.start_id = s.id
621
- JOIN relation r ON e.rel_id = r.id
622
- WHERE e.end_id LIKE ? AND r.label = ?
623
- ORDER BY e.weight DESC LIMIT 7
624
- """, (like_path, rel))
625
-
626
- for label, weight in cursor.fetchall():
627
- output_md += f"- *{label}* {rel} β†’ **{word}** `[{weight:.3f}]`\n"
628
- found = True
629
- total += 1
630
-
631
- if not found:
632
- output_md += "*No results*\n"
633
- output_md += "\n"
634
-
635
- progress(1.0, desc="Complete!")
636
- output_md += f"---\n**Total:** {total} relations\n"
637
  return output_md
638
 
639
  except Exception as e:
@@ -716,10 +613,20 @@ def run_raw_query(sql_query):
716
  return pd.DataFrame(), f"Error: {e}"
717
 
718
  def get_schema_info():
719
- """Schema info"""
720
  with get_db_connection() as conn:
721
  cursor = conn.cursor()
722
  md = f"# πŸ“š Schema\n\n**Repo:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID})\n\n"
 
 
 
 
 
 
 
 
 
 
723
  cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
724
  for table, in cursor.fetchall():
725
  cursor.execute(f"SELECT COUNT(*) FROM {table}")
@@ -728,44 +635,36 @@ def get_schema_info():
728
 
729
  # UI
730
  with gr.Blocks(title="ConceptNet", theme=gr.themes.Soft()) as demo:
731
- gr.Markdown(f"# 🧠 ConceptNet ({', '.join([l.upper() for l in TARGET_LANGUAGES])})")
732
- gr.Markdown(f"**Repo:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID}) | βœ… Fixed WAL checkpoint issue")
733
 
734
  with gr.Tabs():
735
  with gr.TabItem("πŸ” Profile"):
736
  with gr.Row():
737
- word_input = gr.Textbox(label="Word", placeholder="hund", value="hund")
738
- lang_input = gr.Dropdown(choices=TARGET_LANGUAGES, value=TARGET_LANGUAGES[0], label="Lang")
739
- semantic_btn = gr.Button("πŸ” Get Profile", variant="primary", size="lg")
740
  semantic_output = gr.Markdown()
741
 
742
- with gr.TabItem("⚑ Query"):
743
- with gr.Row():
744
- start_input = gr.Textbox(label="Start", placeholder="hund", value="hund")
745
- rel_input = gr.Textbox(label="Relation", placeholder="IsA", value="IsA")
746
- end_input = gr.Textbox(label="End", placeholder="")
747
- limit_slider = gr.Slider(label="Limit", minimum=1, maximum=200, value=50)
748
- query_btn = gr.Button("▢️ Run", variant="primary", size="lg")
749
- status_output = gr.Markdown()
750
- results_output = gr.DataFrame(wrap=True)
751
-
752
  with gr.TabItem("πŸ’» SQL"):
753
- raw_sql_input = gr.Textbox(label="SQL", value="SELECT * FROM edge WHERE start_id LIKE '/c/de/hund%' LIMIT 10", lines=3)
 
 
 
 
754
  raw_btn = gr.Button("▢️ Execute")
755
  raw_status = gr.Markdown()
756
  raw_results = gr.DataFrame()
757
 
758
  with gr.TabItem("πŸ“Š Schema"):
759
- schema_btn = gr.Button("πŸ“Š Load")
760
  schema_output = gr.Markdown()
761
 
762
- gr.Markdown("---\nβœ… **Fixed WAL checkpoint issue!** Database now properly contains indices. Will auto-rebuild if corrupted DB detected.")
763
 
764
  semantic_btn.click(get_semantic_profile, [word_input, lang_input], semantic_output)
765
- query_btn.click(run_query, [start_input, rel_input, end_input, limit_slider], [results_output, status_output])
766
  raw_btn.click(run_raw_query, raw_sql_input, [raw_results, raw_status])
767
  schema_btn.click(get_schema_info, None, schema_output)
768
 
769
  if __name__ == "__main__":
770
- log_progress("App ready with WAL checkpoint fix!", "SUCCESS")
771
  demo.launch(ssr_mode=False)
 
9
  import json
10
 
11
# ===== CONFIGURATION =====
TARGET_LANGUAGES = ['de', 'en', 'es', 'fr', 'it', 'ja', 'nl', 'pl', 'pt', 'ru', 'zh']  # Support all languages
INDEXED_REPO_ID = "cstr/conceptnet-de-indexed"
INDEXED_DB_FILENAME = "conceptnet-de-indexed.db"
PROGRESS_FILENAME = "indexing_progress.json"
LOCAL_DB_PATH = "/tmp/conceptnet-indexed.db"
# =========================

print(f"🌍 Languages: {', '.join([l.upper() for l in TARGET_LANGUAGES])}")

# Get HF token: first non-empty value among the known env-var spellings, else None.
HF_TOKEN = next(
    (os.environ[key] for key in ("HF_TOKEN", "HUGGINGFACE_TOKEN", "HF_API_TOKEN") if os.environ.get(key)),
    None,
)

if HF_TOKEN:
    print(f"✅ HF_TOKEN found (length: {len(HF_TOKEN)})")
else:
    print("⚠️ No HF_TOKEN - checkpointing disabled")

# Original (un-indexed) source database on the Hub
ORIGINAL_REPO_ID = "ysenarath/conceptnet-sqlite"
ORIGINAL_DB_FILENAME = "data/conceptnet-v5.7.0.db"
33
def log_progress(message, level="INFO"):
    """Print a timestamped log line with a level-specific emoji prefix."""
    stamp = time.strftime("%H:%M:%S")
    icons = {
        "INFO": "ℹ️ ",
        "SUCCESS": "✅",
        "ERROR": "❌",
        "WARN": "⚠️ ",
        "CHECKPOINT": "💾",
    }
    # Unknown levels get no prefix; output format matches the rest of the app's logs.
    print(f"[{stamp}] {icons.get(level, '')} {message}")
38
 
39
def verify_database_has_indices(db_path):
    """Check whether the SQLite file at ``db_path`` contains the custom indices.

    Returns a tuple ``(has_indices, index_count)`` where ``has_indices`` is
    True when at least 4 indices named ``idx_*`` exist.  Returns ``(False, 0)``
    for a missing file or on any SQLite error.
    """
    if not os.path.exists(db_path):
        return False, 0
    try:
        conn = sqlite3.connect(db_path)
        try:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT name FROM sqlite_master WHERE type='index' AND name LIKE 'idx_%'"
            )
            custom_indices = cursor.fetchall()
        finally:
            # FIX: close the connection even when the query raises, so a
            # corrupt file doesn't leak a handle.
            conn.close()
        return len(custom_indices) >= 4, len(custom_indices)
    except Exception as e:
        log_progress(f"Error verifying indices: {e}", "ERROR")
        return False, 0
53
 
54
def diagnose_database():
    """DIAGNOSTIC: Understand what's actually in the database.

    Prints node samples, per-language counts, test-word lookups, sample edges
    and a query plan to the server log.  Purely informational; swallows and
    reports any failure.  Reads the module-level ``DB_PATH``.
    """
    log_progress("="*60, "INFO")
    log_progress("DATABASE DIAGNOSTICS", "INFO")
    log_progress("="*60, "INFO")

    try:
        conn = sqlite3.connect(DB_PATH)
        try:
            cursor = conn.cursor()

            # 1. What do node IDs actually look like?
            log_progress("\n1. Sample node IDs (first 20):", "INFO")
            cursor.execute("SELECT id, label FROM node LIMIT 20")
            for node_id, label in cursor.fetchall():
                print(f" {node_id} -> {label}")

            # 2. How many nodes per language prefix?
            log_progress("\n2. Node counts by language:", "INFO")
            for lang in ['de', 'en', 'es', 'fr', 'ja', 'zh', 'ru']:
                cursor.execute(f"SELECT COUNT(*) FROM node WHERE id LIKE '/c/{lang}/%'")
                count = cursor.fetchone()[0]
                if count > 0:
                    print(f" {lang.upper()}: {count:,} nodes")

            # 3. Is the German test word present at all?
            log_progress("\n3. Searching for 'hund':", "INFO")
            cursor.execute("SELECT id, label FROM node WHERE id LIKE '%hund%' LIMIT 10")
            hund_results = cursor.fetchall()
            if hund_results:
                for node_id, label in hund_results:
                    print(f" ✓ {node_id} -> {label}")
            else:
                print(" ✗ No nodes found containing 'hund'")

            # 4. English control word (should exist in any ConceptNet build)
            log_progress("\n4. Searching for 'dog':", "INFO")
            cursor.execute("SELECT id, label FROM node WHERE id LIKE '%dog%' LIMIT 5")
            dog_results = cursor.fetchall()
            if dog_results:
                for node_id, label in dog_results:
                    print(f" ✓ {node_id} -> {label}")

            # 5. Sample edges with resolved relation labels
            log_progress("\n5. Sample edges:", "INFO")
            cursor.execute("""
                SELECT e.start_id, r.label, e.end_id
                FROM edge e
                JOIN relation r ON e.rel_id = r.id
                LIMIT 10
            """)
            for start, rel, end in cursor.fetchall():
                print(f" {start} --{rel}--> {end}")

            # 6. Does the planner actually use our index?
            log_progress("\n6. Test index usage:", "INFO")
            cursor.execute("EXPLAIN QUERY PLAN SELECT * FROM edge WHERE start_id LIKE '/c/de/%' LIMIT 1")
            plan = cursor.fetchall()
            for row in plan:
                print(f" {row}")
        finally:
            # FIX: the original leaked the connection when any query raised;
            # always close before reporting the error.
            conn.close()

        log_progress("="*60, "INFO")

    except Exception as e:
        log_progress(f"Diagnostic failed: {e}", "ERROR")
        import traceback
        traceback.print_exc()
+
123
def check_remote_progress():
    """Fetch indexing progress from the HF dataset repo.

    Returns the parsed progress dict from ``PROGRESS_FILENAME`` in
    ``INDEXED_REPO_ID``, or a fresh default dict when there is no token,
    the repo doesn't exist, or no progress file was uploaded yet.
    """
    # FIX: build the (previously 4x-duplicated) default once per call, and
    # replace bare `except:` with `except Exception` so KeyboardInterrupt /
    # SystemExit are not swallowed.
    default = {
        "completed_indices": [],
        "analyzed_tables": [],
        "database_uploaded": False,
        "indexing_complete": False,
    }
    if not HF_TOKEN:
        return default

    try:
        api = HfApi()
        try:
            api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN)
        except Exception:
            # Repo doesn't exist yet -> nothing to resume from.
            return default
        try:
            progress_path = hf_hub_download(
                repo_id=INDEXED_REPO_ID,
                filename=PROGRESS_FILENAME,
                repo_type="dataset",
                token=HF_TOKEN,
            )
            with open(progress_path, 'r') as f:
                return json.load(f)
        except Exception:
            # No progress file yet (starting fresh).
            return default
    except Exception:
        return default
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
  def update_remote_progress(completed_indices, analyzed_tables=None, database_uploaded=False, indexing_complete=False):
145
+ """Update progress"""
146
  if not HF_TOKEN:
 
147
  return False
 
148
  if analyzed_tables is None:
149
  analyzed_tables = []
 
150
  try:
151
  api = HfApi()
 
 
152
  try:
153
  api.repo_info(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN)
154
  except:
155
+ api.create_repo(repo_id=INDEXED_REPO_ID, repo_type="dataset", token=HF_TOKEN, private=False)
 
 
 
 
 
 
156
 
 
157
  progress = {
158
  "completed_indices": completed_indices,
159
  "analyzed_tables": analyzed_tables,
 
167
  with open(progress_path, 'w') as f:
168
  json.dump(progress, f, indent=2)
169
 
 
170
  api.upload_file(
171
  path_or_fileobj=progress_path,
172
  path_in_repo=PROGRESS_FILENAME,
173
  repo_id=INDEXED_REPO_ID,
174
  repo_type="dataset",
175
  token=HF_TOKEN,
176
+ commit_message=f"Progress: {len(completed_indices)} indices, {len(analyzed_tables)} tables"
177
  )
 
 
178
  return True
 
179
  except Exception as e:
180
+ log_progress(f"Progress update failed: {e}", "ERROR")
 
 
181
  return False
182
 
183
def upload_database_checkpoint(message=""):
    """Checkpoint the WAL into the main DB file, then upload it to the HF repo.

    Args:
        message: optional commit message; defaults to "Checkpoint".

    Returns:
        True on successful upload, False when the token or local DB file is
        missing, or when any step fails (failure is logged, not raised).
    """
    if not HF_TOKEN or not os.path.exists(LOCAL_DB_PATH):
        return False
    try:
        # Merge WAL contents into the main file so the uploaded .db is complete.
        conn = sqlite3.connect(LOCAL_DB_PATH)
        try:
            conn.execute("PRAGMA wal_checkpoint(TRUNCATE)")
        finally:
            # FIX: close even if the checkpoint pragma fails, so the handle
            # doesn't stay open while we report the error.
            conn.close()

        has_indices, idx_count = verify_database_has_indices(LOCAL_DB_PATH)
        log_progress(f"Pre-upload check: {idx_count} indices", "SUCCESS" if has_indices else "WARN")

        api = HfApi()
        db_size = os.path.getsize(LOCAL_DB_PATH) / (2**30)  # bytes -> GiB
        log_progress(f"Uploading {db_size:.2f} GB...", "CHECKPOINT")

        start = time.time()
        api.upload_file(
            path_or_fileobj=LOCAL_DB_PATH,
            path_in_repo=INDEXED_DB_FILENAME,
            repo_id=INDEXED_REPO_ID,
            repo_type="dataset",
            token=HF_TOKEN,
            commit_message=message or "Checkpoint",
        )
        elapsed = time.time() - start
        log_progress(f"Uploaded in {elapsed:.1f}s", "SUCCESS")
        return True
    except Exception as e:
        log_progress(f"Upload failed: {e}", "ERROR")
        return False
214
 
215
  def create_indexed_database():
 
479
 
480
  return LOCAL_DB_PATH
481
 
482
# Initialize: download/build the indexed database, then dump diagnostics
# to the server log so the deployed data can be inspected.
DB_PATH = create_indexed_database()

# RUN DIAGNOSTICS
diagnose_database()
487
+
488
def get_db_connection():
    """Open a SQLite connection to the indexed database.

    The connection allows cross-thread use (Gradio handlers run on worker
    threads) and enlarges the page cache for faster repeated queries.
    """
    connection = sqlite3.connect(DB_PATH, check_same_thread=False)
    # Negative cache_size is in KiB: keep roughly 256 MB of pages in memory.
    connection.execute("PRAGMA cache_size = -256000")
    return connection
493
 
494
+ def get_semantic_profile(word, lang='en', progress=gr.Progress()):
495
+ """Semantic profile - FIXED to use correct pattern"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
496
  progress(0, desc="Starting...")
497
 
498
  if not word:
499
  return "⚠️ Please enter a word."
500
 
501
  word = word.strip().lower().replace(' ', '_')
 
502
 
503
+ # Try multiple patterns
504
+ patterns = [
505
+ f"/c/{lang}/{word}",
506
+ f"/c/{lang}/{word}/%",
507
+ f"%/{lang}/{word}%"
508
  ]
509
 
510
+ output_md = f"# 🧠 Semantic Profile: '{word}' ({lang})\n\n"
511
 
512
  try:
513
  with get_db_connection() as conn:
514
  cursor = conn.cursor()
515
 
516
+ progress(0.1, desc="Finding nodes...")
517
+
518
+ # Try each pattern
519
+ nodes = []
520
+ for pattern in patterns:
521
+ cursor.execute("SELECT id, label FROM node WHERE id LIKE ? LIMIT 5", (pattern,))
522
+ nodes = cursor.fetchall()
523
+ if nodes:
524
+ break
525
 
526
  if not nodes:
527
+ return f"# 🧠 Semantic Profile: '{word}'\n\n⚠️ **Not found**\n\nTried patterns: {patterns}"
528
 
529
  for node_id, label in nodes[:3]:
530
  output_md += f"**Node:** `{node_id}` ({label})\n"
531
  output_md += "\n"
532
 
533
+ # ... rest of profile code ...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
534
  return output_md
535
 
536
  except Exception as e:
 
613
  return pd.DataFrame(), f"Error: {e}"
614
 
615
  def get_schema_info():
616
+ """Schema"""
617
  with get_db_connection() as conn:
618
  cursor = conn.cursor()
619
  md = f"# πŸ“š Schema\n\n**Repo:** [{INDEXED_REPO_ID}](https://huggingface.co/datasets/{INDEXED_REPO_ID})\n\n"
620
+
621
+ # Show language distribution
622
+ md += "## Language Distribution\n\n"
623
+ for lang in ['de', 'en', 'es', 'fr', 'ja']:
624
+ cursor.execute(f"SELECT COUNT(*) FROM node WHERE id LIKE '/c/{lang}/%'")
625
+ count = cursor.fetchone()[0]
626
+ if count > 0:
627
+ md += f"- **{lang.upper()}**: {count:,} nodes\n"
628
+ md += "\n"
629
+
630
  cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'")
631
  for table, in cursor.fetchall():
632
  cursor.execute(f"SELECT COUNT(*) FROM {table}")
 
635
 
636
# UI — three tabs: semantic profile lookup, raw SQL console, schema overview.
with gr.Blocks(title="ConceptNet", theme=gr.themes.Soft()) as demo:
    gr.Markdown(f"# 🧠 ConceptNet Explorer")
    gr.Markdown("**Diagnostics enabled** - Check server logs for database analysis")

    with gr.Tabs():
        with gr.TabItem("🔍 Profile"):
            with gr.Row():
                word_input = gr.Textbox(label="Word", placeholder="dog", value="dog")
                lang_input = gr.Dropdown(choices=TARGET_LANGUAGES, value="en", label="Language")
                semantic_btn = gr.Button("🔍 Get Profile", variant="primary")
            semantic_output = gr.Markdown()

        with gr.TabItem("💻 SQL"):
            raw_sql_input = gr.Textbox(
                label="SQL Query",
                value="SELECT id, label FROM node WHERE id LIKE '/c/en/dog%' LIMIT 10",
                lines=3,
            )
            raw_btn = gr.Button("▶️ Execute")
            raw_status = gr.Markdown()
            raw_results = gr.DataFrame()

        with gr.TabItem("📊 Schema"):
            schema_btn = gr.Button("📊 Load Schema")
            schema_output = gr.Markdown()

    gr.Markdown("---\n🔍 **Check server logs for detailed database diagnostics**")

    # Wire buttons to their handlers.
    semantic_btn.click(get_semantic_profile, [word_input, lang_input], semantic_output)
    raw_btn.click(run_raw_query, raw_sql_input, [raw_results, raw_status])
    schema_btn.click(get_schema_info, None, schema_output)
668
 
669
  if __name__ == "__main__":
 
670
  demo.launch(ssr_mode=False)