Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -35,7 +35,6 @@ except Exception as e:
|
|
| 35 |
|
| 36 |
# Download the entire index folder for better performance
|
| 37 |
try:
|
| 38 |
-
# Use snapshot_download to get the entire data directory with indices
|
| 39 |
CACHE_DIR = snapshot_download(
|
| 40 |
repo_id=REPO_ID,
|
| 41 |
repo_type="dataset",
|
|
@@ -44,7 +43,6 @@ try:
|
|
| 44 |
INDEX_PATH = os.path.join(CACHE_DIR, INDEX_FOLDER)
|
| 45 |
print(f"Index files downloaded to: {INDEX_PATH}")
|
| 46 |
|
| 47 |
-
# Count index files
|
| 48 |
if os.path.exists(INDEX_PATH):
|
| 49 |
index_files = list(Path(INDEX_PATH).glob("*.ldb"))
|
| 50 |
print(f"Found {len(index_files)} index files (.ldb)")
|
|
@@ -64,22 +62,22 @@ def get_db_connection():
|
|
| 64 |
|
| 65 |
# Apply PRAGMA optimizations for read performance
|
| 66 |
pragmas = [
|
| 67 |
-
"PRAGMA query_only = ON",
|
| 68 |
-
"PRAGMA temp_store = MEMORY",
|
| 69 |
-
"PRAGMA cache_size = -128000",
|
| 70 |
-
"PRAGMA page_size = 8192",
|
| 71 |
-
"PRAGMA mmap_size = 2147483648",
|
| 72 |
-
"PRAGMA synchronous = OFF",
|
| 73 |
-
"PRAGMA
|
| 74 |
-
"PRAGMA
|
| 75 |
-
"PRAGMA threads = 4", # Use multiple threads
|
| 76 |
]
|
| 77 |
|
| 78 |
-
for pragma in pragmas:
|
| 79 |
try:
|
| 80 |
conn.execute(pragma)
|
| 81 |
except sqlite3.OperationalError as e:
|
| 82 |
-
|
|
|
|
| 83 |
|
| 84 |
return conn
|
| 85 |
except Exception as e:
|
|
@@ -117,9 +115,9 @@ def verify_indices():
|
|
| 117 |
cursor.execute(f"PRAGMA index_info({idx_name})")
|
| 118 |
idx_cols = cursor.fetchall()
|
| 119 |
|
| 120 |
-
cols = [col[2] for col in idx_cols]
|
| 121 |
unique_str = "UNIQUE" if unique else "NON-UNIQUE"
|
| 122 |
-
print(f" ββ {idx_name} ({unique_str}) on columns: {', '.join(cols)}")
|
| 123 |
total_indices += 1
|
| 124 |
|
| 125 |
print(f"\n✅ Total indices found: {total_indices}")
|
|
@@ -130,7 +128,13 @@ def verify_indices():
|
|
| 130 |
if fts:
|
| 131 |
print(f"✅ Full-Text Search enabled: {[f[0] for f in fts]}")
|
| 132 |
|
| 133 |
-
# Check
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
cursor.execute("PRAGMA page_size")
|
| 135 |
page_size = cursor.fetchone()[0]
|
| 136 |
cursor.execute("PRAGMA cache_size")
|
|
@@ -138,7 +142,6 @@ def verify_indices():
|
|
| 138 |
print(f"\nπ Page size: {page_size} bytes")
|
| 139 |
print(f"π Cache size: {abs(cache_size)} KB" if cache_size < 0 else f"π Cache size: {cache_size} pages")
|
| 140 |
|
| 141 |
-
# Get database size
|
| 142 |
cursor.execute("SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()")
|
| 143 |
db_size = cursor.fetchone()[0]
|
| 144 |
print(f"📦 Database size: {db_size / 1024 / 1024 / 1024:.2f} GB")
|
|
@@ -205,7 +208,7 @@ def get_schema_info():
|
|
| 205 |
|
| 206 |
unique_badge = "π UNIQUE" if unique else "π INDEX"
|
| 207 |
schema_md += f"- **{idx_name}** {unique_badge}\n"
|
| 208 |
-
schema_md += f" - Columns: `{', '.join(cols) if cols else '
|
| 209 |
schema_md += f" - Origin: {origin}\n"
|
| 210 |
|
| 211 |
schema_md += "\n---\n\n"
|
|
@@ -244,13 +247,12 @@ def run_query(start_node, relation, end_node, limit):
|
|
| 244 |
params = []
|
| 245 |
|
| 246 |
try:
|
| 247 |
-
# Build WHERE conditions
|
| 248 |
if start_node:
|
| 249 |
if "%" in start_node:
|
| 250 |
where_conditions.append("s.id LIKE ?")
|
| 251 |
params.append(start_node)
|
| 252 |
else:
|
| 253 |
-
# Exact match or prefix match
|
| 254 |
where_conditions.append("s.id LIKE ?")
|
| 255 |
params.append(f"%{start_node}%")
|
| 256 |
|
|
@@ -259,7 +261,6 @@ def run_query(start_node, relation, end_node, limit):
|
|
| 259 |
where_conditions.append("r.label LIKE ?")
|
| 260 |
params.append(relation)
|
| 261 |
else:
|
| 262 |
-
# Exact match is faster
|
| 263 |
where_conditions.append("r.label = ?")
|
| 264 |
params.append(relation)
|
| 265 |
|
|
@@ -274,24 +275,12 @@ def run_query(start_node, relation, end_node, limit):
|
|
| 274 |
if where_conditions:
|
| 275 |
query += " WHERE " + " AND ".join(where_conditions)
|
| 276 |
|
| 277 |
-
# Order by weight to get most relevant results first
|
| 278 |
query += " ORDER BY e.weight DESC LIMIT ?"
|
| 279 |
params.append(limit)
|
| 280 |
|
| 281 |
print(f"Executing SQL with {len(params)} parameters")
|
| 282 |
|
| 283 |
with get_db_connection() as conn:
|
| 284 |
-
# Use EXPLAIN QUERY PLAN to verify index usage (for debugging)
|
| 285 |
-
explain_query = "EXPLAIN QUERY PLAN " + query
|
| 286 |
-
try:
|
| 287 |
-
explain_result = conn.execute(explain_query, params).fetchall()
|
| 288 |
-
print("Query Plan:")
|
| 289 |
-
for row in explain_result:
|
| 290 |
-
print(f" {row}")
|
| 291 |
-
except:
|
| 292 |
-
pass
|
| 293 |
-
|
| 294 |
-
# Execute actual query
|
| 295 |
df = pd.read_sql_query(query, conn, params=params)
|
| 296 |
|
| 297 |
if df.empty:
|
|
@@ -316,7 +305,7 @@ def run_raw_query(sql_query):
|
|
| 316 |
|
| 317 |
try:
|
| 318 |
with get_db_connection() as conn:
|
| 319 |
-
# Show query plan
|
| 320 |
try:
|
| 321 |
explain_result = conn.execute("EXPLAIN QUERY PLAN " + sql_query).fetchall()
|
| 322 |
print("Query Plan:")
|
|
@@ -339,7 +328,6 @@ def run_raw_query(sql_query):
|
|
| 339 |
def get_semantic_profile(word, lang='en'):
|
| 340 |
"""
|
| 341 |
HIGHLY OPTIMIZED: Single query with UNION ALL for all relations at once.
|
| 342 |
-
Uses indexed columns for maximum speed.
|
| 343 |
"""
|
| 344 |
if not word:
|
| 345 |
return "⚠️ Please enter a word."
|
|
@@ -374,7 +362,6 @@ def get_semantic_profile(word, lang='en'):
|
|
| 374 |
en.label as target_label,
|
| 375 |
e.weight as weight
|
| 376 |
FROM edge e
|
| 377 |
-
INDEXED BY (SELECT name FROM pragma_index_list('edge') LIMIT 1)
|
| 378 |
INNER JOIN node s ON e.start_id = s.id
|
| 379 |
INNER JOIN node en ON e.end_id = en.id
|
| 380 |
INNER JOIN relation r ON e.rel_id = r.id
|
|
@@ -485,9 +472,14 @@ verify_indices()
|
|
| 485 |
|
| 486 |
with gr.Blocks(title="ConceptNet SQLite Explorer", theme=gr.themes.Soft()) as demo:
|
| 487 |
gr.Markdown("# 🧠 ConceptNet SQLite Explorer")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 488 |
gr.Markdown(
|
| 489 |
-
f"**Database:** `{os.path.basename(DB_PATH)}` ({
|
| 490 |
-
f"**Status:** {
|
| 491 |
)
|
| 492 |
gr.Markdown("*Explore semantic relationships in ConceptNet with optimized indexed queries*")
|
| 493 |
|
|
@@ -585,7 +577,7 @@ with gr.Blocks(title="ConceptNet SQLite Explorer", theme=gr.themes.Soft()) as de
|
|
| 585 |
|
| 586 |
raw_sql_input = gr.Textbox(
|
| 587 |
label="SQL Query",
|
| 588 |
-
placeholder="SELECT s.label, r.label,
|
| 589 |
lines=6,
|
| 590 |
info="Write SELECT query"
|
| 591 |
)
|
|
@@ -602,11 +594,11 @@ with gr.Blocks(title="ConceptNet SQLite Explorer", theme=gr.themes.Soft()) as de
|
|
| 602 |
"```\n\n"
|
| 603 |
"**Find strongest connections:**\n"
|
| 604 |
"```sql\n"
|
| 605 |
-
"SELECT s.label, r.label,
|
| 606 |
-
"FROM edge \n"
|
| 607 |
-
"JOIN node s ON
|
| 608 |
-
"JOIN relation r ON
|
| 609 |
-
"JOIN node
|
| 610 |
"ORDER BY weight DESC LIMIT 20\n"
|
| 611 |
"```\n\n"
|
| 612 |
"**Check index usage:**\n"
|
|
@@ -630,10 +622,11 @@ with gr.Blocks(title="ConceptNet SQLite Explorer", theme=gr.themes.Soft()) as de
|
|
| 630 |
schema_output = gr.Markdown("*Click button to load schema...*")
|
| 631 |
|
| 632 |
gr.Markdown("---")
|
|
|
|
| 633 |
gr.Markdown(
|
| 634 |
-
"💡 **Performance:** Queries use database indices for fast lookups. "
|
| 635 |
-
"Exact matches are faster than wildcards. "
|
| 636 |
-
f"{'β
|
| 637 |
)
|
| 638 |
|
| 639 |
# Connect UI to functions
|
|
|
|
| 35 |
|
| 36 |
# Download the entire index folder for better performance
|
| 37 |
try:
|
|
|
|
| 38 |
CACHE_DIR = snapshot_download(
|
| 39 |
repo_id=REPO_ID,
|
| 40 |
repo_type="dataset",
|
|
|
|
| 43 |
INDEX_PATH = os.path.join(CACHE_DIR, INDEX_FOLDER)
|
| 44 |
print(f"Index files downloaded to: {INDEX_PATH}")
|
| 45 |
|
|
|
|
| 46 |
if os.path.exists(INDEX_PATH):
|
| 47 |
index_files = list(Path(INDEX_PATH).glob("*.ldb"))
|
| 48 |
print(f"Found {len(index_files)} index files (.ldb)")
|
|
|
|
| 62 |
|
| 63 |
# Apply PRAGMA optimizations for read performance
|
| 64 |
pragmas = [
|
| 65 |
+
("PRAGMA query_only = ON", True),
|
| 66 |
+
("PRAGMA temp_store = MEMORY", True),
|
| 67 |
+
("PRAGMA cache_size = -128000", True), # 128MB cache
|
| 68 |
+
("PRAGMA page_size = 8192", False), # Can't change on existing DB
|
| 69 |
+
("PRAGMA mmap_size = 2147483648", True), # 2GB memory-mapped I/O
|
| 70 |
+
("PRAGMA synchronous = OFF", True),
|
| 71 |
+
("PRAGMA locking_mode = NORMAL", True),
|
| 72 |
+
("PRAGMA threads = 4", True),
|
|
|
|
| 73 |
]
|
| 74 |
|
| 75 |
+
for pragma, critical in pragmas:
|
| 76 |
try:
|
| 77 |
conn.execute(pragma)
|
| 78 |
except sqlite3.OperationalError as e:
|
| 79 |
+
if critical:
|
| 80 |
+
print(f"Warning: Could not apply {pragma}: {e}")
|
| 81 |
|
| 82 |
return conn
|
| 83 |
except Exception as e:
|
|
|
|
| 115 |
cursor.execute(f"PRAGMA index_info({idx_name})")
|
| 116 |
idx_cols = cursor.fetchall()
|
| 117 |
|
| 118 |
+
cols = [col[2] for col in idx_cols if col[2]]
|
| 119 |
unique_str = "UNIQUE" if unique else "NON-UNIQUE"
|
| 120 |
+
print(f" ββ {idx_name} ({unique_str}) on columns: {', '.join(cols) if cols else 'id'}")
|
| 121 |
total_indices += 1
|
| 122 |
|
| 123 |
print(f"\n✅ Total indices found: {total_indices}")
|
|
|
|
| 128 |
if fts:
|
| 129 |
print(f"✅ Full-Text Search enabled: {[f[0] for f in fts]}")
|
| 130 |
|
| 131 |
+
# Check for FTS tables
|
| 132 |
+
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name LIKE '%fts%'")
|
| 133 |
+
fts_tables = cursor.fetchall()
|
| 134 |
+
if fts_tables:
|
| 135 |
+
print(f"✅ FTS tables found: {[t[0] for t in fts_tables]}")
|
| 136 |
+
|
| 137 |
+
# Check database stats
|
| 138 |
cursor.execute("PRAGMA page_size")
|
| 139 |
page_size = cursor.fetchone()[0]
|
| 140 |
cursor.execute("PRAGMA cache_size")
|
|
|
|
| 142 |
print(f"\nπ Page size: {page_size} bytes")
|
| 143 |
print(f"π Cache size: {abs(cache_size)} KB" if cache_size < 0 else f"π Cache size: {cache_size} pages")
|
| 144 |
|
|
|
|
| 145 |
cursor.execute("SELECT page_count * page_size as size FROM pragma_page_count(), pragma_page_size()")
|
| 146 |
db_size = cursor.fetchone()[0]
|
| 147 |
print(f"📦 Database size: {db_size / 1024 / 1024 / 1024:.2f} GB")
|
|
|
|
| 208 |
|
| 209 |
unique_badge = "π UNIQUE" if unique else "π INDEX"
|
| 210 |
schema_md += f"- **{idx_name}** {unique_badge}\n"
|
| 211 |
+
schema_md += f" - Columns: `{', '.join(cols) if cols else 'id'}`\n"
|
| 212 |
schema_md += f" - Origin: {origin}\n"
|
| 213 |
|
| 214 |
schema_md += "\n---\n\n"
|
|
|
|
| 247 |
params = []
|
| 248 |
|
| 249 |
try:
|
| 250 |
+
# Build WHERE conditions
|
| 251 |
if start_node:
|
| 252 |
if "%" in start_node:
|
| 253 |
where_conditions.append("s.id LIKE ?")
|
| 254 |
params.append(start_node)
|
| 255 |
else:
|
|
|
|
| 256 |
where_conditions.append("s.id LIKE ?")
|
| 257 |
params.append(f"%{start_node}%")
|
| 258 |
|
|
|
|
| 261 |
where_conditions.append("r.label LIKE ?")
|
| 262 |
params.append(relation)
|
| 263 |
else:
|
|
|
|
| 264 |
where_conditions.append("r.label = ?")
|
| 265 |
params.append(relation)
|
| 266 |
|
|
|
|
| 275 |
if where_conditions:
|
| 276 |
query += " WHERE " + " AND ".join(where_conditions)
|
| 277 |
|
|
|
|
| 278 |
query += " ORDER BY e.weight DESC LIMIT ?"
|
| 279 |
params.append(limit)
|
| 280 |
|
| 281 |
print(f"Executing SQL with {len(params)} parameters")
|
| 282 |
|
| 283 |
with get_db_connection() as conn:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
df = pd.read_sql_query(query, conn, params=params)
|
| 285 |
|
| 286 |
if df.empty:
|
|
|
|
| 305 |
|
| 306 |
try:
|
| 307 |
with get_db_connection() as conn:
|
| 308 |
+
# Show query plan for debugging
|
| 309 |
try:
|
| 310 |
explain_result = conn.execute("EXPLAIN QUERY PLAN " + sql_query).fetchall()
|
| 311 |
print("Query Plan:")
|
|
|
|
| 328 |
def get_semantic_profile(word, lang='en'):
|
| 329 |
"""
|
| 330 |
HIGHLY OPTIMIZED: Single query with UNION ALL for all relations at once.
|
|
|
|
| 331 |
"""
|
| 332 |
if not word:
|
| 333 |
return "⚠️ Please enter a word."
|
|
|
|
| 362 |
en.label as target_label,
|
| 363 |
e.weight as weight
|
| 364 |
FROM edge e
|
|
|
|
| 365 |
INNER JOIN node s ON e.start_id = s.id
|
| 366 |
INNER JOIN node en ON e.end_id = en.id
|
| 367 |
INNER JOIN relation r ON e.rel_id = r.id
|
|
|
|
| 472 |
|
| 473 |
with gr.Blocks(title="ConceptNet SQLite Explorer", theme=gr.themes.Soft()) as demo:
|
| 474 |
gr.Markdown("# 🧠 ConceptNet SQLite Explorer")
|
| 475 |
+
|
| 476 |
+
# Status info
|
| 477 |
+
db_size_gb = os.path.getsize(DB_PATH) / 1024 / 1024 / 1024 if os.path.exists(DB_PATH) else 0
|
| 478 |
+
index_status = '✅ Indices Loaded' if INDEX_PATH and os.path.exists(INDEX_PATH) else '⚠️ No Index Cache'
|
| 479 |
+
|
| 480 |
gr.Markdown(
|
| 481 |
+
f"**Database:** `{os.path.basename(DB_PATH)}` ({db_size_gb:.2f} GB) | "
|
| 482 |
+
f"**Status:** {index_status}"
|
| 483 |
)
|
| 484 |
gr.Markdown("*Explore semantic relationships in ConceptNet with optimized indexed queries*")
|
| 485 |
|
|
|
|
| 577 |
|
| 578 |
raw_sql_input = gr.Textbox(
|
| 579 |
label="SQL Query",
|
| 580 |
+
placeholder="SELECT s.label, r.label, en.label FROM edge e JOIN node s ON e.start_id = s.id JOIN relation r ON e.rel_id = r.id JOIN node en ON e.end_id = en.id WHERE s.label = 'dog' LIMIT 10",
|
| 581 |
lines=6,
|
| 582 |
info="Write SELECT query"
|
| 583 |
)
|
|
|
|
| 594 |
"```\n\n"
|
| 595 |
"**Find strongest connections:**\n"
|
| 596 |
"```sql\n"
|
| 597 |
+
"SELECT s.label, r.label, en.label, e.weight\n"
|
| 598 |
+
"FROM edge e\n"
|
| 599 |
+
"JOIN node s ON e.start_id = s.id\n"
|
| 600 |
+
"JOIN relation r ON e.rel_id = r.id\n"
|
| 601 |
+
"JOIN node en ON e.end_id = en.id\n"
|
| 602 |
"ORDER BY weight DESC LIMIT 20\n"
|
| 603 |
"```\n\n"
|
| 604 |
"**Check index usage:**\n"
|
|
|
|
| 622 |
schema_output = gr.Markdown("*Click button to load schema...*")
|
| 623 |
|
| 624 |
gr.Markdown("---")
|
| 625 |
+
index_count = len(list(Path(INDEX_PATH).glob("*.ldb"))) if INDEX_PATH and os.path.exists(INDEX_PATH) else 0
|
| 626 |
gr.Markdown(
|
| 627 |
+
f"💡 **Performance:** Queries use database indices for fast lookups. "
|
| 628 |
+
f"Exact matches are faster than wildcards. "
|
| 629 |
+
f"{'✅ ' + str(index_count) + ' index files loaded.' if index_count > 0 else '⚠️ Running without index cache.'}"
|
| 630 |
)
|
| 631 |
|
| 632 |
# Connect UI to functions
|