Bohaska (aider) committed on
Commit
20e9110
·
1 Parent(s): 3a373f3

refactor: Rename search types and update file paths and examples.

Browse files
app.py CHANGED
@@ -3,25 +3,24 @@ from FlagEmbedding import BGEM3FlagModel
3
  import numpy as np
4
  import json
5
  import os
 
6
 
7
  # --- Configuration and Global Data Loading ---
8
 
9
  # Determine the directory of the script to load files relative to it
10
  script_dir = os.path.dirname(os.path.abspath(__file__))
11
 
12
- # Define paths for issue embedding types (removed colbert)
13
  issue_embeddings_paths = {
14
- 'fuzzy': os.path.join(script_dir, 'ns_issues_dense_bge-m3.npy'),
15
- 'direct': os.path.join(script_dir, 'ns_issues_sparse_bge-m3.npy'),
16
- # 'mixed': os.path.join(script_dir, 'ns_issues_colbert_bge-m3.npy') # Removed
17
  }
18
  issue_titles_path = os.path.join(script_dir, 'issue_titles.json')
19
 
20
- # Define paths for GA resolution embedding types (removed colbert)
21
  ga_embeddings_paths = {
22
- 'fuzzy': os.path.join(script_dir, 'ns_ga_resolutions_dense_bge-m3.npy'),
23
- 'direct': os.path.join(script_dir, 'ns_ga_resolutions_sparse_bge-m3.npy'),
24
- # 'mixed': os.path.join(script_dir, 'ns_ga_resolutions_colbert_bge-m3.npy') # Removed
25
  }
26
  ga_resolutions_path = os.path.join(script_dir, 'parsed_ga_resolutions.json')
27
 
@@ -36,13 +35,13 @@ except Exception as e:
36
  print("Please ensure you have an internet connection or the model is cached locally.")
37
  model = None # Indicate model loading failed
38
 
39
- # Issue data storage for all types (removed colbert)
40
  issue_all_embeddings = {
41
- 'fuzzy': None,
42
- 'direct': None,
43
- # 'mixed': None # Removed
44
  }
45
  issue_titles = {}
 
46
 
47
  print("Loading issue data...")
48
  try:
@@ -50,7 +49,7 @@ try:
50
  # Load available embedding types for issues
51
  for embed_type, path in issue_embeddings_paths.items():
52
  if os.path.exists(path):
53
- if embed_type == 'direct': # Only sparse is loaded as list of objects now
54
  # Load sparse dictionaries: it's a NumPy object array, convert to list of objects
55
  issue_all_embeddings[embed_type] = np.load(path, allow_pickle=True).tolist()
56
  else: # Dense
@@ -64,6 +63,37 @@ try:
64
  with open(issue_titles_path, encoding='utf-8') as file:
65
  issue_titles = json.load(file)
66
  print(f"Issue data loaded: {len(issue_titles)} issues.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  except FileNotFoundError as e:
68
  print(f"Error loading issue data: {e}")
69
  print(
@@ -71,11 +101,10 @@ except FileNotFoundError as e:
71
  except Exception as e:
72
  print(f"Error loading issue data: {e}")
73
 
74
- # GA resolution data storage for all types (removed colbert)
75
  ga_all_embeddings = {
76
- 'fuzzy': None,
77
- 'direct': None,
78
- # 'mixed': None # Removed
79
  }
80
  ga_resolutions_data = []
81
 
@@ -85,7 +114,7 @@ try:
85
  # Load available embedding types for GA resolutions
86
  for embed_type, path in ga_embeddings_paths.items():
87
  if os.path.exists(path):
88
- if embed_type == 'direct': # Only sparse is loaded as list of objects now
89
  ga_all_embeddings[embed_type] = np.load(path, allow_pickle=True).tolist()
90
  else: # Dense
91
  ga_all_embeddings[embed_type] = np.load(path)
@@ -108,9 +137,9 @@ except Exception as e:
108
 
109
  # --- Search Functions ---
110
 
111
- def _perform_search(search_term: str, corpus_embeddings_dict: dict, corpus_titles_map_or_list, search_type: str):
112
  """
113
- Helper function to perform a search given the search term, corpus embeddings, and search type.
114
  Returns sorted list of (index, similarity_score).
115
  """
116
  if not model:
@@ -121,22 +150,20 @@ def _perform_search(search_term: str, corpus_embeddings_dict: dict, corpus_title
121
  corpus_embeddings = corpus_embeddings_dict.get(search_type)
122
  if corpus_embeddings is None:
123
  raise ValueError(f"Corpus data for search type '{search_type}' not loaded. Cannot perform search.")
124
- if not corpus_titles_map_or_list:
125
- raise ValueError("Corpus titles/data not loaded. Cannot perform search.")
126
 
127
  # Encode the search term for relevant types
128
  query_embeddings = model.encode([search_term],
129
  return_dense=True,
130
- return_sparse=True, # This will return 'lexical_weights' for BGE-M3
131
- return_colbert_vecs=False) # <--- REMOVED COLBERT GENERATION FOR QUERY
132
 
133
- similarity_scores = [] # Use a list to collect scores, then convert to numpy array
134
 
135
- if search_type == 'fuzzy':
136
  query_vec = query_embeddings['dense_vecs'] # Shape: (1, embedding_dim)
137
  # Perform dot product for dense similarity
138
  similarity_scores = (query_vec @ corpus_embeddings.T)[0] # Result shape: (num_docs,)
139
- elif search_type == 'direct':
140
  # 'lexical_weights' is a list of dictionaries, even for a single query.
141
  # We need the first (and only) dictionary from this list.
142
  if 'lexical_weights' not in query_embeddings or not query_embeddings['lexical_weights']:
@@ -148,17 +175,9 @@ def _perform_search(search_term: str, corpus_embeddings_dict: dict, corpus_title
148
  score = model.compute_lexical_matching_score(query_sparse_dict, doc_sparse_dict)
149
  similarity_scores.append(score)
150
  similarity_scores = np.array(similarity_scores) # Convert to numpy array
151
- # Removed 'mixed' (ColBERT) search type
152
- # elif search_type == 'mixed':
153
- # if 'colbert_vecs' not in query_embeddings or not query_embeddings['colbert_vecs']:
154
- # raise ValueError("ColBERT vectors not returned for query. Model or configuration issue.")
155
- # query_colbert_vec = query_embeddings['colbert_vecs'][0]
156
- # for doc_colbert_vec in corpus_embeddings:
157
- # score = model.colbert_score(query_colbert_vec, doc_colbert_vec)
158
- # similarity_scores.append(score)
159
- # similarity_scores = np.array(similarity_scores)
160
  else:
161
- raise ValueError(f"Unsupported search type: {search_type}")
 
162
 
163
  # Pair index with similarity score
164
  indexed_similarities = [(i, score) for i, score in enumerate(similarity_scores)]
@@ -168,89 +187,196 @@ def _perform_search(search_term: str, corpus_embeddings_dict: dict, corpus_title
168
 
169
  return sorted_similarities
170
 
 
 
 
 
 
 
 
 
171
 
172
- def get_issue_similarity_rankings(search_term: str, search_type: str = 'fuzzy'):
173
- """Searches issues and returns formatted results."""
174
- try:
175
- sorted_similarities = _perform_search(search_term, issue_all_embeddings, issue_titles, search_type)
 
 
 
 
 
 
 
 
 
176
 
177
- similarity_text = f"# Top 20 Issue Search Results ({search_type.capitalize()})\n"
178
- if not sorted_similarities:
179
- return similarity_text + "No issues found."
 
 
 
180
 
181
- search_ranking = 1
182
- # Get top 20 results
183
- for index, sim_score in sorted_similarities[:20]:
184
- # issue_titles is a dict, needs string key
185
- issue_title = issue_titles.get(str(index), f"Unknown Issue (Index {index})")
186
- similarity_text += f"{search_ranking}. {issue_title}, Similarity: {sim_score:.4f}\n"
187
- search_ranking += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
 
189
- return similarity_text
190
  except Exception as e:
191
  return f"An error occurred during issue search: {e}"
192
 
193
 
194
  def search_ga_resolutions(search_term: str, hide_repealed: bool, hide_repeal_category: bool,
195
- search_type: str = 'fuzzy'):
196
  """
197
  Searches GA resolutions, filters repealed and/or repeal category if requested,
198
  and returns formatted results with links and status.
199
  """
200
  try:
201
- raw_sorted_similarities = _perform_search(search_term, ga_all_embeddings, ga_resolutions_data, search_type)
202
-
203
- # --- Filtering ---
204
- filtered_indexed_similarities = []
205
- for index, score in raw_sorted_similarities:
206
- # Ensure index is valid
207
- if index < len(ga_resolutions_data):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  resolution = ga_resolutions_data[index]
 
 
 
209
  status = resolution.get('status')
210
- category = resolution.get('category')
211
-
212
- # Apply filters
213
- if hide_repealed and status == "Repealed":
214
- continue
215
- if hide_repeal_category and category == "Repeal":
216
- continue
217
-
218
- filtered_indexed_similarities.append((index, score))
219
-
220
- # The list is already sorted, no re-sort needed after filtering.
221
-
222
- # --- Formatting Results ---
223
- similarity_text = f"# Top 20 GA Resolution Search Results ({search_type.capitalize()})\n"
224
- if not filtered_indexed_similarities:
225
- status_msgs = []
226
- if hide_repealed: status_msgs.append("Repealed")
227
- if hide_repeal_category: status_msgs.append("Repeal Category")
228
-
229
- filter_msg = " (Filtered out " + " and ".join(status_msgs) + ")" if status_msgs else ""
230
- return similarity_text + f"No matching resolutions found{filter_msg}."
231
-
232
- search_ranking = 1
233
- # Get top 20 results from the sorted and filtered list
234
- for index, sim_score in filtered_indexed_similarities[:20]:
235
- resolution = ga_resolutions_data[index]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
237
- title = resolution.get('title', 'Untitled Resolution')
238
- res_id = resolution.get('id', 'N/A')
239
- council = resolution.get('council', 1)
240
- status = resolution.get('status')
241
 
242
- # Add [REPEALED] marker if the status is "Repealed"
243
- status_marker = "[REPEALED] " if status == "Repealed" else ""
244
 
245
- # Construct the NationStates URL
246
- url = f"https://www.nationstates.net/page=WA_past_resolution/id={res_id}/council={council}"
247
 
248
- # Format as Markdown link with the status marker
249
- similarity_text += f"{search_ranking}. {status_marker}[#{res_id} {title}]({url}), Similarity: {sim_score:.4f}\n"
250
 
251
- search_ranking += 1
252
 
253
- return similarity_text
254
  except Exception as e:
255
  return f"An error occurred during GA resolution search: {e}"
256
 
@@ -281,20 +407,22 @@ with gr.Blocks() as demo:
281
  fn=get_issue_similarity_rankings,
282
  inputs=[
283
  gr.Textbox(label="Search term", placeholder="What issue are you looking for?"),
284
- gr.Radio(["fuzzy", "direct"], label="Search Type", value="fuzzy", # <--- Removed 'mixed'
285
- info="Choose search type.") # <--- Updated info message
286
  ],
287
  outputs=gr.Markdown(),
288
  examples=[
289
  # Examples for Issue Search (search_term, search_type)
290
- ["coffee", "fuzzy"],
291
- ["land value tax", "direct"],
292
- ["Elon Musk", "direct"],
293
  ["After an corrupted election, citizens demand the real results, after discovering it was rigged. ",
294
- "fuzzy"],
295
  [
296
  "Eureka! A new scientific law regarding the universe's expansion may have just been discovered at the University of @@CAPITAL@@. Unfortunately, tempers are flaring over who should get naming credit. Maxtopian grad student Georgie Bubble claims the work alone while his boss Dr.@@RANDOMNAME1@@ claims that all work in the University is @@NAME@@’s collectively. Your Minister of Education has elevated this to your desk.",
297
- "fuzzy"],
 
 
298
  ],
299
  title=None,
300
  description=None,
@@ -312,8 +440,8 @@ with gr.Blocks() as demo:
312
  ga_search_term_input = gr.Textbox(label="Search term", placeholder="What are you looking for?")
313
  ga_hide_repealed_checkbox = gr.Checkbox(value=True, label="Hide repealed resolutions")
314
  ga_hide_repeal_category_checkbox = gr.Checkbox(value=True, label="Hide repeals")
315
- ga_search_type_radio = gr.Radio(["fuzzy", "direct"], label="Search Type", value="fuzzy", # <--- Removed 'mixed'
316
- info="Choose search type.")
317
 
318
  ga_search_interface = gr.Interface(
319
  fn=search_ga_resolutions,
@@ -327,11 +455,13 @@ with gr.Blocks() as demo:
327
  outputs=gr.Markdown(),
328
  examples=[
329
  # Examples for GA Resolution Search (search_term, hide_repealed, hide_repeal_category, search_type)
330
- ["condemn genocide", True, True, "fuzzy"],
331
- ["rights of animals", True, True, "direct"],
332
- ["regulating space mining", True, True, "fuzzy"],
333
- ["founding of the World Assembly", True, True, "fuzzy"],
334
- ["environmental protection", True, True, "fuzzy"],
 
 
335
  ],
336
  title=None,
337
  description=None,
@@ -343,4 +473,4 @@ with gr.Blocks() as demo:
343
  if __name__ == "__main__":
344
  # Set share=True to make the app accessible externally (requires ngrok)
345
  # share=False is default and runs locally
346
- demo.launch()
 
3
  import numpy as np
4
  import json
5
  import os
6
+ import re # Added for strict search context extraction
7
 
8
  # --- Configuration and Global Data Loading ---
9
 
10
  # Determine the directory of the script to load files relative to it
11
  script_dir = os.path.dirname(os.path.abspath(__file__))
12
 
13
+ # Define paths for issue embedding types
14
  issue_embeddings_paths = {
15
+ 'semantic': os.path.join(script_dir, 'ns_issues_semantic_bge-m3.npy'), # Renamed from fuzzy
16
+ 'loose': os.path.join(script_dir, 'ns_issues_loose_bge-m3.npy'), # Renamed from direct
 
17
  }
18
  issue_titles_path = os.path.join(script_dir, 'issue_titles.json')
19
 
20
+ # Define paths for GA resolution embedding types
21
  ga_embeddings_paths = {
22
+ 'semantic': os.path.join(script_dir, 'ns_ga_resolutions_semantic_bge-m3.npy'), # Renamed from fuzzy
23
+ 'loose': os.path.join(script_dir, 'ns_ga_resolutions_loose_bge-m3.npy'), # Renamed from direct
 
24
  }
25
  ga_resolutions_path = os.path.join(script_dir, 'parsed_ga_resolutions.json')
26
 
 
35
  print("Please ensure you have an internet connection or the model is cached locally.")
36
  model = None # Indicate model loading failed
37
 
38
+ # Issue data storage for all types
39
  issue_all_embeddings = {
40
+ 'semantic': None,
41
+ 'loose': None,
 
42
  }
43
  issue_titles = {}
44
+ all_issue_raw_texts = [] # New: To store raw issue texts for strict search
45
 
46
  print("Loading issue data...")
47
  try:
 
49
  # Load available embedding types for issues
50
  for embed_type, path in issue_embeddings_paths.items():
51
  if os.path.exists(path):
52
+ if embed_type == 'loose': # Only sparse is loaded as list of objects now
53
  # Load sparse dictionaries: it's a NumPy object array, convert to list of objects
54
  issue_all_embeddings[embed_type] = np.load(path, allow_pickle=True).tolist()
55
  else: # Dense
 
63
  with open(issue_titles_path, encoding='utf-8') as file:
64
  issue_titles = json.load(file)
65
  print(f"Issue data loaded: {len(issue_titles)} issues.")
66
+
67
+ # --- Load raw issue texts for strict search ---
68
+ # The issue text files are in 'small_scripts/make_embedding/002 - Issue Megalist (MAIN) copy/'
69
+ issues_input_dir = os.path.join(script_dir, 'small_scripts', 'make_embedding', '002 - Issue Megalist (MAIN) copy')
70
+
71
+ # Replicate get_issue_files logic from embedding.py to ensure correct order
72
+ issue_files_for_raw_load = []
73
+ file_pattern = re.compile(r'(\d+) TO (\d+)\.txt')
74
+
75
+ if os.path.isdir(issues_input_dir):
76
+ for filename in os.listdir(issues_input_dir):
77
+ if filename.endswith('.txt'):
78
+ match = file_pattern.match(filename)
79
+ if match:
80
+ start_num = int(match.group(1))
81
+ issue_files_for_raw_load.append((start_num, filename))
82
+ issue_files_for_raw_load.sort(key=lambda x: x[0])
83
+ issue_files_for_raw_load = [os.path.join(issues_input_dir, filename) for _, filename in issue_files_for_raw_load]
84
+
85
+ for filepath in issue_files_for_raw_load:
86
+ with open(filepath, 'r', encoding='utf-8') as file:
87
+ issues_text_in_file = file.read()
88
+ # Split issues by the separator and remove any empty strings resulting from multiple separators
89
+ issues_list_in_file = [
90
+ issue.strip() for issue in issues_text_in_file.split("[hr][/hr]") if issue.strip()
91
+ ]
92
+ all_issue_raw_texts.extend(issues_list_in_file)
93
+ print(f" Loaded {len(all_issue_raw_texts)} raw issue texts for strict search.")
94
+ else:
95
+ print(f" Warning: Issue text directory '{issues_input_dir}' not found. Strict issue search will not work.")
96
+
97
  except FileNotFoundError as e:
98
  print(f"Error loading issue data: {e}")
99
  print(
 
101
  except Exception as e:
102
  print(f"Error loading issue data: {e}")
103
 
104
+ # GA resolution data storage for all types
105
  ga_all_embeddings = {
106
+ 'semantic': None,
107
+ 'loose': None,
 
108
  }
109
  ga_resolutions_data = []
110
 
 
114
  # Load available embedding types for GA resolutions
115
  for embed_type, path in ga_embeddings_paths.items():
116
  if os.path.exists(path):
117
+ if embed_type == 'loose': # Only sparse is loaded as list of objects now
118
  ga_all_embeddings[embed_type] = np.load(path, allow_pickle=True).tolist()
119
  else: # Dense
120
  ga_all_embeddings[embed_type] = np.load(path)
 
137
 
138
  # --- Search Functions ---
139
 
140
+ def _perform_search(search_term: str, corpus_embeddings_dict: dict, search_type: str):
141
  """
142
+ Helper function to perform an embedding-based search given the search term, corpus embeddings, and search type.
143
  Returns sorted list of (index, similarity_score).
144
  """
145
  if not model:
 
150
  corpus_embeddings = corpus_embeddings_dict.get(search_type)
151
  if corpus_embeddings is None:
152
  raise ValueError(f"Corpus data for search type '{search_type}' not loaded. Cannot perform search.")
 
 
153
 
154
  # Encode the search term for relevant types
155
  query_embeddings = model.encode([search_term],
156
  return_dense=True,
157
+ return_sparse=True,
158
+ return_colbert_vecs=False)
159
 
160
+ similarity_scores = []
161
 
162
+ if search_type == 'semantic': # Renamed from 'fuzzy'
163
  query_vec = query_embeddings['dense_vecs'] # Shape: (1, embedding_dim)
164
  # Perform dot product for dense similarity
165
  similarity_scores = (query_vec @ corpus_embeddings.T)[0] # Result shape: (num_docs,)
166
+ elif search_type == 'loose': # Renamed from 'direct'
167
  # 'lexical_weights' is a list of dictionaries, even for a single query.
168
  # We need the first (and only) dictionary from this list.
169
  if 'lexical_weights' not in query_embeddings or not query_embeddings['lexical_weights']:
 
175
  score = model.compute_lexical_matching_score(query_sparse_dict, doc_sparse_dict)
176
  similarity_scores.append(score)
177
  similarity_scores = np.array(similarity_scores) # Convert to numpy array
 
 
 
 
 
 
 
 
 
178
  else:
179
+ # This function should only be called for embedding-based searches
180
+ raise ValueError(f"Unsupported embedding search type: {search_type}")
181
 
182
  # Pair index with similarity score
183
  indexed_similarities = [(i, score) for i, score in enumerate(similarity_scores)]
 
187
 
188
  return sorted_similarities
189
 
190
+ def _extract_context(text: str, query: str, context_chars: int = 100):
191
+ """Extracts surrounding context for a given query in text, split on newlines."""
192
+ text_lower = text.lower()
193
+ query_lower = query.lower()
194
+
195
+ start_index = text_lower.find(query_lower)
196
+ if start_index == -1:
197
+ return "" # Query not found, should not happen if we got here
198
 
199
+ end_index = start_index + len(query)
200
+
201
+ # Find start of context
202
+ context_start = max(0, start_index - context_chars)
203
+ # Find end of context
204
+ context_end = min(len(text), end_index + context_chars)
205
+
206
+ # Adjust context_start to the nearest newline before it
207
+ if context_start > 0:
208
+ pre_context = text[0:context_start]
209
+ last_newline_before_start = pre_context.rfind('\n')
210
+ if last_newline_before_start != -1:
211
+ context_start = last_newline_before_start + 1
212
 
213
+ # Adjust context_end to the nearest newline after it
214
+ if context_end < len(text):
215
+ post_context = text[context_end:len(text)]
216
+ first_newline_after_end = post_context.find('\n')
217
+ if first_newline_after_end != -1:
218
+ context_end = context_end + first_newline_after_end
219
 
220
+ extracted_text = text[context_start:context_end]
221
+
222
+ # Highlight the query using regex for case-insensitive replacement
223
+ highlighted_text = re.sub(re.escape(query), lambda m: f"**{m.group(0)}**", extracted_text, flags=re.IGNORECASE, count=1)
224
+
225
+ return f"```\n{highlighted_text}\n```"
226
+
227
+
228
+ def get_issue_similarity_rankings(search_term: str, search_type: str = 'semantic'): # Renamed default
229
+ """Searches issues and returns formatted results."""
230
+ try:
231
+ if not search_term:
232
+ return "Please enter a search term."
233
+
234
+ if search_type == 'strict':
235
+ if not all_issue_raw_texts:
236
+ return "Raw issue texts not loaded. Strict search is unavailable."
237
+
238
+ strict_matches = []
239
+ search_term_lower = search_term.lower()
240
+ for i, issue_text in enumerate(all_issue_raw_texts):
241
+ if search_term_lower in issue_text.lower():
242
+ strict_matches.append((i, 1.0)) # Use 1.0 as a dummy score for strict matches
243
+
244
+ similarity_text = f"# Top 20 Issue Search Results (Strict)\n"
245
+ if not strict_matches:
246
+ return similarity_text + "No exact matches found."
247
+
248
+ search_ranking = 1
249
+ for index, sim_score in strict_matches[:20]: # Still limit to top 20
250
+ issue_title = issue_titles.get(str(index), f"Unknown Issue (Index {index})")
251
+ context = _extract_context(all_issue_raw_texts[index], search_term)
252
+ similarity_text += f"{search_ranking}. {issue_title}, Match: {sim_score:.4f}\n{context}\n"
253
+ search_ranking += 1
254
+ return similarity_text
255
+
256
+ else: # Embedding-based search
257
+ sorted_similarities = _perform_search(search_term, issue_all_embeddings, search_type)
258
+
259
+ similarity_text = f"# Top 20 Issue Search Results ({search_type.capitalize()})\n"
260
+ if not sorted_similarities:
261
+ return similarity_text + "No issues found."
262
+
263
+ search_ranking = 1
264
+ for index, sim_score in sorted_similarities[:20]:
265
+ # issue_titles is a dict, needs string key
266
+ issue_title = issue_titles.get(str(index), f"Unknown Issue (Index {index})")
267
+ similarity_text += f"{search_ranking}. {issue_title}, Similarity: {sim_score:.4f}\n"
268
+ search_ranking += 1
269
+ return similarity_text
270
 
 
271
  except Exception as e:
272
  return f"An error occurred during issue search: {e}"
273
 
274
 
275
  def search_ga_resolutions(search_term: str, hide_repealed: bool, hide_repeal_category: bool,
276
+ search_type: str = 'semantic'): # Renamed default
277
  """
278
  Searches GA resolutions, filters repealed and/or repeal category if requested,
279
  and returns formatted results with links and status.
280
  """
281
  try:
282
+ if not search_term:
283
+ return "Please enter a search term."
284
+
285
+ if search_type == 'strict':
286
+ if not ga_resolutions_data:
287
+ return "GA resolution data not loaded. Strict search is unavailable."
288
+
289
+ strict_matches = []
290
+ search_term_lower = search_term.lower()
291
+ for i, resolution in enumerate(ga_resolutions_data):
292
+ resolution_body = resolution.get('body', '')
293
+ if search_term_lower in resolution_body.lower():
294
+ # Apply filters immediately for strict search
295
+ status = resolution.get('status')
296
+ category = resolution.get('category')
297
+ if hide_repealed and status == "Repealed":
298
+ continue
299
+ if hide_repeal_category and category == "Repeal":
300
+ continue
301
+ strict_matches.append((i, 1.0)) # Dummy score
302
+
303
+ similarity_text = f"# Top 20 GA Resolution Search Results (Strict)\n"
304
+ if not strict_matches:
305
+ status_msgs = []
306
+ if hide_repealed: status_msgs.append("Repealed")
307
+ if hide_repeal_category: status_msgs.append("Repeal Category")
308
+ filter_msg = " (Filtered out " + " and ".join(status_msgs) + ")" if status_msgs else ""
309
+ return similarity_text + f"No exact matches found{filter_msg}."
310
+
311
+ search_ranking = 1
312
+ for index, sim_score in strict_matches[:20]:
313
  resolution = ga_resolutions_data[index]
314
+ title = resolution.get('title', 'Untitled Resolution')
315
+ res_id = resolution.get('id', 'N/A')
316
+ council = resolution.get('council', 1)
317
  status = resolution.get('status')
318
+ status_marker = "[REPEALED] " if status == "Repealed" else ""
319
+ url = f"https://www.nationstates.net/page=WA_past_resolution/id={res_id}/council={council}"
320
+
321
+ context = _extract_context(resolution.get('body', ''), search_term)
322
+
323
+ similarity_text += f"{search_ranking}. {status_marker}[#{res_id} {title}]({url}), Match: {sim_score:.4f}\n{context}\n"
324
+ search_ranking += 1
325
+ return similarity_text
326
+
327
+ else: # Embedding-based search
328
+ raw_sorted_similarities = _perform_search(search_term, ga_all_embeddings, search_type)
329
+
330
+ # --- Filtering ---
331
+ filtered_indexed_similarities = []
332
+ for index, score in raw_sorted_similarities:
333
+ # Ensure index is valid
334
+ if index < len(ga_resolutions_data):
335
+ resolution = ga_resolutions_data[index]
336
+ status = resolution.get('status')
337
+ category = resolution.get('category')
338
+
339
+ # Apply filters
340
+ if hide_repealed and status == "Repealed":
341
+ continue
342
+ if hide_repeal_category and category == "Repeal":
343
+ continue
344
+ filtered_indexed_similarities.append((index, score))
345
+
346
+ # The list is already sorted, no re-sort needed after filtering.
347
+
348
+ # --- Formatting Results ---
349
+ similarity_text = f"# Top 20 GA Resolution Search Results ({search_type.capitalize()})\n"
350
+ if not filtered_indexed_similarities:
351
+ status_msgs = []
352
+ if hide_repealed: status_msgs.append("Repealed")
353
+ if hide_repeal_category: status_msgs.append("Repeal Category")
354
+
355
+ filter_msg = " (Filtered out " + " and ".join(status_msgs) + ")" if status_msgs else ""
356
+ return similarity_text + f"No matching resolutions found{filter_msg}."
357
+
358
+ search_ranking = 1
359
+ # Get top 20 results from the sorted and filtered list
360
+ for index, sim_score in filtered_indexed_similarities[:20]:
361
+ resolution = ga_resolutions_data[index]
362
 
363
+ title = resolution.get('title', 'Untitled Resolution')
364
+ res_id = resolution.get('id', 'N/A')
365
+ council = resolution.get('council', 1)
366
+ status = resolution.get('status')
367
 
368
+ # Add [REPEALED] marker if the status is "Repealed"
369
+ status_marker = "[REPEALED] " if status == "Repealed" else ""
370
 
371
+ # Construct the NationStates URL
372
+ url = f"https://www.nationstates.net/page=WA_past_resolution/id={res_id}/council={council}"
373
 
374
+ # Format as Markdown link with the status marker
375
+ similarity_text += f"{search_ranking}. {status_marker}[#{res_id} {title}]({url}), Similarity: {sim_score:.4f}\n"
376
 
377
+ search_ranking += 1
378
 
379
+ return similarity_text
380
  except Exception as e:
381
  return f"An error occurred during GA resolution search: {e}"
382
 
 
407
  fn=get_issue_similarity_rankings,
408
  inputs=[
409
  gr.Textbox(label="Search term", placeholder="What issue are you looking for?"),
410
+ gr.Radio(["semantic", "loose", "strict"], label="Search Type", value="semantic", # Updated options
411
+ info="Choose search type: 'semantic' for conceptual similarity, 'loose' for keyword matching, 'strict' for exact phrase matching.") # Updated info
412
  ],
413
  outputs=gr.Markdown(),
414
  examples=[
415
  # Examples for Issue Search (search_term, search_type)
416
+ ["coffee", "semantic"],
417
+ ["land value tax", "loose"],
418
+ ["Elon Musk", "loose"],
419
  ["After an corrupted election, citizens demand the real results, after discovering it was rigged. ",
420
+ "semantic"],
421
  [
422
  "Eureka! A new scientific law regarding the universe's expansion may have just been discovered at the University of @@CAPITAL@@. Unfortunately, tempers are flaring over who should get naming credit. Maxtopian grad student Georgie Bubble claims the work alone while his boss Dr.@@RANDOMNAME1@@ claims that all work in the University is @@NAME@@’s collectively. Your Minister of Education has elevated this to your desk.",
423
+ "semantic"],
424
+ ["tax", "strict"], # New example for strict
425
+ ["environmental protection", "strict"] # New example for strict
426
  ],
427
  title=None,
428
  description=None,
 
440
  ga_search_term_input = gr.Textbox(label="Search term", placeholder="What are you looking for?")
441
  ga_hide_repealed_checkbox = gr.Checkbox(value=True, label="Hide repealed resolutions")
442
  ga_hide_repeal_category_checkbox = gr.Checkbox(value=True, label="Hide repeals")
443
+ ga_search_type_radio = gr.Radio(["semantic", "loose", "strict"], label="Search Type", value="semantic", # Updated options
444
+ info="Choose search type: 'semantic' for conceptual similarity, 'loose' for keyword matching, 'strict' for exact phrase matching.") # Updated info
445
 
446
  ga_search_interface = gr.Interface(
447
  fn=search_ga_resolutions,
 
455
  outputs=gr.Markdown(),
456
  examples=[
457
  # Examples for GA Resolution Search (search_term, hide_repealed, hide_repeal_category, search_type)
458
+ ["condemn genocide", True, True, "semantic"],
459
+ ["rights of animals", True, True, "loose"],
460
+ ["regulating space mining", True, True, "semantic"],
461
+ ["founding of the World Assembly", True, True, "semantic"],
462
+ ["environmental protection", True, True, "semantic"],
463
+ ["human rights", True, True, "strict"], # New example for strict
464
+ ["World Assembly", True, True, "strict"] # New example for strict
465
  ],
466
  title=None,
467
  description=None,
 
473
  if __name__ == "__main__":
474
  # Set share=True to make the app accessible externally (requires ngrok)
475
  # share=False is default and runs locally
476
+ demo.launch()
small_scripts/make_embedding/embedding.py CHANGED
@@ -125,10 +125,10 @@ def encode_issues():
125
  return_sparse=True, # This will return 'lexical_weights' for BGE-M3
126
  return_colbert_vecs=False) # <--- REMOVED COLBERT GENERATION
127
 
128
- # Save Dense Embeddings
129
  np.save(file_cache_dense_path, embeddings['dense_vecs'])
130
 
131
- # --- Save Sparse Embeddings ---
132
  # 'lexical_weights' is a list of dictionaries, one for each item in the batch
133
  sparse_list_of_dicts = embeddings.get('lexical_weights')
134
 
@@ -136,10 +136,6 @@ def encode_issues():
136
  # This allows storing Python objects (dictionaries) in a NumPy array.
137
  np.save(file_cache_sparse_path, np.array(sparse_list_of_dicts, dtype=object), allow_pickle=True)
138
 
139
- # Removed saving ColBERT Embeddings
140
- # colbert_list_of_arrays = embeddings.get('colbert_vecs')
141
- # np.save(file_cache_colbert_path, np.array(colbert_list_of_arrays, dtype=object), allow_pickle=True)
142
-
143
 
144
  print(f" Encoded and cached {len(issues_list_in_file)} issues from {filename}.")
145
 
@@ -152,8 +148,8 @@ def encode_issues():
152
  print("\n--- Consolidation Phase: Combining cached embeddings ---")
153
 
154
  # Initialize lists to collect all embeddings in the correct global order
155
- final_dense_embeddings_list = []
156
- final_sparse_embeddings_list = [] # Will hold Python dictionaries
157
  # Removed final_colbert_embeddings_list
158
 
159
  # Re-get sorted file paths to ensure correct order for consolidation
@@ -173,11 +169,11 @@ def encode_issues():
173
  os.path.exists(file_cache_sparse_path)): # Removed colbert cache check
174
 
175
  # Load and append to the lists
176
- final_dense_embeddings_list.append(np.load(file_cache_dense_path))
177
 
178
  # Load sparse dictionaries: it's a NumPy object array, convert to list of dicts
179
  loaded_sparse_dicts_for_file = np.load(file_cache_sparse_path, allow_pickle=True).tolist()
180
- final_sparse_embeddings_list.extend(loaded_sparse_dicts_for_file)
181
 
182
  # Removed loading ColBERT arrays
183
  # loaded_colbert_arrays_for_file = np.load(file_cache_colbert_path, allow_pickle=True).tolist()
@@ -195,7 +191,7 @@ def encode_issues():
195
  print(
196
  f" Warning: Cached embedding files for {filename} are incomplete or missing. Skipping in consolidation. This may affect global issue indexing.")
197
 
198
- if not final_dense_embeddings_list:
199
  print("No embeddings were successfully loaded for consolidation. No output files generated.")
200
  return
201
 
@@ -203,21 +199,21 @@ def encode_issues():
203
  # Concatenate all collected embeddings into single large NumPy arrays
204
  print("Concatenating and saving final consolidated embeddings...")
205
 
206
- # Dense embeddings
207
- final_dense_array = np.vstack(final_dense_embeddings_list)
208
- np.save(os.path.join(OUTPUT_DIR, 'ns_issues_dense_bge-m3.npy'), final_dense_array)
209
  print(
210
- f" Saved dense embeddings to {os.path.join(OUTPUT_DIR, 'ns_issues_dense_bge-m3.npy')} (Shape: {final_dense_array.shape})")
211
 
212
- # Sparse embeddings (now a list of dictionaries, saved as object array)
213
- if final_sparse_embeddings_list:
214
  # Save the list of dictionaries as a NumPy object array
215
- final_sparse_array = np.array(final_sparse_embeddings_list, dtype=object)
216
- np.save(os.path.join(OUTPUT_DIR, 'ns_issues_sparse_bge-m3.npy'), final_sparse_array, allow_pickle=True)
217
  print(
218
- f" Saved sparse embeddings to {os.path.join(OUTPUT_DIR, 'ns_issues_sparse_bge-m3.npy')} (Total objects: {len(final_sparse_array)}, type: {type(final_sparse_array)})")
219
  else:
220
- print(" No sparse embeddings to save.")
221
 
222
  # Removed ColBERT embeddings saving
223
  # if final_colbert_embeddings_list:
@@ -232,4 +228,4 @@ def encode_issues():
232
 
233
  # Call this function to start the embedding process.
234
  if __name__ == "__main__":
235
- encode_issues()
 
125
  return_sparse=True, # This will return 'lexical_weights' for BGE-M3
126
  return_colbert_vecs=False) # <--- REMOVED COLBERT GENERATION
127
 
128
+ # Save Semantic (Dense) Embeddings
129
  np.save(file_cache_dense_path, embeddings['dense_vecs'])
130
 
131
+ # --- Save Loose (Sparse) Embeddings ---
132
  # 'lexical_weights' is a list of dictionaries, one for each item in the batch
133
  sparse_list_of_dicts = embeddings.get('lexical_weights')
134
 
 
136
  # This allows storing Python objects (dictionaries) in a NumPy array.
137
  np.save(file_cache_sparse_path, np.array(sparse_list_of_dicts, dtype=object), allow_pickle=True)
138
 
 
 
 
 
139
 
140
  print(f" Encoded and cached {len(issues_list_in_file)} issues from {filename}.")
141
 
 
148
  print("\n--- Consolidation Phase: Combining cached embeddings ---")
149
 
150
  # Initialize lists to collect all embeddings in the correct global order
151
+ final_semantic_embeddings_list = [] # Renamed from final_dense_embeddings_list
152
+ final_loose_embeddings_list = [] # Renamed from final_sparse_embeddings_list
153
  # Removed final_colbert_embeddings_list
154
 
155
  # Re-get sorted file paths to ensure correct order for consolidation
 
169
  os.path.exists(file_cache_sparse_path)): # Removed colbert cache check
170
 
171
  # Load and append to the lists
172
+ final_semantic_embeddings_list.append(np.load(file_cache_dense_path)) # Renamed
173
 
174
  # Load sparse dictionaries: it's a NumPy object array, convert to list of dicts
175
  loaded_sparse_dicts_for_file = np.load(file_cache_sparse_path, allow_pickle=True).tolist()
176
+ final_loose_embeddings_list.extend(loaded_sparse_dicts_for_file) # Renamed
177
 
178
  # Removed loading ColBERT arrays
179
  # loaded_colbert_arrays_for_file = np.load(file_cache_colbert_path, allow_pickle=True).tolist()
 
191
  print(
192
  f" Warning: Cached embedding files for {filename} are incomplete or missing. Skipping in consolidation. This may affect global issue indexing.")
193
 
194
+ if not final_semantic_embeddings_list: # Renamed
195
  print("No embeddings were successfully loaded for consolidation. No output files generated.")
196
  return
197
 
 
199
  # Concatenate all collected embeddings into single large NumPy arrays
200
  print("Concatenating and saving final consolidated embeddings...")
201
 
202
+ # Semantic (Dense) embeddings
203
+ final_semantic_array = np.vstack(final_semantic_embeddings_list) # Renamed
204
+ np.save(os.path.join(OUTPUT_DIR, 'ns_issues_semantic_bge-m3.npy'), final_semantic_array) # Renamed file
205
  print(
206
+ f" Saved semantic embeddings to {os.path.join(OUTPUT_DIR, 'ns_issues_semantic_bge-m3.npy')} (Shape: {final_semantic_array.shape})") # Renamed file and type
207
 
208
+ # Loose (Sparse) embeddings (now a list of dictionaries, saved as object array)
209
+ if final_loose_embeddings_list: # Renamed
210
  # Save the list of dictionaries as a NumPy object array
211
+ final_loose_array = np.array(final_loose_embeddings_list, dtype=object) # Renamed
212
+ np.save(os.path.join(OUTPUT_DIR, 'ns_issues_loose_bge-m3.npy'), final_loose_array, allow_pickle=True) # Renamed file
213
  print(
214
+ f" Saved loose embeddings to {os.path.join(OUTPUT_DIR, 'ns_issues_loose_bge-m3.npy')} (Total objects: {len(final_loose_array)}, type: {type(final_loose_array)})") # Renamed file and type
215
  else:
216
+ print(" No loose embeddings to save.") # Renamed
217
 
218
  # Removed ColBERT embeddings saving
219
  # if final_colbert_embeddings_list:
 
228
 
229
  # Call this function to start the embedding process.
230
  if __name__ == "__main__":
231
+ encode_issues()
small_scripts/make_embedding/embedding_ga_resolutions.py CHANGED
@@ -63,20 +63,20 @@ def encode_ga_resolutions():
63
  # Ensure output directory exists
64
  os.makedirs(OUTPUT_DIR, exist_ok=True)
65
 
66
- # --- Save Dense Embeddings ---
67
  dense_embeddings = embeddings['dense_vecs']
68
- dense_output_path = os.path.join(OUTPUT_DIR, 'ns_ga_resolutions_dense_bge-m3.npy')
69
  np.save(dense_output_path, dense_embeddings)
70
- print(f"Saved dense embeddings to {dense_output_path} (Shape: {dense_embeddings.shape})")
71
 
72
- # --- Save Sparse Embeddings ---
73
  # 'lexical_weights' is a list of dictionaries, one for each item in the batch
74
  sparse_list_of_dicts = embeddings['lexical_weights']
75
 
76
  # Save this list of sparse dictionaries as a NumPy object array
77
- sparse_output_path = os.path.join(OUTPUT_DIR, 'ns_ga_resolutions_sparse_bge-m3.npy')
78
  np.save(sparse_output_path, np.array(sparse_list_of_dicts, dtype=object), allow_pickle=True) # allow_pickle is essential for storing Python objects
79
- print(f"Saved sparse embeddings to {sparse_output_path} (Total objects: {len(sparse_list_of_dicts)})")
80
 
81
 
82
  # --- Removed ColBERT Embeddings Saving ---
@@ -94,4 +94,4 @@ def encode_ga_resolutions():
94
 
95
  # Call the function to start the embedding process
96
  if __name__ == "__main__":
97
- encode_ga_resolutions()
 
63
  # Ensure output directory exists
64
  os.makedirs(OUTPUT_DIR, exist_ok=True)
65
 
66
+ # --- Save Semantic (Dense) Embeddings ---
67
  dense_embeddings = embeddings['dense_vecs']
68
+ dense_output_path = os.path.join(OUTPUT_DIR, 'ns_ga_resolutions_semantic_bge-m3.npy') # Renamed file
69
  np.save(dense_output_path, dense_embeddings)
70
+ print(f"Saved semantic embeddings to {dense_output_path} (Shape: {dense_embeddings.shape})") # Renamed type and file
71
 
72
+ # --- Save Loose (Sparse) Embeddings ---
73
  # 'lexical_weights' is a list of dictionaries, one for each item in the batch
74
  sparse_list_of_dicts = embeddings['lexical_weights']
75
 
76
  # Save this list of sparse dictionaries as a NumPy object array
77
+ sparse_output_path = os.path.join(OUTPUT_DIR, 'ns_ga_resolutions_loose_bge-m3.npy') # Renamed file
78
  np.save(sparse_output_path, np.array(sparse_list_of_dicts, dtype=object), allow_pickle=True) # allow_pickle is essential for storing Python objects
79
+ print(f"Saved loose embeddings to {sparse_output_path} (Total objects: {len(sparse_list_of_dicts)})") # Renamed type and file
80
 
81
 
82
  # --- Removed ColBERT Embeddings Saving ---
 
94
 
95
  # Call the function to start the embedding process
96
  if __name__ == "__main__":
97
+ encode_ga_resolutions()