Spaces:
Running
Running
Bohaska (aider) committed on
Commit ·
20e9110
1
Parent(s): 3a373f3
refactor: Rename search types and update file paths and examples.
Browse files- app.py +243 -113
- small_scripts/make_embedding/embedding.py +18 -22
- small_scripts/make_embedding/embedding_ga_resolutions.py +7 -7
app.py
CHANGED
|
@@ -3,25 +3,24 @@ from FlagEmbedding import BGEM3FlagModel
|
|
| 3 |
import numpy as np
|
| 4 |
import json
|
| 5 |
import os
|
|
|
|
| 6 |
|
| 7 |
# --- Configuration and Global Data Loading ---
|
| 8 |
|
| 9 |
# Determine the directory of the script to load files relative to it
|
| 10 |
script_dir = os.path.dirname(os.path.abspath(__file__))
|
| 11 |
|
| 12 |
-
# Define paths for issue embedding types
|
| 13 |
issue_embeddings_paths = {
|
| 14 |
-
'
|
| 15 |
-
'
|
| 16 |
-
# 'mixed': os.path.join(script_dir, 'ns_issues_colbert_bge-m3.npy') # Removed
|
| 17 |
}
|
| 18 |
issue_titles_path = os.path.join(script_dir, 'issue_titles.json')
|
| 19 |
|
| 20 |
-
# Define paths for GA resolution embedding types
|
| 21 |
ga_embeddings_paths = {
|
| 22 |
-
'
|
| 23 |
-
'
|
| 24 |
-
# 'mixed': os.path.join(script_dir, 'ns_ga_resolutions_colbert_bge-m3.npy') # Removed
|
| 25 |
}
|
| 26 |
ga_resolutions_path = os.path.join(script_dir, 'parsed_ga_resolutions.json')
|
| 27 |
|
|
@@ -36,13 +35,13 @@ except Exception as e:
|
|
| 36 |
print("Please ensure you have an internet connection or the model is cached locally.")
|
| 37 |
model = None # Indicate model loading failed
|
| 38 |
|
| 39 |
-
# Issue data storage for all types
|
| 40 |
issue_all_embeddings = {
|
| 41 |
-
'
|
| 42 |
-
'
|
| 43 |
-
# 'mixed': None # Removed
|
| 44 |
}
|
| 45 |
issue_titles = {}
|
|
|
|
| 46 |
|
| 47 |
print("Loading issue data...")
|
| 48 |
try:
|
|
@@ -50,7 +49,7 @@ try:
|
|
| 50 |
# Load available embedding types for issues
|
| 51 |
for embed_type, path in issue_embeddings_paths.items():
|
| 52 |
if os.path.exists(path):
|
| 53 |
-
if embed_type == '
|
| 54 |
# Load sparse dictionaries: it's a NumPy object array, convert to list of objects
|
| 55 |
issue_all_embeddings[embed_type] = np.load(path, allow_pickle=True).tolist()
|
| 56 |
else: # Dense
|
|
@@ -64,6 +63,37 @@ try:
|
|
| 64 |
with open(issue_titles_path, encoding='utf-8') as file:
|
| 65 |
issue_titles = json.load(file)
|
| 66 |
print(f"Issue data loaded: {len(issue_titles)} issues.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
except FileNotFoundError as e:
|
| 68 |
print(f"Error loading issue data: {e}")
|
| 69 |
print(
|
|
@@ -71,11 +101,10 @@ except FileNotFoundError as e:
|
|
| 71 |
except Exception as e:
|
| 72 |
print(f"Error loading issue data: {e}")
|
| 73 |
|
| 74 |
-
# GA resolution data storage for all types
|
| 75 |
ga_all_embeddings = {
|
| 76 |
-
'
|
| 77 |
-
'
|
| 78 |
-
# 'mixed': None # Removed
|
| 79 |
}
|
| 80 |
ga_resolutions_data = []
|
| 81 |
|
|
@@ -85,7 +114,7 @@ try:
|
|
| 85 |
# Load available embedding types for GA resolutions
|
| 86 |
for embed_type, path in ga_embeddings_paths.items():
|
| 87 |
if os.path.exists(path):
|
| 88 |
-
if embed_type == '
|
| 89 |
ga_all_embeddings[embed_type] = np.load(path, allow_pickle=True).tolist()
|
| 90 |
else: # Dense
|
| 91 |
ga_all_embeddings[embed_type] = np.load(path)
|
|
@@ -108,9 +137,9 @@ except Exception as e:
|
|
| 108 |
|
| 109 |
# --- Search Functions ---
|
| 110 |
|
| 111 |
-
def _perform_search(search_term: str, corpus_embeddings_dict: dict,
|
| 112 |
"""
|
| 113 |
-
Helper function to perform
|
| 114 |
Returns sorted list of (index, similarity_score).
|
| 115 |
"""
|
| 116 |
if not model:
|
|
@@ -121,22 +150,20 @@ def _perform_search(search_term: str, corpus_embeddings_dict: dict, corpus_title
|
|
| 121 |
corpus_embeddings = corpus_embeddings_dict.get(search_type)
|
| 122 |
if corpus_embeddings is None:
|
| 123 |
raise ValueError(f"Corpus data for search type '{search_type}' not loaded. Cannot perform search.")
|
| 124 |
-
if not corpus_titles_map_or_list:
|
| 125 |
-
raise ValueError("Corpus titles/data not loaded. Cannot perform search.")
|
| 126 |
|
| 127 |
# Encode the search term for relevant types
|
| 128 |
query_embeddings = model.encode([search_term],
|
| 129 |
return_dense=True,
|
| 130 |
-
return_sparse=True,
|
| 131 |
-
return_colbert_vecs=False)
|
| 132 |
|
| 133 |
-
similarity_scores = []
|
| 134 |
|
| 135 |
-
if search_type == '
|
| 136 |
query_vec = query_embeddings['dense_vecs'] # Shape: (1, embedding_dim)
|
| 137 |
# Perform dot product for dense similarity
|
| 138 |
similarity_scores = (query_vec @ corpus_embeddings.T)[0] # Result shape: (num_docs,)
|
| 139 |
-
elif search_type == '
|
| 140 |
# 'lexical_weights' is a list of dictionaries, even for a single query.
|
| 141 |
# We need the first (and only) dictionary from this list.
|
| 142 |
if 'lexical_weights' not in query_embeddings or not query_embeddings['lexical_weights']:
|
|
@@ -148,17 +175,9 @@ def _perform_search(search_term: str, corpus_embeddings_dict: dict, corpus_title
|
|
| 148 |
score = model.compute_lexical_matching_score(query_sparse_dict, doc_sparse_dict)
|
| 149 |
similarity_scores.append(score)
|
| 150 |
similarity_scores = np.array(similarity_scores) # Convert to numpy array
|
| 151 |
-
# Removed 'mixed' (ColBERT) search type
|
| 152 |
-
# elif search_type == 'mixed':
|
| 153 |
-
# if 'colbert_vecs' not in query_embeddings or not query_embeddings['colbert_vecs']:
|
| 154 |
-
# raise ValueError("ColBERT vectors not returned for query. Model or configuration issue.")
|
| 155 |
-
# query_colbert_vec = query_embeddings['colbert_vecs'][0]
|
| 156 |
-
# for doc_colbert_vec in corpus_embeddings:
|
| 157 |
-
# score = model.colbert_score(query_colbert_vec, doc_colbert_vec)
|
| 158 |
-
# similarity_scores.append(score)
|
| 159 |
-
# similarity_scores = np.array(similarity_scores)
|
| 160 |
else:
|
| 161 |
-
|
|
|
|
| 162 |
|
| 163 |
# Pair index with similarity score
|
| 164 |
indexed_similarities = [(i, score) for i, score in enumerate(similarity_scores)]
|
|
@@ -168,89 +187,196 @@ def _perform_search(search_term: str, corpus_embeddings_dict: dict, corpus_title
|
|
| 168 |
|
| 169 |
return sorted_similarities
|
| 170 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
|
|
|
|
|
|
|
|
|
| 180 |
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
|
| 189 |
-
return similarity_text
|
| 190 |
except Exception as e:
|
| 191 |
return f"An error occurred during issue search: {e}"
|
| 192 |
|
| 193 |
|
| 194 |
def search_ga_resolutions(search_term: str, hide_repealed: bool, hide_repeal_category: bool,
|
| 195 |
-
search_type: str = '
|
| 196 |
"""
|
| 197 |
Searches GA resolutions, filters repealed and/or repeal category if requested,
|
| 198 |
and returns formatted results with links and status.
|
| 199 |
"""
|
| 200 |
try:
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
resolution = ga_resolutions_data[index]
|
|
|
|
|
|
|
|
|
|
| 209 |
status = resolution.get('status')
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
|
| 242 |
-
|
| 243 |
-
|
| 244 |
|
| 245 |
-
|
| 246 |
-
|
| 247 |
|
| 248 |
-
|
| 249 |
-
|
| 250 |
|
| 251 |
-
|
| 252 |
|
| 253 |
-
|
| 254 |
except Exception as e:
|
| 255 |
return f"An error occurred during GA resolution search: {e}"
|
| 256 |
|
|
@@ -281,20 +407,22 @@ with gr.Blocks() as demo:
|
|
| 281 |
fn=get_issue_similarity_rankings,
|
| 282 |
inputs=[
|
| 283 |
gr.Textbox(label="Search term", placeholder="What issue are you looking for?"),
|
| 284 |
-
gr.Radio(["
|
| 285 |
-
info="Choose search type.") #
|
| 286 |
],
|
| 287 |
outputs=gr.Markdown(),
|
| 288 |
examples=[
|
| 289 |
# Examples for Issue Search (search_term, search_type)
|
| 290 |
-
["coffee", "
|
| 291 |
-
["land value tax", "
|
| 292 |
-
["Elon Musk", "
|
| 293 |
["After an corrupted election, citizens demand the real results, after discovering it was rigged. ",
|
| 294 |
-
"
|
| 295 |
[
|
| 296 |
"Eureka! A new scientific law regarding the universe's expansion may have just been discovered at the University of @@CAPITAL@@. Unfortunately, tempers are flaring over who should get naming credit. Maxtopian grad student Georgie Bubble claims the work alone while his boss Dr.@@RANDOMNAME1@@ claims that all work in the University is @@NAME@@’s collectively. Your Minister of Education has elevated this to your desk.",
|
| 297 |
-
"
|
|
|
|
|
|
|
| 298 |
],
|
| 299 |
title=None,
|
| 300 |
description=None,
|
|
@@ -312,8 +440,8 @@ with gr.Blocks() as demo:
|
|
| 312 |
ga_search_term_input = gr.Textbox(label="Search term", placeholder="What are you looking for?")
|
| 313 |
ga_hide_repealed_checkbox = gr.Checkbox(value=True, label="Hide repealed resolutions")
|
| 314 |
ga_hide_repeal_category_checkbox = gr.Checkbox(value=True, label="Hide repeals")
|
| 315 |
-
ga_search_type_radio = gr.Radio(["
|
| 316 |
-
info="Choose search type.")
|
| 317 |
|
| 318 |
ga_search_interface = gr.Interface(
|
| 319 |
fn=search_ga_resolutions,
|
|
@@ -327,11 +455,13 @@ with gr.Blocks() as demo:
|
|
| 327 |
outputs=gr.Markdown(),
|
| 328 |
examples=[
|
| 329 |
# Examples for GA Resolution Search (search_term, hide_repealed, hide_repeal_category, search_type)
|
| 330 |
-
["condemn genocide", True, True, "
|
| 331 |
-
["rights of animals", True, True, "
|
| 332 |
-
["regulating space mining", True, True, "
|
| 333 |
-
["founding of the World Assembly", True, True, "
|
| 334 |
-
["environmental protection", True, True, "
|
|
|
|
|
|
|
| 335 |
],
|
| 336 |
title=None,
|
| 337 |
description=None,
|
|
@@ -343,4 +473,4 @@ with gr.Blocks() as demo:
|
|
| 343 |
if __name__ == "__main__":
|
| 344 |
# Set share=True to make the app accessible externally (requires ngrok)
|
| 345 |
# share=False is default and runs locally
|
| 346 |
-
demo.launch()
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
import json
|
| 5 |
import os
|
| 6 |
+
import re # Added for strict search context extraction
|
| 7 |
|
| 8 |
# --- Configuration and Global Data Loading ---
|
| 9 |
|
| 10 |
# Determine the directory of the script to load files relative to it
|
| 11 |
script_dir = os.path.dirname(os.path.abspath(__file__))
|
| 12 |
|
| 13 |
+
# Define paths for issue embedding types
|
| 14 |
issue_embeddings_paths = {
|
| 15 |
+
'semantic': os.path.join(script_dir, 'ns_issues_semantic_bge-m3.npy'), # Renamed from fuzzy
|
| 16 |
+
'loose': os.path.join(script_dir, 'ns_issues_loose_bge-m3.npy'), # Renamed from direct
|
|
|
|
| 17 |
}
|
| 18 |
issue_titles_path = os.path.join(script_dir, 'issue_titles.json')
|
| 19 |
|
| 20 |
+
# Define paths for GA resolution embedding types
|
| 21 |
ga_embeddings_paths = {
|
| 22 |
+
'semantic': os.path.join(script_dir, 'ns_ga_resolutions_semantic_bge-m3.npy'), # Renamed from fuzzy
|
| 23 |
+
'loose': os.path.join(script_dir, 'ns_ga_resolutions_loose_bge-m3.npy'), # Renamed from direct
|
|
|
|
| 24 |
}
|
| 25 |
ga_resolutions_path = os.path.join(script_dir, 'parsed_ga_resolutions.json')
|
| 26 |
|
|
|
|
| 35 |
print("Please ensure you have an internet connection or the model is cached locally.")
|
| 36 |
model = None # Indicate model loading failed
|
| 37 |
|
| 38 |
+
# Issue data storage for all types
|
| 39 |
issue_all_embeddings = {
|
| 40 |
+
'semantic': None,
|
| 41 |
+
'loose': None,
|
|
|
|
| 42 |
}
|
| 43 |
issue_titles = {}
|
| 44 |
+
all_issue_raw_texts = [] # New: To store raw issue texts for strict search
|
| 45 |
|
| 46 |
print("Loading issue data...")
|
| 47 |
try:
|
|
|
|
| 49 |
# Load available embedding types for issues
|
| 50 |
for embed_type, path in issue_embeddings_paths.items():
|
| 51 |
if os.path.exists(path):
|
| 52 |
+
if embed_type == 'loose': # Only sparse is loaded as list of objects now
|
| 53 |
# Load sparse dictionaries: it's a NumPy object array, convert to list of objects
|
| 54 |
issue_all_embeddings[embed_type] = np.load(path, allow_pickle=True).tolist()
|
| 55 |
else: # Dense
|
|
|
|
| 63 |
with open(issue_titles_path, encoding='utf-8') as file:
|
| 64 |
issue_titles = json.load(file)
|
| 65 |
print(f"Issue data loaded: {len(issue_titles)} issues.")
|
| 66 |
+
|
| 67 |
+
# --- Load raw issue texts for strict search ---
|
| 68 |
+
# The issue text files are in 'small_scripts/make_embedding/002 - Issue Megalist (MAIN) copy/'
|
| 69 |
+
issues_input_dir = os.path.join(script_dir, 'small_scripts', 'make_embedding', '002 - Issue Megalist (MAIN) copy')
|
| 70 |
+
|
| 71 |
+
# Replicate get_issue_files logic from embedding.py to ensure correct order
|
| 72 |
+
issue_files_for_raw_load = []
|
| 73 |
+
file_pattern = re.compile(r'(\d+) TO (\d+)\.txt')
|
| 74 |
+
|
| 75 |
+
if os.path.isdir(issues_input_dir):
|
| 76 |
+
for filename in os.listdir(issues_input_dir):
|
| 77 |
+
if filename.endswith('.txt'):
|
| 78 |
+
match = file_pattern.match(filename)
|
| 79 |
+
if match:
|
| 80 |
+
start_num = int(match.group(1))
|
| 81 |
+
issue_files_for_raw_load.append((start_num, filename))
|
| 82 |
+
issue_files_for_raw_load.sort(key=lambda x: x[0])
|
| 83 |
+
issue_files_for_raw_load = [os.path.join(issues_input_dir, filename) for _, filename in issue_files_for_raw_load]
|
| 84 |
+
|
| 85 |
+
for filepath in issue_files_for_raw_load:
|
| 86 |
+
with open(filepath, 'r', encoding='utf-8') as file:
|
| 87 |
+
issues_text_in_file = file.read()
|
| 88 |
+
# Split issues by the separator and remove any empty strings resulting from multiple separators
|
| 89 |
+
issues_list_in_file = [
|
| 90 |
+
issue.strip() for issue in issues_text_in_file.split("[hr][/hr]") if issue.strip()
|
| 91 |
+
]
|
| 92 |
+
all_issue_raw_texts.extend(issues_list_in_file)
|
| 93 |
+
print(f" Loaded {len(all_issue_raw_texts)} raw issue texts for strict search.")
|
| 94 |
+
else:
|
| 95 |
+
print(f" Warning: Issue text directory '{issues_input_dir}' not found. Strict issue search will not work.")
|
| 96 |
+
|
| 97 |
except FileNotFoundError as e:
|
| 98 |
print(f"Error loading issue data: {e}")
|
| 99 |
print(
|
|
|
|
| 101 |
except Exception as e:
|
| 102 |
print(f"Error loading issue data: {e}")
|
| 103 |
|
| 104 |
+
# GA resolution data storage for all types
|
| 105 |
ga_all_embeddings = {
|
| 106 |
+
'semantic': None,
|
| 107 |
+
'loose': None,
|
|
|
|
| 108 |
}
|
| 109 |
ga_resolutions_data = []
|
| 110 |
|
|
|
|
| 114 |
# Load available embedding types for GA resolutions
|
| 115 |
for embed_type, path in ga_embeddings_paths.items():
|
| 116 |
if os.path.exists(path):
|
| 117 |
+
if embed_type == 'loose': # Only sparse is loaded as list of objects now
|
| 118 |
ga_all_embeddings[embed_type] = np.load(path, allow_pickle=True).tolist()
|
| 119 |
else: # Dense
|
| 120 |
ga_all_embeddings[embed_type] = np.load(path)
|
|
|
|
| 137 |
|
| 138 |
# --- Search Functions ---
|
| 139 |
|
| 140 |
+
def _perform_search(search_term: str, corpus_embeddings_dict: dict, search_type: str):
|
| 141 |
"""
|
| 142 |
+
Helper function to perform an embedding-based search given the search term, corpus embeddings, and search type.
|
| 143 |
Returns sorted list of (index, similarity_score).
|
| 144 |
"""
|
| 145 |
if not model:
|
|
|
|
| 150 |
corpus_embeddings = corpus_embeddings_dict.get(search_type)
|
| 151 |
if corpus_embeddings is None:
|
| 152 |
raise ValueError(f"Corpus data for search type '{search_type}' not loaded. Cannot perform search.")
|
|
|
|
|
|
|
| 153 |
|
| 154 |
# Encode the search term for relevant types
|
| 155 |
query_embeddings = model.encode([search_term],
|
| 156 |
return_dense=True,
|
| 157 |
+
return_sparse=True,
|
| 158 |
+
return_colbert_vecs=False)
|
| 159 |
|
| 160 |
+
similarity_scores = []
|
| 161 |
|
| 162 |
+
if search_type == 'semantic': # Renamed from 'fuzzy'
|
| 163 |
query_vec = query_embeddings['dense_vecs'] # Shape: (1, embedding_dim)
|
| 164 |
# Perform dot product for dense similarity
|
| 165 |
similarity_scores = (query_vec @ corpus_embeddings.T)[0] # Result shape: (num_docs,)
|
| 166 |
+
elif search_type == 'loose': # Renamed from 'direct'
|
| 167 |
# 'lexical_weights' is a list of dictionaries, even for a single query.
|
| 168 |
# We need the first (and only) dictionary from this list.
|
| 169 |
if 'lexical_weights' not in query_embeddings or not query_embeddings['lexical_weights']:
|
|
|
|
| 175 |
score = model.compute_lexical_matching_score(query_sparse_dict, doc_sparse_dict)
|
| 176 |
similarity_scores.append(score)
|
| 177 |
similarity_scores = np.array(similarity_scores) # Convert to numpy array
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
else:
|
| 179 |
+
# This function should only be called for embedding-based searches
|
| 180 |
+
raise ValueError(f"Unsupported embedding search type: {search_type}")
|
| 181 |
|
| 182 |
# Pair index with similarity score
|
| 183 |
indexed_similarities = [(i, score) for i, score in enumerate(similarity_scores)]
|
|
|
|
| 187 |
|
| 188 |
return sorted_similarities
|
| 189 |
|
| 190 |
+
def _extract_context(text: str, query: str, context_chars: int = 100):
|
| 191 |
+
"""Extracts surrounding context for a given query in text, split on newlines."""
|
| 192 |
+
text_lower = text.lower()
|
| 193 |
+
query_lower = query.lower()
|
| 194 |
+
|
| 195 |
+
start_index = text_lower.find(query_lower)
|
| 196 |
+
if start_index == -1:
|
| 197 |
+
return "" # Query not found, should not happen if we got here
|
| 198 |
|
| 199 |
+
end_index = start_index + len(query)
|
| 200 |
+
|
| 201 |
+
# Find start of context
|
| 202 |
+
context_start = max(0, start_index - context_chars)
|
| 203 |
+
# Find end of context
|
| 204 |
+
context_end = min(len(text), end_index + context_chars)
|
| 205 |
+
|
| 206 |
+
# Adjust context_start to the nearest newline before it
|
| 207 |
+
if context_start > 0:
|
| 208 |
+
pre_context = text[0:context_start]
|
| 209 |
+
last_newline_before_start = pre_context.rfind('\n')
|
| 210 |
+
if last_newline_before_start != -1:
|
| 211 |
+
context_start = last_newline_before_start + 1
|
| 212 |
|
| 213 |
+
# Adjust context_end to the nearest newline after it
|
| 214 |
+
if context_end < len(text):
|
| 215 |
+
post_context = text[context_end:len(text)]
|
| 216 |
+
first_newline_after_end = post_context.find('\n')
|
| 217 |
+
if first_newline_after_end != -1:
|
| 218 |
+
context_end = context_end + first_newline_after_end
|
| 219 |
|
| 220 |
+
extracted_text = text[context_start:context_end]
|
| 221 |
+
|
| 222 |
+
# Highlight the query using regex for case-insensitive replacement
|
| 223 |
+
highlighted_text = re.sub(re.escape(query), lambda m: f"**{m.group(0)}**", extracted_text, flags=re.IGNORECASE, count=1)
|
| 224 |
+
|
| 225 |
+
return f"```\n{highlighted_text}\n```"
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def get_issue_similarity_rankings(search_term: str, search_type: str = 'semantic'):
    """Search issues and return the top results as Markdown text.

    Args:
        search_term: Text entered by the user.
        search_type: ``'strict'`` performs a case-insensitive substring search
            over ``all_issue_raw_texts``; any other value is forwarded to
            ``_perform_search`` as an embedding search type.

    Returns:
        A Markdown string: a heading followed by up to 20 numbered results, a
        "no results" message, a prompt to enter a search term, or an error
        message if anything raised during the search.
    """
    try:
        # A blank or whitespace-only query carries no information; in strict
        # mode a lone space would "match" almost every issue.
        if not search_term or not search_term.strip():
            return "Please enter a search term."

        if search_type == 'strict':
            if not all_issue_raw_texts:
                return "Raw issue texts not loaded. Strict search is unavailable."

            header = "# Top 20 Issue Search Results (Strict)\n"
            needle = search_term.lower()
            match_score = 1.0  # Dummy score: strict matches are not ranked.
            entries = []
            for index, issue_text in enumerate(all_issue_raw_texts):
                if needle not in issue_text.lower():
                    continue
                # issue_titles is a dict keyed by the stringified index.
                issue_title = issue_titles.get(str(index), f"Unknown Issue (Index {index})")
                context = _extract_context(issue_text, search_term)
                entries.append(
                    f"{len(entries) + 1}. {issue_title}, Match: {match_score:.4f}\n{context}\n"
                )
                if len(entries) == 20:
                    break  # Only the first 20 matches are ever shown.

            if not entries:
                return header + "No exact matches found."
            return header + "".join(entries)

        # Embedding-based search
        sorted_similarities = _perform_search(search_term, issue_all_embeddings, search_type)

        header = f"# Top 20 Issue Search Results ({search_type.capitalize()})\n"
        if not sorted_similarities:
            return header + "No issues found."

        entries = []
        for rank, (index, sim_score) in enumerate(sorted_similarities[:20], start=1):
            # issue_titles is a dict keyed by the stringified index.
            issue_title = issue_titles.get(str(index), f"Unknown Issue (Index {index})")
            entries.append(f"{rank}. {issue_title}, Similarity: {sim_score:.4f}\n")
        return header + "".join(entries)

    except Exception as e:
        # UI boundary: report the failure in the output panel instead of raising.
        return f"An error occurred during issue search: {e}"
|
| 273 |
|
| 274 |
|
| 275 |
def _ga_resolution_hidden(resolution: dict, hide_repealed: bool, hide_repeal_category: bool) -> bool:
    """Return True when the active filters exclude ``resolution``."""
    if hide_repealed and resolution.get('status') == "Repealed":
        return True
    return bool(hide_repeal_category and resolution.get('category') == "Repeal")


def _ga_filter_note(hide_repealed: bool, hide_repeal_category: bool) -> str:
    """Return the " (Filtered out ...)" suffix for empty results, or ""."""
    status_msgs = []
    if hide_repealed:
        status_msgs.append("Repealed")
    if hide_repeal_category:
        status_msgs.append("Repeal Category")
    if not status_msgs:
        return ""
    return " (Filtered out " + " and ".join(status_msgs) + ")"


def _ga_resolution_link(resolution: dict) -> str:
    """Return the Markdown link for ``resolution``, prefixed with [REPEALED] if repealed."""
    title = resolution.get('title', 'Untitled Resolution')
    res_id = resolution.get('id', 'N/A')
    council = resolution.get('council', 1)
    status_marker = "[REPEALED] " if resolution.get('status') == "Repealed" else ""
    url = f"https://www.nationstates.net/page=WA_past_resolution/id={res_id}/council={council}"
    return f"{status_marker}[#{res_id} {title}]({url})"


def search_ga_resolutions(search_term: str, hide_repealed: bool, hide_repeal_category: bool,
                          search_type: str = 'semantic'):
    """Search GA resolutions and return the top results as Markdown text.

    Args:
        search_term: Text entered by the user.
        hide_repealed: Drop resolutions whose ``status`` is "Repealed".
        hide_repeal_category: Drop resolutions whose ``category`` is "Repeal".
        search_type: ``'strict'`` performs a case-insensitive substring search
            over each resolution's ``body``; any other value is forwarded to
            ``_perform_search`` as an embedding search type.

    Returns:
        A Markdown string: a heading followed by up to 20 numbered, linked
        results, a "no results" message (noting active filters), a prompt to
        enter a search term, or an error message if anything raised.
    """
    try:
        # A blank or whitespace-only query carries no information; in strict
        # mode a lone space would "match" almost every resolution.
        if not search_term or not search_term.strip():
            return "Please enter a search term."

        if search_type == 'strict':
            if not ga_resolutions_data:
                return "GA resolution data not loaded. Strict search is unavailable."

            header = "# Top 20 GA Resolution Search Results (Strict)\n"
            needle = search_term.lower()
            match_score = 1.0  # Dummy score: strict matches are not ranked.
            entries = []
            for resolution in ga_resolutions_data:
                # ``or ''`` also covers a body that is present but null.
                body = resolution.get('body') or ''
                if needle not in body.lower():
                    continue
                if _ga_resolution_hidden(resolution, hide_repealed, hide_repeal_category):
                    continue
                context = _extract_context(body, search_term)
                entries.append(
                    f"{len(entries) + 1}. {_ga_resolution_link(resolution)}, "
                    f"Match: {match_score:.4f}\n{context}\n"
                )
                if len(entries) == 20:
                    break  # Only the first 20 matches are ever shown.

            if not entries:
                note = _ga_filter_note(hide_repealed, hide_repeal_category)
                return header + f"No exact matches found{note}."
            return header + "".join(entries)

        # Embedding-based search; results arrive already sorted by score.
        raw_sorted_similarities = _perform_search(search_term, ga_all_embeddings, search_type)

        header = f"# Top 20 GA Resolution Search Results ({search_type.capitalize()})\n"
        entries = []
        for index, sim_score in raw_sorted_similarities:
            # Skip indices that have no corresponding resolution record.
            if index >= len(ga_resolutions_data):
                continue
            resolution = ga_resolutions_data[index]
            if _ga_resolution_hidden(resolution, hide_repealed, hide_repeal_category):
                continue
            entries.append(
                f"{len(entries) + 1}. {_ga_resolution_link(resolution)}, "
                f"Similarity: {sim_score:.4f}\n"
            )
            if len(entries) == 20:
                break  # Filtering preserves order, so the first 20 kept are the top 20.

        if not entries:
            note = _ga_filter_note(hide_repealed, hide_repeal_category)
            return header + f"No matching resolutions found{note}."
        return header + "".join(entries)

    except Exception as e:
        # UI boundary: report the failure in the output panel instead of raising.
        return f"An error occurred during GA resolution search: {e}"
|
| 382 |
|
|
|
|
| 407 |
fn=get_issue_similarity_rankings,
|
| 408 |
inputs=[
|
| 409 |
gr.Textbox(label="Search term", placeholder="What issue are you looking for?"),
|
| 410 |
+
gr.Radio(["semantic", "loose", "strict"], label="Search Type", value="semantic", # Updated options
|
| 411 |
+
info="Choose search type: 'semantic' for conceptual similarity, 'loose' for keyword matching, 'strict' for exact phrase matching.") # Updated info
|
| 412 |
],
|
| 413 |
outputs=gr.Markdown(),
|
| 414 |
examples=[
|
| 415 |
# Examples for Issue Search (search_term, search_type)
|
| 416 |
+
["coffee", "semantic"],
|
| 417 |
+
["land value tax", "loose"],
|
| 418 |
+
["Elon Musk", "loose"],
|
| 419 |
["After an corrupted election, citizens demand the real results, after discovering it was rigged. ",
|
| 420 |
+
"semantic"],
|
| 421 |
[
|
| 422 |
"Eureka! A new scientific law regarding the universe's expansion may have just been discovered at the University of @@CAPITAL@@. Unfortunately, tempers are flaring over who should get naming credit. Maxtopian grad student Georgie Bubble claims the work alone while his boss Dr.@@RANDOMNAME1@@ claims that all work in the University is @@NAME@@’s collectively. Your Minister of Education has elevated this to your desk.",
|
| 423 |
+
"semantic"],
|
| 424 |
+
["tax", "strict"], # New example for strict
|
| 425 |
+
["environmental protection", "strict"] # New example for strict
|
| 426 |
],
|
| 427 |
title=None,
|
| 428 |
description=None,
|
|
|
|
| 440 |
ga_search_term_input = gr.Textbox(label="Search term", placeholder="What are you looking for?")
|
| 441 |
ga_hide_repealed_checkbox = gr.Checkbox(value=True, label="Hide repealed resolutions")
|
| 442 |
ga_hide_repeal_category_checkbox = gr.Checkbox(value=True, label="Hide repeals")
|
| 443 |
+
ga_search_type_radio = gr.Radio(["semantic", "loose", "strict"], label="Search Type", value="semantic", # Updated options
|
| 444 |
+
info="Choose search type: 'semantic' for conceptual similarity, 'loose' for keyword matching, 'strict' for exact phrase matching.") # Updated info
|
| 445 |
|
| 446 |
ga_search_interface = gr.Interface(
|
| 447 |
fn=search_ga_resolutions,
|
|
|
|
| 455 |
outputs=gr.Markdown(),
|
| 456 |
examples=[
|
| 457 |
# Examples for GA Resolution Search (search_term, hide_repealed, hide_repeal_category, search_type)
|
| 458 |
+
["condemn genocide", True, True, "semantic"],
|
| 459 |
+
["rights of animals", True, True, "loose"],
|
| 460 |
+
["regulating space mining", True, True, "semantic"],
|
| 461 |
+
["founding of the World Assembly", True, True, "semantic"],
|
| 462 |
+
["environmental protection", True, True, "semantic"],
|
| 463 |
+
["human rights", True, True, "strict"], # New example for strict
|
| 464 |
+
["World Assembly", True, True, "strict"] # New example for strict
|
| 465 |
],
|
| 466 |
title=None,
|
| 467 |
description=None,
|
|
|
|
| 473 |
if __name__ == "__main__":
|
| 474 |
# Set share=True to make the app accessible externally (requires ngrok)
|
| 475 |
# share=False is default and runs locally
|
| 476 |
+
demo.launch()
|
small_scripts/make_embedding/embedding.py
CHANGED
|
@@ -125,10 +125,10 @@ def encode_issues():
|
|
| 125 |
return_sparse=True, # This will return 'lexical_weights' for BGE-M3
|
| 126 |
return_colbert_vecs=False) # <--- REMOVED COLBERT GENERATION
|
| 127 |
|
| 128 |
-
# Save Dense Embeddings
|
| 129 |
np.save(file_cache_dense_path, embeddings['dense_vecs'])
|
| 130 |
|
| 131 |
-
# --- Save Sparse Embeddings ---
|
| 132 |
# 'lexical_weights' is a list of dictionaries, one for each item in the batch
|
| 133 |
sparse_list_of_dicts = embeddings.get('lexical_weights')
|
| 134 |
|
|
@@ -136,10 +136,6 @@ def encode_issues():
|
|
| 136 |
# This allows storing Python objects (dictionaries) in a NumPy array.
|
| 137 |
np.save(file_cache_sparse_path, np.array(sparse_list_of_dicts, dtype=object), allow_pickle=True)
|
| 138 |
|
| 139 |
-
# Removed saving ColBERT Embeddings
|
| 140 |
-
# colbert_list_of_arrays = embeddings.get('colbert_vecs')
|
| 141 |
-
# np.save(file_cache_colbert_path, np.array(colbert_list_of_arrays, dtype=object), allow_pickle=True)
|
| 142 |
-
|
| 143 |
|
| 144 |
print(f" Encoded and cached {len(issues_list_in_file)} issues from {filename}.")
|
| 145 |
|
|
@@ -152,8 +148,8 @@ def encode_issues():
|
|
| 152 |
print("\n--- Consolidation Phase: Combining cached embeddings ---")
|
| 153 |
|
| 154 |
# Initialize lists to collect all embeddings in the correct global order
|
| 155 |
-
|
| 156 |
-
|
| 157 |
# Removed final_colbert_embeddings_list
|
| 158 |
|
| 159 |
# Re-get sorted file paths to ensure correct order for consolidation
|
|
@@ -173,11 +169,11 @@ def encode_issues():
|
|
| 173 |
os.path.exists(file_cache_sparse_path)): # Removed colbert cache check
|
| 174 |
|
| 175 |
# Load and append to the lists
|
| 176 |
-
|
| 177 |
|
| 178 |
# Load sparse dictionaries: it's a NumPy object array, convert to list of dicts
|
| 179 |
loaded_sparse_dicts_for_file = np.load(file_cache_sparse_path, allow_pickle=True).tolist()
|
| 180 |
-
|
| 181 |
|
| 182 |
# Removed loading ColBERT arrays
|
| 183 |
# loaded_colbert_arrays_for_file = np.load(file_cache_colbert_path, allow_pickle=True).tolist()
|
|
@@ -195,7 +191,7 @@ def encode_issues():
|
|
| 195 |
print(
|
| 196 |
f" Warning: Cached embedding files for {filename} are incomplete or missing. Skipping in consolidation. This may affect global issue indexing.")
|
| 197 |
|
| 198 |
-
if not final_dense_embeddings_list:
|
| 199 |
print("No embeddings were successfully loaded for consolidation. No output files generated.")
|
| 200 |
return
|
| 201 |
|
|
@@ -203,21 +199,21 @@ def encode_issues():
|
|
| 203 |
# Concatenate all collected embeddings into single large NumPy arrays
|
| 204 |
print("Concatenating and saving final consolidated embeddings...")
|
| 205 |
|
| 206 |
-
# Dense embeddings
|
| 207 |
-
|
| 208 |
-
np.save(os.path.join(OUTPUT_DIR, '
|
| 209 |
print(
|
| 210 |
-
f" Saved
|
| 211 |
|
| 212 |
-
# Sparse embeddings (now a list of dictionaries, saved as object array)
|
| 213 |
-
if final_sparse_embeddings_list:
|
| 214 |
# Save the list of dictionaries as a NumPy object array
|
| 215 |
-
|
| 216 |
-
np.save(os.path.join(OUTPUT_DIR, '
|
| 217 |
print(
|
| 218 |
-
f" Saved
|
| 219 |
else:
|
| 220 |
-
print(" No
|
| 221 |
|
| 222 |
# Removed ColBERT embeddings saving
|
| 223 |
# if final_colbert_embeddings_list:
|
|
@@ -232,4 +228,4 @@ def encode_issues():
|
|
| 232 |
|
| 233 |
# Call this function to start the embedding process.
|
| 234 |
if __name__ == "__main__":
|
| 235 |
-
encode_issues()
|
|
|
|
| 125 |
return_sparse=True, # This will return 'lexical_weights' for BGE-M3
|
| 126 |
return_colbert_vecs=False) # <--- REMOVED COLBERT GENERATION
|
| 127 |
|
| 128 |
+
# Save Semantic (Dense) Embeddings
|
| 129 |
np.save(file_cache_dense_path, embeddings['dense_vecs'])
|
| 130 |
|
| 131 |
+
# --- Save Loose (Sparse) Embeddings ---
|
| 132 |
# 'lexical_weights' is a list of dictionaries, one for each item in the batch
|
| 133 |
sparse_list_of_dicts = embeddings.get('lexical_weights')
|
| 134 |
|
|
|
|
| 136 |
# This allows storing Python objects (dictionaries) in a NumPy array.
|
| 137 |
np.save(file_cache_sparse_path, np.array(sparse_list_of_dicts, dtype=object), allow_pickle=True)
|
| 138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
|
| 140 |
print(f" Encoded and cached {len(issues_list_in_file)} issues from {filename}.")
|
| 141 |
|
|
|
|
| 148 |
print("\n--- Consolidation Phase: Combining cached embeddings ---")
|
| 149 |
|
| 150 |
# Initialize lists to collect all embeddings in the correct global order
|
| 151 |
+
final_semantic_embeddings_list = [] # Renamed from final_dense_embeddings_list
|
| 152 |
+
final_loose_embeddings_list = [] # Renamed from final_sparse_embeddings_list
|
| 153 |
# Removed final_colbert_embeddings_list
|
| 154 |
|
| 155 |
# Re-get sorted file paths to ensure correct order for consolidation
|
|
|
|
| 169 |
os.path.exists(file_cache_sparse_path)): # Removed colbert cache check
|
| 170 |
|
| 171 |
# Load and append to the lists
|
| 172 |
+
final_semantic_embeddings_list.append(np.load(file_cache_dense_path)) # Renamed
|
| 173 |
|
| 174 |
# Load sparse dictionaries: it's a NumPy object array, convert to list of dicts
|
| 175 |
loaded_sparse_dicts_for_file = np.load(file_cache_sparse_path, allow_pickle=True).tolist()
|
| 176 |
+
final_loose_embeddings_list.extend(loaded_sparse_dicts_for_file) # Renamed
|
| 177 |
|
| 178 |
# Removed loading ColBERT arrays
|
| 179 |
# loaded_colbert_arrays_for_file = np.load(file_cache_colbert_path, allow_pickle=True).tolist()
|
|
|
|
| 191 |
print(
|
| 192 |
f" Warning: Cached embedding files for {filename} are incomplete or missing. Skipping in consolidation. This may affect global issue indexing.")
|
| 193 |
|
| 194 |
+
if not final_semantic_embeddings_list: # Renamed
|
| 195 |
print("No embeddings were successfully loaded for consolidation. No output files generated.")
|
| 196 |
return
|
| 197 |
|
|
|
|
| 199 |
# Concatenate all collected embeddings into single large NumPy arrays
|
| 200 |
print("Concatenating and saving final consolidated embeddings...")
|
| 201 |
|
| 202 |
+
# Semantic (Dense) embeddings
|
| 203 |
+
final_semantic_array = np.vstack(final_semantic_embeddings_list) # Renamed
|
| 204 |
+
np.save(os.path.join(OUTPUT_DIR, 'ns_issues_semantic_bge-m3.npy'), final_semantic_array) # Renamed file
|
| 205 |
print(
|
| 206 |
+
f" Saved semantic embeddings to {os.path.join(OUTPUT_DIR, 'ns_issues_semantic_bge-m3.npy')} (Shape: {final_semantic_array.shape})") # Renamed file and type
|
| 207 |
|
| 208 |
+
# Loose (Sparse) embeddings (now a list of dictionaries, saved as object array)
|
| 209 |
+
if final_loose_embeddings_list: # Renamed
|
| 210 |
# Save the list of dictionaries as a NumPy object array
|
| 211 |
+
final_loose_array = np.array(final_loose_embeddings_list, dtype=object) # Renamed
|
| 212 |
+
np.save(os.path.join(OUTPUT_DIR, 'ns_issues_loose_bge-m3.npy'), final_loose_array, allow_pickle=True) # Renamed file
|
| 213 |
print(
|
| 214 |
+
f" Saved loose embeddings to {os.path.join(OUTPUT_DIR, 'ns_issues_loose_bge-m3.npy')} (Total objects: {len(final_loose_array)}, type: {type(final_loose_array)})") # Renamed file and type
|
| 215 |
else:
|
| 216 |
+
print(" No loose embeddings to save.") # Renamed
|
| 217 |
|
| 218 |
# Removed ColBERT embeddings saving
|
| 219 |
# if final_colbert_embeddings_list:
|
|
|
|
| 228 |
|
| 229 |
# Call this function to start the embedding process.
|
| 230 |
if __name__ == "__main__":
|
| 231 |
+
encode_issues()
|
small_scripts/make_embedding/embedding_ga_resolutions.py
CHANGED
|
@@ -63,20 +63,20 @@ def encode_ga_resolutions():
|
|
| 63 |
# Ensure output directory exists
|
| 64 |
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 65 |
|
| 66 |
-
# --- Save Dense Embeddings ---
|
| 67 |
dense_embeddings = embeddings['dense_vecs']
|
| 68 |
-
dense_output_path = os.path.join(OUTPUT_DIR, '
|
| 69 |
np.save(dense_output_path, dense_embeddings)
|
| 70 |
-
print(f"Saved
|
| 71 |
|
| 72 |
-
# --- Save Sparse Embeddings ---
|
| 73 |
# 'lexical_weights' is a list of dictionaries, one for each item in the batch
|
| 74 |
sparse_list_of_dicts = embeddings['lexical_weights']
|
| 75 |
|
| 76 |
# Save this list of sparse dictionaries as a NumPy object array
|
| 77 |
-
sparse_output_path = os.path.join(OUTPUT_DIR, '
|
| 78 |
np.save(sparse_output_path, np.array(sparse_list_of_dicts, dtype=object), allow_pickle=True) # allow_pickle is essential for storing Python objects
|
| 79 |
-
print(f"Saved
|
| 80 |
|
| 81 |
|
| 82 |
# --- Removed ColBERT Embeddings Saving ---
|
|
@@ -94,4 +94,4 @@ def encode_ga_resolutions():
|
|
| 94 |
|
| 95 |
# Call the function to start the embedding process
|
| 96 |
if __name__ == "__main__":
|
| 97 |
-
encode_ga_resolutions()
|
|
|
|
| 63 |
# Ensure output directory exists
|
| 64 |
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
| 65 |
|
| 66 |
+
# --- Save Semantic (Dense) Embeddings ---
|
| 67 |
dense_embeddings = embeddings['dense_vecs']
|
| 68 |
+
dense_output_path = os.path.join(OUTPUT_DIR, 'ns_ga_resolutions_semantic_bge-m3.npy') # Renamed file
|
| 69 |
np.save(dense_output_path, dense_embeddings)
|
| 70 |
+
print(f"Saved semantic embeddings to {dense_output_path} (Shape: {dense_embeddings.shape})") # Renamed type and file
|
| 71 |
|
| 72 |
+
# --- Save Loose (Sparse) Embeddings ---
|
| 73 |
# 'lexical_weights' is a list of dictionaries, one for each item in the batch
|
| 74 |
sparse_list_of_dicts = embeddings['lexical_weights']
|
| 75 |
|
| 76 |
# Save this list of sparse dictionaries as a NumPy object array
|
| 77 |
+
sparse_output_path = os.path.join(OUTPUT_DIR, 'ns_ga_resolutions_loose_bge-m3.npy') # Renamed file
|
| 78 |
np.save(sparse_output_path, np.array(sparse_list_of_dicts, dtype=object), allow_pickle=True) # allow_pickle is essential for storing Python objects
|
| 79 |
+
print(f"Saved loose embeddings to {sparse_output_path} (Total objects: {len(sparse_list_of_dicts)})") # Renamed type and file
|
| 80 |
|
| 81 |
|
| 82 |
# --- Removed ColBERT Embeddings Saving ---
|
|
|
|
| 94 |
|
| 95 |
# Call the function to start the embedding process
|
| 96 |
if __name__ == "__main__":
|
| 97 |
+
encode_ga_resolutions()
|