Spaces:

Bohaska
/

ns_issue_search

Running

App Files Files Community

Bohaska (aider) committed on Jun 11, 2025

Commit

20e9110

1 Parent(s): 3a373f3

refactor: Rename search types and update file paths and examples.

Browse files

Files changed (3) hide show

app.py +243 -113
small_scripts/make_embedding/embedding.py +18 -22
small_scripts/make_embedding/embedding_ga_resolutions.py +7 -7

app.py CHANGED Viewed

@@ -3,25 +3,24 @@ from FlagEmbedding import BGEM3FlagModel
 import numpy as np
 import json
 import os
 # --- Configuration and Global Data Loading ---
 # Determine the directory of the script to load files relative to it
 script_dir = os.path.dirname(os.path.abspath(__file__))
-# Define paths for issue embedding types (removed colbert)
 issue_embeddings_paths = {
-    'fuzzy': os.path.join(script_dir, 'ns_issues_dense_bge-m3.npy'),
-    'direct': os.path.join(script_dir, 'ns_issues_sparse_bge-m3.npy'),
-    # 'mixed': os.path.join(script_dir, 'ns_issues_colbert_bge-m3.npy') # Removed
 }
 issue_titles_path = os.path.join(script_dir, 'issue_titles.json')
-# Define paths for GA resolution embedding types (removed colbert)
 ga_embeddings_paths = {
-    'fuzzy': os.path.join(script_dir, 'ns_ga_resolutions_dense_bge-m3.npy'),
-    'direct': os.path.join(script_dir, 'ns_ga_resolutions_sparse_bge-m3.npy'),
-    # 'mixed': os.path.join(script_dir, 'ns_ga_resolutions_colbert_bge-m3.npy') # Removed
 }
 ga_resolutions_path = os.path.join(script_dir, 'parsed_ga_resolutions.json')
@@ -36,13 +35,13 @@ except Exception as e:
     print("Please ensure you have an internet connection or the model is cached locally.")
     model = None  # Indicate model loading failed
-# Issue data storage for all types (removed colbert)
 issue_all_embeddings = {
-    'fuzzy': None,
-    'direct': None,
-    # 'mixed': None # Removed
 }
 issue_titles = {}
 print("Loading issue data...")
 try:
@@ -50,7 +49,7 @@ try:
         # Load available embedding types for issues
         for embed_type, path in issue_embeddings_paths.items():
             if os.path.exists(path):
-                if embed_type == 'direct': # Only sparse is loaded as list of objects now
                     # Load sparse dictionaries: it's a NumPy object array, convert to list of objects
                     issue_all_embeddings[embed_type] = np.load(path, allow_pickle=True).tolist()
                 else: # Dense
@@ -64,6 +63,37 @@ try:
         with open(issue_titles_path, encoding='utf-8') as file:
             issue_titles = json.load(file)
         print(f"Issue data loaded: {len(issue_titles)} issues.")
 except FileNotFoundError as e:
     print(f"Error loading issue data: {e}")
     print(
@@ -71,11 +101,10 @@ except FileNotFoundError as e:
 except Exception as e:
     print(f"Error loading issue data: {e}")
-# GA resolution data storage for all types (removed colbert)
 ga_all_embeddings = {
-    'fuzzy': None,
-    'direct': None,
-    # 'mixed': None # Removed
 }
 ga_resolutions_data = []
@@ -85,7 +114,7 @@ try:
         # Load available embedding types for GA resolutions
         for embed_type, path in ga_embeddings_paths.items():
             if os.path.exists(path):
-                if embed_type == 'direct': # Only sparse is loaded as list of objects now
                     ga_all_embeddings[embed_type] = np.load(path, allow_pickle=True).tolist()
                 else: # Dense
                     ga_all_embeddings[embed_type] = np.load(path)
@@ -108,9 +137,9 @@ except Exception as e:
 # --- Search Functions ---
-def _perform_search(search_term: str, corpus_embeddings_dict: dict, corpus_titles_map_or_list, search_type: str):
     """
-    Helper function to perform a search given the search term, corpus embeddings, and search type.
     Returns sorted list of (index, similarity_score).
     """
     if not model:
@@ -121,22 +150,20 @@ def _perform_search(search_term: str, corpus_embeddings_dict: dict, corpus_title
     corpus_embeddings = corpus_embeddings_dict.get(search_type)
     if corpus_embeddings is None:
         raise ValueError(f"Corpus data for search type '{search_type}' not loaded. Cannot perform search.")
-    if not corpus_titles_map_or_list:
-        raise ValueError("Corpus titles/data not loaded. Cannot perform search.")
     # Encode the search term for relevant types
     query_embeddings = model.encode([search_term],
                                     return_dense=True,
-                                    return_sparse=True,  # This will return 'lexical_weights' for BGE-M3
-                                    return_colbert_vecs=False) # <--- REMOVED COLBERT GENERATION FOR QUERY
-    similarity_scores = []  # Use a list to collect scores, then convert to numpy array
-    if search_type == 'fuzzy':
         query_vec = query_embeddings['dense_vecs']  # Shape: (1, embedding_dim)
         # Perform dot product for dense similarity
         similarity_scores = (query_vec @ corpus_embeddings.T)[0]  # Result shape: (num_docs,)
-    elif search_type == 'direct':
         # 'lexical_weights' is a list of dictionaries, even for a single query.
         # We need the first (and only) dictionary from this list.
         if 'lexical_weights' not in query_embeddings or not query_embeddings['lexical_weights']:
@@ -148,17 +175,9 @@ def _perform_search(search_term: str, corpus_embeddings_dict: dict, corpus_title
             score = model.compute_lexical_matching_score(query_sparse_dict, doc_sparse_dict)
             similarity_scores.append(score)
         similarity_scores = np.array(similarity_scores)  # Convert to numpy array
-    # Removed 'mixed' (ColBERT) search type
-    # elif search_type == 'mixed':
-    #     if 'colbert_vecs' not in query_embeddings or not query_embeddings['colbert_vecs']:
-    #         raise ValueError("ColBERT vectors not returned for query. Model or configuration issue.")
-    #     query_colbert_vec = query_embeddings['colbert_vecs'][0]
-    #     for doc_colbert_vec in corpus_embeddings:
-    #         score = model.colbert_score(query_colbert_vec, doc_colbert_vec)
-    #         similarity_scores.append(score)
-    #     similarity_scores = np.array(similarity_scores)
     else:
-        raise ValueError(f"Unsupported search type: {search_type}")
     # Pair index with similarity score
     indexed_similarities = [(i, score) for i, score in enumerate(similarity_scores)]
@@ -168,89 +187,196 @@ def _perform_search(search_term: str, corpus_embeddings_dict: dict, corpus_title
     return sorted_similarities
-def get_issue_similarity_rankings(search_term: str, search_type: str = 'fuzzy'):
-    """Searches issues and returns formatted results."""
-    try:
-        sorted_similarities = _perform_search(search_term, issue_all_embeddings, issue_titles, search_type)
-        similarity_text = f"# Top 20 Issue Search Results ({search_type.capitalize()})\n"
-        if not sorted_similarities:
-            return similarity_text + "No issues found."
-        search_ranking = 1
-        # Get top 20 results
-        for index, sim_score in sorted_similarities[:20]:
-            # issue_titles is a dict, needs string key
-            issue_title = issue_titles.get(str(index), f"Unknown Issue (Index {index})")
-            similarity_text += f"{search_ranking}. {issue_title}, Similarity: {sim_score:.4f}\n"
-            search_ranking += 1
-        return similarity_text
     except Exception as e:
         return f"An error occurred during issue search: {e}"
 def search_ga_resolutions(search_term: str, hide_repealed: bool, hide_repeal_category: bool,
-                          search_type: str = 'fuzzy'):
     """
     Searches GA resolutions, filters repealed and/or repeal category if requested,
     and returns formatted results with links and status.
     """
     try:
-        raw_sorted_similarities = _perform_search(search_term, ga_all_embeddings, ga_resolutions_data, search_type)
-        # --- Filtering ---
-        filtered_indexed_similarities = []
-        for index, score in raw_sorted_similarities:
-            # Ensure index is valid
-            if index < len(ga_resolutions_data):
                 resolution = ga_resolutions_data[index]
                 status = resolution.get('status')
-                category = resolution.get('category')
-                # Apply filters
-                if hide_repealed and status == "Repealed":
-                    continue
-                if hide_repeal_category and category == "Repeal":
-                    continue
-                filtered_indexed_similarities.append((index, score))
-        # The list is already sorted, no re-sort needed after filtering.
-        # --- Formatting Results ---
-        similarity_text = f"# Top 20 GA Resolution Search Results ({search_type.capitalize()})\n"
-        if not filtered_indexed_similarities:
-            status_msgs = []
-            if hide_repealed: status_msgs.append("Repealed")
-            if hide_repeal_category: status_msgs.append("Repeal Category")
-            filter_msg = " (Filtered out " + " and ".join(status_msgs) + ")" if status_msgs else ""
-            return similarity_text + f"No matching resolutions found{filter_msg}."
-        search_ranking = 1
-        # Get top 20 results from the sorted and filtered list
-        for index, sim_score in filtered_indexed_similarities[:20]:
-            resolution = ga_resolutions_data[index]
-            title = resolution.get('title', 'Untitled Resolution')
-            res_id = resolution.get('id', 'N/A')
-            council = resolution.get('council', 1)
-            status = resolution.get('status')
-            # Add [REPEALED] marker if the status is "Repealed"
-            status_marker = "[REPEALED] " if status == "Repealed" else ""
-            # Construct the NationStates URL
-            url = f"https://www.nationstates.net/page=WA_past_resolution/id={res_id}/council={council}"
-            # Format as Markdown link with the status marker
-            similarity_text += f"{search_ranking}. {status_marker}[#{res_id} {title}]({url}), Similarity: {sim_score:.4f}\n"
-            search_ranking += 1
-        return similarity_text
     except Exception as e:
         return f"An error occurred during GA resolution search: {e}"
@@ -281,20 +407,22 @@ with gr.Blocks() as demo:
                 fn=get_issue_similarity_rankings,
                 inputs=[
                     gr.Textbox(label="Search term", placeholder="What issue are you looking for?"),
-                    gr.Radio(["fuzzy", "direct"], label="Search Type", value="fuzzy", # <--- Removed 'mixed'
-                             info="Choose search type.") # <--- Updated info message
                 ],
                 outputs=gr.Markdown(),
                 examples=[
                     # Examples for Issue Search (search_term, search_type)
-                    ["coffee", "fuzzy"],
-                    ["land value tax", "direct"],
-                    ["Elon Musk", "direct"],
                     ["After an corrupted election, citizens demand the real results, after discovering it was rigged. ",
-                     "fuzzy"],
                     [
                         "Eureka! A new scientific law regarding the universe's expansion may have just been discovered at the University of @@CAPITAL@@. Unfortunately, tempers are flaring over who should get naming credit. Maxtopian grad student Georgie Bubble claims the work alone while his boss Dr.@@RANDOMNAME1@@ claims that all work in the University is @@NAME@@’s collectively. Your Minister of Education has elevated this to your desk.",
-                        "fuzzy"],
                 ],
                 title=None,
                 description=None,
@@ -312,8 +440,8 @@ with gr.Blocks() as demo:
             ga_search_term_input = gr.Textbox(label="Search term", placeholder="What are you looking for?")
             ga_hide_repealed_checkbox = gr.Checkbox(value=True, label="Hide repealed resolutions")
             ga_hide_repeal_category_checkbox = gr.Checkbox(value=True, label="Hide repeals")
-            ga_search_type_radio = gr.Radio(["fuzzy", "direct"], label="Search Type", value="fuzzy", # <--- Removed 'mixed'
-                                            info="Choose search type.")
             ga_search_interface = gr.Interface(
                 fn=search_ga_resolutions,
@@ -327,11 +455,13 @@ with gr.Blocks() as demo:
                 outputs=gr.Markdown(),
                 examples=[
                     # Examples for GA Resolution Search (search_term, hide_repealed, hide_repeal_category, search_type)
-                    ["condemn genocide", True, True, "fuzzy"],
-                    ["rights of animals", True, True, "direct"],
-                    ["regulating space mining", True, True, "fuzzy"],
-                    ["founding of the World Assembly", True, True, "fuzzy"],
-                    ["environmental protection", True, True, "fuzzy"],
                 ],
                 title=None,
                 description=None,
@@ -343,4 +473,4 @@ with gr.Blocks() as demo:
 if __name__ == "__main__":
     # Set share=True to make the app accessible externally (requires ngrok)
     # share=False is default and runs locally
-    demo.launch()

 import numpy as np
 import json
 import os
+import re # Added for strict search context extraction
 # --- Configuration and Global Data Loading ---
 # Determine the directory of the script to load files relative to it
 script_dir = os.path.dirname(os.path.abspath(__file__))
+# Define paths for issue embedding types
 issue_embeddings_paths = {
+    'semantic': os.path.join(script_dir, 'ns_issues_semantic_bge-m3.npy'), # Renamed from fuzzy
+    'loose': os.path.join(script_dir, 'ns_issues_loose_bge-m3.npy'),       # Renamed from direct
 }
 issue_titles_path = os.path.join(script_dir, 'issue_titles.json')
+# Define paths for GA resolution embedding types
 ga_embeddings_paths = {
+    'semantic': os.path.join(script_dir, 'ns_ga_resolutions_semantic_bge-m3.npy'), # Renamed from fuzzy
+    'loose': os.path.join(script_dir, 'ns_ga_resolutions_loose_bge-m3.npy'),       # Renamed from direct
 }
 ga_resolutions_path = os.path.join(script_dir, 'parsed_ga_resolutions.json')
     print("Please ensure you have an internet connection or the model is cached locally.")
     model = None  # Indicate model loading failed
+# Issue data storage for all types
 issue_all_embeddings = {
+    'semantic': None,
+    'loose': None,
 }
 issue_titles = {}
+all_issue_raw_texts = [] # New: To store raw issue texts for strict search
 print("Loading issue data...")
 try:
         # Load available embedding types for issues
         for embed_type, path in issue_embeddings_paths.items():
             if os.path.exists(path):
+                if embed_type == 'loose': # Only sparse is loaded as list of objects now
                     # Load sparse dictionaries: it's a NumPy object array, convert to list of objects
                     issue_all_embeddings[embed_type] = np.load(path, allow_pickle=True).tolist()
                 else: # Dense
         with open(issue_titles_path, encoding='utf-8') as file:
             issue_titles = json.load(file)
         print(f"Issue data loaded: {len(issue_titles)} issues.")
+        # --- Load raw issue texts for strict search ---
+        # The issue text files are in 'small_scripts/make_embedding/002 - Issue Megalist (MAIN) copy/'
+        issues_input_dir = os.path.join(script_dir, 'small_scripts', 'make_embedding', '002 - Issue Megalist (MAIN) copy')
+        # Replicate get_issue_files logic from embedding.py to ensure correct order
+        issue_files_for_raw_load = []
+        file_pattern = re.compile(r'(\d+) TO (\d+)\.txt')
+        if os.path.isdir(issues_input_dir):
+            for filename in os.listdir(issues_input_dir):
+                if filename.endswith('.txt'):
+                    match = file_pattern.match(filename)
+                    if match:
+                        start_num = int(match.group(1))
+                        issue_files_for_raw_load.append((start_num, filename))
+            issue_files_for_raw_load.sort(key=lambda x: x[0])
+            issue_files_for_raw_load = [os.path.join(issues_input_dir, filename) for _, filename in issue_files_for_raw_load]
+            for filepath in issue_files_for_raw_load:
+                with open(filepath, 'r', encoding='utf-8') as file:
+                    issues_text_in_file = file.read()
+                    # Split issues by the separator and remove any empty strings resulting from multiple separators
+                    issues_list_in_file = [
+                        issue.strip() for issue in issues_text_in_file.split("[hr][/hr]") if issue.strip()
+                    ]
+                    all_issue_raw_texts.extend(issues_list_in_file)
+            print(f"  Loaded {len(all_issue_raw_texts)} raw issue texts for strict search.")
+        else:
+            print(f"  Warning: Issue text directory '{issues_input_dir}' not found. Strict issue search will not work.")
 except FileNotFoundError as e:
     print(f"Error loading issue data: {e}")
     print(
 except Exception as e:
     print(f"Error loading issue data: {e}")
+# GA resolution data storage for all types
 ga_all_embeddings = {
+    'semantic': None,
+    'loose': None,
 }
 ga_resolutions_data = []
         # Load available embedding types for GA resolutions
         for embed_type, path in ga_embeddings_paths.items():
             if os.path.exists(path):
+                if embed_type == 'loose': # Only sparse is loaded as list of objects now
                     ga_all_embeddings[embed_type] = np.load(path, allow_pickle=True).tolist()
                 else: # Dense
                     ga_all_embeddings[embed_type] = np.load(path)
 # --- Search Functions ---
+def _perform_search(search_term: str, corpus_embeddings_dict: dict, search_type: str):
     """
+    Helper function to perform an embedding-based search given the search term, corpus embeddings, and search type.
     Returns sorted list of (index, similarity_score).
     """
     if not model:
     corpus_embeddings = corpus_embeddings_dict.get(search_type)
     if corpus_embeddings is None:
         raise ValueError(f"Corpus data for search type '{search_type}' not loaded. Cannot perform search.")
     # Encode the search term for relevant types
     query_embeddings = model.encode([search_term],
                                     return_dense=True,
+                                    return_sparse=True,
+                                    return_colbert_vecs=False)
+    similarity_scores = []
+    if search_type == 'semantic': # Renamed from 'fuzzy'
         query_vec = query_embeddings['dense_vecs']  # Shape: (1, embedding_dim)
         # Perform dot product for dense similarity
         similarity_scores = (query_vec @ corpus_embeddings.T)[0]  # Result shape: (num_docs,)
+    elif search_type == 'loose': # Renamed from 'direct'
         # 'lexical_weights' is a list of dictionaries, even for a single query.
         # We need the first (and only) dictionary from this list.
         if 'lexical_weights' not in query_embeddings or not query_embeddings['lexical_weights']:
             score = model.compute_lexical_matching_score(query_sparse_dict, doc_sparse_dict)
             similarity_scores.append(score)
         similarity_scores = np.array(similarity_scores)  # Convert to numpy array
     else:
+        # This function should only be called for embedding-based searches
+        raise ValueError(f"Unsupported embedding search type: {search_type}")
     # Pair index with similarity score
     indexed_similarities = [(i, score) for i, score in enumerate(similarity_scores)]
     return sorted_similarities
+def _extract_context(text: str, query: str, context_chars: int = 100):
+    """Extracts surrounding context for a given query in text, split on newlines."""
+    text_lower = text.lower()
+    query_lower = query.lower()
+    start_index = text_lower.find(query_lower)
+    if start_index == -1:
+        return "" # Query not found, should not happen if we got here
+    end_index = start_index + len(query)
+    # Find start of context
+    context_start = max(0, start_index - context_chars)
+    # Find end of context
+    context_end = min(len(text), end_index + context_chars)
+    # Adjust context_start to the nearest newline before it
+    if context_start > 0:
+        pre_context = text[0:context_start]
+        last_newline_before_start = pre_context.rfind('\n')
+        if last_newline_before_start != -1:
+            context_start = last_newline_before_start + 1
+    # Adjust context_end to the nearest newline after it
+    if context_end < len(text):
+        post_context = text[context_end:len(text)]
+        first_newline_after_end = post_context.find('\n')
+        if first_newline_after_end != -1:
+            context_end = context_end + first_newline_after_end
+    extracted_text = text[context_start:context_end]
+    # Highlight the query using regex for case-insensitive replacement
+    highlighted_text = re.sub(re.escape(query), lambda m: f"**{m.group(0)}**", extracted_text, flags=re.IGNORECASE, count=1)
+    return f"```\n{highlighted_text}\n```"
def get_issue_similarity_rankings(search_term: str, search_type: str = 'semantic'):
    """Search the issue corpus and return the top 20 hits as Markdown text.

    Args:
        search_term: The user's query. Empty or whitespace-only input is rejected
            with a prompt message instead of being searched.
        search_type: ``'strict'`` performs a case-insensitive substring scan over
            the raw issue texts and shows a context snippet per hit; any other
            value is forwarded to ``_perform_search`` as an embedding search type.

    Returns:
        A Markdown string: a heading followed by up to 20 numbered results, a
        "no results" message, or an error message if anything raised.
    """
    try:
        # A whitespace-only term would match nearly every document in strict
        # mode, so treat it the same as an empty term.
        if not search_term or not search_term.strip():
            return "Please enter a search term."

        if search_type == 'strict':
            if not all_issue_raw_texts:
                return "Raw issue texts not loaded. Strict search is unavailable."

            header = "# Top 20 Issue Search Results (Strict)\n"
            needle = search_term.lower()
            strict_score = 1.0  # Dummy score: every strict hit is an exact match.
            parts = [header]
            for index, issue_text in enumerate(all_issue_raw_texts):
                if needle not in issue_text.lower():
                    continue
                rank = len(parts)
                # issue_titles is a dict keyed by the stringified index.
                issue_title = issue_titles.get(str(index), f"Unknown Issue (Index {index})")
                context = _extract_context(issue_text, search_term)
                parts.append(f"{rank}. {issue_title}, Match: {strict_score:.4f}\n{context}\n")
                if rank == 20:
                    # Only the first 20 hits are shown; no need to scan further.
                    break

            if len(parts) == 1:
                return header + "No exact matches found."
            return "".join(parts)

        # Embedding-based search.
        sorted_similarities = _perform_search(search_term, issue_all_embeddings, search_type)
        header = f"# Top 20 Issue Search Results ({search_type.capitalize()})\n"
        if not sorted_similarities:
            return header + "No issues found."

        parts = [header]
        for rank, (index, sim_score) in enumerate(sorted_similarities[:20], start=1):
            issue_title = issue_titles.get(str(index), f"Unknown Issue (Index {index})")
            parts.append(f"{rank}. {issue_title}, Similarity: {sim_score:.4f}\n")
        return "".join(parts)
    except Exception as e:
        # Top-level UI boundary: surface the failure as text instead of crashing.
        return f"An error occurred during issue search: {e}"
 def search_ga_resolutions(search_term: str, hide_repealed: bool, hide_repeal_category: bool,
+                          search_type: str = 'semantic'): # Renamed default
     """
     Searches GA resolutions, filters repealed and/or repeal category if requested,
     and returns formatted results with links and status.
     """
     try:
+        if not search_term:
+            return "Please enter a search term."
+        if search_type == 'strict':
+            if not ga_resolutions_data:
+                return "GA resolution data not loaded. Strict search is unavailable."
+            strict_matches = []
+            search_term_lower = search_term.lower()
+            for i, resolution in enumerate(ga_resolutions_data):
+                resolution_body = resolution.get('body', '')
+                if search_term_lower in resolution_body.lower():
+                    # Apply filters immediately for strict search
+                    status = resolution.get('status')
+                    category = resolution.get('category')
+                    if hide_repealed and status == "Repealed":
+                        continue
+                    if hide_repeal_category and category == "Repeal":
+                        continue
+                    strict_matches.append((i, 1.0)) # Dummy score
+            similarity_text = f"# Top 20 GA Resolution Search Results (Strict)\n"
+            if not strict_matches:
+                status_msgs = []
+                if hide_repealed: status_msgs.append("Repealed")
+                if hide_repeal_category: status_msgs.append("Repeal Category")
+                filter_msg = " (Filtered out " + " and ".join(status_msgs) + ")" if status_msgs else ""
+                return similarity_text + f"No exact matches found{filter_msg}."
+            search_ranking = 1
+            for index, sim_score in strict_matches[:20]:
                 resolution = ga_resolutions_data[index]
+                title = resolution.get('title', 'Untitled Resolution')
+                res_id = resolution.get('id', 'N/A')
+                council = resolution.get('council', 1)
                 status = resolution.get('status')
+                status_marker = "[REPEALED] " if status == "Repealed" else ""
+                url = f"https://www.nationstates.net/page=WA_past_resolution/id={res_id}/council={council}"
+                context = _extract_context(resolution.get('body', ''), search_term)
+                similarity_text += f"{search_ranking}. {status_marker}[#{res_id} {title}]({url}), Match: {sim_score:.4f}\n{context}\n"
+                search_ranking += 1
+            return similarity_text
+        else: # Embedding-based search
+            raw_sorted_similarities = _perform_search(search_term, ga_all_embeddings, search_type)
+            # --- Filtering ---
+            filtered_indexed_similarities = []
+            for index, score in raw_sorted_similarities:
+                # Ensure index is valid
+                if index < len(ga_resolutions_data):
+                    resolution = ga_resolutions_data[index]
+                    status = resolution.get('status')
+                    category = resolution.get('category')
+                    # Apply filters
+                    if hide_repealed and status == "Repealed":
+                        continue
+                    if hide_repeal_category and category == "Repeal":
+                        continue
+                    filtered_indexed_similarities.append((index, score))
+            # The list is already sorted, no re-sort needed after filtering.
+            # --- Formatting Results ---
+            similarity_text = f"# Top 20 GA Resolution Search Results ({search_type.capitalize()})\n"
+            if not filtered_indexed_similarities:
+                status_msgs = []
+                if hide_repealed: status_msgs.append("Repealed")
+                if hide_repeal_category: status_msgs.append("Repeal Category")
+                filter_msg = " (Filtered out " + " and ".join(status_msgs) + ")" if status_msgs else ""
+                return similarity_text + f"No matching resolutions found{filter_msg}."
+            search_ranking = 1
+            # Get top 20 results from the sorted and filtered list
+            for index, sim_score in filtered_indexed_similarities[:20]:
+                resolution = ga_resolutions_data[index]
+                title = resolution.get('title', 'Untitled Resolution')
+                res_id = resolution.get('id', 'N/A')
+                council = resolution.get('council', 1)
+                status = resolution.get('status')
+                # Add [REPEALED] marker if the status is "Repealed"
+                status_marker = "[REPEALED] " if status == "Repealed" else ""
+                # Construct the NationStates URL
+                url = f"https://www.nationstates.net/page=WA_past_resolution/id={res_id}/council={council}"
+                # Format as Markdown link with the status marker
+                similarity_text += f"{search_ranking}. {status_marker}[#{res_id} {title}]({url}), Similarity: {sim_score:.4f}\n"
+                search_ranking += 1
+            return similarity_text
     except Exception as e:
         return f"An error occurred during GA resolution search: {e}"
                 fn=get_issue_similarity_rankings,
                 inputs=[
                     gr.Textbox(label="Search term", placeholder="What issue are you looking for?"),
+                    gr.Radio(["semantic", "loose", "strict"], label="Search Type", value="semantic", # Updated options
+                             info="Choose search type: 'semantic' for conceptual similarity, 'loose' for keyword matching, 'strict' for exact phrase matching.") # Updated info
                 ],
                 outputs=gr.Markdown(),
                 examples=[
                     # Examples for Issue Search (search_term, search_type)
+                    ["coffee", "semantic"],
+                    ["land value tax", "loose"],
+                    ["Elon Musk", "loose"],
                     ["After an corrupted election, citizens demand the real results, after discovering it was rigged. ",
+                     "semantic"],
                     [
                         "Eureka! A new scientific law regarding the universe's expansion may have just been discovered at the University of @@CAPITAL@@. Unfortunately, tempers are flaring over who should get naming credit. Maxtopian grad student Georgie Bubble claims the work alone while his boss Dr.@@RANDOMNAME1@@ claims that all work in the University is @@NAME@@’s collectively. Your Minister of Education has elevated this to your desk.",
+                        "semantic"],
+                    ["tax", "strict"], # New example for strict
+                    ["environmental protection", "strict"] # New example for strict
                 ],
                 title=None,
                 description=None,
             ga_search_term_input = gr.Textbox(label="Search term", placeholder="What are you looking for?")
             ga_hide_repealed_checkbox = gr.Checkbox(value=True, label="Hide repealed resolutions")
             ga_hide_repeal_category_checkbox = gr.Checkbox(value=True, label="Hide repeals")
+            ga_search_type_radio = gr.Radio(["semantic", "loose", "strict"], label="Search Type", value="semantic", # Updated options
+                                            info="Choose search type: 'semantic' for conceptual similarity, 'loose' for keyword matching, 'strict' for exact phrase matching.") # Updated info
             ga_search_interface = gr.Interface(
                 fn=search_ga_resolutions,
                 outputs=gr.Markdown(),
                 examples=[
                     # Examples for GA Resolution Search (search_term, hide_repealed, hide_repeal_category, search_type)
+                    ["condemn genocide", True, True, "semantic"],
+                    ["rights of animals", True, True, "loose"],
+                    ["regulating space mining", True, True, "semantic"],
+                    ["founding of the World Assembly", True, True, "semantic"],
+                    ["environmental protection", True, True, "semantic"],
+                    ["human rights", True, True, "strict"], # New example for strict
+                    ["World Assembly", True, True, "strict"] # New example for strict
                 ],
                 title=None,
                 description=None,
 if __name__ == "__main__":
     # Set share=True to make the app accessible externally (requires ngrok)
     # share=False is default and runs locally
+    demo.launch()

small_scripts/make_embedding/embedding.py CHANGED Viewed

@@ -125,10 +125,10 @@ def encode_issues():
                                       return_sparse=True,  # This will return 'lexical_weights' for BGE-M3
                                       return_colbert_vecs=False) # <--- REMOVED COLBERT GENERATION
-            # Save Dense Embeddings
             np.save(file_cache_dense_path, embeddings['dense_vecs'])
-            # --- Save Sparse Embeddings ---
             # 'lexical_weights' is a list of dictionaries, one for each item in the batch
             sparse_list_of_dicts = embeddings.get('lexical_weights')
@@ -136,10 +136,6 @@ def encode_issues():
             # This allows storing Python objects (dictionaries) in a NumPy array.
             np.save(file_cache_sparse_path, np.array(sparse_list_of_dicts, dtype=object), allow_pickle=True)
-            # Removed saving ColBERT Embeddings
-            # colbert_list_of_arrays = embeddings.get('colbert_vecs')
-            # np.save(file_cache_colbert_path, np.array(colbert_list_of_arrays, dtype=object), allow_pickle=True)
             print(f"  Encoded and cached {len(issues_list_in_file)} issues from {filename}.")
@@ -152,8 +148,8 @@ def encode_issues():
     print("\n--- Consolidation Phase: Combining cached embeddings ---")
     # Initialize lists to collect all embeddings in the correct global order
-    final_dense_embeddings_list = []
-    final_sparse_embeddings_list = []  # Will hold Python dictionaries
     # Removed final_colbert_embeddings_list
     # Re-get sorted file paths to ensure correct order for consolidation
@@ -173,11 +169,11 @@ def encode_issues():
                 os.path.exists(file_cache_sparse_path)): # Removed colbert cache check
             # Load and append to the lists
-            final_dense_embeddings_list.append(np.load(file_cache_dense_path))
             # Load sparse dictionaries: it's a NumPy object array, convert to list of dicts
             loaded_sparse_dicts_for_file = np.load(file_cache_sparse_path, allow_pickle=True).tolist()
-            final_sparse_embeddings_list.extend(loaded_sparse_dicts_for_file)
             # Removed loading ColBERT arrays
             # loaded_colbert_arrays_for_file = np.load(file_cache_colbert_path, allow_pickle=True).tolist()
@@ -195,7 +191,7 @@ def encode_issues():
             print(
                 f"  Warning: Cached embedding files for {filename} are incomplete or missing. Skipping in consolidation. This may affect global issue indexing.")
-    if not final_dense_embeddings_list:
         print("No embeddings were successfully loaded for consolidation. No output files generated.")
         return
@@ -203,21 +199,21 @@ def encode_issues():
     # Concatenate all collected embeddings into single large NumPy arrays
     print("Concatenating and saving final consolidated embeddings...")
-    # Dense embeddings
-    final_dense_array = np.vstack(final_dense_embeddings_list)
-    np.save(os.path.join(OUTPUT_DIR, 'ns_issues_dense_bge-m3.npy'), final_dense_array)
     print(
-        f"  Saved dense embeddings to {os.path.join(OUTPUT_DIR, 'ns_issues_dense_bge-m3.npy')} (Shape: {final_dense_array.shape})")
-    # Sparse embeddings (now a list of dictionaries, saved as object array)
-    if final_sparse_embeddings_list:
         # Save the list of dictionaries as a NumPy object array
-        final_sparse_array = np.array(final_sparse_embeddings_list, dtype=object)
-        np.save(os.path.join(OUTPUT_DIR, 'ns_issues_sparse_bge-m3.npy'), final_sparse_array, allow_pickle=True)
         print(
-            f"  Saved sparse embeddings to {os.path.join(OUTPUT_DIR, 'ns_issues_sparse_bge-m3.npy')} (Total objects: {len(final_sparse_array)}, type: {type(final_sparse_array)})")
     else:
-        print("  No sparse embeddings to save.")
     # Removed ColBERT embeddings saving
     # if final_colbert_embeddings_list:
@@ -232,4 +228,4 @@ def encode_issues():
 # Call this function to start the embedding process.
 if __name__ == "__main__":
-    encode_issues()

                                       return_sparse=True,  # This will return 'lexical_weights' for BGE-M3
                                       return_colbert_vecs=False) # <--- REMOVED COLBERT GENERATION
+            # Save Semantic (Dense) Embeddings
             np.save(file_cache_dense_path, embeddings['dense_vecs'])
+            # --- Save Loose (Sparse) Embeddings ---
             # 'lexical_weights' is a list of dictionaries, one for each item in the batch
             sparse_list_of_dicts = embeddings.get('lexical_weights')
             # This allows storing Python objects (dictionaries) in a NumPy array.
             np.save(file_cache_sparse_path, np.array(sparse_list_of_dicts, dtype=object), allow_pickle=True)
             print(f"  Encoded and cached {len(issues_list_in_file)} issues from {filename}.")
     print("\n--- Consolidation Phase: Combining cached embeddings ---")
     # Initialize lists to collect all embeddings in the correct global order
+    final_semantic_embeddings_list = [] # Renamed from final_dense_embeddings_list
+    final_loose_embeddings_list = []  # Renamed from final_sparse_embeddings_list
     # Removed final_colbert_embeddings_list
     # Re-get sorted file paths to ensure correct order for consolidation
                 os.path.exists(file_cache_sparse_path)): # Removed colbert cache check
             # Load and append to the lists
+            final_semantic_embeddings_list.append(np.load(file_cache_dense_path)) # Renamed
             # Load sparse dictionaries: it's a NumPy object array, convert to list of dicts
             loaded_sparse_dicts_for_file = np.load(file_cache_sparse_path, allow_pickle=True).tolist()
+            final_loose_embeddings_list.extend(loaded_sparse_dicts_for_file) # Renamed
             # Removed loading ColBERT arrays
             # loaded_colbert_arrays_for_file = np.load(file_cache_colbert_path, allow_pickle=True).tolist()
             print(
                 f"  Warning: Cached embedding files for {filename} are incomplete or missing. Skipping in consolidation. This may affect global issue indexing.")
+    if not final_semantic_embeddings_list: # Renamed
         print("No embeddings were successfully loaded for consolidation. No output files generated.")
         return
     # Concatenate all collected embeddings into single large NumPy arrays
     print("Concatenating and saving final consolidated embeddings...")
+    # Semantic (Dense) embeddings
+    final_semantic_array = np.vstack(final_semantic_embeddings_list) # Renamed
+    np.save(os.path.join(OUTPUT_DIR, 'ns_issues_semantic_bge-m3.npy'), final_semantic_array) # Renamed file
     print(
+        f"  Saved semantic embeddings to {os.path.join(OUTPUT_DIR, 'ns_issues_semantic_bge-m3.npy')} (Shape: {final_semantic_array.shape})") # Renamed file and type
+    # Loose (Sparse) embeddings (now a list of dictionaries, saved as object array)
+    if final_loose_embeddings_list: # Renamed
         # Save the list of dictionaries as a NumPy object array
+        final_loose_array = np.array(final_loose_embeddings_list, dtype=object) # Renamed
+        np.save(os.path.join(OUTPUT_DIR, 'ns_issues_loose_bge-m3.npy'), final_loose_array, allow_pickle=True) # Renamed file
         print(
+            f"  Saved loose embeddings to {os.path.join(OUTPUT_DIR, 'ns_issues_loose_bge-m3.npy')} (Total objects: {len(final_loose_array)}, type: {type(final_loose_array)})") # Renamed file and type
     else:
+        print("  No loose embeddings to save.") # Renamed
     # Removed ColBERT embeddings saving
     # if final_colbert_embeddings_list:
 # Call this function to start the embedding process.
 if __name__ == "__main__":
+    encode_issues()

small_scripts/make_embedding/embedding_ga_resolutions.py CHANGED Viewed

@@ -63,20 +63,20 @@ def encode_ga_resolutions():
         # Ensure output directory exists
         os.makedirs(OUTPUT_DIR, exist_ok=True)
-        # --- Save Dense Embeddings ---
         dense_embeddings = embeddings['dense_vecs']
-        dense_output_path = os.path.join(OUTPUT_DIR, 'ns_ga_resolutions_dense_bge-m3.npy')
         np.save(dense_output_path, dense_embeddings)
-        print(f"Saved dense embeddings to {dense_output_path} (Shape: {dense_embeddings.shape})")
-        # --- Save Sparse Embeddings ---
         # 'lexical_weights' is a list of dictionaries, one for each item in the batch
         sparse_list_of_dicts = embeddings['lexical_weights']
         # Save this list of sparse dictionaries as a NumPy object array
-        sparse_output_path = os.path.join(OUTPUT_DIR, 'ns_ga_resolutions_sparse_bge-m3.npy')
         np.save(sparse_output_path, np.array(sparse_list_of_dicts, dtype=object), allow_pickle=True) # allow_pickle is essential for storing Python objects
-        print(f"Saved sparse embeddings to {sparse_output_path} (Total objects: {len(sparse_list_of_dicts)})")
         # --- Removed ColBERT Embeddings Saving ---
@@ -94,4 +94,4 @@ def encode_ga_resolutions():
 # Call the function to start the embedding process
 if __name__ == "__main__":
-    encode_ga_resolutions()

         # Ensure output directory exists
         os.makedirs(OUTPUT_DIR, exist_ok=True)
+        # --- Save Semantic (Dense) Embeddings ---
         dense_embeddings = embeddings['dense_vecs']
+        dense_output_path = os.path.join(OUTPUT_DIR, 'ns_ga_resolutions_semantic_bge-m3.npy') # Renamed file
         np.save(dense_output_path, dense_embeddings)
+        print(f"Saved semantic embeddings to {dense_output_path} (Shape: {dense_embeddings.shape})") # Renamed type and file
+        # --- Save Loose (Sparse) Embeddings ---
         # 'lexical_weights' is a list of dictionaries, one for each item in the batch
         sparse_list_of_dicts = embeddings['lexical_weights']
         # Save this list of sparse dictionaries as a NumPy object array
+        sparse_output_path = os.path.join(OUTPUT_DIR, 'ns_ga_resolutions_loose_bge-m3.npy') # Renamed file
         np.save(sparse_output_path, np.array(sparse_list_of_dicts, dtype=object), allow_pickle=True) # allow_pickle is essential for storing Python objects
+        print(f"Saved loose embeddings to {sparse_output_path} (Total objects: {len(sparse_list_of_dicts)})") # Renamed type and file
         # --- Removed ColBERT Embeddings Saving ---
 # Call the function to start the embedding process
 if __name__ == "__main__":
+    encode_ga_resolutions()