Bohaska committed on
Commit
aaea9cb
·
1 Parent(s): 59e58c4

add GA resolution search

Browse files
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ /small_scripts/
2
+ .gitignore
app.py CHANGED
@@ -2,61 +2,223 @@ import gradio as gr
2
  from FlagEmbedding import BGEM3FlagModel
3
  import numpy as np
4
  import json
 
5
 
6
- model = BGEM3FlagModel('BAAI/bge-m3',
7
- use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation
8
- issue_embeddings = np.load('ns_issues_dense_bge-m3.npy')
9
 
10
- with open("issue_titles.json") as file:
11
- issue_titles = json.load(file)
 
 
 
 
12
 
13
- def get_similarity(search_term):
14
- search_embedding = model.encode([search_term])['dense_vecs']
15
- similarity = search_embedding @ issue_embeddings.T
16
- return similarity
17
 
 
 
 
 
 
 
 
 
 
 
18
 
19
- def get_similarity_rankings(search_term):
20
- similarity = get_similarity(search_term)
21
 
22
- results = []
23
- for i, search_query_similarities in enumerate(similarity):
24
- indexed_similarities = []
25
- for issue_index, sim_score in enumerate(search_query_similarities):
26
- indexed_similarities.append((issue_index, sim_score))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  sorted_similarities = sorted(indexed_similarities, key=lambda item: item[1], reverse=True)
28
- results.append(sorted_similarities)
29
 
30
- similarity_text = ""
31
- for i, search_query_result in enumerate(results):
32
- similarity_text += f"# Search Results"
33
  search_ranking = 1
34
- for index, sim_score in search_query_result[:20]:
35
- similarity_text += f"\n{search_ranking}. {issue_titles[str(index)]}, Similarity: {sim_score:.4f}"
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  search_ranking += 1
37
 
38
- return similarity_text
 
 
39
 
40
 
 
 
41
  """
42
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
 
 
 
43
  """
44
- demo = gr.Interface(
45
- get_similarity_rankings,
46
- inputs=gr.Textbox(label="Search term", placeholder="What are you looking for?"),
47
- outputs=gr.Markdown(container=True),
48
- examples=["coffee",
49
- "land value tax",
50
- "Elon Musk",
51
- "After an corrupted election, citizens demand the real results, after discovering it was rigged. ",
52
- "Eureka! A new scientific law regarding the universe's expansion may have just been discovered at the University of @@CAPITAL@@. Unfortunately, tempers are flaring over who should get naming credit. Maxtopian grad student Georgie Bubble claims the work alone while his boss Dr.@@RANDOMNAME1@@ claims that all work in the University is @@NAME@@’s collectively. Your Minister of Education has elevated this to your desk.",
53
- ],
54
- title="NationStates Issue Search",
55
- description="Search through all 1660 issues in [NationStates](https://www.nationstates.net/). This is like semantic search, not keyword search. You can input words, sentences, paragraphs, or even full issues. For finding duplicates, I recommend only searching with the issue description.",
56
- article="Made by [Jiangbei](www.nationstates.net/nation=jiangbei). [Issues](https://github.com/Clarissa-Valentine-Z/NationStates-Issue-Megathread) downloaded from [Valentine Z](https://www.nationstates.net/nation=valentine_z) and written by respective authors. Powered by [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3).",
57
- submit_btn="Search",
58
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
 
 
61
  if __name__ == "__main__":
62
- demo.launch()
 
 
 
2
  from FlagEmbedding import BGEM3FlagModel
3
  import numpy as np
4
  import json
5
+ import os # Import os to handle potential path issues
6
 
7
# --- Configuration and Global Data Loading ---

# Resolve every data file relative to this script so the app does not depend
# on the current working directory it is launched from.
script_dir = os.path.dirname(os.path.abspath(__file__))
issue_embeddings_path = os.path.join(script_dir, 'ns_issues_dense_bge-m3.npy')
issue_titles_path = os.path.join(script_dir, 'issue_titles.json')
ga_embeddings_path = os.path.join(script_dir, 'ns_ga_resolutions_dense_bge-m3.npy')
ga_resolutions_path = os.path.join(script_dir, 'parsed_ga_resolutions.json')

print("Loading BGE-M3 model...")
try:
    # Use a local path if the model is downloaded, or let it download from Hugging Face
    model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)
    print("Model loaded successfully.")
except Exception as e:
    print(f"Error loading model: {e}")
    print("Please ensure you have an internet connection or the model is cached locally.")
    # Both searches need the model, so abort. SystemExit(1) reports failure to
    # the caller; the `exit()` helper is meant for interactive use and would
    # have exited with status 0.
    raise SystemExit(1) from e

print("Loading issue data...")
try:
    issue_embeddings = np.load(issue_embeddings_path)
    # JSON is UTF-8; be explicit so decoding does not depend on the locale.
    with open(issue_titles_path, encoding="utf-8") as file:
        issue_titles = json.load(file)
    print(f"Issue data loaded: {len(issue_titles)} issues.")
except FileNotFoundError as e:
    print(f"Error loading issue data: {e}")
    print("Please ensure 'ns_issues_dense_bge-m3.npy' and 'issue_titles.json' are in the same directory as app.py")
    # Issue data is treated as mandatory: stop with a failure status.
    raise SystemExit(1) from e
except Exception as e:
    print(f"Error loading issue data: {e}")
    raise SystemExit(1) from e


print("Loading GA resolution data...")
try:
    ga_embeddings = np.load(ga_embeddings_path)
    with open(ga_resolutions_path, encoding="utf-8") as file:
        ga_resolutions_data = json.load(file)  # List of dictionaries
    print(f"GA resolution data loaded: {len(ga_resolutions_data)} resolutions.")
except FileNotFoundError as e:
    print(f"Error loading GA resolution data: {e}")
    print("Please ensure 'ns_ga_resolutions_dense_bge-m3.npy' and 'parsed_ga_resolutions.json' are in the same directory as app.py")
    # GA data is optional: the app still launches with only the issue search,
    # and the GA search function reports that its data is unavailable.
    ga_embeddings = None
    ga_resolutions_data = []
except Exception as e:
    print(f"Error loading GA resolution data: {e}")
    # Reset both names so a half-loaded state is never used for searching.
    ga_embeddings = None
    ga_resolutions_data = []
62
+
63
+
64
+ # --- Search Functions ---
65
+
66
def get_issue_similarity_rankings(search_term):
    """Rank NationStates issues by semantic similarity to ``search_term``.

    Args:
        search_term: Free-text query entered in the UI.

    Returns:
        A Markdown string: a heading followed by up to 20 numbered lines of
        the form ``"<rank>. <title>, Similarity: <score>"``. A plain message
        is returned instead when the query is blank, the issue data is not
        available, or the search raises an exception.
    """
    # Whitespace-only input carries no query; treat it like an empty one
    # rather than spending a model call on it.
    if not search_term or not search_term.strip():
        return "Please enter a search term."
    if issue_embeddings is None or not issue_titles:
        return "Issue data not loaded. Cannot perform search."

    try:
        # Encode the single query and score it against every issue embedding
        # (dot product). Only one query is encoded, so take row 0.
        search_embedding = model.encode([search_term])['dense_vecs']
        scores = (search_embedding @ issue_embeddings.T)[0]

        # A stable argsort on the negated scores yields descending order with
        # ties kept in index order, the same ordering as
        # sorted(..., reverse=True) on (index, score) pairs.
        top_indices = np.argsort(-scores, kind="stable")[:20]

        lines = ["# Top 20 Issue Search Results"]
        for rank, index in enumerate(top_indices, start=1):
            index = int(index)
            # issue_titles is a dict keyed by the stringified row index.
            issue_title = issue_titles.get(str(index), f"Unknown Issue (Index {index})")
            lines.append(f"{rank}. {issue_title}, Similarity: {scores[index]:.4f}")

        return "\n".join(lines) + "\n"
    except Exception as e:
        # Surface the failure in the UI instead of crashing the request.
        return f"An error occurred during issue search: {e}"
102
+
103
+
104
def search_ga_resolutions(search_term):
    """Rank General Assembly resolutions by semantic similarity to ``search_term``.

    Args:
        search_term: Free-text query entered in the UI.

    Returns:
        A Markdown string: a heading followed by up to 20 numbered entries,
        each a link to the resolution on NationStates plus its similarity
        score. A plain message is returned instead when the query is blank,
        the GA data is not available, or the search raises an exception.
    """
    # Whitespace-only input carries no query; treat it like an empty one.
    if not search_term or not search_term.strip():
        return "Please enter a search term."
    if ga_embeddings is None or not ga_resolutions_data:
        return "GA Resolution data not loaded. Cannot perform search."

    try:
        # Encode the single query and score it against every resolution
        # embedding (dot product). Only one query is encoded, so take row 0.
        search_embedding = model.encode([search_term])['dense_vecs']
        scores = (search_embedding @ ga_embeddings.T)[0]

        # Stable argsort on negated scores: descending, ties in index order.
        top_indices = np.argsort(-scores, kind="stable")[:20]

        lines = ["# Top 20 GA Resolution Search Results"]
        for rank, index in enumerate(top_indices, start=1):
            index = int(index)
            sim_score = scores[index]

            if index >= len(ga_resolutions_data):
                # Embedding rows and metadata entries are out of sync.
                lines.append(
                    f"{rank}. Error retrieving resolution data for index {index}, "
                    f"Similarity: {sim_score:.4f}"
                )
                continue

            resolution = ga_resolutions_data[index]
            # Escape square brackets so a title cannot break the Markdown link.
            title = (
                str(resolution.get('title', 'Untitled Resolution'))
                .replace('[', '\\[')
                .replace(']', '\\]')
            )
            res_id = resolution.get('id')
            council = resolution.get('council', 1)  # Default to council 1 if not specified

            if res_id is None:
                # Without an id there is no valid page to link to; show the
                # title as plain text instead of a dead link.
                lines.append(f"{rank}. {title}, Similarity: {sim_score:.4f}")
            else:
                url = f"https://www.nationstates.net/page=WA_past_resolution/id={res_id}/council={council}"
                lines.append(f"{rank}. [{title}]({url}), Similarity: {sim_score:.4f}")

        return "\n".join(lines) + "\n"
    except Exception as e:
        # Surface the failure in the UI instead of crashing the request.
        return f"An error occurred during GA resolution search: {e}"
150
 
151
 
152
# --- Gradio Interface ---

"""
For information on how to customize the Gradio Blocks and Tabs, peruse the gradio docs:
https://www.gradio.app/docs/blocks
https://www.gradio.app/docs/tabs
https://www.gradio.app/docs/interface (used within tabs)
"""

# One Blocks app with two tabs; each tab embeds an Interface wired to one of
# the search functions. Titles and descriptions are supplied by the Markdown
# above each Interface, so the Interface's own title/description are None.
with gr.Blocks() as demo:
    gr.Markdown("""
    # NationStates Semantic Search
    Search through NationStates content using semantic search powered by BGE-M3.
    """)

    with gr.Tabs() as tabs:
        with gr.TabItem("Issue Search"):
            gr.Markdown("""
            ### Search NationStates Issues
            Search through all 1660 issues. Semantic search allows finding related concepts or paraphrased ideas, not just keywords.
            """)
            issue_search_interface = gr.Interface(
                fn=get_issue_similarity_rankings,
                inputs=gr.Textbox(label="Search term", placeholder="What issue are you looking for?"),
                outputs=gr.Markdown(container=True),
                examples=[
                    "coffee",
                    "land value tax",
                    "Elon Musk",
                    "After an corrupted election, citizens demand the real results, after discovering it was rigged. ",
                    "Eureka! A new scientific law regarding the universe's expansion may have just been discovered at the University of @@CAPITAL@@. Unfortunately, tempers are flaring over who should get naming credit. Maxtopian grad student Georgie Bubble claims the work alone while his boss Dr.@@RANDOMNAME1@@ claims that all work in the University is @@NAME@@’s collectively. Your Minister of Education has elevated this to your desk.",
                ],
                title=None,  # Title is handled by the Markdown within the tab
                description=None,  # Description is handled by the Markdown within the tab
                submit_btn="Search Issues",
                # No live=True: every keystroke would trigger a model call
            )

        with gr.TabItem("GA Resolution Search"):
            gr.Markdown("""
            ### Search NationStates General Assembly Resolutions
            Search through General Assembly resolutions. The results include clickable links to the resolutions on NationStates.
            """)
            ga_search_interface = gr.Interface(
                fn=search_ga_resolutions,
                inputs=gr.Textbox(label="Search term", placeholder="What GA resolution are you looking for?"),
                outputs=gr.Markdown(container=True),
                examples=[
                    "repeal process",  # Common term related to resolutions
                    "condemn genocide",
                    "rights of animals",
                    "regulating space mining",
                    "limit weapons production",
                    "World Assembly neutrality",  # Example from Resolution 2 description/body
                    "founding of the World Assembly",  # Example from Resolution 1 description/body
                    "recognition of new nations",  # Example of a common WA topic
                ],
                title=None,  # Title handled by Markdown
                description=None,  # Description handled by Markdown
                submit_btn="Search Resolutions",
                # No live=True
            )

    # Footer credits. Written as plain Markdown: wrapping the line in <p>…</p>
    # turns it into a raw HTML block whose [text](url) links are not rendered.
    # Every link carries an explicit https:// scheme so none of them resolves
    # relative to the app's own URL.
    gr.Markdown("""
    Made by [Jiangbei](https://www.nationstates.net/nation=jiangbei). Issue data from [Valentine Z](https://www.nationstates.net/nation=valentine_z). GA Resolution data parsed from NationStates. Powered by [BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3).
    """)


# --- Launch App ---
if __name__ == "__main__":
    # Pass share=True to launch() to get a temporary public share link;
    # the default (share=False) serves the app locally only.
    demo.launch()
ns_ga_resolutions_dense_bge-m3.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a3e1e4def2ec2cd87a3c11fabb8af6e2457251c120619ffc29940c938c100e4
3
+ size 1595520
parsed_ga_resolutions.json ADDED
The diff for this file is too large to render. See raw diff