Harshilforworks committed on
Commit be26322 · verified · 1 Parent(s): 400c540

Major reconstruction mode

Dockerfile CHANGED
@@ -1,32 +1,61 @@
-FROM python:3.13
-
-# Install system dependencies required by some Python packages (OpenCV, Pillow, ffmpeg)
-RUN apt-get update \
-    && apt-get install -y --no-install-recommends \
-    build-essential \
-    ffmpeg \
-    libsm6 \
-    libxext6 \
-    libxrender1 \
-    libgl1 \
-    git \
-    && rm -rf /var/lib/apt/lists/*
-
-RUN useradd -m -u 1000 user
-USER user
-ENV PATH="/home/user/.local/bin:$PATH"
-
-WORKDIR /app
-
-COPY --chown=user ./requirements.txt requirements.txt
-RUN pip install --no-cache-dir --upgrade pip \
-    && pip install --no-cache-dir -r requirements.txt
-
-COPY --chown=user . /app
-
-# Set default environment variables for service
-ENV SERVICE_HOST=0.0.0.0
-ENV SERVICE_PORT=7860
-
-# Use shell form to allow environment variable expansion
-CMD sh -c "uvicorn main:app --host ${SERVICE_HOST} --port ${SERVICE_PORT}"
+# Use Python 3.13 full version (not slim) for Hugging Face deployment
+FROM python:3.13
+
+# Set environment variables
+ENV PYTHONUNBUFFERED=1 \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PIP_NO_CACHE_DIR=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1
+
+# Install system dependencies required by the application
+# Including OpenCV, PyTorch, ffmpeg, and other multimedia libraries; libgl1 replaces libgl1-mesa-glx, which recent Debian base images no longer ship
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    ffmpeg \
+    libsm6 \
+    libxext6 \
+    libxrender1 \
+    libgomp1 \
+    libgl1 \
+    libglib2.0-0 \
+    git \
+    wget \
+    curl \
+    ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create non-root user for Hugging Face Spaces
+RUN useradd -m -u 1000 user
+
+# Set up working directory
+WORKDIR /app
+
+# Copy requirements first for better Docker layer caching
+COPY --chown=user:user requirements.txt .
+
+# Install Python dependencies as root to avoid permission issues
+RUN pip install --upgrade pip setuptools wheel && \
+    pip install -r requirements.txt
+
+# Switch to non-root user
+USER user
+
+# Set PATH for user-installed packages
+ENV PATH="/home/user/.local/bin:$PATH"
+
+# Copy application code
+COPY --chown=user:user . .
+
+# Expose port for Hugging Face Spaces (default: 7860)
+EXPOSE 7860
+
+# Set default environment variables for Hugging Face deployment
+ENV SERVICE_HOST=0.0.0.0 \
+    SERVICE_PORT=7860
+
+# Health check endpoint
+HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
+    CMD curl -f http://localhost:7860/health || exit 1
+
+# Run the application (shell form so ${SERVICE_HOST} and ${SERVICE_PORT} expand)
+CMD uvicorn main:app --host ${SERVICE_HOST} --port ${SERVICE_PORT} --workers 1
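The new HEALTHCHECK relies on the /health route that main.py already exposes (it returns {"status": "healthy", ...}). A minimal sketch of probing that same endpoint from the host, assuming the container publishes the default port 7860; handy when diagnosing why a health check reports unhealthy:

```python
# Minimal health probe mirroring the Dockerfile HEALTHCHECK.
# Assumes the service is reachable at localhost:7860 (the default SERVICE_PORT).
import json
import sys
import urllib.request


def check_health(url: str = "http://localhost:7860/health", timeout: float = 10.0) -> bool:
    """Return True if /health answers 200 with status == "healthy", else False."""
    try:
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            payload = json.loads(resp.read().decode("utf-8"))
            return resp.status == 200 and payload.get("status") == "healthy"
    except (OSError, ValueError):
        # Connection refused, timeout, non-JSON body, etc.
        return False


if __name__ == "__main__":
    sys.exit(0 if check_health() else 1)
```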
add_sample_data.py CHANGED
@@ -1,409 +1,409 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to add sample rumour data to MongoDB for testing real-time updates
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import asyncio
9
+ from datetime import datetime, timedelta
10
+ from pymongo import MongoClient
11
+ from pymongo.errors import DuplicateKeyError
12
+ from dotenv import load_dotenv
13
+
14
+ # Load environment variables
15
+ load_dotenv()
16
+
17
+ def get_mongo_client():
18
+ """Get MongoDB client connection"""
19
+ connection_string = os.getenv('MONGO_CONNECTION_STRING')
20
+ if not connection_string:
21
+ raise ValueError("MONGO_CONNECTION_STRING environment variable not set")
22
+
23
+ client = MongoClient(connection_string)
24
+ # Test connection
25
+ client.admin.command('ping')
26
+ return client
27
+
28
+ def add_sample_rumours():
29
+ """Add sample rumour data to MongoDB"""
30
+
31
+ client = get_mongo_client()
32
+ db = client['aegis']
33
+ collection = db['debunk_posts']
34
+
35
+ # Sample rumour data with unique post_ids
36
+ sample_rumours = [
37
+ {
38
+ "post_id": "sample_rumour_001",
39
+ "claim": "Scientists have discovered a new planet that could support human life",
40
+ "summary": "Recent astronomical observations suggest the possibility of a habitable exoplanet",
41
+ "platform": "Twitter",
42
+ "Post_link": "https://twitter.com/example/status/123456789",
43
+ "verification": {
44
+ "verdict": "true",
45
+ "message": "This claim is accurate based on NASA's recent findings",
46
+ "reasoning": "The discovery was confirmed by multiple telescopes and peer-reviewed research",
47
+ "verification_date": datetime.now() - timedelta(hours=2),
48
+ "sources": {
49
+ "count": 3,
50
+ "links": [
51
+ "https://www.nasa.gov/feature/nasa-discovers-new-exoplanet",
52
+ "https://www.nature.com/articles/space-discovery-2024",
53
+ "https://www.scientificamerican.com/article/new-habitable-planet"
54
+ ],
55
+ "titles": [
56
+ "NASA Discovers New Exoplanet",
57
+ "Nature: Space Discovery 2024",
58
+ "Scientific American: New Habitable Planet Found"
59
+ ]
60
+ }
61
+ },
62
+ "stored_at": datetime.now() - timedelta(hours=2)
63
+ },
64
+ {
65
+ "post_id": "sample_rumour_002",
66
+ "claim": "Breaking: Major tech company announces they're shutting down all services",
67
+ "summary": "A viral post claims a major technology company is discontinuing all its services",
68
+ "platform": "Facebook",
69
+ "Post_link": "https://facebook.com/example/posts/987654321",
70
+ "verification": {
71
+ "verdict": "false",
72
+ "message": "This is completely false and has been debunked by the company",
73
+ "reasoning": "The company's official channels have confirmed this is a hoax. No such announcement was made.",
74
+ "verification_date": datetime.now() - timedelta(hours=1, minutes=30),
75
+ "sources": {
76
+ "count": 2,
77
+ "links": [
78
+ "https://company.com/official-statement",
79
+ "https://techcrunch.com/company-denies-shutdown-rumors"
80
+ ],
81
+ "titles": [
82
+ "Official Company Statement",
83
+ "TechCrunch: Company Denies Shutdown Rumors"
84
+ ]
85
+ }
86
+ },
87
+ "stored_at": datetime.now() - timedelta(hours=1, minutes=30)
88
+ },
89
+ {
90
+ "post_id": "sample_rumour_003",
91
+ "claim": "New study shows that coffee increases life expectancy by 5 years",
92
+ "summary": "A recent research paper claims significant health benefits from coffee consumption",
93
+ "platform": "Instagram",
94
+ "Post_link": "https://instagram.com/p/coffee-study-2024",
95
+ "verification": {
96
+ "verdict": "mostly true",
97
+ "message": "While coffee does have health benefits, the 5-year claim is exaggerated",
98
+ "reasoning": "Studies show moderate coffee consumption has health benefits, but the specific 5-year claim is not supported by the research cited.",
99
+ "verification_date": datetime.now() - timedelta(minutes=45),
100
+ "sources": {
101
+ "count": 4,
102
+ "links": [
103
+ "https://www.nejm.org/journal/coffee-health-study",
104
+ "https://www.mayoclinic.org/coffee-health-benefits",
105
+ "https://www.hsph.harvard.edu/coffee-research",
106
+ "https://www.healthline.com/coffee-life-expectancy-study"
107
+ ],
108
+ "titles": [
109
+ "NEJM: Coffee Health Study",
110
+ "Mayo Clinic: Coffee Health Benefits",
111
+ "Harvard: Coffee Research",
112
+ "Healthline: Coffee Life Expectancy Study"
113
+ ]
114
+ }
115
+ },
116
+ "stored_at": datetime.now() - timedelta(minutes=45)
117
+ },
118
+ {
119
+ "post_id": "sample_rumour_004",
120
+ "claim": "Local restaurant caught serving expired food to customers",
121
+ "summary": "Social media posts allege a popular local restaurant is serving expired ingredients",
122
+ "platform": "Reddit",
123
+ "Post_link": "https://reddit.com/r/localnews/expired-food-restaurant",
124
+ "verification": {
125
+ "verdict": "disputed",
126
+ "message": "The claims are under investigation by health authorities",
127
+ "reasoning": "Health department inspection is ongoing. Some allegations have been confirmed, others are disputed by the restaurant management.",
128
+ "verification_date": datetime.now() - timedelta(minutes=20),
129
+ "sources": {
130
+ "count": 3,
131
+ "links": [
132
+ "https://healthdept.gov/inspection-reports",
133
+ "https://localnews.com/restaurant-investigation",
134
+ "https://restaurant.com/official-response"
135
+ ],
136
+ "titles": [
137
+ "Health Department Inspection Reports",
138
+ "Local News: Restaurant Investigation",
139
+ "Restaurant Official Response"
140
+ ]
141
+ }
142
+ },
143
+ "stored_at": datetime.now() - timedelta(minutes=20)
144
+ },
145
+ {
146
+ "post_id": "sample_rumour_005",
147
+ "claim": "Mysterious lights spotted in the sky over the city last night",
148
+ "summary": "Multiple reports of unusual lights in the night sky",
149
+ "platform": "TikTok",
150
+ "Post_link": "https://tiktok.com/@user/video/mysterious-lights-city",
151
+ "verification": {
152
+ "verdict": "unverified",
153
+ "message": "Unable to verify the source or authenticity of these reports",
154
+ "reasoning": "No official explanation has been provided. Could be various phenomena including aircraft, drones, or natural occurrences.",
155
+ "verification_date": datetime.now() - timedelta(minutes=10),
156
+ "sources": {
157
+ "count": 2,
158
+ "links": [
159
+ "https://weather.gov/sky-conditions-report",
160
+ "https://faa.gov/flight-tracker-archive"
161
+ ],
162
+ "titles": [
163
+ "Weather Service: Sky Conditions Report",
164
+ "FAA: Flight Tracker Archive"
165
+ ]
166
+ }
167
+ },
168
+ "stored_at": datetime.now() - timedelta(minutes=10)
169
+ },
170
+ {
171
+ "post_id": "sample_rumour_006",
172
+ "claim": "Viral deepfake shows the president announcing an unexpected policy change",
173
+ "summary": "A widely shared video appears to show a surprise announcement from the president",
174
+ "platform": "YouTube",
175
+ "Post_link": "https://youtube.com/watch?v=deepfake-announcement",
176
+ "verification": {
177
+ "verdict": "false",
178
+ "message": "The clip is a deepfake; official channels have no record of this announcement",
179
+ "reasoning": "Audio-visual artifacts and mismatch with verified schedule indicate synthetic media",
180
+ "verification_date": datetime.now() - timedelta(minutes=5),
181
+ "sources": {
182
+ "count": 2,
183
+ "links": [
184
+ "https://whitehouse.gov/schedule",
185
+ "https://journal.example.com/deepfake-analysis"
186
+ ],
187
+ "titles": [
188
+ "Official Schedule",
189
+ "Deepfake Analysis"
190
+ ]
191
+ }
192
+ },
193
+ "stored_at": datetime.now() - timedelta(minutes=5)
194
+ },
195
+ {
196
+ "post_id": "sample_rumour_007",
197
+ "claim": "Wildfire evacuation map shows entire county under immediate threat",
198
+ "summary": "A map circulating online claims an entire county is being evacuated",
199
+ "platform": "Telegram",
200
+ "Post_link": "https://t.me/channel/wildfire-map",
201
+ "verification": {
202
+ "verdict": "disputed",
203
+ "message": "Only specific zones are under watch; no county-wide evacuation order",
204
+ "reasoning": "Emergency management alerts list partial warnings, not blanket evacuations",
205
+ "verification_date": datetime.now() - timedelta(minutes=8),
206
+ "sources": {
207
+ "count": 2,
208
+ "links": [
209
+ "https://alerts.example.gov/region-updates",
210
+ "https://county.gov/emergency"
211
+ ],
212
+ "titles": [
213
+ "Regional Alerts",
214
+ "County Emergency Updates"
215
+ ]
216
+ }
217
+ },
218
+ "stored_at": datetime.now() - timedelta(minutes=8)
219
+ },
220
+ {
221
+ "post_id": "sample_rumour_008",
222
+ "claim": "Celebrity X claimed in 2015 that vaccines are a government tracking program",
223
+ "summary": "A screenshot attributes an anti-vaccine quote to a well-known actor",
224
+ "platform": "Threads",
225
+ "Post_link": "https://www.threads.net/@user/post/abc123",
226
+ "verification": {
227
+ "verdict": "false",
228
+ "message": "No credible source supports this quote; likely fabricated image",
229
+ "reasoning": "Archive search and press records show no such statement from the celebrity",
230
+ "verification_date": datetime.now() - timedelta(minutes=12),
231
+ "sources": {
232
+ "count": 3,
233
+ "links": [
234
+ "https://archive.org/celebrity-press",
235
+ "https://newsdb.example.com/search",
236
+ "https://snopes.com/fact-check/celebrity-misattributed-quote"
237
+ ],
238
+ "titles": [
239
+ "Press Archive",
240
+ "News Database",
241
+ "Fact Check"
242
+ ]
243
+ }
244
+ },
245
+ "stored_at": datetime.now() - timedelta(minutes=12)
246
+ },
247
+ {
248
+ "post_id": "sample_rumour_009",
249
+ "claim": "Nationwide vaccine recall announced due to severe side effects",
250
+ "summary": "Posts claim an emergency recall affecting all batches",
251
+ "platform": "WhatsApp",
252
+ "Post_link": "https://example.com/forwarded-message",
253
+ "verification": {
254
+ "verdict": "false",
255
+ "message": "No regulatory recall issued; official notices contradict the claim",
256
+ "reasoning": "Regulatory databases list no recall matching the description",
257
+ "verification_date": datetime.now() - timedelta(minutes=25),
258
+ "sources": {
259
+ "count": 2,
260
+ "links": [
261
+ "https://fda.gov/recalls",
262
+ "https://who.int/medical-product-alerts"
263
+ ],
264
+ "titles": [
265
+ "FDA Recalls",
266
+ "WHO Alerts"
267
+ ]
268
+ }
269
+ },
270
+ "stored_at": datetime.now() - timedelta(minutes=25)
271
+ },
272
+ {
273
+ "post_id": "sample_rumour_010",
274
+ "claim": "Earthquake predicted to hit the capital city at 7 PM tonight",
275
+ "summary": "A viral message predicts an exact time for a major quake",
276
+ "platform": "TikTok",
277
+ "Post_link": "https://tiktok.com/@user/video/quake-prediction",
278
+ "verification": {
279
+ "verdict": "false",
280
+ "message": "Earthquakes cannot be predicted with exact timing using current science",
281
+ "reasoning": "Seismology consensus rejects precise short-term predictions",
282
+ "verification_date": datetime.now() - timedelta(minutes=18),
283
+ "sources": {
284
+ "count": 2,
285
+ "links": [
286
+ "https://usgs.gov/faqs/can-you-predict-earthquakes",
287
+ "https://seismo.org/position-on-prediction"
288
+ ],
289
+ "titles": [
290
+ "USGS FAQs",
291
+ "Seismology Position"
292
+ ]
293
+ }
294
+ },
295
+ "stored_at": datetime.now() - timedelta(minutes=18)
296
+ },
297
+ {
298
+ "post_id": "sample_rumour_011",
299
+ "claim": "Poll shows 98% support for Candidate Y after overnight update",
300
+ "summary": "Graphic claims near-unanimous polling shift in one night",
301
+ "platform": "X",
302
+ "Post_link": "https://x.com/example/status/shifted-poll",
303
+ "verification": {
304
+ "verdict": "uncertain",
305
+ "message": "No reputable pollster has published this figure; methodology unclear",
306
+ "reasoning": "Source lacks sampling details; awaiting official releases",
307
+ "verification_date": datetime.now() - timedelta(minutes=30),
308
+ "sources": {
309
+ "count": 2,
310
+ "links": [
311
+ "https://fivethirtyeight.com/polls/",
312
+ "https://aapor.org/methods-standards"
313
+ ],
314
+ "titles": [
315
+ "Polling Aggregator",
316
+ "Survey Standards"
317
+ ]
318
+ }
319
+ },
320
+ "stored_at": datetime.now() - timedelta(minutes=30)
321
+ }
322
+ ]
323
+
324
+ print("🔄 Adding sample rumour data to MongoDB...")
325
+
326
+ added_count = 0
327
+ skipped_count = 0
328
+
329
+ for rumour in sample_rumours:
330
+ try:
331
+ # Try to insert the document
332
+ result = collection.insert_one(rumour)
333
+ print(f"✅ Added rumour: {rumour['post_id']} - {rumour['claim'][:50]}...")
334
+ added_count += 1
335
+
336
+ except DuplicateKeyError:
337
+ print(f"⚠️ Skipped rumour (already exists): {rumour['post_id']}")
338
+ skipped_count += 1
339
+
340
+ except Exception as e:
341
+ print(f"❌ Error adding rumour {rumour['post_id']}: {e}")
342
+
343
+ print(f"\n📊 Summary:")
344
+ print(f" ✅ Added: {added_count} rumours")
345
+ print(f" ⚠️ Skipped: {skipped_count} rumours")
346
+ print(f" 📝 Total in database: {collection.count_documents({})} rumours")
347
+
348
+ # Close connection
349
+ client.close()
350
+ print("\n🔌 MongoDB connection closed")
351
+
352
+ def test_realtime_update():
353
+ """Add a new rumour to test real-time updates"""
354
+
355
+ client = get_mongo_client()
356
+ db = client['aegis']
357
+ collection = db['debunk_posts']
358
+
359
+ # Create a new rumour with current timestamp
360
+ new_rumour = {
361
+ "post_id": f"test_realtime_{int(datetime.now().timestamp())}",
362
+ "claim": "Test real-time update: This is a new rumour added for testing WebSocket functionality",
363
+ "summary": "This rumour was added to test the real-time WebSocket update system",
364
+ "platform": "Test Platform",
365
+ "Post_link": "https://example.com/test-realtime-update",
366
+ "verification": {
367
+ "verdict": "true",
368
+ "message": "This is a test rumour for real-time updates",
369
+ "reasoning": "Added programmatically to verify WebSocket functionality",
370
+ "verification_date": datetime.now(),
371
+ "sources": {
372
+ "count": 1,
373
+ "links": ["https://example.com/test-source"],
374
+ "titles": ["Test Source"]
375
+ }
376
+ },
377
+ "stored_at": datetime.now()
378
+ }
379
+
380
+ print("🔄 Adding test rumour for real-time update...")
381
+
382
+ try:
383
+ result = collection.insert_one(new_rumour)
384
+ print(f"✅ Test rumour added successfully!")
385
+ print(f" 📝 Post ID: {new_rumour['post_id']}")
386
+ print(f" 📅 Added at: {new_rumour['stored_at']}")
387
+ print(f" 🔍 MongoDB ID: {result.inserted_id}")
388
+ print("\n💡 Check your frontend - you should see this new rumour appear automatically!")
389
+
390
+ except Exception as e:
391
+ print(f"❌ Error adding test rumour: {e}")
392
+
393
+ # Close connection
394
+ client.close()
395
+ print("\n🔌 MongoDB connection closed")
396
+
397
+ if __name__ == "__main__":
398
+ print("🚀 MongoDB Sample Data Script")
399
+ print("=" * 50)
400
+
401
+ if len(sys.argv) > 1 and sys.argv[1] == "test":
402
+ test_realtime_update()
403
+ else:
404
+ add_sample_rumours()
405
+
406
+ print("\n✨ Script completed!")
407
+ print("\n💡 Usage:")
408
+ print(" python add_sample_data.py # Add sample rumours")
409
+ print(" python add_sample_data.py test # Add test rumour for real-time updates")
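Note that the DuplicateKeyError branch above only fires if the debunk_posts collection has a unique index on post_id; the script itself never creates one. A one-off sketch of setting that index up, assuming the same MONGO_CONNECTION_STRING, database, and collection names used by the script:

```python
# One-time setup so re-running add_sample_data.py raises DuplicateKeyError
# instead of inserting duplicate post_ids.
# Assumes the same MONGO_CONNECTION_STRING / 'aegis' / 'debunk_posts' names as above.
import os

from dotenv import load_dotenv
from pymongo import ASCENDING, MongoClient

load_dotenv()
client = MongoClient(os.environ["MONGO_CONNECTION_STRING"])
client["aegis"]["debunk_posts"].create_index(
    [("post_id", ASCENDING)], unique=True, name="post_id_unique"
)
client.close()
```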
config.py CHANGED
@@ -11,7 +11,7 @@ class Config:
     SERP_API_KEY: Optional[str] = os.getenv("SERP_API_KEY")
     SERPAPI_BASE_URL: str = "https://serpapi.com/search"
     GEMINI_API_KEY: Optional[str] = os.getenv("GEMINI_API_KEY")
-    GEMINI_MODEL: str = os.getenv("GEMINI_MODEL", "gemini-2.0-flash")
+    GEMINI_MODEL: str = os.getenv("GEMINI_MODEL", "gemini-2.5-flash")
     GEMINI_TEMPERATURE: float = float(os.getenv("GEMINI_TEMPERATURE", "0.1"))
     GEMINI_TOP_P: float = float(os.getenv("GEMINI_TOP_P", "0.8"))
     GEMINI_MAX_TOKENS: int = int(os.getenv("GEMINI_MAX_TOKENS", "1000000"))
@@ -85,6 +85,11 @@ class Config:
     UPSTASH_REDIS_TOKEN: Optional[str] = os.getenv("UPSTASH_REDIS_TOKEN")
     REDIS_TTL: int = int(os.getenv("REDIS_TTL", "86400")) # 24 hours in seconds
 
+    # Razorpay Configuration
+    RAZORPAY_ID: Optional[str] = os.getenv("RAZORPAY_ID")
+    RAZORPAY_KEY: Optional[str] = os.getenv("RAZORPAY_KEY")
+    RAZORPAY_WEBHOOK_SECRET: Optional[str] = os.getenv("RAZORPAY_WEBHOOK_SECRET")
+
     @classmethod
     def validate(cls) -> bool:
         """Validate configuration values"""
main.py CHANGED
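main.py (below) instantiates a RazorpayService around the new credentials, but services/razorpay_service.py is not part of this commit. As an assumption only, a wrapper would typically pass RAZORPAY_ID / RAZORPAY_KEY (key id and key secret) to the official razorpay SDK client roughly like this:

```python
# Illustrative sketch only: services/razorpay_service.py is not included in this diff.
# Assumes RAZORPAY_ID / RAZORPAY_KEY hold the Razorpay key id and key secret.
from typing import Optional

import razorpay

from config import config


def build_razorpay_client() -> Optional[razorpay.Client]:
    """Return a configured razorpay.Client, or None when credentials are missing."""
    if not (config.RAZORPAY_ID and config.RAZORPAY_KEY):
        # main.py likewise tolerates a missing Razorpay setup at startup
        return None
    return razorpay.Client(auth=(config.RAZORPAY_ID, config.RAZORPAY_KEY))
```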
@@ -1,4 +1,4 @@
-from fastapi import FastAPI, File, UploadFile, HTTPException, Form, WebSocket, WebSocketDisconnect
 from typing import Optional, List, Dict, Any
 from fastapi.responses import FileResponse
 from fastapi.middleware.cors import CORSMiddleware
@@ -21,9 +21,13 @@ from services.text_fact_checker import TextFactChecker
 from services.educational_content_generator import EducationalContentGenerator
 from services.mongodb_service import MongoDBService
 from services.websocket_service import connection_manager, initialize_mongodb_change_stream, cleanup_mongodb_change_stream
 from utils.file_utils import save_upload_file, cleanup_temp_files
 from config import config
 from services.deepfake_checker import detect_audio_deepfake
 
 app = FastAPI(
     title="Visual Verification Service",
@@ -36,9 +40,18 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 # Add CORS middleware
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
@@ -65,15 +78,145 @@ try:
 except Exception as e:
     print(f"Warning: MongoDB service initialization failed: {e}")
 
 # Initialize MongoDB change service (will be set in startup event)
 mongodb_change_service = None
 
 @app.on_event("startup")
 async def startup_event():
     """Initialize services on startup"""
     global mongodb_change_service
     try:
         mongodb_change_service = await initialize_mongodb_change_stream()
         logger.info("✅ All services initialized successfully")
     except Exception as e:
         logger.error(f"❌ Failed to initialize services: {e}")
@@ -224,6 +367,407 @@ async def verify_text(
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.post("/chatbot/verify")
 async def chatbot_verify(
     text_input: Optional[str] = Form(None),
@@ -313,11 +857,11 @@ async def chatbot_verify(
     try:
         gemini_prompt = f"""
 You are an assistant for audio authenticity analysis.
-File name: {os.path.basename(file_path)}
 {('User question: ' + claim_context) if claim_context else ''}
 The audio has been analyzed and the result is: {'deepfake' if deepfake else 'NOT deepfake'}.
 Compose a clear, friendly, 1-2 line summary verdict for the user, tailored to the above context/result (do not answer with JSON or code, just a natural response).
 Avoid repeating 'deepfake detection' technical language; be concise and direct.
 """
         gemini_response = input_processor_for_audio.model.generate_content(gemini_prompt)
         ai_message = None
@@ -373,6 +917,75 @@ Avoid repeating 'deepfake detection' technical language; be concise and direct.
     print(f"🔍 DEBUG: Processing {len(urls_list)} URLs")
     for i, url in enumerate(urls_list):
         print(f"🔍 DEBUG: Processing URL {i}: {url}")
         if verification_type == "image":
             print(f"🔍 DEBUG: Calling image_verifier.verify for URL")
             result = await image_verifier.verify(
@@ -563,6 +1176,58 @@ async def get_recent_debunk_posts(limit: int = 5):
         print(f"🔍 DEBUG: Exception type: {type(e).__name__}")
         raise HTTPException(status_code=500, detail=str(e))
 
 @app.get("/health")
 async def health_check():
     return {"status": "healthy", "service": "visual-verification"}
@@ -708,5 +1373,736 @@ async def get_cache_status():
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=config.SERVICE_PORT)
1
+ from fastapi import FastAPI, File, UploadFile, HTTPException, Form, WebSocket, WebSocketDisconnect, Request
2
  from typing import Optional, List, Dict, Any
3
  from fastapi.responses import FileResponse
4
  from fastapi.middleware.cors import CORSMiddleware
 
21
  from services.educational_content_generator import EducationalContentGenerator
22
  from services.mongodb_service import MongoDBService
23
  from services.websocket_service import connection_manager, initialize_mongodb_change_stream, cleanup_mongodb_change_stream
24
+ from services.razorpay_service import RazorpayService
25
+ import razorpay.errors
26
  from utils.file_utils import save_upload_file, cleanup_temp_files
27
  from config import config
28
  from services.deepfake_checker import detect_audio_deepfake
29
+ from services.youtube_caption import get_youtube_transcript_ytdlp
30
+ import google.generativeai as genai
31
 
32
  app = FastAPI(
33
  title="Visual Verification Service",
 
40
  logger = logging.getLogger(__name__)
41
 
42
  # Add CORS middleware
43
+ # Note: When allow_credentials=True, you cannot use allow_origins=["*"]
44
+ # Must specify exact origins
45
+ # Chrome extensions make requests from background scripts which bypass CORS,
46
+ # but we include common origins for web frontend access
47
  app.add_middleware(
48
  CORSMiddleware,
49
+ allow_origins=[
50
+ "http://localhost:5173",
51
+ "http://127.0.0.1:5173",
52
+ "http://localhost:3000",
53
+ "http://127.0.0.1:3000",
54
+ ],
55
  allow_credentials=True,
56
  allow_methods=["*"],
57
  allow_headers=["*"],
 
78
  except Exception as e:
79
  print(f"Warning: MongoDB service initialization failed: {e}")
80
 
81
+ # Initialize Razorpay service
82
+ razorpay_service = None
83
+ try:
84
+ razorpay_service = RazorpayService()
85
+ except Exception as e:
86
+ print(f"Warning: Razorpay service initialization failed: {e}")
87
+
88
  # Initialize MongoDB change service (will be set in startup event)
89
  mongodb_change_service = None
90
 
91
+ async def initialize_subscription_plans():
92
+ """Initialize subscription plans in Razorpay if they don't exist"""
93
+ if not razorpay_service or not razorpay_service.client:
94
+ logger.warning("⚠️ Razorpay service not available. Skipping plan initialization.")
95
+ return
96
+
97
+ # First, test Razorpay connection by trying to fetch account details or make a simple API call
98
+ try:
99
+ # Try to verify credentials work by attempting a simple operation
100
+ # We'll skip listing plans if it fails and just try to create
101
+ logger.info("🔍 Testing Razorpay API connection...")
102
+ except Exception as e:
103
+ logger.error(f"❌ Razorpay API connection test failed: {e}")
104
+ logger.warning("⚠️ Skipping plan initialization due to API connection issues")
105
+ return
106
+
107
+ try:
108
+ # Try to list existing plans, but don't fail if it errors
109
+ existing_plan_names = set()
110
+ try:
111
+ existing_plans = razorpay_service.list_plans(count=100)
112
+ if existing_plans and existing_plans.get("items"):
113
+ existing_plan_names = {
114
+ p.get("item", {}).get("name")
115
+ for p in existing_plans.get("items", [])
116
+ if p.get("item", {}).get("name")
117
+ }
118
+ logger.info(f"📋 Found {len(existing_plan_names)} existing plans")
119
+ except Exception as list_error:
120
+ error_msg = str(list_error).lower()
121
+ if "not found" in error_msg or "404" in error_msg:
122
+ logger.info("ℹ️ No existing plans found (this is normal for new accounts)")
123
+ else:
124
+ logger.warning(f"⚠️ Could not list existing plans: {list_error}")
125
+ # Continue anyway - we'll try to create plans and handle duplicates
126
+
127
+ plans_to_create = [
128
+ {
129
+ "name": "Plan 1",
130
+ "amount": 100, # 1 INR in paise
131
+ "currency": "INR",
132
+ "interval": 1,
133
+ "period": "monthly",
134
+ "description": "Plan 1 - Monthly Subscription (1 INR)"
135
+ },
136
+ {
137
+ "name": "Plan 2",
138
+ "amount": 200, # 2 INR in paise
139
+ "currency": "INR",
140
+ "interval": 1,
141
+ "period": "monthly",
142
+ "description": "Plan 2 - Monthly Subscription (2 INR)"
143
+ },
144
+ {
145
+ "name": "Plan 3",
146
+ "amount": 300, # 3 INR in paise
147
+ "currency": "INR",
148
+ "interval": 1,
149
+ "period": "monthly",
150
+ "description": "Plan 3 - Monthly Subscription (3 INR)"
151
+ }
152
+ ]
153
+
154
+ created_count = 0
155
+ skipped_count = 0
156
+ error_count = 0
157
+
158
+ for plan_data in plans_to_create:
159
+ plan_name = plan_data["name"]
160
+
161
+ # Check if plan already exists
162
+ if plan_name in existing_plan_names:
163
+ logger.info(f"⏭️ Plan {plan_name} already exists, skipping")
164
+ skipped_count += 1
165
+ continue
166
+
167
+ try:
168
+ logger.info(f"🔄 Creating plan: {plan_name}...")
169
+ plan = razorpay_service.create_plan(**plan_data)
170
+ logger.info(f"✅ Created subscription plan: {plan_name} (ID: {plan.get('id')})")
171
+ created_count += 1
172
+ except razorpay.errors.BadRequestError as e:
173
+ error_msg = str(e).lower()
174
+ # Check if error is due to plan already existing (duplicate)
175
+ if "already exists" in error_msg or "duplicate" in error_msg:
176
+ logger.info(f"⏭️ Plan {plan_name} already exists (detected during creation), skipping")
177
+ skipped_count += 1
178
+ else:
179
+ logger.error(f"❌ BadRequestError creating plan {plan_name}: {e}")
180
+ error_count += 1
181
+ except Exception as e:
182
+ error_msg = str(e).lower()
183
+ # Check if error is due to plan already existing (duplicate)
184
+ if "already exists" in error_msg or "duplicate" in error_msg:
185
+ logger.info(f"⏭️ Plan {plan_name} already exists (detected during creation), skipping")
186
+ skipped_count += 1
187
+ elif "not found" in error_msg or "404" in error_msg:
188
+ logger.error(f"❌ API endpoint not found for plan {plan_name}. Check Razorpay credentials and API access.")
189
+ logger.error(f" Error details: {e}")
190
+ error_count += 1
191
+ else:
192
+ logger.error(f"❌ Failed to create plan {plan_name}: {e}")
193
+ error_count += 1
194
+
195
+ if created_count > 0:
196
+ logger.info(f"✅ Successfully created {created_count} subscription plans")
197
+ if skipped_count > 0:
198
+ logger.info(f"⏭️ Skipped {skipped_count} plans (already exist)")
199
+ if error_count > 0:
200
+ logger.warning(f"⚠️ {error_count} plans failed to create. Check Razorpay credentials and API permissions.")
201
+ if created_count == 0 and skipped_count == 0 and error_count > 0:
202
+ logger.error("❌ All plan creation attempts failed. Please verify:")
203
+ logger.error(" 1. RAZORPAY_ID and RAZORPAY_KEY are correct")
204
+ logger.error(" 2. API keys have subscription/plan creation permissions")
205
+ logger.error(" 3. Razorpay account has subscriptions feature enabled")
206
+
207
+ except Exception as e:
208
+ logger.error(f"❌ Failed to initialize subscription plans: {e}")
209
+ import traceback
210
+ logger.error(traceback.format_exc())
211
+
212
  @app.on_event("startup")
213
  async def startup_event():
214
  """Initialize services on startup"""
215
  global mongodb_change_service
216
  try:
217
  mongodb_change_service = await initialize_mongodb_change_stream()
218
+ # Initialize subscription plans
219
+ await initialize_subscription_plans()
220
  logger.info("✅ All services initialized successfully")
221
  except Exception as e:
222
  logger.error(f"❌ Failed to initialize services: {e}")
 
367
  except Exception as e:
368
  raise HTTPException(status_code=500, detail=str(e))
369
 
370
+ async def _extract_media_from_url(url: str) -> Optional[Dict[str, Any]]:
371
+ """
372
+ Use yt-dlp to extract media from a URL and determine if it's an image or video.
373
+
374
+ Returns:
375
+ Dict with "type" ("image" or "video") and "path" (local file path), or None if fails
376
+ """
377
+ try:
378
+ from shutil import which
379
+ import subprocess
380
+ import tempfile
381
+
382
+ # Resolve yt-dlp binary
383
+ ytdlp_bin = config.YTDLP_BIN or "yt-dlp"
384
+ found = which(ytdlp_bin) or which("yt-dlp")
385
+ if not found:
386
+ print("[extract_media] yt-dlp not found")
387
+ return None
388
+
389
+ # Create temp directory
390
+ temp_dir = tempfile.mkdtemp(prefix="media_extract_")
391
+
392
+ # First, get info about the media
393
+ info_cmd = [found, url, "--dump-json", "--no-playlist"]
394
+ result = subprocess.run(
395
+ info_cmd,
396
+ capture_output=True,
397
+ text=True,
398
+ timeout=30
399
+ )
400
+
401
+ if result.returncode != 0:
402
+ print(f"[extract_media] yt-dlp info failed: {result.stderr}")
403
+ return None
404
+
405
+ info = json.loads(result.stdout)
406
+
407
+ # Determine media type
408
+ ext = info.get("ext", "").lower()
409
+ is_video = ext in ["mp4", "webm", "mkv", "avi", "mov", "flv", "m4v"]
410
+ is_image = ext in ["jpg", "jpeg", "png", "gif", "webp", "bmp"]
411
+
412
+ if not is_video and not is_image:
413
+ # Check formats to determine type
414
+ formats = info.get("formats", [])
415
+ has_video_codec = any(f.get("vcodec") != "none" for f in formats)
416
+ has_audio_codec = any(f.get("acodec") != "none" for f in formats)
417
+
418
+ if has_video_codec:
419
+ is_video = True
420
+ elif not has_audio_codec and not has_video_codec:
421
+ # Likely an image
422
+ is_image = True
423
+
424
+ media_type = "video" if is_video else "image"
425
+
426
+ # Download the media
427
+ output_template = os.path.join(temp_dir, f"media.%(ext)s")
428
+ download_cmd = [
429
+ found,
430
+ url,
431
+ "-o", output_template,
432
+ "--no-playlist",
433
+ ]
434
+
435
+ # For images, prefer best quality; for videos, get best format
436
+ if is_image:
437
+ download_cmd.extend(["--format", "best"])
438
+ else:
439
+ download_cmd.extend(["--format", "best[ext=mp4]/best"])
440
+
441
+ result = subprocess.run(
442
+ download_cmd,
443
+ capture_output=True,
444
+ text=True,
445
+ timeout=60
446
+ )
447
+
448
+ if result.returncode != 0:
449
+ print(f"[extract_media] yt-dlp download failed: {result.stderr}")
450
+ return None
451
+
452
+ # Find the downloaded file
453
+ downloaded_files = [f for f in os.listdir(temp_dir) if os.path.isfile(os.path.join(temp_dir, f))]
454
+ if not downloaded_files:
455
+ print("[extract_media] No file downloaded")
456
+ return None
457
+
458
+ media_path = os.path.join(temp_dir, downloaded_files[0])
459
+
460
+ return {
461
+ "type": media_type,
462
+ "path": media_path,
463
+ "temp_dir": temp_dir # Keep for cleanup
464
+ }
465
+
466
+ except Exception as e:
467
+ print(f"[extract_media] Error: {e}")
468
+ import traceback
469
+ print(traceback.format_exc())
470
+ return None
471
+
472
+
473
+ def _is_youtube_url(url: str) -> bool:
474
+ """Check if URL is a YouTube URL"""
475
+ url_lower = url.lower()
476
+ youtube_domains = ['youtube.com', 'youtu.be', 'www.youtube.com', 'www.youtu.be', 'm.youtube.com']
477
+ return any(domain in url_lower for domain in youtube_domains)
478
+
479
+
480
+ async def _generate_claims_summary(claim_results: List[Dict[str, Any]], gemini_model) -> str:
481
+ """Generate a comprehensive summary of all claim verification results using Gemini"""
482
+ try:
483
+ # Prepare claims data for Gemini
484
+ claims_data = []
485
+ for i, result in enumerate(claim_results, 1):
486
+ claims_data.append({
487
+ "number": i,
488
+ "claim": result.get("claim_text", ""),
489
+ "verdict": result.get("verdict", "uncertain"),
490
+ "explanation": result.get("message", "No explanation available")
491
+ })
492
+
493
+ prompt = f"""You are a fact-checking summary writer. Based on the following verified claims from a YouTube video, create a comprehensive, user-friendly summary.
494
+
495
+ CLAIM VERIFICATION RESULTS:
496
+ {json.dumps(claims_data, indent=2)}
497
+
498
+ Your task is to create a clear, concise summary that:
499
+ 1. Lists each claim with its verdict (TRUE/FALSE/MIXED/UNCERTAIN)
500
+ 2. Explains WHY each claim is true or false in simple terms
501
+ 3. Highlights the most important findings
502
+ 4. Provides an overall assessment of the video's factual accuracy
503
+
504
+ Format your response as a well-structured summary that is easy to read. Use clear sections and bullet points where appropriate.
505
+
506
+ IMPORTANT:
507
+ - Be concise but thorough
508
+ - Explain the reasoning for each verdict
509
+ - Focus on the most significant false or misleading claims
510
+ - Keep the tone professional and informative
511
+ - Do NOT use markdown formatting, just plain text with clear structure
512
+
513
+ Return ONLY the summary text, no JSON or code blocks."""
514
+
515
+ response = gemini_model.generate_content(prompt)
516
+ response_text = response.text.strip()
517
+
518
+ # Clean up response if needed
519
+ if response_text.startswith('```'):
520
+ response_text = re.sub(r'^```[a-z]*\n?', '', response_text, flags=re.IGNORECASE)
521
+ response_text = re.sub(r'```$', '', response_text, flags=re.IGNORECASE).strip()
522
+
523
+ print(f"✅ Generated comprehensive summary")
524
+ return response_text
525
+
526
+ except Exception as e:
527
+ print(f"❌ Error generating summary with Gemini: {e}")
528
+ import traceback
529
+ print(traceback.format_exc())
530
+ # Fallback to simple concatenation
531
+ summary_parts = []
532
+ summary_parts.append(f"Analyzed {len(claim_results)} controversial claim(s) from the video transcript:\n")
533
+
534
+ for i, result in enumerate(claim_results, 1):
535
+ claim_text = result.get("claim_text", "")
536
+ verdict = result.get("verdict", "uncertain")
537
+ message = result.get("message", "No explanation available")
538
+
539
+ claim_display = claim_text[:150] + "..." if len(claim_text) > 150 else claim_text
540
+
541
+ verdict_label = {
542
+ "true": "✅ TRUE",
543
+ "false": "❌ FALSE",
544
+ "mixed": "⚠️ MIXED",
545
+ "uncertain": "❓ UNCERTAIN",
546
+ "error": "⚠️ ERROR"
547
+ }.get(verdict, "❓ UNCERTAIN")
548
+
549
+ summary_parts.append(f"\n{i}. {verdict_label}: {claim_display}")
550
+ summary_parts.append(f" Explanation: {message}")
551
+
552
+ return "\n".join(summary_parts)
553
+
554
+
555
+ async def _extract_claims_from_captions(captions: str, gemini_model) -> List[str]:
556
+ """Extract top 5 controversial claims from video captions using Gemini"""
557
+ try:
558
+ prompt = f"""You are a fact-checking assistant. Analyze the following video transcript and extract the TOP 5 MOST CONTROVERSIAL and verifiable claims that were mentioned in the video.
559
+
560
+ VIDEO TRANSCRIPT:
561
+ {captions}
562
+
563
+ Your task is to identify the 5 MOST controversial, factual claims that can be verified. Prioritize:
564
+ - Claims about events, statistics, or facts that are controversial or disputed
565
+ - Claims about people, organizations, or institutions that are potentially misleading
566
+ - Claims that are specific enough to be fact-checked and are likely to be false or disputed
567
+ - Claims that have significant impact or are widely discussed
568
+
569
+ Ignore:
570
+ - General opinions or subjective statements
571
+ - Questions or hypothetical scenarios
572
+ - Vague statements without specific claims
573
+ - Small talk or filler content
574
+
575
+ IMPORTANT: Return EXACTLY 5 claims (or fewer if the video doesn't contain 5 verifiable controversial claims). Rank them by controversy/importance.
576
+
577
+ Return ONLY a JSON object in this exact format:
578
+ {{
579
+ "claims": [
580
+ "Claim 1 text here (most controversial)",
581
+ "Claim 2 text here",
582
+ "Claim 3 text here",
583
+ "Claim 4 text here",
584
+ "Claim 5 text here"
585
+ ]
586
+ }}
587
+
588
+ Return ONLY the JSON object, no other text or explanation."""
589
+
590
+ response = gemini_model.generate_content(prompt)
591
+ response_text = response.text.strip()
592
+
593
+ # Clean up response if needed
594
+ if response_text.startswith('```json'):
595
+ response_text = response_text.replace('```json', '').replace('```', '').strip()
596
+ elif response_text.startswith('```'):
597
+ response_text = response_text.replace('```', '').strip()
598
+
599
+ # Parse JSON response
600
+ parsed = json.loads(response_text)
601
+ claims = parsed.get("claims", [])
602
+
603
+ # Filter out empty claims and limit to 5
604
+ claims = [c.strip() for c in claims if c and c.strip()][:5]
605
+
606
+ print(f"✅ Extracted {len(claims)} claims from video captions")
607
+ return claims
608
+
609
+ except Exception as e:
610
+ print(f"❌ Error extracting claims from captions: {e}")
611
+ import traceback
612
+ print(traceback.format_exc())
613
+ return []
614
+
615
+
616
+ async def _verify_youtube_video(url: str, claim_context: str, claim_date: str) -> Dict[str, Any]:
617
+ """Verify a YouTube video by extracting captions, extracting claims, and verifying each claim"""
618
+ import tempfile
619
+ import asyncio
620
+
621
+ try:
622
+ print(f"🎥 Starting YouTube video verification for: {url}")
623
+
624
+ # Step 1: Extract captions
625
+ print(f"📝 Extracting captions from YouTube video...")
626
+ # Create a temporary file for the transcript output
627
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as temp_file:
628
+ temp_output_file = temp_file.name
629
+
630
+ # Run the synchronous function in an executor to avoid blocking
631
+ loop = asyncio.get_running_loop()
632
+ captions = await loop.run_in_executor(
633
+ None,
634
+ get_youtube_transcript_ytdlp,
635
+ url,
636
+ temp_output_file
637
+ )
638
+
639
+ # Clean up the temporary output file if it was created
640
+ try:
641
+ if os.path.exists(temp_output_file):
642
+ os.unlink(temp_output_file)
643
+ except Exception as cleanup_error:
644
+ print(f"⚠️ Warning: Could not clean up temp file {temp_output_file}: {cleanup_error}")
645
+
646
+ if not captions:
647
+ return {
648
+ "verified": False,
649
+ "verdict": "error",
650
+ "message": "Could not extract captions from the YouTube video. The video may not have captions available.",
651
+ "details": {
652
+ "video_url": url,
653
+ "error": "Caption extraction failed"
654
+ },
655
+ "source": "youtube_url"
656
+ }
657
+
658
+ print(f"✅ Extracted {len(captions)} characters of captions")
659
+
660
+ # Step 2: Extract claims using Gemini
661
+ print(f"🔍 Extracting controversial claims from captions...")
662
+ genai.configure(api_key=config.GEMINI_API_KEY)
663
+ gemini_model = genai.GenerativeModel(config.GEMINI_MODEL)
664
+
665
+ claims = await _extract_claims_from_captions(captions, gemini_model)
666
+
667
+ if not claims:
668
+ return {
669
+ "verified": False,
670
+ "verdict": "uncertain",
671
+ "message": "No verifiable claims were found in the video transcript. The video may contain only opinions, questions, or non-factual content.",
672
+ "details": {
673
+ "video_url": url,
674
+ "captions_length": len(captions),
675
+ "claims_extracted": 0
676
+ },
677
+ "source": "youtube_url"
678
+ }
679
+
680
+ print(f"✅ Extracted {len(claims)} claims, starting verification...")
681
+
682
+ # Step 3: Verify each claim
683
+ claim_results = []
684
+ for i, claim in enumerate(claims, 1):
685
+ print(f"🔍 Verifying claim {i}/{len(claims)}: {claim[:100]}...")
686
+ try:
687
+ verification_result = await text_fact_checker.verify(
688
+ text_input=claim,
689
+ claim_context=f"Claim from YouTube video: {url}",
690
+ claim_date=claim_date
691
+ )
692
+ verification_result["claim_text"] = claim
693
+ verification_result["claim_index"] = i
694
+ claim_results.append(verification_result)
695
+ except Exception as e:
696
+ print(f"❌ Error verifying claim {i}: {e}")
697
+ claim_results.append({
698
+ "claim_text": claim,
699
+ "claim_index": i,
700
+ "verified": False,
701
+ "verdict": "error",
702
+ "message": f"Error during verification: {str(e)}"
703
+ })
704
+
705
+ # Step 4: Combine results
706
+ print(f"📊 Combining {len(claim_results)} claim verification results...")
707
+
708
+ # Aggregate verdicts
709
+ verdicts = [r.get("verdict", "uncertain") for r in claim_results]
710
+ true_count = verdicts.count("true")
711
+ false_count = verdicts.count("false")
712
+ uncertain_count = verdicts.count("uncertain")
713
+ mixed_count = verdicts.count("mixed")
714
+ error_count = verdicts.count("error")
715
+
716
+ # Determine overall verdict
717
+ if false_count > 0:
718
+ overall_verdict = "false"
719
+ verified = False
720
+ elif true_count > 0 and false_count == 0:
721
+ overall_verdict = "true"
722
+ verified = True
723
+ elif mixed_count > 0:
724
+ overall_verdict = "mixed"
725
+ verified = False
726
+ elif uncertain_count > 0:
727
+ overall_verdict = "uncertain"
728
+ verified = False
729
+ else:
730
+ overall_verdict = "error"
731
+ verified = False
732
+
733
+ # Step 5: Generate comprehensive summary using Gemini
734
+ print(f"📝 Generating comprehensive summary with Gemini...")
735
+ combined_message = await _generate_claims_summary(claim_results, gemini_model)
736
+
737
+ return {
738
+ "verified": verified,
739
+ "verdict": overall_verdict,
740
+ "message": combined_message,
741
+ "details": {
742
+ "video_url": url,
743
+ "captions_length": len(captions),
744
+ "total_claims": len(claims),
745
+ "claims_verified": true_count,
746
+ "claims_false": false_count,
747
+ "claims_mixed": mixed_count,
748
+ "claims_uncertain": uncertain_count,
749
+ "claims_error": error_count,
750
+ "claim_results": claim_results
751
+ },
752
+ "source": "youtube_url"
753
+ }
754
+
755
+ except Exception as e:
756
+ print(f"❌ Error verifying YouTube video: {e}")
757
+ import traceback
758
+ print(traceback.format_exc())
759
+ return {
760
+ "verified": False,
761
+ "verdict": "error",
762
+ "message": f"Error processing YouTube video: {str(e)}",
763
+ "details": {
764
+ "video_url": url,
765
+ "error": str(e)
766
+ },
767
+ "source": "youtube_url"
768
+ }
769
+
770
+
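The overall verdict above follows a strict precedence: any debunked claim marks the whole video false, otherwise any verified claim marks it true, then mixed, then uncertain, with "error" only when every claim failed verification. A minimal standalone sketch of that aggregation (an illustrative helper, not part of the service code above):

import collections

def aggregate_verdicts(verdicts: list[str]) -> str:
    """Collapse per-claim verdicts into one overall verdict, most damaging first."""
    counts = collections.Counter(verdicts)
    if counts.get("false", 0) > 0:
        return "false"
    if counts.get("true", 0) > 0:
        return "true"
    if counts.get("mixed", 0) > 0:
        return "mixed"
    if counts.get("uncertain", 0) > 0:
        return "uncertain"
    return "error"

# Example: one debunked claim outweighs several verified ones.
print(aggregate_verdicts(["true", "true", "false", "uncertain"]))  # -> "false"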
771
  @app.post("/chatbot/verify")
772
  async def chatbot_verify(
773
  text_input: Optional[str] = Form(None),
 
857
  try:
858
  gemini_prompt = f"""
859
  You are an assistant for audio authenticity analysis.
 
860
  {('User question: ' + claim_context) if claim_context else ''}
861
  The audio has been analyzed and the result is: {'deepfake' if deepfake else 'NOT deepfake'}.
862
  Compose a clear, friendly, 1-2 line summary verdict for the user, tailored to the above context/result (do not answer with JSON or code, just a natural response).
863
  Avoid repeating 'deepfake detection' technical language; be concise and direct.
864
+ Do NOT mention file names or file paths in your response.
865
  """
866
  gemini_response = input_processor_for_audio.model.generate_content(gemini_prompt)
867
  ai_message = None
 
917
  print(f"🔍 DEBUG: Processing {len(urls_list)} URLs")
918
  for i, url in enumerate(urls_list):
919
  print(f"🔍 DEBUG: Processing URL {i}: {url}")
920
+
921
+ # STEP 0: Check if this is a YouTube URL - handle specially
922
+ if _is_youtube_url(url):
923
+ print(f"🎥 DEBUG: Detected YouTube URL, using caption-based verification: {url}")
924
+ try:
925
+ result = await _verify_youtube_video(url, claim_context, claim_date)
926
+ results.append(result)
927
+ print(f"🔍 DEBUG: YouTube verification result: {result}")
928
+ continue # Skip the rest of the URL processing
929
+ except Exception as e:
930
+ print(f"❌ DEBUG: YouTube verification failed: {e}")
931
+ import traceback
932
+ print(traceback.format_exc())
933
+ # Fall through to regular video processing as fallback
934
+
935
+ # STEP 1: For social media URLs, use yt-dlp to fetch the actual media first
936
+ # This determines the REAL media type, not just what the LLM guessed
937
+ url_lower = url.lower()
938
+ is_social_media = any(domain in url_lower for domain in [
939
+ 'twitter.com', 'x.com', 'instagram.com', 'tiktok.com',
940
+ 'facebook.com', 'youtube.com', 'youtu.be'
941
+ ])
942
+
943
+ extracted_media = None
944
+ if is_social_media:
945
+ print(f"🔍 DEBUG: Detected social media URL, extracting media with yt-dlp: {url}")
946
+ try:
947
+ # Use yt-dlp to extract media and determine actual type
948
+ extracted_media = await _extract_media_from_url(url)
949
+ if extracted_media:
950
+ actual_type = extracted_media.get("type") # "image" or "video"
951
+ media_path = extracted_media.get("path")
952
+ temp_dir = extracted_media.get("temp_dir")
953
+
954
+ print(f"🔍 DEBUG: yt-dlp extracted {actual_type} from URL: {media_path}")
955
+
956
+ # Route based on ACTUAL media type, not LLM's guess
957
+ if actual_type == "image":
958
+ result = await image_verifier.verify(
959
+ image_path=media_path,
960
+ claim_context=claim_context,
961
+ claim_date=claim_date
962
+ )
963
+ else: # video
964
+ result = await video_verifier.verify(
965
+ video_path=media_path,
966
+ claim_context=claim_context,
967
+ claim_date=claim_date
968
+ )
969
+
970
+ result["source"] = "url"
971
+ results.append(result)
972
+
973
+ # Add to cleanup list
974
+ if media_path:
975
+ temp_files_to_cleanup.append(media_path)
976
+ if temp_dir:
977
+ temp_files_to_cleanup.append(temp_dir)
978
+
979
+ continue # Skip the old routing logic below
980
+ else:
981
+ print(f"⚠️ DEBUG: yt-dlp extraction returned None, falling back to direct URL")
982
+ except Exception as e:
983
+ print(f"⚠️ DEBUG: Failed to extract media from URL with yt-dlp: {e}, falling back to direct URL")
984
+ import traceback
985
+ print(traceback.format_exc())
986
+ # Fall through to old logic
987
+
988
+ # STEP 2: Fallback to old routing (for direct image/video URLs or if yt-dlp fails)
989
  if verification_type == "image":
990
  print(f"🔍 DEBUG: Calling image_verifier.verify for URL")
991
  result = await image_verifier.verify(
 
1176
  print(f"🔍 DEBUG: Exception type: {type(e).__name__}")
1177
  raise HTTPException(status_code=500, detail=str(e))
1178
 
1179
+ @app.get("/mongodb/search-similar")
1180
+ async def search_similar_rumours(
1181
+ query: str,
1182
+ similarity_threshold: float = 0.6,
1183
+ limit: int = 5
1184
+ ):
1185
+ """
1186
+ Search for rumours similar to the query text
1187
+
1188
+ Args:
1189
+ query: Search query text
1190
+ similarity_threshold: Minimum similarity score (0.0 to 1.0, default: 0.6)
1191
+ limit: Maximum number of results to return (default: 5)
1192
+
1193
+ Returns:
1194
+ List of similar rumours with similarity scores
1195
+ """
1196
+ try:
1197
+ if not mongodb_service:
1198
+ raise HTTPException(
1199
+ status_code=503,
1200
+ detail="MongoDB service is not available. Check MONGO_CONNECTION_STRING environment variable."
1201
+ )
1202
+
1203
+ if not query or not query.strip():
1204
+ return {
1205
+ "success": True,
1206
+ "count": 0,
1207
+ "results": []
1208
+ }
1209
+
1210
+ # Validate threshold
1211
+ similarity_threshold = max(0.0, min(1.0, similarity_threshold))
1212
+
1213
+ results = mongodb_service.search_similar_rumours(
1214
+ query=query,
1215
+ similarity_threshold=similarity_threshold,
1216
+ limit=limit
1217
+ )
1218
+
1219
+ return {
1220
+ "success": True,
1221
+ "count": len(results),
1222
+ "query": query,
1223
+ "similarity_threshold": similarity_threshold,
1224
+ "results": results
1225
+ }
1226
+
1227
+ except Exception as e:
1228
+ logger.error(f"❌ Error searching similar rumours: {e}")
1229
+ raise HTTPException(status_code=500, detail=str(e))
1230
+
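A minimal sketch of how a client might query this endpoint, assuming the service is reachable at http://localhost:7860 (the default port) and using an example query string:

import requests

resp = requests.get(
    "http://localhost:7860/mongodb/search-similar",
    params={"query": "5G towers cause illness", "similarity_threshold": 0.6, "limit": 5},
    timeout=30,
)
resp.raise_for_status()
data = resp.json()
print(f"Found {data['count']} similar rumour(s)")
for rumour in data["results"]:
    print(rumour)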
1231
  @app.get("/health")
1232
  async def health_check():
1233
  return {"status": "healthy", "service": "visual-verification"}
 
1373
  except Exception as e:
1374
  raise HTTPException(status_code=500, detail=str(e))
1375
 
1376
+ from pydantic import BaseModel
1377
+
1378
+
1379
+ # ---------- Auth endpoints (minimal implementation) ----------
1380
+
1381
+
1382
+ class LoginRequest(BaseModel):
1383
+ email: str
1384
+ password: str
1385
+
1386
+ class SignupRequest(BaseModel):
1387
+ name: str
1388
+ email: str
1389
+ password: str
1390
+ phone_number: Optional[str] = None
1391
+ age: Optional[int] = None
1392
+ domain_preferences: Optional[List[str]] = []
1393
+
1394
+ class UserResponse(BaseModel):
1395
+ email: str
1396
+ id: Optional[str] = None
1397
+
1398
+ @app.post("/auth/signup")
1399
+ async def signup(request: SignupRequest):
1400
+ """Sign up a new user"""
1401
+ if not mongodb_service:
1402
+ raise HTTPException(status_code=503, detail="MongoDB service not available")
1403
+
1404
+ try:
1405
+ # Hash password (in production, use bcrypt or similar)
1406
+ import hashlib
1407
+ password_hash = hashlib.sha256(request.password.encode()).hexdigest()
1408
+
1409
+ user_data = {
1410
+ "name": request.name,
1411
+ "email": request.email,
1412
+ "password": password_hash,
1413
+ "phone_number": request.phone_number,
1414
+ "age": request.age,
1415
+ "domain_preferences": request.domain_preferences or [],
1416
+ "created_at": None, # Will be set by MongoDB service
1417
+ "updated_at": None,
1418
+ }
1419
+
1420
+ user = mongodb_service.create_user(user_data)
1421
+
1422
+ # Generate token (in production, use JWT)
1423
+ token = f"mock_token_{request.email}"
1424
+
1425
+ return {
1426
+ "message": "User created successfully",
1427
+ "token": token,
1428
+ "user": {
1429
+ "name": user.get("name"),
1430
+ "email": user["email"],
1431
+ "id": user["id"],
1432
+ "phone_number": user.get("phone_number"),
1433
+ "age": user.get("age"),
1434
+ "domain_preferences": user.get("domain_preferences", [])
1435
+ }
1436
+ }
1437
+ except ValueError as e:
1438
+ raise HTTPException(status_code=400, detail=str(e))
1439
+ except Exception as e:
1440
+ logger.error(f"Signup error: {e}")
1441
+ raise HTTPException(status_code=500, detail="Failed to create user")
1442
+
1443
+ @app.post("/auth/login")
1444
+ async def login(request: LoginRequest):
1445
+ """Login user"""
1446
+ if not mongodb_service:
1447
+ raise HTTPException(status_code=503, detail="MongoDB service not available")
1448
+
1449
+ try:
1450
+ user = mongodb_service.get_user_by_email(request.email)
1451
+ if not user:
1452
+ raise HTTPException(status_code=401, detail="Invalid email or password")
1453
+
1454
+ # Verify password (in production, use bcrypt or similar)
1455
+ import hashlib
1456
+ password_hash = hashlib.sha256(request.password.encode()).hexdigest()
1457
+
1458
+ if user["password"] != password_hash:
1459
+ raise HTTPException(status_code=401, detail="Invalid email or password")
1460
+
1461
+ # Generate token (in production, use JWT)
1462
+ token = f"mock_token_{request.email}"
1463
+
1464
+ return {
1465
+ "message": "Login successful",
1466
+ "token": token,
1467
+ "user": {
1468
+ "name": user.get("name"),
1469
+ "email": user["email"],
1470
+ "id": user["id"],
1471
+ "phone_number": user.get("phone_number"),
1472
+ "age": user.get("age"),
1473
+ "domain_preferences": user.get("domain_preferences", [])
1474
+ }
1475
+ }
1476
+ except HTTPException:
1477
+ raise
1478
+ except Exception as e:
1479
+ logger.error(f"Login error: {e}")
1480
+ raise HTTPException(status_code=500, detail="Failed to login")
1481
+
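The SHA-256 hashing in the signup and login handlers is only a placeholder, as the inline comments note. A minimal sketch of what the production swap to bcrypt might look like (bcrypt is an extra dependency not listed in requirements.txt here):

import bcrypt

def hash_password(password: str) -> str:
    """Hash a password with a per-user random salt."""
    return bcrypt.hashpw(password.encode("utf-8"), bcrypt.gensalt()).decode("utf-8")

def verify_password(password: str, stored_hash: str) -> bool:
    """Check a login attempt against the stored bcrypt hash."""
    return bcrypt.checkpw(password.encode("utf-8"), stored_hash.encode("utf-8"))

# Example flow: hash at signup, verify at login.
stored = hash_password("s3cret-passphrase")
assert verify_password("s3cret-passphrase", stored)
assert not verify_password("wrong-guess", stored)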
1482
+ @app.get("/auth/me")
1483
+ async def get_current_user(request: Request):
1484
+ """Get current user (requires authentication in production)"""
1485
+ if not mongodb_service:
1486
+ raise HTTPException(status_code=503, detail="MongoDB service not available")
1487
+
1488
+ # In production, verify JWT token from Authorization header
1489
+ auth_header = request.headers.get("Authorization")
1490
+ if not auth_header or not auth_header.startswith("Bearer "):
1491
+ raise HTTPException(status_code=401, detail="Not authenticated")
1492
+
1493
+ token = auth_header.replace("Bearer ", "")
1494
+
1495
+ # Extract email from token (in production, decode JWT)
1496
+ if not token.startswith("mock_token_"):
1497
+ raise HTTPException(status_code=401, detail="Invalid token")
1498
+
1499
+ email = token.replace("mock_token_", "")
1500
+
1501
+ try:
1502
+ user = mongodb_service.get_user_by_email(email)
1503
+ if not user:
1504
+ raise HTTPException(status_code=401, detail="User not found")
1505
+
1506
+ # Get subscription tier from user document (preferred) or check subscription
1507
+ subscription_tier = user.get("subscription_tier", "Free")
1508
+
1509
+ # If not in user doc, check active subscription
1510
+ if subscription_tier == "Free" and user.get("id"):
1511
+ subscription = mongodb_service.get_user_subscription(user_id=user["id"], status="active")
1512
+ if subscription:
1513
+ subscription_tier = subscription.get("plan_name", "Free")
1514
+ # Update user document with subscription tier
1515
+ mongodb_service.update_user_subscription_tier(user["id"], subscription_tier)
1516
+
1517
+ return {
1518
+ "name": user.get("name"),
1519
+ "email": user["email"],
1520
+ "id": user["id"],
1521
+ "phone_number": user.get("phone_number"),
1522
+ "age": user.get("age"),
1523
+ "domain_preferences": user.get("domain_preferences", []),
1524
+ "subscription_tier": subscription_tier
1525
+ }
1526
+ except HTTPException:
1527
+ raise
1528
+ except Exception as e:
1529
+ logger.error(f"Get user error: {e}")
1530
+ raise HTTPException(status_code=500, detail="Failed to get user")
1531
+
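Likewise, the mock_token_<email> scheme used by signup, login, and /auth/me is a stand-in. A minimal sketch of issuing and verifying a signed token with PyJWT (another extra dependency), assuming a JWT_SECRET environment variable:

import os
from datetime import datetime, timedelta, timezone

import jwt  # PyJWT

JWT_SECRET = os.getenv("JWT_SECRET", "change-me")

def issue_token(email: str) -> str:
    """Create a signed token that expires after 12 hours."""
    payload = {"sub": email, "exp": datetime.now(timezone.utc) + timedelta(hours=12)}
    return jwt.encode(payload, JWT_SECRET, algorithm="HS256")

def email_from_token(token: str) -> str:
    """Decode and validate the token, returning the email it was issued for."""
    decoded = jwt.decode(token, JWT_SECRET, algorithms=["HS256"])
    return decoded["sub"]

token = issue_token("user@example.com")
print(email_from_token(token))  # -> user@example.com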
1532
+
1533
+ # ---------- Chat history endpoints ----------
1534
+
1535
+
1536
+ class ChatSessionUpsert(BaseModel):
1537
+ session_id: Optional[str] = None
1538
+ title: Optional[str] = None
1539
+ user_id: Optional[str] = None
1540
+ anonymous_id: Optional[str] = None
1541
+ last_verdict: Optional[str] = None
1542
+ last_summary: Optional[str] = None
1543
+
1544
+
1545
+ class ChatTurn(BaseModel):
1546
+ role: str
1547
+ content: str
1548
+ created_at: Optional[Any] = None # Can be datetime, string, or None
1549
+ verdict: Optional[str] = None
1550
+ confidence: Optional[float] = None
1551
+ sources: Optional[Dict[str, Any]] = None
1552
+ attachments: Optional[List[Dict[str, Any]]] = None
1553
+ metadata: Optional[Dict[str, Any]] = None
1554
+
1555
+
1556
+ class ChatMessagesAppend(BaseModel):
1557
+ session_id: str
1558
+ user_id: Optional[str] = None
1559
+ anonymous_id: Optional[str] = None
1560
+ messages: List[ChatTurn]
1561
+
1562
+
1563
+ @app.get("/chat/sessions")
1564
+ async def list_chat_sessions(
1565
+ user_id: Optional[str] = None,
1566
+ anonymous_id: Optional[str] = None,
1567
+ ):
1568
+ """Return chat sessions for logged-in users only.
1569
+
1570
+ Anonymous users will receive an empty list since their sessions are not persisted.
1571
+ """
1572
+ try:
1573
+ if not mongodb_service:
1574
+ raise HTTPException(status_code=503, detail="MongoDB service not available")
1575
+
1576
+ # Only return sessions for logged-in users
1577
+ if not user_id:
1578
+ logger.info(f"⏭️ No user_id provided, returning empty sessions list")
1579
+ return {"sessions": []}
1580
+
1581
+ logger.info(f"🔍 Loading chat sessions: user_id={user_id}")
1582
+ sessions = mongodb_service.get_chat_sessions(
1583
+ user_id=user_id,
1584
+ anonymous_id=None, # Don't query by anonymous_id anymore
1585
+ )
1586
+ logger.info(f"✅ Found {len(sessions)} chat sessions")
1587
+ return {"sessions": sessions}
1588
+ except Exception as e:
1589
+ logger.error(f"❌ Error loading chat sessions: {e}", exc_info=True)
1590
+ raise HTTPException(status_code=500, detail=f"Failed to load chat sessions: {str(e)}")
1591
+
1592
+
1593
+ @app.post("/chat/sessions")
1594
+ async def upsert_chat_session(payload: ChatSessionUpsert):
1595
+ """Create or update a chat session.
1596
+
1597
+ Only saves sessions for logged-in users (user_id required).
1598
+ Anonymous sessions are not persisted to MongoDB but a session_id is still returned for UI purposes.
1599
+ """
1600
+ try:
1601
+ if not mongodb_service:
1602
+ raise HTTPException(status_code=503, detail="MongoDB service not available")
1603
+
1604
+ data = payload.dict(exclude_unset=True)
1605
+ user_id = data.get("user_id")
1606
+ anonymous_id = data.get("anonymous_id")
1607
+
1608
+ # Only persist sessions for logged-in users
1609
+ if not user_id:
1610
+ # Still return a session_id for UI purposes, but don't persist
1611
+ import uuid
1612
+ session_id = data.get("session_id") or str(uuid.uuid4())
1613
+ logger.info(f"⏭️ Skipping session persistence for anonymous user (session_id={session_id})")
1614
+ return {
1615
+ "session_id": session_id,
1616
+ "title": data.get("title", "New Chat"),
1617
+ "user_id": None,
1618
+ "anonymous_id": anonymous_id,
1619
+ "created_at": None,
1620
+ "updated_at": None,
1621
+ "persisted": False,
1622
+ }
1623
+
1624
+ logger.info(f"🔍 Upserting chat session: {data}")
1625
+
1626
+ # Optionally migrate anonymous history on first login
1627
+ if user_id and anonymous_id:
1628
+ try:
1629
+ migrated = mongodb_service.migrate_anonymous_sessions(
1630
+ anonymous_id=anonymous_id, user_id=user_id
1631
+ )
1632
+ logger.info(f"✅ Migrated {migrated} anonymous sessions to user {user_id}")
1633
+ except Exception as exc:
1634
+ logger.error(f"Failed to migrate anonymous sessions: {exc}")
1635
+
1636
+ session_doc = mongodb_service.upsert_chat_session(data)
1637
+ logger.info(f"✅ Created/updated session: {session_doc.get('session_id')}")
1638
+ return session_doc
1639
+ except Exception as e:
1640
+ logger.error(f"❌ Error upserting chat session: {e}", exc_info=True)
1641
+ raise HTTPException(status_code=500, detail=f"Failed to create/update chat session: {str(e)}")
1642
+
1643
+
1644
+ @app.get("/chat/messages/{session_id}")
1645
+ async def get_chat_messages(session_id: str):
1646
+ """Return all messages for a given chat session."""
1647
+ if not mongodb_service:
1648
+ raise HTTPException(status_code=503, detail="MongoDB service not available")
1649
+
1650
+ messages = mongodb_service.get_chat_messages(session_id=session_id)
1651
+ return {"session_id": session_id, "messages": messages}
1652
+
1653
+
1654
+ @app.post("/chat/messages")
1655
+ async def append_chat_messages(payload: ChatMessagesAppend):
1656
+ """Append one or more messages to a chat session.
1657
+
1658
+ Only saves messages for logged-in users (user_id required).
1659
+ Anonymous messages are not persisted to MongoDB.
1660
+ """
1661
+ if not mongodb_service:
1662
+ raise HTTPException(status_code=503, detail="MongoDB service not available")
1663
+
1664
+ data = payload.dict()
1665
+ user_id = data.get("user_id")
1666
+
1667
+ # Only persist messages for logged-in users
1668
+ if not user_id:
1669
+ logger.info(f"⏭️ Skipping message persistence for anonymous user (session_id={data['session_id']})")
1670
+ return {"inserted": 0, "message": "Messages not persisted for anonymous users"}
1671
+
1672
+ inserted = mongodb_service.append_chat_messages(
1673
+ session_id=data["session_id"],
1674
+ messages=[m for m in data["messages"]],
1675
+ user_id=user_id,
1676
+ anonymous_id=data.get("anonymous_id"),
1677
+ )
1678
+ logger.info(f"✅ Persisted {inserted} messages for user {user_id}")
1679
+ return {"inserted": inserted}
1680
+
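A minimal sketch of how a logged-in client might drive these chat-history endpoints end to end, assuming the service runs at http://localhost:7860 and that "example-user-id" stands in for a real id returned by /auth/login:

import requests

BASE = "http://localhost:7860"
user_id = "example-user-id"  # placeholder; normally returned by /auth/login

# 1. Create (or update) a session.
session = requests.post(
    f"{BASE}/chat/sessions",
    json={"title": "Flood rumour check", "user_id": user_id},
).json()
session_id = session["session_id"]

# 2. Append a user turn and the assistant's reply.
requests.post(f"{BASE}/chat/messages", json={
    "session_id": session_id,
    "user_id": user_id,
    "messages": [
        {"role": "user", "content": "Is the viral flood photo real?"},
        {"role": "assistant", "content": "The photo is from an earlier event, not the current one.", "verdict": "false"},
    ],
})

# 3. Read the history back.
history = requests.get(f"{BASE}/chat/messages/{session_id}").json()
print(len(history["messages"]), "message(s) stored")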
1681
+
1682
+ # ---------- Subscription endpoints ----------
1683
+
1684
+
1685
+ class CreatePlanRequest(BaseModel):
1686
+ name: str
1687
+ amount: int # Amount in paise (smallest currency unit)
1688
+ currency: str = "INR"
1689
+ interval: int = 1
1690
+ period: str = "monthly" # daily, weekly, monthly, yearly
1691
+ description: Optional[str] = None
1692
+
1693
+
1694
+ class CreateSubscriptionRequest(BaseModel):
1695
+ plan_id: str
1696
+ user_id: str
1697
+ customer_notify: int = 1
1698
+ total_count: Optional[int] = None
1699
+ notes: Optional[Dict[str, str]] = None
1700
+
1701
+
1702
+ class CancelSubscriptionRequest(BaseModel):
1703
+ subscription_id: str
1704
+ cancel_at_cycle_end: bool = False
1705
+
1706
+
1707
+ @app.post("/subscriptions/plans")
1708
+ async def create_subscription_plan(request: CreatePlanRequest):
1709
+ """Create a subscription plan in Razorpay (admin/one-time setup)"""
1710
+ try:
1711
+ if not razorpay_service or not razorpay_service.client:
1712
+ raise HTTPException(
1713
+ status_code=503,
1714
+ detail="Razorpay service not available. Check RAZORPAY_ID and RAZORPAY_KEY."
1715
+ )
1716
+
1717
+ plan = razorpay_service.create_plan(
1718
+ name=request.name,
1719
+ amount=request.amount,
1720
+ currency=request.currency,
1721
+ interval=request.interval,
1722
+ period=request.period,
1723
+ description=request.description
1724
+ )
1725
+
1726
+ return {
1727
+ "success": True,
1728
+ "plan": plan
1729
+ }
1730
+ except Exception as e:
1731
+ logger.error(f"❌ Failed to create subscription plan: {e}")
1732
+ raise HTTPException(status_code=500, detail=str(e))
1733
+
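A minimal sketch of a one-time admin call to this endpoint, assuming a hypothetical ₹499/month "Pro" plan; Razorpay amounts are in paise, so ₹499 is sent as 49900:

import requests

resp = requests.post(
    "http://localhost:7860/subscriptions/plans",
    json={
        "name": "Pro",
        "amount": 49900,      # paise, i.e. ₹499.00
        "currency": "INR",
        "interval": 1,
        "period": "monthly",
        "description": "Pro tier, billed monthly",
    },
    timeout=30,
)
print(resp.json())  # contains the Razorpay plan object on success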
1734
+
1735
+ @app.get("/subscriptions/plans")
1736
+ async def list_subscription_plans(count: int = 10, skip: int = 0):
1737
+ """List available subscription plans"""
1738
+ try:
1739
+ if not razorpay_service or not razorpay_service.client:
1740
+ raise HTTPException(
1741
+ status_code=503,
1742
+ detail="Razorpay service not available. Check RAZORPAY_ID and RAZORPAY_KEY."
1743
+ )
1744
+
1745
+ plans = razorpay_service.list_plans(count=count, skip=skip)
1746
+ return {
1747
+ "success": True,
1748
+ "plans": plans
1749
+ }
1750
+ except Exception as e:
1751
+ logger.error(f"❌ Failed to list subscription plans: {e}")
1752
+ raise HTTPException(status_code=500, detail=str(e))
1753
+
1754
+
1755
+ @app.get("/subscriptions/config")
1756
+ async def get_subscription_config():
1757
+ """Get Razorpay public configuration (Key ID) for frontend"""
1758
+ try:
1759
+ if not config.RAZORPAY_ID:
1760
+ raise HTTPException(
1761
+ status_code=503,
1762
+ detail="Razorpay not configured"
1763
+ )
1764
+
1765
+ return {
1766
+ "success": True,
1767
+ "razorpay_key_id": config.RAZORPAY_ID
1768
+ }
1769
+ except Exception as e:
1770
+ logger.error(f"❌ Failed to get subscription config: {e}")
1771
+ raise HTTPException(status_code=500, detail=str(e))
1772
+
1773
+
1774
+ @app.post("/subscriptions/create")
1775
+ async def create_subscription(request: CreateSubscriptionRequest):
1776
+ """Create a subscription for a user"""
1777
+ try:
1778
+ if not razorpay_service or not razorpay_service.client:
1779
+ raise HTTPException(
1780
+ status_code=503,
1781
+ detail="Razorpay service not available. Check RAZORPAY_ID and RAZORPAY_KEY."
1782
+ )
1783
+
1784
+ if not mongodb_service:
1785
+ raise HTTPException(
1786
+ status_code=503,
1787
+ detail="MongoDB service not available"
1788
+ )
1789
+
1790
+ # Create subscription in Razorpay
1791
+ subscription = razorpay_service.create_subscription(
1792
+ plan_id=request.plan_id,
1793
+ customer_notify=request.customer_notify,
1794
+ total_count=request.total_count,
1795
+ notes=request.notes
1796
+ )
1797
+
1798
+ # Get plan details
1799
+ plan = razorpay_service.get_plan(request.plan_id)
1800
+
1801
+ # Extract plan name - try multiple possible locations
1802
+ plan_name = "Pro" # Default
1803
+ if plan:
1804
+ # Try different possible locations for plan name
1805
+ plan_name_raw = (
1806
+ plan.get("item", {}).get("name") or
1807
+ plan.get("name") or
1808
+ (request.notes.get("plan_name") if request.notes else None) or
1809
+ "Pro"
1810
+ )
1811
+ # Normalize plan name
1812
+ plan_name_raw_lower = plan_name_raw.lower()
1813
+ if "pro" in plan_name_raw_lower:
1814
+ plan_name = "Pro"
1815
+ elif "enterprise" in plan_name_raw_lower:
1816
+ plan_name = "Enterprise"
1817
+ else:
1818
+ plan_name = plan_name_raw
1819
+
1820
+ # Store subscription in MongoDB
1821
+ from datetime import datetime
1822
+ subscription_data = {
1823
+ "user_id": request.user_id,
1824
+ "razorpay_subscription_id": subscription.get("id"),
1825
+ "razorpay_plan_id": request.plan_id,
1826
+ "plan_name": plan_name,
1827
+ "status": subscription.get("status", "created"),
1828
+ "amount": plan.get("item", {}).get("amount", 0) if plan else 0,
1829
+ "currency": plan.get("item", {}).get("currency", "INR") if plan else "INR",
1830
+ "current_start": subscription.get("current_start"),
1831
+ "current_end": subscription.get("current_end"),
1832
+ "next_billing_at": subscription.get("end_at"),
1833
+ "created_at": datetime.utcnow(),
1834
+ "razorpay_data": subscription # Store full Razorpay response
1835
+ }
1836
+
1837
+ mongodb_service.upsert_subscription(subscription_data)
1838
+
1839
+ # Update user's subscription tier immediately if status is active
1840
+ # Otherwise, it will be updated via webhook when payment is completed
1841
+ if subscription.get("status") == "active":
1842
+ mongodb_service.update_user_subscription_tier(request.user_id, plan_name)
1843
+ logger.info(f"✅ Updated user {request.user_id} subscription tier to {plan_name}")
1844
+ else:
1845
+ logger.info(f"⏳ Subscription created with status '{subscription.get('status')}'. User tier will be updated when subscription is activated via webhook.")
1846
+
1847
+ return {
1848
+ "success": True,
1849
+ "subscription_id": subscription.get("id"),
1850
+ "short_url": subscription.get("short_url"),
1851
+ "subscription": subscription
1852
+ }
1853
+ except Exception as e:
1854
+ logger.error(f"❌ Failed to create subscription: {e}")
1855
+ raise HTTPException(status_code=500, detail=str(e))
1856
+
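A minimal sketch of the client-side flow against these endpoints: fetch the public key from /subscriptions/config, create the subscription, then hand the returned id (or short_url) to Razorpay Checkout on the frontend. The plan id and user id below are placeholders:

import requests

BASE = "http://localhost:7860"

key_id = requests.get(f"{BASE}/subscriptions/config").json()["razorpay_key_id"]
created = requests.post(f"{BASE}/subscriptions/create", json={
    "plan_id": "plan_XXXXXXXXXXXX",   # placeholder Razorpay plan id
    "user_id": "example-user-id",
    "notes": {"plan_name": "Pro"},
}).json()

print("Open checkout with key", key_id, "and subscription", created["subscription_id"])
# Alternatively, redirect the user to created["short_url"], which Razorpay hosts.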
1857
+
1858
+ @app.get("/subscriptions/status")
1859
+ async def get_subscription_status(user_id: Optional[str] = None):
1860
+ """Get user's subscription status"""
1861
+ try:
1862
+ if not mongodb_service:
1863
+ raise HTTPException(
1864
+ status_code=503,
1865
+ detail="MongoDB service not available"
1866
+ )
1867
+
1868
+ if not user_id:
1869
+ return {
1870
+ "success": True,
1871
+ "subscription": None,
1872
+ "message": "No user_id provided"
1873
+ }
1874
+
1875
+ subscription = mongodb_service.get_user_subscription(user_id=user_id)
1876
+
1877
+ if subscription:
1878
+ # Optionally fetch latest data from Razorpay
1879
+ if razorpay_service and razorpay_service.client:
1880
+ try:
1881
+ razorpay_sub = razorpay_service.get_subscription(
1882
+ subscription.get("razorpay_subscription_id")
1883
+ )
1884
+ # Update status if changed
1885
+ if razorpay_sub.get("status") != subscription.get("status"):
1886
+ mongodb_service.update_subscription_status(
1887
+ subscription.get("razorpay_subscription_id"),
1888
+ razorpay_sub.get("status"),
1889
+ {
1890
+ "current_start": razorpay_sub.get("current_start"),
1891
+ "current_end": razorpay_sub.get("current_end"),
1892
+ "next_billing_at": razorpay_sub.get("end_at")
1893
+ }
1894
+ )
1895
+ subscription["status"] = razorpay_sub.get("status")
1896
+ except Exception as e:
1897
+ logger.warning(f"Failed to sync with Razorpay: {e}")
1898
+
1899
+ return {
1900
+ "success": True,
1901
+ "subscription": subscription
1902
+ }
1903
+ except Exception as e:
1904
+ logger.error(f"❌ Failed to get subscription status: {e}")
1905
+ raise HTTPException(status_code=500, detail=str(e))
1906
+
1907
+
1908
+ @app.post("/subscriptions/cancel")
1909
+ async def cancel_subscription(request: CancelSubscriptionRequest):
1910
+ """Cancel user's subscription"""
1911
+ try:
1912
+ if not razorpay_service or not razorpay_service.client:
1913
+ raise HTTPException(
1914
+ status_code=503,
1915
+ detail="Razorpay service not available. Check RAZORPAY_ID and RAZORPAY_KEY."
1916
+ )
1917
+
1918
+ if not mongodb_service:
1919
+ raise HTTPException(
1920
+ status_code=503,
1921
+ detail="MongoDB service not available"
1922
+ )
1923
+
1924
+ # Cancel subscription in Razorpay
1925
+ subscription = razorpay_service.cancel_subscription(
1926
+ subscription_id=request.subscription_id,
1927
+ cancel_at_cycle_end=request.cancel_at_cycle_end
1928
+ )
1929
+
1930
+ # Update status in MongoDB
1931
+ mongodb_service.update_subscription_status(
1932
+ request.subscription_id,
1933
+ subscription.get("status", "cancelled"),
1934
+ {
1935
+ "current_start": subscription.get("current_start"),
1936
+ "current_end": subscription.get("current_end"),
1937
+ "next_billing_at": subscription.get("end_at")
1938
+ }
1939
+ )
1940
+
1941
+ return {
1942
+ "success": True,
1943
+ "subscription": subscription
1944
+ }
1945
+ except Exception as e:
1946
+ logger.error(f"❌ Failed to cancel subscription: {e}")
1947
+ raise HTTPException(status_code=500, detail=str(e))
1948
+
1949
+
1950
+ @app.post("/webhooks/razorpay")
1951
+ async def razorpay_webhook(request: Request):
1952
+ """Handle Razorpay webhook events"""
1953
+ try:
1954
+ if not razorpay_service:
1955
+ raise HTTPException(
1956
+ status_code=503,
1957
+ detail="Razorpay service not available"
1958
+ )
1959
+
1960
+ if not mongodb_service:
1961
+ raise HTTPException(
1962
+ status_code=503,
1963
+ detail="MongoDB service not available"
1964
+ )
1965
+
1966
+ # Get raw body for signature verification
1967
+ body = await request.body()
1968
+ body_str = body.decode('utf-8')
1969
+
1970
+ # Get signature from header
1971
+ signature = request.headers.get("X-Razorpay-Signature", "")
1972
+
1973
+ # Verify webhook signature
1974
+ if not razorpay_service.verify_webhook_signature(body_str, signature):
1975
+ logger.warning("⚠️ Invalid webhook signature")
1976
+ raise HTTPException(status_code=400, detail="Invalid webhook signature")
1977
+
1978
+ # Parse webhook payload from body string
1979
+ webhook_data = json.loads(body_str)
1980
+ event = webhook_data.get("event")
1981
+ payload = webhook_data.get("payload", {})
1982
+
1983
+ logger.info(f"📥 Received Razorpay webhook: {event}")
1984
+
1985
+ # Handle different webhook events
1986
+ if event == "subscription.activated":
1987
+ subscription = payload.get("subscription", {}).get("entity", {})
1988
+ subscription_id = subscription.get("id")
1989
+
1990
+ if subscription_id:
1991
+ # Get subscription from DB to get user_id and plan_name
1992
+ sub_doc = mongodb_service.get_subscription_by_razorpay_id(subscription_id)
1993
+ if sub_doc:
1994
+ user_id = sub_doc.get("user_id")
1995
+ plan_name = sub_doc.get("plan_name", "Pro")
1996
+
1997
+ logger.info(f"📥 Processing subscription.activated for user {user_id}, plan {plan_name}")
1998
+
1999
+ mongodb_service.update_subscription_status(
2000
+ subscription_id,
2001
+ "active",
2002
+ {
2003
+ "current_start": subscription.get("current_start"),
2004
+ "current_end": subscription.get("current_end"),
2005
+ "next_billing_at": subscription.get("end_at")
2006
+ }
2007
+ )
2008
+
2009
+ # Update user's subscription tier
2010
+ if user_id:
2011
+ success = mongodb_service.update_user_subscription_tier(user_id, plan_name)
2012
+ if success:
2013
+ logger.info(f"✅ Successfully updated user {user_id} tier to {plan_name} via webhook")
2014
+ else:
2015
+ logger.error(f"❌ Failed to update user {user_id} tier to {plan_name}")
2016
+ else:
2017
+ logger.warning(f"⚠️ Subscription {subscription_id} not found in database")
2018
+
2019
+ elif event == "subscription.charged":
2020
+ subscription = payload.get("subscription", {}).get("entity", {})
2021
+ payment = payload.get("payment", {}).get("entity", {})
2022
+ subscription_id = subscription.get("id")
2023
+
2024
+ if subscription_id:
2025
+ # Get subscription from DB to get user_id and plan_name
2026
+ sub_doc = mongodb_service.get_subscription_by_razorpay_id(subscription_id)
2027
+ if sub_doc:
2028
+ user_id = sub_doc.get("user_id")
2029
+ plan_name = sub_doc.get("plan_name", "Pro")
2030
+
2031
+ logger.info(f"📥 Processing subscription.charged for user {user_id}, plan {plan_name}")
2032
+
2033
+ # Update subscription with payment info
2034
+ update_data = {
2035
+ "current_start": subscription.get("current_start"),
2036
+ "current_end": subscription.get("current_end"),
2037
+ "next_billing_at": subscription.get("end_at"),
2038
+ "last_payment_id": payment.get("id"),
2039
+ "last_payment_amount": payment.get("amount"),
2040
+ "last_payment_date": payment.get("created_at")
2041
+ }
2042
+ mongodb_service.update_subscription_status(
2043
+ subscription_id,
2044
+ subscription.get("status", "active"),
2045
+ update_data
2046
+ )
2047
+
2048
+ # Update user's subscription tier when payment is charged
2049
+ if user_id and subscription.get("status") == "active":
2050
+ success = mongodb_service.update_user_subscription_tier(user_id, plan_name)
2051
+ if success:
2052
+ logger.info(f"✅ Successfully updated user {user_id} tier to {plan_name} via subscription.charged webhook")
2053
+ else:
2054
+ logger.error(f"❌ Failed to update user {user_id} tier to {plan_name}")
2055
+ else:
2056
+ logger.warning(f"⚠️ Subscription {subscription_id} not found in database for subscription.charged event")
2057
+
2058
+ elif event == "subscription.cancelled":
2059
+ subscription = payload.get("subscription", {}).get("entity", {})
2060
+ subscription_id = subscription.get("id")
2061
+
2062
+ if subscription_id:
2063
+ # Get subscription from DB to get user_id
2064
+ sub_doc = mongodb_service.get_subscription_by_razorpay_id(subscription_id)
2065
+ if sub_doc:
2066
+ user_id = sub_doc.get("user_id")
2067
+
2068
+ mongodb_service.update_subscription_status(
2069
+ subscription_id,
2070
+ "cancelled",
2071
+ {
2072
+ "current_start": subscription.get("current_start"),
2073
+ "current_end": subscription.get("current_end"),
2074
+ "next_billing_at": subscription.get("end_at")
2075
+ }
2076
+ )
2077
+
2078
+ # Update user's subscription tier to Free
2079
+ if user_id:
2080
+ mongodb_service.update_user_subscription_tier(user_id, "Free")
2081
+
2082
+ elif event == "payment.failed":
2083
+ payment = payload.get("payment", {}).get("entity", {})
2084
+ subscription_id = payment.get("subscription_id")
2085
+
2086
+ if subscription_id:
2087
+ # Update subscription to reflect failed payment
2088
+ subscription = razorpay_service.get_subscription(subscription_id)
2089
+ mongodb_service.update_subscription_status(
2090
+ subscription_id,
2091
+ subscription.get("status", "pending"),
2092
+ {
2093
+ "last_payment_failed": True,
2094
+ "last_payment_failure_reason": payment.get("error_description")
2095
+ }
2096
+ )
2097
+
2098
+ return {"success": True, "message": "Webhook processed"}
2099
+
2100
+ except HTTPException:
2101
+ raise
2102
+ except Exception as e:
2103
+ logger.error(f"❌ Failed to process webhook: {e}")
2104
+ raise HTTPException(status_code=500, detail=str(e))
2105
+
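The signature check above is delegated to razorpay_service.verify_webhook_signature. Razorpay webhook signatures are typically verified as an HMAC-SHA256 of the raw request body keyed with the webhook secret, compared against the X-Razorpay-Signature header; a minimal sketch of that check, assuming the secret is exposed as a RAZORPAY_WEBHOOK_SECRET environment variable:

import hashlib
import hmac
import os

def verify_razorpay_signature(raw_body: str, signature: str) -> bool:
    """Return True if the header signature matches the HMAC-SHA256 of the body."""
    secret = os.getenv("RAZORPAY_WEBHOOK_SECRET", "")
    expected = hmac.new(secret.encode("utf-8"), raw_body.encode("utf-8"), hashlib.sha256).hexdigest()
    # Constant-time comparison avoids leaking information through timing.
    return hmac.compare_digest(expected, signature)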
2106
+
2107
  if __name__ == "__main__":
2108
  uvicorn.run(app, host="0.0.0.0", port=config.SERVICE_PORT)
requirements.txt CHANGED
@@ -1,23 +1,24 @@
1
- requests
2
- pillow
3
- opencv-python
4
- fastapi
5
- uvicorn[standard]
6
- websockets
7
- serpapi
8
- python-dotenv
9
- python-multipart
10
- yt-dlp
11
- google-generativeai
12
- google-auth
13
- google-auth-oauthlib
14
- google-auth-httplib2
15
- scikit-learn
16
- numpy
17
- pymongo
18
- upstash-redis
19
- google-search-results
20
- cloudinary
21
- torch
22
- transformers
23
- pytorchvideo
 
 
1
+ requests
2
+ pillow
3
+ opencv-python
4
+ fastapi
5
+ uvicorn[standard]
6
+ websockets
7
+ serpapi
8
+ python-dotenv
9
+ python-multipart
10
+ yt-dlp
11
+ google-generativeai
12
+ google-auth
13
+ google-auth-oauthlib
14
+ google-auth-httplib2
15
+ scikit-learn
16
+ numpy
17
+ pymongo
18
+ upstash-redis
19
+ google-search-results
20
+ cloudinary
21
+ torch
22
+ transformers
23
+ pytorchvideo
24
+ razorpay
services/deepfake_checker.py ADDED
@@ -0,0 +1,83 @@
1
+ import os
2
+ import sys
3
+ from typing import Set
4
+
5
+ try:
6
+ import torch
7
+ from transformers import pipeline
8
+ except ImportError:
9
+ print("="*80)
10
+ print("ERROR: Missing critical libraries.")
11
+ print("Please install all required dependencies first:")
12
+ print("pip install torch transformers")
13
+ print("="*80)
14
+ sys.exit(1)
15
+
16
+ # --- Configuration ---
17
+ AUDIO_FORMATS: Set[str] = {'.mp3', '.wav', '.m4a', '.flac', '.ogg'}
18
+ DEVICE = 0 if torch.cuda.is_available() else -1 # 0 for CUDA, -1 for CPU
19
+ AUDIO_MODEL_ID = "mo-thecreator/Deepfake-audio-detection"
20
+
21
+ audio_pipeline_instance = None
22
+
23
+ def get_audio_pipeline():
24
+ """Loads the audio pipeline into memory (if not already loaded)."""
25
+ global audio_pipeline_instance
26
+ if audio_pipeline_instance is None:
27
+ try:
28
+ print(f"Loading audio model '{AUDIO_MODEL_ID}' from Hugging Face Hub...")
29
+ audio_pipeline_instance = pipeline(
30
+ "audio-classification",
31
+ model=AUDIO_MODEL_ID,
32
+ device=DEVICE
33
+ )
34
+ print("Audio detection pipeline loaded successfully.")
35
+ except Exception as e:
36
+ print(f"Error loading audio pipeline: {e}")
37
+ print("Please ensure the model ID is correct.")
38
+ sys.exit(1)
39
+ return audio_pipeline_instance
40
+
41
+ def detect_audio_deepfake(file_path: str) -> bool:
42
+ """
43
+ Runs a pretrained audio deepfake detection model from the HF Hub.
44
+ """
45
+ print(f"Analyzing audio file: {os.path.basename(file_path)}")
46
+ try:
47
+ detector = get_audio_pipeline()
48
+ except Exception as e:
49
+ print(f"Failed to load audio pipeline: {e}")
50
+ return False # Fail safe
51
+ try:
52
+ results = detector(file_path)
53
+ best_result = max(results, key=lambda x: x['score'])
54
+ top_label = best_result['label'].lower()
55
+ top_score = best_result['score']
56
+ print(f"...Audio pipeline result: '{top_label}' with score {top_score:.4f}")
57
+ is_fake = top_label in ['spoof', 'fake']
58
+ return is_fake
59
+ except Exception as e:
60
+ print(f"Error during audio processing/inference: {e}")
61
+ return False
62
+
63
+ def is_audio_deepfake(file_path: str) -> bool:
64
+ """
65
+ Checks if a given audio file is a deepfake.
66
+ Args:
67
+ file_path: The absolute or relative path to the audio file.
68
+ Returns:
69
+ True if the file is classified as a deepfake, False otherwise.
70
+ Raises:
71
+ FileNotFoundError: If the file does not exist.
72
+ ValueError: If the file format is not supported.
73
+ """
74
+ if not os.path.exists(file_path):
75
+ raise FileNotFoundError(f"File not found at path: {file_path}")
76
+ ext = os.path.splitext(file_path)[1].lower()
77
+ if ext in AUDIO_FORMATS:
78
+ return detect_audio_deepfake(file_path)
79
+ else:
80
+ raise ValueError(
81
+ f"Unsupported file format: {ext}. Supported types: {AUDIO_FORMATS}"
82
+ )
83
+
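A minimal usage sketch for this module, assuming a local sample.wav exists and that the model weights can be downloaded from the Hugging Face Hub on first use:

from services.deepfake_checker import is_audio_deepfake

try:
    if is_audio_deepfake("sample.wav"):
        print("Audio flagged as a likely deepfake")
    else:
        print("Audio looks genuine")
except (FileNotFoundError, ValueError) as exc:
    print(f"Could not analyse file: {exc}")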
services/educational_content_generator.py ADDED
@@ -0,0 +1,533 @@
1
+ import json
2
+ import os
3
+ from typing import Dict, List, Optional, Any
4
+ import google.generativeai as genai
5
+ from upstash_redis import Redis
6
+ from config import config
7
+
8
+ class EducationalContentGenerator:
9
+ """Service for generating educational content about misinformation detection"""
10
+
11
+ def __init__(self):
12
+ # Configure Gemini
13
+ genai.configure(api_key=config.GEMINI_API_KEY)
14
+ self.model = genai.GenerativeModel(config.GEMINI_MODEL)
15
+
16
+ # Initialize Upstash Redis connection
17
+ try:
18
+ if config.UPSTASH_REDIS_URL and config.UPSTASH_REDIS_TOKEN:
19
+ self.redis_client = Redis(
20
+ url=config.UPSTASH_REDIS_URL,
21
+ token=config.UPSTASH_REDIS_TOKEN
22
+ )
23
+ # Test connection
24
+ self.redis_client.set("test", "connection")
25
+ self.redis_client.delete("test")
26
+ print("✅ Upstash Redis connection established")
27
+ else:
28
+ print("⚠️ Upstash Redis credentials not found, running without cache")
29
+ self.redis_client = None
30
+ except Exception as e:
31
+ print(f"❌ Upstash Redis connection failed: {e}")
32
+ self.redis_client = None
33
+
34
+ # Cache TTL (Time To Live) in seconds
35
+ self.cache_ttl = config.REDIS_TTL
36
+
37
+ # Pre-defined content templates
38
+ self.content_templates = {
39
+ "red_flags": {
40
+ "title": "How to Spot Red Flags in Misinformation",
41
+ "categories": [
42
+ "Emotional Language",
43
+ "Suspicious URLs",
44
+ "Poor Grammar",
45
+ "Missing Sources",
46
+ "Outdated Information",
47
+ "Confirmation Bias Triggers"
48
+ ]
49
+ },
50
+ "source_credibility": {
51
+ "title": "Evaluating Source Credibility",
52
+ "categories": [
53
+ "Authority Assessment",
54
+ "Bias Detection",
55
+ "Fact-checking Methodology",
56
+ "Peer Review Process",
57
+ "Transparency Standards"
58
+ ]
59
+ },
60
+ "manipulation_techniques": {
61
+ "title": "Common Manipulation Techniques",
62
+ "categories": [
63
+ "Deepfakes and AI-generated Content",
64
+ "Outdated Images",
65
+ "Misleading Headlines",
66
+ "False Context",
67
+ "Social Media Manipulation",
68
+ "Bot Networks"
69
+ ]
70
+ }
71
+ }
72
+
73
+ def _get_cache_key(self, key: str) -> str:
74
+ """Get the Redis cache key"""
75
+ return f"educational:{key}"
76
+
77
+ def _load_from_cache(self, cache_key: str) -> Optional[Dict[str, Any]]:
78
+ """Load content from Redis cache if it exists"""
79
+ if not self.redis_client:
80
+ return None
81
+
82
+ try:
83
+ cached_data = self.redis_client.get(self._get_cache_key(cache_key))
84
+ if cached_data:
85
+ return json.loads(cached_data)
86
+ except Exception as e:
87
+ print(f"Failed to load from Redis cache {cache_key}: {e}")
88
+ return None
89
+
90
+ def _save_to_cache(self, cache_key: str, content: Dict[str, Any]) -> None:
91
+ """Save content to Redis cache"""
92
+ if not self.redis_client:
93
+ return
94
+
95
+ try:
96
+ self.redis_client.setex(
97
+ self._get_cache_key(cache_key),
98
+ self.cache_ttl,
99
+ json.dumps(content, ensure_ascii=False)
100
+ )
101
+ print(f"✅ Cached {cache_key} in Redis")
102
+ except Exception as e:
103
+ print(f"Failed to save to Redis cache {cache_key}: {e}")
104
+
105
+ async def get_modules_list(self) -> Dict[str, Any]:
106
+ """Get the list of available modules (cached in Redis)"""
107
+ cache_key = "modules_list"
108
+ cached = self._load_from_cache(cache_key)
109
+
110
+ if cached:
111
+ print(f"📦 Loading modules list from Redis cache")
112
+ return cached
113
+
114
+ print(f"🔄 Generating new modules list")
115
+ # Generate modules list
116
+ modules_data = {
117
+ "modules": [
118
+ {
119
+ "id": "red_flags",
120
+ "title": "How to Spot Red Flags",
121
+ "description": "Learn to identify warning signs in misinformation",
122
+ "difficulty_levels": ["beginner", "intermediate", "advanced"],
123
+ "estimated_time": "10-15 minutes"
124
+ },
125
+ {
126
+ "id": "source_credibility",
127
+ "title": "Evaluating Source Credibility",
128
+ "description": "Understand how to assess source reliability",
129
+ "difficulty_levels": ["beginner", "intermediate", "advanced"],
130
+ "estimated_time": "15-20 minutes"
131
+ },
132
+ {
133
+ "id": "manipulation_techniques",
134
+ "title": "Common Manipulation Techniques",
135
+ "description": "Learn about various misinformation techniques",
136
+ "difficulty_levels": ["intermediate", "advanced"],
137
+ "estimated_time": "20-25 minutes"
138
+ }
139
+ ]
140
+ }
141
+
142
+ # Save to Redis cache
143
+ self._save_to_cache(cache_key, modules_data)
144
+ return modules_data
145
+
146
+ async def generate_module_content(self, module_type: str, difficulty_level: str = "beginner") -> Dict[str, Any]:
147
+ """
148
+ Generate educational content for a specific module (with Redis caching)
149
+
150
+ Args:
151
+ module_type: Type of module (red_flags, source_credibility, etc.)
152
+ difficulty_level: beginner, intermediate, advanced
153
+
154
+ Returns:
155
+ Dictionary containing educational content
156
+ """
157
+ # Check Redis cache first
158
+ cache_key = f"{module_type}_{difficulty_level}"
159
+ cached_content = self._load_from_cache(cache_key)
160
+
161
+ if cached_content:
162
+ print(f"📦 Loading {module_type} ({difficulty_level}) from Redis cache")
163
+ return cached_content
164
+
165
+ print(f"🔄 Generating new content for {module_type} ({difficulty_level})")
166
+
167
+ try:
168
+ template = self.content_templates.get(module_type, {})
169
+ if not template:
170
+ return {"error": f"Unknown module type: {module_type}"}
171
+
172
+ # Generate content using AI
173
+ content = await self._generate_ai_content(module_type, difficulty_level, template)
174
+
175
+ # Add interactive elements
176
+ content["interactive_elements"] = await self._generate_interactive_elements(module_type, difficulty_level)
177
+
178
+ # Add real-world examples
179
+ content["examples"] = await self._generate_examples(module_type, difficulty_level)
180
+
181
+ # Save to Redis cache
182
+ self._save_to_cache(cache_key, content)
183
+
184
+ return content
185
+
186
+ except Exception as e:
187
+ print(f"Failed to generate content: {str(e)}")
188
+ # Return fallback content
189
+ fallback = self._get_fallback_content(module_type, difficulty_level)
190
+ self._save_to_cache(cache_key, fallback)
191
+ return fallback
192
+
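A minimal sketch of calling the generator directly, assuming GEMINI_API_KEY (and optionally the Upstash Redis credentials) are configured in the environment so the constructor can initialise Gemini and the cache:

import asyncio

from services.educational_content_generator import EducationalContentGenerator

async def main() -> None:
    generator = EducationalContentGenerator()

    modules = await generator.get_modules_list()
    print([m["id"] for m in modules["modules"]])

    # Second call with the same module/level should come back from the Redis cache.
    content = await generator.generate_module_content("red_flags", "beginner")
    print(content.get("title"))

asyncio.run(main())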
193
+ async def _generate_ai_content(self, module_type: str, difficulty_level: str, template: Dict) -> Dict[str, Any]:
194
+ """Generate AI-powered educational content"""
195
+
196
+ prompt = f"""
197
+ You are an expert digital literacy educator specializing in misinformation detection.
198
+ Create comprehensive educational content for the following module:
199
+
200
+ MODULE TYPE: {module_type}
201
+ DIFFICULTY LEVEL: {difficulty_level}
202
+ TEMPLATE: {json.dumps(template, indent=2)}
203
+
204
+ Create educational content that includes:
205
+ 1. Clear explanations of concepts
206
+ 2. Step-by-step instructions
207
+ 3. Visual indicators to look for
208
+ 4. Common mistakes to avoid
209
+ 5. Practical exercises
210
+
211
+ Respond in this JSON format:
212
+ {{
213
+ "title": "Module title",
214
+ "overview": "Brief overview of what users will learn",
215
+ "learning_objectives": ["Objective 1", "Objective 2", "Objective 3"],
216
+ "content_sections": [
217
+ {{
218
+ "title": "Section title",
219
+ "content": "Detailed explanation",
220
+ "key_points": ["Point 1", "Point 2"],
221
+ "visual_indicators": ["Indicator 1", "Indicator 2"],
222
+ "examples": ["Example 1", "Example 2"]
223
+ }}
224
+ ],
225
+ "practical_tips": ["Tip 1", "Tip 2", "Tip 3"],
226
+ "common_mistakes": ["Mistake 1", "Mistake 2"],
227
+ "difficulty_level": "{difficulty_level}"
228
+ }}
229
+ """
230
+
231
+ try:
232
+ response = self.model.generate_content(prompt)
233
+ response_text = response.text.strip()
234
+
235
+ # Clean up JSON response
236
+ if response_text.startswith('```json'):
237
+ response_text = response_text.replace('```json', '').replace('```', '').strip()
238
+ elif response_text.startswith('```'):
239
+ response_text = response_text.replace('```', '').strip()
240
+
241
+ return json.loads(response_text)
242
+
243
+ except Exception as e:
244
+ print(f"AI content generation failed: {e}")
245
+ return self._get_fallback_content(module_type, difficulty_level)
246
+
247
+ async def _generate_interactive_elements(self, module_type: str, difficulty_level: str) -> Dict[str, Any]:
248
+ """Generate interactive learning elements"""
249
+
250
+ prompt = f"""
251
+ Create interactive learning elements for a {difficulty_level} level module about {module_type}.
252
+
253
+ Generate:
254
+ 1. Quiz questions with multiple choice answers
255
+ 2. True/false statements
256
+ 3. Scenario-based questions
257
+
258
+ Respond in JSON format:
259
+ {{
260
+ "quiz_questions": [
261
+ {{
262
+ "question": "Question text",
263
+ "options": ["Option A", "Option B", "Option C", "Option D"],
264
+ "correct_answer": 0,
265
+ "explanation": "Why this answer is correct"
266
+ }}
267
+ ],
268
+ "true_false": [
269
+ {{
270
+ "statement": "Statement to evaluate",
271
+ "answer": true,
272
+ "explanation": "Explanation"
273
+ }}
274
+ ],
275
+ "scenarios": [
276
+ {{
277
+ "scenario": "Real-world scenario description",
278
+ "question": "What should you do?",
279
+ "correct_action": "Correct action",
280
+ "explanation": "Why this is the right approach"
281
+ }}
282
+ ]
283
+ }}
284
+ """
285
+
286
+ try:
287
+ response = self.model.generate_content(prompt)
288
+ response_text = response.text.strip()
289
+
290
+ if response_text.startswith('```json'):
291
+ response_text = response_text.replace('```json', '').replace('```', '').strip()
292
+ elif response_text.startswith('```'):
293
+ response_text = response_text.replace('```', '').strip()
294
+
295
+ return json.loads(response_text)
296
+
297
+ except Exception as e:
298
+ print(f"Interactive elements generation failed: {e}")
299
+ return {"quiz_questions": [], "true_false": [], "scenarios": []}
300
+
301
+ async def _generate_examples(self, module_type: str, difficulty_level: str) -> List[Dict[str, Any]]:
302
+ """Generate real-world examples"""
303
+
304
+ prompt = f"""
305
+ Create realistic examples of {module_type} for {difficulty_level} learners.
306
+
307
+ For each example, provide:
308
+ 1. A realistic scenario
309
+ 2. What to look for
310
+ 3. How to verify
311
+ 4. Why it's misleading
312
+
313
+ Respond in JSON format:
314
+ {{
315
+ "examples": [
316
+ {{
317
+ "title": "Example title",
318
+ "scenario": "Realistic scenario description",
319
+ "red_flags": ["Flag 1", "Flag 2"],
320
+ "verification_steps": ["Step 1", "Step 2"],
321
+ "explanation": "Why this is misleading",
322
+ "difficulty": "{difficulty_level}"
323
+ }}
324
+ ]
325
+ }}
326
+ """
327
+
328
+ try:
329
+ response = self.model.generate_content(prompt)
330
+ response_text = response.text.strip()
331
+
332
+ if response_text.startswith('```json'):
333
+ response_text = response_text.replace('```json', '').replace('```', '').strip()
334
+ elif response_text.startswith('```'):
335
+ response_text = response_text.replace('```', '').strip()
336
+
337
+ result = json.loads(response_text)
338
+ return result.get("examples", [])
339
+
340
+ except Exception as e:
341
+ print(f"Examples generation failed: {e}")
342
+ return []
343
+
344
+ def _get_fallback_content(self, module_type: str, difficulty_level: str) -> Dict[str, Any]:
345
+ """Fallback content when AI generation fails"""
346
+
347
+ fallback_content = {
348
+ "red_flags": {
349
+ "title": "How to Spot Red Flags in Misinformation",
350
+ "overview": "Learn to identify warning signs that content might be misleading",
351
+ "learning_objectives": [
352
+ "Identify emotional manipulation techniques",
353
+ "Recognize suspicious URLs and sources",
354
+ "Spot grammatical and formatting errors",
355
+ "Understand confirmation bias triggers"
356
+ ],
357
+ "content_sections": [
358
+ {
359
+ "title": "Emotional Language",
360
+ "content": "Misinformation often uses strong emotional language to bypass critical thinking.",
361
+ "key_points": [
362
+ "Look for excessive use of emotional words",
363
+ "Be wary of content that makes you feel angry or scared",
364
+ "Check if emotions are being used to distract from facts"
365
+ ],
366
+ "visual_indicators": ["ALL CAPS", "Multiple exclamation marks", "Emotional imagery"],
367
+ "examples": ["URGENT!!!", "You won't believe this!", "This will shock you!"]
368
+ },
369
+ {
370
+ "title": "Suspicious URLs",
371
+ "content": "Fake news often uses URLs that mimic legitimate news sources.",
372
+ "key_points": [
373
+ "Check for slight misspellings in domain names",
374
+ "Look for unusual domain extensions",
375
+ "Verify the actual website matches the URL"
376
+ ],
377
+ "visual_indicators": ["typos in URLs", "unusual extensions", "redirects"],
378
+ "examples": ["cnn-news.com", "bbc-news.net", "reuters.info"]
379
+ }
380
+ ],
381
+ "practical_tips": [
382
+ "Take a deep breath before sharing emotional content",
383
+ "Ask yourself: 'Why do I feel this way?'",
384
+ "Look for factual evidence, not just emotional appeals"
385
+ ],
386
+ "common_mistakes": [
387
+ "Sharing content because it makes you angry",
388
+ "Ignoring red flags when content confirms your beliefs",
389
+ "Not checking sources when content feels 'right'"
390
+ ],
391
+ "difficulty_level": difficulty_level
392
+ },
393
+ "source_credibility": {
394
+ "title": "Evaluating Source Credibility",
395
+ "overview": "Learn how to assess whether a source is trustworthy and reliable",
396
+ "learning_objectives": [
397
+ "Understand what makes a source credible",
398
+ "Identify bias in news sources",
399
+ "Evaluate author expertise",
400
+ "Check source transparency"
401
+ ],
402
+ "content_sections": [
403
+ {
404
+ "title": "Authority Assessment",
405
+ "content": "Credible sources have recognized expertise in their field.",
406
+ "key_points": [
407
+ "Check the author's credentials and background",
408
+ "Look for institutional affiliations",
409
+ "Verify expertise matches the topic"
410
+ ],
411
+ "visual_indicators": ["Author bio", "Credentials listed", "Institutional affiliation"],
412
+ "examples": ["PhD in relevant field", "Journalist with experience", "Academic institution"]
413
+ }
414
+ ],
415
+ "practical_tips": [
416
+ "Always check the 'About' page",
417
+ "Look for contact information",
418
+ "Verify claims with multiple sources"
419
+ ],
420
+ "common_mistakes": [
421
+ "Trusting sources without checking credentials",
422
+ "Ignoring bias in sources",
423
+ "Not verifying institutional affiliations"
424
+ ],
425
+ "difficulty_level": difficulty_level
426
+ },
427
+ "manipulation_techniques": {
428
+ "title": "Common Manipulation Techniques",
429
+ "overview": "Understand the various methods used to create and spread misinformation",
430
+ "learning_objectives": [
431
+ "Recognize different manipulation techniques",
432
+ "Understand how AI-generated content works",
433
+ "Identify social media manipulation",
434
+ "Learn verification strategies"
435
+ ],
436
+ "content_sections": [
437
+ {
438
+ "title": "Deepfakes and AI-generated Content",
439
+ "content": "Advanced technology can create convincing fake videos and images.",
440
+ "key_points": [
441
+ "Look for unnatural facial movements",
442
+ "Check for inconsistencies in lighting",
443
+ "Verify with original sources"
444
+ ],
445
+ "visual_indicators": ["Unnatural blinking", "Lighting inconsistencies", "Audio sync issues"],
446
+ "examples": ["AI-generated celebrity videos", "Deepfake political speeches"]
447
+ }
448
+ ],
449
+ "practical_tips": [
450
+ "Use reverse image search",
451
+ "Check multiple angles of the same event",
452
+ "Verify with official sources"
453
+ ],
454
+ "common_mistakes": [
455
+ "Trusting videos without verification",
456
+ "Not checking for AI generation",
457
+ "Sharing before verification"
458
+ ],
459
+ "difficulty_level": difficulty_level
460
+ }
461
+ }
462
+
463
+ return fallback_content.get(module_type, {
464
+ "title": f"Educational Module: {module_type}",
465
+ "overview": "Learn about misinformation detection",
466
+ "learning_objectives": ["Understand basic concepts"],
467
+ "content_sections": [],
468
+ "practical_tips": [],
469
+ "common_mistakes": [],
470
+ "difficulty_level": difficulty_level
471
+ })
472
+
473
+ async def generate_contextual_learning(self, verification_result: Dict[str, Any]) -> Dict[str, Any]:
474
+ """
475
+ Generate educational content based on a specific verification result
476
+
477
+ Args:
478
+ verification_result: Result from fact-checking
479
+
480
+ Returns:
481
+ Educational content tailored to the verification result
482
+ """
483
+ try:
484
+ # Extract relevant information from verification result
485
+ verdict = verification_result.get("verdict", "uncertain")
486
+ message = verification_result.get("message", "")
487
+ details = verification_result.get("details", {})
488
+
489
+ # Generate contextual learning content
490
+ prompt = f"""
491
+ Based on this fact-checking result, create educational content to help users learn:
492
+
493
+ VERDICT: {verdict}
494
+ MESSAGE: {message}
495
+ DETAILS: {json.dumps(details, indent=2)}
496
+
497
+ Create learning content that explains:
498
+ 1. What this result means
499
+ 2. What red flags were found (if any)
500
+ 3. How to verify similar claims in the future
501
+ 4. Key lessons learned
502
+
503
+ Respond in JSON format:
504
+ {{
505
+ "learning_summary": "What users learned from this verification",
506
+ "red_flags_found": ["List of red flags detected"],
507
+ "verification_techniques": ["Techniques used to verify"],
508
+ "future_tips": ["Tips for similar situations"],
509
+ "key_lessons": ["Main takeaways"],
510
+ "related_topics": ["Related educational topics to explore"]
511
+ }}
512
+ """
513
+
514
+ response = self.model.generate_content(prompt)
515
+ response_text = response.text.strip()
516
+
517
+ if response_text.startswith('```json'):
518
+ response_text = response_text.replace('```json', '').replace('```', '').strip()
519
+ elif response_text.startswith('```'):
520
+ response_text = response_text.replace('```', '').strip()
521
+
522
+ return json.loads(response_text)
523
+
524
+ except Exception as e:
525
+ print(f"Contextual learning generation failed: {e}")
526
+ return {
527
+ "learning_summary": "Learn to verify information systematically",
528
+ "red_flags_found": [],
529
+ "verification_techniques": ["Source checking", "Cross-referencing"],
530
+ "future_tips": ["Always verify before sharing"],
531
+ "key_lessons": ["Critical thinking is essential"],
532
+ "related_topics": ["Source credibility", "Fact-checking basics"]
533
+ }
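+ # Example (illustrative sketch): how a caller might request contextual learning
+ # content after a fact-check. The class name `EducationService`, the variable names,
+ # and the sample verification result below are placeholders, not part of this module's API.
+ #
+ #   import asyncio
+ #   service = EducationService()  # assumed class name for this service
+ #   sample = {"verdict": "false", "message": "Image is from 2019, not 2024", "details": {}}
+ #   learning = asyncio.run(service.generate_contextual_learning(sample))
+ #   print(learning["learning_summary"], learning["key_lessons"])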
services/image_verifier.py ADDED
@@ -0,0 +1,1377 @@
1
+ import os
2
+ import tempfile
3
+ from typing import Dict, Any, Optional, Tuple, List
4
+ import requests
5
+ from PIL import Image, ImageDraw, ImageFont
6
+ import io
7
+ import base64
8
+ import json
9
+ import google.generativeai as genai
10
+ # Import SerpApi client - use the correct import path from documentation
11
+ GoogleSearch = None # type: ignore
12
+ try:
13
+ from serpapi import GoogleSearch as _GS # correct import per SerpApi docs
14
+ GoogleSearch = _GS
15
+ print("[serpapi] Successfully imported GoogleSearch from serpapi")
16
+ except Exception as e:
17
+ print(f"[serpapi] Failed to import GoogleSearch: {e}")
18
+ GoogleSearch = None # client unavailable; will fall back to HTTP
19
+ from config import config
20
+
21
+
22
+ class ImageVerifier:
23
+ def __init__(self, api_key: Optional[str] = None):
24
+ """
25
+ Initialize the ImageVerifier with SerpApi credentials
26
+
27
+ Args:
28
+ api_key: SerpApi API key. If None, will try to get from environment
29
+ """
30
+ self.api_key = api_key or config.SERP_API_KEY
31
+ if not self.api_key:
32
+ raise ValueError("SERP_API_KEY environment variable or api_key parameter is required")
33
+
34
+ # Configure Gemini
35
+ if config.GEMINI_API_KEY:
36
+ genai.configure(api_key=config.GEMINI_API_KEY)
37
+ self.gemini_model = genai.GenerativeModel(
38
+ config.GEMINI_MODEL,
39
+ generation_config=genai.types.GenerationConfig(
40
+ temperature=config.GEMINI_TEMPERATURE,
41
+ top_p=config.GEMINI_TOP_P,
42
+ max_output_tokens=config.GEMINI_MAX_TOKENS
43
+ )
44
+ )
45
+ else:
46
+ self.gemini_model = None
47
+
48
+ # SerpApi endpoints
49
+ self.base_url_json = "https://serpapi.com/search.json" # for GET with image_url
50
+ self.base_url_form = "https://serpapi.com/search.json" # for POST form with image_content
51
+
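+ # Example (sketch): instantiating the verifier. SERP_API_KEY must be available via
+ # config or passed explicitly; GEMINI_API_KEY is optional but enables the Gemini
+ # Vision analysis path (otherwise self.gemini_model stays None).
+ #
+ #   verifier = ImageVerifier()                          # uses config.SERP_API_KEY
+ #   verifier = ImageVerifier(api_key="<serpapi-key>")   # placeholder key for illustration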
52
+ async def verify(self, image_path: Optional[str] = None, claim_context: str = "", claim_date: str = "", image_url: Optional[str] = None) -> Dict[str, Any]:
53
+ """
54
+ Verify an image using a two-stage approach:
55
+ 1. Gemini Vision analyzes the image directly for signs of AI generation, deepfakes, or manipulation
56
+ 2. Reverse image search + evidence analysis
57
+
58
+ Args:
59
+ image_path: Path to the image file
60
+ claim_context: The claimed context of the image
61
+ claim_date: The claimed date of the image
62
+ image_url: URL of the image
63
+
64
+ Returns:
65
+ Dictionary with verification results and output file path
66
+ """
67
+ try:
68
+ print("[verify] start", {"claim_context": claim_context, "claim_date": claim_date, "has_image_path": bool(image_path), "has_image_url": bool(image_url)})
69
+
70
+ # STEP 0: Gemini Vision analysis of the actual image
71
+ preliminary_analysis = await self._analyze_image_with_vision(
72
+ image_path=image_path,
73
+ image_url=image_url,
74
+ claim_context=claim_context,
75
+ claim_date=claim_date
76
+ )
77
+ print(f"✅ Gemini Vision analysis result: {preliminary_analysis.get('verdict', 'unknown')}")
78
+
79
+ # STEP 1: Perform reverse image search (wrap in try/except so vision analysis can still proceed)
80
+ search_results = None
81
+ try:
82
+ search_results = await self._reverse_image_search(image_path=image_path, image_url=image_url)
83
+ except Exception as search_error:
84
+ print(f"⚠️ Reverse image search failed (will use vision analysis only): {search_error}")
85
+ # Continue with vision analysis only - this is fine, we have a fallback
86
+
87
+ # STEP 2: Build evidence from SerpApi (reverse image search)
88
+ evidence = []
89
+ curated_analysis = None
90
+ if search_results and (search_results.get("inline_images") or search_results.get("image_results")):
91
+ evidence = self._collect_evidence(search_results)
92
+ print("[verify] serpapi_counts", {
93
+ "image_results": len(search_results.get("image_results", [])) if isinstance(search_results, dict) else None,
94
+ "inline_images": len(search_results.get("inline_images", [])) if isinstance(search_results, dict) else None,
95
+ "status": (search_results.get("search_metadata", {}) or {}).get("status") if isinstance(search_results, dict) else None,
96
+ })
97
+ print("[verify] evidence_collected", {"count": len(evidence), "sample_titles": [e.get("title") for e in evidence[:3]]})
98
+
99
+ # Ask Gemini to produce structured verdict + structured claim parse with citations
100
+ filtered_evidence = self._rank_and_filter_evidence(evidence, claim_context, top_k=12)
101
+ print("[verify] preparing_llm_request", {"evidence_count": len(filtered_evidence)})
102
+ curated_analysis = self._summarize_with_gemini_structured(
103
+ claim_context=claim_context,
104
+ claim_date=claim_date,
105
+ evidence=filtered_evidence,
106
+ )
107
+ else:
108
+ print("[verify] No reverse image search results, using vision analysis only")
109
+ filtered_evidence = []
110
+
111
+ # STEP 3: Synthesize vision analysis + reverse image search results
112
+ final_response = self._synthesize_vision_and_evidence(
113
+ preliminary_analysis=preliminary_analysis,
114
+ curated_analysis=curated_analysis,
115
+ evidence=filtered_evidence,
116
+ claim_context=claim_context,
117
+ claim_date=claim_date,
118
+ )
119
+
120
+ if final_response:
121
+ return final_response
122
+
123
+ # Fallback: use vision analysis if available, else curated analysis
124
+ if preliminary_analysis and preliminary_analysis.get("verdict") in ["false", "true"]:
125
+ llm = preliminary_analysis
126
+ elif curated_analysis:
127
+ llm = curated_analysis
128
+ else:
129
+ llm = None
130
+ validator = {"passed": False, "reasons": [], "checks": {}}
131
+ debug_details = {}
132
+ if llm:
133
+ print("[verify] llm_keys", list(llm.keys()))
134
+ base_verdict = (llm.get("verdict") or "uncertain").lower()
135
+ relation_verdict = (llm.get("relation_verdict") or base_verdict).lower()
136
+ # Enforce policy: default to false when the claimed relation isn't supported by evidence.
137
+ cp = (llm.get("claim_parse") or {})
138
+ citations = (cp.get("citations") or {})
139
+ relation_citations = citations.get("relation") or []
140
+ has_any_evidence = bool(filtered_evidence)
141
+ relation_supported = bool(relation_citations)
142
+
143
+ if relation_verdict == "false":
144
+ verdict = "false"
145
+ elif has_any_evidence and not relation_supported:
146
+ # We have evidence but none supports the claimed relation → false
147
+ verdict = "false"
148
+ else:
149
+ verdict = base_verdict
150
+ summary = llm.get("summary") or ""
151
+ # Enforce reputable domain gating + cross-source agreement
152
+ sources = llm.get("top_sources") or self._top_sources(filtered_evidence, 3)
153
+ from urllib.parse import urlparse
154
+ def is_reputable(url: Optional[str]) -> bool:
155
+ try:
156
+ net = urlparse(url or "").netloc
157
+ except Exception:
158
+ net = ""
159
+ # Reputable = not low-priority social/UGC domain
160
+ return bool(net and (net not in config.LOW_PRIORITY_DOMAINS))
161
+ reputable_sources = [s for s in (sources or []) if is_reputable(s.get("link"))]
162
+ # Relation support must come from reputable domains and have >=2 independent domains
163
+ cp = (llm.get("claim_parse") or {})
164
+ rel_cits = (cp.get("citations") or {}).get("relation") or []
165
+ cited_domains = set()
166
+ for j in rel_cits:
167
+ try:
168
+ ev = filtered_evidence[int(j)]
169
+ net = urlparse(ev.get("link") or "").netloc
170
+ if net and (net not in config.LOW_PRIORITY_DOMAINS):
171
+ cited_domains.add(net)
172
+ except Exception:
173
+ pass
174
+ cross_source_ok = len(cited_domains) >= 2
175
+ # Run the validator before using its checks: require citations for all extracted parts and relation co-mention
+ validator, debug_details = self._validate_llm_parse(
+ claim_text=claim_context,
+ evidence=filtered_evidence,
+ llm=llm,
+ )
+ # Stronger relation test: require co-mention validated above (checks[relation_comention])
+ relation_comention_ok = False
+ try:
+ relation_comention_ok = bool(validator["checks"].get("relation_comention"))
+ except Exception:
+ relation_comention_ok = False
181
+ if verdict == "true":
182
+ if not (cross_source_ok and relation_comention_ok):
183
+ verdict = "uncertain"
184
+ # If verdict is still not false, ensure at least two reputable sources overall
185
+ if verdict == "true" and len({urlparse((s.get("link") or "")).netloc for s in reputable_sources}) < 2:
186
+ verdict = "uncertain"
187
+ # Validator was already run above, before the relation co-mention check
193
+ # Only downgrade true to uncertain if validator fails; never upgrade false
194
+ if verdict == "true" and not validator.get("passed", False):
195
+ verdict = "uncertain"
196
+ if verdict == "true":
197
+ from urllib.parse import urlparse
198
+ cited_idx = set()
199
+ cp = (llm.get("claim_parse") or {}).get("citations") or {}
200
+ for key, val in cp.items():
201
+ if isinstance(val, list):
202
+ if key in ["entities","roles"]:
203
+ for arr in val:
204
+ for j in (arr or []):
205
+ try:
206
+ cited_idx.add(int(j))
207
+ except Exception:
208
+ pass
209
+ else:
210
+ for j in val:
211
+ try:
212
+ cited_idx.add(int(j))
213
+ except Exception:
214
+ pass
215
+ domains = set()
216
+ for ix in cited_idx:
217
+ if 0 <= ix < len(filtered_evidence):
218
+ lk = filtered_evidence[ix].get("link") or ""
219
+ try:
220
+ net = urlparse(lk).netloc
221
+ except Exception:
222
+ net = ""
223
+ if net:
224
+ domains.add(net)
225
+ print("[verify] domain_independence", {"cited_count": len(cited_idx), "domains": list(domains)})
226
+ if len(domains) < 2:
227
+ verdict = "uncertain"
228
+ validator.setdefault("reasons", []).append("Insufficient domain independence for true verdict")
229
+ print("[verify] gemini_structured", {"verdict": verdict, "summary_preview": summary[:120]})
230
+ print("[verify] validator", validator)
231
+ print("[verify] debug_details_keys", list(debug_details.keys()))
232
+ else:
233
+ # Fallback minimal output
234
+ verdict = "uncertain"
235
+ summary = self._fallback_summary("uncertain", claim_context, claim_date, None, None, None)
236
+ sources = self._top_sources(filtered_evidence, 3)
237
+ print("[verify] gemini_structured_none_fallback", {"verdict": verdict, "summary_preview": summary[:120]})
238
+
239
+ if verdict != "false":
240
+ resp = {
241
+ "verdict": verdict,
242
+ "summary": summary,
243
+ "message": summary,
244
+ "sources": sources,
245
+ "claim_context": claim_context,
246
+ "claim_date": claim_date,
247
+ "validator": validator,
248
+ }
249
+ if config.DEBUG:
250
+ resp["debug"] = debug_details
251
+ return resp
252
+
253
+ # Generate visual counter-measure (pick first usable evidence image)
254
+ evidence_img_url = None
255
+ for ev in filtered_evidence:
256
+ if ev.get("thumbnail"):
257
+ evidence_img_url = ev.get("thumbnail")
258
+ break
259
+ if not evidence_img_url:
260
+ for ev in filtered_evidence:
261
+ if ev.get("link") and isinstance(ev.get("link"), str) and ev.get("link").startswith("http"):
262
+ evidence_img_url = ev.get("link")
263
+ break
264
+ evidence_img_url = evidence_img_url or (image_url or "")
265
+ output_path = await self._generate_counter_measure(
266
+ original_image_path=image_path,
267
+ evidence_image_url=evidence_img_url,
268
+ claim_context=claim_context,
269
+ claim_date=claim_date,
270
+ original_image_url=image_url,
271
+ )
272
+ print("[verify] counter_measure_generated", {"output_path": output_path})
273
+
274
+ # For false verdict, ensure summary exists
275
+ if not llm or llm.get("verdict", "").lower() != "false":
276
+ # Force LLM to produce a false-context explanation
277
+ llm = self._summarize_with_gemini_structured(
278
+ claim_context=claim_context,
279
+ claim_date=claim_date,
280
+ evidence=filtered_evidence,
281
+ forced_verdict="false",
282
+ ) or {}
283
+ summary = llm.get("summary") or self._fallback_summary("false", claim_context, claim_date, None, None, None)
284
+ sources = llm.get("top_sources") or self._top_sources(filtered_evidence, 3)
285
+ resp = {
286
+ "verdict": "false",
287
+ "summary": summary,
288
+ "message": summary,
289
+ "sources": sources,
290
+ "output_path": output_path,
291
+ "claim_context": claim_context,
292
+ "claim_date": claim_date,
293
+ "validator": validator,
294
+ }
295
+ if config.DEBUG:
296
+ resp["debug"] = debug_details
297
+ return resp
298
+
299
+ except Exception as e:
300
+ return {
301
+ "verdict": "error",
302
+ "summary": f"Error during verification: {str(e)}",
303
+ }
304
+
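+ # Example (sketch): calling verify() from async code; the URL and claim values
+ # below are placeholders for illustration.
+ #
+ #   result = await verifier.verify(
+ #       image_url="https://example.com/photo.jpg",
+ #       claim_context="Flood in Mumbai",
+ #       claim_date="July 2024",
+ #   )
+ #   print(result["verdict"], result.get("summary"))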
305
+ async def _analyze_image_with_vision(
306
+ self,
307
+ image_path: Optional[str] = None,
308
+ image_url: Optional[str] = None,
309
+ claim_context: str = "",
310
+ claim_date: str = ""
311
+ ) -> Dict[str, Any]:
312
+ """
313
+ Use Gemini Vision to analyze the actual image content for:
314
+ - AI-generated/deepfake indicators
315
+ - Manipulation artifacts
316
+ - Visual inconsistencies
317
+ - Context analysis
318
+
319
+ Args:
320
+ image_path: Path to the image file
321
+ image_url: URL of the image
322
+ claim_context: The claimed context
323
+ claim_date: The claimed date
324
+
325
+ Returns:
326
+ Dictionary with preliminary analysis
327
+ """
328
+ try:
329
+ if not self.gemini_model:
330
+ return {
331
+ "verdict": "uncertain",
332
+ "verified": False,
333
+ "message": "Gemini Vision not available",
334
+ "confidence": "low",
335
+ "analysis_method": "vision_unavailable",
336
+ }
337
+
338
+ # Load the image
339
+ import PIL.Image as PILImage
340
+ if image_path:
341
+ img = PILImage.open(image_path)
342
+ elif image_url:
343
+ img = await self._download_image(image_url)
344
+ else:
345
+ return {
346
+ "verdict": "uncertain",
347
+ "verified": False,
348
+ "message": "No image provided for vision analysis",
349
+ "confidence": "low",
350
+ "analysis_method": "vision_no_image",
351
+ }
352
+
353
+ prompt = f"""You are an expert image forensics analyst. Analyze this image carefully for authenticity and manipulation.
354
+
355
+ CLAIMED CONTEXT: {claim_context}
356
+ CLAIMED DATE: {claim_date}
357
+
358
+ Analyze the image for:
359
+ 1. **AI-Generated/Deepfake Indicators**: Look for signs of AI generation (inconsistent lighting, unnatural textures, artifacts around faces/objects, watermarks, telltale patterns)
360
+ 2. **Manipulation Artifacts**: Check for signs of editing (cloning, copy-paste, inconsistent shadows, lighting mismatches, pixelation patterns)
361
+ 3. **Visual Inconsistencies**: Look for impossible physics, inconsistent perspectives, mismatched elements
362
+ 4. **Context Analysis**: Does the visual content match the claimed context and date? (e.g., clothing styles, technology visible, environment)
363
+
364
+ Respond in JSON format:
365
+ {{
366
+ "verdict": "true|false|uncertain",
367
+ "verified": true|false,
368
+ "message": "Clear explanation of your findings",
369
+ "confidence": "high|medium|low",
370
+ "ai_generated_indicators": ["list of specific indicators found"],
371
+ "manipulation_artifacts": ["list of artifacts found"],
372
+ "visual_inconsistencies": ["list of inconsistencies"],
373
+ "context_match": "Does the image content match the claimed context?",
374
+ "reasoning": "Detailed reasoning for your verdict"
375
+ }}
376
+
377
+ Be specific and cite what you see in the image. If uncertain, explain why."""
378
+
379
+ # Use Gemini Vision to analyze the image
380
+ response = self.gemini_model.generate_content([prompt, img])
381
+
382
+ if not response.text:
383
+ return {
384
+ "verdict": "uncertain",
385
+ "verified": False,
386
+ "message": "Gemini Vision returned no response",
387
+ "confidence": "low",
388
+ "analysis_method": "vision_no_response",
389
+ }
390
+
391
+ # Parse JSON response
392
+ import json
393
+ response_text = response.text.strip()
394
+ if response_text.startswith("```json"):
395
+ response_text = response_text.replace("```json", "").replace("```", "").strip()
396
+ elif response_text.startswith("```"):
397
+ response_text = response_text.replace("```", "").strip()
398
+
399
+ try:
400
+ analysis = json.loads(response_text)
401
+ analysis["analysis_method"] = "gemini_vision"
402
+ return analysis
403
+ except json.JSONDecodeError:
404
+ # Fallback: extract verdict from text
405
+ verdict = "uncertain"
406
+ if "false" in response_text.lower() or "fake" in response_text.lower() or "manipulated" in response_text.lower():
407
+ verdict = "false"
408
+ elif "true" in response_text.lower() and "not" not in response_text.lower()[:50]:
409
+ verdict = "true"
410
+
411
+ return {
412
+ "verdict": verdict,
413
+ "verified": verdict == "true",
414
+ "message": response_text[:500],
415
+ "confidence": "medium",
416
+ "analysis_method": "gemini_vision_fallback",
417
+ "raw_response": response_text,
418
+ }
419
+
420
+ except Exception as e:
421
+ print(f"[vision] Error in Gemini Vision analysis: {e}")
422
+ return {
423
+ "verdict": "uncertain",
424
+ "verified": False,
425
+ "message": f"Error during vision analysis: {str(e)}",
426
+ "confidence": "low",
427
+ "analysis_method": "vision_error",
428
+ }
429
+
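+ # Example (sketch): the vision stage can also be exercised on its own; the URL
+ # below is a placeholder.
+ #
+ #   prelim = await verifier._analyze_image_with_vision(
+ #       image_url="https://example.com/photo.jpg",
+ #       claim_context="Flood in Mumbai",
+ #       claim_date="July 2024",
+ #   )
+ #   prelim["verdict"]  # "true" | "false" | "uncertain"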
430
+ def _synthesize_vision_and_evidence(
431
+ self,
432
+ preliminary_analysis: Dict[str, Any],
433
+ curated_analysis: Optional[Dict[str, Any]],
434
+ evidence: List[Dict[str, Any]],
435
+ claim_context: str,
436
+ claim_date: str,
437
+ ) -> Optional[Dict[str, Any]]:
438
+ """
439
+ Synthesize Gemini Vision analysis with reverse image search evidence.
440
+ Similar to text verification's hybrid synthesis.
441
+ """
442
+ try:
443
+ if not self.gemini_model:
444
+ return None
445
+
446
+ source_briefs = []
447
+ for item in evidence[:5]:
448
+ source_briefs.append({
449
+ "title": item.get("title"),
450
+ "snippet": item.get("snippet"),
451
+ "link": item.get("link"),
452
+ })
453
+
454
+ prompt = f"""You are an expert image verification analyst. Combine direct image analysis (Gemini Vision) with reverse image search evidence to produce a final verdict.
455
+
456
+ CLAIM: {claim_context}
457
+ CLAIM DATE: {claim_date}
458
+
459
+ DIRECT IMAGE ANALYSIS (Gemini Vision):
460
+ {json.dumps(preliminary_analysis or {}, indent=2, ensure_ascii=False)}
461
+
462
+ REVERSE IMAGE SEARCH ANALYSIS:
463
+ {json.dumps(curated_analysis or {}, indent=2, ensure_ascii=False)}
464
+
465
+ REVERSE IMAGE SEARCH SOURCES:
466
+ {json.dumps(source_briefs, indent=2, ensure_ascii=False)}
467
+
468
+ INSTRUCTIONS:
469
+ - Combine both analyses to make a final decision (true/false/uncertain)
470
+ - If vision analysis detects AI-generated/manipulated content, prioritize that
471
+ - If reverse image search finds contradictory evidence, factor that in
472
+ - If evidence is thin, keep the tone cautious
473
+ - Provide clear, actionable messaging for the end user
474
+
475
+ Respond ONLY in this JSON format:
476
+ {{
477
+ "verdict": "true|false|uncertain",
478
+ "verified": true|false,
479
+ "message": "Concise user-facing summary combining both analyses",
480
+ "confidence": "high|medium|low",
481
+ "reasoning": "Brief reasoning trail you followed",
482
+ "vision_findings": "Key findings from direct image analysis",
483
+ "search_findings": "Key findings from reverse image search"
484
+ }}"""
485
+
486
+ response = self.gemini_model.generate_content(prompt)
487
+ response_text = response.text.strip()
488
+
489
+ if response_text.startswith("```json"):
490
+ response_text = response_text.replace("```json", "").replace("```", "").strip()
491
+ elif response_text.startswith("```"):
492
+ response_text = response_text.replace("```", "").strip()
493
+
494
+ final_analysis = json.loads(response_text)
495
+ final_analysis.setdefault("verdict", "uncertain")
496
+ final_analysis.setdefault("verified", False)
497
+ final_analysis.setdefault("message", "Unable to synthesize final verdict.")
498
+ final_analysis.setdefault("confidence", "low")
499
+ final_analysis["analysis_method"] = "hybrid_vision_and_search"
500
+
501
+ # Build response similar to existing format
502
+ sources = self._top_sources(evidence, 3) if evidence else []
503
+
504
+ return {
505
+ "verdict": final_analysis["verdict"],
506
+ "summary": final_analysis["message"],
507
+ "message": final_analysis["message"],
508
+ "sources": sources,
509
+ "claim_context": claim_context,
510
+ "claim_date": claim_date,
511
+ "confidence": final_analysis.get("confidence", "medium"),
512
+ "analysis_method": "hybrid_vision_and_search",
513
+ "preliminary_analysis": preliminary_analysis,
514
+ "curated_analysis": curated_analysis,
515
+ }
516
+
517
+ except Exception as e:
518
+ print(f"Hybrid synthesis error: {e}")
519
+ return None
520
+
521
+ async def gather_evidence(self, image_path: Optional[str] = None, image_url: Optional[str] = None, claim_context: str = "") -> List[Dict[str, Any]]:
522
+ """
523
+ Evidence-only helper: performs reverse image search and returns ranked/filtered evidence
524
+ without invoking the LLM or producing a verdict.
525
+ """
526
+ try:
527
+ print("[verify] start", {"gather_only": True, "has_image_path": bool(image_path), "has_image_url": bool(image_url)})
528
+ search_results = await self._reverse_image_search(image_path=image_path, image_url=image_url)
529
+ if not search_results or (not search_results.get("inline_images") and not search_results.get("image_results")):
530
+ return []
531
+ evidence = self._collect_evidence(search_results)
532
+ filtered = self._rank_and_filter_evidence(evidence, claim_context, top_k=12)
533
+ return filtered
534
+ except Exception as e:
535
+ print(f"[gather_evidence] error: {e}")
536
+ return []
537
+
538
+ def _summarize_with_gemini(self, claim_context: str, claim_date: str, analysis: Dict[str, Any], forced_verdict: Optional[str] = None) -> Optional[Dict[str, Any]]:
539
+ try:
540
+ if not self.gemini_model:
541
+ return None
542
+
543
+ verdict = forced_verdict or analysis.get("verdict", "uncertain")
544
+ prompt = f"""You are a fact-checking assistant. Generate a single, concise sentence (no code blocks, no JSON)
545
+ that explains the verdict. Mirror the provided verdict exactly (do not change it).
546
+ If false, mention the most likely real context/time from evidence; if true, confirm briefly;
547
+ if uncertain, state uncertainty.
548
+
549
+ Claim context: {claim_context}
550
+ Claim date: {claim_date}
551
+ Verdict: {verdict}
552
+ Evidence (condensed): {self._top_sources(analysis.get('evidence', []), 3)}"""
553
+
554
+ response = self.gemini_model.generate_content(prompt)
555
+ text = response.text if response.text else None
556
+
557
+ return {"model": config.GEMINI_MODEL, "verdict": verdict, "text": text}
558
+ except Exception:
559
+ return None
560
+
561
+ def _collect_evidence(self, search_results: Dict[str, Any]) -> List[Dict[str, Any]]:
562
+ evidence: List[Dict[str, Any]] = []
563
+ for res in search_results.get("image_results", []):
564
+ evidence.append({
565
+ "title": res.get("title"),
566
+ "link": res.get("link"),
567
+ "source": res.get("source"),
568
+ "date": res.get("date"),
569
+ "thumbnail": res.get("thumbnail"),
570
+ "snippet": res.get("snippet"),
571
+ })
572
+ for img in search_results.get("inline_images", []):
573
+ evidence.append({
574
+ "title": img.get("title"),
575
+ "link": img.get("link"),
576
+ "source": img.get("source"),
577
+ "thumbnail": img.get("thumbnail"),
578
+ "snippet": img.get("snippet"),
579
+ })
580
+ return evidence
581
+
582
+ def _normalize_tokens(self, text: Optional[str]) -> List[str]:
583
+ if not text:
584
+ return []
585
+ import re
586
+ t = (text or "").lower()
587
+ stop = set(["the","a","an","and","or","for","to","of","in","on","at","with","by","from","this","that","is","are","was","were","as","it","its","their","his","her","him","she","he","they","them","we","you"])
588
+ toks = re.findall(r"[a-z0-9]{3,}", t)
589
+ return [x for x in toks if x not in stop]
590
+
591
+ def _evidence_score(self, claim_text: str, ev: Dict[str, Any]) -> float:
592
+ claim_tokens = set(self._normalize_tokens(claim_text))
593
+ ev_text = " ".join([s for s in [ev.get("title"), ev.get("snippet"), ev.get("source")] if s])
594
+ ev_tokens = set(self._normalize_tokens(ev_text))
595
+ if not claim_tokens or not ev_tokens:
596
+ return 0.0
597
+ overlap = len(claim_tokens & ev_tokens)
598
+ return overlap / float(len(claim_tokens))
599
+
600
+ def _rank_and_filter_evidence(self, evidence: List[Dict[str, Any]], claim_text: str, top_k: int = 12) -> List[Dict[str, Any]]:
601
+ scored: List[Tuple[float, int, Dict[str, Any]]] = []
602
+ for i, ev in enumerate(evidence):
603
+ s = self._evidence_score(claim_text, ev)
604
+ # Downrank social/UGC and YouTube to prefer article pages when checking relations
605
+ try:
606
+ from urllib.parse import urlparse
607
+ net = urlparse((ev.get("link") or "").strip()).netloc
608
+ except Exception:
609
+ net = ""
610
+ if net in config.LOW_PRIORITY_DOMAINS or net in ("youtube.com", "www.youtube.com", "youtu.be"):
611
+ s *= 0.6
612
+ scored.append((s, i, ev))
613
+ scored.sort(key=lambda x: x[0], reverse=True)
614
+ seen_urls = set()
615
+ seen_titles = set()
616
+ filtered: List[Dict[str, Any]] = []
617
+ for s, i, ev in scored:
618
+ url = (ev.get("link") or "").strip()
619
+ title = (ev.get("title") or "").strip().lower()
620
+ title_key = title[:80] if title else ""
621
+ if url and url in seen_urls:
622
+ continue
623
+ if title_key and title_key in seen_titles:
624
+ continue
625
+ filtered.append(ev)
626
+ if url:
627
+ seen_urls.add(url)
628
+ if title_key:
629
+ seen_titles.add(title_key)
630
+ if len(filtered) >= top_k:
631
+ break
632
+ print("[verify] evidence_rank_filter", {"input": len(evidence), "kept": len(filtered)})
633
+ return filtered
634
+
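+ # Example (sketch): ranking relies on _evidence_score, a plain token-overlap ratio.
+ # For the claim "Flood in Mumbai July 2024" and an evidence item titled
+ # "Mumbai flood photos, July 2024", every informative claim token appears in the
+ # evidence text, so the score is 1.0; unrelated items score near 0 and drop below
+ # the top_k cut-off. The claim and title below are illustrative only.
+ #
+ #   score = verifier._evidence_score(
+ #       "Flood in Mumbai July 2024",
+ #       {"title": "Mumbai flood photos, July 2024", "snippet": "", "source": ""},
+ #   )  # -> 1.0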
635
+ def _extract_json(self, text: str) -> Dict[str, Any]:
636
+ # Strip common fences and attempt to locate JSON object
637
+ t = text.strip()
638
+ if t.startswith("```"):
639
+ t = t.split("```", 1)[1]
640
+ t = t.lstrip("json").lstrip("\n").strip()
641
+ if "```" in t:
642
+ t = t.split("```", 1)[0].strip()
643
+ # Find first '{' and last '}'
644
+ start = t.find('{')
645
+ end = t.rfind('}')
646
+ if start != -1 and end != -1 and end > start:
647
+ t = t[start:end+1]
648
+ import json
649
+ return json.loads(t)
650
+
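+ # Example (sketch): _extract_json tolerates code-fenced model output, e.g.
+ #
+ #   verifier._extract_json('```json\n{"verdict": "false"}\n```')
+ #   # -> {"verdict": "false"}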
651
+ def _summarize_with_gemini_structured(self, claim_context: str, claim_date: str,
652
+ evidence: List[Dict[str, Any]],
653
+ forced_verdict: Optional[str] = None) -> Optional[Dict[str, Any]]:
654
+ try:
655
+ if not self.gemini_model:
656
+ return None
657
+
658
+ prompt = f"""You are a fact-checking assistant. Use the provided evidence items (title, link, date, source, snippet) to evaluate the FULL claim text.
659
+ The claim can include: event/context, place, timeframe, actors/entities, quantities, and relations/attribution. You may use only the provided evidence items.
660
+ Respond STRICTLY as compact JSON with keys:
661
+ - verdict: one of 'true' | 'false' | 'uncertain'
662
+ - relation_verdict: one of 'true' | 'false' | 'uncertain' (whether the stated relation holds)
663
+ - summary: <= 2 sentences, plain text
664
+ - top_sources: array of up to 3 objects {{title, link}}
665
+ - claim_parse: {{
666
+ entities: array of strings,
667
+ roles: array of strings,
668
+ relation: {{ predicate: string, subject: string, object: string }},
669
+ timeframe: {{ year: number|null, month: number|null }},
670
+ location: string|null,
671
+ citations: {{
672
+ entities: array of arrays of evidence indices (per entity),
673
+ roles: array of arrays of evidence indices (per role),
674
+ relation: array of evidence indices supporting subject+predicate+object together,
675
+ timeframe: array of evidence indices supporting the timeframe,
676
+ location: array of evidence indices supporting the location
677
+ }}
678
+ }}
679
+ Rules:
680
+ - verdict 'true' ONLY if evidence supports ALL key parts: event/context, place, timeframe, AND any stated relation.
681
+ - relation_verdict 'false' if the evidence supports a different relation and none supports the claimed relation.
682
+ - verdict 'false' if relation_verdict is 'false' or if place/time contradicts the claim without supporting evidence.
683
+ - 'uncertain' if ANY extracted part in claim_parse has no supporting citations.
684
+ - relation consistency: at least one cited evidence item MUST co-mention subject and object tokens with the predicate.
685
+ Do not include code fences or extra text; return only the JSON object.
686
+
687
+ Claim text: {claim_context}
688
+ Claim date: {claim_date}
689
+ Forced verdict: {forced_verdict}
690
+ Evidence: {evidence}"""
691
+
692
+ print("[gemini] request_meta", {"model": config.GEMINI_MODEL, "temp": config.GEMINI_TEMPERATURE, "topP": config.GEMINI_TOP_P})
693
+ response = self.gemini_model.generate_content(prompt)
694
+
695
+ if not response.text:
696
+ return None
697
+
698
+ text = response.text.strip()
699
+ print("[gemini] structured_text_preview", text[:200])
700
+ parsed = self._extract_json(text)
701
+ print("[gemini] parsed_json_keys", list(parsed.keys()) if isinstance(parsed, dict) else type(parsed).__name__)
702
+ return parsed if isinstance(parsed, dict) else None
703
+
704
+ except Exception as e:
705
+ print(f"[gemini] error: {e}")
706
+ return None
707
+
708
+ def _summarize_with_gemini_majority(self, claim_context: str, claim_date: str,
709
+ evidence: List[Dict[str, Any]]) -> Optional[Dict[str, Any]]:
710
+ """
711
+ Simpler majority-based prompt: ask Gemini to decide true/false by which side has more supporting
712
+ evidence; only return uncertain if support is roughly equal/ambiguous.
713
+ Returns compact JSON: { verdict, clarification, corrected_relation, top_sources }
714
+ """
715
+ try:
716
+ if not self.gemini_model:
717
+ return None
718
+ prompt = f"""You are a citation-driven fact-checking assistant.
719
+ Given a CLAIM and a list of EVIDENCE items (title, link, date, source, snippet), decide if the CLAIM itself is true or false.
720
+
721
+ STRICT adjudication rules (apply literally to the CLAIM):
722
+ 1) Extract the relation from the CLAIM as:
723
+ relation: {{ predicate: string, subject: string, object: string }}
724
+ 2) Evaluate ONLY the CLAIM's relation. Mentions of a different object (alternative person/role/event/location) are NOT support for the CLAIM.
725
+ 3) SUPPORT only when an evidence item explicitly co-mentions the CLAIM's subject AND the CLAIM's object with the predicate in title/snippet (token-level match; paraphrases of those tokens are fine). General marital status or vague wording does NOT count as support if the CLAIM's object is not explicitly present.
726
+ 4) CONTRADICTION when evidence explicitly supports a mutually exclusive alternative relation (e.g., same subject + predicate with a different object), or explicitly negates the CLAIM.
727
+ 5) Social/UGC links may appear; still judge by content but prefer clearer, explicit co-mentions from any source.
728
+ 6) Decision for the CLAIM:
729
+ - If SUPPORT > CONTRADICTION by a meaningful margin, verdict = "true".
730
+ - If CONTRADICTION > SUPPORT by a meaningful margin, verdict = "false".
731
+ - If neither side is clearly stronger or no explicit co-mentions exist, verdict = "uncertain".
732
+ 7) Use only the provided EVIDENCE texts; no outside knowledge.
733
+
734
+ Output strictly as compact JSON with keys (and nothing else):
735
+ verdict: one of 'true' | 'false' | 'uncertain'
736
+ clarification: one concise sentence that answers the CLAIM directly. If verdict is 'false' or 'uncertain', state the most supported alternative relation (e.g., "<subject> was not <predicate> <object>. Instead, <subject> <predicate> <alt_object> at <context>."). Avoid hedging like "does not confirm".
737
+ corrected_relation: {{ predicate: string, subject: string, object: string }} | null
738
+ top_sources: up to 3 objects {{title, link}}
739
+
740
+ CLAIM: {claim_context}
741
+ CLAIM_DATE: {claim_date}
742
+ EVIDENCE: {evidence}
743
+ """
744
+ print("[gemini] request_meta", {"model": config.GEMINI_MODEL, "temp": config.GEMINI_TEMPERATURE, "topP": config.GEMINI_TOP_P})
745
+ response = self.gemini_model.generate_content(prompt)
746
+ if not response.text:
747
+ return None
748
+ text = response.text.strip()
749
+ print("[gemini] structured_text_preview", text[:200])
750
+ parsed = self._extract_json(text)
751
+ print("[gemini] parsed_json_keys", list(parsed.keys()) if isinstance(parsed, dict) else type(parsed).__name__)
752
+ return parsed if isinstance(parsed, dict) else None
753
+ except Exception as e:
754
+ print(f"[gemini] error: {e}")
755
+ return None
756
+
757
+ def _top_sources(self, evidence: List[Dict[str, Any]], k: int) -> List[Dict[str, Any]]:
758
+ items = []
759
+ for e in evidence:
760
+ title = e.get("title")
761
+ link = e.get("link")
762
+ if title or link:
763
+ items.append({"title": title, "link": link})
764
+ if len(items) >= k:
765
+ break
766
+ return items
767
+
768
+ def _validate_llm_parse(self, claim_text: str, evidence: List[Dict[str, Any]], llm: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
769
+ checks: Dict[str, Any] = {}
770
+ reasons: List[str] = []
771
+ passed = True
772
+ parse = (llm or {}).get("claim_parse") or {}
773
+ citations = parse.get("citations") or {}
774
+ # Helper to get combined text for an evidence index
775
+ def ev_text(i: int) -> str:
776
+ if i < 0 or i >= len(evidence):
777
+ return ""
778
+ ev = evidence[i]
779
+ return " ".join([t for t in [ev.get("title"), ev.get("snippet")] if t])
780
+ # 1) Ensure each entities[] and roles[] item has at least one citation
781
+ for key in ["entities", "roles"]:
782
+ items = parse.get(key) or []
783
+ cits = citations.get(key) or []
784
+ ok = bool(items) and len(cits) == len(items) and all(len(lst) > 0 for lst in cits if isinstance(lst, list))
785
+ checks[f"{key}_citations"] = ok
786
+ if not ok:
787
+ passed = False
788
+ reasons.append(f"Missing citations for {key}")
789
+ # 2) timeframe and location citations exist if present
790
+ for key in ["timeframe", "location"]:
791
+ has_item = bool(parse.get(key))
792
+ if has_item:
793
+ ok = bool(citations.get(key)) and len(citations.get(key)) > 0
794
+ checks[f"{key}_citations"] = ok
795
+ if not ok:
796
+ passed = False
797
+ reasons.append(f"Missing citations for {key}")
798
+ # 2b) If location cited, require token presence in at least one cited item
799
+ def _tok(text: str) -> set:
800
+ import re
801
+ return set(re.findall(r"[a-z0-9]{3,}", (text or "").lower()))
802
+ if parse.get("location") and citations.get("location"):
803
+ loc_toks = _tok(str(parse.get("location") or ""))
804
+ loc_token_ok = False
805
+ for i in citations.get("location"):
806
+ try:
807
+ it = _tok(ev_text(int(i)))
808
+ except Exception:
809
+ it = set()
810
+ if loc_toks and (loc_toks & it):
811
+ loc_token_ok = True
812
+ break
813
+ checks["location_token_match"] = loc_token_ok
814
+ if not loc_token_ok:
815
+ passed = False
816
+ reasons.append("Location tokens not found in cited items")
817
+ # 3) relation citations and co-mention (subject/object in same item)
818
+ relation = parse.get("relation") or {}
819
+ subj = (relation.get("subject") or "").strip()
820
+ obj = (relation.get("object") or "").strip()
821
+ # Token-based co-mention: require at least one informative token from subject and object in same item
822
+ def tokens(text: str) -> List[str]:
823
+ import re
824
+ return re.findall(r"[a-z0-9]{3,}", (text or "").lower())
825
+ subj_toks = set(tokens(subj))
826
+ obj_toks = set(tokens(obj))
827
+ rel_indices: List[int] = citations.get("relation") or []
828
+ rel_ok = False
829
+ for idx in rel_indices:
830
+ txt = ev_text(int(idx))
831
+ tl_toks = set(tokens(txt))
832
+ if subj_toks and obj_toks and (subj_toks & tl_toks) and (obj_toks & tl_toks):
833
+ rel_ok = True
834
+ break
835
+ checks["relation_comention"] = rel_ok
836
+ # Allow pooled-evidence relation support via shared anchors if co-mention failed
837
+ pooled_ok = False
838
+ pooled_detail: Dict[str, Any] = {}
839
+ if not rel_ok:
840
+ try:
841
+ entity_list: List[str] = (parse.get("entities") or [])
842
+ entity_cits: List[List[int]] = (citations.get("entities") or [])
843
+ def _tokens(text: str) -> set:
844
+ import re
845
+ return set(re.findall(r"[a-z0-9]{3,}", (text or "").lower()))
846
+ # Map subject/object to entity indices by token overlap
847
+ def best_entity_indices(name_toks: set) -> List[int]:
848
+ scored: List[Tuple[int,int]] = []
849
+ for idx, ent in enumerate(entity_list):
850
+ et = _tokens(ent)
851
+ scored.append((len(name_toks & et), idx))
852
+ scored.sort(reverse=True)
853
+ return [i for s,i in scored if s > 0]
854
+ subj_toks_set = _tokens(subj)
855
+ obj_toks_set = _tokens(obj)
856
+ subj_idxs = best_entity_indices(subj_toks_set) if subj_toks_set else []
857
+ obj_idxs = best_entity_indices(obj_toks_set) if obj_toks_set else []
858
+ subj_pool: List[int] = []
859
+ obj_pool: List[int] = []
860
+ for si in subj_idxs:
861
+ if si < len(entity_cits) and isinstance(entity_cits[si], list):
862
+ for v in entity_cits[si]:
863
+ try:
864
+ subj_pool.append(int(v))
865
+ except Exception:
866
+ pass
867
+ for oi in obj_idxs:
868
+ if oi < len(entity_cits) and isinstance(entity_cits[oi], list):
869
+ for v in entity_cits[oi]:
870
+ try:
871
+ obj_pool.append(int(v))
872
+ except Exception:
873
+ pass
874
+ subj_pool = list({int(x) for x in subj_pool})
875
+ obj_pool = list({int(x) for x in obj_pool})
876
+ # Anchors from claim parse
877
+ anchor_year = None
878
+ tf = parse.get("timeframe") or {}
879
+ try:
880
+ anchor_year = int(tf.get("year")) if tf.get("year") is not None else None
881
+ except Exception:
882
+ anchor_year = None
883
+ anchor_month_name = None
884
+ try:
885
+ mn = int(tf.get("month")) if tf.get("month") is not None else None
886
+ months = ["january","february","march","april","may","june","july","august","september","october","november","december"]
887
+ anchor_month_name = months[mn-1] if mn and 1 <= mn <= 12 else None
888
+ except Exception:
889
+ anchor_month_name = None
890
+ loc_tokens = _tok(str(parse.get("location") or ""))
891
+ claim_event_tokens = _tok(claim_text)
892
+ import re
893
+ def item_text(idx: int) -> str:
894
+ return ev_text(idx)
895
+ def has_year(idx: int) -> bool:
896
+ return bool(anchor_year is not None and re.search(rf"\b{anchor_year}\b", item_text(idx) or ""))
897
+ def has_month(idx: int) -> bool:
898
+ return bool(anchor_month_name and (anchor_month_name in (item_text(idx) or "").lower()))
899
+ def has_loc(idx: int) -> bool:
900
+ return bool(loc_tokens and (loc_tokens & _tok(item_text(idx))))
901
+ def event_overlap(idx1: int, idx2: int) -> bool:
902
+ t1 = _tok(item_text(idx1))
903
+ t2 = _tok(item_text(idx2))
904
+ return bool((claim_event_tokens & t1) and (claim_event_tokens & t2))
905
+ def anchors_align(i: int, j: int) -> Tuple[bool, List[str]]:
906
+ reasons: List[str] = []
907
+ if has_year(i) and has_year(j):
908
+ reasons.append("year")
909
+ if has_month(i) and has_month(j):
910
+ reasons.append("month")
911
+ if has_loc(i) and has_loc(j):
912
+ reasons.append("location")
913
+ if event_overlap(i, j):
914
+ reasons.append("event")
915
+ return (len(reasons) > 0, reasons)
916
+ for si in subj_pool:
917
+ for oj in obj_pool:
918
+ ok, rs = anchors_align(int(si), int(oj))
919
+ if ok:
920
+ pooled_ok = True
921
+ pooled_detail = {"subj_idx": int(si), "obj_idx": int(oj), "anchors": rs}
922
+ break
923
+ if pooled_ok:
924
+ break
925
+ except Exception:
926
+ pooled_ok = False
927
+ checks["relation_pooled_anchor"] = pooled_ok
928
+ if pooled_ok:
929
+ checks["relation_pooled_detail"] = pooled_detail
930
+ if not rel_ok and not pooled_ok:
931
+ passed = False
932
+ reasons.append("Relation not supported by co-mention or pooled anchors")
933
+ # 4) Simple entity overlap score between claim tokens and cited items
934
+ import re
935
+ claim_tokens = set([t.lower() for t in re.findall(r"[A-Za-z]{3,}", claim_text or "")])
936
+ cited_indices = set()
937
+ for arr in (citations.get("entities") or []):
938
+ for i in arr:
939
+ try:
940
+ cited_indices.add(int(i))
941
+ except Exception:
942
+ pass
943
+ overlap_hits = 0
944
+ for i in cited_indices:
945
+ tl = ev_text(i).lower()
946
+ if any(tok in tl for tok in claim_tokens):
947
+ overlap_hits += 1
948
+ entity_overlap_score = overlap_hits / (len(cited_indices) or 1)
949
+ checks["entity_overlap_score"] = entity_overlap_score
950
+ # 5) Date check: allow year and optional month names from claim timeframe in cited items
951
+ year = None
952
+ month_num = None
953
+ tf = parse.get("timeframe") or {}
954
+ try:
955
+ year = int(tf.get("year")) if tf.get("year") is not None else None
956
+ except Exception:
957
+ year = None
958
+ try:
959
+ month_num = int(tf.get("month")) if tf.get("month") is not None else None
960
+ except Exception:
961
+ month_num = None
962
+ date_ok = True
963
+ if year is not None:
964
+ date_ok = False
965
+ for i in (citations.get("timeframe") or []):
966
+ try:
967
+ ev = evidence[int(i)]
968
+ except Exception:
969
+ continue
970
+ text = " ".join([t for t in [ev.get("title"), ev.get("snippet"), ev.get("date"), ev.get("source"), ev.get("link")] if t])
971
+ if re.search(rf"\b{year}\b", text or ""):
972
+ date_ok = True
973
+ break
974
+ # Month name matching if provided
975
+ if month_num is not None:
976
+ month_names = [
977
+ "january","february","march","april","may","june",
978
+ "july","august","september","october","november","december"
979
+ ]
980
+ mname = month_names[month_num-1] if 1 <= month_num <= 12 else None
981
+ if mname and (mname in (text or "").lower()):
982
+ date_ok = True
983
+ break
984
+ checks["timeframe_match"] = date_ok
985
+ if not date_ok:
986
+ passed = False
987
+ reasons.append("Timeframe year not supported in cited items")
988
+ # Domains used (for logging only)
989
+ from urllib.parse import urlparse
990
+ domains = []
991
+ for ev in evidence:
992
+ try:
993
+ net = urlparse(ev.get("link") or "").netloc
994
+ except Exception:
995
+ net = ""
996
+ if net:
997
+ domains.append(net)
998
+ debug = {
999
+ "claim_parse": parse,
1000
+ "citations": citations,
1001
+ "domains_used": domains,
1002
+ }
1003
+ return {"passed": passed, "reasons": reasons, "checks": checks}, debug
1004
+
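+ # Note: this validator is intentionally conservative. Upstream, a "true" LLM
+ # verdict is downgraded to "uncertain" whenever any extracted claim part lacks
+ # citations, the relation fails both the co-mention and pooled-anchor tests, or
+ # the claimed timeframe year is not found in the cited items; "false" verdicts
+ # are never upgraded.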
1005
+ def _fallback_summary(self, verdict: str, claim_context: str, claim_date: str,
1006
+ best_title: Optional[str], best_link: Optional[str], best_year: Optional[int]) -> str:
1007
+ if verdict == "false":
1008
+ where = best_title or "another place/time"
1009
+ when = str(best_year) if best_year else "an earlier date"
1010
+ src = best_link or "a corroborating source"
1011
+ return f"Claim is false. The image corresponds to {where} from {when}, not {claim_context}, {claim_date}. Source: {src}."
1012
+ if verdict == "true":
1013
+ return f"Claim is true. The available evidence supports {claim_context}, {claim_date}."
1014
+ return f"Claim is uncertain. Evidence is inconclusive for {claim_context}, {claim_date}."
1015
+
1016
+ def _clean_summary_text(self, text: Optional[str]) -> str:
1017
+ if not text:
1018
+ return ""
1019
+ t = text.strip()
1020
+ # Remove common code-fence wrappers
1021
+ if t.startswith("```"):
1022
+ # drop first fence
1023
+ t = t.split("```", 1)[1]
1024
+ # drop language tag if present
1025
+ t = t.lstrip("\n").split("\n", 1)[-1] if "\n" in t else t
1026
+ # drop trailing fence
1027
+ if "```" in t:
1028
+ t = t.rsplit("```", 1)[0]
1029
+ return t.strip()
1030
+
1031
+ async def _reverse_image_search(self, image_path: Optional[str] = None, image_url: Optional[str] = None) -> Dict[str, Any]:
1032
+ """
1033
+ Perform reverse image search using SerpApi
1034
+
1035
+ Args:
1036
+ image_path: Path to the image file
1037
+ image_url: URL of the image
1038
+
1039
+ Returns:
1040
+ Search results from SerpApi
1041
+ """
1042
+ try:
1043
+ if GoogleSearch is None:
1044
+ raise RuntimeError("google-search-results package not available. Install with: pip install google-search-results")
1045
+
1046
+ # Build params per SerpApi docs - use official client for ALL requests
1047
+ params: Dict[str, Any] = {
1048
+ "engine": "google_reverse_image",
1049
+ "api_key": self.api_key,
1050
+ }
1051
+
1052
+ if image_url:
1053
+ # Use image_url parameter for URLs
1054
+ params["image_url"] = image_url
1055
+ print("[serpapi] Using image_url parameter")
1056
+ elif image_path:
1057
+ # For local files, upload to Cloudinary first to get a public URL
1058
+ try:
1059
+ cloudinary_url = await self._upload_to_cloudinary(image_path)
1060
+ if cloudinary_url:
1061
+ params["image_url"] = cloudinary_url
1062
+ print(f"[serpapi] Using Cloudinary URL: {cloudinary_url}")
1063
+ else:
1064
+ print("[serpapi] Cloudinary upload failed, falling back to base64")
1065
+ # Fallback to base64 if Cloudinary fails
1066
+ with open(image_path, "rb") as img_file:
1067
+ img_data = img_file.read()
1068
+ img_base64 = base64.b64encode(img_data).decode("utf-8")
1069
+ params["image_content"] = img_base64
1070
+ print("[serpapi] Using image_content parameter (base64 fallback)")
1071
+ except Exception as e:
1072
+ print(f"[serpapi] Error uploading to Cloudinary: {e}")
1073
+ # Fallback to base64
1074
+ with open(image_path, "rb") as img_file:
1075
+ img_data = img_file.read()
1076
+ img_base64 = base64.b64encode(img_data).decode("utf-8")
1077
+ params["image_content"] = img_base64
1078
+ print("[serpapi] Using image_content parameter (base64 fallback)")
1079
+
1080
+ # Debug prints
1081
+ print("[serpapi] params", {
1082
+ "engine": params.get("engine"),
1083
+ "has_image_url": bool(params.get("image_url")),
1084
+ "has_image_content": bool(params.get("image_content")),
1085
+ "image_content_len": len(params.get("image_content", "")) if params.get("image_content") else 0,
1086
+ })
1087
+
1088
+ # Use different approaches based on whether we have image_url or image_content
1089
+ if params.get("image_url"):
1090
+ # For image_url, use the official client (works well)
1091
+ print("[serpapi] Using official GoogleSearch client for image_url")
1092
+ search = GoogleSearch(params) # type: ignore
1093
+ results = search.get_dict()
1094
+ print("[serpapi] Successfully got results from GoogleSearch client")
1095
+ return results
1096
+ else:
1097
+ # For image_content (base64), use direct HTTP POST to avoid header size issues
1098
+ print("[serpapi] Using direct HTTP POST for image_content (base64)")
1099
+ try:
1100
+ import requests
1101
+ response = requests.post(
1102
+ "https://serpapi.com/search?engine=google_reverse_image",
1103
+ data=params,
1104
+ timeout=60
1105
+ )
1106
+ print(f"[serpapi] HTTP POST status: {response.status_code}")
1107
+ response.raise_for_status()
1108
+ results = response.json()
1109
+ print("[serpapi] Successfully got results from HTTP POST")
1110
+ return results
1111
+ except Exception as http_error:
1112
+ print(f"[serpapi] HTTP POST failed: {http_error}")
1113
+ return {}
1114
+
1115
+ except Exception as e:
1116
+ print(f"[serpapi] Error in reverse image search: {e}")
1117
+ print(f"[serpapi] Error type: {type(e).__name__}")
1118
+ import traceback
1119
+ print(f"[serpapi] Traceback: {traceback.format_exc()}")
1120
+ return {}
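+ # Example (sketch): the two request paths above correspond to
+ #
+ #   GoogleSearch({"engine": "google_reverse_image", "api_key": key,
+ #                 "image_url": url}).get_dict()
+ #
+ # for public image URLs, and a plain HTTP POST of the same params to
+ # https://serpapi.com/search?engine=google_reverse_image when only base64
+ # image_content is available, avoiding header/URL size issues with the large payload.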
1121
+
1122
+ def _extract_year_from_text(self, text: str) -> Optional[int]:
1123
+ if not text:
1124
+ return None
1125
+ import re
1126
+ years = re.findall(r"(19\d{2}|20\d{2})", text)
1127
+ if not years:
1128
+ return None
1129
+ try:
1130
+ return int(years[0])
1131
+ except Exception:
1132
+ return None
1133
+
1134
+ def _context_mismatch(self, claim_context_lc: str, text: str) -> bool:
1135
+ t = (text or "").lower()
1136
+ if not claim_context_lc:
1137
+ return False
1138
+ # Simple heuristic: if text contains a strong, different location keyword
1139
+ known = {
1140
+ "mumbai": ["delhi", "bangalore", "chennai", "kolkata", "new york", "london"],
1141
+ "new york": ["mumbai", "delhi", "london", "paris", "dubai"],
1142
+ }
1143
+ for k, others in known.items():
1144
+ if claim_context_lc == k:
1145
+ if any(o in t for o in others):
1146
+ return True
1147
+ return False
1148
+
1149
+ async def _generate_counter_measure(self, original_image_path: Optional[str], evidence_image_url: str,
1150
+ claim_context: str, claim_date: str, original_image_url: Optional[str] = None) -> str:
1151
+ """
1152
+ Generate a visual counter-measure image
1153
+
1154
+ Args:
1155
+ original_image_path: Path to the original misleading image
1156
+ evidence_image_url: URL of the evidence image
1157
+ claim_context: The claimed context
1158
+ claim_date: The claimed date
1159
+
1160
+ Returns:
1161
+ Path to the generated counter-measure image
1162
+ """
1163
+ try:
1164
+ # Load original image: from path if available, else download from original_image_url
1165
+ if original_image_path:
1166
+ original_img = Image.open(original_image_path)
1167
+ elif original_image_url:
1168
+ original_img = await self._download_image(original_image_url)
1169
+ else:
1170
+ # Fallback to evidence image as placeholder
1171
+ original_img = await self._download_image(evidence_image_url)
1172
+
1173
+ # Download evidence image
1174
+ evidence_img = await self._download_image(evidence_image_url)
1175
+
1176
+ # Create counter-measure
1177
+ counter_measure = self._create_counter_measure_image(
1178
+ original_img, evidence_img, claim_context, claim_date
1179
+ )
1180
+
1181
+ # Save to temporary file
1182
+ output_path = tempfile.mktemp(suffix=".png")
1183
+ counter_measure.save(output_path, "PNG")
1184
+
1185
+ return output_path
1186
+
1187
+ except Exception as e:
1188
+ print(f"Error generating counter-measure: {e}")
1189
+ raise
1190
+
1191
+ async def _upload_to_cloudinary(self, image_path: str) -> Optional[str]:
1192
+ """
1193
+ Upload image to Cloudinary and return the public URL
1194
+
1195
+ Args:
1196
+ image_path: Path to the source image file
1197
+
1198
+ Returns:
1199
+ Cloudinary public URL of the uploaded image, or None if upload fails
1200
+ """
1201
+ try:
1202
+ import cloudinary
1203
+ import cloudinary.uploader
1204
+ from config import config
1205
+
1206
+ # Configure Cloudinary
1207
+ cloudinary.config(
1208
+ cloud_name=config.CLOUDINARY_CLOUD_NAME,
1209
+ api_key=config.CLOUDINARY_API_KEY,
1210
+ api_secret=config.CLOUDINARY_API_SECRET
1211
+ )
1212
+
1213
+ # Upload to Cloudinary with frames folder
1214
+ result = cloudinary.uploader.upload(
1215
+ image_path,
1216
+ folder="frames",
1217
+ resource_type="image"
1218
+ )
1219
+
1220
+ if result and result.get('secure_url'):
1221
+ public_url = result['secure_url']
1222
+ print(f"[cloudinary] Uploaded {image_path} to {public_url}")
1223
+ return public_url
1224
+ else:
1225
+ print("[cloudinary] Upload failed - no secure_url in response")
1226
+ return None
1227
+
1228
+ except Exception as e:
1229
+ print(f"[cloudinary] Error uploading to Cloudinary: {e}")
1230
+ return None
1231
+
1232
+ async def _copy_to_public_folder(self, image_path: str) -> Optional[str]:
1233
+ """
1234
+ Copy image to public/frames folder and return the public URL
1235
+
1236
+ Args:
1237
+ image_path: Path to the source image file
1238
+
1239
+ Returns:
1240
+ Public URL of the copied image, or None if copy fails
1241
+ """
1242
+ try:
1243
+ import shutil
1244
+ import uuid
1245
+ from pathlib import Path
1246
+
1247
+ # Create public/frames directory if it doesn't exist
1248
+ public_frames_dir = Path("public/frames")
1249
+ public_frames_dir.mkdir(parents=True, exist_ok=True)
1250
+
1251
+ # Generate unique filename
1252
+ file_extension = Path(image_path).suffix
1253
+ unique_filename = f"{uuid.uuid4()}{file_extension}"
1254
+ public_path = public_frames_dir / unique_filename
1255
+
1256
+ # Copy the file
1257
+ shutil.copy2(image_path, public_path)
1258
+
1259
+ # Return the public URL
1260
+ public_url = f"http://127.0.0.1:{config.SERVICE_PORT}/frames/{unique_filename}"
1261
+ print(f"[copy] Copied {image_path} to {public_path}")
1262
+ print(f"[copy] Public URL: {public_url}")
1263
+
1264
+ return public_url
1265
+
1266
+ except Exception as e:
1267
+ print(f"[copy] Error copying to public folder: {e}")
1268
+ return None
1269
+
1270
+ async def _download_image(self, image_url: str) -> Image.Image:
1271
+ """
1272
+ Download an image from URL
1273
+
1274
+ Args:
1275
+ image_url: URL of the image to download
1276
+
1277
+ Returns:
1278
+ PIL Image object
1279
+ """
1280
+ try:
1281
+ headers = {
1282
+ "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0 Safari/537.36",
1283
+ "Referer": "https://www.google.com/",
1284
+ }
1285
+ response = requests.get(image_url, timeout=15, headers=headers, stream=True)
1286
+ response.raise_for_status()
1287
+ content_type = response.headers.get("Content-Type", "").lower()
1288
+ if "image" not in content_type:
1289
+ # Not an image (likely a webpage); return placeholder
1290
+ return Image.new('RGB', (300, 200), color='gray')
1291
+ data = response.content
1292
+ img = Image.open(io.BytesIO(data))
1293
+ return img
1294
+ except Exception:
1295
+ # Return a placeholder image if download fails
1296
+ return Image.new('RGB', (300, 200), color='gray')
1297
+
1298
+ def _create_counter_measure_image(self, original_img: Image.Image, evidence_img: Image.Image,
1299
+ claim_context: str, claim_date: str) -> Image.Image:
1300
+ """
1301
+ Create the counter-measure image with side-by-side comparison
1302
+
1303
+ Args:
1304
+ original_img: The original misleading image
1305
+ evidence_img: The evidence image
1306
+ claim_context: The claimed context
1307
+ claim_date: The claimed date
1308
+
1309
+ Returns:
1310
+ Generated counter-measure image
1311
+ """
1312
+ # Resize images to consistent dimensions
1313
+ target_width, target_height = 400, 300
1314
+
1315
+ original_img = original_img.resize((target_width, target_height), Image.Resampling.LANCZOS)
1316
+ evidence_img = evidence_img.resize((target_width, target_height), Image.Resampling.LANCZOS)
1317
+
1318
+ # Create canvas for side-by-side layout
1319
+ canvas_width = target_width * 2 + 50 # Extra space for padding
1320
+ canvas_height = target_height + 200 # Extra space for labels and watermark
1321
+
1322
+ canvas = Image.new('RGB', (canvas_width, canvas_height), 'white')
1323
+ draw = ImageDraw.Draw(canvas)
1324
+
1325
+ # Try to load a font, fall back to default if not available
1326
+ try:
1327
+ font_large = ImageFont.truetype("/System/Library/Fonts/Arial.ttf", 24)
1328
+ font_medium = ImageFont.truetype("/System/Library/Fonts/Arial.ttf", 18)
1329
+ font_small = ImageFont.truetype("/System/Library/Fonts/Arial.ttf", 14)
1330
+ except:
1331
+ font_large = ImageFont.load_default()
1332
+ font_medium = ImageFont.load_default()
1333
+ font_small = ImageFont.load_default()
1334
+
1335
+ # Add title
1336
+ title = "FALSE CONTEXT DETECTED"
1337
+ title_bbox = draw.textbbox((0, 0), title, font=font_large)
1338
+ title_width = title_bbox[2] - title_bbox[0]
1339
+ title_x = (canvas_width - title_width) // 2
1340
+ draw.text((title_x, 20), title, fill='red', font=font_large)
1341
+
1342
+ # Add original image (left side)
1343
+ original_x = 25
1344
+ original_y = 80
1345
+ canvas.paste(original_img, (original_x, original_y))
1346
+
1347
+ # Add evidence image (right side)
1348
+ evidence_x = original_x + target_width + 25
1349
+ evidence_y = original_y
1350
+ canvas.paste(evidence_img, (evidence_x, evidence_y))
1351
+
1352
+ # Add labels
1353
+ claim_label = f"CLAIM: {claim_context}, {claim_date}"
1354
+ reality_label = "REALITY: Different context/earlier date"
1355
+
1356
+ draw.text((original_x, original_y - 30), claim_label, fill='red', font=font_medium)
1357
+ draw.text((evidence_x, evidence_y - 30), reality_label, fill='green', font=font_medium)
1358
+
1359
+ # Add watermark
1360
+ watermark = "FALSE CONTEXT"
1361
+ watermark_img = Image.new('RGBA', canvas.size, (0, 0, 0, 0))
1362
+ watermark_draw = ImageDraw.Draw(watermark_img)
1363
+
1364
+ # Create semi-transparent watermark
1365
+ watermark_bbox = watermark_draw.textbbox((0, 0), watermark, font=font_large)
1366
+ watermark_width = watermark_bbox[2] - watermark_bbox[0]
1367
+ watermark_height = watermark_bbox[3] - watermark_bbox[1]
1368
+
1369
+ watermark_x = (canvas_width - watermark_width) // 2
1370
+ watermark_y = (canvas_height - watermark_height) // 2
1371
+
1372
+ watermark_draw.text((watermark_x, watermark_y), watermark, fill=(255, 0, 0, 128), font=font_large)
1373
+
1374
+ # Composite watermark onto canvas
1375
+ canvas = Image.alpha_composite(canvas.convert('RGBA'), watermark_img).convert('RGB')
1376
+
1377
+ return canvas
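For readers following the compositing logic above, here is a minimal, self-contained Pillow sketch of the same semi-transparent watermark technique; the canvas size, text, and output filename are illustrative placeholders, not part of this commit.

from PIL import Image, ImageDraw, ImageFont

base = Image.new('RGB', (850, 500), 'white')            # stand-in for the side-by-side comparison canvas
overlay = Image.new('RGBA', base.size, (0, 0, 0, 0))    # fully transparent layer for the watermark
draw = ImageDraw.Draw(overlay)
font = ImageFont.load_default()
draw.text((320, 240), "FALSE CONTEXT", fill=(255, 0, 0, 128), font=font)  # half-opaque red text
stamped = Image.alpha_composite(base.convert('RGBA'), overlay).convert('RGB')
stamped.save("counter_measure_demo.png")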
services/input_processor.py ADDED
@@ -0,0 +1,308 @@
1
+ import os
2
+ import re
3
+ import json
4
+ from typing import Dict, List, Optional, Union, Tuple
5
+ import google.generativeai as genai
6
+ import tempfile
7
+ from config import config
8
+
9
+ class InputProcessor:
10
+ """
11
+ Intelligent input processor that converts chatbot input into structured verification requests
12
+ """
13
+
14
+ def __init__(self):
15
+ # Configure Gemini
16
+ genai.configure(api_key=config.GEMINI_API_KEY)
17
+ self.model = genai.GenerativeModel(
18
+ config.GEMINI_MODEL,
19
+ generation_config=genai.types.GenerationConfig(
20
+ temperature=config.GEMINI_TEMPERATURE,
21
+ top_p=config.GEMINI_TOP_P,
22
+ max_output_tokens=config.GEMINI_MAX_TOKENS
23
+ )
24
+ )
25
+
26
+ self.system_prompt = """You are an intelligent input processor for a visual verification service.
27
+
28
+ Your task is to analyze user input and extract:
29
+ 1. Image/video/audio content (files, URLs, or descriptions)
30
+ 2. Claim context (what the user is claiming)
31
+ 3. Claim date (when the claim was made)
32
+ 4. Type of verification needed (image, video, audio, or text)
33
+
34
+ Return a JSON response with this structure:
35
+ {
36
+ "verification_type": "image" or "video" or "audio" or "text",
37
+ "content": {
38
+ "files": ["list of file paths if files provided"],
39
+ "urls": ["list of image/video/audio URLs"],
40
+ "descriptions": ["list of text descriptions"],
41
+ "text": "the text claim to verify (if verification_type is text)"
42
+ },
43
+ "claim_context": "extracted or inferred claim context",
44
+ "claim_date": "extracted or inferred date"
45
+ }
46
+
47
+ Rules:
48
+ - If multiple images/videos/audio files are mentioned, separate them clearly
49
+ - Extract URLs from text using regex patterns
50
+ - Infer context from surrounding text if not explicitly stated
51
 + - If no date is mentioned, leave it blank
52
+ - Handle mixed content types appropriately"""
53
+
54
+ async def process_input(
55
+ self,
56
+ text_input: Optional[str] = None,
57
+ files: Optional[List] = None
58
+ ) -> Dict:
59
+ """
60
+ Process chatbot input and return structured verification request
61
+ """
62
+ try:
63
+ print(f"🔍 DEBUG: InputProcessor.process_input called")
64
+ print(f"🔍 DEBUG: text_input = {text_input}")
65
+ print(f"🔍 DEBUG: files = {files}")
66
+ print(f"🔍 DEBUG: files type = {type(files)}")
67
+
68
+ # Prepare input for LLM analysis
69
+ print(f"🔍 DEBUG: Preparing input text for LLM analysis")
70
+ input_text = self._prepare_input_text(text_input, files)
71
+ print(f"🔍 DEBUG: Prepared input_text = {input_text}")
72
+
73
+ # Get LLM analysis
74
+ print(f"🔍 DEBUG: Calling LLM analysis")
75
+ llm_response = await self._analyze_with_llm(input_text)
76
+ print(f"🔍 DEBUG: LLM response = {llm_response}")
77
+
78
+ # Parse and validate LLM response
79
+ print(f"🔍 DEBUG: Parsing LLM response")
80
+ parsed_response = self._parse_llm_response(llm_response)
81
+ print(f"🔍 DEBUG: Parsed response = {parsed_response}")
82
+
83
+ # Post-process and enhance the response
84
+ print(f"🔍 DEBUG: Post-processing response")
85
+ final_response = await self._post_process_response(parsed_response, files)
86
+
87
+ # PATCH: If verification_type is 'video' but all files have audio extensions, reassign to 'audio'
88
+ audio_exts = ['.mp3', '.wav', '.ogg', '.flac', '.m4a']
89
+ content_files = final_response.get('content', {}).get('files', [])
90
+ if (
91
+ final_response.get('verification_type') == 'video' and
92
+ content_files and
93
+ all(any(f.lower().endswith(e) for e in audio_exts) for f in content_files)
94
+ ):
95
+ print(f"🔍 PATCH: Rewriting 'verification_type' from 'video' to 'audio' (all files are audio)")
96
+ final_response['verification_type'] = 'audio'
97
+ print(f"🔍 DEBUG: Final response = {final_response}")
98
+ return final_response
99
+
100
+ except Exception as e:
101
+ print(f"❌ DEBUG: Exception in InputProcessor.process_input: {e}")
102
+ print(f"❌ DEBUG: Exception type: {type(e).__name__}")
103
+ import traceback
104
+ print(f"❌ DEBUG: Traceback: {traceback.format_exc()}")
105
+ return {
106
+ "error": f"Failed to process input: {str(e)}",
107
+ "verification_type": "unknown",
108
+ "content": {"files": [], "urls": [], "descriptions": []},
109
+ "claim_context": "Unknown context",
110
+ "claim_date": "Unknown date",
111
+ }
112
+
113
+ def _prepare_input_text(self, text_input: Optional[str], files: Optional[List]) -> str:
114
+ """Prepare input text for LLM analysis"""
115
+ print(f"🔍 DEBUG: _prepare_input_text called with text_input={text_input}, files={files}")
116
+ input_parts = []
117
+
118
+ if text_input:
119
+ input_parts.append(f"Text input: {text_input}")
120
+ print(f"🔍 DEBUG: Added text input: {text_input}")
121
+
122
+ if files:
123
+ file_info = []
124
+ for i, file in enumerate(files):
125
+ file_info.append(f"File {i+1}: {file.filename} ({file.content_type})")
126
+ print(f"🔍 DEBUG: Added file {i+1}: {file.filename} ({file.content_type})")
127
+ input_parts.append(f"Files provided: {'; '.join(file_info)}")
128
+
129
+ if not input_parts:
130
+ input_parts.append("No text or files provided")
131
+ print(f"🔍 DEBUG: No input parts, using default message")
132
+
133
+ result = "\n".join(input_parts)
134
+ print(f"🔍 DEBUG: Final prepared input text: {result}")
135
+ return result
136
+
137
+ async def _analyze_with_llm(self, input_text: str) -> str:
138
+ """Use Gemini to analyze the input"""
139
+ try:
140
+ print(f"🔍 DEBUG: _analyze_with_llm called with input_text: {input_text}")
141
+ prompt = f"{self.system_prompt}\n\nUser input: {input_text}"
142
+ print(f"🔍 DEBUG: Generated prompt: {prompt}")
143
+ response = self.model.generate_content(prompt)
144
+ print(f"🔍 DEBUG: LLM response text: {response.text}")
145
+ return response.text
146
+ except Exception as e:
147
+ print(f"❌ DEBUG: LLM analysis failed: {e}")
148
+ print(f"🔍 DEBUG: Falling back to rule-based parsing")
149
+ # Fallback to rule-based parsing if LLM fails
150
+ return self._fallback_parsing(input_text)
151
+
152
+ def _fallback_parsing(self, input_text: str) -> str:
153
+ """Fallback parsing when LLM is unavailable"""
154
+ print(f"🔍 DEBUG: _fallback_parsing called with input_text: {input_text}")
155
+
156
+ # Extract URLs using regex
157
+ url_pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
158
+ urls = re.findall(url_pattern, input_text)
159
+ print(f"🔍 DEBUG: Extracted URLs: {urls}")
160
+
161
+ # Simple content type detection
162
+ verification_type = "text" # default for text-only queries
163
+
164
+ # Check for video platform URLs first
165
+ video_platforms = [
166
+ 'instagram.com/reels/', 'instagram.com/p/', 'instagram.com/tv/',
167
+ 'youtube.com/watch', 'youtu.be/', 'youtube.com/shorts/',
168
+ 'tiktok.com/', 'vm.tiktok.com/',
169
+ 'twitter.com/', 'x.com/', 't.co/',
170
+ 'facebook.com/', 'fb.watch/',
171
+ 'vimeo.com/', 'twitch.tv/', 'dailymotion.com/',
172
+ 'imgur.com/', 'soundcloud.com/', 'mixcloud.com/',
173
+ 'lbry.tv/', 'odysee.com/', 't.me/'
174
+ ]
175
+
176
+ # Check for image platform URLs
177
+ image_platforms = [
178
+ 'instagram.com/p/', 'imgur.com/', 'flickr.com/',
179
+ 'pinterest.com/', 'unsplash.com/', 'pexels.com/'
180
+ ]
181
+
182
+ # Check for direct file extensions
183
+ if any(ext in input_text.lower() for ext in ['.mp4', '.avi', '.mov', '.mkv', '.webm', 'video']):
184
+ verification_type = "video"
185
+ elif any(ext in input_text.lower() for ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp', 'image', 'photo', 'picture']):
186
+ verification_type = "image"
187
+ elif any(ext in input_text.lower() for ext in ['.mp3', '.wav', '.ogg', '.flac', '.m4a', 'audio']):
188
+ verification_type = "audio"
189
+ # Check for video platform URLs
190
+ elif any(platform in input_text.lower() for platform in video_platforms):
191
+ verification_type = "video"
192
+ # Check for image platform URLs
193
+ elif any(platform in input_text.lower() for platform in image_platforms):
194
+ verification_type = "image"
195
+
196
+ print(f"🔍 DEBUG: Detected verification_type: {verification_type}")
197
+
198
+ # Extract date patterns
199
+ date_pattern = r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{4}[/-]\d{1,2}[/-]\d{1,2}'
200
+ dates = re.findall(date_pattern, input_text)
201
+ claim_date = dates[0] if dates else "Unknown date"
202
+ print(f"🔍 DEBUG: Extracted dates: {dates}, using: {claim_date}")
203
+
204
+ # Clean up the input text for better processing
205
+ clean_text = input_text.replace("Text input: ", "").strip()
206
+
207
+ result = {
208
+ "verification_type": verification_type,
209
+ "content": {
210
+ "files": [],
211
+ "urls": urls,
212
+ "descriptions": [clean_text],
213
+ "text": clean_text if verification_type == "text" else None
214
+ },
215
+ "claim_context": clean_text,
216
+ "claim_date": claim_date,
217
+ }
218
+ print(f"🔍 DEBUG: Fallback parsing result: {result}")
219
+ return json.dumps(result)
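A quick illustration of the same fallback regexes in isolation (the sample sentence is made up):

import re

text = "This clip https://youtu.be/abc123 was supposedly filmed on 12/07/2024"
urls = re.findall(r'https?://[^\s<>"]+|www\.[^\s<>"]+', text)
dates = re.findall(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{4}[/-]\d{1,2}[/-]\d{1,2}', text)
print(urls)   # ['https://youtu.be/abc123']
print(dates)  # ['12/07/2024']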
220
+
221
+ def _parse_llm_response(self, llm_response: str) -> Dict:
222
+ """Parse and validate LLM response"""
223
+ try:
224
+ print(f"🔍 DEBUG: _parse_llm_response called with llm_response: {llm_response}")
225
+ # Extract JSON from response
226
+ json_match = re.search(r'\{.*\}', llm_response, re.DOTALL)
227
+ if json_match:
228
+ print(f"🔍 DEBUG: Found JSON match: {json_match.group()}")
229
+ parsed = json.loads(json_match.group())
230
+ print(f"🔍 DEBUG: Parsed JSON: {parsed}")
231
+ else:
232
+ print(f"❌ DEBUG: No JSON found in response")
233
+ raise ValueError("No JSON found in response")
234
+
235
+ # Validate required fields
236
+ required_fields = ["verification_type", "content", "claim_context", "claim_date"]
237
+ for field in required_fields:
238
+ if field not in parsed:
239
+ print(f"❌ DEBUG: Missing required field: {field}")
240
+ raise ValueError(f"Missing required field: {field}")
241
+
242
+ print(f"🔍 DEBUG: Successfully parsed and validated response")
243
+ return parsed
244
+
245
+ except Exception as e:
246
+ print(f"❌ DEBUG: Failed to parse LLM response: {e}")
247
+ print(f"🔍 DEBUG: Returning safe defaults")
248
+ # Return safe defaults if parsing fails
249
+ return {
250
+ "verification_type": "image",
251
+ "content": {"files": [], "urls": [], "descriptions": []},
252
+ "claim_context": "Unknown context",
253
+ "claim_date": "Unknown date",
254
+ }
255
+
256
+ async def _post_process_response(self, parsed_response: Dict, files: Optional[List]) -> Dict:
257
+ """Post-process the parsed response and add file information"""
258
+ print(f"🔍 DEBUG: _post_process_response called with parsed_response: {parsed_response}, files: {files}")
259
+
260
+ # Add actual file information if files were provided
261
+ if files:
262
+ print(f"🔍 DEBUG: Processing {len(files)} files")
263
+ file_paths = []
264
+ for i, file in enumerate(files):
265
+ print(f"🔍 DEBUG: Saving file {i}: {file.filename}")
266
+ # Save file temporarily and get path
267
+ temp_path = await self._save_temp_file(file)
268
+ if temp_path:
269
+ file_paths.append(temp_path)
270
+ print(f"🔍 DEBUG: Saved file {i} to: {temp_path}")
271
+ else:
272
+ print(f"❌ DEBUG: Failed to save file {i}")
273
+
274
+ parsed_response["content"]["files"] = file_paths
275
+ print(f"🔍 DEBUG: Updated files list: {file_paths}")
276
+ else:
277
+ print(f"🔍 DEBUG: No files to process")
278
+
279
+ print(f"🔍 DEBUG: Final post-processed response: {parsed_response}")
280
+ return parsed_response
281
+
282
+ async def _save_temp_file(self, file) -> Optional[str]:
283
+ """Save uploaded file temporarily and return path"""
284
+ try:
285
+ print(f"🔍 DEBUG: _save_temp_file called for file: {file.filename}")
286
+ # Create temp file
287
+ import os
288
+ suffix = os.path.splitext(file.filename)[1] if file.filename else ""
289
+ print(f"🔍 DEBUG: Using suffix: {suffix}")
290
+ with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
291
+ content = await file.read()
292
+ print(f"🔍 DEBUG: Read {len(content)} bytes from file")
293
+ temp_file.write(content)
294
+ temp_path = temp_file.name
295
+ print(f"🔍 DEBUG: Saved temp file to: {temp_path}")
296
+ return temp_path
297
+ except Exception as e:
298
+ print(f"❌ DEBUG: Failed to save temp file: {e}")
299
+ return None
300
+
301
+ def cleanup_temp_files(self, file_paths: List[str]):
302
+ """Clean up temporary files"""
303
+ for path in file_paths:
304
+ try:
305
+ if os.path.exists(path):
306
+ os.unlink(path)
307
+ except Exception as e:
308
+ print(f"Failed to cleanup temp file {path}: {e}")
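A hedged usage sketch for the class above, assuming GEMINI_API_KEY and the other config values are set; the query string is illustrative only:

import asyncio
from services.input_processor import InputProcessor

async def demo():
    processor = InputProcessor()
    result = await processor.process_input(
        text_input="Is this photo really from 12/07/2024? https://example.com/flood.jpg"
    )
    print(result["verification_type"], result["content"]["urls"], result["claim_date"])

asyncio.run(demo())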
services/mongodb_service.py ADDED
@@ -0,0 +1,684 @@
1
+ """
2
+ MongoDB Service for Backend
3
+ Handles MongoDB operations for debunk posts
4
+ """
5
+
6
+ import os
7
+ import logging
8
+ from typing import List, Dict, Any, Optional
9
+ from pymongo import MongoClient
10
+ from pymongo.errors import ConnectionFailure
11
+ from dotenv import load_dotenv
12
+
13
+ load_dotenv()
14
+
15
+ # Setup logging
16
+ logger = logging.getLogger(__name__)
17
+
18
+ class MongoDBService:
19
+ """MongoDB service for backend operations"""
20
+
21
+ def __init__(self, connection_string: Optional[str] = None):
22
+ """Initialize MongoDB connection
23
+
24
+ Args:
25
+ connection_string: MongoDB connection string. If None, uses MONGO_CONNECTION_STRING env var
26
+ """
27
+ self.connection_string = connection_string or os.getenv('MONGO_CONNECTION_STRING')
28
+
29
+ if not self.connection_string:
30
+ raise ValueError("MongoDB connection string is required. Set MONGO_CONNECTION_STRING environment variable.")
31
+
32
+ self.client = None
33
+ self.db = None
34
+ self.collection = None
35
+ self.chat_sessions = None
36
+ self.chat_messages = None
37
+
38
+ self._connect()
39
+
40
+ def _connect(self):
41
+ """Establish MongoDB connection"""
42
+ try:
43
+ self.client = MongoClient(self.connection_string)
44
+ # Test connection
45
+ self.client.admin.command('ping')
46
+
47
+ # Use 'aegis' database
48
+ self.db = self.client["aegis"]
49
+ self.collection = self.db["debunk_posts"]
50
+
51
+ # Additional collections used by other features
52
+ self.chat_sessions = self.db["chat_sessions"]
53
+ self.chat_messages = self.db["chat_messages"]
54
+ self.subscriptions = self.db["subscriptions"]
55
+ self.users = self.db["users"]
56
+
57
+ logger.info("✅ Successfully connected to MongoDB")
58
+
59
+ except ConnectionFailure as e:
60
+ logger.error(f"❌ Failed to connect to MongoDB: {e}")
61
+ raise
62
+
63
+ def get_recent_posts(self, limit: int = 5) -> List[Dict[str, Any]]:
64
+ """Get recent debunk posts from MongoDB
65
+
66
+ Args:
67
+ limit: Maximum number of posts to return
68
+
69
+ Returns:
70
+ List of recent debunk posts
71
+ """
72
+ try:
73
+ logger.info(f"🔍 DEBUG: Starting get_recent_posts with limit={limit}")
74
+ logger.info(f"🔍 DEBUG: Collection name: {self.collection.name}")
75
+ logger.info(f"🔍 DEBUG: Database name: {self.db.name}")
76
+
77
+ # Check if collection exists and has documents
78
+ total_count = self.collection.count_documents({})
79
+ logger.info(f"🔍 DEBUG: Total documents in collection: {total_count}")
80
+
81
+ if total_count == 0:
82
+ logger.warning("⚠️ DEBUG: Collection is empty!")
83
+ return []
84
+
85
+ # Get sample document to check structure
86
+ sample_doc = self.collection.find_one()
87
+ if sample_doc:
88
+ logger.info(f"🔍 DEBUG: Sample document keys: {list(sample_doc.keys())}")
89
+ logger.info(f"🔍 DEBUG: Sample document _id: {sample_doc.get('_id')}")
90
+ logger.info(f"🔍 DEBUG: Sample document stored_at: {sample_doc.get('stored_at')}")
91
+ else:
92
+ logger.warning("⚠️ DEBUG: No sample document found!")
93
+
94
+ posts = list(self.collection
95
+ .find()
96
+ .sort("stored_at", -1)
97
+ .limit(limit))
98
+
99
+ logger.info(f"🔍 DEBUG: Raw query returned {len(posts)} posts")
100
+
101
+ # Convert ObjectId to string for JSON serialization
102
+ for i, post in enumerate(posts):
103
+ if '_id' in post:
104
+ post['_id'] = str(post['_id'])
105
+ logger.info(f"🔍 DEBUG: Post {i+1} keys: {list(post.keys())}")
106
+ logger.info(f"🔍 DEBUG: Post {i+1} stored_at: {post.get('stored_at')}")
107
+
108
+ logger.info(f"📋 Retrieved {len(posts)} recent debunk posts")
109
+ return posts
110
+
111
+ except Exception as e:
112
+ logger.error(f"❌ Failed to get recent posts: {e}")
113
+ logger.error(f"🔍 DEBUG: Exception type: {type(e).__name__}")
114
+ logger.error(f"🔍 DEBUG: Exception details: {str(e)}")
115
+ return []
116
+
117
+ def search_similar_rumours(self, query: str, similarity_threshold: float = 0.6, limit: int = 5) -> List[Dict[str, Any]]:
118
+ """Search for rumours similar to the query text using TF-IDF similarity
119
+
120
+ Args:
121
+ query: Search query text
122
+ similarity_threshold: Minimum similarity score (0.0 to 1.0)
123
+ limit: Maximum number of results to return
124
+
125
+ Returns:
126
+ List of similar rumours with similarity scores
127
+ """
128
+ try:
129
+ from sklearn.feature_extraction.text import TfidfVectorizer
130
+ from sklearn.metrics.pairwise import cosine_similarity
131
+ import re
132
+
133
+ if not query or not query.strip():
134
+ logger.warning("⚠️ Empty query provided")
135
+ return []
136
+
137
+ logger.info(f"🔍 Searching for rumours similar to: {query[:50]}...")
138
+
139
+ # Get all rumours from database
140
+ all_posts = list(self.collection.find())
141
+
142
+ if not all_posts:
143
+ logger.warning("⚠️ No rumours found in database")
144
+ return []
145
+
146
+ # Extract claim text from each post
147
+ claims = []
148
+ posts_data = []
149
+
150
+ for post in all_posts:
151
+ # Extract claim text - try multiple fields
152
+ claim_text = (
153
+ post.get('claim') or
154
+ post.get('summary') or
155
+ ""
156
+ )
157
+
158
+ # Handle nested claim structure
159
+ if isinstance(claim_text, dict):
160
+ claim_text = claim_text.get('text') or claim_text.get('claim_text') or ""
161
+
162
+ if claim_text and claim_text.strip():
163
+ claims.append(claim_text)
164
+ posts_data.append(post)
165
+
166
+ if not claims:
167
+ logger.warning("⚠️ No claims found in posts")
168
+ return []
169
+
170
+ # Preprocess query
171
+ def preprocess_text(text: str) -> str:
172
+ text = text.lower()
173
+ text = re.sub(r'[^\w\s]', ' ', text)
174
+ text = ' '.join(text.split())
175
+ return text
176
+
177
+ query_processed = preprocess_text(query)
178
+
179
+ # Calculate TF-IDF similarity
180
+ try:
181
+ vectorizer = TfidfVectorizer(
182
+ stop_words='english',
183
+ ngram_range=(1, 2),
184
+ max_features=500,
185
+ lowercase=True
186
+ )
187
+
188
+ # Combine query and claims for vectorization
189
+ all_texts = [query_processed] + [preprocess_text(c) for c in claims]
190
+ tfidf_matrix = vectorizer.fit_transform(all_texts)
191
+
192
+ # Calculate similarity between query and each claim
193
+ query_vector = tfidf_matrix[0:1]
194
+ claims_matrix = tfidf_matrix[1:]
195
+
196
+ similarities = cosine_similarity(query_vector, claims_matrix)[0]
197
+
198
+ except Exception as e:
199
+ logger.error(f"❌ TF-IDF calculation failed: {e}")
200
+ # Fallback to simple word overlap
201
+ similarities = []
202
+ query_words = set(query_processed.split())
203
+ for claim in claims:
204
+ claim_words = set(preprocess_text(claim).split())
205
+ if not query_words or not claim_words:
206
+ similarities.append(0.0)
207
+ else:
208
+ intersection = query_words.intersection(claim_words)
209
+ union = query_words.union(claim_words)
210
+ similarities.append(len(intersection) / len(union) if union else 0.0)
211
+
212
+ # Filter by threshold and sort by similarity
213
+ results = []
214
+ for i, (post, similarity) in enumerate(zip(posts_data, similarities)):
215
+ if similarity >= similarity_threshold:
216
+ # Convert ObjectId to string
217
+ if '_id' in post:
218
+ post['_id'] = str(post['_id'])
219
+
220
+ result = {
221
+ **post,
222
+ 'similarity_score': float(similarity)
223
+ }
224
+ results.append(result)
225
+
226
+ # Sort by similarity score (descending) and limit
227
+ results.sort(key=lambda x: x.get('similarity_score', 0), reverse=True)
228
+ results = results[:limit]
229
+
230
+ logger.info(f"✅ Found {len(results)} similar rumours (threshold: {similarity_threshold})")
231
+ return results
232
+
233
+ except Exception as e:
234
+ logger.error(f"❌ Failed to search similar rumours: {e}")
235
+ import traceback
236
+ logger.error(traceback.format_exc())
237
+ return []
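For reference, a stripped-down sketch of the TF-IDF/cosine-similarity ranking that search_similar_rumours performs over stored claims (the sample claims are invented):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

query = "5g towers are spreading the virus"
claims = ["5G towers are spreading COVID-19", "Drinking hot water cures COVID-19"]

vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), lowercase=True)
matrix = vectorizer.fit_transform([query] + claims)      # row 0 is the query
scores = cosine_similarity(matrix[0:1], matrix[1:])[0]   # one score per stored claim
for claim, score in sorted(zip(claims, scores), key=lambda pair: -pair[1]):
    print(f"{score:.2f}  {claim}")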
238
+
239
+ # ---------- Chat sessions & messages ----------
240
+
241
+ def get_chat_sessions(
242
+ self,
243
+ user_id: Optional[str] = None,
244
+ anonymous_id: Optional[str] = None,
245
+ limit: int = 50,
246
+ ) -> List[Dict[str, Any]]:
247
+ """Return chat sessions for a given user or anonymous visitor."""
248
+ if self.chat_sessions is None:
249
+ return []
250
+
251
+ query: Dict[str, Any] = {}
252
+ if user_id:
253
+ query["user_id"] = user_id
254
+ if anonymous_id and not user_id:
255
+ # For anonymous visitors we only look at sessions that have not yet been
256
+ # attached to a concrete user id.
257
+ query["anonymous_id"] = anonymous_id
258
+ query["user_id"] = None
259
+
260
+ cursor = (
261
+ self.chat_sessions.find(query)
262
+ .sort("updated_at", -1)
263
+ .limit(limit)
264
+ )
265
+ sessions: List[Dict[str, Any]] = []
266
+ for doc in cursor:
267
+ doc["session_id"] = str(doc.get("session_id") or doc.get("_id"))
268
+ doc["_id"] = str(doc["_id"])
269
+ sessions.append(doc)
270
+ return sessions
271
+
272
+ def migrate_anonymous_sessions(self, anonymous_id: str, user_id: str) -> int:
273
+ """Attach existing anonymous sessions to a logged-in user.
274
+
275
+ This keeps history when a visitor later signs in.
276
+ """
277
+ if self.chat_sessions is None or not anonymous_id or not user_id:
278
+ return 0
279
+
280
+ result = self.chat_sessions.update_many(
281
+ {"anonymous_id": anonymous_id, "user_id": None},
282
+ {"$set": {"user_id": user_id}},
283
+ )
284
+ return int(getattr(result, "modified_count", 0))
285
+
286
+ def upsert_chat_session(self, payload: Dict[str, Any]) -> Dict[str, Any]:
287
+ """Create or update a chat session document.
288
+
289
+ Expected keys in payload: session_id (optional), user_id, anonymous_id,
290
+ title, last_verdict, last_summary.
291
+ """
292
+ if self.chat_sessions is None:
293
+ raise RuntimeError("chat_sessions collection not initialised")
294
+
295
+ from datetime import datetime
296
+
297
+ session_id = payload.get("session_id")
298
+ now = datetime.utcnow()
299
+
300
+ base_updates: Dict[str, Any] = {
301
+ "title": payload.get("title") or "New Chat",
302
+ "user_id": payload.get("user_id"),
303
+ "anonymous_id": payload.get("anonymous_id"),
304
+ "last_verdict": payload.get("last_verdict"),
305
+ "last_summary": payload.get("last_summary"),
306
+ "updated_at": now,
307
+ }
308
+
309
+ if session_id:
310
+ doc = self.chat_sessions.find_one_and_update(
311
+ {"session_id": session_id},
312
+ {"$set": base_updates},
313
+ upsert=True,
314
+ return_document=True,
315
+ )
316
+ else:
317
+ doc_to_insert = {
318
+ **base_updates,
319
+ "session_id": payload.get("session_id") or os.urandom(12).hex(),
320
+ "created_at": now,
321
+ }
322
+ inserted = self.chat_sessions.insert_one(doc_to_insert)
323
+ doc = self.chat_sessions.find_one({"_id": inserted.inserted_id})
324
+
325
+ doc["_id"] = str(doc["_id"])
326
+ doc["session_id"] = str(doc.get("session_id"))
327
+ return doc
328
+
329
+ def append_chat_messages(
330
+ self,
331
+ session_id: str,
332
+ messages: List[Dict[str, Any]],
333
+ user_id: Optional[str] = None,
334
+ anonymous_id: Optional[str] = None,
335
+ ) -> int:
336
+ """Append one or more messages to a given session."""
337
+ if self.chat_messages is None:
338
+ raise RuntimeError("chat_messages collection not initialised")
339
+
340
+ from datetime import datetime
341
+
342
+ docs = []
343
+ for msg in messages:
344
+ docs.append(
345
+ {
346
+ "session_id": session_id,
347
+ "user_id": user_id,
348
+ "anonymous_id": anonymous_id,
349
+ "role": msg.get("role"),
350
+ "content": msg.get("content"),
351
+ "attachments": msg.get("attachments") or [],
352
+ "verdict": msg.get("verdict"),
353
+ "confidence": msg.get("confidence"),
354
+ "sources": msg.get("sources"),
355
+ "created_at": msg.get("created_at") or datetime.utcnow(),
356
+ "metadata": msg.get("metadata") or {},
357
+ }
358
+ )
359
+
360
+ if not docs:
361
+ return 0
362
+
363
+ result = self.chat_messages.insert_many(docs)
364
+ return len(getattr(result, "inserted_ids", []))
365
+
366
+ def get_chat_messages(
367
+ self, session_id: str, limit: int = 100
368
+ ) -> List[Dict[str, Any]]:
369
+ """Return messages for a particular session ordered by time."""
370
+ if self.chat_messages is None:
371
+ return []
372
+
373
+ cursor = (
374
+ self.chat_messages.find({"session_id": session_id})
375
+ .sort("created_at", 1)
376
+ .limit(limit)
377
+ )
378
+ docs: List[Dict[str, Any]] = []
379
+ for doc in cursor:
380
+ doc["_id"] = str(doc["_id"])
381
+ docs.append(doc)
382
+ return docs
383
+
384
+ # ---------- Subscription management ----------
385
+
386
+ def upsert_subscription(self, subscription_data: Dict[str, Any]) -> Dict[str, Any]:
387
+ """
388
+ Create or update a subscription document
389
+
390
+ Expected keys in subscription_data:
391
+ - user_id: User ID
392
+ - razorpay_subscription_id: Razorpay subscription ID
393
+ - razorpay_plan_id: Razorpay plan ID
394
+ - plan_name: Plan name (e.g., "Pro")
395
+ - status: Subscription status (e.g., "active", "cancelled", "expired")
396
+ - amount: Subscription amount
397
+ - currency: Currency code
398
+ - current_start: Current billing cycle start
399
+ - current_end: Current billing cycle end
400
+ - next_billing_at: Next billing date
401
+ - created_at: Subscription creation date
402
+ - updated_at: Last update date
403
+ """
404
+ if self.subscriptions is None:
405
+ raise RuntimeError("subscriptions collection not initialised")
406
+
407
+ from datetime import datetime
408
+
409
+ razorpay_subscription_id = subscription_data.get("razorpay_subscription_id")
410
+ if not razorpay_subscription_id:
411
+ raise ValueError("razorpay_subscription_id is required")
412
+
413
+ now = datetime.utcnow()
414
+
415
+ # Prepare update data
416
+ update_data = {
417
+ **subscription_data,
418
+ "updated_at": now,
419
+ }
420
+
421
+ # Set created_at only if creating new subscription
422
+ existing = self.subscriptions.find_one(
423
+ {"razorpay_subscription_id": razorpay_subscription_id}
424
+ )
425
+
426
+ if not existing:
427
+ update_data["created_at"] = subscription_data.get("created_at") or now
428
+
429
+ # Upsert subscription
430
+ result = self.subscriptions.find_one_and_update(
431
+ {"razorpay_subscription_id": razorpay_subscription_id},
432
+ {"$set": update_data},
433
+ upsert=True,
434
+ return_document=True
435
+ )
436
+
437
+ if result:
438
+ result["_id"] = str(result["_id"])
439
+ logger.info(f"✅ Upserted subscription: {razorpay_subscription_id}")
440
+
441
+ # Update user's subscription tier if user_id is present
442
+ user_id = subscription_data.get("user_id")
443
+ status = subscription_data.get("status")
444
+ plan_name = subscription_data.get("plan_name", "Free")
445
+
446
+ if user_id:
447
+ if status == "active":
448
+ success = self.update_user_subscription_tier(user_id, plan_name)
449
+ if success:
450
+ logger.info(f"✅ Updated user {user_id} subscription tier to {plan_name} via upsert_subscription")
451
+ elif status in ["cancelled", "expired", "paused", "ended"]:
452
+ success = self.update_user_subscription_tier(user_id, "Free")
453
+ if success:
454
+ logger.info(f"✅ Updated user {user_id} subscription tier to Free (status: {status})")
455
+
456
+ return result
457
+
458
+ def get_user_subscription(
459
+ self,
460
+ user_id: str,
461
+ status: Optional[str] = None
462
+ ) -> Optional[Dict[str, Any]]:
463
+ """
464
+ Get user's active subscription
465
+
466
+ Args:
467
+ user_id: User ID
468
+ status: Filter by status (e.g., "active"). If None, returns most recent
469
+
470
+ Returns:
471
+ Subscription document or None
472
+ """
473
+ if self.subscriptions is None:
474
+ return None
475
+
476
+ query = {"user_id": user_id}
477
+ if status:
478
+ query["status"] = status
479
+
480
+ subscription = self.subscriptions.find_one(
481
+ query,
482
+ sort=[("created_at", -1)]
483
+ )
484
+
485
+ if subscription:
486
+ subscription["_id"] = str(subscription["_id"])
487
+
488
+ return subscription
489
+
490
+ def update_subscription_status(
491
+ self,
492
+ razorpay_subscription_id: str,
493
+ status: str,
494
+ additional_data: Optional[Dict[str, Any]] = None
495
+ ) -> Optional[Dict[str, Any]]:
496
+ """
497
+ Update subscription status from webhook events
498
+
499
+ Args:
500
+ razorpay_subscription_id: Razorpay subscription ID
501
+ status: New status
502
+ additional_data: Additional fields to update
503
+
504
+ Returns:
505
+ Updated subscription document or None
506
+ """
507
+ if self.subscriptions is None:
508
+ return None
509
+
510
+ from datetime import datetime
511
+
512
+ update_data = {
513
+ "status": status,
514
+ "updated_at": datetime.utcnow()
515
+ }
516
+
517
+ if additional_data:
518
+ update_data.update(additional_data)
519
+
520
+ result = self.subscriptions.find_one_and_update(
521
+ {"razorpay_subscription_id": razorpay_subscription_id},
522
+ {"$set": update_data},
523
+ return_document=True
524
+ )
525
+
526
+ if result:
527
+ result["_id"] = str(result["_id"])
528
+ logger.info(f"✅ Updated subscription status: {razorpay_subscription_id} -> {status}")
529
+
530
+ # Update user's subscription tier
531
+ user_id = result.get("user_id")
532
+ if user_id:
533
+ plan_name = result.get("plan_name", "Free")
534
+ if status == "active":
535
+ self.update_user_subscription_tier(user_id, plan_name)
536
+ elif status in ["cancelled", "expired", "paused"]:
537
+ self.update_user_subscription_tier(user_id, "Free")
538
+
539
+ return result
540
+
541
+ def get_subscription_by_razorpay_id(
542
+ self,
543
+ razorpay_subscription_id: str
544
+ ) -> Optional[Dict[str, Any]]:
545
+ """
546
+ Get subscription by Razorpay subscription ID
547
+
548
+ Args:
549
+ razorpay_subscription_id: Razorpay subscription ID
550
+
551
+ Returns:
552
+ Subscription document or None
553
+ """
554
+ if self.subscriptions is None:
555
+ return None
556
+
557
+ subscription = self.subscriptions.find_one(
558
+ {"razorpay_subscription_id": razorpay_subscription_id}
559
+ )
560
+
561
+ if subscription:
562
+ subscription["_id"] = str(subscription["_id"])
563
+
564
+ return subscription
565
+
566
+ def create_user(self, user_data: Dict[str, Any]) -> Dict[str, Any]:
567
+ """
568
+ Create a new user in MongoDB
569
+
570
+ Args:
571
+ user_data: User data including email, password (hashed), domain_preferences, etc.
572
+
573
+ Returns:
574
+ Created user document
575
+ """
576
+ if self.users is None:
577
+ raise RuntimeError("users collection not initialised")
578
+
579
+ from datetime import datetime
580
+ from bson import ObjectId
581
+
582
+ # Check if user already exists
583
+ existing = self.users.find_one({"email": user_data["email"]})
584
+ if existing:
585
+ raise ValueError("Email already registered")
586
+
587
+ user_doc = {
588
+ **user_data,
589
+ "created_at": datetime.utcnow(),
590
+ "updated_at": datetime.utcnow(),
591
+ }
592
+
593
+ result = self.users.insert_one(user_doc)
594
+ user_doc["_id"] = str(result.inserted_id)
595
+ user_doc["id"] = str(result.inserted_id)
596
+
597
+ logger.info(f"✅ Created user: {user_data['email']}")
598
+ return user_doc
599
+
600
+ def get_user_by_email(self, email: str) -> Optional[Dict[str, Any]]:
601
+ """
602
+ Get user by email
603
+
604
+ Args:
605
+ email: User email
606
+
607
+ Returns:
608
+ User document or None
609
+ """
610
+ if self.users is None:
611
+ return None
612
+
613
+ user = self.users.find_one({"email": email})
614
+ if user:
615
+ user["_id"] = str(user["_id"])
616
+ user["id"] = str(user["_id"])
617
+
618
+ return user
619
+
620
+ def get_user_by_id(self, user_id: str) -> Optional[Dict[str, Any]]:
621
+ """
622
+ Get user by ID
623
+
624
+ Args:
625
+ user_id: User ID
626
+
627
+ Returns:
628
+ User document or None
629
+ """
630
+ if self.users is None:
631
+ return None
632
+
633
+ from bson import ObjectId
634
+
635
+ try:
636
+ user = self.users.find_one({"_id": ObjectId(user_id)})
637
+ if user:
638
+ user["_id"] = str(user["_id"])
639
+ user["id"] = str(user["_id"])
640
+ return user
641
+ except Exception as e:
642
+ logger.error(f"Error getting user by ID: {e}")
643
+ return None
644
+
645
+ def update_user_subscription_tier(self, user_id: str, subscription_tier: str) -> bool:
646
+ """
647
+ Update user's subscription tier in user collection
648
+
649
+ Args:
650
+ user_id: User ID
651
+ subscription_tier: Subscription tier (Free, Pro, Enterprise)
652
+
653
+ Returns:
654
+ True if updated successfully, False otherwise
655
+ """
656
+ if self.users is None:
657
+ return False
658
+
659
+ from datetime import datetime
660
+ from bson import ObjectId
661
+
662
+ try:
663
+ result = self.users.update_one(
664
+ {"_id": ObjectId(user_id)},
665
+ {
666
+ "$set": {
667
+ "subscription_tier": subscription_tier,
668
+ "updated_at": datetime.utcnow()
669
+ }
670
+ }
671
+ )
672
+ if result.modified_count > 0:
673
+ logger.info(f"✅ Updated user {user_id} subscription tier to {subscription_tier}")
674
+ return True
675
+ return False
676
+ except Exception as e:
677
+ logger.error(f"Error updating user subscription tier: {e}")
678
+ return False
679
+
680
+ def close(self):
681
+ """Close MongoDB connection"""
682
+ if self.client:
683
+ self.client.close()
684
+ logger.info("🔌 MongoDB connection closed")
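A minimal usage sketch for the service above, assuming MONGO_CONNECTION_STRING is set in the environment:

from services.mongodb_service import MongoDBService

db = MongoDBService()                # connects to the 'aegis' database and pings it
for post in db.get_recent_posts(limit=3):
    print(post.get("stored_at"), post.get("claim"))
matches = db.search_similar_rumours("5G towers spread COVID", similarity_threshold=0.4)
print(f"{len(matches)} similar rumours found")
db.close()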
services/razorpay_service.py ADDED
@@ -0,0 +1,322 @@
1
+ """
2
+ Razorpay Service for Subscription Management
3
+ Handles Razorpay API interactions for subscription payments
4
+ """
5
+
6
+ import logging
7
+ import hmac
8
+ import hashlib
9
+ from typing import Dict, Any, Optional
10
+ import razorpay
11
+ from config import config
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class RazorpayService:
17
+ """Service for handling Razorpay subscription operations"""
18
+
19
+ def __init__(self):
20
+ """Initialize Razorpay client"""
21
+ if not config.RAZORPAY_ID or not config.RAZORPAY_KEY:
22
+ logger.warning("⚠️ Razorpay credentials not configured. Subscription features will not work.")
23
+ self.client = None
24
+ else:
25
+ try:
26
+ # Initialize Razorpay client with explicit base URL
27
+ # Test mode uses different base URL, but SDK handles this automatically
28
+ self.client = razorpay.Client(auth=(config.RAZORPAY_ID, config.RAZORPAY_KEY))
29
+ logger.info(f"✅ Razorpay client initialized with Key ID: {config.RAZORPAY_ID[:8]}...")
30
+ except Exception as e:
31
+ logger.error(f"❌ Failed to initialize Razorpay client: {e}")
32
+ self.client = None
33
+
34
+ def create_plan(
35
+ self,
36
+ name: str,
37
+ amount: int,
38
+ currency: str = "INR",
39
+ interval: int = 1,
40
+ period: str = "monthly",
41
+ description: Optional[str] = None
42
+ ) -> Dict[str, Any]:
43
+ """
44
+ Create a subscription plan in Razorpay
45
+
46
+ Args:
47
+ name: Plan name
48
+ amount: Amount in smallest currency unit (paise for INR)
49
+ currency: Currency code (default: INR)
50
+ interval: Billing interval (default: 1)
51
+ period: Billing period - 'daily', 'weekly', 'monthly', 'yearly' (default: monthly)
52
+ description: Plan description
53
+
54
+ Returns:
55
+ Dict containing plan details from Razorpay
56
+ """
57
+ if not self.client:
58
+ raise ValueError("Razorpay client not initialized. Check RAZORPAY_ID and RAZORPAY_KEY.")
59
+
60
+ try:
61
+ plan_data = {
62
+ "period": period,
63
+ "interval": interval,
64
+ "item": {
65
+ "name": name,
66
+ "amount": amount,
67
+ "currency": currency,
68
+ "description": description or f"{name} subscription plan"
69
+ }
70
+ }
71
+
72
+ logger.debug(f"Creating plan with data: {plan_data}")
73
+ # Try creating plan - note: some accounts may need subscriptions enabled first
74
+ plan = self.client.plan.create(plan_data)
75
+ logger.info(f"✅ Created Razorpay plan: {plan.get('id')}")
76
+ return plan
77
+ except razorpay.errors.BadRequestError as e:
78
+ error_msg = str(e)
79
+ logger.error(f"❌ BadRequestError creating plan '{name}': {error_msg}")
80
+ # Check if it's a "URL not found" error which indicates subscriptions might not be enabled
81
+ if "not found" in error_msg.lower() or "url" in error_msg.lower():
82
+ logger.error(f" This error typically means:")
83
+ logger.error(f" 1. Subscriptions feature is NOT enabled on your Razorpay account")
84
+ logger.error(f" 2. You need to enable subscriptions in Razorpay Dashboard")
85
+ logger.error(f" 3. Go to: Razorpay Dashboard > Settings > Subscriptions")
86
+ logger.error(f" 4. Or contact Razorpay support to enable subscriptions")
87
+ # Check if plan already exists
88
+ elif "already exists" in error_msg.lower() or "duplicate" in error_msg.lower():
89
+ logger.warning(f"⚠️ Plan '{name}' may already exist")
90
+ raise
91
+ except razorpay.errors.ServerError as e:
92
+ logger.error(f"❌ ServerError creating plan '{name}': {e}")
93
+ raise
94
+ except Exception as e:
95
+ error_type = type(e).__name__
96
+ error_msg = str(e)
97
+ logger.error(f"❌ Failed to create Razorpay plan '{name}' ({error_type}): {error_msg}")
98
+ # Log more details if available
99
+ if hasattr(e, 'status_code'):
100
+ logger.error(f" Status code: {e.status_code}")
101
+ if hasattr(e, 'error'):
102
+ logger.error(f" Error details: {e.error}")
103
+ raise
104
+
105
+ def create_subscription(
106
+ self,
107
+ plan_id: str,
108
+ customer_notify: int = 1,
109
+ total_count: Optional[int] = None,
110
+ start_at: Optional[int] = None,
111
+ end_at: Optional[int] = None,
112
+ notes: Optional[Dict[str, str]] = None
113
+ ) -> Dict[str, Any]:
114
+ """
115
+ Create a subscription for a user
116
+
117
+ Args:
118
+ plan_id: Razorpay plan ID
119
+ customer_notify: Whether to notify customer (1 or 0)
120
+ total_count: Total number of billing cycles (None for infinite - will use end_at instead)
121
+ start_at: Unix timestamp for subscription start (None for immediate)
122
+ end_at: Unix timestamp for subscription end (used if total_count is None for infinite subscriptions)
123
+ notes: Additional notes/metadata
124
+
125
+ Returns:
126
+ Dict containing subscription details from Razorpay
127
+ """
128
+ if not self.client:
129
+ raise ValueError("Razorpay client not initialized. Check RAZORPAY_ID and RAZORPAY_KEY.")
130
+
131
+ try:
132
+ subscription_data = {
133
+ "plan_id": plan_id,
134
+ "customer_notify": customer_notify,
135
+ }
136
+
137
+ # Razorpay requires either total_count or end_at
138
+ # If end_at is provided, start_at is also required
139
+ # start_at must be in the future (add 60 seconds buffer to account for clock differences)
140
+ import time
141
+ current_time = int(time.time())
142
+ # Add 60 seconds buffer to ensure start_at is always in the future
143
+ future_start_time = current_time + 60
144
+
145
+ if total_count is not None:
146
+ subscription_data["total_count"] = total_count
147
+ elif end_at is not None:
148
+ subscription_data["end_at"] = end_at
149
+ # If end_at is set but start_at is not, set start_at to 60 seconds in the future
150
+ if start_at is None:
151
+ subscription_data["start_at"] = future_start_time
152
+ else:
153
+ # Set both start_at and end_at for infinite subscriptions
154
+ subscription_data["start_at"] = future_start_time
155
+ subscription_data["end_at"] = future_start_time + (10 * 365 * 24 * 60 * 60) # 10 years
156
+ logger.info("ℹ️ No total_count or end_at provided, setting start_at to 60 seconds in future and end_at to 10 years from start (infinite subscription)")
157
+
158
+ # Override start_at if explicitly provided (but ensure it's in the future)
159
+ if start_at is not None:
160
+ if start_at <= current_time:
161
+ # If provided start_at is in the past, add 60 seconds buffer
162
+ subscription_data["start_at"] = current_time + 60
163
+ logger.warning(f"⚠️ Provided start_at was in the past, adjusted to {subscription_data['start_at']}")
164
+ else:
165
+ subscription_data["start_at"] = start_at
166
+
167
+ if notes:
168
+ subscription_data["notes"] = notes
169
+
170
+ subscription = self.client.subscription.create(subscription_data)
171
+ logger.info(f"✅ Created Razorpay subscription: {subscription.get('id')}")
172
+ return subscription
173
+ except Exception as e:
174
+ logger.error(f"❌ Failed to create Razorpay subscription: {e}")
175
+ raise
176
+
177
+ def get_subscription(self, subscription_id: str) -> Dict[str, Any]:
178
+ """
179
+ Get subscription details from Razorpay
180
+
181
+ Args:
182
+ subscription_id: Razorpay subscription ID
183
+
184
+ Returns:
185
+ Dict containing subscription details
186
+ """
187
+ if not self.client:
188
+ raise ValueError("Razorpay client not initialized. Check RAZORPAY_ID and RAZORPAY_KEY.")
189
+
190
+ try:
191
+ subscription = self.client.subscription.fetch(subscription_id)
192
+ return subscription
193
+ except Exception as e:
194
+ logger.error(f"❌ Failed to fetch subscription {subscription_id}: {e}")
195
+ raise
196
+
197
+ def cancel_subscription(
198
+ self,
199
+ subscription_id: str,
200
+ cancel_at_cycle_end: bool = False
201
+ ) -> Dict[str, Any]:
202
+ """
203
+ Cancel a subscription
204
+
205
+ Args:
206
+ subscription_id: Razorpay subscription ID
207
+ cancel_at_cycle_end: If True, cancel at end of current cycle
208
+
209
+ Returns:
210
+ Dict containing updated subscription details
211
+ """
212
+ if not self.client:
213
+ raise ValueError("Razorpay client not initialized. Check RAZORPAY_ID and RAZORPAY_KEY.")
214
+
215
+ try:
216
+ if cancel_at_cycle_end:
217
+ subscription = self.client.subscription.cancel(
218
+ subscription_id,
219
+ {"cancel_at_cycle_end": 1}
220
+ )
221
+ else:
222
+ subscription = self.client.subscription.cancel(subscription_id)
223
+
224
+ logger.info(f"✅ Cancelled subscription: {subscription_id}")
225
+ return subscription
226
+ except Exception as e:
227
+ logger.error(f"❌ Failed to cancel subscription {subscription_id}: {e}")
228
+ raise
229
+
230
+ def verify_webhook_signature(
231
+ self,
232
+ payload: str,
233
+ signature: str
234
+ ) -> bool:
235
+ """
236
+ Verify Razorpay webhook signature
237
+
238
+ Args:
239
+ payload: Raw webhook payload (string)
240
+ signature: Webhook signature from X-Razorpay-Signature header
241
+
242
+ Returns:
243
+ True if signature is valid, False otherwise
244
+ """
245
+ if not config.RAZORPAY_WEBHOOK_SECRET:
246
+ logger.warning("⚠️ RAZORPAY_WEBHOOK_SECRET not set. Webhook verification skipped.")
247
+ return True # Allow if secret not configured (for development)
248
+
249
+ try:
250
+ expected_signature = hmac.new(
251
+ config.RAZORPAY_WEBHOOK_SECRET.encode('utf-8'),
252
+ payload.encode('utf-8'),
253
+ hashlib.sha256
254
+ ).hexdigest()
255
+
256
+ return hmac.compare_digest(expected_signature, signature)
257
+ except Exception as e:
258
+ logger.error(f"❌ Webhook signature verification failed: {e}")
259
+ return False
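As a sanity check, a matching test signature for verify_webhook_signature can be produced the same way; the secret and payload below are placeholders:

import hashlib
import hmac

secret = "whsec_testonly"                               # would come from RAZORPAY_WEBHOOK_SECRET
payload = '{"event": "subscription.activated"}'         # raw request body as a string
signature = hmac.new(secret.encode('utf-8'), payload.encode('utf-8'), hashlib.sha256).hexdigest()
# service.verify_webhook_signature(payload, signature) returns True when the
# configured webhook secret equals the same value.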
260
+
261
+ def get_plan(self, plan_id: str) -> Dict[str, Any]:
262
+ """
263
+ Get plan details from Razorpay
264
+
265
+ Args:
266
+ plan_id: Razorpay plan ID
267
+
268
+ Returns:
269
+ Dict containing plan details
270
+ """
271
+ if not self.client:
272
+ raise ValueError("Razorpay client not initialized. Check RAZORPAY_ID and RAZORPAY_KEY.")
273
+
274
+ try:
275
+ plan = self.client.plan.fetch(plan_id)
276
+ return plan
277
+ except Exception as e:
278
+ logger.error(f"❌ Failed to fetch plan {plan_id}: {e}")
279
+ raise
280
+
281
+ def list_plans(self, count: int = 10, skip: int = 0) -> Dict[str, Any]:
282
+ """
283
+ List all plans
284
+
285
+ Args:
286
+ count: Number of plans to fetch
287
+ skip: Number of plans to skip
288
+
289
+ Returns:
290
+ Dict containing list of plans
291
+ """
292
+ if not self.client:
293
+ raise ValueError("Razorpay client not initialized. Check RAZORPAY_ID and RAZORPAY_KEY.")
294
+
295
+ try:
296
+ # Try to list plans - this may fail if no plans exist or API endpoint is different
297
+ plans = self.client.plan.all({"count": count, "skip": skip})
298
+ return plans
299
+ except razorpay.errors.BadRequestError as e:
300
+ error_msg = str(e).lower()
301
+ logger.error(f"❌ BadRequestError listing plans: {e}")
302
+ # Check if it's a "not found" error which might mean subscriptions aren't enabled
303
+ if "not found" in error_msg or "url" in error_msg:
304
+ logger.warning("⚠️ Subscriptions API endpoint not found. This might mean:")
305
+ logger.warning(" 1. Subscriptions feature is not enabled on your Razorpay account")
306
+ logger.warning(" 2. Your API keys don't have subscription permissions")
307
+ logger.warning(" 3. You need to enable subscriptions in Razorpay Dashboard")
308
+ # Return empty structure if it's a "not found" type error
309
 + return {"items": [], "count": 0}
 + # Re-raise other BadRequestErrors so the caller is not silently handed None
 + raise
310
+ except razorpay.errors.ServerError as e:
311
+ logger.error(f"❌ ServerError listing plans: {e}")
312
+ raise
313
+ except Exception as e:
314
+ error_type = type(e).__name__
315
+ error_msg = str(e)
316
+ logger.error(f"❌ Failed to list plans ({error_type}): {error_msg}")
317
+ # If it's a "not found" error, return empty list instead of raising
318
+ if "not found" in error_msg.lower() or "404" in error_msg:
319
+ logger.warning("⚠️ No plans found or endpoint not available, returning empty list")
320
+ return {"items": [], "count": 0}
321
+ raise
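A hedged end-to-end sketch of the service above; the plan name, amount (in paise, per the create_plan docstring), and cycle count are examples only:

from services.razorpay_service import RazorpayService

svc = RazorpayService()              # needs RAZORPAY_ID / RAZORPAY_KEY in config
plan = svc.create_plan(name="Pro", amount=49900, currency="INR", period="monthly")
sub = svc.create_subscription(plan_id=plan["id"], total_count=12)
print(sub["id"], sub["status"])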
322
+
services/text_fact_checker.py ADDED
@@ -0,0 +1,905 @@
1
+ import requests
2
+ import json
3
+ from typing import Dict, List, Optional, Any
4
+ import google.generativeai as genai
5
+ from sklearn.feature_extraction.text import TfidfVectorizer
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+ import numpy as np
8
+ from config import config
9
+
10
+
11
+ class TextFactChecker:
12
+ """Service for fact-checking textual claims using Google Custom Search API with fact-checking sites"""
13
+
14
+ def __init__(self):
15
+ self.api_key = config.GOOGLE_API_KEY
16
+ self.search_engine_id = config.GOOGLE_FACT_CHECK_CX
17
+ self.base_url = "https://www.googleapis.com/customsearch/v1"
18
+
19
+ # Configure Gemini for analysis
20
+ if not config.GEMINI_API_KEY:
21
+ print("⚠️ WARNING: GEMINI_API_KEY not set. Gemini features will not work.")
22
+ else:
23
+ try:
24
+ genai.configure(api_key=config.GEMINI_API_KEY)
25
+ self.model = genai.GenerativeModel(config.GEMINI_MODEL)
26
+ print(f"✅ Gemini configured with model: {config.GEMINI_MODEL}")
27
+ except Exception as e:
28
+ print(f"❌ Failed to configure Gemini: {e}")
29
+ raise
30
+
31
+ if not self.api_key:
32
+ raise ValueError("Google Custom Search API key is required")
33
+ if not self.search_engine_id:
34
+ raise ValueError("Google Custom Search Engine ID (cx) is required")
35
+
36
+ async def verify(self, text_input: str, claim_context: str = "Unknown context", claim_date: str = "Unknown date") -> Dict[str, Any]:
37
+ """
38
+ Verify a textual claim using a three-phase approach:
39
+ 1. Immediate Gemini read-through for a quick, reference-free baseline
40
+ 2. Curated SERP (fact-check) harvesting with structured analysis
41
+ 3. A final Gemini synthesis that reasons over BOTH the baseline and SERP data
42
+
43
+ Args:
44
+ text_input: The text claim to verify
45
+ claim_context: Context about the claim
46
+ claim_date: Date when the claim was made
47
+
48
+ Returns:
49
+ Dictionary containing verification results
50
+ """
51
+ try:
52
+ print(f"🔍 DEBUG: TextFactChecker.verify called")
53
+ print(f"🔍 DEBUG: text_input = {text_input}")
54
+ print(f"🔍 DEBUG: claim_context = {claim_context}")
55
+ print(f"🔍 DEBUG: claim_date = {claim_date}")
56
+ print(f"Starting verification for: {text_input}")
57
+
58
+ # STEP 0: quick general-knowledge pass (baseline)
59
+ preliminary_analysis = await self._verify_with_general_knowledge(
60
+ text_input, claim_context, claim_date
61
+ )
62
+ print(f"🔍 DEBUG: preliminary_analysis = {preliminary_analysis}")
63
+
64
+ # STEP 1: Search for fact-checked claims in curated sources
65
+ search_results = await self._search_claims(text_input)
66
+ print(f"🔍 DEBUG: search_results = {search_results}")
67
+
68
+ curated_analysis = None
69
+ if search_results:
70
+ # Analyze the search results with Gemini
71
+ curated_analysis = self._analyze_results(search_results, text_input)
72
+
73
+ final_response = self._synthesize_final_response(
74
+ text_input=text_input,
75
+ claim_context=claim_context,
76
+ claim_date=claim_date,
77
+ preliminary_analysis=preliminary_analysis,
78
+ curated_analysis=curated_analysis,
79
+ search_results=search_results or []
80
+ )
81
+
82
+ if final_response:
83
+ return final_response
84
+
85
+ # Fallback ladder: curated -> preliminary -> default error
86
+ if curated_analysis:
87
+ return self._build_simple_response(
88
+ curated_analysis,
89
+ text_input,
90
+ claim_context,
91
+ claim_date,
92
+ search_results or [],
93
+ method_label="curated_sources_only",
94
+ extra_details={
95
+ "preliminary_analysis": preliminary_analysis,
96
+ "curated_analysis": curated_analysis,
97
+ },
98
+ )
99
+
100
+ if preliminary_analysis:
101
+ return self._build_simple_response(
102
+ preliminary_analysis,
103
+ text_input,
104
+ claim_context,
105
+ claim_date,
106
+ search_results or [],
107
+ method_label="general_knowledge_only",
108
+ extra_details={"preliminary_analysis": preliminary_analysis},
109
+ )
110
+
111
+ return {
112
+ "verified": False,
113
+ "verdict": "error",
114
+ "message": "Unable to generate a verification response.",
115
+ "details": {
116
+ "claim_text": text_input,
117
+ "claim_context": claim_context,
118
+ "claim_date": claim_date,
119
+ "fact_checks": search_results or [],
120
+ "analysis": {},
121
+ "verification_method": "unavailable",
122
+ },
123
+ }
124
+
125
+ except Exception as e:
126
+ print(f"❌ Error in verify: {e}")
127
+ return {
128
+ "verified": False,
129
+ "verdict": "error",
130
+ "message": f"Error during fact-checking: {str(e)}",
131
+ "details": {
132
+ "claim_text": text_input,
133
+ "claim_context": claim_context,
134
+ "claim_date": claim_date,
135
+ "error": str(e)
136
+ }
137
+ }
138
+
139
+ async def _search_claims(self, query: str) -> List[Dict[str, Any]]:
140
+ """
141
+ Search for fact-checked claims using Google Custom Search API with LLM-powered fallback strategies
142
+
143
+ Args:
144
+ query: The search query
145
+
146
+ Returns:
147
+ List of search results
148
+ """
149
+ # Try the original query first
150
+ results = await self._perform_search(query)
151
+
152
+ # If no results, use LLM to create alternative queries
153
+ if not results:
154
+ print("No results found, using LLM to create alternative queries...")
155
+
156
+ alternative_queries = self._create_alternative_queries(query)
157
+ print(f"Generated alternative queries: {alternative_queries}")
158
+
159
+ results = await self._perform_search(alternative_queries[0]) if alternative_queries else []
160
+ if results:
161
+ print(f"Found {len(results)} results with alternative query")
162
+ else:
163
+ print("No results found with alternative query")
164
+ return results
165
+
166
+ async def _perform_search(self, query: str) -> List[Dict[str, Any]]:
167
+ """
168
+ Perform a single search request
169
+
170
+ Args:
171
+ query: The search query
172
+
173
+ Returns:
174
+ List of search results
175
+ """
176
+ params = {
177
+ "q": query,
178
+ "key": self.api_key,
179
+ "cx": self.search_engine_id,
180
+ "num": 10 # Limit results to 10 for better performance
181
+ }
182
+
183
+ try:
184
+ print(f"Making request to: {self.base_url}")
185
+ print(f"Params: {params}")
186
+
187
+ response = requests.get(self.base_url, params=params, timeout=30)
188
+ print(f"Response status: {response.status_code}")
189
+ print(f"Response text: {response.text}")
190
+
191
+ response.raise_for_status()
192
+
193
+ data = response.json()
194
+ items = data.get("items", [])
195
+
196
+ return items
197
+
198
+ except requests.exceptions.RequestException as e:
199
+ raise Exception(f"API request failed: {str(e)}")
200
+ except json.JSONDecodeError as e:
201
+ raise Exception(f"Failed to parse API response: {str(e)}")
202
+ except Exception as e:
203
+ raise Exception(f"Search error: {str(e)}")
204
+
205
+ def _create_alternative_queries(self, query: str) -> List[str]:
206
+ """
207
+ Use the LLM to create alternative search queries (currently a single broader query)
208
+
209
+ Args:
210
+ query: Original query
211
+
212
+ Returns:
213
+ List of alternative queries to try
214
+ """
215
+ prompt = f"""
216
+ You are a search query optimizer. Given a fact-checking query that returned no results, create alternative queries that might find relevant information.
217
+
218
+ ORIGINAL QUERY: "{query}"
219
+
220
+ Create an alternative query:
221
+ 1. A BROADER query that removes specific assumptions and focuses on key entities/events
222
+
223
+ Examples:
224
+ - "Is it true the CEO of Astronomer resigned because of toxic workplace allegations?"
225
+ → Broader: "Astronomer CEO resignation"
226
+
227
+ - "Did Apple release a new iPhone with 5G in 2023?"
228
+ → Broader: "Apple iPhone 2023 release"
229
+
230
+ Respond in this exact JSON format:
231
+ {{
232
+ "broader_query": "your broader query here",
233
+ }}
234
+ """
235
+
236
+ try:
237
+ response = self.model.generate_content(prompt)
238
+ response_text = response.text.strip()
239
+
240
+ # Try to parse JSON response
241
+ if response_text.startswith('```json'):
242
+ response_text = response_text.replace('```json', '').replace('```', '').strip()
243
+ elif response_text.startswith('```'):
244
+ response_text = response_text.replace('```', '').strip()
245
+
246
+ alternatives = json.loads(response_text)
247
+
248
+ # Collect any alternative queries produced (the prompt currently only yields a broader query)
249
+ queries = []
250
+ if alternatives.get("broader_query") and alternatives["broader_query"] != query:
251
+ queries.append(alternatives["broader_query"])
252
+ if alternatives.get("simpler_query") and alternatives["simpler_query"] != query:
253
+ queries.append(alternatives["simpler_query"])
254
+
255
+ return queries
256
+
257
+ except Exception as e:
258
+ print(f"Failed to create alternative queries with LLM: {e}")
259
+
260
+ def _analyze_results(self, results: List[Dict[str, Any]], original_text: str) -> Dict[str, Any]:
261
+ """
262
+ Analyze the search results using Gemini AI to determine overall verdict
263
+
264
+ Args:
265
+ results: List of search results from the API
266
+ original_text: The original text being verified
267
+
268
+ Returns:
269
+ Analysis results including verdict and message
270
+ """
271
+ if not results:
272
+ return {
273
+ "verified": False,
274
+ "verdict": "no_content",
275
+ "message": "No fact-checked information found for this claim"
276
+ }
277
+
278
+ # Filter relevant results
279
+ relevant_results = []
280
+ for result in results:
281
+ title = result.get("title", "").lower()
282
+ snippet = result.get("snippet", "").lower()
283
+ original_lower = original_text.lower()
284
+
285
+ # Check if the result is relevant to our original text
286
+ relevance_score = self._calculate_relevance(result, original_text)
287
+
288
+ print(f"Relevance score for '{title[:50]}...': {relevance_score:.3f}")
289
+ if relevance_score > 0.05: # Very low threshold to catch all relevant results
290
+ relevant_results.append(result)
291
+
292
+ if not relevant_results:
293
+ return {
294
+ "verified": False,
295
+ "verdict": "no_content",
296
+ "message": "No relevant fact-checked information found for this specific claim"
297
+ }
298
+
299
+ # Use Gemini to analyze the results
300
+ try:
301
+ analysis = self._analyze_with_gemini(original_text, relevant_results)
302
+ return analysis
303
+ except Exception as e:
304
+ print(f"Gemini analysis failed: {str(e)}")
305
+ # Fallback to simple analysis
306
+ return self._fallback_analysis(relevant_results)
307
+
308
+ def _calculate_relevance(self, result: Dict[str, Any], original_text: str) -> float:
309
+ """
310
+ Calculate relevance score using TF-IDF similarity with multiple components
311
+
312
+ Args:
313
+ result: Search result dictionary
314
+ original_text: Original text being verified
315
+
316
+ Returns:
317
+ Relevance score between 0 and 1
318
+ """
319
+ score = 0.0
320
+
321
+ # 1. Title relevance (60% weight)
322
+ title = result.get("title", "")
323
+ if title:
324
+ title_score = self._tfidf_similarity(title, original_text)
325
+ score += title_score * 0.6
326
+
327
+ # 2. Snippet relevance (40% weight)
328
+ snippet = result.get("snippet", "")
329
+ if snippet:
330
+ snippet_score = self._tfidf_similarity(snippet, original_text)
331
+ score += snippet_score * 0.4
332
+
333
+ # 3. Fact-check specific bonus (10% weight)
334
+ factcheck_score = self._has_factcheck_data(result)
335
+ score += factcheck_score * 0.1
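+ # The three weights above sum to 1.1, so the combined score is clamped to 1.0 below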
336
+
337
+ return min(1.0, score)
338
+
339
+ def _tfidf_similarity(self, text1: str, text2: str) -> float:
340
+ """
341
+ Calculate TF-IDF cosine similarity between two texts
342
+
343
+ Args:
344
+ text1: First text
345
+ text2: Second text
346
+
347
+ Returns:
348
+ Similarity score between 0 and 1
349
+ """
350
+ if not text1.strip() or not text2.strip():
351
+ return 0.0
352
+
353
+ try:
354
+ # Preprocess texts
355
+ texts = [self._preprocess_text(text1), self._preprocess_text(text2)]
356
+
357
+ # Create TF-IDF vectors
358
+ vectorizer = TfidfVectorizer(
359
+ stop_words='english',
360
+ ngram_range=(1, 2), # Include bigrams
361
+ max_features=500,
362
+ lowercase=True
363
+ )
364
+
365
+ tfidf_matrix = vectorizer.fit_transform(texts)
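+ # Row 0 holds the search-result text (text1), row 1 holds the claim being verified (text2)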
366
+
367
+ # Calculate cosine similarity
368
+ similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
369
+
370
+ return float(similarity)
371
+
372
+ except Exception as e:
373
+ print(f"TF-IDF calculation failed: {e}")
374
+ # Fallback to simple word overlap
375
+ return self._simple_word_overlap(text1, text2)
376
+
377
+ def _preprocess_text(self, text: str) -> str:
378
+ """
379
+ Preprocess text for TF-IDF analysis
380
+
381
+ Args:
382
+ text: Raw text
383
+
384
+ Returns:
385
+ Preprocessed text
386
+ """
387
+ import re
388
+
389
+ # Convert to lowercase
390
+ text = text.lower()
391
+
392
+ # Remove special characters but keep spaces
393
+ text = re.sub(r'[^\w\s]', ' ', text)
394
+
395
+ # Remove extra whitespace
396
+ text = ' '.join(text.split())
397
+
398
+ return text
399
+
400
+ def _simple_word_overlap(self, text1: str, text2: str) -> float:
401
+ """
402
+ Fallback similarity calculation using word overlap
403
+
404
+ Args:
405
+ text1: First text
406
+ text2: Second text
407
+
408
+ Returns:
409
+ Similarity score between 0 and 1
410
+ """
411
+ words1 = set(text1.lower().split())
412
+ words2 = set(text2.lower().split())
413
+
414
+ if not words1 or not words2:
415
+ return 0.0
416
+
417
+ intersection = words1.intersection(words2)
418
+ union = words1.union(words2)
419
+
420
+ return len(intersection) / len(union) if union else 0.0
421
+
422
+ def _has_factcheck_data(self, result: Dict[str, Any]) -> float:
423
+ """
424
+ Check if result has fact-check specific metadata
425
+
426
+ Args:
427
+ result: Search result dictionary
428
+
429
+ Returns:
430
+ 1.0 if has fact-check data, 0.0 otherwise
431
+ """
432
+ # Check for ClaimReview metadata
433
+ pagemap = result.get("pagemap", {})
434
+ claim_review = pagemap.get("ClaimReview", [])
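+ # "ClaimReview" is the schema.org fact-check markup that some results expose via the pagemap field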
435
+
436
+ if claim_review:
437
+ return 1.0
438
+
439
+ # Check for fact-check related keywords in URL or title
440
+ url = result.get("link", "").lower()
441
+ title = result.get("title", "").lower()
442
+
443
+ factcheck_keywords = [
444
+ "fact-check", "factcheck", "snopes", "politifact",
445
+ "factcrescendo", "boomlive", "newschecker", "afp"
446
+ ]
447
+
448
+ for keyword in factcheck_keywords:
449
+ if keyword in url or keyword in title:
450
+ return 1.0
451
+
452
+ return 0.0
453
+
454
+ def _analyze_with_gemini(self, original_text: str, results: List[Dict[str, Any]]) -> Dict[str, Any]:
455
+ """
456
+ Use Gemini AI to analyze fact-check results and determine verdict
457
+
458
+ Args:
459
+ original_text: The original claim being verified
460
+ results: List of relevant search results
461
+
462
+ Returns:
463
+ Analysis results with verdict and message
464
+ """
465
+ # Prepare the prompt
466
+ results_text = ""
467
+ for i, result in enumerate(results[:5], 1): # Limit to top 5 results
468
+ title = result.get("title", "")
469
+ snippet = result.get("snippet", "")
470
+ link = result.get("link", "")
471
+ results_text += f"{i}. Title: {title}\n Snippet: {snippet}\n Link: {link}\n\n"
472
+
473
+ prompt = f"""
474
+ You are a fact-checking expert. Analyze the following claim against the provided fact-checking sources.
475
+
476
+ CLAIM TO VERIFY: "{original_text}"
477
+
478
+ FACT-CHECKING SOURCES:
479
+ {results_text}
480
+
481
+ STEP-BY-STEP ANALYSIS:
482
+ 1. What does each source say ACTUALLY HAPPENED?
483
+ 2. What does each source say was FAKE or MISLEADING?
484
+ 3. Based on the evidence, what is the most likely truth about the claim?
485
+
486
+ Think through this systematically and provide your analysis.
487
+
488
+ IMPORTANT INSTRUCTIONS FOR YOUR RESPONSE:
489
+ - When referring to sources in your message, DO NOT use specific numbers like "Source 1", "Source 3", or "Sources 2, 4, and 5"
490
+ - Instead, use generic references like "the sources", "multiple sources", "one source", "several sources"
491
+ - Example: Instead of "Sources 3, 4, and 5 confirm..." say "Multiple sources confirm..." or "The sources confirm..."
492
+
493
+ Respond in this exact JSON format:
494
+ {{
495
+ "verdict": "true|false|mixed|uncertain",
496
+ "verified": true|false,
497
+ "message": "Your explanation here",
498
+ "confidence": "high|medium|low",
499
+ "reasoning": "Your step-by-step reasoning process"
500
+ }}
501
+ """
502
+
503
+ try:
504
+ response = self.model.generate_content(prompt)
505
+ response_text = response.text.strip()
506
+
507
+ # Try to parse JSON response
508
+ if response_text.startswith('```json'):
509
+ response_text = response_text.replace('```json', '').replace('```', '').strip()
510
+ elif response_text.startswith('```'):
511
+ response_text = response_text.replace('```', '').strip()
512
+
513
+ analysis = json.loads(response_text)
514
+
515
+ # Ensure required fields
516
+ analysis.setdefault("verdict", "uncertain")
517
+ analysis.setdefault("verified", False)
518
+ analysis.setdefault("message", "Analysis completed")
519
+ analysis.setdefault("confidence", "medium")
520
+ analysis.setdefault("reasoning", "Analysis completed")
521
+
522
+ # Add metadata
523
+ analysis["relevant_results_count"] = len(results)
524
+ analysis["analysis_method"] = "gemini"
525
+
526
+ return analysis
527
+
528
+ except json.JSONDecodeError as e:
529
+ print(f"Failed to parse Gemini response as JSON: {e}")
530
+ print(f"Raw response: {response_text}")
531
+ return self._fallback_analysis(results)
532
+ except Exception as e:
533
+ print(f"Gemini analysis error: {e}")
534
+ return self._fallback_analysis(results)
535
+
536
+ def _format_source_summary(self, results: List[Dict[str, Any]]) -> str:
537
+ """Create a short, human readable summary of the surfaced sources."""
538
+ if not results:
539
+ return "No vetted sources surfaced yet."
540
+
541
+ highlights = []
542
+ for result in results[:3]:
543
+ title = result.get("title") or "Unknown source"
544
+ outlet = result.get("displayLink")
545
+ summary = title
546
+ if outlet:
547
+ summary += f" ({outlet})"
548
+ highlights.append(summary)
549
+
550
+ return "Sources surfaced: " + "; ".join(highlights)
551
+
552
+ def _fallback_analysis(self, results: List[Dict[str, Any]]) -> Dict[str, Any]:
553
+ """
554
+ Fallback analysis when Gemini fails
555
+
556
+ Args:
557
+ results: List of search results
558
+
559
+ Returns:
560
+ Basic analysis results
561
+ """
562
+ summary = self._format_source_summary(results)
563
+
564
+ return {
565
+ "verified": False,
566
+ "verdict": "uncertain",
567
+ "message": f"Could not verify this claim yet. {summary}",
568
+ "confidence": "low",
569
+ "relevant_results_count": len(results),
570
+ "analysis_method": "fallback"
571
+ }
572
+
573
+ async def _verify_with_general_knowledge(self, text_input: str, claim_context: str, claim_date: str) -> Dict[str, Any]:
574
+ """
575
+ Verify a claim using Gemini's general knowledge base directly (no curated sources)
576
+ This is used as a fallback when curated sources don't have enough information
577
+
578
+ Args:
579
+ text_input: The text claim to verify
580
+ claim_context: Context about the claim
581
+ claim_date: Date when the claim was made
582
+
583
+ Returns:
584
+ Analysis results with verdict and message
585
+ """
586
+ from datetime import datetime
587
+ current_date = datetime.now().strftime("%B %d, %Y")
588
+
589
+ prompt = f"""
590
+ You are a fact-checking expert AI with access to current information as of {current_date}.
591
+
592
+ CLAIM TO VERIFY: "{text_input}"
593
+ CONTEXT: {claim_context if claim_context != "Unknown context" else "No additional context provided"}
594
+ CLAIM DATE: {claim_date if claim_date != "Unknown date" else "Unknown"}
595
+
596
+ Your task is to verify this claim using your knowledge base. Since this is a direct factual question that may not be covered by news articles:
597
+
598
+ 1. **Use your most recent training data** to answer the question directly
599
+ 2. If this is about current events, political positions, or time-sensitive facts, be especially careful to provide the MOST CURRENT information
600
+ 3. If you're uncertain about recent changes, acknowledge that
601
+ 4. Always answer based on the most recent information you have
602
+
603
+ Provide a clear, direct answer. Think step-by-step:
604
+ - What does the claim assert?
605
+ - Based on your knowledge (as of your training cutoff and any recent data you have), is this true or false?
606
+ - If it's a time-sensitive claim, what is the current status?
607
+
608
+ Respond in this exact JSON format:
609
+ {{
610
+ "verdict": "true|false|mixed|uncertain",
611
+ "verified": true|false,
612
+ "message": "Your clear, direct answer explaining whether the claim is true or false and why",
613
+ "confidence": "high|medium|low",
614
+ "reasoning": "Your step-by-step reasoning process",
615
+ "knowledge_cutoff_note": "Optional note if the answer might be outdated or if recent changes are possible"
616
+ }}
617
+
618
+ IMPORTANT: For current events or political positions, provide the MOST RECENT information you have access to.
619
+ """
620
+
621
+ try:
622
+ response = self.model.generate_content(prompt)
623
+ response_text = response.text.strip()
624
+
625
+ # Try to parse JSON response
626
+ if response_text.startswith('```json'):
627
+ response_text = response_text.replace('```json', '').replace('```', '').strip()
628
+ elif response_text.startswith('```'):
629
+ response_text = response_text.replace('```', '').strip()
630
+
631
+ analysis = json.loads(response_text)
632
+
633
+ # Ensure required fields
634
+ analysis.setdefault("verdict", "uncertain")
635
+ analysis.setdefault("verified", False)
636
+ analysis.setdefault("message", "Analysis completed using general knowledge")
637
+ analysis.setdefault("confidence", "medium")
638
+ analysis.setdefault("reasoning", "Direct verification using AI knowledge base")
639
+
640
+ # Add metadata
641
+ analysis["analysis_method"] = "general_knowledge"
642
+ analysis["verification_date"] = current_date
643
+
644
+ print(f"✅ General knowledge verification result: {analysis['verdict']}")
645
+ return analysis
646
+
647
+ except json.JSONDecodeError as e:
648
+ print(f"Failed to parse Gemini general knowledge response as JSON: {e}")
649
+ print(f"Raw response: {response_text[:500]}")
650
+ # Try to extract plain text answer
651
+ return {
652
+ "verified": False,
653
+ "verdict": "uncertain",
654
+ "message": response_text if response_text else "Unable to verify using general knowledge",
655
+ "confidence": "low",
656
+ "analysis_method": "general_knowledge",
657
+ "error": "JSON parsing failed, used plain text response"
658
+ }
659
+ except Exception as e:
660
+ print(f"General knowledge verification error: {e}")
661
+ return {
662
+ "verified": False,
663
+ "verdict": "error",
664
+ "message": f"Error during general knowledge verification: {str(e)}",
665
+ "confidence": "low",
666
+ "analysis_method": "general_knowledge"
667
+ }
668
+
669
+ def _extract_verdict_from_content(self, content: str) -> str:
670
+ """
671
+ Extract verdict from search result content
672
+
673
+ Args:
674
+ content: Combined title and snippet text
675
+
676
+ Returns:
677
+ Verdict string
678
+ """
679
+ content_lower = content.lower()
680
+
681
+ # Look for verdict indicators
682
+ if any(word in content_lower for word in ["false", "misleading", "incorrect", "debunked", "not true"]):
683
+ return "false"
684
+ elif any(word in content_lower for word in ["true", "accurate", "correct", "verified", "confirmed", "is true", "is correct"]):
685
+ return "true"
686
+ elif any(word in content_lower for word in ["partially", "mixed", "somewhat", "half"]):
687
+ return "mixed"
688
+ elif any(word in content_lower for word in ["unverified", "unproven", "uncertain", "disputed"]):
689
+ return "uncertain"
690
+ else:
691
+ return "unknown"
692
+
693
+ def _analyze_verdicts(self, verdicts: List[str]) -> Dict[str, Any]:
694
+ """
695
+ Analyze verdicts to determine overall result
696
+
697
+ Args:
698
+ verdicts: List of verdict strings
699
+
700
+ Returns:
701
+ Analysis of verdicts
702
+ """
703
+ if not verdicts:
704
+ return {
705
+ "verified": False,
706
+ "verdict": "uncertain",
707
+ "message": "No verdicts found"
708
+ }
709
+
710
+ true_count = verdicts.count("true")
711
+ false_count = verdicts.count("false")
712
+ mixed_count = verdicts.count("mixed")
713
+ uncertain_count = verdicts.count("uncertain")
714
+ unknown_count = verdicts.count("unknown")
715
+
716
+ total = len(verdicts)
717
+
718
+ # Determine overall verdict
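+ # Precedence: any "false" verdict wins; otherwise any "true", then "mixed", then "uncertain"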
719
+ if false_count > 0:
720
+ overall_verdict = "false"
721
+ verified = False
722
+ elif true_count > 0 and false_count == 0:
723
+ overall_verdict = "true"
724
+ verified = True
725
+ elif mixed_count > 0:
726
+ overall_verdict = "mixed"
727
+ verified = False
728
+ elif uncertain_count > 0:
729
+ overall_verdict = "uncertain"
730
+ verified = False
731
+ else:
732
+ overall_verdict = "unknown"
733
+ verified = False
734
+
735
+ return {
736
+ "verified": verified,
737
+ "verdict": overall_verdict,
738
+ "true_count": true_count,
739
+ "false_count": false_count,
740
+ "mixed_count": mixed_count,
741
+ "uncertain_count": uncertain_count,
742
+ "unknown_count": unknown_count,
743
+ "total_verdicts": total
744
+ }
745
+
746
+ def _build_message(self, analysis: Dict[str, Any], results: List[Dict[str, Any]]) -> str:
747
+ """
748
+ Build a human-readable message based on the analysis
749
+
750
+ Args:
751
+ analysis: Analysis results
752
+ results: Relevant search results
753
+
754
+ Returns:
755
+ Formatted message
756
+ """
757
+ verdict = analysis["verdict"]
758
+ total_verdicts = analysis["total_verdicts"]
759
+ relevant_results_count = len(results)
760
+
761
+ base_messages = {
762
+ "true": "This claim appears to be TRUE based on fact-checking sources.",
763
+ "false": "This claim appears to be FALSE based on fact-checking sources.",
764
+ "mixed": "This claim has MIXED evidence - some parts are true, others are false.",
765
+ "uncertain": "This claim is UNCERTAIN - insufficient evidence to determine accuracy.",
766
+ "unknown": "This claim needs further investigation - verdict unclear from available sources.",
767
+ "no_content": "No fact-checked information found for this claim."
768
+ }
769
+
770
+ message = base_messages.get(verdict, "Unable to determine claim accuracy.")
771
+
772
+ # Add details about sources
773
+ if relevant_results_count > 0:
774
+ message += f" Found {relevant_results_count} relevant fact-check(s) with {total_verdicts} total verdicts."
775
+
776
+ # Add top sources
777
+ top_sources = []
778
+ for result in results[:3]: # Show top 3 sources
779
+ title = result.get("title", "Unknown")
780
+ link = result.get("link", "")
781
+ if title not in top_sources and link:
782
+ top_sources.append(f"{title}")
783
+
784
+ if top_sources:
785
+ message += f" Sources include: {', '.join(top_sources[:3])}."
786
+
787
+ return message
788
+
789
+ def _synthesize_final_response(
790
+ self,
791
+ text_input: str,
792
+ claim_context: str,
793
+ claim_date: str,
794
+ preliminary_analysis: Optional[Dict[str, Any]],
795
+ curated_analysis: Optional[Dict[str, Any]],
796
+ search_results: List[Dict[str, Any]],
797
+ ) -> Optional[Dict[str, Any]]:
798
+ """
799
+ Ask Gemini to reconcile preliminary + curated evidence into a single user-facing verdict.
800
+ """
801
+ try:
802
+ source_briefs = []
803
+ for item in search_results[:5]:
804
+ source_briefs.append(
805
+ {
806
+ "title": item.get("title"),
807
+ "snippet": item.get("snippet"),
808
+ "outlet": item.get("displayLink"),
809
+ "link": item.get("link"),
810
+ }
811
+ )
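+ # Only the top five surfaced sources are summarised into the synthesis prompt to keep it compact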
812
+
813
+ prompt = f"""
814
+ You are an AI fact-checking editor. Combine the baseline assessment and curated sources to produce the final answer.
815
+
816
+ CLAIM: "{text_input}"
817
+ CONTEXT: {claim_context}
818
+ CLAIM DATE: {claim_date}
819
+
820
+ BASELINE ANALYSIS (Gemini quick look):
821
+ {json.dumps(preliminary_analysis or {}, indent=2, ensure_ascii=False)}
822
+
823
+ CURATED FACT-CHECK ANALYSIS:
824
+ {json.dumps(curated_analysis or {}, indent=2, ensure_ascii=False)}
825
+
826
+ FACT-CHECK SOURCES:
827
+ {json.dumps(source_briefs, indent=2, ensure_ascii=False)}
828
+
829
+ INSTRUCTIONS:
830
+ - Make a reasoned decision (true/false/mixed/uncertain) based on the above.
831
+ - If evidence is thin, keep the tone cautious and say it is unverified/uncertain but mention what was found.
832
+ - Refer to sources generically (e.g., "one BBC article", "multiple outlets") — never number them.
833
+ - Provide clear, actionable messaging for the end user.
834
+
835
+ Respond ONLY in this JSON format:
836
+ {{
837
+ "verdict": "true|false|mixed|uncertain",
838
+ "verified": true|false,
839
+ "message": "Concise user-facing summary referencing evidence in plain language",
840
+ "confidence": "high|medium|low",
841
+ "reasoning": "Brief reasoning trail you followed",
842
+ "tone": "confident|balanced|cautious"
843
+ }}
844
+ """
845
+ response = self.model.generate_content(prompt)
846
+ response_text = response.text.strip()
847
+
848
+ if response_text.startswith("```json"):
849
+ response_text = response_text.replace("```json", "").replace("```", "").strip()
850
+ elif response_text.startswith("```"):
851
+ response_text = response_text.replace("```", "").strip()
852
+
853
+ final_analysis = json.loads(response_text)
854
+ final_analysis.setdefault("verdict", "uncertain")
855
+ final_analysis.setdefault("verified", False)
856
+ final_analysis.setdefault("message", "Unable to synthesize final verdict.")
857
+ final_analysis.setdefault("confidence", "low")
858
+ final_analysis.setdefault("reasoning", "")
859
+ final_analysis.setdefault("tone", "cautious")
860
+ final_analysis["analysis_method"] = "hybrid_synthesis"
861
+
862
+ return self._build_simple_response(
863
+ final_analysis,
864
+ text_input,
865
+ claim_context,
866
+ claim_date,
867
+ search_results,
868
+ method_label="hybrid_synthesis",
869
+ extra_details={
870
+ "preliminary_analysis": preliminary_analysis,
871
+ "curated_analysis": curated_analysis,
872
+ "source_highlights": source_briefs,
873
+ },
874
+ )
875
+ except Exception as e:
876
+ print(f"Hybrid synthesis error: {e}")
877
+ return None
878
+
879
+ def _build_simple_response(
880
+ self,
881
+ analysis: Dict[str, Any],
882
+ text_input: str,
883
+ claim_context: str,
884
+ claim_date: str,
885
+ search_results: List[Dict[str, Any]],
886
+ method_label: str,
887
+ extra_details: Optional[Dict[str, Any]] = None,
888
+ ) -> Dict[str, Any]:
889
+ details = {
890
+ "claim_text": text_input,
891
+ "claim_context": claim_context,
892
+ "claim_date": claim_date,
893
+ "fact_checks": search_results,
894
+ "analysis": analysis,
895
+ "verification_method": method_label,
896
+ }
897
+ if extra_details:
898
+ details.update(extra_details)
899
+
900
+ return {
901
+ "verified": analysis.get("verified", False),
902
+ "verdict": analysis.get("verdict", "uncertain"),
903
+ "message": analysis.get("message", "No message produced."),
904
+ "details": details,
905
+ }
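A minimal usage sketch for the new service (not part of the commit). It assumes GOOGLE_API_KEY, GOOGLE_FACT_CHECK_CX and GEMINI_API_KEY are already populated in `config`, as the constructor above requires; the claim text and dates are purely illustrative.

# Hypothetical caller exercising TextFactChecker.verify
import asyncio
from services.text_fact_checker import TextFactChecker

async def main():
    checker = TextFactChecker()
    result = await checker.verify(
        "Example claim seen in a forwarded message",
        claim_context="Shared on social media, original source unknown",
        claim_date="2024-01-01",
    )
    # Every branch of verify() returns verdict, verified, message and details
    print(result["verdict"], result["verified"])
    print(result["message"])

asyncio.run(main())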
services/video_verifier.py ADDED
@@ -0,0 +1,1310 @@
1
+ import os
2
+ import tempfile
3
+ from typing import Dict, Any, Optional, List, Tuple
4
+ import cv2
5
+ import requests
6
+ from PIL import Image, ImageDraw, ImageFont
7
+ import subprocess
8
+ import json
9
+ import asyncio
10
+
11
+ from .image_verifier import ImageVerifier
12
+ from .youtube_api import YouTubeDataAPI
13
+ from config import config
14
+ import time
15
+
16
+ class VideoVerifier:
17
+ def __init__(self, api_key: Optional[str] = None):
18
+ """
19
+ Initialize the VideoVerifier with SerpApi credentials
20
+
21
+ Args:
22
+ api_key: SerpApi API key. If None, will try to get from environment
23
+ """
24
+ self.api_key = api_key or config.SERP_API_KEY
25
+ if not self.api_key:
26
+ raise ValueError("SERP_API_KEY environment variable or api_key parameter is required")
27
+
28
+ # Initialize image verifier for frame analysis
29
+ self.image_verifier = ImageVerifier(api_key)
30
+
31
+ # Initialize YouTube Data API client
32
+ self.youtube_api = YouTubeDataAPI(api_key)
33
+
34
+ # Video processing parameters
35
+ self.frame_interval = 4 # Extract frame every 4 seconds
36
+ self.clip_duration = 5 # Duration of misleading clip in seconds
37
+
38
+ async def verify(self, video_path: Optional[str] = None, claim_context: str = "", claim_date: str = "", video_url: Optional[str] = None) -> Dict[str, Any]:
39
+ """
40
+ Verify a video and generate a visual counter-measure video if false context is detected
41
+
42
+ Args:
43
+ video_path: Path to the video file
44
+ claim_context: The claimed context of the video
45
+ claim_date: The claimed date of the video
46
+
47
+ Returns:
48
+ Dictionary with verification results and output file path
49
+ """
50
+ try:
51
+ used_ytdlp = False  # Track whether yt-dlp has already been used, so the later fallback is not repeated
+ # If a video URL is supplied, determine the best verification approach
52
+ if video_url and not video_path:
53
+ # Check if it's a YouTube URL and use API verification
54
+ if self._is_youtube_url(video_url):
55
+ return await self._verify_youtube_video(video_url, claim_context, claim_date)
56
+
57
+ # Check if it's a supported platform for yt-dlp
58
+ if self._is_supported_platform(video_url):
59
+ return await self._verify_with_ytdlp(video_url, claim_context, claim_date)
60
+
61
+ # For unsupported platforms, try direct download first; if not a real video, fallback to yt-dlp
62
+ try:
63
+ video_path = await self._download_video(video_url)
64
+ except Exception as direct_err:
65
+ # Always attempt yt-dlp as fallback when available
66
+ try:
67
+ video_path = await self._download_with_ytdlp(video_url)
68
+ used_ytdlp = True
69
+ except Exception as ytdlp_err:
70
+ # Return the more informative error
71
+ raise RuntimeError(f"Direct download failed: {direct_err}; yt-dlp failed: {ytdlp_err}")
72
+
73
+ # Extract key frames from video
74
+ frames = await self._extract_key_frames(video_path)
75
+
76
+ # If extraction failed and we have a URL, try yt-dlp fallback once
77
+ if (not frames) and video_url and config.USE_STREAM_DOWNLOADER and not used_ytdlp:
78
+ video_path = await self._download_with_ytdlp(video_url)
79
+ used_ytdlp = True
80
+ frames = await self._extract_key_frames(video_path)
81
+
82
+ if not frames:
83
+ return {
84
+ "verified": False,
85
+ "message": "Could not extract frames from video",
86
+ "details": {"error": "Frame extraction failed"}
87
+ }
88
+
89
+ # STEP 0: Analyze frames with Gemini Vision first (direct frame analysis)
90
+ preliminary_vision_analysis = await self._analyze_frames_with_vision(
91
+ frames, claim_context, claim_date
92
+ )
93
+ print(f"✅ Gemini Vision analysis result: {preliminary_vision_analysis.get('overall_verdict', 'unknown')}")
94
+
95
+ # STEP 1: Analyze frames with reverse image search (existing approach)
96
+ # Wrap in try/except so vision analysis can still proceed if search fails
97
+ reverse_search_analysis = None
98
+ try:
99
+ reverse_search_analysis = await self._analyze_frames(frames, claim_context, claim_date)
100
+ except Exception as search_error:
101
+ print(f"⚠️ Reverse image search analysis failed (will use vision analysis only): {search_error}")
102
+ # Continue with vision analysis only
103
+
104
+ # STEP 2: Synthesize vision analysis + reverse image search results
105
+ if reverse_search_analysis:
106
+ final_analysis = self._synthesize_video_analyses(
107
+ preliminary_vision_analysis=preliminary_vision_analysis,
108
+ reverse_search_analysis=reverse_search_analysis,
109
+ frames=frames,
110
+ claim_context=claim_context,
111
+ claim_date=claim_date,
112
+ )
113
+
114
+ if final_analysis:
115
+ analysis = final_analysis
116
+ else:
117
+ # Fallback: use vision analysis if synthesis fails
118
+ if preliminary_vision_analysis.get("overall_verdict") in ["false", "true"]:
119
+ analysis = preliminary_vision_analysis
120
+ else:
121
+ analysis = reverse_search_analysis
122
+ else:
123
+ # No reverse search results, use vision analysis only
124
+ print("⚠️ Using vision analysis only (reverse image search unavailable)")
125
+ analysis = preliminary_vision_analysis
126
+
127
+ if analysis.get("overall_verdict") != "false":
128
+ return {
129
+ "verified": analysis.get("overall_verdict") == "true",
130
+ "message": analysis.get("overall_summary") or "No decisive false context detected in video frames",
131
+ "details": {
132
+ "frames_analyzed": len(frames),
133
+ "overall_verdict": analysis.get("overall_verdict"),
134
+ "frame_summaries": analysis.get("frame_summaries", []),
135
+ }
136
+ }
137
+
138
+ # Generate video counter-measure only if we have a specific false frame
139
+ false_ctx = analysis.get("false_context_frame")
140
+ if not false_ctx:
141
+ return {
142
+ "verified": False,
143
+ "message": analysis.get("overall_summary") or "False context inferred but no specific frame identified for counter-measure.",
144
+ "details": {
145
+ "frames_analyzed": len(frames),
146
+ "overall_verdict": analysis.get("overall_verdict"),
147
+ "frame_summaries": analysis.get("frame_summaries", []),
148
+ }
149
+ }
150
+ output_path = await self._generate_video_counter_measure(
151
+ video_path, false_ctx, claim_context, claim_date
152
+ )
153
+
154
+ result: Dict[str, Any] = {
155
+ "verified": True,
156
+ "message": "False context detected and video counter-measure generated",
157
+ "output_path": output_path,
158
+ "false_context_frame": analysis.get("false_context_frame"),
159
+ "details": {
160
+ "frames_analyzed": len(frames),
161
+ "claim_context": claim_context,
162
+ "claim_date": claim_date
163
+ }
164
+ }
165
+ # Attempt Cloudinary cleanup (best-effort) before responding
166
+ await self._cloudinary_cleanup_prefix(config.CLOUDINARY_FOLDER or "frames")
167
+ return result
168
+
169
+ except Exception as e:
170
+ return {
171
+ "verified": False,
172
+ "message": f"Error during video verification: {str(e)}",
173
+ "details": {"error": str(e)}
174
+ }
175
+
176
+ async def _download_video(self, url: str) -> str:
177
+ try:
178
+ resp = requests.get(url, stream=True, timeout=30)
179
+ resp.raise_for_status()
180
+ content_type = (resp.headers.get("Content-Type") or "").lower()
181
+ looks_like_video = ("video" in content_type) or url.lower().endswith((".mp4", ".mov", ".mkv", ".webm", ".m4v"))
182
+ if not looks_like_video:
183
+ raise RuntimeError(f"URL is not a direct video (content-type={content_type})")
184
+ suffix = ".mp4"
185
+ tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
186
+ bytes_written = 0
187
+ for chunk in resp.iter_content(chunk_size=1 << 14):
188
+ if chunk:
189
+ tmp.write(chunk)
190
+ bytes_written += len(chunk)
191
+ tmp.close()
192
+ # Heuristic: reject tiny files that aren't valid containers
193
+ if bytes_written < 200 * 1024: # 200KB
194
+ os.unlink(tmp.name)
195
+ raise RuntimeError("Downloaded file too small to be a valid video")
196
+ return tmp.name
197
+ except Exception as e:
198
+ raise RuntimeError(f"Failed to download video: {e}")
199
+
200
+ async def _download_with_ytdlp(self, url: str) -> str:
201
+ try:
202
+ # Resolve yt-dlp binary
203
+ ytdlp_bin = self._resolve_ytdlp_bin()
204
+ tmp_dir = tempfile.mkdtemp()
205
+ out_path = os.path.join(tmp_dir, "video.%(ext)s")
206
+ cmd = [
207
+ ytdlp_bin,
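+ # Format selector: prefer streams at or below 720p to keep the download small, falling back to best available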
208
+ "-f", "best[height<=720]/best[height<=480]/best",
209
+ "--no-warnings",
210
+ "--no-call-home",
211
+ "--no-progress",
212
+ "--restrict-filenames",
213
+ "--socket-timeout", "30",
214
+ "--retries", "3",
215
+ "--fragment-retries", "3",
216
+ "--user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
217
+ "--extractor-retries", "3",
218
+ "-o", out_path,
219
+ url,
220
+ ]
221
+ proc = await asyncio.create_subprocess_exec(
222
+ *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
223
+ )
224
+ try:
225
+ await asyncio.wait_for(proc.communicate(), timeout=config.STREAM_DOWNLOAD_TIMEOUT)
226
+ except asyncio.TimeoutError:
227
+ proc.kill()
228
+ raise RuntimeError("yt-dlp timed out")
229
+ if proc.returncode != 0:
230
+ # capture stderr for diagnostics
231
+ raise RuntimeError("yt-dlp failed (non-zero exit)")
232
+ # Resolve resulting file (first mp4 in dir)
233
+ for fname in os.listdir(tmp_dir):
234
+ if fname.lower().endswith((".mp4", ".mkv", ".webm", ".mov")):
235
+ return os.path.join(tmp_dir, fname)
236
+ raise RuntimeError("yt-dlp produced no playable file")
237
+ except Exception as e:
238
+ raise RuntimeError(f"yt-dlp error: {e}")
239
+
240
+ def _resolve_ytdlp_bin(self) -> str:
241
+ # Prefer configured path if executable, else try PATH
242
+ cand = config.YTDLP_BIN or "yt-dlp"
243
+ if os.path.isabs(cand) and os.path.isfile(cand) and os.access(cand, os.X_OK):
244
+ return cand
245
+ from shutil import which
246
+ found = which(cand) or which("yt-dlp")
247
+ if not found:
248
+ raise RuntimeError("yt-dlp not found on PATH; install yt-dlp or set YTDLP_BIN")
249
+ return found
250
+
251
+ def _is_youtube_url(self, url: str) -> bool:
252
+ """
253
+ Check if the URL is a YouTube URL
254
+
255
+ Args:
256
+ url: URL to check
257
+
258
+ Returns:
259
+ True if it's a YouTube URL, False otherwise
260
+ """
261
+ youtube_domains = [
262
+ 'youtube.com',
263
+ 'www.youtube.com',
264
+ 'youtu.be',
265
+ 'www.youtu.be',
266
+ 'm.youtube.com'
267
+ ]
268
+
269
+ url_lower = url.lower()
270
+ return any(domain in url_lower for domain in youtube_domains)
271
+
272
+ def _is_supported_platform(self, url: str) -> bool:
273
+ """
274
+ Check if the URL is from a platform supported by yt-dlp
275
+
276
+ Args:
277
+ url: URL to check
278
+
279
+ Returns:
280
+ True if it's a supported platform, False otherwise
281
+ """
282
+ supported_domains = [
283
+ # Video platforms
284
+ 'instagram.com', 'www.instagram.com',
285
+ 'tiktok.com', 'www.tiktok.com', 'vm.tiktok.com',
286
+ 'twitter.com', 'x.com', 'www.twitter.com', 'www.x.com',
287
+ 'facebook.com', 'www.facebook.com', 'fb.watch',
288
+ 'vimeo.com', 'www.vimeo.com',
289
+ 'twitch.tv', 'www.twitch.tv',
290
+ 'dailymotion.com', 'www.dailymotion.com',
291
+ 'youtube.com', 'www.youtube.com', 'youtu.be', 'www.youtu.be',
292
+
293
+ # Image platforms
294
+ 'imgur.com', 'www.imgur.com',
295
+ 'flickr.com', 'www.flickr.com',
296
+
297
+ # Audio platforms
298
+ 'soundcloud.com', 'www.soundcloud.com',
299
+ 'mixcloud.com', 'www.mixcloud.com',
300
+
301
+ # Alternative platforms
302
+ 'lbry.tv', 'odysee.com', 'www.odysee.com',
303
+ 'telegram.org', 't.me',
304
+ 'linkedin.com', 'www.linkedin.com',
305
+
306
+ # Other platforms
307
+ 'streamable.com', 'www.streamable.com',
308
+ 'rumble.com', 'www.rumble.com',
309
+ 'bitchute.com', 'www.bitchute.com',
310
+ 'peertube.tv', 'www.peertube.tv'
311
+ ]
312
+
313
+ url_lower = url.lower()
314
+ return any(domain in url_lower for domain in supported_domains)
315
+
316
+ async def _verify_with_ytdlp(self, url: str, claim_context: str, claim_date: str) -> Dict[str, Any]:
317
+ """
318
+ Verify a video from supported platforms using yt-dlp + visual analysis
319
+
320
+ Args:
321
+ url: Video URL from supported platform
322
+ claim_context: The claimed context of the video
323
+ claim_date: The claimed date of the video
324
+
325
+ Returns:
326
+ Dictionary with verification results
327
+ """
328
+ try:
329
+ print(f"🔍 DEBUG: Verifying video with yt-dlp: {url}")
330
+
331
+ # Download video using yt-dlp
332
+ video_path = await self._download_with_ytdlp(url)
333
+
334
+ # Extract frames for visual verification
335
+ frames = await self._extract_key_frames(video_path)
336
+
337
+ if frames:
338
+ # Perform visual analysis on frames
339
+ visual_analysis = await self._analyze_frames_visually(frames, claim_context, claim_date)
340
+
341
+ # Get platform info
342
+ platform = self._get_platform_name(url)
343
+
344
+ return {
345
+ 'verified': visual_analysis.get('verified', True),
346
+ 'message': f"✅ Video verified from {platform}: {visual_analysis.get('message', 'Visual analysis completed')}",
347
+ 'details': {
348
+ 'verification_method': 'ytdlp_plus_visual',
349
+ 'platform': platform,
350
+ 'url': url,
351
+ 'claim_context': claim_context,
352
+ 'claim_date': claim_date,
353
+ 'visual_analysis': visual_analysis.get('details', {}),
354
+ 'frames_analyzed': len(frames)
355
+ },
356
+ 'reasoning': f"Video verified from {platform} using yt-dlp and visual analysis. {visual_analysis.get('reasoning', '')}",
357
+ 'sources': [url]
358
+ }
359
+ else:
360
+ # Fallback to basic verification if frames can't be extracted
361
+ platform = self._get_platform_name(url)
362
+ return {
363
+ 'verified': True,
364
+ 'message': f"✅ Video verified from {platform} (basic verification - frame extraction failed)",
365
+ 'details': {
366
+ 'verification_method': 'ytdlp_basic',
367
+ 'platform': platform,
368
+ 'url': url,
369
+ 'claim_context': claim_context,
370
+ 'claim_date': claim_date,
371
+ 'limitation': 'Visual frame analysis unavailable'
372
+ },
373
+ 'reasoning': f"Video verified from {platform} using yt-dlp. Visual analysis was not possible due to frame extraction issues.",
374
+ 'sources': [url]
375
+ }
376
+
377
+ except Exception as e:
378
+ platform = self._get_platform_name(url)
379
+ return {
380
+ 'verified': False,
381
+ 'message': f'Error during {platform} video verification: {str(e)}',
382
+ 'details': {'error': str(e), 'platform': platform},
383
+ 'reasoning': f'An error occurred while verifying the {platform} video: {str(e)}',
384
+ 'sources': [url]
385
+ }
386
+
387
+ def _get_platform_name(self, url: str) -> str:
388
+ """Get the platform name from URL"""
389
+ url_lower = url.lower()
390
+
391
+ if 'instagram.com' in url_lower:
392
+ return 'Instagram'
393
+ elif 'tiktok.com' in url_lower or 'vm.tiktok.com' in url_lower:
394
+ return 'TikTok'
395
+ elif 'twitter.com' in url_lower or 'x.com' in url_lower:
396
+ return 'Twitter/X'
397
+ elif 'facebook.com' in url_lower or 'fb.watch' in url_lower:
398
+ return 'Facebook'
399
+ elif 'vimeo.com' in url_lower:
400
+ return 'Vimeo'
401
+ elif 'twitch.tv' in url_lower:
402
+ return 'Twitch'
403
+ elif 'dailymotion.com' in url_lower:
404
+ return 'DailyMotion'
405
+ elif 'imgur.com' in url_lower:
406
+ return 'Imgur'
407
+ elif 'soundcloud.com' in url_lower:
408
+ return 'SoundCloud'
409
+ elif 'mixcloud.com' in url_lower:
410
+ return 'Mixcloud'
411
+ elif 'lbry.tv' in url_lower or 'odysee.com' in url_lower:
412
+ return 'LBRY/Odysee'
413
+ elif 'telegram.org' in url_lower or 't.me' in url_lower:
414
+ return 'Telegram'
415
+ elif 'linkedin.com' in url_lower:
416
+ return 'LinkedIn'
417
+ else:
418
+ return 'Unknown Platform'
419
+
420
+ async def _verify_youtube_video(self, url: str, claim_context: str, claim_date: str) -> Dict[str, Any]:
421
+ """
422
+ Verify a YouTube video using hybrid approach: API metadata + yt-dlp for visual analysis
423
+
424
+ Args:
425
+ url: YouTube URL
426
+ claim_context: The claimed context of the video
427
+ claim_date: The claimed date of the video
428
+
429
+ Returns:
430
+ Dictionary with verification results
431
+ """
432
+ try:
433
+ # Step 1: Use YouTube Data API to verify the video exists and get metadata
434
+ verification_result = self.youtube_api.verify_video_exists(url)
435
+
436
+ if not verification_result.get('verified'):
437
+ return {
438
+ 'verified': False,
439
+ 'message': f'YouTube video verification failed: {verification_result.get("message", "Unknown error")}',
440
+ 'details': verification_result.get('details', {}),
441
+ 'reasoning': f'The video could not be verified through YouTube Data API. {verification_result.get("message", "Unknown error")}',
442
+ 'sources': [url]
443
+ }
444
+
445
+ # Step 2: Video exists, now try to download for visual analysis
446
+ video_details = verification_result.get('details', {})
447
+
448
+ try:
449
+ # Attempt to download video for frame analysis
450
+ print(f"🔍 DEBUG: Attempting to download video for visual analysis: {url}")
451
+ video_path = await self._download_with_ytdlp(url)
452
+
453
+ # Extract frames for visual verification
454
+ frames = await self._extract_key_frames(video_path)
455
+
456
+ if frames:
457
+ # Perform visual analysis on frames
458
+ visual_analysis = await self._analyze_frames_visually(frames, claim_context, claim_date)
459
+
460
+ # Combine metadata + visual analysis
461
+ return {
462
+ 'verified': visual_analysis.get('verified', True),
463
+ 'message': f"✅ Video verified with visual analysis: '{video_details.get('title', 'Unknown Title')}' by {video_details.get('channel_title', 'Unknown Channel')}\n\n{visual_analysis.get('message', '')}",
464
+ 'details': {
465
+ 'verification_method': 'hybrid_youtube_api_plus_visual',
466
+ 'video_id': video_details.get('video_id'),
467
+ 'title': video_details.get('title'),
468
+ 'channel_title': video_details.get('channel_title'),
469
+ 'published_at': video_details.get('published_at'),
470
+ 'duration': video_details.get('duration'),
471
+ 'view_count': video_details.get('view_count'),
472
+ 'thumbnail_url': video_details.get('thumbnail_url'),
473
+ 'claim_context': claim_context,
474
+ 'claim_date': claim_date,
475
+ 'visual_analysis': visual_analysis.get('details', {}),
476
+ 'frames_analyzed': len(frames)
477
+ },
478
+ 'reasoning': f"Video verified through YouTube Data API and visual analysis. {visual_analysis.get('reasoning', '')}",
479
+ 'sources': [url]
480
+ }
481
+ else:
482
+ # Fallback to metadata-only verification
483
+ print(f"⚠️ DEBUG: Could not extract frames, falling back to metadata verification")
484
+ return self._create_metadata_only_response(video_details, claim_context, claim_date, url)
485
+
486
+ except Exception as download_error:
487
+ # Fallback to metadata-only verification if download fails
488
+ print(f"⚠️ DEBUG: Video download failed: {download_error}, falling back to metadata verification")
489
+ return self._create_metadata_only_response(video_details, claim_context, claim_date, url)
490
+
491
+ except Exception as e:
492
+ return {
493
+ 'verified': False,
494
+ 'message': f'Error during YouTube video verification: {str(e)}',
495
+ 'details': {'error': str(e)},
496
+ 'reasoning': f'An error occurred while verifying the YouTube video: {str(e)}',
497
+ 'sources': [url]
498
+ }
499
+
500
+ def _create_metadata_only_response(self, video_details: Dict[str, Any], claim_context: str, claim_date: str, url: str) -> Dict[str, Any]:
501
+ """Create a metadata-only verification response when visual analysis fails"""
502
+ verification_message = f"✅ Video verified (metadata only): '{video_details.get('title', 'Unknown Title')}' by {video_details.get('channel_title', 'Unknown Channel')}"
503
+
504
+ # Add context analysis if available
505
+ if claim_context and claim_context.lower() != "the user wants to verify the content of the provided youtube video.":
506
+ verification_message += f"\n\n📝 Claim Context: {claim_context}"
507
+ verification_message += f"\n⚠️ Note: Visual content analysis unavailable - only metadata verification performed"
508
+
509
+ if claim_date and claim_date.strip():
510
+ verification_message += f"\n📅 Claimed Date: {claim_date}"
511
+
512
+ verification_message += f"\n📊 Video Stats: {video_details.get('view_count', 'Unknown')} views, Published: {video_details.get('published_at', 'Unknown')}"
513
+
514
+ return {
515
+ 'verified': True,
516
+ 'message': verification_message,
517
+ 'details': {
518
+ 'verification_method': 'youtube_data_api_metadata_only',
519
+ 'video_id': video_details.get('video_id'),
520
+ 'title': video_details.get('title'),
521
+ 'channel_title': video_details.get('channel_title'),
522
+ 'published_at': video_details.get('published_at'),
523
+ 'duration': video_details.get('duration'),
524
+ 'view_count': video_details.get('view_count'),
525
+ 'thumbnail_url': video_details.get('thumbnail_url'),
526
+ 'claim_context': claim_context,
527
+ 'claim_date': claim_date,
528
+ 'limitation': 'Visual content analysis unavailable'
529
+ },
530
+ 'reasoning': f"Video verified through YouTube Data API metadata only. Visual content analysis was not possible due to download limitations.",
531
+ 'sources': [url]
532
+ }
533
+
534
+ async def _analyze_frames_visually(self, frames: List[Tuple[str, float]], claim_context: str, claim_date: str) -> Dict[str, Any]:
535
+ """
536
+ Analyze extracted frames for visual verification
537
+
538
+ Args:
539
+ frames: List of (frame_path, timestamp) tuples
540
+ claim_context: The claimed context
541
+ claim_date: The claimed date
542
+
543
+ Returns:
544
+ Dictionary with visual analysis results
545
+ """
546
+ try:
547
+ # Analyze each frame using the image verifier
548
+ frame_analyses = []
549
+
550
+ for frame_path, timestamp in frames:
551
+ try:
552
+ frame_result = await self.image_verifier.verify(
553
+ image_path=frame_path,
554
+ claim_context=f"{claim_context} (Frame at {timestamp}s)",
555
+ claim_date=claim_date
556
+ )
557
+ frame_analyses.append({
558
+ 'timestamp': timestamp,
559
+ 'result': frame_result
560
+ })
561
+ except Exception as e:
562
+ print(f"⚠️ DEBUG: Frame analysis failed for {timestamp}s: {e}")
563
+ continue
564
+
565
+ if not frame_analyses:
566
+ return {
567
+ 'verified': False,
568
+ 'message': 'No frames could be analyzed',
569
+ 'details': {'error': 'All frame analyses failed'},
570
+ 'reasoning': 'Visual analysis failed for all extracted frames'
571
+ }
572
+
573
+ # Determine overall verification result
574
+ verified_count = sum(1 for analysis in frame_analyses if analysis['result'].get('verified', False))
575
+ total_frames = len(frame_analyses)
576
+
577
+ if verified_count == 0:
578
+ verification_status = False
579
+ message = f"❌ Visual analysis found no supporting evidence in {total_frames} frames"
580
+ elif verified_count == total_frames:
581
+ verification_status = True
582
+ message = f"✅ Visual analysis confirmed claim in all {total_frames} frames"
583
+ else:
584
+ verification_status = True # Partial verification
585
+ message = f"⚠️ Visual analysis partially confirmed claim in {verified_count}/{total_frames} frames"
586
+
587
+ return {
588
+ 'verified': verification_status,
589
+ 'message': message,
590
+ 'details': {
591
+ 'frames_analyzed': total_frames,
592
+ 'verified_frames': verified_count,
593
+ 'frame_results': frame_analyses
594
+ },
595
+ 'reasoning': f"Analyzed {total_frames} video frames. {verified_count} frames supported the claim."
596
+ }
597
+
598
+ except Exception as e:
599
+ return {
600
+ 'verified': False,
601
+ 'message': f'Visual analysis failed: {str(e)}',
602
+ 'details': {'error': str(e)},
603
+ 'reasoning': f'Error during visual frame analysis: {str(e)}'
604
+ }
605
+
606
+ async def _extract_key_frames(self, video_path: str) -> List[Tuple[str, float]]:
607
+ """
608
+ Extract key frames from video at regular intervals
609
+
610
+ Args:
611
+ video_path: Path to the video file
612
+
613
+ Returns:
614
+ List of tuples (frame_path, timestamp)
615
+ """
616
+ try:
617
+ frames = []
618
+ cap = cv2.VideoCapture(video_path)
619
+
620
+ if not cap.isOpened():
621
+ print(f"Error: Could not open video file {video_path}")
622
+ return []
623
+
624
+ # Get video properties
625
+ fps = cap.get(cv2.CAP_PROP_FPS)
626
+ total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
627
+ duration = total_frames / fps if fps > 0 else 0
628
+
629
+ frame_interval_frames = int(fps * self.frame_interval)
630
+
631
+ frame_count = 0
632
+ saved_count = 0
633
+
634
+ while True:
635
+ ret, frame = cap.read()
636
+ if not ret:
637
+ break
638
+
639
+ # Save frame at regular intervals
640
+ if frame_count % frame_interval_frames == 0:
641
+ timestamp = frame_count / fps
642
+ # Save frame into public/frames for local static serving
643
+ out_dir = os.path.join("public", "frames")
644
+ os.makedirs(out_dir, exist_ok=True)
645
+ frame_file = f"frame_{int(timestamp*1000)}.jpg"
646
+ frame_path = os.path.join(out_dir, frame_file)
647
+ cv2.imwrite(frame_path, frame, [int(cv2.IMWRITE_JPEG_QUALITY), 85])
648
+ frames.append((frame_path, timestamp))
649
+ saved_count += 1
650
+
651
+ # Limit number of frames to analyze
652
+ if saved_count >= 10: # Max 10 frames
653
+ break
654
+
655
+ frame_count += 1
656
+
657
+ cap.release()
658
+ return frames
659
+
660
+ except Exception as e:
661
+ print(f"Error extracting frames: {e}")
662
+ return []
663
+
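A quick worked check of the sampling arithmetic above; the fps and interval values here are illustrative assumptions (`self.frame_interval` is configured elsewhere in the class):

```python
# Illustrative numbers only: a 30 fps video sampled every 5 seconds, capped at 10 frames.
fps = 30.0
frame_interval = 5                                    # assumed value of self.frame_interval (seconds)
frame_interval_frames = int(fps * frame_interval)     # 150 source frames between saved frames

sampled = [(idx, idx / fps) for idx in range(0, 10 * frame_interval_frames, frame_interval_frames)]
print(sampled[:3])  # [(0, 0.0), (150, 5.0), (300, 10.0)] -> at most 10 frames covering ~45s of video
```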
664
+ async def _analyze_frames_with_vision(
665
+ self,
666
+ frames: List[Tuple[str, float]],
667
+ claim_context: str,
668
+ claim_date: str
669
+ ) -> Dict[str, Any]:
670
+ """
671
+ Analyze video frames directly with Gemini Vision (first pass).
672
+ Detects AI-generated/deepfake/manipulation in frames.
673
+
674
+ Args:
675
+ frames: List of (frame_path, timestamp) tuples
676
+ claim_context: The claimed context
677
+ claim_date: The claimed date
678
+
679
+ Returns:
680
+ Dictionary with preliminary vision analysis
681
+ """
682
+ try:
683
+ if not self.image_verifier.gemini_model:
684
+ return {
685
+ "overall_verdict": "uncertain",
686
+ "overall_summary": "Gemini Vision not available",
687
+ "frame_analyses": [],
688
+ "analysis_method": "vision_unavailable",
689
+ }
690
+
691
+ frame_analyses = []
692
+ for frame_path, timestamp in frames:
693
+ try:
694
+ # Use image verifier's vision analysis method
695
+ vision_result = await self.image_verifier._analyze_image_with_vision(
696
+ image_path=frame_path,
697
+ image_url=None,
698
+ claim_context=f"{claim_context} (Frame at {timestamp}s)",
699
+ claim_date=claim_date
700
+ )
701
+ frame_analyses.append({
702
+ "timestamp": timestamp,
703
+ "frame_path": frame_path,
704
+ "vision_analysis": vision_result,
705
+ })
706
+ except Exception as e:
707
+ print(f"⚠️ Vision analysis failed for frame at {timestamp}s: {e}")
708
+ continue
709
+
710
+ if not frame_analyses:
711
+ return {
712
+ "overall_verdict": "uncertain",
713
+ "overall_summary": "No frames could be analyzed with vision",
714
+ "frame_analyses": [],
715
+ "analysis_method": "vision_no_frames",
716
+ }
717
+
718
+ # Aggregate vision results across frames
719
+ false_count = sum(1 for fa in frame_analyses if fa["vision_analysis"].get("verdict") == "false")
720
+ true_count = sum(1 for fa in frame_analyses if fa["vision_analysis"].get("verdict") == "true")
721
+ uncertain_count = len(frame_analyses) - false_count - true_count
722
+
723
+ # Determine overall verdict
724
+ if false_count > true_count and false_count > uncertain_count:
725
+ overall_verdict = "false"
726
+ overall_summary = f"Vision analysis detected manipulation/AI-generated content in {false_count}/{len(frame_analyses)} frames"
727
+ elif true_count > false_count and true_count > uncertain_count:
728
+ overall_verdict = "true"
729
+ overall_summary = f"Vision analysis found authentic content in {true_count}/{len(frame_analyses)} frames"
730
+ else:
731
+ overall_verdict = "uncertain"
732
+ overall_summary = f"Vision analysis inconclusive: {true_count} true, {false_count} false, {uncertain_count} uncertain across {len(frame_analyses)} frames"
733
+
734
+ return {
735
+ "overall_verdict": overall_verdict,
736
+ "overall_summary": overall_summary,
737
+ "frame_analyses": frame_analyses,
738
+ "false_count": false_count,
739
+ "true_count": true_count,
740
+ "uncertain_count": uncertain_count,
741
+ "analysis_method": "gemini_vision",
742
+ }
743
+
744
+ except Exception as e:
745
+ print(f"[vision] Error in frame vision analysis: {e}")
746
+ return {
747
+ "overall_verdict": "uncertain",
748
+ "overall_summary": f"Error during vision analysis: {str(e)}",
749
+ "frame_analyses": [],
750
+ "analysis_method": "vision_error",
751
+ }
752
+
753
+ def _synthesize_video_analyses(
754
+ self,
755
+ preliminary_vision_analysis: Dict[str, Any],
756
+ reverse_search_analysis: Dict[str, Any],
757
+ frames: List[Tuple[str, float]],
758
+ claim_context: str,
759
+ claim_date: str,
760
+ ) -> Optional[Dict[str, Any]]:
761
+ """
762
+ Synthesize Gemini Vision analysis with reverse image search results.
763
+ """
764
+ try:
765
+ if not self.image_verifier.gemini_model:
766
+ return None
767
+
768
+ prompt = f"""You are an expert video verification analyst. Combine direct frame analysis (Gemini Vision) with reverse image search evidence to produce a final verdict.
769
+
770
+ CLAIM: {claim_context}
771
+ CLAIM DATE: {claim_date}
772
+
773
+ DIRECT FRAME ANALYSIS (Gemini Vision):
774
+ {json.dumps(preliminary_vision_analysis or {}, indent=2, ensure_ascii=False)}
775
+
776
+ REVERSE IMAGE SEARCH ANALYSIS:
777
+ {json.dumps(reverse_search_analysis or {}, indent=2, ensure_ascii=False)}
778
+
779
+ TOTAL FRAMES ANALYZED: {len(frames)}
780
+
781
+ INSTRUCTIONS:
782
+ - Combine both analyses to make a final decision (true/false/uncertain)
783
+ - If vision analysis detects AI-generated/manipulated content in multiple frames, prioritize that
784
+ - If reverse image search finds contradictory evidence, factor that in
785
+ - Consider consistency across frames
786
+ - If evidence is thin, keep the tone cautious
787
+ - Provide clear, actionable messaging for the end user
788
+
789
+ Respond ONLY in this JSON format:
790
+ {{
791
+ "overall_verdict": "true|false|uncertain",
792
+ "overall_summary": "Concise user-facing summary combining both analyses",
793
+ "confidence": "high|medium|low",
794
+ "reasoning": "Brief reasoning trail you followed",
795
+ "vision_findings": "Key findings from direct frame analysis",
796
+ "search_findings": "Key findings from reverse image search"
797
+ }}"""
798
+
799
+ response = self.image_verifier.gemini_model.generate_content(prompt)
800
+ response_text = response.text.strip()
801
+
802
+ if response_text.startswith("```json"):
803
+ response_text = response_text.replace("```json", "").replace("```", "").strip()
804
+ elif response_text.startswith("```"):
805
+ response_text = response_text.replace("```", "").strip()
806
+
807
+ final_analysis = json.loads(response_text)
808
+ final_analysis.setdefault("overall_verdict", "uncertain")
809
+ final_analysis.setdefault("overall_summary", "Unable to synthesize final verdict.")
810
+ final_analysis.setdefault("confidence", "low")
811
+ final_analysis["analysis_method"] = "hybrid_vision_and_search"
812
+
813
+ # Preserve frame summaries and sources from reverse search
814
+ final_analysis["frame_summaries"] = reverse_search_analysis.get("frame_summaries", [])
815
+ final_analysis["consolidated_sources"] = reverse_search_analysis.get("consolidated_sources", [])
816
+ final_analysis["preliminary_vision_analysis"] = preliminary_vision_analysis
817
+ final_analysis["reverse_search_analysis"] = reverse_search_analysis
818
+
819
+ return final_analysis
820
+
821
+ except Exception as e:
822
+ print(f"Video hybrid synthesis error: {e}")
823
+ return None
824
+
825
+ async def _analyze_frames(self, frames: List[Tuple[str, float]],
826
+ claim_context: str, claim_date: str) -> Dict[str, Any]:
827
+ """
828
+ Analyze extracted frames for false context
829
+
830
+ Args:
831
+ frames: List of (frame_path, timestamp) tuples
832
+ claim_context: The claimed context
833
+ claim_date: The claimed date
834
+
835
+ Returns:
836
+ Aggregated analysis with overall verdict and optional false frame
837
+ """
838
+ frame_summaries: List[Dict[str, Any]] = []
839
+ false_hit: Optional[Dict[str, Any]] = None
840
+ true_hit: Optional[Dict[str, Any]] = None
841
+ saw_false_validated = False
842
+ saw_true_validated = False
843
+ # 1) Per-frame: only gather evidence; defer verdict to a single final pass
844
+ all_evidence: List[Dict[str, Any]] = []
845
+ for frame_path, timestamp in frames:
846
+ try:
847
+ # Upload frame to Cloudinary if configured, else local static URL
848
+ frame_url = None
849
+ if config.CLOUDINARY_CLOUD_NAME and (config.CLOUDINARY_UPLOAD_PRESET or (config.CLOUDINARY_API_KEY and config.CLOUDINARY_API_SECRET)):
850
+ frame_url = await self._upload_frame_cloudinary(frame_path)
851
+ if not frame_url:
852
+ # fallback local (note: SerpApi can't access localhost; cloudinary is preferred)
853
+ from urllib.parse import quote
854
+ rel = frame_path.replace(os.path.join("public", ''), "") if frame_path.startswith("public"+os.sep) else os.path.basename(frame_path)
855
+ frame_url = f"http://127.0.0.1:{config.SERVICE_PORT}/static/{quote(rel)}"
856
+ print("[video] analyze_frame", {"ts": timestamp, "path": frame_path})
857
+ # Gather evidence only for this frame
858
+ ev = await self.image_verifier.gather_evidence(
859
+ image_path=None, image_url=frame_url, claim_context=claim_context
860
+ )
861
+ all_evidence.extend(ev or [])
862
+ # Populate a placeholder entry per frame (no verdict yet)
863
+ frame_entry = {
864
+ "timestamp": timestamp,
865
+ "verdict": None,
866
+ "summary": None,
867
+ "sources": None,
868
+ "frame_path": frame_path,
869
+ "validator": None,
870
+ "details": {"evidence": ev or []},
871
+ }
872
+ # Compute rule-based confidence (0..1)
873
+ conf = 0.2
874
+ reasons: List[str] = []
875
+ checks = {}
876
+ if frame_entry["verdict"] == "true":
877
+ if checks.get("relation_comention"):
878
+ conf += 0.3; reasons.append("relation_comention")
879
+ if frame_entry["verdict"] == "false":
880
+ if not checks.get("relation_comention"):
881
+ conf += 0.25; reasons.append("no_relation_support")
882
+ if checks.get("timeframe_citations") or checks.get("timeframe_match"):
883
+ conf += 0.15; reasons.append("timeframe_match")
884
+ eos = checks.get("entity_overlap_score")
885
+ try:
886
+ if eos is not None and float(eos) >= 0.7:
887
+ conf += 0.1; reasons.append("entity_overlap")
888
+ except Exception:
889
+ pass
890
+ # Penalize if sources dominated by low-priority domains
891
+ low_priority_hits = 0
892
+ total_sources = 0
893
+ try:
894
+ from urllib.parse import urlparse
895
+ for s in (frame_entry.get("sources") or []):
896
+ total_sources += 1
897
+ net = urlparse((s.get("link") or "")).netloc
898
+ if net in config.LOW_PRIORITY_DOMAINS:
899
+ low_priority_hits += 1
900
+ except Exception:
901
+ pass
902
+ if total_sources > 0 and low_priority_hits / float(total_sources) >= 0.5:
903
+ conf -= 0.2; reasons.append("low_priority_sources")
904
+ if conf < 0.0: conf = 0.0
905
+ if conf > 1.0: conf = 1.0
906
+ frame_entry["confidence"] = conf
907
+ frame_entry["confidence_reasons"] = reasons
908
+ print("[video] frame_result", {"ts": timestamp, "verdict": frame_entry["verdict"], "passed": (frame_entry.get("validator") or {}).get("passed")})
909
+ # No per-frame debug when gathering evidence only
910
+ frame_summaries.append(frame_entry)
911
+ # No per-frame validator flags when gathering evidence only
912
+ if false_hit is None:
913
+ false_hit = {
914
+ "timestamp": timestamp,
915
+ "frame_path": frame_path,
916
+ "evidence_image": None,
917
+ "details": {"evidence": ev or []},
918
+ }
919
+ if true_hit is None:
920
+ true_hit = {
921
+ "timestamp": timestamp,
922
+ "frame_path": frame_path,
923
+ "details": {"evidence": ev or []},
924
+ }
925
+
926
+ except Exception as e:
927
+ print(f"Error analyzing frame {frame_path}: {e}")
928
+ # Keep files even on error for debugging
929
+
930
+ # 2) Single final pass: send aggregated evidence to image verifier's Gemini summarizer
931
+ # Reuse image verifier's structured summarizer for a consolidated verdict
932
+ # Use the simple majority-based summarizer per product rule
933
+ final_llm = self.image_verifier._summarize_with_gemini_majority(
934
+ claim_context=claim_context,
935
+ claim_date=claim_date,
936
+ evidence=all_evidence[:24], # cap to keep prompt manageable
937
+ ) or {}
938
+ final_verdict = (final_llm.get("verdict") or "uncertain").lower()
939
+ # Prefer LLM clarification if present; else fallback to previous summary
940
+ final_summary = final_llm.get("clarification") or final_llm.get("summary") or "Consolidated evidence analyzed."
941
+
942
+ # Deterministic co-mention vote to override ambiguous LLM outcomes
943
+ def _tokens(text: str) -> List[str]:
944
+ import re
945
+ return re.findall(r"[a-z0-9]{3,}", (text or "").lower())
946
+
947
+ def _split_relation(claim: str) -> Tuple[List[str], List[str]]:
948
+ # Heuristic: split on ' with ' to get subject vs object; fallback to all tokens as subject
949
+ cl = (claim or "").strip()
950
+ i = cl.lower().find(" with ")
951
+ if i != -1:
952
+ subj = cl[:i].strip()
953
+ obj = cl[i+6:].strip().split(".")[0]
954
+ else:
955
+ subj = cl
956
+ obj = ""
957
+ return list(set(_tokens(subj))), list(set(_tokens(obj)))
958
+
959
+ def _evidence_text(ev: Dict[str, Any]) -> str:
960
+ return " ".join([t for t in [ev.get("title"), ev.get("snippet"), ev.get("source")] if t])
961
+
962
+ subj_toks, obj_toks = _split_relation(claim_context)
963
+ support = 0
964
+ contra = 0
965
+ for ev in all_evidence[:24]:
966
+ txt_toks = set(_tokens(_evidence_text(ev)))
967
+ if not txt_toks:
968
+ continue
969
+ subj_hit = bool(subj_toks and (set(subj_toks) & txt_toks))
970
+ obj_hit = bool(obj_toks and (set(obj_toks) & txt_toks))
971
+ if subj_hit and obj_hit:
972
+ support += 1
973
+ elif subj_hit and obj_toks:
974
+ # mentions subject but not the claimed object → treat as contradiction to the claimed relation
975
+ contra += 1
976
+
977
+ # Apply override rules: prioritize clear majority; else keep LLM
978
+ if support == 0 and contra > 0:
979
+ final_verdict = "false" # keep LLM clarification
980
+ elif support > contra and (support - contra) >= 1:
981
+ final_verdict = "true" # keep LLM clarification
982
+ elif contra > support and (contra - support) >= 1:
983
+ final_verdict = "false" # keep LLM clarification
984
+ # else keep LLM's verdict/summary
985
+
986
+ return {
987
+ "overall_verdict": final_verdict,
988
+ "overall_summary": final_summary,
989
+ "frame_summaries": frame_summaries,
990
+ "consolidated_sources": final_llm.get("top_sources") or self.image_verifier._top_sources(all_evidence, 3),
991
+ }
992
+
993
+ async def _upload_frame_cloudinary(self, frame_path: str) -> Optional[str]:
994
+ try:
995
+ import hashlib
996
+ import requests
997
+ cloud = config.CLOUDINARY_CLOUD_NAME
998
+ folder = config.CLOUDINARY_FOLDER.strip('/')
999
+ # Unsigned upload if preset provided
1000
+ if config.CLOUDINARY_UPLOAD_PRESET:
1001
+ url = f"https://api.cloudinary.com/v1_1/{cloud}/image/upload"
1002
+ with open(frame_path, 'rb') as f:
1003
+ files = {"file": f}
1004
+ data = {"upload_preset": config.CLOUDINARY_UPLOAD_PRESET, "folder": folder}
1005
+ r = requests.post(url, files=files, data=data, timeout=30)
1006
+ r.raise_for_status()
1007
+ return r.json().get("secure_url")
1008
+ # Signed upload
1009
+ ts = str(int(time.time()))
1010
+ params_to_sign = {"timestamp": ts, "folder": folder}
1011
+ to_sign = "&".join([f"{k}={v}" for k, v in sorted(params_to_sign.items())]) + config.CLOUDINARY_API_SECRET
1012
+ signature = hashlib.sha1(to_sign.encode('utf-8')).hexdigest()
1013
+ url = f"https://api.cloudinary.com/v1_1/{cloud}/image/upload"
1014
+ with open(frame_path, 'rb') as f:
1015
+ files = {"file": f}
1016
+ data = {
1017
+ "api_key": config.CLOUDINARY_API_KEY,
1018
+ "timestamp": ts,
1019
+ "signature": signature,
1020
+ "folder": folder,
1021
+ }
1022
+ r = requests.post(url, files=files, data=data, timeout=30)
1023
+ r.raise_for_status()
1024
+ return r.json().get("secure_url")
1025
+ except Exception as e:
1026
+ print(f"Cloudinary upload failed: {e}")
1027
+ return None
1028
+
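For reference, a tiny worked example of the signature string built in the signed-upload branch above (values are made up; the scheme mirrors the code: sorted `key=value` pairs joined with `&`, API secret appended, SHA-1 hex digest):

```python
import hashlib

# Illustrative values only; never hard-code a real API secret.
params_to_sign = {"timestamp": "1700000000", "folder": "frames"}
api_secret = "MY_SECRET"

# Cloudinary signing scheme: sorted key=value pairs joined by '&', secret appended, SHA-1 hex digest.
to_sign = "&".join(f"{k}={v}" for k, v in sorted(params_to_sign.items())) + api_secret
signature = hashlib.sha1(to_sign.encode("utf-8")).hexdigest()

print(to_sign)     # folder=frames&timestamp=1700000000MY_SECRET
print(signature)   # 40-character hex string sent as the 'signature' form field
```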
1029
+ async def _generate_video_counter_measure(self, video_path: str,
1030
+ false_context_frame: Dict[str, Any],
1031
+ claim_context: str, claim_date: str) -> str:
1032
+ """
1033
+ Generate a video counter-measure
1034
+
1035
+ Args:
1036
+ video_path: Path to the original video
1037
+ false_context_frame: Information about the false context frame
1038
+ claim_context: The claimed context
1039
+ claim_date: The claimed date
1040
+
1041
+ Returns:
1042
+ Path to the generated counter-measure video
1043
+ """
1044
+ try:
1045
+ # Create temporary directory for video processing
1046
+ temp_dir = tempfile.mkdtemp()
1047
+
1048
+ # Generate video components
1049
+ title_clip = await self._create_title_clip(temp_dir, claim_context, claim_date)
1050
+ misleading_clip = await self._create_misleading_clip(
1051
+ video_path, false_context_frame["timestamp"], temp_dir
1052
+ )
1053
+ debunk_clip = await self._create_debunk_clip(
1054
+ temp_dir, false_context_frame, claim_context, claim_date
1055
+ )
1056
+ verdict_clip = await self._create_verdict_clip(temp_dir)
1057
+
1058
+ # Concatenate all clips
1059
+ output_path = await self._concatenate_clips(
1060
+ [title_clip, misleading_clip, debunk_clip, verdict_clip],
1061
+ temp_dir
1062
+ )
1063
+
1064
+ # Clean up temporary files
1065
+ self._cleanup_temp_files(temp_dir)
1066
+
1067
+ # Attempt Cloudinary cleanup (best-effort) before responding
1068
+ await self._cloudinary_cleanup_prefix(config.CLOUDINARY_FOLDER or "frames")
1069
+ return output_path
1070
+
1071
+ except Exception as e:
1072
+ print(f"Error generating video counter-measure: {e}")
1073
+ raise
1074
+
1075
+ async def _create_title_clip(self, temp_dir: str, claim_context: str, claim_date: str) -> str:
1076
+ """Create title clip with claim information"""
1077
+ try:
1078
+ # Create title image
1079
+ img = Image.new('RGB', (800, 400), 'white')
1080
+ draw = ImageDraw.Draw(img)
1081
+
1082
+ try:
1083
+ font_large = ImageFont.truetype("/System/Library/Fonts/Arial.ttf", 36)
1084
+ font_medium = ImageFont.truetype("/System/Library/Fonts/Arial.ttf", 24)
1085
+ except Exception:
1086
+ font_large = ImageFont.load_default()
1087
+ font_medium = ImageFont.load_default()
1088
+
1089
+ # Add title
1090
+ title = "FALSE CONTEXT DETECTED"
1091
+ title_bbox = draw.textbbox((0, 0), title, font=font_large)
1092
+ title_width = title_bbox[2] - title_bbox[0]
1093
+ title_x = (800 - title_width) // 2
1094
+ draw.text((title_x, 100), title, fill='red', font=font_large)
1095
+
1096
+ # Add claim details
1097
+ claim_text = f"Claim: {claim_context}, {claim_date}"
1098
+ claim_bbox = draw.textbbox((0, 0), claim_text, font=font_medium)
1099
+ claim_width = claim_bbox[2] - claim_bbox[0]
1100
+ claim_x = (800 - claim_width) // 2
1101
+ draw.text((claim_x, 200), claim_text, fill='black', font=font_medium)
1102
+
1103
+ # Save image
1104
+ title_img_path = os.path.join(temp_dir, "title.png")
1105
+ img.save(title_img_path)
1106
+
1107
+ # Convert to video clip
1108
+ title_video_path = os.path.join(temp_dir, "title.mp4")
1109
+ await self._image_to_video(title_img_path, title_video_path, duration=3)
1110
+
1111
+ return title_video_path
1112
+
1113
+ except Exception as e:
1114
+ print(f"Error creating title clip: {e}")
1115
+ raise
1116
+
1117
+ async def _create_misleading_clip(self, video_path: str, timestamp: float, temp_dir: str) -> str:
1118
+ """Create clip from original misleading video"""
1119
+ try:
1120
+ # Calculate frame numbers for 5-second clip
1121
+ cap = cv2.VideoCapture(video_path)
1122
+ fps = cap.get(cv2.CAP_PROP_FPS)
1123
+ cap.release()
1124
+
1125
+ start_frame = int(timestamp * fps) - int(self.clip_duration / 2 * fps)
1126
+ start_frame = max(0, start_frame)
1127
+
1128
+ # Extract clip using ffmpeg
1129
+ clip_path = os.path.join(temp_dir, "misleading_clip.mp4")
1130
+
1131
+ start_time = max(0, timestamp - self.clip_duration / 2)
1132
+
1133
+ cmd = [
1134
+ 'ffmpeg', '-i', video_path,
1135
+ '-ss', str(start_time),
1136
+ '-t', str(self.clip_duration),
1137
+ '-c', 'copy',
1138
+ '-y', clip_path
1139
+ ]
1140
+
1141
+ process = await asyncio.create_subprocess_exec(
1142
+ *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
1143
+ )
1144
+ await process.communicate()
1145
+
1146
+ if process.returncode != 0:
1147
+ raise Exception("FFmpeg failed to create misleading clip")
1148
+
1149
+ return clip_path
1150
+
1151
+ except Exception as e:
1152
+ print(f"Error creating misleading clip: {e}")
1153
+ raise
1154
+
1155
+ async def _create_debunk_clip(self, temp_dir: str, false_context_frame: Dict[str, Any],
1156
+ claim_context: str, claim_date: str) -> str:
1157
+ """Create debunk scene clip with side-by-side comparison"""
1158
+ try:
1159
+ # Create debunk image using image verifier's counter-measure
1160
+ debunk_img_path = await self.image_verifier._generate_counter_measure(
1161
+ false_context_frame["frame_path"],
1162
+ false_context_frame["evidence_image"],
1163
+ claim_context,
1164
+ claim_date
1165
+ )
1166
+
1167
+ # Move to temp directory
1168
+ final_debunk_img = os.path.join(temp_dir, "debunk.png")
1169
+ os.rename(debunk_img_path, final_debunk_img)
1170
+
1171
+ # Convert to video clip
1172
+ debunk_video_path = os.path.join(temp_dir, "debunk.mp4")
1173
+ await self._image_to_video(final_debunk_img, debunk_video_path, duration=5)
1174
+
1175
+ return debunk_video_path
1176
+
1177
+ except Exception as e:
1178
+ print(f"Error creating debunk clip: {e}")
1179
+ raise
1180
+
1181
+ async def _create_verdict_clip(self, temp_dir: str) -> str:
1182
+ """Create verdict clip with conclusion"""
1183
+ try:
1184
+ # Create verdict image
1185
+ img = Image.new('RGB', (800, 400), 'white')
1186
+ draw = ImageDraw.Draw(img)
1187
+
1188
+ try:
1189
+ font_large = ImageFont.truetype("/System/Library/Fonts/Arial.ttf", 36)
1190
+ font_medium = ImageFont.truetype("/System/Library/Fonts/Arial.ttf", 24)
1191
+ except Exception:
1192
+ font_large = ImageFont.load_default()
1193
+ font_medium = ImageFont.load_default()
1194
+
1195
+ # Add verdict
1196
+ verdict = "VERDICT: FALSE CONTEXT"
1197
+ verdict_bbox = draw.textbbox((0, 0), verdict, font=font_large)
1198
+ verdict_width = verdict_bbox[2] - verdict_bbox[0]
1199
+ verdict_x = (800 - verdict_width) // 2
1200
+ draw.text((verdict_x, 100), verdict, fill='red', font=font_large)
1201
+
1202
+ # Add explanation
1203
+ explanation = "This content is being used in a false context"
1204
+ explanation_bbox = draw.textbbox((0, 0), explanation, font=font_medium)
1205
+ explanation_width = explanation_bbox[2] - explanation_bbox[0]
1206
+ explanation_x = (800 - explanation_width) // 2
1207
+ draw.text((explanation_x, 200), explanation, fill='black', font=font_medium)
1208
+
1209
+ # Save image
1210
+ verdict_img_path = os.path.join(temp_dir, "verdict.png")
1211
+ img.save(verdict_img_path)
1212
+
1213
+ # Convert to video clip
1214
+ verdict_video_path = os.path.join(temp_dir, "verdict.mp4")
1215
+ await self._image_to_video(verdict_img_path, verdict_video_path, duration=3)
1216
+
1217
+ return verdict_video_path
1218
+
1219
+ except Exception as e:
1220
+ print(f"Error creating verdict clip: {e}")
1221
+ raise
1222
+
1223
+ async def _image_to_video(self, image_path: str, video_path: str, duration: int) -> None:
1224
+ """Convert image to video clip using FFmpeg"""
1225
+ try:
1226
+ cmd = [
1227
+ 'ffmpeg', '-loop', '1',
1228
+ '-i', image_path,
1229
+ '-c:v', 'libx264',
1230
+ '-t', str(duration),
1231
+ '-pix_fmt', 'yuv420p',
1232
+ '-y', video_path
1233
+ ]
1234
+
1235
+ process = await asyncio.create_subprocess_exec(
1236
+ *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
1237
+ )
1238
+ await process.communicate()
1239
+
1240
+ if process.returncode != 0:
1241
+ raise Exception("FFmpeg failed to convert image to video")
1242
+
1243
+ except Exception as e:
1244
+ print(f"Error converting image to video: {e}")
1245
+ raise
1246
+
1247
+ async def _concatenate_clips(self, clip_paths: List[str], temp_dir: str) -> str:
1248
+ """Concatenate multiple video clips into one"""
1249
+ try:
1250
+ # Create file list for FFmpeg
1251
+ file_list_path = os.path.join(temp_dir, "clips.txt")
1252
+ with open(file_list_path, 'w') as f:
1253
+ for clip_path in clip_paths:
1254
+ f.write(f"file '{clip_path}'\n")
1255
+
1256
+ # Concatenate clips
1257
+ output_path = tempfile.mktemp(suffix=".mp4")
1258
+
1259
+ cmd = [
1260
+ 'ffmpeg', '-f', 'concat',
1261
+ '-safe', '0',
1262
+ '-i', file_list_path,
1263
+ '-c', 'copy',
1264
+ '-y', output_path
1265
+ ]
1266
+
1267
+ process = await asyncio.create_subprocess_exec(
1268
+ *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
1269
+ )
1270
+ await process.communicate()
1271
+
1272
+ if process.returncode != 0:
1273
+ raise Exception("FFmpeg failed to concatenate clips")
1274
+
1275
+ return output_path
1276
+
1277
+ except Exception as e:
1278
+ print(f"Error concatenating clips: {e}")
1279
+ raise
1280
+
1281
+ def _cleanup_temp_files(self, temp_dir: str) -> None:
1282
+ """Clean up temporary files and directory"""
1283
+ try:
1284
+ import shutil
1285
+ shutil.rmtree(temp_dir)
1286
+ except Exception as e:
1287
+ print(f"Error cleaning up temp files: {e}")
1288
+
1289
+ async def _cloudinary_cleanup_prefix(self, prefix: str) -> None:
1290
+ try:
1291
+ if not (config.CLOUDINARY_CLOUD_NAME and (config.CLOUDINARY_API_KEY and config.CLOUDINARY_API_SECRET)):
1292
+ return
1293
+ # List and delete all resources under the folder prefix (rate-limited; best-effort)
1294
+ import requests
1295
+ from requests.auth import HTTPBasicAuth
1296
+ cloud = config.CLOUDINARY_CLOUD_NAME
1297
+ auth = HTTPBasicAuth(config.CLOUDINARY_API_KEY, config.CLOUDINARY_API_SECRET)
1298
+ list_url = f"https://api.cloudinary.com/v1_1/{cloud}/resources/image"
1299
+ params = {"prefix": prefix, "max_results": 100}
1300
+ r = requests.get(list_url, params=params, auth=auth, timeout=20)
1301
+ if r.status_code != 200:
1302
+ return
1303
+ data = r.json()
1304
+ public_ids = [res.get("public_id") for res in data.get("resources", []) if res.get("public_id")]
1305
+ if not public_ids:
1306
+ return
1307
+ del_url = f"https://api.cloudinary.com/v1_1/{cloud}/resources/image/delete_by_ids"
1308
+ requests.post(del_url, data={"public_ids": ",".join(public_ids)}, auth=auth, timeout=20)
1309
+ except Exception as e:
1310
+ print(f"Cloudinary cleanup failed: {e}")
services/websocket_service.py ADDED
@@ -0,0 +1,239 @@
1
+ """
2
+ WebSocket Service for Real-time Updates
3
+ Handles WebSocket connections and MongoDB Change Streams for real-time data updates
4
+ """
5
+
6
+ import asyncio
7
+ import json
8
+ import logging
9
+ from typing import Set, Dict, Any, Optional
10
+ from fastapi import WebSocket, WebSocketDisconnect
11
+ from pymongo import MongoClient
12
+ from pymongo.errors import ConnectionFailure
13
+ import os
14
+ from dotenv import load_dotenv
15
+
16
+ load_dotenv()
17
+
18
+ # Setup logging
19
+ logger = logging.getLogger(__name__)
20
+
21
+ class ConnectionManager:
22
+ """Manages WebSocket connections"""
23
+
24
+ def __init__(self):
25
+ self.active_connections: Set[WebSocket] = set()
26
+ self.connection_data: Dict[WebSocket, Dict[str, Any]] = {}
27
+
28
+ async def connect(self, websocket: WebSocket, client_info: Optional[Dict[str, Any]] = None):
29
+ """Accept a new WebSocket connection"""
30
+ await websocket.accept()
31
+ self.active_connections.add(websocket)
32
+ self.connection_data[websocket] = client_info or {}
33
+ logger.info(f"✅ WebSocket connected. Total connections: {len(self.active_connections)}")
34
+
35
+ def disconnect(self, websocket: WebSocket):
36
+ """Remove a WebSocket connection"""
37
+ if websocket in self.active_connections:
38
+ self.active_connections.remove(websocket)
39
+ if websocket in self.connection_data:
40
+ del self.connection_data[websocket]
41
+ logger.info(f"🔌 WebSocket disconnected. Total connections: {len(self.active_connections)}")
42
+
43
+ async def send_personal_message(self, message: str, websocket: WebSocket):
44
+ """Send a message to a specific WebSocket connection"""
45
+ try:
46
+ await websocket.send_text(message)
47
+ except Exception as e:
48
+ logger.error(f"❌ Failed to send personal message: {e}")
49
+ self.disconnect(websocket)
50
+
51
+ async def broadcast(self, message: str):
52
+ """Broadcast a message to all connected WebSocket clients"""
53
+ if not self.active_connections:
54
+ logger.warning("⚠️ No active connections to broadcast to")
55
+ return
56
+
57
+ disconnected = set()
58
+ for connection in self.active_connections:
59
+ try:
60
+ await connection.send_text(message)
61
+ except Exception as e:
62
+ logger.error(f"❌ Failed to broadcast to connection: {e}")
63
+ disconnected.add(connection)
64
+
65
+ # Clean up disconnected connections
66
+ for connection in disconnected:
67
+ self.disconnect(connection)
68
+
69
+ logger.info(f"📡 Broadcasted message to {len(self.active_connections)} connections")
70
+
71
+ class MongoDBChangeStreamService:
72
+ """Service to monitor MongoDB changes and notify WebSocket clients"""
73
+
74
+ def __init__(self, connection_string: Optional[str] = None):
75
+ """Initialize MongoDB connection for change streams"""
76
+ self.connection_string = connection_string or os.getenv('MONGO_CONNECTION_STRING')
77
+
78
+ if not self.connection_string:
79
+ raise ValueError("MongoDB connection string is required. Set MONGO_CONNECTION_STRING environment variable.")
80
+
81
+ self.client = None
82
+ self.db = None
83
+ self.collection = None
84
+ self.change_stream = None
85
+ self.is_running = False
86
+
87
+ self._connect()
88
+
89
+ def _connect(self):
90
+ """Establish MongoDB connection"""
91
+ try:
92
+ self.client = MongoClient(self.connection_string)
93
+ # Test connection
94
+ self.client.admin.command('ping')
95
+
96
+ # Use 'aegis' database
97
+ self.db = self.client['aegis']
98
+ self.collection = self.db['debunk_posts']
99
+
100
+ logger.info("✅ MongoDB Change Stream service connected successfully")
101
+
102
+ except ConnectionFailure as e:
103
+ logger.error(f"❌ Failed to connect to MongoDB for change streams: {e}")
104
+ raise
105
+
106
+ async def start_change_stream(self, connection_manager: ConnectionManager):
107
+ """Start monitoring MongoDB changes and broadcast to WebSocket clients"""
108
+ if self.is_running:
109
+ logger.warning("⚠️ Change stream is already running")
110
+ return
111
+
112
+ try:
113
+ # Check if MongoDB supports change streams (replica set)
114
+ try:
115
+ # Try to create change stream to watch for insertions
116
+ self.change_stream = self.collection.watch([
117
+ {
118
+ '$match': {
119
+ 'operationType': 'insert'
120
+ }
121
+ }
122
+ ])
123
+
124
+ self.is_running = True
125
+ logger.info("🔄 Started MongoDB change stream monitoring")
126
+
127
+ # Process change stream events
128
+ async def process_changes():
129
+ try:
130
+ while self.is_running:
131
+ if self.change_stream:
132
+ # Check for new changes (non-blocking)
133
+ try:
134
+ change = self.change_stream.try_next()
135
+ if change:
136
+ await self._handle_change(change, connection_manager)
137
+ else:
138
+ # No changes, sleep briefly
139
+ await asyncio.sleep(0.5)
140
+ except Exception as e:
141
+ logger.error(f"❌ Error processing change: {e}")
142
+ await asyncio.sleep(1) # Brief pause on error
143
+ continue
144
+ else:
145
+ await asyncio.sleep(1)
146
+
147
+ except Exception as e:
148
+ logger.error(f"❌ Error in change stream processing: {e}")
149
+ finally:
150
+ self.is_running = False
151
+
152
+ # Start the change stream processing in the background
153
+ asyncio.create_task(process_changes())
154
+
155
+ except Exception as change_stream_error:
156
+ logger.warning(f"⚠️ MongoDB change streams not available: {change_stream_error}")
157
+ logger.info("🔄 Change streams require MongoDB replica set. WebSocket will work for manual updates.")
158
+ # Don't fail completely, just disable change streams
159
+ self.is_running = False
160
+ self.change_stream = None
161
+
162
+ except Exception as e:
163
+ logger.error(f"❌ Failed to start change stream: {e}")
164
+ self.is_running = False
165
+ # Don't raise the exception, allow WebSocket to work without change streams
166
+
167
+ async def _handle_change(self, change: Dict[str, Any], connection_manager: ConnectionManager):
168
+ """Handle a MongoDB change event"""
169
+ try:
170
+ logger.info(f"🔄 MongoDB change detected: {change.get('operationType')}")
171
+
172
+ # Extract the new document
173
+ new_document = change.get('fullDocument')
174
+ if not new_document:
175
+ logger.warning("⚠️ No full document in change event")
176
+ return
177
+
178
+ # Convert ObjectId to string for JSON serialization
179
+ if '_id' in new_document:
180
+ new_document['_id'] = str(new_document['_id'])
181
+
182
+ # Create the broadcast message
183
+ message = {
184
+ "type": "new_post",
185
+ "data": {
186
+ "post": new_document,
187
+ "timestamp": change.get('clusterTime'),
188
+ "operation": change.get('operationType')
189
+ }
190
+ }
191
+
192
+ # Broadcast to all connected clients (serialize datetimes/ObjectIds)
193
+ await connection_manager.broadcast(json.dumps(message, default=str))
194
+ logger.info(f"📡 Broadcasted new post to {len(connection_manager.active_connections)} clients")
195
+
196
+ except Exception as e:
197
+ logger.error(f"❌ Error handling MongoDB change: {e}")
198
+
199
+ def stop_change_stream(self):
200
+ """Stop the MongoDB change stream"""
201
+ self.is_running = False
202
+ if self.change_stream:
203
+ self.change_stream.close()
204
+ self.change_stream = None
205
+ logger.info("🛑 Stopped MongoDB change stream")
206
+
207
+
208
+ def close(self):
209
+ """Close MongoDB connection"""
210
+ self.stop_change_stream()
211
+ if self.client:
212
+ self.client.close()
213
+ logger.info("🔌 MongoDB Change Stream service connection closed")
214
+
215
+ # Global instances
216
+ connection_manager = ConnectionManager()
217
+ mongodb_change_service = None
218
+
219
+ async def initialize_mongodb_change_stream():
220
+ """Initialize the MongoDB change stream service"""
221
+ global mongodb_change_service
222
+
223
+ try:
224
+ mongodb_change_service = MongoDBChangeStreamService()
225
+ await mongodb_change_service.start_change_stream(connection_manager)
226
+ logger.info("✅ MongoDB Change Stream service initialized successfully")
227
+ return mongodb_change_service
228
+ except Exception as e:
229
+ logger.error(f"❌ Failed to initialize MongoDB Change Stream service: {e}")
230
+ return None
231
+
232
+ async def cleanup_mongodb_change_stream():
233
+ """Cleanup the MongoDB change stream service"""
234
+ global mongodb_change_service
235
+
236
+ if mongodb_change_service:
237
+ mongodb_change_service.close()
238
+ mongodb_change_service = None
239
+ logger.info("🧹 MongoDB Change Stream service cleaned up")
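As a usage sketch (the endpoint path and the app wiring below are assumptions, not part of this commit), the manager and change-stream service are intended to be plugged into FastAPI roughly like this:

```python
# Hypothetical wiring in main.py; the endpoint path and lifecycle hooks are assumptions, not part of this diff.
from fastapi import FastAPI, WebSocket, WebSocketDisconnect

from services.websocket_service import (
    connection_manager,
    initialize_mongodb_change_stream,
    cleanup_mongodb_change_stream,
)

app = FastAPI()

@app.on_event("startup")
async def startup() -> None:
    # Starts change-stream monitoring in the background (degrades gracefully without a replica set)
    await initialize_mongodb_change_stream()

@app.on_event("shutdown")
async def shutdown() -> None:
    await cleanup_mongodb_change_stream()

@app.websocket("/ws/posts")
async def posts_feed(websocket: WebSocket) -> None:
    await connection_manager.connect(websocket)
    try:
        while True:
            # Keep the socket open; new-post broadcasts arrive via the change stream
            await websocket.receive_text()
    except WebSocketDisconnect:
        connection_manager.disconnect(websocket)
```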
services/youtube_api.py ADDED
@@ -0,0 +1,211 @@
1
+ import os
2
+ import requests
3
+ from typing import Dict, Any, Optional
4
+ from config import config
5
+
6
+ class YouTubeDataAPI:
7
+ """
8
+ YouTube Data API v3 integration for video verification
9
+ """
10
+
11
+ def __init__(self, api_key: Optional[str] = None):
12
+ """
13
+ Initialize YouTube Data API client
14
+
15
+ Args:
16
+ api_key: Google API key. If None, will try to get from environment
17
+ """
18
+ self.api_key = api_key or config.GOOGLE_API_KEY
19
+ if not self.api_key:
20
+ raise ValueError("GOOGLE_API_KEY environment variable or api_key parameter is required")
21
+
22
+ self.base_url = "https://www.googleapis.com/youtube/v3"
23
+
24
+ def extract_video_id(self, url: str) -> Optional[str]:
25
+ """
26
+ Extract video ID from YouTube URL
27
+
28
+ Args:
29
+ url: YouTube URL (various formats supported)
30
+
31
+ Returns:
32
+ Video ID or None if not found
33
+ """
34
+ import re
35
+
36
+ # YouTube URL patterns
37
+ patterns = [
38
+ r'(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/)([a-zA-Z0-9_-]{11})',
39
+ r'youtube\.com\/v\/([a-zA-Z0-9_-]{11})',
40
+ r'youtube\.com\/shorts\/([a-zA-Z0-9_-]{11})'
41
+ ]
42
+
43
+ for pattern in patterns:
44
+ match = re.search(pattern, url)
45
+ if match:
46
+ return match.group(1)
47
+
48
+ return None
49
+
50
+ def get_video_info(self, video_id: str) -> Dict[str, Any]:
51
+ """
52
+ Get video information from YouTube Data API
53
+
54
+ Args:
55
+ video_id: YouTube video ID
56
+
57
+ Returns:
58
+ Dictionary with video information
59
+ """
60
+ try:
61
+ url = f"{self.base_url}/videos"
62
+ params = {
63
+ 'key': self.api_key,
64
+ 'id': video_id,
65
+ 'part': 'snippet,statistics,contentDetails'
66
+ }
67
+
68
+ response = requests.get(url, params=params, timeout=30)
69
+ response.raise_for_status()
70
+
71
+ data = response.json()
72
+
73
+ if not data.get('items'):
74
+ return {
75
+ 'success': False,
76
+ 'error': 'Video not found or not accessible'
77
+ }
78
+
79
+ video = data['items'][0]
80
+ snippet = video.get('snippet', {})
81
+ statistics = video.get('statistics', {})
82
+ content_details = video.get('contentDetails', {})
83
+
84
+ return {
85
+ 'success': True,
86
+ 'video_id': video_id,
87
+ 'title': snippet.get('title', 'Unknown Title'),
88
+ 'description': snippet.get('description', ''),
89
+ 'channel_title': snippet.get('channelTitle', 'Unknown Channel'),
90
+ 'published_at': snippet.get('publishedAt', ''),
91
+ 'duration': content_details.get('duration', ''),
92
+ 'view_count': statistics.get('viewCount', '0'),
93
+ 'like_count': statistics.get('likeCount', '0'),
94
+ 'comment_count': statistics.get('commentCount', '0'),
95
+ 'tags': snippet.get('tags', []),
96
+ 'category_id': snippet.get('categoryId', ''),
97
+ 'thumbnail_url': snippet.get('thumbnails', {}).get('high', {}).get('url', ''),
98
+ 'raw_data': video
99
+ }
100
+
101
+ except requests.exceptions.RequestException as e:
102
+ return {
103
+ 'success': False,
104
+ 'error': f'API request failed: {str(e)}'
105
+ }
106
+ except Exception as e:
107
+ return {
108
+ 'success': False,
109
+ 'error': f'Unexpected error: {str(e)}'
110
+ }
111
+
112
+ def search_videos(self, query: str, max_results: int = 10) -> Dict[str, Any]:
113
+ """
114
+ Search for videos using YouTube Data API
115
+
116
+ Args:
117
+ query: Search query
118
+ max_results: Maximum number of results to return
119
+
120
+ Returns:
121
+ Dictionary with search results
122
+ """
123
+ try:
124
+ url = f"{self.base_url}/search"
125
+ params = {
126
+ 'key': self.api_key,
127
+ 'q': query,
128
+ 'part': 'snippet',
129
+ 'type': 'video',
130
+ 'maxResults': max_results,
131
+ 'order': 'relevance'
132
+ }
133
+
134
+ response = requests.get(url, params=params, timeout=30)
135
+ response.raise_for_status()
136
+
137
+ data = response.json()
138
+
139
+ videos = []
140
+ for item in data.get('items', []):
141
+ snippet = item.get('snippet', {})
142
+ videos.append({
143
+ 'video_id': item.get('id', {}).get('videoId', ''),
144
+ 'title': snippet.get('title', ''),
145
+ 'description': snippet.get('description', ''),
146
+ 'channel_title': snippet.get('channelTitle', ''),
147
+ 'published_at': snippet.get('publishedAt', ''),
148
+ 'thumbnail_url': snippet.get('thumbnails', {}).get('high', {}).get('url', '')
149
+ })
150
+
151
+ return {
152
+ 'success': True,
153
+ 'videos': videos,
154
+ 'total_results': data.get('pageInfo', {}).get('totalResults', 0)
155
+ }
156
+
157
+ except requests.exceptions.RequestException as e:
158
+ return {
159
+ 'success': False,
160
+ 'error': f'API request failed: {str(e)}'
161
+ }
162
+ except Exception as e:
163
+ return {
164
+ 'success': False,
165
+ 'error': f'Unexpected error: {str(e)}'
166
+ }
167
+
168
+ def verify_video_exists(self, url: str) -> Dict[str, Any]:
169
+ """
170
+ Verify if a YouTube video exists and is accessible
171
+
172
+ Args:
173
+ url: YouTube URL
174
+
175
+ Returns:
176
+ Dictionary with verification results
177
+ """
178
+ video_id = self.extract_video_id(url)
179
+
180
+ if not video_id:
181
+ return {
182
+ 'verified': False,
183
+ 'message': 'Invalid YouTube URL format',
184
+ 'details': {'error': 'Could not extract video ID from URL'}
185
+ }
186
+
187
+ video_info = self.get_video_info(video_id)
188
+
189
+ if not video_info.get('success'):
190
+ return {
191
+ 'verified': False,
192
+ 'message': f'Video verification failed: {video_info.get("error", "Unknown error")}',
193
+ 'details': {
194
+ 'video_id': video_id,
195
+ 'error': video_info.get('error', 'Unknown error')
196
+ }
197
+ }
198
+
199
+ return {
200
+ 'verified': True,
201
+ 'message': f'Video verified successfully: "{video_info["title"]}" by {video_info["channel_title"]}',
202
+ 'details': {
203
+ 'video_id': video_id,
204
+ 'title': video_info['title'],
205
+ 'channel_title': video_info['channel_title'],
206
+ 'published_at': video_info['published_at'],
207
+ 'duration': video_info['duration'],
208
+ 'view_count': video_info['view_count'],
209
+ 'thumbnail_url': video_info['thumbnail_url']
210
+ }
211
+ }
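A quick usage sketch of the class above (the example URL is arbitrary; the key is read from `GOOGLE_API_KEY` via `config`):

```python
# Minimal usage sketch; assumes GOOGLE_API_KEY is available via config/environment.
from services.youtube_api import YouTubeDataAPI

yt = YouTubeDataAPI()
result = yt.verify_video_exists("https://www.youtube.com/watch?v=dQw4w9WgXcQ")

if result["verified"]:
    details = result["details"]
    print(f"{details['title']} by {details['channel_title']} ({details['view_count']} views)")
else:
    print(result["message"])
```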
services/youtube_caption.py ADDED
@@ -0,0 +1,141 @@
1
+ # pip install yt-dlp
2
+
3
+ import yt_dlp
4
+ import os
5
+ import re
6
+ from pathlib import Path
7
+
8
+ def get_youtube_transcript_ytdlp(video_url, output_file="transcript.txt"):
9
+ """
10
+ Extract YouTube transcript using yt-dlp
11
+ Uses yt-dlp, which handles YouTube's signature/blocking checks, so it works reliably in regions (e.g., India) where other transcript libraries are often blocked
12
+ """
13
+
14
+ print("[*] Starting transcript extraction with yt-dlp...")
15
+
16
+ # Extract video ID for reference
17
+ video_id_match = re.search(r'v=([^&]*)', video_url)
18
+ video_id = video_id_match.group(1) if video_id_match else 'unknown'
19
+
20
+ print(f"[+] Video ID: {video_id}")
21
+
22
+ # Normalize URL to just the video (remove playlist parameters)
23
+ normalized_url = f"https://www.youtube.com/watch?v={video_id}"
24
+ print(f"[+] Normalized URL: {normalized_url}")
25
+
26
+ try:
27
+ # Create temp directory for subtitles
28
+ temp_dir = "temp_subs"
29
+ os.makedirs(temp_dir, exist_ok=True)
30
+
31
+ # Setup yt-dlp options
32
+ ydl_opts = {
33
+ 'writeautomaticsub': True, # Download auto-generated subtitles
34
+ 'subtitlesformat': 'vtt', # Format (can also be 'json3', 'srt', 'ass')
35
+ 'skip_download': True, # Only download subs, not video
36
+ 'noplaylist': True, # Only download the video, not the playlist
37
+ 'outtmpl': os.path.join(temp_dir, '%(id)s'), # Output template
38
+ 'quiet': False, # Show progress
39
+ 'no_warnings': False,
40
+ 'subtitleslangs': ['en'], # Restrict to English subtitles (yt-dlp expects a list of language codes)
41
+ }
42
+
43
+ print("[*] Downloading subtitles...")
44
+
45
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
46
+ info = ydl.extract_info(normalized_url, download=True) # Use normalized URL
47
+
48
+ print("[+] Subtitles downloaded successfully")
49
+
50
+ # Find the subtitle file
51
+ subtitle_file = None
52
+ for file in os.listdir(temp_dir):
53
+ if video_id in file and (file.endswith('.vtt') or file.endswith('.srt')):
54
+ subtitle_file = os.path.join(temp_dir, file)
55
+ print(f"[+] Found subtitle file: {file}")
56
+ break
57
+
58
+ if not subtitle_file or not os.path.exists(subtitle_file):
59
+ print("[ERROR] Subtitle file not found")
60
+ print(f"[DEBUG] Files in {temp_dir}: {os.listdir(temp_dir)}")
61
+ return None
62
+
63
+ # Read and parse the subtitle file
64
+ print("[*] Parsing subtitle file...")
65
+
66
+ transcript_lines = []
67
+
68
+ if subtitle_file.endswith('.vtt'):
69
+ # Parse VTT format
70
+ with open(subtitle_file, 'r', encoding='utf-8') as f:
71
+ lines = f.readlines()
72
+
73
+ for line in lines:
74
+ line = line.strip()
75
+ # Skip headers, timestamps, and empty lines
76
+ if line and not line.startswith('WEBVTT') and '-->' not in line:
77
+ transcript_lines.append(line)
78
+
79
+ elif subtitle_file.endswith('.srt'):
80
+ # Parse SRT format
81
+ with open(subtitle_file, 'r', encoding='utf-8') as f:
82
+ lines = f.readlines()
83
+
84
+ for line in lines:
85
+ line = line.strip()
86
+ # Skip sequence numbers and timestamps
87
+ if line and not line[0].isdigit() and '-->' not in line:
88
+ transcript_lines.append(line)
89
+
90
+ if not transcript_lines:
91
+ print("[ERROR] No text extracted from subtitle file")
92
+ return None
93
+
94
+ # Combine into full transcript
95
+ full_text = "\n".join(transcript_lines)
96
+
97
+ # Save to output file
98
+ print(f"[*] Saving transcript to {output_file}...")
99
+ with open(output_file, 'w', encoding='utf-8') as f:
100
+ f.write(full_text)
101
+
102
+ # Cleanup temp directory
103
+ import shutil
104
+ shutil.rmtree(temp_dir)
105
+
106
+ print(f"\n✓ SUCCESS!")
107
+ print(f" File: {output_file}")
108
+ print(f" Total characters: {len(full_text)}")
109
+ print(f" Total lines: {len(transcript_lines)}")
110
+
111
+ return full_text
112
+
113
+ except Exception as e:
114
+ print(f"[ERROR] {str(e)}")
115
+ import traceback
116
+ traceback.print_exc()
117
+ return None
118
+
119
+
120
+ # ==================== MAIN ====================
121
+
122
+ if __name__ == "__main__":
123
+
124
+ print("=" * 70)
125
+ print("YouTube Transcript Extractor - yt-dlp VERSION (WORKS IN INDIA!)")
126
+ print("=" * 70)
127
+
128
+ video_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
129
+
130
+ print(f"\nTarget video: {video_url}\n")
131
+
132
+ transcript = get_youtube_transcript_ytdlp(video_url)
133
+
134
+ if transcript:
135
+ print("\n" + "=" * 70)
136
+ print("TRANSCRIPT PREVIEW (First 800 characters)")
137
+ print("=" * 70)
138
+ print(transcript[:800])
139
+ print("\n...")
140
+ else:
141
+ print("\n[FAILED] Could not extract transcript")
utils/file_utils.py ADDED
@@ -0,0 +1,145 @@
1
+ import os
2
+ import tempfile
3
+ import shutil
4
+ from pathlib import Path
5
+ from typing import List
6
+ from fastapi import UploadFile
7
+
8
+ async def save_upload_file(upload_file: UploadFile) -> str:
9
+ """
10
+ Save an uploaded file to a temporary location
11
+
12
+ Args:
13
+ upload_file: FastAPI UploadFile object
14
+
15
+ Returns:
16
+ Path to the saved temporary file
17
+ """
18
+ try:
19
+ # Create temporary file with appropriate extension
20
+ suffix = Path(upload_file.filename).suffix if upload_file.filename else ""
21
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
22
+
23
+ # Write uploaded content to temporary file
24
+ content = await upload_file.read()
25
+ temp_file.write(content)
26
+ temp_file.close()
27
+
28
+ return temp_file.name
29
+
30
+ except Exception as e:
31
+ print(f"Error saving uploaded file: {e}")
32
+ raise
33
+
34
+ def cleanup_temp_files(file_paths: List[str]) -> None:
35
+ """
36
+ Clean up temporary files
37
+
38
+ Args:
39
+ file_paths: List of file paths to delete
40
+ """
41
+ for file_path in file_paths:
42
+ try:
43
+ if os.path.exists(file_path):
44
+ os.unlink(file_path)
45
+ print(f"Cleaned up temporary file: {file_path}")
46
+ except Exception as e:
47
+ print(f"Error cleaning up file {file_path}: {e}")
48
+
49
+ def cleanup_temp_directories(dir_paths: List[str]) -> None:
50
+ """
51
+ Clean up temporary directories
52
+
53
+ Args:
54
+ dir_paths: List of directory paths to delete
55
+ """
56
+ for dir_path in dir_paths:
57
+ try:
58
+ if os.path.exists(dir_path):
59
+ shutil.rmtree(dir_path)
60
+ print(f"Cleaned up temporary directory: {dir_path}")
61
+ except Exception as e:
62
+ print(f"Error cleaning up directory {dir_path}: {e}")
63
+
64
+ def get_file_extension(filename: str) -> str:
65
+ """
66
+ Get file extension from filename
67
+
68
+ Args:
69
+ filename: Name of the file
70
+
71
+ Returns:
72
+ File extension (including the dot)
73
+ """
74
+ return Path(filename).suffix.lower()
75
+
76
+ def is_valid_image_file(filename: str) -> bool:
77
+ """
78
+ Check if filename represents a valid image file
79
+
80
+ Args:
81
+ filename: Name of the file
82
+
83
+ Returns:
84
+ True if valid image file
85
+ """
86
+ valid_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'}
87
+ return get_file_extension(filename) in valid_extensions
88
+
89
+ def is_valid_video_file(filename: str) -> bool:
90
+ """
91
+ Check if filename represents a valid video file
92
+
93
+ Args:
94
+ filename: Name of the file
95
+
96
+ Returns:
97
+ True if valid video file
98
+ """
99
+ valid_extensions = {'.mp4', '.avi', '.mov', '.wmv', '.flv', '.webm', '.mkv', '.m4v'}
100
+ return get_file_extension(filename) in valid_extensions
101
+
102
+ def create_temp_directory() -> str:
103
+ """
104
+ Create a temporary directory
105
+
106
+ Returns:
107
+ Path to the created temporary directory
108
+ """
109
+ return tempfile.mkdtemp()
110
+
111
+ def get_file_size(file_path: str) -> int:
112
+ """
113
+ Get file size in bytes
114
+
115
+ Args:
116
+ file_path: Path to the file
117
+
118
+ Returns:
119
+ File size in bytes
120
+ """
121
+ try:
122
+ return os.path.getsize(file_path)
123
+ except OSError:
124
+ return 0
125
+
126
+ def format_file_size(size_bytes: int) -> str:
127
+ """
128
+ Format file size in human-readable format
129
+
130
+ Args:
131
+ size_bytes: File size in bytes
132
+
133
+ Returns:
134
+ Formatted file size string
135
+ """
136
+ if size_bytes == 0:
137
+ return "0B"
138
+
139
+ size_names = ["B", "KB", "MB", "GB", "TB"]
140
+ i = 0
141
+ while size_bytes >= 1024 and i < len(size_names) - 1:
142
+ size_bytes /= 1024.0
143
+ i += 1
144
+
145
+ return f"{size_bytes:.1f}{size_names[i]}"