VeuReu committed
Commit b9246df (verified) · Parent(s): 9d9aacc

Delete main_process/main_router - marcos.py

Files changed (1)
  1. main_process/main_router - marcos.py +0 -410
main_process/main_router - marcos.py DELETED
@@ -1,410 +0,0 @@
- import os
- import ast
- import json
-
- from pathlib import Path
- from typing import List, Dict
- from collections import Counter
-
- import torch
-
- from svision_client import extract_scenes, add_ocr_and_faces, keyframes_every_second_extraction, extract_descripcion_escena
- from asr_client import extract_audio_from_video, diarize_audio, transcribe_long_audio, transcribe_short_audio, identificar_veu
-
- from fastapi import APIRouter, Query, HTTPException
- from fastapi.responses import FileResponse
-
- from storage.common import validate_token
- from storage.embeddings_routers import get_embeddings_json
-
- EMBEDDINGS_ROOT = Path("/data/embeddings")
- MEDIA_ROOT = Path("/data/media")
- os.environ["CUDA_VISIBLE_DEVICES"] = "1"
- router = APIRouter(prefix="/transcription", tags=["Initial Transcription Process"])
- HF_TOKEN = os.getenv("VEUREU_TOKEN")
-
- def get_casting(video_sha1: str):
-     """Retrieve the real casting embeddings for a video from its SHA1.
-
-     Reads the embeddings JSON that the demo previously uploaded to /data/embeddings
-     via the /embeddings/upload_embeddings endpoint and returns its
-     face_col and voice_col columns.
-     """
-
-     # get_embeddings_json returns the full JSON exactly as it was uploaded (casting_json)
-     faces_json = get_embeddings_json(video_sha1, "faces")
-     voices_json = get_embeddings_json(video_sha1, "voices")
-
-     return faces_json, voices_json
-
- def map_identities_per_second(frames_per_second, intervals):
-     """For each interval, count the face identities seen in the per-second frames it spans."""
-     for seg in intervals:
-         seg_start = seg["start"]
-         seg_end = seg["end"]
-
-         identities = []
-         for f in frames_per_second:
-             if seg_start <= f["start"] <= seg_end:
-                 for face in f.get("faces", []):
-                     identities.append(face)
-
-         seg["counts"] = dict(Counter(identities))
-
-     return intervals
-
- def _fmt_srt_time(seconds: float) -> str:
-     """Format seconds as an SRT timestamp, HH:MM:SS,mmm."""
-     h = int(seconds // 3600)
-     m = int((seconds % 3600) // 60)
-     s = int(seconds % 60)
-     ms = int((seconds - int(seconds)) * 1000)
-     return f"{h:02}:{m:02}:{s:02},{ms:03}"
-
- def generate_srt_from_segments(segments: List[Dict], sha1: str) -> str:
-     """
-     Generate an SRT subtitle file from diarization/transcription segments.
-
-     This function:
-     - Creates the required folder structure for storing SRTs.
-     - Removes any previous SRT files for the same SHA1.
-     - Builds the SRT content with timestamps, speaker identity and transcription.
-     - Saves the SRT file to disk.
-     - Returns the SRT content as a string.
-
-     Parameters
-     ----------
-     segments : List[Dict]
-         List of dictionaries containing:
-         - "start": float (start time in seconds)
-         - "end": float (end time in seconds)
-         - "speaker": dict with "identity"
-         - "transcription": str
-     sha1 : str
-         Identifier used to locate the target media folder.
-
-     Returns
-     -------
-     str
-         Full SRT file content as a string.
-     """
-
-     # Path: /data/media/<sha1>
-     video_root = MEDIA_ROOT / sha1
-     video_root.mkdir(parents=True, exist_ok=True)
-
-     # Path: /data/media/<sha1>/initial_srt
-     srt_dir = video_root / "initial_srt"
-     srt_dir.mkdir(parents=True, exist_ok=True)
-
-     # Delete old SRT files
-     try:
-         for old_srt in srt_dir.glob("*.srt"):
-             old_srt.unlink()
-     except Exception as exc:
-         raise HTTPException(status_code=500, detail=f"Failed to delete old SRT files: {exc}")
-
-     # Save file as initial.srt
-     final_path = srt_dir / "initial.srt"
-
-     # Build SRT content
-     srt_lines = []
-
-     for i, seg in enumerate(segments, start=1):
-         start = seg.get("start", 0.0)
-         end = seg.get("end", 0.0)
-         transcription = seg.get("transcription", "").strip()
-
-         speaker_info = seg.get("speaker", {})
-         speaker = speaker_info.get("identity", "Unknown")
-
-         text = f"[{speaker}]: {transcription}" if speaker else transcription
-
-         entry = (
-             f"{i}\n"
-             f"{_fmt_srt_time(start)} --> {_fmt_srt_time(end)}\n"
-             f"{text}\n"
-         )
-         srt_lines.append(entry)
-
-     # Join entries with blank lines (each entry already ends with "\n")
-     srt_content = "\n".join(srt_lines)
-
-     # Write to disk
-     try:
-         with final_path.open("w", encoding="utf-8-sig") as f:
-             f.write(srt_content)
-     except Exception as exc:
-         raise HTTPException(status_code=500, detail=f"Failed to write SRT file: {exc}")
-
-     return srt_content
-
- def pipeline_preprocessing_vision(video_path: str, face_col):
-     """
-     Pipeline that takes a video and performs all of the vision-side preprocessing.
-     """
-
-     print(f"Processing video for vision: {video_path}")
-
-     print("Extracting scenes...")
-     threshold: float = 30.0
-     offset_frames: int = 3
-     crop_ratio: float = 0.1
-     result_extract_scenes = extract_scenes(video_path, threshold, offset_frames, crop_ratio)
-     print(result_extract_scenes)
-     # Get the image paths and the scene information
-     escenas = result_extract_scenes[0] if len(result_extract_scenes) > 0 else []
-     escenas_paths = [f["image"] for f in escenas]
-     print(escenas_paths)
-     info_escenas = result_extract_scenes[1] if len(result_extract_scenes) > 1 else []
-     print(info_escenas)
-
-     print("Extracting one image per second...")
-     result_extract_per_second = keyframes_every_second_extraction(video_path)
-     # Get the image paths and the per-second frame information
-     images_per_second = result_extract_per_second[0] if len(result_extract_per_second) > 0 else []
-     images_per_second_paths = [f["image"] for f in images_per_second]
-     info_images_per_second = result_extract_per_second[1] if len(result_extract_per_second) > 1 else []
-
-     print("Enriching the scene information with who appears in each scene and with OCR detection...")
-     info_escenas_completa = []
-     for imagen_escena, info_escena in zip(escenas_paths, info_escenas):
-         result_add_ocr_and_faces = add_ocr_and_faces(imagen_escena, info_escena, face_col)
-         info_escenas_completa.append(result_add_ocr_and_faces)
-
-     print("Enriching the per-second image information with who appears in each frame and with OCR detection...")
-     info_images_per_second_completa = []
-     for imagen_segundo, info_segundo in zip(images_per_second_paths, info_images_per_second):
-         result_add_ocr_and_faces = add_ocr_and_faces(imagen_segundo, info_segundo, face_col)
-         info_images_per_second_completa.append(result_add_ocr_and_faces)
-     print(info_escenas_completa)
-
-     print("Next the OCR results will be processed (some scenes will be replaced by one of the per-second images if it has better OCR)...")
-     # This will be done last
-
-     print("Combining scene and per-second image information...")
-     info_escenas_completa = map_identities_per_second(info_images_per_second_completa, info_escenas_completa)
-     print(info_escenas_completa)
-
-     print("Now the scene descriptions are added to the scene dictionaries.")
-     for escena_path, info_escena in zip(escenas_paths, info_escenas_completa):
-         descripcion_escena = extract_descripcion_escena(escena_path)
-         lista = ast.literal_eval(descripcion_escena)
-         frase = lista[0]
-         info_escena["descripcion"] = frase
-         del descripcion_escena
-         torch.cuda.empty_cache()
-
-     return info_escenas_completa, info_images_per_second_completa
-
- def pipeline_preprocessing_audio(video_path: str, voice_col):
-     """
-     Pipeline that takes a video and performs all of the audio-side preprocessing.
-     """
-     print(f"Processing video for audio: {video_path}")
-
-     print("Extracting audio from the video...")
-     audio_video = extract_audio_from_video(video_path)
-     print(audio_video)
-
-     print("Diarizing the audio...")
-     diarization_audio = diarize_audio(audio_video)
-     print(diarization_audio)
-     clips_path = diarization_audio[0]
-     print(clips_path)
-     diarization_info = diarization_audio[1]
-     print(diarization_info)
-
-     print("Transcribing the full video...")
-     full_transcription = transcribe_long_audio(audio_video)
-     print(full_transcription)
-
-     print("Transcribing the diarized clips...")
-     for clip_path, clip_info in zip(clips_path, diarization_info):
-         clip_transcription = transcribe_short_audio(clip_path)
-         clip_info["transcription"] = clip_transcription
-
-     print("Computing embeddings for each extracted clip, then identifying the voices...")
-     for clip_path, clip_info in zip(clips_path, diarization_info):
-         clip_speaker = identificar_veu(clip_path, voice_col)
-         clip_info["speaker"] = clip_speaker
-
-     return full_transcription, diarization_info
-
- @router.post("/generate_initial_srt_and_info", tags=["Initial Transcription Process"])
- async def pipeline_video_analysis(
-     sha1: str,
-     token: str = Query(..., description="Token required for authorization")
- ):
-     """
-     Endpoint that processes a full video identified by its SHA1 folder, performs
-     complete audio-visual preprocessing, and generates an SRT subtitle file plus
-     an accompanying info JSON.
-
-     This pipeline integrates:
-     - Vision preprocessing (scene detection, keyframes, OCR, face recognition)
-     - Audio preprocessing (diarization, speech recognition, speaker identity matching)
-     - Identity mapping between vision and audio streams
-     - Final generation of an SRT file describing who speaks and when
-
-     Parameters
-     ----------
-     sha1 : str
-         Identifier corresponding to the folder containing the video and related assets.
-     token : str
-         Security token required for authorization.
-
-     Returns
-     -------
-     dict
-         Status message; the SRT and info JSON are written under
-         /data/media/<sha1>/initial_srt and served by the download endpoints.
-     """
-
-     validate_token(token)
-
-     # Resolve directories
-     sha1_folder = MEDIA_ROOT / sha1
-     clip_folder = sha1_folder / "clip"
-
-     if not sha1_folder.exists() or not sha1_folder.is_dir():
-         raise HTTPException(status_code=404, detail="SHA1 folder not found")
-
-     if not clip_folder.exists() or not clip_folder.is_dir():
-         raise HTTPException(status_code=404, detail="Clip folder not found")
-
-     # Locate video file
-     mp4_files = list(clip_folder.glob("*.mp4"))
-     if not mp4_files:
-         raise HTTPException(status_code=404, detail="No MP4 files found")
-
-     video_path = mp4_files[0]
-
-     print(f"Processing full video: {video_path}")
-
-     # Get face and voice embeddings for casting
-     face_col, voice_col = get_casting(sha1)
-
-     # Vision processing pipeline
-     info_escenas, info_images_per_second = pipeline_preprocessing_vision(video_path, face_col)
-     torch.cuda.empty_cache()
-
-     # Audio processing pipeline
-     full_transcription, info_clips = pipeline_preprocessing_audio(video_path, voice_col)
-
-     # Merge identities from vision pipeline with audio segments
-     info_clips = map_identities_per_second(info_images_per_second, info_clips)
-
-     # Generate the final SRT subtitle file
-     generate_srt_from_segments(info_clips, sha1)
-
-     # Create result JSON
-     result_json = {
-         "full_transcription": full_transcription,
-         "info_escenas": info_escenas,
-         "info_clips": info_clips
-     }
-
-     # Path: /data/media/<sha1>/initial_srt
-     video_root = MEDIA_ROOT / sha1
-     video_root.mkdir(parents=True, exist_ok=True)
-     srt_dir = video_root / "initial_srt"
-     srt_dir.mkdir(parents=True, exist_ok=True)
-
-     final_path = srt_dir / "initial_info.json"
-
-     with final_path.open("w", encoding="utf-8") as f:
-         json.dump(result_json, f, ensure_ascii=False, indent=4)
-
-     # The endpoint returns an OK message
-     return {"status": "ok", "message": "Initial SRT and info JSON generated"}
-
- def get_initial_info_path(sha1: str):
-     video_root = MEDIA_ROOT / sha1
-     srt_dir = video_root / "initial_srt"
-     final_path = srt_dir / "initial_info.json"
-
-     if not video_root.exists() or not video_root.is_dir():
-         raise HTTPException(status_code=404, detail="SHA1 folder not found")
-     if not srt_dir.exists() or not srt_dir.is_dir():
-         raise HTTPException(status_code=404, detail="initial_srt folder not found")
-     if not final_path.exists() or not final_path.is_file():
-         raise HTTPException(status_code=404, detail="initial_info JSON not found")
-
-     return final_path
-
- def get_initial_srt_path(sha1: str):
-     video_root = MEDIA_ROOT / sha1
-     srt_dir = video_root / "initial_srt"
-     final_path = srt_dir / "initial.srt"
-
-     if not video_root.exists() or not video_root.is_dir():
-         raise HTTPException(status_code=404, detail="SHA1 folder not found")
-     if not srt_dir.exists() or not srt_dir.is_dir():
-         raise HTTPException(status_code=404, detail="initial_srt folder not found")
-     if not final_path.exists() or not final_path.is_file():
-         raise HTTPException(status_code=404, detail="initial.srt file not found")
-
-     return final_path
-
- @router.get("/download_initial_srt", tags=["Initial Transcription Process"])
- def download_initial_srt(
-     sha1: str,
-     token: str = Query(..., description="Token required for authorization")
- ):
-     """
-     Download the initial SRT for a specific video identified by its SHA-1.
-     The file is expected under:
-         /data/media/<sha1>/initial_srt/initial.srt
-     Steps:
-     - Validate the token.
-     - Resolve the file path, raising 404 if any folder or file is missing.
-     - Return the SRT as a FileResponse.
-     """
-     validate_token(token)
-
-     file_path = get_initial_srt_path(sha1)
-
-     return FileResponse(
-         path=file_path,
-         media_type="application/x-subrip",
-         filename="initial.srt"
-     )
-
- @router.get("/download_initial_info", tags=["Initial Transcription Process"])
- def download_initial_info(
-     sha1: str,
-     token: str = Query(..., description="Token required for authorization")
- ):
-     """
-     Download the initial info JSON for a specific video identified by its SHA-1.
-     The file is expected under:
-         /data/media/<sha1>/initial_srt/initial_info.json
-     Steps:
-     - Validate the token.
-     - Resolve the file path, raising 404 if any folder or file is missing.
-     - Return the JSON as a FileResponse.
-     """
-     validate_token(token)
-
-     file_path = get_initial_info_path(sha1)
-
-     return FileResponse(
-         path=file_path,
-         media_type="application/json",
-         filename="initial_info.json"
-     )
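
To make the identity-merging step concrete, here is a small worked example of map_identities_per_second. The frame and interval values are hypothetical; only per-second frames whose start time falls inside an interval contribute to that interval's counts:

# Hypothetical per-second frame records and one diarization interval
frames = [
    {"start": 1.0, "faces": ["Anna"]},
    {"start": 2.0, "faces": ["Anna", "Marc"]},
    {"start": 9.0, "faces": ["Marc"]},   # outside the interval, ignored
]
intervals = [{"start": 0.0, "end": 3.0}]

# map_identities_per_second(frames, intervals) mutates and returns intervals:
# [{"start": 0.0, "end": 3.0, "counts": {"Anna": 2, "Marc": 1}}]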
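
Likewise, a sketch of what generate_srt_from_segments builds from two hypothetical segments, with timestamps formatted by _fmt_srt_time and entries separated by a blank line:

segments = [
    {"start": 0.0, "end": 2.5, "speaker": {"identity": "Anna"}, "transcription": "Hola!"},
    {"start": 2.5, "end": 65.25, "speaker": {"identity": "Marc"}, "transcription": "Bon dia."},
]

# generate_srt_from_segments(segments, sha1) writes initial.srt containing:
#
# 1
# 00:00:00,000 --> 00:00:02,500
# [Anna]: Hola!
#
# 2
# 00:00:02,500 --> 00:01:05,250
# [Marc]: Bon dia.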
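
The audio pipeline enriches each diarization record in place before the SRT step. Assuming diarize_audio returns (clip_paths, diarization_info) with start/end times per clip, one record evolves roughly as follows; the field shapes are assumptions inferred from how generate_srt_from_segments reads them:

clip_info = {"start": 12.3, "end": 15.8}      # produced by diarize_audio (assumed shape)
clip_info["transcription"] = "..."            # added via transcribe_short_audio(clip_path)
clip_info["speaker"] = {"identity": "Anna"}   # added via identificar_veu(clip_path, voice_col)
# generate_srt_from_segments later reads seg["speaker"]["identity"] and seg["transcription"]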
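
Finally, a minimal client sketch for the three endpoints this router exposed; the base URL and token are placeholders, and a local deployment is assumed:

import requests

BASE = "http://localhost:8000"  # hypothetical host
params = {"sha1": "<video-sha1>", "token": "<auth-token>"}

# Run the full vision + audio preprocessing for one uploaded video
r = requests.post(f"{BASE}/transcription/generate_initial_srt_and_info", params=params)
r.raise_for_status()  # -> {"status": "ok", "message": "Initial SRT and info JSON generated"}

# Download the artifacts written under /data/media/<sha1>/initial_srt/
srt_text = requests.get(f"{BASE}/transcription/download_initial_srt", params=params).text
info = requests.get(f"{BASE}/transcription/download_initial_info", params=params).json()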