|
|
import os
import io
import ast
import json
from pathlib import Path
from typing import List, Dict
from collections import Counter

import torch

from svision_client import extract_scenes, add_ocr_and_faces, keyframes_every_second_extraction, extract_descripcion_escena
from asr_client import extract_audio_from_video, diarize_audio, transcribe_long_audio, transcribe_short_audio, identificar_veu

from fastapi import APIRouter, UploadFile, File, Query, HTTPException
from fastapi.responses import JSONResponse, StreamingResponse, FileResponse

from storage.common import validate_token
from storage.files.file_manager import FileManager
from storage.embeddings_routers import get_embeddings_json
|
|
|
|
|
EMBEDDINGS_ROOT = Path("/data/embeddings")
MEDIA_ROOT = Path("/data/media")

# Restrict this process to GPU 1.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

router = APIRouter(prefix="/transcription", tags=["Initial Transcription Process"])

HF_TOKEN = os.getenv("VEUREU_TOKEN")
|
|
|
|
|
def get_casting(video_sha1: str):
    """Retrieve the real casting embeddings for a video from its SHA1.

    Reads the embeddings JSON that the demo previously uploaded to
    /data/embeddings through the /embeddings/upload_embeddings endpoint and
    returns its face_col and voice_col columns.
    """
    faces_json = get_embeddings_json(video_sha1, "faces")
    faces_json = faces_json["face_col"]
    print("--------------")
    print("The face database is:")
    print(faces_json)

    voices_json = get_embeddings_json(video_sha1, "voices")
    voices_json = voices_json["voice_col"]
    print("--------------")
    print("The voice database is:")
    print(voices_json)

    return faces_json, voices_json
|
|
|
|
|
def map_identities_per_second(frames_per_second, intervals):
    """Attach face-identity counts to each interval.

    For every interval, collect the faces seen in the per-second frames whose
    timestamp falls inside [start, end], and store the identity counts under
    the "counts" key of that interval.
    """
    for seg in intervals:
        seg_start = seg["start"]
        seg_end = seg["end"]

        identities = []
        for f in frames_per_second:
            if seg_start <= f["start"] <= seg_end:
                for face in f.get("faces", []):
                    identities.append(face)

        seg["counts"] = dict(Counter(identities))

    return intervals
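
# Illustrative input/output shapes (hypothetical data, not from a real run):
#
#   frames_per_second = [{"start": 0.0, "faces": ["Anna", "Joan"]},
#                        {"start": 1.0, "faces": ["Anna"]}]
#   intervals = [{"start": 0.0, "end": 1.5}]
#   map_identities_per_second(frames_per_second, intervals)
#   # -> [{"start": 0.0, "end": 1.5, "counts": {"Anna": 2, "Joan": 1}}]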
|
|
|
|
|
def _fmt_srt_time(seconds: float) -> str:
    """Format a duration in seconds as an SRT timestamp (HH:MM:SS,mmm)."""
    h = int(seconds // 3600)
    m = int((seconds % 3600) // 60)
    s = int(seconds % 60)
    ms = int((seconds - int(seconds)) * 1000)
    return f"{h:02}:{m:02}:{s:02},{ms:03}"
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_srt_from_segments(segments: List[Dict], sha1: str) -> str:
    """
    Generate an SRT subtitle file from diarization/transcription segments.

    This function:
    - Creates the required folder structure for storing SRTs.
    - Removes any previous SRT files for the same SHA1.
    - Builds the SRT content with timestamps, speaker identity and transcription.
    - Saves the SRT file to disk.
    - Returns the SRT content as a string (to be sent by the endpoint).

    Parameters
    ----------
    segments : List[Dict]
        List of dictionaries containing:
        - "start": float (start time in seconds)
        - "end": float (end time in seconds)
        - "speaker": dict with "identity"
        - "transcription": str
    sha1 : str
        Identifier used to locate the target media folder.

    Returns
    -------
    str
        Full SRT file content as a string.
    """
    video_root = MEDIA_ROOT / sha1
    video_root.mkdir(parents=True, exist_ok=True)

    srt_dir = video_root / "initial_srt"
    srt_dir.mkdir(parents=True, exist_ok=True)

    # Drop any SRT left over from a previous run.
    try:
        for old_srt in srt_dir.glob("*.srt"):
            old_srt.unlink()
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Failed to delete old SRT files: {exc}")

    final_path = srt_dir / "initial.srt"

    srt_lines = []
    for i, seg in enumerate(segments, start=1):
        start = seg.get("start", 0.0)
        end = seg.get("end", 0.0)
        transcription = seg.get("transcription", "").strip()

        speaker_info = seg.get("speaker", {})
        speaker = speaker_info.get("identity", "Unknown")

        text = f"[{speaker}]: {transcription}" if speaker else transcription

        entry = (
            f"{i}\n"
            f"{_fmt_srt_time(start)} --> {_fmt_srt_time(end)}\n"
            f"{text}\n"
        )
        srt_lines.append(entry)

    # Entries already end with a newline, so joining with "\n" yields the
    # blank line that separates SRT cues.
    srt_content = "\n".join(srt_lines)

    # utf-8-sig prepends a BOM, which some players expect in SRT files.
    try:
        with final_path.open("w", encoding="utf-8-sig") as f:
            f.write(srt_content)
    except Exception as exc:
        raise HTTPException(status_code=500, detail=f"Failed to write SRT file: {exc}")

    return srt_content
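
# Example usage (hypothetical segment data, for illustration only):
#
#   segments = [{"start": 0.0, "end": 2.5,
#                "speaker": {"identity": "Anna"},
#                "transcription": "Hola a tothom."}]
#   generate_srt_from_segments(segments, "<sha1>")
#   # writes /data/media/<sha1>/initial_srt/initial.srt containing:
#   #   1
#   #   00:00:00,000 --> 00:00:02,500
#   #   [Anna]: Hola a tothom.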
|
|
|
|
|
def pipeline_preprocessing_vision(video_path: str, face_col):
    """
    Pipeline that takes a video and runs the full vision-side preprocessing.
    """
    print(f"Processing video for vision: {video_path}")

    print("----------------------")
    print(face_col)

    print("Extracting scenes...")
    threshold: float = 30.0
    offset_frames: int = 240
    crop_ratio: float = 0.1
    result_extract_scenes = extract_scenes(video_path, threshold, offset_frames, crop_ratio)
    print(result_extract_scenes)

    escenas = result_extract_scenes[0] if len(result_extract_scenes) > 0 else []
    escenas_paths = [f["image"] for f in escenas]
    print(escenas_paths)
    info_escenas = result_extract_scenes[1] if len(result_extract_scenes) > 1 else []
    print(info_escenas)

    print("Extracting one keyframe per second...")
    result_extract_per_second = keyframes_every_second_extraction(video_path)

    images_per_second = result_extract_per_second[0] if len(result_extract_per_second) > 0 else []
    images_per_second_paths = [f["image"] for f in images_per_second]
    info_images_per_second = result_extract_per_second[1] if len(result_extract_per_second) > 1 else []

    print("Enriching scene info with who appears in each scene and with OCR detection...")
    info_escenas_completa = []
    for imagen_escena, info_escena in zip(escenas_paths, info_escenas):
        result_add_ocr_and_faces = add_ocr_and_faces(imagen_escena, info_escena, face_col)
        info_escenas_completa.append(result_add_ocr_and_faces)

    print("Enriching per-second frame info with who appears in each frame and with OCR detection...")
    info_images_per_second_completa = []
    for imagen_segundo, info_segundo in zip(images_per_second_paths, info_images_per_second):
        result_add_ocr_and_faces = add_ocr_and_faces(imagen_segundo, info_segundo, face_col)
        info_images_per_second_completa.append(result_add_ocr_and_faces)
    print(info_escenas_completa)

    print("Now the OCR results will be handled (certain scenes will be replaced by one of the per-second frames if it has better OCR)...")

    print("Combining scene and per-second frame information...")
    info_escenas_completa = map_identities_per_second(info_images_per_second_completa, info_escenas_completa)
    print(info_escenas_completa)

    print("Now each scene dictionary gets its scene description.")
    for escena_path, info_escena in zip(escenas_paths, info_escenas_completa):
        descripcion_escena = extract_descripcion_escena(escena_path)
        # The description arrives as a string-serialized list; keep its first element.
        lista = ast.literal_eval(descripcion_escena)
        frase = lista[0]
        info_escena["descripcion"] = frase
        del descripcion_escena
        torch.cuda.empty_cache()

    return info_escenas_completa, info_images_per_second_completa
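
# Rough sketch of one enriched scene dict returned above (hypothetical values;
# the OCR/face keys come from svision_client and their exact names are assumptions):
#
#   {"start": 0.0, "end": 4.2,            # scene interval
#    ...,                                  # OCR and face fields from add_ocr_and_faces
#    "counts": {"Anna": 3},                # added by map_identities_per_second
#    "descripcion": "two people talking"}  # added from extract_descripcion_escena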
|
|
|
|
|
def pipeline_preprocessing_audio(video_path: str, voice_col):
    """
    Pipeline that takes a video and runs the full audio-side preprocessing.
    """
    print(f"Processing video for audio: {video_path}")

    print("Extracting audio from the video...")
    audio_video = extract_audio_from_video(video_path)
    print(audio_video)

    print("Diarizing the audio...")
    diarization_audio = diarize_audio(audio_video)
    print(diarization_audio)
    clips_path = diarization_audio[0]
    print(clips_path)
    diarization_info = diarization_audio[1]
    print(diarization_info)

    print("Transcribing the full video...")
    full_transcription = transcribe_long_audio(audio_video)
    print(full_transcription)

    print("Transcribing the diarized clips...")
    for clip_path, clip_info in zip(clips_path, diarization_info):
        clip_transcription = transcribe_short_audio(clip_path)
        clip_info["transcription"] = clip_transcription

    print("Computing embeddings for each clip in order to identify the voices...")
    for clip_path, clip_info in zip(clips_path, diarization_info):
        clip_speaker = identificar_veu(clip_path, voice_col)
        clip_info["speaker"] = clip_speaker

    return full_transcription, diarization_info
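
# After this pipeline each diarization_info entry is expected to look roughly
# like the following (hypothetical values; extra keys depend on asr_client):
#
#   {"start": 12.0, "end": 15.3,
#    "transcription": "text of the clip",
#    "speaker": {"identity": "Anna"}}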
|
|
|
|
|
@router.post("/generate_initial_srt_and_info", tags=["Initial Transcription Process"])
async def pipeline_video_analysis(
    sha1: str,
    token: str = Query(..., description="Token required for authorization")
):
    """
    Endpoint that processes a full video identified by its SHA1 folder, performs
    complete audio-visual preprocessing, and generates an SRT subtitle file plus
    an info JSON on disk.

    This pipeline integrates:
    - Vision preprocessing (scene detection, keyframes, OCR, face recognition)
    - Audio preprocessing (diarization, speech recognition, speaker identity matching)
    - Identity mapping between vision and audio streams
    - Final generation of an SRT file describing who speaks and when

    Parameters
    ----------
    sha1 : str
        Identifier corresponding to the folder containing the video and related assets.
    token : str
        Security token required for authorization.

    Returns
    -------
    dict
        Status message. The generated files are written to
        /data/media/<sha1>/initial_srt/ (initial.srt and initial_info.json).
    """
    validate_token(token)

    sha1_folder = MEDIA_ROOT / sha1
    clip_folder = sha1_folder / "clip"

    if not sha1_folder.exists() or not sha1_folder.is_dir():
        raise HTTPException(status_code=404, detail="SHA1 folder not found")

    if not clip_folder.exists() or not clip_folder.is_dir():
        raise HTTPException(status_code=404, detail="Clip folder not found")

    mp4_files = list(clip_folder.glob("*.mp4"))
    if not mp4_files:
        raise HTTPException(status_code=404, detail="No MP4 files found")

    video_path = mp4_files[0]

    print(f"Processing full video: {video_path}")

    face_col, voice_col = get_casting(sha1)

    info_escenas, info_images_per_second = pipeline_preprocessing_vision(video_path, face_col)
    torch.cuda.empty_cache()

    full_transcription, info_clips = pipeline_preprocessing_audio(video_path, voice_col)

    info_clips = map_identities_per_second(info_images_per_second, info_clips)

    # Writes initial.srt to disk; the returned SRT text is not needed here.
    generate_srt_from_segments(info_clips, sha1)

    result_json = {
        "full_transcription": full_transcription,
        "info_escenas": info_escenas,
        "info_clips": info_clips
    }

    srt_dir = sha1_folder / "initial_srt"
    srt_dir.mkdir(parents=True, exist_ok=True)

    final_path = srt_dir / "initial_info.json"
    with final_path.open("w", encoding="utf-8") as f:
        json.dump(result_json, f, ensure_ascii=False, indent=4)

    return {"status": "ok", "message": "Initial SRT and info JSON generated"}
|
|
|
|
|
def get_initial_info_path(sha1: str):
    video_root = MEDIA_ROOT / sha1
    srt_dir = video_root / "initial_srt"
    final_path = srt_dir / "initial_info.json"

    if not video_root.exists() or not video_root.is_dir():
        raise HTTPException(status_code=404, detail="SHA1 folder not found")
    if not srt_dir.exists() or not srt_dir.is_dir():
        raise HTTPException(status_code=404, detail="initial_srt folder not found")
    if not final_path.exists() or not final_path.is_file():
        raise HTTPException(status_code=404, detail="initial_info JSON not found")

    return final_path
|
|
|
|
|
def get_initial_srt_path(sha1: str):
    video_root = MEDIA_ROOT / sha1
    srt_dir = video_root / "initial_srt"
    final_path = srt_dir / "initial.srt"

    if not video_root.exists() or not video_root.is_dir():
        raise HTTPException(status_code=404, detail="SHA1 folder not found")
    if not srt_dir.exists() or not srt_dir.is_dir():
        raise HTTPException(status_code=404, detail="initial_srt folder not found")
    if not final_path.exists() or not final_path.is_file():
        raise HTTPException(status_code=404, detail="initial.srt SRT not found")

    return final_path
|
|
|
|
|
@router.get("/download_initial_srt", tags=["Initial Transcription Process"])
def download_initial_srt(
    sha1: str,
    token: str = Query(..., description="Token required for authorization")
):
    """
    Download the initial SRT for a specific video identified by its SHA-1.

    The SRT is expected under:
        /data/media/<sha1>/initial_srt/initial.srt

    Steps:
    - Validate the token.
    - Ensure the SHA-1 folder, the initial_srt folder and initial.srt exist.
    - Return the SRT as a FileResponse.
    - Raise 404 if any folder or file is missing.
    """
    validate_token(token)

    file_path = get_initial_srt_path(sha1)

    return FileResponse(
        path=file_path,
        media_type="application/x-subrip",
        filename="initial.srt"
    )
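
# Example request (hypothetical host and values):
#   curl -o initial.srt "http://localhost:8000/transcription/download_initial_srt?sha1=<sha1>&token=<token>"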
|
|
|
|
|
@router.get("/download_initial_info", tags=["Initial Transcription Process"])
def download_initial_info(
    sha1: str,
    token: str = Query(..., description="Token required for authorization")
):
    """
    Download the initial info JSON for a specific video identified by its SHA-1.

    The JSON is expected under:
        /data/media/<sha1>/initial_srt/initial_info.json

    Steps:
    - Validate the token.
    - Ensure the SHA-1 folder, the initial_srt folder and initial_info.json exist.
    - Return the JSON as a FileResponse.
    - Raise 404 if any folder or file is missing.
    """
    validate_token(token)

    file_path = get_initial_info_path(sha1)

    return FileResponse(
        path=file_path,
        media_type="application/json",
        filename="initial_info.json"
    )
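
# Example request (hypothetical host and values):
#   curl -o initial_info.json "http://localhost:8000/transcription/download_initial_info?sha1=<sha1>&token=<token>"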