VeuReu committed
Commit b9246df (verified) · Parent(s): 9d9aacc

Delete main_process/main_router - marcos.py

Files changed (1)
  1. main_process/main_router - marcos.py +0 -410
main_process/main_router - marcos.py DELETED
@@ -1,410 +0,0 @@
- import os
- import ast
- import json
-
- from pathlib import Path
- from typing import List, Dict
- from collections import Counter
-
- import torch
-
- from svision_client import extract_scenes, add_ocr_and_faces, keyframes_every_second_extraction, extract_descripcion_escena
- from asr_client import extract_audio_from_video, diarize_audio, transcribe_long_audio, transcribe_short_audio, identificar_veu
-
- from fastapi import APIRouter, Query, HTTPException
- from fastapi.responses import FileResponse
-
- from storage.common import validate_token
- from storage.embeddings_routers import get_embeddings_json
-
- EMBEDDINGS_ROOT = Path("/data/embeddings")
- MEDIA_ROOT = Path("/data/media")
- os.environ["CUDA_VISIBLE_DEVICES"] = "1"
- router = APIRouter(prefix="/transcription", tags=["Initial Transcription Process"])
- HF_TOKEN = os.getenv("VEUREU_TOKEN")
-
- def get_casting(video_sha1: str):
-     """Retrieve the real casting embeddings for a video from its SHA1.
-
-     Reads the embeddings JSON that the demo previously uploaded to /data/embeddings
-     via the /embeddings/upload_embeddings endpoint and returns its
-     face_col and voice_col columns.
-     """
-
-     # get_embeddings_json returns the full JSON exactly as it was uploaded (casting_json)
-     faces_json = get_embeddings_json(video_sha1, "faces")
-     voices_json = get_embeddings_json(video_sha1, "voices")
-
-     return faces_json, voices_json
-
- def map_identities_per_second(frames_per_second, intervals):
-     """For each interval, count the face identities seen in the per-second frames it spans."""
-     for seg in intervals:
-         seg_start = seg["start"]
-         seg_end = seg["end"]
-
-         identities = []
-         for f in frames_per_second:
-             if seg_start <= f["start"] <= seg_end:
-                 for face in f.get("faces", []):
-                     identities.append(face)
-
-         seg["counts"] = dict(Counter(identities))
-
-     return intervals
-
- def _fmt_srt_time(seconds: float) -> str:
-     """Format seconds as an SRT timestamp, HH:MM:SS,mmm."""
-     h = int(seconds // 3600)
-     m = int((seconds % 3600) // 60)
-     s = int(seconds % 60)
-     ms = int((seconds - int(seconds)) * 1000)
-     return f"{h:02}:{m:02}:{s:02},{ms:03}"
-
- def generate_srt_from_segments(segments: List[Dict], sha1: str) -> str:
-     """
-     Generate an SRT subtitle file from diarization/transcription segments.
-
-     This function:
-     - Creates the required folder structure for storing SRTs.
-     - Removes any previous SRT files for the same SHA1.
-     - Builds the SRT content with timestamps, speaker identity and transcription.
-     - Saves the SRT file to disk.
-     - Returns the SRT content as a string.
-
-     Parameters
-     ----------
-     segments : List[Dict]
-         List of dictionaries containing:
-         - "start": float (start time in seconds)
-         - "end": float (end time in seconds)
-         - "speaker": dict with "identity"
-         - "transcription": str
-     sha1 : str
-         Identifier used to locate the target media folder.
-
-     Returns
-     -------
-     str
-         Full SRT file content as a string.
-     """
-
-     # Path: /data/media/<sha1>
-     video_root = MEDIA_ROOT / sha1
-     video_root.mkdir(parents=True, exist_ok=True)
-
-     # Path: /data/media/<sha1>/initial_srt
-     srt_dir = video_root / "initial_srt"
-     srt_dir.mkdir(parents=True, exist_ok=True)
-
-     # Delete old SRT files
-     try:
-         for old_srt in srt_dir.glob("*.srt"):
-             old_srt.unlink()
-     except Exception as exc:
-         raise HTTPException(status_code=500, detail=f"Failed to delete old SRT files: {exc}")
-
-     # Save file as initial.srt
-     final_path = srt_dir / "initial.srt"
-
-     # Build SRT content
-     srt_lines = []
-
-     for i, seg in enumerate(segments, start=1):
-         start = seg.get("start", 0.0)
-         end = seg.get("end", 0.0)
-         transcription = seg.get("transcription", "").strip()
-
-         speaker_info = seg.get("speaker", {})
-         speaker = speaker_info.get("identity", "Unknown")
-
-         text = f"[{speaker}]: {transcription}" if speaker else transcription
-
-         entry = (
-             f"{i}\n"
-             f"{_fmt_srt_time(start)} --> {_fmt_srt_time(end)}\n"
-             f"{text}\n"
-         )
-         srt_lines.append(entry)
-
-     # Join entries with blank lines (each entry already ends with "\n")
-     srt_content = "\n".join(srt_lines)
-
-     # Write to disk
-     try:
-         with final_path.open("w", encoding="utf-8-sig") as f:
-             f.write(srt_content)
-     except Exception as exc:
-         raise HTTPException(status_code=500, detail=f"Failed to write SRT file: {exc}")
-
-     return srt_content
-
- def pipeline_preprocessing_vision(video_path: str, face_col):
-     """
-     Pipeline that takes a video and performs all of the vision-side preprocessing.
-     """
-
-     print(f"Processing video for vision: {video_path}")
-
-     print("Extracting scenes...")
-     threshold: float = 30.0
-     offset_frames: int = 3
-     crop_ratio: float = 0.1
-     result_extract_scenes = extract_scenes(video_path, threshold, offset_frames, crop_ratio)
-     print(result_extract_scenes)
-     # Get the image paths and the scene information
-     escenas = result_extract_scenes[0] if len(result_extract_scenes) > 0 else []
-     escenas_paths = [f["image"] for f in escenas]
-     print(escenas_paths)
-     info_escenas = result_extract_scenes[1] if len(result_extract_scenes) > 1 else []
-     print(info_escenas)
-
-     print("Extracting one image per second...")
-     result_extract_per_second = keyframes_every_second_extraction(video_path)
-     # Get the image paths and the per-second frame information
-     images_per_second = result_extract_per_second[0] if len(result_extract_per_second) > 0 else []
-     images_per_second_paths = [f["image"] for f in images_per_second]
-     info_images_per_second = result_extract_per_second[1] if len(result_extract_per_second) > 1 else []
-
-     print("Enriching the scene information with who appears in each scene and with OCR detection...")
-     info_escenas_completa = []
-     for imagen_escena, info_escena in zip(escenas_paths, info_escenas):
-         result_add_ocr_and_faces = add_ocr_and_faces(imagen_escena, info_escena, face_col)
-         info_escenas_completa.append(result_add_ocr_and_faces)
-
-     print("Enriching the per-second image information with who appears in each frame and with OCR detection...")
-     info_images_per_second_completa = []
-     for imagen_segundo, info_segundo in zip(images_per_second_paths, info_images_per_second):
-         result_add_ocr_and_faces = add_ocr_and_faces(imagen_segundo, info_segundo, face_col)
-         info_images_per_second_completa.append(result_add_ocr_and_faces)
-     print(info_escenas_completa)
-
-     print("Next the OCR results will be processed (some scenes will be replaced by one of the per-second images if it has better OCR)...")
-     # This will be done last
-
-     print("Combining scene and per-second image information...")
-     info_escenas_completa = map_identities_per_second(info_images_per_second_completa, info_escenas_completa)
-     print(info_escenas_completa)
-
-     print("Now the scene descriptions are added to the scene dictionaries.")
-     for escena_path, info_escena in zip(escenas_paths, info_escenas_completa):
-         descripcion_escena = extract_descripcion_escena(escena_path)
-         lista = ast.literal_eval(descripcion_escena)
-         frase = lista[0]
-         info_escena["descripcion"] = frase
-         del descripcion_escena
-         torch.cuda.empty_cache()
-
-     return info_escenas_completa, info_images_per_second_completa
-
- def pipeline_preprocessing_audio(video_path: str, voice_col):
-     """
-     Pipeline that takes a video and performs all of the audio-side preprocessing.
-     """
-     print(f"Processing video for audio: {video_path}")
-
-     print("Extracting audio from the video...")
-     audio_video = extract_audio_from_video(video_path)
-     print(audio_video)
-
-     print("Diarizing the audio...")
-     diarization_audio = diarize_audio(audio_video)
-     print(diarization_audio)
-     clips_path = diarization_audio[0]
-     print(clips_path)
-     diarization_info = diarization_audio[1]
-     print(diarization_info)
-
-     print("Transcribing the full video...")
-     full_transcription = transcribe_long_audio(audio_video)
-     print(full_transcription)
-
-     print("Transcribing the diarized clips...")
-     for clip_path, clip_info in zip(clips_path, diarization_info):
-         clip_transcription = transcribe_short_audio(clip_path)
-         clip_info["transcription"] = clip_transcription
-
-     print("Computing embeddings for each extracted clip, then identifying the voices...")
-     for clip_path, clip_info in zip(clips_path, diarization_info):
-         clip_speaker = identificar_veu(clip_path, voice_col)
-         clip_info["speaker"] = clip_speaker
-
-     return full_transcription, diarization_info
-
- @router.post("/generate_initial_srt_and_info", tags=["Initial Transcription Process"])
- async def pipeline_video_analysis(
-     sha1: str,
-     token: str = Query(..., description="Token required for authorization")
- ):
-     """
-     Endpoint that processes a full video identified by its SHA1 folder, performs
-     complete audio-visual preprocessing, and generates an SRT subtitle file plus
-     an accompanying info JSON.
-
-     This pipeline integrates:
-     - Vision preprocessing (scene detection, keyframes, OCR, face recognition)
-     - Audio preprocessing (diarization, speech recognition, speaker identity matching)
-     - Identity mapping between vision and audio streams
-     - Final generation of an SRT file describing who speaks and when
-
-     Parameters
-     ----------
-     sha1 : str
-         Identifier corresponding to the folder containing the video and related assets.
-     token : str
-         Security token required for authorization.
-
-     Returns
-     -------
-     dict
-         Status message; the SRT and info JSON are written under
-         /data/media/<sha1>/initial_srt and served by the download endpoints.
-     """
-
-     validate_token(token)
-
-     # Resolve directories
-     sha1_folder = MEDIA_ROOT / sha1
-     clip_folder = sha1_folder / "clip"
-
-     if not sha1_folder.exists() or not sha1_folder.is_dir():
-         raise HTTPException(status_code=404, detail="SHA1 folder not found")
-
-     if not clip_folder.exists() or not clip_folder.is_dir():
-         raise HTTPException(status_code=404, detail="Clip folder not found")
-
-     # Locate video file
-     mp4_files = list(clip_folder.glob("*.mp4"))
-     if not mp4_files:
-         raise HTTPException(status_code=404, detail="No MP4 files found")
-
-     video_path = mp4_files[0]
-
-     print(f"Processing full video: {video_path}")
-
-     # Get face and voice embeddings for casting
-     face_col, voice_col = get_casting(sha1)
-
-     # Vision processing pipeline
-     info_escenas, info_images_per_second = pipeline_preprocessing_vision(video_path, face_col)
-     torch.cuda.empty_cache()
-
-     # Audio processing pipeline
-     full_transcription, info_clips = pipeline_preprocessing_audio(video_path, voice_col)
-
-     # Merge identities from vision pipeline with audio segments
-     info_clips = map_identities_per_second(info_images_per_second, info_clips)
-
-     # Generate the final SRT subtitle file
-     generate_srt_from_segments(info_clips, sha1)
-
-     # Create result JSON
-     result_json = {
-         "full_transcription": full_transcription,
-         "info_escenas": info_escenas,
-         "info_clips": info_clips
-     }
-
-     # Path: /data/media/<sha1>/initial_srt
-     video_root = MEDIA_ROOT / sha1
-     video_root.mkdir(parents=True, exist_ok=True)
-     srt_dir = video_root / "initial_srt"
-     srt_dir.mkdir(parents=True, exist_ok=True)
-
-     final_path = srt_dir / "initial_info.json"
-
-     with final_path.open("w", encoding="utf-8") as f:
-         json.dump(result_json, f, ensure_ascii=False, indent=4)
-
-     # The endpoint returns an OK message
-     return {"status": "ok", "message": "Initial SRT and info JSON generated"}
-
- def get_initial_info_path(sha1: str):
-     video_root = MEDIA_ROOT / sha1
-     srt_dir = video_root / "initial_srt"
-     final_path = srt_dir / "initial_info.json"
-
-     if not video_root.exists() or not video_root.is_dir():
-         raise HTTPException(status_code=404, detail="SHA1 folder not found")
-     if not srt_dir.exists() or not srt_dir.is_dir():
-         raise HTTPException(status_code=404, detail="initial_srt folder not found")
-     if not final_path.exists() or not final_path.is_file():
-         raise HTTPException(status_code=404, detail="initial_info JSON not found")
-
-     return final_path
-
- def get_initial_srt_path(sha1: str):
-     video_root = MEDIA_ROOT / sha1
-     srt_dir = video_root / "initial_srt"
-     final_path = srt_dir / "initial.srt"
-
-     if not video_root.exists() or not video_root.is_dir():
-         raise HTTPException(status_code=404, detail="SHA1 folder not found")
-     if not srt_dir.exists() or not srt_dir.is_dir():
-         raise HTTPException(status_code=404, detail="initial_srt folder not found")
-     if not final_path.exists() or not final_path.is_file():
-         raise HTTPException(status_code=404, detail="initial.srt file not found")
-
-     return final_path
-
- @router.get("/download_initial_srt", tags=["Initial Transcription Process"])
- def download_initial_srt(
-     sha1: str,
-     token: str = Query(..., description="Token required for authorization")
- ):
-     """
-     Download the initial SRT for a specific video identified by its SHA-1.
-     The file is expected under:
-         /data/media/<sha1>/initial_srt/initial.srt
-     Steps:
-     - Validate the token.
-     - Resolve the file path, raising 404 if any folder or file is missing.
-     - Return the SRT as a FileResponse.
-     """
-     validate_token(token)
-
-     file_path = get_initial_srt_path(sha1)
-
-     return FileResponse(
-         path=file_path,
-         media_type="application/x-subrip",
-         filename="initial.srt"
-     )
-
- @router.get("/download_initial_info", tags=["Initial Transcription Process"])
- def download_initial_info(
-     sha1: str,
-     token: str = Query(..., description="Token required for authorization")
- ):
-     """
-     Download the initial info JSON for a specific video identified by its SHA-1.
-     The file is expected under:
-         /data/media/<sha1>/initial_srt/initial_info.json
-     Steps:
-     - Validate the token.
-     - Resolve the file path, raising 404 if any folder or file is missing.
-     - Return the JSON as a FileResponse.
-     """
-     validate_token(token)
-
-     file_path = get_initial_info_path(sha1)
-
-     return FileResponse(
-         path=file_path,
-         media_type="application/json",
-         filename="initial_info.json"
-     )
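
To make the identity-merging step concrete, here is a small worked example of map_identities_per_second. The frame and interval values are hypothetical; only per-second frames whose start time falls inside an interval contribute to that interval's counts:

# Hypothetical per-second frame records and one diarization interval
frames = [
    {"start": 1.0, "faces": ["Anna"]},
    {"start": 2.0, "faces": ["Anna", "Marc"]},
    {"start": 9.0, "faces": ["Marc"]},   # outside the interval, ignored
]
intervals = [{"start": 0.0, "end": 3.0}]

# map_identities_per_second(frames, intervals) mutates and returns intervals:
# [{"start": 0.0, "end": 3.0, "counts": {"Anna": 2, "Marc": 1}}]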
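
Likewise, a sketch of what generate_srt_from_segments builds from two hypothetical segments, with timestamps formatted by _fmt_srt_time and entries separated by a blank line:

segments = [
    {"start": 0.0, "end": 2.5, "speaker": {"identity": "Anna"}, "transcription": "Hola!"},
    {"start": 2.5, "end": 65.25, "speaker": {"identity": "Marc"}, "transcription": "Bon dia."},
]

# generate_srt_from_segments(segments, sha1) writes initial.srt containing:
#
# 1
# 00:00:00,000 --> 00:00:02,500
# [Anna]: Hola!
#
# 2
# 00:00:02,500 --> 00:01:05,250
# [Marc]: Bon dia.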
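
The audio pipeline enriches each diarization record in place before the SRT step. Assuming diarize_audio returns (clip_paths, diarization_info) with start/end times per clip, one record evolves roughly as follows; the field shapes are assumptions inferred from how generate_srt_from_segments reads them:

clip_info = {"start": 12.3, "end": 15.8}      # produced by diarize_audio (assumed shape)
clip_info["transcription"] = "..."            # added via transcribe_short_audio(clip_path)
clip_info["speaker"] = {"identity": "Anna"}   # added via identificar_veu(clip_path, voice_col)
# generate_srt_from_segments later reads seg["speaker"]["identity"] and seg["transcription"]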
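
Finally, a minimal client sketch for the three endpoints this router exposed; the base URL and token are placeholders, and a local deployment is assumed:

import requests

BASE = "http://localhost:8000"  # hypothetical host
params = {"sha1": "<video-sha1>", "token": "<auth-token>"}

# Run the full vision + audio preprocessing for one uploaded video
r = requests.post(f"{BASE}/transcription/generate_initial_srt_and_info", params=params)
r.raise_for_status()  # -> {"status": "ok", "message": "Initial SRT and info JSON generated"}

# Download the artifacts written under /data/media/<sha1>/initial_srt/
srt_text = requests.get(f"{BASE}/transcription/download_initial_srt", params=params).text
info = requests.get(f"{BASE}/transcription/download_initial_info", params=params).json()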