from __future__ import annotations
from typing import Any, Dict, List, Optional, Tuple
from pathlib import Path
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from vision_tools import (
    keyframe_conditional_extraction_ana,
    keyframe_every_second,
    process_frames,
    FaceOfImageEmbedding,
    generar_montage,
    describe_montage_sequence,  # local fallback
)
from llm_router import load_yaml, LLMRouter
def cluster_ocr_sequential(ocr_list: List[Dict[str, Any]], threshold: float = 0.6) -> List[Dict[str, Any]]:
    """Collapse runs of consecutive frames whose OCR text is near-identical.

    Frames are grouped while the cosine similarity between their sentence
    embeddings stays at or above ``threshold``; one representative frame per
    group is returned, stamped with the group's start time.
    """
    if not ocr_list:
        return []
    # Keep the full frame dicts (not just the text) so that indices into
    # ``valid`` and ``embeddings`` stay aligned even after frames without
    # OCR text have been filtered out.
    valid = [item for item in ocr_list if item and isinstance(item.get("ocr"), str)]
    if not valid:
        return []
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode([item["ocr"] for item in valid], normalize_embeddings=True)
    clusters_repr = []
    prev_emb = embeddings[0]
    start_time = valid[0]["start"]
    for i, emb in enumerate(embeddings[1:], 1):
        sim = cosine_similarity([prev_emb], [emb])[0][0]
        if sim < threshold:
            # Similarity dropped below the threshold: close the current
            # cluster and start a new one at this frame.
            clusters_repr.append({"index": i - 1, "start_time": start_time})
            prev_emb = emb
            start_time = valid[i]["start"]
    clusters_repr.append({"index": len(embeddings) - 1, "start_time": start_time})
    ocr_final = []
    for cluster in clusters_repr:
        it = valid[cluster["index"]]
        ocr_final.append({
            "ocr": it.get("ocr"),
            "image_path": it.get("image_path"),
            "start": cluster["start_time"],
            "end": it.get("end"),
            "faces": it.get("faces"),
        })
    return ocr_final
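
# A minimal usage sketch for cluster_ocr_sequential, shown as a comment so the
# module stays import-safe. The frame dicts below are hypothetical values, not
# output from a real pipeline run:
#
#     frames = [
#         {"ocr": "Breaking news", "image_path": "f0.jpg", "start": 0.0, "end": 1.0, "faces": []},
#         {"ocr": "Breaking news!", "image_path": "f1.jpg", "start": 1.0, "end": 2.0, "faces": []},
#         {"ocr": "Weather update", "image_path": "f2.jpg", "start": 2.0, "end": 3.0, "faces": []},
#     ]
#     reps = cluster_ocr_sequential(frames, threshold=0.6)
#     # The first two frames collapse into one representative stamped with
#     # start=0.0; the third frame opens a new cluster.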
def build_keyframes_and_per_second(
    video_path: str,
    out_dir: Path,
    cfg: Dict[str, Any],
    face_collection=None,
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], float]:
    """Extract conditional keyframes plus one frame per second, run face/OCR
    processing on both, and merge OCR-cluster representatives into the
    keyframe timeline."""
    kf_dir = out_dir / "keyframes"
    ps_dir = out_dir / "frames_per_second"
    keyframes = keyframe_conditional_extraction_ana(video_path=video_path, output_dir=str(kf_dir))
    per_second = keyframe_every_second(video_path=video_path, output_dir=str(ps_dir))
    embedder = FaceOfImageEmbedding(deepface_model="Facenet512")
    kf_proc = process_frames(frames=keyframes, config=cfg, face_col=face_collection, embedding_model=embedder)
    ps_proc = process_frames(frames=per_second, config=cfg, face_col=face_collection, embedding_model=embedder)
    # Cluster the per-second OCR stream so that near-identical consecutive
    # readings collapse into a single representative frame.
    ocr_list = [{
        "ocr": fr.get("ocr"),
        "image_path": fr.get("image_path"),
        "start": fr.get("start"),
        "end": fr.get("end"),
        "faces": fr.get("faces"),
    } for fr in ps_proc]
    ocr_threshold = float(
        cfg.get("video_processing", {}).get("ocr_clustering", {}).get("similarity_threshold", 0.6)
    )
    ocr_final = cluster_ocr_sequential(ocr_list, threshold=ocr_threshold)
    # Merge: for each keyframe interval, substitute the OCR representatives
    # that fall inside it; keep the original keyframe if none do.
    kf_mod: List[Dict[str, Any]] = []
    idx = 1
    for k in kf_proc:
        ks, ke = k["start"], k["end"]
        inicio = True        # is this the first representative in the interval?
        sustituido = False   # has any representative replaced this keyframe?
        for f in ocr_final:
            if not (f["start"] >= ks and f["end"] <= ke):
                continue
            kf_mod.append({
                "id": idx,
                # The first representative inherits the keyframe's start;
                # later ones keep their own timestamps.
                "start": k["start"] if inicio else f["start"],
                "end": None,
                "image_path": f["image_path"],
                "faces": f["faces"],
                "ocr": f.get("ocr"),
                "description": None,
            })
            idx += 1
            sustituido = True
            inicio = False
        if not sustituido:
            k2 = dict(k)
            k2["id"] = idx
            kf_mod.append(k2)
            idx += 1
    return kf_mod, ps_proc, 0.0
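
# Hedged usage sketch for build_keyframes_and_per_second; the paths and the
# config structure are assumptions inferred from how the function reads
# ``cfg``, not a documented entry point:
#
#     cfg = load_yaml("config.yaml")
#     kf_mod, ps_proc, _ = build_keyframes_and_per_second(
#         video_path="clip.mp4",
#         out_dir=Path("out"),
#         cfg=cfg,
#     )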
def describe_keyframes_with_llm(
    keyframes: List[Dict[str, Any]],
    out_dir: Path,
    face_identities: Optional[set] = None,
    config_path: str | None = None,
) -> Tuple[List[Dict[str, Any]], Optional[str]]:
    """Ask the configured vision model to describe each keyframe, falling
    back to the local montage-based describer if the router call fails.

    Returns the keyframes (with ``description`` filled in where available)
    and the montage path, if one was built."""
    cfg = load_yaml(config_path or "config.yaml")
    model_name = (cfg.get("background_descriptor", {}).get("description", {}) or {}).get("model", "salamandra-vision")
    frame_paths = [k.get("image_path") for k in keyframes if k.get("image_path")]
    montage_dir = out_dir / "montage"
    montage_path = None
    if frame_paths:
        montage_path = generar_montage(frame_paths, montage_dir)
    # Compact context for the model: timing, OCR text, and detected faces.
    context = {
        "informacion": [{k: v for k, v in fr.items() if k in ("start", "end", "ocr", "faces")} for fr in keyframes],
        "face_identities": sorted(face_identities or set()),
    }
    descs: List[str] = []
    try:
        router = LLMRouter(cfg)
        descs = router.vision_describe(frame_paths, context=context, model=model_name)
    except Exception:
        # Local fallback: describe the montage sequence without the router.
        # Skipped when no montage was built, rather than passing "None" as a path.
        if montage_path is not None:
            descs = describe_montage_sequence(
                montage_path=str(montage_path),
                n=len(frame_paths),
                informacion=keyframes,
                face_identities=face_identities or set(),
                config_path=config_path or "config.yaml",
            )
    for i, fr in enumerate(keyframes):
        if i < len(descs):
            fr["description"] = descs[i]
    return keyframes, str(montage_path) if montage_path else None
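
# End-to-end sketch of the intended flow. The config file and video path are
# assumptions for illustration; this module defines no CLI of its own.
if __name__ == "__main__":
    _cfg = load_yaml("config.yaml")
    _out = Path("out")
    _out.mkdir(parents=True, exist_ok=True)
    _kf, _ps, _ = build_keyframes_and_per_second("clip.mp4", _out, _cfg)
    _kf, _montage = describe_keyframes_with_llm(_kf, _out)
    for _fr in _kf:
        print(_fr["id"], _fr.get("description"))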