VeuReu committed
Commit 104fa1a · verified · 1 parent: c854a70

Upload 6 files

Files changed (1): vision_tools.py (+24 -74)
vision_tools.py CHANGED
@@ -43,6 +43,7 @@ from transformers import AutoProcessor, LlavaForConditionalGeneration
 from PIL import Image
 
 from audio_tools import process_audio_for_video
+from llm_router import load_yaml, LLMRouter
 
 import cv2
 
@@ -238,91 +239,40 @@ def describe_montage_sequence(
     informacion,
     face_identities,
     *,
-    config_path: str = 'configs/config_veureu.yaml'
+    config_path: str = 'config.yaml'
 ) -> Dict[str, Any]:
-    """Describe each sub-image of a montage.
-
-    Inputs
-    ------
-    montage_path: str
-        Path to a composite image made of n sub-images placed sequentially.
-    n: int
-        Number of sub-images to split and describe.
-    config_path: str
-        Path to YAML with 'vision_describer' configuration (provider and params).
-
-    Returns
-    -------
-    list: the description of each image
+    """Describe each sub-image of a montage using remote Space (svision) via LLMRouter.
+
+    Returns a list of descriptions, one per tile.
     """
 
-    path_model = "BSC-LT/salamandra-7b-vision"
-
-    processor = AutoProcessor.from_pretrained(path_model)
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    dtype = torch.float16 if device == "cuda" else torch.float32
-    model = LlavaForConditionalGeneration.from_pretrained(
-        path_model,
-        torch_dtype=dtype,
-        low_cpu_mem_usage=True
-    ).to(device)
-
     img = cv2.imread(montage_path, cv2.IMREAD_COLOR)
     if img is None:
         raise RuntimeError(f"No se puede leer la imagen: {montage_path}")
 
-    cfg = load_config(config_path)
+    # Load engine config and split montage into tiles
+    cfg = load_yaml(config_path)
     tiles = _split_montage(img, n, cfg)
     if len(tiles) < n:
         raise RuntimeError(f"Se produjeron {len(tiles)} tiles, se esperaban {n}")
 
-    # Convert each tile to a PIL Image
-    tile_images = [Image.fromarray(cv2.cvtColor(t, cv2.COLOR_BGR2RGB)) for t in tiles]
-
-    sys_prompt = (
-        "Ets un expert en narrativa visual. "
-        "Descriu la imatge de manera molt breu i senzilla, en català, "
-        "explicant només l’acció principal que s’hi veu. "
-        "Respon amb una sola frase curta (10–20 paraules com a màxim), "
-        "sense afegir detalls innecessaris ni descriure l’entorn."
-    )
-
-    all_results = []
-
-    for i in range(len(tile_images)):
-        batch = [tile_images[i]]  # list with a single tile
-
-        conversation = [
-            {"role": "system", "content": sys_prompt},
-            {"role": "user", "content": [
-                {"type": "image", "image": batch[0]},
-                {"type": "text", "text": (
-                    f"Descriu la imatge de manera molt breu i senzilla, en català. ")}
-            ]}
-        ]
-        prompt_batch = processor.apply_chat_template(conversation, add_generation_prompt=True)
-
-        inputs = processor(images=batch, text=prompt_batch, return_tensors="pt")
-        for k, v in inputs.items():
-            if v.dtype.is_floating_point:
-                inputs[k] = v.to(device, dtype)
-            else:
-                inputs[k] = v.to(device)
-
-        output = model.generate(**inputs, max_new_tokens=1024)
-        text = processor.decode(output[0], skip_special_tokens=True)
-        lines = text.split("\n")
-
-        desc = ""
-        for j, line in enumerate(lines):
-            if line.lower().startswith(" assistant"):
-                desc = "\n".join(lines[j+1:]).strip()
-                break
-
-        all_results.append(desc)
-        torch.cuda.empty_cache()
-
-    return all_results
+    # Persist tiles as temporary images next to montage
+    out_dir = Path(montage_path).parent
+    frame_paths: List[str] = []
+    for i, t in enumerate(tiles):
+        p = out_dir / f"tile_{i:03d}.jpg"
+        cv2.imwrite(str(p), t)
+        frame_paths.append(str(p))
+
+    # Prepare context and call remote vision describer
+    context = {
+        "informacion": informacion,
+        "face_identities": sorted(list(face_identities or set())),
+    }
+    model_name = (cfg.get("models", {}).get("vision") or "salamandra-vision")
+    router = LLMRouter(cfg)
+    descs = router.vision_describe(frame_paths, context=context, model=model_name)
+    return descs
 
 # --------------------------- IMAGES EXTRACTION -----------------------------
 def keyframe_conditional_extraction_ana(
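Taken together, the new body splits the montage, writes each tile to disk, and delegates description to the remote vision Space. Below is a minimal, hypothetical usage sketch: the leading montage_path and n parameters and the shape of informacion are inferred from the function body rather than shown in the hunk, and the paths and config values are placeholders. Note the -> Dict[str, Any] return annotation was left unchanged even though the function now returns a list.

# Hypothetical usage sketch; not taken from the repository.
from vision_tools import describe_montage_sequence

# Assumed minimal config.yaml, matching the cfg.get("models", {}).get("vision") lookup:
#   models:
#     vision: salamandra-vision
descs = describe_montage_sequence(
    "frames/montage_001.jpg",       # composite image holding n tiles in sequence
    4,                              # number of sub-images to split and describe
    {"scene": "intro"},             # 'informacion': free-form context (shape assumed)
    {"person_01", "person_02"},     # face identities; None is tolerated by the code
    config_path="config.yaml",
)
for i, d in enumerate(descs):
    print(f"tile {i}: {d}")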
 
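The llm_router module itself is not part of this commit, so the following is only a sketch of the call contract the new code relies on — load_yaml(config_path), LLMRouter(cfg), and router.vision_describe(frame_paths, context=..., model=...). Everything beyond those three call sites is an assumption, not the actual implementation.

# Interface sketch inferred from the call sites above; not the actual llm_router code.
from typing import Any, Dict, List

def load_yaml(path: str) -> Dict[str, Any]:
    # Assumed behavior: parse the YAML config file into a plain dict.
    import yaml
    with open(path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f) or {}

class LLMRouter:
    def __init__(self, cfg: Dict[str, Any]) -> None:
        self.cfg = cfg  # full engine config, including the models.vision entry

    def vision_describe(self, frame_paths: List[str], *,
                        context: Dict[str, Any], model: str) -> List[str]:
        # Assumed contract: send each frame plus the shared context to the remote
        # Space and return one description per input path, in input order.
        raise NotImplementedError("remote call elided; contract sketch only")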