VeuReu committed
Commit 104fa1a · verified · 1 parent: c854a70

Upload 6 files

Files changed (1): vision_tools.py (+24 -74)
vision_tools.py CHANGED
@@ -43,6 +43,7 @@ from transformers import AutoProcessor, LlavaForConditionalGeneration
 from PIL import Image
 
 from audio_tools import process_audio_for_video
+from llm_router import load_yaml, LLMRouter
 
 import cv2
 
@@ -238,91 +239,40 @@ def describe_montage_sequence(
     informacion,
     face_identities,
     *,
-    config_path: str = 'configs/config_veureu.yaml'
+    config_path: str = 'config.yaml'
 ) -> Dict[str, Any]:
-    """Describe each sub-image of a montage.
-
-    Inputs
-    ------
-    montage_path: str
-        Path to a composite image made of n sub-images placed sequentially.
-    n: int
-        Number of sub-images to split and describe.
-    config_path: str
-        Path to YAML with 'vision_describer' configuration (provider and params).
-
-    Returns
-    -------
-    list: the description of each image
+    """Describe each sub-image of a montage using remote Space (svision) via LLMRouter.
+
+    Returns a list of descriptions, one per tile.
     """
 
-    path_model = "BSC-LT/salamandra-7b-vision"
-
-    processor = AutoProcessor.from_pretrained(path_model)
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    dtype = torch.float16 if device == "cuda" else torch.float32
-    model = LlavaForConditionalGeneration.from_pretrained(
-        path_model,
-        torch_dtype=dtype,
-        low_cpu_mem_usage=True
-    ).to(device)
-
     img = cv2.imread(montage_path, cv2.IMREAD_COLOR)
     if img is None:
         raise RuntimeError(f"No se puede leer la imagen: {montage_path}")
 
-    cfg = load_config(config_path)
+    # Load engine config and split montage into tiles
+    cfg = load_yaml(config_path)
     tiles = _split_montage(img, n, cfg)
     if len(tiles) < n:
         raise RuntimeError(f"Se produjeron {len(tiles)} tiles, se esperaban {n}")
 
-    # Convert each tile to a PIL Image
-    tile_images = [Image.fromarray(cv2.cvtColor(t, cv2.COLOR_BGR2RGB)) for t in tiles]
-
-    sys_prompt = (
-        "Ets un expert en narrativa visual. "
-        "Descriu la imatge de manera molt breu i senzilla, en català, "
-        "explicant només l’acció principal que s’hi veu. "
-        "Respon amb una sola frase curta (10–20 paraules com a màxim), "
-        "sense afegir detalls innecessaris ni descriure l’entorn."
-    )
-
-    all_results = []
-
-    for i in range(len(tile_images)):
-        batch = [tile_images[i]]  # list with a single tile
-
-        conversation = [
-            {"role": "system", "content": sys_prompt},
-            {"role": "user", "content": [
-                {"type": "image", "image": batch[0]},
-                {"type": "text", "text": (
-                    f"Descriu la imatge de manera molt breu i senzilla, en català. ")}
-            ]}
-        ]
-        prompt_batch = processor.apply_chat_template(conversation, add_generation_prompt=True)
-
-        inputs = processor(images=batch, text=prompt_batch, return_tensors="pt")
-        for k, v in inputs.items():
-            if v.dtype.is_floating_point:
-                inputs[k] = v.to(device, dtype)
-            else:
-                inputs[k] = v.to(device)
-
-        output = model.generate(**inputs, max_new_tokens=1024)
-        text = processor.decode(output[0], skip_special_tokens=True)
-        lines = text.split("\n")
-
-        desc = ""
-        for j, line in enumerate(lines):
-            if line.lower().startswith(" assistant"):
-                desc = "\n".join(lines[j+1:]).strip()
-                break
-
-        all_results.append(desc)
-        torch.cuda.empty_cache()
-
-    return all_results
+    # Persist tiles as temporary images next to montage
+    out_dir = Path(montage_path).parent
+    frame_paths: List[str] = []
+    for i, t in enumerate(tiles):
+        p = out_dir / f"tile_{i:03d}.jpg"
+        cv2.imwrite(str(p), t)
+        frame_paths.append(str(p))
+
+    # Prepare context and call remote vision describer
+    context = {
+        "informacion": informacion,
+        "face_identities": sorted(list(face_identities or set())),
+    }
+    model_name = (cfg.get("models", {}).get("vision") or "salamandra-vision")
+    router = LLMRouter(cfg)
+    descs = router.vision_describe(frame_paths, context=context, model=model_name)
+    return descs
 
 # --------------------------- IMAGES EXTRACTION -----------------------------
 def keyframe_conditional_extraction_ana(
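Taken together, the new body splits the montage, writes each tile to disk, and delegates description to the remote vision Space. Below is a minimal, hypothetical usage sketch: the leading montage_path and n parameters and the shape of informacion are inferred from the function body rather than shown in the hunk, and the paths and config values are placeholders. Note the -> Dict[str, Any] return annotation was left unchanged even though the function now returns a list.

# Hypothetical usage sketch; not taken from the repository.
from vision_tools import describe_montage_sequence

# Assumed minimal config.yaml, matching the cfg.get("models", {}).get("vision") lookup:
#   models:
#     vision: salamandra-vision
descs = describe_montage_sequence(
    "frames/montage_001.jpg",       # composite image holding n tiles in sequence
    4,                              # number of sub-images to split and describe
    {"scene": "intro"},             # 'informacion': free-form context (shape assumed)
    {"person_01", "person_02"},     # face identities; None is tolerated by the code
    config_path="config.yaml",
)
for i, d in enumerate(descs):
    print(f"tile {i}: {d}")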
 
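The llm_router module itself is not part of this commit, so the following is only a sketch of the call contract the new code relies on — load_yaml(config_path), LLMRouter(cfg), and router.vision_describe(frame_paths, context=..., model=...). Everything beyond those three call sites is an assumption, not the actual implementation.

# Interface sketch inferred from the call sites above; not the actual llm_router code.
from typing import Any, Dict, List

def load_yaml(path: str) -> Dict[str, Any]:
    # Assumed behavior: parse the YAML config file into a plain dict.
    import yaml
    with open(path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f) or {}

class LLMRouter:
    def __init__(self, cfg: Dict[str, Any]) -> None:
        self.cfg = cfg  # full engine config, including the models.vision entry

    def vision_describe(self, frame_paths: List[str], *,
                        context: Dict[str, Any], model: str) -> List[str]:
        # Assumed contract: send each frame plus the shared context to the remote
        # Space and return one description per input path, in input order.
        raise NotImplementedError("remote call elided; contract sketch only")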