Upload 6 files
vision_tools.py  CHANGED  (+24 -74)

@@ -43,6 +43,7 @@ from transformers import AutoProcessor, LlavaForConditionalGeneration
 from PIL import Image

 from audio_tools import process_audio_for_video
+from llm_router import load_yaml, LLMRouter

 import cv2

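The new import is the only behavioural change in this hunk: the montage describer now goes through llm_router instead of loading a local model. For orientation, a minimal sketch of the interface this diff appears to assume from llm_router; only the names used in the second hunk (load_yaml, LLMRouter, vision_describe) come from the diff, everything else here is illustrative:

    # Hypothetical sketch of the llm_router surface assumed by this diff.
    from typing import Any, Dict, List, Optional
    import yaml

    def load_yaml(path: str) -> Dict[str, Any]:
        # Read the engine configuration into a plain dict.
        with open(path, "r", encoding="utf-8") as fh:
            return yaml.safe_load(fh) or {}

    class LLMRouter:
        def __init__(self, cfg: Dict[str, Any]) -> None:
            self.cfg = cfg

        def vision_describe(
            self,
            image_paths: List[str],
            *,
            context: Optional[Dict[str, Any]] = None,
            model: str = "salamandra-vision",
        ) -> List[str]:
            # A real router would forward the images to the remote vision Space
            # and return one description per image; this stub only fixes the shape.
            raise NotImplementedError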

@@ -238,91 +239,40 @@ def describe_montage_sequence(
     informacion,
     face_identities,
     *,
-    config_path: str = '
+    config_path: str = 'config.yaml'
 ) -> Dict[str, Any]:
-    """Describe each sub-image of a montage.
-
-    Parameters
-    ------
-    montage_path: str
-        Path to a composite image made of n sub-images placed sequentially.
-    n: int
-        Number of sub-images to split and describe.
-    config_path: str
-        Path to YAML with 'vision_describer' configuration (provider and params).
-
-    Returns
-    -------
-    list: the description of each image
+    """Describe each sub-image of a montage using remote Space (svision) via LLMRouter.
+
+    Returns a list of descriptions, one per tile.
     """

-    path_model = "BSC-LT/salamandra-7b-vision"
-
-    processor = AutoProcessor.from_pretrained(path_model)
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    dtype = torch.float16 if device == "cuda" else torch.float32
-    model = LlavaForConditionalGeneration.from_pretrained(
-        path_model,
-        torch_dtype=dtype,
-        low_cpu_mem_usage=True
-    ).to(device)
-
     img = cv2.imread(montage_path, cv2.IMREAD_COLOR)
     if img is None:
         raise RuntimeError(f"No se puede leer la imagen: {montage_path}")

-
+    # Load engine config and split montage into tiles
+    cfg = load_yaml(config_path)
     tiles = _split_montage(img, n, cfg)
     if len(tiles) < n:
         raise RuntimeError(f"Se produjeron {len(tiles)} tiles, se esperaban {n}")

-    #
-    conversation = [
-        {"role": "system", "content": sys_prompt},
-        {"role": "user", "content": [
-            {"type": "image", "image": batch[0]},
-            {"type": "text", "text": (
-                f"Descriu la imatge de manera molt breu i senzilla, en català. ")}
-        ]}
-    ]
-    prompt_batch = processor.apply_chat_template(conversation, add_generation_prompt=True)
-
-    inputs = processor(images=batch, text=prompt_batch, return_tensors="pt")
-    for k, v in inputs.items():
-        if v.dtype.is_floating_point:
-            inputs[k] = v.to(device, dtype)
-        else:
-            inputs[k] = v.to(device)
-
-    output = model.generate(**inputs, max_new_tokens=1024)
-    text = processor.decode(output[0], skip_special_tokens=True)
-    lines = text.split("\n")
-
-    desc = ""
-    for i, line in enumerate(lines):
-        if line.lower().startswith(" assistant"):
-            desc = "\n".join(lines[i+1:]).strip()
-            break
-
-    all_results.append(desc)
-    torch.cuda.empty_cache()
-
-    return all_results
+    # Persist tiles as temporary images next to montage
+    out_dir = Path(montage_path).parent
+    frame_paths: List[str] = []
+    for i, t in enumerate(tiles):
+        p = out_dir / f"tile_{i:03d}.jpg"
+        cv2.imwrite(str(p), t)
+        frame_paths.append(str(p))
+
+    # Prepare context and call remote vision describer
+    context = {
+        "informacion": informacion,
+        "face_identities": sorted(list(face_identities or set())),
+    }
+    model_name = (cfg.get("models", {}).get("vision") or "salamandra-vision")
+    router = LLMRouter(cfg)
+    descs = router.vision_describe(frame_paths, context=context, model=model_name)
+    return descs

 # --------------------------- IMAGES EXTRACTION -----------------------------
 def keyframe_conditional_extraction_ana(
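The model name is resolved as cfg["models"]["vision"] with "salamandra-vision" as the fallback. A hypothetical config.yaml fragment consistent with that lookup; only the models.vision key is implied by the code, any other keys would be illustrative:

    # Hypothetical fragment of config.yaml
    models:
      vision: salamandra-vision   # forwarded to LLMRouter.vision_describe as model=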