from __future__ import annotations

import re
from dataclasses import dataclass
from datetime import timedelta
from typing import List, Optional, Dict, Any

# Matches an SRT timing line such as "00:00:01,000 --> 00:00:03,500".
# Both "," and "." are accepted as the millisecond separator.  The two
# timestamps are exposed through the named groups "start" and "end".
TIME_RE = re.compile(
    r"(?P<start>\d{2}:\d{2}:\d{2}[,\.]\d{3})\s*-->\s*(?P<end>\d{2}:\d{2}:\d{2}[,\.]\d{3})"
)


@dataclass
class SRTBlock:
    """A single subtitle cue parsed from an SRT document."""

    index: int  # cue number (parsed from the file, or positional fallback)
    start: float  # seconds
    end: float  # seconds
    text: str  # cue text; multi-line cues keep their "\n" separators


def _parse_timestamp(ts: str) -> float:
    """Convert 'HH:MM:SS,mmm' or 'HH:MM:SS.mmm' to seconds (float)."""
    ts = ts.replace(",", ".")
    h, m, s = ts.split(":")
    # Tolerate a missing fractional part by defaulting it to "0".
    seconds, millis = (s.split(".") + ["0"])[:2]
    td = timedelta(
        hours=int(h),
        minutes=int(m),
        seconds=int(seconds),
        # Right-pad so that e.g. "5" is read as 500 ms rather than 5 ms.
        milliseconds=int(millis.ljust(3, "0")),
    )
    return td.total_seconds()


def _parse_srt(srt_text: str) -> List[SRTBlock]:
    """Parse SRT text into a list of SRTBlock cues.

    Cues are separated by blank lines.  A chunk without a recognisable
    timing line is skipped.  When a chunk has no numeric index line, the
    cue receives a positional index (number of cues parsed so far + 1).
    """
    # Normalise Windows / old-Mac line endings before splitting.
    srt_text = srt_text.replace("\r\n", "\n").replace("\r", "\n")
    chunks = [c.strip() for c in re.split(r"\n\s*\n", srt_text) if c.strip()]

    blocks: List[SRTBlock] = []
    for chunk in chunks:
        lines = chunk.split("\n")

        idx_line = 0
        index = None
        first = lines[0].strip() if lines else ""
        # isdecimal() rather than isdigit(): isdigit() also accepts
        # characters such as "²" that int() rejects with ValueError.
        if first.isdecimal():
            index = int(first)
            idx_line = 1

        # The timing line is expected within the first few lines that
        # follow the (optional) index line.
        time_match = None
        time_line_idx = None
        for i, line in enumerate(lines[idx_line : idx_line + 3], start=idx_line):
            m = TIME_RE.search(line)
            if m:
                time_match = m
                time_line_idx = i
                break

        if not time_match or time_line_idx is None:
            continue

        start = _parse_timestamp(time_match.group("start"))
        end = _parse_timestamp(time_match.group("end"))

        if index is None:
            index = len(blocks) + 1

        text = "\n".join(lines[time_line_idx + 1 :]).strip()
        blocks.append(SRTBlock(index=index, start=start, end=end, text=text))

    return blocks


def analyze_srt(
    srt_text: str,
    *,
    ad_markers: Optional[List[str]] = None,
) -> Dict[str, Any]:
    """Analyze an SRT document and return basic metrics.

    Returned metrics:
        - duration_sec: estimated total duration of the video (seconds)
        - words_per_min: number of words per minute
        - speakers_blocks_per_min: number of dialogue (non-AD) blocks per
          minute
        - ad_time_ratio: fraction of the total time covered by blocks
          marked as AD
        - blocks_per_min: total number of blocks per minute

    Heuristics:
        - The video duration is assumed to be the latest end time among
          all blocks.
        - An "AD block" is one whose first text line contains any of the
          markers given in ``ad_markers`` (case-insensitive).  Defaults
          to "[AD]", "AD:" and "(AD)".  Empty markers are ignored, since
          an empty string would match every block.
        - AD time is the sum of the individual AD block durations, so
          overlapping AD blocks can push ``ad_time_ratio`` above 1.

    Args:
        srt_text: Full contents of the SRT file.
        ad_markers: Optional list of marker strings identifying AD blocks.

    Returns:
        Dictionary with the five float metrics listed above.  All values
        are 0.0 when no block could be parsed.
    """
    blocks = _parse_srt(srt_text)
    if not blocks:
        return {
            "duration_sec": 0.0,
            "words_per_min": 0.0,
            "speakers_blocks_per_min": 0.0,
            "ad_time_ratio": 0.0,
            "blocks_per_min": 0.0,
        }

    duration_sec = max(b.end for b in blocks)
    # Floor the denominator to avoid division by zero when every block
    # ends at 0 seconds.
    duration_min = max(duration_sec / 60.0, 1e-6)

    # Total words across every block (AD and speech alike).
    total_words = sum(len(b.text.split()) for b in blocks)

    markers = ["[AD]", "AD:", "(AD)"] if ad_markers is None else ad_markers
    # Upper-case once, outside the per-block loop, and drop empty markers.
    upper_markers = [mk.upper() for mk in markers if mk]

    def is_ad_block(block: SRTBlock) -> bool:
        first_line = (block.text.splitlines() or [""])[0].strip().upper()
        return any(mk in first_line for mk in upper_markers)

    ad_time = 0.0
    speech_blocks = 0
    for b in blocks:
        if is_ad_block(b):
            # Guard against malformed cues whose end precedes their start.
            ad_time += max(0.0, b.end - b.start)
        else:
            speech_blocks += 1

    words_per_min = total_words / duration_min
    speakers_blocks_per_min = speech_blocks / duration_min
    blocks_per_min = len(blocks) / duration_min
    ad_time_ratio = ad_time / duration_sec if duration_sec > 0 else 0.0

    return {
        "duration_sec": float(duration_sec),
        "words_per_min": float(words_per_min),
        "speakers_blocks_per_min": float(speakers_blocks_per_min),
        "ad_time_ratio": float(ad_time_ratio),
        "blocks_per_min": float(blocks_per_min),
    }


def embed_srt_sentences(
    srt_text: str,
    *,
    model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
) -> Dict[str, Any]:
    """Return embeddings for the sentences of an SRT document.

    Args:
        srt_text: Full contents of the SRT file as a string.
        model_name: Name of the sentence-transformers model to use.

    Returns:
        Dictionary with:
            - "model_name": name of the model used
            - "sentences": list of strings (one per non-empty block, with
              internal newlines replaced by spaces)
            - "embeddings": list of lists of floats with the embeddings

    Raises:
        ImportError: If ``sentence-transformers`` is not installed.  The
            import is attempted only when there is at least one sentence
            to embed.
    """
    blocks = _parse_srt(srt_text)
    sentences = [b.text.replace("\n", " ").strip() for b in blocks if b.text.strip()]

    if not sentences:
        return {"model_name": model_name, "sentences": [], "embeddings": []}

    # Imported lazily so the rest of the module works without the optional
    # dependency.
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError as exc:
        raise ImportError(
            "sentence-transformers no está instalado. "
            "Instala la dependencia para poder generar embeddings."
        ) from exc

    model = SentenceTransformer(model_name)
    embs = model.encode(sentences, convert_to_numpy=False)
    # Convert each vector to plain Python floats.
    embeddings = [list(map(float, vec)) for vec in embs]

    return {
        "model_name": model_name,
        "sentences": sentences,
        "embeddings": embeddings,
    }