File size: 1,965 Bytes
903b444
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# Embedding + autocomplete index builder — creates FAISS vector index and bigram index
import os
import numpy as np
import pandas as pd
import faiss
from sentence_transformers import SentenceTransformer
from config import (
    META_CSV,
    INDEX_DIR,
    FAISS_PATH,
    EMBEDDING_MODEL,
    VIDEO_METADATA,  
)

# Autocomplete index builder
from autocomplete import build_bigrams_index, BIGRAMS_PATH

# Build FAISS embedding index + bigram autocomplete index
def _block_text(block: dict) -> str:
    """Return the block's ``text`` field as a string that is safe to encode.

    Missing values (``None`` or float NaN, which is how pandas represents an
    empty CSV cell) become ``""``; any other non-string value is converted
    with ``str``.
    """
    value = block.get("text")
    if isinstance(value, str):
        return value
    if value is None or (isinstance(value, float) and np.isnan(value)):
        return ""
    return str(value)


def build_embedding_index(subtitle_blocks: list[dict]):
    """Build and persist the FAISS vector index and the bigram index.

    Each block's ``text`` is encoded with the SentenceTransformer model named
    by ``EMBEDDING_MODEL``. The vectors are added to a flat L2 FAISS index,
    which is written to ``FAISS_PATH`` (``INDEX_DIR`` is created if needed).
    Afterwards the bigram autocomplete index is built at ``BIGRAMS_PATH``.

    Args:
        subtitle_blocks: Subtitle records; only the ``"text"`` key is read
            here. The full records are passed on to ``build_bigrams_index``.

    Raises:
        ValueError: If ``subtitle_blocks`` is empty.
    """
    # One text per block, in block order: vectors are added to the index in
    # this same order, so blank text is kept as "" rather than dropped.
    texts = [_block_text(block) for block in subtitle_blocks]
    if not texts:
        raise ValueError("No texts found in subtitle blocks. Did you generate metadata.csv?")

    model = SentenceTransformer(EMBEDDING_MODEL)
    vectors = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)

    # FAISS expects float32 input.
    vectors = np.asarray(vectors, dtype=np.float32)

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)

    os.makedirs(INDEX_DIR, exist_ok=True)
    faiss.write_index(index, os.fspath(FAISS_PATH))

    # Build bigrams for autocomplete.
    # NOTE(review): min_count=2 presumably drops bigrams seen only once —
    # confirm against autocomplete.build_bigrams_index.
    build_bigrams_index(subtitle_blocks, out_path=BIGRAMS_PATH, min_count=2)

# Load subtitle blocks from CSV and attach video titles
def load_blocks_from_csv(csv_path) -> list[dict]:
    """Read subtitle rows from ``csv_path`` and attach a ``video_title`` to each.

    Every row becomes a dict keyed by column name. The row's ``video_id`` is
    matched against the ``"id"`` of the entries in ``VIDEO_METADATA``; on a
    match the entry's ``"title"`` is stored under ``"video_title"``, otherwise
    the title is ``"Unknown Video"``.

    Args:
        csv_path: Anything ``pandas.read_csv`` accepts (path or file-like).

    Returns:
        The list of row dicts, each with a ``"video_title"`` key added.
    """
    records = pd.read_csv(csv_path).to_dict("records")
    if not records:
        return records

    # Index the metadata by video id once instead of rescanning it per row.
    # setdefault keeps the first entry for a duplicated id, which matches a
    # first-match scan over VIDEO_METADATA.
    metadata_by_id: dict = {}
    for meta in VIDEO_METADATA.values():
        metadata_by_id.setdefault(meta["id"], meta)

    for record in records:
        meta = metadata_by_id.get(record.get("video_id"))
        # Test for presence, not truthiness, so a match is never discarded.
        record["video_title"] = meta["title"] if meta is not None else "Unknown Video"
    return records

# build FAISS + autocomplete indexes
if __name__ == "__main__":
    # metadata.csv is the input for both indexes, so it must already exist.
    if META_CSV.exists():
        build_embedding_index(load_blocks_from_csv(META_CSV))
    else:
        raise FileNotFoundError(
            f"metadata.csv not found at {META_CSV}. Run clean_subtitles.py first to generate it."
        )