VeuReu committed on
Commit
2b325e0
·
verified ·
1 Parent(s): 0df240a

Update main_process/main_router.py

Browse files
Files changed (1) hide show
  1. main_process/main_router.py +0 -53
main_process/main_router.py CHANGED
@@ -36,59 +36,6 @@ def get_casting(video_sha1: str):
36
 
37
  return faces_json, voices_json
38
 
39
- def cluster_secuencial_ocr(ocr_list, threshold=0.3):
40
- if not ocr_list:
41
- return []
42
-
43
- ocr_text = [item["ocr"] for item in ocr_list if item and isinstance(item["ocr"], str)]
44
- if not ocr_text:
45
- return []
46
-
47
- print(ocr_text)
48
-
49
- model = SentenceTransformer("all-MiniLM-L6-v2")
50
- embeddings = model.encode(ocr_text, normalize_embeddings=True)
51
-
52
- clusters_representantes = []
53
- if not embeddings.any():
54
- return []
55
-
56
- prev_embedding = embeddings[0]
57
- start_time = ocr_list[0]["start"]
58
- ocr_prev=ocr_text[0]
59
-
60
- for i, emb in enumerate(embeddings[1:], 1):
61
- ocr_actual=ocr_text[i]
62
- sim = cosine_similarity([prev_embedding], [emb])[0][0]
63
-
64
- print(ocr_prev, " - ", ocr_actual)
65
- print(f"Similitud entre: {sim}")
66
-
67
- if sim < threshold:
68
- clusters_representantes.append({'index': i-1, 'start_time': start_time})
69
- prev_embedding = emb
70
- ocr_prev=ocr_actual
71
- start_time = ocr_list[i]["start"]
72
-
73
- clusters_representantes.append({'index': len(embeddings)-1, 'start_time': start_time})
74
- print(clusters_representantes)
75
-
76
- ocr_final = []
77
- for cluster_info in clusters_representantes:
78
- idx = cluster_info['index']
79
-
80
- if idx < len(ocr_list) and ocr_list[idx]["ocr"]:
81
- ocr_item = {
82
- "ocr": ocr_list[idx]["ocr"],
83
- "image_path": ocr_list[idx]["image_path"],
84
- "start": cluster_info['start_time'],
85
- "end": ocr_list[idx]["end"],
86
- "faces": ocr_list[idx]["faces"]
87
- }
88
- ocr_final.append(ocr_item)
89
-
90
- return ocr_final
91
-
92
  def map_identities_per_second(frames_per_second, intervals):
93
  for seg in intervals:
94
  seg_start = seg["start"]
 
36
 
37
  return faces_json, voices_json
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  def map_identities_per_second(frames_per_second, intervals):
40
  for seg in intervals:
41
  seg_start = seg["start"]