# api_client.py  (UI - Space "veureu")
import os
import requests
import base64
import zipfile
import io
import json
from typing import Iterable, Dict, Any, Tuple
from PIL import Image

class APIClient:
    """
    Cliente para 'engine':
      POST /jobs                       -> {"job_id": "..."}
      GET  /jobs/{job_id}/status       -> {"status": "queued|processing|done|failed", ...}
      GET  /jobs/{job_id}/result       -> JobResult {"book": {...}, "une": {...}, ...}
    """
    def __init__(self, base_url: str, use_mock: bool = False, data_dir: str | None = None, token: str | None = None, timeout: int = 180, tts_url: str | None = None):
        # Engine base URL: the ENGINE_URL environment variable takes precedence
        env_engine_url = os.getenv("ENGINE_URL")
        self.base_url = (env_engine_url or base_url or "").rstrip("/")
        # Dedicated URL for the TTS service (defaults to the API_TTS_URL environment variable)
        self.tts_url = tts_url or os.getenv("API_TTS_URL", "https://veureu-tts.hf.space")
        print(f"🔧 APIClient.__init__ - tts_url received: {tts_url}")
        print(f"🔧 APIClient.__init__ - os.getenv(API_TTS_URL): {os.getenv('API_TTS_URL')}")
        print(f"🔧 APIClient.__init__ - final self.tts_url: {self.tts_url}")
        print(f"🔧 APIClient.__init__ - type of self.tts_url: {type(self.tts_url)}")
        print(f"🔧 APIClient.__init__ - repr of self.tts_url: {repr(self.tts_url)}")
        self.use_mock = use_mock
        self.data_dir = data_dir
        self.timeout = timeout
        self.session = requests.Session()
        # Engine secret token: API_ENGINE_VEUREU takes precedence
        token = token or os.getenv("API_ENGINE_VEUREU")
        if token:
            self.session.headers.update({"Authorization": f"Bearer {token}"})

    # ---- real mode (engine) ----
    def _post_jobs(self, video_path: str, modes: Iterable[str]) -> Dict[str, Any]:
        url = f"{self.base_url}/jobs"
        files = {"file": (os.path.basename(video_path), open(video_path, "rb"), "application/octet-stream")}
        data = {"modes": ",".join(modes)}
        r = self.session.post(url, files=files, data=data, timeout=self.timeout)
        r.raise_for_status()
        return r.json()  # {"job_id": ...}

    def _get_status(self, job_id: str) -> Dict[str, Any]:
        url = f"{self.base_url}/jobs/{job_id}/status"
        r = self.session.get(url, timeout=self.timeout)
        if r.status_code == 404:
            return {"status": "not_found"}
        r.raise_for_status()
        return r.json()

    def _get_result(self, job_id: str) -> Dict[str, Any]:
        url = f"{self.base_url}/jobs/{job_id}/result"
        r = self.session.get(url, timeout=self.timeout)
        if r.status_code == 404:
            return {"status": "not_found"}
        r.raise_for_status()
        return r.json()  # JobResult (status + results, as returned by the engine)

    # ---- API used by streamlit_app.py ----
    def process_video(self, video_path: str, modes: Iterable[str]) -> Dict[str, Any]:
        """Devuelve {"job_id": "..."}"""
        if self.use_mock:
            return {"job_id": "mock-123"}
        return self._post_jobs(video_path, modes)

    def get_job(self, job_id: str) -> Dict[str, Any]:
        """
        La UI espera algo del estilo:
          {"status":"done","results":{"book":{...},"une":{...}}}
        Adaptamos la respuesta de /result del engine a ese contrato.
        """
        if self.use_mock:
            # resultado inmediato de prueba
            return {
                "status": "done",
                "results": {
                    "book": {"text": "Text d'exemple (book)", "mp3_bytes": b""},
                    "une":  {"srt": "1\n00:00:00,000 --> 00:00:01,000\nExemple UNE\n", "mp3_bytes": b""},
                }
            }

        # Option 1: check the status first
        st = self._get_status(job_id)
        if st.get("status") in {"queued", "processing"}:
            return {"status": st.get("status", "queued")}

        # Option 2: fetch the final result
        res = self._get_result(job_id)

        # NEW: if the engine already returns {"status": ..., "results": {...}}, pass it through as-is
        if isinstance(res, dict) and isinstance(res.get("results"), dict):
            return {
                "status": res.get("status", st.get("status", "done")),
                "results": res.get("results", {}),
            }

        # LEGACY: old mapping based on top-level keys (book/une)
        results = {}
        if "book" in res:
            results["book"] = {
                "text": res["book"].get("text"),
            }
        if "une" in res:
            results["une"] = {
                "srt": res["une"].get("srt"),
            }
        for k in ("book", "une"):
            if k in res:
                if "characters" in res[k]:
                    results[k]["characters"] = res[k]["characters"]
                if "metrics" in res[k]:
                    results[k]["metrics"] = res[k]["metrics"]

        status = "done" if results else st.get("status", "unknown")
        return {"status": status, "results": results}


    def tts_matxa(self, text: str, voice: str = "central/grau") -> dict:
        """
        Llama al space 'tts' para sintetizar audio.
        Usa /tts/text para textos cortos (<480 chars) o /tts/text_long para textos largos.

        Args:
            text (str): Texto a sintetizar.
            voice (str): Voz de Matxa a usar (p.ej. 'central/grau').

        Returns:
            dict: {'mp3_bytes': bytes} o {'error': str}
        """
        if not self.tts_url:
            raise ValueError("The TTS service URL is not configured (API_TTS_URL)")

        print(f"🔧 tts_matxa - self.tts_url ANTES de construir URL: {self.tts_url}")
        print(f"🔧 tts_matxa - tipo self.tts_url: {type(self.tts_url)}")
        print(f"🔧 tts_matxa - repr self.tts_url: {repr(self.tts_url)}")
        
        # Pick the endpoint based on the text length
        if len(text) > 480:
            url = f"{self.tts_url.rstrip('/')}/tts/text_long"
        else:
            url = f"{self.tts_url.rstrip('/')}/tts/text"
        
        print(f"🔧 tts_matxa - URL final construida: {url}")
        print(f"🔧 tts_matxa - repr URL final: {repr(url)}")
        
        data = {
            "texto": text,
            "voice": voice,
            "formato": "mp3"
        }
        
        try:
            print(f"🎯 Llamando TTS a: {url}")
            print(f"📝 Texto length: {len(text)} caracteres")
            print(f"🗣️  Voz: {voice}")
            
            r = self.session.post(url, data=data, timeout=self.timeout * 2)  # extra time for long texts
            print(f"📊 Response status: {r.status_code}")
            
            r.raise_for_status()
            
            # Return the raw bytes so the caller can concatenate them
            print(f"✅ Audio received: {len(r.content)} bytes")
            return {"mp3_bytes": r.content}

        except requests.exceptions.RequestException as e:
            print(f"❌ Error cridant a TTS: {e}")
            print(f"❌ URL: {url}")
            print(f"❌ Data: {data}")
            # Devolvemos un diccionario con error para que la UI lo muestre
            return {"error": str(e)}


    def import_databases(self) -> dict:
        """Descarga todas las BDs del engine (/data/db) como ZIP.

        Endpoint: GET /db/download_all_db_files
        Retorna: {"zip_bytes": bytes} o {"error": str}
        """

        token = os.getenv("API_ENGINE_VEUREU", "")
        url = f"{self.base_url}/db/download_all_db_files"
        try:
            r = self.session.get(url, params={"token": token}, timeout=self.timeout * 2)
            r.raise_for_status()
            # The endpoint returns a binary ZIP
            return {"zip_bytes": r.content}
        except requests.exceptions.RequestException as e:
            print(f"[import_databases] Error: {e}")
            return {"error": str(e)}


    # --- Initial transcription (generate_initial_srt_and_info + downloads) ---

    def generate_initial_srt_and_info(self, sha1sum: str) -> dict:
        """Lanza el pipeline inicial de transcripció al engine.

        Endpoint: POST /transcription/generate_initial_srt_and_info
        Params: sha1, token (HF_TOKEN)
        """

        url = f"{self.base_url}/transcription/generate_initial_srt_and_info"
        hf_token = os.getenv("HF_TOKEN")
        params: dict[str, Any] = {"sha1": sha1sum}
        if hf_token:
            params["token"] = hf_token

        try:
            r = self.session.post(url, params=params, timeout=self.timeout * 10)
            r.raise_for_status()
            # The backend may return plain text or JSON; always wrap it in a dict
            if r.headers.get("content-type", "").startswith("application/json"):
                body = r.json()
            else:
                body = {"srt": r.text or ""}
            body.setdefault("status", "ok")
            return body
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}

    def download_initial_srt(self, sha1sum: str) -> dict:
        """Descarrega l'initial.srt generat pel pipeline inicial.

        Endpoint: GET /transcription/download_initial_srt
        """

        url = f"{self.base_url}/transcription/download_initial_srt"
        hf_token = os.getenv("HF_TOKEN")
        params: dict[str, Any] = {"sha1": sha1sum}
        if hf_token:
            params["token"] = hf_token

        try:
            r = self.session.get(url, params=params, timeout=self.timeout * 5)
            r.raise_for_status()
            # The backend returns a plain-text file (SRT)
            return {"text": r.text or ""}
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}

    def download_initial_info(self, sha1sum: str) -> dict:
        """Descarrega l'info.json inicial associat al vídeo.

        Endpoint: GET /transcription/download_initial_info
        """

        url = f"{self.base_url}/transcription/download_initial_info"
        hf_token = os.getenv("HF_TOKEN")
        params: dict[str, Any] = {"sha1": sha1sum}
        if hf_token:
            params["token"] = hf_token

        try:
            r = self.session.get(url, params=params, timeout=self.timeout * 5)
            r.raise_for_status()
            return {"text": r.text or ""}
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}


    # --- Salamandra pipeline (result.srt + free_narration.txt) ---

    def generate_salamandra_result(self, sha1sum: str) -> dict:
        """Orquestra la generació dels fitxers de sortida de Salamandra.

        Endpoint: POST /salamandra/generate_salamadra_result
        """

        url = f"{self.base_url}/salamandra/generate_salamadra_result"
        hf_token = os.getenv("HF_TOKEN")
        params: dict[str, Any] = {"sha1": sha1sum}
        if hf_token:
            params["token"] = hf_token

        try:
            r = self.session.post(url, params=params, timeout=self.timeout * 20)
            r.raise_for_status()
            return r.json() if r.headers.get("content-type", "").startswith("application/json") else {"status": "ok"}
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}

    def download_salamandra_srt(self, sha1sum: str) -> dict:
        """Descarrega el result.srt de Salamandra.

        Endpoint: GET /salamandra/download_salamadra_srt
        """

        url = f"{self.base_url}/salamandra/download_salamadra_srt"
        hf_token = os.getenv("HF_TOKEN")
        params: dict[str, Any] = {"sha1": sha1sum}
        if hf_token:
            params["token"] = hf_token

        try:
            r = self.session.get(url, params=params, timeout=self.timeout * 5)
            r.raise_for_status()
            return {"text": r.text or ""}
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}

    def download_salamandra_free_narration(self, sha1sum: str) -> dict:
        """Descarrega el free_narration.txt de Salamandra.

        Endpoint: GET /salamandra/download_salamadra_free_narration
        """

        url = f"{self.base_url}/salamandra/download_salamadra_free_narration"
        hf_token = os.getenv("HF_TOKEN")
        params: dict[str, Any] = {"sha1": sha1sum}
        if hf_token:
            params["token"] = hf_token

        try:
            r = self.session.get(url, params=params, timeout=self.timeout * 5)
            r.raise_for_status()
            return {"text": r.text or ""}
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}


    # --- MoE pipeline (result.srt + free_narration.txt) ---

    def generate_moe_result(self, sha1sum: str) -> dict:
        """Orquestra la generació dels fitxers de sortida de MoE.

        Endpoint: POST /moe/generate_moe_result
        """

        url = f"{self.base_url}/moe/generate_moe_result"
        hf_token = os.getenv("HF_TOKEN")
        params: dict[str, Any] = {"sha1": sha1sum}
        if hf_token:
            params["token"] = hf_token

        try:
            r = self.session.post(url, params=params, timeout=self.timeout * 20)
            r.raise_for_status()
            return r.json() if r.headers.get("content-type", "").startswith("application/json") else {"status": "ok"}
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}

    def download_moe_srt(self, sha1sum: str) -> dict:
        """Descarrega el result.srt de MoE.

        Endpoint: GET /moe/download_moe_srt
        """

        url = f"{self.base_url}/moe/download_moe_srt"
        hf_token = os.getenv("HF_TOKEN")
        params: dict[str, Any] = {"sha1": sha1sum}
        if hf_token:
            params["token"] = hf_token

        try:
            r = self.session.get(url, params=params, timeout=self.timeout * 5)
            r.raise_for_status()
            return {"text": r.text or ""}
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}

    def download_moe_free_narration(self, sha1sum: str) -> dict:
        """Descarrega el free_narration.txt de MoE.

        Endpoint: GET /moe/download_moe_free_narration
        """

        url = f"{self.base_url}/moe/download_moe_free_narration"
        hf_token = os.getenv("HF_TOKEN")
        params: dict[str, Any] = {"sha1": sha1sum}
        if hf_token:
            params["token"] = hf_token

        try:
            r = self.session.get(url, params=params, timeout=self.timeout * 5)
            r.raise_for_status()
            return {"text": r.text or ""}
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}


    def upload_embeddings(self, video_hash: str, embeddings_json: dict) -> dict:
        """Puja el JSON de càsting (faces+voices) com a embeddings al backend engine.

        Utilitza l'endpoint /embeddings/upload_embeddings per als dos tipus ('faces' i 'voices').
        """

        url = f"{self.base_url}/embeddings/upload_embeddings"
        hf_token = os.getenv("HF_TOKEN")

        # Serialize the full casting JSON once
        try:
            payload_bytes = json.dumps(embeddings_json, ensure_ascii=False).encode("utf-8")
        except Exception as e:
            return {"error": f"Error serialitzant embeddings_json: {e}"}

        results: dict[str, Any] = {}

        for embedding_type in ("faces", "voices"):
            params = {
                "embedding_type": embedding_type,
                "video_hash": video_hash,
            }
            if hf_token:
                params["token"] = hf_token

            files = {
                "file": ("embeddings.json", payload_bytes, "application/json"),
            }

            try:
                r = self.session.post(url, params=params, files=files, timeout=self.timeout * 2)
                r.raise_for_status()
                results[embedding_type] = r.json() if r.headers.get("content-type", "").startswith("application/json") else {"status": "ok"}
            except requests.exceptions.RequestException as e:
                results[embedding_type] = {"error": str(e)}

        return results


    def import_media(self, sha1sum: str) -> dict:
        url = f"{self.base_url}/import_media/{sha1sum}"
        try:
            r = self.session.get(url, timeout=self.timeout * 5)
            r.raise_for_status()
            return {"zip_bytes": r.content}
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}


    def import_media_version(self, sha1sum: str, version: str) -> dict:
        url = f"{self.base_url}/import_media_version/{sha1sum}/{version}"
        try:
            r = self.session.get(url, timeout=self.timeout * 5)
            r.raise_for_status()
            return {"zip_bytes": r.content}
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}


    # ---- Pending videos ----

    def upload_pending_video(self, video_bytes: bytes, filename: str) -> dict:
        """Sube un vídeo pendiente al engine (carpeta /data/peding_videos).

        Usa el endpoint POST /peding_videos/upload_pending_video.
        """

        url = f"{self.base_url}/pending_videos/upload_pending_video"
        files = {"video": (filename, io.BytesIO(video_bytes), "video/mp4")}
        # The engine backend requires a query token (?token=...) validated against HF_TOKEN
        hf_token = os.getenv("HF_TOKEN")
        params = {"token": hf_token} if hf_token else {}
        try:
            r = self.session.post(url, params=params, files=files, timeout=self.timeout * 5)
            r.raise_for_status()
            return r.json() if r.headers.get("content-type", "").startswith("application/json") else {"status": "ok"}
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}


    def list_pending_videos(self) -> dict:
        """Llista els vídeos pendents al backend (endpoint GET /peding_videos/list_peding_videos)."""

        url = f"{self.base_url}/pending_videos/list_pending_videos"
        hf_token = os.getenv("HF_TOKEN")
        params = {"token": hf_token} if hf_token else {}
        try:
            r = self.session.get(url, params=params, timeout=self.timeout * 5)
            r.raise_for_status()
            return r.json()
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}


    def download_pending_video(self, sha1sum: str) -> dict:
        """Descarrega un vídeo pendent per sha1 (GET /peding_videos/download_peding_video)."""

        url = f"{self.base_url}/pending_videos/download_pending_video"
        hf_token = os.getenv("HF_TOKEN")
        params = {"sha1": sha1sum}
        if hf_token:
            params["token"] = hf_token
        try:
            r = self.session.get(url, params=params, timeout=self.timeout * 5)
            r.raise_for_status()
            return {"video_bytes": r.content}
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}


    def update_databases(self, payload: dict) -> dict:
        """Envia les sentències SQL generades a l'endpoint /update_databases."""

        url = f"{self.base_url}/update_databases"
        try:
            r = self.session.post(url, json=payload, timeout=self.timeout * 5)
            r.raise_for_status()
            return r.json()
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}


    def export_media(self, zip_bytes: bytes) -> dict:
        """Envia un ZIP amb els nous vídeos a l'endpoint /export_media."""

        url = f"{self.base_url}/export_media"
        files = {"media_zip": ("media_export.zip", zip_bytes, "application/zip")}
        try:
            r = self.session.post(url, files=files, timeout=self.timeout * 10)
            r.raise_for_status()
            return r.json()
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}

    def generate_audiodescription(self, video_bytes: bytes, video_name: str) -> dict:
        """Llama al endpoint del engine /generate_audiodescription con un MP4 en memoria."""
        url = f"{self.base_url}/generate_audiodescription"
        try:
            files = {
                "video": (video_name or "video.mp4", video_bytes, "video/mp4")
            }
            r = self.session.post(url, files=files, timeout=self.timeout * 10)
            r.raise_for_status()
            return r.json()
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}

    def generate_salamandra_ad_from_sha1(self, sha1sum: str) -> dict:
        """Genera l'SRT d'audiodescripció (Salamandra) a partir del SHA1 del vídeo.

        Crida al endpoint /transcription/generate_srt del engine, que retorna
        directament el contingut de l'SRT com a text pla. Aquest mètode embolica
        la resposta en un dict compatible amb la UI existent:

            {"status": "done", "results": {"une_srt": "...", "free_text": ""}}
        """

        url = f"{self.base_url}/transcription/generate_srt"
        hf_token = os.getenv("HF_TOKEN")
        params: dict[str, Any] = {"sha1": sha1sum}
        if hf_token:
            params["token"] = hf_token

        try:
            r = self.session.post(url, params=params, timeout=self.timeout * 10)
            r.raise_for_status()
            srt_text = r.text or ""
            return {
                "status": "done",
                "results": {
                    "une_srt": srt_text,
                    "free_text": "",
                },
            }
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}

    def finalize_casting(self, payload: dict) -> dict:
        """Envía el càsting definitiu al engine para consolidar identidades e indexar."""
        url = f"{self.base_url}/finalize_casting"
        try:
            r = self.session.post(url, json=payload, timeout=self.timeout * 5)
            r.raise_for_status()
            return r.json()
        except requests.exceptions.HTTPError as e:
            resp = e.response
            try:
                # Try to include JSON error if present
                return {"error": str(e), "status_code": resp.status_code if resp is not None else None, "body": resp.json() if resp is not None else None}
            except Exception:
                # Fallback to text body
                return {"error": str(e), "status_code": resp.status_code if resp is not None else None, "body": (resp.text if resp is not None else None)}
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}

    def load_casting(self, faces_dir: str, voices_dir: str, db_dir: str, drop_collections: bool = False) -> dict:
        """Carga índices de caras y voces al motor de búsqueda Chroma del engine."""
        url = f"{self.base_url}/load_casting"
        data = {
            "faces_dir": faces_dir,
            "voices_dir": voices_dir,
            "db_dir": db_dir,
            "drop_collections": str(1 if drop_collections else 0),
        }
        try:
            r = self.session.post(url, data=data, timeout=self.timeout * 5)
            r.raise_for_status()
            return r.json()
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}

    def rebuild_video_with_ad(self, video_path: str, srt_path: str, voice: str = "central/grau") -> dict:
        """
        Llama al space 'tts' para reconstruir un vídeo con audiodescripció a partir de un SRT.
        Usa el endpoint /tts/srt que devuelve un ZIP con el vídeo final.
        
        Args:
            video_path: Ruta al archivo de vídeo original
            srt_path: Ruta al archivo SRT con las audiodescripciones
            voice: Voz de Matxa (por defecto 'central/grau')
        
        Returns:
            dict: {'video_bytes': bytes} o {'error': str}
        """
        if not self.tts_url:
            raise ValueError("The TTS service URL is not configured (API_TTS_URL)")

        url = f"{self.tts_url.rstrip('/')}/tts/srt"
        
        print(f"🎬 Reconstruyendo video con AD")
        print(f"🎯 URL TTS: {url}")
        print(f"📹 Video: {video_path}")
        print(f"📝 SRT: {srt_path}")
        print(f"🗣️  Voz: {voice}")
        
        try:
            with open(video_path, 'rb') as video_file:
                with open(srt_path, 'rb') as srt_file:
                    files = {
                        'video': (os.path.basename(video_path), video_file, 'video/mp4'),
                        'srt': (os.path.basename(srt_path), srt_file, 'application/x-subrip')
                    }
                    data = {
                        "voice": voice,
                        "ad_format": "mp3",
                        "include_final_mp4": "1"
                    }
                    
                    r = self.session.post(url, files=files, data=data, timeout=self.timeout * 5)
                    r.raise_for_status()
            
            # The server returns a ZIP; process it in memory
            with zipfile.ZipFile(io.BytesIO(r.content)) as z:
                # Look for the final MP4 (video_con_ad.mp4) inside the ZIP
                for filename in z.namelist():
                    if filename.endswith('.mp4'):
                        video_bytes = z.read(filename)
                        return {"video_bytes": video_bytes}
            
            # No MP4 found in the ZIP
            return {"error": "No MP4 video file was found in the server response."}

        except requests.exceptions.RequestException as e:
            print(f"Error cridant a la reconstrucció de vídeo: {e}")
            return {"error": str(e)}
        except zipfile.BadZipFile:
            return {"error": "La respuesta del servidor no fue un archivo ZIP válido."}
        except Exception as e:
            print(f"Error inesperat: {e}")
            return {"error": str(e)}


    def apply_refinement(
        self,
        *,
        sha1sum: str | None = None,
        version: str | None = None,
        srt_content: str | None = None,
        reflection_enabled: bool = True,
        reflexion_enabled: bool = False,
        introspection_enabled: bool = False,
    ) -> dict:
        """Aplica el pipeline de refinement multi-agent sobre un SRT.

        Endpoint: POST /refinement/apply_refinement

        Pot treballar de dues maneres:
          - Passant sha1sum+version perquè el backend llegeixi l'SRT de les BDs
          - Passant srt_content explícitament
        """

        url = f"{self.base_url}/refinement/apply_refinement"
        hf_token = os.getenv("HF_TOKEN")

        payload: dict[str, Any] = {
            "reflection_enabled": bool(reflection_enabled),
            "reflexion_enabled": bool(reflexion_enabled),
            "introspection_enabled": bool(introspection_enabled),
        }

        if sha1sum is not None:
            payload["sha1sum"] = sha1sum
        if version is not None:
            payload["version"] = version
        if srt_content is not None:
            payload["srt_content"] = srt_content
        if hf_token:
            payload["token"] = hf_token

        try:
            r = self.session.post(url, json=payload, timeout=self.timeout * 10)
            r.raise_for_status()
            return r.json() if r.headers.get("content-type", "").startswith("application/json") else {"status": "ok"}
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}


    def refine_narration(self, dialogues_srt: str, frame_descriptions_json: str = "[]", config_path: str = "config.yaml") -> dict:
        """Llama al endpoint del engine /refine_narration para generar narrativa y/o SRT."""
        url = f"{self.base_url}/refine_narration"
        data = {
            "dialogues_srt": dialogues_srt,
            "frame_descriptions_json": frame_descriptions_json,
            "config_path": config_path,
        }
        try:
            r = self.session.post(url, data=data, timeout=self.timeout)
            r.raise_for_status()
            return r.json()
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}


    def create_initial_casting(self, video_path: str = None, video_bytes: bytes = None, video_name: str = None,
                               face_max_groups: int = 3, face_min_cluster_size: int = 3, face_sensitivity: float = 0.5,
                               voice_max_groups: int = 3, voice_min_cluster_size: int = 3, voice_sensitivity: float = 0.5,
                               max_frames: int = 100) -> dict:
        """
        Llama al endpoint del space 'engine' para crear el 'initial casting'.

        Envía el vídeo recién importado como archivo y los parámetros de clustering.
        
        Args:
            video_path: Path to video file (if reading from disk)
            video_bytes: Video file bytes (if already in memory)
            video_name: Name for the video file
            face_max_groups: Max number of face clusters (hierarchical)
            face_min_cluster_size: Minimum face cluster size
            voice_max_groups: Max number of voice clusters (hierarchical)
            voice_min_cluster_size: Minimum voice cluster size
            max_frames: Maximum number of frames to process
        """
        url = f"{self.base_url}/create_initial_casting"
        try:
            # Prepare file data
            if video_bytes:
                filename = video_name or "video.mp4"
                files = {
                    "video": (filename, video_bytes, "video/mp4"),
                }
            elif video_path:
                with open(video_path, "rb") as f:
                    files = {
                        "video": (os.path.basename(video_path), f.read(), "video/mp4"),
                    }
            else:
                return {"error": "Either video_path or video_bytes must be provided"}
            
            data = {
                "max_groups": str(face_max_groups),
                "min_cluster_size": str(face_min_cluster_size),
                "face_sensitivity": str(face_sensitivity),
                "voice_max_groups": str(voice_max_groups),
                "voice_min_cluster_size": str(voice_min_cluster_size),
                "voice_sensitivity": str(voice_sensitivity),
                "max_frames": str(max_frames),
            }
            r = self.session.post(url, files=files, data=data, timeout=self.timeout * 5)
            r.raise_for_status()
            return r.json() if r.headers.get("content-type", "").startswith("application/json") else {"ok": True}
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}
        except Exception as e:
            return {"error": f"Unexpected error: {str(e)}"}

    def detect_scenes(self, video_path: str = None, video_bytes: bytes = None, video_name: str = None,
                      max_groups: int = 3, min_cluster_size: int = 3, scene_sensitivity: float = 0.5, frame_interval_sec: float = 0.5) -> dict:
        """
        Call engine /detect_scenes to compute scene clusters using hierarchical clustering on color histograms.
        """
        url = f"{self.base_url}/detect_scenes"
        try:
            if video_bytes:
                filename = video_name or "video.mp4"
                files = {
                    "video": (filename, video_bytes, "video/mp4"),
                }
            elif video_path:
                with open(video_path, "rb") as f:
                    files = {
                        "video": (os.path.basename(video_path), f.read(), "video/mp4"),
                    }
            else:
                return {"error": "Either video_path or video_bytes must be provided"}

            data = {
                "max_groups": str(max_groups),
                "min_cluster_size": str(min_cluster_size),
                "scene_sensitivity": str(scene_sensitivity),
                "frame_interval_sec": str(frame_interval_sec),
            }
            r = self.session.post(url, files=files, data=data, timeout=self.timeout * 5)
            r.raise_for_status()
            return r.json()
        except requests.exceptions.RequestException as e:
            return {"error": str(e)}

    def generate_audio_from_text_file(self, text_content: str, voice: str = "central/grau") -> dict:
        """
        Genera un único MP3 a partir de un texto largo, usando el endpoint de SRT.
        1. Convierte el texto en un SRT falso.
        2. Llama a /tts/srt con el SRT.
        3. Extrae el 'ad_master.mp3' del ZIP resultante.
        """
        if not self.tts_url:
            raise ValueError("The TTS service URL is not configured (API_TTS_URL)")

        # 1. Build a fake SRT in memory
        srt_content = ""
        start_time = 0
        for i, line in enumerate(text_content.strip().split('\n')):
            line = line.strip()
            if not line:
                continue
            # Allocate a flat 5 seconds per line
            end_time = start_time + 5
            
            def format_time(seconds):
                h = int(seconds / 3600)
                m = int((seconds % 3600) / 60)
                s = int(seconds % 60)
                ms = int((seconds - int(seconds)) * 1000)
                return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

            srt_content += f"{i+1}\n"
            srt_content += f"{format_time(start_time)} --> {format_time(end_time)}\n"
            srt_content += f"{line}\n\n"
            start_time = end_time

        if not srt_content:
            return {"error": "El texto proporcionado estaba vacío o no se pudo procesar."}

        # 2. Call the /tts/srt endpoint
        url = f"{self.tts_url.rstrip('/')}/tts/srt"
        try:
            files = {
                'srt': ('fake_ad.srt', srt_content, 'application/x-subrip')
            }
            data = {"voice": voice, "ad_format": "mp3"}
            
            r = requests.post(url, files=files, data=data, timeout=self.timeout * 5)
            r.raise_for_status()

            # 3. Extract 'ad_master.mp3' from the ZIP
            with zipfile.ZipFile(io.BytesIO(r.content)) as z:
                for filename in z.namelist():
                    if filename == 'ad_master.mp3':
                        mp3_bytes = z.read(filename)
                        return {"mp3_bytes": mp3_bytes}
            
            return {"error": "No se encontró 'ad_master.mp3' en la respuesta del servidor."}

        except requests.exceptions.RequestException as e:
            return {"error": f"Error llamando a la API de SRT: {e}"}
        except zipfile.BadZipFile:
            return {"error": "La respuesta del servidor no fue un archivo ZIP válido."}


    def tts_long_text(self, text: str, voice: str = "central/grau") -> dict:
        """
        Llama al endpoint '/tts/text_long' para sintetizar un texto largo.
        La API se encarga de todo el procesamiento.
        """
        if not self.tts_url:
            raise ValueError("The TTS service URL is not configured (API_TTS_URL)")

        url = f"{self.tts_url.rstrip('/')}/tts/text_long"
        data = {
            "texto": text,
            "voice": voice,
            "formato": "mp3"
        }
        
        try:
            # Use a longer timeout in case the text is very long
            r = requests.post(url, data=data, timeout=self.timeout * 10)
            r.raise_for_status()
            return {"mp3_bytes": r.content}

        except requests.exceptions.RequestException as e:
            print(f"Error cridant a TTS per a text llarg: {e}")
            return {"error": str(e)}


# ===========================
# Client for the SVision Space
# ===========================

# Common Catalan names to assign to characters (must match app.py)
def get_catalan_names():
    """Retorna llistes de noms catalans."""
    noms_home = ["Jordi", "Marc", "Pau", "Pere", "Joan", "Josep", "David", "Àlex", "Guillem", "Albert",
                 "Arnau", "Martí", "Bernat", "Oriol", "Roger", "Pol", "Lluís", "Sergi", "Carles", "Xavier"]
    noms_dona = ["Maria", "Anna", "Laura", "Marta", "Cristina", "Núria", "Montserrat", "Júlia", "Sara", "Carla",
                 "Alba", "Elisabet", "Rosa", "Gemma", "Sílvia", "Teresa", "Irene", "Laia", "Marina", "Bet"]
    return noms_home, noms_dona

def describe_image_with_svision(image_path: str, is_face: bool = True) -> Tuple[str, str]:
    """
    Llama al space svision para describir una imagen (cara o escena).
    
    Args:
        image_path: Ruta absoluta a la imagen
        is_face: True si es una cara, False si es una escena
    
    Returns:
        tuple (descripción_completa, nombre_abreviado)
    """
    try:
        from gradio_client import Client, handle_file
        
        # Connect to the svision Space, allowing for ZeroGPU cold starts
        svision_url = os.getenv("SVISION_URL", "https://veureu-svision.hf.space")
        print(f"[svision] Connecting to {svision_url}...")

        # The gradio_client version on the Space does not accept an hf_token argument in the constructor.
        # We rely on the Space's environment configuration for authentication (if needed).
        client = Client(svision_url)
        print("[svision] Client created (no explicit hf_token)")
        
        # Build the prompt for the given type
        if is_face:
            prompt = "Descriu aquesta persona. Inclou: edat aproximada (jove/adult), gènere, característiques físiques notables (ulleres, barba, bigoti, etc.), expressió i vestimenta."
        else:
            prompt = "Descriu aquesta escena breument en 2-3 frases: tipus de localització i elements principals."
        
        print(f"[svision] Enviant petició (pot trigar si ZeroGPU està en cold start)...")
        print(f"[svision] Image path: {image_path}")
        
        import time
        start_time = time.time()
        max_tokens = 256 if is_face else 128
        max_attempts = int(os.getenv("SVISION_MAX_ATTEMPTS", "5"))
        wait_seconds = int(os.getenv("SVISION_RETRY_WAIT", "5"))
        result = None
        last_error: Exception | None = None

        for attempt in range(1, max_attempts + 1):
            try:
                print(f"[svision] Attempt {attempt}/{max_attempts} (wait={wait_seconds}s)")
                result = client.predict(
                    handle_file(image_path),
                    prompt,
                    max_tokens,
                    0.7,
                    api_name="/describe"
                )
                if result and isinstance(result, str) and result.strip():
                    break
                raise RuntimeError("Resposta buida de svision")
            except Exception as exc:
                last_error = exc
                print(f"[svision] Error attempt {attempt}/{max_attempts}: {exc}")
                if attempt == max_attempts:
                    raise
                time.sleep(wait_seconds)
                wait_seconds = min(wait_seconds * 2, 40)

        elapsed = time.time() - start_time
        print(f"[svision] Resposta rebuda en {elapsed:.1f}s")
        
        full_description = result.strip() if result else ""
        
        # STEP 1: Strip the original prompt, which may be echoed back in the response
        prompt_markers = [
            "Descriu aquesta persona. Inclou: edat aproximada (jove/adult), gènere, característiques físiques notables (ulleres, barba, bigoti, etc.), expressió i vestimenta.",
            "Descriu aquesta escena. Inclou: tipus de localització (interior/exterior), elements principals, ambient, il·luminació.",
            "Descriu aquesta escena breument en 2-3 frases: tipus de localització i elements principals.",
            "Descriu aquesta persona.",
            "Descriu aquesta escena.",
        ]
        
        for marker in prompt_markers:
            if marker in full_description:
                # Remove the prompt and anything before it
                parts = full_description.split(marker, 1)
                if len(parts) > 1:
                    full_description = parts[1].strip()
        
        # STEP 2: Strip unwanted prefixes more aggressively
        # List of common prefixes that show up
        prefixes_to_remove = [
            "user:", "user ", "user\n", "user\t",
            "assistant:", "assistant ", "assistant\n", "assistant\t",
            "User:", "User ", "User\n",
            "Assistant:", "Assistant ", "Assistant\n",
            "system:", "system ",
        ]
        
        # Clean repeatedly in case several prefixes are stacked
        for _ in range(5):  # at most 5 iterations
            original = full_description
            for prefix in prefixes_to_remove:
                if full_description.lower().startswith(prefix.lower()):
                    full_description = full_description[len(prefix):].strip()
                    break
            if original == full_description:
                break  # no changes, stop
        
        # STEP 3: Strip leading whitespace and newlines
        full_description = full_description.lstrip()
        
        # STEP 4: Drop any remaining leading newlines or tabs
        while full_description and full_description[0] in ['\n', '\t', '\r', ' ']:
            full_description = full_description[1:]
        
        if not full_description:
            return ("", "")
        
        # Generate a random Catalan name for faces
        if is_face:
            # Extract key traits used to pick the name
            desc_lower = full_description.lower()
            
            # Determine gender
            is_female = any(word in desc_lower for word in ["dona", "noia", "nena", "femení", "femenina"])
            
            # Pick a random but consistent name (hash of the image path)
            import hashlib
            hash_val = int(hashlib.md5(image_path.encode()).hexdigest(), 16)
            
            noms_home, noms_dona = get_catalan_names()
            if is_female:
                name_list = noms_dona
            else:
                name_list = noms_home
            
            # Use the hash for a consistent selection
            short_name = name_list[hash_val % len(name_list)]
        else:
            # For scenes, take the first few key words
            words = full_description.split()[:4]
            short_name = " ".join(words).capitalize()
        
        print(f"[svision] Descripció generada: {full_description[:100]}...")
        print(f"[svision] Nom: {short_name}")
        
        return (full_description, short_name)
        
    except Exception as e:
        print(f"[svision] Error al descriure imatge: {e}")
        import traceback
        traceback.print_exc()
        return ("", "")


def generate_short_scene_name(description: str) -> str:
    """
    Genera un nombre corto de escena (< 3 palabras) basándose en la descripción
    usando el modelo schat (Salamandra-Instruct).
    
    Args:
        description: Descripción completa de la escena de svision
    
    Returns:
        Nombre corto de la escena (< 3 palabras) o string vacío si falla
    """
    try:
        # Import gradio_client
        from gradio_client import Client, handle_file
        
        # schat Space URL
        schat_url = os.getenv("SCHAT_URL", "https://veureu-schat.hf.space")
        print(f"[schat] Connectant a {schat_url}...")

        # The gradio_client version on the Space does not accept an hf_token argument.
        # We rely on the environment configuration for authentication if needed.
        client = Client(schat_url)
        print("[schat] Client created (no explicit hf_token)")
        
        # Build the prompt
        prompt = f"Basant-te en aquesta descripció d'una escena, genera un nom curt de menys de 3 paraules que la resumeixi:\n\n{description}\n\nNom de l'escena:"
        
        print(f"[schat] Generant nom curt per descripció: {description[:100]}...")
        
        # Call schat's /predict endpoint
        # Typical parameters: message, history, max_new_tokens, temperature, top_p, top_k, repetition_penalty
        result = client.predict(
            prompt,  # message
            [],  # empty history
            256,  # max_new_tokens
            0.7,  # temperature
            0.9,  # top_p
            50,  # top_k
            1.0,  # repetition_penalty
            api_name="/predict"
        )
        
        # The result is a tuple (response, history)
        if isinstance(result, tuple) and len(result) >= 1:
            short_name = result[0].strip() if result[0] else ""
        elif isinstance(result, str):
            short_name = result.strip()
        else:
            short_name = ""
        
        # Strip stray quotes or punctuation
        short_name = short_name.strip('"\'.,!?').strip()
        
        # Strip unwanted prefixes
        prefixes_to_remove = [
            "Nom de l'escena:",
            "nom de l'escena:",
            "Escena:",
            "escena:",
        ]
        
        for prefix in prefixes_to_remove:
            if short_name.lower().startswith(prefix.lower()):
                short_name = short_name[len(prefix):].strip()
        
        # Limit to 3 words
        words = short_name.split()
        if len(words) > 3:
            short_name = " ".join(words[:3])
        
        print(f"[schat] Nom curt generat: {short_name}")
        
        return short_name
        
    except Exception as e:
        print(f"[schat] Error al generar nom curt: {e}")
        import traceback
        traceback.print_exc()
        return ""