Spaces:

BIBLETUM
/

Audio_itits

Sleeping

App Files Files Community

BIBLETUM committed on Oct 28

Commit

76c29f8

verified ·

1 Parent(s): 5c17c33

Update app.py

Browse files

Files changed (1) hide show

app.py +201 -42

app.py CHANGED Viewed

@@ -3,68 +3,227 @@ import time
 from pathlib import Path
 from typing import List, Tuple, Dict
 import numpy as np
 import pandas as pd
 import gradio as gr
 # === Utils ===
 OUTDIR = Path("outputs")
 OUTDIR.mkdir(parents=True, exist_ok=True)
 def slug(s: str) -> str:
-return "".join(c if c.isalnum() else "_" for c in s)[:80].strip("_")
 def save_wav(path: Path, sr: int, audio):
-import numpy as np
-import scipy.io.wavfile as wav
-if hasattr(audio, "detach"):
-audio = audio.detach().cpu().numpy()
-a = np.array(audio).astype(np.float32)
-a = np.squeeze(a)
-if a.ndim == 2 and a.shape[0] < a.shape[1]:
-a = a.T
-# normalize if needed
-max_abs = np.max(np.abs(a)) if a.size else 1.0
-if np.isfinite(max_abs) and max_abs > 1.0:
-a = a / max_abs
-wav.write(str(path), int(sr), a)
 # === Lazy model registry ===
 MODEL_NAMES = {
-"suno/bark-small": "bark",
-"facebook/mms-tts-rus": "mms",
-"facebook/seamless-m4t-v2-large": "seamless",
 }
 _model_cache: Dict[str, object] = {}
-_device_hint = "auto" # for pipelines; Seamless picks cpu/gpu inside
 def _load_bark():
-from transformers import pipeline
-pipe = pipeline("text-to-speech", model="suno/bark-small", device_map=_device_hint)
-# Bark иногда не имеет pad_token_id
-if getattr(pipe.model.config, "pad_token_id", None) is None:
-pipe.model.config.pad_token_id = pipe.model.config.eos_token_id
-def generate(text: str) -> Tuple[int, np.ndarray]:
-out = pipe(text)
-return int(out["sampling_rate"]), np.asarray(out["audio"], dtype=np.float32)
-return generate
-demo.launch()

 from pathlib import Path
 from typing import List, Tuple, Dict
 import numpy as np
 import pandas as pd
 import gradio as gr
 # === Utils ===
 OUTDIR = Path("outputs")
 OUTDIR.mkdir(parents=True, exist_ok=True)
 def slug(s: str) -> str:
     """Turn arbitrary text into a filename-friendly token.

     Every character for which ``str.isalnum()`` is false is replaced by
     ``_``; alphanumeric characters (non-ASCII letters and digits included)
     are kept unchanged.  The result is cut to 80 characters and then
     stripped of leading and trailing underscores.  ``None`` is treated as
     an empty string.
     """
     text = "" if s is None else s
     # One output character per input character, so slicing the list is
     # the same as slicing the joined string.
     mapped = [ch if ch.isalnum() else "_" for ch in text]
     return "".join(mapped[:80]).strip("_")
 def save_wav(path: Path, sr: int, audio):
     """Write ``audio`` to ``path`` as a float32 WAV file.

     Args:
         path: Destination file; passed to scipy as ``str(path)``.
         sr: Sampling rate, converted with ``int``.
         audio: Samples as anything ``numpy.asarray`` accepts, or an object
             exposing ``detach()`` (e.g. a torch tensor), which is detached,
             moved to CPU and converted to NumPy first.

     Singleton dimensions are squeezed away.  A 2-D array with fewer rows
     than columns is transposed before writing.  When the absolute peak is
     finite and above 1.0 the signal is divided by that peak; otherwise the
     samples are written unchanged.
     """
     import numpy as np
     import scipy.io.wavfile as wav

     if hasattr(audio, "detach"):
         audio = audio.detach().cpu().numpy()
     samples = np.squeeze(np.asarray(audio, dtype=np.float32))
     # Squeezing a single-sample clip yields a 0-d array, which
     # scipy.io.wavfile.write cannot handle; keep at least one axis.
     samples = np.atleast_1d(samples)
     if samples.ndim == 2 and samples.shape[0] < samples.shape[1]:
         samples = samples.T
     # Normalize only when the signal exceeds the [-1, 1] range.
     peak = float(np.max(np.abs(samples))) if samples.size else 1.0
     if np.isfinite(peak) and peak > 1.0:
         samples = samples / peak
     wav.write(str(path), int(sr), samples)
 # === Lazy model registry ===
 # Model id offered in the UI -> short key understood by get_generator().
 MODEL_NAMES: Dict[str, str] = {
     "suno/bark-small": "bark",
     "facebook/mms-tts-rus": "mms",
     "facebook/seamless-m4t-v2-large": "seamless",
 }

 # Generators that have already been built, keyed by the short model key.
 _model_cache: Dict[str, object] = {}

 # device_map value for the pipeline-based loaders; the Seamless loader
 # picks cpu/gpu on its own.
 _device_hint: str = "auto"
 def _load_tts_pipeline(model_id: str):
     """Build a ``generate(text)`` closure around a transformers TTS pipeline.

     Args:
         model_id: Model identifier passed to ``transformers.pipeline``.

     Returns:
         A callable mapping a text prompt to ``(sampling_rate, audio)``,
         where ``audio`` is a float32 NumPy array.
     """
     from transformers import pipeline

     pipe = pipeline("text-to-speech", model=model_id, device_map=_device_hint)
     config = pipe.model.config
     # Bark sometimes has no pad_token_id; fall back to the EOS token id.
     if getattr(config, "pad_token_id", None) is None:
         config.pad_token_id = config.eos_token_id

     def generate(text: str) -> Tuple[int, np.ndarray]:
         out = pipe(text)
         return int(out["sampling_rate"]), np.asarray(out["audio"], dtype=np.float32)

     return generate


 def _load_bark():
     """Return a text -> (sampling_rate, audio) generator for ``suno/bark-small``."""
     return _load_tts_pipeline("suno/bark-small")


 def _load_mms():
     """Return a text -> (sampling_rate, audio) generator for ``facebook/mms-tts-rus``."""
     return _load_tts_pipeline("facebook/mms-tts-rus")
+def _load_seamless():
+    import torch
+    import numpy as np
+    from transformers import AutoProcessor
+    # ВНИМАНИЕ: импорт класса модели из подмодуля transformers
+    from transformers.models.seamless_m4t_v2.modeling_seamless_m4t_v2 import (
+        SeamlessM4Tv2Model,
+    )
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    proc = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
+    model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large").to(device)
+    def generate(text: str) -> Tuple[int, np.ndarray]:
+        inputs = proc(text=text, src_lang="rus", return_tensors="pt")
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+        with torch.no_grad():
+            audio = model.generate(**inputs, tgt_lang="rus")[0]
+        audio = audio.detach().cpu().numpy().squeeze().astype(np.float32)
+        return 16000, audio  # Seamless выдаёт 16kHz
+    return generate
+def get_generator(kind: str):
+    if kind in _model_cache:
+        return _model_cache[kind]
+    if kind == "bark":
+        gen = _load_bark()
+    elif kind == "mms":
+        gen = _load_mms()
+    elif kind == "seamless":
+        gen = _load_seamless()
+    else:
+        raise ValueError(f"Unknown model kind: {kind}")
+    _model_cache[kind] = gen
+    return gen
+# === Inference ===
+DEFAULT_PROMPTS = (
+    "Привет! Это короткий тест русского TTS.\n"
+    "Сегодня мы проверяем интонации, паузы и четкость дикции.\n"
+    "Немного сложнее: числа 3.14 и 2025 читаем правильно."
+)
+def run_tts(
+    prompts_text: str,
+    split_lines: bool,
+    model_choice: str,
+) -> tuple:
+    """Main Gradio callback.
+    Returns:
+        files: list[str] — файловые пути для скачивания
+        df:    pd.DataFrame — таблица с метаданными
+        last_audio: tuple[int, np.ndarray] | None — предпросмотр последнего файла
+    """
+    text_items: List[str] = []
+    if split_lines:
+        for line in [s.strip() for s in prompts_text.splitlines()]:
+            if line:
+                text_items.append(line)
+    else:
+        text_items = [prompts_text.strip()] if prompts_text.strip() else []
+    if not text_items:
+        return [], pd.DataFrame(), None
+    kind = MODEL_NAMES[model_choice]
+    gen = get_generator(kind)
+    stamp_dir = OUTDIR / time.strftime("%Y%m%d-%H%M%S")
+    stamp_dir.mkdir(parents=True, exist_ok=True)
+    rows = []
+    file_paths: List[str] = []
+    last_audio_payload = None
+    for p in text_items:
+        t0 = time.time()
+        sr, audio = gen(p)
+        dt = time.time() - t0
+        path = stamp_dir / f"{slug(model_choice)}__{slug(p)}.wav"
+        save_wav(path, sr, audio)
+        rows.append(
+            {
+                "model": model_choice,
+                "prompt": p,
+                "file": str(path),
+                "sr": sr,
+                "gen_time_s": round(dt, 3),
+            }
+        )
+        file_paths.append(str(path))
+        last_audio_payload = (sr, audio)
+    df = pd.DataFrame(rows)
+    return file_paths, df, last_audio_payload
+# === UI ===
+description_md = (
+    """
+    Russian TTS Bench: выберите модель и введите один или несколько промптов.\
+    По умолчанию каждая строка — отдельный промпт. Результаты сохраняются в `outputs/…`.
+    **Модели:**
+    - `suno/bark-small` — небольшой мультиязычный TTS.
+    - `facebook/mms-tts-rus` — русская TTS из проекта MMS.
+    - `facebook/seamless-m4t-v2-large` — крупная модель перевода/говорения; тяжёлая для CPU.
+    ⚠️ На CPU генерация может быть очень медленной, особенно для Seamless. Для комфортной работы выберите Space с GPU.
+    """
+)
+with gr.Blocks(title="Russian TTS Bench") as demo:
+    gr.Markdown("# 🗣️ Russian TTS Bench")
+    gr.Markdown(description_md)
+    with gr.Row():
+        model_choice = gr.Dropdown(
+            label="Модель",
+            choices=list(MODEL_NAMES.keys()),
+            value="suno/bark-small",
+        )
+        split_lines = gr.Checkbox(value=True, label="Одна строка = один промпт")
+    prompts = gr.Textbox(
+        label="Промпты",
+        value=DEFAULT_PROMPTS,
+        lines=6,
+        placeholder="Каждая строка — отдельный промпт…",
+    )
+    run_btn = gr.Button("Сгенерировать", variant="primary")
+    with gr.Row():
+        files = gr.Files(label="Файлы .wav для скачивания")
+    with gr.Row():
+        df_out = gr.Dataframe(label="Таблица результатов", interactive=False)
+    with gr.Row():
+        preview = gr.Audio(label="Предпросмотр последнего семпла", autoplay=False)
+    run_btn.click(
+        fn=run_tts,
+        inputs=[prompts, split_lines, model_choice],
+        outputs=[files, df_out, preview],
+    )
+if __name__ == "__main__":
+    demo.launch()