Spaces:

FINAL-Bench
/

Darwin-TTS-1.7B-Cross

Paused

App Files Community

SeaWolf-AI committed on Apr 15

Commit

b3eab56

verified ·

1 Parent(s): 095807b

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -7

app.py CHANGED Viewed

@@ -1,8 +1,10 @@
-# app.py — v2 (레시피 비공개, speech_tokenizer 해결)
 """
 Darwin-TTS-1.7B-Cross v2 — HuggingFace Space
 """
-import os, io, torch, numpy as np, soundfile as sf
 from pathlib import Path
 from contextlib import asynccontextmanager
 from fastapi import FastAPI, HTTPException
@@ -13,7 +15,6 @@ state = {"darwin_weights": None}
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    # Darwin 가중치 사전 로드
     from huggingface_hub import hf_hub_download
     print("📦 Loading Darwin weights...")
     path = hf_hub_download("FINAL-Bench/Darwin-TTS-1.7B-Cross", "model.safetensors")
@@ -37,18 +38,17 @@ async def index():
 async def synthesize(request: dict):
     text = request.get("text", "안녕하세요, 저는 다윈입니다.")
     use_darwin = request.get("use_darwin", True)
     model = None
     try:
         from qwen_tts import Qwen3TTSModel
-        # 항상 원본에서 로드 (speech_tokenizer 포함)
         model = Qwen3TTSModel.from_pretrained(
             "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
             device_map="cuda:0", dtype=torch.bfloat16
         )
-        # Darwin 모드: 사전 블렌딩된 가중치로 교체
         if use_darwin and state["darwin_weights"]:
             cnt = 0
             for n, p in model.model.named_parameters():
@@ -56,10 +56,15 @@ async def synthesize(request: dict):
                     with torch.no_grad():
                         p.copy_(state["darwin_weights"][n].to(p.device, p.dtype))
                     cnt += 1
-            print(f"  Darwin weights applied: {cnt} tensors")
         ref_path = "/tmp/darwin_ref.wav"
-        sf.write(ref_path, (0.1 * np.sin(2 * np.pi * 200 * np.linspace(0, 3, 72000))).astype(np.float32), 24000)
         wavs, sr = model.generate_voice_clone(
             text=text, ref_audio=ref_path, ref_text="ref", x_vector_only_mode=True

+# app.py — Darwin-TTS v2 (레시피 비공개 + Voice Cloning)
 """
 Darwin-TTS-1.7B-Cross v2 — HuggingFace Space
+- Original / Darwin toggle (레시피 비공개)
+- Voice Cloning: 사용자 음성 업로드 → 그 목소리로 생성
 """
+import os, io, torch, numpy as np, soundfile as sf, base64
 from pathlib import Path
 from contextlib import asynccontextmanager
 from fastapi import FastAPI, HTTPException
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     from huggingface_hub import hf_hub_download
     print("📦 Loading Darwin weights...")
     path = hf_hub_download("FINAL-Bench/Darwin-TTS-1.7B-Cross", "model.safetensors")
 async def synthesize(request: dict):
     text = request.get("text", "안녕하세요, 저는 다윈입니다.")
     use_darwin = request.get("use_darwin", True)
+    ref_audio_b64 = request.get("ref_audio", None)
     model = None
     try:
         from qwen_tts import Qwen3TTSModel
         model = Qwen3TTSModel.from_pretrained(
             "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
             device_map="cuda:0", dtype=torch.bfloat16
         )
         if use_darwin and state["darwin_weights"]:
             cnt = 0
             for n, p in model.model.named_parameters():
                     with torch.no_grad():
                         p.copy_(state["darwin_weights"][n].to(p.device, p.dtype))
                     cnt += 1
+        # Voice Cloning: base64 오디오 → wav 파일
         ref_path = "/tmp/darwin_ref.wav"
+        if ref_audio_b64:
+            audio_bytes = base64.b64decode(ref_audio_b64)
+            with open(ref_path, "wb") as f:
+                f.write(audio_bytes)
+        else:
+            sf.write(ref_path, (0.1 * np.sin(2 * np.pi * 200 * np.linspace(0, 3, 72000))).astype(np.float32), 24000)
         wavs, sr = model.generate_voice_clone(
             text=text, ref_audio=ref_path, ref_text="ref", x_vector_only_mode=True