Update app.py
Browse files
app.py
CHANGED
|
@@ -1,8 +1,10 @@
|
|
| 1 |
-
# app.py — v2 (레시피 비공개)
|
| 2 |
"""
|
| 3 |
Darwin-TTS-1.7B-Cross v2 — HuggingFace Space
|
|
|
|
|
|
|
| 4 |
"""
|
| 5 |
-
import os, io, torch, numpy as np, soundfile as sf
|
| 6 |
from pathlib import Path
|
| 7 |
from contextlib import asynccontextmanager
|
| 8 |
from fastapi import FastAPI, HTTPException
|
|
@@ -13,7 +15,6 @@ state = {"darwin_weights": None}
|
|
| 13 |
|
| 14 |
@asynccontextmanager
|
| 15 |
async def lifespan(app: FastAPI):
|
| 16 |
-
# Darwin 가중치 사전 로드
|
| 17 |
from huggingface_hub import hf_hub_download
|
| 18 |
print("π¦ Loading Darwin weights...")
|
| 19 |
path = hf_hub_download("FINAL-Bench/Darwin-TTS-1.7B-Cross", "model.safetensors")
|
|
@@ -37,18 +38,17 @@ async def index():
|
|
| 37 |
async def synthesize(request: dict):
|
| 38 |
text = request.get("text", "μλ
νμΈμ, μ λ λ€μμ
λλ€.")
|
| 39 |
use_darwin = request.get("use_darwin", True)
|
|
|
|
| 40 |
|
| 41 |
model = None
|
| 42 |
try:
|
| 43 |
from qwen_tts import Qwen3TTSModel
|
| 44 |
|
| 45 |
-
# 항상 원본에서 로드 (speech_tokenizer 포함)
|
| 46 |
model = Qwen3TTSModel.from_pretrained(
|
| 47 |
"Qwen/Qwen3-TTS-12Hz-1.7B-Base",
|
| 48 |
device_map="cuda:0", dtype=torch.bfloat16
|
| 49 |
)
|
| 50 |
|
| 51 |
-
# Darwin 모드: 사전 블렌딩된 가중치로 교체
|
| 52 |
if use_darwin and state["darwin_weights"]:
|
| 53 |
cnt = 0
|
| 54 |
for n, p in model.model.named_parameters():
|
|
@@ -56,10 +56,15 @@ async def synthesize(request: dict):
|
|
| 56 |
with torch.no_grad():
|
| 57 |
p.copy_(state["darwin_weights"][n].to(p.device, p.dtype))
|
| 58 |
cnt += 1
|
| 59 |
-
print(f" Darwin weights applied: {cnt} tensors")
|
| 60 |
|
|
|
|
| 61 |
ref_path = "/tmp/darwin_ref.wav"
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
wavs, sr = model.generate_voice_clone(
|
| 65 |
text=text, ref_audio=ref_path, ref_text="ref", x_vector_only_mode=True
|
|
|
|
| 1 |
+
# app.py — Darwin-TTS v2 (레시피 비공개 + Voice Cloning)
|
| 2 |
"""
|
| 3 |
Darwin-TTS-1.7B-Cross v2 — HuggingFace Space
|
| 4 |
+
- Original / Darwin toggle (레시피 비공개)
|
| 5 |
+
- Voice Cloning: 사용자 음성 업로드 → 그 목소리로 생성
|
| 6 |
"""
|
| 7 |
+
import os, io, torch, numpy as np, soundfile as sf, base64
|
| 8 |
from pathlib import Path
|
| 9 |
from contextlib import asynccontextmanager
|
| 10 |
from fastapi import FastAPI, HTTPException
|
|
|
|
| 15 |
|
| 16 |
@asynccontextmanager
|
| 17 |
async def lifespan(app: FastAPI):
|
|
|
|
| 18 |
from huggingface_hub import hf_hub_download
|
| 19 |
print("π¦ Loading Darwin weights...")
|
| 20 |
path = hf_hub_download("FINAL-Bench/Darwin-TTS-1.7B-Cross", "model.safetensors")
|
|
|
|
| 38 |
async def synthesize(request: dict):
|
| 39 |
text = request.get("text", "μλ
νμΈμ, μ λ λ€μμ
λλ€.")
|
| 40 |
use_darwin = request.get("use_darwin", True)
|
| 41 |
+
ref_audio_b64 = request.get("ref_audio", None)
|
| 42 |
|
| 43 |
model = None
|
| 44 |
try:
|
| 45 |
from qwen_tts import Qwen3TTSModel
|
| 46 |
|
|
|
|
| 47 |
model = Qwen3TTSModel.from_pretrained(
|
| 48 |
"Qwen/Qwen3-TTS-12Hz-1.7B-Base",
|
| 49 |
device_map="cuda:0", dtype=torch.bfloat16
|
| 50 |
)
|
| 51 |
|
|
|
|
| 52 |
if use_darwin and state["darwin_weights"]:
|
| 53 |
cnt = 0
|
| 54 |
for n, p in model.model.named_parameters():
|
|
|
|
| 56 |
with torch.no_grad():
|
| 57 |
p.copy_(state["darwin_weights"][n].to(p.device, p.dtype))
|
| 58 |
cnt += 1
|
|
|
|
| 59 |
|
| 60 |
+
# Voice Cloning: base64 오디오 → wav 파일
|
| 61 |
ref_path = "/tmp/darwin_ref.wav"
|
| 62 |
+
if ref_audio_b64:
|
| 63 |
+
audio_bytes = base64.b64decode(ref_audio_b64)
|
| 64 |
+
with open(ref_path, "wb") as f:
|
| 65 |
+
f.write(audio_bytes)
|
| 66 |
+
else:
|
| 67 |
+
sf.write(ref_path, (0.1 * np.sin(2 * np.pi * 200 * np.linspace(0, 3, 72000))).astype(np.float32), 24000)
|
| 68 |
|
| 69 |
wavs, sr = model.generate_voice_clone(
|
| 70 |
text=text, ref_audio=ref_path, ref_text="ref", x_vector_only_mode=True
|