Spaces:

FINAL-Bench
/

Darwin-TTS-1.7B-Cross

Paused

App Files Files Community

Darwin-TTS-1.7B-Cross / app.py

SeaWolf-AI

Update app.py

b3eab56 verified about 1 month ago

raw

history blame contribute delete

3.4 kB

	# app.py — Darwin-TTS v2 (undisclosed recipe + voice cloning)
	"""
	Darwin-TTS-1.7B-Cross v2 — HuggingFace Space
	- Original / Darwin toggle (recipe undisclosed)
	- Voice cloning: upload a voice sample → generate speech in that voice
	"""
	import os, io, torch, numpy as np, soundfile as sf, base64
	from pathlib import Path
	from contextlib import asynccontextmanager
	from fastapi import FastAPI, HTTPException
	from fastapi.responses import HTMLResponse, Response
	from safetensors import safe_open

	# Process-wide cache. "darwin_weights" is None until the lifespan hook fills
	# it with the checkpoint tensors, and is reset to None on shutdown.
	state = dict(darwin_weights=None)

	@asynccontextmanager
	async def lifespan(app: FastAPI):
	    """Keep the Darwin checkpoint tensors cached in ``state`` while the app runs.

	    Before the ``yield`` the ``model.safetensors`` file of
	    ``FINAL-Bench/Darwin-TTS-1.7B-Cross`` is fetched via ``hf_hub_download`` and
	    every tensor in it is read into a dict stored at ``state["darwin_weights"]``.
	    After the ``yield`` that entry is reset to ``None``.

	    Args:
	        app: The FastAPI application this lifespan is attached to (not used in
	            the body).
	    """
	    # Imported here rather than at module top, so it is only needed at startup.
	    from huggingface_hub import hf_hub_download

	    print("📦 Loading Darwin weights...")
	    checkpoint = hf_hub_download("FINAL-Bench/Darwin-TTS-1.7B-Cross", "model.safetensors")

	    # Materialise every tensor while the file handle is still open.
	    with safe_open(checkpoint, framework="pt") as reader:
	        tensors = {key: reader.get_tensor(key) for key in reader.keys()}

	    state["darwin_weights"] = tensors
	    print(f" ✅ {len(tensors)} tensors cached")

	    yield

	    # Shutdown: drop the reference to the cached tensors.
	    state["darwin_weights"] = None

	# The ASGI application; startup/shutdown work is delegated to `lifespan`.
	app = FastAPI(
	    title="Darwin-TTS-1.7B-Cross",
	    lifespan=lifespan,
	)

	@app.get("/", response_class=HTMLResponse)
	async def index():
	    """Serve the front-end page.

	    Returns:
	        The full text of ``index.html`` (path is relative to the process
	        working directory), sent as an HTML response.
	    """
	    # Decode explicitly as UTF-8: without ``encoding`` open() falls back to the
	    # locale's preferred encoding, so any non-ASCII text in the page could be
	    # garbled or raise UnicodeDecodeError depending on the host configuration.
	    with open("index.html", "r", encoding="utf-8") as f:
	        return f.read()

	def _write_reference_audio(ref_audio_b64, ref_path):
	    """Write the voice-cloning reference clip to ``ref_path``.

	    Args:
	        ref_audio_b64: Base64-encoded audio supplied by the client, or a falsy
	            value when no reference was uploaded.
	        ref_path: Destination file path (a ``.wav`` file).

	    Raises:
	        HTTPException: 400 when ``ref_audio_b64`` cannot be base64-decoded.
	    """
	    if not ref_audio_b64:
	        # No upload: fall back to a synthetic 3 s, 200 Hz sine (amplitude 0.1)
	        # sampled at 24 kHz so generate_voice_clone still gets a reference file.
	        t = np.linspace(0, 3, 72000)
	        tone = (0.1 * np.sin(2 * np.pi * 200 * t)).astype(np.float32)
	        sf.write(ref_path, tone, 24000)
	        return

	    try:
	        audio_bytes = base64.b64decode(ref_audio_b64)
	    except (ValueError, TypeError) as exc:
	        # binascii.Error (a ValueError subclass) means malformed base64;
	        # TypeError means the value is not str/bytes-like. Both are client errors.
	        raise HTTPException(
	            status_code=400,
	            detail="ref_audio must be a base64-encoded audio file",
	        ) from exc
	    with open(ref_path, "wb") as f:
	        f.write(audio_bytes)


	@app.post("/synthesize")
	async def synthesize(request: dict):
	    """Synthesize speech for the requested text and return it as a WAV file.

	    A fresh ``Qwen/Qwen3-TTS-12Hz-1.7B-Base`` model is loaded onto ``cuda:0`` in
	    bfloat16 for every call. When ``use_darwin`` is truthy and Darwin weights
	    are cached in ``state``, every model parameter whose name appears in the
	    cache is overwritten with the cached tensor before generation.

	    NOTE: nothing in this coroutine awaits, so the event loop is occupied for
	    the whole model load and generation.

	    Args:
	        request: JSON body as a dict. Recognised keys:
	            ``text`` (str, defaults to a Korean greeting),
	            ``use_darwin`` (defaults to ``True``),
	            ``ref_audio`` (optional base64-encoded reference audio).

	    Returns:
	        A ``Response`` with media type ``audio/wav``. ``X-Duration`` is the clip
	        length in seconds (one decimal); ``X-Model`` is ``"Darwin"`` when Darwin
	        weights were actually copied into the model, otherwise ``"Original"``.

	    Raises:
	        HTTPException: 400 for a non-string/blank ``text`` or undecodable
	            ``ref_audio``; 500 (with the error message as detail) for any
	            other failure.
	    """
	    # Local import: tempfile is not part of the module-level import block.
	    import tempfile

	    text = request.get("text", "안녕하세요, 저는 다윈입니다.")
	    use_darwin = request.get("use_darwin", True)
	    ref_audio_b64 = request.get("ref_audio", None)

	    # Reject unusable input before paying for a model load.
	    if not isinstance(text, str) or not text.strip():
	        raise HTTPException(status_code=400, detail="text must be a non-empty string")

	    model = None
	    ref_path = None
	    try:
	        from qwen_tts import Qwen3TTSModel

	        # Per-request temp file instead of a fixed shared path: requests cannot
	        # clobber each other's reference clip, and the uploaded voice sample is
	        # removed again in the ``finally`` below.
	        fd, ref_path = tempfile.mkstemp(prefix="darwin_ref_", suffix=".wav")
	        os.close(fd)
	        # Done before the model load so a bad upload fails fast with a 400.
	        _write_reference_audio(ref_audio_b64, ref_path)

	        model = Qwen3TTSModel.from_pretrained(
	            "Qwen/Qwen3-TTS-12Hz-1.7B-Base",
	            device_map="cuda:0", dtype=torch.bfloat16
	        )

	        darwin_weights = state["darwin_weights"]
	        applied = 0  # number of parameters overwritten with Darwin tensors
	        if use_darwin and darwin_weights:
	            with torch.no_grad():
	                for name, param in model.model.named_parameters():
	                    tensor = darwin_weights.get(name)
	                    if tensor is not None:
	                        param.copy_(tensor.to(param.device, param.dtype))
	                        applied += 1

	        # NOTE(review): ref_text is a placeholder; presumably
	        # x_vector_only_mode=True makes the transcript irrelevant. Confirm
	        # against the qwen_tts documentation.
	        wavs, sr = model.generate_voice_clone(
	            text=text, ref_audio=ref_path, ref_text="ref", x_vector_only_mode=True
	        )
	        # The first result may be a tensor (has .cpu) or already array-like.
	        wav = wavs[0].cpu().numpy() if hasattr(wavs[0], "cpu") else np.array(wavs[0])

	        buf = io.BytesIO()
	        sf.write(buf, wav, sr, format="WAV")

	        return Response(
	            content=buf.getvalue(),
	            media_type="audio/wav",
	            headers={
	                "X-Duration": f"{len(wav)/sr:.1f}",
	                # Report what was really used: "Darwin" only if at least one
	                # parameter was replaced.
	                "X-Model": "Darwin" if applied else "Original",
	            },
	        )
	    except HTTPException:
	        # Deliberate client-error responses must not be rewrapped as 500.
	        raise
	    except Exception as e:
	        raise HTTPException(status_code=500, detail=str(e)) from e
	    finally:
	        # Single cleanup path for success and failure alike.
	        if model is not None:
	            del model
	        torch.cuda.empty_cache()
	        if ref_path is not None:
	            try:
	                os.remove(ref_path)
	            except OSError:
	                # Best effort: a leftover temp file must not mask the real
	                # response or error.
	                pass

	@app.get("/health")
	async def health():
	    """Report service status.

	    Returns:
	        A dict with ``status`` (always ``"ok"``), ``cuda`` (result of
	        ``torch.cuda.is_available()``) and ``darwin_loaded`` (whether
	        ``state["darwin_weights"]`` is currently set).
	    """
	    cuda_ready = torch.cuda.is_available()
	    weights_cached = state["darwin_weights"] is not None
	    return dict(status="ok", cuda=cuda_ready, darwin_loaded=weights_cached)

	if __name__ == "__main__":
	    # Direct execution: serve the app with uvicorn on all interfaces, port 7860.
	    import uvicorn

	    server_options = {"host": "0.0.0.0", "port": 7860}
	    uvicorn.run(app, **server_options)