Spaces: Running on Zero
Upload 6 files

Files changed:
- Echox_copy_stream.py +8 -71
- app.py +0 -4
- requirements.txt +3 -48
Echox_copy_stream.py
CHANGED
@@ -12,7 +12,6 @@ import librosa
 from text_to_speech import *
 import torch.nn.functional as F
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from tts_wrapper import CosyVoice2TTS
 
 from transformers import logging as hf_logging
 hf_logging.set_verbosity_error()
@@ -37,8 +36,8 @@ def load_model(args, device):
         quantization_config=quantization_config,
         token=hf_token,
     ).eval().to(device)
-
-
+    for module in model.model.audio_tower:
+        module = module.to(device)
 
     if args.peft_model_id:
         lora_config = PeftConfig.from_pretrained(args.peft_model_id)
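The two added lines walk the Whisper audio tower and move each block onto the target device after the quantized base model is placed. Rebinding the loop variable (`module = module.to(device)`) looks like a no-op, but `nn.Module.to()` moves parameters and buffers in place and returns the same object, so the blocks really do migrate. A minimal sketch of that behavior, using a toy `ModuleList` as a stand-in for EchoX's actual audio tower:

```python
# Sketch, not EchoX code: nn.Module.to() is in-place for modules,
# so rebinding the loop variable still migrates the real blocks.
import torch
import torch.nn as nn

audio_tower = nn.ModuleList([nn.Linear(4, 4) for _ in range(3)])  # stand-in for Whisper encoder blocks
device = "cuda" if torch.cuda.is_available() else "cpu"

for module in audio_tower:
    module = module.to(device)  # returns the same (now-migrated) module

# every parameter now lives on the target device
assert all(p.device.type == device for m in audio_tower for p in m.parameters())
print("audio tower on", device)
```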
@@ -97,45 +96,10 @@ class EchoxAssistant():
         self.base_model_path = "FreedomIntelligence/EchoX-8B"
         self.peft_model_id = None
         self.audio_tower = "openai/whisper-large-v3"
-        self.cosyvoice_model_path = "iic/CosyVoice2-0.5B"
-        self.cosyvoice_ref_audio = "show_case/ref.wav"
-        self.cosyvoice_ref_text = "It's always a good idea to research and compare prices from different sources to get a more accurate idea of the average price of a used car in the United States for different years."
-
         self.args = BasicSetting()
         self.device = "cuda"
-
-
-        if self.args.cosyvoice_model_path:
-            print(f"[EchoxAssistant] Initializing CosyVoice2 TTS from {self.args.cosyvoice_model_path} ...")
-            # try:
-            self.cosyvoice_tts = CosyVoice2TTS(model_dir=self.args.cosyvoice_model_path, device=self.device)
-            # reference prompt for zero-shot voice cloning
-            self.cosyvoice_ref_audio = self.args.cosyvoice_ref_audio
-            self.cosyvoice_ref_text = self.args.cosyvoice_ref_text or ""
-
-            self.vocoder = None
-            self.voc_cfg = None
-            self.audio_executor = ThreadPoolExecutor(max_workers=2)
-            print("[EchoxAssistant] CosyVoice2 TTS ready.")
-            # except Exception as e:
-            #     print(f"[EchoxAssistant] Failed to init CosyVoice2TTS: {e}. Falling back to original vocoder.")
-            #     self.cosyvoice_tts = None
-            #     self.vocoder, self.voc_cfg = load_speech_model(self.device)
-            #     self.audio_executor = ThreadPoolExecutor(max_workers=2)
-        else:
-            print(f"backup plan")
-            self.vocoder, self.voc_cfg = load_speech_model(self.device)
-            self.cosyvoice_tts = None
-            self.cosyvoice_ref_audio = None
-            self.cosyvoice_ref_text = ""
-        self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
-        self.audio_executor = ThreadPoolExecutor(max_workers=2)
-
-        if not hasattr(self, "model"):
-            self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
-
-        # self.vocoder, self.voc_cfg= load_speech_model(self.device)
-        # self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
+        self.vocoder, self.voc_cfg= load_speech_model(self.device)
+        self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
         self.audio_executor = ThreadPoolExecutor(max_workers=2)
         # self.specAug = SpecAugmentTransform()
         # special_token
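The net effect of this hunk: instead of branching on `cosyvoice_model_path`, creating the executor in three different places, and patching missing attributes with a trailing `hasattr` check, the constructor now loads the vocoder and model unconditionally and creates one executor. A hypothetical before/after sketch (invented names, not EchoX code) of why the deleted guarded-initialization pattern is fragile:

```python
# Hypothetical illustration: attributes that only exist on some
# branches force repair steps like the deleted `hasattr` check.
class Fragile:
    def __init__(self, use_alt_backend: bool):
        if use_alt_backend:
            self.backend = "alt"            # note: self.model never set on this path
        else:
            self.model = "primary"
        if not hasattr(self, "model"):      # repair step papering over the branch
            self.model = "primary"

class Flat:
    def __init__(self):
        self.model = "primary"              # unconditional: every attribute always exists
        self.backend = None

print(Fragile(True).model, Flat().model)    # both print "primary"
```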
@@ -406,28 +370,13 @@ class EchoxAssistant():
             if should_segment:
                 segment_end_idx = check_idx + 1
                 print(f"Segmenting at step {segment_end_idx-1}, similarity={similarity_at_check:.4f}. Submitting to background audio generation.")
-                print(f"### {self.cosyvoice_tts}")
 
                 segment_hidden_states = torch.stack(
                     accumulated_hidden_states[segment_start_idx:segment_end_idx], dim=0
                 ).unsqueeze(0)
 
-
-
-                segment_text = self.tokenizer.decode(segment_token_ids, skip_special_tokens=True)
-                future = self.audio_executor.submit(
-                    lambda txt=segment_text: self.cosyvoice_tts.synthesize(
-                        text=txt,
-                        prompt_text=self.cosyvoice_ref_text,
-                        prompt_speech_path=self.cosyvoice_ref_audio,
-                        output_path=None,
-                        stream=False
-                    )
-                )
-                audio_futures.append(future)
-            else:
-                future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
-                audio_futures.append(future)
+                future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
+                audio_futures.append(future)
 
                 segment_start_idx = segment_end_idx
 
@@ -435,23 +384,11 @@ class EchoxAssistant():
             current_attention_mask = torch.ones_like(next_token)
 
         if segment_start_idx < len(accumulated_hidden_states):
+            print(f"Processing final segment from {segment_start_idx} to {len(accumulated_hidden_states)}")
             segment_hidden_states = torch.stack(
                 accumulated_hidden_states[segment_start_idx:], dim=0
             ).unsqueeze(0)
-
-            segment_token_ids = accumulated_tokens[segment_start_idx:]
-            segment_text = self.tokenizer.decode(segment_token_ids, skip_special_tokens=True)
-            future = self.audio_executor.submit(
-                lambda txt=segment_text: self.cosyvoice_tts.synthesize(
-                    text=txt,
-                    prompt_text=self.cosyvoice_ref_text,
-                    prompt_speech_path=self.cosyvoice_ref_audio,
-                    output_path=None,
-                    stream=False
-                )
-            )
-        else:
-            future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
+            future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
             audio_futures.append(future)
 
         for future in audio_futures:
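Both segmentation sites now share the same concurrency shape: when a boundary is found, the stacked hidden states go to the two-worker `ThreadPoolExecutor` so vocoder inference overlaps continued token generation, and the futures are drained in submission order so audio segments come back in the order they were produced. A self-contained sketch of that shape, with a dummy `synthesize` standing in for `_generate_audio_segment` (which takes hidden states, not text):

```python
# Sketch of the submit-then-drain pattern used above.
import time
from concurrent.futures import ThreadPoolExecutor

def synthesize(segment: str) -> bytes:
    time.sleep(0.1)                          # stand-in for vocoder inference
    return segment.encode()

executor = ThreadPoolExecutor(max_workers=2)
audio_futures = []

for segment in ["Hello there,", "this is a", "streaming reply."]:
    # submit as soon as a segment boundary is detected; later tokens
    # keep generating while audio renders in the background
    audio_futures.append(executor.submit(synthesize, segment))

for future in audio_futures:                 # drain in submission order
    print(future.result())
```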
app.py
CHANGED
@@ -3,7 +3,6 @@ import sys
 
 subprocess.check_call([sys.executable, "-m", "pip", "install", "pip==24.0"])
 subprocess.check_call([sys.executable, "-m", "pip", "install", "omegaconf==2.0.6"])
-subprocess.check_call([sys.executable, "-m", "pip", "install", "huggingface_hub==0.36.0"])
 subprocess.check_call([sys.executable, "-m", "pip", "install", "git+https://github.com/facebookresearch/[email protected]"])
 import gradio as gr
 import os
@@ -11,7 +10,6 @@ import torch
 import librosa
 import soundfile as sf
 import tempfile
-import numpy as np
 import spaces  # ZeroGPU requirement
 
 # import your modules
@@ -100,8 +98,6 @@ def process_audio_text(text, audio):
 
     if audio_data is not None:
         sr, audio_array = audio_data
-        if isinstance(audio_array, np.ndarray) and audio_array.ndim == 2 and audio_array.shape[0] == 1:
-            audio_array = audio_array.squeeze(0)
         yield (sr, audio_array), accumulated_text
     else:
         yield None, accumulated_text
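`process_audio_text` is a generator yielding `(sample_rate, audio_array)` tuples, the pattern Gradio uses to stream audio from Python; the commit drops the `(1, N)` squeeze that compensated for 2-D mono arrays, which is safest when the generator yields 1-D arrays in the first place. A minimal sketch of generator-based audio streaming under Gradio 5.x (the component wiring is illustrative, not copied from app.py):

```python
# Sketch, assuming Gradio 5.x generator streaming: yield
# (sample_rate, 1-D np.ndarray) chunks from the handler.
import numpy as np
import gradio as gr

def speak(text):
    sr = 16000
    for _ in text.split():
        t = np.linspace(0, 0.2, int(sr * 0.2), endpoint=False)
        chunk = (0.1 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)  # one beep per word
        yield (sr, chunk)                    # 1-D mono: nothing to squeeze

demo = gr.Interface(
    fn=speak,
    inputs=gr.Textbox(value="hello streaming world"),
    outputs=gr.Audio(streaming=True, autoplay=True),
)

if __name__ == "__main__":
    demo.launch()
```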
requirements.txt
CHANGED
@@ -1,54 +1,9 @@
+gradio==5.44.1
 librosa==0.10.2.post1
 numpy==1.24.4
 peft==0.5.0
 sentencepiece==0.2.0
+soundfile==0.12.1
 torch==2.2.0
 tqdm==4.66.5
-transformers==4.49.0
-funasr
-onnxruntime-gpu
-inflect
-jieba
-pypinyin
-g2p_en
-matcha-tts
-pyarrow
-pyworld
-torchcodec
---extra-index-url https://download.pytorch.org/whl/cu121
---extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ # https://github.com/microsoft/onnxruntime/issues/21684
-conformer==0.3.2
-deepspeed==0.15.1; sys_platform == 'linux'
-diffusers==0.29.0
-fastapi==0.115.6
-fastapi-cli==0.0.4
-gdown==5.1.0
-gradio==3.43.2
-grpcio==1.57.0
-grpcio-tools==1.57.0
-hydra-core==1.3.2
-HyperPyYAML==1.2.2
-inflect==7.3.1
-lightning==2.2.4
-matplotlib==3.7.5
-modelscope==1.20.0
-networkx==3.1
-omegaconf==2.3.0
-onnx==1.16.0
-onnxruntime-gpu==1.18.0; sys_platform == 'linux'
-onnxruntime==1.18.0; sys_platform == 'darwin' or sys_platform == 'win32'
-openai-whisper==20231117
-protobuf==4.25
-pyarrow==18.1.0
-pydantic==2.7.0
-pyworld==0.3.4
-rich==13.7.1
-soundfile==0.12.1
-tensorboard==2.14.0
-tensorrt-cu12==10.0.1; sys_platform == 'linux'
-tensorrt-cu12-bindings==10.0.1; sys_platform == 'linux'
-tensorrt-cu12-libs==10.0.1; sys_platform == 'linux'
-torchaudio
-uvicorn==0.30.0
-wetext==0.0.4
-wget==3.2
+transformers==4.49.0
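The dependency list shrinks from 54 entries (the CosyVoice stack: funasr, onnxruntime, matcha-tts, TensorRT, deepspeed, plus two extra index URLs) to 9 pins, and gradio moves from 3.43.2 to 5.44.1. After `pip install -r requirements.txt`, it is cheap insurance to confirm the environment actually resolved to the pins; an illustrative check, not part of the repo:

```python
# Illustrative sanity check: compare installed versions to the pins.
from importlib.metadata import version

pins = {
    "gradio": "5.44.1", "librosa": "0.10.2.post1", "numpy": "1.24.4",
    "peft": "0.5.0", "sentencepiece": "0.2.0", "soundfile": "0.12.1",
    "torch": "2.2.0", "tqdm": "4.66.5", "transformers": "4.49.0",
}

for pkg, want in pins.items():
    got = version(pkg)
    print(f"{'OK' if got == want else 'MISMATCH'} {pkg}: pinned {want}, installed {got}")
```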