Spaces: Running on Zero
Upload 6 files

Files changed:
- Echox_copy_stream.py +8 -71
- app.py +0 -4
- requirements.txt +3 -48
Echox_copy_stream.py
CHANGED
@@ -12,7 +12,6 @@ import librosa
 from text_to_speech import *
 import torch.nn.functional as F
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from tts_wrapper import CosyVoice2TTS
 
 from transformers import logging as hf_logging
 hf_logging.set_verbosity_error()
@@ -37,8 +36,8 @@ def load_model(args, device):
         quantization_config=quantization_config,
         token=hf_token,
     ).eval().to(device)
-
-
+    for module in model.model.audio_tower:
+        module = module.to(device)
 
     if args.peft_model_id:
         lora_config = PeftConfig.from_pretrained(args.peft_model_id)
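The two added lines walk the Whisper audio tower and move each block onto the target device after the quantized base model is placed. Rebinding the loop variable (`module = module.to(device)`) looks like a no-op, but `nn.Module.to()` moves parameters and buffers in place and returns the same object, so the blocks really do migrate. A minimal sketch of that behavior, using a toy `ModuleList` as a stand-in for EchoX's actual audio tower:

```python
# Sketch, not EchoX code: nn.Module.to() is in-place for modules,
# so rebinding the loop variable still migrates the real blocks.
import torch
import torch.nn as nn

audio_tower = nn.ModuleList([nn.Linear(4, 4) for _ in range(3)])  # stand-in for Whisper encoder blocks
device = "cuda" if torch.cuda.is_available() else "cpu"

for module in audio_tower:
    module = module.to(device)  # returns the same (now-migrated) module

# every parameter now lives on the target device
assert all(p.device.type == device for m in audio_tower for p in m.parameters())
print("audio tower on", device)
```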
@@ -97,45 +96,10 @@ class EchoxAssistant():
         self.base_model_path = "FreedomIntelligence/EchoX-8B"
         self.peft_model_id = None
         self.audio_tower = "openai/whisper-large-v3"
-        self.cosyvoice_model_path = "iic/CosyVoice2-0.5B"
-        self.cosyvoice_ref_audio = "show_case/ref.wav"
-        self.cosyvoice_ref_text = "It's always a good idea to research and compare prices from different sources to get a more accurate idea of the average price of a used car in the United States for different years."
-
         self.args = BasicSetting()
         self.device = "cuda"
-
-
-        if self.args.cosyvoice_model_path:
-            print(f"[EchoxAssistant] Initializing CosyVoice2 TTS from {self.args.cosyvoice_model_path} ...")
-            # try:
-            self.cosyvoice_tts = CosyVoice2TTS(model_dir=self.args.cosyvoice_model_path, device=self.device)
-            # reference prompt for zero-shot voice cloning
-            self.cosyvoice_ref_audio = self.args.cosyvoice_ref_audio
-            self.cosyvoice_ref_text = self.args.cosyvoice_ref_text or ""
-
-            self.vocoder = None
-            self.voc_cfg = None
-            self.audio_executor = ThreadPoolExecutor(max_workers=2)
-            print("[EchoxAssistant] CosyVoice2 TTS ready.")
-            # except Exception as e:
-            #     print(f"[EchoxAssistant] Failed to init CosyVoice2TTS: {e}. Falling back to original vocoder.")
-            #     self.cosyvoice_tts = None
-            #     self.vocoder, self.voc_cfg = load_speech_model(self.device)
-            #     self.audio_executor = ThreadPoolExecutor(max_workers=2)
-        else:
-            print(f"backup plan")
-            self.vocoder, self.voc_cfg = load_speech_model(self.device)
-            self.cosyvoice_tts = None
-            self.cosyvoice_ref_audio = None
-            self.cosyvoice_ref_text = ""
-        self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
-        self.audio_executor = ThreadPoolExecutor(max_workers=2)
-
-        if not hasattr(self, "model"):
-            self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
-
-        # self.vocoder, self.voc_cfg= load_speech_model(self.device)
-        # self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
+        self.vocoder, self.voc_cfg= load_speech_model(self.device)
+        self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
         self.audio_executor = ThreadPoolExecutor(max_workers=2)
         # self.specAug = SpecAugmentTransform()
         # special_token
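The net effect of this hunk: instead of branching on `cosyvoice_model_path`, creating the executor in three different places, and patching missing attributes with a trailing `hasattr` check, the constructor now loads the vocoder and model unconditionally and creates one executor. A hypothetical before/after sketch (invented names, not EchoX code) of why the deleted guarded-initialization pattern is fragile:

```python
# Hypothetical illustration: attributes that only exist on some
# branches force repair steps like the deleted `hasattr` check.
class Fragile:
    def __init__(self, use_alt_backend: bool):
        if use_alt_backend:
            self.backend = "alt"            # note: self.model never set on this path
        else:
            self.model = "primary"
        if not hasattr(self, "model"):      # repair step papering over the branch
            self.model = "primary"

class Flat:
    def __init__(self):
        self.model = "primary"              # unconditional: every attribute always exists
        self.backend = None

print(Fragile(True).model, Flat().model)    # both print "primary"
```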
@@ -406,28 +370,13 @@ class EchoxAssistant():
             if should_segment:
                 segment_end_idx = check_idx + 1
                 print(f"Segmenting at step {segment_end_idx-1}, similarity={similarity_at_check:.4f}. Submitting to background audio generation.")
-                print(f"### {self.cosyvoice_tts}")
 
                 segment_hidden_states = torch.stack(
                     accumulated_hidden_states[segment_start_idx:segment_end_idx], dim=0
                 ).unsqueeze(0)
 
-
-
-                segment_text = self.tokenizer.decode(segment_token_ids, skip_special_tokens=True)
-                future = self.audio_executor.submit(
-                    lambda txt=segment_text: self.cosyvoice_tts.synthesize(
-                        text=txt,
-                        prompt_text=self.cosyvoice_ref_text,
-                        prompt_speech_path=self.cosyvoice_ref_audio,
-                        output_path=None,
-                        stream=False
-                    )
-                )
-                audio_futures.append(future)
-            else:
-                future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
-                audio_futures.append(future)
+                future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
+                audio_futures.append(future)
 
                 segment_start_idx = segment_end_idx
 
@@ -435,23 +384,11 @@ class EchoxAssistant():
             current_attention_mask = torch.ones_like(next_token)
 
         if segment_start_idx < len(accumulated_hidden_states):
+            print(f"Processing final segment from {segment_start_idx} to {len(accumulated_hidden_states)}")
             segment_hidden_states = torch.stack(
                 accumulated_hidden_states[segment_start_idx:], dim=0
             ).unsqueeze(0)
-
-            segment_token_ids = accumulated_tokens[segment_start_idx:]
-            segment_text = self.tokenizer.decode(segment_token_ids, skip_special_tokens=True)
-            future = self.audio_executor.submit(
-                lambda txt=segment_text: self.cosyvoice_tts.synthesize(
-                    text=txt,
-                    prompt_text=self.cosyvoice_ref_text,
-                    prompt_speech_path=self.cosyvoice_ref_audio,
-                    output_path=None,
-                    stream=False
-                )
-            )
-        else:
-            future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
+            future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
             audio_futures.append(future)
 
         for future in audio_futures:
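Both segmentation sites now share the same concurrency shape: when a boundary is found, the stacked hidden states go to the two-worker `ThreadPoolExecutor` so vocoder inference overlaps continued token generation, and the futures are drained in submission order so audio segments come back in the order they were produced. A self-contained sketch of that shape, with a dummy `synthesize` standing in for `_generate_audio_segment` (which takes hidden states, not text):

```python
# Sketch of the submit-then-drain pattern used above.
import time
from concurrent.futures import ThreadPoolExecutor

def synthesize(segment: str) -> bytes:
    time.sleep(0.1)                          # stand-in for vocoder inference
    return segment.encode()

executor = ThreadPoolExecutor(max_workers=2)
audio_futures = []

for segment in ["Hello there,", "this is a", "streaming reply."]:
    # submit as soon as a segment boundary is detected; later tokens
    # keep generating while audio renders in the background
    audio_futures.append(executor.submit(synthesize, segment))

for future in audio_futures:                 # drain in submission order
    print(future.result())
```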
app.py
CHANGED
@@ -3,7 +3,6 @@ import sys
 
 subprocess.check_call([sys.executable, "-m", "pip", "install", "pip==24.0"])
 subprocess.check_call([sys.executable, "-m", "pip", "install", "omegaconf==2.0.6"])
-subprocess.check_call([sys.executable, "-m", "pip", "install", "huggingface_hub==0.36.0"])
 subprocess.check_call([sys.executable, "-m", "pip", "install", "git+https://github.com/facebookresearch/[email protected]"])
 import gradio as gr
 import os
@@ -11,7 +10,6 @@ import torch
 import librosa
 import soundfile as sf
 import tempfile
-import numpy as np
 import spaces  # ZeroGPU requirement
 
 # import your modules
@@ -100,8 +98,6 @@ def process_audio_text(text, audio):
 
     if audio_data is not None:
         sr, audio_array = audio_data
-        if isinstance(audio_array, np.ndarray) and audio_array.ndim == 2 and audio_array.shape[0] == 1:
-            audio_array = audio_array.squeeze(0)
         yield (sr, audio_array), accumulated_text
     else:
         yield None, accumulated_text
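`process_audio_text` is a generator yielding `(sample_rate, audio_array)` tuples, the pattern Gradio uses to stream audio from Python; the commit drops the `(1, N)` squeeze that compensated for 2-D mono arrays, which is safest when the generator yields 1-D arrays in the first place. A minimal sketch of generator-based audio streaming under Gradio 5.x (the component wiring is illustrative, not copied from app.py):

```python
# Sketch, assuming Gradio 5.x generator streaming: yield
# (sample_rate, 1-D np.ndarray) chunks from the handler.
import numpy as np
import gradio as gr

def speak(text):
    sr = 16000
    for _ in text.split():
        t = np.linspace(0, 0.2, int(sr * 0.2), endpoint=False)
        chunk = (0.1 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)  # one beep per word
        yield (sr, chunk)                    # 1-D mono: nothing to squeeze

demo = gr.Interface(
    fn=speak,
    inputs=gr.Textbox(value="hello streaming world"),
    outputs=gr.Audio(streaming=True, autoplay=True),
)

if __name__ == "__main__":
    demo.launch()
```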
requirements.txt
CHANGED
@@ -1,54 +1,9 @@
+gradio==5.44.1
 librosa==0.10.2.post1
 numpy==1.24.4
 peft==0.5.0
 sentencepiece==0.2.0
+soundfile==0.12.1
 torch==2.2.0
 tqdm==4.66.5
-transformers==4.49.0
-funasr
-onnxruntime-gpu
-inflect
-jieba
-pypinyin
-g2p_en
-matcha-tts
-pyarrow
-pyworld
-torchcodec
---extra-index-url https://download.pytorch.org/whl/cu121
---extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ # https://github.com/microsoft/onnxruntime/issues/21684
-conformer==0.3.2
-deepspeed==0.15.1; sys_platform == 'linux'
-diffusers==0.29.0
-fastapi==0.115.6
-fastapi-cli==0.0.4
-gdown==5.1.0
-gradio==3.43.2
-grpcio==1.57.0
-grpcio-tools==1.57.0
-hydra-core==1.3.2
-HyperPyYAML==1.2.2
-inflect==7.3.1
-lightning==2.2.4
-matplotlib==3.7.5
-modelscope==1.20.0
-networkx==3.1
-omegaconf==2.3.0
-onnx==1.16.0
-onnxruntime-gpu==1.18.0; sys_platform == 'linux'
-onnxruntime==1.18.0; sys_platform == 'darwin' or sys_platform == 'win32'
-openai-whisper==20231117
-protobuf==4.25
-pyarrow==18.1.0
-pydantic==2.7.0
-pyworld==0.3.4
-rich==13.7.1
-soundfile==0.12.1
-tensorboard==2.14.0
-tensorrt-cu12==10.0.1; sys_platform == 'linux'
-tensorrt-cu12-bindings==10.0.1; sys_platform == 'linux'
-tensorrt-cu12-libs==10.0.1; sys_platform == 'linux'
-torchaudio
-uvicorn==0.30.0
-wetext==0.0.4
-wget==3.2
+transformers==4.49.0
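The dependency list shrinks from 54 entries (the CosyVoice stack: funasr, onnxruntime, matcha-tts, TensorRT, deepspeed, plus two extra index URLs) to 9 pins, and gradio moves from 3.43.2 to 5.44.1. After `pip install -r requirements.txt`, it is cheap insurance to confirm the environment actually resolved to the pins; an illustrative check, not part of the repo:

```python
# Illustrative sanity check: compare installed versions to the pins.
from importlib.metadata import version

pins = {
    "gradio": "5.44.1", "librosa": "0.10.2.post1", "numpy": "1.24.4",
    "peft": "0.5.0", "sentencepiece": "0.2.0", "soundfile": "0.12.1",
    "torch": "2.2.0", "tqdm": "4.66.5", "transformers": "4.49.0",
}

for pkg, want in pins.items():
    got = version(pkg)
    print(f"{'OK' if got == want else 'MISMATCH'} {pkg}: pinned {want}, installed {got}")
```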