tzzte committed on
Commit
76c9d9f
·
verified ·
1 Parent(s): 97288f1

Upload 6 files

Browse files
Files changed (3) hide show
  1. Echox_copy_stream.py +8 -71
  2. app.py +0 -4
  3. requirements.txt +3 -48
Echox_copy_stream.py CHANGED
@@ -12,7 +12,6 @@ import librosa
12
  from text_to_speech import *
13
  import torch.nn.functional as F
14
  from concurrent.futures import ThreadPoolExecutor, as_completed
15
- from tts_wrapper import CosyVoice2TTS
16
 
17
  from transformers import logging as hf_logging
18
  hf_logging.set_verbosity_error()
@@ -37,8 +36,8 @@ def load_model(args, device):
37
  quantization_config=quantization_config,
38
  token=hf_token,
39
  ).eval().to(device)
40
- # for module in model.model.audio_tower:
41
- # module = module.to(device)
42
 
43
  if args.peft_model_id:
44
  lora_config = PeftConfig.from_pretrained(args.peft_model_id)
@@ -97,45 +96,10 @@ class EchoxAssistant():
97
  self.base_model_path = "FreedomIntelligence/EchoX-8B"
98
  self.peft_model_id = None
99
  self.audio_tower = "openai/whisper-large-v3"
100
- self.cosyvoice_model_path = "iic/CosyVoice2-0.5B"
101
- self.cosyvoice_ref_audio = "show_case/ref.wav"
102
- self.cosyvoice_ref_text = "It's always a good idea to research and compare prices from different sources to get a more accurate idea of the average price of a used car in the United States for different years."
103
-
104
  self.args = BasicSetting()
105
  self.device = "cuda"
106
-
107
- print(f"### {self.args.cosyvoice_model_path}")
108
- if self.args.cosyvoice_model_path:
109
- print(f"[EchoxAssistant] Initializing CosyVoice2 TTS from {self.args.cosyvoice_model_path} ...")
110
- # try:
111
- self.cosyvoice_tts = CosyVoice2TTS(model_dir=self.args.cosyvoice_model_path, device=self.device)
112
- # reference prompt for zero-shot voice cloning
113
- self.cosyvoice_ref_audio = self.args.cosyvoice_ref_audio
114
- self.cosyvoice_ref_text = self.args.cosyvoice_ref_text or ""
115
-
116
- self.vocoder = None
117
- self.voc_cfg = None
118
- self.audio_executor = ThreadPoolExecutor(max_workers=2)
119
- print("[EchoxAssistant] CosyVoice2 TTS ready.")
120
- # except Exception as e:
121
- # print(f"[EchoxAssistant] Failed to init CosyVoice2TTS: {e}. Falling back to original vocoder.")
122
- # self.cosyvoice_tts = None
123
- # self.vocoder, self.voc_cfg = load_speech_model(self.device)
124
- # self.audio_executor = ThreadPoolExecutor(max_workers=2)
125
- else:
126
- print(f"backup plan")
127
- self.vocoder, self.voc_cfg = load_speech_model(self.device)
128
- self.cosyvoice_tts = None
129
- self.cosyvoice_ref_audio = None
130
- self.cosyvoice_ref_text = ""
131
- self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
132
- self.audio_executor = ThreadPoolExecutor(max_workers=2)
133
-
134
- if not hasattr(self, "model"):
135
- self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
136
-
137
- # self.vocoder, self.voc_cfg= load_speech_model(self.device)
138
- # self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
139
  self.audio_executor = ThreadPoolExecutor(max_workers=2)
140
  # self.specAug = SpecAugmentTransform()
141
  # special_token
@@ -406,28 +370,13 @@ class EchoxAssistant():
406
  if should_segment:
407
  segment_end_idx = check_idx + 1
408
  print(f"Segmenting at step {segment_end_idx-1}, similarity={similarity_at_check:.4f}. Submitting to background audio generation.")
409
- print(f"### {self.cosyvoice_tts}")
410
 
411
  segment_hidden_states = torch.stack(
412
  accumulated_hidden_states[segment_start_idx:segment_end_idx], dim=0
413
  ).unsqueeze(0)
414
 
415
- if self.cosyvoice_tts:
416
- segment_token_ids = accumulated_tokens[segment_start_idx:segment_end_idx]
417
- segment_text = self.tokenizer.decode(segment_token_ids, skip_special_tokens=True)
418
- future = self.audio_executor.submit(
419
- lambda txt=segment_text: self.cosyvoice_tts.synthesize(
420
- text=txt,
421
- prompt_text=self.cosyvoice_ref_text,
422
- prompt_speech_path=self.cosyvoice_ref_audio,
423
- output_path=None,
424
- stream=False
425
- )
426
- )
427
- audio_futures.append(future)
428
- else:
429
- future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
430
- audio_futures.append(future)
431
 
432
  segment_start_idx = segment_end_idx
433
 
@@ -435,23 +384,11 @@ class EchoxAssistant():
435
  current_attention_mask = torch.ones_like(next_token)
436
 
437
  if segment_start_idx < len(accumulated_hidden_states):
 
438
  segment_hidden_states = torch.stack(
439
  accumulated_hidden_states[segment_start_idx:], dim=0
440
  ).unsqueeze(0)
441
- if self.cosyvoice_tts:
442
- segment_token_ids = accumulated_tokens[segment_start_idx:]
443
- segment_text = self.tokenizer.decode(segment_token_ids, skip_special_tokens=True)
444
- future = self.audio_executor.submit(
445
- lambda txt=segment_text: self.cosyvoice_tts.synthesize(
446
- text=txt,
447
- prompt_text=self.cosyvoice_ref_text,
448
- prompt_speech_path=self.cosyvoice_ref_audio,
449
- output_path=None,
450
- stream=False
451
- )
452
- )
453
- else:
454
- future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
455
  audio_futures.append(future)
456
 
457
  for future in audio_futures:
 
12
  from text_to_speech import *
13
  import torch.nn.functional as F
14
  from concurrent.futures import ThreadPoolExecutor, as_completed
 
15
 
16
  from transformers import logging as hf_logging
17
  hf_logging.set_verbosity_error()
 
36
  quantization_config=quantization_config,
37
  token=hf_token,
38
  ).eval().to(device)
39
+ for module in model.model.audio_tower:
40
+ module = module.to(device)
41
 
42
  if args.peft_model_id:
43
  lora_config = PeftConfig.from_pretrained(args.peft_model_id)
 
96
  self.base_model_path = "FreedomIntelligence/EchoX-8B"
97
  self.peft_model_id = None
98
  self.audio_tower = "openai/whisper-large-v3"
 
 
 
 
99
  self.args = BasicSetting()
100
  self.device = "cuda"
101
+ self.vocoder, self.voc_cfg= load_speech_model(self.device)
102
+ self.model, self.audio_processor, self.tokenizer, self.unit_translator = load_model(self.args, self.device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  self.audio_executor = ThreadPoolExecutor(max_workers=2)
104
  # self.specAug = SpecAugmentTransform()
105
  # special_token
 
370
  if should_segment:
371
  segment_end_idx = check_idx + 1
372
  print(f"Segmenting at step {segment_end_idx-1}, similarity={similarity_at_check:.4f}. Submitting to background audio generation.")
 
373
 
374
  segment_hidden_states = torch.stack(
375
  accumulated_hidden_states[segment_start_idx:segment_end_idx], dim=0
376
  ).unsqueeze(0)
377
 
378
+ future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
379
+ audio_futures.append(future)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
380
 
381
  segment_start_idx = segment_end_idx
382
 
 
384
  current_attention_mask = torch.ones_like(next_token)
385
 
386
  if segment_start_idx < len(accumulated_hidden_states):
387
+ print(f"Processing final segment from {segment_start_idx} to {len(accumulated_hidden_states)}")
388
  segment_hidden_states = torch.stack(
389
  accumulated_hidden_states[segment_start_idx:], dim=0
390
  ).unsqueeze(0)
391
+ future = self.audio_executor.submit(self._generate_audio_segment, segment_hidden_states)
 
 
 
 
 
 
 
 
 
 
 
 
 
392
  audio_futures.append(future)
393
 
394
  for future in audio_futures:
app.py CHANGED
@@ -3,7 +3,6 @@ import sys
3
 
4
  subprocess.check_call([sys.executable, "-m", "pip", "install", "pip==24.0"])
5
  subprocess.check_call([sys.executable, "-m", "pip", "install", "omegaconf==2.0.6"])
6
- subprocess.check_call([sys.executable, "-m", "pip", "install", "huggingface_hub==0.36.0"])
7
  subprocess.check_call([sys.executable, "-m", "pip", "install", "git+https://github.com/facebookresearch/[email protected]"])
8
  import gradio as gr
9
  import os
@@ -11,7 +10,6 @@ import torch
11
  import librosa
12
  import soundfile as sf
13
  import tempfile
14
- import numpy as np
15
  import spaces # ZeroGPU requirement
16
 
17
  # 导入你的模块
@@ -100,8 +98,6 @@ def process_audio_text(text, audio):
100
 
101
  if audio_data is not None:
102
  sr, audio_array = audio_data
103
- if isinstance(audio_array, np.ndarray) and audio_array.ndim == 2 and audio_array.shape[0] == 1:
104
- audio_array = audio_array.squeeze(0)
105
  yield (sr, audio_array), accumulated_text
106
  else:
107
  yield None, accumulated_text
 
3
 
4
  subprocess.check_call([sys.executable, "-m", "pip", "install", "pip==24.0"])
5
  subprocess.check_call([sys.executable, "-m", "pip", "install", "omegaconf==2.0.6"])
 
6
  subprocess.check_call([sys.executable, "-m", "pip", "install", "git+https://github.com/facebookresearch/[email protected]"])
7
  import gradio as gr
8
  import os
 
10
  import librosa
11
  import soundfile as sf
12
  import tempfile
 
13
  import spaces # ZeroGPU requirement
14
 
15
  # 导入你的模块
 
98
 
99
  if audio_data is not None:
100
  sr, audio_array = audio_data
 
 
101
  yield (sr, audio_array), accumulated_text
102
  else:
103
  yield None, accumulated_text
requirements.txt CHANGED
@@ -1,54 +1,9 @@
 
1
  librosa==0.10.2.post1
2
  numpy==1.24.4
3
  peft==0.5.0
4
  sentencepiece==0.2.0
 
5
  torch==2.2.0
6
  tqdm==4.66.5
7
- transformers==4.49.0
8
- funasr
9
- onnxruntime-gpu
10
- inflect
11
- jieba
12
- pypinyin
13
- g2p_en
14
- matcha-tts
15
- pyarrow
16
- pyworld
17
- torchcodec
18
- --extra-index-url https://download.pytorch.org/whl/cu121
19
- --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ # https://github.com/microsoft/onnxruntime/issues/21684
20
- conformer==0.3.2
21
- deepspeed==0.15.1; sys_platform == 'linux'
22
- diffusers==0.29.0
23
- fastapi==0.115.6
24
- fastapi-cli==0.0.4
25
- gdown==5.1.0
26
- gradio==3.43.2
27
- grpcio==1.57.0
28
- grpcio-tools==1.57.0
29
- hydra-core==1.3.2
30
- HyperPyYAML==1.2.2
31
- inflect==7.3.1
32
- lightning==2.2.4
33
- matplotlib==3.7.5
34
- modelscope==1.20.0
35
- networkx==3.1
36
- omegaconf==2.3.0
37
- onnx==1.16.0
38
- onnxruntime-gpu==1.18.0; sys_platform == 'linux'
39
- onnxruntime==1.18.0; sys_platform == 'darwin' or sys_platform == 'win32'
40
- openai-whisper==20231117
41
- protobuf==4.25
42
- pyarrow==18.1.0
43
- pydantic==2.7.0
44
- pyworld==0.3.4
45
- rich==13.7.1
46
- soundfile==0.12.1
47
- tensorboard==2.14.0
48
- tensorrt-cu12==10.0.1; sys_platform == 'linux'
49
- tensorrt-cu12-bindings==10.0.1; sys_platform == 'linux'
50
- tensorrt-cu12-libs==10.0.1; sys_platform == 'linux'
51
- torchaudio
52
- uvicorn==0.30.0
53
- wetext==0.0.4
54
- wget==3.2
 
1
+ gradio==5.44.1
2
  librosa==0.10.2.post1
3
  numpy==1.24.4
4
  peft==0.5.0
5
  sentencepiece==0.2.0
6
+ soundfile==0.12.1
7
  torch==2.2.0
8
  tqdm==4.66.5
9
+ transformers==4.49.0