|
|
|
|
|
|
|
|
|
|
|
|
|
|
mod synthesis; |
|
|
|
|
|
pub use synthesis::{IndexTTS, SynthesisOptions, SynthesisResult}; |
|
|
|
|
|
use crate::{Error, Result}; |
|
|
use std::path::{Path, PathBuf}; |
|
|
|
|
|
|
|
|
/// Identifies one stage of the pipeline.
///
/// Variants are declared in the same order in which [`PipelineStage::all`] returns them.
/// A stage is passed to a `ProgressCallback` together with an `f32` value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PipelineStage {
    /// Labelled `"Text Normalization"` by [`PipelineStage::name`].
    TextNormalization,
    /// Labelled `"Tokenization"`.
    Tokenization,
    /// Labelled `"Semantic Encoding"`.
    SemanticEncoding,
    /// Labelled `"Speaker Conditioning"`.
    SpeakerConditioning,
    /// Labelled `"GPT Generation"`.
    GptGeneration,
    /// Labelled `"Acoustic Expansion"`.
    AcousticExpansion,
    /// Labelled `"Vocoding"`.
    Vocoding,
    /// Labelled `"Post Processing"`.
    PostProcessing,
}

impl PipelineStage {
    /// Every stage, in declaration order. Backing data for [`PipelineStage::all`].
    const ORDERED: [Self; 8] = [
        Self::TextNormalization,
        Self::Tokenization,
        Self::SemanticEncoding,
        Self::SpeakerConditioning,
        Self::GptGeneration,
        Self::AcousticExpansion,
        Self::Vocoding,
        Self::PostProcessing,
    ];

    /// Returns the human-readable label for this stage.
    pub fn name(&self) -> &'static str {
        // No wildcard arm: adding a variant must fail to compile until it gets a label.
        match *self {
            Self::TextNormalization => "Text Normalization",
            Self::Tokenization => "Tokenization",
            Self::SemanticEncoding => "Semantic Encoding",
            Self::SpeakerConditioning => "Speaker Conditioning",
            Self::GptGeneration => "GPT Generation",
            Self::AcousticExpansion => "Acoustic Expansion",
            Self::Vocoding => "Vocoding",
            Self::PostProcessing => "Post Processing",
        }
    }

    /// Returns all stages as a freshly allocated vector, in declaration order.
    pub fn all() -> Vec<PipelineStage> {
        Self::ORDERED.to_vec()
    }
}
|
|
|
|
|
|
|
|
pub type ProgressCallback = Box<dyn Fn(PipelineStage, f32) + Send + Sync>; |
|
|
|
|
|
|
|
|
/// Settings for the pipeline.
///
/// Start from [`PipelineConfig::default`] and override individual fields as needed.
#[derive(Debug, Clone)]
pub struct PipelineConfig {
    /// Model directory. Defaults to the relative path `models`.
    pub model_dir: PathBuf,
    /// FP16 toggle, off by default.
    /// NOTE(review): presumably selects half-precision inference — confirm with the consumer.
    pub use_fp16: bool,
    /// Device identifier as a free-form string. Defaults to `"cpu"`.
    pub device: String,
    /// Cache toggle, on by default. What is cached is decided by the consumer of this config.
    pub enable_cache: bool,
    /// Upper bound on input text length. Defaults to `500`.
    /// NOTE(review): the unit (bytes vs. characters) is not fixed here — confirm.
    pub max_text_length: usize,
    /// Upper bound on audio duration. Defaults to `30.0`.
    /// NOTE(review): presumably seconds — confirm.
    pub max_audio_duration: f32,
}

impl Default for PipelineConfig {
    /// Builds the baseline configuration: `models` directory, `"cpu"` device, FP16 off,
    /// cache on, text limit 500 and audio limit 30.0.
    fn default() -> Self {
        let model_dir = PathBuf::from("models");
        let device = String::from("cpu");

        PipelineConfig {
            model_dir,
            use_fp16: false,
            device,
            enable_cache: true,
            max_text_length: 500,
            max_audio_duration: 30.0,
        }
    }
}
|
|
|
|
|
impl PipelineConfig { |
|
|
|
|
|
pub fn with_model_dir<P: AsRef<Path>>(mut self, path: P) -> Self { |
|
|
self.model_dir = path.as_ref().to_path_buf(); |
|
|
self |
|
|
} |
|
|
|
|
|
|
|
|
pub fn with_fp16(mut self, enable: bool) -> Self { |
|
|
self.use_fp16 = enable; |
|
|
self |
|
|
} |
|
|
|
|
|
|
|
|
pub fn with_device(mut self, device: &str) -> Self { |
|
|
self.device = device.to_string(); |
|
|
self |
|
|
} |
|
|
|
|
|
|
|
|
pub fn validate(&self) -> Result<()> { |
|
|
if !self.model_dir.exists() { |
|
|
log::warn!( |
|
|
"Model directory does not exist: {}", |
|
|
self.model_dir.display() |
|
|
); |
|
|
} |
|
|
|
|
|
if self.max_text_length == 0 { |
|
|
return Err(Error::Config("max_text_length must be > 0".into())); |
|
|
} |
|
|
|
|
|
if self.max_audio_duration <= 0.0 { |
|
|
return Err(Error::Config("max_audio_duration must be > 0".into())); |
|
|
} |
|
|
|
|
|
Ok(()) |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
pub fn segment_text(text: &str, max_segment_len: usize) -> Vec<String> { |
|
|
use crate::text::TextNormalizer; |
|
|
|
|
|
let normalizer = TextNormalizer::new(); |
|
|
let sentences = normalizer.split_sentences(text); |
|
|
|
|
|
let mut segments = Vec::new(); |
|
|
let mut current_segment = String::new(); |
|
|
|
|
|
for sentence in sentences { |
|
|
if current_segment.len() + sentence.len() > max_segment_len && !current_segment.is_empty() |
|
|
{ |
|
|
segments.push(current_segment.trim().to_string()); |
|
|
current_segment = sentence; |
|
|
} else { |
|
|
if !current_segment.is_empty() { |
|
|
current_segment.push(' '); |
|
|
} |
|
|
current_segment.push_str(&sentence); |
|
|
} |
|
|
} |
|
|
|
|
|
if !current_segment.trim().is_empty() { |
|
|
segments.push(current_segment.trim().to_string()); |
|
|
} |
|
|
|
|
|
segments |
|
|
} |
|
|
|
|
|
|
|
|
/// Joins audio segments into a single buffer, separated by runs of zero-valued samples.
///
/// The length of each run is `silence_duration_ms * sample_rate / 1000` samples, computed with
/// truncating integer division. Runs are inserted only between neighbouring segments, never
/// before the first or after the last one. An empty `segments` slice yields an empty vector.
pub fn concatenate_audio(
    segments: &[Vec<f32>],
    silence_duration_ms: u32,
    sample_rate: u32,
) -> Vec<f32> {
    let gap_len = (silence_duration_ms as usize * sample_rate as usize) / 1000;
    let gap = vec![0.0f32; gap_len];

    // `join` with a slice separator places the gap between segments only.
    segments.join(gap.as_slice())
}
|
|
|
|
|
|
|
|
/// Estimates a duration by dividing the number of Unicode scalar values in `text` by
/// `chars_per_second`.
///
/// `chars_per_second` is not validated: a zero rate yields infinity for non-empty text and
/// NaN for empty text, and a negative rate yields a negative result.
pub fn estimate_duration(text: &str, chars_per_second: f32) -> f32 {
    // Count `char`s rather than bytes so multi-byte characters count once each.
    let char_count = text.chars().count();
    (char_count as f32) / chars_per_second
}
|
|
|