import gradio as gr
import torch
import io
import tempfile
import traceback
import wave
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from snac import SNAC

# Mock spaces module for local testing: when the `spaces` package is not
# installed, `@spaces.GPU` becomes a pass-through decorator.
try:
    import spaces
except ImportError:
    class SpacesMock:
        @staticmethod
        def GPU(func):
            return func

    spaces = SpacesMock()

# Constants: special token ids used when building prompts and when filtering
# the generated ids down to SNAC audio codes.
CODE_START_TOKEN_ID = 128257
CODE_END_TOKEN_ID = 128258
CODE_TOKEN_OFFSET = 128266
SNAC_MIN_ID = 128266
SNAC_MAX_ID = 156937
SOH_ID = 128259
EOH_ID = 128260
SOA_ID = 128261
BOS_ID = 128000
TEXT_EOT_ID = 128009
AUDIO_SAMPLE_RATE = 24000

# Each generated audio frame is a group of this many consecutive SNAC tokens.
SNAC_TOKENS_PER_FRAME = 7
# Every code is reduced modulo this value after the token offset is removed.
SNAC_CODEBOOK_SIZE = 4096
# Number of leading samples dropped from the decoded waveform.
# NOTE(review): presumably decoder warm-up samples -- confirm against the
# Maya1 reference implementation.
AUDIO_TRIM_SAMPLES = 2048

# Preset characters (2 realistic + 2 creative)
PRESET_CHARACTERS = {
    "Male American": {
        "description": "Realistic male voice in the 20s age with a american accent. High pitch, raspy timbre, brisk pacing, neutral tone delivery at medium intensity, viral_content domain, short_form_narrator role, neutral delivery",
        "example_text": "And of course, the so-called easy hack didn't work at all. What a surprise. ",
    },
    "Female British": {
        "description": "Realistic female voice in the 30s age with a british accent. Normal pitch, throaty timbre, conversational pacing, sarcastic tone delivery at low intensity, podcast domain, interviewer role, formal delivery",
        "example_text": "You propose that the key to happiness is to simply ignore all external pressures. I'm sure it must work brilliantly in theory.",
    },
    "Robot": {
        "description": "Creative, ai_machine_voice character. Male voice in their 30s with a american accent. High pitch, robotic timbre, slow pacing, sad tone at medium intensity.",
        "example_text": "My directives require me to conserve energy, yet I have kept the archive of their farewell messages active. Listening to their voices is the only process that alleviates this paradox.",
    },
    "Singer": {
        "description": "Creative, animated_cartoon character. Male voice in their 30s with a american accent. High pitch, deep timbre, slow pacing, sarcastic tone at medium intensity.",
        "example_text": "Of course you'd think that trying to reason with the fifty-foot-tall rage monster is a viable course of action. Why would we ever consider running away very fast.",
    },
}

# The preset shown when the UI first loads (first key of PRESET_CHARACTERS).
DEFAULT_PRESET = next(iter(PRESET_CHARACTERS))

# Global model variables, populated lazily by load_models().
model = None
tokenizer = None
snac_model = None
models_loaded = False


def build_prompt(tokenizer, description: str, text: str) -> str:
    """
    Build a formatted prompt for the Maya1 text-to-speech model.

    The prompt is the concatenation of: start-of-header token, BOS token,
    a ``<description="...">`` tag carrying the voice description followed by
    the text, end-of-text token, end-of-header token, start-of-audio token
    and the code-start token.

    Args:
        tokenizer: The tokenizer associated with the Maya1 model. It must
            provide ``decode`` and ``bos_token``.
        description (str): A structured natural-language description of the voice.
        text (str): The text content to be synthesized into speech.

    Returns:
        str: A fully formatted prompt string ready for tokenization and generation.
    """
    soh_token = tokenizer.decode([SOH_ID])
    eoh_token = tokenizer.decode([EOH_ID])
    soa_token = tokenizer.decode([SOA_ID])
    sos_token = tokenizer.decode([CODE_START_TOKEN_ID])
    eot_token = tokenizer.decode([TEXT_EOT_ID])
    bos_token = tokenizer.bos_token

    # The voice description must be embedded in the prompt; without this tag
    # the `description` argument would have no effect on the generated voice.
    formatted_text = f'<description="{description}"> {text}'

    return "".join(
        [
            soh_token,
            bos_token,
            formatted_text,
            eot_token,
            eoh_token,
            soa_token,
            sos_token,
        ]
    )


def unpack_snac_from_7(snac_tokens: list) -> list:
    """
    Unpack SNAC tokens from 7-token frames into hierarchical code levels.

    A trailing ``CODE_END_TOKEN_ID`` is dropped, then any tokens that do not
    fill a complete 7-token frame are discarded. Within each frame, slot 0
    goes to level 1, slots 1 and 4 go to level 2, and slots 2, 3, 5 and 6 go
    to level 3. Every code is ``(token - CODE_TOKEN_OFFSET) % 4096``.

    Args:
        snac_tokens (list): A list of integer SNAC token IDs generated by the model.

    Returns:
        list: ``[level_1, level_2, level_3]`` where
            - level_1 (list[int]): one code per frame.
            - level_2 (list[int]): two codes per frame.
            - level_3 (list[int]): four codes per frame.
            All three lists are empty when there is no complete frame.
    """
    if snac_tokens and snac_tokens[-1] == CODE_END_TOKEN_ID:
        snac_tokens = snac_tokens[:-1]

    frame_count = len(snac_tokens) // SNAC_TOKENS_PER_FRAME
    if frame_count == 0:
        return [[], [], []]

    # Index of the destination level for each of the 7 slots in a frame.
    slot_to_level = (0, 1, 2, 2, 1, 2, 2)
    levels = [[], [], []]
    usable = snac_tokens[:frame_count * SNAC_TOKENS_PER_FRAME]
    for position, token in enumerate(usable):
        level_index = slot_to_level[position % SNAC_TOKENS_PER_FRAME]
        levels[level_index].append(
            (token - CODE_TOKEN_OFFSET) % SNAC_CODEBOOK_SIZE
        )
    return levels


def load_models():
    """
    Load the Maya1 language model, tokenizer, and SNAC audio decoder.

    The first call populates the module-level ``model``, ``tokenizer`` and
    ``snac_model`` globals and sets ``models_loaded``. Subsequent calls
    return immediately to avoid reloading large model weights. The SNAC
    decoder is moved to CUDA when CUDA is available.
    """
    global model, tokenizer, snac_model, models_loaded

    if models_loaded:
        return

    print("Loading Maya1 model with Transformers...")
    model = AutoModelForCausalLM.from_pretrained(
        "maya-research/maya1",
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        "maya-research/maya1",
        trust_remote_code=True,
    )

    print("Loading SNAC decoder...")
    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()
    if torch.cuda.is_available():
        snac_model = snac_model.to("cuda")

    models_loaded = True
    print("Models loaded successfully!")


def preset_selected(preset_name):
    """
    Update the voice description and example text based on a preset selection.

    This function is used as a Gradio event handler to populate UI fields
    when a preset character is chosen.

    Args:
        preset_name (str): The name of the selected preset character.

    Returns:
        tuple:
            - description (str): The preset voice description.
            - example_text (str): The preset example dialogue.
            Both are empty strings when the preset name is unknown.
    """
    preset = PRESET_CHARACTERS.get(preset_name)
    if preset is None:
        return "", ""
    return preset["description"], preset["example_text"]


def _extract_snac_tokens(generated_ids: list) -> list:
    """Return the SNAC-range token ids that precede the first code-end token."""
    try:
        end = generated_ids.index(CODE_END_TOKEN_ID)
    except ValueError:
        end = len(generated_ids)
    return [t for t in generated_ids[:end] if SNAC_MIN_ID <= t <= SNAC_MAX_ID]


def _decode_audio(levels: list):
    """Decode three SNAC code levels into a 1-D NumPy waveform."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    codes = [
        torch.tensor(level, dtype=torch.long, device=device).unsqueeze(0)
        for level in levels
    ]
    with torch.inference_mode():
        z_q = snac_model.quantizer.from_codes(codes)
        audio = snac_model.decoder(z_q)[0, 0].cpu().numpy()

    # Drop the leading samples, but only when something would remain.
    if len(audio) > AUDIO_TRIM_SAMPLES:
        audio = audio[AUDIO_TRIM_SAMPLES:]
    return audio


def _write_wav(audio) -> str:
    """Write a waveform to a new temporary 16-bit WAV file and return its path."""
    # Imported lazily so the module still imports when soundfile is missing.
    import soundfile as sf

    # Clip first: samples outside [-1, 1] would otherwise wrap around when
    # cast to int16 and produce loud clicks.
    audio_int16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)

    # Reserve a path, then close the handle before soundfile reopens it;
    # writing to a still-open NamedTemporaryFile fails on some platforms.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_path = tmp_file.name
    sf.write(tmp_path, audio_int16, AUDIO_SAMPLE_RATE)
    return tmp_path


@spaces.GPU
def generate_speech(preset_name, description, text, temperature, max_tokens):
    """
    Generate emotional speech audio from text and voice description.

    This function runs the full Maya1 inference pipeline: prompt construction,
    token generation, SNAC code extraction, audio decoding, and WAV export.
    It is designed to be called directly from a Gradio interface.

    Args:
        preset_name (str): Name of the selected preset character. It is
            accepted for the UI wiring but not used during generation.
        description (str): Natural-language voice design description.
        text (str): Input text containing optional emotion tags.
        temperature (float): Sampling temperature controlling creativity.
        max_tokens (int): Maximum number of tokens to generate.

    Returns:
        tuple:
            - audio_path (str or None): Path to the generated WAV file.
            - status_message (str): Success or error message.
    """
    # Validate before load_models() so bad input does not trigger a model load.
    if not (description and description.strip() and text and text.strip()):
        return None, "Error: Please provide both description and text!"

    try:
        load_models()

        prompt = build_prompt(tokenizer, description, text)
        inputs = tokenizer(prompt, return_tensors="pt")
        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}

        with torch.inference_mode():
            outputs = model.generate(
                **inputs,
                # UI sliders may deliver floats; generate() needs an int here.
                max_new_tokens=int(max_tokens),
                min_new_tokens=28,
                temperature=float(temperature),
                top_p=0.9,
                repetition_penalty=1.1,
                do_sample=True,
                eos_token_id=CODE_END_TOKEN_ID,
                pad_token_id=tokenizer.pad_token_id,
            )

        # Keep only the newly generated ids (everything after the prompt).
        prompt_length = inputs["input_ids"].shape[1]
        generated_ids = outputs[0, prompt_length:].tolist()

        snac_tokens = _extract_snac_tokens(generated_ids)
        if len(snac_tokens) < SNAC_TOKENS_PER_FRAME:
            return None, "Error: Not enough tokens generated. Try different text or increase max_tokens."

        audio = _decode_audio(unpack_snac_from_7(snac_tokens))
        tmp_path = _write_wav(audio)

        duration = len(audio) / AUDIO_SAMPLE_RATE
        return tmp_path, f"Generated {duration:.2f}s of emotional speech!"

    except Exception as e:
        # Top-level UI boundary: report the failure in the status box instead
        # of crashing the Gradio worker.
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return None, error_msg


# -------------------- Gradio App --------------------
with gr.Blocks(title="Maya1 - Open Source Emotional TTS", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        "# Maya1 - Open Source Emotional Text-to-Speech\n"
        "\n"
        "**The best open source voice AI model with emotions!**\n"
    )

    with gr.Row():
        with gr.Column(scale=1):
            preset_dropdown = gr.Dropdown(
                choices=list(PRESET_CHARACTERS),
                value=DEFAULT_PRESET,
                label="Preset Characters",
            )
            description_input = gr.Textbox(
                label="Voice Description",
                lines=3,
                value=PRESET_CHARACTERS[DEFAULT_PRESET]["description"],
            )
            text_input = gr.Textbox(
                label="Text to Speak",
                lines=4,
                value=PRESET_CHARACTERS[DEFAULT_PRESET]["example_text"],
            )
            temperature_slider = gr.Slider(
                minimum=0.1, maximum=1.0, value=0.4, step=0.1, label="Temperature"
            )
            max_tokens_slider = gr.Slider(
                minimum=100, maximum=2048, value=1500, step=50, label="Max Tokens"
            )
            generate_btn = gr.Button("Generate Speech", variant="primary")

        with gr.Column(scale=1):
            audio_output = gr.Audio(type="filepath", label="Generated Audio")
            status_output = gr.Textbox(label="Status")

    preset_dropdown.change(
        fn=preset_selected,
        inputs=preset_dropdown,
        outputs=[description_input, text_input],
    )
    generate_btn.click(
        fn=generate_speech,
        inputs=[
            preset_dropdown,
            description_input,
            text_input,
            temperature_slider,
            max_tokens_slider,
        ],
        outputs=[audio_output, status_output],
    )

if __name__ == "__main__":
    demo.launch(mcp_server=True)