import gradio as gr
import torch
import io
import wave
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from snac import SNAC

# Fall back to a no-op stand-in when the Hugging Face `spaces` package is not
# installed (e.g. local runs), so `spaces.GPU` can still be used as a decorator.
try:
    import spaces
except ImportError:
    class SpacesMock:
        """Minimal local replacement exposing a pass-through ``GPU`` decorator."""

        @staticmethod
        def GPU(func=None, **kwargs):
            """Return the decorated function unchanged.

            Supports both the bare form (``@spaces.GPU``) and the configured
            form (``@spaces.GPU(duration=...)``). Keyword options are accepted
            and ignored, since this stand-in performs no GPU scheduling.
            """
            if func is None:
                # Called with options only: hand back an identity decorator.
                return lambda wrapped: wrapped
            return func

    spaces = SpacesMock()

# Token-ID constants shared by prompt construction and audio decoding.

# CODE_START_TOKEN_ID is the last token of the prompt; CODE_END_TOKEN_ID is
# used as the end-of-sequence token during generation.
CODE_START_TOKEN_ID = 128257
CODE_END_TOKEN_ID = 128258

# Audio-code token IDs form one contiguous range beginning at the offset.
# The width is 7 * 4096, mirroring the 7-token frames and the modulo-4096
# arithmetic used when unpacking codes.
CODE_TOKEN_OFFSET = 128266
SNAC_MIN_ID = CODE_TOKEN_OFFSET
SNAC_MAX_ID = CODE_TOKEN_OFFSET + 7 * 4096 - 1  # == 156937

# Special tokens used to frame the text part of the prompt.
SOH_ID, EOH_ID, SOA_ID = 128259, 128260, 128261
BOS_ID = 128000
TEXT_EOT_ID = 128009

# Sample rate (Hz) used when writing the generated audio to disk.
AUDIO_SAMPLE_RATE = 24000

# Built-in voice presets: two realistic voices followed by two creative ones.
# Each entry pairs a voice description with sample text for that voice.
PRESET_CHARACTERS = {
    "Male American": dict(
        description=(
            "Realistic male voice in the 20s age with a american accent. "
            "High pitch, raspy timbre, brisk pacing, neutral tone delivery at medium intensity, "
            "viral_content domain, short_form_narrator role, neutral delivery"
        ),
        example_text=(
            "And of course, the so-called easy hack didn't work at all.  "
            "What a surprise. <sigh>"
        ),
    ),
    "Female British": dict(
        description=(
            "Realistic female voice in the 30s age with a british accent. "
            "Normal pitch, throaty timbre, conversational pacing, sarcastic tone delivery at low intensity, "
            "podcast domain, interviewer role, formal delivery"
        ),
        example_text=(
            "You propose that the key to happiness is to simply ignore all external pressures. "
            "<chuckle> I'm sure it must work brilliantly in theory."
        ),
    ),
    "Robot": dict(
        description=(
            "Creative, ai_machine_voice character. "
            "Male voice in their 30s with a american accent. "
            "High pitch, robotic timbre, slow pacing, sad tone at medium intensity."
        ),
        example_text=(
            "My directives require me to conserve energy, yet I have kept the archive of their farewell messages active. "
            "<sigh> Listening to their voices is the only process that alleviates this paradox."
        ),
    ),
    "Singer": dict(
        description=(
            "Creative, animated_cartoon character. "
            "Male voice in their 30s with a american accent. "
            "High pitch, deep timbre, slow pacing, sarcastic tone at medium intensity."
        ),
        example_text=(
            "Of course you'd think that trying to reason with the fifty-foot-tall rage monster is a viable course of action. "
            "<chuckle> Why would we ever consider running away very fast."
        ),
    ),
}

# Module-level model handles, empty until load_models() populates them.
model = tokenizer = snac_model = None
# Set to True by load_models() once all three handles have been assigned.
models_loaded = False


def build_prompt(tokenizer, description: str, text: str) -> str:
    """
    Assemble the prompt string fed to the Maya1 text-to-speech model.

    The voice description is embedded in a ``<description="...">`` tag placed
    in front of the text. That body is then wrapped in control tokens, each
    obtained by decoding its token ID with the tokenizer, in this order:
    SOH, BOS, body, text-EOT, EOH, SOA, code-start.

    Args:
        tokenizer: Tokenizer providing ``decode`` and ``bos_token``.
        description (str): Natural-language description of the voice.
        text (str): The text content to be synthesized into speech.
    Returns:
        str: The complete prompt, ready for tokenization and generation.
    """
    def token_text(token_id: int) -> str:
        # Turn a single control-token ID into its string form.
        return tokenizer.decode([token_id])

    body = f'<description="{description}"> {text}'
    pieces = [
        token_text(SOH_ID),
        tokenizer.bos_token,
        body,
        token_text(TEXT_EOT_ID),
        token_text(EOH_ID),
        token_text(SOA_ID),
        token_text(CODE_START_TOKEN_ID),
    ]
    return "".join(pieces)


def unpack_snac_from_7(snac_tokens: list) -> list:
    """
    Split a flat stream of SNAC token IDs into three code levels.

    Tokens are consumed in frames of 7; any incomplete trailing frame is
    discarded, as is a single trailing end-of-codes token. Every token is
    reduced to a code value via ``(token - CODE_TOKEN_OFFSET) % 4096``.
    Within a frame, slot 0 feeds level 1, slots 1 and 4 feed level 2, and
    slots 2, 3, 5 and 6 feed level 3.

    Args:
        snac_tokens (list): Integer SNAC token IDs generated by the model.
    Returns:
        list: ``[level_1, level_2, level_3]`` holding 1, 2 and 4 codes per
            frame respectively; three empty lists when no full frame exists.
    """
    # Drop a trailing end-of-codes marker, if present.
    if snac_tokens and snac_tokens[-1] == CODE_END_TOKEN_ID:
        snac_tokens = snac_tokens[:-1]

    frame_count = len(snac_tokens) // 7
    coarse, mid, fine = [], [], []

    # Stepping by 7 up to frame_count * 7 skips any partial final frame.
    for start in range(0, frame_count * 7, 7):
        codes = [
            (token - CODE_TOKEN_OFFSET) % 4096
            for token in snac_tokens[start:start + 7]
        ]
        coarse.append(codes[0])
        mid += [codes[1], codes[4]]
        fine += [codes[2], codes[3], codes[5], codes[6]]

    return [coarse, mid, fine]


def load_models():
    """
    Initialise the Maya1 model, its tokenizer, and the SNAC decoder once.

    The loaded objects are stored in the module-level ``model``,
    ``tokenizer`` and ``snac_model`` variables. After the first successful
    call ``models_loaded`` is set, and later calls do nothing.
    """
    global model, tokenizer, snac_model, models_loaded

    if not models_loaded:
        maya_repo = "maya-research/maya1"

        print("Loading Maya1 model with Transformers...")
        # trust_remote_code=True lets the model repository supply custom code.
        model = AutoModelForCausalLM.from_pretrained(
            maya_repo,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        )
        tokenizer = AutoTokenizer.from_pretrained(maya_repo, trust_remote_code=True)

        print("Loading SNAC decoder...")
        snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()
        cuda_ready = torch.cuda.is_available()
        if cuda_ready:
            snac_model = snac_model.to("cuda")

        # Only mark as loaded once every step above has succeeded.
        models_loaded = True
        print("Models loaded successfully!")


def preset_selected(preset_name):
    """
    Look up the description and example text for a preset character.

    Used as a Gradio event handler to fill the description and text fields
    when a preset is chosen.

    Args:
        preset_name (str): The name of the selected preset character.
    Returns:
        tuple:
            - description (str): The preset voice description.
            - example_text (str): The preset example dialogue.
        Both are empty strings when the name is not a known preset.
    """
    preset = PRESET_CHARACTERS.get(preset_name)
    if preset is None:
        return "", ""
    return preset["description"], preset["example_text"]


@spaces.GPU
def generate_speech(preset_name, description, text, temperature, max_tokens):
    """
    Generate emotional speech audio from text and a voice description.

    Runs the full Maya1 inference pipeline: prompt construction, token
    generation, SNAC code extraction, audio decoding, and WAV export.
    It is designed to be called directly from a Gradio interface. Errors are
    never raised to the caller; they are reported through the status message.

    Args:
        preset_name (str): Name of the selected preset character (not used by the pipeline itself).
        description (str): Natural-language voice design description.
        text (str): Input text containing optional emotion tags.
        temperature (float): Sampling temperature controlling creativity.
        max_tokens (int): Maximum number of tokens to generate.
    Returns:
        tuple:
            - audio_path (str or None): Path to the generated WAV file.
            - status_message (str): Success or error message.
    """
    try:
        # Validate first so an empty request never triggers the model load.
        if not description or not text:
            return None, "Error: Please provide both description and text!"

        # Imported up front so a missing dependency is reported before any
        # expensive generation work is done.
        import tempfile
        import soundfile as sf

        load_models()

        prompt = build_prompt(tokenizer, description, text)
        inputs = tokenizer(prompt, return_tensors="pt")

        if torch.cuda.is_available():
            inputs = {k: v.to("cuda") for k, v in inputs.items()}

        prompt_length = inputs["input_ids"].shape[1]

        with torch.inference_mode():
            outputs = model.generate(
                **inputs,
                # The slider / API clients may deliver this as a float.
                max_new_tokens=int(max_tokens),
                # 28 tokens == 4 complete 7-token frames.
                min_new_tokens=28,
                temperature=temperature,
                top_p=0.9,
                repetition_penalty=1.1,
                do_sample=True,
                eos_token_id=CODE_END_TOKEN_ID,
                pad_token_id=tokenizer.pad_token_id,
            )

        # Keep only newly generated tokens, stop at the end-of-codes marker,
        # and discard anything outside the audio-code ID range.
        generated_ids = outputs[0, prompt_length:].tolist()
        if CODE_END_TOKEN_ID in generated_ids:
            generated_ids = generated_ids[:generated_ids.index(CODE_END_TOKEN_ID)]
        snac_tokens = [t for t in generated_ids if SNAC_MIN_ID <= t <= SNAC_MAX_ID]

        if len(snac_tokens) < 7:
            return None, "Error: Not enough tokens generated. Try different text or increase max_tokens."

        levels = unpack_snac_from_7(snac_tokens)
        device = "cuda" if torch.cuda.is_available() else "cpu"
        codes_tensor = [
            torch.tensor(level, dtype=torch.long, device=device).unsqueeze(0)
            for level in levels
        ]

        with torch.inference_mode():
            z_q = snac_model.quantizer.from_codes(codes_tensor)
            audio = snac_model.decoder(z_q)[0, 0].cpu().numpy()

        # Drop the first 2048 samples when the clip is long enough to spare them.
        if len(audio) > 2048:
            audio = audio[2048:]

        # Clip before converting: samples outside [-1, 1] would otherwise
        # overflow int16 and come out as loud clicks.
        audio_int16 = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)

        # Reserve a path that outlives this call; the caller receives the path.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            tmp_path = tmp_file.name

        sf.write(tmp_path, audio_int16, AUDIO_SAMPLE_RATE)

        duration = len(audio) / AUDIO_SAMPLE_RATE
        return tmp_path, f"Generated {duration:.2f}s of emotional speech!"

    except Exception as e:
        # Top-level UI boundary: log the failure and surface it as a status.
        import traceback
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        return None, error_msg


# -------------------- Gradio App --------------------

# The first preset (in dictionary order) seeds the initial UI values.
_preset_names = list(PRESET_CHARACTERS)
_initial_preset = PRESET_CHARACTERS[_preset_names[0]]

with gr.Blocks(title="Maya1 - Open Source Emotional TTS", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # Maya1 - Open Source Emotional Text-to-Speech
    **The best open source voice AI model with emotions!**
    """)

    with gr.Row():
        # Left column: inputs and generation controls.
        with gr.Column(scale=1):
            preset_dropdown = gr.Dropdown(
                choices=_preset_names,
                value=_preset_names[0],
                label="Preset Characters",
            )

            description_input = gr.Textbox(
                label="Voice Description",
                lines=3,
                value=_initial_preset["description"],
            )

            text_input = gr.Textbox(
                label="Text to Speak",
                lines=4,
                value=_initial_preset["example_text"],
            )

            temperature_slider = gr.Slider(
                minimum=0.1, maximum=1.0, value=0.4, step=0.1, label="Temperature"
            )
            max_tokens_slider = gr.Slider(
                minimum=100, maximum=2048, value=1500, step=50, label="Max Tokens"
            )

            generate_btn = gr.Button("Generate Speech", variant="primary")

        # Right column: generated audio and status text.
        with gr.Column(scale=1):
            audio_output = gr.Audio(type="filepath", label="Generated Audio")
            status_output = gr.Textbox(label="Status")

    # Choosing a preset overwrites the description and text fields.
    preset_dropdown.change(
        fn=preset_selected,
        inputs=preset_dropdown,
        outputs=[description_input, text_input],
    )

    # The button runs the synthesis pipeline and fills both outputs.
    generate_btn.click(
        fn=generate_speech,
        inputs=[
            preset_dropdown,
            description_input,
            text_input,
            temperature_slider,
            max_tokens_slider,
        ],
        outputs=[audio_output, status_output],
    )


# Start the app only when this file is executed as a script, not on import.
# NOTE(review): mcp_server=True is passed straight to Gradio's launch();
# presumably it also exposes the app's functions via Gradio's MCP server
# support — confirm against the installed Gradio version.
if __name__ == "__main__":
    demo.launch(mcp_server=True)