# app.py
import gradio as gr
import torch
from transformers import pipeline, AutoTokenizer, T5ForConditionalGeneration
from diffusers import StableDiffusionPipeline
import speech_recognition as sr
from io import BytesIO

# ========== Step 1: Prompt Enhancement ==========
# Load the "google/flan-t5-small" model and its tokenizer once at import time;
# enhance_prompt reuses these module-level instances on every call.
prompt_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")
prompt_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

def enhance_prompt(raw_input, style_choice):
    """Expand a short description into a Stable Diffusion prompt.

    The description and style are inserted into a fixed instruction sentence,
    which is fed to the module-level ``prompt_model``; the first generated
    sequence is decoded back to text.

    Args:
        raw_input: The user's description of the desired picture.
        style_choice: Name of the art style to mention in the instruction.

    Returns:
        The decoded model output with special tokens removed.
    """
    instruction = f"Generate a detailed Stable Diffusion prompt about: {raw_input} in {style_choice} style."

    # Tokenize into PyTorch tensors; only the token ids are handed to generate().
    encoded = prompt_tokenizer(instruction, return_tensors="pt")
    token_ids = encoded.input_ids

    # Generation is capped at 100 tokens.
    generated = prompt_model.generate(token_ids, max_length=100)
    first_sequence = generated[0]
    return prompt_tokenizer.decode(first_sequence, skip_special_tokens=True)

# ========== Step 2: Image Generation ==========
# Load the "runwayml/stable-diffusion-v1-5" pipeline once at import time;
# generate_image reuses this module-level instance.
# float32 weights are requested explicitly — presumably for CPU inference,
# as the UI title ("CPU Optimized") suggests; confirm before changing dtype.
sd_pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float32,
    use_safetensors=True
)
sd_pipe.enable_attention_slicing()  # reduce memory consumption

def generate_image(enhanced_prompt, steps=20, guidance=7.5):
    """Render one image for a prompt with the module-level ``sd_pipe``.

    Args:
        enhanced_prompt: Text prompt passed to the pipeline.
        steps: Number of inference steps; converted to ``int`` before use.
        guidance: Guidance scale forwarded to the pipeline unchanged.

    Returns:
        The first image of the pipeline result.
    """
    step_count = int(steps)

    # A fresh generator seeded with 42 is created on every call, so the
    # random source starts from the same state each time.
    seeded_rng = torch.Generator().manual_seed(42)

    result = sd_pipe(
        enhanced_prompt,
        num_inference_steps=step_count,
        guidance_scale=guidance,
        generator=seeded_rng,
    )
    return result.images[0]

# ========== Step 3: Voice Input ==========
# Single module-level recognizer instance, used by audio_to_text.
recognizer = sr.Recognizer()

def audio_to_text(audio_file):
    """Transcribe an audio file with the Whisper ``tiny.en`` model.

    Args:
        audio_file: Path of the audio file to transcribe (anything accepted
            by ``sr.AudioFile``).

    Returns:
        The text returned by ``recognizer.recognize_whisper``.

    Raises:
        ValueError: If ``audio_file`` is ``None`` or empty, i.e. nothing was
            provided to transcribe.
    """
    # Fail early with a clear message instead of an opaque error from
    # sr.AudioFile when no file was supplied.
    if not audio_file:
        raise ValueError("No audio file provided: upload a recording before transcribing.")

    # Read the whole file into memory, then release the file handle before
    # running the (slow) transcription.
    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source)
    return recognizer.recognize_whisper(audio, model="tiny.en")

# ========== Gradio Interface ==========
def _toggle_input_widgets(mode):
    """Return visibility updates for (voice_input, text_input).

    The audio uploader is shown only when ``mode`` is "Voice" and the textbox
    only when it is "Text".
    """
    return gr.update(visible=mode == "Voice"), gr.update(visible=mode == "Text")


def _resolve_description(audio_path, mode, typed_text):
    """Return the description to enhance.

    In "Voice" mode the uploaded audio is transcribed; in every other case
    the text the user typed is passed through unchanged, so the textbox is
    not overwritten with the radio value.
    """
    if mode == "Voice":
        return audio_to_text(audio_path)
    return typed_text


# Build the UI: input controls on the left, generated prompt and image on the right.
with gr.Blocks(title="AI Art Studio") as app:
    gr.Markdown("## 🎨 AI Art Generator (CPU Optimized)")

    with gr.Row():
        with gr.Column(scale=2):
            # ===== Input controls =====
            input_type = gr.Radio(["Text", "Voice"], label="输入方式")
            # NOTE(review): `source=` is the Gradio 3.x keyword; newer releases
            # expect `sources=[...]` — confirm against the pinned Gradio version.
            voice_input = gr.Audio(source="upload", type="filepath", visible=False)
            text_input = gr.Textbox(label="输入描述", placeholder="描述你想生成的画面...")

            style_choice = gr.Dropdown(
                ["Digital Art", "Oil Painting", "Anime", "Photorealistic"],
                value="Digital Art",
                label="艺术风格"
            )

            generate_btn = gr.Button("生成作品", variant="primary")

            with gr.Accordion("高级设置", open=False):
                steps_slider = gr.Slider(10, 30, value=20, step=1, label="生成步数")
                guidance_slider = gr.Slider(5.0, 10.0, value=7.5, label="创意自由度")

        with gr.Column(scale=3):
            # ===== Output display =====
            prompt_output = gr.Textbox(label="优化后的Prompt", interactive=False)
            image_output = gr.Image(label="生成结果", show_label=False)

    # ===== Interaction logic =====
    # Switch between the audio uploader and the textbox when the mode changes.
    input_type.change(
        fn=_toggle_input_widgets,
        inputs=input_type,
        outputs=[voice_input, text_input]
    )

    # Three-stage chain; each stage runs only if the previous one succeeded:
    #   1. resolve the description (transcribe in Voice mode, else keep the typed text),
    #   2. enhance it into a prompt,
    #   3. generate the image from the enhanced prompt.
    generate_btn.click(
        fn=_resolve_description,
        # text_input is passed in as well so Text mode can return it unchanged.
        inputs=[voice_input, input_type, text_input],
        outputs=text_input
    ).success(
        fn=enhance_prompt,
        inputs=[text_input, style_choice],
        outputs=prompt_output
    ).success(
        fn=generate_image,
        inputs=[prompt_output, steps_slider, guidance_slider],
        outputs=image_output
    )

# ========== Step 4: Huggingface Deployment ==========
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    # Listen on all network interfaces on port 7860 — presumably the port the
    # hosting environment expects; confirm against the deployment config.
    app.launch(server_name="0.0.0.0", server_port=7860)