# app.py
"""Gradio app: turn a typed or spoken description into a Stable Diffusion image.

Pipeline: (optional) speech-to-text -> prompt enhancement with FLAN-T5 ->
image generation with Stable Diffusion v1.5.
"""
from io import BytesIO

import gradio as gr
import speech_recognition as sr
import torch
from diffusers import StableDiffusionPipeline
from transformers import pipeline, AutoTokenizer, T5ForConditionalGeneration

# ========== Step 1: Prompt Enhancement ==========
prompt_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small")
prompt_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")


def enhance_prompt(raw_input, style_choice):
    """Expand a short description into a Stable Diffusion prompt.

    Args:
        raw_input: The user's description of the desired picture.
        style_choice: Name of the art style to mention in the instruction.

    Returns:
        The text generated by the FLAN-T5 model, with special tokens removed.
    """
    template = (
        f"Generate a detailed Stable Diffusion prompt about: {raw_input} "
        f"in {style_choice} style."
    )
    inputs = prompt_tokenizer(template, return_tensors="pt")
    outputs = prompt_model.generate(inputs.input_ids, max_length=100)
    return prompt_tokenizer.decode(outputs[0], skip_special_tokens=True)


# ========== Step 2: Image Generation ==========
sd_pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    torch_dtype=torch.float32,
    use_safetensors=True,
)
sd_pipe.enable_attention_slicing()  # reduce memory consumption


def generate_image(enhanced_prompt, steps=20, guidance=7.5):
    """Render one image for the given prompt.

    Args:
        enhanced_prompt: Prompt text passed to the Stable Diffusion pipeline.
        steps: Number of inference steps; coerced to ``int`` because the UI
            slider may deliver a float.
        guidance: Guidance scale passed to the pipeline.

    Returns:
        The first image produced by the pipeline.
    """
    # A fixed seed keeps the random generator state identical across calls.
    seeded_generator = torch.Generator().manual_seed(42)
    result = sd_pipe(
        enhanced_prompt,
        num_inference_steps=int(steps),
        guidance_scale=guidance,
        generator=seeded_generator,
    )
    return result.images[0]


# ========== Step 3: Voice Input ==========
recognizer = sr.Recognizer()


def audio_to_text(audio_file):
    """Transcribe an audio file with the Whisper ``tiny.en`` model.

    Args:
        audio_file: Path of the audio file to transcribe.

    Returns:
        The transcription returned by ``recognize_whisper``.
    """
    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source)
    return recognizer.recognize_whisper(audio, model="tiny.en")


def _resolve_description(audio_path, input_mode, typed_text):
    """Return the description to use, from either the audio or the text box.

    Raises ``gr.Error`` (shown in the UI, and stopping the ``.success`` chain)
    when the selected input is missing or empty.
    """
    if input_mode == "Voice":
        if not audio_path:
            # No file uploaded yet: fail early with a readable message.
            raise gr.Error("请先上传音频文件")
        description = audio_to_text(audio_path)
    else:
        # Text mode (or nothing selected): use what the user typed, not the
        # radio value itself.
        description = typed_text

    description = (description or "").strip()
    if not description:
        raise gr.Error("请输入画面描述")
    return description


# ========== Gradio Interface ==========
with gr.Blocks(title="AI Art Studio") as app:
    gr.Markdown("## 🎨 AI Art Generator (CPU Optimized)")

    with gr.Row():
        with gr.Column(scale=2):
            # ===== Input controls =====
            # Default to "Text" so the radio matches the initial visibility
            # of the two input widgets below.
            input_type = gr.Radio(["Text", "Voice"], value="Text", label="输入方式")
            # NOTE(review): Gradio 4 renamed `source` to `sources=[...]`;
            # confirm against the installed Gradio version.
            voice_input = gr.Audio(source="upload", type="filepath", visible=False)
            text_input = gr.Textbox(
                label="输入描述",
                placeholder="描述你想生成的画面...",
            )
            style_choice = gr.Dropdown(
                ["Digital Art", "Oil Painting", "Anime", "Photorealistic"],
                value="Digital Art",
                label="艺术风格",
            )
            generate_btn = gr.Button("生成作品", variant="primary")

            with gr.Accordion("高级设置", open=False):
                steps_slider = gr.Slider(10, 30, value=20, step=1, label="生成步数")
                guidance_slider = gr.Slider(5.0, 10.0, value=7.5, label="创意自由度")

        with gr.Column(scale=3):
            # ===== Output display =====
            prompt_output = gr.Textbox(label="优化后的Prompt", interactive=False)
            image_output = gr.Image(label="生成结果", show_label=False)

    # ===== Interaction logic =====
    # Show only the widget that matches the selected input mode.
    input_type.change(
        fn=lambda x: (gr.update(visible=x == "Voice"), gr.update(visible=x == "Text")),
        inputs=input_type,
        outputs=[voice_input, text_input],
    )

    # Each stage runs only if the previous one finished without error.
    generate_btn.click(
        fn=_resolve_description,
        inputs=[voice_input, input_type, text_input],
        outputs=text_input,
    ).success(
        fn=enhance_prompt,
        inputs=[text_input, style_choice],
        outputs=prompt_output,
    ).success(
        fn=generate_image,
        inputs=[prompt_output, steps_slider, guidance_slider],
        outputs=image_output,
    )

# ========== Step 4: Huggingface Deployment ==========
if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)