garyuzair committed on
Commit da66c01 · verified · 1 Parent(s): d4c3da5

Update src/app_hf_space_optimized.py

Files changed (1):
  1. src/app_hf_space_optimized.py +507 -89

src/app_hf_space_optimized.py CHANGED
@@ -1,6 +1,3 @@
- # ✅ Fully Functional POV Automation App (Streamlit for HF Spaces)
- # Optimized for Free Tier: LLM (TinyLlama), SD 1.4, Parler-TTS, FFmpeg image+audio → video
-
  import os
  import gc
  import torch
@@ -9,6 +6,9 @@ import tempfile
  import json
  import subprocess
  from huggingface_hub import hf_hub_download
 
  from transformers import AutoTokenizer, AutoModelForCausalLM
  from parler_tts import ParlerTTSForConditionalGeneration
@@ -17,116 +17,534 @@ from PIL import Image
  import soundfile as sf
 
  # --- Config ---
- st.set_page_config(layout="wide", page_title="⚡ POV Generator (Lite HF Space)")
 
- LLM_MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.1"
  IMG_MODEL_ID = "CompVis/stable-diffusion-v1-4"
- TTS_MODEL_ID = "parler-tts/parler-tts-mini-v1.1"
- CACHE_DIR = "/tmp/hf_cache"
  os.makedirs(CACHE_DIR, exist_ok=True)
  os.environ['HUGGINGFACE_HUB_CACHE'] = CACHE_DIR
 
- # --- Util ---
- def clear_torch():
      gc.collect()
      if torch.cuda.is_available():
          torch.cuda.empty_cache()
 
- # --- Step 1: Generate JSON Story ---
- def generate_story(prompt: str, num_scenes: int):
-     st.info("🧠 Generating story...")
-     tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID, cache_dir=CACHE_DIR)
      model = AutoModelForCausalLM.from_pretrained(
-         LLM_MODEL_ID,
          torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
          device_map="auto",
          cache_dir=CACHE_DIR
      )
 
-     sys_prompt = (
-         f"You are a scriptwriter. Write a short POV story in exactly {num_scenes} scenes. "
-         f"Respond ONLY with valid JSON in this format: "
-         f"{{\"title\": \"Your Title\", \"scenes\": [\"scene 1\", \"scene 2\", ..., \"scene {num_scenes}\"]}}"
      )
 
-     full_prompt = tokenizer.apply_chat_template([
-         {"role": "system", "content": sys_prompt},
-         {"role": "user", "content": prompt}
-     ], tokenize=False)
-     input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids.to(model.device)
-     output = model.generate(input_ids, max_new_tokens=1024, do_sample=True)
-     result = tokenizer.decode(output[0], skip_special_tokens=True)
-     clear_torch()
-     try:
-         json_part = result[result.index("{"): result.rindex("}")+1]
-         return json.loads(json_part)
-     except:
-         st.error("⚠️ Failed to parse JSON. Check model output.")
-         st.code(result)
-         return None
-
- # --- Step 2: Generate Images ---
- def generate_images(scenes):
-     st.info("🎨 Generating images...")
-     pipe = StableDiffusionPipeline.from_pretrained(
-         IMG_MODEL_ID,
          torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
          cache_dir=CACHE_DIR
      )
-     pipe.to("cuda" if torch.cuda.is_available() else "cpu")
-     images = []
-     for i, scene in enumerate(scenes):
-         image = pipe(scene).images[0]
-         images.append(image)
-         st.image(image, caption=f"Scene {i+1}")
-     clear_torch()
-     return images
 
- # --- Step 3: Generate TTS ---
- def generate_audios(scenes):
-     st.info("🔊 Generating audio...")
-     tts = ParlerTTSForConditionalGeneration.from_pretrained(TTS_MODEL_ID, device_map="auto", cache_dir=CACHE_DIR)
-     tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_ID, cache_dir=CACHE_DIR)
-     desc_tokenizer = AutoTokenizer.from_pretrained(tts.config.text_encoder._name_or_path, cache_dir=CACHE_DIR)
 
      audio_paths = []
-     for i, scene in enumerate(scenes):
-         desc_ids = desc_tokenizer("Neutral narrator", return_tensors="pt").input_ids.to(tts.device)
-         prompt_ids = tokenizer(scene, return_tensors="pt").input_ids.to(tts.device)
-         wav = tts.generate(input_ids=desc_ids, prompt_input_ids=prompt_ids).to(torch.float32).cpu().numpy()
-         path = f"audio_{i+1}.wav"
-         sf.write(path, wav, 24000)
-         audio_paths.append(path)
-         st.audio(path)
-     clear_torch()
      return audio_paths
 
  # --- Step 4: Create Video ---
- def create_video(images, audio_paths):
-     st.info("📹 Creating video...")
-     frames_dir = tempfile.mkdtemp()
-     for idx, img in enumerate(images):
-         img.save(os.path.join(frames_dir, f"frame_{idx:03}.png"))
-
-     video_path = "final_output.mp4"
-     command = [
-         "ffmpeg", "-y", "-r", "1", "-i", f"{frames_dir}/frame_%03d.png",
-         "-i", audio_paths[0],
-         "-c:v", "libx264", "-pix_fmt", "yuv420p",
-         video_path
      ]
-     subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
-     st.video(video_path)
-
- # --- UI ---
- st.title("⚡ POV Generator – Hugging Face Free Tier Ready")
- prompt = st.text_area("Enter a POV prompt:", "POV: You wake up on Mars and can't remember Earth")
- num_scenes = st.slider("Number of Scenes", 6, 20, 6)
-
- if st.button("🚀 Generate Story"):
-     story = generate_story(prompt, num_scenes)
-     if story:
-         st.subheader(story['title'])
-         images = generate_images(story['scenes'])
-         audios = generate_audios(story['scenes'])
-         create_video(images, audios)
  import os
  import gc
  import torch
  import streamlit as st
  import tempfile
  import json
  import subprocess
  from huggingface_hub import hf_hub_download
+ import shutil
+ from datetime import datetime
+ from io import BytesIO
 
  from transformers import AutoTokenizer, AutoModelForCausalLM
  from parler_tts import ParlerTTSForConditionalGeneration
  from diffusers import StableDiffusionPipeline
  from PIL import Image
  import soundfile as sf
 
  # --- Config ---
+ st.set_page_config(layout="wide", page_title="⚡ POV Generator Pro")
 
+ LLM_MODEL_ID = "openai-community/gpt2-medium"  # Slightly larger GPT-2 model
  IMG_MODEL_ID = "CompVis/stable-diffusion-v1-4"
+ TTS_MODEL_ID = "parler-tts/parler-tts-mini-v1.1"  # Make sure this matches your desired ParlerTTS model version
+
+ # Using Streamlit's native caching for Hugging Face Hub downloads if possible;
+ # otherwise, this explicit cache dir is fine.
+ # For HF Spaces, /tmp is ephemeral but fine for a session.
+ CACHE_DIR = os.path.join(tempfile.gettempdir(), "hf_cache_pov_generator")
  os.makedirs(CACHE_DIR, exist_ok=True)
  os.environ['HUGGINGFACE_HUB_CACHE'] = CACHE_DIR
+ os.environ['HF_HOME'] = CACHE_DIR  # Also sets the general Hugging Face home
+ os.environ['TRANSFORMERS_CACHE'] = CACHE_DIR
+ os.environ['DIFFUSERS_CACHE'] = CACHE_DIR
+
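
A note on the cache setup above: transformers, diffusers, and huggingface_hub resolve their default cache locations when the libraries are first imported, so environment variables set after the imports may not affect calls that rely on library defaults. Passing `cache_dir` explicitly (as this file does everywhere) sidesteps the issue; a minimal sketch of the import-order-safe alternative, with a hypothetical entry module, would be:

    # Set cache locations BEFORE importing any Hugging Face library,
    # so their import-time default cache resolution picks these up.
    import os
    import tempfile

    CACHE_DIR = os.path.join(tempfile.gettempdir(), "hf_cache_pov_generator")
    os.makedirs(CACHE_DIR, exist_ok=True)
    os.environ["HF_HOME"] = CACHE_DIR

    from transformers import AutoTokenizer  # imported only after the env vars are set
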
+ # --- Session State Initialization ---
+ if 'run_id' not in st.session_state:
+     st.session_state.run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
+ if 'story_data' not in st.session_state:
+     st.session_state.story_data = None
+ if 'pil_images' not in st.session_state:
+     st.session_state.pil_images = None
+ if 'image_paths_for_video' not in st.session_state:
+     st.session_state.image_paths_for_video = None
+ if 'audio_paths' not in st.session_state:
+     st.session_state.audio_paths = None
+ if 'video_path' not in st.session_state:
+     st.session_state.video_path = None
+ if 'temp_base_dir' not in st.session_state:
+     st.session_state.temp_base_dir = None
+
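
The guard pattern above matters because Streamlit re-executes the whole script on every widget interaction; `st.session_state` is the per-browser-session store that survives those reruns. A small self-contained sketch of the same pattern (the `counter` key is just for illustration):

    import streamlit as st

    # Initialize once; the value survives the reruns Streamlit performs on each interaction.
    if "counter" not in st.session_state:
        st.session_state.counter = 0

    if st.button("Increment"):
        st.session_state.counter += 1

    st.write(f"Counter: {st.session_state.counter}")
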
+ # --- Utility ---
+ def get_session_temp_dir():
+     if st.session_state.temp_base_dir and os.path.exists(st.session_state.temp_base_dir):
+         return st.session_state.temp_base_dir
+
+     # Define a base directory for all temporary files for this session run.
+     # This helps in cleaning up everything related to one generation run.
+     base_dir = os.path.join(tempfile.gettempdir(), f"pov_generator_run_{st.session_state.run_id}")
+     os.makedirs(base_dir, exist_ok=True)
+     st.session_state.temp_base_dir = base_dir
+     return base_dir
+
+ def cleanup_temp_files(specific_dir=None):
+     """Cleans up temporary files."""
+     path_to_clean = specific_dir or st.session_state.get("temp_base_dir")
+     if path_to_clean and os.path.exists(path_to_clean):
+         try:
+             shutil.rmtree(path_to_clean)
+             if specific_dir is None:  # Only reset if cleaning the main session temp dir
+                 st.session_state.temp_base_dir = None
+             print(f"Cleaned up temp directory: {path_to_clean}")
+         except Exception as e:
+             print(f"Error cleaning up temp directory {path_to_clean}: {e}")
+     # Individual files stored outside temp_base_dir would need separate cleanup,
+     # but in this improved version all temp files live within temp_base_dir.
 
+ def clear_torch_cache():
      gc.collect()
      if torch.cuda.is_available():
          torch.cuda.empty_cache()
 
+ # --- Model Loading (Cached) ---
+ @st.cache_resource
+ def load_llm_model_and_tokenizer(model_id):
+     tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=CACHE_DIR)
      model = AutoModelForCausalLM.from_pretrained(
+         model_id,
          torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
          device_map="auto",
          cache_dir=CACHE_DIR
      )
+     if tokenizer.pad_token_id is None:  # GPT-2 might not have a pad token by default
+         tokenizer.pad_token = tokenizer.eos_token
+         model.config.pad_token_id = model.config.eos_token_id
+     return model, tokenizer
 
+ @st.cache_resource
+ def load_sd_pipeline(model_id):
+     pipe = StableDiffusionPipeline.from_pretrained(
+         model_id,
+         torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+         cache_dir=CACHE_DIR
      )
+     if torch.cuda.is_available():
+         pipe = pipe.to("cuda")
+     return pipe
 
+ @st.cache_resource
+ def load_tts_model_and_tokenizers(model_id):
+     tts_model = ParlerTTSForConditionalGeneration.from_pretrained(
+         model_id,
          torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+         device_map="auto",
          cache_dir=CACHE_DIR
      )
+     prompt_tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=CACHE_DIR)
+     # Ensure the text_encoder config attribute is accessed correctly across versions
+     desc_tokenizer_path = tts_model.config.text_encoder.name_or_path if hasattr(tts_model.config.text_encoder, 'name_or_path') else tts_model.config.text_encoder._name_or_path
+     desc_tokenizer = AutoTokenizer.from_pretrained(desc_tokenizer_path, cache_dir=CACHE_DIR)
+     return tts_model, prompt_tokenizer, desc_tokenizer
 
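
The `@st.cache_resource` decorator on the three loaders above memoizes their return values per argument tuple, so each model is loaded once per process rather than on every rerun; a later call with the same `model_id` returns the very same cached object. A minimal sketch with a hypothetical loader:

    import streamlit as st

    @st.cache_resource
    def load_expensive_resource(name: str):
        # Runs only on the first call per distinct `name`;
        # later calls with the same argument return the cached object.
        print(f"Loading {name}...")
        return {"name": name}

    r1 = load_expensive_resource("demo")  # prints "Loading demo..."
    r2 = load_expensive_resource("demo")  # served from cache; r2 is r1
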
+ # --- Step 1: Generate JSON Story ---
+ def generate_story(prompt: str, num_scenes: int):
+     model, tokenizer = load_llm_model_and_tokenizer(LLM_MODEL_ID)
+
+     # Refined prompt for better scene separation and count
+     story_prompt = (
+         f"Generate a compelling short POV story based on the following prompt: '{prompt}'. "
+         f"The story should consist of exactly {num_scenes} distinct scenes. "
+         f"Clearly separate each scene with the delimiter '###'. "
+         f"Do not include any introductory or concluding text outside of the scenes and their separators. "
+         f"Each scene should be a paragraph of 2-4 sentences."
+     )
+
+     input_ids = tokenizer.encode(story_prompt, return_tensors="pt").to(model.device)
+
+     # Calculate max_new_tokens, ensuring it doesn't exceed model capacity.
+     # GPT-2 models (including gpt2-medium) have a 1024-token context window;
+     # model.config.n_positions is not always present, so fall back to that common value.
+     max_model_tokens = getattr(model.config, 'n_positions', 1024)
+     max_possible_new_tokens = max_model_tokens - input_ids.shape[1] - 20  # Safety buffer
+
+     desired_tokens_per_scene = 75  # Avg tokens per scene
+     desired_total_tokens = num_scenes * desired_tokens_per_scene
+
+     # Cap generated tokens to prevent overly long outputs and stay within model limits
+     max_new_tokens_val = min(desired_total_tokens, 700, max_possible_new_tokens)
+
+     if max_new_tokens_val <= 0:
+         st.error("Prompt is too long: no room is left in the context window for new tokens.")
+         return None
+
+     output = model.generate(
+         input_ids,
+         max_new_tokens=max_new_tokens_val,
+         do_sample=True,
+         temperature=0.7,
+         top_k=50,
+         pad_token_id=tokenizer.eos_token_id
+     )
+     full_result = tokenizer.decode(output[0], skip_special_tokens=True)
+
+     # Remove the input prompt from the beginning of the result
+     if full_result.startswith(story_prompt):
+         generated_text = full_result[len(story_prompt):].strip()
+     else:
+         # Fallback: sometimes the model does not echo the input verbatim. This is
+         # fragile, and good prompt engineering is key; if unsure, process the whole
+         # output and let the '###' split handle any prompt remnants.
+         generated_text = full_result
 
+     scenes_raw = generated_text.split("###")
+     processed_scenes = []
+     for s in scenes_raw:
+         s_clean = s.strip()
+         if s_clean:  # Skip empty scenes
+             processed_scenes.append(s_clean)
+
+     final_scenes = processed_scenes
+     # If more scenes than requested, take the first N. If fewer, use what's available.
+     if len(final_scenes) > num_scenes:
+         final_scenes = final_scenes[:num_scenes]
+         st.warning(f"LLM generated more scenes than requested. Using the first {num_scenes}.")
+     elif len(final_scenes) < num_scenes:
+         st.warning(f"LLM generated {len(final_scenes)} scenes, but {num_scenes} were requested. Using available scenes.")
+
+     if not final_scenes:
+         st.error("Failed to parse scenes from LLM output. The output was: " + generated_text)
+         return None
+
+     clear_torch_cache()
+     return {"title": prompt[:60].capitalize(), "scenes": final_scenes}
+
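
Since gpt2-medium cannot reliably emit valid JSON, the function above asks for '###'-delimited plain text and splits on the delimiter instead. The parsing core, extracted as a pure function to show the expected behavior (a hypothetical helper, not part of the file):

    def split_scenes(generated_text: str, num_scenes: int) -> list[str]:
        # Split on the delimiter, drop empty fragments, cap at the requested count.
        scenes = [s.strip() for s in generated_text.split("###") if s.strip()]
        return scenes[:num_scenes]

    # Example:
    text = "You wake up. ### The corridor hums. ### A door opens. ###"
    assert split_scenes(text, 2) == ["You wake up.", "The corridor hums."]
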
+ # --- Step 2: Generate Images ---
+ def generate_images_for_scenes(scenes):
+     pipe = load_sd_pipeline(IMG_MODEL_ID)
+     pil_images = []
+
+     # Create a directory for storing frame images for the video
+     frames_dir = os.path.join(get_session_temp_dir(), "frames_for_video")
+     os.makedirs(frames_dir, exist_ok=True)
+     image_paths_for_video = []
+
+     cols = st.columns(3)  # Adjust number of columns as preferred
+     col_idx = 0
+
+     for i, scene_text in enumerate(scenes):
+         with st.spinner(f"Generating image for scene {i+1}..."):
+             try:
+                 # Add a style modifier for better visual appeal; can be user-configurable
+                 styled_prompt = f"{scene_text}, cinematic lighting, detailed, high quality"
+                 image = pipe(styled_prompt, num_inference_steps=30).images[0]  # Reduced steps for speed
+                 pil_images.append(image)
+
+                 # Save image for video creation
+                 img_path = os.path.join(frames_dir, f"frame_{i:03d}.png")
+                 image.save(img_path)
+                 image_paths_for_video.append(img_path)
+
+                 with cols[col_idx % len(cols)]:
+                     st.image(image, caption=f"Scene {i+1}: {scene_text[:100]}...")
+
+                     # Download button for individual image
+                     img_byte_arr = BytesIO()
+                     image.save(img_byte_arr, format='PNG')
+                     st.download_button(
+                         label=f"Download Scene {i+1} Image",
+                         data=img_byte_arr.getvalue(),
+                         file_name=f"scene_{i+1}_image.png",
+                         mime="image/png",
+                         key=f"download_img_{i}"
+                     )
+                 col_idx += 1
+             except Exception as e:
+                 st.error(f"Error generating image for scene {i+1}: {e}")
+                 pil_images.append(None)  # Placeholder for failed image
+                 image_paths_for_video.append(None)  # Placeholder
+
+     clear_torch_cache()
+     return pil_images, image_paths_for_video
+
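
On a free-tier Space with little VRAM, the pipeline call above can be tightened further with diffusers' standard memory savers. A hedged standalone sketch, assuming a recent diffusers version (the prompt string is illustrative):

    import torch
    from diffusers import StableDiffusionPipeline

    pipe = StableDiffusionPipeline.from_pretrained(
        "CompVis/stable-diffusion-v1-4",
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    )
    pipe.enable_attention_slicing()  # lowers peak VRAM at a small speed cost
    if torch.cuda.is_available():
        pipe = pipe.to("cuda")

    image = pipe("a quiet red desert at dawn", num_inference_steps=30).images[0]
    image.save("scene.png")
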
+ # --- Step 3: Generate TTS ---
+ def generate_audios_for_scenes(scenes):
+     tts_model, prompt_tokenizer, desc_tokenizer = load_tts_model_and_tokenizers(TTS_MODEL_ID)
+
+     audio_dir = os.path.join(get_session_temp_dir(), "audio_files")
+     os.makedirs(audio_dir, exist_ok=True)
      audio_paths = []
+
+     cols = st.columns(3)  # Adjust number of columns
+     col_idx = 0
+
+     # User-configurable description, or keep it fixed
+     tts_description = "A neutral and clear narrator voice."
+
+     for i, scene_text in enumerate(scenes):
+         with st.spinner(f"Generating audio for scene {i+1}..."):
+             try:
+                 desc_ids = desc_tokenizer(tts_description, return_tensors="pt").input_ids.to(tts_model.device)
+                 prompt_ids = prompt_tokenizer(scene_text, return_tensors="pt").input_ids.to(tts_model.device)
+
+                 # Generate audio. For parler-tts, extra generation kwargs
+                 # (e.g. temperature for the description) can be passed here if desired.
+                 generation_output = tts_model.generate(input_ids=desc_ids, prompt_input_ids=prompt_ids)
+
+                 audio_waveform = generation_output.to(torch.float32).cpu().numpy().squeeze()
+
+                 file_path = os.path.join(audio_dir, f"audio_scene_{i+1}.wav")
+                 sf.write(file_path, audio_waveform, tts_model.config.sampling_rate)  # Use the model's sampling rate
+                 audio_paths.append(file_path)
+
+                 with cols[col_idx % len(cols)]:
+                     st.markdown(f"**Audio for Scene {i+1}**")
+                     st.audio(file_path)
+                     with open(file_path, "rb") as f_audio:
+                         st.download_button(
+                             label=f"Download Scene {i+1} Audio",
+                             data=f_audio.read(),  # Read bytes for download
+                             file_name=f"scene_{i+1}_audio.wav",
+                             mime="audio/wav",
+                             key=f"download_audio_{i}"
+                         )
+                 col_idx += 1
+             except Exception as e:
+                 st.error(f"Error generating audio for scene {i+1}: {e}")
+                 audio_paths.append(None)  # Placeholder
+
+     clear_torch_cache()
      return audio_paths
 
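
One detail worth noting about the generate call above: Parler-TTS returns a batched tensor, while soundfile expects a 1-D array for mono output, hence the squeeze before `sf.write`. A standalone sketch following the pattern in the parler-tts README (the voice description string is illustrative):

    import soundfile as sf
    from parler_tts import ParlerTTSForConditionalGeneration
    from transformers import AutoTokenizer

    model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-v1.1")
    tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1.1")

    desc_ids = tokenizer("A neutral and clear narrator voice.", return_tensors="pt").input_ids
    prompt_ids = tokenizer("You wake up on Mars.", return_tensors="pt").input_ids

    audio = model.generate(input_ids=desc_ids, prompt_input_ids=prompt_ids)
    sf.write("scene.wav", audio.cpu().numpy().squeeze(), model.config.sampling_rate)
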
  # --- Step 4: Create Video ---
+ def create_video_from_scenes(image_file_paths, audio_file_paths, output_filename="final_pov_video.mp4"):
+     if not image_file_paths or not audio_file_paths or len(image_file_paths) != len(audio_file_paths):
+         st.error("Mismatch in the number of images and audio files, or missing assets. Cannot create video.")
+         return None
+
+     # Ensure ffmpeg is installed and accessible
+     try:
+         subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
+     except (subprocess.CalledProcessError, FileNotFoundError):
+         st.error("FFmpeg is not installed or not found in PATH. Video creation is not possible.")
+         st.markdown("Please install FFmpeg: `sudo apt update && sudo apt install ffmpeg` (Linux) or `brew install ffmpeg` (macOS).")
+         return None
+
+     temp_clips_dir = os.path.join(get_session_temp_dir(), "temp_video_clips")
+     os.makedirs(temp_clips_dir, exist_ok=True)
+
+     video_clips_paths = []
+     valid_scene_count = 0
+
+     for i, (img_path, audio_path) in enumerate(zip(image_file_paths, audio_file_paths)):
+         if img_path is None or audio_path is None:
+             st.warning(f"Skipping scene {i+1} in video due to missing image or audio.")
+             continue
+
+         try:
+             audio_info = sf.info(audio_path)
+             audio_duration = audio_info.duration
+             if audio_duration <= 0.1:  # Minimum practical duration
+                 st.warning(f"Audio for scene {i+1} is too short ({audio_duration:.2f}s). Using a minimum duration of 1s.")
+                 audio_duration = 1.0  # Enforce a minimum duration
+
+             clip_path = os.path.join(temp_clips_dir, f"clip_{i:03d}.mp4")
+
+             # Create an individual clip: loop the image, add audio, set duration to the audio length
+             command = [
+                 "ffmpeg", "-y",
+                 "-loop", "1", "-i", img_path,  # Loop the image
+                 "-i", audio_path,  # Input audio
+                 "-c:v", "libx264", "-preset", "medium", "-tune", "stillimage",
+                 "-c:a", "aac", "-b:a", "192k",
+                 "-pix_fmt", "yuv420p",
+                 "-t", str(audio_duration),  # Duration of this clip
+                 "-shortest",  # End when the shortest input (audio) ends
+                 clip_path
+             ]
+             process = subprocess.run(command, capture_output=True, text=True)
+             if process.returncode != 0:
+                 st.error(f"FFmpeg error creating clip for scene {i+1}:\n{process.stderr}")
+                 continue  # Skip this clip
+             video_clips_paths.append(clip_path)
+             valid_scene_count += 1
+         except Exception as e:
+             st.error(f"Error processing scene {i+1} for video: {e}")
+             continue
+
+     if not video_clips_paths or valid_scene_count == 0:
+         st.error("No valid video clips were generated. Cannot create final video.")
+         cleanup_temp_files(temp_clips_dir)  # Clean up partial clips
+         return None
+
+     # Create a file list for the ffmpeg concat demuxer
+     concat_list_file = os.path.join(temp_clips_dir, "concat_list.txt")
+     with open(concat_list_file, "w") as f:
+         for clip_p in video_clips_paths:
+             # Paths in the concat file can be relative or absolute; absolute paths are
+             # safer when concat_list.txt lives in a different directory than the clips,
+             # but since they share a directory here, relative names are fine.
+             f.write(f"file '{os.path.basename(clip_p)}'\n")
+
+     final_video_path = os.path.join(get_session_temp_dir(), output_filename)
+     concat_command = [
+         "ffmpeg", "-y",
+         "-f", "concat", "-safe", "0", "-i", concat_list_file,
+         "-c", "copy",  # Re-mux; don't re-encode, since the clips share codecs
+         final_video_path
      ]
+     process = subprocess.run(concat_command, capture_output=True, text=True, cwd=temp_clips_dir)  # Run from the clips dir
+     if process.returncode != 0:
+         st.error(f"FFmpeg error concatenating video clips:\n{process.stderr}")
+         cleanup_temp_files(temp_clips_dir)  # Clean up partial clips
+         return None
+
+     st.success("Video created successfully!")
+     # Intermediate clips could be cleaned up here, but it is better to clean
+     # everything up at session end or via the sidebar button.
+     return final_video_path
+
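
The two-stage FFmpeg strategy above (one still-image clip per scene, then a lossless join) relies on the concat demuxer, which re-muxes without re-encoding when all inputs share the same codecs and parameters. The list file it consumes is simply one `file` directive per line; a sketch with hypothetical clip names:

    # concat_list.txt, as consumed by:
    #   ffmpeg -f concat -safe 0 -i concat_list.txt -c copy out.mp4
    clips = ["clip_000.mp4", "clip_001.mp4", "clip_002.mp4"]
    with open("concat_list.txt", "w") as f:
        for clip in clips:
            f.write(f"file '{clip}'\n")
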
+ # --- Main App UI ---
+ st.title("⚡ POV Story Generator Pro")
+ st.markdown("Create engaging POV stories with AI-generated text, images, audio, and video.")
+ st.markdown("---")
+
+ # Sidebar for inputs
+ with st.sidebar:
+     st.header("📝 Story Configuration")
+     prompt = st.text_area(
+         "Enter your POV story prompt:",
+         st.session_state.get("user_prompt", "POV: You are a detective solving a mystery in a futuristic city."),
+         height=100,
+         key="user_prompt_input"
+     )
+     num_scenes = st.slider("Number of Scenes:", min_value=2, max_value=10, value=st.session_state.get("num_scenes_val", 3), key="num_scenes_slider")
+
+     st.markdown("---")
+     if st.button("🚀 Generate Full Story & Assets", type="primary", use_container_width=True):
+         # Reset state for a new generation run
+         st.session_state.run_id = datetime.now().strftime("%Y%m%d_%H%M%S")  # New unique ID for this run
+         cleanup_temp_files()  # Clean up any previous run's temp files
+
+         st.session_state.story_data = None
+         st.session_state.pil_images = None
+         st.session_state.image_paths_for_video = None
+         st.session_state.audio_paths = None
+         st.session_state.video_path = None
+
+         st.session_state.user_prompt = prompt  # Save current input values
+         st.session_state.num_scenes_val = num_scenes
+
+         # Set the generation trigger flag (direct execution would also work)
+         st.session_state.generate_all = True
+
+     st.markdown("---")
+     st.header("🛠️ Utilities")
+     if st.button("🧹 Clear Cache & Temp Files & Restart", use_container_width=True):
+         # Clear model caches
+         st.cache_resource.clear()
+         # Clear session state related to generated artifacts
+         keys_to_clear = ['story_data', 'pil_images', 'image_paths_for_video',
+                          'audio_paths', 'video_path', 'temp_base_dir', 'generate_all']
+         for key in keys_to_clear:
+             if key in st.session_state:
+                 del st.session_state[key]
+         cleanup_temp_files()  # Ensure physical temp files are deleted
+         st.session_state.run_id = datetime.now().strftime("%Y%m%d_%H%M%S")  # New ID after clearing
+         st.success("Caches and temporary files cleared. The app will restart.")
+         st.rerun()
+
+ # Main content area
+ if st.session_state.get("generate_all"):
+     # --- 1. Generate Story ---
+     with st.status("🧠 Generating story...", expanded=True) as status_story:
+         try:
+             st.session_state.story_data = generate_story(st.session_state.user_prompt, st.session_state.num_scenes_val)
+             if st.session_state.story_data:
+                 status_story.update(label="Story generated successfully!", state="complete")
+             else:
+                 status_story.update(label="Story generation failed.", state="error")
+                 st.session_state.generate_all = False  # Stop further processing
+         except Exception as e:
+             st.error(f"An unexpected error occurred during story generation: {e}")
+             status_story.update(label="Story generation error.", state="error")
+             st.session_state.generate_all = False
+
+     # --- Display Story ---
+     if st.session_state.story_data:
+         st.subheader(f"🎬 Story: {st.session_state.story_data['title']}")
+         for i, scene_text in enumerate(st.session_state.story_data['scenes']):
+             st.markdown(f"**Scene {i+1}:** {scene_text}")
+
+         story_json = json.dumps(st.session_state.story_data, indent=2)
+         st.download_button(
+             label="Download Story (JSON)",
+             data=story_json,
+             file_name=f"{st.session_state.story_data['title'].replace(' ', '_').lower()}_story.json",
+             mime="application/json"
+         )
+         st.markdown("---")
+
+     # --- 2. Generate Images (if the story succeeded) ---
+     if st.session_state.get("generate_all") and st.session_state.story_data:
+         with st.status("🎨 Generating images for scenes...", expanded=True) as status_images:
+             try:
+                 st.session_state.pil_images, st.session_state.image_paths_for_video = generate_images_for_scenes(st.session_state.story_data['scenes'])
+                 if all(img is not None for img in st.session_state.pil_images):  # Basic check
+                     status_images.update(label="Images generated successfully!", state="complete")
+                 elif any(img is not None for img in st.session_state.pil_images):
+                     status_images.update(label="Some images generated. Check for errors.", state="warning")
+                 else:
+                     status_images.update(label="Image generation failed for all scenes.", state="error")
+                     st.session_state.generate_all = False  # Stop further processing
+             except Exception as e:
+                 st.error(f"An unexpected error occurred during image generation: {e}")
+                 status_images.update(label="Image generation error.", state="error")
+                 st.session_state.generate_all = False
+         st.markdown("---")
+
+     # --- 3. Generate Audio (if images succeeded, at least partially) ---
+     if st.session_state.get("generate_all") and st.session_state.story_data and st.session_state.pil_images:
+         with st.status("🔊 Generating audio for scenes...", expanded=True) as status_audio:
+             try:
+                 st.session_state.audio_paths = generate_audios_for_scenes(st.session_state.story_data['scenes'])
+                 if all(p is not None for p in st.session_state.audio_paths):  # Basic check
+                     status_audio.update(label="Audio generated successfully!", state="complete")
+                 elif any(p is not None for p in st.session_state.audio_paths):
+                     status_audio.update(label="Some audio files generated. Check for errors.", state="warning")
+                 else:
+                     status_audio.update(label="Audio generation failed for all scenes.", state="error")
+                     st.session_state.generate_all = False  # Stop further processing
+             except Exception as e:
+                 st.error(f"An unexpected error occurred during audio generation: {e}")
+                 status_audio.update(label="Audio generation error.", state="error")
+                 st.session_state.generate_all = False
+         st.markdown("---")
+
+     # --- 4. Create Video (if audio succeeded, at least partially) ---
+     if st.session_state.get("generate_all") and st.session_state.image_paths_for_video and st.session_state.audio_paths:
+         # Ensure there's at least one valid pair of image and audio
+         valid_assets = sum(1 for img, aud in zip(st.session_state.image_paths_for_video, st.session_state.audio_paths) if img and aud)
+         if valid_assets > 0:
+             with st.status("📹 Creating final video...", expanded=True) as status_video:
+                 try:
+                     st.session_state.video_path = create_video_from_scenes(
+                         st.session_state.image_paths_for_video,
+                         st.session_state.audio_paths
+                     )
+                     if st.session_state.video_path:
+                         status_video.update(label="Video created successfully!", state="complete")
+                     else:
+                         status_video.update(label="Video creation failed.", state="error")
+                 except Exception as e:
+                     st.error(f"An unexpected error occurred during video creation: {e}")
+                     status_video.update(label="Video creation error.", state="error")
+
+             if st.session_state.video_path:
+                 st.subheader("🎞️ Final Video Presentation")
+                 st.video(st.session_state.video_path)
+                 with open(st.session_state.video_path, "rb") as f_video:
+                     st.download_button(
+                         label="Download Final Video",
+                         data=f_video.read(),
+                         file_name=os.path.basename(st.session_state.video_path),
+                         mime="video/mp4"
+                     )
+                 st.markdown("---")
+         else:
+             st.warning("Not enough valid image/audio pairs to create a video.")
+
+     # Reset the generation trigger
+     if "generate_all" in st.session_state:  # Check the key exists before deleting
+         del st.session_state.generate_all
+
+ elif not st.session_state.get("user_prompt"):  # Show an initial message if no prompt yet
+     st.info("Configure your story in the sidebar and click 'Generate Full Story & Assets' to begin!")
+
+ # --- Final Cleanup (optional: could be tied to session end if the platform supports it) ---
+ # For Streamlit, manual cleanup via a button or at the start of a new run is common;
+ # cleanup_temp_files() is called at the start of each generation.
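
To try the updated file locally, it presumably needs streamlit, torch, transformers, diffusers, soundfile, and the parler-tts package installed, plus an FFmpeg binary on PATH; the entry point would then be:

    streamlit run src/app_hf_space_optimized.py
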