Spaces:

Yuxihenry
/

SpatialTrackerV2

Running on Zero

App Files Files Community

xiaoyuxi committed on Jun 24, 2025

Commit

09a6aa8

1 Parent(s): dc6fad3

backend

Browse files

Files changed (1) hide show

app.py +703 -992

app.py CHANGED Viewed

@@ -1,161 +1,412 @@
 import gradio as gr
 import os
-import json
-import numpy as np
 import cv2
 import base64
-import requests
-import time
-from typing import List, Tuple
-# Backend Space URL - replace with your actual backend space URL
-BACKEND_SPACE_URL = "Yuxihenry/SpatialTrackerV2_Backend"  # Replace with actual backend space URL
-hf_token = os.getenv("HF_TOKEN")  # Replace with your actual Hugging Face token
-# Debug information
-print(f"🔧 Environment Debug Info:")
-print(f"   - Backend URL: {BACKEND_SPACE_URL}")
-print(f"   - HF Token available: {'Yes' if hf_token else 'No'}")
-print(f"   - HF Token length: {len(hf_token) if hf_token else 0}")
-# Flag to track if backend is available
-BACKEND_AVAILABLE = False
-backend_client = None
-def check_user_permissions():
-    """Check if user has necessary permissions"""
-    print("🔐 Checking user permissions...")
-    if not hf_token:
-        print("❌ No HF Token found")
-        print("🔧 To get a token:")
-        print("   1. Go to https://huggingface.co/settings/tokens")
-        print("   2. Create a new token with 'read' permissions")
-        print("   3. Set it as environment variable: export HF_TOKEN='your_token'")
-        return False
-    # Try to access user info
     try:
-        headers = {'Authorization': f'Bearer {hf_token}'}
-        response = requests.get('https://huggingface.co/api/whoami', headers=headers, timeout=5)
-        if response.status_code == 200:
-            user_info = response.json()
-            username = user_info.get('name', 'Unknown')
-            print(f"✅ Authenticated as: {username}")
-            # Check if user has access to the specific space
-            space_url = f"https://huggingface.co/api/spaces/{BACKEND_SPACE_URL}"
-            space_response = requests.get(space_url, headers=headers, timeout=5)
-            if space_response.status_code == 200:
-                print("✅ You have access to the backend Space")
-                return True
-            elif space_response.status_code == 401:
-                print("❌ You don't have access to the backend Space")
-                print("🔧 Solutions:")
-                print("   1. Contact the Space owner to add you as collaborator")
-                print("   2. Ask the owner to make the Space public")
-                return False
-            elif space_response.status_code == 404:
-                print("❌ Backend Space not found")
-                print("🔧 Please check if the Space URL is correct")
-                return False
-            else:
-                print(f"⚠️  Unexpected response checking Space access: {space_response.status_code}")
-                return False
-        else:
-            print(f"❌ Token validation failed: {response.status_code}")
-            print("🔧 Your token might be invalid or expired")
-            return False
     except Exception as e:
-        print(f"❌ Error checking permissions: {e}")
         return False
-def check_backend_space_status():
-    """Check if backend space is running via HTTP request"""
     try:
-        backend_url = f"https://huggingface.co/spaces/{BACKEND_SPACE_URL}"
-        print(f"🔍 Checking backend space status: {backend_url}")
-        # Prepare headers with authentication if token is available
-        headers = {}
-        if hf_token:
-            headers['Authorization'] = f'Bearer {hf_token}'
-            print(f"🔐 Using HF Token for authentication")
-        # Try to access the space page
-        response = requests.get(backend_url, headers=headers, timeout=10)
-        if response.status_code == 200:
-            print("✅ Backend space page is accessible")
-            # Check if space is running (look for common indicators)
-            page_content = response.text.lower()
-            if "runtime error" in page_content:
-                print("❌ Backend space has runtime error")
-                return False
-            elif "building" in page_content:
-                print("🔄 Backend space is building...")
-                return False
-            elif "sleeping" in page_content:
-                print("😴 Backend space is sleeping")
-                return False
-            else:
-                print("✅ Backend space appears to be running")
-                return True
-        elif response.status_code == 401:
-            print("❌ Authentication failed (HTTP 401)")
-            print("🔧 This means:")
-            print("   - The backend Space is private")
-            print("   - Your HF Token doesn't have access to this Space")
-            print("   - You need to be added as a collaborator to the Space")
-            print("   - Or the Space owner needs to make it public")
-            return False
-        elif response.status_code == 404:
-            print("❌ Backend space not found (HTTP 404)")
-            print("🔧 Please check if the Space URL is correct:")
-            print(f"   Current URL: {BACKEND_SPACE_URL}")
-            return False
-        else:
-            print(f"❌ Backend space not accessible (HTTP {response.status_code})")
-            print(f"🔧 Response: {response.text[:200]}...")
-            return False
-    except requests.RequestException as e:
-        print(f"❌ Failed to check backend space status: {e}")
-        return False
     except Exception as e:
-        print(f"❌ Unexpected error checking backend: {e}")
-        return False
-def initialize_backend():
-    """Initialize backend connection using gradio_client"""
-    global backend_client, BACKEND_AVAILABLE
-    try:
-        from gradio_client import Client
-        # Connect to HF Space
-        if hf_token:
-            backend_client = Client(BACKEND_SPACE_URL, hf_token=hf_token)
-        else:
-            backend_client = Client(BACKEND_SPACE_URL)
-        # Test the connection
-        backend_client.view_api()
-        BACKEND_AVAILABLE = True
-        return True
-    except Exception as e:
-        print(f"❌ Backend connection failed: {e}")
-        BACKEND_AVAILABLE = False
-        return False
 def numpy_to_base64(arr):
     """Convert numpy array to base64 string"""
@@ -165,907 +416,367 @@ def base64_to_numpy(b64_str, shape, dtype):
     """Convert base64 string back to numpy array"""
     return np.frombuffer(base64.b64decode(b64_str), dtype=dtype).reshape(shape)
-def base64_to_image(b64_str):
-    """Convert base64 string to numpy image array"""
-    if not b64_str:
-        return None
-    try:
-        # Decode base64 to bytes
-        img_bytes = base64.b64decode(b64_str)
-        # Convert bytes to numpy array
-        nparr = np.frombuffer(img_bytes, np.uint8)
-        # Decode image
-        img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
-        # Convert BGR to RGB
-        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
-        return img
-    except Exception as e:
-        print(f"Error converting base64 to image: {e}")
-        return None
 def get_video_name(video_path):
     """Extract video name without extension"""
     return os.path.splitext(os.path.basename(video_path))[0]
-def extract_first_frame(video_path):
-    """Extract first frame from video file"""
     try:
-        cap = cv2.VideoCapture(video_path)
-        ret, frame = cap.read()
-        cap.release()
-        if ret:
-            # Convert BGR to RGB
-            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            return frame_rgb
-        else:
-            return None
-    except Exception as e:
-        print(f"Error extracting first frame: {e}")
-        return None
-def handle_video_upload(video):
-    """Handle video upload and extract first frame"""
-    if video is None:
-        return None, None, [], 50, 756, 3
-    try:
-        if BACKEND_AVAILABLE and backend_client:
-            # Try to use backend API
-            try:
-                print("🔧 Calling backend API for video upload...")
-                # Call the unified API with upload_video function type
-                result = backend_client.predict(
-                    "upload_video",  # function_type
-                    video,           # video file
-                    "",              # original_image_state (not used for upload)
-                    [],              # selected_points (not used for upload)
-                    "positive_point", # point_type (not used for upload)
-                    0,               # point_x (not used for upload)
-                    0,               # point_y (not used for upload)
-                    50,              # grid_size (not used for upload)
-                    756,             # vo_points (not used for upload)
-                    3,               # fps (not used for upload)
-                    api_name="/unified_api"
-                )
-                print(f"✅ Backend video upload API call successful!")
-                print(f"🔧 Result type: {type(result)}")
-                print(f"🔧 Result: {result}")
-                # Parse the result - expect a dict with success status
-                if isinstance(result, dict) and result.get("success"):
-                    # Extract data from backend response
-                    original_image_state = result.get("original_image_state", "")
-                    display_image = result.get("display_image", None)
-                    selected_points = result.get("selected_points", [])
-                    # Get video settings based on video name
-                    video_name = get_video_name(video)
-                    grid_size_val, vo_points_val, fps_val = get_video_settings(video_name)
-                    return original_image_state, display_image, selected_points, grid_size_val, vo_points_val, fps_val
-                else:
-                    print("Backend processing failed, using local fallback")
-                    # Fallback to local processing
-                    pass
-            except Exception as e:
-                print(f"Backend API call failed: {e}")
-                # Fallback to local processing
-                pass
-        # Fallback: local processing
-        print("Using local video processing...")
-        display_image = extract_first_frame(video)
-        # Create a simple state representation
-        original_image_state = json.dumps({
-            "video_path": video,
-            "frame": "local_processing"
-        })
-        # Get video settings
-        video_name = get_video_name(video)
-        grid_size_val, vo_points_val, fps_val = get_video_settings(video_name)
-        return original_image_state, display_image, [], grid_size_val, vo_points_val, fps_val
     except Exception as e:
-        print(f"Error in handle_video_upload: {e}")
-        return None, None, [], 50, 756, 3
-def select_point(original_img: str, sel_pix: list, point_type: str, evt: gr.SelectData):
-    """Handle point selection for SAM"""
-    if original_img is None:
-        return None, []
     try:
-        if BACKEND_AVAILABLE and backend_client:
-            # Try to use backend API
-            try:
-                print(f"🔧 Calling backend select point API: x={evt.index[0]}, y={evt.index[1]}, type={point_type}")
-                # Call the unified API with select_point function type
-                result = backend_client.predict(
-                    "select_point",  # function_type
-                    None,            # video file (not used for select_point)
-                    original_img,    # original_image_state
-                    sel_pix,         # selected_points
-                    point_type,      # point_type
-                    evt.index[0],    # point_x
-                    evt.index[1],    # point_y
-                    50,              # grid_size (not used for select_point)
-                    756,             # vo_points (not used for select_point)
-                    3,               # fps (not used for select_point)
-                    api_name="/unified_api"
-                )
-                print(f"✅ Backend select point API call successful!")
-                print(f"🔧 Result type: {type(result)}")
-                print(f"🔧 Result: {result}")
-                # Parse the result - expect a dict with success status
-                if isinstance(result, dict) and result.get("success"):
-                    display_image = result.get("display_image", None)
-                    new_sel_pix = result.get("selected_points", sel_pix)
-                    return display_image, new_sel_pix
-                else:
-                    print("Backend processing failed, using local fallback")
-                    # Fallback to local processing
-                    pass
-            except Exception as e:
-                print(f"Backend API call failed: {e}")
-                # Check for specific gradio_client errors
-                if "AppError" in str(type(e)):
-                    print("🔧 Backend Space has internal errors (AppError)")
-                    print("🔧 The backend Space code has bugs or configuration issues")
-                    print("🔧 Contact the Space owner to fix the backend implementation")
-                elif "Could not fetch config" in str(e):
-                    print("🔧 Config fetch failed - possible Gradio version mismatch")
-                    print("🔧 Frontend and backend may be using incompatible Gradio versions")
-                elif "timeout" in str(e).lower():
-                    print("🔧 Backend request timed out - Space might be overloaded")
-                else:
-                    print(f"🔧 Unexpected error type: {type(e).__name__}")
-                print("🔄 Showing error message instead of visualization...")
-                # Fallback to local processing
-                pass
-        # Fallback: local processing with improved visualization
-        print("Using local point selection with enhanced visualization...")
-        # Parse original image state
-        try:
-            state_data = json.loads(original_img)
-            video_path = state_data.get("video_path")
-        except:
-            video_path = None
-        if video_path:
-            # Re-extract frame and add point with mask visualization
-            display_image = extract_first_frame(video_path)
-            if display_image is not None:
-                # Add point to the image with enhanced visualization
-                x, y = evt.index[0], evt.index[1]
-                color = (0, 255, 0) if point_type == 'positive_point' else (255, 0, 0)
-                # Draw a larger, more visible point
-                cv2.circle(display_image, (x, y), 8, color, -1)
-                cv2.circle(display_image, (x, y), 12, (255, 255, 255), 2)
-                # Add point to selected points list
-                new_sel_pix = sel_pix.copy()
-                new_sel_pix.append([x, y, point_type])
-                return display_image, new_sel_pix
-        return None, []
     except Exception as e:
-        print(f"Error in select_point: {e}")
-        return None, []
-def reset_points(original_img: str, sel_pix):
-    """Reset points and restore original image"""
-    if original_img is None:
-        return None, []
     try:
-        if BACKEND_AVAILABLE and backend_client:
-            # Try to use backend API
-            try:
-                print("🔧 Calling backend reset points API...")
-                # Call the unified API with reset_points function type
-                result = backend_client.predict(
-                    "reset_points",  # function_type
-                    None,            # video file (not used for reset_points)
-                    original_img,    # original_image_state
-                    sel_pix,         # selected_points
-                    "positive_point", # point_type (not used for reset_points)
-                    0,               # point_x (not used for reset_points)
-                    0,               # point_y (not used for reset_points)
-                    50,              # grid_size (not used for reset_points)
-                    756,             # vo_points (not used for reset_points)
-                    3,               # fps (not used for reset_points)
-                    api_name="/unified_api"
-                )
-                print(f"✅ Backend reset points API call successful!")
-                print(f"🔧 Result: {result}")
-                # Parse the result
-                if isinstance(result, dict) and result.get("success"):
-                    display_image = result.get("display_image", None)
-                    new_sel_pix = result.get("selected_points", [])
-                    return display_image, new_sel_pix
-                else:
-                    print("Backend processing failed, using local fallback")
-                    # Fallback to local processing
-                    pass
-            except Exception as e:
-                print(f"Backend API call failed: {e}")
-                # Fallback to local processing
-                pass
-        # Fallback: local processing
-        print("Using local reset points...")
-        # Parse original image state
-        try:
-            state_data = json.loads(original_img)
-            video_path = state_data.get("video_path")
-        except:
-            video_path = None
-        if video_path:
-            # Re-extract original frame
-            display_image = extract_first_frame(video_path)
-            return display_image, []
-        return None, []
-    except Exception as e:
-        print(f"Error in reset_points: {e}")
-        return None, []
-def launch_viz(grid_size, vo_points, fps, original_image_state):
-    """Launch visualization with user-specific temp directory"""
-    if original_image_state is None:
-        return None, None
-    try:
-        if BACKEND_AVAILABLE and backend_client:
-            # Try to use backend API
-            try:
-                print(f"🔧 Calling backend API with parameters: grid_size={grid_size}, vo_points={vo_points}, fps={fps}")
-                print(f"🔧 Original image state type: {type(original_image_state)}")
-                print(f"🔧 Original image state preview: {str(original_image_state)[:100]}...")
-                # Validate and potentially fix the original_image_state format
-                state_to_send = original_image_state
-                # Check if this is a local processing state that needs to be converted
-                try:
-                    if isinstance(original_image_state, str):
-                        parsed_state = json.loads(original_image_state)
-                        if "video_path" in parsed_state and "frame" in parsed_state:
-                            # This is a local processing state, we need to handle differently
-                            print("🔧 Detected local processing state, cannot use backend for tracking")
-                            print("🔧 Backend requires proper video upload state from backend API")
-                            # Fall through to local processing
-                            raise ValueError("Local state cannot be processed by backend")
-                except json.JSONDecodeError:
-                    print("🔧 Invalid JSON state, cannot send to backend")
-                    raise ValueError("Invalid state format")
-                # Call the unified API with run_tracker function type
-                result = backend_client.predict(
-                    "run_tracker",        # function_type
-                    None,                 # video file (not used for run_tracker)
-                    state_to_send,        # original_image_state
-                    [],                   # selected_points (not used for run_tracker)
-                    "positive_point",     # point_type (not used for run_tracker)
-                    0,                    # point_x (not used for run_tracker)
-                    0,                    # point_y (not used for run_tracker)
-                    grid_size,            # grid_size
-                    vo_points,            # vo_points
-                    fps,                  # fps
-                    api_name="/unified_api"
-                )
-                print(f"✅ Backend API call successful!")
-                print(f"🔧 Result type: {type(result)}")
-                print(f"🔧 Result: {result}")
-                # Parse the result
-                if isinstance(result, dict) and result.get("success"):
-                    viz_html = result.get("viz_html", "")
-                    track_video_path = result.get("track_video_path", "")
-                    return viz_html, track_video_path
-                else:
-                    error_msg = result.get("error", "Unknown error") if isinstance(result, dict) else "Backend processing failed"
-                    print(f"❌ Backend processing failed: {error_msg}")
-                    # Fall through to error message
-                    pass
-            except Exception as e:
-                print(f"❌ Backend API call failed: {e}")
-                print(f"🔧 Error type: {type(e)}")
-                print(f"🔧 Error details: {str(e)}")
-                # Check for specific gradio_client errors
-                if "AppError" in str(type(e)):
-                    print("🔧 Backend Space has internal errors (AppError)")
-                    print("🔧 The backend Space code has bugs or configuration issues")
-                    print("🔧 Contact the Space owner to fix the backend implementation")
-                elif "Could not fetch config" in str(e):
-                    print("🔧 Config fetch failed - possible Gradio version mismatch")
-                    print("🔧 Frontend and backend may be using incompatible Gradio versions")
-                elif "timeout" in str(e).lower():
-                    print("🔧 Backend request timed out - Space might be overloaded")
-                elif "Expecting value" in str(e):
-                    print("🔧 JSON parsing error in backend - state format mismatch")
-                    print("🔧 This happens when using local processing state with backend API")
-                    print("🔧 Please upload video again to use backend processing")
-                else:
-                    print(f"🔧 Unexpected error type: {type(e).__name__}")
-                print("🔄 Showing error message instead of visualization...")
-                # Fall through to error message
-                pass
-        # Create an informative error message based on the state
-        state_info = ""
-        try:
-            if isinstance(original_image_state, str):
-                parsed_state = json.loads(original_image_state)
-                if "video_path" in parsed_state:
-                    video_name = os.path.basename(parsed_state["video_path"])
-                    state_info = f"Video: {video_name}"
-        except:
-            state_info = "State format unknown"
-        # Fallback: show message that backend is required
-        error_message = f"""
-        <div style='border: 3px solid #ff6b6b; border-radius: 10px; padding: 20px; background-color: #fff5f5;'>
-            <h3 style='color: #d63031; margin-bottom: 15px;'>⚠️ Backend Processing Required</h3>
-            <p style='color: #2d3436; line-height: 1.6;'>
-                The tracking and visualization features require backend processing. The current setup is using local processing which is incompatible with the backend API.
-            </p>
-            <h4 style='color: #d63031; margin: 15px 0 10px 0;'>Solutions:</h4>
-            <ul style='color: #2d3436; line-height: 1.6;'>
-                <li><strong>Upload video again:</strong> This will properly initialize the backend state</li>
-                <li><strong>Select points on the frame:</strong> Ensure you've clicked on the object to track</li>
-                <li><strong>Check backend connection:</strong> Ensure the backend Space is running</li>
-                <li><strong>Use compatible state:</strong> Avoid local processing mode</li>
-            </ul>
-            <div style='background-color: #f8f9fa; border-radius: 5px; padding: 10px; margin-top: 15px;'>
-                <p style='color: #2d3436; font-weight: bold; margin: 0 0 5px 0;'>Debug Information:</p>
-                <p style='color: #666; font-size: 12px; margin: 0;'>Backend Available: {BACKEND_AVAILABLE}</p>
-                <p style='color: #666; font-size: 12px; margin: 0;'>Backend Client: {backend_client is not None}</p>
-                <p style='color: #666; font-size: 12px; margin: 0;'>Backend URL: {BACKEND_SPACE_URL}</p>
-                <p style='color: #666; font-size: 12px; margin: 0;'>State Info: {state_info}</p>
-                <p style='color: #666; font-size: 12px; margin: 0;'>Processing Mode: {"Backend" if BACKEND_AVAILABLE else "Local (Limited)"}</p>
-            </div>
-            <div style='background-color: #e3f2fd; border-radius: 5px; padding: 10px; margin-top: 10px; border-left: 4px solid #2196f3;'>
-                <p style='color: #1976d2; font-weight: bold; margin: 0 0 5px 0;'>💡 Quick Fix:</p>
-                <p style='color: #1976d2; font-size: 13px; margin: 0;'>
-                    Try uploading your video again - this should properly initialize the backend state for tracking.
-                </p>
-            </div>
-        </div>
-        """
-        return error_message, None
     except Exception as e:
-        print(f"Error in launch_viz: {e}")
-        return None, None
-def clear_all():
-    """Clear all buffers and temporary files"""
-    return None, None, [], 50, 756, 3
-def update_tracker_model(model_name):
-    """Update tracker model (placeholder function)"""
-    return
-def get_video_settings(video_name):
-    """Get video-specific settings based on video name"""
-    video_settings = {
-        "kiss": (45, 700, 10),
-        "backpack": (40, 600, 2),
-        "kitchen": (60, 800, 3),
-        "pillow": (35, 500, 2),
-        "hockey": (45, 700, 2),
-        "drifting": (35, 1000, 6),
-        "ball": (45, 256, 6),
-        "ken_block_0": (45, 700, 2),
-        "ego_kc1": (45, 500, 4),
-        "vertical_place": (45, 500, 3),
-        "ego_teaser": (45, 1200, 10),
-        "robot_unitree": (45, 500, 4),
-        "droid_robot": (35, 400, 5),
-        "robot_2": (45, 256, 5),
-        "cinema_0": (45, 356, 5),
-        "cinema_1": (45, 756, 3),
-    }
-    return video_settings.get(video_name, (50, 756, 3))
-def test_backend_connection():
-    """Test if backend is actually working"""
-    global BACKEND_AVAILABLE
-    if not backend_client:
-        return False
     try:
-        print("Testing backend connection with a simple call...")
-        # Check if we have fns available
-        if hasattr(backend_client, 'fns') and backend_client.fns:
-            print("✅ Backend API functions are available")
-            print(f"🔧 Available function indices: {list(backend_client.fns.keys())}")
-            return True
         else:
-            print("❌ Backend API functions not found")
-            return False
     except Exception as e:
-        print(f"❌ Backend connection test failed: {e}")
-        return False
-def test_backend_api():
-    """Test specific backend API functions"""
-    if not BACKEND_AVAILABLE or not backend_client:
-        print("❌ Backend not available for testing")
-        return False
-    try:
-        print("🧪 Testing backend API functions...")
-        # Test if fns exist and show available indices
-        if hasattr(backend_client, 'fns') and backend_client.fns:
-            print(f"✅ Backend has {len(backend_client.fns)} functions available")
-            for idx in backend_client.fns.keys():
-                print(f"✅ Function {idx} is available")
-        else:
-            print("❌ No functions found in backend API")
-            return False
-        return True
-    except Exception as e:
-        print(f"❌ Backend API test failed: {e}")
-        return False
-# Initialize the backend connection
-print("🚀 Initializing frontend application...")
-result = initialize_backend()
-# Test backend connection if available
-if result and BACKEND_AVAILABLE:
-    print("✅ Backend connection successful!")
-else:
-    print("❌ Backend connection failed!")
-# Create the Gradio interface
-print("🎨 Creating Gradio interface...")
-with gr.Blocks(
-    theme=gr.themes.Soft(),
-    title="SpatialTracker V2 - Frontend",
-    css="""
-    .gradio-container {
-        max-width: 1200px !important;
-        margin: auto !important;
-    }
-    .gr-button {
-        margin: 5px;
-    }
-    .gr-form {
-        background: white;
-        border-radius: 10px;
-        padding: 20px;
-        box-shadow: 0 2px 10px rgba(0,0,0,0.1);
-    }
-    /* 固定视频上传组件高度 */
-    .gr-video {
-        height: 300px !important;
-        min-height: 300px !important;
-        max-height: 300px !important;
-    }
-    .gr-video video {
-        height: 260px !important;
-        max-height: 260px !important;
-        object-fit: contain !important;
-        background: #f8f9fa;
-    }
-    .gr-video .gr-video-player {
-        height: 260px !important;
-        max-height: 260px !important;
-    }
-    /* 水平滚动的示例视频样式 */
-    .example-videos .gr-examples {
-        overflow: visible !important;
-    }
-    .example-videos .gr-examples .gr-table-wrapper {
-        overflow-x: auto !important;
-        overflow-y: hidden !important;
-        scrollbar-width: thin;
-        scrollbar-color: #667eea #f1f1f1;
-    }
-    .example-videos .gr-examples .gr-table-wrapper::-webkit-scrollbar {
-        height: 8px;
-    }
-    .example-videos .gr-examples .gr-table-wrapper::-webkit-scrollbar-track {
-        background: #f1f1f1;
-        border-radius: 4px;
-    }
-    .example-videos .gr-examples .gr-table-wrapper::-webkit-scrollbar-thumb {
-        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-        border-radius: 4px;
-    }
-    .example-videos .gr-examples .gr-table-wrapper::-webkit-scrollbar-thumb:hover {
-        background: linear-gradient(135deg, #5a6fd8 0%, #6a4190 100%);
-    }
-    .example-videos .gr-examples .gr-table {
-        display: flex !important;
-        flex-wrap: nowrap !important;
-        min-width: max-content !important;
-        gap: 10px !important;
-    }
-    .example-videos .gr-examples .gr-table tbody {
-        display: flex !important;
-        flex-direction: row !important;
-        flex-wrap: nowrap !important;
-        gap: 10px !important;
-    }
-    .example-videos .gr-examples .gr-table tbody tr {
-        display: flex !important;
-        flex-direction: column !important;
-        min-width: 120px !important;
-        max-width: 120px !important;
-        margin: 0 !important;
-        background: white;
-        border-radius: 8px;
-        box-shadow: 0 2px 8px rgba(0,0,0,0.1);
-        transition: all 0.3s ease;
-        cursor: pointer;
-    }
-    .example-videos .gr-examples .gr-table tbody tr:hover {
-        transform: translateY(-2px);
-        box-shadow: 0 4px 12px rgba(102, 126, 234, 0.2);
-    }
-    .example-videos .gr-examples .gr-table tbody tr td {
-        text-align: center !important;
-        padding: 8px !important;
-        border: none !important;
-    }
-    .example-videos .gr-examples .gr-table tbody tr td video {
-        border-radius: 6px !important;
-        width: 100% !important;
-        height: auto !important;
-    }
-    .example-videos .gr-examples .gr-table tbody tr td:last-child {
-        font-size: 12px !important;
-        font-weight: 500 !important;
-        color: #333 !important;
-        padding-top: 4px !important;
-    }
-    /* 新的水平滚动示例视频样式 */
-    .horizontal-examples .gr-examples {
-        overflow: visible !important;
-    }
-    .horizontal-examples .gr-examples .gr-table-wrapper {
-        overflow-x: auto !important;
-        overflow-y: hidden !important;
-        scrollbar-width: thin;
-        scrollbar-color: #667eea #f1f1f1;
-        padding: 10px 0;
-    }
-    .horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar {
-        height: 8px;
-    }
-    .horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar-track {
-        background: #f1f1f1;
-        border-radius: 4px;
-    }
-    .horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar-thumb {
-        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-        border-radius: 4px;
-    }
-    .horizontal-examples .gr-examples .gr-table-wrapper::-webkit-scrollbar-thumb:hover {
-        background: linear-gradient(135deg, #5a6fd8 0%, #6a4190 100%);
-    }
-    .horizontal-examples .gr-examples .gr-table {
-        display: flex !important;
-        flex-wrap: nowrap !important;
-        min-width: max-content !important;
-        gap: 15px !important;
-        padding-bottom: 10px;
-    }
-    .horizontal-examples .gr-examples .gr-table tbody {
-        display: flex !important;
-        flex-direction: row !important;
-        flex-wrap: nowrap !important;
-        gap: 15px !important;
-    }
-    .horizontal-examples .gr-examples .gr-table tbody tr {
-        display: flex !important;
-        flex-direction: column !important;
-        min-width: 160px !important;
-        max-width: 160px !important;
-        margin: 0 !important;
-        background: white;
-        border-radius: 12px;
-        box-shadow: 0 3px 12px rgba(0,0,0,0.12);
-        transition: all 0.3s ease;
-        cursor: pointer;
-        overflow: hidden;
-    }
-    .horizontal-examples .gr-examples .gr-table tbody tr:hover {
-        transform: translateY(-4px);
-        box-shadow: 0 8px 20px rgba(102, 126, 234, 0.25);
-    }
-    .horizontal-examples .gr-examples .gr-table tbody tr td {
-        text-align: center !important;
-        padding: 0 !important;
-        border: none !important;
-    }
-    .horizontal-examples .gr-examples .gr-table tbody tr td:first-child {
-        padding: 0 !important;
-    }
-    .horizontal-examples .gr-examples .gr-table tbody tr td video {
-        border-radius: 8px 8px 0 0 !important;
-        width: 100% !important;
-        height: 90px !important;
-        object-fit: cover !important;
-    }
-    .horizontal-examples .gr-examples .gr-table tbody tr td:last-child {
-        font-size: 11px !important;
-        font-weight: 600 !important;
-        color: #333 !important;
-        padding: 8px 12px !important;
-        background: linear-gradient(135deg, #f8f9ff 0%, #e6f3ff 100%);
-        border-radius: 0 0 8px 8px;
-    }
-    """
-) as demo:
-    gr.Markdown("""
-    # 🎯 SpatialTracker V2 - Frontend Interface
-    Welcome to SpatialTracker V2! This interface allows you to track objects in videos using advanced computer vision techniques.
-    **Instructions:**
-    1. Upload a video file or select from examples below
-    2. Click on the object you want to track in the first frame
-    3. Adjust tracking parameters if needed
-    4. Click "Launch Visualization" to start tracking
-    """)
-    # Status indicator with more detailed information
-    if BACKEND_AVAILABLE:
-        status_text = "🟢 Backend Connected"
-        status_details = f"Connected to: {BACKEND_SPACE_URL}"
-    else:
-        status_text = "🟡 Running in Standalone Mode"
-        status_details = f"Backend unavailable: {BACKEND_SPACE_URL}"
-    gr.Markdown(f"**Status:** {status_text}")
-    gr.Markdown(f"<small style='color: #666;'>{status_details}</small>", elem_id="status-details")
-    # GitHub Star Reminder - Added back!
-    gr.HTML("""
-    <div style='background: linear-gradient(135deg, #e8eaff 0%, #f0f2ff 100%);
-                border-radius: 10px;
-                padding: 15px;
-                margin: 15px 0;
-                box-shadow: 0 2px 8px rgba(102, 126, 234, 0.1);
-                border: 1px solid rgba(102, 126, 234, 0.15);'>
-        <div style='text-align: center; color: #4a5568;'>
-            <h3 style='margin: 0 0 10px 0; font-size: 18px; text-shadow: none; color: #2d3748;'>
-                ⭐ Love SpatialTracker? Give us a Star! ⭐
-            </h3>
-            <p style='margin: 0 0 12px 0; font-size: 14px; opacity: 0.8; color: #4a5568;'>
-                Help us grow by starring our repository on GitHub! 🚀
-            </p>
-            <div style='display: flex; justify-content: center;'>
-                <a href="https://github.com/henry123-boy/SpaTrackerV2"
-                   target="_blank"
-                   style='display: inline-flex;
-                          align-items: center;
-                          gap: 6px;
-                          background: rgba(102, 126, 234, 0.1);
-                          color: #4a5568;
-                          padding: 8px 16px;
-                          border-radius: 20px;
-                          text-decoration: none;
-                          font-weight: bold;
-                          font-size: 14px;
-                          backdrop-filter: blur(5px);
-                          border: 1px solid rgba(102, 126, 234, 0.2);
-                          transition: all 0.3s ease;'
-                   onmouseover="this.style.background='rgba(102, 126, 234, 0.15)'; this.style.transform='translateY(-1px)'"
-                   onmouseout="this.style.background='rgba(102, 126, 234, 0.1)'; this.style.transform='translateY(0)'">
-                    <span style='font-size: 16px;'>⭐</span>
-                    Star on GitHub
-                </a>
-            </div>
-        </div>
-    </div>
-    """)
-    # Example videos section - moved to top
-    with gr.Group(elem_classes=["example-videos"]):
-        gr.Markdown("### 📂 Example Videos")
-        gr.Markdown("Try these example videos to get started quickly:")
-        # Custom horizontal scrolling video gallery
-        gr.HTML("""
-        <div style='background-color: #f8f9ff; border-radius: 8px; padding: 10px; margin: 10px 0; border-left: 4px solid #667eea;'>
-            <p style='margin: 0; font-size: 13px; color: #666; display: flex; align-items: center; gap: 8px;'>
-                <span style='font-size: 16px;'>💡</span>
-                <strong>Tip:</strong> Scroll horizontally below to see all example videos
-            </p>
-        </div>
-        """)
-        # Define video_input here so it can be referenced in examples
-        video_input = gr.Video(
-            label="Upload Video or Select Example",
-            format="mp4",
-            height=300
-        )
-        # Create a horizontal scrolling container for the examples
-        with gr.Group(elem_classes=["horizontal-examples"]):
-            gr.Examples(
-                examples=[
-                    ["examples/kiss.mp4"],
-                    ["examples/backpack.mp4"],
-                    ["examples/kitchen.mp4"],
-                    ["examples/pillow.mp4"],
-                    ["examples/hockey.mp4"],
-                    ["examples/drifting.mp4"],
-                    ["examples/ball.mp4"],
-                    ["examples/ken_block_0.mp4"],
-                    ["examples/ego_kc1.mp4"],
-                    ["examples/vertical_place.mp4"],
-                    ["examples/ego_teaser.mp4"],
-                    ["examples/robot_unitree.mp4"],
-                    ["examples/droid_robot.mp4"],
-                    ["examples/robot_2.mp4"],
-                    ["examples/cinema_0.mp4"],
-                    ["examples/cinema_1.mp4"],
-                ],
-                inputs=video_input,
-                label="🎬 Click on any example to load it",
-                examples_per_page=16  # Show all examples on one page
-            )
-    with gr.Row():
-        with gr.Column(scale=1):
-            # Interactive frame display
-            with gr.Group():
-                gr.Markdown("### 🎯 Point Selection")
-                gr.Markdown("Click on the object you want to track in the frame below:")
-                interactive_frame = gr.Image(
-                    label="Click to select tracking points",
-                    type="numpy",
-                    interactive=True
-                )
-                with gr.Row():
-                    point_type = gr.Radio(
-                        choices=["positive_point", "negative_point"],
-                        value="positive_point",
-                        label="Point Type",
-                        info="Positive points indicate the object to track, negative points indicate areas to avoid"
-                    )
-                with gr.Row():
-                    reset_points_btn = gr.Button("🔄 Reset Points", variant="secondary")
-                    clear_all_btn = gr.Button("🗑️ Clear All", variant="stop")
-        with gr.Column(scale=1):
-            # Tracking results
-            with gr.Group():
-                gr.Markdown("### 🎬 Tracking Results")
-                tracking_result_video = gr.Video(
-                    label="Tracking Result Video",
-                    interactive=False,
-                    height=300
-                )
-            # 3D Visualization
-            with gr.Group():
-                gr.Markdown("### 🌐 3D Visualization")
-                viz_html = gr.HTML(
-                    label="3D Trajectory Visualization",
-                    value="<p>Upload a video and select points to see 3D visualization here.</p>"
-                )
-    # Advanced settings section - changed to open=True
-    with gr.Accordion("⚙️ Advanced Settings", open=True):
-        gr.Markdown("Adjust these parameters to optimize tracking performance:")
-        with gr.Row():
-            grid_size = gr.Slider(
-                minimum=10,
-                maximum=100,
-                step=10,
-                value=50,
-                label="Grid Size",
-                info="Size of the tracking grid (larger = more detailed)"
-            )
-            vo_points = gr.Slider(
-                minimum=100,
-                maximum=2000,
-                step=50,
-                value=756,
-                label="VO Points",
-                info="Number of visual odometry points (more = better accuracy)"
-            )
-            fps = gr.Slider(
-                minimum=1,
-                maximum=30,
-                step=1,
-                value=3,
-                label="FPS",
-                info="Frames per second for processing (higher = smoother but slower)"
-            )
-    # Launch button
-    with gr.Row():
-        launch_btn = gr.Button("🚀 Launch Visualization", variant="primary", size="lg")
-    # Hidden state variables
-    original_image_state = gr.State(None)
-    selected_points = gr.State([])
-    # Event handlers
-    video_input.change(
-        fn=handle_video_upload,
-        inputs=[video_input],
-        outputs=[original_image_state, interactive_frame, selected_points, grid_size, vo_points, fps]
-    )
-    interactive_frame.select(
-        fn=select_point,
-        inputs=[original_image_state, selected_points, point_type],
-        outputs=[interactive_frame, selected_points]
-    )
-    reset_points_btn.click(
-        fn=reset_points,
-        inputs=[original_image_state, selected_points],
-        outputs=[interactive_frame, selected_points]
-    )
-    clear_all_btn.click(
-        fn=clear_all,
-        outputs=[video_input, interactive_frame, selected_points, grid_size, vo_points, fps]
-    )
-    launch_btn.click(
-        fn=launch_viz,
-        inputs=[grid_size, vo_points, fps, original_image_state],
-        outputs=[viz_html, tracking_result_video]
-    )
-# Launch the interface
-if __name__ == "__main__":
-    print("🌟 Launching SpatialTracker V2 Frontend...")
-    print(f"🔗 Backend Status: {'Connected' if BACKEND_AVAILABLE else 'Disconnected'}")
-    demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
-        share=True,
         debug=True,
         show_error=True
     )

 import gradio as gr
 import os
+import sys
+import logging
+import time
+import uuid
+import atexit
+from concurrent.futures import ThreadPoolExecutor
+from typing import Union, List, Tuple, Dict, Any
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Import spaces for ZeroGPU support
+try:
+    import spaces
+except ImportError:
+    # Fallback for local development
+    def spaces(func):
+        return func
+# Import other dependencies
+import subprocess
 import cv2
+import numpy as np
+import threading
+import tempfile
+import shutil
+import glob
+import json
 import base64
+import struct
+import zlib
+import argparse
+import socket
+import gc
+from pathlib import Path
+from einops import rearrange
+from tempfile import TemporaryDirectory
+from http.server import SimpleHTTPRequestHandler
+from socketserver import ThreadingTCPServer
+import socketserver
+import http.server
+import torch
+from huggingface_hub import hf_hub_download
+# Import custom modules with error handling
+try:
+    from app_3rd.sam_utils.inference import SamPredictor, get_sam_predictor, run_inference
+    from app_3rd.spatrack_utils.infer_track import get_tracker_predictor, run_tracker, get_points_on_a_grid
+except ImportError as e:
+    logger.error(f"Failed to import custom modules: {e}")
+    raise
+MAX_FRAMES = 80
+try:
+    import vggt
+except:
+    subprocess.run(["pip", "install", "-e", "./models/vggt"], check=True)
+    sys.path.append("/home/user/app/models/vggt")
+# init the model
+os.environ["VGGT_DIR"] = hf_hub_download("facebook/VGGT-1B", "model.pt")
+if os.environ.get("VGGT_DIR", None) is not None:
+    from vggt.models.vggt import VGGT
+    from vggt.utils.load_fn import preprocess_image
+    from vggt.utils.pose_enc import pose_encoding_to_extri_intri
+    vggt_model = VGGT()
+    vggt_model.load_state_dict(torch.load(os.environ.get("VGGT_DIR")))
+    vggt_model.eval()
+    vggt_model = vggt_model.to("cuda")
+# Global model initialization
+print("🚀 Initializing global models...")
+def init_global_models():
+    """Initialize global models (CPU only for ZeroGPU compatibility)"""
+    # Loads the SAM predictor and the tracker once and reports success as a
+    # bool. The loaded objects are bound to local names only: nothing is
+    # returned or stored at module level by this function.
     try:
+        print("🔧 Loading SAM predictor...")
+        sam_predictor = get_sam_predictor()
+        print("✅ SAM predictor loaded successfully")
+        # Keep on CPU for ZeroGPU - will be moved to GPU in the decorated function
+        print("🔧 Loading tracker models...")
+        # The tracker factory needs an output directory; a throwaway one is
+        # created under ./temp_init relative to the working directory.
+        out_dir = os.path.join("temp_init", "results")
+        os.makedirs(out_dir, exist_ok=True)
+        tracker_model, tracker_viser = get_tracker_predictor(out_dir, vo_points=756)
+        print("✅ Tracker models loaded successfully")
+        # Keep on CPU for ZeroGPU - will be moved to GPU in the decorated function
+        print("✅ All models initialized successfully!")
+        return True
     except Exception as e:
+        # Any failure is printed with its traceback and reported as False
+        # instead of being propagated to the caller.
+        print(f"❌ Error initializing models: {e}")
+        import traceback
+        traceback.print_exc()
         return False
+# Initialize models at startup
+# Thread pool for delayed deletion
+thread_pool_executor = ThreadPoolExecutor(max_workers=2)
+def delete_later(path: Union[str, os.PathLike], delay: int = 600):
+    """Delete file or directory after specified delay (default 10 minutes)"""
+    def _delete():
+        try:
+            if os.path.isfile(path):
+                os.remove(path)
+            elif os.path.isdir(path):
+                shutil.rmtree(path)
+        except Exception as e:
+            logger.warning(f"Failed to delete {path}: {e}")
+    def _wait_and_delete():
+        time.sleep(delay)
+        _delete()
+    thread_pool_executor.submit(_wait_and_delete)
+    atexit.register(_delete)
+def create_user_temp_dir():
+    """Create a unique temporary directory for each user session"""
+    session_id = str(uuid.uuid4())[:8]  # Short unique ID
+    temp_dir = os.path.join("temp", f"session_{session_id}")
+    os.makedirs(temp_dir, exist_ok=True)
+    # Schedule deletion after 10 minutes
+    delete_later(temp_dir, delay=600)
+    return temp_dir
+# Wrap the core GPU functions with @spaces.GPU
+@spaces.GPU
+def gpu_run_sam(image, points, boxes):
+    """GPU-accelerated SAM inference"""
+    # Builds a predictor on every call, tries to move it (and the image, when
+    # it is tensor-like) to CUDA, then returns run_inference's result as-is.
+    # Initialize SAM predictor inside GPU function
+    predictor = get_sam_predictor()
+    # Ensure predictor is on GPU - handle different SAM predictor types
     try:
+        # Exactly one branch runs: the first attribute found decides how the
+        # underlying weights are moved.
+        if hasattr(predictor, 'model'):
+            # For transformers SAM
+            predictor.model = predictor.model.cuda()
+        elif hasattr(predictor, 'sam'):
+            # For segment-anything SAM
+            predictor.sam = predictor.sam.cuda()
+        elif hasattr(predictor, 'to'):
+            # Generic PyTorch model
+            predictor = predictor.to('cuda')
+        # Also ensure image is on the right device if it's a tensor
+        if hasattr(image, 'cuda'):
+            image = image.cuda()
     except Exception as e:
+        # Best effort: a failed device move is only reported, and inference
+        # below still runs with whatever placement the objects ended up with.
+        print(f"Warning: Could not move predictor to GPU: {e}")
+    return run_inference(predictor, image, points, boxes)
+@spaces.GPU
+def gpu_run_tracker(temp_dir, video_name, grid_size, vo_points, fps):
+    """GPU-accelerated tracking"""
+    # Reads <temp_dir>/<video_name>.mp4 (and the optional <video_name>.png
+    # mask), runs VGGT for depth and camera estimates, runs the tracker on a
+    # grid of query points, writes results/result.npz plus a visualisation
+    # video, and returns the two output paths.
+    import torchvision.transforms as T
+    import decord
+    # Initialize tracker model inside GPU function
+    out_dir = os.path.join(temp_dir, "results")
+    os.makedirs(out_dir, exist_ok=True)
+    tracker_model, tracker_viser = get_tracker_predictor(out_dir, vo_points=vo_points)
+    # Setup paths
+    video_path = os.path.join(temp_dir, f"{video_name}.mp4")
+    mask_path = os.path.join(temp_dir, f"{video_name}.png")
+    # NOTE(review): out_dir is recomputed here with the same value as above.
+    out_dir = os.path.join(temp_dir, "results")
+    os.makedirs(out_dir, exist_ok=True)
+    # Load video using decord
+    video_reader = decord.VideoReader(video_path)
+    video_tensor = torch.from_numpy(video_reader.get_batch(range(len(video_reader))).asnumpy()).permute(0, 3, 1, 2)  # Convert to tensor and permute to (N, C, H, W)
+    # Downscale only (never upscale) so that the shorter side becomes about
+    # 224 px: max(224/h, 224/w) equals 224 divided by the shorter side.
+    h, w = video_tensor.shape[2:]
+    scale = max(224 / h, 224 / w)
+    if scale < 1:
+        new_h, new_w = int(h * scale), int(w * scale)
+        video_tensor = T.Resize((new_h, new_w))(video_tensor)
+    # ``fps`` is used as a stride: keep every fps-th frame, then cap the clip
+    # at MAX_FRAMES frames.
+    video_tensor = video_tensor[::fps].float()[:MAX_FRAMES]
+    # Move video tensor to GPU
+    video_tensor = video_tensor.cuda()
+    print(f"Video tensor shape: {video_tensor.shape}, device: {video_tensor.device}")
+    depth_tensor = None
+    intrs = None
+    extrs = None
+    data_npz_load = {}
+    # run vggt
+    if os.environ.get("VGGT_DIR", None) is not None:
+        # process the image tensor
+        # [None] adds a leading batch axis for the VGGT calls below.
+        video_tensor = preprocess_image(video_tensor)[None]
+        with torch.no_grad():
+            with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+                # Predict attributes including cameras, depth maps, and point maps.
+                aggregated_tokens_list, ps_idx = vggt_model.aggregator(video_tensor.cuda()/255)
+                pose_enc = vggt_model.camera_head(aggregated_tokens_list)[-1]
+                # Extrinsic and intrinsic matrices, following OpenCV convention (camera from world)
+                extrinsic, intrinsic = pose_encoding_to_extri_intri(pose_enc, video_tensor.shape[-2:])
+                # Predict Depth Maps
+                depth_map, depth_conf = vggt_model.depth_head(aggregated_tokens_list, video_tensor.cuda()/255, ps_idx)
+        depth_tensor = depth_map.squeeze().cpu().numpy()
+        # Pad the extrinsics into 4x4 matrices: start from identity and
+        # overwrite the top three rows with the predicted values.
+        extrs = np.eye(4)[None].repeat(len(depth_tensor), axis=0)
+        extrs[:, :3, :4] = extrinsic.squeeze().cpu().numpy()
+        intrs = intrinsic.squeeze().cpu().numpy()
+        # Drop the batch axis added above.
+        video_tensor = video_tensor.squeeze()
+        # NOTE(review): the threshold is the 0.6 quantile of the first frame's
+        # depth confidence (the earlier note here said 20%); ``unc_metric`` is
+        # not read again anywhere in this function.
+        threshold = depth_conf.squeeze()[0].view(-1).quantile(0.6).item()
+        unc_metric = depth_conf.squeeze().cpu().numpy() > threshold
+    # Load and process mask
+    if os.path.exists(mask_path):
+        mask = cv2.imread(mask_path)
+        # cv2.resize takes (width, height); any non-zero channel counts as
+        # foreground.
+        mask = cv2.resize(mask, (video_tensor.shape[3], video_tensor.shape[2]))
+        mask = mask.sum(axis=-1)>0
+    else:
+        # No mask file: treat the whole frame as foreground and override the
+        # caller's grid_size with 10.
+        mask = np.ones_like(video_tensor[0,0].cpu().numpy())>0
+        grid_size = 10
+    # Get frame dimensions and create grid points
+    frame_H, frame_W = video_tensor.shape[2:]
+    grid_pts = get_points_on_a_grid(grid_size, (frame_H, frame_W), device="cuda")  # Create on GPU
+    # Sample mask values at grid points and filter out points where mask=0
+    if os.path.exists(mask_path):
+        grid_pts_int = grid_pts[0].long()
+        # Index the mask as [y, x]: the last axis of grid_pts is read as (x, y).
+        mask_values = mask[grid_pts_int.cpu()[...,1], grid_pts_int.cpu()[...,0]]
+        grid_pts = grid_pts[:, mask_values]
+    # Prepend a zero column to each point, giving three values per query.
+    # presumably the zero is the query frame index — TODO confirm against the tracker.
+    query_xyt = torch.cat([torch.zeros_like(grid_pts[:, :, :1]), grid_pts], dim=2)[0].cpu().numpy()
+    print(f"Query points shape: {query_xyt.shape}")
+    # Run model inference
+    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
+        (
+            c2w_traj, intrs, point_map, conf_depth,
+            track3d_pred, track2d_pred, vis_pred, conf_pred, video
+        ) = tracker_model.forward(video_tensor, depth=depth_tensor,
+                            intrs=intrs, extrs=extrs,
+                            queries=query_xyt,
+                            fps=1, full_point=False, iters_track=4,
+                            query_no_BA=True, fixed_cam=False, stage=1,
+                            support_frame=len(video_tensor)-1, replace_ratio=0.2)
+        # Resize results to avoid too large I/O Burden
+        max_size = 224
+        h, w = video.shape[2:]
+        scale = min(max_size / h, max_size / w)
+        if scale < 1:
+            # Everything expressed in pixels is scaled together: the frames,
+            # the point map, the 2D tracks, the first two intrinsics rows and
+            # the depth confidence.
+            new_h, new_w = int(h * scale), int(w * scale)
+            video = T.Resize((new_h, new_w))(video)
+            video_tensor = T.Resize((new_h, new_w))(video_tensor)
+            point_map = T.Resize((new_h, new_w))(point_map)
+            track2d_pred[...,:2] = track2d_pred[...,:2] * scale
+            intrs[:,:2,:] = intrs[:,:2,:] * scale
+            conf_depth = T.Resize((new_h, new_w))(conf_depth)
+        # Visualize tracks
+        # presumably this writes <out_dir>/test_pred_track.mp4, the second
+        # path returned below — TODO confirm against the visualizer.
+        tracker_viser.visualize(video=video[None],
+                        tracks=track2d_pred[None][...,:2],
+                        visibility=vis_pred[None],filename="test")
+        # Save in tapip3d format
+        # coords: apply the rotation and translation of c2w_traj to the 3D
+        # tracks; extrinsics are stored as the inverse of c2w_traj.
+        data_npz_load["coords"] = (torch.einsum("tij,tnj->tni", c2w_traj[:,:3,:3], track3d_pred[:,:,:3].cpu()) + c2w_traj[:,:3,3][:,None,:]).numpy()
+        data_npz_load["extrinsics"] = torch.inverse(c2w_traj).cpu().numpy()
+        data_npz_load["intrinsics"] = intrs.cpu().numpy()
+        # Depth is taken from channel 2 of the point map.
+        data_npz_load["depths"] = point_map[:,2,...].cpu().numpy()
+        data_npz_load["video"] = (video_tensor).cpu().numpy()/255
+        data_npz_load["visibs"] = vis_pred.cpu().numpy()
+        data_npz_load["confs"] = conf_pred.cpu().numpy()
+        data_npz_load["confs_depth"] = conf_depth.cpu().numpy()
+        np.savez(os.path.join(out_dir, f'result.npz'), **data_npz_load)
+    return os.path.join(out_dir, "result.npz"), os.path.join(out_dir, "test_pred_track.mp4")
+def compress_and_write(filename, header, blob):
+    header_bytes = json.dumps(header).encode("utf-8")
+    header_len = struct.pack("<I", len(header_bytes))
+    with open(filename, "wb") as f:
+        f.write(header_len)
+        f.write(header_bytes)
+        f.write(blob)
+def process_point_cloud_data(npz_file, width=256, height=192, fps=4):
+    fixed_size = (width, height)
+    data = np.load(npz_file)
+    extrinsics = data["extrinsics"]
+    intrinsics = data["intrinsics"]
+    trajs = data["coords"]
+    T, C, H, W = data["video"].shape
+    fx = intrinsics[0, 0, 0]
+    fy = intrinsics[0, 1, 1]
+    fov_y = 2 * np.arctan(H / (2 * fy)) * (180 / np.pi)
+    fov_x = 2 * np.arctan(W / (2 * fx)) * (180 / np.pi)
+    original_aspect_ratio = (W / fx) / (H / fy)
+    rgb_video = (rearrange(data["video"], "T C H W -> T H W C") * 255).astype(np.uint8)
+    rgb_video = np.stack([cv2.resize(frame, fixed_size, interpolation=cv2.INTER_AREA)
+                          for frame in rgb_video])
+    depth_video = data["depths"].astype(np.float32)
+    if "confs_depth" in data.keys():
+        confs = (data["confs_depth"].astype(np.float32) > 0.5).astype(np.float32)
+        depth_video = depth_video * confs
+    depth_video = np.stack([cv2.resize(frame, fixed_size, interpolation=cv2.INTER_NEAREST)
+                            for frame in depth_video])
+    scale_x = fixed_size[0] / W
+    scale_y = fixed_size[1] / H
+    intrinsics = intrinsics.copy()
+    intrinsics[:, 0, :] *= scale_x
+    intrinsics[:, 1, :] *= scale_y
+    min_depth = float(depth_video.min()) * 0.8
+    max_depth = float(depth_video.max()) * 1.5
+    depth_normalized = (depth_video - min_depth) / (max_depth - min_depth)
+    depth_int = (depth_normalized * ((1 << 16) - 1)).astype(np.uint16)
+    depths_rgb = np.zeros((T, fixed_size[1], fixed_size[0], 3), dtype=np.uint8)
+    depths_rgb[:, :, :, 0] = (depth_int & 0xFF).astype(np.uint8)
+    depths_rgb[:, :, :, 1] = ((depth_int >> 8) & 0xFF).astype(np.uint8)
+    first_frame_inv = np.linalg.inv(extrinsics[0])
+    normalized_extrinsics = np.array([first_frame_inv @ ext for ext in extrinsics])
+    normalized_trajs = np.zeros_like(trajs)
+    for t in range(T):
+        homogeneous_trajs = np.concatenate([trajs[t], np.ones((trajs.shape[1], 1))], axis=1)
+        transformed_trajs = (first_frame_inv @ homogeneous_trajs.T).T
+        normalized_trajs[t] = transformed_trajs[:, :3]
+    arrays = {
+        "rgb_video": rgb_video,
+        "depths_rgb": depths_rgb,
+        "intrinsics": intrinsics,
+        "extrinsics": normalized_extrinsics,
+        "inv_extrinsics": np.linalg.inv(normalized_extrinsics),
+        "trajectories": normalized_trajs.astype(np.float32),
+        "cameraZ": 0.0
+    }
+    header = {}
+    blob_parts = []
+    offset = 0
+    for key, arr in arrays.items():
+        arr = np.ascontiguousarray(arr)
+        arr_bytes = arr.tobytes()
+        header[key] = {
+            "dtype": str(arr.dtype),
+            "shape": arr.shape,
+            "offset": offset,
+            "length": len(arr_bytes)
+        }
+        blob_parts.append(arr_bytes)
+        offset += len(arr_bytes)
+    raw_blob = b"".join(blob_parts)
+    compressed_blob = zlib.compress(raw_blob, level=9)
+    header["meta"] = {
+        "depthRange": [min_depth, max_depth],
+        "totalFrames": int(T),
+        "resolution": fixed_size,
+        "baseFrameRate": fps,
+        "numTrajectoryPoints": normalized_trajs.shape[1],
+        "fov": float(fov_y),
+        "fov_x": float(fov_x),
+        "original_aspect_ratio": float(original_aspect_ratio),
+        "fixed_aspect_ratio": float(fixed_size[0]/fixed_size[1])
+    }
+    # Create temporary file for compression
+    temp_data_file = f'./temp_data_{int(time.time())}.bin'
+    compress_and_write(temp_data_file, header, compressed_blob)
+    # Read the compressed data and encode to base64
+    with open(temp_data_file, "rb") as f:
+        encoded_blob = base64.b64encode(f.read()).decode("ascii")
+    # Clean up temporary file
+    os.unlink(temp_data_file)
+    # Read the HTML template and inject the base64 data
+    with open('./_viz/viz_template.html') as f:
+        html_template = f.read()
+    # Inject the base64 data into the HTML
+    html_content = html_template.replace(
+        "<head>",
+        f"<head>\n<script>window.embeddedBase64 = `{encoded_blob}`;</script>"
+    )
+    return html_content
 def numpy_to_base64(arr):
     """Convert numpy array to base64 string"""
     """Convert base64 string back to numpy array"""
     return np.frombuffer(base64.b64decode(b64_str), dtype=dtype).reshape(shape)
 def get_video_name(video_path):
     """Extract video name without extension"""
+    # Drops the directory part first, then the final extension only, so a
+    # name such as "clip.v2.mp4" yields "clip.v2".
     return os.path.splitext(os.path.basename(video_path))[0]
+# Backend API Functions
+def backend_upload_video(video_path: str) -> Dict[str, Any]:
+    """Backend API for video upload"""
+    # Copies the video into a new per-session directory, extracts and resizes
+    # its first frame, and returns a result dict. On success the dict holds
+    # "success", "original_image_state" (JSON string), "display_image",
+    # "selected_points" and "temp_dir"; on failure only "success" and "error".
     try:
+        # Create user-specific temporary directory
+        user_temp_dir = create_user_temp_dir()
+        # Get original video name
+        video_name = get_video_name(video_path)
+        # The copy is always stored with an .mp4 suffix, whatever extension
+        # the source path had.
+        temp_video_path = os.path.join(user_temp_dir, f"{video_name}.mp4")
+        shutil.copy(video_path, temp_video_path)
+        print(f"Video saved to: {temp_video_path}")
+        # Extract first frame
+        cap = cv2.VideoCapture(temp_video_path)
+        success, frame = cap.read()
+        cap.release()
+        if not success:
+            return {"success": False, "error": "Failed to read video"}
+        # Resize frame to have minimum side length of 336
+        h, w = frame.shape[:2]
+        scale = 336 / min(h, w)
+        # //2*2 rounds each dimension down to an even number.
+        new_h, new_w = int(h * scale)//2*2, int(w * scale)//2*2
+        frame = cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
+        # From here on the frame is in RGB channel order.
+        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+        # Convert frame to base64 string for storage, include temp_dir info
+        frame_data = {
+            'data': numpy_to_base64(frame),
+            'shape': frame.shape,
+            'dtype': str(frame.dtype),
+            'temp_dir': user_temp_dir  # Store temp directory path
+        }
+        return {
+            "success": True,
+            "original_image_state": json.dumps(frame_data),
+            "display_image": frame,
+            "selected_points": [],
+            "temp_dir": user_temp_dir
+        }
     except Exception as e:
+        # Failures are logged and reported in the result dict, not raised.
+        logger.error(f"Error in backend_upload_video: {e}")
+        return {"success": False, "error": str(e)}
+def backend_select_point(original_img: str, sel_pix: list, point_type: str, point_x: int, point_y: int) -> Dict[str, Any]:
+    """Backend API for point selection"""
+    # Appends the clicked point to a copy of ``sel_pix``, runs SAM on the
+    # stored frame with all points so far, draws markers and the mask overlay,
+    # and saves the mask as <video_name>.png in the session directory.
+    # Returns a dict with "success", "display_image" and "selected_points",
+    # or "success" and "error" on failure.
     try:
+        # Convert stored image data back to numpy array
+        frame_data = json.loads(original_img)
+        original_img = base64_to_numpy(frame_data['data'], frame_data['shape'], frame_data['dtype'])
+        temp_dir = frame_data.get('temp_dir', 'temp')  # Get user-specific temp dir
+        # Create a display image for visualization
+        display_img = original_img.copy()
+        # Create a new list instead of modifying the existing one
+        new_sel_pix = sel_pix.copy() if sel_pix else []
+        # Each entry is ((x, y), label) with label 1 for a positive point and
+        # 0 for anything else.
+        new_sel_pix.append(((point_x, point_y), 1 if point_type == 'positive_point' else 0))
+        # Run SAM inference
+        o_masks = gpu_run_sam(original_img, new_sel_pix, [])
+        # Draw points on display image
+        # NOTE(review): the color comments below assume BGR order, but
+        # backend_upload_video converts the stored frame to RGB, so on that
+        # frame these tuples are applied in RGB order — confirm the intended colors.
+        COLORS = [(0, 0, 255), (0, 255, 255)]  # BGR: Red for negative, Yellow for positive
+        MARKERS = [1, 5]  # Cross for negative, Star for positive
+        MARKER_SIZE = 8  # Increased marker size
+        for point, label in new_sel_pix:
+            cv2.drawMarker(display_img, point, COLORS[label], markerType=MARKERS[label], markerSize=MARKER_SIZE, thickness=2)
+        # Draw mask overlay on display image
+        if o_masks:
+            # Get the final mask (which is already processed as pos_mask - neg_mask)
+            mask = o_masks[0][0]  # Get first mask
+            # Create a light blue overlay
+            overlay = display_img.copy()
+            overlay[mask.squeeze()!=0] = [20, 60, 200]  # Light blue in BGR
+            # Blend with original image with lower alpha
+            # Weights: 0.6 for the tinted overlay, 0.4 for the marker image.
+            display_img = cv2.addWeighted(overlay, 0.6, display_img, 0.4, 0)
+        # Save mask
+        if o_masks:
+            video_files = glob.glob(os.path.join(temp_dir, "*.mp4"))
+            if video_files:
+                video_name = get_video_name(video_files[0])
+                # Every mask is written to the same file name, so when several
+                # masks are returned the last one written is the one kept.
+                for mask, _ in o_masks:
+                    o_mask = np.uint8(mask.squeeze() * 255)
+                    o_file = os.path.join(temp_dir, f"{video_name}.png")
+                    cv2.imwrite(o_file, o_mask)
+        return {
+            "success": True,
+            "display_image": display_img,
+            "selected_points": new_sel_pix
+        }
+    except Exception as e:
+        # Failures are logged and reported in the result dict, not raised.
+        logger.error(f"Error in backend_select_point: {e}")
+        return {"success": False, "error": str(e)}
def backend_reset_points(original_img: str, sel_pix: list) -> Dict[str, Any]:
    """Drop all selected points and delete the PNG files in the session's temp dir.

    Args:
        original_img: JSON string describing the stored frame. The keys
            ``data``, ``shape`` and ``dtype`` are handed to
            ``base64_to_numpy``; the optional ``temp_dir`` key names the
            working directory and defaults to ``"temp"``.
        sel_pix: Points selected so far. Accepted so the signature lines up
            with the other point endpoints; it is never read.

    Returns:
        ``{"success": True, "display_image": <copy of the frame>,
        "selected_points": []}`` on success, otherwise
        ``{"success": False, "error": <message>}``.
    """
    try:
        # Rebuild the frame from its serialized form.
        state = json.loads(original_img)
        frame = base64_to_numpy(state['data'], state['shape'], state['dtype'])
        workdir = state.get('temp_dir', 'temp')

        # With no points left, the display image is just an untouched copy.
        clean_frame = frame.copy()

        # Best-effort cleanup: a file that cannot be removed is logged and
        # skipped rather than failing the whole reset.
        png_pattern = os.path.join(workdir, "*.png")
        for mask_file in glob.glob(png_pattern):
            try:
                os.remove(mask_file)
            except Exception as e:
                logger.warning(f"Failed to remove mask file {mask_file}: {e}")

        return dict(success=True, display_image=clean_frame, selected_points=[])
    except Exception as e:
        logger.error(f"Error in backend_reset_points: {e}")
        return dict(success=False, error=str(e))
def backend_run_tracker(grid_size: int, vo_points: int, fps: int, original_image_state: str) -> Dict[str, Any]:
    """Run the tracker on the session's uploaded video and build the viewer HTML.

    Args:
        grid_size: Forwarded to ``gpu_run_tracker`` after conversion to ``int``.
        vo_points: Forwarded to ``gpu_run_tracker`` after conversion to ``int``.
        fps: Forwarded to ``gpu_run_tracker`` after conversion to ``int``.
        original_image_state: JSON string of the stored frame state. Only its
            optional ``temp_dir`` key is read here (default ``"temp"``).

    Returns:
        ``{"success": True, "viz_html": ..., "track_video_path": ...}`` on
        success. ``{"success": False, "error": <message>}`` when no ``*.mp4``
        exists in the temp dir or when any step raises.
    """
    try:
        # The Gradio Number inputs feeding this function may deliver floats
        # (e.g. 50.0); normalise to the ints the signature promises before
        # they reach the tracker.
        grid_size, vo_points, fps = int(grid_size), int(vo_points), int(fps)

        # Get user's temp directory from stored frame data
        frame_data = json.loads(original_image_state)
        temp_dir = frame_data.get('temp_dir', 'temp')

        video_files = glob.glob(os.path.join(temp_dir, "*.mp4"))
        if not video_files:
            return {"success": False, "error": "No video file found"}
        # First match, same choice backend_select_point makes when naming the mask.
        video_path = video_files[0]
        video_name = get_video_name(video_path)

        # Run tracker
        npz_path, track2d_video = gpu_run_tracker(temp_dir, video_name, grid_size, vo_points, fps)

        # Generate HTML content
        html_content = process_point_cloud_data(npz_path)

        # Schedule deletion of generated files
        for generated in (track2d_video, npz_path):
            if os.path.exists(generated):
                delete_later(generated, delay=600)

        return {
            "success": True,
            "viz_html": html_content,
            "track_video_path": track2d_video
        }
    except Exception as e:
        logger.error(f"Error in backend_run_tracker: {e}")
        return {"success": False, "error": str(e)}
+# Remove the separate interfaces and create a unified API handler
def unified_api_handler(function_type: str, *args) -> Dict[str, Any]:
    """Dispatch one backend call selected by ``function_type``.

    Two positional layouts are accepted for ``*args``:

    * **Form layout (exactly nine values)** - the inputs of the unified
      Gradio interface that follow the function-type dropdown, in order:
      video file, original image state, selected points, point type,
      point x, point y, grid size, VO points, FPS. Only the values the
      chosen function needs are forwarded; the rest are ignored.
    * **Compact layout (any other count)** - exactly the target function's
      own arguments, in that function's order.

    Args:
        function_type: One of ``"upload_video"``, ``"select_point"``,
            ``"reset_points"`` or ``"run_tracker"``.
        *args: Arguments in one of the two layouts above.

    Returns:
        Whatever the selected backend function returns, or
        ``{"success": False, "error": <message>}`` for an unknown
        ``function_type`` or when dispatch itself raises (for example, too
        few arguments in the compact layout).
    """
    try:
        if len(args) == 9:
            # Called through the unified form: every field is present, so the
            # target's arguments must be picked out by field, not taken from
            # the front of the tuple (args[0] is always the video file here).
            (video_file, image_state, sel_pix, point_type,
             point_x, point_y, grid_size, vo_points, fps) = args
            by_function = {
                "upload_video": (video_file,),
                "select_point": (image_state, sel_pix, point_type, point_x, point_y),
                "reset_points": (image_state, sel_pix),
                "run_tracker": (grid_size, vo_points, fps, image_state),
            }
            # Unknown types keep the raw tuple and fall through to the error below.
            args = by_function.get(function_type, args)

        if function_type == "upload_video":
            # args: video file
            return backend_upload_video(args[0])
        elif function_type == "select_point":
            # args: original_img, sel_pix, point_type, point_x, point_y
            return backend_select_point(args[0], args[1], args[2], args[3], args[4])
        elif function_type == "reset_points":
            # args: original_img, sel_pix
            return backend_reset_points(args[0], args[1])
        elif function_type == "run_tracker":
            # args: grid_size, vo_points, fps, original_image_state
            return backend_run_tracker(args[0], args[1], args[2], args[3])
        else:
            return {"success": False, "error": f"Unknown function type: {function_type}"}
    except Exception as e:
        logger.error(f"Error in unified_api_handler: {e}")
        return {"success": False, "error": str(e)}
# Unified API interface: one form whose first input selects the backend
# function. The remaining nine inputs are passed positionally to
# unified_api_handler in the order listed here.
main_api = gr.Interface(
    fn=unified_api_handler,
    inputs=[
        gr.Dropdown(
            choices=["upload_video", "select_point", "reset_points", "run_tracker"],
            label="Function Type",
            value="upload_video"
        ),
        gr.File(label="Video File (for upload_video)", file_types=[".mp4", ".avi", ".mov"]),
        gr.Textbox(label="Original Image State", value=""),
        gr.JSON(label="Selected Points", value=[]),
        gr.Radio(choices=['positive_point', 'negative_point'], label="Point Type", value='positive_point'),
        # precision=0 makes Gradio round and convert these to int. The point
        # coordinates are pixel positions used in OpenCV drawing calls, and the
        # tracker parameters are declared as int by backend_run_tracker.
        gr.Number(label="Point X", value=0, precision=0),
        gr.Number(label="Point Y", value=0, precision=0),
        gr.Number(label="Grid Size", value=50, precision=0),
        gr.Number(label="VO Points", value=756, precision=0),
        gr.Number(label="FPS", value=3, precision=0)
    ],
    outputs=[
        gr.JSON(label="Result")
    ],
    title="SpaTrackV2 Backend API",
    description="Unified Backend API for SpaTrackV2. This is a private Space that provides core functionality.",
    api_name="unified_api"
)
# Per-function interfaces, kept next to the unified one for manual testing.
# This one drives backend_select_point directly.
select_point_api = gr.Interface(
    fn=backend_select_point,
    inputs=[
        gr.Textbox(label="Original Image State"),
        gr.JSON(label="Selected Points"),
        gr.Radio(choices=['positive_point', 'negative_point'], label="Point Type"),
        # precision=0 makes Gradio deliver ints: the (x, y) pair is a pixel
        # position that backend_select_point passes to cv2.drawMarker.
        gr.Number(label="Point X", precision=0),
        gr.Number(label="Point Y", precision=0)
    ],
    outputs=[
        gr.JSON(label="Result")
    ],
    title="Select Point API",
    description="API for selecting points on video frames"
)
# Manual-testing form for backend_reset_points: the stored image-state string
# and the current point list go in, the JSON result dict comes out.
reset_points_api = gr.Interface(
    backend_reset_points,
    [gr.Textbox(label="Original Image State"), gr.JSON(label="Selected Points")],
    [gr.JSON(label="Result")],
    title="Reset Points API",
    description="API for resetting points",
)
# Manual-testing form for backend_run_tracker. Input order matches the
# function's signature: grid_size, vo_points, fps, original_image_state.
tracker_api = gr.Interface(
    fn=backend_run_tracker,
    inputs=[
        # precision=0 makes Gradio round and convert to int, matching the
        # int-annotated parameters of backend_run_tracker.
        gr.Number(label="Grid Size", value=50, precision=0),
        gr.Number(label="VO Points", value=756, precision=0),
        gr.Number(label="FPS", value=3, precision=0),
        gr.Textbox(label="Original Image State")
    ],
    outputs=[
        gr.JSON(label="Result")
    ],
    title="Run Tracker API",
    description="API for running the tracking algorithm"
)
# Tabbed console that exposes every backend endpoint for manual testing.
# Tab order: Unified API, Upload Video, Select Point, Reset Points,
# Run Tracker, API Info.
with gr.Blocks(title="SpaTrackV2 Backend API") as backend_app:
    gr.Markdown("# 🚀 SpaTrackV2 Backend API")
    gr.Markdown("This is a private backend Space that provides core SpaTrackV2 functionality.")

    with gr.Tabs():
        with gr.TabItem("Unified API"):
            main_api.render()

        # The upload form is the only interface constructed inside its own tab;
        # the others are module-level objects that are merely rendered here.
        with gr.TabItem("Upload Video"):
            upload_api = gr.Interface(
                backend_upload_video,
                [gr.File(label="Upload Video", file_types=[".mp4", ".avi", ".mov"])],
                [gr.JSON(label="Result")],
                title="Upload Video API",
            )
            upload_api.render()

        # Pre-built per-function interfaces, one tab each, in display order.
        for _tab_title, _prebuilt_api in (
            ("Select Point", select_point_api),
            ("Reset Points", reset_points_api),
            ("Run Tracker", tracker_api),
        ):
            with gr.TabItem(_tab_title):
                _prebuilt_api.render()

        # Static reference text describing the endpoints above.
        with gr.TabItem("API Info"):
            gr.Markdown("""
            ## Available API Functions
            ### Unified API
            - **Function**: `unified_api_handler`
            - **Input**: Function type + parameters
            - **Output**: JSON result
            ### Individual Functions
            #### 1. Upload Video
            - **Function**: `backend_upload_video`
            - **Input**: Video file
            - **Output**: Initial state and settings
            #### 2. Select Point
            - **Function**: `backend_select_point`
            - **Input**: Image state + point coordinates
            - **Output**: Updated image and points
            #### 3. Reset Points
            - **Function**: `backend_reset_points`
            - **Input**: Image state + points
            - **Output**: Reset image and empty points
            #### 4. Run Tracker
            - **Function**: `backend_run_tracker`
            - **Input**: Parameters + image state
            - **Output**: Visualization and tracking results
            ### 5. GPU Functions
            - `gpu_run_sam(image, points, boxes)`: GPU-accelerated SAM inference
            - `gpu_run_tracker(temp_dir, video_name, grid_size, vo_points, fps)`: GPU-accelerated tracking
            """)
if __name__ == "__main__":
    # Startup banner: interpreter, working directory and GPU visibility.
    print("🚀 Starting SpaTrackV2 Backend Space...")
    print(f"🔧 Python version: {sys.version}")
    print(f"🔧 Working directory: {os.getcwd()}")

    cuda_ok = torch.cuda.is_available()
    print(f"🔧 GPU available: {cuda_ok}")
    if cuda_ok:
        print(f"🔧 GPU device: {torch.cuda.get_device_name(0)}")
        total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
        print(f"🔧 GPU memory: {total_gb:.1f} GB")

    # Load the shared models once up front. A failed initialisation is
    # reported but does not stop the server from starting.
    print("🔧 Initializing models and GPU resources...")
    init_success = init_global_models()
    print(
        "✅ Backend initialization complete!"
        if init_success
        else "❌ Backend initialization failed! Continuing with limited functionality..."
    )

    print("📡 Starting Gradio backend interface...")
    print(f"🔧 Available GPU functions: {[name for name in globals() if name.startswith('gpu_')]}")

    # Serve the full tabbed app (not only main_api) on all interfaces, port 7860.
    backend_app.launch(server_name="0.0.0.0", server_port=7860, share=False, debug=True, show_error=True)