"""Gradio app that renders the Physical AI Bench (PAI-Bench) leaderboards."""

import gradio as gr
import pandas as pd

# Your leaderboard name
# Rendered through gr.HTML, so the heading needs explicit markup.
TITLE = """
<h1 align="center">Physical AI Bench Leaderboard</h1>
"""

# CSS to make the leaderboard full height
# The selectors match the elem_id values given to the three gr.Dataframe tables.
CSS = """
#predict_leaderboard, #transfer_leaderboard, #reason_leaderboard {
    height: auto !important;
    max-height: none !important;
}

#predict_leaderboard .wrap, #transfer_leaderboard .wrap, #reason_leaderboard .wrap {
    max-height: none !important;
    height: auto !important;
}

#predict_leaderboard .tbody, #transfer_leaderboard .tbody, #reason_leaderboard .tbody {
    max-height: none !important;
    height: auto !important;
    overflow-x: auto !important;
    overflow-y: hidden !important;
}
"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
**Physical AI Bench (PAI-Bench)** is a comprehensive benchmark suite for evaluating physical AI generation and understanding across diverse scenarios including autonomous vehicles, robotics, industrial spaces, and everyday ego-centric environments.
"""

# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = """
## How it works

This leaderboard tracks model performance across three core dimensions:

- **🎨 Generation**: Evaluates world foundation models' ability to predict future states across 1,044 diverse physical scenarios
- **🔄 Conditional Generation**: Focuses on world model generation with complex control signals, featuring 600 videos across robotic arm operations, autonomous driving, and ego-centric scenes
- **🧠 Understanding**: Evaluates understanding and reasoning about physical scenes, with 1,214 embodied reasoning scenarios focused on autonomous vehicle actions

PAI-Bench covers multiple physical AI domains including autonomous driving, robotics, industrial spaces, physics simulations, human interactions, and common sense reasoning.

### Resources

- 🌐 [GitHub Repository](https://github.com/SHI-Labs/physical-ai-bench)
- 📊 [Generation Dataset](https://huggingface.co/datasets/shi-labs/physical-ai-bench-predict)
- 📊 [Conditional Generation Dataset](https://huggingface.co/datasets/shi-labs/physical-ai-bench-transfer)
- 📊 [Understanding Dataset](https://huggingface.co/datasets/shi-labs/physical-ai-bench-reason)
- 📦 [Artifacts](https://huggingface.co/datasets/Leymore/physical-ai-bench-artifacts)

## Reproducibility

To evaluate your models on PAI-Bench, visit our [GitHub repository](https://github.com/SHI-Labs/physical-ai-bench) for evaluation scripts and detailed instructions.

## Citation

If you use Physical AI Bench in your research, please cite:

```bibtex
@misc{zhou2025paibenchcomprehensivebenchmarkphysical,
      title={PAI-Bench: A Comprehensive Benchmark For Physical AI},
      author={Fengzhe Zhou and Jiannan Huang and Jialuo Li and Deva Ramanan and Humphrey Shi},
      year={2025},
      eprint={2512.01989},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2512.01989},
}
```
"""


# ============================================================================
# Model Links Utility
# ============================================================================

def create_model_link(model_name):
    """
    Convert a model name to a markdown link to Hugging Face.

    Args:
        model_name: Model name in format "org/model-name" or just a plain name

    Returns:
        Markdown formatted link or original name if format doesn't match.
        Non-string values are returned unchanged.
    """
    # Anything that is not an "org/name" style string is passed through as-is.
    if not isinstance(model_name, str) or '/' not in model_name:
        return model_name

    hf_url = f"https://huggingface.co/{model_name}"
    # Only the last path segment is shown; the link keeps the full repo id.
    display_name = model_name.split('/')[-1]
    return f"[{display_name}]({hf_url})"


# ============================================================================
# Generation Tab Configuration and Utilities
# ============================================================================

# Column name to abbreviation mapping for display
PREDICT_COLUMN_ABBREV = {
    'Common Sense': 'CS',
    'AV': 'AV',
    'Robot': 'RO',
    'Industry': 'IN',
    'Human': 'HU',
    'Physics': 'PH',
    'Subject Consistency': 'SC',
    'Background Consistency': 'BC',
    'Motion Smoothness': 'MS',
    'Aesthetic Quality': 'AQ',
    'Imaging Quality': 'IQ',
    'Overall Consistency': 'OC',
    'I2V Subject': 'IS',
    'I2V Background': 'IB',
}

# Expected column order (full names from JSON)
PREDICT_COLUMN_ORDER = [
    'Model',
    'Overall',
    'Domain',
    'Quality',
    'Common Sense',
    'AV',
    'Robot',
    'Industry',
    'Human',
    'Physics',
    'Subject Consistency',
    'Background Consistency',
    'Motion Smoothness',
    'Aesthetic Quality',
    'Imaging Quality',
    'Overall Consistency',
    'I2V Subject',
    'I2V Background',
]

# Columns to hide by default (but still available for filtering/selection)
PREDICT_HIDDEN_COLUMNS = []

# Semantic/Domain dimensions (for selection button) - use abbreviations matching dataframe
PREDICT_DOMAIN_SCORE_DIMENSIONS = [
    'Domain',
    'CS',
    'AV',
    'RO',
    'IN',
    'HU',
    'PH',
]

# Quality dimensions (for selection button) - use abbreviations matching dataframe
PREDICT_QUALITY_SCORE_DIMENSIONS = [
    'Quality',
    'SC',
    'BC',
    'MS',
    'AQ',
    'IQ',
    'OC',
    'IS',
    'IB',
]

# Selection applied by the "Deselect All" button: the two aggregate scores stay checked.
PREDICT_DESELECTED_COLUMNS = ['Domain', 'Quality']

# Selection applied by the "Select All" button (also the initial checkbox state).
PREDICT_ALL_SELECTED_COLUMNS = [
    'Domain',
    'Quality',
    'CS',
    'AV',
    'RO',
    'IN',
    'HU',
    'PH',
    'SC',
    'BC',
    'MS',
    'AQ',
    'IQ',
    'OC',
    'IS',
    'IB',
]

# Columns that can never be deselected
PREDICT_NEVER_HIDDEN_COLUMNS = ['Model', 'Overall']

# Columns displayed by default
PREDICT_DEFAULT_DISPLAYED_COLUMNS = PREDICT_NEVER_HIDDEN_COLUMNS + PREDICT_ALL_SELECTED_COLUMNS


# ============================================================================
# Helpers shared by the three leaderboard tabs
# ============================================================================

def _model_cell(row):
    """Return the markdown cell for one record: a link when a URL is present, else the raw model value."""
    model = row['model']
    if pd.isna(row['url']):
        return model
    # Show only the last path segment of "org/name"; non-string values are left untouched.
    if isinstance(model, str) and '/' in model:
        display_name = model.split('/')[-1]
    else:
        display_name = model
    return f"[{display_name}]({row['url']})"


def _load_leaderboard_json(json_path, float_format, raw_columns=('Model',), column_abbrev=None):
    """
    Read a records-oriented leaderboard JSON file into a display-ready DataFrame.

    Args:
        json_path: Path handed to ``pd.read_json``.
        float_format: Format spec (e.g. ".1f") applied to every numeric column.
        raw_columns: Column names that are never number-formatted.
        column_abbrev: Optional full-name -> abbreviation mapping applied last.

    Returns:
        DataFrame whose 'Model' column holds markdown and whose numeric
        columns have been converted to fixed-precision strings.
    """
    df = pd.read_json(json_path, orient='records')

    if 'model' in df.columns and 'url' in df.columns:
        df['model'] = df.apply(_model_cell, axis=1)
        df = df.drop(columns=['url'])
    # Renaming is a no-op when 'model' is absent; the tables always look up 'Model'.
    df = df.rename(columns={'model': 'Model'})

    for col in df.columns:
        if col in raw_columns or not pd.api.types.is_numeric_dtype(df[col]):
            continue
        # Missing values are kept as-is instead of being rendered as "nan".
        df[col] = df[col].apply(
            lambda value: format(value, float_format) if pd.notna(value) else value
        )

    if column_abbrev:
        # Rename columns to abbreviations for display
        df = df.rename(columns=column_abbrev)
    return df


def _column_datatypes(columns):
    """Return the gr.Dataframe datatype list: markdown for 'Model', str for everything else."""
    return ['markdown' if col == 'Model' else 'str' for col in columns]


def _checkbox_choices(dataframe, column_abbrev, pinned_columns):
    """Build (label, value) checkbox choices for every column that is not pinned."""
    # Create reverse mapping from abbreviation to full name
    abbrev_to_full = {abbrev: full for full, abbrev in column_abbrev.items()}

    choices = []
    for col in dataframe.columns:
        if col in pinned_columns:
            continue
        full_name = abbrev_to_full.get(col)
        if full_name is None or full_name == col:
            # No abbreviation, or the abbreviation equals the name (avoids labels like "AV (AV)").
            choices.append((col, col))
        else:
            choices.append((f"{full_name} ({col})", col))
    return choices


def _selection_update(selected_columns, full_df, pinned_columns):
    """Return the gr.update that shows the pinned columns followed by the selected ones."""
    present_columns = list(pinned_columns)
    for col in selected_columns:
        # Ignore duplicates and selections that do not exist in this dataframe.
        if col not in present_columns and col in full_df.columns:
            present_columns.append(col)

    return gr.update(
        value=full_df[present_columns],
        datatype=_column_datatypes(present_columns),
        headers=present_columns,
    )


def _require_rows(dataframe):
    """Raise ValueError when there is nothing to display."""
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")


def _build_leaderboard(dataframe, *, default_columns, presets, choices, selected,
                       pinned_widths, metric_width, elem_id, on_change):
    """
    Create the buttons, checkbox group and table for one tab and wire their events.

    Must be called inside an active ``gr.Blocks`` context.

    Args:
        dataframe: Full, display-ready leaderboard data.
        default_columns: Columns shown initially (those missing from the data are skipped).
        presets: ``(button label, callback)`` pairs; each callback returns the
            checkbox update applied when its button is clicked.
        choices: Choices for the checkbox group.
        selected: Initial checkbox selection.
        pinned_widths: Widths of the leading pinned columns; its length is also
            the number of pinned columns.
        metric_width: Width used for every remaining column.
        elem_id: Element id of the table (targeted by the CSS).
        on_change: Callback ``(selected_columns, full_df)`` producing the table update.

    Returns:
        The created ``gr.Dataframe`` component.
    """
    # Filter dataframe to show only default columns initially
    display_df = dataframe[[col for col in default_columns if col in dataframe.columns]]

    with gr.Row():
        with gr.Column(scale=1):
            preset_buttons = [
                (gr.Button(label, size="md"), preset_fn) for label, preset_fn in presets
            ]
        with gr.Column(scale=4):
            checkbox_group = gr.CheckboxGroup(
                choices=choices,
                value=selected,
                label="Evaluation Dimensions",
                interactive=True,
            )

    # NOTE(review): the table is placed below the controls row; confirm against the intended layout.
    data_component = gr.Dataframe(
        value=display_df,
        headers=list(display_df.columns),
        datatype=_column_datatypes(display_df.columns),
        interactive=False,
        visible=True,
        wrap=False,
        column_widths=list(pinned_widths)
        + [metric_width] * (len(display_df.columns) - len(pinned_widths)),
        pinned_columns=len(pinned_widths),
        elem_id=elem_id,
        max_height=10000,
    )

    # Setup event handlers
    # One State holds the full table for every handler instead of one per binding.
    full_df_state = gr.State(dataframe)
    refresh_inputs = [checkbox_group, full_df_state]

    for button, preset_fn in preset_buttons:
        # First set the checkboxes, then redraw the table from the new selection.
        button.click(
            preset_fn,
            inputs=None,
            outputs=[checkbox_group],
        ).then(
            fn=on_change,
            inputs=refresh_inputs,
            outputs=data_component,
        )

    checkbox_group.change(
        fn=on_change,
        inputs=refresh_inputs,
        outputs=data_component,
    )

    return data_component


# ============================================================================
# Generation Tab Utilities
# ============================================================================

def load_predict_json(json_path):
    """
    Load generation leaderboard JSON.

    The JSON should already be pre-processed by generate_predict_leaderboard.py
    with correct column names, ordering, sorting, and separate model/url fields.

    Args:
        json_path: Path to the records-oriented JSON file.

    Returns:
        DataFrame with a markdown 'Model' column, numeric columns formatted to
        one decimal place, and metric columns renamed to their abbreviations.
    """
    return _load_leaderboard_json(
        json_path,
        float_format=".1f",
        column_abbrev=PREDICT_COLUMN_ABBREV,
    )


def get_predict_checkbox_choices(dataframe):
    """Get checkbox choices with full name (abbrev) format"""
    return _checkbox_choices(dataframe, PREDICT_COLUMN_ABBREV, PREDICT_NEVER_HIDDEN_COLUMNS)


def select_predict_domain_score():
    """Return domain score for checkbox selection"""
    return gr.update(value=PREDICT_DOMAIN_SCORE_DIMENSIONS)


def select_predict_quality_score():
    """Return quality score for checkbox selection"""
    return gr.update(value=PREDICT_QUALITY_SCORE_DIMENSIONS)


def deselect_predict_all():
    """Deselect all dimensions (the entries of PREDICT_DESELECTED_COLUMNS stay checked)"""
    return gr.update(value=PREDICT_DESELECTED_COLUMNS)


def select_predict_all():
    """Select all dimensions"""
    return gr.update(value=PREDICT_ALL_SELECTED_COLUMNS)


def on_predict_dimension_selection_change(selected_columns, full_df):
    """Handle dimension selection changes and update the dataframe"""
    return _selection_update(selected_columns, full_df, PREDICT_NEVER_HIDDEN_COLUMNS)


def init_predict_leaderboard(dataframe):
    """
    Initialize the Generation leaderboard with given dataframe

    Args:
        dataframe: Output of ``load_predict_json``.

    Returns:
        The ``gr.Dataframe`` component showing the leaderboard.

    Raises:
        ValueError: If ``dataframe`` is None or empty.
    """
    _require_rows(dataframe)
    return _build_leaderboard(
        dataframe,
        default_columns=PREDICT_DEFAULT_DISPLAYED_COLUMNS,
        presets=[
            ("Domain Score", select_predict_domain_score),
            ("Quality Score", select_predict_quality_score),
            ("Select All", select_predict_all),
            ("Deselect All", deselect_predict_all),
        ],
        # Get checkbox choices with "Full Name (Abbrev)" format
        choices=get_predict_checkbox_choices(dataframe),
        selected=[col for col in PREDICT_ALL_SELECTED_COLUMNS if col in dataframe.columns],
        pinned_widths=["320px"],
        metric_width="80px",
        elem_id="predict_leaderboard",
        on_change=on_predict_dimension_selection_change,
    )


# ============================================================================
# Conditional Generation Tab Configuration and Utilities
# ============================================================================

TRANSFER_COLUMN_ORDER = [
    'Model',
    'Condition',
    'Blur SSIM ↑',
    'Edge F1 ↑',
    'Depth si-RMSE ↓',
    'Mask mIoU ↑',
    'Quality Score ↑',
    'Diversity ↑',
]

TRANSFER_HIDDEN_COLUMNS = []

TRANSFER_QUALITY_DIMENSIONS = [
    'Blur SSIM ↑',
    'Edge F1 ↑',
    'Depth si-RMSE ↓',
    'Mask mIoU ↑',
    'Quality Score ↑',
    'Diversity ↑',
]

TRANSFER_ALL_SELECTED_COLUMNS = TRANSFER_QUALITY_DIMENSIONS

TRANSFER_NEVER_HIDDEN_COLUMNS = ['Model', 'Condition']

TRANSFER_DEFAULT_DISPLAYED_COLUMNS = TRANSFER_NEVER_HIDDEN_COLUMNS + TRANSFER_ALL_SELECTED_COLUMNS


def load_transfer_json(json_path):
    """
    Load conditional generation leaderboard JSON

    Numeric columns other than 'Model' and 'Condition' are formatted to three
    decimal places.
    """
    return _load_leaderboard_json(
        json_path,
        float_format=".3f",
        raw_columns=('Model', 'Condition'),
    )


def select_transfer_all():
    """Select all dimensions"""
    return gr.update(value=TRANSFER_ALL_SELECTED_COLUMNS)


def deselect_transfer_all():
    """Deselect all dimensions"""
    return gr.update(value=[])


def on_transfer_dimension_selection_change(selected_columns, full_df):
    """Handle dimension selection changes and update the dataframe"""
    return _selection_update(selected_columns, full_df, TRANSFER_NEVER_HIDDEN_COLUMNS)


def init_transfer_leaderboard(dataframe):
    """
    Initialize the Conditional Generation leaderboard with given dataframe

    Args:
        dataframe: Output of ``load_transfer_json``.

    Returns:
        The ``gr.Dataframe`` component showing the leaderboard.

    Raises:
        ValueError: If ``dataframe`` is None or empty.
    """
    _require_rows(dataframe)
    # Every column that is not pinned can be toggled; this tab uses no abbreviations.
    dimension_choices = [
        col for col in dataframe.columns if col not in TRANSFER_NEVER_HIDDEN_COLUMNS
    ]
    return _build_leaderboard(
        dataframe,
        default_columns=TRANSFER_DEFAULT_DISPLAYED_COLUMNS,
        presets=[
            ("Select All", select_transfer_all),
            ("Deselect All", deselect_transfer_all),
        ],
        choices=dimension_choices,
        selected=[col for col in TRANSFER_DEFAULT_DISPLAYED_COLUMNS if col in dimension_choices],
        pinned_widths=["280px", "120px"],
        metric_width="150px",
        elem_id="transfer_leaderboard",
        on_change=on_transfer_dimension_selection_change,
    )


# ============================================================================
# Understanding Tab Configuration and Utilities
# ============================================================================

# Column name to abbreviation mapping for display
REASON_COLUMN_ABBREV = {
    'Common Sense': 'CS',
    'Embodied Reasoning': 'ER',
    'BridgeData V2': 'BD',
    'RoboVQA': 'RV',
    'RoboFail': 'RF',
    'Agibot': 'AB',
    'HoloAssist': 'HA',
}

# Desired column order (full names from JSON)
REASON_COLUMN_ORDER = [
    'Model',
    'Thinking',
    'Overall',
    'Common Sense',
    'Embodied Reasoning',
    'Space',
    'Time',
    'Physics',
    'BridgeData V2',
    'RoboVQA',
    'RoboFail',
    'Agibot',
    'HoloAssist',
    'AV',
]

# Columns to hide by default (but still available for filtering/selection)
REASON_HIDDEN_COLUMNS = []

# Common Sense dimensions (for selection button) - use abbreviations matching dataframe
REASON_COMMON_SENSE_DIMENSIONS = [
    'CS',
    'Space',
    'Time',
    'Physics',
]

# Embodied Reasoning dimensions (for selection button) - use abbreviations matching dataframe
REASON_EMBODIED_REASONING_DIMENSIONS = [
    'ER',
    'Space',
    'Time',
    'Physics',
    'BD',
    'RV',
    'RF',
    'AB',
    'HA',
    'AV',
]

# Selection applied by the "Deselect All" button: the two aggregate scores stay checked.
REASON_DESELECTED_COLUMNS = [
    'CS',
    'ER',
]

REASON_ALL_SELECTED_COLUMNS = [
    'CS',
    'ER',
    'Space',
    'Time',
    'Physics',
    'BD',
    'RV',
    'RF',
    'AB',
    'HA',
    'AV',
]

# Columns that can never be deselected
REASON_NEVER_HIDDEN_COLUMNS = ['Model', 'Thinking', 'Overall']

# Columns displayed by default (using renamed column names)
REASON_DEFAULT_DISPLAYED_COLUMNS = REASON_NEVER_HIDDEN_COLUMNS + REASON_ALL_SELECTED_COLUMNS


def load_reason_json(json_path):
    """
    Load understanding leaderboard JSON

    Numeric columns other than 'Model' are formatted to one decimal place and
    metric columns are renamed to their abbreviations.
    """
    # NOTE(review): a boolean 'Thinking' column would count as numeric and be formatted too — confirm its type in the data.
    return _load_leaderboard_json(
        json_path,
        float_format=".1f",
        column_abbrev=REASON_COLUMN_ABBREV,
    )


def get_reason_checkbox_choices(dataframe):
    """Get checkbox choices with full name (abbrev) format"""
    return _checkbox_choices(dataframe, REASON_COLUMN_ABBREV, REASON_NEVER_HIDDEN_COLUMNS)


def select_reason_common_sense_dimensions():
    """Return Common Sense dimensions for checkbox selection"""
    return gr.update(value=REASON_COMMON_SENSE_DIMENSIONS)


def select_reason_embodied_reasoning_dimensions():
    """Return Embodied Reasoning dimensions for checkbox selection"""
    return gr.update(value=REASON_EMBODIED_REASONING_DIMENSIONS)


def deselect_reason_all():
    """Deselect all dimensions (the entries of REASON_DESELECTED_COLUMNS stay checked)"""
    return gr.update(value=REASON_DESELECTED_COLUMNS)


def select_reason_all():
    """Select all dimensions"""
    return gr.update(value=REASON_ALL_SELECTED_COLUMNS)


def on_reason_dimension_selection_change(selected_columns, full_df):
    """Handle dimension selection changes and update the dataframe"""
    return _selection_update(selected_columns, full_df, REASON_NEVER_HIDDEN_COLUMNS)


def init_reason_leaderboard(dataframe):
    """
    Initialize the Understanding leaderboard with given dataframe

    Args:
        dataframe: Output of ``load_reason_json``.

    Returns:
        The ``gr.Dataframe`` component showing the leaderboard.

    Raises:
        ValueError: If ``dataframe`` is None or empty.
    """
    _require_rows(dataframe)
    return _build_leaderboard(
        dataframe,
        default_columns=REASON_DEFAULT_DISPLAYED_COLUMNS,
        presets=[
            ("Common Sense", select_reason_common_sense_dimensions),
            ("Embodied Reasoning", select_reason_embodied_reasoning_dimensions),
            ("Select All", select_reason_all),
            ("Deselect All", deselect_reason_all),
        ],
        # Get checkbox choices with "Full Name (Abbrev)" format
        choices=get_reason_checkbox_choices(dataframe),
        selected=[col for col in REASON_ALL_SELECTED_COLUMNS if col in dataframe.columns],
        # Three columns are always shown, but only the first two are pinned.
        pinned_widths=["320px", "100px"],
        metric_width="100px",
        elem_id="reason_leaderboard",
        on_change=on_reason_dimension_selection_change,
    )


# ============================================================================
# Main Application
# ============================================================================

demo = gr.Blocks()

with demo:
    # Inject the stylesheet; without the <style> wrapper CSS is never applied.
    gr.HTML(f"<style>{CSS}</style>")
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🎨 Generation", elem_id="predict-tab", id=0):
            predict_df = load_predict_json("data/generation-leaderboard.json")
            predict_leaderboard = init_predict_leaderboard(predict_df)

        with gr.TabItem("🔄 Conditional Generation", elem_id="transfer-tab", id=1):
            transfer_df = load_transfer_json("data/conditional_generation-leaderboard.json")
            transfer_leaderboard = init_transfer_leaderboard(transfer_df)

        with gr.TabItem("🧠 Understanding", elem_id="reason-tab", id=2):
            reason_df = load_reason_json("data/understanding-leaderboard.json")
            reason_leaderboard = init_reason_leaderboard(reason_df)

        with gr.TabItem("ℹ️ About", elem_id="about-tab", id=3):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

demo.launch()