import gradio as gr
import pandas as pd
# Leaderboard name. The main application renders this string with gr.HTML,
# so it carries its own heading markup; bare text would be displayed as
# ordinary body text instead of a page title.
TITLE = """
<h1 align="center" id="space-title">Physical AI Bench Leaderboard</h1>
"""
# Stylesheet intended to let the leaderboard tables grow to their full height.
# The selectors use the ids predict_leaderboard, transfer_leaderboard and
# reason_leaderboard, which are the elem_id values given to the three
# gr.Dataframe components in this file. Every rule lifts the height /
# max-height limits; the ".tbody" rule additionally allows horizontal
# scrolling and hides vertical overflow.
CSS = """
#predict_leaderboard, #transfer_leaderboard, #reason_leaderboard {
height: auto !important;
max-height: none !important;
}
#predict_leaderboard .wrap, #transfer_leaderboard .wrap, #reason_leaderboard .wrap {
max-height: none !important;
height: auto !important;
}
#predict_leaderboard .tbody, #transfer_leaderboard .tbody, #reason_leaderboard .tbody {
max-height: none !important;
height: auto !important;
overflow-x: auto !important;
overflow-y: hidden !important;
}
"""
# Short Markdown introduction describing what the leaderboard evaluates.
# The main application shows it with gr.Markdown directly below the title.
INTRODUCTION_TEXT = """
**Physical AI Bench (PAI-Bench)** is a comprehensive benchmark suite for evaluating physical AI generation and understanding across diverse scenarios including autonomous vehicles, robotics, industrial spaces, and everyday ego-centric environments.
"""
# Markdown body of the "About" tab: how the leaderboard works, links to the
# repository and datasets, reproduction instructions and the BibTeX citation.
# NOTE(review): the characters in front of the bullet labels and resource
# links (e.g. before "Generation" and "[GitHub Repository]") look like emoji
# that were decoded with the wrong character set - confirm against a UTF-8
# copy of this file before changing them.
LLM_BENCHMARKS_TEXT = """
## How it works
This leaderboard tracks model performance across three core dimensions:
- **đ¨ Generation**: Evaluates world foundation models' ability to predict future states across 1,044 diverse physical scenarios
- **đ Conditional Generation**: Focuses on world model generation with complex control signals, featuring 600 videos across robotic arm operations, autonomous driving, and ego-centric scenes
- **đ§ Understanding**: Evaluates understanding and reasoning about physical scenes, with 1,214 embodied reasoning scenarios focused on autonomous vehicle actions
PAI-Bench covers multiple physical AI domains including autonomous driving, robotics, industrial spaces, physics simulations, human interactions, and common sense reasoning.
### Resources
- đ [GitHub Repository](https://github.com/SHI-Labs/physical-ai-bench)
- đ [Generation Dataset](https://huggingface.co/datasets/shi-labs/physical-ai-bench-predict)
- đ [Conditional Generation Dataset](https://huggingface.co/datasets/shi-labs/physical-ai-bench-transfer)
- đ [Understanding Dataset](https://huggingface.co/datasets/shi-labs/physical-ai-bench-reason)
- đĻ [Artifacts](https://huggingface.co/datasets/Leymore/physical-ai-bench-artifacts)
## Reproducibility
To evaluate your models on PAI-Bench, visit our [GitHub repository](https://github.com/SHI-Labs/physical-ai-bench) for evaluation scripts and detailed instructions.
## Citation
If you use Physical AI Bench in your research, please cite:
```bibtex
@misc{zhou2025paibenchcomprehensivebenchmarkphysical,
title={PAI-Bench: A Comprehensive Benchmark For Physical AI},
author={Fengzhe Zhou and Jiannan Huang and Jialuo Li and Deva Ramanan and Humphrey Shi},
year={2025},
eprint={2512.01989},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2512.01989},
}
```
"""
# ============================================================================
# Model Links Utility
# ============================================================================
def create_model_link(model_name):
    """
    Turn an "org/model-name" identifier into a Markdown link to Hugging Face.

    Args:
        model_name: Identifier such as "org/model-name", a plain name, or any
            non-string value.

    Returns:
        ``[model-name](https://huggingface.co/org/model-name)`` when the input
        is a string containing a slash; otherwise the input itself, unchanged.
    """
    # Only slash-separated strings are linkable; everything else passes through.
    if not (isinstance(model_name, str) and '/' in model_name):
        return model_name

    # The link text is whatever follows the last slash.
    short_name = model_name.rsplit('/', 1)[-1]
    return f"[{short_name}](https://huggingface.co/{model_name})"
# ============================================================================
# Generation Tab Configuration and Utilities
# ============================================================================
# Full column name (as found in the JSON) -> short header used in the table.
PREDICT_COLUMN_ABBREV = {
    # domain scores
    'Common Sense': 'CS', 'AV': 'AV', 'Robot': 'RO',
    'Industry': 'IN', 'Human': 'HU', 'Physics': 'PH',
    # quality scores
    'Subject Consistency': 'SC', 'Background Consistency': 'BC',
    'Motion Smoothness': 'MS', 'Aesthetic Quality': 'AQ',
    'Imaging Quality': 'IQ', 'Overall Consistency': 'OC',
    'I2V Subject': 'IS', 'I2V Background': 'IB',
}

# Expected column order, using the full (un-abbreviated) JSON names.
PREDICT_COLUMN_ORDER = [
    'Model', 'Overall', 'Domain', 'Quality',
    'Common Sense', 'AV', 'Robot', 'Industry', 'Human', 'Physics',
    'Subject Consistency', 'Background Consistency', 'Motion Smoothness',
    'Aesthetic Quality', 'Imaging Quality', 'Overall Consistency',
    'I2V Subject', 'I2V Background',
]

# Columns to hide by default (currently none).
PREDICT_HIDDEN_COLUMNS = []

# Abbreviated headers, grouped the way the preset buttons use them.
_PREDICT_DOMAIN_ABBREVS = ['CS', 'AV', 'RO', 'IN', 'HU', 'PH']
_PREDICT_QUALITY_ABBREVS = ['SC', 'BC', 'MS', 'AQ', 'IQ', 'OC', 'IS', 'IB']

# Selection applied by the "Domain Score" button.
PREDICT_DOMAIN_SCORE_DIMENSIONS = ['Domain', *_PREDICT_DOMAIN_ABBREVS]

# Selection applied by the "Quality Score" button.
PREDICT_QUALITY_SCORE_DIMENSIONS = ['Quality', *_PREDICT_QUALITY_ABBREVS]

# Selection applied by the "Deselect All" button: only the two aggregates stay.
PREDICT_DESELECTED_COLUMNS = ['Domain', 'Quality']

# Selection applied by the "Select All" button: aggregates, then every
# domain dimension, then every quality dimension.
PREDICT_ALL_SELECTED_COLUMNS = [
    *PREDICT_DESELECTED_COLUMNS,
    *_PREDICT_DOMAIN_ABBREVS,
    *_PREDICT_QUALITY_ABBREVS,
]

# Columns that are always shown, whatever the checkbox selection is.
PREDICT_NEVER_HIDDEN_COLUMNS = ['Model', 'Overall']

# Columns shown when the tab is first rendered.
PREDICT_DEFAULT_DISPLAYED_COLUMNS = [
    *PREDICT_NEVER_HIDDEN_COLUMNS,
    *PREDICT_ALL_SELECTED_COLUMNS,
]
def load_predict_json(json_path):
    """
    Load the generation leaderboard JSON into a display-ready DataFrame.

    Per the original author's note, the JSON is expected to be pre-processed
    by generate_predict_leaderboard.py (column names, ordering, sorting and
    separate model/url fields).

    Steps applied here:
    - when both ``model`` and ``url`` columns exist, each model becomes a
      Markdown link ``[name](url)`` whose text is the part of the model name
      after the last slash, and the ``url`` column is dropped;
    - ``model`` is renamed to ``Model``;
    - every numeric column is converted to strings with one decimal place
      (missing values are left as they are);
    - columns are renamed to their abbreviations via PREDICT_COLUMN_ABBREV.

    Args:
        json_path: Path (or other source accepted by ``pd.read_json``) of a
            records-oriented JSON file.

    Returns:
        The processed ``pandas.DataFrame``.
    """
    df = pd.read_json(json_path, orient='records')

    if 'model' in df.columns:
        if 'url' in df.columns:
            def create_link(model, url):
                # Fall back to the bare name when there is no usable URL
                # (missing, NaN or empty) or the name itself is not text.
                if not isinstance(model, str) or not isinstance(url, str) or not url:
                    return model
                return f"[{model.split('/')[-1]}]({url})"

            df['model'] = [create_link(m, u) for m, u in zip(df['model'], df['url'])]
            df = df.drop(columns=['url'])
        # Rename even when there is no 'url' column: the rest of the app
        # always looks the column up as 'Model'.
        df = df.rename(columns={'model': 'Model'})

    for col in df.columns:
        if col != 'Model' and pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].apply(lambda x: f"{x:.1f}" if pd.notna(x) else x)

    # Rename columns to abbreviations for display
    df = df.rename(columns=PREDICT_COLUMN_ABBREV)
    return df
def get_predict_checkbox_choices(dataframe):
    """
    Build ``(label, value)`` pairs for the Generation tab's dimension checkboxes.

    Columns listed in PREDICT_NEVER_HIDDEN_COLUMNS are skipped because they
    cannot be toggled. For an abbreviated column the label is
    ``"Full Name (ABBR)"``; when the full name and the abbreviation are the
    same (e.g. ``AV``) or the column has no abbreviation, the label is just
    the column name, which avoids labels such as ``"AV (AV)"``.

    Args:
        dataframe: Leaderboard DataFrame whose columns already use the
            abbreviated names.

    Returns:
        List of ``(label, column_name)`` tuples in dataframe column order.
    """
    # Reverse mapping: abbreviation -> full name
    abbrev_to_full = {abbrev: full for full, abbrev in PREDICT_COLUMN_ABBREV.items()}

    choices = []
    for col in dataframe.columns:
        if col in PREDICT_NEVER_HIDDEN_COLUMNS:
            continue
        full_name = abbrev_to_full.get(col, col)
        label = col if full_name == col else f"{full_name} ({col})"
        choices.append((label, col))
    return choices
def select_predict_domain_score():
    """Checkbox update for the "Domain Score" preset (PREDICT_DOMAIN_SCORE_DIMENSIONS)."""
    domain_preset = PREDICT_DOMAIN_SCORE_DIMENSIONS
    return gr.update(value=domain_preset)
def select_predict_quality_score():
    """Checkbox update for the "Quality Score" preset (PREDICT_QUALITY_SCORE_DIMENSIONS)."""
    quality_preset = PREDICT_QUALITY_SCORE_DIMENSIONS
    return gr.update(value=quality_preset)
def deselect_predict_all():
    """
    Checkbox update for "Deselect All".

    The selection is reset to PREDICT_DESELECTED_COLUMNS rather than emptied.
    """
    reset_selection = PREDICT_DESELECTED_COLUMNS
    return gr.update(value=reset_selection)
def select_predict_all():
    """Checkbox update for "Select All" (PREDICT_ALL_SELECTED_COLUMNS)."""
    everything = PREDICT_ALL_SELECTED_COLUMNS
    return gr.update(value=everything)
def on_predict_dimension_selection_change(selected_columns, full_df):
    """
    Rebuild the Generation table for the current checkbox selection.

    Args:
        selected_columns: Column names currently ticked in the checkbox group.
        full_df: Complete leaderboard DataFrame to slice from.

    Returns:
        A ``gr.update`` carrying the sliced data, its headers and datatypes.
        'Model' and 'Overall' always come first, followed by the selected
        columns that exist in ``full_df``, in selection order, without repeats.
    """
    # A dict preserves first-seen order while dropping duplicates.
    ordered = dict.fromkeys(['Model', 'Overall'])
    ordered.update(dict.fromkeys(
        name for name in selected_columns if name in full_df.columns
    ))
    present_columns = list(ordered)

    # Only the model column holds Markdown links; the rest is plain text.
    datatypes = ['markdown' if name == 'Model' else 'str' for name in present_columns]

    return gr.update(
        value=full_df[present_columns],
        datatype=datatypes,
        headers=present_columns,
    )
def init_predict_leaderboard(dataframe):
    """
    Build the Generation tab: preset buttons, dimension checkboxes and table.

    The function creates Gradio components and registers event handlers, so
    it is meant to be called inside a Blocks context, as the main application
    does.

    Args:
        dataframe: Leaderboard DataFrame as returned by ``load_predict_json``.

    Returns:
        The ``gr.Dataframe`` component showing the leaderboard.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Show only the default columns that actually exist in the data.
    available_default_cols = [
        col for col in PREDICT_DEFAULT_DISPLAYED_COLUMNS if col in dataframe.columns
    ]
    display_df = dataframe[available_default_cols]

    # The model column contains Markdown links; everything else is text.
    datatypes = ['markdown' if col == 'Model' else 'str' for col in display_df.columns]

    with gr.Row():
        with gr.Column(scale=1):
            domain_score_btn = gr.Button("Domain Score", size="md")
            quality_score_btn = gr.Button("Quality Score", size="md")
            select_all_btn = gr.Button("Select All", size="md")
            deselect_btn = gr.Button("Deselect All", size="md")
        with gr.Column(scale=4):
            # Checkbox labels use the "Full Name (Abbrev)" format
            checkbox_choices = get_predict_checkbox_choices(dataframe)
            checkbox_group = gr.CheckboxGroup(
                choices=checkbox_choices,
                value=[col for col in PREDICT_ALL_SELECTED_COLUMNS if col in dataframe.columns],
                label="Evaluation Dimensions",
                interactive=True,
            )

    data_component = gr.Dataframe(
        value=display_df,
        headers=list(display_df.columns),
        datatype=datatypes,
        interactive=False,
        visible=True,
        wrap=False,
        column_widths=["320px"] + ["80px"] * (len(display_df.columns) - 1),
        pinned_columns=1,
        elem_id="predict_leaderboard",
        max_height=10000,
    )

    # One State holding the full dataframe is shared by every handler instead
    # of creating a separate State component per event.
    full_df_state = gr.State(dataframe)
    refresh_inputs = [checkbox_group, full_df_state]

    # Each preset button first overwrites the checkbox selection, then
    # re-renders the table from that selection.
    preset_handlers = (
        (domain_score_btn, select_predict_domain_score),
        (quality_score_btn, select_predict_quality_score),
        (deselect_btn, deselect_predict_all),
        (select_all_btn, select_predict_all),
    )
    for button, preset_fn in preset_handlers:
        button.click(
            preset_fn,
            inputs=None,
            outputs=[checkbox_group],
        ).then(
            fn=on_predict_dimension_selection_change,
            inputs=refresh_inputs,
            outputs=data_component,
        )

    # Manual checkbox changes re-render the table as well.
    checkbox_group.change(
        fn=on_predict_dimension_selection_change,
        inputs=refresh_inputs,
        outputs=data_component,
    )
    return data_component
# ============================================================================
# Conditional Generation Tab Configuration and Utilities
# ============================================================================
# Columns that are always shown, whatever the checkbox selection is.
TRANSFER_NEVER_HIDDEN_COLUMNS = ['Model', 'Condition']

# Columns to hide by default (currently none).
TRANSFER_HIDDEN_COLUMNS = []

# Metric columns that can be toggled with the checkboxes.
# NOTE(review): the trailing character of each name looks like a mis-encoded
# arrow symbol. These strings are matched against the DataFrame column names,
# so confirm the encoding against the JSON data before changing them.
TRANSFER_QUALITY_DIMENSIONS = [
    'Blur SSIM â', 'Edge F1 â', 'Depth si-RMSE â',
    'Mask mIoU â', 'Quality Score â', 'Diversity â',
]

# Expected column order: the fixed columns followed by every metric.
TRANSFER_COLUMN_ORDER = [
    'Model', 'Condition',
    'Blur SSIM â', 'Edge F1 â', 'Depth si-RMSE â',
    'Mask mIoU â', 'Quality Score â', 'Diversity â',
]

# "Select All" ticks every metric (same list object as the metrics above).
TRANSFER_ALL_SELECTED_COLUMNS = TRANSFER_QUALITY_DIMENSIONS

# Columns shown when the tab is first rendered.
TRANSFER_DEFAULT_DISPLAYED_COLUMNS = [
    *TRANSFER_NEVER_HIDDEN_COLUMNS,
    *TRANSFER_ALL_SELECTED_COLUMNS,
]
def load_transfer_json(json_path):
    """
    Load the conditional generation leaderboard JSON into a display-ready DataFrame.

    Steps applied here:
    - when both ``model`` and ``url`` columns exist, each model becomes a
      Markdown link ``[name](url)`` whose text is the part of the model name
      after the last slash, and the ``url`` column is dropped;
    - ``model`` is renamed to ``Model``;
    - every numeric column except ``Model`` and ``Condition`` is converted to
      strings with three decimal places (missing values are left as they are).

    Args:
        json_path: Path (or other source accepted by ``pd.read_json``) of a
            records-oriented JSON file.

    Returns:
        The processed ``pandas.DataFrame``.
    """
    df = pd.read_json(json_path, orient='records')

    if 'model' in df.columns:
        if 'url' in df.columns:
            def create_link(model, url):
                # Fall back to the bare name when there is no usable URL
                # (missing, NaN or empty) or the name itself is not text.
                if not isinstance(model, str) or not isinstance(url, str) or not url:
                    return model
                return f"[{model.split('/')[-1]}]({url})"

            df['model'] = [create_link(m, u) for m, u in zip(df['model'], df['url'])]
            df = df.drop(columns=['url'])
        # Rename even when there is no 'url' column: the rest of the app
        # always looks the column up as 'Model'.
        df = df.rename(columns={'model': 'Model'})

    for col in df.columns:
        if col not in ('Model', 'Condition') and pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].apply(lambda x: f"{x:.3f}" if pd.notna(x) else x)
    return df
def select_transfer_all():
    """Checkbox update for "Select All" (TRANSFER_ALL_SELECTED_COLUMNS)."""
    everything = TRANSFER_ALL_SELECTED_COLUMNS
    return gr.update(value=everything)
def deselect_transfer_all():
    """Checkbox update for "Deselect All": clears the selection entirely."""
    nothing_selected = []
    return gr.update(value=nothing_selected)
def on_transfer_dimension_selection_change(selected_columns, full_df):
    """
    Rebuild the Conditional Generation table for the current checkbox selection.

    Args:
        selected_columns: Column names currently ticked in the checkbox group.
        full_df: Complete leaderboard DataFrame to slice from.

    Returns:
        A ``gr.update`` carrying the sliced data, its headers and datatypes.
        'Model' and 'Condition' always come first, followed by the selected
        columns that exist in ``full_df``, in selection order, without repeats.
    """
    # A dict preserves first-seen order while dropping duplicates.
    ordered = dict.fromkeys(['Model', 'Condition'])
    ordered.update(dict.fromkeys(
        name for name in selected_columns if name in full_df.columns
    ))
    present_columns = list(ordered)

    # Only the model column holds Markdown links; the rest is plain text.
    datatypes = ['markdown' if name == 'Model' else 'str' for name in present_columns]

    return gr.update(
        value=full_df[present_columns],
        datatype=datatypes,
        headers=present_columns,
    )
def init_transfer_leaderboard(dataframe):
    """
    Build the Conditional Generation tab: preset buttons, checkboxes and table.

    The function creates Gradio components and registers event handlers, so
    it is meant to be called inside a Blocks context, as the main application
    does.

    Args:
        dataframe: Leaderboard DataFrame as returned by ``load_transfer_json``.

    Returns:
        The ``gr.Dataframe`` component showing the leaderboard.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Show only the default columns that actually exist in the data.
    available_default_cols = [
        col for col in TRANSFER_DEFAULT_DISPLAYED_COLUMNS if col in dataframe.columns
    ]
    display_df = dataframe[available_default_cols]

    # The model column contains Markdown links; everything else is text.
    datatypes = ['markdown' if col == 'Model' else 'str' for col in display_df.columns]

    with gr.Row():
        with gr.Column(scale=1):
            select_all_btn = gr.Button("Select All", size="md")
            deselect_btn = gr.Button("Deselect All", size="md")
        with gr.Column(scale=4):
            # Every column except the always-visible ones can be toggled.
            dimension_choices = [
                col for col in dataframe.columns
                if col not in TRANSFER_NEVER_HIDDEN_COLUMNS
            ]
            checkbox_group = gr.CheckboxGroup(
                choices=dimension_choices,
                value=[col for col in TRANSFER_DEFAULT_DISPLAYED_COLUMNS if col in dimension_choices],
                label="Evaluation Dimensions",
                interactive=True,
            )

    data_component = gr.Dataframe(
        value=display_df,
        headers=list(display_df.columns),
        datatype=datatypes,
        interactive=False,
        visible=True,
        wrap=False,
        column_widths=["280px", "120px"] + ["150px"] * (len(display_df.columns) - 2),
        pinned_columns=2,
        elem_id="transfer_leaderboard",
        max_height=10000,
    )

    # One State holding the full dataframe is shared by every handler instead
    # of creating a separate State component per event.
    full_df_state = gr.State(dataframe)
    refresh_inputs = [checkbox_group, full_df_state]

    # Each preset button first overwrites the checkbox selection, then
    # re-renders the table from that selection.
    preset_handlers = (
        (deselect_btn, deselect_transfer_all),
        (select_all_btn, select_transfer_all),
    )
    for button, preset_fn in preset_handlers:
        button.click(
            preset_fn,
            inputs=None,
            outputs=[checkbox_group],
        ).then(
            fn=on_transfer_dimension_selection_change,
            inputs=refresh_inputs,
            outputs=data_component,
        )

    # Manual checkbox changes re-render the table as well.
    checkbox_group.change(
        fn=on_transfer_dimension_selection_change,
        inputs=refresh_inputs,
        outputs=data_component,
    )
    return data_component
# ============================================================================
# Understanding Tab Configuration and Utilities
# ============================================================================
# Full column name (as found in the JSON) -> short header used in the table.
REASON_COLUMN_ABBREV = {
    'Common Sense': 'CS', 'Embodied Reasoning': 'ER',
    'BridgeData V2': 'BD', 'RoboVQA': 'RV', 'RoboFail': 'RF',
    'Agibot': 'AB', 'HoloAssist': 'HA',
}

# Desired column order, using the full (un-abbreviated) JSON names.
REASON_COLUMN_ORDER = [
    'Model', 'Thinking', 'Overall',
    'Common Sense', 'Embodied Reasoning',
    'Space', 'Time', 'Physics',
    'BridgeData V2', 'RoboVQA', 'RoboFail', 'Agibot', 'HoloAssist', 'AV',
]

# Columns to hide by default (currently none).
REASON_HIDDEN_COLUMNS = []

# Header groups reused by the presets below.
_REASON_SHARED_DIMENSIONS = ['Space', 'Time', 'Physics']
_REASON_DATASET_DIMENSIONS = ['BD', 'RV', 'RF', 'AB', 'HA', 'AV']

# Selection applied by the "Common Sense" button.
REASON_COMMON_SENSE_DIMENSIONS = ['CS', *_REASON_SHARED_DIMENSIONS]

# Selection applied by the "Embodied Reasoning" button.
REASON_EMBODIED_REASONING_DIMENSIONS = [
    'ER',
    *_REASON_SHARED_DIMENSIONS,
    *_REASON_DATASET_DIMENSIONS,
]

# Selection applied by the "Deselect All" button: only the two aggregates stay.
REASON_DESELECTED_COLUMNS = ['CS', 'ER']

# Selection applied by the "Select All" button.
REASON_ALL_SELECTED_COLUMNS = [
    *REASON_DESELECTED_COLUMNS,
    *_REASON_SHARED_DIMENSIONS,
    *_REASON_DATASET_DIMENSIONS,
]

# Columns that are always shown, whatever the checkbox selection is.
REASON_NEVER_HIDDEN_COLUMNS = ['Model', 'Thinking', 'Overall']

# Columns shown when the tab is first rendered (abbreviated names).
REASON_DEFAULT_DISPLAYED_COLUMNS = [
    *REASON_NEVER_HIDDEN_COLUMNS,
    *REASON_ALL_SELECTED_COLUMNS,
]
def load_reason_json(json_path):
    """
    Load the understanding leaderboard JSON into a display-ready DataFrame.

    Steps applied here:
    - when both ``model`` and ``url`` columns exist, each model becomes a
      Markdown link ``[name](url)`` whose text is the part of the model name
      after the last slash, and the ``url`` column is dropped;
    - ``model`` is renamed to ``Model``;
    - every numeric column is converted to strings with one decimal place
      (missing values are left as they are);
    - columns are renamed to their abbreviations via REASON_COLUMN_ABBREV.

    Args:
        json_path: Path (or other source accepted by ``pd.read_json``) of a
            records-oriented JSON file.

    Returns:
        The processed ``pandas.DataFrame``.
    """
    df = pd.read_json(json_path, orient='records')

    if 'model' in df.columns:
        if 'url' in df.columns:
            def create_link(model, url):
                # Fall back to the bare name when there is no usable URL
                # (missing, NaN or empty) or the name itself is not text.
                if not isinstance(model, str) or not isinstance(url, str) or not url:
                    return model
                return f"[{model.split('/')[-1]}]({url})"

            df['model'] = [create_link(m, u) for m, u in zip(df['model'], df['url'])]
            df = df.drop(columns=['url'])
        # Rename even when there is no 'url' column: the rest of the app
        # always looks the column up as 'Model'.
        df = df.rename(columns={'model': 'Model'})

    for col in df.columns:
        if col != 'Model' and pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].apply(lambda x: f"{x:.1f}" if pd.notna(x) else x)

    # Rename columns to abbreviations for display
    df = df.rename(columns=REASON_COLUMN_ABBREV)
    return df
def get_reason_checkbox_choices(dataframe):
    """
    Build ``(label, value)`` pairs for the Understanding tab's dimension checkboxes.

    'Model', 'Thinking' and 'Overall' are skipped. A column whose name is an
    abbreviation from REASON_COLUMN_ABBREV gets the label
    ``"Full Name (ABBR)"``; any other column is labelled with its own name.

    Args:
        dataframe: Leaderboard DataFrame whose columns already use the
            abbreviated names.

    Returns:
        List of ``(label, column_name)`` tuples in dataframe column order.
    """
    fixed_columns = ('Model', 'Thinking', 'Overall')
    # Reverse mapping: abbreviation -> full name
    full_names = {short: long for long, short in REASON_COLUMN_ABBREV.items()}

    def as_choice(name):
        if name in full_names:
            return (f"{full_names[name]} ({name})", name)
        return (name, name)

    return [as_choice(name) for name in dataframe.columns if name not in fixed_columns]
def select_reason_common_sense_dimensions():
    """Checkbox update for the "Common Sense" preset (REASON_COMMON_SENSE_DIMENSIONS)."""
    common_sense_preset = REASON_COMMON_SENSE_DIMENSIONS
    return gr.update(value=common_sense_preset)
def select_reason_embodied_reasoning_dimensions():
    """Checkbox update for the "Embodied Reasoning" preset (REASON_EMBODIED_REASONING_DIMENSIONS)."""
    embodied_preset = REASON_EMBODIED_REASONING_DIMENSIONS
    return gr.update(value=embodied_preset)
def deselect_reason_all():
    """
    Checkbox update for "Deselect All".

    The selection is reset to REASON_DESELECTED_COLUMNS rather than emptied.
    """
    reset_selection = REASON_DESELECTED_COLUMNS
    return gr.update(value=reset_selection)
def select_reason_all():
    """Checkbox update for "Select All" (REASON_ALL_SELECTED_COLUMNS)."""
    everything = REASON_ALL_SELECTED_COLUMNS
    return gr.update(value=everything)
def on_reason_dimension_selection_change(selected_columns, full_df):
    """
    Rebuild the Understanding table for the current checkbox selection.

    Args:
        selected_columns: Column names currently ticked in the checkbox group.
        full_df: Complete leaderboard DataFrame to slice from.

    Returns:
        A ``gr.update`` carrying the sliced data, its headers and datatypes.
        'Model', 'Thinking' and 'Overall' always come first, followed by the
        selected columns that exist in ``full_df``, in selection order,
        without repeats.
    """
    # A dict preserves first-seen order while dropping duplicates.
    ordered = dict.fromkeys(['Model', 'Thinking', 'Overall'])
    ordered.update(dict.fromkeys(
        name for name in selected_columns if name in full_df.columns
    ))
    present_columns = list(ordered)

    # Only the model column holds Markdown links; the rest is plain text.
    datatypes = ['markdown' if name == 'Model' else 'str' for name in present_columns]

    return gr.update(
        value=full_df[present_columns],
        datatype=datatypes,
        headers=present_columns,
    )
def init_reason_leaderboard(dataframe):
    """
    Build the Understanding tab: preset buttons, dimension checkboxes and table.

    The function creates Gradio components and registers event handlers, so
    it is meant to be called inside a Blocks context, as the main application
    does.

    Args:
        dataframe: Leaderboard DataFrame as returned by ``load_reason_json``.

    Returns:
        The ``gr.Dataframe`` component showing the leaderboard.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Show only the default columns that actually exist in the data.
    available_default_cols = [
        col for col in REASON_DEFAULT_DISPLAYED_COLUMNS if col in dataframe.columns
    ]
    display_df = dataframe[available_default_cols]

    # The model column contains Markdown links; everything else is text.
    datatypes = ['markdown' if col == 'Model' else 'str' for col in display_df.columns]

    with gr.Row():
        with gr.Column(scale=1):
            common_sense_btn = gr.Button("Common Sense", size="md")
            embodied_reasoning_btn = gr.Button("Embodied Reasoning", size="md")
            select_all_btn = gr.Button("Select All", size="md")
            deselect_btn = gr.Button("Deselect All", size="md")
        with gr.Column(scale=4):
            # Checkbox labels use the "Full Name (Abbrev)" format
            checkbox_choices = get_reason_checkbox_choices(dataframe)
            checkbox_group = gr.CheckboxGroup(
                choices=checkbox_choices,
                value=[col for col in REASON_ALL_SELECTED_COLUMNS if col in dataframe.columns],
                label="Evaluation Dimensions",
                interactive=True,
            )

    data_component = gr.Dataframe(
        value=display_df,
        headers=list(display_df.columns),
        datatype=datatypes,
        interactive=False,
        visible=True,
        wrap=False,
        column_widths=["320px", "100px"] + ["100px"] * (len(display_df.columns) - 2),
        pinned_columns=2,
        elem_id="reason_leaderboard",
        max_height=10000,
    )

    # One State holding the full dataframe is shared by every handler instead
    # of creating a separate State component per event.
    full_df_state = gr.State(dataframe)
    refresh_inputs = [checkbox_group, full_df_state]

    # Each preset button first overwrites the checkbox selection, then
    # re-renders the table from that selection.
    preset_handlers = (
        (common_sense_btn, select_reason_common_sense_dimensions),
        (embodied_reasoning_btn, select_reason_embodied_reasoning_dimensions),
        (deselect_btn, deselect_reason_all),
        (select_all_btn, select_reason_all),
    )
    for button, preset_fn in preset_handlers:
        button.click(
            preset_fn,
            inputs=None,
            outputs=[checkbox_group],
        ).then(
            fn=on_reason_dimension_selection_change,
            inputs=refresh_inputs,
            outputs=data_component,
        )

    # Manual checkbox changes re-render the table as well.
    checkbox_group.change(
        fn=on_reason_dimension_selection_change,
        inputs=refresh_inputs,
        outputs=data_component,
    )
    return data_component
# ============================================================================
# Main Application
# ============================================================================
# Page layout: stylesheet, title and introduction, followed by one tab per
# leaderboard plus an "About" tab. Each leaderboard tab loads its JSON file
# and builds its widgets while the tab's context is active.
demo = gr.Blocks()
with demo:
    # Inject the module-level stylesheet. The previous empty string rendered
    # nothing, so the CSS rules defined above were never applied.
    gr.HTML(f"<style>{CSS}</style>")
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("đ¨ Generation", elem_id="predict-tab", id=0):
            predict_df = load_predict_json("data/generation-leaderboard.json")
            predict_leaderboard = init_predict_leaderboard(predict_df)
        with gr.TabItem("đ Conditional Generation", elem_id="transfer-tab", id=1):
            transfer_df = load_transfer_json("data/conditional_generation-leaderboard.json")
            transfer_leaderboard = init_transfer_leaderboard(transfer_df)
        with gr.TabItem("đ§ Understanding", elem_id="reason-tab", id=2):
            reason_df = load_reason_json("data/understanding-leaderboard.json")
            reason_leaderboard = init_reason_leaderboard(reason_df)
        with gr.TabItem("âšī¸ About", elem_id="about-tab", id=3):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
demo.launch()