|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
|
|
|
|
|
|
|
|
|
# HTML heading injected at the top of the Gradio page.
TITLE = """<h1 align="center" id="space-title">Physical AI Bench Leaderboard</h1>"""

# CSS overrides letting the three leaderboard tables grow to their natural
# height (no inner scrollbar); horizontal scrolling stays enabled on the body.
CSS = """
#predict_leaderboard, #transfer_leaderboard, #reason_leaderboard {
height: auto !important;
max-height: none !important;
}
#predict_leaderboard .wrap, #transfer_leaderboard .wrap, #reason_leaderboard .wrap {
max-height: none !important;
height: auto !important;
}
#predict_leaderboard .tbody, #transfer_leaderboard .tbody, #reason_leaderboard .tbody {
max-height: none !important;
height: auto !important;
overflow-x: auto !important;
overflow-y: hidden !important;
}
"""

# Short blurb rendered under the page title.
INTRODUCTION_TEXT = """
**Physical AI Bench (PAI-Bench)** is a comprehensive benchmark suite for evaluating physical AI generation and understanding across diverse scenarios including autonomous vehicles, robotics, industrial spaces, and everyday ego-centric environments.
"""

# Long-form markdown for the "About" tab.
# NOTE(review): sequences like "π¨" / "π§" look like mojibake-garbled emoji
# (encoding corruption) — confirm the file's original encoding before release.
LLM_BENCHMARKS_TEXT = """
## How it works

This leaderboard tracks model performance across three core dimensions:

- **π¨ Generation**: Evaluates world foundation models' ability to predict future states across 1,044 diverse physical scenarios
- **π Conditional Generation**: Focuses on world model generation with complex control signals, featuring 600 videos across robotic arm operations, autonomous driving, and ego-centric scenes
- **π§ Understanding**: Evaluates understanding and reasoning about physical scenes, with 1,214 embodied reasoning scenarios focused on autonomous vehicle actions

PAI-Bench covers multiple physical AI domains including autonomous driving, robotics, industrial spaces, physics simulations, human interactions, and common sense reasoning.

### Resources
- π [GitHub Repository](https://github.com/SHI-Labs/physical-ai-bench)
- π [Generation Dataset](https://huggingface.co/datasets/shi-labs/physical-ai-bench-predict)
- π [Conditional Generation Dataset](https://huggingface.co/datasets/shi-labs/physical-ai-bench-transfer)
- π [Understanding Dataset](https://huggingface.co/datasets/shi-labs/physical-ai-bench-reason)
- π¦ [Artifacts](https://huggingface.co/datasets/Leymore/physical-ai-bench-artifacts)

## Reproducibility

To evaluate your models on PAI-Bench, visit our [GitHub repository](https://github.com/SHI-Labs/physical-ai-bench) for evaluation scripts and detailed instructions.

## Citation

If you use Physical AI Bench in your research, please cite:

```bibtex
@misc{zhou2025paibenchcomprehensivebenchmarkphysical,
      title={PAI-Bench: A Comprehensive Benchmark For Physical AI},
      author={Fengzhe Zhou and Jiannan Huang and Jialuo Li and Deva Ramanan and Humphrey Shi},
      year={2025},
      eprint={2512.01989},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2512.01989},
}

```
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def create_model_link(model_name):
    """Turn an ``org/model-name`` identifier into a markdown Hugging Face link.

    Args:
        model_name: Model name in the form "org/model-name", or any other value.

    Returns:
        A markdown link ``[model-name](https://huggingface.co/org/model-name)``
        when the input is a string containing ``/``; otherwise the input
        unchanged (non-strings and plain names pass through).
    """
    # Non-string cells (e.g. NaN) are returned untouched.
    if not isinstance(model_name, str):
        return model_name

    # Plain names without an org prefix cannot be linked.
    if '/' not in model_name:
        return model_name

    display_name = model_name.rsplit('/', 1)[-1]
    return f"[{display_name}](https://huggingface.co/{model_name})"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Full column name -> two-letter header abbreviation for the Generation table.
PREDICT_COLUMN_ABBREV = {
    'Common Sense': 'CS',
    'AV': 'AV',
    'Robot': 'RO',
    'Industry': 'IN',
    'Human': 'HU',
    'Physics': 'PH',
    'Subject Consistency': 'SC',
    'Background Consistency': 'BC',
    'Motion Smoothness': 'MS',
    'Aesthetic Quality': 'AQ',
    'Imaging Quality': 'IQ',
    'Overall Consistency': 'OC',
    'I2V Subject': 'IS',
    'I2V Background': 'IB',
}

# Canonical column order using full (pre-abbreviation) names.
# NOTE(review): not referenced anywhere in this file — presumably consumed by
# the JSON pre-processing script; confirm before removing.
PREDICT_COLUMN_ORDER = [
    'Model',
    'Overall',
    'Domain',
    'Quality',
    'Common Sense',
    'AV',
    'Robot',
    'Industry',
    'Human',
    'Physics',
    'Subject Consistency',
    'Background Consistency',
    'Motion Smoothness',
    'Aesthetic Quality',
    'Imaging Quality',
    'Overall Consistency',
    'I2V Subject',
    'I2V Background'
]

# Columns hidden unconditionally (currently none).
PREDICT_HIDDEN_COLUMNS = []

# Checkbox preset applied by the "Domain Score" button (abbreviated names).
PREDICT_DOMAIN_SCORE_DIMENSIONS = [
    'Domain',
    'CS', 'AV', 'RO', 'IN', 'HU', 'PH',
]

# Checkbox preset applied by the "Quality Score" button (abbreviated names).
PREDICT_QUALITY_SCORE_DIMENSIONS = [
    'Quality',
    'SC', 'BC', 'MS', 'AQ', 'IQ', 'OC', 'IS', 'IB'
]

# "Deselect All" keeps just the two aggregate score columns checked.
PREDICT_DESELECTED_COLUMNS = ['Domain', 'Quality']

# "Select All" preset: every optional dimension.
PREDICT_ALL_SELECTED_COLUMNS = [
    'Domain', 'Quality',
    'CS', 'AV', 'RO', 'IN', 'HU', 'PH',
    'SC', 'BC', 'MS', 'AQ', 'IQ', 'OC', 'IS', 'IB'
]

# Columns always visible and never offered as checkbox choices.
PREDICT_NEVER_HIDDEN_COLUMNS = ['Model', 'Overall']

# Columns shown when the tab first renders.
PREDICT_DEFAULT_DISPLAYED_COLUMNS = PREDICT_NEVER_HIDDEN_COLUMNS + PREDICT_ALL_SELECTED_COLUMNS
|
|
|
|
|
def load_predict_json(json_path):
    """Load the pre-processed Generation leaderboard JSON as a display DataFrame.

    The JSON is expected to be produced by generate_predict_leaderboard.py
    (already sorted / ordered, with separate ``model`` and ``url`` fields).
    Rows with a non-null ``url`` get their model cell rendered as a markdown
    link; numeric columns are formatted as one-decimal strings; headers are
    abbreviated via PREDICT_COLUMN_ABBREV.
    """
    df = pd.read_json(json_path, orient='records')

    if 'model' in df.columns and 'url' in df.columns:
        def linkify(row):
            # Keep the plain model name when no URL was provided.
            if pd.isna(row['url']):
                return row['model']
            name = row['model'].rsplit('/', 1)[-1]
            return f"[{name}]({row['url']})"

        df['model'] = df.apply(linkify, axis=1)
        df = df.drop(columns=['url'])

    df = df.rename(columns={'model': 'Model'})

    # Render every numeric score as a fixed one-decimal string (NaN passes through).
    for col in df.columns:
        if col == 'Model':
            continue
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].map(lambda v: f"{v:.1f}" if pd.notna(v) else v)

    return df.rename(columns=PREDICT_COLUMN_ABBREV)
|
|
|
|
|
|
|
|
def get_predict_checkbox_choices(dataframe):
    """Build (label, value) checkbox pairs as "Full Name (ABBREV)" strings.

    Columns that are always visible ('Model', 'Overall') are skipped; columns
    without a known abbreviation use the column name for both label and value.
    """
    # Invert the abbreviation map: abbrev -> full column name.
    full_names = {abbrev: full for full, abbrev in PREDICT_COLUMN_ABBREV.items()}

    choices = []
    for col in dataframe.columns:
        if col in ('Model', 'Overall'):
            continue
        label = f"{full_names[col]} ({col})" if col in full_names else col
        choices.append((label, col))

    return choices
|
|
|
|
|
|
|
|
def select_predict_domain_score():
    """Preset: check only the domain-score dimensions (Domain + per-domain columns)."""
    return gr.update(value=PREDICT_DOMAIN_SCORE_DIMENSIONS)
|
|
|
|
|
def select_predict_quality_score():
    """Preset: check only the quality-score dimensions (Quality + per-metric columns)."""
    return gr.update(value=PREDICT_QUALITY_SCORE_DIMENSIONS)
|
|
|
|
|
def deselect_predict_all():
    """Reset the checkbox group to the minimal view (only the Domain/Quality aggregates stay checked)."""
    return gr.update(value=PREDICT_DESELECTED_COLUMNS)
|
|
|
|
|
def select_predict_all():
    """Check every optional dimension column."""
    return gr.update(value=PREDICT_ALL_SELECTED_COLUMNS)
|
|
|
|
|
def on_predict_dimension_selection_change(selected_columns, full_df):
    """Re-render the Generation table showing Model/Overall plus the checked columns.

    Args:
        selected_columns: Column names currently checked in the checkbox group.
        full_df: The complete leaderboard DataFrame to slice from.

    Returns:
        A gr.update payload with the sliced data, per-column datatypes
        ('markdown' for the Model column, 'str' otherwise) and headers.
    """
    # Pinned columns first, then the checked ones — deduplicated, order kept.
    ordered = dict.fromkeys(['Model', 'Overall'] + list(selected_columns))
    shown = [c for c in ordered
             if c in ('Model', 'Overall') or c in full_df.columns]

    dtypes = ['markdown' if c == 'Model' else 'str' for c in shown]

    return gr.update(value=full_df[shown], datatype=dtypes, headers=shown)
|
|
|
|
|
|
|
|
def init_predict_leaderboard(dataframe):
    """Build the Generation leaderboard UI (preset buttons, checkboxes, table).

    Must be called inside a gr.Blocks context; component creation order
    defines the page layout.

    Args:
        dataframe: Display DataFrame produced by load_predict_json.

    Returns:
        The gr.Dataframe component showing the (filtered) leaderboard.

    Raises:
        ValueError: If the DataFrame is missing or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Only show default columns that actually exist in the loaded data.
    available_default_cols = [col for col in PREDICT_DEFAULT_DISPLAYED_COLUMNS if col in dataframe.columns]

    display_df = dataframe[available_default_cols]

    # 'Model' cells hold markdown links; everything else is plain text.
    datatypes = []
    for col in display_df.columns:
        if col == 'Model':
            datatypes.append('markdown')
        else:
            datatypes.append('str')

    with gr.Row():
        with gr.Column(scale=1):
            # Preset buttons that rewrite the checkbox selection below.
            domain_score_btn = gr.Button("Domain Score", size="md")
            quality_score_btn = gr.Button("Quality Score", size="md")
            select_all_btn = gr.Button("Select All", size="md")
            deselect_btn = gr.Button("Deselect All", size="md")

        with gr.Column(scale=4):
            checkbox_choices = get_predict_checkbox_choices(dataframe)

            checkbox_group = gr.CheckboxGroup(
                choices=checkbox_choices,
                value=[col for col in PREDICT_ALL_SELECTED_COLUMNS if col in dataframe.columns],
                label="Evaluation Dimensions",
                interactive=True,
            )

    data_component = gr.Dataframe(
        value=display_df,
        headers=list(display_df.columns),
        datatype=datatypes,
        interactive=False,
        visible=True,
        wrap=False,
        # Wide pinned Model column; uniform narrow score columns.
        column_widths=["320px"] + ["80px"] * (len(display_df.columns) - 1),
        pinned_columns=1,
        elem_id="predict_leaderboard",
        max_height=10000,
    )

    # Each preset button first updates the checkbox values, then re-renders
    # the table from the (now updated) checkbox state.
    domain_score_btn.click(
        select_predict_domain_score,
        inputs=None,
        outputs=[checkbox_group]
    ).then(
        fn=on_predict_dimension_selection_change,
        inputs=[checkbox_group, gr.State(dataframe)],
        outputs=data_component
    )

    quality_score_btn.click(
        select_predict_quality_score,
        inputs=None,
        outputs=[checkbox_group]
    ).then(
        fn=on_predict_dimension_selection_change,
        inputs=[checkbox_group, gr.State(dataframe)],
        outputs=data_component
    )

    deselect_btn.click(
        deselect_predict_all,
        inputs=None,
        outputs=[checkbox_group]
    ).then(
        fn=on_predict_dimension_selection_change,
        inputs=[checkbox_group, gr.State(dataframe)],
        outputs=data_component
    )

    select_all_btn.click(
        select_predict_all,
        inputs=None,
        outputs=[checkbox_group]
    ).then(
        fn=on_predict_dimension_selection_change,
        inputs=[checkbox_group, gr.State(dataframe)],
        outputs=data_component
    )

    # Manual checkbox edits re-render the table directly.
    checkbox_group.change(
        fn=on_predict_dimension_selection_change,
        inputs=[checkbox_group, gr.State(dataframe)],
        outputs=data_component
    )

    return data_component
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Canonical column order for the Conditional Generation table.
# NOTE(review): the trailing "β" in these labels looks like a mojibake-garbled
# direction arrow (up/down) — confirm the file's original encoding.
# NOTE(review): TRANSFER_COLUMN_ORDER / TRANSFER_HIDDEN_COLUMNS are not
# referenced in this file — presumably used by the pre-processing script.
TRANSFER_COLUMN_ORDER = [
    'Model',
    'Condition',
    'Blur SSIM β',
    'Edge F1 β',
    'Depth si-RMSE β',
    'Mask mIoU β',
    'Quality Score β',
    'Diversity β'
]

# Columns hidden unconditionally (currently none).
TRANSFER_HIDDEN_COLUMNS = []

# All optional metric columns offered as checkbox choices.
TRANSFER_QUALITY_DIMENSIONS = [
    'Blur SSIM β',
    'Edge F1 β',
    'Depth si-RMSE β',
    'Mask mIoU β',
    'Quality Score β',
    'Diversity β',
]

# "Select All" preset.
TRANSFER_ALL_SELECTED_COLUMNS = TRANSFER_QUALITY_DIMENSIONS

# Always-visible, pinned columns.
TRANSFER_NEVER_HIDDEN_COLUMNS = ['Model', 'Condition']

# Columns shown when the tab first renders.
TRANSFER_DEFAULT_DISPLAYED_COLUMNS = TRANSFER_NEVER_HIDDEN_COLUMNS + TRANSFER_ALL_SELECTED_COLUMNS
|
|
|
|
|
|
|
|
def load_transfer_json(json_path):
    """Load the Conditional Generation leaderboard JSON as a display DataFrame.

    Rows with a non-null ``url`` get their model cell rendered as a markdown
    link; numeric metric columns become fixed three-decimal strings; the
    'Model' and 'Condition' columns are left untouched.
    """
    df = pd.read_json(json_path, orient='records')

    if 'model' in df.columns and 'url' in df.columns:
        def linkify(row):
            # Keep the plain model name when no URL was provided.
            if pd.isna(row['url']):
                return row['model']
            name = row['model'].rsplit('/', 1)[-1]
            return f"[{name}]({row['url']})"

        df['model'] = df.apply(linkify, axis=1)
        df = df.drop(columns=['url'])

    df = df.rename(columns={'model': 'Model'})

    # Render every numeric metric as a fixed three-decimal string (NaN passes through).
    for col in df.columns:
        if col in ('Model', 'Condition'):
            continue
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].map(lambda v: f"{v:.3f}" if pd.notna(v) else v)

    return df
|
|
|
|
|
|
|
|
def select_transfer_all():
    """Check every optional metric column."""
    return gr.update(value=TRANSFER_ALL_SELECTED_COLUMNS)
|
|
|
|
|
|
|
|
def deselect_transfer_all():
    """Uncheck every optional column (only the pinned Model/Condition columns remain)."""
    return gr.update(value=[])
|
|
|
|
|
|
|
|
def on_transfer_dimension_selection_change(selected_columns, full_df):
    """Re-render the Conditional Generation table with the checked columns.

    Args:
        selected_columns: Column names currently checked in the checkbox group.
        full_df: The complete leaderboard DataFrame to slice from.

    Returns:
        A gr.update payload with the sliced data, per-column datatypes
        ('markdown' for the Model column, 'str' otherwise) and headers.
    """
    # Pinned columns first, then the checked ones — deduplicated, order kept.
    ordered = dict.fromkeys(['Model', 'Condition'] + list(selected_columns))
    shown = [c for c in ordered
             if c in ('Model', 'Condition') or c in full_df.columns]

    dtypes = ['markdown' if c == 'Model' else 'str' for c in shown]

    return gr.update(value=full_df[shown], datatype=dtypes, headers=shown)
|
|
|
|
|
|
|
|
def init_transfer_leaderboard(dataframe):
    """Build the Conditional Generation leaderboard UI (buttons, checkboxes, table).

    Must be called inside a gr.Blocks context; component creation order
    defines the page layout.

    Args:
        dataframe: Display DataFrame produced by load_transfer_json.

    Returns:
        The gr.Dataframe component showing the (filtered) leaderboard.

    Raises:
        ValueError: If the DataFrame is missing or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Only show default columns that actually exist in the loaded data.
    available_default_cols = [col for col in TRANSFER_DEFAULT_DISPLAYED_COLUMNS if col in dataframe.columns]

    display_df = dataframe[available_default_cols]

    # 'Model' cells hold markdown links; everything else is plain text.
    datatypes = []
    for col in display_df.columns:
        if col == 'Model':
            datatypes.append('markdown')
        else:
            datatypes.append('str')

    with gr.Row():
        with gr.Column(scale=1):
            select_all_btn = gr.Button("Select All", size="md")
            deselect_btn = gr.Button("Deselect All", size="md")

        with gr.Column(scale=4):
            # Every non-pinned column is offered as a checkbox choice.
            dimension_choices = [col for col in dataframe.columns
                                 if col not in TRANSFER_NEVER_HIDDEN_COLUMNS]

            checkbox_group = gr.CheckboxGroup(
                choices=dimension_choices,
                value=[col for col in TRANSFER_DEFAULT_DISPLAYED_COLUMNS if col in dimension_choices],
                label="Evaluation Dimensions",
                interactive=True,
            )

    data_component = gr.Dataframe(
        value=display_df,
        headers=list(display_df.columns),
        datatype=datatypes,
        interactive=False,
        visible=True,
        wrap=False,
        # Wide pinned Model/Condition columns; uniform metric columns.
        column_widths=["280px", "120px"] + ["150px"] * (len(display_df.columns) - 2),
        pinned_columns=2,
        elem_id="transfer_leaderboard",
        max_height=10000,
    )

    # Buttons first rewrite the checkbox state, then re-render the table from it.
    deselect_btn.click(
        deselect_transfer_all,
        inputs=None,
        outputs=[checkbox_group]
    ).then(
        fn=on_transfer_dimension_selection_change,
        inputs=[checkbox_group, gr.State(dataframe)],
        outputs=data_component
    )

    select_all_btn.click(
        select_transfer_all,
        inputs=None,
        outputs=[checkbox_group]
    ).then(
        fn=on_transfer_dimension_selection_change,
        inputs=[checkbox_group, gr.State(dataframe)],
        outputs=data_component
    )

    # Manual checkbox edits re-render the table directly.
    checkbox_group.change(
        fn=on_transfer_dimension_selection_change,
        inputs=[checkbox_group, gr.State(dataframe)],
        outputs=data_component
    )

    return data_component
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Full column name -> abbreviation for the Understanding table.
REASON_COLUMN_ABBREV = {
    'Common Sense': 'CS',
    'Embodied Reasoning': 'ER',
    'BridgeData V2': 'BD',
    'RoboVQA': 'RV',
    'RoboFail': 'RF',
    'Agibot': 'AB',
    'HoloAssist': 'HA',
}

# Canonical column order using full (pre-abbreviation) names.
# NOTE(review): not referenced in this file — presumably used by the
# pre-processing script; confirm before removing.
REASON_COLUMN_ORDER = [
    'Model',
    'Thinking',
    'Overall',
    'Common Sense',
    'Embodied Reasoning',
    'Space',
    'Time',
    'Physics',
    'BridgeData V2',
    'RoboVQA',
    'RoboFail',
    'Agibot',
    'HoloAssist',
    'AV'
]

# Columns hidden unconditionally (currently none).
REASON_HIDDEN_COLUMNS = []

# Checkbox preset applied by the "Common Sense" button.
REASON_COMMON_SENSE_DIMENSIONS = [
    'CS',
    'Space',
    'Time',
    'Physics',
]

# Checkbox preset applied by the "Embodied Reasoning" button.
REASON_EMBODIED_REASONING_DIMENSIONS = [
    'ER',
    'Space',
    'Time',
    'Physics',
    'BD', 'RV', 'RF', 'AB', 'HA', 'AV',
]

# "Deselect All" keeps just the two aggregate columns checked.
REASON_DESELECTED_COLUMNS = [
    'CS',
    'ER',
]

# "Select All" preset: every optional dimension.
REASON_ALL_SELECTED_COLUMNS = [
    'CS', 'ER',
    'Space', 'Time', 'Physics',
    'BD', 'RV', 'RF', 'AB', 'HA', 'AV',
]

# Columns always visible and never offered as checkbox choices.
REASON_NEVER_HIDDEN_COLUMNS = ['Model', 'Thinking', 'Overall']

# Columns shown when the tab first renders.
REASON_DEFAULT_DISPLAYED_COLUMNS = REASON_NEVER_HIDDEN_COLUMNS + REASON_ALL_SELECTED_COLUMNS
|
|
|
|
|
|
|
|
def load_reason_json(json_path):
    """Load the Understanding leaderboard JSON as a display DataFrame.

    Rows with a non-null ``url`` get their model cell rendered as a markdown
    link; numeric columns become fixed one-decimal strings; headers are
    abbreviated via REASON_COLUMN_ABBREV.
    """
    df = pd.read_json(json_path, orient='records')

    if 'model' in df.columns and 'url' in df.columns:
        def linkify(row):
            # Keep the plain model name when no URL was provided.
            if pd.isna(row['url']):
                return row['model']
            name = row['model'].rsplit('/', 1)[-1]
            return f"[{name}]({row['url']})"

        df['model'] = df.apply(linkify, axis=1)
        df = df.drop(columns=['url'])

    df = df.rename(columns={'model': 'Model'})

    # Render every numeric score as a fixed one-decimal string (NaN passes through).
    for col in df.columns:
        if col == 'Model':
            continue
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].map(lambda v: f"{v:.1f}" if pd.notna(v) else v)

    return df.rename(columns=REASON_COLUMN_ABBREV)
|
|
|
|
|
|
|
|
def get_reason_checkbox_choices(dataframe):
    """Build (label, value) checkbox pairs as "Full Name (ABBREV)" strings.

    Columns that are always visible ('Model', 'Thinking', 'Overall') are
    skipped; columns without a known abbreviation use the column name for
    both label and value.
    """
    # Invert the abbreviation map: abbrev -> full column name.
    full_names = {abbrev: full for full, abbrev in REASON_COLUMN_ABBREV.items()}

    choices = []
    for col in dataframe.columns:
        if col in ('Model', 'Thinking', 'Overall'):
            continue
        label = f"{full_names[col]} ({col})" if col in full_names else col
        choices.append((label, col))

    return choices
|
|
|
|
|
|
|
|
def select_reason_common_sense_dimensions():
    """Preset: check only the common-sense dimensions (CS + Space/Time/Physics)."""
    return gr.update(value=REASON_COMMON_SENSE_DIMENSIONS)
|
|
|
|
|
|
|
|
def select_reason_embodied_reasoning_dimensions():
    """Preset: check only the embodied-reasoning dimensions (ER + per-dataset columns)."""
    return gr.update(value=REASON_EMBODIED_REASONING_DIMENSIONS)
|
|
|
|
|
|
|
|
def deselect_reason_all():
    """Reset the checkbox group to the minimal view (only the CS/ER aggregates stay checked)."""
    return gr.update(value=REASON_DESELECTED_COLUMNS)
|
|
|
|
|
|
|
|
def select_reason_all():
    """Check every optional dimension column."""
    return gr.update(value=REASON_ALL_SELECTED_COLUMNS)
|
|
|
|
|
|
|
|
def on_reason_dimension_selection_change(selected_columns, full_df):
    """Re-render the Understanding table with the checked columns.

    Args:
        selected_columns: Column names currently checked in the checkbox group.
        full_df: The complete leaderboard DataFrame to slice from.

    Returns:
        A gr.update payload with the sliced data, per-column datatypes
        ('markdown' for the Model column, 'str' otherwise) and headers.
    """
    pinned = ('Model', 'Thinking', 'Overall')

    # Pinned columns first, then the checked ones — deduplicated, order kept.
    ordered = dict.fromkeys(list(pinned) + list(selected_columns))
    shown = [c for c in ordered if c in pinned or c in full_df.columns]

    dtypes = ['markdown' if c == 'Model' else 'str' for c in shown]

    return gr.update(value=full_df[shown], datatype=dtypes, headers=shown)
|
|
|
|
|
|
|
|
def init_reason_leaderboard(dataframe):
    """Build the Understanding leaderboard UI (preset buttons, checkboxes, table).

    Must be called inside a gr.Blocks context; component creation order
    defines the page layout.

    Args:
        dataframe: Display DataFrame produced by load_reason_json.

    Returns:
        The gr.Dataframe component showing the (filtered) leaderboard.

    Raises:
        ValueError: If the DataFrame is missing or empty.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Only show default columns that actually exist in the loaded data.
    available_default_cols = [col for col in REASON_DEFAULT_DISPLAYED_COLUMNS if col in dataframe.columns]

    display_df = dataframe[available_default_cols]

    # 'Model' cells hold markdown links; everything else is plain text.
    datatypes = []
    for col in display_df.columns:
        if col == 'Model':
            datatypes.append('markdown')
        else:
            datatypes.append('str')

    with gr.Row():
        with gr.Column(scale=1):
            # Preset buttons that rewrite the checkbox selection below.
            common_sense_btn = gr.Button("Common Sense", size="md")
            embodied_reasoning_btn = gr.Button("Embodied Reasoning", size="md")
            select_all_btn = gr.Button("Select All", size="md")
            deselect_btn = gr.Button("Deselect All", size="md")

        with gr.Column(scale=4):
            checkbox_choices = get_reason_checkbox_choices(dataframe)

            checkbox_group = gr.CheckboxGroup(
                choices=checkbox_choices,
                value=[col for col in REASON_ALL_SELECTED_COLUMNS if col in dataframe.columns],
                label="Evaluation Dimensions",
                interactive=True,
            )

    data_component = gr.Dataframe(
        value=display_df,
        headers=list(display_df.columns),
        datatype=datatypes,
        interactive=False,
        visible=True,
        wrap=False,
        # Wide pinned Model/Thinking columns; uniform score columns.
        column_widths=["320px", "100px"] + ["100px"] * (len(display_df.columns) - 2),
        pinned_columns=2,
        elem_id="reason_leaderboard",
        max_height=10000,
    )

    # Each preset button first updates the checkbox values, then re-renders
    # the table from the (now updated) checkbox state.
    common_sense_btn.click(
        select_reason_common_sense_dimensions,
        inputs=None,
        outputs=[checkbox_group]
    ).then(
        fn=on_reason_dimension_selection_change,
        inputs=[checkbox_group, gr.State(dataframe)],
        outputs=data_component
    )

    embodied_reasoning_btn.click(
        select_reason_embodied_reasoning_dimensions,
        inputs=None,
        outputs=[checkbox_group]
    ).then(
        fn=on_reason_dimension_selection_change,
        inputs=[checkbox_group, gr.State(dataframe)],
        outputs=data_component
    )

    deselect_btn.click(
        deselect_reason_all,
        inputs=None,
        outputs=[checkbox_group]
    ).then(
        fn=on_reason_dimension_selection_change,
        inputs=[checkbox_group, gr.State(dataframe)],
        outputs=data_component
    )

    select_all_btn.click(
        select_reason_all,
        inputs=None,
        outputs=[checkbox_group]
    ).then(
        fn=on_reason_dimension_selection_change,
        inputs=[checkbox_group, gr.State(dataframe)],
        outputs=data_component
    )

    # Manual checkbox edits re-render the table directly.
    checkbox_group.change(
        fn=on_reason_dimension_selection_change,
        inputs=[checkbox_group, gr.State(dataframe)],
        outputs=data_component
    )

    return data_component
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# App assembly: one tab per benchmark dimension, plus an About tab.
# ---------------------------------------------------------------------------
demo = gr.Blocks()

with demo:
    # Inline the table-height CSS overrides, then the title and intro blurb.
    gr.HTML(f"<style>{CSS}</style>")
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    # NOTE(review): the tab labels contain apparent mojibake ("π¨" etc.) —
    # likely corrupted emoji; confirm the file's original encoding.
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("π¨ Generation", elem_id="predict-tab", id=0):
            predict_df = load_predict_json("data/generation-leaderboard.json")
            predict_leaderboard = init_predict_leaderboard(predict_df)

        with gr.TabItem("π Conditional Generation", elem_id="transfer-tab", id=1):
            transfer_df = load_transfer_json("data/conditional_generation-leaderboard.json")
            transfer_leaderboard = init_transfer_leaderboard(transfer_df)

        with gr.TabItem("π§ Understanding", elem_id="reason-tab", id=2):
            reason_df = load_reason_json("data/understanding-leaderboard.json")
            reason_leaderboard = init_reason_leaderboard(reason_df)

        with gr.TabItem("βΉοΈ About", elem_id="about-tab", id=3):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

demo.launch()
|
|
|