Fengzhe Zhou committed on
Commit 1421041 · 1 Parent(s): ea97839
.gitignore CHANGED
@@ -1,2 +1,3 @@
  scripts/
+ internal-data/
  __pycache__/
app.py CHANGED
@@ -7,15 +7,15 @@ TITLE = """<h1 align="center" id="space-title">Physical AI Bench Leaderboard</h1
7
 
8
  # CSS to make the leaderboard full height
9
  CSS = """
10
- #predict_leaderboard, #reason_leaderboard {
11
  height: auto !important;
12
  max-height: none !important;
13
  }
14
- #predict_leaderboard .wrap, #reason_leaderboard .wrap {
15
  max-height: none !important;
16
  height: auto !important;
17
  }
18
- #predict_leaderboard .tbody, #reason_leaderboard .tbody {
19
  max-height: none !important;
20
  height: auto !important;
21
  overflow-x: auto !important;
@@ -46,6 +46,7 @@ PAI-Bench covers multiple physical AI domains including autonomous driving, robo
46
  - 📊 [Generation Dataset](https://huggingface.co/datasets/shi-labs/physical-ai-bench-predict)
47
  - 📊 [Conditional Generation Dataset](https://huggingface.co/datasets/shi-labs/physical-ai-bench-transfer)
48
  - 📊 [Understanding Dataset](https://huggingface.co/datasets/shi-labs/physical-ai-bench-reason)
 
49
 
50
  ## Reproducibility
51
 
@@ -57,13 +58,13 @@ If you use Physical AI Bench in your research, please cite:
57
 
58
  ```bibtex
59
  @misc{zhou2025paibenchcomprehensivebenchmarkphysical,
60
- title={PAI-Bench: A Comprehensive Benchmark For Physical AI},
61
  author={Fengzhe Zhou and Jiannan Huang and Jialuo Li and Deva Ramanan and Humphrey Shi},
62
  year={2025},
63
  eprint={2512.01989},
64
  archivePrefix={arXiv},
65
  primaryClass={cs.CV},
66
- url={https://arxiv.org/abs/2512.01989},
67
  }
68
 
69
  ```
@@ -99,12 +100,30 @@ def create_model_link(model_name):
99
  # Generation Tab Configuration and Utilities
100
  # ============================================================================
101
 
102
- # Expected column order (the CSV should already have this order)
103
  PREDICT_COLUMN_ORDER = [
104
- 'model',
105
  'Overall',
106
- 'Domain Score',
107
- 'Quality Score',
108
  'Common Sense',
109
  'AV',
110
  'Robot',
@@ -115,66 +134,39 @@ PREDICT_COLUMN_ORDER = [
115
  'Background Consistency',
116
  'Motion Smoothness',
117
  'Aesthetic Quality',
118
- 'Image Quality',
119
  'Overall Consistency',
120
  'I2V Subject',
121
- 'I2V Background',
122
- 'params',
123
- 'activate_params'
124
  ]
125
 
126
  # Columns to hide by default (but still available for filtering/selection)
127
- PREDICT_HIDDEN_COLUMNS = ['params', 'activate_params']
128
 
129
- # Semantic/Domain dimensions (for selection button)
130
  PREDICT_DOMAIN_SCORE_DIMENSIONS = [
131
- 'Domain Score',
132
- 'Common Sense',
133
- 'AV',
134
- 'Robot',
135
- 'Industry',
136
- 'Human',
137
- 'Physics',
138
  ]
139
 
140
- # Quality dimensions (for selection button)
141
  PREDICT_QUALITY_SCORE_DIMENSIONS = [
142
- 'Quality Score',
143
- 'Subject Consistency',
144
- 'Background Consistency',
145
- 'Motion Smoothness',
146
- 'Aesthetic Quality',
147
- 'Image Quality',
148
- 'Overall Consistency',
149
- 'I2V Subject',
150
- 'I2V Background'
151
  ]
152
 
153
- PREDICT_DESELECTED_COLUMNS = ['Domain Score', 'Quality Score']
154
 
155
  PREDICT_ALL_SELECTED_COLUMNS = [
156
- 'Domain Score',
157
- 'Quality Score',
158
- 'Common Sense',
159
- 'AV',
160
- 'Robot',
161
- 'Industry',
162
- 'Human',
163
- 'Physics',
164
- 'Subject Consistency',
165
- 'Background Consistency',
166
- 'Motion Smoothness',
167
- 'Aesthetic Quality',
168
- 'Image Quality',
169
- 'Overall Consistency',
170
- 'I2V Subject',
171
- 'I2V Background'
172
  ]
173
 
174
  # Columns that can never be deselected
175
- PREDICT_NEVER_HIDDEN_COLUMNS = ['model', 'Overall']
176
 
177
- # Columns displayed by default (using renamed column names)
178
  PREDICT_DEFAULT_DISPLAYED_COLUMNS = PREDICT_NEVER_HIDDEN_COLUMNS + PREDICT_ALL_SELECTED_COLUMNS
179
 
180
  def load_predict_json(json_path):
@@ -196,15 +188,36 @@ def load_predict_json(json_path):
196
  df['model'] = df.apply(create_link, axis=1)
197
  df = df.drop(columns=['url'])
198
 
199
- # Format numbers to ensure decimal places (1 decimal for numeric columns)
200
- # Numbers should already be scaled to 0-100 by the generation script
201
  for col in df.columns:
202
- if col not in ['model', 'params', 'activate_params'] and pd.api.types.is_numeric_dtype(df[col]):
203
  df[col] = df[col].apply(lambda x: f"{x:.1f}" if pd.notna(x) else x)
204
 
 
 
 
205
  return df
208
  def select_predict_domain_score():
209
  """Return domain score for checkbox selection"""
210
  return gr.update(value=PREDICT_DOMAIN_SCORE_DIMENSIONS)
@@ -223,24 +236,18 @@ def select_predict_all():
223
 
224
  def on_predict_dimension_selection_change(selected_columns, full_df):
225
  """Handle dimension selection changes and update the dataframe"""
226
- # Always include model and Overall columns
227
- present_columns = ['model', 'Overall']
228
 
229
- # Add selected columns
230
  for col in selected_columns:
231
  if col not in present_columns and col in full_df.columns:
232
  present_columns.append(col)
233
 
234
- # Filter dataframe to show only selected columns
235
  updated_data = full_df[present_columns]
236
 
237
- # Determine datatypes
238
  datatypes = []
239
  for col in present_columns:
240
- if col == 'model':
241
  datatypes.append('markdown')
242
- elif col in ['params', 'activate_params']:
243
- datatypes.append('number')
244
  else:
245
  datatypes.append('str')
246
 
@@ -261,14 +268,11 @@ def init_predict_leaderboard(dataframe):
261
  # Determine datatypes dynamically
262
  datatypes = []
263
  for col in display_df.columns:
264
- if col == 'model':
265
  datatypes.append('markdown')
266
- elif col in ['params', 'activate_params']:
267
- datatypes.append('number')
268
  else:
269
- datatypes.append('str') # All numeric columns are now formatted as strings
270
 
271
- # Create the UI components
272
  with gr.Row():
273
  with gr.Column(scale=1):
274
  domain_score_btn = gr.Button("Domain Score", size="md")
@@ -277,13 +281,12 @@ def init_predict_leaderboard(dataframe):
277
  deselect_btn = gr.Button("Deselect All", size="md")
278
 
279
  with gr.Column(scale=4):
280
- # Get all dimension columns (exclude model, Overall, scores, and params)
281
- dimension_choices = [col for col in dataframe.columns
282
- if col not in PREDICT_NEVER_HIDDEN_COLUMNS + PREDICT_HIDDEN_COLUMNS]
283
 
284
  checkbox_group = gr.CheckboxGroup(
285
- choices=dimension_choices,
286
- value=[col for col in PREDICT_DEFAULT_DISPLAYED_COLUMNS if col in dimension_choices],
287
  label="Evaluation Dimensions",
288
  interactive=True,
289
  )
@@ -295,7 +298,7 @@ def init_predict_leaderboard(dataframe):
295
  interactive=False,
296
  visible=True,
297
  wrap=False,
298
- column_widths=["320px"] + ["200px"] * (len(display_df.columns) - 1),
299
  pinned_columns=1,
300
  elem_id="predict_leaderboard",
301
  max_height=10000,
@@ -352,18 +355,185 @@ def init_predict_leaderboard(dataframe):
352
  return data_component
355
  # ============================================================================
356
  # Understanding Tab Configuration and Utilities
357
  # ============================================================================
358
 
359
- # Column name mapping for display
360
- REASON_COLUMN_MAPPING = {
361
- 'Physical world': 'Physics'
362
  }
363
 
364
- # Desired column order
365
  REASON_COLUMN_ORDER = [
366
- 'model',
 
367
  'Overall',
368
  'Common Sense',
369
  'Embodied Reasoning',
@@ -375,85 +545,91 @@ REASON_COLUMN_ORDER = [
375
  'RoboFail',
376
  'Agibot',
377
  'HoloAssist',
378
- 'AV',
379
- 'params',
380
- 'activate_params'
381
  ]
382
 
383
  # Columns to hide by default (but still available for filtering/selection)
384
- REASON_HIDDEN_COLUMNS = ['params', 'activate_params']
385
 
386
- # Reasoning dimensions (for selection button)
387
  REASON_COMMON_SENSE_DIMENSIONS = [
388
- 'Common Sense',
389
  'Space',
390
  'Time',
391
  'Physics',
392
  ]
393
 
394
- # Domain dimensions (for selection button)
395
  REASON_EMBODIED_REASONING_DIMENSIONS = [
396
- 'Embodied Reasoning',
397
  'Space',
398
  'Time',
399
  'Physics',
400
- 'BridgeData V2',
401
- 'RoboVQA',
402
- 'RoboFail',
403
- 'Agibot',
404
- 'HoloAssist',
405
- 'AV',
406
  ]
407
 
408
  REASON_DESELECTED_COLUMNS = [
409
- 'Common Sense',
410
- 'Embodied Reasoning',
411
  ]
412
 
413
  REASON_ALL_SELECTED_COLUMNS = [
414
- 'Common Sense',
415
- 'Embodied Reasoning',
416
- 'Space',
417
- 'Time',
418
- 'Physics',
419
- 'BridgeData V2',
420
- 'RoboVQA',
421
- 'RoboFail',
422
- 'Agibot',
423
- 'HoloAssist',
424
- 'AV',
425
  ]
426
 
427
  # Columns that can never be deselected
428
- REASON_NEVER_HIDDEN_COLUMNS = ['model', 'Overall']
429
 
430
  # Columns displayed by default (using renamed column names)
431
  REASON_DEFAULT_DISPLAYED_COLUMNS = REASON_NEVER_HIDDEN_COLUMNS + REASON_ALL_SELECTED_COLUMNS
432
 
433
 
434
- def load_reason_csv(csv_path):
435
- """Load CSV and apply column mapping and ordering"""
436
- df = pd.read_csv(csv_path)
437
 
438
- # Apply column mapping
439
- df = df.rename(columns=REASON_COLUMN_MAPPING)
 
 
 
 
440
 
441
- # Reorder columns (only keep columns that exist in the dataframe)
442
- available_cols = [col for col in REASON_COLUMN_ORDER if col in df.columns]
443
- df = df[available_cols]
444
 
445
- # Convert model names to HuggingFace links
446
- if 'model' in df.columns:
447
- df['model'] = df['model'].apply(create_model_link)
448
 
449
- # Format numbers to ensure decimal places (1 decimal for integers)
450
  for col in df.columns:
451
- if col not in ['model', 'params', 'activate_params'] and pd.api.types.is_numeric_dtype(df[col]):
452
  df[col] = df[col].apply(lambda x: f"{x:.1f}" if pd.notna(x) else x)
453
 
 
 
 
454
  return df
457
  def select_reason_common_sense_dimensions():
458
  """Return reasoning dimensions for checkbox selection"""
459
  return gr.update(value=REASON_COMMON_SENSE_DIMENSIONS)
@@ -476,24 +652,18 @@ def select_reason_all():
476
 
477
  def on_reason_dimension_selection_change(selected_columns, full_df):
478
  """Handle dimension selection changes and update the dataframe"""
479
- # Always include model and Overall columns
480
- present_columns = ['model', 'Overall']
481
 
482
- # Add selected columns
483
  for col in selected_columns:
484
  if col not in present_columns and col in full_df.columns:
485
  present_columns.append(col)
486
 
487
- # Filter dataframe to show only selected columns
488
  updated_data = full_df[present_columns]
489
 
490
- # Determine datatypes
491
  datatypes = []
492
  for col in present_columns:
493
- if col == 'model':
494
  datatypes.append('markdown')
495
- elif col in ['params', 'activate_params']:
496
- datatypes.append('number')
497
  else:
498
  datatypes.append('str')
499
 
@@ -514,14 +684,11 @@ def init_reason_leaderboard(dataframe):
514
  # Determine datatypes dynamically
515
  datatypes = []
516
  for col in display_df.columns:
517
- if col == 'model':
518
  datatypes.append('markdown')
519
- elif col in ['params', 'activate_params']:
520
- datatypes.append('number')
521
  else:
522
- datatypes.append('str') # All numeric columns are now formatted as strings
523
 
524
- # Create the UI components
525
  with gr.Row():
526
  with gr.Column(scale=1):
527
  common_sense_btn = gr.Button("Common Sense", size="md")
@@ -530,13 +697,12 @@ def init_reason_leaderboard(dataframe):
530
  deselect_btn = gr.Button("Deselect All", size="md")
531
 
532
  with gr.Column(scale=4):
533
- # Get all dimension columns (exclude model, Overall, and params)
534
- dimension_choices = [col for col in dataframe.columns
535
- if col not in REASON_NEVER_HIDDEN_COLUMNS + REASON_HIDDEN_COLUMNS]
536
 
537
  checkbox_group = gr.CheckboxGroup(
538
- choices=dimension_choices,
539
- value=[col for col in REASON_DEFAULT_DISPLAYED_COLUMNS if col in dimension_choices],
540
  label="Evaluation Dimensions",
541
  interactive=True,
542
  )
@@ -547,12 +713,11 @@ def init_reason_leaderboard(dataframe):
547
  datatype=datatypes,
548
  interactive=False,
549
  visible=True,
550
- wrap=False, # Allow horizontal scrolling, don't wrap content
551
- column_widths=["320px"] + ["200px"] * (len(display_df.columns) - 1),
552
- pinned_columns=1,
553
  elem_id="reason_leaderboard",
554
  max_height=10000,
555
-
556
  )
557
 
558
  # Setup event handlers
@@ -611,23 +776,24 @@ def init_reason_leaderboard(dataframe):
611
 
612
  demo = gr.Blocks()
613
  with demo:
 
614
  gr.HTML(TITLE)
615
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
616
 
617
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
618
  with gr.TabItem("🎨 Generation", elem_id="predict-tab", id=0):
619
- predict_df = load_predict_json("data/predict-leaderboard.json")
620
  predict_leaderboard = init_predict_leaderboard(predict_df)
621
 
622
  with gr.TabItem("🔄 Conditional Generation", elem_id="transfer-tab", id=1):
623
- gr.Markdown("## Coming Soon", elem_classes="markdown-text")
 
624
 
625
  with gr.TabItem("🧠 Understanding", elem_id="reason-tab", id=2):
626
- reason_df = load_reason_csv("data/reason-leaderboard.csv")
627
  reason_leaderboard = init_reason_leaderboard(reason_df)
628
 
629
  with gr.TabItem("ℹ️ About", elem_id="about-tab", id=3):
630
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
631
 
632
- demo.launch(css=CSS)
633
-
 
7
 
8
  # CSS to make the leaderboard full height
9
  CSS = """
10
+ #predict_leaderboard, #transfer_leaderboard, #reason_leaderboard {
11
  height: auto !important;
12
  max-height: none !important;
13
  }
14
+ #predict_leaderboard .wrap, #transfer_leaderboard .wrap, #reason_leaderboard .wrap {
15
  max-height: none !important;
16
  height: auto !important;
17
  }
18
+ #predict_leaderboard .tbody, #transfer_leaderboard .tbody, #reason_leaderboard .tbody {
19
  max-height: none !important;
20
  height: auto !important;
21
  overflow-x: auto !important;
 
46
  - 📊 [Generation Dataset](https://huggingface.co/datasets/shi-labs/physical-ai-bench-predict)
47
  - 📊 [Conditional Generation Dataset](https://huggingface.co/datasets/shi-labs/physical-ai-bench-transfer)
48
  - 📊 [Understanding Dataset](https://huggingface.co/datasets/shi-labs/physical-ai-bench-reason)
49
+ - 📦 [Artifacts](https://huggingface.co/datasets/Leymore/physical-ai-bench-artifacts)
50
 
51
  ## Reproducibility
52
 
 
58
 
59
  ```bibtex
60
  @misc{zhou2025paibenchcomprehensivebenchmarkphysical,
61
+ title={PAI-Bench: A Comprehensive Benchmark For Physical AI},
62
  author={Fengzhe Zhou and Jiannan Huang and Jialuo Li and Deva Ramanan and Humphrey Shi},
63
  year={2025},
64
  eprint={2512.01989},
65
  archivePrefix={arXiv},
66
  primaryClass={cs.CV},
67
+ url={https://arxiv.org/abs/2512.01989},
68
  }
69
 
70
  ```
 
100
  # Generation Tab Configuration and Utilities
101
  # ============================================================================
102
 
103
+ # Column name to abbreviation mapping for display
104
+ PREDICT_COLUMN_ABBREV = {
105
+ 'Common Sense': 'CS',
106
+ 'AV': 'AV',
107
+ 'Robot': 'RO',
108
+ 'Industry': 'IN',
109
+ 'Human': 'HU',
110
+ 'Physics': 'PH',
111
+ 'Subject Consistency': 'SC',
112
+ 'Background Consistency': 'BC',
113
+ 'Motion Smoothness': 'MS',
114
+ 'Aesthetic Quality': 'AQ',
115
+ 'Imaging Quality': 'IQ',
116
+ 'Overall Consistency': 'OC',
117
+ 'I2V Subject': 'IS',
118
+ 'I2V Background': 'IB',
119
+ }
120
+
121
+ # Expected column order (full names from JSON)
122
  PREDICT_COLUMN_ORDER = [
123
+ 'Model',
124
  'Overall',
125
+ 'Domain',
126
+ 'Quality',
127
  'Common Sense',
128
  'AV',
129
  'Robot',
 
134
  'Background Consistency',
135
  'Motion Smoothness',
136
  'Aesthetic Quality',
137
+ 'Imaging Quality',
138
  'Overall Consistency',
139
  'I2V Subject',
140
+ 'I2V Background'
 
 
141
  ]
142
 
143
  # Columns to hide by default (but still available for filtering/selection)
144
+ PREDICT_HIDDEN_COLUMNS = []
145
 
146
+ # Semantic/Domain dimensions (for selection button) - use abbreviations matching dataframe
147
  PREDICT_DOMAIN_SCORE_DIMENSIONS = [
148
+ 'Domain',
149
+ 'CS', 'AV', 'RO', 'IN', 'HU', 'PH',
150
  ]
151
 
152
+ # Quality dimensions (for selection button) - use abbreviations matching dataframe
153
  PREDICT_QUALITY_SCORE_DIMENSIONS = [
154
+ 'Quality',
155
+ 'SC', 'BC', 'MS', 'AQ', 'IQ', 'OC', 'IS', 'IB'
156
  ]
157
 
158
+ PREDICT_DESELECTED_COLUMNS = ['Domain', 'Quality']
159
 
160
  PREDICT_ALL_SELECTED_COLUMNS = [
161
+ 'Domain', 'Quality',
162
+ 'CS', 'AV', 'RO', 'IN', 'HU', 'PH',
163
+ 'SC', 'BC', 'MS', 'AQ', 'IQ', 'OC', 'IS', 'IB'
164
  ]
165
 
166
  # Columns that can never be deselected
167
+ PREDICT_NEVER_HIDDEN_COLUMNS = ['Model', 'Overall']
168
 
169
+ # Columns displayed by default
170
  PREDICT_DEFAULT_DISPLAYED_COLUMNS = PREDICT_NEVER_HIDDEN_COLUMNS + PREDICT_ALL_SELECTED_COLUMNS
171
 
172
  def load_predict_json(json_path):
 
188
  df['model'] = df.apply(create_link, axis=1)
189
  df = df.drop(columns=['url'])
190
 
191
+ df = df.rename(columns={'model': 'Model'})
192
+
193
  for col in df.columns:
194
+ if col != 'Model' and pd.api.types.is_numeric_dtype(df[col]):
195
  df[col] = df[col].apply(lambda x: f"{x:.1f}" if pd.notna(x) else x)
196
 
197
+ # Rename columns to abbreviations for display
198
+ df = df.rename(columns=PREDICT_COLUMN_ABBREV)
199
+
200
  return df
201
 
202
 
203
+ def get_predict_checkbox_choices(dataframe):
204
+ """Get checkbox choices with full name (abbrev) format"""
205
+ # Create reverse mapping from abbreviation to full name
206
+ abbrev_to_full = {v: k for k, v in PREDICT_COLUMN_ABBREV.items()}
207
+
208
+ choices = []
209
+ for col in dataframe.columns:
210
+ if col in ['Model', 'Overall']:
211
+ continue
212
+ if col in abbrev_to_full:
213
+ full_name = abbrev_to_full[col]
214
+ choices.append((f"{full_name} ({col})", col))
215
+ else:
216
+ choices.append((col, col))
217
+
218
+ return choices
219
+
220
+
221
  def select_predict_domain_score():
222
  """Return domain score for checkbox selection"""
223
  return gr.update(value=PREDICT_DOMAIN_SCORE_DIMENSIONS)
 
236
 
237
  def on_predict_dimension_selection_change(selected_columns, full_df):
238
  """Handle dimension selection changes and update the dataframe"""
239
+ present_columns = ['Model', 'Overall']
 
240
 
 
241
  for col in selected_columns:
242
  if col not in present_columns and col in full_df.columns:
243
  present_columns.append(col)
244
 
 
245
  updated_data = full_df[present_columns]
246
 
 
247
  datatypes = []
248
  for col in present_columns:
249
+ if col == 'Model':
250
  datatypes.append('markdown')
 
 
251
  else:
252
  datatypes.append('str')
253
 
 
268
  # Determine datatypes dynamically
269
  datatypes = []
270
  for col in display_df.columns:
271
+ if col == 'Model':
272
  datatypes.append('markdown')
 
 
273
  else:
274
+ datatypes.append('str')
275
 
 
276
  with gr.Row():
277
  with gr.Column(scale=1):
278
  domain_score_btn = gr.Button("Domain Score", size="md")
 
281
  deselect_btn = gr.Button("Deselect All", size="md")
282
 
283
  with gr.Column(scale=4):
284
+ # Get checkbox choices with "Full Name (Abbrev)" format
285
+ checkbox_choices = get_predict_checkbox_choices(dataframe)
 
286
 
287
  checkbox_group = gr.CheckboxGroup(
288
+ choices=checkbox_choices,
289
+ value=[col for col in PREDICT_ALL_SELECTED_COLUMNS if col in dataframe.columns],
290
  label="Evaluation Dimensions",
291
  interactive=True,
292
  )
 
298
  interactive=False,
299
  visible=True,
300
  wrap=False,
301
+ column_widths=["320px"] + ["80px"] * (len(display_df.columns) - 1),
302
  pinned_columns=1,
303
  elem_id="predict_leaderboard",
304
  max_height=10000,
 
355
  return data_component
356
 
357
 
358
+ # ============================================================================
359
+ # Conditional Generation Tab Configuration and Utilities
360
+ # ============================================================================
361
+
362
+ TRANSFER_COLUMN_ORDER = [
363
+ 'Model',
364
+ 'Condition',
365
+ 'Blur SSIM ↑',
366
+ 'Edge F1 ↑',
367
+ 'Depth si-RMSE ↓',
368
+ 'Mask mIoU ↑',
369
+ 'Quality Score ↑',
370
+ 'Diversity ↑'
371
+ ]
372
+
373
+ TRANSFER_HIDDEN_COLUMNS = []
374
+
375
+ TRANSFER_QUALITY_DIMENSIONS = [
376
+ 'Blur SSIM ↑',
377
+ 'Edge F1 ↑',
378
+ 'Depth si-RMSE ↓',
379
+ 'Mask mIoU ↑',
380
+ 'Quality Score ↑',
381
+ 'Diversity ↑',
382
+ ]
383
+
384
+ TRANSFER_ALL_SELECTED_COLUMNS = TRANSFER_QUALITY_DIMENSIONS
385
+
386
+ TRANSFER_NEVER_HIDDEN_COLUMNS = ['Model', 'Condition']
387
+
388
+ TRANSFER_DEFAULT_DISPLAYED_COLUMNS = TRANSFER_NEVER_HIDDEN_COLUMNS + TRANSFER_ALL_SELECTED_COLUMNS
389
+
390
+
391
+ def load_transfer_json(json_path):
392
+ """Load conditional generation leaderboard JSON"""
393
+ df = pd.read_json(json_path, orient='records')
394
+
395
+ if 'model' in df.columns and 'url' in df.columns:
396
+ def create_link(row):
397
+ if pd.notna(row['url']):
398
+ display_name = row['model'].split('/')[-1] if '/' in row['model'] else row['model']
399
+ return f"[{display_name}]({row['url']})"
400
+ return row['model']
401
+
402
+ df['model'] = df.apply(create_link, axis=1)
403
+ df = df.drop(columns=['url'])
404
+
405
+ df = df.rename(columns={'model': 'Model'})
406
+
407
+ for col in df.columns:
408
+ if col not in ['Model', 'Condition'] and pd.api.types.is_numeric_dtype(df[col]):
409
+ df[col] = df[col].apply(lambda x: f"{x:.3f}" if pd.notna(x) else x)
410
+
411
+ return df
412
+
413
+
414
+ def select_transfer_all():
415
+ """Select all dimensions"""
416
+ return gr.update(value=TRANSFER_ALL_SELECTED_COLUMNS)
417
+
418
+
419
+ def deselect_transfer_all():
420
+ """Deselect all dimensions"""
421
+ return gr.update(value=[])
422
+
423
+
424
+ def on_transfer_dimension_selection_change(selected_columns, full_df):
425
+ """Handle dimension selection changes and update the dataframe"""
426
+ present_columns = ['Model', 'Condition']
427
+
428
+ for col in selected_columns:
429
+ if col not in present_columns and col in full_df.columns:
430
+ present_columns.append(col)
431
+
432
+ updated_data = full_df[present_columns]
433
+
434
+ datatypes = []
435
+ for col in present_columns:
436
+ if col == 'Model':
437
+ datatypes.append('markdown')
438
+ else:
439
+ datatypes.append('str')
440
+
441
+ return gr.update(value=updated_data, datatype=datatypes, headers=present_columns)
442
+
443
+
444
+ def init_transfer_leaderboard(dataframe):
445
+ """Initialize the Conditional Generation leaderboard with given dataframe"""
446
+ if dataframe is None or dataframe.empty:
447
+ raise ValueError("Leaderboard DataFrame is empty or None.")
448
+
449
+ available_default_cols = [col for col in TRANSFER_DEFAULT_DISPLAYED_COLUMNS if col in dataframe.columns]
450
+
451
+ display_df = dataframe[available_default_cols]
452
+
453
+ datatypes = []
454
+ for col in display_df.columns:
455
+ if col == 'Model':
456
+ datatypes.append('markdown')
457
+ else:
458
+ datatypes.append('str')
459
+
460
+ with gr.Row():
461
+ with gr.Column(scale=1):
462
+ select_all_btn = gr.Button("Select All", size="md")
463
+ deselect_btn = gr.Button("Deselect All", size="md")
464
+
465
+ with gr.Column(scale=4):
466
+ dimension_choices = [col for col in dataframe.columns
467
+ if col not in TRANSFER_NEVER_HIDDEN_COLUMNS]
468
+
469
+ checkbox_group = gr.CheckboxGroup(
470
+ choices=dimension_choices,
471
+ value=[col for col in TRANSFER_DEFAULT_DISPLAYED_COLUMNS if col in dimension_choices],
472
+ label="Evaluation Dimensions",
473
+ interactive=True,
474
+ )
475
+
476
+ data_component = gr.Dataframe(
477
+ value=display_df,
478
+ headers=list(display_df.columns),
479
+ datatype=datatypes,
480
+ interactive=False,
481
+ visible=True,
482
+ wrap=False,
483
+ column_widths=["280px", "120px"] + ["150px"] * (len(display_df.columns) - 2),
484
+ pinned_columns=2,
485
+ elem_id="transfer_leaderboard",
486
+ max_height=10000,
487
+ )
488
+
489
+ deselect_btn.click(
490
+ deselect_transfer_all,
491
+ inputs=None,
492
+ outputs=[checkbox_group]
493
+ ).then(
494
+ fn=on_transfer_dimension_selection_change,
495
+ inputs=[checkbox_group, gr.State(dataframe)],
496
+ outputs=data_component
497
+ )
498
+
499
+ select_all_btn.click(
500
+ select_transfer_all,
501
+ inputs=None,
502
+ outputs=[checkbox_group]
503
+ ).then(
504
+ fn=on_transfer_dimension_selection_change,
505
+ inputs=[checkbox_group, gr.State(dataframe)],
506
+ outputs=data_component
507
+ )
508
+
509
+ checkbox_group.change(
510
+ fn=on_transfer_dimension_selection_change,
511
+ inputs=[checkbox_group, gr.State(dataframe)],
512
+ outputs=data_component
513
+ )
514
+
515
+ return data_component
516
+
517
+
518
  # ============================================================================
519
  # Understanding Tab Configuration and Utilities
520
  # ============================================================================
521
 
522
+ # Column name to abbreviation mapping for display
523
+ REASON_COLUMN_ABBREV = {
524
+ 'Common Sense': 'CS',
525
+ 'Embodied Reasoning': 'ER',
526
+ 'BridgeData V2': 'BD',
527
+ 'RoboVQA': 'RV',
528
+ 'RoboFail': 'RF',
529
+ 'Agibot': 'AB',
530
+ 'HoloAssist': 'HA',
531
  }
532
 
533
+ # Desired column order (full names from JSON)
534
  REASON_COLUMN_ORDER = [
535
+ 'Model',
536
+ 'Thinking',
537
  'Overall',
538
  'Common Sense',
539
  'Embodied Reasoning',
 
545
  'RoboFail',
546
  'Agibot',
547
  'HoloAssist',
548
+ 'AV'
 
 
549
  ]
550
 
551
  # Columns to hide by default (but still available for filtering/selection)
552
+ REASON_HIDDEN_COLUMNS = []
553
 
554
+ # Reasoning dimensions (for selection button) - use abbreviations matching dataframe
555
  REASON_COMMON_SENSE_DIMENSIONS = [
556
+ 'CS',
557
  'Space',
558
  'Time',
559
  'Physics',
560
  ]
561
 
562
+ # Domain dimensions (for selection button) - use abbreviations matching dataframe
563
  REASON_EMBODIED_REASONING_DIMENSIONS = [
564
+ 'ER',
565
  'Space',
566
  'Time',
567
  'Physics',
568
+ 'BD', 'RV', 'RF', 'AB', 'HA', 'AV',
569
  ]
570
 
571
  REASON_DESELECTED_COLUMNS = [
572
+ 'CS',
573
+ 'ER',
574
  ]
575
 
576
  REASON_ALL_SELECTED_COLUMNS = [
577
+ 'CS', 'ER',
578
+ 'Space', 'Time', 'Physics',
579
+ 'BD', 'RV', 'RF', 'AB', 'HA', 'AV',
580
  ]
581
 
582
  # Columns that can never be deselected
583
+ REASON_NEVER_HIDDEN_COLUMNS = ['Model', 'Thinking', 'Overall']
584
 
585
  # Columns displayed by default (using renamed column names)
586
  REASON_DEFAULT_DISPLAYED_COLUMNS = REASON_NEVER_HIDDEN_COLUMNS + REASON_ALL_SELECTED_COLUMNS
587
 
588
 
589
+ def load_reason_json(json_path):
590
+ """Load understanding leaderboard JSON"""
591
+ df = pd.read_json(json_path, orient='records')
592
 
593
+ if 'model' in df.columns and 'url' in df.columns:
594
+ def create_link(row):
595
+ if pd.notna(row['url']):
596
+ display_name = row['model'].split('/')[-1] if '/' in row['model'] else row['model']
597
+ return f"[{display_name}]({row['url']})"
598
+ return row['model']
599
 
600
+ df['model'] = df.apply(create_link, axis=1)
601
+ df = df.drop(columns=['url'])
 
602
 
603
+ df = df.rename(columns={'model': 'Model'})
 
 
604
 
 
605
  for col in df.columns:
606
+ if col != 'Model' and pd.api.types.is_numeric_dtype(df[col]):
607
  df[col] = df[col].apply(lambda x: f"{x:.1f}" if pd.notna(x) else x)
608
 
609
+ # Rename columns to abbreviations for display
610
+ df = df.rename(columns=REASON_COLUMN_ABBREV)
611
+
612
  return df
613
 
614
 
615
+ def get_reason_checkbox_choices(dataframe):
616
+ """Get checkbox choices with full name (abbrev) format"""
617
+ # Create reverse mapping from abbreviation to full name
618
+ abbrev_to_full = {v: k for k, v in REASON_COLUMN_ABBREV.items()}
619
+
620
+ choices = []
621
+ for col in dataframe.columns:
622
+ if col in ['Model', 'Thinking', 'Overall']:
623
+ continue
624
+ if col in abbrev_to_full:
625
+ full_name = abbrev_to_full[col]
626
+ choices.append((f"{full_name} ({col})", col))
627
+ else:
628
+ choices.append((col, col))
629
+
630
+ return choices
631
+
632
+
633
  def select_reason_common_sense_dimensions():
634
  """Return reasoning dimensions for checkbox selection"""
635
  return gr.update(value=REASON_COMMON_SENSE_DIMENSIONS)
 
652
 
653
  def on_reason_dimension_selection_change(selected_columns, full_df):
654
  """Handle dimension selection changes and update the dataframe"""
655
+ present_columns = ['Model', 'Thinking', 'Overall']
 
656
 
 
657
  for col in selected_columns:
658
  if col not in present_columns and col in full_df.columns:
659
  present_columns.append(col)
660
 
 
661
  updated_data = full_df[present_columns]
662
 
 
663
  datatypes = []
664
  for col in present_columns:
665
+ if col == 'Model':
666
  datatypes.append('markdown')
 
 
667
  else:
668
  datatypes.append('str')
669
 
 
684
  # Determine datatypes dynamically
685
  datatypes = []
686
  for col in display_df.columns:
687
+ if col == 'Model':
688
  datatypes.append('markdown')
 
 
689
  else:
690
+ datatypes.append('str')
691
 
 
692
  with gr.Row():
693
  with gr.Column(scale=1):
694
  common_sense_btn = gr.Button("Common Sense", size="md")
 
697
  deselect_btn = gr.Button("Deselect All", size="md")
698
 
699
  with gr.Column(scale=4):
700
+ # Get checkbox choices with "Full Name (Abbrev)" format
701
+ checkbox_choices = get_reason_checkbox_choices(dataframe)
 
702
 
703
  checkbox_group = gr.CheckboxGroup(
704
+ choices=checkbox_choices,
705
+ value=[col for col in REASON_ALL_SELECTED_COLUMNS if col in dataframe.columns],
706
  label="Evaluation Dimensions",
707
  interactive=True,
708
  )
 
713
  datatype=datatypes,
714
  interactive=False,
715
  visible=True,
716
+ wrap=False,
717
+ column_widths=["320px", "100px"] + ["100px"] * (len(display_df.columns) - 2),
718
+ pinned_columns=2,
719
  elem_id="reason_leaderboard",
720
  max_height=10000,
 
721
  )
722
 
723
  # Setup event handlers
 
776
 
777
  demo = gr.Blocks()
778
  with demo:
779
+ gr.HTML(f"<style>{CSS}</style>")
780
  gr.HTML(TITLE)
781
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
782
 
783
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
784
  with gr.TabItem("🎨 Generation", elem_id="predict-tab", id=0):
785
+ predict_df = load_predict_json("data/generation-leaderboard.json")
786
  predict_leaderboard = init_predict_leaderboard(predict_df)
787
 
788
  with gr.TabItem("🔄 Conditional Generation", elem_id="transfer-tab", id=1):
789
+ transfer_df = load_transfer_json("data/conditional_generation-leaderboard.json")
790
+ transfer_leaderboard = init_transfer_leaderboard(transfer_df)
791
 
792
  with gr.TabItem("🧠 Understanding", elem_id="reason-tab", id=2):
793
+ reason_df = load_reason_json("data/understanding-leaderboard.json")
794
  reason_leaderboard = init_reason_leaderboard(reason_df)
795
 
796
  with gr.TabItem("ℹ️ About", elem_id="about-tab", id=3):
797
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
798
 
799
+ demo.launch()
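
A minimal local sanity check for the relocated data files, assuming pandas is installed and the script runs from the Space root so the `data/` paths introduced by this commit resolve. The read call mirrors the loaders in `app.py`; everything else here is illustrative:

```python
import pandas as pd

# Paths renamed/added in this commit under data/
PATHS = [
    "data/generation-leaderboard.json",
    "data/conditional_generation-leaderboard.json",
    "data/understanding-leaderboard.json",
]

for path in PATHS:
    # Each file is a list of records, one dict per leaderboard row
    df = pd.read_json(path, orient="records")
    # app.py folds 'url' into a markdown link and drops it; dropping it here
    # approximates the columns that end up displayed
    display_cols = [c for c in df.columns if c != "url"]
    print(f"{path}: {len(df)} rows")
    print("  columns:", display_cols)
```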
 
data/conditional_generation-leaderboard.json ADDED
@@ -0,0 +1,123 @@
1
+ [
2
+ {
3
+ "model":"Cosmos-Transfer2.5-2B",
4
+ "Condition":"Blur",
5
+ "url":"https:\/\/huggingface.co\/nvidia\/Cosmos-Transfer2.5-2B",
6
+ "Blur SSIM ↑":0.905,
7
+ "Edge F1 ↑":0.259,
8
+ "Depth si-RMSE ↓":0.543,
9
+ "Mask mIoU ↑":0.753,
10
+ "Quality Score ↑":8.765,
11
+ "Diversity ↑":0.177
12
+ },
13
+ {
14
+ "model":"Cosmos-Transfer2.5-2B",
15
+ "Condition":"All",
16
+ "url":"https:\/\/huggingface.co\/nvidia\/Cosmos-Transfer2.5-2B",
17
+ "Blur SSIM ↑":0.896,
18
+ "Edge F1 ↑":0.448,
19
+ "Depth si-RMSE ↓":0.594,
20
+ "Mask mIoU ↑":0.765,
21
+ "Quality Score ↑":9.241,
22
+ "Diversity ↑":0.128
23
+ },
24
+ {
25
+ "model":"Cosmos-Transfer2.5-2B",
26
+ "Condition":"Edge",
27
+ "url":"https:\/\/huggingface.co\/nvidia\/Cosmos-Transfer2.5-2B",
28
+ "Blur SSIM ↑":0.759,
29
+ "Edge F1 ↑":0.392,
30
+ "Depth si-RMSE ↓":0.735,
31
+ "Mask mIoU ↑":0.744,
32
+ "Quality Score ↑":8.045,
33
+ "Diversity ↑":0.356
34
+ },
35
+ {
36
+ "model":"Cosmos-Transfer2.5-2B",
37
+ "Condition":"Depth",
38
+ "url":"https:\/\/huggingface.co\/nvidia\/Cosmos-Transfer2.5-2B",
39
+ "Blur SSIM ↑":0.695,
40
+ "Edge F1 ↑":0.17,
41
+ "Depth si-RMSE ↓":0.827,
42
+ "Mask mIoU ↑":0.718,
43
+ "Quality Score ↑":7.299,
44
+ "Diversity ↑":0.405
45
+ },
46
+ {
47
+ "model":"Wan2.2-Fun-A14B-Control",
48
+ "Condition":"Edge",
49
+ "url":"https:\/\/huggingface.co\/alibaba-pai\/Wan2.2-Fun-A14B-Control",
50
+ "Blur SSIM ↑":0.68,
51
+ "Edge F1 ↑":0.374,
52
+ "Depth si-RMSE ↓":0.839,
53
+ "Mask mIoU ↑":0.741,
54
+ "Quality Score ↑":9.001,
55
+ "Diversity ↑":0.384
56
+ },
57
+ {
58
+ "model":"Cosmos-Transfer2.5-2B",
59
+ "Condition":"Seg",
60
+ "url":"https:\/\/huggingface.co\/nvidia\/Cosmos-Transfer2.5-2B",
61
+ "Blur SSIM ↑":0.662,
62
+ "Edge F1 ↑":0.128,
63
+ "Depth si-RMSE ↓":1.073,
64
+ "Mask mIoU ↑":0.709,
65
+ "Quality Score ↑":7.868,
66
+ "Diversity ↑":0.436
67
+ },
68
+ {
69
+ "model":"Wan2.2-Fun-5B-Control",
70
+ "Condition":"Edge",
71
+ "url":"https:\/\/huggingface.co\/alibaba-pai\/Wan2.2-Fun-5B-Control",
72
+ "Blur SSIM ↑":0.61,
73
+ "Edge F1 ↑":0.271,
74
+ "Depth si-RMSE ↓":1.011,
75
+ "Mask mIoU ↑":0.71,
76
+ "Quality Score ↑":8.793,
77
+ "Diversity ↑":0.399
78
+ },
79
+ {
80
+ "model":"Wan2.2-Fun-A14B-Control",
81
+ "Condition":"Blur",
82
+ "url":"https:\/\/huggingface.co\/alibaba-pai\/Wan2.2-Fun-A14B-Control",
83
+ "Blur SSIM ↑":0.567,
84
+ "Edge F1 ↑":0.087,
85
+ "Depth si-RMSE ↓":2.109,
86
+ "Mask mIoU ↑":0.502,
87
+ "Quality Score ↑":8.808,
88
+ "Diversity ↑":0.53
89
+ },
90
+ {
91
+ "model":"Wan2.2-Fun-A14B-Control",
92
+ "Condition":"Depth",
93
+ "url":"https:\/\/huggingface.co\/alibaba-pai\/Wan2.2-Fun-A14B-Control",
94
+ "Blur SSIM ↑":0.559,
95
+ "Edge F1 ↑":0.109,
96
+ "Depth si-RMSE ↓":2.097,
97
+ "Mask mIoU ↑":0.577,
98
+ "Quality Score ↑":9.221,
99
+ "Diversity ↑":0.517
100
+ },
101
+ {
102
+ "model":"Wan2.2-Fun-5B-Control",
103
+ "Condition":"Depth",
104
+ "url":"https:\/\/huggingface.co\/alibaba-pai\/Wan2.2-Fun-5B-Control",
105
+ "Blur SSIM ↑":0.556,
106
+ "Edge F1 ↑":0.106,
107
+ "Depth si-RMSE ↓":1.819,
108
+ "Mask mIoU ↑":0.615,
109
+ "Quality Score ↑":9.317,
110
+ "Diversity ↑":0.481
111
+ },
112
+ {
113
+ "model":"Wan2.2-Fun-A14B-Control",
114
+ "Condition":"Seg",
115
+ "url":"https:\/\/huggingface.co\/alibaba-pai\/Wan2.2-Fun-A14B-Control",
116
+ "Blur SSIM ↑":0.472,
117
+ "Edge F1 ↑":0.097,
118
+ "Depth si-RMSE ↓":1.601,
119
+ "Mask mIoU ↑":0.663,
120
+ "Quality Score ↑":7.791,
121
+ "Diversity ↑":0.355
122
+ }
123
+ ]
data/{predict-leaderboard.json → generation-leaderboard.json} RENAMED
@@ -1,10 +1,31 @@
1
  [
2
  {
3
  "model":"Veo-3",
4
  "url":"https:\/\/deepmind.google\/models\/veo",
5
  "Overall":82.1,
6
- "Domain Score":86.7,
7
- "Quality Score":77.6,
8
  "Common Sense":94.4,
9
  "AV":68.7,
10
  "Robot":86.9,
@@ -15,19 +36,38 @@
15
  "Background Consistency":93.1,
16
  "Motion Smoothness":99.2,
17
  "Aesthetic Quality":51.9,
18
- "Image Quality":69.8,
19
  "Overall Consistency":21.7,
20
  "I2V Subject":97.0,
21
- "I2V Background":96.9,
22
- "params":null,
23
- "activate_params":null
24
  },
25
  {
26
- "model":"nvidia\/Cosmos-Predict2.5-2B",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  "url":"https:\/\/huggingface.co\/nvidia\/Cosmos-Predict2.5-2B",
28
  "Overall":81.0,
29
- "Domain Score":84.0,
30
- "Quality Score":77.9,
31
  "Common Sense":94.1,
32
  "AV":66.1,
33
  "Robot":80.8,
@@ -38,19 +78,17 @@
38
  "Background Consistency":94.2,
39
  "Motion Smoothness":99.1,
40
  "Aesthetic Quality":52.4,
41
- "Image Quality":70.8,
42
  "Overall Consistency":20.1,
43
  "I2V Subject":96.6,
44
- "I2V Background":97.4,
45
- "params":2.0,
46
- "activate_params":2.0
47
  },
48
  {
49
- "model":"Wan-AI\/Wan2.2-I2V-A14B",
50
  "url":"https:\/\/huggingface.co\/Wan-AI\/Wan2.2-I2V-A14B",
51
  "Overall":80.6,
52
- "Domain Score":84.1,
53
- "Quality Score":77.2,
54
  "Common Sense":93.2,
55
  "AV":66.3,
56
  "Robot":81.7,
@@ -61,19 +99,17 @@
61
  "Background Consistency":93.7,
62
  "Motion Smoothness":98.3,
63
  "Aesthetic Quality":51.2,
64
- "Image Quality":69.6,
65
  "Overall Consistency":20.4,
66
  "I2V Subject":96.0,
67
- "I2V Background":96.6,
68
- "params":14.0,
69
- "activate_params":14.0
70
  },
71
  {
72
- "model":"Wan-AI\/Wan2.2-TI2V-5B",
73
  "url":"https:\/\/huggingface.co\/Wan-AI\/Wan2.2-TI2V-5B",
74
  "Overall":80.4,
75
- "Domain Score":83.4,
76
- "Quality Score":77.4,
77
  "Common Sense":93.1,
78
  "AV":65.2,
79
  "Robot":79.3,
@@ -84,19 +120,38 @@
84
  "Background Consistency":93.7,
85
  "Motion Smoothness":98.8,
86
  "Aesthetic Quality":51.9,
87
- "Image Quality":69.9,
88
  "Overall Consistency":20.3,
89
  "I2V Subject":95.9,
90
- "I2V Background":96.7,
91
- "params":5.0,
92
- "activate_params":5.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  },
94
  {
95
- "model":"Wan-AI\/Wan2.1-I2V-14B-720P",
96
  "url":"https:\/\/huggingface.co\/Wan-AI\/Wan2.1-I2V-14B-720P",
97
- "Overall":79.7,
98
- "Domain Score":82.7,
99
- "Quality Score":76.8,
100
  "Common Sense":90.6,
101
  "AV":66.9,
102
  "Robot":80.1,
@@ -107,19 +162,38 @@
107
  "Background Consistency":93.1,
108
  "Motion Smoothness":98.1,
109
  "Aesthetic Quality":51.5,
110
- "Image Quality":70.1,
111
  "Overall Consistency":20.4,
112
  "I2V Subject":95.2,
113
- "I2V Background":96.0,
114
- "params":14.0,
115
- "activate_params":14.0
116
  },
117
  {
118
- "model":"MAGI\/MAGI-1-24B",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  "url":"https:\/\/huggingface.co\/sand-ai\/MAGI-1",
120
  "Overall":78.5,
121
- "Domain Score":80.5,
122
- "Quality Score":76.5,
123
  "Common Sense":90.6,
124
  "AV":61.8,
125
  "Robot":73.5,
@@ -130,19 +204,17 @@
130
  "Background Consistency":92.4,
131
  "Motion Smoothness":99.0,
132
  "Aesthetic Quality":50.2,
133
- "Image Quality":64.2,
134
  "Overall Consistency":21.4,
135
  "I2V Subject":96.8,
136
- "I2V Background":97.9,
137
- "params":24.0,
138
- "activate_params":24.0
139
  },
140
  {
141
- "model":"THUDM\/CogVideoX1.5-5B-I2V",
142
  "url":"https:\/\/huggingface.co\/THUDM\/CogVideoX1.5-5B-I2V",
143
  "Overall":78.3,
144
- "Domain Score":80.1,
145
- "Quality Score":76.6,
146
  "Common Sense":89.1,
147
  "AV":59.7,
148
  "Robot":73.0,
@@ -153,19 +225,17 @@
153
  "Background Consistency":93.9,
154
  "Motion Smoothness":98.5,
155
  "Aesthetic Quality":50.0,
156
- "Image Quality":66.5,
157
  "Overall Consistency":21.2,
158
  "I2V Subject":95.0,
159
- "I2V Background":96.1,
160
- "params":5.0,
161
- "activate_params":5.0
162
  },
163
  {
164
- "model":"THUDM\/CogVideoX-5B-I2V",
165
- "url":"https:\/\/huggingface.co\/THUDM\/CogVideoX-5B-I2V",
166
  "Overall":77.9,
167
- "Domain Score":79.5,
168
- "Quality Score":76.3,
169
  "Common Sense":87.7,
170
  "AV":58.0,
171
  "Robot":74.0,
@@ -176,19 +246,17 @@
176
  "Background Consistency":93.4,
177
  "Motion Smoothness":98.0,
178
  "Aesthetic Quality":51.2,
179
- "Image Quality":64.6,
180
  "Overall Consistency":21.3,
181
  "I2V Subject":94.1,
182
- "I2V Background":95.9,
183
- "params":5.0,
184
- "activate_params":5.0
185
  },
186
  {
187
- "model":"Lightricks\/LTX-Video-13B",
188
  "url":"https:\/\/huggingface.co\/Lightricks\/LTX-Video",
189
  "Overall":77.9,
190
- "Domain Score":78.4,
191
- "Quality Score":77.4,
192
  "Common Sense":88.9,
193
  "AV":55.3,
194
  "Robot":70.1,
@@ -199,19 +267,17 @@
199
  "Background Consistency":93.5,
200
  "Motion Smoothness":99.0,
201
  "Aesthetic Quality":53.5,
202
- "Image Quality":69.5,
203
  "Overall Consistency":21.4,
204
  "I2V Subject":95.7,
205
- "I2V Background":96.0,
206
- "params":13.0,
207
- "activate_params":13.0
208
  },
209
  {
210
- "model":"Tencent\/HunyuanVideo-I2V",
211
- "url":"https:\/\/huggingface.co\/Tencent\/HunyuanVideo-I2V",
212
  "Overall":77.4,
213
- "Domain Score":76.8,
214
- "Quality Score":78.0,
215
  "Common Sense":87.4,
216
  "AV":56.3,
217
  "Robot":67.7,
@@ -222,42 +288,17 @@
222
  "Background Consistency":95.3,
223
  "Motion Smoothness":99.5,
224
  "Aesthetic Quality":52.1,
225
- "Image Quality":65.2,
226
  "Overall Consistency":21.5,
227
  "I2V Subject":98.6,
228
- "I2V Background":97.6,
229
- "params":null,
230
- "activate_params":null
231
  },
232
  {
233
- "model":"MAGI\/MAGI-1-4.5B",
234
- "url":"https:\/\/huggingface.co\/sand-ai\/MAGI-1",
235
- "Overall":76.9,
236
- "Domain Score":77.4,
237
- "Quality Score":76.3,
238
- "Common Sense":87.5,
239
- "AV":56.3,
240
- "Robot":71.6,
241
- "Industry":79.8,
242
- "Human":76.0,
243
- "Physics":88.9,
244
- "Subject Consistency":92.1,
245
- "Background Consistency":93.3,
246
- "Motion Smoothness":99.0,
247
- "Aesthetic Quality":50.4,
248
- "Image Quality":61.8,
249
- "Overall Consistency":21.6,
250
- "I2V Subject":94.5,
251
- "I2V Background":98.1,
252
- "params":4.5,
253
- "activate_params":4.5
254
- },
255
- {
256
- "model":"Lightricks\/LTX-Video-2B",
257
  "url":"https:\/\/huggingface.co\/Lightricks\/LTX-Video",
258
  "Overall":76.9,
259
- "Domain Score":76.6,
260
- "Quality Score":77.1,
261
  "Common Sense":87.3,
262
  "AV":53.6,
263
  "Robot":67.1,
@@ -268,19 +309,38 @@
268
  "Background Consistency":92.7,
269
  "Motion Smoothness":98.7,
270
  "Aesthetic Quality":53.2,
271
- "Image Quality":71.3,
272
  "Overall Consistency":21.1,
273
  "I2V Subject":95.0,
274
- "I2V Background":95.9,
275
- "params":2.0,
276
- "activate_params":2.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
  },
278
  {
279
- "model":"Doubiiu\/DynamiCrafter_1024",
280
  "url":"https:\/\/huggingface.co\/Doubiiu\/DynamiCrafter_1024",
281
  "Overall":69.7,
282
- "Domain Score":65.6,
283
- "Quality Score":73.7,
284
  "Common Sense":75.2,
285
  "AV":43.4,
286
  "Robot":55.0,
@@ -291,11 +351,9 @@
291
  "Background Consistency":92.5,
292
  "Motion Smoothness":94.9,
293
  "Aesthetic Quality":51.5,
294
- "Image Quality":68.0,
295
  "Overall Consistency":21.2,
296
  "I2V Subject":84.5,
297
- "I2V Background":86.2,
298
- "params":null,
299
- "activate_params":null
300
  }
301
  ]
 
1
  [
2
+ {
3
+ "model":"Source",
4
+ "url":null,
5
+ "Overall":82.6,
6
+ "Domain":87.1,
7
+ "Quality":78.0,
8
+ "Common Sense":96.4,
9
+ "AV":71.3,
10
+ "Robot":86.2,
11
+ "Industry":88.6,
12
+ "Human":83.5,
13
+ "Physics":93.5,
14
+ "Subject Consistency":93.3,
15
+ "Background Consistency":94.2,
16
+ "Motion Smoothness":99.1,
17
+ "Aesthetic Quality":51.7,
18
+ "Imaging Quality":68.4,
19
+ "Overall Consistency":21.5,
20
+ "I2V Subject":97.8,
21
+ "I2V Background":98.2
22
+ },
23
  {
24
  "model":"Veo-3",
25
  "url":"https:\/\/deepmind.google\/models\/veo",
26
  "Overall":82.1,
27
+ "Domain":86.7,
28
+ "Quality":77.6,
29
  "Common Sense":94.4,
30
  "AV":68.7,
31
  "Robot":86.9,
 
36
  "Background Consistency":93.1,
37
  "Motion Smoothness":99.2,
38
  "Aesthetic Quality":51.9,
39
+ "Imaging Quality":69.8,
40
  "Overall Consistency":21.7,
41
  "I2V Subject":97.0,
42
+ "I2V Background":96.9
 
 
43
  },
44
  {
45
+ "model":"Cosmos-Predict2.5-14B",
46
+ "url":"https:\/\/huggingface.co\/nvidia\/Cosmos-Predict2.5-14B",
47
+ "Overall":81.0,
48
+ "Domain":83.8,
49
+ "Quality":78.1,
50
+ "Common Sense":94.2,
51
+ "AV":67.8,
52
+ "Robot":79.9,
53
+ "Industry":87.7,
54
+ "Human":80.0,
55
+ "Physics":93.5,
56
+ "Subject Consistency":93.4,
57
+ "Background Consistency":94.8,
58
+ "Motion Smoothness":99.1,
59
+ "Aesthetic Quality":52.5,
60
+ "Imaging Quality":70.0,
61
+ "Overall Consistency":20.1,
62
+ "I2V Subject":97.2,
63
+ "I2V Background":97.9
64
+ },
65
+ {
66
+ "model":"Cosmos-Predict2.5-2B",
67
  "url":"https:\/\/huggingface.co\/nvidia\/Cosmos-Predict2.5-2B",
68
  "Overall":81.0,
69
+ "Domain":84.0,
70
+ "Quality":77.9,
71
  "Common Sense":94.1,
72
  "AV":66.1,
73
  "Robot":80.8,
 
78
  "Background Consistency":94.2,
79
  "Motion Smoothness":99.1,
80
  "Aesthetic Quality":52.4,
81
+ "Imaging Quality":70.8,
82
  "Overall Consistency":20.1,
83
  "I2V Subject":96.6,
84
+ "I2V Background":97.4
 
 
85
  },
86
  {
87
+ "model":"Wan2.2-I2V-A14B",
88
  "url":"https:\/\/huggingface.co\/Wan-AI\/Wan2.2-I2V-A14B",
89
  "Overall":80.6,
90
+ "Domain":84.1,
91
+ "Quality":77.2,
92
  "Common Sense":93.2,
93
  "AV":66.3,
94
  "Robot":81.7,
 
99
  "Background Consistency":93.7,
100
  "Motion Smoothness":98.3,
101
  "Aesthetic Quality":51.2,
102
+ "Imaging Quality":69.6,
103
  "Overall Consistency":20.4,
104
  "I2V Subject":96.0,
105
+ "I2V Background":96.6
 
 
106
  },
107
  {
108
+ "model":"Wan2.2-TI2V-5B",
109
  "url":"https:\/\/huggingface.co\/Wan-AI\/Wan2.2-TI2V-5B",
110
  "Overall":80.4,
111
+ "Domain":83.4,
112
+ "Quality":77.4,
113
  "Common Sense":93.1,
114
  "AV":65.2,
115
  "Robot":79.3,
 
120
  "Background Consistency":93.7,
121
  "Motion Smoothness":98.8,
122
  "Aesthetic Quality":51.9,
123
+ "Imaging Quality":69.9,
124
  "Overall Consistency":20.3,
125
  "I2V Subject":95.9,
126
+ "I2V Background":96.7
127
+ },
128
+ {
129
+ "model":"Cosmos-Predict2-14B-Video2World",
130
+ "url":"https:\/\/huggingface.co\/nvidia\/Cosmos-Predict2-14B-Video2World",
131
+ "Overall":80.0,
132
+ "Domain":84.3,
133
+ "Quality":75.8,
134
+ "Common Sense":93.1,
135
+ "AV":67.1,
136
+ "Robot":80.3,
137
+ "Industry":86.9,
138
+ "Human":82.3,
139
+ "Physics":92.8,
140
+ "Subject Consistency":89.6,
141
+ "Background Consistency":92.8,
142
+ "Motion Smoothness":98.0,
143
+ "Aesthetic Quality":49.8,
144
+ "Imaging Quality":67.5,
145
+ "Overall Consistency":21.5,
146
+ "I2V Subject":92.2,
147
+ "I2V Background":94.9
148
  },
149
  {
150
+ "model":"Wan2.1-I2V-14B-720P",
151
  "url":"https:\/\/huggingface.co\/Wan-AI\/Wan2.1-I2V-14B-720P",
152
+ "Overall":79.8,
153
+ "Domain":82.7,
154
+ "Quality":76.8,
155
  "Common Sense":90.6,
156
  "AV":66.9,
157
  "Robot":80.1,
 
162
  "Background Consistency":93.1,
163
  "Motion Smoothness":98.1,
164
  "Aesthetic Quality":51.5,
165
+ "Imaging Quality":70.1,
166
  "Overall Consistency":20.4,
167
  "I2V Subject":95.2,
168
+ "I2V Background":96.0
 
 
169
  },
170
  {
171
+ "model":"Cosmos-Predict2-2B-Video2World",
172
+ "url":"https:\/\/huggingface.co\/nvidia\/Cosmos-Predict2-2B-Video2World",
173
+ "Overall":79.6,
174
+ "Domain":83.9,
175
+ "Quality":75.2,
176
+ "Common Sense":92.0,
177
+ "AV":66.1,
178
+ "Robot":80.6,
179
+ "Industry":86.0,
180
+ "Human":82.8,
181
+ "Physics":92.0,
182
+ "Subject Consistency":88.7,
183
+ "Background Consistency":92.1,
184
+ "Motion Smoothness":97.6,
185
+ "Aesthetic Quality":49.3,
186
+ "Imaging Quality":65.9,
187
+ "Overall Consistency":21.6,
188
+ "I2V Subject":91.9,
189
+ "I2V Background":94.6
190
+ },
191
+ {
192
+ "model":"MAGI-1-24B",
193
  "url":"https:\/\/huggingface.co\/sand-ai\/MAGI-1",
194
  "Overall":78.5,
195
+ "Domain":80.5,
196
+ "Quality":76.5,
197
  "Common Sense":90.6,
198
  "AV":61.8,
199
  "Robot":73.5,
 
204
  "Background Consistency":92.4,
205
  "Motion Smoothness":99.0,
206
  "Aesthetic Quality":50.2,
207
+ "Imaging Quality":64.2,
208
  "Overall Consistency":21.4,
209
  "I2V Subject":96.8,
210
+ "I2V Background":97.9
 
 
211
  },
212
  {
213
+ "model":"CogVideoX1.5-5B-I2V",
214
  "url":"https:\/\/huggingface.co\/THUDM\/CogVideoX1.5-5B-I2V",
215
  "Overall":78.3,
216
+ "Domain":80.1,
217
+ "Quality":76.6,
218
  "Common Sense":89.1,
219
  "AV":59.7,
220
  "Robot":73.0,
 
225
  "Background Consistency":93.9,
226
  "Motion Smoothness":98.5,
227
  "Aesthetic Quality":50.0,
228
+ "Imaging Quality":66.5,
229
  "Overall Consistency":21.2,
230
  "I2V Subject":95.0,
231
+ "I2V Background":96.1
 
 
232
  },
233
  {
234
+ "model":"CogVideoX-5b-I2V",
235
+ "url":"https:\/\/huggingface.co\/THUDM\/CogVideoX-5b-I2V",
236
  "Overall":77.9,
237
+ "Domain":79.5,
238
+ "Quality":76.3,
239
  "Common Sense":87.7,
240
  "AV":58.0,
241
  "Robot":74.0,
 
246
  "Background Consistency":93.4,
247
  "Motion Smoothness":98.0,
248
  "Aesthetic Quality":51.2,
249
+ "Imaging Quality":64.6,
250
  "Overall Consistency":21.3,
251
  "I2V Subject":94.1,
252
+ "I2V Background":95.9
 
 
253
  },
254
  {
255
+ "model":"LTX-Video-13B",
256
  "url":"https:\/\/huggingface.co\/Lightricks\/LTX-Video",
257
  "Overall":77.9,
258
+ "Domain":78.4,
259
+ "Quality":77.4,
260
  "Common Sense":88.9,
261
  "AV":55.3,
262
  "Robot":70.1,
 
267
  "Background Consistency":93.5,
268
  "Motion Smoothness":99.0,
269
  "Aesthetic Quality":53.5,
270
+ "Imaging Quality":69.5,
271
  "Overall Consistency":21.4,
272
  "I2V Subject":95.7,
273
+ "I2V Background":96.0
 
 
274
  },
275
  {
276
+ "model":"HunyuanVideo-I2V",
277
+ "url":"https:\/\/huggingface.co\/tencent\/HunyuanVideo-I2V",
278
  "Overall":77.4,
279
+ "Domain":76.8,
280
+ "Quality":78.0,
281
  "Common Sense":87.4,
282
  "AV":56.3,
283
  "Robot":67.7,
 
288
  "Background Consistency":95.3,
289
  "Motion Smoothness":99.5,
290
  "Aesthetic Quality":52.1,
291
+ "Imaging Quality":65.2,
292
  "Overall Consistency":21.5,
293
  "I2V Subject":98.6,
294
+ "I2V Background":97.6
 
 
295
  },
296
  {
297
+ "model":"LTX-Video-2B",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
  "url":"https:\/\/huggingface.co\/Lightricks\/LTX-Video",
299
  "Overall":76.9,
300
+ "Domain":76.6,
301
+ "Quality":77.1,
302
  "Common Sense":87.3,
303
  "AV":53.6,
304
  "Robot":67.1,
 
309
  "Background Consistency":92.7,
310
  "Motion Smoothness":98.7,
311
  "Aesthetic Quality":53.2,
312
+ "Imaging Quality":71.3,
313
  "Overall Consistency":21.1,
314
  "I2V Subject":95.0,
315
+ "I2V Background":95.9
316
+ },
317
+ {
318
+ "model":"MAGI-1-4.5B",
319
+ "url":"https:\/\/huggingface.co\/sand-ai\/MAGI-1",
320
+ "Overall":76.9,
321
+ "Domain":77.4,
322
+ "Quality":76.3,
323
+ "Common Sense":87.5,
324
+ "AV":56.3,
325
+ "Robot":71.6,
326
+ "Industry":79.8,
327
+ "Human":76.0,
328
+ "Physics":88.9,
329
+ "Subject Consistency":92.1,
330
+ "Background Consistency":93.3,
331
+ "Motion Smoothness":99.0,
332
+ "Aesthetic Quality":50.4,
333
+ "Imaging Quality":61.8,
334
+ "Overall Consistency":21.6,
335
+ "I2V Subject":94.5,
336
+ "I2V Background":98.1
337
  },
338
  {
339
+ "model":"DynamiCrafter_1024",
340
  "url":"https:\/\/huggingface.co\/Doubiiu\/DynamiCrafter_1024",
341
  "Overall":69.7,
342
+ "Domain":65.6,
343
+ "Quality":73.7,
344
  "Common Sense":75.2,
345
  "AV":43.4,
346
  "Robot":55.0,
 
351
  "Background Consistency":92.5,
352
  "Motion Smoothness":94.9,
353
  "Aesthetic Quality":51.5,
354
+ "Imaging Quality":68.0,
355
  "Overall Consistency":21.2,
356
  "I2V Subject":84.5,
357
+ "I2V Background":86.2
 
 
358
  }
359
  ]
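
One detail worth noting in the renamed file above: the added "Source" row carries `"url":null`, so the link helper in `app.py` has to fall back to the bare model name. A small sketch of that fallback, mirroring the `create_link` closure shown in `load_transfer_json`; the generation loader is assumed to follow the same pattern:

```python
import pandas as pd

df = pd.read_json("data/generation-leaderboard.json", orient="records")

def create_link(row):
    # Link the model name when a URL is present; otherwise keep the plain name,
    # which is what happens for the "Source" row with url == null.
    if pd.notna(row["url"]):
        name = row["model"].split("/")[-1] if "/" in row["model"] else row["model"]
        return f"[{name}]({row['url']})"
    return row["model"]

df["Model"] = df.apply(create_link, axis=1)
print(df[["Model", "Overall"]].head())
```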
data/reason-leaderboard.csv DELETED
@@ -1,15 +0,0 @@
1
- model,Overall,Common Sense,Embodied Reasoning,Space,Time,Physics,BridgeData V2,RoboVQA,RoboFail,Agibot,HoloAssist,AV,params,activate_params
2
- GPT-5,70.0,72.7,67.4,67.5,72.8,74.3,53.0,90.9,68.0,55.0,73.0,62.0,,
3
- Qwen/Qwen3-VL-235B-A22B-Instruct,64.8,65.2,64.4,56.2,69.8,62.4,42.0,93.6,71.0,45.0,76.0,56.0,235.0,22.0
4
- Qwen/Qwen3-VL-30B-A3B-Instruct,60.6,59.9,61.3,52.5,62.1,59.7,36.0,89.1,67.0,43.0,81.0,49.0,30.0,3.0
5
- Qwen/Qwen2.5-VL-72B-Instruct,56.8,57.9,55.7,56.2,62.8,52.2,35.0,90.9,73.0,35.0,58.0,39.0,72.0,72.0
6
- OpenGVLab/InternVL3_5-38B,55.8,55.8,55.7,57.5,60.4,49.1,36.0,81.8,67.0,44.0,71.0,32.0,38.0,38.0
7
- nvidia/Cosmos-Reason1-7B,54.3,50.7,57.9,57.5,53.7,44.2,41.0,91.8,65.0,42.0,57.0,47.0,7.0,7.0
8
- GPT-4o,53.7,56.3,51.1,55.0,55.0,58.4,40.0,56.4,65.0,37.0,65.0,43.0,,
9
- Qwen/Qwen2.5-VL-32B-Instruct,51.9,53.8,50.0,50.0,61.1,45.6,32.0,90.0,52.0,34.0,55.0,33.0,32.0,32.0
10
- OpenGVLab/InternVL3_5-8B,50.5,50.5,50.5,48.8,54.7,45.6,32.0,77.3,66.0,38.0,49.0,38.0,8.0,8.0
11
- Qwen/Qwen2.5-VL-7B-Instruct,50.3,47.7,53.0,47.5,55.4,37.6,33.0,83.6,62.0,44.0,47.0,45.0,7.0,7.0
12
- OpenGVLab/InternVL3_5-14B,49.7,50.3,49.0,52.5,52.0,47.3,26.0,80.0,67.0,28.0,54.0,36.0,14.0,14.0
13
- OpenGVLab/InternVL3_5-30B-A3B,49.5,49.5,49.5,47.5,54.4,43.8,37.0,78.2,60.0,27.0,55.0,37.0,30.0,3.0
14
- Qwen/Qwen2.5-VL-3B-Instruct,48.1,47.4,48.9,47.5,50.7,42.9,31.0,82.7,63.0,36.0,48.0,29.0,3.0,3.0
15
- zai-org/GLM-4.5V,45.5,46.0,44.9,46.2,50.7,39.8,26.0,83.6,69.0,25.0,24.0,38.0,,
data/understanding-leaderboard.json ADDED
@@ -0,0 +1,376 @@
1
+ [
2
+ {
3
+ "model":"GPT-5",
4
+ "url":"https:\/\/openai.com\/gpt-5\/",
5
+ "Thinking":"Yes",
6
+ "Overall":69.8,
7
+ "Common Sense":71.4,
8
+ "Embodied Reasoning":68.2,
9
+ "Space":67.5,
10
+ "Time":73.2,
11
+ "Physics":70.4,
12
+ "BridgeData V2":49.0,
13
+ "RoboVQA":87.3,
14
+ "RoboFail":74.0,
15
+ "Agibot":60.0,
16
+ "HoloAssist":76.0,
17
+ "AV":61.0
18
+ },
19
+ {
20
+ "model":"Cosmos-Reason2-8B",
21
+ "url":"https:\/\/huggingface.co\/nvidia\/Cosmos-Reason2-8B",
22
+ "Thinking":"No",
23
+ "Overall":65.4,
24
+ "Common Sense":62.7,
25
+ "Embodied Reasoning":68.0,
26
+ "Space":68.8,
27
+ "Time":67.1,
28
+ "Physics":54.9,
29
+ "BridgeData V2":51.0,
30
+ "RoboVQA":92.7,
31
+ "RoboFail":71.0,
32
+ "Agibot":52.0,
33
+ "HoloAssist":67.0,
34
+ "AV":72.0
35
+ },
36
+ {
37
+ "model":"Qwen3-VL-235B-A22B-Instruct",
38
+ "url":"https:\/\/huggingface.co\/Qwen\/Qwen3-VL-235B-A22B-Instruct",
39
+ "Thinking":"No",
40
+ "Overall":64.7,
41
+ "Common Sense":64.9,
42
+ "Embodied Reasoning":64.4,
43
+ "Space":56.2,
44
+ "Time":69.5,
45
+ "Physics":61.9,
46
+ "BridgeData V2":42.0,
47
+ "RoboVQA":93.6,
48
+ "RoboFail":71.0,
49
+ "Agibot":45.0,
50
+ "HoloAssist":76.0,
51
+ "AV":56.0
52
+ },
53
+ {
54
+ "model":"Qwen3-VL-235B-A22B-Thinking",
55
+ "url":"https:\/\/huggingface.co\/Qwen\/Qwen3-VL-235B-A22B-Thinking",
56
+ "Thinking":"Yes",
57
+ "Overall":63.7,
58
+ "Common Sense":66.4,
59
+ "Embodied Reasoning":61.0,
60
+ "Space":72.5,
61
+ "Time":69.1,
62
+ "Physics":60.6,
63
+ "BridgeData V2":42.0,
64
+ "RoboVQA":92.7,
65
+ "RoboFail":66.0,
66
+ "Agibot":47.0,
67
+ "HoloAssist":74.0,
68
+ "AV":41.0
69
+ },
70
+ {
71
+ "model":"Qwen3-VL-32B-Instruct",
72
+ "url":"https:\/\/huggingface.co\/Qwen\/Qwen3-VL-32B-Instruct",
73
+ "Thinking":"No",
74
+ "Overall":62.0,
75
+ "Common Sense":62.9,
76
+ "Embodied Reasoning":61.1,
77
+ "Space":53.8,
78
+ "Time":67.8,
79
+ "Physics":59.7,
80
+ "BridgeData V2":42.0,
81
+ "RoboVQA":90.9,
82
+ "RoboFail":71.0,
83
+ "Agibot":50.0,
84
+ "HoloAssist":72.0,
85
+ "AV":38.0
86
+ },
87
+ {
88
+ "model":"Qwen3-VL-32B-Thinking",
89
+ "url":"https:\/\/huggingface.co\/Qwen\/Qwen3-VL-32B-Thinking",
90
+ "Thinking":"Yes",
91
+ "Overall":61.0,
92
+ "Common Sense":63.7,
93
+ "Embodied Reasoning":58.4,
94
+ "Space":66.2,
95
+ "Time":66.4,
96
+ "Physics":59.3,
97
+ "BridgeData V2":46.0,
98
+ "RoboVQA":93.6,
99
+ "RoboFail":61.0,
100
+ "Agibot":48.0,
101
+ "HoloAssist":67.0,
102
+ "AV":31.0
103
+ },
104
+ {
105
+ "model":"Qwen2.5-VL-72B-Instruct",
106
+ "url":"https:\/\/huggingface.co\/Qwen\/Qwen2.5-VL-72B-Instruct",
107
+ "Thinking":"No",
108
+ "Overall":60.8,
109
+ "Common Sense":58.6,
110
+ "Embodied Reasoning":63.0,
111
+ "Space":65.0,
112
+ "Time":57.7,
113
+ "Physics":57.5,
114
+ "BridgeData V2":50.0,
115
+ "RoboVQA":91.8,
116
+ "RoboFail":68.0,
117
+ "Agibot":52.0,
118
+ "HoloAssist":70.0,
119
+ "AV":43.0
120
+ },
121
+ {
122
+ "model":"Qwen3-VL-30B-A3B-Instruct",
123
+ "url":"https:\/\/huggingface.co\/Qwen\/Qwen3-VL-30B-A3B-Instruct",
124
+ "Thinking":"No",
125
+ "Overall":59.5,
126
+ "Common Sense":58.6,
127
+ "Embodied Reasoning":60.3,
128
+ "Space":52.5,
129
+ "Time":60.4,
130
+ "Physics":58.4,
131
+ "BridgeData V2":38.0,
132
+ "RoboVQA":90.9,
133
+ "RoboFail":69.0,
134
+ "Agibot":41.0,
135
+ "HoloAssist":73.0,
136
+ "AV":47.0
137
+ },
138
+ {
139
+ "model":"GLM-4.5V",
140
+ "url":"https:\/\/huggingface.co\/zai-org\/GLM-4.5V",
141
+ "Thinking":"No",
142
+ "Overall":59.2,
143
+ "Common Sense":60.9,
144
+ "Embodied Reasoning":57.5,
145
+ "Space":63.8,
146
+ "Time":65.4,
147
+ "Physics":54.0,
148
+ "BridgeData V2":37.0,
149
+ "RoboVQA":77.3,
150
+ "RoboFail":76.0,
151
+ "Agibot":39.0,
152
+ "HoloAssist":71.0,
153
+ "AV":43.0
154
+ },
155
+ {
156
+ "model":"Qwen3-VL-30B-A3B-Thinking",
157
+ "url":"https:\/\/huggingface.co\/Qwen\/Qwen3-VL-30B-A3B-Thinking",
158
+ "Thinking":"Yes",
159
+ "Overall":57.3,
160
+ "Common Sense":56.1,
161
+ "Embodied Reasoning":58.5,
162
+ "Space":55.0,
163
+ "Time":60.1,
164
+ "Physics":51.3,
165
+ "BridgeData V2":35.0,
166
+ "RoboVQA":91.8,
167
+ "RoboFail":66.0,
168
+ "Agibot":46.0,
169
+ "HoloAssist":76.0,
170
+ "AV":33.0
171
+ },
172
+ {
173
+ "model":"Qwen3-VL-8B-Thinking",
174
+ "url":"https:\/\/huggingface.co\/Qwen\/Qwen3-VL-8B-Thinking",
175
+ "Thinking":"Yes",
176
+ "Overall":57.3,
177
+ "Common Sense":57.0,
178
+ "Embodied Reasoning":57.7,
179
+ "Space":58.8,
180
+ "Time":61.4,
181
+ "Physics":50.4,
182
+ "BridgeData V2":36.0,
183
+ "RoboVQA":87.3,
184
+ "RoboFail":61.0,
185
+ "Agibot":48.0,
186
+ "HoloAssist":71.0,
187
+ "AV":40.0
188
+ },
189
+ {
190
+ "model":"Qwen3-VL-8B-Instruct",
191
+ "url":"https:\/\/huggingface.co\/Qwen\/Qwen3-VL-8B-Instruct",
192
+ "Thinking":"No",
193
+ "Overall":56.8,
194
+ "Common Sense":55.0,
195
+ "Embodied Reasoning":58.7,
196
+ "Space":53.8,
197
+ "Time":58.4,
198
+ "Physics":50.9,
199
+ "BridgeData V2":35.0,
200
+ "RoboVQA":89.1,
201
+ "RoboFail":61.0,
202
+ "Agibot":49.0,
203
+ "HoloAssist":75.0,
204
+ "AV":40.0
205
+ },
206
+ {
207
+ "model":"InternVL3.5-241B-A28B",
208
+ "url":"https:\/\/huggingface.co\/OpenGVLab\/InternVL3_5-241B-A28B",
209
+ "Thinking":"No",
210
+ "Overall":56.3,
211
+ "Common Sense":56.3,
212
+ "Embodied Reasoning":56.4,
213
+ "Space":60.0,
214
+ "Time":57.4,
215
+ "Physics":53.5,
216
+ "BridgeData V2":34.0,
217
+ "RoboVQA":78.2,
218
+ "RoboFail":66.0,
219
+ "Agibot":43.0,
220
+ "HoloAssist":75.0,
221
+ "AV":40.0
222
+ },
223
+ {
224
+ "model":"GPT-4o",
225
+ "url":"https:\/\/openai.com\/index\/hello-gpt-4o\/",
226
+ "Thinking":"No",
227
+ "Overall":56.2,
228
+ "Common Sense":58.6,
229
+ "Embodied Reasoning":53.8,
230
+ "Space":61.2,
231
+ "Time":57.0,
232
+ "Physics":59.7,
233
+ "BridgeData V2":44.0,
234
+ "RoboVQA":68.2,
235
+ "RoboFail":71.0,
236
+ "Agibot":45.0,
237
+ "HoloAssist":55.0,
238
+ "AV":38.0
239
+ },
240
+ {
241
+ "model":"InternVL3.5-38B",
242
+ "url":"https:\/\/huggingface.co\/OpenGVLab\/InternVL3_5-38B",
243
+ "Thinking":"No",
244
+ "Overall":55.8,
245
+ "Common Sense":55.8,
246
+ "Embodied Reasoning":55.7,
247
+ "Space":58.8,
248
+ "Time":59.7,
249
+ "Physics":49.6,
250
+ "BridgeData V2":36.0,
251
+ "RoboVQA":82.7,
252
+ "RoboFail":66.0,
253
+ "Agibot":44.0,
254
+ "HoloAssist":69.0,
255
+ "AV":34.0
256
+ },
257
+ {
258
+ "model":"Cosmos-Reason1-7B",
259
+ "url":"https:\/\/huggingface.co\/nvidia\/Cosmos-Reason1-7B",
260
+ "Thinking":"No",
261
+ "Overall":55.7,
262
+ "Common Sense":53.1,
263
+ "Embodied Reasoning":58.2,
264
+ "Space":63.8,
265
+ "Time":55.7,
266
+ "Physics":46.0,
267
+ "BridgeData V2":41.0,
268
+ "RoboVQA":91.8,
269
+ "RoboFail":66.0,
270
+ "Agibot":41.0,
271
+ "HoloAssist":59.0,
272
+ "AV":47.0
273
+ },
274
+ {
275
+ "model":"Qwen2.5-VL-32B-Instruct",
276
+ "url":"https:\/\/huggingface.co\/Qwen\/Qwen2.5-VL-32B-Instruct",
277
+ "Thinking":"No",
278
+ "Overall":55.3,
279
+ "Common Sense":55.5,
280
+ "Embodied Reasoning":55.1,
281
+ "Space":50.0,
282
+ "Time":62.1,
283
+ "Physics":48.7,
284
+ "BridgeData V2":35.0,
285
+ "RoboVQA":93.6,
286
+ "RoboFail":65.0,
287
+ "Agibot":45.0,
288
+ "HoloAssist":56.0,
289
+ "AV":32.0
290
+ },
291
+ {
292
+ "model":"Qwen2.5-VL-7B-Instruct",
293
+ "url":"https:\/\/huggingface.co\/Qwen\/Qwen2.5-VL-7B-Instruct",
294
+ "Thinking":"No",
295
+ "Overall":51.0,
296
+ "Common Sense":45.7,
297
+ "Embodied Reasoning":56.2,
298
+ "Space":51.2,
299
+ "Time":48.7,
300
+ "Physics":39.8,
301
+ "BridgeData V2":35.0,
302
+ "RoboVQA":87.3,
303
+ "RoboFail":63.0,
304
+ "Agibot":53.0,
305
+ "HoloAssist":60.0,
306
+ "AV":36.0
307
+ },
308
+ {
309
+ "model":"InternVL3.5-8B",
310
+ "url":"https:\/\/huggingface.co\/OpenGVLab\/InternVL3_5-8B",
311
+ "Thinking":"No",
312
+ "Overall":49.7,
313
+ "Common Sense":50.5,
314
+ "Embodied Reasoning":48.9,
315
+ "Space":50.0,
316
+ "Time":55.0,
317
+ "Physics":44.7,
318
+ "BridgeData V2":29.0,
319
+ "RoboVQA":75.5,
320
+ "RoboFail":63.0,
321
+ "Agibot":39.0,
322
+ "HoloAssist":49.0,
323
+ "AV":35.0
324
+ },
325
+ {
326
+ "model":"InternVL3.5-30B-A3B",
327
+ "url":"https:\/\/huggingface.co\/OpenGVLab\/InternVL3_5-30B-A3B",
328
+ "Thinking":"No",
329
+ "Overall":49.4,
330
+ "Common Sense":51.2,
331
+ "Embodied Reasoning":47.7,
332
+ "Space":48.8,
333
+ "Time":55.7,
334
+ "Physics":46.0,
335
+ "BridgeData V2":37.0,
336
+ "RoboVQA":74.5,
337
+ "RoboFail":60.0,
338
+ "Agibot":23.0,
339
+ "HoloAssist":55.0,
340
+ "AV":34.0
341
+ },
342
+ {
343
+ "model":"InternVL3.5-14B",
344
+ "url":"https:\/\/huggingface.co\/OpenGVLab\/InternVL3_5-14B",
345
+ "Thinking":"No",
346
+ "Overall":48.8,
347
+ "Common Sense":49.7,
348
+ "Embodied Reasoning":47.9,
349
+ "Space":50.0,
350
+ "Time":51.3,
351
+ "Physics":47.3,
352
+ "BridgeData V2":23.0,
353
+ "RoboVQA":80.0,
354
+ "RoboFail":67.0,
355
+ "Agibot":27.0,
356
+ "HoloAssist":56.0,
357
+ "AV":31.0
358
+ },
359
+ {
360
+ "model":"Claude-3.5-Sonnet",
361
+ "url":"https:\/\/www.anthropic.com\/news\/claude-3-5-sonnet",
362
+ "Thinking":"No",
363
+ "Overall":46.0,
364
+ "Common Sense":47.8,
365
+ "Embodied Reasoning":44.1,
366
+ "Space":55.0,
367
+ "Time":46.6,
368
+ "Physics":46.9,
369
+ "BridgeData V2":29.0,
370
+ "RoboVQA":74.5,
371
+ "RoboFail":58.0,
372
+ "Agibot":28.0,
373
+ "HoloAssist":38.0,
374
+ "AV":34.0
375
+ }
376
+ ]
inspect_gradio.py DELETED
@@ -1,5 +0,0 @@
1
- import gradio as gr
2
- import inspect
3
-
4
- with open("signature.txt", "w") as f:
5
- f.write(str(inspect.signature(gr.Dataframe.__init__)))