Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 8,037 Bytes
3f53c67 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 |
Model,Vendor,Model Type,L1_Total_Tasks,L2_Total_Tasks,L3_Total_Tasks,L4_Total_Tasks,L5_Total_Tasks,L6_Total_Tasks,L7_Total_Tasks,L1_Evaluated_Tasks,L2_Evaluated_Tasks,L3_Evaluated_Tasks,L4_Evaluated_Tasks,L5_Evaluated_Tasks,L6_Evaluated_Tasks,L7_Evaluated_Tasks,L1_Avg_Exec_Time,L2_Avg_Exec_Time,L3_Avg_Exec_Time,L4_Avg_Exec_Time,L5_Avg_Exec_Time,L6_Avg_Exec_Time,L7_Avg_Exec_Time,L1_Avg_Tokens,L2_Avg_Tokens,L3_Avg_Tokens,L4_Avg_Tokens,L5_Avg_Tokens,L6_Avg_Tokens,L7_Avg_Tokens,L1_Avg_TPS,L2_Avg_TPS,L3_Avg_TPS,L4_Avg_TPS,L5_Avg_TPS,L6_Avg_TPS,L7_Avg_TPS,L1_Avg_TTFT,L2_Avg_TTFT,L3_Avg_TTFT,L4_Avg_TTFT,L5_Avg_TTFT,L6_Avg_TTFT,L7_Avg_TTFT,L1_RRR,L2_RRR,L3_RRR,L4_RRR,L5_RRR,L6_RRR,L7_RRR,L1_SR,L2_SR,L3_SR,L4_SR,L5_SR,L6_SR,L7_SR,L1_EPR_CVR,L2_EPR_CVR,L3_EPR_CVR,L4_EPR_CVR,L5_EPR_CVR,L6_EPR_CVR,L7_EPR_CVR,L1_pass@k,L2_pass@k,L3_pass@k,L4_pass@k,L5_pass@k,L6_pass@k,L7_pass@k,L1_TooAcc,L1_ArgAcc,L1_CallEM,L1_RespOK,L2_SelectAcc,L3_FSM,L3_PSM,L3_ΔSteps_norm,L4_Coverage,L4_SourceEPR,L5_AdaptiveRoutingScore,L5_FallbackSR,L6_RedundantCallRate,L6_EffScore,L7_ContextRetention,L7_RefRecall
nova-2-lite,Amazon,API,11,15,10,10,20,15,10,11,15,10,10,20,15,10,8.93,5.72,9.82,18.44,9.28,1.54,2.34,3327.64,5633.33,16431.2,23542.0,8650.3,2454.87,3767.7,372.65,984.32,1672.39,1276.67,932.02,1590.31,1612.51,4.2313,2.2447,2.0483,4.5429,1.8541,1.5429,2.3359,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9,0.9,0.55,0.8,0.9,1.0,1.0,0.9417,1.0,0.2542,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.7955,0.4545,1.0,1.0,0.5,1.0,0.45,0.65,0.65,0.2625,0.65,1.0,0.0,0.95,0.85
gpt-4o,OpenAI,API,11,15,10,10,20,15,10,11,15,10,10,20,15,10,5.42,7.28,12.74,19.25,8.37,4.45,3.68,3302.0,2871.0,11588.0,16022.6,3909.25,1564.2,4044.4,609.02,394.63,909.92,832.37,467.04,351.32,1098.86,1.5767,3.2437,3.3023,5.9534,1.5256,4.452,2.9725,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5,0.8667,0.5,1.0,0.8667,1.0,1.0,0.1833,0.0,0.2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5909,0.2727,1.0,1.0,0.4,1.0,0.35,0.7167,0.7167,0.2583,0.6,1.0,0.0,0.95,0.95
DeepSeek-V3.1,DeepSeek,OSS,11,15,10,10,20,15,10,11,15,10,10,20,15,10,3.53,10.56,18.4,28.31,13.21,7.63,3.25,1469.73,10547.33,23309.7,42090.4,9212.0,4614.6,4392.9,416.14,998.63,1266.84,1486.94,697.45,604.79,1351.85,1.8044,3.7647,4.442,6.5445,2.0181,5.3715,2.6493,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9333,0.9,1.0,0.65,0.8667,0.5,0.8182,0.8667,0.74,0.775,0.2117,0.2,0.2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.8182,0.5455,0.2727,1.0,0.8667,0.3,0.8,0.3,0.6667,0.575,0.2133,0.7,1.0,0.1667,1.0,0.975
gemini-2.5-flash,Google,API,11,15,10,10,20,15,10,11,15,10,10,20,15,10,4.3,4.88,7.75,11.4,6.07,2.63,4.14,1733.73,4466.67,3951.7,12934.7,2054.0,3562.87,4716.3,402.93,914.6,509.6,1135.09,338.22,1353.71,1137.94,1.975,1.9409,3.4497,3.5025,1.7375,1.8039,2.5157,1.0,1.0,1.0,1.0,1.0,1.0,0.7,0.9091,1.0,0.7,0.6,0.25,0.7333,0.3,0.9091,1.0,0.6,0.8,0.1,0.3333,0.3,1.0,1.0,1.0,1.0,1.0,1.0,0.7,0.9091,0.6136,0.1818,1.0,1.0,0.5,0.55,0.35,0.35,0.35,0.1,0.2,1.0,0.3333,0.9,0.875
glm-4.6v,Z.ai,OSS,11,15,10,10,20,15,10,11,15,10,10,20,15,10,10.08,15.04,27.21,34.65,37.16,11.9,8.41,2465.09,8454.93,15996.0,35309.7,16376.0,2716.53,4826.2,244.61,562.21,587.98,1019.06,440.71,228.19,574.12,4.8704,5.7804,7.0628,8.554,5.1927,10.9267,6.6557,1.0,1.0,1.0,1.0,0.95,1.0,1.0,1.0,1.0,1.0,1.0,0.65,0.9333,0.6,1.0,1.0,1.0,1.0,0.3083,0.0667,0.3,1.0,1.0,1.0,1.0,0.95,1.0,1.0,1.0,0.6818,0.3636,1.0,1.0,0.6,0.9667,0.6,0.5667,0.5667,0.2601,0.75,1.0,0.0667,0.9,0.8
grok-4.1-fast,xAI,API,11,15,10,10,20,15,10,11,15,10,10,20,15,10,11.99,17.15,27.66,44.02,39.33,12.44,17.68,3710.64,6381.8,15081.5,13744.4,16053.9,2901.0,5535.2,309.6,372.15,545.25,312.25,408.21,233.21,313.08,5.7846,6.5545,8.836,12.3512,6.6412,11.301,14.0295,1.0,1.0,1.0,0.8,1.0,1.0,1.0,0.9091,1.0,1.0,0.8,0.8,0.9333,0.8,1.0,1.0,1.0,0.8,0.3797,0.0667,0.4,1.0,1.0,1.0,0.8,1.0,1.0,1.0,1.0,0.75,0.5455,1.0,1.0,0.6,1.0,0.5667,0.5667,0.5667,0.3475,0.95,1.0,0.0667,0.975,0.85
claude-haiku-4-5,Anthropic,API,11,15,10,10,20,15,10,11,15,10,10,20,15,10,5.18,9.9,14.65,21.61,18.33,3.69,4.22,4504.64,11367.93,23333.9,42628.5,13977.65,2732.53,7153.3,869.59,1148.23,1593.07,1972.65,762.46,741.38,1697.01,2.4328,3.2797,4.1784,5.2912,2.2585,3.6851,3.3065,1.0,1.0,1.0,1.0,0.95,1.0,1.0,1.0,1.0,1.0,0.9,0.65,0.8,0.7,1.0,1.0,1.0,1.0,0.2358,0.0,0.3,1.0,1.0,1.0,1.0,0.95,1.0,1.0,1.0,0.6136,0.2727,1.0,1.0,0.6,1.0,0.5,0.75,0.7389,0.2283,0.75,1.0,0.0,1.0,0.925
gemini-2.5-flash-lite,Google,API,11,15,10,10,20,15,10,11,15,10,10,20,15,10,1.62,2.83,1.55,5.72,3.74,1.66,2.97,1930.09,3337.87,5892.0,15236.2,1795.9,1572.73,2577.8,1188.63,1179.12,3797.73,2664.96,480.67,944.86,868.65,0.6444,0.9106,0.6729,1.1369,0.5226,0.7943,0.6945,1.0,1.0,1.0,1.0,0.9,1.0,0.4,1.0,0.8667,0.2,0.7,0.25,0.6,0.4,1.0,0.8667,0.275,0.6,0.1167,0.2,0.2,1.0,1.0,1.0,1.0,0.9,1.0,0.4,1.0,0.6364,0.2727,1.0,0.8667,0.1,0.2,0.1,0.35,0.35,0.125,0.25,1.0,0.1333,0.975,0.825
claude-sonnet-4-5,Anthropic,API,11,15,10,10,20,15,10,11,15,10,10,20,15,10,6.77,11.69,19.86,34.08,19.1,5.45,7.18,3215.09,5874.0,19958.4,60071.8,10702.45,2710.47,10297.8,474.96,502.51,1004.85,1762.73,560.27,497.52,1434.99,3.1551,5.243,5.9522,8.9693,3.4574,5.4468,4.6806,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9,0.55,0.8,0.6,1.0,1.0,1.0,1.0,0.1742,0.0,0.4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.6591,0.2727,1.0,1.0,0.9,1.0,0.75,0.75,0.75,0.1892,0.6,1.0,0.0,1.0,0.975
gpt-4o-mini,OpenAI,API,11,15,10,10,20,15,10,11,15,10,10,20,15,10,2.79,5.61,8.13,25.46,7.19,2.63,2.9,1389.55,4236.13,11772.4,11700.1,5203.7,1561.93,3940.3,498.7,755.34,1448.9,459.62,724.0,594.06,1357.18,1.2394,1.9904,2.5526,9.1994,0.9279,2.6286,2.1975,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9091,1.0,1.0,1.0,0.6,0.6667,0.5,1.0,0.8667,1.0,1.0,0.1946,0.0,0.2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.6591,0.2727,1.0,1.0,0.5,0.9167,0.5,0.5833,0.5833,0.2171,0.75,1.0,0.0,0.925,0.975
gpt-5,OpenAI,API,11,15,10,10,20,15,10,11,15,10,10,20,15,10,5.64,11.23,14.48,24.59,19.64,9.31,10.15,2306.18,16867.2,19321.9,29718.7,10773.2,6753.07,9451.3,409.06,1501.34,1334.6,1208.62,548.57,725.02,931.01,2.4414,3.442,5.8573,7.5822,3.1615,5.978,5.431,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9091,0.9333,1.0,0.9,0.85,0.8667,0.8,1.0,1.0,0.7,0.7,0.2728,0.2,0.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.7273,0.3636,1.0,1.0,0.1,0.5667,0.4,0.55,0.5333,0.3,0.85,1.0,0.1444,1.0,0.975
qwen3-next-80b-a3b,Alibaba,OSS,11,15,10,10,20,15,10,11,15,10,10,20,15,10,4.13,12.63,17.18,28.84,10.59,9.59,7.92,1937.82,4725.0,15345.8,22067.0,6512.1,2198.27,5761.5,469.0,374.15,893.49,765.08,615.2,229.2,727.4,1.907,5.8972,5.5666,10.0412,1.985,9.5896,5.561,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.6,0.9333,0.7,1.0,1.0,1.0,1.0,0.2375,0.0,0.2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.7727,0.4545,1.0,1.0,0.8,1.0,0.65,0.7,0.7,0.2542,0.7,1.0,0.0,0.975,0.95
gpt-5-mini,OpenAI,API,11,15,10,10,20,15,10,11,15,10,10,20,15,10,7.14,7.36,12.37,13.11,11.67,7.22,8.02,2963.73,4288.47,9704.4,8528.4,3510.45,2465.07,5810.8,414.91,582.29,784.64,650.71,300.9,341.21,724.39,3.4248,3.2995,5.2383,6.41,2.7195,6.5991,6.5065,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9091,0.9333,0.9,0.8,0.2,0.8667,1.0,1.0,0.8667,0.6,0.6,0.0917,0.0667,0.3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.7045,0.3636,1.0,1.0,0.3,0.55,0.25,0.3667,0.3667,0.0917,0.2,1.0,0.0667,1.0,0.95
nova-lite,Amazon,API,11,15,10,10,20,15,10,11,15,10,10,20,15,10,3.29,7.72,12.08,18.88,11.81,5.05,3.2,2760.64,7563.27,17904.5,43855.6,12621.5,23029.87,6711.7,839.35,979.15,1482.74,2323.41,1068.7,4562.8,2094.59,1.4877,2.958,2.4853,4.0705,1.4959,2.0742,2.2498,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9091,1.0,0.5,0.9,0.3,0.8,0.4,1.0,1.0,1.0,1.0,0.1373,0.4667,0.4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.5909,0.1818,1.0,1.0,0.5,0.85,0.45,0.5667,0.5667,0.1376,0.6,0.3,0.3133,0.725,0.675
gemini-2.5-pro,Google,API,11,15,10,10,20,15,10,11,15,10,10,20,15,10,10.88,11.9,23.24,19.5,23.03,7.52,9.7,2524.45,4880.93,3022.7,15671.5,4011.9,5005.8,9071.0,232.11,410.31,130.06,803.81,174.17,665.86,935.55,5.2265,5.6138,9.9988,8.3578,5.6094,4.8197,5.9149,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.9333,0.1,0.6,0.3,0.7333,0.5,1.0,1.0,0.5,0.7,0.125,0.4,0.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.7045,0.3636,1.0,1.0,0.0,0.2667,0.2,0.4667,0.4667,0.125,0.3,1.0,0.1333,0.875,0.85 |