Buckets:
| suite,model,model_slug,source_kind,label,artifact_count,generation_ok,generation_total,artifact_present,generation_duration_s,input_tokens,output_tokens,total_tokens,billing_tokens,reasoning_tokens,tool_use_tokens,cache_read_tokens,cache_write_tokens,cache_hit_tokens,total_cache_tokens,effective_input_tokens,display_input_tokens,usage_event_count,tool_calls,turn_count,self_check_attempted,self_check_ran,self_check_succeeded,self_check_runs,self_check_failed_runs,self_check_successful_runs,self_correction_edits,self_corrected_after_checker,self_correction_verified,assistant_turns_trace,deterministic_failures,deterministic_warnings,vlm_failures,vlm_warnings,deterministic_failure_units,deterministic_warning_units,vlm_failure_units,vlm_warning_units,generation_trace_count,vlm_trace_count,selected_record_path,quality_score,quality_score_basis,missing_final_artifacts,efficiency_score,quality_efficiency_score,rank_quality_efficiency | |
| publish,codexresponses.gpt-5.5,codexresponses-gpt-5-5,clean-final,skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522,5,5,5,5,670.53,766387,31560,797947,797947,3177,0,0,0,608768,608768,157619,766387,47,69,47,5,5,5,9,3,6,0,0,3,47,0,0,0,0,0,0,0,0,5,5,results/publish/models/codexresponses-gpt-5-5,100.0,sum_of_five_20_point_task_scores,0,85.35,96.34,1 | |
| publish,opus47,opus47,clean-final,skill-with-shell-opus47-publication-final,5,5,5,5,872.87,1980822,60545,2041367,2041367,0,0,1547331,205103,0,1752434,228388,1980822,67,83,67,5,5,5,10,0,10,0,0,0,67,0,0,0,0,0,0,0,0,5,5,results/publish/models/opus47,100.0,sum_of_five_20_point_task_scores,0,72.88,93.22,2 | |
| new-model-day,codexresponses.gpt-5.4,codexresponses-gpt-5-4,clean-final,skill-with-shell-codexresponses-gpt-5-4-new-model-day,5,5,5,5,1008.0,798798,45488,844286,844286,0,0,0,0,468480,468480,330318,798798,40,81,40,5,2,2,2,0,2,0,0,0,40,0,0,1,0,0,0,1,0,5,5,results/new-model-day/models/codexresponses-gpt-5-4,98.4,sum_of_five_20_point_task_scores,0,76.77,92.99,3 | |
| publish,gpt-5.3-codex,gpt-5-3-codex,clean-final,skill-with-shell-gpt-5-3-codex-publication-final,5,5,5,5,372.521,1260130,28682,1288812,1288812,6974,0,0,0,1036288,1036288,223842,1260130,58,70,58,3,3,2,5,3,2,0,0,2,58,6,2,1,1,3,1,1,1,5,5,results/publish/models/gpt-5-3-codex,94.4,sum_of_five_20_point_task_scores,0,88.54,92.94,4 | |
| publish,glm51,glm51,clean-final,skill-with-shell-glm51-publication-final,5,5,5,5,767.449,1557545,52925,1610470,1610470,0,0,0,0,1221440,1221440,336105,1557545,62,74,62,5,5,4,6,1,5,0,0,0,62,2,0,2,4,1,0,1,2,5,5,results/publish/models/glm51,96.8,sum_of_five_20_point_task_scores,0,78.47,92.22,5 | |
| new-model-day,opus48,opus48,clean-final,skill-with-shell-opus48-new-model-day,5,5,5,5,979.386,2286911,67209,2354120,2354120,0,0,1782694,236242,0,2018936,267975,2286911,71,91,71,5,5,5,10,0,10,0,0,0,71,0,0,0,0,0,0,0,0,5,5,results/new-model-day/models/opus48,100.0,sum_of_five_20_point_task_scores,0,68.02,92.0,6 | |
| new-model-day,opus46,opus46,clean-final,skill-with-shell-opus46-new-model-day,5,5,5,5,997.508,1793023,58899,1851922,1851922,0,0,1408770,192607,0,1601377,191646,1793023,73,98,73,5,5,5,9,2,7,0,0,2,73,2,0,0,0,1,0,0,0,5,5,results/new-model-day/models/opus46,98.8,sum_of_five_20_point_task_scores,0,69.34,91.44,7 | |
| new-model-day,opus?task_budget=50000,opus-task-budget-50000,clean-final,skill-with-shell-opus-task-budget-50000-new-model-day,5,4,5,5,325.213,488908,27099,516007,516007,0,0,219981,84644,0,304625,184283,488908,28,29,28,4,3,3,6,0,6,0,0,0,28,4,0,1,1,1,0,1,1,5,5,results/new-model-day/models/opus-task-budget-50000,87.0,sum_of_five_20_point_task_scores,0,99.19,90.05,8 | |
| publish,codexresponses.gpt-5.4-mini,codexresponses-gpt-5-4-mini,clean-final,skill-with-shell-codexresponses-gpt-5-4-mini-publication-final,5,5,5,5,1155.803,2799895,87812,2887707,2887707,55592,0,0,0,2607104,2607104,192791,2799895,72,113,72,5,5,5,13,6,7,0,0,4,72,0,2,0,0,0,1,0,0,5,5,results/publish/models/codexresponses-gpt-5-4-mini,99.8,sum_of_five_20_point_task_scores,0,58.69,89.52,9 | |
| new-model-day,opus?task_budget=200000,opus-task-budget-200000,clean-final,skill-with-shell-opus-task-budget-200000-new-model-day,5,5,5,5,1189.632,3584777,96188,3680965,3680965,0,0,3006403,266881,0,3273284,311493,3584777,88,105,88,5,5,5,12,0,12,0,0,0,88,4,0,0,2,2,0,0,1,5,5,results/new-model-day/models/opus-task-budget-200000,97.4,sum_of_five_20_point_task_scores,0,54.95,86.79,10 | |
| publish,haiku45,haiku45,clean-final,skill-with-shell-haiku45-publication-final,5,3,5,5,370.199,907211,42193,949404,949404,0,0,450168,130377,0,580545,162656,743201,38,49,47,3,2,2,2,0,2,0,0,0,38,26,12,1,5,7,3,1,2,5,5,results/publish/models/haiku45,83.0,sum_of_five_20_point_task_scores,0,93.26,85.56,11 | |
| publish,gemini35flash,gemini35flash,clean-final,skill-with-shell-gemini35flash-publication-final,5,5,5,5,774.424,8095357,32386,8127743,8127743,78303,0,0,0,6936722,6936722,1158635,8095357,147,142,147,5,5,3,13,9,4,0,0,2,147,0,0,0,0,0,0,0,0,5,5,results/publish/models/gemini35flash,100.0,sum_of_five_20_point_task_scores,0,34.71,83.68,12 | |
| publish,sonnet46,sonnet46,clean-final,skill-with-shell-sonnet46-publication-final,5,5,5,5,2303.843,4878483,156614,5035097,5035097,0,0,4215364,372099,0,4587463,291020,4878483,93,108,93,5,5,5,11,2,9,0,0,2,93,0,0,0,0,0,0,0,0,5,5,results/publish/models/sonnet46,100.0,sum_of_five_20_point_task_scores,0,25.36,81.34,13 | |
| publish,deepseek,deepseek,clean-final,skill-with-shell-deepseek-publication-final,5,4,5,5,1242.354,2535136,77564,2612700,2612700,0,0,0,0,2637696,2637696,180215,2817911,84,97,80,5,4,4,8,3,5,0,0,3,84,8,1,7,0,3,1,2,0,5,5,results/publish/models/deepseek,84.0,sum_of_five_20_point_task_scores,0,60.63,78.16,14 | |
| publish,kimi,kimi,clean-final,skill-with-shell-kimi-publication-final,5,4,5,5,1764.648,2600494,69995,2670489,2670489,0,0,0,0,2332928,2332928,267566,2600494,87,99,87,5,4,4,7,2,5,0,0,2,87,6,0,7,2,2,0,3,2,5,5,results/publish/models/kimi,83.8,sum_of_five_20_point_task_scores,0,49.7,75.27,15 | |
| publish,grok-4.3,grok-4-3,clean-final,skill-with-shell-grok-4-3-publication-final,5,2,5,5,284.258,575242,24310,599552,599552,8191,0,0,0,336000,336000,134147,470147,52,44,49,1,0,0,0,0,0,0,0,0,52,16,0,11,1,4,0,4,1,5,5,results/publish/models/grok-4-3,58.0,sum_of_five_20_point_task_scores,0,97.49,67.87,16 | |
| publish,codexspark,codexspark,clean-final,skill-with-shell-codexspark-publication-final,5,3,5,5,363.473,7093160,92660,7185820,7185820,58302,0,0,0,6181120,6181120,450555,6631675,167,174,159,3,2,2,5,0,5,0,0,0,167,14,6,5,0,4,2,2,0,5,5,results/publish/models/codexspark,72.2,sum_of_five_20_point_task_scores,0,43.38,65.0,17 | |
| publish,minimax27,minimax27,clean-final,skill-with-shell-minimax27-publication-final,5,1,5,5,1039.913,1255595,70938,1326533,1326533,0,0,0,0,841088,841088,485011,1326099,49,55,50,3,2,2,3,1,2,0,0,1,49,26,4,4,4,7,1,1,1,5,5,results/publish/models/minimax27,58.0,sum_of_five_20_point_task_scores,0,77.19,62.8,18 | |
Xet Storage Details
- Size:
- 6.85 kB
- Xet hash:
- 12c0490c51f17cf65e4d2be902d314b0af95582c80abfdd973a67b9dca555338
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.