evalstate/birch-html / analysis /data /model-summary.json
evalstate's picture
download
raw
30.7 kB
[
{
"suite": "publish",
"model": "codexresponses.gpt-5.5",
"model_slug": "codexresponses-gpt-5-5",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522",
"artifact_count": 5,
"generation_ok": 5,
"generation_total": 5,
"artifact_present": 5,
"generation_duration_s": 670.53,
"input_tokens": 766387,
"output_tokens": 31560,
"total_tokens": 797947,
"billing_tokens": 797947,
"reasoning_tokens": 3177,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 608768,
"total_cache_tokens": 608768,
"effective_input_tokens": 157619,
"display_input_tokens": 766387,
"usage_event_count": 47,
"tool_calls": 69,
"turn_count": 47,
"self_check_attempted": 5,
"self_check_ran": 5,
"self_check_succeeded": 5,
"self_check_runs": 9,
"self_check_failed_runs": 3,
"self_check_successful_runs": 6,
"self_correction_edits": 0,
"self_corrected_after_checker": 0,
"self_correction_verified": 3,
"assistant_turns_trace": 47,
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"generation_trace_count": 5,
"vlm_trace_count": 5,
"selected_record_path": "results/publish/models/codexresponses-gpt-5-5",
"quality_score": 100.0,
"quality_score_basis": "sum_of_five_20_point_task_scores",
"missing_final_artifacts": 0,
"efficiency_score": 85.35,
"quality_efficiency_score": 96.34,
"rank_quality_efficiency": 1
},
{
"suite": "publish",
"model": "opus47",
"model_slug": "opus47",
"source_kind": "clean-final",
"label": "skill-with-shell-opus47-publication-final",
"artifact_count": 5,
"generation_ok": 5,
"generation_total": 5,
"artifact_present": 5,
"generation_duration_s": 872.87,
"input_tokens": 1980822,
"output_tokens": 60545,
"total_tokens": 2041367,
"billing_tokens": 2041367,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 1547331,
"cache_write_tokens": 205103,
"cache_hit_tokens": 0,
"total_cache_tokens": 1752434,
"effective_input_tokens": 228388,
"display_input_tokens": 1980822,
"usage_event_count": 67,
"tool_calls": 83,
"turn_count": 67,
"self_check_attempted": 5,
"self_check_ran": 5,
"self_check_succeeded": 5,
"self_check_runs": 10,
"self_check_failed_runs": 0,
"self_check_successful_runs": 10,
"self_correction_edits": 0,
"self_corrected_after_checker": 0,
"self_correction_verified": 0,
"assistant_turns_trace": 67,
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"generation_trace_count": 5,
"vlm_trace_count": 5,
"selected_record_path": "results/publish/models/opus47",
"quality_score": 100.0,
"quality_score_basis": "sum_of_five_20_point_task_scores",
"missing_final_artifacts": 0,
"efficiency_score": 72.88,
"quality_efficiency_score": 93.22,
"rank_quality_efficiency": 2
},
{
"suite": "new-model-day",
"model": "codexresponses.gpt-5.4",
"model_slug": "codexresponses-gpt-5-4",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-4-new-model-day",
"artifact_count": 5,
"generation_ok": 5,
"generation_total": 5,
"artifact_present": 5,
"generation_duration_s": 1008.0,
"input_tokens": 798798,
"output_tokens": 45488,
"total_tokens": 844286,
"billing_tokens": 844286,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 468480,
"total_cache_tokens": 468480,
"effective_input_tokens": 330318,
"display_input_tokens": 798798,
"usage_event_count": 40,
"tool_calls": 81,
"turn_count": 40,
"self_check_attempted": 5,
"self_check_ran": 2,
"self_check_succeeded": 2,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": 0,
"self_correction_verified": 0,
"assistant_turns_trace": 40,
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 1,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 1,
"vlm_warning_units": 0,
"generation_trace_count": 5,
"vlm_trace_count": 5,
"selected_record_path": "results/new-model-day/models/codexresponses-gpt-5-4",
"quality_score": 98.4,
"quality_score_basis": "sum_of_five_20_point_task_scores",
"missing_final_artifacts": 0,
"efficiency_score": 76.77,
"quality_efficiency_score": 92.99,
"rank_quality_efficiency": 3
},
{
"suite": "publish",
"model": "gpt-5.3-codex",
"model_slug": "gpt-5-3-codex",
"source_kind": "clean-final",
"label": "skill-with-shell-gpt-5-3-codex-publication-final",
"artifact_count": 5,
"generation_ok": 5,
"generation_total": 5,
"artifact_present": 5,
"generation_duration_s": 372.521,
"input_tokens": 1260130,
"output_tokens": 28682,
"total_tokens": 1288812,
"billing_tokens": 1288812,
"reasoning_tokens": 6974,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 1036288,
"total_cache_tokens": 1036288,
"effective_input_tokens": 223842,
"display_input_tokens": 1260130,
"usage_event_count": 58,
"tool_calls": 70,
"turn_count": 58,
"self_check_attempted": 3,
"self_check_ran": 3,
"self_check_succeeded": 2,
"self_check_runs": 5,
"self_check_failed_runs": 3,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": 0,
"self_correction_verified": 2,
"assistant_turns_trace": 58,
"deterministic_failures": 6,
"deterministic_warnings": 2,
"vlm_failures": 1,
"vlm_warnings": 1,
"deterministic_failure_units": 3,
"deterministic_warning_units": 1,
"vlm_failure_units": 1,
"vlm_warning_units": 1,
"generation_trace_count": 5,
"vlm_trace_count": 5,
"selected_record_path": "results/publish/models/gpt-5-3-codex",
"quality_score": 94.4,
"quality_score_basis": "sum_of_five_20_point_task_scores",
"missing_final_artifacts": 0,
"efficiency_score": 88.54,
"quality_efficiency_score": 92.94,
"rank_quality_efficiency": 4
},
{
"suite": "publish",
"model": "glm51",
"model_slug": "glm51",
"source_kind": "clean-final",
"label": "skill-with-shell-glm51-publication-final",
"artifact_count": 5,
"generation_ok": 5,
"generation_total": 5,
"artifact_present": 5,
"generation_duration_s": 767.449,
"input_tokens": 1557545,
"output_tokens": 52925,
"total_tokens": 1610470,
"billing_tokens": 1610470,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 1221440,
"total_cache_tokens": 1221440,
"effective_input_tokens": 336105,
"display_input_tokens": 1557545,
"usage_event_count": 62,
"tool_calls": 74,
"turn_count": 62,
"self_check_attempted": 5,
"self_check_ran": 5,
"self_check_succeeded": 4,
"self_check_runs": 6,
"self_check_failed_runs": 1,
"self_check_successful_runs": 5,
"self_correction_edits": 0,
"self_corrected_after_checker": 0,
"self_correction_verified": 0,
"assistant_turns_trace": 62,
"deterministic_failures": 2,
"deterministic_warnings": 0,
"vlm_failures": 2,
"vlm_warnings": 4,
"deterministic_failure_units": 1,
"deterministic_warning_units": 0,
"vlm_failure_units": 1,
"vlm_warning_units": 2,
"generation_trace_count": 5,
"vlm_trace_count": 5,
"selected_record_path": "results/publish/models/glm51",
"quality_score": 96.8,
"quality_score_basis": "sum_of_five_20_point_task_scores",
"missing_final_artifacts": 0,
"efficiency_score": 78.47,
"quality_efficiency_score": 92.22,
"rank_quality_efficiency": 5
},
{
"suite": "new-model-day",
"model": "opus48",
"model_slug": "opus48",
"source_kind": "clean-final",
"label": "skill-with-shell-opus48-new-model-day",
"artifact_count": 5,
"generation_ok": 5,
"generation_total": 5,
"artifact_present": 5,
"generation_duration_s": 979.386,
"input_tokens": 2286911,
"output_tokens": 67209,
"total_tokens": 2354120,
"billing_tokens": 2354120,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 1782694,
"cache_write_tokens": 236242,
"cache_hit_tokens": 0,
"total_cache_tokens": 2018936,
"effective_input_tokens": 267975,
"display_input_tokens": 2286911,
"usage_event_count": 71,
"tool_calls": 91,
"turn_count": 71,
"self_check_attempted": 5,
"self_check_ran": 5,
"self_check_succeeded": 5,
"self_check_runs": 10,
"self_check_failed_runs": 0,
"self_check_successful_runs": 10,
"self_correction_edits": 0,
"self_corrected_after_checker": 0,
"self_correction_verified": 0,
"assistant_turns_trace": 71,
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"generation_trace_count": 5,
"vlm_trace_count": 5,
"selected_record_path": "results/new-model-day/models/opus48",
"quality_score": 100.0,
"quality_score_basis": "sum_of_five_20_point_task_scores",
"missing_final_artifacts": 0,
"efficiency_score": 68.02,
"quality_efficiency_score": 92.0,
"rank_quality_efficiency": 6
},
{
"suite": "new-model-day",
"model": "opus46",
"model_slug": "opus46",
"source_kind": "clean-final",
"label": "skill-with-shell-opus46-new-model-day",
"artifact_count": 5,
"generation_ok": 5,
"generation_total": 5,
"artifact_present": 5,
"generation_duration_s": 997.508,
"input_tokens": 1793023,
"output_tokens": 58899,
"total_tokens": 1851922,
"billing_tokens": 1851922,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 1408770,
"cache_write_tokens": 192607,
"cache_hit_tokens": 0,
"total_cache_tokens": 1601377,
"effective_input_tokens": 191646,
"display_input_tokens": 1793023,
"usage_event_count": 73,
"tool_calls": 98,
"turn_count": 73,
"self_check_attempted": 5,
"self_check_ran": 5,
"self_check_succeeded": 5,
"self_check_runs": 9,
"self_check_failed_runs": 2,
"self_check_successful_runs": 7,
"self_correction_edits": 0,
"self_corrected_after_checker": 0,
"self_correction_verified": 2,
"assistant_turns_trace": 73,
"deterministic_failures": 2,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 1,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"generation_trace_count": 5,
"vlm_trace_count": 5,
"selected_record_path": "results/new-model-day/models/opus46",
"quality_score": 98.8,
"quality_score_basis": "sum_of_five_20_point_task_scores",
"missing_final_artifacts": 0,
"efficiency_score": 69.34,
"quality_efficiency_score": 91.44,
"rank_quality_efficiency": 7
},
{
"suite": "new-model-day",
"model": "opus?task_budget=50000",
"model_slug": "opus-task-budget-50000",
"source_kind": "clean-final",
"label": "skill-with-shell-opus-task-budget-50000-new-model-day",
"artifact_count": 5,
"generation_ok": 4,
"generation_total": 5,
"artifact_present": 5,
"generation_duration_s": 325.213,
"input_tokens": 488908,
"output_tokens": 27099,
"total_tokens": 516007,
"billing_tokens": 516007,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 219981,
"cache_write_tokens": 84644,
"cache_hit_tokens": 0,
"total_cache_tokens": 304625,
"effective_input_tokens": 184283,
"display_input_tokens": 488908,
"usage_event_count": 28,
"tool_calls": 29,
"turn_count": 28,
"self_check_attempted": 4,
"self_check_ran": 3,
"self_check_succeeded": 3,
"self_check_runs": 6,
"self_check_failed_runs": 0,
"self_check_successful_runs": 6,
"self_correction_edits": 0,
"self_corrected_after_checker": 0,
"self_correction_verified": 0,
"assistant_turns_trace": 28,
"deterministic_failures": 4,
"deterministic_warnings": 0,
"vlm_failures": 1,
"vlm_warnings": 1,
"deterministic_failure_units": 1,
"deterministic_warning_units": 0,
"vlm_failure_units": 1,
"vlm_warning_units": 1,
"generation_trace_count": 5,
"vlm_trace_count": 5,
"selected_record_path": "results/new-model-day/models/opus-task-budget-50000",
"quality_score": 87.0,
"quality_score_basis": "sum_of_five_20_point_task_scores",
"missing_final_artifacts": 0,
"efficiency_score": 99.19,
"quality_efficiency_score": 90.05,
"rank_quality_efficiency": 8
},
{
"suite": "publish",
"model": "codexresponses.gpt-5.4-mini",
"model_slug": "codexresponses-gpt-5-4-mini",
"source_kind": "clean-final",
"label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final",
"artifact_count": 5,
"generation_ok": 5,
"generation_total": 5,
"artifact_present": 5,
"generation_duration_s": 1155.803,
"input_tokens": 2799895,
"output_tokens": 87812,
"total_tokens": 2887707,
"billing_tokens": 2887707,
"reasoning_tokens": 55592,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 2607104,
"total_cache_tokens": 2607104,
"effective_input_tokens": 192791,
"display_input_tokens": 2799895,
"usage_event_count": 72,
"tool_calls": 113,
"turn_count": 72,
"self_check_attempted": 5,
"self_check_ran": 5,
"self_check_succeeded": 5,
"self_check_runs": 13,
"self_check_failed_runs": 6,
"self_check_successful_runs": 7,
"self_correction_edits": 0,
"self_corrected_after_checker": 0,
"self_correction_verified": 4,
"assistant_turns_trace": 72,
"deterministic_failures": 0,
"deterministic_warnings": 2,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 1,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"generation_trace_count": 5,
"vlm_trace_count": 5,
"selected_record_path": "results/publish/models/codexresponses-gpt-5-4-mini",
"quality_score": 99.8,
"quality_score_basis": "sum_of_five_20_point_task_scores",
"missing_final_artifacts": 0,
"efficiency_score": 58.69,
"quality_efficiency_score": 89.52,
"rank_quality_efficiency": 9
},
{
"suite": "new-model-day",
"model": "opus?task_budget=200000",
"model_slug": "opus-task-budget-200000",
"source_kind": "clean-final",
"label": "skill-with-shell-opus-task-budget-200000-new-model-day",
"artifact_count": 5,
"generation_ok": 5,
"generation_total": 5,
"artifact_present": 5,
"generation_duration_s": 1189.632,
"input_tokens": 3584777,
"output_tokens": 96188,
"total_tokens": 3680965,
"billing_tokens": 3680965,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 3006403,
"cache_write_tokens": 266881,
"cache_hit_tokens": 0,
"total_cache_tokens": 3273284,
"effective_input_tokens": 311493,
"display_input_tokens": 3584777,
"usage_event_count": 88,
"tool_calls": 105,
"turn_count": 88,
"self_check_attempted": 5,
"self_check_ran": 5,
"self_check_succeeded": 5,
"self_check_runs": 12,
"self_check_failed_runs": 0,
"self_check_successful_runs": 12,
"self_correction_edits": 0,
"self_corrected_after_checker": 0,
"self_correction_verified": 0,
"assistant_turns_trace": 88,
"deterministic_failures": 4,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 2,
"deterministic_failure_units": 2,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 1,
"generation_trace_count": 5,
"vlm_trace_count": 5,
"selected_record_path": "results/new-model-day/models/opus-task-budget-200000",
"quality_score": 97.4,
"quality_score_basis": "sum_of_five_20_point_task_scores",
"missing_final_artifacts": 0,
"efficiency_score": 54.95,
"quality_efficiency_score": 86.79,
"rank_quality_efficiency": 10
},
{
"suite": "publish",
"model": "haiku45",
"model_slug": "haiku45",
"source_kind": "clean-final",
"label": "skill-with-shell-haiku45-publication-final",
"artifact_count": 5,
"generation_ok": 3,
"generation_total": 5,
"artifact_present": 5,
"generation_duration_s": 370.199,
"input_tokens": 907211,
"output_tokens": 42193,
"total_tokens": 949404,
"billing_tokens": 949404,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 450168,
"cache_write_tokens": 130377,
"cache_hit_tokens": 0,
"total_cache_tokens": 580545,
"effective_input_tokens": 162656,
"display_input_tokens": 743201,
"usage_event_count": 38,
"tool_calls": 49,
"turn_count": 47,
"self_check_attempted": 3,
"self_check_ran": 2,
"self_check_succeeded": 2,
"self_check_runs": 2,
"self_check_failed_runs": 0,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": 0,
"self_correction_verified": 0,
"assistant_turns_trace": 38,
"deterministic_failures": 26,
"deterministic_warnings": 12,
"vlm_failures": 1,
"vlm_warnings": 5,
"deterministic_failure_units": 7,
"deterministic_warning_units": 3,
"vlm_failure_units": 1,
"vlm_warning_units": 2,
"generation_trace_count": 5,
"vlm_trace_count": 5,
"selected_record_path": "results/publish/models/haiku45",
"quality_score": 83.0,
"quality_score_basis": "sum_of_five_20_point_task_scores",
"missing_final_artifacts": 0,
"efficiency_score": 93.26,
"quality_efficiency_score": 85.56,
"rank_quality_efficiency": 11
},
{
"suite": "publish",
"model": "gemini35flash",
"model_slug": "gemini35flash",
"source_kind": "clean-final",
"label": "skill-with-shell-gemini35flash-publication-final",
"artifact_count": 5,
"generation_ok": 5,
"generation_total": 5,
"artifact_present": 5,
"generation_duration_s": 774.424,
"input_tokens": 8095357,
"output_tokens": 32386,
"total_tokens": 8127743,
"billing_tokens": 8127743,
"reasoning_tokens": 78303,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 6936722,
"total_cache_tokens": 6936722,
"effective_input_tokens": 1158635,
"display_input_tokens": 8095357,
"usage_event_count": 147,
"tool_calls": 142,
"turn_count": 147,
"self_check_attempted": 5,
"self_check_ran": 5,
"self_check_succeeded": 3,
"self_check_runs": 13,
"self_check_failed_runs": 9,
"self_check_successful_runs": 4,
"self_correction_edits": 0,
"self_corrected_after_checker": 0,
"self_correction_verified": 2,
"assistant_turns_trace": 147,
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"generation_trace_count": 5,
"vlm_trace_count": 5,
"selected_record_path": "results/publish/models/gemini35flash",
"quality_score": 100.0,
"quality_score_basis": "sum_of_five_20_point_task_scores",
"missing_final_artifacts": 0,
"efficiency_score": 34.71,
"quality_efficiency_score": 83.68,
"rank_quality_efficiency": 12
},
{
"suite": "publish",
"model": "sonnet46",
"model_slug": "sonnet46",
"source_kind": "clean-final",
"label": "skill-with-shell-sonnet46-publication-final",
"artifact_count": 5,
"generation_ok": 5,
"generation_total": 5,
"artifact_present": 5,
"generation_duration_s": 2303.843,
"input_tokens": 4878483,
"output_tokens": 156614,
"total_tokens": 5035097,
"billing_tokens": 5035097,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 4215364,
"cache_write_tokens": 372099,
"cache_hit_tokens": 0,
"total_cache_tokens": 4587463,
"effective_input_tokens": 291020,
"display_input_tokens": 4878483,
"usage_event_count": 93,
"tool_calls": 108,
"turn_count": 93,
"self_check_attempted": 5,
"self_check_ran": 5,
"self_check_succeeded": 5,
"self_check_runs": 11,
"self_check_failed_runs": 2,
"self_check_successful_runs": 9,
"self_correction_edits": 0,
"self_corrected_after_checker": 0,
"self_correction_verified": 2,
"assistant_turns_trace": 93,
"deterministic_failures": 0,
"deterministic_warnings": 0,
"vlm_failures": 0,
"vlm_warnings": 0,
"deterministic_failure_units": 0,
"deterministic_warning_units": 0,
"vlm_failure_units": 0,
"vlm_warning_units": 0,
"generation_trace_count": 5,
"vlm_trace_count": 5,
"selected_record_path": "results/publish/models/sonnet46",
"quality_score": 100.0,
"quality_score_basis": "sum_of_five_20_point_task_scores",
"missing_final_artifacts": 0,
"efficiency_score": 25.36,
"quality_efficiency_score": 81.34,
"rank_quality_efficiency": 13
},
{
"suite": "publish",
"model": "deepseek",
"model_slug": "deepseek",
"source_kind": "clean-final",
"label": "skill-with-shell-deepseek-publication-final",
"artifact_count": 5,
"generation_ok": 4,
"generation_total": 5,
"artifact_present": 5,
"generation_duration_s": 1242.354,
"input_tokens": 2535136,
"output_tokens": 77564,
"total_tokens": 2612700,
"billing_tokens": 2612700,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 2637696,
"total_cache_tokens": 2637696,
"effective_input_tokens": 180215,
"display_input_tokens": 2817911,
"usage_event_count": 84,
"tool_calls": 97,
"turn_count": 80,
"self_check_attempted": 5,
"self_check_ran": 4,
"self_check_succeeded": 4,
"self_check_runs": 8,
"self_check_failed_runs": 3,
"self_check_successful_runs": 5,
"self_correction_edits": 0,
"self_corrected_after_checker": 0,
"self_correction_verified": 3,
"assistant_turns_trace": 84,
"deterministic_failures": 8,
"deterministic_warnings": 1,
"vlm_failures": 7,
"vlm_warnings": 0,
"deterministic_failure_units": 3,
"deterministic_warning_units": 1,
"vlm_failure_units": 2,
"vlm_warning_units": 0,
"generation_trace_count": 5,
"vlm_trace_count": 5,
"selected_record_path": "results/publish/models/deepseek",
"quality_score": 84.0,
"quality_score_basis": "sum_of_five_20_point_task_scores",
"missing_final_artifacts": 0,
"efficiency_score": 60.63,
"quality_efficiency_score": 78.16,
"rank_quality_efficiency": 14
},
{
"suite": "publish",
"model": "kimi",
"model_slug": "kimi",
"source_kind": "clean-final",
"label": "skill-with-shell-kimi-publication-final",
"artifact_count": 5,
"generation_ok": 4,
"generation_total": 5,
"artifact_present": 5,
"generation_duration_s": 1764.648,
"input_tokens": 2600494,
"output_tokens": 69995,
"total_tokens": 2670489,
"billing_tokens": 2670489,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 2332928,
"total_cache_tokens": 2332928,
"effective_input_tokens": 267566,
"display_input_tokens": 2600494,
"usage_event_count": 87,
"tool_calls": 99,
"turn_count": 87,
"self_check_attempted": 5,
"self_check_ran": 4,
"self_check_succeeded": 4,
"self_check_runs": 7,
"self_check_failed_runs": 2,
"self_check_successful_runs": 5,
"self_correction_edits": 0,
"self_corrected_after_checker": 0,
"self_correction_verified": 2,
"assistant_turns_trace": 87,
"deterministic_failures": 6,
"deterministic_warnings": 0,
"vlm_failures": 7,
"vlm_warnings": 2,
"deterministic_failure_units": 2,
"deterministic_warning_units": 0,
"vlm_failure_units": 3,
"vlm_warning_units": 2,
"generation_trace_count": 5,
"vlm_trace_count": 5,
"selected_record_path": "results/publish/models/kimi",
"quality_score": 83.8,
"quality_score_basis": "sum_of_five_20_point_task_scores",
"missing_final_artifacts": 0,
"efficiency_score": 49.7,
"quality_efficiency_score": 75.27,
"rank_quality_efficiency": 15
},
{
"suite": "publish",
"model": "grok-4.3",
"model_slug": "grok-4-3",
"source_kind": "clean-final",
"label": "skill-with-shell-grok-4-3-publication-final",
"artifact_count": 5,
"generation_ok": 2,
"generation_total": 5,
"artifact_present": 5,
"generation_duration_s": 284.258,
"input_tokens": 575242,
"output_tokens": 24310,
"total_tokens": 599552,
"billing_tokens": 599552,
"reasoning_tokens": 8191,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 336000,
"total_cache_tokens": 336000,
"effective_input_tokens": 134147,
"display_input_tokens": 470147,
"usage_event_count": 52,
"tool_calls": 44,
"turn_count": 49,
"self_check_attempted": 1,
"self_check_ran": 0,
"self_check_succeeded": 0,
"self_check_runs": 0,
"self_check_failed_runs": 0,
"self_check_successful_runs": 0,
"self_correction_edits": 0,
"self_corrected_after_checker": 0,
"self_correction_verified": 0,
"assistant_turns_trace": 52,
"deterministic_failures": 16,
"deterministic_warnings": 0,
"vlm_failures": 11,
"vlm_warnings": 1,
"deterministic_failure_units": 4,
"deterministic_warning_units": 0,
"vlm_failure_units": 4,
"vlm_warning_units": 1,
"generation_trace_count": 5,
"vlm_trace_count": 5,
"selected_record_path": "results/publish/models/grok-4-3",
"quality_score": 58.0,
"quality_score_basis": "sum_of_five_20_point_task_scores",
"missing_final_artifacts": 0,
"efficiency_score": 97.49,
"quality_efficiency_score": 67.87,
"rank_quality_efficiency": 16
},
{
"suite": "publish",
"model": "codexspark",
"model_slug": "codexspark",
"source_kind": "clean-final",
"label": "skill-with-shell-codexspark-publication-final",
"artifact_count": 5,
"generation_ok": 3,
"generation_total": 5,
"artifact_present": 5,
"generation_duration_s": 363.473,
"input_tokens": 7093160,
"output_tokens": 92660,
"total_tokens": 7185820,
"billing_tokens": 7185820,
"reasoning_tokens": 58302,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 6181120,
"total_cache_tokens": 6181120,
"effective_input_tokens": 450555,
"display_input_tokens": 6631675,
"usage_event_count": 167,
"tool_calls": 174,
"turn_count": 159,
"self_check_attempted": 3,
"self_check_ran": 2,
"self_check_succeeded": 2,
"self_check_runs": 5,
"self_check_failed_runs": 0,
"self_check_successful_runs": 5,
"self_correction_edits": 0,
"self_corrected_after_checker": 0,
"self_correction_verified": 0,
"assistant_turns_trace": 167,
"deterministic_failures": 14,
"deterministic_warnings": 6,
"vlm_failures": 5,
"vlm_warnings": 0,
"deterministic_failure_units": 4,
"deterministic_warning_units": 2,
"vlm_failure_units": 2,
"vlm_warning_units": 0,
"generation_trace_count": 5,
"vlm_trace_count": 5,
"selected_record_path": "results/publish/models/codexspark",
"quality_score": 72.2,
"quality_score_basis": "sum_of_five_20_point_task_scores",
"missing_final_artifacts": 0,
"efficiency_score": 43.38,
"quality_efficiency_score": 65.0,
"rank_quality_efficiency": 17
},
{
"suite": "publish",
"model": "minimax27",
"model_slug": "minimax27",
"source_kind": "clean-final",
"label": "skill-with-shell-minimax27-publication-final",
"artifact_count": 5,
"generation_ok": 1,
"generation_total": 5,
"artifact_present": 5,
"generation_duration_s": 1039.913,
"input_tokens": 1255595,
"output_tokens": 70938,
"total_tokens": 1326533,
"billing_tokens": 1326533,
"reasoning_tokens": 0,
"tool_use_tokens": 0,
"cache_read_tokens": 0,
"cache_write_tokens": 0,
"cache_hit_tokens": 841088,
"total_cache_tokens": 841088,
"effective_input_tokens": 485011,
"display_input_tokens": 1326099,
"usage_event_count": 49,
"tool_calls": 55,
"turn_count": 50,
"self_check_attempted": 3,
"self_check_ran": 2,
"self_check_succeeded": 2,
"self_check_runs": 3,
"self_check_failed_runs": 1,
"self_check_successful_runs": 2,
"self_correction_edits": 0,
"self_corrected_after_checker": 0,
"self_correction_verified": 1,
"assistant_turns_trace": 49,
"deterministic_failures": 26,
"deterministic_warnings": 4,
"vlm_failures": 4,
"vlm_warnings": 4,
"deterministic_failure_units": 7,
"deterministic_warning_units": 1,
"vlm_failure_units": 1,
"vlm_warning_units": 1,
"generation_trace_count": 5,
"vlm_trace_count": 5,
"selected_record_path": "results/publish/models/minimax27",
"quality_score": 58.0,
"quality_score_basis": "sum_of_five_20_point_task_scores",
"missing_final_artifacts": 0,
"efficiency_score": 77.19,
"quality_efficiency_score": 62.8,
"rank_quality_efficiency": 18
}
]

Xet Storage Details

Size:
30.7 kB
·
Xet hash:
1939e646e5bc7fea84ea5782fa5c6a4c949b3e83907020dfb8f02e73b9d95bad

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.