Buckets:
| [ | |
| { | |
| "suite": "publish", | |
| "model": "codexresponses.gpt-5.5", | |
| "model_slug": "codexresponses-gpt-5-5", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-5-opus-gpt55-deepseek-experiment-20260524-164522", | |
| "artifact_count": 5, | |
| "generation_ok": 5, | |
| "generation_total": 5, | |
| "artifact_present": 5, | |
| "generation_duration_s": 670.53, | |
| "input_tokens": 766387, | |
| "output_tokens": 31560, | |
| "total_tokens": 797947, | |
| "billing_tokens": 797947, | |
| "reasoning_tokens": 3177, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 608768, | |
| "total_cache_tokens": 608768, | |
| "effective_input_tokens": 157619, | |
| "display_input_tokens": 766387, | |
| "usage_event_count": 47, | |
| "tool_calls": 69, | |
| "turn_count": 47, | |
| "self_check_attempted": 5, | |
| "self_check_ran": 5, | |
| "self_check_succeeded": 5, | |
| "self_check_runs": 9, | |
| "self_check_failed_runs": 3, | |
| "self_check_successful_runs": 6, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": 0, | |
| "self_correction_verified": 3, | |
| "assistant_turns_trace": 47, | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "generation_trace_count": 5, | |
| "vlm_trace_count": 5, | |
| "selected_record_path": "results/publish/models/codexresponses-gpt-5-5", | |
| "quality_score": 100.0, | |
| "quality_score_basis": "sum_of_five_20_point_task_scores", | |
| "missing_final_artifacts": 0, | |
| "efficiency_score": 85.35, | |
| "quality_efficiency_score": 96.34, | |
| "rank_quality_efficiency": 1 | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "opus47", | |
| "model_slug": "opus47", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus47-publication-final", | |
| "artifact_count": 5, | |
| "generation_ok": 5, | |
| "generation_total": 5, | |
| "artifact_present": 5, | |
| "generation_duration_s": 872.87, | |
| "input_tokens": 1980822, | |
| "output_tokens": 60545, | |
| "total_tokens": 2041367, | |
| "billing_tokens": 2041367, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 1547331, | |
| "cache_write_tokens": 205103, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 1752434, | |
| "effective_input_tokens": 228388, | |
| "display_input_tokens": 1980822, | |
| "usage_event_count": 67, | |
| "tool_calls": 83, | |
| "turn_count": 67, | |
| "self_check_attempted": 5, | |
| "self_check_ran": 5, | |
| "self_check_succeeded": 5, | |
| "self_check_runs": 10, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 10, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": 0, | |
| "self_correction_verified": 0, | |
| "assistant_turns_trace": 67, | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "generation_trace_count": 5, | |
| "vlm_trace_count": 5, | |
| "selected_record_path": "results/publish/models/opus47", | |
| "quality_score": 100.0, | |
| "quality_score_basis": "sum_of_five_20_point_task_scores", | |
| "missing_final_artifacts": 0, | |
| "efficiency_score": 72.88, | |
| "quality_efficiency_score": 93.22, | |
| "rank_quality_efficiency": 2 | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "codexresponses.gpt-5.4", | |
| "model_slug": "codexresponses-gpt-5-4", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-4-new-model-day", | |
| "artifact_count": 5, | |
| "generation_ok": 5, | |
| "generation_total": 5, | |
| "artifact_present": 5, | |
| "generation_duration_s": 1008.0, | |
| "input_tokens": 798798, | |
| "output_tokens": 45488, | |
| "total_tokens": 844286, | |
| "billing_tokens": 844286, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 468480, | |
| "total_cache_tokens": 468480, | |
| "effective_input_tokens": 330318, | |
| "display_input_tokens": 798798, | |
| "usage_event_count": 40, | |
| "tool_calls": 81, | |
| "turn_count": 40, | |
| "self_check_attempted": 5, | |
| "self_check_ran": 2, | |
| "self_check_succeeded": 2, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": 0, | |
| "self_correction_verified": 0, | |
| "assistant_turns_trace": 40, | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 1, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 0, | |
| "generation_trace_count": 5, | |
| "vlm_trace_count": 5, | |
| "selected_record_path": "results/new-model-day/models/codexresponses-gpt-5-4", | |
| "quality_score": 98.4, | |
| "quality_score_basis": "sum_of_five_20_point_task_scores", | |
| "missing_final_artifacts": 0, | |
| "efficiency_score": 76.77, | |
| "quality_efficiency_score": 92.99, | |
| "rank_quality_efficiency": 3 | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "gpt-5.3-codex", | |
| "model_slug": "gpt-5-3-codex", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-gpt-5-3-codex-publication-final", | |
| "artifact_count": 5, | |
| "generation_ok": 5, | |
| "generation_total": 5, | |
| "artifact_present": 5, | |
| "generation_duration_s": 372.521, | |
| "input_tokens": 1260130, | |
| "output_tokens": 28682, | |
| "total_tokens": 1288812, | |
| "billing_tokens": 1288812, | |
| "reasoning_tokens": 6974, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 1036288, | |
| "total_cache_tokens": 1036288, | |
| "effective_input_tokens": 223842, | |
| "display_input_tokens": 1260130, | |
| "usage_event_count": 58, | |
| "tool_calls": 70, | |
| "turn_count": 58, | |
| "self_check_attempted": 3, | |
| "self_check_ran": 3, | |
| "self_check_succeeded": 2, | |
| "self_check_runs": 5, | |
| "self_check_failed_runs": 3, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": 0, | |
| "self_correction_verified": 2, | |
| "assistant_turns_trace": 58, | |
| "deterministic_failures": 6, | |
| "deterministic_warnings": 2, | |
| "vlm_failures": 1, | |
| "vlm_warnings": 1, | |
| "deterministic_failure_units": 3, | |
| "deterministic_warning_units": 1, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 1, | |
| "generation_trace_count": 5, | |
| "vlm_trace_count": 5, | |
| "selected_record_path": "results/publish/models/gpt-5-3-codex", | |
| "quality_score": 94.4, | |
| "quality_score_basis": "sum_of_five_20_point_task_scores", | |
| "missing_final_artifacts": 0, | |
| "efficiency_score": 88.54, | |
| "quality_efficiency_score": 92.94, | |
| "rank_quality_efficiency": 4 | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "glm51", | |
| "model_slug": "glm51", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-glm51-publication-final", | |
| "artifact_count": 5, | |
| "generation_ok": 5, | |
| "generation_total": 5, | |
| "artifact_present": 5, | |
| "generation_duration_s": 767.449, | |
| "input_tokens": 1557545, | |
| "output_tokens": 52925, | |
| "total_tokens": 1610470, | |
| "billing_tokens": 1610470, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 1221440, | |
| "total_cache_tokens": 1221440, | |
| "effective_input_tokens": 336105, | |
| "display_input_tokens": 1557545, | |
| "usage_event_count": 62, | |
| "tool_calls": 74, | |
| "turn_count": 62, | |
| "self_check_attempted": 5, | |
| "self_check_ran": 5, | |
| "self_check_succeeded": 4, | |
| "self_check_runs": 6, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 5, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": 0, | |
| "self_correction_verified": 0, | |
| "assistant_turns_trace": 62, | |
| "deterministic_failures": 2, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 2, | |
| "vlm_warnings": 4, | |
| "deterministic_failure_units": 1, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 2, | |
| "generation_trace_count": 5, | |
| "vlm_trace_count": 5, | |
| "selected_record_path": "results/publish/models/glm51", | |
| "quality_score": 96.8, | |
| "quality_score_basis": "sum_of_five_20_point_task_scores", | |
| "missing_final_artifacts": 0, | |
| "efficiency_score": 78.47, | |
| "quality_efficiency_score": 92.22, | |
| "rank_quality_efficiency": 5 | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus48", | |
| "model_slug": "opus48", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus48-new-model-day", | |
| "artifact_count": 5, | |
| "generation_ok": 5, | |
| "generation_total": 5, | |
| "artifact_present": 5, | |
| "generation_duration_s": 979.386, | |
| "input_tokens": 2286911, | |
| "output_tokens": 67209, | |
| "total_tokens": 2354120, | |
| "billing_tokens": 2354120, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 1782694, | |
| "cache_write_tokens": 236242, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 2018936, | |
| "effective_input_tokens": 267975, | |
| "display_input_tokens": 2286911, | |
| "usage_event_count": 71, | |
| "tool_calls": 91, | |
| "turn_count": 71, | |
| "self_check_attempted": 5, | |
| "self_check_ran": 5, | |
| "self_check_succeeded": 5, | |
| "self_check_runs": 10, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 10, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": 0, | |
| "self_correction_verified": 0, | |
| "assistant_turns_trace": 71, | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "generation_trace_count": 5, | |
| "vlm_trace_count": 5, | |
| "selected_record_path": "results/new-model-day/models/opus48", | |
| "quality_score": 100.0, | |
| "quality_score_basis": "sum_of_five_20_point_task_scores", | |
| "missing_final_artifacts": 0, | |
| "efficiency_score": 68.02, | |
| "quality_efficiency_score": 92.0, | |
| "rank_quality_efficiency": 6 | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus46", | |
| "model_slug": "opus46", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus46-new-model-day", | |
| "artifact_count": 5, | |
| "generation_ok": 5, | |
| "generation_total": 5, | |
| "artifact_present": 5, | |
| "generation_duration_s": 997.508, | |
| "input_tokens": 1793023, | |
| "output_tokens": 58899, | |
| "total_tokens": 1851922, | |
| "billing_tokens": 1851922, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 1408770, | |
| "cache_write_tokens": 192607, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 1601377, | |
| "effective_input_tokens": 191646, | |
| "display_input_tokens": 1793023, | |
| "usage_event_count": 73, | |
| "tool_calls": 98, | |
| "turn_count": 73, | |
| "self_check_attempted": 5, | |
| "self_check_ran": 5, | |
| "self_check_succeeded": 5, | |
| "self_check_runs": 9, | |
| "self_check_failed_runs": 2, | |
| "self_check_successful_runs": 7, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": 0, | |
| "self_correction_verified": 2, | |
| "assistant_turns_trace": 73, | |
| "deterministic_failures": 2, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 1, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "generation_trace_count": 5, | |
| "vlm_trace_count": 5, | |
| "selected_record_path": "results/new-model-day/models/opus46", | |
| "quality_score": 98.8, | |
| "quality_score_basis": "sum_of_five_20_point_task_scores", | |
| "missing_final_artifacts": 0, | |
| "efficiency_score": 69.34, | |
| "quality_efficiency_score": 91.44, | |
| "rank_quality_efficiency": 7 | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus?task_budget=50000", | |
| "model_slug": "opus-task-budget-50000", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus-task-budget-50000-new-model-day", | |
| "artifact_count": 5, | |
| "generation_ok": 4, | |
| "generation_total": 5, | |
| "artifact_present": 5, | |
| "generation_duration_s": 325.213, | |
| "input_tokens": 488908, | |
| "output_tokens": 27099, | |
| "total_tokens": 516007, | |
| "billing_tokens": 516007, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 219981, | |
| "cache_write_tokens": 84644, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 304625, | |
| "effective_input_tokens": 184283, | |
| "display_input_tokens": 488908, | |
| "usage_event_count": 28, | |
| "tool_calls": 29, | |
| "turn_count": 28, | |
| "self_check_attempted": 4, | |
| "self_check_ran": 3, | |
| "self_check_succeeded": 3, | |
| "self_check_runs": 6, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 6, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": 0, | |
| "self_correction_verified": 0, | |
| "assistant_turns_trace": 28, | |
| "deterministic_failures": 4, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 1, | |
| "vlm_warnings": 1, | |
| "deterministic_failure_units": 1, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 1, | |
| "generation_trace_count": 5, | |
| "vlm_trace_count": 5, | |
| "selected_record_path": "results/new-model-day/models/opus-task-budget-50000", | |
| "quality_score": 87.0, | |
| "quality_score_basis": "sum_of_five_20_point_task_scores", | |
| "missing_final_artifacts": 0, | |
| "efficiency_score": 99.19, | |
| "quality_efficiency_score": 90.05, | |
| "rank_quality_efficiency": 8 | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexresponses.gpt-5.4-mini", | |
| "model_slug": "codexresponses-gpt-5-4-mini", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexresponses-gpt-5-4-mini-publication-final", | |
| "artifact_count": 5, | |
| "generation_ok": 5, | |
| "generation_total": 5, | |
| "artifact_present": 5, | |
| "generation_duration_s": 1155.803, | |
| "input_tokens": 2799895, | |
| "output_tokens": 87812, | |
| "total_tokens": 2887707, | |
| "billing_tokens": 2887707, | |
| "reasoning_tokens": 55592, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 2607104, | |
| "total_cache_tokens": 2607104, | |
| "effective_input_tokens": 192791, | |
| "display_input_tokens": 2799895, | |
| "usage_event_count": 72, | |
| "tool_calls": 113, | |
| "turn_count": 72, | |
| "self_check_attempted": 5, | |
| "self_check_ran": 5, | |
| "self_check_succeeded": 5, | |
| "self_check_runs": 13, | |
| "self_check_failed_runs": 6, | |
| "self_check_successful_runs": 7, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": 0, | |
| "self_correction_verified": 4, | |
| "assistant_turns_trace": 72, | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 2, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 1, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "generation_trace_count": 5, | |
| "vlm_trace_count": 5, | |
| "selected_record_path": "results/publish/models/codexresponses-gpt-5-4-mini", | |
| "quality_score": 99.8, | |
| "quality_score_basis": "sum_of_five_20_point_task_scores", | |
| "missing_final_artifacts": 0, | |
| "efficiency_score": 58.69, | |
| "quality_efficiency_score": 89.52, | |
| "rank_quality_efficiency": 9 | |
| }, | |
| { | |
| "suite": "new-model-day", | |
| "model": "opus?task_budget=200000", | |
| "model_slug": "opus-task-budget-200000", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-opus-task-budget-200000-new-model-day", | |
| "artifact_count": 5, | |
| "generation_ok": 5, | |
| "generation_total": 5, | |
| "artifact_present": 5, | |
| "generation_duration_s": 1189.632, | |
| "input_tokens": 3584777, | |
| "output_tokens": 96188, | |
| "total_tokens": 3680965, | |
| "billing_tokens": 3680965, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 3006403, | |
| "cache_write_tokens": 266881, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 3273284, | |
| "effective_input_tokens": 311493, | |
| "display_input_tokens": 3584777, | |
| "usage_event_count": 88, | |
| "tool_calls": 105, | |
| "turn_count": 88, | |
| "self_check_attempted": 5, | |
| "self_check_ran": 5, | |
| "self_check_succeeded": 5, | |
| "self_check_runs": 12, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 12, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": 0, | |
| "self_correction_verified": 0, | |
| "assistant_turns_trace": 88, | |
| "deterministic_failures": 4, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 2, | |
| "deterministic_failure_units": 2, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 1, | |
| "generation_trace_count": 5, | |
| "vlm_trace_count": 5, | |
| "selected_record_path": "results/new-model-day/models/opus-task-budget-200000", | |
| "quality_score": 97.4, | |
| "quality_score_basis": "sum_of_five_20_point_task_scores", | |
| "missing_final_artifacts": 0, | |
| "efficiency_score": 54.95, | |
| "quality_efficiency_score": 86.79, | |
| "rank_quality_efficiency": 10 | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "haiku45", | |
| "model_slug": "haiku45", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-haiku45-publication-final", | |
| "artifact_count": 5, | |
| "generation_ok": 3, | |
| "generation_total": 5, | |
| "artifact_present": 5, | |
| "generation_duration_s": 370.199, | |
| "input_tokens": 907211, | |
| "output_tokens": 42193, | |
| "total_tokens": 949404, | |
| "billing_tokens": 949404, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 450168, | |
| "cache_write_tokens": 130377, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 580545, | |
| "effective_input_tokens": 162656, | |
| "display_input_tokens": 743201, | |
| "usage_event_count": 38, | |
| "tool_calls": 49, | |
| "turn_count": 47, | |
| "self_check_attempted": 3, | |
| "self_check_ran": 2, | |
| "self_check_succeeded": 2, | |
| "self_check_runs": 2, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": 0, | |
| "self_correction_verified": 0, | |
| "assistant_turns_trace": 38, | |
| "deterministic_failures": 26, | |
| "deterministic_warnings": 12, | |
| "vlm_failures": 1, | |
| "vlm_warnings": 5, | |
| "deterministic_failure_units": 7, | |
| "deterministic_warning_units": 3, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 2, | |
| "generation_trace_count": 5, | |
| "vlm_trace_count": 5, | |
| "selected_record_path": "results/publish/models/haiku45", | |
| "quality_score": 83.0, | |
| "quality_score_basis": "sum_of_five_20_point_task_scores", | |
| "missing_final_artifacts": 0, | |
| "efficiency_score": 93.26, | |
| "quality_efficiency_score": 85.56, | |
| "rank_quality_efficiency": 11 | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "gemini35flash", | |
| "model_slug": "gemini35flash", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-gemini35flash-publication-final", | |
| "artifact_count": 5, | |
| "generation_ok": 5, | |
| "generation_total": 5, | |
| "artifact_present": 5, | |
| "generation_duration_s": 774.424, | |
| "input_tokens": 8095357, | |
| "output_tokens": 32386, | |
| "total_tokens": 8127743, | |
| "billing_tokens": 8127743, | |
| "reasoning_tokens": 78303, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 6936722, | |
| "total_cache_tokens": 6936722, | |
| "effective_input_tokens": 1158635, | |
| "display_input_tokens": 8095357, | |
| "usage_event_count": 147, | |
| "tool_calls": 142, | |
| "turn_count": 147, | |
| "self_check_attempted": 5, | |
| "self_check_ran": 5, | |
| "self_check_succeeded": 3, | |
| "self_check_runs": 13, | |
| "self_check_failed_runs": 9, | |
| "self_check_successful_runs": 4, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": 0, | |
| "self_correction_verified": 2, | |
| "assistant_turns_trace": 147, | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "generation_trace_count": 5, | |
| "vlm_trace_count": 5, | |
| "selected_record_path": "results/publish/models/gemini35flash", | |
| "quality_score": 100.0, | |
| "quality_score_basis": "sum_of_five_20_point_task_scores", | |
| "missing_final_artifacts": 0, | |
| "efficiency_score": 34.71, | |
| "quality_efficiency_score": 83.68, | |
| "rank_quality_efficiency": 12 | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "sonnet46", | |
| "model_slug": "sonnet46", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-sonnet46-publication-final", | |
| "artifact_count": 5, | |
| "generation_ok": 5, | |
| "generation_total": 5, | |
| "artifact_present": 5, | |
| "generation_duration_s": 2303.843, | |
| "input_tokens": 4878483, | |
| "output_tokens": 156614, | |
| "total_tokens": 5035097, | |
| "billing_tokens": 5035097, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 4215364, | |
| "cache_write_tokens": 372099, | |
| "cache_hit_tokens": 0, | |
| "total_cache_tokens": 4587463, | |
| "effective_input_tokens": 291020, | |
| "display_input_tokens": 4878483, | |
| "usage_event_count": 93, | |
| "tool_calls": 108, | |
| "turn_count": 93, | |
| "self_check_attempted": 5, | |
| "self_check_ran": 5, | |
| "self_check_succeeded": 5, | |
| "self_check_runs": 11, | |
| "self_check_failed_runs": 2, | |
| "self_check_successful_runs": 9, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": 0, | |
| "self_correction_verified": 2, | |
| "assistant_turns_trace": 93, | |
| "deterministic_failures": 0, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 0, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 0, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 0, | |
| "vlm_warning_units": 0, | |
| "generation_trace_count": 5, | |
| "vlm_trace_count": 5, | |
| "selected_record_path": "results/publish/models/sonnet46", | |
| "quality_score": 100.0, | |
| "quality_score_basis": "sum_of_five_20_point_task_scores", | |
| "missing_final_artifacts": 0, | |
| "efficiency_score": 25.36, | |
| "quality_efficiency_score": 81.34, | |
| "rank_quality_efficiency": 13 | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "deepseek", | |
| "model_slug": "deepseek", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-deepseek-publication-final", | |
| "artifact_count": 5, | |
| "generation_ok": 4, | |
| "generation_total": 5, | |
| "artifact_present": 5, | |
| "generation_duration_s": 1242.354, | |
| "input_tokens": 2535136, | |
| "output_tokens": 77564, | |
| "total_tokens": 2612700, | |
| "billing_tokens": 2612700, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 2637696, | |
| "total_cache_tokens": 2637696, | |
| "effective_input_tokens": 180215, | |
| "display_input_tokens": 2817911, | |
| "usage_event_count": 84, | |
| "tool_calls": 97, | |
| "turn_count": 80, | |
| "self_check_attempted": 5, | |
| "self_check_ran": 4, | |
| "self_check_succeeded": 4, | |
| "self_check_runs": 8, | |
| "self_check_failed_runs": 3, | |
| "self_check_successful_runs": 5, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": 0, | |
| "self_correction_verified": 3, | |
| "assistant_turns_trace": 84, | |
| "deterministic_failures": 8, | |
| "deterministic_warnings": 1, | |
| "vlm_failures": 7, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 3, | |
| "deterministic_warning_units": 1, | |
| "vlm_failure_units": 2, | |
| "vlm_warning_units": 0, | |
| "generation_trace_count": 5, | |
| "vlm_trace_count": 5, | |
| "selected_record_path": "results/publish/models/deepseek", | |
| "quality_score": 84.0, | |
| "quality_score_basis": "sum_of_five_20_point_task_scores", | |
| "missing_final_artifacts": 0, | |
| "efficiency_score": 60.63, | |
| "quality_efficiency_score": 78.16, | |
| "rank_quality_efficiency": 14 | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "kimi", | |
| "model_slug": "kimi", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-kimi-publication-final", | |
| "artifact_count": 5, | |
| "generation_ok": 4, | |
| "generation_total": 5, | |
| "artifact_present": 5, | |
| "generation_duration_s": 1764.648, | |
| "input_tokens": 2600494, | |
| "output_tokens": 69995, | |
| "total_tokens": 2670489, | |
| "billing_tokens": 2670489, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 2332928, | |
| "total_cache_tokens": 2332928, | |
| "effective_input_tokens": 267566, | |
| "display_input_tokens": 2600494, | |
| "usage_event_count": 87, | |
| "tool_calls": 99, | |
| "turn_count": 87, | |
| "self_check_attempted": 5, | |
| "self_check_ran": 4, | |
| "self_check_succeeded": 4, | |
| "self_check_runs": 7, | |
| "self_check_failed_runs": 2, | |
| "self_check_successful_runs": 5, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": 0, | |
| "self_correction_verified": 2, | |
| "assistant_turns_trace": 87, | |
| "deterministic_failures": 6, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 7, | |
| "vlm_warnings": 2, | |
| "deterministic_failure_units": 2, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 3, | |
| "vlm_warning_units": 2, | |
| "generation_trace_count": 5, | |
| "vlm_trace_count": 5, | |
| "selected_record_path": "results/publish/models/kimi", | |
| "quality_score": 83.8, | |
| "quality_score_basis": "sum_of_five_20_point_task_scores", | |
| "missing_final_artifacts": 0, | |
| "efficiency_score": 49.7, | |
| "quality_efficiency_score": 75.27, | |
| "rank_quality_efficiency": 15 | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "grok-4.3", | |
| "model_slug": "grok-4-3", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-grok-4-3-publication-final", | |
| "artifact_count": 5, | |
| "generation_ok": 2, | |
| "generation_total": 5, | |
| "artifact_present": 5, | |
| "generation_duration_s": 284.258, | |
| "input_tokens": 575242, | |
| "output_tokens": 24310, | |
| "total_tokens": 599552, | |
| "billing_tokens": 599552, | |
| "reasoning_tokens": 8191, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 336000, | |
| "total_cache_tokens": 336000, | |
| "effective_input_tokens": 134147, | |
| "display_input_tokens": 470147, | |
| "usage_event_count": 52, | |
| "tool_calls": 44, | |
| "turn_count": 49, | |
| "self_check_attempted": 1, | |
| "self_check_ran": 0, | |
| "self_check_succeeded": 0, | |
| "self_check_runs": 0, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 0, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": 0, | |
| "self_correction_verified": 0, | |
| "assistant_turns_trace": 52, | |
| "deterministic_failures": 16, | |
| "deterministic_warnings": 0, | |
| "vlm_failures": 11, | |
| "vlm_warnings": 1, | |
| "deterministic_failure_units": 4, | |
| "deterministic_warning_units": 0, | |
| "vlm_failure_units": 4, | |
| "vlm_warning_units": 1, | |
| "generation_trace_count": 5, | |
| "vlm_trace_count": 5, | |
| "selected_record_path": "results/publish/models/grok-4-3", | |
| "quality_score": 58.0, | |
| "quality_score_basis": "sum_of_five_20_point_task_scores", | |
| "missing_final_artifacts": 0, | |
| "efficiency_score": 97.49, | |
| "quality_efficiency_score": 67.87, | |
| "rank_quality_efficiency": 16 | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "codexspark", | |
| "model_slug": "codexspark", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-codexspark-publication-final", | |
| "artifact_count": 5, | |
| "generation_ok": 3, | |
| "generation_total": 5, | |
| "artifact_present": 5, | |
| "generation_duration_s": 363.473, | |
| "input_tokens": 7093160, | |
| "output_tokens": 92660, | |
| "total_tokens": 7185820, | |
| "billing_tokens": 7185820, | |
| "reasoning_tokens": 58302, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 6181120, | |
| "total_cache_tokens": 6181120, | |
| "effective_input_tokens": 450555, | |
| "display_input_tokens": 6631675, | |
| "usage_event_count": 167, | |
| "tool_calls": 174, | |
| "turn_count": 159, | |
| "self_check_attempted": 3, | |
| "self_check_ran": 2, | |
| "self_check_succeeded": 2, | |
| "self_check_runs": 5, | |
| "self_check_failed_runs": 0, | |
| "self_check_successful_runs": 5, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": 0, | |
| "self_correction_verified": 0, | |
| "assistant_turns_trace": 167, | |
| "deterministic_failures": 14, | |
| "deterministic_warnings": 6, | |
| "vlm_failures": 5, | |
| "vlm_warnings": 0, | |
| "deterministic_failure_units": 4, | |
| "deterministic_warning_units": 2, | |
| "vlm_failure_units": 2, | |
| "vlm_warning_units": 0, | |
| "generation_trace_count": 5, | |
| "vlm_trace_count": 5, | |
| "selected_record_path": "results/publish/models/codexspark", | |
| "quality_score": 72.2, | |
| "quality_score_basis": "sum_of_five_20_point_task_scores", | |
| "missing_final_artifacts": 0, | |
| "efficiency_score": 43.38, | |
| "quality_efficiency_score": 65.0, | |
| "rank_quality_efficiency": 17 | |
| }, | |
| { | |
| "suite": "publish", | |
| "model": "minimax27", | |
| "model_slug": "minimax27", | |
| "source_kind": "clean-final", | |
| "label": "skill-with-shell-minimax27-publication-final", | |
| "artifact_count": 5, | |
| "generation_ok": 1, | |
| "generation_total": 5, | |
| "artifact_present": 5, | |
| "generation_duration_s": 1039.913, | |
| "input_tokens": 1255595, | |
| "output_tokens": 70938, | |
| "total_tokens": 1326533, | |
| "billing_tokens": 1326533, | |
| "reasoning_tokens": 0, | |
| "tool_use_tokens": 0, | |
| "cache_read_tokens": 0, | |
| "cache_write_tokens": 0, | |
| "cache_hit_tokens": 841088, | |
| "total_cache_tokens": 841088, | |
| "effective_input_tokens": 485011, | |
| "display_input_tokens": 1326099, | |
| "usage_event_count": 49, | |
| "tool_calls": 55, | |
| "turn_count": 50, | |
| "self_check_attempted": 3, | |
| "self_check_ran": 2, | |
| "self_check_succeeded": 2, | |
| "self_check_runs": 3, | |
| "self_check_failed_runs": 1, | |
| "self_check_successful_runs": 2, | |
| "self_correction_edits": 0, | |
| "self_corrected_after_checker": 0, | |
| "self_correction_verified": 1, | |
| "assistant_turns_trace": 49, | |
| "deterministic_failures": 26, | |
| "deterministic_warnings": 4, | |
| "vlm_failures": 4, | |
| "vlm_warnings": 4, | |
| "deterministic_failure_units": 7, | |
| "deterministic_warning_units": 1, | |
| "vlm_failure_units": 1, | |
| "vlm_warning_units": 1, | |
| "generation_trace_count": 5, | |
| "vlm_trace_count": 5, | |
| "selected_record_path": "results/publish/models/minimax27", | |
| "quality_score": 58.0, | |
| "quality_score_basis": "sum_of_five_20_point_task_scores", | |
| "missing_final_artifacts": 0, | |
| "efficiency_score": 77.19, | |
| "quality_efficiency_score": 62.8, | |
| "rank_quality_efficiency": 18 | |
| } | |
| ] | |
Xet Storage Details
- Size:
- 30.7 kB
- Xet hash:
- 1939e646e5bc7fea84ea5782fa5c6a4c949b3e83907020dfb8f02e73b9d95bad
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.