{
  "note": "Workspace-Bench-Lite leaderboard rows are generated from detailed_rubrics_pass_table_all_runs.csv, a detailed rubrics pass table for the latest public Lite experiments.",
  "litePublicResults": [
    {
      "rank": 1,
      "setting_name": "OpenClaw--Opus-4.7--Test-Rubrics-Checked",
      "agent": "OpenClaw",
      "harness": "OpenClaw",
      "model": "Opus-4.7",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 78.4,
      "medium_rubrics_accuracy": 66.9,
      "hard_rubrics_accuracy": 61.8,
      "total_rubrics_accuracy": 66.7,
      "pass_at": {
        "10": 95,
        "20": 92,
        "30": 90,
        "40": 81,
        "50": 71,
        "60": 63,
        "70": 51,
        "80": 43,
        "90": 36,
        "100": 24
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 66.7,
      "overall_score": 66.7,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 2,
      "setting_name": "ClaudeCode--Opus-4.7--Test-Rubrics-Checked",
      "agent": "ClaudeCode",
      "harness": "ClaudeCode",
      "model": "Opus-4.7",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 77.1,
      "medium_rubrics_accuracy": 66.4,
      "hard_rubrics_accuracy": 56.9,
      "total_rubrics_accuracy": 64.7,
      "pass_at": {
        "10": 96,
        "20": 93,
        "30": 87,
        "40": 82,
        "50": 76,
        "60": 63,
        "70": 49,
        "80": 39,
        "90": 31,
        "100": 17
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 64.7,
      "overall_score": 64.7,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 3,
      "setting_name": "Hermes--Opus-4.7--Test-Rubrics-Checked",
      "agent": "Hermes",
      "harness": "Hermes",
      "model": "Opus-4.7",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 63.7,
      "medium_rubrics_accuracy": 65.6,
      "hard_rubrics_accuracy": 63.2,
      "total_rubrics_accuracy": 64.5,
      "pass_at": {
        "10": 97,
        "20": 94,
        "30": 89,
        "40": 82,
        "50": 72,
        "60": 65,
        "70": 48,
        "80": 35,
        "90": 25,
        "100": 18
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 64.5,
      "overall_score": 64.5,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 4,
      "setting_name": "DeepAgent--GLM-5.1--Test-Rubrics-Checked",
      "agent": "DeepAgent",
      "harness": "DeepAgent",
      "model": "GLM-5.1",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 59.2,
      "medium_rubrics_accuracy": 61.9,
      "hard_rubrics_accuracy": 60.5,
      "total_rubrics_accuracy": 61,
      "pass_at": {
        "10": 94,
        "20": 90,
        "30": 84,
        "40": 78,
        "50": 63,
        "60": 51,
        "70": 45,
        "80": 39,
        "90": 29,
        "100": 16
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 61,
      "overall_score": 61,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 5,
      "setting_name": "Hermes--GLM-5.1--Test-Rubrics-Checked",
      "agent": "Hermes",
      "harness": "Hermes",
      "model": "GLM-5.1",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 73.9,
      "medium_rubrics_accuracy": 57.3,
      "hard_rubrics_accuracy": 52.1,
      "total_rubrics_accuracy": 57.7,
      "pass_at": {
        "10": 95,
        "20": 90,
        "30": 81,
        "40": 71,
        "50": 59,
        "60": 52,
        "70": 40,
        "80": 35,
        "90": 20,
        "100": 13
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 57.7,
      "overall_score": 57.7,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 6,
      "setting_name": "OpenClaw--GLM-5.1--Test-Rubrics-Checked",
      "agent": "OpenClaw",
      "harness": "OpenClaw",
      "model": "GLM-5.1",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 65.7,
      "medium_rubrics_accuracy": 56.3,
      "hard_rubrics_accuracy": 56.3,
      "total_rubrics_accuracy": 57.5,
      "pass_at": {
        "10": 87,
        "20": 80,
        "30": 77,
        "40": 69,
        "50": 61,
        "60": 57,
        "70": 46,
        "80": 41,
        "90": 23,
        "100": 14
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 57.5,
      "overall_score": 57.5,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 7,
      "setting_name": "OpenClaw--Qwen-3.6-Plus--Test-Rubrics-Checked",
      "agent": "OpenClaw",
      "harness": "OpenClaw",
      "model": "Qwen-3.6-Plus",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 62,
      "medium_rubrics_accuracy": 59.6,
      "hard_rubrics_accuracy": 46.3,
      "total_rubrics_accuracy": 55.6,
      "pass_at": {
        "10": 92,
        "20": 82,
        "30": 73,
        "40": 68,
        "50": 64,
        "60": 50,
        "70": 42,
        "80": 33,
        "90": 23,
        "100": 15
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 55.6,
      "overall_score": 55.6,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 8,
      "setting_name": "ClaudeCode--MiniMax-M2.7--Test-Rubrics-Checked",
      "agent": "ClaudeCode",
      "harness": "ClaudeCode",
      "model": "MiniMax-M2.7",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 60.4,
      "medium_rubrics_accuracy": 56.6,
      "hard_rubrics_accuracy": 49,
      "total_rubrics_accuracy": 54.6,
      "pass_at": {
        "10": 92,
        "20": 84,
        "30": 77,
        "40": 66,
        "50": 57,
        "60": 46,
        "70": 40,
        "80": 29,
        "90": 25,
        "100": 18
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 54.6,
      "overall_score": 54.6,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 9,
      "setting_name": "DeepAgent--Opus-4.7--Test-Rubrics-Checked",
      "agent": "DeepAgent",
      "harness": "DeepAgent",
      "model": "Opus-4.7",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 58.8,
      "medium_rubrics_accuracy": 58.8,
      "hard_rubrics_accuracy": 45.6,
      "total_rubrics_accuracy": 54.4,
      "pass_at": {
        "10": 77,
        "20": 75,
        "30": 70,
        "40": 67,
        "50": 63,
        "60": 58,
        "70": 43,
        "80": 34,
        "90": 26,
        "100": 18
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 54.4,
      "overall_score": 54.4,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 10,
      "setting_name": "Codex--GLM-5.1--Test-Rubrics-Checked",
      "agent": "Codex",
      "harness": "Codex",
      "model": "GLM-5.1",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 52.7,
      "medium_rubrics_accuracy": 59.3,
      "hard_rubrics_accuracy": 42.7,
      "total_rubrics_accuracy": 53.1,
      "pass_at": {
        "10": 88,
        "20": 83,
        "30": 74,
        "40": 66,
        "50": 59,
        "60": 48,
        "70": 39,
        "80": 30,
        "90": 21,
        "100": 13
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 53.1,
      "overall_score": 53.1,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 11,
      "setting_name": "ClaudeCode--Seed-2.0-Lite--Test-Rubrics-Checked",
      "agent": "ClaudeCode",
      "harness": "ClaudeCode",
      "model": "Seed-2.0-Lite",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 67.8,
      "medium_rubrics_accuracy": 52.7,
      "hard_rubrics_accuracy": 47.5,
      "total_rubrics_accuracy": 53,
      "pass_at": {
        "10": 90,
        "20": 81,
        "30": 72,
        "40": 61,
        "50": 52,
        "60": 47,
        "70": 42,
        "80": 32,
        "90": 21,
        "100": 12
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 53,
      "overall_score": 53,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 12,
      "setting_name": "ClaudeCode--GLM-5.1--Test-Rubrics-Checked",
      "agent": "ClaudeCode",
      "harness": "ClaudeCode",
      "model": "GLM-5.1",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 65.7,
      "medium_rubrics_accuracy": 56.1,
      "hard_rubrics_accuracy": 41.8,
      "total_rubrics_accuracy": 52.6,
      "pass_at": {
        "10": 87,
        "20": 79,
        "30": 76,
        "40": 66,
        "50": 56,
        "60": 49,
        "70": 38,
        "80": 30,
        "90": 18,
        "100": 11
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 52.6,
      "overall_score": 52.6,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 13,
      "setting_name": "Hermes--MiniMax-M2.7--Test-Rubrics-Checked",
      "agent": "Hermes",
      "harness": "Hermes",
      "model": "MiniMax-M2.7",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 58.4,
      "medium_rubrics_accuracy": 53.9,
      "hard_rubrics_accuracy": 48.2,
      "total_rubrics_accuracy": 52.6,
      "pass_at": {
        "10": 91,
        "20": 82,
        "30": 74,
        "40": 64,
        "50": 49,
        "60": 44,
        "70": 37,
        "80": 29,
        "90": 24,
        "100": 15
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 52.6,
      "overall_score": 52.6,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 14,
      "setting_name": "ClaudeCode--GPT-5.4--Test-Rubrics-Checked",
      "agent": "ClaudeCode",
      "harness": "ClaudeCode",
      "model": "GPT-5.4",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 49,
      "medium_rubrics_accuracy": 52.1,
      "hard_rubrics_accuracy": 52.4,
      "total_rubrics_accuracy": 51.8,
      "pass_at": {
        "10": 89,
        "20": 86,
        "30": 72,
        "40": 65,
        "50": 51,
        "60": 42,
        "70": 36,
        "80": 27,
        "90": 14,
        "100": 10
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 51.8,
      "overall_score": 51.8,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 15,
      "setting_name": "Hermes--Qwen-3.6-Plus--Test-Rubrics-Checked",
      "agent": "Hermes",
      "harness": "Hermes",
      "model": "Qwen-3.6-Plus",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 51.8,
      "medium_rubrics_accuracy": 50.4,
      "hard_rubrics_accuracy": 51.3,
      "total_rubrics_accuracy": 50.9,
      "pass_at": {
        "10": 86,
        "20": 80,
        "30": 73,
        "40": 65,
        "50": 52,
        "60": 42,
        "70": 36,
        "80": 26,
        "90": 16,
        "100": 13
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 50.9,
      "overall_score": 50.9,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 16,
      "setting_name": "Hermes--Kimi-2.5--Test-Rubrics-Checked",
      "agent": "Hermes",
      "harness": "Hermes",
      "model": "Kimi-2.5",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 61.6,
      "medium_rubrics_accuracy": 49.9,
      "hard_rubrics_accuracy": 42.7,
      "total_rubrics_accuracy": 49.1,
      "pass_at": {
        "10": 86,
        "20": 80,
        "30": 73,
        "40": 61,
        "50": 52,
        "60": 41,
        "70": 31,
        "80": 25,
        "90": 20,
        "100": 16
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 49.1,
      "overall_score": 49.1,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 17,
      "setting_name": "ClaudeCode--Kimi-2.5--Test-Rubrics-Checked",
      "agent": "ClaudeCode",
      "harness": "ClaudeCode",
      "model": "Kimi-2.5",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 55.9,
      "medium_rubrics_accuracy": 53.8,
      "hard_rubrics_accuracy": 36.6,
      "total_rubrics_accuracy": 48.3,
      "pass_at": {
        "10": 87,
        "20": 75,
        "30": 68,
        "40": 57,
        "50": 51,
        "60": 42,
        "70": 36,
        "80": 27,
        "90": 24,
        "100": 13
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 48.3,
      "overall_score": 48.3,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 18,
      "setting_name": "Codex--GPT-5.4--Test-Rubrics-Checked",
      "agent": "Codex",
      "harness": "Codex",
      "model": "GPT-5.4",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 42.9,
      "medium_rubrics_accuracy": 51.5,
      "hard_rubrics_accuracy": 43.4,
      "total_rubrics_accuracy": 47.7,
      "pass_at": {
        "10": 84,
        "20": 77,
        "30": 66,
        "40": 61,
        "50": 46,
        "60": 35,
        "70": 28,
        "80": 23,
        "90": 15,
        "100": 10
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 47.7,
      "overall_score": 47.7,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 19,
      "setting_name": "Codex--Qwen-3.6-Plus--Test-Rubrics-Checked",
      "agent": "Codex",
      "harness": "Codex",
      "model": "Qwen-3.6-Plus",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 63.3,
      "medium_rubrics_accuracy": 47.8,
      "hard_rubrics_accuracy": 38.9,
      "total_rubrics_accuracy": 47.3,
      "pass_at": {
        "10": 77,
        "20": 73,
        "30": 66,
        "40": 57,
        "50": 48,
        "60": 46,
        "70": 41,
        "80": 32,
        "90": 21,
        "100": 13
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 47.3,
      "overall_score": 47.3,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 20,
      "setting_name": "OpenClaw--GPT-5.4--Test-Rubrics-Checked",
      "agent": "OpenClaw",
      "harness": "OpenClaw",
      "model": "GPT-5.4",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 50.6,
      "medium_rubrics_accuracy": 52.2,
      "hard_rubrics_accuracy": 37.6,
      "total_rubrics_accuracy": 47.1,
      "pass_at": {
        "10": 90,
        "20": 77,
        "30": 67,
        "40": 58,
        "50": 48,
        "60": 42,
        "70": 33,
        "80": 25,
        "90": 16,
        "100": 9
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 47.1,
      "overall_score": 47.1,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 21,
      "setting_name": "Codex--Kimi-2.5--Test-Rubrics-Checked",
      "agent": "Codex",
      "harness": "Codex",
      "model": "Kimi-2.5",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 58,
      "medium_rubrics_accuracy": 53.5,
      "hard_rubrics_accuracy": 30.9,
      "total_rubrics_accuracy": 46.9,
      "pass_at": {
        "10": 87,
        "20": 78,
        "30": 63,
        "40": 56,
        "50": 48,
        "60": 37,
        "70": 30,
        "80": 20,
        "90": 13,
        "100": 10
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 46.9,
      "overall_score": 46.9,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 22,
      "setting_name": "OpenClaw--Seed-2.0-Lite--Test-Rubrics-Checked",
      "agent": "OpenClaw",
      "harness": "OpenClaw",
      "model": "Seed-2.0-Lite",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 55.5,
      "medium_rubrics_accuracy": 52.1,
      "hard_rubrics_accuracy": 34.4,
      "total_rubrics_accuracy": 46.6,
      "pass_at": {
        "10": 84,
        "20": 71,
        "30": 63,
        "40": 56,
        "50": 48,
        "60": 43,
        "70": 33,
        "80": 25,
        "90": 16,
        "100": 13
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 46.6,
      "overall_score": 46.6,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 23,
      "setting_name": "Hermes--Seed-2.0-Lite--Test-Rubrics-Checked",
      "agent": "Hermes",
      "harness": "Hermes",
      "model": "Seed-2.0-Lite",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 55.5,
      "medium_rubrics_accuracy": 50.6,
      "hard_rubrics_accuracy": 34.5,
      "total_rubrics_accuracy": 45.9,
      "pass_at": {
        "10": 86,
        "20": 75,
        "30": 63,
        "40": 54,
        "50": 45,
        "60": 42,
        "70": 34,
        "80": 22,
        "90": 17,
        "100": 9
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 45.9,
      "overall_score": 45.9,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 24,
      "setting_name": "DeepAgent--MiniMax-M2.7--Test-Rubrics-Checked",
      "agent": "DeepAgent",
      "harness": "DeepAgent",
      "model": "MiniMax-M2.7",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 53.5,
      "medium_rubrics_accuracy": 51.3,
      "hard_rubrics_accuracy": 31.8,
      "total_rubrics_accuracy": 45,
      "pass_at": {
        "10": 87,
        "20": 67,
        "30": 59,
        "40": 51,
        "50": 43,
        "60": 34,
        "70": 27,
        "80": 20,
        "90": 14,
        "100": 10
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 45,
      "overall_score": 45,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 25,
      "setting_name": "OpenClaw--Kimi-2.5--Test-Rubrics-Checked",
      "agent": "OpenClaw",
      "harness": "OpenClaw",
      "model": "Kimi-2.5",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 56.3,
      "medium_rubrics_accuracy": 47.1,
      "hard_rubrics_accuracy": 35.6,
      "total_rubrics_accuracy": 44.5,
      "pass_at": {
        "10": 84,
        "20": 71,
        "30": 65,
        "40": 53,
        "50": 43,
        "60": 38,
        "70": 32,
        "80": 25,
        "90": 18,
        "100": 13
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 44.5,
      "overall_score": 44.5,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 26,
      "setting_name": "Hermes--GPT-5.4--Test-Rubrics-Checked",
      "agent": "Hermes",
      "harness": "Hermes",
      "model": "GPT-5.4",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 60.4,
      "medium_rubrics_accuracy": 46,
      "hard_rubrics_accuracy": 35.2,
      "total_rubrics_accuracy": 44.3,
      "pass_at": {
        "10": 86,
        "20": 79,
        "30": 63,
        "40": 54,
        "50": 47,
        "60": 36,
        "70": 28,
        "80": 22,
        "90": 15,
        "100": 8
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 44.3,
      "overall_score": 44.3,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 27,
      "setting_name": "OpenClaw--MiniMax-M2.7--Test-Rubrics-Checked",
      "agent": "OpenClaw",
      "harness": "OpenClaw",
      "model": "MiniMax-M2.7",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 57.1,
      "medium_rubrics_accuracy": 45.5,
      "hard_rubrics_accuracy": 36.8,
      "total_rubrics_accuracy": 44.1,
      "pass_at": {
        "10": 77,
        "20": 73,
        "30": 61,
        "40": 54,
        "50": 46,
        "60": 37,
        "70": 29,
        "80": 24,
        "90": 13,
        "100": 9
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 44.1,
      "overall_score": 44.1,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 28,
      "setting_name": "Codex--MiniMax-M2.7--Test-Rubrics-Checked",
      "agent": "Codex",
      "harness": "Codex",
      "model": "MiniMax-M2.7",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 53.1,
      "medium_rubrics_accuracy": 49.1,
      "hard_rubrics_accuracy": 27.5,
      "total_rubrics_accuracy": 42.7,
      "pass_at": {
        "10": 78,
        "20": 67,
        "30": 56,
        "40": 48,
        "50": 41,
        "60": 37,
        "70": 31,
        "80": 27,
        "90": 18,
        "100": 16
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 42.7,
      "overall_score": 42.7,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 29,
      "setting_name": "ClaudeCode--Seed-2.0-Code--Test-Rubrics-Checked",
      "agent": "ClaudeCode",
      "harness": "ClaudeCode",
      "model": "Seed-2.0-Code",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 59.2,
      "medium_rubrics_accuracy": 45.5,
      "hard_rubrics_accuracy": 30.5,
      "total_rubrics_accuracy": 42.3,
      "pass_at": {
        "10": 86,
        "20": 69,
        "30": 59,
        "40": 52,
        "50": 43,
        "60": 32,
        "70": 27,
        "80": 22,
        "90": 13,
        "100": 7
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 42.3,
      "overall_score": 42.3,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 30,
      "setting_name": "DeepAgent--Kimi-2.5--Test-Rubrics-Checked",
      "agent": "DeepAgent",
      "harness": "DeepAgent",
      "model": "Kimi-2.5",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 36.3,
      "medium_rubrics_accuracy": 46.7,
      "hard_rubrics_accuracy": 35.6,
      "total_rubrics_accuracy": 41.6,
      "pass_at": {
        "10": 79,
        "20": 68,
        "30": 58,
        "40": 50,
        "50": 42,
        "60": 35,
        "70": 29,
        "80": 23,
        "90": 15,
        "100": 13
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 41.6,
      "overall_score": 41.6,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 31,
      "setting_name": "OpenClaw--Seed-2.0-Code--Test-Rubrics-Checked",
      "agent": "OpenClaw",
      "harness": "OpenClaw",
      "model": "Seed-2.0-Code",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 66.9,
      "medium_rubrics_accuracy": 37.3,
      "hard_rubrics_accuracy": 33.9,
      "total_rubrics_accuracy": 40.1,
      "pass_at": {
        "10": 84,
        "20": 64,
        "30": 51,
        "40": 46,
        "50": 43,
        "60": 33,
        "70": 24,
        "80": 19,
        "90": 15,
        "100": 9
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 40.1,
      "overall_score": 40.1,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 32,
      "setting_name": "DeepAgent--Qwen-3.6-Plus--Test-Rubrics-Checked",
      "agent": "DeepAgent",
      "harness": "DeepAgent",
      "model": "Qwen-3.6-Plus",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 41.6,
      "medium_rubrics_accuracy": 41.2,
      "hard_rubrics_accuracy": 35.5,
      "total_rubrics_accuracy": 39.4,
      "pass_at": {
        "10": 67,
        "20": 61,
        "30": 56,
        "40": 49,
        "50": 42,
        "60": 33,
        "70": 25,
        "80": 23,
        "90": 16,
        "100": 9
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 39.4,
      "overall_score": 39.4,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 33,
      "setting_name": "Hermes--Seed-2.0-Code--Test-Rubrics-Checked",
      "agent": "Hermes",
      "harness": "Hermes",
      "model": "Seed-2.0-Code",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 61.2,
      "medium_rubrics_accuracy": 41.5,
      "hard_rubrics_accuracy": 25,
      "total_rubrics_accuracy": 38.6,
      "pass_at": {
        "10": 78,
        "20": 65,
        "30": 55,
        "40": 45,
        "50": 36,
        "60": 29,
        "70": 24,
        "80": 17,
        "90": 15,
        "100": 7
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 38.6,
      "overall_score": 38.6,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 34,
      "setting_name": "ClaudeCode--Gemini-3.1-Pro--Test-Rubrics-Checked",
      "agent": "ClaudeCode",
      "harness": "ClaudeCode",
      "model": "Gemini-3.1-Pro",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 43.3,
      "medium_rubrics_accuracy": 36.1,
      "hard_rubrics_accuracy": 37.4,
      "total_rubrics_accuracy": 37.5,
      "pass_at": {
        "10": 78,
        "20": 68,
        "30": 52,
        "40": 47,
        "50": 40,
        "60": 23,
        "70": 16,
        "80": 13,
        "90": 9,
        "100": 6
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 37.5,
      "overall_score": 37.5,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 35,
      "setting_name": "DeepAgent--Gemini-3.1-Pro--Test-Rubrics-Checked",
      "agent": "DeepAgent",
      "harness": "DeepAgent",
      "model": "Gemini-3.1-Pro",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 55.9,
      "medium_rubrics_accuracy": 39,
      "hard_rubrics_accuracy": 27.1,
      "total_rubrics_accuracy": 37.2,
      "pass_at": {
        "10": 75,
        "20": 60,
        "30": 52,
        "40": 46,
        "50": 35,
        "60": 26,
        "70": 23,
        "80": 18,
        "90": 13,
        "100": 9
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 37.2,
      "overall_score": 37.2,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 36,
      "setting_name": "Codex--Grok-4.3--Test-Rubrics-Checked",
      "agent": "Codex",
      "harness": "Codex",
      "model": "Grok-4.3",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 48.6,
      "medium_rubrics_accuracy": 39.8,
      "hard_rubrics_accuracy": 27.3,
      "total_rubrics_accuracy": 36.9,
      "pass_at": {
        "10": 78,
        "20": 63,
        "30": 52,
        "40": 43,
        "50": 36,
        "60": 27,
        "70": 20,
        "80": 17,
        "90": 14,
        "100": 10
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 36.9,
      "overall_score": 36.9,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 37,
      "setting_name": "DeepAgent--GPT-5.4--Test-Rubrics-Checked",
      "agent": "DeepAgent",
      "harness": "DeepAgent",
      "model": "GPT-5.4",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 44.1,
      "medium_rubrics_accuracy": 38.2,
      "hard_rubrics_accuracy": 29.9,
      "total_rubrics_accuracy": 36.2,
      "pass_at": {
        "10": 75,
        "20": 61,
        "30": 55,
        "40": 47,
        "50": 36,
        "60": 27,
        "70": 19,
        "80": 15,
        "90": 10,
        "100": 7
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 36.2,
      "overall_score": 36.2,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 38,
      "setting_name": "Hermes--Grok-4.3--Test-Rubrics-Checked",
      "agent": "Hermes",
      "harness": "Hermes",
      "model": "Grok-4.3",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 37.1,
      "medium_rubrics_accuracy": 39.8,
      "hard_rubrics_accuracy": 30,
      "total_rubrics_accuracy": 36.2,
      "pass_at": {
        "10": 75,
        "20": 63,
        "30": 51,
        "40": 39,
        "50": 33,
        "60": 30,
        "70": 22,
        "80": 18,
        "90": 12,
        "100": 8
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 36.2,
      "overall_score": 36.2,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 39,
      "setting_name": "DeepAgent--Seed-2.0-Lite--Test-Rubrics-Checked",
      "agent": "DeepAgent",
      "harness": "DeepAgent",
      "model": "Seed-2.0-Lite",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 48.2,
      "medium_rubrics_accuracy": 39.8,
      "hard_rubrics_accuracy": 25.1,
      "total_rubrics_accuracy": 36,
      "pass_at": {
        "10": 74,
        "20": 59,
        "30": 53,
        "40": 44,
        "50": 36,
        "60": 29,
        "70": 22,
        "80": 18,
        "90": 12,
        "100": 8
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 36,
      "overall_score": 36,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 40,
      "setting_name": "DeepAgent--Seed-2.0-Code--Test-Rubrics-Checked",
      "agent": "DeepAgent",
      "harness": "DeepAgent",
      "model": "Seed-2.0-Code",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 46.1,
      "medium_rubrics_accuracy": 35,
      "hard_rubrics_accuracy": 29.5,
      "total_rubrics_accuracy": 34.6,
      "pass_at": {
        "10": 71,
        "20": 55,
        "30": 46,
        "40": 39,
        "50": 35,
        "60": 27,
        "70": 21,
        "80": 13,
        "90": 9,
        "100": 8
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 34.6,
      "overall_score": 34.6,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 41,
      "setting_name": "OpenClaw--Grok-4.3--Test-Rubrics-Checked",
      "agent": "OpenClaw",
      "harness": "OpenClaw",
      "model": "Grok-4.3",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 36.3,
      "medium_rubrics_accuracy": 40.1,
      "hard_rubrics_accuracy": 24.7,
      "total_rubrics_accuracy": 34.4,
      "pass_at": {
        "10": 77,
        "20": 56,
        "30": 48,
        "40": 36,
        "50": 27,
        "60": 22,
        "70": 20,
        "80": 16,
        "90": 12,
        "100": 8
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 34.4,
      "overall_score": 34.4,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 42,
      "setting_name": "Codex--Gemini-3.1-Pro--Test-Rubrics-Checked",
      "agent": "Codex",
      "harness": "Codex",
      "model": "Gemini-3.1-Pro",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 39.6,
      "medium_rubrics_accuracy": 36,
      "hard_rubrics_accuracy": 21.5,
      "total_rubrics_accuracy": 31.9,
      "pass_at": {
        "10": 77,
        "20": 58,
        "30": 41,
        "40": 35,
        "50": 27,
        "60": 20,
        "70": 12,
        "80": 11,
        "90": 8,
        "100": 5
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 31.9,
      "overall_score": 31.9,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 43,
      "setting_name": "OpenClaw--Gemini-3.1-Pro--Test-Rubrics-Checked",
      "agent": "OpenClaw",
      "harness": "OpenClaw",
      "model": "Gemini-3.1-Pro",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 52.2,
      "medium_rubrics_accuracy": 30.4,
      "hard_rubrics_accuracy": 25.3,
      "total_rubrics_accuracy": 31.6,
      "pass_at": {
        "10": 65,
        "20": 50,
        "30": 45,
        "40": 34,
        "50": 31,
        "60": 24,
        "70": 20,
        "80": 17,
        "90": 11,
        "100": 8
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 31.6,
      "overall_score": 31.6,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 44,
      "setting_name": "Hermes--Gemini-3.1-Pro--Test-Rubrics-Checked",
      "agent": "Hermes",
      "harness": "Hermes",
      "model": "Gemini-3.1-Pro",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 42.4,
      "medium_rubrics_accuracy": 30.7,
      "hard_rubrics_accuracy": 15.2,
      "total_rubrics_accuracy": 27.1,
      "pass_at": {
        "10": 62,
        "20": 52,
        "30": 42,
        "40": 32,
        "50": 23,
        "60": 18,
        "70": 15,
        "80": 11,
        "90": 7,
        "100": 7
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 27.1,
      "overall_score": 27.1,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    },
    {
      "rank": 45,
      "setting_name": "DeepAgent--Grok-4.3--Test-Rubrics-Checked",
      "agent": "DeepAgent",
      "harness": "DeepAgent",
      "model": "Grok-4.3",
      "split_name": "Test-Rubrics-Checked",
      "easy_rubrics_accuracy": 24.5,
      "medium_rubrics_accuracy": 13.6,
      "hard_rubrics_accuracy": 9.7,
      "total_rubrics_accuracy": 13.8,
      "pass_at": {
        "10": 44,
        "20": 27,
        "30": 12,
        "40": 8,
        "50": 5,
        "60": 5,
        "70": 3,
        "80": 2,
        "90": 1,
        "100": 0
      },
      "source": "detailed-rubrics-pass-table",
      "report_url": "https://huggingface.co/datasets/Workspace-Bench/Workspace-Bench-Lite",
      "verified": true,
      "date": "2026-05-14",
      "rubric_pass_rate": 13.8,
      "overall_score": 13.8,
      "workspace_size": "Lite",
      "profile": "All profiles",
      "capability": "Rubric pass rate"
    }
  ],
  "thresholds": [
    {
      "label": "Pass >= 30%",
      "value": 30,
      "averagePassedTasks": 62.9,
      "bestPassedTasks": 90,
      "systemsWithAnyPass": 45
    },
    {
      "label": "Pass >= 40%",
      "value": 40,
      "averagePassedTasks": 54.7,
      "bestPassedTasks": 82,
      "systemsWithAnyPass": 45
    },
    {
      "label": "Pass >= 50%",
      "value": 50,
      "averagePassedTasks": 46.2,
      "bestPassedTasks": 76,
      "systemsWithAnyPass": 45
    },
    {
      "label": "Pass >= 60%",
      "value": 60,
      "averagePassedTasks": 38.4,
      "bestPassedTasks": 65,
      "systemsWithAnyPass": 45
    },
    {
      "label": "Pass >= 70%",
      "value": 70,
      "averagePassedTasks": 30.8,
      "bestPassedTasks": 51,
      "systemsWithAnyPass": 45
    },
    {
      "label": "Pass >= 80%",
      "value": 80,
      "averagePassedTasks": 24.3,
      "bestPassedTasks": 43,
      "systemsWithAnyPass": 45
    },
    {
      "label": "Pass >= 90%",
      "value": 90,
      "averagePassedTasks": 17,
      "bestPassedTasks": 36,
      "systemsWithAnyPass": 45
    },
    {
      "label": "Pass >= 100%",
      "value": 100,
      "averagePassedTasks": 11.2,
      "bestPassedTasks": 24,
      "systemsWithAnyPass": 44
    }
  ],
  "fullSummaryRows": [
    {
      "rank": 1,
      "agent": "Human + Tools",
      "harness": "Human reference",
      "model": "Human",
      "overall_score": 80.7,
      "task_success_rate": 80.7,
      "rubric_pass_rate": 80.7,
      "workspace_size": "Full",
      "profile": "All profiles",
      "capability": "Overall",
      "date": "2026-05-05",
      "verified": true,
      "source": "paper-reported",
      "report_url": "https://arxiv.org/abs/2605.03596"
    },
    {
      "rank": 2,
      "agent": "Best public agent",
      "harness": "Best evaluated harness",
      "model": "Best evaluated model",
      "overall_score": 68.7,
      "task_success_rate": 68.7,
      "rubric_pass_rate": 68.7,
      "workspace_size": "Full",
      "profile": "All profiles",
      "capability": "Overall",
      "date": "2026-05-05",
      "verified": true,
      "source": "paper-reported",
      "report_url": "https://arxiv.org/abs/2605.03596"
    },
    {
      "rank": 3,
      "agent": "Average public agent",
      "harness": "Mean across evaluated agents",
      "model": "4 harnesses / 7 foundation models",
      "overall_score": 47.4,
      "task_success_rate": 47.4,
      "rubric_pass_rate": 47.4,
      "workspace_size": "Full",
      "profile": "All profiles",
      "capability": "Overall",
      "date": "2026-05-05",
      "verified": true,
      "source": "paper-reported",
      "report_url": "https://arxiv.org/abs/2605.03596"
    }
  ],
  "leaderboards": [
    {
      "name": "Overall",
      "description": "Full Workspace-Bench paper summary. Detailed full-benchmark per-system tables are not public yet."
    },
    {
      "name": "Workspace-Bench-Lite",
      "description": "Latest public Workspace-Bench-Lite detailed rubric results generated from the released experiment table."
    },
    {
      "name": "Threshold Views",
      "description": "Inspect how many public Lite system combinations clear each rubric pass-rate threshold."
    },
    {
      "name": "By Worker Profile",
      "description": "Official benchmark composition by workspace profile from the public distribution figure."
    },
    {
      "name": "By Difficulty",
      "description": "Official task difficulty split from the public distribution figure."
    },
    {
      "name": "By Ability",
      "description": "Official task ability counts from the public distribution figure."
    }
  ]
}
