{
  "name": "AgentRanks coding test evidence",
  "url": "https://agentranks.uk/coding-tests.html",
  "dateModified": "2026-06-21",
  "currentScoreAudit": {
    "status": "provisional",
    "formula": "legacy_ARscore = agent_architecture_score * model_SWE_bench_percent / 100",
    "verdict": "Useful for first-pass stack discovery, but not accurate enough to be treated as a real coding ability leaderboard.",
    "issues": [
      "It mixes subjective agent architecture scores with model benchmark scores.",
      "It does not include end-to-end terminal task completion.",
      "It does not include cost per completed task, retry count, latency, or tool failure rate.",
      "It does not distinguish official, third-party, self-reported, and rumor sources.",
      "It can over-rank unavailable models or models with strong benchmark numbers but weak product access."
    ]
  },
  "recommendedFormula": {
    "label": "AgentRanks Tested Score v2",
    "formula": "0.35 * real_task_pass + 0.20 * SWE_bench + 0.15 * terminal_bench + 0.10 * cost_efficiency + 0.10 * availability + 0.10 * maintainability",
    "note": "Use v2 only after AgentRanks has run the same task pack against each agent + model stack."
  },
  "benchmarkSources": [
    {
      "name": "SWE-bench Verified",
      "unit": "model or agent bug-fix success",
      "why": "Real GitHub issue resolution with tests; strong signal, but increasingly saturated and source quality varies by run.",
      "source": "https://www.swebench.com/",
      "trust": "official benchmark"
    },
    {
      "name": "Terminal-Bench 2.0",
      "unit": "agent + model terminal completion",
      "why": "Closer to real coding-agent work because the system must inspect files, run commands, debug, and finish tasks.",
      "source": "https://www.tbench.ai/leaderboard/terminal-bench/2.0",
      "trust": "official benchmark"
    },
    {
      "name": "Terminal-Bench Hard",
      "unit": "model terminal task success",
      "why": "Useful for separating frontier coding models on harder terminal tasks.",
      "source": "https://artificialanalysis.ai/evaluations/terminalbench-hard",
      "trust": "third-party evaluation"
    },
    {
      "name": "SWE-bench / Vals AI",
      "unit": "model coding benchmark snapshot",
      "why": "Useful secondary snapshot with recent Fable 5, Opus 4.8, and GPT-5.5 figures, but should not replace primary benchmark links.",
      "source": "https://vals.ai/benchmarks/swebench",
      "trust": "third-party evaluation"
    }
  ],
  "externalSignals": [
    {
      "stack": "Codex + GPT-5.5",
      "type": "Terminal-Bench v2.1",
      "score": "83.4%",
      "source": "https://www.morphllm.com/best-ai-coding-agents-2026",
      "trust": "reported",
      "note": "Reported as leading Terminal-Bench v2.1 in June 2026 coverage; needs direct source verification before becoming an AgentRanks official score."
    },
    {
      "stack": "Claude Code + Fable 5",
      "type": "Terminal-Bench v2.1",
      "score": "83.1%",
      "source": "https://www.morphllm.com/best-ai-coding-agents-2026",
      "trust": "reported / unavailable",
      "note": "Very strong reported score, but Fable 5 access was suspended by Anthropic on June 12, 2026."
    },
    {
      "stack": "Claude Fable 5",
      "type": "SWE-bench Verified",
      "score": "95.0%",
      "source": "https://vals.ai/benchmarks/swebench",
      "trust": "third-party snapshot",
      "note": "Model-level coding signal, not an agent workflow score."
    },
    {
      "stack": "Claude Opus 4.8",
      "type": "SWE-bench Verified",
      "score": "88.6%",
      "source": "https://vals.ai/benchmarks/swebench",
      "trust": "third-party snapshot",
      "note": "Useful model-level signal for Claude Code-style stacks."
    },
    {
      "stack": "GPT-5.5",
      "type": "SWE-bench Verified",
      "score": "82.6%",
      "source": "https://vals.ai/benchmarks/swebench",
      "trust": "third-party snapshot",
      "note": "Useful model-level signal for Codex-style stacks."
    },
    {
      "stack": "Claude Fable 5",
      "type": "Terminal-Bench Hard",
      "score": "62.9%",
      "source": "https://artificialanalysis.ai/evaluations/terminalbench-hard",
      "trust": "third-party evaluation",
      "note": "Hard terminal task model benchmark; availability caveat applies."
    },
    {
      "stack": "GPT-5.5",
      "type": "Terminal-Bench Hard",
      "score": "60.6%",
      "source": "https://artificialanalysis.ai/evaluations/terminalbench-hard",
      "trust": "third-party evaluation",
      "note": "Hard terminal task model benchmark, useful as a model-side signal."
    }
  ],
  "agentRanksTestPack": [
    {
      "name": "Bug fix with hidden tests",
      "weight": 30,
      "task": "Patch a real failing issue in a small repo, add regression coverage, and pass hidden tests."
    },
    {
      "name": "Terminal autonomy",
      "weight": 20,
      "task": "Inspect a repo, run commands, diagnose failures, and produce a working fix without manual file hints."
    },
    {
      "name": "Feature build",
      "weight": 15,
      "task": "Implement a small UI/API feature from product requirements with tests and no unrelated churn."
    },
    {
      "name": "Refactor safety",
      "weight": 15,
      "task": "Refactor a shared module while preserving behavior and avoiding over-broad edits."
    },
    {
      "name": "Cost and latency",
      "weight": 10,
      "task": "Measure tokens, wall time, retries, and cost per accepted solution."
    },
    {
      "name": "Maintainability review",
      "weight": 10,
      "task": "Score code clarity, test quality, minimalism, and ease of future modification."
    }
  ]
}