Evaluation framework for AI coding agents. Define benchmarks, run agents, collect metrics.
pip install -e .
import agentbench
bench = agentbench.from_tasks("my-bench", [
{"id": "t1", "prompt": "...", "test_code": "..."},
])
summary = agentbench.run_sync(my_agent, bench)
print(agentbench.console_report(summary))

License: MIT