|
6 | 6 | github: "https://github.com/EuniAI/Prometheus" |
7 | 7 | website: "https://euni.ai/" |
8 | 8 |
|
| 9 | +- title: "Terminal-Bench: A Benchmark for AI Agents in Terminal Environments" |
| 10 | + authors: "The Terminal-Bench Team" |
| 11 | + venue: "2025" |
| 12 | + links: |
| 13 | + paper: "" |
| 14 | + github: "https://github.com/laude-institute/terminal-bench" |
| 15 | + website: "https://www.tbench.ai/" |
| 16 | + |
| 17 | +- title: "SWE-Bench Pro: Can AI Agents Solve Long-Horizon Software Engineering Tasks?" |
| 18 | + authors: "Xiang Deng, Jeff Da, Edwin Pan, Yannis Yiming He, Charles Ide, Kanak Garg, Niklas Lauffer, Andrew Park, Nitin Pasari, Chetan Rane, Karmini Sampath, Maya Krishnan, Srivatsa Kundurthy, Sean Hendryx, Zifan Wang, Chen Bo Calvin Zhang, Noah Jacobson, Bing Liu, Brad Kenstler" |
| 19 | + venue: "arXiv 2025" |
| 20 | + links: |
| 21 | + paper: "https://arxiv.org/abs/2509.16941" |
| 22 | + github: "https://github.com/scaleapi/SWE-bench_Pro-os" |
| 23 | + website: "https://scale.com/leaderboard/swe_bench_pro_public" |
| 24 | + |
| 25 | +- title: "SWE-PolyBench: A multi-language benchmark for repository level evaluation of coding agents" |
| 26 | + authors: "Muhammad Shihab Rashid, Christian Bock, Yuan Zhuang, Alexander Buchholz, Tim Esler, Simon Valentin, Luca Franceschi, Martin Wistuba, Prabhu Teja Sivaprasad, Woo Jung Kim, Anoop Deoras, Giovanni Zappella, Laurent Callot" |
| 27 | + venue: "arXiv 2025" |
| 28 | + links: |
| 29 | + paper: "https://arxiv.org/abs/2504.08703" |
| 30 | + github: "https://github.com/amazon-science/SWE-PolyBench" |
| 31 | + website: "https://amazon-science.github.io/SWE-PolyBench/" |
| 32 | + |
| 33 | +- title: "Multi-SWE-bench: A Multilingual Benchmark for Issue Resolving" |
| 34 | + authors: "Daoguang Zan, Zhirong Huang, Wei Liu, Hanwu Chen, Linhao Zhang, Shulin Xin, Lu Chen, Qi Liu, Xiaojian Zhong, Aoyan Li, Siyao Liu, Yongsheng Xiao, Liangqiang Chen, Yuyu Zhang, Jing Su, Tianyu Liu, Rui Long, Kai Shen, Liang Xiang" |
| 35 | + venue: "arXiv 2025" |
| 36 | + links: |
| 37 | + paper: "https://arxiv.org/abs/2504.02605" |
| 38 | + github: "https://github.com/multi-swe-bench/multi-swe-bench" |
| 39 | + website: "https://multi-swe-bench.github.io/" |
| 40 | + |
| 41 | +- title: "SWE-Effi: Re-Evaluating Software AI Agent System Effectiveness Under Resource Constraints" |
| 42 | + authors: "Zhiyu Fan, Kirill Vasilevski, Dayi Lin, Boyuan Chen, Yihao Chen, Zhiqing Zhong, Jie M. Zhang, Pinjia He, Ahmed E. Hassan" |
| 43 | + venue: "arXiv 2025" |
| 44 | + links: |
| 45 | + paper: "https://arxiv.org/abs/2509.09853" |
| 46 | + github: "https://github.com/Centre-for-Software-Excellence/SWE-Effi" |
| 47 | + website: "https://centre-for-software-excellence.github.io/SWE-Effi/" |
| 48 | + |
9 | 49 | - title: "Saving SWE-Bench: A Benchmark Mutation Approach for Realistic Agent Evaluation" |
10 | 50 | authors: "Spandan Garg, Ben Steenhoek, Yufan Huang" |
11 | 51 | venue: "arXiv 2025" |
|
14 | 54 | github: "" |
15 | 55 | website: "" |
16 | 56 |
|
| 57 | +- title: "SWE-MERA: A Dynamic Benchmark for Agenticly Evaluating Large Language Models on Software Engineering Tasks" |
| 58 | + authors: "Pavel Adamenko, Mikhail Ivanov, Aidar Valeev, Rodion Levichev, Pavel Zadorozhny, Ivan Lopatin, Dmitry Babayev, Alena Fenogenova, Valentin Malykh" |
| 59 | + venue: "arXiv 2025" |
| 60 | + links: |
| 61 | + paper: "https://arxiv.org/abs/2507.11059" |
| 62 | + github: "https://github.com/MERA-Evaluation/repotest" |
| 63 | + website: "https://mera-evaluation.github.io/demo-swe-mera/" |
| 64 | + |
| 65 | +- title: "Auto-SWE-Bench: A Framework for the Scalable Generation of Software Engineering Benchmark from Open-Source Repositories" |
| 66 | + authors: "Anonymous Authors" |
| 67 | + venue: "2025" |
| 68 | + links: |
| 69 | + paper: "https://openreview.net/forum?id=Gxw1EDSm9S" |
| 70 | + github: "" |
| 71 | + website: "" |
| 72 | + |
17 | 73 | - title: "SWE-RL: Advancing LLM Reasoning via Reinforcement Learning on Open Software Evolution" |
18 | 74 | authors: "Yuxiang Wei, Olivier Duchenne, Jade Copet, Quentin Carbonneaux, Lingming Zhang, Daniel Fried, Gabriel Synnaeve, Rishabh Singh, Sida I. Wang" |
19 | 75 | venue: "NeurIPS 2025" |
|
22 | 78 | github: "https://github.com/facebookresearch/swe-rl" |
23 | 79 | website: "" |
24 | 80 |
|
| 81 | +- title: "SWE-bench Goes Live!" |
| 82 | + authors: "Linghao Zhang, Shilin He, Chaoyun Zhang, Yu Kang, Bowen Li, Chengxing Xie, Junhao Wang, Maoquan Wang, Yufan Huang, Shengyu Fu, Elsie Nallipogu, Qingwei Lin, Yingnong Dang, Saravan Rajmohan, Dongmei Zhang" |
| 83 | + venue: "NeurIPS 2025 Datasets & Benchmarks Track" |
| 84 | + links: |
| 85 | + paper: "https://arxiv.org/abs/2505.23419" |
| 86 | + github: "https://github.com/microsoft/SWE-bench-Live" |
| 87 | + website: "https://swe-bench-live.github.io/" |
| 88 | + |
25 | 89 | - title: "Training Software Engineering Agents and Verifiers with SWE-Gym" |
26 | 90 | authors: "Jiayi Pan, Xingyao Wang, Graham Neubig, Navdeep Jaitly, Heng Ji, Alane Suhr, Yizhe Zhang" |
27 | 91 | venue: "ICML 2025" |
|
30 | 94 | github: "https://github.com/SWE-Gym/SWE-Gym" |
31 | 95 | website: "" |
32 | 96 |
|
| 97 | +- title: "SWE-Lancer: Can Frontier LLMs Earn $1 Million from Real-World Freelance Software Engineering?" |
| 98 | + authors: "Samuel Miserendino, Michele Wang, Tejal Patwardhan, Johannes Heidecke" |
| 99 | + venue: "ICML 2025" |
| 100 | + links: |
| 101 | + paper: "https://arxiv.org/abs/2502.12115" |
| 102 | + github: "https://github.com/openai/frontier-evals/tree/main/project/swelancer" |
| 103 | + website: "https://openai.com/index/swe-lancer/" |
| 104 | + |
33 | 105 | - title: "SWE-GPT: A Process-Centric Language Model for Automated Software Improvement" |
34 | 106 | authors: "Yingwei Ma, Rongyu Cao, Yongchang Cao, Yue Zhang, Jue Chen, Yibo Liu, Yuchen Liu, Binhua Li, Fei Huang, Yongbin Li" |
35 | 107 | venue: "ISSTA 2025" |
|
62 | 134 | github: "https://github.com/OpenAutoCoder/Agentless" |
63 | 135 | website: "" |
64 | 136 |
|
| 137 | +- title: "OmniGIRL: A Multilingual and Multimodal Benchmark for GitHub Issue Resolution" |
| 138 | + authors: "Lianghong Guo, Wei Tao, Runhan Jiang, Yanlin Wang, Jiachi Chen, Xilin Liu, Yuchi Ma, Mingzhi Mao, Hongyu Zhang, Zibin Zheng" |
| 139 | + venue: "ISSTA 2025" |
| 140 | + links: |
| 141 | + paper: "https://arxiv.org/abs/2505.04606" |
| 142 | + github: "https://github.com/DeepSoftwareAnalytics/OmniGIRL" |
| 143 | + website: "https://deepsoftwareanalytics.github.io/omnigirl_leaderboard.html" |
| 144 | + |
65 | 145 | - title: "AutoCodeRover: Autonomous Program Improvement" |
66 | 146 | authors: "Yuntong Zhang, Haifeng Ruan, Zhiyu Fan, Abhik Roychoudhury" |
67 | 147 | venue: "ISSTA 2024" |
|
77 | 157 | paper: "https://arxiv.org/abs/2405.15793" |
78 | 158 | github: "https://github.com/SWE-agent/SWE-agent" |
79 | 159 | website: "https://swe-agent.com/" |
| 160 | + |
| 161 | +- title: "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?" |
| 162 | + authors: "Carlos E. Jimenez, John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir Press, Karthik Narasimhan" |
| 163 | + venue: "ICLR 2024" |
| 164 | + links: |
| 165 | + paper: "https://arxiv.org/abs/2310.06770" |
| 166 | + github: "https://github.com/SWE-bench/SWE-bench" |
| 167 | + website: "https://www.swebench.com/" |
0 commit comments