|
22 | 22 | github: "https://github.com/amazon-science/SWE-PolyBench" |
23 | 23 | website: "https://amazon-science.github.io/SWE-PolyBench/" |
24 | 24 |
|
| 25 | +- title: "MORSE-500: A Programmatically Controllable Video Benchmark to Stress-Test Multimodal Reasoning" |
| 26 | + authors: "Zikui Cai, Andrew Wang, Anirudh Satheesh, Ankit Nakhawa, Hyunwoo Jae, Keenan Powell, Minghui Liu, Neel Jay, Sungbin Oh, Xiyao Wang, Yongyuan Liang, Tom Goldstein, Furong Huang" |
| 27 | + venue: "arXiv 2025" |
| 28 | + links: |
| 29 | + paper: "https://arxiv.org/abs/2506.05523" |
| 30 | + github: "https://github.com/morse-benchmark/morse-500" |
| 31 | + website: "https://morse-500.github.io/" |
| 32 | + |
25 | 33 | - title: "SWE-SQL: Illuminating LLM Pathways to Solve User SQL Issues in Real-World Applications" |
26 | 34 | authors: "Jinyang Li, Xiaolong Li, Ge Qu, Per Jacobsson, Bowen Qin, Binyuan Hui, Shuzheng Si, Nan Huo, Xiaohan Xu, Yue Zhang, Ziwei Tang, Yuanshuai Li, Florensia Widjaja, Xintong Zhu, Feige Zhou, Yongfeng Huang, Yannis Papakonstantinou, Fatma Ozcan, Chenhao Ma, Reynold Cheng" |
27 | 35 | venue: "arXiv 2025" |
|
46 | 54 | github: "https://github.com/swe-perf/swe-perf" |
47 | 55 | website: "https://swe-perf.github.io" |
48 | 56 |
|
| 57 | +- title: "MetaGen: A DSL, Database, and Benchmark for VLM-Assisted Metamaterial Generation" |
| 58 | + authors: "Liane Makatura, Benjamin Jones, Siyuan Bian, Wojciech Matusik" |
| 59 | + venue: "arXiv 2025" |
| 60 | + links: |
| 61 | + paper: "https://arxiv.org/abs/2508.17568" |
| 62 | + github: "" |
| 63 | + website: "" |
| 64 | + |
| 65 | +- title: "WebDS: An End-to-End Benchmark for Web-based Data Science" |
| 66 | + authors: "Ethan Hsu, Hong Meng Yam, Ines Bouissou, Aaron Murali John, Raj Thota, Josh Koe, Vivek Sarath Putta, G K Dharesan, Alexander Spangher, Shikhar Murty, Tenghao Huang, Christopher D. Manning" |
| 67 | + venue: "arXiv 2025" |
| 68 | + links: |
| 69 | + paper: "https://arxiv.org/abs/2508.01222" |
| 70 | + github: "" |
| 71 | + website: "" |
| 72 | + |
| 73 | +- title: "Plot2Code: A Comprehensive Benchmark for Evaluating Multi-modal Large Language Models in Code Generation from Scientific Plots" |
| 74 | + authors: "Chengyue Wu, Yixiao Ge, Qiushan Guo, Jiahao Wang, Zhixuan Liang, Zeyu Lu, Ying Shan, Ping Luo" |
| 75 | + venue: "arXiv 2025" |
| 76 | + links: |
| 77 | + paper: "https://arxiv.org/abs/2405.07990" |
| 78 | + github: "https://github.com/TencentARC/Plot2Code" |
| 79 | + website: "https://huggingface.co/datasets/TencentARC/Plot2Code" |
| 80 | + |
49 | 81 | - title: "SWE-rebench: An Automated Pipeline for Task Collection and Decontaminated Evaluation of Software Engineering Agents" |
50 | 82 | authors: "Ibragim Badertdinov, Alexander Golubev, Maksim Nekrashevich, Anton Shevtsov, Simon Karasik, Andrei Andriushchenko, Maria Trofimova, Daria Litvintseva, Boris Yangel" |
51 | 83 | venue: "arXiv 2025" |
|
70 | 102 | github: "https://github.com/Centre-for-Software-Excellence/SWE-Effi" |
71 | 103 | website: "https://centre-for-software-excellence.github.io/SWE-Effi/" |
72 | 104 |
|
| 105 | +- title: "PReview: A Benchmark Dataset for Pull Request Outcomes and Quality Analysis" |
| 106 | + authors: "Anonymous Authors" |
| 107 | + venue: "2025" |
| 108 | + links: |
| 109 | + paper: "https://openreview.net/forum?id=cdwp8BXTVV" |
| 110 | + github: "" |
| 111 | + website: "" |
| 112 | + |
73 | 113 | - title: "SEC-bench: Automated Benchmarking of LLM Agents on Real-World Software Security Tasks" |
74 | 114 | authors: "Hwiwon Lee, Ziqi Zhang, Hanxiao Lu, Lingming Zhang" |
75 | 115 | venue: "arXiv 2025" |
|
118 | 158 | github: "https://github.com/LiuHengyu321/IR3D-Bench" |
119 | 159 | website: "https://ir3d-bench.github.io/" |
120 | 160 |
|
| 161 | +- title: "BlenderGym: Benchmarking Foundational Model Systems for Graphics Editing" |
| 162 | + authors: "Yunqi Gu, Ian Huang, Jihyeon Je, Guandao Yang, Leonidas Guibas" |
| 163 | + venue: "CVPR 2025" |
| 164 | + links: |
| 165 | + paper: "https://openaccess.thecvf.com/content/CVPR2025/html/Gu_BlenderGym_Benchmarking_Foundational_Model_Systems_for_Graphics_Editing_CVPR_2025_paper.html" |
| 166 | + github: "https://github.com/richard-guyunqi/BlenderGym-Open" |
| 167 | + website: "https://blendergym.github.io/" |
| 168 | + |
121 | 169 | - title: "SyncMind: Measuring Agent Out-of-Sync Recovery in Collaborative Software Engineering" |
122 | 170 | authors: "Xuehang Guo, Xingyao Wang, Yangyi Chen, Sha Li, Chi Han, Manling Li, Heng Ji" |
123 | 171 | venue: "ICML 2025" |
|
150 | 198 | github: "https://github.com/uiuc-kang-lab/cve-bench" |
151 | 199 | website: "" |
152 | 200 |
|
| 201 | +- title: "WebMMU: A Benchmark for Multimodal Multilingual Website Understanding and Code Generation" |
| 202 | + authors: "Rabiul Awal, Mahsa Massoud, Aarash Feizi, Zichao Li, Suyuchen Wang, Christopher Pal, Aishwarya Agrawal, David Vazquez, Siva Reddy, Juan A. Rodriguez, Perouz Taslakian, Spandana Gella, Sai Rajeswar" |
| 203 | + venue: "EMNLP 2025" |
| 204 | + links: |
| 205 | + paper: "https://arxiv.org/abs/2508.16763" |
| 206 | + github: "" |
| 207 | + website: "https://webmmu-paper.github.io/" |
| 208 | + |
153 | 209 | - title: "CVE-Bench: Benchmarking LLM-based Software Engineering Agent’s Ability to Repair Real-World CVE Vulnerabilities" |
154 | 210 | authors: "Peiran Wang, Xiaogeng Liu, Chaowei Xiao" |
155 | 211 | venue: "NAACL 2025" |
|
166 | 222 | github: "https://github.com/FSoft-AI4Code/RepoExec" |
167 | 223 | website: "https://fsoft-ai4code.github.io/repoexec/" |
168 | 224 |
|
| 225 | +- title: "FEA-Bench: A Benchmark for Evaluating Repository-Level Code Generation for Feature Implementation" |
| 226 | + authors: "Wei Li, Xin Zhang, Zhongxin Guo, Shaoguang Mao, Wen Luo, Guangyue Peng, Yangyu Huang, Houfeng Wang, Scarlett Li" |
| 227 | + venue: "ACL 2025" |
| 228 | + links: |
| 229 | + paper: "https://arxiv.org/abs/2503.06680" |
| 230 | + github: "https://github.com/microsoft/FEA-Bench" |
| 231 | + website: "https://gmago-leway.github.io/fea-bench.github.io/" |
| 232 | + |
| 233 | +- title: "ProjectEval: A Benchmark for Programming Agents Automated Evaluation on Project-Level Code Generation" |
| 234 | + authors: "Kaiyuan Liu, Youcheng Pan, Yang Xiang, Daojing He, Jing Li, Yexing Du, Tianrun Gao" |
| 235 | + venue: "ACL 2025 Findings" |
| 236 | + links: |
| 237 | + paper: "https://aclanthology.org/2025.findings-acl.1036/" |
| 238 | + github: "https://github.com/RyanLoil/ProjectEval/" |
| 239 | + website: "" |
| 240 | + |
169 | 241 | - title: "EnvBench: A Benchmark for Automated Environment Setup" |
170 | 242 | authors: "Aleksandra Eliseeva, Alexander Kovrigin, Ilia Kholkin, Egor Bogomolov, Yaroslav Zharov" |
171 | 243 | venue: "ICLR 2025 Workshop" |
|
189 | 261 | paper: "https://arxiv.org/abs/2310.06770" |
190 | 262 | github: "https://github.com/SWE-bench/SWE-bench" |
191 | 263 | website: "https://www.swebench.com/" |
| 264 | + |
| 265 | +- title: "CADTalk: An Algorithm and Benchmark for Semantic Commenting of CAD Programs" |
| 266 | + authors: "Haocheng Yuan, Jing Xu, Hao Pan, Adrien Bousseau, Niloy J. Mitra, Changjian Li" |
| 267 | + venue: "CVPR 2024" |
| 268 | + links: |
| 269 | + paper: "https://openaccess.thecvf.com/content/CVPR2024/html/Yuan_CADTalk_An_Algorithm_and_Benchmark_for_Semantic_Commenting_of_CAD_CVPR_2024_paper.html" |
| 270 | + github: "https://github.com/YYYYYHC/CADTalk" |
| 271 | + website: "https://enigma-li.github.io/CADTalk/" |
| 272 | + |
| 273 | +- title: "DA-Code: Agent Data Science Code Generation Benchmark for Large Language Models" |
| 274 | + authors: "Yiming Huang, Jianwen Luo, Yan Yu, Yitong Zhang, Fangyu Lei, Yifan Wei, Shizhu He, Lifu Huang, Xiao Liu, Jun Zhao, Kang Liu" |
| 275 | + venue: "EMNLP 2024" |
| 276 | + links: |
| 277 | + paper: "https://aclanthology.org/2024.emnlp-main.748/" |
| 278 | + github: "https://github.com/yiyihum/da-code" |
| 279 | + website: "https://da-code-bench.github.io/" |