Skip to content

Commit 3117c99

Browse files
committed
Add latest papers on benchmarks, data synthesis, issue resolution, products, surveys, and so on
1 parent 3efdbe8 commit 3117c99

11 files changed

Lines changed: 444 additions & 46 deletions

README.md

Lines changed: 195 additions & 40 deletions
Large diffs are not rendered by default.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
- title: "AutoMind: Adaptive Knowledgeable Agent for Automated Data Science"
2+
authors: "Yixin Ou, Yujie Luo, Jingsheng Zheng, Lanning Wei, Zhuoyun Yu, Shuofei Qiao, Jintian Zhang, Da Zheng, Yuren Mao, Yunjun Gao, Huajun Chen, Ningyu Zhang"
3+
venue: "arXiv 2025"
4+
links:
5+
paper: "https://arxiv.org/abs/2506.10974"
6+
github: "https://github.com/innovatingAI/AutoMind"
7+
website: "https://innovatingai.github.io/"

data/papers_benchmarks.yaml

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,13 +38,29 @@
3838
github: "https://github.com/multi-swe-bench/multi-swe-bench"
3939
website: "https://multi-swe-bench.github.io/"
4040

41-
- title: "SWE-bench Goes Live!"
42-
authors: "Linghao Zhang, Shilin He, Chaoyun Zhang, Yu Kang, Bowen Li, Chengxing Xie, Junhao Wang, Maoquan Wang, Yufan Huang, Shengyu Fu, Elsie Nallipogu, Qingwei Lin, Yingnong Dang, Saravan Rajmohan, Dongmei Zhang"
41+
- title: "SWE-Effi: Re-Evaluating Software AI Agent System Effectiveness Under Resource Constraints"
42+
authors: "Zhiyu Fan, Kirill Vasilevski, Dayi Lin, Boyuan Chen, Yihao Chen, Zhiqing Zhong, Jie M. Zhang, Pinjia He, Ahmed E. Hassan"
4343
venue: "arXiv 2025"
4444
links:
45-
paper: "https://arxiv.org/abs/2505.23419"
46-
github: "https://github.com/microsoft/SWE-bench-Live"
47-
website: "https://swe-bench-live.github.io/"
45+
paper: "https://arxiv.org/abs/2509.09853"
46+
github: "https://github.com/Centre-for-Software-Excellence/SWE-Effi"
47+
website: "https://centre-for-software-excellence.github.io/SWE-Effi/"
48+
49+
- title: "SEC-bench: Automated Benchmarking of LLM Agents on Real-World Software Security Tasks"
50+
authors: "Hwiwon Lee, Ziqi Zhang, Hanxiao Lu, Lingming Zhang"
51+
venue: "arXiv 2025"
52+
links:
53+
paper: "https://arxiv.org/abs/2506.11791"
54+
github: "https://github.com/SEC-bench/SEC-bench"
55+
website: "https://sec-bench.github.io/"
56+
57+
- title: "SecureAgentBench: Benchmarking Secure Code Generation under Realistic Vulnerability Scenarios"
58+
authors: "Junkai Chen, Huihui Huang, Yunbo Lyu, Junwen An, Jieke Shi, Chengran Yang, Ting Zhang, Haoye Tian, Yikun Li, Zhenhao Li, Xin Zhou, Xing Hu, David Lo"
59+
venue: "arXiv 2025"
60+
links:
61+
paper: "https://arxiv.org/abs/2509.22097"
62+
github: "https://github.com/iCSawyer/SecureAgentBench"
63+
website: ""
4864

4965
- title: "SWE-MERA: A Dynamic Benchmark for Agenticly Evaluating Large Language Models on Software Engineering Tasks"
5066
authors: "Pavel Adamenko, Mikhail Ivanov, Aidar Valeev, Rodion Levichev, Pavel Zadorozhny, Ivan Lopatin, Dmitry Babayev, Alena Fenogenova, Valentin Malykh"
@@ -54,6 +70,22 @@
5470
github: "https://github.com/MERA-Evaluation/repotest"
5571
website: "https://mera-evaluation.github.io/demo-swe-mera/"
5672

73+
- title: "Auto-SWE-Bench: A Framework for the Scalable Generation of Software Engineering Benchmark from Open-Source Repositories"
74+
authors: "Anonymous Authors"
75+
venue: "2025"
76+
links:
77+
paper: "https://openreview.net/forum?id=Gxw1EDSm9S"
78+
github: ""
79+
website: ""
80+
81+
- title: "SWE-bench Goes Live!"
82+
authors: "Linghao Zhang, Shilin He, Chaoyun Zhang, Yu Kang, Bowen Li, Chengxing Xie, Junhao Wang, Maoquan Wang, Yufan Huang, Shengyu Fu, Elsie Nallipogu, Qingwei Lin, Yingnong Dang, Saravan Rajmohan, Dongmei Zhang"
83+
venue: "NeurIPS 2025 Datasets & Benchmarks Track"
84+
links:
85+
paper: "https://arxiv.org/abs/2505.23419"
86+
github: "https://github.com/microsoft/SWE-bench-Live"
87+
website: "https://swe-bench-live.github.io/"
88+
5789
- title: "SWE-Lancer: Can Frontier LLMs Earn $1 Million from Real-World Freelance Software Engineering?"
5890
authors: "Samuel Miserendino, Michele Wang, Tejal Patwardhan, Johannes Heidecke"
5991
venue: "ICML 2025"

data/papers_code_generation.yaml

Whitespace-only changes.

data/papers_data_synthesis.yaml

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,31 @@
1+
- title: "SWE-Mirror: Scaling Issue-Resolving Datasets by Mirroring Issues Across Repositories"
2+
authors: "Junhao Wang, Daoguang Zan, Shulin Xin, Siyao Liu, Yurong Wu, Kai Shen"
3+
venue: "arXiv 2025"
4+
links:
5+
paper: "https://arxiv.org/abs/2509.08724"
6+
github: ""
7+
website: ""
8+
19
- title: "SPICE: An Automated SWE-Bench Labeling Pipeline for Issue Clarity, Test Coverage, and Effort Estimation"
210
authors: "Gustavo A. Oliva, Gopi Krishnan Rajbahadur, Aaditya Bhatia, Haoxiang Zhang, Yihao Chen, Zhilong Chen, Arthur Leung, Dayi Lin, Boyuan Chen, Ahmed E. Hassan"
311
venue: "arXiv 2025"
412
links:
513
paper: "https://arxiv.org/abs/2507.09108"
614
github: ""
7-
website: ""
15+
website: ""
16+
17+
- title: "RepoForge: Training a SOTA Fast-thinking SWE Agent with an End-to-End Data Curation Pipeline Synergizing SFT and RL at Scale"
18+
authors: "Zhilong Chen, Chengzong Zhao, Boyuan Chen, Dayi Lin, Yihao Chen, Arthur Leung, Gopi Krishnan Rajbahadur, Gustavo Oliva, Haoxiang Zhang, Aadi Bhatia, Kim Kisub, Kirill Vasilevski, Youssef Esseddiq, Yanruo Yang, Ahmed Hassan"
19+
venue: "arXiv 2025"
20+
links:
21+
paper: "https://arxiv.org/abs/2508.01550"
22+
github: ""
23+
website: "https://centre-for-software-excellence.github.io/docs/blog/repoforge"
24+
25+
- title: "MCTS-Refined CoT: High-Quality Fine-Tuning Data for LLM-Based Repository Issue Resolution"
26+
authors: "Yibo Wang, Zhihao Peng, Ying Wang, Zhao Wei, Hai Yu, Zhiliang Zhu"
27+
venue: "arXiv 2025"
28+
links:
29+
paper: "https://arxiv.org/abs/2506.12728"
30+
github: ""
31+
website: "https://mcts-refine.github.io/"
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
- title: "Issue Localization via LLM-Driven Iterative Code Graph Searching"
2+
authors: "Zhonghao Jiang, Xiaoxue Ren, Meng Yan, Wei Jiang, Yong Li, Zhongxin Liu"
3+
venue: "ASE 2025"
4+
links:
5+
paper: "https://arxiv.org/abs/2503.22424"
6+
github: "https://github.com/ZhonghaoJiang/CoSIL"
7+
website: ""

data/papers_issue_resolution.yaml

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,46 @@
66
github: "https://github.com/EuniAI/Prometheus"
77
website: "https://euni.ai/"
88

9+
- title: "Terminal-Bench: A Benchmark for AI Agents in Terminal Environments"
10+
authors: "The Terminal-Bench Team"
11+
venue: "2025"
12+
links:
13+
paper: ""
14+
github: "https://github.com/laude-institute/terminal-bench"
15+
website: "https://www.tbench.ai/"
16+
17+
- title: "SWE-Bench Pro: Can AI Agents Solve Long-Horizon Software Engineering Tasks?"
18+
authors: "Xiang Deng, Jeff Da, Edwin Pan, Yannis Yiming He, Charles Ide, Kanak Garg, Niklas Lauffer, Andrew Park, Nitin Pasari, Chetan Rane, Karmini Sampath, Maya Krishnan, Srivatsa Kundurthy, Sean Hendryx, Zifan Wang, Chen Bo Calvin Zhang, Noah Jacobson, Bing Liu, Brad Kenstler"
19+
venue: "arXiv 2025"
20+
links:
21+
paper: "https://arxiv.org/abs/2509.16941"
22+
github: "https://github.com/scaleapi/SWE-bench_Pro-os"
23+
website: "https://scale.com/leaderboard/swe_bench_pro_public"
24+
25+
- title: "SWE-PolyBench: A multi-language benchmark for repository level evaluation of coding agents"
26+
authors: "Muhammad Shihab Rashid, Christian Bock, Yuan Zhuang, Alexander Buchholz, Tim Esler, Simon Valentin, Luca Franceschi, Martin Wistuba, Prabhu Teja Sivaprasad, Woo Jung Kim, Anoop Deoras, Giovanni Zappella, Laurent Callot"
27+
venue: "arXiv 2025"
28+
links:
29+
paper: "https://arxiv.org/abs/2504.08703"
30+
github: "https://github.com/amazon-science/SWE-PolyBench"
31+
website: "https://amazon-science.github.io/SWE-PolyBench/"
32+
33+
- title: "Multi-SWE-bench: A Multilingual Benchmark for Issue Resolving"
34+
authors: "Daoguang Zan, Zhirong Huang, Wei Liu, Hanwu Chen, Linhao Zhang, Shulin Xin, Lu Chen, Qi Liu, Xiaojian Zhong, Aoyan Li, Siyao Liu, Yongsheng Xiao, Liangqiang Chen, Yuyu Zhang, Jing Su, Tianyu Liu, Rui Long, Kai Shen, Liang Xiang"
35+
venue: "arXiv 2025"
36+
links:
37+
paper: "https://arxiv.org/abs/2504.02605"
38+
github: "https://github.com/multi-swe-bench/multi-swe-bench"
39+
website: "https://multi-swe-bench.github.io/"
40+
41+
- title: "SWE-Effi: Re-Evaluating Software AI Agent System Effectiveness Under Resource Constraints"
42+
authors: "Zhiyu Fan, Kirill Vasilevski, Dayi Lin, Boyuan Chen, Yihao Chen, Zhiqing Zhong, Jie M. Zhang, Pinjia He, Ahmed E. Hassan"
43+
venue: "arXiv 2025"
44+
links:
45+
paper: "https://arxiv.org/abs/2509.09853"
46+
github: "https://github.com/Centre-for-Software-Excellence/SWE-Effi"
47+
website: "https://centre-for-software-excellence.github.io/SWE-Effi/"
48+
949
- title: "Saving SWE-Bench: A Benchmark Mutation Approach for Realistic Agent Evaluation"
1050
authors: "Spandan Garg, Ben Steenhoek, Yufan Huang"
1151
venue: "arXiv 2025"
@@ -14,6 +54,22 @@
1454
github: ""
1555
website: ""
1656

57+
- title: "SWE-MERA: A Dynamic Benchmark for Agenticly Evaluating Large Language Models on Software Engineering Tasks"
58+
authors: "Pavel Adamenko, Mikhail Ivanov, Aidar Valeev, Rodion Levichev, Pavel Zadorozhny, Ivan Lopatin, Dmitry Babayev, Alena Fenogenova, Valentin Malykh"
59+
venue: "arXiv 2025"
60+
links:
61+
paper: "https://arxiv.org/abs/2507.11059"
62+
github: "https://github.com/MERA-Evaluation/repotest"
63+
website: "https://mera-evaluation.github.io/demo-swe-mera/"
64+
65+
- title: "Auto-SWE-Bench: A Framework for the Scalable Generation of Software Engineering Benchmark from Open-Source Repositories"
66+
authors: "Anonymous Authors"
67+
venue: "2025"
68+
links:
69+
paper: "https://openreview.net/forum?id=Gxw1EDSm9S"
70+
github: ""
71+
website: ""
72+
1773
- title: "SWE-RL: Advancing LLM Reasoning via Reinforcement Learning on Open Software Evolution"
1874
authors: "Yuxiang Wei, Olivier Duchenne, Jade Copet, Quentin Carbonneaux, Lingming Zhang, Daniel Fried, Gabriel Synnaeve, Rishabh Singh, Sida I. Wang"
1975
venue: "NeurIPS 2025"
@@ -22,6 +78,14 @@
2278
github: "https://github.com/facebookresearch/swe-rl"
2379
website: ""
2480

81+
- title: "SWE-bench Goes Live!"
82+
authors: "Linghao Zhang, Shilin He, Chaoyun Zhang, Yu Kang, Bowen Li, Chengxing Xie, Junhao Wang, Maoquan Wang, Yufan Huang, Shengyu Fu, Elsie Nallipogu, Qingwei Lin, Yingnong Dang, Saravan Rajmohan, Dongmei Zhang"
83+
venue: "NeurIPS 2025 Datasets & Benchmarks Track"
84+
links:
85+
paper: "https://arxiv.org/abs/2505.23419"
86+
github: "https://github.com/microsoft/SWE-bench-Live"
87+
website: "https://swe-bench-live.github.io/"
88+
2589
- title: "Training Software Engineering Agents and Verifiers with SWE-Gym"
2690
authors: "Jiayi Pan, Xingyao Wang, Graham Neubig, Navdeep Jaitly, Heng Ji, Alane Suhr, Yizhe Zhang"
2791
venue: "ICML 2025"
@@ -30,6 +94,14 @@
3094
github: "https://github.com/SWE-Gym/SWE-Gym"
3195
website: ""
3296

97+
- title: "SWE-Lancer: Can Frontier LLMs Earn $1 Million from Real-World Freelance Software Engineering?"
98+
authors: "Samuel Miserendino, Michele Wang, Tejal Patwardhan, Johannes Heidecke"
99+
venue: "ICML 2025"
100+
links:
101+
paper: "https://arxiv.org/abs/2502.12115"
102+
github: "https://github.com/openai/frontier-evals/tree/main/project/swelancer"
103+
website: "https://openai.com/index/swe-lancer/"
104+
33105
- title: "SWE-GPT: A Process-Centric Language Model for Automated Software Improvement"
34106
authors: "Yingwei Ma, Rongyu Cao, Yongchang Cao, Yue Zhang, Jue Chen, Yibo Liu, Yuchen Liu, Binhua Li, Fei Huang, Yongbin Li"
35107
venue: "ISSTA 2025"
@@ -62,6 +134,14 @@
62134
github: "https://github.com/OpenAutoCoder/Agentless"
63135
website: ""
64136

137+
- title: "OmniGIRL: A Multilingual and Multimodal Benchmark for GitHub Issue Resolution"
138+
authors: "Lianghong Guo, Wei Tao, Runhan Jiang, Yanlin Wang, Jiachi Chen, Xilin Liu, Yuchi Ma, Mingzhi Mao, Hongyu Zhang, Zibin Zheng"
139+
venue: "ISSTA 2025"
140+
links:
141+
paper: "https://arxiv.org/abs/2505.04606"
142+
github: "https://github.com/DeepSoftwareAnalytics/OmniGIRL"
143+
website: "https://deepsoftwareanalytics.github.io/omnigirl_leaderboard.html"
144+
65145
- title: "AutoCodeRover: Autonomous Program Improvement"
66146
authors: "Yuntong Zhang, Haifeng Ruan, Zhiyu Fan, Abhik Roychoudhury"
67147
venue: "ISSTA 2024"
@@ -77,3 +157,11 @@
77157
paper: "https://arxiv.org/abs/2405.15793"
78158
github: "https://github.com/SWE-agent/SWE-agent"
79159
website: "https://swe-agent.com/"
160+
161+
- title: "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?"
162+
authors: "Carlos E. Jimenez, John Yang, Alexander Wettig, Shunyu Yao, Kexin Pei, Ofir Press, Karthik Narasimhan"
163+
venue: "ICLR 2024"
164+
links:
165+
paper: "https://arxiv.org/abs/2310.06770"
166+
github: "https://github.com/SWE-bench/SWE-bench"
167+
website: "https://www.swebench.com/"

data/papers_products.yaml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,14 @@
1414
github: "https://github.com/openai/codex"
1515
website: "https://openai.com/codex/"
1616

17+
- title: "Lovable"
18+
authors: "Lovable"
19+
venue: "2024"
20+
links:
21+
paper: ""
22+
github: ""
23+
website: "https://lovable.dev/"
24+
1725
- title: "Devin"
1826
authors: "Cognition AI"
1927
venue: "2024"
@@ -54,6 +62,14 @@
5462
github: "https://github.com/SWE-agent/SWE-agent"
5563
website: "https://swe-agent.com/"
5664

65+
- title: "Open Lovable"
66+
authors: "Firecrawl"
67+
venue: "2025"
68+
links:
69+
paper: ""
70+
github: "https://github.com/firecrawl/open-lovable"
71+
website: "https://open-lovable.com/"
72+
5773
- title: "PR-Agent & Qodo Merge"
5874
authors: "Qodo"
5975
venue: "2024"
@@ -134,3 +150,10 @@
134150
github: ""
135151
website: "https://www.essential.com/"
136152

153+
- title: "Anything"
154+
authors: "Anything"
155+
venue: "2025"
156+
links:
157+
paper: ""
158+
github: ""
159+
website: "https://www.createanything.com/"
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
- title: "SEC-bench: Automated Benchmarking of LLM Agents on Real-World Software Security Tasks"
2+
authors: "Hwiwon Lee, Ziqi Zhang, Hanxiao Lu, Lingming Zhang"
3+
venue: "arXiv 2025"
4+
links:
5+
paper: "https://arxiv.org/abs/2506.11791"
6+
github: "https://github.com/SEC-bench/SEC-bench"
7+
website: "https://sec-bench.github.io/"
8+
9+
- title: "SecureAgentBench: Benchmarking Secure Code Generation under Realistic Vulnerability Scenarios"
10+
authors: "Junkai Chen, Huihui Huang, Yunbo Lyu, Junwen An, Jieke Shi, Chengran Yang, Ting Zhang, Haoye Tian, Yikun Li, Zhenhao Li, Xin Zhou, Xing Hu, David Lo"
11+
venue: "arXiv 2025"
12+
links:
13+
paper: "https://arxiv.org/abs/2509.22097"
14+
github: "https://github.com/iCSawyer/SecureAgentBench"
15+
website: ""
16+
17+
- title: "CVE-Bench: A Benchmark for AI Agents’ Ability to Exploit Real-World Web Application Vulnerabilities"
18+
authors: "Yuxuan Zhu, Antony Kellermann, Dylan Bowman, Philip Li, Akul Gupta, Adarsh Danda, Richard Fang, Conner Jensen, Eric Ihli, Jason Benn, Jet Geronimo, Avi Dhir, Sudhit Rao, Kaicheng Yu, Twm Stone, Daniel Kang"
19+
venue: "ICML 2025"
20+
links:
21+
paper: "https://arxiv.org/abs/2503.17332"
22+
github: "https://github.com/uiuc-kang-lab/cve-bench"
23+
website: ""
24+
25+
- title: "CVE-Bench: Benchmarking LLM-based Software Engineering Agent’s Ability to Repair Real-World CVE Vulnerabilities"
26+
authors: "Peiran Wang, Xiaogeng Liu, Chaowei Xiao"
27+
venue: "NAACL 2025"
28+
links:
29+
paper: "https://aclanthology.org/2025.naacl-long.212/"
30+
github: ""
31+
website: ""

data/papers_sql_engineering.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
- title: "SWE-SQL: Illuminating LLM Pathways to Solve User SQL Issues in Real-World Applications"
2+
authors: "Jinyang Li, Xiaolong Li, Ge Qu, Per Jacobsson, Bowen Qin, Binyuan Hui, Shuzheng Si, Nan Huo, Xiaohan Xu, Yue Zhang, Ziwei Tang, Yuanshuai Li, Florensia Widjaja, Xintong Zhu, Feige Zhou, Yongfeng Huang, Yannis Papakonstantinou, Fatma Ozcan, Chenhao Ma, Reynold Cheng"
3+
venue: "arXiv 2025"
4+
links:
5+
paper: "https://arxiv.org/abs/2506.18951"
6+
github: "https://github.com/bird-bench/BIRD-CRITIC-1"
7+
website: "https://bird-critic.github.io/"

0 commit comments

Comments
 (0)