-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfilter_papers.py
More file actions
32 lines (24 loc) · 963 Bytes
/
filter_papers.py
File metadata and controls
32 lines (24 loc) · 963 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import json
import re
# Load the raw paper records from the source JSON file.
with open('/home/weijia/paper_analyze/iclr26_all_papers.json', encoding='utf-8') as source_file:
    data = json.loads(source_file.read())
# Phrases to search for. Matching ignores case and collapses whitespace,
# so entries may be written in any case.
keywords = ['code generation']


def _as_text(value):
    """Flatten a metadata value (None, str, list, dict, scalar) into one string."""
    if value is None:
        return ''
    if isinstance(value, str):
        return value
    if isinstance(value, dict):
        return ' '.join(_as_text(item) for item in value.values())
    if isinstance(value, (list, tuple)):
        return ' '.join(_as_text(item) for item in value)
    return str(value)


def _normalize(text):
    """Lower-case ``text`` and collapse every whitespace run to one space."""
    return ' '.join(text.lower().split())


def matches_keywords(paper):
    """Return True if any entry of ``keywords`` occurs in the paper's text.

    The searched text is built from the ``title``, ``abstract``,
    ``keywords`` and ``primary_area`` entries of ``paper``. Missing or
    ``None`` entries count as empty, and a ``keywords`` entry may be either
    a list of strings or a single string.

    Args:
        paper: Mapping holding the paper's metadata.

    Returns:
        bool: True when at least one search phrase is a substring of the
        normalized text, False otherwise.
    """
    haystack = _normalize(' '.join(
        _as_text(paper.get(field))
        for field in ('title', 'abstract', 'keywords', 'primary_area')
    ))
    # Normalize the search phrases too, so mixed-case or oddly spaced
    # entries in ``keywords`` still match.
    return any(_normalize(keyword) in haystack for keyword in keywords)
# Keep only the papers whose text mentions one of the keywords.
filtered_data = list(filter(matches_keywords, data))

# Write the selected papers as indented JSON, keeping non-ASCII text readable.
with open('/home/weijia/paper_analyze/filtered_papers_code_generation.json', 'w', encoding='utf-8') as out_file:
    json.dump(filtered_data, out_file, ensure_ascii=False, indent=2)

# Report how many papers were kept and where they were written.
print(f'筛选后论文数量: {len(filtered_data)}')
print('结果已保存到: filtered_papers_code_generation.json')