
Commit 1d1840f

Merge pull request #31 from HIGHFIVE-SW/main
Merge main contents into develop
2 parents 82ea492 + 0ab7e08 commit 1d1840f

19 files changed

Lines changed: 1383 additions & 120 deletions

.github/workflows/deploy.yml

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
name: Deploy with GitHub Actions

on:
  push:
    branches:
      - main

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set execute permissions for deploy script
        run: chmod +x ${{ github.workspace }}/deploy_script.sh

      - name: Setup SSH Key
        run: |
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > id_rsa.pem
          chmod 600 id_rsa.pem

          ssh -i id_rsa.pem -o StrictHostKeyChecking=no elicer@central-01.tcp.tunnel.elice.io -p 50735 "cd ~/HIGHFIVE-AI/ && git reset --hard origin/main && git pull origin main && chmod +x deploy_script.sh && ./deploy_script.sh"

api_request/reliefweb.py

Lines changed: 0 additions & 56 deletions
This file was deleted.

app.py

Lines changed: 4 additions & 1 deletion
@@ -3,9 +3,10 @@
 from flask import Flask
 from flask_cors import CORS
 from dotenv import load_dotenv
+from flasgger import Swagger

 from server.logger import logger
-
+#test
 # Add the directory of the current app.py file to sys.path
 current_dir = os.path.dirname(os.path.abspath(__file__))
 if current_dir not in sys.path:
@@ -23,6 +24,8 @@
 app = Flask(__name__)
 CORS(app, resources={r"/*": {"origins": "*"}})

+swagger = Swagger(app)
+
 # Register all Blueprints
 from chat import chat_bp
 app.register_blueprint(chat_bp)
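Note: once Swagger(app) is registered, Flasgger serves an interactive UI at /apidocs and builds the API spec from YAML placed after a "---" marker in each view's docstring. A minimal sketch with a hypothetical /health endpoint (not part of this commit) illustrates the pattern:

    @app.route('/health')
    def health():
        """Health check.
        ---
        responses:
          200:
            description: Service is up
        """
        # Flasgger reads the YAML after '---' and adds this route to the Swagger spec.
        return {"status": "ok"}, 200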

crawler/bbc_crawler.py

Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
import requests
from crawler.keyword_extractor import extract_keyword
from summarization.sum_translate import translate_en_to_ko
from crawler.save_to_db import save_issues
from bs4 import BeautifulSoup
from datetime import datetime
from server.db import run_query

BASE_URL = 'https://web-cdn.api.bbci.co.uk/xd/content-collection/'
COLLECTIONS = {
    'natural-wonders': '9f0b9075-b620-4859-abdc-ed042dd9ee66',
    'weather-science': '696fca43-ec53-418d-a42c-067cb0449ba9',
    'climate-solutions': '5fa7bbe8-5ea3-4bc6-ac7e-546d0dc4a16b',
}
HEADERS = {
    'User-Agent': 'Mozilla/5.0'
}
SIZE = 9

def get_last_issue_date():
    # Returns the newest issue_date stored in the DB as a formatted string, or None if the table is empty.
    sql = """
        SELECT MAX(issue_date)
        FROM issues;
    """
    result = run_query(sql)

    if result and result[0][0]:
        dt = result[0][0]
        latest_issue_date = dt.strftime("%Y-%m-%d %H:%M:%S.%f")
        return latest_issue_date
    else:
        return None

def is_end(date, end_time):
    # True if the article date is not newer than the last stored issue date.
    date_dt = datetime.strptime(date, "%Y-%m-%d %H:%M:%S.%f")
    end_time_dt = datetime.strptime(end_time, "%Y-%m-%d %H:%M:%S.%f")
    return date_dt <= end_time_dt

def get_datetime(time):
    dt = datetime.strptime(time, "%Y-%m-%dT%H:%M:%S.%fZ")
    return dt.strftime("%Y-%m-%d %H:%M:%S.%f")

def get_content(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    content_divs = soup.find_all('div', attrs={'data-component': 'text-block'})
    contents = [div.get_text(strip=True) for div in content_divs]
    full_content = '\n'.join(contents) if contents else "No Content"

    return full_content

def get_articles(page, collection_id, end_time):
    params = {
        'page': page,
        'size': SIZE,
    }

    response = requests.get(BASE_URL + collection_id, params=params, headers=HEADERS)

    if not response:
        return []

    datas = response.json().get('data')
    articles = []

    for data in datas:
        date = get_datetime(data['firstPublishedAt'])

        if end_time:
            if is_end(date, end_time):
                break

        title = translate_en_to_ko(data['title'])
        keyword = extract_keyword(data['summary'])
        summary = translate_en_to_ko(data['summary'])
        url = "https://www.bbc.com" + data['path']
        image = data['indexImage']['model']['blocks']['src'] or None

        articles.append(
            {
                'content': summary,
                'image_url': image,
                'issue_date': date,
                'keyword': keyword,
                'site_url': url,
                'title': title,
            }
        )
        print(f"[BBC] Crawled: {title}")

    return articles

def crawl():
    print("[BBC] Starting crawl")
    results = []
    last_issue_date = get_last_issue_date()

    if last_issue_date:
        print(f"[BBC] Crawling only data newer than the last issue in the DB (DATE: {last_issue_date})")
    else:
        print("[BBC] No issues in the DB, crawling all data")

    for category, collection_id in COLLECTIONS.items():
        # print(f"[BBC] Category {category}:")
        page = 0

        while True:
            articles = get_articles(page, collection_id, last_issue_date)

            if not articles:
                break

            results.extend(articles)
            page += 1

    if results:
        print(f"[BBC] Crawl finished: crawled {len(results)} issues.")
        save_issues(results)
    else:
        print("[BBC] Crawl finished: no new issues.")


def main():
    crawl()

if __name__ == '__main__':
    main()
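Both crawlers depend on run_query from server.db, which is not part of this diff. From its usage above (run_query(sql) returning rows such that result[0][0] is a datetime), it presumably opens a connection, executes the SELECT, and returns the fetched rows. A minimal sketch under that assumption, with pymysql and made-up environment-variable names, might look like:

    # Hypothetical sketch of server/db.py's run_query; the real module is not shown in this commit.
    # The driver choice (pymysql) and the DB_* environment variables are assumptions for illustration.
    import os
    import pymysql

    def run_query(sql, params=None):
        conn = pymysql.connect(
            host=os.getenv("DB_HOST"),
            user=os.getenv("DB_USER"),
            password=os.getenv("DB_PASSWORD"),
            database=os.getenv("DB_NAME"),
        )
        try:
            with conn.cursor() as cur:
                cur.execute(sql, params or ())
                return cur.fetchall()  # e.g. ((datetime(...),),) so callers read result[0][0]
        finally:
            conn.close()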

crawler/idealist_crawler.py

Lines changed: 146 additions & 0 deletions
@@ -0,0 +1,146 @@
import requests
import json
from datetime import datetime, timedelta, timezone
from crawler.keyword_extractor import extract_keyword
from crawler.save_to_db import save_activities
from server.db import run_query

ENDPOINT = "https://nsv3auess7-dsn.algolia.net/1/indexes/*/queries"
HEADERS = {
    "Content-Type": "application/json",
    "x-algolia-agent": "Algolia for JavaScript (5.20.0); Search (5.20.0); Browser",
    "x-algolia-api-key": "c2730ea10ab82787f2f3cc961e8c1e06",
    "x-algolia-application-id": "NSV3AUESS7"
}
DEFAULT_IMAGE_URL = "https://www.idealist.org/assets/417d88fd628db1c1ac861f3ea8db58c1a159d52a/images/icons/action-opps/action-opps-volunteermatch.svg"

def get_last_timestamp():
    # Returns the most recent stored start_date as Unix epoch seconds, or 0 if nothing is stored yet.
    sql = """
        SELECT start_date
        FROM activities
        WHERE activity_site = 'IDEALIST'
        ORDER BY start_date DESC
        LIMIT 1;
    """
    last_timestamp = run_query(sql)

    if last_timestamp:
        dt = last_timestamp[0][0].replace(tzinfo=timezone.utc)
        return int(dt.timestamp())
    else:
        return 0

def build_payload(page, type='volunteer', timestamp=0):
    # Only records published after the given timestamp are requested.
    if type == 'volunteer':
        filters = f"actionType:'VOLOP' AND published > {timestamp}"
        index_name = "idealist7-production-action-opps"
    else:
        filters = f"type:'INTERNSHIP' AND published > {timestamp}"
        index_name = "idealist7-production"

    return {
        "requests": [
            {
                "indexName": index_name,
                "facets": ["*"],
                "hitsPerPage": 100,
                "attributesToSnippet": ["description:20"],
                "attributesToRetrieve": ["*"],
                "filters": filters,
                "removeStopWords": True,
                "ignorePlurals": True,
                "advancedSyntax": True,
                "queryLanguages": ["en"],
                "page": page,
                "query": "",
                "getRankingInfo": True,
                "clickAnalytics": True,
                "analytics": True
            }
        ]
    }

def get_url(item):
    url = item.get("url")
    if isinstance(url, str):
        return url
    elif isinstance(url, dict):
        return "https://www.idealist.org" + next(iter(url.values()), "")
    return ""

def get_image(item):
    img = item.get("imageUrl") or DEFAULT_IMAGE_URL
    return img

def get_published(item):
    timestamp = item.get("published")
    return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S.%f')

def get_activities(page, timestamp, type):
    payload = build_payload(page, type, timestamp)
    response = requests.post(ENDPOINT, headers=HEADERS, json=payload)

    try:
        data = response.json()["results"][0]["hits"]
    except Exception as e:
        print(f"[!] JSON parsing error: {e}")
        return None

    result = []

    if data:
        for item in data:
            activity_type = "VOLUNTEER" if type == 'volunteer' else 'INTERNSHIP'
            activity_content = item.get("description")
            activity_name = item.get("name")
            activity_image_url = get_image(item)
            activity_url = get_url(item)
            start_date = get_published(item)
            end_date = None
            keyword = extract_keyword(activity_content)

            result.append(
                {
                    "activity_site": "IDEALIST",
                    "activity_type": activity_type,
                    "activity_content": activity_content,
                    "end_date": end_date,
                    "activity_image_url": activity_image_url,
                    "keyword": keyword,
                    "activity_name": activity_name,
                    "site_url": activity_url,
                    "start_date": start_date
                }
            )
            print(f"[IDEALIST] Crawled: {item.get('name', '')}")
        return result
    else:
        return None

def crawl():
    print("[IDEALIST] Starting crawl")
    crawled_activities = []
    last_timestamp = get_last_timestamp()

    if last_timestamp > 0:
        print(f"[IDEALIST] Crawling only data newer than the last activity in the DB (TIMESTAMP: {last_timestamp})")
    else:
        print("[IDEALIST] No activities in the DB, crawling all data")

    for type in ['volunteer', 'internship']:
        page = 0
        while True:
            activities = get_activities(page, last_timestamp, type)
            if not activities:
                break
            crawled_activities.extend(activities)
            page += 1

    if crawled_activities:
        print(f"[IDEALIST] Crawl finished: crawled {len(crawled_activities)} activities.")
        save_activities(crawled_activities)
    else:
        print("[IDEALIST] Crawl finished: no new activities.")

if __name__ == "__main__":
    crawl()
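The incremental fetch hinges on the numeric published filter that build_payload injects into the Algolia query: get_last_timestamp converts the newest stored start_date to Unix epoch seconds, and only hits published after that instant are returned. A standalone check of the generated filter, reusing only names defined in idealist_crawler.py (the import path assumes the repository root is on the Python path; the cutoff date is made up):

    # Quick demonstration of the incremental filter string.
    from datetime import datetime, timezone
    from crawler.idealist_crawler import build_payload

    last_seen = datetime(2025, 1, 1, tzinfo=timezone.utc)   # hypothetical last stored start_date
    ts = int(last_seen.timestamp())                          # 1735689600

    payload = build_payload(page=0, type='internship', timestamp=ts)
    print(payload["requests"][0]["filters"])
    # -> type:'INTERNSHIP' AND published > 1735689600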
