spiegel_scraper.py

import re
from datetime import datetime, timedelta
from functools import reduce
from traceback import format_exc

from bs4 import BeautifulSoup, Comment
from pyspark.sql import Row, SparkSession

from common import download
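# `common` is a project-local helper module that is not part of this file.
# From its use below, download(url) is assumed to fetch a URL and return the
# response body as a string, or None when the request fails. A hypothetical
# minimal stand-in, should the module be unavailable, could look like:
#
#   import requests
#
#   def download(url):
#       try:
#           response = requests.get(url, timeout=30)
#           response.raise_for_status()
#           return response.text
#       except requests.RequestException:
#           return None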

min_date = datetime(2000, 1, 1)  # scrape the archive from this date onwards
base_url = 'http://www.spiegel.de'
archive_url_template = base_url + '/nachrichtenarchiv/artikel-{}.html'
executor_count = 64   # number of local Spark worker threads
sample_fraction = 1   # fraction of dates to process; lower it for test runs
# Keyword filter for refugee-related coverage. Plain substring matching also
# catches inflections and compounds such as 'Flüchtlinge' or 'Flüchtlingspolitik'.
refugee_words = ['Flüchtling', 'Asylant', 'Asylbewerber', 'Asylsuchender', 'Heimatvertriebener', 'Migrant']


def build_archive_url(date):
    # Archive pages are addressed by date in DD.MM.YYYY format.
    return archive_url_template.format(date.strftime('%d.%m.%Y'))
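# For example, the archive page for 1 September 2015:
#   build_archive_url(datetime(2015, 9, 1))
#   -> 'http://www.spiegel.de/nachrichtenarchiv/artikel-01.09.2015.html'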


def generate_dates(min_date):
    # One entry per day, from min_date up to but not including today.
    delta = datetime.today() - min_date
    return [min_date + timedelta(days=n) for n in range(delta.days)]
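# For example, generate_dates(datetime(2000, 1, 1)) yields datetime(2000, 1, 1),
# datetime(2000, 1, 2), and so on, ending the day before the script is run.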


def extract_article_urls(url):
    # Collect all article links from one archive page, turning relative hrefs
    # into absolute URLs.
    html = download(url)
    if html is None:
        # No exception is active on this branch, so report the failed download
        # directly instead of calling format_exc().
        print('extraction of urls from {} failed (download returned None)'.format(url))
        return []
    return [a_tag['href'] if a_tag['href'].startswith('http') else base_url + a_tag['href']
            for a_tag in BeautifulSoup(html, 'lxml').select('#content-main .column-wide ul li a')]


def extract_text(nodes):
    # Concatenate the text of a list of nodes. The empty-string initializer is
    # required: without it, reduce would start from the first node object
    # itself rather than from its text. (Currently unused.)
    return reduce(lambda agg, cur: agg + cur.getText(), nodes, '')


def extract_article_content(url):
    # Download an article page and return its visible text as one
    # whitespace-normalised string, or None if download or parsing fails.
    html = download(url)
    try:
        if html is not None:
            soup = BeautifulSoup(html, 'lxml')
            content_main = soup.select_one('#content-main')
            # Boilerplate to strip before extracting text: social-media
            # widgets, asset boxes, recommendations, scripts and the footer.
            selectors_to_remove = ['.article-function-social-media',
                                   '.article-icon.spiegelplus',
                                   '.article-function-box',
                                   'script',
                                   'style',
                                   '#js-article-column > p',
                                   '#js-article-top-wide-asset',
                                   '.asset-box',
                                   '.article-copyright',
                                   '.article-function-box-wide',
                                   '.top-anchor',
                                   '.module-box',
                                   '.spiegel-asset-box',
                                   '#spRecommendations',
                                   '#js-video-slider',
                                   '.column-both-bottom',
                                   '#footer']
            for selector in selectors_to_remove:
                for node in content_main.select(selector):
                    node.decompose()
            # HTML comments survive as text nodes; drop them explicitly.
            for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
                comment.extract()
            # Join the remaining text nodes and collapse every whitespace run
            # (\s already covers newlines and tabs) into a single space.
            return re.sub(r'\s+', ' ', ' '.join(content_main.findAll(text=True)))
    except Exception:
        print('extraction of {} failed ({})'.format(url, format_exc()))
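# A minimal smoke test (the path is hypothetical; assumes network access and
# the spiegel.de markup that the selectors above were written against):
#   text = extract_article_content(base_url + '/politik/irgendein-artikel.html')
#   print(text[:200] if text else 'extraction failed')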


if __name__ == '__main__':
    # A local Spark session with executor_count worker threads.
    spark = SparkSession.builder.appName('spiegel_scraper').master('local[{}]'.format(executor_count)).getOrCreate()
    dates = generate_dates(min_date)
    # Parenthesized chain instead of backslash continuations, so that each
    # stage can carry a comment.
    (spark
     .sparkContext
     .parallelize(dates)
     # Optionally thin out the date range (sample_fraction = 1 keeps all dates).
     .sample(fraction=sample_fraction, withReplacement=False)
     # One archive page per date, then one row per article link found on it.
     .map(lambda date: Row(date=date, archive_url=build_archive_url(date)))
     .flatMap(lambda r: [Row(date=r.date, article_url=url) for url in extract_article_urls(r.archive_url)])
     # Keep only on-site article pages; video pages carry no article text.
     .filter(lambda r: 'spiegel.de' in r.article_url)
     .filter(lambda r: 'spiegel.de/video' not in r.article_url)
     # Spread the per-article downloads across many partitions.
     .repartition(512)
     .map(lambda r: Row(date=r.date, article_url=r.article_url, article=extract_article_content(r.article_url)))
     # Drop failed extractions and keep only refugee-related articles.
     .filter(lambda r: r.article is not None)
     .filter(lambda r: any(w in r.article for w in refugee_words))
     .toDF()
     .write
     .format('csv')
     .mode('overwrite')
     .option('header', 'true')
     .save('spiegel-articles-csv'))
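# The session above uses a local[...] master, so the job can be started with
# plain `python spiegel_scraper.py` (given pyspark is installed) or handed to
# a launcher via `spark-submit spiegel_scraper.py`. Spark writes the result as
# a directory of partitioned CSV files named spiegel-articles-csv.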