-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocess.py
More file actions
42 lines (29 loc) · 1.2 KB
/
preprocess.py
File metadata and controls
42 lines (29 loc) · 1.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#! /usr/bin/python3
import xml.etree.ElementTree as ET
import mwparserfromhell
from nltk.stem.snowball import SnowballStemmer
# Convert every page of a MediaWiki XML export ('dump.xml') into two plain-text
# files under 'text/': '<title>.txt' (markup stripped) and
# '<title>-stemmed.txt' (same text with each space-separated token stemmed).

# English Snowball stemmer. Per the NLTK documentation, ignore_stopwords=True
# returns stop words unchanged instead of stemming them.
stemmer = SnowballStemmer("english", ignore_stopwords=True)

# The whole export is parsed into memory at once.
tree = ET.parse('dump.xml')
root = tree.getroot()

# XML namespace prefix that qualifies every element name looked up below.
ns = '{http://www.mediawiki.org/xml/export-0.10/}'

# Headings of the sections that should not appear in the output text.
sectionsToRemove = ['References', 'Further Reading', 'See also', 'External Links']
# Headings are matched case-insensitively so that e.g. "Further reading" and
# "External links" are caught as well as the capitalised spellings above.
_sectionsToRemoveFolded = {name.casefold() for name in sectionsToRemove}

for article in root.findall(f'./{ns}page'):
    title = article.find(f'{ns}title').text  # Type: str
    raw_body = article.find(f'{ns}revision/{ns}text').text
    wikicode = mwparserfromhell.parse(raw_body)

    # Pass 1: collect the unwanted sections. Nothing is removed here so the
    # sequence being iterated is never mutated mid-loop.
    unwanted = []
    for section in wikicode.get_sections():
        headings = section.filter_headings()
        if not headings:
            # A section without any heading (such as text that precedes the
            # first heading) has no title to compare.
            continue
        # The raw title keeps the whitespace written around it in the markup
        # ("== References ==" -> " References "), so normalise before comparing.
        sectionTitle = str(headings[0].title).strip()
        if sectionTitle.casefold() in _sectionsToRemoveFolded:
            unwanted.append(section)

    # Pass 2: remove the sections from the parsed document itself. Dropping
    # them from the list returned by get_sections() would leave `wikicode`
    # untouched, and strip_code() below would still emit their text.
    for section in unwanted:
        try:
            wikicode.remove(section)
        except ValueError:
            # NOTE(review): presumably this section was nested inside another
            # removed section and is already gone - confirm against the
            # installed mwparserfromhell version.
            continue

    # remove slashes from titles so the title is usable as a single file name
    title = title.replace('/', '_')
    stripped = wikicode.strip_code()

    # Explicit UTF-8: article text routinely contains non-ASCII characters and
    # the platform default encoding may not be able to represent them.
    with open(f'text/{title}.txt', 'w', encoding='utf-8') as myFile:
        myFile.write(stripped)

    # Tokens are split on single spaces only, so other whitespace (newlines,
    # tabs) stays attached to its neighbouring token and is preserved.
    stemmed = ' '.join([stemmer.stem(word) for word in stripped.split(' ')])
    with open(f'text/{title}-stemmed.txt', 'w', encoding='utf-8') as myFile:
        myFile.write(stemmed)