-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_lemas.py
More file actions
77 lines (61 loc) · 2.38 KB
/
extract_lemas.py
File metadata and controls
77 lines (61 loc) · 2.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
from os import listdir
from os.path import isfile, join
import xml.etree.ElementTree as ET
import ipdb
from lemmatizer.ESlemmatizer import ESLemmatizer
import random
import re
# Compile a pattern matching characters outside the Basic Multilingual Plane.
# Wide (UCS-4) interpreter builds accept the direct \U range; narrow (UCS-2)
# builds reject it with re.error, so those fall back to matching the
# equivalent surrogate pair.
try:
    # UCS-4
    regex = re.compile('[\U00010000-\U0010ffff]')
except re.error:
    # UCS-2
    regex = re.compile('[\uD800-\uDBFF][\uDC00-\uDFFF]')


def clean_utf8(rawdata):
    """Return *rawdata* with every astral-plane character replaced by a space."""
    return regex.sub(' ', rawdata)
def chunks(l, n):
    """Yield successive n-sized chunks from l; the final chunk may be shorter."""
    start = 0
    total = len(l)
    while start < total:
        yield l[start:start + n]
        start += n
# --- Lemmatizer configuration -------------------------------------------
# Remote NLP annotation service used by ESLemmatizer for lemmatization.
lemmas_server = 'http://hator00.tsc.uc3m.es:6100/nlp/annotations/'
# Spanish (Snowball) stopword list passed to the lemmatizer.
stw_file = './lemmatizer/lemafiles/stopwords/ESstopwords_SNOWBALL.txt'
# Equivalence dictionary; empty string disables it here.
dict_eq_file = ''
# Parts of speech to keep. NOTE(review): this is a single string, presumably
# spliced verbatim into a server-side query/JSON by ESLemmatizer — confirm
# against the ESLemmatizer implementation.
POS = '"NOUN", "VERB", "ADJECTIVE"'
# Number of parallel requests used by lemmatizeBatch (see processes= below).
concurrent_posts = 10
# Strip numeric tokens from the lemmatized output.
removenumbers = True
# Preserve sentence boundaries in the lemmatized output.
keepSentence = True
#Initialize lemmatizer
ESLM = ESLemmatizer(lemmas_server=lemmas_server, stw_file=stw_file,
dict_eq_file=dict_eq_file, POS=POS, removenumbers=removenumbers,
keepSentence=keepSentence)
# ---------------------------------------------------------------------------
# Main driver: lemmatize every BOE law XML document that does not yet have a
# lemmas file, in batches of 1000, writing one <doc_id>.txt per document.
# (Indentation reconstructed: the scraped source had lost all leading
# whitespace and was not valid Python as shown.)
# ---------------------------------------------------------------------------
xml_dir = './data_Law_BOE/XML'
LEMAS_dir = './data_Law_BOE/LEMAS'

xml_files = [f for f in listdir(xml_dir) if isfile(join(xml_dir, f))]
# Resume support: skip documents whose lemmas file already exists.
already_lematized = [f for f in listdir(LEMAS_dir) if isfile(join(LEMAS_dir, f))]
already_lematized = set([f.split('.txt')[0] for f in already_lematized])
xml_files = [f for f in xml_files if f.split('.xml')[0] not in already_lematized]

cont = 0
for chk in chunks(xml_files, 1000):
    print('Procesados', cont, 'de', len(xml_files))
    # Count the actual chunk size: the last chunk may hold fewer than 1000
    # files, so a fixed "+= 1000" would over-report progress.
    cont += len(chk)
    to_lemmatize = []
    for f in chk:
        tree = ET.parse(join(xml_dir, f))
        root = tree.getroot()
        all_text = []
        # Collect the text of every <p> paragraph inside the <texto> element.
        for el in root:
            if el.tag == 'texto':
                for parrafo in el:
                    if parrafo.tag == 'p':
                        if parrafo.text:
                            all_text.append(parrafo.text.strip())
        if all_text:
            # [document id, full text with astral characters blanked out]
            to_lemmatize.append([f.split('.xml')[0], clean_utf8(' '.join(all_text))])
    random.shuffle(to_lemmatize)
    lemasBatch = ESLM.lemmatizeBatch(to_lemmatize, processes=concurrent_posts)
    # Remove entries that were not lemmatized correctly (empty result).
    lemasBatch = [[el[0], clean_utf8(el[1])] for el in lemasBatch if len(el[1])]
    print('Lematizados', len(lemasBatch), 'de', len(to_lemmatize), 'documentos')
    # One UTF-8 text file of lemmas per successfully processed document.
    for el in lemasBatch:
        with open(join(LEMAS_dir, el[0] + '.txt'), 'w', encoding='utf8') as fout:
            fout.write(el[1])