-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathlem_corpus.py
More file actions
71 lines (61 loc) · 2.25 KB
/
lem_corpus.py
File metadata and controls
71 lines (61 loc) · 2.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import nltk
# nltk.download("stopwords")
"""
Creates a corpus of data.
This program reads all paths in the data directory and all files in its subdirectories.
After that, the script normalizes and preprocesses all documents to create a corpus file.
The script takes ~40-60 min to complete.
"""
from pymystem3 import Mystem
from nltk.corpus import stopwords
from string import punctuation
from bs4 import BeautifulSoup
import os
import re
# Single Mystem instance, reused by preprocess_text() for lemmatization.
stem = Mystem()

# Tokens to drop: NLTK's Russian stop words plus punctuation/symbol tokens.
stop = set(stopwords.words("russian")) | {
    '.', ',', '"', "'", '?', '!', ':', ';',
    '(', ')', '[', ']', '{', '}',
    '#', '№', '*', '_', '\n',
}

# Input root directory and the output file that receives processed documents.
path = "data/ffs"
PREPARED_CORPUS_PATH = 'data/prepared_corpus.txt'

# Accumulators filled while the script runs:
#   PATHS        - entries found directly under ``path``
#   corpus       - one processed string per document
#   total_corpus - one token list per document
PATHS = []
corpus = []
total_corpus = []
def preprocess_text(input_text):
    """Normalize raw text into a space-separated string of lemmas.

    Steps:
      1. Replace every character that is not a Latin or Cyrillic letter
         with a space.
      2. Lower-case the text.
      3. Lemmatize it with the module-level ``stem`` (Mystem) instance.
      4. Drop tokens that are in ``stop``, are a single space, or whose
         stripped form is contained in ``string.punctuation`` (this also
         removes whitespace-only tokens, because the empty string is
         contained in every string).

    :param input_text: raw text of one document.
    :return: the remaining tokens joined with single spaces.
    """
    # 'ё'/'Ё' sit outside the а-я / А-Я code-point ranges, so they are listed
    # explicitly; otherwise words containing them would be split in two.
    param = re.sub('[^a-zA-Zа-яА-ЯёЁ]', ' ', input_text)
    # str.lower() returns a new string, so the result has to be rebound.
    param = param.lower()
    param = stem.lemmatize(param)
    param = [
        token for token in param
        if token not in stop and token != " " and token.strip() not in punctuation
    ]
    return " ".join(param)
# Collect the sub-directories of ``path``; each is scanned for documents below.
# Plain files are skipped: calling os.listdir() on one of them later would
# raise NotADirectoryError and abort the whole run.
for directories in os.listdir(path):
    candidate = os.path.join(path, directories)
    if os.path.isdir(candidate):
        PATHS.append(candidate)
# Parse every file under each directory in PATHS, preprocess its text and
# store the result in ``corpus`` / ``total_corpus`` and in PREPARED_CORPUS_PATH.

# Number of input files, used only for the progress percentage.
total_files = sum(len(os.listdir(dirname)) for dirname in PATHS)
count = 0
# Append mode: an existing output file is extended, not truncated.
# NOTE(review): files are opened with the platform default encoding - confirm
# that matches the encoding of the input documents.
with open(PREPARED_CORPUS_PATH, 'a') as f:
    for dirname in PATHS:
        for filename in os.listdir(dirname):
            file_path = os.path.join(dirname, filename)
            # Close each source file as soon as it has been parsed so that
            # handles do not pile up over thousands of documents.
            with open(file_path, 'r') as html_report_part1:
                soup = BeautifulSoup(html_report_part1, 'html.parser')
            text = preprocess_text(soup.get_text())
            # Keep only words longer than three characters.
            text = ' '.join(word for word in text.split() if len(word) > 3)
            corpus.append(text)
            total_corpus.append(text.split())
            print("File " + str(file_path) + " processed")
            # One document per line; without the newline the last word of a
            # document would fuse with the first word of the next one.
            f.write(text + "\n")
            count += 1
            print(str(100*count/total_files) + "%")
        print("Directory " + dirname + " processed")
PREPARED_PATH = 'data/prepared.txt'
# Write the in-memory corpus to PREPARED_PATH, one document per line,
# replacing any previous content. The context manager closes the file even
# if a write fails.
with open(PREPARED_PATH, 'w') as f:
    f.writelines(document + "\n" for document in corpus)
# Re-join every tokenized document with single spaces and append the result
# to ``corpus``. Tokens come from str.split(), so none of them is empty.
# NOTE(review): ``corpus`` already holds one string per document at this
# point, so this appears to add a second copy of each - confirm it is intended.
for tokens in total_corpus:
    corpus.append(" ".join(tokens))