# lib_process.py
# -*- coding: utf-8 -*-
import gzip
import pickle
import string

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from unidecode import unidecode
# Map ASCII punctuation plus common CJK/typographic punctuation to spaces.
trantab = {ord(ch): ' ' for ch in string.punctuation}
for ch in "–—。,、)(·!】【`‘’":
    trantab[ord(ch)] = ' '

# Stopword filtering: keep a few informative words that NLTK treats as
# stopwords, and additionally drop some low-content tokens (including
# spaCy's legacy '-PRON-' lemma placeholder).
whitelist = set(['win', 'won', 'most', 'biggest', 'largest', 'fastest'])
blacklist = set(['give', 'also', ' ', 'and', 'of', 'in', 'list', '-PRON-', 'does', 'any'])
stop = set(stopwords.words('english'))
filter_list = (stop | blacklist) - whitelist

stemmer = SnowballStemmer('english')
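
# A quick sketch of what the setup above produces (illustrative input only):
# >>> 'Hello, world! (test)'.translate(trantab)
# 'Hello  world   test '
# >>> 'most' in filter_list, 'of' in filter_list
# (False, True)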

def findOneDBEntry(conn, condition_field, value, result_field):
    """Return result_field from the first document matching {condition_field: value}, or None."""
    item = conn.find_one({condition_field: value})
    if item is None:
        return None
    return item[result_field]


def findAllDBEntry(conn, condition_field, value):
    """Return a cursor over all documents matching {condition_field: value}."""
    # pymongo's find() always returns a cursor (possibly empty), never None.
    return conn.find({condition_field: value})
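
# Illustrative usage of the DB helpers, assuming a pymongo collection; the
# client, database, and field names below are hypothetical, not defined here.
# >>> from pymongo import MongoClient
# >>> coll = MongoClient()['mydb']['articles']
# >>> findOneDBEntry(coll, 'title', 'Berlin', 'abstract')
# >>> list(findAllDBEntry(coll, 'lang', 'en'))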

def save_zipped_pickle(obj, filename, protocol=-1):
    """Pickle obj into a gzip-compressed file (protocol -1 = highest available)."""
    with gzip.open(filename, 'wb') as f:
        pickle.dump(obj, f, protocol)


def load_zipped_pickle(filename):
    """Load an object from a gzip-compressed pickle file."""
    with gzip.open(filename, 'rb') as f:
        return pickle.load(f)


def save_obj(obj, filename):
    """Pickle obj into a plain file using the highest protocol."""
    with open(filename, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(filename):
    """Load an object from a plain pickle file."""
    with open(filename, 'rb') as f:
        return pickle.load(f)
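
# A minimal round trip through the pickle helpers; the filenames are
# illustrative and not used elsewhere in this module.
# >>> save_zipped_pickle({'a': 1}, 'demo.pkl.gz')
# >>> load_zipped_pickle('demo.pkl.gz')
# {'a': 1}
# >>> save_obj([1, 2, 3], 'demo.pkl')
# >>> load_obj('demo.pkl')
# [1, 2, 3]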

def remove_stopwords(line, separator=' '):
    """Drop every token of line that appears in filter_list."""
    line = line.strip()
    if len(line) == 0:
        return ''
    words = line.split(separator)
    return separator.join(word for word in words if word not in filter_list)


def cleanSentence(line, isLower=True, separator=' '):
    """Replace punctuation with spaces, optionally lowercase, and collapse whitespace."""
    if len(line) == 0:
        return ''
    line = line.translate(trantab)
    if isLower:
        line = line.lower()
    return separator.join(line.split())

def stemSentence(line, stemmer=None, isCleanNeeded=True):
    """Snowball-stem every token of line, cleaning it first if requested."""
    if isCleanNeeded:
        line = cleanSentence(line, True)
    if stemmer is None:
        stemmer = SnowballStemmer('english')
    words = line.split(' ')
    return ' '.join(stemmer.stem(word) for word in words)
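
# Expected behavior of the cleaning steps, as a sketch (exact output depends
# on the installed NLTK stopword list and stemmer version):
# >>> cleanSentence('Which is the Biggest (city)?')
# 'which is the biggest city'
# >>> remove_stopwords('which is the biggest city')
# 'biggest city'
# >>> stemSentence('Which is the Biggest (city)?')
# 'which is the biggest citi'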

def cleanAccentedCharacter(line):
    """Transliterate accented characters to their closest ASCII equivalents."""
    return unidecode(line)

def superCleanSentence(line):
    """Full pipeline: strip punctuation, lowercase, remove stopwords, stem."""
    # line = cleanAccentedCharacter(line)  # optional accent folding
    line = line.translate(trantab).lower()
    stemlist = [stemmer.stem(word) for word in line.split() if word not in filter_list]
    return ' '.join(stemlist)
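
# The one-shot pipeline, as a sketch: 'list', 'of', and 'the' are filtered,
# 'largest' is whitelisted, and 'cities' stems to 'citi'.
# >>> superCleanSentence('List of the largest cities!')
# 'largest citi'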

def superCleanSentence_lemma(line):
    """Like superCleanSentence, but lemmatize with spaCy instead of stemming."""
    import spacy  # imported lazily so the module works without spaCy installed
    # Loading a model on every call is expensive; cache spacy_nlp if this runs
    # in a loop. spaCy 3 dropped the 'en' shortcut, so the installed model
    # name (e.g. en_core_web_sm) is required here.
    spacy_nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    doc = spacy_nlp(line)
    temp_str = ' '.join(token.lemma_ for token in doc if token.lemma_ not in filter_list)
    return ' '.join(temp_str.translate(trantab).split())
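
# The lemma variant needs a spaCy English model installed, e.g.:
#   python -m spacy download en_core_web_sm
# Sketch (exact lemmas depend on the model): 'the' and 'be' are stopwords and
# 'list' is blacklisted, so only 'car' survives.
# >>> superCleanSentence_lemma('The cars were listed')
# 'car'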