-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpredict.py
More file actions
78 lines (56 loc) · 2.09 KB
/
predict.py
File metadata and controls
78 lines (56 loc) · 2.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import pickle
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import re
import wordninja
import pandas as pd
def load_model(model_path):
    '''
    Load a pickled model object from disk.

    Input: string (path to the pickle file)
    Output: Model object (whatever object was pickled at that path)

    NOTE: unpickling can execute arbitrary code, so model_path must only
    ever point at a file from a trusted source.
    '''
    # The context manager closes the file handle even if unpickling fails;
    # the previous bare open() left it for the garbage collector.
    with open(model_path, 'rb') as model_file:
        return pickle.load(model_file)
def split_joint_text(tokens):
    '''
    This function is for splitting words like 'wickedmachine' to ['wicked', 'machine']

    Input: list of string tokens
    Output: the same list object, updated in place. Tokens that were not
            split keep their relative order; the pieces of every split
            token are appended after them.

    Pieces of length 1 are discarded, and a token is only replaced when at
    least two pieces remain after that filtering.
    '''
    # Language model is built from the 'text.txt.gz' file (path is resolved
    # relative to the current working directory).
    lm = wordninja.LanguageModel('text.txt.gz')

    # Results are collected in separate lists instead of calling
    # remove()/extend() on `tokens` while looping over it: mutating the list
    # during iteration made the loop skip the token that followed each
    # split word.
    kept_tokens = []
    split_pieces = []
    for each_token in tokens:
        pieces = [piece for piece in lm.split(each_token) if len(piece) > 1]
        if len(pieces) > 1:
            split_pieces.extend(pieces)
        else:
            kept_tokens.append(each_token)

    # Slice assignment keeps the in-place contract: callers holding a
    # reference to the list they passed in still see the updated content.
    tokens[:] = kept_tokens + split_pieces
    return tokens
def preprocess(input_sentence):
    '''
    Turn a raw sentence into a cleaned, space-joined string of tokens.

    Input: string
    Output: string

    Steps, in order: lower-case, replace every character outside a-z / 0-9
    with a space, tokenize, stem, split joint words, drop English stop
    words, drop tokens shorter than 3 characters.
    '''
    tokenizer_obj = TreebankWordTokenizer()
    stemmer_obj = PorterStemmer()

    ## Removing punctuations and special characters
    clean_text = re.sub(r"[^a-z0-9]", ' ', input_sentence.lower())

    ## Tokenizing and stemming text
    tokens = [stemmer_obj.stem(x) for x in tokenizer_obj.tokenize(clean_text)]

    ## Splitting joint words (For eg. partsfor, forcaptive)
    tokens = split_joint_text(tokens)

    ## Removing stop words and all tokens of length less than 3
    # A set gives constant-time membership tests; the stop-word list was
    # previously scanned linearly once per token.
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [
        word for word in tokens
        if word not in stop_words and len(word) > 2
    ]
    return ' '.join(filtered_tokens)
def predict(input_string):
    '''
    Classify a piece of text with the model stored in 'model.pickle'.

    Input: string
    Output: dict with two keys:
            'success' - True only when a prediction was made
            'label'   - the first element of model.predict(...), else None

    No prediction is made (success False, label None) when the input is
    empty or None, or when the preprocessed text is 3 characters or shorter.
    '''
    # Truthiness check covers both '' and None; len(None) used to raise.
    if not input_string:
        return {'success': False, 'label': None}

    clean_string = preprocess(input_string)
    if len(clean_string) <= 3:
        return {'success': False, 'label': None}

    # The model is loaded only once we know there is something to classify,
    # so rejected inputs no longer pay for unpickling it.
    model = load_model('model.pickle')

    # The cleaned text is wrapped in a one-element Series before being
    # handed to the model; [0] picks the prediction for that single row.
    label = model.predict(pd.Series(clean_string))[0]
    return {'success': True, 'label': label}
if __name__ == '__main__':
    # Local import: sys is only needed when the module is run as a script.
    import sys

    # `input_string` was never defined at module level, so running the
    # script raised NameError. The text to classify is now taken from the
    # command-line arguments, e.g.:  python predict.py some text to classify
    input_string = ' '.join(sys.argv[1:])
    print(predict(input_string))