-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathapp.py
More file actions
189 lines (160 loc) · 5.75 KB
/
app.py
File metadata and controls
189 lines (160 loc) · 5.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# coding: utf-8
# In[1]:
from flask import Flask,jsonify,request
from numpy.linalg import norm
app = Flask(__name__)
##Form doc vectors from Glove word vectors.
import numpy as np
import gensim
from nltk.tokenize import word_tokenize
from gensim.parsing.preprocessing import strip_punctuation,remove_stopwords
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib
import re
def tag_visible(element):
    """Return True when *element* is a text node a browser would render.

    Rejects text living inside non-visible containers (scripts, styles,
    head, title, meta, the document root) and HTML comments.
    """
    invisible_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    if element.parent.name in invisible_parents:
        return False
    return not isinstance(element, Comment)
def text_from_html(body):
    """Extract the human-visible text from an HTML document.

    body: raw HTML (str or bytes).
    Returns one space-joined string of the stripped visible text nodes;
    script/style/comment text is excluded via tag_visible.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # find_all(string=True) is the supported spelling; findAll(text=True)
    # is deprecated in bs4 and the `text` argument is removed in newer
    # releases.
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)
def process_data(text_array):
    """Normalise each sentence in place: strip punctuation, remove
    stop-words, lower-case.

    text_array: list of sentence strings; it is mutated and also
    returned for convenience.
    """
    for idx in range(len(text_array)):
        cleaned = strip_punctuation(text_array[idx])
        cleaned = remove_stopwords(cleaned)
        text_array[idx] = cleaned.lower()
    return text_array
def load_word_vectors(fpath):
    """Load GloVe-style word vectors from a plain-text file.

    Each line of the file is: <word> <float> <float> ...

    fpath: path to the vector file.
    Returns a dict mapping word (str) -> 1-D numpy float array.
    """
    word_vectors = {}
    # `with` guarantees the handle is closed even if a line fails to
    # parse (the original leaked it on exception); iterating the handle
    # streams lines instead of materialising them all via readlines().
    with open(fpath, 'r') as f:
        for line in f:
            # split once per line (the original split each line twice)
            parts = line.split()
            word_vectors[parts[0]] = np.array([float(s) for s in parts[1:]])
    return word_vectors
def form_doc_vectors(sen_list, word_vectors):
    """Build one fixed-length document vector per sentence.

    Each sentence vector is the concatenation of the element-wise min
    and max over the word vectors of the sentence's known words.

    sen_list: list of preprocessed sentences (str).
    word_vectors: dict word -> 1-D numpy array (all the same length).
    Returns a list of Python lists, each 2x the word-vector dimension.
    Raises ValueError (from numpy) if a sentence contains no known
    word, since min/max of an empty stack is undefined.
    """
    print("Creating doc vecs")
    sen_vecs = []
    for sen in sen_list:
        # `w in word_vectors` instead of `w in word_vectors.keys()`:
        # identical O(1) membership test, idiomatic form.
        vec = np.array([word_vectors[w] for w in sen.split() if w in word_vectors])
        min_vec = np.min(vec, axis=0)
        max_vec = np.max(vec, axis=0)
        # concatenate min and max horizontally -> one flat doc vector
        sen_vecs.append(np.hstack((min_vec, max_vec)).tolist())
    return sen_vecs
def form_query_vec(query, word_vectors):
    """Vectorise a query string the same way form_doc_vectors does.

    query: whitespace-separated query string.
    word_vectors: dict word -> 1-D numpy array.
    Returns the concatenated [min, max] vector as a Python list, or the
    empty string '' (sentinel preserved for existing callers) when the
    query contains an out-of-vocabulary word or no words at all.
    """
    words = query.split()
    try:
        vec = np.array([word_vectors[w] for w in words])
        min_vec = np.min(vec, axis=0)
        max_vec = np.max(vec, axis=0)
        query_vec = np.hstack((min_vec, max_vec)).tolist()
    except (KeyError, ValueError):
        # KeyError: unknown word. ValueError: empty query -- np.min/max
        # reject a zero-size array; the original only caught KeyError,
        # so an empty query crashed the caller.
        query_vec = ''
    return query_vec
def find_dists(query_vec, sen_vecs):
    """Score every sentence vector against the query vector.

    query_vec: 1-D numpy array.
    sen_vecs: 2-D numpy array (n_sentences x dim).
    Returns (indices, scores) as two tuples, sorted by dot-product
    score in ascending order.

    NOTE(review): a dot product is a similarity (bigger = closer), so
    ascending order puts the LEAST similar sentence first -- confirm
    this is the intended ranking before relying on it.
    """
    dists = np.dot(query_vec, sen_vecs.T)
    print(dists)
    # In Python 3 zip() returns a lazy iterator with no .sort() method;
    # the original `zipped.sort(...)` raised AttributeError. sorted()
    # works on any iterable.
    ranked = sorted(zip(range(len(dists)), dists), key=lambda pair: pair[1])
    min_indices, min_dists = list(zip(*ranked))
    return min_indices, min_dists
def docvecs(dump):
    """Train a Doc2Vec model over the visible sentences of an HTML dump.

    dump: raw HTML (str).
    Returns (model, text_array, vec, original_text):
      model         -- the trained gensim Doc2Vec model,
      text_array    -- the preprocessed sentences used for training,
      vec           -- one inferred vector (list of floats) per sentence,
      original_text -- the matching unprocessed sentences.
    Side effects: rewrites ./temp.txt and saves the model to ./model.bin.
    """
    dump = text_from_html(dump)
    original_text = dump.split('.')
    # strip non-ASCII characters before sentence-splitting
    text = re.sub(r'[^\x00-\x7F]+','', dump)
    text_array2 = text.split('.')
    # keep only non-empty sentences longer than 5 words; reuse the same
    # indices on original_text so the two lists stay aligned
    text_inds = [i for i in range(len(text_array2)) if text_array2[i] != '' and len(text_array2[i].split()) > 5]
    text_array = [text_array2[i] for i in text_inds]
    original_text = [original_text[i] for i in text_inds]
    text_array = process_data(text_array)
    # Write the training corpus: one sentence per line, overwriting any
    # previous corpus. The original opened in append mode and wrote no
    # newlines, so TaggedLineDocument saw a single ever-growing line
    # instead of one document per sentence, misaligning vec/text_array.
    with open("temp.txt", 'w') as f:
        for line in text_array:
            f.write(line + '\n')
    #doc2vec parameters
    vector_size = 100
    window_size = 15
    min_count = 1
    sampling_threshold = 1e-5
    negative_size = 5
    train_epoch = 1
    dm = 0 #0 = dbow; 1 = dmpv
    worker_count = 1 #number of parallel processes
    # pretrained word embeddings
    # NOTE(review): `pretrained_emb` is not a parameter of vanilla
    # gensim Doc2Vec -- it exists only in forked builds. Confirm the
    # installed gensim accepts it (and the old size=/iter= names).
    pretrained_emb = "./Glove/glove.6B.100d.txt" #None if use without pretrained embeddings
    #input corpus
    train_corpus = "temp.txt"
    #output model
    saved_path = "model.bin"
    #enable logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    #train doc2vec model
    docs = g.doc2vec.TaggedLineDocument(train_corpus)
    model = g.Doc2Vec(docs, size=vector_size, window=window_size, min_count=min_count, sample=sampling_threshold, workers=worker_count, hs=0, dm=dm, negative=negative_size, dbow_words=1, dm_concat=1, pretrained_emb=pretrained_emb, iter=train_epoch)
    #save model
    model.save(saved_path)
    #Convert each sentence into a vector
    vec = []
    print(len(text_array))
    for line in text_array:
        # infer_vector expects a token list; passing the raw string made
        # gensim treat every single character as a word.
        vec.append(model.infer_vector(line.split()).tolist())
    return model, text_array, vec, original_text
# Pre-load the GloVe vectors once at import time (used by the
# form_*_vec helpers); this blocks startup until the file is read.
word_vectors = load_word_vectors('./Glove/glove.6B.100d.txt')
@app.route('/')
def index():
    """Trivial liveness endpoint for the service root."""
    greeting = "Hello, World!"
    return greeting
from flask import abort
import gensim.models as g
import logging

# Request-scoped state shared between the /indexing and /searching
# routes. (The original assigned `model = None` twice -- once before
# and once after these imports; a single initialisation suffices.)
model = None
global_sen_vecs = None
original_text = None
@app.route('/senvec/indexing/', methods=['POST'])
def indexing():
    """POST {'dump': <html>}: (re)build the doc2vec index.

    Drops the module-level model/vector state, trains a fresh model on
    the supplied HTML dump via docvecs(), and stores the results in the
    globals that /senvec/searching/ reads. Responds with an empty body;
    aborts with 400 when the JSON payload is missing or has no 'dump'.
    """
    global model
    global global_sen_vecs
    global original_text
    # discard any previously built index before training a new one
    model = None
    global_sen_vecs = None
    print("creating word vectors")
    if not request.json or 'dump' not in request.json:
        abort(400)
    dump = request.json['dump']
    model, text_array, global_sen_vecs, original_text = docvecs(dump)
    print("Text array created")
    return ""
@app.route('/senvec/searching/', methods=['POST'])
def searching():
    """POST {'query': <text>}: rank indexed sentences against a query.

    Requires /senvec/indexing/ to have been called first (otherwise
    `model` is still None and this raises). Aborts with 400 when the
    JSON payload is missing or has no 'query'. Returns the original
    sentences in the order produced by find_dists.
    """
    global model
    global global_sen_vecs
    if not request.json or 'query' not in request.json:
        abort(400)
    query = request.json['query']
    print(query)
    # infer_vector expects a token list; passing the raw string made
    # gensim treat every single character as a word.
    query_vec = model.infer_vector(query.split())
    #Find distances and return top IDs
    min_indices, min_dists = find_dists(np.array(query_vec), np.array(global_sen_vecs))
    orig = [original_text[i] for i in min_indices]
    return jsonify({'original_text': orig})
if __name__ == '__main__':
    # NOTE(review): debug=True combined with host 0.0.0.0 exposes the
    # interactive Werkzeug debugger to the whole network -- disable
    # debug for any non-local deployment.
    app.run(host='0.0.0.0', port=3000,debug=True)