-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathapp.py
More file actions
189 lines (160 loc) · 5.75 KB
/
app.py
File metadata and controls
189 lines (160 loc) · 5.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# coding: utf-8
# In[1]:
from flask import Flask,jsonify,request
from numpy.linalg import norm
app = Flask(__name__)
##Form doc vectors from Glove word vectors.
import numpy as np
import gensim
from nltk.tokenize import word_tokenize
from gensim.parsing.preprocessing import strip_punctuation,remove_stopwords
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib
import re
def tag_visible(element):
    """Return True when *element* is a text node a browser would render.

    Rejects text living inside non-visible containers (scripts, styles,
    head, title, meta, the document root) and HTML comments.
    """
    invisible_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    if element.parent.name in invisible_parents:
        return False
    return not isinstance(element, Comment)
def text_from_html(body):
    """Extract the human-visible text from an HTML document.

    body: raw HTML (str or bytes).
    Returns one space-joined string of the stripped visible text nodes;
    script/style/comment text is excluded via tag_visible.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # find_all(string=True) is the supported spelling; findAll(text=True)
    # is deprecated in bs4 and the `text` argument is removed in newer
    # releases.
    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)
def process_data(text_array):
    """Normalise each sentence in place: strip punctuation, remove
    stop-words, lower-case.

    text_array: list of sentence strings; it is mutated and also
    returned for convenience.
    """
    for idx in range(len(text_array)):
        cleaned = strip_punctuation(text_array[idx])
        cleaned = remove_stopwords(cleaned)
        text_array[idx] = cleaned.lower()
    return text_array
def load_word_vectors(fpath):
    """Load GloVe-style word vectors from a plain-text file.

    Each line of the file is: <word> <float> <float> ...

    fpath: path to the vector file.
    Returns a dict mapping word (str) -> 1-D numpy float array.
    """
    word_vectors = {}
    # `with` guarantees the handle is closed even if a line fails to
    # parse (the original leaked it on exception); iterating the handle
    # streams lines instead of materialising them all via readlines().
    with open(fpath, 'r') as f:
        for line in f:
            # split once per line (the original split each line twice)
            parts = line.split()
            word_vectors[parts[0]] = np.array([float(s) for s in parts[1:]])
    return word_vectors
def form_doc_vectors(sen_list, word_vectors):
    """Build one fixed-length document vector per sentence.

    Each sentence vector is the concatenation of the element-wise min
    and max over the word vectors of the sentence's known words.

    sen_list: list of preprocessed sentences (str).
    word_vectors: dict word -> 1-D numpy array (all the same length).
    Returns a list of Python lists, each 2x the word-vector dimension.
    Raises ValueError (from numpy) if a sentence contains no known
    word, since min/max of an empty stack is undefined.
    """
    print("Creating doc vecs")
    sen_vecs = []
    for sen in sen_list:
        # `w in word_vectors` instead of `w in word_vectors.keys()`:
        # identical O(1) membership test, idiomatic form.
        vec = np.array([word_vectors[w] for w in sen.split() if w in word_vectors])
        min_vec = np.min(vec, axis=0)
        max_vec = np.max(vec, axis=0)
        # concatenate min and max horizontally -> one flat doc vector
        sen_vecs.append(np.hstack((min_vec, max_vec)).tolist())
    return sen_vecs
def form_query_vec(query, word_vectors):
    """Vectorise a query string the same way form_doc_vectors does.

    query: whitespace-separated query string.
    word_vectors: dict word -> 1-D numpy array.
    Returns the concatenated [min, max] vector as a Python list, or the
    empty string '' (sentinel preserved for existing callers) when the
    query contains an out-of-vocabulary word or no words at all.
    """
    words = query.split()
    try:
        vec = np.array([word_vectors[w] for w in words])
        min_vec = np.min(vec, axis=0)
        max_vec = np.max(vec, axis=0)
        query_vec = np.hstack((min_vec, max_vec)).tolist()
    except (KeyError, ValueError):
        # KeyError: unknown word. ValueError: empty query -- np.min/max
        # reject a zero-size array; the original only caught KeyError,
        # so an empty query crashed the caller.
        query_vec = ''
    return query_vec
def find_dists(query_vec, sen_vecs):
    """Score every sentence vector against the query vector.

    query_vec: 1-D numpy array.
    sen_vecs: 2-D numpy array (n_sentences x dim).
    Returns (indices, scores) as two tuples, sorted by dot-product
    score in ascending order.

    NOTE(review): a dot product is a similarity (bigger = closer), so
    ascending order puts the LEAST similar sentence first -- confirm
    this is the intended ranking before relying on it.
    """
    dists = np.dot(query_vec, sen_vecs.T)
    print(dists)
    # In Python 3 zip() returns a lazy iterator with no .sort() method;
    # the original `zipped.sort(...)` raised AttributeError. sorted()
    # works on any iterable.
    ranked = sorted(zip(range(len(dists)), dists), key=lambda pair: pair[1])
    min_indices, min_dists = list(zip(*ranked))
    return min_indices, min_dists
def docvecs(dump):
    """Train a Doc2Vec model over the visible sentences of an HTML dump.

    dump: raw HTML (str).
    Returns (model, text_array, vec, original_text):
      model         -- the trained gensim Doc2Vec model,
      text_array    -- the preprocessed sentences used for training,
      vec           -- one inferred vector (list of floats) per sentence,
      original_text -- the matching unprocessed sentences.
    Side effects: rewrites ./temp.txt and saves the model to ./model.bin.
    """
    dump = text_from_html(dump)
    original_text = dump.split('.')
    # strip non-ASCII characters before sentence-splitting
    text = re.sub(r'[^\x00-\x7F]+','', dump)
    text_array2 = text.split('.')
    # keep only non-empty sentences longer than 5 words; reuse the same
    # indices on original_text so the two lists stay aligned
    text_inds = [i for i in range(len(text_array2)) if text_array2[i] != '' and len(text_array2[i].split()) > 5]
    text_array = [text_array2[i] for i in text_inds]
    original_text = [original_text[i] for i in text_inds]
    text_array = process_data(text_array)
    # Write the training corpus: one sentence per line, overwriting any
    # previous corpus. The original opened in append mode and wrote no
    # newlines, so TaggedLineDocument saw a single ever-growing line
    # instead of one document per sentence, misaligning vec/text_array.
    with open("temp.txt", 'w') as f:
        for line in text_array:
            f.write(line + '\n')
    #doc2vec parameters
    vector_size = 100
    window_size = 15
    min_count = 1
    sampling_threshold = 1e-5
    negative_size = 5
    train_epoch = 1
    dm = 0 #0 = dbow; 1 = dmpv
    worker_count = 1 #number of parallel processes
    # pretrained word embeddings
    # NOTE(review): `pretrained_emb` is not a parameter of vanilla
    # gensim Doc2Vec -- it exists only in forked builds. Confirm the
    # installed gensim accepts it (and the old size=/iter= names).
    pretrained_emb = "./Glove/glove.6B.100d.txt" #None if use without pretrained embeddings
    #input corpus
    train_corpus = "temp.txt"
    #output model
    saved_path = "model.bin"
    #enable logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    #train doc2vec model
    docs = g.doc2vec.TaggedLineDocument(train_corpus)
    model = g.Doc2Vec(docs, size=vector_size, window=window_size, min_count=min_count, sample=sampling_threshold, workers=worker_count, hs=0, dm=dm, negative=negative_size, dbow_words=1, dm_concat=1, pretrained_emb=pretrained_emb, iter=train_epoch)
    #save model
    model.save(saved_path)
    #Convert each sentence into a vector
    vec = []
    print(len(text_array))
    for line in text_array:
        # infer_vector expects a token list; passing the raw string made
        # gensim treat every single character as a word.
        vec.append(model.infer_vector(line.split()).tolist())
    return model, text_array, vec, original_text
# Pre-load the GloVe vectors once at import time (used by the
# form_*_vec helpers); this blocks startup until the file is read.
word_vectors = load_word_vectors('./Glove/glove.6B.100d.txt')
@app.route('/')
def index():
    """Trivial liveness endpoint for the service root."""
    greeting = "Hello, World!"
    return greeting
from flask import abort
import gensim.models as g
import logging

# Request-scoped state shared between the /indexing and /searching
# routes. (The original assigned `model = None` twice -- once before
# and once after these imports; a single initialisation suffices.)
model = None
global_sen_vecs = None
original_text = None
@app.route('/senvec/indexing/', methods=['POST'])
def indexing():
    """POST {'dump': <html>}: (re)build the doc2vec index.

    Drops the module-level model/vector state, trains a fresh model on
    the supplied HTML dump via docvecs(), and stores the results in the
    globals that /senvec/searching/ reads. Responds with an empty body;
    aborts with 400 when the JSON payload is missing or has no 'dump'.
    """
    global model
    global global_sen_vecs
    global original_text
    # discard any previously built index before training a new one
    model = None
    global_sen_vecs = None
    print("creating word vectors")
    if not request.json or 'dump' not in request.json:
        abort(400)
    dump = request.json['dump']
    model, text_array, global_sen_vecs, original_text = docvecs(dump)
    print("Text array created")
    return ""
@app.route('/senvec/searching/', methods=['POST'])
def searching():
    """POST {'query': <text>}: rank indexed sentences against a query.

    Requires /senvec/indexing/ to have been called first (otherwise
    `model` is still None and this raises). Aborts with 400 when the
    JSON payload is missing or has no 'query'. Returns the original
    sentences in the order produced by find_dists.
    """
    global model
    global global_sen_vecs
    if not request.json or 'query' not in request.json:
        abort(400)
    query = request.json['query']
    print(query)
    # infer_vector expects a token list; passing the raw string made
    # gensim treat every single character as a word.
    query_vec = model.infer_vector(query.split())
    #Find distances and return top IDs
    min_indices, min_dists = find_dists(np.array(query_vec), np.array(global_sen_vecs))
    orig = [original_text[i] for i in min_indices]
    return jsonify({'original_text': orig})
if __name__ == '__main__':
    # NOTE(review): debug=True combined with host 0.0.0.0 exposes the
    # interactive Werkzeug debugger to the whole network -- disable
    # debug for any non-local deployment.
    app.run(host='0.0.0.0', port=3000,debug=True)