-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMeanEmbeddingVectorizer.py
More file actions
33 lines (28 loc) · 1023 Bytes
/
MeanEmbeddingVectorizer.py
File metadata and controls
33 lines (28 loc) · 1023 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import Word2Vec_train
from collections import defaultdict
import pandas as pd
import numpy as np
class MeanEmbeddingVectorizer(object):
    """Turn tokenized documents into vectors by averaging word embeddings.

    Each document is represented by the arithmetic mean of the embedding
    vectors of those of its tokens that are present in the model vocabulary.

    Args:
        model_cbow: A trained model exposing a ``wv`` attribute that provides
            ``vector_size``, ``key_to_index`` and ``get_vector(word)``
            (the gensim 4 ``KeyedVectors`` interface).
    """

    def __init__(self, model_cbow):
        self.model_cbow = model_cbow
        self.vector_size = model_cbow.wv.vector_size

    def fit(self, docs=None, y=None):
        """Do nothing and return ``self``.

        The embeddings are already trained, so there is nothing to learn.
        ``docs`` and ``y`` are accepted (and ignored) so the object can be
        called as ``fit(X, y)`` as well as ``fit()``.
        """
        return self

    def transform(self, docs):
        """Return one averaged embedding row per document in ``docs``.

        Args:
            docs: Iterable of documents, each an iterable of tokens.

        Returns:
            A 2-D array with one row per document and ``vector_size`` columns.
        """
        return self.doc_average_list(docs)

    def doc_average(self, doc):
        """Return the mean embedding of the in-vocabulary tokens of ``doc``.

        Args:
            doc: Iterable of tokens.

        Returns:
            A 1-D array of length ``vector_size``; all zeros when none of the
            tokens is in the vocabulary (including an empty ``doc``).
        """
        wv = self.model_cbow.wv
        # key_to_index is a dict, so membership is O(1); testing against the
        # index_to_key list would scan the whole vocabulary for every token.
        vocab = wv.key_to_index
        vectors = [wv.get_vector(word) for word in doc if word in vocab]
        if not vectors:
            return np.zeros(self.vector_size)
        return np.array(vectors).mean(axis=0)

    def doc_average_list(self, docs):
        """Stack the averaged embeddings of ``docs`` into a 2-D array.

        Args:
            docs: Iterable of documents, each an iterable of tokens.

        Returns:
            An array of shape ``(number of documents, vector_size)``. With no
            documents this is an empty ``(0, vector_size)`` array.
        """
        rows = [self.doc_average(doc) for doc in docs]
        if not rows:
            # np.vstack raises ValueError on an empty sequence.
            return np.zeros((0, self.vector_size))
        return np.vstack(rows)