-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTfidfEmbeddingVectorizer.py
More file actions
48 lines (38 loc) · 1.4 KB
/
TfidfEmbeddingVectorizer.py
File metadata and controls
48 lines (38 loc) · 1.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import numpy as np
class TfidfEmbeddingVectorizer(object):
    """Embed tokenized documents as the mean of IDF-weighted word vectors.

    Each document is a sequence of string tokens.  Tokens that are present
    in the embedding model's vocabulary are looked up, scaled by their IDF
    weight (learned in :meth:`fit`) and averaged into one vector per
    document.  Tokens unknown to the embedding model are ignored.
    """

    def __init__(self, model_cbow):
        """Store the embedding model and read its vector dimensionality.

        Args:
            model_cbow: Embedding model exposing a ``wv`` attribute that
                provides ``vector_size``, ``key_to_index`` and
                ``get_vector(word)``.
        """
        self.model_cbow = model_cbow
        # Populated by fit(): maps word -> IDF weight.
        self.word_idf_weight = None
        self.vector_size = model_cbow.wv.vector_size

    def fit(self, docs):
        """Learn per-word IDF weights from ``docs``.

        Args:
            docs: Iterable of documents, each an iterable of string tokens.

        Returns:
            ``self``, so calls can be chained.
        """
        # TfidfVectorizer expects raw strings, so tokens are re-joined.
        # NOTE(review): the vectorizer then re-tokenizes the joined text with
        # its own defaults, so any token it alters or drops will not be found
        # by its original spelling and falls back to max_idf in doc_average()
        # -- confirm this is intended.
        text_docs = [" ".join(doc) for doc in docs]

        tfidf = TfidfVectorizer()
        tfidf.fit(text_docs)

        # If a word was never seen during fit, it is given the largest known
        # IDF value (i.e. it is treated as at least as rare as the rarest
        # known word).
        max_idf = tfidf.idf_.max()
        self.word_idf_weight = defaultdict(
            lambda: max_idf,
            [(word, tfidf.idf_[i]) for word, i in tfidf.vocabulary_.items()],
        )
        return self

    def transform(self, docs):
        """Return one embedding row per document in ``docs``.

        Args:
            docs: Iterable of documents, each an iterable of string tokens.

        Returns:
            2-D ``numpy.ndarray`` with one row per document and
            ``self.vector_size`` columns.
        """
        return self.doc_average_list(docs)

    def doc_average(self, doc):
        """Average the IDF-weighted vectors of the known words in ``doc``.

        Args:
            doc: Iterable of string tokens.

        Returns:
            1-D ``numpy.ndarray`` of length ``self.vector_size``; all zeros
            when no token of ``doc`` is in the embedding vocabulary.
        """
        wv = self.model_cbow.wv
        # key_to_index is a dict, so membership is O(1); the index_to_key
        # list would cost a linear scan of the vocabulary for every token.
        known_words = wv.key_to_index
        idf_weights = self.word_idf_weight

        weighted = [
            wv.get_vector(word) * idf_weights[word]
            for word in doc
            if word in known_words
        ]
        if not weighted:
            return np.zeros(self.vector_size)
        return np.array(weighted).mean(axis=0)

    def doc_average_list(self, docs):
        """Stack the per-document averages of ``docs`` into a matrix.

        Args:
            docs: Iterable of documents, each an iterable of string tokens.

        Returns:
            2-D ``numpy.ndarray`` of shape ``(number of docs,
            self.vector_size)``; an empty ``(0, self.vector_size)`` array
            when ``docs`` is empty.
        """
        rows = [self.doc_average(doc) for doc in docs]
        if not rows:
            # np.vstack raises ValueError on an empty sequence.
            return np.zeros((0, self.vector_size))
        return np.vstack(rows)