ir/tfidf_model.py at main · bashish101/ir · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import numpy as np
from collections import Counter

class TFIDF():
    def __init__(self,
                data):
        super(TFIDF, self).__init__()
        self.data = data
        self.N = len(data)

        self.tf_dict = {}
        self.df_dict = {}
        self.idf_dict = {}

        # Create data structures with tfidf for all documents
        self.create_tf_dict()
        self.create_df_dict()
        self.create_idf_dict()
        self.create_tfidf_dict()

    def create_df_dict(self):
        '''Creates dictionary of df values for each word'''
        self.df_dict = {}
        for doc_text in self.data:
            for word in set(doc_text):
                self.df_dict[word] = self.df_dict.get(word, 0) + 1
        return self.df_dict

    def create_idf_dict(self):
        '''Creates dictionary of idf values for each word'''
        self.idf_dict = {}
        maximum = 0
        delta = 0.5
        for word, df in self.df_dict.items():
            idf = np.log((self.N + delta) / (df + delta))
            self.idf_dict[word] = idf
        return self.idf_dict

    def create_tf_dict(self):
        self.tf_dict = {}
        for doc_num, doc_text in enumerate(self.data):
            self.tf_dict[doc_num] = {}
            if (len(doc_text) == 0):
                continue
            word_freq_list = Counter(doc_text).most_common()
            # max normalization
            max_freq = word_freq_list[0][1]
            for word, freq in word_freq_list:
                self.tf_dict[doc_num][word] = freq / max_freq

        return self.tf_dict

    def create_tfidf_dict(self):
        '''Creates dictionary of tfidf vectors for each document in data'''
        self.tfidf_dict = {}
        for doc_num, tf_per_doc in self.tf_dict.items():
            self.tfidf_dict[doc_num] = {}
            for word, tf in tf_per_doc.items():
                self.tfidf_dict[doc_num][word] = tf * self.idf_dict.get(word, 0)

        return self.tfidf_dict

    def compute_query_tfidf(self, query):
        '''Computes tfidf vector representation for a given (tokenized) query'''
        query_vec =  {}
        word_freq_list = Counter(query).most_common()
        max_freq = word_freq_list[0][1]
        for word, freq in word_freq_list:
            # Get idf from pre-computed data structure
            idf = self.idf_dict.get(word, 0)
            # Compute tf for query (similar to that for a document)
            tf = (0.5 + 0.5 * freq / max_freq)
            query_vec[word] = tf * idf
        return query_vec

    def match(self, query_vec, doc_vec):
        '''Computes cosine similarity score (range [-1, 1]) between two vectors'''
        q_norm = np.linalg.norm([score for score in query_vec.values()])
        d_norm = np.linalg.norm([score for score in doc_vec.values()])
        score = 0
        for term, val in query_vec.items():
            score += val * doc_vec.get(term, 0)
        score /= q_norm * d_norm
        return score