LanguageCapstone/caption_embeddings.py at master · GenuineIntelligenceTeam/LanguageCapstone · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import numpy as np
import pickle
from collections import defaultdict
import numpy as np
import time
import gensim
from gensim.models.keyedvectors import KeyedVectors
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt
# Location of the GloVe vectors file (presumably 50-dimensional, going by the
# file name -- confirm against the data actually shipped in ./data).
path = "./data/glove.6B.50d.txt.w2v"
# Loaded once at import time; binary=False means the file is read in the
# plain-text word2vec format. `glove` is indexed by word further down.
glove = KeyedVectors.load_word2vec_format(path, binary=False)
import re, string
from collections import Counter

# `punc_regex` (defined further down in this module) is a compiled regular
# expression that matches every punctuation character. It is compiled once at
# module level rather than inside `strip_punc`; otherwise the expression would
# be re-built every time the function is called.


def to_counter(doc):
    """
    Build the word-count of a document, ignoring punctuation and case.

    Punctuation is stripped with `strip_punc`, the text is lower-cased,
    and the result is split on whitespace before counting.

    Parameters
    ----------
    doc : str
        The raw text of one document.

    Returns
    -------
    collections.Counter
        lower-cased word -> number of occurrences in `doc`
    """
    cleaned = strip_punc(doc)
    tokens = cleaned.lower().split()
    return Counter(tokens)


def to_bag(counters, k=None, stop_words=None):
    """
    Merge per-document word collections into one alphabetically sorted
    vocabulary of the `k` most frequent words.

    Parameters
    ----------
    counters : Iterable[Iterable[str]]
        One collection of words (or word -> count mapping) per document.
        All of them are accumulated into a single overall tally.

    k : Optional[int]
        If specified, only the top-k words (by total count) are kept.
        If None, every word is kept.

    stop_words : Optional[Collection[str]]
        Words to discard from the tally before the top-k selection.

    Returns
    -------
    List[str]
        The selected words, sorted in ascending order.
    """
    totals = Counter()
    for doc_words in counters:
        totals.update(doc_words)

    if stop_words is not None:
        # only touch the stop words that actually made it into the tally
        for stop in set(stop_words).intersection(totals):
            del totals[stop]

    vocabulary = [term for term, _count in totals.most_common(k)]
    vocabulary.sort()
    return vocabulary


# Compiled once at import time: a character class matching any single
# character found in `string.punctuation`. `strip_punc` relies on it.
punc_regex = re.compile('[{}]'.format(re.escape(string.punctuation)))


# Read the stop-word list at import time. Each line of the file may hold
# several tab-separated entries; every entry is whitespace-stripped and
# appended to the flat `stops` list.
with open("./data/stopwords.txt", 'r') as r:
    stops = []
    for line in r:
        stops.extend(field.strip() for field in line.split('\t'))


def strip_punc(corpus):
    """
    Blank out all punctuation in a string.

    Every character matched by the module-level `punc_regex` is replaced
    by a single space (not deleted), so words that were separated only by
    punctuation stay separated.

    Parameters
    ----------
    corpus : str
        The text to clean.

    Returns
    -------
    str
        `corpus` with each punctuation character replaced by a space.
    """
    blanked = re.sub(punc_regex, ' ', corpus)
    return blanked


def to_idf(bag, counters):
    """
    Given the bag-of-words, and the word-counts for each document, computes
    the inverse document-frequency (IDF) for each term in the bag.

    Parameters
    ----------
    bag : Sequence[str]
        Ordered list of words that we care about

    counters : Iterable[collections.Counter]
        The word -> count mapping for each document. Any iterable is
        accepted, including a one-shot generator: it is traversed exactly
        once.

    Returns
    -------
    numpy.ndarray
        An array whose entries correspond to those in `bag`, storing
        the IDF for each term `t`:
                           log10(N / nt)
        Where `N` is the number of documents, and `nt` is the number of
        documents in which the term `t` occurs.

    Notes
    -----
    A term of `bag` that occurs in no document has `nt == 0`; the division
    is left to NumPy, which yields `inf` (or `nan` when there are no
    documents at all) together with a RuntimeWarning.
    """
    # Single pass over the documents: count the documents as we go (so that
    # `len()` is never required) and tally, per word, how many documents
    # contain it. `set(counter)` makes each word count once per document.
    doc_freq = Counter()
    N = 0
    for counter in counters:
        doc_freq.update(set(counter))
        N += 1

    # Counter returns 0 for words it has never seen, without inserting them.
    nt = np.array([doc_freq[t] for t in bag], dtype=float)
    return np.log10(N / nt)


def caption_to_word_embedding(captions, use_stop_words=False):
    """
    Embed each caption as the IDF-weighted sum of its words' GloVe vectors.

    Every caption is tokenised with `to_counter` (punctuation stripped,
    lower-cased, split on whitespace). The IDF of each word is computed
    over the given captions, and each caption's embedding is the sum over
    its word occurrences of `glove[word] * idf(word)`.

    Parameters
    ----------
    captions : Iterable[str]
        The captions to embed. The input is not modified.

    use_stop_words : bool
        If True, the words of the module-level `stops` list are excluded
        from the vocabulary and therefore contribute nothing to the
        embeddings. If False (default), no word is excluded on that basis.

    Returns
    -------
    numpy.ndarray, shape (number of captions, glove.vector_size)
        Row `i` is the embedding of the i-th caption. A caption with no
        usable word yields a row of zeros.

    Notes
    -----
    Words missing from the `glove` vocabulary are skipped rather than
    raising `KeyError`.
    """
    word_counts = [to_counter(doc) for doc in captions]
    bag = to_bag(word_counts, stop_words=stops if use_stop_words else None)
    word_index_dict = {word: index for index, word in enumerate(bag)}
    idf = to_idf(bag, word_counts)

    # Size the output from the loaded model instead of hard-coding 50.
    wordemb = np.zeros((len(word_counts), glove.vector_size))
    for i, counts in enumerate(word_counts):
        # Reuse the per-caption counters so the tokens match the vocabulary
        # exactly (same punctuation stripping, same lower-casing).
        for word, count in counts.items():
            index = word_index_dict.get(word)
            if index is None or word not in glove:
                # filtered stop word, or word unknown to the GloVe model
                continue
            # `count` occurrences each contribute glove[word] * idf
            wordemb[i] += count * idf[index] * glove[word]
    return wordemb