NLP-Project/features.py at master · NicholasKobald/NLP-Project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import warnings #couldn't find a way around this.
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

import numpy as np
import gensim

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from utils import get_tokenized_lemmas_without_stopwords, transform_text

from keras.preprocessing.text import text_to_word_sequence


def create_bow(articles, headlines, vocab, stop_words='english', binary=False):
    """Build a combined TF-IDF feature matrix for headline/article pairs.

    Two separate ``TfidfVectorizer`` instances (fixed ``vocab``, L2 norm,
    ``get_tokenized_lemmas_without_stopwords`` as tokenizer) are fitted, one
    on ``articles`` and one on ``headlines``. The dense results are stacked
    column-wise with the headline columns first and the article columns
    second.

    ``stop_words`` is accepted for interface compatibility but is not used
    anywhere in this function. ``binary`` is forwarded to both vectorizers.

    Side effect: prints the shapes of the article matrix, the headline
    matrix and the stacked matrix.
    """
    vectorizer_options = dict(
        vocabulary=vocab,
        norm='l2',
        tokenizer=get_tokenized_lemmas_without_stopwords,
        binary=binary,
    )
    article_vectorizer = TfidfVectorizer(**vectorizer_options)
    headline_vectorizer = TfidfVectorizer(**vectorizer_options)

    # Articles are vectorized first, then headlines.
    article_features = article_vectorizer.fit_transform(articles).toarray()
    headline_features = headline_vectorizer.fit_transform(headlines).toarray()

    # Column order of the result: headline features, then article features.
    # The stacked shape does not depend on the column order, so the single
    # stacked matrix serves both the shape printout and the return value.
    stacked = np.hstack((headline_features, article_features))

    print(article_features.shape, headline_features.shape, stacked.shape)

    return stacked


def get_w2v_idx(texts, w2v, max_len):
    """Apply ``transform_text`` to every text and collect the results.

    Each element of ``texts`` is passed to ``transform_text`` together with
    ``w2v.vocab`` and ``max_len``; the per-text results are returned as a
    single numpy array, in input order.

    NOTE(review): presumably ``transform_text`` maps a text to a
    fixed-length sequence of vocabulary indices -- confirm against utils.
    """
    transformed = []
    for text in texts:
        transformed.append(transform_text(text, w2v.vocab, max_len))
    return np.asarray(transformed)


def get_vectors(word_to_vec, text):
    """Look up the vector of every known token in ``text``.

    ``text`` is tokenized with keras' ``text_to_word_sequence``. Tokens
    that are not contained in ``word_to_vec`` are skipped; the vectors of
    the remaining tokens are returned as a numpy array, in token order.
    """
    found = []
    for token in text_to_word_sequence(text):
        if token in word_to_vec:
            found.append(word_to_vec[token])
    return np.asarray(found)


# Given a list of articles/headlines etc computes bigrams
# Returns list of dict, with each entry the bigram count for associated text
def compute_bigrams(text, max_texts=5):
    """Count word bigrams for each text in ``text``.

    Every text is split on single spaces and each pair of adjacent,
    non-empty tokens is counted under the key ``"<first> <second>"``.
    Pairs that involve an empty token (produced by consecutive, leading or
    trailing spaces) are skipped.

    Args:
        text: iterable of strings (articles, headlines, ...).
        max_texts: maximum number of texts to process. Defaults to 5,
            which matches the previously hard-coded limit; pass ``None``
            to process every text.

    Returns:
        A list with one dict per processed text, mapping each bigram to
        the number of times it occurs in that text.
    """
    bigrams = []
    for index, document in enumerate(text):
        if max_texts is not None and index >= max_texts:
            break
        tokens = document.split(' ')
        counts = {}
        for first, second in zip(tokens, tokens[1:]):
            # split(' ') yields '' (never ' ') for blank positions, so an
            # empty-string test is what actually filters them out.
            if not first or not second:
                continue
            word_pair = first + ' ' + second
            # dict.get works on Python 2 and 3 (dict.has_key is 2-only).
            counts[word_pair] = counts.get(word_pair, 0) + 1
        bigrams.append(counts)
    return bigrams


def test_word_to_vec_feature():
    """Smoke test: load word2vec and print vectors for a sample headline."""
    # NOTE(review): load_word2vec is neither defined nor imported in the
    # visible part of this file -- confirm where it is expected to come from.
    model = load_word2vec()
    print("Successfully loaded word2vec")

    sample_vectors = get_vectors(model, "This is a sample headline")
    print("Vecs", sample_vectors)
    print("Successfully got vectors")

# Run the word2vec smoke test only when this file is executed as a script.
if __name__ == "__main__":
    test_word_to_vec_feature()