unsupervised/run.py at master · devurbandictionary/unsupervised · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import numpy as np
import matplotlib.pyplot as plt

# Visualise word vectors: read words and their vectors from the model.vec file
# generated by fasttext, run a Singular Value Decomposition on the vectors and
# draw every word at the position given by the first two columns of U.

import codecs  # To open the file in a specific mode / encoding

# Number of leading lines of model.vec that are ignored.
# Usually skip the first 10 words because they might be garbage values.
SKIP_FIRST = 10


def load_vectors(path='model.vec', skip=SKIP_FIRST):
    """Read words and their vector representations from a ``.vec`` text file.

    Every line is split on its first space into a word and the remainder;
    the remainder is converted from a space-separated string to a float32
    numpy array.

    Args:
        path: Path of the file to read; it is decoded as UTF-8.
        skip: Number of leading lines to drop before collecting anything.

    Returns:
        A ``(words, vecs)`` tuple of two parallel lists, in file order:
        the words and their float32 numpy arrays.
    """
    words = []
    vecs = []
    with codecs.open(path, 'r', 'utf-8') as f_in:
        for line_no, line in enumerate(f_in):
            if line_no < skip:
                continue
            word, _, values = line.strip().partition(' ')
            if not values:
                # Blank line or a word without a vector (e.g. a trailing
                # empty line): nothing to plot, so skip it instead of failing.
                continue
            words.append(word)
            # np.fromstring takes string values and converts them to float32
            # with space as the separator.
            vecs.append(np.fromstring(values, dtype='float32', sep=' '))
    return words, vecs


def plot_words(words, coords, out_path='viz.jpg'):
    """Draw each word at its 2-D position and save the figure.

    Args:
        words: Sequence of labels to draw.
        coords: Row-indexable array; row ``i`` supplies the x (column 0) and
            y (column 1) position of ``words[i]``.
        out_path: File name passed to ``plt.savefig``.
    """
    # Figure size and axis limits do not depend on the word being drawn, so
    # they are set once rather than on every loop iteration.
    fig = plt.gcf()
    fig.set_size_inches(18.5, 10.5)
    plt.xlim((-0.5, 0.5))
    plt.ylim((-0.5, 0.5))

    for word, point in zip(words, coords):
        plt.text(point[0], point[1], word)

    plt.savefig(out_path)


def main():
    """Load model.vec, decompose the vectors and write the plot to viz.jpg.

    Raises:
        ValueError: If model.vec yields no vectors after the skipped lines.
    """
    words, vecs = load_vectors('model.vec')
    if not words:
        raise ValueError(
            'model.vec contains no word vectors after the first '
            '%d lines' % SKIP_FIRST)

    # Carrying out Singular Value Decomposition; only U is needed for plotting.
    U, _, _ = np.linalg.svd(np.asarray(vecs), full_matrices=False)

    # Plotting words and their vector representations
    plot_words(words, U, 'viz.jpg')


if __name__ == '__main__':
    main()