-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocesstext.py
More file actions
30 lines (24 loc) · 833 Bytes
/
processtext.py
File metadata and controls
30 lines (24 loc) · 833 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import msgpack
import lzma
import numpy as np
def encode_numpy(obj):
    """Turn a NumPy array into a plain dict that msgpack can serialize.

    Suitable as the ``default=`` callback of ``msgpack.packb``. An
    ``np.ndarray`` is replaced by a dict holding its raw bytes
    (``"__ndarray__"``), its dtype rendered as a string (``"dtype"``) and
    its shape (``"shape"``). Any other object is handed back unchanged.
    """
    # Anything that is not an ndarray passes straight through.
    if not isinstance(obj, np.ndarray):
        return obj

    payload = {}
    payload["__ndarray__"] = obj.tobytes()
    payload["dtype"] = str(obj.dtype)
    payload["shape"] = obj.shape
    return payload
def decode_numpy(obj):
    """Rebuild a NumPy array from a dict carrying an ``"__ndarray__"`` key.

    When ``obj`` contains ``"__ndarray__"``, its value is interpreted as a
    raw buffer with the dtype given by ``obj["dtype"]`` and reshaped to
    ``obj["shape"]``. Otherwise ``obj`` is returned unchanged.

    NOTE(review): presumably intended as the ``object_hook`` for
    ``msgpack.unpackb`` -- confirm against the loading code.
    """
    # No array marker: nothing to decode.
    if "__ndarray__" not in obj:
        return obj

    flat = np.frombuffer(obj["__ndarray__"], dtype=obj["dtype"])
    return flat.reshape(obj["shape"])
# Build a token -> vector table from the GloVe text file, keeping only purely
# alphabetic tokens, then write the table out as xz-compressed msgpack.
embeddings = {}
with open("glove.6B.200d.txt", "r", encoding="utf-8") as f:
    for line in f:
        # The first space-separated field is the token; the remaining
        # fields are parsed as floats.
        vec = line.rstrip("\n").split(" ")
        word = vec[0]
        # Compare by value, not identity: `is` against a string literal
        # does not reliably match a string produced by split(), so the
        # exclusion would otherwise be silently skipped.
        if word.isalpha() and word != 'bulletinyyy':
            # Convert only the numeric fields and round to 2 decimals to
            # shrink the serialized output.
            embeddings[word] = np.round(np.array(vec[1:], dtype=float), 2)
print('saving')
with lzma.open("embeddings.msgpack.xz", "wb") as f:
    # encode_numpy converts each ndarray value into a msgpack-friendly dict.
    f.write(msgpack.packb(embeddings, default=encode_numpy))