# dataset_parser.py — from oscarvik/Language-Modelling-CSE291-AS2 (master branch)
import json
import string

# Convert a whitespace-tokenised text corpus into fixed-width sequences of
# vocabulary ids and dump them as JSON.
#
# For every usable line of `filename` the script stores, under a running
# integer key:
#   "input"  - start id followed by the word ids, padded to `max_len`
#   "target" - the same ids shifted left by one with `eos` appended, padded
#   "length" - number of real (unpadded) ids in "input"
# Words missing from the vocabulary are mapped to the `unk` entry.

# Characters stripped from each line before tokenising.
exclude = set(string.punctuation)

filename = "../../CopyLanguage-Modelling-CSE291-AS2/data/brown/brown.train.txt"
out_file = "../../CopyLanguage-Modelling-CSE291-AS2/data/brown/brown.train.json"

vocab_file = '../../CopyLanguage-Modelling-CSE291-AS2/data/ptb.vocab.json'

# Word -> id table; the context manager closes the handle as soon as the
# JSON has been parsed instead of leaving it to the garbage collector.
with open(vocab_file, "r") as vocab_handle:
    vocab = json.load(vocab_handle)["w2i"]

unk = "<unk>"
# Special ids used when building the vectors.
# NOTE(review): 0 / 2 / 3 are presumably the padding / start-of-sentence /
# end-of-sentence entries of the vocabulary - confirm against ptb.vocab.json.
pad = 0
sos = 2
eos = 3

# Width of every stored vector. One slot is taken by the start id, so a
# sentence may hold at most `max_len - 1` words.
max_len = 60

skipped = 0      # lines dropped for being empty or too long
no_unknowns = 0  # number of out-of-vocabulary words encountered

data = {}
i = 0
with open(filename, "r") as file:
    for sentence in file:
        sentence = ''.join(ch for ch in sentence if ch not in exclude)

        # split() without an argument splits on any whitespace run and drops
        # empty strings. Compared with split(" ") this (a) strips the
        # trailing newline from the last word so it is no longer counted as
        # unknown, and (b) keeps repeated spaces from inflating the count.
        words = sentence.split()

        # Skip lines with no real words (blank or punctuation-only) and
        # lines that would not fit into `max_len` ids.
        if not words or len(words) > max_len - 1:
            skipped += 1
            continue

        input_vector = [sos]
        for word in words:
            word = word.lower()
            if word in vocab:
                input_vector.append(vocab[word])
            else:
                no_unknowns += 1
                input_vector.append(vocab[unk])

        # `length` is taken before padding so it reflects the real ids only.
        length = len(input_vector)
        target_vector = input_vector[1:] + [eos]

        # Input and target hold the same number of real ids, so both get
        # the same amount of padding.
        padding = [pad] * (max_len - length)
        input_vector += padding
        target_vector += padding

        data[i] = {
            "input": input_vector,
            "target": target_vector,
            "length": length
        }
        i += 1

print("Data parsed:")
print("\tNo unknowns:", no_unknowns)
print("\tNo sentences:", i)

# Closing the handle explicitly guarantees the JSON is flushed to disk.
with open(out_file, "w") as out_handle:
    json.dump(data, out_handle)