punctuator2/main2.py at master · jorpro/punctuator2 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# coding: utf-8
from __future__ import division

from collections import OrderedDict
from time import time

import models
import data

import theano
import sys
import os.path

import theano.tensor as T
import numpy as np

from main import get_minibatch

# Exclusive upper bound on the epoch index used by the training loop.
MAX_EPOCHS = 50
# Minibatch size passed to the model constructor/loader and to get_minibatch.
MINIBATCH_SIZE = 128
# Weight of the net.L2_sqr penalty added to the training cost; with 0.0 the
# penalty term contributes nothing.
L2_REG = 0.0
# When the global gradient norm reaches or exceeds this value, every gradient
# is rescaled so that the global norm equals it.
CLIPPING_THRESHOLD = 2.0
# Length of the trailing window of validation perplexities that must still
# contain the best value for training to continue after a non-improving epoch.
PATIENCE_EPOCHS = 1

"""
Second stage training
"""

def parse_args(argv):
    """Parse the four required positional command-line arguments.

    Args:
        argv: Argument list laid out like ``sys.argv`` (program name first).

    Returns:
        Tuple ``(model_name, num_hidden, learning_rate,
        stage1_model_file_name)`` where ``num_hidden`` is an ``int`` and
        ``learning_rate`` is a ``float``.

    Raises:
        SystemExit: If an argument is missing, or if a numeric argument
            cannot be converted.
    """
    # (label used in the messages, converter or None to keep the raw string)
    specs = (
        ("Model name", None),
        ("Hidden layer size", int),
        ("Learning rate", float),
        ("Stage 1 model path", None),
    )
    values = []
    for position, (label, convert) in enumerate(specs, 1):
        if len(argv) <= position:
            sys.exit("'%s' argument missing!" % label)
        raw = argv[position]
        if convert is None:
            values.append(raw)
            continue
        try:
            values.append(convert(raw))
        except ValueError:
            # Report a malformed number as a usage error, not a traceback.
            sys.exit("'%s' argument is invalid: %r" % (label, raw))
    return tuple(values)


def perplexity(total_neg_log_likelihood, num_labels):
    """Return ``exp(total_neg_log_likelihood / num_labels)``.

    Args:
        total_neg_log_likelihood: Summed cost over all processed minibatches.
        num_labels: Number of output labels the cost was summed over.

    Raises:
        ValueError: If ``num_labels`` is zero (no minibatch was processed),
            instead of failing with a bare ``ZeroDivisionError``.
    """
    if num_labels == 0:
        raise ValueError("no output labels were processed; perplexity is undefined")
    return np.exp(total_neg_log_likelihood / num_labels)


def clipped_adagrad_updates(cost, params, gsums, lr):
    """Build the Adagrad update dictionary with global-norm gradient clipping.

    Adagrad: "Adaptive subgradient methods for online learning and stochastic
    optimization" (2011).

    Args:
        cost: Symbolic scalar cost to differentiate.
        params: Shared variables to optimise.
        gsums: Shared accumulators of squared gradients, one per parameter.
        lr: Symbolic scalar learning rate.

    Returns:
        ``OrderedDict`` mapping every accumulator and every parameter to its
        update expression.
    """
    gparams = T.grad(cost, params)

    # Global norm over the gradients of all parameters.
    norm = T.sqrt(T.sum(
               [T.sum(gparam ** 2) for gparam in gparams]
           ))

    updates = OrderedDict()
    for gparam, param, gsum in zip(gparams, params, gsums):
        # Rescale to CLIPPING_THRESHOLD once the global norm reaches it.
        gparam = T.switch(
            T.ge(norm, CLIPPING_THRESHOLD),
            gparam / norm * CLIPPING_THRESHOLD,
            gparam
        )
        updates[gsum] = gsum + (gparam ** 2)
        # The step uses the already-updated accumulator; 1e-6 keeps the
        # square root away from zero.
        updates[param] = param - lr * (gparam / (T.sqrt(updates[gsum] + 1e-6)))
    return updates


if __name__ == "__main__":

    model_name, num_hidden, learning_rate, stage1_model_file_name = parse_args(sys.argv)

    model_file_name = "Model_stage2_%s_h%d_lr%s.pcl" % (model_name, num_hidden, learning_rate)

    print("%s %s %s" % (num_hidden, learning_rate, model_file_name))

    word_vocabulary = data.read_vocabulary(data.WORD_VOCAB_FILE)
    punctuation_vocabulary = data.iterable_to_dict(data.PUNCTUATION_VOCABULARY)

    # Symbolic inputs: x and y are integer matrices, p is a float matrix and
    # lr is the scalar learning rate fed to every training step.
    x = T.imatrix('x')
    y = T.imatrix('y')
    p = T.matrix('p')
    lr = T.scalar('lr')

    if os.path.isfile(model_file_name):
        print("Loading previous model state")

        net, state = models.load(model_file_name, MINIBATCH_SIZE, x, p)
        # The checkpointed learning rate replaces the command-line value.
        # NOTE(review): checkpoints store the index of the epoch that was just
        # completed and the loop below starts at the restored value -- confirm
        # whether that epoch index is meant to be repeated after a resume.
        gsums, learning_rate, validation_ppl_history, starting_epoch, rng = state
        best_ppl = min(validation_ppl_history)

    else:
        # Seed numpy's module-level generator so fresh runs are repeatable.
        rng = np.random
        rng.seed(1)

        print("Building model...")
        net = models.GRUstage2(
            rng=rng,
            x=x,
            minibatch_size=MINIBATCH_SIZE,
            n_hidden=num_hidden,
            x_vocabulary=word_vocabulary,
            y_vocabulary=punctuation_vocabulary,
            stage1_model_file_name=stage1_model_file_name,
            p=p
            )

        starting_epoch = 0
        best_ppl = np.inf
        validation_ppl_history = []

        # One zero-initialised squared-gradient accumulator per parameter.
        gsums = [theano.shared(np.zeros_like(param.get_value(borrow=True))) for param in net.params]

    cost = net.cost(y) + L2_REG * net.L2_sqr
    updates = clipped_adagrad_updates(cost, net.params, gsums, lr)

    train_model = theano.function(
        inputs=[x, p, y, lr],
        outputs=cost,
        updates=updates
    )

    # Validation reports the plain cost, without the L2 term.
    validate_model = theano.function(
        inputs=[x, p, y],
        outputs=net.cost(y)
    )

    print("Training...")
    for epoch in range(starting_epoch, MAX_EPOCHS):
        t0 = time()
        total_neg_log_likelihood = 0
        total_num_output_samples = 0
        train_batches = get_minibatch(data.TRAIN_FILE2, MINIBATCH_SIZE, shuffle=True, with_pauses=True)
        for iteration, (X, Y, P) in enumerate(train_batches, 1):
            total_neg_log_likelihood += train_model(X, P, Y, learning_rate)
            total_num_output_samples += np.prod(Y.shape)
            if iteration % 100 == 0:
                # Running training perplexity and throughput in labels/second.
                sys.stdout.write("PPL: %.4f; Speed: %.2f sps\n" % (
                    perplexity(total_neg_log_likelihood, total_num_output_samples),
                    total_num_output_samples / max(time() - t0, 1e-100)))
                sys.stdout.flush()
        print("Total number of training labels: %d" % total_num_output_samples)

        total_neg_log_likelihood = 0
        total_num_output_samples = 0
        for X, Y, P in get_minibatch(data.DEV_FILE2, MINIBATCH_SIZE, shuffle=False, with_pauses=True):
            total_neg_log_likelihood += validate_model(X, P, Y)
            total_num_output_samples += np.prod(Y.shape)
        print("Total number of validation labels: %d" % total_num_output_samples)

        # Raises ValueError with a clear message if no validation label was read.
        ppl = perplexity(total_neg_log_likelihood, total_num_output_samples)
        validation_ppl_history.append(ppl)

        print("Validation perplexity is %s" % np.round(ppl, 4))

        if ppl <= best_ppl:
            # New best (or tie): checkpoint the model and the optimiser state.
            best_ppl = ppl
            net.save(model_file_name, gsums=gsums, learning_rate=learning_rate, validation_ppl_history=validation_ppl_history, best_validation_ppl=best_ppl, epoch=epoch, random_state=rng.get_state())
        elif best_ppl not in validation_ppl_history[-PATIENCE_EPOCHS:]:
            # The best value is no longer among the last PATIENCE_EPOCHS
            # validation results: stop early.
            print("Finished!")
            break