"""
This file contains implementations of the various methods
used to train stuff.
"""
import theano
from theano import tensor
from util import numpy_floatX


def sgd(lr, tparams, grads, x_c, x_w, mask, wmask, y, cost):
    """ Stochastic Gradient Descent

    :note: A more complicated version of SGD than strictly needed; it is
        written this way so that its interface matches adadelta and rmsprop.
    """
    # New set of shared variables that will contain the gradients
    # for a mini-batch.
    gshared = [theano.shared(p.get_value() * 0., name='%s_grad' % k)
               for k, p in tparams.items()]
    gsup = [(gs, g) for gs, g in zip(gshared, grads)]

    # Function that computes the gradients for a mini-batch but does not
    # update the weights.
    f_grad_shared = theano.function([x_c, x_w, mask, wmask, y], cost,
                                    updates=gsup,
                                    name='sgd_f_grad_shared')

    pup = [(p, p - lr * g) for p, g in zip(tparams.values(), gshared)]

    # Function that updates the weights from the previously computed
    # gradients.
    f_update = theano.function([lr], [], updates=pup,
                               name='sgd_f_update')

    return f_grad_shared, f_update
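
# A minimal usage sketch for the pair of functions returned above. The
# symbolic variables and `tparams` (an OrderedDict of theano.shared
# parameters) are assumed to come from the model-building code, and the
# *_val names are hypothetical numpy arrays:
#
#   grads = tensor.grad(cost, wrt=list(tparams.values()))
#   f_grad_shared, f_update = sgd(lr, tparams, grads,
#                                 x_c, x_w, mask, wmask, y, cost)
#   batch_cost = f_grad_shared(x_c_val, x_w_val, mask_val, wmask_val, y_val)
#   f_update(0.01)  # applies p <- p - lr * g with lr = 0.01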


def adadelta(lr, tparams, grads, x_c, mask, y_mask, y, cost):
    """
    An adaptive learning rate optimizer

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams : Theano SharedVariable
        Model parameters
    grads : Theano variable
        Gradients of cost w.r.t. parameters
    x_c : Theano variable
        Model inputs
    mask : Theano variable
        Sequence mask
    y_mask : Theano variable
        Target mask
    y : Theano variable
        Targets
    cost : Theano variable
        Objective function to minimize

    Notes
    -----
    For more information, see [ADADELTA]_.

    .. [ADADELTA] Matthew D. Zeiler, *ADADELTA: An Adaptive Learning
       Rate Method*, arXiv:1212.5701.
    """
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_up2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                 name='%s_rup2' % k)
                   for k, p in tparams.items()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function([x_c, mask, y_mask, y], cost,
                                    updates=zgup + rg2up,
                                    name='adadelta_f_grad_shared',
                                    on_unused_input='warn')

    updir = [-tensor.sqrt(ru2 + 1e-6) / tensor.sqrt(rg2 + 1e-6) * zg
             for zg, ru2, rg2 in zip(zipped_grads,
                                     running_up2,
                                     running_grads2)]
    ru2up = [(ru2, 0.95 * ru2 + 0.05 * (ud ** 2))
             for ru2, ud in zip(running_up2, updir)]
    param_up = [(p, p + ud) for p, ud in zip(tparams.values(), updir)]

    f_update = theano.function([lr], [], updates=ru2up + param_up,
                               on_unused_input='ignore',
                               name='adadelta_f_update')

    return f_grad_shared, f_update
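
# For reference, the updates above implement Zeiler's Algorithm 1 with
# rho = 0.95 and eps = 1e-6:
#
#   E[g^2]_t  = rho * E[g^2]_{t-1} + (1 - rho) * g_t^2
#   dx_t      = -sqrt(E[dx^2]_{t-1} + eps) / sqrt(E[g^2]_t + eps) * g_t
#   E[dx^2]_t = rho * E[dx^2]_{t-1} + (1 - rho) * dx_t^2
#   x_t       = x_{t-1} + dx_t
#
# Note that `lr` is never used: adadelta derives its effective step size
# from the two running averages, which is why `f_update` is compiled with
# on_unused_input='ignore'.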


def rmsprop(lr, tparams, grads, x_c, x_w, mask, wmask, y, cost):
    """
    A variant of SGD that scales the step size by a running average of
    recent gradient magnitudes.

    Parameters
    ----------
    lr : Theano SharedVariable
        Initial learning rate
    tparams : Theano SharedVariable
        Model parameters
    grads : Theano variable
        Gradients of cost w.r.t. parameters
    x_c, x_w : Theano variables
        Model inputs
    mask, wmask : Theano variables
        Sequence masks
    y : Theano variable
        Targets
    cost : Theano variable
        Objective function to minimize

    Notes
    -----
    For more information, see [Hint2014]_.

    .. [Hint2014] Geoff Hinton, *Neural Networks for Machine Learning*,
       lecture 6a,
       http://cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
    """
    zipped_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                  name='%s_grad' % k)
                    for k, p in tparams.items()]
    running_grads = [theano.shared(p.get_value() * numpy_floatX(0.),
                                   name='%s_rgrad' % k)
                     for k, p in tparams.items()]
    running_grads2 = [theano.shared(p.get_value() * numpy_floatX(0.),
                                    name='%s_rgrad2' % k)
                      for k, p in tparams.items()]

    zgup = [(zg, g) for zg, g in zip(zipped_grads, grads)]
    rgup = [(rg, 0.95 * rg + 0.05 * g) for rg, g in zip(running_grads, grads)]
    rg2up = [(rg2, 0.95 * rg2 + 0.05 * (g ** 2))
             for rg2, g in zip(running_grads2, grads)]

    f_grad_shared = theano.function([x_c, x_w, mask, wmask, y], cost,
                                    updates=zgup + rgup + rg2up,
                                    name='rmsprop_f_grad_shared')

    updir = [theano.shared(p.get_value() * numpy_floatX(0.),
                           name='%s_updir' % k)
             for k, p in tparams.items()]
    updir_new = [(ud, 0.9 * ud - 1e-4 * zg / tensor.sqrt(rg2 - rg ** 2 + 1e-4))
                 for ud, zg, rg, rg2 in zip(updir, zipped_grads,
                                            running_grads, running_grads2)]
    param_up = [(p, p + udn[1])
                for p, udn in zip(tparams.values(), updir_new)]

    f_update = theano.function([lr], [], updates=updir_new + param_up,
                               on_unused_input='ignore',
                               name='rmsprop_f_update')

    return f_grad_shared, f_update
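
# The step above is rmsprop with a momentum term: the denominator
# sqrt(E[g^2] - E[g]^2 + 1e-4) estimates the (regularized) standard
# deviation of recent gradients, and the fixed factor 1e-4 plays the role
# of the base step size, so `lr` is again unused in `f_update`.

if __name__ == '__main__':
    # Minimal smoke test: fit a single linear map with adadelta on synthetic
    # data. Everything below is a hypothetical demo (none of these names
    # exist elsewhere in the project); it assumes numpy and a working Theano.
    import numpy
    from collections import OrderedDict

    rng = numpy.random.RandomState(0)
    W = theano.shared(numpy_floatX(rng.randn(5) * 0.01), name='W')
    tparams = OrderedDict([('W', W)])

    x_c = tensor.matrix('x_c')
    mask = tensor.matrix('mask')      # unused by the cost; adadelta's
    y_mask = tensor.matrix('y_mask')  # f_grad_shared only warns about that
    y = tensor.vector('y')

    pred = tensor.dot(x_c, W)
    cost = ((pred - y) ** 2).mean()
    grads = tensor.grad(cost, wrt=list(tparams.values()))
    lr = tensor.scalar(name='lr')

    f_grad_shared, f_update = adadelta(lr, tparams, grads,
                                       x_c, mask, y_mask, y, cost)

    X = numpy_floatX(rng.randn(64, 5))
    Y = numpy_floatX(X.dot(rng.randn(5)))
    dummy = numpy_floatX(numpy.zeros((64, 5)))
    for _ in range(200):
        c = f_grad_shared(X, dummy, dummy, Y)
        f_update(0.)  # the learning rate argument is ignored by adadelta
    print('final mean squared error: %f' % c)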