From b43ff8e5cf4c9a720e2b9690de9afcf97eab3098 Mon Sep 17 00:00:00 2001 From: nickisghosty Date: Sun, 22 Oct 2017 07:13:22 -0700 Subject: [PATCH 1/6] Start/stop button, random client seed Replaces 'Free BTC' link with a 'START/STOP BOT' button. Once profit reaches 500 satoshi the client seed is changed to a random 16 char alphanumeric string --- DQ-Trevel.js | 3649 ++++++++++++++++++++++++++------------------------ 1 file changed, 1864 insertions(+), 1785 deletions(-) diff --git a/DQ-Trevel.js b/DQ-Trevel.js index ce75d9b..b35d805 100644 --- a/DQ-Trevel.js +++ b/DQ-Trevel.js @@ -1,1739 +1,1798 @@ var R = {}; // the Recurrent library -(function(global) { - "use strict"; - // Utility fun - function assert(condition, message) { - // from http://stackoverflow.com/questions/15313418/javascript-assert - if (!condition) { - message = message || "Assertion failed"; - if (typeof Error !== "undefined") { - throw new Error(message); - } - throw message; // Fallback - } - } - // Random numbers utils - var return_v = false; - var v_val = 0.0; - var gaussRandom = function() { - if (return_v) { - return_v = false; - return v_val; - } - var u = 2 * Math.random() - 1; - var v = 2 * Math.random() - 1; - var r = u * u + v * v; - if (r == 0 || r > 1) return gaussRandom(); - var c = Math.sqrt(-2 * Math.log(r) / r); - v_val = v * c; // cache this - return_v = true; - return u * c; - } - var randf = function(a, b) { - return Math.random() * (b - a) + a; - } - var randi = function(a, b) { - return Math.floor(Math.random() * (b - a) + a); - } - var randn = function(mu, std) { - return mu + gaussRandom() * std; - } - // helper function returns array of zeros of length n - // and uses typed arrays if available - var zeros = function(n) { - if (typeof(n) === 'undefined' || isNaN(n)) { - return []; - } - if (typeof ArrayBuffer === 'undefined') { - // lacking browser support - var arr = new Array(n); - for (var i = 0; i < n; i++) { - arr[i] = 0; - } - return arr; - } else { - return new 
Float64Array(n); - } - } - // Mat holds a matrix - var Mat = function(n, d) { - // n is number of rows d is number of columns - this.n = n; - this.d = d; - this.w = zeros(n * d); - this.dw = zeros(n * d); - } - Mat.prototype = { - get: function(row, col) { - // slow but careful accessor function - // we want row-major order - var ix = (this.d * row) + col; - assert(ix >= 0 && ix < this.w.length); - return this.w[ix]; - }, - set: function(row, col, v) { - // slow but careful accessor function - var ix = (this.d * row) + col; - assert(ix >= 0 && ix < this.w.length); - this.w[ix] = v; - }, - setFrom: function(arr) { - for (var i = 0, n = arr.length; i < n; i++) { - this.w[i] = arr[i]; - } - }, - setColumn: function(m, i) { - for (var q = 0, n = m.w.length; q < n; q++) { - this.w[(this.d * q) + i] = m.w[q]; - } - }, - toJSON: function() { - var json = {}; - json['n'] = this.n; - json['d'] = this.d; - json['w'] = this.w; - return json; - }, - fromJSON: function(json) { - this.n = json.n; - this.d = json.d; - this.w = zeros(this.n * this.d); - this.dw = zeros(this.n * this.d); - for (var i = 0, n = this.n * this.d; i < n; i++) { - this.w[i] = json.w[i]; // copy over weights - } - } - } - var copyMat = function(b) { - var a = new Mat(b.n, b.d); - a.setFrom(b.w); - return a; - } - var copyNet = function(net) { - // nets are (k,v) pairs with k = string key, v = Mat() - var new_net = {}; - for (var p in net) { - if (net.hasOwnProperty(p)) { - new_net[p] = copyMat(net[p]); - } - } - return new_net; - } - var updateMat = function(m, alpha) { - // updates in place - for (var i = 0, n = m.n * m.d; i < n; i++) { - if (m.dw[i] !== 0) { - m.w[i] += -alpha * m.dw[i]; - m.dw[i] = 0; - } - } - } - var updateNet = function(net, alpha) { - for (var p in net) { - if (net.hasOwnProperty(p)) { - updateMat(net[p], alpha); - } - } - } - var netToJSON = function(net) { - var j = {}; - for (var p in net) { - if (net.hasOwnProperty(p)) { - j[p] = net[p].toJSON(); - } - } - return j; - } - var 
netFromJSON = function(j) { - var net = {}; - for (var p in j) { - if (j.hasOwnProperty(p)) { - net[p] = new Mat(1, 1); // not proud of this - net[p].fromJSON(j[p]); - } - } - return net; - } - var netZeroGrads = function(net) { - for (var p in net) { - if (net.hasOwnProperty(p)) { - var mat = net[p]; - gradFillConst(mat, 0); - } - } - } - var netFlattenGrads = function(net) { - var n = 0; - for (var p in net) { - if (net.hasOwnProperty(p)) { - var mat = net[p]; - n += mat.dw.length; - } - } - var g = new Mat(n, 1); - var ix = 0; - for (var p in net) { - if (net.hasOwnProperty(p)) { - var mat = net[p]; - for (var i = 0, m = mat.dw.length; i < m; i++) { - g.w[ix] = mat.dw[i]; - ix++; - } - } - } - return g; - } - // return Mat but filled with random numbers from gaussian - var RandMat = function(n, d, mu, std) { - var m = new Mat(n, d); - fillRandn(m, mu, std); - //fillRand(m,-std,std); // kind of :P - return m; - } - // Mat utils - // fill matrix with random gaussian numbers - var fillRandn = function(m, mu, std) { - for (var i = 0, n = m.w.length; i < n; i++) { - m.w[i] = randn(mu, std); - } - } - var fillRand = function(m, lo, hi) { - for (var i = 0, n = m.w.length; i < n; i++) { - m.w[i] = randf(lo, hi); - } - } - var gradFillConst = function(m, c) { - for (var i = 0, n = m.dw.length; i < n; i++) { - m.dw[i] = c - } - } - // Transformer definitions - var Graph = function(needs_backprop) { - if (typeof needs_backprop === 'undefined') { - needs_backprop = true; - } - this.needs_backprop = needs_backprop; - // this will store a list of functions that perform backprop, - // in their forward pass order. So in backprop we will go - // backwards and evoke each one - this.backprop = []; - } - Graph.prototype = { - backward: function() { - for (var i = this.backprop.length - 1; i >= 0; i--) { - this.backprop[i](); // tick! 
- } - }, - rowPluck: function(m, ix) { - // pluck a row of m with index ix and return it as col vector - assert(ix >= 0 && ix < m.n); - var d = m.d; - var out = new Mat(d, 1); - for (var i = 0, n = d; i < n; i++) { - out.w[i] = m.w[d * ix + i]; - } // copy over the data - if (this.needs_backprop) { - var backward = function() { - for (var i = 0, n = d; i < n; i++) { - m.dw[d * ix + i] += out.dw[i]; - } - } - this.backprop.push(backward); - } - return out; - }, - tanh: function(m) { - // tanh nonlinearity - var out = new Mat(m.n, m.d); - var n = m.w.length; - for (var i = 0; i < n; i++) { - out.w[i] = Math.tanh(m.w[i]); - } - if (this.needs_backprop) { - var backward = function() { - for (var i = 0; i < n; i++) { - // grad for z = tanh(x) is (1 - z^2) - var mwi = out.w[i]; - m.dw[i] += (1.0 - mwi * mwi) * out.dw[i]; - } - } - this.backprop.push(backward); - } - return out; - }, - sigmoid: function(m) { - // sigmoid nonlinearity - var out = new Mat(m.n, m.d); - var n = m.w.length; - for (var i = 0; i < n; i++) { - out.w[i] = sig(m.w[i]); - } - if (this.needs_backprop) { - var backward = function() { - for (var i = 0; i < n; i++) { - // grad for z = tanh(x) is (1 - z^2) - var mwi = out.w[i]; - m.dw[i] += mwi * (1.0 - mwi) * out.dw[i]; - } - } - this.backprop.push(backward); - } - return out; - }, - relu: function(m) { - var out = new Mat(m.n, m.d); - var n = m.w.length; - for (var i = 0; i < n; i++) { - out.w[i] = Math.max(0, m.w[i]); // relu - } - if (this.needs_backprop) { - var backward = function() { - for (var i = 0; i < n; i++) { - m.dw[i] += m.w[i] > 0 ? 
out.dw[i] : 0.0; - } - } - this.backprop.push(backward); - } - return out; - }, - mul: function(m1, m2) { - // multiply matrices m1 * m2 - assert(m1.d === m2.n, 'matmul dimensions misaligned'); - var n = m1.n; - var d = m2.d; - var out = new Mat(n, d); - for (var i = 0; i < m1.n; i++) { // loop over rows of m1 - for (var j = 0; j < m2.d; j++) { // loop over cols of m2 - var dot = 0.0; - for (var k = 0; k < m1.d; k++) { // dot product loop - dot += m1.w[m1.d * i + k] * m2.w[m2.d * k + j]; - } - out.w[d * i + j] = dot; - } - } - if (this.needs_backprop) { - var backward = function() { - for (var i = 0; i < m1.n; i++) { // loop over rows of m1 - for (var j = 0; j < m2.d; j++) { // loop over cols of m2 - for (var k = 0; k < m1.d; k++) { // dot product loop - var b = out.dw[d * i + j]; - m1.dw[m1.d * i + k] += m2.w[m2.d * k + j] * b; - m2.dw[m2.d * k + j] += m1.w[m1.d * i + k] * b; - } - } - } - } - this.backprop.push(backward); - } - return out; - }, - add: function(m1, m2) { - assert(m1.w.length === m2.w.length); - var out = new Mat(m1.n, m1.d); - for (var i = 0, n = m1.w.length; i < n; i++) { - out.w[i] = m1.w[i] + m2.w[i]; - } - if (this.needs_backprop) { - var backward = function() { - for (var i = 0, n = m1.w.length; i < n; i++) { - m1.dw[i] += out.dw[i]; - m2.dw[i] += out.dw[i]; - } - } - this.backprop.push(backward); - } - return out; - }, - dot: function(m1, m2) { - // m1 m2 are both column vectors - assert(m1.w.length === m2.w.length); - var out = new Mat(1, 1); - var dot = 0.0; - for (var i = 0, n = m1.w.length; i < n; i++) { - dot += m1.w[i] * m2.w[i]; - } - out.w[0] = dot; - if (this.needs_backprop) { - var backward = function() { - for (var i = 0, n = m1.w.length; i < n; i++) { - m1.dw[i] += m2.w[i] * out.dw[0]; - m2.dw[i] += m1.w[i] * out.dw[0]; - } - } - this.backprop.push(backward); - } - return out; - }, - eltmul: function(m1, m2) { - assert(m1.w.length === m2.w.length); - var out = new Mat(m1.n, m1.d); - for (var i = 0, n = m1.w.length; i < n; i++) { 
- out.w[i] = m1.w[i] * m2.w[i]; - } - if (this.needs_backprop) { - var backward = function() { - for (var i = 0, n = m1.w.length; i < n; i++) { - m1.dw[i] += m2.w[i] * out.dw[i]; - m2.dw[i] += m1.w[i] * out.dw[i]; - } - } - this.backprop.push(backward); - } - return out; - }, - } - var softmax = function(m) { - var out = new Mat(m.n, m.d); // probability volume - var maxval = -999999; - for (var i = 0, n = m.w.length; i < n; i++) { - if (m.w[i] > maxval) maxval = m.w[i]; - } - var s = 0.0; - for (var i = 0, n = m.w.length; i < n; i++) { - out.w[i] = Math.exp(m.w[i] - maxval); - s += out.w[i]; - } - for (var i = 0, n = m.w.length; i < n; i++) { - out.w[i] /= s; - } - // no backward pass here needed - // since we will use the computed probabilities outside - // to set gradients directly on m - return out; - } - var Solver = function() { - this.decay_rate = 0.999; - this.smooth_eps = 1e-8; - this.step_cache = {}; - } - Solver.prototype = { - step: function(model, step_size, regc, clipval) { - // perform parameter update - var solver_stats = {}; - var num_clipped = 0; - var num_tot = 0; - for (var k in model) { - if (model.hasOwnProperty(k)) { - var m = model[k]; // mat ref - if (!(k in this.step_cache)) { - this.step_cache[k] = new Mat(m.n, m.d); - } - var s = this.step_cache[k]; - for (var i = 0, n = m.w.length; i < n; i++) { - // rmsprop adaptive learning rate - var mdwi = m.dw[i]; - s.w[i] = s.w[i] * this.decay_rate + (1.0 - this.decay_rate) * mdwi * mdwi; - // gradient clip - if (mdwi > clipval) { - mdwi = clipval; - num_clipped++; - } - if (mdwi < -clipval) { - mdwi = -clipval; - num_clipped++; - } - num_tot++; - // update (and regularize) - m.w[i] += -step_size * mdwi / Math.sqrt(s.w[i] + this.smooth_eps) - regc * m.w[i]; - m.dw[i] = 0; // reset gradients for next iteration - } - } - } - solver_stats['ratio_clipped'] = num_clipped * 1.0 / num_tot; - return solver_stats; - } - } - var initLSTM = function(input_size, hidden_sizes, output_size) { - // hidden size 
should be a list - var model = {}; - for (var d = 0; d < hidden_sizes.length; d++) { // loop over depths - var prev_size = d === 0 ? input_size : hidden_sizes[d - 1]; - var hidden_size = hidden_sizes[d]; - // gates parameters - model['Wix' + d] = new RandMat(hidden_size, prev_size, 0, 0.08); - model['Wih' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08); - model['bi' + d] = new Mat(hidden_size, 1); - model['Wfx' + d] = new RandMat(hidden_size, prev_size, 0, 0.08); - model['Wfh' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08); - model['bf' + d] = new Mat(hidden_size, 1); - model['Wox' + d] = new RandMat(hidden_size, prev_size, 0, 0.08); - model['Woh' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08); - model['bo' + d] = new Mat(hidden_size, 1); - // cell write params - model['Wcx' + d] = new RandMat(hidden_size, prev_size, 0, 0.08); - model['Wch' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08); - model['bc' + d] = new Mat(hidden_size, 1); - } - // decoder params - model['Whd'] = new RandMat(output_size, hidden_size, 0, 0.08); - model['bd'] = new Mat(output_size, 1); - return model; - } - var forwardLSTM = function(G, model, hidden_sizes, x, prev) { - // forward prop for a single tick of LSTM - // G is graph to append ops to - // model contains LSTM parameters - // x is 1D column vector with observation - // prev is a struct containing hidden and cell - // from previous iteration - if (prev == null || typeof prev.h === 'undefined') { - var hidden_prevs = []; - var cell_prevs = []; - for (var d = 0; d < hidden_sizes.length; d++) { - hidden_prevs.push(new R.Mat(hidden_sizes[d], 1)); - cell_prevs.push(new R.Mat(hidden_sizes[d], 1)); - } - } else { - var hidden_prevs = prev.h; - var cell_prevs = prev.c; - } - var hidden = []; - var cell = []; - for (var d = 0; d < hidden_sizes.length; d++) { - var input_vector = d === 0 ? 
x : hidden[d - 1]; - var hidden_prev = hidden_prevs[d]; - var cell_prev = cell_prevs[d]; - // input gate - var h0 = G.mul(model['Wix' + d], input_vector); - var h1 = G.mul(model['Wih' + d], hidden_prev); - var input_gate = G.sigmoid(G.add(G.add(h0, h1), model['bi' + d])); - // forget gate - var h2 = G.mul(model['Wfx' + d], input_vector); - var h3 = G.mul(model['Wfh' + d], hidden_prev); - var forget_gate = G.sigmoid(G.add(G.add(h2, h3), model['bf' + d])); - // output gate - var h4 = G.mul(model['Wox' + d], input_vector); - var h5 = G.mul(model['Woh' + d], hidden_prev); - var output_gate = G.sigmoid(G.add(G.add(h4, h5), model['bo' + d])); - // write operation on cells - var h6 = G.mul(model['Wcx' + d], input_vector); - var h7 = G.mul(model['Wch' + d], hidden_prev); - var cell_write = G.tanh(G.add(G.add(h6, h7), model['bc' + d])); - // compute new cell activation - var retain_cell = G.eltmul(forget_gate, cell_prev); // what do we keep from cell - var write_cell = G.eltmul(input_gate, cell_write); // what do we write to cell - var cell_d = G.add(retain_cell, write_cell); // new cell contents - // compute hidden state as gated, saturated cell activations - var hidden_d = G.eltmul(output_gate, G.tanh(cell_d)); - hidden.push(hidden_d); - cell.push(cell_d); - } - // one decoder to outputs at end - var output = G.add(G.mul(model['Whd'], hidden[hidden.length - 1]), model['bd']); - // return cell memory, hidden representation and output - return { - 'h': hidden, - 'c': cell, - 'o': output - }; - } - var sig = function(x) { - // helper function for computing sigmoid - return 1.0 / (1 + Math.exp(-x)); - } - var maxi = function(w) { - // argmax of array w - var maxv = w[0]; - var maxix = 0; - for (var i = 1, n = w.length; i < n; i++) { - var v = w[i]; - if (v > maxv) { - maxix = i; - maxv = v; - } - } - return maxix; - } - var samplei = function(w) { - // sample argmax from w, assuming w are - // probabilities that sum to one - var r = randf(0, 1); - var x = 0.0; - var i = 0; - 
while (true) { - x += w[i]; - if (x > r) { - return i; - } - i++; - } - return w.length - 1; // pretty sure we should never get here? - } - // various utils - global.assert = assert; - global.zeros = zeros; - global.maxi = maxi; - global.samplei = samplei; - global.randi = randi; - global.randn = randn; - global.softmax = softmax; - // classes - global.Mat = Mat; - global.RandMat = RandMat; - global.forwardLSTM = forwardLSTM; - global.initLSTM = initLSTM; - // more utils - global.updateMat = updateMat; - global.updateNet = updateNet; - global.copyMat = copyMat; - global.copyNet = copyNet; - global.netToJSON = netToJSON; - global.netFromJSON = netFromJSON; - global.netZeroGrads = netZeroGrads; - global.netFlattenGrads = netFlattenGrads; - // optimization - global.Solver = Solver; - global.Graph = Graph; +(function (global) { + "use strict"; + // Utility fun + function assert(condition, message) { + // from http://stackoverflow.com/questions/15313418/javascript-assert + if (!condition) { + message = message || "Assertion failed"; + if (typeof Error !== "undefined") { + throw new Error(message); + } + throw message; // Fallback + } + } + // Random numbers utils + var return_v = false; + var v_val = 0.0; + var gaussRandom = function () { + if (return_v) { + return_v = false; + return v_val; + } + var u = 2 * Math.random() - 1; + var v = 2 * Math.random() - 1; + var r = u * u + v * v; + if (r == 0 || r > 1) return gaussRandom(); + var c = Math.sqrt(-2 * Math.log(r) / r); + v_val = v * c; // cache this + return_v = true; + return u * c; + } + var randf = function (a, b) { + return Math.random() * (b - a) + a; + } + var randi = function (a, b) { + return Math.floor(Math.random() * (b - a) + a); + } + var randn = function (mu, std) { + return mu + gaussRandom() * std; + } + // helper function returns array of zeros of length n + // and uses typed arrays if available + var zeros = function (n) { + if (typeof (n) === 'undefined' || isNaN(n)) { + return []; + } + if (typeof 
ArrayBuffer === 'undefined') { + // lacking browser support + var arr = new Array(n); + for (var i = 0; i < n; i++) { + arr[i] = 0; + } + return arr; + } else { + return new Float64Array(n); + } + } + // Mat holds a matrix + var Mat = function (n, d) { + // n is number of rows d is number of columns + this.n = n; + this.d = d; + this.w = zeros(n * d); + this.dw = zeros(n * d); + } + Mat.prototype = { + get: function (row, col) { + // slow but careful accessor function + // we want row-major order + var ix = (this.d * row) + col; + assert(ix >= 0 && ix < this.w.length); + return this.w[ix]; + }, + set: function (row, col, v) { + // slow but careful accessor function + var ix = (this.d * row) + col; + assert(ix >= 0 && ix < this.w.length); + this.w[ix] = v; + }, + setFrom: function (arr) { + for (var i = 0, n = arr.length; i < n; i++) { + this.w[i] = arr[i]; + } + }, + setColumn: function (m, i) { + for (var q = 0, n = m.w.length; q < n; q++) { + this.w[(this.d * q) + i] = m.w[q]; + } + }, + toJSON: function () { + var json = {}; + json['n'] = this.n; + json['d'] = this.d; + json['w'] = this.w; + return json; + }, + fromJSON: function (json) { + this.n = json.n; + this.d = json.d; + this.w = zeros(this.n * this.d); + this.dw = zeros(this.n * this.d); + for (var i = 0, n = this.n * this.d; i < n; i++) { + this.w[i] = json.w[i]; // copy over weights + } + } + } + var copyMat = function (b) { + var a = new Mat(b.n, b.d); + a.setFrom(b.w); + return a; + } + var copyNet = function (net) { + // nets are (k,v) pairs with k = string key, v = Mat() + var new_net = {}; + for (var p in net) { + if (net.hasOwnProperty(p)) { + new_net[p] = copyMat(net[p]); + } + } + return new_net; + } + var updateMat = function (m, alpha) { + // updates in place + for (var i = 0, n = m.n * m.d; i < n; i++) { + if (m.dw[i] !== 0) { + m.w[i] += -alpha * m.dw[i]; + m.dw[i] = 0; + } + } + } + var updateNet = function (net, alpha) { + for (var p in net) { + if (net.hasOwnProperty(p)) { + 
updateMat(net[p], alpha); + } + } + } + var netToJSON = function (net) { + var j = {}; + for (var p in net) { + if (net.hasOwnProperty(p)) { + j[p] = net[p].toJSON(); + } + } + return j; + } + var netFromJSON = function (j) { + var net = {}; + for (var p in j) { + if (j.hasOwnProperty(p)) { + net[p] = new Mat(1, 1); // not proud of this + net[p].fromJSON(j[p]); + } + } + return net; + } + var netZeroGrads = function (net) { + for (var p in net) { + if (net.hasOwnProperty(p)) { + var mat = net[p]; + gradFillConst(mat, 0); + } + } + } + var netFlattenGrads = function (net) { + var n = 0; + for (var p in net) { + if (net.hasOwnProperty(p)) { + var mat = net[p]; + n += mat.dw.length; + } + } + var g = new Mat(n, 1); + var ix = 0; + for (var p in net) { + if (net.hasOwnProperty(p)) { + var mat = net[p]; + for (var i = 0, m = mat.dw.length; i < m; i++) { + g.w[ix] = mat.dw[i]; + ix++; + } + } + } + return g; + } + // return Mat but filled with random numbers from gaussian + var RandMat = function (n, d, mu, std) { + var m = new Mat(n, d); + fillRandn(m, mu, std); + //fillRand(m,-std,std); // kind of :P + return m; + } + // Mat utils + // fill matrix with random gaussian numbers + var fillRandn = function (m, mu, std) { + for (var i = 0, n = m.w.length; i < n; i++) { + m.w[i] = randn(mu, std); + } + } + var fillRand = function (m, lo, hi) { + for (var i = 0, n = m.w.length; i < n; i++) { + m.w[i] = randf(lo, hi); + } + } + var gradFillConst = function (m, c) { + for (var i = 0, n = m.dw.length; i < n; i++) { + m.dw[i] = c + } + } + // Transformer definitions + var Graph = function (needs_backprop) { + if (typeof needs_backprop === 'undefined') { + needs_backprop = true; + } + this.needs_backprop = needs_backprop; + // this will store a list of functions that perform backprop, + // in their forward pass order. 
So in backprop we will go + // backwards and evoke each one + this.backprop = []; + } + Graph.prototype = { + backward: function () { + for (var i = this.backprop.length - 1; i >= 0; i--) { + this.backprop[i](); // tick! + } + }, + rowPluck: function (m, ix) { + // pluck a row of m with index ix and return it as col vector + assert(ix >= 0 && ix < m.n); + var d = m.d; + var out = new Mat(d, 1); + for (var i = 0, n = d; i < n; i++) { + out.w[i] = m.w[d * ix + i]; + } // copy over the data + if (this.needs_backprop) { + var backward = function () { + for (var i = 0, n = d; i < n; i++) { + m.dw[d * ix + i] += out.dw[i]; + } + } + this.backprop.push(backward); + } + return out; + }, + tanh: function (m) { + // tanh nonlinearity + var out = new Mat(m.n, m.d); + var n = m.w.length; + for (var i = 0; i < n; i++) { + out.w[i] = Math.tanh(m.w[i]); + } + if (this.needs_backprop) { + var backward = function () { + for (var i = 0; i < n; i++) { + // grad for z = tanh(x) is (1 - z^2) + var mwi = out.w[i]; + m.dw[i] += (1.0 - mwi * mwi) * out.dw[i]; + } + } + this.backprop.push(backward); + } + return out; + }, + sigmoid: function (m) { + // sigmoid nonlinearity + var out = new Mat(m.n, m.d); + var n = m.w.length; + for (var i = 0; i < n; i++) { + out.w[i] = sig(m.w[i]); + } + if (this.needs_backprop) { + var backward = function () { + for (var i = 0; i < n; i++) { + // grad for z = tanh(x) is (1 - z^2) + var mwi = out.w[i]; + m.dw[i] += mwi * (1.0 - mwi) * out.dw[i]; + } + } + this.backprop.push(backward); + } + return out; + }, + relu: function (m) { + var out = new Mat(m.n, m.d); + var n = m.w.length; + for (var i = 0; i < n; i++) { + out.w[i] = Math.max(0, m.w[i]); // relu + } + if (this.needs_backprop) { + var backward = function () { + for (var i = 0; i < n; i++) { + m.dw[i] += m.w[i] > 0 ? 
out.dw[i] : 0.0; + } + } + this.backprop.push(backward); + } + return out; + }, + mul: function (m1, m2) { + // multiply matrices m1 * m2 + assert(m1.d === m2.n, 'matmul dimensions misaligned'); + var n = m1.n; + var d = m2.d; + var out = new Mat(n, d); + for (var i = 0; i < m1.n; i++) { // loop over rows of m1 + for (var j = 0; j < m2.d; j++) { // loop over cols of m2 + var dot = 0.0; + for (var k = 0; k < m1.d; k++) { // dot product loop + dot += m1.w[m1.d * i + k] * m2.w[m2.d * k + j]; + } + out.w[d * i + j] = dot; + } + } + if (this.needs_backprop) { + var backward = function () { + for (var i = 0; i < m1.n; i++) { // loop over rows of m1 + for (var j = 0; j < m2.d; j++) { // loop over cols of m2 + for (var k = 0; k < m1.d; k++) { // dot product loop + var b = out.dw[d * i + j]; + m1.dw[m1.d * i + k] += m2.w[m2.d * k + j] * b; + m2.dw[m2.d * k + j] += m1.w[m1.d * i + k] * b; + } + } + } + } + this.backprop.push(backward); + } + return out; + }, + add: function (m1, m2) { + assert(m1.w.length === m2.w.length); + var out = new Mat(m1.n, m1.d); + for (var i = 0, n = m1.w.length; i < n; i++) { + out.w[i] = m1.w[i] + m2.w[i]; + } + if (this.needs_backprop) { + var backward = function () { + for (var i = 0, n = m1.w.length; i < n; i++) { + m1.dw[i] += out.dw[i]; + m2.dw[i] += out.dw[i]; + } + } + this.backprop.push(backward); + } + return out; + }, + dot: function (m1, m2) { + // m1 m2 are both column vectors + assert(m1.w.length === m2.w.length); + var out = new Mat(1, 1); + var dot = 0.0; + for (var i = 0, n = m1.w.length; i < n; i++) { + dot += m1.w[i] * m2.w[i]; + } + out.w[0] = dot; + if (this.needs_backprop) { + var backward = function () { + for (var i = 0, n = m1.w.length; i < n; i++) { + m1.dw[i] += m2.w[i] * out.dw[0]; + m2.dw[i] += m1.w[i] * out.dw[0]; + } + } + this.backprop.push(backward); + } + return out; + }, + eltmul: function (m1, m2) { + assert(m1.w.length === m2.w.length); + var out = new Mat(m1.n, m1.d); + for (var i = 0, n = m1.w.length; i < n; 
i++) { + out.w[i] = m1.w[i] * m2.w[i]; + } + if (this.needs_backprop) { + var backward = function () { + for (var i = 0, n = m1.w.length; i < n; i++) { + m1.dw[i] += m2.w[i] * out.dw[i]; + m2.dw[i] += m1.w[i] * out.dw[i]; + } + } + this.backprop.push(backward); + } + return out; + }, + } + var softmax = function (m) { + var out = new Mat(m.n, m.d); // probability volume + var maxval = -999999; + for (var i = 0, n = m.w.length; i < n; i++) { + if (m.w[i] > maxval) maxval = m.w[i]; + } + var s = 0.0; + for (var i = 0, n = m.w.length; i < n; i++) { + out.w[i] = Math.exp(m.w[i] - maxval); + s += out.w[i]; + } + for (var i = 0, n = m.w.length; i < n; i++) { + out.w[i] /= s; + } + // no backward pass here needed + // since we will use the computed probabilities outside + // to set gradients directly on m + return out; + } + var Solver = function () { + this.decay_rate = 0.999; + this.smooth_eps = 1e-8; + this.step_cache = {}; + } + Solver.prototype = { + step: function (model, step_size, regc, clipval) { + // perform parameter update + var solver_stats = {}; + var num_clipped = 0; + var num_tot = 0; + for (var k in model) { + if (model.hasOwnProperty(k)) { + var m = model[k]; // mat ref + if (!(k in this.step_cache)) { + this.step_cache[k] = new Mat(m.n, m.d); + } + var s = this.step_cache[k]; + for (var i = 0, n = m.w.length; i < n; i++) { + // rmsprop adaptive learning rate + var mdwi = m.dw[i]; + s.w[i] = s.w[i] * this.decay_rate + (1.0 - this.decay_rate) * mdwi * mdwi; + // gradient clip + if (mdwi > clipval) { + mdwi = clipval; + num_clipped++; + } + if (mdwi < -clipval) { + mdwi = -clipval; + num_clipped++; + } + num_tot++; + // update (and regularize) + m.w[i] += -step_size * mdwi / Math.sqrt(s.w[i] + this.smooth_eps) - regc * m.w[i]; + m.dw[i] = 0; // reset gradients for next iteration + } + } + } + solver_stats['ratio_clipped'] = num_clipped * 1.0 / num_tot; + return solver_stats; + } + } + var initLSTM = function (input_size, hidden_sizes, output_size) { + // 
hidden size should be a list + var model = {}; + for (var d = 0; d < hidden_sizes.length; d++) { // loop over depths + var prev_size = d === 0 ? input_size : hidden_sizes[d - 1]; + var hidden_size = hidden_sizes[d]; + // gates parameters + model['Wix' + d] = new RandMat(hidden_size, prev_size, 0, 0.08); + model['Wih' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08); + model['bi' + d] = new Mat(hidden_size, 1); + model['Wfx' + d] = new RandMat(hidden_size, prev_size, 0, 0.08); + model['Wfh' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08); + model['bf' + d] = new Mat(hidden_size, 1); + model['Wox' + d] = new RandMat(hidden_size, prev_size, 0, 0.08); + model['Woh' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08); + model['bo' + d] = new Mat(hidden_size, 1); + // cell write params + model['Wcx' + d] = new RandMat(hidden_size, prev_size, 0, 0.08); + model['Wch' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08); + model['bc' + d] = new Mat(hidden_size, 1); + } + // decoder params + model['Whd'] = new RandMat(output_size, hidden_size, 0, 0.08); + model['bd'] = new Mat(output_size, 1); + return model; + } + var forwardLSTM = function (G, model, hidden_sizes, x, prev) { + // forward prop for a single tick of LSTM + // G is graph to append ops to + // model contains LSTM parameters + // x is 1D column vector with observation + // prev is a struct containing hidden and cell + // from previous iteration + if (prev == null || typeof prev.h === 'undefined') { + var hidden_prevs = []; + var cell_prevs = []; + for (var d = 0; d < hidden_sizes.length; d++) { + hidden_prevs.push(new R.Mat(hidden_sizes[d], 1)); + cell_prevs.push(new R.Mat(hidden_sizes[d], 1)); + } + } else { + var hidden_prevs = prev.h; + var cell_prevs = prev.c; + } + var hidden = []; + var cell = []; + for (var d = 0; d < hidden_sizes.length; d++) { + var input_vector = d === 0 ? 
x : hidden[d - 1]; + var hidden_prev = hidden_prevs[d]; + var cell_prev = cell_prevs[d]; + // input gate + var h0 = G.mul(model['Wix' + d], input_vector); + var h1 = G.mul(model['Wih' + d], hidden_prev); + var input_gate = G.sigmoid(G.add(G.add(h0, h1), model['bi' + d])); + // forget gate + var h2 = G.mul(model['Wfx' + d], input_vector); + var h3 = G.mul(model['Wfh' + d], hidden_prev); + var forget_gate = G.sigmoid(G.add(G.add(h2, h3), model['bf' + d])); + // output gate + var h4 = G.mul(model['Wox' + d], input_vector); + var h5 = G.mul(model['Woh' + d], hidden_prev); + var output_gate = G.sigmoid(G.add(G.add(h4, h5), model['bo' + d])); + // write operation on cells + var h6 = G.mul(model['Wcx' + d], input_vector); + var h7 = G.mul(model['Wch' + d], hidden_prev); + var cell_write = G.tanh(G.add(G.add(h6, h7), model['bc' + d])); + // compute new cell activation + var retain_cell = G.eltmul(forget_gate, cell_prev); // what do we keep from cell + var write_cell = G.eltmul(input_gate, cell_write); // what do we write to cell + var cell_d = G.add(retain_cell, write_cell); // new cell contents + // compute hidden state as gated, saturated cell activations + var hidden_d = G.eltmul(output_gate, G.tanh(cell_d)); + hidden.push(hidden_d); + cell.push(cell_d); + } + // one decoder to outputs at end + var output = G.add(G.mul(model['Whd'], hidden[hidden.length - 1]), model['bd']); + // return cell memory, hidden representation and output + return { + 'h': hidden, + 'c': cell, + 'o': output + }; + } + var sig = function (x) { + // helper function for computing sigmoid + return 1.0 / (1 + Math.exp(-x)); + } + var maxi = function (w) { + // argmax of array w + var maxv = w[0]; + var maxix = 0; + for (var i = 1, n = w.length; i < n; i++) { + var v = w[i]; + if (v > maxv) { + maxix = i; + maxv = v; + } + } + return maxix; + } + var samplei = function (w) { + // sample argmax from w, assuming w are + // probabilities that sum to one + var r = randf(0, 1); + var x = 0.0; + var i = 0; 
+ while (true) { + x += w[i]; + if (x > r) { + return i; + } + i++; + } + return w.length - 1; // pretty sure we should never get here? + } + // various utils + global.assert = assert; + global.zeros = zeros; + global.maxi = maxi; + global.samplei = samplei; + global.randi = randi; + global.randn = randn; + global.softmax = softmax; + // classes + global.Mat = Mat; + global.RandMat = RandMat; + global.forwardLSTM = forwardLSTM; + global.initLSTM = initLSTM; + // more utils + global.updateMat = updateMat; + global.updateNet = updateNet; + global.copyMat = copyMat; + global.copyNet = copyNet; + global.netToJSON = netToJSON; + global.netFromJSON = netFromJSON; + global.netZeroGrads = netZeroGrads; + global.netFlattenGrads = netFlattenGrads; + // optimization + global.Solver = Solver; + global.Graph = Graph; })(R); // END OF RECURRENTJS var RL = {}; -(function(global) { - "use strict"; - // syntactic sugar function for getting default parameter values - var getopt = function(opt, field_name, default_value) { - if (typeof opt === 'undefined') { - return default_value; - } - return (typeof opt[field_name] !== 'undefined') ? 
opt[field_name] : default_value; - } - var zeros = R.zeros; // inherit these - var assert = R.assert; - var randi = R.randi; - var randf = R.randf; - var setConst = function(arr, c) { - for (var i = 0, n = arr.length; i < n; i++) { - arr[i] = c; - } - } - var sampleWeighted = function(p) { - var r = Math.random(); - var c = 0.0; - for (var i = 0, n = p.length; i < n; i++) { - c += p[i]; - if (c >= r) { - return i; - } - } - assert(false, 'wtf'); - } - // ------ - // AGENTS - // ------ - // DPAgent performs Value Iteration - // - can also be used for Policy Iteration if you really wanted to - // - requires model of the environment :( - // - does not learn from experience :( - // - assumes finite MDP :( - var DPAgent = function(env, opt) { - this.V = null; // state value function - this.P = null; // policy distribution \pi(s,a) - this.env = env; // store pointer to environment - this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor - this.reset(); - } - DPAgent.prototype = { - reset: function() { - // reset the agent's policy and value function - this.ns = this.env.getNumStates(); - this.na = this.env.getMaxNumActions(); - this.V = zeros(this.ns); - this.P = zeros(this.ns * this.na); - // initialize uniform random policy - for (var s = 0; s < this.ns; s++) { - var poss = this.env.allowedActions(s); - for (var i = 0, n = poss.length; i < n; i++) { - this.P[poss[i] * this.ns + s] = 1.0 / poss.length; - } - } - }, - act: function(s) { - // behave according to the learned policy - var poss = this.env.allowedActions(s); - var ps = []; - for (var i = 0, n = poss.length; i < n; i++) { - var a = poss[i]; - var prob = this.P[a * this.ns + s]; - ps.push(prob); - } - var maxi = sampleWeighted(ps); - return poss[maxi]; - }, - learn: function() { - // perform a single round of value iteration - self.evaluatePolicy(); // writes this.V - self.updatePolicy(); // writes this.P - }, - evaluatePolicy: function() { - // perform a synchronous update of the value 
function - var Vnew = zeros(this.ns); - for (var s = 0; s < this.ns; s++) { - // integrate over actions in a stochastic policy - // note that we assume that policy probability mass over allowed actions sums to one - var v = 0.0; - var poss = this.env.allowedActions(s); - for (var i = 0, n = poss.length; i < n; i++) { - var a = poss[i]; - var prob = this.P[a * this.ns + s]; // probability of taking action under policy - if (prob === 0) { - continue; - } // no contribution, skip for speed - var ns = this.env.nextStateDistribution(s, a); - var rs = this.env.reward(s, a, ns); // reward for s->a->ns transition - v += prob * (rs + this.gamma * this.V[ns]); - } - Vnew[s] = v; - } - this.V = Vnew; // swap - }, - updatePolicy: function() { - // update policy to be greedy w.r.t. learned Value function - for (var s = 0; s < this.ns; s++) { - var poss = this.env.allowedActions(s); - // compute value of taking each allowed action - var vmax, nmax; - var vs = []; - for (var i = 0, n = poss.length; i < n; i++) { - var a = poss[i]; - var ns = this.env.nextStateDistribution(s, a); - var rs = this.env.reward(s, a, ns); - var v = rs + this.gamma * this.V[ns]; - vs.push(v); - if (i === 0 || v > vmax) { - vmax = v; - nmax = 1; - } else if (v === vmax) { - nmax += 1; - } - } - // update policy smoothly across all argmaxy actions - for (var i = 0, n = poss.length; i < n; i++) { - var a = poss[i]; - this.P[a * this.ns + s] = (vs[i] === vmax) ? 
1.0 / nmax : 0.0; - } - } - }, - } - // QAgent uses TD (Q-Learning, SARSA) - // - does not require environment model :) - // - learns from experience :) - var TDAgent = function(env, opt) { - this.update = getopt(opt, 'update', 'qlearn'); // qlearn | sarsa - this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor - this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy - this.alpha = getopt(opt, 'alpha', 0.01); // value function learning rate - // class allows non-deterministic policy, and smoothly regressing towards the optimal policy based on Q - this.smooth_policy_update = getopt(opt, 'smooth_policy_update', false); - this.beta = getopt(opt, 'beta', 0.01); // learning rate for policy, if smooth updates are on - // eligibility traces - this.lambda = getopt(opt, 'lambda', 0); // eligibility trace decay. 0 = no eligibility traces used - this.replacing_traces = getopt(opt, 'replacing_traces', true); - // optional optimistic initial values - this.q_init_val = getopt(opt, 'q_init_val', 0); - this.planN = getopt(opt, 'planN', 0); // number of planning steps per learning iteration (0 = no planning) - this.Q = null; // state action value function - this.P = null; // policy distribution \pi(s,a) - this.e = null; // eligibility trace - this.env_model_s = null;; // environment model (s,a) -> (s',r) - this.env_model_r = null;; // environment model (s,a) -> (s',r) - this.env = env; // store pointer to environment - this.reset(); - } - TDAgent.prototype = { - reset: function() { - // reset the agent's policy and value function - this.ns = this.env.getNumStates(); - this.na = this.env.getMaxNumActions(); - this.Q = zeros(this.ns * this.na); - if (this.q_init_val !== 0) { - setConst(this.Q, this.q_init_val); - } - this.P = zeros(this.ns * this.na); - this.e = zeros(this.ns * this.na); - // model/planning vars - this.env_model_s = zeros(this.ns * this.na); - setConst(this.env_model_s, -1); // init to -1 so we can test if we saw the state before 
- this.env_model_r = zeros(this.ns * this.na); - this.sa_seen = []; - this.pq = zeros(this.ns * this.na); - // initialize uniform random policy - for (var s = 0; s < this.ns; s++) { - var poss = this.env.allowedActions(s); - for (var i = 0, n = poss.length; i < n; i++) { - this.P[poss[i] * this.ns + s] = 1.0 / poss.length; - } - } - // agent memory, needed for streaming updates - // (s0,a0,r0,s1,a1,r1,...) - this.r0 = null; - this.s0 = null; - this.s1 = null; - this.a0 = null; - this.a1 = null; - }, - resetEpisode: function() { - // an episode finished - }, - act: function(s) { - // act according to epsilon greedy policy - var poss = this.env.allowedActions(s); - var probs = []; - for (var i = 0, n = poss.length; i < n; i++) { - probs.push(this.P[poss[i] * this.ns + s]); - } - // epsilon greedy policy - if (Math.random() < this.epsilon) { - var a = poss[randi(0, poss.length)]; // random available action - this.explored = true; - } else { - var a = poss[sampleWeighted(probs)]; - this.explored = false; - } - // shift state memory - this.s0 = this.s1; - this.a0 = this.a1; - this.s1 = s; - this.a1 = a; - return a; - }, - learn: function(r1) { - // takes reward for previous action, which came from a call to act() - if (!(this.r0 == null)) { - this.learnFromTuple(this.s0, this.a0, this.r0, this.s1, this.a1, this.lambda); - if (this.planN > 0) { - this.updateModel(this.s0, this.a0, this.r0, this.s1); - this.plan(); - } - } - this.r0 = r1; // store this for next update - }, - updateModel: function(s0, a0, r0, s1) { - // transition (s0,a0) -> (r0,s1) was observed. 
Update environment model - var sa = a0 * this.ns + s0; - if (this.env_model_s[sa] === -1) { - // first time we see this state action - this.sa_seen.push(a0 * this.ns + s0); // add as seen state - } - this.env_model_s[sa] = s1; - this.env_model_r[sa] = r0; - }, - plan: function() { - // order the states based on current priority queue information - var spq = []; - for (var i = 0, n = this.sa_seen.length; i < n; i++) { - var sa = this.sa_seen[i]; - var sap = this.pq[sa]; - if (sap > 1e-5) { // gain a bit of efficiency - spq.push({ - sa: sa, - p: sap - }); - } - } - spq.sort(function(a, b) { - return a.p < b.p ? 1 : -1 - }); - // perform the updates - var nsteps = Math.min(this.planN, spq.length); - for (var k = 0; k < nsteps; k++) { - // random exploration - //var i = randi(0, this.sa_seen.length); // pick random prev seen state action - //var s0a0 = this.sa_seen[i]; - var s0a0 = spq[k].sa; - this.pq[s0a0] = 0; // erase priority, since we're backing up this state - var s0 = s0a0 % this.ns; - var a0 = Math.floor(s0a0 / this.ns); - var r0 = this.env_model_r[s0a0]; - var s1 = this.env_model_s[s0a0]; - var a1 = -1; // not used for Q learning - if (this.update === 'sarsa') { - // generate random action?... 
- var poss = this.env.allowedActions(s1); - var a1 = poss[randi(0, poss.length)]; - } - this.learnFromTuple(s0, a0, r0, s1, a1, 0); // note lambda = 0 - shouldnt use eligibility trace here - } - }, - learnFromTuple: function(s0, a0, r0, s1, a1, lambda) { - var sa = a0 * this.ns + s0; - // calculate the target for Q(s,a) - if (this.update === 'qlearn') { - // Q learning target is Q(s0,a0) = r0 + gamma * max_a Q[s1,a] - var poss = this.env.allowedActions(s1); - var qmax = 0; - for (var i = 0, n = poss.length; i < n; i++) { - var s1a = poss[i] * this.ns + s1; - var qval = this.Q[s1a]; - if (i === 0 || qval > qmax) { - qmax = qval; - } - } - var target = r0 + this.gamma * qmax; - } else if (this.update === 'sarsa') { - // SARSA target is Q(s0,a0) = r0 + gamma * Q[s1,a1] - var s1a1 = a1 * this.ns + s1; - var target = r0 + this.gamma * this.Q[s1a1]; - } - if (lambda > 0) { - // perform an eligibility trace update - if (this.replacing_traces) { - this.e[sa] = 1; - } else { - this.e[sa] += 1; - } - var edecay = lambda * this.gamma; - var state_update = zeros(this.ns); - for (var s = 0; s < this.ns; s++) { - var poss = this.env.allowedActions(s); - for (var i = 0; i < poss.length; i++) { - var a = poss[i]; - var saloop = a * this.ns + s; - var esa = this.e[saloop]; - var update = this.alpha * esa * (target - this.Q[saloop]); - this.Q[saloop] += update; - this.updatePriority(s, a, update); - this.e[saloop] *= edecay; - var u = Math.abs(update); - if (u > state_update[s]) { - state_update[s] = u; - } - } - } - for (var s = 0; s < this.ns; s++) { - if (state_update[s] > 1e-5) { // save efficiency here - this.updatePolicy(s); - } - } - if (this.explored && this.update === 'qlearn') { - // have to wipe the trace since q learning is off-policy :( - this.e = zeros(this.ns * this.na); - } - } else { - // simpler and faster update without eligibility trace - // update Q[sa] towards it with some step size - var update = this.alpha * (target - this.Q[sa]); - this.Q[sa] += update; - 
this.updatePriority(s0, a0, update); - // update the policy to reflect the change (if appropriate) - this.updatePolicy(s0); - } - }, - updatePriority: function(s, a, u) { - // used in planning. Invoked when Q[sa] += update - // we should find all states that lead to (s,a) and upgrade their priority - // of being update in the next planning step - u = Math.abs(u); - if (u < 1e-5) { - return; - } // for efficiency skip small updates - if (this.planN === 0) { - return; - } // there is no planning to be done, skip. - for (var si = 0; si < this.ns; si++) { - // note we are also iterating over impossible actions at all states, - // but this should be okay because their env_model_s should simply be -1 - // as initialized, so they will never be predicted to point to any state - // because they will never be observed, and hence never be added to the model - for (var ai = 0; ai < this.na; ai++) { - var siai = ai * this.ns + si; - if (this.env_model_s[siai] === s) { - // this state leads to s, add it to priority queue - this.pq[siai] += u; - } - } - } - }, - updatePolicy: function(s) { - var poss = this.env.allowedActions(s); - // set policy at s to be the action that achieves max_a Q(s,a) - // first find the maxy Q values - var qmax, nmax; - var qs = []; - for (var i = 0, n = poss.length; i < n; i++) { - var a = poss[i]; - var qval = this.Q[a * this.ns + s]; - qs.push(qval); - if (i === 0 || qval > qmax) { - qmax = qval; - nmax = 1; - } else if (qval === qmax) { - nmax += 1; - } - } - // now update the policy smoothly towards the argmaxy actions - var psum = 0.0; - for (var i = 0, n = poss.length; i < n; i++) { - var a = poss[i]; - var target = (qs[i] === qmax) ? 
1.0 / nmax : 0.0; - var ix = a * this.ns + s; - if (this.smooth_policy_update) { - // slightly hacky :p - this.P[ix] += this.beta * (target - this.P[ix]); - psum += this.P[ix]; - } else { - // set hard target - this.P[ix] = target; - } - } - if (this.smooth_policy_update) { - // renomalize P if we're using smooth policy updates - for (var i = 0, n = poss.length; i < n; i++) { - var a = poss[i]; - this.P[a * this.ns + s] /= psum; - } - } - } - } - var DQNAgent = function(env, opt) { - this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor - this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy - this.alpha = getopt(opt, 'alpha', 0.01); // value function learning rate - this.experience_add_every = getopt(opt, 'experience_add_every', 25); // number of time steps before we add another experience to replay memory - this.experience_size = getopt(opt, 'experience_size', 5000); // size of experience replay - this.learning_steps_per_iteration = getopt(opt, 'learning_steps_per_iteration', 10); - this.tderror_clamp = getopt(opt, 'tderror_clamp', 1.0); - this.num_hidden_units = getopt(opt, 'num_hidden_units', 100); - this.env = env; - this.reset(); - } - DQNAgent.prototype = { - reset: function() { - this.nh = this.num_hidden_units; // number of hidden units - this.ns = this.env.getNumStates(); - this.na = this.env.getMaxNumActions(); - // nets are hardcoded for now as key (str) -> Mat - // not proud of this. 
better solution is to have a whole Net object - // on top of Mats, but for now sticking with this - this.net = {}; - this.net.W1 = new R.RandMat(this.nh, this.ns, 0, 0.01); - this.net.b1 = new R.Mat(this.nh, 1, 0, 0.01); - this.net.W2 = new R.RandMat(this.na, this.nh, 0, 0.01); - this.net.b2 = new R.Mat(this.na, 1, 0, 0.01); - this.exp = []; // experience - this.expi = 0; // where to insert - this.t = 0; - this.r0 = null; - this.s0 = null; - this.s1 = null; - this.a0 = null; - this.a1 = null; - this.tderror = 0; // for visualization only... - }, - toJSON: function() { - // save function - var j = {}; - j.nh = this.nh; - j.ns = this.ns; - j.na = this.na; - j.net = R.netToJSON(this.net); - return j; - }, - fromJSON: function(j) { - // load function - this.nh = j.nh; - this.ns = j.ns; - this.na = j.na; - this.net = R.netFromJSON(j.net); - }, - forwardQ: function(net, s, needs_backprop) { - var G = new R.Graph(needs_backprop); - var a1mat = G.add(G.mul(net.W1, s), net.b1); - var h1mat = G.tanh(a1mat); - var a2mat = G.add(G.mul(net.W2, h1mat), net.b2); - this.lastG = G; // back this up. 
Kind of hacky isn't it - return a2mat; - }, - act: function(slist) { - // convert to a Mat column vector - var s = new R.Mat(this.ns, 1); - s.setFrom(slist); - // epsilon greedy policy - if (Math.random() < this.epsilon) { - var a = randi(0, this.na); - } else { - // greedy wrt Q function - var amat = this.forwardQ(this.net, s, false); - var a = R.maxi(amat.w); // returns index of argmax action - } - // shift state memory - this.s0 = this.s1; - this.a0 = this.a1; - this.s1 = s; - this.a1 = a; - return a; - }, - learn: function(r1) { - // perform an update on Q function - if (!(this.r0 == null) && this.alpha > 0) { - // learn from this tuple to get a sense of how "surprising" it is to the agent - var tderror = this.learnFromTuple(this.s0, this.a0, this.r0, this.s1, this.a1); - this.tderror = tderror; // a measure of surprise - // decide if we should keep this experience in the replay - if (this.t % this.experience_add_every === 0) { - this.exp[this.expi] = [this.s0, this.a0, this.r0, this.s1, this.a1]; - this.expi += 1; - if (this.expi > this.experience_size) { - this.expi = 0; - } // roll over when we run out - } - this.t += 1; - // sample some additional experience from replay memory and learn from it - for (var k = 0; k < this.learning_steps_per_iteration; k++) { - var ri = randi(0, this.exp.length); // todo: priority sweeps? 
- var e = this.exp[ri]; - this.learnFromTuple(e[0], e[1], e[2], e[3], e[4]) - } - } - this.r0 = r1; // store for next update - }, - learnFromTuple: function(s0, a0, r0, s1, a1) { - // want: Q(s,a) = r + gamma * max_a' Q(s',a') - // compute the target Q value - var tmat = this.forwardQ(this.net, s1, false); - var qmax = r0 + this.gamma * tmat.w[R.maxi(tmat.w)]; - // now predict - var pred = this.forwardQ(this.net, s0, true); - var tderror = pred.w[a0] - qmax; - var clamp = this.tderror_clamp; - if (Math.abs(tderror) > clamp) { // huber loss to robustify - if (tderror > clamp) tderror = clamp; - if (tderror < -clamp) tderror = -clamp; - } - pred.dw[a0] = tderror; - this.lastG.backward(); // compute gradients on net params - // update net - R.updateNet(this.net, this.alpha); - return tderror; - } - } - // buggy implementation, doesnt work... - var SimpleReinforceAgent = function(env, opt) { - this.gamma = getopt(opt, 'gamma', 0.5); // future reward discount factor - this.epsilon = getopt(opt, 'epsilon', 0.75); // for epsilon-greedy policy - this.alpha = getopt(opt, 'alpha', 0.001); // actor net learning rate - this.beta = getopt(opt, 'beta', 0.01); // baseline net learning rate - this.env = env; - this.reset(); - } - SimpleReinforceAgent.prototype = { - reset: function() { - this.ns = this.env.getNumStates(); - this.na = this.env.getMaxNumActions(); - this.nh = 100; // number of hidden units - this.nhb = 100; // and also in the baseline lstm - this.actorNet = {}; - this.actorNet.W1 = new R.RandMat(this.nh, this.ns, 0, 0.01); - this.actorNet.b1 = new R.Mat(this.nh, 1, 0, 0.01); - this.actorNet.W2 = new R.RandMat(this.na, this.nh, 0, 0.1); - this.actorNet.b2 = new R.Mat(this.na, 1, 0, 0.01); - this.actorOutputs = []; - this.actorGraphs = []; - this.actorActions = []; // sampled ones - this.rewardHistory = []; - this.baselineNet = {}; - this.baselineNet.W1 = new R.RandMat(this.nhb, this.ns, 0, 0.01); - this.baselineNet.b1 = new R.Mat(this.nhb, 1, 0, 0.01); - 
this.baselineNet.W2 = new R.RandMat(this.na, this.nhb, 0, 0.01); - this.baselineNet.b2 = new R.Mat(this.na, 1, 0, 0.01); - this.baselineOutputs = []; - this.baselineGraphs = []; - this.t = 0; - }, - forwardActor: function(s, needs_backprop) { - var net = this.actorNet; - var G = new R.Graph(needs_backprop); - var a1mat = G.add(G.mul(net.W1, s), net.b1); - var h1mat = G.tanh(a1mat); - var a2mat = G.add(G.mul(net.W2, h1mat), net.b2); - return { - 'a': a2mat, - 'G': G - } - }, - forwardValue: function(s, needs_backprop) { - var net = this.baselineNet; - var G = new R.Graph(needs_backprop); - var a1mat = G.add(G.mul(net.W1, s), net.b1); - var h1mat = G.tanh(a1mat); - var a2mat = G.add(G.mul(net.W2, h1mat), net.b2); - return { - 'a': a2mat, - 'G': G - } - }, - act: function(slist) { - // convert to a Mat column vector - var s = new R.Mat(this.ns, 1); - s.setFrom(slist); - // forward the actor to get action output - var ans = this.forwardActor(s, true); - var amat = ans.a; - var ag = ans.G; - this.actorOutputs.push(amat); - this.actorGraphs.push(ag); - // forward the baseline estimator - var ans = this.forwardValue(s, true); - var vmat = ans.a; - var vg = ans.G; - this.baselineOutputs.push(vmat); - this.baselineGraphs.push(vg); - // sample action from the stochastic gaussian policy - var a = R.copyMat(amat); - var gaussVar = 0.02; - a.w[0] = R.randn(0, gaussVar); - a.w[1] = R.randn(0, gaussVar); - this.actorActions.push(a); - // shift state memory - this.s0 = this.s1; - this.a0 = this.a1; - this.s1 = s; - this.a1 = a; - return a; - }, - learn: function(r1) { - // perform an update on Q function - this.rewardHistory.push(r1); - var n = this.rewardHistory.length; - var baselineMSE = 0.0; - var nup = 100; // what chunk of experience to take - var nuse = 80; // what chunk to update from - if (n >= nup) { - // lets learn and flush - // first: compute the sample values at all points - var vs = []; - for (var t = 0; t < nuse; t++) { - var mul = 1; - // compute the actual 
discounted reward for this time step - var V = 0; - for (var t2 = t; t2 < n; t2++) { - V += mul * this.rewardHistory[t2]; - mul *= this.gamma; - if (mul < 1e-5) { - break; - } // efficiency savings - } - // get the predicted baseline at this time step - var b = this.baselineOutputs[t].w[0]; - for (var i = 0; i < this.na; i++) { - // [the action delta] * [the desirebility] - var update = -(V - b) * (this.actorActions[t].w[i] - this.actorOutputs[t].w[i]); - if (update > 0.1) { - update = 0.1; - } - if (update < -0.1) { - update = -0.1; - } - this.actorOutputs[t].dw[i] += update; - } - var update = -(V - b); - if (update > 0.1) { - update = 0.1; - } - if (update < 0.1) { - update = -0.1; - } - this.baselineOutputs[t].dw[0] += update; - baselineMSE += (V - b) * (V - b); - vs.push(V); - } - baselineMSE /= nuse; - // backprop all the things - for (var t = 0; t < nuse; t++) { - this.actorGraphs[t].backward(); - this.baselineGraphs[t].backward(); - } - R.updateNet(this.actorNet, this.alpha); // update actor network - R.updateNet(this.baselineNet, this.beta); // update baseline network - // flush - this.actorOutputs = []; - this.rewardHistory = []; - this.actorActions = []; - this.baselineOutputs = []; - this.actorGraphs = []; - this.baselineGraphs = []; - this.tderror = baselineMSE; - } - this.t += 1; - this.r0 = r1; // store for next update - }, - } - // buggy implementation as well, doesn't work - var RecurrentReinforceAgent = function(env, opt) { - this.gamma = getopt(opt, 'gamma', 0.5); // future reward discount factor - this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy - this.alpha = getopt(opt, 'alpha', 0.001); // actor net learning rate - this.beta = getopt(opt, 'beta', 0.01); // baseline net learning rate - this.env = env; - this.reset(); - } - RecurrentReinforceAgent.prototype = { - reset: function() { - this.ns = this.env.getNumStates(); - this.na = this.env.getMaxNumActions(); - this.nh = 40; // number of hidden units - this.nhb = 40; // 
and also in the baseline lstm - this.actorLSTM = R.initLSTM(this.ns, [this.nh], this.na); - this.actorG = new R.Graph(); - this.actorPrev = null; - this.actorOutputs = []; - this.rewardHistory = []; - this.actorActions = []; - this.baselineLSTM = R.initLSTM(this.ns, [this.nhb], 1); - this.baselineG = new R.Graph(); - this.baselinePrev = null; - this.baselineOutputs = []; - this.t = 0; - this.r0 = null; - this.s0 = null; - this.s1 = null; - this.a0 = null; - this.a1 = null; - }, - act: function(slist) { - // convert to a Mat column vector - var s = new R.Mat(this.ns, 1); - s.setFrom(slist); - // forward the LSTM to get action distribution - var actorNext = R.forwardLSTM(this.actorG, this.actorLSTM, [this.nh], s, this.actorPrev); - this.actorPrev = actorNext; - var amat = actorNext.o; - this.actorOutputs.push(amat); - // forward the baseline LSTM - var baselineNext = R.forwardLSTM(this.baselineG, this.baselineLSTM, [this.nhb], s, this.baselinePrev); - this.baselinePrev = baselineNext; - this.baselineOutputs.push(baselineNext.o); - // sample action from actor policy - var gaussVar = 0.05; - var a = R.copyMat(amat); - for (var i = 0, n = a.w.length; i < n; i++) { - a.w[0] += R.randn(0, gaussVar); - a.w[1] += R.randn(0, gaussVar); - } - this.actorActions.push(a); - // shift state memory - this.s0 = this.s1; - this.a0 = this.a1; - this.s1 = s; - this.a1 = a; - return a; - }, - learn: function(r1) { - // perform an update on Q function - this.rewardHistory.push(r1); - var n = this.rewardHistory.length; - var baselineMSE = 0.0; - var nup = 100; // what chunk of experience to take - var nuse = 80; // what chunk to also update - if (n >= nup) { - // lets learn and flush - // first: compute the sample values at all points - var vs = []; - for (var t = 0; t < nuse; t++) { - var mul = 1; - var V = 0; - for (var t2 = t; t2 < n; t2++) { - V += mul * this.rewardHistory[t2]; - mul *= this.gamma; - if (mul < 1e-5) { - break; - } // efficiency savings - } - var b = 
this.baselineOutputs[t].w[0]; - // todo: take out the constants etc. - for (var i = 0; i < this.na; i++) { - // [the action delta] * [the desirebility] - var update = -(V - b) * (this.actorActions[t].w[i] - this.actorOutputs[t].w[i]); - if (update > 0.1) { - update = 0.1; - } - if (update < -0.1) { - update = -0.1; - } - this.actorOutputs[t].dw[i] += update; - } - var update = -(V - b); - if (update > 0.1) { - update = 0.1; - } - if (update < 0.1) { - update = -0.1; - } - this.baselineOutputs[t].dw[0] += update; - baselineMSE += (V - b) * (V - b); - vs.push(V); - } - baselineMSE /= nuse; - this.actorG.backward(); // update params! woohoo! - this.baselineG.backward(); - R.updateNet(this.actorLSTM, this.alpha); // update actor network - R.updateNet(this.baselineLSTM, this.beta); // update baseline network - // flush - this.actorG = new R.Graph(); - this.actorPrev = null; - this.actorOutputs = []; - this.rewardHistory = []; - this.actorActions = []; - this.baselineG = new R.Graph(); - this.baselinePrev = null; - this.baselineOutputs = []; - this.tderror = baselineMSE; - } - this.t += 1; - this.r0 = r1; // store for next update - }, - } - // Currently buggy implementation, doesnt work - var DeterministPG = function(env, opt) { - this.gamma = getopt(opt, 'gamma', 0.5); // future reward discount factor - this.epsilon = getopt(opt, 'epsilon', 0.5); // for epsilon-greedy policy - this.alpha = getopt(opt, 'alpha', 0.001); // actor net learning rate - this.beta = getopt(opt, 'beta', 0.01); // baseline net learning rate - this.env = env; - this.reset(); - } - DeterministPG.prototype = { - reset: function() { - this.ns = this.env.getNumStates(); - this.na = this.env.getMaxNumActions(); - this.nh = 100; // number of hidden units - // actor - this.actorNet = {}; - this.actorNet.W1 = new R.RandMat(this.nh, this.ns, 0, 0.01); - this.actorNet.b1 = new R.Mat(this.nh, 1, 0, 0.01); - this.actorNet.W2 = new R.RandMat(this.na, this.ns, 0, 0.1); - this.actorNet.b2 = new R.Mat(this.na, 1, 
0, 0.01); - this.ntheta = this.na * this.ns + this.na; // number of params in actor - // critic - this.criticw = new R.RandMat(1, this.ntheta, 0, 0.01); // row vector - this.r0 = null; - this.s0 = null; - this.s1 = null; - this.a0 = null; - this.a1 = null; - this.t = 0; - }, - forwardActor: function(s, needs_backprop) { - var net = this.actorNet; - var G = new R.Graph(needs_backprop); - var a1mat = G.add(G.mul(net.W1, s), net.b1); - var h1mat = G.tanh(a1mat); - var a2mat = G.add(G.mul(net.W2, h1mat), net.b2); - return { - 'a': a2mat, - 'G': G - } - }, - act: function(slist) { - // convert to a Mat column vector - var s = new R.Mat(this.ns, 1); - s.setFrom(slist); - // forward the actor to get action output - var ans = this.forwardActor(s, false); - var amat = ans.a; - var ag = ans.G; - // sample action from the stochastic gaussian policy - var a = R.copyMat(amat); - if (Math.random() < this.epsilon) { - var gaussVar = 0.02; - a.w[0] = R.randn(0, gaussVar); - a.w[1] = R.randn(0, gaussVar); - } - var clamp = 0.25; - if (a.w[0] > clamp) a.w[0] = clamp; - if (a.w[0] < -clamp) a.w[0] = -clamp; - if (a.w[1] > clamp) a.w[1] = clamp; - if (a.w[1] < -clamp) a.w[1] = -clamp; - // shift state memory - this.s0 = this.s1; - this.a0 = this.a1; - this.s1 = s; - this.a1 = a; - return a; - }, - utilJacobianAt: function(s) { - var ujacobian = new R.Mat(this.ntheta, this.na); - for (var a = 0; a < this.na; a++) { - R.netZeroGrads(this.actorNet); - var ag = this.forwardActor(this.s0, true); - ag.a.dw[a] = 1.0; - ag.G.backward(); - var gflat = R.netFlattenGrads(this.actorNet); - ujacobian.setColumn(gflat, a); - } - return ujacobian; - }, - learn: function(r1) { - // perform an update on Q function - //this.rewardHistory.push(r1); - if (!(this.r0 == null)) { - var Gtmp = new R.Graph(false); - // dpg update: - // first compute the features psi: - // the jacobian matrix of the actor for s - var ujacobian0 = this.utilJacobianAt(this.s0); - // now form the features \psi(s,a) - var psi_sa0 = 
Gtmp.mul(ujacobian0, this.a0); // should be [this.ntheta x 1] "feature" vector - var qw0 = Gtmp.mul(this.criticw, psi_sa0); // 1x1 - // now do the same thing because we need \psi(s_{t+1}, \mu\_\theta(s\_t{t+1})) - var ujacobian1 = this.utilJacobianAt(this.s1); - var ag = this.forwardActor(this.s1, false); - var psi_sa1 = Gtmp.mul(ujacobian1, ag.a); - var qw1 = Gtmp.mul(this.criticw, psi_sa1); // 1x1 - // get the td error finally - var tderror = this.r0 + this.gamma * qw1.w[0] - qw0.w[0]; // lol - if (tderror > 0.5) tderror = 0.5; // clamp - if (tderror < -0.5) tderror = -0.5; - this.tderror = tderror; - // update actor policy with natural gradient - var net = this.actorNet; - var ix = 0; - for (var p in net) { - var mat = net[p]; - if (net.hasOwnProperty(p)) { - for (var i = 0, n = mat.w.length; i < n; i++) { - mat.w[i] += this.alpha * this.criticw.w[ix]; // natural gradient update - ix += 1; - } - } - } - // update the critic parameters too - for (var i = 0; i < this.ntheta; i++) { - var update = this.beta * tderror * psi_sa0.w[i]; - this.criticw.w[i] += update; - } - } - this.r0 = r1; // store for next update - }, - } - // exports - global.DPAgent = DPAgent; - global.TDAgent = TDAgent; - global.DQNAgent = DQNAgent; - //global.SimpleReinforceAgent = SimpleReinforceAgent; - //global.RecurrentReinforceAgent = RecurrentReinforceAgent; - //global.DeterministPG = DeterministPG; +(function (global) { + "use strict"; + // syntactic sugar function for getting default parameter values + var getopt = function (opt, field_name, default_value) { + if (typeof opt === 'undefined') { + return default_value; + } + return (typeof opt[field_name] !== 'undefined') ? 
opt[field_name] : default_value; + } + var zeros = R.zeros; // inherit these + var assert = R.assert; + var randi = R.randi; + var randf = R.randf; + var setConst = function (arr, c) { + for (var i = 0, n = arr.length; i < n; i++) { + arr[i] = c; + } + } + var sampleWeighted = function (p) { + var r = Math.random(); + var c = 0.0; + for (var i = 0, n = p.length; i < n; i++) { + c += p[i]; + if (c >= r) { + return i; + } + } + assert(false, 'wtf'); + } + // ------ + // AGENTS + // ------ + // DPAgent performs Value Iteration + // - can also be used for Policy Iteration if you really wanted to + // - requires model of the environment :( + // - does not learn from experience :( + // - assumes finite MDP :( + var DPAgent = function (env, opt) { + this.V = null; // state value function + this.P = null; // policy distribution \pi(s,a) + this.env = env; // store pointer to environment + this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor + this.reset(); + } + DPAgent.prototype = { + reset: function () { + // reset the agent's policy and value function + this.ns = this.env.getNumStates(); + this.na = this.env.getMaxNumActions(); + this.V = zeros(this.ns); + this.P = zeros(this.ns * this.na); + // initialize uniform random policy + for (var s = 0; s < this.ns; s++) { + var poss = this.env.allowedActions(s); + for (var i = 0, n = poss.length; i < n; i++) { + this.P[poss[i] * this.ns + s] = 1.0 / poss.length; + } + } + }, + act: function (s) { + // behave according to the learned policy + var poss = this.env.allowedActions(s); + var ps = []; + for (var i = 0, n = poss.length; i < n; i++) { + var a = poss[i]; + var prob = this.P[a * this.ns + s]; + ps.push(prob); + } + var maxi = sampleWeighted(ps); + return poss[maxi]; + }, + learn: function () { + // perform a single round of value iteration + self.evaluatePolicy(); // writes this.V + self.updatePolicy(); // writes this.P + }, + evaluatePolicy: function () { + // perform a synchronous update of the 
value function + var Vnew = zeros(this.ns); + for (var s = 0; s < this.ns; s++) { + // integrate over actions in a stochastic policy + // note that we assume that policy probability mass over allowed actions sums to one + var v = 0.0; + var poss = this.env.allowedActions(s); + for (var i = 0, n = poss.length; i < n; i++) { + var a = poss[i]; + var prob = this.P[a * this.ns + s]; // probability of taking action under policy + if (prob === 0) { + continue; + } // no contribution, skip for speed + var ns = this.env.nextStateDistribution(s, a); + var rs = this.env.reward(s, a, ns); // reward for s->a->ns transition + v += prob * (rs + this.gamma * this.V[ns]); + } + Vnew[s] = v; + } + this.V = Vnew; // swap + }, + updatePolicy: function () { + // update policy to be greedy w.r.t. learned Value function + for (var s = 0; s < this.ns; s++) { + var poss = this.env.allowedActions(s); + // compute value of taking each allowed action + var vmax, nmax; + var vs = []; + for (var i = 0, n = poss.length; i < n; i++) { + var a = poss[i]; + var ns = this.env.nextStateDistribution(s, a); + var rs = this.env.reward(s, a, ns); + var v = rs + this.gamma * this.V[ns]; + vs.push(v); + if (i === 0 || v > vmax) { + vmax = v; + nmax = 1; + } else if (v === vmax) { + nmax += 1; + } + } + // update policy smoothly across all argmaxy actions + for (var i = 0, n = poss.length; i < n; i++) { + var a = poss[i]; + this.P[a * this.ns + s] = (vs[i] === vmax) ? 
1.0 / nmax : 0.0; + } + } + }, + } + // QAgent uses TD (Q-Learning, SARSA) + // - does not require environment model :) + // - learns from experience :) + var TDAgent = function (env, opt) { + this.update = getopt(opt, 'update', 'qlearn'); // qlearn | sarsa + this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor + this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy + this.alpha = getopt(opt, 'alpha', 0.01); // value function learning rate + // class allows non-deterministic policy, and smoothly regressing towards the optimal policy based on Q + this.smooth_policy_update = getopt(opt, 'smooth_policy_update', false); + this.beta = getopt(opt, 'beta', 0.01); // learning rate for policy, if smooth updates are on + // eligibility traces + this.lambda = getopt(opt, 'lambda', 0); // eligibility trace decay. 0 = no eligibility traces used + this.replacing_traces = getopt(opt, 'replacing_traces', true); + // optional optimistic initial values + this.q_init_val = getopt(opt, 'q_init_val', 0); + this.planN = getopt(opt, 'planN', 0); // number of planning steps per learning iteration (0 = no planning) + this.Q = null; // state action value function + this.P = null; // policy distribution \pi(s,a) + this.e = null; // eligibility trace + this.env_model_s = null;; // environment model (s,a) -> (s',r) + this.env_model_r = null;; // environment model (s,a) -> (s',r) + this.env = env; // store pointer to environment + this.reset(); + } + TDAgent.prototype = { + reset: function () { + // reset the agent's policy and value function + this.ns = this.env.getNumStates(); + this.na = this.env.getMaxNumActions(); + this.Q = zeros(this.ns * this.na); + if (this.q_init_val !== 0) { + setConst(this.Q, this.q_init_val); + } + this.P = zeros(this.ns * this.na); + this.e = zeros(this.ns * this.na); + // model/planning vars + this.env_model_s = zeros(this.ns * this.na); + setConst(this.env_model_s, -1); // init to -1 so we can test if we saw the state 
before + this.env_model_r = zeros(this.ns * this.na); + this.sa_seen = []; + this.pq = zeros(this.ns * this.na); + // initialize uniform random policy + for (var s = 0; s < this.ns; s++) { + var poss = this.env.allowedActions(s); + for (var i = 0, n = poss.length; i < n; i++) { + this.P[poss[i] * this.ns + s] = 1.0 / poss.length; + } + } + // agent memory, needed for streaming updates + // (s0,a0,r0,s1,a1,r1,...) + this.r0 = null; + this.s0 = null; + this.s1 = null; + this.a0 = null; + this.a1 = null; + }, + resetEpisode: function () { + // an episode finished + }, + act: function (s) { + // act according to epsilon greedy policy + var poss = this.env.allowedActions(s); + var probs = []; + for (var i = 0, n = poss.length; i < n; i++) { + probs.push(this.P[poss[i] * this.ns + s]); + } + // epsilon greedy policy + if (Math.random() < this.epsilon) { + var a = poss[randi(0, poss.length)]; // random available action + this.explored = true; + } else { + var a = poss[sampleWeighted(probs)]; + this.explored = false; + } + // shift state memory + this.s0 = this.s1; + this.a0 = this.a1; + this.s1 = s; + this.a1 = a; + return a; + }, + learn: function (r1) { + // takes reward for previous action, which came from a call to act() + if (!(this.r0 == null)) { + this.learnFromTuple(this.s0, this.a0, this.r0, this.s1, this.a1, this.lambda); + if (this.planN > 0) { + this.updateModel(this.s0, this.a0, this.r0, this.s1); + this.plan(); + } + } + this.r0 = r1; // store this for next update + }, + updateModel: function (s0, a0, r0, s1) { + // transition (s0,a0) -> (r0,s1) was observed. 
Update environment model + var sa = a0 * this.ns + s0; + if (this.env_model_s[sa] === -1) { + // first time we see this state action + this.sa_seen.push(a0 * this.ns + s0); // add as seen state + } + this.env_model_s[sa] = s1; + this.env_model_r[sa] = r0; + }, + plan: function () { + // order the states based on current priority queue information + var spq = []; + for (var i = 0, n = this.sa_seen.length; i < n; i++) { + var sa = this.sa_seen[i]; + var sap = this.pq[sa]; + if (sap > 1e-5) { // gain a bit of efficiency + spq.push({ + sa: sa, + p: sap + }); + } + } + spq.sort(function (a, b) { + return a.p < b.p ? 1 : -1 + }); + // perform the updates + var nsteps = Math.min(this.planN, spq.length); + for (var k = 0; k < nsteps; k++) { + // random exploration + //var i = randi(0, this.sa_seen.length); // pick random prev seen state action + //var s0a0 = this.sa_seen[i]; + var s0a0 = spq[k].sa; + this.pq[s0a0] = 0; // erase priority, since we're backing up this state + var s0 = s0a0 % this.ns; + var a0 = Math.floor(s0a0 / this.ns); + var r0 = this.env_model_r[s0a0]; + var s1 = this.env_model_s[s0a0]; + var a1 = -1; // not used for Q learning + if (this.update === 'sarsa') { + // generate random action?... 
+ var poss = this.env.allowedActions(s1); + var a1 = poss[randi(0, poss.length)]; + } + this.learnFromTuple(s0, a0, r0, s1, a1, 0); // note lambda = 0 - shouldnt use eligibility trace here + } + }, + learnFromTuple: function (s0, a0, r0, s1, a1, lambda) { + var sa = a0 * this.ns + s0; + // calculate the target for Q(s,a) + if (this.update === 'qlearn') { + // Q learning target is Q(s0,a0) = r0 + gamma * max_a Q[s1,a] + var poss = this.env.allowedActions(s1); + var qmax = 0; + for (var i = 0, n = poss.length; i < n; i++) { + var s1a = poss[i] * this.ns + s1; + var qval = this.Q[s1a]; + if (i === 0 || qval > qmax) { + qmax = qval; + } + } + var target = r0 + this.gamma * qmax; + } else if (this.update === 'sarsa') { + // SARSA target is Q(s0,a0) = r0 + gamma * Q[s1,a1] + var s1a1 = a1 * this.ns + s1; + var target = r0 + this.gamma * this.Q[s1a1]; + } + if (lambda > 0) { + // perform an eligibility trace update + if (this.replacing_traces) { + this.e[sa] = 1; + } else { + this.e[sa] += 1; + } + var edecay = lambda * this.gamma; + var state_update = zeros(this.ns); + for (var s = 0; s < this.ns; s++) { + var poss = this.env.allowedActions(s); + for (var i = 0; i < poss.length; i++) { + var a = poss[i]; + var saloop = a * this.ns + s; + var esa = this.e[saloop]; + var update = this.alpha * esa * (target - this.Q[saloop]); + this.Q[saloop] += update; + this.updatePriority(s, a, update); + this.e[saloop] *= edecay; + var u = Math.abs(update); + if (u > state_update[s]) { + state_update[s] = u; + } + } + } + for (var s = 0; s < this.ns; s++) { + if (state_update[s] > 1e-5) { // save efficiency here + this.updatePolicy(s); + } + } + if (this.explored && this.update === 'qlearn') { + // have to wipe the trace since q learning is off-policy :( + this.e = zeros(this.ns * this.na); + } + } else { + // simpler and faster update without eligibility trace + // update Q[sa] towards it with some step size + var update = this.alpha * (target - this.Q[sa]); + this.Q[sa] += update; + 
this.updatePriority(s0, a0, update); + // update the policy to reflect the change (if appropriate) + this.updatePolicy(s0); + } + }, + updatePriority: function (s, a, u) { + // used in planning. Invoked when Q[sa] += update + // we should find all states that lead to (s,a) and upgrade their priority + // of being update in the next planning step + u = Math.abs(u); + if (u < 1e-5) { + return; + } // for efficiency skip small updates + if (this.planN === 0) { + return; + } // there is no planning to be done, skip. + for (var si = 0; si < this.ns; si++) { + // note we are also iterating over impossible actions at all states, + // but this should be okay because their env_model_s should simply be -1 + // as initialized, so they will never be predicted to point to any state + // because they will never be observed, and hence never be added to the model + for (var ai = 0; ai < this.na; ai++) { + var siai = ai * this.ns + si; + if (this.env_model_s[siai] === s) { + // this state leads to s, add it to priority queue + this.pq[siai] += u; + } + } + } + }, + updatePolicy: function (s) { + var poss = this.env.allowedActions(s); + // set policy at s to be the action that achieves max_a Q(s,a) + // first find the maxy Q values + var qmax, nmax; + var qs = []; + for (var i = 0, n = poss.length; i < n; i++) { + var a = poss[i]; + var qval = this.Q[a * this.ns + s]; + qs.push(qval); + if (i === 0 || qval > qmax) { + qmax = qval; + nmax = 1; + } else if (qval === qmax) { + nmax += 1; + } + } + // now update the policy smoothly towards the argmaxy actions + var psum = 0.0; + for (var i = 0, n = poss.length; i < n; i++) { + var a = poss[i]; + var target = (qs[i] === qmax) ? 
// DQNAgent: Q-learning agent whose Q function is a one-hidden-layer tanh network
// (W1, b1, W2, b2) evaluated through R.Graph, trained online plus from a small
// experience-replay buffer.
//   env - must provide getNumStates() and getMaxNumActions()
//   opt - optional overrides for the hyper-parameters read below
var DQNAgent = function (env, opt) {
  this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor
  this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy
  this.alpha = getopt(opt, 'alpha', 0.01); // value function learning rate
  // number of time steps before we add another experience to replay memory
  this.experience_add_every = getopt(opt, 'experience_add_every', 25);
  this.experience_size = getopt(opt, 'experience_size', 5000); // capacity of the replay memory
  this.learning_steps_per_iteration = getopt(opt, 'learning_steps_per_iteration', 10);
  this.tderror_clamp = getopt(opt, 'tderror_clamp', 1.0);
  this.num_hidden_units = getopt(opt, 'num_hidden_units', 100);
  this.env = env;
  this.reset();
};
DQNAgent.prototype = {
  // (Re)create the network with fresh weights and clear replay memory and the
  // (s0, a0, r0, s1, a1) streaming state.
  reset: function () {
    this.nh = this.num_hidden_units; // number of hidden units
    this.ns = this.env.getNumStates();
    this.na = this.env.getMaxNumActions();
    // nets are hardcoded for now as key (str) -> Mat
    this.net = {};
    this.net.W1 = new R.RandMat(this.nh, this.ns, 0, 0.01);
    // Biases are plain Mats; only (rows, cols) are passed because the Mat
    // constructor takes no further arguments.
    this.net.b1 = new R.Mat(this.nh, 1);
    this.net.W2 = new R.RandMat(this.na, this.nh, 0, 0.01);
    this.net.b2 = new R.Mat(this.na, 1);
    this.exp = []; // experience replay memory (ring buffer)
    this.expi = 0; // where to insert the next experience
    this.t = 0;
    this.r0 = null;
    this.s0 = null;
    this.s1 = null;
    this.a0 = null;
    this.a1 = null;
    this.tderror = 0; // for visualization only...
  },
  // Serialize the network sizes and weights.
  toJSON: function () {
    var j = {};
    j.nh = this.nh;
    j.ns = this.ns;
    j.na = this.na;
    j.net = R.netToJSON(this.net);
    return j;
  },
  // Restore network sizes and weights from an object produced by toJSON().
  fromJSON: function (j) {
    this.nh = j.nh;
    this.ns = j.ns;
    this.na = j.na;
    this.net = R.netFromJSON(j.net);
  },
  // Forward pass: returns the Mat W2 * tanh(W1 * s + b1) + b2.
  // The graph is stashed on this.lastG so learnFromTuple() can call backward() on it.
  forwardQ: function (net, s, needs_backprop) {
    var G = new R.Graph(needs_backprop);
    var a1mat = G.add(G.mul(net.W1, s), net.b1);
    var h1mat = G.tanh(a1mat);
    var a2mat = G.add(G.mul(net.W2, h1mat), net.b2);
    this.lastG = G; // back this up for the backward pass
    return a2mat;
  },
  // Pick an action index for the state given as a plain array `slist`
  // (epsilon-greedy), and shift the (s0, a0) <- (s1, a1) memory.
  act: function (slist) {
    // convert to a Mat column vector
    var s = new R.Mat(this.ns, 1);
    s.setFrom(slist);
    var a;
    if (Math.random() < this.epsilon) {
      a = randi(0, this.na); // explore: uniformly random action
    } else {
      // exploit: greedy wrt Q function
      var amat = this.forwardQ(this.net, s, false);
      a = R.maxi(amat.w); // index of argmax action
    }
    // shift state memory
    this.s0 = this.s1;
    this.a0 = this.a1;
    this.s1 = s;
    this.a1 = a;
    return a;
  },
  // Feed the reward `r1` for the most recent act(). The update itself uses the
  // previous reward (r0), because the successor state is only known after the
  // following act() call.
  learn: function (r1) {
    if (this.r0 != null && this.alpha > 0) {
      // learn from this tuple to get a sense of how "surprising" it is to the agent
      var tderror = this.learnFromTuple(this.s0, this.a0, this.r0, this.s1, this.a1);
      this.tderror = tderror; // a measure of surprise
      // decide if we should keep this experience in the replay
      if (this.t % this.experience_add_every === 0) {
        this.exp[this.expi] = [this.s0, this.a0, this.r0, this.s1, this.a1];
        this.expi += 1;
        // Roll over once the buffer is full. `>=` keeps the buffer at exactly
        // experience_size entries (a `>` test would let it grow by one).
        if (this.expi >= this.experience_size) {
          this.expi = 0;
        }
      }
      this.t += 1;
      // sample some additional experience from replay memory and learn from it
      if (this.exp.length > 0) { // nothing to replay until something was stored
        for (var k = 0; k < this.learning_steps_per_iteration; k++) {
          var ri = randi(0, this.exp.length); // todo: priority sweeps?
          var e = this.exp[ri];
          this.learnFromTuple(e[0], e[1], e[2], e[3], e[4]);
        }
      }
    }
    this.r0 = r1; // store for next update
  },
  // One Q-learning step on the transition (s0, a0, r0, s1). `a1` is accepted
  // for signature compatibility but is not used. Returns the clamped TD error.
  learnFromTuple: function (s0, a0, r0, s1, a1) {
    // want: Q(s,a) = r + gamma * max_a' Q(s',a')
    // compute the target Q value
    var tmat = this.forwardQ(this.net, s1, false);
    var qmax = r0 + this.gamma * tmat.w[R.maxi(tmat.w)];
    // now predict
    var pred = this.forwardQ(this.net, s0, true);
    var tderror = pred.w[a0] - qmax;
    // clamp the error to [-tderror_clamp, tderror_clamp] to robustify the update
    var clamp = this.tderror_clamp;
    if (tderror > clamp) {
      tderror = clamp;
    }
    if (tderror < -clamp) {
      tderror = -clamp;
    }
    pred.dw[a0] = tderror;
    this.lastG.backward(); // compute gradients on net params
    // update net
    R.updateNet(this.net, this.alpha);
    return tderror;
  }
};
this.baselineNet.W2 = new R.RandMat(this.na, this.nhb, 0, 0.01); + this.baselineNet.b2 = new R.Mat(this.na, 1, 0, 0.01); + this.baselineOutputs = []; + this.baselineGraphs = []; + this.t = 0; + }, + forwardActor: function (s, needs_backprop) { + var net = this.actorNet; + var G = new R.Graph(needs_backprop); + var a1mat = G.add(G.mul(net.W1, s), net.b1); + var h1mat = G.tanh(a1mat); + var a2mat = G.add(G.mul(net.W2, h1mat), net.b2); + return { + 'a': a2mat, + 'G': G + } + }, + forwardValue: function (s, needs_backprop) { + var net = this.baselineNet; + var G = new R.Graph(needs_backprop); + var a1mat = G.add(G.mul(net.W1, s), net.b1); + var h1mat = G.tanh(a1mat); + var a2mat = G.add(G.mul(net.W2, h1mat), net.b2); + return { + 'a': a2mat, + 'G': G + } + }, + act: function (slist) { + // convert to a Mat column vector + var s = new R.Mat(this.ns, 1); + s.setFrom(slist); + // forward the actor to get action output + var ans = this.forwardActor(s, true); + var amat = ans.a; + var ag = ans.G; + this.actorOutputs.push(amat); + this.actorGraphs.push(ag); + // forward the baseline estimator + var ans = this.forwardValue(s, true); + var vmat = ans.a; + var vg = ans.G; + this.baselineOutputs.push(vmat); + this.baselineGraphs.push(vg); + // sample action from the stochastic gaussian policy + var a = R.copyMat(amat); + var gaussVar = 0.02; + a.w[0] = R.randn(0, gaussVar); + a.w[1] = R.randn(0, gaussVar); + this.actorActions.push(a); + // shift state memory + this.s0 = this.s1; + this.a0 = this.a1; + this.s1 = s; + this.a1 = a; + return a; + }, + learn: function (r1) { + // perform an update on Q function + this.rewardHistory.push(r1); + var n = this.rewardHistory.length; + var baselineMSE = 0.0; + var nup = 100; // what chunk of experience to take + var nuse = 80; // what chunk to update from + if (n >= nup) { + // lets learn and flush + // first: compute the sample values at all points + var vs = []; + for (var t = 0; t < nuse; t++) { + var mul = 1; + // compute the actual 
// RecurrentReinforceAgent: policy-gradient agent with an LSTM actor and an LSTM
// baseline. The original author marks this implementation as buggy / not working,
// and its export is commented out in this file.
//   env - must provide getNumStates() and getMaxNumActions()
//   opt - optional overrides for gamma, epsilon, alpha and beta
var RecurrentReinforceAgent = function (env, opt) {
  this.gamma = getopt(opt, 'gamma', 0.5); // future reward discount factor
  this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy (not read by this agent)
  this.alpha = getopt(opt, 'alpha', 0.001); // actor net learning rate
  this.beta = getopt(opt, 'beta', 0.01); // baseline net learning rate
  this.env = env;
  this.reset();
};
RecurrentReinforceAgent.prototype = {
  // (Re)create both LSTMs, their graphs, and clear all per-chunk histories.
  reset: function () {
    this.ns = this.env.getNumStates();
    this.na = this.env.getMaxNumActions();
    this.nh = 40; // number of hidden units
    this.nhb = 40; // and also in the baseline lstm
    this.actorLSTM = R.initLSTM(this.ns, [this.nh], this.na);
    this.actorG = new R.Graph();
    this.actorPrev = null;
    this.actorOutputs = [];
    this.rewardHistory = [];
    this.actorActions = [];
    this.baselineLSTM = R.initLSTM(this.ns, [this.nhb], 1);
    this.baselineG = new R.Graph();
    this.baselinePrev = null;
    this.baselineOutputs = [];
    this.t = 0;
    this.r0 = null;
    this.s0 = null;
    this.s1 = null;
    this.a0 = null;
    this.a1 = null;
  },
  // Step both LSTMs on the state given as a plain array `slist` and return the
  // sampled action as a Mat: the actor output plus noise on every component.
  act: function (slist) {
    // convert to a Mat column vector
    var s = new R.Mat(this.ns, 1);
    s.setFrom(slist);
    // forward the LSTM to get action distribution
    var actorNext = R.forwardLSTM(this.actorG, this.actorLSTM, [this.nh], s, this.actorPrev);
    this.actorPrev = actorNext;
    var amat = actorNext.o;
    this.actorOutputs.push(amat);
    // forward the baseline LSTM
    var baselineNext = R.forwardLSTM(this.baselineG, this.baselineLSTM, [this.nhb], s, this.baselinePrev);
    this.baselinePrev = baselineNext;
    this.baselineOutputs.push(baselineNext.o);
    // sample action from actor policy: perturb each output component exactly once.
    // (Indexing with i, not the literals 0/1, so every component gets one noise
    // draw regardless of how many actions there are.)
    var gaussVar = 0.05; // passed to R.randn as its second argument
    var a = R.copyMat(amat);
    for (var i = 0, n = a.w.length; i < n; i++) {
      a.w[i] += R.randn(0, gaussVar);
    }
    this.actorActions.push(a);
    // shift state memory
    this.s0 = this.s1;
    this.a0 = this.a1;
    this.s1 = s;
    this.a1 = a;
    return a;
  },
  // Record reward `r1`. Once 100 rewards are buffered, compute discounted returns
  // for the first 80 steps, push clipped gradients into the actor and baseline
  // outputs, backprop both graphs, update both nets and flush the buffers.
  learn: function (r1) {
    this.rewardHistory.push(r1);
    var n = this.rewardHistory.length;
    var baselineMSE = 0.0;
    var nup = 100; // what chunk of experience to take
    var nuse = 80; // what chunk to also update
    var clip = 0.1; // symmetric bound applied to every injected gradient
    if (n >= nup) {
      // lets learn and flush
      for (var t = 0; t < nuse; t++) {
        // discounted return observed from time step t
        var mul = 1;
        var V = 0;
        for (var t2 = t; t2 < n; t2++) {
          V += mul * this.rewardHistory[t2];
          mul *= this.gamma;
          if (mul < 1e-5) {
            break;
          } // efficiency savings
        }
        var b = this.baselineOutputs[t].w[0];
        // todo: take out the constants etc.
        for (var i = 0; i < this.na; i++) {
          // [the action delta] * [the desirability]
          var actorUpdate = -(V - b) * (this.actorActions[t].w[i] - this.actorOutputs[t].w[i]);
          if (actorUpdate > clip) {
            actorUpdate = clip;
          }
          if (actorUpdate < -clip) {
            actorUpdate = -clip;
          }
          this.actorOutputs[t].dw[i] += actorUpdate;
        }
        // Baseline gradient, clipped to [-clip, clip]. The lower bound must be
        // compared against -clip; comparing against +clip would overwrite every
        // in-range value with -clip.
        var baselineUpdate = -(V - b);
        if (baselineUpdate > clip) {
          baselineUpdate = clip;
        }
        if (baselineUpdate < -clip) {
          baselineUpdate = -clip;
        }
        this.baselineOutputs[t].dw[0] += baselineUpdate;
        baselineMSE += (V - b) * (V - b);
      }
      baselineMSE /= nuse;
      this.actorG.backward(); // update params! woohoo!
      this.baselineG.backward();
      R.updateNet(this.actorLSTM, this.alpha); // update actor network
      R.updateNet(this.baselineLSTM, this.beta); // update baseline network
      // flush
      this.actorG = new R.Graph();
      this.actorPrev = null;
      this.actorOutputs = [];
      this.rewardHistory = [];
      this.actorActions = [];
      this.baselineG = new R.Graph();
      this.baselinePrev = null;
      this.baselineOutputs = [];
      this.tderror = baselineMSE;
    }
    this.t += 1;
    this.r0 = r1; // store for next update
  }
};
1, 0, 0.01); + this.ntheta = this.na * this.ns + this.na; // number of params in actor + // critic + this.criticw = new R.RandMat(1, this.ntheta, 0, 0.01); // row vector + this.r0 = null; + this.s0 = null; + this.s1 = null; + this.a0 = null; + this.a1 = null; + this.t = 0; + }, + forwardActor: function (s, needs_backprop) { + var net = this.actorNet; + var G = new R.Graph(needs_backprop); + var a1mat = G.add(G.mul(net.W1, s), net.b1); + var h1mat = G.tanh(a1mat); + var a2mat = G.add(G.mul(net.W2, h1mat), net.b2); + return { + 'a': a2mat, + 'G': G + } + }, + act: function (slist) { + // convert to a Mat column vector + var s = new R.Mat(this.ns, 1); + s.setFrom(slist); + // forward the actor to get action output + var ans = this.forwardActor(s, false); + var amat = ans.a; + var ag = ans.G; + // sample action from the stochastic gaussian policy + var a = R.copyMat(amat); + if (Math.random() < this.epsilon) { + var gaussVar = 0.02; + a.w[0] = R.randn(0, gaussVar); + a.w[1] = R.randn(0, gaussVar); + } + var clamp = 0.25; + if (a.w[0] > clamp) a.w[0] = clamp; + if (a.w[0] < -clamp) a.w[0] = -clamp; + if (a.w[1] > clamp) a.w[1] = clamp; + if (a.w[1] < -clamp) a.w[1] = -clamp; + // shift state memory + this.s0 = this.s1; + this.a0 = this.a1; + this.s1 = s; + this.a1 = a; + return a; + }, + utilJacobianAt: function (s) { + var ujacobian = new R.Mat(this.ntheta, this.na); + for (var a = 0; a < this.na; a++) { + R.netZeroGrads(this.actorNet); + var ag = this.forwardActor(this.s0, true); + ag.a.dw[a] = 1.0; + ag.G.backward(); + var gflat = R.netFlattenGrads(this.actorNet); + ujacobian.setColumn(gflat, a); + } + return ujacobian; + }, + learn: function (r1) { + // perform an update on Q function + //this.rewardHistory.push(r1); + if (!(this.r0 == null)) { + var Gtmp = new R.Graph(false); + // dpg update: + // first compute the features psi: + // the jacobian matrix of the actor for s + var ujacobian0 = this.utilJacobianAt(this.s0); + // now form the features \psi(s,a) + var 
psi_sa0 = Gtmp.mul(ujacobian0, this.a0); // should be [this.ntheta x 1] "feature" vector + var qw0 = Gtmp.mul(this.criticw, psi_sa0); // 1x1 + // now do the same thing because we need \psi(s_{t+1}, \mu\_\theta(s\_t{t+1})) + var ujacobian1 = this.utilJacobianAt(this.s1); + var ag = this.forwardActor(this.s1, false); + var psi_sa1 = Gtmp.mul(ujacobian1, ag.a); + var qw1 = Gtmp.mul(this.criticw, psi_sa1); // 1x1 + // get the td error finally + var tderror = this.r0 + this.gamma * qw1.w[0] - qw0.w[0]; // lol + if (tderror > 0.5) tderror = 0.5; // clamp + if (tderror < -0.5) tderror = -0.5; + this.tderror = tderror; + // update actor policy with natural gradient + var net = this.actorNet; + var ix = 0; + for (var p in net) { + var mat = net[p]; + if (net.hasOwnProperty(p)) { + for (var i = 0, n = mat.w.length; i < n; i++) { + mat.w[i] += this.alpha * this.criticw.w[ix]; // natural gradient update + ix += 1; + } + } + } + // update the critic parameters too + for (var i = 0; i < this.ntheta; i++) { + var update = this.beta * tderror * psi_sa0.w[i]; + this.criticw.w[i] += update; + } + } + this.r0 = r1; // store for next update + }, + } + // exports + global.DPAgent = DPAgent; + global.TDAgent = TDAgent; + global.DQNAgent = DQNAgent; + //global.SimpleReinforceAgent = SimpleReinforceAgent; + //global.RecurrentReinforceAgent = RecurrentReinforceAgent; + //global.DeterministPG = DeterministPG; })(RL); var Trevel = { - //settings you can change - stop: false, - maxBet: 0.00001, - minBet: 0.00000002, - swap: true, - betSpeed: 100,//change this on init - verbose: true, - isTesting: false, - //money management - useKelly: false,//martingale performs better on live account! 
- kellyPercent: 5, //can't be more than 100 or less than 1 - useMartingale: true, //if kelly is true this won't work - martingaleMultiplier: 2, - //bot settings, these are set automaticcally don't bother - currentBalance: 0, - startingBalance: 0, - betAmount: 0, - profit: 0, - totalBets: 0, - totalWins: 0, - winRate: 0, - betHistory: [], //this is a sequence of all winning bets not the sequence of bets we placed - betOutcomes: [], - hbProbability: 0, - lbProbability: 0, - hbCount: 0, - lbcount: 0, - nextBet: "", - previousReward:0, - addBet: function(bet, outcome) { - if (bet === "LB" && outcome === "Win") { - Trevel.betHistory.push("LO"); - Trevel.betOutcomes.push("W"); - Trevel.totalWins++; - Trevel.lbcount++; - } - if (bet === "LB" && outcome === "Loose") { - Trevel.betHistory.push("HI"); - Trevel.hbCount++; - Trevel.betOutcomes.push("L"); - } - if (bet === "HB" && outcome === "Win") { - Trevel.betHistory.push("HI"); - Trevel.totalWins++; - Trevel.hbCount++; - Trevel.betOutcomes.push("W"); - } - if (bet === "HB" && outcome === "Loose") { - Trevel.betHistory.push("LO"); - Trevel.lbcount++; - Trevel.betOutcomes.push("L"); - } - Trevel.totalBets++; - }, - calculateProbabilities: function() { - Trevel.hbProbability = Trevel.hbCount / Trevel.betHistory.length; - Trevel.lbProbability = Trevel.lbcount / Trevel.betHistory.length; - Trevel.winRate = Trevel.totalWins / Trevel.totalBets; - if(Trevel.isTesting===false){ - Trevel.profit = Trevel.getProfit(); - } - }, - getCurrentBalance: function() { - return parseFloat($('#balance').html()); - }, - placeHighBet: function() { - $('#double_your_btc_bet_hi_button').click(); - }, - placeLowBet: function() { - $('#double_your_btc_bet_lo_button').click(); - }, - setBetAmount: function(amount) { - var elem = document.getElementById("double_your_btc_stake"); - elem.value = amount; - }, - setOutcome: function(bet) { - if ($('#double_your_btc_bet_lose').html() !== '') { - Trevel.addBet(bet, "Loose"); - } else { - Trevel.addBet(bet, 
"Win"); - } - }, - prepareBet: function() { - Trevel.calculateProbabilities(); - if (Trevel.betHistory.length < 10) { - if (Trevel.useMartingale === true && Trevel.betHistory.length>12) { - if ($('#double_your_btc_bet_lose').html() !== '' && parseFloat($('#double_your_btc_stake').val()) * Trevel.martingaleMultiplier < Trevel.maxBet) { - Trevel.setBetAmount((parseFloat($('#double_your_btc_stake').val()) * Trevel.martingaleMultiplier).toFixed(8)); - } else { - Trevel.setBetAmount(Trevel.minBet); - } - } - } else { - if (Trevel.useKelly === true && Trevel.betHistory.length>12) { - Trevel.currentBalance = Trevel.getCurrentBalance(); - var currMulty = document.getElementById("double_your_btc_payout_multiplier").value; - var kellyAmount = (((Trevel.currentBalance * Trevel.kellyPercent) / 100) * ((Trevel.winRate * currMulty - 1)) / (currMulty - 1)).toFixed(8); - if (kellyAmount > 0 && kellyAmount < Trevel.maxBet) { - Trevel.setBetAmount(kellyAmount); - } else { - Trevel.setBetAmount(Trevel.minBet); - } - } else if (Trevel.useMartingale === true && Trevel.betHistory.length>12) { - if ($('#double_your_btc_bet_lose').html() !== '' && parseFloat($('#double_your_btc_stake').val()) * Trevel.martingaleMultiplier < Trevel.maxBet) { - Trevel.setBetAmount((parseFloat($('#double_your_btc_stake').val()) * Trevel.martingaleMultiplier).toFixed(8)); - } else { - Trevel.setBetAmount(Trevel.minBet); - } - } - } - }, - placeBet: function() { - if (Trevel.nextBet === "HB") { - Trevel.placeHighBet(); - } else if (Trevel.nextBet === "LB") { - Trevel.placeLowBet(); - } else if (Trevel.betHistory.length > 0 && Trevel.swap === true) { - var prev = Trevel.betHistory[Trevel.betHistory.length - 1]; - if (prev === "LO") { - Trevel.placeHighBet(); - } else { - Trevel.placeLowBet(); - } - } else { - Trevel.placeLowBet(); - } - }, - getProfit: function() { - return (Trevel.getCurrentBalance() - Trevel.startingBalance).toFixed(8); - }, - getNumStates: function() { - return 8; - }, - getMaxNumActions: 
function() { - return 2; - }, - getSentiment: function(bet) { - if (bet === "HI") { - return 1; - } else { - return 0; - } - }, - getPreviousBets: function() { - var hist = []; - if (Trevel.betHistory.length > 12) { - hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 1])); - hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 2])); - hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 3])); - hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 4])); - hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 5])); - hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 6])); - hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 7])); - hist.push(Trevel.getSentiment(Trevel.betHistory[Trevel.betHistory.length - 8])); - } else { - hist = [0, 1, 0, 1, 0, 1, 0, 1]; //incase we just started... - } - return hist; - }, - getAgentState: function() { //we'll observe the last 8 bets - var s = Trevel.getPreviousBets(); - return s; - }, - getReward: function() { - var reward = 0; - var out1=Trevel.betOutcomes[Trevel.betOutcomes.length - 1]; - var out2=Trevel.betOutcomes[Trevel.betOutcomes.length - 2]; - if(out1==="L"){ - if(Trevel.previousReward<0){ - reward=Trevel.previousReward; - reward+=-0.03; - if(out2==="L"){ - reward+=-0.03; - } - } - else{ - reward=-0.03; - if(out2==="L"){ - reward+=-0.03; - } - } - } - else{ - if(Trevel.previousReward>0){ - reward=Trevel.previousReward; - reward+=0.01; - if(out2==="W"){ - reward+=0.01; - } - } - else{ - reward=0.01; - if(out2==="W"){ - reward+=0.01; - } - } - } - return reward; - }, - //for raw testing only - randomNumber: function(min, max) { - return Math.floor(Math.random() * (max - min + 1) + min); - }, - getTestOutcome: function(random) { - if (random % 2 == 0) { - return "HI"; - } else { - return "LO"; - } - }, - //initialize Trevel - init: function() { - Trevel.startingBalance 
= Trevel.currentBalance = parseFloat($('#balance').html()); - Trevel.setBetAmount(Trevel.minBet); - Trevel.stop = false; - Trevel.swap = true; - Trevel.betSpeed=3000; - } + //settings you can change + stop: true, + maxBet: 0.00001, + minBet: 0.00000005, + swap: true, + betSpeed: 100,//change this on init + verbose: true, + isTesting: false, + //money management + useKelly: false,//martingale performs better on live account! + korm: false, + kellyPercent: 5, //can't be more than 100 or less than 1 + useMartingale: true, //if kelly is true this won't work + + martingaleMultiplier: 2, + //bot settings, these are set automaticcally don't bother + currentBalance: 0, + startingBalance: 0, + betAmount: 0, + profit: 0, + totalBets: 0, + totalWins: 0, + winRate: 0, + betHistory: [], //this is a sequence of all winning bets not the sequence of bets we placed + betOutcomes: [], + hbProbability: 0, + lbProbability: 0, + hbCount: 0, + lbcount: 0, + nextBet: "", + previousReward: 0, + addBet: function (bet, outcome) { + if (bet === "LB" && outcome === "Win") { + this.betHistory.push("LO"); + this.betOutcomes.push("W"); + this.totalWins++; + this.lbcount++; + } + if (bet === "LB" && outcome === "Loose") { + this.betHistory.push("HI"); + this.hbCount++; + this.betOutcomes.push("L"); + } + if (bet === "HB" && outcome === "Win") { + this.betHistory.push("HI"); + this.totalWins++; + this.hbCount++; + this.betOutcomes.push("W"); + } + if (bet === "HB" && outcome === "Loose") { + this.betHistory.push("LO"); + this.lbcount++; + this.betOutcomes.push("L"); + } + this.totalBets++; + }, + calculateProbabilities: function () { + this.hbProbability = this.hbCount / this.betHistory.length; + this.lbProbability = this.lbcount / this.betHistory.length; + this.winRate = this.totalWins / this.totalBets; + if (this.isTesting === false) { + this.profit = this.getProfit(); + } + }, + getCurrentBalance: function () { + return parseFloat($('#balance').html()); + }, + placeHighBet: function () { + 
$('#double_your_btc_bet_hi_button').click(); + }, + placeLowBet: function () { + $('#double_your_btc_bet_lo_button').click(); + }, + setBetAmount: function (amount) { + var elem = document.getElementById("double_your_btc_stake"); + elem.value = amount; + }, + setOutcome: function (bet) { + if ($('#double_your_btc_bet_lose').html() !== '') { + this.addBet(bet, "Loose"); + } else { + this.addBet(bet, "Win"); + } + }, + prepareBet: function () { + this.calculateProbabilities(); + if (this.betHistory.length < 10) { + if (this.useMartingale === true && this.betHistory.length > 12) { + if ($('#double_your_btc_bet_lose').html() !== '' && parseFloat($('#double_your_btc_stake').val()) * this.martingaleMultiplier < this.maxBet) { + this.setBetAmount((parseFloat($('#double_your_btc_stake').val()) * this.martingaleMultiplier).toFixed(8)); + } else { + this.setBetAmount(this.minBet); + } + } + } else { + if (this.useKelly === true && this.betHistory.length > 12) { + this.currentBalance = this.getCurrentBalance(); + var currMulty = document.getElementById("double_your_btc_payout_multiplier").value; + var kellyAmount = (((this.currentBalance * this.kellyPercent) / 100) * ((this.winRate * currMulty - 1)) / (currMulty - 1)).toFixed(8); + if (kellyAmount > 0 && kellyAmount < this.maxBet) { + this.setBetAmount(kellyAmount); + } else { + this.setBetAmount(this.minBet); + } + } else if (this.useMartingale === true && this.betHistory.length > 12) { + if ($('#double_your_btc_bet_lose').html() !== '' && parseFloat($('#double_your_btc_stake').val()) * this.martingaleMultiplier < this.maxBet) { + this.setBetAmount((parseFloat($('#double_your_btc_stake').val()) * this.martingaleMultiplier).toFixed(8)); + } else { + this.setBetAmount(this.minBet); + } + } + } + }, + placeBet: function () { + if (this.nextBet === "HB") { + this.placeHighBet(); + } else if (this.nextBet === "LB") { + this.placeLowBet(); + } else if (this.betHistory.length > 0 && this.swap === true) { + var prev = 
this.betHistory[this.betHistory.length - 1]; + if (prev === "LO") { + this.placeHighBet(); + } else { + this.placeLowBet(); + } + } else { + this.placeLowBet(); + } + }, + getProfit: function () { + return (this.getCurrentBalance() - this.startingBalance).toFixed(8); + }, + getNumStates: function () { + return 8; + }, + getMaxNumActions: function () { + return 2; + }, + getSentiment: function (bet) { + if (bet === "HI") { + return 1; + } else { + return 0; + } + }, + getPreviousBets: function () { + var hist = []; + if (this.betHistory.length > 12) { + hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 1])); + hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 2])); + hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 3])); + hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 4])); + hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 5])); + hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 6])); + hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 7])); + hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 8])); + } else { + hist = [0, 1, 0, 1, 0, 1, 0, 1]; //incase we just started... 
+ } + return hist; + }, + getAgentState: function () { //we'll observe the last 8 bets + var s = this.getPreviousBets(); + return s; + }, + getReward: function () { + var reward = 0; + var out1 = this.betOutcomes[this.betOutcomes.length - 1]; + var out2 = this.betOutcomes[this.betOutcomes.length - 2]; + if (out1 === "L") { + if (this.previousReward < 0) { + reward = this.previousReward; + reward += -0.03; + if (out2 === "L") { + reward += -0.03; + } + } + else { + reward = -0.03; + if (out2 === "L") { + reward += -0.03; + } + } + } + else { + if (this.previousReward > 0) { + reward = this.previousReward; + reward += 0.01; + if (out2 === "W") { + reward += 0.01; + } + } + else { + reward = 0.01; + if (out2 === "W") { + reward += 0.01; + } + } + } + return reward; + }, + //for raw testing only + randomNumber: function (min, max) { + return Math.floor(Math.random() * (max - min + 1) + min); + }, + getTestOutcome: function (random) { + if (random % 2 === 0) { + return "HI"; + } else { + return "LO"; + } + }, + //random string for random seed + rString: function (length, chars) { + var result = ''; + var length = 16; + var chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz' + for (var i = length; i > 0; --i) result += chars[Math.floor(Math.random() * chars.length)]; + return result; + }, + //set client seed as random string + rSeed: function () { + $('#next_client_seed').val(rString()); + }, + //initialize this + init: function () { + this.startingBalance = this.currentBalance = parseFloat($('#balance').html()); + this.setBetAmount(this.minBet); + this.stop = true; + this.swap = true; + this.betSpeed = 3000; + document.getElementById("free_play_link_li").innerHTML = 'START BOT'; + + }, + // ask user config variables + config: function () { + + //define temporary variables + var maxb = 0, minb = 0; + + //prompt questions + maxb = prompt('Maximum bet stake in Satoshi', 1000); + this.maxBet = sattobtc(maxb); + minb = prompt('Minimum bet stake in 
Satoshi', 2); + this.minBet = sattobtc(minb); + this.martingaleMultiplier = prompt('Bet multiplier on lose', 2); + this.swap = prompt('True for swap enabled, false for disabled', 'true'); + this.korm = prompt('True to enable Kelly, false to enabled martingale, leave blank for both', 'false'); + this.betSpeed = prompt('Wait time before next bet is placed in ms', 3000); + + //convert satoshi to btc + function sattobtc(sat) { + var btc = 0.00000001; + return sat * btc; + } + if (this.korm === 'true') { + this.useKelly = true; + this.useMartingale = false; + } + else if (this.korm === 'false') { + this.useMartingale = true; + this.useKelly = false; + } + else { + this.useKelly = this.useMartingale = true; + } + //start betting + startbetting(); + }, + stopbets: function () { + env.stop = true; + clearInterval(interval); + console.log('Bet session has been stopped, to start over click start.'); + document.getElementById("free_play_link_li").innerHTML = 'START BOT'; + + + } }; //Deep Q learning with reinforceJS -var spec = {} +var spec = {}; spec.update = 'qlearn'; spec.gamma = 0.9; spec.epsilon = 0.45; @@ -1741,67 +1800,87 @@ spec.alpha = 0.01; spec.experience_add_every = 12; spec.experience_size = 100000; spec.learning_steps_per_iteration = 24; -spec.tderror_clamp = 0.7; +spec.tderror_clamp = 0.7; spec.num_hidden_units = 24; // create an environment object var env = Trevel; +var interval = null; if (env.isTesting === false) { - env.init(); + env.init(); } // create the DQN agent agent = new RL.DQNAgent(env, spec); -setInterval(function() { - if (env.stop === false) { - var state = env.getAgentState(); - var action = agent.act(state); - var outcome = ""; - if (env.isTesting === false) { - if (action === 0) { - env.nextBet = "LB"; - env.prepareBet(); - env.placeBet(); - env.setOutcome("LB"); - outcome = env.betOutcomes[env.betOutcomes.length - 1]; - } else if (action === 1) { - env.nextBet = "HB"; - env.prepareBet(); - env.placeBet(); - env.setOutcome("HB"); - outcome = 
env.betOutcomes[env.betOutcomes.length - 1]; - } - if (env.verbose === true) { - env.calculateProbabilities(); - //console.log("Machine Bet: " + action + "{" + env.nextBet + "} isKelly: " + env.useKelly + " isMartingale: " + env.useMartingale); - console.log("Profit: " + env.profit+" WinRate: " + (env.winRate*100).toFixed(2)); - } - } else { - console.log("Action: " + action); - var testOutcome = env.getTestOutcome(env.randomNumber(0, 1000)); - if (action === 0 && testOutcome === "LO") { - env.addBet("LB", "Win"); - outcome = "W"; - } else if (action === 0 && testOutcome === "HI") { - env.addBet("LB", "Loose"); - outcome = "L"; - } else if (action === 1 && testOutcome === "HI") { - env.addBet("HB", "Win"); - outcome = "W"; - } else if (action === 1 && testOutcome === "LO") { - env.addBet("HB", "Loose"); - outcome = "L"; - } - env.calculateProbabilities(); - console.log("Winrate: " + (env.winRate*100).toFixed(2)); - } - var reward = env.getReward(); - if (reward == 0) { - if (outcome === "L") { - reward = -0.03; - } else { - reward = 0.01; - } - } - agent.learn(reward); - env.previousReward=reward; - } -}, env.betSpeed); \ No newline at end of file + +// start betting function/agent interval +function startbetting() { + console.log('Starting bet session, to stop click STOP BOT'); + document.getElementById("free_play_link_li").innerHTML = 'STOP BOT'; + env.stop = false; + interval = setInterval(function () { loop(); }, env.betSpeed); +} +console.clear(); +console.log('You are using Trevel, with ReinforceJS'); +console.log('If you shall notice, the Free BTC link has been replaced with a START/STOP BOT button.'); +console.log('Click it to set the config. 
Note: These settings are not persistent.'); +console.log('To change the default values for these settings, search the script for "prompt"'); +console.log('Enjoy'); +function loop() { +if(env.profit > 0.00000500) +{ + env.rSeed(); +} + if (env.stop === false) { + var state = env.getAgentState(); + var action = agent.act(state); + var outcome = ""; + if (env.isTesting === false) { + if (action === 0) { + env.nextBet = "LB"; + env.prepareBet(); + env.placeBet(); + env.setOutcome("LB"); + outcome = env.betOutcomes[env.betOutcomes.length - 1]; + } else if (action === 1) { + env.nextBet = "HB"; + env.prepareBet(); + env.placeBet(); + env.setOutcome("HB"); + outcome = env.betOutcomes[env.betOutcomes.length - 1]; + } + if (env.verbose === true) { + env.calculateProbabilities(); + //console.log("Machine Bet: " + action + "{" + env.nextBet + "} isKelly: " + env.useKelly + " isMartingale: " + env.useMartingale); + console.log("Profit: " + env.profit + " WinRate: " + (env.winRate * 100).toFixed(2)); + } + } else { + console.log("Action: " + action); + var testOutcome = env.getTestOutcome(env.randomNumber(0, 1000)); + if (action === 0 && testOutcome === "LO") { + env.addBet("LB", "Win"); + outcome = "W"; + } else if (action === 0 && testOutcome === "HI") { + env.addBet("LB", "Loose"); + outcome = "L"; + } else if (action === 1 && testOutcome === "HI") { + env.addBet("HB", "Win"); + outcome = "W"; + } else if (action === 1 && testOutcome === "LO") { + env.addBet("HB", "Loose"); + outcome = "L"; + } + env.calculateProbabilities(); + console.log("Winrate: " + (env.winRate * 100).toFixed(2)); + } + var reward = env.getReward(); + if (reward === 0) { + if (outcome === "L") { + reward = -0.03; + } else { + reward = 0.01; + } + } + agent.learn(reward); + env.previousReward = reward; + } + +} From 39ebf534a9ee60a4d4df5c4251fc55774d6231ed Mon Sep 17 00:00:00 2001 From: nickisghosty Date: Sun, 22 Oct 2017 07:24:46 -0700 Subject: [PATCH 2/6] fixed indentation --- DQ-Trevel.js | 3677 
+++++++++++++++++++++++++------------------------- 1 file changed, 1838 insertions(+), 1839 deletions(-) diff --git a/DQ-Trevel.js b/DQ-Trevel.js index b35d805..7a68dde 100644 --- a/DQ-Trevel.js +++ b/DQ-Trevel.js @@ -1,1795 +1,1795 @@ var R = {}; // the Recurrent library (function (global) { - "use strict"; - // Utility fun - function assert(condition, message) { - // from http://stackoverflow.com/questions/15313418/javascript-assert - if (!condition) { - message = message || "Assertion failed"; - if (typeof Error !== "undefined") { - throw new Error(message); - } - throw message; // Fallback - } - } - // Random numbers utils - var return_v = false; - var v_val = 0.0; - var gaussRandom = function () { - if (return_v) { - return_v = false; - return v_val; - } - var u = 2 * Math.random() - 1; - var v = 2 * Math.random() - 1; - var r = u * u + v * v; - if (r == 0 || r > 1) return gaussRandom(); - var c = Math.sqrt(-2 * Math.log(r) / r); - v_val = v * c; // cache this - return_v = true; - return u * c; - } - var randf = function (a, b) { - return Math.random() * (b - a) + a; - } - var randi = function (a, b) { - return Math.floor(Math.random() * (b - a) + a); - } - var randn = function (mu, std) { - return mu + gaussRandom() * std; - } - // helper function returns array of zeros of length n - // and uses typed arrays if available - var zeros = function (n) { - if (typeof (n) === 'undefined' || isNaN(n)) { - return []; - } - if (typeof ArrayBuffer === 'undefined') { - // lacking browser support - var arr = new Array(n); - for (var i = 0; i < n; i++) { - arr[i] = 0; - } - return arr; - } else { - return new Float64Array(n); - } - } - // Mat holds a matrix - var Mat = function (n, d) { - // n is number of rows d is number of columns - this.n = n; - this.d = d; - this.w = zeros(n * d); - this.dw = zeros(n * d); - } - Mat.prototype = { - get: function (row, col) { - // slow but careful accessor function - // we want row-major order - var ix = (this.d * row) + col; - 
assert(ix >= 0 && ix < this.w.length); - return this.w[ix]; - }, - set: function (row, col, v) { - // slow but careful accessor function - var ix = (this.d * row) + col; - assert(ix >= 0 && ix < this.w.length); - this.w[ix] = v; - }, - setFrom: function (arr) { - for (var i = 0, n = arr.length; i < n; i++) { - this.w[i] = arr[i]; - } - }, - setColumn: function (m, i) { - for (var q = 0, n = m.w.length; q < n; q++) { - this.w[(this.d * q) + i] = m.w[q]; - } - }, - toJSON: function () { - var json = {}; - json['n'] = this.n; - json['d'] = this.d; - json['w'] = this.w; - return json; - }, - fromJSON: function (json) { - this.n = json.n; - this.d = json.d; - this.w = zeros(this.n * this.d); - this.dw = zeros(this.n * this.d); - for (var i = 0, n = this.n * this.d; i < n; i++) { - this.w[i] = json.w[i]; // copy over weights - } - } - } - var copyMat = function (b) { - var a = new Mat(b.n, b.d); - a.setFrom(b.w); - return a; - } - var copyNet = function (net) { - // nets are (k,v) pairs with k = string key, v = Mat() - var new_net = {}; - for (var p in net) { - if (net.hasOwnProperty(p)) { - new_net[p] = copyMat(net[p]); - } - } - return new_net; - } - var updateMat = function (m, alpha) { - // updates in place - for (var i = 0, n = m.n * m.d; i < n; i++) { - if (m.dw[i] !== 0) { - m.w[i] += -alpha * m.dw[i]; - m.dw[i] = 0; - } - } - } - var updateNet = function (net, alpha) { - for (var p in net) { - if (net.hasOwnProperty(p)) { - updateMat(net[p], alpha); - } - } - } - var netToJSON = function (net) { - var j = {}; - for (var p in net) { - if (net.hasOwnProperty(p)) { - j[p] = net[p].toJSON(); - } - } - return j; - } - var netFromJSON = function (j) { - var net = {}; - for (var p in j) { - if (j.hasOwnProperty(p)) { - net[p] = new Mat(1, 1); // not proud of this - net[p].fromJSON(j[p]); - } - } - return net; - } - var netZeroGrads = function (net) { - for (var p in net) { - if (net.hasOwnProperty(p)) { - var mat = net[p]; - gradFillConst(mat, 0); - } - } - } - var 
netFlattenGrads = function (net) { - var n = 0; - for (var p in net) { - if (net.hasOwnProperty(p)) { - var mat = net[p]; - n += mat.dw.length; - } - } - var g = new Mat(n, 1); - var ix = 0; - for (var p in net) { - if (net.hasOwnProperty(p)) { - var mat = net[p]; - for (var i = 0, m = mat.dw.length; i < m; i++) { - g.w[ix] = mat.dw[i]; - ix++; - } - } - } - return g; - } - // return Mat but filled with random numbers from gaussian - var RandMat = function (n, d, mu, std) { - var m = new Mat(n, d); - fillRandn(m, mu, std); - //fillRand(m,-std,std); // kind of :P - return m; - } - // Mat utils - // fill matrix with random gaussian numbers - var fillRandn = function (m, mu, std) { - for (var i = 0, n = m.w.length; i < n; i++) { - m.w[i] = randn(mu, std); - } - } - var fillRand = function (m, lo, hi) { - for (var i = 0, n = m.w.length; i < n; i++) { - m.w[i] = randf(lo, hi); - } - } - var gradFillConst = function (m, c) { - for (var i = 0, n = m.dw.length; i < n; i++) { - m.dw[i] = c - } - } - // Transformer definitions - var Graph = function (needs_backprop) { - if (typeof needs_backprop === 'undefined') { - needs_backprop = true; - } - this.needs_backprop = needs_backprop; - // this will store a list of functions that perform backprop, - // in their forward pass order. So in backprop we will go - // backwards and evoke each one - this.backprop = []; - } - Graph.prototype = { - backward: function () { - for (var i = this.backprop.length - 1; i >= 0; i--) { - this.backprop[i](); // tick! 
- } - }, - rowPluck: function (m, ix) { - // pluck a row of m with index ix and return it as col vector - assert(ix >= 0 && ix < m.n); - var d = m.d; - var out = new Mat(d, 1); - for (var i = 0, n = d; i < n; i++) { - out.w[i] = m.w[d * ix + i]; - } // copy over the data - if (this.needs_backprop) { - var backward = function () { - for (var i = 0, n = d; i < n; i++) { - m.dw[d * ix + i] += out.dw[i]; - } - } - this.backprop.push(backward); - } - return out; - }, - tanh: function (m) { - // tanh nonlinearity - var out = new Mat(m.n, m.d); - var n = m.w.length; - for (var i = 0; i < n; i++) { - out.w[i] = Math.tanh(m.w[i]); - } - if (this.needs_backprop) { - var backward = function () { - for (var i = 0; i < n; i++) { - // grad for z = tanh(x) is (1 - z^2) - var mwi = out.w[i]; - m.dw[i] += (1.0 - mwi * mwi) * out.dw[i]; - } - } - this.backprop.push(backward); - } - return out; - }, - sigmoid: function (m) { - // sigmoid nonlinearity - var out = new Mat(m.n, m.d); - var n = m.w.length; - for (var i = 0; i < n; i++) { - out.w[i] = sig(m.w[i]); - } - if (this.needs_backprop) { - var backward = function () { - for (var i = 0; i < n; i++) { - // grad for z = tanh(x) is (1 - z^2) - var mwi = out.w[i]; - m.dw[i] += mwi * (1.0 - mwi) * out.dw[i]; - } - } - this.backprop.push(backward); - } - return out; - }, - relu: function (m) { - var out = new Mat(m.n, m.d); - var n = m.w.length; - for (var i = 0; i < n; i++) { - out.w[i] = Math.max(0, m.w[i]); // relu - } - if (this.needs_backprop) { - var backward = function () { - for (var i = 0; i < n; i++) { - m.dw[i] += m.w[i] > 0 ? 
out.dw[i] : 0.0; - } - } - this.backprop.push(backward); - } - return out; - }, - mul: function (m1, m2) { - // multiply matrices m1 * m2 - assert(m1.d === m2.n, 'matmul dimensions misaligned'); - var n = m1.n; - var d = m2.d; - var out = new Mat(n, d); - for (var i = 0; i < m1.n; i++) { // loop over rows of m1 - for (var j = 0; j < m2.d; j++) { // loop over cols of m2 - var dot = 0.0; - for (var k = 0; k < m1.d; k++) { // dot product loop - dot += m1.w[m1.d * i + k] * m2.w[m2.d * k + j]; - } - out.w[d * i + j] = dot; - } - } - if (this.needs_backprop) { - var backward = function () { - for (var i = 0; i < m1.n; i++) { // loop over rows of m1 - for (var j = 0; j < m2.d; j++) { // loop over cols of m2 - for (var k = 0; k < m1.d; k++) { // dot product loop - var b = out.dw[d * i + j]; - m1.dw[m1.d * i + k] += m2.w[m2.d * k + j] * b; - m2.dw[m2.d * k + j] += m1.w[m1.d * i + k] * b; - } - } - } - } - this.backprop.push(backward); - } - return out; - }, - add: function (m1, m2) { - assert(m1.w.length === m2.w.length); - var out = new Mat(m1.n, m1.d); - for (var i = 0, n = m1.w.length; i < n; i++) { - out.w[i] = m1.w[i] + m2.w[i]; - } - if (this.needs_backprop) { - var backward = function () { - for (var i = 0, n = m1.w.length; i < n; i++) { - m1.dw[i] += out.dw[i]; - m2.dw[i] += out.dw[i]; - } - } - this.backprop.push(backward); - } - return out; - }, - dot: function (m1, m2) { - // m1 m2 are both column vectors - assert(m1.w.length === m2.w.length); - var out = new Mat(1, 1); - var dot = 0.0; - for (var i = 0, n = m1.w.length; i < n; i++) { - dot += m1.w[i] * m2.w[i]; - } - out.w[0] = dot; - if (this.needs_backprop) { - var backward = function () { - for (var i = 0, n = m1.w.length; i < n; i++) { - m1.dw[i] += m2.w[i] * out.dw[0]; - m2.dw[i] += m1.w[i] * out.dw[0]; - } - } - this.backprop.push(backward); - } - return out; - }, - eltmul: function (m1, m2) { - assert(m1.w.length === m2.w.length); - var out = new Mat(m1.n, m1.d); - for (var i = 0, n = m1.w.length; i < n; 
i++) { - out.w[i] = m1.w[i] * m2.w[i]; - } - if (this.needs_backprop) { - var backward = function () { - for (var i = 0, n = m1.w.length; i < n; i++) { - m1.dw[i] += m2.w[i] * out.dw[i]; - m2.dw[i] += m1.w[i] * out.dw[i]; - } - } - this.backprop.push(backward); - } - return out; - }, - } - var softmax = function (m) { - var out = new Mat(m.n, m.d); // probability volume - var maxval = -999999; - for (var i = 0, n = m.w.length; i < n; i++) { - if (m.w[i] > maxval) maxval = m.w[i]; - } - var s = 0.0; - for (var i = 0, n = m.w.length; i < n; i++) { - out.w[i] = Math.exp(m.w[i] - maxval); - s += out.w[i]; - } - for (var i = 0, n = m.w.length; i < n; i++) { - out.w[i] /= s; - } - // no backward pass here needed - // since we will use the computed probabilities outside - // to set gradients directly on m - return out; - } - var Solver = function () { - this.decay_rate = 0.999; - this.smooth_eps = 1e-8; - this.step_cache = {}; - } - Solver.prototype = { - step: function (model, step_size, regc, clipval) { - // perform parameter update - var solver_stats = {}; - var num_clipped = 0; - var num_tot = 0; - for (var k in model) { - if (model.hasOwnProperty(k)) { - var m = model[k]; // mat ref - if (!(k in this.step_cache)) { - this.step_cache[k] = new Mat(m.n, m.d); - } - var s = this.step_cache[k]; - for (var i = 0, n = m.w.length; i < n; i++) { - // rmsprop adaptive learning rate - var mdwi = m.dw[i]; - s.w[i] = s.w[i] * this.decay_rate + (1.0 - this.decay_rate) * mdwi * mdwi; - // gradient clip - if (mdwi > clipval) { - mdwi = clipval; - num_clipped++; - } - if (mdwi < -clipval) { - mdwi = -clipval; - num_clipped++; - } - num_tot++; - // update (and regularize) - m.w[i] += -step_size * mdwi / Math.sqrt(s.w[i] + this.smooth_eps) - regc * m.w[i]; - m.dw[i] = 0; // reset gradients for next iteration - } - } - } - solver_stats['ratio_clipped'] = num_clipped * 1.0 / num_tot; - return solver_stats; - } - } - var initLSTM = function (input_size, hidden_sizes, output_size) { - // 
hidden size should be a list - var model = {}; - for (var d = 0; d < hidden_sizes.length; d++) { // loop over depths - var prev_size = d === 0 ? input_size : hidden_sizes[d - 1]; - var hidden_size = hidden_sizes[d]; - // gates parameters - model['Wix' + d] = new RandMat(hidden_size, prev_size, 0, 0.08); - model['Wih' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08); - model['bi' + d] = new Mat(hidden_size, 1); - model['Wfx' + d] = new RandMat(hidden_size, prev_size, 0, 0.08); - model['Wfh' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08); - model['bf' + d] = new Mat(hidden_size, 1); - model['Wox' + d] = new RandMat(hidden_size, prev_size, 0, 0.08); - model['Woh' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08); - model['bo' + d] = new Mat(hidden_size, 1); - // cell write params - model['Wcx' + d] = new RandMat(hidden_size, prev_size, 0, 0.08); - model['Wch' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08); - model['bc' + d] = new Mat(hidden_size, 1); - } - // decoder params - model['Whd'] = new RandMat(output_size, hidden_size, 0, 0.08); - model['bd'] = new Mat(output_size, 1); - return model; - } - var forwardLSTM = function (G, model, hidden_sizes, x, prev) { - // forward prop for a single tick of LSTM - // G is graph to append ops to - // model contains LSTM parameters - // x is 1D column vector with observation - // prev is a struct containing hidden and cell - // from previous iteration - if (prev == null || typeof prev.h === 'undefined') { - var hidden_prevs = []; - var cell_prevs = []; - for (var d = 0; d < hidden_sizes.length; d++) { - hidden_prevs.push(new R.Mat(hidden_sizes[d], 1)); - cell_prevs.push(new R.Mat(hidden_sizes[d], 1)); - } - } else { - var hidden_prevs = prev.h; - var cell_prevs = prev.c; - } - var hidden = []; - var cell = []; - for (var d = 0; d < hidden_sizes.length; d++) { - var input_vector = d === 0 ? 
x : hidden[d - 1]; - var hidden_prev = hidden_prevs[d]; - var cell_prev = cell_prevs[d]; - // input gate - var h0 = G.mul(model['Wix' + d], input_vector); - var h1 = G.mul(model['Wih' + d], hidden_prev); - var input_gate = G.sigmoid(G.add(G.add(h0, h1), model['bi' + d])); - // forget gate - var h2 = G.mul(model['Wfx' + d], input_vector); - var h3 = G.mul(model['Wfh' + d], hidden_prev); - var forget_gate = G.sigmoid(G.add(G.add(h2, h3), model['bf' + d])); - // output gate - var h4 = G.mul(model['Wox' + d], input_vector); - var h5 = G.mul(model['Woh' + d], hidden_prev); - var output_gate = G.sigmoid(G.add(G.add(h4, h5), model['bo' + d])); - // write operation on cells - var h6 = G.mul(model['Wcx' + d], input_vector); - var h7 = G.mul(model['Wch' + d], hidden_prev); - var cell_write = G.tanh(G.add(G.add(h6, h7), model['bc' + d])); - // compute new cell activation - var retain_cell = G.eltmul(forget_gate, cell_prev); // what do we keep from cell - var write_cell = G.eltmul(input_gate, cell_write); // what do we write to cell - var cell_d = G.add(retain_cell, write_cell); // new cell contents - // compute hidden state as gated, saturated cell activations - var hidden_d = G.eltmul(output_gate, G.tanh(cell_d)); - hidden.push(hidden_d); - cell.push(cell_d); - } - // one decoder to outputs at end - var output = G.add(G.mul(model['Whd'], hidden[hidden.length - 1]), model['bd']); - // return cell memory, hidden representation and output - return { - 'h': hidden, - 'c': cell, - 'o': output - }; - } - var sig = function (x) { - // helper function for computing sigmoid - return 1.0 / (1 + Math.exp(-x)); - } - var maxi = function (w) { - // argmax of array w - var maxv = w[0]; - var maxix = 0; - for (var i = 1, n = w.length; i < n; i++) { - var v = w[i]; - if (v > maxv) { - maxix = i; - maxv = v; - } - } - return maxix; - } - var samplei = function (w) { - // sample argmax from w, assuming w are - // probabilities that sum to one - var r = randf(0, 1); - var x = 0.0; - var i = 0; 
- while (true) { - x += w[i]; - if (x > r) { - return i; - } - i++; - } - return w.length - 1; // pretty sure we should never get here? - } - // various utils - global.assert = assert; - global.zeros = zeros; - global.maxi = maxi; - global.samplei = samplei; - global.randi = randi; - global.randn = randn; - global.softmax = softmax; - // classes - global.Mat = Mat; - global.RandMat = RandMat; - global.forwardLSTM = forwardLSTM; - global.initLSTM = initLSTM; - // more utils - global.updateMat = updateMat; - global.updateNet = updateNet; - global.copyMat = copyMat; - global.copyNet = copyNet; - global.netToJSON = netToJSON; - global.netFromJSON = netFromJSON; - global.netZeroGrads = netZeroGrads; - global.netFlattenGrads = netFlattenGrads; - // optimization - global.Solver = Solver; - global.Graph = Graph; + "use strict"; + // Utility fun + function assert(condition, message) { + // from http://stackoverflow.com/questions/15313418/javascript-assert + if (!condition) { + message = message || "Assertion failed"; + if (typeof Error !== "undefined") { + throw new Error(message); + } + throw message; // Fallback + } + } + // Random numbers utils + var return_v = false; + var v_val = 0.0; + var gaussRandom = function () { + if (return_v) { + return_v = false; + return v_val; + } + var u = 2 * Math.random() - 1; + var v = 2 * Math.random() - 1; + var r = u * u + v * v; + if (r == 0 || r > 1) return gaussRandom(); + var c = Math.sqrt(-2 * Math.log(r) / r); + v_val = v * c; // cache this + return_v = true; + return u * c; + } + var randf = function (a, b) { + return Math.random() * (b - a) + a; + } + var randi = function (a, b) { + return Math.floor(Math.random() * (b - a) + a); + } + var randn = function (mu, std) { + return mu + gaussRandom() * std; + } + // helper function returns array of zeros of length n + // and uses typed arrays if available + var zeros = function (n) { + if (typeof (n) === 'undefined' || isNaN(n)) { + return []; + } + if (typeof ArrayBuffer === 
'undefined') { + // lacking browser support + var arr = new Array(n); + for (var i = 0; i < n; i++) { + arr[i] = 0; + } + return arr; + } else { + return new Float64Array(n); + } + } + // Mat holds a matrix + var Mat = function (n, d) { + // n is number of rows d is number of columns + this.n = n; + this.d = d; + this.w = zeros(n * d); + this.dw = zeros(n * d); + } + Mat.prototype = { + get: function (row, col) { + // slow but careful accessor function + // we want row-major order + var ix = (this.d * row) + col; + assert(ix >= 0 && ix < this.w.length); + return this.w[ix]; + }, + set: function (row, col, v) { + // slow but careful accessor function + var ix = (this.d * row) + col; + assert(ix >= 0 && ix < this.w.length); + this.w[ix] = v; + }, + setFrom: function (arr) { + for (var i = 0, n = arr.length; i < n; i++) { + this.w[i] = arr[i]; + } + }, + setColumn: function (m, i) { + for (var q = 0, n = m.w.length; q < n; q++) { + this.w[(this.d * q) + i] = m.w[q]; + } + }, + toJSON: function () { + var json = {}; + json['n'] = this.n; + json['d'] = this.d; + json['w'] = this.w; + return json; + }, + fromJSON: function (json) { + this.n = json.n; + this.d = json.d; + this.w = zeros(this.n * this.d); + this.dw = zeros(this.n * this.d); + for (var i = 0, n = this.n * this.d; i < n; i++) { + this.w[i] = json.w[i]; // copy over weights + } + } + } + var copyMat = function (b) { + var a = new Mat(b.n, b.d); + a.setFrom(b.w); + return a; + } + var copyNet = function (net) { + // nets are (k,v) pairs with k = string key, v = Mat() + var new_net = {}; + for (var p in net) { + if (net.hasOwnProperty(p)) { + new_net[p] = copyMat(net[p]); + } + } + return new_net; + } + var updateMat = function (m, alpha) { + // updates in place + for (var i = 0, n = m.n * m.d; i < n; i++) { + if (m.dw[i] !== 0) { + m.w[i] += -alpha * m.dw[i]; + m.dw[i] = 0; + } + } + } + var updateNet = function (net, alpha) { + for (var p in net) { + if (net.hasOwnProperty(p)) { + updateMat(net[p], alpha); + 
} + } + } + var netToJSON = function (net) { + var j = {}; + for (var p in net) { + if (net.hasOwnProperty(p)) { + j[p] = net[p].toJSON(); + } + } + return j; + } + var netFromJSON = function (j) { + var net = {}; + for (var p in j) { + if (j.hasOwnProperty(p)) { + net[p] = new Mat(1, 1); // not proud of this + net[p].fromJSON(j[p]); + } + } + return net; + } + var netZeroGrads = function (net) { + for (var p in net) { + if (net.hasOwnProperty(p)) { + var mat = net[p]; + gradFillConst(mat, 0); + } + } + } + var netFlattenGrads = function (net) { + var n = 0; + for (var p in net) { + if (net.hasOwnProperty(p)) { + var mat = net[p]; + n += mat.dw.length; + } + } + var g = new Mat(n, 1); + var ix = 0; + for (var p in net) { + if (net.hasOwnProperty(p)) { + var mat = net[p]; + for (var i = 0, m = mat.dw.length; i < m; i++) { + g.w[ix] = mat.dw[i]; + ix++; + } + } + } + return g; + } + // return Mat but filled with random numbers from gaussian + var RandMat = function (n, d, mu, std) { + var m = new Mat(n, d); + fillRandn(m, mu, std); + //fillRand(m,-std,std); // kind of :P + return m; + } + // Mat utils + // fill matrix with random gaussian numbers + var fillRandn = function (m, mu, std) { + for (var i = 0, n = m.w.length; i < n; i++) { + m.w[i] = randn(mu, std); + } + } + var fillRand = function (m, lo, hi) { + for (var i = 0, n = m.w.length; i < n; i++) { + m.w[i] = randf(lo, hi); + } + } + var gradFillConst = function (m, c) { + for (var i = 0, n = m.dw.length; i < n; i++) { + m.dw[i] = c + } + } + // Transformer definitions + var Graph = function (needs_backprop) { + if (typeof needs_backprop === 'undefined') { + needs_backprop = true; + } + this.needs_backprop = needs_backprop; + // this will store a list of functions that perform backprop, + // in their forward pass order. 
So in backprop we will go + // backwards and evoke each one + this.backprop = []; + } + Graph.prototype = { + backward: function () { + for (var i = this.backprop.length - 1; i >= 0; i--) { + this.backprop[i](); // tick! + } + }, + rowPluck: function (m, ix) { + // pluck a row of m with index ix and return it as col vector + assert(ix >= 0 && ix < m.n); + var d = m.d; + var out = new Mat(d, 1); + for (var i = 0, n = d; i < n; i++) { + out.w[i] = m.w[d * ix + i]; + } // copy over the data + if (this.needs_backprop) { + var backward = function () { + for (var i = 0, n = d; i < n; i++) { + m.dw[d * ix + i] += out.dw[i]; + } + } + this.backprop.push(backward); + } + return out; + }, + tanh: function (m) { + // tanh nonlinearity + var out = new Mat(m.n, m.d); + var n = m.w.length; + for (var i = 0; i < n; i++) { + out.w[i] = Math.tanh(m.w[i]); + } + if (this.needs_backprop) { + var backward = function () { + for (var i = 0; i < n; i++) { + // grad for z = tanh(x) is (1 - z^2) + var mwi = out.w[i]; + m.dw[i] += (1.0 - mwi * mwi) * out.dw[i]; + } + } + this.backprop.push(backward); + } + return out; + }, + sigmoid: function (m) { + // sigmoid nonlinearity + var out = new Mat(m.n, m.d); + var n = m.w.length; + for (var i = 0; i < n; i++) { + out.w[i] = sig(m.w[i]); + } + if (this.needs_backprop) { + var backward = function () { + for (var i = 0; i < n; i++) { + // grad for z = tanh(x) is (1 - z^2) + var mwi = out.w[i]; + m.dw[i] += mwi * (1.0 - mwi) * out.dw[i]; + } + } + this.backprop.push(backward); + } + return out; + }, + relu: function (m) { + var out = new Mat(m.n, m.d); + var n = m.w.length; + for (var i = 0; i < n; i++) { + out.w[i] = Math.max(0, m.w[i]); // relu + } + if (this.needs_backprop) { + var backward = function () { + for (var i = 0; i < n; i++) { + m.dw[i] += m.w[i] > 0 ? 
out.dw[i] : 0.0; + } + } + this.backprop.push(backward); + } + return out; + }, + mul: function (m1, m2) { + // multiply matrices m1 * m2 + assert(m1.d === m2.n, 'matmul dimensions misaligned'); + var n = m1.n; + var d = m2.d; + var out = new Mat(n, d); + for (var i = 0; i < m1.n; i++) { // loop over rows of m1 + for (var j = 0; j < m2.d; j++) { // loop over cols of m2 + var dot = 0.0; + for (var k = 0; k < m1.d; k++) { // dot product loop + dot += m1.w[m1.d * i + k] * m2.w[m2.d * k + j]; + } + out.w[d * i + j] = dot; + } + } + if (this.needs_backprop) { + var backward = function () { + for (var i = 0; i < m1.n; i++) { // loop over rows of m1 + for (var j = 0; j < m2.d; j++) { // loop over cols of m2 + for (var k = 0; k < m1.d; k++) { // dot product loop + var b = out.dw[d * i + j]; + m1.dw[m1.d * i + k] += m2.w[m2.d * k + j] * b; + m2.dw[m2.d * k + j] += m1.w[m1.d * i + k] * b; + } + } + } + } + this.backprop.push(backward); + } + return out; + }, + add: function (m1, m2) { + assert(m1.w.length === m2.w.length); + var out = new Mat(m1.n, m1.d); + for (var i = 0, n = m1.w.length; i < n; i++) { + out.w[i] = m1.w[i] + m2.w[i]; + } + if (this.needs_backprop) { + var backward = function () { + for (var i = 0, n = m1.w.length; i < n; i++) { + m1.dw[i] += out.dw[i]; + m2.dw[i] += out.dw[i]; + } + } + this.backprop.push(backward); + } + return out; + }, + dot: function (m1, m2) { + // m1 m2 are both column vectors + assert(m1.w.length === m2.w.length); + var out = new Mat(1, 1); + var dot = 0.0; + for (var i = 0, n = m1.w.length; i < n; i++) { + dot += m1.w[i] * m2.w[i]; + } + out.w[0] = dot; + if (this.needs_backprop) { + var backward = function () { + for (var i = 0, n = m1.w.length; i < n; i++) { + m1.dw[i] += m2.w[i] * out.dw[0]; + m2.dw[i] += m1.w[i] * out.dw[0]; + } + } + this.backprop.push(backward); + } + return out; + }, + eltmul: function (m1, m2) { + assert(m1.w.length === m2.w.length); + var out = new Mat(m1.n, m1.d); + for (var i = 0, n = m1.w.length; i < n; 
i++) { + out.w[i] = m1.w[i] * m2.w[i]; + } + if (this.needs_backprop) { + var backward = function () { + for (var i = 0, n = m1.w.length; i < n; i++) { + m1.dw[i] += m2.w[i] * out.dw[i]; + m2.dw[i] += m1.w[i] * out.dw[i]; + } + } + this.backprop.push(backward); + } + return out; + }, + } + var softmax = function (m) { + var out = new Mat(m.n, m.d); // probability volume + var maxval = -999999; + for (var i = 0, n = m.w.length; i < n; i++) { + if (m.w[i] > maxval) maxval = m.w[i]; + } + var s = 0.0; + for (var i = 0, n = m.w.length; i < n; i++) { + out.w[i] = Math.exp(m.w[i] - maxval); + s += out.w[i]; + } + for (var i = 0, n = m.w.length; i < n; i++) { + out.w[i] /= s; + } + // no backward pass here needed + // since we will use the computed probabilities outside + // to set gradients directly on m + return out; + } + var Solver = function () { + this.decay_rate = 0.999; + this.smooth_eps = 1e-8; + this.step_cache = {}; + } + Solver.prototype = { + step: function (model, step_size, regc, clipval) { + // perform parameter update + var solver_stats = {}; + var num_clipped = 0; + var num_tot = 0; + for (var k in model) { + if (model.hasOwnProperty(k)) { + var m = model[k]; // mat ref + if (!(k in this.step_cache)) { + this.step_cache[k] = new Mat(m.n, m.d); + } + var s = this.step_cache[k]; + for (var i = 0, n = m.w.length; i < n; i++) { + // rmsprop adaptive learning rate + var mdwi = m.dw[i]; + s.w[i] = s.w[i] * this.decay_rate + (1.0 - this.decay_rate) * mdwi * mdwi; + // gradient clip + if (mdwi > clipval) { + mdwi = clipval; + num_clipped++; + } + if (mdwi < -clipval) { + mdwi = -clipval; + num_clipped++; + } + num_tot++; + // update (and regularize) + m.w[i] += -step_size * mdwi / Math.sqrt(s.w[i] + this.smooth_eps) - regc * m.w[i]; + m.dw[i] = 0; // reset gradients for next iteration + } + } + } + solver_stats['ratio_clipped'] = num_clipped * 1.0 / num_tot; + return solver_stats; + } + } + var initLSTM = function (input_size, hidden_sizes, output_size) { + // 
hidden size should be a list + var model = {}; + for (var d = 0; d < hidden_sizes.length; d++) { // loop over depths + var prev_size = d === 0 ? input_size : hidden_sizes[d - 1]; + var hidden_size = hidden_sizes[d]; + // gates parameters + model['Wix' + d] = new RandMat(hidden_size, prev_size, 0, 0.08); + model['Wih' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08); + model['bi' + d] = new Mat(hidden_size, 1); + model['Wfx' + d] = new RandMat(hidden_size, prev_size, 0, 0.08); + model['Wfh' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08); + model['bf' + d] = new Mat(hidden_size, 1); + model['Wox' + d] = new RandMat(hidden_size, prev_size, 0, 0.08); + model['Woh' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08); + model['bo' + d] = new Mat(hidden_size, 1); + // cell write params + model['Wcx' + d] = new RandMat(hidden_size, prev_size, 0, 0.08); + model['Wch' + d] = new RandMat(hidden_size, hidden_size, 0, 0.08); + model['bc' + d] = new Mat(hidden_size, 1); + } + // decoder params + model['Whd'] = new RandMat(output_size, hidden_size, 0, 0.08); + model['bd'] = new Mat(output_size, 1); + return model; + } + var forwardLSTM = function (G, model, hidden_sizes, x, prev) { + // forward prop for a single tick of LSTM + // G is graph to append ops to + // model contains LSTM parameters + // x is 1D column vector with observation + // prev is a struct containing hidden and cell + // from previous iteration + if (prev == null || typeof prev.h === 'undefined') { + var hidden_prevs = []; + var cell_prevs = []; + for (var d = 0; d < hidden_sizes.length; d++) { + hidden_prevs.push(new R.Mat(hidden_sizes[d], 1)); + cell_prevs.push(new R.Mat(hidden_sizes[d], 1)); + } + } else { + var hidden_prevs = prev.h; + var cell_prevs = prev.c; + } + var hidden = []; + var cell = []; + for (var d = 0; d < hidden_sizes.length; d++) { + var input_vector = d === 0 ? 
x : hidden[d - 1]; + var hidden_prev = hidden_prevs[d]; + var cell_prev = cell_prevs[d]; + // input gate + var h0 = G.mul(model['Wix' + d], input_vector); + var h1 = G.mul(model['Wih' + d], hidden_prev); + var input_gate = G.sigmoid(G.add(G.add(h0, h1), model['bi' + d])); + // forget gate + var h2 = G.mul(model['Wfx' + d], input_vector); + var h3 = G.mul(model['Wfh' + d], hidden_prev); + var forget_gate = G.sigmoid(G.add(G.add(h2, h3), model['bf' + d])); + // output gate + var h4 = G.mul(model['Wox' + d], input_vector); + var h5 = G.mul(model['Woh' + d], hidden_prev); + var output_gate = G.sigmoid(G.add(G.add(h4, h5), model['bo' + d])); + // write operation on cells + var h6 = G.mul(model['Wcx' + d], input_vector); + var h7 = G.mul(model['Wch' + d], hidden_prev); + var cell_write = G.tanh(G.add(G.add(h6, h7), model['bc' + d])); + // compute new cell activation + var retain_cell = G.eltmul(forget_gate, cell_prev); // what do we keep from cell + var write_cell = G.eltmul(input_gate, cell_write); // what do we write to cell + var cell_d = G.add(retain_cell, write_cell); // new cell contents + // compute hidden state as gated, saturated cell activations + var hidden_d = G.eltmul(output_gate, G.tanh(cell_d)); + hidden.push(hidden_d); + cell.push(cell_d); + } + // one decoder to outputs at end + var output = G.add(G.mul(model['Whd'], hidden[hidden.length - 1]), model['bd']); + // return cell memory, hidden representation and output + return { + 'h': hidden, + 'c': cell, + 'o': output + }; + } + var sig = function (x) { + // helper function for computing sigmoid + return 1.0 / (1 + Math.exp(-x)); + } + var maxi = function (w) { + // argmax of array w + var maxv = w[0]; + var maxix = 0; + for (var i = 1, n = w.length; i < n; i++) { + var v = w[i]; + if (v > maxv) { + maxix = i; + maxv = v; + } + } + return maxix; + } + var samplei = function (w) { + // sample argmax from w, assuming w are + // probabilities that sum to one + var r = randf(0, 1); + var x = 0.0; + var i = 0; 
+ while (true) { + x += w[i]; + if (x > r) { + return i; + } + i++; + } + return w.length - 1; // pretty sure we should never get here? + } + // various utils + global.assert = assert; + global.zeros = zeros; + global.maxi = maxi; + global.samplei = samplei; + global.randi = randi; + global.randn = randn; + global.softmax = softmax; + // classes + global.Mat = Mat; + global.RandMat = RandMat; + global.forwardLSTM = forwardLSTM; + global.initLSTM = initLSTM; + // more utils + global.updateMat = updateMat; + global.updateNet = updateNet; + global.copyMat = copyMat; + global.copyNet = copyNet; + global.netToJSON = netToJSON; + global.netFromJSON = netFromJSON; + global.netZeroGrads = netZeroGrads; + global.netFlattenGrads = netFlattenGrads; + // optimization + global.Solver = Solver; + global.Graph = Graph; })(R); // END OF RECURRENTJS var RL = {}; (function (global) { - "use strict"; - // syntactic sugar function for getting default parameter values - var getopt = function (opt, field_name, default_value) { - if (typeof opt === 'undefined') { - return default_value; - } - return (typeof opt[field_name] !== 'undefined') ? 
opt[field_name] : default_value; - } - var zeros = R.zeros; // inherit these - var assert = R.assert; - var randi = R.randi; - var randf = R.randf; - var setConst = function (arr, c) { - for (var i = 0, n = arr.length; i < n; i++) { - arr[i] = c; - } - } - var sampleWeighted = function (p) { - var r = Math.random(); - var c = 0.0; - for (var i = 0, n = p.length; i < n; i++) { - c += p[i]; - if (c >= r) { - return i; - } - } - assert(false, 'wtf'); - } - // ------ - // AGENTS - // ------ - // DPAgent performs Value Iteration - // - can also be used for Policy Iteration if you really wanted to - // - requires model of the environment :( - // - does not learn from experience :( - // - assumes finite MDP :( - var DPAgent = function (env, opt) { - this.V = null; // state value function - this.P = null; // policy distribution \pi(s,a) - this.env = env; // store pointer to environment - this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor - this.reset(); - } - DPAgent.prototype = { - reset: function () { - // reset the agent's policy and value function - this.ns = this.env.getNumStates(); - this.na = this.env.getMaxNumActions(); - this.V = zeros(this.ns); - this.P = zeros(this.ns * this.na); - // initialize uniform random policy - for (var s = 0; s < this.ns; s++) { - var poss = this.env.allowedActions(s); - for (var i = 0, n = poss.length; i < n; i++) { - this.P[poss[i] * this.ns + s] = 1.0 / poss.length; - } - } - }, - act: function (s) { - // behave according to the learned policy - var poss = this.env.allowedActions(s); - var ps = []; - for (var i = 0, n = poss.length; i < n; i++) { - var a = poss[i]; - var prob = this.P[a * this.ns + s]; - ps.push(prob); - } - var maxi = sampleWeighted(ps); - return poss[maxi]; - }, - learn: function () { - // perform a single round of value iteration - self.evaluatePolicy(); // writes this.V - self.updatePolicy(); // writes this.P - }, - evaluatePolicy: function () { - // perform a synchronous update of the 
value function - var Vnew = zeros(this.ns); - for (var s = 0; s < this.ns; s++) { - // integrate over actions in a stochastic policy - // note that we assume that policy probability mass over allowed actions sums to one - var v = 0.0; - var poss = this.env.allowedActions(s); - for (var i = 0, n = poss.length; i < n; i++) { - var a = poss[i]; - var prob = this.P[a * this.ns + s]; // probability of taking action under policy - if (prob === 0) { - continue; - } // no contribution, skip for speed - var ns = this.env.nextStateDistribution(s, a); - var rs = this.env.reward(s, a, ns); // reward for s->a->ns transition - v += prob * (rs + this.gamma * this.V[ns]); - } - Vnew[s] = v; - } - this.V = Vnew; // swap - }, - updatePolicy: function () { - // update policy to be greedy w.r.t. learned Value function - for (var s = 0; s < this.ns; s++) { - var poss = this.env.allowedActions(s); - // compute value of taking each allowed action - var vmax, nmax; - var vs = []; - for (var i = 0, n = poss.length; i < n; i++) { - var a = poss[i]; - var ns = this.env.nextStateDistribution(s, a); - var rs = this.env.reward(s, a, ns); - var v = rs + this.gamma * this.V[ns]; - vs.push(v); - if (i === 0 || v > vmax) { - vmax = v; - nmax = 1; - } else if (v === vmax) { - nmax += 1; - } - } - // update policy smoothly across all argmaxy actions - for (var i = 0, n = poss.length; i < n; i++) { - var a = poss[i]; - this.P[a * this.ns + s] = (vs[i] === vmax) ? 
1.0 / nmax : 0.0; - } - } - }, - } - // QAgent uses TD (Q-Learning, SARSA) - // - does not require environment model :) - // - learns from experience :) - var TDAgent = function (env, opt) { - this.update = getopt(opt, 'update', 'qlearn'); // qlearn | sarsa - this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor - this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy - this.alpha = getopt(opt, 'alpha', 0.01); // value function learning rate - // class allows non-deterministic policy, and smoothly regressing towards the optimal policy based on Q - this.smooth_policy_update = getopt(opt, 'smooth_policy_update', false); - this.beta = getopt(opt, 'beta', 0.01); // learning rate for policy, if smooth updates are on - // eligibility traces - this.lambda = getopt(opt, 'lambda', 0); // eligibility trace decay. 0 = no eligibility traces used - this.replacing_traces = getopt(opt, 'replacing_traces', true); - // optional optimistic initial values - this.q_init_val = getopt(opt, 'q_init_val', 0); - this.planN = getopt(opt, 'planN', 0); // number of planning steps per learning iteration (0 = no planning) - this.Q = null; // state action value function - this.P = null; // policy distribution \pi(s,a) - this.e = null; // eligibility trace - this.env_model_s = null;; // environment model (s,a) -> (s',r) - this.env_model_r = null;; // environment model (s,a) -> (s',r) - this.env = env; // store pointer to environment - this.reset(); - } - TDAgent.prototype = { - reset: function () { - // reset the agent's policy and value function - this.ns = this.env.getNumStates(); - this.na = this.env.getMaxNumActions(); - this.Q = zeros(this.ns * this.na); - if (this.q_init_val !== 0) { - setConst(this.Q, this.q_init_val); - } - this.P = zeros(this.ns * this.na); - this.e = zeros(this.ns * this.na); - // model/planning vars - this.env_model_s = zeros(this.ns * this.na); - setConst(this.env_model_s, -1); // init to -1 so we can test if we saw the state 
before - this.env_model_r = zeros(this.ns * this.na); - this.sa_seen = []; - this.pq = zeros(this.ns * this.na); - // initialize uniform random policy - for (var s = 0; s < this.ns; s++) { - var poss = this.env.allowedActions(s); - for (var i = 0, n = poss.length; i < n; i++) { - this.P[poss[i] * this.ns + s] = 1.0 / poss.length; - } - } - // agent memory, needed for streaming updates - // (s0,a0,r0,s1,a1,r1,...) - this.r0 = null; - this.s0 = null; - this.s1 = null; - this.a0 = null; - this.a1 = null; - }, - resetEpisode: function () { - // an episode finished - }, - act: function (s) { - // act according to epsilon greedy policy - var poss = this.env.allowedActions(s); - var probs = []; - for (var i = 0, n = poss.length; i < n; i++) { - probs.push(this.P[poss[i] * this.ns + s]); - } - // epsilon greedy policy - if (Math.random() < this.epsilon) { - var a = poss[randi(0, poss.length)]; // random available action - this.explored = true; - } else { - var a = poss[sampleWeighted(probs)]; - this.explored = false; - } - // shift state memory - this.s0 = this.s1; - this.a0 = this.a1; - this.s1 = s; - this.a1 = a; - return a; - }, - learn: function (r1) { - // takes reward for previous action, which came from a call to act() - if (!(this.r0 == null)) { - this.learnFromTuple(this.s0, this.a0, this.r0, this.s1, this.a1, this.lambda); - if (this.planN > 0) { - this.updateModel(this.s0, this.a0, this.r0, this.s1); - this.plan(); - } - } - this.r0 = r1; // store this for next update - }, - updateModel: function (s0, a0, r0, s1) { - // transition (s0,a0) -> (r0,s1) was observed. 
Update environment model - var sa = a0 * this.ns + s0; - if (this.env_model_s[sa] === -1) { - // first time we see this state action - this.sa_seen.push(a0 * this.ns + s0); // add as seen state - } - this.env_model_s[sa] = s1; - this.env_model_r[sa] = r0; - }, - plan: function () { - // order the states based on current priority queue information - var spq = []; - for (var i = 0, n = this.sa_seen.length; i < n; i++) { - var sa = this.sa_seen[i]; - var sap = this.pq[sa]; - if (sap > 1e-5) { // gain a bit of efficiency - spq.push({ - sa: sa, - p: sap - }); - } - } - spq.sort(function (a, b) { - return a.p < b.p ? 1 : -1 - }); - // perform the updates - var nsteps = Math.min(this.planN, spq.length); - for (var k = 0; k < nsteps; k++) { - // random exploration - //var i = randi(0, this.sa_seen.length); // pick random prev seen state action - //var s0a0 = this.sa_seen[i]; - var s0a0 = spq[k].sa; - this.pq[s0a0] = 0; // erase priority, since we're backing up this state - var s0 = s0a0 % this.ns; - var a0 = Math.floor(s0a0 / this.ns); - var r0 = this.env_model_r[s0a0]; - var s1 = this.env_model_s[s0a0]; - var a1 = -1; // not used for Q learning - if (this.update === 'sarsa') { - // generate random action?... 
- var poss = this.env.allowedActions(s1); - var a1 = poss[randi(0, poss.length)]; - } - this.learnFromTuple(s0, a0, r0, s1, a1, 0); // note lambda = 0 - shouldnt use eligibility trace here - } - }, - learnFromTuple: function (s0, a0, r0, s1, a1, lambda) { - var sa = a0 * this.ns + s0; - // calculate the target for Q(s,a) - if (this.update === 'qlearn') { - // Q learning target is Q(s0,a0) = r0 + gamma * max_a Q[s1,a] - var poss = this.env.allowedActions(s1); - var qmax = 0; - for (var i = 0, n = poss.length; i < n; i++) { - var s1a = poss[i] * this.ns + s1; - var qval = this.Q[s1a]; - if (i === 0 || qval > qmax) { - qmax = qval; - } - } - var target = r0 + this.gamma * qmax; - } else if (this.update === 'sarsa') { - // SARSA target is Q(s0,a0) = r0 + gamma * Q[s1,a1] - var s1a1 = a1 * this.ns + s1; - var target = r0 + this.gamma * this.Q[s1a1]; - } - if (lambda > 0) { - // perform an eligibility trace update - if (this.replacing_traces) { - this.e[sa] = 1; - } else { - this.e[sa] += 1; - } - var edecay = lambda * this.gamma; - var state_update = zeros(this.ns); - for (var s = 0; s < this.ns; s++) { - var poss = this.env.allowedActions(s); - for (var i = 0; i < poss.length; i++) { - var a = poss[i]; - var saloop = a * this.ns + s; - var esa = this.e[saloop]; - var update = this.alpha * esa * (target - this.Q[saloop]); - this.Q[saloop] += update; - this.updatePriority(s, a, update); - this.e[saloop] *= edecay; - var u = Math.abs(update); - if (u > state_update[s]) { - state_update[s] = u; - } - } - } - for (var s = 0; s < this.ns; s++) { - if (state_update[s] > 1e-5) { // save efficiency here - this.updatePolicy(s); - } - } - if (this.explored && this.update === 'qlearn') { - // have to wipe the trace since q learning is off-policy :( - this.e = zeros(this.ns * this.na); - } - } else { - // simpler and faster update without eligibility trace - // update Q[sa] towards it with some step size - var update = this.alpha * (target - this.Q[sa]); - this.Q[sa] += update; - 
this.updatePriority(s0, a0, update); - // update the policy to reflect the change (if appropriate) - this.updatePolicy(s0); - } - }, - updatePriority: function (s, a, u) { - // used in planning. Invoked when Q[sa] += update - // we should find all states that lead to (s,a) and upgrade their priority - // of being update in the next planning step - u = Math.abs(u); - if (u < 1e-5) { - return; - } // for efficiency skip small updates - if (this.planN === 0) { - return; - } // there is no planning to be done, skip. - for (var si = 0; si < this.ns; si++) { - // note we are also iterating over impossible actions at all states, - // but this should be okay because their env_model_s should simply be -1 - // as initialized, so they will never be predicted to point to any state - // because they will never be observed, and hence never be added to the model - for (var ai = 0; ai < this.na; ai++) { - var siai = ai * this.ns + si; - if (this.env_model_s[siai] === s) { - // this state leads to s, add it to priority queue - this.pq[siai] += u; - } - } - } - }, - updatePolicy: function (s) { - var poss = this.env.allowedActions(s); - // set policy at s to be the action that achieves max_a Q(s,a) - // first find the maxy Q values - var qmax, nmax; - var qs = []; - for (var i = 0, n = poss.length; i < n; i++) { - var a = poss[i]; - var qval = this.Q[a * this.ns + s]; - qs.push(qval); - if (i === 0 || qval > qmax) { - qmax = qval; - nmax = 1; - } else if (qval === qmax) { - nmax += 1; - } - } - // now update the policy smoothly towards the argmaxy actions - var psum = 0.0; - for (var i = 0, n = poss.length; i < n; i++) { - var a = poss[i]; - var target = (qs[i] === qmax) ? 
1.0 / nmax : 0.0; - var ix = a * this.ns + s; - if (this.smooth_policy_update) { - // slightly hacky :p - this.P[ix] += this.beta * (target - this.P[ix]); - psum += this.P[ix]; - } else { - // set hard target - this.P[ix] = target; - } - } - if (this.smooth_policy_update) { - // renomalize P if we're using smooth policy updates - for (var i = 0, n = poss.length; i < n; i++) { - var a = poss[i]; - this.P[a * this.ns + s] /= psum; - } - } - } - } - var DQNAgent = function (env, opt) { - this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor - this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy - this.alpha = getopt(opt, 'alpha', 0.01); // value function learning rate - this.experience_add_every = getopt(opt, 'experience_add_every', 25); // number of time steps before we add another experience to replay memory - this.experience_size = getopt(opt, 'experience_size', 5000); // size of experience replay - this.learning_steps_per_iteration = getopt(opt, 'learning_steps_per_iteration', 10); - this.tderror_clamp = getopt(opt, 'tderror_clamp', 1.0); - this.num_hidden_units = getopt(opt, 'num_hidden_units', 100); - this.env = env; - this.reset(); - } - DQNAgent.prototype = { - reset: function () { - this.nh = this.num_hidden_units; // number of hidden units - this.ns = this.env.getNumStates(); - this.na = this.env.getMaxNumActions(); - // nets are hardcoded for now as key (str) -> Mat - // not proud of this. 
better solution is to have a whole Net object - // on top of Mats, but for now sticking with this - this.net = {}; - this.net.W1 = new R.RandMat(this.nh, this.ns, 0, 0.01); - this.net.b1 = new R.Mat(this.nh, 1, 0, 0.01); - this.net.W2 = new R.RandMat(this.na, this.nh, 0, 0.01); - this.net.b2 = new R.Mat(this.na, 1, 0, 0.01); - this.exp = []; // experience - this.expi = 0; // where to insert - this.t = 0; - this.r0 = null; - this.s0 = null; - this.s1 = null; - this.a0 = null; - this.a1 = null; - this.tderror = 0; // for visualization only... - }, - toJSON: function () { - // save function - var j = {}; - j.nh = this.nh; - j.ns = this.ns; - j.na = this.na; - j.net = R.netToJSON(this.net); - return j; - }, - fromJSON: function (j) { - // load function - this.nh = j.nh; - this.ns = j.ns; - this.na = j.na; - this.net = R.netFromJSON(j.net); - }, - forwardQ: function (net, s, needs_backprop) { - var G = new R.Graph(needs_backprop); - var a1mat = G.add(G.mul(net.W1, s), net.b1); - var h1mat = G.tanh(a1mat); - var a2mat = G.add(G.mul(net.W2, h1mat), net.b2); - this.lastG = G; // back this up. 
Kind of hacky isn't it - return a2mat; - }, - act: function (slist) { - // convert to a Mat column vector - var s = new R.Mat(this.ns, 1); - s.setFrom(slist); - // epsilon greedy policy - if (Math.random() < this.epsilon) { - var a = randi(0, this.na); - } else { - // greedy wrt Q function - var amat = this.forwardQ(this.net, s, false); - var a = R.maxi(amat.w); // returns index of argmax action - } - // shift state memory - this.s0 = this.s1; - this.a0 = this.a1; - this.s1 = s; - this.a1 = a; - return a; - }, - learn: function (r1) { - // perform an update on Q function - if (!(this.r0 == null) && this.alpha > 0) { - // learn from this tuple to get a sense of how "surprising" it is to the agent - var tderror = this.learnFromTuple(this.s0, this.a0, this.r0, this.s1, this.a1); - this.tderror = tderror; // a measure of surprise - // decide if we should keep this experience in the replay - if (this.t % this.experience_add_every === 0) { - this.exp[this.expi] = [this.s0, this.a0, this.r0, this.s1, this.a1]; - this.expi += 1; - if (this.expi > this.experience_size) { - this.expi = 0; - } // roll over when we run out - } - this.t += 1; - // sample some additional experience from replay memory and learn from it - for (var k = 0; k < this.learning_steps_per_iteration; k++) { - var ri = randi(0, this.exp.length); // todo: priority sweeps? 
- var e = this.exp[ri]; - this.learnFromTuple(e[0], e[1], e[2], e[3], e[4]) - } - } - this.r0 = r1; // store for next update - }, - learnFromTuple: function (s0, a0, r0, s1, a1) { - // want: Q(s,a) = r + gamma * max_a' Q(s',a') - // compute the target Q value - var tmat = this.forwardQ(this.net, s1, false); - var qmax = r0 + this.gamma * tmat.w[R.maxi(tmat.w)]; - // now predict - var pred = this.forwardQ(this.net, s0, true); - var tderror = pred.w[a0] - qmax; - var clamp = this.tderror_clamp; - if (Math.abs(tderror) > clamp) { // huber loss to robustify - if (tderror > clamp) tderror = clamp; - if (tderror < -clamp) tderror = -clamp; - } - pred.dw[a0] = tderror; - this.lastG.backward(); // compute gradients on net params - // update net - R.updateNet(this.net, this.alpha); - return tderror; - } - } - // buggy implementation, doesnt work... - var SimpleReinforceAgent = function (env, opt) { - this.gamma = getopt(opt, 'gamma', 0.5); // future reward discount factor - this.epsilon = getopt(opt, 'epsilon', 0.75); // for epsilon-greedy policy - this.alpha = getopt(opt, 'alpha', 0.001); // actor net learning rate - this.beta = getopt(opt, 'beta', 0.01); // baseline net learning rate - this.env = env; - this.reset(); - } - SimpleReinforceAgent.prototype = { - reset: function () { - this.ns = this.env.getNumStates(); - this.na = this.env.getMaxNumActions(); - this.nh = 100; // number of hidden units - this.nhb = 100; // and also in the baseline lstm - this.actorNet = {}; - this.actorNet.W1 = new R.RandMat(this.nh, this.ns, 0, 0.01); - this.actorNet.b1 = new R.Mat(this.nh, 1, 0, 0.01); - this.actorNet.W2 = new R.RandMat(this.na, this.nh, 0, 0.1); - this.actorNet.b2 = new R.Mat(this.na, 1, 0, 0.01); - this.actorOutputs = []; - this.actorGraphs = []; - this.actorActions = []; // sampled ones - this.rewardHistory = []; - this.baselineNet = {}; - this.baselineNet.W1 = new R.RandMat(this.nhb, this.ns, 0, 0.01); - this.baselineNet.b1 = new R.Mat(this.nhb, 1, 0, 0.01); - 
this.baselineNet.W2 = new R.RandMat(this.na, this.nhb, 0, 0.01); - this.baselineNet.b2 = new R.Mat(this.na, 1, 0, 0.01); - this.baselineOutputs = []; - this.baselineGraphs = []; - this.t = 0; - }, - forwardActor: function (s, needs_backprop) { - var net = this.actorNet; - var G = new R.Graph(needs_backprop); - var a1mat = G.add(G.mul(net.W1, s), net.b1); - var h1mat = G.tanh(a1mat); - var a2mat = G.add(G.mul(net.W2, h1mat), net.b2); - return { - 'a': a2mat, - 'G': G - } - }, - forwardValue: function (s, needs_backprop) { - var net = this.baselineNet; - var G = new R.Graph(needs_backprop); - var a1mat = G.add(G.mul(net.W1, s), net.b1); - var h1mat = G.tanh(a1mat); - var a2mat = G.add(G.mul(net.W2, h1mat), net.b2); - return { - 'a': a2mat, - 'G': G - } - }, - act: function (slist) { - // convert to a Mat column vector - var s = new R.Mat(this.ns, 1); - s.setFrom(slist); - // forward the actor to get action output - var ans = this.forwardActor(s, true); - var amat = ans.a; - var ag = ans.G; - this.actorOutputs.push(amat); - this.actorGraphs.push(ag); - // forward the baseline estimator - var ans = this.forwardValue(s, true); - var vmat = ans.a; - var vg = ans.G; - this.baselineOutputs.push(vmat); - this.baselineGraphs.push(vg); - // sample action from the stochastic gaussian policy - var a = R.copyMat(amat); - var gaussVar = 0.02; - a.w[0] = R.randn(0, gaussVar); - a.w[1] = R.randn(0, gaussVar); - this.actorActions.push(a); - // shift state memory - this.s0 = this.s1; - this.a0 = this.a1; - this.s1 = s; - this.a1 = a; - return a; - }, - learn: function (r1) { - // perform an update on Q function - this.rewardHistory.push(r1); - var n = this.rewardHistory.length; - var baselineMSE = 0.0; - var nup = 100; // what chunk of experience to take - var nuse = 80; // what chunk to update from - if (n >= nup) { - // lets learn and flush - // first: compute the sample values at all points - var vs = []; - for (var t = 0; t < nuse; t++) { - var mul = 1; - // compute the actual 
discounted reward for this time step - var V = 0; - for (var t2 = t; t2 < n; t2++) { - V += mul * this.rewardHistory[t2]; - mul *= this.gamma; - if (mul < 1e-5) { - break; - } // efficiency savings - } - // get the predicted baseline at this time step - var b = this.baselineOutputs[t].w[0]; - for (var i = 0; i < this.na; i++) { - // [the action delta] * [the desirebility] - var update = -(V - b) * (this.actorActions[t].w[i] - this.actorOutputs[t].w[i]); - if (update > 0.1) { - update = 0.1; - } - if (update < -0.1) { - update = -0.1; - } - this.actorOutputs[t].dw[i] += update; - } - var update = -(V - b); - if (update > 0.1) { - update = 0.1; - } - if (update < 0.1) { - update = -0.1; - } - this.baselineOutputs[t].dw[0] += update; - baselineMSE += (V - b) * (V - b); - vs.push(V); - } - baselineMSE /= nuse; - // backprop all the things - for (var t = 0; t < nuse; t++) { - this.actorGraphs[t].backward(); - this.baselineGraphs[t].backward(); - } - R.updateNet(this.actorNet, this.alpha); // update actor network - R.updateNet(this.baselineNet, this.beta); // update baseline network - // flush - this.actorOutputs = []; - this.rewardHistory = []; - this.actorActions = []; - this.baselineOutputs = []; - this.actorGraphs = []; - this.baselineGraphs = []; - this.tderror = baselineMSE; - } - this.t += 1; - this.r0 = r1; // store for next update - }, - } - // buggy implementation as well, doesn't work - var RecurrentReinforceAgent = function (env, opt) { - this.gamma = getopt(opt, 'gamma', 0.5); // future reward discount factor - this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy - this.alpha = getopt(opt, 'alpha', 0.001); // actor net learning rate - this.beta = getopt(opt, 'beta', 0.01); // baseline net learning rate - this.env = env; - this.reset(); - } - RecurrentReinforceAgent.prototype = { - reset: function () { - this.ns = this.env.getNumStates(); - this.na = this.env.getMaxNumActions(); - this.nh = 40; // number of hidden units - this.nhb = 40; // 
and also in the baseline lstm - this.actorLSTM = R.initLSTM(this.ns, [this.nh], this.na); - this.actorG = new R.Graph(); - this.actorPrev = null; - this.actorOutputs = []; - this.rewardHistory = []; - this.actorActions = []; - this.baselineLSTM = R.initLSTM(this.ns, [this.nhb], 1); - this.baselineG = new R.Graph(); - this.baselinePrev = null; - this.baselineOutputs = []; - this.t = 0; - this.r0 = null; - this.s0 = null; - this.s1 = null; - this.a0 = null; - this.a1 = null; - }, - act: function (slist) { - // convert to a Mat column vector - var s = new R.Mat(this.ns, 1); - s.setFrom(slist); - // forward the LSTM to get action distribution - var actorNext = R.forwardLSTM(this.actorG, this.actorLSTM, [this.nh], s, this.actorPrev); - this.actorPrev = actorNext; - var amat = actorNext.o; - this.actorOutputs.push(amat); - // forward the baseline LSTM - var baselineNext = R.forwardLSTM(this.baselineG, this.baselineLSTM, [this.nhb], s, this.baselinePrev); - this.baselinePrev = baselineNext; - this.baselineOutputs.push(baselineNext.o); - // sample action from actor policy - var gaussVar = 0.05; - var a = R.copyMat(amat); - for (var i = 0, n = a.w.length; i < n; i++) { - a.w[0] += R.randn(0, gaussVar); - a.w[1] += R.randn(0, gaussVar); - } - this.actorActions.push(a); - // shift state memory - this.s0 = this.s1; - this.a0 = this.a1; - this.s1 = s; - this.a1 = a; - return a; - }, - learn: function (r1) { - // perform an update on Q function - this.rewardHistory.push(r1); - var n = this.rewardHistory.length; - var baselineMSE = 0.0; - var nup = 100; // what chunk of experience to take - var nuse = 80; // what chunk to also update - if (n >= nup) { - // lets learn and flush - // first: compute the sample values at all points - var vs = []; - for (var t = 0; t < nuse; t++) { - var mul = 1; - var V = 0; - for (var t2 = t; t2 < n; t2++) { - V += mul * this.rewardHistory[t2]; - mul *= this.gamma; - if (mul < 1e-5) { - break; - } // efficiency savings - } - var b = 
this.baselineOutputs[t].w[0]; - // todo: take out the constants etc. - for (var i = 0; i < this.na; i++) { - // [the action delta] * [the desirebility] - var update = -(V - b) * (this.actorActions[t].w[i] - this.actorOutputs[t].w[i]); - if (update > 0.1) { - update = 0.1; - } - if (update < -0.1) { - update = -0.1; - } - this.actorOutputs[t].dw[i] += update; - } - var update = -(V - b); - if (update > 0.1) { - update = 0.1; - } - if (update < 0.1) { - update = -0.1; - } - this.baselineOutputs[t].dw[0] += update; - baselineMSE += (V - b) * (V - b); - vs.push(V); - } - baselineMSE /= nuse; - this.actorG.backward(); // update params! woohoo! - this.baselineG.backward(); - R.updateNet(this.actorLSTM, this.alpha); // update actor network - R.updateNet(this.baselineLSTM, this.beta); // update baseline network - // flush - this.actorG = new R.Graph(); - this.actorPrev = null; - this.actorOutputs = []; - this.rewardHistory = []; - this.actorActions = []; - this.baselineG = new R.Graph(); - this.baselinePrev = null; - this.baselineOutputs = []; - this.tderror = baselineMSE; - } - this.t += 1; - this.r0 = r1; // store for next update - }, - } - // Currently buggy implementation, doesnt work - var DeterministPG = function (env, opt) { - this.gamma = getopt(opt, 'gamma', 0.5); // future reward discount factor - this.epsilon = getopt(opt, 'epsilon', 0.5); // for epsilon-greedy policy - this.alpha = getopt(opt, 'alpha', 0.001); // actor net learning rate - this.beta = getopt(opt, 'beta', 0.01); // baseline net learning rate - this.env = env; - this.reset(); - } - DeterministPG.prototype = { - reset: function () { - this.ns = this.env.getNumStates(); - this.na = this.env.getMaxNumActions(); - this.nh = 100; // number of hidden units - // actor - this.actorNet = {}; - this.actorNet.W1 = new R.RandMat(this.nh, this.ns, 0, 0.01); - this.actorNet.b1 = new R.Mat(this.nh, 1, 0, 0.01); - this.actorNet.W2 = new R.RandMat(this.na, this.ns, 0, 0.1); - this.actorNet.b2 = new R.Mat(this.na, 
1, 0, 0.01); - this.ntheta = this.na * this.ns + this.na; // number of params in actor - // critic - this.criticw = new R.RandMat(1, this.ntheta, 0, 0.01); // row vector - this.r0 = null; - this.s0 = null; - this.s1 = null; - this.a0 = null; - this.a1 = null; - this.t = 0; - }, - forwardActor: function (s, needs_backprop) { - var net = this.actorNet; - var G = new R.Graph(needs_backprop); - var a1mat = G.add(G.mul(net.W1, s), net.b1); - var h1mat = G.tanh(a1mat); - var a2mat = G.add(G.mul(net.W2, h1mat), net.b2); - return { - 'a': a2mat, - 'G': G - } - }, - act: function (slist) { - // convert to a Mat column vector - var s = new R.Mat(this.ns, 1); - s.setFrom(slist); - // forward the actor to get action output - var ans = this.forwardActor(s, false); - var amat = ans.a; - var ag = ans.G; - // sample action from the stochastic gaussian policy - var a = R.copyMat(amat); - if (Math.random() < this.epsilon) { - var gaussVar = 0.02; - a.w[0] = R.randn(0, gaussVar); - a.w[1] = R.randn(0, gaussVar); - } - var clamp = 0.25; - if (a.w[0] > clamp) a.w[0] = clamp; - if (a.w[0] < -clamp) a.w[0] = -clamp; - if (a.w[1] > clamp) a.w[1] = clamp; - if (a.w[1] < -clamp) a.w[1] = -clamp; - // shift state memory - this.s0 = this.s1; - this.a0 = this.a1; - this.s1 = s; - this.a1 = a; - return a; - }, - utilJacobianAt: function (s) { - var ujacobian = new R.Mat(this.ntheta, this.na); - for (var a = 0; a < this.na; a++) { - R.netZeroGrads(this.actorNet); - var ag = this.forwardActor(this.s0, true); - ag.a.dw[a] = 1.0; - ag.G.backward(); - var gflat = R.netFlattenGrads(this.actorNet); - ujacobian.setColumn(gflat, a); - } - return ujacobian; - }, - learn: function (r1) { - // perform an update on Q function - //this.rewardHistory.push(r1); - if (!(this.r0 == null)) { - var Gtmp = new R.Graph(false); - // dpg update: - // first compute the features psi: - // the jacobian matrix of the actor for s - var ujacobian0 = this.utilJacobianAt(this.s0); - // now form the features \psi(s,a) - var 
psi_sa0 = Gtmp.mul(ujacobian0, this.a0); // should be [this.ntheta x 1] "feature" vector - var qw0 = Gtmp.mul(this.criticw, psi_sa0); // 1x1 - // now do the same thing because we need \psi(s_{t+1}, \mu\_\theta(s\_t{t+1})) - var ujacobian1 = this.utilJacobianAt(this.s1); - var ag = this.forwardActor(this.s1, false); - var psi_sa1 = Gtmp.mul(ujacobian1, ag.a); - var qw1 = Gtmp.mul(this.criticw, psi_sa1); // 1x1 - // get the td error finally - var tderror = this.r0 + this.gamma * qw1.w[0] - qw0.w[0]; // lol - if (tderror > 0.5) tderror = 0.5; // clamp - if (tderror < -0.5) tderror = -0.5; - this.tderror = tderror; - // update actor policy with natural gradient - var net = this.actorNet; - var ix = 0; - for (var p in net) { - var mat = net[p]; - if (net.hasOwnProperty(p)) { - for (var i = 0, n = mat.w.length; i < n; i++) { - mat.w[i] += this.alpha * this.criticw.w[ix]; // natural gradient update - ix += 1; - } - } - } - // update the critic parameters too - for (var i = 0; i < this.ntheta; i++) { - var update = this.beta * tderror * psi_sa0.w[i]; - this.criticw.w[i] += update; - } - } - this.r0 = r1; // store for next update - }, - } - // exports - global.DPAgent = DPAgent; - global.TDAgent = TDAgent; - global.DQNAgent = DQNAgent; - //global.SimpleReinforceAgent = SimpleReinforceAgent; - //global.RecurrentReinforceAgent = RecurrentReinforceAgent; - //global.DeterministPG = DeterministPG; + "use strict"; + // syntactic sugar function for getting default parameter values + var getopt = function (opt, field_name, default_value) { + if (typeof opt === 'undefined') { + return default_value; + } + return (typeof opt[field_name] !== 'undefined') ? 
opt[field_name] : default_value; + } + var zeros = R.zeros; // inherit these + var assert = R.assert; + var randi = R.randi; + var randf = R.randf; + var setConst = function (arr, c) { + for (var i = 0, n = arr.length; i < n; i++) { + arr[i] = c; + } + } + var sampleWeighted = function (p) { + var r = Math.random(); + var c = 0.0; + for (var i = 0, n = p.length; i < n; i++) { + c += p[i]; + if (c >= r) { + return i; + } + } + assert(false, 'wtf'); + } + // ------ + // AGENTS + // ------ + // DPAgent performs Value Iteration + // - can also be used for Policy Iteration if you really wanted to + // - requires model of the environment :( + // - does not learn from experience :( + // - assumes finite MDP :( + var DPAgent = function (env, opt) { + this.V = null; // state value function + this.P = null; // policy distribution \pi(s,a) + this.env = env; // store pointer to environment + this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor + this.reset(); + } + DPAgent.prototype = { + reset: function () { + // reset the agent's policy and value function + this.ns = this.env.getNumStates(); + this.na = this.env.getMaxNumActions(); + this.V = zeros(this.ns); + this.P = zeros(this.ns * this.na); + // initialize uniform random policy + for (var s = 0; s < this.ns; s++) { + var poss = this.env.allowedActions(s); + for (var i = 0, n = poss.length; i < n; i++) { + this.P[poss[i] * this.ns + s] = 1.0 / poss.length; + } + } + }, + act: function (s) { + // behave according to the learned policy + var poss = this.env.allowedActions(s); + var ps = []; + for (var i = 0, n = poss.length; i < n; i++) { + var a = poss[i]; + var prob = this.P[a * this.ns + s]; + ps.push(prob); + } + var maxi = sampleWeighted(ps); + return poss[maxi]; + }, + learn: function () { + // perform a single round of value iteration + self.evaluatePolicy(); // writes this.V + self.updatePolicy(); // writes this.P + }, + evaluatePolicy: function () { + // perform a synchronous update of the 
value function + var Vnew = zeros(this.ns); + for (var s = 0; s < this.ns; s++) { + // integrate over actions in a stochastic policy + // note that we assume that policy probability mass over allowed actions sums to one + var v = 0.0; + var poss = this.env.allowedActions(s); + for (var i = 0, n = poss.length; i < n; i++) { + var a = poss[i]; + var prob = this.P[a * this.ns + s]; // probability of taking action under policy + if (prob === 0) { + continue; + } // no contribution, skip for speed + var ns = this.env.nextStateDistribution(s, a); + var rs = this.env.reward(s, a, ns); // reward for s->a->ns transition + v += prob * (rs + this.gamma * this.V[ns]); + } + Vnew[s] = v; + } + this.V = Vnew; // swap + }, + updatePolicy: function () { + // update policy to be greedy w.r.t. learned Value function + for (var s = 0; s < this.ns; s++) { + var poss = this.env.allowedActions(s); + // compute value of taking each allowed action + var vmax, nmax; + var vs = []; + for (var i = 0, n = poss.length; i < n; i++) { + var a = poss[i]; + var ns = this.env.nextStateDistribution(s, a); + var rs = this.env.reward(s, a, ns); + var v = rs + this.gamma * this.V[ns]; + vs.push(v); + if (i === 0 || v > vmax) { + vmax = v; + nmax = 1; + } else if (v === vmax) { + nmax += 1; + } + } + // update policy smoothly across all argmaxy actions + for (var i = 0, n = poss.length; i < n; i++) { + var a = poss[i]; + this.P[a * this.ns + s] = (vs[i] === vmax) ? 
1.0 / nmax : 0.0; + } + } + }, + } + // QAgent uses TD (Q-Learning, SARSA) + // - does not require environment model :) + // - learns from experience :) + var TDAgent = function (env, opt) { + this.update = getopt(opt, 'update', 'qlearn'); // qlearn | sarsa + this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor + this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy + this.alpha = getopt(opt, 'alpha', 0.01); // value function learning rate + // class allows non-deterministic policy, and smoothly regressing towards the optimal policy based on Q + this.smooth_policy_update = getopt(opt, 'smooth_policy_update', false); + this.beta = getopt(opt, 'beta', 0.01); // learning rate for policy, if smooth updates are on + // eligibility traces + this.lambda = getopt(opt, 'lambda', 0); // eligibility trace decay. 0 = no eligibility traces used + this.replacing_traces = getopt(opt, 'replacing_traces', true); + // optional optimistic initial values + this.q_init_val = getopt(opt, 'q_init_val', 0); + this.planN = getopt(opt, 'planN', 0); // number of planning steps per learning iteration (0 = no planning) + this.Q = null; // state action value function + this.P = null; // policy distribution \pi(s,a) + this.e = null; // eligibility trace + this.env_model_s = null;; // environment model (s,a) -> (s',r) + this.env_model_r = null;; // environment model (s,a) -> (s',r) + this.env = env; // store pointer to environment + this.reset(); + } + TDAgent.prototype = { + reset: function () { + // reset the agent's policy and value function + this.ns = this.env.getNumStates(); + this.na = this.env.getMaxNumActions(); + this.Q = zeros(this.ns * this.na); + if (this.q_init_val !== 0) { + setConst(this.Q, this.q_init_val); + } + this.P = zeros(this.ns * this.na); + this.e = zeros(this.ns * this.na); + // model/planning vars + this.env_model_s = zeros(this.ns * this.na); + setConst(this.env_model_s, -1); // init to -1 so we can test if we saw the state 
before + this.env_model_r = zeros(this.ns * this.na); + this.sa_seen = []; + this.pq = zeros(this.ns * this.na); + // initialize uniform random policy + for (var s = 0; s < this.ns; s++) { + var poss = this.env.allowedActions(s); + for (var i = 0, n = poss.length; i < n; i++) { + this.P[poss[i] * this.ns + s] = 1.0 / poss.length; + } + } + // agent memory, needed for streaming updates + // (s0,a0,r0,s1,a1,r1,...) + this.r0 = null; + this.s0 = null; + this.s1 = null; + this.a0 = null; + this.a1 = null; + }, + resetEpisode: function () { + // an episode finished + }, + act: function (s) { + // act according to epsilon greedy policy + var poss = this.env.allowedActions(s); + var probs = []; + for (var i = 0, n = poss.length; i < n; i++) { + probs.push(this.P[poss[i] * this.ns + s]); + } + // epsilon greedy policy + if (Math.random() < this.epsilon) { + var a = poss[randi(0, poss.length)]; // random available action + this.explored = true; + } else { + var a = poss[sampleWeighted(probs)]; + this.explored = false; + } + // shift state memory + this.s0 = this.s1; + this.a0 = this.a1; + this.s1 = s; + this.a1 = a; + return a; + }, + learn: function (r1) { + // takes reward for previous action, which came from a call to act() + if (!(this.r0 == null)) { + this.learnFromTuple(this.s0, this.a0, this.r0, this.s1, this.a1, this.lambda); + if (this.planN > 0) { + this.updateModel(this.s0, this.a0, this.r0, this.s1); + this.plan(); + } + } + this.r0 = r1; // store this for next update + }, + updateModel: function (s0, a0, r0, s1) { + // transition (s0,a0) -> (r0,s1) was observed. 
Update environment model + var sa = a0 * this.ns + s0; + if (this.env_model_s[sa] === -1) { + // first time we see this state action + this.sa_seen.push(a0 * this.ns + s0); // add as seen state + } + this.env_model_s[sa] = s1; + this.env_model_r[sa] = r0; + }, + plan: function () { + // order the states based on current priority queue information + var spq = []; + for (var i = 0, n = this.sa_seen.length; i < n; i++) { + var sa = this.sa_seen[i]; + var sap = this.pq[sa]; + if (sap > 1e-5) { // gain a bit of efficiency + spq.push({ + sa: sa, + p: sap + }); + } + } + spq.sort(function (a, b) { + return a.p < b.p ? 1 : -1 + }); + // perform the updates + var nsteps = Math.min(this.planN, spq.length); + for (var k = 0; k < nsteps; k++) { + // random exploration + //var i = randi(0, this.sa_seen.length); // pick random prev seen state action + //var s0a0 = this.sa_seen[i]; + var s0a0 = spq[k].sa; + this.pq[s0a0] = 0; // erase priority, since we're backing up this state + var s0 = s0a0 % this.ns; + var a0 = Math.floor(s0a0 / this.ns); + var r0 = this.env_model_r[s0a0]; + var s1 = this.env_model_s[s0a0]; + var a1 = -1; // not used for Q learning + if (this.update === 'sarsa') { + // generate random action?... 
+ var poss = this.env.allowedActions(s1); + var a1 = poss[randi(0, poss.length)]; + } + this.learnFromTuple(s0, a0, r0, s1, a1, 0); // note lambda = 0 - shouldnt use eligibility trace here + } + }, + learnFromTuple: function (s0, a0, r0, s1, a1, lambda) { + var sa = a0 * this.ns + s0; + // calculate the target for Q(s,a) + if (this.update === 'qlearn') { + // Q learning target is Q(s0,a0) = r0 + gamma * max_a Q[s1,a] + var poss = this.env.allowedActions(s1); + var qmax = 0; + for (var i = 0, n = poss.length; i < n; i++) { + var s1a = poss[i] * this.ns + s1; + var qval = this.Q[s1a]; + if (i === 0 || qval > qmax) { + qmax = qval; + } + } + var target = r0 + this.gamma * qmax; + } else if (this.update === 'sarsa') { + // SARSA target is Q(s0,a0) = r0 + gamma * Q[s1,a1] + var s1a1 = a1 * this.ns + s1; + var target = r0 + this.gamma * this.Q[s1a1]; + } + if (lambda > 0) { + // perform an eligibility trace update + if (this.replacing_traces) { + this.e[sa] = 1; + } else { + this.e[sa] += 1; + } + var edecay = lambda * this.gamma; + var state_update = zeros(this.ns); + for (var s = 0; s < this.ns; s++) { + var poss = this.env.allowedActions(s); + for (var i = 0; i < poss.length; i++) { + var a = poss[i]; + var saloop = a * this.ns + s; + var esa = this.e[saloop]; + var update = this.alpha * esa * (target - this.Q[saloop]); + this.Q[saloop] += update; + this.updatePriority(s, a, update); + this.e[saloop] *= edecay; + var u = Math.abs(update); + if (u > state_update[s]) { + state_update[s] = u; + } + } + } + for (var s = 0; s < this.ns; s++) { + if (state_update[s] > 1e-5) { // save efficiency here + this.updatePolicy(s); + } + } + if (this.explored && this.update === 'qlearn') { + // have to wipe the trace since q learning is off-policy :( + this.e = zeros(this.ns * this.na); + } + } else { + // simpler and faster update without eligibility trace + // update Q[sa] towards it with some step size + var update = this.alpha * (target - this.Q[sa]); + this.Q[sa] += update; + 
this.updatePriority(s0, a0, update); + // update the policy to reflect the change (if appropriate) + this.updatePolicy(s0); + } + }, + updatePriority: function (s, a, u) { + // used in planning. Invoked when Q[sa] += update + // we should find all states that lead to (s,a) and upgrade their priority + // of being update in the next planning step + u = Math.abs(u); + if (u < 1e-5) { + return; + } // for efficiency skip small updates + if (this.planN === 0) { + return; + } // there is no planning to be done, skip. + for (var si = 0; si < this.ns; si++) { + // note we are also iterating over impossible actions at all states, + // but this should be okay because their env_model_s should simply be -1 + // as initialized, so they will never be predicted to point to any state + // because they will never be observed, and hence never be added to the model + for (var ai = 0; ai < this.na; ai++) { + var siai = ai * this.ns + si; + if (this.env_model_s[siai] === s) { + // this state leads to s, add it to priority queue + this.pq[siai] += u; + } + } + } + }, + updatePolicy: function (s) { + var poss = this.env.allowedActions(s); + // set policy at s to be the action that achieves max_a Q(s,a) + // first find the maxy Q values + var qmax, nmax; + var qs = []; + for (var i = 0, n = poss.length; i < n; i++) { + var a = poss[i]; + var qval = this.Q[a * this.ns + s]; + qs.push(qval); + if (i === 0 || qval > qmax) { + qmax = qval; + nmax = 1; + } else if (qval === qmax) { + nmax += 1; + } + } + // now update the policy smoothly towards the argmaxy actions + var psum = 0.0; + for (var i = 0, n = poss.length; i < n; i++) { + var a = poss[i]; + var target = (qs[i] === qmax) ? 
1.0 / nmax : 0.0; + var ix = a * this.ns + s; + if (this.smooth_policy_update) { + // slightly hacky :p + this.P[ix] += this.beta * (target - this.P[ix]); + psum += this.P[ix]; + } else { + // set hard target + this.P[ix] = target; + } + } + if (this.smooth_policy_update) { + // renomalize P if we're using smooth policy updates + for (var i = 0, n = poss.length; i < n; i++) { + var a = poss[i]; + this.P[a * this.ns + s] /= psum; + } + } + } + } + var DQNAgent = function (env, opt) { + this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor + this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy + this.alpha = getopt(opt, 'alpha', 0.01); // value function learning rate + this.experience_add_every = getopt(opt, 'experience_add_every', 25); // number of time steps before we add another experience to replay memory + this.experience_size = getopt(opt, 'experience_size', 5000); // size of experience replay + this.learning_steps_per_iteration = getopt(opt, 'learning_steps_per_iteration', 10); + this.tderror_clamp = getopt(opt, 'tderror_clamp', 1.0); + this.num_hidden_units = getopt(opt, 'num_hidden_units', 100); + this.env = env; + this.reset(); + } + DQNAgent.prototype = { + reset: function () { + this.nh = this.num_hidden_units; // number of hidden units + this.ns = this.env.getNumStates(); + this.na = this.env.getMaxNumActions(); + // nets are hardcoded for now as key (str) -> Mat + // not proud of this. 
better solution is to have a whole Net object + // on top of Mats, but for now sticking with this + this.net = {}; + this.net.W1 = new R.RandMat(this.nh, this.ns, 0, 0.01); + this.net.b1 = new R.Mat(this.nh, 1, 0, 0.01); + this.net.W2 = new R.RandMat(this.na, this.nh, 0, 0.01); + this.net.b2 = new R.Mat(this.na, 1, 0, 0.01); + this.exp = []; // experience + this.expi = 0; // where to insert + this.t = 0; + this.r0 = null; + this.s0 = null; + this.s1 = null; + this.a0 = null; + this.a1 = null; + this.tderror = 0; // for visualization only... + }, + toJSON: function () { + // save function + var j = {}; + j.nh = this.nh; + j.ns = this.ns; + j.na = this.na; + j.net = R.netToJSON(this.net); + return j; + }, + fromJSON: function (j) { + // load function + this.nh = j.nh; + this.ns = j.ns; + this.na = j.na; + this.net = R.netFromJSON(j.net); + }, + forwardQ: function (net, s, needs_backprop) { + var G = new R.Graph(needs_backprop); + var a1mat = G.add(G.mul(net.W1, s), net.b1); + var h1mat = G.tanh(a1mat); + var a2mat = G.add(G.mul(net.W2, h1mat), net.b2); + this.lastG = G; // back this up. 
Kind of hacky isn't it + return a2mat; + }, + act: function (slist) { + // convert to a Mat column vector + var s = new R.Mat(this.ns, 1); + s.setFrom(slist); + // epsilon greedy policy + if (Math.random() < this.epsilon) { + var a = randi(0, this.na); + } else { + // greedy wrt Q function + var amat = this.forwardQ(this.net, s, false); + var a = R.maxi(amat.w); // returns index of argmax action + } + // shift state memory + this.s0 = this.s1; + this.a0 = this.a1; + this.s1 = s; + this.a1 = a; + return a; + }, + learn: function (r1) { + // perform an update on Q function + if (!(this.r0 == null) && this.alpha > 0) { + // learn from this tuple to get a sense of how "surprising" it is to the agent + var tderror = this.learnFromTuple(this.s0, this.a0, this.r0, this.s1, this.a1); + this.tderror = tderror; // a measure of surprise + // decide if we should keep this experience in the replay + if (this.t % this.experience_add_every === 0) { + this.exp[this.expi] = [this.s0, this.a0, this.r0, this.s1, this.a1]; + this.expi += 1; + if (this.expi > this.experience_size) { + this.expi = 0; + } // roll over when we run out + } + this.t += 1; + // sample some additional experience from replay memory and learn from it + for (var k = 0; k < this.learning_steps_per_iteration; k++) { + var ri = randi(0, this.exp.length); // todo: priority sweeps? 
+ var e = this.exp[ri]; + this.learnFromTuple(e[0], e[1], e[2], e[3], e[4]) + } + } + this.r0 = r1; // store for next update + }, + learnFromTuple: function (s0, a0, r0, s1, a1) { + // want: Q(s,a) = r + gamma * max_a' Q(s',a') + // compute the target Q value + var tmat = this.forwardQ(this.net, s1, false); + var qmax = r0 + this.gamma * tmat.w[R.maxi(tmat.w)]; + // now predict + var pred = this.forwardQ(this.net, s0, true); + var tderror = pred.w[a0] - qmax; + var clamp = this.tderror_clamp; + if (Math.abs(tderror) > clamp) { // huber loss to robustify + if (tderror > clamp) tderror = clamp; + if (tderror < -clamp) tderror = -clamp; + } + pred.dw[a0] = tderror; + this.lastG.backward(); // compute gradients on net params + // update net + R.updateNet(this.net, this.alpha); + return tderror; + } + } + // buggy implementation, doesnt work... + var SimpleReinforceAgent = function (env, opt) { + this.gamma = getopt(opt, 'gamma', 0.5); // future reward discount factor + this.epsilon = getopt(opt, 'epsilon', 0.75); // for epsilon-greedy policy + this.alpha = getopt(opt, 'alpha', 0.001); // actor net learning rate + this.beta = getopt(opt, 'beta', 0.01); // baseline net learning rate + this.env = env; + this.reset(); + } + SimpleReinforceAgent.prototype = { + reset: function () { + this.ns = this.env.getNumStates(); + this.na = this.env.getMaxNumActions(); + this.nh = 100; // number of hidden units + this.nhb = 100; // and also in the baseline lstm + this.actorNet = {}; + this.actorNet.W1 = new R.RandMat(this.nh, this.ns, 0, 0.01); + this.actorNet.b1 = new R.Mat(this.nh, 1, 0, 0.01); + this.actorNet.W2 = new R.RandMat(this.na, this.nh, 0, 0.1); + this.actorNet.b2 = new R.Mat(this.na, 1, 0, 0.01); + this.actorOutputs = []; + this.actorGraphs = []; + this.actorActions = []; // sampled ones + this.rewardHistory = []; + this.baselineNet = {}; + this.baselineNet.W1 = new R.RandMat(this.nhb, this.ns, 0, 0.01); + this.baselineNet.b1 = new R.Mat(this.nhb, 1, 0, 0.01); + 
this.baselineNet.W2 = new R.RandMat(this.na, this.nhb, 0, 0.01); + this.baselineNet.b2 = new R.Mat(this.na, 1, 0, 0.01); + this.baselineOutputs = []; + this.baselineGraphs = []; + this.t = 0; + }, + forwardActor: function (s, needs_backprop) { + var net = this.actorNet; + var G = new R.Graph(needs_backprop); + var a1mat = G.add(G.mul(net.W1, s), net.b1); + var h1mat = G.tanh(a1mat); + var a2mat = G.add(G.mul(net.W2, h1mat), net.b2); + return { + 'a': a2mat, + 'G': G + } + }, + forwardValue: function (s, needs_backprop) { + var net = this.baselineNet; + var G = new R.Graph(needs_backprop); + var a1mat = G.add(G.mul(net.W1, s), net.b1); + var h1mat = G.tanh(a1mat); + var a2mat = G.add(G.mul(net.W2, h1mat), net.b2); + return { + 'a': a2mat, + 'G': G + } + }, + act: function (slist) { + // convert to a Mat column vector + var s = new R.Mat(this.ns, 1); + s.setFrom(slist); + // forward the actor to get action output + var ans = this.forwardActor(s, true); + var amat = ans.a; + var ag = ans.G; + this.actorOutputs.push(amat); + this.actorGraphs.push(ag); + // forward the baseline estimator + var ans = this.forwardValue(s, true); + var vmat = ans.a; + var vg = ans.G; + this.baselineOutputs.push(vmat); + this.baselineGraphs.push(vg); + // sample action from the stochastic gaussian policy + var a = R.copyMat(amat); + var gaussVar = 0.02; + a.w[0] = R.randn(0, gaussVar); + a.w[1] = R.randn(0, gaussVar); + this.actorActions.push(a); + // shift state memory + this.s0 = this.s1; + this.a0 = this.a1; + this.s1 = s; + this.a1 = a; + return a; + }, + learn: function (r1) { + // perform an update on Q function + this.rewardHistory.push(r1); + var n = this.rewardHistory.length; + var baselineMSE = 0.0; + var nup = 100; // what chunk of experience to take + var nuse = 80; // what chunk to update from + if (n >= nup) { + // lets learn and flush + // first: compute the sample values at all points + var vs = []; + for (var t = 0; t < nuse; t++) { + var mul = 1; + // compute the actual 
discounted reward for this time step + var V = 0; + for (var t2 = t; t2 < n; t2++) { + V += mul * this.rewardHistory[t2]; + mul *= this.gamma; + if (mul < 1e-5) { + break; + } // efficiency savings + } + // get the predicted baseline at this time step + var b = this.baselineOutputs[t].w[0]; + for (var i = 0; i < this.na; i++) { + // [the action delta] * [the desirebility] + var update = -(V - b) * (this.actorActions[t].w[i] - this.actorOutputs[t].w[i]); + if (update > 0.1) { + update = 0.1; + } + if (update < -0.1) { + update = -0.1; + } + this.actorOutputs[t].dw[i] += update; + } + var update = -(V - b); + if (update > 0.1) { + update = 0.1; + } + if (update < 0.1) { + update = -0.1; + } + this.baselineOutputs[t].dw[0] += update; + baselineMSE += (V - b) * (V - b); + vs.push(V); + } + baselineMSE /= nuse; + // backprop all the things + for (var t = 0; t < nuse; t++) { + this.actorGraphs[t].backward(); + this.baselineGraphs[t].backward(); + } + R.updateNet(this.actorNet, this.alpha); // update actor network + R.updateNet(this.baselineNet, this.beta); // update baseline network + // flush + this.actorOutputs = []; + this.rewardHistory = []; + this.actorActions = []; + this.baselineOutputs = []; + this.actorGraphs = []; + this.baselineGraphs = []; + this.tderror = baselineMSE; + } + this.t += 1; + this.r0 = r1; // store for next update + }, + } + // buggy implementation as well, doesn't work + var RecurrentReinforceAgent = function (env, opt) { + this.gamma = getopt(opt, 'gamma', 0.5); // future reward discount factor + this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy + this.alpha = getopt(opt, 'alpha', 0.001); // actor net learning rate + this.beta = getopt(opt, 'beta', 0.01); // baseline net learning rate + this.env = env; + this.reset(); + } + RecurrentReinforceAgent.prototype = { + reset: function () { + this.ns = this.env.getNumStates(); + this.na = this.env.getMaxNumActions(); + this.nh = 40; // number of hidden units + this.nhb = 40; // 
and also in the baseline lstm + this.actorLSTM = R.initLSTM(this.ns, [this.nh], this.na); + this.actorG = new R.Graph(); + this.actorPrev = null; + this.actorOutputs = []; + this.rewardHistory = []; + this.actorActions = []; + this.baselineLSTM = R.initLSTM(this.ns, [this.nhb], 1); + this.baselineG = new R.Graph(); + this.baselinePrev = null; + this.baselineOutputs = []; + this.t = 0; + this.r0 = null; + this.s0 = null; + this.s1 = null; + this.a0 = null; + this.a1 = null; + }, + act: function (slist) { + // convert to a Mat column vector + var s = new R.Mat(this.ns, 1); + s.setFrom(slist); + // forward the LSTM to get action distribution + var actorNext = R.forwardLSTM(this.actorG, this.actorLSTM, [this.nh], s, this.actorPrev); + this.actorPrev = actorNext; + var amat = actorNext.o; + this.actorOutputs.push(amat); + // forward the baseline LSTM + var baselineNext = R.forwardLSTM(this.baselineG, this.baselineLSTM, [this.nhb], s, this.baselinePrev); + this.baselinePrev = baselineNext; + this.baselineOutputs.push(baselineNext.o); + // sample action from actor policy + var gaussVar = 0.05; + var a = R.copyMat(amat); + for (var i = 0, n = a.w.length; i < n; i++) { + a.w[0] += R.randn(0, gaussVar); + a.w[1] += R.randn(0, gaussVar); + } + this.actorActions.push(a); + // shift state memory + this.s0 = this.s1; + this.a0 = this.a1; + this.s1 = s; + this.a1 = a; + return a; + }, + learn: function (r1) { + // perform an update on Q function + this.rewardHistory.push(r1); + var n = this.rewardHistory.length; + var baselineMSE = 0.0; + var nup = 100; // what chunk of experience to take + var nuse = 80; // what chunk to also update + if (n >= nup) { + // lets learn and flush + // first: compute the sample values at all points + var vs = []; + for (var t = 0; t < nuse; t++) { + var mul = 1; + var V = 0; + for (var t2 = t; t2 < n; t2++) { + V += mul * this.rewardHistory[t2]; + mul *= this.gamma; + if (mul < 1e-5) { + break; + } // efficiency savings + } + var b = 
this.baselineOutputs[t].w[0]; + // todo: take out the constants etc. + for (var i = 0; i < this.na; i++) { + // [the action delta] * [the desirebility] + var update = -(V - b) * (this.actorActions[t].w[i] - this.actorOutputs[t].w[i]); + if (update > 0.1) { + update = 0.1; + } + if (update < -0.1) { + update = -0.1; + } + this.actorOutputs[t].dw[i] += update; + } + var update = -(V - b); + if (update > 0.1) { + update = 0.1; + } + if (update < 0.1) { + update = -0.1; + } + this.baselineOutputs[t].dw[0] += update; + baselineMSE += (V - b) * (V - b); + vs.push(V); + } + baselineMSE /= nuse; + this.actorG.backward(); // update params! woohoo! + this.baselineG.backward(); + R.updateNet(this.actorLSTM, this.alpha); // update actor network + R.updateNet(this.baselineLSTM, this.beta); // update baseline network + // flush + this.actorG = new R.Graph(); + this.actorPrev = null; + this.actorOutputs = []; + this.rewardHistory = []; + this.actorActions = []; + this.baselineG = new R.Graph(); + this.baselinePrev = null; + this.baselineOutputs = []; + this.tderror = baselineMSE; + } + this.t += 1; + this.r0 = r1; // store for next update + }, + } + // Currently buggy implementation, doesnt work + var DeterministPG = function (env, opt) { + this.gamma = getopt(opt, 'gamma', 0.5); // future reward discount factor + this.epsilon = getopt(opt, 'epsilon', 0.5); // for epsilon-greedy policy + this.alpha = getopt(opt, 'alpha', 0.001); // actor net learning rate + this.beta = getopt(opt, 'beta', 0.01); // baseline net learning rate + this.env = env; + this.reset(); + } + DeterministPG.prototype = { + reset: function () { + this.ns = this.env.getNumStates(); + this.na = this.env.getMaxNumActions(); + this.nh = 100; // number of hidden units + // actor + this.actorNet = {}; + this.actorNet.W1 = new R.RandMat(this.nh, this.ns, 0, 0.01); + this.actorNet.b1 = new R.Mat(this.nh, 1, 0, 0.01); + this.actorNet.W2 = new R.RandMat(this.na, this.ns, 0, 0.1); + this.actorNet.b2 = new R.Mat(this.na, 
1, 0, 0.01); + this.ntheta = this.na * this.ns + this.na; // number of params in actor + // critic + this.criticw = new R.RandMat(1, this.ntheta, 0, 0.01); // row vector + this.r0 = null; + this.s0 = null; + this.s1 = null; + this.a0 = null; + this.a1 = null; + this.t = 0; + }, + forwardActor: function (s, needs_backprop) { + var net = this.actorNet; + var G = new R.Graph(needs_backprop); + var a1mat = G.add(G.mul(net.W1, s), net.b1); + var h1mat = G.tanh(a1mat); + var a2mat = G.add(G.mul(net.W2, h1mat), net.b2); + return { + 'a': a2mat, + 'G': G + } + }, + act: function (slist) { + // convert to a Mat column vector + var s = new R.Mat(this.ns, 1); + s.setFrom(slist); + // forward the actor to get action output + var ans = this.forwardActor(s, false); + var amat = ans.a; + var ag = ans.G; + // sample action from the stochastic gaussian policy + var a = R.copyMat(amat); + if (Math.random() < this.epsilon) { + var gaussVar = 0.02; + a.w[0] = R.randn(0, gaussVar); + a.w[1] = R.randn(0, gaussVar); + } + var clamp = 0.25; + if (a.w[0] > clamp) a.w[0] = clamp; + if (a.w[0] < -clamp) a.w[0] = -clamp; + if (a.w[1] > clamp) a.w[1] = clamp; + if (a.w[1] < -clamp) a.w[1] = -clamp; + // shift state memory + this.s0 = this.s1; + this.a0 = this.a1; + this.s1 = s; + this.a1 = a; + return a; + }, + utilJacobianAt: function (s) { + var ujacobian = new R.Mat(this.ntheta, this.na); + for (var a = 0; a < this.na; a++) { + R.netZeroGrads(this.actorNet); + var ag = this.forwardActor(this.s0, true); + ag.a.dw[a] = 1.0; + ag.G.backward(); + var gflat = R.netFlattenGrads(this.actorNet); + ujacobian.setColumn(gflat, a); + } + return ujacobian; + }, + learn: function (r1) { + // perform an update on Q function + //this.rewardHistory.push(r1); + if (!(this.r0 == null)) { + var Gtmp = new R.Graph(false); + // dpg update: + // first compute the features psi: + // the jacobian matrix of the actor for s + var ujacobian0 = this.utilJacobianAt(this.s0); + // now form the features \psi(s,a) + var 
psi_sa0 = Gtmp.mul(ujacobian0, this.a0); // should be [this.ntheta x 1] "feature" vector + var qw0 = Gtmp.mul(this.criticw, psi_sa0); // 1x1 + // now do the same thing because we need \psi(s_{t+1}, \mu\_\theta(s\_t{t+1})) + var ujacobian1 = this.utilJacobianAt(this.s1); + var ag = this.forwardActor(this.s1, false); + var psi_sa1 = Gtmp.mul(ujacobian1, ag.a); + var qw1 = Gtmp.mul(this.criticw, psi_sa1); // 1x1 + // get the td error finally + var tderror = this.r0 + this.gamma * qw1.w[0] - qw0.w[0]; // lol + if (tderror > 0.5) tderror = 0.5; // clamp + if (tderror < -0.5) tderror = -0.5; + this.tderror = tderror; + // update actor policy with natural gradient + var net = this.actorNet; + var ix = 0; + for (var p in net) { + var mat = net[p]; + if (net.hasOwnProperty(p)) { + for (var i = 0, n = mat.w.length; i < n; i++) { + mat.w[i] += this.alpha * this.criticw.w[ix]; // natural gradient update + ix += 1; + } + } + } + // update the critic parameters too + for (var i = 0; i < this.ntheta; i++) { + var update = this.beta * tderror * psi_sa0.w[i]; + this.criticw.w[i] += update; + } + } + this.r0 = r1; // store for next update + }, + } + // exports + global.DPAgent = DPAgent; + global.TDAgent = TDAgent; + global.DQNAgent = DQNAgent; + //global.SimpleReinforceAgent = SimpleReinforceAgent; + //global.RecurrentReinforceAgent = RecurrentReinforceAgent; + //global.DeterministPG = DeterministPG; })(RL); var Trevel = { - //settings you can change - stop: true, - maxBet: 0.00001, - minBet: 0.00000005, - swap: true, - betSpeed: 100,//change this on init - verbose: true, - isTesting: false, - //money management - useKelly: false,//martingale performs better on live account! 
- korm: false, - kellyPercent: 5, //can't be more than 100 or less than 1 - useMartingale: true, //if kelly is true this won't work + //settings you can change + stop: true, + maxBet: 0.00001, + minBet: 0.00000005, + swap: true, + betSpeed: 100,//change this on init + verbose: true, + isTesting: false, + //money management + useKelly: false,//martingale performs better on live account! + korm: false, + kellyPercent: 5, //can't be more than 100 or less than 1 + useMartingale: true, //if kelly is true this won't work - martingaleMultiplier: 2, - //bot settings, these are set automaticcally don't bother - currentBalance: 0, - startingBalance: 0, - betAmount: 0, - profit: 0, - totalBets: 0, - totalWins: 0, - winRate: 0, - betHistory: [], //this is a sequence of all winning bets not the sequence of bets we placed - betOutcomes: [], - hbProbability: 0, - lbProbability: 0, - hbCount: 0, - lbcount: 0, - nextBet: "", - previousReward: 0, - addBet: function (bet, outcome) { - if (bet === "LB" && outcome === "Win") { - this.betHistory.push("LO"); - this.betOutcomes.push("W"); - this.totalWins++; - this.lbcount++; - } - if (bet === "LB" && outcome === "Loose") { - this.betHistory.push("HI"); - this.hbCount++; - this.betOutcomes.push("L"); - } - if (bet === "HB" && outcome === "Win") { - this.betHistory.push("HI"); - this.totalWins++; - this.hbCount++; - this.betOutcomes.push("W"); - } - if (bet === "HB" && outcome === "Loose") { - this.betHistory.push("LO"); - this.lbcount++; - this.betOutcomes.push("L"); - } - this.totalBets++; - }, - calculateProbabilities: function () { - this.hbProbability = this.hbCount / this.betHistory.length; - this.lbProbability = this.lbcount / this.betHistory.length; - this.winRate = this.totalWins / this.totalBets; - if (this.isTesting === false) { - this.profit = this.getProfit(); - } - }, - getCurrentBalance: function () { - return parseFloat($('#balance').html()); - }, - placeHighBet: function () { - $('#double_your_btc_bet_hi_button').click(); 
- }, - placeLowBet: function () { - $('#double_your_btc_bet_lo_button').click(); - }, - setBetAmount: function (amount) { - var elem = document.getElementById("double_your_btc_stake"); - elem.value = amount; - }, - setOutcome: function (bet) { - if ($('#double_your_btc_bet_lose').html() !== '') { - this.addBet(bet, "Loose"); - } else { - this.addBet(bet, "Win"); - } - }, - prepareBet: function () { - this.calculateProbabilities(); - if (this.betHistory.length < 10) { - if (this.useMartingale === true && this.betHistory.length > 12) { - if ($('#double_your_btc_bet_lose').html() !== '' && parseFloat($('#double_your_btc_stake').val()) * this.martingaleMultiplier < this.maxBet) { - this.setBetAmount((parseFloat($('#double_your_btc_stake').val()) * this.martingaleMultiplier).toFixed(8)); - } else { - this.setBetAmount(this.minBet); - } - } - } else { - if (this.useKelly === true && this.betHistory.length > 12) { - this.currentBalance = this.getCurrentBalance(); - var currMulty = document.getElementById("double_your_btc_payout_multiplier").value; - var kellyAmount = (((this.currentBalance * this.kellyPercent) / 100) * ((this.winRate * currMulty - 1)) / (currMulty - 1)).toFixed(8); - if (kellyAmount > 0 && kellyAmount < this.maxBet) { - this.setBetAmount(kellyAmount); - } else { - this.setBetAmount(this.minBet); - } - } else if (this.useMartingale === true && this.betHistory.length > 12) { - if ($('#double_your_btc_bet_lose').html() !== '' && parseFloat($('#double_your_btc_stake').val()) * this.martingaleMultiplier < this.maxBet) { - this.setBetAmount((parseFloat($('#double_your_btc_stake').val()) * this.martingaleMultiplier).toFixed(8)); - } else { - this.setBetAmount(this.minBet); - } - } - } - }, - placeBet: function () { - if (this.nextBet === "HB") { - this.placeHighBet(); - } else if (this.nextBet === "LB") { - this.placeLowBet(); - } else if (this.betHistory.length > 0 && this.swap === true) { - var prev = this.betHistory[this.betHistory.length - 1]; - if (prev === 
"LO") { - this.placeHighBet(); - } else { - this.placeLowBet(); - } - } else { - this.placeLowBet(); - } - }, - getProfit: function () { - return (this.getCurrentBalance() - this.startingBalance).toFixed(8); - }, - getNumStates: function () { - return 8; - }, - getMaxNumActions: function () { - return 2; - }, - getSentiment: function (bet) { - if (bet === "HI") { - return 1; - } else { - return 0; - } - }, - getPreviousBets: function () { - var hist = []; - if (this.betHistory.length > 12) { - hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 1])); - hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 2])); - hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 3])); - hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 4])); - hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 5])); - hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 6])); - hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 7])); - hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 8])); - } else { - hist = [0, 1, 0, 1, 0, 1, 0, 1]; //incase we just started... 
- } - return hist; - }, - getAgentState: function () { //we'll observe the last 8 bets - var s = this.getPreviousBets(); - return s; - }, - getReward: function () { - var reward = 0; - var out1 = this.betOutcomes[this.betOutcomes.length - 1]; - var out2 = this.betOutcomes[this.betOutcomes.length - 2]; - if (out1 === "L") { - if (this.previousReward < 0) { - reward = this.previousReward; - reward += -0.03; - if (out2 === "L") { - reward += -0.03; - } - } - else { - reward = -0.03; - if (out2 === "L") { - reward += -0.03; - } - } - } - else { - if (this.previousReward > 0) { - reward = this.previousReward; - reward += 0.01; - if (out2 === "W") { - reward += 0.01; - } - } - else { - reward = 0.01; - if (out2 === "W") { - reward += 0.01; - } - } - } - return reward; - }, - //for raw testing only - randomNumber: function (min, max) { - return Math.floor(Math.random() * (max - min + 1) + min); - }, - getTestOutcome: function (random) { - if (random % 2 === 0) { - return "HI"; - } else { - return "LO"; - } - }, - //random string for random seed - rString: function (length, chars) { - var result = ''; - var length = 16; - var chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz' - for (var i = length; i > 0; --i) result += chars[Math.floor(Math.random() * chars.length)]; - return result; - }, - //set client seed as random string - rSeed: function () { - $('#next_client_seed').val(rString()); - }, - //initialize this - init: function () { - this.startingBalance = this.currentBalance = parseFloat($('#balance').html()); - this.setBetAmount(this.minBet); - this.stop = true; - this.swap = true; - this.betSpeed = 3000; - document.getElementById("free_play_link_li").innerHTML = 'START BOT'; + martingaleMultiplier: 2, + //bot settings, these are set automaticcally don't bother + currentBalance: 0, + startingBalance: 0, + betAmount: 0, + profit: 0, + totalBets: 0, + totalWins: 0, + winRate: 0, + betHistory: [], //this is a sequence of all winning bets not the 
sequence of bets we placed + betOutcomes: [], + hbProbability: 0, + lbProbability: 0, + hbCount: 0, + lbcount: 0, + nextBet: "", + previousReward: 0, + addBet: function (bet, outcome) { + if (bet === "LB" && outcome === "Win") { + this.betHistory.push("LO"); + this.betOutcomes.push("W"); + this.totalWins++; + this.lbcount++; + } + if (bet === "LB" && outcome === "Loose") { + this.betHistory.push("HI"); + this.hbCount++; + this.betOutcomes.push("L"); + } + if (bet === "HB" && outcome === "Win") { + this.betHistory.push("HI"); + this.totalWins++; + this.hbCount++; + this.betOutcomes.push("W"); + } + if (bet === "HB" && outcome === "Loose") { + this.betHistory.push("LO"); + this.lbcount++; + this.betOutcomes.push("L"); + } + this.totalBets++; + }, + calculateProbabilities: function () { + this.hbProbability = this.hbCount / this.betHistory.length; + this.lbProbability = this.lbcount / this.betHistory.length; + this.winRate = this.totalWins / this.totalBets; + if (this.isTesting === false) { + this.profit = this.getProfit(); + } + }, + getCurrentBalance: function () { + return parseFloat($('#balance').html()); + }, + placeHighBet: function () { + $('#double_your_btc_bet_hi_button').click(); + }, + placeLowBet: function () { + $('#double_your_btc_bet_lo_button').click(); + }, + setBetAmount: function (amount) { + var elem = document.getElementById("double_your_btc_stake"); + elem.value = amount; + }, + setOutcome: function (bet) { + if ($('#double_your_btc_bet_lose').html() !== '') { + this.addBet(bet, "Loose"); + } else { + this.addBet(bet, "Win"); + } + }, + prepareBet: function () { + this.calculateProbabilities(); + if (this.betHistory.length < 10) { + if (this.useMartingale === true && this.betHistory.length > 12) { + if ($('#double_your_btc_bet_lose').html() !== '' && parseFloat($('#double_your_btc_stake').val()) * this.martingaleMultiplier < this.maxBet) { + this.setBetAmount((parseFloat($('#double_your_btc_stake').val()) * this.martingaleMultiplier).toFixed(8)); 
+ } else { + this.setBetAmount(this.minBet); + } + } + } else { + if (this.useKelly === true && this.betHistory.length > 12) { + this.currentBalance = this.getCurrentBalance(); + var currMulty = document.getElementById("double_your_btc_payout_multiplier").value; + var kellyAmount = (((this.currentBalance * this.kellyPercent) / 100) * ((this.winRate * currMulty - 1)) / (currMulty - 1)).toFixed(8); + if (kellyAmount > 0 && kellyAmount < this.maxBet) { + this.setBetAmount(kellyAmount); + } else { + this.setBetAmount(this.minBet); + } + } else if (this.useMartingale === true && this.betHistory.length > 12) { + if ($('#double_your_btc_bet_lose').html() !== '' && parseFloat($('#double_your_btc_stake').val()) * this.martingaleMultiplier < this.maxBet) { + this.setBetAmount((parseFloat($('#double_your_btc_stake').val()) * this.martingaleMultiplier).toFixed(8)); + } else { + this.setBetAmount(this.minBet); + } + } + } + }, + placeBet: function () { + if (this.nextBet === "HB") { + this.placeHighBet(); + } else if (this.nextBet === "LB") { + this.placeLowBet(); + } else if (this.betHistory.length > 0 && this.swap === true) { + var prev = this.betHistory[this.betHistory.length - 1]; + if (prev === "LO") { + this.placeHighBet(); + } else { + this.placeLowBet(); + } + } else { + this.placeLowBet(); + } + }, + getProfit: function () { + return (this.getCurrentBalance() - this.startingBalance).toFixed(8); + }, + getNumStates: function () { + return 8; + }, + getMaxNumActions: function () { + return 2; + }, + getSentiment: function (bet) { + if (bet === "HI") { + return 1; + } else { + return 0; + } + }, + getPreviousBets: function () { + var hist = []; + if (this.betHistory.length > 12) { + hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 1])); + hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 2])); + hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 3])); + 
hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 4])); + hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 5])); + hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 6])); + hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 7])); + hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 8])); + } else { + hist = [0, 1, 0, 1, 0, 1, 0, 1]; //incase we just started... + } + return hist; + }, + getAgentState: function () { //we'll observe the last 8 bets + var s = this.getPreviousBets(); + return s; + }, + getReward: function () { + var reward = 0; + var out1 = this.betOutcomes[this.betOutcomes.length - 1]; + var out2 = this.betOutcomes[this.betOutcomes.length - 2]; + if (out1 === "L") { + if (this.previousReward < 0) { + reward = this.previousReward; + reward += -0.03; + if (out2 === "L") { + reward += -0.03; + } + } + else { + reward = -0.03; + if (out2 === "L") { + reward += -0.03; + } + } + } + else { + if (this.previousReward > 0) { + reward = this.previousReward; + reward += 0.01; + if (out2 === "W") { + reward += 0.01; + } + } + else { + reward = 0.01; + if (out2 === "W") { + reward += 0.01; + } + } + } + return reward; + }, + //for raw testing only + randomNumber: function (min, max) { + return Math.floor(Math.random() * (max - min + 1) + min); + }, + getTestOutcome: function (random) { + if (random % 2 === 0) { + return "HI"; + } else { + return "LO"; + } + }, + //random string for random seed + rString: function (length, chars) { + var result = ''; + var length = 16; + var chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz' + for (var i = length; i > 0; --i) result += chars[Math.floor(Math.random() * chars.length)]; + return result; + }, + //set client seed as random string + rSeed: function () { + $('#next_client_seed').val(rString()); + }, + //initialize this + init: function () { + this.startingBalance = this.currentBalance = 
parseFloat($('#balance').html()); + this.setBetAmount(this.minBet); + this.stop = true; + this.swap = true; + this.betSpeed = 3000; + document.getElementById("free_play_link_li").innerHTML = 'START BOT'; - }, - // ask user config variables - config: function () { + }, + // ask user config variables + config: function () { - //define temporary variables - var maxb = 0, minb = 0; + //define temporary variables + var maxb = 0, minb = 0; - //prompt questions - maxb = prompt('Maximum bet stake in Satoshi', 1000); - this.maxBet = sattobtc(maxb); - minb = prompt('Minimum bet stake in Satoshi', 2); - this.minBet = sattobtc(minb); - this.martingaleMultiplier = prompt('Bet multiplier on lose', 2); - this.swap = prompt('True for swap enabled, false for disabled', 'true'); - this.korm = prompt('True to enable Kelly, false to enabled martingale, leave blank for both', 'false'); - this.betSpeed = prompt('Wait time before next bet is placed in ms', 3000); + //prompt questions + maxb = prompt('Maximum bet stake in Satoshi', 1000); + this.maxBet = sattobtc(maxb); + minb = prompt('Minimum bet stake in Satoshi', 2); + this.minBet = sattobtc(minb); + this.martingaleMultiplier = prompt('Bet multiplier on lose', 2); + this.swap = prompt('True for swap enabled, false for disabled', 'true'); + this.korm = prompt('True to enable Kelly, false to enabled martingale, leave blank for both', 'false'); + this.betSpeed = prompt('Wait time before next bet is placed in ms', 3000); - //convert satoshi to btc - function sattobtc(sat) { - var btc = 0.00000001; - return sat * btc; - } - if (this.korm === 'true') { - this.useKelly = true; - this.useMartingale = false; - } - else if (this.korm === 'false') { - this.useMartingale = true; - this.useKelly = false; - } - else { - this.useKelly = this.useMartingale = true; - } - //start betting - startbetting(); - }, - stopbets: function () { - env.stop = true; - clearInterval(interval); - console.log('Bet session has been stopped, to start over click 
start.'); - document.getElementById("free_play_link_li").innerHTML = 'START BOT'; + //convert satoshi to btc + function sattobtc(sat) { + var btc = 0.00000001; + return sat * btc; + } + if (this.korm === 'true') { + this.useKelly = true; + this.useMartingale = false; + } + else if (this.korm === 'false') { + this.useMartingale = true; + this.useKelly = false; + } + else { + this.useKelly = this.useMartingale = true; + } + //start betting + startbetting(); + }, + stopbets: function () { + env.stop = true; + clearInterval(interval); + console.log('Bet session has been stopped, to start over click start.'); + document.getElementById("free_play_link_li").innerHTML = 'START BOT'; - } + } }; //Deep Q learning with reinforceJS var spec = {}; @@ -1806,17 +1806,17 @@ spec.num_hidden_units = 24; var env = Trevel; var interval = null; if (env.isTesting === false) { - env.init(); + env.init(); } // create the DQN agent agent = new RL.DQNAgent(env, spec); // start betting function/agent interval function startbetting() { - console.log('Starting bet session, to stop click STOP BOT'); - document.getElementById("free_play_link_li").innerHTML = 'STOP BOT'; - env.stop = false; - interval = setInterval(function () { loop(); }, env.betSpeed); + console.log('Starting bet session, to stop click STOP BOT'); + document.getElementById("free_play_link_li").innerHTML = 'STOP BOT'; + env.stop = false; + interval = setInterval(function () { loop(); }, env.betSpeed); } console.clear(); console.log('You are using Trevel, with ReinforceJS'); @@ -1825,62 +1825,61 @@ console.log('Click it to set the config. 
Note: These settings are not persistent console.log('To change the default values for these settings, search the script for "prompt"'); console.log('Enjoy'); function loop() { -if(env.profit > 0.00000500) -{ - env.rSeed(); -} - if (env.stop === false) { - var state = env.getAgentState(); - var action = agent.act(state); - var outcome = ""; - if (env.isTesting === false) { - if (action === 0) { - env.nextBet = "LB"; - env.prepareBet(); - env.placeBet(); - env.setOutcome("LB"); - outcome = env.betOutcomes[env.betOutcomes.length - 1]; - } else if (action === 1) { - env.nextBet = "HB"; - env.prepareBet(); - env.placeBet(); - env.setOutcome("HB"); - outcome = env.betOutcomes[env.betOutcomes.length - 1]; - } - if (env.verbose === true) { - env.calculateProbabilities(); - //console.log("Machine Bet: " + action + "{" + env.nextBet + "} isKelly: " + env.useKelly + " isMartingale: " + env.useMartingale); - console.log("Profit: " + env.profit + " WinRate: " + (env.winRate * 100).toFixed(2)); - } - } else { - console.log("Action: " + action); - var testOutcome = env.getTestOutcome(env.randomNumber(0, 1000)); - if (action === 0 && testOutcome === "LO") { - env.addBet("LB", "Win"); - outcome = "W"; - } else if (action === 0 && testOutcome === "HI") { - env.addBet("LB", "Loose"); - outcome = "L"; - } else if (action === 1 && testOutcome === "HI") { - env.addBet("HB", "Win"); - outcome = "W"; - } else if (action === 1 && testOutcome === "LO") { - env.addBet("HB", "Loose"); - outcome = "L"; - } - env.calculateProbabilities(); - console.log("Winrate: " + (env.winRate * 100).toFixed(2)); - } - var reward = env.getReward(); - if (reward === 0) { - if (outcome === "L") { - reward = -0.03; - } else { - reward = 0.01; - } - } - agent.learn(reward); - env.previousReward = reward; - } + if (env.profit > 0.00000500) { + env.rSeed(); + } + if (env.stop === false) { + var state = env.getAgentState(); + var action = agent.act(state); + var outcome = ""; + if (env.isTesting === false) { + if 
(action === 0) { + env.nextBet = "LB"; + env.prepareBet(); + env.placeBet(); + env.setOutcome("LB"); + outcome = env.betOutcomes[env.betOutcomes.length - 1]; + } else if (action === 1) { + env.nextBet = "HB"; + env.prepareBet(); + env.placeBet(); + env.setOutcome("HB"); + outcome = env.betOutcomes[env.betOutcomes.length - 1]; + } + if (env.verbose === true) { + env.calculateProbabilities(); + //console.log("Machine Bet: " + action + "{" + env.nextBet + "} isKelly: " + env.useKelly + " isMartingale: " + env.useMartingale); + console.log("Profit: " + env.profit + " WinRate: " + (env.winRate * 100).toFixed(2)); + } + } else { + console.log("Action: " + action); + var testOutcome = env.getTestOutcome(env.randomNumber(0, 1000)); + if (action === 0 && testOutcome === "LO") { + env.addBet("LB", "Win"); + outcome = "W"; + } else if (action === 0 && testOutcome === "HI") { + env.addBet("LB", "Loose"); + outcome = "L"; + } else if (action === 1 && testOutcome === "HI") { + env.addBet("HB", "Win"); + outcome = "W"; + } else if (action === 1 && testOutcome === "LO") { + env.addBet("HB", "Loose"); + outcome = "L"; + } + env.calculateProbabilities(); + console.log("Winrate: " + (env.winRate * 100).toFixed(2)); + } + var reward = env.getReward(); + if (reward === 0) { + if (outcome === "L") { + reward = -0.03; + } else { + reward = 0.01; + } + } + agent.learn(reward); + env.previousReward = reward; + } } From 85aca3e9bb54d7f1bd541daa9c79358c519a8533 Mon Sep 17 00:00:00 2001 From: nickisghosty Date: Sun, 22 Oct 2017 07:31:09 -0700 Subject: [PATCH 3/6] fixed space after function --- DQ-Trevel.js | 240 +++++++++++++++++++++++++-------------------------- 1 file changed, 120 insertions(+), 120 deletions(-) diff --git a/DQ-Trevel.js b/DQ-Trevel.js index 7a68dde..358e04b 100644 --- a/DQ-Trevel.js +++ b/DQ-Trevel.js @@ -1,5 +1,5 @@ var R = {}; // the Recurrent library -(function (global) { +(function(global) { "use strict"; // Utility fun function assert(condition, message) { @@ -15,7 
+15,7 @@ var R = {}; // the Recurrent library // Random numbers utils var return_v = false; var v_val = 0.0; - var gaussRandom = function () { + var gaussRandom = function() { if (return_v) { return_v = false; return v_val; @@ -29,18 +29,18 @@ var R = {}; // the Recurrent library return_v = true; return u * c; } - var randf = function (a, b) { + var randf = function(a, b) { return Math.random() * (b - a) + a; } - var randi = function (a, b) { + var randi = function(a, b) { return Math.floor(Math.random() * (b - a) + a); } - var randn = function (mu, std) { + var randn = function(mu, std) { return mu + gaussRandom() * std; } // helper function returns array of zeros of length n // and uses typed arrays if available - var zeros = function (n) { + var zeros = function(n) { if (typeof (n) === 'undefined' || isNaN(n)) { return []; } @@ -56,7 +56,7 @@ var R = {}; // the Recurrent library } } // Mat holds a matrix - var Mat = function (n, d) { + var Mat = function(n, d) { // n is number of rows d is number of columns this.n = n; this.d = d; @@ -64,37 +64,37 @@ var R = {}; // the Recurrent library this.dw = zeros(n * d); } Mat.prototype = { - get: function (row, col) { + get: function(row, col) { // slow but careful accessor function // we want row-major order var ix = (this.d * row) + col; assert(ix >= 0 && ix < this.w.length); return this.w[ix]; }, - set: function (row, col, v) { + set: function(row, col, v) { // slow but careful accessor function var ix = (this.d * row) + col; assert(ix >= 0 && ix < this.w.length); this.w[ix] = v; }, - setFrom: function (arr) { + setFrom: function(arr) { for (var i = 0, n = arr.length; i < n; i++) { this.w[i] = arr[i]; } }, - setColumn: function (m, i) { + setColumn: function(m, i) { for (var q = 0, n = m.w.length; q < n; q++) { this.w[(this.d * q) + i] = m.w[q]; } }, - toJSON: function () { + toJSON: function() { var json = {}; json['n'] = this.n; json['d'] = this.d; json['w'] = this.w; return json; }, - fromJSON: function (json) { + 
fromJSON: function(json) { this.n = json.n; this.d = json.d; this.w = zeros(this.n * this.d); @@ -104,12 +104,12 @@ var R = {}; // the Recurrent library } } } - var copyMat = function (b) { + var copyMat = function(b) { var a = new Mat(b.n, b.d); a.setFrom(b.w); return a; } - var copyNet = function (net) { + var copyNet = function(net) { // nets are (k,v) pairs with k = string key, v = Mat() var new_net = {}; for (var p in net) { @@ -119,7 +119,7 @@ var R = {}; // the Recurrent library } return new_net; } - var updateMat = function (m, alpha) { + var updateMat = function(m, alpha) { // updates in place for (var i = 0, n = m.n * m.d; i < n; i++) { if (m.dw[i] !== 0) { @@ -128,14 +128,14 @@ var R = {}; // the Recurrent library } } } - var updateNet = function (net, alpha) { + var updateNet = function(net, alpha) { for (var p in net) { if (net.hasOwnProperty(p)) { updateMat(net[p], alpha); } } } - var netToJSON = function (net) { + var netToJSON = function(net) { var j = {}; for (var p in net) { if (net.hasOwnProperty(p)) { @@ -144,7 +144,7 @@ var R = {}; // the Recurrent library } return j; } - var netFromJSON = function (j) { + var netFromJSON = function(j) { var net = {}; for (var p in j) { if (j.hasOwnProperty(p)) { @@ -154,7 +154,7 @@ var R = {}; // the Recurrent library } return net; } - var netZeroGrads = function (net) { + var netZeroGrads = function(net) { for (var p in net) { if (net.hasOwnProperty(p)) { var mat = net[p]; @@ -162,7 +162,7 @@ var R = {}; // the Recurrent library } } } - var netFlattenGrads = function (net) { + var netFlattenGrads = function(net) { var n = 0; for (var p in net) { if (net.hasOwnProperty(p)) { @@ -184,7 +184,7 @@ var R = {}; // the Recurrent library return g; } // return Mat but filled with random numbers from gaussian - var RandMat = function (n, d, mu, std) { + var RandMat = function(n, d, mu, std) { var m = new Mat(n, d); fillRandn(m, mu, std); //fillRand(m,-std,std); // kind of :P @@ -192,23 +192,23 @@ var R = {}; // the 
Recurrent library } // Mat utils // fill matrix with random gaussian numbers - var fillRandn = function (m, mu, std) { + var fillRandn = function(m, mu, std) { for (var i = 0, n = m.w.length; i < n; i++) { m.w[i] = randn(mu, std); } } - var fillRand = function (m, lo, hi) { + var fillRand = function(m, lo, hi) { for (var i = 0, n = m.w.length; i < n; i++) { m.w[i] = randf(lo, hi); } } - var gradFillConst = function (m, c) { + var gradFillConst = function(m, c) { for (var i = 0, n = m.dw.length; i < n; i++) { m.dw[i] = c } } // Transformer definitions - var Graph = function (needs_backprop) { + var Graph = function(needs_backprop) { if (typeof needs_backprop === 'undefined') { needs_backprop = true; } @@ -219,12 +219,12 @@ var R = {}; // the Recurrent library this.backprop = []; } Graph.prototype = { - backward: function () { + backward: function() { for (var i = this.backprop.length - 1; i >= 0; i--) { this.backprop[i](); // tick! } }, - rowPluck: function (m, ix) { + rowPluck: function(m, ix) { // pluck a row of m with index ix and return it as col vector assert(ix >= 0 && ix < m.n); var d = m.d; @@ -233,7 +233,7 @@ var R = {}; // the Recurrent library out.w[i] = m.w[d * ix + i]; } // copy over the data if (this.needs_backprop) { - var backward = function () { + var backward = function() { for (var i = 0, n = d; i < n; i++) { m.dw[d * ix + i] += out.dw[i]; } @@ -242,7 +242,7 @@ var R = {}; // the Recurrent library } return out; }, - tanh: function (m) { + tanh: function(m) { // tanh nonlinearity var out = new Mat(m.n, m.d); var n = m.w.length; @@ -250,7 +250,7 @@ var R = {}; // the Recurrent library out.w[i] = Math.tanh(m.w[i]); } if (this.needs_backprop) { - var backward = function () { + var backward = function() { for (var i = 0; i < n; i++) { // grad for z = tanh(x) is (1 - z^2) var mwi = out.w[i]; @@ -261,7 +261,7 @@ var R = {}; // the Recurrent library } return out; }, - sigmoid: function (m) { + sigmoid: function(m) { // sigmoid nonlinearity var out = new 
Mat(m.n, m.d); var n = m.w.length; @@ -269,7 +269,7 @@ var R = {}; // the Recurrent library out.w[i] = sig(m.w[i]); } if (this.needs_backprop) { - var backward = function () { + var backward = function() { for (var i = 0; i < n; i++) { // grad for z = tanh(x) is (1 - z^2) var mwi = out.w[i]; @@ -280,14 +280,14 @@ var R = {}; // the Recurrent library } return out; }, - relu: function (m) { + relu: function(m) { var out = new Mat(m.n, m.d); var n = m.w.length; for (var i = 0; i < n; i++) { out.w[i] = Math.max(0, m.w[i]); // relu } if (this.needs_backprop) { - var backward = function () { + var backward = function() { for (var i = 0; i < n; i++) { m.dw[i] += m.w[i] > 0 ? out.dw[i] : 0.0; } @@ -296,7 +296,7 @@ var R = {}; // the Recurrent library } return out; }, - mul: function (m1, m2) { + mul: function(m1, m2) { // multiply matrices m1 * m2 assert(m1.d === m2.n, 'matmul dimensions misaligned'); var n = m1.n; @@ -312,7 +312,7 @@ var R = {}; // the Recurrent library } } if (this.needs_backprop) { - var backward = function () { + var backward = function() { for (var i = 0; i < m1.n; i++) { // loop over rows of m1 for (var j = 0; j < m2.d; j++) { // loop over cols of m2 for (var k = 0; k < m1.d; k++) { // dot product loop @@ -327,14 +327,14 @@ var R = {}; // the Recurrent library } return out; }, - add: function (m1, m2) { + add: function(m1, m2) { assert(m1.w.length === m2.w.length); var out = new Mat(m1.n, m1.d); for (var i = 0, n = m1.w.length; i < n; i++) { out.w[i] = m1.w[i] + m2.w[i]; } if (this.needs_backprop) { - var backward = function () { + var backward = function() { for (var i = 0, n = m1.w.length; i < n; i++) { m1.dw[i] += out.dw[i]; m2.dw[i] += out.dw[i]; @@ -344,7 +344,7 @@ var R = {}; // the Recurrent library } return out; }, - dot: function (m1, m2) { + dot: function(m1, m2) { // m1 m2 are both column vectors assert(m1.w.length === m2.w.length); var out = new Mat(1, 1); @@ -354,7 +354,7 @@ var R = {}; // the Recurrent library } out.w[0] = dot; if 
(this.needs_backprop) { - var backward = function () { + var backward = function() { for (var i = 0, n = m1.w.length; i < n; i++) { m1.dw[i] += m2.w[i] * out.dw[0]; m2.dw[i] += m1.w[i] * out.dw[0]; @@ -364,14 +364,14 @@ var R = {}; // the Recurrent library } return out; }, - eltmul: function (m1, m2) { + eltmul: function(m1, m2) { assert(m1.w.length === m2.w.length); var out = new Mat(m1.n, m1.d); for (var i = 0, n = m1.w.length; i < n; i++) { out.w[i] = m1.w[i] * m2.w[i]; } if (this.needs_backprop) { - var backward = function () { + var backward = function() { for (var i = 0, n = m1.w.length; i < n; i++) { m1.dw[i] += m2.w[i] * out.dw[i]; m2.dw[i] += m1.w[i] * out.dw[i]; @@ -382,7 +382,7 @@ var R = {}; // the Recurrent library return out; }, } - var softmax = function (m) { + var softmax = function(m) { var out = new Mat(m.n, m.d); // probability volume var maxval = -999999; for (var i = 0, n = m.w.length; i < n; i++) { @@ -401,13 +401,13 @@ var R = {}; // the Recurrent library // to set gradients directly on m return out; } - var Solver = function () { + var Solver = function() { this.decay_rate = 0.999; this.smooth_eps = 1e-8; this.step_cache = {}; } Solver.prototype = { - step: function (model, step_size, regc, clipval) { + step: function(model, step_size, regc, clipval) { // perform parameter update var solver_stats = {}; var num_clipped = 0; @@ -443,7 +443,7 @@ var R = {}; // the Recurrent library return solver_stats; } } - var initLSTM = function (input_size, hidden_sizes, output_size) { + var initLSTM = function(input_size, hidden_sizes, output_size) { // hidden size should be a list var model = {}; for (var d = 0; d < hidden_sizes.length; d++) { // loop over depths @@ -469,7 +469,7 @@ var R = {}; // the Recurrent library model['bd'] = new Mat(output_size, 1); return model; } - var forwardLSTM = function (G, model, hidden_sizes, x, prev) { + var forwardLSTM = function(G, model, hidden_sizes, x, prev) { // forward prop for a single tick of LSTM // G is graph 
to append ops to // model contains LSTM parameters @@ -527,11 +527,11 @@ var R = {}; // the Recurrent library 'o': output }; } - var sig = function (x) { + var sig = function(x) { // helper function for computing sigmoid return 1.0 / (1 + Math.exp(-x)); } - var maxi = function (w) { + var maxi = function(w) { // argmax of array w var maxv = w[0]; var maxix = 0; @@ -544,7 +544,7 @@ var R = {}; // the Recurrent library } return maxix; } - var samplei = function (w) { + var samplei = function(w) { // sample argmax from w, assuming w are // probabilities that sum to one var r = randf(0, 1); @@ -587,10 +587,10 @@ var R = {}; // the Recurrent library })(R); // END OF RECURRENTJS var RL = {}; -(function (global) { +(function(global) { "use strict"; // syntactic sugar function for getting default parameter values - var getopt = function (opt, field_name, default_value) { + var getopt = function(opt, field_name, default_value) { if (typeof opt === 'undefined') { return default_value; } @@ -600,12 +600,12 @@ var RL = {}; var assert = R.assert; var randi = R.randi; var randf = R.randf; - var setConst = function (arr, c) { + var setConst = function(arr, c) { for (var i = 0, n = arr.length; i < n; i++) { arr[i] = c; } } - var sampleWeighted = function (p) { + var sampleWeighted = function(p) { var r = Math.random(); var c = 0.0; for (var i = 0, n = p.length; i < n; i++) { @@ -624,7 +624,7 @@ var RL = {}; // - requires model of the environment :( // - does not learn from experience :( // - assumes finite MDP :( - var DPAgent = function (env, opt) { + var DPAgent = function(env, opt) { this.V = null; // state value function this.P = null; // policy distribution \pi(s,a) this.env = env; // store pointer to environment @@ -632,7 +632,7 @@ var RL = {}; this.reset(); } DPAgent.prototype = { - reset: function () { + reset: function() { // reset the agent's policy and value function this.ns = this.env.getNumStates(); this.na = this.env.getMaxNumActions(); @@ -646,7 +646,7 @@ var RL = 
{}; } } }, - act: function (s) { + act: function(s) { // behave according to the learned policy var poss = this.env.allowedActions(s); var ps = []; @@ -658,12 +658,12 @@ var RL = {}; var maxi = sampleWeighted(ps); return poss[maxi]; }, - learn: function () { + learn: function() { // perform a single round of value iteration self.evaluatePolicy(); // writes this.V self.updatePolicy(); // writes this.P }, - evaluatePolicy: function () { + evaluatePolicy: function() { // perform a synchronous update of the value function var Vnew = zeros(this.ns); for (var s = 0; s < this.ns; s++) { @@ -685,7 +685,7 @@ var RL = {}; } this.V = Vnew; // swap }, - updatePolicy: function () { + updatePolicy: function() { // update policy to be greedy w.r.t. learned Value function for (var s = 0; s < this.ns; s++) { var poss = this.env.allowedActions(s); @@ -716,7 +716,7 @@ var RL = {}; // QAgent uses TD (Q-Learning, SARSA) // - does not require environment model :) // - learns from experience :) - var TDAgent = function (env, opt) { + var TDAgent = function(env, opt) { this.update = getopt(opt, 'update', 'qlearn'); // qlearn | sarsa this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy @@ -739,7 +739,7 @@ var RL = {}; this.reset(); } TDAgent.prototype = { - reset: function () { + reset: function() { // reset the agent's policy and value function this.ns = this.env.getNumStates(); this.na = this.env.getMaxNumActions(); @@ -770,10 +770,10 @@ var RL = {}; this.a0 = null; this.a1 = null; }, - resetEpisode: function () { + resetEpisode: function() { // an episode finished }, - act: function (s) { + act: function(s) { // act according to epsilon greedy policy var poss = this.env.allowedActions(s); var probs = []; @@ -795,7 +795,7 @@ var RL = {}; this.a1 = a; return a; }, - learn: function (r1) { + learn: function(r1) { // takes reward for previous action, which came from a call to act() if (!(this.r0 == 
null)) { this.learnFromTuple(this.s0, this.a0, this.r0, this.s1, this.a1, this.lambda); @@ -806,7 +806,7 @@ var RL = {}; } this.r0 = r1; // store this for next update }, - updateModel: function (s0, a0, r0, s1) { + updateModel: function(s0, a0, r0, s1) { // transition (s0,a0) -> (r0,s1) was observed. Update environment model var sa = a0 * this.ns + s0; if (this.env_model_s[sa] === -1) { @@ -816,7 +816,7 @@ var RL = {}; this.env_model_s[sa] = s1; this.env_model_r[sa] = r0; }, - plan: function () { + plan: function() { // order the states based on current priority queue information var spq = []; for (var i = 0, n = this.sa_seen.length; i < n; i++) { @@ -829,7 +829,7 @@ var RL = {}; }); } } - spq.sort(function (a, b) { + spq.sort(function(a, b) { return a.p < b.p ? 1 : -1 }); // perform the updates @@ -853,7 +853,7 @@ var RL = {}; this.learnFromTuple(s0, a0, r0, s1, a1, 0); // note lambda = 0 - shouldnt use eligibility trace here } }, - learnFromTuple: function (s0, a0, r0, s1, a1, lambda) { + learnFromTuple: function(s0, a0, r0, s1, a1, lambda) { var sa = a0 * this.ns + s0; // calculate the target for Q(s,a) if (this.update === 'qlearn') { @@ -917,7 +917,7 @@ var RL = {}; this.updatePolicy(s0); } }, - updatePriority: function (s, a, u) { + updatePriority: function(s, a, u) { // used in planning. 
Invoked when Q[sa] += update // we should find all states that lead to (s,a) and upgrade their priority // of being update in the next planning step @@ -942,7 +942,7 @@ var RL = {}; } } }, - updatePolicy: function (s) { + updatePolicy: function(s) { var poss = this.env.allowedActions(s); // set policy at s to be the action that achieves max_a Q(s,a) // first find the maxy Q values @@ -983,7 +983,7 @@ var RL = {}; } } } - var DQNAgent = function (env, opt) { + var DQNAgent = function(env, opt) { this.gamma = getopt(opt, 'gamma', 0.75); // future reward discount factor this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy this.alpha = getopt(opt, 'alpha', 0.01); // value function learning rate @@ -996,7 +996,7 @@ var RL = {}; this.reset(); } DQNAgent.prototype = { - reset: function () { + reset: function() { this.nh = this.num_hidden_units; // number of hidden units this.ns = this.env.getNumStates(); this.na = this.env.getMaxNumActions(); @@ -1018,7 +1018,7 @@ var RL = {}; this.a1 = null; this.tderror = 0; // for visualization only... }, - toJSON: function () { + toJSON: function() { // save function var j = {}; j.nh = this.nh; @@ -1027,14 +1027,14 @@ var RL = {}; j.net = R.netToJSON(this.net); return j; }, - fromJSON: function (j) { + fromJSON: function(j) { // load function this.nh = j.nh; this.ns = j.ns; this.na = j.na; this.net = R.netFromJSON(j.net); }, - forwardQ: function (net, s, needs_backprop) { + forwardQ: function(net, s, needs_backprop) { var G = new R.Graph(needs_backprop); var a1mat = G.add(G.mul(net.W1, s), net.b1); var h1mat = G.tanh(a1mat); @@ -1042,7 +1042,7 @@ var RL = {}; this.lastG = G; // back this up. 
Kind of hacky isn't it return a2mat; }, - act: function (slist) { + act: function(slist) { // convert to a Mat column vector var s = new R.Mat(this.ns, 1); s.setFrom(slist); @@ -1061,7 +1061,7 @@ var RL = {}; this.a1 = a; return a; }, - learn: function (r1) { + learn: function(r1) { // perform an update on Q function if (!(this.r0 == null) && this.alpha > 0) { // learn from this tuple to get a sense of how "surprising" it is to the agent @@ -1085,7 +1085,7 @@ var RL = {}; } this.r0 = r1; // store for next update }, - learnFromTuple: function (s0, a0, r0, s1, a1) { + learnFromTuple: function(s0, a0, r0, s1, a1) { // want: Q(s,a) = r + gamma * max_a' Q(s',a') // compute the target Q value var tmat = this.forwardQ(this.net, s1, false); @@ -1106,7 +1106,7 @@ var RL = {}; } } // buggy implementation, doesnt work... - var SimpleReinforceAgent = function (env, opt) { + var SimpleReinforceAgent = function(env, opt) { this.gamma = getopt(opt, 'gamma', 0.5); // future reward discount factor this.epsilon = getopt(opt, 'epsilon', 0.75); // for epsilon-greedy policy this.alpha = getopt(opt, 'alpha', 0.001); // actor net learning rate @@ -1115,7 +1115,7 @@ var RL = {}; this.reset(); } SimpleReinforceAgent.prototype = { - reset: function () { + reset: function() { this.ns = this.env.getNumStates(); this.na = this.env.getMaxNumActions(); this.nh = 100; // number of hidden units @@ -1138,7 +1138,7 @@ var RL = {}; this.baselineGraphs = []; this.t = 0; }, - forwardActor: function (s, needs_backprop) { + forwardActor: function(s, needs_backprop) { var net = this.actorNet; var G = new R.Graph(needs_backprop); var a1mat = G.add(G.mul(net.W1, s), net.b1); @@ -1149,7 +1149,7 @@ var RL = {}; 'G': G } }, - forwardValue: function (s, needs_backprop) { + forwardValue: function(s, needs_backprop) { var net = this.baselineNet; var G = new R.Graph(needs_backprop); var a1mat = G.add(G.mul(net.W1, s), net.b1); @@ -1160,7 +1160,7 @@ var RL = {}; 'G': G } }, - act: function (slist) { + act: 
function(slist) { // convert to a Mat column vector var s = new R.Mat(this.ns, 1); s.setFrom(slist); @@ -1189,7 +1189,7 @@ var RL = {}; this.a1 = a; return a; }, - learn: function (r1) { + learn: function(r1) { // perform an update on Q function this.rewardHistory.push(r1); var n = this.rewardHistory.length; @@ -1257,7 +1257,7 @@ var RL = {}; }, } // buggy implementation as well, doesn't work - var RecurrentReinforceAgent = function (env, opt) { + var RecurrentReinforceAgent = function(env, opt) { this.gamma = getopt(opt, 'gamma', 0.5); // future reward discount factor this.epsilon = getopt(opt, 'epsilon', 0.1); // for epsilon-greedy policy this.alpha = getopt(opt, 'alpha', 0.001); // actor net learning rate @@ -1266,7 +1266,7 @@ var RL = {}; this.reset(); } RecurrentReinforceAgent.prototype = { - reset: function () { + reset: function() { this.ns = this.env.getNumStates(); this.na = this.env.getMaxNumActions(); this.nh = 40; // number of hidden units @@ -1288,7 +1288,7 @@ var RL = {}; this.a0 = null; this.a1 = null; }, - act: function (slist) { + act: function(slist) { // convert to a Mat column vector var s = new R.Mat(this.ns, 1); s.setFrom(slist); @@ -1316,7 +1316,7 @@ var RL = {}; this.a1 = a; return a; }, - learn: function (r1) { + learn: function(r1) { // perform an update on Q function this.rewardHistory.push(r1); var n = this.rewardHistory.length; @@ -1382,7 +1382,7 @@ var RL = {}; }, } // Currently buggy implementation, doesnt work - var DeterministPG = function (env, opt) { + var DeterministPG = function(env, opt) { this.gamma = getopt(opt, 'gamma', 0.5); // future reward discount factor this.epsilon = getopt(opt, 'epsilon', 0.5); // for epsilon-greedy policy this.alpha = getopt(opt, 'alpha', 0.001); // actor net learning rate @@ -1391,7 +1391,7 @@ var RL = {}; this.reset(); } DeterministPG.prototype = { - reset: function () { + reset: function() { this.ns = this.env.getNumStates(); this.na = this.env.getMaxNumActions(); this.nh = 100; // number of 
hidden units @@ -1411,7 +1411,7 @@ var RL = {}; this.a1 = null; this.t = 0; }, - forwardActor: function (s, needs_backprop) { + forwardActor: function(s, needs_backprop) { var net = this.actorNet; var G = new R.Graph(needs_backprop); var a1mat = G.add(G.mul(net.W1, s), net.b1); @@ -1422,7 +1422,7 @@ var RL = {}; 'G': G } }, - act: function (slist) { + act: function(slist) { // convert to a Mat column vector var s = new R.Mat(this.ns, 1); s.setFrom(slist); @@ -1449,7 +1449,7 @@ var RL = {}; this.a1 = a; return a; }, - utilJacobianAt: function (s) { + utilJacobianAt: function(s) { var ujacobian = new R.Mat(this.ntheta, this.na); for (var a = 0; a < this.na; a++) { R.netZeroGrads(this.actorNet); @@ -1461,7 +1461,7 @@ var RL = {}; } return ujacobian; }, - learn: function (r1) { + learn: function(r1) { // perform an update on Q function //this.rewardHistory.push(r1); if (!(this.r0 == null)) { @@ -1544,7 +1544,7 @@ var Trevel = { lbcount: 0, nextBet: "", previousReward: 0, - addBet: function (bet, outcome) { + addBet: function(bet, outcome) { if (bet === "LB" && outcome === "Win") { this.betHistory.push("LO"); this.betOutcomes.push("W"); @@ -1569,7 +1569,7 @@ var Trevel = { } this.totalBets++; }, - calculateProbabilities: function () { + calculateProbabilities: function() { this.hbProbability = this.hbCount / this.betHistory.length; this.lbProbability = this.lbcount / this.betHistory.length; this.winRate = this.totalWins / this.totalBets; @@ -1577,27 +1577,27 @@ var Trevel = { this.profit = this.getProfit(); } }, - getCurrentBalance: function () { + getCurrentBalance: function() { return parseFloat($('#balance').html()); }, - placeHighBet: function () { + placeHighBet: function() { $('#double_your_btc_bet_hi_button').click(); }, - placeLowBet: function () { + placeLowBet: function() { $('#double_your_btc_bet_lo_button').click(); }, - setBetAmount: function (amount) { + setBetAmount: function(amount) { var elem = document.getElementById("double_your_btc_stake"); 
elem.value = amount; }, - setOutcome: function (bet) { + setOutcome: function(bet) { if ($('#double_your_btc_bet_lose').html() !== '') { this.addBet(bet, "Loose"); } else { this.addBet(bet, "Win"); } }, - prepareBet: function () { + prepareBet: function() { this.calculateProbabilities(); if (this.betHistory.length < 10) { if (this.useMartingale === true && this.betHistory.length > 12) { @@ -1626,7 +1626,7 @@ var Trevel = { } } }, - placeBet: function () { + placeBet: function() { if (this.nextBet === "HB") { this.placeHighBet(); } else if (this.nextBet === "LB") { @@ -1642,23 +1642,23 @@ var Trevel = { this.placeLowBet(); } }, - getProfit: function () { + getProfit: function() { return (this.getCurrentBalance() - this.startingBalance).toFixed(8); }, - getNumStates: function () { + getNumStates: function() { return 8; }, - getMaxNumActions: function () { + getMaxNumActions: function() { return 2; }, - getSentiment: function (bet) { + getSentiment: function(bet) { if (bet === "HI") { return 1; } else { return 0; } }, - getPreviousBets: function () { + getPreviousBets: function() { var hist = []; if (this.betHistory.length > 12) { hist.push(this.getSentiment(this.betHistory[this.betHistory.length - 1])); @@ -1674,11 +1674,11 @@ var Trevel = { } return hist; }, - getAgentState: function () { //we'll observe the last 8 bets + getAgentState: function() { //we'll observe the last 8 bets var s = this.getPreviousBets(); return s; }, - getReward: function () { + getReward: function() { var reward = 0; var out1 = this.betOutcomes[this.betOutcomes.length - 1]; var out2 = this.betOutcomes[this.betOutcomes.length - 2]; @@ -1715,10 +1715,10 @@ var Trevel = { return reward; }, //for raw testing only - randomNumber: function (min, max) { + randomNumber: function(min, max) { return Math.floor(Math.random() * (max - min + 1) + min); }, - getTestOutcome: function (random) { + getTestOutcome: function(random) { if (random % 2 === 0) { return "HI"; } else { @@ -1726,7 +1726,7 @@ var 
Trevel = { } }, //random string for random seed - rString: function (length, chars) { + rString: function(length, chars) { var result = ''; var length = 16; var chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789abcdefghijklmnopqrstuvwxyz' @@ -1734,11 +1734,11 @@ var Trevel = { return result; }, //set client seed as random string - rSeed: function () { + rSeed: function() { $('#next_client_seed').val(rString()); }, //initialize this - init: function () { + init: function() { this.startingBalance = this.currentBalance = parseFloat($('#balance').html()); this.setBetAmount(this.minBet); this.stop = true; @@ -1748,7 +1748,7 @@ var Trevel = { }, // ask user config variables - config: function () { + config: function() { //define temporary variables var maxb = 0, minb = 0; @@ -1782,7 +1782,7 @@ var Trevel = { //start betting startbetting(); }, - stopbets: function () { + stopbets: function() { env.stop = true; clearInterval(interval); console.log('Bet session has been stopped, to start over click start.'); @@ -1816,7 +1816,7 @@ function startbetting() { console.log('Starting bet session, to stop click STOP BOT'); document.getElementById("free_play_link_li").innerHTML = 'STOP BOT'; env.stop = false; - interval = setInterval(function () { loop(); }, env.betSpeed); + interval = setInterval(function() { loop(); }, env.betSpeed); } console.clear(); console.log('You are using Trevel, with ReinforceJS'); From 3fed033681bb876e8e1847512f415c19f330ee0d Mon Sep 17 00:00:00 2001 From: nickisghosty Date: Sun, 22 Oct 2017 08:17:55 -0700 Subject: [PATCH 4/6] fixed random seed forgot this in thgis.rString and made so every roll new random seed --- DQ-Trevel.js | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/DQ-Trevel.js b/DQ-Trevel.js index 358e04b..dbcf15a 100644 --- a/DQ-Trevel.js +++ b/DQ-Trevel.js @@ -1735,7 +1735,7 @@ var Trevel = { }, //set client seed as random string rSeed: function() { - $('#next_client_seed').val(rString()); + 
$('#next_client_seed').val(this.rString()); }, //initialize this init: function() { @@ -1825,9 +1825,8 @@ console.log('Click it to set the config. Note: These settings are not persistent console.log('To change the default values for these settings, search the script for "prompt"'); console.log('Enjoy'); function loop() { - if (env.profit > 0.00000500) { env.rSeed(); - } + if (env.stop === false) { var state = env.getAgentState(); var action = agent.act(state); From e895e4a6e77f34ec9c08aba5d12ec57727518bcf Mon Sep 17 00:00:00 2001 From: nickisghosty Date: Sun, 22 Oct 2017 10:00:55 -0700 Subject: [PATCH 5/6] added more of a log --- DQ-Trevel.js | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/DQ-Trevel.js b/DQ-Trevel.js index dbcf15a..4455bef 100644 --- a/DQ-Trevel.js +++ b/DQ-Trevel.js @@ -1516,11 +1516,12 @@ var Trevel = { //settings you can change stop: true, maxBet: 0.00001, - minBet: 0.00000005, + minBet: 0.00000001, swap: true, betSpeed: 100,//change this on init verbose: true, isTesting: false, + newseed: '', // 'win' for change ever win, 'lose' for change every loss, blank for every roll //money management useKelly: false,//martingale performs better on live account! 
korm: false, @@ -1542,6 +1543,10 @@ var Trevel = { lbProbability: 0, hbCount: 0, lbcount: 0, + hbw: 0, + hbl: 0, + lbw: 0, + lbl: 0, nextBet: "", previousReward: 0, addBet: function(bet, outcome) { @@ -1550,22 +1555,26 @@ var Trevel = { this.betOutcomes.push("W"); this.totalWins++; this.lbcount++; + this.lbw++; } if (bet === "LB" && outcome === "Loose") { this.betHistory.push("HI"); this.hbCount++; this.betOutcomes.push("L"); + this.lbl++; } if (bet === "HB" && outcome === "Win") { this.betHistory.push("HI"); this.totalWins++; this.hbCount++; this.betOutcomes.push("W"); + this.hbw++; } if (bet === "HB" && outcome === "Loose") { this.betHistory.push("LO"); this.lbcount++; this.betOutcomes.push("L"); + this.hbl++; } this.totalBets++; }, @@ -1593,7 +1602,13 @@ var Trevel = { setOutcome: function(bet) { if ($('#double_your_btc_bet_lose').html() !== '') { this.addBet(bet, "Loose"); + if (this.newseed == 'lose') { + this.rSeed(); + } } else { + if (this.newseed == 'win') { + this.rSeed(); + } this.addBet(bet, "Win"); } }, @@ -1761,6 +1776,7 @@ var Trevel = { this.martingaleMultiplier = prompt('Bet multiplier on lose', 2); this.swap = prompt('True for swap enabled, false for disabled', 'true'); this.korm = prompt('True to enable Kelly, false to enabled martingale, leave blank for both', 'false'); + this.newseed = prompt('Randomize client seed every "win", "lose", or leave blank for every roll', ''); this.betSpeed = prompt('Wait time before next bet is placed in ms', 3000); //convert satoshi to btc @@ -1825,8 +1841,11 @@ console.log('Click it to set the config. 
Note: These settings are not persistent console.log('To change the default values for these settings, search the script for "prompt"'); console.log('Enjoy'); function loop() { + if (Trevel.newseed == '') { env.rSeed(); - + } + + if (env.stop === false) { var state = env.getAgentState(); var action = agent.act(state); @@ -1847,8 +1866,19 @@ function loop() { } if (env.verbose === true) { env.calculateProbabilities(); + clear(); + //console.log("Machine Bet: " + action + "{" + env.nextBet + "} isKelly: " + env.useKelly + " isMartingale: " + env.useMartingale); - console.log("Profit: " + env.profit + " WinRate: " + (env.winRate * 100).toFixed(2)); + console.log("Bet Number: " + env.totalBets + " | Outcome: " + outcome); + console.log("Win Rate: " + (env.winRate * 100).toFixed(2) + " | Hi/Lo Win Rate: " + ((env.hbw / env.totalBets) * 100).toFixed(2) + " / " + ((env.lbw / env.totalBets) * 100).toFixed(2)); + console.log("Wins/Loses: " + env.totalWins + " / " + (env.totalBets - env.totalWins)); + console.log("Hi/Lo Bets: " + env.hbCount + " / " + env.lbcount); + console.log("Hi/Lo Wins: " + env.hbw + " / " + env.lbw); + console.log("Hi/Lo Loses: " + env.hbl + " / " + env.lbl); + console.log("Hi Probability: " + env.hbProbability.toFixed(2) + " | Lo Probability: " + env.lbProbability.toFixed(2)); + console.log("Client Seed: " + $('#next_client_seed').val() + " | Lotto Tickets: " + $('#user_lottery_tickets').html() + " | Rewards Points: " + $('.user_reward_points').text()); + console.log("Profit: " + env.profit + " | Balance: " + env.getCurrentBalance().toFixed(8)); + } } else { console.log("Action: " + action); From 82ba6030384799402c7431e7b293079ecc7b883f Mon Sep 17 00:00:00 2001 From: nickisghosty Date: Sun, 22 Oct 2017 10:54:43 -0700 Subject: [PATCH 6/6] Made log easier to read --- DQ-Trevel.js | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/DQ-Trevel.js b/DQ-Trevel.js index 4455bef..8dcbc2e 100644 --- a/DQ-Trevel.js +++ 
b/DQ-Trevel.js @@ -1869,15 +1869,19 @@ function loop() { clear(); //console.log("Machine Bet: " + action + "{" + env.nextBet + "} isKelly: " + env.useKelly + " isMartingale: " + env.useMartingale); - console.log("Bet Number: " + env.totalBets + " | Outcome: " + outcome); - console.log("Win Rate: " + (env.winRate * 100).toFixed(2) + " | Hi/Lo Win Rate: " + ((env.hbw / env.totalBets) * 100).toFixed(2) + " / " + ((env.lbw / env.totalBets) * 100).toFixed(2)); - console.log("Wins/Loses: " + env.totalWins + " / " + (env.totalBets - env.totalWins)); - console.log("Hi/Lo Bets: " + env.hbCount + " / " + env.lbcount); - console.log("Hi/Lo Wins: " + env.hbw + " / " + env.lbw); - console.log("Hi/Lo Loses: " + env.hbl + " / " + env.lbl); - console.log("Hi Probability: " + env.hbProbability.toFixed(2) + " | Lo Probability: " + env.lbProbability.toFixed(2)); - console.log("Client Seed: " + $('#next_client_seed').val() + " | Lotto Tickets: " + $('#user_lottery_tickets').html() + " | Rewards Points: " + $('.user_reward_points').text()); - console.log("Profit: " + env.profit + " | Balance: " + env.getCurrentBalance().toFixed(8)); + + console.log("| Client Seed: " + $('#next_client_seed').val() + " | Lotto Tickets: " + $('#user_lottery_tickets').html() + " | Rewards Points: " + $('.user_reward_points').text()); + console.log(" 8=========================== "+ env.totalBets + " ===========================D ~ ") + console.log("| Win Rate: " + (env.winRate * 100).toFixed(2) + " | Hi Win Rate: " + ((env.hbw / env.totalBets) * 100).toFixed(2) + " | Lo Win Rate: " + ((env.lbw / env.totalBets) * 100).toFixed(2) + " |"); + console.log("|____________________________________________________________|"); + console.log("| Total Wins: " + env.totalWins + " | Total Hi: " + env.hbCount + " | Hi Wins: " + env.hbw + " | Hi Loss " + env.hbl+ " |"); + console.log("| Total Loss: " + (env.totalBets - env.totalWins) + " | Total Lo " + env.lbcount + " | Lo Wins: " + env.lbw + " | Lo Loss " + env.lbl+" |"); + 
console.log("|____________________________________________________________|"); + console.log("| Hi Probability: " + env.hbProbability.toFixed(2) + " | Lo Probability: " + env.lbProbability.toFixed(2) + " |"); + console.log("|____________________________________________________________|"); + console.log("| Last Bet: " + env.betHistory[env.betHistory.length - 1]+ " | Outcome: " + outcome+ " | Stake: "+ env.betAmount+ " |"); + console.log("| Profit: " + env.profit + " | Balance: " + env.getCurrentBalance().toFixed(8)+ " |"); + console.log("|____________________________________________________________|") } } else {