diff --git a/edunn/__init__.py b/edunn/__init__.py index e930a1f..08d1015 100644 --- a/edunn/__init__.py +++ b/edunn/__init__.py @@ -3,6 +3,18 @@ from .model import Model, Phase, ModelWithParameters, ModelWithoutParameters from .optimizer import ( Optimizer, + SGD, + MomentumSGD, + NesterovMomentumSGD, + RMSpropOptimizer, + AdamOptimizer, + SignSGDOptimizer, +) +from .trainers import ( + Trainer, + SupervisedTrainer, + RecurrentTrainer, + # Backward-compatible wrappers (combine optimizer + trainer) BatchedGradientOptimizer, GradientDescent, RMSprop, diff --git a/edunn/optimizer.py b/edunn/optimizer.py index 2971ddc..3dd01df 100644 --- a/edunn/optimizer.py +++ b/edunn/optimizer.py @@ -2,142 +2,160 @@ # "An overview of gradient descent optimization algorithms" https://ruder.io/optimizing-gradient-descent/ -from typing import Dict import numpy as np -from .model import Model, Phase -from .model import ParameterSet -import sys, abc -from tqdm.auto import tqdm +from .model import Model, ParameterSet +import abc class Optimizer(abc.ABC): + """ + Base class for optimization algorithms. - @abc.abstractmethod - def optimize(self, model: Model, *args): - pass + An Optimizer defines *how* to update model parameters given their gradients. + It does NOT handle the training loop (batching, epochs, etc.) -- that is the + responsibility of a Trainer (see trainers.py). + Subclasses must implement: + - step(parameters, gradients, epoch, iteration): apply a single parameter update + Subclasses may optionally override: + - initialize(parameters): set up internal state (momentum buffers, etc.) + """ + + def initialize(self, parameters: ParameterSet): + """ + Initialize any internal state needed by the optimizer (e.g., momentum buffers). + Called once before the first call to step(). -def all_equal(list: []): - return len(list) == 0 or list.count(list[0]) == len(list) + :param parameters: dictionary mapping parameter names to numpy arrays + """ + pass + @abc.abstractmethod + def step(self, parameters: ParameterSet, gradients: ParameterSet, epoch: int, iteration: int): + """ + Perform a single parameter update. -import random + :param parameters: dictionary mapping parameter names to numpy arrays (mutable, update in-place) + :param gradients: dictionary mapping parameter names to their gradient arrays + :param epoch: current epoch number + :param iteration: current iteration (batch) number within the epoch + """ + pass -def batch_arrays(batch_size: int, *arrays, shuffle=False): +class SGD(Optimizer): """ + Stochastic Gradient Descent optimizer. - :param batch_size: size of batches - :param arrays: variable number of numpy arrays - :return: a generator that returns the arrays in batches + Updates parameters using the rule: + p = p - lr * gradient """ - sample_sizes = [a.shape[0] for a in arrays] - assert all_equal(sample_sizes) - batches = sample_sizes[0] // batch_size - batch_list = list(range(batches)) - if shuffle: - random.shuffle(batch_list) - for i in batch_list: - start = i * batch_size - end = start + batch_size - batch = [a[start:end,] for a in arrays] - yield tuple(batch) + def __init__(self, lr: float = 0.1): + self.lr = lr + def step(self, parameters: ParameterSet, gradients: ParameterSet, epoch: int, iteration: int): + for parameter_name, δEδp in gradients.items(): + p = parameters[parameter_name] + # use p[:] so that updates are in-place + # instead of creating a new variable + """YOUR IMPLEMENTATION START""" + p[:] = p - self.lr * δEδp + """YOUR IMPLEMENTATION END""" -class BatchedGradientOptimizer(Optimizer): - def __init__(self, batch_size: int, epochs: int, shuffle=True): - """ - :param epochs: number of epochs to train the model. Each epoch is a complete iteration over the training set. The number of parameter updates is n //batch_size, where n is the number of samples of the dataset - :param batch_size: Batch the dataset with batches of size `batch_size`, and perform an optimization step for each batch - """ - self.batch_size = batch_size - self.epochs = epochs - self.shuffle = shuffle +class MomentumSGD(Optimizer): + """ + Gradient Descent with Momentum. + + Maintains a velocity buffer and updates parameters using: + v = gamma * v + lr * gradient + p = p - v + """ - def backpropagation(self, model: Model, x: np.ndarray, y_true: np.ndarray, error_layer: Model): - # forward pass (model and error) - y = model.forward(x) - E = error_layer.forward(y_true, y) + def __init__(self, lr: float = 0.1, gamma: float = 0.9): + self.lr = lr + self.gamma = gamma + self.v = {} - # backward pass (error and model) - δEδy, _ = error_layer.backward(1) - δEδx, δEδps = model.backward(δEδy) + def initialize(self, parameters: ParameterSet): + for k, p in parameters.items(): + self.v[k] = np.zeros_like(p) - return δEδx, δEδps, E + def step(self, parameters: ParameterSet, gradients: ParameterSet, epoch: int, iteration: int): + if not self.v: + self.initialize(parameters) - def optimize(self, model: Model, x: np.ndarray, y: np.ndarray, error_layer: Model, verbose=True): - """ - Fit a model to a dataset. - :param model: the Model to optimize - :param x: dataset inputs - :param y: dataset outputs - :param error_layer: To be applied to the output of the last layer - :return: - """ - n = x.shape[0] - batches = n // self.batch_size - history = [] - model.set_phase(Phase.Training) - bar = tqdm(range(self.epochs), desc=f"optim. {model.name}", file=sys.stdout, disable=not verbose) - for epoch in bar: - epoch_error = 0 - for i, (x_batch, y_batch) in enumerate(batch_arrays(self.batch_size, x, y, shuffle=self.shuffle)): - δEδx, δEδps, batch_error = self.backpropagation(model, x_batch, y_batch, error_layer) - self.optimize_batch(model, δEδps, epoch, i) - epoch_error += batch_error - epoch_error /= batches - history.append(epoch_error) - bar.set_postfix_str(f"{error_layer.name}: {epoch_error:.5f}") - - return np.array(history) + for k, δEδp in gradients.items(): + p = parameters[k] + v = self.v[k] + # use p[:] and v[:] so that updates are in-place + # instead of creating a new variable + """YOUR IMPLEMENTATION START""" + v[:] = self.gamma * v + self.lr * δEδp + p[:] = p - v + """YOUR IMPLEMENTATION END""" - @abc.abstractmethod - def optimize_batch(self, model: Model, x: np.ndarray, y: np.ndarray, error_layer: Model, epoch: int): - pass +class NesterovMomentumSGD(Optimizer): + """ + Nesterov Accelerated Gradient (NAG) optimizer. -class GradientDescent(BatchedGradientOptimizer): + A variant of momentum that "looks ahead" by computing the gradient at + the anticipated future position: + v = gamma * v + lr * gradient + p = p - (gamma * v + lr * gradient) + """ - def __init__(self, batch_size: int, epochs: int, lr: float = 0.1, shuffle=True): - super().__init__(batch_size, epochs, shuffle) + def __init__(self, lr: float = 0.1, gamma: float = 0.9): self.lr = lr + self.gamma = gamma + self.v = {} - def optimize_batch(self, model: Model, δEδps: ParameterSet, epoch: int, iteration: int): + def initialize(self, parameters: ParameterSet): + for k, p in parameters.items(): + self.v[k] = np.zeros_like(p) - # Update parameters - parameters = model.get_parameters() - for parameter_name, δEδp in δEδps.items(): - p = parameters[parameter_name] + def step(self, parameters: ParameterSet, gradients: ParameterSet, epoch: int, iteration: int): + if not self.v: + self.initialize(parameters) + + for k, δEδp in gradients.items(): + p = parameters[k] + v = self.v[k] # use p[:] so that updates are in-place # instead of creating a new variable """YOUR IMPLEMENTATION START""" - p[:] = p - self.lr * δEδp + v[:] = self.gamma * v + self.lr * δEδp + p[:] = p - (self.gamma * v + self.lr * δEδp) """YOUR IMPLEMENTATION END""" -class RMSprop(BatchedGradientOptimizer): +class RMSpropOptimizer(Optimizer): + """ + RMSprop optimizer. + + Adapts the learning rate per-parameter using a running average of + squared gradients: + v = beta * v + (1 - beta) * gradient^2 + p = p - lr / (sqrt(v) + eps) * gradient + """ - def __init__( - self, batch_size: int, epochs: int, lr: float = 0.1, beta: float = 0.99, eps: float = 1e-8, shuffle=True - ): - super().__init__(batch_size, epochs, shuffle) + def __init__(self, lr: float = 0.1, beta: float = 0.99, eps: float = 1e-8): self.lr = lr self.beta = beta self.eps = eps - self.first = True self.v = {} - def optimize_batch(self, model: Model, δEδps: ParameterSet, epoch: int, iteration: int): - if self.first: - self.first = False - for k, p in model.get_parameters().items(): - self.v[k] = np.zeros_like(p) + def initialize(self, parameters: ParameterSet): + for k, p in parameters.items(): + self.v[k] = np.zeros_like(p) + + def step(self, parameters: ParameterSet, gradients: ParameterSet, epoch: int, iteration: int): + if not self.v: + self.initialize(parameters) - # Update parameters - parameters = model.get_parameters() - for parameter_name, δEδp in δEδps.items(): + for parameter_name, δEδp in gradients.items(): p = parameters[parameter_name] # use p[:] so that updates are in-place # instead of creating a new variable @@ -147,30 +165,37 @@ def optimize_batch(self, model: Model, δEδps: ParameterSet, epoch: int, iterat """YOUR IMPLEMENTATION END""" -class Adam(BatchedGradientOptimizer): +class AdamOptimizer(Optimizer): + """ + Adam optimizer (Adaptive Moment Estimation). + + Combines ideas from momentum and RMSprop, maintaining both first and + second moment estimates of the gradients with bias correction: + m = beta1 * m + (1 - beta1) * gradient + v = beta2 * v + (1 - beta2) * gradient^2 + m_hat = m / (1 - beta1^t) + v_hat = v / (1 - beta2^t) + p = p - lr * m_hat / (sqrt(v_hat) + eps) + """ - def __init__( - self, batch_size: int, epochs: int, lr: float = 0.1, betas: tuple = (0.9, 0.999), eps: int = 1e-08, shuffle=True - ): - super().__init__(batch_size, epochs, shuffle) + def __init__(self, lr: float = 0.1, betas: tuple = (0.9, 0.999), eps: float = 1e-08): self.lr = lr self.beta_1, self.beta_2 = betas self.eps = eps - self.first = True self.m = {} self.v = {} - def optimize_batch(self, model: Model, δEδps: ParameterSet, epoch: int, iteration: int): - if self.first: - self.first = False - for k, p in model.get_parameters().items(): - self.m[k] = np.zeros_like(p) - self.v[k] = np.zeros_like(p) + def initialize(self, parameters: ParameterSet): + for k, p in parameters.items(): + self.m[k] = np.zeros_like(p) + self.v[k] = np.zeros_like(p) + + def step(self, parameters: ParameterSet, gradients: ParameterSet, epoch: int, iteration: int): + if not self.m: + self.initialize(parameters) iteration += 1 - # Update parameters - parameters = model.get_parameters() - for parameter_name, δEδp in δEδps.items(): + for parameter_name, δEδp in gradients.items(): p = parameters[parameter_name] # use p[:] so that updates are in-place # instead of creating a new variable @@ -183,76 +208,21 @@ def optimize_batch(self, model: Model, δEδps: ParameterSet, epoch: int, iterat """YOUR IMPLEMENTATION END""" -class MomentumGD(BatchedGradientOptimizer): - - def __init__(self, batch_size: int, epochs: int, lr: float = 0.1, gamma=0.9, shuffle=True): - super().__init__(batch_size, epochs, shuffle) - self.lr = lr - self.gamma = gamma - self.first = True - self.v = {} - - def optimize_batch(self, model: Model, δEδps: ParameterSet, epoch: int, iteration: int): - if self.first: - self.first = False - for k, p in model.get_parameters().items(): - self.v[k] = np.zeros_like(p) - - # Update parameters - parameters = model.get_parameters() - for k, δEδp in δEδps.items(): - # K = parameter name - p = parameters[k] - v = self.v[k] - # use p[:] and v[:] so that updates are in-place - # instead of creating a new variable - """YOUR IMPLEMENTATION START""" - v[:] = self.gamma * v + self.lr * δEδp - p[:] = p - v - """YOUR IMPLEMENTATION END""" - +class SignSGDOptimizer(Optimizer): + """ + Sign Stochastic Gradient Descent. -class NesterovMomentumGD(BatchedGradientOptimizer): + Normalizes each gradient component by its magnitude, effectively + taking a unit step in the direction of the gradient: + p = p - lr * gradient / sqrt(gradient^2 + eps) + """ - def __init__(self, batch_size: int, epochs: int, lr: float = 0.1, gamma=0.9, shuffle=True): - super().__init__(batch_size, epochs, shuffle) + def __init__(self, lr: float = 0.1, eps: float = 1e-8): self.lr = lr - self.gamma = gamma - self.first = True - self.v = {} - - def optimize_batch(self, model: Model, δEδps: ParameterSet, epoch: int, iteration: int): - if self.first: - self.first = False - for k, p in model.get_parameters().items(): - self.v[k] = np.zeros_like(p) - - # Update parameters - parameters = model.get_parameters() - for k, δEδp in δEδps.items(): - # K = parameter name - p = parameters[k] - v = self.v[k] - # use p[:] so that updates are in-place - # instead of creating a new variable - """YOUR IMPLEMENTATION START""" - v[:] = self.gamma * v + self.lr * δEδp - p[:] = p - (self.gamma * v + self.lr * δEδp) - """YOUR IMPLEMENTATION END""" - - -class SignGD(BatchedGradientOptimizer): - - def __init__(self, batch_size: int, epochs: int, lr: float = 0.1, eps=1e-8, shuffle=True): - super().__init__(batch_size, epochs, shuffle) self.eps = eps - self.lr = lr - - def optimize_batch(self, model: Model, δEδps: ParameterSet, epoch: int, iteration: int): - # Update parameters - parameters = model.get_parameters() - for parameter_name, δEδp in δEδps.items(): + def step(self, parameters: ParameterSet, gradients: ParameterSet, epoch: int, iteration: int): + for parameter_name, δEδp in gradients.items(): p = parameters[parameter_name] # use p[:] so that updates are in-place # instead of creating a new variable diff --git a/edunn/trainers.py b/edunn/trainers.py index 65710f8..e596921 100644 --- a/edunn/trainers.py +++ b/edunn/trainers.py @@ -1,29 +1,37 @@ -# Additional material to help you implement optimizers: -# "An overview of gradient descent optimization algorithms" https://ruder.io/optimizing-gradient-descent/ +# Training strategies for eduNN models. +# +# A Trainer handles the training loop: batching, epochs, forward/backward passes, +# and calling the Optimizer to update parameters. Different Trainer subclasses +# implement different training strategies (supervised, recurrent, etc.). +# +# For backward compatibility, this module also provides wrapper classes that +# combine an Optimizer with a Trainer under the old API +# (e.g., GradientDescent(batch_size, epochs, lr).optimize(...)). from typing import Dict import numpy as np from .model import Model, Phase from .model import ParameterSet -import sys, abc +from .optimizer import ( + Optimizer, + SGD, + MomentumSGD, + NesterovMomentumSGD, + RMSpropOptimizer, + AdamOptimizer, + SignSGDOptimizer, +) +import sys +import abc +import random from tqdm.auto import tqdm -class Optimizer(abc.ABC): - - @abc.abstractmethod - def optimize(self, model: Model, *args): - pass - - def all_equal(list: []): return len(list) == 0 or list.count(list[0]) == len(list) -import random - - def batch_arrays(batch_size: int, *arrays, shuffle=False): """ @@ -45,18 +53,59 @@ def batch_arrays(batch_size: int, *arrays, shuffle=False): yield tuple(batch) -class BatchedGradientOptimizer(Optimizer): +def _accumulate_gradients(accumulated: ParameterSet, new_grads: ParameterSet): + """Add new_grads into accumulated in-place. If accumulated is empty, initialize it.""" + for k, v in new_grads.items(): + if k in accumulated: + accumulated[k] = accumulated[k] + v + else: + accumulated[k] = v.copy() + + +def _scale_gradients(gradients: ParameterSet, scale: float) -> ParameterSet: + """Scale all gradients by a constant factor.""" + return {k: v * scale for k, v in gradients.items()} + - def __init__(self, batch_size: int, epochs: int, shuffle=True): +# --------------------------------------------------------------------------- +# Trainer base class and strategies +# --------------------------------------------------------------------------- + + +class Trainer(abc.ABC): + """ + Base class for training strategies. + + A Trainer orchestrates the training loop: iterating over epochs and batches, + performing forward/backward passes, and calling an Optimizer to update + model parameters. + + Subclasses implement different training strategies for different model types + (feedforward, recurrent, generative, etc.). + """ + + def __init__(self, optimizer: Optimizer, batch_size: int, epochs: int, shuffle: bool = True): """ - :param epochs: number of epochs to train the model. Each epoch is a complete iteration over the training set. The number of parameter updates is n //batch_size, where n is the number of samples of the dataset - :param batch_size: Batch the dataset with batches of size `batch_size`, and perform an optimization step for each batch + :param optimizer: the Optimizer algorithm to use for parameter updates + :param batch_size: size of mini-batches + :param epochs: number of epochs (full passes over the training set) + :param shuffle: whether to shuffle batches each epoch """ + self.optimizer = optimizer self.batch_size = batch_size self.epochs = epochs self.shuffle = shuffle def backpropagation(self, model: Model, x: np.ndarray, y_true: np.ndarray, error_layer: Model): + """ + Perform a single forward + backward pass. + + :param model: the model + :param x: input batch + :param y_true: target batch + :param error_layer: error/loss layer + :return: (input_gradient, parameter_gradients, error_value) + """ # forward pass (model and error) y = model.forward(x) E = error_layer.forward(y_true, y) @@ -67,25 +116,60 @@ def backpropagation(self, model: Model, x: np.ndarray, y_true: np.ndarray, error return δEδx, δEδps, E - def optimize(self, model: Model, x: np.ndarray, y: np.ndarray, error_layer: Model, verbose=True): + @abc.abstractmethod + def train(self, model: Model, x: np.ndarray, y: np.ndarray, error_layer: Model, verbose=True) -> np.ndarray: + """ + Train a model on a dataset. + + :param model: the Model to train + :param x: dataset inputs + :param y: dataset outputs/targets + :param error_layer: loss function layer + :param verbose: whether to show progress + :return: array of per-epoch error values (training history) + """ + pass + + def optimize(self, model: Model, x: np.ndarray, y: np.ndarray, error_layer: Model, verbose=True) -> np.ndarray: """ - Fit a model to a dataset. + Alias for train(), provided for backward compatibility. + """ + return self.train(model, x, y, error_layer, verbose) + + +class SupervisedTrainer(Trainer): + """ + Standard supervised training strategy. + + Performs forward/backward on each mini-batch, then immediately updates + parameters via the optimizer. This is the typical training loop for + feedforward networks (MLPs, CNNs, etc.). + """ + + def __init__(self, optimizer: Optimizer, batch_size: int, epochs: int, shuffle: bool = True): + super().__init__(optimizer, batch_size, epochs, shuffle) + + def train(self, model: Model, x: np.ndarray, y: np.ndarray, error_layer: Model, verbose=True) -> np.ndarray: + """ + Fit a model to a dataset using standard supervised training. + :param model: the Model to optimize :param x: dataset inputs :param y: dataset outputs :param error_layer: To be applied to the output of the last layer - :return: + :return: array of per-epoch error values """ n = x.shape[0] batches = n // self.batch_size history = [] model.set_phase(Phase.Training) + self.optimizer.initialize(model.get_parameters()) bar = tqdm(range(self.epochs), desc=f"optim. {model.name}", file=sys.stdout, disable=not verbose) for epoch in bar: epoch_error = 0 for i, (x_batch, y_batch) in enumerate(batch_arrays(self.batch_size, x, y, shuffle=self.shuffle)): - δEδx, δEδps, batch_error = self.backpropagation(model, x, y, error_layer) - self.optimize_batch(model, δEδps, epoch, i) + δEδx, δEδps, batch_error = self.backpropagation(model, x_batch, y_batch, error_layer) + self.optimizer.step(model.get_parameters(), δEδps, epoch, i) epoch_error += batch_error epoch_error /= batches history.append(epoch_error) @@ -93,104 +177,248 @@ def optimize(self, model: Model, x: np.ndarray, y: np.ndarray, error_layer: Mode return np.array(history) - @abc.abstractmethod - def optimize_batch(self, model: Model, x: np.ndarray, y: np.ndarray, error_layer: Model, epoch: int): - pass + +class RecurrentTrainer(Trainer): + """ + Training strategy for recurrent models (RNNs, LSTMs, etc.). + + Unlike SupervisedTrainer which updates parameters after every batch, + RecurrentTrainer supports gradient accumulation across multiple batches + before performing a parameter update. This is useful for: + - Training on long sequences where memory is limited + - Simulating larger effective batch sizes + - Stabilizing training of recurrent models + + The gradient_accumulation_steps parameter controls how many batches of + gradients are accumulated before a single optimizer step. + """ + + def __init__( + self, + optimizer: Optimizer, + batch_size: int, + epochs: int, + shuffle: bool = True, + gradient_accumulation_steps: int = 1, + ): + """ + :param optimizer: the Optimizer algorithm to use for parameter updates + :param batch_size: size of mini-batches + :param epochs: number of epochs + :param shuffle: whether to shuffle batches each epoch + :param gradient_accumulation_steps: number of batches over which to + accumulate gradients before performing an optimizer step. A value + of 1 means update every batch (same as SupervisedTrainer). A value + of N means accumulate gradients over N batches, then update once + with the averaged gradients. + """ + super().__init__(optimizer, batch_size, epochs, shuffle) + assert gradient_accumulation_steps >= 1, "gradient_accumulation_steps must be >= 1" + self.gradient_accumulation_steps = gradient_accumulation_steps + + def train(self, model: Model, x: np.ndarray, y: np.ndarray, error_layer: Model, verbose=True) -> np.ndarray: + """ + Train a recurrent model with gradient accumulation. + + :param model: the Model to train + :param x: dataset inputs (typically shape: batch x timesteps x features) + :param y: dataset targets + :param error_layer: loss function layer + :param verbose: whether to show progress + :return: array of per-epoch error values + """ + n = x.shape[0] + batches = n // self.batch_size + history = [] + model.set_phase(Phase.Training) + self.optimizer.initialize(model.get_parameters()) + bar = tqdm(range(self.epochs), desc=f"optim. {model.name}", file=sys.stdout, disable=not verbose) + for epoch in bar: + epoch_error = 0 + accumulated_grads = {} + steps_since_update = 0 + + for i, (x_batch, y_batch) in enumerate(batch_arrays(self.batch_size, x, y, shuffle=self.shuffle)): + δEδx, δEδps, batch_error = self.backpropagation(model, x_batch, y_batch, error_layer) + _accumulate_gradients(accumulated_grads, δEδps) + steps_since_update += 1 + epoch_error += batch_error + + if steps_since_update >= self.gradient_accumulation_steps: + # Average the accumulated gradients and apply update + avg_grads = _scale_gradients(accumulated_grads, 1.0 / steps_since_update) + self.optimizer.step(model.get_parameters(), avg_grads, epoch, i) + accumulated_grads = {} + steps_since_update = 0 + + # Handle any remaining accumulated gradients at end of epoch + if steps_since_update > 0: + avg_grads = _scale_gradients(accumulated_grads, 1.0 / steps_since_update) + self.optimizer.step(model.get_parameters(), avg_grads, epoch, batches) + + epoch_error /= batches + history.append(epoch_error) + bar.set_postfix_str(f"{error_layer.name}: {epoch_error:.5f}") + + return np.array(history) + + +# --------------------------------------------------------------------------- +# Backward-compatible wrapper classes +# --------------------------------------------------------------------------- +# +# These classes preserve the old API where the optimizer algorithm and +# training loop were combined into a single object: +# +# optimizer = nn.GradientDescent(batch_size=32, epochs=100, lr=0.1) +# history = optimizer.optimize(model, x, y, error) +# +# Internally they delegate to the new separated Optimizer + SupervisedTrainer. +# --------------------------------------------------------------------------- + + +class BatchedGradientOptimizer(SupervisedTrainer): + """ + Base class for backward-compatible optimizer+trainer wrappers. + + This class exists to preserve the old API where optimizer algorithm + and training strategy were combined. New code should use + SupervisedTrainer (or RecurrentTrainer) with a separate Optimizer instead. + """ + pass class GradientDescent(BatchedGradientOptimizer): + """ + Backward-compatible wrapper combining SGD optimizer with supervised training. + + Old API (still works): + optimizer = nn.GradientDescent(batch_size=32, epochs=100, lr=0.1) + history = optimizer.optimize(model, x, y, error) + + New equivalent: + optimizer = nn.SGD(lr=0.1) + trainer = nn.SupervisedTrainer(optimizer, batch_size=32, epochs=100) + history = trainer.train(model, x, y, error) + """ def __init__(self, batch_size: int, epochs: int, lr: float = 0.1, shuffle=True): - super().__init__(batch_size, epochs, shuffle) + super().__init__(SGD(lr), batch_size, epochs, shuffle) + self.lr = lr + + def optimize_batch(self, model: Model, δEδps: ParameterSet, epoch: int, iteration: int): + self.optimizer.step(model.get_parameters(), δEδps, epoch, iteration) + + +class RMSprop(BatchedGradientOptimizer): + """ + Backward-compatible wrapper combining RMSprop optimizer with supervised training. + + Old API (still works): + optimizer = nn.RMSprop(batch_size=32, epochs=100, lr=0.1) + history = optimizer.optimize(model, x, y, error) + + New equivalent: + optimizer = nn.RMSpropOptimizer(lr=0.1) + trainer = nn.SupervisedTrainer(optimizer, batch_size=32, epochs=100) + history = trainer.train(model, x, y, error) + """ + + def __init__( + self, batch_size: int, epochs: int, lr: float = 0.1, beta: float = 0.99, eps: float = 1e-8, shuffle=True + ): + super().__init__(RMSpropOptimizer(lr, beta, eps), batch_size, epochs, shuffle) self.lr = lr def optimize_batch(self, model: Model, δEδps: ParameterSet, epoch: int, iteration: int): + self.optimizer.step(model.get_parameters(), δEδps, epoch, iteration) + + +class Adam(BatchedGradientOptimizer): + """ + Backward-compatible wrapper combining Adam optimizer with supervised training. + + Old API (still works): + optimizer = nn.Adam(batch_size=32, epochs=100, lr=0.1) + history = optimizer.optimize(model, x, y, error) + + New equivalent: + optimizer = nn.AdamOptimizer(lr=0.1) + trainer = nn.SupervisedTrainer(optimizer, batch_size=32, epochs=100) + history = trainer.train(model, x, y, error) + """ + + def __init__( + self, batch_size: int, epochs: int, lr: float = 0.1, betas: tuple = (0.9, 0.999), eps: int = 1e-08, shuffle=True + ): + super().__init__(AdamOptimizer(lr, betas, eps), batch_size, epochs, shuffle) + self.lr = lr - # Update parameters - parameters = model.get_parameters() - for parameter_name, δEδp in δEδps.items(): - p = parameters[parameter_name] - # use p[:] so that updates are in-place - # instead of creating a new variable - """YOUR IMPLEMENTATION START""" - p[:] = p - self.lr * δEδp - """YOUR IMPLEMENTATION END""" + def optimize_batch(self, model: Model, δEδps: ParameterSet, epoch: int, iteration: int): + self.optimizer.step(model.get_parameters(), δEδps, epoch, iteration) class MomentumGD(BatchedGradientOptimizer): + """ + Backward-compatible wrapper combining Momentum SGD optimizer with supervised training. + + Old API (still works): + optimizer = nn.MomentumGD(batch_size=32, epochs=100, lr=0.1) + history = optimizer.optimize(model, x, y, error) + + New equivalent: + optimizer = nn.MomentumSGD(lr=0.1, gamma=0.9) + trainer = nn.SupervisedTrainer(optimizer, batch_size=32, epochs=100) + history = trainer.train(model, x, y, error) + """ def __init__(self, batch_size: int, epochs: int, lr: float = 0.1, gamma=0.9, shuffle=True): - super().__init__(batch_size, epochs, shuffle) + super().__init__(MomentumSGD(lr, gamma), batch_size, epochs, shuffle) self.lr = lr - self.gamma = gamma - self.first = True - self.v = {} def optimize_batch(self, model: Model, δEδps: ParameterSet, epoch: int, iteration: int): - if self.first: - self.first = False - for k, p in model.get_parameters().items(): - self.v[k] = np.zeros_like(p) - - # Update parameters - parameters = model.get_parameters() - for k, δEδp in δEδps.items(): - # K = parameter name - p = parameters[k] - v = self.v[k] - # use p[:] and v[:] so that updates are in-place - # instead of creating a new variable - """YOUR IMPLEMENTATION START""" - v[:] = self.gamma * v + self.lr * δEδp - p[:] = p - v - """YOUR IMPLEMENTATION END""" + self.optimizer.step(model.get_parameters(), δEδps, epoch, iteration) class NesterovMomentumGD(BatchedGradientOptimizer): + """ + Backward-compatible wrapper combining Nesterov Momentum SGD optimizer with supervised training. + + Old API (still works): + optimizer = nn.NesterovMomentumGD(batch_size=32, epochs=100, lr=0.1) + history = optimizer.optimize(model, x, y, error) + + New equivalent: + optimizer = nn.NesterovMomentumSGD(lr=0.1, gamma=0.9) + trainer = nn.SupervisedTrainer(optimizer, batch_size=32, epochs=100) + history = trainer.train(model, x, y, error) + """ def __init__(self, batch_size: int, epochs: int, lr: float = 0.1, gamma=0.9, shuffle=True): - super().__init__(batch_size, epochs, shuffle) + super().__init__(NesterovMomentumSGD(lr, gamma), batch_size, epochs, shuffle) self.lr = lr - self.gamma = gamma - self.first = True - self.v = {} def optimize_batch(self, model: Model, δEδps: ParameterSet, epoch: int, iteration: int): - if self.first: - self.first = False - for k, p in model.get_parameters().items(): - self.v[k] = np.zeros_like(p) - - # Update parameters - parameters = model.get_parameters() - for k, δEδp in δEδps.items(): - # K = parameter name - p = parameters[k] - v = self.v[k] - # use p[:] so that updates are in-place - # instead of creating a new variable - """YOUR IMPLEMENTATION START""" - v[:] = self.gamma * v + self.lr * δEδp - p[:] = p - (self.gamma * v + self.lr * δEδp) - """YOUR IMPLEMENTATION END""" + self.optimizer.step(model.get_parameters(), δEδps, epoch, iteration) class SignGD(BatchedGradientOptimizer): + """ + Backward-compatible wrapper combining SignSGD optimizer with supervised training. + + Old API (still works): + optimizer = nn.SignGD(batch_size=32, epochs=100, lr=0.1) + history = optimizer.optimize(model, x, y, error) + + New equivalent: + optimizer = nn.SignSGDOptimizer(lr=0.1) + trainer = nn.SupervisedTrainer(optimizer, batch_size=32, epochs=100) + history = trainer.train(model, x, y, error) + """ def __init__(self, batch_size: int, epochs: int, lr: float = 0.1, eps=1e-8, shuffle=True): - super().__init__(batch_size, epochs, shuffle) - self.eps = eps + super().__init__(SignSGDOptimizer(lr, eps), batch_size, epochs, shuffle) self.lr = lr def optimize_batch(self, model: Model, δEδps: ParameterSet, epoch: int, iteration: int): - - # Update parameters - parameters = model.get_parameters() - for parameter_name, δEδp in δEδps.items(): - p = parameters[parameter_name] - # use p[:] so that updates are in-place - # instead of creating a new variable - """YOUR IMPLEMENTATION START""" - denom = np.sqrt(δEδp**2 + self.eps) - p[:] = p - self.lr * (δEδp / denom) - """YOUR IMPLEMENTATION END""" + self.optimizer.step(model.get_parameters(), δEδps, epoch, iteration)