Source code for pyro.optim.adagrad_rmsprop

# Copyright (c) 2017-2019 Uber Technologies, Inc.
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Callable, Optional

import torch
from torch.optim.optimizer import Optimizer


class AdagradRMSProp(Optimizer):
    """
    Implements a mash-up of the Adagrad algorithm and RMSProp. For the precise
    update equation see equations 10 and 11 in reference [1].

    References:

    [1] 'Automatic Differentiation Variational Inference',
        Alp Kucukelbir, Dustin Tran, Rajesh Ranganath, Andrew Gelman, David M. Blei
        URL: https://arxiv.org/abs/1603.00788
    [2] 'Lecture 6.5 RmsProp: Divide the gradient by a running average of its recent magnitude',
        Tieleman, T. and Hinton, G.,
        COURSERA: Neural Networks for Machine Learning.
    [3] 'Adaptive subgradient methods for online learning and stochastic optimization',
        Duchi, John, Hazan, E and Singer, Y.

    Arguments:

    :param params: iterable of parameters to optimize or dicts defining parameter groups
    :param eta: sets the step size scale (optional; default: 1.0)
    :type eta: float
    :param t: momentum parameter (optional; default: 0.1)
    :type t: float
    :param delta: modulates the exponent that controls how the step size scales
        (optional; default: 1e-16)
    :type delta: float
    """

    def __init__(
        self, params, eta: float = 1.0, delta: float = 1.0e-16, t: float = 0.1
    ):
        defaults = dict(eta=eta, delta=delta, t=t)
        super().__init__(params, defaults)

        for group in self.param_groups:
            for p in group["params"]:
                state = self.state[p]
                state["step"] = 0
                state["sum"] = torch.zeros_like(p.data)
    def share_memory(self) -> None:
        for group in self.param_groups:
            for p in group["params"]:
                state = self.state[p]
                state["sum"].share_memory_()
    def step(self, closure: Optional[Callable] = None) -> Optional[Any]:
        """
        Performs a single optimization step.

        :param closure: An (optional) closure that reevaluates the model and returns the loss.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue

                grad = p.grad.data
                if grad.is_sparse:
                    raise NotImplementedError

                state = self.state[p]
                state["step"] += 1
                if state["step"] == 1:
                    # if first step, initialize variance bit to grad^2
                    state["sum"] = grad * grad
                else:
                    # exponential moving average of grad^2 with rate t
                    state["sum"] *= 1.0 - group["t"]
                    state["sum"] += group["t"] * grad * grad

                # polynomially decaying step size: eta * step^(delta - 1/2)
                lr = group["eta"] * (state["step"] ** (-0.5 + group["delta"]))
                std = state["sum"].sqrt()
                # p <- p - lr * grad / (1 + sqrt(sum))
                p.data.addcdiv_(grad, 1.0 + std, value=-lr)
        return loss
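A minimal usage sketch, not part of this module: because ``AdagradRMSProp`` subclasses ``torch.optim.Optimizer``, it can drive an ordinary PyTorch training loop directly. The toy data and parameter names below are illustrative only.

# Minimal sketch (assumed toy problem); only AdagradRMSProp itself comes from this module.
import torch

from pyro.optim.adagrad_rmsprop import AdagradRMSProp

# Hypothetical least-squares problem: recover a 3-dimensional weight vector.
x = torch.randn(64, 3)
y = x @ torch.tensor([1.0, -2.0, 0.5]) + 0.1 * torch.randn(64)

w = torch.zeros(3, requires_grad=True)
opt = AdagradRMSProp([w], eta=1.0, delta=1e-16, t=0.1)

for _ in range(100):
    opt.zero_grad()
    loss = ((x @ w - y) ** 2).mean()
    loss.backward()
    opt.step()

Within Pyro's SVI this optimizer is usually reached through the ``pyro.optim`` wrapper of the same name rather than instantiated directly; direct use as above is mainly for plain PyTorch code.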