Source code for pyro.contrib.bnn.hidden_layer

# Copyright (c) 2017-2019 Uber Technologies, Inc.
# SPDX-License-Identifier: Apache-2.0

import torch
import torch.nn.functional as F
from torch.distributions.utils import lazy_property

from pyro.contrib.bnn.utils import adjoin_ones_vector
from pyro.distributions.torch_distribution import TorchDistribution

class HiddenLayer(TorchDistribution):
    r"""
    This distribution is a basic building block in a Bayesian neural network.
    It represents a single hidden layer, i.e. an affine transformation applied
    to a set of inputs `X` followed by a non-linearity. The uncertainty in the
    weights is encoded in a Normal variational distribution specified by the
    parameters `A_scale` and `A_mean`. The so-called 'local reparameterization
    trick' is used to reduce variance (see reference below). In effect, this
    means the weights are never sampled directly; instead one samples in
    pre-activation space (i.e. before the non-linearity is applied). Since the
    weights are never directly sampled, when this distribution is used within
    the context of variational inference, care must be taken to correctly scale
    the KL divergence term that corresponds to the weight matrix. This term is
    folded into the `log_prob` method of this distribution.

    In effect, this distribution encodes the following generative process:

    A ~ Normal(A_mean, A_scale)
    output ~ non_linearity(XA)

    Although `X`, `A_mean` and `A_scale` default to `None` in the signature,
    all three are required: the constructor reads their sizes unconditionally.

    :param torch.Tensor X: B x D dimensional mini-batch of inputs
    :param torch.Tensor A_mean: D x H dimensional tensor specifying the weight
        mean
    :param torch.Tensor A_scale: D x H dimensional (diagonal covariance matrix)
        specifying weight uncertainty. must be 2-dimensional.
    :param callable non_linearity: a callable that specifies the
        non-linearity used. defaults to ReLU.
    :param float KL_factor: scaling factor for the KL divergence.
        prototypically this is equal to the size of the mini-batch divided by
        the size of the whole dataset. defaults to `1.0`.
    :param A_prior_scale: the prior over the weights is assumed to be normal
        with mean zero and scale factor `A_prior_scale`. default value is 1.0.
    :type A_prior_scale: float or torch.Tensor
    :param bool include_hidden_bias: controls whether the activations should
        be augmented with a 1, which can be used to incorporate bias terms.
        defaults to `True`.
    :param bool weight_space_sampling: if `True`, the weights are sampled
        directly instead of using the local reparameterization trick. this is
        only intended to be used for internal testing. defaults to `False`.

    Reference:

    Kingma, Diederik P., Tim Salimans, and Max Welling.
    "Variational dropout and the local reparameterization trick."
    Advances in Neural Information Processing Systems. 2015.
    """

    has_rsample = True

    def __init__(
        self,
        X=None,
        A_mean=None,
        A_scale=None,
        non_linearity=F.relu,
        KL_factor=1.0,
        A_prior_scale=1.0,
        include_hidden_bias=True,
        weight_space_sampling=False,
    ):
        self.X = X
        # D (input width) is taken from X, H (output width) from A_mean.
        self.dim_X = X.size(-1)
        self.dim_H = A_mean.size(-1)
        # Only the leading dimension of A_mean is checked against X here;
        # A_scale is checked for rank below but not for matching A_mean.
        assert (
            A_mean.size(0) == self.dim_X
        ), "The dimensions of X and A_mean and A_scale must match accordingly; see documentation"
        self.A_mean = A_mean
        self.A_scale = A_scale
        self.non_linearity = non_linearity
        assert callable(non_linearity), "non_linearity must be callable"
        if A_scale.dim() != 2:
            raise NotImplementedError("A_scale must be 2-dimensional")
        self.KL_factor = KL_factor
        self.A_prior_scale = A_prior_scale
        self.weight_space_sampling = weight_space_sampling
        self.include_hidden_bias = include_hidden_bias

    def log_prob(self, value):
        """
        Return the negated, scaled KL term for the weight matrix.

        `value` is ignored: the result is always `-KL_factor * KL`, which is
        how the weight KL divergence is folded into the objective.
        """
        return -self.KL_factor * self.KL

    @lazy_property
    def KL(self):
        """
        KL divergence from Normal(A_mean, A_scale) to the zero-mean prior
        Normal(0, A_prior_scale), summed over all weight entries.

        Computed once per instance and cached by `lazy_property`.
        """
        # Closed form, summed over entries:
        #   0.5 * sum[(mu / s_p)^2 + (s / s_p)^2 - 1 - 2 * log(s / s_p)]
        KL_A = torch.pow(self.A_mean / self.A_prior_scale, 2.0).sum()
        # The "-1 per entry" term, using D * H as the number of entries.
        KL_A -= self.dim_X * self.dim_H
        KL_A += torch.pow(self.A_scale / self.A_prior_scale, 2.0).sum()
        KL_A -= 2.0 * torch.log(self.A_scale / self.A_prior_scale).sum()
        return 0.5 * KL_A

    def rsample(self, sample_shape=torch.Size()):
        """
        Draw reparameterized samples of the layer's activations.

        :param torch.Size sample_shape: leading shape prepended to the noise
            that is drawn.
        :return: the non-linearity applied to the sampled pre-activations,
            passed through `adjoin_ones_vector` when `include_hidden_bias`
            is set.
        """
        # note: weight space sampling is only meant for testing
        if self.weight_space_sampling:
            A = (
                self.A_mean
                + torch.randn(sample_shape + self.A_scale.shape).type_as(self.A_mean)
                * self.A_scale
            )
            activation = torch.matmul(self.X, A)
        else:
            # Local reparameterization: sample the pre-activations directly.
            # Their mean is X @ A_mean and their variance is
            # sum_d X[., d]^2 * A_scale[d, .]^2, i.e. X^2 @ A_scale^2.
            # Using matmul avoids materializing a B x D x H intermediate.
            _mean = torch.matmul(self.X, self.A_mean)
            _var = torch.matmul(
                torch.pow(self.X, 2.0), torch.pow(self.A_scale, 2.0)
            )
            _std = _var.sqrt()
            activation = (
                _mean + torch.randn(sample_shape + _std.shape).type_as(_std) * _std
            )

        # apply non-linearity
        activation = self.non_linearity(activation)

        # add 1 element to activations
        if self.include_hidden_bias:
            activation = adjoin_ones_vector(activation)
        return activation