# Source code for pyro.infer.energy_distance

# Copyright (c) 2017-2019 Uber Technologies, Inc.
# SPDX-License-Identifier: Apache-2.0

import operator
from collections import OrderedDict
from functools import reduce

import torch

import pyro
import pyro.poutine as poutine
from pyro.distributions.util import scale_and_mask
from pyro.infer.elbo import ELBO
from pyro.infer.util import is_validation_enabled, validation_enabled
from pyro.poutine.util import prune_subsample_sites
from pyro.util import check_model_guide_match, check_site_shape, warn_if_nan

def _squared_error(x, y, scale, mask):
    """
    Compute per-particle squared error between ``x`` and ``y``, applying
    ``scale`` and ``mask`` via :func:`scale_and_mask`.

    The einsum subscripts require ``x - y`` to be 3-dimensional, indexed as
    ``(n, b, e)``.

    :param torch.Tensor x: First operand.
    :param torch.Tensor y: Second operand; ``x - y`` must be 3-dimensional.
    :param scale: Scale passed through to ``scale_and_mask``. If it has a
        non-empty ``shape`` it is applied per ``b`` entry before reduction.
    :param mask: Mask passed through to ``scale_and_mask``. If it has a
        non-empty ``shape`` it is applied per ``b`` entry before reduction.
    :return: A tensor indexed by ``n`` only.
    :rtype: torch.Tensor
    """
    diff = x - y
    if getattr(scale, "shape", ()) or getattr(mask, "shape", ()):
        # Non-scalar scale/mask: keep the ``b`` dimension so that scale and
        # mask can be applied elementwise, then reduce over it.
        error = torch.einsum("nbe,nbe->nb", diff, diff)
        return scale_and_mask(error, scale, mask).sum(-1)
    else:
        # Scalar (or absent) scale/mask: reduce over ``b`` and ``e`` at once.
        error = torch.einsum("nbe,nbe->n", diff, diff)
        return scale_and_mask(error, scale, mask)

class EnergyDistance:
    r"""
    Posterior predictive energy distance [1,2] with optional Bayesian
    regularization by the prior.

    Let `p(x,z)=p(z) p(x|z)` be the model, `q(z|x)` be the guide.  Then given
    data `x` and drawing an iid pair of samples :math:`(Z,X)` and
    :math:`(Z',X')` (where `Z` is latent and `X` is the posterior predictive),

    .. math ::

        & Z \sim q(z|x); \quad X \sim p(x|Z) \\
        & Z' \sim q(z|x); \quad X' \sim p(x|Z') \\
        & loss = \mathbb E_X \|X-x\|^\beta
               - \frac 1 2 \mathbb E_{X,X'}\|X-X'\|^\beta
               - \lambda \mathbb E_Z \log p(Z)

    This is a likelihood-free inference algorithm, and can be used for
    likelihoods without tractable density functions. The :math:`\beta` energy
    distance is a robust loss function, and is well defined for any
    distribution with finite fractional moment :math:`\mathbb E[\|X\|^\beta]`.

    This requires static model structure, a fully reparametrized guide, and
    reparametrized likelihood distributions in the model. Model latent
    distributions may be non-reparametrized.

    **References**

    [1] Gabor J. Szekely, Maria L. Rizzo (2003)
        Energy Statistics: A Class of Statistics Based on Distances.
    [2] Tilmann Gneiting, Adrian E. Raftery (2007)
        Strictly Proper Scoring Rules, Prediction, and Estimation.

    :param float beta: Exponent :math:`\beta` from [1,2]. The loss function is
        strictly proper for distributions with finite :math:`\beta`-absolute
        moment :math:`E[\|X\|^\beta]`. Thus for heavy tailed distributions
        ``beta`` should be small, e.g. for ``Cauchy`` distributions,
        :math:`\beta<1` is strictly proper. Defaults to 1. Must be in the open
        interval (0,2).
    :param float prior_scale: Nonnegative scale for prior regularization.
        Model parameters are trained only if this is positive. If zero
        (default), then model log densities will not be computed (guide log
        densities are never computed).
    :param int num_particles: The number of particles/samples used to form the
        gradient estimators. Must be at least 2.
    :param int max_plate_nesting: Optional bound on max number of nested
        :func:`pyro.plate` contexts. If omitted, this will guess a valid value
        by running the (model,guide) pair once.
    """

    def __init__(
        self,
        beta=1.0,
        prior_scale=0.0,
        num_particles=2,
        max_plate_nesting=float("inf"),
    ):
        if not (isinstance(beta, (float, int)) and 0 < beta and beta < 2):
            raise ValueError("Expected beta in (0,2), actual {}".format(beta))
        if not (isinstance(prior_scale, (float, int)) and prior_scale >= 0):
            raise ValueError("Expected prior_scale >= 0, actual {}".format(prior_scale))
        if not (isinstance(num_particles, int) and num_particles >= 2):
            raise ValueError(
                "Expected num_particles >= 2, actual {}".format(num_particles)
            )
        self.beta = beta
        self.prior_scale = prior_scale
        self.num_particles = num_particles
        # Particles are always drawn through a vectorized plate in
        # ``_get_traces``.
        self.vectorize_particles = True
        self.max_plate_nesting = max_plate_nesting

    def _pow(self, x):
        """
        Raise a squared norm ``x`` to the power ``beta / 2``, i.e. map
        ``||.||^2`` to ``||.||^beta``.
        """
        if self.beta == 1:
            return x.sqrt()  # cheaper than .pow()
        return x.pow(self.beta / 2)

    def _get_traces(self, model, guide, args, kwargs):
        """
        Run the guide and then the unconditioned model (replayed against the
        guide) inside a vectorized particle plate, and return the resulting
        ``(guide_trace, model_trace)`` pair with subsample sites pruned.

        :raises ValueError: If validation is enabled and a guide sample site
            or an observed model site lacks ``has_rsample``.
        """
        if self.max_plate_nesting == float("inf"):
            with validation_enabled(False):  # Avoid calling .log_prob() when undefined.
                # TODO factor this out as a stand-alone helper.
                ELBO._guess_max_plate_nesting(self, model, guide, args, kwargs)
        vectorize = pyro.plate(
            "num_particles_vectorized", self.num_particles, dim=-self.max_plate_nesting
        )

        # Trace the guide as in ELBO.
        with poutine.trace() as tr, vectorize:
            guide(*args, **kwargs)
        guide_trace = tr.trace

        # Trace the model, drawing posterior predictive samples.
        with poutine.trace() as tr, poutine.uncondition():
            with poutine.replay(trace=guide_trace), vectorize:
                model(*args, **kwargs)
        model_trace = tr.trace
        # Re-flag sites marked "was_observed" so that downstream code can
        # still tell likelihood sites apart from latent sites.
        for site in model_trace.nodes.values():
            if site["type"] == "sample" and site["infer"].get("was_observed", False):
                site["is_observed"] = True
        if is_validation_enabled():
            check_model_guide_match(model_trace, guide_trace, self.max_plate_nesting)

        guide_trace = prune_subsample_sites(guide_trace)
        model_trace = prune_subsample_sites(model_trace)
        if is_validation_enabled():
            for site in guide_trace.nodes.values():
                if site["type"] == "sample":
                    warn_if_nan(site["value"], site["name"])
                    if not getattr(site["fn"], "has_rsample", False):
                        raise ValueError(
                            "EnergyDistance requires fully reparametrized guides"
                        )
            # Iterate with ``site`` (not a differently named loop variable) so
            # that each model site is actually inspected rather than the last
            # guide site being re-checked on every iteration.
            for site in model_trace.nodes.values():
                if site["type"] == "sample":
                    if site["is_observed"]:
                        warn_if_nan(site["value"], site["name"])
                        if not getattr(site["fn"], "has_rsample", False):
                            raise ValueError(
                                "EnergyDistance requires reparametrized likelihoods"
                            )

        if self.prior_scale > 0:
            # Only latent (non-observed) sites contribute to the prior term.
            model_trace.compute_log_prob(
                site_filter=lambda name, site: not site["is_observed"]
            )
            if is_validation_enabled():
                for site in model_trace.nodes.values():
                    if site["type"] == "sample":
                        if not site["is_observed"]:
                            check_site_shape(site, self.max_plate_nesting)

        return guide_trace, model_trace

    def __call__(self, model, guide, *args, **kwargs):
        """
        Computes the surrogate loss that can be differentiated with autograd
        to produce gradient estimates for the model and guide parameters.

        :raises ValueError: If the model trace contains no observed sites.
        """
        guide_trace, model_trace = self._get_traces(model, guide, args, kwargs)

        # Extract observations and posterior predictive samples.
        data = OrderedDict()
        samples = OrderedDict()
        for name, site in model_trace.nodes.items():
            if site["type"] == "sample" and site["is_observed"]:
                data[name] = site["infer"]["obs"]
                samples[name] = site["value"]
        assert list(data.keys()) == list(samples.keys())
        if not data:
            raise ValueError("Found no observations")

        # Compute energy distance from mean average error and generalized entropy.
        squared_error = []  # E[ (X - x)^2 ]
        squared_entropy = []  # E[ (X - X')^2 ]
        prototype = next(iter(data.values()))
        # Index pairs (i, j) with i > j: every unordered pair of distinct
        # particles exactly once.
        pairs = (
            prototype.new_ones(self.num_particles, self.num_particles)
            .tril(-1)
            .nonzero(as_tuple=False)
        )
        for name, obs in data.items():
            sample = samples[name]
            scale = model_trace.nodes[name]["scale"]
            mask = model_trace.nodes[name]["mask"]

            # Flatten to subshapes of (num_particles, batch_size, event_size).
            event_dim = model_trace.nodes[name]["fn"].event_dim
            batch_shape = obs.shape[: obs.dim() - event_dim]
            event_shape = obs.shape[obs.dim() - event_dim :]
            if getattr(scale, "shape", ()):
                scale = scale.expand(batch_shape).reshape(-1)
            if getattr(mask, "shape", ()):
                mask = mask.expand(batch_shape).reshape(-1)
            obs = obs.reshape(batch_shape.numel(), event_shape.numel())
            sample = sample.reshape(
                self.num_particles, batch_shape.numel(), event_shape.numel()
            )

            squared_error.append(_squared_error(sample, obs, scale, mask))
            squared_entropy.append(
                _squared_error(*sample[pairs].unbind(1), scale, mask)
            )

        # Sum squared distances across observation sites before taking the
        # beta/2 power, so the norm is over the joint observation.
        squared_error = reduce(operator.add, squared_error)
        squared_entropy = reduce(operator.add, squared_entropy)
        error = self._pow(squared_error).mean()  # E[ ||X-x||^beta ]
        entropy = self._pow(squared_entropy).mean()  # E[ ||X-X'||^beta ]
        energy = error - 0.5 * entropy

        # Compute prior.
        log_prior = 0
        if self.prior_scale > 0:
            for site in model_trace.nodes.values():
                if site["type"] == "sample" and not site["is_observed"]:
                    log_prior = log_prior + site["log_prob_sum"]

        # Compute final loss.
        loss = energy - self.prior_scale * log_prior
        warn_if_nan(loss, "loss")
        return loss

    def loss(self, *args, **kwargs):
        """
        Not implemented. Added for compatibility with unit tests only.

        :raises NotImplementedError: Always.
        """
        raise NotImplementedError("EnergyDistance implements only surrogate loss")