# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Monte Carlo integration and helpers.

@@expectation
@@expectation_importance_sampler
@@expectation_importance_sampler_logspace
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn
from tensorflow.python.util import deprecation

__all__ = [
    'expectation',
    'expectation_importance_sampler',
    'expectation_importance_sampler_logspace',
]


def expectation_importance_sampler(f,
                                   log_p,
                                   sampling_dist_q,
                                   z=None,
                                   n=None,
                                   seed=None,
                                   name='expectation_importance_sampler'):
  r"""Monte Carlo estimate of \\(E_p[f(Z)] = E_q[f(Z) p(Z) / q(Z)]\\).

  With \\(p(z) := e^{log_p(z)}\\), this `Op` returns

  \\(n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ],  z_i ~ q,\\)
  \\(\approx E_q[ f(Z) p(Z) / q(Z) ]\\)
  \\(=       E_p[f(Z)]\\)

  This integral is done in log-space with max-subtraction to better handle the
  often extreme values that `f(z) p(z) / q(z)` can take on.

  If `f >= 0`, it is up to 2x more efficient to exponentiate the result of
  `expectation_importance_sampler_logspace` applied to `Log[f]`.

  The user supplies either a `Tensor` of samples `z`, or a number of samples
  to draw `n`.
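
  Example Use (a minimal sketch; as in this module's other docstring examples,
  it assumes TensorFlow Probability is installed and uses `tfp.distributions`
  for the target and the proposal):

  ```python
  import tensorflow_probability as tfp
  tfd = tfp.distributions

  # Estimate E_p[Z**2] for p = Normal(0, 1), sampling from the wider proposal
  # q = Normal(0, 2).  The exact answer is the variance of p, i.e., 1.
  p = tfd.Normal(loc=0., scale=1.)
  q = tfd.Normal(loc=0., scale=2.)
  estimate = expectation_importance_sampler(
      f=lambda z: z**2,
      log_p=p.log_prob,
      sampling_dist_q=q,
      n=int(1e5),
      seed=42)
  # ==> approximately 1.
  ```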

  Args:
    f: Callable mapping samples from `sampling_dist_q` to `Tensors` with shape
      broadcastable to `q.batch_shape`.
      For example, `f` works "just like" `q.log_prob`.
    log_p: Callable mapping samples from `sampling_dist_q` to `Tensors` with
      shape broadcastable to `q.batch_shape`.
      For example, `log_p` works "just like" `sampling_dist_q.log_prob`.
    sampling_dist_q: The sampling distribution.
      `tfp.distributions.Distribution`.
      `float64` `dtype` recommended.
      `log_p` and `q` should be supported on the same set.
    z: `Tensor` of samples from `q`, produced by `q.sample` for some `n`.
    n: Integer `Tensor`. Number of samples to generate if `z` is not provided.
    seed: Python integer to seed the random number generator.
    name: A name to give this `Op`.

  Returns:
    The importance sampling estimate.  `Tensor` with `shape` equal
    to batch shape of `q`, and `dtype` = `q.dtype`.
  """
  q = sampling_dist_q
  with ops.name_scope(name, values=[z, n]):
    z = _get_samples(q, z, n, seed)

    log_p_z = log_p(z)
    q_log_prob_z = q.log_prob(z)

    def _importance_sampler_positive_f(log_f_z):
      # Same as expectation_importance_sampler_logspace, but using Tensors
      # rather than samples and functions.  Allows us to sample once.
      log_values = log_f_z + log_p_z - q_log_prob_z
      return _logspace_mean(log_values)

    # With \\(f_{plus}(z) = max(0, f(z)), f_{minus}(z) = max(0, -f(z))\\),
    # \\(E_p[f(Z)] = E_p[f_{plus}(Z)] - E_p[f_{minus}(Z)]\\)
    # \\(          = E_p[f_{plus}(Z) + 1] - E_p[f_{minus}(Z) + 1]\\)
    # Adding 1 to each term prevents zeros in logspace without incurring bias:
    # both added constants are estimated with the same samples and weights, so
    # they cancel exactly in the difference.
    # The logarithm is approximately linear around 1 + epsilon, so this is
    # also well behaved for small values of `f(z)`.
    f_z = f(z)
    log_f_plus_z = math_ops.log(nn.relu(f_z) + 1.)
    log_f_minus_z = math_ops.log(nn.relu(-1. * f_z) + 1.)

    log_f_plus_integral = _importance_sampler_positive_f(log_f_plus_z)
    log_f_minus_integral = _importance_sampler_positive_f(log_f_minus_z)

  return math_ops.exp(log_f_plus_integral) - math_ops.exp(log_f_minus_integral)


def expectation_importance_sampler_logspace(
    log_f,
    log_p,
    sampling_dist_q,
    z=None,
    n=None,
    seed=None,
    name='expectation_importance_sampler_logspace'):
  r"""Importance sampling with a positive function, in log-space.

  With \\(p(z) := e^{log_p(z)}\\) and \\(f(z) := e^{log_f(z)}\\),
  this `Op` returns

  \\(Log[ n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ] ],  z_i ~ q,\\)
  \\(\approx Log[ E_q[ f(Z) p(Z) / q(Z) ] ]\\)
  \\(=       Log[E_p[f(Z)]]\\)

  This integral is done in log-space with max-subtraction to better handle the
  often extreme values that `f(z) p(z) / q(z)` can take on.

  In contrast to `expectation_importance_sampler`, this `Op` returns values in
  log-space.

  The user supplies either a `Tensor` of samples `z`, or a number of samples
  to draw `n`.
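
  Example Use (a minimal sketch; assumes TensorFlow Probability is installed,
  mirroring the example for `expectation_importance_sampler` above):

  ```python
  import tensorflow_probability as tfp
  tfd = tfp.distributions

  # Estimate Log[E_p[exp(Z)]] for p = Normal(0, 1), sampling from the wider
  # proposal q = Normal(0, 2).  Here f(z) = exp(z) > 0, so log_f(z) = z.  The
  # exact answer is 1/2, the log moment generating function of a standard
  # normal evaluated at t = 1.
  p = tfd.Normal(loc=0., scale=1.)
  q = tfd.Normal(loc=0., scale=2.)
  log_estimate = expectation_importance_sampler_logspace(
      log_f=lambda z: z,
      log_p=p.log_prob,
      sampling_dist_q=q,
      n=int(1e5),
      seed=42)
  # ==> approximately 0.5
  ```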

  Args:
    log_f: Callable mapping samples from `sampling_dist_q` to `Tensors` with
      shape broadcastable to `q.batch_shape`.
      For example, `log_f` works "just like" `sampling_dist_q.log_prob`.
    log_p: Callable mapping samples from `sampling_dist_q` to `Tensors` with
      shape broadcastable to `q.batch_shape`.
      For example, `log_p` works "just like" `q.log_prob`.
    sampling_dist_q: The sampling distribution.
      `tfp.distributions.Distribution`.
      `float64` `dtype` recommended.
      `log_p` and `q` should be supported on the same set.
    z: `Tensor` of samples from `q`, produced by `q.sample` for some `n`.
    n: Integer `Tensor`. Number of samples to generate if `z` is not provided.
    seed: Python integer to seed the random number generator.
    name: A name to give this `Op`.

  Returns:
    Logarithm of the importance sampling estimate.  `Tensor` with `shape` equal
    to batch shape of `q`, and `dtype` = `q.dtype`.
  """
  q = sampling_dist_q
  with ops.name_scope(name, values=[z, n]):
    z = _get_samples(q, z, n, seed)
    log_values = log_f(z) + log_p(z) - q.log_prob(z)
    return _logspace_mean(log_values)


def _logspace_mean(log_values):
  """Evaluate `Log[E[values]]` in a stable manner.

  Args:
    log_values: `Tensor` holding `Log[values]`.

  Returns:
    `Tensor` of same `dtype` as `log_values`, reduced across dim 0.
    `Log[Mean[values]]`.
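
  For example (a small worked case): with
  `log_values = [[0., 100.], [0., 100.]]` in `float32`, the naive
  `log(reduce_mean(exp(log_values), axis=0))` overflows, since `exp(100.)`
  exceeds the largest finite `float32`.  Subtracting the per-column max
  `[0., 100.]` first gives centered values of `1.`, whose mean along dim 0 is
  `[1., 1.]`, so the result is `log([1., 1.]) + [0., 100.] = [0., 100.]`,
  computed without overflow.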
  """
  # center = Max[Log[values]], with stop-gradient.
  # The center hopefully keeps the exponentiated terms small.  It cancels out
  # of the final result, so putting stop-gradient on it does not change that
  # result.  We apply stop-gradient to eliminate unnecessary computation.
  center = array_ops.stop_gradient(_sample_max(log_values))

  # centered_values = exp{Log[values] - Max[Log[values]]}
  centered_values = math_ops.exp(log_values - center)

  # log_mean_of_values = Log[ E[centered_values] ] + center
  #                    = Log[ E[exp{log_values - center}] ] + center
  #                    = Log[E[values]] - center + center
  #                    = Log[E[values]]
  log_mean_of_values = math_ops.log(_sample_mean(centered_values)) + center

  return log_mean_of_values


@deprecation.deprecated(
    '2018-10-01',
    'The tf.contrib.bayesflow library has moved to '
    'TensorFlow Probability (https://github.com/tensorflow/probability). '
    'Use `tfp.monte_carlo.expectation` instead.',
    warn_once=True)
def expectation(f, samples, log_prob=None, use_reparametrization=True,
                axis=0, keep_dims=False, name=None):
  r"""Computes the Monte-Carlo approximation of \\(E_p[f(X)]\\).

  This function computes the Monte-Carlo approximation of an expectation, i.e.,

  \\(E_p[f(X)] \approx m^{-1} sum_{j=1}^m f(x_j),  x_j\ \sim_{iid}\ p(X)\\)

  where:

  - `x_j = samples[j, ...]`,
  - `log(p(samples)) = log_prob(samples)` and
  - `m = prod(shape(samples)[axis])`.

  Tricks: Reparameterization and Score-Gradient

  When p is "reparameterized", i.e., a diffeomorphic transformation of a
  parameterless distribution (e.g.,
  `Normal(Y; m, s) <=> Y = sX + m, X ~ Normal(0,1)`), we can swap gradient and
  expectation, i.e.,
  grad[ Avg{ \\(s_i : i=1...n\\) } ] = Avg{ grad[\\(s_i\\)] : i=1...n },
  where \\(s_i = f(x_i)\\) and \\(x_i ~ p\\).

  However, if p is not reparameterized, TensorFlow's gradient will be incorrect
  since the chain-rule stops at samples of non-reparameterized distributions.
  (The non-differentiated result, `approx_expectation`, is the same regardless
  of `use_reparametrization`.) In this circumstance using the Score-Gradient
  trick results in an unbiased gradient, i.e.,

  ```none
  grad[ E_p[f(X)] ]
  = grad[ int dx p(x) f(x) ]
  = int dx grad[ p(x) f(x) ]
  = int dx [ p'(x) f(x) + p(x) f'(x) ]
  = int dx p(x) [ p'(x) / p(x) f(x) + f'(x) ]
  = int dx p(x) grad[ f(x) p(x) / stop_grad[p(x)] ]
  = E_p[ grad[ f(x) p(x) / stop_grad[p(x)] ] ]
  ```

  When p is reparameterized, it is usually preferable to set
  `use_reparametrization = True`.

  Warning: users are responsible for verifying `p` is a "reparameterized"
  distribution.

  Example Use:

  ```python
  import tensorflow_probability as tfp
  tfd = tfp.distributions

  # Monte-Carlo approximation of a reparameterized distribution, e.g., Normal.

  num_draws = int(1e5)
  p = tfd.Normal(loc=0., scale=1.)
  q = tfd.Normal(loc=1., scale=2.)
  exact_kl_normal_normal = tfd.kl_divergence(p, q)
  # ==> 0.44314718
  approx_kl_normal_normal = tfp.monte_carlo.expectation(
      f=lambda x: p.log_prob(x) - q.log_prob(x),
      samples=p.sample(num_draws, seed=42),
      log_prob=p.log_prob,
      use_reparametrization=(p.reparameterization_type
                             == tfd.FULLY_REPARAMETERIZED))
  # ==> 0.44632751
  # Relative Error: <1%

  # Monte-Carlo approximation of a non-reparameterized distribution, e.g.,
  # Gamma.

  num_draws = int(1e5)
  p = tfd.Gamma(concentration=1., rate=1.)
  q = tfd.Gamma(concentration=2., rate=3.)
  exact_kl_gamma_gamma = tfd.kl_divergence(p, q)
  # ==> 0.37999129
  approx_kl_gamma_gamma = tfp.monte_carlo.expectation(
      f=lambda x: p.log_prob(x) - q.log_prob(x),
      samples=p.sample(num_draws, seed=42),
      log_prob=p.log_prob,
      use_reparametrization=(p.reparameterization_type
                             == tfd.FULLY_REPARAMETERIZED))
  # ==> 0.37696719
  # Relative Error: <1%

  # For comparing the gradients, see `monte_carlo_test.py`.
  ```

  Note: The above example is for illustration only. To compute approximate
  KL-divergence, the following is preferred:

  ```python
  approx_kl_p_q = tfp.vi.monte_carlo_csiszar_f_divergence(
      f=tfp.vi.kl_reverse,
      p_log_prob=q.log_prob,
      q=p,
      num_draws=num_draws)
  ```

  Args:
    f: Python callable which can return `f(samples)`.
    samples: `Tensor` of samples used to form the Monte-Carlo approximation of
      \\(E_p[f(X)]\\).  A batch of samples should be indexed by `axis`
      dimensions.
    log_prob: Python callable which can return `log_prob(samples)`. Must
      correspond to the natural-logarithm of the pdf/pmf of each sample. Only
      required/used if `use_reparametrization=False`.
      Default value: `None`.
    use_reparametrization: Python `bool` indicating that the approximation
      should use the fact that the gradient of samples is unbiased. Whether
      `True` or `False`, this arg only affects the gradient of the resulting
      `approx_expectation`.
      Default value: `True`.
    axis: The dimensions to average. If `None`, averages all dimensions.
      Default value: `0` (the left-most dimension).
    keep_dims: If True, retains averaged dimensions using size `1`.
      Default value: `False`.
    name: A `name_scope` for operations created by this function.
      Default value: `None` (which implies "expectation").

  Returns:
    approx_expectation: `Tensor` corresponding to the Monte-Carlo approximation
      of \\(E_p[f(X)]\\).

  Raises:
    ValueError: if `f` is not a Python `callable`.
    ValueError: if `use_reparametrization=False` and `log_prob` is not a Python
      `callable`.
  """

  with ops.name_scope(name, 'expectation', [samples]):
    if not callable(f):
      raise ValueError('`f` must be a callable function.')
    if use_reparametrization:
      return math_ops.reduce_mean(f(samples), axis=axis, keepdims=keep_dims)
    else:
      if not callable(log_prob):
        raise ValueError('`log_prob` must be a callable function.')
      stop = array_ops.stop_gradient  # For readability.
      x = stop(samples)
      logpx = log_prob(x)
      fx = f(x)  # Call `f` once in case it has side-effects.
      # We now rewrite f(x) so that:
      #   `grad[f(x)] := grad[f(x)] + f(x) * grad[logpx]`.
      # To achieve this, we use a trick that
      #   `h(x) - stop(h(x)) == zeros_like(h(x))`
      # but its gradient is grad[h(x)].
      # Note that IEEE754 specifies that `x - x == 0.` and `x + 0. == x`, hence
      # this trick loses no precision.  For more discussion regarding the
      # relevant portions of the IEEE754 standard, see the StackOverflow
      # question,
      # "Is there a floating point value of x, for which x-x == 0 is false?"
      # http://stackoverflow.com/q/2686644
      fx += stop(fx) * (logpx - stop(logpx))  # Add zeros_like(logpx).
      return math_ops.reduce_mean(fx, axis=axis, keepdims=keep_dims)


def _sample_mean(values):
  """Mean over sample indices.  In this module this is always [0]."""
  return math_ops.reduce_mean(values, axis=[0])


def _sample_max(values):
  """Max over sample indices.  In this module this is always [0]."""
  return math_ops.reduce_max(values, axis=[0])


def _get_samples(dist, z, n, seed):
  """Check args and return samples."""
  with ops.name_scope('get_samples', values=[z, n]):
    if (n is None) == (z is None):
      raise ValueError(
          'Must specify exactly one of arguments "n" and "z".  Found: '
          'n = %s, z = %s' % (n, z))
    if n is not None:
      return dist.sample(n, seed=seed)
    else:
      return ops.convert_to_tensor(z, name='z')
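

# A minimal usage sketch, not part of the library API.  It only runs when this
# file is executed directly (never on import) and assumes a TensorFlow 1.x
# runtime where `tf.distributions.Normal`, `tf.distributions.kl_divergence`,
# and `tf.Session` are available; with TensorFlow Probability installed,
# prefer the docstring examples above and `tfp.monte_carlo.expectation`.
if __name__ == '__main__':
  import tensorflow as tf

  p = tf.distributions.Normal(loc=0., scale=1.)
  q = tf.distributions.Normal(loc=1., scale=2.)
  # Monte-Carlo estimate of KL(p || q) = E_p[log p(X) - log q(X)].  Normal is
  # fully reparameterized, so `use_reparametrization=True` is appropriate.
  approx_kl = expectation(
      f=lambda x: p.log_prob(x) - q.log_prob(x),
      samples=p.sample(int(1e5), seed=42),
      log_prob=p.log_prob,
      use_reparametrization=True)
  exact_kl = tf.distributions.kl_divergence(p, q)
  with tf.Session() as sess:
    approx_kl_, exact_kl_ = sess.run([approx_kl, exact_kl])
    print('approximate KL: %f  exact KL: %f' % (approx_kl_, exact_kl_))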