# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

16"""Gradient checker for any ops, graphs.
17
18The gradient checker verifies numerically that an op/graph properly
19computes the gradients
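
Example usage (a minimal sketch, not part of the original docstring; assumes
a TF1-style graph and uses `tf.test.compute_gradient_error`, the public alias
of `compute_gradient_error` below):

  import tensorflow as tf

  with tf.Session():
    x = tf.constant([[1.0, 2.0]], dtype=tf.float64)  # input under test
    y = tf.square(x)                                 # op whose gradients we check
    # Maximum elementwise difference between the theoretical and the
    # numerically estimated Jacobians; a small value means the gradients
    # are correct.
    err = tf.test.compute_gradient_error(x, [1, 2], y, [1, 2])
    assert err < 1e-8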
20"""
21from __future__ import absolute_import
22from __future__ import division
23from __future__ import print_function
24
25import numpy as np
26
27from tensorflow.python.framework import constant_op
28from tensorflow.python.framework import dtypes
29from tensorflow.python.framework import ops
30from tensorflow.python.ops import array_ops
31from tensorflow.python.ops import gradients
32from tensorflow.python.ops import math_ops
33from tensorflow.python.platform import tf_logging as logging
34from tensorflow.python.util.tf_export import tf_export
35
36
def _product(t):
  """Returns `t` if it is an int, else the product of the elements of `t`."""
  if isinstance(t, int):
    return t
  else:
    y = 1
    for x in t:
      y *= x
    return y


def _extra_feeds(extra_feed_dict, new_feeds):
  """Returns `new_feeds` merged over a copy of `extra_feed_dict`."""
  if not extra_feed_dict:
    return new_feeds
  r = {}
  r.update(extra_feed_dict)
  r.update(new_feeds)
  return r


def _compute_theoretical_jacobian(x, x_shape, x_data, dy, dy_shape, dx,
                                  extra_feed_dict):
  """Computes the theoretical Jacobian for dy/dx.

  Computes the theoretical Jacobian using the ops generated by
  compute_gradient().

  Args:
    x: the tensor "x".
    x_shape: the dimensions of x as a tuple or an array of ints.
    x_data: a numpy array as the input data for x.
    dy: the tensor "dy".
    dy_shape: the dimensions of dy as a tuple or an array of ints.
    dx: Tensor or IndexedSlices representing dx.
    extra_feed_dict: dict that allows fixing specified tensor values
      during the Jacobian calculation.

  Returns:
    A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size" rows
    and "dy_size" columns where "x_size" is the number of elements in x and
    "dy_size" is the number of elements in dy.

  Raises:
    ValueError: If `dy` is empty but the gradient is nonzero.
  """
  # Complex vectors are treated as vectors of twice as many reals.
  if x.dtype.is_complex:
    x_shape = tuple(x_shape) + (2,)
  dy_factor = 2 if dy.dtype.is_complex else 1

  # To compute the Jacobian, we treat x and y as one-dimensional vectors.
  x_size = _product(x_shape)
  x_val_size = _product(x_shape[1:])  # This is used for sparse gradients.
  dy_size = _product(dy_shape) * dy_factor

  # Allocate the 2-D Jacobian, with the x dimensions smashed into the first
  # dimension and the y dimensions smashed into the second.
  jacobian = np.zeros((x_size, dy_size),
                      dtype=x.dtype.real_dtype.as_numpy_dtype)

  # For each entry of dy, we set it to 1 and everything else to 0, then
  # compute the backprop -- this gives us one column of the Jacobian matrix.
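  # Concretely, feeding dy = e_col (the one-hot vector for index `col`) makes
  # the backprop return d(y[col])/dx, whose flattened values fill column `col`
  # of `jacobian` (rows are indexed by the elements of x).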
  dy_data = np.zeros(dy_shape, dtype=dy.dtype.as_numpy_dtype)
  dy_data_flat = dy_data.ravel().view(dy.dtype.real_dtype.as_numpy_dtype)
  sess = ops.get_default_session()
  for col in range(dy_size):
    dy_data_flat[col] = 1
    if isinstance(dx, ops.IndexedSlices):
      backprop_indices, backprop_values = sess.run(
          [dx.indices, dx.values],
          feed_dict=_extra_feeds(extra_feed_dict, {x: x_data, dy: dy_data}))
      for i, v in zip(backprop_indices, backprop_values):
        r_begin = i * x_val_size
        r_end = r_begin + x_val_size
        jacobian[r_begin:r_end, col] += v.flat
    else:
      assert isinstance(dx, ops.Tensor), "dx = " + str(dx)
      backprop = sess.run(
          dx, feed_dict=_extra_feeds(extra_feed_dict, {x: x_data, dy: dy_data}))
      jacobian[:, col] = backprop.ravel().view(jacobian.dtype)
    dy_data_flat[col] = 0

  # If the output is empty, run the gradients at least once and make sure
  # they produce zeros.
  if not dy_size:
    backprop = sess.run(
        dx, feed_dict=_extra_feeds(extra_feed_dict, {x: x_data, dy: dy_data}))
    if backprop.shape != x_data.shape:
      raise ValueError("Empty gradient has wrong shape: expected %s, got %s" %
                       (x_data.shape, backprop.shape))
    if np.any(backprop):
      raise ValueError("Empty tensor with nonzero gradients")

  logging.vlog(1, "Theoretical Jacobian =\n%s", jacobian)
  return jacobian


def _compute_numeric_jacobian(x, x_shape, x_data, y, y_shape, delta,
                              extra_feed_dict):
  """Computes the numeric Jacobian for dy/dx.

  Computes the numeric Jacobian by slightly perturbing the inputs and
  measuring the differences on the output.

  Args:
    x: the tensor "x".
    x_shape: the dimensions of x as a tuple or an array of ints.
    x_data: a numpy array as the input data for x.
    y: the tensor "y".
    y_shape: the dimensions of y as a tuple or an array of ints.
    delta: the amount of perturbation we give to the input.
    extra_feed_dict: dict that allows fixing specified tensor values
      during the Jacobian calculation.

  Returns:
    A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size" rows
    and "y_size" columns where "x_size" is the number of elements in x and
    "y_size" is the number of elements in y.
  """
  # bfloat16 doesn't have enough bits to represent high precision numbers such
  # as delta. Convert to float32 here. Since numeric_jacobian is expected to
  # be the ground truth to compare against, it shouldn't lose any information.
  if x.dtype == dtypes.bfloat16:
    # TODO(wangpeng): Now that the new x is an output of the old x, isn't
    # feeding to the new x a mistake?
    x = math_ops.cast(x, dtypes.float32)
  if y.dtype == dtypes.bfloat16:
    y = math_ops.cast(y, dtypes.float32)
  if x_data.dtype == dtypes.bfloat16.as_numpy_dtype:
    x_data = x_data.astype(np.float32)

  # To compute the Jacobian, we treat x and y as one-dimensional vectors.
  x_size = _product(x_shape) * (2 if x.dtype.is_complex else 1)
  y_size = _product(y_shape) * (2 if y.dtype.is_complex else 1)
  x_dtype = x.dtype.real_dtype.as_numpy_dtype
  y_dtype = y.dtype.real_dtype.as_numpy_dtype

  # Make sure we have the right types.
  x_data = np.asarray(x_data, dtype=x.dtype.as_numpy_dtype)
  scale = np.asarray(2 * delta, dtype=y_dtype)[()]

  jacobian = np.zeros((x_size, y_size), dtype=x_dtype)
  # For each entry of x, we slightly perturb it by adding and subtracting a
  # delta, then compute the difference between the outputs. This gives us one
  # row of the Jacobian matrix.
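  # This is the standard central-difference estimate: row `row` of the
  # Jacobian is approximately
  #   (f(x + delta*e_row) - f(x - delta*e_row)) / (2*delta),
  # which has O(delta**2) truncation error for smooth f.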
  for row in range(x_size):
    x_pos = x_data.copy()
    x_neg = x_data.copy()
    x_pos.ravel().view(x_dtype)[row] += delta
    y_pos = y.eval(feed_dict=_extra_feeds(extra_feed_dict, {x: x_pos}))
    x_neg.ravel().view(x_dtype)[row] -= delta
    y_neg = y.eval(feed_dict=_extra_feeds(extra_feed_dict, {x: x_neg}))
    diff = (y_pos - y_neg) / scale
    jacobian[row, :] = diff.ravel().view(y_dtype)

  logging.vlog(1, "Numeric Jacobian =\n%s", jacobian)
  return jacobian


def _compute_dx_and_dy(x, y, y_shape):
  """Returns a node to compute the gradient of y wrt x, and the dy to feed."""
  # We make up a dy so that we can compute the gradients. We don't really use
  # the value of dy -- we will always feed it. We need to add an identity node
  # so that we can always feed it properly. Otherwise, for the Add operation,
  # dx is the same as dy and we cannot fetch the tensor that we are feeding.
  with x.graph.as_default():
    dy_orig = constant_op.constant(1.0, shape=y_shape, dtype=y.dtype)
    dy = array_ops.identity(dy_orig)
  # We compute the gradients of y wrt x.
  grads = gradients.gradients(y, x, dy)
  assert len(grads) == 1
  return grads[0], dy_orig


def _compute_gradient(x,
                      x_shape,
                      dx,
                      y,
                      y_shape,
                      dy,
                      x_init_value=None,
                      delta=1e-3,
                      extra_feed_dict=None):
  """Computes the theoretical and numerical Jacobians."""
  t = dtypes.as_dtype(x.dtype)
  allowed_types = [dtypes.float16, dtypes.bfloat16, dtypes.float32,
                   dtypes.float64, dtypes.complex64, dtypes.complex128]
  assert t.base_dtype in allowed_types, "Don't support type %s for x" % t.name
  t2 = dtypes.as_dtype(y.dtype)
  assert t2.base_dtype in allowed_types, "Don't support type %s for y" % t2.name

  if x_init_value is not None:
    i_shape = list(x_init_value.shape)
    assert list(x_shape) == i_shape, "x_shape = %s, init_data shape = %s" % (
        x_shape, i_shape)
    x_data = x_init_value
  else:
    x_data = np.random.random_sample(x_shape).astype(t.as_numpy_dtype)
    if t.is_complex:
      x_data.imag = np.random.random_sample(x_shape)

  jacob_t = _compute_theoretical_jacobian(
      x, x_shape, x_data, dy, y_shape, dx, extra_feed_dict=extra_feed_dict)
  jacob_n = _compute_numeric_jacobian(
      x, x_shape, x_data, y, y_shape, delta, extra_feed_dict=extra_feed_dict)
  return jacob_t, jacob_n


def _compute_gradient_list(x,
                           x_shape,
                           y,
                           y_shape,
                           x_init_value=None,
                           delta=1e-3,
                           init_targets=None,
                           extra_feed_dict=None):
  """Computes gradients for a list of x values."""
  assert isinstance(x, list)
  dx, dy = zip(*[_compute_dx_and_dy(xi, y, y_shape) for xi in x])

  if init_targets is not None:
    assert isinstance(init_targets, (list, tuple))
    for init in init_targets:
      init.run()
  if x_init_value is None:
    x_init_value = [None] * len(x)
  ret = [_compute_gradient(xi, x_shapei, dxi, y, y_shape, dyi, x_init_valuei,
                           delta, extra_feed_dict=extra_feed_dict)
         for xi, x_shapei, dxi, dyi, x_init_valuei in zip(x, x_shape, dx, dy,
                                                          x_init_value)]
  return ret


@tf_export(v1=["test.compute_gradient"])
def compute_gradient(x,
                     x_shape,
                     y,
                     y_shape,
                     x_init_value=None,
                     delta=1e-3,
                     init_targets=None,
                     extra_feed_dict=None):
  """Computes and returns the theoretical and numerical Jacobian.

  If `x` or `y` is complex, the Jacobian will still be real but the
  corresponding Jacobian dimension(s) will be twice as large.  This is required
  even if both the input and the output are complex since TensorFlow graphs are
  not necessarily holomorphic, and may have gradients not expressible as
  complex numbers.  For example, if `x` is complex with shape `[m]` and `y` is
  complex with shape `[n]`, each Jacobian `J` will have shape `[m * 2, n * 2]`
  with

      J[::2, ::2] = d(Re y)/d(Re x)
      J[::2, 1::2] = d(Im y)/d(Re x)
      J[1::2, ::2] = d(Re y)/d(Im x)
      J[1::2, 1::2] = d(Im y)/d(Im x)

  Args:
    x: a tensor or list of tensors.
    x_shape: the dimensions of x as a tuple or an array of ints. If x is a
      list, then this is the list of shapes.
    y: a tensor.
    y_shape: the dimensions of y as a tuple or an array of ints.
    x_init_value: (optional) a numpy array of the same shape as "x"
      representing the initial value of x. If x is a list, this should be a
      list of numpy arrays. If this is None, the function will pick a random
      tensor as the initial value.
    delta: (optional) the amount of perturbation.
    init_targets: list of targets to run to initialize model params.
    extra_feed_dict: dict that allows fixing specified tensor values
      during the Jacobian calculation.

  Returns:
    Two 2-d numpy arrays representing the theoretical and numerical
    Jacobian for dy/dx. Each has "x_size" rows and "y_size" columns
    where "x_size" is the number of elements in x and "y_size" is the
    number of elements in y. If x is a list, returns a list of two numpy
    arrays.
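
  Example (a minimal sketch, not part of the original docstring; assumes a
  default session and numpy imported as np):

    with tf.Session():
      x = tf.constant([[1.0, 2.0], [3.0, 4.0]], dtype=tf.float64)
      y = tf.matmul(x, x)
      jacob_t, jacob_n = tf.test.compute_gradient(x, [2, 2], y, [2, 2])
      # Each Jacobian is 4x4: 4 elements in x by 4 elements in y.
      max_diff = np.max(np.abs(jacob_t - jacob_n))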
312  """
313  # TODO(mrry): remove argument `init_targets`
314  if extra_feed_dict is None:
315    extra_feed_dict = {}
316
317  if isinstance(x, list):
318    return _compute_gradient_list(x, x_shape, y, y_shape, x_init_value, delta,
319                                  init_targets, extra_feed_dict=extra_feed_dict)
320  else:
321    if init_targets is not None:
322      assert isinstance(init_targets, (list, tuple))
323      for init in init_targets:
324        init.run()
325    dx, dy = _compute_dx_and_dy(x, y, y_shape)
326    ret = _compute_gradient(x, x_shape, dx, y, y_shape, dy, x_init_value, delta,
327                            extra_feed_dict=extra_feed_dict)
328    return ret
329

def _compute_error(grad):
  """Returns the max elementwise difference over (jacob_t, jacob_n) pairs."""
  if isinstance(grad, tuple):
    grad = [grad]
  error = 0
  for j_t, j_n in grad:
    if j_t.size or j_n.size:  # Handle zero size tensors correctly.
      error = np.maximum(error, np.fabs(j_t - j_n).max())
  return error


@tf_export(v1=["test.compute_gradient_error"])
def compute_gradient_error(x,
                           x_shape,
                           y,
                           y_shape,
                           x_init_value=None,
                           delta=1e-3,
                           init_targets=None,
                           extra_feed_dict=None):
  """Computes the gradient error.

  Computes the maximum error for dy/dx between the computed Jacobian and the
  numerically estimated Jacobian.

  This function will modify the tensors passed in, since it adds more
  operations and hence changes the consumers of the operations of the input
  tensors.

  This function adds operations to the current graph. To compute the error
  using a particular device, such as a GPU, use the standard methods for
  setting a device (e.g. using with sess.graph.device() or setting a device
  function in the session constructor).

  Args:
    x: a tensor or list of tensors.
    x_shape: the dimensions of x as a tuple or an array of ints. If x is a
      list, then this is the list of shapes.
    y: a tensor.
    y_shape: the dimensions of y as a tuple or an array of ints.
    x_init_value: (optional) a numpy array of the same shape as "x"
      representing the initial value of x. If x is a list, this should be a
      list of numpy arrays. If this is None, the function will pick a random
      tensor as the initial value.
    delta: (optional) the amount of perturbation.
    init_targets: list of targets to run to initialize model params.
    extra_feed_dict: dict that allows fixing specified tensor values
      during the Jacobian calculation.

  Returns:
    The maximum error between the two Jacobians.
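
  Example (a minimal sketch, not part of the original docstring; assumes a
  default session):

    with tf.Session():
      x = tf.constant([-1.0, 0.5, 2.0], dtype=tf.float64)
      y = tf.tanh(x)
      err = tf.test.compute_gradient_error(x, [3], y, [3])
      # A correct gradient keeps the error near the O(delta**2) truncation
      # level of the numeric estimate.
      assert err < 1e-4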
380  """
381  grad = compute_gradient(x, x_shape, y, y_shape, x_init_value, delta,
382                          init_targets, extra_feed_dict=extra_feed_dict)
383  return _compute_error(grad)
384