# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Gradient checker for ops and graphs.

The gradient checker verifies numerically that an op/graph properly
computes its gradients.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gradients
from tensorflow.python.ops import math_ops
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util.tf_export import tf_export


def _product(t):
  if isinstance(t, int):
    return t
  else:
    y = 1
    for x in t:
      y *= x
    return y


def _extra_feeds(extra_feed_dict, new_feeds):
  if not extra_feed_dict:
    return new_feeds
  r = {}
  r.update(extra_feed_dict)
  r.update(new_feeds)
  return r


def _compute_theoretical_jacobian(x, x_shape, x_data, dy, dy_shape, dx,
                                  extra_feed_dict):
  """Computes the theoretical Jacobian for dy/dx.

  Computes the theoretical Jacobian using the ops generated by
  compute_gradient().

  Args:
    x: the tensor "x".
    x_shape: the dimensions of x as a tuple or an array of ints.
    x_data: a numpy array as the input data for x.
    dy: the tensor "dy".
    dy_shape: the dimensions of dy as a tuple or an array of ints.
    dx: Tensor or IndexedSlices representing dx.
    extra_feed_dict: dict that allows fixing specified tensor values
      during the Jacobian calculation.

  Returns:
    A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size" rows
    and "dy_size" columns where "x_size" is the number of elements in x and
    "dy_size" is the number of elements in dy.

  Raises:
    ValueError: If `dy` is empty but the gradient is nonzero.
  """
  # Complex vectors are treated as vectors of twice as many reals.
  if x.dtype.is_complex:
    x_shape = tuple(x_shape) + (2,)
  dy_factor = 2 if dy.dtype.is_complex else 1

  # To compute the Jacobian, we treat x and y as one-dimensional vectors.
  x_size = _product(x_shape)
  x_val_size = _product(x_shape[1:])  # This is used for sparse gradients.
  dy_size = _product(dy_shape) * dy_factor

  # Allocate 2-D Jacobian, with x dimensions smashed into the first
  # dimension and y dimensions smashed into the second.
  jacobian = np.zeros((x_size, dy_size),
                      dtype=x.dtype.real_dtype.as_numpy_dtype)

  # For each entry of dy, we set it to 1 and everything else to 0, then
  # compute the backprop -- this gives us one column of the Jacobian matrix.
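  # For example, if y = x * x elementwise and x_data = [3., 4.], feeding
  # dy = [1., 0.] makes the backprop return [6., 0.] (the first column),
  # and dy = [0., 1.] returns [0., 8.] (the second column). The values here
  # are purely illustrative; any differentiable op behaves analogously.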
  dy_data = np.zeros(dy_shape, dtype=dy.dtype.as_numpy_dtype)
  dy_data_flat = dy_data.ravel().view(dy.dtype.real_dtype.as_numpy_dtype)
  sess = ops.get_default_session()
  for col in range(dy_size):
    dy_data_flat[col] = 1
    if isinstance(dx, ops.IndexedSlices):
      backprop_indices, backprop_values = sess.run(
          [dx.indices, dx.values],
          feed_dict=_extra_feeds(extra_feed_dict, {x: x_data, dy: dy_data}))
      for i, v in zip(backprop_indices, backprop_values):
        r_begin = i * x_val_size
        r_end = r_begin + x_val_size
        jacobian[r_begin:r_end, col] += v.flat
    else:
      assert isinstance(dx, ops.Tensor), "dx = " + str(dx)
      backprop = sess.run(
          dx, feed_dict=_extra_feeds(extra_feed_dict, {x: x_data, dy: dy_data}))
      jacobian[:, col] = backprop.ravel().view(jacobian.dtype)
    dy_data_flat[col] = 0

  # If the output is empty, run the gradients at least once and make sure
  # they produce zeros.
  if not dy_size:
    backprop = sess.run(
        dx, feed_dict=_extra_feeds(extra_feed_dict, {x: x_data, dy: dy_data}))
    if backprop.shape != x_data.shape:
      raise ValueError("Empty gradient has wrong shape: expected %s, got %s" %
                       (x_data.shape, backprop.shape))
    if np.any(backprop):
      raise ValueError("Empty tensor with nonzero gradients")

  logging.vlog(1, "Theoretical Jacobian =\n%s", jacobian)
  return jacobian


def _compute_numeric_jacobian(x, x_shape, x_data, y, y_shape, delta,
                              extra_feed_dict):
  """Computes the numeric Jacobian for dy/dx.

  Computes the numeric Jacobian by slightly perturbing the inputs and
  measuring the differences on the output.

  Args:
    x: the tensor "x".
    x_shape: the dimensions of x as a tuple or an array of ints.
    x_data: a numpy array as the input data for x.
    y: the tensor "y".
    y_shape: the dimensions of y as a tuple or an array of ints.
    delta: the amount of perturbation we give to the input.
    extra_feed_dict: dict that allows fixing specified tensor values
      during the Jacobian calculation.

  Returns:
    A 2-d numpy array representing the Jacobian for dy/dx. It has "x_size" rows
    and "y_size" columns where "x_size" is the number of elements in x and
    "y_size" is the number of elements in y.
  """
  # bfloat16 doesn't have enough bits to represent high precision numbers such
  # as delta. Convert to float32 here. Since the numeric Jacobian is expected
  # to be the ground truth to compare against, it shouldn't lose any
  # information.
  if x.dtype == dtypes.bfloat16:
    x = math_ops.cast(x, dtypes.float32)  # TODO(wangpeng): Now that the new x
    # is an output of the old x, isn't feeding to the new x a mistake?
  if y.dtype == dtypes.bfloat16:
    y = math_ops.cast(y, dtypes.float32)
  if x_data.dtype == dtypes.bfloat16.as_numpy_dtype:
    x_data = x_data.astype(np.float32)

  # To compute the Jacobian, we treat x and y as one-dimensional vectors.
  x_size = _product(x_shape) * (2 if x.dtype.is_complex else 1)
  y_size = _product(y_shape) * (2 if y.dtype.is_complex else 1)
  x_dtype = x.dtype.real_dtype.as_numpy_dtype
  y_dtype = y.dtype.real_dtype.as_numpy_dtype

  # Make sure we have the right types.
  x_data = np.asarray(x_data, dtype=x.dtype.as_numpy_dtype)
  scale = np.asarray(2 * delta, dtype=y_dtype)[()]

  jacobian = np.zeros((x_size, y_size), dtype=x_dtype)
  # For each entry of x, we slightly perturb it by adding and subtracting a
  # delta, then compute the difference between the outputs. This gives us one
  # row of the Jacobian matrix.
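  # In symbols, the row for entry i is the central difference
  #   J[i, :] ~= (y(x + delta * e_i) - y(x - delta * e_i)) / (2 * delta),
  # where e_i is the i-th standard basis vector. The central form is
  # preferred because its truncation error is O(delta**2), versus O(delta)
  # for a one-sided difference.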
  for row in range(x_size):
    x_pos = x_data.copy()
    x_neg = x_data.copy()
    # Note: viewing a complex array with its real dtype exposes the
    # interleaved [real, imag, real, imag, ...] layout, so `row` walks the
    # real and imaginary parts alternately.
    x_pos.ravel().view(x_dtype)[row] += delta
    y_pos = y.eval(feed_dict=_extra_feeds(extra_feed_dict, {x: x_pos}))
    x_neg.ravel().view(x_dtype)[row] -= delta
    y_neg = y.eval(feed_dict=_extra_feeds(extra_feed_dict, {x: x_neg}))
    diff = (y_pos - y_neg) / scale
    jacobian[row, :] = diff.ravel().view(y_dtype)
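  # (Illustrative sanity check: for the scalar map y = 2 * x at x = 3.0 with
  # delta = 1e-3, y_pos = 6.002 and y_neg = 5.998, so the quotient above
  # recovers the exact derivative 2.0.)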

  logging.vlog(1, "Numeric Jacobian =\n%s", jacobian)
  return jacobian


def _compute_dx_and_dy(x, y, y_shape):
  """Returns a node to compute the gradient of y wrt x."""
  # We make up a dy so that we can compute the gradients. We don't really use
  # the value of dy -- we will always feed it. We need to add an identity node
  # so that we can always feed it properly. Otherwise, for the Add operation,
  # dx is the same as dy and we cannot fetch the tensor that we are feeding.
  with x.graph.as_default():
    dy_orig = constant_op.constant(1.0, shape=y_shape, dtype=y.dtype)
    dy = array_ops.identity(dy_orig)
    # We compute the gradients for y wrt. x.
    grads = gradients.gradients(y, x, dy)
    assert len(grads) == 1
    return grads[0], dy_orig


def _compute_gradient(x,
                      x_shape,
                      dx,
                      y,
                      y_shape,
                      dy,
                      x_init_value=None,
                      delta=1e-3,
                      extra_feed_dict=None):
  """Computes the theoretical and numerical Jacobian."""
  t = dtypes.as_dtype(x.dtype)
  allowed_types = [dtypes.float16, dtypes.bfloat16, dtypes.float32,
                   dtypes.float64, dtypes.complex64, dtypes.complex128]
  assert t.base_dtype in allowed_types, "Don't support type %s for x" % t.name
  t2 = dtypes.as_dtype(y.dtype)
  assert t2.base_dtype in allowed_types, "Don't support type %s for y" % t2.name

  if x_init_value is not None:
    i_shape = list(x_init_value.shape)
    assert list(x_shape) == i_shape, "x_shape = %s, init_data shape = %s" % (
        x_shape, i_shape)
    x_data = x_init_value
  else:
    x_data = np.random.random_sample(x_shape).astype(t.as_numpy_dtype)
    if t.is_complex:
      x_data.imag = np.random.random_sample(x_shape)

  jacob_t = _compute_theoretical_jacobian(
      x, x_shape, x_data, dy, y_shape, dx, extra_feed_dict=extra_feed_dict)
  jacob_n = _compute_numeric_jacobian(
      x, x_shape, x_data, y, y_shape, delta, extra_feed_dict=extra_feed_dict)
  return jacob_t, jacob_n


def _compute_gradient_list(x,
                           x_shape,
                           y,
                           y_shape,
                           x_init_value=None,
                           delta=1e-3,
                           init_targets=None,
                           extra_feed_dict=None):
  """Computes gradients for a list of x values."""
  assert isinstance(x, list)
  dx, dy = zip(*[_compute_dx_and_dy(xi, y, y_shape) for xi in x])

  if init_targets is not None:
    assert isinstance(init_targets, (list, tuple))
    for init in init_targets:
      init.run()
  if x_init_value is None:
    x_init_value = [None] * len(x)
  ret = [_compute_gradient(xi, x_shapei, dxi, y, y_shape, dyi, x_init_valuei,
                           delta, extra_feed_dict=extra_feed_dict)
         for xi, x_shapei, dxi, dyi, x_init_valuei in zip(x, x_shape, dx, dy,
                                                          x_init_value)]
  return ret


@tf_export(v1=["test.compute_gradient"])
def compute_gradient(x,
                     x_shape,
                     y,
                     y_shape,
                     x_init_value=None,
                     delta=1e-3,
                     init_targets=None,
                     extra_feed_dict=None):
  """Computes and returns the theoretical and numerical Jacobian.

  If `x` or `y` is complex, the Jacobian will still be real but the
  corresponding Jacobian dimension(s) will be twice as large. This is required
  even if both input and output are complex since TensorFlow graphs are not
  necessarily holomorphic, and may have gradients not expressible as complex
  numbers. For example, if `x` is complex with shape `[m]` and `y` is complex
  with shape `[n]`, each Jacobian `J` will have shape `[m * 2, n * 2]` with

    J[::2, ::2] = d(Re y)/d(Re x)
    J[::2, 1::2] = d(Im y)/d(Re x)
    J[1::2, ::2] = d(Re y)/d(Im x)
    J[1::2, 1::2] = d(Im y)/d(Im x)

  Args:
    x: a tensor or list of tensors.
    x_shape: the dimensions of x as a tuple or an array of ints. If x is a
      list, then this is the list of shapes.
    y: a tensor.
    y_shape: the dimensions of y as a tuple or an array of ints.
    x_init_value: (optional) a numpy array of the same shape as "x"
      representing the initial value of x. If x is a list, this should be a
      list of numpy arrays. If this is None, the function will pick a random
      tensor as the initial value.
    delta: (optional) the amount of perturbation.
    init_targets: list of targets to run to initialize model params.
    extra_feed_dict: dict that allows fixing specified tensor values
      during the Jacobian calculation.

  Returns:
    Two 2-d numpy arrays representing the theoretical and numerical
    Jacobian for dy/dx. Each has "x_size" rows and "y_size" columns
    where "x_size" is the number of elements in x and "y_size" is the
    number of elements in y. If x is a list, returns a list of two numpy
    arrays.
  """
  # TODO(mrry): remove argument `init_targets`
  if extra_feed_dict is None:
    extra_feed_dict = {}

  if isinstance(x, list):
    return _compute_gradient_list(x, x_shape, y, y_shape, x_init_value, delta,
                                  init_targets, extra_feed_dict=extra_feed_dict)
  else:
    if init_targets is not None:
      assert isinstance(init_targets, (list, tuple))
      for init in init_targets:
        init.run()
    dx, dy = _compute_dx_and_dy(x, y, y_shape)
    ret = _compute_gradient(x, x_shape, dx, y, y_shape, dy, x_init_value, delta,
                            extra_feed_dict=extra_feed_dict)
    return ret
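

# A minimal illustrative sketch of how `compute_gradient` is typically used
# in a test. The op, shapes, and values below are made-up examples and not
# part of this module's API.
def _example_compute_gradient_usage():
  """Sketch: compare Jacobians for y = matmul(x, x) (assumes a session)."""
  x = constant_op.constant(2.0, shape=[2, 2], dtype=dtypes.float32)
  y = math_ops.matmul(x, x)
  # Both Jacobians are 4x4 here: x and y each have 4 elements.
  jacob_t, jacob_n = compute_gradient(x, [2, 2], y, [2, 2])
  # Return the largest elementwise discrepancy between the two.
  return np.fabs(jacob_t - jacob_n).max()
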

def _compute_error(grad):
  if isinstance(grad, tuple):
    grad = [grad]
  error = 0
  for j_t, j_n in grad:
    if j_t.size or j_n.size:  # Handle zero-size tensors correctly.
      error = np.maximum(error, np.fabs(j_t - j_n).max())
  return error


@tf_export(v1=["test.compute_gradient_error"])
def compute_gradient_error(x,
                           x_shape,
                           y,
                           y_shape,
                           x_init_value=None,
                           delta=1e-3,
                           init_targets=None,
                           extra_feed_dict=None):
  """Computes the gradient error.

  Computes the maximum error for dy/dx between the computed Jacobian and the
  numerically estimated Jacobian.

  This function will modify the tensors passed in as it adds more operations
  and hence changes the consumers of the operations of the input tensors.

  This function adds operations to the current graph. To compute the error
  using a particular device, such as a GPU, use the standard methods for
  setting a device (e.g. using with sess.graph.device() or setting a device
  function in the session constructor).

  Args:
    x: a tensor or list of tensors.
    x_shape: the dimensions of x as a tuple or an array of ints. If x is a
      list, then this is the list of shapes.
    y: a tensor.
    y_shape: the dimensions of y as a tuple or an array of ints.
    x_init_value: (optional) a numpy array of the same shape as "x"
      representing the initial value of x. If x is a list, this should be a
      list of numpy arrays. If this is None, the function will pick a random
      tensor as the initial value.
    delta: (optional) the amount of perturbation.
    init_targets: list of targets to run to initialize model params.
    extra_feed_dict: dict that allows fixing specified tensor values
      during the Jacobian calculation.

  Returns:
    The maximum error between the two Jacobians.
  """
  grad = compute_gradient(x, x_shape, y, y_shape, x_init_value, delta,
                          init_targets, extra_feed_dict=extra_feed_dict)
  return _compute_error(grad)
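

# A minimal illustrative sketch of using `compute_gradient_error` to check a
# registered gradient. The op, shapes, and threshold are made-up examples.
def _example_compute_gradient_error_usage():
  """Sketch: check sigmoid's gradient (assumes a default session)."""
  x = constant_op.constant([[1.0, 2.0], [3.0, 4.0]], dtype=dtypes.float64)
  y = math_ops.sigmoid(x)
  # For float64 inputs the returned error is typically well below 1e-6 when
  # the registered gradient matches the central-difference estimate.
  return compute_gradient_error(x, [2, 2], y, [2, 2])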