# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Gradient checker for functions.

The gradient checker verifies numerically that a function properly
computes its gradients.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

from tensorflow.python.eager import backprop
from tensorflow.python.eager import context
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util.tf_export import tf_export


def _product(t):
  if isinstance(t, int):
    return t
  else:
    y = 1
    for x in t:
      y *= x
    return y


def _eval_indexed_slices(a):
  """Converts IndexedSlices to IndexedSlicesValue with numpy indices/values.

  When eager execution is enabled, converts IndexedSlices
  to IndexedSlicesValue with numpy indices/values.

  Args:
    a: any value.

  Returns:
    If a is IndexedSlices and eager execution is enabled, calls numpy() on a's
    fields. Otherwise returns a unchanged.
  """
  if isinstance(a, ops.IndexedSlices) and context.executing_eagerly():
    return ops.IndexedSlicesValue(
        indices=[x.numpy() for x in a.indices],
        values=[x.numpy() for x in a.values],
        dense_shape=a.dense_shape)
  return a


def _to_numpy(a):
  """Converts Tensors, EagerTensors, and IndexedSlicesValue to numpy arrays.

  Args:
    a: any value.

  Returns:
    If a is EagerTensor or Tensor, returns the evaluation of a by calling
    numpy() or run(). If a is IndexedSlicesValue, constructs the corresponding
    dense numpy array. Otherwise returns a unchanged.
  """
  if isinstance(a, ops.EagerTensor):
    return a.numpy()
  if isinstance(a, ops.Tensor):
    sess = ops.get_default_session()
    return sess.run(a)
  if isinstance(a, ops.IndexedSlicesValue):
    arr = np.zeros(a.dense_shape)
    assert len(a.values) == len(a.indices), (
        "IndexedSlicesValue has %s value slices but %s indices\n%s" %
        (a.values, a.indices, a))
    for values_slice, index in zip(a.values, a.indices):
      assert 0 <= index < len(arr), (
          "IndexedSlicesValue has invalid index %s\n%s" % (index, a))
      arr[index] += values_slice
    return arr
  return a
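

# The docstring above describes how _to_numpy densifies an IndexedSlicesValue.
# Below is a minimal sketch of that accumulation, written against the module's
# own helpers; the example function itself is hypothetical and not part of the
# checker.
def _example_densify_indexed_slices():
  """Illustrative only: rows named by `indices` accumulate into a dense array."""
  slices = ops.IndexedSlicesValue(
      values=[np.array([1., 1.]), np.array([2., 2.]), np.array([3., 3.])],
      indices=[0, 2, 0],
      dense_shape=[4, 2])
  # Duplicate indices add up: row 0 receives both [1., 1.] and [3., 3.].
  return _to_numpy(slices)  # [[4., 4.], [0., 0.], [2., 2.], [0., 0.]]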


def _prepare(f, xs_dtypes):
  """Return a function that executes 'f'.

  In TF 2.x, this is the same as `f`.
  In TF 1.x, returns a Python function that executes the graph defined by `f`
  in a Session.

  Args:
    f: the function.
    xs_dtypes: dtypes of f's arguments.

  Returns:
    a function that will be evaluated in both graph and eager mode.
  """
  if context.executing_eagerly():

    def decorated_eager(*xs_data):
      return f(*map(ops.convert_to_tensor, xs_data))

    return decorated_eager
  xs = [array_ops.placeholder(x_dtype) for x_dtype in xs_dtypes]
  y = f(*xs)
  sess = ops.get_default_session()

  def decorated_graph(*xs_data):
    xs_data = [_to_numpy(a) for a in xs_data]
    return sess.run(y, feed_dict=dict(zip(xs, xs_data)))

  return decorated_graph


def _compute_theoretical_jacobian(f, y_shape, y_dtype, xs, param):
  """Computes the theoretical Jacobian for f regarding xs[param].

  One can think of the relation among f, xs and y as y = f(xs).

  Args:
    f: the function.
    y_shape: the shape of the result.
    y_dtype: the dtype of the result.
    xs: a list of tensors.
    param: the index of the target parameter.

  Returns:
    A 2-d numpy array representing the Jacobian. It has "x_size" rows
    and "y_size" columns where "x_size" is the number of elements in xs[param]
    and "y_size" is the number of elements in the result.

  Raises:
    ValueError: If result is empty but the gradient is nonzero.
  """
  x = xs[param]
  # Complex vectors are treated as vectors of twice as many reals.
  x_shape = tuple(x.shape) + (2,) if x.dtype.is_complex else x.shape
  y_factor = 2 if y_dtype.is_complex else 1

  # To compute the jacobian, we treat x and y as one-dimensional vectors.
  x_size = _product(x_shape)
  x_val_size = _product(x_shape[1:])  # This is used for sparse gradients.
  y_size = _product(y_shape) * y_factor

  # Allocate 2-D Jacobian, with x dimensions smashed into the first
  # dimension and y dimensions smashed into the second.
  jacobian = np.zeros((x_size, y_size), dtype=x.dtype.real_dtype.as_numpy_dtype)

  # For each entry of dy, we set it to 1 and everything else to 0, then
  # compute the gradients -- this gives us one column of the Jacobian matrix.
  dy_data = np.zeros(y_shape, dtype=y_dtype.as_numpy_dtype)
  dy_data_flat = dy_data.ravel().view(y_dtype.real_dtype.as_numpy_dtype)
  grad_fn_unprep = backprop.gradients_function(f, [param])
  grad_fn = _prepare(lambda dy, *xs: grad_fn_unprep(*xs, dy=dy),
                     [y_dtype] + [x.dtype for x in xs])
  for col in range(y_size):
    dy_data_flat[col] = 1
    grad = _to_numpy(grad_fn(dy_data, *xs)[0])
    grad = _eval_indexed_slices(grad)
    dy_data_flat[col] = 0
    if isinstance(grad, ops.IndexedSlicesValue):
      for i, v in zip(grad.indices, grad.values):
        r_begin = i * x_val_size
        r_end = r_begin + x_val_size
        jacobian[r_begin:r_end, col] += v.flat
    else:
      jacobian[:, col] = grad.ravel().view(jacobian.dtype)

  # If the output is empty, run the gradients at least once and make sure
  # they produce zeros.
  if y_size == 0:  # don't use 'not y_size', because y_size may not be an int
    grad = _to_numpy(grad_fn(dy_data, *xs)[0])
    if grad.shape != x.shape:
      raise ValueError("Empty gradient has wrong shape: expected %s, got %s" %
                       (x.shape, grad.shape))
    if np.any(grad):
      raise ValueError("Empty tensor with nonzero gradients")

  logging.vlog(1, "Theoretical Jacobian =\n%s", jacobian)
  return jacobian
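

# _compute_theoretical_jacobian above fills the Jacobian one column at a time
# by seeding `dy` with a one-hot vector and back-propagating through f. Below
# is a minimal sketch of a single column, assuming eager execution (TF 2.x);
# the example function is hypothetical and not part of the checker.
def _example_theoretical_jacobian_column():
  """Illustrative only: column 1 of the Jacobian of y = x**2 at x = [1, 2, 3]."""
  from tensorflow.python.framework import constant_op
  x = constant_op.constant([1., 2., 3.])
  dy = constant_op.constant([0., 1., 0.])  # Select the second output element.
  grad_fn = backprop.gradients_function(math_ops.square, [0])
  # Since y[i] = x[i]**2, d(y[1])/dx = [0., 2 * x[1], 0.] = [0., 4., 0.].
  return grad_fn(x, dy=dy)[0].numpy()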


def _compute_numeric_jacobian(f, y_size, y_dtype, xs, param, delta):
  """Computes the numeric Jacobian for f regarding xs[param].

  One can think of the relation among f, xs and y as y = f(xs).

  Args:
    f: the function.
    y_size: the number of elements of the result.
    y_dtype: the dtype of the result.
    xs: a list of tensors.
    param: the index of the target parameter.
    delta: the amount of perturbation we give to the input.

  Returns:
    A 2-d numpy array representing the Jacobian. It has "x_size" rows
    and "y_size" columns where "x_size" is the number of elements in xs[param]
    and "y_size" is the number of elements in the result.
  """
  # bfloat16 doesn't have enough bits to represent high precision numbers such
  # as delta. Convert to float32 here. Since numeric_jacobian is expected to
  # be the ground truth to compare against, it shouldn't lose any information.
  x_shape = xs[param].shape
  x_dtype = xs[param].dtype
  if y_dtype == dtypes.bfloat16:
    # Bind the original f under a new name; rebinding `f` inside the lambda
    # would make the lambda call itself recursively.
    f_orig = f
    f = lambda *xs: math_ops.cast(f_orig(*xs), dtypes.float32)
    y_dtype = dtypes.float32

  # To compute the jacobian, we treat x and y as one-dimensional vectors.
  x_size = _product(x_shape) * (2 if x_dtype.is_complex else 1)
  y_size = y_size * (2 if y_dtype.is_complex else 1)
  x_dtype = x_dtype.real_dtype.as_numpy_dtype
  y_dtype = y_dtype.real_dtype.as_numpy_dtype

  xs_dtypes = [x.dtype for x in xs]
  # Converts xs to numpy arrays to do in-place perturbation.
  # Calls asarray() to avoid copying in ravel() later.
  xs = [np.asarray(_to_numpy(x)) for x in xs]
  x = xs[param]

  # Make sure we have the right types.
  scale = np.asarray(2 * delta, dtype=y_dtype)[()]

  jacobian = np.zeros((x_size, y_size), dtype=x_dtype)

  # For each entry of x, we slightly perturb it by adding and subtracting a
  # delta, then compute the difference between the two outputs. This gives us
  # one row of the Jacobian matrix.
  f = _prepare(f, xs_dtypes)
  for row in range(x_size):
    original = x.ravel().view(x_dtype)[row]
    x.ravel().view(x_dtype)[row] += delta
    y_pos = _to_numpy(f(*xs))
    x.ravel().view(x_dtype)[row] = original
    x.ravel().view(x_dtype)[row] -= delta
    y_neg = _to_numpy(f(*xs))
    x.ravel().view(x_dtype)[row] = original
    diff = (y_pos - y_neg) / scale
    jacobian[row, :] = diff.ravel().view(y_dtype)

  logging.vlog(1, "Numeric Jacobian =\n%s", jacobian)
  return jacobian
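

# The loop above is plain central differencing: each entry of x is perturbed
# by +delta and -delta and the two outputs are differenced. Below is a minimal
# pure-numpy sketch of that rule on a scalar function; the example function is
# hypothetical and not part of the checker.
def _example_central_difference(delta=1e-3):
  """Illustrative only: (f(x + delta) - f(x - delta)) / (2 * delta)."""
  f = np.square
  x = 2.0
  numeric = (f(x + delta) - f(x - delta)) / (2 * delta)
  return numeric  # ~4.0, the true derivative of x**2 at x = 2.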


def _compute_gradient(f, y_shape, y_dtype, xs, param, delta):
  """Computes the theoretical and numerical Jacobian."""
  x = xs[param]
  t = x.dtype
  allowed_types = [dtypes.float16, dtypes.bfloat16, dtypes.float32,
                   dtypes.float64, dtypes.complex64, dtypes.complex128]
  assert t.base_dtype in allowed_types, ("Cannot compute gradient for "
                                         "unsupported type %s of argument %s" %
                                         (t.name, param))
  t2 = y_dtype
  assert t2.base_dtype in allowed_types, ("Cannot compute gradient for "
                                          "unsupported type %s of y" % t2.name)
  y_size = _product(y_shape)
  jacob_t = _compute_theoretical_jacobian(f, y_shape, y_dtype, xs, param)
  jacob_n = _compute_numeric_jacobian(f, y_size, y_dtype, xs, param, delta)
  return jacob_t, jacob_n


def _compute_gradient_list(f, xs, delta):
  """Computes gradients for a list of x values."""
  # Convert xs to tensors so that dtype and shape have uniform types.
  xs = list(map(ops.convert_to_tensor, xs))
  # Run the function to get info of the result.
  xs_dtypes = [x.dtype for x in xs]
  f_temp = _prepare(f, xs_dtypes)
  y = f_temp(*xs)
  return zip(*[_compute_gradient(f, y.shape, dtypes.as_dtype(y.dtype),
                                 xs, i, delta) for i in range(len(xs))])


@tf_export("test.compute_gradient", v1=[])
def compute_gradient(f, x, delta=1e-3):
  """Computes the theoretical and numeric Jacobian of f.

  With y = f(x), computes the theoretical and numeric Jacobian dy/dx.

  Args:
    f: the function.
    x: a list of tensors.
    delta: (optional) perturbation used to compute numeric Jacobian.

  Returns:
    A pair of lists, where the first is a list of 2-d numpy arrays representing
    the theoretical Jacobians for each argument, and the second list is the
    numerical ones. Each 2-d array has "x_size" rows
    and "y_size" columns where "x_size" is the number of elements in the
    corresponding argument and "y_size" is the number of elements in f(x).

  Raises:
    ValueError: If result is empty but the gradient is nonzero.
  """
  if not isinstance(x, list):
    raise ValueError(
        "`x` must be a list of Tensors (arguments to `f`), not a %s" % type(x))
  return _compute_gradient_list(f, x, delta)


def max_error(grad1, grad2):
  """Computes maximum elementwise gap.

  Computes the maximum elementwise gap between two lists of tensors of the same
  shape.

  Args:
    grad1: a list of tensors.
    grad2: a list of tensors with the same shape as grad1.

  Returns:
    The maximum elementwise gap between the two.
  """
  error = 0
  for j_t, j_n in zip(grad1, grad2):
    if j_t.size or j_n.size:  # Handle zero-size tensors correctly.
      error = np.maximum(error, np.fabs(j_t - j_n).max())
  return error
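

# Typical usage of the public API above: compare the theoretical and numeric
# Jacobians of an op and check that the largest elementwise gap is small. A
# minimal sketch, assuming eager execution (TF 2.x); the example function and
# the 1e-4 tolerance are illustrative, not part of the checker.
def _example_check_square_gradient():
  """Illustrative only: gradient check for math_ops.square."""
  from tensorflow.python.framework import constant_op
  x = constant_op.constant([[1., 2.], [3., 4.]], dtype=dtypes.float64)
  theoretical, numeric = compute_gradient(math_ops.square, [x])
  return max_error(theoretical, numeric) < 1e-4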