1# Copyright 2019 The TensorFlow Authors. All Rights Reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# ============================================================================== 15"""Eager-graph unified check numerics callback.""" 16 17from __future__ import absolute_import 18from __future__ import division 19from __future__ import print_function 20 21import collections 22import threading 23 24import numpy as np 25 26from tensorflow.core.protobuf import debug_event_pb2 27from tensorflow.python.debug.lib import op_callbacks_common 28from tensorflow.python.debug.lib import source_utils 29from tensorflow.python.eager import monitoring 30from tensorflow.python.framework import op_callbacks 31from tensorflow.python.framework import ops 32from tensorflow.python.ops import array_ops 33from tensorflow.python.ops import gen_debug_ops 34from tensorflow.python.platform import tf_logging as logging 35from tensorflow.python.util import compat 36from tensorflow.python.util.tf_export import tf_export 37 38 39# Many ops have benign NaN outputs, and running them with check_numerics 40# on will create unwanted errors 41# TODO(b/142497024): Replace this allowlist with function decorators in the ops 42IGNORE_OP_OUTPUTS = ( 43 # For FusedBatchNorm, if the input tensor is empty then batch_mean and 44 # batch_variance will be NaN. reserve_space holds intermediate values 45 # derived from batch_mean and batch_variance used for gradient calculation 46 (b"FusedBatchNorm", 1), # batch_mean 47 (b"FusedBatchNorm", 2), # batch_variance 48 (b"FusedBatchNorm", 3), # reserve_space_1 49 (b"FusedBatchNorm", 4), # reserve_space_2 50 51 # Same as above 52 (b"FusedBatchNormV2", 1), # batch_mean 53 (b"FusedBatchNormV2", 2), # batch_variance 54 (b"FusedBatchNormV2", 3), # reserve_space_1 55 (b"FusedBatchNormV2", 4), # reserve_space_2 56 57 # Same as above, but reserve_space_3 holds additional intermediate values 58 (b"FusedBatchNormV3", 1), # batch_mean 59 (b"FusedBatchNormV3", 2), # batch_variance 60 (b"FusedBatchNormV3", 3), # reserve_space_1 61 (b"FusedBatchNormV3", 4), # reserve_space_2 62 (b"FusedBatchNormV3", 5), # reserve_space_3 63) 64 65# Some frequently used ops are generally safe and we can skip them to reduce 66# overhead. NOTE: This list is compiled by observing operations called by 67# models in practice and is not a comprehensive list of safe operations. 68SAFE_OPS = ( 69 b"Concat", 70 b"ConcatV2", 71 b"ExpandDims", 72 b"Fill", 73 b"Gather", 74 b"Maximum", 75 b"Minimum", 76 b"Reshape", 77 b"Slice", 78 b"Squeeze", 79 b"Stack", 80 b"StridedSlice", 81 b"StridedSliceGrad", 82 b"TensorListConcatV2", 83 b"TensorListGather", 84 b"TensorListGetItem", 85 b"TensorListPopBack", 86 b"TensorListStack", 87 b"Transpose", 88 b"Unpack", 89) 90 91_state = threading.local() 92 93_check_numerics_callback_create_counter = monitoring.Counter( 94 "/tensorflow/api/python/debugging/check_numerics_callback_create_counter", 95 "Counter for number of times the check_numerics op callback is created.") 96 97 98def limit_string_length(string, max_len=50): 99 """Limit the length of input string. 100 101 Args: 102 string: Input string. 103 max_len: (int or None) If int, the length limit. If None, no limit. 104 105 Returns: 106 Possibly length-limited string. 107 """ 108 if max_len is None or len(string) <= max_len: 109 return string 110 else: 111 return "..." + string[len(string) - max_len:] 112 113 114# A dictionary that supports looking up the original input tensor names. 115_CHECK_NUMERICS_INPUT_LOOKUP = collections.defaultdict(dict) 116 117 118def _maybe_lookup_original_input_tensor(graph, tensor): 119 if (graph and 120 graph in _CHECK_NUMERICS_INPUT_LOOKUP and 121 tensor.name in _CHECK_NUMERICS_INPUT_LOOKUP[graph]): 122 return _CHECK_NUMERICS_INPUT_LOOKUP[graph][tensor.name] 123 else: 124 return tensor 125 126 127def get_check_numerics_error_message(slot, 128 num_outputs, 129 op_type, 130 tensor, 131 inputs, 132 graph=None, 133 traceback=None, 134 stack_height_limit=30, 135 path_length_limit=50): 136 """Create a meaningful and user-friendly error message about offending tensor. 137 138 The error message reveals the following info about the op that outputs 139 NaN/Infinity: dtype, shape (to the extent known at graph-construction time), 140 input tensors, stack trace for op creation (if is graph mode). 141 142 Args: 143 slot: (int) slot index of the tensor output. 144 num_outputs: (int) total number of outputs of the op. 145 op_type: (str) Type of the that generates `tensor`. 146 tensor: (Tensor) the offending tensor, i.e., the tensor that contains 147 Infinities or NaNs. 148 inputs: (array of Tensor) inputs to the op that generates `tensor`. 149 graph: (tf.Graph) the graph object that `tensor` belongs to. Available only 150 under graph mode. 151 traceback: (list of trace frames) the stack trace of the op's creation. 152 Available only under graph model. 153 stack_height_limit: (int or None) If int, limit to the height of the stack 154 trace printed in the error message. If None, no limit to the height. 155 path_length_limit: (int or None) Length limit for file paths included in the 156 formatted stack trace. 157 158 Returns: 159 (str) A formatted error message. 160 """ 161 eager_vs_graph_qualifier = "graph" if graph else "eagerly-executing" 162 message = "\n" 163 message += ( 164 "\n!!! Detected Infinity or NaN in output %d of " 165 "%s op \"%s\" (# of outputs: %d) !!!\n" % 166 (slot, eager_vs_graph_qualifier, op_type, num_outputs)) 167 168 message += " dtype: %s\n" % tensor.dtype 169 message += " shape: %s\n" % (tensor.shape,) 170 171 if not graph: 172 # This is an eager tensor. We can get its numpy value and count 173 # NaNs and Infs. 174 is_inf = np.isinf(tensor) 175 176 num_neg_inf = np.sum(np.logical_and(np.less(tensor, 0.), is_inf)) 177 num_pos_inf = np.sum(np.logical_and(np.greater(tensor, 0.), is_inf)) 178 num_nan = np.sum(np.isnan(tensor)) 179 if num_neg_inf > 0: 180 message += " # of -Inf elements: %s\n" % num_neg_inf 181 if num_pos_inf > 0: 182 message += " # of +Inf elements: %s\n" % num_pos_inf 183 if num_nan: 184 message += " # of +NaN elements: %s\n" % num_nan 185 186 if len(inputs) > 1: 187 message += "\n Input tensors (%d):\n" % len(inputs) 188 for slot, input_tensor in enumerate(inputs): 189 message += " %d: %s\n" % ( 190 slot, _maybe_lookup_original_input_tensor(graph, input_tensor)) 191 elif len(inputs) == 1: 192 message += "\n Input tensor: %s\n" % ( 193 _maybe_lookup_original_input_tensor(graph, inputs[0])) 194 if graph and hasattr(graph, "name") and graph.name: 195 message += " Graph name: \"%s\"\n" % graph.name 196 197 # Format the stack trace for the op's creation. We omit files that 198 # belong to tensorflow itself. 199 if graph and traceback: 200 message += ( 201 "\n Stack trace of op's creation (\"->\": inferred user code):\n") 202 if stack_height_limit is not None and len(traceback) > stack_height_limit: 203 num_omitted_frames = len(traceback) - stack_height_limit 204 message += " + ... (Omitted %d frames)\n" % num_omitted_frames 205 for filepath, lineno, function_name, source_line in traceback[ 206 -stack_height_limit:]: 207 user_code_indicator = " " 208 if not source_utils.guess_is_tensorflow_py_library(filepath): 209 user_code_indicator = " -> " 210 211 message += " + %s (L%d) %s\n" % ( 212 limit_string_length(filepath, path_length_limit), lineno, 213 function_name) 214 if source_line is not None: 215 message += "%s| %s\n" % (user_code_indicator, source_line) 216 message += "\n" 217 return message 218 219 220def _debug_summary(x): 221 return gen_debug_ops.debug_numeric_summary_v2( 222 x, 223 tensor_debug_mode=( 224 debug_event_pb2.TensorDebugMode.REDUCE_INF_NAN_THREE_SLOTS)) 225 226 227class CheckNumericsCallback(object): 228 """Wrapper for the numerics-checking callback for thread locality.""" 229 230 def __init__(self, stack_height_limit, path_length_limit): 231 self._stack_height_limit = stack_height_limit 232 self._path_length_limit = path_length_limit 233 # A dict mapping Placeholder tensors to their instrumenting debug tensors. 234 # Used only under V1 graph mode, where we can't rely on auto control 235 # dependency to execute the debug tensors and hence need to attach the debug 236 # tensors as control dependencies of the ops that consume the Placeholder. 237 self._placeholder_to_debug_tensor = dict() 238 239 def callback(self, 240 op_type, 241 inputs, 242 attrs, 243 outputs, 244 op_name=None, 245 graph=None): 246 """Eager-function unified callback for checking numerics.""" 247 del attrs, op_name # Unused 248 op_type_bytes = compat.as_bytes(op_type) 249 is_v1_graph_mode = not ops.executing_eagerly_outside_functions() 250 if (op_type_bytes in op_callbacks_common.OP_CALLBACK_SKIP_OPS or 251 op_type_bytes in SAFE_OPS): 252 return None 253 if graph: 254 # Under graph mode. Insert check_numerics op. 255 instrumented_outputs = [] 256 if is_v1_graph_mode: 257 for input_tensor in inputs: 258 if input_tensor in self._placeholder_to_debug_tensor and outputs: 259 outputs[0].op._add_control_input( # pylint: disable=protected-access 260 self._placeholder_to_debug_tensor[input_tensor].op) 261 for slot, output in enumerate(outputs): 262 if (output.dtype.is_floating and 263 (op_type_bytes, slot) not in IGNORE_OP_OUTPUTS): 264 checked_output = array_ops.check_numerics_v2( 265 # TF v2 has automatic control dependencies added to stateful async 266 # ops, which allows us to run check_numerics asynchronously. 267 # In the above case we use debug_summary to reduce all output 268 # tensors asynchronously from the op being checked and then 269 # process the tensor summary with check_numerics. 270 output if is_v1_graph_mode else _debug_summary(output), 271 get_check_numerics_error_message( 272 slot, 273 len(outputs), 274 op_type, 275 output, 276 inputs, 277 graph=graph, 278 traceback=output.op.traceback, 279 stack_height_limit=self._stack_height_limit, 280 path_length_limit=self._path_length_limit)) 281 _CHECK_NUMERICS_INPUT_LOOKUP[graph][checked_output.name] = output 282 instrumented_outputs.append(self._get_output_tensor( 283 op_type_bytes, output, checked_output, is_v1_graph_mode)) 284 else: 285 instrumented_outputs.append(output) 286 return instrumented_outputs 287 else: 288 if op_type_bytes == b"CheckNumericsV2": 289 # TODO(b/140334369): Remove this special casing logic once op_callback. 290 # automatically prevents infinite recursion in eager mode. 291 return None 292 # Under eager mode. Eagerly execute check_numerics op. 293 for slot, output in enumerate(outputs): 294 if (output.dtype.is_floating and 295 (op_type_bytes, slot) not in IGNORE_OP_OUTPUTS): 296 array_ops.check_numerics_v2( 297 output, 298 get_check_numerics_error_message( 299 slot, len(outputs), op_type, output, inputs, 300 stack_height_limit=self._stack_height_limit, 301 path_length_limit=self._path_length_limit)) 302 303 def _get_output_tensor(self, 304 op_type, 305 tensor, 306 checked_tensor, 307 is_v1_graph_mode): 308 """Determine what tensor to output from callback. 309 310 Args: 311 op_type: Type of the op that outputs the original symbolic tensor, as 312 `bytes`. 313 tensor: The original output symbolic tensor. 314 checked_tensor: The debugger-instrumented, numerics-checking tensor. 315 is_v1_graph_mode: Whether the debugged proggram is running under V1 graph 316 mode. 317 318 Returns: 319 A symbolic tensor to be returned by the dumping op_callback. 320 """ 321 if is_v1_graph_mode: 322 # Placeholders need special treatment under V1 graph mode. The 323 # callback can't simply override the Placeholder tensor to the debug 324 # tensor, as that would cause the Placeholder op to lack a value. 325 # The debug tensor is remembered and will be attached as control 326 # inputs to ops that consumer the Placeholders later. 327 if op_type == b"Placeholder": 328 self._placeholder_to_debug_tensor[tensor] = checked_tensor 329 return tensor 330 else: 331 return checked_tensor 332 else: 333 # Under non-v1 graph mode, rely on auto control dependency to run the 334 # checked tensor. 335 return tensor 336 337 338@tf_export("debugging.enable_check_numerics") 339def enable_check_numerics(stack_height_limit=30, 340 path_length_limit=50): 341 r"""Enable tensor numerics checking in an eager/graph unified fashion. 342 343 The numerics checking mechanism will cause any TensorFlow eager execution or 344 graph execution to error out as soon as an op's output tensor contains 345 infinity or NaN. 346 347 This method is idempotent. Calling it multiple times has the same effect 348 as calling it once. 349 350 This method takes effect only on the thread in which it is called. 351 352 When a op's float-type output tensor contains any Infinity or NaN, an 353 `tf.errors.InvalidArgumentError` will be thrown, with an error message that 354 reveals the following information: 355 - The type of the op that generated the tensor with bad numerics. 356 - Data type (dtype) of the tensor. 357 - Shape of the tensor (to the extent known at the time of eager execution 358 or graph construction). 359 - Name of the containing graph (if available). 360 - (Graph mode only): The stack trace of the intra-graph op's creation, 361 with a stack-height limit and a path-length limit for visual clarity. 362 The stack frames that belong to the user's code (as opposed to 363 tensorflow's internal code) are highlighted with a text arrow ("->"). 364 - (Eager mode only): How many of the offending tensor's elements are 365 `Infinity` and `NaN`, respectively. 366 367 Once enabled, the check-numerics mechanism can be disabled by using 368 `tf.debugging.disable_check_numerics()`. 369 370 Example usage: 371 372 1. Catching infinity during the execution of a `tf.function` graph: 373 374 ```py 375 import tensorflow as tf 376 377 tf.debugging.enable_check_numerics() 378 379 @tf.function 380 def square_log_x_plus_1(x): 381 v = tf.math.log(x + 1) 382 return tf.math.square(v) 383 384 x = -1.0 385 386 # When the following line runs, a function graph will be compiled 387 # from the Python function `square_log_x_plus_1()`. Due to the 388 # `enable_check_numerics()` call above, the graph will contain 389 # numerics checking ops that will run during the function graph's 390 # execution. The function call generates an -infinity when the Log 391 # (logarithm) op operates on the output tensor of the Add op. 392 # The program errors out at this line, printing an error message. 393 y = square_log_x_plus_1(x) 394 z = -y 395 ``` 396 397 2. Catching NaN during eager execution: 398 399 ```py 400 import numpy as np 401 import tensorflow as tf 402 403 tf.debugging.enable_check_numerics() 404 405 x = np.array([[0.0, -1.0], [4.0, 3.0]]) 406 407 # The following line executes the Sqrt op eagerly. Due to the negative 408 # element in the input array, a NaN is generated. Due to the 409 # `enable_check_numerics()` call above, the program errors immediately 410 # at this line, printing an error message. 411 y = tf.math.sqrt(x) 412 z = tf.matmul(y, y) 413 ``` 414 415 NOTE: If your code is running on TPUs, be sure to call 416 `tf.config.set_soft_device_placement(True)` before calling 417 `tf.debugging.enable_check_numerics()` as this API uses automatic outside 418 compilation on TPUs. For example: 419 420 ```py 421 tf.config.set_soft_device_placement(True) 422 tf.debugging.enable_check_numerics() 423 424 resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='') 425 strategy = tf.distribute.TPUStrategy(resolver) 426 with strategy.scope(): 427 # ... 428 ``` 429 430 Args: 431 stack_height_limit: Limit to the height of the printed stack trace. 432 Applicable only to ops in `tf.function`s (graphs). 433 path_length_limit: Limit to the file path included in the printed stack 434 trace. Applicable only to ops in `tf.function`s (graphs). 435 """ 436 if not hasattr(_state, "check_numerics_callback"): 437 _state.check_numerics_callback = CheckNumericsCallback( 438 stack_height_limit, path_length_limit) 439 op_callbacks.add_op_callback(_state.check_numerics_callback.callback) 440 441 logging.info( 442 "Enabled check-numerics callback in thread %s", 443 threading.current_thread().name) 444 _check_numerics_callback_create_counter.get_cell().increase_by(1) 445 446 447@tf_export("debugging.disable_check_numerics") 448def disable_check_numerics(): 449 """Disable the eager/graph unified numerics checking mechanism. 450 451 This method can be used after a call to `tf.debugging.enable_check_numerics()` 452 to disable the numerics-checking mechanism that catches infinity and NaN 453 values output by ops executed eagerly or in tf.function-compiled graphs. 454 455 This method is idempotent. Calling it multiple times has the same effect 456 as calling it once. 457 458 This method takes effect only on the thread in which it is called. 459 """ 460 if not hasattr(_state, "check_numerics_callback"): 461 return 462 try: 463 op_callbacks.remove_op_callback(_state.check_numerics_callback.callback) 464 delattr(_state, "check_numerics_callback") 465 logging.info( 466 "Disabled check-numerics callback in thread %s", 467 threading.current_thread().name) 468 except KeyError: 469 # Tolerate disabling the check numerics callback without 470 # enable_check_numerics() being called first. 471 pass 472