# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
15"""Eager-graph unified check numerics callback."""
16
17from __future__ import absolute_import
18from __future__ import division
19from __future__ import print_function
20
21import collections
22import threading
23
24import numpy as np
25
26from tensorflow.core.protobuf import debug_event_pb2
27from tensorflow.python.debug.lib import op_callbacks_common
28from tensorflow.python.debug.lib import source_utils
29from tensorflow.python.eager import monitoring
30from tensorflow.python.framework import op_callbacks
31from tensorflow.python.framework import ops
32from tensorflow.python.ops import array_ops
33from tensorflow.python.ops import gen_debug_ops
34from tensorflow.python.platform import tf_logging as logging
35from tensorflow.python.util import compat
36from tensorflow.python.util.tf_export import tf_export
37
38
# Many ops have benign NaN outputs. Running them with check_numerics
# enabled would create unwanted errors.
# TODO(b/142497024): Replace this allowlist with function decorators in the ops.
IGNORE_OP_OUTPUTS = (
    # For FusedBatchNorm, if the input tensor is empty then batch_mean and
    # batch_variance will be NaN. reserve_space holds intermediate values
    # derived from batch_mean and batch_variance used for gradient calculation.
    (b"FusedBatchNorm", 1),  # batch_mean
    (b"FusedBatchNorm", 2),  # batch_variance
    (b"FusedBatchNorm", 3),  # reserve_space_1
    (b"FusedBatchNorm", 4),  # reserve_space_2

    # Same as above.
    (b"FusedBatchNormV2", 1),  # batch_mean
    (b"FusedBatchNormV2", 2),  # batch_variance
    (b"FusedBatchNormV2", 3),  # reserve_space_1
    (b"FusedBatchNormV2", 4),  # reserve_space_2

    # Same as above, but reserve_space_3 holds additional intermediate values.
    (b"FusedBatchNormV3", 1),  # batch_mean
    (b"FusedBatchNormV3", 2),  # batch_variance
    (b"FusedBatchNormV3", 3),  # reserve_space_1
    (b"FusedBatchNormV3", 4),  # reserve_space_2
    (b"FusedBatchNormV3", 5),  # reserve_space_3
)
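# Example (illustrative): the callback below skips an output when its
# (op_type, output_slot) pair appears in this allowlist, e.g.:
#   (b"FusedBatchNormV3", 1) in IGNORE_OP_OUTPUTS  # True: batch_mean skipped.
#   (b"FusedBatchNormV3", 0) in IGNORE_OP_OUTPUTS  # False: output is checked.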

# Some frequently used ops are generally safe and we can skip them to reduce
# overhead. NOTE: This list is compiled by observing operations called by
# models in practice and is not a comprehensive list of safe operations.
SAFE_OPS = (
    b"Concat",
    b"ConcatV2",
    b"ExpandDims",
    b"Fill",
    b"Gather",
    b"Maximum",
    b"Minimum",
    b"Reshape",
    b"Slice",
    b"Squeeze",
    b"Stack",
    b"StridedSlice",
    b"StridedSliceGrad",
    b"TensorListConcatV2",
    b"TensorListGather",
    b"TensorListGetItem",
    b"TensorListPopBack",
    b"TensorListStack",
    b"Transpose",
    b"Unpack",
)
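# Example (illustrative): op types reach the callback as `bytes`, so a skip
# test looks like:
#   compat.as_bytes("Reshape") in SAFE_OPS  # True: Reshape is never checked.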
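# `_state` is thread-local so that the callback can be enabled and disabled
# per-thread (see the docstrings of `enable_check_numerics()` and
# `disable_check_numerics()` below).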
_state = threading.local()

_check_numerics_callback_create_counter = monitoring.Counter(
    "/tensorflow/api/python/debugging/check_numerics_callback_create_counter",
    "Counter for number of times the check_numerics op callback is created.")


def limit_string_length(string, max_len=50):
  """Limit the length of the input string.

  Args:
    string: Input string.
    max_len: (int or None) If int, the length limit. If None, no limit.

  Returns:
    Possibly length-limited string.
  """
  if max_len is None or len(string) <= max_len:
    return string
  else:
    return "..." + string[len(string) - max_len:]
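
# Example (illustrative): keeps only the trailing `max_len` characters,
# prepending an ellipsis:
#   limit_string_length("abcdefgh", max_len=5)  # -> "...defgh"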


# A dictionary mapping each graph to a dict that maps the names of the
# instrumenting check-numerics tensors back to the original tensors they
# wrap, so that error messages can display the original inputs.
_CHECK_NUMERICS_INPUT_LOOKUP = collections.defaultdict(dict)


def _maybe_lookup_original_input_tensor(graph, tensor):
  if (graph and
      graph in _CHECK_NUMERICS_INPUT_LOOKUP and
      tensor.name in _CHECK_NUMERICS_INPUT_LOOKUP[graph]):
    return _CHECK_NUMERICS_INPUT_LOOKUP[graph][tensor.name]
  else:
    return tensor
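
# Example (illustrative): if tensor `y` was instrumented as
# `y_checked = check_numerics_v2(y, ...)` in `graph`, then
# `_maybe_lookup_original_input_tensor(graph, y_checked)` returns `y`, so
# error messages show the original tensor instead of the wrapper.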


def get_check_numerics_error_message(slot,
                                     num_outputs,
                                     op_type,
                                     tensor,
                                     inputs,
                                     graph=None,
                                     traceback=None,
                                     stack_height_limit=30,
                                     path_length_limit=50):
  """Create a meaningful and user-friendly error message about the offending tensor.

  The error message reveals the following info about the op that outputs
  NaN/Infinity: dtype, shape (to the extent known at graph-construction time),
  input tensors, and the stack trace of the op's creation (in graph mode).

  Args:
    slot: (int) slot index of the tensor output.
    num_outputs: (int) total number of outputs of the op.
    op_type: (str) Type of the op that generates `tensor`.
    tensor: (Tensor) the offending tensor, i.e., the tensor that contains
      Infinities or NaNs.
    inputs: (array of Tensor) inputs to the op that generates `tensor`.
    graph: (tf.Graph) the graph object that `tensor` belongs to. Available only
      under graph mode.
    traceback: (list of trace frames) the stack trace of the op's creation.
      Available only under graph mode.
    stack_height_limit: (int or None) If int, limit on the height of the stack
      trace printed in the error message. If None, no limit on the height.
    path_length_limit: (int or None) Length limit for file paths included in the
      formatted stack trace.

  Returns:
    (str) A formatted error message.
  """
  eager_vs_graph_qualifier = "graph" if graph else "eagerly-executing"
  message = "\n"
  message += (
      "\n!!! Detected Infinity or NaN in output %d of "
      "%s op \"%s\" (# of outputs: %d) !!!\n" %
      (slot, eager_vs_graph_qualifier, op_type, num_outputs))

  message += "  dtype: %s\n" % tensor.dtype
  message += "  shape: %s\n" % (tensor.shape,)

  if not graph:
    # This is an eager tensor. We can get its numpy value and count
    # NaNs and Infs.
    is_inf = np.isinf(tensor)

    num_neg_inf = np.sum(np.logical_and(np.less(tensor, 0.), is_inf))
    num_pos_inf = np.sum(np.logical_and(np.greater(tensor, 0.), is_inf))
    num_nan = np.sum(np.isnan(tensor))
    if num_neg_inf > 0:
      message += "  # of -Inf elements: %s\n" % num_neg_inf
    if num_pos_inf > 0:
      message += "  # of +Inf elements: %s\n" % num_pos_inf
    if num_nan > 0:
      message += "  # of NaN elements: %s\n" % num_nan

  if len(inputs) > 1:
    message += "\n  Input tensors (%d):\n" % len(inputs)
    for input_slot, input_tensor in enumerate(inputs):
      message += "         %d: %s\n" % (
          input_slot, _maybe_lookup_original_input_tensor(graph, input_tensor))
  elif len(inputs) == 1:
    message += "\n  Input tensor: %s\n" % (
        _maybe_lookup_original_input_tensor(graph, inputs[0]))
  if graph and hasattr(graph, "name") and graph.name:
    message += "  Graph name: \"%s\"\n" % graph.name

  # Format the stack trace of the op's creation. We omit files that
  # belong to tensorflow itself.
  if graph and traceback:
    message += (
        "\n  Stack trace of op's creation (\"->\": inferred user code):\n")
    if stack_height_limit is not None and len(traceback) > stack_height_limit:
      num_omitted_frames = len(traceback) - stack_height_limit
      message += "    + ... (Omitted %d frames)\n" % num_omitted_frames
      frames = traceback[-stack_height_limit:]
    else:
      # Covers both the no-limit (None) case and traces within the limit.
      frames = traceback
    for filepath, lineno, function_name, source_line in frames:
      user_code_indicator = "    "
      if not source_utils.guess_is_tensorflow_py_library(filepath):
        user_code_indicator = " -> "

      message += "    + %s (L%d) %s\n" % (
          limit_string_length(filepath, path_length_limit), lineno,
          function_name)
      if source_line is not None:
        message += "%s|   %s\n" % (user_code_indicator, source_line)
  message += "\n"
  return message
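
# For illustration, an eager `Log` op producing a scalar -Inf yields a message
# resembling:
#
#   !!! Detected Infinity or NaN in output 0 of eagerly-executing op "Log"
#   (# of outputs: 1) !!!
#     dtype: <dtype: 'float32'>
#     shape: ()
#     # of -Inf elements: 1
#     Input tensor: ...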


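# Note: REDUCE_INF_NAN_THREE_SLOTS reduces a tensor to a small summary that
# records the presence of -Inf, +Inf, and NaN, so in V2 graph mode only this
# summary (rather than the full tensor) is fed to check_numerics_v2 below.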
def _debug_summary(x):
  return gen_debug_ops.debug_numeric_summary_v2(
      x,
      tensor_debug_mode=(
          debug_event_pb2.TensorDebugMode.REDUCE_INF_NAN_THREE_SLOTS))


class CheckNumericsCallback(object):
  """Wrapper for the numerics-checking callback for thread locality."""

  def __init__(self, stack_height_limit, path_length_limit):
    self._stack_height_limit = stack_height_limit
    self._path_length_limit = path_length_limit
    # A dict mapping Placeholder tensors to their instrumenting debug tensors.
    # Used only under V1 graph mode, where we can't rely on auto control
    # dependency to execute the debug tensors and hence need to attach the
    # debug tensors as control dependencies of the ops that consume the
    # Placeholder.
    self._placeholder_to_debug_tensor = dict()

  def callback(self,
               op_type,
               inputs,
               attrs,
               outputs,
               op_name=None,
               graph=None):
    """Eager/graph unified callback for checking numerics."""
    del attrs, op_name  # Unused
    op_type_bytes = compat.as_bytes(op_type)
    is_v1_graph_mode = not ops.executing_eagerly_outside_functions()
    if (op_type_bytes in op_callbacks_common.OP_CALLBACK_SKIP_OPS or
        op_type_bytes in SAFE_OPS):
      return None
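
    # The two branches below, summarized: under graph mode, wrap each float
    # output in a check_numerics_v2 op; under eager mode, run
    # check_numerics_v2 immediately on each float output.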
    if graph:
      # Under graph mode. Insert check_numerics op.
      instrumented_outputs = []
      if is_v1_graph_mode:
        for input_tensor in inputs:
          if input_tensor in self._placeholder_to_debug_tensor and outputs:
            outputs[0].op._add_control_input(  # pylint: disable=protected-access
                self._placeholder_to_debug_tensor[input_tensor].op)
      for slot, output in enumerate(outputs):
        if (output.dtype.is_floating and
            (op_type_bytes, slot) not in IGNORE_OP_OUTPUTS):
          checked_output = array_ops.check_numerics_v2(
              # TF v2 has automatic control dependencies added to stateful
              # async ops, which allows us to run check_numerics
              # asynchronously. In that case we use debug_summary to reduce
              # each output tensor asynchronously from the op being checked
              # and then process the tensor summary with check_numerics.
              output if is_v1_graph_mode else _debug_summary(output),
              get_check_numerics_error_message(
                  slot,
                  len(outputs),
                  op_type,
                  output,
                  inputs,
                  graph=graph,
                  traceback=output.op.traceback,
                  stack_height_limit=self._stack_height_limit,
                  path_length_limit=self._path_length_limit))
          _CHECK_NUMERICS_INPUT_LOOKUP[graph][checked_output.name] = output
          instrumented_outputs.append(self._get_output_tensor(
              op_type_bytes, output, checked_output, is_v1_graph_mode))
        else:
          instrumented_outputs.append(output)
      return instrumented_outputs
    else:
      if op_type_bytes == b"CheckNumericsV2":
        # TODO(b/140334369): Remove this special-casing logic once op_callback
        # automatically prevents infinite recursion in eager mode.
        return None
      # Under eager mode. Eagerly execute check_numerics op.
      for slot, output in enumerate(outputs):
        if (output.dtype.is_floating and
            (op_type_bytes, slot) not in IGNORE_OP_OUTPUTS):
          array_ops.check_numerics_v2(
              output,
              get_check_numerics_error_message(
                  slot, len(outputs), op_type, output, inputs,
                  stack_height_limit=self._stack_height_limit,
                  path_length_limit=self._path_length_limit))

  def _get_output_tensor(self,
                         op_type,
                         tensor,
                         checked_tensor,
                         is_v1_graph_mode):
    """Determine what tensor to output from callback.

    Args:
      op_type: Type of the op that outputs the original symbolic tensor, as
        `bytes`.
      tensor: The original output symbolic tensor.
      checked_tensor: The debugger-instrumented, numerics-checking tensor.
      is_v1_graph_mode: Whether the debugged program is running under V1 graph
        mode.

    Returns:
      A symbolic tensor to be returned by the dumping op_callback.
    """
    if is_v1_graph_mode:
      # Placeholders need special treatment under V1 graph mode. The
      # callback can't simply replace the Placeholder tensor with the debug
      # tensor, as that would cause the Placeholder op to lack a value.
      # The debug tensor is remembered and will be attached as control
      # inputs to ops that consume the Placeholders later.
      if op_type == b"Placeholder":
        self._placeholder_to_debug_tensor[tensor] = checked_tensor
        return tensor
      else:
        return checked_tensor
    else:
      # Under non-V1 graph mode, rely on auto control dependency to run the
      # checked tensor.
      return tensor


@tf_export("debugging.enable_check_numerics")
def enable_check_numerics(stack_height_limit=30,
                          path_length_limit=50):
  r"""Enable tensor numerics checking in an eager/graph unified fashion.

  The numerics checking mechanism will cause any TensorFlow eager execution or
  graph execution to error out as soon as an op's output tensor contains
  infinity or NaN.

  This method is idempotent. Calling it multiple times has the same effect
  as calling it once.

  This method takes effect only on the thread in which it is called.

  When an op's float-type output tensor contains any Infinity or NaN, a
  `tf.errors.InvalidArgumentError` will be thrown, with an error message that
  reveals the following information:
    - The type of the op that generated the tensor with bad numerics.
    - Data type (dtype) of the tensor.
    - Shape of the tensor (to the extent known at the time of eager execution
      or graph construction).
    - Name of the containing graph (if available).
    - (Graph mode only): The stack trace of the intra-graph op's creation,
      with a stack-height limit and a path-length limit for visual clarity.
      The stack frames that belong to the user's code (as opposed to
      tensorflow's internal code) are highlighted with a text arrow ("->").
    - (Eager mode only): How many of the offending tensor's elements are
      `Infinity` and `NaN`, respectively.

  Once enabled, the check-numerics mechanism can be disabled by using
  `tf.debugging.disable_check_numerics()`.

  Example usage:

  1. Catching infinity during the execution of a `tf.function` graph:

     ```py
     import tensorflow as tf

     tf.debugging.enable_check_numerics()

     @tf.function
     def square_log_x_plus_1(x):
       v = tf.math.log(x + 1)
       return tf.math.square(v)

     x = -1.0

     # When the following line runs, a function graph will be compiled
     # from the Python function `square_log_x_plus_1()`. Due to the
     # `enable_check_numerics()` call above, the graph will contain
     # numerics checking ops that will run during the function graph's
     # execution. The function call generates a -infinity when the Log
     # (logarithm) op operates on the output tensor of the Add op.
     # The program errors out at this line, printing an error message.
     y = square_log_x_plus_1(x)
     z = -y
     ```

  2. Catching NaN during eager execution:

     ```py
     import numpy as np
     import tensorflow as tf

     tf.debugging.enable_check_numerics()

     x = np.array([[0.0, -1.0], [4.0, 3.0]])

     # The following line executes the Sqrt op eagerly. Due to the negative
     # element in the input array, a NaN is generated. Due to the
     # `enable_check_numerics()` call above, the program errors immediately
     # at this line, printing an error message.
     y = tf.math.sqrt(x)
     z = tf.matmul(y, y)
     ```

  NOTE: If your code is running on TPUs, be sure to call
  `tf.config.set_soft_device_placement(True)` before calling
  `tf.debugging.enable_check_numerics()` as this API uses automatic outside
  compilation on TPUs. For example:

  ```py
  tf.config.set_soft_device_placement(True)
  tf.debugging.enable_check_numerics()

  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
  strategy = tf.distribute.TPUStrategy(resolver)
  with strategy.scope():
    # ...
  ```

  Args:
    stack_height_limit: Limit on the height of the printed stack trace.
      Applicable only to ops in `tf.function`s (graphs).
    path_length_limit: Length limit for file paths included in the printed
      stack trace. Applicable only to ops in `tf.function`s (graphs).
  """
  if not hasattr(_state, "check_numerics_callback"):
    _state.check_numerics_callback = CheckNumericsCallback(
        stack_height_limit, path_length_limit)
  op_callbacks.add_op_callback(_state.check_numerics_callback.callback)

  logging.info(
      "Enabled check-numerics callback in thread %s",
      threading.current_thread().name)
  _check_numerics_callback_create_counter.get_cell().increase_by(1)



@tf_export("debugging.disable_check_numerics")
def disable_check_numerics():
  """Disable the eager/graph unified numerics checking mechanism.

  This method can be used after a call to
  `tf.debugging.enable_check_numerics()` to disable the numerics-checking
  mechanism that catches infinity and NaN values output by ops executed
  eagerly or in tf.function-compiled graphs.

  This method is idempotent. Calling it multiple times has the same effect
  as calling it once.

  This method takes effect only on the thread in which it is called.
  """
  if not hasattr(_state, "check_numerics_callback"):
    return
  try:
    op_callbacks.remove_op_callback(_state.check_numerics_callback.callback)
    delattr(_state, "check_numerics_callback")
    logging.info(
        "Disabled check-numerics callback in thread %s",
        threading.current_thread().name)
  except KeyError:
    # Tolerate disabling the check numerics callback without
    # enable_check_numerics() being called first.
    pass
472