# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Eager-graph unified check numerics callback."""

import collections
import threading

import numpy as np

from tensorflow.core.protobuf import debug_event_pb2
from tensorflow.python.debug.lib import op_callbacks_common
from tensorflow.python.debug.lib import source_utils
from tensorflow.python.eager import monitoring
from tensorflow.python.framework import op_callbacks
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gen_debug_ops
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util import compat
from tensorflow.python.util import object_identity
from tensorflow.python.util.tf_export import tf_export


# Many ops have benign NaN outputs, and running them with check_numerics
# on will create unwanted errors.
# TODO(b/142497024): Replace this allowlist with function decorators in the ops
IGNORE_OP_OUTPUTS = (
    # For FusedBatchNorm, if the input tensor is empty then batch_mean and
    # batch_variance will be NaN. reserve_space holds intermediate values
    # derived from batch_mean and batch_variance used for gradient calculation.
    (b"FusedBatchNorm", 1),  # batch_mean
    (b"FusedBatchNorm", 2),  # batch_variance
    (b"FusedBatchNorm", 3),  # reserve_space_1
    (b"FusedBatchNorm", 4),  # reserve_space_2

    # Same as above.
    (b"FusedBatchNormV2", 1),  # batch_mean
    (b"FusedBatchNormV2", 2),  # batch_variance
    (b"FusedBatchNormV2", 3),  # reserve_space_1
    (b"FusedBatchNormV2", 4),  # reserve_space_2

    # Same as above, but reserve_space_3 holds additional intermediate values.
    (b"FusedBatchNormV3", 1),  # batch_mean
    (b"FusedBatchNormV3", 2),  # batch_variance
    (b"FusedBatchNormV3", 3),  # reserve_space_1
    (b"FusedBatchNormV3", 4),  # reserve_space_2
    (b"FusedBatchNormV3", 5),  # reserve_space_3
)
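
# Each entry above is an (op_type, output_slot) pair; CheckNumericsCallback
# below consults this tuple via `(op_type_bytes, slot) not in
# IGNORE_OP_OUTPUTS` to skip instrumenting these outputs.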

# Some frequently used ops are generally safe and we can skip them to reduce
# overhead. NOTE: This list is compiled by observing operations called by
# models in practice and is not a comprehensive list of safe operations.
SAFE_OPS = (
    b"Concat",
    b"ConcatV2",
    b"ExpandDims",
    b"Fill",
    b"Gather",
    b"Maximum",
    b"Minimum",
    b"Reshape",
    b"Slice",
    b"Squeeze",
    b"Stack",
    b"StridedSlice",
    b"StridedSliceGrad",
    b"TensorListConcatV2",
    b"TensorListGather",
    b"TensorListGetItem",
    b"TensorListPopBack",
    b"TensorListStack",
    b"Transpose",
    b"Unpack",
)

_state = threading.local()

_check_numerics_callback_create_counter = monitoring.Counter(
    "/tensorflow/api/python/debugging/check_numerics_callback_create_counter",
    "Counter for number of times the check_numerics op callback is created.")


def limit_string_length(string, max_len=50):
  """Limit the length of input string.

  Args:
    string: Input string.
    max_len: (int or None) If int, the length limit. If None, no limit.

  Returns:
    Possibly length-limited string.
  """
  if max_len is None or len(string) <= max_len:
    return string
  else:
    return "..." + string[len(string) - max_len:]


# A dictionary that supports looking up the original input tensor names.
_CHECK_NUMERICS_INPUT_LOOKUP = collections.defaultdict(dict)
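# It maps a graph object to a dict mapping the name of a check_numerics
# output tensor back to the original instrumented tensor; populated in
# CheckNumericsCallback.callback() below.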


def _maybe_lookup_original_input_tensor(graph, tensor):
  if (graph and
      graph in _CHECK_NUMERICS_INPUT_LOOKUP and
      tensor.name in _CHECK_NUMERICS_INPUT_LOOKUP[graph]):
    return _CHECK_NUMERICS_INPUT_LOOKUP[graph][tensor.name]
  else:
    return tensor


def get_check_numerics_error_message(slot,
                                     num_outputs,
                                     op_type,
                                     tensor,
                                     inputs,
                                     graph=None,
                                     traceback=None,
                                     stack_height_limit=30,
                                     path_length_limit=50):
  """Create a meaningful and user-friendly error message about offending tensor.

  The error message reveals the following info about the op that outputs
  NaN/Infinity: dtype, shape (to the extent known at graph-construction time),
  input tensors, and the stack trace of the op's creation (in graph mode).

  Args:
    slot: (int) slot index of the tensor output.
    num_outputs: (int) total number of outputs of the op.
    op_type: (str) Type of the op that generates `tensor`.
    tensor: (Tensor) the offending tensor, i.e., the tensor that contains
      Infinities or NaNs.
    inputs: (array of Tensor) inputs to the op that generates `tensor`.
    graph: (tf.Graph) the graph object that `tensor` belongs to. Available only
      under graph mode.
    traceback: (list of trace frames) the stack trace of the op's creation.
      Available only under graph mode.
    stack_height_limit: (int or None) If int, limit to the height of the stack
      trace printed in the error message. If None, no limit to the height.
    path_length_limit: (int or None) Length limit for file paths included in
      the formatted stack trace.

  Returns:
    (str) A formatted error message.
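
    An illustrative, hand-constructed example for an eagerly executed op
    whose output contains a single NaN:

      !!! Detected Infinity or NaN in output 0 of eagerly-executing op
      "Sqrt" (# of outputs: 1) !!!
        dtype: <dtype: 'float64'>
        shape: (2, 2)
        # of NaN elements: 1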
157 """
158 eager_vs_graph_qualifier = "graph" if graph else "eagerly-executing"
159 message = "\n"
160 message += (
161 "\n!!! Detected Infinity or NaN in output %d of "
162 "%s op \"%s\" (# of outputs: %d) !!!\n" %
163 (slot, eager_vs_graph_qualifier, op_type, num_outputs))
165 message += " dtype: %s\n" % tensor.dtype
166 message += " shape: %s\n" % (tensor.shape,)
168 if not graph:
169 # This is an eager tensor. We can get its numpy value and count
170 # NaNs and Infs.
171 is_inf = np.isinf(tensor)
173 num_neg_inf = np.sum(np.logical_and(np.less(tensor, 0.), is_inf))
174 num_pos_inf = np.sum(np.logical_and(np.greater(tensor, 0.), is_inf))
175 num_nan = np.sum(np.isnan(tensor))
176 if num_neg_inf > 0:
177 message += " # of -Inf elements: %s\n" % num_neg_inf
178 if num_pos_inf > 0:
179 message += " # of +Inf elements: %s\n" % num_pos_inf
180 if num_nan:
      message += "  # of NaN elements: %s\n" % num_nan

  if len(inputs) > 1:
    message += "\n  Input tensors (%d):\n" % len(inputs)
    for slot, input_tensor in enumerate(inputs):
      message += "    %d: %s\n" % (
          slot, _maybe_lookup_original_input_tensor(graph, input_tensor))
  elif len(inputs) == 1:
    message += "\n  Input tensor: %s\n" % (
        _maybe_lookup_original_input_tensor(graph, inputs[0]))
  if graph and hasattr(graph, "name") and graph.name:
    message += "  Graph name: \"%s\"\n" % graph.name

  # Format the stack trace for the op's creation. We omit files that
  # belong to tensorflow itself.
  if graph and traceback:
    message += (
        "\n  Stack trace of op's creation (\"->\": inferred user code):\n")
    if stack_height_limit is not None and len(traceback) > stack_height_limit:
      num_omitted_frames = len(traceback) - stack_height_limit
      message += "    + ... (Omitted %d frames)\n" % num_omitted_frames
    for filepath, lineno, function_name, source_line in traceback[
        -stack_height_limit:]:
      user_code_indicator = "    "
      if not source_utils.guess_is_tensorflow_py_library(filepath):
        user_code_indicator = " -> "

      message += "    + %s (L%d) %s\n" % (
          limit_string_length(filepath, path_length_limit), lineno,
          function_name)
      if source_line is not None:
        message += "%s| %s\n" % (user_code_indicator, source_line)
  message += "\n"
  return message


def _debug_summary(x):
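  # Reduce `x` to a small summary tensor instead of checking every element
  # inline. With REDUCE_INF_NAN_THREE_SLOTS the summary is, per the
  # TensorDebugMode description in debug_event.proto (an assumption worth
  # verifying), a float32 tensor of shape [3] whose slots signal the presence
  # of -Inf, +Inf, and NaN in `x`, respectively.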
  return gen_debug_ops.debug_numeric_summary_v2(
      x,
      tensor_debug_mode=(
          debug_event_pb2.TensorDebugMode.REDUCE_INF_NAN_THREE_SLOTS))


class CheckNumericsCallback(object):
  """Wrapper for the numerics-checking callback for thread locality."""

  def __init__(self, stack_height_limit, path_length_limit):
    self._stack_height_limit = stack_height_limit
    self._path_length_limit = path_length_limit
    # A dict mapping Placeholder tensors to their instrumenting debug tensors.
    # Used only under V1 graph mode, where we can't rely on auto control
    # dependency to execute the debug tensors and hence need to attach the
    # debug tensors as control dependencies of the ops that consume the
    # Placeholder.
    self._placeholder_to_debug_tensor = (
        object_identity.ObjectIdentityDictionary())

  def callback(self,
               op_type,
               inputs,
               attrs,
               outputs,
               op_name=None,
               graph=None):
    """Eager-function unified callback for checking numerics."""
    del attrs, op_name  # Unused
    op_type_bytes = compat.as_bytes(op_type)
    is_v1_graph_mode = not ops.executing_eagerly_outside_functions()
    if (op_type_bytes in op_callbacks_common.OP_CALLBACK_SKIP_OPS or
        op_type_bytes in SAFE_OPS):
      return None
    if graph:
      # Under graph mode. Insert check_numerics op.
      instrumented_outputs = []
      if is_v1_graph_mode:
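        # In V1 graph mode there is no automatic control dependency, so make
        # the consuming op depend explicitly on the debug tensors previously
        # recorded for any Placeholder inputs it uses (see __init__ above).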
        for input_tensor in inputs:
          if input_tensor in self._placeholder_to_debug_tensor and outputs:
            outputs[0].op._add_control_input(  # pylint: disable=protected-access
                self._placeholder_to_debug_tensor[input_tensor].op)
      for slot, output in enumerate(outputs):
        if (output.dtype.is_floating and
            (op_type_bytes, slot) not in IGNORE_OP_OUTPUTS):
          checked_output = array_ops.check_numerics_v2(
              # TF v2 has automatic control dependencies added to stateful
              # async ops, which allows us to run check_numerics
              # asynchronously. In that case we use debug_summary to reduce
              # all output tensors asynchronously from the op being checked
              # and then process the tensor summary with check_numerics.
              output if is_v1_graph_mode else _debug_summary(output),
              get_check_numerics_error_message(
                  slot,
                  len(outputs),
                  op_type,
                  output,
                  inputs,
                  graph=graph,
                  traceback=output.op.traceback,
                  stack_height_limit=self._stack_height_limit,
                  path_length_limit=self._path_length_limit))
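          # Remember the original output tensor so that error messages can
          # refer to it instead of the inserted CheckNumericsV2 tensor (see
          # _maybe_lookup_original_input_tensor above).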
          _CHECK_NUMERICS_INPUT_LOOKUP[graph][checked_output.name] = output
          instrumented_outputs.append(self._get_output_tensor(
              op_type_bytes, output, checked_output, is_v1_graph_mode))
        else:
          instrumented_outputs.append(output)
      return instrumented_outputs
    else:
      if op_type_bytes == b"CheckNumericsV2":
        # TODO(b/140334369): Remove this special casing logic once op_callback
        # automatically prevents infinite recursion in eager mode.
        return None
      # Under eager mode. Eagerly execute check_numerics op.
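      # check_numerics_v2 raises an InvalidArgumentError as soon as the
      # checked tensor contains Inf or NaN; nothing is returned here, so the
      # op's outputs are left unmodified.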
      for slot, output in enumerate(outputs):
        if (output.dtype.is_floating and
            (op_type_bytes, slot) not in IGNORE_OP_OUTPUTS):
          array_ops.check_numerics_v2(
              output,
              get_check_numerics_error_message(
                  slot, len(outputs), op_type, output, inputs,
                  stack_height_limit=self._stack_height_limit,
                  path_length_limit=self._path_length_limit))

  def _get_output_tensor(self,
                         op_type,
                         tensor,
                         checked_tensor,
                         is_v1_graph_mode):
    """Determine what tensor to output from callback.

    Args:
      op_type: Type of the op that outputs the original symbolic tensor, as
        `bytes`.
      tensor: The original output symbolic tensor.
      checked_tensor: The debugger-instrumented, numerics-checking tensor.
      is_v1_graph_mode: Whether the debugged program is running under V1 graph
        mode.

    Returns:
      A symbolic tensor to be returned by the dumping op_callback.
    """
    if is_v1_graph_mode:
      # Placeholders need special treatment under V1 graph mode. The
      # callback can't simply override the Placeholder tensor with the debug
      # tensor, as that would cause the Placeholder op to lack a value.
      # The debug tensor is remembered and will be attached as control
      # inputs to ops that consume the Placeholders later.
      if op_type == b"Placeholder":
        self._placeholder_to_debug_tensor[tensor] = checked_tensor
        return tensor
      else:
        return checked_tensor
    else:
      # Under non-v1 graph mode, rely on auto control dependency to run the
      # checked tensor.
      return tensor


@tf_export("debugging.enable_check_numerics")
def enable_check_numerics(stack_height_limit=30,
                          path_length_limit=50):
  r"""Enable tensor numerics checking in an eager/graph unified fashion.

  The numerics checking mechanism will cause any TensorFlow eager execution or
  graph execution to error out as soon as an op's output tensor contains
  infinity or NaN.

  This method is idempotent. Calling it multiple times has the same effect
  as calling it once.

  This method takes effect only on the thread in which it is called.

  When an op's float-type output tensor contains any Infinity or NaN, a
  `tf.errors.InvalidArgumentError` will be thrown, with an error message that
  reveals the following information:
  - The type of the op that generated the tensor with bad numerics.
  - Data type (dtype) of the tensor.
  - Shape of the tensor (to the extent known at the time of eager execution
    or graph construction).
  - Name of the containing graph (if available).
  - (Graph mode only): The stack trace of the intra-graph op's creation,
    with a stack-height limit and a path-length limit for visual clarity.
    The stack frames that belong to the user's code (as opposed to
    tensorflow's internal code) are highlighted with a text arrow ("->").
  - (Eager mode only): How many of the offending tensor's elements are
    `Infinity` and `NaN`, respectively.

  Once enabled, the check-numerics mechanism can be disabled by using
  `tf.debugging.disable_check_numerics()`.

  Example usage:

  1. Catching infinity during the execution of a `tf.function` graph:

     ```py
     import tensorflow as tf

     tf.debugging.enable_check_numerics()

     @tf.function
     def square_log_x_plus_1(x):
       v = tf.math.log(x + 1)
       return tf.math.square(v)

     x = -1.0

     # When the following line runs, a function graph will be compiled
     # from the Python function `square_log_x_plus_1()`. Due to the
     # `enable_check_numerics()` call above, the graph will contain
     # numerics checking ops that will run during the function graph's
     # execution. The function call generates a negative infinity (-Inf)
     # when the Log (logarithm) op operates on the output tensor of the Add op.
     # The program errors out at this line, printing an error message.
     y = square_log_x_plus_1(x)
     z = -y
     ```

  2. Catching NaN during eager execution:

     ```py
     import numpy as np
     import tensorflow as tf

     tf.debugging.enable_check_numerics()

     x = np.array([[0.0, -1.0], [4.0, 3.0]])

     # The following line executes the Sqrt op eagerly. Due to the negative
     # element in the input array, a NaN is generated. Due to the
     # `enable_check_numerics()` call above, the program errors immediately
     # at this line, printing an error message.
     y = tf.math.sqrt(x)
     z = tf.matmul(y, y)
     ```

  NOTE: If your code is running on TPUs, be sure to call
  `tf.config.set_soft_device_placement(True)` before calling
  `tf.debugging.enable_check_numerics()` as this API uses automatic outside
  compilation on TPUs. For example:

  ```py
  tf.config.set_soft_device_placement(True)
  tf.debugging.enable_check_numerics()

  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
  strategy = tf.distribute.TPUStrategy(resolver)
  with strategy.scope():
    # ...
  ```

  Args:
    stack_height_limit: Limit to the height of the printed stack trace.
      Applicable only to ops in `tf.function`s (graphs).
    path_length_limit: Limit to the file path included in the printed stack
      trace. Applicable only to ops in `tf.function`s (graphs).
  """
  if not hasattr(_state, "check_numerics_callback"):
    _state.check_numerics_callback = CheckNumericsCallback(
        stack_height_limit, path_length_limit)
  op_callbacks.add_op_callback(_state.check_numerics_callback.callback)

  logging.info(
      "Enabled check-numerics callback in thread %s",
      threading.current_thread().name)
  _check_numerics_callback_create_counter.get_cell().increase_by(1)


@tf_export("debugging.disable_check_numerics")
def disable_check_numerics():
  """Disable the eager/graph unified numerics checking mechanism.

  This method can be used after a call to
  `tf.debugging.enable_check_numerics()` to disable the numerics-checking
  mechanism that catches infinity and NaN values output by ops executed
  eagerly or in tf.function-compiled graphs.

  This method is idempotent. Calling it multiple times has the same effect
  as calling it once.

  This method takes effect only on the thread in which it is called.
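
  Example usage (illustrative):

  ```py
  import tensorflow as tf

  tf.debugging.enable_check_numerics()
  # ... run TensorFlow computations with numerics checking enabled ...
  tf.debugging.disable_check_numerics()
  # Numerics checking is now off for subsequent ops on this thread.
  ```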
457 """
458 if not hasattr(_state, "check_numerics_callback"):
459 return
460 try:
461 op_callbacks.remove_op_callback(_state.check_numerics_callback.callback)
462 delattr(_state, "check_numerics_callback")
463 logging.info(
464 "Disabled check-numerics callback in thread %s",
465 threading.current_thread().name)
466 except KeyError:
467 # Tolerate disabling the check numerics callback without
468 # enable_check_numerics() being called first.
469 pass