
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Eager-graph unified check numerics callback."""

import collections
import threading

import numpy as np

from tensorflow.core.protobuf import debug_event_pb2
from tensorflow.python.debug.lib import op_callbacks_common
from tensorflow.python.debug.lib import source_utils
from tensorflow.python.eager import monitoring
from tensorflow.python.framework import op_callbacks
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gen_debug_ops
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util import compat
from tensorflow.python.util import object_identity
from tensorflow.python.util.tf_export import tf_export


# Many ops have benign NaN outputs, and running them with check_numerics
# on will create unwanted errors.
# TODO(b/142497024): Replace this allowlist with function decorators in the ops
IGNORE_OP_OUTPUTS = (
    # For FusedBatchNorm, if the input tensor is empty then batch_mean and
    # batch_variance will be NaN. reserve_space holds intermediate values
    # derived from batch_mean and batch_variance used for gradient calculation.
    (b"FusedBatchNorm", 1),  # batch_mean
    (b"FusedBatchNorm", 2),  # batch_variance
    (b"FusedBatchNorm", 3),  # reserve_space_1
    (b"FusedBatchNorm", 4),  # reserve_space_2

    # Same as above.
    (b"FusedBatchNormV2", 1),  # batch_mean
    (b"FusedBatchNormV2", 2),  # batch_variance
    (b"FusedBatchNormV2", 3),  # reserve_space_1
    (b"FusedBatchNormV2", 4),  # reserve_space_2

    # Same as above, but reserve_space_3 holds additional intermediate values.
    (b"FusedBatchNormV3", 1),  # batch_mean
    (b"FusedBatchNormV3", 2),  # batch_variance
    (b"FusedBatchNormV3", 3),  # reserve_space_1
    (b"FusedBatchNormV3", 4),  # reserve_space_2
    (b"FusedBatchNormV3", 5),  # reserve_space_3
)

# Some frequently used ops are generally safe, and we can skip them to reduce
# overhead. NOTE: This list is compiled by observing operations called by
# models in practice and is not a comprehensive list of safe operations.
SAFE_OPS = (
    b"Concat",
    b"ConcatV2",
    b"ExpandDims",
    b"Fill",
    b"Gather",
    b"Maximum",
    b"Minimum",
    b"Reshape",
    b"Slice",
    b"Squeeze",
    b"Stack",
    b"StridedSlice",
    b"StridedSliceGrad",
    b"TensorListConcatV2",
    b"TensorListGather",
    b"TensorListGetItem",
    b"TensorListPopBack",
    b"TensorListStack",
    b"Transpose",
    b"Unpack",
)

_state = threading.local()

_check_numerics_callback_create_counter = monitoring.Counter(
    "/tensorflow/api/python/debugging/check_numerics_callback_create_counter",
    "Counter for number of times the check_numerics op callback is created.")


def limit_string_length(string, max_len=50):
  """Limit the length of input string.

  Args:
    string: Input string.
    max_len: (int or None) If int, the length limit. If None, no limit.

  Returns:
    Possibly length-limited string.
  """
  if max_len is None or len(string) <= max_len:
    return string
  else:
    return "..." + string[len(string) - max_len:]
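
# For illustration only (a hypothetical call, not used in this module):
#   limit_string_length("/long/path/to/module.py", max_len=9)
# returns "...module.py", i.e. the last `max_len` characters of the input,
# prefixed with "...".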



# A dictionary that supports looking up the original input tensor names.
_CHECK_NUMERICS_INPUT_LOOKUP = collections.defaultdict(dict)


def _maybe_lookup_original_input_tensor(graph, tensor):
  if (graph and
      graph in _CHECK_NUMERICS_INPUT_LOOKUP and
      tensor.name in _CHECK_NUMERICS_INPUT_LOOKUP[graph]):
    return _CHECK_NUMERICS_INPUT_LOOKUP[graph][tensor.name]
  else:
    return tensor
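
# Illustrative note: under graph mode, the callback below rewires an op's
# output through a CheckNumericsV2 op and records the mapping
#   _CHECK_NUMERICS_INPUT_LOOKUP[graph][checked_output.name] = output
# so that this helper can report the original (user-visible) tensor in error
# messages instead of the instrumented one.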



def get_check_numerics_error_message(slot,
                                     num_outputs,
                                     op_type,
                                     tensor,
                                     inputs,
                                     graph=None,
                                     traceback=None,
                                     stack_height_limit=30,
                                     path_length_limit=50):
  """Create a meaningful and user-friendly error message about offending tensor.

  The error message reveals the following info about the op that outputs
  NaN/Infinity: dtype, shape (to the extent known at graph-construction time),
  input tensors, and the stack trace of the op's creation (in graph mode).

  Args:
    slot: (int) slot index of the tensor output.
    num_outputs: (int) total number of outputs of the op.
    op_type: (str) Type of the op that generates `tensor`.
    tensor: (Tensor) the offending tensor, i.e., the tensor that contains
      Infinities or NaNs.
    inputs: (array of Tensor) inputs to the op that generates `tensor`.
    graph: (tf.Graph) the graph object that `tensor` belongs to. Available only
      under graph mode.
    traceback: (list of trace frames) the stack trace of the op's creation.
      Available only under graph mode.
    stack_height_limit: (int or None) If int, limit to the height of the stack
      trace printed in the error message. If None, no limit to the height.
    path_length_limit: (int or None) Length limit for file paths included in
      the formatted stack trace.

  Returns:
    (str) A formatted error message.
  """
  eager_vs_graph_qualifier = "graph" if graph else "eagerly-executing"
  message = "\n"
  message += (
      "\n!!! Detected Infinity or NaN in output %d of "
      "%s op \"%s\" (# of outputs: %d) !!!\n" %
      (slot, eager_vs_graph_qualifier, op_type, num_outputs))

  message += "  dtype: %s\n" % tensor.dtype
  message += "  shape: %s\n" % (tensor.shape,)

  if not graph:
    # This is an eager tensor. We can get its numpy value and count
    # NaNs and Infs.
    is_inf = np.isinf(tensor)

    num_neg_inf = np.sum(np.logical_and(np.less(tensor, 0.), is_inf))
    num_pos_inf = np.sum(np.logical_and(np.greater(tensor, 0.), is_inf))
    num_nan = np.sum(np.isnan(tensor))
    if num_neg_inf > 0:
      message += "  # of -Inf elements: %s\n" % num_neg_inf
    if num_pos_inf > 0:
      message += "  # of +Inf elements: %s\n" % num_pos_inf
    if num_nan > 0:
      message += "  # of NaN elements: %s\n" % num_nan

  if len(inputs) > 1:
    message += "\n  Input tensors (%d):\n" % len(inputs)
    for slot, input_tensor in enumerate(inputs):
      message += "    %d: %s\n" % (
          slot, _maybe_lookup_original_input_tensor(graph, input_tensor))
  elif len(inputs) == 1:
    message += "\n  Input tensor: %s\n" % (
        _maybe_lookup_original_input_tensor(graph, inputs[0]))
  if graph and hasattr(graph, "name") and graph.name:
    message += "  Graph name: \"%s\"\n" % graph.name

  # Format the stack trace of the op's creation. We omit files that
  # belong to tensorflow itself.
  if graph and traceback:
    message += (
        "\n  Stack trace of op's creation (\"->\": inferred user code):\n")
    if stack_height_limit is not None and len(traceback) > stack_height_limit:
      num_omitted_frames = len(traceback) - stack_height_limit
      message += "    + ... (Omitted %d frames)\n" % num_omitted_frames
      frames = traceback[-stack_height_limit:]
    else:
      # stack_height_limit is None (no limit) or the trace already fits:
      # include all frames. (Slicing with -None would raise a TypeError.)
      frames = traceback
    for filepath, lineno, function_name, source_line in frames:
      user_code_indicator = "    "
      if not source_utils.guess_is_tensorflow_py_library(filepath):
        user_code_indicator = " -> "

      message += "    + %s (L%d) %s\n" % (
          limit_string_length(filepath, path_length_limit), lineno,
          function_name)
      if source_line is not None:
        message += "    %s| %s\n" % (user_code_indicator, source_line)
  message += "\n"
  return message
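
# For reference, a rendered message looks roughly like the following (values
# are illustrative, and the first line is wrapped here for width):
#
#   !!! Detected Infinity or NaN in output 0 of eagerly-executing
#   op "Log" (# of outputs: 1) !!!
#     dtype: <dtype: 'float32'>
#     shape: (2, 2)
#     # of -Inf elements: 1
#
#     Input tensor: tf.Tensor(...)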



def _debug_summary(x):
  return gen_debug_ops.debug_numeric_summary_v2(
      x,
      tensor_debug_mode=(
          debug_event_pb2.TensorDebugMode.REDUCE_INF_NAN_THREE_SLOTS))
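
# Note (an aide based on the mode's name and its use in the callback below;
# see debug_event_pb2 for the authoritative definition):
# REDUCE_INF_NAN_THREE_SLOTS reduces `x` to a small three-slot summary whose
# slots signal the presence of -Inf, +Inf, and NaN in `x`, so the downstream
# check_numerics op can inspect the cheap summary rather than the full tensor.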



class CheckNumericsCallback(object):
  """Wrapper for the numerics-checking callback for thread locality."""

  def __init__(self, stack_height_limit, path_length_limit):
    self._stack_height_limit = stack_height_limit
    self._path_length_limit = path_length_limit
    # A dict mapping Placeholder tensors to their instrumenting debug tensors.
    # Used only under V1 graph mode, where we can't rely on auto control
    # dependencies to execute the debug tensors, and hence need to attach the
    # debug tensors as control dependencies of the ops that consume the
    # Placeholder.
    self._placeholder_to_debug_tensor = (
        object_identity.ObjectIdentityDictionary())

  def callback(self,
               op_type,
               inputs,
               attrs,
               outputs,
               op_name=None,
               graph=None):
    """Eager-function unified callback for checking numerics."""
    del attrs, op_name  # Unused
    op_type_bytes = compat.as_bytes(op_type)
    is_v1_graph_mode = not ops.executing_eagerly_outside_functions()
    if (op_type_bytes in op_callbacks_common.OP_CALLBACK_SKIP_OPS or
        op_type_bytes in SAFE_OPS):
      return None
    if graph:
      # Under graph mode. Insert check_numerics op.
      instrumented_outputs = []
      if is_v1_graph_mode:
        for input_tensor in inputs:
          if input_tensor in self._placeholder_to_debug_tensor and outputs:
            outputs[0].op._add_control_input(  # pylint: disable=protected-access
                self._placeholder_to_debug_tensor[input_tensor].op)
      for slot, output in enumerate(outputs):
        if (output.dtype.is_floating and
            (op_type_bytes, slot) not in IGNORE_OP_OUTPUTS):
          checked_output = array_ops.check_numerics_v2(
              # TF v2 adds automatic control dependencies to stateful async
              # ops, which allows us to run check_numerics asynchronously.
              # In that case we use debug_summary to reduce each output
              # tensor asynchronously from the op being checked and then
              # process the tensor summary with check_numerics.
              output if is_v1_graph_mode else _debug_summary(output),
              get_check_numerics_error_message(
                  slot,
                  len(outputs),
                  op_type,
                  output,
                  inputs,
                  graph=graph,
                  traceback=output.op.traceback,
                  stack_height_limit=self._stack_height_limit,
                  path_length_limit=self._path_length_limit))
          _CHECK_NUMERICS_INPUT_LOOKUP[graph][checked_output.name] = output
          instrumented_outputs.append(self._get_output_tensor(
              op_type_bytes, output, checked_output, is_v1_graph_mode))
        else:
          instrumented_outputs.append(output)
      return instrumented_outputs
    else:
      if op_type_bytes == b"CheckNumericsV2":
        # TODO(b/140334369): Remove this special-casing logic once op_callback
        # automatically prevents infinite recursion in eager mode.
        return None
      # Under eager mode. Eagerly execute check_numerics op.
      for slot, output in enumerate(outputs):
        if (output.dtype.is_floating and
            (op_type_bytes, slot) not in IGNORE_OP_OUTPUTS):
          array_ops.check_numerics_v2(
              output,
              get_check_numerics_error_message(
                  slot, len(outputs), op_type, output, inputs,
                  stack_height_limit=self._stack_height_limit,
                  path_length_limit=self._path_length_limit))

  def _get_output_tensor(self,
                         op_type,
                         tensor,
                         checked_tensor,
                         is_v1_graph_mode):
    """Determine what tensor to output from callback.

    Args:
      op_type: Type of the op that outputs the original symbolic tensor, as
        `bytes`.
      tensor: The original output symbolic tensor.
      checked_tensor: The debugger-instrumented, numerics-checking tensor.
      is_v1_graph_mode: Whether the debugged program is running under V1 graph
        mode.

    Returns:
      A symbolic tensor to be returned by the dumping op_callback.
    """
    if is_v1_graph_mode:
      # Placeholders need special treatment under V1 graph mode. The
      # callback can't simply override the Placeholder tensor with the debug
      # tensor, as that would cause the Placeholder op to lack a value.
      # The debug tensor is remembered and will be attached as control
      # inputs to ops that consume the Placeholders later.
      if op_type == b"Placeholder":
        self._placeholder_to_debug_tensor[tensor] = checked_tensor
        return tensor
      else:
        return checked_tensor
    else:
      # Under non-v1 graph mode, rely on auto control dependencies to run the
      # checked tensor.
      return tensor
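
# Protocol note: as exercised by the callback above, an op callback may return
# replacement output tensors (here, the instrumented outputs under graph mode)
# or None to leave the op's outputs unchanged.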



@tf_export("debugging.enable_check_numerics")
def enable_check_numerics(stack_height_limit=30,
                          path_length_limit=50):
  r"""Enable tensor numerics checking in an eager/graph unified fashion.

  The numerics checking mechanism will cause any TensorFlow eager execution or
  graph execution to error out as soon as an op's output tensor contains
  infinity or NaN.

  This method is idempotent. Calling it multiple times has the same effect
  as calling it once.

  This method takes effect only on the thread in which it is called.

  When an op's float-type output tensor contains any Infinity or NaN, a
  `tf.errors.InvalidArgumentError` will be thrown, with an error message that
  reveals the following information:
    - The type of the op that generated the tensor with bad numerics.
    - Data type (dtype) of the tensor.
    - Shape of the tensor (to the extent known at the time of eager execution
      or graph construction).
    - Name of the containing graph (if available).
    - (Graph mode only): The stack trace of the intra-graph op's creation,
      with a stack-height limit and a path-length limit for visual clarity.
      The stack frames that belong to the user's code (as opposed to
      tensorflow's internal code) are highlighted with a text arrow ("->").
    - (Eager mode only): How many of the offending tensor's elements are
      `Infinity` and `NaN`, respectively.

  Once enabled, the check-numerics mechanism can be disabled by using
  `tf.debugging.disable_check_numerics()`.

  Example usage:

  1. Catching infinity during the execution of a `tf.function` graph:

     ```py
     import tensorflow as tf

     tf.debugging.enable_check_numerics()

     @tf.function
     def square_log_x_plus_1(x):
       v = tf.math.log(x + 1)
       return tf.math.square(v)

     x = -1.0

     # When the following line runs, a function graph will be compiled
     # from the Python function `square_log_x_plus_1()`. Due to the
     # `enable_check_numerics()` call above, the graph will contain
     # numerics checking ops that will run during the function graph's
     # execution. The function call generates -infinity when the Log
     # (logarithm) op operates on the output tensor of the Add op.
     # The program errors out at this line, printing an error message.
     y = square_log_x_plus_1(x)
     z = -y
     ```

  2. Catching NaN during eager execution:

     ```py
     import numpy as np
     import tensorflow as tf

     tf.debugging.enable_check_numerics()

     x = np.array([[0.0, -1.0], [4.0, 3.0]])

     # The following line executes the Sqrt op eagerly. Due to the negative
     # element in the input array, a NaN is generated. Due to the
     # `enable_check_numerics()` call above, the program errors immediately
     # at this line, printing an error message.
     y = tf.math.sqrt(x)
     z = tf.matmul(y, y)
     ```

  NOTE: If your code is running on TPUs, be sure to call
  `tf.config.set_soft_device_placement(True)` before calling
  `tf.debugging.enable_check_numerics()` as this API uses automatic outside
  compilation on TPUs. For example:

  ```py
  tf.config.set_soft_device_placement(True)
  tf.debugging.enable_check_numerics()

  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
  strategy = tf.distribute.TPUStrategy(resolver)
  with strategy.scope():
    # ...
  ```

  Args:
    stack_height_limit: Limit to the height of the printed stack trace.
      Applicable only to ops in `tf.function`s (graphs).
    path_length_limit: Limit to the file path included in the printed stack
      trace. Applicable only to ops in `tf.function`s (graphs).
  """
  if not hasattr(_state, "check_numerics_callback"):
    _state.check_numerics_callback = CheckNumericsCallback(
        stack_height_limit, path_length_limit)
  op_callbacks.add_op_callback(_state.check_numerics_callback.callback)

  logging.info(
      "Enabled check-numerics callback in thread %s",
      threading.current_thread().name)
  _check_numerics_callback_create_counter.get_cell().increase_by(1)



@tf_export("debugging.disable_check_numerics")
def disable_check_numerics():
  """Disable the eager/graph unified numerics checking mechanism.

  This method can be used after a call to
  `tf.debugging.enable_check_numerics()` to disable the numerics-checking
  mechanism that catches infinity and NaN values output by ops executed
  eagerly or in tf.function-compiled graphs.

  This method is idempotent. Calling it multiple times has the same effect
  as calling it once.

  This method takes effect only on the thread in which it is called.
  """
  if not hasattr(_state, "check_numerics_callback"):
    return
  try:
    op_callbacks.remove_op_callback(_state.check_numerics_callback.callback)
    delattr(_state, "check_numerics_callback")
    logging.info(
        "Disabled check-numerics callback in thread %s",
        threading.current_thread().name)
  except KeyError:
    # Tolerate disabling the check-numerics callback without
    # enable_check_numerics() being called first.
    pass
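
# Typical usage pattern (illustrative; `run_model_step` is a hypothetical
# user-defined function, not part of this module):
#
#   tf.debugging.enable_check_numerics()
#   try:
#     run_model_step()
#   finally:
#     tf.debugging.disable_check_numerics()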