
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Eager-graph unified check numerics callback."""

import collections
import threading

import numpy as np

from tensorflow.core.protobuf import debug_event_pb2
from tensorflow.python.debug.lib import op_callbacks_common
from tensorflow.python.debug.lib import source_utils
from tensorflow.python.eager import monitoring
from tensorflow.python.framework import op_callbacks
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gen_debug_ops
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util import compat
from tensorflow.python.util import object_identity
from tensorflow.python.util.tf_export import tf_export


# Many ops have benign NaN outputs, and running them with check_numerics
# on will create unwanted errors.
# TODO(b/142497024): Replace this allowlist with function decorators in the ops
IGNORE_OP_OUTPUTS = (
    # For FusedBatchNorm, if the input tensor is empty then batch_mean and
    # batch_variance will be NaN. reserve_space holds intermediate values
    # derived from batch_mean and batch_variance used for gradient calculation.
    (b"FusedBatchNorm", 1),  # batch_mean
    (b"FusedBatchNorm", 2),  # batch_variance
    (b"FusedBatchNorm", 3),  # reserve_space_1
    (b"FusedBatchNorm", 4),  # reserve_space_2

    # Same as above.
    (b"FusedBatchNormV2", 1),  # batch_mean
    (b"FusedBatchNormV2", 2),  # batch_variance
    (b"FusedBatchNormV2", 3),  # reserve_space_1
    (b"FusedBatchNormV2", 4),  # reserve_space_2

    # Same as above, but reserve_space_3 holds additional intermediate values.
    (b"FusedBatchNormV3", 1),  # batch_mean
    (b"FusedBatchNormV3", 2),  # batch_variance
    (b"FusedBatchNormV3", 3),  # reserve_space_1
    (b"FusedBatchNormV3", 4),  # reserve_space_2
    (b"FusedBatchNormV3", 5),  # reserve_space_3
)

# Some frequently used ops are generally safe, and we can skip them to reduce
# overhead. NOTE: This list is compiled by observing operations called by
# models in practice and is not a comprehensive list of safe operations.
SAFE_OPS = (
    b"Concat",
    b"ConcatV2",
    b"ExpandDims",
    b"Fill",
    b"Gather",
    b"Maximum",
    b"Minimum",
    b"Reshape",
    b"Slice",
    b"Squeeze",
    b"Stack",
    b"StridedSlice",
    b"StridedSliceGrad",
    b"TensorListConcatV2",
    b"TensorListGather",
    b"TensorListGetItem",
    b"TensorListPopBack",
    b"TensorListStack",
    b"Transpose",
    b"Unpack",
)

_state = threading.local()

_check_numerics_callback_create_counter = monitoring.Counter(
    "/tensorflow/api/python/debugging/check_numerics_callback_create_counter",
    "Counter for number of times the check_numerics op callback is created.")


def limit_string_length(string, max_len=50):
  """Limit the length of input string.

  Args:
    string: Input string.
    max_len: (int or None) If int, the length limit. If None, no limit.

  Returns:
    Possibly length-limited string.
  """
  if max_len is None or len(string) <= max_len:
    return string
  else:
    return "..." + string[len(string) - max_len:]
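
# For illustration only (a hypothetical call, not used in this module):
#   limit_string_length("/long/path/to/module.py", max_len=9)
# returns "...module.py", i.e. the last `max_len` characters of the input,
# prefixed with "...".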



# A dictionary that supports looking up the original input tensor names.
_CHECK_NUMERICS_INPUT_LOOKUP = collections.defaultdict(dict)


def _maybe_lookup_original_input_tensor(graph, tensor):
  if (graph and
      graph in _CHECK_NUMERICS_INPUT_LOOKUP and
      tensor.name in _CHECK_NUMERICS_INPUT_LOOKUP[graph]):
    return _CHECK_NUMERICS_INPUT_LOOKUP[graph][tensor.name]
  else:
    return tensor
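
# Illustrative note: under graph mode, the callback below rewires an op's
# output through a CheckNumericsV2 op and records the mapping
#   _CHECK_NUMERICS_INPUT_LOOKUP[graph][checked_output.name] = output
# so that this helper can report the original (user-visible) tensor in error
# messages instead of the instrumented one.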



def get_check_numerics_error_message(slot,
                                     num_outputs,
                                     op_type,
                                     tensor,
                                     inputs,
                                     graph=None,
                                     traceback=None,
                                     stack_height_limit=30,
                                     path_length_limit=50):
  """Create a meaningful and user-friendly error message about offending tensor.

  The error message reveals the following info about the op that outputs
  NaN/Infinity: dtype, shape (to the extent known at graph-construction time),
  input tensors, and the stack trace of the op's creation (in graph mode).

  Args:
    slot: (int) slot index of the tensor output.
    num_outputs: (int) total number of outputs of the op.
    op_type: (str) Type of the op that generates `tensor`.
    tensor: (Tensor) the offending tensor, i.e., the tensor that contains
      Infinities or NaNs.
    inputs: (array of Tensor) inputs to the op that generates `tensor`.
    graph: (tf.Graph) the graph object that `tensor` belongs to. Available only
      under graph mode.
    traceback: (list of trace frames) the stack trace of the op's creation.
      Available only under graph mode.
    stack_height_limit: (int or None) If int, limit to the height of the stack
      trace printed in the error message. If None, no limit to the height.
    path_length_limit: (int or None) Length limit for file paths included in
      the formatted stack trace.

  Returns:
    (str) A formatted error message.
  """
  eager_vs_graph_qualifier = "graph" if graph else "eagerly-executing"
  message = "\n"
  message += (
      "\n!!! Detected Infinity or NaN in output %d of "
      "%s op \"%s\" (# of outputs: %d) !!!\n" %
      (slot, eager_vs_graph_qualifier, op_type, num_outputs))

  message += "  dtype: %s\n" % tensor.dtype
  message += "  shape: %s\n" % (tensor.shape,)

  if not graph:
    # This is an eager tensor. We can get its numpy value and count
    # NaNs and Infs.
    is_inf = np.isinf(tensor)

    num_neg_inf = np.sum(np.logical_and(np.less(tensor, 0.), is_inf))
    num_pos_inf = np.sum(np.logical_and(np.greater(tensor, 0.), is_inf))
    num_nan = np.sum(np.isnan(tensor))
    if num_neg_inf > 0:
      message += "  # of -Inf elements: %s\n" % num_neg_inf
    if num_pos_inf > 0:
      message += "  # of +Inf elements: %s\n" % num_pos_inf
    if num_nan > 0:
      message += "  # of NaN elements: %s\n" % num_nan

  if len(inputs) > 1:
    message += "\n  Input tensors (%d):\n" % len(inputs)
    for slot, input_tensor in enumerate(inputs):
      message += "    %d: %s\n" % (
          slot, _maybe_lookup_original_input_tensor(graph, input_tensor))
  elif len(inputs) == 1:
    message += "\n  Input tensor: %s\n" % (
        _maybe_lookup_original_input_tensor(graph, inputs[0]))
  if graph and hasattr(graph, "name") and graph.name:
    message += "  Graph name: \"%s\"\n" % graph.name

  # Format the stack trace of the op's creation. We omit files that
  # belong to tensorflow itself.
  if graph and traceback:
    message += (
        "\n  Stack trace of op's creation (\"->\": inferred user code):\n")
    if stack_height_limit is not None and len(traceback) > stack_height_limit:
      num_omitted_frames = len(traceback) - stack_height_limit
      message += "    + ... (Omitted %d frames)\n" % num_omitted_frames
      frames = traceback[-stack_height_limit:]
    else:
      # stack_height_limit is None (no limit) or the trace already fits:
      # include all frames. (Slicing with -None would raise a TypeError.)
      frames = traceback
    for filepath, lineno, function_name, source_line in frames:
      user_code_indicator = "    "
      if not source_utils.guess_is_tensorflow_py_library(filepath):
        user_code_indicator = " -> "

      message += "    + %s (L%d) %s\n" % (
          limit_string_length(filepath, path_length_limit), lineno,
          function_name)
      if source_line is not None:
        message += "    %s| %s\n" % (user_code_indicator, source_line)
  message += "\n"
  return message
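
# For reference, a rendered message looks roughly like the following (values
# are illustrative, and the first line is wrapped here for width):
#
#   !!! Detected Infinity or NaN in output 0 of eagerly-executing
#   op "Log" (# of outputs: 1) !!!
#     dtype: <dtype: 'float32'>
#     shape: (2, 2)
#     # of -Inf elements: 1
#
#     Input tensor: tf.Tensor(...)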



def _debug_summary(x):
  return gen_debug_ops.debug_numeric_summary_v2(
      x,
      tensor_debug_mode=(
          debug_event_pb2.TensorDebugMode.REDUCE_INF_NAN_THREE_SLOTS))
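
# Note (an aide based on the mode's name and its use in the callback below;
# see debug_event_pb2 for the authoritative definition):
# REDUCE_INF_NAN_THREE_SLOTS reduces `x` to a small three-slot summary whose
# slots signal the presence of -Inf, +Inf, and NaN in `x`, so the downstream
# check_numerics op can inspect the cheap summary rather than the full tensor.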



class CheckNumericsCallback(object):
  """Wrapper for the numerics-checking callback for thread locality."""

  def __init__(self, stack_height_limit, path_length_limit):
    self._stack_height_limit = stack_height_limit
    self._path_length_limit = path_length_limit
    # A dict mapping Placeholder tensors to their instrumenting debug tensors.
    # Used only under V1 graph mode, where we can't rely on auto control
    # dependencies to execute the debug tensors, and hence need to attach the
    # debug tensors as control dependencies of the ops that consume the
    # Placeholder.
    self._placeholder_to_debug_tensor = (
        object_identity.ObjectIdentityDictionary())

  def callback(self,
               op_type,
               inputs,
               attrs,
               outputs,
               op_name=None,
               graph=None):
    """Eager-function unified callback for checking numerics."""
    del attrs, op_name  # Unused
    op_type_bytes = compat.as_bytes(op_type)
    is_v1_graph_mode = not ops.executing_eagerly_outside_functions()
    if (op_type_bytes in op_callbacks_common.OP_CALLBACK_SKIP_OPS or
        op_type_bytes in SAFE_OPS):
      return None
    if graph:
      # Under graph mode. Insert check_numerics op.
      instrumented_outputs = []
      if is_v1_graph_mode:
        for input_tensor in inputs:
          if input_tensor in self._placeholder_to_debug_tensor and outputs:
            outputs[0].op._add_control_input(  # pylint: disable=protected-access
                self._placeholder_to_debug_tensor[input_tensor].op)
      for slot, output in enumerate(outputs):
        if (output.dtype.is_floating and
            (op_type_bytes, slot) not in IGNORE_OP_OUTPUTS):
          checked_output = array_ops.check_numerics_v2(
              # TF v2 adds automatic control dependencies to stateful async
              # ops, which allows us to run check_numerics asynchronously.
              # In that case we use debug_summary to reduce each output
              # tensor asynchronously from the op being checked and then
              # process the tensor summary with check_numerics.
              output if is_v1_graph_mode else _debug_summary(output),
              get_check_numerics_error_message(
                  slot,
                  len(outputs),
                  op_type,
                  output,
                  inputs,
                  graph=graph,
                  traceback=output.op.traceback,
                  stack_height_limit=self._stack_height_limit,
                  path_length_limit=self._path_length_limit))
          _CHECK_NUMERICS_INPUT_LOOKUP[graph][checked_output.name] = output
          instrumented_outputs.append(self._get_output_tensor(
              op_type_bytes, output, checked_output, is_v1_graph_mode))
        else:
          instrumented_outputs.append(output)
      return instrumented_outputs
    else:
      if op_type_bytes == b"CheckNumericsV2":
        # TODO(b/140334369): Remove this special-casing logic once op_callback
        # automatically prevents infinite recursion in eager mode.
        return None
      # Under eager mode. Eagerly execute check_numerics op.
      for slot, output in enumerate(outputs):
        if (output.dtype.is_floating and
            (op_type_bytes, slot) not in IGNORE_OP_OUTPUTS):
          array_ops.check_numerics_v2(
              output,
              get_check_numerics_error_message(
                  slot, len(outputs), op_type, output, inputs,
                  stack_height_limit=self._stack_height_limit,
                  path_length_limit=self._path_length_limit))

  def _get_output_tensor(self,
                         op_type,
                         tensor,
                         checked_tensor,
                         is_v1_graph_mode):
    """Determine what tensor to output from callback.

    Args:
      op_type: Type of the op that outputs the original symbolic tensor, as
        `bytes`.
      tensor: The original output symbolic tensor.
      checked_tensor: The debugger-instrumented, numerics-checking tensor.
      is_v1_graph_mode: Whether the debugged program is running under V1 graph
        mode.

    Returns:
      A symbolic tensor to be returned by the dumping op_callback.
    """
    if is_v1_graph_mode:
      # Placeholders need special treatment under V1 graph mode. The
      # callback can't simply override the Placeholder tensor with the debug
      # tensor, as that would cause the Placeholder op to lack a value.
      # The debug tensor is remembered and will be attached as control
      # inputs to ops that consume the Placeholders later.
      if op_type == b"Placeholder":
        self._placeholder_to_debug_tensor[tensor] = checked_tensor
        return tensor
      else:
        return checked_tensor
    else:
      # Under non-v1 graph mode, rely on auto control dependencies to run the
      # checked tensor.
      return tensor
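
# Protocol note: as exercised by the callback above, an op callback may return
# replacement output tensors (here, the instrumented outputs under graph mode)
# or None to leave the op's outputs unchanged.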



@tf_export("debugging.enable_check_numerics")
def enable_check_numerics(stack_height_limit=30,
                          path_length_limit=50):
  r"""Enable tensor numerics checking in an eager/graph unified fashion.

  The numerics checking mechanism will cause any TensorFlow eager execution or
  graph execution to error out as soon as an op's output tensor contains
  infinity or NaN.

  This method is idempotent. Calling it multiple times has the same effect
  as calling it once.

  This method takes effect only on the thread in which it is called.

  When an op's float-type output tensor contains any Infinity or NaN, a
  `tf.errors.InvalidArgumentError` will be thrown, with an error message that
  reveals the following information:
    - The type of the op that generated the tensor with bad numerics.
    - Data type (dtype) of the tensor.
    - Shape of the tensor (to the extent known at the time of eager execution
      or graph construction).
    - Name of the containing graph (if available).
    - (Graph mode only): The stack trace of the intra-graph op's creation,
      with a stack-height limit and a path-length limit for visual clarity.
      The stack frames that belong to the user's code (as opposed to
      tensorflow's internal code) are highlighted with a text arrow ("->").
    - (Eager mode only): How many of the offending tensor's elements are
      `Infinity` and `NaN`, respectively.

  Once enabled, the check-numerics mechanism can be disabled by using
  `tf.debugging.disable_check_numerics()`.

  Example usage:

  1. Catching infinity during the execution of a `tf.function` graph:

     ```py
     import tensorflow as tf

     tf.debugging.enable_check_numerics()

     @tf.function
     def square_log_x_plus_1(x):
       v = tf.math.log(x + 1)
       return tf.math.square(v)

     x = -1.0

     # When the following line runs, a function graph will be compiled
     # from the Python function `square_log_x_plus_1()`. Due to the
     # `enable_check_numerics()` call above, the graph will contain
     # numerics checking ops that will run during the function graph's
     # execution. The function call generates -infinity when the Log
     # (logarithm) op operates on the output tensor of the Add op.
     # The program errors out at this line, printing an error message.
     y = square_log_x_plus_1(x)
     z = -y
     ```

  2. Catching NaN during eager execution:

     ```py
     import numpy as np
     import tensorflow as tf

     tf.debugging.enable_check_numerics()

     x = np.array([[0.0, -1.0], [4.0, 3.0]])

     # The following line executes the Sqrt op eagerly. Due to the negative
     # element in the input array, a NaN is generated. Due to the
     # `enable_check_numerics()` call above, the program errors immediately
     # at this line, printing an error message.
     y = tf.math.sqrt(x)
     z = tf.matmul(y, y)
     ```

  NOTE: If your code is running on TPUs, be sure to call
  `tf.config.set_soft_device_placement(True)` before calling
  `tf.debugging.enable_check_numerics()` as this API uses automatic outside
  compilation on TPUs. For example:

  ```py
  tf.config.set_soft_device_placement(True)
  tf.debugging.enable_check_numerics()

  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
  strategy = tf.distribute.TPUStrategy(resolver)
  with strategy.scope():
    # ...
  ```

  Args:
    stack_height_limit: Limit to the height of the printed stack trace.
      Applicable only to ops in `tf.function`s (graphs).
    path_length_limit: Limit to the file path included in the printed stack
      trace. Applicable only to ops in `tf.function`s (graphs).
  """
  if not hasattr(_state, "check_numerics_callback"):
    _state.check_numerics_callback = CheckNumericsCallback(
        stack_height_limit, path_length_limit)
  op_callbacks.add_op_callback(_state.check_numerics_callback.callback)

  logging.info(
      "Enabled check-numerics callback in thread %s",
      threading.current_thread().name)
  _check_numerics_callback_create_counter.get_cell().increase_by(1)



@tf_export("debugging.disable_check_numerics")
def disable_check_numerics():
  """Disable the eager/graph unified numerics checking mechanism.

  This method can be used after a call to
  `tf.debugging.enable_check_numerics()` to disable the numerics-checking
  mechanism that catches infinity and NaN values output by ops executed
  eagerly or in tf.function-compiled graphs.

  This method is idempotent. Calling it multiple times has the same effect
  as calling it once.

  This method takes effect only on the thread in which it is called.
  """
  if not hasattr(_state, "check_numerics_callback"):
    return
  try:
    op_callbacks.remove_op_callback(_state.check_numerics_callback.callback)
    delattr(_state, "check_numerics_callback")
    logging.info(
        "Disabled check-numerics callback in thread %s",
        threading.current_thread().name)
  except KeyError:
    # Tolerate disabling the check-numerics callback without
    # enable_check_numerics() being called first.
    pass
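
# Typical usage pattern (illustrative; `run_model_step` is a hypothetical
# user-defined function, not part of this module):
#
#   tf.debugging.enable_check_numerics()
#   try:
#     run_model_step()
#   finally:
#     tf.debugging.disable_check_numerics()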