Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/debug/lib/debug_data.py: 23%
526 statements
coverage.py v7.4.0, created at 2024-01-03 07:57 +0000
1# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Classes and functions to handle debug-dump data of TensorFlow Debugger."""
17import collections
18import glob
19import json
20import os
21import platform
22import re
24import numpy as np
26from tensorflow.core.framework import graph_pb2
27from tensorflow.core.framework import types_pb2
28from tensorflow.core.util import event_pb2
29from tensorflow.python.debug.lib import debug_graphs
30from tensorflow.python.framework import tensor_util
31from tensorflow.python.platform import gfile
32from tensorflow.python.platform import tf_logging as logging
33from tensorflow.python.util import compat
36# TODO(cais): Tie these string constants in with C++?
37METADATA_FILE_PREFIX = "_tfdbg_"
38CORE_METADATA_TAG = "core_metadata_"
39GRAPH_FILE_TAG = "graph_"
40DEVICE_TAG = "device_"
41HASH_TAG = "hash"
43FETCHES_INFO_FILE_TAG = "fetches_info_"
44FEED_KEYS_INFO_FILE_TAG = "feed_keys_info_"
47def _glob(glob_pattern):
48 if platform.system() == "Windows":
49 return glob.glob(glob_pattern)
50 else:
51 return gfile.Glob(glob_pattern)
54class InconvertibleTensorProto:
55 """Represents a TensorProto that cannot be converted to np.ndarray."""
57 def __init__(self, tensor_proto, initialized=True):
58 """Constructor.
60 Args:
61 tensor_proto: the `TensorProto` object that cannot be represented as a
62 `np.ndarray` object.
63 initialized: (`bool`) whether the Tensor is initialized.
64 """
65 self._tensor_proto = tensor_proto
66 self._initialized = initialized
68 def __str__(self):
69 output = "" if self._initialized else "Uninitialized tensor:\n"
70 output += str(self._tensor_proto)
71 return output
73 @property
74 def initialized(self):
75 return self._initialized
78def load_tensor_from_event_file(event_file_path):
79 """Load a tensor from an event file.
81 Assumes that the event file contains a `Event` protobuf and the `Event`
82 protobuf contains a `Tensor` value.
84 Args:
85 event_file_path: (`str`) path to the event file.
87 Returns:
88 The tensor value loaded from the event file, as a `numpy.ndarray`. For
89 uninitialized Tensors, returns `None`. For Tensors of data types that
90 cannot be converted to `numpy.ndarray` (e.g., `tf.resource`), return
91 `None`.
92 """
94 event = event_pb2.Event()
95 with gfile.Open(event_file_path, "rb") as f:
96 event.ParseFromString(f.read())
97 return load_tensor_from_event(event)
100def load_tensor_from_event(event):
101 """Load a tensor from an Event proto.
103 Args:
104 event: The Event proto, assumed to hold a tensor value in its
105 summary.value[0] field.
107 Returns:
108 The tensor value loaded from the event file, as a `numpy.ndarray`, if
109 representation of the tensor value by a `numpy.ndarray` is possible.
110 For uninitialized Tensors, returns `None`. For Tensors of data types that
111 cannot be represented as `numpy.ndarray` (e.g., `tf.resource`), return
112 the `TensorProto` protobuf object without converting it to a
113 `numpy.ndarray`.
114 """
116 tensor_proto = event.summary.value[0].tensor
117 shape = tensor_util.TensorShapeProtoToList(tensor_proto.tensor_shape)
118 num_elements = 1
119 for shape_dim in shape:
120 num_elements *= shape_dim
122 if tensor_proto.tensor_content or tensor_proto.string_val or not num_elements:
123 # Initialized tensor or empty tensor.
124 if tensor_proto.dtype == types_pb2.DT_RESOURCE:
125 tensor_value = InconvertibleTensorProto(tensor_proto)
126 else:
127 try:
128 tensor_value = tensor_util.MakeNdarray(tensor_proto)
129 except KeyError:
130 tensor_value = InconvertibleTensorProto(tensor_proto)
131 else:
132 # Uninitialized tensor or tensor of unconvertible data type.
133 tensor_value = InconvertibleTensorProto(tensor_proto, False)
135 return tensor_value
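# --- Editor's usage sketch (illustrative; not part of the original module). ---
# Reads one dumped tensor back from disk with load_tensor_from_event_file().
# The dump-file path is hypothetical; tfdbg names dump files
# <node_name>_<output_slot>_<debug_op>_<timestamp>.
from tensorflow.python.debug.lib import debug_data

dump_file = ("/tmp/tfdbg_1/_tfdbg_device_,job_localhost,replica_0,task_0,"
             "device_CPU_0/node_a_0_DebugIdentity_1234567890")  # hypothetical
value = debug_data.load_tensor_from_event_file(dump_file)
if isinstance(value, debug_data.InconvertibleTensorProto):
  print("Tensor could not be converted to np.ndarray:", value)
elif value is not None:
  print("Loaded array with shape %s and dtype %s" % (value.shape, value.dtype))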
138def _load_graph_def_from_event_file(event_file_path):
139 event = event_pb2.Event()
140 with gfile.Open(event_file_path, "rb") as f:
141 event.ParseFromString(f.read())
143 return graph_pb2.GraphDef.FromString(event.graph_def)
146def _load_log_message_from_event_file(event_file_path):
147 event = event_pb2.Event()
148 with gfile.Open(event_file_path, "rb") as f:
149 event.ParseFromString(f.read())
151 return event.log_message.message
154def _is_graph_file(file_name):
155 return file_name.startswith(METADATA_FILE_PREFIX + GRAPH_FILE_TAG)
158def _is_run_fetches_info_file(file_name):
159 return file_name == METADATA_FILE_PREFIX + FETCHES_INFO_FILE_TAG
162def _is_run_feed_keys_info_file(file_name):
163 return file_name == METADATA_FILE_PREFIX + FEED_KEYS_INFO_FILE_TAG
166def _get_tensor_name(node_name, output_slot):
167 """Get tensor name given node name and output slot index.
169 Args:
170 node_name: Name of the node that outputs the tensor, as a string.
171 output_slot: Output slot index of the tensor, as an integer.
173 Returns:
174 Name of the tensor, as a string.
175 """
177 return "%s:%d" % (node_name, output_slot)
180def _get_tensor_watch_key(node_name, output_slot, debug_op):
181 """Get the string representation of a debug watch on a tensor.
183 Args:
184 node_name: Name of the node by which the watched tensor is produced, as a
185 string.
186 output_slot: Output slot index of the tensor, as an integer.
187 debug_op: Name of the debug op that is used to watch the tensor, as a
188 string.
190 Returns:
191 A string representing the debug watch on the tensor (i.e., the "watch
192 key").
193 """
194 return "%s:%s" % (_get_tensor_name(node_name, output_slot), debug_op)
197def has_inf_or_nan(datum, tensor):
198 """A predicate for whether a tensor consists of any bad numerical values.
200 This predicate is common enough to merit definition in this module.
201 Bad numerical values include `nan`s and `inf`s.
202 The signature of this function follows the requirement of the method
203 `DebugDumpDir.find()`.
205 Args:
206 datum: (`DebugTensorDatum`) Datum metadata.
207 tensor: (`numpy.ndarray` or None) Value of the tensor. None represents
208 an uninitialized tensor.
210 Returns:
211 (`bool`) True if and only if tensor consists of any nan or inf values.
212 """
214 _ = datum # Datum metadata is unused in this predicate.
216 if isinstance(tensor, InconvertibleTensorProto):
217 # Uninitialized tensor doesn't have bad numerical values.
218 # Also return False for data types that cannot be represented as numpy
219 # arrays.
220 return False
221 elif (np.issubdtype(tensor.dtype, np.floating) or
222 np.issubdtype(tensor.dtype, np.complexfloating) or
223 np.issubdtype(tensor.dtype, np.integer)):
224 return np.any(np.isnan(tensor)) or np.any(np.isinf(tensor))
225 else:
226 return False
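# --- Editor's usage sketch (illustrative; not part of the original module). ---
# has_inf_or_nan() matches the (datum, tensor) predicate signature expected by
# DebugDumpDir.find(), so it can locate the first dumped tensors containing
# nan or inf. The dump root below is hypothetical.
from tensorflow.python.debug.lib import debug_data

dump_dir = debug_data.DebugDumpDir("/tmp/tfdbg_1")  # hypothetical dump root
for datum in dump_dir.find(debug_data.has_inf_or_nan, first_n=1):
  print("First tensor with nan/inf: %s on %s" % (datum.watch_key, datum.device_name))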
229_CoreMetadata = collections.namedtuple("CoreMetadata", [
230 "global_step", "session_run_index", "executor_step_index", "input_names",
231 "output_names", "target_nodes"
232])
235def extract_core_metadata_from_event_proto(event):
236 json_metadata = json.loads(event.log_message.message)
237 return _CoreMetadata(json_metadata["global_step"],
238 json_metadata["session_run_index"],
239 json_metadata["executor_step_index"],
240 json_metadata["input_names"],
241 json_metadata["output_names"],
242 json_metadata["target_nodes"])
245def device_name_to_device_path(device_name):
246 """Convert device name to device path."""
247 device_name_items = compat.as_text(device_name).split("/")
248 device_name_items = [item.replace(":", "_") for item in device_name_items]
249 return METADATA_FILE_PREFIX + DEVICE_TAG + ",".join(device_name_items)
252def device_path_to_device_name(device_dir):
253 """Parse device name from device path.
255 Args:
256 device_dir: (str) a directory name for the device.
258 Returns:
259 (str) parsed device name.
260 """
261 path_items = os.path.basename(device_dir)[
262 len(METADATA_FILE_PREFIX) + len(DEVICE_TAG):].split(",")
263 return "/".join([
264 path_item.replace("device_", "device:").replace("_", ":", 1)
265 for path_item in path_items])
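# --- Editor's note (illustrative round trip of the two helpers above, using a
# hypothetical device name). device_name_to_device_path() replaces ":" with "_"
# in each component and joins the components with ",":
#   device_name_to_device_path("/job:localhost/replica:0/task:0/device:CPU:0")
#     -> "_tfdbg_device_,job_localhost,replica_0,task_0,device_CPU_0"
#   device_path_to_device_name(
#       "_tfdbg_device_,job_localhost,replica_0,task_0,device_CPU_0")
#     -> "/job:localhost/replica:0/task:0/device:CPU:0"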
268class DebugTensorDatum:
269 """A single tensor dumped by TensorFlow Debugger (tfdbg).
271 Contains metadata about the dumped tensor, including `timestamp`,
272 `node_name`, `output_slot`, `debug_op`, and path to the dump file
273 (`file_path`).
275 This type does not hold the generally space-expensive tensor value (numpy
276 array). Instead, it points to the file from which the tensor value can be
277 loaded (with the `get_tensor` method) if needed.
278 """
280 def __init__(self, dump_root, debug_dump_rel_path):
281 """`DebugTensorDatum` constructor.
283 Args:
284 dump_root: (`str`) Debug dump root directory. This path should not include
285 the path component that represents the device name (see also below).
286 debug_dump_rel_path: (`str`) Path to a debug dump file, relative to the
287 `dump_root`. The first item of this relative path is assumed to be
288 a path representing the name of the device that the Tensor belongs to.
289 See `device_path_to_device_name` for more details on the device path.
290 For example, suppose the debug dump root
291 directory is `/tmp/tfdbg_1` and the dump file is at
292 `/tmp/tfdbg_1/<device_path>/ns_1/node_a_0_DebugIdentity_123456789`,
293 then the value of the debug_dump_rel_path should be
294 `<device_path>/ns_1/node_a_0_DebugIdentity_123456789`.
296 Raises:
297 ValueError: If the base file name of the dump file does not conform to
298 the dump file naming pattern:
299 `node_name`_`output_slot`_`debug_op`_`timestamp`
300 """
302 path_components = os.path.normpath(debug_dump_rel_path).split(os.sep)
303 self._device_name = device_path_to_device_name(path_components[0])
304 base = path_components[-1]
305 if base.count("_") < 3:
306 raise ValueError(
307 "Dump file path does not conform to the naming pattern: %s" % base)
309 self._extended_timestamp = base.split("_")[-1]
310 # It may include an index suffix at the end if a file-path collision occurred
311 # due to identical timestamps.
312 if "-" in self._extended_timestamp:
313 self._timestamp = int(
314 self._extended_timestamp[:self._extended_timestamp.find("-")])
315 else:
316 self._timestamp = int(self._extended_timestamp)
318 self._debug_op = base.split("_")[-2]
319 self._output_slot = int(base.split("_")[-3])
321 node_base_name = "_".join(base.split("_")[:-3])
322 self._node_name = "/".join(path_components[1:-1] + [node_base_name])
324 self._file_path = os.path.join(dump_root, debug_dump_rel_path)
325 self._dump_size_bytes = (gfile.Stat(self._file_path).length if
326 gfile.Exists(self._file_path) else None)
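# --- Editor's usage sketch (illustrative; not part of the original module). ---
# The constructor above parses the metadata fields out of a dump path whose
# base name follows <node_name>_<output_slot>_<debug_op>_<timestamp>. Both
# paths here are hypothetical.
from tensorflow.python.debug.lib import debug_data

datum = debug_data.DebugTensorDatum(
    "/tmp/tfdbg_1",
    "_tfdbg_device_,job_localhost,replica_0,task_0,device_CPU_0/"
    "ns_1/node_a_0_DebugIdentity_1234567890")
# Parsed fields:
#   datum.node_name   == "ns_1/node_a"
#   datum.output_slot == 0
#   datum.debug_op    == "DebugIdentity"
#   datum.timestamp   == 1234567890
#   datum.watch_key   == "ns_1/node_a:0:DebugIdentity"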
328 def __str__(self):
329 return "{DebugTensorDatum (%s) %s:%d @ %s @ %d}" % (self.device_name,
330 self.node_name,
331 self.output_slot,
332 self.debug_op,
333 self.timestamp)
335 def __repr__(self):
336 return self.__str__()
338 def get_tensor(self):
339 """Get tensor from the dump (`Event`) file.
341 Returns:
342 The tensor loaded from the dump (`Event`) file.
343 """
345 return load_tensor_from_event_file(self.file_path)
347 # TODO(cais): Add time unit suffix to timestamp and t0 (us).
348 @property
349 def timestamp(self):
350 """Timestamp of when this tensor value was dumped.
352 Returns:
353 (`int`) The timestamp in microseconds.
354 """
356 return self._timestamp
358 @property
359 def extended_timestamp(self):
360 """Extended timestamp, possibly with an index suffix.
362 The index suffix, e.g., "-1", is for disambiguating multiple dumps of the
363 same tensor with the same timestamp, which can occur if the dump events
364 are spaced apart by less than the temporal resolution of the timestamps.
366 Returns:
367 (`str`) The extended timestamp.
368 """
370 return self._extended_timestamp
372 @property
373 def debug_op(self):
374 """Name of the debug op.
376 Returns:
377 (`str`) debug op name (e.g., `DebugIdentity`).
378 """
380 return self._debug_op
382 @property
383 def device_name(self):
384 """Name of the device that the tensor belongs to.
386 Returns:
387 (`str`) device name.
388 """
390 return self._device_name
392 @property
393 def node_name(self):
394 """Name of the node from which the tensor value was dumped.
396 Returns:
397 (`str`) name of the node watched by the debug op.
398 """
400 return self._node_name
402 @property
403 def output_slot(self):
404 """Output slot index from which the tensor value was dumped.
406 Returns:
407 (`int`) output slot index watched by the debug op.
408 """
410 return self._output_slot
412 @property
413 def tensor_name(self):
414 """Name of the tensor watched by the debug op.
416 Returns:
417 (`str`) `Tensor` name, in the form of `node_name`:`output_slot`
418 """
420 return _get_tensor_name(self.node_name, self.output_slot)
422 @property
423 def watch_key(self):
424 """Watch key identities a debug watch on a tensor.
426 Returns:
427 (`str`) A watch key, in the form of `tensor_name`:`debug_op`.
428 """
430 return _get_tensor_watch_key(self.node_name, self.output_slot,
431 self.debug_op)
433 @property
434 def file_path(self):
435 """Path to the file which stores the value of the dumped tensor."""
437 return self._file_path
439 @property
440 def dump_size_bytes(self):
441 """Size of the dump file.
443 Unit: byte.
445 Returns:
446 If the dump file exists, size of the dump file, in bytes.
447 If the dump file does not exist, None.
448 """
450 return self._dump_size_bytes
453class WatchKeyDoesNotExistInDebugDumpDirError(ValueError):
454 pass
457class DebugDumpDir:
458 """Data set from a debug-dump directory on filesystem.
460 An instance of `DebugDumpDir` contains all `DebugTensorDatum` instances
461 in a tfdbg dump root directory.
462 """
464 def __init__(self, dump_root, partition_graphs=None, validate=True):
465 """`DebugDumpDir` constructor.
467 Args:
468 dump_root: (`str`) path to the dump root directory.
469 partition_graphs: A repeated field of GraphDefs representing the
470 partition graphs executed by the TensorFlow runtime.
471 validate: (`bool`) whether the dump files are to be validated against the
472 partition graphs.
474 Raises:
475 IOError: If dump_root does not exist as a directory.
476 ValueError: If more than one core metadata file is found under the dump
477 root directory.
478 """
480 if not gfile.IsDirectory(dump_root):
481 raise IOError("Dump root directory %s does not exist" % dump_root)
483 self._core_metadata = []
485 # Find the list of devices.
486 self._dump_root = dump_root
488 self._load_core_metadata()
489 self._load_fetches_info()
490 self._load_feeds_info()
491 self._load_all_device_dumps(partition_graphs, validate)
493 self._python_graph = None
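# --- Editor's usage sketch (illustrative; not part of the original module). ---
# Typical construction: point DebugDumpDir at the dump root produced by a
# debug-wrapped Session.run() call. The path is hypothetical; validate=False
# skips checking the dumps against the partition graphs.
from tensorflow.python.debug.lib import debug_data

dump_dir = debug_data.DebugDumpDir("/tmp/tfdbg_1", validate=False)  # hypothetical
print("Devices:", dump_dir.devices())
print("Total dumped tensors:", dump_dir.size)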
495 def _load_all_device_dumps(self, partition_graphs, validate):
496 """Load the dump data for all devices."""
497 device_dirs = _glob(os.path.join(
498 self._dump_root, METADATA_FILE_PREFIX + DEVICE_TAG + "*"))
500 self._device_names = []
501 self._t0s = {}
502 self._dump_tensor_data = {}
503 self._dump_graph_file_paths = {}
504 self._debug_watches = {}
505 self._watch_key_to_devices = {}
506 self._watch_key_to_datum = {}
507 self._watch_key_to_rel_time = {}
508 self._watch_key_to_dump_size_bytes = {}
509 for device_dir in device_dirs:
510 device_name = device_path_to_device_name(device_dir)
511 self._device_names.append(device_name)
512 self._load_device_dumps(device_name, device_dir)
513 self._load_partition_graphs(partition_graphs, validate)
514 self._calculate_t0()
516 for device_name in self._device_names:
517 self._create_tensor_watch_maps(device_name)
519 def _load_device_dumps(self, device_name, device_root):
520 """Load `DebugTensorDatum` instances from the dump root of a given device.
522 Populates a map {device_name: a list of `DebugTensorDatum`}, where the list
523 is sorted by ascending timestamp.
525 This sorting order reflects the order in which the TensorFlow executor
526 processed the nodes of the graph. It is one of the many possible topological
527 sorts of the nodes. This is useful for displaying tensors in the debugger
528 frontend as well as for the use case in which the user wants to find a
529 "culprit tensor", i.e., the first tensor in the graph that exhibits certain
530 problematic properties, e.g., all-zero values or bad numerical values such
531 as nan and inf.
533 In addition, creates a map from node name to debug watches. In this map,
534 the key is the watched node name; the value is a dictionary that maps each
535 watched output slot to the set of debug ops watching that slot.
537 This method attempts to load the debug watches from the tensor dump files
538 first, before loading the full set of debug watches from the partition
539 graphs as done later. This is necessary because sometimes the partition
540 graphs may not be available, e.g., when the run errors out.
542 Args:
543 device_name: (`str`) name of the device.
544 device_root: (`str`) dump root directory of the given device.
546 Raises:
547 ValueError: If GraphDef for the device is not available.
548 """
550 self._dump_tensor_data[device_name] = []
551 self._debug_watches[device_name] = collections.defaultdict(
552 lambda: collections.defaultdict(set))
554 for root, _, files in gfile.Walk(device_root):
555 for f in files:
556 if _is_graph_file(f):
557 self._dump_graph_file_paths[device_name] = os.path.join(root, f)
558 else:
559 datum = self._dump_file_name_to_datum(root, f)
560 self._dump_tensor_data[device_name].append(datum)
561 self._debug_watches[device_name][datum.node_name][
562 datum.output_slot].add(datum.debug_op)
564 self._dump_tensor_data[device_name] = sorted(
565 self._dump_tensor_data[device_name],
566 key=lambda x: x.extended_timestamp)
568 if self._dump_tensor_data[device_name]:
569 self._t0s[device_name] = self._dump_tensor_data[device_name][0].timestamp
570 else:
571 self._t0s[device_name] = None
573 def _calculate_t0(self):
574 """Calculate the first timestamp across all devices."""
575 t0s = [t0 for t0 in self._t0s.values() if t0 is not None]
576 self._t0 = min(t0s) if t0s else None
578 def _load_core_metadata(self):
579 core_metadata_files = _glob(os.path.join(
580 self._dump_root, METADATA_FILE_PREFIX + CORE_METADATA_TAG + "*"))
581 for core_metadata_file in core_metadata_files:
582 with gfile.Open(core_metadata_file, "rb") as f:
583 event = event_pb2.Event()
584 event.ParseFromString(f.read())
585 self._core_metadata.append(
586 extract_core_metadata_from_event_proto(event))
588 def _load_fetches_info(self):
589 fetches_info_files = _glob(os.path.join(
590 self._dump_root, METADATA_FILE_PREFIX + FETCHES_INFO_FILE_TAG + "*"))
591 self._run_fetches_info = []
592 for fetches_info_file in fetches_info_files:
593 self._run_fetches_info.append(
594 _load_log_message_from_event_file(fetches_info_file))
596 def _load_feeds_info(self):
597 feeds_info_files = _glob(os.path.join(
598 self._dump_root, METADATA_FILE_PREFIX + FEED_KEYS_INFO_FILE_TAG + "*"))
599 self._run_feed_keys_info = []
600 for feeds_info_file in feeds_info_files:
601 self._run_feed_keys_info.append(
602 _load_log_message_from_event_file(feeds_info_file))
604 def _dump_file_name_to_datum(self, dir_name, file_name):
605 """Obtain a DebugTensorDatum from the directory and file name.
607 Args:
608 dir_name: (`str`) Name of the directory in which the dump file resides.
609 file_name: (`str`) Base name of the dump file.
611 Returns:
612 (`DebugTensorDatum`) The `DebugTensorDatum` loaded from the dump file.
613 """
615 # Calculate the relative path of the dump file with respect to the root.
616 debug_dump_rel_path = os.path.join(
617 os.path.relpath(dir_name, self._dump_root), file_name)
618 return DebugTensorDatum(self._dump_root, debug_dump_rel_path)
620 def _create_tensor_watch_maps(self, device_name):
621 """Create maps from tensor watch keys to datum and to timestamps.
623 Create a map from watch key (tensor name + debug op) to `DebugTensorDatum`
624 item. Also make a map from watch key to relative timestamp.
625 "relative" means (absolute timestamp - t0).
627 Args:
628 device_name: (str) name of the device.
629 """
631 self._watch_key_to_datum[device_name] = {}
632 self._watch_key_to_rel_time[device_name] = {}
633 self._watch_key_to_dump_size_bytes[device_name] = {}
634 for datum in self._dump_tensor_data[device_name]:
635 if datum.watch_key not in self._watch_key_to_devices:
636 self._watch_key_to_devices[datum.watch_key] = {device_name}
637 else:
638 self._watch_key_to_devices[datum.watch_key].add(device_name)
640 if datum.watch_key not in self._watch_key_to_datum[device_name]:
641 self._watch_key_to_datum[device_name][datum.watch_key] = [datum]
642 self._watch_key_to_rel_time[device_name][datum.watch_key] = [
643 datum.timestamp - self._t0]
644 self._watch_key_to_dump_size_bytes[device_name][datum.watch_key] = [
645 datum.dump_size_bytes]
646 else:
647 self._watch_key_to_datum[device_name][datum.watch_key].append(datum)
648 self._watch_key_to_rel_time[device_name][datum.watch_key].append(
649 datum.timestamp - self._t0)
650 self._watch_key_to_dump_size_bytes[device_name][datum.watch_key].append(
651 datum.dump_size_bytes)
653 def set_python_graph(self, python_graph):
654 """Provide Python `Graph` object to the wrapper.
656 Unlike the partition graphs, which are protobuf `GraphDef` objects, `Graph`
657 is a Python object and carries additional information such as the traceback
658 of the construction of the nodes in the graph.
660 Args:
661 python_graph: (ops.Graph) The Python Graph object.
662 """
664 self._python_graph = python_graph
665 self._node_traceback = {}
666 if self._python_graph:
667 for op in self._python_graph.get_operations():
668 self._node_traceback[op.name] = tuple(map(tuple, op.traceback))
670 @property
671 def python_graph(self):
672 """Get the Python graph.
674 Returns:
675 If the Python graph has been set, returns a `tf.Graph` object. Otherwise,
676 returns None.
677 """
679 return self._python_graph
681 @property
682 def core_metadata(self):
683 """Metadata about the `Session.run()` call from the core runtime.
685 Of the three counters available in the return value, `global_step` is
686 supplied by the caller of the debugged `Session.run()`, while
687 `session_run_index` and `executor_step_index` are determined by the state
688 of the core runtime, automatically. For the same fetch list, feed keys and
689 debug tensor watch options, the same executor will be used and
690 `executor_step_index` should increase by one at a time. However, runs with
691 different fetch lists, feed keys and debug tensor watch options that all
692 share the same `Session` object can lead to gaps in `session_run_index`.
694 Returns:
695 If core metadata are loaded, a `namedtuple` with the fields:
696 `global_step`: A global step count supplied by the caller of
697 `Session.run()`. It is optional to the caller. If the caller did not
698 supply this parameter, its value will be -1.
699 `session_run_index`: A sorted index for Run() calls to the underlying
700 TensorFlow `Session` object.
701 `executor_step_index`: A counter for invocations of a given runtime
702 executor. The same executor is re-used for the same fetched tensors,
703 target nodes, input feed keys and debug tensor watch options.
704 `input_names`: Names of the input (feed) Tensors.
705 `output_names`: Names of the output (fetched) Tensors.
706 `target_nodes`: Names of the target nodes.
707 If the core metadata have not been loaded, `None`.
708 If more than one core metadata file exists, a list of the
709 `namedtuple`s described above.
710 """
712 output = self._core_metadata
713 return output[0] if len(output) == 1 else output
715 @property
716 def dumped_tensor_data(self):
717 """Retrieve dumped tensor data."""
718 if len(self.devices()) == 1:
719 return self._dump_tensor_data[self.devices()[0]]
720 else:
721 all_devices_data = self._dump_tensor_data.values()
722 data = []
723 for device_data in all_devices_data:
724 data.extend(device_data)
725 return sorted(data, key=lambda x: x.extended_timestamp)
727 @property
728 def t0(self):
729 """Absolute timestamp of the first dumped tensor across all devices.
731 Returns:
732 (`int`) absolute timestamp of the first dumped tensor, in microseconds.
733 """
734 return self._t0
736 @property
737 def size(self):
738 """Total number of dumped tensors in the dump root directory.
740 Returns:
741 (`int`) The total number of dumped tensors in the dump root directory.
742 """
743 return sum(len(self._dump_tensor_data[device_name])
744 for device_name in self._dump_tensor_data)
746 def _load_partition_graphs(self, client_partition_graphs, validate):
747 """Load and process partition graphs.
749 Load the graphs; parse the input and control input structure; obtain the
750 device and op type of each node; remove the Copy and debug ops inserted
751 by the debugger. The gathered information can be used to validate the
752 tensor dumps.
754 Args:
755 client_partition_graphs: A repeated field of GraphDefs representing the
756 partition graphs executed by the TensorFlow runtime, from the Python
757 client. These partition graphs are used only if partition graphs
758 cannot be loaded from the dump directory on the file system.
759 validate: (`bool`) Whether the dump files are to be validated against the
760 partition graphs.
762 Raises:
763 ValueError: If the partition GraphDef of one or more devices fail to be
764 loaded.
765 """
766 self._debug_graphs = {}
767 self._node_devices = {}
769 partition_graphs_and_device_names = []
770 for device_name in self._device_names:
771 partition_graph = None
772 if device_name in self._dump_graph_file_paths:
773 partition_graph = _load_graph_def_from_event_file(
774 self._dump_graph_file_paths[device_name])
775 else:
776 logging.warn(
777 "Failed to load partition graphs for device %s from disk. "
778 "As a fallback, the client graphs will be used. This "
779 "may cause mismatches in device names." % device_name)
780 partition_graph = self._find_partition_graph(client_partition_graphs,
781 device_name)
783 if partition_graph:
784 partition_graphs_and_device_names.append((partition_graph,
785 device_name))
787 for partition_graph, maybe_device_name in partition_graphs_and_device_names:
788 debug_graph = debug_graphs.DebugGraph(partition_graph,
789 device_name=maybe_device_name)
790 self._debug_graphs[debug_graph.device_name] = debug_graph
791 self._collect_node_devices(debug_graph)
793 if validate and debug_graph.device_name in self._dump_tensor_data:
794 self._validate_dump_with_graphs(debug_graph.device_name)
796 def _find_partition_graph(self, partition_graphs, device_name):
797 if partition_graphs is None:
798 return None
799 else:
800 for graph_def in partition_graphs:
801 for node_def in graph_def.node:
802 if node_def.device == device_name:
803 return graph_def
804 return None
806 def _collect_node_devices(self, debug_graph):
807 for node_name in debug_graph.node_devices:
808 if node_name in self._node_devices:
809 self._node_devices[node_name] = self._node_devices[node_name].union(
810 debug_graph.node_devices[node_name])
811 else:
812 self._node_devices[node_name] = debug_graph.node_devices[node_name]
814 def _validate_dump_with_graphs(self, device_name):
815 """Validate the dumped tensor data against the partition graphs.
817 Only the watched nodes are validated by this method, because tfdbg allows
818 clients to watch only a subset of the nodes.
820 Args:
821 device_name: (`str`) device name.
823 Raises:
824 LookupError: If the partition graphs have not been loaded yet.
825 ValueError: If dumps contain node names not found in partition graph.
826 Or if the temporal order of the dump's timestamps violates the
827 input relations on the partition graphs.
828 """
829 if not self._debug_graphs:
830 raise LookupError(
831 "No partition graphs loaded for device %s" % device_name)
832 debug_graph = self._debug_graphs[device_name]
834 # Verify that the node names in the dump data are all present in the
835 # partition graphs.
836 for datum in self._dump_tensor_data[device_name]:
837 if datum.node_name not in debug_graph.node_inputs:
838 raise ValueError("Node name '%s' is not found in partition graphs of "
839 "device %s." % (datum.node_name, device_name))
841 pending_inputs = {}
842 for node in debug_graph.node_inputs:
843 pending_inputs[node] = []
844 inputs = debug_graph.node_inputs[node]
845 for inp in inputs:
846 inp_node = debug_graphs.get_node_name(inp)
847 inp_output_slot = debug_graphs.get_output_slot(inp)
848 # Inputs from Enter and NextIteration nodes are not validated because
849 # DebugNodeInserter::InsertNodes() in the debugger core skips creating
850 # control edges from debug ops watching these types of nodes.
851 if (inp_node in self._debug_watches[device_name] and
852 inp_output_slot in self._debug_watches[device_name][inp_node] and
853 debug_graph.node_op_types.get(inp) not in (
854 "Enter", "NextIteration") and
855 (inp_node, inp_output_slot) not in pending_inputs[node]):
856 pending_inputs[node].append((inp_node, inp_output_slot))
858 for i, datum in enumerate(self._dump_tensor_data[device_name]):
859 node = datum.node_name
860 slot = datum.output_slot
861 # In some cases (e.g., system clocks with insufficient precision),
862 # the upstream and downstream tensors may have identical timestamps. The
863 # following check examines this possibility and avoids raising an error if
864 # that is the case.
865 if not self._satisfied_at_timestamp(
866 device_name, pending_inputs[node], datum.timestamp, start_i=i + 1):
867 raise ValueError("Causality violated in timing relations of debug "
868 "dumps: %s (%d): "
869 "these input(s) are not satisfied: %s" %
870 (node, datum.timestamp, repr(pending_inputs[node])))
872 recipients = debug_graph.node_recipients[node]
873 for recipient in recipients:
874 recipient_pending_inputs = pending_inputs[recipient]
875 if (node, slot) in recipient_pending_inputs:
876 if self.node_op_type(recipient) == "Merge":
877 # If this is a Merge op, we automatically clear the list because
878 # a Merge node only requires one of its two inputs.
879 del recipient_pending_inputs[:]
880 else:
881 del recipient_pending_inputs[
882 recipient_pending_inputs.index((node, slot))]
884 def _satisfied_at_timestamp(self, device_name, pending, timestamp, start_i=0):
885 """Determine whether pending inputs are satisfied at given timestamp.
887 Note: This method mutates the input argument "pending".
889 Args:
890 device_name: (str) device name.
891 pending: A list of 2-tuple (node_name, output_slot): the dependencies to
892 check.
893 timestamp: (int) the timestamp in question.
894 start_i: (int) the index in self._dump_tensor_data to start searching for
895 the timestamp.
897 Returns:
898 (bool) Whether all the dependencies in pending are satisfied at the
899 timestamp. If pending is empty to begin with, return True.
900 """
901 if not pending:
902 return True
904 for datum in self._dump_tensor_data[device_name][start_i:]:
905 if datum.timestamp > timestamp:
906 break
907 if (datum.timestamp == timestamp and
908 (datum.node_name, datum.output_slot) in pending):
909 pending.remove((datum.node_name, datum.output_slot))
910 if not pending:
911 return True
913 return not pending
915 def loaded_partition_graphs(self):
916 """Test whether partition graphs have been loaded."""
917 return bool(self._debug_graphs)
919 def partition_graphs(self):
920 """Get the partition graphs.
922 Returns:
923 Partition graphs as a list of GraphDef.
925 Raises:
926 LookupError: If no partition graphs have been loaded.
927 """
928 if not self._debug_graphs:
929 raise LookupError("No partition graphs have been loaded.")
930 return [self._debug_graphs[key].debug_graph_def
931 for key in self._debug_graphs]
933 def reconstructed_non_debug_partition_graphs(self):
934 """Reconstruct partition graphs with the debugger-inserted ops stripped.
936 The reconstructed partition graphs are identical to the original (i.e.,
937 non-debugger-decorated) partition graphs except in the following respects:
938 1) The exact names of the runtime-inserted internal nodes may differ.
939 These include _Send, _Recv, _HostSend, _HostRecv, _Retval ops.
940 2) As a consequence of 1, the nodes that receive input directly from such
941 send- and recv-type ops will have different input names.
942 3) The parallel_iteration attribute of while-loop Enter ops is set to 1.
944 Returns:
945 A dict mapping device names (`str`s) to reconstructed
946 `tf.compat.v1.GraphDef`s.
947 """
948 non_debug_graphs = {}
949 for key in self._debug_graphs:
950 non_debug_graphs[key] = self._debug_graphs[key].non_debug_graph_def
951 return non_debug_graphs
953 @property
954 def run_fetches_info(self):
955 """Get a str representation of the fetches used in the Session.run() call.
957 Returns:
958 If the information is available from one `Session.run` call, a `str`
959 obtained from `repr(fetches)`.
960 If the information is available from multiple `Session.run` calls, a
961 `list` of `str` from `repr(fetches)`.
962 If the information is not available, `None`.
963 """
965 output = self._run_fetches_info
966 return output[0] if len(output) == 1 else output
968 @property
969 def run_feed_keys_info(self):
970 """Get a str representation of the feed_dict used in the Session.run() call.
972 Returns:
973 If the information is available from one `Session.run` call, a `str`
974 obtained from `repr(feed_dict)`.
975 If the information is available from multiple `Session.run` calls, a
976 `list` of `str` obtained from `repr(feed_dict)`.
977 If the information is not available, `None`.
978 """
980 output = self._run_feed_keys_info
981 return output[0] if len(output) == 1 else output
983 def _infer_device_name(self, device_name, node_name):
984 """Infer the device name given node name.
986 If device_name is provided (i.e., not None), it'll be simply returned right
987 away.
989 Args:
990 device_name: (str or None) name of the device. If None, will try to infer
991 the device name by looking at the available nodes.
992 node_name: (str) name of the node.
994 Returns:
995 (str) Inferred name of the device, if available.
997 Raises:
998 ValueError: If the node name does not exist on any of the available
999 devices or if there are multiple devices that contain the node with
1000 the given name.
1001 """
1002 if device_name is None:
1003 if node_name in self._node_devices:
1004 if len(self._node_devices[node_name]) == 1:
1005 return list(self._node_devices[node_name])[0]
1006 else:
1007 raise ValueError(
1008 "There are multiple (%d) devices with nodes named '%s' but "
1009 "device_name is not specified." %
1010 (len(self._node_devices[node_name]), node_name))
1011 else:
1012 raise ValueError("None of the %d device(s) has a node named '%s'." %
1013 (len(self._device_names), node_name))
1014 else:
1015 return device_name
1017 def nodes(self, device_name=None):
1018 """Get a list of all nodes from the partition graphs.
1020 Args:
1021 device_name: (`str`) name of device. If None, all nodes from all available
1022 devices will be included.
1024 Returns:
1025 All nodes' names, as a list of str.
1027 Raises:
1028 LookupError: If no partition graphs have been loaded.
1029 ValueError: If the specified device name does not exist.
1030 """
1031 if not self._debug_graphs:
1032 raise LookupError("No partition graphs have been loaded.")
1033 if device_name is None:
1034 nodes = []
1035 for device_name in self._debug_graphs:
1036 nodes.extend(self._debug_graphs[device_name].node_inputs.keys())
1037 return nodes
1038 else:
1039 if device_name not in self._debug_graphs:
1040 raise ValueError("Invalid device name: %s" % device_name)
1041 return self._debug_graphs[device_name].node_inputs.keys()
1043 def node_attributes(self, node_name, device_name=None):
1044 """Get the attributes of a node.
1046 Args:
1047 node_name: Name of the node in question.
1048 device_name: (`str`) name of the device. If there is only one device or if
1049 node_name exists on only one device, this argument is optional.
1051 Returns:
1052 Attributes of the node.
1054 Raises:
1055 LookupError: If no partition graphs have been loaded.
1056 """
1057 if not self._debug_graphs:
1058 raise LookupError("No partition graphs have been loaded.")
1060 device_name = self._infer_device_name(device_name, node_name)
1061 return self._debug_graphs[device_name].node_attributes[node_name]
1063 def node_inputs(self, node_name, is_control=False, device_name=None):
1064 """Get the inputs of given node according to partition graphs.
1066 Args:
1067 node_name: Name of the node.
1068 is_control: (`bool`) Whether control inputs, rather than non-control
1069 inputs, are to be returned.
1070 device_name: (`str`) name of the device. If there is only one device or if
1071 node_name exists on only one device, this argument is optional.
1073 Returns:
1074 (`list` of `str`) inputs to the node, as a list of node names.
1076 Raises:
1077 LookupError: If node inputs and control inputs have not been loaded
1078 from partition graphs yet.
1079 """
1080 if not self._debug_graphs:
1081 raise LookupError(
1082 "Node inputs are not loaded from partition graphs yet.")
1084 device_name = self._infer_device_name(device_name, node_name)
1085 if is_control:
1086 return self._debug_graphs[device_name].node_ctrl_inputs[node_name]
1087 else:
1088 return self._debug_graphs[device_name].node_inputs[node_name]
1090 def transitive_inputs(self,
1091 node_name,
1092 include_control=True,
1093 include_reversed_ref=False,
1094 device_name=None,):
1095 """Get the transitive inputs of given node according to partition graphs.
1097 Args:
1098 node_name: Name of the node.
1099 include_control: Include control inputs (True by default).
1100 include_reversed_ref: Whether a ref input, say from A to B, is to be also
1101 considered as an input from B to A. The rationale is that ref inputs
1102 generally let the recipient (e.g., B in this case) mutate the value of
1103 the source (e.g., A in this case). So the reverse direction of the ref
1104 edge reflects the direction of information flow.
1105 device_name: (`str`) name of the device. If there is only one device or if
1106 node_name exists on only one device, this argument is optional.
1108 Returns:
1109 (`list` of `str`) all transitive inputs to the node, as a list of node
1110 names.
1112 Raises:
1113 LookupError: If node inputs and control inputs have not been loaded
1114 from partition graphs yet.
1115 """
1116 if not self._debug_graphs:
1117 raise LookupError(
1118 "Node inputs are not loaded from partition graphs yet.")
1120 device_name = self._infer_device_name(device_name, node_name)
1122 input_lists = [self._debug_graphs[device_name].node_inputs]
1123 if include_control:
1124 input_lists.append(self._debug_graphs[device_name].node_ctrl_inputs)
1125 if include_reversed_ref:
1126 input_lists.append(
1127 self._debug_graphs[device_name].node_reversed_ref_inputs)
1128 tracer = debug_graphs.DFSGraphTracer(
1129 input_lists,
1130 skip_node_names=self._get_merge_node_names(device_name))
1131 tracer.trace(node_name)
1132 return tracer.inputs()
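# --- Editor's usage sketch (illustrative; not part of the original module). ---
# Querying the input structure recorded in the partition graphs. The dump root
# and node name are hypothetical; both calls require partition graphs to have
# been loaded.
from tensorflow.python.debug.lib import debug_data

dump_dir = debug_data.DebugDumpDir("/tmp/tfdbg_1")  # hypothetical dump root
direct = dump_dir.node_inputs("hidden/MatMul")            # hypothetical node name
transitive = dump_dir.transitive_inputs("hidden/MatMul")  # control inputs included
print("%d direct inputs, %d transitive inputs" % (len(direct), len(transitive)))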
1134 def _get_merge_node_names(self, device_name):
1135 """Lazily get a list of Merge nodes on a given device."""
1136 if device_name not in self._device_names:
1137 raise ValueError("Invalid device name: %s" % device_name)
1139 if not hasattr(self, "_merge_node_names"):
1140 self._merge_node_names = {}
1141 if device_name not in self._merge_node_names:
1142 debug_graph = self._debug_graphs[device_name]
1143 self._merge_node_names[device_name] = [
1144 node for node in debug_graph.node_op_types
1145 if debug_graph.node_op_types[node] == "Merge"]
1146 return self._merge_node_names[device_name]
1148 def find_some_path(self,
1149 src_node_name,
1150 dst_node_name,
1151 include_control=True,
1152 include_reversed_ref=False,
1153 device_name=None):
1154 """Find a path between a source node and a destination node.
1156 Limitation: the source and destination are required to be on the same
1157 device, i.e., this method does not yet take into account Send/Recv nodes
1158 across devices.
1160 TODO(cais): Make this method work across device edges by tracing Send/Recv
1161 nodes.
1163 Args:
1164 src_node_name: (`str`) name of the source node or name of an output tensor
1165 of the node.
1166 dst_node_name: (`str`) name of the destination node or name of an output
1167 tensor of the node.
1168 include_control: (`bool`) whether control edges are considered in the
1169 graph tracing.
1170 include_reversed_ref: Whether a ref input, say from A to B, is to be also
1171 considered as an input from B to A. The rationale is that ref inputs
1172 generally let the recipient (e.g., B in this case) mutate the value of
1173 the source (e.g., A in this case). So the reverse direction of the ref
1174 edge reflects the direction of information flow.
1175 device_name: (`str`) name of the device. If there is only one device or if
1176 node_name exists on only one device, this argument is optional.
1178 Returns:
1179 A path from the src_node_name to dst_node_name, as a `list` of `str`, if
1180 it exists. The list includes src_node_name as the first item and
1181 dst_node_name as the last.
1182 If such a path does not exist, `None`.
1184 Raises:
1185 ValueError: If the source and destination nodes are not on the same
1186 device.
1187 """
1188 src_device_name = self._infer_device_name(device_name, src_node_name)
1189 dst_device_name = self._infer_device_name(device_name, dst_node_name)
1191 if src_device_name != dst_device_name:
1192 raise ValueError(
1193 "Source (%s) and destination (%s) are not on the same device: "
1194 "%s vs. %s" % (src_node_name, dst_node_name, src_device_name,
1195 dst_device_name))
1197 input_lists = [self._debug_graphs[dst_device_name].node_inputs]
1198 debug_graph = self._debug_graphs[dst_device_name]
1199 if include_control:
1200 input_lists.append(debug_graph.node_ctrl_inputs)
1201 if include_reversed_ref:
1202 input_lists.append(debug_graph.node_reversed_ref_inputs)
1203 tracer = debug_graphs.DFSGraphTracer(
1204 input_lists,
1205 skip_node_names=self._get_merge_node_names(dst_device_name),
1206 destination_node_name=src_node_name)
1207 # Here the value of destination_node_name is src_node_name, because we
1208 # are tracing the graph from output to its inputs (i.e., going backwards
1209 # on the graph).
1211 try:
1212 tracer.trace(dst_node_name)
1213 except debug_graphs.GraphTracingReachedDestination:
1214 # Prune nodes not on the path.
1215 inputs = [dst_node_name] + tracer.inputs()
1216 depth_list = [0] + tracer.depth_list()
1218 path = []
1219 curr_depth = depth_list[-1]
1220 for inp, depth in zip(reversed(inputs), reversed(depth_list)):
1221 if depth == curr_depth:
1222 path.append(inp)
1223 curr_depth -= 1
1224 return path
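# --- Editor's usage sketch (illustrative; not part of the original module). ---
# find_some_path() traces backwards from the destination and returns the path
# source-first, or None when no path exists on the (single) device. Dump root
# and node names are hypothetical.
from tensorflow.python.debug.lib import debug_data

dump_dir = debug_data.DebugDumpDir("/tmp/tfdbg_1")  # hypothetical dump root
path = dump_dir.find_some_path("input/x", "loss/total")  # hypothetical node names
if path:
  print(" -> ".join(path))
else:
  print("No path found between the two nodes.")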
1226 def node_recipients(self, node_name, is_control=False, device_name=None):
1227 """Get recipient of the given node's output according to partition graphs.
1229 Args:
1230 node_name: (`str`) name of the node.
1231 is_control: (`bool`) whether control outputs, rather than non-control
1232 outputs, are to be returned.
1233 device_name: (`str`) name of the device. If there is only one device or if
1234 node_name exists on only one device, this argument is optional.
1236 Returns:
1237 (`list` of `str`) all recipients of the node's output, as a list of node names.
1239 Raises:
1240 LookupError: If node inputs and control inputs have not been loaded
1241 from partition graphs yet.
1242 """
1244 if not self._debug_graphs:
1245 raise LookupError(
1246 "Node recipients are not loaded from partition graphs yet.")
1248 device_name = self._infer_device_name(device_name, node_name)
1249 debug_graph = self._debug_graphs[device_name]
1250 if is_control:
1251 return debug_graph.node_ctrl_recipients[node_name]
1252 else:
1253 return debug_graph.node_recipients[node_name]
1255 def devices(self):
1256 """Get the list of device names.
1258 Returns:
1259 (`list` of `str`) names of the devices.
1260 """
1261 return self._device_names
1263 def node_exists(self, node_name, device_name=None):
1264 """Test if a node exists in the partition graphs.
1266 Args:
1267 node_name: (`str`) name of the node to be checked.
1268 device_name: optional device name. If None, will search for the node
1269 on all available devices. Otherwise, search for the node only on
1270 the given device.
1272 Returns:
1273 A boolean indicating whether the node exists.
1275 Raises:
1276 LookupError: If no partition graphs have been loaded yet.
1277 ValueError: If device_name is specified but cannot be found.
1278 """
1279 if not self._debug_graphs:
1280 raise LookupError(
1281 "Nodes have not been loaded from partition graphs yet.")
1283 if (device_name is not None) and device_name not in self._debug_graphs:
1284 raise ValueError(
1285 "The specified device_name '%s' cannot be found." % device_name)
1287 for _, debug_graph in self._debug_graphs.items():
1288 if node_name in debug_graph.node_inputs:
1289 return True
1290 return False
1292 def node_device(self, node_name):
1293 """Get the names of the devices that has nodes of the specified name.
1295 Args:
1296 node_name: (`str`) name of the node.
1298 Returns:
1299 (`str` or `list` of `str`) name of the device(s) on which the node of the
1300 given name is found. Returns a `str` if there is only one such device,
1301 otherwise returns a `list` of `str`.
1303 Raises:
1304 LookupError: If node inputs and control inputs have not been loaded
1305 from partition graphs yet.
1306 ValueError: If the node does not exist in partition graphs.
1307 """
1308 if not self._debug_graphs:
1309 raise LookupError(
1310 "Node devices are not loaded from partition graphs yet.")
1312 if node_name not in self._node_devices:
1313 raise ValueError("Node '%s' does not exist in partition graphs." %
1314 node_name)
1316 output = list(self._node_devices[node_name])
1317 return output[0] if len(output) == 1 else output
1319 def node_op_type(self, node_name, device_name=None):
1320 """Get the op type of given node.
1322 Args:
1323 node_name: (`str`) name of the node.
1324 device_name: (`str`) name of the device. If there is only one device or if
1325 node_name exists on only one device, this argument is optional.
1327 Returns:
1328 (`str`) op type of the node.
1330 Raises:
1331 LookupError: If node op types have not been loaded
1332 from partition graphs yet.
1333 """
1334 if not self._debug_graphs:
1335 raise LookupError(
1336 "Node op types are not loaded from partition graphs yet.")
1338 device_name = self._infer_device_name(device_name, node_name)
1339 return self._debug_graphs[device_name].node_op_types[node_name]
1341 def debug_watch_keys(self, node_name, device_name=None):
1342 """Get all tensor watch keys of given node according to partition graphs.
1344 Args:
1345 node_name: (`str`) name of the node.
1346 device_name: (`str`) name of the device. If there is only one device or if
1347 node_name exists on only one device, this argument is optional.
1349 Returns:
1350 (`list` of `str`) all debug tensor watch keys. Returns an empty list if
1351 the node name does not correspond to any debug watch keys.
1353 Raises:
1354 `LookupError`: If debug watch information has not been loaded from
1355 partition graphs yet.
1356 """
1358 try:
1359 device_name = self._infer_device_name(device_name, node_name)
1360 except ValueError:
1361 return []
1363 if node_name not in self._debug_watches[device_name]:
1364 return []
1366 watch_keys = []
1367 for watched_slot in self._debug_watches[device_name][node_name]:
1368 debug_ops = self._debug_watches[device_name][node_name][watched_slot]
1369 for debug_op in debug_ops:
1370 watch_keys.append(
1371 _get_tensor_watch_key(node_name, watched_slot, debug_op))
1373 return watch_keys
1375 def watch_key_to_data(self, debug_watch_key, device_name=None):
1376 """Get all `DebugTensorDatum` instances corresponding to a debug watch key.
1378 Args:
1379 debug_watch_key: (`str`) debug watch key.
1380 device_name: (`str`) name of the device. If there is only one device or if
1381 the specified debug_watch_key exists on only one device, this argument
1382 is optional.
1384 Returns:
1385 A list of `DebugTensorDatum` instances that correspond to the debug watch
1386 key. If the watch key does not exist, returns an empty list.
1388 Raises:
1389 ValueError: If there are multiple devices that have the debug_watch_key,
1390 but device_name is not specified.
1391 """
1392 if device_name is None:
1393 matching_device_names = [
1394 name for name in self._watch_key_to_datum
1395 if debug_watch_key in self._watch_key_to_datum[name]]
1396 if not matching_device_names:
1397 return []
1398 elif len(matching_device_names) == 1:
1399 device_name = matching_device_names[0]
1400 else:
1401 raise ValueError(
1402 "The debug watch key '%s' exists on multiple (%d) devices, but "
1403 "device name is not specified." %
1404 (debug_watch_key, len(matching_device_names)))
1405 elif device_name not in self._watch_key_to_datum:
1406 raise ValueError(
1407 "There is no device named '%s' consisting of debug watch keys." %
1408 device_name)
1410 return self._watch_key_to_datum[device_name].get(debug_watch_key, [])
1412 def find(self,
1413 predicate,
1414 first_n=0,
1415 device_name=None,
1416 exclude_node_names=None):
1417 """Find dumped tensor data by a certain predicate.
1419 Args:
1420 predicate: A callable that takes two input arguments:
1422 ```python
1423 def predicate(debug_tensor_datum, tensor):
1424 # returns a bool
1425 ```
1427 where `debug_tensor_datum` is an instance of `DebugTensorDatum`, which
1428 carries the metadata, such as the `Tensor`'s node name, output slot,
1429 timestamp, debug op name, etc.; and `tensor` is the dumped tensor value
1430 as a `numpy.ndarray`.
1431 first_n: (`int`) return only the first n `DebugTensorDatum` instances (in
1432 time order) for which the predicate returns True. To return all the
1433 `DebugTensorDatum` instances, let first_n be <= 0.
1434 device_name: optional device name.
1435 exclude_node_names: Optional regular expression to exclude nodes with
1436 names matching the regular expression.
1438 Returns:
1439 A list of all `DebugTensorDatum` objects in this `DebugDumpDir` object
1440 for which predicate returns True, sorted in ascending order of the
1441 timestamp.
1442 """
1443 if exclude_node_names:
1444 exclude_node_names = re.compile(exclude_node_names)
1446 matched_data = []
1447 for device in (self._dump_tensor_data if device_name is None
1448 else (device_name,)):
1449 for datum in self._dump_tensor_data[device]:
1450 if exclude_node_names and exclude_node_names.match(datum.node_name):
1451 continue
1453 if predicate(datum, datum.get_tensor()):
1454 matched_data.append(datum)
1456 if first_n > 0 and len(matched_data) >= first_n:
1457 return matched_data
1459 return matched_data
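# --- Editor's usage sketch (illustrative; not part of the original module). ---
# find() also works with custom predicates of the (datum, tensor) form
# documented above. This hypothetical predicate flags non-empty all-zero
# arrays; the dump root and the exclusion regex are also hypothetical.
import numpy as np
from tensorflow.python.debug.lib import debug_data

def _is_all_zero(datum, tensor):  # hypothetical predicate
  del datum  # metadata unused here
  return (isinstance(tensor, np.ndarray) and tensor.size > 0
          and not np.any(tensor))

dump_dir = debug_data.DebugDumpDir("/tmp/tfdbg_1")  # hypothetical dump root
for datum in dump_dir.find(_is_all_zero, exclude_node_names=r".*/Initializer/.*"):
  print("All-zero tensor:", datum.watch_key)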
1461 def get_tensor_file_paths(self,
1462 node_name,
1463 output_slot,
1464 debug_op,
1465 device_name=None):
1466 """Get the file paths from a debug-dumped tensor.
1468 Args:
1469 node_name: (`str`) name of the node that the tensor is produced by.
1470 output_slot: (`int`) output slot index of tensor.
1471 debug_op: (`str`) name of the debug op.
1472 device_name: (`str`) name of the device. If there is only one device or if
1473 the specified debug_watch_key exists on only one device, this argument
1474 is optional.
1476 Returns:
1477 List of file path(s) loaded. This is a list because each debugged tensor
1478 may be dumped multiple times.
1480 Raises:
1481 WatchKeyDoesNotExistInDebugDumpDirError: If the tensor does not exist in
1482 the debug-dump data.
1483 """
1485 device_name = self._infer_device_name(device_name, node_name)
1486 watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op)
1487 if watch_key not in self._watch_key_to_datum[device_name]:
1488 raise WatchKeyDoesNotExistInDebugDumpDirError(
1489 "Watch key \"%s\" does not exist in the debug dump of device %s" %
1490 (watch_key, device_name))
1492 return [datum.file_path for datum in
1493 self._watch_key_to_datum[device_name][watch_key]]
1495 def get_tensors(self, node_name, output_slot, debug_op, device_name=None):
1496 """Get the tensor value from for a debug-dumped tensor.
1498 The tensor may be dumped multiple times in the dump root directory, so a
1499 list of tensors (`numpy.ndarray`) is returned.
1501 Args:
1502 node_name: (`str`) name of the node that the tensor is produced by.
1503 output_slot: (`int`) output slot index of tensor.
1504 debug_op: (`str`) name of the debug op.
1505 device_name: (`str`) name of the device. If there is only one device or if
1506 the specified debug_watch_key exists on only one device, this argument
1507 is optional.
1509 Returns:
1510 List of tensors (`numpy.ndarray`) loaded from the debug-dump file(s).
1512 Raises:
1513 WatchKeyDoesNotExistInDebugDumpDirError: If the tensor does not exist in
1514 the debug-dump data.
1515 """
1517 watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op)
1518 try:
1519 device_name = self._infer_device_name(device_name, node_name)
1520 return [datum.get_tensor() for datum in
1521 self._watch_key_to_datum[device_name][watch_key]]
1522 except (ValueError, KeyError):
1523 raise WatchKeyDoesNotExistInDebugDumpDirError(
1524 "Watch key \"%s\" does not exist in the debug dump of device %s" %
1525 (watch_key, device_name))
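# --- Editor's usage sketch (illustrative; not part of the original module). ---
# Retrieving every dumped value of one watched tensor together with its
# relative dump times and dump-file sizes (see the two methods that follow).
# The dump root and node name are hypothetical.
from tensorflow.python.debug.lib import debug_data

dump_dir = debug_data.DebugDumpDir("/tmp/tfdbg_1")  # hypothetical dump root
values = dump_dir.get_tensors("hidden/MatMul", 0, "DebugIdentity")
rel_times = dump_dir.get_rel_timestamps("hidden/MatMul", 0, "DebugIdentity")
sizes = dump_dir.get_dump_sizes_bytes("hidden/MatMul", 0, "DebugIdentity")
for value, rel_t, size in zip(values, rel_times, sizes):
  print("dumped at t0+%d us, %d bytes, shape %s"
        % (rel_t, size, getattr(value, "shape", None)))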
1527 def get_rel_timestamps(self,
1528 node_name,
1529 output_slot,
1530 debug_op,
1531 device_name=None):
1532 """Get the relative timestamp from for a debug-dumped tensor.
1534 Relative timestamp means (absolute timestamp - `t0`), where `t0` is the
1535 absolute timestamp of the first dumped tensor in the dump root. The tensor
1536 may be dumped multiple times in the dump root directory, so a list of
1537 relative timestamps (`int`) is returned.
1539 Args:
1540 node_name: (`str`) name of the node that the tensor is produced by.
1541 output_slot: (`int`) output slot index of tensor.
1542 debug_op: (`str`) name of the debug op.
1543 device_name: (`str`) name of the device. If there is only one device or if
1544 the specified debug_watch_key exists on only one device, this argument
1545 is optional.
1547 Returns:
1548 (`list` of `int`) list of relative timestamps.
1550 Raises:
1551 WatchKeyDoesNotExistInDebugDumpDirError: If the tensor watch key does not
1552 exist in the debug dump data.
1553 """
1555 device_name = self._infer_device_name(device_name, node_name)
1556 watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op)
1557 if watch_key not in self._watch_key_to_datum[device_name]:
1558 raise WatchKeyDoesNotExistInDebugDumpDirError(
1559 "Watch key \"%s\" does not exist in the debug dump" % watch_key)
1561 # TODO(cais): Figure out whether this should be relative to the global t0.
1562 return self._watch_key_to_rel_time[device_name][watch_key]
1564 def get_dump_sizes_bytes(self,
1565 node_name,
1566 output_slot,
1567 debug_op,
1568 device_name=None):
1569 """Get the sizes of the dump files for a debug-dumped tensor.
1571 Unit of the file size: byte.
1573 Args:
1574 node_name: (`str`) name of the node that the tensor is produced by.
1575 output_slot: (`int`) output slot index of tensor.
1576 debug_op: (`str`) name of the debug op.
1577 device_name: (`str`) name of the device. If there is only one device or if
1578 the specified debug_watch_key exists on only one device, this argument
1579 is optional.
1581 Returns:
1582 (`list` of `int`): list of dump file sizes in bytes.
1584 Raises:
1585 WatchKeyDoesNotExistInDebugDumpDirError: If the tensor watch key does not
1586 exist in the debug dump data.
1587 """
1589 device_name = self._infer_device_name(device_name, node_name)
1590 watch_key = _get_tensor_watch_key(node_name, output_slot, debug_op)
1591 if watch_key not in self._watch_key_to_datum[device_name]:
1592 raise WatchKeyDoesNotExistInDebugDumpDirError(
1593 "Watch key \"%s\" does not exist in the debug dump of device %s" %
1594 (watch_key, device_name))
1596 return self._watch_key_to_dump_size_bytes[device_name][watch_key]
1598 def node_traceback(self, element_name):
1599 """Try to retrieve the Python traceback of node's construction.
1601 Args:
1602 element_name: (`str`) Name of a graph element (node or tensor).
1604 Returns:
1605 (list) The traceback list object, in the format returned by the
1606 `extract_stack` function of Python's `traceback` module.
1608 Raises:
1609 LookupError: If Python graph is not available for traceback lookup.
1610 KeyError: If the node cannot be found in the Python graph loaded.
1611 """
1613 if self._python_graph is None:
1614 raise LookupError("Python graph is not available for traceback lookup")
1616 node_name = debug_graphs.get_node_name(element_name)
1617 if node_name not in self._node_traceback:
1618 raise KeyError("Cannot find node \"%s\" in Python graph" % node_name)
1620 return self._node_traceback[node_name]
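# --- Editor's usage sketch (illustrative; not part of the original module). ---
# node_traceback() requires the Python graph to have been supplied via
# set_python_graph(). The dump root, graph, and node name are hypothetical.
import tensorflow.compat.v1 as tf
from tensorflow.python.debug.lib import debug_data

dump_dir = debug_data.DebugDumpDir("/tmp/tfdbg_1")  # hypothetical dump root
dump_dir.set_python_graph(tf.get_default_graph())   # the graph that built the model
for frame in dump_dir.node_traceback("hidden/MatMul"):  # hypothetical node name
  print(frame)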