Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/client/timeline.py: 19%

328 statements  

« prev     ^ index     » next       coverage.py v7.4.0, created at 2024-01-03 07:57 +0000

1# Copyright 2016 The TensorFlow Authors. All Rights Reserved. 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14# ============================================================================== 

15"""Timeline visualization for TensorFlow using Chrome Trace Format.""" 

16 

17import collections 

18import copy 

19import json 

20import re 

21 

22# The timeline target is usually imported as part of BUILD target 

23# "platform_test", which also includes the "platform" 

24# dependency. This is why the logging import here is okay. 

25from tensorflow.python.platform import build_info 

26from tensorflow.python.platform import tf_logging as logging 

27 

28 

class AllocationMaximum(collections.namedtuple(
    'AllocationMaximum', ('timestamp', 'num_bytes', 'tensors'))):
  """Records the peak allocation observed for one allocator in the timeline.

  Fields:
    timestamp: `tensorflow::Env::NowMicros()` at the moment the peak occurred.
    num_bytes: Total bytes in use at that moment.
    tensors: The set of tensor names live at that moment.
  """

39 

40 

class StepStatsAnalysis(collections.namedtuple(
    'StepStatsAnalysis', ('chrome_trace', 'allocator_maximums'))):
  """Result bundle produced by `Timeline.analyze_step_stats`.

  Fields:
    chrome_trace: The Chrome-trace analysis (a `_ChromeTraceFormatter`).
    allocator_maximums: Dict mapping allocator name to `AllocationMaximum`.
  """

50 

51 

52class _ChromeTraceFormatter(object): 

53 """A helper class for generating traces in Chrome Trace Format.""" 

54 

55 def __init__(self, show_memory=False): 

56 """Constructs a new Chrome Trace formatter.""" 

57 self._show_memory = show_memory 

58 self._events = [] 

59 self._metadata = [] 

60 

61 def _create_event(self, ph, category, name, pid, tid, timestamp): 

62 """Creates a new Chrome Trace event. 

63 

64 For details of the file format, see: 

65 https://github.com/catapult-project/catapult/blob/master/tracing/README.md 

66 

67 Args: 

68 ph: The type of event - usually a single character. 

69 category: The event category as a string. 

70 name: The event name as a string. 

71 pid: Identifier of the process generating this event as an integer. 

72 tid: Identifier of the thread generating this event as an integer. 

73 timestamp: The timestamp of this event as a long integer. 

74 

75 Returns: 

76 A JSON compatible event object. 

77 """ 

78 event = {} 

79 event['ph'] = ph 

80 event['cat'] = category 

81 event['name'] = name 

82 event['pid'] = pid 

83 event['tid'] = tid 

84 event['ts'] = timestamp 

85 return event 

86 

87 def emit_pid(self, name, pid): 

88 """Adds a process metadata event to the trace. 

89 

90 Args: 

91 name: The process name as a string. 

92 pid: Identifier of the process as an integer. 

93 """ 

94 event = {} 

95 event['name'] = 'process_name' 

96 event['ph'] = 'M' 

97 event['pid'] = pid 

98 event['args'] = {'name': name} 

99 self._metadata.append(event) 

100 

101 def emit_tid(self, name, pid, tid): 

102 """Adds a thread metadata event to the trace. 

103 

104 Args: 

105 name: The thread name as a string. 

106 pid: Identifier of the process as an integer. 

107 tid: Identifier of the thread as an integer. 

108 """ 

109 event = {} 

110 event['name'] = 'thread_name' 

111 event['ph'] = 'M' 

112 event['pid'] = pid 

113 event['tid'] = tid 

114 event['args'] = {'name': name} 

115 self._metadata.append(event) 

116 

117 def emit_region(self, timestamp, duration, pid, tid, category, name, args): 

118 """Adds a region event to the trace. 

119 

120 Args: 

121 timestamp: The start timestamp of this region as a long integer. 

122 duration: The duration of this region as a long integer. 

123 pid: Identifier of the process generating this event as an integer. 

124 tid: Identifier of the thread generating this event as an integer. 

125 category: The event category as a string. 

126 name: The event name as a string. 

127 args: A JSON-compatible dictionary of event arguments. 

128 """ 

129 event = self._create_event('X', category, name, pid, tid, timestamp) 

130 event['dur'] = duration 

131 event['args'] = args 

132 self._events.append(event) 

133 

134 def emit_obj_create(self, category, name, timestamp, pid, tid, object_id): 

135 """Adds an object creation event to the trace. 

136 

137 Args: 

138 category: The event category as a string. 

139 name: The event name as a string. 

140 timestamp: The timestamp of this event as a long integer. 

141 pid: Identifier of the process generating this event as an integer. 

142 tid: Identifier of the thread generating this event as an integer. 

143 object_id: Identifier of the object as an integer. 

144 """ 

145 event = self._create_event('N', category, name, pid, tid, timestamp) 

146 event['id'] = object_id 

147 self._events.append(event) 

148 

149 def emit_obj_delete(self, category, name, timestamp, pid, tid, object_id): 

150 """Adds an object deletion event to the trace. 

151 

152 Args: 

153 category: The event category as a string. 

154 name: The event name as a string. 

155 timestamp: The timestamp of this event as a long integer. 

156 pid: Identifier of the process generating this event as an integer. 

157 tid: Identifier of the thread generating this event as an integer. 

158 object_id: Identifier of the object as an integer. 

159 """ 

160 event = self._create_event('D', category, name, pid, tid, timestamp) 

161 event['id'] = object_id 

162 self._events.append(event) 

163 

164 def emit_obj_snapshot(self, category, name, timestamp, pid, tid, object_id, 

165 snapshot): 

166 """Adds an object snapshot event to the trace. 

167 

168 Args: 

169 category: The event category as a string. 

170 name: The event name as a string. 

171 timestamp: The timestamp of this event as a long integer. 

172 pid: Identifier of the process generating this event as an integer. 

173 tid: Identifier of the thread generating this event as an integer. 

174 object_id: Identifier of the object as an integer. 

175 snapshot: A JSON-compatible representation of the object. 

176 """ 

177 event = self._create_event('O', category, name, pid, tid, timestamp) 

178 event['id'] = object_id 

179 event['args'] = {'snapshot': snapshot} 

180 self._events.append(event) 

181 

182 def emit_flow_start(self, name, timestamp, pid, tid, flow_id): 

183 """Adds a flow start event to the trace. 

184 

185 When matched with a flow end event (with the same 'flow_id') this will 

186 cause the trace viewer to draw an arrow between the start and end events. 

187 

188 Args: 

189 name: The event name as a string. 

190 timestamp: The timestamp of this event as a long integer. 

191 pid: Identifier of the process generating this event as an integer. 

192 tid: Identifier of the thread generating this event as an integer. 

193 flow_id: Identifier of the flow as an integer. 

194 """ 

195 event = self._create_event('s', 'DataFlow', name, pid, tid, timestamp) 

196 event['id'] = flow_id 

197 self._events.append(event) 

198 

199 def emit_flow_end(self, name, timestamp, pid, tid, flow_id): 

200 """Adds a flow end event to the trace. 

201 

202 When matched with a flow start event (with the same 'flow_id') this will 

203 cause the trace viewer to draw an arrow between the start and end events. 

204 

205 Args: 

206 name: The event name as a string. 

207 timestamp: The timestamp of this event as a long integer. 

208 pid: Identifier of the process generating this event as an integer. 

209 tid: Identifier of the thread generating this event as an integer. 

210 flow_id: Identifier of the flow as an integer. 

211 """ 

212 event = self._create_event('t', 'DataFlow', name, pid, tid, timestamp) 

213 event['id'] = flow_id 

214 self._events.append(event) 

215 

216 def emit_counter(self, category, name, pid, timestamp, counter, value): 

217 """Emits a record for a single counter. 

218 

219 Args: 

220 category: The event category as a string. 

221 name: The event name as a string. 

222 pid: Identifier of the process generating this event as an integer. 

223 timestamp: The timestamp of this event as a long integer. 

224 counter: Name of the counter as a string. 

225 value: Value of the counter as an integer. 

226 """ 

227 event = self._create_event('C', category, name, pid, 0, timestamp) 

228 event['args'] = {counter: value} 

229 self._events.append(event) 

230 

231 def emit_counters(self, category, name, pid, timestamp, counters): 

232 """Emits a counter record for the dictionary 'counters'. 

233 

234 Args: 

235 category: The event category as a string. 

236 name: The event name as a string. 

237 pid: Identifier of the process generating this event as an integer. 

238 timestamp: The timestamp of this event as a long integer. 

239 counters: Dictionary of counter values. 

240 """ 

241 event = self._create_event('C', category, name, pid, 0, timestamp) 

242 event['args'] = counters.copy() 

243 self._events.append(event) 

244 

245 def format_to_string(self, pretty=False): 

246 """Formats the chrome trace to a string. 

247 

248 Args: 

249 pretty: (Optional.) If True, produce human-readable JSON output. 

250 

251 Returns: 

252 A JSON-formatted string in Chrome Trace format. 

253 """ 

254 trace = {} 

255 trace['traceEvents'] = self._metadata + self._events 

256 if pretty: 

257 return json.dumps(trace, indent=4, separators=(',', ': ')) 

258 else: 

259 return json.dumps(trace, separators=(',', ':')) 

260 

261 

262class _TensorTracker(object): 

263 """An internal class to track the lifetime of a Tensor.""" 

264 

265 def __init__(self, name, object_id, timestamp, pid, allocator, num_bytes): 

266 """Creates an object to track tensor references. 

267 

268 This class is not thread safe and is intended only for internal use by 

269 the 'Timeline' class in this file. 

270 

271 Args: 

272 name: The name of the Tensor as a string. 

273 object_id: Chrome Trace object identifier assigned for this Tensor. 

274 timestamp: The creation timestamp of this event as a long integer. 

275 pid: Process identifier of the associated device, as an integer. 

276 allocator: Name of the allocator used to create the Tensor. 

277 num_bytes: Number of bytes allocated (long integer). 

278 

279 Returns: 

280 A 'TensorTracker' object. 

281 """ 

282 self._name = name 

283 self._pid = pid 

284 self._object_id = object_id 

285 self._create_time = timestamp 

286 self._allocator = allocator 

287 self._num_bytes = num_bytes 

288 self._ref_times = [] 

289 self._unref_times = [] 

290 

291 @property 

292 def name(self): 

293 """Name of this tensor.""" 

294 return self._name 

295 

296 @property 

297 def pid(self): 

298 """ID of the process which created this tensor (an integer).""" 

299 return self._pid 

300 

301 @property 

302 def create_time(self): 

303 """Timestamp when this tensor was created (long integer).""" 

304 return self._create_time 

305 

306 @property 

307 def object_id(self): 

308 """Returns the object identifier of this tensor (integer).""" 

309 return self._object_id 

310 

311 @property 

312 def num_bytes(self): 

313 """Size of this tensor in bytes (long integer).""" 

314 return self._num_bytes 

315 

316 @property 

317 def allocator(self): 

318 """Name of the allocator used to create this tensor (string).""" 

319 return self._allocator 

320 

321 @property 

322 def last_unref(self): 

323 """Last unreference timestamp of this tensor (long integer).""" 

324 return max(self._unref_times) 

325 

326 def add_ref(self, timestamp): 

327 """Adds a reference to this tensor with the specified timestamp. 

328 

329 Args: 

330 timestamp: Timestamp of object reference as an integer. 

331 """ 

332 self._ref_times.append(timestamp) 

333 

334 def add_unref(self, timestamp): 

335 """Adds an unref to this tensor with the specified timestamp. 

336 

337 Args: 

338 timestamp: Timestamp of object unreference as an integer. 

339 """ 

340 self._unref_times.append(timestamp) 

341 

342 

343class Timeline(object): 

344 """A class for visualizing execution timelines of TensorFlow steps.""" 

345 

346 def __init__(self, step_stats, graph=None): 

347 """Constructs a new Timeline. 

348 

349 A 'Timeline' is used for visualizing the execution of a TensorFlow 

350 computation. It shows the timings and concurrency of execution at 

351 the granularity of TensorFlow Ops. 

352 This class is not thread safe. 

353 

354 Args: 

355 step_stats: The 'StepStats' proto recording execution times. 

356 graph: (Optional) The 'Graph' that was executed. 

357 """ 

358 

359 self._origin_step_stats = step_stats 

360 self._step_stats = None 

361 self._graph = graph 

362 self._chrome_trace = _ChromeTraceFormatter() 

363 self._next_pid = 0 

364 self._device_pids = {} # device name -> pid for compute activity. 

365 self._tensor_pids = {} # device name -> pid for tensors. 

366 self._tensors = {} # tensor_name -> TensorTracker 

367 self._next_flow_id = 0 

368 self._flow_starts = {} # tensor_name -> (timestamp, pid, tid) 

369 self._alloc_times = {} # tensor_name -> ( time, allocator, size ) 

370 self._allocator_maximums = {} # allocator name => maximum bytes long 

371 

372 def _alloc_pid(self): 

373 """Allocate a process Id.""" 

374 pid = self._next_pid 

375 self._next_pid += 1 

376 return pid 

377 

378 def _alloc_flow_id(self): 

379 """Allocate a flow Id.""" 

380 flow_id = self._next_flow_id 

381 self._next_flow_id += 1 

382 return flow_id 

383 

384 def _parse_op_label(self, label): 

385 """Parses the fields in a node timeline label.""" 

386 # Expects labels of the form: name = op(arg, arg, ...). 

387 match = re.match(r'(.*) = (.*)\((.*)\)', label) 

388 if match is None: 

389 return 'unknown', 'unknown', [] 

390 nn, op, inputs = match.groups() 

391 if not inputs: 

392 inputs = [] 

393 else: 

394 inputs = inputs.split(', ') 

395 return nn, op, inputs 

396 

397 def _parse_kernel_label(self, label, node_name): 

398 """Parses the fields in a node timeline label.""" 

399 # Expects labels of the form: retval (arg) detail @@annotation 

400 start = label.find('@@') 

401 end = label.find('#') 

402 if start >= 0 and end >= 0 and start + 2 < end: 

403 node_name = label[start + 2:end] 

404 # Node names should always have the form 'name:op'. 

405 fields = node_name.split(':') + ['unknown'] 

406 name, op = fields[:2] 

407 return name, op 

408 

409 def _assign_lanes(self): 

410 """Assigns non-overlapping lanes for the activities on each device.""" 

411 for device_stats in self._step_stats.dev_stats: 

412 # TODO(pbar): Genuine thread IDs in NodeExecStats might be helpful. 

413 lanes = [0] 

414 for ns in device_stats.node_stats: 

415 l = -1 

416 for (i, lts) in enumerate(lanes): 

417 if ns.all_start_micros > lts: 

418 l = i 

419 lanes[l] = ns.all_start_micros + ns.all_end_rel_micros 

420 break 

421 if l < 0: 

422 l = len(lanes) 

423 lanes.append(ns.all_start_micros + ns.all_end_rel_micros) 

424 ns.thread_id = l 

425 

426 def _emit_op(self, nodestats, pid, is_gputrace): 

427 """Generates a Chrome Trace event to show Op execution. 

428 

429 Args: 

430 nodestats: The 'NodeExecStats' proto recording op execution. 

431 pid: The pid assigned for the device where this op ran. 

432 is_gputrace: If True then this op came from the GPUTracer. 

433 """ 

434 node_name = nodestats.node_name 

435 start = nodestats.all_start_micros 

436 duration = nodestats.all_end_rel_micros 

437 tid = nodestats.thread_id 

438 inputs = [] 

439 if is_gputrace: 

440 node_name, op = self._parse_kernel_label(nodestats.timeline_label, 

441 node_name) 

442 elif node_name == 'RecvTensor': 

443 # RPC tracing does not use the standard timeline_label format. 

444 op = 'RecvTensor' 

445 else: 

446 _, op, inputs = self._parse_op_label(nodestats.timeline_label) 

447 args = {'name': node_name, 'op': op} 

448 if build_info.build_info['is_rocm_build']: 

449 args['kernel'] = nodestats.timeline_label.split('@@')[0] 

450 for i, iname in enumerate(inputs): 

451 args['input%d' % i] = iname 

452 self._chrome_trace.emit_region(start, duration, pid, tid, 'Op', op, args) 

453 

454 def _emit_tensor_snapshot(self, tensor, timestamp, pid, tid, value): 

455 """Generate Chrome Trace snapshot event for a computed Tensor. 

456 

457 Args: 

458 tensor: A 'TensorTracker' object. 

459 timestamp: The timestamp of this snapshot as a long integer. 

460 pid: The pid assigned for showing the device where this op ran. 

461 tid: The tid of the thread computing the tensor snapshot. 

462 value: A JSON-compliant snapshot of the object. 

463 """ 

464 desc = str(value.tensor_description).replace('"', '') 

465 snapshot = {'tensor_description': desc} 

466 self._chrome_trace.emit_obj_snapshot('Tensor', tensor.name, timestamp, pid, 

467 tid, tensor.object_id, snapshot) 

468 

469 def _produce_tensor(self, name, timestamp, tensors_pid, allocator, num_bytes): 

470 object_id = len(self._tensors) 

471 tensor = _TensorTracker(name, object_id, timestamp, tensors_pid, allocator, 

472 num_bytes) 

473 self._tensors[name] = tensor 

474 return tensor 

475 

476 def _is_gputrace_device(self, device_name): 

477 """Returns true if this device is part of the GPUTracer logging.""" 

478 return '/stream:' in device_name or '/memcpy' in device_name 

479 

480 def _allocate_pids(self): 

481 """Allocate fake process ids for each device in the StepStats.""" 

482 self._allocators_pid = self._alloc_pid() 

483 self._chrome_trace.emit_pid('Allocators', self._allocators_pid) 

484 

485 # Add processes in the Chrome trace to show compute and data activity. 

486 for dev_stats in self._step_stats.dev_stats: 

487 device_pid = self._alloc_pid() 

488 self._device_pids[dev_stats.device] = device_pid 

489 tensors_pid = self._alloc_pid() 

490 self._tensor_pids[dev_stats.device] = tensors_pid 

491 self._chrome_trace.emit_pid(dev_stats.device + ' Compute', device_pid) 

492 self._chrome_trace.emit_pid(dev_stats.device + ' Tensors', tensors_pid) 

493 

494 def _analyze_tensors(self, show_memory): 

495 """Analyze tensor references to track dataflow.""" 

496 for dev_stats in self._step_stats.dev_stats: 

497 device_pid = self._device_pids[dev_stats.device] 

498 tensors_pid = self._tensor_pids[dev_stats.device] 

499 for node_stats in dev_stats.node_stats: 

500 tid = node_stats.thread_id 

501 node_name = node_stats.node_name 

502 start_time = node_stats.all_start_micros 

503 end_time = node_stats.all_start_micros + node_stats.all_end_rel_micros 

504 for index, output in enumerate(node_stats.output): 

505 if index: 

506 output_name = '%s:%d' % (node_name, index) 

507 else: 

508 output_name = node_name 

509 

510 allocation = output.tensor_description.allocation_description 

511 num_bytes = allocation.requested_bytes 

512 allocator_name = allocation.allocator_name 

513 tensor = self._produce_tensor(output_name, start_time, tensors_pid, 

514 allocator_name, num_bytes) 

515 tensor.add_ref(start_time) 

516 tensor.add_unref(end_time) 

517 self._flow_starts[output_name] = (end_time, device_pid, tid) 

518 

519 if show_memory: 

520 self._chrome_trace.emit_obj_create('Tensor', output_name, 

521 start_time, tensors_pid, tid, 

522 tensor.object_id) 

523 self._emit_tensor_snapshot(tensor, end_time - 1, tensors_pid, tid, 

524 output) 

525 

526 def _show_compute(self, show_dataflow): 

527 """Visualize the computation activity.""" 

528 for dev_stats in self._step_stats.dev_stats: 

529 device_name = dev_stats.device 

530 device_pid = self._device_pids[device_name] 

531 is_gputrace = self._is_gputrace_device(device_name) 

532 

533 for node_stats in dev_stats.node_stats: 

534 tid = node_stats.thread_id 

535 start_time = node_stats.all_start_micros 

536 end_time = node_stats.all_start_micros + node_stats.all_end_rel_micros 

537 self._emit_op(node_stats, device_pid, is_gputrace) 

538 

539 if is_gputrace or node_stats.node_name == 'RecvTensor': 

540 continue 

541 

542 _, _, inputs = self._parse_op_label(node_stats.timeline_label) 

543 for input_name in inputs: 

544 if input_name not in self._tensors: 

545 # This can happen when partitioning has inserted a Send/Recv. 

546 # We remove the numeric suffix so that the dataflow appears to 

547 # come from the original node. Ideally, the StepStats would 

548 # contain logging for the Send and Recv nodes. 

549 index = input_name.rfind('/_') 

550 if index > 0: 

551 input_name = input_name[:index] 

552 

553 if input_name in self._tensors: 

554 tensor = self._tensors[input_name] 

555 tensor.add_ref(start_time) 

556 tensor.add_unref(end_time - 1) 

557 

558 if show_dataflow: 

559 # We use a different flow ID for every graph edge. 

560 create_time, create_pid, create_tid = self._flow_starts[ 

561 input_name] 

562 # Don't add flows when producer and consumer ops are on the same 

563 # pid/tid since the horizontal arrows clutter the visualization. 

564 if create_pid != device_pid or create_tid != tid: 

565 flow_id = self._alloc_flow_id() 

566 self._chrome_trace.emit_flow_start(input_name, create_time, 

567 create_pid, create_tid, 

568 flow_id) 

569 self._chrome_trace.emit_flow_end(input_name, start_time, 

570 device_pid, tid, flow_id) 

571 else: 

572 logging.vlog(1, 'Can\'t find tensor %s - removed by CSE?', 

573 input_name) 

574 

575 def _show_memory_counters(self): 

576 """Produce a counter series for each memory allocator.""" 

577 # Iterate over all tensor trackers to build a list of allocations and 

578 # frees for each allocator. Then sort the lists and emit a cumulative 

579 # counter series for each allocator. 

580 allocations = {} 

581 for name in self._tensors: 

582 tensor = self._tensors[name] 

583 self._chrome_trace.emit_obj_delete('Tensor', name, tensor.last_unref, 

584 tensor.pid, 0, tensor.object_id) 

585 allocator = tensor.allocator 

586 if allocator not in allocations: 

587 allocations[allocator] = [] 

588 num_bytes = tensor.num_bytes 

589 allocations[allocator].append((tensor.create_time, num_bytes, name)) 

590 allocations[allocator].append((tensor.last_unref, -num_bytes, name)) 

591 

592 alloc_maxes = {} 

593 

594 # Generate a counter series showing total allocations for each allocator. 

595 for allocator in allocations: 

596 alloc_list = allocations[allocator] 

597 alloc_list.sort() 

598 total_bytes = 0 

599 alloc_tensor_set = set() 

600 alloc_maxes[allocator] = AllocationMaximum( 

601 timestamp=0, num_bytes=0, tensors=set()) 

602 for time, num_bytes, name in sorted( 

603 alloc_list, key=lambda allocation: allocation[0]): 

604 total_bytes += num_bytes 

605 if num_bytes < 0: 

606 alloc_tensor_set.discard(name) 

607 else: 

608 alloc_tensor_set.add(name) 

609 

610 if total_bytes > alloc_maxes[allocator].num_bytes: 

611 alloc_maxes[allocator] = AllocationMaximum( 

612 timestamp=time, 

613 num_bytes=total_bytes, 

614 tensors=copy.deepcopy(alloc_tensor_set)) 

615 

616 self._chrome_trace.emit_counter('Memory', allocator, 

617 self._allocators_pid, time, allocator, 

618 total_bytes) 

619 self._allocator_maximums = alloc_maxes 

620 

621 def _preprocess_op_time(self, op_time): 

622 """Update the start and end time of ops in step stats. 

623 

624 Args: 

625 op_time: How the execution time of op is shown in timeline. Possible values 

626 are "schedule", "gpu" and "all". "schedule" will show op from the time it 

627 is scheduled to the end of the scheduling. Notice by the end of its 

628 scheduling its async kernels may not start yet. It is shown using the 

629 default value from step_stats. "gpu" will show op with the execution time 

630 of its kernels on GPU. "all" will show op from the start of its scheduling 

631 to the end of its last kernel. 

632 """ 

633 if op_time == 'schedule': 

634 self._step_stats = self._origin_step_stats 

635 return 

636 self._step_stats = copy.deepcopy(self._origin_step_stats) 

637 # Separate job task and gpu tracer stream 

638 stream_all_stats = [] 

639 job_stats = [] 

640 for stats in self._step_stats.dev_stats: 

641 if '/stream:all' in stats.device: 

642 stream_all_stats.append(stats) 

643 elif '/job' in stats.device: 

644 job_stats.append(stats) 

645 

646 # Record the start time of the first kernel and the end time of 

647 # the last gpu kernel for all ops. 

648 op_gpu_start = {} 

649 op_gpu_end = {} 

650 for stats in stream_all_stats: 

651 for kernel in stats.node_stats: 

652 name, _ = self._parse_kernel_label(kernel.timeline_label, 

653 kernel.node_name) 

654 start = kernel.all_start_micros 

655 end = kernel.all_start_micros + kernel.all_end_rel_micros 

656 if name in op_gpu_start: 

657 op_gpu_start[name] = min(op_gpu_start[name], start) 

658 op_gpu_end[name] = max(op_gpu_end[name], end) 

659 else: 

660 op_gpu_start[name] = start 

661 op_gpu_end[name] = end 

662 

663 # Update the start and end time of each op according to the op_time 

664 for stats in job_stats: 

665 for op in stats.node_stats: 

666 if op.node_name in op_gpu_start: 

667 end = max(op_gpu_end[op.node_name], 

668 op.all_start_micros + op.all_end_rel_micros) 

669 if op_time == 'gpu': 

670 op.all_start_micros = op_gpu_start[op.node_name] 

671 op.all_end_rel_micros = end - op.all_start_micros 

672 

673 def analyze_step_stats(self, 

674 show_dataflow=True, 

675 show_memory=True, 

676 op_time='schedule'): 

677 """Analyze the step stats and format it into Chrome Trace Format. 

678 

679 Args: 

680 show_dataflow: (Optional.) If True, add flow events to the trace 

681 connecting producers and consumers of tensors. 

682 show_memory: (Optional.) If True, add object snapshot events to the trace 

683 showing the sizes and lifetimes of tensors. 

684 op_time: (Optional.) How the execution time of op is shown in timeline. 

685 Possible values are "schedule", "gpu" and "all". "schedule" will show op 

686 from the time it is scheduled to the end of the scheduling. Notice by 

687 the end of its scheduling its async kernels may not start yet. It is 

688 shown using the default value from step_stats. "gpu" will show op with 

689 the execution time of its kernels on GPU. "all" will show op from the 

690 start of its scheduling to the end of its last kernel. 

691 

692 Returns: 

693 A 'StepStatsAnalysis' object. 

694 """ 

695 self._preprocess_op_time(op_time) 

696 self._allocate_pids() 

697 self._assign_lanes() 

698 self._analyze_tensors(show_memory) 

699 self._show_compute(show_dataflow) 

700 if show_memory: 

701 self._show_memory_counters() 

702 return StepStatsAnalysis( 

703 chrome_trace=self._chrome_trace, 

704 allocator_maximums=self._allocator_maximums) 

705 

706 def generate_chrome_trace_format(self, 

707 show_dataflow=True, 

708 show_memory=False, 

709 op_time='schedule'): 

710 """Produces a trace in Chrome Trace Format. 

711 

712 Args: 

713 show_dataflow: (Optional.) If True, add flow events to the trace 

714 connecting producers and consumers of tensors. 

715 show_memory: (Optional.) If True, add object snapshot events to the trace 

716 showing the sizes and lifetimes of tensors. 

717 op_time: (Optional.) How the execution time of op is shown in timeline. 

718 Possible values are "schedule", "gpu" and "all". 

719 "schedule" will show op from the time it is scheduled to the end of 

720 the scheduling. 

721 Notice by the end of its scheduling its async kernels may not start 

722 yet. It is shown using the default value from step_stats. 

723 "gpu" will show op with the execution time of its kernels on GPU. 

724 "all" will show op from the start of its scheduling to the end of 

725 its last kernel. 

726 

727 Returns: 

728 A JSON formatted string in Chrome Trace format. 

729 """ 

730 step_stats_analysis = self.analyze_step_stats( 

731 show_dataflow=show_dataflow, show_memory=show_memory, op_time=op_time) 

732 

733 return step_stats_analysis.chrome_trace.format_to_string(pretty=True)