Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/eager/backprop.py: 20%

416 statements  

« prev     ^ index     » next       coverage.py v7.4.0, created at 2024-01-03 07:57 +0000

1# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14# ============================================================================== 

15"""Code for backpropagation using the tape utilities.""" 

16 

17# TODO(b/159343581): Properly support CompositeTensor in all functions in this 

18# file. 

19 

20import functools 

21import operator 

22 

23from tensorflow.python import pywrap_tfe 

24from tensorflow.python.eager import backprop_util 

25from tensorflow.python.eager import context 

26from tensorflow.python.eager import execute 

27from tensorflow.python.eager import imperative_grad 

28from tensorflow.python.eager import tape 

29from tensorflow.python.framework import composite_tensor 

30from tensorflow.python.framework import composite_tensor_gradient 

31from tensorflow.python.framework import constant_op 

32from tensorflow.python.framework import dtypes 

33from tensorflow.python.framework import indexed_slices 

34from tensorflow.python.framework import ops 

35from tensorflow.python.framework import tensor_shape 

36from tensorflow.python.framework import tensor_util 

37from tensorflow.python.framework import type_spec 

38from tensorflow.python.ops import array_ops 

39from tensorflow.python.ops import check_ops 

40from tensorflow.python.ops import control_flow_util 

41from tensorflow.python.ops import default_gradient 

42from tensorflow.python.ops import gen_array_ops 

43from tensorflow.python.ops import gen_math_ops 

44from tensorflow.python.ops import resource_variable_ops 

45from tensorflow.python.ops.parallel_for import control_flow_ops as pfor_ops 

46from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients 

47from tensorflow.python.platform import tf_logging as logging 

48from tensorflow.python.util import _pywrap_utils 

49from tensorflow.python.util import nest 

50from tensorflow.python.util import tf_contextlib 

51from tensorflow.python.util import tf_inspect 

52from tensorflow.python.util import variable_utils 

53from tensorflow.python.util.tf_export import tf_export 

54 

55 

# Memoizes attr-type lookups, keyed by (op_type, attr_name).
_op_attr_type_cache = {}


def op_attr_type(op_type, attr_name):
  """Returns the type of attribute `attr_name` on op `op_type`, with caching.

  Args:
    op_type: the op type used for the lookup.
    attr_name: the name of the attribute whose type is wanted.

  Returns:
    The value reported by `pywrap_tfe.TFE_OpNameGetAttrType` for the pair.
    Repeated lookups of the same pair are served from the module-level cache.
  """
  cache_key = (op_type, attr_name)
  if cache_key in _op_attr_type_cache:
    return _op_attr_type_cache[cache_key]
  # Cache miss: the lookup goes through the context handle, so make sure the
  # context has been initialized first.
  context.ensure_initialized()
  handle = context.context()._handle  # pylint: disable=protected-access
  resolved = pywrap_tfe.TFE_OpNameGetAttrType(handle, op_type, attr_name)
  _op_attr_type_cache[cache_key] = resolved
  return resolved

68 

69 

def make_attr(attr_type, value):
  """Converts a raw attribute `value` according to `attr_type`.

  Args:
    attr_type: an integer attr-type code, or a one-element list holding such a
      code when the attribute is a list.
    value: the raw attribute value (or sequence of values for list attrs).

  Returns:
    A dtype (or list of dtypes) for type attrs, a shape proto (or list of shape
    protos) for shape attrs, and otherwise `value` with every `str` leaf
    encoded via `str.encode()`.
  """
  # pybind11 enums do not return the raw value like SWIG enums do. They are
  # useful when comparing amongst each other but not direct integers as we are
  # doing in most tests.
  # https://pybind11.readthedocs.io/en/stable/classes.html#enumerations-and-internal-types
  # TODO(amitpatankar): After all SWIG transitions, convert the enum comparisons
  # from integer value to class.
  dtype_code = int(pywrap_tfe.TF_ATTR_TYPE)
  if attr_type == dtype_code:
    return dtypes.as_dtype(value)
  if attr_type == [dtype_code]:
    return [dtypes.as_dtype(item) for item in value]

  shape_code = int(pywrap_tfe.TF_ATTR_SHAPE)
  if attr_type == shape_code:
    return tensor_shape.as_shape(value).as_proto()
  if attr_type == [shape_code]:
    return [tensor_shape.as_shape(item).as_proto() for item in value]

  def _encode_if_str(leaf):
    return leaf.encode() if isinstance(leaf, str) else leaf

  return nest.map_structure(_encode_if_str, value)

88 

89 

class _MockOp:
  """Pretends to be a tf.Operation for the gradient functions."""

  def __init__(self, attrs, inputs, outputs, typ, skip_input_indices):
    """Stores the op description that gradient functions read.

    Args:
      attrs: flat sequence of alternating attribute names and values.
      inputs: inputs of the op.
      outputs: outputs of the op.
      typ: the op type; exposed as `self.type`.
      skip_input_indices: stored as-is for gradient functions to consult.
    """
    self.attrs = attrs
    self.inputs = inputs
    self.outputs = outputs
    self.type = typ
    self.skip_input_indices = skip_input_indices

  def get_attr(self, attr):
    """Returns the converted value of attribute `attr`.

    Raises:
      KeyError: if `attr` is not among the stored attribute names.
    """
    attr_type = op_attr_type(self.type, attr)
    # Names live at even positions; each value follows its name.
    matching_positions = (
        pos for pos in range(0, len(self.attrs), 2) if self.attrs[pos] == attr)
    first_match = next(matching_positions, None)
    if first_match is None:
      raise KeyError(attr)
    return make_attr(attr_type, self.attrs[first_match + 1])

  def _get_control_flow_context(self):
    """Always raises: graph control flow is not supported by this mock."""
    raise NotImplementedError(
        "tf.GradientTape.gradients() does not support graph control flow "
        "operations like tf.cond or tf.while at this time. Use tf.gradients() "
        "instead. If you need this feature, please file a feature request at "
        "https://github.com/tensorflow/tensorflow/issues/new"
    )

114 

115 

def _gradient_function(op_name, attr_tuple, num_inputs, inputs, outputs,
                       out_grads, skip_input_indices, forward_pass_name_scope):
  """Calls the gradient function of the op.

  Args:
    op_name: the name of the op to be differentiated.
    attr_tuple: the attrs, as a tuple.
    num_inputs: the number of inputs to the op.
    inputs: inputs to the original operation.
    outputs: outputs to the original operation.
    out_grads: gradients of the operation wrt its outputs.
    skip_input_indices: a tuple that is passed to the gradient function,
      indicating which inputs to skip calculating the gradient for
    forward_pass_name_scope: the namescope of the op in the forward pass.

  Returns:
    The gradients with respect to the inputs of the function, as a list. When
    no gradient function is registered for `op_name`, a list of `num_inputs`
    `None` entries.
  """
  mock_op = _MockOp(attr_tuple, inputs, outputs, op_name, skip_input_indices)
  grad_fn = ops._gradient_registry.lookup(op_name)  # pylint: disable=protected-access
  if grad_fn is None:
    return [None] * num_inputs

  # This does not work with v1 TensorArrays.
  wrap_in_name_scope = (
      ops.executing_eagerly_outside_functions() or
      control_flow_util.EnableControlFlowV2(ops.get_default_graph()))
  if not wrap_in_name_scope:
    return grad_fn(mock_op, *out_grads)

  # Nest the backward ops under "gradient_tape/<forward scope>/".
  scope_parts = ["gradient_tape/"]
  if forward_pass_name_scope:
    scope_parts.append(forward_pass_name_scope + "/")
  with ops.name_scope("".join(scope_parts)):
    return grad_fn(mock_op, *out_grads)


# Hand the Python gradient dispatcher to the pywrap_tfe bindings.
pywrap_tfe.TFE_Py_RegisterGradientFunction(_gradient_function)

152 

153 

def _must_record_gradient():
  """Returns True when the tape set is non-empty, False otherwise."""
  tape_set_is_empty = pywrap_tfe.TFE_Py_TapeSetIsEmpty()
  return not tape_set_is_empty

156 

157 

@tf_export("__internal__.record_gradient", v1=[])
def record_gradient(op_name, inputs, attrs, outputs):
  """Explicitly record the gradient for a given op.

  The current name scope (`ops.get_name_scope()`) is recorded together with
  the op.

  Args:
    op_name: The op name as listed in the `OpDef` for the op.
    inputs: A list of tensor inputs to the op.
    attrs: The op attributes as a flattened list of alternating attribute names
      and attribute values.
    outputs: A list of tensor outputs from the op.
  """
  current_scope = ops.get_name_scope()
  pywrap_tfe.TFE_Py_RecordGradient(op_name, inputs, attrs, outputs,
                                   current_scope)

171 

172 

# Install this module's implementations as the hooks on `execute`.
execute.must_record_gradient = _must_record_gradient
execute.record_gradient = record_gradient

175 

176 

def implicit_val_and_grad(f):
  """Returns a function which differentiates f with respect to variables.

  The wrapped function returns the value and the gradient of f when called with
  the same arguments. The gradient is with respect to all trainable TFE
  variables accessed by `f`.

  This function is useful when the exact set of variables to differentiate with
  is not known ahead of time.

  Example:

  ```python
  dense_layer = tf.compat.v1.layers.Dense(1)
  def loss(x, y):
    return tf.reduce_sum(tf.square(dense_layer(x) - y))

  # Obtain the gradient function.
  val_grad_fn = tfe.implicit_value_and_gradients(loss)

  # Invoke the gradient function with concrete values of x and y.
  x = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
  y = tf.constant([[10.0], [20.0]])
  value, grads_and_vars = val_grad_fn(x, y)
  print('Value of loss: %s' % value)

  # Apply the gradients to Variables.
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.1)
  optimizer.apply_gradients(grads_and_vars)
  ```

  Args:
    f: function to be differentiated. If `f` returns a scalar, this scalar will
      be differentiated. If `f` returns a tensor or list of tensors, by default
      a scalar will be computed by adding all their values to produce a single
      scalar.

  Returns:
    A function which, when called, returns a tuple pair.
    Its first element is the value to which the function evaluates.
    Its second element is list of (gradient, variable) pairs.

  Raises:
    ValueError: if `f` returns None, if no variables were watched by the tape
      while `f` ran, or if any watched variable's handle is packed.
  """
  # TODO(cais): Remove calls to tf.constant() once the gradients functions
  # accept lists and np.ndarrays.

  def grad_fn(*args, **kwds):
    """Computes the gradient of the wrapped function."""
    this_tape = tape.push_new_tape()
    try:
      end_node = f(*args, **kwds)
      if end_node is None:
        # Callable objects (e.g. functools.partial) need not define
        # `__name__`; fall back to repr so the ValueError is still raised
        # instead of an AttributeError.
        raise ValueError("Cannot differentiate a function that returns None; "
                         "did you forget to return a value from {}?".format(
                             getattr(f, "__name__", repr(f))))
    finally:
      # Always pop the tape, even when `f` raises.
      tape.pop_tape(this_tape)
    # Note: variables are returned in construction order. This ensures unique
    # order across executions.
    variables = this_tape.watched_variables()
    if not variables:
      raise ValueError("No trainable variables were accessed while the "
                       "function was being computed.")

    sources = [v.handle for v in variables]
    for s in sources:
      if getattr(s, "is_packed", False):
        raise ValueError(
            "GradientTape.gradient is not supported on packed EagerTensors yet."
        )
    grad = imperative_grad.imperative_grad(this_tape, nest.flatten(end_node),
                                           sources)
    return end_node, list(zip(grad, variables))

  return grad_fn

254 

255 

def implicit_grad(f):
  """Returns a function which differentiates f with respect to variables.

  The wrapped function returns the gradient of f when called with the same
  arguments. The gradient is with respect to all trainable TFE variables
  accessed by `f`.

  This function is useful when the exact set of variables to differentiate with
  is not known ahead of time.

  Example:

  ```python
  dense_layer = tf.compat.v1.layers.Dense(1)
  def loss(x, y):
    return tf.reduce_sum(tf.square(dense_layer(x) - y))

  # Obtain the gradient function.
  grad_fn = tfe.implicit_gradients(loss)

  # Invoke the gradient function with concrete values of x and y.
  x = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
  y = tf.constant([[10.0], [20.0]])
  grads_and_vars = grad_fn(x, y)

  # Apply the gradients to Variables.
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.1)
  optimizer.apply_gradients(grads_and_vars)
  ```

  Args:
    f: function to be differentiated. If `f` returns a scalar, this scalar will
      be differentiated. If `f` returns a tensor or list of tensors, by default
      a scalar will be computed by adding all their values to produce a single
      scalar.

  Returns:
    A function which, when called, returns a list of (gradient, variable) pairs.
  """
  # TODO(cais): Remove calls to tf.constant() once the gradients functions
  # accept lists and np.ndarrays.

  def grad_fn(*args, **kwds):
    """Computes the gradient of the wrapped function."""
    # Delegate to the value-and-gradient wrapper and drop the value.
    value_and_grads = implicit_val_and_grad(f)(*args, **kwds)
    return value_and_grads[1]

  return grad_fn

303 

304 

def _get_arg_spec(f, params, param_args):
  """The positions of the parameters of f to be differentiated in param_args.

  Args:
    f: the callable whose argument names are inspected.
    params: None, a collection of argument names, or a collection of integer
      positions.
    param_args: the positional arguments of the call; only its length is used,
      as a fallback when `f` has no inspectable named arguments.

  Returns:
    A range or list of integer positions.

  Raises:
    ValueError: if `params` mixes names and integers, or if `f` cannot be
      inspected and `params` is not None or all integers.
  """
  try:
    arg_names = tf_inspect.getfullargspec(f).args
  except TypeError as e:
    # TypeError can happen when f is a callable object.
    if params is None:
      return range(len(param_args))
    if all(isinstance(p, int) for p in params):
      return params
    raise ValueError("Either callable provided is not a function or could not "
                     "inspect its arguments by name: %s. Original error: %s"
                     % (f, e))

  if params is None:
    if not arg_names:
      return range(len(param_args))
    # A leading `self` reduces the number of positions by one.
    self_offset = 1 if arg_names[0] == "self" else 0
    return range(len(arg_names) - self_offset)

  if all(isinstance(p, str) for p in params):
    return [arg_names.index(name) for name in params]
  if all(isinstance(p, int) for p in params):
    return params
  raise ValueError(
      "params must be all strings or all integers; got %s." % params)

332 

333 

def gradients_function(f, params=None):
  """Returns a function which differentiates f with respect to params.

  Example:
  ```python
  # f(x, y) = (x ^ 3) * y - x * (y ^ 2)
  # Therefore, the 1st order derivatives are:
  #   df / dx = 3 * (x ^ 2) * y - y ^ 2
  #   df / dy = x ^ 3 - 2 * x * y
  # The 2nd order derivatives with respect to x is:
  #   d^2 f / (dx)^2 = 6 * x * y
  def f(x, y):
    return x * x * x * y - x * y * y

  # Obtain a function that returns 1st order gradients.
  grad_fn = tfe.gradients_function(f)

  x = 2.0
  y = 3.0

  # Invoke the 1st order gradient function.
  x_grad, y_grad = grad_fn(x, y)
  assert x_grad.numpy() == 3 * (2 ** 2) * 3 - 3 ** 2
  assert y_grad.numpy() == (2 ** 3) - 2 * 2 * 3

  # Obtain a function that returns the 2nd order gradient with respect to x.
  gradgrad_fn = tfe.gradients_function(lambda x, y: grad_fn(x, y)[0])

  # Invoke the 2nd order gradient function.
  x_gradgrad = gradgrad_fn(x, y)[0]
  assert x_gradgrad.numpy() == 6 * 2 * 3

  # To obtain a callable that returns the gradient(s) of `f` with respect to a
  # subset of its inputs, use the `params` keyword argument with
  # `gradients_function()`.
  ygrad_fn = tfe.gradients_function(f, params=[1])

  (y_grad,) = ygrad_fn(x, y)
  assert y_grad.numpy() == (2 ** 3) - 2 * 2 * 3
  ```

  Note that only tensors with real or complex dtypes are differentiable.

  Args:
    f: function to be differentiated. If `f` returns a scalar, this scalar will
      be differentiated. If `f` returns a tensor or list of tensors, by default
      a scalar will be computed by adding all their values to produce a single
      scalar. If desired, the tensors can be elementwise multiplied by the
      tensors passed as the `dy` keyword argument to the returned gradient
      function.
    params: list of parameter names of f or list of integers indexing the
      parameters with respect to which we'll differentiate. Passing None
      differentiates with respect to all parameters.

  Returns:
    function which, when called, returns the gradient of `f` with respect to
    all of `params` (the value of `f` is computed but discarded). The function
    takes an extra optional keyword argument `dy`. Setting it allows
    computation of vector jacobian products for vectors other than the vector
    of ones.

  Raises:
    ValueError: if the params are not all strings or all integers.
  """

  def decorated(*args, **kwds):
    """Computes the gradient of the decorated function."""
    value_and_grads = val_and_grad_function(f, params=params)(*args, **kwds)
    return value_and_grads[1]

  return decorated

405 

406 

def _ensure_unique_tensor_objects(parameter_positions, args):
  """Make each of the parameter_positions in args a unique ops.Tensor object.

  Ensure that each parameter is treated independently.
  For example:

  def f(x, y): return x * y
  g = gradients_function(f)
  one = tf.constant(1.)

  g(one, one) should return [1., 1.]
  (even though the two arguments are the same Tensor object).

  Args:
    parameter_positions: List of indices into args defining the arguments to
      differentiate against.
    args: A list of arguments to the function to be differentiated.

  Returns:
    args, possibly edited in-place.
  """
  seen_ids = set()
  for position, candidate in enumerate(args):
    if position not in parameter_positions:
      continue
    tensor_key = ops.tensor_id(candidate)
    if tensor_key not in seen_ids:
      seen_ids.add(tensor_key)
      continue
    # Same tensor id seen at an earlier parameter position: swap in the
    # result of an identity op for this position.
    args[position] = gen_array_ops.identity(candidate)
  return args

437 

438 

def val_and_grad_function(f, params=None):
  """Returns a function that computes f and its derivative w.r.t. params.

  Example:
  ```python
  # f(x, y) = (x ^ 3) * y - x * (y ^ 2)
  # Therefore, the 1st order derivatives are:
  #   df / dx = 3 * (x ^ 2) * y - y ^ 2
  #   df / dy = x ^ 3 - 2 * x * y
  def f(x, y):
    return x * x * x * y - x * y * y

  # Obtain a function that returns the function value and the 1st order
  # gradients.
  val_grads_fn = tfe.value_and_gradients_function(f)

  x = 2.0
  y = 3.0

  # Invoke the value-and-gradients function.
  f_val, (x_grad, y_grad) = val_grads_fn(x, y)
  assert f_val.numpy() == (2 ** 3) * 3 - 2 * (3 ** 2)
  assert x_grad.numpy() == 3 * (2 ** 2) * 3 - 3 ** 2
  assert y_grad.numpy() == (2 ** 3) - 2 * 2 * 3

  # To obtain a callable that returns the value of `f` and the gradient(s) of
  # `f` with respect to a subset of its inputs, use the `params` keyword
  # argument with `value_and_gradients_function()`.
  val_ygrad_fn = tfe.value_and_gradients_function(f, params=[1])

  f_val, (y_grad,) = val_ygrad_fn(x, y)
  assert f_val.numpy() == (2 ** 3) * 3 - 2 * (3 ** 2)
  assert y_grad.numpy() == (2 ** 3) - 2 * 2 * 3
  ```

  Args:
    f: function to be differentiated. If `f` returns a scalar, this scalar will
      be differentiated. If `f` returns a tensor or list of tensors, by default
      a scalar will be computed by adding all their values to produce a single
      scalar. If desired, the tensors can be elementwise multiplied by the
      tensors passed as the `dy` keyword argument to the returned gradient
      function.
    params: list of parameter names of f or list of integers indexing the
      parameters with respect to which we'll differentiate. Passing `None`
      differentiates with respect to all parameters.

  Returns:
    function which, when called, returns the value of f and the gradient
    of f with respect to all of `params`. The function takes an extra optional
    keyword argument "dy". Setting it allows computation of vector jacobian
    products for vectors other than the vector of ones.

  Raises:
    ValueError: if the params are not all strings or all integers.
  """

  def decorated(*args, **kwds):
    """Computes the value and gradient of the decorated function."""
    # `dy` is the only keyword the wrapper itself understands.
    output_grads = kwds.pop("dy", None)
    if kwds:
      raise ValueError("Functions to be differentiated cannot "
                       "receive keyword arguments.")
    value, vjp_fn = make_vjp(f, params)(*args)
    return value, vjp_fn(dy=output_grads)

  return decorated

505 

506 

def make_vjp(f, params=None, persistent=True):
  """Returns a function that computes f and its vjp w.r.t. params.

  The term "vjp" here is an abbreviation for vector-jacobian product.

  Args:
    f: the function to be differentiated.
    params: the parameters (numbers or names) to differentiate with respect to.
      A value of None will differentiate with respect to all parameters.
    persistent: Boolean controlling whether the VJP function can be re-used.
      Must be True or False.

  Returns:
    A function, which when called, returns a tuple (value, vjp), where:
    - value is the result of calling f.
    - vjp is a function, which takes a vector as an argument and
      returns the product of that vector with the Jacobian of f.
      Providing no argument to vjp is equivalent to providing a
      vector of ones.

    For example,
    ```python
    def f(x):
      return x * x

    wrapped_fn = tfe.make_vjp(f)
    result, vjp = wrapped_fn(tf.constant(3.0))
    # result is 9.0
    vjp()  # the vjp function returns 6.0
    ```

  Raises:
    ValueError: if `f` returns None, or if an argument being differentiated is
      a packed tensor.
  """

  def decorated(*args, **kwds):
    """Computes the value and gradient of the decorated function."""
    parameter_positions = _get_arg_spec(f, params, args)
    assert not kwds, "The gradient function can't take keyword arguments."
    this_tape = tape.push_new_tape(persistent=persistent)
    try:
      sources = []
      # Only the arguments being differentiated are converted to tensors.
      args = [
          ops.convert_to_tensor(arg) if i in parameter_positions else arg
          for i, arg in enumerate(args)
      ]
      args = _ensure_unique_tensor_objects(parameter_positions, args)
      for i in parameter_positions:
        if getattr(args[i], "is_packed", False):
          # The literals previously concatenated to "EagerTensorsyet.".
          raise ValueError(
              "GradientTape.gradient is not supported on packed EagerTensors "
              "yet.")
        sources.append(args[i])
        tape.watch(this_tape, args[i])
      result = f(*args)
      if result is None:
        # Callable objects (e.g. functools.partial) need not define
        # `__name__`; fall back to repr so the ValueError is still raised
        # instead of an AttributeError.
        raise ValueError("Cannot differentiate a function that returns None; "
                         "did you forget to return a value from {}?".format(
                             getattr(f, "__name__", repr(f))))
      flat_result = nest.flatten(result)
      # Pass every output through an identity op while the tape is recording.
      flat_result = [gen_array_ops.identity(x) for x in flat_result]
      result = nest.pack_sequence_as(result, flat_result)
    finally:
      # Always pop the tape, even when `f` raises.
      tape.pop_tape(this_tape)

    def vjp(dy=None):
      if dy is not None:
        dy = [ops.convert_to_tensor(x) for x in nest.flatten(dy)]
      return imperative_grad.imperative_grad(
          this_tape, nest.flatten(result), sources, output_gradients=dy)

    return result, vjp

  return decorated

581 

582 

def _aggregate_grads(gradients):
  """Aggregate gradients from multiple sources.

  Args:
    gradients: A list of 'Tensor' or 'IndexedSlices' gradients.

  Returns:
    If 'gradients' only has 'Tensor', returns an aggregated 'Tensor'.
    Otherwise returns an aggregated 'IndexedSlices'. A single-element list is
    returned as that element, unchanged.
  """
  assert gradients, "No gradients to aggregate"

  if len(gradients) == 1:
    return gradients[0]

  only_dense = all(isinstance(g, ops.Tensor) for g in gradients)
  if only_dense:
    return gen_math_ops.add_n(gradients)

  supported_types = (ops.Tensor, indexed_slices.IndexedSlices)
  assert all(isinstance(g, supported_types) for g in gradients)
  return backprop_util.AggregateIndexedSlicesGradients(gradients)

604 

605 

def _num_elements(grad):
  """The number of elements in the `grad` tensor.

  Returns 0 when the shape, or any dimension of it, is unknown (None). For
  IndexedSlices the shape of `grad.values` is used.

  Raises:
    ValueError: if `grad` is neither a Tensor nor an IndexedSlices.
  """
  if isinstance(grad, ops.Tensor):
    shaped = grad
  elif isinstance(grad, indexed_slices.IndexedSlices):
    shaped = grad.values
  else:
    raise ValueError("`grad` not a Tensor or IndexedSlices.")

  dims = shaped._shape_tuple()  # pylint: disable=protected-access
  if dims is None or None in dims:
    return 0
  return functools.reduce(operator.mul, dims, 1)

617 

618 

def _fast_fill(value, shape, dtype):
  """Returns `array_ops.fill` of an int32 `shape` constant with `value`."""
  dims = constant_op.constant(shape, dtype=dtypes.int32)
  fill_value = constant_op.constant(value, dtype=dtype)
  return array_ops.fill(dims, fill_value)

623 

624 

def _zeros(shape, dtype):
  """Helper to return (possibly cached) zero tensors in eager mode.

  Returns None for string and resource dtypes. Outside eager execution the
  result comes straight from `array_ops.zeros`; in eager mode results are
  cached in the context's zeros cache, keyed by shape, dtype and device name.
  """
  # Note: variants will use _zeros_like
  if dtype == dtypes.string or dtype == dtypes.resource:
    return None

  ctx = context.context()
  if not ctx.executing_eagerly():
    return array_ops.zeros(shape, dtype)

  device = ctx.device_name

  # For TF-typed shapes, `.ref()` is used as the key instead of the shape.
  shape_key = shape.ref() if tensor_util.is_tf_type(shape) else shape
  cache_key = (shape_key, dtype, device)

  cached = ctx.zeros_cache().get(cache_key)
  if cached is not None:
    return cached

  fill_value = False if dtypes.as_dtype(dtype).is_bool else 0
  fresh = _fast_fill(fill_value, shape, dtype)
  ctx.zeros_cache().put(cache_key, fresh)
  return fresh

651 

652 

def _ones(shape, dtype):
  """Returns a tensor of ones (True for bool dtypes), or None for strings.

  Outside eager execution the result comes from `array_ops.ones`. In eager
  mode a scalar shape `()` yields a plain constant and any other shape is
  built with `_fast_fill`.
  """
  as_dtype = dtypes.as_dtype(dtype)
  if as_dtype == dtypes.string:
    return None

  if not context.executing_eagerly():
    return array_ops.ones(shape, dtype)

  one = True if as_dtype.is_bool else 1

  if shape == ():  # pylint: disable=g-explicit-bool-comparison
    return constant_op.constant(one, dtype=dtype)
  return _fast_fill(one, shape, dtype)

669 

670 

# Bundle this module's helpers into a VSpace and register it with the
# pywrap_tfe bindings.
_default_vspace = imperative_grad.VSpace(
    num_elements_fn=_num_elements,
    aggregate_fn=_aggregate_grads,
    zeros_fn=_zeros,
    ones_fn=_ones,
    zeros_like_fn=default_gradient.zeros_like,
    ones_like_fn=default_gradient.ones_like,
    graph_shape_fn=gen_array_ops.shape)
pywrap_tfe.TFE_Py_RegisterVSpace(_default_vspace)

680 

681 

def _handle_or_self(x):
  """Returns `x.handle` for resource variables, and `x` itself otherwise."""
  return x.handle if resource_variable_ops.is_resource_variable(x) else x

687 

688 

def _extract_tensors_and_variables(tensor):
  """Yields the tensors and variables found in the (possibly nested) input.

  The input is flattened with `nest.flatten`. Composite tensors are broken
  into their components via their type spec and searched recursively.

  Raises:
    ValueError: on any leaf that is not a tensor, variable or CompositeTensor.
  """
  for obj in nest.flatten(tensor):
    if _pywrap_utils.IsTensor(obj) or _pywrap_utils.IsVariable(obj):
      yield obj
      continue
    if not isinstance(obj, composite_tensor.CompositeTensor):
      raise ValueError(f"Passed in object {obj} of type {type(obj).__name__!r}"
                       f", not tf.Tensor or tf.Variable or ExtensionType.")
    spec = type_spec.type_spec_from_value(obj)
    components = spec._to_components(obj)  # pylint: disable=protected-access
    yield from _extract_tensors_and_variables(components)

700 

701 

702@tf_export("GradientTape", "autodiff.GradientTape", v1=["GradientTape"]) 

703class GradientTape: 

704 """Record operations for automatic differentiation. 

705 

706 Operations are recorded if they are executed within this context manager and 

707 at least one of their inputs is being "watched". 

708 

709 Trainable variables (created by `tf.Variable` or `tf.compat.v1.get_variable`, 

710 where `trainable=True` is default in both cases) are automatically watched. 

711 Tensors can be manually watched by invoking the `watch` method on this context 

712 manager. 

713 

714 For example, consider the function `y = x * x`. The gradient at `x = 3.0` can 

715 be computed as: 

716 

717 >>> x = tf.constant(3.0) 

718 >>> with tf.GradientTape() as g: 

719 ... g.watch(x) 

720 ... y = x * x 

721 >>> dy_dx = g.gradient(y, x) 

722 >>> print(dy_dx) 

723 tf.Tensor(6.0, shape=(), dtype=float32) 

724 

725 GradientTapes can be nested to compute higher-order derivatives. For example, 

726 

727 >>> x = tf.constant(5.0) 

728 >>> with tf.GradientTape() as g: 

729 ... g.watch(x) 

730 ... with tf.GradientTape() as gg: 

731 ... gg.watch(x) 

732 ... y = x * x 

733 ... dy_dx = gg.gradient(y, x) # dy_dx = 2 * x 

734 >>> d2y_dx2 = g.gradient(dy_dx, x) # d2y_dx2 = 2 

735 >>> print(dy_dx) 

736 tf.Tensor(10.0, shape=(), dtype=float32) 

737 >>> print(d2y_dx2) 

738 tf.Tensor(2.0, shape=(), dtype=float32) 

739 

740 By default, the resources held by a GradientTape are released as soon as 

741 GradientTape.gradient() method is called. To compute multiple gradients over 

742 the same computation, create a persistent gradient tape. This allows multiple 

743 calls to the gradient() method as resources are released when the tape object 

744 is garbage collected. For example: 

745 

746 >>> x = tf.constant(3.0) 

747 >>> with tf.GradientTape(persistent=True) as g: 

748 ... g.watch(x) 

749 ... y = x * x 

750 ... z = y * y 

751 >>> dz_dx = g.gradient(z, x) # (4*x^3 at x = 3) 

752 >>> print(dz_dx) 

753 tf.Tensor(108.0, shape=(), dtype=float32) 

754 >>> dy_dx = g.gradient(y, x) 

755 >>> print(dy_dx) 

756 tf.Tensor(6.0, shape=(), dtype=float32) 

757 

758 By default GradientTape will automatically watch any trainable variables that 

759 are accessed inside the context. If you want fine grained control over which 

760 variables are watched you can disable automatic tracking by passing 

761 `watch_accessed_variables=False` to the tape constructor: 

762 

763 >>> x = tf.Variable(2.0) 

764 >>> w = tf.Variable(5.0) 

765 >>> with tf.GradientTape( 

766 ... watch_accessed_variables=False, persistent=True) as tape: 

767 ... tape.watch(x) 

768 ... y = x ** 2 # Gradients will be available for `x`. 

769 ... z = w ** 3 # No gradients will be available as `w` isn't being watched. 

770 >>> dy_dx = tape.gradient(y, x) 

771 >>> print(dy_dx) 

772 tf.Tensor(4.0, shape=(), dtype=float32) 

773 >>> # No gradients will be available as `w` isn't being watched. 

774 >>> dz_dw = tape.gradient(z, w) 

775 >>> print(dz_dw) 

776 None 

777 

778 Note that when using models you should ensure that your variables exist when 

779 using `watch_accessed_variables=False`. Otherwise it's quite easy to make your 

780 first iteration not have any gradients: 

781 

782 ```python 

783 a = tf.keras.layers.Dense(32) 

784 b = tf.keras.layers.Dense(32) 

785 

786 with tf.GradientTape(watch_accessed_variables=False) as tape: 

787 tape.watch(a.variables) # Since `a.build` has not been called at this point 

788 # `a.variables` will return an empty list and the 

789 # tape will not be watching anything. 

790 result = b(a(inputs)) 

791 tape.gradient(result, a.variables) # The result of this computation will be 

792 # a list of `None`s since a's variables 

793 # are not being watched. 

794 ``` 

795 

796 Note that only tensors with real or complex dtypes are differentiable. 

797 """ 

798 

799 def __init__(self, persistent=False, watch_accessed_variables=True): 

800 """Creates a new GradientTape. 

801 

802 Args: 

803 persistent: Boolean controlling whether a persistent gradient tape 

804 is created. False by default, which means at most one call can 

805 be made to the gradient() method on this object. 

806 watch_accessed_variables: Boolean controlling whether the tape will 

807 automatically `watch` any (trainable) variables accessed while the tape 

808 is active. Defaults to True meaning gradients can be requested from any 

809 result computed in the tape derived from reading a trainable `Variable`. 

810 If False users must explicitly `watch` any `Variable`s they want to 

811 request gradients from. 

812 """ 

813 self._tape = None 

814 self._persistent = persistent 

815 self._watch_accessed_variables = watch_accessed_variables 

816 self._watched_variables = () 

817 self._recording = False 

818 

819 def __enter__(self): 

820 """Enters a context inside which operations are recorded on this tape.""" 

821 self._push_tape() 

822 return self 

823 

824 def __exit__(self, typ, value, traceback): 

825 """Exits the recording context, no further operations are traced.""" 

826 if self._recording: 

827 self._pop_tape() 

828 

def _push_tape(self):
  """Puts this tape on the tape stack and marks it as recording.

  The first call creates the underlying tape; later calls push the existing
  one back.

  Raises:
    ValueError: if the tape is already recording.
  """
  if self._recording:
    raise ValueError("Tape is still recording, This can happen if you try to "
                     "re-enter an already-active tape.")
  existing_tape = self._tape
  if existing_tape is not None:
    # Resuming: reuse the tape object created earlier.
    tape.push_tape(existing_tape)
  else:
    # First activation (or first one after `reset`): build a fresh tape with
    # the options given at construction time.
    self._tape = tape.push_new_tape(
        persistent=self._persistent,
        watch_accessed_variables=self._watch_accessed_variables)
  self._recording = True

841 

def _pop_tape(self):
  """Pops this tape via `tape.pop_tape` and marks it as not recording.

  Raises:
    ValueError: if the tape is not currently recording.
  """
  if self._recording:
    tape.pop_tape(self._tape)
    self._recording = False
    return
  raise ValueError("Tape is not recording.")

847 

@tf_contextlib.contextmanager
def _ensure_recording(self):
  """Context manager guaranteeing this tape records inside the `with` body.

  If the tape is already recording nothing changes. Otherwise the tape is
  pushed for the duration of the body and popped again afterwards, even when
  the body raises.

  Yields:
    None
  """
  if self._recording:
    # Already active: leave the recording state exactly as we found it.
    yield
    return
  try:
    self._push_tape()
    yield
  finally:
    self._pop_tape()

859 

860 # TODO(b/209081027): Add a variable in composite tensor test case after 

861 # variables become composite tensors. 

def watch(self, tensor):
  """Makes sure `tensor` is traced by this tape.

  Args:
    tensor: a Tensor/Variable or list of Tensors/Variables.

  Raises:
    ValueError: if it encounters something that is not a tensor (presumably
      raised by `_extract_tensors_and_variables`; this method does not raise
      it directly).
  """
  for candidate in _extract_tensors_and_variables(tensor):
    if not backprop_util.IsTrainable(candidate):
      # Not an error: the tensor is still watched, but warn (at most five
      # times) that its dtype is not considered trainable.
      logging.log_first_n(
          logging.WARN, "The dtype of the watched tensor must be "
          "floating (e.g. tf.float32), got %r", 5, candidate.dtype)
    # There are many variable-like objects, all of them currently have a
    # `handle` attribute that points to a tensor. If this changes, internals
    # of watch_variable need to change as well.
    register = (
        tape.watch_variable if hasattr(candidate, "handle") else tape.watch)
    register(self._tape, candidate)

883 

@tf_contextlib.contextmanager
def stop_recording(self):
  """Suspends recording on this tape for the duration of a `with` block.

  Operations executed while this context manager is active will not be
  recorded on the tape. This is useful for reducing the memory used by tracing
  all computations.

  For example:

  >>> x = tf.constant(4.0)
  >>> with tf.GradientTape() as tape:
  ...   with tape.stop_recording():
  ...     y = x ** 2
  >>> dy_dx = tape.gradient(y, x)
  >>> print(dy_dx)
  None

  Yields:
    None
  Raises:
    RuntimeError: if this object currently has no underlying tape.
    ValueError: propagated from `_pop_tape` if an underlying tape exists but
      is not recording at the moment.
  """
  if self._tape is None:
    raise RuntimeError(
        "Trying to stop recording a tape which is not recording.")
  # Take the tape off the stack for the body of the `with` block ...
  self._pop_tape()
  try:
    yield
  finally:
    # ... and always resume recording afterwards, even if the body raised.
    self._push_tape()

915 

def reset(self):
  """Discards everything recorded so far and keeps recording on a new tape.

  This behaves like leaving the tape's context manager and entering it again
  with a brand new tape. The two snippets below are equivalent:

  ```
  with tf.GradientTape() as t:
    loss = loss_fn()
  with tf.GradientTape() as t:
    loss += other_loss_fn()
  t.gradient(loss, ...)  # Only differentiates other_loss_fn, not loss_fn


  # The following is equivalent to the above
  with tf.GradientTape() as t:
    loss = loss_fn()
    t.reset()
    loss += other_loss_fn()
  t.gradient(loss, ...)  # Only differentiates other_loss_fn, not loss_fn
  ```

  It helps when exiting the context manager is undesirable, or impossible
  because the reset point sits inside a control flow construct:

  ```
  with tf.GradientTape() as t:
    loss = ...
    if loss > k:
      t.reset()
  ```

  Raises:
    ValueError: propagated from `_pop_tape` if the tape is not recording.
  """
  self._pop_tape()
  # Dropping the reference makes `_push_tape` build a fresh tape instead of
  # re-pushing the old one.
  self._tape = None
  self._push_tape()

951 

def watched_variables(self):
  """Returns variables watched by this tape in order of construction.

  While an underlying tape exists its current answer is fetched and cached;
  once it is gone the cached value is returned.
  """
  live_tape = self._tape
  if live_tape is None:
    return self._watched_variables
  self._watched_variables = live_tape.watched_variables()
  return self._watched_variables

957 

def gradient(self,
             target,
             sources,
             output_gradients=None,
             unconnected_gradients=UnconnectedGradients.NONE):
  """Differentiates `target` w.r.t. `sources` using this tape's recording.

  Note: Unless you set `persistent=True` a GradientTape can only be used to
  compute one set of gradients (or jacobians).

  In addition to Tensors, gradient also supports RaggedTensors. For example,

  >>> x = tf.ragged.constant([[1.0, 2.0], [3.0]])
  >>> with tf.GradientTape() as g:
  ...   g.watch(x)
  ...   y = x * x
  >>> g.gradient(y, x)
  <tf.RaggedTensor [[2.0, 4.0], [6.0]]>

  When called while the tape is still recording, a non-persistent tape stops
  recording first; a persistent tape keeps recording (so the gradient ops are
  traced too) and a warning is logged once.

  Args:
    target: a list or nested structure of Tensors or Variables or
      CompositeTensors to be differentiated.
    sources: a list or nested structure of Tensors or Variables or
      CompositeTensors. `target` will be differentiated against elements in
      `sources`.
    output_gradients: a list of gradients, one for each differentiable
      element of target. Defaults to None.
    unconnected_gradients: a value which can either hold 'none' or 'zero' and
      alters the value which will be returned if the target and sources are
      unconnected. The possible values and effects are detailed in
      'UnconnectedGradients' and it defaults to 'none'.

  Returns:
    a list or nested structure of Tensors (or IndexedSlices, or None, or
    CompositeTensor), one for each element in `sources`. Returned structure
    is the same as the structure of `sources`.

  Raises:
    RuntimeError: If called on a used, non-persistent tape.
    TypeError: If the target is a None object.
    ValueError: If a source is a packed EagerTensor. Other ValueErrors (for
      example for an unknown `unconnected_gradients` value) are presumably
      raised by the helpers called here rather than by this method itself.
  """
  if self._tape is None:
    raise RuntimeError("A non-persistent GradientTape can only be used to "
                       "compute one set of gradients (or jacobians)")
  if self._recording:
    if self._persistent:
      logging.log_first_n(
          logging.WARN, "Calling GradientTape.gradient on a persistent "
          "tape inside its context is significantly less "
          "efficient than calling it outside the context (it "
          "causes the gradient ops to be recorded on the "
          "tape, leading to increased CPU and memory usage). "
          "Only call GradientTape.gradient inside the "
          "context if you actually want to trace the "
          "gradient in order to compute higher order "
          "derivatives.", 1)
    else:
      # A non-persistent tape is single use, so stop recording right away.
      self._pop_tape()

  if target is None:
    raise TypeError("Argument `target` should be a list or nested structure"
                    " of Tensors, Variables or CompositeTensors to be "
                    "differentiated, but received None.")

  # Flatten the targets, swapping variable-like objects for their handles and
  # expanding composite tensors into their gradient components.
  flat_targets = composite_tensor_gradient.get_flat_tensors_for_gradients(
      [_handle_or_self(item) for item in nest.flatten(target)])
  for flat_target in flat_targets:
    if not backprop_util.IsTrainable(flat_target):
      logging.vlog(
          1, "The dtype of the target tensor must be "
          "floating (e.g. tf.float32) when calling GradientTape.gradient, "
          "got %r", flat_target.dtype)

  # Same treatment for the sources; the raw flattening is kept because it is
  # needed again when the result is packed.
  flat_sources_raw = nest.flatten(sources)
  flat_sources = composite_tensor_gradient.get_flat_tensors_for_gradients(
      [_handle_or_self(item) for item in flat_sources_raw])
  for flat_source in flat_sources:
    if not backprop_util.IsTrainable(flat_source):
      logging.vlog(
          1, "The dtype of the source tensor must be "
          "floating (e.g. tf.float32) when calling GradientTape.gradient, "
          "got %r", flat_source.dtype)
    if getattr(flat_source, "is_packed", False):
      raise ValueError(
          "GradientTape.gradient is not supported on packed EagerTensors yet."
      )

  if output_gradients is not None:
    as_tensors = variable_utils.convert_variables_to_tensors(output_gradients)
    flat_output_grads = (
        composite_tensor_gradient.get_flat_tensors_for_gradients(
            nest.flatten(as_tensors)))
    output_gradients = [
        None if grad is None else ops.convert_to_tensor(grad)
        for grad in flat_output_grads
    ]

  flat_grad = imperative_grad.imperative_grad(
      self._tape,
      flat_targets,
      flat_sources,
      output_gradients=output_gradients,
      sources_raw=flat_sources_raw,
      unconnected_gradients=unconnected_gradients)

  if not self._persistent:
    # Keep track of watched variables before setting tape to None
    self._watched_variables = self._tape.watched_variables()
    self._tape = None

  # Rebuild composite results and restore the caller's nesting of `sources`.
  flat_sources_raw = nest.map_structure(_handle_or_self, flat_sources_raw)
  flat_grad = composite_tensor_gradient.replace_flat_tensors_for_gradients(
      flat_sources_raw, flat_grad)
  return nest.pack_sequence_as(sources, flat_grad)

1081 

def jacobian(self,
             target,
             sources,
             unconnected_gradients=UnconnectedGradients.NONE,
             parallel_iterations=None,
             experimental_use_pfor=True):
  """Computes the jacobian of `target` w.r.t. `sources` from this tape.

  Note: Unless you set `persistent=True` a GradientTape can only be used to
  compute one set of gradients (or jacobians).

  Note: By default the jacobian implementation uses parallel for (pfor), which
  creates a tf.function under the hood for each jacobian call. For better
  performance, and to avoid recompilation and vectorization rewrites on each
  call, enclose GradientTape code in @tf.function.

  See [wikipedia
  article](http://en.wikipedia.org/wiki/jacobian_matrix_and_determinant)
  for the definition of a Jacobian.

  Example usage:

  ```python
  with tf.GradientTape() as g:
    x  = tf.constant([1.0, 2.0])
    g.watch(x)
    y = x * x
  jacobian = g.jacobian(y, x)
  # jacobian value is [[2., 0.], [0., 4.]]
  ```

  Args:
    target: Tensor to be differentiated.
    sources: a list or nested structure of Tensors or Variables. `target`
      will be differentiated against elements in `sources`.
    unconnected_gradients: a value which can either hold 'none' or 'zero' and
      alters the value which will be returned if the target and sources are
      unconnected. The possible values and effects are detailed in
      'UnconnectedGradients' and it defaults to 'none'.
    parallel_iterations: A knob to control how many iterations are dispatched
      in parallel. This knob can be used to control the total memory usage.
    experimental_use_pfor: If true, vectorizes the jacobian computation. Else
      falls back to a sequential while_loop. Vectorization can sometimes fail
      or lead to excessive memory usage. This option can be used to disable
      vectorization in such cases.

  Returns:
    A list or nested structure of Tensors (or None), one for each element in
    `sources`. Returned structure is the same as the structure of `sources`.
    Note if any gradient is sparse (IndexedSlices), jacobian function
    currently makes it dense and returns a Tensor instead. This may change in
    the future.

  Raises:
    RuntimeError: If called on a used, non-persistent tape.
    RuntimeError: If called on a non-persistent tape with eager execution
      enabled and without enabling experimental_use_pfor.
    ValueError: If vectorization of jacobian computation fails.
  """
  if self._tape is None:
    raise RuntimeError("A non-persistent GradientTape can only be used to "
                       "compute one set of gradients (or jacobians)")

  flat_sources = nest.flatten(sources)
  # Remember both the static and the dynamic shape of the original target;
  # they are needed to un-flatten the result at the end.
  target_static_shape = target.shape
  target_shape = array_ops.shape(target)
  # Note that we push and pop the tape here and below. This is needed since we
  # need gradients through the enclosed operations.
  with self._ensure_recording():
    target = array_ops.reshape(target, [-1])

  def grad_of_element(i):
    """Gradient of the i-th entry of the flattened target w.r.t. sources."""
    with self._ensure_recording():
      element = array_ops.gather(target, i)
    return self.gradient(element, flat_sources,
                         unconnected_gradients=unconnected_gradients)

  # Use the static element count when it is known; `int(None)` raises
  # TypeError, in which case fall back to the dynamic size.
  try:
    target_size = int(target.shape[0])
  except TypeError:
    target_size = array_ops.shape(target)[0]

  if not experimental_use_pfor:
    if context.executing_eagerly() and not self._persistent:
      raise RuntimeError(
          "GradientTape must be created with persistent=True"
          " to compute the jacobian with eager execution enabled and with "
          " experimental_use_pfor set to False.")
    output = pfor_ops.for_loop(
        grad_of_element, [target.dtype] * len(flat_sources), target_size,
        parallel_iterations=parallel_iterations)
  else:
    try:
      output = pfor_ops.pfor(grad_of_element, target_size,
                             parallel_iterations=parallel_iterations)
    except ValueError as err:
      raise ValueError(
          "Encountered an exception while vectorizing the "
          "jacobian computation. Vectorization can be disabled by setting"
          " experimental_use_pfor to False.") from err

  for idx, jac in enumerate(output):
    if jac is not None:
      # Replace the flattened leading axis with the original target shape.
      full_shape = array_ops.concat(
          [target_shape, array_ops.shape(jac)[1:]], axis=0)
      jac = array_ops.reshape(jac, full_shape)
      if context.executing_eagerly():
        jac.set_shape(
            target_static_shape.concatenate(flat_sources[idx].shape))
    output[idx] = jac

  return nest.pack_sequence_as(sources, output)

1194 

def batch_jacobian(self,
                   target,
                   source,
                   unconnected_gradients=UnconnectedGradients.NONE,
                   parallel_iterations=None,
                   experimental_use_pfor=True):
  """Computes and stacks per-example jacobians.

  See [wikipedia article](http://en.wikipedia.org/wiki/jacobian_matrix_and_determinant)
  for the definition of a Jacobian. This function is essentially an efficient
  implementation of the following:

  `tf.stack([self.jacobian(y[i], x[i]) for i in range(x.shape[0])])`.

  Note that compared to `GradientTape.jacobian` which computes gradient of
  each output value w.r.t each input value, this function is useful when
  `target[i,...]` is independent of `source[j,...]` for `j != i`. This
  assumption allows more efficient computation as compared to
  `GradientTape.jacobian`. The output, as well as intermediate activations,
  are lower dimensional and avoid a bunch of redundant zeros which would
  result in the jacobian computation given the independence assumption.

  Note: Unless you set `persistent=True` a GradientTape can only be used to
  compute one set of gradients (or jacobians).

  Note: By default the batch_jacobian implementation uses parallel for (pfor),
  which creates a tf.function under the hood for each batch_jacobian call.
  For better performance, and to avoid recompilation and vectorization
  rewrites on each call, enclose GradientTape code in @tf.function.


  Example usage:

  ```python
  with tf.GradientTape() as g:
    x = tf.constant([[1., 2.], [3., 4.]], dtype=tf.float32)
    g.watch(x)
    y = x * x
  batch_jacobian = g.batch_jacobian(y, x)
  # batch_jacobian is [[[2,  0], [0,  4]], [[6,  0], [0,  8]]]
  ```

  Args:
    target: A tensor with rank 2 or higher and with shape [b, y_1, ..., y_n].
      `target[i,...]` should only depend on `source[i,...]`.
    source: A tensor with rank 2 or higher and with shape [b, x_1, ..., x_m].
    unconnected_gradients: a value which can either hold 'none' or 'zero' and
      alters the value which will be returned if the target and sources are
      unconnected. The possible values and effects are detailed in
      'UnconnectedGradients' and it defaults to 'none'.
    parallel_iterations: A knob to control how many iterations are dispatched
      in parallel. This knob can be used to control the total memory usage.
    experimental_use_pfor: If true, uses pfor for computing the Jacobian. Else
      uses a tf.while_loop.

  Returns:
    A tensor `t` with shape [b, y_1, ..., y_n, x_1, ..., x_m] where
    `t[i, ...]` is the jacobian of `target[i, ...]` w.r.t. `source[i, ...]`,
    i.e. stacked per-example jacobians.

  Raises:
    RuntimeError: If called on a used, non-persistent tape.
    RuntimeError: If called on a non-persistent tape with eager execution
      enabled and without enabling experimental_use_pfor, or if the loop body
      runs more than once on a non-persistent tape.
    ValueError: If vectorization of jacobian computation fails or if first
      dimension of `target` and `source` do not match.
  """
  if self._tape is None:
    # The adjacent literals are concatenated, so the first one must end with
    # a space; otherwise the message reads "used tocompute".
    raise RuntimeError("A non-persistent GradientTape can only be used to "
                       "compute one set of gradients (or jacobians)")
  target_shape = target.shape
  if target_shape.rank is None:
    # Unknown rank: use an unknown dimension for the batch-size check.
    dim = tensor_shape.Dimension(None)
  else:
    dim = target_shape.dims[0]
  if not (target_shape.with_rank_at_least(2) and
          source.shape.with_rank_at_least(2) and
          dim.is_compatible_with(source.shape[0])):
    raise ValueError(
        "Need first dimension of target shape (%s) and "
        "source shape (%s) to match." % (target.shape, source.shape))
  if target_shape.is_fully_defined():
    # Static shape known: batch size and per-example size are Python ints.
    batch_size = int(target_shape[0])
    target_row_size = target_shape.num_elements() // batch_size
  else:
    # Otherwise compute them dynamically; `target_shape` becomes a tensor.
    target_shape = array_ops.shape(target)
    batch_size = target_shape[0]
    target_row_size = array_ops.size(target) // batch_size
  source_shape = array_ops.shape(source)
  # Flatten target to 2-D.
  # Note that we push and pop the tape here and below. This is needed since we
  # need gradients through the enclosed operations.
  with self._ensure_recording():
    with ops.control_dependencies(
        [check_ops.assert_equal(batch_size, source_shape[0])]):
      target = array_ops.reshape(target, [batch_size, target_row_size])

  run_once = False

  def loop_fn(i):
    """Gradient of column `i` of the flattened target w.r.t. `source`."""
    nonlocal run_once
    # `self.gradient` consumes a non-persistent tape, so a second invocation
    # of the loop body cannot succeed; fail with a targeted message instead.
    if run_once and not self._persistent:
      if parallel_iterations is not None:
        raise RuntimeError(
            "GradientTape must be created with persistent=True"
            " to compute the batch_jacobian with parallel_iterations.")
      else:
        raise RuntimeError(
            "GradientTape must be created with persistent=True"
            " to compute the batch_jacobian.")
    run_once = True

    with self._ensure_recording():
      y = array_ops.gather(target, i, axis=1)
    return self.gradient(y, source,
                         unconnected_gradients=unconnected_gradients)

  if experimental_use_pfor:
    try:
      output = pfor_ops.pfor(loop_fn, target_row_size,
                             parallel_iterations=parallel_iterations)
    except ValueError as err:
      raise ValueError(
          "Encountered an exception while vectorizing the "
          "batch_jacobian computation. Vectorization can be disabled by "
          "setting experimental_use_pfor to False.") from err
  else:
    if context.executing_eagerly() and not self._persistent:
      raise RuntimeError(
          "GradientTape must be created with persistent=True"
          " to compute the batch_jacobian with eager execution enabled and "
          " with experimental_use_pfor set to False.")
    output = pfor_ops.for_loop(loop_fn, target.dtype, target_row_size,
                               parallel_iterations=parallel_iterations)
  new_shape = array_ops.concat([target_shape, source_shape[1:]], axis=0)
  if output is None:
    # Note that this block is returning zeros when it could use `None` to
    # represent unconnected gradients. This is to maintain compatibility with
    # the previous behavior, which ignored `unconnected_gradients`.
    output = array_ops.zeros(new_shape, target.dtype)
    return output
  else:
    # One loop iteration per target column, so the loop axis comes first;
    # swap it with the batch axis before restoring the full shape.
    output = array_ops.reshape(output,
                               [target_row_size, batch_size, -1])
    output = array_ops.transpose(output, [1, 0, 2])

    output = array_ops.reshape(output, new_shape)
    return output