Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/ops/nn_grad.py: 33%
400 statements
coverage.py v7.4.0, created at 2024-01-03 07:57 +0000
1# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Gradients for operators defined in nn_ops.py."""
17import functools
18import itertools
19import operator
21from tensorflow.python.eager import backprop
22from tensorflow.python.framework import dtypes
23from tensorflow.python.framework import ops
24from tensorflow.python.ops import array_ops
25from tensorflow.python.ops import array_ops_stack
26from tensorflow.python.ops import gen_nn_ops
27from tensorflow.python.ops import math_ops
28from tensorflow.python.ops import nn_ops
31@ops.RegisterGradient("Conv2DBackpropInput")
32def _Conv2DBackpropInputGrad(op, grad):
33 """The derivatives for deconvolution.
35 Args:
36 op: the Deconvolution op.
37 grad: the tensor representing the gradient w.r.t. the output.
39 Returns:
40 The gradients w.r.t. the input and the filter.
41 """
42 # We call the gen_nn_ops backprop functions instead of nn_ops backprop
43 # functions for performance reasons in Eager mode. See _Conv2DGrad.
44 return [
45 None,
46 gen_nn_ops.conv2d_backprop_filter(
47 grad,
48 array_ops.shape(op.inputs[1]),
49 op.inputs[2],
50 dilations=op.get_attr("dilations"),
51 strides=op.get_attr("strides"),
52 padding=op.get_attr("padding"),
53 explicit_paddings=op.get_attr("explicit_paddings"),
54 use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
55 data_format=op.get_attr("data_format").decode()),
56 gen_nn_ops.conv2d(
57 grad,
58 op.inputs[1],
59 dilations=op.get_attr("dilations"),
60 strides=op.get_attr("strides"),
61 padding=op.get_attr("padding"),
62 explicit_paddings=op.get_attr("explicit_paddings"),
63 use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
64 data_format=op.get_attr("data_format").decode())
65 ]
68@ops.RegisterGradient("Conv2DBackpropFilter")
69def _Conv2DBackpropFilterGrad(op, grad):
70 # We call the gen_nn_ops backprop functions instead of nn_ops backprop
71 # functions for performance reasons in Eager mode. See _Conv2DGrad.
72 return [
73 gen_nn_ops.conv2d_backprop_input(
74 array_ops.shape(op.inputs[0]),
75 grad,
76 op.inputs[2],
77 dilations=op.get_attr("dilations"),
78 strides=op.get_attr("strides"),
79 padding=op.get_attr("padding"),
80 explicit_paddings=op.get_attr("explicit_paddings"),
81 use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
82 data_format=op.get_attr("data_format").decode()), None,
83 gen_nn_ops.conv2d(
84 op.inputs[0],
85 grad,
86 dilations=op.get_attr("dilations"),
87 strides=op.get_attr("strides"),
88 padding=op.get_attr("padding"),
89 explicit_paddings=op.get_attr("explicit_paddings"),
90 use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"),
91 data_format=op.get_attr("data_format").decode())
92 ]
95@ops.RegisterGradient("DepthwiseConv2dNativeBackpropInput")
96def _DepthwiseConv2dNativeBackpropInputGrad(op, grad):
97 """The derivatives for deconvolution.
99 Args:
100 op: the Deconvolution op.
101 grad: the tensor representing the gradient w.r.t. the output.
103 Returns:
104 The gradients w.r.t. the input and the filter.
105 """
106 return [
107 None,
108 gen_nn_ops.depthwise_conv2d_native_backprop_filter(
109 grad,
110 array_ops.shape(op.inputs[1]),
111 op.inputs[2],
112 dilations=op.get_attr("dilations"),
113 strides=op.get_attr("strides"),
114 padding=op.get_attr("padding"),
115 explicit_paddings=op.get_attr("explicit_paddings"),
116 data_format=op.get_attr("data_format")),
117 gen_nn_ops.depthwise_conv2d_native(
118 grad,
119 op.inputs[1],
120 dilations=op.get_attr("dilations"),
121 strides=op.get_attr("strides"),
122 padding=op.get_attr("padding"),
123 explicit_paddings=op.get_attr("explicit_paddings"),
124 data_format=op.get_attr("data_format"))
125 ]
128@ops.RegisterGradient("DepthwiseConv2dNativeBackpropFilter")
129def _DepthwiseConv2dNativeBackpropFilterGrad(op, grad):
130 return [
131 gen_nn_ops.depthwise_conv2d_native_backprop_input(
132 array_ops.shape(op.inputs[0]),
133 grad,
134 op.inputs[2],
135 dilations=op.get_attr("dilations"),
136 strides=op.get_attr("strides"),
137 padding=op.get_attr("padding"),
138 explicit_paddings=op.get_attr("explicit_paddings"),
139 data_format=op.get_attr("data_format")), None,
140 gen_nn_ops.depthwise_conv2d_native(
141 op.inputs[0],
142 grad,
143 dilations=op.get_attr("dilations"),
144 strides=op.get_attr("strides"),
145 padding=op.get_attr("padding"),
146 explicit_paddings=op.get_attr("explicit_paddings"),
147 data_format=op.get_attr("data_format"))
148 ]
151@ops.RegisterGradient("Conv3D")
152def _Conv3DGrad(op, grad):
153 data_format = op.get_attr("data_format").decode()
154 shape_0, shape_1 = array_ops.shape_n([op.inputs[0], op.inputs[1]])
155 return [
156 nn_ops.conv3d_backprop_input_v2(
157 shape_0,
158 op.inputs[1],
159 grad,
160 dilations=op.get_attr("dilations"),
161 strides=op.get_attr("strides"),
162 padding=op.get_attr("padding"),
163 data_format=data_format),
164 nn_ops.conv3d_backprop_filter_v2(
165 op.inputs[0],
166 shape_1,
167 grad,
168 dilations=op.get_attr("dilations"),
169 strides=op.get_attr("strides"),
170 padding=op.get_attr("padding"),
171 data_format=data_format)
172 ]
175@ops.RegisterGradient("Conv3DBackpropInputV2")
176def _Conv3DBackpropInputGrad(op, grad):
177 data_format = op.get_attr("data_format").decode()
178 return [
179 None,
180 nn_ops.conv3d_backprop_filter_v2(
181 grad,
182 array_ops.shape(op.inputs[1]),
183 op.inputs[2],
184 dilations=op.get_attr("dilations"),
185 strides=op.get_attr("strides"),
186 padding=op.get_attr("padding"),
187 data_format=data_format),
188 nn_ops.conv3d(
189 grad,
190 op.inputs[1],
191 dilations=op.get_attr("dilations"),
192 strides=op.get_attr("strides"),
193 padding=op.get_attr("padding"),
194 data_format=data_format)
195 ]
198@ops.RegisterGradient("Conv3DBackpropFilterV2")
199def _Conv3DBackpropFilterGrad(op, grad):
200 data_format = op.get_attr("data_format").decode()
201 return [
202 nn_ops.conv3d_backprop_input_v2(
203 array_ops.shape(op.inputs[0]),
204 grad,
205 op.inputs[2],
206 dilations=op.get_attr("dilations"),
207 strides=op.get_attr("strides"),
208 padding=op.get_attr("padding"),
209 data_format=data_format), None,
210 nn_ops.conv3d(
211 op.inputs[0],
212 grad,
213 dilations=op.get_attr("dilations"),
214 strides=op.get_attr("strides"),
215 padding=op.get_attr("padding"),
216 data_format=data_format)
217 ]
220@ops.RegisterGradient("AvgPool3D")
221def _AvgPool3DGrad(op, grad):
222 return gen_nn_ops.avg_pool3d_grad(
223 array_ops.shape(op.inputs[0]),
224 grad,
225 ksize=op.get_attr("ksize"),
226 strides=op.get_attr("strides"),
227 padding=op.get_attr("padding"),
228 data_format=op.get_attr("data_format").decode())
231@ops.RegisterGradient("AvgPool3DGrad")
232def _AvgPool3DGradGrad(op, grad):
233 return (array_ops.stop_gradient(op.inputs[0]),
234 gen_nn_ops.avg_pool3d(
235 grad,
236 op.get_attr("ksize"),
237 op.get_attr("strides"),
238 op.get_attr("padding"),
239 data_format=op.get_attr("data_format").decode()))
242@ops.RegisterGradient("MaxPool3D")
243def _MaxPool3DGrad(op, grad):
244 return gen_nn_ops.max_pool3d_grad(
245 op.inputs[0],
246 op.outputs[0],
247 grad,
248 ksize=op.get_attr("ksize"),
249 strides=op.get_attr("strides"),
250 padding=op.get_attr("padding"),
251 data_format=op.get_attr("data_format").decode())
254@ops.RegisterGradient("MaxPool3DGrad")
255def _MaxPool3DGradGrad(op, grad):
256 return (array_ops.zeros_like(op.inputs[0]),
257 array_ops.zeros_like(op.inputs[1]),
258 gen_nn_ops.max_pool3d_grad_grad(
259 op.inputs[0],
260 op.inputs[1],
261 grad,
262 op.get_attr("ksize"),
263 op.get_attr("strides"),
264 padding=op.get_attr("padding"),
265 data_format=op.get_attr("data_format").decode()))
268@ops.RegisterGradient("MaxPool3DGradGrad")
269def _MaxPool3DGradGradGrad(op, grad):
270 return (array_ops.zeros_like(op.inputs[0]),
271 array_ops.zeros_like(op.inputs[1]),
272 gen_nn_ops.max_pool3d_grad(
273 op.inputs[0],
274 op.inputs[1],
275 grad,
276 op.get_attr("ksize"),
277 op.get_attr("strides"),
278 padding=op.get_attr("padding"),
279 data_format=op.get_attr("data_format").decode()))
282@ops.RegisterGradient("Softmax")
283def _SoftmaxGrad(op, grad_softmax):
284 """The derivative of the softmax nonlinearity.
286 We assume that probs is of shape [batch_size, dim].
287 The formula is dsoftmax / dx = diag(softmax) - softmax * softmax'.
288 This matrix is diagonal minus a rank one matrix, so it is easy to implement
289 as follows:
291 grad_x = grad_softmax * softmax - sum(grad_softmax * softmax) * softmax
293 Args:
294 op: the Softmax op.
295 grad_softmax: the tensor representing the gradient w.r.t. the softmax
296 output.
298 Returns:
299 The gradient w.r.t. the input to the softmax.
301 """
302 softmax = op.outputs[0]
303 sum_channels = math_ops.reduce_sum(grad_softmax * softmax, -1, keepdims=True)
304 return (grad_softmax - sum_channels) * softmax
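# Illustrative sketch (not part of nn_grad.py): a quick eager-mode check that
# the closed form above matches autodiff. Assumes TensorFlow 2.x and only
# public tf.* APIs; the tensor values are arbitrary.
import tensorflow as tf

_x = tf.constant([[1.0, 2.0, 3.0]])
_dy = tf.constant([[0.1, -0.2, 0.3]])  # upstream gradient w.r.t. the softmax output
with tf.GradientTape() as _tape:
  _tape.watch(_x)
  _sm = tf.nn.softmax(_x)
_autodiff = _tape.gradient(_sm, _x, output_gradients=_dy)
_manual = (_dy - tf.reduce_sum(_dy * _sm, -1, keepdims=True)) * _sm
# _autodiff and _manual agree to float32 precision.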
307@ops.RegisterGradient("LogSoftmax")
308def _LogSoftmaxGrad(op, grad):
309 """The gradient for log_softmax.
311 log_softmax = input - log(sum(exp(input)))
312 dlog_softmax / dinput = I - softmax(input)
314 Args:
315 op: The log softmax op.
316 grad: The tensor representing the gradient w.r.t. the output.
318 Returns:
319 The gradients w.r.t. the input.
320 """
321 softmax = math_ops.exp(op.outputs[0])
322 return grad - math_ops.reduce_sum(grad, -1, keepdims=True) * softmax
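# Illustrative sketch (not part of nn_grad.py): the per-row Jacobian of
# log_softmax is I - ones * softmax(x)', which is what the VJP above encodes.
# Assumes TensorFlow 2.x eager execution; the input values are arbitrary.
import tensorflow as tf

_x = tf.constant([0.5, -1.0, 2.0])
with tf.GradientTape() as _tape:
  _tape.watch(_x)
  _ls = tf.nn.log_softmax(_x)
_jac = _tape.jacobian(_ls, _x)             # shape [3, 3]
_expected = tf.eye(3) - tf.nn.softmax(_x)  # row i: delta_ij - softmax_j
# _jac and _expected agree to float32 precision.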
325@ops.RegisterGradient("BiasAdd")
326def _BiasAddGrad(op, received_grad):
327 """Return the gradients for the 2 inputs of bias_op.
329 The first input of bias_op is the tensor t, and its gradient is
330 just the gradient that bias_op received.
332 The second input of bias_op is the bias vector, which has one fewer
333 dimension than "received_grad" (the batch dimension). Its gradient is the
334 received gradient summed over the batch dimension, which is the first dimension.
336 Args:
337 op: The BiasOp for which we need to generate gradients.
338 received_grad: Tensor. The gradients passed to the BiasOp.
340 Returns:
341 Two tensors, the first one for the "tensor" input of the BiasOp,
342 the second one for the "bias" input of the BiasOp.
343 """
344 try:
345 data_format = op.get_attr("data_format")
346 except ValueError:
347 data_format = None
348 return (received_grad,
349 gen_nn_ops.bias_add_grad(
350 out_backprop=received_grad, data_format=data_format))
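# Illustrative sketch (not part of nn_grad.py): for an NHWC input, the bias
# gradient produced above is the upstream gradient summed over every axis
# except the channel axis. Assumes TensorFlow 2.x; shapes are arbitrary.
import tensorflow as tf

_x = tf.random.normal([2, 4, 4, 3])   # NHWC
_b = tf.zeros([3])
_dy = tf.random.normal([2, 4, 4, 3])  # upstream gradient
with tf.GradientTape() as _tape:
  _tape.watch(_x)
  _tape.watch(_b)
  _y = tf.nn.bias_add(_x, _b)
_dx, _db = _tape.gradient(_y, [_x, _b], output_gradients=_dy)
# _dx is _dy unchanged; _db equals tf.reduce_sum(_dy, axis=[0, 1, 2]).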
353@ops.RegisterGradient("BiasAddGrad")
354def _BiasAddGradGrad(op, received_grad):
355 """Gradient for the BiasAddGrad op.
357 Args:
358 op: BiasAddGrad op for which we are calculating gradients.
359 received_grad: The gradients passed to the BiasAddGrad op.
361 Returns:
362 A single gradient Tensor for the input to BiasAddGrad (which
363 is the gradient of the bias term in BiasAdd).
364 """
366 try:
367 data_format = op.get_attr("data_format")
368 except ValueError:
369 data_format = None
371 shape = array_ops.shape(op.inputs[0])
372 bias_shape = array_ops.shape(received_grad)
374 if data_format == b"NCHW":
375 expanded_shape = array_ops.concat([
376 array_ops.ones_like(shape[:1]), bias_shape,
377 array_ops.ones_like(shape[2:])
378 ], 0)
379 tile_mults = array_ops.concat([shape[:1], [1], shape[2:]], 0)
380 else:
381 expanded_shape = array_ops.concat(
382 [array_ops.ones_like(shape[:-1]), bias_shape], 0)
383 tile_mults = array_ops.concat([shape[:-1], [1]], 0)
385 expanded_grad = array_ops.reshape(received_grad, expanded_shape)
386 return array_ops.tile(expanded_grad, tile_mults)
389@ops.RegisterGradient("BiasAddV1")
390def _BiasAddGradV1(unused_bias_op, received_grad):
391 """Return the gradients for the 2 inputs of bias_op.
393 The first input of unused_bias_op is the tensor t, and its gradient is
394 just the gradient that unused_bias_op received.
396 The second input of unused_bias_op is the bias vector, which has one fewer
397 dimension than "received_grad" (the batch dimension). Its gradient is the
398 received gradient summed over the batch dimension, which is the first dimension.
400 Args:
401 unused_bias_op: The BiasOp for which we need to generate gradients.
402 received_grad: Tensor. The gradients passed to the BiasOp.
404 Returns:
405 Two tensors, the first one for the "tensor" input of the BiasOp,
406 the second one for the "bias" input of the BiasOp.
407 """
408 reduction_dim_tensor = math_ops.range(array_ops.rank(received_grad) - 1)
409 return (received_grad, math_ops.reduce_sum(received_grad,
410 reduction_dim_tensor))
413@ops.RegisterGradient("Relu")
414def _ReluGrad(op, grad):
415 return gen_nn_ops.relu_grad(grad, op.outputs[0])
418@ops.RegisterGradient("EluGrad")
419def _EluGradGrad(op, grad):
420 elu_x = op.inputs[1]
421 return (gen_nn_ops.elu_grad(grad, elu_x),
422 array_ops.where(
423 elu_x < 0, grad * op.inputs[0], array_ops.zeros_like(elu_x)))
426@ops.RegisterGradient("SeluGrad")
427def _SeluGradGrad(op, grad):
428 selu_x = op.inputs[1]
429 return (gen_nn_ops.selu_grad(grad, selu_x),
430 array_ops.where(
431 selu_x < 0., grad * op.inputs[0], array_ops.zeros_like(selu_x)))
434@ops.RegisterGradient("Relu6")
435def _Relu6Grad(op, grad):
436 return gen_nn_ops.relu6_grad(grad, op.outputs[0])
439@ops.RegisterGradient("Relu6Grad")
440def _Relu6GradGrad(op, grad):
441 x = op.inputs[1]
442 return (gen_nn_ops.relu6_grad(grad, x), array_ops.zeros_like(x))
445@ops.RegisterGradient("LeakyRelu")
446def _LeakyReluGrad(op, grad):
447 x = op.inputs[0]
448 alpha = op.get_attr("alpha")
449 return gen_nn_ops.leaky_relu_grad(grad, x, alpha=alpha)
452@ops.RegisterGradient("LeakyReluGrad")
453def _LeakyReluGradGrad(op, grad):
454 x = op.inputs[1]
455 alpha = op.get_attr("alpha")
456 return (gen_nn_ops.leaky_relu_grad(grad, x,
457 alpha=alpha), array_ops.zeros_like(x))
460@ops.RegisterGradient("Elu")
461def _EluGrad(op, grad):
462 return gen_nn_ops.elu_grad(grad, op.outputs[0])
465@ops.RegisterGradient("Selu")
466def _SeluGrad(op, grad):
467 return gen_nn_ops.selu_grad(grad, op.outputs[0])
470@ops.RegisterGradient("Softplus")
471def _SoftplusGrad(op, grad):
472 return grad * math_ops.sigmoid(op.inputs[0])
475@ops.RegisterGradient("SoftplusGrad")
476def _SoftplusGradGrad(op, grad):
477 # Let:
478 # y = tf.nn.softplus(x)
479 # dx = gen_nn_ops.softplus_grad(dy, x) = dy / (1 + exp(-x))
480 # This op computes (ddy, d2x) from op.inputs == [dy, x] and grad == ddx.
481 dy, x = op.inputs
482 with ops.control_dependencies([grad]):
483 ddy = gen_nn_ops.softplus_grad(grad, x)
484 d2x = grad * dy / (math_ops.exp(-x) + 2.0 + math_ops.exp(x))
485 return (ddy, d2x)
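# Illustrative sketch (not part of nn_grad.py): the denominator above is the
# reciprocal of softplus''(x), since 1 / (exp(-x) + 2 + exp(x)) equals
# sigmoid(x) * (1 - sigmoid(x)). Assumes TensorFlow 2.x; x values are arbitrary.
import tensorflow as tf

_x = tf.constant([-2.0, 0.0, 3.0])
_second = 1.0 / (tf.exp(-_x) + 2.0 + tf.exp(_x))
_sig = tf.math.sigmoid(_x)
# _second equals _sig * (1.0 - _sig), the second derivative of softplus.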
488@ops.RegisterGradient("Softsign")
489def _SoftsignGrad(op, grad):
490 return gen_nn_ops.softsign_grad(grad, op.inputs[0])
493@ops.RegisterGradient("ReluGrad")
494def _ReluGradGrad(op, grad):
495 x = op.inputs[1]
496 return (gen_nn_ops.relu_grad(grad, x), array_ops.zeros_like(x))
499def _BroadcastMul(vec, mat):
500 """Multiply after broadcasting vec to match dimensions of mat.
502 Args:
503 vec: A 1-D tensor of dimension [D0]
504 mat: A 2-D tensor of dimension [D0, D1]
506 Returns:
507 A tensor of dimension [D0, D1], the result of vec * mat
508 """
509 # Reshape vec to [D0, 1]
510 vec = array_ops.expand_dims(vec, -1)
511 return vec * mat
514@ops.RegisterGradient("SoftmaxCrossEntropyWithLogits")
515def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
516 """Gradient function for SoftmaxCrossEntropyWithLogits."""
517 # grad_loss is the backprop for cost, and we multiply it with the gradients
518 # (which are stored in op.outputs[1]).
519 # grad_grad is the backprop for softmax gradient.
520 #
521 # Second derivative is just softmax derivative w.r.t. logits.
522 softmax_grad = op.outputs[1]
523 grad = _BroadcastMul(grad_loss, softmax_grad)
525 logits = op.inputs[0]
526 if (grad_grad is not None and
527 not getattr(grad_grad, "_is_zeros_tensor", False)):
528 softmax = nn_ops.softmax(logits)
530 grad += ((grad_grad - array_ops.squeeze(
531 math_ops.matmul(
532 array_ops.expand_dims(grad_grad, 1),
533 array_ops.expand_dims(softmax, 2)),
534 axis=1)) * softmax)
536 return grad, _BroadcastMul(grad_loss, -nn_ops.log_softmax(logits)) # pylint: disable=invalid-unary-operand-type
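# Illustrative sketch (not part of nn_grad.py): op.outputs[1] of the forward op
# is softmax(logits) - labels, so with an upstream loss gradient of 1 the
# logits gradient is exactly that difference. Assumes TensorFlow 2.x.
import tensorflow as tf

_logits = tf.constant([[2.0, 0.5, -1.0]])
_labels = tf.constant([[1.0, 0.0, 0.0]])
with tf.GradientTape() as _tape:
  _tape.watch(_logits)
  _loss = tf.nn.softmax_cross_entropy_with_logits(labels=_labels, logits=_logits)
_dlogits = _tape.gradient(_loss, _logits)
# _dlogits equals tf.nn.softmax(_logits) - _labels.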
539@ops.RegisterGradient("SparseSoftmaxCrossEntropyWithLogits")
540def _SparseSoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
541 """Gradient function for SparseSoftmaxCrossEntropyWithLogits."""
542 # grad_loss is the backprop for cost, and we multiply it with the gradients
543 # (which are stored in op.outputs[1]).
544 # grad_grad is the backprop for softmax gradient.
545 # There is no gradient for the labels
546 #
547 # Second derivative is just softmax derivative w.r.t. logits.
548 softmax_grad = op.outputs[1]
549 grad = _BroadcastMul(grad_loss, softmax_grad)
551 logits = op.inputs[0]
552 if (grad_grad is not None and
553 not getattr(grad_grad, "_is_zeros_tensor", False)):
554 softmax = nn_ops.softmax(logits)
556 grad += ((grad_grad - array_ops.squeeze(
557 math_ops.matmul(
558 array_ops.expand_dims(grad_grad, 1),
559 array_ops.expand_dims(softmax, 2)),
560 axis=1)) * softmax)
562 return grad, None
565@ops.RegisterGradient("Conv2D")
566def _Conv2DGrad(op, grad):
567 """Gradient function for Conv2D."""
568 dilations = op.get_attr("dilations")
569 strides = op.get_attr("strides")
570 padding = op.get_attr("padding")
571 explicit_paddings = op.get_attr("explicit_paddings")
572 use_cudnn_on_gpu = op.get_attr("use_cudnn_on_gpu")
573 data_format = op.get_attr("data_format")
574 shape_0, shape_1 = array_ops.shape_n([op.inputs[0], op.inputs[1]])
576 # We call the gen_nn_ops backprop functions instead of nn_ops backprop
577 # functions for performance reasons in Eager mode. gen_nn_ops functions take a
578 # `explicit_paddings` parameter, but nn_ops functions do not. So if we were
579 # to use the nn_ops functions, we would have to convert `padding` and
580 # `explicit_paddings` into a single `padding` parameter, increasing overhead
581 # in Eager mode.
582 return [
583 gen_nn_ops.conv2d_backprop_input(
584 shape_0,
585 op.inputs[1],
586 grad,
587 dilations=dilations,
588 strides=strides,
589 padding=padding,
590 explicit_paddings=explicit_paddings,
591 use_cudnn_on_gpu=use_cudnn_on_gpu,
592 data_format=data_format),
593 gen_nn_ops.conv2d_backprop_filter(
594 op.inputs[0],
595 shape_1,
596 grad,
597 dilations=dilations,
598 strides=strides,
599 padding=padding,
600 explicit_paddings=explicit_paddings,
601 use_cudnn_on_gpu=use_cudnn_on_gpu,
602 data_format=data_format)
603 ]
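# Illustrative sketch (not part of nn_grad.py): this registered gradient is
# what tf.GradientTape invokes for a forward conv2d; the shapes and 'SAME'
# padding below are arbitrary. Assumes TensorFlow 2.x eager execution.
import tensorflow as tf

_images = tf.random.normal([1, 8, 8, 3])   # NHWC input
_filters = tf.random.normal([3, 3, 3, 4])  # HWIO filter
with tf.GradientTape() as _tape:
  _tape.watch(_images)
  _tape.watch(_filters)
  _out = tf.nn.conv2d(_images, _filters, strides=1, padding="SAME")
_d_images, _d_filters = _tape.gradient(_out, [_images, _filters])
# _d_images has the input's shape and _d_filters the filter's shape, matching
# the conv2d_backprop_input / conv2d_backprop_filter pair returned above.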
606@ops.RegisterGradient("DepthwiseConv2dNative")
607def _DepthwiseConv2dNativeGrad(op, grad):
608 return [
609 gen_nn_ops.depthwise_conv2d_native_backprop_input(
610 array_ops.shape(op.inputs[0]),
611 op.inputs[1],
612 grad,
613 dilations=op.get_attr("dilations"),
614 strides=op.get_attr("strides"),
615 padding=op.get_attr("padding"),
616 explicit_paddings=op.get_attr("explicit_paddings"),
617 data_format=op.get_attr("data_format")),
618 gen_nn_ops.depthwise_conv2d_native_backprop_filter(
619 op.inputs[0],
620 array_ops.shape(op.inputs[1]),
621 grad,
622 dilations=op.get_attr("dilations"),
623 strides=op.get_attr("strides"),
624 padding=op.get_attr("padding"),
625 explicit_paddings=op.get_attr("explicit_paddings"),
626 data_format=op.get_attr("data_format"))
627 ]
630@ops.RegisterGradient("Dilation2D")
631def _Dilation2DGrad(op, grad):
632 return [
633 nn_ops.dilation2d_backprop_input(op.inputs[0], op.inputs[1], grad,
634 op.get_attr("strides"),
635 op.get_attr("rates"),
636 op.get_attr("padding")),
637 nn_ops.dilation2d_backprop_filter(op.inputs[0], op.inputs[1], grad,
638 op.get_attr("strides"),
639 op.get_attr("rates"),
640 op.get_attr("padding"))
641 ]
644@ops.RegisterGradient("LRN")
645def _LRNGrad(op, grad):
646 depth_radius = op.get_attr("depth_radius")
647 bias = op.get_attr("bias")
648 alpha = op.get_attr("alpha")
649 beta = op.get_attr("beta")
650 return [
651 gen_nn_ops.lrn_grad(grad, op.inputs[0], op.outputs[0], depth_radius, bias,
652 alpha, beta)
653 ]
656@ops.RegisterGradient("AvgPool")
657def _AvgPoolGrad(op, grad):
658 return gen_nn_ops.avg_pool_grad(
659 array_ops.shape(op.inputs[0]),
660 grad,
661 op.get_attr("ksize"),
662 op.get_attr("strides"),
663 op.get_attr("padding"),
664 data_format=op.get_attr("data_format"))
667@ops.RegisterGradient("AvgPoolGrad")
668def _AvgPoolGradGrad(op, grad):
669 return (array_ops.stop_gradient(op.inputs[0]),
670 gen_nn_ops.avg_pool(
671 grad,
672 op.get_attr("ksize"),
673 op.get_attr("strides"),
674 op.get_attr("padding"),
675 data_format=op.get_attr("data_format")))
678@ops.RegisterGradient("MaxPool")
679def _MaxPoolGrad(op, grad):
680 return gen_nn_ops.max_pool_grad(
681 op.inputs[0],
682 op.outputs[0],
683 grad,
684 op.get_attr("ksize"),
685 op.get_attr("strides"),
686 padding=op.get_attr("padding"),
687 explicit_paddings=op.get_attr("explicit_paddings"),
688 data_format=op.get_attr("data_format"))
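# Illustrative sketch (not part of nn_grad.py): max_pool_grad routes each
# upstream gradient element back to the argmax position of its pooling window
# and writes zeros elsewhere. Assumes TensorFlow 2.x; the input is arbitrary.
import tensorflow as tf

_x = tf.constant([[[[1.0], [5.0]],
                   [[2.0], [3.0]]]])  # shape [1, 2, 2, 1], NHWC
with tf.GradientTape() as _tape:
  _tape.watch(_x)
  _pooled = tf.nn.max_pool2d(_x, ksize=2, strides=2, padding="VALID")
_dx = _tape.gradient(_pooled, _x)
# _dx is 1.0 at the position of the 5.0 (the window max) and 0.0 elsewhere.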
691@ops.RegisterGradient("MaxPoolV2")
692def _MaxPoolGradV2(op, grad):
693 ksize = op.inputs[1]
694 strides = op.inputs[2]
695 return gen_nn_ops.max_pool_grad_v2(
696 op.inputs[0],
697 op.outputs[0],
698 grad,
699 ksize,
700 strides,
701 padding=op.get_attr("padding"),
702 data_format=op.get_attr("data_format")), None, None
705@ops.RegisterGradient("MaxPoolWithArgmax")
706def _MaxPoolGradWithArgmax(op, grad, unused_argmax_grad):
707 del unused_argmax_grad
708 return gen_nn_ops.max_pool_grad_with_argmax(
709 op.inputs[0],
710 grad,
711 op.outputs[1],
712 op.get_attr("ksize"),
713 op.get_attr("strides"),
714 padding=op.get_attr("padding"),
715 include_batch_in_index=op.get_attr("include_batch_in_index"))
718@ops.RegisterGradient("MaxPoolGrad")
719def _MaxPoolGradGrad(op, grad):
720 return (array_ops.zeros_like(op.inputs[0]),
721 array_ops.zeros_like(op.inputs[1]),
722 gen_nn_ops.max_pool_grad_grad(
723 op.inputs[0],
724 op.inputs[1],
725 grad,
726 op.get_attr("ksize"),
727 op.get_attr("strides"),
728 padding=op.get_attr("padding"),
729 data_format=op.get_attr("data_format")))
732@ops.RegisterGradient("MaxPoolGradV2")
733def _MaxPoolGradGradV2(op, grad):
734 ksize = op.inputs[3]
735 strides = op.inputs[4]
736 return (array_ops.zeros_like(op.inputs[0]),
737 array_ops.zeros_like(op.inputs[1]),
738 gen_nn_ops.max_pool_grad_grad_v2(
739 op.inputs[0],
740 op.inputs[1],
741 grad,
742 ksize,
743 strides,
744 padding=op.get_attr("padding"),
745 data_format=op.get_attr("data_format")), None, None)
748@ops.RegisterGradient("MaxPoolGradGrad")
749def _MaxPoolGradGradGrad(op, grad):
750 return (array_ops.zeros_like(op.inputs[0]),
751 array_ops.zeros_like(op.inputs[1]),
752 gen_nn_ops.max_pool_grad(
753 op.inputs[0],
754 op.inputs[1],
755 grad,
756 op.get_attr("ksize"),
757 op.get_attr("strides"),
758 padding=op.get_attr("padding"),
759 data_format=op.get_attr("data_format")))
762@ops.RegisterGradient("FractionalMaxPool")
763def _FractionalMaxPoolGrad(op, grad_0, unused_grad_1, unused_grad_2):
764 """Returns gradient for FractionalMaxPool.
766 Since FractionalMaxPool has three outputs, three gradients are passed in, one
767 for each output. Only the first one is useful; the other two gradients
768 are empty.
770 Args:
771 op: The FractionalMaxPoolOp.
772 grad_0: Gradient with respect to op.outputs[0]
773 unused_grad_1: Gradient with respect to op.outputs[1]/row_seq. It is empty.
774 unused_grad_2: Gradient with respect to op.outputs[2]/col_seq. It is empty.
776 Returns:
777 Input backprop for FractionalMaxPool op.
778 """
779 return gen_nn_ops.fractional_max_pool_grad(
780 op.inputs[0], op.outputs[0], grad_0, op.outputs[1], op.outputs[2],
781 op.get_attr("overlapping"))
784@ops.RegisterGradient("FractionalAvgPool")
785def _FractionalAvgPoolGrad(op, grad_0, unused_grad_1, unused_grad_2):
786 """Returns gradient for FractionalAvgPool.
788 Since FractionalAvgPool has three outputs, three gradients are passed in, one
789 for each output. Only the first one is useful; the other two gradients
790 are empty.
792 Args:
793 op: The FractionalAvgPoolOp.
794 grad_0: Gradient with respect to op.outputs[0]
795 unused_grad_1: Gradient with respect to op.outputs[1]/row_seq. It is empty.
796 unused_grad_2: Gradient with respect to op.outputs[2]/col_seq. It is empty.
798 Returns:
799 Input backprop for FractionalAvgPool op.
800 """
801 return gen_nn_ops.fractional_avg_pool_grad(op.inputs[0].get_shape(), grad_0,
802 op.outputs[1], op.outputs[2],
803 op.get_attr("overlapping"))
806@ops.RegisterGradient("BatchNormWithGlobalNormalization")
807def _BatchNormWithGlobalNormalizationGrad(op, grad):
808 """Return the gradients for the 5 inputs of BatchNormWithGlobalNormalization.
810 We do not backprop anything for the mean and var intentionally as they are
811 not being trained with backprop in the operation.
813 Args:
814 op: The BatchNormOp for which we need to generate gradients.
815 grad: Tensor. The gradients passed to the BatchNormOp.
817 Returns:
818 dx: Backprop for input, which is (grad * (g * rsqrt(v + epsilon)))
819 dm: Backprop for mean, which is
820 sum_over_rest(grad * g) * (-1 / rsqrt(v + epsilon))
821 dv: Backprop for variance, which is
822 sum_over_rest(grad * g * (x - m)) * (-1/2) * (v + epsilon) ^ (-3/2)
823 db: Backprop for beta, which is grad reduced in all except the
824 last dimension.
825 dg: Backprop for gamma, which is (grad * ((x - m) * rsqrt(v + epsilon)))
826 """
827 dx, dm, dv, db, dg = gen_nn_ops.batch_norm_with_global_normalization_grad(
828 op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[4], grad,
829 op.get_attr("variance_epsilon"), op.get_attr("scale_after_normalization"))
830 return dx, dm, dv, db, dg
833def _BaseFusedBatchNormGrad(op, version, *grad):
834 """Return the gradients for the 3 inputs of BatchNorm.
836 Args:
837 op: The BatchNormOp for which we need to compute gradients.
838 version: Integer indicating which version to use of the fused batch
839 norm gradient.
840 *grad: An argument list for tensors of gradients wrt the outputs
841 with grad[0] as grad_y.
843 Returns:
844 grad_x: gradient for x, which is scale * rsqrt(variance + epsilon) *
845 [grad_y - mean(grad_y) - (x - mean(x)) *
846 mean(grad_y * (x - mean(x))) / (variance + epsilon)]
847 in training mode; grad_y * scale * rsqrt(pop_variance + epsilon)
848 in freeze mode.
850 grad_scale: gradient for scale, which is sum(grad_y * (x - mean(x)) *
851 rsqrt(variance + epsilon)) in training mode;
852 sum(grad_y * (x - pop_mean) * rsqrt(pop_variance + epsilon))
853 in freeze mode.
855 grad_offset: gradient for offset, which is sum(grad_y) in training mode;
856 sum(grad_y) in freeze mode.
857 """
858 x = op.inputs[0]
859 grad_y = grad[0]
860 scale = op.inputs[1]
861 epsilon = op.get_attr("epsilon")
862 data_format = op.get_attr("data_format")
863 is_training = op.get_attr("is_training")
864 if version == 2:
865 grad_fun = gen_nn_ops.fused_batch_norm_grad_v3
866 elif version == 1:
867 grad_fun = gen_nn_ops.fused_batch_norm_grad_v2
868 else:
869 grad_fun = gen_nn_ops.fused_batch_norm_grad
870 if is_training:
871 args = {
872 "y_backprop": grad_y,
873 "x": x,
874 "scale": scale,
875 "reserve_space_1": op.outputs[3],
876 "reserve_space_2": op.outputs[4],
877 "epsilon": epsilon,
878 "data_format": data_format,
879 "is_training": is_training
880 }
881 if version == 2:
882 args["reserve_space_3"] = op.outputs[5]
883 dx, dscale, doffset, _, _ = grad_fun(**args)
884 else:
885 pop_mean = op.inputs[3]
886 pop_var = op.inputs[4]
887 if data_format == b"NCHW":
888 x = array_ops.transpose(x, [0, 2, 3, 1])
889 grad_y = array_ops.transpose(grad_y, [0, 2, 3, 1])
890 elif data_format == b"NCDHW":
891 x = array_ops.transpose(x, [0, 2, 3, 4, 1])
892 grad_y = array_ops.transpose(grad_y, [0, 2, 3, 4, 1])
893 target_data_format = ("NHWC" if data_format in (b"NCHW",
894 b"NHWC") else "NDHWC")
895 args = {
896 "y_backprop": grad_y,
897 "x": x,
898 "scale": scale,
899 "reserve_space_1": pop_mean,
900 "reserve_space_2": pop_var,
901 "epsilon": epsilon,
902 "data_format": target_data_format,
903 "is_training": is_training
904 }
905 if version == 2:
906 args["reserve_space_3"] = op.outputs[5]
907 dx, dscale, doffset, _, _ = grad_fun(**args)
908 if data_format == b"NCHW":
909 dx = array_ops.transpose(dx, [0, 3, 1, 2])
910 elif data_format == b"NCDHW":
911 dx = array_ops.transpose(dx, [0, 4, 1, 2, 3])
912 return dx, dscale, doffset, None, None
915@ops.RegisterGradient("FusedBatchNorm")
916def _FusedBatchNormGrad(op, *grad):
917 return _BaseFusedBatchNormGrad(op, 0, *grad)
920@ops.RegisterGradient("FusedBatchNormV2")
921def _FusedBatchNormV2Grad(op, *grad):
922 return _BaseFusedBatchNormGrad(op, 1, *grad)
925@ops.RegisterGradient("FusedBatchNormV3")
926def _FusedBatchNormV3Grad(op, *grad):
927 return _BaseFusedBatchNormGrad(op, 2, *grad)
930def _BatchNormGrad(grad_y,
931 x,
932 scale,
933 pop_mean,
934 pop_var,
935 epsilon,
936 data_format,
937 is_training=True):
938 """Returns the gradients for the 3 inputs of BatchNorm.
940 Args:
941 grad_y: A `Tensor` of 4 or 5 dimensions for gradient for y.
942 x: A `Tensor` of 4 or 5 dimensions for x.
943 scale: A `Tensor` of 1 dimension for scaling.
944 pop_mean: A `Tensor` of 1 dimension for the population mean. Only used when
945 is_training=False.
946 pop_var: A `Tensor` of 1 dimension for the population variance. Only used
947 when is_training=False.
948 epsilon: A small float number added to the variance of x.
949 data_format: The data format for input. One of b"NHWC", b"NCHW", b"NDHWC", or b"NCDHW".
950 is_training: A bool value to indicate the operation is for training
951 (default) or inference.
953 Returns:
954 A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
955 for x, grad_scale the gradient for scale, and grad_offset the gradient
956 for offset.
957 """
958 x_dtype = x.dtype.base_dtype
959 if x_dtype == dtypes.float16 or x_dtype == dtypes.bfloat16:
960 # float16 math is too imprecise, so we do the batch norm gradient
961 # computations in float32.
962 x = math_ops.cast(x, dtypes.float32)
963 grad_y = math_ops.cast(grad_y, dtypes.float32)
964 if is_training:
965 if data_format == b"NHWC":
966 keepdims = False
967 reduce_axis = [0, 1, 2]
968 elif data_format == b"NDHWC":
969 keepdims = False
970 reduce_axis = [0, 1, 2, 3]
971 elif data_format == b"NCHW":
972 keepdims = True
973 reduce_axis = [0, 2, 3]
974 shape = [1, array_ops.size(scale), 1, 1]
975 scale = array_ops.reshape(scale, shape)
976 else:
977 keepdims = True
978 reduce_axis = [0, 2, 3, 4]
979 shape = [1, array_ops.size(scale), 1, 1, 1]
980 scale = array_ops.reshape(scale, shape)
981 mean_grad_y = math_ops.reduce_mean(grad_y, reduce_axis, keepdims=keepdims)
982 mean_x = math_ops.reduce_mean(x, reduce_axis, keepdims=keepdims)
983 var_x = math_ops.reduce_mean(
984 math_ops.squared_difference(x, array_ops.stop_gradient(mean_x)),
985 reduce_axis,
986 keepdims=keepdims)
987 grad_y_offset = grad_y - mean_grad_y
988 x_offset = x - mean_x
989 mean = math_ops.reduce_mean(
990 grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
991 grad_x = scale * math_ops.rsqrt(var_x + epsilon) * (
992 grad_y_offset - math_ops.reciprocal(var_x + epsilon) * mean * x_offset)
993 grad_scale = math_ops.rsqrt(var_x + epsilon) * math_ops.reduce_sum(
994 grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
995 if data_format == b"NCHW" or data_format == b"NCDHW":
996 grad_scale = array_ops.squeeze(grad_scale)
997 grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
998 return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset
999 else:
1000 if data_format == b"NHWC":
1001 reduce_axis = [0, 1, 2]
1002 elif data_format == b"NDHWC":
1003 reduce_axis = [0, 1, 2, 3]
1004 elif data_format == b"NCHW":
1005 reduce_axis = [0, 2, 3]
1006 shape = [1, array_ops.size(pop_mean), 1, 1]
1007 pop_mean = array_ops.reshape(pop_mean, shape)
1008 pop_var = array_ops.reshape(pop_var, shape)
1009 scale = array_ops.reshape(scale, shape)
1010 else:
1011 reduce_axis = [0, 2, 3, 4]
1012 shape = [1, array_ops.size(pop_mean), 1, 1, 1]
1013 pop_mean = array_ops.reshape(pop_mean, shape)
1014 pop_var = array_ops.reshape(pop_var, shape)
1015 scale = array_ops.reshape(scale, shape)
1017 grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
1018 var_rsqrt = math_ops.rsqrt(pop_var + epsilon)
1019 grad_scale = math_ops.reduce_sum(
1020 grad_y * (x - pop_mean) * var_rsqrt, axis=reduce_axis)
1021 grad_x = grad_y * scale * var_rsqrt
1022 return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset
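# Illustrative sketch (not part of nn_grad.py): in the is_training=False branch
# above, grad_x reduces to grad_y * scale * rsqrt(pop_var + epsilon). A quick
# check with the public batch_normalization op (TensorFlow 2.x; values arbitrary):
import tensorflow as tf

_x = tf.random.normal([2, 4, 4, 3])   # NHWC
_mean = tf.constant([0.1, -0.2, 0.3])
_var = tf.constant([1.0, 2.0, 0.5])
_scale = tf.constant([1.5, 0.5, 2.0])
_offset = tf.zeros([3])
_eps = 1e-3
with tf.GradientTape() as _tape:
  _tape.watch(_x)
  _y = tf.nn.batch_normalization(_x, _mean, _var, _offset, _scale, _eps)
_dx = _tape.gradient(_y, _x)
# With the default all-ones upstream gradient, _dx equals
# tf.ones_like(_x) * _scale * tf.math.rsqrt(_var + _eps), broadcast over NHWC.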
1025@ops.RegisterGradient("FusedBatchNormGrad")
1026def _FusedBatchNormGradGrad(op, *grad):
1027 """Returns the gradients for the 3 inputs of FusedBatchNormGrad.
1029 Args:
1030 op: The FusedBatchNormGradOp for which we need to compute gradients.
1031 *grad: An argument list for tensors of gradients wrt the outputs with
1032 grad[0] as grad_grad_x, grad[1] as grad_grad_scale, grad[2] as
1033 grad_grad_offset.
1035 Returns:
1036 A tuple (grad_grad_y, grad_x, grad_scale, None, None), where grad_grad_y
1037 is the gradient for grad_y, grad_x the gradient for x, grad_scale the
1038 gradient for scale.
1039 """
1040 data_format = op.get_attr("data_format")
1041 epsilon = op.get_attr("epsilon")
1042 is_training = op.get_attr("is_training")
1043 grad_y = op.inputs[0]
1044 x = op.inputs[1]
1045 scale = op.inputs[2]
1046 pop_mean = op.inputs[3]
1047 pop_var = op.inputs[4]
1048 grad_grad_x = grad[0]
1049 grad_grad_scale = grad[1]
1050 grad_grad_offset = grad[2]
1051 with backprop.GradientTape() as tape:
1052 tape.watch(grad_y)
1053 tape.watch(x)
1054 tape.watch(scale)
1055 grad_x, grad_scale, grad_offset = _BatchNormGrad(
1056 grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training)
1057 grad_initial = [grad_grad_x, grad_grad_scale, grad_grad_offset]
1058 grad_grad_y, grad_x, grad_scale = tape.gradient(
1059 [grad_x, grad_scale, grad_offset], [grad_y, x, scale], grad_initial)
1060 return grad_grad_y, grad_x, grad_scale, None, None
1063@ops.RegisterGradient("FusedBatchNormGradV2")
1064def _FusedBatchNormGradGradV2(op, *grad):
1065 return _FusedBatchNormGradGrad(op, *grad)
1068@ops.RegisterGradient("FusedBatchNormGradV3")
1069def _FusedBatchNormGradGradV3(op, *grad):
1070 grad_grad_y, grad_x, grad_scale, _, _ = _FusedBatchNormGradGrad(op, *grad)
1071 return grad_grad_y, grad_x, grad_scale, None, None, None
1074@ops.RegisterGradient("L2Loss")
1075def _L2LossGrad(op, grad):
1076 """Return the gradients for L2Loss.
1078 Args:
1079 op: The L2LossOp for which we need to generate gradients.
1080 grad: Tensor containing a single number.
1082 Returns:
1083 The gradient, which is (x * grad).
1084 """
1085 return op.inputs[0] * grad
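# Illustrative sketch (not part of nn_grad.py): tf.nn.l2_loss(x) is
# sum(x ** 2) / 2, so its gradient w.r.t. x is x itself, scaled by the incoming
# scalar gradient exactly as returned above. Assumes TensorFlow 2.x.
import tensorflow as tf

_x = tf.constant([3.0, -4.0])
with tf.GradientTape() as _tape:
  _tape.watch(_x)
  _loss = tf.nn.l2_loss(_x)  # (9 + 16) / 2 = 12.5
_dx = _tape.gradient(_loss, _x)
# _dx equals _x, i.e. [3.0, -4.0].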
1088@ops.RegisterGradient("TopK")
1089@ops.RegisterGradient("TopKV2")
1090def _TopKGrad(op, grad, _):
1091 """Return the gradients for TopK.
1093 Args:
1094 op: The TopKOp for which we need to generate gradients.
1095 grad: Tensor. The gradients passed to the TopKOp.
1097 Returns:
1098 A list of two tensors, the first being the gradient w.r.t. the input of
1099 TopK, and the second being the gradient w.r.t. the indices (all zero).
1100 """
1101 in_shape = array_ops.shape(op.inputs[0])
1102 ind_shape = array_ops.shape(op.outputs[1])
1104 # int32 is not supported on GPU, hence the up-cast to int64.
1105 ind_lastdim = array_ops.gather(
1106 math_ops.cast(ind_shape, dtypes.int64),
1107 array_ops.size(ind_shape) - 1)
1108 # Flatten indices to 2D.
1109 ind_2d = array_ops.reshape(
1110 op.outputs[1], array_ops_stack.stack([-1, ind_lastdim]))
1112 in_lastdim = array_ops.gather(
1113 math_ops.cast(in_shape, dtypes.int64),
1114 array_ops.size(in_shape) - 1)
1115 outerdim = array_ops.shape(ind_2d)[0]
1116 # Compute linear indices (flattened to 1D).
1117 ind = array_ops.reshape(
1118 ind_2d + math_ops.cast(
1119 array_ops.expand_dims(
1120 math_ops.range(0,
1121 math_ops.cast(outerdim, dtypes.int64) * in_lastdim,
1122 in_lastdim), -1), dtypes.int32), [-1])
1124 # Substitute grad to appropriate locations and fill the rest with zeros,
1125 # finally reshaping it to the original input shape.
1126 return [
1127 array_ops.reshape(
1128 array_ops.scatter_nd(
1129 array_ops.expand_dims(ind, -1), array_ops.reshape(grad, [-1]),
1130 [math_ops.reduce_prod(in_shape)]), in_shape),
1131 array_ops.zeros([], dtype=dtypes.int32)
1132 ]
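# Illustrative sketch (not part of nn_grad.py): the scatter above sends the
# upstream gradient of each top-k value back to the position it came from and
# leaves zeros elsewhere. Assumes TensorFlow 2.x; the input is arbitrary.
import tensorflow as tf

_x = tf.constant([[1.0, 4.0, 2.0, 3.0]])
with tf.GradientTape() as _tape:
  _tape.watch(_x)
  _values, _indices = tf.math.top_k(_x, k=2)  # values [[4., 3.]], indices [[1, 3]]
_dx = _tape.gradient(_values, _x, output_gradients=tf.constant([[10.0, 20.0]]))
# _dx is [[0., 10., 0., 20.]]: gradients land only at the top-k positions.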
1135@ops.RegisterGradient("ApproxTopK")
1136def _ApproxTopKGradient(op, grad, _):
1137 """Return the gradients for ApproxTopK.
1139 Args:
1140 op: The ApproxTopK for which we need to generate gradients.
1141 grad: The gradients for backprop.
1143 Returns:
1144 Scattered gradient based on the top-k indices.
1145 """
1146 # The code below is to generate the correct index and value mapping for
1147 # scatter_nd to work properly.
1148 #
1149 # We use static evaluations as much as possible to reduce the runtime cost.
1150 # That is, we use operation.shape instead of array_ops.shape, and
1151 # functools.reduce(operator.mul, ...) instead of math_ops.reduce_prod.
1152 idx_shape = op.outputs[1].shape
1153 lifted_idx_shape = idx_shape + [1]
1154 flat_shape_len = functools.reduce(operator.mul, idx_shape)
1155 rank = idx_shape.rank
1156 reduction_dim = op.get_attr("reduction_dimension")
1157 if reduction_dim < 0:
1158 reduction_dim = rank + reduction_dim
1160 def GetLiftedIdx(d):
1161 if d == reduction_dim:
1162 return array_ops.reshape(op.outputs[1], lifted_idx_shape)
1163 iota_len = idx_shape[d]
1164 iota_shape = list(itertools.repeat(1, rank + 1))
1165 iota_shape[d] = iota_len
1166 iota = array_ops.reshape(math_ops.range(iota_len), iota_shape)
1167 return array_ops.broadcast_to(iota, lifted_idx_shape)
1169 lifted_idx = array_ops.concat(
1170 list(GetLiftedIdx(d) for d in range(rank)), axis=rank)
1171 flat_idx = array_ops.reshape(lifted_idx, [flat_shape_len, rank])
1172 flat_grad = array_ops.reshape(grad, [flat_shape_len])
1173 return array_ops.scatter_nd(flat_idx, flat_grad, op.inputs[0].shape)
1176@ops.RegisterGradient("NthElement")
1177def _NthElementGrad(op, grad):
1178 """Return the gradients for NthElement.
1180 Args:
1181 op: The NthElementOp for which we need to generate gradients.
1182 grad: Tensor. The gradients passed to the NthElementOp.
1184 Returns:
1185 A list of two tensors, the first being the gradient w.r.t. the input,
1186 the second being the gradient w.r.t. the N (None).
1187 """
1188 input = op.inputs[0] # pylint: disable=redefined-builtin
1189 output = op.outputs[0]
1191 # Compute the number of elements which equal to output in each reduction
1192 # dimension. If there are multiple elements then the gradient will be
1193 # divided between them.
1194 indicators = math_ops.cast(
1195 math_ops.equal(array_ops.expand_dims(output, -1), input), grad.dtype)
1197 grad = array_ops.expand_dims(grad, -1)
1198 num_selected = array_ops.expand_dims(math_ops.reduce_sum(indicators, -1), -1)
1200 return [math_ops.divide(indicators, num_selected) * grad, None]
1203def _MeanAggregator(inputs, segments):
1204 """Replaces each segment with its mean along the last axis.
1206 Specifically, each value in the `inputs` tensor gets replaced by the mean
1207 value computed from the values that belong to the same segment.
1209 Args:
1210 inputs: A 2-tensor. Aggregation is done over dimension 1.
1211 segments: A 2-tensor, same shape as `inputs`.
1213 Returns:
1214 The result, same shape and type as `inputs`.
1215 """
1216 result = []
1217 for inputs_i, segments_i in zip(
1218 array_ops.split(inputs, inputs.shape[0]),
1219 array_ops.split(segments, segments.shape[0])):
1220 # Note that we do not use tf.math.segment_mean, as it has no TPU support.
1221 means_i = math_ops.unsorted_segment_mean(
1222 inputs_i, segments_i, num_segments=math_ops.reduce_max(segments_i) + 1)
1223 result.append(
1224 array_ops.reshape(array_ops.gather(means_i, segments_i), [-1]))
1225 return array_ops_stack.stack(result, axis=0)
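# Illustrative sketch (not part of nn_grad.py): what one row of _MeanAggregator
# computes, using the same unsorted_segment_mean primitive. Assumes
# TensorFlow 2.x; the row values and segment ids are arbitrary.
import tensorflow as tf

_row = tf.constant([1.0, 2.0, 3.0, 10.0])
_segments = tf.constant([0, 0, 0, 1])
_means = tf.math.unsorted_segment_mean(
    _row, _segments, num_segments=tf.reduce_max(_segments) + 1)
_replaced = tf.gather(_means, _segments)
# _replaced is [2.0, 2.0, 2.0, 10.0]: each value becomes its segment's mean.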
1228 # We have to register the gradients for these ops so that TensorFlow will know
1229# how to differentiate them.
1230@ops.RegisterGradient("IsotonicRegression")
1231def _IsotonicRegressionGrad(op, grad_output, grad_segments):
1232 """Gradient for the isotonic regression function.
1234 Args:
1235 op: The IsotonicRegression tensorflow op.
1236 grad_output: Tensor of incoming gradients with respect to the output.
1237 grad_segments: Tensor of incoming gradients with respect to the segments.
1239 Returns:
1240 A tensor, same size as `grad_output` with the gradient with respect to
1241 the input.
1242 """
1243 del grad_segments # Discrete, non-differentiable.
1244 segments = op.outputs[1]
1245 return _MeanAggregator(grad_output, segments)