# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Implements the graph generation for computation of gradients."""

from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_grad  # pylint: disable=unused-import
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import check_ops  # pylint: disable=unused-import
from tensorflow.python.ops import control_flow_grad  # pylint: disable=unused-import
from tensorflow.python.ops import gradients_util
from tensorflow.python.ops import image_grad  # pylint: disable=unused-import
from tensorflow.python.ops import linalg_grad  # pylint: disable=unused-import
from tensorflow.python.ops import linalg_ops  # pylint: disable=unused-import
from tensorflow.python.ops import logging_ops  # pylint: disable=unused-import
from tensorflow.python.ops import manip_grad  # pylint: disable=unused-import
from tensorflow.python.ops import math_grad  # pylint: disable=unused-import
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import optional_grad  # pylint: disable=unused-import
from tensorflow.python.ops import random_grad  # pylint: disable=unused-import
from tensorflow.python.ops import tensor_array_ops
from tensorflow.python.ops import while_loop
from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients
from tensorflow.python.util.tf_export import tf_export


@tf_export(v1=["gradients"])
def gradients(ys,
              xs,
              grad_ys=None,
              name="gradients",
              colocate_gradients_with_ops=False,
              gate_gradients=False,
              aggregation_method=None,
              stop_gradients=None,
              unconnected_gradients=UnconnectedGradients.NONE):
  """Constructs symbolic derivatives of sum of `ys` w.r.t. x in `xs`.

  `ys` and `xs` are each a `Tensor` or a list of tensors. `grad_ys`
  is a list of `Tensor`, holding the gradients received by the
  `ys`. The list must be the same length as `ys`.

  `gradients()` adds ops to the graph to output the derivatives of `ys` with
  respect to `xs`. It returns a list of `Tensor` of length `len(xs)` where
  each tensor is the `sum(dy/dx)` for y in `ys` and for x in `xs`.

  `grad_ys` is a list of tensors of the same length as `ys` that holds
  the initial gradients for each y in `ys`. When `grad_ys` is None,
  we fill in a tensor of '1's of the shape of y for each y in `ys`. A
  user can provide their own initial `grad_ys` to compute the
  derivatives using a different initial gradient for each y (e.g., if
  one wanted to weight the gradient differently for each value in
  each y).
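
  For example, a minimal sketch of weighting each element of a `y` via
  `grad_ys` (the values here are purely illustrative, and a graph/session
  context is assumed):

  ```python
  x = tf.constant([3.0, 4.0])
  y = x * x  # dy/dx = 2 * x
  # Scale the incoming gradient for `y` by 0.5 per element.
  g = tf.gradients([y], [x], grad_ys=[tf.constant([0.5, 0.5])])
  sess.run(g)  # [array([3., 4.], dtype=float32)], i.e. 0.5 * 2 * x
  ```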

  `stop_gradients` is a `Tensor` or a list of tensors to be considered constant
  with respect to all `xs`. These tensors will not be backpropagated through,
  as though they had been explicitly disconnected using `stop_gradient`. Among
  other things, this allows computation of partial derivatives as opposed to
  total derivatives. For example:

  ```python
  a = tf.constant(0.)
  b = 2 * a
  g = tf.gradients(a + b, [a, b], stop_gradients=[a, b])
  ```

  Here the partial derivatives `g` evaluate to `[1.0, 1.0]`, compared to the
  total derivatives `tf.gradients(a + b, [a, b])`, which take into account the
  influence of `a` on `b` and evaluate to `[3.0, 1.0]`. Note that the above is
  equivalent to:

  ```python
  a = tf.stop_gradient(tf.constant(0.))
  b = tf.stop_gradient(2 * a)
  g = tf.gradients(a + b, [a, b])
  ```

  `stop_gradients` provides a way of stopping gradient after the graph has
  already been constructed, as compared to `tf.stop_gradient` which is used
  during graph construction. When the two approaches are combined,
  backpropagation stops at both `tf.stop_gradient` nodes and nodes in
  `stop_gradients`, whichever is encountered first.

  All integer tensors are considered constant with respect to all `xs`, as if
  they were included in `stop_gradients`.

  `unconnected_gradients` determines the value returned for each x in xs if it
  is unconnected in the graph to ys. By default this is None to safeguard
  against errors. Mathematically these gradients are zero, which can be
  requested using the `'zero'` option. `tf.UnconnectedGradients` provides the
  following options and behaviors:

  ```python
  a = tf.ones([1, 2])
  b = tf.ones([3, 1])
  g1 = tf.gradients([b], [a], unconnected_gradients='none')
  sess.run(g1)  # [None]

  g2 = tf.gradients([b], [a], unconnected_gradients='zero')
  sess.run(g2)  # [array([[0., 0.]], dtype=float32)]
  ```

  As a practical example from the backpropagation phase, this function can be
  used to evaluate the derivatives of a cost function with respect to weights
  `Ws` and biases `bs`. The sample below illustrates what it is typically used
  for:

  ```python
  Ws = tf.constant(0.)
  bs = 2 * Ws
  cost = Ws + bs  # This is just an example; please ignore the formulas.
  g = tf.gradients(cost, [Ws, bs])
  dCost_dW, dCost_db = g
  ```

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    grad_ys: Optional. A `Tensor` or list of tensors the same size as
      `ys` and holding the gradients computed for each y in `ys`.
    name: Optional name to use for grouping all the gradient ops together.
      Defaults to 'gradients'.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.
    gate_gradients: If True, add a tuple around the gradients returned
      for an operation. This avoids some race conditions.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.
    stop_gradients: Optional. A `Tensor` or list of tensors not to differentiate
      through.
    unconnected_gradients: Optional. Specifies the gradient value returned when
      the given input tensors are unconnected. Accepted values are constants
      defined in the class `tf.UnconnectedGradients` and the default value is
      `none`.

  Returns:
    A list of `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)`
    for y in `ys` and for x in `xs`.

  Raises:
    LookupError: if one of the operations between `x` and `y` does not
      have a registered gradient function.
    ValueError: if the arguments are invalid.
    RuntimeError: if called in Eager mode.

  """
  # Creating the gradient graph for control flow mutates Operations.
  # _mutation_lock ensures a Session.run call cannot occur between creating and
  # mutating new ops.
  # pylint: disable=protected-access
  with ops.get_default_graph()._mutation_lock():
    return gradients_util._GradientsHelper(
        ys, xs, grad_ys, name, colocate_gradients_with_ops,
        gate_gradients, aggregation_method, stop_gradients,
        unconnected_gradients)
  # pylint: enable=protected-access


@tf_export("gradients", v1=[])
def gradients_v2(ys,  # pylint: disable=invalid-name
                 xs,
                 grad_ys=None,
                 name="gradients",
                 gate_gradients=False,
                 aggregation_method=None,
                 stop_gradients=None,
                 unconnected_gradients=UnconnectedGradients.NONE):
  """Constructs symbolic derivatives of sum of `ys` w.r.t. x in `xs`.

  `tf.gradients` is only valid in a graph context. In particular,
  it is valid in the context of a `tf.function` wrapper, where code
  is executing as a graph.

  `ys` and `xs` are each a `Tensor` or a list of tensors. `grad_ys`
  is a list of `Tensor`, holding the gradients received by the
  `ys`. The list must be the same length as `ys`.

  `gradients()` adds ops to the graph to output the derivatives of `ys` with
  respect to `xs`. It returns a list of `Tensor` of length `len(xs)` where
  each tensor is the `sum(dy/dx)` for y in `ys` and for x in `xs`.

  `grad_ys` is a list of tensors of the same length as `ys` that holds
  the initial gradients for each y in `ys`. When `grad_ys` is None,
  we fill in a tensor of '1's of the shape of y for each y in `ys`. A
  user can provide their own initial `grad_ys` to compute the
  derivatives using a different initial gradient for each y (e.g., if
  one wanted to weight the gradient differently for each value in
  each y).
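
  For example, a minimal sketch of weighting each element of a `y` via
  `grad_ys` (the values below are purely illustrative):

  >>> @tf.function
  ... def example():
  ...   x = tf.constant([3.0, 4.0])
  ...   y = x * x  # dy/dx = 2 * x
  ...   # Scale the incoming gradient for `y` by 0.5 per element.
  ...   return tf.gradients([y], [x], grad_ys=[tf.constant([0.5, 0.5])])
  >>> example()
  [<tf.Tensor: shape=(2,), dtype=float32, numpy=array([3., 4.], ...)>]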

  `stop_gradients` is a `Tensor` or a list of tensors to be considered constant
  with respect to all `xs`. These tensors will not be backpropagated through,
  as though they had been explicitly disconnected using `stop_gradient`. Among
  other things, this allows computation of partial derivatives as opposed to
  total derivatives. For example:

  >>> @tf.function
  ... def example():
  ...   a = tf.constant(0.)
  ...   b = 2 * a
  ...   return tf.gradients(a + b, [a, b], stop_gradients=[a, b])
  >>> example()
  [<tf.Tensor: shape=(), dtype=float32, numpy=1.0>,
  <tf.Tensor: shape=(), dtype=float32, numpy=1.0>]

  Here the partial derivatives `g` evaluate to `[1.0, 1.0]`, compared to the
  total derivatives `tf.gradients(a + b, [a, b])`, which take into account the
  influence of `a` on `b` and evaluate to `[3.0, 1.0]`. Note that the above is
  equivalent to:

  >>> @tf.function
  ... def example():
  ...   a = tf.stop_gradient(tf.constant(0.))
  ...   b = tf.stop_gradient(2 * a)
  ...   return tf.gradients(a + b, [a, b])
  >>> example()
  [<tf.Tensor: shape=(), dtype=float32, numpy=1.0>,
  <tf.Tensor: shape=(), dtype=float32, numpy=1.0>]

  `stop_gradients` provides a way of stopping gradient after the graph has
  already been constructed, as compared to `tf.stop_gradient` which is used
  during graph construction. When the two approaches are combined,
  backpropagation stops at both `tf.stop_gradient` nodes and nodes in
  `stop_gradients`, whichever is encountered first.

  All integer tensors are considered constant with respect to all `xs`, as if
  they were included in `stop_gradients`.

  `unconnected_gradients` determines the value returned for each x in xs if it
  is unconnected in the graph to ys. By default this is None to safeguard
  against errors. Mathematically these gradients are zero, which can be
  requested using the `'zero'` option. `tf.UnconnectedGradients` provides the
  following options and behaviors:

  >>> @tf.function
  ... def example(use_zero):
  ...   a = tf.ones([1, 2])
  ...   b = tf.ones([3, 1])
  ...   if use_zero:
  ...     return tf.gradients([b], [a], unconnected_gradients='zero')
  ...   else:
  ...     return tf.gradients([b], [a], unconnected_gradients='none')
  >>> example(False)
  [None]
  >>> example(True)
  [<tf.Tensor: shape=(1, 2), dtype=float32, numpy=array([[0., 0.]], ...)>]

  As a practical example from the backpropagation phase, this function can be
  used to evaluate the derivatives of a cost function with respect to weights
  `Ws` and biases `bs`. The sample below illustrates what it is typically used
  for:

  >>> @tf.function
  ... def example():
  ...   Ws = tf.constant(0.)
  ...   bs = 2 * Ws
  ...   cost = Ws + bs  # This is just an example; please ignore the formulas.
  ...   g = tf.gradients(cost, [Ws, bs])
  ...   dCost_dW, dCost_db = g
  ...   return dCost_dW, dCost_db
  >>> example()
  (<tf.Tensor: shape=(), dtype=float32, numpy=3.0>,
  <tf.Tensor: shape=(), dtype=float32, numpy=1.0>)

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    grad_ys: Optional. A `Tensor` or list of tensors the same size as
      `ys` and holding the gradients computed for each y in `ys`.
    name: Optional name to use for grouping all the gradient ops together.
      Defaults to 'gradients'.
    gate_gradients: If True, add a tuple around the gradients returned
      for an operation. This avoids some race conditions.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.
    stop_gradients: Optional. A `Tensor` or list of tensors not to differentiate
      through.
    unconnected_gradients: Optional. Specifies the gradient value returned when
      the given input tensors are unconnected. Accepted values are constants
      defined in the class `tf.UnconnectedGradients` and the default value is
      `none`.

  Returns:
    A list of `Tensor` of length `len(xs)` where each tensor is the `sum(dy/dx)`
    for y in `ys` and for x in `xs`.

  Raises:
    LookupError: if one of the operations between `x` and `y` does not
      have a registered gradient function.
    ValueError: if the arguments are invalid.
    RuntimeError: if called in Eager mode.

  """
  # Creating the gradient graph for control flow mutates Operations.
  # _mutation_lock ensures a Session.run call cannot occur between creating and
  # mutating new ops.
  # pylint: disable=protected-access
  with ops.get_default_graph()._mutation_lock():
    return gradients_util._GradientsHelper(
        ys, xs, grad_ys, name, True, gate_gradients,
        aggregation_method, stop_gradients,
        unconnected_gradients)
  # pylint: enable=protected-access


# TODO(vrv): Make this available when we want to make it public.
def _hessian_vector_product(ys, xs, v):
  """Multiply the Hessian of `ys` wrt `xs` by `v`.

  This is an efficient construction that uses a backprop-like approach
  to compute the product between the Hessian and another vector. The
  Hessian is usually too large to be explicitly computed or even
  represented, but this method allows us to at least multiply by it
  for the same big-O cost as backprop.

  Implicit Hessian-vector products are the main practical, scalable way
  of using second derivatives with neural networks. They allow us to
  do things like construct Krylov subspaces and approximate conjugate
  gradient descent.

  Example: if `y` = 1/2 `x`^T A `x`, then `hessian_vector_product(y,
  x, v)` will return an expression that evaluates to the same values
  as 1/2 (A + A.T) `v`.
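
  A rough sketch of that identity (the values of `A`, `x`, and `v` below are
  made up for illustration, and a graph context is assumed):

  ```python
  A = tf.constant([[1., 2.], [3., 4.]])
  x = tf.constant([1., 1.])
  v = [tf.constant([1., 0.])]
  # y = 1/2 * x^T A x, written with standard TF ops.
  y = 0.5 * tf.tensordot(x, tf.linalg.matvec(A, x), axes=1)
  hvp = _hessian_vector_product(y, [x], v)
  # The result evaluates to 1/2 * (A + A.T) @ v[0], i.e. [1., 2.5].
  ```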

  Args:
    ys: A scalar value, or a tensor or list of tensors to be summed to
      yield a scalar.
    xs: A list of tensors that we should construct the Hessian over.
    v: A list of tensors, with the same shapes as xs, that we want to
      multiply by the Hessian.

  Returns:
    A list of tensors (or if the list would be length 1, a single tensor)
    containing the product between the Hessian and `v`.

  Raises:
    ValueError: if `xs` and `v` have different lengths.

  """

  # Validate the input
  length = len(xs)
  if len(v) != length:
    raise ValueError("xs and v must have the same length.")

  # First backprop
  grads = gradients(ys, xs)

  assert len(grads) == length
  elemwise_products = [
      math_ops.multiply(grad_elem, array_ops.stop_gradient(v_elem))
      for grad_elem, v_elem in zip(grads, v)
      if grad_elem is not None
  ]

  # Second backprop
  return gradients(elemwise_products, xs)


@tf_export(v1=["hessians"])
def hessians(ys,
             xs,
             name="hessians",
             colocate_gradients_with_ops=False,
             gate_gradients=False,
             aggregation_method=None):
  """Constructs the Hessian of sum of `ys` with respect to `x` in `xs`.

  `hessians()` adds ops to the graph to output the Hessian matrix of `ys`
  with respect to `xs`. It returns a list of `Tensor` of length `len(xs)`
  where each tensor is the Hessian of `sum(ys)`.

  The Hessian is a matrix of second-order partial derivatives of a scalar
  tensor (see https://en.wikipedia.org/wiki/Hessian_matrix for more details).
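
  For example, a small illustrative sketch (a graph/session context is
  assumed, and the values are made up):

  ```python
  x = tf.constant([1., 2.])
  y = tf.reduce_sum(x * x * x)  # sum of x_i^3
  h = tf.hessians(y, x)[0]
  # h evaluates to the diagonal matrix diag(6 * x_i) = [[6., 0.], [0., 12.]].
  ```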

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    name: Optional name to use for grouping all the gradient ops together.
      Defaults to 'hessians'.
    colocate_gradients_with_ops: See `gradients()` documentation for details.
    gate_gradients: See `gradients()` documentation for details.
    aggregation_method: See `gradients()` documentation for details.

  Returns:
    A list of Hessian matrices of `sum(ys)` for each `x` in `xs`.

  Raises:
    LookupError: if one of the operations between `xs` and `ys` does not
      have a registered gradient function.
  """
  xs = gradients_util._AsList(xs)  # pylint: disable=protected-access
  kwargs = {
      "colocate_gradients_with_ops": colocate_gradients_with_ops,
      "gate_gradients": gate_gradients,
      "aggregation_method": aggregation_method
  }
  # Compute first-order derivatives and iterate for each x in xs.
  hessians = []
  _gradients = gradients(ys, xs, **kwargs)
  for gradient, x in zip(_gradients, xs):
    # change shape to one-dimension without graph branching
    gradient = array_ops.reshape(gradient, [-1])

    # Declare an iterator and tensor array loop variables for the gradients.
    n = array_ops.size(x)
    loop_vars = [
        array_ops.constant(0, dtypes.int32),
        tensor_array_ops.TensorArray(x.dtype, n)
    ]
    # Iterate over all elements of the gradient and compute second order
    # derivatives.
    _, hessian = while_loop.while_loop(
        lambda j, _: j < n,
        lambda j, result: (j + 1,
                           result.write(j, gradients(gradient[j], x)[0])),
        loop_vars
    )

    _shape = array_ops.shape(x)
    _reshaped_hessian = array_ops.reshape(hessian.stack(),
                                          array_ops.concat((_shape, _shape), 0))
    hessians.append(_reshaped_hessian)
  return hessians


@tf_export("hessians", v1=[])
def HessiansV2(ys,
               xs,
               gate_gradients=False,
               aggregation_method=None,
               name="hessians"):
  """Constructs the Hessian of sum of `ys` with respect to `x` in `xs`.

  `hessians()` adds ops to the graph to output the Hessian matrix of `ys`
  with respect to `xs`. It returns a list of `Tensor` of length `len(xs)`
  where each tensor is the Hessian of `sum(ys)`.

  The Hessian is a matrix of second-order partial derivatives of a scalar
  tensor (see https://en.wikipedia.org/wiki/Hessian_matrix for more details).
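
  For example, a minimal sketch (the value of `x` is made up for
  illustration):

  >>> @tf.function
  ... def example():
  ...   x = tf.constant(2.)
  ...   y = x * x * x  # d2y/dx2 = 6 * x
  ...   return tf.hessians(y, x)
  >>> example()
  [<tf.Tensor: shape=(), dtype=float32, numpy=12.0>]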

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    gate_gradients: See `gradients()` documentation for details.
    aggregation_method: See `gradients()` documentation for details.
    name: Optional name to use for grouping all the gradient ops together.
      Defaults to 'hessians'.

  Returns:
    A list of Hessian matrices of `sum(ys)` for each `x` in `xs`.

  Raises:
    LookupError: if one of the operations between `xs` and `ys` does not
      have a registered gradient function.
  """
  return hessians(
      ys,
      xs,
      name=name,
      colocate_gradients_with_ops=True,
      gate_gradients=gate_gradients,
      aggregation_method=aggregation_method)