Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/training/adam.py: 26%
90 statements
coverage.py v7.4.0, created at 2024-01-03 07:57 +0000

# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Adam for TensorFlow."""
from tensorflow.python.eager import context
from tensorflow.python.framework import ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.training import optimizer
from tensorflow.python.training import training_ops
from tensorflow.python.util.tf_export import tf_export


@tf_export(v1=["train.AdamOptimizer"])
class AdamOptimizer(optimizer.Optimizer):
  """Optimizer that implements the Adam algorithm.

  References:
    Adam - A Method for Stochastic Optimization:
      [Kingma et al., 2015](https://arxiv.org/abs/1412.6980)
      ([pdf](https://arxiv.org/pdf/1412.6980.pdf))

  @compatibility(TF2)
  tf.compat.v1.train.AdamOptimizer is compatible with eager mode and
  `tf.function`.
  When eager execution is enabled, `learning_rate`, `beta1`, `beta2`, and
  `epsilon` can each be a callable that takes no arguments and returns the
  actual value to use. This can be useful for changing these values across
  different invocations of optimizer functions.
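
  For example, a zero-argument callable can be supplied in place of a
  constant (a minimal sketch; `lr_schedule` is an illustrative name):

  ```python
  lr_schedule = lambda: 0.001
  optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=lr_schedule)
  ```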

  To switch to native TF2 style, use [`tf.keras.optimizers.Adam`]
  (https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/Adam)
  instead. Please note that, due to implementation differences,
  `tf.keras.optimizers.Adam` and `tf.compat.v1.train.AdamOptimizer` may have
  slight differences in floating point numerics even though the formula used
  for the variable updates still matches.

  #### Structural Mapping to Native TF2

  Before:

  ```python
  optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=0.001)
  ```

  After:

  ```python
  optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
  ```

  #### How to Map Arguments

  | TF1 Arg Name  | TF2 Arg Name  | Note                                      |
  |---------------|---------------|-------------------------------------------|
  | learning_rate | learning_rate | Be careful of setting `learning_rate` as a tensor value computed from the global step. In TF1 this usually implied a dynamic learning rate that was recomputed at each step. In TF2 (eager + function) it is treated as a scalar value that is computed only once, not as a symbolic placeholder recomputed each time. |
  | beta1         | beta_1        |                                           |
  | beta2         | beta_2        |                                           |
  | epsilon       | epsilon       | Default value is 1e-08 in TF1, but 1e-07 in TF2. |
  | use_locking   | N/A           | Not applicable in TF2.                    |

  #### Before & After Usage Example

  Before:

  ```python
  x = tf.Variable([1,2,3], dtype=tf.float32)
  grad = tf.constant([0.1, 0.2, 0.3])
  optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=0.001)
  optimizer.apply_gradients(zip([grad], [x]))
  ```

  After:

  ```python
  x = tf.Variable([1,2,3], dtype=tf.float32)
  grad = tf.constant([0.1, 0.2, 0.3])
  optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
  optimizer.apply_gradients(zip([grad], [x]))
  ```

  @end_compatibility
  """

  def __init__(self,
               learning_rate=0.001,
               beta1=0.9,
               beta2=0.999,
               epsilon=1e-8,
               use_locking=False,
               name="Adam"):
    r"""Construct a new Adam optimizer.

    Initialization:

    $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
    $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
    $$t := 0 \text{(Initialize timestep)}$$

    The update rule for `variable` with gradient `g` uses an optimization
    described at the end of section 2 of the paper:

    $$t := t + 1$$
    $$\text{lr}_t := \mathrm{learning\_rate} *
      \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$

    $$m_t := \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
    $$v_t := \beta_2 * v_{t-1} + (1 - \beta_2) * g * g$$
    $$\text{variable} := \text{variable} -
      \text{lr}_t * m_t / (\sqrt{v_t} + \epsilon)$$

    The default value of 1e-8 for epsilon might not be a good default in
    general. For example, when training an Inception network on ImageNet a
    current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
    formulation just before Section 2.1 of the Kingma and Ba paper rather than
    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
    hat" in the paper.

    The sparse implementation of this algorithm (used when the gradient is an
    IndexedSlices object, typically because of `tf.gather` or an embedding
    lookup in the forward pass) does apply momentum to variable slices even if
    they were not used in the forward pass (meaning they have a gradient equal
    to zero). Momentum decay (beta1) is also applied to the entire momentum
    accumulator. This means that the sparse behavior is equivalent to the dense
    behavior (in contrast to some momentum implementations which ignore momentum
    unless a variable slice was actually used).
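
    For illustration, a graph-mode (TF1-style) sketch: a gradient produced by
    `tf.gather` is a `tf.IndexedSlices` and therefore takes this sparse update
    path (variable and loss below are arbitrary examples):

    ```python
    params = tf.Variable(tf.zeros([10, 4]))
    loss = tf.reduce_sum(tf.gather(params, [1, 3]))
    grad = tf.gradients(loss, [params])[0]  # tf.IndexedSlices
    opt = tf.compat.v1.train.AdamOptimizer(learning_rate=0.001)
    train_op = opt.apply_gradients([(grad, params)])  # routed to the sparse path
    ```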

    Args:
      learning_rate: A Tensor or a floating point value. The learning rate.
      beta1: A float value or a constant float tensor. The exponential decay
        rate for the 1st moment estimates.
      beta2: A float value or a constant float tensor. The exponential decay
        rate for the 2nd moment estimates.
      epsilon: A small constant for numerical stability. This epsilon is
        "epsilon hat" in the Kingma and Ba paper (in the formula just before
        Section 2.1), not the epsilon in Algorithm 1 of the paper.
      use_locking: If True use locks for update operations.
      name: Optional name for the operations created when applying gradients.
        Defaults to "Adam".
    """

    super(AdamOptimizer, self).__init__(use_locking, name)
    self._lr = learning_rate
    self._beta1 = beta1
    self._beta2 = beta2
    self._epsilon = epsilon

    # Tensor versions of the constructor arguments, created in _prepare().
    self._lr_t = None
    self._beta1_t = None
    self._beta2_t = None
    self._epsilon_t = None

  def _get_beta_accumulators(self):
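    # Fetch the beta1_power and beta2_power non-slot variables created in
    # _create_slots(); they hold beta1^t and beta2^t for bias correction.
    # graph=None makes the lookup work under eager execution.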
    with ops.init_scope():
      if context.executing_eagerly():
        graph = None
      else:
        graph = ops.get_default_graph()
      return (self._get_non_slot_variable("beta1_power", graph=graph),
              self._get_non_slot_variable("beta2_power", graph=graph))

  def _create_slots(self, var_list):
    # Create the beta1 and beta2 accumulators on the same device as the first
    # variable. Sort the var_list to make sure this device is consistent across
    # workers (these need to go on the same PS, otherwise some updates are
    # silently ignored).
    first_var = min(var_list, key=lambda x: x.name)
    self._create_non_slot_variable(
        initial_value=self._beta1, name="beta1_power", colocate_with=first_var)
    self._create_non_slot_variable(
        initial_value=self._beta2, name="beta2_power", colocate_with=first_var)

    # Create slots for the first and second moments.
    for v in var_list:
      self._zeros_slot(v, "m", self._name)
      self._zeros_slot(v, "v", self._name)

  def _prepare(self):
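    # Hyperparameters may be given as zero-argument callables; resolve them
    # here and cache tensor versions for use by the update ops below.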
    lr = self._call_if_callable(self._lr)
    beta1 = self._call_if_callable(self._beta1)
    beta2 = self._call_if_callable(self._beta2)
    epsilon = self._call_if_callable(self._epsilon)

    self._lr_t = ops.convert_to_tensor(lr, name="learning_rate")
    self._beta1_t = ops.convert_to_tensor(beta1, name="beta1")
    self._beta2_t = ops.convert_to_tensor(beta2, name="beta2")
    self._epsilon_t = ops.convert_to_tensor(epsilon, name="epsilon")

  def _apply_dense(self, grad, var):
    m = self.get_slot(var, "m")
    v = self.get_slot(var, "v")
    beta1_power, beta2_power = self._get_beta_accumulators()
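    # The fused kernel performs the full bias-corrected Adam update of var, m
    # and v in a single op, using beta1_power and beta2_power for the bias
    # correction.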
    return training_ops.apply_adam(
        var,
        m,
        v,
        math_ops.cast(beta1_power, var.dtype.base_dtype),
        math_ops.cast(beta2_power, var.dtype.base_dtype),
        math_ops.cast(self._lr_t, var.dtype.base_dtype),
        math_ops.cast(self._beta1_t, var.dtype.base_dtype),
        math_ops.cast(self._beta2_t, var.dtype.base_dtype),
        math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
        grad,
        use_locking=self._use_locking).op

  def _resource_apply_dense(self, grad, var):
    m = self.get_slot(var, "m")
    v = self.get_slot(var, "v")
    beta1_power, beta2_power = self._get_beta_accumulators()
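    # Same fused update as _apply_dense, but addressing the variable and its
    # slots through resource handles.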
    return training_ops.resource_apply_adam(
        var.handle,
        m.handle,
        v.handle,
        math_ops.cast(beta1_power, grad.dtype.base_dtype),
        math_ops.cast(beta2_power, grad.dtype.base_dtype),
        math_ops.cast(self._lr_t, grad.dtype.base_dtype),
        math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
        math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
        math_ops.cast(self._epsilon_t, grad.dtype.base_dtype),
        grad,
        use_locking=self._use_locking)

  def _apply_sparse_shared(self, grad, var, indices, scatter_add):
    beta1_power, beta2_power = self._get_beta_accumulators()
    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
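    # Fold the bias corrections (1 - beta2^t) and (1 - beta1^t) into the step
    # size, matching lr_t in the update rule documented in __init__.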
    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_scaled_g_values = grad * (1 - beta1_t)
    m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
    with ops.control_dependencies([m_t]):
      m_t = scatter_add(m, indices, m_scaled_g_values)
    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_scaled_g_values = (grad * grad) * (1 - beta2_t)
    v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
    with ops.control_dependencies([v_t]):
      v_t = scatter_add(v, indices, v_scaled_g_values)
    v_sqrt = math_ops.sqrt(v_t)
    var_update = state_ops.assign_sub(
        var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
    return control_flow_ops.group(*[var_update, m_t, v_t])

  def _apply_sparse(self, grad, var):
    return self._apply_sparse_shared(
        grad.values,
        var,
        grad.indices,
        lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
            x,
            i,
            v,
            use_locking=self._use_locking))

  def _resource_scatter_add(self, x, i, v):
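    # Scatter-add into the resource variable, then return its value with a
    # control dependency so readers observe the updated contents.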
    with ops.control_dependencies(
        [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
      return x.value()

  def _resource_apply_sparse(self, grad, var, indices):
    return self._apply_sparse_shared(grad, var, indices,
                                     self._resource_scatter_add)

  def _finish(self, update_ops, name_scope):
    # Update the power accumulators.
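    # beta1_power and beta2_power hold beta1^t and beta2^t for bias
    # correction; they advance exactly once per step, after all per-variable
    # updates have run.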
    with ops.control_dependencies(update_ops):
      beta1_power, beta2_power = self._get_beta_accumulators()
      with ops.colocate_with(beta1_power):
        update_beta1 = beta1_power.assign(
            beta1_power * self._beta1_t, use_locking=self._use_locking)
        update_beta2 = beta2_power.assign(
            beta2_power * self._beta2_t, use_locking=self._use_locking)
      return control_flow_ops.group(
          *update_ops + [update_beta1, update_beta2], name=name_scope)