# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Adam optimizer implementation."""

import tensorflow.compat.v2 as tf

from keras.src import backend_config
from keras.src.optimizers.legacy import optimizer_v2

# isort: off
from tensorflow.python.util.tf_export import keras_export


@keras_export(
    "keras.optimizers.legacy.Adam",
    v1=["keras.optimizers.Adam", "keras.optimizers.legacy.Adam"],
)
class Adam(optimizer_v2.OptimizerV2):
    r"""Optimizer that implements the Adam algorithm.

    Adam optimization is a stochastic gradient descent method that is based on
    adaptive estimation of first-order and second-order moments.

    According to
    [Kingma et al., 2014](http://arxiv.org/abs/1412.6980),
    the method is "*computationally
    efficient, has little memory requirement, invariant to diagonal rescaling
    of gradients, and is well suited for problems that are large in terms of
    data/parameters*".

    Args:
      learning_rate: A `Tensor`, floating point value, or a schedule that is a
        `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
        that takes no arguments and returns the actual value to use. The
        learning rate. Defaults to `0.001`.
      beta_1: A float value or a constant float tensor, or a callable
        that takes no arguments and returns the actual value to use. The
        exponential decay rate for the 1st moment estimates. Defaults to `0.9`.
      beta_2: A float value or a constant float tensor, or a callable
        that takes no arguments and returns the actual value to use. The
        exponential decay rate for the 2nd moment estimates. Defaults to
        `0.999`.
      epsilon: A small constant for numerical stability. This epsilon is
        "epsilon hat" in the Kingma and Ba paper (in the formula just before
        Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
        `1e-7`.
      amsgrad: Boolean. Whether to apply the AMSGrad variant of this algorithm
        from the paper "On the Convergence of Adam and Beyond". Defaults to
        `False`.
      name: Optional name for the operations created when applying gradients.
        Defaults to `"Adam"`.
      **kwargs: keyword arguments. Allowed arguments are `clipvalue`,
        `clipnorm`, `global_clipnorm`.
        If `clipvalue` (float) is set, the gradient of each weight
        is clipped to be no higher than this value.
        If `clipnorm` (float) is set, the gradient of each weight
        is individually clipped so that its norm is no higher than this value.
        If `global_clipnorm` (float) is set, the gradient of all weights is
        clipped so that their global norm is no higher than this value.

    Usage:

    >>> opt = tf.keras.optimizers.legacy.Adam(learning_rate=0.1)
    >>> var1 = tf.Variable(10.0)
    >>> loss = lambda: (var1 ** 2)/2.0  # d(loss)/d(var1) == var1
    >>> step_count = opt.minimize(loss, [var1]).numpy()
    >>> # The first step is `-learning_rate*sign(grad)`
    >>> var1.numpy()
    9.9

    Reference:
      - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
      - [Reddi et al., 2018](
        https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`.

    Notes:

    The default value of 1e-7 for epsilon might not be a good default in
    general. For example, when training an Inception network on ImageNet a
    current good choice is 1.0 or 0.1. Note that since Adam uses the
    formulation just before Section 2.1 of the Kingma and Ba paper rather than
    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
    hat" in the paper.

    The sparse implementation of this algorithm (used when the gradient is an
    IndexedSlices object, typically because of `tf.gather` or an embedding
    lookup in the forward pass) does apply momentum to variable slices even if
    they were not used in the forward pass (meaning they have a gradient equal
    to zero). Momentum decay (beta1) is also applied to the entire momentum
    accumulator. This means that the sparse behavior is equivalent to the
    dense behavior (in contrast to some momentum implementations which ignore
    momentum unless a variable slice was actually used).
    """

    _HAS_AGGREGATE_GRAD = True

    def __init__(
        self,
        learning_rate=0.001,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-7,
        amsgrad=False,
        name="Adam",
        **kwargs
    ):
        super().__init__(name, **kwargs)
        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
        self._set_hyper("decay", self._initial_decay)
        self._set_hyper("beta_1", beta_1)
        self._set_hyper("beta_2", beta_2)
        self.epsilon = epsilon or backend_config.epsilon()
        self.amsgrad = amsgrad

    def _create_slots(self, var_list):
        # Create slots for the first and second moments.
        # Separate for-loops to respect the ordering of slot variables from v1.
        for var in var_list:
            self.add_slot(var, "m")
        for var in var_list:
            self.add_slot(var, "v")
        if self.amsgrad:
            for var in var_list:
                self.add_slot(var, "vhat")

    def _prepare_local(self, var_device, var_dtype, apply_state):
        super()._prepare_local(var_device, var_dtype, apply_state)

        local_step = tf.cast(self.iterations + 1, var_dtype)
        beta_1_t = tf.identity(self._get_hyper("beta_1", var_dtype))
        beta_2_t = tf.identity(self._get_hyper("beta_2", var_dtype))
        beta_1_power = tf.pow(beta_1_t, local_step)
        beta_2_power = tf.pow(beta_2_t, local_step)
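        # The paper's bias correction is folded into the learning rate here:
        # lr = lr_t * sqrt(1 - beta_2**t) / (1 - beta_1**t). The sparse path
        # below uses this pre-corrected `lr`; the dense path instead passes the
        # raw `lr_t` together with the beta powers, leaving the correction to
        # the fused kernel.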
        lr = apply_state[(var_device, var_dtype)]["lr_t"] * (
            tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)
        )
        apply_state[(var_device, var_dtype)].update(
            dict(
                lr=lr,
                epsilon=tf.convert_to_tensor(self.epsilon, var_dtype),
                beta_1_t=beta_1_t,
                beta_1_power=beta_1_power,
                one_minus_beta_1_t=1 - beta_1_t,
                beta_2_t=beta_2_t,
                beta_2_power=beta_2_power,
                one_minus_beta_2_t=1 - beta_2_t,
            )
        )

    def set_weights(self, weights):
        params = self.weights
        # If the weights were generated by the Keras V1 optimizer, they include
        # vhats even without amsgrad, i.e., the V1 optimizer has 3x + 1
        # variables, while the V2 optimizer has 2x + 1 variables. Filter the
        # vhats out for compatibility.
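        # Worked example: with two model variables, `params` holds the
        # iteration count plus one `m` and one `v` slot per variable
        # (5 weights, so num_vars == 2), while a V1 checkpoint carries
        # 3 * 2 + 1 == 7 weights (it also has `vhat` slots) and is truncated
        # to the first 5 below.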
        num_vars = int((len(params) - 1) / 2)
        if len(weights) == 3 * num_vars + 1:
            weights = weights[: len(params)]
        super().set_weights(weights)

    def _resource_apply_dense(self, grad, var, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)

        m = self.get_slot(var, "m")
        v = self.get_slot(var, "v")
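
        # Delegate the dense update to the fused TF kernels. Note that the
        # uncorrected `lr_t` is passed (not the pre-corrected `lr` from
        # `_prepare_local`) along with `beta_1_power`/`beta_2_power`, so the
        # kernel performs the bias correction internally.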
        if not self.amsgrad:
            return tf.raw_ops.ResourceApplyAdam(
                var=var.handle,
                m=m.handle,
                v=v.handle,
                beta1_power=coefficients["beta_1_power"],
                beta2_power=coefficients["beta_2_power"],
                lr=coefficients["lr_t"],
                beta1=coefficients["beta_1_t"],
                beta2=coefficients["beta_2_t"],
                epsilon=coefficients["epsilon"],
                grad=grad,
                use_locking=self._use_locking,
            )
        else:
            vhat = self.get_slot(var, "vhat")
            return tf.raw_ops.ResourceApplyAdamWithAmsgrad(
                var=var.handle,
                m=m.handle,
                v=v.handle,
                vhat=vhat.handle,
                beta1_power=coefficients["beta_1_power"],
                beta2_power=coefficients["beta_2_power"],
                lr=coefficients["lr_t"],
                beta1=coefficients["beta_1_t"],
                beta2=coefficients["beta_2_t"],
                epsilon=coefficients["epsilon"],
                grad=grad,
                use_locking=self._use_locking,
            )

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)

        # m_t = beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, "m")
        m_scaled_g_values = grad * coefficients["one_minus_beta_1_t"]
        m_t = tf.compat.v1.assign(
            m, m * coefficients["beta_1_t"], use_locking=self._use_locking
        )
        with tf.control_dependencies([m_t]):
            m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)

        # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v = self.get_slot(var, "v")
        v_scaled_g_values = (grad * grad) * coefficients["one_minus_beta_2_t"]
        v_t = tf.compat.v1.assign(
            v, v * coefficients["beta_2_t"], use_locking=self._use_locking
        )
        with tf.control_dependencies([v_t]):
            v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)
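
        # Both accumulators are decayed densely (over every row) and the
        # gradient contribution is then scatter-added only at `indices`; this
        # is the "momentum applied to all slices" behavior described in the
        # class docstring.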
        if not self.amsgrad:
            v_sqrt = tf.sqrt(v_t)
            var_update = tf.compat.v1.assign_sub(
                var,
                coefficients["lr"] * m_t / (v_sqrt + coefficients["epsilon"]),
                use_locking=self._use_locking,
            )
            return tf.group(*[var_update, m_t, v_t])
        else:
            v_hat = self.get_slot(var, "vhat")
            v_hat_t = tf.maximum(v_hat, v_t)
            with tf.control_dependencies([v_hat_t]):
                v_hat_t = tf.compat.v1.assign(
                    v_hat, v_hat_t, use_locking=self._use_locking
                )
            v_hat_sqrt = tf.sqrt(v_hat_t)
            var_update = tf.compat.v1.assign_sub(
                var,
                coefficients["lr"]
                * m_t
                / (v_hat_sqrt + coefficients["epsilon"]),
                use_locking=self._use_locking,
            )
            return tf.group(*[var_update, m_t, v_t, v_hat_t])

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "learning_rate": self._serialize_hyperparameter(
                    "learning_rate"
                ),
                "decay": self._initial_decay,
                "beta_1": self._serialize_hyperparameter("beta_1"),
                "beta_2": self._serialize_hyperparameter("beta_2"),
                "epsilon": self.epsilon,
                "amsgrad": self.amsgrad,
            }
        )
        return config
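

# A minimal (illustrative) serialization sketch: the config returned by
# `get_config` above can rebuild an equivalent optimizer via the `from_config`
# classmethod inherited from `OptimizerV2`, e.g.
#
#     opt = Adam(learning_rate=0.01, amsgrad=True)
#     opt_clone = Adam.from_config(opt.get_config())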


class NonFusedAdam(optimizer_v2.OptimizerV2):
    r"""Optimizer that implements the Adam algorithm without fused kernels.

    Adam optimization is a stochastic gradient descent method that is based on
    adaptive estimation of first-order and second-order moments.
    According to the paper
    [Adam: A Method for Stochastic Optimization. Kingma et al.,
    2014](http://arxiv.org/abs/1412.6980), the method is "*computationally
    efficient, has little memory requirement, invariant to diagonal rescaling
    of gradients, and is well suited for problems that are large in terms of
    data/parameters*".

    For AMSGrad see [On The Convergence Of Adam And Beyond.
    Reddi et al., 2018](https://openreview.net/pdf?id=ryQu7f-RZ).

    **If amsgrad = False**:

    Initialize $m_0$ as the 1st moment vector.
    Initialize $v_0$ as the 2nd moment vector.

    The update rule for $\theta$ with gradient $g$ uses an optimization
    described at the end of section 2 of the paper:

    $$lr_t = \mathrm{learning\_rate} *
        \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$
    $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
    $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
    $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$

    **If amsgrad = True**:

    Initialize $m_0$ as the 1st moment vector.
    Initialize $v_0$ as the 2nd moment vector.
    Initialize $\hat{v}_0$ as the 2nd moment vector.

    The update rule for $\theta$ with gradient $g$ uses an optimization
    described at the end of section 2 of the paper:

    $$lr_t = \mathrm{learning\_rate} *
        \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$

    $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
    $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
    $$\hat{v}_t = \max(\hat{v}_{t-1}, v_t)$$
    $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{\hat{v}_t} + \epsilon)$$

    The default value of 1e-7 for epsilon might not be a good default in
    general. For example, when training an Inception network on ImageNet a
    current good choice is 1.0 or 0.1. Note that since Adam uses the
    formulation just before Section 2.1 of the Kingma and Ba paper rather than
    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
    hat" in the paper.

    The sparse implementation of this algorithm (used when the gradient is an
    IndexedSlices object, typically because of `tf.gather` or an embedding
    lookup in the forward pass) does apply momentum to variable slices even if
    they were not used in the forward pass (meaning they have a gradient equal
    to zero). Momentum decay (beta1) is also applied to the entire momentum
    accumulator. This means that the sparse behavior is equivalent to the
    dense behavior (in contrast to some momentum implementations which ignore
    momentum unless a variable slice was actually used).

    Usage:

    >>> opt = tf.keras.optimizers.legacy.Adam(learning_rate=0.1)
    >>> var1 = tf.Variable(10.0)
    >>> loss = lambda: (var1 ** 2)/2.0  # d(loss)/d(var1) == var1
    >>> step_count = opt.minimize(loss, [var1]).numpy()
    >>> # The first step is `-learning_rate*sign(grad)`
    >>> var1.numpy()
    9.9
    """

    _HAS_AGGREGATE_GRAD = True

    def __init__(
        self,
        learning_rate=0.001,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-7,
        amsgrad=False,
        name="Adam",
        **kwargs
    ):
        """Construct a new Adam optimizer.

        Args:
          learning_rate: A `Tensor`, floating point value, or a schedule that
            is a `tf.keras.optimizers.schedules.LearningRateSchedule`, or a
            callable that takes no arguments and returns the actual value to
            use. The learning rate. Defaults to `0.001`.
          beta_1: A float value or a constant float tensor, or a callable that
            takes no arguments and returns the actual value to use. The
            exponential decay rate for the 1st moment estimates. Defaults to
            `0.9`.
          beta_2: A float value or a constant float tensor, or a callable that
            takes no arguments and returns the actual value to use. The
            exponential decay rate for the 2nd moment estimates. Defaults to
            `0.999`.
          epsilon: A small constant for numerical stability. This epsilon is
            "epsilon hat" in the Kingma and Ba paper (in the formula just
            before Section 2.1), not the epsilon in Algorithm 1 of the paper.
            Defaults to `1e-7`.
          amsgrad: Boolean. Whether to apply the AMSGrad variant of this
            algorithm from the paper "On the Convergence of Adam and Beyond".
            Defaults to `False`.
          name: Optional name for the operations created when applying
            gradients. Defaults to `"Adam"`.
          **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`,
            `lr`, `decay`}. `clipnorm` clips gradients by norm; `clipvalue`
            clips gradients by value. `decay` is included for backward
            compatibility to allow time-inverse decay of the learning rate.
            `lr` is included for backward compatibility; it is recommended to
            use `learning_rate` instead.
        """

        super().__init__(name, **kwargs)
        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
        self._set_hyper("decay", self._initial_decay)
        self._set_hyper("beta_1", beta_1)
        self._set_hyper("beta_2", beta_2)
        self.epsilon = epsilon or backend_config.epsilon()
        self.amsgrad = amsgrad

    def _create_slots(self, var_list):
        # Create slots for the first and second moments.
        # Separate for-loops to respect the ordering of slot variables from v1.
        for var in var_list:
            self.add_slot(var, "m")
        for var in var_list:
            self.add_slot(var, "v")
        if self.amsgrad:
            for var in var_list:
                self.add_slot(var, "vhat")

    def _prepare_local(self, var_device, var_dtype, apply_state):
        super()._prepare_local(var_device, var_dtype, apply_state)

        local_step = tf.cast(self.iterations + 1, var_dtype)
        beta_1_t = tf.identity(self._get_hyper("beta_1", var_dtype))
        beta_2_t = tf.identity(self._get_hyper("beta_2", var_dtype))
        beta_1_power = tf.pow(beta_1_t, local_step)
        beta_2_power = tf.pow(beta_2_t, local_step)
        lr = apply_state[(var_device, var_dtype)]["lr_t"] * (
            tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)
        )
        apply_state[(var_device, var_dtype)].update(
            dict(
                lr=lr,
                epsilon=tf.convert_to_tensor(self.epsilon, var_dtype),
                beta_1_t=beta_1_t,
                beta_1_power=beta_1_power,
                one_minus_beta_1_t=1 - beta_1_t,
                beta_2_t=beta_2_t,
                beta_2_power=beta_2_power,
                one_minus_beta_2_t=1 - beta_2_t,
            )
        )

    def set_weights(self, weights):
        params = self.weights
        # If the weights were generated by the Keras V1 optimizer, they include
        # vhats even without amsgrad, i.e., the V1 optimizer has 3x + 1
        # variables, while the V2 optimizer has 2x + 1 variables. Filter the
        # vhats out for compatibility.
        num_vars = int((len(params) - 1) / 2)
        if len(weights) == 3 * num_vars + 1:
            weights = weights[: len(params)]
        super().set_weights(weights)

    @tf.function(jit_compile=True)
    def _resource_apply_dense(self, grad, var, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)

        m = self.get_slot(var, "m")
        v = self.get_slot(var, "v")

        alpha = (
            coefficients["lr_t"]
            * tf.sqrt(1 - coefficients["beta_2_power"])
            / (1 - coefficients["beta_1_power"])
        )
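        # `x.assign_add((y - x) * (1 - beta))` is algebraically the same EMA
        # update as `x = beta * x + (1 - beta) * y`, written as a single
        # in-place increment.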
        m.assign_add((grad - m) * (1 - coefficients["beta_1_t"]))
        v.assign_add((tf.square(grad) - v) * (1 - coefficients["beta_2_t"]))
        if self.amsgrad:
            vhat = self.get_slot(var, "vhat")
            vhat.assign(tf.maximum(vhat, v))
            v = vhat
        var.assign_sub((m * alpha) / (tf.sqrt(v) + coefficients["epsilon"]))

    @tf.function(jit_compile=True)
    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)

        # m_t = beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, "m")
        m_scaled_g_values = grad * coefficients["one_minus_beta_1_t"]
        m.assign(m * coefficients["beta_1_t"])
        m.scatter_add(tf.IndexedSlices(m_scaled_g_values, indices))

        # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v = self.get_slot(var, "v")
        v_scaled_g_values = (grad * grad) * coefficients["one_minus_beta_2_t"]
        v.assign(v * coefficients["beta_2_t"])
        v.scatter_add(tf.IndexedSlices(v_scaled_g_values, indices))

        if not self.amsgrad:
            var.assign_sub(
                coefficients["lr"] * m / (tf.sqrt(v) + coefficients["epsilon"])
            )
        else:
            v_hat = self.get_slot(var, "vhat")
            v_hat.assign(tf.maximum(v_hat, v))
            var.assign_sub(
                coefficients["lr"]
                * m
                / (tf.sqrt(v_hat) + coefficients["epsilon"])
            )

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "learning_rate": self._serialize_hyperparameter(
                    "learning_rate"
                ),
                "decay": self._initial_decay,
                "beta_1": self._serialize_hyperparameter("beta_1"),
                "beta_2": self._serialize_hyperparameter("beta_2"),
                "epsilon": self.epsilon,
                "amsgrad": self.amsgrad,
            }
        )
        return config
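

if __name__ == "__main__":
    # Minimal sanity-check sketch (illustrative only, not part of the library):
    # it mirrors the docstring examples above and shows that the fused and
    # non-fused implementations take approximately the same first step.
    opt_fused = Adam(learning_rate=0.1)
    opt_nonfused = NonFusedAdam(learning_rate=0.1)
    var_a, var_b = tf.Variable(10.0), tf.Variable(10.0)
    opt_fused.minimize(lambda: (var_a**2) / 2.0, [var_a])
    opt_nonfused.minimize(lambda: (var_b**2) / 2.0, [var_b])
    # Both end up near 9.9: the first step is roughly
    # -learning_rate * sign(grad).
    print(var_a.numpy(), var_b.numpy())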