Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/keras/src/optimizers/adam.py: 21%
62 statements
coverage.py v7.4.0, created at 2024-01-03 07:57 +0000
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Adam optimizer implementation."""

import tensorflow.compat.v2 as tf

from keras.src.optimizers import optimizer
from keras.src.saving.object_registration import register_keras_serializable

# isort: off
from tensorflow.python.util.tf_export import keras_export


@register_keras_serializable()
@keras_export(
    "keras.optimizers.Adam",
    "keras.optimizers.experimental.Adam",
    "keras.dtensor.experimental.optimizers.Adam",
    v1=[],
)
class Adam(optimizer.Optimizer):
    r"""Optimizer that implements the Adam algorithm.

    Adam optimization is a stochastic gradient descent method that is based on
    adaptive estimation of first-order and second-order moments.

    According to
    [Kingma et al., 2014](http://arxiv.org/abs/1412.6980),
    the method is "*computationally
    efficient, has little memory requirement, invariant to diagonal rescaling of
    gradients, and is well suited for problems that are large in terms of
    data/parameters*".

    Args:
        learning_rate: A `tf.Tensor`, floating point value, a schedule that is a
            `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
            that takes no arguments and returns the actual value to use. The
            learning rate. Defaults to `0.001`.
        beta_1: A float value or a constant float tensor, or a callable
            that takes no arguments and returns the actual value to use. The
            exponential decay rate for the 1st moment estimates. Defaults to
            `0.9`.
        beta_2: A float value or a constant float tensor, or a callable
            that takes no arguments and returns the actual value to use. The
            exponential decay rate for the 2nd moment estimates. Defaults to
            `0.999`.
        epsilon: A small constant for numerical stability. This epsilon is
            "epsilon hat" in the Kingma and Ba paper (in the formula just before
            Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults
            to `1e-7`.
        amsgrad: Boolean. Whether to apply the AMSGrad variant of this algorithm
            from the paper "On the Convergence of Adam and Beyond". Defaults to
            `False`.
        {{base_optimizer_keyword_args}}

    Reference:
        - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
        - [Reddi et al., 2018](
            https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`.

    Notes:

    The default value of 1e-7 for epsilon might not be a good default in
    general. For example, when training an Inception network on ImageNet, a
    current good choice is 1.0 or 0.1. Note that since Adam uses the
    formulation just before Section 2.1 of the Kingma and Ba paper rather than
    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
    hat" in the paper.

    The sparse implementation of this algorithm (used when the gradient is an
    IndexedSlices object, typically because of `tf.gather` or an embedding
    lookup in the forward pass) does apply momentum to variable slices even if
    they were not used in the forward pass (meaning they have a gradient equal
    to zero). Momentum decay (beta1) is also applied to the entire momentum
    accumulator. This means that the sparse behavior is equivalent to the dense
    behavior (in contrast to some momentum implementations which ignore momentum
    unless a variable slice was actually used).
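
    Usage (a minimal illustrative sketch; the variable and loss below are
    arbitrary placeholders):

    >>> opt = tf.keras.optimizers.Adam(learning_rate=0.1)
    >>> var = tf.Variable(10.0)
    >>> with tf.GradientTape() as tape:
    ...     loss = var ** 2 / 2.0
    >>> grads = tape.gradient(loss, [var])
    >>> _ = opt.apply_gradients(zip(grads, [var]))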
88 """
90 def __init__(
91 self,
92 learning_rate=0.001,
93 beta_1=0.9,
94 beta_2=0.999,
95 epsilon=1e-7,
96 amsgrad=False,
97 weight_decay=None,
98 clipnorm=None,
99 clipvalue=None,
100 global_clipnorm=None,
101 use_ema=False,
102 ema_momentum=0.99,
103 ema_overwrite_frequency=None,
104 jit_compile=True,
105 name="Adam",
106 **kwargs
107 ):
108 super().__init__(
109 name=name,
110 weight_decay=weight_decay,
111 clipnorm=clipnorm,
112 clipvalue=clipvalue,
113 global_clipnorm=global_clipnorm,
114 use_ema=use_ema,
115 ema_momentum=ema_momentum,
116 ema_overwrite_frequency=ema_overwrite_frequency,
117 jit_compile=jit_compile,
118 **kwargs
119 )
120 self._learning_rate = self._build_learning_rate(learning_rate)
121 self.beta_1 = beta_1
122 self.beta_2 = beta_2
123 self.epsilon = epsilon
124 self.amsgrad = amsgrad

    def build(self, var_list):
        """Initialize optimizer variables.

        Adam optimizer has 3 types of variables: momentums, velocities, and
        velocity_hat (only set when amsgrad is applied).

        Args:
            var_list: list of model variables to build Adam variables on.
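
        Example (a minimal illustrative sketch; the two variables below are
        placeholders):

        >>> opt = Adam(amsgrad=True)
        >>> opt.build([tf.Variable(1.0), tf.Variable(2.0)])
        >>> # One "m" and one "v" slot is created per variable, plus one
        >>> # "vhat" slot per variable because `amsgrad=True`.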
134 """
135 super().build(var_list)
136 if hasattr(self, "_built") and self._built:
137 return
138 self._built = True
139 self._momentums = []
140 self._velocities = []
141 for var in var_list:
142 self._momentums.append(
143 self.add_variable_from_reference(
144 model_variable=var, variable_name="m"
145 )
146 )
147 self._velocities.append(
148 self.add_variable_from_reference(
149 model_variable=var, variable_name="v"
150 )
151 )
152 if self.amsgrad:
153 self._velocity_hats = []
154 for var in var_list:
155 self._velocity_hats.append(
156 self.add_variable_from_reference(
157 model_variable=var, variable_name="vhat"
158 )
159 )

    def update_step(self, gradient, variable):
        """Update step given gradient and the associated model variable."""
        beta_1_power = None
        beta_2_power = None
        lr = tf.cast(self.learning_rate, variable.dtype)
        local_step = tf.cast(self.iterations + 1, variable.dtype)
        beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step)
        beta_2_power = tf.pow(tf.cast(self.beta_2, variable.dtype), local_step)

        var_key = self._var_key(variable)
        m = self._momentums[self._index_dict[var_key]]
        v = self._velocities[self._index_dict[var_key]]

        # Bias-corrected step size: lr * sqrt(1 - beta_2^t) / (1 - beta_1^t).
        alpha = lr * tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)

        if isinstance(gradient, tf.IndexedSlices):
            # Sparse gradients: decay the full accumulators, then scatter the
            # new gradient contributions into the touched slices.
            m.assign_add(-m * (1 - self.beta_1))
            m.scatter_add(
                tf.IndexedSlices(
                    gradient.values * (1 - self.beta_1), gradient.indices
                )
            )
            v.assign_add(-v * (1 - self.beta_2))
            v.scatter_add(
                tf.IndexedSlices(
                    tf.square(gradient.values) * (1 - self.beta_2),
                    gradient.indices,
                )
            )
            if self.amsgrad:
                # AMSGrad uses the running maximum of the second moment.
                v_hat = self._velocity_hats[self._index_dict[var_key]]
                v_hat.assign(tf.maximum(v_hat, v))
                v = v_hat
            variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon))
        else:
            # Dense gradients: exponential moving averages of the gradient (m)
            # and its square (v).
            m.assign_add((gradient - m) * (1 - self.beta_1))
            v.assign_add((tf.square(gradient) - v) * (1 - self.beta_2))
            if self.amsgrad:
                # AMSGrad uses the running maximum of the second moment.
                v_hat = self._velocity_hats[self._index_dict[var_key]]
                v_hat.assign(tf.maximum(v_hat, v))
                v = v_hat
            variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon))

    def get_config(self):
        config = super().get_config()

        config.update(
            {
                "learning_rate": self._serialize_hyperparameter(
                    self._learning_rate
                ),
                "beta_1": self.beta_1,
                "beta_2": self.beta_2,
                "epsilon": self.epsilon,
                "amsgrad": self.amsgrad,
            }
        )
        return config


Adam.__doc__ = Adam.__doc__.replace(
    "{{base_optimizer_keyword_args}}", optimizer.base_optimizer_keyword_args
)
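

if __name__ == "__main__":
    # A minimal standalone sketch (illustrative, not part of the public API):
    # apply a single dense update and observe that the first Adam step is
    # approximately `learning_rate * sign(gradient)`, which follows from the
    # bias-corrected `alpha` in `update_step` when `epsilon` is negligible.
    var = tf.Variable(10.0)
    opt = Adam(learning_rate=0.1)
    with tf.GradientTape() as tape:
        loss = var ** 2 / 2.0  # d(loss)/d(var) == var == 10.0
    grads = tape.gradient(loss, [var])
    opt.apply_gradients(zip(grads, [var]))
    print(var.numpy())  # ~9.9, i.e. 10.0 - 0.1 * sign(10.0).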