Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/keras/src/optimizers/adam.py: 21%
62 statements
coverage.py v7.4.0, created at 2024-01-03 07:57 +0000
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Adam optimizer implementation."""

import tensorflow.compat.v2 as tf

from keras.src.optimizers import optimizer
from keras.src.saving.object_registration import register_keras_serializable

# isort: off
from tensorflow.python.util.tf_export import keras_export


@register_keras_serializable()
@keras_export(
    "keras.optimizers.Adam",
    "keras.optimizers.experimental.Adam",
    "keras.dtensor.experimental.optimizers.Adam",
    v1=[],
)
class Adam(optimizer.Optimizer):
    r"""Optimizer that implements the Adam algorithm.

    Adam optimization is a stochastic gradient descent method that is based on
    adaptive estimation of first-order and second-order moments.

    According to
    [Kingma et al., 2014](http://arxiv.org/abs/1412.6980),
    the method is "*computationally
    efficient, has little memory requirement, invariant to diagonal rescaling of
    gradients, and is well suited for problems that are large in terms of
    data/parameters*".

    Args:
        learning_rate: A `tf.Tensor`, floating point value, a schedule that is a
            `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
            that takes no arguments and returns the actual value to use. The
            learning rate. Defaults to `0.001`.
        beta_1: A float value or a constant float tensor, or a callable
            that takes no arguments and returns the actual value to use. The
            exponential decay rate for the 1st moment estimates. Defaults to
            `0.9`.
        beta_2: A float value or a constant float tensor, or a callable
            that takes no arguments and returns the actual value to use. The
            exponential decay rate for the 2nd moment estimates. Defaults to
            `0.999`.
        epsilon: A small constant for numerical stability. This epsilon is
            "epsilon hat" in the Kingma and Ba paper (in the formula just before
            Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults
            to `1e-7`.
        amsgrad: Boolean. Whether to apply the AMSGrad variant of this algorithm
            from the paper "On the Convergence of Adam and Beyond". Defaults to
            `False`.
        {{base_optimizer_keyword_args}}

    Reference:
        - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
        - [Reddi et al., 2018](
            https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`.

    Notes:

    The default value of 1e-7 for epsilon might not be a good default in
    general. For example, when training an Inception network on ImageNet, a
    current good choice is 1.0 or 0.1. Note that since Adam uses the
    formulation just before Section 2.1 of the Kingma and Ba paper rather than
    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
    hat" in the paper.

    The sparse implementation of this algorithm (used when the gradient is an
    IndexedSlices object, typically because of `tf.gather` or an embedding
    lookup in the forward pass) does apply momentum to variable slices even if
    they were not used in the forward pass (meaning they have a gradient equal
    to zero). Momentum decay (beta1) is also applied to the entire momentum
    accumulator. This means that the sparse behavior is equivalent to the dense
    behavior (in contrast to some momentum implementations which ignore momentum
    unless a variable slice was actually used).
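
    Usage (a minimal illustrative sketch; the variable and loss below are
    arbitrary placeholders):

    >>> opt = tf.keras.optimizers.Adam(learning_rate=0.1)
    >>> var = tf.Variable(10.0)
    >>> with tf.GradientTape() as tape:
    ...     loss = var ** 2 / 2.0
    >>> grads = tape.gradient(loss, [var])
    >>> _ = opt.apply_gradients(zip(grads, [var]))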
88 """
90 def __init__(
91 self,
92 learning_rate=0.001,
93 beta_1=0.9,
94 beta_2=0.999,
95 epsilon=1e-7,
96 amsgrad=False,
97 weight_decay=None,
98 clipnorm=None,
99 clipvalue=None,
100 global_clipnorm=None,
101 use_ema=False,
102 ema_momentum=0.99,
103 ema_overwrite_frequency=None,
104 jit_compile=True,
105 name="Adam",
106 **kwargs
107 ):
108 super().__init__(
109 name=name,
110 weight_decay=weight_decay,
111 clipnorm=clipnorm,
112 clipvalue=clipvalue,
113 global_clipnorm=global_clipnorm,
114 use_ema=use_ema,
115 ema_momentum=ema_momentum,
116 ema_overwrite_frequency=ema_overwrite_frequency,
117 jit_compile=jit_compile,
118 **kwargs
119 )
120 self._learning_rate = self._build_learning_rate(learning_rate)
121 self.beta_1 = beta_1
122 self.beta_2 = beta_2
123 self.epsilon = epsilon
124 self.amsgrad = amsgrad

    def build(self, var_list):
        """Initialize optimizer variables.

        Adam optimizer has 3 types of variables: momentums, velocities, and
        velocity_hat (only set when amsgrad is applied).

        Args:
            var_list: list of model variables to build Adam variables on.
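
        Example (a minimal illustrative sketch; the two variables below are
        placeholders):

        >>> opt = Adam(amsgrad=True)
        >>> opt.build([tf.Variable(1.0), tf.Variable(2.0)])
        >>> # One "m" and one "v" slot is created per variable, plus one
        >>> # "vhat" slot per variable because `amsgrad=True`.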
134 """
135 super().build(var_list)
136 if hasattr(self, "_built") and self._built:
137 return
138 self._built = True
139 self._momentums = []
140 self._velocities = []
141 for var in var_list:
142 self._momentums.append(
143 self.add_variable_from_reference(
144 model_variable=var, variable_name="m"
145 )
146 )
147 self._velocities.append(
148 self.add_variable_from_reference(
149 model_variable=var, variable_name="v"
150 )
151 )
152 if self.amsgrad:
153 self._velocity_hats = []
154 for var in var_list:
155 self._velocity_hats.append(
156 self.add_variable_from_reference(
157 model_variable=var, variable_name="vhat"
158 )
159 )

    def update_step(self, gradient, variable):
        """Update step given gradient and the associated model variable."""
        beta_1_power = None
        beta_2_power = None
        lr = tf.cast(self.learning_rate, variable.dtype)
        local_step = tf.cast(self.iterations + 1, variable.dtype)
        beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step)
        beta_2_power = tf.pow(tf.cast(self.beta_2, variable.dtype), local_step)

        var_key = self._var_key(variable)
        m = self._momentums[self._index_dict[var_key]]
        v = self._velocities[self._index_dict[var_key]]

        # Bias-corrected step size: lr * sqrt(1 - beta_2^t) / (1 - beta_1^t).
        alpha = lr * tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)

        if isinstance(gradient, tf.IndexedSlices):
            # Sparse gradients: decay the full accumulators, then scatter the
            # new gradient contributions into the touched slices.
            m.assign_add(-m * (1 - self.beta_1))
            m.scatter_add(
                tf.IndexedSlices(
                    gradient.values * (1 - self.beta_1), gradient.indices
                )
            )
            v.assign_add(-v * (1 - self.beta_2))
            v.scatter_add(
                tf.IndexedSlices(
                    tf.square(gradient.values) * (1 - self.beta_2),
                    gradient.indices,
                )
            )
            if self.amsgrad:
                # AMSGrad uses the running maximum of the second moment.
                v_hat = self._velocity_hats[self._index_dict[var_key]]
                v_hat.assign(tf.maximum(v_hat, v))
                v = v_hat
            variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon))
        else:
            # Dense gradients: exponential moving averages of the gradient (m)
            # and its square (v).
            m.assign_add((gradient - m) * (1 - self.beta_1))
            v.assign_add((tf.square(gradient) - v) * (1 - self.beta_2))
            if self.amsgrad:
                # AMSGrad uses the running maximum of the second moment.
                v_hat = self._velocity_hats[self._index_dict[var_key]]
                v_hat.assign(tf.maximum(v_hat, v))
                v = v_hat
            variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon))

    def get_config(self):
        config = super().get_config()

        config.update(
            {
                "learning_rate": self._serialize_hyperparameter(
                    self._learning_rate
                ),
                "beta_1": self.beta_1,
                "beta_2": self.beta_2,
                "epsilon": self.epsilon,
                "amsgrad": self.amsgrad,
            }
        )
        return config


Adam.__doc__ = Adam.__doc__.replace(
    "{{base_optimizer_keyword_args}}", optimizer.base_optimizer_keyword_args
)
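

if __name__ == "__main__":
    # A minimal standalone sketch (illustrative, not part of the public API):
    # apply a single dense update and observe that the first Adam step is
    # approximately `learning_rate * sign(gradient)`, which follows from the
    # bias-corrected `alpha` in `update_step` when `epsilon` is negligible.
    var = tf.Variable(10.0)
    opt = Adam(learning_rate=0.1)
    with tf.GradientTape() as tape:
        loss = var ** 2 / 2.0  # d(loss)/d(var) == var == 10.0
    grads = tape.gradient(loss, [var])
    opt.apply_gradients(zip(grads, [var]))
    print(var.numpy())  # ~9.9, i.e. 10.0 - 0.1 * sign(10.0).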