Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/keras/src/optimizers/adam.py: 21% (62 statements)


# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Adam optimizer implementation."""

import tensorflow.compat.v2 as tf

from keras.src.optimizers import optimizer
from keras.src.saving.object_registration import register_keras_serializable

# isort: off
from tensorflow.python.util.tf_export import keras_export

@register_keras_serializable()
@keras_export(
    "keras.optimizers.Adam",
    "keras.optimizers.experimental.Adam",
    "keras.dtensor.experimental.optimizers.Adam",
    v1=[],
)
class Adam(optimizer.Optimizer):
    r"""Optimizer that implements the Adam algorithm.

    Adam optimization is a stochastic gradient descent method that is based on
    adaptive estimation of first-order and second-order moments.

    According to
    [Kingma et al., 2014](http://arxiv.org/abs/1412.6980),
    the method is "*computationally
    efficient, has little memory requirement, invariant to diagonal rescaling of
    gradients, and is well suited for problems that are large in terms of
    data/parameters*".

    Args:
        learning_rate: A `tf.Tensor`, floating point value, a schedule that is a
            `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
            that takes no arguments and returns the actual value to use. The
            learning rate. Defaults to `0.001`.
        beta_1: A float value or a constant float tensor, or a callable
            that takes no arguments and returns the actual value to use. The
            exponential decay rate for the 1st moment estimates. Defaults to
            `0.9`.
        beta_2: A float value or a constant float tensor, or a callable
            that takes no arguments and returns the actual value to use. The
            exponential decay rate for the 2nd moment estimates. Defaults to
            `0.999`.
        epsilon: A small constant for numerical stability. This epsilon is
            "epsilon hat" in the Kingma and Ba paper (in the formula just
            before Section 2.1), not the epsilon in Algorithm 1 of the paper.
            Defaults to `1e-7`.
        amsgrad: Boolean. Whether to apply the AMSGrad variant of this
            algorithm from the paper "On the Convergence of Adam and Beyond".
            Defaults to `False`.
        {{base_optimizer_keyword_args}}

    Reference:
        - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
        - [Reddi et al., 2018](
            https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`.

    Notes:

    The default value of 1e-7 for epsilon might not be a good default in
    general. For example, when training an Inception network on ImageNet a
    current good choice is 1.0 or 0.1. Note that since Adam uses the
    formulation just before Section 2.1 of the Kingma and Ba paper rather than
    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
    hat" in the paper.

    The sparse implementation of this algorithm (used when the gradient is an
    IndexedSlices object, typically because of `tf.gather` or an embedding
    lookup in the forward pass) does apply momentum to variable slices even if
    they were not used in the forward pass (meaning they have a gradient equal
    to zero). Momentum decay (beta_1) is also applied to the entire momentum
    accumulator. This means that the sparse behavior is equivalent to the
    dense behavior (in contrast to some momentum implementations which ignore
    momentum unless a variable slice was actually used).
    """

    def __init__(
        self,
        learning_rate=0.001,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-7,
        amsgrad=False,
        weight_decay=None,
        clipnorm=None,
        clipvalue=None,
        global_clipnorm=None,
        use_ema=False,
        ema_momentum=0.99,
        ema_overwrite_frequency=None,
        jit_compile=True,
        name="Adam",
        **kwargs
    ):
        super().__init__(
            name=name,
            weight_decay=weight_decay,
            clipnorm=clipnorm,
            clipvalue=clipvalue,
            global_clipnorm=global_clipnorm,
            use_ema=use_ema,
            ema_momentum=ema_momentum,
            ema_overwrite_frequency=ema_overwrite_frequency,
            jit_compile=jit_compile,
            **kwargs
        )
        self._learning_rate = self._build_learning_rate(learning_rate)
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.epsilon = epsilon
        self.amsgrad = amsgrad

    def build(self, var_list):
        """Initialize optimizer variables.

        Adam optimizer has 3 types of variables: momentums, velocities and
        velocity_hat (only set when amsgrad is applied).

        Args:
            var_list: list of model variables to build Adam variables on.
        """
        super().build(var_list)
        if hasattr(self, "_built") and self._built:
            return
        self._built = True
        self._momentums = []
        self._velocities = []
        for var in var_list:
            self._momentums.append(
                self.add_variable_from_reference(
                    model_variable=var, variable_name="m"
                )
            )
            self._velocities.append(
                self.add_variable_from_reference(
                    model_variable=var, variable_name="v"
                )
            )
        if self.amsgrad:
            # AMSGrad tracks the running maximum of the second-moment
            # estimate, so it needs one extra slot ("vhat") per variable.
            self._velocity_hats = []
            for var in var_list:
                self._velocity_hats.append(
                    self.add_variable_from_reference(
                        model_variable=var, variable_name="vhat"
                    )
                )

    def update_step(self, gradient, variable):
        """Update step given gradient and the associated model variable."""
        beta_1_power = None
        beta_2_power = None
        lr = tf.cast(self.learning_rate, variable.dtype)
        local_step = tf.cast(self.iterations + 1, variable.dtype)
        beta_1_power = tf.pow(tf.cast(self.beta_1, variable.dtype), local_step)
        beta_2_power = tf.pow(tf.cast(self.beta_2, variable.dtype), local_step)

        var_key = self._var_key(variable)
        m = self._momentums[self._index_dict[var_key]]
        v = self._velocities[self._index_dict[var_key]]

        alpha = lr * tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)
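        # `alpha` folds the 1/(1 - beta**t) bias corrections of both moment
        # estimates into the step size (the formulation just before Section
        # 2.1 of Kingma & Ba, 2014). Both branches below apply the same
        # moving-average updates,
        #   m <- beta_1 * m + (1 - beta_1) * g
        #   v <- beta_2 * v + (1 - beta_2) * g**2,
        # with the sparse branch written as a uniform decay of the full
        # accumulator plus a scatter_add of the new-gradient term on the
        # touched indices only.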

        if isinstance(gradient, tf.IndexedSlices):
            # Sparse gradients.
            m.assign_add(-m * (1 - self.beta_1))
            m.scatter_add(
                tf.IndexedSlices(
                    gradient.values * (1 - self.beta_1), gradient.indices
                )
            )
            v.assign_add(-v * (1 - self.beta_2))
            v.scatter_add(
                tf.IndexedSlices(
                    tf.square(gradient.values) * (1 - self.beta_2),
                    gradient.indices,
                )
            )
            if self.amsgrad:
                v_hat = self._velocity_hats[self._index_dict[var_key]]
                v_hat.assign(tf.maximum(v_hat, v))
                v = v_hat
            variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon))
        else:
            # Dense gradients.
            m.assign_add((gradient - m) * (1 - self.beta_1))
            v.assign_add((tf.square(gradient) - v) * (1 - self.beta_2))
            if self.amsgrad:
                v_hat = self._velocity_hats[self._index_dict[var_key]]
                v_hat.assign(tf.maximum(v_hat, v))
                v = v_hat
            variable.assign_sub((m * alpha) / (tf.sqrt(v) + self.epsilon))

    def get_config(self):
        config = super().get_config()
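        # The learning rate is serialized via _serialize_hyperparameter so
        # that LearningRateSchedule instances (not just plain float values)
        # survive a get_config / from_config round trip.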

        config.update(
            {
                "learning_rate": self._serialize_hyperparameter(
                    self._learning_rate
                ),
                "beta_1": self.beta_1,
                "beta_2": self.beta_2,
                "epsilon": self.epsilon,
                "amsgrad": self.amsgrad,
            }
        )
        return config


Adam.__doc__ = Adam.__doc__.replace(
    "{{base_optimizer_keyword_args}}", optimizer.base_optimizer_keyword_args
)
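For context, a minimal usage sketch of the exported class. The toy model and data below are illustrative placeholders and are not part of this module; `tf.keras.optimizers.Adam` is the public path declared by the `keras_export` decorator above.

    import tensorflow as tf

    # Toy model and data, for illustration only.
    model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3, amsgrad=True),
        loss="mse",
    )
    model.fit(tf.zeros((32, 4)), tf.zeros((32, 1)), epochs=1, verbose=0)

    # The optimizer round-trips through its config (see get_config above).
    config = model.optimizer.get_config()
    restored = tf.keras.optimizers.Adam.from_config(config)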