Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/keras/src/optimizers/adafactor.py: 23%

64 statements  

coverage.py v7.4.0, created at 2024-01-03 07:57 +0000

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Adafactor optimizer implementation."""

import tensorflow.compat.v2 as tf

from keras.src.optimizers import optimizer
from keras.src.optimizers.schedules import learning_rate_schedule
from keras.src.saving.object_registration import register_keras_serializable

# isort: off
from tensorflow.python.util.tf_export import keras_export


@register_keras_serializable()
@keras_export(
    "keras.optimizers.Adafactor",
    "keras.optimizers.experimental.Adafactor",
    v1=[],
)
class Adafactor(optimizer.Optimizer):
    """Optimizer that implements the Adafactor algorithm.

    Adafactor is commonly used in NLP tasks, and has the advantage of taking
    less memory because it only stores partial information about previous
    gradients.

    The default argument setup is based on the original paper (see reference).
    When gradients are of dimension > 2, the Adafactor optimizer removes the
    last 2 dimensions separately in its factored accumulator variables.

    Args:
        learning_rate: Initial value for the learning rate:
            either a floating point value,
            or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance.
            Defaults to 0.001.
        beta_2_decay: float, defaults to -0.8. The decay rate of `beta_2`.
        epsilon_1: float, defaults to 1e-30. A small offset to keep the
            denominator away from 0.
        epsilon_2: float, defaults to 1e-3. A small offset to avoid the
            learning rate becoming too small over time.
        clip_threshold: float, defaults to 1.0. Clipping threshold. This is a
            part of the Adafactor algorithm, independent from `clipnorm`,
            `clipvalue` and `global_clipnorm`.
        relative_step: bool, defaults to True. If `learning_rate` is a
            constant and `relative_step=True`, the learning rate will be
            adjusted based on the current iteration count. This is the default
            learning rate decay in Adafactor.
        {{base_optimizer_keyword_args}}

    Reference:
        - [Shazeer, Noam et al., 2018](https://arxiv.org/abs/1804.04235).
    """

    def __init__(
        self,
        learning_rate=0.001,
        beta_2_decay=-0.8,
        epsilon_1=1e-30,
        epsilon_2=1e-3,
        clip_threshold=1.0,
        relative_step=True,
        weight_decay=None,
        clipnorm=None,
        clipvalue=None,
        global_clipnorm=None,
        use_ema=False,
        ema_momentum=0.99,
        ema_overwrite_frequency=None,
        jit_compile=True,
        name="Adafactor",
        **kwargs,
    ):
        super().__init__(
            name=name,
            weight_decay=weight_decay,
            clipnorm=clipnorm,
            clipvalue=clipvalue,
            global_clipnorm=global_clipnorm,
            use_ema=use_ema,
            ema_momentum=ema_momentum,
            ema_overwrite_frequency=ema_overwrite_frequency,
            jit_compile=jit_compile,
            **kwargs,
        )
        self._learning_rate = self._build_learning_rate(learning_rate)
        self.beta_2_decay = beta_2_decay
        self.epsilon_1 = epsilon_1
        self.epsilon_2 = epsilon_2
        self.clip_threshold = clip_threshold
        self.relative_step = relative_step

    def build(self, var_list):
        """Initialize optimizer variables.

        Adafactor optimizer has 3 types of variables: row accumulators `r`,
        column accumulators `c`, and velocities `v`.

        Args:
            var_list: list of model variables to build Adafactor variables on.
        """
        super().build(var_list)
        if hasattr(self, "_built") and self._built:
            return
        self._built = True
        self._r = []
        self._c = []
        self._v = []
        for var in var_list:
            if len(var.shape) < 2:
                # Don't factor if variable is of dimension < 2, but we still
                # need to create dummy variables as placeholders.
                self._r.append(tf.Variable(0, name=f"r/{var._shared_name}"))
                self._c.append(tf.Variable(0, name=f"c/{var._shared_name}"))
            else:
                # Always factor the last 2 dimensions.
                r_shape = var.shape[:-1]
                c_shape = var.shape[:-2] + var.shape[-1]
                self._r.append(
                    self.add_variable(
                        shape=r_shape,
                        dtype=var.dtype,
                        name=f"r/{var._shared_name}",
                    )
                )
                self._c.append(
                    self.add_variable(
                        shape=c_shape,
                        dtype=var.dtype,
                        name=f"c/{var._shared_name}",
                    )
                )
            self._v.append(
                self.add_variable_from_reference(
                    model_variable=var, variable_name="v"
                )
            )
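        # Illustrative shapes (not from the original source): for a dense
        # kernel of shape (256, 128), the factored accumulators built above
        # are `r` with shape (256,) and `c` with shape (128,), while `v`
        # matches the full (256, 128) variable. For a rank-3 weight of shape
        # (4, 256, 128), `r` has shape (4, 256) and `c` has shape (4, 128):
        # only the last two dimensions are factored.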

    def _rms(self, x):
        return tf.sqrt(tf.reduce_mean(tf.square(x)))

    def update_step(self, gradient, variable):
        """Update step given gradient and the associated model variable."""

        lr = tf.cast(self.learning_rate, variable.dtype)
        epsilon_2 = tf.cast(self.epsilon_2, variable.dtype)
        one = tf.cast(1.0, variable.dtype)
        local_step = tf.cast(self.iterations + 1, variable.dtype)
        if (
            not isinstance(
                self._learning_rate, learning_rate_schedule.LearningRateSchedule
            )
            and self.relative_step
        ):
            # If `relative_step=True` and learning rate is a constant, we
            # apply the relative step algorithm.
            lr = tf.minimum(lr, tf.math.rsqrt(local_step))

        var_key = self._var_key(variable)
        r = self._r[self._index_dict[var_key]]
        c = self._c[self._index_dict[var_key]]
        v = self._v[self._index_dict[var_key]]

        rho_t = tf.minimum(lr, tf.math.rsqrt(local_step))
        alpha_t = tf.maximum(epsilon_2, self._rms(variable)) * rho_t
        regulated_grad_square = tf.square(gradient) + self.epsilon_1
        beta_2_t = 1 - tf.pow(local_step, self.beta_2_decay)

        if len(variable.shape) >= 2:
            # `r` deletes the last dimension of gradient, so it is of shape
            # `gradient.shape[:-1]`.
            r.assign(
                beta_2_t * r
                + (1 - beta_2_t)
                * tf.reduce_mean(regulated_grad_square, axis=-1)
            )
            # `c` deletes the second last dimension of gradient, so it is of
            # shape `gradient.shape[:-2] + gradient.shape[-1]`.
            c.assign(
                beta_2_t * c
                + (1 - beta_2_t)
                * tf.reduce_mean(regulated_grad_square, axis=-2)
            )
            v.assign(
                tf.expand_dims(
                    r / tf.reduce_mean(r, axis=-1, keepdims=True), axis=-1
                )
                * tf.expand_dims(c, -2)
            )
        else:
            v.assign(beta_2_t * v + (1 - beta_2_t) * regulated_grad_square)

        # `convert_to_tensor` unifies the handling of sparse and dense grads.
        u_t = tf.convert_to_tensor(gradient) * tf.math.rsqrt(v)
        u_t_hat = u_t / tf.maximum(one, (self._rms(u_t) / self.clip_threshold))
        variable.assign_add(-alpha_t * u_t_hat)
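        # For reference, the quantities above follow Shazeer & Stern, 2018
        # (https://arxiv.org/abs/1804.04235): `rho_t` is the relative step
        # size min(lr, 1/sqrt(t)), `alpha_t = max(epsilon_2, RMS(x)) * rho_t`
        # scales the update, `beta_2_t = 1 - t**beta_2_decay` is the decay
        # schedule, and `u_t_hat` is `u_t` rescaled so that its RMS does not
        # exceed `clip_threshold`.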

    def get_config(self):
        config = super().get_config()

        config.update(
            {
                "learning_rate": self._serialize_hyperparameter(
                    self._learning_rate
                ),
                "beta_2_decay": self.beta_2_decay,
                "epsilon_1": self.epsilon_1,
                "epsilon_2": self.epsilon_2,
                "clip_threshold": self.clip_threshold,
                "relative_step": self.relative_step,
            }
        )
        return config


Adafactor.__doc__ = Adafactor.__doc__.replace(
    "{{base_optimizer_keyword_args}}", optimizer.base_optimizer_keyword_args
)
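

# Minimal smoke-test sketch, not part of the Keras source: assuming
# TensorFlow with this Keras package installed, it runs a single Adafactor
# update on a toy variable using only the class above and standard TF APIs.
# `jit_compile` is disabled here only to keep the sketch independent of XLA
# availability.
if __name__ == "__main__":
    opt = Adafactor(learning_rate=0.001, jit_compile=False)
    w = tf.Variable(tf.ones((4, 3)))
    with tf.GradientTape() as tape:
        loss = tf.reduce_sum(tf.square(w))
    grads = tape.gradient(loss, [w])
    opt.apply_gradients(zip(grads, [w]))
    # After one step, `w` has moved away from all-ones by at most
    # `clip_threshold * alpha_t` in RMS terms.
    print("updated variable:\n", w.numpy())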