Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/keras/src/optimizers/adafactor.py: 23%

64 statements  

coverage.py v7.4.0, created at 2024-01-03 07:57 +0000

# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Adafactor optimizer implementation."""

import tensorflow.compat.v2 as tf

from keras.src.optimizers import optimizer
from keras.src.optimizers.schedules import learning_rate_schedule
from keras.src.saving.object_registration import register_keras_serializable

# isort: off
from tensorflow.python.util.tf_export import keras_export


@register_keras_serializable()
@keras_export(
    "keras.optimizers.Adafactor",
    "keras.optimizers.experimental.Adafactor",
    v1=[],
)
class Adafactor(optimizer.Optimizer):
    """Optimizer that implements the Adafactor algorithm.

    Adafactor is commonly used in NLP tasks, and has the advantage of taking
    less memory because it only stores partial information about previous
    gradients.

    The default argument setup is based on the original paper (see reference).
    When gradients are of dimension > 2, the Adafactor optimizer removes the
    last 2 dimensions separately in its factored accumulator variables.

    Args:
        learning_rate: Initial value for the learning rate:
            either a floating point value,
            or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance.
            Defaults to 0.001.
        beta_2_decay: float, defaults to -0.8. The decay rate of `beta_2`.
        epsilon_1: float, defaults to 1e-30. A small offset to keep the
            denominator away from 0.
        epsilon_2: float, defaults to 1e-3. A small offset to avoid the
            learning rate becoming too small over time.
        clip_threshold: float, defaults to 1.0. Clipping threshold. This is a
            part of the Adafactor algorithm, independent from `clipnorm`,
            `clipvalue` and `global_clipnorm`.
        relative_step: bool, defaults to True. If `learning_rate` is a
            constant and `relative_step=True`, the learning rate will be
            adjusted based on the current iteration count. This is the default
            learning rate decay in Adafactor.
        {{base_optimizer_keyword_args}}

    Reference:
        - [Shazeer, Noam et al., 2018](https://arxiv.org/abs/1804.04235).
    """

    def __init__(
        self,
        learning_rate=0.001,
        beta_2_decay=-0.8,
        epsilon_1=1e-30,
        epsilon_2=1e-3,
        clip_threshold=1.0,
        relative_step=True,
        weight_decay=None,
        clipnorm=None,
        clipvalue=None,
        global_clipnorm=None,
        use_ema=False,
        ema_momentum=0.99,
        ema_overwrite_frequency=None,
        jit_compile=True,
        name="Adafactor",
        **kwargs,
    ):
        super().__init__(
            name=name,
            weight_decay=weight_decay,
            clipnorm=clipnorm,
            clipvalue=clipvalue,
            global_clipnorm=global_clipnorm,
            use_ema=use_ema,
            ema_momentum=ema_momentum,
            ema_overwrite_frequency=ema_overwrite_frequency,
            jit_compile=jit_compile,
            **kwargs,
        )
        self._learning_rate = self._build_learning_rate(learning_rate)
        self.beta_2_decay = beta_2_decay
        self.epsilon_1 = epsilon_1
        self.epsilon_2 = epsilon_2
        self.clip_threshold = clip_threshold
        self.relative_step = relative_step

    def build(self, var_list):
        """Initialize optimizer variables.

        Adafactor optimizer has 3 types of variables: row accumulators `r`,
        column accumulators `c`, and velocities `v`.

        Args:
            var_list: list of model variables to build Adafactor variables on.
        """
        super().build(var_list)
        if hasattr(self, "_built") and self._built:
            return
        self._built = True
        self._r = []
        self._c = []
        self._v = []
        for var in var_list:
            if len(var.shape) < 2:
                # Don't factor if variable is of dimension < 2, but we still
                # need to create dummy variables as placeholders.
                self._r.append(tf.Variable(0, name=f"r/{var._shared_name}"))
                self._c.append(tf.Variable(0, name=f"c/{var._shared_name}"))
            else:
                # Always factor the last 2 dimensions.
                r_shape = var.shape[:-1]
                c_shape = var.shape[:-2] + var.shape[-1]
                self._r.append(
                    self.add_variable(
                        shape=r_shape,
                        dtype=var.dtype,
                        name=f"r/{var._shared_name}",
                    )
                )
                self._c.append(
                    self.add_variable(
                        shape=c_shape,
                        dtype=var.dtype,
                        name=f"c/{var._shared_name}",
                    )
                )
            self._v.append(
                self.add_variable_from_reference(
                    model_variable=var, variable_name="v"
                )
            )
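        # Illustrative shapes (not from the original source): for a dense
        # kernel of shape (256, 128), the factored accumulators built above
        # are `r` with shape (256,) and `c` with shape (128,), while `v`
        # matches the full (256, 128) variable. For a rank-3 weight of shape
        # (4, 256, 128), `r` has shape (4, 256) and `c` has shape (4, 128):
        # only the last two dimensions are factored.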

    def _rms(self, x):
        return tf.sqrt(tf.reduce_mean(tf.square(x)))

    def update_step(self, gradient, variable):
        """Update step given gradient and the associated model variable."""

        lr = tf.cast(self.learning_rate, variable.dtype)
        epsilon_2 = tf.cast(self.epsilon_2, variable.dtype)
        one = tf.cast(1.0, variable.dtype)
        local_step = tf.cast(self.iterations + 1, variable.dtype)
        if (
            not isinstance(
                self._learning_rate, learning_rate_schedule.LearningRateSchedule
            )
            and self.relative_step
        ):
            # If `relative_step=True` and learning rate is a constant, we
            # apply the relative step algorithm.
            lr = tf.minimum(lr, tf.math.rsqrt(local_step))

        var_key = self._var_key(variable)
        r = self._r[self._index_dict[var_key]]
        c = self._c[self._index_dict[var_key]]
        v = self._v[self._index_dict[var_key]]

        rho_t = tf.minimum(lr, tf.math.rsqrt(local_step))
        alpha_t = tf.maximum(epsilon_2, self._rms(variable)) * rho_t
        regulated_grad_square = tf.square(gradient) + self.epsilon_1
        beta_2_t = 1 - tf.pow(local_step, self.beta_2_decay)

        if len(variable.shape) >= 2:
            # `r` deletes the last dimension of gradient, so it is of shape
            # `gradient.shape[:-1]`.
            r.assign(
                beta_2_t * r
                + (1 - beta_2_t)
                * tf.reduce_mean(regulated_grad_square, axis=-1)
            )
            # `c` deletes the second last dimension of gradient, so it is of
            # shape `gradient.shape[:-2] + gradient.shape[-1]`.
            c.assign(
                beta_2_t * c
                + (1 - beta_2_t)
                * tf.reduce_mean(regulated_grad_square, axis=-2)
            )
            v.assign(
                tf.expand_dims(
                    r / tf.reduce_mean(r, axis=-1, keepdims=True), axis=-1
                )
                * tf.expand_dims(c, -2)
            )
        else:
            v.assign(beta_2_t * v + (1 - beta_2_t) * regulated_grad_square)

        # `convert_to_tensor` unifies the handling of sparse and dense grads.
        u_t = tf.convert_to_tensor(gradient) * tf.math.rsqrt(v)
        u_t_hat = u_t / tf.maximum(one, (self._rms(u_t) / self.clip_threshold))
        variable.assign_add(-alpha_t * u_t_hat)
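        # For reference, the quantities above follow Shazeer & Stern, 2018
        # (https://arxiv.org/abs/1804.04235): `rho_t` is the relative step
        # size min(lr, 1/sqrt(t)), `alpha_t = max(epsilon_2, RMS(x)) * rho_t`
        # scales the update, `beta_2_t = 1 - t**beta_2_decay` is the decay
        # schedule, and `u_t_hat` is `u_t` rescaled so that its RMS does not
        # exceed `clip_threshold`.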

    def get_config(self):
        config = super().get_config()

        config.update(
            {
                "learning_rate": self._serialize_hyperparameter(
                    self._learning_rate
                ),
                "beta_2_decay": self.beta_2_decay,
                "epsilon_1": self.epsilon_1,
                "epsilon_2": self.epsilon_2,
                "clip_threshold": self.clip_threshold,
                "relative_step": self.relative_step,
            }
        )
        return config


Adafactor.__doc__ = Adafactor.__doc__.replace(
    "{{base_optimizer_keyword_args}}", optimizer.base_optimizer_keyword_args
)
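

# Minimal smoke-test sketch, not part of the Keras source: assuming
# TensorFlow with this Keras package installed, it runs a single Adafactor
# update on a toy variable using only the class above and standard TF APIs.
# `jit_compile` is disabled here only to keep the sketch independent of XLA
# availability.
if __name__ == "__main__":
    opt = Adafactor(learning_rate=0.001, jit_compile=False)
    w = tf.Variable(tf.ones((4, 3)))
    with tf.GradientTape() as tape:
        loss = tf.reduce_sum(tf.square(w))
    grads = tape.gradient(loss, [w])
    opt.apply_gradients(zip(grads, [w]))
    # After one step, `w` has moved away from all-ones by at most
    # `clip_threshold * alpha_t` in RMS terms.
    print("updated variable:\n", w.numpy())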