# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
15"""Adagrad optimizer implementation."""

import tensorflow.compat.v2 as tf

from keras.src.optimizers import optimizer
from keras.src.optimizers.schedules import learning_rate_schedule
from keras.src.saving.object_registration import register_keras_serializable

# isort: off
from tensorflow.python.util.tf_export import keras_export


@register_keras_serializable()
@keras_export(
    "keras.optimizers.Adafactor",
    "keras.optimizers.experimental.Adafactor",
    v1=[],
)
class Adafactor(optimizer.Optimizer):
    """Optimizer that implements the Adafactor algorithm.

    Adafactor is commonly used in NLP tasks, and has the advantage of using
    less memory because it only saves partial information about previous
    gradients.

    The default argument setup is based on the original paper (see reference).
    When gradients have 2 or more dimensions, the Adafactor optimizer factors
    its accumulator variables over the last 2 dimensions, keeping separate row
    and column statistics instead of a full second-moment estimate.

    Args:
        learning_rate: Initial value for the learning rate:
            either a floating point value,
            or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance.
            Defaults to 0.001.
        beta_2_decay: float, defaults to -0.8. The decay rate of `beta_2`.
        epsilon_1: float, defaults to 1e-30. A small offset to keep the
            denominator away from 0.
        epsilon_2: float, defaults to 1e-3. A small offset to avoid the
            learning rate becoming too small over time.
        clip_threshold: float, defaults to 1.0. Clipping threshold. This is
            a part of the Adafactor algorithm, independent from `clipnorm`,
            `clipvalue`, and `global_clipnorm`.
        relative_step: bool, defaults to True. If `learning_rate` is a
            constant and `relative_step=True`, the learning rate will be
            adjusted based on the current iteration. This is the default
            learning rate decay in Adafactor.
        {{base_optimizer_keyword_args}}
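
    A minimal usage sketch (the model and training data are illustrative
    placeholders, not part of the algorithm itself):

    ```python
    import numpy as np
    import tensorflow as tf

    opt = tf.keras.optimizers.Adafactor(learning_rate=1e-3)
    model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
    model.compile(optimizer=opt, loss="mse")
    model.fit(np.random.rand(32, 4), np.random.rand(32, 1), verbose=0)
    ```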

    Reference:
        - [Shazeer, Noam et al., 2018](https://arxiv.org/abs/1804.04235).

    """

    def __init__(
        self,
        learning_rate=0.001,
        beta_2_decay=-0.8,
        epsilon_1=1e-30,
        epsilon_2=1e-3,
        clip_threshold=1.0,
        relative_step=True,
        weight_decay=None,
        clipnorm=None,
        clipvalue=None,
        global_clipnorm=None,
        use_ema=False,
        ema_momentum=0.99,
        ema_overwrite_frequency=None,
        jit_compile=True,
        name="Adafactor",
        **kwargs,
    ):
        super().__init__(
            name=name,
            weight_decay=weight_decay,
            clipnorm=clipnorm,
            clipvalue=clipvalue,
            global_clipnorm=global_clipnorm,
            use_ema=use_ema,
            ema_momentum=ema_momentum,
            ema_overwrite_frequency=ema_overwrite_frequency,
            jit_compile=jit_compile,
            **kwargs,
        )
        self._learning_rate = self._build_learning_rate(learning_rate)
        self.beta_2_decay = beta_2_decay
        self.epsilon_1 = epsilon_1
        self.epsilon_2 = epsilon_2
        self.clip_threshold = clip_threshold
        self.relative_step = relative_step

    def build(self, var_list):
        """Initialize optimizer variables.

        Adafactor keeps 3 accumulators per model variable: the factored row
        statistic `r`, the factored column statistic `c`, and the
        second-moment estimate `v`.

        Args:
            var_list: list of model variables to build Adafactor variables on.
        """
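        # Shape sketch (illustrative example with a hypothetical Dense kernel
        # of shape (128, 64)): `r` is created with shape (128,) (last axis
        # averaged away), `c` with shape (64,) (second-to-last axis averaged
        # away), and `v` with the full shape (128, 64).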
        super().build(var_list)
        if hasattr(self, "_built") and self._built:
            return
        self._built = True
        self._r = []
        self._c = []
        self._v = []
        for var in var_list:
            if len(var.shape) < 2:
                # Don't factor if variable is of dimension < 2, but we still
                # need to create dummy variables as placeholder.
                self._r.append(tf.Variable(0, name=f"r/{var._shared_name}"))
                self._c.append(tf.Variable(0, name=f"c/{var._shared_name}"))
            else:
                # Always factor the last 2 dimensions.
                r_shape = var.shape[:-1]
                c_shape = var.shape[:-2] + var.shape[-1]
                self._r.append(
                    self.add_variable(
                        shape=r_shape,
                        dtype=var.dtype,
                        name=f"r/{var._shared_name}",
                    )
                )
                self._c.append(
                    self.add_variable(
                        shape=c_shape,
                        dtype=var.dtype,
                        name=f"c/{var._shared_name}",
                    )
                )
            self._v.append(
                self.add_variable_from_reference(
                    model_variable=var, variable_name="v"
                )
            )

    def _rms(self, x):
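        # Root-mean-square over all elements of `x`; used in `update_step`
        # for the parameter-scale term `alpha_t` and for update clipping.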
        return tf.sqrt(tf.reduce_mean(tf.square(x)))

    def update_step(self, gradient, variable):
        """Update step given gradient and the associated model variable."""

        lr = tf.cast(self.learning_rate, variable.dtype)
        epsilon_2 = tf.cast(self.epsilon_2, variable.dtype)
        one = tf.cast(1.0, variable.dtype)
        local_step = tf.cast(self.iterations + 1, variable.dtype)
        if (
            not isinstance(
                self._learning_rate, learning_rate_schedule.LearningRateSchedule
            )
            and self.relative_step
        ):
            # If `relative_step=True` and learning rate is a constant, we
            # apply the relative step algorithm.
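            # That is, lr_t = min(learning_rate, 1 / sqrt(local_step)).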
            lr = tf.minimum(lr, tf.math.rsqrt(local_step))

        var_key = self._var_key(variable)
        r = self._r[self._index_dict[var_key]]
        c = self._c[self._index_dict[var_key]]
        v = self._v[self._index_dict[var_key]]

        rho_t = tf.minimum(lr, tf.math.rsqrt(local_step))
        alpha_t = tf.maximum(epsilon_2, self._rms(variable)) * rho_t
        regulated_grad_square = tf.square(gradient) + self.epsilon_1
        beta_2_t = 1 - tf.pow(local_step, self.beta_2_decay)
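        # With the default beta_2_decay of -0.8, beta_2_t = 1 - t ** (-0.8)
        # where t = local_step; alpha_t is the relative step size rho_t
        # scaled by the parameter scale max(epsilon_2, RMS(variable)),
        # following the referenced paper.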

        if len(variable.shape) >= 2:
            # `r` deletes the last dimension of gradient, so it is of shape
            # `gradient.shape[:-1]`.
            r.assign(
                beta_2_t * r
                + (1 - beta_2_t)
                * tf.reduce_mean(regulated_grad_square, axis=-1)
            )
            # `c` deletes the second last dimension of gradient, so it is of
            # shape `gradient.shape[:-2] + gradient.shape[-1]`.
            c.assign(
                beta_2_t * c
                + (1 - beta_2_t)
                * tf.reduce_mean(regulated_grad_square, axis=-2)
            )
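            # `v` is rebuilt from the factored statistics as an outer product
            # of the normalized row statistic (r / mean(r, axis=-1)) and the
            # column statistic c, broadcast over any leading dimensions.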
            v.assign(
                tf.expand_dims(
                    r / tf.reduce_mean(r, axis=-1, keepdims=True), axis=-1
                )
                * tf.expand_dims(c, -2)
            )
        else:
            v.assign(beta_2_t * v + (1 - beta_2_t) * regulated_grad_square)

        # `convert_to_tensor` unifies the handling of sparse and dense grads.
        u_t = tf.convert_to_tensor(gradient) * tf.math.rsqrt(v)
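        # Update clipping: dividing by max(1, RMS(u_t) / clip_threshold)
        # caps the RMS of the applied update at clip_threshold.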
        u_t_hat = u_t / tf.maximum(one, (self._rms(u_t) / self.clip_threshold))
        variable.assign_add(-alpha_t * u_t_hat)

    def get_config(self):
        config = super().get_config()

        config.update(
            {
                "learning_rate": self._serialize_hyperparameter(
                    self._learning_rate
                ),
                "beta_2_decay": self.beta_2_decay,
                "epsilon_1": self.epsilon_1,
                "epsilon_2": self.epsilon_2,
                "clip_threshold": self.clip_threshold,
                "relative_step": self.relative_step,
            }
        )
        return config


Adafactor.__doc__ = Adafactor.__doc__.replace(
    "{{base_optimizer_keyword_args}}", optimizer.base_optimizer_keyword_args
)