Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/keras/src/optimizers/schedules/learning_rate_schedule.py: 23%
264 statements
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-03 07:57 +0000
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-03 07:57 +0000
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
15"""Various learning rate schedule functions."""
17import abc
18import math
20import tensorflow.compat.v2 as tf
22from keras.src import backend
23from keras.src.saving import serialization_lib
24from keras.src.saving.legacy import serialization as legacy_serialization
26# isort: off
27from tensorflow.python.util.tf_export import keras_export
@keras_export("keras.optimizers.schedules.LearningRateSchedule")
class LearningRateSchedule:
    """Base class for learning rate schedules.

    A schedule controls how an optimizer's learning rate evolves during
    training. Any `LearningRateSchedule` instance may be supplied as the
    `learning_rate` argument of an optimizer.

    Ready-made schedules include
    `tf.keras.optimizers.schedules.ExponentialDecay` and
    `tf.keras.optimizers.schedules.PiecewiseConstantDecay`:

    ```python
    lr_schedule = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=1e-2,
        decay_steps=10000,
        decay_rate=0.9)
    optimizer = keras.optimizers.SGD(learning_rate=lr_schedule)
    ```

    To define a custom schedule, subclass this class and override
    `__call__`, which receives `step` (a scalar integer tensor holding the
    current training step count) and returns the learning rate to use.
    As with other Keras objects, the schedule can optionally be made
    serializable by also implementing `get_config` and `from_config`.

    Example:

    ```python
    class MyLRSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):

        def __init__(self, initial_learning_rate):
            self.initial_learning_rate = initial_learning_rate

        def __call__(self, step):
            return self.initial_learning_rate / (step + 1)

    optimizer = tf.keras.optimizers.SGD(learning_rate=MyLRSchedule(0.1))
    ```
    """

    # NOTE: this class does not use `abc.ABCMeta`, so `abc.abstractmethod`
    # alone does not block instantiation; a missing override surfaces through
    # the `NotImplementedError` raised by the methods below.
    @abc.abstractmethod
    def __call__(self, step):
        """Returns the learning rate for `step`. Subclasses must override."""
        schedule_name = self.__class__.__name__
        raise NotImplementedError(
            f"Learning rate schedule '{schedule_name}' "
            "must override `__call__(self, step)`."
        )

    @abc.abstractmethod
    def get_config(self):
        """Returns the config consumed by `from_config`. Must be overridden."""
        schedule_name = self.__class__.__name__
        raise NotImplementedError(
            f"Learning rate schedule '{schedule_name}' "
            "must override `get_config()` in order to be serializable."
        )

    @classmethod
    def from_config(cls, config):
        """Builds a schedule from a config dictionary.

        Args:
            config: Output of `get_config()`; it is expanded into keyword
                arguments for the class constructor.

        Returns:
            A `LearningRateSchedule` instance.
        """
        schedule = cls(**config)
        return schedule
@keras_export("keras.optimizers.schedules.ExponentialDecay")
class ExponentialDecay(LearningRateSchedule):
    """A LearningRateSchedule that decays the learning rate exponentially.

    Lowering the learning rate as training progresses is often useful. This
    schedule multiplies a provided initial learning rate by an exponentially
    shrinking factor that depends on the optimizer step.

    The schedule is a 1-arg callable: given the current optimizer step it
    produces the decayed learning rate, so the value can change across
    successive invocations of optimizer functions. It is computed as:

    ```python
    def decayed_learning_rate(step):
        return initial_learning_rate * decay_rate ** (step / decay_steps)
    ```

    When `staircase` is `True`, `step / decay_steps` is floored to a whole
    number, so the decayed learning rate follows a staircase function.

    The schedule can be passed directly to a `tf.keras.optimizers.Optimizer`
    as the learning rate.
    Example: When fitting a Keras model, decay every 100000 steps with a base
    of 0.96:

    ```python
    initial_learning_rate = 0.1
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate,
        decay_steps=100000,
        decay_rate=0.96,
        staircase=True)

    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=lr_schedule),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    model.fit(data, labels, epochs=5)
    ```

    The learning rate schedule is also serializable and deserializable using
    `tf.keras.optimizers.schedules.serialize` and
    `tf.keras.optimizers.schedules.deserialize`.

    Returns:
      A 1-arg callable learning rate schedule that takes the current optimizer
      step and outputs the decayed learning rate, a scalar `Tensor` of the same
      type as `initial_learning_rate`.
    """

    def __init__(
        self,
        initial_learning_rate,
        decay_steps,
        decay_rate,
        staircase=False,
        name=None,
    ):
        """Stores the hyperparameters of the exponential decay.

        Args:
          initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a
            Python number. The initial learning rate.
          decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
            Must be positive. See the decay computation above.
          decay_rate: A scalar `float32` or `float64` `Tensor` or a
            Python number. The decay rate.
          staircase: Boolean. If `True` decay the learning rate at discrete
            intervals.
          name: String. Optional name of the operation. Defaults to
            'ExponentialDecay'.
        """
        super().__init__()
        self.name = name
        self.staircase = staircase
        self.decay_rate = decay_rate
        self.decay_steps = decay_steps
        self.initial_learning_rate = initial_learning_rate

    def __call__(self, step):
        """Returns the exponentially decayed learning rate for `step`."""
        scope_name = self.name or "ExponentialDecay"
        with tf.name_scope(scope_name) as name:
            base_lr = tf.convert_to_tensor(
                self.initial_learning_rate, name="initial_learning_rate"
            )
            # All arithmetic runs in the dtype of the initial learning rate.
            dtype = base_lr.dtype
            steps_per_decay = tf.cast(self.decay_steps, dtype)
            rate = tf.cast(self.decay_rate, dtype)

            exponent = tf.cast(step, dtype) / steps_per_decay
            if self.staircase:
                # Only whole multiples of `decay_steps` change the rate.
                exponent = tf.floor(exponent)
            return tf.multiply(base_lr, tf.pow(rate, exponent), name=name)

    def get_config(self):
        """Returns the constructor arguments as a dictionary."""
        fields = (
            "initial_learning_rate",
            "decay_steps",
            "decay_rate",
            "staircase",
            "name",
        )
        return {field: getattr(self, field) for field in fields}
@keras_export("keras.optimizers.schedules.PiecewiseConstantDecay")
class PiecewiseConstantDecay(LearningRateSchedule):
    """A LearningRateSchedule that uses a piecewise constant decay schedule.

    The function returns a 1-arg callable to compute the piecewise constant
    when passed the current optimizer step. This can be useful for changing the
    learning rate value across different invocations of optimizer functions.

    Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5
      for the next 10000 steps, and 0.1 for any additional steps.

    ```python
    step = tf.Variable(0, trainable=False)
    boundaries = [100000, 110000]
    values = [1.0, 0.5, 0.1]
    learning_rate_fn = keras.optimizers.schedules.PiecewiseConstantDecay(
        boundaries, values)

    # Later, whenever we perform an optimization step, we pass in the step.
    learning_rate = learning_rate_fn(step)
    ```

    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
    as the learning rate. The learning rate schedule is also serializable and
    deserializable using `tf.keras.optimizers.schedules.serialize` and
    `tf.keras.optimizers.schedules.deserialize`.

    Returns:
      A 1-arg callable learning rate schedule that takes the current optimizer
      step and outputs the learning rate for that step: one of the entries of
      `values`, converted to a `Tensor`.

      The output of the 1-arg function that takes the `step`
      is `values[0]` when `step <= boundaries[0]`,
      `values[1]` when `step > boundaries[0]` and `step <= boundaries[1]`, ...,
      and `values[-1]` when `step > boundaries[-1]`. When `boundaries` is
      empty, the single entry of `values` is returned for every step.
    """

    def __init__(self, boundaries, values, name=None):
        """Piecewise constant from boundaries and interval values.

        Args:
          boundaries: A list of `Tensor`s or `int`s or `float`s with strictly
            increasing entries. Entries whose dtype differs from the dtype of
            the optimizer step are cast to it when the schedule is called.
          values: A list of `Tensor`s or `float`s or `int`s that specifies the
            values for the intervals defined by `boundaries`. It should have one
            more element than `boundaries`, and all elements should have the
            same type.
          name: A string. Optional name of the operation. Defaults to
            'PiecewiseConstant'.

        Raises:
          ValueError: if the number of elements in the lists do not match.
        """
        super().__init__()

        if len(boundaries) != len(values) - 1:
            raise ValueError(
                "The length of boundaries should be 1 less than the length of "
                f"values. Received: boundaries={boundaries} of length "
                f"{len(boundaries)}, and values={values} "
                f"of length {len(values)}."
            )

        self.boundaries = boundaries
        self.values = values
        self.name = name

    def __call__(self, step):
        """Returns the entry of `values` whose interval contains `step`."""
        with tf.name_scope(self.name or "PiecewiseConstant"):
            boundaries = [
                tf.convert_to_tensor(b)
                for b in tf.nest.flatten(self.boundaries)
            ]
            values = [
                tf.convert_to_tensor(v) for v in tf.nest.flatten(self.values)
            ]
            x_recomp = tf.convert_to_tensor(step)

            if not boundaries:
                # The constructor accepts zero boundaries with one value.
                # There is then a single interval covering every step, and
                # indexing `boundaries[0]` below would raise `IndexError`.
                return values[0]

            # Comparisons below need the boundaries in the step's dtype.
            step_dtype = x_recomp.dtype.base_dtype
            boundaries = [
                b if b.dtype.base_dtype == step_dtype else tf.cast(b, step_dtype)
                for b in boundaries
            ]

            # First and last intervals are open-ended on one side.
            pred_fn_pairs = [
                (x_recomp <= boundaries[0], lambda: values[0]),
                (x_recomp > boundaries[-1], lambda: values[-1]),
            ]
            inner_intervals = zip(
                boundaries[:-1], boundaries[1:], values[1:-1]
            )
            for low, high, v in inner_intervals:
                pred = (x_recomp > low) & (x_recomp <= high)
                # Bind `v` as a default argument so each lambda keeps the
                # value of its own iteration instead of the last one.
                pred_fn_pairs.append((pred, lambda v=v: v))

            # The default isn't needed here because our conditions are mutually
            # exclusive and exhaustive, but tf.case requires it.
            default = lambda: values[0]
            return tf.case(pred_fn_pairs, default, exclusive=True)

    def get_config(self):
        """Returns the constructor arguments as a dictionary."""
        return {
            "boundaries": self.boundaries,
            "values": self.values,
            "name": self.name,
        }
@keras_export("keras.optimizers.schedules.PolynomialDecay")
class PolynomialDecay(LearningRateSchedule):
    """A LearningRateSchedule that uses a polynomial decay schedule.

    It is commonly observed that a monotonically decreasing learning rate, whose
    degree of change is carefully chosen, results in a better performing model.
    This schedule applies a polynomial decay function to an optimizer step,
    given a provided `initial_learning_rate`, to reach an `end_learning_rate`
    in the given `decay_steps`.

    It requires a `step` value to compute the decayed learning rate. You
    can just pass a TensorFlow variable that you increment at each training
    step.

    The schedule is a 1-arg callable that produces a decayed learning rate
    when passed the current optimizer step. This can be useful for changing the
    learning rate value across different invocations of optimizer functions.
    It is computed as:

    ```python
    def decayed_learning_rate(step):
        step = min(step, decay_steps)
        return ((initial_learning_rate - end_learning_rate) *
                (1 - step / decay_steps) ** power
               ) + end_learning_rate
    ```

    If `cycle` is True then a multiple of `decay_steps` is used, the first one
    that is not smaller than `step` (at step 0 the multiplier is 1).

    ```python
    def decayed_learning_rate(step):
        decay_steps = decay_steps * ceil(step / decay_steps)
        return ((initial_learning_rate - end_learning_rate) *
                (1 - step / decay_steps) ** power
               ) + end_learning_rate
    ```

    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
    as the learning rate.
    Example: Fit a model while decaying from 0.1 to 0.01 in 10000 steps using
    sqrt (i.e. power=0.5):

    ```python
    ...
    starter_learning_rate = 0.1
    end_learning_rate = 0.01
    decay_steps = 10000
    learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
        starter_learning_rate,
        decay_steps,
        end_learning_rate,
        power=0.5)

    model.compile(optimizer=tf.keras.optimizers.SGD(
                      learning_rate=learning_rate_fn),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    model.fit(data, labels, epochs=5)
    ```

    The learning rate schedule is also serializable and deserializable using
    `tf.keras.optimizers.schedules.serialize` and
    `tf.keras.optimizers.schedules.deserialize`.

    Returns:
      A 1-arg callable learning rate schedule that takes the current optimizer
      step and outputs the decayed learning rate, a scalar `Tensor` of the same
      type as `initial_learning_rate`.
    """

    def __init__(
        self,
        initial_learning_rate,
        decay_steps,
        end_learning_rate=0.0001,
        power=1.0,
        cycle=False,
        name=None,
    ):
        """Applies a polynomial decay to the learning rate.

        Args:
          initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a
            Python number. The initial learning rate.
          decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
            Must be positive. See the decay computation above.
          end_learning_rate: A scalar `float32` or `float64` `Tensor` or a
            Python number. The minimal end learning rate.
          power: A scalar `float32` or `float64` `Tensor` or a
            Python number. The power of the polynomial. Defaults to `1.0`.
          cycle: A boolean, whether it should cycle beyond decay_steps.
          name: String. Optional name of the operation. Defaults to
            'PolynomialDecay'.
        """
        super().__init__()

        self.initial_learning_rate = initial_learning_rate
        self.decay_steps = decay_steps
        self.end_learning_rate = end_learning_rate
        self.power = power
        self.cycle = cycle
        self.name = name

    def __call__(self, step):
        """Returns the polynomially decayed learning rate for `step`."""
        with tf.name_scope(self.name or "PolynomialDecay") as name:
            initial_learning_rate = tf.convert_to_tensor(
                self.initial_learning_rate, name="initial_learning_rate"
            )
            # All arithmetic runs in the dtype of the initial learning rate.
            dtype = initial_learning_rate.dtype
            end_learning_rate = tf.cast(self.end_learning_rate, dtype)
            power = tf.cast(self.power, dtype)

            global_step_recomp = tf.cast(step, dtype)
            decay_steps_recomp = tf.cast(self.decay_steps, dtype)
            if self.cycle:
                # Find the first multiple of decay_steps that is not smaller
                # than global_step. If global_step is zero set the multiplier
                # to 1. The division uses the cast `decay_steps_recomp`, not
                # the raw attribute, so an integer-tensor `decay_steps` does
                # not clash with the floating-point step.
                multiplier = tf.where(
                    tf.equal(global_step_recomp, 0),
                    1.0,
                    tf.math.ceil(global_step_recomp / decay_steps_recomp),
                )
                decay_steps_recomp = tf.multiply(decay_steps_recomp, multiplier)
            else:
                # Make sure that the global_step used is not bigger than
                # decay_steps.
                global_step_recomp = tf.minimum(
                    global_step_recomp, decay_steps_recomp
                )

            p = tf.divide(global_step_recomp, decay_steps_recomp)
            return tf.add(
                tf.multiply(
                    initial_learning_rate - end_learning_rate,
                    tf.pow(1 - p, power),
                ),
                end_learning_rate,
                name=name,
            )

    def get_config(self):
        """Returns the constructor arguments as a dictionary."""
        return {
            "initial_learning_rate": self.initial_learning_rate,
            "decay_steps": self.decay_steps,
            "end_learning_rate": self.end_learning_rate,
            "power": self.power,
            "cycle": self.cycle,
            "name": self.name,
        }
@keras_export("keras.optimizers.schedules.InverseTimeDecay")
class InverseTimeDecay(LearningRateSchedule):
    """A LearningRateSchedule that uses an inverse time decay schedule.

    Lowering the learning rate as training progresses is often useful. This
    schedule divides a provided initial learning rate by a term that grows
    linearly with the optimizer step.
    It requires a `step` value to compute the decayed learning rate. You can
    just pass a TensorFlow variable that you increment at each training step.

    The schedule is a 1-arg callable: given the current optimizer step it
    produces the decayed learning rate, so the value can change across
    successive invocations of optimizer functions. It is computed as:

    ```python
    def decayed_learning_rate(step):
      return initial_learning_rate / (1 + decay_rate * step / decay_steps)
    ```

    or, if `staircase` is `True`, as:

    ```python
    def decayed_learning_rate(step):
      return initial_learning_rate / (
          1 + decay_rate * floor(step / decay_steps))
    ```

    The schedule can be passed directly to a `tf.keras.optimizers.Optimizer`
    as the learning rate.
    Example: Fit a Keras model when decaying 1/t with a rate of 0.5:

    ```python
    ...
    initial_learning_rate = 0.1
    decay_steps = 1.0
    decay_rate = 0.5
    learning_rate_fn = keras.optimizers.schedules.InverseTimeDecay(
      initial_learning_rate, decay_steps, decay_rate)

    model.compile(optimizer=tf.keras.optimizers.SGD(
                      learning_rate=learning_rate_fn),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    model.fit(data, labels, epochs=5)
    ```

    Returns:
      A 1-arg callable learning rate schedule that takes the current optimizer
      step and outputs the decayed learning rate, a scalar `Tensor` of the same
      type as `initial_learning_rate`.
    """

    def __init__(
        self,
        initial_learning_rate,
        decay_steps,
        decay_rate,
        staircase=False,
        name=None,
    ):
        """Stores the hyperparameters of the inverse time decay.

        Args:
          initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a
            Python number. The initial learning rate.
          decay_steps: How often to apply decay.
          decay_rate: A Python number. The decay rate.
          staircase: Whether to apply decay in a discrete staircase, as opposed
            to continuous, fashion.
          name: String. Optional name of the operation. Defaults to
            'InverseTimeDecay'.
        """
        super().__init__()

        self.name = name
        self.staircase = staircase
        self.decay_rate = decay_rate
        self.decay_steps = decay_steps
        self.initial_learning_rate = initial_learning_rate

    def __call__(self, step):
        """Returns the inverse-time decayed learning rate for `step`."""
        scope_name = self.name or "InverseTimeDecay"
        with tf.name_scope(scope_name) as name:
            base_lr = tf.convert_to_tensor(
                self.initial_learning_rate, name="initial_learning_rate"
            )
            # All arithmetic runs in the dtype of the initial learning rate.
            dtype = base_lr.dtype
            steps_per_decay = tf.cast(self.decay_steps, dtype)
            rate = tf.cast(self.decay_rate, dtype)

            elapsed = tf.cast(step, dtype) / steps_per_decay
            if self.staircase:
                # Only whole multiples of `decay_steps` change the rate.
                elapsed = tf.floor(elapsed)
            one = tf.cast(tf.constant(1), dtype)
            denominator = tf.add(one, tf.multiply(rate, elapsed))
            return tf.divide(base_lr, denominator, name=name)

    def get_config(self):
        """Returns the constructor arguments as a dictionary."""
        fields = (
            "initial_learning_rate",
            "decay_steps",
            "decay_rate",
            "staircase",
            "name",
        )
        return {field: getattr(self, field) for field in fields}
@keras_export(
    "keras.optimizers.schedules.CosineDecay", "keras.experimental.CosineDecay"
)
class CosineDecay(LearningRateSchedule):
    """A LearningRateSchedule that uses a cosine decay with optional warmup.

    See [Loshchilov & Hutter, ICLR2016](https://arxiv.org/abs/1608.03983),
    SGDR: Stochastic Gradient Descent with Warm Restarts.

    For the idea of a linear warmup of our learning rate,
    see [Goyal et al.](https://arxiv.org/pdf/1706.02677.pdf).

    When we begin training a model, we often want an initial increase in our
    learning rate followed by a decay. If `warmup_target` is not None, this
    schedule applies a linear increase per optimizer step to our learning rate
    from `initial_learning_rate` to `warmup_target` for a duration of
    `warmup_steps`. Afterwards, it applies a cosine decay function taking our
    learning rate from `warmup_target` to `alpha * warmup_target` for a
    duration of `decay_steps`. If `warmup_target` is None we skip warmup and
    our decay will take our learning rate from `initial_learning_rate` to
    `alpha * initial_learning_rate`.
    It requires a `step` value to compute the learning rate. You can
    just pass a TensorFlow variable that you increment at each training step.

    The schedule is a 1-arg callable that produces a warmup followed by a
    decayed learning rate when passed the current optimizer step. This can be
    useful for changing the learning rate value across different invocations of
    optimizer functions.

    Our warmup is computed as:

    ```python
    def warmup_learning_rate(step):
        completed_fraction = step / warmup_steps
        total_delta = warmup_target - initial_learning_rate
        return completed_fraction * total_delta + initial_learning_rate
    ```

    And our decay is computed as:

    ```python
    if warmup_target is None:
        initial_decay_lr = initial_learning_rate
    else:
        initial_decay_lr = warmup_target

    def decayed_learning_rate(step):
        step = min(step, decay_steps)
        cosine_decay = 0.5 * (1 + cos(pi * step / decay_steps))
        decayed = (1 - alpha) * cosine_decay + alpha
        return initial_decay_lr * decayed
    ```

    In the warmup case, the `step` passed to the decay is counted from the
    end of the warmup phase.

    Example usage without warmup:

    ```python
    decay_steps = 1000
    initial_learning_rate = 0.1
    lr_decayed_fn = tf.keras.optimizers.schedules.CosineDecay(
        initial_learning_rate, decay_steps)
    ```

    Example usage with warmup:

    ```python
    decay_steps = 1000
    initial_learning_rate = 0.0
    warmup_steps = 1000
    target_learning_rate = 0.1
    lr_warmup_decayed_fn = tf.keras.optimizers.schedules.CosineDecay(
        initial_learning_rate, decay_steps, warmup_target=target_learning_rate,
        warmup_steps=warmup_steps
    )
    ```

    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
    as the learning rate. The learning rate schedule is also serializable and
    deserializable using `tf.keras.optimizers.schedules.serialize` and
    `tf.keras.optimizers.schedules.deserialize`.

    Returns:
      A 1-arg callable learning rate schedule that takes the current optimizer
      step and outputs the decayed learning rate, a scalar `Tensor` of the same
      type as `initial_learning_rate`.
    """

    def __init__(
        self,
        initial_learning_rate,
        decay_steps,
        alpha=0.0,
        name=None,
        warmup_target=None,
        warmup_steps=0,
    ):
        """Applies cosine decay to the learning rate.

        Args:
          initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a
            Python float. The initial learning rate. Use a floating-point
            value: the dtype of this argument is the dtype in which the
            schedule performs all of its arithmetic.
          decay_steps: A scalar `int32` or `int64` `Tensor` or a Python int.
            Number of steps to decay over.
          alpha: A scalar `float32` or `float64` `Tensor` or a Python number.
            Minimum learning rate value for decay as a fraction of the
            learning rate the decay starts from (`initial_learning_rate`, or
            `warmup_target` when warmup is used). It is cast to the
            `initial_learning_rate` datatype.
          name: String. Optional name of the operation.  Defaults to
            'CosineDecay'.
          warmup_target: None or a scalar `float32` or `float64` `Tensor` or a
            Python number. The target learning rate for our warmup phase. Will
            cast to the `initial_learning_rate` datatype. Setting to None will
            skip warmup and begins decay phase from `initial_learning_rate`.
            Otherwise scheduler will warmup from `initial_learning_rate` to
            `warmup_target`.
          warmup_steps: A scalar `int32` or `int64` `Tensor` or a Python int.
            Number of steps to warmup over.
        """
        super().__init__()

        self.initial_learning_rate = initial_learning_rate
        self.decay_steps = decay_steps
        self.alpha = alpha
        self.name = name
        self.warmup_steps = warmup_steps
        self.warmup_target = warmup_target

    def _decay_function(self, step, decay_steps, decay_from_lr, dtype):
        """Cosine-decays `decay_from_lr` towards `alpha * decay_from_lr`.

        Args:
          step: Steps elapsed since the decay phase started, already clamped
            by the caller.
          decay_steps: Length of the decay phase.
          decay_from_lr: Learning rate at the start of the decay phase.
          dtype: Dtype used for the constants created here.

        Returns:
          The decayed learning rate.
        """
        with tf.name_scope(self.name or "CosineDecay"):
            # Cast alpha like every other hyperparameter so that a tensor or
            # NumPy `alpha` of another dtype does not clash with `dtype`.
            alpha = tf.cast(self.alpha, dtype)
            completed_fraction = step / decay_steps
            tf_pi = tf.constant(math.pi, dtype=dtype)
            cosine_decayed = 0.5 * (1.0 + tf.cos(tf_pi * completed_fraction))
            decayed = (1 - alpha) * cosine_decayed + alpha
            return tf.multiply(decay_from_lr, decayed)

    def _warmup_function(
        self, step, warmup_steps, warmup_target, initial_learning_rate
    ):
        """Linearly interpolates the learning rate during warmup.

        Args:
          step: Current step; the caller only uses this result while
            `step < warmup_steps`.
          warmup_steps: Length of the warmup phase.
          warmup_target: Learning rate reached at the end of warmup.
          initial_learning_rate: Learning rate at step 0.

        Returns:
          The warmup learning rate.
        """
        with tf.name_scope(self.name or "CosineDecay"):
            completed_fraction = step / warmup_steps
            total_step_delta = warmup_target - initial_learning_rate
            return total_step_delta * completed_fraction + initial_learning_rate

    def __call__(self, step):
        """Returns the warmup or cosine-decayed learning rate for `step`."""
        with tf.name_scope(self.name or "CosineDecay"):
            initial_learning_rate = tf.convert_to_tensor(
                self.initial_learning_rate, name="initial_learning_rate"
            )
            # All arithmetic runs in the dtype of the initial learning rate.
            dtype = initial_learning_rate.dtype
            decay_steps = tf.cast(self.decay_steps, dtype)
            global_step_recomp = tf.cast(step, dtype)

            if self.warmup_target is None:
                # No warmup: decay straight from the initial learning rate and
                # hold the final value once `decay_steps` is reached.
                global_step_recomp = tf.minimum(global_step_recomp, decay_steps)
                return self._decay_function(
                    global_step_recomp,
                    decay_steps,
                    initial_learning_rate,
                    dtype,
                )

            warmup_target = tf.cast(self.warmup_target, dtype)
            warmup_steps = tf.cast(self.warmup_steps, dtype)

            # Hold the final value once warmup plus decay have both finished.
            global_step_recomp = tf.minimum(
                global_step_recomp, decay_steps + warmup_steps
            )

            return tf.cond(
                global_step_recomp < warmup_steps,
                lambda: self._warmup_function(
                    global_step_recomp,
                    warmup_steps,
                    warmup_target,
                    initial_learning_rate,
                ),
                # The decay phase counts its steps from the end of warmup.
                lambda: self._decay_function(
                    global_step_recomp - warmup_steps,
                    decay_steps,
                    warmup_target,
                    dtype,
                ),
            )

    def get_config(self):
        """Returns the constructor arguments as a dictionary."""
        return {
            "initial_learning_rate": self.initial_learning_rate,
            "decay_steps": self.decay_steps,
            "alpha": self.alpha,
            "name": self.name,
            "warmup_target": self.warmup_target,
            "warmup_steps": self.warmup_steps,
        }
@keras_export(
    "keras.optimizers.schedules.CosineDecayRestarts",
    "keras.experimental.CosineDecayRestarts",
)
class CosineDecayRestarts(LearningRateSchedule):
    """A LearningRateSchedule that uses a cosine decay schedule with restarts.

    See [Loshchilov & Hutter, ICLR2016](https://arxiv.org/abs/1608.03983),
    SGDR: Stochastic Gradient Descent with Warm Restarts.

    Lowering the learning rate as training progresses is often useful. This
    schedule applies a cosine decay with restarts to an optimizer step, given
    a provided initial learning rate.
    It requires a `step` value to compute the decayed learning rate. You can
    just pass a TensorFlow variable that you increment at each training step.

    The schedule is a 1-arg callable: given the current optimizer step it
    produces the decayed learning rate, so the value can change across
    successive invocations of optimizer functions.

    The learning rate multiplier first decays
    from 1 to `alpha` for `first_decay_steps` steps. Then, a warm
    restart is performed. Each new warm restart runs for `t_mul` times more
    steps and with `m_mul` times initial learning rate as the new learning rate.

    Example usage:
    ```python
    first_decay_steps = 1000
    lr_decayed_fn = (
      tf.keras.optimizers.schedules.CosineDecayRestarts(
          initial_learning_rate,
          first_decay_steps))
    ```

    The schedule can be passed directly to a `tf.keras.optimizers.Optimizer`
    as the learning rate. It is also serializable and deserializable using
    `tf.keras.optimizers.schedules.serialize` and
    `tf.keras.optimizers.schedules.deserialize`.

    Returns:
      A 1-arg callable learning rate schedule that takes the current optimizer
      step and outputs the decayed learning rate, a scalar `Tensor` of the same
      type as `initial_learning_rate`.
    """

    def __init__(
        self,
        initial_learning_rate,
        first_decay_steps,
        t_mul=2.0,
        m_mul=1.0,
        alpha=0.0,
        name=None,
    ):
        """Stores the hyperparameters of the cosine decay with restarts.

        Args:
          initial_learning_rate: A scalar `float32` or `float64` Tensor or a
            Python number. The initial learning rate.
          first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python
            number. Number of steps to decay over.
          t_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
            Used to derive the number of iterations in the i-th period.
          m_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
            Used to derive the initial learning rate of the i-th period.
          alpha: A scalar `float32` or `float64` Tensor or a Python number.
            Minimum learning rate value as a fraction of the
            initial_learning_rate.
          name: String. Optional name of the operation.  Defaults to
            'SGDRDecay'.
        """
        super().__init__()

        self.name = name
        self.alpha = alpha
        self._m_mul = m_mul
        self._t_mul = t_mul
        self.first_decay_steps = first_decay_steps
        self.initial_learning_rate = initial_learning_rate

    def __call__(self, step):
        """Returns the learning rate for `step`, accounting for restarts."""
        scope_name = self.name or "SGDRDecay"
        with tf.name_scope(scope_name) as name:
            base_lr = tf.convert_to_tensor(
                self.initial_learning_rate, name="initial_learning_rate"
            )
            # All arithmetic runs in the dtype of the initial learning rate.
            dtype = base_lr.dtype
            first_decay_steps = tf.cast(self.first_decay_steps, dtype)
            alpha = tf.cast(self.alpha, dtype)
            t_mul = tf.cast(self._t_mul, dtype)
            m_mul = tf.cast(self._m_mul, dtype)

            # Training progress measured in units of the first period.
            progress = tf.cast(step, dtype) / first_decay_steps

            def equal_periods():
                """Every period has the same length (`t_mul == 1`)."""
                restarts = tf.floor(progress)
                return restarts, progress - restarts

            def geometric_periods():
                """Period lengths grow as 1, t_mul, t_mul**2, ..."""
                # Invert the geometric series to get the number of periods
                # already completed.
                restarts = tf.floor(
                    tf.math.log(1.0 - progress * (1.0 - t_mul))
                    / tf.math.log(t_mul)
                )
                # Total length of the completed periods.
                completed = (1.0 - t_mul**restarts) / (1.0 - t_mul)
                # Position inside the current period, rescaled to [0, 1).
                within = (progress - completed) / t_mul**restarts
                return restarts, within

            i_restart, period_fraction = tf.cond(
                tf.equal(t_mul, 1.0), equal_periods, geometric_periods
            )

            # Each restart scales the peak learning rate by `m_mul`.
            m_fac = m_mul**i_restart
            tf_pi = tf.constant(math.pi, dtype=dtype)
            cosine_decayed = 0.5 * m_fac * (1.0 + tf.cos(tf_pi * period_fraction))
            decayed = (1 - alpha) * cosine_decayed + alpha

            return tf.multiply(base_lr, decayed, name=name)

    def get_config(self):
        """Returns the constructor arguments as a dictionary."""
        return dict(
            initial_learning_rate=self.initial_learning_rate,
            first_decay_steps=self.first_decay_steps,
            t_mul=self._t_mul,
            m_mul=self._m_mul,
            alpha=self.alpha,
            name=self.name,
        )
914# Note: this code is still used by V1 APIs.
class LinearCosineDecay(LearningRateSchedule):
    """A LearningRateSchedule that uses a linear cosine decay schedule.

    Reference: [Bello et al., ICML2017] Neural Optimizer Search with RL.
    https://arxiv.org/abs/1709.07417

    The warm-start idea, controlled here through `num_periods`, is described
    in [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
    with Warm Restarts. https://arxiv.org/abs/1608.03983

    Linear cosine decay is more aggressive than cosine decay, so larger
    initial learning rates can typically be used.

    Lowering the learning rate as training progresses is often recommended.
    Given an initial learning rate, this schedule applies a linear cosine
    decay function to an optimizer step. It needs a `step` value to compute
    the decayed learning rate; a TensorFlow variable that is incremented at
    each training step can be passed for that purpose.

    The schedule is a 1-arg callable: pass it the current optimizer step and
    it produces the decayed learning rate. This can be useful for changing
    the learning rate value across different invocations of optimizer
    functions. The value is computed as:

    ```python
    def decayed_learning_rate(step):
      step = min(step, decay_steps)
      linear_decay = (decay_steps - step) / decay_steps
      cosine_decay = 0.5 * (
          1 + cos(pi * 2 * num_periods * step / decay_steps))
      decayed = (alpha + linear_decay) * cosine_decay + beta
      return initial_learning_rate * decayed
    ```

    Example usage:
    ```python
    decay_steps = 1000
    lr_decayed_fn = (
      tf.keras.experimental.LinearCosineDecay(
        initial_learning_rate, decay_steps))
    ```

    The schedule can be passed directly into a
    `tf.keras.optimizers.Optimizer` as the learning rate. It is also
    serializable and deserializable using
    `tf.keras.optimizers.schedules.serialize` and
    `tf.keras.optimizers.schedules.deserialize`.

    Returns:
      A 1-arg callable learning rate schedule that takes the current optimizer
      step and outputs the decayed learning rate, a scalar `Tensor` of the same
      type as `initial_learning_rate`.
    """

    def __init__(
        self,
        initial_learning_rate,
        decay_steps,
        num_periods=0.5,
        alpha=0.0,
        beta=0.001,
        name=None,
    ):
        """Applies linear cosine decay to the learning rate.

        Args:
          initial_learning_rate: A scalar `float32` or `float64` Tensor or a
            Python number. The initial learning rate.
          decay_steps: A scalar `int32` or `int64` `Tensor` or a Python
            number. Number of steps to decay over.
          num_periods: Number of periods in the cosine part of the decay.
            See the computation in the class docstring.
          alpha: Added to the linear term before it is multiplied by the
            cosine term. See the computation in the class docstring.
          beta: Added after the multiplication by the cosine term. See the
            computation in the class docstring.
          name: String. Optional name of the operation. Defaults to
            'LinearCosineDecay'.
        """
        super().__init__()

        # The arguments are stored untouched; conversion to tensors happens
        # on every call so that the dtype follows `initial_learning_rate`.
        self.initial_learning_rate = initial_learning_rate
        self.decay_steps = decay_steps
        self.num_periods = num_periods
        self.alpha = alpha
        self.beta = beta
        self.name = name

    def __call__(self, step):
        """Compute the decayed learning rate for the optimizer step `step`."""
        with tf.name_scope(self.name or "LinearCosineDecay") as scope_name:
            base_lr = tf.convert_to_tensor(
                self.initial_learning_rate, name="initial_learning_rate"
            )
            # Every operand is cast to the dtype of the initial learning rate.
            dtype = base_lr.dtype
            total_steps = tf.cast(self.decay_steps, dtype)
            periods = tf.cast(self.num_periods, dtype)
            alpha = tf.cast(self.alpha, dtype)
            beta = tf.cast(self.beta, dtype)

            # Steps past `decay_steps` are clipped to `decay_steps`.
            current_step = tf.minimum(tf.cast(step, dtype), total_steps)

            linear_term = (total_steps - current_step) / total_steps
            progress = current_step / total_steps
            phase = 2.0 * periods * progress
            cosine_term = 0.5 * (
                1.0 + tf.cos(tf.constant(math.pi, dtype=dtype) * phase)
            )

            decay_factor = (alpha + linear_term) * cosine_term + beta
            return tf.multiply(base_lr, decay_factor, name=scope_name)

    def get_config(self):
        """Return the stored settings as a dict keyed by argument name."""
        keys = (
            "initial_learning_rate",
            "decay_steps",
            "num_periods",
            "alpha",
            "beta",
            "name",
        )
        return {key: getattr(self, key) for key in keys}
1038# Note: this code is still used by V1 APIs.
class NoisyLinearCosineDecay(LearningRateSchedule):
    """A LearningRateSchedule that uses a noisy linear cosine decay schedule.

    See [Bello et al., ICML2017] Neural Optimizer Search with RL.
    https://arxiv.org/abs/1709.07417

    For the idea of warm starts here controlled by `num_periods`,
    see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
    with Warm Restarts. https://arxiv.org/abs/1608.03983

    Note that linear cosine decay is more aggressive than cosine decay and
    larger initial learning rates can typically be used.

    When training a model, it is often recommended to lower the learning rate
    as the training progresses. This schedule applies a noisy linear cosine
    decay function to an optimizer step, given a provided initial learning
    rate. It requires a `step` value to compute the decayed learning rate.
    You can just pass a TensorFlow variable that you increment at each
    training step.

    The schedule is a 1-arg callable that produces a decayed learning
    rate when passed the current optimizer step. This can be useful for
    changing the learning rate value across different invocations of
    optimizer functions. It is computed as:

    ```python
    def decayed_learning_rate(step):
      step = min(step, decay_steps)
      linear_decay = (decay_steps - step) / decay_steps
      cosine_decay = 0.5 * (
          1 + cos(pi * 2 * num_periods * step / decay_steps))
      decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta
      return initial_learning_rate * decayed
    ```
    where eps_t is 0-centered gaussian noise with variance
    initial_variance / (1 + step) ** variance_decay, `step` being the
    clipped step from the first line of the computation.

    Example usage:
    ```python
    decay_steps = 1000
    lr_decayed_fn = (
      tf.keras.experimental.NoisyLinearCosineDecay(
        initial_learning_rate, decay_steps))
    ```

    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
    as the learning rate. The learning rate schedule is also serializable and
    deserializable using `tf.keras.optimizers.schedules.serialize` and
    `tf.keras.optimizers.schedules.deserialize`.

    Returns:
      A 1-arg callable learning rate schedule that takes the current optimizer
      step and outputs the decayed learning rate, a scalar `Tensor` of the same
      type as `initial_learning_rate`.
    """

    def __init__(
        self,
        initial_learning_rate,
        decay_steps,
        initial_variance=1.0,
        variance_decay=0.55,
        num_periods=0.5,
        alpha=0.0,
        beta=0.001,
        seed=None,
        name=None,
    ):
        """Applies noisy linear cosine decay to the learning rate.

        Args:
          initial_learning_rate: A scalar `float32` or `float64` Tensor or a
            Python number. The initial learning rate.
          decay_steps: A scalar `int32` or `int64` `Tensor` or a Python
            number. Number of steps to decay over.
          initial_variance: initial variance for the noise. See computation
            in the class docstring.
          variance_decay: decay for the noise's variance. See computation in
            the class docstring.
          num_periods: Number of periods in the cosine part of the decay.
            See computation in the class docstring.
          alpha: See computation in the class docstring.
          beta: See computation in the class docstring.
          seed: Integer, optional random seed to enable deterministic
            behavior.
          name: String. Optional name of the operation. Defaults to
            'NoisyLinearCosineDecay'.
        """
        super().__init__()

        self.initial_learning_rate = initial_learning_rate
        self.decay_steps = decay_steps
        self.initial_variance = initial_variance
        self.variance_decay = variance_decay
        self.num_periods = num_periods
        self.alpha = alpha
        self.beta = beta
        self.seed = seed
        self.name = name
        # Source of the gaussian noise added in `__call__`; built from `seed`.
        self._random_generator = backend.RandomGenerator(seed)

    def __call__(self, step):
        """Compute the noisy decayed learning rate for optimizer step `step`.

        Args:
          step: The current optimizer step; it is cast to the dtype of
            `initial_learning_rate` before use.

        Returns:
          The product of `initial_learning_rate` and the noisy linear cosine
          decay factor, in the dtype of `initial_learning_rate`.
        """
        with tf.name_scope(self.name or "NoisyLinearCosineDecay") as name:
            initial_learning_rate = tf.convert_to_tensor(
                self.initial_learning_rate, name="initial_learning_rate"
            )
            # Every operand below is cast to the learning rate's dtype.
            dtype = initial_learning_rate.dtype
            decay_steps = tf.cast(self.decay_steps, dtype)
            initial_variance = tf.cast(self.initial_variance, dtype)
            variance_decay = tf.cast(self.variance_decay, dtype)
            num_periods = tf.cast(self.num_periods, dtype)
            alpha = tf.cast(self.alpha, dtype)
            beta = tf.cast(self.beta, dtype)

            # Steps past `decay_steps` are clipped to `decay_steps`.
            global_step_recomp = tf.cast(step, dtype)
            global_step_recomp = tf.minimum(global_step_recomp, decay_steps)
            linear_decayed = (decay_steps - global_step_recomp) / decay_steps

            # The noise variance shrinks with the (clipped) step.
            variance = initial_variance / (
                tf.pow(1.0 + global_step_recomp, variance_decay)
            )
            std = tf.sqrt(variance)
            # Draw the noise in the same dtype as the other operands. Without
            # an explicit dtype the generator falls back to its default
            # dtype, which does not match `linear_decayed` / `std` when the
            # learning rate is not of that default dtype (e.g. float64).
            noisy_linear_decayed = (
                linear_decayed
                + self._random_generator.random_normal(
                    linear_decayed.shape, stddev=std, dtype=dtype
                )
            )

            completed_fraction = global_step_recomp / decay_steps
            fraction = 2.0 * num_periods * completed_fraction
            cosine_decayed = 0.5 * (
                1.0 + tf.cos(tf.constant(math.pi, dtype=dtype) * fraction)
            )
            noisy_linear_cosine_decayed = (
                alpha + noisy_linear_decayed
            ) * cosine_decayed + beta

            return tf.multiply(
                initial_learning_rate, noisy_linear_cosine_decayed, name=name
            )

    def get_config(self):
        """Return the stored settings as a dict keyed by argument name."""
        return {
            "initial_learning_rate": self.initial_learning_rate,
            "decay_steps": self.decay_steps,
            "initial_variance": self.initial_variance,
            "variance_decay": self.variance_decay,
            "num_periods": self.num_periods,
            "alpha": self.alpha,
            "beta": self.beta,
            "seed": self.seed,
            "name": self.name,
        }
@keras_export("keras.optimizers.schedules.serialize")
def serialize(learning_rate_schedule, use_legacy_format=False):
    """Serializes a `LearningRateSchedule` into a JSON-compatible dict.

    Args:
      learning_rate_schedule: The `LearningRateSchedule` object to serialize.
      use_legacy_format: Boolean. When true, the object is serialized with
        `legacy_serialization.serialize_keras_object`; otherwise with
        `serialization_lib.serialize_keras_object`. Defaults to `False`.

    Returns:
      A JSON-serializable dict representing the object's config.

    Example:

    >>> lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    ...   0.1, decay_steps=100000, decay_rate=0.96, staircase=True)
    >>> tf.keras.optimizers.schedules.serialize(lr_schedule)
    {'module': 'keras.optimizers.schedules',
    'class_name': 'ExponentialDecay', 'config': {...},
    'registered_name': None}
    """
    # Pick the serializer first so that there is a single call site.
    serializer = (
        legacy_serialization.serialize_keras_object
        if use_legacy_format
        else serialization_lib.serialize_keras_object
    )
    return serializer(learning_rate_schedule)
@keras_export("keras.optimizers.schedules.deserialize")
def deserialize(config, custom_objects=None, use_legacy_format=False):
    """Instantiates a `LearningRateSchedule` object from a serialized form.

    Args:
      config: The serialized form of the `LearningRateSchedule`.
        Dictionary of the form {'class_name': str, 'config': dict}.
      custom_objects: A dictionary mapping class names (or function names) of
        custom (non-Keras) objects to class/functions.
      use_legacy_format: Boolean. When true, the config is handed to
        `legacy_serialization.deserialize_keras_object`; otherwise to
        `serialization_lib.deserialize_keras_object`. Defaults to `False`.

    Returns:
      A `LearningRateSchedule` object.

    Example:

    ```python
    # Configuration for PolynomialDecay
    config = {
      'class_name': 'PolynomialDecay',
      'config': {'cycle': False,
        'decay_steps': 10000,
        'end_learning_rate': 0.01,
        'initial_learning_rate': 0.1,
        'name': None,
        'power': 0.5}}
    lr_schedule = tf.keras.optimizers.schedules.deserialize(config)
    ```
    """
    # Both deserializers receive the same arguments; only the implementation
    # differs, so choose it up front and call it once.
    deserializer = (
        legacy_serialization.deserialize_keras_object
        if use_legacy_format
        else serialization_lib.deserialize_keras_object
    )
    return deserializer(
        config,
        module_objects=globals(),
        custom_objects=custom_objects,
        printable_module_name="decay",
    )