1# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Various learning rate decay functions."""
17import abc
18import math
20from tensorflow.python.framework import constant_op
21from tensorflow.python.framework import ops
22from tensorflow.python.framework import tensor_conversion
23from tensorflow.python.keras.utils import generic_utils
24from tensorflow.python.ops import array_ops
25from tensorflow.python.ops import cond
26from tensorflow.python.ops import control_flow_case
27from tensorflow.python.ops import math_ops
28from tensorflow.python.ops import random_ops
29from tensorflow.python.util import nest
30from tensorflow.python.util.tf_export import keras_export
33@keras_export("keras.optimizers.schedules.LearningRateSchedule")
34class LearningRateSchedule(object):
35 """The learning rate schedule base class.
37 You can use a learning rate schedule to modulate how the learning rate
38 of your optimizer changes over time.
40 Several built-in learning rate schedules are available, such as
41 `tf.keras.optimizers.schedules.ExponentialDecay` or
42 `tf.keras.optimizers.schedules.PiecewiseConstantDecay`:
44 ```python
45 lr_schedule = keras.optimizers.schedules.ExponentialDecay(
46 initial_learning_rate=1e-2,
47 decay_steps=10000,
48 decay_rate=0.9)
49 optimizer = keras.optimizers.SGD(learning_rate=lr_schedule)
50 ```
52 A `LearningRateSchedule` instance can be passed in as the `learning_rate`
53 argument of any optimizer.
55 To implement your own schedule object, you should implement the `__call__`
56 method, which takes a `step` argument (scalar integer tensor, the
57 current training step count).
58 Like for any other Keras object, you can also optionally
59 make your object serializable by implementing the `get_config`
60 and `from_config` methods.
62 Example:
64 ```python
65 class MyLRSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
67 def __init__(self, initial_learning_rate):
68 self.initial_learning_rate = initial_learning_rate
70 def __call__(self, step):
71 return self.initial_learning_rate / (step + 1)
73 optimizer = tf.keras.optimizers.SGD(learning_rate=MyLRSchedule(0.1))
74 ```
75 """
77 @abc.abstractmethod
78 def __call__(self, step):
79 raise NotImplementedError("Learning rate schedule must override __call__")
81 @abc.abstractmethod
82 def get_config(self):
83 raise NotImplementedError("Learning rate schedule must override get_config")
85 @classmethod
86 def from_config(cls, config):
87 """Instantiates a `LearningRateSchedule` from its config.
89 Args:
90 config: Output of `get_config()`.
92 Returns:
93 A `LearningRateSchedule` instance.
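  Example (an illustrative round-trip through `get_config`, shown here with
  one of the built-in schedules; any `LearningRateSchedule` subclass works
  the same way):

  ```python
  schedule = tf.keras.optimizers.schedules.ExponentialDecay(
      initial_learning_rate=1e-2, decay_steps=10000, decay_rate=0.9)
  config = schedule.get_config()
  restored = tf.keras.optimizers.schedules.ExponentialDecay.from_config(config)
  ```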
94 """
95 return cls(**config)
98@keras_export("keras.optimizers.schedules.ExponentialDecay")
99class ExponentialDecay(LearningRateSchedule):
100 """A LearningRateSchedule that uses an exponential decay schedule.
102 When training a model, it is often useful to lower the learning rate as
103 the training progresses. This schedule applies an exponential decay function
104 to an optimizer step, given a provided initial learning rate.
106 The schedule is a 1-arg callable that produces a decayed learning
107 rate when passed the current optimizer step. This can be useful for changing
108 the learning rate value across different invocations of optimizer functions.
109 It is computed as:
111 ```python
112 def decayed_learning_rate(step):
113 return initial_learning_rate * decay_rate ^ (step / decay_steps)
114 ```
116 If the argument `staircase` is `True`, then `step / decay_steps` is
117 an integer division and the decayed learning rate follows a
118 staircase function.
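  As a quick worked illustration (numbers chosen here for demonstration,
  not taken from the example below):

  ```python
  schedule = tf.keras.optimizers.schedules.ExponentialDecay(
      initial_learning_rate=1.0, decay_steps=1000, decay_rate=0.5)
  schedule(0)     # 1.0
  schedule(500)   # ~0.707  (= 0.5 ** 0.5); would stay 1.0 with staircase=True
  schedule(1000)  # 0.5     (= 0.5 ** 1)
  ```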
120 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
121 as the learning rate.
122 Example: When fitting a Keras model, decay every 100000 steps with a base
123 of 0.96:
125 ```python
126 initial_learning_rate = 0.1
127 lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
128 initial_learning_rate,
129 decay_steps=100000,
130 decay_rate=0.96,
131 staircase=True)
133 model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=lr_schedule),
134 loss='sparse_categorical_crossentropy',
135 metrics=['accuracy'])
137 model.fit(data, labels, epochs=5)
138 ```
140 The learning rate schedule is also serializable and deserializable using
141 `tf.keras.optimizers.schedules.serialize` and
142 `tf.keras.optimizers.schedules.deserialize`.
144 Returns:
145 A 1-arg callable learning rate schedule that takes the current optimizer
146 step and outputs the decayed learning rate, a scalar `Tensor` of the same
147 type as `initial_learning_rate`.
148 """
150 def __init__(
151 self,
152 initial_learning_rate,
153 decay_steps,
154 decay_rate,
155 staircase=False,
156 name=None):
157 """Applies exponential decay to the learning rate.
159 Args:
160 initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a
161 Python number. The initial learning rate.
162 decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
163 Must be positive. See the decay computation above.
164 decay_rate: A scalar `float32` or `float64` `Tensor` or a
165 Python number. The decay rate.
166 staircase: Boolean. If `True`, decay the learning rate at discrete
167 intervals.
168 name: String. Optional name of the operation. Defaults to
169 'ExponentialDecay'.
170 """
171 super(ExponentialDecay, self).__init__()
172 self.initial_learning_rate = initial_learning_rate
173 self.decay_steps = decay_steps
174 self.decay_rate = decay_rate
175 self.staircase = staircase
176 self.name = name
178 def __call__(self, step):
179 with ops.name_scope_v2(self.name or "ExponentialDecay") as name:
180 initial_learning_rate = (
181 tensor_conversion.convert_to_tensor_v2_with_dispatch(
182 self.initial_learning_rate, name="initial_learning_rate"
183 )
184 )
185 dtype = initial_learning_rate.dtype
186 decay_steps = math_ops.cast(self.decay_steps, dtype)
187 decay_rate = math_ops.cast(self.decay_rate, dtype)
189 global_step_recomp = math_ops.cast(step, dtype)
190 p = global_step_recomp / decay_steps
191 if self.staircase:
192 p = math_ops.floor(p)
193 return math_ops.multiply(
194 initial_learning_rate, math_ops.pow(decay_rate, p), name=name)
196 def get_config(self):
197 return {
198 "initial_learning_rate": self.initial_learning_rate,
199 "decay_steps": self.decay_steps,
200 "decay_rate": self.decay_rate,
201 "staircase": self.staircase,
202 "name": self.name
203 }
206@keras_export("keras.optimizers.schedules.PiecewiseConstantDecay")
207class PiecewiseConstantDecay(LearningRateSchedule):
208 """A LearningRateSchedule that uses a piecewise constant decay schedule.
210 The schedule is a 1-arg callable that computes the piecewise constant value
211 when passed the current optimizer step. This can be useful for changing the
212 learning rate value across different invocations of optimizer functions.
214 Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5
215 for the next 10000 steps, and 0.1 for any additional steps.
217 ```python
218 step = tf.Variable(0, trainable=False)
219 boundaries = [100000, 110000]
220 values = [1.0, 0.5, 0.1]
221 learning_rate_fn = keras.optimizers.schedules.PiecewiseConstantDecay(
222 boundaries, values)
224 # Later, whenever we perform an optimization step, we pass in the step.
225 learning_rate = learning_rate_fn(step)
226 ```
228 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
229 as the learning rate. The learning rate schedule is also serializable and
230 deserializable using `tf.keras.optimizers.schedules.serialize` and
231 `tf.keras.optimizers.schedules.deserialize`.
233 Returns:
234 A 1-arg callable learning rate schedule that takes the current optimizer
235 step and outputs the decayed learning rate, a scalar `Tensor` of the same
236 type as the boundary tensors.
238 The output of the 1-arg function that takes the `step`
239 is `values[0]` when `step <= boundaries[0]`,
240 `values[1]` when `step > boundaries[0]` and `step <= boundaries[1]`, ...,
241 and `values[-1]` when `step > boundaries[-1]`.
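  Concretely, with the `boundaries` and `values` from the example above
  (an illustrative check of this mapping):

  ```python
  learning_rate_fn(100000)  # 1.0  (step <= boundaries[0])
  learning_rate_fn(105000)  # 0.5  (boundaries[0] < step <= boundaries[1])
  learning_rate_fn(120000)  # 0.1  (step > boundaries[-1])
  ```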
242 """
244 def __init__(
245 self,
246 boundaries,
247 values,
248 name=None):
249 """Piecewise constant from boundaries and interval values.
251 Args:
252 boundaries: A list of `Tensor`s or `int`s or `float`s with strictly
253 increasing entries, and with all elements having the same type as the
254 optimizer step.
255 values: A list of `Tensor`s or `float`s or `int`s that specifies the
256 values for the intervals defined by `boundaries`. It should have one
257 more element than `boundaries`, and all elements should have the same
258 type.
259 name: A string. Optional name of the operation. Defaults to
260 'PiecewiseConstant'.
262 Raises:
263 ValueError: if the number of elements in the lists does not match.
264 """
265 super(PiecewiseConstantDecay, self).__init__()
267 if len(boundaries) != len(values) - 1:
268 raise ValueError(
269 "The length of boundaries should be 1 less than the length of values")
271 self.boundaries = boundaries
272 self.values = values
273 self.name = name
275 def __call__(self, step):
276 with ops.name_scope_v2(self.name or "PiecewiseConstant"):
277 boundaries = nest.map_structure(
278 tensor_conversion.convert_to_tensor_v2_with_dispatch,
279 nest.flatten(self.boundaries),
280 )
281 values = nest.map_structure(
282 tensor_conversion.convert_to_tensor_v2_with_dispatch,
283 nest.flatten(self.values),
284 )
285 x_recomp = tensor_conversion.convert_to_tensor_v2_with_dispatch(step)
286 for i, b in enumerate(boundaries):
287 if b.dtype.base_dtype != x_recomp.dtype.base_dtype:
288 # We cast the boundaries to have the same type as the step
289 b = math_ops.cast(b, x_recomp.dtype.base_dtype)
290 boundaries[i] = b
291 pred_fn_pairs = []
292 pred_fn_pairs.append((x_recomp <= boundaries[0], lambda: values[0]))
293 pred_fn_pairs.append((x_recomp > boundaries[-1], lambda: values[-1]))
294 for low, high, v in zip(boundaries[:-1], boundaries[1:], values[1:-1]):
295 # Need to bind v here; can do this with lambda v=v: ...
296 pred = (x_recomp > low) & (x_recomp <= high)
297 pred_fn_pairs.append((pred, lambda v=v: v))
299 # The default isn't needed here because our conditions are mutually
300 # exclusive and exhaustive, but tf.case requires it.
301 default = lambda: values[0]
302 return control_flow_case.case(pred_fn_pairs, default, exclusive=True)
304 def get_config(self):
305 return {
306 "boundaries": self.boundaries,
307 "values": self.values,
308 "name": self.name
309 }
312@keras_export("keras.optimizers.schedules.PolynomialDecay")
313class PolynomialDecay(LearningRateSchedule):
314 """A LearningRateSchedule that uses a polynomial decay schedule.
316 It is commonly observed that a monotonically decreasing learning rate, whose
317 degree of change is carefully chosen, results in a better performing model.
318 This schedule applies a polynomial decay function to an optimizer step,
319 given a provided `initial_learning_rate`, to reach an `end_learning_rate`
320 in the given `decay_steps`.
322 It requires a `step` value to compute the decayed learning rate. You
323 can just pass a TensorFlow variable that you increment at each training
324 step.
326 The schedule is a 1-arg callable that produces a decayed learning rate
327 when passed the current optimizer step. This can be useful for changing the
328 learning rate value across different invocations of optimizer functions.
329 It is computed as:
331 ```python
332 def decayed_learning_rate(step):
333 step = min(step, decay_steps)
334 return ((initial_learning_rate - end_learning_rate) *
335 (1 - step / decay_steps) ^ (power)
336 ) + end_learning_rate
337 ```
339 If `cycle` is `True`, then a multiple of `decay_steps` is used: the first
340 one that is bigger than `step`.
342 ```python
343 def decayed_learning_rate(step):
344 decay_steps = decay_steps * ceil(step / decay_steps)
345 return ((initial_learning_rate - end_learning_rate) *
346 (1 - step / decay_steps) ^ (power)
347 ) + end_learning_rate
348 ```
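  A brief worked example of the cycling behaviour (illustrative numbers,
  linear decay with power=1.0):

  ```python
  schedule = tf.keras.optimizers.schedules.PolynomialDecay(
      initial_learning_rate=0.1, decay_steps=1000,
      end_learning_rate=0.01, power=1.0, cycle=True)
  schedule(500)   # 0.055   (= (0.1 - 0.01) * (1 - 0.5) + 0.01)
  schedule(1500)  # 0.0325  (decay_steps is stretched to 2000, so 1 - step/decay_steps = 0.25)
  ```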
350 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
351 as the learning rate.
352 Example: Fit a model while decaying from 0.1 to 0.01 in 10000 steps using
353 sqrt (i.e. power=0.5):
355 ```python
356 ...
357 starter_learning_rate = 0.1
358 end_learning_rate = 0.01
359 decay_steps = 10000
360 learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
361 starter_learning_rate,
362 decay_steps,
363 end_learning_rate,
364 power=0.5)
366 model.compile(optimizer=tf.keras.optimizers.SGD(
367 learning_rate=learning_rate_fn),
368 loss='sparse_categorical_crossentropy',
369 metrics=['accuracy'])
371 model.fit(data, labels, epochs=5)
372 ```
374 The learning rate schedule is also serializable and deserializable using
375 `tf.keras.optimizers.schedules.serialize` and
376 `tf.keras.optimizers.schedules.deserialize`.
378 Returns:
379 A 1-arg callable learning rate schedule that takes the current optimizer
380 step and outputs the decayed learning rate, a scalar `Tensor` of the same
381 type as `initial_learning_rate`.
382 """
384 def __init__(
385 self,
386 initial_learning_rate,
387 decay_steps,
388 end_learning_rate=0.0001,
389 power=1.0,
390 cycle=False,
391 name=None):
392 """Applies a polynomial decay to the learning rate.
394 Args:
395 initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a
396 Python number. The initial learning rate.
397 decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
398 Must be positive. See the decay computation above.
399 end_learning_rate: A scalar `float32` or `float64` `Tensor` or a
400 Python number. The minimal end learning rate.
401 power: A scalar `float32` or `float64` `Tensor` or a
402 Python number. The power of the polynomial. Defaults to linear, 1.0.
403 cycle: A boolean, whether or not it should cycle beyond decay_steps.
404 name: String. Optional name of the operation. Defaults to
405 'PolynomialDecay'.
406 """
407 super(PolynomialDecay, self).__init__()
409 self.initial_learning_rate = initial_learning_rate
410 self.decay_steps = decay_steps
411 self.end_learning_rate = end_learning_rate
412 self.power = power
413 self.cycle = cycle
414 self.name = name
416 def __call__(self, step):
417 with ops.name_scope_v2(self.name or "PolynomialDecay") as name:
418 initial_learning_rate = (
419 tensor_conversion.convert_to_tensor_v2_with_dispatch(
420 self.initial_learning_rate, name="initial_learning_rate"
421 )
422 )
423 dtype = initial_learning_rate.dtype
424 end_learning_rate = math_ops.cast(self.end_learning_rate, dtype)
425 power = math_ops.cast(self.power, dtype)
427 global_step_recomp = math_ops.cast(step, dtype)
428 decay_steps_recomp = math_ops.cast(self.decay_steps, dtype)
429 if self.cycle:
430 # Find the first multiple of decay_steps that is bigger than
431 # global_step. If global_step is zero, set the multiplier to 1.
432 multiplier = array_ops.where_v2(
433 math_ops.equal(global_step_recomp, 0), 1.0,
434 math_ops.ceil(global_step_recomp / self.decay_steps))
435 decay_steps_recomp = math_ops.multiply(decay_steps_recomp, multiplier)
436 else:
437 # Make sure that the global_step used is not bigger than decay_steps.
438 global_step_recomp = math_ops.minimum(global_step_recomp,
439 decay_steps_recomp)
441 p = math_ops.divide(global_step_recomp, decay_steps_recomp)
442 return math_ops.add(
443 math_ops.multiply(initial_learning_rate - end_learning_rate,
444 math_ops.pow(1 - p, power)),
445 end_learning_rate,
446 name=name)
448 def get_config(self):
449 return {
450 "initial_learning_rate": self.initial_learning_rate,
451 "decay_steps": self.decay_steps,
452 "end_learning_rate": self.end_learning_rate,
453 "power": self.power,
454 "cycle": self.cycle,
455 "name": self.name
456 }
459@keras_export("keras.optimizers.schedules.InverseTimeDecay")
460class InverseTimeDecay(LearningRateSchedule):
461 """A LearningRateSchedule that uses an inverse time decay schedule.
463 When training a model, it is often useful to lower the learning rate as
464 the training progresses. This schedule applies the inverse decay function
465 to an optimizer step, given a provided initial learning rate.
466 It requires a `step` value to compute the decayed learning rate. You can
467 just pass a TensorFlow variable that you increment at each training step.
469 The schedule is a 1-arg callable that produces a decayed learning
470 rate when passed the current optimizer step. This can be useful for changing
471 the learning rate value across different invocations of optimizer functions.
472 It is computed as:
474 ```python
475 def decayed_learning_rate(step):
476 return initial_learning_rate / (1 + decay_rate * step / decay_step)
477 ```
479 or, if `staircase` is `True`, as:
481 ```python
482 def decayed_learning_rate(step):
483 return initial_learning_rate / (1 + decay_rate * floor(step / decay_step))
484 ```
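  For instance (illustrative numbers only), with `initial_learning_rate=0.1`,
  `decay_steps=1.0` and `decay_rate=0.5`:

  ```python
  schedule = tf.keras.optimizers.schedules.InverseTimeDecay(0.1, 1.0, 0.5)
  schedule(0)   # 0.1      (= 0.1 / (1 + 0.5 * 0))
  schedule(1)   # ~0.0667  (= 0.1 / (1 + 0.5 * 1))
  schedule(10)  # ~0.0167  (= 0.1 / (1 + 0.5 * 10))
  ```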
486 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
487 as the learning rate.
488 Example: Fit a Keras model when decaying 1/t with a rate of 0.5:
490 ```python
491 ...
492 initial_learning_rate = 0.1
493 decay_steps = 1.0
494 decay_rate = 0.5
495 learning_rate_fn = keras.optimizers.schedules.InverseTimeDecay(
496 initial_learning_rate, decay_steps, decay_rate)
498 model.compile(optimizer=tf.keras.optimizers.SGD(
499 learning_rate=learning_rate_fn),
500 loss='sparse_categorical_crossentropy',
501 metrics=['accuracy'])
503 model.fit(data, labels, epochs=5)
504 ```
506 Returns:
507 A 1-arg callable learning rate schedule that takes the current optimizer
508 step and outputs the decayed learning rate, a scalar `Tensor` of the same
509 type as `initial_learning_rate`.
510 """
512 def __init__(
513 self,
514 initial_learning_rate,
515 decay_steps,
516 decay_rate,
517 staircase=False,
518 name=None):
519 """Applies inverse time decay to the initial learning rate.
521 Args:
522 initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a
523 Python number. The initial learning rate.
524 decay_steps: How often to apply decay.
525 decay_rate: A Python number. The decay rate.
526 staircase: Whether to apply decay in a discrete staircase, as opposed to
527 continuous, fashion.
528 name: String. Optional name of the operation. Defaults to
529 'InverseTimeDecay'.
530 """
531 super(InverseTimeDecay, self).__init__()
533 self.initial_learning_rate = initial_learning_rate
534 self.decay_steps = decay_steps
535 self.decay_rate = decay_rate
536 self.staircase = staircase
537 self.name = name
539 def __call__(self, step):
540 with ops.name_scope_v2(self.name or "InverseTimeDecay") as name:
541 initial_learning_rate = (
542 tensor_conversion.convert_to_tensor_v2_with_dispatch(
543 self.initial_learning_rate, name="initial_learning_rate"
544 )
545 )
546 dtype = initial_learning_rate.dtype
547 decay_steps = math_ops.cast(self.decay_steps, dtype)
548 decay_rate = math_ops.cast(self.decay_rate, dtype)
550 global_step_recomp = math_ops.cast(step, dtype)
551 p = global_step_recomp / decay_steps
552 if self.staircase:
553 p = math_ops.floor(p)
554 const = math_ops.cast(constant_op.constant(1), dtype)
555 denom = math_ops.add(const, math_ops.multiply(decay_rate, p))
556 return math_ops.divide(initial_learning_rate, denom, name=name)
558 def get_config(self):
559 return {
560 "initial_learning_rate": self.initial_learning_rate,
561 "decay_steps": self.decay_steps,
562 "decay_rate": self.decay_rate,
563 "staircase": self.staircase,
564 "name": self.name
565 }
568@keras_export("keras.optimizers.schedules.CosineDecay",
569 "keras.experimental.CosineDecay")
570class CosineDecay(LearningRateSchedule):
571 """A LearningRateSchedule that uses a cosine decay schedule.
573 See [Loshchilov & Hutter, ICLR2016](https://arxiv.org/abs/1608.03983),
574 SGDR: Stochastic Gradient Descent with Warm Restarts.
576 When training a model, it is often useful to lower the learning rate as
577 the training progresses. This schedule applies a cosine decay function
578 to an optimizer step, given a provided initial learning rate.
579 It requires a `step` value to compute the decayed learning rate. You can
580 just pass a TensorFlow variable that you increment at each training step.
582 The schedule is a 1-arg callable that produces a decayed learning
583 rate when passed the current optimizer step. This can be useful for changing
584 the learning rate value across different invocations of optimizer functions.
585 It is computed as:
587 ```python
588 def decayed_learning_rate(step):
589 step = min(step, decay_steps)
590 cosine_decay = 0.5 * (1 + cos(pi * step / decay_steps))
591 decayed = (1 - alpha) * cosine_decay + alpha
592 return initial_learning_rate * decayed
593 ```
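  In other words, the rate anneals from `initial_learning_rate` at step 0
  down to `alpha * initial_learning_rate` once `decay_steps` is reached
  (a small illustrative check of the formula above):

  ```python
  schedule = tf.keras.optimizers.schedules.CosineDecay(
      initial_learning_rate=0.1, decay_steps=1000, alpha=0.1)
  schedule(0)     # 0.1
  schedule(500)   # 0.055  (halfway: decayed = 0.5 * (1 - alpha) + alpha = 0.55)
  schedule(1000)  # 0.01   (= alpha * initial_learning_rate)
  ```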
595 Example usage:
596 ```python
597 decay_steps = 1000
598 lr_decayed_fn = tf.keras.optimizers.schedules.CosineDecay(
599 initial_learning_rate, decay_steps)
600 ```
602 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
603 as the learning rate. The learning rate schedule is also serializable and
604 deserializable using `tf.keras.optimizers.schedules.serialize` and
605 `tf.keras.optimizers.schedules.deserialize`.
607 Returns:
608 A 1-arg callable learning rate schedule that takes the current optimizer
609 step and outputs the decayed learning rate, a scalar `Tensor` of the same
610 type as `initial_learning_rate`.
611 """
613 def __init__(
614 self,
615 initial_learning_rate,
616 decay_steps,
617 alpha=0.0,
618 name=None):
619 """Applies cosine decay to the learning rate.
621 Args:
622 initial_learning_rate: A scalar `float32` or `float64` Tensor or a
623 Python number. The initial learning rate.
624 decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
625 Number of steps to decay over.
626 alpha: A scalar `float32` or `float64` Tensor or a Python number.
627 Minimum learning rate value as a fraction of initial_learning_rate.
628 name: String. Optional name of the operation. Defaults to 'CosineDecay'.
629 """
630 super(CosineDecay, self).__init__()
632 self.initial_learning_rate = initial_learning_rate
633 self.decay_steps = decay_steps
634 self.alpha = alpha
635 self.name = name
637 def __call__(self, step):
638 with ops.name_scope_v2(self.name or "CosineDecay"):
639 initial_learning_rate = (
640 tensor_conversion.convert_to_tensor_v2_with_dispatch(
641 self.initial_learning_rate, name="initial_learning_rate"
642 )
643 )
644 dtype = initial_learning_rate.dtype
645 decay_steps = math_ops.cast(self.decay_steps, dtype)
647 global_step_recomp = math_ops.cast(step, dtype)
648 global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
649 completed_fraction = global_step_recomp / decay_steps
650 cosine_decayed = 0.5 * (1.0 + math_ops.cos(
651 constant_op.constant(math.pi) * completed_fraction))
653 decayed = (1 - self.alpha) * cosine_decayed + self.alpha
654 return math_ops.multiply(initial_learning_rate, decayed)
656 def get_config(self):
657 return {
658 "initial_learning_rate": self.initial_learning_rate,
659 "decay_steps": self.decay_steps,
660 "alpha": self.alpha,
661 "name": self.name
662 }
665@keras_export("keras.optimizers.schedules.CosineDecayRestarts",
666 "keras.experimental.CosineDecayRestarts")
667class CosineDecayRestarts(LearningRateSchedule):
668 """A LearningRateSchedule that uses a cosine decay schedule with restarts.
670 See [Loshchilov & Hutter, ICLR2016](https://arxiv.org/abs/1608.03983),
671 SGDR: Stochastic Gradient Descent with Warm Restarts.
673 When training a model, it is often useful to lower the learning rate as
674 the training progresses. This schedule applies a cosine decay function with
675 restarts to an optimizer step, given a provided initial learning rate.
676 It requires a `step` value to compute the decayed learning rate. You can
677 just pass a TensorFlow variable that you increment at each training step.
679 The schedule is a 1-arg callable that produces a decayed learning
680 rate when passed the current optimizer step. This can be useful for changing
681 the learning rate value across different invocations of optimizer functions.
683 The learning rate multiplier first decays
684 from 1 to `alpha` for `first_decay_steps` steps. Then, a warm
685 restart is performed. Each new warm restart runs for `t_mul` times more
686 steps and with `m_mul` times smaller initial learning rate.
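  A small sketch of the restart structure implied above (illustrative
  numbers; `alpha=0` assumed for simplicity):

  ```python
  first_decay_steps, t_mul, m_mul = 1000, 2.0, 0.5
  start = 0
  for i in range(3):
    length = first_decay_steps * t_mul ** i
    print(f"period {i}: steps [{start}, {start + length}), "
          f"starting multiplier {m_mul ** i}")
    start += length
  # period 0: steps [0, 1000.0), starting multiplier 1.0
  # period 1: steps [1000.0, 3000.0), starting multiplier 0.5
  # period 2: steps [3000.0, 7000.0), starting multiplier 0.25
  ```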
688 Example usage:
689 ```python
690 first_decay_steps = 1000
691 lr_decayed_fn = (
692 tf.keras.optimizers.schedules.CosineDecayRestarts(
693 initial_learning_rate,
694 first_decay_steps))
695 ```
697 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
698 as the learning rate. The learning rate schedule is also serializable and
699 deserializable using `tf.keras.optimizers.schedules.serialize` and
700 `tf.keras.optimizers.schedules.deserialize`.
702 Returns:
703 A 1-arg callable learning rate schedule that takes the current optimizer
704 step and outputs the decayed learning rate, a scalar `Tensor` of the same
705 type as `initial_learning_rate`.
706 """
708 def __init__(
709 self,
710 initial_learning_rate,
711 first_decay_steps,
712 t_mul=2.0,
713 m_mul=1.0,
714 alpha=0.0,
715 name=None):
716 """Applies cosine decay with restarts to the learning rate.
718 Args:
719 initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python
720 number. The initial learning rate.
721 first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python
722 number. Number of steps to decay over.
723 t_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
724 Used to derive the number of iterations in the i-th period.
725 m_mul: A scalar `float32` or `float64` `Tensor` or a Python number.
726 Used to derive the initial learning rate of the i-th period.
727 alpha: A scalar `float32` or `float64` Tensor or a Python number.
728 Minimum learning rate value as a fraction of the initial_learning_rate.
729 name: String. Optional name of the operation. Defaults to 'SGDRDecay'.
730 """
731 super(CosineDecayRestarts, self).__init__()
733 self.initial_learning_rate = initial_learning_rate
734 self.first_decay_steps = first_decay_steps
735 self._t_mul = t_mul
736 self._m_mul = m_mul
737 self.alpha = alpha
738 self.name = name
740 def __call__(self, step):
741 with ops.name_scope_v2(self.name or "SGDRDecay") as name:
742 initial_learning_rate = (
743 tensor_conversion.convert_to_tensor_v2_with_dispatch(
744 self.initial_learning_rate, name="initial_learning_rate"
745 )
746 )
747 dtype = initial_learning_rate.dtype
748 first_decay_steps = math_ops.cast(self.first_decay_steps, dtype)
749 alpha = math_ops.cast(self.alpha, dtype)
750 t_mul = math_ops.cast(self._t_mul, dtype)
751 m_mul = math_ops.cast(self._m_mul, dtype)
753 global_step_recomp = math_ops.cast(step, dtype)
754 completed_fraction = global_step_recomp / first_decay_steps
756 def compute_step(completed_fraction, geometric=False):
757 """Helper for `cond` operation."""
758 if geometric:
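        # With t_mul != 1, the total length of the first i periods (in units
        # of first_decay_steps) is the geometric sum (1 - t_mul**i) / (1 - t_mul);
        # inverting that sum recovers the index of the current restart.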
759 i_restart = math_ops.floor(
760 math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) /
761 math_ops.log(t_mul))
763 sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul)
764 completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart
766 else:
767 i_restart = math_ops.floor(completed_fraction)
768 completed_fraction -= i_restart
770 return i_restart, completed_fraction
772 i_restart, completed_fraction = cond.cond(
773 math_ops.equal(t_mul, 1.0),
774 lambda: compute_step(completed_fraction, geometric=False),
775 lambda: compute_step(completed_fraction, geometric=True))
777 m_fac = m_mul**i_restart
778 cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos(
779 constant_op.constant(math.pi) * completed_fraction))
780 decayed = (1 - alpha) * cosine_decayed + alpha
782 return math_ops.multiply(initial_learning_rate, decayed, name=name)
784 def get_config(self):
785 return {
786 "initial_learning_rate": self.initial_learning_rate,
787 "first_decay_steps": self.first_decay_steps,
788 "t_mul": self._t_mul,
789 "m_mul": self._m_mul,
790 "alpha": self.alpha,
791 "name": self.name
792 }
795# Note: this code is still used by V1 APIs.
796class LinearCosineDecay(LearningRateSchedule):
797 """A LearningRateSchedule that uses a linear cosine decay schedule.
799 See [Bello et al., ICML2017] Neural Optimizer Search with RL.
800 https://arxiv.org/abs/1709.07417
802 For the idea of warm starts here controlled by `num_periods`,
803 see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
804 with Warm Restarts. https://arxiv.org/abs/1608.03983
806 Note that linear cosine decay is more aggressive than cosine decay and
807 larger initial learning rates can typically be used.
809 When training a model, it is often recommended to lower the learning rate as
810 the training progresses. This schedule applies a linear cosine decay
811 function to an optimizer step, given a provided initial learning rate.
812 It requires a `step` value to compute the decayed learning rate. You can
813 just pass a TensorFlow variable that you increment at each training step.
815 The schedule is a 1-arg callable that produces a decayed learning
816 rate when passed the current optimizer step. This can be useful for changing
817 the learning rate value across different invocations of optimizer functions.
818 It is computed as:
820 ```python
821 def decayed_learning_rate(step):
822 step = min(step, decay_steps)
823 linear_decay = (decay_steps - step) / decay_steps
824 cosine_decay = 0.5 * (
825 1 + cos(pi * 2 * num_periods * step / decay_steps))
826 decayed = (alpha + linear_decay) * cosine_decay + beta
827 return initial_learning_rate * decayed
828 ```
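  With the default `num_periods=0.5`, `alpha=0.0` and `beta=0.001`, the
  multiplier runs from roughly 1.0 at step 0 down to `beta` at `decay_steps`
  (a quick illustrative check of the formula above):

  ```python
  schedule = tf.keras.experimental.LinearCosineDecay(
      initial_learning_rate=0.1, decay_steps=1000)
  schedule(0)     # ~0.1001  (decayed = (0 + 1) * 1 + 0.001)
  schedule(1000)  # 0.0001   (decayed = (0 + 0) * 0 + 0.001)
  ```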
830 Example usage:
831 ```python
832 decay_steps = 1000
833 lr_decayed_fn = (
834 tf.keras.experimental.LinearCosineDecay(
835 initial_learning_rate, decay_steps))
836 ```
838 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
839 as the learning rate. The learning rate schedule is also serializable and
840 deserializable using `tf.keras.optimizers.schedules.serialize` and
841 `tf.keras.optimizers.schedules.deserialize`.
843 Returns:
844 A 1-arg callable learning rate schedule that takes the current optimizer
845 step and outputs the decayed learning rate, a scalar `Tensor` of the same
846 type as `initial_learning_rate`.
847 """
849 def __init__(
850 self,
851 initial_learning_rate,
852 decay_steps,
853 num_periods=0.5,
854 alpha=0.0,
855 beta=0.001,
856 name=None):
857 """Applies linear cosine decay to the learning rate.
859 Args:
860 initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python
861 number. The initial learning rate.
862 decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
863 Number of steps to decay over.
864 num_periods: Number of periods in the cosine part of the decay.
865 See computation above.
866 alpha: See computation above.
867 beta: See computation above.
868 name: String. Optional name of the operation. Defaults to
869 'LinearCosineDecay'.
870 """
871 super(LinearCosineDecay, self).__init__()
873 self.initial_learning_rate = initial_learning_rate
874 self.decay_steps = decay_steps
875 self.num_periods = num_periods
876 self.alpha = alpha
877 self.beta = beta
878 self.name = name
880 def __call__(self, step):
881 with ops.name_scope_v2(self.name or "LinearCosineDecay") as name:
882 initial_learning_rate = (
883 tensor_conversion.convert_to_tensor_v2_with_dispatch(
884 self.initial_learning_rate, name="initial_learning_rate"
885 )
886 )
887 dtype = initial_learning_rate.dtype
888 decay_steps = math_ops.cast(self.decay_steps, dtype)
889 num_periods = math_ops.cast(self.num_periods, dtype)
890 alpha = math_ops.cast(self.alpha, dtype)
891 beta = math_ops.cast(self.beta, dtype)
893 global_step_recomp = math_ops.cast(step, dtype)
894 global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
895 linear_decayed = (decay_steps - global_step_recomp) / decay_steps
896 completed_fraction = global_step_recomp / decay_steps
897 fraction = 2.0 * num_periods * completed_fraction
898 cosine_decayed = 0.5 * (
899 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
901 linear_cosine_decayed = (alpha + linear_decayed) * cosine_decayed + beta
902 return math_ops.multiply(initial_learning_rate, linear_cosine_decayed,
903 name=name)
905 def get_config(self):
906 return {
907 "initial_learning_rate": self.initial_learning_rate,
908 "decay_steps": self.decay_steps,
909 "num_periods": self.num_periods,
910 "alpha": self.alpha,
911 "beta": self.beta,
912 "name": self.name
913 }
916# Note: this code is still used by V1 APIs.
917class NoisyLinearCosineDecay(LearningRateSchedule):
918 """A LearningRateSchedule that uses a noisy linear cosine decay schedule.
920 See [Bello et al., ICML2017] Neural Optimizer Search with RL.
921 https://arxiv.org/abs/1709.07417
923 For the idea of warm starts here controlled by `num_periods`,
924 see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
925 with Warm Restarts. https://arxiv.org/abs/1608.03983
927 Note that linear cosine decay is more aggressive than cosine decay and
928 larger initial learning rates can typically be used.
930 When training a model, it is often recommended to lower the learning rate as
931 the training progresses. This schedule applies a noisy linear cosine decay
932 function to an optimizer step, given a provided initial learning rate.
933 It requires a `step` value to compute the decayed learning rate. You can
934 just pass a TensorFlow variable that you increment at each training step.
936 The schedule is a 1-arg callable that produces a decayed learning
937 rate when passed the current optimizer step. This can be useful for changing
938 the learning rate value across different invocations of optimizer functions.
939 It is computed as:
941 ```python
942 def decayed_learning_rate(step):
943 step = min(step, decay_steps)
944 linear_decay = (decay_steps - step) / decay_steps
945 cosine_decay = 0.5 * (
946 1 + cos(pi * 2 * num_periods * step / decay_steps))
947 decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta
948 return initial_learning_rate * decayed
949 ```
950 where `eps_t` is 0-centered Gaussian noise with variance
951 `initial_variance / (1 + global_step) ** variance_decay`.
953 Example usage:
954 ```python
955 decay_steps = 1000
956 lr_decayed_fn = (
957 tf.keras.experimental.NoisyLinearCosineDecay(
958 initial_learning_rate, decay_steps))
959 ```
961 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
962 as the learning rate. The learning rate schedule is also serializable and
963 deserializable using `tf.keras.optimizers.schedules.serialize` and
964 `tf.keras.optimizers.schedules.deserialize`.
966 Returns:
967 A 1-arg callable learning rate schedule that takes the current optimizer
968 step and outputs the decayed learning rate, a scalar `Tensor` of the same
969 type as `initial_learning_rate`.
970 """
972 def __init__(
973 self,
974 initial_learning_rate,
975 decay_steps,
976 initial_variance=1.0,
977 variance_decay=0.55,
978 num_periods=0.5,
979 alpha=0.0,
980 beta=0.001,
981 name=None):
982 """Applies noisy linear cosine decay to the learning rate.
984 Args:
985 initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python
986 number. The initial learning rate.
987 decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
988 Number of steps to decay over.
989 initial_variance: initial variance for the noise. See computation above.
990 variance_decay: decay for the noise's variance. See computation above.
991 num_periods: Number of periods in the cosine part of the decay.
992 See computation above.
993 alpha: See computation above.
994 beta: See computation above.
995 name: String. Optional name of the operation. Defaults to
996 'NoisyLinearCosineDecay'.
997 """
998 super(NoisyLinearCosineDecay, self).__init__()
1000 self.initial_learning_rate = initial_learning_rate
1001 self.decay_steps = decay_steps
1002 self.initial_variance = initial_variance
1003 self.variance_decay = variance_decay
1004 self.num_periods = num_periods
1005 self.alpha = alpha
1006 self.beta = beta
1007 self.name = name
1009 def __call__(self, step):
1010 with ops.name_scope_v2(self.name or "NoisyLinearCosineDecay") as name:
1011 initial_learning_rate = (
1012 tensor_conversion.convert_to_tensor_v2_with_dispatch(
1013 self.initial_learning_rate, name="initial_learning_rate"
1014 )
1015 )
1016 dtype = initial_learning_rate.dtype
1017 decay_steps = math_ops.cast(self.decay_steps, dtype)
1018 initial_variance = math_ops.cast(self.initial_variance, dtype)
1019 variance_decay = math_ops.cast(self.variance_decay, dtype)
1020 num_periods = math_ops.cast(self.num_periods, dtype)
1021 alpha = math_ops.cast(self.alpha, dtype)
1022 beta = math_ops.cast(self.beta, dtype)
1024 global_step_recomp = math_ops.cast(step, dtype)
1025 global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps)
1026 linear_decayed = (decay_steps - global_step_recomp) / decay_steps
1027 variance = initial_variance / (
1028 math_ops.pow(1.0 + global_step_recomp, variance_decay))
1029 std = math_ops.sqrt(variance)
1030 noisy_linear_decayed = (
1031 linear_decayed + random_ops.random_normal(
1032 linear_decayed.shape, stddev=std))
1034 completed_fraction = global_step_recomp / decay_steps
1035 fraction = 2.0 * num_periods * completed_fraction
1036 cosine_decayed = 0.5 * (
1037 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction))
1038 noisy_linear_cosine_decayed = (
1039 (alpha + noisy_linear_decayed) * cosine_decayed + beta)
1041 return math_ops.multiply(
1042 initial_learning_rate, noisy_linear_cosine_decayed, name=name)
1044 def get_config(self):
1045 return {
1046 "initial_learning_rate": self.initial_learning_rate,
1047 "decay_steps": self.decay_steps,
1048 "initial_variance": self.initial_variance,
1049 "variance_decay": self.variance_decay,
1050 "num_periods": self.num_periods,
1051 "alpha": self.alpha,
1052 "beta": self.beta,
1053 "name": self.name
1054 }
1057@keras_export("keras.optimizers.schedules.serialize")
1058def serialize(learning_rate_schedule):
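  """Serializes a `LearningRateSchedule` for saving; see also `deserialize`."""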
1059 return generic_utils.serialize_keras_object(learning_rate_schedule)
1062@keras_export("keras.optimizers.schedules.deserialize")
1063def deserialize(config, custom_objects=None):
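  """Instantiates a `LearningRateSchedule` object from a serialized config.

  Args:
    config: The output of `serialize`.
    custom_objects: Optional dict mapping names to custom (user-defined)
      `LearningRateSchedule` classes.
  """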
1064 return generic_utils.deserialize_keras_object(
1065 config,
1066 module_objects=globals(),
1067 custom_objects=custom_objects,
1068 printable_module_name="decay")