Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/keras/src/optimizers/schedules/learning_rate

3# Licensed under the Apache License, Version 2.0 (the "License");

4# you may not use this file except in compliance with the License.

5# You may obtain a copy of the License at

7# http://www.apache.org/licenses/LICENSE-2.0

9# Unless required by applicable law or agreed to in writing, software

10# distributed under the License is distributed on an "AS IS" BASIS,

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

12# See the License for the specific language governing permissions and

13# limitations under the License.

14# ==============================================================================

15"""Various learning rate schedule functions."""

17import abc

18import math

20import tensorflow.compat.v2 as tf

22from keras.src import backend

23from keras.src.saving import serialization_lib

24from keras.src.saving.legacy import serialization as legacy_serialization

26# isort: off

27from tensorflow.python.util.tf_export import keras_export

@keras_export("keras.optimizers.schedules.LearningRateSchedule")
class LearningRateSchedule:
    """Base class for objects that map a training step to a learning rate.

    A schedule lets the learning rate of an optimizer vary over the course
    of training. Ready-made schedules include
    `tf.keras.optimizers.schedules.ExponentialDecay` and
    `tf.keras.optimizers.schedules.PiecewiseConstantDecay`:

    ```python
    lr_schedule = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=1e-2,
        decay_steps=10000,
        decay_rate=0.9)
    optimizer = keras.optimizers.SGD(learning_rate=lr_schedule)
    ```

    Any optimizer accepts a `LearningRateSchedule` instance as its
    `learning_rate` argument.

    To write a custom schedule, subclass this class and implement
    `__call__`, which receives `step` (a scalar integer tensor holding the
    current training step count). As with any other Keras object, the
    schedule can optionally be made serializable by also implementing
    `get_config` and `from_config`.

    Example:

    ```python
    class MyLRSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):

        def __init__(self, initial_learning_rate):
            self.initial_learning_rate = initial_learning_rate

        def __call__(self, step):
            return self.initial_learning_rate / (step + 1)

    optimizer = tf.keras.optimizers.SGD(learning_rate=MyLRSchedule(0.1))
    ```
    """

    # NOTE: this class does not use `abc.ABCMeta`, so `abc.abstractmethod`
    # below is only a marker; subclasses that skip an override can still be
    # instantiated and hit the `NotImplementedError` at call time instead.

    @abc.abstractmethod
    def __call__(self, step):
        """Return the learning rate for `step`; subclasses must override.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        schedule_name = self.__class__.__name__
        message = (
            f"Learning rate schedule '{schedule_name}' "
            "must override `__call__(self, step)`."
        )
        raise NotImplementedError(message)

    @abc.abstractmethod
    def get_config(self):
        """Return the constructor arguments as a dict; subclasses override.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        schedule_name = self.__class__.__name__
        message = (
            f"Learning rate schedule '{schedule_name}' "
            "must override `get_config()` in order to be serializable."
        )
        raise NotImplementedError(message)

    @classmethod
    def from_config(cls, config):
        """Build a schedule by passing `config` as keyword arguments.

        Args:
            config: Output of `get_config()`.

        Returns:
            A `LearningRateSchedule` instance.
        """
        schedule = cls(**config)
        return schedule

100

@keras_export("keras.optimizers.schedules.ExponentialDecay")
class ExponentialDecay(LearningRateSchedule):
    """A LearningRateSchedule that decays the learning rate exponentially.

    Lowering the learning rate as training progresses is often useful. This
    schedule multiplies a given initial learning rate by an exponential
    decay factor that depends on the optimizer step.

    The schedule is a 1-arg callable: pass it the current optimizer step and
    it returns the decayed learning rate, so the rate can differ between
    invocations of optimizer functions. It is computed as:

    ```python
    def decayed_learning_rate(step):
        return initial_learning_rate * decay_rate ** (step / decay_steps)
    ```

    If `staircase` is `True`, `step / decay_steps` is floored to a whole
    number, so the decayed learning rate follows a staircase function.

    The schedule can be passed directly to a `tf.keras.optimizers.Optimizer`
    as the learning rate.
    Example: When fitting a Keras model, decay every 100000 steps with a
    base of 0.96:

    ```python
    initial_learning_rate = 0.1
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate,
        decay_steps=100000,
        decay_rate=0.96,
        staircase=True)

    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=lr_schedule),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    model.fit(data, labels, epochs=5)
    ```

    The learning rate schedule is also serializable and deserializable using
    `tf.keras.optimizers.schedules.serialize` and
    `tf.keras.optimizers.schedules.deserialize`.

    Returns:
      A 1-arg callable learning rate schedule that takes the current
      optimizer step and outputs the decayed learning rate, a scalar
      `Tensor` of the same type as `initial_learning_rate`.
    """

    def __init__(
        self,
        initial_learning_rate,
        decay_steps,
        decay_rate,
        staircase=False,
        name=None,
    ):
        """Stores the parameters of the exponential decay.

        Args:
          initial_learning_rate: A scalar `float32` or `float64` `Tensor` or
            a Python number. The initial learning rate.
          decay_steps: A scalar `int32` or `int64` `Tensor` or a Python
            number. Must be positive. See the decay computation above.
          decay_rate: A scalar `float32` or `float64` `Tensor` or a
            Python number. The decay rate.
          staircase: Boolean. If `True`, decay the learning rate at discrete
            intervals.
          name: String. Optional name of the operation. Defaults to
            'ExponentialDecay'.
        """
        super().__init__()
        self.initial_learning_rate = initial_learning_rate
        self.decay_steps = decay_steps
        self.decay_rate = decay_rate
        self.staircase = staircase
        self.name = name

    def __call__(self, step):
        """Returns the decayed learning rate for optimizer step `step`."""
        with tf.name_scope(self.name or "ExponentialDecay") as scope_name:
            base_lr = tf.convert_to_tensor(
                self.initial_learning_rate, name="initial_learning_rate"
            )
            # Every operand is brought to the dtype of the initial rate.
            lr_dtype = base_lr.dtype
            steps_per_decay = tf.cast(self.decay_steps, lr_dtype)
            rate = tf.cast(self.decay_rate, lr_dtype)

            current_step = tf.cast(step, lr_dtype)
            exponent = current_step / steps_per_decay
            if self.staircase:
                # Whole decay periods only: the rate changes in steps.
                exponent = tf.floor(exponent)
            decay_factor = tf.pow(rate, exponent)
            return tf.multiply(base_lr, decay_factor, name=scope_name)

    def get_config(self):
        """Returns the constructor arguments as a dict."""
        config = {}
        config["initial_learning_rate"] = self.initial_learning_rate
        config["decay_steps"] = self.decay_steps
        config["decay_rate"] = self.decay_rate
        config["staircase"] = self.staircase
        config["name"] = self.name
        return config

207

208

@keras_export("keras.optimizers.schedules.PiecewiseConstantDecay")
class PiecewiseConstantDecay(LearningRateSchedule):
    """A LearningRateSchedule that uses a piecewise constant decay schedule.

    The function returns a 1-arg callable to compute the piecewise constant
    when passed the current optimizer step. This can be useful for changing
    the learning rate value across different invocations of optimizer
    functions.

    Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5
      for the next 10000 steps, and 0.1 for any additional steps.

    ```python
    step = tf.Variable(0, trainable=False)
    boundaries = [100000, 110000]
    values = [1.0, 0.5, 0.1]
    learning_rate_fn = keras.optimizers.schedules.PiecewiseConstantDecay(
        boundaries, values)

    # Later, whenever we perform an optimization step, we pass in the step.
    learning_rate = learning_rate_fn(step)
    ```

    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
    as the learning rate. The learning rate schedule is also serializable and
    deserializable using `tf.keras.optimizers.schedules.serialize` and
    `tf.keras.optimizers.schedules.deserialize`.

    Returns:
      A 1-arg callable learning rate schedule that takes the current
      optimizer step and outputs the learning rate for that step, which is
      one of the entries of `values` converted to a `Tensor`.

      The output of the 1-arg function that takes the `step`
      is `values[0]` when `step <= boundaries[0]`,
      `values[1]` when `step > boundaries[0]` and `step <= boundaries[1]`,
      ..., and values[-1] when `step > boundaries[-1]`. When `boundaries` is
      empty the output is always `values[0]`.
    """

    def __init__(self, boundaries, values, name=None):
        """Piecewise constant from boundaries and interval values.

        Args:
          boundaries: A list of `Tensor`s or `int`s or `float`s with strictly
            increasing entries. Entries whose dtype differs from that of the
            optimizer step are cast to the step's dtype when the schedule is
            called.
          values: A list of `Tensor`s or `float`s or `int`s that specifies
            the values for the intervals defined by `boundaries`. It should
            have one more element than `boundaries`, and all elements should
            have the same type.
          name: A string. Optional name of the operation. Defaults to
            'PiecewiseConstant'.

        Raises:
          ValueError: if the number of elements in the lists do not match.
        """
        super().__init__()

        if len(boundaries) != len(values) - 1:
            raise ValueError(
                "The length of boundaries should be 1 less than the length of "
                f"values. Received: boundaries={boundaries} of length "
                f"{len(boundaries)}, and values={values} "
                f"of length {len(values)}."
            )

        self.boundaries = boundaries
        self.values = values
        self.name = name

    def __call__(self, step):
        """Returns the entry of `values` whose interval contains `step`."""
        with tf.name_scope(self.name or "PiecewiseConstant"):
            boundaries = tf.nest.map_structure(
                tf.convert_to_tensor, tf.nest.flatten(self.boundaries)
            )
            values = tf.nest.map_structure(
                tf.convert_to_tensor, tf.nest.flatten(self.values)
            )
            if not boundaries:
                # The constructor accepts zero boundaries with one value.
                # There is then a single interval covering every step, and
                # indexing `boundaries[0]` below would raise `IndexError`.
                return values[0]

            x_recomp = tf.convert_to_tensor(step)
            for i, b in enumerate(boundaries):
                if b.dtype.base_dtype != x_recomp.dtype.base_dtype:
                    # We cast the boundaries to have the same type as the
                    # step so the comparisons below are well-typed.
                    b = tf.cast(b, x_recomp.dtype.base_dtype)
                    boundaries[i] = b

            # First and last intervals are open-ended on one side.
            pred_fn_pairs = []
            pred_fn_pairs.append((x_recomp <= boundaries[0], lambda: values[0]))
            pred_fn_pairs.append(
                (x_recomp > boundaries[-1], lambda: values[-1])
            )
            # Interior intervals are (low, high].
            for low, high, v in zip(
                boundaries[:-1], boundaries[1:], values[1:-1]
            ):
                # Need to bind v here; can do this with lambda v=v: ...
                pred = (x_recomp > low) & (x_recomp <= high)
                pred_fn_pairs.append((pred, lambda v=v: v))

            # The default isn't needed here because our conditions are
            # mutually exclusive and exhaustive, but tf.case requires it.
            default = lambda: values[0]
            return tf.case(pred_fn_pairs, default, exclusive=True)

    def get_config(self):
        """Returns the constructor arguments as a dict."""
        return {
            "boundaries": self.boundaries,
            "values": self.values,
            "name": self.name,
        }

315

316

@keras_export("keras.optimizers.schedules.PolynomialDecay")
class PolynomialDecay(LearningRateSchedule):
    """A LearningRateSchedule that uses a polynomial decay schedule.

    It is commonly observed that a monotonically decreasing learning rate,
    whose degree of change is carefully chosen, results in a better
    performing model. This schedule applies a polynomial decay function to
    an optimizer step, given a provided `initial_learning_rate`, to reach an
    `end_learning_rate` in the given `decay_steps`.

    It requires a `step` value to compute the decayed learning rate. You
    can just pass a TensorFlow variable that you increment at each training
    step.

    The schedule is a 1-arg callable that produces a decayed learning rate
    when passed the current optimizer step. This can be useful for changing
    the learning rate value across different invocations of optimizer
    functions. It is computed as:

    ```python
    def decayed_learning_rate(step):
        step = min(step, decay_steps)
        return ((initial_learning_rate - end_learning_rate) *
                (1 - step / decay_steps) ** power
               ) + end_learning_rate
    ```

    If `cycle` is True then a multiple of `decay_steps` is used, the first
    one that is not smaller than `step` (or `decay_steps` itself when `step`
    is zero).

    ```python
    def decayed_learning_rate(step):
        decay_steps = decay_steps * ceil(step / decay_steps)
        return ((initial_learning_rate - end_learning_rate) *
                (1 - step / decay_steps) ** power
               ) + end_learning_rate
    ```

    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
    as the learning rate.
    Example: Fit a model while decaying from 0.1 to 0.01 in 10000 steps using
    sqrt (i.e. power=0.5):

    ```python
    ...
    starter_learning_rate = 0.1
    end_learning_rate = 0.01
    decay_steps = 10000
    learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
        starter_learning_rate,
        decay_steps,
        end_learning_rate,
        power=0.5)

    model.compile(optimizer=tf.keras.optimizers.SGD(
                      learning_rate=learning_rate_fn),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    model.fit(data, labels, epochs=5)
    ```

    The learning rate schedule is also serializable and deserializable using
    `tf.keras.optimizers.schedules.serialize` and
    `tf.keras.optimizers.schedules.deserialize`.

    Returns:
      A 1-arg callable learning rate schedule that takes the current
      optimizer step and outputs the decayed learning rate, a scalar
      `Tensor` of the same type as `initial_learning_rate`.
    """

    def __init__(
        self,
        initial_learning_rate,
        decay_steps,
        end_learning_rate=0.0001,
        power=1.0,
        cycle=False,
        name=None,
    ):
        """Applies a polynomial decay to the learning rate.

        Args:
          initial_learning_rate: A scalar `float32` or `float64` `Tensor` or
            a Python number. The initial learning rate.
          decay_steps: A scalar `int32` or `int64` `Tensor` or a Python
            number. Must be positive. See the decay computation above.
          end_learning_rate: A scalar `float32` or `float64` `Tensor` or a
            Python number. The minimal end learning rate.
          power: A scalar `float32` or `float64` `Tensor` or a
            Python number. The power of the polynomial. Defaults to `1.0`.
          cycle: A boolean, whether it should cycle beyond decay_steps.
          name: String. Optional name of the operation. Defaults to
            'PolynomialDecay'.
        """
        super().__init__()

        self.initial_learning_rate = initial_learning_rate
        self.decay_steps = decay_steps
        self.end_learning_rate = end_learning_rate
        self.power = power
        self.cycle = cycle
        self.name = name

    def __call__(self, step):
        """Returns the decayed learning rate for optimizer step `step`."""
        with tf.name_scope(self.name or "PolynomialDecay") as name:
            initial_learning_rate = tf.convert_to_tensor(
                self.initial_learning_rate, name="initial_learning_rate"
            )
            # Every operand is brought to the dtype of the initial rate.
            dtype = initial_learning_rate.dtype
            end_learning_rate = tf.cast(self.end_learning_rate, dtype)
            power = tf.cast(self.power, dtype)

            global_step_recomp = tf.cast(step, dtype)
            decay_steps_recomp = tf.cast(self.decay_steps, dtype)
            if self.cycle:
                # Find the first multiple of decay_steps that is not smaller
                # than global_step. If global_step is zero set the
                # multiplier to 1. The division uses the cast
                # `decay_steps_recomp` (not the raw `self.decay_steps`) so an
                # integer-Tensor `decay_steps` does not cause a dtype
                # mismatch against the floating-point step.
                multiplier = tf.where(
                    tf.equal(global_step_recomp, 0),
                    1.0,
                    tf.math.ceil(global_step_recomp / decay_steps_recomp),
                )
                decay_steps_recomp = tf.multiply(decay_steps_recomp, multiplier)
            else:
                # Make sure that the global_step used is not bigger than
                # decay_steps.
                global_step_recomp = tf.minimum(
                    global_step_recomp, decay_steps_recomp
                )

            # Fraction of the (possibly extended) decay period completed.
            p = tf.divide(global_step_recomp, decay_steps_recomp)
            return tf.add(
                tf.multiply(
                    initial_learning_rate - end_learning_rate,
                    tf.pow(1 - p, power),
                ),
                end_learning_rate,
                name=name,
            )

    def get_config(self):
        """Returns the constructor arguments as a dict."""
        return {
            "initial_learning_rate": self.initial_learning_rate,
            "decay_steps": self.decay_steps,
            "end_learning_rate": self.end_learning_rate,
            "power": self.power,
            "cycle": self.cycle,
            "name": self.name,
        }

468

469

@keras_export("keras.optimizers.schedules.InverseTimeDecay")
class InverseTimeDecay(LearningRateSchedule):
    """A LearningRateSchedule that decays the learning rate as 1/time.

    Lowering the learning rate as training progresses is often useful. This
    schedule applies the inverse decay function to an optimizer step, given
    a provided initial learning rate. It needs a `step` value to compute the
    decayed learning rate; a TensorFlow variable that is incremented at each
    training step can be passed for that purpose.

    The schedule is a 1-arg callable: pass it the current optimizer step and
    it returns the decayed learning rate, so the rate can differ between
    invocations of optimizer functions. It is computed as:

    ```python
    def decayed_learning_rate(step):
        return initial_learning_rate / (1 + decay_rate * step / decay_steps)
    ```

    or, if `staircase` is `True`, as:

    ```python
    def decayed_learning_rate(step):
        return initial_learning_rate / (
            1 + decay_rate * floor(step / decay_steps))
    ```

    The schedule can be passed directly to a `tf.keras.optimizers.Optimizer`
    as the learning rate.
    Example: Fit a Keras model when decaying 1/t with a rate of 0.5:

    ```python
    ...
    initial_learning_rate = 0.1
    decay_steps = 1.0
    decay_rate = 0.5
    learning_rate_fn = keras.optimizers.schedules.InverseTimeDecay(
        initial_learning_rate, decay_steps, decay_rate)

    model.compile(optimizer=tf.keras.optimizers.SGD(
                      learning_rate=learning_rate_fn),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    model.fit(data, labels, epochs=5)
    ```

    Returns:
      A 1-arg callable learning rate schedule that takes the current
      optimizer step and outputs the decayed learning rate, a scalar
      `Tensor` of the same type as `initial_learning_rate`.
    """

    def __init__(
        self,
        initial_learning_rate,
        decay_steps,
        decay_rate,
        staircase=False,
        name=None,
    ):
        """Stores the parameters of the inverse time decay.

        Args:
          initial_learning_rate: A scalar `float32` or `float64` `Tensor` or
            a Python number. The initial learning rate.
          decay_steps: How often to apply decay.
          decay_rate: A Python number. The decay rate.
          staircase: Whether to apply decay in a discrete staircase, as
            opposed to continuous, fashion.
          name: String. Optional name of the operation. Defaults to
            'InverseTimeDecay'.
        """
        super().__init__()

        self.initial_learning_rate = initial_learning_rate
        self.decay_steps = decay_steps
        self.decay_rate = decay_rate
        self.staircase = staircase
        self.name = name

    def __call__(self, step):
        """Returns the decayed learning rate for optimizer step `step`."""
        with tf.name_scope(self.name or "InverseTimeDecay") as scope_name:
            base_lr = tf.convert_to_tensor(
                self.initial_learning_rate, name="initial_learning_rate"
            )
            # Every operand is brought to the dtype of the initial rate.
            lr_dtype = base_lr.dtype
            steps_per_decay = tf.cast(self.decay_steps, lr_dtype)
            rate = tf.cast(self.decay_rate, lr_dtype)

            current_step = tf.cast(step, lr_dtype)
            periods = current_step / steps_per_decay
            if self.staircase:
                # Count whole decay periods only.
                periods = tf.floor(periods)
            one = tf.cast(tf.constant(1), lr_dtype)
            scaled_periods = tf.multiply(rate, periods)
            divisor = tf.add(one, scaled_periods)
            return tf.divide(base_lr, divisor, name=scope_name)

    def get_config(self):
        """Returns the constructor arguments as a dict."""
        config = {}
        config["initial_learning_rate"] = self.initial_learning_rate
        config["decay_steps"] = self.decay_steps
        config["decay_rate"] = self.decay_rate
        config["staircase"] = self.staircase
        config["name"] = self.name
        return config

576

577

@keras_export(
    "keras.optimizers.schedules.CosineDecay", "keras.experimental.CosineDecay"
)
class CosineDecay(LearningRateSchedule):
    """A LearningRateSchedule that uses a cosine decay with optional warmup.

    See [Loshchilov & Hutter, ICLR2016](https://arxiv.org/abs/1608.03983),
    SGDR: Stochastic Gradient Descent with Warm Restarts.

    For the idea of a linear warmup of our learning rate,
    see [Goyal et al.](https://arxiv.org/pdf/1706.02677.pdf).

    When we begin training a model, we often want an initial increase in our
    learning rate followed by a decay. If `warmup_target` is not None, this
    schedule applies a linear increase per optimizer step to our learning
    rate from `initial_learning_rate` to `warmup_target` for a duration of
    `warmup_steps`. Afterwards, it applies a cosine decay function taking our
    learning rate from `warmup_target` to `alpha * warmup_target` for a
    duration of `decay_steps`. If `warmup_target` is None we skip warmup and
    our decay will take our learning rate from `initial_learning_rate` to
    `alpha * initial_learning_rate`.
    It requires a `step` value to compute the learning rate. You can
    just pass a TensorFlow variable that you increment at each training step.

    The schedule is a 1-arg callable that produces a warmup followed by a
    decayed learning rate when passed the current optimizer step. This can be
    useful for changing the learning rate value across different invocations
    of optimizer functions.

    Our warmup is computed as:

    ```python
    def warmup_learning_rate(step):
        completed_fraction = step / warmup_steps
        total_delta = warmup_target - initial_learning_rate
        return completed_fraction * total_delta + initial_learning_rate
    ```

    And our decay is computed as:

    ```python
    if warmup_target is None:
        initial_decay_lr = initial_learning_rate
    else:
        initial_decay_lr = warmup_target

    def decayed_learning_rate(step):
        step = min(step, decay_steps)
        cosine_decay = 0.5 * (1 + cos(pi * step / decay_steps))
        decayed = (1 - alpha) * cosine_decay + alpha
        return initial_decay_lr * decayed
    ```

    Example usage without warmup:

    ```python
    decay_steps = 1000
    initial_learning_rate = 0.1
    lr_decayed_fn = tf.keras.optimizers.schedules.CosineDecay(
        initial_learning_rate, decay_steps)
    ```

    Example usage with warmup:

    ```python
    decay_steps = 1000
    initial_learning_rate = 0
    warmup_steps = 1000
    target_learning_rate = 0.1
    lr_warmup_decayed_fn = tf.keras.optimizers.schedules.CosineDecay(
        initial_learning_rate, decay_steps, warmup_target=target_learning_rate,
        warmup_steps=warmup_steps
    )
    ```

    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
    as the learning rate. The learning rate schedule is also serializable and
    deserializable using `tf.keras.optimizers.schedules.serialize` and
    `tf.keras.optimizers.schedules.deserialize`.

    Returns:
      A 1-arg callable learning rate schedule that takes the current
      optimizer step and outputs the decayed learning rate, a scalar
      `Tensor` of the same type as `initial_learning_rate`.
    """

    def __init__(
        self,
        initial_learning_rate,
        decay_steps,
        alpha=0.0,
        name=None,
        warmup_target=None,
        warmup_steps=0,
    ):
        """Applies cosine decay to the learning rate.

        Args:
          initial_learning_rate: A scalar `float32` or `float64` `Tensor` or
            a Python int. The initial learning rate.
          decay_steps: A scalar `int32` or `int64` `Tensor` or a Python int.
            Number of steps to decay over.
          alpha: A scalar `float32` or `float64` `Tensor` or a Python int.
            Minimum learning rate value for decay as a fraction of
            `initial_learning_rate`. Cast to the `initial_learning_rate`
            datatype when the schedule is called.
          name: String. Optional name of the operation.  Defaults to
            'CosineDecay'.
          warmup_target: None or a scalar `float32` or `float64` `Tensor` or
            a Python int. The target learning rate for our warmup phase. Will
            cast to the `initial_learning_rate` datatype. Setting to None
            will skip warmup and begins decay phase from
            `initial_learning_rate`. Otherwise scheduler will warmup from
            `initial_learning_rate` to `warmup_target`.
          warmup_steps: A scalar `int32` or `int64` `Tensor` or a Python int.
            Number of steps to warmup over.
        """
        super().__init__()

        self.initial_learning_rate = initial_learning_rate
        self.decay_steps = decay_steps
        self.alpha = alpha
        self.name = name
        self.warmup_steps = warmup_steps
        self.warmup_target = warmup_target

    def _decay_function(self, step, decay_steps, decay_from_lr, dtype):
        """Cosine-decays `decay_from_lr` by the fraction `step / decay_steps`.

        Args:
          step: Step count within the decay phase, already cast to `dtype`.
          decay_steps: Length of the decay phase, already cast to `dtype`.
          decay_from_lr: Learning rate at the start of the decay phase.
          dtype: Dtype used for the constants created here.

        Returns:
          The decayed learning rate tensor.
        """
        with tf.name_scope(self.name or "CosineDecay"):
            completed_fraction = step / decay_steps
            tf_pi = tf.constant(math.pi, dtype=dtype)
            cosine_decayed = 0.5 * (1.0 + tf.cos(tf_pi * completed_fraction))
            # Cast alpha like the other parameters so a Tensor `alpha` whose
            # dtype differs from the learning rate does not break the
            # arithmetic below.
            alpha = tf.cast(self.alpha, dtype)
            decayed = (1 - alpha) * cosine_decayed + alpha
            return tf.multiply(decay_from_lr, decayed)

    def _warmup_function(
        self, step, warmup_steps, warmup_target, initial_learning_rate
    ):
        """Linearly interpolates from the initial rate to `warmup_target`.

        Args:
          step: Current step count.
          warmup_steps: Length of the warmup phase.
          warmup_target: Learning rate reached at the end of warmup.
          initial_learning_rate: Learning rate at step 0.

        Returns:
          The warmup learning rate tensor.
        """
        with tf.name_scope(self.name or "CosineDecay"):
            completed_fraction = step / warmup_steps
            total_step_delta = warmup_target - initial_learning_rate
            return total_step_delta * completed_fraction + initial_learning_rate

    def __call__(self, step):
        """Returns the learning rate for optimizer step `step`."""
        with tf.name_scope(self.name or "CosineDecay"):
            initial_learning_rate = tf.convert_to_tensor(
                self.initial_learning_rate, name="initial_learning_rate"
            )
            # Every operand is brought to the dtype of the initial rate.
            dtype = initial_learning_rate.dtype
            decay_steps = tf.cast(self.decay_steps, dtype)
            global_step_recomp = tf.cast(step, dtype)

            if self.warmup_target is None:
                # No warmup: clamp the step and decay from the initial rate.
                global_step_recomp = tf.minimum(global_step_recomp, decay_steps)
                return self._decay_function(
                    global_step_recomp,
                    decay_steps,
                    initial_learning_rate,
                    dtype,
                )

            warmup_target = tf.cast(self.warmup_target, dtype)
            warmup_steps = tf.cast(self.warmup_steps, dtype)

            # The schedule ends after warmup plus decay; clamp beyond that.
            global_step_recomp = tf.minimum(
                global_step_recomp, decay_steps + warmup_steps
            )

            # Warm up while step < warmup_steps, then decay from the warmup
            # target with the step counted from the end of warmup.
            return tf.cond(
                global_step_recomp < warmup_steps,
                lambda: self._warmup_function(
                    global_step_recomp,
                    warmup_steps,
                    warmup_target,
                    initial_learning_rate,
                ),
                lambda: self._decay_function(
                    global_step_recomp - warmup_steps,
                    decay_steps,
                    warmup_target,
                    dtype,
                ),
            )

    def get_config(self):
        """Returns the constructor arguments as a dict."""
        return {
            "initial_learning_rate": self.initial_learning_rate,
            "decay_steps": self.decay_steps,
            "alpha": self.alpha,
            "name": self.name,
            "warmup_target": self.warmup_target,
            "warmup_steps": self.warmup_steps,
        }

768

769

770@keras_export(

771 "keras.optimizers.schedules.CosineDecayRestarts",

772 "keras.experimental.CosineDecayRestarts",

773)

774class CosineDecayRestarts(LearningRateSchedule):

775 """A LearningRateSchedule that uses a cosine decay schedule with restarts.

776

777 See [Loshchilov & Hutter, ICLR2016](https://arxiv.org/abs/1608.03983),

778 SGDR: Stochastic Gradient Descent with Warm Restarts.

779

780 When training a model, it is often useful to lower the learning rate as

781 the training progresses. This schedule applies a cosine decay function with

782 restarts to an optimizer step, given a provided initial learning rate.

783 It requires a `step` value to compute the decayed learning rate. You can

784 just pass a TensorFlow variable that you increment at each training step.

785

786 The schedule is a 1-arg callable that produces a decayed learning

787 rate when passed the current optimizer step. This can be useful for changing

788 the learning rate value across different invocations of optimizer functions.

789

790 The learning rate multiplier first decays

791 from 1 to `alpha` for `first_decay_steps` steps. Then, a warm

792 restart is performed. Each new warm restart runs for `t_mul` times more

793 steps and with `m_mul` times initial learning rate as the new learning rate.

794

795 Example usage:

796 ```python

797 first_decay_steps = 1000

798 lr_decayed_fn = (

799 tf.keras.optimizers.schedules.CosineDecayRestarts(

800 initial_learning_rate,

801 first_decay_steps))

802 ```

803

804 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`

805 as the learning rate. The learning rate schedule is also serializable and

806 deserializable using `tf.keras.optimizers.schedules.serialize` and

807 `tf.keras.optimizers.schedules.deserialize`.

808

809 Returns:

810 A 1-arg callable learning rate schedule that takes the current optimizer

811 step and outputs the decayed learning rate, a scalar `Tensor` of the same

812 type as `initial_learning_rate`.

813 """

814

815 def __init__(

816 self,

817 initial_learning_rate,

818 first_decay_steps,

819 t_mul=2.0,

820 m_mul=1.0,

821 alpha=0.0,

822 name=None,

823 ):

824 """Applies cosine decay with restarts to the learning rate.

825

826 Args:

827 initial_learning_rate: A scalar `float32` or `float64` Tensor or a

828 Python number. The initial learning rate.

829 first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python

830 number. Number of steps to decay over.

831 t_mul: A scalar `float32` or `float64` `Tensor` or a Python number.

832 Used to derive the number of iterations in the i-th period.

833 m_mul: A scalar `float32` or `float64` `Tensor` or a Python number.

834 Used to derive the initial learning rate of the i-th period.

835 alpha: A scalar `float32` or `float64` Tensor or a Python number.

836 Minimum learning rate value as a fraction of the

837 initial_learning_rate.

838 name: String. Optional name of the operation. Defaults to 'SGDRDecay'.

839 """

840 super().__init__()

841

842 self.initial_learning_rate = initial_learning_rate

843 self.first_decay_steps = first_decay_steps

844 self._t_mul = t_mul

845 self._m_mul = m_mul

846 self.alpha = alpha

847 self.name = name

848

def __call__(self, step):
    """Evaluate the cosine-decay-with-restarts schedule at `step`.

    Args:
        step: The current optimizer step. It is cast to the dtype of
            `initial_learning_rate` before any arithmetic is done.

    Returns:
        The product of `initial_learning_rate` and the decay factor, in
        the dtype of `initial_learning_rate`.
    """
    with tf.name_scope(self.name or "SGDRDecay") as name:
        base_lr = tf.convert_to_tensor(
            self.initial_learning_rate, name="initial_learning_rate"
        )
        dtype = base_lr.dtype
        first_period = tf.cast(self.first_decay_steps, dtype)
        alpha = tf.cast(self.alpha, dtype)
        t_mul = tf.cast(self._t_mul, dtype)
        m_mul = tf.cast(self._m_mul, dtype)

        # Progress expressed in units of the first period's length.
        progress = tf.cast(step, dtype) / first_period

        def equal_length_periods():
            """Branch for `t_mul == 1`: every period has the same length."""
            restarts = tf.floor(progress)
            return restarts, progress - restarts

        def geometric_periods():
            """Branch for `t_mul != 1`: period lengths scale by `t_mul`."""
            restarts = tf.floor(
                tf.math.log(1.0 - progress * (1.0 - t_mul))
                / tf.math.log(t_mul)
            )
            # Geometric-series sum of the lengths of all completed
            # periods, in units of the first period.
            elapsed = (1.0 - t_mul**restarts) / (1.0 - t_mul)
            return restarts, (progress - elapsed) / t_mul**restarts

        # The geometric formula divides by log(t_mul) and (1 - t_mul),
        # both zero when t_mul == 1, hence the dedicated branch.
        i_restart, period_fraction = tf.cond(
            tf.equal(t_mul, 1.0),
            equal_length_periods,
            geometric_periods,
        )

        pi = tf.constant(math.pi, dtype=dtype)
        restart_scale = m_mul**i_restart
        cosine_decayed = 0.5 * restart_scale * (
            1.0 + tf.cos(pi * period_fraction)
        )
        decayed = (1 - alpha) * cosine_decayed + alpha

        return tf.multiply(base_lr, decayed, name=name)

902

def get_config(self):
    """Return this schedule's settings as a plain dict.

    The period and learning-rate multipliers are stored on the private
    attributes `_t_mul` and `_m_mul` but are reported under the keys
    `t_mul` and `m_mul`.
    """
    return dict(
        initial_learning_rate=self.initial_learning_rate,
        first_decay_steps=self.first_decay_steps,
        t_mul=self._t_mul,
        m_mul=self._m_mul,
        alpha=self.alpha,
        name=self.name,
    )

912

913

914# Note: this code is still used by V1 APIs.

class LinearCosineDecay(LearningRateSchedule):
    """A LearningRateSchedule that uses a linear cosine decay schedule.

    See [Bello et al., ICML2017] Neural Optimizer Search with RL.
    https://arxiv.org/abs/1709.07417

    For the idea of warm starts here controlled by `num_periods`,
    see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
    with Warm Restarts. https://arxiv.org/abs/1608.03983

    Note that linear cosine decay is more aggressive than cosine decay and
    larger initial learning rates can typically be used.

    The schedule is a 1-arg callable: given the current optimizer step it
    returns the decayed learning rate. The step is clamped to `decay_steps`,
    so the value stops changing once `decay_steps` has been reached.
    It is computed as:

    ```python
    def decayed_learning_rate(step):
      step = min(step, decay_steps)
      linear_decay = (decay_steps - step) / decay_steps
      cosine_decay = 0.5 * (
          1 + cos(pi * 2 * num_periods * step / decay_steps))
      decayed = (alpha + linear_decay) * cosine_decay + beta
      return initial_learning_rate * decayed
    ```

    Example usage:
    ```python
    decay_steps = 1000
    lr_decayed_fn = (
      tf.keras.experimental.LinearCosineDecay(
        initial_learning_rate, decay_steps))
    ```

    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
    as the learning rate. The learning rate schedule is also serializable and
    deserializable using `tf.keras.optimizers.schedules.serialize` and
    `tf.keras.optimizers.schedules.deserialize`.

    Returns:
      A 1-arg callable learning rate schedule that takes the current optimizer
      step and outputs the decayed learning rate, a scalar `Tensor` of the same
      type as `initial_learning_rate`.
    """

    def __init__(
        self,
        initial_learning_rate,
        decay_steps,
        num_periods=0.5,
        alpha=0.0,
        beta=0.001,
        name=None,
    ):
        """Applies linear cosine decay to the learning rate.

        Args:
          initial_learning_rate: A scalar `float32` or `float64` Tensor or a
            Python number. The initial learning rate.
          decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
            Number of steps to decay over.
          num_periods: Number of periods in the cosine part of the decay.
            See computation above.
          alpha: See computation above.
          beta: See computation above.
          name: String.  Optional name of the operation.  Defaults to
            'LinearCosineDecay'.
        """
        super().__init__()

        # Arguments are stored untouched; conversion to tensors is deferred
        # to `__call__`.
        self.name = name
        self.initial_learning_rate = initial_learning_rate
        self.decay_steps = decay_steps
        self.num_periods = num_periods
        self.alpha = alpha
        self.beta = beta

    def __call__(self, step):
        """Return the decayed learning rate for `step`.

        Args:
          step: The current optimizer step; cast to the dtype of
            `initial_learning_rate` and clamped to at most `decay_steps`.

        Returns:
          `initial_learning_rate * ((alpha + linear) * cosine + beta)` in the
          dtype of `initial_learning_rate`.
        """
        with tf.name_scope(self.name or "LinearCosineDecay") as name:
            base_lr = tf.convert_to_tensor(
                self.initial_learning_rate, name="initial_learning_rate"
            )
            dtype = base_lr.dtype
            decay_steps = tf.cast(self.decay_steps, dtype)
            num_periods = tf.cast(self.num_periods, dtype)
            alpha = tf.cast(self.alpha, dtype)
            beta = tf.cast(self.beta, dtype)

            # Steps beyond `decay_steps` are treated as `decay_steps`.
            clamped_step = tf.minimum(tf.cast(step, dtype), decay_steps)

            linear_term = (decay_steps - clamped_step) / decay_steps
            progress = clamped_step / decay_steps
            pi = tf.constant(math.pi, dtype=dtype)
            cosine_term = 0.5 * (
                1.0 + tf.cos(pi * (2.0 * num_periods * progress))
            )

            decay_factor = (alpha + linear_term) * cosine_term + beta
            return tf.multiply(base_lr, decay_factor, name=name)

    def get_config(self):
        """Return the constructor arguments as a plain dict."""
        return dict(
            initial_learning_rate=self.initial_learning_rate,
            decay_steps=self.decay_steps,
            num_periods=self.num_periods,
            alpha=self.alpha,
            beta=self.beta,
            name=self.name,
        )

1036

1037

1038# Note: this code is still used by V1 APIs.

class NoisyLinearCosineDecay(LearningRateSchedule):
    """A LearningRateSchedule that uses a noisy linear cosine decay schedule.

    See [Bello et al., ICML2017] Neural Optimizer Search with RL.
    https://arxiv.org/abs/1709.07417

    For the idea of warm starts here controlled by `num_periods`,
    see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent
    with Warm Restarts. https://arxiv.org/abs/1608.03983

    Note that linear cosine decay is more aggressive than cosine decay and
    larger initial learning rates can typically be used.

    When training a model, it is often recommended to lower the learning rate as
    the training progresses. This schedule applies a noisy linear cosine decay
    function to an optimizer step, given a provided initial learning rate.
    It requires a `step` value to compute the decayed learning rate. You can
    just pass a TensorFlow variable that you increment at each training step.

    The schedule is a 1-arg callable that produces a decayed learning
    rate when passed the current optimizer step. This can be useful for changing
    the learning rate value across different invocations of optimizer functions.
    It is computed as:

    ```python
    def decayed_learning_rate(step):
      step = min(step, decay_steps)
      linear_decay = (decay_steps - step) / decay_steps
      cosine_decay = 0.5 * (
          1 + cos(pi * 2 * num_periods * step / decay_steps))
      decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta
      return initial_learning_rate * decayed
    ```
    where eps_t is 0-centered gaussian noise with variance
    initial_variance / (1 + step) ** variance_decay, `step` being the
    clamped value from the first line of the computation above.

    Example usage:
    ```python
    decay_steps = 1000
    lr_decayed_fn = (
      tf.keras.experimental.NoisyLinearCosineDecay(
        initial_learning_rate, decay_steps))
    ```

    You can pass this schedule directly into a `tf.keras.optimizers.Optimizer`
    as the learning rate. The learning rate schedule is also serializable and
    deserializable using `tf.keras.optimizers.schedules.serialize` and
    `tf.keras.optimizers.schedules.deserialize`.

    Returns:
      A 1-arg callable learning rate schedule that takes the current optimizer
      step and outputs the decayed learning rate, a scalar `Tensor` of the same
      type as `initial_learning_rate`.
    """

    def __init__(
        self,
        initial_learning_rate,
        decay_steps,
        initial_variance=1.0,
        variance_decay=0.55,
        num_periods=0.5,
        alpha=0.0,
        beta=0.001,
        seed=None,
        name=None,
    ):
        """Applies noisy linear cosine decay to the learning rate.

        Args:
          initial_learning_rate: A scalar `float32` or `float64` Tensor or a
            Python number. The initial learning rate.
          decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
            Number of steps to decay over.
          initial_variance: initial variance for the noise. See computation
            above.
          variance_decay: decay for the noise's variance. See computation above.
          num_periods: Number of periods in the cosine part of the decay.
            See computation above.
          alpha: See computation above.
          beta: See computation above.
          seed: Integer, optional random seed to enable deterministic behavior.
          name: String.  Optional name of the operation.  Defaults to
            'NoisyLinearCosineDecay'.
        """
        super().__init__()

        self.initial_learning_rate = initial_learning_rate
        self.decay_steps = decay_steps
        self.initial_variance = initial_variance
        self.variance_decay = variance_decay
        self.num_periods = num_periods
        self.alpha = alpha
        self.beta = beta
        self.seed = seed
        self.name = name
        # Source of the gaussian noise added in `__call__`; built from `seed`.
        self._random_generator = backend.RandomGenerator(seed)

    def __call__(self, step):
        """Return the noisy decayed learning rate for `step`.

        Args:
          step: The current optimizer step; cast to the dtype of
            `initial_learning_rate` and clamped to at most `decay_steps`.

        Returns:
          `initial_learning_rate * ((alpha + linear + noise) * cosine + beta)`
          in the dtype of `initial_learning_rate`.
        """
        with tf.name_scope(self.name or "NoisyLinearCosineDecay") as name:
            initial_learning_rate = tf.convert_to_tensor(
                self.initial_learning_rate, name="initial_learning_rate"
            )
            dtype = initial_learning_rate.dtype
            decay_steps = tf.cast(self.decay_steps, dtype)
            initial_variance = tf.cast(self.initial_variance, dtype)
            variance_decay = tf.cast(self.variance_decay, dtype)
            num_periods = tf.cast(self.num_periods, dtype)
            alpha = tf.cast(self.alpha, dtype)
            beta = tf.cast(self.beta, dtype)

            global_step_recomp = tf.cast(step, dtype)
            # Steps beyond `decay_steps` are treated as `decay_steps`.
            global_step_recomp = tf.minimum(global_step_recomp, decay_steps)
            linear_decayed = (decay_steps - global_step_recomp) / decay_steps

            # The noise variance shrinks as the (clamped) step grows.
            variance = initial_variance / (
                tf.pow(1.0 + global_step_recomp, variance_decay)
            )
            std = tf.sqrt(variance)
            # Sample in the learning rate's dtype. Without an explicit
            # dtype the generator falls back to its default float type,
            # which does not match `std` / `linear_decayed` when
            # `initial_learning_rate` is float64.
            noise = self._random_generator.random_normal(
                linear_decayed.shape, stddev=std, dtype=dtype
            )
            noisy_linear_decayed = linear_decayed + noise

            completed_fraction = global_step_recomp / decay_steps
            fraction = 2.0 * num_periods * completed_fraction
            cosine_decayed = 0.5 * (
                1.0 + tf.cos(tf.constant(math.pi, dtype=dtype) * fraction)
            )
            noisy_linear_cosine_decayed = (
                alpha + noisy_linear_decayed
            ) * cosine_decayed + beta

            return tf.multiply(
                initial_learning_rate, noisy_linear_cosine_decayed, name=name
            )

    def get_config(self):
        """Return the constructor arguments as a plain dict."""
        return {
            "initial_learning_rate": self.initial_learning_rate,
            "decay_steps": self.decay_steps,
            "initial_variance": self.initial_variance,
            "variance_decay": self.variance_decay,
            "num_periods": self.num_periods,
            "alpha": self.alpha,
            "beta": self.beta,
            "seed": self.seed,
            "name": self.name,
        }

1189

1190

@keras_export("keras.optimizers.schedules.serialize")
def serialize(learning_rate_schedule, use_legacy_format=False):
    """Serializes a `LearningRateSchedule` into a JSON-compatible dict.

    Args:
      learning_rate_schedule: The `LearningRateSchedule` object to serialize.
      use_legacy_format: Boolean. When true, the legacy serialization
        routine is used instead of the current one. Defaults to `False`.

    Returns:
      A JSON-serializable dict representing the object's config.

    Example:

    >>> lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    ...   0.1, decay_steps=100000, decay_rate=0.96, staircase=True)
    >>> tf.keras.optimizers.schedules.serialize(lr_schedule)
    {'module': 'keras.optimizers.schedules',
    'class_name': 'ExponentialDecay', 'config': {...},
    'registered_name': None}
    """
    # Pick the serializer first, then make a single call.
    to_config = (
        legacy_serialization.serialize_keras_object
        if use_legacy_format
        else serialization_lib.serialize_keras_object
    )
    return to_config(learning_rate_schedule)

1216

1217

@keras_export("keras.optimizers.schedules.deserialize")
def deserialize(config, custom_objects=None, use_legacy_format=False):
    """Instantiates a `LearningRateSchedule` object from a serialized form.

    Args:
      config: The serialized form of the `LearningRateSchedule`.
        Dictionary of the form {'class_name': str, 'config': dict}.
      custom_objects: A dictionary mapping class names (or function names) of
        custom (non-Keras) objects to class/functions.
      use_legacy_format: Boolean. When true, the legacy deserialization
        routine is used instead of the current one. Defaults to `False`.

    Returns:
      A `LearningRateSchedule` object.

    Example:

    ```python
    # Configuration for PolynomialDecay
    config = {
      'class_name': 'PolynomialDecay',
      'config': {'cycle': False,
        'decay_steps': 10000,
        'end_learning_rate': 0.01,
        'initial_learning_rate': 0.1,
        'name': None,
        'power': 0.5}}
    lr_schedule = tf.keras.optimizers.schedules.deserialize(config)
    ```
    """
    if use_legacy_format:
        from_config = legacy_serialization.deserialize_keras_object
    else:
        from_config = serialization_lib.deserialize_keras_object

    # Both routines receive the same arguments; this module's globals are
    # passed as `module_objects`.
    return from_config(
        config,
        module_objects=globals(),
        custom_objects=custom_objects,
        printable_module_name="decay",
    )

1260

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/keras/src/optimizers/schedules/learning_rate_schedule.py: 23%

264 statements