Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/keras/optimizer_v2/learning_rate_schedule.py: 26%

248 statements  

coverage.py v7.4.0, created at 2024-01-03 07:57 +0000

1# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14# ============================================================================== 

15"""Various learning rate decay functions.""" 

16 

17import abc 

18import math 

19 

20from tensorflow.python.framework import constant_op 

21from tensorflow.python.framework import ops 

22from tensorflow.python.framework import tensor_conversion 

23from tensorflow.python.keras.utils import generic_utils 

24from tensorflow.python.ops import array_ops 

25from tensorflow.python.ops import cond 

26from tensorflow.python.ops import control_flow_case 

27from tensorflow.python.ops import math_ops 

28from tensorflow.python.ops import random_ops 

29from tensorflow.python.util import nest 

30from tensorflow.python.util.tf_export import keras_export 

31 

32 

33@keras_export("keras.optimizers.schedules.LearningRateSchedule") 

34class LearningRateSchedule(object): 

35 """The learning rate schedule base class. 

36 

37 You can use a learning rate schedule to modulate how the learning rate 

38 of your optimizer changes over time. 

39 

40 Several built-in learning rate schedules are available, such as 

41 `tf.keras.optimizers.schedules.ExponentialDecay` or 

42 `tf.keras.optimizers.schedules.PiecewiseConstantDecay`: 

43 

44 ```python 

45 lr_schedule = keras.optimizers.schedules.ExponentialDecay( 

46 initial_learning_rate=1e-2, 

47 decay_steps=10000, 

48 decay_rate=0.9) 

49 optimizer = keras.optimizers.SGD(learning_rate=lr_schedule) 

50 ``` 

51 

52 A `LearningRateSchedule` instance can be passed in as the `learning_rate` 

53 argument of any optimizer. 

54 

55 To implement your own schedule object, you should implement the `__call__` 

56 method, which takes a `step` argument (scalar integer tensor, the 

57 current training step count). 

58 As with any other Keras object, you can also optionally 

59 make your object serializable by implementing the `get_config` 

60 and `from_config` methods. 

61 

62 Example: 

63 

64 ```python 

65 class MyLRSchedule(tf.keras.optimizers.schedules.LearningRateSchedule): 

66 

67 def __init__(self, initial_learning_rate): 

68 self.initial_learning_rate = initial_learning_rate 

69 

70 def __call__(self, step): 

71 return self.initial_learning_rate / (step + 1) 

72 

73 optimizer = tf.keras.optimizers.SGD(learning_rate=MyLRSchedule(0.1)) 

74 ``` 
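
  The same schedule can be made serializable by also implementing
  `get_config`; a minimal sketch (the class name and config keys below are
  illustrative only, not part of this module):

  ```python
  class MySerializableLRSchedule(
      tf.keras.optimizers.schedules.LearningRateSchedule):

    def __init__(self, initial_learning_rate):
      self.initial_learning_rate = initial_learning_rate

    def __call__(self, step):
      return self.initial_learning_rate / (step + 1)

    def get_config(self):
      # Everything `from_config` (i.e. cls(**config)) needs to rebuild it.
      return {"initial_learning_rate": self.initial_learning_rate}
  ```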

75 """ 

76 

77 @abc.abstractmethod 

78 def __call__(self, step): 

79 raise NotImplementedError("Learning rate schedule must override __call__") 

80 

81 @abc.abstractmethod 

82 def get_config(self): 

83 raise NotImplementedError("Learning rate schedule must override get_config") 

84 

85 @classmethod 

86 def from_config(cls, config): 

87 """Instantiates a `LearningRateSchedule` from its config. 

88 

89 Args: 

90 config: Output of `get_config()`. 

91 

92 Returns: 

93 A `LearningRateSchedule` instance. 

94 """ 

95 return cls(**config) 

96 

97 

98@keras_export("keras.optimizers.schedules.ExponentialDecay") 

99class ExponentialDecay(LearningRateSchedule): 

100 """A LearningRateSchedule that uses an exponential decay schedule. 

101 

102 When training a model, it is often useful to lower the learning rate as 

103 the training progresses. This schedule applies an exponential decay function 

104 to an optimizer step, given a provided initial learning rate. 

105 

106 The schedule is a 1-arg callable that produces a decayed learning 

107 rate when passed the current optimizer step. This can be useful for changing 

108 the learning rate value across different invocations of optimizer functions. 

109 It is computed as: 

110 

111 ```python 

112 def decayed_learning_rate(step): 

113 return initial_learning_rate * decay_rate ^ (step / decay_steps) 

114 ``` 

115 

116 If the argument `staircase` is `True`, then `step / decay_steps` is 

117 an integer division and the decayed learning rate follows a 

118 staircase function. 

119 

120 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` 

121 as the learning rate. 

122 Example: When fitting a Keras model, decay every 100000 steps with a base 

123 of 0.96: 

124 

125 ```python 

126 initial_learning_rate = 0.1 

127 lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay( 

128 initial_learning_rate, 

129 decay_steps=100000, 

130 decay_rate=0.96, 

131 staircase=True) 

132 

133 model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=lr_schedule), 

134 loss='sparse_categorical_crossentropy', 

135 metrics=['accuracy']) 

136 

137 model.fit(data, labels, epochs=5) 

138 ``` 
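
  As an illustrative check (numbers follow the example above, not this
  module), the schedule can also be called directly; with `staircase=True`
  the rate holds at 0.1 until step 100000:

  ```python
  print(float(lr_schedule(50000)))   # 0.1, since floor(50000 / 100000) == 0
  print(float(lr_schedule(100000)))  # 0.1 * 0.96 == 0.096
  # With staircase=False, step 50000 would instead give 0.1 * 0.96 ** 0.5 ≈ 0.098.
  ```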

139 

140 The learning rate schedule is also serializable and deserializable using 

141 `tf.keras.optimizers.schedules.serialize` and 

142 `tf.keras.optimizers.schedules.deserialize`. 

143 

144 Returns: 

145 A 1-arg callable learning rate schedule that takes the current optimizer 

146 step and outputs the decayed learning rate, a scalar `Tensor` of the same 

147 type as `initial_learning_rate`. 

148 """ 

149 

150 def __init__( 

151 self, 

152 initial_learning_rate, 

153 decay_steps, 

154 decay_rate, 

155 staircase=False, 

156 name=None): 

157 """Applies exponential decay to the learning rate. 

158 

159 Args: 

160 initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a 

161 Python number. The initial learning rate. 

162 decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. 

163 Must be positive. See the decay computation above. 

164 decay_rate: A scalar `float32` or `float64` `Tensor` or a 

165 Python number. The decay rate. 

166 staircase: Boolean. If `True`, decay the learning rate at discrete 

167 intervals. 

168 name: String. Optional name of the operation. Defaults to 

169 'ExponentialDecay'. 

170 """ 

171 super(ExponentialDecay, self).__init__() 

172 self.initial_learning_rate = initial_learning_rate 

173 self.decay_steps = decay_steps 

174 self.decay_rate = decay_rate 

175 self.staircase = staircase 

176 self.name = name 

177 

178 def __call__(self, step): 

179 with ops.name_scope_v2(self.name or "ExponentialDecay") as name: 

180 initial_learning_rate = ( 

181 tensor_conversion.convert_to_tensor_v2_with_dispatch( 

182 self.initial_learning_rate, name="initial_learning_rate" 

183 ) 

184 ) 

185 dtype = initial_learning_rate.dtype 

186 decay_steps = math_ops.cast(self.decay_steps, dtype) 

187 decay_rate = math_ops.cast(self.decay_rate, dtype) 

188 

189 global_step_recomp = math_ops.cast(step, dtype) 

190 p = global_step_recomp / decay_steps 

191 if self.staircase: 

192 p = math_ops.floor(p) 

193 return math_ops.multiply( 

194 initial_learning_rate, math_ops.pow(decay_rate, p), name=name) 

195 

196 def get_config(self): 

197 return { 

198 "initial_learning_rate": self.initial_learning_rate, 

199 "decay_steps": self.decay_steps, 

200 "decay_rate": self.decay_rate, 

201 "staircase": self.staircase, 

202 "name": self.name 

203 } 

204 

205 

206@keras_export("keras.optimizers.schedules.PiecewiseConstantDecay") 

207class PiecewiseConstantDecay(LearningRateSchedule): 

208 """A LearningRateSchedule that uses a piecewise constant decay schedule. 

209 

210 The function returns a 1-arg callable to compute the piecewise constant 

211 when passed the current optimizer step. This can be useful for changing the 

212 learning rate value across different invocations of optimizer functions. 

213 

214 Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5 

215 for the next 10000 steps, and 0.1 for any additional steps. 

216 

217 ```python 

218 step = tf.Variable(0, trainable=False) 

219 boundaries = [100000, 110000] 

220 values = [1.0, 0.5, 0.1] 

221 learning_rate_fn = keras.optimizers.schedules.PiecewiseConstantDecay( 

222 boundaries, values) 

223 

224 # Later, whenever we perform an optimization step, we pass in the step. 

225 learning_rate = learning_rate_fn(step) 

226 ``` 
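
  Continuing the example above, an illustrative evaluation at a few steps:

  ```python
  print(float(learning_rate_fn(50000)))   # 1.0  (step <= 100000)
  print(float(learning_rate_fn(105000)))  # 0.5  (100000 < step <= 110000)
  print(float(learning_rate_fn(200000)))  # 0.1  (step > 110000)
  ```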

227 

228 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` 

229 as the learning rate. The learning rate schedule is also serializable and 

230 deserializable using `tf.keras.optimizers.schedules.serialize` and 

231 `tf.keras.optimizers.schedules.deserialize`. 

232 

233 Returns: 

234 A 1-arg callable learning rate schedule that takes the current optimizer 

235 step and outputs the decayed learning rate, a scalar `Tensor` of the same 

236 type as the boundary tensors. 

237 

238 The output of the 1-arg function that takes the `step` 

239 is `values[0]` when `step <= boundaries[0]`, 

240 `values[1]` when `step > boundaries[0]` and `step <= boundaries[1]`, ..., 

241 and `values[-1]` when `step > boundaries[-1]`. 

242 """ 

243 

244 def __init__( 

245 self, 

246 boundaries, 

247 values, 

248 name=None): 

249 """Piecewise constant from boundaries and interval values. 

250 

251 Args: 

252 boundaries: A list of `Tensor`s or `int`s or `float`s with strictly 

253 increasing entries, and with all elements having the same type as the 

254 optimizer step. 

255 values: A list of `Tensor`s or `float`s or `int`s that specifies the 

256 values for the intervals defined by `boundaries`. It should have one 

257 more element than `boundaries`, and all elements should have the same 

258 type. 

259 name: A string. Optional name of the operation. Defaults to 

260 'PiecewiseConstant'. 

261 

262 Raises: 

263 ValueError: if the number of elements in the lists do not match. 

264 """ 

265 super(PiecewiseConstantDecay, self).__init__() 

266 

267 if len(boundaries) != len(values) - 1: 

268 raise ValueError( 

269 "The length of boundaries should be 1 less than the length of values") 

270 

271 self.boundaries = boundaries 

272 self.values = values 

273 self.name = name 

274 

275 def __call__(self, step): 

276 with ops.name_scope_v2(self.name or "PiecewiseConstant"): 

277 boundaries = nest.map_structure( 

278 tensor_conversion.convert_to_tensor_v2_with_dispatch, 

279 nest.flatten(self.boundaries), 

280 ) 

281 values = nest.map_structure( 

282 tensor_conversion.convert_to_tensor_v2_with_dispatch, 

283 nest.flatten(self.values), 

284 ) 

285 x_recomp = tensor_conversion.convert_to_tensor_v2_with_dispatch(step) 

286 for i, b in enumerate(boundaries): 

287 if b.dtype.base_dtype != x_recomp.dtype.base_dtype: 

288 # We cast the boundaries to have the same type as the step 

289 b = math_ops.cast(b, x_recomp.dtype.base_dtype) 

290 boundaries[i] = b 

291 pred_fn_pairs = [] 

292 pred_fn_pairs.append((x_recomp <= boundaries[0], lambda: values[0])) 

293 pred_fn_pairs.append((x_recomp > boundaries[-1], lambda: values[-1])) 

294 for low, high, v in zip(boundaries[:-1], boundaries[1:], values[1:-1]): 

295 # Need to bind v here; can do this with lambda v=v: ... 

296 pred = (x_recomp > low) & (x_recomp <= high) 

297 pred_fn_pairs.append((pred, lambda v=v: v)) 

298 

299 # The default isn't needed here because our conditions are mutually 

300 # exclusive and exhaustive, but tf.case requires it. 

301 default = lambda: values[0] 

302 return control_flow_case.case(pred_fn_pairs, default, exclusive=True) 

303 

304 def get_config(self): 

305 return { 

306 "boundaries": self.boundaries, 

307 "values": self.values, 

308 "name": self.name 

309 } 

310 

311 

312@keras_export("keras.optimizers.schedules.PolynomialDecay") 

313class PolynomialDecay(LearningRateSchedule): 

314 """A LearningRateSchedule that uses a polynomial decay schedule. 

315 

316 It is commonly observed that a monotonically decreasing learning rate, whose 

317 degree of change is carefully chosen, results in a better performing model. 

318 This schedule applies a polynomial decay function to an optimizer step, 

319 given a provided `initial_learning_rate`, to reach an `end_learning_rate` 

320 in the given `decay_steps`. 

321 

322 It requires a `step` value to compute the decayed learning rate. You 

323 can just pass a TensorFlow variable that you increment at each training 

324 step. 

325 

326 The schedule is a 1-arg callable that produces a decayed learning rate 

327 when passed the current optimizer step. This can be useful for changing the 

328 learning rate value across different invocations of optimizer functions. 

329 It is computed as: 

330 

331 ```python 

332 def decayed_learning_rate(step): 

333 step = min(step, decay_steps) 

334 return ((initial_learning_rate - end_learning_rate) * 

335 (1 - step / decay_steps) ^ (power) 

336 ) + end_learning_rate 

337 ``` 

338 

339 If `cycle` is `True`, then a multiple of `decay_steps` is used, the first one 

340 that is bigger than `step`. 

341 

342 ```python 

343 def decayed_learning_rate(step): 

344 decay_steps = decay_steps * ceil(step / decay_steps) 

345 return ((initial_learning_rate - end_learning_rate) * 

346 (1 - step / decay_steps) ^ (power) 

347 ) + end_learning_rate 

348 ``` 

349 

350 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` 

351 as the learning rate. 

352 Example: Fit a model while decaying from 0.1 to 0.01 in 10000 steps using 

353 sqrt (i.e. power=0.5): 

354 

355 ```python 

356 ... 

357 starter_learning_rate = 0.1 

358 end_learning_rate = 0.01 

359 decay_steps = 10000 

360 learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay( 

361 starter_learning_rate, 

362 decay_steps, 

363 end_learning_rate, 

364 power=0.5) 

365 

366 model.compile(optimizer=tf.keras.optimizers.SGD( 

367 learning_rate=learning_rate_fn), 

368 loss='sparse_categorical_crossentropy', 

369 metrics=['accuracy']) 

370 

371 model.fit(data, labels, epochs=5) 

372 ``` 
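
  As a rough sanity check (illustrative numbers, not part of this module),
  the schedule above a quarter of the way through the decay gives:

  ```python
  # (0.1 - 0.01) * (1 - 2500 / 10000) ** 0.5 + 0.01 ≈ 0.0879
  print(float(learning_rate_fn(2500)))
  ```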

373 

374 The learning rate schedule is also serializable and deserializable using 

375 `tf.keras.optimizers.schedules.serialize` and 

376 `tf.keras.optimizers.schedules.deserialize`. 

377 

378 Returns: 

379 A 1-arg callable learning rate schedule that takes the current optimizer 

380 step and outputs the decayed learning rate, a scalar `Tensor` of the same 

381 type as `initial_learning_rate`. 

382 """ 

383 

384 def __init__( 

385 self, 

386 initial_learning_rate, 

387 decay_steps, 

388 end_learning_rate=0.0001, 

389 power=1.0, 

390 cycle=False, 

391 name=None): 

392 """Applies a polynomial decay to the learning rate. 

393 

394 Args: 

395 initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a 

396 Python number. The initial learning rate. 

397 decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. 

398 Must be positive. See the decay computation above. 

399 end_learning_rate: A scalar `float32` or `float64` `Tensor` or a 

400 Python number. The minimal end learning rate. 

401 power: A scalar `float32` or `float64` `Tensor` or a 

402 Python number. The power of the polynomial. Defaults to linear, 1.0. 

403 cycle: A boolean, whether or not it should cycle beyond decay_steps. 

404 name: String. Optional name of the operation. Defaults to 

405 'PolynomialDecay'. 

406 """ 

407 super(PolynomialDecay, self).__init__() 

408 

409 self.initial_learning_rate = initial_learning_rate 

410 self.decay_steps = decay_steps 

411 self.end_learning_rate = end_learning_rate 

412 self.power = power 

413 self.cycle = cycle 

414 self.name = name 

415 

416 def __call__(self, step): 

417 with ops.name_scope_v2(self.name or "PolynomialDecay") as name: 

418 initial_learning_rate = ( 

419 tensor_conversion.convert_to_tensor_v2_with_dispatch( 

420 self.initial_learning_rate, name="initial_learning_rate" 

421 ) 

422 ) 

423 dtype = initial_learning_rate.dtype 

424 end_learning_rate = math_ops.cast(self.end_learning_rate, dtype) 

425 power = math_ops.cast(self.power, dtype) 

426 

427 global_step_recomp = math_ops.cast(step, dtype) 

428 decay_steps_recomp = math_ops.cast(self.decay_steps, dtype) 

429 if self.cycle: 

430 # Find the first multiple of decay_steps that is bigger than 

431 # global_step. If global_step is zero set the multiplier to 1 

432 multiplier = array_ops.where_v2( 

433 math_ops.equal(global_step_recomp, 0), 1.0, 

434 math_ops.ceil(global_step_recomp / self.decay_steps)) 

435 decay_steps_recomp = math_ops.multiply(decay_steps_recomp, multiplier) 

436 else: 

437 # Make sure that the global_step used is not bigger than decay_steps. 

438 global_step_recomp = math_ops.minimum(global_step_recomp, 

439 decay_steps_recomp) 

440 

441 p = math_ops.divide(global_step_recomp, decay_steps_recomp) 

442 return math_ops.add( 

443 math_ops.multiply(initial_learning_rate - end_learning_rate, 

444 math_ops.pow(1 - p, power)), 

445 end_learning_rate, 

446 name=name) 

447 

448 def get_config(self): 

449 return { 

450 "initial_learning_rate": self.initial_learning_rate, 

451 "decay_steps": self.decay_steps, 

452 "end_learning_rate": self.end_learning_rate, 

453 "power": self.power, 

454 "cycle": self.cycle, 

455 "name": self.name 

456 } 

457 

458 

459@keras_export("keras.optimizers.schedules.InverseTimeDecay") 

460class InverseTimeDecay(LearningRateSchedule): 

461 """A LearningRateSchedule that uses an inverse time decay schedule. 

462 

463 When training a model, it is often useful to lower the learning rate as 

464 the training progresses. This schedule applies the inverse decay function 

465 to an optimizer step, given a provided initial learning rate. 

466 It requires a `step` value to compute the decayed learning rate. You can 

467 just pass a TensorFlow variable that you increment at each training step. 

468 

469 The schedule is a 1-arg callable that produces a decayed learning 

470 rate when passed the current optimizer step. This can be useful for changing 

471 the learning rate value across different invocations of optimizer functions. 

472 It is computed as: 

473 

474 ```python 

475 def decayed_learning_rate(step): 

476 return initial_learning_rate / (1 + decay_rate * step / decay_step) 

477 ``` 

478 

479 or, if `staircase` is `True`, as: 

480 

481 ```python 

482 def decayed_learning_rate(step): 

483 return initial_learning_rate / (1 + decay_rate * floor(step / decay_step)) 

484 ``` 

485 

486 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` 

487 as the learning rate. 

488 Example: Fit a Keras model when decaying 1/t with a rate of 0.5: 

489 

490 ```python 

491 ... 

492 initial_learning_rate = 0.1 

493 decay_steps = 1.0 

494 decay_rate = 0.5 

495 learning_rate_fn = keras.optimizers.schedules.InverseTimeDecay( 

496 initial_learning_rate, decay_steps, decay_rate) 

497 

498 model.compile(optimizer=tf.keras.optimizers.SGD( 

499 learning_rate=learning_rate_fn), 

500 loss='sparse_categorical_crossentropy', 

501 metrics=['accuracy']) 

502 

503 model.fit(data, labels, epochs=5) 

504 ``` 
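
  For illustration (values follow the example above), the 1/t decay gives:

  ```python
  # 0.1 / (1 + 0.5 * 4 / 1.0) = 0.1 / 3 ≈ 0.0333 after 4 steps
  print(float(learning_rate_fn(4)))
  ```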

505 

506 Returns: 

507 A 1-arg callable learning rate schedule that takes the current optimizer 

508 step and outputs the decayed learning rate, a scalar `Tensor` of the same 

509 type as `initial_learning_rate`. 

510 """ 

511 

512 def __init__( 

513 self, 

514 initial_learning_rate, 

515 decay_steps, 

516 decay_rate, 

517 staircase=False, 

518 name=None): 

519 """Applies inverse time decay to the initial learning rate. 

520 

521 Args: 

522 initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a 

523 Python number. The initial learning rate. 

524 decay_steps: How often to apply decay. 

525 decay_rate: A Python number. The decay rate. 

526 staircase: Whether to apply decay in a discrete staircase, as opposed to 

527 continuous, fashion. 

528 name: String. Optional name of the operation. Defaults to 

529 'InverseTimeDecay'. 

530 """ 

531 super(InverseTimeDecay, self).__init__() 

532 

533 self.initial_learning_rate = initial_learning_rate 

534 self.decay_steps = decay_steps 

535 self.decay_rate = decay_rate 

536 self.staircase = staircase 

537 self.name = name 

538 

539 def __call__(self, step): 

540 with ops.name_scope_v2(self.name or "InverseTimeDecay") as name: 

541 initial_learning_rate = ( 

542 tensor_conversion.convert_to_tensor_v2_with_dispatch( 

543 self.initial_learning_rate, name="initial_learning_rate" 

544 ) 

545 ) 

546 dtype = initial_learning_rate.dtype 

547 decay_steps = math_ops.cast(self.decay_steps, dtype) 

548 decay_rate = math_ops.cast(self.decay_rate, dtype) 

549 

550 global_step_recomp = math_ops.cast(step, dtype) 

551 p = global_step_recomp / decay_steps 

552 if self.staircase: 

553 p = math_ops.floor(p) 

554 const = math_ops.cast(constant_op.constant(1), dtype) 

555 denom = math_ops.add(const, math_ops.multiply(decay_rate, p)) 

556 return math_ops.divide(initial_learning_rate, denom, name=name) 

557 

558 def get_config(self): 

559 return { 

560 "initial_learning_rate": self.initial_learning_rate, 

561 "decay_steps": self.decay_steps, 

562 "decay_rate": self.decay_rate, 

563 "staircase": self.staircase, 

564 "name": self.name 

565 } 

566 

567 

568@keras_export("keras.optimizers.schedules.CosineDecay", 

569 "keras.experimental.CosineDecay") 

570class CosineDecay(LearningRateSchedule): 

571 """A LearningRateSchedule that uses a cosine decay schedule. 

572 

573 See [Loshchilov & Hutter, ICLR2016](https://arxiv.org/abs/1608.03983), 

574 SGDR: Stochastic Gradient Descent with Warm Restarts. 

575 

576 When training a model, it is often useful to lower the learning rate as 

577 the training progresses. This schedule applies a cosine decay function 

578 to an optimizer step, given a provided initial learning rate. 

579 It requires a `step` value to compute the decayed learning rate. You can 

580 just pass a TensorFlow variable that you increment at each training step. 

581 

582 The schedule is a 1-arg callable that produces a decayed learning 

583 rate when passed the current optimizer step. This can be useful for changing 

584 the learning rate value across different invocations of optimizer functions. 

585 It is computed as: 

586 

587 ```python 

588 def decayed_learning_rate(step): 

589 step = min(step, decay_steps) 

590 cosine_decay = 0.5 * (1 + cos(pi * step / decay_steps)) 

591 decayed = (1 - alpha) * cosine_decay + alpha 

592 return initial_learning_rate * decayed 

593 ``` 

594 

595 Example usage: 

596 ```python 

597 decay_steps = 1000 

598 lr_decayed_fn = tf.keras.optimizers.schedules.CosineDecay( 

599 initial_learning_rate, decay_steps) 

600 ``` 
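
  An illustrative property (concrete numbers, default `alpha=0.0`): halfway
  through `decay_steps` the cosine factor is 0.5, so the rate is half the
  initial value:

  ```python
  lr_decayed_fn = tf.keras.optimizers.schedules.CosineDecay(0.1, 1000)
  print(float(lr_decayed_fn(500)))  # ≈ 0.05
  ```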

601 

602 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` 

603 as the learning rate. The learning rate schedule is also serializable and 

604 deserializable using `tf.keras.optimizers.schedules.serialize` and 

605 `tf.keras.optimizers.schedules.deserialize`. 

606 

607 Returns: 

608 A 1-arg callable learning rate schedule that takes the current optimizer 

609 step and outputs the decayed learning rate, a scalar `Tensor` of the same 

610 type as `initial_learning_rate`. 

611 """ 

612 

613 def __init__( 

614 self, 

615 initial_learning_rate, 

616 decay_steps, 

617 alpha=0.0, 

618 name=None): 

619 """Applies cosine decay to the learning rate. 

620 

621 Args: 

622 initial_learning_rate: A scalar `float32` or `float64` Tensor or a 

623 Python number. The initial learning rate. 

624 decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. 

625 Number of steps to decay over. 

626 alpha: A scalar `float32` or `float64` Tensor or a Python number. 

627 Minimum learning rate value as a fraction of initial_learning_rate. 

628 name: String. Optional name of the operation. Defaults to 'CosineDecay'. 

629 """ 

630 super(CosineDecay, self).__init__() 

631 

632 self.initial_learning_rate = initial_learning_rate 

633 self.decay_steps = decay_steps 

634 self.alpha = alpha 

635 self.name = name 

636 

637 def __call__(self, step): 

638 with ops.name_scope_v2(self.name or "CosineDecay"): 

639 initial_learning_rate = ( 

640 tensor_conversion.convert_to_tensor_v2_with_dispatch( 

641 self.initial_learning_rate, name="initial_learning_rate" 

642 ) 

643 ) 

644 dtype = initial_learning_rate.dtype 

645 decay_steps = math_ops.cast(self.decay_steps, dtype) 

646 

647 global_step_recomp = math_ops.cast(step, dtype) 

648 global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps) 

649 completed_fraction = global_step_recomp / decay_steps 

650 cosine_decayed = 0.5 * (1.0 + math_ops.cos( 

651 constant_op.constant(math.pi) * completed_fraction)) 

652 

653 decayed = (1 - self.alpha) * cosine_decayed + self.alpha 

654 return math_ops.multiply(initial_learning_rate, decayed) 

655 

656 def get_config(self): 

657 return { 

658 "initial_learning_rate": self.initial_learning_rate, 

659 "decay_steps": self.decay_steps, 

660 "alpha": self.alpha, 

661 "name": self.name 

662 } 

663 

664 

665@keras_export("keras.optimizers.schedules.CosineDecayRestarts", 

666 "keras.experimental.CosineDecayRestarts") 

667class CosineDecayRestarts(LearningRateSchedule): 

668 """A LearningRateSchedule that uses a cosine decay schedule with restarts. 

669 

670 See [Loshchilov & Hutter, ICLR2016](https://arxiv.org/abs/1608.03983), 

671 SGDR: Stochastic Gradient Descent with Warm Restarts. 

672 

673 When training a model, it is often useful to lower the learning rate as 

674 the training progresses. This schedule applies a cosine decay function with 

675 restarts to an optimizer step, given a provided initial learning rate. 

676 It requires a `step` value to compute the decayed learning rate. You can 

677 just pass a TensorFlow variable that you increment at each training step. 

678 

679 The schedule is a 1-arg callable that produces a decayed learning 

680 rate when passed the current optimizer step. This can be useful for changing 

681 the learning rate value across different invocations of optimizer functions. 

682 

683 The learning rate multiplier first decays 

684 from 1 to `alpha` for `first_decay_steps` steps. Then, a warm 

685 restart is performed. Each new warm restart runs for `t_mul` times more 

686 steps and with `m_mul` times smaller initial learning rate. 

687 

688 Example usage: 

689 ```python 

690 first_decay_steps = 1000 

691 lr_decayed_fn = ( 

692 tf.keras.optimizers.schedules.CosineDecayRestarts( 

693 initial_learning_rate, 

694 first_decay_steps)) 

695 ``` 
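
  For illustration (concrete numbers, defaults `t_mul=2.0`, `m_mul=1.0`,
  `alpha=0.0`): the rate decays towards zero over the first period and then
  jumps back to the initial value at the restart:

  ```python
  lr_decayed_fn = tf.keras.optimizers.schedules.CosineDecayRestarts(0.1, 1000)
  print(float(lr_decayed_fn(999)))   # close to 0.0 (end of the first period)
  print(float(lr_decayed_fn(1000)))  # 0.1 again (warm restart)
  ```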

696 

697 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` 

698 as the learning rate. The learning rate schedule is also serializable and 

699 deserializable using `tf.keras.optimizers.schedules.serialize` and 

700 `tf.keras.optimizers.schedules.deserialize`. 

701 

702 Returns: 

703 A 1-arg callable learning rate schedule that takes the current optimizer 

704 step and outputs the decayed learning rate, a scalar `Tensor` of the same 

705 type as `initial_learning_rate`. 

706 """ 

707 

708 def __init__( 

709 self, 

710 initial_learning_rate, 

711 first_decay_steps, 

712 t_mul=2.0, 

713 m_mul=1.0, 

714 alpha=0.0, 

715 name=None): 

716 """Applies cosine decay with restarts to the learning rate. 

717 

718 Args: 

719 initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python 

720 number. The initial learning rate. 

721 first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python 

722 number. Number of steps to decay over. 

723 t_mul: A scalar `float32` or `float64` `Tensor` or a Python number. 

724 Used to derive the number of iterations in the i-th period. 

725 m_mul: A scalar `float32` or `float64` `Tensor` or a Python number. 

726 Used to derive the initial learning rate of the i-th period. 

727 alpha: A scalar `float32` or `float64` Tensor or a Python number. 

728 Minimum learning rate value as a fraction of the initial_learning_rate. 

729 name: String. Optional name of the operation. Defaults to 'SGDRDecay'. 

730 """ 

731 super(CosineDecayRestarts, self).__init__() 

732 

733 self.initial_learning_rate = initial_learning_rate 

734 self.first_decay_steps = first_decay_steps 

735 self._t_mul = t_mul 

736 self._m_mul = m_mul 

737 self.alpha = alpha 

738 self.name = name 

739 

740 def __call__(self, step): 

741 with ops.name_scope_v2(self.name or "SGDRDecay") as name: 

742 initial_learning_rate = ( 

743 tensor_conversion.convert_to_tensor_v2_with_dispatch( 

744 self.initial_learning_rate, name="initial_learning_rate" 

745 ) 

746 ) 

747 dtype = initial_learning_rate.dtype 

748 first_decay_steps = math_ops.cast(self.first_decay_steps, dtype) 

749 alpha = math_ops.cast(self.alpha, dtype) 

750 t_mul = math_ops.cast(self._t_mul, dtype) 

751 m_mul = math_ops.cast(self._m_mul, dtype) 

752 

753 global_step_recomp = math_ops.cast(step, dtype) 

754 completed_fraction = global_step_recomp / first_decay_steps 

755 

756 def compute_step(completed_fraction, geometric=False): 

757 """Helper for `cond` operation.""" 

758 if geometric: 

759 i_restart = math_ops.floor( 

760 math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) / 

761 math_ops.log(t_mul)) 

762 

763 sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul) 

764 completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart 

765 

766 else: 

767 i_restart = math_ops.floor(completed_fraction) 

768 completed_fraction -= i_restart 

769 

770 return i_restart, completed_fraction 

771 

772 i_restart, completed_fraction = cond.cond( 

773 math_ops.equal(t_mul, 1.0), 

774 lambda: compute_step(completed_fraction, geometric=False), 

775 lambda: compute_step(completed_fraction, geometric=True)) 

776 

777 m_fac = m_mul**i_restart 

778 cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos( 

779 constant_op.constant(math.pi) * completed_fraction)) 

780 decayed = (1 - alpha) * cosine_decayed + alpha 

781 

782 return math_ops.multiply(initial_learning_rate, decayed, name=name) 

783 

784 def get_config(self): 

785 return { 

786 "initial_learning_rate": self.initial_learning_rate, 

787 "first_decay_steps": self.first_decay_steps, 

788 "t_mul": self._t_mul, 

789 "m_mul": self._m_mul, 

790 "alpha": self.alpha, 

791 "name": self.name 

792 } 

793 

794 

795# Note: this code is still used by V1 APIs. 

796class LinearCosineDecay(LearningRateSchedule): 

797 """A LearningRateSchedule that uses a linear cosine decay schedule. 

798 

799 See [Bello et al., ICML2017] Neural Optimizer Search with RL. 

800 https://arxiv.org/abs/1709.07417 

801 

802 For the idea of warm starts here controlled by `num_periods`, 

803 see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent 

804 with Warm Restarts. https://arxiv.org/abs/1608.03983 

805 

806 Note that linear cosine decay is more aggressive than cosine decay and 

807 larger initial learning rates can typically be used. 

808 

809 When training a model, it is often recommended to lower the learning rate as 

810 the training progresses. This schedule applies a linear cosine decay 

811 function to an optimizer step, given a provided initial learning rate. 

812 It requires a `step` value to compute the decayed learning rate. You can 

813 just pass a TensorFlow variable that you increment at each training step. 

814 

815 The schedule is a 1-arg callable that produces a decayed learning 

816 rate when passed the current optimizer step. This can be useful for changing 

817 the learning rate value across different invocations of optimizer functions. 

818 It is computed as: 

819 

820 ```python 

821 def decayed_learning_rate(step): 

822 step = min(step, decay_steps) 

823 linear_decay = (decay_steps - step) / decay_steps 

824 cosine_decay = 0.5 * ( 

825 1 + cos(pi * 2 * num_periods * step / decay_steps)) 

826 decayed = (alpha + linear_decay) * cosine_decay + beta 

827 return initial_learning_rate * decayed 

828 ``` 

829 

830 Example usage: 

831 ```python 

832 decay_steps = 1000 

833 lr_decayed_fn = ( 

834 tf.keras.experimental.LinearCosineDecay( 

835 initial_learning_rate, decay_steps)) 

836 ``` 
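
  As an illustrative end-point check (concrete numbers, defaults
  `num_periods=0.5`, `alpha=0.0`, `beta=0.001`): at `step == decay_steps`
  both the linear and cosine terms vanish and only `beta` remains:

  ```python
  lr_decayed_fn = tf.keras.experimental.LinearCosineDecay(0.1, 1000)
  print(float(lr_decayed_fn(1000)))  # ≈ 0.1 * 0.001 = 1e-4
  ```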

837 

838 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` 

839 as the learning rate. The learning rate schedule is also serializable and 

840 deserializable using `tf.keras.optimizers.schedules.serialize` and 

841 `tf.keras.optimizers.schedules.deserialize`. 

842 

843 Returns: 

844 A 1-arg callable learning rate schedule that takes the current optimizer 

845 step and outputs the decayed learning rate, a scalar `Tensor` of the same 

846 type as `initial_learning_rate`. 

847 """ 

848 

849 def __init__( 

850 self, 

851 initial_learning_rate, 

852 decay_steps, 

853 num_periods=0.5, 

854 alpha=0.0, 

855 beta=0.001, 

856 name=None): 

857 """Applies linear cosine decay to the learning rate. 

858 

859 Args: 

860 initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python 

861 number. The initial learning rate. 

862 decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. 

863 Number of steps to decay over. 

864 num_periods: Number of periods in the cosine part of the decay. 

865 See computation above. 

866 alpha: See computation above. 

867 beta: See computation above. 

868 name: String. Optional name of the operation. Defaults to 

869 'LinearCosineDecay'. 

870 """ 

871 super(LinearCosineDecay, self).__init__() 

872 

873 self.initial_learning_rate = initial_learning_rate 

874 self.decay_steps = decay_steps 

875 self.num_periods = num_periods 

876 self.alpha = alpha 

877 self.beta = beta 

878 self.name = name 

879 

880 def __call__(self, step): 

881 with ops.name_scope_v2(self.name or "LinearCosineDecay") as name: 

882 initial_learning_rate = ( 

883 tensor_conversion.convert_to_tensor_v2_with_dispatch( 

884 self.initial_learning_rate, name="initial_learning_rate" 

885 ) 

886 ) 

887 dtype = initial_learning_rate.dtype 

888 decay_steps = math_ops.cast(self.decay_steps, dtype) 

889 num_periods = math_ops.cast(self.num_periods, dtype) 

890 alpha = math_ops.cast(self.alpha, dtype) 

891 beta = math_ops.cast(self.beta, dtype) 

892 

893 global_step_recomp = math_ops.cast(step, dtype) 

894 global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps) 

895 linear_decayed = (decay_steps - global_step_recomp) / decay_steps 

896 completed_fraction = global_step_recomp / decay_steps 

897 fraction = 2.0 * num_periods * completed_fraction 

898 cosine_decayed = 0.5 * ( 

899 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction)) 

900 

901 linear_cosine_decayed = (alpha + linear_decayed) * cosine_decayed + beta 

902 return math_ops.multiply(initial_learning_rate, linear_cosine_decayed, 

903 name=name) 

904 

905 def get_config(self): 

906 return { 

907 "initial_learning_rate": self.initial_learning_rate, 

908 "decay_steps": self.decay_steps, 

909 "num_periods": self.num_periods, 

910 "alpha": self.alpha, 

911 "beta": self.beta, 

912 "name": self.name 

913 } 

914 

915 

916# Note: this code is still used by V1 APIs. 

917class NoisyLinearCosineDecay(LearningRateSchedule): 

918 """A LearningRateSchedule that uses a noisy linear cosine decay schedule. 

919 

920 See [Bello et al., ICML2017] Neural Optimizer Search with RL. 

921 https://arxiv.org/abs/1709.07417 

922 

923 For the idea of warm starts here controlled by `num_periods`, 

924 see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent 

925 with Warm Restarts. https://arxiv.org/abs/1608.03983 

926 

927 Note that linear cosine decay is more aggressive than cosine decay and 

928 larger initial learning rates can typically be used. 

929 

930 When training a model, it is often recommended to lower the learning rate as 

931 the training progresses. This schedule applies a noisy linear cosine decay 

932 function to an optimizer step, given a provided initial learning rate. 

933 It requires a `step` value to compute the decayed learning rate. You can 

934 just pass a TensorFlow variable that you increment at each training step. 

935 

936 The schedule is a 1-arg callable that produces a decayed learning 

937 rate when passed the current optimizer step. This can be useful for changing 

938 the learning rate value across different invocations of optimizer functions. 

939 It is computed as: 

940 

941 ```python 

942 def decayed_learning_rate(step): 

943 step = min(step, decay_steps) 

944 linear_decay = (decay_steps - step) / decay_steps 

945 cosine_decay = 0.5 * ( 

946 1 + cos(pi * 2 * num_periods * step / decay_steps)) 

947 decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta 

948 return initial_learning_rate * decayed 

949 ``` 

950 where `eps_t` is 0-centered Gaussian noise with variance 

951 `initial_variance / (1 + global_step) ** variance_decay`. 

952 

953 Example usage: 

954 ```python 

955 decay_steps = 1000 

956 lr_decayed_fn = ( 

957 tf.keras.experimental.NoisyLinearCosineDecay( 

958 initial_learning_rate, decay_steps)) 

959 ``` 
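
  Because of the noise term, repeated evaluations at the same step differ
  slightly (illustrative numbers; with the defaults `initial_variance=1.0`
  and `variance_decay=0.55` the noise standard deviation at step 100 is
  roughly `(1.0 / 101 ** 0.55) ** 0.5 ≈ 0.28` and shrinks as the step grows):

  ```python
  lr_decayed_fn = tf.keras.experimental.NoisyLinearCosineDecay(0.1, 1000)
  print(float(lr_decayed_fn(500)), float(lr_decayed_fn(500)))  # two noisy values
  ```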

960 

961 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` 

962 as the learning rate. The learning rate schedule is also serializable and 

963 deserializable using `tf.keras.optimizers.schedules.serialize` and 

964 `tf.keras.optimizers.schedules.deserialize`. 

965 

966 Returns: 

967 A 1-arg callable learning rate schedule that takes the current optimizer 

968 step and outputs the decayed learning rate, a scalar `Tensor` of the same 

969 type as `initial_learning_rate`. 

970 """ 

971 

972 def __init__( 

973 self, 

974 initial_learning_rate, 

975 decay_steps, 

976 initial_variance=1.0, 

977 variance_decay=0.55, 

978 num_periods=0.5, 

979 alpha=0.0, 

980 beta=0.001, 

981 name=None): 

982 """Applies noisy linear cosine decay to the learning rate. 

983 

984 Args: 

985 initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python 

986 number. The initial learning rate. 

987 decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. 

988 Number of steps to decay over. 

989 initial_variance: initial variance for the noise. See computation above. 

990 variance_decay: decay for the noise's variance. See computation above. 

991 num_periods: Number of periods in the cosine part of the decay. 

992 See computation above. 

993 alpha: See computation above. 

994 beta: See computation above. 

995 name: String. Optional name of the operation. Defaults to 

996 'NoisyLinearCosineDecay'. 

997 """ 

998 super(NoisyLinearCosineDecay, self).__init__() 

999 

1000 self.initial_learning_rate = initial_learning_rate 

1001 self.decay_steps = decay_steps 

1002 self.initial_variance = initial_variance 

1003 self.variance_decay = variance_decay 

1004 self.num_periods = num_periods 

1005 self.alpha = alpha 

1006 self.beta = beta 

1007 self.name = name 

1008 

1009 def __call__(self, step): 

1010 with ops.name_scope_v2(self.name or "NoisyLinearCosineDecay") as name: 

1011 initial_learning_rate = ( 

1012 tensor_conversion.convert_to_tensor_v2_with_dispatch( 

1013 self.initial_learning_rate, name="initial_learning_rate" 

1014 ) 

1015 ) 

1016 dtype = initial_learning_rate.dtype 

1017 decay_steps = math_ops.cast(self.decay_steps, dtype) 

1018 initial_variance = math_ops.cast(self.initial_variance, dtype) 

1019 variance_decay = math_ops.cast(self.variance_decay, dtype) 

1020 num_periods = math_ops.cast(self.num_periods, dtype) 

1021 alpha = math_ops.cast(self.alpha, dtype) 

1022 beta = math_ops.cast(self.beta, dtype) 

1023 

1024 global_step_recomp = math_ops.cast(step, dtype) 

1025 global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps) 

1026 linear_decayed = (decay_steps - global_step_recomp) / decay_steps 

1027 variance = initial_variance / ( 

1028 math_ops.pow(1.0 + global_step_recomp, variance_decay)) 

1029 std = math_ops.sqrt(variance) 

1030 noisy_linear_decayed = ( 

1031 linear_decayed + random_ops.random_normal( 

1032 linear_decayed.shape, stddev=std)) 

1033 

1034 completed_fraction = global_step_recomp / decay_steps 

1035 fraction = 2.0 * num_periods * completed_fraction 

1036 cosine_decayed = 0.5 * ( 

1037 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction)) 

1038 noisy_linear_cosine_decayed = ( 

1039 (alpha + noisy_linear_decayed) * cosine_decayed + beta) 

1040 

1041 return math_ops.multiply( 

1042 initial_learning_rate, noisy_linear_cosine_decayed, name=name) 

1043 

1044 def get_config(self): 

1045 return { 

1046 "initial_learning_rate": self.initial_learning_rate, 

1047 "decay_steps": self.decay_steps, 

1048 "initial_variance": self.initial_variance, 

1049 "variance_decay": self.variance_decay, 

1050 "num_periods": self.num_periods, 

1051 "alpha": self.alpha, 

1052 "beta": self.beta, 

1053 "name": self.name 

1054 } 

1055 

1056 

1057@keras_export("keras.optimizers.schedules.serialize") 

1058def serialize(learning_rate_schedule): 

1059 return generic_utils.serialize_keras_object(learning_rate_schedule) 

1060 

1061 

1062@keras_export("keras.optimizers.schedules.deserialize") 

1063def deserialize(config, custom_objects=None): 

1064 return generic_utils.deserialize_keras_object( 

1065 config, 

1066 module_objects=globals(), 

1067 custom_objects=custom_objects, 

1068 printable_module_name="decay")
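
# Illustrative round trip (a sketch, not part of this module): a schedule can
# be converted to a config dict with `serialize` and rebuilt with
# `deserialize`, e.g.:
#
#   lr_schedule = ExponentialDecay(0.01, decay_steps=1000, decay_rate=0.9)
#   config = serialize(lr_schedule)
#   restored = deserialize(config)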