Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/keras/optimizer_v2/learning_rate_schedule.py: 26%

248 statements  

coverage.py v7.4.0, created at 2024-01-03 07:57 +0000

1# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14# ============================================================================== 

15"""Various learning rate decay functions.""" 

16 

17import abc 

18import math 

19 

20from tensorflow.python.framework import constant_op 

21from tensorflow.python.framework import ops 

22from tensorflow.python.framework import tensor_conversion 

23from tensorflow.python.keras.utils import generic_utils 

24from tensorflow.python.ops import array_ops 

25from tensorflow.python.ops import cond 

26from tensorflow.python.ops import control_flow_case 

27from tensorflow.python.ops import math_ops 

28from tensorflow.python.ops import random_ops 

29from tensorflow.python.util import nest 

30from tensorflow.python.util.tf_export import keras_export 

31 

32 

33@keras_export("keras.optimizers.schedules.LearningRateSchedule") 

34class LearningRateSchedule(object): 

35 """The learning rate schedule base class. 

36 

37 You can use a learning rate schedule to modulate how the learning rate 

38 of your optimizer changes over time. 

39 

40 Several built-in learning rate schedules are available, such as 

41 `tf.keras.optimizers.schedules.ExponentialDecay` or 

42 `tf.keras.optimizers.schedules.PiecewiseConstantDecay`: 

43 

44 ```python 

45 lr_schedule = keras.optimizers.schedules.ExponentialDecay( 

46 initial_learning_rate=1e-2, 

47 decay_steps=10000, 

48 decay_rate=0.9) 

49 optimizer = keras.optimizers.SGD(learning_rate=lr_schedule) 

50 ``` 

51 

52 A `LearningRateSchedule` instance can be passed in as the `learning_rate` 

53 argument of any optimizer. 

54 

55 To implement your own schedule object, you should implement the `__call__` 

56 method, which takes a `step` argument (scalar integer tensor, the 

57 current training step count). 

58 As with any other Keras object, you can also optionally 

59 make your object serializable by implementing the `get_config` 

60 and `from_config` methods. 

61 

62 Example: 

63 

64 ```python 

65 class MyLRSchedule(tf.keras.optimizers.schedules.LearningRateSchedule): 

66 

67 def __init__(self, initial_learning_rate): 

68 self.initial_learning_rate = initial_learning_rate 

69 

70 def __call__(self, step): 

71 return self.initial_learning_rate / (step + 1) 

72 

73 optimizer = tf.keras.optimizers.SGD(learning_rate=MyLRSchedule(0.1)) 

74 ``` 
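
  The same schedule can be made serializable by also implementing
  `get_config`; a minimal sketch (the class name and config keys below are
  illustrative only, not part of this module):

  ```python
  class MySerializableLRSchedule(
      tf.keras.optimizers.schedules.LearningRateSchedule):

    def __init__(self, initial_learning_rate):
      self.initial_learning_rate = initial_learning_rate

    def __call__(self, step):
      return self.initial_learning_rate / (step + 1)

    def get_config(self):
      # Everything `from_config` (i.e. cls(**config)) needs to rebuild it.
      return {"initial_learning_rate": self.initial_learning_rate}
  ```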

75 """ 

76 

77 @abc.abstractmethod 

78 def __call__(self, step): 

79 raise NotImplementedError("Learning rate schedule must override __call__") 

80 

81 @abc.abstractmethod 

82 def get_config(self): 

83 raise NotImplementedError("Learning rate schedule must override get_config") 

84 

85 @classmethod 

86 def from_config(cls, config): 

87 """Instantiates a `LearningRateSchedule` from its config. 

88 

89 Args: 

90 config: Output of `get_config()`. 

91 

92 Returns: 

93 A `LearningRateSchedule` instance. 

94 """ 

95 return cls(**config) 

96 

97 

98@keras_export("keras.optimizers.schedules.ExponentialDecay") 

99class ExponentialDecay(LearningRateSchedule): 

100 """A LearningRateSchedule that uses an exponential decay schedule. 

101 

102 When training a model, it is often useful to lower the learning rate as 

103 the training progresses. This schedule applies an exponential decay function 

104 to an optimizer step, given a provided initial learning rate. 

105 

106 The schedule is a 1-arg callable that produces a decayed learning 

107 rate when passed the current optimizer step. This can be useful for changing 

108 the learning rate value across different invocations of optimizer functions. 

109 It is computed as: 

110 

111 ```python 

112 def decayed_learning_rate(step): 

113 return initial_learning_rate * decay_rate ^ (step / decay_steps) 

114 ``` 

115 

116 If the argument `staircase` is `True`, then `step / decay_steps` is 

117 an integer division and the decayed learning rate follows a 

118 staircase function. 

119 

120 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` 

121 as the learning rate. 

122 Example: When fitting a Keras model, decay every 100000 steps with a base 

123 of 0.96: 

124 

125 ```python 

126 initial_learning_rate = 0.1 

127 lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay( 

128 initial_learning_rate, 

129 decay_steps=100000, 

130 decay_rate=0.96, 

131 staircase=True) 

132 

133 model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=lr_schedule), 

134 loss='sparse_categorical_crossentropy', 

135 metrics=['accuracy']) 

136 

137 model.fit(data, labels, epochs=5) 

138 ``` 
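
  As an illustrative check (numbers follow the example above, not this
  module), the schedule can also be called directly; with `staircase=True`
  the rate holds at 0.1 until step 100000:

  ```python
  print(float(lr_schedule(50000)))   # 0.1, since floor(50000 / 100000) == 0
  print(float(lr_schedule(100000)))  # 0.1 * 0.96 == 0.096
  # With staircase=False, step 50000 would instead give 0.1 * 0.96 ** 0.5 ≈ 0.098.
  ```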

139 

140 The learning rate schedule is also serializable and deserializable using 

141 `tf.keras.optimizers.schedules.serialize` and 

142 `tf.keras.optimizers.schedules.deserialize`. 

143 

144 Returns: 

145 A 1-arg callable learning rate schedule that takes the current optimizer 

146 step and outputs the decayed learning rate, a scalar `Tensor` of the same 

147 type as `initial_learning_rate`. 

148 """ 

149 

150 def __init__( 

151 self, 

152 initial_learning_rate, 

153 decay_steps, 

154 decay_rate, 

155 staircase=False, 

156 name=None): 

157 """Applies exponential decay to the learning rate. 

158 

159 Args: 

160 initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a 

161 Python number. The initial learning rate. 

162 decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. 

163 Must be positive. See the decay computation above. 

164 decay_rate: A scalar `float32` or `float64` `Tensor` or a 

165 Python number. The decay rate. 

166 staircase: Boolean. If `True`, decay the learning rate at discrete 

167 intervals. 

168 name: String. Optional name of the operation. Defaults to 

169 'ExponentialDecay'. 

170 """ 

171 super(ExponentialDecay, self).__init__() 

172 self.initial_learning_rate = initial_learning_rate 

173 self.decay_steps = decay_steps 

174 self.decay_rate = decay_rate 

175 self.staircase = staircase 

176 self.name = name 

177 

178 def __call__(self, step): 

179 with ops.name_scope_v2(self.name or "ExponentialDecay") as name: 

180 initial_learning_rate = ( 

181 tensor_conversion.convert_to_tensor_v2_with_dispatch( 

182 self.initial_learning_rate, name="initial_learning_rate" 

183 ) 

184 ) 

185 dtype = initial_learning_rate.dtype 

186 decay_steps = math_ops.cast(self.decay_steps, dtype) 

187 decay_rate = math_ops.cast(self.decay_rate, dtype) 

188 

189 global_step_recomp = math_ops.cast(step, dtype) 

190 p = global_step_recomp / decay_steps 

191 if self.staircase: 

192 p = math_ops.floor(p) 

193 return math_ops.multiply( 

194 initial_learning_rate, math_ops.pow(decay_rate, p), name=name) 

195 

196 def get_config(self): 

197 return { 

198 "initial_learning_rate": self.initial_learning_rate, 

199 "decay_steps": self.decay_steps, 

200 "decay_rate": self.decay_rate, 

201 "staircase": self.staircase, 

202 "name": self.name 

203 } 

204 

205 

206@keras_export("keras.optimizers.schedules.PiecewiseConstantDecay") 

207class PiecewiseConstantDecay(LearningRateSchedule): 

208 """A LearningRateSchedule that uses a piecewise constant decay schedule. 

209 

210 The function returns a 1-arg callable to compute the piecewise constant 

211 when passed the current optimizer step. This can be useful for changing the 

212 learning rate value across different invocations of optimizer functions. 

213 

214 Example: use a learning rate that's 1.0 for the first 100001 steps, 0.5 

215 for the next 10000 steps, and 0.1 for any additional steps. 

216 

217 ```python 

218 step = tf.Variable(0, trainable=False) 

219 boundaries = [100000, 110000] 

220 values = [1.0, 0.5, 0.1] 

221 learning_rate_fn = keras.optimizers.schedules.PiecewiseConstantDecay( 

222 boundaries, values) 

223 

224 # Later, whenever we perform an optimization step, we pass in the step. 

225 learning_rate = learning_rate_fn(step) 

226 ``` 
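
  Continuing the example above, an illustrative evaluation at a few steps:

  ```python
  print(float(learning_rate_fn(50000)))   # 1.0  (step <= 100000)
  print(float(learning_rate_fn(105000)))  # 0.5  (100000 < step <= 110000)
  print(float(learning_rate_fn(200000)))  # 0.1  (step > 110000)
  ```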

227 

228 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` 

229 as the learning rate. The learning rate schedule is also serializable and 

230 deserializable using `tf.keras.optimizers.schedules.serialize` and 

231 `tf.keras.optimizers.schedules.deserialize`. 

232 

233 Returns: 

234 A 1-arg callable learning rate schedule that takes the current optimizer 

235 step and outputs the decayed learning rate, a scalar `Tensor` of the same 

236 type as the boundary tensors. 

237 

238 The output of the 1-arg function that takes the `step` 

239 is `values[0]` when `step <= boundaries[0]`, 

240 `values[1]` when `step > boundaries[0]` and `step <= boundaries[1]`, ..., 

241 and `values[-1]` when `step > boundaries[-1]`. 

242 """ 

243 

244 def __init__( 

245 self, 

246 boundaries, 

247 values, 

248 name=None): 

249 """Piecewise constant from boundaries and interval values. 

250 

251 Args: 

252 boundaries: A list of `Tensor`s or `int`s or `float`s with strictly 

253 increasing entries, and with all elements having the same type as the 

254 optimizer step. 

255 values: A list of `Tensor`s or `float`s or `int`s that specifies the 

256 values for the intervals defined by `boundaries`. It should have one 

257 more element than `boundaries`, and all elements should have the same 

258 type. 

259 name: A string. Optional name of the operation. Defaults to 

260 'PiecewiseConstant'. 

261 

262 Raises: 

263 ValueError: if the number of elements in the lists do not match. 

264 """ 

265 super(PiecewiseConstantDecay, self).__init__() 

266 

267 if len(boundaries) != len(values) - 1: 

268 raise ValueError( 

269 "The length of boundaries should be 1 less than the length of values") 

270 

271 self.boundaries = boundaries 

272 self.values = values 

273 self.name = name 

274 

275 def __call__(self, step): 

276 with ops.name_scope_v2(self.name or "PiecewiseConstant"): 

277 boundaries = nest.map_structure( 

278 tensor_conversion.convert_to_tensor_v2_with_dispatch, 

279 nest.flatten(self.boundaries), 

280 ) 

281 values = nest.map_structure( 

282 tensor_conversion.convert_to_tensor_v2_with_dispatch, 

283 nest.flatten(self.values), 

284 ) 

285 x_recomp = tensor_conversion.convert_to_tensor_v2_with_dispatch(step) 

286 for i, b in enumerate(boundaries): 

287 if b.dtype.base_dtype != x_recomp.dtype.base_dtype: 

288 # We cast the boundaries to have the same type as the step 

289 b = math_ops.cast(b, x_recomp.dtype.base_dtype) 

290 boundaries[i] = b 

291 pred_fn_pairs = [] 

292 pred_fn_pairs.append((x_recomp <= boundaries[0], lambda: values[0])) 

293 pred_fn_pairs.append((x_recomp > boundaries[-1], lambda: values[-1])) 

294 for low, high, v in zip(boundaries[:-1], boundaries[1:], values[1:-1]): 

295 # Need to bind v here; can do this with lambda v=v: ... 

296 pred = (x_recomp > low) & (x_recomp <= high) 

297 pred_fn_pairs.append((pred, lambda v=v: v)) 

298 

299 # The default isn't needed here because our conditions are mutually 

300 # exclusive and exhaustive, but tf.case requires it. 

301 default = lambda: values[0] 

302 return control_flow_case.case(pred_fn_pairs, default, exclusive=True) 

303 

304 def get_config(self): 

305 return { 

306 "boundaries": self.boundaries, 

307 "values": self.values, 

308 "name": self.name 

309 } 

310 

311 

312@keras_export("keras.optimizers.schedules.PolynomialDecay") 

313class PolynomialDecay(LearningRateSchedule): 

314 """A LearningRateSchedule that uses a polynomial decay schedule. 

315 

316 It is commonly observed that a monotonically decreasing learning rate, whose 

317 degree of change is carefully chosen, results in a better performing model. 

318 This schedule applies a polynomial decay function to an optimizer step, 

319 given a provided `initial_learning_rate`, to reach an `end_learning_rate` 

320 in the given `decay_steps`. 

321 

322 It requires a `step` value to compute the decayed learning rate. You 

323 can just pass a TensorFlow variable that you increment at each training 

324 step. 

325 

326 The schedule is a 1-arg callable that produces a decayed learning rate 

327 when passed the current optimizer step. This can be useful for changing the 

328 learning rate value across different invocations of optimizer functions. 

329 It is computed as: 

330 

331 ```python 

332 def decayed_learning_rate(step): 

333 step = min(step, decay_steps) 

334 return ((initial_learning_rate - end_learning_rate) * 

335 (1 - step / decay_steps) ^ (power) 

336 ) + end_learning_rate 

337 ``` 

338 

339 If `cycle` is `True`, then a multiple of `decay_steps` is used, the first one 

340 that is bigger than `step`. 

341 

342 ```python 

343 def decayed_learning_rate(step): 

344 decay_steps = decay_steps * ceil(step / decay_steps) 

345 return ((initial_learning_rate - end_learning_rate) * 

346 (1 - step / decay_steps) ^ (power) 

347 ) + end_learning_rate 

348 ``` 

349 

350 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` 

351 as the learning rate. 

352 Example: Fit a model while decaying from 0.1 to 0.01 in 10000 steps using 

353 sqrt (i.e. power=0.5): 

354 

355 ```python 

356 ... 

357 starter_learning_rate = 0.1 

358 end_learning_rate = 0.01 

359 decay_steps = 10000 

360 learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay( 

361 starter_learning_rate, 

362 decay_steps, 

363 end_learning_rate, 

364 power=0.5) 

365 

366 model.compile(optimizer=tf.keras.optimizers.SGD( 

367 learning_rate=learning_rate_fn), 

368 loss='sparse_categorical_crossentropy', 

369 metrics=['accuracy']) 

370 

371 model.fit(data, labels, epochs=5) 

372 ``` 
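
  As a rough sanity check (illustrative numbers, not part of this module),
  the schedule above a quarter of the way through the decay gives:

  ```python
  # (0.1 - 0.01) * (1 - 2500 / 10000) ** 0.5 + 0.01 ≈ 0.0879
  print(float(learning_rate_fn(2500)))
  ```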

373 

374 The learning rate schedule is also serializable and deserializable using 

375 `tf.keras.optimizers.schedules.serialize` and 

376 `tf.keras.optimizers.schedules.deserialize`. 

377 

378 Returns: 

379 A 1-arg callable learning rate schedule that takes the current optimizer 

380 step and outputs the decayed learning rate, a scalar `Tensor` of the same 

381 type as `initial_learning_rate`. 

382 """ 

383 

384 def __init__( 

385 self, 

386 initial_learning_rate, 

387 decay_steps, 

388 end_learning_rate=0.0001, 

389 power=1.0, 

390 cycle=False, 

391 name=None): 

392 """Applies a polynomial decay to the learning rate. 

393 

394 Args: 

395 initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a 

396 Python number. The initial learning rate. 

397 decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. 

398 Must be positive. See the decay computation above. 

399 end_learning_rate: A scalar `float32` or `float64` `Tensor` or a 

400 Python number. The minimal end learning rate. 

401 power: A scalar `float32` or `float64` `Tensor` or a 

402 Python number. The power of the polynomial. Defaults to linear, 1.0. 

403 cycle: A boolean, whether or not it should cycle beyond decay_steps. 

404 name: String. Optional name of the operation. Defaults to 

405 'PolynomialDecay'. 

406 """ 

407 super(PolynomialDecay, self).__init__() 

408 

409 self.initial_learning_rate = initial_learning_rate 

410 self.decay_steps = decay_steps 

411 self.end_learning_rate = end_learning_rate 

412 self.power = power 

413 self.cycle = cycle 

414 self.name = name 

415 

416 def __call__(self, step): 

417 with ops.name_scope_v2(self.name or "PolynomialDecay") as name: 

418 initial_learning_rate = ( 

419 tensor_conversion.convert_to_tensor_v2_with_dispatch( 

420 self.initial_learning_rate, name="initial_learning_rate" 

421 ) 

422 ) 

423 dtype = initial_learning_rate.dtype 

424 end_learning_rate = math_ops.cast(self.end_learning_rate, dtype) 

425 power = math_ops.cast(self.power, dtype) 

426 

427 global_step_recomp = math_ops.cast(step, dtype) 

428 decay_steps_recomp = math_ops.cast(self.decay_steps, dtype) 

429 if self.cycle: 

430 # Find the first multiple of decay_steps that is bigger than 

431 # global_step. If global_step is zero set the multiplier to 1 

432 multiplier = array_ops.where_v2( 

433 math_ops.equal(global_step_recomp, 0), 1.0, 

434 math_ops.ceil(global_step_recomp / self.decay_steps)) 

435 decay_steps_recomp = math_ops.multiply(decay_steps_recomp, multiplier) 

436 else: 

437 # Make sure that the global_step used is not bigger than decay_steps. 

438 global_step_recomp = math_ops.minimum(global_step_recomp, 

439 decay_steps_recomp) 

440 

441 p = math_ops.divide(global_step_recomp, decay_steps_recomp) 

442 return math_ops.add( 

443 math_ops.multiply(initial_learning_rate - end_learning_rate, 

444 math_ops.pow(1 - p, power)), 

445 end_learning_rate, 

446 name=name) 

447 

448 def get_config(self): 

449 return { 

450 "initial_learning_rate": self.initial_learning_rate, 

451 "decay_steps": self.decay_steps, 

452 "end_learning_rate": self.end_learning_rate, 

453 "power": self.power, 

454 "cycle": self.cycle, 

455 "name": self.name 

456 } 

457 

458 

459@keras_export("keras.optimizers.schedules.InverseTimeDecay") 

460class InverseTimeDecay(LearningRateSchedule): 

461 """A LearningRateSchedule that uses an inverse time decay schedule. 

462 

463 When training a model, it is often useful to lower the learning rate as 

464 the training progresses. This schedule applies the inverse decay function 

465 to an optimizer step, given a provided initial learning rate. 

466 It requires a `step` value to compute the decayed learning rate. You can 

467 just pass a TensorFlow variable that you increment at each training step. 

468 

469 The schedule is a 1-arg callable that produces a decayed learning 

470 rate when passed the current optimizer step. This can be useful for changing 

471 the learning rate value across different invocations of optimizer functions. 

472 It is computed as: 

473 

474 ```python 

475 def decayed_learning_rate(step): 

476 return initial_learning_rate / (1 + decay_rate * step / decay_step) 

477 ``` 

478 

479 or, if `staircase` is `True`, as: 

480 

481 ```python 

482 def decayed_learning_rate(step): 

483 return initial_learning_rate / (1 + decay_rate * floor(step / decay_step)) 

484 ``` 

485 

486 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` 

487 as the learning rate. 

488 Example: Fit a Keras model when decaying 1/t with a rate of 0.5: 

489 

490 ```python 

491 ... 

492 initial_learning_rate = 0.1 

493 decay_steps = 1.0 

494 decay_rate = 0.5 

495 learning_rate_fn = keras.optimizers.schedules.InverseTimeDecay( 

496 initial_learning_rate, decay_steps, decay_rate) 

497 

498 model.compile(optimizer=tf.keras.optimizers.SGD( 

499 learning_rate=learning_rate_fn), 

500 loss='sparse_categorical_crossentropy', 

501 metrics=['accuracy']) 

502 

503 model.fit(data, labels, epochs=5) 

504 ``` 
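
  For illustration (values follow the example above), the 1/t decay gives:

  ```python
  # 0.1 / (1 + 0.5 * 4 / 1.0) = 0.1 / 3 ≈ 0.0333 after 4 steps
  print(float(learning_rate_fn(4)))
  ```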

505 

506 Returns: 

507 A 1-arg callable learning rate schedule that takes the current optimizer 

508 step and outputs the decayed learning rate, a scalar `Tensor` of the same 

509 type as `initial_learning_rate`. 

510 """ 

511 

512 def __init__( 

513 self, 

514 initial_learning_rate, 

515 decay_steps, 

516 decay_rate, 

517 staircase=False, 

518 name=None): 

519 """Applies inverse time decay to the initial learning rate. 

520 

521 Args: 

522 initial_learning_rate: A scalar `float32` or `float64` `Tensor` or a 

523 Python number. The initial learning rate. 

524 decay_steps: How often to apply decay. 

525 decay_rate: A Python number. The decay rate. 

526 staircase: Whether to apply decay in a discrete staircase, as opposed to 

527 continuous, fashion. 

528 name: String. Optional name of the operation. Defaults to 

529 'InverseTimeDecay'. 

530 """ 

531 super(InverseTimeDecay, self).__init__() 

532 

533 self.initial_learning_rate = initial_learning_rate 

534 self.decay_steps = decay_steps 

535 self.decay_rate = decay_rate 

536 self.staircase = staircase 

537 self.name = name 

538 

539 def __call__(self, step): 

540 with ops.name_scope_v2(self.name or "InverseTimeDecay") as name: 

541 initial_learning_rate = ( 

542 tensor_conversion.convert_to_tensor_v2_with_dispatch( 

543 self.initial_learning_rate, name="initial_learning_rate" 

544 ) 

545 ) 

546 dtype = initial_learning_rate.dtype 

547 decay_steps = math_ops.cast(self.decay_steps, dtype) 

548 decay_rate = math_ops.cast(self.decay_rate, dtype) 

549 

550 global_step_recomp = math_ops.cast(step, dtype) 

551 p = global_step_recomp / decay_steps 

552 if self.staircase: 

553 p = math_ops.floor(p) 

554 const = math_ops.cast(constant_op.constant(1), dtype) 

555 denom = math_ops.add(const, math_ops.multiply(decay_rate, p)) 

556 return math_ops.divide(initial_learning_rate, denom, name=name) 

557 

558 def get_config(self): 

559 return { 

560 "initial_learning_rate": self.initial_learning_rate, 

561 "decay_steps": self.decay_steps, 

562 "decay_rate": self.decay_rate, 

563 "staircase": self.staircase, 

564 "name": self.name 

565 } 

566 

567 

568@keras_export("keras.optimizers.schedules.CosineDecay", 

569 "keras.experimental.CosineDecay") 

570class CosineDecay(LearningRateSchedule): 

571 """A LearningRateSchedule that uses a cosine decay schedule. 

572 

573 See [Loshchilov & Hutter, ICLR2016](https://arxiv.org/abs/1608.03983), 

574 SGDR: Stochastic Gradient Descent with Warm Restarts. 

575 

576 When training a model, it is often useful to lower the learning rate as 

577 the training progresses. This schedule applies a cosine decay function 

578 to an optimizer step, given a provided initial learning rate. 

579 It requires a `step` value to compute the decayed learning rate. You can 

580 just pass a TensorFlow variable that you increment at each training step. 

581 

582 The schedule is a 1-arg callable that produces a decayed learning 

583 rate when passed the current optimizer step. This can be useful for changing 

584 the learning rate value across different invocations of optimizer functions. 

585 It is computed as: 

586 

587 ```python 

588 def decayed_learning_rate(step): 

589 step = min(step, decay_steps) 

590 cosine_decay = 0.5 * (1 + cos(pi * step / decay_steps)) 

591 decayed = (1 - alpha) * cosine_decay + alpha 

592 return initial_learning_rate * decayed 

593 ``` 

594 

595 Example usage: 

596 ```python 

597 decay_steps = 1000 

598 lr_decayed_fn = tf.keras.optimizers.schedules.CosineDecay( 

599 initial_learning_rate, decay_steps) 

600 ``` 
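
  An illustrative property (concrete numbers, default `alpha=0.0`): halfway
  through `decay_steps` the cosine factor is 0.5, so the rate is half the
  initial value:

  ```python
  lr_decayed_fn = tf.keras.optimizers.schedules.CosineDecay(0.1, 1000)
  print(float(lr_decayed_fn(500)))  # ≈ 0.05
  ```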

601 

602 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` 

603 as the learning rate. The learning rate schedule is also serializable and 

604 deserializable using `tf.keras.optimizers.schedules.serialize` and 

605 `tf.keras.optimizers.schedules.deserialize`. 

606 

607 Returns: 

608 A 1-arg callable learning rate schedule that takes the current optimizer 

609 step and outputs the decayed learning rate, a scalar `Tensor` of the same 

610 type as `initial_learning_rate`. 

611 """ 

612 

613 def __init__( 

614 self, 

615 initial_learning_rate, 

616 decay_steps, 

617 alpha=0.0, 

618 name=None): 

619 """Applies cosine decay to the learning rate. 

620 

621 Args: 

622 initial_learning_rate: A scalar `float32` or `float64` Tensor or a 

623 Python number. The initial learning rate. 

624 decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. 

625 Number of steps to decay over. 

626 alpha: A scalar `float32` or `float64` Tensor or a Python number. 

627 Minimum learning rate value as a fraction of initial_learning_rate. 

628 name: String. Optional name of the operation. Defaults to 'CosineDecay'. 

629 """ 

630 super(CosineDecay, self).__init__() 

631 

632 self.initial_learning_rate = initial_learning_rate 

633 self.decay_steps = decay_steps 

634 self.alpha = alpha 

635 self.name = name 

636 

637 def __call__(self, step): 

638 with ops.name_scope_v2(self.name or "CosineDecay"): 

639 initial_learning_rate = ( 

640 tensor_conversion.convert_to_tensor_v2_with_dispatch( 

641 self.initial_learning_rate, name="initial_learning_rate" 

642 ) 

643 ) 

644 dtype = initial_learning_rate.dtype 

645 decay_steps = math_ops.cast(self.decay_steps, dtype) 

646 

647 global_step_recomp = math_ops.cast(step, dtype) 

648 global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps) 

649 completed_fraction = global_step_recomp / decay_steps 

650 cosine_decayed = 0.5 * (1.0 + math_ops.cos( 

651 constant_op.constant(math.pi) * completed_fraction)) 

652 

653 decayed = (1 - self.alpha) * cosine_decayed + self.alpha 

654 return math_ops.multiply(initial_learning_rate, decayed) 

655 

656 def get_config(self): 

657 return { 

658 "initial_learning_rate": self.initial_learning_rate, 

659 "decay_steps": self.decay_steps, 

660 "alpha": self.alpha, 

661 "name": self.name 

662 } 

663 

664 

665@keras_export("keras.optimizers.schedules.CosineDecayRestarts", 

666 "keras.experimental.CosineDecayRestarts") 

667class CosineDecayRestarts(LearningRateSchedule): 

668 """A LearningRateSchedule that uses a cosine decay schedule with restarts. 

669 

670 See [Loshchilov & Hutter, ICLR2016](https://arxiv.org/abs/1608.03983), 

671 SGDR: Stochastic Gradient Descent with Warm Restarts. 

672 

673 When training a model, it is often useful to lower the learning rate as 

674 the training progresses. This schedule applies a cosine decay function with 

675 restarts to an optimizer step, given a provided initial learning rate. 

676 It requires a `step` value to compute the decayed learning rate. You can 

677 just pass a TensorFlow variable that you increment at each training step. 

678 

679 The schedule is a 1-arg callable that produces a decayed learning 

680 rate when passed the current optimizer step. This can be useful for changing 

681 the learning rate value across different invocations of optimizer functions. 

682 

683 The learning rate multiplier first decays 

684 from 1 to `alpha` for `first_decay_steps` steps. Then, a warm 

685 restart is performed. Each new warm restart runs for `t_mul` times more 

686 steps and with `m_mul` times smaller initial learning rate. 

687 

688 Example usage: 

689 ```python 

690 first_decay_steps = 1000 

691 lr_decayed_fn = ( 

692 tf.keras.optimizers.schedules.CosineDecayRestarts( 

693 initial_learning_rate, 

694 first_decay_steps)) 

695 ``` 
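
  For illustration (concrete numbers, defaults `t_mul=2.0`, `m_mul=1.0`,
  `alpha=0.0`): the rate decays towards zero over the first period and then
  jumps back to the initial value at the restart:

  ```python
  lr_decayed_fn = tf.keras.optimizers.schedules.CosineDecayRestarts(0.1, 1000)
  print(float(lr_decayed_fn(999)))   # close to 0.0 (end of the first period)
  print(float(lr_decayed_fn(1000)))  # 0.1 again (warm restart)
  ```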

696 

697 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` 

698 as the learning rate. The learning rate schedule is also serializable and 

699 deserializable using `tf.keras.optimizers.schedules.serialize` and 

700 `tf.keras.optimizers.schedules.deserialize`. 

701 

702 Returns: 

703 A 1-arg callable learning rate schedule that takes the current optimizer 

704 step and outputs the decayed learning rate, a scalar `Tensor` of the same 

705 type as `initial_learning_rate`. 

706 """ 

707 

708 def __init__( 

709 self, 

710 initial_learning_rate, 

711 first_decay_steps, 

712 t_mul=2.0, 

713 m_mul=1.0, 

714 alpha=0.0, 

715 name=None): 

716 """Applies cosine decay with restarts to the learning rate. 

717 

718 Args: 

719 initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python 

720 number. The initial learning rate. 

721 first_decay_steps: A scalar `int32` or `int64` `Tensor` or a Python 

722 number. Number of steps to decay over. 

723 t_mul: A scalar `float32` or `float64` `Tensor` or a Python number. 

724 Used to derive the number of iterations in the i-th period. 

725 m_mul: A scalar `float32` or `float64` `Tensor` or a Python number. 

726 Used to derive the initial learning rate of the i-th period. 

727 alpha: A scalar `float32` or `float64` Tensor or a Python number. 

728 Minimum learning rate value as a fraction of the initial_learning_rate. 

729 name: String. Optional name of the operation. Defaults to 'SGDRDecay'. 

730 """ 

731 super(CosineDecayRestarts, self).__init__() 

732 

733 self.initial_learning_rate = initial_learning_rate 

734 self.first_decay_steps = first_decay_steps 

735 self._t_mul = t_mul 

736 self._m_mul = m_mul 

737 self.alpha = alpha 

738 self.name = name 

739 

740 def __call__(self, step): 

741 with ops.name_scope_v2(self.name or "SGDRDecay") as name: 

742 initial_learning_rate = ( 

743 tensor_conversion.convert_to_tensor_v2_with_dispatch( 

744 self.initial_learning_rate, name="initial_learning_rate" 

745 ) 

746 ) 

747 dtype = initial_learning_rate.dtype 

748 first_decay_steps = math_ops.cast(self.first_decay_steps, dtype) 

749 alpha = math_ops.cast(self.alpha, dtype) 

750 t_mul = math_ops.cast(self._t_mul, dtype) 

751 m_mul = math_ops.cast(self._m_mul, dtype) 

752 

753 global_step_recomp = math_ops.cast(step, dtype) 

754 completed_fraction = global_step_recomp / first_decay_steps 

755 

756 def compute_step(completed_fraction, geometric=False): 

757 """Helper for `cond` operation.""" 

758 if geometric: 

759 i_restart = math_ops.floor( 

760 math_ops.log(1.0 - completed_fraction * (1.0 - t_mul)) / 

761 math_ops.log(t_mul)) 

762 

763 sum_r = (1.0 - t_mul**i_restart) / (1.0 - t_mul) 

764 completed_fraction = (completed_fraction - sum_r) / t_mul**i_restart 

765 

766 else: 

767 i_restart = math_ops.floor(completed_fraction) 

768 completed_fraction -= i_restart 

769 

770 return i_restart, completed_fraction 

771 

772 i_restart, completed_fraction = cond.cond( 

773 math_ops.equal(t_mul, 1.0), 

774 lambda: compute_step(completed_fraction, geometric=False), 

775 lambda: compute_step(completed_fraction, geometric=True)) 

776 

777 m_fac = m_mul**i_restart 

778 cosine_decayed = 0.5 * m_fac * (1.0 + math_ops.cos( 

779 constant_op.constant(math.pi) * completed_fraction)) 

780 decayed = (1 - alpha) * cosine_decayed + alpha 

781 

782 return math_ops.multiply(initial_learning_rate, decayed, name=name) 

783 

784 def get_config(self): 

785 return { 

786 "initial_learning_rate": self.initial_learning_rate, 

787 "first_decay_steps": self.first_decay_steps, 

788 "t_mul": self._t_mul, 

789 "m_mul": self._m_mul, 

790 "alpha": self.alpha, 

791 "name": self.name 

792 } 

793 

794 

795# Note: this code is still used by V1 APIs. 

796class LinearCosineDecay(LearningRateSchedule): 

797 """A LearningRateSchedule that uses a linear cosine decay schedule. 

798 

799 See [Bello et al., ICML2017] Neural Optimizer Search with RL. 

800 https://arxiv.org/abs/1709.07417 

801 

802 For the idea of warm starts here controlled by `num_periods`, 

803 see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent 

804 with Warm Restarts. https://arxiv.org/abs/1608.03983 

805 

806 Note that linear cosine decay is more aggressive than cosine decay and 

807 larger initial learning rates can typically be used. 

808 

809 When training a model, it is often recommended to lower the learning rate as 

810 the training progresses. This schedule applies a linear cosine decay 

811 function to an optimizer step, given a provided initial learning rate. 

812 It requires a `step` value to compute the decayed learning rate. You can 

813 just pass a TensorFlow variable that you increment at each training step. 

814 

815 The schedule is a 1-arg callable that produces a decayed learning 

816 rate when passed the current optimizer step. This can be useful for changing 

817 the learning rate value across different invocations of optimizer functions. 

818 It is computed as: 

819 

820 ```python 

821 def decayed_learning_rate(step): 

822 step = min(step, decay_steps) 

823 linear_decay = (decay_steps - step) / decay_steps 

824 cosine_decay = 0.5 * ( 

825 1 + cos(pi * 2 * num_periods * step / decay_steps)) 

826 decayed = (alpha + linear_decay) * cosine_decay + beta 

827 return initial_learning_rate * decayed 

828 ``` 

829 

830 Example usage: 

831 ```python 

832 decay_steps = 1000 

833 lr_decayed_fn = ( 

834 tf.keras.experimental.LinearCosineDecay( 

835 initial_learning_rate, decay_steps)) 

836 ``` 
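
  As an illustrative end-point check (concrete numbers, defaults
  `num_periods=0.5`, `alpha=0.0`, `beta=0.001`): at `step == decay_steps`
  both the linear and cosine terms vanish and only `beta` remains:

  ```python
  lr_decayed_fn = tf.keras.experimental.LinearCosineDecay(0.1, 1000)
  print(float(lr_decayed_fn(1000)))  # ≈ 0.1 * 0.001 = 1e-4
  ```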

837 

838 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` 

839 as the learning rate. The learning rate schedule is also serializable and 

840 deserializable using `tf.keras.optimizers.schedules.serialize` and 

841 `tf.keras.optimizers.schedules.deserialize`. 

842 

843 Returns: 

844 A 1-arg callable learning rate schedule that takes the current optimizer 

845 step and outputs the decayed learning rate, a scalar `Tensor` of the same 

846 type as `initial_learning_rate`. 

847 """ 

848 

849 def __init__( 

850 self, 

851 initial_learning_rate, 

852 decay_steps, 

853 num_periods=0.5, 

854 alpha=0.0, 

855 beta=0.001, 

856 name=None): 

857 """Applies linear cosine decay to the learning rate. 

858 

859 Args: 

860 initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python 

861 number. The initial learning rate. 

862 decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. 

863 Number of steps to decay over. 

864 num_periods: Number of periods in the cosine part of the decay. 

865 See computation above. 

866 alpha: See computation above. 

867 beta: See computation above. 

868 name: String. Optional name of the operation. Defaults to 

869 'LinearCosineDecay'. 

870 """ 

871 super(LinearCosineDecay, self).__init__() 

872 

873 self.initial_learning_rate = initial_learning_rate 

874 self.decay_steps = decay_steps 

875 self.num_periods = num_periods 

876 self.alpha = alpha 

877 self.beta = beta 

878 self.name = name 

879 

880 def __call__(self, step): 

881 with ops.name_scope_v2(self.name or "LinearCosineDecay") as name: 

882 initial_learning_rate = ( 

883 tensor_conversion.convert_to_tensor_v2_with_dispatch( 

884 self.initial_learning_rate, name="initial_learning_rate" 

885 ) 

886 ) 

887 dtype = initial_learning_rate.dtype 

888 decay_steps = math_ops.cast(self.decay_steps, dtype) 

889 num_periods = math_ops.cast(self.num_periods, dtype) 

890 alpha = math_ops.cast(self.alpha, dtype) 

891 beta = math_ops.cast(self.beta, dtype) 

892 

893 global_step_recomp = math_ops.cast(step, dtype) 

894 global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps) 

895 linear_decayed = (decay_steps - global_step_recomp) / decay_steps 

896 completed_fraction = global_step_recomp / decay_steps 

897 fraction = 2.0 * num_periods * completed_fraction 

898 cosine_decayed = 0.5 * ( 

899 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction)) 

900 

901 linear_cosine_decayed = (alpha + linear_decayed) * cosine_decayed + beta 

902 return math_ops.multiply(initial_learning_rate, linear_cosine_decayed, 

903 name=name) 

904 

905 def get_config(self): 

906 return { 

907 "initial_learning_rate": self.initial_learning_rate, 

908 "decay_steps": self.decay_steps, 

909 "num_periods": self.num_periods, 

910 "alpha": self.alpha, 

911 "beta": self.beta, 

912 "name": self.name 

913 } 

914 

915 

916# Note: this code is still used by V1 APIs. 

917class NoisyLinearCosineDecay(LearningRateSchedule): 

918 """A LearningRateSchedule that uses a noisy linear cosine decay schedule. 

919 

920 See [Bello et al., ICML2017] Neural Optimizer Search with RL. 

921 https://arxiv.org/abs/1709.07417 

922 

923 For the idea of warm starts here controlled by `num_periods`, 

924 see [Loshchilov & Hutter, ICLR2016] SGDR: Stochastic Gradient Descent 

925 with Warm Restarts. https://arxiv.org/abs/1608.03983 

926 

927 Note that linear cosine decay is more aggressive than cosine decay and 

928 larger initial learning rates can typically be used. 

929 

930 When training a model, it is often recommended to lower the learning rate as 

931 the training progresses. This schedule applies a noisy linear cosine decay 

932 function to an optimizer step, given a provided initial learning rate. 

933 It requires a `step` value to compute the decayed learning rate. You can 

934 just pass a TensorFlow variable that you increment at each training step. 

935 

936 The schedule is a 1-arg callable that produces a decayed learning 

937 rate when passed the current optimizer step. This can be useful for changing 

938 the learning rate value across different invocations of optimizer functions. 

939 It is computed as: 

940 

941 ```python 

942 def decayed_learning_rate(step): 

943 step = min(step, decay_steps) 

944 linear_decay = (decay_steps - step) / decay_steps 

945 cosine_decay = 0.5 * ( 

946 1 + cos(pi * 2 * num_periods * step / decay_steps)) 

947 decayed = (alpha + linear_decay + eps_t) * cosine_decay + beta 

948 return initial_learning_rate * decayed 

949 ``` 

950 where `eps_t` is 0-centered Gaussian noise with variance 

951 `initial_variance / (1 + global_step) ** variance_decay`. 

952 

953 Example usage: 

954 ```python 

955 decay_steps = 1000 

956 lr_decayed_fn = ( 

957 tf.keras.experimental.NoisyLinearCosineDecay( 

958 initial_learning_rate, decay_steps)) 

959 ``` 
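
  Because of the noise term, repeated evaluations at the same step differ
  slightly (illustrative numbers; with the defaults `initial_variance=1.0`
  and `variance_decay=0.55` the noise standard deviation at step 100 is
  roughly `(1.0 / 101 ** 0.55) ** 0.5 ≈ 0.28` and shrinks as the step grows):

  ```python
  lr_decayed_fn = tf.keras.experimental.NoisyLinearCosineDecay(0.1, 1000)
  print(float(lr_decayed_fn(500)), float(lr_decayed_fn(500)))  # two noisy values
  ```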

960 

961 You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` 

962 as the learning rate. The learning rate schedule is also serializable and 

963 deserializable using `tf.keras.optimizers.schedules.serialize` and 

964 `tf.keras.optimizers.schedules.deserialize`. 

965 

966 Returns: 

967 A 1-arg callable learning rate schedule that takes the current optimizer 

968 step and outputs the decayed learning rate, a scalar `Tensor` of the same 

969 type as `initial_learning_rate`. 

970 """ 

971 

972 def __init__( 

973 self, 

974 initial_learning_rate, 

975 decay_steps, 

976 initial_variance=1.0, 

977 variance_decay=0.55, 

978 num_periods=0.5, 

979 alpha=0.0, 

980 beta=0.001, 

981 name=None): 

982 """Applies noisy linear cosine decay to the learning rate. 

983 

984 Args: 

985 initial_learning_rate: A scalar `float32` or `float64` Tensor or a Python 

986 number. The initial learning rate. 

987 decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number. 

988 Number of steps to decay over. 

989 initial_variance: initial variance for the noise. See computation above. 

990 variance_decay: decay for the noise's variance. See computation above. 

991 num_periods: Number of periods in the cosine part of the decay. 

992 See computation above. 

993 alpha: See computation above. 

994 beta: See computation above. 

995 name: String. Optional name of the operation. Defaults to 

996 'NoisyLinearCosineDecay'. 

997 """ 

998 super(NoisyLinearCosineDecay, self).__init__() 

999 

1000 self.initial_learning_rate = initial_learning_rate 

1001 self.decay_steps = decay_steps 

1002 self.initial_variance = initial_variance 

1003 self.variance_decay = variance_decay 

1004 self.num_periods = num_periods 

1005 self.alpha = alpha 

1006 self.beta = beta 

1007 self.name = name 

1008 

1009 def __call__(self, step): 

1010 with ops.name_scope_v2(self.name or "NoisyLinearCosineDecay") as name: 

1011 initial_learning_rate = ( 

1012 tensor_conversion.convert_to_tensor_v2_with_dispatch( 

1013 self.initial_learning_rate, name="initial_learning_rate" 

1014 ) 

1015 ) 

1016 dtype = initial_learning_rate.dtype 

1017 decay_steps = math_ops.cast(self.decay_steps, dtype) 

1018 initial_variance = math_ops.cast(self.initial_variance, dtype) 

1019 variance_decay = math_ops.cast(self.variance_decay, dtype) 

1020 num_periods = math_ops.cast(self.num_periods, dtype) 

1021 alpha = math_ops.cast(self.alpha, dtype) 

1022 beta = math_ops.cast(self.beta, dtype) 

1023 

1024 global_step_recomp = math_ops.cast(step, dtype) 

1025 global_step_recomp = math_ops.minimum(global_step_recomp, decay_steps) 

1026 linear_decayed = (decay_steps - global_step_recomp) / decay_steps 

1027 variance = initial_variance / ( 

1028 math_ops.pow(1.0 + global_step_recomp, variance_decay)) 

1029 std = math_ops.sqrt(variance) 

1030 noisy_linear_decayed = ( 

1031 linear_decayed + random_ops.random_normal( 

1032 linear_decayed.shape, stddev=std)) 

1033 

1034 completed_fraction = global_step_recomp / decay_steps 

1035 fraction = 2.0 * num_periods * completed_fraction 

1036 cosine_decayed = 0.5 * ( 

1037 1.0 + math_ops.cos(constant_op.constant(math.pi) * fraction)) 

1038 noisy_linear_cosine_decayed = ( 

1039 (alpha + noisy_linear_decayed) * cosine_decayed + beta) 

1040 

1041 return math_ops.multiply( 

1042 initial_learning_rate, noisy_linear_cosine_decayed, name=name) 

1043 

1044 def get_config(self): 

1045 return { 

1046 "initial_learning_rate": self.initial_learning_rate, 

1047 "decay_steps": self.decay_steps, 

1048 "initial_variance": self.initial_variance, 

1049 "variance_decay": self.variance_decay, 

1050 "num_periods": self.num_periods, 

1051 "alpha": self.alpha, 

1052 "beta": self.beta, 

1053 "name": self.name 

1054 } 

1055 

1056 

1057@keras_export("keras.optimizers.schedules.serialize") 

1058def serialize(learning_rate_schedule): 

1059 return generic_utils.serialize_keras_object(learning_rate_schedule) 

1060 

1061 

1062@keras_export("keras.optimizers.schedules.deserialize") 

1063def deserialize(config, custom_objects=None): 

1064 return generic_utils.deserialize_keras_object( 

1065 config, 

1066 module_objects=globals(), 

1067 custom_objects=custom_objects, 

1068 printable_module_name="decay")
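
# Illustrative round trip (a sketch, not part of this module): a schedule can
# be converted to a config dict with `serialize` and rebuilt with
# `deserialize`, e.g.:
#
#   lr_schedule = ExponentialDecay(0.01, decay_steps=1000, decay_rate=0.9)
#   config = serialize(lr_schedule)
#   restored = deserialize(config)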