Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/keras/src/optimizers/optimizer_v1.py: 17%

402 statements  

coverage.py v7.4.0, created at 2024-01-03 07:57 +0000

1# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14# ============================================================================== 

15 

16 

17"""Legacy v1 optimizer classes. 

18 

19For more examples see the base class `tf.compat.v1.keras.optimizers.Optimizer`. 

20""" 

21 

22import tensorflow.compat.v2 as tf 

23 

24from keras.src import backend 

25 

26 

27class Optimizer: 

28 """Abstract optimizer base class. 

29 

30 Note: this is the parent class of all optimizers, not an actual optimizer 

31 that can be used for training models. 

32 

33 All Keras optimizers support the following keyword arguments: 

34 

35 clipnorm: float >= 0. Gradients will be clipped 

36 when their L2 norm exceeds this value. 

37 clipvalue: float >= 0. Gradients will be clipped 

38 when their absolute value exceeds this value. 

39 """ 

40 

41 def __init__(self, **kwargs): 

42 allowed_kwargs = {"clipnorm", "clipvalue"} 

43 for k in kwargs: 

44 if k not in allowed_kwargs: 

45 raise TypeError( 

46 "Unexpected keyword argument passed to optimizer: " + str(k) 

47 ) 

48 # checks that clipnorm >= 0 and clipvalue >= 0 

49 if kwargs[k] < 0: 

50 raise ValueError(f"Expected {k} >= 0, received: {kwargs[k]}") 

51 self.__dict__.update(kwargs) 

52 self.updates = [] 

53 self.weights = [] 

54 

55 # Set this to False, indicating `apply_gradients` does not take the 

56 # `experimental_aggregate_gradients` argument. 

57 _HAS_AGGREGATE_GRAD = False 

58 

59 def _create_all_weights(self, params): 

60 """Creates and sets all optimizer weights. 

61 

62 Args: 

63 params: list or tuple of `Variable` objects that will be minimized 

64 using this optimizer. 

65 

66 Returns: 

67 Specific weight values that are used in `get_updates` 

68 """ 

69 raise NotImplementedError 

70 

71 def get_updates(self, loss, params): 

72 raise NotImplementedError 

73 

74 def get_gradients(self, loss, params): 

75 """Returns gradients of `loss` with respect to `params`. 

76 

77 Args: 

78 loss: Loss tensor. 

79 params: List of variables. 

80 

81 Returns: 

82 List of gradient tensors. 

83 

84 Raises: 

85 ValueError: In case any gradient cannot be computed (e.g. if 

86 gradient function not implemented). 

87 """ 

88 grads = backend.gradients(loss, params) 

89 if any(g is None for g in grads): 

90 raise ValueError( 

91 "An operation has `None` for gradient. " 

92 "Please make sure that all of your ops have a " 

93 "gradient defined (i.e. are differentiable). " 

94 "Common ops without gradient: " 

95 "backend.argmax, backend.round, backend.eval." 

96 ) 

97 if hasattr(self, "clipnorm"): 

98 grads = [tf.clip_by_norm(g, self.clipnorm) for g in grads] 

99 if hasattr(self, "clipvalue"): 

100 grads = [ 

101 tf.clip_by_value(g, -self.clipvalue, self.clipvalue) 

102 for g in grads 

103 ] 

104 return grads 

105 

106 def set_weights(self, weights): 

107 """Sets the weights of the optimizer, from Numpy arrays. 

108 

109 Should only be called after computing the gradients 

110 (otherwise the optimizer has no weights). 

111 

112 Args: 

113            weights: a list of Numpy arrays. The number of arrays and their

114                shapes must match the number and shapes of the weights of the

115                optimizer (i.e. it should match the output of `get_weights`).

116 

117 Raises: 

118 ValueError: in case of incompatible weight shapes. 

119 """ 

120 params = self.weights 

121 if len(params) != len(weights): 

122 raise ValueError( 

123 "Length of the specified weight list (" 

124 + str(len(weights)) 

125 + ") does not match the number of weights of the optimizer (" 

126 + str(len(params)) 

127 + ")" 

128 ) 

129 weight_value_tuples = [] 

130 param_values = backend.batch_get_value(params) 

131 for pv, p, w in zip(param_values, params, weights): 

132 if pv.shape != w.shape: 

133 raise ValueError( 

134 "Optimizer weight shape " 

135 + str(pv.shape) 

136 + " not compatible with provided weight shape " 

137 + str(w.shape) 

138 ) 

139 weight_value_tuples.append((p, w)) 

140 backend.batch_set_value(weight_value_tuples) 

141 

142 def get_weights(self): 

143 """Returns the current value of the weights of the optimizer. 

144 

145 Returns: 

146 A list of numpy arrays. 

147 """ 

148 return backend.batch_get_value(self.weights) 

149 

150 def get_config(self): 

151 config = {} 

152 if hasattr(self, "clipnorm"): 

153 config["clipnorm"] = self.clipnorm 

154 if hasattr(self, "clipvalue"): 

155 config["clipvalue"] = self.clipvalue 

156 return config 

157 

158 @classmethod 

159 def from_config(cls, config): 

160 return cls(**config) 

161 

162 
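The base class above accepts only `clipnorm` and `clipvalue` and applies them in `get_gradients`: norm clipping first, then value clipping. As a point of reference, the short NumPy sketch below reproduces those two rules outside of TensorFlow; the helper name `clip_like_optimizer_v1` and the use of NumPy are illustrative and not part of this module.

import numpy as np

def clip_like_optimizer_v1(grads, clipnorm=None, clipvalue=None):
    """Per-tensor L2-norm clipping followed by value clipping, as in get_gradients."""
    if clipnorm is not None:
        # Rescale any gradient whose L2 norm exceeds clipnorm (cf. tf.clip_by_norm).
        grads = [
            g * (clipnorm / max(float(np.linalg.norm(g)), clipnorm)) for g in grads
        ]
    if clipvalue is not None:
        # Bound every element to [-clipvalue, clipvalue] (cf. tf.clip_by_value).
        grads = [np.clip(g, -clipvalue, clipvalue) for g in grads]
    return grads

grads = [np.array([3.0, 4.0]), np.array([0.5, -0.2])]
print(clip_like_optimizer_v1(grads, clipnorm=1.0, clipvalue=0.5))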

163class SGD(Optimizer): 

164 """Stochastic gradient descent optimizer. 

165 

166 Includes support for momentum, 

167 learning rate decay, and Nesterov momentum. 

168 

169 Args: 

170 lr: float >= 0. Learning rate. 

171 momentum: float >= 0. Parameter that accelerates SGD in the relevant 

172 direction and dampens oscillations. 

173 decay: float >= 0. Learning rate decay over each update. 

174 nesterov: boolean. Whether to apply Nesterov momentum. 

175 """ 

176 

177 def __init__( 

178 self, lr=0.01, momentum=0.0, decay=0.0, nesterov=False, **kwargs 

179 ): 

180 super().__init__(**kwargs) 

181 with backend.name_scope(self.__class__.__name__): 

182 self.iterations = backend.variable( 

183 0, dtype="int64", name="iterations" 

184 ) 

185 self.lr = backend.variable(lr, name="lr") 

186 self.momentum = backend.variable(momentum, name="momentum") 

187 self.decay = backend.variable(decay, name="decay") 

188 self.initial_decay = decay 

189 self.nesterov = nesterov 

190 

191 def _create_all_weights(self, params): 

192 shapes = [backend.int_shape(p) for p in params] 

193 moments = [backend.zeros(shape) for shape in shapes] 

194 self.weights = [self.iterations] + moments 

195 return moments 

196 

197 def get_updates(self, loss, params): 

198 grads = self.get_gradients(loss, params) 

199 self.updates = [tf.compat.v1.assign_add(self.iterations, 1)] 

200 

201 lr = self.lr 

202 if self.initial_decay > 0: 

203 lr = lr * ( 

204 1.0 

205 / ( 

206 1.0 

207 + self.decay 

208 * tf.cast(self.iterations, backend.dtype(self.decay)) 

209 ) 

210 ) 

211 # momentum 

212 moments = self._create_all_weights(params) 

213 for p, g, m in zip(params, grads, moments): 

214 v = self.momentum * m - lr * g # velocity 

215 self.updates.append(tf.compat.v1.assign(m, v)) 

216 

217 if self.nesterov: 

218 new_p = p + self.momentum * v - lr * g 

219 else: 

220 new_p = p + v 

221 

222 # Apply constraints. 

223 if getattr(p, "constraint", None) is not None: 

224 new_p = p.constraint(new_p) 

225 

226 self.updates.append(tf.compat.v1.assign(p, new_p)) 

227 return self.updates 

228 

229 def get_config(self): 

230 config = { 

231 "lr": float(backend.get_value(self.lr)), 

232 "momentum": float(backend.get_value(self.momentum)), 

233 "decay": float(backend.get_value(self.decay)), 

234 "nesterov": self.nesterov, 

235 } 

236 base_config = super().get_config() 

237 return dict(list(base_config.items()) + list(config.items())) 

238 

239 
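The momentum update in `SGD.get_updates` above is compact enough to restate as a standalone step. The NumPy helper below (illustrative only, not part of this module) mirrors it, with learning-rate decay omitted.

import numpy as np

def sgd_step(p, g, m, lr=0.01, momentum=0.0, nesterov=False):
    """One SGD parameter update mirroring SGD.get_updates (decay omitted)."""
    v = momentum * m - lr * g  # new velocity, stored back into the moment slot
    if nesterov:
        new_p = p + momentum * v - lr * g
    else:
        new_p = p + v
    return new_p, v

p, g, m = np.array([1.0, -2.0]), np.array([0.1, 0.3]), np.zeros(2)
print(sgd_step(p, g, m, lr=0.1, momentum=0.9, nesterov=True))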

240class RMSprop(Optimizer): 

241 """RMSProp optimizer. 

242 

243 It is recommended to leave the parameters of this optimizer 

244 at their default values 

245 (except the learning rate, which can be freely tuned). 

246 

247 Args: 

248 lr: float >= 0. Learning rate. 

249        rho: float >= 0. Decay factor for the moving average of squared gradients.

250 epsilon: float >= 0. Fuzz factor. 

251 If `None`, defaults to `backend.epsilon()`. 

252 decay: float >= 0. Learning rate decay over each update. 

253 """ 

254 

255 def __init__(self, lr=0.001, rho=0.9, epsilon=None, decay=0.0, **kwargs): 

256 super().__init__(**kwargs) 

257 with backend.name_scope(self.__class__.__name__): 

258 self.lr = backend.variable(lr, name="lr") 

259 self.rho = backend.variable(rho, name="rho") 

260 self.decay = backend.variable(decay, name="decay") 

261 self.iterations = backend.variable( 

262 0, dtype="int64", name="iterations" 

263 ) 

264 if epsilon is None: 

265 epsilon = backend.epsilon() 

266 self.epsilon = epsilon 

267 self.initial_decay = decay 

268 

269 def _create_all_weights(self, params): 

270 accumulators = [ 

271 backend.zeros(backend.int_shape(p), dtype=backend.dtype(p)) 

272 for p in params 

273 ] 

274 self.weights = accumulators 

275 return accumulators 

276 

277 def get_updates(self, loss, params): 

278 grads = self.get_gradients(loss, params) 

279 accumulators = self._create_all_weights(params) 

280 self.updates = [tf.compat.v1.assign_add(self.iterations, 1)] 

281 

282 lr = self.lr 

283 if self.initial_decay > 0: 

284 lr = lr * ( 

285 1.0 

286 / ( 

287 1.0 

288 + self.decay 

289 * tf.cast(self.iterations, backend.dtype(self.decay)) 

290 ) 

291 ) 

292 

293 for p, g, a in zip(params, grads, accumulators): 

294 # update accumulator 

295 new_a = self.rho * a + (1.0 - self.rho) * tf.square(g) 

296 self.updates.append(tf.compat.v1.assign(a, new_a)) 

297 new_p = p - lr * g / (backend.sqrt(new_a) + self.epsilon) 

298 

299 # Apply constraints. 

300 if getattr(p, "constraint", None) is not None: 

301 new_p = p.constraint(new_p) 

302 

303 self.updates.append(tf.compat.v1.assign(p, new_p)) 

304 return self.updates 

305 

306 def get_config(self): 

307 config = { 

308 "lr": float(backend.get_value(self.lr)), 

309 "rho": float(backend.get_value(self.rho)), 

310 "decay": float(backend.get_value(self.decay)), 

311 "epsilon": self.epsilon, 

312 } 

313 base_config = super().get_config() 

314 return dict(list(base_config.items()) + list(config.items())) 

315 

316 
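For reference, the accumulator logic in `RMSprop.get_updates` above as a standalone NumPy step (learning-rate decay omitted; the helper name is illustrative and not part of this module).

import numpy as np

def rmsprop_step(p, g, a, lr=0.001, rho=0.9, epsilon=1e-7):
    """One RMSprop update: decayed average of squared gradients, then a scaled step."""
    new_a = rho * a + (1.0 - rho) * np.square(g)
    new_p = p - lr * g / (np.sqrt(new_a) + epsilon)
    return new_p, new_a

p, g, a = np.array([1.0, -2.0]), np.array([0.1, 0.3]), np.zeros(2)
print(rmsprop_step(p, g, a))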

317class Adagrad(Optimizer): 

318 """Adagrad optimizer. 

319 

320 Adagrad is an optimizer with parameter-specific learning rates, 

321 which are adapted relative to how frequently a parameter gets 

322 updated during training. The more updates a parameter receives, 

323 the smaller the updates. 

324 

325 It is recommended to leave the parameters of this optimizer 

326 at their default values. 

327 

328    Args:

329 lr: float >= 0. Initial learning rate. 

330 epsilon: float >= 0. If `None`, defaults to `backend.epsilon()`. 

331 decay: float >= 0. Learning rate decay over each update. 

332 

333    References:

334 - [Adaptive Subgradient Methods for Online Learning and Stochastic 

335 Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) 

336 """ 

337 

338 def __init__(self, lr=0.01, epsilon=None, decay=0.0, **kwargs): 

339 super().__init__(**kwargs) 

340 with backend.name_scope(self.__class__.__name__): 

341 self.lr = backend.variable(lr, name="lr") 

342 self.decay = backend.variable(decay, name="decay") 

343 self.iterations = backend.variable( 

344 0, dtype="int64", name="iterations" 

345 ) 

346 if epsilon is None: 

347 epsilon = backend.epsilon() 

348 self.epsilon = epsilon 

349 self.initial_decay = decay 

350 

351 def _create_all_weights(self, params): 

352 shapes = [backend.int_shape(p) for p in params] 

353 accumulators = [backend.zeros(shape) for shape in shapes] 

354 self.weights = accumulators 

355 return accumulators 

356 

357 def get_updates(self, loss, params): 

358 grads = self.get_gradients(loss, params) 

359 accumulators = self._create_all_weights(params) 

360 

361 self.updates = [tf.compat.v1.assign_add(self.iterations, 1)] 

362 

363 lr = self.lr 

364 if self.initial_decay > 0: 

365 lr = lr * ( 

366 1.0 

367 / ( 

368 1.0 

369 + self.decay 

370 * tf.cast(self.iterations, backend.dtype(self.decay)) 

371 ) 

372 ) 

373 

374 for p, g, a in zip(params, grads, accumulators): 

375 new_a = a + tf.square(g) # update accumulator 

376 self.updates.append(tf.compat.v1.assign(a, new_a)) 

377 new_p = p - lr * g / (backend.sqrt(new_a) + self.epsilon) 

378 

379 # Apply constraints. 

380 if getattr(p, "constraint", None) is not None: 

381 new_p = p.constraint(new_p) 

382 

383 self.updates.append(tf.compat.v1.assign(p, new_p)) 

384 return self.updates 

385 

386 def get_config(self): 

387 config = { 

388 "lr": float(backend.get_value(self.lr)), 

389 "decay": float(backend.get_value(self.decay)), 

390 "epsilon": self.epsilon, 

391 } 

392 base_config = super().get_config() 

393 return dict(list(base_config.items()) + list(config.items())) 

394 

395 
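The Adagrad rule above differs from RMSprop only in that the squared gradients accumulate without decay, so the effective step size shrinks monotonically. A standalone NumPy sketch (illustrative, not part of this module; learning-rate decay omitted):

import numpy as np

def adagrad_step(p, g, a, lr=0.01, epsilon=1e-7):
    """One Adagrad update mirroring Adagrad.get_updates (decay omitted)."""
    new_a = a + np.square(g)  # squared gradients accumulate without decay
    new_p = p - lr * g / (np.sqrt(new_a) + epsilon)
    return new_p, new_a

p, g, a = np.array([1.0, -2.0]), np.array([0.1, 0.3]), np.zeros(2)
print(adagrad_step(p, g, a))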

396class Adadelta(Optimizer): 

397 """Adadelta optimizer. 

398 

399 Adadelta is a more robust extension of Adagrad 

400 that adapts learning rates based on a moving window of gradient updates, 

401 instead of accumulating all past gradients. This way, Adadelta continues 

402 learning even when many updates have been done. Compared to Adagrad, in the 

403 original version of Adadelta you don't have to set an initial learning 

404 rate. In this version, initial learning rate and decay factor can 

405 be set, as in most other Keras optimizers. 

406 

407 It is recommended to leave the parameters of this optimizer 

408 at their default values. 

409 

410    Args:

411 lr: float >= 0. Initial learning rate, defaults to 1. 

412 It is recommended to leave it at the default value. 

413 rho: float >= 0. Adadelta decay factor, corresponding to fraction of 

414 gradient to keep at each time step. 

415 epsilon: float >= 0. Fuzz factor. 

416 If `None`, defaults to `backend.epsilon()`. 

417 decay: float >= 0. Initial learning rate decay. 

418 

419 References: 

420 - [Adadelta - an adaptive learning rate 

421 method](http://arxiv.org/abs/1212.5701) 

422 """ 

423 

424 def __init__(self, lr=1.0, rho=0.95, epsilon=None, decay=0.0, **kwargs): 

425 super().__init__(**kwargs) 

426 with backend.name_scope(self.__class__.__name__): 

427 self.lr = backend.variable(lr, name="lr") 

428 self.decay = backend.variable(decay, name="decay") 

429 self.iterations = backend.variable( 

430 0, dtype="int64", name="iterations" 

431 ) 

432 if epsilon is None: 

433 epsilon = backend.epsilon() 

434 self.rho = rho 

435 self.epsilon = epsilon 

436 self.initial_decay = decay 

437 

438 def _create_all_weights(self, params): 

439 shapes = [backend.int_shape(p) for p in params] 

440 accumulators = [backend.zeros(shape) for shape in shapes] 

441 delta_accumulators = [backend.zeros(shape) for shape in shapes] 

442 self.weights = accumulators + delta_accumulators 

443 return accumulators, delta_accumulators 

444 

445 def get_updates(self, loss, params): 

446 grads = self.get_gradients(loss, params) 

447 self.updates = [tf.compat.v1.assign_add(self.iterations, 1)] 

448 accumulators, delta_accumulators = self._create_all_weights(params) 

449 

450 lr = self.lr 

451 if self.initial_decay > 0: 

452 lr = lr * ( 

453 1.0 

454 / ( 

455 1.0 

456 + self.decay 

457 * tf.cast(self.iterations, backend.dtype(self.decay)) 

458 ) 

459 ) 

460 

461 for p, g, a, d_a in zip( 

462 params, grads, accumulators, delta_accumulators 

463 ): 

464 # update accumulator 

465 new_a = self.rho * a + (1.0 - self.rho) * tf.square(g) 

466 self.updates.append(tf.compat.v1.assign(a, new_a)) 

467 

468 # use the new accumulator and the *old* delta_accumulator 

469 update = ( 

470 g 

471 * backend.sqrt(d_a + self.epsilon) 

472 / backend.sqrt(new_a + self.epsilon) 

473 ) 

474 new_p = p - lr * update 

475 

476 # Apply constraints. 

477 if getattr(p, "constraint", None) is not None: 

478 new_p = p.constraint(new_p) 

479 

480 self.updates.append(tf.compat.v1.assign(p, new_p)) 

481 

482 # update delta_accumulator 

483 new_d_a = self.rho * d_a + (1 - self.rho) * tf.square(update) 

484 self.updates.append(tf.compat.v1.assign(d_a, new_d_a)) 

485 return self.updates 

486 

487 def get_config(self): 

488 config = { 

489 "lr": float(backend.get_value(self.lr)), 

490 "rho": self.rho, 

491 "decay": float(backend.get_value(self.decay)), 

492 "epsilon": self.epsilon, 

493 } 

494 base_config = super().get_config() 

495 return dict(list(base_config.items()) + list(config.items())) 

496 

497 
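Adadelta, as implemented above, keeps two decayed accumulators, one for squared gradients and one for squared updates, and scales each step by their ratio. A standalone NumPy sketch of one update (illustrative, not part of this module; learning-rate decay omitted):

import numpy as np

def adadelta_step(p, g, a, d_a, lr=1.0, rho=0.95, epsilon=1e-7):
    """One Adadelta update mirroring Adadelta.get_updates (decay omitted)."""
    new_a = rho * a + (1.0 - rho) * np.square(g)                    # squared-gradient average
    update = g * np.sqrt(d_a + epsilon) / np.sqrt(new_a + epsilon)  # uses the *old* delta accumulator
    new_p = p - lr * update
    new_d_a = rho * d_a + (1.0 - rho) * np.square(update)           # squared-update average
    return new_p, new_a, new_d_a

p, g = np.array([1.0, -2.0]), np.array([0.1, 0.3])
print(adadelta_step(p, g, np.zeros(2), np.zeros(2)))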

498class Adam(Optimizer): 

499 """Adam optimizer. 

500 

501 Default parameters follow those provided in the original paper. 

502 

503 Args: 

504 lr: float >= 0. Learning rate. 

505 beta_1: float, 0 < beta < 1. Generally close to 1. 

506 beta_2: float, 0 < beta < 1. Generally close to 1. 

507 epsilon: float >= 0. Fuzz factor. 

508 If `None`, defaults to `backend.epsilon()`. 

509 decay: float >= 0. Learning rate decay over each update. 

510 amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm 

511 from the paper "On the Convergence of Adam and Beyond". 

512 """ 

513 

514 def __init__( 

515 self, 

516 lr=0.001, 

517 beta_1=0.9, 

518 beta_2=0.999, 

519 epsilon=None, 

520 decay=0.0, 

521 amsgrad=False, 

522 **kwargs, 

523 ): 

524 super().__init__(**kwargs) 

525 with backend.name_scope(self.__class__.__name__): 

526 self.iterations = backend.variable( 

527 0, dtype="int64", name="iterations" 

528 ) 

529 self.lr = backend.variable(lr, name="lr") 

530 self.beta_1 = backend.variable(beta_1, name="beta_1") 

531 self.beta_2 = backend.variable(beta_2, name="beta_2") 

532 self.decay = backend.variable(decay, name="decay") 

533 if epsilon is None: 

534 epsilon = backend.epsilon() 

535 self.epsilon = epsilon 

536 self.initial_decay = decay 

537 self.amsgrad = amsgrad 

538 

539 def _create_all_weights(self, params): 

540 ms = [ 

541 backend.zeros(backend.int_shape(p), dtype=backend.dtype(p)) 

542 for p in params 

543 ] 

544 vs = [ 

545 backend.zeros(backend.int_shape(p), dtype=backend.dtype(p)) 

546 for p in params 

547 ] 

548 if self.amsgrad: 

549 vhats = [ 

550 backend.zeros(backend.int_shape(p), dtype=backend.dtype(p)) 

551 for p in params 

552 ] 

553 else: 

554 vhats = [backend.zeros(1) for _ in params] 

555 self.weights = [self.iterations] + ms + vs + vhats 

556 return ms, vs, vhats 

557 

558 def get_updates(self, loss, params): 

559 grads = self.get_gradients(loss, params) 

560 self.updates = [] 

561 

562 lr = self.lr 

563 if self.initial_decay > 0: 

564 lr = lr * ( 

565 1.0 

566 / ( 

567 1.0 

568 + self.decay 

569 * tf.cast(self.iterations, backend.dtype(self.decay)) 

570 ) 

571 ) 

572 

573 with tf.control_dependencies( 

574 [tf.compat.v1.assign_add(self.iterations, 1)] 

575 ): 

576 t = tf.cast(self.iterations, backend.floatx()) 

577 lr_t = lr * ( 

578 backend.sqrt(1.0 - tf.pow(self.beta_2, t)) 

579 / (1.0 - tf.pow(self.beta_1, t)) 

580 ) 

581 

582 ms, vs, vhats = self._create_all_weights(params) 

583 for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats): 

584 m_t = (self.beta_1 * m) + (1.0 - self.beta_1) * g 

585 v_t = (self.beta_2 * v) + (1.0 - self.beta_2) * tf.square(g) 

586 if self.amsgrad: 

587 vhat_t = tf.maximum(vhat, v_t) 

588 p_t = p - lr_t * m_t / (backend.sqrt(vhat_t) + self.epsilon) 

589 self.updates.append(tf.compat.v1.assign(vhat, vhat_t)) 

590 else: 

591 p_t = p - lr_t * m_t / (backend.sqrt(v_t) + self.epsilon) 

592 

593 self.updates.append(tf.compat.v1.assign(m, m_t)) 

594 self.updates.append(tf.compat.v1.assign(v, v_t)) 

595 new_p = p_t 

596 

597 # Apply constraints. 

598 if getattr(p, "constraint", None) is not None: 

599 new_p = p.constraint(new_p) 

600 

601 self.updates.append(tf.compat.v1.assign(p, new_p)) 

602 return self.updates 

603 

604 def get_config(self): 

605 config = { 

606 "lr": float(backend.get_value(self.lr)), 

607 "beta_1": float(backend.get_value(self.beta_1)), 

608 "beta_2": float(backend.get_value(self.beta_2)), 

609 "decay": float(backend.get_value(self.decay)), 

610 "epsilon": self.epsilon, 

611 "amsgrad": self.amsgrad, 

612 } 

613 base_config = super().get_config() 

614 return dict(list(base_config.items()) + list(config.items())) 

615 

616 
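The Adam update above folds both bias corrections into `lr_t` rather than correcting `m_t` and `v_t` directly. The standalone NumPy step below mirrors that formulation (illustrative, not part of this module; learning-rate decay and the AMSGrad branch omitted; `t` counts from 1).

import numpy as np

def adam_step(p, g, m, v, t, lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-7):
    """One Adam update mirroring Adam.get_updates (decay and amsgrad omitted)."""
    lr_t = lr * np.sqrt(1.0 - beta_2 ** t) / (1.0 - beta_1 ** t)  # bias-corrected step size
    m_t = beta_1 * m + (1.0 - beta_1) * g
    v_t = beta_2 * v + (1.0 - beta_2) * np.square(g)
    new_p = p - lr_t * m_t / (np.sqrt(v_t) + epsilon)
    return new_p, m_t, v_t

p, g = np.array([1.0, -2.0]), np.array([0.1, 0.3])
print(adam_step(p, g, np.zeros(2), np.zeros(2), t=1))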

617class Adamax(Optimizer): 

618 """Adamax optimizer from Adam paper's Section 7. 

619 

620 It is a variant of Adam based on the infinity norm. 

621 Default parameters follow those provided in the paper. 

622 

623 Args: 

624 lr: float >= 0. Learning rate. 

625 beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1. 

626 epsilon: float >= 0. Fuzz factor. 

627 If `None`, defaults to `backend.epsilon()`. 

628 decay: float >= 0. Learning rate decay over each update. 

629 """ 

630 

631 def __init__( 

632 self, 

633 lr=0.002, 

634 beta_1=0.9, 

635 beta_2=0.999, 

636 epsilon=None, 

637 decay=0.0, 

638 **kwargs, 

639 ): 

640 super().__init__(**kwargs) 

641 with backend.name_scope(self.__class__.__name__): 

642 self.iterations = backend.variable( 

643 0, dtype="int64", name="iterations" 

644 ) 

645 self.lr = backend.variable(lr, name="lr") 

646 self.beta_1 = backend.variable(beta_1, name="beta_1") 

647 self.beta_2 = backend.variable(beta_2, name="beta_2") 

648 self.decay = backend.variable(decay, name="decay") 

649 if epsilon is None: 

650 epsilon = backend.epsilon() 

651 self.epsilon = epsilon 

652 self.initial_decay = decay 

653 

654 def _create_all_weights(self, params): 

655 

656 shapes = [backend.int_shape(p) for p in params] 

657 # zero init of 1st moment 

658 ms = [backend.zeros(shape) for shape in shapes] 

659 # zero init of exponentially weighted infinity norm 

660 us = [backend.zeros(shape) for shape in shapes] 

661 self.weights = [self.iterations] + ms + us 

662 return ms, us 

663 

664 def get_updates(self, loss, params): 

665 grads = self.get_gradients(loss, params) 

666 self.updates = [] 

667 

668 lr = self.lr 

669 if self.initial_decay > 0: 

670 lr = lr * ( 

671 1.0 

672 / ( 

673 1.0 

674 + self.decay 

675 * tf.cast(self.iterations, backend.dtype(self.decay)) 

676 ) 

677 ) 

678 

679 with tf.control_dependencies( 

680 [tf.compat.v1.assign_add(self.iterations, 1)] 

681 ): 

682 t = tf.cast(self.iterations, backend.floatx()) 

683 lr_t = lr / (1.0 - tf.pow(self.beta_1, t)) 

684 

685 ms, us = self._create_all_weights(params) 

686 

687 for p, g, m, u in zip(params, grads, ms, us): 

688 

689 m_t = (self.beta_1 * m) + (1.0 - self.beta_1) * g 

690 u_t = tf.maximum(self.beta_2 * u, tf.abs(g)) 

691 p_t = p - lr_t * m_t / (u_t + self.epsilon) 

692 

693 self.updates.append(tf.compat.v1.assign(m, m_t)) 

694 self.updates.append(tf.compat.v1.assign(u, u_t)) 

695 new_p = p_t 

696 

697 # Apply constraints. 

698 if getattr(p, "constraint", None) is not None: 

699 new_p = p.constraint(new_p) 

700 

701 self.updates.append(tf.compat.v1.assign(p, new_p)) 

702 return self.updates 

703 

704 def get_config(self): 

705 config = { 

706 "lr": float(backend.get_value(self.lr)), 

707 "beta_1": float(backend.get_value(self.beta_1)), 

708 "beta_2": float(backend.get_value(self.beta_2)), 

709 "decay": float(backend.get_value(self.decay)), 

710 "epsilon": self.epsilon, 

711 } 

712 base_config = super().get_config() 

713 return dict(list(base_config.items()) + list(config.items())) 

714 

715 
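Adamax replaces Adam's second moment with an exponentially weighted infinity norm, so only the first moment needs bias correction. A standalone NumPy sketch of the update above (illustrative, not part of this module; learning-rate decay omitted; `t` counts from 1):

import numpy as np

def adamax_step(p, g, m, u, t, lr=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-7):
    """One Adamax update mirroring Adamax.get_updates (decay omitted)."""
    lr_t = lr / (1.0 - beta_1 ** t)            # only the first moment is bias-corrected
    m_t = beta_1 * m + (1.0 - beta_1) * g
    u_t = np.maximum(beta_2 * u, np.abs(g))    # exponentially weighted infinity norm
    new_p = p - lr_t * m_t / (u_t + epsilon)
    return new_p, m_t, u_t

p, g = np.array([1.0, -2.0]), np.array([0.1, 0.3])
print(adamax_step(p, g, np.zeros(2), np.zeros(2), t=1))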

716class Nadam(Optimizer): 

717 """Nesterov Adam optimizer. 

718 

719 Much like Adam is essentially RMSprop with momentum, 

720    Nadam is Adam with Nesterov momentum.

721 

722 Default parameters follow those provided in the paper. 

723 It is recommended to leave the parameters of this optimizer 

724 at their default values. 

725 

726 Args: 

727 lr: float >= 0. Learning rate. 

728 beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1. 

729 epsilon: float >= 0. Fuzz factor. 

730 If `None`, defaults to `backend.epsilon()`. 

731 """ 

732 

733 def __init__( 

734 self, 

735 lr=0.002, 

736 beta_1=0.9, 

737 beta_2=0.999, 

738 epsilon=None, 

739 schedule_decay=0.004, 

740 **kwargs, 

741 ): 

742 super().__init__(**kwargs) 

743 with backend.name_scope(self.__class__.__name__): 

744 self.iterations = backend.variable( 

745 0, dtype="int64", name="iterations" 

746 ) 

747 self.m_schedule = backend.variable(1.0, name="m_schedule") 

748 self.lr = backend.variable(lr, name="lr") 

749 self.beta_1 = backend.variable(beta_1, name="beta_1") 

750 self.beta_2 = backend.variable(beta_2, name="beta_2") 

751 if epsilon is None: 

752 epsilon = backend.epsilon() 

753 self.epsilon = epsilon 

754 self.schedule_decay = schedule_decay 

755 

756 def _create_all_weights(self, params): 

757 shapes = [backend.int_shape(p) for p in params] 

758 ms = [backend.zeros(shape) for shape in shapes] 

759 vs = [backend.zeros(shape) for shape in shapes] 

760 

761 self.weights = [self.iterations, self.m_schedule] + ms + vs 

762 return ms, vs 

763 

764 def get_updates(self, loss, params): 

765 grads = self.get_gradients(loss, params) 

766 self.updates = [] 

767 

768 with tf.control_dependencies( 

769 [tf.compat.v1.assign_add(self.iterations, 1)] 

770 ): 

771 t = tf.cast(self.iterations, backend.floatx()) 

772 

773 # Due to the recommendations in [2], i.e. warming momentum schedule 

774 momentum_cache_t = self.beta_1 * ( 

775 1.0 

776 - 0.5 

777 * (tf.pow(backend.cast_to_floatx(0.96), t * self.schedule_decay)) 

778 ) 

779 momentum_cache_t_1 = self.beta_1 * ( 

780 1.0 

781 - 0.5 

782 * ( 

783 tf.pow( 

784 backend.cast_to_floatx(0.96), (t + 1) * self.schedule_decay 

785 ) 

786 ) 

787 ) 

788 m_schedule_new = self.m_schedule * momentum_cache_t 

789 m_schedule_next = ( 

790 self.m_schedule * momentum_cache_t * momentum_cache_t_1 

791 ) 

792 self.updates.append((self.m_schedule, m_schedule_new)) 

793 

794 ms, vs = self._create_all_weights(params) 

795 

796 for p, g, m, v in zip(params, grads, ms, vs): 

797 # the following equations given in [1] 

798 g_prime = g / (1.0 - m_schedule_new) 

799 m_t = self.beta_1 * m + (1.0 - self.beta_1) * g 

800 m_t_prime = m_t / (1.0 - m_schedule_next) 

801 v_t = self.beta_2 * v + (1.0 - self.beta_2) * tf.square(g) 

802 v_t_prime = v_t / (1.0 - tf.pow(self.beta_2, t)) 

803 m_t_bar = ( 

804 1.0 - momentum_cache_t 

805 ) * g_prime + momentum_cache_t_1 * m_t_prime 

806 

807 self.updates.append(tf.compat.v1.assign(m, m_t)) 

808 self.updates.append(tf.compat.v1.assign(v, v_t)) 

809 

810 p_t = p - self.lr * m_t_bar / ( 

811 backend.sqrt(v_t_prime) + self.epsilon 

812 ) 

813 new_p = p_t 

814 

815 # Apply constraints. 

816 if getattr(p, "constraint", None) is not None: 

817 new_p = p.constraint(new_p) 

818 

819 self.updates.append(tf.compat.v1.assign(p, new_p)) 

820 return self.updates 

821 

822 def get_config(self): 

823 config = { 

824 "lr": float(backend.get_value(self.lr)), 

825 "beta_1": float(backend.get_value(self.beta_1)), 

826 "beta_2": float(backend.get_value(self.beta_2)), 

827 "epsilon": self.epsilon, 

828 "schedule_decay": self.schedule_decay, 

829 } 

830 base_config = super().get_config() 

831 return dict(list(base_config.items()) + list(config.items())) 

832 

833 
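The Nadam update above combines Adam-style moment estimates with a warming momentum schedule (the `0.96 ** (t * schedule_decay)` terms). The NumPy sketch below restates one step of it; the helper is illustrative and not part of this module, `t` counts from 1, and `m_schedule` starts at 1.0.

import numpy as np

def nadam_step(p, g, m, v, m_schedule, t, lr=0.002, beta_1=0.9,
               beta_2=0.999, epsilon=1e-7, schedule_decay=0.004):
    """One Nadam update mirroring Nadam.get_updates."""
    mu_t = beta_1 * (1.0 - 0.5 * 0.96 ** (t * schedule_decay))        # warming schedule
    mu_t_next = beta_1 * (1.0 - 0.5 * 0.96 ** ((t + 1) * schedule_decay))
    m_schedule_new = m_schedule * mu_t
    m_schedule_next = m_schedule_new * mu_t_next
    g_prime = g / (1.0 - m_schedule_new)
    m_t = beta_1 * m + (1.0 - beta_1) * g
    m_t_prime = m_t / (1.0 - m_schedule_next)
    v_t = beta_2 * v + (1.0 - beta_2) * np.square(g)
    v_t_prime = v_t / (1.0 - beta_2 ** t)
    m_t_bar = (1.0 - mu_t) * g_prime + mu_t_next * m_t_prime
    new_p = p - lr * m_t_bar / (np.sqrt(v_t_prime) + epsilon)
    return new_p, m_t, v_t, m_schedule_new

p, g = np.array([1.0, -2.0]), np.array([0.1, 0.3])
print(nadam_step(p, g, np.zeros(2), np.zeros(2), m_schedule=1.0, t=1))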

834class TFOptimizer(Optimizer, tf.__internal__.tracking.Trackable): 

835 """Wrapper class for native TensorFlow optimizers.""" 

836 

837 def __init__(self, optimizer, iterations=None): 

838 self.optimizer = optimizer 

839 self._track_trackable(optimizer, name="optimizer") 

840 if iterations is None: 

841 with backend.name_scope(self.__class__.__name__): 

842 self.iterations = backend.variable( 

843 0, dtype="int64", name="iterations" 

844 ) 

845 else: 

846 self.iterations = iterations 

847 self._track_trackable(self.iterations, name="global_step") 

848 

849 def _clip_gradients(self, grads): 

850 """Clip gradients according to the clipnorm and clipvalue attributes.""" 

851 # TFOptimizer wrapper has no gradient clipping options. 

852 return grads 

853 

854 def minimize(self, loss, var_list, grad_loss=None, tape=None): 

855 """Mimics the `OptimizerV2.minimize` API.""" 

856 if not callable(loss) and tape is None: 

857 raise ValueError( 

858 "`tape` is required when a `Tensor` loss is passed." 

859 ) 

860 tape = tape if tape is not None else tf.GradientTape() 

861 

862 if callable(loss): 

863 with tape: 

864 if not callable(var_list): 

865 tape.watch(var_list) 

866 loss = loss() 

867 if callable(var_list): 

868 var_list = var_list() 

869 

870 var_list = tf.nest.flatten(var_list) 

871 if var_list: 

872 grads = tape.gradient(loss, var_list, grad_loss) 

873 grads_and_vars = list(zip(grads, var_list)) 

874 self.apply_gradients(grads_and_vars) 

875 

876 def apply_gradients(self, grads_and_vars): 

877 self.optimizer.apply_gradients( 

878 grads_and_vars, global_step=self.iterations 

879 ) 

880 

881 def get_grads(self, loss, params): 

882 return self.optimizer.compute_gradients(loss, params) 

883 

884 def get_updates(self, loss, params): 

885 if tf.distribute.has_strategy(): 

886 self.updates = [] 

887 

888 if not params: 

889 # After the model vars have been created, the second call to 

890 # get_updates is called with params as an empty list. This 

891 # ensures that we call compute_gradients with params=None. 

892 grads = self.optimizer.compute_gradients(loss) 

893 else: 

894 grads = self.optimizer.compute_gradients(loss, params) 

895 global_step = tf.compat.v1.train.get_global_step() 

896 opt_update = self.optimizer.apply_gradients(grads, global_step) 

897 else: 

898 if not params: 

899 self.updates = [tf.compat.v1.assign_add(self.iterations, 1)] 

900 return self.updates 

901 

902 # Updates list starts out empty because the iterations variable is 

903 # incremented in optimizer.apply_gradients() 

904 self.updates = [] 

905 grads = self.optimizer.compute_gradients(loss, params) 

906 opt_update = self.optimizer.apply_gradients( 

907 grads, global_step=self.iterations 

908 ) 

909 

910 self.updates.append(opt_update) 

911 return self.updates 

912 

913 @property 

914 def weights(self): 

915 raise NotImplementedError 

916 

917 def get_config(self): 

918 raise NotImplementedError 

919 

920 def from_config(self, config): 

921 raise NotImplementedError 

922 

923 
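A hedged usage sketch for the wrapper above. It assumes TF2 with eager execution, that this module is importable as `keras.src.optimizers.optimizer_v1` (per the coverage header), and that a `tf.compat.v1.train` optimizer behaves as expected under eager mode; the toy variable and loss are illustrative only.

import tensorflow.compat.v2 as tf

from keras.src.optimizers.optimizer_v1 import TFOptimizer

var = tf.Variable(3.0)
native = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)
wrapped = TFOptimizer(native)

# `minimize` accepts a callable loss: gradients are taken with a GradientTape
# and forwarded to the wrapped optimizer's `apply_gradients`.
wrapped.minimize(lambda: tf.square(var - 1.0), var_list=[var])
print(float(var))  # moved from 3.0 toward 1.0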

924# Aliases. 

925 

926sgd = SGD 

927rmsprop = RMSprop 

928adagrad = Adagrad 

929adadelta = Adadelta 

930adam = Adam 

931adamax = Adamax 

932nadam = Nadam 

933
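Finally, a small end-to-end sketch (not part of the file): the lowercase aliases are plain rebindings of the classes, and `get_config`/`from_config` round-trip an optimizer's hyperparameters. It assumes TF2 is installed and the import path shown in the coverage header.

from keras.src.optimizers import optimizer_v1

opt = optimizer_v1.sgd(lr=0.1, momentum=0.9, nesterov=True)  # alias of SGD
assert type(opt) is optimizer_v1.SGD

config = opt.get_config()
clone = optimizer_v1.SGD.from_config(config)
print(sorted(config))  # ['decay', 'lr', 'momentum', 'nesterov']
assert isinstance(clone, optimizer_v1.SGD)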