Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/keras/optimizer_v1.py: 18%

410 statements  

coverage.py v7.4.0, created at 2024-01-03 07:57 +0000

# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# pylint: disable=invalid-name
# pylint: disable=g-classes-have-attributes
"""Legacy v1 optimizer classes.

For more examples see the base class `tf.compat.v1.keras.optimizers.Optimizer`.
"""

from tensorflow.python.distribute import distribute_lib
from tensorflow.python.eager import backprop
from tensorflow.python.framework import ops
from tensorflow.python.keras import backend
from tensorflow.python.ops import clip_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.trackable import base as trackable
from tensorflow.python.training import training_util
from tensorflow.python.util import nest


class Optimizer(object):
  """Abstract optimizer base class.

  Note: this is the parent class of all optimizers, not an actual optimizer
  that can be used for training models.

  All Keras optimizers support the following keyword arguments:

      clipnorm: float >= 0. Gradients will be clipped
          when their L2 norm exceeds this value.
      clipvalue: float >= 0. Gradients will be clipped
          when their absolute value exceeds this value.
  """

  def __init__(self, **kwargs):
    allowed_kwargs = {'clipnorm', 'clipvalue'}
    for k in kwargs:
      if k not in allowed_kwargs:
        raise TypeError('Unexpected keyword argument '
                        'passed to optimizer: ' + str(k))
      # checks that clipnorm >= 0 and clipvalue >= 0
      if kwargs[k] < 0:
        raise ValueError('Expected {} >= 0, received: {}'.format(k, kwargs[k]))
    self.__dict__.update(kwargs)
    self.updates = []
    self.weights = []

  # Set this to False, indicating `apply_gradients` does not take the
  # `experimental_aggregate_gradients` argument.
  _HAS_AGGREGATE_GRAD = False

  def _create_all_weights(self, params):
    """Creates and sets all optimizer weights.

    Args:
      params: list or tuple of `Variable` objects that will be minimized
        using this optimizer.

    Returns:
      Specific weight values that are used in `get_updates`.
    """
    raise NotImplementedError

  def get_updates(self, loss, params):
    raise NotImplementedError

  def get_gradients(self, loss, params):
    """Returns gradients of `loss` with respect to `params`.

    Args:
      loss: Loss tensor.
      params: List of variables.

    Returns:
      List of gradient tensors.

    Raises:
      ValueError: In case any gradient cannot be computed (e.g. if the
        gradient function is not implemented).
    """
    grads = backend.gradients(loss, params)
    if any(g is None for g in grads):
      raise ValueError('An operation has `None` for gradient. '
                       'Please make sure that all of your ops have a '
                       'gradient defined (i.e. are differentiable). '
                       'Common ops without gradient: '
                       'backend.argmax, backend.round, backend.eval.')
    if hasattr(self, 'clipnorm'):
      grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
    if hasattr(self, 'clipvalue'):
      grads = [
          clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
          for g in grads
      ]
    return grads

  def set_weights(self, weights):
    """Sets the weights of the optimizer, from Numpy arrays.

    Should only be called after computing the gradients
    (otherwise the optimizer has no weights).

    Args:
      weights: a list of Numpy arrays. The number of arrays and their shapes
        must match those of the weights of the optimizer
        (i.e. it should match the output of `get_weights`).

    Raises:
      ValueError: in case of incompatible weight shapes.
    """
    params = self.weights
    if len(params) != len(weights):
      raise ValueError('Length of the specified weight list (' +
                       str(len(weights)) +
                       ') does not match the number of weights '
                       'of the optimizer (' + str(len(params)) + ')')
    weight_value_tuples = []
    param_values = backend.batch_get_value(params)
    for pv, p, w in zip(param_values, params, weights):
      if pv.shape != w.shape:
        raise ValueError('Optimizer weight shape ' + str(pv.shape) +
                         ' not compatible with '
                         'provided weight shape ' + str(w.shape))
      weight_value_tuples.append((p, w))
    backend.batch_set_value(weight_value_tuples)

  def get_weights(self):
    """Returns the current value of the weights of the optimizer.

    Returns:
      A list of numpy arrays.
    """
    return backend.batch_get_value(self.weights)

  def get_config(self):
    config = {}
    if hasattr(self, 'clipnorm'):
      config['clipnorm'] = self.clipnorm
    if hasattr(self, 'clipvalue'):
      config['clipvalue'] = self.clipvalue
    return config

  @classmethod
  def from_config(cls, config):
    return cls(**config)

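# Illustrative sketch (not part of the original TensorFlow source): what the
# `clipnorm` / `clipvalue` keyword arguments handled by
# `Optimizer.get_gradients` above do to a single gradient, written in pure
# Python for a flat list of floats. `_clip_gradient_example` is a hypothetical
# helper that exists only for exposition.
def _clip_gradient_example(grad, clipnorm=None, clipvalue=None):
  """Mimics clip_ops.clip_by_norm / clip_ops.clip_by_value on a float list."""
  if clipnorm is not None:
    norm = sum(g * g for g in grad) ** 0.5
    if norm > clipnorm:
      # Rescale so the L2 norm equals clipnorm.
      grad = [g * clipnorm / norm for g in grad]
  if clipvalue is not None:
    # Clamp each element to [-clipvalue, clipvalue].
    grad = [max(-clipvalue, min(clipvalue, g)) for g in grad]
  return grad
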

class SGD(Optimizer):
  """Stochastic gradient descent optimizer.

  Includes support for momentum,
  learning rate decay, and Nesterov momentum.

  Args:
    lr: float >= 0. Learning rate.
    momentum: float >= 0. Parameter that accelerates SGD in the relevant
      direction and dampens oscillations.
    decay: float >= 0. Learning rate decay over each update.
    nesterov: boolean. Whether to apply Nesterov momentum.
  """

  def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False, **kwargs):
    super(SGD, self).__init__(**kwargs)
    with backend.name_scope(self.__class__.__name__):
      self.iterations = backend.variable(0, dtype='int64', name='iterations')
      self.lr = backend.variable(lr, name='lr')
      self.momentum = backend.variable(momentum, name='momentum')
      self.decay = backend.variable(decay, name='decay')
    self.initial_decay = decay
    self.nesterov = nesterov

  def _create_all_weights(self, params):
    shapes = [backend.int_shape(p) for p in params]
    moments = [backend.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments
    return moments

  def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations,
                                      backend.dtype(self.decay))))
    # momentum
    moments = self._create_all_weights(params)
    for p, g, m in zip(params, grads, moments):
      v = self.momentum * m - lr * g  # velocity
      self.updates.append(state_ops.assign(m, v))

      if self.nesterov:
        new_p = p + self.momentum * v - lr * g
      else:
        new_p = p + v

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    config = {
        'lr': float(backend.get_value(self.lr)),
        'momentum': float(backend.get_value(self.momentum)),
        'decay': float(backend.get_value(self.decay)),
        'nesterov': self.nesterov
    }
    base_config = super(SGD, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

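# Illustrative sketch (not part of the original TensorFlow source): the scalar
# arithmetic performed by `SGD.get_updates` above for one parameter, including
# the shared `1 / (1 + decay * iterations)` learning-rate decay used by the
# other optimizers in this module. `_sgd_step_example` is a hypothetical
# helper for exposition only.
def _sgd_step_example(p, g, m, lr=0.01, momentum=0.9, decay=0.0, iterations=0,
                      nesterov=False):
  """Returns (new_parameter, new_moment) for a single scalar parameter."""
  if decay > 0:
    lr = lr * (1. / (1. + decay * iterations))
  v = momentum * m - lr * g            # velocity, stored back into the moment
  if nesterov:
    new_p = p + momentum * v - lr * g  # look-ahead (Nesterov) update
  else:
    new_p = p + v
  return new_p, v
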

class RMSprop(Optimizer):
  """RMSProp optimizer.

  It is recommended to leave the parameters of this optimizer
  at their default values
  (except the learning rate, which can be freely tuned).

  Args:
    lr: float >= 0. Learning rate.
    rho: float >= 0.
    epsilon: float >= 0. Fuzz factor.
      If `None`, defaults to `backend.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.
  """

  def __init__(self, lr=0.001, rho=0.9, epsilon=None, decay=0., **kwargs):
    super(RMSprop, self).__init__(**kwargs)
    with backend.name_scope(self.__class__.__name__):
      self.lr = backend.variable(lr, name='lr')
      self.rho = backend.variable(rho, name='rho')
      self.decay = backend.variable(decay, name='decay')
      self.iterations = backend.variable(0, dtype='int64', name='iterations')
    if epsilon is None:
      epsilon = backend.epsilon()
    self.epsilon = epsilon
    self.initial_decay = decay

  def _create_all_weights(self, params):
    accumulators = [
        backend.zeros(backend.int_shape(p), dtype=backend.dtype(p))
        for p in params]
    self.weights = accumulators
    return accumulators

  def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    accumulators = self._create_all_weights(params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations,
                                      backend.dtype(self.decay))))

    for p, g, a in zip(params, grads, accumulators):
      # update accumulator
      new_a = self.rho * a + (1. - self.rho) * math_ops.square(g)
      self.updates.append(state_ops.assign(a, new_a))
      new_p = p - lr * g / (backend.sqrt(new_a) + self.epsilon)

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    config = {
        'lr': float(backend.get_value(self.lr)),
        'rho': float(backend.get_value(self.rho)),
        'decay': float(backend.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    base_config = super(RMSprop, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

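# Illustrative sketch (not part of the original TensorFlow source): the moving
# average of squared gradients maintained by `RMSprop.get_updates` above for a
# single scalar parameter, ignoring the optional learning-rate decay.
# `_rmsprop_step_example` is a hypothetical helper; the epsilon default stands
# in for `backend.epsilon()`.
def _rmsprop_step_example(p, g, a, lr=0.001, rho=0.9, epsilon=1e-7):
  """Returns (new_parameter, new_accumulator)."""
  new_a = rho * a + (1. - rho) * g * g           # moving average of g**2
  new_p = p - lr * g / (new_a ** 0.5 + epsilon)  # step scaled by RMS of grads
  return new_p, new_a
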

class Adagrad(Optimizer):
  """Adagrad optimizer.

  Adagrad is an optimizer with parameter-specific learning rates,
  which are adapted relative to how frequently a parameter gets
  updated during training. The more updates a parameter receives,
  the smaller the updates.

  It is recommended to leave the parameters of this optimizer
  at their default values.

  Args:
    lr: float >= 0. Initial learning rate.
    epsilon: float >= 0. If `None`, defaults to `backend.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.

  References:
    - [Adaptive Subgradient Methods for Online Learning and Stochastic
      Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
  """

  def __init__(self, lr=0.01, epsilon=None, decay=0., **kwargs):
    super(Adagrad, self).__init__(**kwargs)
    with backend.name_scope(self.__class__.__name__):
      self.lr = backend.variable(lr, name='lr')
      self.decay = backend.variable(decay, name='decay')
      self.iterations = backend.variable(0, dtype='int64', name='iterations')
    if epsilon is None:
      epsilon = backend.epsilon()
    self.epsilon = epsilon
    self.initial_decay = decay

  def _create_all_weights(self, params):
    shapes = [backend.int_shape(p) for p in params]
    accumulators = [backend.zeros(shape) for shape in shapes]
    self.weights = accumulators
    return accumulators

  def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    accumulators = self._create_all_weights(params)

    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations,
                                      backend.dtype(self.decay))))

    for p, g, a in zip(params, grads, accumulators):
      new_a = a + math_ops.square(g)  # update accumulator
      self.updates.append(state_ops.assign(a, new_a))
      new_p = p - lr * g / (backend.sqrt(new_a) + self.epsilon)

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    config = {
        'lr': float(backend.get_value(self.lr)),
        'decay': float(backend.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    base_config = super(Adagrad, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

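# Illustrative sketch (not part of the original TensorFlow source): Adagrad's
# ever-growing sum of squared gradients, mirroring `Adagrad.get_updates` above
# for one scalar parameter. `_adagrad_step_example` is a hypothetical helper;
# the epsilon default stands in for `backend.epsilon()`.
def _adagrad_step_example(p, g, a, lr=0.01, epsilon=1e-7):
  """Returns (new_parameter, new_accumulator)."""
  new_a = a + g * g                              # accumulator only ever grows
  new_p = p - lr * g / (new_a ** 0.5 + epsilon)  # so the effective step shrinks
  return new_p, new_a
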

class Adadelta(Optimizer):
  """Adadelta optimizer.

  Adadelta is a more robust extension of Adagrad
  that adapts learning rates based on a moving window of gradient updates,
  instead of accumulating all past gradients. This way, Adadelta continues
  learning even when many updates have been done. Compared to Adagrad, in the
  original version of Adadelta you don't have to set an initial learning
  rate. In this version, initial learning rate and decay factor can
  be set, as in most other Keras optimizers.

  It is recommended to leave the parameters of this optimizer
  at their default values.

  Args:
    lr: float >= 0. Initial learning rate, defaults to 1.
      It is recommended to leave it at the default value.
    rho: float >= 0. Adadelta decay factor, corresponding to fraction of
      gradient to keep at each time step.
    epsilon: float >= 0. Fuzz factor.
      If `None`, defaults to `backend.epsilon()`.
    decay: float >= 0. Initial learning rate decay.

  References:
    - [Adadelta - an adaptive learning rate
      method](http://arxiv.org/abs/1212.5701)
  """

  def __init__(self, lr=1.0, rho=0.95, epsilon=None, decay=0., **kwargs):
    super(Adadelta, self).__init__(**kwargs)
    with backend.name_scope(self.__class__.__name__):
      self.lr = backend.variable(lr, name='lr')
      self.decay = backend.variable(decay, name='decay')
      self.iterations = backend.variable(0, dtype='int64', name='iterations')
    if epsilon is None:
      epsilon = backend.epsilon()
    self.rho = rho
    self.epsilon = epsilon
    self.initial_decay = decay

  def _create_all_weights(self, params):
    shapes = [backend.int_shape(p) for p in params]
    accumulators = [backend.zeros(shape) for shape in shapes]
    delta_accumulators = [backend.zeros(shape) for shape in shapes]
    self.weights = accumulators + delta_accumulators
    return accumulators, delta_accumulators

  def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]
    accumulators, delta_accumulators = self._create_all_weights(params)

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations,
                                      backend.dtype(self.decay))))

    for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
      # update accumulator
      new_a = self.rho * a + (1. - self.rho) * math_ops.square(g)
      self.updates.append(state_ops.assign(a, new_a))

      # use the new accumulator and the *old* delta_accumulator
      update = g * backend.sqrt(d_a + self.epsilon) / backend.sqrt(
          new_a + self.epsilon)
      new_p = p - lr * update

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))

      # update delta_accumulator
      new_d_a = self.rho * d_a + (1 - self.rho) * math_ops.square(update)
      self.updates.append(state_ops.assign(d_a, new_d_a))
    return self.updates

  def get_config(self):
    config = {
        'lr': float(backend.get_value(self.lr)),
        'rho': self.rho,
        'decay': float(backend.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    base_config = super(Adadelta, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

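# Illustrative sketch (not part of the original TensorFlow source): the two
# moving averages kept by `Adadelta.get_updates` above (squared gradients and
# squared updates) for one scalar parameter. `_adadelta_step_example` is a
# hypothetical helper; the epsilon default stands in for `backend.epsilon()`.
def _adadelta_step_example(p, g, a, d_a, lr=1.0, rho=0.95, epsilon=1e-7):
  """Returns (new_parameter, new_accumulator, new_delta_accumulator)."""
  new_a = rho * a + (1. - rho) * g * g                # E[g^2]
  # The step uses the *old* delta accumulator, as in the loop above.
  update = g * (d_a + epsilon) ** 0.5 / (new_a + epsilon) ** 0.5
  new_p = p - lr * update
  new_d_a = rho * d_a + (1. - rho) * update * update  # E[dx^2]
  return new_p, new_a, new_d_a
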

class Adam(Optimizer):
  """Adam optimizer.

  Default parameters follow those provided in the original paper.

  Args:
    lr: float >= 0. Learning rate.
    beta_1: float, 0 < beta < 1. Generally close to 1.
    beta_2: float, 0 < beta < 1. Generally close to 1.
    epsilon: float >= 0. Fuzz factor.
      If `None`, defaults to `backend.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.
    amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm
      from the paper "On the Convergence of Adam and Beyond".
  """

  def __init__(self,
               lr=0.001,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               decay=0.,
               amsgrad=False,
               **kwargs):
    super(Adam, self).__init__(**kwargs)
    with backend.name_scope(self.__class__.__name__):
      self.iterations = backend.variable(0, dtype='int64', name='iterations')
      self.lr = backend.variable(lr, name='lr')
      self.beta_1 = backend.variable(beta_1, name='beta_1')
      self.beta_2 = backend.variable(beta_2, name='beta_2')
      self.decay = backend.variable(decay, name='decay')
    if epsilon is None:
      epsilon = backend.epsilon()
    self.epsilon = epsilon
    self.initial_decay = decay
    self.amsgrad = amsgrad

  def _create_all_weights(self, params):
    ms = [
        backend.zeros(backend.int_shape(p), dtype=backend.dtype(p))
        for p in params]
    vs = [
        backend.zeros(backend.int_shape(p), dtype=backend.dtype(p))
        for p in params]
    if self.amsgrad:
      vhats = [
          backend.zeros(backend.int_shape(p), dtype=backend.dtype(p))
          for p in params]
    else:
      vhats = [backend.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats
    return ms, vs, vhats

  def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations,
                                      backend.dtype(self.decay))))

    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
      t = math_ops.cast(self.iterations, backend.floatx())
    lr_t = lr * (
        backend.sqrt(1. - math_ops.pow(self.beta_2, t)) /
        (1. - math_ops.pow(self.beta_1, t)))

    ms, vs, vhats = self._create_all_weights(params)
    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
      m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
      v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g)
      if self.amsgrad:
        vhat_t = math_ops.maximum(vhat, v_t)
        p_t = p - lr_t * m_t / (backend.sqrt(vhat_t) + self.epsilon)
        self.updates.append(state_ops.assign(vhat, vhat_t))
      else:
        p_t = p - lr_t * m_t / (backend.sqrt(v_t) + self.epsilon)

      self.updates.append(state_ops.assign(m, m_t))
      self.updates.append(state_ops.assign(v, v_t))
      new_p = p_t

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    config = {
        'lr': float(backend.get_value(self.lr)),
        'beta_1': float(backend.get_value(self.beta_1)),
        'beta_2': float(backend.get_value(self.beta_2)),
        'decay': float(backend.get_value(self.decay)),
        'epsilon': self.epsilon,
        'amsgrad': self.amsgrad
    }
    base_config = super(Adam, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

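# Illustrative sketch (not part of the original TensorFlow source): Adam's
# bias-corrected update as computed by `Adam.get_updates` above (without
# amsgrad and without learning-rate decay), for one scalar parameter at step
# `t`, counting from 1. `_adam_step_example` is a hypothetical helper; the
# epsilon default stands in for `backend.epsilon()`.
def _adam_step_example(p, g, m, v, t, lr=0.001, beta_1=0.9, beta_2=0.999,
                       epsilon=1e-7):
  """Returns (new_parameter, new_m, new_v)."""
  lr_t = lr * ((1. - beta_2 ** t) ** 0.5) / (1. - beta_1 ** t)  # bias correction
  m_t = beta_1 * m + (1. - beta_1) * g        # first moment (mean of g)
  v_t = beta_2 * v + (1. - beta_2) * g * g    # second moment (mean of g**2)
  new_p = p - lr_t * m_t / (v_t ** 0.5 + epsilon)
  return new_p, m_t, v_t
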

class Adamax(Optimizer):
  """Adamax optimizer from Section 7 of the Adam paper.

  It is a variant of Adam based on the infinity norm.
  Default parameters follow those provided in the paper.

  Args:
    lr: float >= 0. Learning rate.
    beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
    epsilon: float >= 0. Fuzz factor.
      If `None`, defaults to `backend.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.
  """

  def __init__(self,
               lr=0.002,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               decay=0.,
               **kwargs):
    super(Adamax, self).__init__(**kwargs)
    with backend.name_scope(self.__class__.__name__):
      self.iterations = backend.variable(0, dtype='int64', name='iterations')
      self.lr = backend.variable(lr, name='lr')
      self.beta_1 = backend.variable(beta_1, name='beta_1')
      self.beta_2 = backend.variable(beta_2, name='beta_2')
      self.decay = backend.variable(decay, name='decay')
    if epsilon is None:
      epsilon = backend.epsilon()
    self.epsilon = epsilon
    self.initial_decay = decay

  def _create_all_weights(self, params):
    shapes = [backend.int_shape(p) for p in params]
    # zero init of 1st moment
    ms = [backend.zeros(shape) for shape in shapes]
    # zero init of exponentially weighted infinity norm
    us = [backend.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + us
    return ms, us

  def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations,
                                      backend.dtype(self.decay))))

    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
      t = math_ops.cast(self.iterations, backend.floatx())
    lr_t = lr / (1. - math_ops.pow(self.beta_1, t))

    ms, us = self._create_all_weights(params)

    for p, g, m, u in zip(params, grads, ms, us):
      m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
      u_t = math_ops.maximum(self.beta_2 * u, math_ops.abs(g))
      p_t = p - lr_t * m_t / (u_t + self.epsilon)

      self.updates.append(state_ops.assign(m, m_t))
      self.updates.append(state_ops.assign(u, u_t))
      new_p = p_t

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    config = {
        'lr': float(backend.get_value(self.lr)),
        'beta_1': float(backend.get_value(self.beta_1)),
        'beta_2': float(backend.get_value(self.beta_2)),
        'decay': float(backend.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    base_config = super(Adamax, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

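# Illustrative sketch (not part of the original TensorFlow source): Adamax
# replaces Adam's second moment with an exponentially weighted infinity norm,
# as in `Adamax.get_updates` above. `_adamax_step_example` is a hypothetical
# helper; `t` counts from 1 and the epsilon default stands in for
# `backend.epsilon()`.
def _adamax_step_example(p, g, m, u, t, lr=0.002, beta_1=0.9, beta_2=0.999,
                         epsilon=1e-7):
  """Returns (new_parameter, new_m, new_u)."""
  lr_t = lr / (1. - beta_1 ** t)       # only the first moment is bias-corrected
  m_t = beta_1 * m + (1. - beta_1) * g
  u_t = max(beta_2 * u, abs(g))        # infinity-norm accumulator
  new_p = p - lr_t * m_t / (u_t + epsilon)
  return new_p, m_t, u_t
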

class Nadam(Optimizer):
  """Nesterov Adam optimizer.

  Much like Adam is essentially RMSprop with momentum,
  Nadam is Adam with Nesterov momentum.

  Default parameters follow those provided in the paper.
  It is recommended to leave the parameters of this optimizer
  at their default values.

  Args:
    lr: float >= 0. Learning rate.
    beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
    epsilon: float >= 0. Fuzz factor.
      If `None`, defaults to `backend.epsilon()`.
  """

  def __init__(self,
               lr=0.002,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               schedule_decay=0.004,
               **kwargs):
    super(Nadam, self).__init__(**kwargs)
    with backend.name_scope(self.__class__.__name__):
      self.iterations = backend.variable(0, dtype='int64', name='iterations')
      self.m_schedule = backend.variable(1., name='m_schedule')
      self.lr = backend.variable(lr, name='lr')
      self.beta_1 = backend.variable(beta_1, name='beta_1')
      self.beta_2 = backend.variable(beta_2, name='beta_2')
    if epsilon is None:
      epsilon = backend.epsilon()
    self.epsilon = epsilon
    self.schedule_decay = schedule_decay

  def _create_all_weights(self, params):
    shapes = [backend.int_shape(p) for p in params]
    ms = [backend.zeros(shape) for shape in shapes]
    vs = [backend.zeros(shape) for shape in shapes]

    self.weights = [self.iterations, self.m_schedule] + ms + vs
    return ms, vs

  def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = []

    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
      t = math_ops.cast(self.iterations, backend.floatx())

    # Due to the recommendations in [2], i.e. warming momentum schedule
    momentum_cache_t = self.beta_1 * (
        1. - 0.5 *
        (math_ops.pow(backend.cast_to_floatx(0.96), t * self.schedule_decay)))
    momentum_cache_t_1 = self.beta_1 * (
        1. - 0.5 *
        (math_ops.pow(backend.cast_to_floatx(0.96),
                      (t + 1) * self.schedule_decay)))
    m_schedule_new = self.m_schedule * momentum_cache_t
    m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
    self.updates.append((self.m_schedule, m_schedule_new))

    ms, vs = self._create_all_weights(params)

    for p, g, m, v in zip(params, grads, ms, vs):
      # the following equations given in [1]
      g_prime = g / (1. - m_schedule_new)
      m_t = self.beta_1 * m + (1. - self.beta_1) * g
      m_t_prime = m_t / (1. - m_schedule_next)
      v_t = self.beta_2 * v + (1. - self.beta_2) * math_ops.square(g)
      v_t_prime = v_t / (1. - math_ops.pow(self.beta_2, t))
      m_t_bar = (1. -
                 momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime

      self.updates.append(state_ops.assign(m, m_t))
      self.updates.append(state_ops.assign(v, v_t))

      p_t = p - self.lr * m_t_bar / (backend.sqrt(v_t_prime) + self.epsilon)
      new_p = p_t

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    config = {
        'lr': float(backend.get_value(self.lr)),
        'beta_1': float(backend.get_value(self.beta_1)),
        'beta_2': float(backend.get_value(self.beta_2)),
        'epsilon': self.epsilon,
        'schedule_decay': self.schedule_decay
    }
    base_config = super(Nadam, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

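# Illustrative sketch (not part of the original TensorFlow source): Nadam's
# warming momentum schedule and its Nesterov-style mix of the gradient and the
# first moment, mirroring `Nadam.get_updates` above for one scalar parameter.
# `_nadam_step_example` is a hypothetical helper; `m_schedule` is the running
# product of the momentum cache (initially 1), `t` counts from 1, and the
# epsilon default stands in for `backend.epsilon()`.
def _nadam_step_example(p, g, m, v, t, m_schedule, lr=0.002, beta_1=0.9,
                        beta_2=0.999, epsilon=1e-7, schedule_decay=0.004):
  """Returns (new_parameter, new_m, new_v, new_m_schedule)."""
  cache_t = beta_1 * (1. - 0.5 * 0.96 ** (t * schedule_decay))
  cache_t_1 = beta_1 * (1. - 0.5 * 0.96 ** ((t + 1) * schedule_decay))
  m_schedule_new = m_schedule * cache_t
  m_schedule_next = m_schedule * cache_t * cache_t_1

  g_prime = g / (1. - m_schedule_new)
  m_t = beta_1 * m + (1. - beta_1) * g
  m_t_prime = m_t / (1. - m_schedule_next)
  v_t = beta_2 * v + (1. - beta_2) * g * g
  v_t_prime = v_t / (1. - beta_2 ** t)
  m_t_bar = (1. - cache_t) * g_prime + cache_t_1 * m_t_prime

  new_p = p - lr * m_t_bar / (v_t_prime ** 0.5 + epsilon)
  return new_p, m_t, v_t, m_schedule_new
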

class TFOptimizer(Optimizer, trackable.Trackable):
  """Wrapper class for native TensorFlow optimizers."""

  def __init__(self, optimizer, iterations=None):  # pylint: disable=super-init-not-called
    self.optimizer = optimizer
    self._track_trackable(optimizer, name='optimizer')
    if iterations is None:
      with backend.name_scope(self.__class__.__name__):
        self.iterations = backend.variable(0, dtype='int64', name='iterations')
    else:
      self.iterations = iterations
    self._track_trackable(self.iterations, name='global_step')

  def _clip_gradients(self, grads):
    """Clip gradients according to the clipnorm and clipvalue attributes."""
    # TFOptimizer wrapper has no gradient clipping options.
    return grads

  def minimize(self, loss, var_list, grad_loss=None, tape=None):
    """Mimics the `OptimizerV2.minimize` API."""
    if not callable(loss) and tape is None:
      raise ValueError('`tape` is required when a `Tensor` loss is passed.')
    tape = tape if tape is not None else backprop.GradientTape()

    if callable(loss):
      with tape:
        if not callable(var_list):
          tape.watch(var_list)
        loss = loss()
        if callable(var_list):
          var_list = var_list()

    var_list = nest.flatten(var_list)
    if var_list:
      grads = tape.gradient(loss, var_list, grad_loss)
      grads_and_vars = list(zip(grads, var_list))
      self.apply_gradients(grads_and_vars)

  def apply_gradients(self, grads_and_vars):
    self.optimizer.apply_gradients(grads_and_vars, global_step=self.iterations)

  def get_grads(self, loss, params):
    return self.optimizer.compute_gradients(loss, params)

  def get_updates(self, loss, params):
    if distribute_lib.has_strategy():
      self.updates = []

      if not params:
        # After the model vars have been created, the second call to
        # get_updates is called with params as an empty list. This ensures
        # that we call compute_gradients with params=None.
        grads = self.optimizer.compute_gradients(loss)
      else:
        grads = self.optimizer.compute_gradients(loss, params)
      global_step = training_util.get_global_step()
      opt_update = self.optimizer.apply_gradients(grads, global_step)
    else:
      if not params:
        self.updates = [state_ops.assign_add(self.iterations, 1)]
        return self.updates

      # Updates list starts out empty because the iterations variable is
      # incremented in optimizer.apply_gradients()
      self.updates = []
      grads = self.optimizer.compute_gradients(loss, params)
      opt_update = self.optimizer.apply_gradients(
          grads, global_step=self.iterations)

    self.updates.append(opt_update)
    return self.updates

  @property
  def weights(self):
    raise NotImplementedError

  def get_config(self):
    raise NotImplementedError

  def from_config(self, config):
    raise NotImplementedError

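# Illustrative usage sketch (not part of the original TensorFlow source), kept
# as comments because it needs a full TensorFlow runtime. In older v1-style
# Keras workflows a native `tf.compat.v1.train` optimizer passed to
# `model.compile` is typically wrapped in a `TFOptimizer`; the wrapper can
# also be constructed directly. `model` below is a hypothetical, already-built
# `tf.compat.v1.keras` model.
#
#   import tensorflow as tf
#
#   native_opt = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)
#   wrapped = TFOptimizer(native_opt)  # tracks `iterations` as the global step
#   model.compile(optimizer=wrapped, loss='mse')
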

# Aliases.

sgd = SGD
rmsprop = RMSprop
adagrad = Adagrad
adadelta = Adadelta
adam = Adam
adamax = Adamax
nadam = Nadam
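
# Illustrative sketch (not part of the original TensorFlow source): the
# `get_config` / `from_config` round trip shared by every optimizer above,
# kept as comments because constructing an optimizer creates backend
# variables and therefore needs a working TensorFlow backend.
#
#   opt = SGD(lr=0.01, momentum=0.9, nesterov=True, clipnorm=1.0)
#   config = opt.get_config()   # includes 'lr', 'momentum', ..., 'clipnorm'
#   restored = SGD.from_config(config)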