Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/keras/optimizer_v1.py: 18% of 410 statements
coverage.py v7.4.0, created at 2024-01-03 07:57 +0000

# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# pylint: disable=invalid-name
# pylint: disable=g-classes-have-attributes
"""Legacy v1 optimizer classes.

For more examples see the base class `tf.compat.v1.keras.optimizers.Optimizer`.
"""

from tensorflow.python.distribute import distribute_lib
from tensorflow.python.eager import backprop
from tensorflow.python.framework import ops
from tensorflow.python.keras import backend
from tensorflow.python.ops import clip_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.trackable import base as trackable
from tensorflow.python.training import training_util
from tensorflow.python.util import nest


class Optimizer(object):
  """Abstract optimizer base class.

  Note: this is the parent class of all optimizers, not an actual optimizer
  that can be used for training models.

  All Keras optimizers support the following keyword arguments:

      clipnorm: float >= 0. Gradients will be clipped
          when their L2 norm exceeds this value.
      clipvalue: float >= 0. Gradients will be clipped
          when their absolute value exceeds this value.
  """

  def __init__(self, **kwargs):
    allowed_kwargs = {'clipnorm', 'clipvalue'}
    for k in kwargs:
      if k not in allowed_kwargs:
        raise TypeError('Unexpected keyword argument '
                        'passed to optimizer: ' + str(k))
      # checks that clipnorm >= 0 and clipvalue >= 0
      if kwargs[k] < 0:
        raise ValueError('Expected {} >= 0, received: {}'.format(k, kwargs[k]))
    self.__dict__.update(kwargs)
    self.updates = []
    self.weights = []

  # Set this to False, indicating `apply_gradients` does not take the
  # `experimental_aggregate_gradients` argument.
  _HAS_AGGREGATE_GRAD = False

  def _create_all_weights(self, params):
    """Creates and sets all optimizer weights.

    Args:
      params: list or tuple of `Variable` objects to be updated by this
        optimizer.

    Returns:
      Specific weight values that are used in `get_updates`.
    """
    raise NotImplementedError

  def get_updates(self, loss, params):
    raise NotImplementedError

  def get_gradients(self, loss, params):
    """Returns gradients of `loss` with respect to `params`.

    Args:
      loss: Loss tensor.
      params: List of variables.

    Returns:
      List of gradient tensors.

    Raises:
      ValueError: In case any gradient cannot be computed (e.g. if gradient
        function not implemented).
    """
    grads = backend.gradients(loss, params)
    if any(g is None for g in grads):
      raise ValueError('An operation has `None` for gradient. '
                       'Please make sure that all of your ops have a '
                       'gradient defined (i.e. are differentiable). '
                       'Common ops without gradient: '
                       'backend.argmax, backend.round, backend.eval.')
    if hasattr(self, 'clipnorm'):
      grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
    if hasattr(self, 'clipvalue'):
      grads = [
          clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
          for g in grads
      ]
    return grads

  def set_weights(self, weights):
    """Sets the weights of the optimizer, from Numpy arrays.

    Should only be called after computing the gradients
    (otherwise the optimizer has no weights).

    Args:
      weights: a list of Numpy arrays. The number of arrays and their shapes
        must match the number and shapes of the optimizer's weights
        (i.e. it should match the output of `get_weights`).

    Raises:
      ValueError: in case of incompatible weight shapes.
    """
    params = self.weights
    if len(params) != len(weights):
      raise ValueError('Length of the specified weight list (' +
                       str(len(weights)) +
                       ') does not match the number of weights '
                       'of the optimizer (' + str(len(params)) + ')')
    weight_value_tuples = []
    param_values = backend.batch_get_value(params)
    for pv, p, w in zip(param_values, params, weights):
      if pv.shape != w.shape:
        raise ValueError('Optimizer weight shape ' + str(pv.shape) +
                         ' not compatible with '
                         'provided weight shape ' + str(w.shape))
      weight_value_tuples.append((p, w))
    backend.batch_set_value(weight_value_tuples)

  def get_weights(self):
    """Returns the current value of the weights of the optimizer.

    Returns:
      A list of numpy arrays.
    """
    return backend.batch_get_value(self.weights)

  def get_config(self):
    config = {}
    if hasattr(self, 'clipnorm'):
      config['clipnorm'] = self.clipnorm
    if hasattr(self, 'clipvalue'):
      config['clipvalue'] = self.clipvalue
    return config

  @classmethod
  def from_config(cls, config):
    return cls(**config)
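

# --- Illustrative sketch (not part of the original module) ------------------
# The `clipnorm`/`clipvalue` kwargs accepted by `Optimizer.__init__` drive the
# two `hasattr` checks in `get_gradients` above. The helper below is a
# hypothetical, pure-Python rendering of that clipping logic for one gradient
# given as a list of floats; the real code uses `clip_ops.clip_by_norm` and
# `clip_ops.clip_by_value` on tensors.
def _clip_gradient_example(grad, clipnorm=None, clipvalue=None):
  """Clips a list of floats the way `Optimizer.get_gradients` clips tensors."""
  import math
  if clipnorm is not None:
    norm = math.sqrt(sum(g * g for g in grad))
    if norm > clipnorm:
      grad = [g * clipnorm / norm for g in grad]
  if clipvalue is not None:
    grad = [max(-clipvalue, min(clipvalue, g)) for g in grad]
  return grad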


class SGD(Optimizer):
  """Stochastic gradient descent optimizer.

  Includes support for momentum,
  learning rate decay, and Nesterov momentum.

  Args:
    lr: float >= 0. Learning rate.
    momentum: float >= 0. Parameter that accelerates SGD in the relevant
      direction and dampens oscillations.
    decay: float >= 0. Learning rate decay over each update.
    nesterov: boolean. Whether to apply Nesterov momentum.
  """

  def __init__(self, lr=0.01, momentum=0., decay=0., nesterov=False, **kwargs):
    super(SGD, self).__init__(**kwargs)
    with backend.name_scope(self.__class__.__name__):
      self.iterations = backend.variable(0, dtype='int64', name='iterations')
      self.lr = backend.variable(lr, name='lr')
      self.momentum = backend.variable(momentum, name='momentum')
      self.decay = backend.variable(decay, name='decay')
    self.initial_decay = decay
    self.nesterov = nesterov

  def _create_all_weights(self, params):
    shapes = [backend.int_shape(p) for p in params]
    moments = [backend.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + moments
    return moments

  def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations,
                                      backend.dtype(self.decay))))
    # momentum
    moments = self._create_all_weights(params)
    for p, g, m in zip(params, grads, moments):
      v = self.momentum * m - lr * g  # velocity
      self.updates.append(state_ops.assign(m, v))

      if self.nesterov:
        new_p = p + self.momentum * v - lr * g
      else:
        new_p = p + v

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    config = {
        'lr': float(backend.get_value(self.lr)),
        'momentum': float(backend.get_value(self.momentum)),
        'decay': float(backend.get_value(self.decay)),
        'nesterov': self.nesterov
    }
    base_config = super(SGD, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
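

# --- Illustrative sketch (not part of the original module) ------------------
# A hypothetical scalar walkthrough of one SGD step as computed in
# `SGD.get_updates` above: decayed learning rate, velocity update, and the
# optional Nesterov lookahead. Plain floats stand in for variables/tensors.
def _sgd_step_example(p, g, m, lr=0.01, momentum=0.9, decay=0.,
                      iterations=0, nesterov=False):
  """Returns (new_param, new_moment) for a single scalar parameter."""
  if decay > 0:
    lr = lr * (1. / (1. + decay * iterations))
  v = momentum * m - lr * g  # velocity (stored back into the moment slot)
  new_p = (p + momentum * v - lr * g) if nesterov else (p + v)
  return new_p, v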


class RMSprop(Optimizer):
  """RMSProp optimizer.

  It is recommended to leave the parameters of this optimizer
  at their default values
  (except the learning rate, which can be freely tuned).

  Args:
    lr: float >= 0. Learning rate.
    rho: float >= 0.
    epsilon: float >= 0. Fuzz factor.
      If `None`, defaults to `backend.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.
  """

  def __init__(self, lr=0.001, rho=0.9, epsilon=None, decay=0., **kwargs):
    super(RMSprop, self).__init__(**kwargs)
    with backend.name_scope(self.__class__.__name__):
      self.lr = backend.variable(lr, name='lr')
      self.rho = backend.variable(rho, name='rho')
      self.decay = backend.variable(decay, name='decay')
      self.iterations = backend.variable(0, dtype='int64', name='iterations')
    if epsilon is None:
      epsilon = backend.epsilon()
    self.epsilon = epsilon
    self.initial_decay = decay

  def _create_all_weights(self, params):
    accumulators = [
        backend.zeros(backend.int_shape(p), dtype=backend.dtype(p))
        for p in params]
    self.weights = accumulators
    return accumulators

  def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    accumulators = self._create_all_weights(params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations,
                                      backend.dtype(self.decay))))

    for p, g, a in zip(params, grads, accumulators):
      # update accumulator
      new_a = self.rho * a + (1. - self.rho) * math_ops.square(g)
      self.updates.append(state_ops.assign(a, new_a))
      new_p = p - lr * g / (backend.sqrt(new_a) + self.epsilon)

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    config = {
        'lr': float(backend.get_value(self.lr)),
        'rho': float(backend.get_value(self.rho)),
        'decay': float(backend.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    base_config = super(RMSprop, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
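

# --- Illustrative sketch (not part of the original module) ------------------
# A hypothetical scalar version of the RMSprop update computed in
# `RMSprop.get_updates`: an exponential moving average of squared gradients
# scales each step. Plain floats stand in for variables/tensors.
def _rmsprop_step_example(p, g, a, lr=0.001, rho=0.9, epsilon=1e-7):
  """Returns (new_param, new_accumulator) for a single scalar parameter."""
  new_a = rho * a + (1. - rho) * g * g  # running mean of g^2
  new_p = p - lr * g / ((new_a ** 0.5) + epsilon)
  return new_p, new_a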


class Adagrad(Optimizer):
  """Adagrad optimizer.

  Adagrad is an optimizer with parameter-specific learning rates,
  which are adapted relative to how frequently a parameter gets
  updated during training. The more updates a parameter receives,
  the smaller the updates.

  It is recommended to leave the parameters of this optimizer
  at their default values.

  Args:
    lr: float >= 0. Initial learning rate.
    epsilon: float >= 0. If `None`, defaults to `backend.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.

  References:
    - [Adaptive Subgradient Methods for Online Learning and Stochastic
      Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
  """

  def __init__(self, lr=0.01, epsilon=None, decay=0., **kwargs):
    super(Adagrad, self).__init__(**kwargs)
    with backend.name_scope(self.__class__.__name__):
      self.lr = backend.variable(lr, name='lr')
      self.decay = backend.variable(decay, name='decay')
      self.iterations = backend.variable(0, dtype='int64', name='iterations')
    if epsilon is None:
      epsilon = backend.epsilon()
    self.epsilon = epsilon
    self.initial_decay = decay

  def _create_all_weights(self, params):
    shapes = [backend.int_shape(p) for p in params]
    accumulators = [backend.zeros(shape) for shape in shapes]
    self.weights = accumulators
    return accumulators

  def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    accumulators = self._create_all_weights(params)

    self.updates = [state_ops.assign_add(self.iterations, 1)]

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations,
                                      backend.dtype(self.decay))))

    for p, g, a in zip(params, grads, accumulators):
      new_a = a + math_ops.square(g)  # update accumulator
      self.updates.append(state_ops.assign(a, new_a))
      new_p = p - lr * g / (backend.sqrt(new_a) + self.epsilon)

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    config = {
        'lr': float(backend.get_value(self.lr)),
        'decay': float(backend.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    base_config = super(Adagrad, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
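

# --- Illustrative sketch (not part of the original module) ------------------
# A hypothetical scalar version of the Adagrad update in
# `Adagrad.get_updates`: the accumulator sums squared gradients forever, so
# the effective step size of a frequently-updated parameter keeps shrinking.
def _adagrad_step_example(p, g, a, lr=0.01, epsilon=1e-7):
  """Returns (new_param, new_accumulator) for a single scalar parameter."""
  new_a = a + g * g  # lifetime sum of g^2
  new_p = p - lr * g / ((new_a ** 0.5) + epsilon)
  return new_p, new_a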


class Adadelta(Optimizer):
  """Adadelta optimizer.

  Adadelta is a more robust extension of Adagrad
  that adapts learning rates based on a moving window of gradient updates,
  instead of accumulating all past gradients. This way, Adadelta continues
  learning even when many updates have been done. Compared to Adagrad, in the
  original version of Adadelta you don't have to set an initial learning
  rate. In this version, initial learning rate and decay factor can
  be set, as in most other Keras optimizers.

  It is recommended to leave the parameters of this optimizer
  at their default values.

  Args:
    lr: float >= 0. Initial learning rate, defaults to 1.
      It is recommended to leave it at the default value.
    rho: float >= 0. Adadelta decay factor, corresponding to fraction of
      gradient to keep at each time step.
    epsilon: float >= 0. Fuzz factor.
      If `None`, defaults to `backend.epsilon()`.
    decay: float >= 0. Initial learning rate decay.

  References:
    - [Adadelta - an adaptive learning rate
      method](http://arxiv.org/abs/1212.5701)
  """

  def __init__(self, lr=1.0, rho=0.95, epsilon=None, decay=0., **kwargs):
    super(Adadelta, self).__init__(**kwargs)
    with backend.name_scope(self.__class__.__name__):
      self.lr = backend.variable(lr, name='lr')
      self.decay = backend.variable(decay, name='decay')
      self.iterations = backend.variable(0, dtype='int64', name='iterations')
    if epsilon is None:
      epsilon = backend.epsilon()
    self.rho = rho
    self.epsilon = epsilon
    self.initial_decay = decay

  def _create_all_weights(self, params):
    shapes = [backend.int_shape(p) for p in params]
    accumulators = [backend.zeros(shape) for shape in shapes]
    delta_accumulators = [backend.zeros(shape) for shape in shapes]
    self.weights = accumulators + delta_accumulators
    return accumulators, delta_accumulators

  def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = [state_ops.assign_add(self.iterations, 1)]
    accumulators, delta_accumulators = self._create_all_weights(params)

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations,
                                      backend.dtype(self.decay))))

    for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators):
      # update accumulator
      new_a = self.rho * a + (1. - self.rho) * math_ops.square(g)
      self.updates.append(state_ops.assign(a, new_a))

      # use the new accumulator and the *old* delta_accumulator
      update = g * backend.sqrt(d_a + self.epsilon) / backend.sqrt(
          new_a + self.epsilon)
      new_p = p - lr * update

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))

      # update delta_accumulator
      new_d_a = self.rho * d_a + (1 - self.rho) * math_ops.square(update)
      self.updates.append(state_ops.assign(d_a, new_d_a))
    return self.updates

  def get_config(self):
    config = {
        'lr': float(backend.get_value(self.lr)),
        'rho': self.rho,
        'decay': float(backend.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    base_config = super(Adadelta, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
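

# --- Illustrative sketch (not part of the original module) ------------------
# A hypothetical scalar version of the Adadelta update in
# `Adadelta.get_updates`: the step is rescaled by the ratio of RMS[delta] to
# RMS[gradient], using the *old* delta accumulator, exactly as in the loop
# above. Plain floats stand in for variables/tensors.
def _adadelta_step_example(p, g, a, d_a, lr=1.0, rho=0.95, epsilon=1e-7):
  """Returns (new_param, new_accumulator, new_delta_accumulator)."""
  new_a = rho * a + (1. - rho) * g * g  # running mean of squared gradients
  update = g * ((d_a + epsilon) ** 0.5) / ((new_a + epsilon) ** 0.5)
  new_p = p - lr * update
  new_d_a = rho * d_a + (1. - rho) * update * update  # running mean of updates
  return new_p, new_a, new_d_a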


class Adam(Optimizer):
  """Adam optimizer.

  Default parameters follow those provided in the original paper.

  Args:
    lr: float >= 0. Learning rate.
    beta_1: float, 0 < beta < 1. Generally close to 1.
    beta_2: float, 0 < beta < 1. Generally close to 1.
    epsilon: float >= 0. Fuzz factor.
      If `None`, defaults to `backend.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.
    amsgrad: boolean. Whether to apply the AMSGrad variant of this algorithm
      from the paper "On the Convergence of Adam and Beyond".
  """

  def __init__(self,
               lr=0.001,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               decay=0.,
               amsgrad=False,
               **kwargs):
    super(Adam, self).__init__(**kwargs)
    with backend.name_scope(self.__class__.__name__):
      self.iterations = backend.variable(0, dtype='int64', name='iterations')
      self.lr = backend.variable(lr, name='lr')
      self.beta_1 = backend.variable(beta_1, name='beta_1')
      self.beta_2 = backend.variable(beta_2, name='beta_2')
      self.decay = backend.variable(decay, name='decay')
    if epsilon is None:
      epsilon = backend.epsilon()
    self.epsilon = epsilon
    self.initial_decay = decay
    self.amsgrad = amsgrad

  def _create_all_weights(self, params):
    ms = [
        backend.zeros(backend.int_shape(p), dtype=backend.dtype(p))
        for p in params]
    vs = [
        backend.zeros(backend.int_shape(p), dtype=backend.dtype(p))
        for p in params]
    if self.amsgrad:
      vhats = [
          backend.zeros(backend.int_shape(p), dtype=backend.dtype(p))
          for p in params]
    else:
      vhats = [backend.zeros(1) for _ in params]
    self.weights = [self.iterations] + ms + vs + vhats
    return ms, vs, vhats

  def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations,
                                      backend.dtype(self.decay))))

    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
      t = math_ops.cast(self.iterations, backend.floatx())
    lr_t = lr * (
        backend.sqrt(1. - math_ops.pow(self.beta_2, t)) /
        (1. - math_ops.pow(self.beta_1, t)))

    ms, vs, vhats = self._create_all_weights(params)
    for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
      m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
      v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g)
      if self.amsgrad:
        vhat_t = math_ops.maximum(vhat, v_t)
        p_t = p - lr_t * m_t / (backend.sqrt(vhat_t) + self.epsilon)
        self.updates.append(state_ops.assign(vhat, vhat_t))
      else:
        p_t = p - lr_t * m_t / (backend.sqrt(v_t) + self.epsilon)

      self.updates.append(state_ops.assign(m, m_t))
      self.updates.append(state_ops.assign(v, v_t))
      new_p = p_t

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    config = {
        'lr': float(backend.get_value(self.lr)),
        'beta_1': float(backend.get_value(self.beta_1)),
        'beta_2': float(backend.get_value(self.beta_2)),
        'decay': float(backend.get_value(self.decay)),
        'epsilon': self.epsilon,
        'amsgrad': self.amsgrad
    }
    base_config = super(Adam, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
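

# --- Illustrative sketch (not part of the original module) ------------------
# A hypothetical scalar version of the Adam update in `Adam.get_updates`:
# bias-corrected step size lr_t, first/second moment updates, and the
# optional AMSGrad max on the second moment. Plain floats stand in for
# variables/tensors.
def _adam_step_example(p, g, m, v, t, lr=0.001, beta_1=0.9, beta_2=0.999,
                       epsilon=1e-7, vhat=0., amsgrad=False):
  """Returns (new_param, m_t, v_t, vhat_t) after the t-th update (t >= 1)."""
  lr_t = lr * ((1. - beta_2 ** t) ** 0.5) / (1. - beta_1 ** t)
  m_t = beta_1 * m + (1. - beta_1) * g
  v_t = beta_2 * v + (1. - beta_2) * g * g
  vhat_t = max(vhat, v_t) if amsgrad else v_t
  new_p = p - lr_t * m_t / ((vhat_t ** 0.5) + epsilon)
  return new_p, m_t, v_t, vhat_t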


class Adamax(Optimizer):
  """Adamax optimizer from Adam paper's Section 7.

  It is a variant of Adam based on the infinity norm.
  Default parameters follow those provided in the paper.

  Args:
    lr: float >= 0. Learning rate.
    beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
    epsilon: float >= 0. Fuzz factor.
      If `None`, defaults to `backend.epsilon()`.
    decay: float >= 0. Learning rate decay over each update.
  """

  def __init__(self,
               lr=0.002,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               decay=0.,
               **kwargs):
    super(Adamax, self).__init__(**kwargs)
    with backend.name_scope(self.__class__.__name__):
      self.iterations = backend.variable(0, dtype='int64', name='iterations')
      self.lr = backend.variable(lr, name='lr')
      self.beta_1 = backend.variable(beta_1, name='beta_1')
      self.beta_2 = backend.variable(beta_2, name='beta_2')
      self.decay = backend.variable(decay, name='decay')
    if epsilon is None:
      epsilon = backend.epsilon()
    self.epsilon = epsilon
    self.initial_decay = decay

  def _create_all_weights(self, params):
    shapes = [backend.int_shape(p) for p in params]
    # zero init of 1st moment
    ms = [backend.zeros(shape) for shape in shapes]
    # zero init of exponentially weighted infinity norm
    us = [backend.zeros(shape) for shape in shapes]
    self.weights = [self.iterations] + ms + us
    return ms, us

  def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = []

    lr = self.lr
    if self.initial_decay > 0:
      lr = lr * (
          1. /
          (1. +
           self.decay * math_ops.cast(self.iterations,
                                      backend.dtype(self.decay))))

    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
      t = math_ops.cast(self.iterations, backend.floatx())
    lr_t = lr / (1. - math_ops.pow(self.beta_1, t))

    ms, us = self._create_all_weights(params)

    for p, g, m, u in zip(params, grads, ms, us):
      m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
      u_t = math_ops.maximum(self.beta_2 * u, math_ops.abs(g))
      p_t = p - lr_t * m_t / (u_t + self.epsilon)

      self.updates.append(state_ops.assign(m, m_t))
      self.updates.append(state_ops.assign(u, u_t))
      new_p = p_t

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    config = {
        'lr': float(backend.get_value(self.lr)),
        'beta_1': float(backend.get_value(self.beta_1)),
        'beta_2': float(backend.get_value(self.beta_2)),
        'decay': float(backend.get_value(self.decay)),
        'epsilon': self.epsilon
    }
    base_config = super(Adamax, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
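

# --- Illustrative sketch (not part of the original module) ------------------
# A hypothetical scalar version of the Adamax update in `Adamax.get_updates`:
# Adam's second moment is replaced by an exponentially weighted infinity
# norm (a decayed running max), so only the first moment needs bias
# correction. Plain floats stand in for variables/tensors.
def _adamax_step_example(p, g, m, u, t, lr=0.002, beta_1=0.9, beta_2=0.999,
                         epsilon=1e-7):
  """Returns (new_param, m_t, u_t) after the t-th update (t >= 1)."""
  lr_t = lr / (1. - beta_1 ** t)
  m_t = beta_1 * m + (1. - beta_1) * g
  u_t = max(beta_2 * u, abs(g))  # exponentially weighted infinity norm
  new_p = p - lr_t * m_t / (u_t + epsilon)
  return new_p, m_t, u_t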


class Nadam(Optimizer):
  """Nesterov Adam optimizer.

  Much like Adam is essentially RMSprop with momentum,
  Nadam is Adam with Nesterov momentum.

  Default parameters follow those provided in the paper.
  It is recommended to leave the parameters of this optimizer
  at their default values.

  Args:
    lr: float >= 0. Learning rate.
    beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
    epsilon: float >= 0. Fuzz factor.
      If `None`, defaults to `backend.epsilon()`.
    schedule_decay: float. Decay rate for the warming momentum schedule
      (see `get_updates`).
  """

  def __init__(self,
               lr=0.002,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=None,
               schedule_decay=0.004,
               **kwargs):
    super(Nadam, self).__init__(**kwargs)
    with backend.name_scope(self.__class__.__name__):
      self.iterations = backend.variable(0, dtype='int64', name='iterations')
      self.m_schedule = backend.variable(1., name='m_schedule')
      self.lr = backend.variable(lr, name='lr')
      self.beta_1 = backend.variable(beta_1, name='beta_1')
      self.beta_2 = backend.variable(beta_2, name='beta_2')
    if epsilon is None:
      epsilon = backend.epsilon()
    self.epsilon = epsilon
    self.schedule_decay = schedule_decay

  def _create_all_weights(self, params):
    shapes = [backend.int_shape(p) for p in params]
    ms = [backend.zeros(shape) for shape in shapes]
    vs = [backend.zeros(shape) for shape in shapes]

    self.weights = [self.iterations, self.m_schedule] + ms + vs
    return ms, vs

  def get_updates(self, loss, params):
    grads = self.get_gradients(loss, params)
    self.updates = []

    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
      t = math_ops.cast(self.iterations, backend.floatx())

    # Due to the recommendations in [2], i.e. warming momentum schedule
    momentum_cache_t = self.beta_1 * (
        1. - 0.5 *
        (math_ops.pow(backend.cast_to_floatx(0.96), t * self.schedule_decay)))
    momentum_cache_t_1 = self.beta_1 * (
        1. - 0.5 *
        (math_ops.pow(backend.cast_to_floatx(0.96),
                      (t + 1) * self.schedule_decay)))
    m_schedule_new = self.m_schedule * momentum_cache_t
    m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
    self.updates.append((self.m_schedule, m_schedule_new))

    ms, vs = self._create_all_weights(params)

    for p, g, m, v in zip(params, grads, ms, vs):
      # the following equations given in [1]
      g_prime = g / (1. - m_schedule_new)
      m_t = self.beta_1 * m + (1. - self.beta_1) * g
      m_t_prime = m_t / (1. - m_schedule_next)
      v_t = self.beta_2 * v + (1. - self.beta_2) * math_ops.square(g)
      v_t_prime = v_t / (1. - math_ops.pow(self.beta_2, t))
      m_t_bar = (1. -
                 momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime

      self.updates.append(state_ops.assign(m, m_t))
      self.updates.append(state_ops.assign(v, v_t))

      p_t = p - self.lr * m_t_bar / (backend.sqrt(v_t_prime) + self.epsilon)
      new_p = p_t

      # Apply constraints.
      if getattr(p, 'constraint', None) is not None:
        new_p = p.constraint(new_p)

      self.updates.append(state_ops.assign(p, new_p))
    return self.updates

  def get_config(self):
    config = {
        'lr': float(backend.get_value(self.lr)),
        'beta_1': float(backend.get_value(self.beta_1)),
        'beta_2': float(backend.get_value(self.beta_2)),
        'epsilon': self.epsilon,
        'schedule_decay': self.schedule_decay
    }
    base_config = super(Nadam, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))
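

# --- Illustrative sketch (not part of the original module) ------------------
# A hypothetical scalar helper for the warming momentum schedule used in
# `Nadam.get_updates`: mu_t = beta_1 * (1 - 0.5 * 0.96**(t * schedule_decay)),
# and `m_schedule` is the running product of mu_1..mu_t. Plain floats stand
# in for variables/tensors.
def _nadam_momentum_cache_example(t, beta_1=0.9, schedule_decay=0.004):
  """Returns the momentum cache mu_t for step t (t >= 1)."""
  return beta_1 * (1. - 0.5 * (0.96 ** (t * schedule_decay)))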


class TFOptimizer(Optimizer, trackable.Trackable):
  """Wrapper class for native TensorFlow optimizers."""

  def __init__(self, optimizer, iterations=None):  # pylint: disable=super-init-not-called
    self.optimizer = optimizer
    self._track_trackable(optimizer, name='optimizer')
    if iterations is None:
      with backend.name_scope(self.__class__.__name__):
        self.iterations = backend.variable(0, dtype='int64', name='iterations')
    else:
      self.iterations = iterations
    self._track_trackable(self.iterations, name='global_step')

  def _clip_gradients(self, grads):
    """Clip gradients according to the clipnorm and clipvalue attributes."""
    # TFOptimizer wrapper has no gradient clipping options.
    return grads

  def minimize(self, loss, var_list, grad_loss=None, tape=None):
    """Mimics the `OptimizerV2.minimize` API."""
    if not callable(loss) and tape is None:
      raise ValueError('`tape` is required when a `Tensor` loss is passed.')
    tape = tape if tape is not None else backprop.GradientTape()

    if callable(loss):
      with tape:
        if not callable(var_list):
          tape.watch(var_list)
        loss = loss()
        if callable(var_list):
          var_list = var_list()

    var_list = nest.flatten(var_list)
    if var_list:
      grads = tape.gradient(loss, var_list, grad_loss)
      grads_and_vars = list(zip(grads, var_list))
      self.apply_gradients(grads_and_vars)

  def apply_gradients(self, grads_and_vars):
    self.optimizer.apply_gradients(grads_and_vars, global_step=self.iterations)

  def get_grads(self, loss, params):
    return self.optimizer.compute_gradients(loss, params)

  def get_updates(self, loss, params):
    if distribute_lib.has_strategy():
      self.updates = []

      if not params:
        # After the model vars have been created, the second call to
        # get_updates is called with params as an empty list. This ensures
        # that we call compute_gradients with params=None.
        grads = self.optimizer.compute_gradients(loss)
      else:
        grads = self.optimizer.compute_gradients(loss, params)
      global_step = training_util.get_global_step()
      opt_update = self.optimizer.apply_gradients(grads, global_step)
    else:
      if not params:
        self.updates = [state_ops.assign_add(self.iterations, 1)]
        return self.updates

      # Updates list starts out empty because the iterations variable is
      # incremented in optimizer.apply_gradients()
      self.updates = []
      grads = self.optimizer.compute_gradients(loss, params)
      opt_update = self.optimizer.apply_gradients(
          grads, global_step=self.iterations)

    self.updates.append(opt_update)
    return self.updates

  @property
  def weights(self):
    raise NotImplementedError

  def get_config(self):
    raise NotImplementedError

  def from_config(self, config):
    raise NotImplementedError
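

# --- Illustrative sketch (not part of the original module) ------------------
# A hypothetical usage of `TFOptimizer`: wrap a native TF1-style optimizer so
# it exposes the Keras `apply_gradients` interface. This assumes the
# `GradientDescentOptimizer` from `tensorflow.python.training` is available
# in the installed TensorFlow build; `grads_and_vars` is a list of
# (gradient, variable) pairs supplied by the caller.
def _tf_optimizer_usage_example(grads_and_vars, learning_rate=0.1):
  """Wraps a native TF1-style optimizer and applies one gradient update."""
  from tensorflow.python.training import gradient_descent
  native = gradient_descent.GradientDescentOptimizer(learning_rate)
  wrapped = TFOptimizer(native)
  wrapped.apply_gradients(grads_and_vars)
  return wrapped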


# Aliases.

sgd = SGD
rmsprop = RMSprop
adagrad = Adagrad
adadelta = Adadelta
adam = Adam
adamax = Adamax
nadam = Nadam