Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/keras/src/optimizers/optimizer_v1.py: 17%
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
17"""Legacy v1 optimizer classes.
19For more examples see the base class `tf.compat.v1.keras.optimizers.Optimizer`.
20"""
22import tensorflow.compat.v2 as tf
24from keras.src import backend


class Optimizer:
    """Abstract optimizer base class.

    Note: this is the parent class of all optimizers, not an actual optimizer
    that can be used for training models.

    All Keras optimizers support the following keyword arguments:

        clipnorm: float >= 0. Gradients will be clipped
            when their L2 norm exceeds this value.
        clipvalue: float >= 0. Gradients will be clipped
            when their absolute value exceeds this value.
    """

    def __init__(self, **kwargs):
        allowed_kwargs = {"clipnorm", "clipvalue"}
        for k in kwargs:
            if k not in allowed_kwargs:
                raise TypeError(
                    "Unexpected keyword argument passed to optimizer: " + str(k)
                )
            # checks that clipnorm >= 0 and clipvalue >= 0
            if kwargs[k] < 0:
                raise ValueError(f"Expected {k} >= 0, received: {kwargs[k]}")
        self.__dict__.update(kwargs)
        self.updates = []
        self.weights = []

    # Set this to False, indicating `apply_gradients` does not take the
    # `experimental_aggregate_gradients` argument.
    _HAS_AGGREGATE_GRAD = False

    def _create_all_weights(self, params):
        """Creates and sets all optimizer weights.

        Args:
            params: list or tuple of `Variable` objects that will be minimized
                using this optimizer.

        Returns:
            Specific weight values that are used in `get_updates`.
        """
        raise NotImplementedError

    def get_updates(self, loss, params):
        raise NotImplementedError

    def get_gradients(self, loss, params):
        """Returns gradients of `loss` with respect to `params`.

        Args:
            loss: Loss tensor.
            params: List of variables.

        Returns:
            List of gradient tensors.

        Raises:
            ValueError: In case any gradient cannot be computed (e.g. if
                gradient function not implemented).
        """
        grads = backend.gradients(loss, params)
        if any(g is None for g in grads):
            raise ValueError(
                "An operation has `None` for gradient. "
                "Please make sure that all of your ops have a "
                "gradient defined (i.e. are differentiable). "
                "Common ops without gradient: "
                "backend.argmax, backend.round, backend.eval."
            )
        if hasattr(self, "clipnorm"):
            grads = [tf.clip_by_norm(g, self.clipnorm) for g in grads]
        if hasattr(self, "clipvalue"):
            grads = [
                tf.clip_by_value(g, -self.clipvalue, self.clipvalue)
                for g in grads
            ]
        return grads

    def set_weights(self, weights):
        """Sets the weights of the optimizer, from Numpy arrays.

        Should only be called after computing the gradients
        (otherwise the optimizer has no weights).

        Args:
            weights: a list of Numpy arrays. The number of arrays and their
                shapes must match the number and the shapes of the weights of
                the optimizer (i.e. it should match the output of
                `get_weights`).

        Raises:
            ValueError: in case of incompatible weight shapes.
        """
        params = self.weights
        if len(params) != len(weights):
            raise ValueError(
                "Length of the specified weight list ("
                + str(len(weights))
                + ") does not match the number of weights of the optimizer ("
                + str(len(params))
                + ")"
            )
        weight_value_tuples = []
        param_values = backend.batch_get_value(params)
        for pv, p, w in zip(param_values, params, weights):
            if pv.shape != w.shape:
                raise ValueError(
                    "Optimizer weight shape "
                    + str(pv.shape)
                    + " not compatible with provided weight shape "
                    + str(w.shape)
                )
            weight_value_tuples.append((p, w))
        backend.batch_set_value(weight_value_tuples)

    def get_weights(self):
        """Returns the current value of the weights of the optimizer.

        Returns:
            A list of numpy arrays.
        """
        return backend.batch_get_value(self.weights)

    def get_config(self):
        config = {}
        if hasattr(self, "clipnorm"):
            config["clipnorm"] = self.clipnorm
        if hasattr(self, "clipvalue"):
            config["clipvalue"] = self.clipvalue
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)
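

# Gradient-clipping sketch (illustrative only; `loss` and `params` below are
# assumed placeholders from a TF1-style graph and are not defined in this
# module):
#
#     opt = SGD(lr=0.01, clipnorm=1.0)          # kwargs validated by Optimizer.__init__
#     grads = opt.get_gradients(loss, params)   # each gradient clipped to L2 norm <= 1.0
#     updates = opt.get_updates(loss, params)   # list of tf.compat.v1.assign ops to run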


class SGD(Optimizer):
    """Stochastic gradient descent optimizer.

    Includes support for momentum,
    learning rate decay, and Nesterov momentum.

    Args:
        lr: float >= 0. Learning rate.
        momentum: float >= 0. Parameter that accelerates SGD in the relevant
            direction and dampens oscillations.
        decay: float >= 0. Learning rate decay over each update.
        nesterov: boolean. Whether to apply Nesterov momentum.
    """

    def __init__(
        self, lr=0.01, momentum=0.0, decay=0.0, nesterov=False, **kwargs
    ):
        super().__init__(**kwargs)
        with backend.name_scope(self.__class__.__name__):
            self.iterations = backend.variable(
                0, dtype="int64", name="iterations"
            )
            self.lr = backend.variable(lr, name="lr")
            self.momentum = backend.variable(momentum, name="momentum")
            self.decay = backend.variable(decay, name="decay")
        self.initial_decay = decay
        self.nesterov = nesterov

    def _create_all_weights(self, params):
        shapes = [backend.int_shape(p) for p in params]
        moments = [backend.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + moments
        return moments

    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [tf.compat.v1.assign_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (
                1.0
                / (
                    1.0
                    + self.decay
                    * tf.cast(self.iterations, backend.dtype(self.decay))
                )
            )
        # momentum
        moments = self._create_all_weights(params)
        for p, g, m in zip(params, grads, moments):
            v = self.momentum * m - lr * g  # velocity
            self.updates.append(tf.compat.v1.assign(m, v))

            if self.nesterov:
                new_p = p + self.momentum * v - lr * g
            else:
                new_p = p + v

            # Apply constraints.
            if getattr(p, "constraint", None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(tf.compat.v1.assign(p, new_p))
        return self.updates

    def get_config(self):
        config = {
            "lr": float(backend.get_value(self.lr)),
            "momentum": float(backend.get_value(self.momentum)),
            "decay": float(backend.get_value(self.decay)),
            "nesterov": self.nesterov,
        }
        base_config = super().get_config()
        return dict(list(base_config.items()) + list(config.items()))
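

# SGD update rule sketch (plain and Nesterov momentum, matching the loop
# above; `t` is the iteration count and `g` the gradient of a parameter `p`):
#
#     lr_t = lr / (1 + decay * t)
#     v    = momentum * m - lr_t * g            # new velocity, stored in m
#     p    = p + v                              # plain momentum
#     p    = p + momentum * v - lr_t * g        # Nesterov variant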


class RMSprop(Optimizer):
    """RMSProp optimizer.

    It is recommended to leave the parameters of this optimizer
    at their default values
    (except the learning rate, which can be freely tuned).

    Args:
        lr: float >= 0. Learning rate.
        rho: float >= 0.
        epsilon: float >= 0. Fuzz factor.
            If `None`, defaults to `backend.epsilon()`.
        decay: float >= 0. Learning rate decay over each update.
    """

    def __init__(self, lr=0.001, rho=0.9, epsilon=None, decay=0.0, **kwargs):
        super().__init__(**kwargs)
        with backend.name_scope(self.__class__.__name__):
            self.lr = backend.variable(lr, name="lr")
            self.rho = backend.variable(rho, name="rho")
            self.decay = backend.variable(decay, name="decay")
            self.iterations = backend.variable(
                0, dtype="int64", name="iterations"
            )
        if epsilon is None:
            epsilon = backend.epsilon()
        self.epsilon = epsilon
        self.initial_decay = decay

    def _create_all_weights(self, params):
        accumulators = [
            backend.zeros(backend.int_shape(p), dtype=backend.dtype(p))
            for p in params
        ]
        self.weights = accumulators
        return accumulators

    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        accumulators = self._create_all_weights(params)
        self.updates = [tf.compat.v1.assign_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (
                1.0
                / (
                    1.0
                    + self.decay
                    * tf.cast(self.iterations, backend.dtype(self.decay))
                )
            )

        for p, g, a in zip(params, grads, accumulators):
            # update accumulator
            new_a = self.rho * a + (1.0 - self.rho) * tf.square(g)
            self.updates.append(tf.compat.v1.assign(a, new_a))
            new_p = p - lr * g / (backend.sqrt(new_a) + self.epsilon)

            # Apply constraints.
            if getattr(p, "constraint", None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(tf.compat.v1.assign(p, new_p))
        return self.updates

    def get_config(self):
        config = {
            "lr": float(backend.get_value(self.lr)),
            "rho": float(backend.get_value(self.rho)),
            "decay": float(backend.get_value(self.decay)),
            "epsilon": self.epsilon,
        }
        base_config = super().get_config()
        return dict(list(base_config.items()) + list(config.items()))
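

# RMSprop update rule sketch (matching the loop above): a per-parameter
# accumulator `a` tracks a moving average of squared gradients and scales
# the step:
#
#     a = rho * a + (1 - rho) * g**2
#     p = p - lr * g / (sqrt(a) + epsilon)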


class Adagrad(Optimizer):
    """Adagrad optimizer.

    Adagrad is an optimizer with parameter-specific learning rates,
    which are adapted relative to how frequently a parameter gets
    updated during training. The more updates a parameter receives,
    the smaller the updates.

    It is recommended to leave the parameters of this optimizer
    at their default values.

    Args:
        lr: float >= 0. Initial learning rate.
        epsilon: float >= 0. If `None`, defaults to `backend.epsilon()`.
        decay: float >= 0. Learning rate decay over each update.

    References:
        - [Adaptive Subgradient Methods for Online Learning and Stochastic
          Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
    """

    def __init__(self, lr=0.01, epsilon=None, decay=0.0, **kwargs):
        super().__init__(**kwargs)
        with backend.name_scope(self.__class__.__name__):
            self.lr = backend.variable(lr, name="lr")
            self.decay = backend.variable(decay, name="decay")
            self.iterations = backend.variable(
                0, dtype="int64", name="iterations"
            )
        if epsilon is None:
            epsilon = backend.epsilon()
        self.epsilon = epsilon
        self.initial_decay = decay

    def _create_all_weights(self, params):
        shapes = [backend.int_shape(p) for p in params]
        accumulators = [backend.zeros(shape) for shape in shapes]
        self.weights = accumulators
        return accumulators

    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        accumulators = self._create_all_weights(params)

        self.updates = [tf.compat.v1.assign_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (
                1.0
                / (
                    1.0
                    + self.decay
                    * tf.cast(self.iterations, backend.dtype(self.decay))
                )
            )

        for p, g, a in zip(params, grads, accumulators):
            new_a = a + tf.square(g)  # update accumulator
            self.updates.append(tf.compat.v1.assign(a, new_a))
            new_p = p - lr * g / (backend.sqrt(new_a) + self.epsilon)

            # Apply constraints.
            if getattr(p, "constraint", None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(tf.compat.v1.assign(p, new_p))
        return self.updates

    def get_config(self):
        config = {
            "lr": float(backend.get_value(self.lr)),
            "decay": float(backend.get_value(self.decay)),
            "epsilon": self.epsilon,
        }
        base_config = super().get_config()
        return dict(list(base_config.items()) + list(config.items()))
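

# Adagrad update rule sketch (matching the loop above): the accumulator sums
# all past squared gradients, so per-parameter step sizes only shrink over
# time:
#
#     a = a + g**2
#     p = p - lr * g / (sqrt(a) + epsilon)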


class Adadelta(Optimizer):
    """Adadelta optimizer.

    Adadelta is a more robust extension of Adagrad
    that adapts learning rates based on a moving window of gradient updates,
    instead of accumulating all past gradients. This way, Adadelta continues
    learning even when many updates have been done. Compared to Adagrad, in the
    original version of Adadelta you don't have to set an initial learning
    rate. In this version, initial learning rate and decay factor can
    be set, as in most other Keras optimizers.

    It is recommended to leave the parameters of this optimizer
    at their default values.

    Args:
        lr: float >= 0. Initial learning rate, defaults to 1.
            It is recommended to leave it at the default value.
        rho: float >= 0. Adadelta decay factor, corresponding to the fraction
            of gradient to keep at each time step.
        epsilon: float >= 0. Fuzz factor.
            If `None`, defaults to `backend.epsilon()`.
        decay: float >= 0. Initial learning rate decay.

    References:
        - [Adadelta - an adaptive learning rate
          method](http://arxiv.org/abs/1212.5701)
    """

    def __init__(self, lr=1.0, rho=0.95, epsilon=None, decay=0.0, **kwargs):
        super().__init__(**kwargs)
        with backend.name_scope(self.__class__.__name__):
            self.lr = backend.variable(lr, name="lr")
            self.decay = backend.variable(decay, name="decay")
            self.iterations = backend.variable(
                0, dtype="int64", name="iterations"
            )
        if epsilon is None:
            epsilon = backend.epsilon()
        self.rho = rho
        self.epsilon = epsilon
        self.initial_decay = decay

    def _create_all_weights(self, params):
        shapes = [backend.int_shape(p) for p in params]
        accumulators = [backend.zeros(shape) for shape in shapes]
        delta_accumulators = [backend.zeros(shape) for shape in shapes]
        self.weights = accumulators + delta_accumulators
        return accumulators, delta_accumulators

    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [tf.compat.v1.assign_add(self.iterations, 1)]
        accumulators, delta_accumulators = self._create_all_weights(params)

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (
                1.0
                / (
                    1.0
                    + self.decay
                    * tf.cast(self.iterations, backend.dtype(self.decay))
                )
            )

        for p, g, a, d_a in zip(
            params, grads, accumulators, delta_accumulators
        ):
            # update accumulator
            new_a = self.rho * a + (1.0 - self.rho) * tf.square(g)
            self.updates.append(tf.compat.v1.assign(a, new_a))

            # use the new accumulator and the *old* delta_accumulator
            update = (
                g
                * backend.sqrt(d_a + self.epsilon)
                / backend.sqrt(new_a + self.epsilon)
            )
            new_p = p - lr * update

            # Apply constraints.
            if getattr(p, "constraint", None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(tf.compat.v1.assign(p, new_p))

            # update delta_accumulator
            new_d_a = self.rho * d_a + (1 - self.rho) * tf.square(update)
            self.updates.append(tf.compat.v1.assign(d_a, new_d_a))
        return self.updates

    def get_config(self):
        config = {
            "lr": float(backend.get_value(self.lr)),
            "rho": self.rho,
            "decay": float(backend.get_value(self.decay)),
            "epsilon": self.epsilon,
        }
        base_config = super().get_config()
        return dict(list(base_config.items()) + list(config.items()))
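

# Adadelta update rule sketch (matching the loop above): one accumulator for
# squared gradients (`a`) and one for squared parameter updates (`d_a`):
#
#     a      = rho * a + (1 - rho) * g**2
#     update = g * sqrt(d_a + epsilon) / sqrt(a + epsilon)
#     p      = p - lr * update
#     d_a    = rho * d_a + (1 - rho) * update**2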


class Adam(Optimizer):
    """Adam optimizer.

    Default parameters follow those provided in the original paper.

    Args:
        lr: float >= 0. Learning rate.
        beta_1: float, 0 < beta < 1. Generally close to 1.
        beta_2: float, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.
            If `None`, defaults to `backend.epsilon()`.
        decay: float >= 0. Learning rate decay over each update.
        amsgrad: boolean. Whether to apply the AMSGrad variant of this
            algorithm from the paper "On the Convergence of Adam and Beyond".
    """

    def __init__(
        self,
        lr=0.001,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=None,
        decay=0.0,
        amsgrad=False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        with backend.name_scope(self.__class__.__name__):
            self.iterations = backend.variable(
                0, dtype="int64", name="iterations"
            )
            self.lr = backend.variable(lr, name="lr")
            self.beta_1 = backend.variable(beta_1, name="beta_1")
            self.beta_2 = backend.variable(beta_2, name="beta_2")
            self.decay = backend.variable(decay, name="decay")
        if epsilon is None:
            epsilon = backend.epsilon()
        self.epsilon = epsilon
        self.initial_decay = decay
        self.amsgrad = amsgrad

    def _create_all_weights(self, params):
        ms = [
            backend.zeros(backend.int_shape(p), dtype=backend.dtype(p))
            for p in params
        ]
        vs = [
            backend.zeros(backend.int_shape(p), dtype=backend.dtype(p))
            for p in params
        ]
        if self.amsgrad:
            vhats = [
                backend.zeros(backend.int_shape(p), dtype=backend.dtype(p))
                for p in params
            ]
        else:
            vhats = [backend.zeros(1) for _ in params]
        self.weights = [self.iterations] + ms + vs + vhats
        return ms, vs, vhats

    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = []

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (
                1.0
                / (
                    1.0
                    + self.decay
                    * tf.cast(self.iterations, backend.dtype(self.decay))
                )
            )

        with tf.control_dependencies(
            [tf.compat.v1.assign_add(self.iterations, 1)]
        ):
            t = tf.cast(self.iterations, backend.floatx())
        lr_t = lr * (
            backend.sqrt(1.0 - tf.pow(self.beta_2, t))
            / (1.0 - tf.pow(self.beta_1, t))
        )

        ms, vs, vhats = self._create_all_weights(params)
        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            m_t = (self.beta_1 * m) + (1.0 - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1.0 - self.beta_2) * tf.square(g)
            if self.amsgrad:
                vhat_t = tf.maximum(vhat, v_t)
                p_t = p - lr_t * m_t / (backend.sqrt(vhat_t) + self.epsilon)
                self.updates.append(tf.compat.v1.assign(vhat, vhat_t))
            else:
                p_t = p - lr_t * m_t / (backend.sqrt(v_t) + self.epsilon)

            self.updates.append(tf.compat.v1.assign(m, m_t))
            self.updates.append(tf.compat.v1.assign(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, "constraint", None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(tf.compat.v1.assign(p, new_p))
        return self.updates

    def get_config(self):
        config = {
            "lr": float(backend.get_value(self.lr)),
            "beta_1": float(backend.get_value(self.beta_1)),
            "beta_2": float(backend.get_value(self.beta_2)),
            "decay": float(backend.get_value(self.decay)),
            "epsilon": self.epsilon,
            "amsgrad": self.amsgrad,
        }
        base_config = super().get_config()
        return dict(list(base_config.items()) + list(config.items()))
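

# Adam update rule sketch (matching the loop above): biased first/second
# moment estimates with the bias correction folded into the step size:
#
#     m    = beta_1 * m + (1 - beta_1) * g
#     v    = beta_2 * v + (1 - beta_2) * g**2
#     lr_t = lr * sqrt(1 - beta_2**t) / (1 - beta_1**t)
#     p    = p - lr_t * m / (sqrt(v) + epsilon)    # sqrt(max(vhat, v)) with amsgrad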


class Adamax(Optimizer):
    """Adamax optimizer from Section 7 of the Adam paper.

    It is a variant of Adam based on the infinity norm.
    Default parameters follow those provided in the paper.

    Args:
        lr: float >= 0. Learning rate.
        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.
            If `None`, defaults to `backend.epsilon()`.
        decay: float >= 0. Learning rate decay over each update.
    """

    def __init__(
        self,
        lr=0.002,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=None,
        decay=0.0,
        **kwargs,
    ):
        super().__init__(**kwargs)
        with backend.name_scope(self.__class__.__name__):
            self.iterations = backend.variable(
                0, dtype="int64", name="iterations"
            )
            self.lr = backend.variable(lr, name="lr")
            self.beta_1 = backend.variable(beta_1, name="beta_1")
            self.beta_2 = backend.variable(beta_2, name="beta_2")
            self.decay = backend.variable(decay, name="decay")
        if epsilon is None:
            epsilon = backend.epsilon()
        self.epsilon = epsilon
        self.initial_decay = decay

    def _create_all_weights(self, params):
        shapes = [backend.int_shape(p) for p in params]
        # zero init of 1st moment
        ms = [backend.zeros(shape) for shape in shapes]
        # zero init of exponentially weighted infinity norm
        us = [backend.zeros(shape) for shape in shapes]
        self.weights = [self.iterations] + ms + us
        return ms, us

    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = []

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (
                1.0
                / (
                    1.0
                    + self.decay
                    * tf.cast(self.iterations, backend.dtype(self.decay))
                )
            )

        with tf.control_dependencies(
            [tf.compat.v1.assign_add(self.iterations, 1)]
        ):
            t = tf.cast(self.iterations, backend.floatx())
        lr_t = lr / (1.0 - tf.pow(self.beta_1, t))

        ms, us = self._create_all_weights(params)

        for p, g, m, u in zip(params, grads, ms, us):
            m_t = (self.beta_1 * m) + (1.0 - self.beta_1) * g
            u_t = tf.maximum(self.beta_2 * u, tf.abs(g))
            p_t = p - lr_t * m_t / (u_t + self.epsilon)

            self.updates.append(tf.compat.v1.assign(m, m_t))
            self.updates.append(tf.compat.v1.assign(u, u_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, "constraint", None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(tf.compat.v1.assign(p, new_p))
        return self.updates

    def get_config(self):
        config = {
            "lr": float(backend.get_value(self.lr)),
            "beta_1": float(backend.get_value(self.beta_1)),
            "beta_2": float(backend.get_value(self.beta_2)),
            "decay": float(backend.get_value(self.decay)),
            "epsilon": self.epsilon,
        }
        base_config = super().get_config()
        return dict(list(base_config.items()) + list(config.items()))
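

# Adamax update rule sketch (matching the loop above): the second moment is
# replaced by an exponentially weighted infinity norm `u`:
#
#     m = beta_1 * m + (1 - beta_1) * g
#     u = max(beta_2 * u, abs(g))
#     p = p - (lr / (1 - beta_1**t)) * m / (u + epsilon)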


class Nadam(Optimizer):
    """Nesterov Adam optimizer.

    Much like Adam is essentially RMSprop with momentum,
    Nadam is Adam with Nesterov momentum.

    Default parameters follow those provided in the paper.
    It is recommended to leave the parameters of this optimizer
    at their default values.

    Args:
        lr: float >= 0. Learning rate.
        beta_1/beta_2: floats, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor.
            If `None`, defaults to `backend.epsilon()`.
        schedule_decay: float. Controls the warm-up of the momentum schedule
            (defaults to 0.004).
    """

    def __init__(
        self,
        lr=0.002,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=None,
        schedule_decay=0.004,
        **kwargs,
    ):
        super().__init__(**kwargs)
        with backend.name_scope(self.__class__.__name__):
            self.iterations = backend.variable(
                0, dtype="int64", name="iterations"
            )
            self.m_schedule = backend.variable(1.0, name="m_schedule")
            self.lr = backend.variable(lr, name="lr")
            self.beta_1 = backend.variable(beta_1, name="beta_1")
            self.beta_2 = backend.variable(beta_2, name="beta_2")
        if epsilon is None:
            epsilon = backend.epsilon()
        self.epsilon = epsilon
        self.schedule_decay = schedule_decay

    def _create_all_weights(self, params):
        shapes = [backend.int_shape(p) for p in params]
        ms = [backend.zeros(shape) for shape in shapes]
        vs = [backend.zeros(shape) for shape in shapes]

        self.weights = [self.iterations, self.m_schedule] + ms + vs
        return ms, vs

    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = []

        with tf.control_dependencies(
            [tf.compat.v1.assign_add(self.iterations, 1)]
        ):
            t = tf.cast(self.iterations, backend.floatx())

        # Due to the recommendations in [2], i.e. warming momentum schedule
        momentum_cache_t = self.beta_1 * (
            1.0
            - 0.5
            * (tf.pow(backend.cast_to_floatx(0.96), t * self.schedule_decay))
        )
        momentum_cache_t_1 = self.beta_1 * (
            1.0
            - 0.5
            * (
                tf.pow(
                    backend.cast_to_floatx(0.96), (t + 1) * self.schedule_decay
                )
            )
        )
        m_schedule_new = self.m_schedule * momentum_cache_t
        m_schedule_next = (
            self.m_schedule * momentum_cache_t * momentum_cache_t_1
        )
        self.updates.append((self.m_schedule, m_schedule_new))

        ms, vs = self._create_all_weights(params)

        for p, g, m, v in zip(params, grads, ms, vs):
            # the following equations given in [1]
            g_prime = g / (1.0 - m_schedule_new)
            m_t = self.beta_1 * m + (1.0 - self.beta_1) * g
            m_t_prime = m_t / (1.0 - m_schedule_next)
            v_t = self.beta_2 * v + (1.0 - self.beta_2) * tf.square(g)
            v_t_prime = v_t / (1.0 - tf.pow(self.beta_2, t))
            m_t_bar = (
                1.0 - momentum_cache_t
            ) * g_prime + momentum_cache_t_1 * m_t_prime

            self.updates.append(tf.compat.v1.assign(m, m_t))
            self.updates.append(tf.compat.v1.assign(v, v_t))

            p_t = p - self.lr * m_t_bar / (
                backend.sqrt(v_t_prime) + self.epsilon
            )
            new_p = p_t

            # Apply constraints.
            if getattr(p, "constraint", None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(tf.compat.v1.assign(p, new_p))
        return self.updates

    def get_config(self):
        config = {
            "lr": float(backend.get_value(self.lr)),
            "beta_1": float(backend.get_value(self.beta_1)),
            "beta_2": float(backend.get_value(self.beta_2)),
            "epsilon": self.epsilon,
            "schedule_decay": self.schedule_decay,
        }
        base_config = super().get_config()
        return dict(list(base_config.items()) + list(config.items()))
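

# Nadam update rule sketch (matching the loop above): Adam-style moments
# combined with a warming momentum schedule
# mu_t = beta_1 * (1 - 0.5 * 0.96**(t * schedule_decay)), where prod(mu_1..t)
# is the running product stored in `m_schedule`:
#
#     m_bar = (1 - mu_t) * g / (1 - prod(mu_1..t))
#             + mu_{t+1} * m / (1 - prod(mu_1..t+1))
#     p     = p - lr * m_bar / (sqrt(v / (1 - beta_2**t)) + epsilon)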


class TFOptimizer(Optimizer, tf.__internal__.tracking.Trackable):
    """Wrapper class for native TensorFlow optimizers."""

    def __init__(self, optimizer, iterations=None):
        self.optimizer = optimizer
        self._track_trackable(optimizer, name="optimizer")
        if iterations is None:
            with backend.name_scope(self.__class__.__name__):
                self.iterations = backend.variable(
                    0, dtype="int64", name="iterations"
                )
        else:
            self.iterations = iterations
        self._track_trackable(self.iterations, name="global_step")

    def _clip_gradients(self, grads):
        """Clip gradients according to the clipnorm and clipvalue attributes."""
        # TFOptimizer wrapper has no gradient clipping options.
        return grads

    def minimize(self, loss, var_list, grad_loss=None, tape=None):
        """Mimics the `OptimizerV2.minimize` API."""
        if not callable(loss) and tape is None:
            raise ValueError(
                "`tape` is required when a `Tensor` loss is passed."
            )
        tape = tape if tape is not None else tf.GradientTape()

        if callable(loss):
            with tape:
                if not callable(var_list):
                    tape.watch(var_list)
                loss = loss()
                if callable(var_list):
                    var_list = var_list()

        var_list = tf.nest.flatten(var_list)
        if var_list:
            grads = tape.gradient(loss, var_list, grad_loss)
            grads_and_vars = list(zip(grads, var_list))
            self.apply_gradients(grads_and_vars)

    def apply_gradients(self, grads_and_vars):
        self.optimizer.apply_gradients(
            grads_and_vars, global_step=self.iterations
        )

    def get_grads(self, loss, params):
        return self.optimizer.compute_gradients(loss, params)

    def get_updates(self, loss, params):
        if tf.distribute.has_strategy():
            self.updates = []

            if not params:
                # After the model vars have been created, the second call to
                # get_updates is called with params as an empty list. This
                # ensures that we call compute_gradients with params=None.
                grads = self.optimizer.compute_gradients(loss)
            else:
                grads = self.optimizer.compute_gradients(loss, params)
            global_step = tf.compat.v1.train.get_global_step()
            opt_update = self.optimizer.apply_gradients(grads, global_step)
        else:
            if not params:
                self.updates = [tf.compat.v1.assign_add(self.iterations, 1)]
                return self.updates

            # Updates list starts out empty because the iterations variable is
            # incremented in optimizer.apply_gradients()
            self.updates = []
            grads = self.optimizer.compute_gradients(loss, params)
            opt_update = self.optimizer.apply_gradients(
                grads, global_step=self.iterations
            )

        self.updates.append(opt_update)
        return self.updates

    @property
    def weights(self):
        raise NotImplementedError

    def get_config(self):
        raise NotImplementedError

    def from_config(self, config):
        raise NotImplementedError
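

# TFOptimizer usage sketch (illustrative; `model` and `loss_fn` are assumed to
# exist elsewhere): wrap a native tf.compat.v1 optimizer so Keras can drive it.
#
#     base = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.01)
#     opt = TFOptimizer(base)
#     opt.minimize(lambda: loss_fn(), var_list=model.trainable_weights)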


# Aliases.

sgd = SGD
rmsprop = RMSprop
adagrad = Adagrad
adadelta = Adadelta
adam = Adam
adamax = Adamax
nadam = Nadam