Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/keras/optimizer_v2/adam.py: 22% of 152 statements (coverage.py v7.4.0, created at 2024-01-03 07:57 +0000)
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Adam optimizer implementation."""
# pylint: disable=g-classes-have-attributes

from tensorflow.python.eager import def_function
from tensorflow.python.framework import indexed_slices
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_conversion
from tensorflow.python.keras import backend_config
from tensorflow.python.keras.optimizer_v2 import optimizer_v2
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.training import gen_training_ops
from tensorflow.python.util.tf_export import keras_export


@keras_export('keras.optimizers.Adam')
class Adam(optimizer_v2.OptimizerV2):
  r"""Optimizer that implements the Adam algorithm.

  Adam optimization is a stochastic gradient descent method that is based on
  adaptive estimation of first-order and second-order moments.

  According to
  [Kingma et al., 2014](http://arxiv.org/abs/1412.6980),
  the method is "*computationally
  efficient, has little memory requirement, invariant to diagonal rescaling of
  gradients, and is well suited for problems that are large in terms of
  data/parameters*".

  Args:
    learning_rate: A `Tensor`, floating point value, or a schedule that is a
      `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
      that takes no arguments and returns the actual value to use. The
      learning rate. Defaults to 0.001.
    beta_1: A float value or a constant float tensor, or a callable
      that takes no arguments and returns the actual value to use. The
      exponential decay rate for the 1st moment estimates. Defaults to 0.9.
    beta_2: A float value or a constant float tensor, or a callable
      that takes no arguments and returns the actual value to use. The
      exponential decay rate for the 2nd moment estimates. Defaults to 0.999.
    epsilon: A small constant for numerical stability. This epsilon is
      "epsilon hat" in the Kingma and Ba paper (in the formula just before
      Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
      1e-7.
    amsgrad: Boolean. Whether to apply the AMSGrad variant of this algorithm
      from the paper "On the Convergence of Adam and Beyond". Defaults to
      `False`.
    name: Optional name for the operations created when applying gradients.
      Defaults to `"Adam"`.
    **kwargs: Keyword arguments. Allowed to be one of
      `"clipnorm"` or `"clipvalue"`.
      `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
      gradients by value.

  Usage:

  >>> opt = tf.keras.optimizers.Adam(learning_rate=0.1)
  >>> var1 = tf.Variable(10.0)
  >>> loss = lambda: (var1 ** 2)/2.0 # d(loss)/d(var1) == var1
  >>> step_count = opt.minimize(loss, [var1]).numpy()
  >>> # The first step is `-learning_rate*sign(grad)`
  >>> var1.numpy()
  9.9
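
  The AMSGrad variant and gradient clipping are enabled through the same
  constructor arguments; a minimal sketch (the values here are illustrative
  only, not recommendations):

  >>> opt = tf.keras.optimizers.Adam(
  ...     learning_rate=0.1, amsgrad=True, clipnorm=1.0)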

  Reference:
    - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
    - [Reddi et al., 2018](
        https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`.

  Notes:

  The default value of 1e-7 for epsilon might not be a good default in
  general. For example, when training an Inception network on ImageNet a
  current good choice is 1.0 or 0.1. Note that since Adam uses the
  formulation just before Section 2.1 of the Kingma and Ba paper rather than
  the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
  hat" in the paper.
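
  A larger epsilon is passed directly to the constructor; for instance (an
  illustrative value, not a recommendation for any particular model):

  >>> opt = tf.keras.optimizers.Adam(epsilon=0.1)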

  The sparse implementation of this algorithm (used when the gradient is an
  IndexedSlices object, typically because of `tf.gather` or an embedding
  lookup in the forward pass) does apply momentum to variable slices even if
  they were not used in the forward pass (meaning they have a gradient equal
  to zero). Momentum decay (beta1) is also applied to the entire momentum
  accumulator. This means that the sparse behavior is equivalent to the dense
  behavior (in contrast to some momentum implementations which ignore momentum
  unless a variable slice was actually used).
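
  As a rough sketch of how a sparse gradient reaches this optimizer (eager
  mode assumed; `tf.gather` yields an `IndexedSlices` gradient for `emb`):

  >>> emb = tf.Variable(tf.ones([3, 2]))
  >>> opt = tf.keras.optimizers.Adam(learning_rate=0.1)
  >>> with tf.GradientTape() as tape:
  ...   loss = tf.reduce_sum(tf.gather(emb, [0]))
  >>> _ = opt.apply_gradients([(tape.gradient(loss, emb), emb)])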
  """

  _HAS_AGGREGATE_GRAD = True

  def __init__(self,
               learning_rate=0.001,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=1e-7,
               amsgrad=False,
               name='Adam',
               **kwargs):
    super(Adam, self).__init__(name, **kwargs)
    self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
    self._set_hyper('decay', self._initial_decay)
    self._set_hyper('beta_1', beta_1)
    self._set_hyper('beta_2', beta_2)
    self.epsilon = epsilon or backend_config.epsilon()
    self.amsgrad = amsgrad

  def _create_slots(self, var_list):
    # Create slots for the first and second moments.
    # Separate for-loops to respect the ordering of slot variables from v1.
    for var in var_list:
      self.add_slot(var, 'm')
    for var in var_list:
      self.add_slot(var, 'v')
    if self.amsgrad:
      for var in var_list:
        self.add_slot(var, 'vhat')

  def _prepare_local(self, var_device, var_dtype, apply_state):
    super(Adam, self)._prepare_local(var_device, var_dtype, apply_state)

    local_step = math_ops.cast(self.iterations + 1, var_dtype)
    beta_1_t = array_ops.identity(self._get_hyper('beta_1', var_dtype))
    beta_2_t = array_ops.identity(self._get_hyper('beta_2', var_dtype))
    beta_1_power = math_ops.pow(beta_1_t, local_step)
    beta_2_power = math_ops.pow(beta_2_t, local_step)
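    # Fold the bias correction into the step size:
    # lr = lr_t * sqrt(1 - beta_2^t) / (1 - beta_1^t), matching the
    # formulation just before Section 2.1 of Kingma and Ba (2014).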
    lr = (apply_state[(var_device, var_dtype)]['lr_t'] *
          (math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power)))
    apply_state[(var_device, var_dtype)].update(
        dict(
            lr=lr,
            epsilon=tensor_conversion.convert_to_tensor_v2_with_dispatch(
                self.epsilon, var_dtype
            ),
            beta_1_t=beta_1_t,
            beta_1_power=beta_1_power,
            one_minus_beta_1_t=1 - beta_1_t,
            beta_2_t=beta_2_t,
            beta_2_power=beta_2_power,
            one_minus_beta_2_t=1 - beta_2_t,
        )
    )

  def set_weights(self, weights):
    params = self.weights
    # If the weights were generated by the Keras V1 optimizer, they include
    # vhats even without amsgrad, i.e., the V1 optimizer has 3x + 1 variables,
    # while the V2 optimizer has 2x + 1 variables. Filter vhats out for
    # compatibility.
    num_vars = int((len(params) - 1) / 2)
    if len(weights) == 3 * num_vars + 1:
      weights = weights[:len(params)]
    super(Adam, self).set_weights(weights)

  def _resource_apply_dense(self, grad, var, apply_state=None):
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))

    m = self.get_slot(var, 'm')
    v = self.get_slot(var, 'v')
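
    # Note: the fused ResourceApplyAdam kernel applies the bias correction
    # derived from beta_1_power / beta_2_power itself, so the uncorrected
    # 'lr_t' (rather than the precomputed 'lr') is passed below.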
    if not self.amsgrad:
      return gen_training_ops.ResourceApplyAdam(
          var=var.handle,
          m=m.handle,
          v=v.handle,
          beta1_power=coefficients['beta_1_power'],
          beta2_power=coefficients['beta_2_power'],
          lr=coefficients['lr_t'],
          beta1=coefficients['beta_1_t'],
          beta2=coefficients['beta_2_t'],
          epsilon=coefficients['epsilon'],
          grad=grad,
          use_locking=self._use_locking)
    else:
      vhat = self.get_slot(var, 'vhat')
      return gen_training_ops.ResourceApplyAdamWithAmsgrad(
          var=var.handle,
          m=m.handle,
          v=v.handle,
          vhat=vhat.handle,
          beta1_power=coefficients['beta_1_power'],
          beta2_power=coefficients['beta_2_power'],
          lr=coefficients['lr_t'],
          beta1=coefficients['beta_1_t'],
          beta2=coefficients['beta_2_t'],
          epsilon=coefficients['epsilon'],
          grad=grad,
          use_locking=self._use_locking)

  def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))

    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, 'm')
    m_scaled_g_values = grad * coefficients['one_minus_beta_1_t']
    m_t = state_ops.assign(m, m * coefficients['beta_1_t'],
                           use_locking=self._use_locking)
    with ops.control_dependencies([m_t]):
      m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, 'v')
    v_scaled_g_values = (grad * grad) * coefficients['one_minus_beta_2_t']
    v_t = state_ops.assign(v, v * coefficients['beta_2_t'],
                           use_locking=self._use_locking)
    with ops.control_dependencies([v_t]):
      v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)

    if not self.amsgrad:
      v_sqrt = math_ops.sqrt(v_t)
      var_update = state_ops.assign_sub(
          var, coefficients['lr'] * m_t / (v_sqrt + coefficients['epsilon']),
          use_locking=self._use_locking)
      return control_flow_ops.group(*[var_update, m_t, v_t])
    else:
      v_hat = self.get_slot(var, 'vhat')
      v_hat_t = math_ops.maximum(v_hat, v_t)
      with ops.control_dependencies([v_hat_t]):
        v_hat_t = state_ops.assign(
            v_hat, v_hat_t, use_locking=self._use_locking)
      v_hat_sqrt = math_ops.sqrt(v_hat_t)
      var_update = state_ops.assign_sub(
          var,
          coefficients['lr'] * m_t / (v_hat_sqrt + coefficients['epsilon']),
          use_locking=self._use_locking)
      return control_flow_ops.group(*[var_update, m_t, v_t, v_hat_t])

  def get_config(self):
    config = super(Adam, self).get_config()
    config.update({
        'learning_rate': self._serialize_hyperparameter('learning_rate'),
        'decay': self._initial_decay,
        'beta_1': self._serialize_hyperparameter('beta_1'),
        'beta_2': self._serialize_hyperparameter('beta_2'),
        'epsilon': self.epsilon,
        'amsgrad': self.amsgrad,
    })
    return config


class NonFusedAdam(optimizer_v2.OptimizerV2):
  r"""Optimizer that implements the Adam algorithm without fused kernels.

  Adam optimization is a stochastic gradient descent method that is based on
  adaptive estimation of first-order and second-order moments.
  According to the paper
  [Adam: A Method for Stochastic Optimization. Kingma et al.,
  2014](http://arxiv.org/abs/1412.6980), the method is "*computationally
  efficient, has little memory requirement, invariant to diagonal rescaling of
  gradients, and is well suited for problems that are large in terms of
  data/parameters*".

  For AMSGrad see [On The Convergence Of Adam And Beyond.
  Reddi et al., 2018](https://openreview.net/pdf?id=ryQu7f-RZ).

  **If amsgrad = False**:

  initialize $m_0$ as 1st moment vector
  initialize $v_0$ as 2nd moment vector

  The update rule for $\theta$ with gradient $g$ uses an optimization
  described at the end of section 2 of the paper:

  $$lr_t = \mathrm{learning\_rate} *
    \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$
  $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
  $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
  $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$

  **If amsgrad = True**:

  initialize $m_0$ as 1st moment vector
  initialize $v_0$ as 2nd moment vector
  initialize $\hat{v}_0$ as 2nd moment vector

  The update rule for $\theta$ with gradient $g$ uses an optimization
  described at the end of section 2 of the paper:

  $$lr_t = \mathrm{learning\_rate} *
    \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$

  $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
  $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
  $$\hat{v}_t = \max(\hat{v}_{t-1}, v_t)$$
  $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{\hat{v}_t} + \epsilon)$$
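
  As a plain-Python illustration of a single step of the `amsgrad = False`
  rule at $t = 1$ (arbitrary values, starting from zero-initialized moments):

  >>> lr, beta_1, beta_2, eps = 0.001, 0.9, 0.999, 1e-7
  >>> g, m, v, theta = 2.0, 0.0, 0.0, 10.0
  >>> lr_t = lr * (1 - beta_2) ** 0.5 / (1 - beta_1)
  >>> m = beta_1 * m + (1 - beta_1) * g
  >>> v = beta_2 * v + (1 - beta_2) * g ** 2
  >>> theta = theta - lr_t * m / (v ** 0.5 + eps)
  >>> round(theta, 3)
  9.999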

  The default value of 1e-7 for epsilon might not be a good default in
  general. For example, when training an Inception network on ImageNet a
  current good choice is 1.0 or 0.1. Note that since Adam uses the
  formulation just before Section 2.1 of the Kingma and Ba paper rather than
  the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
  hat" in the paper.

  The sparse implementation of this algorithm (used when the gradient is an
  IndexedSlices object, typically because of `tf.gather` or an embedding
  lookup in the forward pass) does apply momentum to variable slices even if
  they were not used in the forward pass (meaning they have a gradient equal
  to zero). Momentum decay (beta1) is also applied to the entire momentum
  accumulator. This means that the sparse behavior is equivalent to the dense
  behavior (in contrast to some momentum implementations which ignore momentum
  unless a variable slice was actually used).

  Usage:

  >>> opt = tf.keras.optimizers.Adam(learning_rate=0.1)
  >>> var1 = tf.Variable(10.0)
  >>> loss = lambda: (var1 ** 2)/2.0 # d(loss)/d(var1) == var1
  >>> step_count = opt.minimize(loss, [var1]).numpy()
  >>> # The first step is `-learning_rate*sign(grad)`
  >>> var1.numpy()
  9.9
  """

  _HAS_AGGREGATE_GRAD = True

  def __init__(self,
               learning_rate=0.001,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=1e-7,
               amsgrad=False,
               name='Adam',
               **kwargs):
    """Construct a new Adam optimizer.

    Args:
      learning_rate: A `Tensor`, floating point value, or a schedule that is a
        `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
        that takes no arguments and returns the actual value to use. The
        learning rate. Defaults to 0.001.
      beta_1: A float value or a constant float tensor, or a callable that
        takes no arguments and returns the actual value to use. The
        exponential decay rate for the 1st moment estimates. Defaults to 0.9.
      beta_2: A float value or a constant float tensor, or a callable that
        takes no arguments and returns the actual value to use. The
        exponential decay rate for the 2nd moment estimates. Defaults to
        0.999.
      epsilon: A small constant for numerical stability. This epsilon is
        "epsilon hat" in the Kingma and Ba paper (in the formula just before
        Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
        1e-7.
      amsgrad: Boolean. Whether to apply the AMSGrad variant of this algorithm
        from the paper "On the Convergence of Adam and Beyond". Defaults to
        `False`.
      name: Optional name for the operations created when applying gradients.
        Defaults to "Adam".
      **kwargs: Keyword arguments. Allowed to be {`clipnorm`, `clipvalue`,
        `lr`, `decay`}. `clipnorm` clips gradients by norm; `clipvalue` clips
        gradients by value; `decay` is included for backward compatibility to
        allow time-inverse decay of the learning rate; `lr` is included for
        backward compatibility, and using `learning_rate` is recommended
        instead.
    """

    super(NonFusedAdam, self).__init__(name, **kwargs)
    self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
    self._set_hyper('decay', self._initial_decay)
    self._set_hyper('beta_1', beta_1)
    self._set_hyper('beta_2', beta_2)
    self.epsilon = epsilon or backend_config.epsilon()
    self.amsgrad = amsgrad

  def _create_slots(self, var_list):
    # Create slots for the first and second moments.
    # Separate for-loops to respect the ordering of slot variables from v1.
    for var in var_list:
      self.add_slot(var, 'm')
    for var in var_list:
      self.add_slot(var, 'v')
    if self.amsgrad:
      for var in var_list:
        self.add_slot(var, 'vhat')

  def _prepare_local(self, var_device, var_dtype, apply_state):
    super(NonFusedAdam, self)._prepare_local(var_device, var_dtype, apply_state)

    local_step = math_ops.cast(self.iterations + 1, var_dtype)
    beta_1_t = array_ops.identity(self._get_hyper('beta_1', var_dtype))
    beta_2_t = array_ops.identity(self._get_hyper('beta_2', var_dtype))
    beta_1_power = math_ops.pow(beta_1_t, local_step)
    beta_2_power = math_ops.pow(beta_2_t, local_step)
    lr = (
        apply_state[(var_device, var_dtype)]['lr_t'] *
        (math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power)))
    apply_state[(var_device, var_dtype)].update(
        dict(
            lr=lr,
            epsilon=tensor_conversion.convert_to_tensor_v2_with_dispatch(
                self.epsilon, var_dtype
            ),
            beta_1_t=beta_1_t,
            beta_1_power=beta_1_power,
            one_minus_beta_1_t=1 - beta_1_t,
            beta_2_t=beta_2_t,
            beta_2_power=beta_2_power,
            one_minus_beta_2_t=1 - beta_2_t,
        )
    )

  def set_weights(self, weights):
    params = self.weights
    # If the weights were generated by the Keras V1 optimizer, they include
    # vhats even without amsgrad, i.e., the V1 optimizer has 3x + 1 variables,
    # while the V2 optimizer has 2x + 1 variables. Filter vhats out for
    # compatibility.
    num_vars = int((len(params) - 1) / 2)
    if len(weights) == 3 * num_vars + 1:
      weights = weights[:len(params)]
    super(NonFusedAdam, self).set_weights(weights)

  @def_function.function(jit_compile=True)
  def _resource_apply_dense(self, grad, var, apply_state=None):
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype)) or
                    self._fallback_apply_state(var_device, var_dtype))

    m = self.get_slot(var, 'm')
    v = self.get_slot(var, 'v')

    # Bias-corrected step size, as in the update rule documented above.
    alpha = (
        coefficients['lr_t'] * math_ops.sqrt(1 - coefficients['beta_2_power']) /
        (1 - coefficients['beta_1_power']))
    m.assign_add((grad - m) * (1 - coefficients['beta_1_t']))
    v.assign_add((math_ops.square(grad) - v) * (1 - coefficients['beta_2_t']))
    if self.amsgrad:
      vhat = self.get_slot(var, 'vhat')
      vhat.assign(math_ops.maximum(vhat, v))
      v = vhat
    # Add (not subtract) epsilon in the denominator, matching the documented
    # update rule theta_t = theta_{t-1} - lr_t * m_t / (sqrt(v_t) + epsilon)
    # and the sparse path below.
    var.assign_sub(
        (m * alpha) / (math_ops.sqrt(v) + coefficients['epsilon']))

  @def_function.function(jit_compile=True)
  def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype)) or
                    self._fallback_apply_state(var_device, var_dtype))

    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, 'm')
    m_scaled_g_values = grad * coefficients['one_minus_beta_1_t']
    m.assign(m * coefficients['beta_1_t'])
    m.scatter_add(indexed_slices.IndexedSlices(m_scaled_g_values, indices))

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, 'v')
    v_scaled_g_values = (grad * grad) * coefficients['one_minus_beta_2_t']
    v.assign(v * coefficients['beta_2_t'])
    v.scatter_add(indexed_slices.IndexedSlices(v_scaled_g_values, indices))

    if not self.amsgrad:
      var.assign_sub(coefficients['lr'] * m /
                     (math_ops.sqrt(v) + coefficients['epsilon']))
    else:
      v_hat = self.get_slot(var, 'vhat')
      v_hat.assign(math_ops.maximum(v_hat, v))
      var.assign_sub(coefficients['lr'] * m /
                     (math_ops.sqrt(v_hat) + coefficients['epsilon']))

  def get_config(self):
    config = super(NonFusedAdam, self).get_config()
    config.update({
        'learning_rate': self._serialize_hyperparameter('learning_rate'),
        'decay': self._initial_decay,
        'beta_1': self._serialize_hyperparameter('beta_1'),
        'beta_2': self._serialize_hyperparameter('beta_2'),
        'epsilon': self.epsilon,
        'amsgrad': self.amsgrad,
    })
    return config