Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/keras/src/optimizers/adagrad.py: 36%
39 statements
coverage.py v7.4.0, created at 2024-01-03 07:57 +0000
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Adagrad optimizer implementation."""

import tensorflow.compat.v2 as tf

from keras.src import initializers
from keras.src.optimizers import optimizer
from keras.src.saving.object_registration import register_keras_serializable

# isort: off
from tensorflow.python.util.tf_export import keras_export


@register_keras_serializable()
@keras_export(
    "keras.optimizers.experimental.Adagrad",
    "keras.optimizers.Adagrad",
    "keras.dtensor.experimental.optimizers.Adagrad",
    v1=[],
)
class Adagrad(optimizer.Optimizer):
    r"""Optimizer that implements the Adagrad algorithm.

    Adagrad is an optimizer with parameter-specific learning rates,
    which are adapted relative to how frequently a parameter gets
    updated during training. The more updates a parameter receives,
    the smaller the updates.

    Args:
        learning_rate: Initial value for the learning rate:
            either a floating point value,
            or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance.
            Defaults to 0.001.
            Note that `Adagrad` tends to benefit from higher initial learning rate
            values compared to other optimizers.
            To match the exact form in the original paper, use 1.0.
        initial_accumulator_value: Floating point value.
            Starting value for the accumulators (per-parameter momentum values).
            Must be non-negative.
        epsilon: Small floating point value used to maintain numerical stability.
        {{base_optimizer_keyword_args}}

    Reference:
        - [Duchi et al., 2011](
            http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf).
    """
    def __init__(
        self,
        learning_rate=0.001,
        initial_accumulator_value=0.1,
        epsilon=1e-7,
        weight_decay=None,
        clipnorm=None,
        clipvalue=None,
        global_clipnorm=None,
        use_ema=False,
        ema_momentum=0.99,
        ema_overwrite_frequency=None,
        jit_compile=True,
        name="Adagrad",
        **kwargs
    ):
        super().__init__(
            weight_decay=weight_decay,
            clipnorm=clipnorm,
            clipvalue=clipvalue,
            global_clipnorm=global_clipnorm,
            use_ema=use_ema,
            ema_momentum=ema_momentum,
            ema_overwrite_frequency=ema_overwrite_frequency,
            jit_compile=jit_compile,
            name=name,
            **kwargs
        )
        self._learning_rate = self._build_learning_rate(learning_rate)
        self.initial_accumulator_value = initial_accumulator_value
        self.epsilon = epsilon

    def build(self, var_list):
        super().build(var_list)
        if hasattr(self, "_built") and self._built:
            return
        self._built = True
        self._accumulators = []
        initializer = initializers.Constant(self.initial_accumulator_value)
        for var in var_list:
            self._accumulators.append(
                self.add_variable_from_reference(
                    var,
                    "accumulator",
                    initial_value=initializer(shape=var.shape, dtype=var.dtype),
                )
            )

    def update_step(self, grad, variable):
        """Update step given gradient and the associated model variable."""
        lr = tf.cast(self.learning_rate, variable.dtype)

        var_key = self._var_key(variable)
        accumulator = self._accumulators[self._index_dict[var_key]]

        if isinstance(grad, tf.IndexedSlices):
            # Sparse gradients.
            accumulator.scatter_add(
                tf.IndexedSlices(grad.values * grad.values, grad.indices)
            )
            sparse_accumulator = tf.gather(accumulator, indices=grad.indices)
            sparse_denominator = tf.sqrt(sparse_accumulator + self.epsilon)
            variable.scatter_add(
                tf.IndexedSlices(
                    -lr * grad.values / sparse_denominator, grad.indices
                )
            )
        else:
            # Dense gradients.
            accumulator.assign_add(grad * grad)
            variable.assign_sub(lr * grad / tf.sqrt(accumulator + self.epsilon))

    def get_config(self):
        config = super().get_config()

        config.update(
            {
                "learning_rate": self._serialize_hyperparameter(
                    self._learning_rate
                ),
                "initial_accumulator_value": self.initial_accumulator_value,
                "epsilon": self.epsilon,
            }
        )
        return config


Adagrad.__doc__ = Adagrad.__doc__.replace(
    "{{base_optimizer_keyword_args}}", optimizer.base_optimizer_keyword_args
)
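

# A minimal usage sketch (illustrative only, not part of the upstream module):
# the optimizer keeps one accumulator per trainable variable and can be driven
# directly through `apply_gradients`, just as the Keras training loop does.
if __name__ == "__main__":
    var = tf.Variable([1.0, 2.0])
    opt = Adagrad(learning_rate=1.0, initial_accumulator_value=0.1)
    with tf.GradientTape() as tape:
        loss = tf.reduce_sum(tf.square(var))
    grads = tape.gradient(loss, [var])
    opt.apply_gradients(zip(grads, [var]))
    print(var.numpy())  # each coordinate steps toward the minimum at the origin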