Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/keras/src/optimizers/adagrad.py: 36%
39 statements
coverage.py v7.4.0, created at 2024-01-03 07:57 +0000
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Adagrad optimizer implementation."""

import tensorflow.compat.v2 as tf

from keras.src import initializers
from keras.src.optimizers import optimizer
from keras.src.saving.object_registration import register_keras_serializable

# isort: off
from tensorflow.python.util.tf_export import keras_export


@register_keras_serializable()
@keras_export(
    "keras.optimizers.experimental.Adagrad",
    "keras.optimizers.Adagrad",
    "keras.dtensor.experimental.optimizers.Adagrad",
    v1=[],
)
class Adagrad(optimizer.Optimizer):
    r"""Optimizer that implements the Adagrad algorithm.

    Adagrad is an optimizer with parameter-specific learning rates,
    which are adapted relative to how frequently a parameter gets
    updated during training. The more updates a parameter receives,
    the smaller the updates.

    Args:
        learning_rate: Initial value for the learning rate:
            either a floating point value,
            or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance.
            Defaults to 0.001.
            Note that `Adagrad` tends to benefit from higher initial learning rate
            values compared to other optimizers.
            To match the exact form in the original paper, use 1.0.
        initial_accumulator_value: Floating point value.
            Starting value for the accumulators (per-parameter momentum values).
            Must be non-negative.
        epsilon: Small floating point value used to maintain numerical stability.
        {{base_optimizer_keyword_args}}

    Reference:
        - [Duchi et al., 2011](
            http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf).
    """
    def __init__(
        self,
        learning_rate=0.001,
        initial_accumulator_value=0.1,
        epsilon=1e-7,
        weight_decay=None,
        clipnorm=None,
        clipvalue=None,
        global_clipnorm=None,
        use_ema=False,
        ema_momentum=0.99,
        ema_overwrite_frequency=None,
        jit_compile=True,
        name="Adagrad",
        **kwargs
    ):
        super().__init__(
            weight_decay=weight_decay,
            clipnorm=clipnorm,
            clipvalue=clipvalue,
            global_clipnorm=global_clipnorm,
            use_ema=use_ema,
            ema_momentum=ema_momentum,
            ema_overwrite_frequency=ema_overwrite_frequency,
            jit_compile=jit_compile,
            name=name,
            **kwargs
        )
        self._learning_rate = self._build_learning_rate(learning_rate)
        self.initial_accumulator_value = initial_accumulator_value
        self.epsilon = epsilon

    def build(self, var_list):
        super().build(var_list)
        if hasattr(self, "_built") and self._built:
            return
        self._built = True
        self._accumulators = []
        initializer = initializers.Constant(self.initial_accumulator_value)
        for var in var_list:
            self._accumulators.append(
                self.add_variable_from_reference(
                    var,
                    "accumulator",
                    initial_value=initializer(shape=var.shape, dtype=var.dtype),
                )
            )

    def update_step(self, grad, variable):
        """Update step given gradient and the associated model variable."""
        lr = tf.cast(self.learning_rate, variable.dtype)

        var_key = self._var_key(variable)
        accumulator = self._accumulators[self._index_dict[var_key]]

        if isinstance(grad, tf.IndexedSlices):
            # Sparse gradients.
            accumulator.scatter_add(
                tf.IndexedSlices(grad.values * grad.values, grad.indices)
            )
            sparse_accumulator = tf.gather(accumulator, indices=grad.indices)
            sparse_denominator = tf.sqrt(sparse_accumulator + self.epsilon)
            variable.scatter_add(
                tf.IndexedSlices(
                    -lr * grad.values / sparse_denominator, grad.indices
                )
            )
        else:
            # Dense gradients.
            accumulator.assign_add(grad * grad)
            variable.assign_sub(lr * grad / tf.sqrt(accumulator + self.epsilon))

    def get_config(self):
        config = super().get_config()

        config.update(
            {
                "learning_rate": self._serialize_hyperparameter(
                    self._learning_rate
                ),
                "initial_accumulator_value": self.initial_accumulator_value,
                "epsilon": self.epsilon,
            }
        )
        return config


Adagrad.__doc__ = Adagrad.__doc__.replace(
    "{{base_optimizer_keyword_args}}", optimizer.base_optimizer_keyword_args
)
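

# A minimal usage sketch (illustrative only, not part of the upstream module):
# the optimizer keeps one accumulator per trainable variable and can be driven
# directly through `apply_gradients`, just as the Keras training loop does.
if __name__ == "__main__":
    var = tf.Variable([1.0, 2.0])
    opt = Adagrad(learning_rate=1.0, initial_accumulator_value=0.1)
    with tf.GradientTape() as tape:
        loss = tf.reduce_sum(tf.square(var))
    grads = tape.gradient(loss, [var])
    opt.apply_gradients(zip(grads, [var]))
    print(var.numpy())  # each coordinate steps toward the minimum at the origin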