Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/keras/src/optimizers/legacy/gradient_descent.py: 30%
47 statements

# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SGD optimizer implementation."""

import tensorflow.compat.v2 as tf

from keras.src.optimizers.legacy import optimizer_v2

# isort: off
from tensorflow.python.util.tf_export import keras_export


@keras_export(
    "keras.optimizers.legacy.SGD",
    v1=["keras.optimizers.SGD", "keras.optimizers.legacy.SGD"],
)
class SGD(optimizer_v2.OptimizerV2):
    r"""Gradient descent (with momentum) optimizer.

    Update rule for parameter `w` with gradient `g` when `momentum=0`:

    ```python
    w = w - learning_rate * g
    ```

    Update rule when `momentum` is larger than 0:

    ```python
    velocity = momentum * velocity - learning_rate * g
    w = w + velocity
    ```

    When `nesterov=True`, this rule becomes:

    ```python
    velocity = momentum * velocity - learning_rate * g
    w = w + momentum * velocity - learning_rate * g
    ```
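
    For example, with `velocity = 0`, `momentum = 0.9`, `learning_rate = 0.1`
    and a constant gradient `g = 1.0`, a plain momentum step moves `w` by
    `-0.1`, while a Nesterov step moves it by
    `0.9 * (-0.1) - 0.1 * 1.0 = -0.19`, i.e. it also looks one step ahead
    along the updated velocity.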

    Args:
      learning_rate: A `Tensor`, floating point value, or a schedule that is a
        `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
        that takes no arguments and returns the actual value to use. The
        learning rate. Defaults to `0.01`.
      momentum: float hyperparameter >= 0 that accelerates gradient descent in
        the relevant direction and dampens oscillations. Vanilla gradient
        descent means no momentum. Defaults to `0.`.
      nesterov: boolean. Whether to apply Nesterov momentum.
        Defaults to `False`.
      name: Optional name prefix for the operations created when applying
        gradients. Defaults to `"SGD"`.
      **kwargs: keyword arguments. Allowed arguments are `clipvalue`,
        `clipnorm`, `global_clipnorm`.
        If `clipvalue` (float) is set, the gradient of each weight
        is clipped to be no higher than this value.
        If `clipnorm` (float) is set, the gradient of each weight
        is individually clipped so that its norm is no higher than this value.
        If `global_clipnorm` (float) is set, the gradient of all weights is
        clipped so that their global norm is no higher than this value.
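
    For example, `tf.keras.optimizers.legacy.SGD(learning_rate=0.01,
    clipnorm=1.0)` clips each gradient individually to a norm of at most 1.0
    before applying the update.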

    Usage:

    >>> opt = tf.keras.optimizers.legacy.SGD(learning_rate=0.1)
    >>> var = tf.Variable(1.0)
    >>> loss = lambda: (var ** 2)/2.0  # d(loss)/d(var1) = var1
    >>> step_count = opt.minimize(loss, [var]).numpy()
    >>> # Step is `- learning_rate * grad`
    >>> var.numpy()
    0.9

    >>> opt = tf.keras.optimizers.legacy.SGD(learning_rate=0.1, momentum=0.9)
    >>> var = tf.Variable(1.0)
    >>> val0 = var.value()
    >>> loss = lambda: (var ** 2)/2.0  # d(loss)/d(var1) = var1
    >>> # First step is `- learning_rate * grad`
    >>> step_count = opt.minimize(loss, [var]).numpy()
    >>> val1 = var.value()
    >>> (val0 - val1).numpy()
    0.1
    >>> # On later steps, step-size increases because of momentum
    >>> step_count = opt.minimize(loss, [var]).numpy()
    >>> val2 = var.value()
    >>> (val1 - val2).numpy()
    0.18
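
    (After the first step `velocity = -0.1` and `var = 0.9`, so the second
    step applies `velocity = 0.9 * (-0.1) - 0.1 * 0.9 = -0.18`, which is the
    0.18 decrease shown above.)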

    Reference:
        - For `nesterov=True`, see [Sutskever et al., 2013](
          https://github.com/mlresearch/v28/blob/gh-pages/sutskever13.pdf).
    """

    _HAS_AGGREGATE_GRAD = True

    def __init__(
        self,
        learning_rate=0.01,
        momentum=0.0,
        nesterov=False,
        name="SGD",
        **kwargs,
    ):
        super().__init__(name, **kwargs)
        # `lr` is kept as a legacy alias for `learning_rate`.
        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
        self._set_hyper("decay", self._initial_decay)

        self._momentum = False
        if (
            isinstance(momentum, tf.Tensor)
            or callable(momentum)
            or momentum > 0
        ):
            self._momentum = True
        if isinstance(momentum, (int, float)) and (
            momentum < 0 or momentum > 1
        ):
            raise ValueError(
                "`momentum` must be between [0, 1]. Received: "
                f"momentum={momentum} (of type {type(momentum)})."
            )
        self._set_hyper("momentum", momentum)

        self.nesterov = nesterov

    def _create_slots(self, var_list):
        # One "momentum" slot holds the velocity accumulator for each variable.
        if self._momentum:
            for var in var_list:
                self.add_slot(var, "momentum")

    def _prepare_local(self, var_device, var_dtype, apply_state):
        super()._prepare_local(var_device, var_dtype, apply_state)
        # Materialize the momentum hyperparameter as a tensor per
        # (device, dtype) so the raw ops below can consume it directly.
        apply_state[(var_device, var_dtype)]["momentum"] = tf.identity(
            self._get_hyper("momentum", var_dtype)
        )

    def _resource_apply_dense(self, grad, var, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)
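
        # Sketch of the fused updates below (matching the update rules in the
        # class docstring): the momentum branch computes
        #     accum = momentum * accum - lr * grad
        #     var  += momentum * accum - lr * grad   # if nesterov
        #     var  += accum                          # otherwise
        # and the non-momentum branch is plain `var -= lr * grad`.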
        if self._momentum:
            momentum_var = self.get_slot(var, "momentum")
            return tf.raw_ops.ResourceApplyKerasMomentum(
                var=var.handle,
                accum=momentum_var.handle,
                lr=coefficients["lr_t"],
                grad=grad,
                momentum=coefficients["momentum"],
                use_locking=self._use_locking,
                use_nesterov=self.nesterov,
            )
        else:
            return tf.raw_ops.ResourceApplyGradientDescent(
                var=var.handle,
                alpha=coefficients["lr_t"],
                delta=grad,
                use_locking=self._use_locking,
            )

    def _resource_apply_sparse_duplicate_indices(
        self, grad, var, indices, **kwargs
    ):
        if self._momentum:
            return super()._resource_apply_sparse_duplicate_indices(
                grad, var, indices, **kwargs
            )
        else:
            var_device, var_dtype = var.device, var.dtype.base_dtype
            coefficients = kwargs.get("apply_state", {}).get(
                (var_device, var_dtype)
            ) or self._fallback_apply_state(var_device, var_dtype)
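
            # Without momentum, the sparse update is a scatter-add of
            # `-lr * grad` into the touched rows only; duplicate indices are
            # safe because their contributions simply accumulate.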
            return tf.raw_ops.ResourceScatterAdd(
                resource=var.handle,
                indices=indices,
                updates=-grad * coefficients["lr_t"],
            )

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        # This method is only needed for momentum optimization.
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)

        momentum_var = self.get_slot(var, "momentum")
        return tf.raw_ops.ResourceSparseApplyKerasMomentum(
            var=var.handle,
            accum=momentum_var.handle,
            lr=coefficients["lr_t"],
            grad=grad,
            indices=indices,
            momentum=coefficients["momentum"],
            use_locking=self._use_locking,
            use_nesterov=self.nesterov,
        )

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "learning_rate": self._serialize_hyperparameter(
                    "learning_rate"
                ),
                "decay": self._initial_decay,
                "momentum": self._serialize_hyperparameter("momentum"),
                "nesterov": self.nesterov,
            }
        )
        return config
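

# Usage sketch: `get_config` pairs with the `from_config` classmethod
# inherited from `OptimizerV2`, so an equivalent optimizer can be rebuilt
# from its serialized config, e.g.
#
#     opt = SGD(learning_rate=0.1, momentum=0.9, nesterov=True)
#     restored = SGD.from_config(opt.get_config())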