Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/keras/src/optimizers/legacy/gradient_descent.py: 30%
47 statements

# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SGD optimizer implementation."""

import tensorflow.compat.v2 as tf

from keras.src.optimizers.legacy import optimizer_v2

# isort: off
from tensorflow.python.util.tf_export import keras_export


@keras_export(
    "keras.optimizers.legacy.SGD",
    v1=["keras.optimizers.SGD", "keras.optimizers.legacy.SGD"],
)
class SGD(optimizer_v2.OptimizerV2):
    r"""Gradient descent (with momentum) optimizer.

    Update rule for parameter `w` with gradient `g` when `momentum=0`:

    ```python
    w = w - learning_rate * g
    ```

    Update rule when `momentum` is larger than 0:

    ```python
    velocity = momentum * velocity - learning_rate * g
    w = w + velocity
    ```

    When `nesterov=True`, this rule becomes:

    ```python
    velocity = momentum * velocity - learning_rate * g
    w = w + momentum * velocity - learning_rate * g
    ```
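
    For example, with `velocity = 0`, `momentum = 0.9`, `learning_rate = 0.1`
    and a constant gradient `g = 1.0`, a plain momentum step moves `w` by
    `-0.1`, while a Nesterov step moves it by
    `0.9 * (-0.1) - 0.1 * 1.0 = -0.19`, i.e. it also looks one step ahead
    along the updated velocity.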

    Args:
      learning_rate: A `Tensor`, floating point value, or a schedule that is a
        `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
        that takes no arguments and returns the actual value to use. The
        learning rate. Defaults to `0.01`.
      momentum: float hyperparameter >= 0 that accelerates gradient descent in
        the relevant direction and dampens oscillations. Vanilla gradient
        descent means no momentum. Defaults to `0.`.
      nesterov: boolean. Whether to apply Nesterov momentum.
        Defaults to `False`.
      name: Optional name prefix for the operations created when applying
        gradients. Defaults to `"SGD"`.
      **kwargs: keyword arguments. Allowed arguments are `clipvalue`,
        `clipnorm`, `global_clipnorm`.
        If `clipvalue` (float) is set, the gradient of each weight
        is clipped to be no higher than this value.
        If `clipnorm` (float) is set, the gradient of each weight
        is individually clipped so that its norm is no higher than this value.
        If `global_clipnorm` (float) is set, the gradient of all weights is
        clipped so that their global norm is no higher than this value.
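
    For example, `tf.keras.optimizers.legacy.SGD(learning_rate=0.01,
    clipnorm=1.0)` clips each gradient individually to a norm of at most 1.0
    before applying the update.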

    Usage:

    >>> opt = tf.keras.optimizers.legacy.SGD(learning_rate=0.1)
    >>> var = tf.Variable(1.0)
    >>> loss = lambda: (var ** 2)/2.0  # d(loss)/d(var1) = var1
    >>> step_count = opt.minimize(loss, [var]).numpy()
    >>> # Step is `- learning_rate * grad`
    >>> var.numpy()
    0.9

    >>> opt = tf.keras.optimizers.legacy.SGD(learning_rate=0.1, momentum=0.9)
    >>> var = tf.Variable(1.0)
    >>> val0 = var.value()
    >>> loss = lambda: (var ** 2)/2.0  # d(loss)/d(var1) = var1
    >>> # First step is `- learning_rate * grad`
    >>> step_count = opt.minimize(loss, [var]).numpy()
    >>> val1 = var.value()
    >>> (val0 - val1).numpy()
    0.1
    >>> # On later steps, step-size increases because of momentum
    >>> step_count = opt.minimize(loss, [var]).numpy()
    >>> val2 = var.value()
    >>> (val1 - val2).numpy()
    0.18
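
    (After the first step `velocity = -0.1` and `var = 0.9`, so the second
    step applies `velocity = 0.9 * (-0.1) - 0.1 * 0.9 = -0.18`, which is the
    0.18 decrease shown above.)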

    Reference:
        - For `nesterov=True`, see [Sutskever et al., 2013](
          https://github.com/mlresearch/v28/blob/gh-pages/sutskever13.pdf).
    """

    _HAS_AGGREGATE_GRAD = True

    def __init__(
        self,
        learning_rate=0.01,
        momentum=0.0,
        nesterov=False,
        name="SGD",
        **kwargs,
    ):
        super().__init__(name, **kwargs)
        # `lr` is kept as a legacy alias for `learning_rate`.
        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
        self._set_hyper("decay", self._initial_decay)

        self._momentum = False
        if (
            isinstance(momentum, tf.Tensor)
            or callable(momentum)
            or momentum > 0
        ):
            self._momentum = True
        if isinstance(momentum, (int, float)) and (
            momentum < 0 or momentum > 1
        ):
            raise ValueError(
                "`momentum` must be between [0, 1]. Received: "
                f"momentum={momentum} (of type {type(momentum)})."
            )
        self._set_hyper("momentum", momentum)

        self.nesterov = nesterov

    def _create_slots(self, var_list):
        # One "momentum" slot holds the velocity accumulator for each variable.
        if self._momentum:
            for var in var_list:
                self.add_slot(var, "momentum")

    def _prepare_local(self, var_device, var_dtype, apply_state):
        super()._prepare_local(var_device, var_dtype, apply_state)
        # Materialize the momentum hyperparameter as a tensor per
        # (device, dtype) so the raw ops below can consume it directly.
        apply_state[(var_device, var_dtype)]["momentum"] = tf.identity(
            self._get_hyper("momentum", var_dtype)
        )

    def _resource_apply_dense(self, grad, var, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)
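
        # Sketch of the fused updates below (matching the update rules in the
        # class docstring): the momentum branch computes
        #     accum = momentum * accum - lr * grad
        #     var  += momentum * accum - lr * grad   # if nesterov
        #     var  += accum                          # otherwise
        # and the non-momentum branch is plain `var -= lr * grad`.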
        if self._momentum:
            momentum_var = self.get_slot(var, "momentum")
            return tf.raw_ops.ResourceApplyKerasMomentum(
                var=var.handle,
                accum=momentum_var.handle,
                lr=coefficients["lr_t"],
                grad=grad,
                momentum=coefficients["momentum"],
                use_locking=self._use_locking,
                use_nesterov=self.nesterov,
            )
        else:
            return tf.raw_ops.ResourceApplyGradientDescent(
                var=var.handle,
                alpha=coefficients["lr_t"],
                delta=grad,
                use_locking=self._use_locking,
            )

    def _resource_apply_sparse_duplicate_indices(
        self, grad, var, indices, **kwargs
    ):
        if self._momentum:
            return super()._resource_apply_sparse_duplicate_indices(
                grad, var, indices, **kwargs
            )
        else:
            var_device, var_dtype = var.device, var.dtype.base_dtype
            coefficients = kwargs.get("apply_state", {}).get(
                (var_device, var_dtype)
            ) or self._fallback_apply_state(var_device, var_dtype)
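
            # Without momentum, the sparse update is a scatter-add of
            # `-lr * grad` into the touched rows only; duplicate indices are
            # safe because their contributions simply accumulate.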
            return tf.raw_ops.ResourceScatterAdd(
                resource=var.handle,
                indices=indices,
                updates=-grad * coefficients["lr_t"],
            )

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        # This method is only needed for momentum optimization.
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)

        momentum_var = self.get_slot(var, "momentum")
        return tf.raw_ops.ResourceSparseApplyKerasMomentum(
            var=var.handle,
            accum=momentum_var.handle,
            lr=coefficients["lr_t"],
            grad=grad,
            indices=indices,
            momentum=coefficients["momentum"],
            use_locking=self._use_locking,
            use_nesterov=self.nesterov,
        )

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "learning_rate": self._serialize_hyperparameter(
                    "learning_rate"
                ),
                "decay": self._initial_decay,
                "momentum": self._serialize_hyperparameter("momentum"),
                "nesterov": self.nesterov,
            }
        )
        return config
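

# Usage sketch: `get_config` pairs with the `from_config` classmethod
# inherited from `OptimizerV2`, so an equivalent optimizer can be rebuilt
# from its serialized config, e.g.
#
#     opt = SGD(learning_rate=0.1, momentum=0.9, nesterov=True)
#     restored = SGD.from_config(opt.get_config())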