Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/keras/src/optimizers/legacy/gradient_descent.py: 30%

47 statements  

coverage.py v7.4.0, created at 2024-01-03 07:57 +0000

# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""SGD optimizer implementation."""


import tensorflow.compat.v2 as tf

from keras.src.optimizers.legacy import optimizer_v2

# isort: off
from tensorflow.python.util.tf_export import keras_export


@keras_export(
    "keras.optimizers.legacy.SGD",
    v1=["keras.optimizers.SGD", "keras.optimizers.legacy.SGD"],
)
class SGD(optimizer_v2.OptimizerV2):
    r"""Gradient descent (with momentum) optimizer.

    Update rule for parameter `w` with gradient `g` when `momentum=0`:

    ```python
    w = w - learning_rate * g
    ```

    Update rule when `momentum` is larger than 0:

    ```python
    velocity = momentum * velocity - learning_rate * g
    w = w + velocity
    ```

    When `nesterov=True`, this rule becomes:

    ```python
    velocity = momentum * velocity - learning_rate * g
    w = w + momentum * velocity - learning_rate * g
    ```

    Args:
      learning_rate: A `Tensor`, floating point value, or a schedule that is a
        `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
        that takes no arguments and returns the actual value to use. The
        learning rate. Defaults to `0.01`.
      momentum: float hyperparameter >= 0 that accelerates gradient descent in
        the relevant direction and dampens oscillations. Vanilla gradient
        descent means no momentum. Defaults to `0.`.
      nesterov: boolean. Whether to apply Nesterov momentum.
        Defaults to `False`.
      name: Optional name prefix for the operations created when applying
        gradients. Defaults to `"SGD"`.
      **kwargs: keyword arguments. Allowed arguments are `clipvalue`,
        `clipnorm`, and `global_clipnorm`.
        If `clipvalue` (float) is set, the gradient of each weight
        is clipped to be no higher than this value.
        If `clipnorm` (float) is set, the gradient of each weight
        is individually clipped so that its norm is no higher than this value.
        If `global_clipnorm` (float) is set, the gradient of all weights is
        clipped so that their global norm is no higher than this value.

    Usage:

    >>> opt = tf.keras.optimizers.legacy.SGD(learning_rate=0.1)
    >>> var = tf.Variable(1.0)
    >>> loss = lambda: (var ** 2) / 2.0  # d(loss)/d(var) = var
    >>> step_count = opt.minimize(loss, [var]).numpy()
    >>> # Step is `- learning_rate * grad`
    >>> var.numpy()
    0.9

    >>> opt = tf.keras.optimizers.legacy.SGD(learning_rate=0.1, momentum=0.9)
    >>> var = tf.Variable(1.0)
    >>> val0 = var.value()
    >>> loss = lambda: (var ** 2) / 2.0  # d(loss)/d(var) = var
    >>> # First step is `- learning_rate * grad`
    >>> step_count = opt.minimize(loss, [var]).numpy()
    >>> val1 = var.value()
    >>> (val0 - val1).numpy()
    0.1
    >>> # On later steps, the step size increases because of momentum
    >>> step_count = opt.minimize(loss, [var]).numpy()
    >>> val2 = var.value()
    >>> (val1 - val2).numpy()
    0.18

    Reference:
        - For `nesterov=True`, see [Sutskever et al., 2013](
          https://github.com/mlresearch/v28/blob/gh-pages/sutskever13.pdf).
    """
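    # Worked check of the doctest values above, assuming the momentum update
    # rule stated in the docstring (velocity = momentum * velocity
    # - learning_rate * g, then w = w + velocity), with learning_rate=0.1,
    # momentum=0.9, and grad == var because loss = var ** 2 / 2:
    #   step 1: velocity = 0.9 * 0.00 - 0.1 * 1.0 = -0.10, var: 1.00 -> 0.90
    #   step 2: velocity = 0.9 * -0.10 - 0.1 * 0.9 = -0.18, var: 0.90 -> 0.72
    # which reproduces the printed step sizes 0.1 and 0.18.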

    _HAS_AGGREGATE_GRAD = True

    def __init__(
        self,
        learning_rate=0.01,
        momentum=0.0,
        nesterov=False,
        name="SGD",
        **kwargs,
    ):
        super().__init__(name, **kwargs)
        # `lr` is the deprecated alias for `learning_rate`; prefer it if given.
        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
        self._set_hyper("decay", self._initial_decay)

        self._momentum = False
        if (
            isinstance(momentum, tf.Tensor)
            or callable(momentum)
            or momentum > 0
        ):
            self._momentum = True
        if isinstance(momentum, (int, float)) and (
            momentum < 0 or momentum > 1
        ):
            raise ValueError(
                "`momentum` must be between [0, 1]. Received: "
                f"momentum={momentum} (of type {type(momentum)})."
            )
        self._set_hyper("momentum", momentum)

        self.nesterov = nesterov

    def _create_slots(self, var_list):
        if self._momentum:
            for var in var_list:
                self.add_slot(var, "momentum")

    def _prepare_local(self, var_device, var_dtype, apply_state):
        super()._prepare_local(var_device, var_dtype, apply_state)
        apply_state[(var_device, var_dtype)]["momentum"] = tf.identity(
            self._get_hyper("momentum", var_dtype)
        )

    def _resource_apply_dense(self, grad, var, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)

        if self._momentum:
            # Fused momentum update (optionally Nesterov) on the variable.
            momentum_var = self.get_slot(var, "momentum")
            return tf.raw_ops.ResourceApplyKerasMomentum(
                var=var.handle,
                accum=momentum_var.handle,
                lr=coefficients["lr_t"],
                grad=grad,
                momentum=coefficients["momentum"],
                use_locking=self._use_locking,
                use_nesterov=self.nesterov,
            )
        else:
            # Plain gradient descent: var -= lr * grad.
            return tf.raw_ops.ResourceApplyGradientDescent(
                var=var.handle,
                alpha=coefficients["lr_t"],
                delta=grad,
                use_locking=self._use_locking,
            )

    def _resource_apply_sparse_duplicate_indices(
        self, grad, var, indices, **kwargs
    ):
        if self._momentum:
            return super()._resource_apply_sparse_duplicate_indices(
                grad, var, indices, **kwargs
            )
        else:
            var_device, var_dtype = var.device, var.dtype.base_dtype
            coefficients = kwargs.get("apply_state", {}).get(
                (var_device, var_dtype)
            ) or self._fallback_apply_state(var_device, var_dtype)

            return tf.raw_ops.ResourceScatterAdd(
                resource=var.handle,
                indices=indices,
                updates=-grad * coefficients["lr_t"],
            )

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        # This method is only needed for momentum optimization.
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)

        momentum_var = self.get_slot(var, "momentum")
        return tf.raw_ops.ResourceSparseApplyKerasMomentum(
            var=var.handle,
            accum=momentum_var.handle,
            lr=coefficients["lr_t"],
            grad=grad,
            indices=indices,
            momentum=coefficients["momentum"],
            use_locking=self._use_locking,
            use_nesterov=self.nesterov,
        )

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "learning_rate": self._serialize_hyperparameter(
                    "learning_rate"
                ),
                "decay": self._initial_decay,
                "momentum": self._serialize_hyperparameter("momentum"),
                "nesterov": self.nesterov,
            }
        )
        return config
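# A minimal usage sketch (assumes a TensorFlow 2.x eager runtime): it
# exercises the SGD class defined above with the `clipnorm` keyword argument
# documented in the class docstring, and round-trips the hyperparameters
# through `get_config`/`from_config`.
if __name__ == "__main__":
    opt = SGD(learning_rate=0.1, momentum=0.9, nesterov=True, clipnorm=1.0)
    var = tf.Variable(1.0)
    loss = lambda: (var ** 2) / 2.0  # d(loss)/d(var) = var
    opt.minimize(loss, [var])  # one Nesterov momentum step on `var`

    # Rebuild an equivalent optimizer from the serialized hyperparameters.
    config = opt.get_config()
    opt2 = SGD.from_config(config)
    print(var.numpy(), config["momentum"], opt2.nesterov)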