# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""NovoGrad for TensorFlow."""

import tensorflow as tf
from tensorflow_addons.utils.types import FloatTensorLike
from tensorflow_addons.optimizers import KerasLegacyOptimizer
from typing import Union, Callable
from typeguard import typechecked


@tf.keras.utils.register_keras_serializable(package="Addons")
class NovoGrad(KerasLegacyOptimizer):
26 """Optimizer that implements NovoGrad. 

27 

28 The NovoGrad Optimizer was first proposed in [Stochastic Gradient 

29 Methods with Layerwise Adaptive Moments for training of Deep 

30 Networks](https://arxiv.org/pdf/1905.11286.pdf) NovoGrad is a 

31 first-order SGD-based algorithm, which computes second moments per 

32 layer instead of per weight as in Adam. Compared to Adam, NovoGrad 

33 takes less memory, and has been found to be more numerically stable. 

34 (For more information on the computation please refer to this 

35 [link](https://nvidia.github.io/OpenSeq2Seq/html/optimizers.html)) 

36 

37 Second order moment = exponential moving average of Layer-wise square 

38 of grads: 

39 v_t <-- beta_2 * v_{t-1} + (1-beta_2) * (g_t)^2 

40 First order moment in one of four modes: 

41 1. moment of grads normalized by v_t: 

42 m_t <- beta_1 * m_{t-1} + [ g_t / (sqrt(v_t)+epsilon)] 

43 2. moment similar to Adam: exponential moving average of grads 

44 normalized by v_t (set grad_averaging = True to use this): 

45 m_t <- beta_1 * m_{t-1} + 

46 [(1 - beta_1) * (g_t / (sqrt(v_t) + epsilon))] 

47 3. weight decay adds a w_d term after grads are rescaled by 

48 1/sqrt(v_t) (set weight_decay > 0 to use this0: 

49 m_t <- beta_1 * m_{t-1} + 

50 [(g_t / (sqrt(v_t) + epsilon)) + (w_d * w_{t-1})] 

51 4. weight decay + exponential moving average from Adam: 

52 m_t <- beta_1 * m_{t-1} + 

53 [(1 - beta_1) * ((g_t / (sqrt(v_t + epsilon)) + 

54 (w_d * w_{t-1}))] 

55 Weight update: 

56 w_t <- w_{t-1} - lr_t * m_t 

57 

58 Example of usage: 

59 ```python 

60 opt = tfa.optimizers.NovoGrad( 

61 lr=1e-3, 

62 beta_1=0.9, 

63 beta_2=0.999, 

64 weight_decay=0.001, 

65 grad_averaging=False 

66 ) 

67 ``` 

68 """ 

69 

    @typechecked
    def __init__(
        self,
        learning_rate: Union[FloatTensorLike, Callable] = 0.001,
        beta_1: FloatTensorLike = 0.9,
        beta_2: FloatTensorLike = 0.999,
        epsilon: FloatTensorLike = 1e-7,
        weight_decay: FloatTensorLike = 0.0,
        grad_averaging: bool = False,
        amsgrad: bool = False,
        name: str = "NovoGrad",
        **kwargs,
    ):
83 r"""Construct a new NovoGrad optimizer. 

84 

85 Args: 

86 learning_rate: A `Tensor` or a floating point value. or a schedule 

87 that is a `tf.keras.optimizers.schedules.LearningRateSchedule` 

88 The learning rate. 

89 beta_1: A float value or a constant float tensor. 

90 The exponential decay rate for the 1st moment estimates. 

91 beta_2: A float value or a constant float tensor. 

92 The exponential decay rate for the 2nd moment estimates. 

93 epsilon: A small constant for numerical stability. 

94 weight_decay: A floating point value. Weight decay for each param. 

95 grad_averaging: determines whether to use Adam style exponential 

96 moving averaging for the first order moments. 

97 **kwargs: keyword arguments. Allowed to be {`clipnorm`, 

98 `clipvalue`, `lr`, `decay`}. `clipnorm` is clip gradients 

99 by norm; `clipvalue` is clip gradients by value, `decay` is 

100 included for backward compatibility to allow time inverse 

101 decay of learning rate. `lr` is included for backward 

102 compatibility, recommended to use `learning_rate` instead. 

103 """ 

        super().__init__(name, **kwargs)
        if weight_decay < 0.0:
            raise ValueError("Weight decay rate cannot be negative")
        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
        self._set_hyper("decay", self._initial_decay)
        self._set_hyper("beta_1", beta_1)
        self._set_hyper("beta_2", beta_2)
        self._set_hyper("weight_decay", weight_decay)
        self._set_hyper("grad_averaging", grad_averaging)
        self.amsgrad = amsgrad
        self.epsilon = epsilon or tf.keras.backend.epsilon()

    def _create_slots(self, var_list):
        # Create slots for the first and second moments.
        # Separate for-loops to respect the ordering of slot variables from v1.
        for var in var_list:
            self.add_slot(var=var, slot_name="m", initializer="zeros")
        for var in var_list:
            self.add_slot(
                var=var, slot_name="v", initializer=tf.zeros(shape=[], dtype=var.dtype)
            )
        if self.amsgrad:
            for var in var_list:
                self.add_slot(var, "vhat")

    def _prepare_local(self, var_device, var_dtype, apply_state):
        super()._prepare_local(var_device, var_dtype, apply_state)
        beta_1_t = tf.identity(self._get_hyper("beta_1", var_dtype))
        beta_2_t = tf.identity(self._get_hyper("beta_2", var_dtype))
        apply_state[(var_device, var_dtype)].update(
            dict(
                epsilon=tf.convert_to_tensor(self.epsilon, var_dtype),
                beta_1_t=beta_1_t,
                beta_2_t=beta_2_t,
                one_minus_beta_2_t=1 - beta_2_t,
                one_minus_beta_1_t=1 - beta_1_t,
            )
        )

    def set_weights(self, weights):
        params = self.weights
        # If the weights are generated by Keras V1 optimizer, it includes vhats
        # even without amsgrad, i.e., V1 optimizer has 3x + 1 variables, while V2
        # optimizer has 2x + 1 variables. Filter vhats out for compatibility.
        num_vars = int((len(params) - 1) / 2)
        if len(weights) == 3 * num_vars + 1:
            weights = weights[: len(params)]
        super().set_weights(weights)

    def _resource_apply_dense(self, grad, var, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)
        weight_decay = self._get_hyper("weight_decay", var_dtype)
        grad_averaging = self._get_hyper("grad_averaging")

        v = self.get_slot(var, "v")
        # Layer-wise (per-variable) second moment: a scalar EMA of the squared
        # gradient norm, initialized with the first step's squared norm.
        g_2 = tf.reduce_sum(tf.square(grad))
        v_t = tf.cond(
            tf.equal(self.iterations, 0),
            lambda: g_2,
            lambda: v * coefficients["beta_2_t"]
            + g_2 * coefficients["one_minus_beta_2_t"],
        )
        v_t = v.assign(v_t, use_locking=self._use_locking)

        if self.amsgrad:
            vhat = self.get_slot(var, "vhat")
            vhat_t = vhat.assign(tf.maximum(vhat, v_t), use_locking=self._use_locking)
            grad = grad / (tf.sqrt(vhat_t) + self.epsilon)
        else:
            grad = grad / (tf.sqrt(v_t) + self.epsilon)
        # Optional weight decay (added after the gradient is rescaled) and
        # Adam-style gradient averaging.
        grad = tf.cond(
            tf.greater(weight_decay, 0), lambda: grad + weight_decay * var, lambda: grad
        )
        grad = tf.cond(
            tf.logical_and(grad_averaging, tf.not_equal(self.iterations, 0)),
            lambda: grad * coefficients["one_minus_beta_1_t"],
            lambda: grad,
        )
        m = self.get_slot(var, "m")
        # The Keras momentum op folds the learning rate into the accumulator:
        # m <- beta_1 * m - lr_t * grad; var <- var + m.
        return tf.raw_ops.ResourceApplyKerasMomentum(
            var=var.handle,
            accum=m.handle,
            lr=coefficients["lr_t"],
            grad=grad,
            momentum=coefficients["beta_1_t"],
            use_locking=self._use_locking,
            use_nesterov=False,
        )

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)
        weight_decay = self._get_hyper("weight_decay", var_dtype)
        grad_averaging = self._get_hyper("grad_averaging")

        v = self.get_slot(var, "v")
        g_2 = tf.reduce_sum(tf.square(grad))
        # v is just a scalar and does not need to involve sparse tensors.
        v_t = tf.cond(
            tf.equal(self.iterations, 0),
            lambda: g_2,
            lambda: v * coefficients["beta_2_t"]
            + g_2 * coefficients["one_minus_beta_2_t"],
        )
        v_t = v.assign(v_t, use_locking=self._use_locking)

        if self.amsgrad:
            vhat = self.get_slot(var, "vhat")
            vhat_t = vhat.assign(tf.maximum(vhat, v_t), use_locking=self._use_locking)
            grad = grad / (tf.sqrt(vhat_t) + self.epsilon)
        else:
            grad = grad / (tf.sqrt(v_t) + self.epsilon)
        grad = tf.cond(
            tf.greater(weight_decay, 0),
            lambda: grad + weight_decay * tf.gather(var, indices),
            lambda: grad,
        )
        grad = tf.cond(
            tf.logical_and(grad_averaging, tf.not_equal(self.iterations, 0)),
            lambda: grad * coefficients["one_minus_beta_1_t"],
            lambda: grad,
        )
        m = self.get_slot(var, "m")
        return tf.raw_ops.ResourceSparseApplyKerasMomentum(
            var=var.handle,
            accum=m.handle,
            lr=coefficients["lr_t"],
            grad=grad,
            indices=indices,
            momentum=coefficients["beta_1_t"],
            use_locking=self._use_locking,
            use_nesterov=False,
        )

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "learning_rate": self._serialize_hyperparameter("learning_rate"),
                "beta_1": self._serialize_hyperparameter("beta_1"),
                "beta_2": self._serialize_hyperparameter("beta_2"),
                "epsilon": self.epsilon,
                "weight_decay": self._serialize_hyperparameter("weight_decay"),
                "grad_averaging": self._serialize_hyperparameter("grad_averaging"),
            }
        )
        return config
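

# A minimal self-check sketch, not part of the published tfa.optimizers API:
# it takes a single optimizer step on a toy variable and compares the result
# against the layer-wise update documented in the class docstring, assuming
# eager execution and the default grad_averaging=False / amsgrad=False
# settings. The toy tensors and hyperparameter values are arbitrary choices
# made only for this demonstration.
if __name__ == "__main__":
    import numpy as np

    w0 = np.array([1.0, -2.0, 3.0], dtype=np.float32)
    g = np.array([0.1, -0.2, 0.3], dtype=np.float32)
    lr, wd, eps = 0.1, 0.01, 1e-7

    var = tf.Variable(w0)
    opt = NovoGrad(learning_rate=lr, weight_decay=wd, epsilon=eps)
    opt.apply_gradients([(tf.constant(g), var)])

    # On the first step, v_t is initialized to the layer-wise sum of squared
    # gradients, and with m_0 = 0 the update reduces to:
    #     w_1 = w_0 - lr * (g / (sqrt(sum(g^2)) + epsilon) + wd * w_0)
    v1 = np.sum(g ** 2)
    expected = w0 - lr * (g / (np.sqrt(v1) + eps) + wd * w0)
    np.testing.assert_allclose(var.numpy(), expected, rtol=1e-5, atol=1e-6)
    print("Single NovoGrad step matches the documented update formula.")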