Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/keras/optimizer_v2/adam.py: 22%

152 statements  

coverage.py v7.4.0, created at 2024-01-03 07:57 +0000

# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Adam optimizer implementation."""
# pylint: disable=g-classes-have-attributes

from tensorflow.python.eager import def_function
from tensorflow.python.framework import indexed_slices
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_conversion
from tensorflow.python.keras import backend_config
from tensorflow.python.keras.optimizer_v2 import optimizer_v2
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.training import gen_training_ops
from tensorflow.python.util.tf_export import keras_export


@keras_export('keras.optimizers.Adam')
class Adam(optimizer_v2.OptimizerV2):
  r"""Optimizer that implements the Adam algorithm.

  Adam optimization is a stochastic gradient descent method that is based on
  adaptive estimation of first-order and second-order moments.

  According to
  [Kingma et al., 2014](http://arxiv.org/abs/1412.6980),
  the method is "*computationally
  efficient, has little memory requirement, invariant to diagonal rescaling of
  gradients, and is well suited for problems that are large in terms of
  data/parameters*".

  Args:
    learning_rate: A `Tensor`, floating point value, or a schedule that is a
      `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
      that takes no arguments and returns the actual value to use. The
      learning rate. Defaults to 0.001.
    beta_1: A float value or a constant float tensor, or a callable
      that takes no arguments and returns the actual value to use. The
      exponential decay rate for the 1st moment estimates. Defaults to 0.9.
    beta_2: A float value or a constant float tensor, or a callable
      that takes no arguments and returns the actual value to use. The
      exponential decay rate for the 2nd moment estimates. Defaults to 0.999.
    epsilon: A small constant for numerical stability. This epsilon is
      "epsilon hat" in the Kingma and Ba paper (in the formula just before
      Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
      1e-7.
    amsgrad: Boolean. Whether to apply the AMSGrad variant of this algorithm
      from the paper "On the Convergence of Adam and Beyond". Defaults to
      `False`.
    name: Optional name for the operations created when applying gradients.
      Defaults to `"Adam"`.
    **kwargs: Keyword arguments. Allowed to be one of
      `"clipnorm"` or `"clipvalue"`.
      `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
      gradients by value.
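
  For example, to clip gradients by norm while using the default
  hyperparameters otherwise (the values below are illustrative):

  >>> opt = tf.keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0)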

  Usage:

  >>> opt = tf.keras.optimizers.Adam(learning_rate=0.1)
  >>> var1 = tf.Variable(10.0)
  >>> loss = lambda: (var1 ** 2)/2.0  # d(loss)/d(var1) == var1
  >>> step_count = opt.minimize(loss, [var1]).numpy()
  >>> # The first step is `-learning_rate*sign(grad)`
  >>> var1.numpy()
  9.9
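
  The optimizer is more commonly passed to `tf.keras.Model.compile`; a minimal
  sketch (the one-layer model here is only for illustration):

  >>> model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
  >>> model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
  ...               loss='mse')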

  Reference:
    - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
    - [Reddi et al., 2018](
        https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`.

  Notes:

  The default value of 1e-7 for epsilon might not be a good default in
  general. For example, when training an Inception network on ImageNet a
  current good choice is 1.0 or 0.1. Note that since Adam uses the
  formulation just before Section 2.1 of the Kingma and Ba paper rather than
  the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
  hat" in the paper.
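
  For example, a larger epsilon can simply be passed at construction time
  (the value below is illustrative):

  >>> opt = tf.keras.optimizers.Adam(learning_rate=0.001, epsilon=0.1)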

  The sparse implementation of this algorithm (used when the gradient is an
  IndexedSlices object, typically because of `tf.gather` or an embedding
  lookup in the forward pass) does apply momentum to variable slices even if
  they were not used in the forward pass (meaning they have a gradient equal
  to zero). Momentum decay (beta1) is also applied to the entire momentum
  accumulator. This means that the sparse behavior is equivalent to the dense
  behavior (in contrast to some momentum implementations which ignore momentum
  unless a variable slice was actually used).
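
  A sparse (`tf.IndexedSlices`) gradient arises, for example, from
  `tf.gather`; a minimal sketch (the variable and loss are chosen only for
  illustration):

  >>> emb = tf.Variable(tf.ones([3, 2]))
  >>> opt = tf.keras.optimizers.Adam(learning_rate=0.1)
  >>> with tf.GradientTape() as tape:
  ...   loss = tf.reduce_sum(tf.gather(emb, [0]))
  >>> grad = tape.gradient(loss, emb)  # IndexedSlices touching only row 0
  >>> _ = opt.apply_gradients([(grad, emb)])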

102 """ 

103 

104 _HAS_AGGREGATE_GRAD = True 

105 

106 def __init__(self, 

107 learning_rate=0.001, 

108 beta_1=0.9, 

109 beta_2=0.999, 

110 epsilon=1e-7, 

111 amsgrad=False, 

112 name='Adam', 

113 **kwargs): 

114 super(Adam, self).__init__(name, **kwargs) 

115 self._set_hyper('learning_rate', kwargs.get('lr', learning_rate)) 

116 self._set_hyper('decay', self._initial_decay) 

117 self._set_hyper('beta_1', beta_1) 

118 self._set_hyper('beta_2', beta_2) 

119 self.epsilon = epsilon or backend_config.epsilon() 

120 self.amsgrad = amsgrad 

121 

122 def _create_slots(self, var_list): 

123 # Create slots for the first and second moments. 

124 # Separate for-loops to respect the ordering of slot variables from v1. 

125 for var in var_list: 

126 self.add_slot(var, 'm') 

127 for var in var_list: 

128 self.add_slot(var, 'v') 

129 if self.amsgrad: 

130 for var in var_list: 

131 self.add_slot(var, 'vhat') 

132 

133 def _prepare_local(self, var_device, var_dtype, apply_state): 

134 super(Adam, self)._prepare_local(var_device, var_dtype, apply_state) 

135 

136 local_step = math_ops.cast(self.iterations + 1, var_dtype) 

137 beta_1_t = array_ops.identity(self._get_hyper('beta_1', var_dtype)) 

138 beta_2_t = array_ops.identity(self._get_hyper('beta_2', var_dtype)) 

139 beta_1_power = math_ops.pow(beta_1_t, local_step) 

140 beta_2_power = math_ops.pow(beta_2_t, local_step) 

141 lr = (apply_state[(var_device, var_dtype)]['lr_t'] * 

142 (math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power))) 

143 apply_state[(var_device, var_dtype)].update( 

144 dict( 

145 lr=lr, 

146 epsilon=tensor_conversion.convert_to_tensor_v2_with_dispatch( 

147 self.epsilon, var_dtype 

148 ), 

149 beta_1_t=beta_1_t, 

150 beta_1_power=beta_1_power, 

151 one_minus_beta_1_t=1 - beta_1_t, 

152 beta_2_t=beta_2_t, 

153 beta_2_power=beta_2_power, 

154 one_minus_beta_2_t=1 - beta_2_t, 

155 ) 

156 ) 

  def set_weights(self, weights):
    params = self.weights
    # If the weights were generated by the Keras V1 optimizer, they include
    # vhats even without amsgrad, i.e., the V1 optimizer has 3x + 1 variables,
    # while the V2 optimizer has 2x + 1 variables. Filter vhats out for
    # compatibility.
    num_vars = int((len(params) - 1) / 2)
    if len(weights) == 3 * num_vars + 1:
      weights = weights[:len(params)]
    super(Adam, self).set_weights(weights)

  def _resource_apply_dense(self, grad, var, apply_state=None):
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))

    m = self.get_slot(var, 'm')
    v = self.get_slot(var, 'v')

    if not self.amsgrad:
      return gen_training_ops.ResourceApplyAdam(
          var=var.handle,
          m=m.handle,
          v=v.handle,
          beta1_power=coefficients['beta_1_power'],
          beta2_power=coefficients['beta_2_power'],
          lr=coefficients['lr_t'],
          beta1=coefficients['beta_1_t'],
          beta2=coefficients['beta_2_t'],
          epsilon=coefficients['epsilon'],
          grad=grad,
          use_locking=self._use_locking)
    else:
      vhat = self.get_slot(var, 'vhat')
      return gen_training_ops.ResourceApplyAdamWithAmsgrad(
          var=var.handle,
          m=m.handle,
          v=v.handle,
          vhat=vhat.handle,
          beta1_power=coefficients['beta_1_power'],
          beta2_power=coefficients['beta_2_power'],
          lr=coefficients['lr_t'],
          beta1=coefficients['beta_1_t'],
          beta2=coefficients['beta_2_t'],
          epsilon=coefficients['epsilon'],
          grad=grad,
          use_locking=self._use_locking)

  def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))

    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, 'm')
    m_scaled_g_values = grad * coefficients['one_minus_beta_1_t']
    m_t = state_ops.assign(m, m * coefficients['beta_1_t'],
                           use_locking=self._use_locking)
    with ops.control_dependencies([m_t]):
      m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, 'v')
    v_scaled_g_values = (grad * grad) * coefficients['one_minus_beta_2_t']
    v_t = state_ops.assign(v, v * coefficients['beta_2_t'],
                           use_locking=self._use_locking)
    with ops.control_dependencies([v_t]):
      v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)

    if not self.amsgrad:
      v_sqrt = math_ops.sqrt(v_t)
      var_update = state_ops.assign_sub(
          var, coefficients['lr'] * m_t / (v_sqrt + coefficients['epsilon']),
          use_locking=self._use_locking)
      return control_flow_ops.group(*[var_update, m_t, v_t])
    else:
      v_hat = self.get_slot(var, 'vhat')
      v_hat_t = math_ops.maximum(v_hat, v_t)
      with ops.control_dependencies([v_hat_t]):
        v_hat_t = state_ops.assign(
            v_hat, v_hat_t, use_locking=self._use_locking)
      v_hat_sqrt = math_ops.sqrt(v_hat_t)
      var_update = state_ops.assign_sub(
          var,
          coefficients['lr'] * m_t / (v_hat_sqrt + coefficients['epsilon']),
          use_locking=self._use_locking)
      return control_flow_ops.group(*[var_update, m_t, v_t, v_hat_t])

  def get_config(self):
    config = super(Adam, self).get_config()
    config.update({
        'learning_rate': self._serialize_hyperparameter('learning_rate'),
        'decay': self._initial_decay,
        'beta_1': self._serialize_hyperparameter('beta_1'),
        'beta_2': self._serialize_hyperparameter('beta_2'),
        'epsilon': self.epsilon,
        'amsgrad': self.amsgrad,
    })
    return config
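
  # For illustration only: `get_config` pairs with `from_config`, so an
  # equivalent optimizer can be re-created from the returned dictionary, e.g.
  #   opt = Adam(learning_rate=0.1)
  #   restored = Adam.from_config(opt.get_config())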


class NonFusedAdam(optimizer_v2.OptimizerV2):
  r"""Optimizer that implements the Adam algorithm without fused kernels.

  Adam optimization is a stochastic gradient descent method that is based on
  adaptive estimation of first-order and second-order moments.
  According to the paper
  [Adam: A Method for Stochastic Optimization. Kingma et al.,
  2014](http://arxiv.org/abs/1412.6980), the method is "*computationally
  efficient, has little memory requirement, invariant to diagonal rescaling of
  gradients, and is well suited for problems that are large in terms of
  data/parameters*".

  For AMSGrad see [On The Convergence Of Adam And Beyond.
  Reddi et al., 2018](https://openreview.net/pdf?id=ryQu7f-RZ).

  **If amsgrad = False**:

  initialize $m_0$ as 1st moment vector
  initialize $v_0$ as 2nd moment vector

  The update rule for $\theta$ with gradient $g$ uses an optimization
  described at the end of section 2 of the paper:

  $$lr_t = \mathrm{learning\_rate} *
    \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$
  $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
  $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
  $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$

  **If amsgrad = True**:

  initialize $m_0$ as 1st moment vector
  initialize $v_0$ as 2nd moment vector
  initialize $\hat{v}_0$ as 2nd moment vector

  The update rule for $\theta$ with gradient $g$ uses an optimization
  described at the end of section 2 of the paper:

  $$lr_t = \mathrm{learning\_rate} *
    \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$

  $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
  $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
  $$\hat{v}_t = \max(\hat{v}_{t-1}, v_t)$$
  $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{\hat{v}_t} + \epsilon)$$
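
  As an illustration, a single update step for the amsgrad = False case can be
  worked out in plain Python with zero-initialized moments and the
  illustrative values $g = 1$, $\theta_0 = 1$:

  >>> lr, beta_1, beta_2, epsilon = 0.001, 0.9, 0.999, 1e-7
  >>> g, theta, m, v = 1.0, 1.0, 0.0, 0.0
  >>> m = beta_1 * m + (1 - beta_1) * g
  >>> v = beta_2 * v + (1 - beta_2) * g * g
  >>> lr_t = lr * (1 - beta_2) ** 0.5 / (1 - beta_1)
  >>> round(theta - lr_t * m / (v ** 0.5 + epsilon), 3)
  0.999

  That is, the first step has magnitude close to `learning_rate`, matching the
  usage example below.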

  The default value of 1e-7 for epsilon might not be a good default in
  general. For example, when training an Inception network on ImageNet a
  current good choice is 1.0 or 0.1. Note that since Adam uses the
  formulation just before Section 2.1 of the Kingma and Ba paper rather than
  the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
  hat" in the paper.

  The sparse implementation of this algorithm (used when the gradient is an
  IndexedSlices object, typically because of `tf.gather` or an embedding
  lookup in the forward pass) does apply momentum to variable slices even if
  they were not used in the forward pass (meaning they have a gradient equal
  to zero). Momentum decay (beta1) is also applied to the entire momentum
  accumulator. This means that the sparse behavior is equivalent to the dense
  behavior (in contrast to some momentum implementations which ignore momentum
  unless a variable slice was actually used).

  Usage:

  >>> opt = tf.keras.optimizers.Adam(learning_rate=0.1)
  >>> var1 = tf.Variable(10.0)
  >>> loss = lambda: (var1 ** 2)/2.0  # d(loss)/d(var1) == var1
  >>> step_count = opt.minimize(loss, [var1]).numpy()
  >>> # The first step is `-learning_rate*sign(grad)`
  >>> var1.numpy()
  9.9
  """

  _HAS_AGGREGATE_GRAD = True

  def __init__(self,
               learning_rate=0.001,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=1e-7,
               amsgrad=False,
               name='Adam',
               **kwargs):
    """Construct a new Adam optimizer.

    Args:
      learning_rate: A `Tensor`, floating point value, or a schedule that is a
        `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable that
        takes no arguments and returns the actual value to use. The learning
        rate. Defaults to 0.001.
      beta_1: A float value or a constant float tensor, or a callable that takes
        no arguments and returns the actual value to use. The exponential decay
        rate for the 1st moment estimates. Defaults to 0.9.
      beta_2: A float value or a constant float tensor, or a callable that takes
        no arguments and returns the actual value to use. The exponential decay
        rate for the 2nd moment estimates. Defaults to 0.999.
      epsilon: A small constant for numerical stability. This epsilon is
        "epsilon hat" in the Kingma and Ba paper (in the formula just before
        Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
        1e-7.
      amsgrad: Boolean. Whether to apply the AMSGrad variant of this algorithm
        from the paper "On the Convergence of Adam and Beyond". Defaults to
        `False`.
      name: Optional name for the operations created when applying gradients.
        Defaults to "Adam".
      **kwargs: Keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
        `decay`}. `clipnorm` clips gradients by norm; `clipvalue` clips
        gradients by value; `decay` is included for backward compatibility to
        allow time-inverse decay of the learning rate; `lr` is included for
        backward compatibility, but using `learning_rate` is recommended
        instead.
    """

    super(NonFusedAdam, self).__init__(name, **kwargs)
    self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
    self._set_hyper('decay', self._initial_decay)
    self._set_hyper('beta_1', beta_1)
    self._set_hyper('beta_2', beta_2)
    self.epsilon = epsilon or backend_config.epsilon()
    self.amsgrad = amsgrad

  def _create_slots(self, var_list):
    # Create slots for the first and second moments.
    # Separate for-loops to respect the ordering of slot variables from v1.
    for var in var_list:
      self.add_slot(var, 'm')
    for var in var_list:
      self.add_slot(var, 'v')
    if self.amsgrad:
      for var in var_list:
        self.add_slot(var, 'vhat')

  def _prepare_local(self, var_device, var_dtype, apply_state):
    super(NonFusedAdam, self)._prepare_local(var_device, var_dtype, apply_state)

    local_step = math_ops.cast(self.iterations + 1, var_dtype)
    beta_1_t = array_ops.identity(self._get_hyper('beta_1', var_dtype))
    beta_2_t = array_ops.identity(self._get_hyper('beta_2', var_dtype))
    beta_1_power = math_ops.pow(beta_1_t, local_step)
    beta_2_power = math_ops.pow(beta_2_t, local_step)
    lr = (
        apply_state[(var_device, var_dtype)]['lr_t'] *
        (math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power)))
    apply_state[(var_device, var_dtype)].update(
        dict(
            lr=lr,
            epsilon=tensor_conversion.convert_to_tensor_v2_with_dispatch(
                self.epsilon, var_dtype
            ),
            beta_1_t=beta_1_t,
            beta_1_power=beta_1_power,
            one_minus_beta_1_t=1 - beta_1_t,
            beta_2_t=beta_2_t,
            beta_2_power=beta_2_power,
            one_minus_beta_2_t=1 - beta_2_t,
        )
    )

414 def set_weights(self, weights): 

415 params = self.weights 

416 # If the weights are generated by Keras V1 optimizer, it includes vhats 

417 # even without amsgrad, i.e, V1 optimizer has 3x + 1 variables, while V2 

418 # optimizer has 2x + 1 variables. Filter vhats out for compatibility. 

419 num_vars = int((len(params) - 1) / 2) 

420 if len(weights) == 3 * num_vars + 1: 

421 weights = weights[:len(params)] 

422 super(NonFusedAdam, self).set_weights(weights) 

423 

424 @def_function.function(jit_compile=True) 

425 def _resource_apply_dense(self, grad, var, apply_state=None): 

426 var_device, var_dtype = var.device, var.dtype.base_dtype 

427 coefficients = ((apply_state or {}).get((var_device, var_dtype)) or 

428 self._fallback_apply_state(var_device, var_dtype)) 

429 

430 m = self.get_slot(var, 'm') 

431 v = self.get_slot(var, 'v') 

432 

433 alpha = ( 

434 coefficients['lr_t'] * math_ops.sqrt(1 - coefficients['beta_2_power']) / 

435 (1 - coefficients['beta_1_power'])) 

436 m.assign_add((grad - m) * (1 - coefficients['beta_1_t'])) 

437 v.assign_add((math_ops.square(grad) - v) * (1 - coefficients['beta_2_t'])) 

438 if self.amsgrad: 

439 vhat = self.get_slot(var, 'vhat') 

440 vhat.assign(math_ops.maximum(vhat, v)) 

441 v = vhat 

442 var.assign_sub( 

443 (m * alpha) / (math_ops.sqrt(v) - coefficients['epsilon'])) 

444 

445 @def_function.function(jit_compile=True) 

446 def _resource_apply_sparse(self, grad, var, indices, apply_state=None): 

447 var_device, var_dtype = var.device, var.dtype.base_dtype 

448 coefficients = ((apply_state or {}).get((var_device, var_dtype)) or 

449 self._fallback_apply_state(var_device, var_dtype)) 

450 

451 # m_t = beta1 * m + (1 - beta1) * g_t 

452 m = self.get_slot(var, 'm') 

453 m_scaled_g_values = grad * coefficients['one_minus_beta_1_t'] 

454 m.assign(m * coefficients['beta_1_t']) 

455 m.scatter_add(indexed_slices.IndexedSlices(m_scaled_g_values, indices)) 

456 

457 # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) 

458 v = self.get_slot(var, 'v') 

459 v_scaled_g_values = (grad * grad) * coefficients['one_minus_beta_2_t'] 

460 v.assign(v * coefficients['beta_2_t']) 

461 v.scatter_add(indexed_slices.IndexedSlices(v_scaled_g_values, indices)) 

462 

463 if not self.amsgrad: 

464 var.assign_sub(coefficients['lr'] * m / 

465 (math_ops.sqrt(v) + coefficients['epsilon'])) 

466 else: 

467 v_hat = self.get_slot(var, 'vhat') 

468 v_hat.assign(math_ops.maximum(v_hat, v)) 

469 var.assign_sub(coefficients['lr'] * m / 

470 (math_ops.sqrt(v_hat) + coefficients['epsilon'])) 

471 

472 def get_config(self): 

473 config = super(NonFusedAdam, self).get_config() 

474 config.update({ 

475 'learning_rate': self._serialize_hyperparameter('learning_rate'), 

476 'decay': self._initial_decay, 

477 'beta_1': self._serialize_hyperparameter('beta_1'), 

478 'beta_2': self._serialize_hyperparameter('beta_2'), 

479 'epsilon': self.epsilon, 

480 'amsgrad': self.amsgrad, 

481 }) 

482 return config