# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Adam optimizer implementation."""

import tensorflow.compat.v2 as tf

from keras.src import backend_config
from keras.src.optimizers.legacy import optimizer_v2

# isort: off
from tensorflow.python.util.tf_export import keras_export


@keras_export(
    "keras.optimizers.legacy.Adam",
    v1=["keras.optimizers.Adam", "keras.optimizers.legacy.Adam"],
)
class Adam(optimizer_v2.OptimizerV2):
    r"""Optimizer that implements the Adam algorithm.

    Adam optimization is a stochastic gradient descent method that is based on
    adaptive estimation of first-order and second-order moments.

    According to
    [Kingma et al., 2014](http://arxiv.org/abs/1412.6980),
    the method is "*computationally
    efficient, has little memory requirement, invariant to diagonal rescaling of
    gradients, and is well suited for problems that are large in terms of
    data/parameters*".

    Args:
      learning_rate: A `Tensor`, floating point value, or a schedule that is a
        `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
        that takes no arguments and returns the actual value to use. The
        learning rate. Defaults to `0.001`.
      beta_1: A float value or a constant float tensor, or a callable
        that takes no arguments and returns the actual value to use. The
        exponential decay rate for the 1st moment estimates. Defaults to `0.9`.
      beta_2: A float value or a constant float tensor, or a callable
        that takes no arguments and returns the actual value to use. The
        exponential decay rate for the 2nd moment estimates. Defaults to
        `0.999`.
      epsilon: A small constant for numerical stability. This epsilon is
        "epsilon hat" in the Kingma and Ba paper (in the formula just before
        Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
        `1e-7`.
      amsgrad: Boolean. Whether to apply the AMSGrad variant of this algorithm
        from the paper "On the Convergence of Adam and Beyond". Defaults to
        `False`.
      name: Optional name for the operations created when applying gradients.
        Defaults to `"Adam"`.
      **kwargs: keyword arguments. Allowed arguments are `clipvalue`,
        `clipnorm`, `global_clipnorm`.
        If `clipvalue` (float) is set, the gradient of each weight
        is clipped to be no higher than this value.
        If `clipnorm` (float) is set, the gradient of each weight
        is individually clipped so that its norm is no higher than this value.
        If `global_clipnorm` (float) is set, the gradient of all weights is
        clipped so that their global norm is no higher than this value.

    Usage:

    >>> opt = tf.keras.optimizers.legacy.Adam(learning_rate=0.1)
    >>> var1 = tf.Variable(10.0)
    >>> loss = lambda: (var1 ** 2)/2.0  # d(loss)/d(var1) == var1
    >>> step_count = opt.minimize(loss, [var1]).numpy()
    >>> # The first step is `-learning_rate*sign(grad)`
    >>> var1.numpy()
    9.9
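
    As a rough check of that first step (a sketch, treating the default
    `epsilon` as negligible): with zero initial moments, the first gradient
    `g` gives `m_1 = (1 - beta_1) * g` and `v_1 = (1 - beta_2) * g ** 2`,
    while the bias-corrected step size is
    `lr_1 = learning_rate * sqrt(1 - beta_2) / (1 - beta_1)`. The applied
    update `lr_1 * m_1 / sqrt(v_1)` therefore reduces to approximately
    learning_rate * sign(g), which is 0.1 for the example above.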

    Reference:
      - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
      - [Reddi et al., 2018](
          https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`.

    Notes:

    The default value of 1e-7 for epsilon might not be a good default in
    general. For example, when training an Inception network on ImageNet, a
    current good choice is 1.0 or 0.1. Note that since Adam uses the
    formulation just before Section 2.1 of the Kingma and Ba paper rather than
    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
    hat" in the paper.

    The sparse implementation of this algorithm (used when the gradient is an
    IndexedSlices object, typically because of `tf.gather` or an embedding
    lookup in the forward pass) does apply momentum to variable slices even if
    they were not used in the forward pass (meaning they have a gradient equal
    to zero). Momentum decay (beta1) is also applied to the entire momentum
    accumulator. This means that the sparse behavior is equivalent to the
    dense behavior (in contrast to some momentum implementations which ignore
    momentum unless a variable slice was actually used).
    """

    _HAS_AGGREGATE_GRAD = True

    def __init__(
        self,
        learning_rate=0.001,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-7,
        amsgrad=False,
        name="Adam",
        **kwargs
    ):
        super().__init__(name, **kwargs)
        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
        self._set_hyper("decay", self._initial_decay)
        self._set_hyper("beta_1", beta_1)
        self._set_hyper("beta_2", beta_2)
        self.epsilon = epsilon or backend_config.epsilon()
        self.amsgrad = amsgrad
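        # Note: `lr` is accepted above as a legacy alias for `learning_rate`,
        # and passing `epsilon=None` falls back to `backend_config.epsilon()`.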

    def _create_slots(self, var_list):
        # Create slots for the first and second moments.
        # Separate for-loops to respect the ordering of slot variables from v1.
        for var in var_list:
            self.add_slot(var, "m")
        for var in var_list:
            self.add_slot(var, "v")
        if self.amsgrad:
            for var in var_list:
                self.add_slot(var, "vhat")

    def _prepare_local(self, var_device, var_dtype, apply_state):
        super()._prepare_local(var_device, var_dtype, apply_state)

        local_step = tf.cast(self.iterations + 1, var_dtype)
        beta_1_t = tf.identity(self._get_hyper("beta_1", var_dtype))
        beta_2_t = tf.identity(self._get_hyper("beta_2", var_dtype))
        beta_1_power = tf.pow(beta_1_t, local_step)
        beta_2_power = tf.pow(beta_2_t, local_step)
        lr = apply_state[(var_device, var_dtype)]["lr_t"] * (
            tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)
        )
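        # `lr` above folds the bias correction sqrt(1 - beta_2^t) /
        # (1 - beta_1^t) into the base learning rate; together with adding
        # epsilon outside the square root, this is the "epsilon hat"
        # formulation from just before Section 2.1 of Kingma and Ba (2014).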

        apply_state[(var_device, var_dtype)].update(
            dict(
                lr=lr,
                epsilon=tf.convert_to_tensor(self.epsilon, var_dtype),
                beta_1_t=beta_1_t,
                beta_1_power=beta_1_power,
                one_minus_beta_1_t=1 - beta_1_t,
                beta_2_t=beta_2_t,
                beta_2_power=beta_2_power,
                one_minus_beta_2_t=1 - beta_2_t,
            )
        )

    def set_weights(self, weights):
        params = self.weights
        # If the weights are generated by Keras V1 optimizer, it includes vhats
        # even without amsgrad, i.e., V1 optimizer has 3x + 1 variables, while
        # V2 optimizer has 2x + 1 variables. Filter vhats out for
        # compatibility.
        num_vars = int((len(params) - 1) / 2)
        if len(weights) == 3 * num_vars + 1:
            weights = weights[: len(params)]
        super().set_weights(weights)

    def _resource_apply_dense(self, grad, var, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)

        m = self.get_slot(var, "m")
        v = self.get_slot(var, "v")

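        # The dense update is delegated to a single fused kernel from
        # tf.raw_ops, which applies the m, v (and vhat, if amsgrad) and
        # variable updates in one op.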

        if not self.amsgrad:
            return tf.raw_ops.ResourceApplyAdam(
                var=var.handle,
                m=m.handle,
                v=v.handle,
                beta1_power=coefficients["beta_1_power"],
                beta2_power=coefficients["beta_2_power"],
                lr=coefficients["lr_t"],
                beta1=coefficients["beta_1_t"],
                beta2=coefficients["beta_2_t"],
                epsilon=coefficients["epsilon"],
                grad=grad,
                use_locking=self._use_locking,
            )
        else:
            vhat = self.get_slot(var, "vhat")
            return tf.raw_ops.ResourceApplyAdamWithAmsgrad(
                var=var.handle,
                m=m.handle,
                v=v.handle,
                vhat=vhat.handle,
                beta1_power=coefficients["beta_1_power"],
                beta2_power=coefficients["beta_2_power"],
                lr=coefficients["lr_t"],
                beta1=coefficients["beta_1_t"],
                beta2=coefficients["beta_2_t"],
                epsilon=coefficients["epsilon"],
                grad=grad,
                use_locking=self._use_locking,
            )

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)

        # m_t = beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, "m")
        m_scaled_g_values = grad * coefficients["one_minus_beta_1_t"]
        m_t = tf.compat.v1.assign(
            m, m * coefficients["beta_1_t"], use_locking=self._use_locking
        )
        with tf.control_dependencies([m_t]):
            m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)

        # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v = self.get_slot(var, "v")
        v_scaled_g_values = (grad * grad) * coefficients["one_minus_beta_2_t"]
        v_t = tf.compat.v1.assign(
            v, v * coefficients["beta_2_t"], use_locking=self._use_locking
        )
        with tf.control_dependencies([v_t]):
            v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)

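        # Parameter update: var <- var - lr * m_t / (sqrt(v_t) + epsilon).
        # With amsgrad=True, v_t is replaced by vhat_t, the element-wise
        # running maximum of all v_t seen so far (Reddi et al., 2018).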

        if not self.amsgrad:
            v_sqrt = tf.sqrt(v_t)
            var_update = tf.compat.v1.assign_sub(
                var,
                coefficients["lr"] * m_t / (v_sqrt + coefficients["epsilon"]),
                use_locking=self._use_locking,
            )
            return tf.group(*[var_update, m_t, v_t])
        else:
            v_hat = self.get_slot(var, "vhat")
            v_hat_t = tf.maximum(v_hat, v_t)
            with tf.control_dependencies([v_hat_t]):
                v_hat_t = tf.compat.v1.assign(
                    v_hat, v_hat_t, use_locking=self._use_locking
                )
            v_hat_sqrt = tf.sqrt(v_hat_t)
            var_update = tf.compat.v1.assign_sub(
                var,
                coefficients["lr"]
                * m_t
                / (v_hat_sqrt + coefficients["epsilon"]),
                use_locking=self._use_locking,
            )
            return tf.group(*[var_update, m_t, v_t, v_hat_t])

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "learning_rate": self._serialize_hyperparameter(
                    "learning_rate"
                ),
                "decay": self._initial_decay,
                "beta_1": self._serialize_hyperparameter("beta_1"),
                "beta_2": self._serialize_hyperparameter("beta_2"),
                "epsilon": self.epsilon,
                "amsgrad": self.amsgrad,
            }
        )
        return config


class NonFusedAdam(optimizer_v2.OptimizerV2):
    r"""Optimizer that implements the Adam algorithm without fused kernels.

    Adam optimization is a stochastic gradient descent method that is based on
    adaptive estimation of first-order and second-order moments.
    According to the paper
    [Adam: A Method for Stochastic Optimization. Kingma et al.,
    2014](http://arxiv.org/abs/1412.6980), the method is "*computationally
    efficient, has little memory requirement, invariant to diagonal rescaling of
    gradients, and is well suited for problems that are large in terms of
    data/parameters*".

    For AMSGrad see [On the Convergence of Adam and Beyond.
    Reddi et al., 2018](https://openreview.net/pdf?id=ryQu7f-RZ).

    **If amsgrad = False**:

    initialize $m_0$ as 1st moment vector
    initialize $v_0$ as 2nd moment vector

    The update rule for $\theta$ with gradient $g$ uses an optimization
    described at the end of section 2 of the paper:

    $$lr_t = \mathrm{learning\_rate} *
    \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$
    $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
    $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
    $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$

    **If amsgrad = True**:

    initialize $m_0$ as 1st moment vector
    initialize $v_0$ as 2nd moment vector
    initialize $\hat{v}_0$ as 2nd moment vector

    The update rule for $\theta$ with gradient $g$ uses an optimization
    described at the end of section 2 of the paper:

    $$lr_t = \mathrm{learning\_rate} *
    \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$

    $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
    $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
    $$\hat{v}_t = \max(\hat{v}_{t-1}, v_t)$$
    $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{\hat{v}_t} + \epsilon)$$

    The default value of 1e-7 for epsilon might not be a good default in
    general. For example, when training an Inception network on ImageNet, a
    current good choice is 1.0 or 0.1. Note that since Adam uses the
    formulation just before Section 2.1 of the Kingma and Ba paper rather than
    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
    hat" in the paper.

    The sparse implementation of this algorithm (used when the gradient is an
    IndexedSlices object, typically because of `tf.gather` or an embedding
    lookup in the forward pass) does apply momentum to variable slices even if
    they were not used in the forward pass (meaning they have a gradient equal
    to zero). Momentum decay (beta1) is also applied to the entire momentum
    accumulator. This means that the sparse behavior is equivalent to the
    dense behavior (in contrast to some momentum implementations which ignore
    momentum unless a variable slice was actually used).

    Usage:

    >>> opt = tf.keras.optimizers.legacy.Adam(learning_rate=0.1)
    >>> var1 = tf.Variable(10.0)
    >>> loss = lambda: (var1 ** 2)/2.0  # d(loss)/d(var1) == var1
    >>> step_count = opt.minimize(loss, [var1]).numpy()
    >>> # The first step is `-learning_rate*sign(grad)`
    >>> var1.numpy()
    9.9
    """

    _HAS_AGGREGATE_GRAD = True

    def __init__(
        self,
        learning_rate=0.001,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-7,
        amsgrad=False,
        name="Adam",
        **kwargs
    ):
        """Construct a new Adam optimizer.

        Args:
          learning_rate: A `Tensor`, floating point value, or a schedule that
            is a `tf.keras.optimizers.schedules.LearningRateSchedule`, or a
            callable that takes no arguments and returns the actual value to
            use. The learning rate. Defaults to `0.001`.
          beta_1: A float value or a constant float tensor, or a callable that
            takes no arguments and returns the actual value to use. The
            exponential decay rate for the 1st moment estimates. Defaults to
            `0.9`.
          beta_2: A float value or a constant float tensor, or a callable that
            takes no arguments and returns the actual value to use. The
            exponential decay rate for the 2nd moment estimates. Defaults to
            `0.999`.
          epsilon: A small constant for numerical stability. This epsilon is
            "epsilon hat" in the Kingma and Ba paper (in the formula just
            before Section 2.1), not the epsilon in Algorithm 1 of the paper.
            Defaults to `1e-7`.
          amsgrad: Boolean. Whether to apply the AMSGrad variant of this
            algorithm from the paper "On the Convergence of Adam and Beyond".
            Defaults to `False`.
          name: Optional name for the operations created when applying
            gradients. Defaults to "Adam".
          **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`,
            `lr`, `decay`}. `clipnorm` clips gradients by norm; `clipvalue`
            clips gradients by value; `decay` is included for backward
            compatibility to allow time-inverse decay of the learning rate;
            `lr` is included for backward compatibility, and it is recommended
            to use `learning_rate` instead.
        """

        super().__init__(name, **kwargs)
        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
        self._set_hyper("decay", self._initial_decay)
        self._set_hyper("beta_1", beta_1)
        self._set_hyper("beta_2", beta_2)
        self.epsilon = epsilon or backend_config.epsilon()
        self.amsgrad = amsgrad

    def _create_slots(self, var_list):
        # Create slots for the first and second moments.
        # Separate for-loops to respect the ordering of slot variables from v1.
        for var in var_list:
            self.add_slot(var, "m")
        for var in var_list:
            self.add_slot(var, "v")
        if self.amsgrad:
            for var in var_list:
                self.add_slot(var, "vhat")

    def _prepare_local(self, var_device, var_dtype, apply_state):
        super()._prepare_local(var_device, var_dtype, apply_state)

        local_step = tf.cast(self.iterations + 1, var_dtype)
        beta_1_t = tf.identity(self._get_hyper("beta_1", var_dtype))
        beta_2_t = tf.identity(self._get_hyper("beta_2", var_dtype))
        beta_1_power = tf.pow(beta_1_t, local_step)
        beta_2_power = tf.pow(beta_2_t, local_step)
        lr = apply_state[(var_device, var_dtype)]["lr_t"] * (
            tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)
        )
        apply_state[(var_device, var_dtype)].update(
            dict(
                lr=lr,
                epsilon=tf.convert_to_tensor(self.epsilon, var_dtype),
                beta_1_t=beta_1_t,
                beta_1_power=beta_1_power,
                one_minus_beta_1_t=1 - beta_1_t,
                beta_2_t=beta_2_t,
                beta_2_power=beta_2_power,
                one_minus_beta_2_t=1 - beta_2_t,
            )
        )

    def set_weights(self, weights):
        params = self.weights
        # If the weights are generated by Keras V1 optimizer, it includes vhats
        # even without amsgrad, i.e., V1 optimizer has 3x + 1 variables, while
        # V2 optimizer has 2x + 1 variables. Filter vhats out for
        # compatibility.
        num_vars = int((len(params) - 1) / 2)
        if len(weights) == 3 * num_vars + 1:
            weights = weights[: len(params)]
        super().set_weights(weights)

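    # Unlike `Adam` above, this variant expresses the update with ordinary
    # TF ops instead of the fused Adam kernels; `jit_compile=True` compiles
    # the apply methods with XLA so the element-wise ops can still be fused.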

    @tf.function(jit_compile=True)
    def _resource_apply_dense(self, grad, var, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)

        m = self.get_slot(var, "m")
        v = self.get_slot(var, "v")

        alpha = (
            coefficients["lr_t"]
            * tf.sqrt(1 - coefficients["beta_2_power"])
            / (1 - coefficients["beta_1_power"])
        )
        m.assign_add((grad - m) * (1 - coefficients["beta_1_t"]))
        v.assign_add((tf.square(grad) - v) * (1 - coefficients["beta_2_t"]))
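        # Written as deltas, the two updates above are the usual EMA updates:
        # m <- beta_1 * m + (1 - beta_1) * g and
        # v <- beta_2 * v + (1 - beta_2) * g^2.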

        if self.amsgrad:
            vhat = self.get_slot(var, "vhat")
            vhat.assign(tf.maximum(vhat, v))
            v = vhat
        var.assign_sub((m * alpha) / (tf.sqrt(v) + coefficients["epsilon"]))

    @tf.function(jit_compile=True)
    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)

        # m_t = beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, "m")
        m_scaled_g_values = grad * coefficients["one_minus_beta_1_t"]
        m.assign(m * coefficients["beta_1_t"])
        m.scatter_add(tf.IndexedSlices(m_scaled_g_values, indices))
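        # The two-step pattern above (decay with `assign`, then `scatter_add`
        # of the scaled gradient rows at `indices`) means rows that received
        # no gradient are still decayed by beta_1, matching the dense behavior
        # described in the class docstring.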

        # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v = self.get_slot(var, "v")
        v_scaled_g_values = (grad * grad) * coefficients["one_minus_beta_2_t"]
        v.assign(v * coefficients["beta_2_t"])
        v.scatter_add(tf.IndexedSlices(v_scaled_g_values, indices))

        if not self.amsgrad:
            var.assign_sub(
                coefficients["lr"] * m / (tf.sqrt(v) + coefficients["epsilon"])
            )
        else:
            v_hat = self.get_slot(var, "vhat")
            v_hat.assign(tf.maximum(v_hat, v))
            var.assign_sub(
                coefficients["lr"]
                * m
                / (tf.sqrt(v_hat) + coefficients["epsilon"])
            )

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "learning_rate": self._serialize_hyperparameter(
                    "learning_rate"
                ),
                "decay": self._initial_decay,
                "beta_1": self._serialize_hyperparameter("beta_1"),
                "beta_2": self._serialize_hyperparameter("beta_2"),
                "epsilon": self.epsilon,
                "amsgrad": self.amsgrad,
            }
        )
        return config