# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Adam optimizer implementation."""

import tensorflow.compat.v2 as tf

from keras.src import backend_config
from keras.src.optimizers.legacy import optimizer_v2

# isort: off
from tensorflow.python.util.tf_export import keras_export


@keras_export(
    "keras.optimizers.legacy.Adam",
    v1=["keras.optimizers.Adam", "keras.optimizers.legacy.Adam"],
)
class Adam(optimizer_v2.OptimizerV2):
    r"""Optimizer that implements the Adam algorithm.

    Adam optimization is a stochastic gradient descent method that is based on
    adaptive estimation of first-order and second-order moments.

    According to
    [Kingma et al., 2014](http://arxiv.org/abs/1412.6980),
    the method is "*computationally
    efficient, has little memory requirement, invariant to diagonal rescaling of
    gradients, and is well suited for problems that are large in terms of
    data/parameters*".

    Args:
      learning_rate: A `Tensor`, floating point value, or a schedule that is a
        `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
        that takes no arguments and returns the actual value to use. The
        learning rate. Defaults to `0.001`.
      beta_1: A float value or a constant float tensor, or a callable
        that takes no arguments and returns the actual value to use. The
        exponential decay rate for the 1st moment estimates. Defaults to `0.9`.
      beta_2: A float value or a constant float tensor, or a callable
        that takes no arguments and returns the actual value to use. The
        exponential decay rate for the 2nd moment estimates. Defaults to
        `0.999`.
      epsilon: A small constant for numerical stability. This epsilon is
        "epsilon hat" in the Kingma and Ba paper (in the formula just before
        Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
        `1e-7`.
      amsgrad: Boolean. Whether to apply the AMSGrad variant of this algorithm
        from the paper "On the Convergence of Adam and Beyond". Defaults to
        `False`.
      name: Optional name for the operations created when applying gradients.
        Defaults to `"Adam"`.
      **kwargs: keyword arguments. Allowed arguments are `clipvalue`,
        `clipnorm`, `global_clipnorm`.
        If `clipvalue` (float) is set, the gradient of each weight
        is clipped to be no higher than this value.
        If `clipnorm` (float) is set, the gradient of each weight
        is individually clipped so that its norm is no higher than this value.
        If `global_clipnorm` (float) is set, the gradient of all weights is
        clipped so that their global norm is no higher than this value.

    Usage:

    >>> opt = tf.keras.optimizers.legacy.Adam(learning_rate=0.1)
    >>> var1 = tf.Variable(10.0)
    >>> loss = lambda: (var1 ** 2)/2.0  # d(loss)/d(var1) == var1
    >>> step_count = opt.minimize(loss, [var1]).numpy()
    >>> # The first step is `-learning_rate*sign(grad)`
    >>> var1.numpy()
    9.9
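
    As a rough check of that first step (a sketch, treating the default
    `epsilon` as negligible): with zero initial moments, the first gradient
    `g` gives `m_1 = (1 - beta_1) * g` and `v_1 = (1 - beta_2) * g ** 2`,
    while the bias-corrected step size is
    `lr_1 = learning_rate * sqrt(1 - beta_2) / (1 - beta_1)`. The applied
    update `lr_1 * m_1 / sqrt(v_1)` therefore reduces to approximately
    learning_rate * sign(g), which is 0.1 for the example above.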

    Reference:
      - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
      - [Reddi et al., 2018](
          https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`.

    Notes:

    The default value of 1e-7 for epsilon might not be a good default in
    general. For example, when training an Inception network on ImageNet, a
    current good choice is 1.0 or 0.1. Note that since Adam uses the
    formulation just before Section 2.1 of the Kingma and Ba paper rather than
    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
    hat" in the paper.

    The sparse implementation of this algorithm (used when the gradient is an
    IndexedSlices object, typically because of `tf.gather` or an embedding
    lookup in the forward pass) does apply momentum to variable slices even if
    they were not used in the forward pass (meaning they have a gradient equal
    to zero). Momentum decay (beta1) is also applied to the entire momentum
    accumulator. This means that the sparse behavior is equivalent to the
    dense behavior (in contrast to some momentum implementations which ignore
    momentum unless a variable slice was actually used).
    """

    _HAS_AGGREGATE_GRAD = True

    def __init__(
        self,
        learning_rate=0.001,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-7,
        amsgrad=False,
        name="Adam",
        **kwargs
    ):
        super().__init__(name, **kwargs)
        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
        self._set_hyper("decay", self._initial_decay)
        self._set_hyper("beta_1", beta_1)
        self._set_hyper("beta_2", beta_2)
        self.epsilon = epsilon or backend_config.epsilon()
        self.amsgrad = amsgrad
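        # Note: `lr` is accepted above as a legacy alias for `learning_rate`,
        # and passing `epsilon=None` falls back to `backend_config.epsilon()`.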

    def _create_slots(self, var_list):
        # Create slots for the first and second moments.
        # Separate for-loops to respect the ordering of slot variables from v1.
        for var in var_list:
            self.add_slot(var, "m")
        for var in var_list:
            self.add_slot(var, "v")
        if self.amsgrad:
            for var in var_list:
                self.add_slot(var, "vhat")

    def _prepare_local(self, var_device, var_dtype, apply_state):
        super()._prepare_local(var_device, var_dtype, apply_state)

        local_step = tf.cast(self.iterations + 1, var_dtype)
        beta_1_t = tf.identity(self._get_hyper("beta_1", var_dtype))
        beta_2_t = tf.identity(self._get_hyper("beta_2", var_dtype))
        beta_1_power = tf.pow(beta_1_t, local_step)
        beta_2_power = tf.pow(beta_2_t, local_step)
        lr = apply_state[(var_device, var_dtype)]["lr_t"] * (
            tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)
        )
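        # `lr` above folds the bias correction sqrt(1 - beta_2^t) /
        # (1 - beta_1^t) into the base learning rate; together with adding
        # epsilon outside the square root, this is the "epsilon hat"
        # formulation from just before Section 2.1 of Kingma and Ba (2014).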

        apply_state[(var_device, var_dtype)].update(
            dict(
                lr=lr,
                epsilon=tf.convert_to_tensor(self.epsilon, var_dtype),
                beta_1_t=beta_1_t,
                beta_1_power=beta_1_power,
                one_minus_beta_1_t=1 - beta_1_t,
                beta_2_t=beta_2_t,
                beta_2_power=beta_2_power,
                one_minus_beta_2_t=1 - beta_2_t,
            )
        )

    def set_weights(self, weights):
        params = self.weights
        # If the weights are generated by Keras V1 optimizer, it includes vhats
        # even without amsgrad, i.e., V1 optimizer has 3x + 1 variables, while
        # V2 optimizer has 2x + 1 variables. Filter vhats out for
        # compatibility.
        num_vars = int((len(params) - 1) / 2)
        if len(weights) == 3 * num_vars + 1:
            weights = weights[: len(params)]
        super().set_weights(weights)

    def _resource_apply_dense(self, grad, var, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)

        m = self.get_slot(var, "m")
        v = self.get_slot(var, "v")

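        # The dense update is delegated to a single fused kernel from
        # tf.raw_ops, which applies the m, v (and vhat, if amsgrad) and
        # variable updates in one op.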

        if not self.amsgrad:
            return tf.raw_ops.ResourceApplyAdam(
                var=var.handle,
                m=m.handle,
                v=v.handle,
                beta1_power=coefficients["beta_1_power"],
                beta2_power=coefficients["beta_2_power"],
                lr=coefficients["lr_t"],
                beta1=coefficients["beta_1_t"],
                beta2=coefficients["beta_2_t"],
                epsilon=coefficients["epsilon"],
                grad=grad,
                use_locking=self._use_locking,
            )
        else:
            vhat = self.get_slot(var, "vhat")
            return tf.raw_ops.ResourceApplyAdamWithAmsgrad(
                var=var.handle,
                m=m.handle,
                v=v.handle,
                vhat=vhat.handle,
                beta1_power=coefficients["beta_1_power"],
                beta2_power=coefficients["beta_2_power"],
                lr=coefficients["lr_t"],
                beta1=coefficients["beta_1_t"],
                beta2=coefficients["beta_2_t"],
                epsilon=coefficients["epsilon"],
                grad=grad,
                use_locking=self._use_locking,
            )

    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)

        # m_t = beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, "m")
        m_scaled_g_values = grad * coefficients["one_minus_beta_1_t"]
        m_t = tf.compat.v1.assign(
            m, m * coefficients["beta_1_t"], use_locking=self._use_locking
        )
        with tf.control_dependencies([m_t]):
            m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)

        # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v = self.get_slot(var, "v")
        v_scaled_g_values = (grad * grad) * coefficients["one_minus_beta_2_t"]
        v_t = tf.compat.v1.assign(
            v, v * coefficients["beta_2_t"], use_locking=self._use_locking
        )
        with tf.control_dependencies([v_t]):
            v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)

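        # Parameter update: var <- var - lr * m_t / (sqrt(v_t) + epsilon).
        # With amsgrad=True, v_t is replaced by vhat_t, the element-wise
        # running maximum of all v_t seen so far (Reddi et al., 2018).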

        if not self.amsgrad:
            v_sqrt = tf.sqrt(v_t)
            var_update = tf.compat.v1.assign_sub(
                var,
                coefficients["lr"] * m_t / (v_sqrt + coefficients["epsilon"]),
                use_locking=self._use_locking,
            )
            return tf.group(*[var_update, m_t, v_t])
        else:
            v_hat = self.get_slot(var, "vhat")
            v_hat_t = tf.maximum(v_hat, v_t)
            with tf.control_dependencies([v_hat_t]):
                v_hat_t = tf.compat.v1.assign(
                    v_hat, v_hat_t, use_locking=self._use_locking
                )
            v_hat_sqrt = tf.sqrt(v_hat_t)
            var_update = tf.compat.v1.assign_sub(
                var,
                coefficients["lr"]
                * m_t
                / (v_hat_sqrt + coefficients["epsilon"]),
                use_locking=self._use_locking,
            )
            return tf.group(*[var_update, m_t, v_t, v_hat_t])

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "learning_rate": self._serialize_hyperparameter(
                    "learning_rate"
                ),
                "decay": self._initial_decay,
                "beta_1": self._serialize_hyperparameter("beta_1"),
                "beta_2": self._serialize_hyperparameter("beta_2"),
                "epsilon": self.epsilon,
                "amsgrad": self.amsgrad,
            }
        )
        return config


class NonFusedAdam(optimizer_v2.OptimizerV2):
    r"""Optimizer that implements the Adam algorithm without fused kernels.

    Adam optimization is a stochastic gradient descent method that is based on
    adaptive estimation of first-order and second-order moments.
    According to the paper
    [Adam: A Method for Stochastic Optimization. Kingma et al.,
    2014](http://arxiv.org/abs/1412.6980), the method is "*computationally
    efficient, has little memory requirement, invariant to diagonal rescaling of
    gradients, and is well suited for problems that are large in terms of
    data/parameters*".

    For AMSGrad see [On the Convergence of Adam and Beyond.
    Reddi et al., 2018](https://openreview.net/pdf?id=ryQu7f-RZ).

    **If amsgrad = False**:

    initialize $m_0$ as 1st moment vector
    initialize $v_0$ as 2nd moment vector

    The update rule for $\theta$ with gradient $g$ uses an optimization
    described at the end of section 2 of the paper:

    $$lr_t = \mathrm{learning\_rate} *
    \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$
    $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
    $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
    $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$

    **If amsgrad = True**:

    initialize $m_0$ as 1st moment vector
    initialize $v_0$ as 2nd moment vector
    initialize $\hat{v}_0$ as 2nd moment vector

    The update rule for $\theta$ with gradient $g$ uses an optimization
    described at the end of section 2 of the paper:

    $$lr_t = \mathrm{learning\_rate} *
    \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$

    $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
    $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
    $$\hat{v}_t = \max(\hat{v}_{t-1}, v_t)$$
    $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{\hat{v}_t} + \epsilon)$$

    The default value of 1e-7 for epsilon might not be a good default in
    general. For example, when training an Inception network on ImageNet, a
    current good choice is 1.0 or 0.1. Note that since Adam uses the
    formulation just before Section 2.1 of the Kingma and Ba paper rather than
    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
    hat" in the paper.

    The sparse implementation of this algorithm (used when the gradient is an
    IndexedSlices object, typically because of `tf.gather` or an embedding
    lookup in the forward pass) does apply momentum to variable slices even if
    they were not used in the forward pass (meaning they have a gradient equal
    to zero). Momentum decay (beta1) is also applied to the entire momentum
    accumulator. This means that the sparse behavior is equivalent to the
    dense behavior (in contrast to some momentum implementations which ignore
    momentum unless a variable slice was actually used).

    Usage:

    >>> opt = tf.keras.optimizers.legacy.Adam(learning_rate=0.1)
    >>> var1 = tf.Variable(10.0)
    >>> loss = lambda: (var1 ** 2)/2.0  # d(loss)/d(var1) == var1
    >>> step_count = opt.minimize(loss, [var1]).numpy()
    >>> # The first step is `-learning_rate*sign(grad)`
    >>> var1.numpy()
    9.9
    """

    _HAS_AGGREGATE_GRAD = True

    def __init__(
        self,
        learning_rate=0.001,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-7,
        amsgrad=False,
        name="Adam",
        **kwargs
    ):
        """Construct a new Adam optimizer.

        Args:
          learning_rate: A `Tensor`, floating point value, or a schedule that
            is a `tf.keras.optimizers.schedules.LearningRateSchedule`, or a
            callable that takes no arguments and returns the actual value to
            use. The learning rate. Defaults to `0.001`.
          beta_1: A float value or a constant float tensor, or a callable that
            takes no arguments and returns the actual value to use. The
            exponential decay rate for the 1st moment estimates. Defaults to
            `0.9`.
          beta_2: A float value or a constant float tensor, or a callable that
            takes no arguments and returns the actual value to use. The
            exponential decay rate for the 2nd moment estimates. Defaults to
            `0.999`.
          epsilon: A small constant for numerical stability. This epsilon is
            "epsilon hat" in the Kingma and Ba paper (in the formula just
            before Section 2.1), not the epsilon in Algorithm 1 of the paper.
            Defaults to `1e-7`.
          amsgrad: Boolean. Whether to apply the AMSGrad variant of this
            algorithm from the paper "On the Convergence of Adam and Beyond".
            Defaults to `False`.
          name: Optional name for the operations created when applying
            gradients. Defaults to "Adam".
          **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`,
            `lr`, `decay`}. `clipnorm` clips gradients by norm; `clipvalue`
            clips gradients by value; `decay` is included for backward
            compatibility to allow time-inverse decay of the learning rate;
            `lr` is included for backward compatibility, and it is recommended
            to use `learning_rate` instead.
        """

        super().__init__(name, **kwargs)
        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
        self._set_hyper("decay", self._initial_decay)
        self._set_hyper("beta_1", beta_1)
        self._set_hyper("beta_2", beta_2)
        self.epsilon = epsilon or backend_config.epsilon()
        self.amsgrad = amsgrad

    def _create_slots(self, var_list):
        # Create slots for the first and second moments.
        # Separate for-loops to respect the ordering of slot variables from v1.
        for var in var_list:
            self.add_slot(var, "m")
        for var in var_list:
            self.add_slot(var, "v")
        if self.amsgrad:
            for var in var_list:
                self.add_slot(var, "vhat")

    def _prepare_local(self, var_device, var_dtype, apply_state):
        super()._prepare_local(var_device, var_dtype, apply_state)

        local_step = tf.cast(self.iterations + 1, var_dtype)
        beta_1_t = tf.identity(self._get_hyper("beta_1", var_dtype))
        beta_2_t = tf.identity(self._get_hyper("beta_2", var_dtype))
        beta_1_power = tf.pow(beta_1_t, local_step)
        beta_2_power = tf.pow(beta_2_t, local_step)
        lr = apply_state[(var_device, var_dtype)]["lr_t"] * (
            tf.sqrt(1 - beta_2_power) / (1 - beta_1_power)
        )
        apply_state[(var_device, var_dtype)].update(
            dict(
                lr=lr,
                epsilon=tf.convert_to_tensor(self.epsilon, var_dtype),
                beta_1_t=beta_1_t,
                beta_1_power=beta_1_power,
                one_minus_beta_1_t=1 - beta_1_t,
                beta_2_t=beta_2_t,
                beta_2_power=beta_2_power,
                one_minus_beta_2_t=1 - beta_2_t,
            )
        )

    def set_weights(self, weights):
        params = self.weights
        # If the weights are generated by Keras V1 optimizer, it includes vhats
        # even without amsgrad, i.e., V1 optimizer has 3x + 1 variables, while
        # V2 optimizer has 2x + 1 variables. Filter vhats out for
        # compatibility.
        num_vars = int((len(params) - 1) / 2)
        if len(weights) == 3 * num_vars + 1:
            weights = weights[: len(params)]
        super().set_weights(weights)

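    # Unlike `Adam` above, this variant expresses the update with ordinary
    # TF ops instead of the fused Adam kernels; `jit_compile=True` compiles
    # the apply methods with XLA so the element-wise ops can still be fused.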

    @tf.function(jit_compile=True)
    def _resource_apply_dense(self, grad, var, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)

        m = self.get_slot(var, "m")
        v = self.get_slot(var, "v")

        alpha = (
            coefficients["lr_t"]
            * tf.sqrt(1 - coefficients["beta_2_power"])
            / (1 - coefficients["beta_1_power"])
        )
        m.assign_add((grad - m) * (1 - coefficients["beta_1_t"]))
        v.assign_add((tf.square(grad) - v) * (1 - coefficients["beta_2_t"]))
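        # Written as deltas, the two updates above are the usual EMA updates:
        # m <- beta_1 * m + (1 - beta_1) * g and
        # v <- beta_2 * v + (1 - beta_2) * g^2.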

        if self.amsgrad:
            vhat = self.get_slot(var, "vhat")
            vhat.assign(tf.maximum(vhat, v))
            v = vhat
        var.assign_sub((m * alpha) / (tf.sqrt(v) + coefficients["epsilon"]))

    @tf.function(jit_compile=True)
    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)

        # m_t = beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, "m")
        m_scaled_g_values = grad * coefficients["one_minus_beta_1_t"]
        m.assign(m * coefficients["beta_1_t"])
        m.scatter_add(tf.IndexedSlices(m_scaled_g_values, indices))
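        # The two-step pattern above (decay with `assign`, then `scatter_add`
        # of the scaled gradient rows at `indices`) means rows that received
        # no gradient are still decayed by beta_1, matching the dense behavior
        # described in the class docstring.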

        # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v = self.get_slot(var, "v")
        v_scaled_g_values = (grad * grad) * coefficients["one_minus_beta_2_t"]
        v.assign(v * coefficients["beta_2_t"])
        v.scatter_add(tf.IndexedSlices(v_scaled_g_values, indices))

        if not self.amsgrad:
            var.assign_sub(
                coefficients["lr"] * m / (tf.sqrt(v) + coefficients["epsilon"])
            )
        else:
            v_hat = self.get_slot(var, "vhat")
            v_hat.assign(tf.maximum(v_hat, v))
            var.assign_sub(
                coefficients["lr"]
                * m
                / (tf.sqrt(v_hat) + coefficients["epsilon"])
            )

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "learning_rate": self._serialize_hyperparameter(
                    "learning_rate"
                ),
                "decay": self._initial_decay,
                "beta_1": self._serialize_hyperparameter("beta_1"),
                "beta_2": self._serialize_hyperparameter("beta_2"),
                "epsilon": self.epsilon,
                "amsgrad": self.amsgrad,
            }
        )
        return config