Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/keras/optimizer_v2/adam.py: 22%

152 statements  

coverage.py v7.4.0, created at 2024-01-03 07:57 +0000

# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Adam optimizer implementation."""
# pylint: disable=g-classes-have-attributes

from tensorflow.python.eager import def_function
from tensorflow.python.framework import indexed_slices
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_conversion
from tensorflow.python.keras import backend_config
from tensorflow.python.keras.optimizer_v2 import optimizer_v2
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.training import gen_training_ops
from tensorflow.python.util.tf_export import keras_export


@keras_export('keras.optimizers.Adam')
class Adam(optimizer_v2.OptimizerV2):
  r"""Optimizer that implements the Adam algorithm.

  Adam optimization is a stochastic gradient descent method that is based on
  adaptive estimation of first-order and second-order moments.

  According to
  [Kingma et al., 2014](http://arxiv.org/abs/1412.6980),
  the method is "*computationally
  efficient, has little memory requirement, invariant to diagonal rescaling of
  gradients, and is well suited for problems that are large in terms of
  data/parameters*".

  Args:
    learning_rate: A `Tensor`, floating point value, or a schedule that is a
      `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable
      that takes no arguments and returns the actual value to use. The
      learning rate. Defaults to 0.001.
    beta_1: A float value or a constant float tensor, or a callable
      that takes no arguments and returns the actual value to use. The
      exponential decay rate for the 1st moment estimates. Defaults to 0.9.
    beta_2: A float value or a constant float tensor, or a callable
      that takes no arguments and returns the actual value to use. The
      exponential decay rate for the 2nd moment estimates. Defaults to 0.999.
    epsilon: A small constant for numerical stability. This epsilon is
      "epsilon hat" in the Kingma and Ba paper (in the formula just before
      Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
      1e-7.
    amsgrad: Boolean. Whether to apply the AMSGrad variant of this algorithm
      from the paper "On the Convergence of Adam and Beyond". Defaults to
      `False`.
    name: Optional name for the operations created when applying gradients.
      Defaults to `"Adam"`.
    **kwargs: Keyword arguments. Allowed to be one of
      `"clipnorm"` or `"clipvalue"`.
      `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips
      gradients by value.
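
  For example, to clip gradients by norm while using the default
  hyperparameters otherwise (the values below are illustrative):

  >>> opt = tf.keras.optimizers.Adam(learning_rate=0.001, clipnorm=1.0)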

  Usage:

  >>> opt = tf.keras.optimizers.Adam(learning_rate=0.1)
  >>> var1 = tf.Variable(10.0)
  >>> loss = lambda: (var1 ** 2)/2.0  # d(loss)/d(var1) == var1
  >>> step_count = opt.minimize(loss, [var1]).numpy()
  >>> # The first step is `-learning_rate*sign(grad)`
  >>> var1.numpy()
  9.9
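
  The optimizer is more commonly passed to `tf.keras.Model.compile`; a minimal
  sketch (the one-layer model here is only for illustration):

  >>> model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
  >>> model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
  ...               loss='mse')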

  Reference:
    - [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
    - [Reddi et al., 2018](
        https://openreview.net/pdf?id=ryQu7f-RZ) for `amsgrad`.

  Notes:

  The default value of 1e-7 for epsilon might not be a good default in
  general. For example, when training an Inception network on ImageNet a
  current good choice is 1.0 or 0.1. Note that since Adam uses the
  formulation just before Section 2.1 of the Kingma and Ba paper rather than
  the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
  hat" in the paper.
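
  For example, a larger epsilon can simply be passed at construction time
  (the value below is illustrative):

  >>> opt = tf.keras.optimizers.Adam(learning_rate=0.001, epsilon=0.1)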

  The sparse implementation of this algorithm (used when the gradient is an
  IndexedSlices object, typically because of `tf.gather` or an embedding
  lookup in the forward pass) does apply momentum to variable slices even if
  they were not used in the forward pass (meaning they have a gradient equal
  to zero). Momentum decay (beta1) is also applied to the entire momentum
  accumulator. This means that the sparse behavior is equivalent to the dense
  behavior (in contrast to some momentum implementations which ignore momentum
  unless a variable slice was actually used).
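
  A sparse (`tf.IndexedSlices`) gradient arises, for example, from
  `tf.gather`; a minimal sketch (the variable and loss are chosen only for
  illustration):

  >>> emb = tf.Variable(tf.ones([3, 2]))
  >>> opt = tf.keras.optimizers.Adam(learning_rate=0.1)
  >>> with tf.GradientTape() as tape:
  ...   loss = tf.reduce_sum(tf.gather(emb, [0]))
  >>> grad = tape.gradient(loss, emb)  # IndexedSlices touching only row 0
  >>> _ = opt.apply_gradients([(grad, emb)])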

102 """ 

103 

104 _HAS_AGGREGATE_GRAD = True 

105 

106 def __init__(self, 

107 learning_rate=0.001, 

108 beta_1=0.9, 

109 beta_2=0.999, 

110 epsilon=1e-7, 

111 amsgrad=False, 

112 name='Adam', 

113 **kwargs): 

114 super(Adam, self).__init__(name, **kwargs) 

115 self._set_hyper('learning_rate', kwargs.get('lr', learning_rate)) 

116 self._set_hyper('decay', self._initial_decay) 

117 self._set_hyper('beta_1', beta_1) 

118 self._set_hyper('beta_2', beta_2) 

119 self.epsilon = epsilon or backend_config.epsilon() 

120 self.amsgrad = amsgrad 

121 

122 def _create_slots(self, var_list): 

123 # Create slots for the first and second moments. 

124 # Separate for-loops to respect the ordering of slot variables from v1. 

125 for var in var_list: 

126 self.add_slot(var, 'm') 

127 for var in var_list: 

128 self.add_slot(var, 'v') 

129 if self.amsgrad: 

130 for var in var_list: 

131 self.add_slot(var, 'vhat') 

132 

133 def _prepare_local(self, var_device, var_dtype, apply_state): 

134 super(Adam, self)._prepare_local(var_device, var_dtype, apply_state) 

135 

136 local_step = math_ops.cast(self.iterations + 1, var_dtype) 

137 beta_1_t = array_ops.identity(self._get_hyper('beta_1', var_dtype)) 

138 beta_2_t = array_ops.identity(self._get_hyper('beta_2', var_dtype)) 

139 beta_1_power = math_ops.pow(beta_1_t, local_step) 

140 beta_2_power = math_ops.pow(beta_2_t, local_step) 

141 lr = (apply_state[(var_device, var_dtype)]['lr_t'] * 

142 (math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power))) 

143 apply_state[(var_device, var_dtype)].update( 

144 dict( 

145 lr=lr, 

146 epsilon=tensor_conversion.convert_to_tensor_v2_with_dispatch( 

147 self.epsilon, var_dtype 

148 ), 

149 beta_1_t=beta_1_t, 

150 beta_1_power=beta_1_power, 

151 one_minus_beta_1_t=1 - beta_1_t, 

152 beta_2_t=beta_2_t, 

153 beta_2_power=beta_2_power, 

154 one_minus_beta_2_t=1 - beta_2_t, 

155 ) 

156 ) 

  def set_weights(self, weights):
    params = self.weights
    # If the weights were generated by the Keras V1 optimizer, they include
    # vhats even without amsgrad, i.e., the V1 optimizer has 3x + 1 variables,
    # while the V2 optimizer has 2x + 1 variables. Filter vhats out for
    # compatibility.
    num_vars = int((len(params) - 1) / 2)
    if len(weights) == 3 * num_vars + 1:
      weights = weights[:len(params)]
    super(Adam, self).set_weights(weights)

  def _resource_apply_dense(self, grad, var, apply_state=None):
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))

    m = self.get_slot(var, 'm')
    v = self.get_slot(var, 'v')

    if not self.amsgrad:
      return gen_training_ops.ResourceApplyAdam(
          var=var.handle,
          m=m.handle,
          v=v.handle,
          beta1_power=coefficients['beta_1_power'],
          beta2_power=coefficients['beta_2_power'],
          lr=coefficients['lr_t'],
          beta1=coefficients['beta_1_t'],
          beta2=coefficients['beta_2_t'],
          epsilon=coefficients['epsilon'],
          grad=grad,
          use_locking=self._use_locking)
    else:
      vhat = self.get_slot(var, 'vhat')
      return gen_training_ops.ResourceApplyAdamWithAmsgrad(
          var=var.handle,
          m=m.handle,
          v=v.handle,
          vhat=vhat.handle,
          beta1_power=coefficients['beta_1_power'],
          beta2_power=coefficients['beta_2_power'],
          lr=coefficients['lr_t'],
          beta1=coefficients['beta_1_t'],
          beta2=coefficients['beta_2_t'],
          epsilon=coefficients['epsilon'],
          grad=grad,
          use_locking=self._use_locking)

  def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))

    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, 'm')
    m_scaled_g_values = grad * coefficients['one_minus_beta_1_t']
    m_t = state_ops.assign(m, m * coefficients['beta_1_t'],
                           use_locking=self._use_locking)
    with ops.control_dependencies([m_t]):
      m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, 'v')
    v_scaled_g_values = (grad * grad) * coefficients['one_minus_beta_2_t']
    v_t = state_ops.assign(v, v * coefficients['beta_2_t'],
                           use_locking=self._use_locking)
    with ops.control_dependencies([v_t]):
      v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)

    if not self.amsgrad:
      v_sqrt = math_ops.sqrt(v_t)
      var_update = state_ops.assign_sub(
          var, coefficients['lr'] * m_t / (v_sqrt + coefficients['epsilon']),
          use_locking=self._use_locking)
      return control_flow_ops.group(*[var_update, m_t, v_t])
    else:
      v_hat = self.get_slot(var, 'vhat')
      v_hat_t = math_ops.maximum(v_hat, v_t)
      with ops.control_dependencies([v_hat_t]):
        v_hat_t = state_ops.assign(
            v_hat, v_hat_t, use_locking=self._use_locking)
      v_hat_sqrt = math_ops.sqrt(v_hat_t)
      var_update = state_ops.assign_sub(
          var,
          coefficients['lr'] * m_t / (v_hat_sqrt + coefficients['epsilon']),
          use_locking=self._use_locking)
      return control_flow_ops.group(*[var_update, m_t, v_t, v_hat_t])

  def get_config(self):
    config = super(Adam, self).get_config()
    config.update({
        'learning_rate': self._serialize_hyperparameter('learning_rate'),
        'decay': self._initial_decay,
        'beta_1': self._serialize_hyperparameter('beta_1'),
        'beta_2': self._serialize_hyperparameter('beta_2'),
        'epsilon': self.epsilon,
        'amsgrad': self.amsgrad,
    })
    return config
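
  # For illustration only: `get_config` pairs with `from_config`, so an
  # equivalent optimizer can be re-created from the returned dictionary, e.g.
  #   opt = Adam(learning_rate=0.1)
  #   restored = Adam.from_config(opt.get_config())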


class NonFusedAdam(optimizer_v2.OptimizerV2):
  r"""Optimizer that implements the Adam algorithm without fused kernels.

  Adam optimization is a stochastic gradient descent method that is based on
  adaptive estimation of first-order and second-order moments.
  According to the paper
  [Adam: A Method for Stochastic Optimization. Kingma et al.,
  2014](http://arxiv.org/abs/1412.6980), the method is "*computationally
  efficient, has little memory requirement, invariant to diagonal rescaling of
  gradients, and is well suited for problems that are large in terms of
  data/parameters*".

  For AMSGrad see [On The Convergence Of Adam And Beyond.
  Reddi et al., 2018](https://openreview.net/pdf?id=ryQu7f-RZ).

  **If amsgrad = False**:

  initialize $m_0$ as 1st moment vector
  initialize $v_0$ as 2nd moment vector

  The update rule for $\theta$ with gradient $g$ uses an optimization
  described at the end of section 2 of the paper:

  $$lr_t = \mathrm{learning\_rate} *
    \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$
  $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
  $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
  $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$

  **If amsgrad = True**:

  initialize $m_0$ as 1st moment vector
  initialize $v_0$ as 2nd moment vector
  initialize $\hat{v}_0$ as 2nd moment vector

  The update rule for $\theta$ with gradient $g$ uses an optimization
  described at the end of section 2 of the paper:

  $$lr_t = \mathrm{learning\_rate} *
    \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$

  $$m_t = \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
  $$v_t = \beta_2 * v_{t-1} + (1 - \beta_2) * g^2$$
  $$\hat{v}_t = \max(\hat{v}_{t-1}, v_t)$$
  $$\theta_t = \theta_{t-1} - lr_t * m_t / (\sqrt{\hat{v}_t} + \epsilon)$$
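
  As an illustration, a single update step for the amsgrad = False case can be
  worked out in plain Python with zero-initialized moments and the
  illustrative values $g = 1$, $\theta_0 = 1$:

  >>> lr, beta_1, beta_2, epsilon = 0.001, 0.9, 0.999, 1e-7
  >>> g, theta, m, v = 1.0, 1.0, 0.0, 0.0
  >>> m = beta_1 * m + (1 - beta_1) * g
  >>> v = beta_2 * v + (1 - beta_2) * g * g
  >>> lr_t = lr * (1 - beta_2) ** 0.5 / (1 - beta_1)
  >>> round(theta - lr_t * m / (v ** 0.5 + epsilon), 3)
  0.999

  That is, the first step has magnitude close to `learning_rate`, matching the
  usage example below.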

  The default value of 1e-7 for epsilon might not be a good default in
  general. For example, when training an Inception network on ImageNet a
  current good choice is 1.0 or 0.1. Note that since Adam uses the
  formulation just before Section 2.1 of the Kingma and Ba paper rather than
  the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
  hat" in the paper.

  The sparse implementation of this algorithm (used when the gradient is an
  IndexedSlices object, typically because of `tf.gather` or an embedding
  lookup in the forward pass) does apply momentum to variable slices even if
  they were not used in the forward pass (meaning they have a gradient equal
  to zero). Momentum decay (beta1) is also applied to the entire momentum
  accumulator. This means that the sparse behavior is equivalent to the dense
  behavior (in contrast to some momentum implementations which ignore momentum
  unless a variable slice was actually used).

  Usage:

  >>> opt = tf.keras.optimizers.Adam(learning_rate=0.1)
  >>> var1 = tf.Variable(10.0)
  >>> loss = lambda: (var1 ** 2)/2.0  # d(loss)/d(var1) == var1
  >>> step_count = opt.minimize(loss, [var1]).numpy()
  >>> # The first step is `-learning_rate*sign(grad)`
  >>> var1.numpy()
  9.9
  """

  _HAS_AGGREGATE_GRAD = True

  def __init__(self,
               learning_rate=0.001,
               beta_1=0.9,
               beta_2=0.999,
               epsilon=1e-7,
               amsgrad=False,
               name='Adam',
               **kwargs):
    """Construct a new Adam optimizer.

    Args:
      learning_rate: A `Tensor`, floating point value, or a schedule that is a
        `tf.keras.optimizers.schedules.LearningRateSchedule`, or a callable that
        takes no arguments and returns the actual value to use. The learning
        rate. Defaults to 0.001.
      beta_1: A float value or a constant float tensor, or a callable that takes
        no arguments and returns the actual value to use. The exponential decay
        rate for the 1st moment estimates. Defaults to 0.9.
      beta_2: A float value or a constant float tensor, or a callable that takes
        no arguments and returns the actual value to use. The exponential decay
        rate for the 2nd moment estimates. Defaults to 0.999.
      epsilon: A small constant for numerical stability. This epsilon is
        "epsilon hat" in the Kingma and Ba paper (in the formula just before
        Section 2.1), not the epsilon in Algorithm 1 of the paper. Defaults to
        1e-7.
      amsgrad: Boolean. Whether to apply the AMSGrad variant of this algorithm
        from the paper "On the Convergence of Adam and Beyond". Defaults to
        `False`.
      name: Optional name for the operations created when applying gradients.
        Defaults to "Adam".
      **kwargs: Keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`,
        `decay`}. `clipnorm` clips gradients by norm; `clipvalue` clips
        gradients by value; `decay` is included for backward compatibility to
        allow time-inverse decay of the learning rate; `lr` is included for
        backward compatibility, but using `learning_rate` is recommended
        instead.
    """

    super(NonFusedAdam, self).__init__(name, **kwargs)
    self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
    self._set_hyper('decay', self._initial_decay)
    self._set_hyper('beta_1', beta_1)
    self._set_hyper('beta_2', beta_2)
    self.epsilon = epsilon or backend_config.epsilon()
    self.amsgrad = amsgrad

  def _create_slots(self, var_list):
    # Create slots for the first and second moments.
    # Separate for-loops to respect the ordering of slot variables from v1.
    for var in var_list:
      self.add_slot(var, 'm')
    for var in var_list:
      self.add_slot(var, 'v')
    if self.amsgrad:
      for var in var_list:
        self.add_slot(var, 'vhat')

  def _prepare_local(self, var_device, var_dtype, apply_state):
    super(NonFusedAdam, self)._prepare_local(var_device, var_dtype, apply_state)

    local_step = math_ops.cast(self.iterations + 1, var_dtype)
    beta_1_t = array_ops.identity(self._get_hyper('beta_1', var_dtype))
    beta_2_t = array_ops.identity(self._get_hyper('beta_2', var_dtype))
    beta_1_power = math_ops.pow(beta_1_t, local_step)
    beta_2_power = math_ops.pow(beta_2_t, local_step)
    lr = (
        apply_state[(var_device, var_dtype)]['lr_t'] *
        (math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power)))
    apply_state[(var_device, var_dtype)].update(
        dict(
            lr=lr,
            epsilon=tensor_conversion.convert_to_tensor_v2_with_dispatch(
                self.epsilon, var_dtype
            ),
            beta_1_t=beta_1_t,
            beta_1_power=beta_1_power,
            one_minus_beta_1_t=1 - beta_1_t,
            beta_2_t=beta_2_t,
            beta_2_power=beta_2_power,
            one_minus_beta_2_t=1 - beta_2_t,
        )
    )

414 def set_weights(self, weights): 

415 params = self.weights 

416 # If the weights are generated by Keras V1 optimizer, it includes vhats 

417 # even without amsgrad, i.e, V1 optimizer has 3x + 1 variables, while V2 

418 # optimizer has 2x + 1 variables. Filter vhats out for compatibility. 

419 num_vars = int((len(params) - 1) / 2) 

420 if len(weights) == 3 * num_vars + 1: 

421 weights = weights[:len(params)] 

422 super(NonFusedAdam, self).set_weights(weights) 

423 

424 @def_function.function(jit_compile=True) 

425 def _resource_apply_dense(self, grad, var, apply_state=None): 

426 var_device, var_dtype = var.device, var.dtype.base_dtype 

427 coefficients = ((apply_state or {}).get((var_device, var_dtype)) or 

428 self._fallback_apply_state(var_device, var_dtype)) 

429 

430 m = self.get_slot(var, 'm') 

431 v = self.get_slot(var, 'v') 

432 

433 alpha = ( 

434 coefficients['lr_t'] * math_ops.sqrt(1 - coefficients['beta_2_power']) / 

435 (1 - coefficients['beta_1_power'])) 

436 m.assign_add((grad - m) * (1 - coefficients['beta_1_t'])) 

437 v.assign_add((math_ops.square(grad) - v) * (1 - coefficients['beta_2_t'])) 

438 if self.amsgrad: 

439 vhat = self.get_slot(var, 'vhat') 

440 vhat.assign(math_ops.maximum(vhat, v)) 

441 v = vhat 

442 var.assign_sub( 

443 (m * alpha) / (math_ops.sqrt(v) - coefficients['epsilon'])) 

444 

445 @def_function.function(jit_compile=True) 

446 def _resource_apply_sparse(self, grad, var, indices, apply_state=None): 

447 var_device, var_dtype = var.device, var.dtype.base_dtype 

448 coefficients = ((apply_state or {}).get((var_device, var_dtype)) or 

449 self._fallback_apply_state(var_device, var_dtype)) 

450 

451 # m_t = beta1 * m + (1 - beta1) * g_t 

452 m = self.get_slot(var, 'm') 

453 m_scaled_g_values = grad * coefficients['one_minus_beta_1_t'] 

454 m.assign(m * coefficients['beta_1_t']) 

455 m.scatter_add(indexed_slices.IndexedSlices(m_scaled_g_values, indices)) 

456 

457 # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) 

458 v = self.get_slot(var, 'v') 

459 v_scaled_g_values = (grad * grad) * coefficients['one_minus_beta_2_t'] 

460 v.assign(v * coefficients['beta_2_t']) 

461 v.scatter_add(indexed_slices.IndexedSlices(v_scaled_g_values, indices)) 

462 

463 if not self.amsgrad: 

464 var.assign_sub(coefficients['lr'] * m / 

465 (math_ops.sqrt(v) + coefficients['epsilon'])) 

466 else: 

467 v_hat = self.get_slot(var, 'vhat') 

468 v_hat.assign(math_ops.maximum(v_hat, v)) 

469 var.assign_sub(coefficients['lr'] * m / 

470 (math_ops.sqrt(v_hat) + coefficients['epsilon'])) 

471 

472 def get_config(self): 

473 config = super(NonFusedAdam, self).get_config() 

474 config.update({ 

475 'learning_rate': self._serialize_hyperparameter('learning_rate'), 

476 'decay': self._initial_decay, 

477 'beta_1': self._serialize_hyperparameter('beta_1'), 

478 'beta_2': self._serialize_hyperparameter('beta_2'), 

479 'epsilon': self.epsilon, 

480 'amsgrad': self.amsgrad, 

481 }) 

482 return config