Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/training/adam.py: 26%
90 statements
coverage.py v7.4.0, created at 2024-01-03 07:57 +0000

# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Adam for TensorFlow."""
from tensorflow.python.eager import context
from tensorflow.python.framework import ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import state_ops
from tensorflow.python.training import optimizer
from tensorflow.python.training import training_ops
from tensorflow.python.util.tf_export import tf_export


@tf_export(v1=["train.AdamOptimizer"])
class AdamOptimizer(optimizer.Optimizer):
  """Optimizer that implements the Adam algorithm.

  References:
    Adam - A Method for Stochastic Optimization:
      [Kingma et al., 2015](https://arxiv.org/abs/1412.6980)
      ([pdf](https://arxiv.org/pdf/1412.6980.pdf))

  @compatibility(TF2)
  tf.compat.v1.train.AdamOptimizer is compatible with eager mode and
  `tf.function`.
  When eager execution is enabled, `learning_rate`, `beta1`, `beta2`, and
  `epsilon` can each be a callable that takes no arguments and returns the
  actual value to use. This can be useful for changing these values across
  different invocations of optimizer functions.
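
  For example, a zero-argument callable can be supplied in place of a
  constant (a minimal sketch; `lr_schedule` is an illustrative name):

  ```python
  lr_schedule = lambda: 0.001
  optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=lr_schedule)
  ```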

  To switch to native TF2 style, use [`tf.keras.optimizers.Adam`]
  (https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/Adam)
  instead. Please note that, due to implementation differences,
  `tf.keras.optimizers.Adam` and `tf.compat.v1.train.AdamOptimizer` may have
  slight differences in floating point numerics even though the formula used
  for the variable updates still matches.

  #### Structural Mapping to Native TF2

  Before:

  ```python
  optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=0.001)
  ```

  After:

  ```python
  optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
  ```

  #### How to Map Arguments

  | TF1 Arg Name  | TF2 Arg Name  | Note                                      |
  |---------------|---------------|-------------------------------------------|
  | learning_rate | learning_rate | Be careful of setting `learning_rate` as a tensor value computed from the global step. In TF1 this usually implied a dynamic learning rate that was recomputed at each step. In TF2 (eager + function) it is treated as a scalar value that is computed only once, not as a symbolic placeholder recomputed each time. |
  | beta1         | beta_1        |                                           |
  | beta2         | beta_2        |                                           |
  | epsilon       | epsilon       | Default value is 1e-08 in TF1, but 1e-07 in TF2. |
  | use_locking   | N/A           | Not applicable in TF2.                    |

  #### Before & After Usage Example

  Before:

  ```python
  x = tf.Variable([1,2,3], dtype=tf.float32)
  grad = tf.constant([0.1, 0.2, 0.3])
  optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=0.001)
  optimizer.apply_gradients(zip([grad], [x]))
  ```

  After:

  ```python
  x = tf.Variable([1,2,3], dtype=tf.float32)
  grad = tf.constant([0.1, 0.2, 0.3])
  optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
  optimizer.apply_gradients(zip([grad], [x]))
  ```

  @end_compatibility
  """

  def __init__(self,
               learning_rate=0.001,
               beta1=0.9,
               beta2=0.999,
               epsilon=1e-8,
               use_locking=False,
               name="Adam"):
    r"""Construct a new Adam optimizer.

    Initialization:

    $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
    $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
    $$t := 0 \text{(Initialize timestep)}$$

    The update rule for `variable` with gradient `g` uses an optimization
    described at the end of section 2 of the paper:

    $$t := t + 1$$
    $$\text{lr}_t := \mathrm{learning\_rate} *
      \sqrt{1 - \beta_2^t} / (1 - \beta_1^t)$$

    $$m_t := \beta_1 * m_{t-1} + (1 - \beta_1) * g$$
    $$v_t := \beta_2 * v_{t-1} + (1 - \beta_2) * g * g$$
    $$\text{variable} := \text{variable} -
      \text{lr}_t * m_t / (\sqrt{v_t} + \epsilon)$$

    The default value of 1e-8 for epsilon might not be a good default in
    general. For example, when training an Inception network on ImageNet a
    current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
    formulation just before Section 2.1 of the Kingma and Ba paper rather than
    the formulation in Algorithm 1, the "epsilon" referred to here is "epsilon
    hat" in the paper.

    The sparse implementation of this algorithm (used when the gradient is an
    IndexedSlices object, typically because of `tf.gather` or an embedding
    lookup in the forward pass) does apply momentum to variable slices even if
    they were not used in the forward pass (meaning they have a gradient equal
    to zero). Momentum decay (beta1) is also applied to the entire momentum
    accumulator. This means that the sparse behavior is equivalent to the dense
    behavior (in contrast to some momentum implementations which ignore momentum
    unless a variable slice was actually used).
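
    For illustration, a graph-mode (TF1-style) sketch: a gradient produced by
    `tf.gather` is a `tf.IndexedSlices` and therefore takes this sparse update
    path (variable and loss below are arbitrary examples):

    ```python
    params = tf.Variable(tf.zeros([10, 4]))
    loss = tf.reduce_sum(tf.gather(params, [1, 3]))
    grad = tf.gradients(loss, [params])[0]  # tf.IndexedSlices
    opt = tf.compat.v1.train.AdamOptimizer(learning_rate=0.001)
    train_op = opt.apply_gradients([(grad, params)])  # routed to the sparse path
    ```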

    Args:
      learning_rate: A Tensor or a floating point value. The learning rate.
      beta1: A float value or a constant float tensor. The exponential decay
        rate for the 1st moment estimates.
      beta2: A float value or a constant float tensor. The exponential decay
        rate for the 2nd moment estimates.
      epsilon: A small constant for numerical stability. This epsilon is
        "epsilon hat" in the Kingma and Ba paper (in the formula just before
        Section 2.1), not the epsilon in Algorithm 1 of the paper.
      use_locking: If True use locks for update operations.
      name: Optional name for the operations created when applying gradients.
        Defaults to "Adam".
    """

    super(AdamOptimizer, self).__init__(use_locking, name)
    self._lr = learning_rate
    self._beta1 = beta1
    self._beta2 = beta2
    self._epsilon = epsilon

    # Tensor versions of the constructor arguments, created in _prepare().
    self._lr_t = None
    self._beta1_t = None
    self._beta2_t = None
    self._epsilon_t = None

  def _get_beta_accumulators(self):
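    # Fetch the beta1_power and beta2_power non-slot variables created in
    # _create_slots(); they hold beta1^t and beta2^t for bias correction.
    # graph=None makes the lookup work under eager execution.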
    with ops.init_scope():
      if context.executing_eagerly():
        graph = None
      else:
        graph = ops.get_default_graph()
      return (self._get_non_slot_variable("beta1_power", graph=graph),
              self._get_non_slot_variable("beta2_power", graph=graph))

  def _create_slots(self, var_list):
    # Create the beta1 and beta2 accumulators on the same device as the first
    # variable. Sort the var_list to make sure this device is consistent across
    # workers (these need to go on the same PS, otherwise some updates are
    # silently ignored).
    first_var = min(var_list, key=lambda x: x.name)
    self._create_non_slot_variable(
        initial_value=self._beta1, name="beta1_power", colocate_with=first_var)
    self._create_non_slot_variable(
        initial_value=self._beta2, name="beta2_power", colocate_with=first_var)

    # Create slots for the first and second moments.
    for v in var_list:
      self._zeros_slot(v, "m", self._name)
      self._zeros_slot(v, "v", self._name)

  def _prepare(self):
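    # Hyperparameters may be given as zero-argument callables; resolve them
    # here and cache tensor versions for use by the update ops below.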
    lr = self._call_if_callable(self._lr)
    beta1 = self._call_if_callable(self._beta1)
    beta2 = self._call_if_callable(self._beta2)
    epsilon = self._call_if_callable(self._epsilon)

    self._lr_t = ops.convert_to_tensor(lr, name="learning_rate")
    self._beta1_t = ops.convert_to_tensor(beta1, name="beta1")
    self._beta2_t = ops.convert_to_tensor(beta2, name="beta2")
    self._epsilon_t = ops.convert_to_tensor(epsilon, name="epsilon")

  def _apply_dense(self, grad, var):
    m = self.get_slot(var, "m")
    v = self.get_slot(var, "v")
    beta1_power, beta2_power = self._get_beta_accumulators()
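    # The fused kernel performs the full bias-corrected Adam update of var, m
    # and v in a single op, using beta1_power and beta2_power for the bias
    # correction.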
    return training_ops.apply_adam(
        var,
        m,
        v,
        math_ops.cast(beta1_power, var.dtype.base_dtype),
        math_ops.cast(beta2_power, var.dtype.base_dtype),
        math_ops.cast(self._lr_t, var.dtype.base_dtype),
        math_ops.cast(self._beta1_t, var.dtype.base_dtype),
        math_ops.cast(self._beta2_t, var.dtype.base_dtype),
        math_ops.cast(self._epsilon_t, var.dtype.base_dtype),
        grad,
        use_locking=self._use_locking).op

  def _resource_apply_dense(self, grad, var):
    m = self.get_slot(var, "m")
    v = self.get_slot(var, "v")
    beta1_power, beta2_power = self._get_beta_accumulators()
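    # Same fused update as _apply_dense, but addressing the variable and its
    # slots through resource handles.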
    return training_ops.resource_apply_adam(
        var.handle,
        m.handle,
        v.handle,
        math_ops.cast(beta1_power, grad.dtype.base_dtype),
        math_ops.cast(beta2_power, grad.dtype.base_dtype),
        math_ops.cast(self._lr_t, grad.dtype.base_dtype),
        math_ops.cast(self._beta1_t, grad.dtype.base_dtype),
        math_ops.cast(self._beta2_t, grad.dtype.base_dtype),
        math_ops.cast(self._epsilon_t, grad.dtype.base_dtype),
        grad,
        use_locking=self._use_locking)

  def _apply_sparse_shared(self, grad, var, indices, scatter_add):
    beta1_power, beta2_power = self._get_beta_accumulators()
    beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
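    # Fold the bias corrections (1 - beta2^t) and (1 - beta1^t) into the step
    # size, matching lr_t in the update rule documented in __init__.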
    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_scaled_g_values = grad * (1 - beta1_t)
    m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
    with ops.control_dependencies([m_t]):
      m_t = scatter_add(m, indices, m_scaled_g_values)
    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, "v")
    v_scaled_g_values = (grad * grad) * (1 - beta2_t)
    v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
    with ops.control_dependencies([v_t]):
      v_t = scatter_add(v, indices, v_scaled_g_values)
    v_sqrt = math_ops.sqrt(v_t)
    var_update = state_ops.assign_sub(
        var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
    return control_flow_ops.group(*[var_update, m_t, v_t])

  def _apply_sparse(self, grad, var):
    return self._apply_sparse_shared(
        grad.values,
        var,
        grad.indices,
        lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
            x,
            i,
            v,
            use_locking=self._use_locking))

  def _resource_scatter_add(self, x, i, v):
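    # Scatter-add into the resource variable, then return its value with a
    # control dependency so readers observe the updated contents.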
    with ops.control_dependencies(
        [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
      return x.value()

  def _resource_apply_sparse(self, grad, var, indices):
    return self._apply_sparse_shared(grad, var, indices,
                                     self._resource_scatter_add)

  def _finish(self, update_ops, name_scope):
    # Update the power accumulators.
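    # beta1_power and beta2_power hold beta1^t and beta2^t for bias
    # correction; they advance exactly once per step, after all per-variable
    # updates have run.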
    with ops.control_dependencies(update_ops):
      beta1_power, beta2_power = self._get_beta_accumulators()
      with ops.colocate_with(beta1_power):
        update_beta1 = beta1_power.assign(
            beta1_power * self._beta1_t, use_locking=self._use_locking)
        update_beta2 = beta2_power.assign(
            beta2_power * self._beta2_t, use_locking=self._use_locking)
      return control_flow_ops.group(
          *update_ops + [update_beta1, update_beta2], name=name_scope)