Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/training/adagrad_da.py: 30%

56 statements  

« prev     ^ index     » next       coverage.py v7.4.0, created at 2024-01-03 07:57 +0000

1# Copyright 2016 The TensorFlow Authors. All Rights Reserved. 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14# ============================================================================== 

15"""Adagrad Dual Averaging for TensorFlow.""" 

16from tensorflow.python.framework import constant_op 

17from tensorflow.python.framework import ops 

18from tensorflow.python.ops import array_ops 

19from tensorflow.python.ops import math_ops 

20from tensorflow.python.training import optimizer 

21from tensorflow.python.training import training_ops 

22from tensorflow.python.util.tf_export import tf_export 

23 

24 

@tf_export(v1=["train.AdagradDAOptimizer"])
class AdagradDAOptimizer(optimizer.Optimizer):
  """Adagrad Dual Averaging algorithm for sparse linear models.

  This optimizer takes care of regularization of unseen features in a mini
  batch by updating them when they are seen with a closed form update rule
  that is equivalent to having updated them on every mini-batch.

  AdagradDA is typically used when there is a need for large sparsity in the
  trained model. This optimizer only guarantees sparsity for linear models. Be
  careful when using AdagradDA for deep networks as it will require careful
  initialization of the gradient accumulators for it to train.

  References:
    Adaptive Subgradient Methods for Online Learning and Stochastic
    Optimization
    :[Duchi et al., 2011](http://jmlr.org/papers/v12/duchi11a.html)
    ([pdf](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf))
  """

  def __init__(self,
               learning_rate,
               global_step,
               initial_gradient_squared_accumulator_value=0.1,
               l1_regularization_strength=0.0,
               l2_regularization_strength=0.0,
               use_locking=False,
               name="AdagradDA"):
    """Construct a new AdagradDA optimizer.

    Args:
      learning_rate: A `Tensor` or a floating point value.  The learning rate.
      global_step: A `Tensor` containing the current training step number.
      initial_gradient_squared_accumulator_value: A floating point value.
        Starting value for the accumulators, must be positive.
      l1_regularization_strength: A float value, must be greater than or
        equal to zero.
      l2_regularization_strength: A float value, must be greater than or
        equal to zero.
      use_locking: If `True` use locks for update operations.
      name: Optional name prefix for the operations created when applying
        gradients.  Defaults to "AdagradDA".

    Raises:
      ValueError: If `initial_gradient_squared_accumulator_value`,
        `l1_regularization_strength`, or `l2_regularization_strength` is
        invalid.
    """
    if initial_gradient_squared_accumulator_value <= 0.0:
      raise ValueError("initial_gradient_squared_accumulator_value must be "
                       "positive: %s" %
                       initial_gradient_squared_accumulator_value)
    # Enforce the non-negativity constraints documented above; negative
    # regularization strengths would silently produce ill-defined updates.
    if l1_regularization_strength < 0.0:
      raise ValueError("l1_regularization_strength must be greater than or "
                       "equal to zero: %s" % l1_regularization_strength)
    if l2_regularization_strength < 0.0:
      raise ValueError("l2_regularization_strength must be greater than or "
                       "equal to zero: %s" % l2_regularization_strength)
    super(AdagradDAOptimizer, self).__init__(use_locking, name)
    self._learning_rate = learning_rate
    self._initial_gradient_squared_accumulator_value = (
        initial_gradient_squared_accumulator_value)
    # Created in _prepare().
    self._learning_rate_tensor = None
    self._l1_regularization_strength = l1_regularization_strength
    self._l2_regularization_strength = l2_regularization_strength
    self._global_step = global_step
    self._global_step_on_worker = None

  def _create_slots(self, var_list):
    """Creates the two accumulator slots for each variable.

    Each variable gets a "gradient_accumulator" slot initialized to zero and
    a "gradient_squared_accumulator" slot initialized to
    `initial_gradient_squared_accumulator_value`, both colocated with the
    variable.
    """
    for v in var_list:
      with ops.colocate_with(v):
        g_val = constant_op.constant(
            0.0, shape=v.get_shape(), dtype=v.dtype.base_dtype)
        gg_val = constant_op.constant(
            self._initial_gradient_squared_accumulator_value,
            shape=v.get_shape(),
            dtype=v.dtype.base_dtype)
        self._get_or_make_slot(v, g_val, "gradient_accumulator", self._name)
        self._get_or_make_slot(v, gg_val, "gradient_squared_accumulator",
                               self._name)

  def _prepare(self):
    """Converts hyperparameters to tensors before the apply ops are built."""
    self._learning_rate_tensor = ops.convert_to_tensor(
        self._learning_rate, name="learning_rate")
    # Performance optimization so that worker creates a copy of the global step
    # to avoid overloading the parameter server holding the global step.
    with ops.colocate_with(self._learning_rate_tensor):
      self._global_step_on_worker = array_ops.identity(self._global_step) + 1

  def _cast_hypers(self, dtype):
    """Returns (learning_rate, l1, l2) hyperparameters cast to `dtype`."""
    return (math_ops.cast(self._learning_rate_tensor, dtype),
            math_ops.cast(self._l1_regularization_strength, dtype),
            math_ops.cast(self._l2_regularization_strength, dtype))

  def _worker_global_step(self, var):
    """Reads the worker-local global step copy on `var`'s device."""
    with ops.device(var.device):
      return array_ops.identity(self._global_step_on_worker)

  def _apply_dense(self, grad, var):
    """Applies a dense gradient to a ref variable."""
    g_acc = self.get_slot(var, "gradient_accumulator")
    gg_acc = self.get_slot(var, "gradient_squared_accumulator")
    global_step = self._worker_global_step(var)
    lr, l1, l2 = self._cast_hypers(var.dtype.base_dtype)
    return training_ops.apply_adagrad_da(
        var,
        g_acc,
        gg_acc,
        grad,
        lr,
        l1,
        l2,
        global_step,
        use_locking=self._use_locking)

  def _resource_apply_dense(self, grad, var):
    """Applies a dense gradient to a resource variable."""
    g_acc = self.get_slot(var, "gradient_accumulator")
    gg_acc = self.get_slot(var, "gradient_squared_accumulator")
    global_step = self._worker_global_step(var)
    lr, l1, l2 = self._cast_hypers(grad.dtype.base_dtype)
    return training_ops.resource_apply_adagrad_da(
        var.handle,
        g_acc.handle,
        gg_acc.handle,
        grad,
        lr,
        l1,
        l2,
        global_step,
        use_locking=self._use_locking)

  def _apply_sparse(self, grad, var):
    """Applies a sparse (IndexedSlices) gradient to a ref variable."""
    g_acc = self.get_slot(var, "gradient_accumulator")
    gg_acc = self.get_slot(var, "gradient_squared_accumulator")
    global_step = self._worker_global_step(var)
    lr, l1, l2 = self._cast_hypers(var.dtype.base_dtype)
    return training_ops.sparse_apply_adagrad_da(
        var,
        g_acc,
        gg_acc,
        grad.values,
        grad.indices,
        lr,
        l1,
        l2,
        global_step,
        use_locking=self._use_locking)

  def _resource_apply_sparse(self, grad, var, indices):
    """Applies a sparse gradient to a resource variable."""
    g_acc = self.get_slot(var, "gradient_accumulator")
    gg_acc = self.get_slot(var, "gradient_squared_accumulator")
    global_step = self._worker_global_step(var)
    lr, l1, l2 = self._cast_hypers(grad.dtype)
    return training_ops.resource_sparse_apply_adagrad_da(
        var.handle,
        g_acc.handle,
        gg_acc.handle,
        grad,
        indices,
        lr,
        l1,
        l2,
        global_step,
        use_locking=self._use_locking)