Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/training/adagrad_da.py: 30%

56 statements  

« prev     ^ index     » next       coverage.py v7.4.0, created at 2024-01-03 07:57 +0000

1# Copyright 2016 The TensorFlow Authors. All Rights Reserved. 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14# ============================================================================== 

15"""Adagrad Dual Averaging for TensorFlow.""" 

16from tensorflow.python.framework import constant_op 

17from tensorflow.python.framework import ops 

18from tensorflow.python.ops import array_ops 

19from tensorflow.python.ops import math_ops 

20from tensorflow.python.training import optimizer 

21from tensorflow.python.training import training_ops 

22from tensorflow.python.util.tf_export import tf_export 

23 

24 

@tf_export(v1=["train.AdagradDAOptimizer"])
class AdagradDAOptimizer(optimizer.Optimizer):
  """Adagrad Dual Averaging algorithm for sparse linear models.

  This optimizer takes care of regularization of unseen features in a mini
  batch by updating them when they are seen with a closed form update rule
  that is equivalent to having updated them on every mini-batch.

  AdagradDA is typically used when there is a need for large sparsity in the
  trained model. This optimizer only guarantees sparsity for linear models. Be
  careful when using AdagradDA for deep networks as it will require careful
  initialization of the gradient accumulators for it to train.

  References:
    Adaptive Subgradient Methods for Online Learning and Stochastic
    Optimization
    :[Duchi et al., 2011](http://jmlr.org/papers/v12/duchi11a.html)
    ([pdf](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf))
  """

  def __init__(self,
               learning_rate,
               global_step,
               initial_gradient_squared_accumulator_value=0.1,
               l1_regularization_strength=0.0,
               l2_regularization_strength=0.0,
               use_locking=False,
               name="AdagradDA"):
    """Construct a new AdagradDA optimizer.

    Args:
      learning_rate: A `Tensor` or a floating point value.  The learning rate.
      global_step: A `Tensor` containing the current training step number.
      initial_gradient_squared_accumulator_value: A floating point value.
        Starting value for the accumulators, must be positive.
      l1_regularization_strength: A float value, must be greater than or
        equal to zero.
      l2_regularization_strength: A float value, must be greater than or
        equal to zero.
      use_locking: If `True` use locks for update operations.
      name: Optional name prefix for the operations created when applying
        gradients.  Defaults to "AdagradDA".

    Raises:
      ValueError: If `initial_gradient_squared_accumulator_value`,
        `l1_regularization_strength`, or `l2_regularization_strength` is
        invalid.
    """
    if initial_gradient_squared_accumulator_value <= 0.0:
      raise ValueError("initial_gradient_squared_accumulator_value must be "
                       "positive: %s" %
                       initial_gradient_squared_accumulator_value)
    # Enforce the non-negativity constraints documented above; negative
    # regularization strengths would silently produce ill-defined updates.
    if l1_regularization_strength < 0.0:
      raise ValueError("l1_regularization_strength must be greater than or "
                       "equal to zero: %s" % l1_regularization_strength)
    if l2_regularization_strength < 0.0:
      raise ValueError("l2_regularization_strength must be greater than or "
                       "equal to zero: %s" % l2_regularization_strength)
    super(AdagradDAOptimizer, self).__init__(use_locking, name)
    self._learning_rate = learning_rate
    self._initial_gradient_squared_accumulator_value = (
        initial_gradient_squared_accumulator_value)
    # Created in _prepare().
    self._learning_rate_tensor = None
    self._l1_regularization_strength = l1_regularization_strength
    self._l2_regularization_strength = l2_regularization_strength
    self._global_step = global_step
    self._global_step_on_worker = None

  def _create_slots(self, var_list):
    """Creates the two accumulator slots for each variable.

    Each variable gets a "gradient_accumulator" slot initialized to zero and
    a "gradient_squared_accumulator" slot initialized to
    `initial_gradient_squared_accumulator_value`, both colocated with the
    variable.
    """
    for v in var_list:
      with ops.colocate_with(v):
        g_val = constant_op.constant(
            0.0, shape=v.get_shape(), dtype=v.dtype.base_dtype)
        gg_val = constant_op.constant(
            self._initial_gradient_squared_accumulator_value,
            shape=v.get_shape(),
            dtype=v.dtype.base_dtype)
        self._get_or_make_slot(v, g_val, "gradient_accumulator", self._name)
        self._get_or_make_slot(v, gg_val, "gradient_squared_accumulator",
                               self._name)

  def _prepare(self):
    """Converts hyperparameters to tensors before the apply ops are built."""
    self._learning_rate_tensor = ops.convert_to_tensor(
        self._learning_rate, name="learning_rate")
    # Performance optimization so that worker creates a copy of the global step
    # to avoid overloading the parameter server holding the global step.
    with ops.colocate_with(self._learning_rate_tensor):
      self._global_step_on_worker = array_ops.identity(self._global_step) + 1

  def _cast_hypers(self, dtype):
    """Returns (learning_rate, l1, l2) hyperparameters cast to `dtype`."""
    return (math_ops.cast(self._learning_rate_tensor, dtype),
            math_ops.cast(self._l1_regularization_strength, dtype),
            math_ops.cast(self._l2_regularization_strength, dtype))

  def _worker_global_step(self, var):
    """Reads the worker-local global step copy on `var`'s device."""
    with ops.device(var.device):
      return array_ops.identity(self._global_step_on_worker)

  def _apply_dense(self, grad, var):
    """Applies a dense gradient to a ref variable."""
    g_acc = self.get_slot(var, "gradient_accumulator")
    gg_acc = self.get_slot(var, "gradient_squared_accumulator")
    global_step = self._worker_global_step(var)
    lr, l1, l2 = self._cast_hypers(var.dtype.base_dtype)
    return training_ops.apply_adagrad_da(
        var,
        g_acc,
        gg_acc,
        grad,
        lr,
        l1,
        l2,
        global_step,
        use_locking=self._use_locking)

  def _resource_apply_dense(self, grad, var):
    """Applies a dense gradient to a resource variable."""
    g_acc = self.get_slot(var, "gradient_accumulator")
    gg_acc = self.get_slot(var, "gradient_squared_accumulator")
    global_step = self._worker_global_step(var)
    lr, l1, l2 = self._cast_hypers(grad.dtype.base_dtype)
    return training_ops.resource_apply_adagrad_da(
        var.handle,
        g_acc.handle,
        gg_acc.handle,
        grad,
        lr,
        l1,
        l2,
        global_step,
        use_locking=self._use_locking)

  def _apply_sparse(self, grad, var):
    """Applies a sparse (IndexedSlices) gradient to a ref variable."""
    g_acc = self.get_slot(var, "gradient_accumulator")
    gg_acc = self.get_slot(var, "gradient_squared_accumulator")
    global_step = self._worker_global_step(var)
    lr, l1, l2 = self._cast_hypers(var.dtype.base_dtype)
    return training_ops.sparse_apply_adagrad_da(
        var,
        g_acc,
        gg_acc,
        grad.values,
        grad.indices,
        lr,
        l1,
        l2,
        global_step,
        use_locking=self._use_locking)

  def _resource_apply_sparse(self, grad, var, indices):
    """Applies a sparse gradient to a resource variable."""
    g_acc = self.get_slot(var, "gradient_accumulator")
    gg_acc = self.get_slot(var, "gradient_squared_accumulator")
    global_step = self._worker_global_step(var)
    lr, l1, l2 = self._cast_hypers(grad.dtype)
    return training_ops.resource_sparse_apply_adagrad_da(
        var.handle,
        g_acc.handle,
        gg_acc.handle,
        grad,
        indices,
        lr,
        l1,
        l2,
        global_step,
        use_locking=self._use_locking)