
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Conditional Gradient optimizer."""

import tensorflow as tf
from tensorflow_addons.optimizers import KerasLegacyOptimizer
from tensorflow_addons.utils.types import FloatTensorLike

from typeguard import typechecked
from typing import Union, Callable


@tf.keras.utils.register_keras_serializable(package="Addons")
class ConditionalGradient(KerasLegacyOptimizer):

27 """Optimizer that implements the Conditional Gradient optimization. 

28 

29 This optimizer helps handle constraints well. 

30 

31 Currently only supports frobenius norm constraint or nuclear norm 

32 constraint. 

33 See https://arxiv.org/pdf/1803.06453.pdf 

34 

35 ``` 

36 variable -= (1-learning_rate) * (variable + lambda_ * gradient 

37 / (frobenius_norm(gradient) + epsilon)) 

38 ``` 

39 

40 Note that `lambda_` here refers to the constraint "lambda" in 

41 the paper. `epsilon` is constant with tiny value as compared to 

42 the value of frobenius norm of gradient. The purpose of `epsilon` 

43 here is to avoid the case that the value of frobenius norm of 

44 gradient is 0. 

45 

46 In this implementation, `epsilon` defaults to $10^{-7}$. 

47 

48 For nucler norm constraint, the formula is as following: 

49 

50 ``` 

51 variable -= (1-learning_rate) * (variable 

52 + lambda_ * top_singular_vector(gradient)) 

53 ``` 
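
    A minimal usage sketch (the hyperparameter values are illustrative only,
    and `tfa` is assumed to be the usual alias for `tensorflow_addons`):

    ```python
    import tensorflow_addons as tfa

    # Frobenius norm constraint (the default, `ord="fro"`).
    opt = tfa.optimizers.ConditionalGradient(learning_rate=0.99, lambda_=0.01)

    # Nuclear norm constraint.
    opt = tfa.optimizers.ConditionalGradient(
        learning_rate=0.99, lambda_=0.01, ord="nuclear"
    )
    ```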

54 """ 

55 

56 @typechecked 

57 def __init__( 

58 self, 

59 learning_rate: Union[FloatTensorLike, Callable], 

60 lambda_: Union[FloatTensorLike, Callable] = 0.01, 

61 epsilon: FloatTensorLike = 1e-7, 

62 ord: str = "fro", 

63 name: str = "ConditionalGradient", 

64 **kwargs, 

65 ): 

66 """Construct a new conditional gradient optimizer. 

67 

68 Args: 

69 learning_rate: A `Tensor` or a floating point value. or a schedule 

70 that is a `tf.keras.optimizers.schedules.LearningRateSchedule` 

71 The learning rate. 

72 lambda_: A `Tensor` or a floating point value. The constraint. 

73 epsilon: A `Tensor` or a floating point value. A small constant 

74 for numerical stability when handling the case of norm of 

75 gradient to be zero. 

76 ord: Order of the norm. Supported values are `'fro'` 

77 and `'nuclear'`. Default is `'fro'`, which is frobenius norm. 

78 name: Optional name prefix for the operations created when 

79 applying gradients. Defaults to 'ConditionalGradient'. 

80 **kwargs: keyword arguments. Allowed to be {`clipnorm`, 

81 `clipvalue`, `lr`, `decay`}. `clipnorm` is clip gradients 

82 by norm; `clipvalue` is clip gradients by value, `decay` is 

83 included for backward compatibility to allow time inverse 

84 decay of learning rate. `lr` is included for backward 

85 compatibility, recommended to use `learning_rate` instead. 

86 """ 

        super().__init__(name=name, **kwargs)
        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
        self._set_hyper("lambda_", lambda_)
        self.epsilon = epsilon or tf.keras.backend.epsilon()
        supported_norms = ["fro", "nuclear"]
        if ord not in supported_norms:
            raise ValueError(
                "'ord' must be a supported matrix norm in %s, got '%s' instead"
                % (supported_norms, ord)
            )
        self.ord = ord


    def get_config(self):
        config = {
            "learning_rate": self._serialize_hyperparameter("learning_rate"),
            "lambda_": self._serialize_hyperparameter("lambda_"),
            "epsilon": self.epsilon,
            "ord": self.ord,
        }
        base_config = super().get_config()
        return {**base_config, **config}

    def _create_slots(self, var_list):
        for v in var_list:
            self.add_slot(v, "conditional_gradient")


    def _prepare_local(self, var_device, var_dtype, apply_state):
        super()._prepare_local(var_device, var_dtype, apply_state)
        apply_state[(var_device, var_dtype)]["learning_rate"] = tf.identity(
            self._get_hyper("learning_rate", var_dtype)
        )
        apply_state[(var_device, var_dtype)]["lambda_"] = tf.identity(
            self._get_hyper("lambda_", var_dtype)
        )
        apply_state[(var_device, var_dtype)]["epsilon"] = tf.convert_to_tensor(
            self.epsilon, var_dtype
        )

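    # Note: the Frobenius norm below is taken over the whole tensor, i.e. the
    # square root of the sum of squared entries across all axes.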

    @staticmethod
    def _frobenius_norm(m):
        return tf.reduce_sum(m**2) ** 0.5


    @staticmethod
    def _top_singular_vector(m):
        # Handle the case where m is a tensor of rank 0 or rank 1 by padding
        # it up to rank 2. Example:
        #   scalar (rank 0) a, shape []       => [[a]],    shape [1, 1]
        #   vector (rank 1) [a, b], shape [2] => [[a, b]], shape [1, 2]
        original_rank = tf.rank(m)
        shape = tf.shape(m)
        first_pad = tf.cast(tf.less(original_rank, 2), dtype=tf.int32)
        second_pad = tf.cast(tf.equal(original_rank, 0), dtype=tf.int32)
        new_shape = tf.concat(
            [
                tf.ones(shape=first_pad, dtype=tf.int32),
                tf.ones(shape=second_pad, dtype=tf.int32),
                shape,
            ],
            axis=0,
        )
        n = tf.reshape(m, new_shape)
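        # Despite the name, this returns the rank-1 matrix u_1 * v_1^T built
        # from the leading left and right singular vectors of n, which is the
        # direction used for the nuclear-norm constrained update.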

        st, ut, vt = tf.linalg.svd(n, full_matrices=False)
        n_size = tf.shape(n)
        ut = tf.reshape(ut[:, 0], [n_size[0], 1])
        vt = tf.reshape(vt[:, 0], [n_size[1], 1])
        st = tf.matmul(ut, tf.transpose(vt))
        # Before returning, remove the dimensions that were added above so the
        # output has the same rank as the input.
        st_shape = tf.shape(st)
        begin = tf.cast(tf.less(original_rank, 2), dtype=tf.int32)
        end = 2 - tf.cast(tf.equal(original_rank, 0), dtype=tf.int32)
        new_shape = st_shape[begin:end]
        return tf.reshape(st, new_shape)


    def _resource_apply_dense(self, grad, var, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)
        lr = coefficients["learning_rate"]
        lambda_ = coefficients["lambda_"]
        epsilon = coefficients["epsilon"]
        if self.ord == "fro":
            norm = tf.convert_to_tensor(
                self._frobenius_norm(grad), name="norm", dtype=var.dtype.base_dtype
            )
            s = grad / (norm + epsilon)
        else:
            top_singular_vector = tf.convert_to_tensor(
                self._top_singular_vector(grad),
                name="top_singular_vector",
                dtype=var.dtype.base_dtype,
            )
            s = top_singular_vector

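        # Conditional gradient step, matching the docstring formula:
        #   var <- lr * var - (1 - lr) * lambda_ * s
        # where `s` is the normalized gradient or the top singular direction.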

        var_update = tf.math.multiply(var, lr) - (1 - lr) * lambda_ * s
        return var.assign(var_update, use_locking=self._use_locking)


    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)
        lr = coefficients["learning_rate"]
        lambda_ = coefficients["lambda_"]
        epsilon = coefficients["epsilon"]
        var_slice = tf.gather(var, indices)
        if self.ord == "fro":
            norm = tf.convert_to_tensor(
                self._frobenius_norm(grad), name="norm", dtype=var.dtype.base_dtype
            )
            s = grad / (norm + epsilon)
        else:
            top_singular_vector = tf.convert_to_tensor(
                self._top_singular_vector(grad),
                name="top_singular_vector",
                dtype=var.dtype.base_dtype,
            )
            s = top_singular_vector

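        # Same conditional gradient step as the dense case, applied to the
        # gathered slices and scattered back into `var`.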

        var_update_value = tf.math.multiply(var_slice, lr) - (1 - lr) * lambda_ * s
        var_update_op = self._resource_scatter_update(var, indices, var_update_value)
        return var_update_op