
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Conditional Gradient optimizer."""

import tensorflow as tf
from tensorflow_addons.optimizers import KerasLegacyOptimizer
from tensorflow_addons.utils.types import FloatTensorLike

from typeguard import typechecked
from typing import Union, Callable


@tf.keras.utils.register_keras_serializable(package="Addons")
class ConditionalGradient(KerasLegacyOptimizer):

27 """Optimizer that implements the Conditional Gradient optimization. 

28 

29 This optimizer helps handle constraints well. 

30 

31 Currently only supports frobenius norm constraint or nuclear norm 

32 constraint. 

33 See https://arxiv.org/pdf/1803.06453.pdf 

34 

35 ``` 

36 variable -= (1-learning_rate) * (variable + lambda_ * gradient 

37 / (frobenius_norm(gradient) + epsilon)) 

38 ``` 

39 

40 Note that `lambda_` here refers to the constraint "lambda" in 

41 the paper. `epsilon` is constant with tiny value as compared to 

42 the value of frobenius norm of gradient. The purpose of `epsilon` 

43 here is to avoid the case that the value of frobenius norm of 

44 gradient is 0. 

45 

46 In this implementation, `epsilon` defaults to $10^{-7}$. 

47 

48 For nucler norm constraint, the formula is as following: 

49 

50 ``` 

51 variable -= (1-learning_rate) * (variable 

52 + lambda_ * top_singular_vector(gradient)) 

53 ``` 
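
    A minimal usage sketch (the hyperparameter values are illustrative only,
    and `tfa` is assumed to be the usual alias for `tensorflow_addons`):

    ```python
    import tensorflow_addons as tfa

    # Frobenius norm constraint (the default, `ord="fro"`).
    opt = tfa.optimizers.ConditionalGradient(learning_rate=0.99, lambda_=0.01)

    # Nuclear norm constraint.
    opt = tfa.optimizers.ConditionalGradient(
        learning_rate=0.99, lambda_=0.01, ord="nuclear"
    )
    ```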

54 """ 

55 

56 @typechecked 

57 def __init__( 

58 self, 

59 learning_rate: Union[FloatTensorLike, Callable], 

60 lambda_: Union[FloatTensorLike, Callable] = 0.01, 

61 epsilon: FloatTensorLike = 1e-7, 

62 ord: str = "fro", 

63 name: str = "ConditionalGradient", 

64 **kwargs, 

65 ): 

66 """Construct a new conditional gradient optimizer. 

67 

68 Args: 

69 learning_rate: A `Tensor` or a floating point value. or a schedule 

70 that is a `tf.keras.optimizers.schedules.LearningRateSchedule` 

71 The learning rate. 

72 lambda_: A `Tensor` or a floating point value. The constraint. 

73 epsilon: A `Tensor` or a floating point value. A small constant 

74 for numerical stability when handling the case of norm of 

75 gradient to be zero. 

76 ord: Order of the norm. Supported values are `'fro'` 

77 and `'nuclear'`. Default is `'fro'`, which is frobenius norm. 

78 name: Optional name prefix for the operations created when 

79 applying gradients. Defaults to 'ConditionalGradient'. 

80 **kwargs: keyword arguments. Allowed to be {`clipnorm`, 

81 `clipvalue`, `lr`, `decay`}. `clipnorm` is clip gradients 

82 by norm; `clipvalue` is clip gradients by value, `decay` is 

83 included for backward compatibility to allow time inverse 

84 decay of learning rate. `lr` is included for backward 

85 compatibility, recommended to use `learning_rate` instead. 

86 """ 

        super().__init__(name=name, **kwargs)
        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
        self._set_hyper("lambda_", lambda_)
        self.epsilon = epsilon or tf.keras.backend.epsilon()
        supported_norms = ["fro", "nuclear"]
        if ord not in supported_norms:
            raise ValueError(
                "'ord' must be a supported matrix norm in %s, got '%s' instead"
                % (supported_norms, ord)
            )
        self.ord = ord


    def get_config(self):
        config = {
            "learning_rate": self._serialize_hyperparameter("learning_rate"),
            "lambda_": self._serialize_hyperparameter("lambda_"),
            "epsilon": self.epsilon,
            "ord": self.ord,
        }
        base_config = super().get_config()
        return {**base_config, **config}

    def _create_slots(self, var_list):
        for v in var_list:
            self.add_slot(v, "conditional_gradient")


    def _prepare_local(self, var_device, var_dtype, apply_state):
        super()._prepare_local(var_device, var_dtype, apply_state)
        apply_state[(var_device, var_dtype)]["learning_rate"] = tf.identity(
            self._get_hyper("learning_rate", var_dtype)
        )
        apply_state[(var_device, var_dtype)]["lambda_"] = tf.identity(
            self._get_hyper("lambda_", var_dtype)
        )
        apply_state[(var_device, var_dtype)]["epsilon"] = tf.convert_to_tensor(
            self.epsilon, var_dtype
        )

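    # Note: the Frobenius norm below is taken over the whole tensor, i.e. the
    # square root of the sum of squared entries across all axes.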

    @staticmethod
    def _frobenius_norm(m):
        return tf.reduce_sum(m**2) ** 0.5


    @staticmethod
    def _top_singular_vector(m):
        # Handle the case where m is a tensor of rank 0 or rank 1 by padding
        # it up to rank 2. Example:
        #   scalar (rank 0) a, shape []       => [[a]],    shape [1, 1]
        #   vector (rank 1) [a, b], shape [2] => [[a, b]], shape [1, 2]
        original_rank = tf.rank(m)
        shape = tf.shape(m)
        first_pad = tf.cast(tf.less(original_rank, 2), dtype=tf.int32)
        second_pad = tf.cast(tf.equal(original_rank, 0), dtype=tf.int32)
        new_shape = tf.concat(
            [
                tf.ones(shape=first_pad, dtype=tf.int32),
                tf.ones(shape=second_pad, dtype=tf.int32),
                shape,
            ],
            axis=0,
        )
        n = tf.reshape(m, new_shape)
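        # Despite the name, this returns the rank-1 matrix u_1 * v_1^T built
        # from the leading left and right singular vectors of n, which is the
        # direction used for the nuclear-norm constrained update.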

        st, ut, vt = tf.linalg.svd(n, full_matrices=False)
        n_size = tf.shape(n)
        ut = tf.reshape(ut[:, 0], [n_size[0], 1])
        vt = tf.reshape(vt[:, 0], [n_size[1], 1])
        st = tf.matmul(ut, tf.transpose(vt))
        # Before returning, remove the dimensions that were added above so the
        # output has the same rank as the input.
        st_shape = tf.shape(st)
        begin = tf.cast(tf.less(original_rank, 2), dtype=tf.int32)
        end = 2 - tf.cast(tf.equal(original_rank, 0), dtype=tf.int32)
        new_shape = st_shape[begin:end]
        return tf.reshape(st, new_shape)


    def _resource_apply_dense(self, grad, var, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)
        lr = coefficients["learning_rate"]
        lambda_ = coefficients["lambda_"]
        epsilon = coefficients["epsilon"]
        if self.ord == "fro":
            norm = tf.convert_to_tensor(
                self._frobenius_norm(grad), name="norm", dtype=var.dtype.base_dtype
            )
            s = grad / (norm + epsilon)
        else:
            top_singular_vector = tf.convert_to_tensor(
                self._top_singular_vector(grad),
                name="top_singular_vector",
                dtype=var.dtype.base_dtype,
            )
            s = top_singular_vector

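        # Conditional gradient step, matching the docstring formula:
        #   var <- lr * var - (1 - lr) * lambda_ * s
        # where `s` is the normalized gradient or the top singular direction.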

        var_update = tf.math.multiply(var, lr) - (1 - lr) * lambda_ * s
        return var.assign(var_update, use_locking=self._use_locking)


    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)
        lr = coefficients["learning_rate"]
        lambda_ = coefficients["lambda_"]
        epsilon = coefficients["epsilon"]
        var_slice = tf.gather(var, indices)
        if self.ord == "fro":
            norm = tf.convert_to_tensor(
                self._frobenius_norm(grad), name="norm", dtype=var.dtype.base_dtype
            )
            s = grad / (norm + epsilon)
        else:
            top_singular_vector = tf.convert_to_tensor(
                self._top_singular_vector(grad),
                name="top_singular_vector",
                dtype=var.dtype.base_dtype,
            )
            s = top_singular_vector

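        # Same conditional gradient step as the dense case, applied to the
        # gathered slices and scattered back into `var`.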

        var_update_value = tf.math.multiply(var_slice, lr) - (1 - lr) * lambda_ * s
        var_update_op = self._resource_scatter_update(var, indices, var_update_value)
        return var_update_op