# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Conditional Gradient optimizer."""

import tensorflow as tf
from tensorflow_addons.optimizers import KerasLegacyOptimizer
from tensorflow_addons.utils.types import FloatTensorLike

from typeguard import typechecked
from typing import Union, Callable

@tf.keras.utils.register_keras_serializable(package="Addons")
class ConditionalGradient(KerasLegacyOptimizer):
    """Optimizer that implements the Conditional Gradient optimization.

    This optimizer is designed for constrained problems. It currently
    supports only the Frobenius norm constraint and the nuclear norm
    constraint.
    See https://arxiv.org/pdf/1803.06453.pdf

    For the Frobenius norm constraint, the update is:

    ```
    variable -= (1-learning_rate) * (variable + lambda_ * gradient
        / (frobenius_norm(gradient) + epsilon))
    ```

    Note that `lambda_` here refers to the constraint "lambda" in
    the paper. `epsilon` is a small constant, tiny compared to the
    Frobenius norm of the gradient; its only purpose is to avoid
    division by zero when the Frobenius norm of the gradient is 0.

    In this implementation, `epsilon` defaults to $10^{-7}$.

    For the nuclear norm constraint, the update is:

    ```
    variable -= (1-learning_rate) * (variable
        + lambda_ * top_singular_vector(gradient))
    ```
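
    Usage example (an illustrative sketch, assuming `tensorflow_addons` has
    been imported as `tfa`; the hyperparameter values below are arbitrary and
    not taken from the paper):

    >>> opt = tfa.optimizers.ConditionalGradient(learning_rate=0.99, lambda_=0.01)
    >>> var = tf.Variable([[1.0, 2.0], [3.0, 4.0]])
    >>> loss = lambda: tf.reduce_sum(var ** 2)
    >>> step_op = opt.minimize(loss, var_list=[var])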
54 """
    @typechecked
    def __init__(
        self,
        learning_rate: Union[FloatTensorLike, Callable],
        lambda_: Union[FloatTensorLike, Callable] = 0.01,
        epsilon: FloatTensorLike = 1e-7,
        ord: str = "fro",
        name: str = "ConditionalGradient",
        **kwargs,
    ):
        """Construct a new conditional gradient optimizer.

        Args:
            learning_rate: A `Tensor`, a floating point value, or a schedule
                that is a `tf.keras.optimizers.schedules.LearningRateSchedule`.
                The learning rate.
            lambda_: A `Tensor` or a floating point value. The constraint.
            epsilon: A `Tensor` or a floating point value. A small constant
                for numerical stability, used when the norm of the gradient
                is zero.
            ord: Order of the norm. Supported values are `'fro'`
                and `'nuclear'`. Default is `'fro'`, which is the Frobenius
                norm.
            name: Optional name prefix for the operations created when
                applying gradients. Defaults to 'ConditionalGradient'.
            **kwargs: keyword arguments. Allowed to be {`clipnorm`,
                `clipvalue`, `lr`, `decay`}. `clipnorm` clips gradients
                by norm; `clipvalue` clips gradients by value; `decay` is
                included for backward compatibility to allow time inverse
                decay of the learning rate; `lr` is included for backward
                compatibility, but it is recommended to use `learning_rate`
                instead.
        """
        super().__init__(name=name, **kwargs)
        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))
        self._set_hyper("lambda_", lambda_)
        self.epsilon = epsilon or tf.keras.backend.epsilon()
        supported_norms = ["fro", "nuclear"]
        if ord not in supported_norms:
            raise ValueError(
                "'ord' must be a supported matrix norm in %s, got '%s' instead"
                % (supported_norms, ord)
            )
        self.ord = ord
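
    # Illustrative sketch (comments only, nothing executed here): the same
    # optimizer with the nuclear norm constraint instead of the default
    # Frobenius norm constraint can be constructed as
    #
    #     opt = ConditionalGradient(learning_rate=0.2, lambda_=0.1, ord="nuclear")
    #
    # Any other value for `ord` raises the ValueError above.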
    def get_config(self):
        config = {
            "learning_rate": self._serialize_hyperparameter("learning_rate"),
            "lambda_": self._serialize_hyperparameter("lambda_"),
            "epsilon": self.epsilon,
            "ord": self.ord,
        }
        base_config = super().get_config()
        return {**base_config, **config}
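
    # Illustrative sketch: since the class is registered as a Keras
    # serializable ("Addons>ConditionalGradient"), one way to round-trip the
    # optimizer is
    #
    #     opt = ConditionalGradient(learning_rate=0.2, ord="nuclear")
    #     restored = ConditionalGradient.from_config(opt.get_config())
    #
    # `restored` then carries the same learning_rate, lambda_, epsilon and
    # ord values as `opt`.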
    def _create_slots(self, var_list):
        for v in var_list:
            self.add_slot(v, "conditional_gradient")
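
    # Note: the "conditional_gradient" slot is allocated for every variable
    # here, but it is not referenced again by the apply methods in this file.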
    def _prepare_local(self, var_device, var_dtype, apply_state):
        super()._prepare_local(var_device, var_dtype, apply_state)
        apply_state[(var_device, var_dtype)]["learning_rate"] = tf.identity(
            self._get_hyper("learning_rate", var_dtype)
        )
        apply_state[(var_device, var_dtype)]["lambda_"] = tf.identity(
            self._get_hyper("lambda_", var_dtype)
        )
        apply_state[(var_device, var_dtype)]["epsilon"] = tf.convert_to_tensor(
            self.epsilon, var_dtype
        )
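
    # Illustrative sketch: after this runs for a float32 variable, the entry
    # apply_state[(var.device, tf.float32)] contains (in addition to whatever
    # the base class stores) the three tensors written above, e.g.
    #
    #     {"learning_rate": <tf.Tensor 0.99>, "lambda_": <tf.Tensor 0.01>,
    #      "epsilon": <tf.Tensor 1e-07>, ...}
    #
    # so the apply methods below can read the coefficients without
    # re-resolving the hyperparameters for every variable.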
    @staticmethod
    def _frobenius_norm(m):
        return tf.reduce_sum(m**2) ** 0.5
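
    # Illustrative check: for m = tf.constant([[3.0, 4.0], [0.0, 0.0]]) this
    # returns sqrt(3**2 + 4**2) = 5.0, i.e. the Frobenius norm of the whole
    # tensor, equivalent to tf.norm(tf.reshape(m, [-1])).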
    @staticmethod
    def _top_singular_vector(m):
        # Handle the case where m is a tensor of rank 0 or rank 1 by padding
        # it to rank 2. Example:
        #   scalar (rank 0) a, shape []       => [[a]],   shape [1, 1]
        #   vector (rank 1) [a, b], shape [2] => [[a, b]], shape [1, 2]
        original_rank = tf.rank(m)
        shape = tf.shape(m)
        first_pad = tf.cast(tf.less(original_rank, 2), dtype=tf.int32)
        second_pad = tf.cast(tf.equal(original_rank, 0), dtype=tf.int32)
        new_shape = tf.concat(
            [
                tf.ones(shape=first_pad, dtype=tf.int32),
                tf.ones(shape=second_pad, dtype=tf.int32),
                shape,
            ],
            axis=0,
        )
        n = tf.reshape(m, new_shape)
        st, ut, vt = tf.linalg.svd(n, full_matrices=False)
        n_size = tf.shape(n)
        ut = tf.reshape(ut[:, 0], [n_size[0], 1])
        vt = tf.reshape(vt[:, 0], [n_size[1], 1])
        st = tf.matmul(ut, tf.transpose(vt))
        # Before returning, remove the dimensions that were added above so the
        # result has the same shape as the input.
        st_shape = tf.shape(st)
        begin = tf.cast(tf.less(original_rank, 2), dtype=tf.int32)
        end = 2 - tf.cast(tf.equal(original_rank, 0), dtype=tf.int32)
        new_shape = st_shape[begin:end]
        return tf.reshape(st, new_shape)
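
    # Illustrative check: despite its name, this helper returns the rank-1
    # matrix u1 @ v1.T built from the top left/right singular vectors, not a
    # single vector. For m = tf.constant([[2.0, 0.0], [0.0, 1.0]]) the top
    # singular pair is (up to sign) u1 = [1, 0], v1 = [1, 0], so the result is
    # [[1.0, 0.0], [0.0, 0.0]], with the same shape as m.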
    def _resource_apply_dense(self, grad, var, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)
        lr = coefficients["learning_rate"]
        lambda_ = coefficients["lambda_"]
        epsilon = coefficients["epsilon"]
        if self.ord == "fro":
            norm = tf.convert_to_tensor(
                self._frobenius_norm(grad), name="norm", dtype=var.dtype.base_dtype
            )
            s = grad / (norm + epsilon)
        else:
            top_singular_vector = tf.convert_to_tensor(
                self._top_singular_vector(grad),
                name="top_singular_vector",
                dtype=var.dtype.base_dtype,
            )
            s = top_singular_vector

        var_update = tf.math.multiply(var, lr) - (1 - lr) * lambda_ * s
        return var.assign(var_update, use_locking=self._use_locking)
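
    # Illustrative worked example of the dense update with ord="fro": with
    # lr = 0.9, lambda_ = 0.1, var = [3.0, 4.0] and grad = [3.0, 4.0]
    # (Frobenius norm 5.0, epsilon negligible), s = grad / 5 = [0.6, 0.8] and
    #
    #     var_update = 0.9 * [3.0, 4.0] - (1 - 0.9) * 0.1 * [0.6, 0.8]
    #                = [2.694, 3.592]
    #
    # i.e. the new value is a convex combination of the old variable (weight
    # lr) and the point -lambda_ * s, whose Frobenius norm is approximately
    # lambda_ (weight 1 - lr), rather than a raw gradient step.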
    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = (apply_state or {}).get(
            (var_device, var_dtype)
        ) or self._fallback_apply_state(var_device, var_dtype)
        lr = coefficients["learning_rate"]
        lambda_ = coefficients["lambda_"]
        epsilon = coefficients["epsilon"]
        var_slice = tf.gather(var, indices)
        if self.ord == "fro":
            norm = tf.convert_to_tensor(
                self._frobenius_norm(grad), name="norm", dtype=var.dtype.base_dtype
            )
            s = grad / (norm + epsilon)
        else:
            top_singular_vector = tf.convert_to_tensor(
                self._top_singular_vector(grad),
                name="top_singular_vector",
                dtype=var.dtype.base_dtype,
            )
            s = top_singular_vector

        var_update_value = tf.math.multiply(var_slice, lr) - (1 - lr) * lambda_ * s
        var_update_op = self._resource_scatter_update(var, indices, var_update_value)
        return var_update_op
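
    # Illustrative sketch (hypothetical snippet, not part of this module): the
    # sparse path above is taken when the gradient arrives as tf.IndexedSlices,
    # for example from an embedding lookup:
    #
    #     emb = tf.Variable(tf.random.normal([1000, 16]))
    #     opt = ConditionalGradient(learning_rate=0.9)
    #     with tf.GradientTape() as tape:
    #         rows = tf.nn.embedding_lookup(emb, [3, 7])
    #         loss = tf.reduce_sum(rows ** 2)
    #     grads = tape.gradient(loss, [emb])  # IndexedSlices for `emb`
    #     opt.apply_gradients(zip(grads, [emb]))
    #
    # Only the rows selected by `indices` (3 and 7 here) are rewritten, and
    # `s` is computed from the sparse gradient values alone.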