Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/tpu/tpu_embedding_v2_utils.py: 32%
295 statements
coverage.py v7.4.0, created at 2024-01-03 07:57 +0000
1# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Companion classes for mid level API for TPU Embeddings in TF2."""
17import abc
18import math
19import typing
20from typing import Any, Dict, Callable, Iterable, List, Optional, Text, Tuple, TypeVar, Union
22from absl import logging
24from tensorflow.core.protobuf.tpu import optimization_parameters_pb2
25from tensorflow.core.protobuf.tpu import tpu_embedding_configuration_pb2
26from tensorflow.python.distribute import device_util
27from tensorflow.python.distribute import sharded_variable
28from tensorflow.python.distribute import tpu_strategy
29from tensorflow.python.framework import device_spec
30from tensorflow.python.framework import ops
31from tensorflow.python.framework.tensor_shape import TensorShape
32from tensorflow.python.ops import init_ops_v2
33from tensorflow.python.ops import variables as tf_variables
34from tensorflow.python.tpu.ops import tpu_ops
35from tensorflow.python.types import core
36from tensorflow.python.util.tf_export import tf_export
39TableVariable = TypeVar("TableVariable", sharded_variable.ShardedVariable,
40 tf_variables.Variable)
41SlotVarCreationFnType = Callable[
42 [TableVariable, List[Text], List[init_ops_v2.Initializer]],
43 Dict[Text, TableVariable]]
44ClipValueType = Union[Tuple[float, float], float]
47class _Optimizer(metaclass=abc.ABCMeta):
48 """Base class for all optimizers, with common parameters."""
50 def __init__(
51 self,
52 learning_rate: Union[float, Callable[[], float]],
53 use_gradient_accumulation: bool,
54 clip_weight_min: Optional[float],
55 clip_weight_max: Optional[float],
56 weight_decay_factor: Optional[float],
57 multiply_weight_decay_factor_by_learning_rate: bool,
58 clipvalue: Optional[ClipValueType] = None,
59 slot_variable_creation_fn: Optional[SlotVarCreationFnType] = None,
60 low_dimensional_packing_status: bool = False,
61 ):
62 self.learning_rate = learning_rate
63 self.use_gradient_accumulation = use_gradient_accumulation
64 self.clip_weight_min = clip_weight_min
65 self.clip_weight_max = clip_weight_max
66 if not use_gradient_accumulation and clipvalue is not None:
67 raise ValueError(
68 f"When `use_gradient_accumulation` is False, gradient clipping "
69 f"cannot be used and `clipvalue` should be left as None. "
70 f"Received value {clipvalue} for argument `clipvalue`.")
71 if clipvalue is None:
72 clipvalue = (None, None)
73 elif not isinstance(clipvalue, tuple):
74 clipvalue = (-1. * clipvalue, clipvalue)
75 self.clip_gradient_min, self.clip_gradient_max = clipvalue
77 self.weight_decay_factor = weight_decay_factor
78 self.multiply_weight_decay_factor_by_learning_rate = (
79 multiply_weight_decay_factor_by_learning_rate)
81 if (slot_variable_creation_fn is not None and
82 not callable(slot_variable_creation_fn)):
83 raise ValueError(
84 f"Argument `slot_variable_creation_fn` must be either None or a "
85 f"callable. Received: {slot_variable_creation_fn}")
86 self.slot_variable_creation_fn = slot_variable_creation_fn
87 self.low_dimensional_packing_status = low_dimensional_packing_status
89 @abc.abstractmethod
90 def _slot_names(self) -> List[Text]:
91 """Returns the name of all the slot variables.
93 This does not include the 'parameters' variable and these names must match
94 the names of the slots variables as used in the corresponding
95 `tpu_ops.load_tpu_embedding_*` ops.
96 """
97 raise NotImplementedError
99 @abc.abstractmethod
100 def _slot_initializers(self) -> List[init_ops_v2.Initializer]:
101 """Returns initializers for slot variables.
103 This returns a parallel list to self._slot_names().
104 """
105 raise NotImplementedError
107 def _set_optimization_parameters(
108 self, parameters: optimization_parameters_pb2.OptimizationParameters):
109 """Sets the optimizer fields in the OptimizationParameters."""
110 if self.use_gradient_accumulation:
111 parameters.gradient_accumulation_status = (
112 optimization_parameters_pb2.GradientAccumulationStatus.ENABLED)
113 else:
114 parameters.gradient_accumulation_status = (
115 optimization_parameters_pb2.GradientAccumulationStatus.DISABLED)
117 if self.clip_weight_min is not None:
118 parameters.clipping_limits.lower.value = self.clip_weight_min
120 if self.clip_weight_max is not None:
121 parameters.clipping_limits.upper.value = self.clip_weight_max
123 if self.clip_gradient_min is not None:
124 parameters.gradient_clipping_limits.lower.value = self.clip_gradient_min
126 if self.clip_gradient_max is not None:
127 parameters.gradient_clipping_limits.upper.value = self.clip_gradient_max
129 if self.weight_decay_factor:
130 parameters.weight_decay_factor = self.weight_decay_factor
131 if self.multiply_weight_decay_factor_by_learning_rate:
132 parameters.multiply_weight_decay_factor_by_learning_rate = True
134 parameters.low_dimensional_packing_status = (
135 self.low_dimensional_packing_status
136 )
138 @abc.abstractmethod
139 def _load(self) -> Callable[..., ops.Operation]:
140 """Returns the load function for the optimizer."""
141 raise NotImplementedError
143 @abc.abstractmethod
144 def _retrieve(self) -> Callable[..., core.Tensor]:
145 """Returns the retrieve function for the optimizer."""
146 raise NotImplementedError
148 def _create_slots(
149 self, table: "TableConfig",
150 variable_creator: Callable[[Text, init_ops_v2.Initializer],
151 tf_variables.Variable]
152 ) -> Dict[Text, tf_variables.Variable]:
153 """Creates slot variables for table.
155 Args:
156 table: The table variable to create slots for.
157 variable_creator: A function which creates variables. Takes parameters
158 'name', 'initializer'.
160 Returns:
161 A dict of variables, keyed by self._slot_names().
162 """
163 if self.slot_variable_creation_fn is not None:
164 return self.slot_variable_creation_fn(table, self._slot_names(),
165 self._slot_initializers())
166 else:
167 slots = {}
168 for slot, initializer in zip(self._slot_names(),
169 self._slot_initializers()):
170 slots[slot] = variable_creator(slot, initializer)
171 return slots
173 def __eq__(self, other: Any) -> Union[Any, bool]:
174 if isinstance(other, self.__class__):
175 return all(
176 attr1 == attr2
177 for attr1, attr2 in zip(self.__dict__.items(), other.__dict__.items())
178 )
179 else:
180 return False
182 def __hash__(self) -> int:
183 return hash(tuple(self.__dict__.items()))
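
# --- Editor's illustrative sketch (not part of the TensorFlow API) ---
# The hypothetical helper below mirrors how `_Optimizer.__init__` above
# normalizes the `clipvalue` argument: a bare float `c` becomes the symmetric
# range (-c, c), a (min, max) tuple is used as-is, and None disables gradient
# clipping entirely. The name `_example_normalize_clipvalue` is illustrative.
def _example_normalize_clipvalue(
    clipvalue: Optional[ClipValueType]) -> Tuple[Optional[float],
                                                 Optional[float]]:
  if clipvalue is None:
    return (None, None)                   # No gradient clipping at all.
  if not isinstance(clipvalue, tuple):
    return (-1.0 * clipvalue, clipvalue)  # Scalar -> symmetric range.
  return clipvalue                        # Already a (min, max) tuple.

# For example:
#   _example_normalize_clipvalue(3.0)         -> (-3.0, 3.0)
#   _example_normalize_clipvalue((None, 5.0)) -> (None, 5.0)
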
186@tf_export("tpu.experimental.embedding.SGD")
187class SGD(_Optimizer):
188 """Optimization parameters for stochastic gradient descent for TPU embeddings.
190 Pass this to `tf.tpu.experimental.embedding.TPUEmbedding` via the `optimizer`
191 argument to set the global optimizer and its parameters:
193 ```python
194 embedding = tf.tpu.experimental.embedding.TPUEmbedding(
195 ...
196 optimizer=tf.tpu.experimental.embedding.SGD(0.1))
197 ```
199 This can also be used in a `tf.tpu.experimental.embedding.TableConfig` as the
200 optimizer parameter to set a table-specific optimizer. This will override the
201 global embedding optimizer and its parameters defined above:
203 ```python
204 table_one = tf.tpu.experimental.embedding.TableConfig(
205 vocabulary_size=...,
206 dim=...,
207 optimizer=tf.tpu.experimental.embedding.SGD(0.2))
208 table_two = tf.tpu.experimental.embedding.TableConfig(
209 vocabulary_size=...,
210 dim=...)
212 feature_config = (
213 tf.tpu.experimental.embedding.FeatureConfig(
214 table=table_one),
215 tf.tpu.experimental.embedding.FeatureConfig(
216 table=table_two))
218 embedding = tf.tpu.experimental.embedding.TPUEmbedding(
219 feature_config=feature_config,
220 batch_size=...,
221 optimizer=tf.tpu.experimental.embedding.SGD(0.1))
222 ```
224 In the above example, the first feature will be looked up in a table that has
225 a learning rate of 0.2 while the second feature will be looked up in a table
226 that has a learning rate of 0.1.
228 See 'tensorflow/core/protobuf/tpu/optimization_parameters.proto' for a
229 complete description of these parameters and their impacts on the optimizer
230 algorithm.
231 """
233 def __init__(
234 self,
235 learning_rate: Union[float, Callable[[], float]] = 0.01,
236 use_gradient_accumulation: bool = True,
237 clip_weight_min: Optional[float] = None,
238 clip_weight_max: Optional[float] = None,
239 weight_decay_factor: Optional[float] = None,
240 multiply_weight_decay_factor_by_learning_rate: Optional[bool] = None,
241 clipvalue: Optional[ClipValueType] = None,
242 low_dimensional_packing_status: bool = False,
243 ):
244 """Optimization parameters for stochastic gradient descent.
246 Args:
247 learning_rate: The learning rate. It should be a floating point value or a
248 callable taking no arguments for a dynamic learning rate.
249 use_gradient_accumulation: setting this to `False` makes embedding
250 gradient calculation less accurate but faster.
251 clip_weight_min: the minimum value to clip by; None means -infinity.
252 clip_weight_max: the maximum value to clip by; None means +infinity.
253 weight_decay_factor: amount of weight decay to apply; None means that the
254 weights are not decayed. Weights are decayed by multiplying the weight
255 by this factor each step.
256 multiply_weight_decay_factor_by_learning_rate: if true,
257 `weight_decay_factor` is multiplied by the current learning rate.
258 clipvalue: Controls clipping of the gradient. Set to either a single
259 positive scalar value to get clipping or a tuple of scalar values (min,
260 max) to set a separate maximum or minimum. If one of the two entries is
261 None, then there will be no clipping in that direction. Note if this is
262 set, you may see a decrease in performance as gradient accumulation
263 will be enabled (it is normally off for SGD as it has no effect on
264 accuracy). See
265 'tensorflow/core/protobuf/tpu/optimization_parameters.proto' for more
266 information on gradient accumulation and its impact on TPU embeddings.
267 low_dimensional_packing_status: Status of the low-dimensional embedding
268 packing optimization controls whether to optimize the packing of
269 1-dimensional, 2-dimensional, and 4-dimensional embedding tables in
270 memory.
271 """
272 super().__init__(
273 learning_rate,
274 use_gradient_accumulation,
275 clip_weight_min,
276 clip_weight_max,
277 weight_decay_factor,
278 multiply_weight_decay_factor_by_learning_rate,
279 clipvalue,
280 None,
281 low_dimensional_packing_status,
282 )
284 def _slot_names(self) -> List[Text]:
285 return []
287 def _slot_initializers(self) -> List[init_ops_v2.Initializer]:
288 return []
290 def _set_optimization_parameters(
291 self, parameters: optimization_parameters_pb2.OptimizationParameters):
292 super()._set_optimization_parameters(parameters)
293 parameters.stochastic_gradient_descent.SetInParent()
295 def _load(self) -> Callable[..., ops.Operation]:
296 return tpu_ops.load_tpu_embedding_stochastic_gradient_descent_parameters
298 def _retrieve(self) -> Callable[..., core.Tensor]:
299 return tpu_ops.retrieve_tpu_embedding_stochastic_gradient_descent_parameters
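
# --- Editor's illustrative sketch (not part of the TensorFlow API) ---
# `learning_rate` may be a zero-argument callable, which is treated as a
# dynamic learning rate. The hypothetical helper below builds an SGD config
# whose rate decays with a step value supplied by a caller-owned callable.
def _example_sgd_with_dynamic_rate(get_step, base_rate=0.1):
  def learning_rate_fn():
    # Re-evaluated each time the current learning rate is needed.
    return base_rate / (1.0 + 0.01 * float(get_step()))
  return SGD(learning_rate=learning_rate_fn)
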
302@tf_export("tpu.experimental.embedding.Adagrad")
303class Adagrad(_Optimizer):
304 """Optimization parameters for Adagrad with TPU embeddings.
306 Pass this to `tf.tpu.experimental.embedding.TPUEmbedding` via the `optimizer`
307 argument to set the global optimizer and its parameters:
309 ```python
310 embedding = tf.tpu.experimental.embedding.TPUEmbedding(
311 ...
312 optimizer=tf.tpu.experimental.embedding.Adagrad(0.1))
313 ```
315 This can also be used in a `tf.tpu.experimental.embedding.TableConfig` as the
316 optimizer parameter to set a table-specific optimizer. This will override the
317 global embedding optimizer and its parameters defined above:
319 ```python
320 table_one = tf.tpu.experimental.embedding.TableConfig(
321 vocabulary_size=...,
322 dim=...,
323 optimizer=tf.tpu.experimental.embedding.Adagrad(0.2))
324 table_two = tf.tpu.experimental.embedding.TableConfig(
325 vocabulary_size=...,
326 dim=...)
328 feature_config = (
329 tf.tpu.experimental.embedding.FeatureConfig(
330 table=table_one),
331 tf.tpu.experimental.embedding.FeatureConfig(
332 table=table_two))
334 embedding = tf.tpu.experimental.embedding.TPUEmbedding(
335 feature_config=feature_config,
336 batch_size=...,
337 optimizer=tf.tpu.experimental.embedding.Adagrad(0.1))
338 ```
340 In the above example, the first feature will be looked up in a table that has
341 a learning rate of 0.2 while the second feature will be looked up in a table
342 that has a learning rate of 0.1.
344 See 'tensorflow/core/protobuf/tpu/optimization_parameters.proto' for a
345 complete description of these parameters and their impacts on the optimizer
346 algorithm.
347 """
349 def __init__(
350 self,
351 learning_rate: Union[float, Callable[[], float]] = 0.001,
352 initial_accumulator_value: float = 0.1,
353 use_gradient_accumulation: bool = True,
354 clip_weight_min: Optional[float] = None,
355 clip_weight_max: Optional[float] = None,
356 weight_decay_factor: Optional[float] = None,
357 multiply_weight_decay_factor_by_learning_rate: Optional[bool] = None,
358 slot_variable_creation_fn: Optional[SlotVarCreationFnType] = None,
359 clipvalue: Optional[ClipValueType] = None,
360 low_dimensional_packing_status: bool = False,
361 ):
362 """Optimization parameters for Adagrad.
364 Args:
365 learning_rate: The learning rate. It should be a floating point value or a
366 callable taking no arguments for a dynamic learning rate.
367 initial_accumulator_value: initial accumulator for Adagrad.
368 use_gradient_accumulation: setting this to `False` makes embedding
369 gradient calculation less accurate but faster.
370 clip_weight_min: the minimum value to clip by; None means -infinity.
371 clip_weight_max: the maximum value to clip by; None means +infinity.
372 weight_decay_factor: amount of weight decay to apply; None means that the
373 weights are not decayed.
374 multiply_weight_decay_factor_by_learning_rate: if true,
375 `weight_decay_factor` is multiplied by the current learning rate.
376 slot_variable_creation_fn: If you wish to directly control the creation of
377 the slot variables, set this to a callable taking three parameters: a
378 table variable, a list of slot names to create for it, and a list of
379 initializers. This function should return a dict with the slot names as
380 keys and the created variables as values with types matching the table
381 variable. When set to None (the default), uses the built-in variable
382 creation.
383 clipvalue: Controls clipping of the gradient. Set to either a single
384 positive scalar value to get clipping or a tuple of scalar values (min,
385 max) to set a separate maximum or minimum. If one of the two entries is
386 None, then there will be no clipping in that direction.
387 low_dimensional_packing_status: Status of the low-dimensional embedding
388 packing optimization controls whether to optimize the packing of
389 1-dimensional, 2-dimensional, and 4-dimensional embedding tables in
390 memory.
391 """
392 super().__init__(
393 learning_rate,
394 use_gradient_accumulation,
395 clip_weight_min,
396 clip_weight_max,
397 weight_decay_factor,
398 multiply_weight_decay_factor_by_learning_rate,
399 clipvalue,
400 slot_variable_creation_fn,
401 low_dimensional_packing_status,
402 )
403 if initial_accumulator_value <= 0:
404 raise ValueError(
405 f"Argument `initial_accumulator_value` must be a positive float. "
406 f"Received: {initial_accumulator_value}")
407 self.initial_accumulator_value = initial_accumulator_value
409 def _slot_names(self) -> List[Text]:
410 return ["accumulators"]
412 def _slot_initializers(self) -> List[init_ops_v2.Initializer]:
413 return [init_ops_v2.Constant(self.initial_accumulator_value)]
415 def _set_optimization_parameters(
416 self, parameters: optimization_parameters_pb2.OptimizationParameters):
417 super()._set_optimization_parameters(parameters)
418 parameters.adagrad.SetInParent()
420 def _load(self) -> Callable[..., ops.Operation]:
421 return tpu_ops.load_tpu_embedding_adagrad_parameters
423 def _retrieve(self) -> Callable[..., core.Tensor]:
424 return tpu_ops.retrieve_tpu_embedding_adagrad_parameters
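
# --- Editor's illustrative sketch (not part of the TensorFlow API) ---
# A hypothetical helper showing how an `Adagrad` config is translated into an
# `OptimizationParameters` proto via the private `_set_optimization_parameters`
# hook used elsewhere in this module. Supplying `clipvalue` keeps gradient
# accumulation enabled and, here, only sets the upper gradient clipping limit.
def _example_adagrad_parameters():
  opt = Adagrad(
      learning_rate=0.05,
      initial_accumulator_value=0.1,
      clipvalue=(None, 10.0))
  parameters = optimization_parameters_pb2.OptimizationParameters()
  opt._set_optimization_parameters(parameters)  # pylint: disable=protected-access
  # parameters.adagrad is now present, and
  # parameters.gradient_clipping_limits.upper.value == 10.0.
  return parameters
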
427@tf_export("tpu.experimental.embedding.AdagradMomentum")
428class AdagradMomentum(_Optimizer):
429 """Optimization parameters for Adagrad + Momentum with TPU embeddings.
431 Pass this to `tf.tpu.experimental.embedding.TPUEmbedding` via the `optimizer`
432 argument to set the global optimizer and its parameters:
434 ```python
435 embedding = tf.tpu.experimental.embedding.TPUEmbedding(
436 ...
437 optimizer=tf.tpu.experimental.embedding.AdagradMomentum(0.1))
438 ```
440 This can also be used in a `tf.tpu.experimental.embedding.TableConfig` as the
441 optimizer parameter to set a table-specific optimizer. This will override the
442 global embedding optimizer and its parameters defined above:
444 ```python
445 table_one = tf.tpu.experimental.embedding.TableConfig(
446 vocabulary_size=...,
447 dim=...,
448 optimizer=tf.tpu.experimental.embedding.AdagradMomentum(0.2))
449 table_two = tf.tpu.experimental.embedding.TableConfig(
450 vocabulary_size=...,
451 dim=...)
453 feature_config = (
454 tf.tpu.experimental.embedding.FeatureConfig(
455 table=table_one),
456 tf.tpu.experimental.embedding.FeatureConfig(
457 table=table_two))
459 embedding = tf.tpu.experimental.embedding.TPUEmbedding(
460 feature_config=feature_config,
461 batch_size=...,
462 optimizer=tf.tpu.experimental.embedding.AdagradMomentum(0.1))
463 ```
465 In the above example, the first feature will be looked up in a table that has
466 a learning rate of 0.2 while the second feature will be looked up in a table
467 that has a learning rate of 0.1.
469 See 'tensorflow/core/protobuf/tpu/optimization_parameters.proto' for a
470 complete description of these parameters and their impacts on the optimizer
471 algorithm.
472 """
474 def __init__(
475 self,
476 learning_rate: Union[float, Callable[[], float]] = 0.001,
477 momentum: float = 0.0,
478 use_nesterov: bool = False,
479 exponent: float = 2,
480 beta2: float = 1,
481 epsilon: float = 1e-10,
482 use_gradient_accumulation: bool = True,
483 clip_weight_min: Optional[float] = None,
484 clip_weight_max: Optional[float] = None,
485 weight_decay_factor: Optional[float] = None,
486 multiply_weight_decay_factor_by_learning_rate: Optional[bool] = None,
487 slot_variable_creation_fn: Optional[SlotVarCreationFnType] = None,
488 clipvalue: Optional[ClipValueType] = None,
489 low_dimensional_packing_status: bool = False,
490 ):
491 """Optimization parameters for Adagrad + Momentum.
493 Args:
494 learning_rate: The learning rate. It should be a floating point value or a
495 callable taking no arguments for a dynamic learning rate.
496 momentum: Moving average parameter for the momentum accumulator.
497 use_nesterov: Whether to use the Nesterov variant of momentum. See
498 Sutskever et al., 2013.
499 exponent: Exponent for the Adagrad accumulator.
500 beta2: Moving average parameter for the Adagrad accumulator.
501 epsilon: A small constant added to the Adagrad accumulator for numerical stability.
502 use_gradient_accumulation: setting this to `False` makes embedding
503 gradient calculation less accurate but faster.
504 clip_weight_min: the minimum value to clip by; None means -infinity.
505 clip_weight_max: the maximum value to clip by; None means +infinity.
506 weight_decay_factor: amount of weight decay to apply; None means that the
507 weights are not decayed.
508 multiply_weight_decay_factor_by_learning_rate: if true,
509 `weight_decay_factor` is multiplied by the current learning rate.
510 slot_variable_creation_fn: If you wish to directly control the creation of
511 the slot variables, set this to a callable taking three parameters: a
512 table variable, a list of slot names to create for it, and a list of
513 initializers. This function should return a dict with the slot names as
514 keys and the created variables as values with types matching the table
515 variable. When set to None (the default), uses the built-in variable
516 creation.
517 clipvalue: Controls clipping of the gradient. Set to either a single
518 positive scalar value to get clipping or a tuple of scalar values (min,
519 max) to set a separate maximum or minimum. If one of the two entries is
520 None, then there will be no clipping in that direction.
521 low_dimensional_packing_status: Status of the low-dimensional embedding
522 packing optimization controls whether to optimize the packing of
523 1-dimensional, 2-dimensional, and 4-dimensional embedding tables in
524 memory.
525 """
526 super().__init__(
527 learning_rate,
528 use_gradient_accumulation,
529 clip_weight_min,
530 clip_weight_max,
531 weight_decay_factor,
532 multiply_weight_decay_factor_by_learning_rate,
533 clipvalue,
534 slot_variable_creation_fn,
535 low_dimensional_packing_status,
536 )
537 if epsilon <= 0:
538 raise ValueError("Adagrad momentum: epsilon must be positive")
539 if exponent <= 0:
540 raise ValueError("Adagrad momentum: Precondition exponent must >0")
541 self.momentum = momentum
542 self.use_nesterov = use_nesterov
543 self.exponent = exponent
544 self.beta2 = beta2
545 self.epsilon = epsilon
547 def _slot_names(self) -> List[Text]:
548 return ["accumulators", "momenta"]
550 def _slot_initializers(self) -> List[init_ops_v2.Initializer]:
551 return [init_ops_v2.Constant(), init_ops_v2.Constant()]
553 def _set_optimization_parameters(
554 self, parameters: optimization_parameters_pb2.OptimizationParameters):
555 super()._set_optimization_parameters(parameters)
556 parameters.adagrad_momentum.SetInParent()
557 parameters.adagrad_momentum.momentum = self.momentum
558 parameters.adagrad_momentum.use_nesterov = self.use_nesterov
559 parameters.adagrad_momentum.exponent = self.exponent
560 parameters.adagrad_momentum.beta2 = self.beta2
561 parameters.adagrad_momentum.epsilon = self.epsilon
563 def _load(self) -> Callable[..., ops.Operation]:
564 return tpu_ops.load_tpu_embedding_adagrad_momentum_parameters
566 def _retrieve(self) -> Callable[..., core.Tensor]:
567 return tpu_ops.retrieve_tpu_embedding_adagrad_momentum_parameters
570@tf_export("tpu.experimental.embedding.FTRL")
571class FTRL(_Optimizer):
572 """Optimization parameters for FTRL with TPU embeddings.
574 See Algorithm 1 of this
575 [paper](https://research.google.com/pubs/archive/41159.pdf).
577 Pass this to `tf.tpu.experimental.embedding.TPUEmbedding` via the `optimizer`
578 argument to set the global optimizer and its parameters:
580 ```python
581 embedding = tf.tpu.experimental.embedding.TPUEmbedding(
582 ...
583 optimizer=tf.tpu.experimental.embedding.FTRL(0.1))
584 ```
586 This can also be used in a `tf.tpu.experimental.embedding.TableConfig` as the
587 optimizer parameter to set a table-specific optimizer. This will override the
588 global embedding optimizer and its parameters defined above:
590 ```python
591 table_one = tf.tpu.experimental.embedding.TableConfig(
592 vocabulary_size=...,
593 dim=...,
594 optimizer=tf.tpu.experimental.embedding.FTRL(0.2))
595 table_two = tf.tpu.experimental.embedding.TableConfig(
596 vocabulary_size=...,
597 dim=...)
599 feature_config = (
600 tf.tpu.experimental.embedding.FeatureConfig(
601 table=table_one),
602 tf.tpu.experimental.embedding.FeatureConfig(
603 table=table_two))
605 embedding = tf.tpu.experimental.embedding.TPUEmbedding(
606 feature_config=feature_config,
607 batch_size=...,
608 optimizer=tf.tpu.experimental.embedding.FTRL(0.1))
609 ```
611 In the above example, the first feature will be looked up in a table that has
612 a learning rate of 0.2 while the second feature will be looked up in a table
613 that has a learning rate of 0.1.
615 See 'tensorflow/core/protobuf/tpu/optimization_parameters.proto' for a
616 complete description of these parameters and their impacts on the optimizer
617 algorithm.
618 """
620 def __init__(
621 self,
622 learning_rate: Union[float, Callable[[], float]] = 0.001,
623 learning_rate_power: float = -0.5,
624 l1_regularization_strength: float = 0.0,
625 l2_regularization_strength: float = 0.0,
626 beta: float = 0.0,
627 initial_accumulator_value: float = 0.1,
628 use_gradient_accumulation: bool = True,
629 clip_weight_min: Optional[float] = None,
630 clip_weight_max: Optional[float] = None,
631 weight_decay_factor: Optional[float] = None,
632 multiply_weight_decay_factor_by_learning_rate: Optional[bool] = None,
633 slot_variable_creation_fn: Optional[SlotVarCreationFnType] = None,
634 clipvalue: Optional[ClipValueType] = None,
635 multiply_linear_by_learning_rate: bool = False,
636 allow_zero_accumulator: bool = False,
637 low_dimensional_packing_status: bool = False,
638 ):
639 """Optimization parameters for Adagrad.
641 Args:
642 learning_rate: The learning rate. It should be a floating point value or a
643 callable taking no arguments for a dynamic learning rate.
644 learning_rate_power: A float value, must be less than or equal to zero.
645 Controls how the learning rate decreases during training. Use zero for a
646 fixed learning rate.
647 l1_regularization_strength: A float value, must be greater than or equal
648 to zero.
649 l2_regularization_strength: A float value, must be greater than or equal
650 to zero.
651 beta: A float value, representing the beta value from the paper.
652 initial_accumulator_value: The starting value for accumulators. Only
653 positive values are allowed.
654 use_gradient_accumulation: setting this to `False` makes embedding
655 gradient calculation less accurate but faster.
656 clip_weight_min: the minimum value to clip by; None means -infinity.
657 clip_weight_max: the maximum value to clip by; None means +infinity.
658 weight_decay_factor: amount of weight decay to apply; None means that the
659 weights are not decayed.
660 multiply_weight_decay_factor_by_learning_rate: if true,
661 `weight_decay_factor` is multiplied by the current learning rate.
662 slot_variable_creation_fn: If you wish to directly control the creation of
663 the slot variables, set this to a callable taking three parameters: a
664 table variable, a list of slot names to create for it, and a list of
665 initializers. This function should return a dict with the slot names as
666 keys and the created variables as values with types matching the table
667 variable. When set to None (the default), uses the built-in variable
668 creation.
669 clipvalue: Controls clipping of the gradient. Set to either a single
670 positive scalar value to get clipping or a tuple of scalar values (min,
671 max) to set a separate maximum or minimum. If one of the two entries is
672 None, then there will be no clipping in that direction.
673 multiply_linear_by_learning_rate: If set to True, a modified formula is
674 used for FTRL that treats the "linear" accumulator as being
675 pre-multiplied by the learning rate (i.e., the accumulator named
676 "linear" actually stores "linear * learning_rate"). Other than
677 checkpoint compatibility, this is mathematically equivalent for a static
678 learning rate; for a dynamic learning rate, it is nearly the same as
679 long as the learning rate does not change quickly. The benefit of this
680 is that the modified formula handles zero and near-zero learning rates
681 without producing NaNs, improving flexibility for learning rate ramp-up.
682 allow_zero_accumulator: If set to True, changes some internal formulas to
683 allow zero and near-zero accumulator values at the cost of some
684 performance; this only needs to be set if you are using an initial
685 accumulator value of zero, which is uncommon.
686 low_dimensional_packing_status: Status of the low-dimensional embedding
687 packing optimization controls whether to optimize the packing of
688 1-dimensional, 2-dimensional, and 4-dimensional embedding tables in
689 memory.
690 """
691 super().__init__(
692 learning_rate,
693 use_gradient_accumulation,
694 clip_weight_min,
695 clip_weight_max,
696 weight_decay_factor,
697 multiply_weight_decay_factor_by_learning_rate,
698 clipvalue,
699 slot_variable_creation_fn,
700 low_dimensional_packing_status,
701 )
702 if initial_accumulator_value <= 0:
703 raise ValueError(
704 f"Argument `initial_accumulator_value` must be a positive float. "
705 f"Received: {initial_accumulator_value}")
706 self.initial_accumulator_value = initial_accumulator_value
707 self.learning_rate_power = learning_rate_power
708 self.l1_regularization_strength = l1_regularization_strength
709 self.l2_regularization_strength = l2_regularization_strength
710 self.beta = beta
711 self.multiply_linear_by_learning_rate = multiply_linear_by_learning_rate
712 self.allow_zero_accumulator = allow_zero_accumulator
714 def _slot_names(self) -> List[Text]:
715 return ["accumulators", "linears"]
717 def _slot_initializers(self) -> List[init_ops_v2.Initializer]:
718 return [
719 init_ops_v2.Constant(self.initial_accumulator_value),
720 init_ops_v2.Constant()
721 ]
723 def _set_optimization_parameters(
724 self, parameters: optimization_parameters_pb2.OptimizationParameters):
725 super()._set_optimization_parameters(parameters)
726 ftrl = parameters.ftrl
727 ftrl.l1 = self.l1_regularization_strength
728 ftrl.l2 = self.l2_regularization_strength
729 ftrl.lr_power = self.learning_rate_power
730 ftrl.beta = self.beta
731 ftrl.multiply_linear_by_lr = self.multiply_linear_by_learning_rate
732 ftrl.allow_zero_accumulator = self.allow_zero_accumulator
734 def _load(self) -> Callable[..., ops.Operation]:
735 return tpu_ops.load_tpu_embedding_ftrl_parameters
737 def _retrieve(self) -> Callable[..., core.Tensor]:
738 return tpu_ops.retrieve_tpu_embedding_ftrl_parameters
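
# --- Editor's illustrative sketch (not part of the TensorFlow API) ---
# A hypothetical helper showing the two slot variables an `FTRL` config asks
# for: "accumulators" (initialized to `initial_accumulator_value`) and
# "linears" (initialized to zero), as reported by the private slot hooks above.
def _example_ftrl_slots():
  opt = FTRL(
      learning_rate=0.05,
      l1_regularization_strength=0.001,
      l2_regularization_strength=0.001,
      initial_accumulator_value=0.1)
  # pylint: disable=protected-access
  return dict(zip(opt._slot_names(), opt._slot_initializers()))
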
741@tf_export("tpu.experimental.embedding.Adam")
742class Adam(_Optimizer):
743 """Optimization parameters for Adam with TPU embeddings.
745 Pass this to `tf.tpu.experimental.embedding.TPUEmbedding` via the `optimizer`
746 argument to set the global optimizer and its parameters:
748 NOTE: By default this optimizer is lazy, i.e. it will not apply the gradient
749 update of zero to rows that were not looked up. You can change this behavior
750 by setting `lazy_adam` to `False`.
752 ```python
753 embedding = tf.tpu.experimental.embedding.TPUEmbedding(
754 ...
755 optimizer=tf.tpu.experimental.embedding.Adam(0.1))
756 ```
758 This can also be used in a `tf.tpu.experimental.embedding.TableConfig` as the
759 optimizer parameter to set a table-specific optimizer. This will override the
760 global embedding optimizer and its parameters defined above:
762 ```python
763 table_one = tf.tpu.experimental.embedding.TableConfig(
764 vocabulary_size=...,
765 dim=...,
766 optimizer=tf.tpu.experimental.embedding.Adam(0.2))
767 table_two = tf.tpu.experimental.embedding.TableConfig(
768 vocabulary_size=...,
769 dim=...)
771 feature_config = (
772 tf.tpu.experimental.embedding.FeatureConfig(
773 table=table_one),
774 tf.tpu.experimental.embedding.FeatureConfig(
775 table=table_two))
777 embedding = tf.tpu.experimental.embedding.TPUEmbedding(
778 feature_config=feature_config,
779 batch_size=...,
780 optimizer=tf.tpu.experimental.embedding.Adam(0.1))
781 ```
783 In the above example, the first feature will be looked up in a table that has
784 a learning rate of 0.2 while the second feature will be looked up in a table
785 that has a learning rate of 0.1.
787 See 'tensorflow/core/protobuf/tpu/optimization_parameters.proto' for a
788 complete description of these parameters and their impacts on the optimizer
789 algorithm.
790 """
792 def __init__(
793 self,
794 learning_rate: Union[float, Callable[[], float]] = 0.001,
795 beta_1: float = 0.9,
796 beta_2: float = 0.999,
797 epsilon: float = 1e-07,
798 lazy_adam: bool = True,
799 sum_inside_sqrt: bool = True,
800 use_gradient_accumulation: bool = True,
801 clip_weight_min: Optional[float] = None,
802 clip_weight_max: Optional[float] = None,
803 weight_decay_factor: Optional[float] = None,
804 multiply_weight_decay_factor_by_learning_rate: Optional[bool] = None,
805 slot_variable_creation_fn: Optional[SlotVarCreationFnType] = None,
806 clipvalue: Optional[ClipValueType] = None,
807 low_dimensional_packing_status: bool = False,
808 ):
809 """Optimization parameters for Adam.
811 See 'tensorflow/core/protobuf/tpu/optimization_parameters.proto' for a
812 complete description of these parameters and their impacts on the optimizer
813 algorithm.
815 Args:
816 learning_rate: The learning rate. It should be a floating point value or a
817 callable taking no arguments for a dynamic learning rate.
818 beta_1: A float value. The exponential decay rate for the 1st moment
819 estimates.
820 beta_2: A float value. The exponential decay rate for the 2nd moment
821 estimates.
822 epsilon: A small constant for numerical stability.
823 lazy_adam: Use lazy Adam instead of Adam. Lazy Adam trains faster.
824 sum_inside_sqrt: When this is true, the Adam update formula is changed
825 from `m / (sqrt(v) + epsilon)` to `m / sqrt(v + epsilon**2)`. This
826 option improves the performance of TPU training and is not expected to
827 harm model quality.
828 use_gradient_accumulation: Setting this to `False` makes embedding
829 gradient calculation less accurate but faster.
830 clip_weight_min: the minimum value to clip by; None means -infinity.
831 clip_weight_max: the maximum value to clip by; None means +infinity.
832 weight_decay_factor: amount of weight decay to apply; None means that the
833 weights are not decayed.
834 multiply_weight_decay_factor_by_learning_rate: if true,
835 `weight_decay_factor` is multiplied by the current learning rate.
836 slot_variable_creation_fn: If you wish to directly control the creation of
837 the slot variables, set this to a callable taking three parameters: a
838 table variable, a list of slot names to create for it, and a list of
839 initializers. This function should return a dict with the slot names as
840 keys and the created variables as values with types matching the table
841 variable. When set to None (the default), uses the built-in variable
842 creation.
843 clipvalue: Controls clipping of the gradient. Set to either a single
844 positive scalar value to get clipping or a tuple of scalar values (min,
845 max) to set a separate maximum or minimum. If one of the two entries is
846 None, then there will be no clipping in that direction.
847 low_dimensional_packing_status: Status of the low-dimensional embedding
848 packing optimization controls whether to optimize the packing of
849 1-dimensional, 2-dimensional, and 4-dimensional embedding tables in
850 memory.
851 """
852 super(Adam, self).__init__(
853 learning_rate,
854 use_gradient_accumulation,
855 clip_weight_min,
856 clip_weight_max,
857 weight_decay_factor,
858 multiply_weight_decay_factor_by_learning_rate,
859 clipvalue,
860 slot_variable_creation_fn,
861 low_dimensional_packing_status,
862 )
863 if beta_1 < 0. or beta_1 >= 1.:
864 raise ValueError(
865 f"Argument `beta_1` must be >= 0 and < 1. Received: {beta_1}.")
866 if beta_2 < 0. or beta_2 >= 1.:
867 raise ValueError(
868 f"Argument `beta_2` must be >= 0 and < 1. Received: {beta_1}.")
869 if epsilon <= 0.:
870 raise ValueError("epsilon must be positive; got {}.".format(epsilon))
871 if not use_gradient_accumulation and not lazy_adam:
872 raise ValueError(
873 "When disabling lazy Adam (`lazy_adam=False`), "
874 "gradient accumulation must be used. "
875 "Set `use_gradient_accumulation` to False.")
877 self.beta_1 = beta_1
878 self.beta_2 = beta_2
879 self.epsilon = epsilon
880 self.lazy_adam = lazy_adam
881 self.sum_inside_sqrt = sum_inside_sqrt
883 def _slot_names(self) -> List[Text]:
884 return ["momenta", "velocities"]
886 def _slot_initializers(self) -> List[init_ops_v2.Initializer]:
887 return [init_ops_v2.Constant(), init_ops_v2.Constant()]
889 def _set_optimization_parameters(
890 self, parameters: optimization_parameters_pb2.OptimizationParameters):
891 super(Adam, self)._set_optimization_parameters(parameters)
892 parameters.adam.beta1 = self.beta_1
893 parameters.adam.beta2 = self.beta_2
894 parameters.adam.epsilon = self.epsilon
895 parameters.adam.use_non_lazy_adam = not self.lazy_adam
896 parameters.adam.use_sum_inside_sqrt = self.sum_inside_sqrt
898 def _load(self) -> Callable[..., ops.Operation]:
899 return tpu_ops.load_tpu_embedding_adam_parameters
901 def _retrieve(self) -> Callable[..., core.Tensor]:
902 return tpu_ops.retrieve_tpu_embedding_adam_parameters
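
# --- Editor's illustrative sketch (not part of the TensorFlow API) ---
# A hypothetical helper illustrating the `lazy_adam` /
# `use_gradient_accumulation` combinations accepted by `Adam.__init__`:
# non-lazy Adam requires gradient accumulation (which is on by default), and
# disabling both raises a ValueError.
def _example_adam_variants():
  lazy = Adam(learning_rate=0.001)  # Lazy Adam, the default behavior.
  non_lazy = Adam(learning_rate=0.001, lazy_adam=False)  # Accumulation is on.
  # Adam(lazy_adam=False, use_gradient_accumulation=False) would raise
  # ValueError, since non-lazy Adam needs gradient accumulation.
  return lazy, non_lazy
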
905@tf_export("tpu.experimental.embedding.QuantizationConfig")
906class QuantizationConfig:
907 """Settings for simulated quantization of the tpu embedding table.
909 When simulated quantization is enabled, the results of the embedding lookup
910 are clipped and quantized according to the settings here before the combiner
911 is applied.
913 For example, to quantize `input` the following is done:
914 ```python
915 if input < lower:
916   input = lower
917 if input > upper:
918   input = upper
919 quantum = (upper - lower) / (num_buckets - 1)
920 input = math.floor((input - lower) / quantum + 0.5) * quantum + lower
921 ```
923 See tensorflow/core/protobuf/tpu/optimization_parameters.proto for more
924 details.
926 NOTE: This does not change the storage type of the embedding table, which will
927 continue to be float32, as will the saved variable in the checkpoint. You will
928 have to quantize the variable yourself (typically with the same algorithm and
929 settings as above).
930 """
932 def __init__(self, num_buckets: int, lower: float, upper: float):
933 """Simulated quantizaiton configuration.
935 Args:
936 num_buckets: The number of quantization buckets, must be at least 2.
937 lower: The lower bound for the quantization range.
938 upper: The upper bound for the quantization range.
940 Returns:
941 `QuantizationConfig`.
943 Raises:
944 ValueError: if `num_buckets` is less than 2.
945 """
946 if num_buckets < 2:
947 raise ValueError(f"num_buckets is {num_buckets}, must be at least 2 for "
948 f"simulated quantization.")
950 self.num_buckets = num_buckets
951 self.lower = lower
952 self.upper = upper
954 def _set_optimization_parameters(
955 self, parameters: optimization_parameters_pb2.OptimizationParameters):
956 parameters.simulated_quantization.enabled = True
957 parameters.simulated_quantization.num_buckets = self.num_buckets
958 parameters.simulated_quantization.clipping_limits.lower.value = self.lower
959 parameters.simulated_quantization.clipping_limits.upper.value = self.upper
961 def __repr__(self):
962 return ("QuantizationConfig(num_buckets={num_buckets!r}, lower={lower!r}, "
963 "upper={upper!r})".format(
964 num_buckets=self.num_buckets,
965 lower=self.lower,
966 upper=self.upper))
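
# --- Editor's illustrative sketch (not part of the TensorFlow API) ---
# The simulated-quantization rule from the QuantizationConfig docstring,
# written out as a plain (hypothetical) function so the bucketing is easy to
# check by hand.
def _example_simulated_quantize(value, lower, upper, num_buckets):
  value = min(max(value, lower), upper)          # Clip into [lower, upper].
  quantum = (upper - lower) / (num_buckets - 1)  # Width of one bucket.
  return math.floor((value - lower) / quantum + 0.5) * quantum + lower

# For example, with lower=0.0, upper=1.0 and num_buckets=11 the quantum is 0.1,
# so _example_simulated_quantize(0.37, 0.0, 1.0, 11) -> 0.4 (up to float
# rounding).
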
969@tf_export("tpu.experimental.embedding.TableConfig")
970class TableConfig:
971 """Configuration data for one embedding table.
973 This class holds the configuration data for a single embedding table. It is
974 used as the `table` parameter of a
975 `tf.tpu.experimental.embedding.FeatureConfig`. Multiple
976 `tf.tpu.experimental.embedding.FeatureConfig` objects can use the same
977 `tf.tpu.experimental.embedding.TableConfig` object. In this case a shared
978 table will be created for those feature lookups.
980 ```python
981 table_config_one = tf.tpu.experimental.embedding.TableConfig(
982 vocabulary_size=...,
983 dim=...)
984 table_config_two = tf.tpu.experimental.embedding.TableConfig(
985 vocabulary_size=...,
986 dim=...)
987 feature_config = {
988 'feature_one': tf.tpu.experimental.embedding.FeatureConfig(
989 table=table_config_one),
990 'feature_two': tf.tpu.experimental.embedding.FeatureConfig(
991 table=table_config_one),
992 'feature_three': tf.tpu.experimental.embedding.FeatureConfig(
993 table=table_config_two)}
994 embedding = tf.tpu.experimental.embedding.TPUEmbedding(
995 feature_config=feature_config,
996 batch_size=...,
997 optimizer=tf.tpu.experimental.embedding.Adam(0.1))
998 ```
1000 The above configuration has two tables and three features. The first two
1001 features will be looked up in the first table and the third feature will be
1002 looked up in the second table.
1004 """
1006 def __init__(self,
1007 vocabulary_size: int,
1008 dim: int,
1009 initializer: Optional[Callable[[Any], None]] = None,
1010 optimizer: Optional[_Optimizer] = None,
1011 combiner: Text = "mean",
1012 name: Optional[Text] = None,
1013 quantization_config: Optional[QuantizationConfig] = None):
1014 """Embedding table configuration.
1016 Args:
1017 vocabulary_size: Size of the table's vocabulary (number of rows).
1018 dim: The embedding dimension (width) of the table.
1019 initializer: A callable initializer taking one parameter, the shape of the
1020 variable that will be initialized. Will be called once per task, to
1021 initialize that task's shard of the embedding table. If not specified,
1022 defaults to `truncated_normal_initializer` with mean `0.0` and standard
1023 deviation `1/sqrt(dim)`.
1024 optimizer: An optional instance of an optimizer parameters class, instance
1025 of one of `tf.tpu.experimental.embedding.SGD`,
1026 `tf.tpu.experimental.embedding.Adagrad` or
1027 `tf.tpu.experimental.embedding.Adam`. If set will override the global
1028 optimizer passed to `tf.tpu.experimental.embedding.TPUEmbedding`.
1029 combiner: A string specifying how to reduce if there are multiple entries
1030 in a single row. Currently 'mean', 'sqrtn', 'sum' are supported, with
1031 'mean' the default. 'sqrtn' often achieves good accuracy, in particular
1032 with bag-of-words columns. For more information, see
1033 `tf.nn.embedding_lookup_sparse`.
1034 name: An optional string used to name the table. Useful for debugging.
1035 quantization_config: The simulated quantization config. An instance of
1036 `tf.tpu.experimental.embedding.QuantizationConfig`. See the class for
1037 more documentation.
1039 Returns:
1040 `TableConfig`.
1042 Raises:
1043 ValueError: if `vocabulary_size` is not a positive integer.
1044 ValueError: if `dim` is not a positive integer.
1045 ValueError: if `initializer` is specified and is not callable.
1046 ValueError: if `combiner` is not supported.
1047 """
1048 if not isinstance(vocabulary_size, int) or vocabulary_size < 1:
1049 raise ValueError(
1050 f"Argument `vocabulary_size` must be an int and must be >= 1. "
1051 f"Received: {vocabulary_size}")
1053 if not isinstance(dim, int) or dim < 1:
1054 raise ValueError(
1055 f"Argument `dim` (embedding dimension) "
1056 f"must be an int and must be >= 1. Received: {dim}")
1058 if (initializer is not None) and (not callable(initializer)):
1059 raise ValueError(
1060 f"Argument `initializer` must be a callable (or None). "
1061 f"Received: {initializer}")
1062 if initializer is None:
1063 initializer = init_ops_v2.TruncatedNormal(mean=0.0,
1064 stddev=1/math.sqrt(dim))
1065 accepted_combiners = ("mean", "sum", "sqrtn")
1066 if combiner not in accepted_combiners:
1067 raise ValueError(
1068 f"Argument `combiner` must be one of {accepted_combiners}. "
1069 f"Received: {combiner}")
1071 self.vocabulary_size = vocabulary_size
1072 self.dim = dim
1073 self.initializer = initializer
1074 self.optimizer = optimizer
1075 self.combiner = combiner
1076 self.name = name
1077 self.quantization_config = quantization_config
1079 def __repr__(self):
1080 # If using the default initializer, just print "None" for clarity.
1081 initializer = self.initializer
1083 if isinstance(initializer, init_ops_v2.TruncatedNormal):
1084 # PY2 type checking can't infer type of initializer even after if.
1085 initializer = typing.cast(init_ops_v2.TruncatedNormal, initializer)
1086 if (initializer.mean == 0.0
1087 and math.isclose(initializer.stddev, 1/math.sqrt(self.dim))):
1088 initializer = None
1090 return ("TableConfig(vocabulary_size={vocabulary_size!r}, dim={dim!r}, "
1091 "initializer={initializer!r}, optimizer={optimizer!r}, "
1092 "combiner={combiner!r}, name={name!r}, "
1093 "quantization_config={quantization!r})".format(
1094 vocabulary_size=self.vocabulary_size,
1095 dim=self.dim,
1096 initializer=initializer,
1097 optimizer=self.optimizer,
1098 combiner=self.combiner,
1099 name=self.name,
1100 quantization=self.quantization_config,
1101 ))
1103 def _set_table_descriptor(
1104 self,
1105 table_descriptor: tpu_embedding_configuration_pb2
1106 .TPUEmbeddingConfiguration.TableDescriptor,
1107 num_hosts: int,
1108 learning_rate_index: Dict[Callable[[], Any], int]):
1109 """Set the table descriptor from the table data."""
1110 table_descriptor.name = self.name
1112 # For small tables, we pad to the number of hosts so that at least one
1113 # id will be assigned to each host.
1114 table_descriptor.vocabulary_size = max(self.vocabulary_size, num_hosts)
1115 table_descriptor.dimension = self.dim
1117 parameters = table_descriptor.optimization_parameters
1119 # We handle the learning rate separately here and don't allow the
1120 # optimization class to handle this, as it doesn't know about dynamic
1121 # rates.
1122 if callable(self.optimizer.learning_rate):
1123 parameters.learning_rate.dynamic.tag = (
1124 learning_rate_index[self.optimizer.learning_rate])
1125 else:
1126 parameters.learning_rate.constant = self.optimizer.learning_rate
1128 if self.optimizer.low_dimensional_packing_status:
1129 parameters.low_dimensional_packing_status = (
1130 optimization_parameters_pb2.LowDimensionalPackingStatus.Status.ENABLED
1131 )
1132 # Use optimizer to handle the rest of the parameters.
1133 self.optimizer._set_optimization_parameters(parameters) # pylint: disable=protected-access
1134 if self.quantization_config:
1135 self.quantization_config._set_optimization_parameters(parameters) # pylint: disable=protected-access
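
# --- Editor's illustrative sketch (not part of the TensorFlow API) ---
# A hypothetical helper constructing a `TableConfig` with an explicit
# initializer and a table-specific optimizer. When `initializer` is omitted,
# the class above falls back to TruncatedNormal(mean=0.0, stddev=1/sqrt(dim)).
def _example_table_config():
  return TableConfig(
      vocabulary_size=10000,
      dim=64,
      initializer=init_ops_v2.TruncatedNormal(mean=0.0, stddev=0.05),
      optimizer=Adagrad(learning_rate=0.05),
      combiner="sum",
      name="example_table")
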
1138@tf_export("tpu.experimental.embedding.FeatureConfig")
1139class FeatureConfig:
1140 """Configuration data for one embedding feature.
1142 This class holds the configuration data for a single embedding feature. The
1143 main use is to assign features to `tf.tpu.experimental.embedding.TableConfig`s
1144 via the table parameter:
1146 ```python
1147 table_config_one = tf.tpu.experimental.embedding.TableConfig(
1148 vocabulary_size=...,
1149 dim=...)
1150 table_config_two = tf.tpu.experimental.embedding.TableConfig(
1151 vocabulary_size=...,
1152 dim=...)
1153 feature_config = {
1154 'feature_one': tf.tpu.experimental.embedding.FeatureConfig(
1155 table=table_config_one),
1156 'feature_two': tf.tpu.experimental.embedding.FeatureConfig(
1157 table=table_config_one),
1158 'feature_three': tf.tpu.experimental.embedding.FeatureConfig(
1159 table=table_config_two)}
1160 embedding = tf.tpu.experimental.embedding.TPUEmbedding(
1161 feature_config=feature_config,
1162 batch_size=...,
1163 optimizer=tf.tpu.experimental.embedding.Adam(0.1))
1164 ```
1166 The above configuration has two tables and three features. The first two
1167 features will be looked up in the first table and the third feature will be
1168 looked up in the second table.
1170 You can also specify the output shape for each feature. The output shape
1171 should be the expected activation shape excluding the table dimension. For
1172 dense and sparse tensors, the output shape should be the same as the input
1173 shape excluding the last dimension. For ragged tensors, the output shape can
1174 differ from the input shape.
1176 NOTE: The `max_sequence_length` will be only used when the input tensor has
1177 rank 2 and the `output_shape` is not set in the feature config.
1179 When feeding features into `embedding.enqueue` they can be `tf.Tensor`s,
1180 `tf.SparseTensor`s or `tf.RaggedTensor`s. When the argument
1181 `max_sequence_length` is 0, the default, you should expect an output of
1182 `embedding.dequeue` for this feature of shape `(batch_size, dim)`. If
1183 `max_sequence_length` is greater than 0, the feature is embedded as a sequence
1184 and padded up to the given length. The shape of the output for this feature
1185 will be `(batch_size, max_sequence_length, dim)`.
1186 """
1188 def __init__(self,
1189 table: TableConfig,
1190 max_sequence_length: int = 0,
1191 validate_weights_and_indices: bool = True,
1192 output_shape: Optional[Union[List[int], TensorShape]] = None,
1193 name: Optional[Text] = None):
1194 """Feature configuration.
1196 Args:
1197 table: An instance of `tf.tpu.experimental.embedding.TableConfig`,
1198 describing the table in which this feature should be looked up.
1199 max_sequence_length: If positive, the feature is a sequence feature with
1200 the corresponding maximum sequence length. If the sequence is longer
1201 than this, it will be truncated. If 0, the feature is not a sequence
1202 feature.
1203 validate_weights_and_indices: If true, uses safe_embedding_lookup during
1204 serving which ensures there are no empty rows and all weights and ids
1205 are positive at the expense of extra compute cost.
1206 output_shape: Optional argument to configure the output shape of the feature
1207 activation. If provided, the feature fed to `embedding.enqueue` has to
1208 match this shape (for ragged tensors, the input shape and output shape
1209 can differ). If not provided, the shape can either be provided to
1210 `embedding.build` or auto-detected at runtime.
1211 name: An optional name for the feature, useful for debugging.
1213 Returns:
1214 `FeatureConfig`.
1216 Raises:
1217 ValueError: if `table` is not an instance of
1218 `tf.tpu.experimental.embedding.TableConfig`.
1219 ValueError: if `max_sequence_length` not an integer or is negative.
1220 """
1221 if not isinstance(table, TableConfig):
1222 raise ValueError(f"Argument `table` has invalid type {type(table)}. "
1223 "Expected `tf.tpu.experimental.embedding.TableConfig`.")
1225 if not isinstance(max_sequence_length, int) or max_sequence_length < 0:
1226 raise ValueError(
1227 f"Argument `max_sequence_length` must be an int and must be >= 0. "
1228 f"Received: {max_sequence_length}")
1230 self.table = table
1231 self.max_sequence_length = max_sequence_length
1232 self.name = name
1233 self.output_shape = TensorShape(output_shape)
1235 if not isinstance(
1236 validate_weights_and_indices, bool):
1237 raise ValueError(
1238 f"Argument `validate_weights_and_indices` must be a boolean. "
1239 f"Received: {validate_weights_and_indices}")
1241 self.validate_weights_and_indices = validate_weights_and_indices
1243 def __repr__(self):
1244 return ("FeatureConfig(table={table!r}, "
1245 "max_sequence_length={max_sequence_length!r}, "
1246 "validate_weights_and_indices={validate_weights_and_indices!r}, "
1247 "output_shape={output_shape!r}, name={name!r})".format(
1248 table=self.table,
1249 max_sequence_length=self.max_sequence_length,
1250 validate_weights_and_indices=self.validate_weights_and_indices,
1251 output_shape=self.output_shape,
1252 name=self.name))
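
# --- Editor's illustrative sketch (not part of the TensorFlow API) ---
# A hypothetical helper building two features that share one table: a plain
# feature (dequeued as (batch_size, dim)) and a sequence feature padded or
# truncated to length 16 (dequeued as (batch_size, 16, dim)).
def _example_feature_configs():
  shared_table = TableConfig(vocabulary_size=1000, dim=32, name="shared")
  return {
      "query_token": FeatureConfig(table=shared_table, name="query_token"),
      "query_token_sequence": FeatureConfig(
          table=shared_table,
          max_sequence_length=16,
          name="query_token_sequence"),
  }
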
1255def log_tpu_embedding_configuration(
1256 config: tpu_embedding_configuration_pb2.TPUEmbeddingConfiguration) -> None:
1257 """Logs a TPUEmbeddingConfiguration proto across multiple statements.
1259 Args:
1260 config: TPUEmbeddingConfiguration proto to log. Necessary because
1261 logging.info has a maximum length for each log statement, which
1262 particularly large configs can exceed.
1263 """
1264 logging.info("Beginning log of TPUEmbeddingConfiguration.")
1265 for line in str(config).splitlines():
1266 logging.info(line)
1267 logging.info("Done with log of TPUEmbeddingConfiguration.")
1270def _sort_device_spec_strings(device_strings: Iterable[str]) -> List[str]:
1271 sorted_specs = sorted(
1272 (device_spec.DeviceSpecV2.from_string(spec) for spec in device_strings),
1273 key=lambda s: (s.replica, s.task, s.device_index),
1274 )
1275 return [spec.to_string() for spec in sorted_specs]
1278def get_list_of_hosts(strategy: tpu_strategy.TPUStrategy) -> List[Text]:
1279 """Returns a sorted list of CPU devices for the remote jobs.
1281 Args:
1282 strategy: A TPUStrategy object.
1284 Returns:
1285 A sorted list of device host strings.
1286 """
1288 list_of_hosts = []
1289 # Elsewhere we assume that the list of hosts is sorted.
1290 for tpu_device in _sort_device_spec_strings(strategy.extended.worker_devices):
1291 host = device_util.get_host_for_device(tpu_device)
1292 if host not in list_of_hosts:
1293 list_of_hosts.append(host)
1294 assert len(list_of_hosts) == strategy.extended.num_hosts
1295 return list_of_hosts
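
# --- Editor's illustrative sketch (not part of the TensorFlow API) ---
# A hypothetical helper showing how `_sort_device_spec_strings` orders device
# strings by (replica, task, device_index) before `get_list_of_hosts`
# deduplicates their hosts. The device strings below are made up.
def _example_sorted_devices():
  devices = [
      "/job:worker/replica:0/task:1/device:TPU:0",
      "/job:worker/replica:0/task:0/device:TPU:1",
      "/job:worker/replica:0/task:0/device:TPU:0",
  ]
  # Returns task 0's devices (index 0 then 1) followed by task 1's device 0.
  return _sort_device_spec_strings(devices)
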