# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""This API defines the FeatureColumn abstraction.

FeatureColumns provide a high level abstraction for ingesting and representing
features. FeatureColumns are also the primary way of encoding features for
canned `tf.estimator.Estimator`s.

When using FeatureColumns with `Estimators`, the type of feature column you
should choose depends on (1) the feature type and (2) the model type.

1. Feature type:

  * Continuous features can be represented by `numeric_column`.
  * Categorical features can be represented by any `categorical_column_with_*`
    column:
    - `categorical_column_with_vocabulary_list`
    - `categorical_column_with_vocabulary_file`
    - `categorical_column_with_hash_bucket`
    - `categorical_column_with_identity`
    - `weighted_categorical_column`

2. Model type:

  * Deep neural network models (`DNNClassifier`, `DNNRegressor`).

    Continuous features can be directly fed into deep neural network models.

      age_column = numeric_column("age")

    To feed sparse features into DNN models, wrap the column with
    `embedding_column` or `indicator_column`. `indicator_column` is recommended
    for features with only a few possible values. For features with many
    possible values, to reduce the size of your model, `embedding_column` is
    recommended.

      embedded_dept_column = embedding_column(
          categorical_column_with_vocabulary_list(
              "department", ["math", "philosophy", ...]), dimension=10)

  * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).

    Sparse features can be fed directly into linear models. They behave like an
    indicator column but with an efficient implementation.

      dept_column = categorical_column_with_vocabulary_list("department",
          ["math", "philosophy", "english"])

    It is recommended that continuous features be bucketized before being
    fed into linear models.

      bucketized_age_column = bucketized_column(
          source_column=age_column,
          boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

    Sparse features can be crossed (also known as conjuncted or combined) in
    order to form non-linearities, and then fed into linear models.

      cross_dept_age_column = crossed_column(
          columns=["department", bucketized_age_column],
          hash_bucket_size=1000)

Example of building canned `Estimator`s using FeatureColumns:

  ```python
  # Define features and transformations
  deep_feature_columns = [age_column, embedded_dept_column]
  wide_feature_columns = [dept_column, bucketized_age_column,
                          cross_dept_age_column]

  # Build deep model
  estimator = DNNClassifier(
      feature_columns=deep_feature_columns,
      hidden_units=[500, 250, 50])
  estimator.train(...)

  # Or build a wide model
  estimator = LinearClassifier(
      feature_columns=wide_feature_columns)
  estimator.train(...)

  # Or build a wide and deep model!
  estimator = DNNLinearCombinedClassifier(
      linear_feature_columns=wide_feature_columns,
      dnn_feature_columns=deep_feature_columns,
      dnn_hidden_units=[500, 250, 50])
  estimator.train(...)
  ```

FeatureColumns can also be transformed into a generic input layer for
custom models using `input_layer`.

Example of building a model using FeatureColumns; this can be used in a
`model_fn`, which is given to the `tf.estimator.Estimator`:

  ```python
  # Building model via layers

  deep_feature_columns = [age_column, embedded_dept_column]
  columns_to_tensor = parse_feature_columns_from_examples(
      serialized=my_data,
      feature_columns=deep_feature_columns)
  first_layer = input_layer(
      features=columns_to_tensor,
      feature_columns=deep_feature_columns)
  second_layer = fully_connected(first_layer, ...)
  ```

NOTE: Functions prefixed with "_" indicate experimental or private parts of
the API subject to change, and should not be relied upon!

NOTE: The new feature columns are being developed in feature_column_v2.py and
are largely a duplicate of the code here. Please make sure to update logic
in both places.
"""
import abc
import collections
import math

import numpy as np
import six

from tensorflow.python.eager import context
from tensorflow.python.feature_column import utils as fc_utils
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
from tensorflow.python.framework import tensor_shape
from tensorflow.python.layers import base
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import array_ops_stack
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import cond
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import lookup_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import parsing_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.ops import string_ops
from tensorflow.python.ops import template
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables
from tensorflow.python.platform import gfile
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training import checkpoint_utils
from tensorflow.python.util import deprecation
from tensorflow.python.util import nest
from tensorflow.python.util.compat import collections_abc
from tensorflow.python.util.tf_export import tf_export
from tensorflow.tools.docs import doc_controls
_FEATURE_COLUMN_DEPRECATION_WARNING = """\
    Warning: tf.feature_column is not recommended for new code. Instead,
    feature preprocessing can be done directly using either [Keras preprocessing
    layers](https://www.tensorflow.org/guide/migrate/migrating_feature_columns)
    or through the one-stop utility [`tf.keras.utils.FeatureSpace`](https://www.tensorflow.org/api_docs/python/tf/keras/utils/FeatureSpace)
    built on top of them. See the [migration guide](https://tensorflow.org/guide/migrate)
    for details.
    """

_FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING = (
    'Use Keras preprocessing layers instead, either directly or via the '
    '`tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has '
    'a functional equivalent in `tf.keras.layers` for feature preprocessing '
    'when training a Keras model.')
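
# For orientation (not an exhaustive mapping): per the migration guide linked
# above, `numeric_column` roughly corresponds to
# `tf.keras.layers.Normalization`, and the `categorical_column_with_*` family
# to `tf.keras.layers.StringLookup`/`IntegerLookup`, optionally followed by
# `CategoryEncoding` or an `Embedding` layer.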
def _internal_input_layer(features,
                          feature_columns,
                          weight_collections=None,
                          trainable=True,
                          cols_to_vars=None,
                          scope=None,
                          cols_to_output_tensors=None,
                          from_template=False):
  """See `input_layer`. `scope` is a name or variable scope to use."""

  feature_columns = _normalize_feature_columns(feature_columns)
  for column in feature_columns:
    if not isinstance(column, _DenseColumn):
      raise ValueError(
          'Items of feature_columns must be a _DenseColumn. '
          'You can wrap a categorical column with an '
          'embedding_column or indicator_column. Given: {}'.format(column))
  weight_collections = list(weight_collections or [])
  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)

  def _get_logits():  # pylint: disable=missing-docstring
    builder = _LazyBuilder(features)
    output_tensors = []
    ordered_columns = []
    for column in sorted(feature_columns, key=lambda x: x.name):
      ordered_columns.append(column)
      with variable_scope.variable_scope(
          None, default_name=column._var_scope_name):  # pylint: disable=protected-access
        tensor = column._get_dense_tensor(  # pylint: disable=protected-access
            builder,
            weight_collections=weight_collections,
            trainable=trainable)
        num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
        batch_size = array_ops.shape(tensor)[0]
        output_tensor = array_ops.reshape(
            tensor, shape=(batch_size, num_elements))
        output_tensors.append(output_tensor)
        if cols_to_vars is not None:
          # Retrieve any variables created (some _DenseColumn's don't create
          # variables, in which case an empty list is returned).
          cols_to_vars[column] = ops.get_collection(
              ops.GraphKeys.GLOBAL_VARIABLES,
              scope=variable_scope.get_variable_scope().name)
        if cols_to_output_tensors is not None:
          cols_to_output_tensors[column] = output_tensor
    _verify_static_batch_size_equality(output_tensors, ordered_columns)
    return array_ops.concat(output_tensors, 1)

  # If we're constructing from `make_template`, that by default adds a
  # variable scope with the name of the layer. In that case, we don't want to
  # add another `variable_scope`, as that would break checkpoints.
  if from_template:
    return _get_logits()
  else:
    with variable_scope.variable_scope(
        scope, default_name='input_layer', values=features.values()):
      return _get_logits()
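
# Note: `input_layer` (below) calls `_internal_input_layer` directly, while
# `InputLayer` wraps it in `template.make_template` with `from_template=True`
# so that repeated calls reuse the same variables.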
@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export(v1=['feature_column.input_layer'])
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def input_layer(features,
                feature_columns,
                weight_collections=None,
                trainable=True,
                cols_to_vars=None,
                cols_to_output_tensors=None):
  """Returns a dense `Tensor` as input layer based on given `feature_columns`.

  Generally a single example in training data is described with
  FeatureColumns. At the first layer of the model, this column-oriented data
  should be converted to a single `Tensor`.

  Example:

  ```python
  price = numeric_column('price')
  keywords_embedded = embedding_column(
      categorical_column_with_hash_bucket("keywords", 10000), dimension=16)
  columns = [price, keywords_embedded, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  for units in [128, 64, 32]:
    dense_tensor = tf.compat.v1.layers.dense(dense_tensor, units, tf.nn.relu)
  prediction = tf.compat.v1.layers.dense(dense_tensor, 1)
  ```

  Args:
    features: A mapping from key to tensors. `_FeatureColumn`s look up via
      these keys. For example `numeric_column('price')` will look at 'price'
      key in this dict. Values can be a `SparseTensor` or a `Tensor`, depending
      on the corresponding `_FeatureColumn`.
    feature_columns: An iterable containing the FeatureColumns to use as
      inputs to your model. All items should be instances of classes derived
      from `_DenseColumn` such as `numeric_column`, `embedding_column`,
      `bucketized_column`, `indicator_column`. If you have categorical
      features, you can wrap them with an `embedding_column` or
      `indicator_column`.
    weight_collections: A list of collection names to which the Variable will
      be added. Note that variables will also be added to collections
      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
    trainable: If `True` also add the variable to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    cols_to_vars: If not `None`, must be a dictionary that will be filled with
      a mapping from `_FeatureColumn` to list of `Variable`s. For example,
      after the call, we might have
      cols_to_vars = {
          _EmbeddingColumn(
              categorical_column=_HashedCategoricalColumn(
                  key='sparse_feature', hash_bucket_size=5, dtype=tf.string),
              dimension=10): [<tf.Variable 'some_variable:0' shape=(5, 10)>,
                              <tf.Variable 'some_variable:1' shape=(5, 10)>]}
      If a column creates no variables, its value will be an empty list.
    cols_to_output_tensors: If not `None`, must be a dictionary that will be
      filled with a mapping from `_FeatureColumn` to the associated output
      `Tensor`s.

  Returns:
    A `Tensor` which represents input layer of a model. Its shape
    is (batch_size, first_layer_dimension) and its dtype is `float32`.
    first_layer_dimension is determined based on given `feature_columns`.

  Raises:
    ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
  """
  return _internal_input_layer(
      features,
      feature_columns,
      weight_collections=weight_collections,
      trainable=trainable,
      cols_to_vars=cols_to_vars,
      cols_to_output_tensors=cols_to_output_tensors)
# TODO(akshayka): InputLayer should be a subclass of Layer, and it
# should implement the logic in input_layer using Layer's build-and-call
# paradigm; input_layer should create an instance of InputLayer and
# return the result of invoking its apply method, just as functional layers do.
class InputLayer(object):
  """An object-oriented version of `input_layer` that reuses variables."""

  def __init__(self,
               feature_columns,
               weight_collections=None,
               trainable=True,
               cols_to_vars=None,
               name='feature_column_input_layer',
               create_scope_now=True):
    """See `input_layer`."""

    self._feature_columns = feature_columns
    self._weight_collections = weight_collections
    self._trainable = trainable
    self._cols_to_vars = cols_to_vars
    self._name = name
    self._input_layer_template = template.make_template(
        self._name, _internal_input_layer, create_scope_now_=create_scope_now)
    self._scope = self._input_layer_template.variable_scope

  def __call__(self, features):
    return self._input_layer_template(
        features=features,
        feature_columns=self._feature_columns,
        weight_collections=self._weight_collections,
        trainable=self._trainable,
        cols_to_vars=None,
        from_template=True)

  @property
  def name(self):
    return self._name

  @property
  def non_trainable_variables(self):
    return self._input_layer_template.non_trainable_variables

  @property
  def non_trainable_weights(self):
    return self._input_layer_template.non_trainable_weights

  @property
  def trainable_variables(self):
    return self._input_layer_template.trainable_variables

  @property
  def trainable_weights(self):
    return self._input_layer_template.trainable_weights

  @property
  def variables(self):
    return self._input_layer_template.variables

  @property
  def weights(self):
    return self._input_layer_template.weights
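
# A minimal usage sketch (illustrative only; the feature dicts and column
# names below are hypothetical):
#
#   video_id = _categorical_column_with_identity('video_id', num_buckets=1000)
#   embedded = _embedding_column(video_id, dimension=8)
#   layer = InputLayer(feature_columns=[embedded])
#   train_tensor = layer(train_features)  # Creates the embedding variable.
#   eval_tensor = layer(eval_features)    # Reuses the same variable.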
@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export(v1=['feature_column.linear_model'])
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def linear_model(features,
                 feature_columns,
                 units=1,
                 sparse_combiner='sum',
                 weight_collections=None,
                 trainable=True,
                 cols_to_vars=None):
  """Returns a linear prediction `Tensor` based on given `feature_columns`.

  This function generates a weighted sum based on output dimension `units`.
  Weighted sum refers to logits in classification problems. It refers to the
  prediction itself for linear regression problems.

  Note on supported columns: `linear_model` treats categorical columns as
  `indicator_column`s. To be specific, assume the input as `SparseTensor` looks
  like:

  ```python
  shape = [2, 2]
  {
      [0, 0]: "a"
      [1, 0]: "b"
      [1, 1]: "c"
  }
  ```
  `linear_model` assigns weights for the presence of "a", "b", "c" implicitly,
  just like `indicator_column`, while `input_layer` explicitly requires
  wrapping each categorical column with an `embedding_column` or an
  `indicator_column`.

  Example of usage:

  ```python
  price = numeric_column('price')
  price_buckets = bucketized_column(price, boundaries=[0., 10., 100., 1000.])
  keywords = categorical_column_with_hash_bucket("keywords", 10000)
  keywords_price = crossed_column(['keywords', price_buckets], ...)
  columns = [price_buckets, keywords, keywords_price, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  prediction = linear_model(features, columns)
  ```

  The `sparse_combiner` argument works as follows.
  For example, for two features represented as the categorical columns:

  ```python
  # Feature 1

  shape = [2, 2]
  {
      [0, 0]: "a"
      [0, 1]: "b"
      [1, 0]: "c"
  }

  # Feature 2

  shape = [2, 3]
  {
      [0, 0]: "d"
      [1, 0]: "e"
      [1, 1]: "f"
      [1, 2]: "f"
  }
  ```

  with `sparse_combiner` as "mean", the linear model outputs consequently
  are:

  ```
  y_0 = 1.0 / 2.0 * ( w_a + w_b ) + w_d + b
  y_1 = w_c + 1.0 / 3.0 * ( w_e + 2.0 * w_f ) + b
  ```

  where `y_i` is the output, `b` is the bias, and `w_x` is the weight
  assigned to the presence of `x` in the input features.

  Args:
    features: A mapping from key to tensors. `_FeatureColumn`s look up via
      these keys. For example `numeric_column('price')` will look at 'price'
      key in this dict. Values are `Tensor` or `SparseTensor` depending on
      the corresponding `_FeatureColumn`.
    feature_columns: An iterable containing the FeatureColumns to use as
      inputs to your model. All items should be instances of classes derived
      from `_FeatureColumn`.
    units: An integer, dimensionality of the output space. Default value is 1.
    sparse_combiner: A string specifying how to reduce if a categorical column
      is multivalent. Except `numeric_column`, almost all columns passed to
      `linear_model` are considered as categorical columns. It combines each
      categorical column independently. Currently "mean", "sqrtn" and "sum"
      are supported, with "sum" the default for linear model. "sqrtn" often
      achieves good accuracy, in particular with bag-of-words columns.
      * "sum": do not normalize features in the column
      * "mean": do l1 normalization on features in the column
      * "sqrtn": do l2 normalization on features in the column
    weight_collections: A list of collection names to which the Variable will
      be added. Note that variables will also be added to collections
      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
    trainable: If `True` also add the variable to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    cols_to_vars: If not `None`, must be a dictionary that will be filled with
      a mapping from `_FeatureColumn` to associated list of `Variable`s. For
      example, after the call, we might have
      cols_to_vars = {
          _NumericColumn(key='numeric_feature1', shape=(1,)):
              [<tf.Variable 'linear_model/price2/weights:0' shape=(1, 1)>],
          'bias': [<tf.Variable 'linear_model/bias_weights:0' shape=(1,)>],
          _NumericColumn(key='numeric_feature2', shape=(2,)):
              [<tf.Variable 'linear_model/price1/weights:0' shape=(2, 1)>]}
      If a column creates no variables, its value will be an empty list. Note
      that cols_to_vars will also contain a string key 'bias' that maps to a
      list of Variables.

  Returns:
    A `Tensor` which represents predictions/logits of a linear model. Its
    shape is (batch_size, units) and its dtype is `float32`.

  Raises:
    ValueError: if an item in `feature_columns` is neither a `_DenseColumn`
      nor `_CategoricalColumn`.
  """
  with variable_scope.variable_scope(None, 'linear_model') as vs:
    model_name = _strip_leading_slashes(vs.name)
  linear_model_layer = _LinearModel(
      feature_columns=feature_columns,
      units=units,
      sparse_combiner=sparse_combiner,
      weight_collections=weight_collections,
      trainable=trainable,
      name=model_name)
  retval = linear_model_layer(features)  # pylint: disable=not-callable
  if cols_to_vars is not None:
    cols_to_vars.update(linear_model_layer.cols_to_vars())
  return retval
def _add_to_collections(var, weight_collections):
  """Adds a var to the list of weight_collections provided.

  Handles the case for partitioned and non-partitioned variables.

  Args:
    var: A variable or Partitioned Variable.
    weight_collections: List of collections to add variable to.
  """
  for weight_collection in weight_collections:
    # The layer self.add_variable call already adds it to GLOBAL_VARIABLES.
    if weight_collection == ops.GraphKeys.GLOBAL_VARIABLES:
      continue
    # TODO(rohanj): Explore adding a _get_variable_list method on `Variable`
    # so that we don't have to do this check.
    if isinstance(var, variables.PartitionedVariable):
      for constituent_var in list(var):
        ops.add_to_collection(weight_collection, constituent_var)
    else:
      ops.add_to_collection(weight_collection, var)
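
# Sketch of intended use (assumed): after a layer creates a variable `w` via
# `add_variable`, `_add_to_collections(w, ['my_collection'])` makes it
# retrievable through `ops.get_collection('my_collection')`; GLOBAL_VARIABLES
# is skipped because `add_variable` already put `w` there.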
class _FCLinearWrapper(base.Layer):
  """Wraps a _FeatureColumn in a layer for use in a linear model.

  See `linear_model` above.
  """

  def __init__(self,
               feature_column,
               units=1,
               sparse_combiner='sum',
               weight_collections=None,
               trainable=True,
               name=None,
               **kwargs):
    super(_FCLinearWrapper, self).__init__(
        trainable=trainable, name=name, **kwargs)
    self._feature_column = feature_column
    self._units = units
    self._sparse_combiner = sparse_combiner
    self._weight_collections = weight_collections

  def build(self, _):
    if isinstance(self._feature_column, _CategoricalColumn):
      weight = self.add_variable(
          name='weights',
          shape=(self._feature_column._num_buckets, self._units),  # pylint: disable=protected-access
          initializer=init_ops.zeros_initializer(),
          trainable=self.trainable)
    else:
      num_elements = self._feature_column._variable_shape.num_elements()  # pylint: disable=protected-access
      weight = self.add_variable(
          name='weights',
          shape=[num_elements, self._units],
          initializer=init_ops.zeros_initializer(),
          trainable=self.trainable)
    _add_to_collections(weight, self._weight_collections)
    self._weight_var = weight
    self.built = True

  def call(self, builder):
    weighted_sum = _create_weighted_sum(
        column=self._feature_column,
        builder=builder,
        units=self._units,
        sparse_combiner=self._sparse_combiner,
        weight_collections=self._weight_collections,
        trainable=self.trainable,
        weight_var=self._weight_var)
    return weighted_sum
class _BiasLayer(base.Layer):
  """A layer for the bias term."""

  def __init__(self,
               units=1,
               trainable=True,
               weight_collections=None,
               name=None,
               **kwargs):
    super(_BiasLayer, self).__init__(trainable=trainable, name=name, **kwargs)
    self._units = units
    self._weight_collections = weight_collections

  def build(self, _):
    self._bias_variable = self.add_variable(
        'bias_weights',
        shape=[self._units],
        initializer=init_ops.zeros_initializer(),
        trainable=self.trainable)
    _add_to_collections(self._bias_variable, self._weight_collections)
    self.built = True

  def call(self, _):
    return self._bias_variable
def _get_expanded_variable_list(variable):
  if (isinstance(variable, variables.Variable) or
      resource_variable_ops.is_resource_variable(variable)):
    return [variable]  # Single variable case.
  else:  # Must be a PartitionedVariable, so convert into a list.
    return list(variable)
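
# For example, a `PartitionedVariable` with two shards expands to
# `[shard_0, shard_1]`, while a plain `Variable` `v` yields `[v]`.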
def _strip_leading_slashes(name):
  return name.rsplit('/', 1)[-1]
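
# e.g. _strip_leading_slashes('input_layer/linear_model') == 'linear_model'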
class _LinearModel(base.Layer):
  """Creates a linear model using feature columns.

  See `linear_model` for details.
  """

  def __init__(self,
               feature_columns,
               units=1,
               sparse_combiner='sum',
               weight_collections=None,
               trainable=True,
               name=None,
               **kwargs):
    super(_LinearModel, self).__init__(name=name, **kwargs)
    # We force keras_style to be True here, as a workaround to not being
    # able to inherit keras.layers.Layer as the base class. Setting this lets
    # us skip all the legacy behavior for base.Layer.
    # Also note that we use Layer as the base class, instead of Model, since
    # no Model-specific behavior (e.g. compile/fit) is used.
    self._keras_style = True
    self._feature_columns = _normalize_feature_columns(feature_columns)
    self._weight_collections = list(weight_collections or [])
    if ops.GraphKeys.GLOBAL_VARIABLES not in self._weight_collections:
      self._weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
    if ops.GraphKeys.MODEL_VARIABLES not in self._weight_collections:
      self._weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)

    column_layers = {}
    for column in sorted(self._feature_columns, key=lambda x: x.name):
      with variable_scope.variable_scope(
          None, default_name=column._var_scope_name) as vs:  # pylint: disable=protected-access
        # Having the fully expressed variable scope name ends up doubly
        # expressing the outer scope (scope with which this method was called)
        # in the name of the variable that would get created.
        column_name = _strip_leading_slashes(vs.name)
      column_layer = _FCLinearWrapper(column, units, sparse_combiner,
                                      self._weight_collections, trainable,
                                      column_name, **kwargs)
      column_layers[column_name] = column_layer
    self._column_layers = self._add_layers(column_layers)
    self._bias_layer = _BiasLayer(
        units=units,
        trainable=trainable,
        weight_collections=self._weight_collections,
        name='bias_layer',
        **kwargs)
    self._cols_to_vars = {}

  def cols_to_vars(self):
    """Returns a dict mapping _FeatureColumns to variables.

    See `linear_model` for more information.
    This is not populated until `call` is invoked, i.e. until the layer is
    built.
    """
    return self._cols_to_vars

  def call(self, features):
    with variable_scope.variable_scope(self.name):
      for column in self._feature_columns:
        if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
          raise ValueError(
              'Items of feature_columns must be either a '
              '_DenseColumn or _CategoricalColumn. Given: {}'.format(column))
      weighted_sums = []
      ordered_columns = []
      builder = _LazyBuilder(features)
      for layer in sorted(self._column_layers.values(), key=lambda x: x.name):
        column = layer._feature_column  # pylint: disable=protected-access
        ordered_columns.append(column)
        weighted_sum = layer(builder)
        weighted_sums.append(weighted_sum)
        self._cols_to_vars[column] = ops.get_collection(
            ops.GraphKeys.GLOBAL_VARIABLES, scope=layer.scope_name)

      _verify_static_batch_size_equality(weighted_sums, ordered_columns)
      predictions_no_bias = math_ops.add_n(
          weighted_sums, name='weighted_sum_no_bias')
      predictions = nn_ops.bias_add(
          predictions_no_bias,
          self._bias_layer(  # pylint: disable=not-callable
              builder,
              scope=variable_scope.get_variable_scope()),
          name='weighted_sum')
      bias = self._bias_layer.variables[0]
      self._cols_to_vars['bias'] = _get_expanded_variable_list(bias)
    return predictions

  def _add_layers(self, layers):
    # "Magic" required for keras.Model classes to track all the variables in
    # a list of layers.Layer objects.
    # TODO(ashankar): Figure out API so user code doesn't have to do this.
    for name, layer in layers.items():
      setattr(self, 'layer-%s' % name, layer)
    return layers
def _transform_features(features, feature_columns):
  """Returns transformed features based on the feature columns passed in.

  Note that you most likely will not need to use this function. Check
  `input_layer` and `linear_model` to see whether they satisfy your use case.

  Example:

  ```python
  # Define features and transformations
  crosses_a_x_b = crossed_column(
      columns=["sparse_feature_a", "sparse_feature_b"], hash_bucket_size=10000)
  price_buckets = bucketized_column(
      source_column=numeric_column("price"), boundaries=[...])

  columns = [crosses_a_x_b, price_buckets]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  transformed = transform_features(features=features, feature_columns=columns)

  assertCountEqual(columns, transformed.keys())
  ```

  Args:
    features: A mapping from key to tensors. `_FeatureColumn`s look up via
      these keys. For example `numeric_column('price')` will look at 'price'
      key in this dict. Values can be a `SparseTensor` or a `Tensor`, depending
      on the corresponding `_FeatureColumn`.
    feature_columns: An iterable containing all the `_FeatureColumn`s.

  Returns:
    A `dict` mapping `_FeatureColumn` to `Tensor` and `SparseTensor` values.
  """
  feature_columns = _normalize_feature_columns(feature_columns)
  outputs = {}
  with ops.name_scope(
      None, default_name='transform_features', values=features.values()):
    builder = _LazyBuilder(features)
    for column in sorted(feature_columns, key=lambda x: x.name):
      with ops.name_scope(None, default_name=column.name):
        outputs[column] = builder.get(column)
  return outputs
@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export(v1=['feature_column.make_parse_example_spec'])
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def make_parse_example_spec(feature_columns):
  """Creates parsing spec dictionary from input feature_columns.

  The returned dictionary can be used as the `features` argument of
  `tf.io.parse_example`.

  Typical usage example:

  ```python
  # Define features and transformations
  feature_a = categorical_column_with_vocabulary_file(...)
  feature_b = numeric_column(...)
  feature_c_bucketized = bucketized_column(numeric_column("feature_c"), ...)
  feature_a_x_feature_c = crossed_column(
      columns=["feature_a", feature_c_bucketized], ...)

  feature_columns = set(
      [feature_b, feature_c_bucketized, feature_a_x_feature_c])
  features = tf.io.parse_example(
      serialized=serialized_examples,
      features=make_parse_example_spec(feature_columns))
  ```

  For the above example, make_parse_example_spec would return the dict:

  ```python
  {
      "feature_a": parsing_ops.VarLenFeature(tf.string),
      "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32),
      "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32)
  }
  ```

  Args:
    feature_columns: An iterable containing all feature columns. All items
      should be instances of classes derived from `_FeatureColumn`.

  Returns:
    A dict mapping each feature key to a `FixedLenFeature` or `VarLenFeature`
    value.

  Raises:
    ValueError: If any of the given `feature_columns` is not a
      `_FeatureColumn` instance.
  """
  result = {}
  for column in feature_columns:
    if not isinstance(column, _FeatureColumn):
      raise ValueError('All feature_columns must be _FeatureColumn instances. '
                       'Given: {}'.format(column))
    config = column._parse_example_spec  # pylint: disable=protected-access
    for key, value in six.iteritems(config):
      if key in result and value != result[key]:
        raise ValueError('feature_columns contain different parse_spec for '
                         'key {}. Given {} and {}'.format(
                             key, value, result[key]))
    result.update(config)
  return result
def _embedding_column(categorical_column,
                      dimension,
                      combiner='mean',
                      initializer=None,
                      ckpt_to_load_from=None,
                      tensor_name_in_ckpt=None,
                      max_norm=None,
                      trainable=True,
                      use_safe_embedding_lookup=True):
  """`_DenseColumn` that converts from sparse, categorical input.

  Use this when your inputs are sparse, but you want to convert them to a
  dense representation (e.g., to feed to a DNN).

  Inputs must be a `_CategoricalColumn` created by any of the
  `categorical_column_*` functions. Here is an example of using
  `embedding_column` with `DNNClassifier`:

  ```python
  video_id = categorical_column_with_identity(
      key='video_id', num_buckets=1000000, default_value=0)
  columns = [embedding_column(video_id, 9), ...]

  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)

  label_column = ...
  def input_fn():
    features = tf.io.parse_example(
        ..., features=make_parse_example_spec(columns + [label_column]))
    labels = features.pop(label_column.name)
    return features, labels

  estimator.train(input_fn=input_fn, steps=100)
  ```

  Here is an example using `embedding_column` with model_fn:

  ```python
  def model_fn(features, ...):
    video_id = categorical_column_with_identity(
        key='video_id', num_buckets=1000000, default_value=0)
    columns = [embedding_column(video_id, 9), ...]
    dense_tensor = input_layer(features, columns)
    # Form DNN layers, calculate loss, and return EstimatorSpec.
    ...
  ```

  Args:
    categorical_column: A `_CategoricalColumn` created by a
      `categorical_column_with_*` function. This column produces the sparse
      IDs that are inputs to the embedding lookup.
    dimension: An integer specifying dimension of the embedding, must be > 0.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an
      example-level normalization on the column. For more information, see
      `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `tf.compat.v1.truncated_normal_initializer` with mean `0.0` and standard
      deviation `1/sqrt(dimension)`.
    ckpt_to_load_from: String representing checkpoint name/pattern from which
      to restore column weights. Required if `tensor_name_in_ckpt` is not
      `None`.
    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
      which to restore the column weights. Required if `ckpt_to_load_from` is
      not `None`.
    max_norm: If not `None`, embedding values are l2-normalized to this value.
    trainable: Whether or not the embedding is trainable. Default is True.
    use_safe_embedding_lookup: If true, uses safe_embedding_lookup_sparse
      instead of embedding_lookup_sparse. safe_embedding_lookup_sparse ensures
      there are no empty rows and all weights and ids are positive at the
      expense of extra compute cost. This only applies to rank 2 (NxM) shaped
      input tensors. Defaults to true; consider turning it off if the above
      checks are not needed. Note that having empty rows will not trigger any
      error, though the output result might be 0 or omitted.

  Returns:
    `_DenseColumn` that converts from sparse input.

  Raises:
    ValueError: if `dimension` is not > 0.
    ValueError: if exactly one of `ckpt_to_load_from` and
      `tensor_name_in_ckpt` is specified.
    ValueError: if `initializer` is specified and is not callable.
    RuntimeError: If eager execution is enabled.
  """
  if (dimension is None) or (dimension < 1):
    raise ValueError('Invalid dimension {}.'.format(dimension))
  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
    raise ValueError('Must specify both `ckpt_to_load_from` and '
                     '`tensor_name_in_ckpt` or none of them.')

  if (initializer is not None) and (not callable(initializer)):
    raise ValueError('initializer must be callable if specified. '
                     'Embedding of column_name: {}'.format(
                         categorical_column.name))
  if initializer is None:
    initializer = init_ops.truncated_normal_initializer(
        mean=0.0, stddev=1 / math.sqrt(dimension))

  embedding_shape = categorical_column._num_buckets, dimension  # pylint: disable=protected-access

  def _creator(weight_collections, scope):
    embedding_column_layer = _EmbeddingColumnLayer(
        embedding_shape=embedding_shape,
        initializer=initializer,
        weight_collections=weight_collections,
        trainable=trainable,
        name='embedding_column_layer')
    return embedding_column_layer(None, scope=scope)  # pylint: disable=not-callable

  return _EmbeddingColumn(
      categorical_column=categorical_column,
      dimension=dimension,
      combiner=combiner,
      layer_creator=_creator,
      ckpt_to_load_from=ckpt_to_load_from,
      tensor_name_in_ckpt=tensor_name_in_ckpt,
      max_norm=max_norm,
      trainable=trainable,
      use_safe_embedding_lookup=use_safe_embedding_lookup)
def _numeric_column(key,
                    shape=(1,),
                    default_value=None,
                    dtype=dtypes.float32,
                    normalizer_fn=None):
  """Represents real valued or numerical features.

  Example:

  ```python
  price = numeric_column('price')
  columns = [price, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)

  # or
  bucketized_price = bucketized_column(price, boundaries=[...])
  columns = [bucketized_price, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    shape: An iterable of integers specifying the shape of the `Tensor`. An
      integer can be given, which means a single-dimension `Tensor` with the
      given width. The `Tensor` representing the column will have the shape of
      [batch_size] + `shape`.
    default_value: A single value compatible with `dtype` or an iterable of
      values compatible with `dtype` which the column takes on during
      `tf.Example` parsing if data is missing. A default value of `None` will
      cause `tf.io.parse_example` to fail if an example does not contain this
      column. If a single value is provided, the same value will be applied as
      the default value for every item. If an iterable of values is provided,
      the shape of the `default_value` should be equal to the given `shape`.
    dtype: defines the type of values. Default value is `tf.float32`. Must be
      a non-quantized, real integer or floating point type.
    normalizer_fn: If not `None`, a function that can be used to normalize the
      value of the tensor after `default_value` is applied for parsing.
      Normalizer function takes the input `Tensor` as its argument, and
      returns the output `Tensor` (e.g. lambda x: (x - 3.0) / 4.2). Please
      note that even though the most common use case of this function is
      normalization, it can be used for any kind of TensorFlow transformation.

  Returns:
    A `_NumericColumn`.

  Raises:
    TypeError: if any dimension in shape is not an int
    ValueError: if any dimension in shape is not a positive integer
    TypeError: if `default_value` is an iterable but not compatible with
      `shape`
    TypeError: if `default_value` is not compatible with `dtype`.
    ValueError: if `dtype` is not convertible to `tf.float32`.
  """
  shape = _check_shape(shape, key)
  if not (dtype.is_integer or dtype.is_floating):
    raise ValueError('dtype must be convertible to float. '
                     'dtype: {}, key: {}'.format(dtype, key))
  default_value = fc_utils.check_default_value(shape, default_value, dtype, key)

  if normalizer_fn is not None and not callable(normalizer_fn):
    raise TypeError(
        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))

  fc_utils.assert_key_is_string(key)
  return _NumericColumn(
      key,
      shape=shape,
      default_value=default_value,
      dtype=dtype,
      normalizer_fn=normalizer_fn)
def _bucketized_column(source_column, boundaries):
  """Represents discretized dense input.

  Buckets include the left boundary, and exclude the right boundary. Namely,
  `boundaries=[0., 1., 2.]` generates buckets `(-inf, 0.)`, `[0., 1.)`,
  `[1., 2.)`, and `[2., +inf)`.

  For example, if the inputs are

  ```python
  boundaries = [0, 10, 100]
  input tensor = [[-5, 10000]
                  [150, 10]
                  [5, 100]]
  ```

  then the output will be

  ```python
  output = [[0, 3]
            [3, 2]
            [1, 3]]
  ```

  Example:

  ```python
  price = numeric_column('price')
  bucketized_price = bucketized_column(price, boundaries=[...])
  columns = [bucketized_price, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)

  # or
  columns = [bucketized_price, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  A `bucketized_column` can also be crossed with another categorical column
  using `crossed_column`:

  ```python
  price = numeric_column('price')
  # bucketized_column converts numerical feature to a categorical one.
  bucketized_price = bucketized_column(price, boundaries=[...])
  # 'keywords' is a string feature.
  price_x_keywords = crossed_column([bucketized_price, 'keywords'], 50000)
  columns = [price_x_keywords, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Args:
    source_column: A one-dimensional dense column which is generated with
      `numeric_column`.
    boundaries: A sorted list or tuple of floats specifying the boundaries.

  Returns:
    A `_BucketizedColumn`.

  Raises:
    ValueError: If `source_column` is not a numeric column, or if it is not
      one-dimensional.
    ValueError: If `boundaries` is not a sorted list or tuple.
  """
  if not isinstance(source_column, _NumericColumn):
    raise ValueError(
        'source_column must be a column generated with numeric_column(). '
        'Given: {}'.format(source_column))
  if len(source_column.shape) > 1:
    raise ValueError('source_column must be one-dimensional column. '
                     'Given: {}'.format(source_column))
  if (not boundaries or
      not (isinstance(boundaries, list) or isinstance(boundaries, tuple))):
    raise ValueError('boundaries must be a sorted list.')
  for i in range(len(boundaries) - 1):
    if boundaries[i] >= boundaries[i + 1]:
      raise ValueError('boundaries must be a sorted list.')
  return _BucketizedColumn(source_column, tuple(boundaries))
def _categorical_column_with_hash_bucket(key,
                                         hash_bucket_size,
                                         dtype=dtypes.string):
  """Represents a sparse feature where ids are set by hashing.

  Use this when your sparse features are in string or integer format, and you
  want to distribute your inputs into a finite number of buckets by hashing:
  output_id = Hash(input_feature_string) % bucket_size for string-type input.
  For int-type input, the value is converted to its string representation
  first and then hashed by the same formula.

  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for
  int and `''` for string, which will be dropped by this feature column.

  Example:

  ```python
  keywords = categorical_column_with_hash_bucket("keywords", 10000)
  columns = [keywords, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)

  # or
  keywords_embedded = embedding_column(keywords, 16)
  columns = [keywords_embedded, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    hash_bucket_size: An int >= 1. The number of buckets.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `_HashedCategoricalColumn`.

  Raises:
    ValueError: `hash_bucket_size` is less than one.
    ValueError: `dtype` is neither string nor integer.
  """
  if hash_bucket_size is None:
    raise ValueError('hash_bucket_size must be set. key: {}'.format(key))

  if hash_bucket_size < 1:
    raise ValueError('hash_bucket_size must be at least 1. '
                     'hash_bucket_size: {}, key: {}'.format(
                         hash_bucket_size, key))

  fc_utils.assert_key_is_string(key)
  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))

  return _HashedCategoricalColumn(key, hash_bucket_size, dtype)
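
# A rough sketch of the transformation (assumed from the surrounding module,
# not a spec of the exact kernel): string values are bucketed as in
#   string_ops.string_to_hash_bucket_fast(values, hash_bucket_size)
# and integer values are first converted with `string_ops.as_string`.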
def _categorical_column_with_vocabulary_file(key,
                                             vocabulary_file,
                                             vocabulary_size=None,
                                             num_oov_buckets=0,
                                             default_value=None,
                                             dtype=dtypes.string):
  """A `_CategoricalColumn` with a vocabulary file.

  Use this when your inputs are in string or integer format, and you have a
  vocabulary file that maps each value to an integer ID. By default,
  out-of-vocabulary values are ignored. Use either (but not both) of
  `num_oov_buckets` and `default_value` to specify how to include
  out-of-vocabulary values.

  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for
  int and `''` for string, which will be dropped by this feature column.

  Example with `num_oov_buckets`:
  File '/us/states.txt' contains 50 lines, each with a 2-character U.S. state
  abbreviation. All inputs with values in that file are assigned an ID 0-49,
  corresponding to their line numbers. All other values are hashed and
  assigned an ID 50-54.

  ```python
  states = categorical_column_with_vocabulary_file(
      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
      num_oov_buckets=5)
  columns = [states, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Example with `default_value`:
  File '/us/states.txt' contains 51 lines - the first line is 'XX', and the
  other 50 each have a 2-character U.S. state abbreviation. Both a literal
  'XX' in input, and other values missing from the file, will be assigned ID
  0. All others are assigned the corresponding line number 1-50.

  ```python
  states = categorical_column_with_vocabulary_file(
      key='states', vocabulary_file='/us/states.txt', vocabulary_size=51,
      default_value=0)
  columns = [states, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  And to make an embedding with either:

  ```python
  columns = [embedding_column(states, 3), ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    vocabulary_file: The vocabulary file name.
    vocabulary_size: Number of elements in the vocabulary. This must be no
      greater than the number of lines in `vocabulary_file`; if it is less,
      later values are ignored. If `None`, it is set to the length of
      `vocabulary_file`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
      `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
      the input value. A positive `num_oov_buckets` cannot be specified with
      `default_value`.
    default_value: The integer ID value to return for out-of-vocabulary
      feature values, defaults to `-1`. This cannot be specified with a
      positive `num_oov_buckets`.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `_CategoricalColumn` with a vocabulary file.

  Raises:
    ValueError: `vocabulary_file` is missing or cannot be opened.
    ValueError: `vocabulary_size` is missing or < 1.
    ValueError: `num_oov_buckets` is a negative integer.
    ValueError: `num_oov_buckets` and `default_value` are both specified.
    ValueError: `dtype` is neither string nor integer.
  """
  if not vocabulary_file:
    raise ValueError('Missing vocabulary_file in {}.'.format(key))

  if vocabulary_size is None:
    if not gfile.Exists(vocabulary_file):
      raise ValueError('vocabulary_file in {} does not exist.'.format(key))

    with gfile.GFile(vocabulary_file) as f:
      vocabulary_size = sum(1 for _ in f)
    logging.info(
        'vocabulary_size = %d in %s is inferred from the number of elements '
        'in the vocabulary_file %s.', vocabulary_size, key, vocabulary_file)

  # `vocabulary_size` isn't required for lookup, but it is for `_num_buckets`.
  if vocabulary_size < 1:
    raise ValueError('Invalid vocabulary_size in {}.'.format(key))
  if num_oov_buckets:
    if default_value is not None:
      raise ValueError(
          'Can\'t specify both num_oov_buckets and default_value in {}.'
          .format(key))
    if num_oov_buckets < 0:
      raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
          num_oov_buckets, key))
  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
  fc_utils.assert_key_is_string(key)
  return _VocabularyFileCategoricalColumn(
      key=key,
      vocabulary_file=vocabulary_file,
      vocabulary_size=vocabulary_size,
      num_oov_buckets=0 if num_oov_buckets is None else num_oov_buckets,
      default_value=-1 if default_value is None else default_value,
      dtype=dtype)
def _categorical_column_with_vocabulary_list(key,
                                             vocabulary_list,
                                             dtype=None,
                                             default_value=-1,
                                             num_oov_buckets=0):
  """A `_CategoricalColumn` with in-memory vocabulary.

  Use this when your inputs are in string or integer format, and you have an
  in-memory vocabulary mapping each value to an integer ID. By default,
  out-of-vocabulary values are ignored. Use either (but not both) of
  `num_oov_buckets` and `default_value` to specify how to include
  out-of-vocabulary values.

  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for
  int and `''` for string, which will be dropped by this feature column.

  Example with `num_oov_buckets`:
  In the following example, each input in `vocabulary_list` is assigned an ID
  0-3 corresponding to its index (e.g., input 'B' produces output 2). All
  other inputs are hashed and assigned an ID 4-5.

  ```python
  colors = categorical_column_with_vocabulary_list(
      key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
      num_oov_buckets=2)
  columns = [colors, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Example with `default_value`:
  In the following example, each input in `vocabulary_list` is assigned an ID
  0-4 corresponding to its index (e.g., input 'B' produces output 3). All
  other inputs are assigned `default_value` 0.

  ```python
  colors = categorical_column_with_vocabulary_list(
      key='colors', vocabulary_list=('X', 'R', 'G', 'B', 'Y'), default_value=0)
  columns = [colors, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  And to make an embedding with either:

  ```python
  columns = [embedding_column(colors, 3), ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    vocabulary_list: An ordered iterable defining the vocabulary. Each feature
      is mapped to the index of its value (if present) in `vocabulary_list`.
      Must be castable to `dtype`.
    dtype: The type of features. Only string and integer types are supported.
      If `None`, it will be inferred from `vocabulary_list`.
    default_value: The integer ID value to return for out-of-vocabulary
      feature values, defaults to `-1`. This cannot be specified with a
      positive `num_oov_buckets`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
      `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on
      a hash of the input value. A positive `num_oov_buckets` cannot be
      specified with `default_value`.

  Returns:
    A `_CategoricalColumn` with in-memory vocabulary.

  Raises:
    ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
    ValueError: `num_oov_buckets` is a negative integer.
    ValueError: `num_oov_buckets` and `default_value` are both specified.
    ValueError: if `dtype` is not integer or string.
  """
  if (vocabulary_list is None) or (len(vocabulary_list) < 1):
    raise ValueError(
        'vocabulary_list {} must be non-empty, column_name: {}'.format(
            vocabulary_list, key))
  if len(set(vocabulary_list)) != len(vocabulary_list):
    raise ValueError(
        'Duplicate keys in vocabulary_list {}, column_name: {}'.format(
            vocabulary_list, key))
  vocabulary_dtype = dtypes.as_dtype(np.array(vocabulary_list).dtype)
  if num_oov_buckets:
    if default_value != -1:
      raise ValueError(
          'Can\'t specify both num_oov_buckets and default_value in {}.'
          .format(key))
    if num_oov_buckets < 0:
      raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
          num_oov_buckets, key))
  fc_utils.assert_string_or_int(
      vocabulary_dtype, prefix='column_name: {} vocabulary'.format(key))
  if dtype is None:
    dtype = vocabulary_dtype
  elif dtype.is_integer != vocabulary_dtype.is_integer:
    raise ValueError(
        'dtype {} and vocabulary dtype {} do not match, column_name: '
        '{}'.format(dtype, vocabulary_dtype, key))
  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
  fc_utils.assert_key_is_string(key)

  return _VocabularyListCategoricalColumn(
      key=key,
      vocabulary_list=tuple(vocabulary_list),
      dtype=dtype,
      default_value=default_value,
      num_oov_buckets=num_oov_buckets)
1409def _categorical_column_with_identity(key, num_buckets, default_value=None):
1410 """A `_CategoricalColumn` that returns identity values.
1412 Use this when your inputs are integers in the range `[0, num_buckets)`, and
1413 you want to use the input value itself as the categorical ID. Values outside
1414 this range will result in `default_value` if specified, otherwise it will
1415 fail.
1417 Typically, this is used for contiguous ranges of integer indexes, but
1418 it doesn't have to be. This might be inefficient, however, if many of IDs
1419 are unused. Consider `categorical_column_with_hash_bucket` in that case.
1421 For input dictionary `features`, `features[key]` is either `Tensor` or
1422 `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
1423 and `''` for string, which will be dropped by this feature column.
1425 In the following examples, each input in the range `[0, 1000000)` is assigned
1426 the same value. All other inputs are assigned `default_value` 0. Note that a
1427 literal 0 in inputs will result in the same default ID.
1429 Linear model:
1431 ```python
1432 video_id = categorical_column_with_identity(
1433 key='video_id', num_buckets=1000000, default_value=0)
1434 columns = [video_id, ...]
1435 features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1436 linear_prediction, _, _ = linear_model(features, columns)
1437 ```
1439 Embedding for a DNN model:
1441 ```python
1442 columns = [embedding_column(video_id, 9),...]
1443 features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1444 dense_tensor = input_layer(features, columns)
1445 ```
1447 Args:
1448 key: A unique string identifying the input feature. It is used as the column
1449 name and the dictionary key for feature parsing configs, feature `Tensor`
1450 objects, and feature columns.
1451 num_buckets: Range of inputs and outputs is `[0, num_buckets)`.
1452 default_value: If set, values outside of range `[0, num_buckets)` will be
1453 replaced with this value. If not set, values >= num_buckets will cause a
1454 failure while values < 0 will be dropped.
1456 Returns:
1457 A `_CategoricalColumn` that returns identity values.
1459 Raises:
1460 ValueError: if `num_buckets` is less than one.
1461 ValueError: if `default_value` is not in range `[0, num_buckets)`.
1462 """
1463 if num_buckets < 1:
1464 raise ValueError('num_buckets {} < 1, column_name {}'.format(
1465 num_buckets, key))
1466 if (default_value is not None) and ((default_value < 0) or
1467 (default_value >= num_buckets)):
1468 raise ValueError(
1469 'default_value {} not in range [0, {}), column_name {}'.format(
1470 default_value, num_buckets, key))
1471 fc_utils.assert_key_is_string(key)
1472 return _IdentityCategoricalColumn(
1473 key=key, num_buckets=num_buckets, default_value=default_value)
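# Editorial sketch (not part of the original file): how `default_value`
# interacts with the identity mapping described above. 'video_id' is a
# hypothetical feature key.
def _example_identity_column():
  column = _categorical_column_with_identity(
      key='video_id', num_buckets=4, default_value=0)
  # At transform time, inputs 0..3 map to themselves, while an out-of-range
  # input such as 7 is replaced with the default ID 0.
  assert column._num_buckets == 4
  return column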
1476def _indicator_column(categorical_column):
1477 """Represents multi-hot representation of given categorical column.
1479 - For a DNN model, `indicator_column` can be used to wrap any
1480 `categorical_column_*` (e.g., to feed to the DNN). Consider using
1481 `embedding_column` if the number of buckets/unique values is large.
1483 - For a wide (aka linear) model, `indicator_column` is the internal
1484 representation of a categorical column when the categorical column is
1485 passed directly (as any element in feature_columns) to `linear_model`. See
1486 `linear_model` for details.
1488 ```python
1489 name = indicator_column(categorical_column_with_vocabulary_list(
1490 'name', ['bob', 'george', 'wanda']))
1491 columns = [name, ...]
1492 features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1493 dense_tensor = input_layer(features, columns)
1495 dense_tensor == [[1, 0, 0]] # If "name" bytes_list is ["bob"]
1496 dense_tensor == [[1, 0, 1]] # If "name" bytes_list is ["bob", "wanda"]
1497 dense_tensor == [[2, 0, 0]] # If "name" bytes_list is ["bob", "bob"]
1498 ```
1500 Args:
1501 categorical_column: A `_CategoricalColumn` which is created by
1502 `categorical_column_with_*` or `crossed_column` functions.
1504 Returns:
1505 An `_IndicatorColumn`.
1506 """
1507 return _IndicatorColumn(categorical_column)
1510def _weighted_categorical_column(categorical_column,
1511 weight_feature_key,
1512 dtype=dtypes.float32):
1513 """Applies weight values to a `_CategoricalColumn`.
1515 Use this when each of your sparse inputs has both an ID and a value. For
1516 example, if you're representing text documents as a collection of word
1517 frequencies, you can provide 2 parallel sparse input features ('terms' and
1518 'frequencies' below).
1520 Example:
1522 Input `tf.Example` objects:
1524 ```proto
1525 [
1526 features {
1527 feature {
1528 key: "terms"
1529 value {bytes_list {value: "very" value: "model"}}
1530 }
1531 feature {
1532 key: "frequencies"
1533 value {float_list {value: 0.3 value: 0.1}}
1534 }
1535 },
1536 features {
1537 feature {
1538 key: "terms"
1539 value {bytes_list {value: "when" value: "course" value: "human"}}
1540 }
1541 feature {
1542 key: "frequencies"
1543 value {float_list {value: 0.4 value: 0.1 value: 0.2}}
1544 }
1545 }
1546 ]
1547 ```
1549 ```python
1550 categorical_column = categorical_column_with_hash_bucket(
1551 column_name='terms', hash_bucket_size=1000)
1552 weighted_column = weighted_categorical_column(
1553 categorical_column=categorical_column, weight_feature_key='frequencies')
1554 columns = [weighted_column, ...]
1555 features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1556 linear_prediction, _, _ = linear_model(features, columns)
1557 ```
1559 This assumes the input dictionary contains a `SparseTensor` for key
1560 'terms', and a `SparseTensor` for key 'frequencies'. These 2 tensors must have
1561 the same indices and dense shape.
1563 Args:
1564 categorical_column: A `_CategoricalColumn` created by
1565 `categorical_column_with_*` functions.
1566 weight_feature_key: String key for weight values.
1567 dtype: Type of weights, such as `tf.float32`. Only float and integer weights
1568 are supported.
1570 Returns:
1571 A `_CategoricalColumn` composed of two sparse features: one representing the
1572 IDs, the other the weight (value) of each ID in that example.
1574 Raises:
1575 ValueError: if `dtype` is not convertible to float.
1576 """
1577 if (dtype is None) or not (dtype.is_integer or dtype.is_floating):
1578 raise ValueError('dtype {} is not convertible to float.'.format(dtype))
1579 return _WeightedCategoricalColumn(
1580 categorical_column=categorical_column,
1581 weight_feature_key=weight_feature_key,
1582 dtype=dtype)
1585def _crossed_column(keys, hash_bucket_size, hash_key=None):
1586 """Returns a column for performing crosses of categorical features.
1588 Crossed features are hashed according to `hash_bucket_size`. Conceptually,
1589 the transformation can be thought of as:
1590 Hash(cartesian product of features) % `hash_bucket_size`
1592 For example, if the input features are:
1594 * SparseTensor referred by first key:
1596 ```python
1597 shape = [2, 2]
1598 {
1599 [0, 0]: "a"
1600 [1, 0]: "b"
1601 [1, 1]: "c"
1602 }
1603 ```
1605 * SparseTensor referred by second key:
1607 ```python
1608 shape = [2, 1]
1609 {
1610 [0, 0]: "d"
1611 [1, 0]: "e"
1612 }
1613 ```
1615 then the crossed feature will look like:
1617 ```python
1618 shape = [2, 2]
1619 {
1620 [0, 0]: Hash64("d", Hash64("a")) % hash_bucket_size
1621 [1, 0]: Hash64("e", Hash64("b")) % hash_bucket_size
1622 [1, 1]: Hash64("e", Hash64("c")) % hash_bucket_size
1623 }
1624 ```
1626 Here is an example to create a linear model with crosses of string features:
1628 ```python
1629 keywords_x_doc_terms = crossed_column(['keywords', 'doc_terms'], 50000)
1630 columns = [keywords_x_doc_terms, ...]
1631 features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1632 linear_prediction = linear_model(features, columns)
1633 ```
1635 You could also use vocabulary lookup before crossing:
1637 ```python
1638 keywords = categorical_column_with_vocabulary_file(
1639 'keywords', '/path/to/vocabulary/file', vocabulary_size=1000)
1640 keywords_x_doc_terms = crossed_column([keywords, 'doc_terms'], 50000)
1641 columns = [keywords_x_doc_terms, ...]
1642 features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1643 linear_prediction = linear_model(features, columns)
1644 ```
1646 If an input feature is of numeric type, you can use
1647 `categorical_column_with_identity`, or `bucketized_column`, as in the example:
1649 ```python
1650 # vertical_id is an integer categorical feature.
1651 vertical_id = categorical_column_with_identity('vertical_id', 10000)
1652 price = numeric_column('price')
1653 # bucketized_column converts numerical feature to a categorical one.
1654 bucketized_price = bucketized_column(price, boundaries=[...])
1655 vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50000)
1656 columns = [vertical_id_x_price, ...]
1657 features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1658 linear_prediction = linear_model(features, columns)
1659 ```
1661 To use a crossed column in a DNN model, you need to wrap it in an embedding
1662 column, as in this example:
1664 ```python
1665 vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50000)
1666 vertical_id_x_price_embedded = embedding_column(vertical_id_x_price, 10)
1667 dense_tensor = input_layer(features, [vertical_id_x_price_embedded, ...])
1668 ```
1670 Args:
1671 keys: An iterable identifying the features to be crossed. Each element can
1672 be either:
1673 * string: Uses the corresponding feature which must be of string type.
1674 * `_CategoricalColumn`: Uses the transformed tensor produced by this
1675 column. Does not support hashed categorical column.
1676 hash_bucket_size: An int > 1. The number of buckets.
1677 hash_key: Optional. The hash key that will be used by the `FingerprintCat64`
1678 function to combine the fingerprints of the crossed values in SparseCrossOp.
1680 Returns:
1681 A `_CrossedColumn`.
1683 Raises:
1684 ValueError: If `len(keys) < 2`.
1685 ValueError: If any of the keys is neither a string nor `_CategoricalColumn`.
1686 ValueError: If any of the keys is `_HashedCategoricalColumn`.
1687 ValueError: If `hash_bucket_size < 1`.
1688 """
1689 if not hash_bucket_size or hash_bucket_size < 1:
1690 raise ValueError('hash_bucket_size must be at least 1. '
1691 'hash_bucket_size: {}'.format(hash_bucket_size))
1692 if not keys or len(keys) < 2:
1693 raise ValueError(
1694 'keys must be a list with length > 1. Given: {}'.format(keys))
1695 for key in keys:
1696 if (not isinstance(key, six.string_types) and
1697 not isinstance(key, _CategoricalColumn)):
1698 raise ValueError(
1699 'Unsupported key type. All keys must be either a string or a '
1700 'categorical column other than _HashedCategoricalColumn. '
1701 'Given: {}'.format(key))
1702 if isinstance(key, _HashedCategoricalColumn):
1703 raise ValueError(
1704 'categorical_column_with_hash_bucket is not supported for crossing. '
1705 'Hashing before crossing will increase probability of collision. '
1706 'Instead, use the feature name as a string. Given: {}'.format(key))
1707 return _CrossedColumn(
1708 keys=tuple(keys), hash_bucket_size=hash_bucket_size, hash_key=hash_key)
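# Editorial sketch (not part of the original file): a pure-Python analogue of
# the conceptual transformation documented above -- hash the cartesian product
# of one example's values, then take the modulus. Python's built-in hash()
# stands in for Hash64/FingerprintCat64, so the resulting bucket IDs will not
# match the real SparseCrossOp output.
def _example_conceptual_cross(row_values_a, row_values_b, hash_bucket_size):
  return [
      hash((a, b)) % hash_bucket_size  # stand-in for Hash64(b, Hash64(a))
      for a in row_values_a for b in row_values_b
  ]

# _example_conceptual_cross(['b', 'c'], ['e'], 1000) yields two bucket IDs,
# mirroring cells [1, 0] and [1, 1] of the docstring example above.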
1711# TODO(rohanj): Clearly define semantics of this layer.
1712class _EmbeddingColumnLayer(base.Layer):
1713 """A layer that stores all the state required for a embedding column."""
1715 def __init__(self,
1716 embedding_shape,
1717 initializer,
1718 weight_collections=None,
1719 trainable=True,
1720 name=None,
1721 **kwargs):
1722 """Constructor.
1724 Args:
1725 embedding_shape: Shape of the embedding variable used for lookup.
1726 initializer: A variable initializer function to be used in embedding
1727 variable initialization.
1728 weight_collections: A list of collection names to which the Variable will
1729 be added. Note that variables will also be added to the collections
1730 `tf.GraphKeys.GLOBAL_VARIABLES` and `tf.GraphKeys.MODEL_VARIABLES`.
1731 trainable: If `True` also add the variable to the graph collection
1732 `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
1733 name: Name of the layer.
1734 **kwargs: keyword named properties.
1735 """
1736 super(_EmbeddingColumnLayer, self).__init__(
1737 trainable=trainable, name=name, **kwargs)
1738 self._embedding_shape = embedding_shape
1739 self._initializer = initializer
1740 self._weight_collections = weight_collections
1742 def set_weight_collections(self, weight_collections):
1743 """Sets the weight collections for the layer.
1745 Args:
1746 weight_collections: A list of collection names to which the Variable will
1747 be added.
1748 """
1749 self._weight_collections = weight_collections
1751 def build(self, _):
1752 self._embedding_weight_var = self.add_variable(
1753 name='embedding_weights',
1754 shape=self._embedding_shape,
1755 dtype=dtypes.float32,
1756 initializer=self._initializer,
1757 trainable=self.trainable)
1758 if self._weight_collections and not context.executing_eagerly():
1759 _add_to_collections(self._embedding_weight_var, self._weight_collections)
1760 self.built = True
1762 def call(self, _):
1763 return self._embedding_weight_var
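# Editorial sketch (not part of the original file): exercising this private
# layer on its own. The 10x4 shape and the initializer are arbitrary choices
# for illustration.
def _example_embedding_column_layer():
  layer = _EmbeddingColumnLayer(
      embedding_shape=(10, 4),
      initializer=init_ops.truncated_normal_initializer(stddev=0.5))
  layer.build(None)        # Materializes the 'embedding_weights' variable.
  return layer.call(None)  # The layer simply returns that variable.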
1766@six.add_metaclass(abc.ABCMeta)
1767class _FeatureColumn(object):
1768 """Represents a feature column abstraction.
1770 WARNING: Do not subclass this layer unless you know what you are doing:
1771 the API is subject to future changes.
1773 To distinguish the concept of a feature family and a specific binary feature
1774 within a family, we refer to a feature family like "country" as a feature
1775 column. Following is an example feature in a `tf.Example` format:
1776 {key: "country", value: [ "US" ]}
1777 In this example the value of the feature is "US" and "country" refers to
1778 the column of the feature.
1780 This class is an abstract class. Users should not create instances of it.
1781 """
1783 @abc.abstractproperty
1784 def name(self):
1785 """Returns string. Used for naming and for name_scope."""
1786 pass
1788 def __lt__(self, other):
1789 """Allows feature columns to be sorted in Python 3 as they are in Python 2.
1791 Feature columns need to occasionally be sortable, for example when used as
1792 keys in a features dictionary passed to a layer.
1794 In CPython, `__lt__` must be defined for all objects in the
1795 sequence being sorted. If any objects do not have an `__lt__` compatible
1796 with feature column objects (such as strings), then CPython will fall back
1797 to using the `__gt__` method below.
1798 https://docs.python.org/3/library/stdtypes.html#list.sort
1800 Args:
1801 other: The other object to compare to.
1803 Returns:
1804 True if the string representation of this object is lexicographically less
1805 than the string representation of `other`. For FeatureColumn objects,
1806 this looks like "<__main__.FeatureColumn object at 0xa>".
1807 """
1808 return str(self) < str(other)
1810 def __gt__(self, other):
1811 """Allows feature columns to be sorted in Python 3 as they are in Python 2.
1813 Feature columns need to occasionally be sortable, for example when used as
1814 keys in a features dictionary passed to a layer.
1816 `__gt__` is called when the "other" object being compared during the sort
1817 does not have `__lt__` defined.
1818 Example:
1819 ```
1820 # __lt__ only class
1821 class A():
1822 def __lt__(self, other): return str(self) < str(other)
1824 a = A()
1825 a < "b" # True
1826 "0" < a # Error
1828 # __lt__ and __gt__ class
1829 class B():
1830 def __lt__(self, other): return str(self) < str(other)
1831 def __gt__(self, other): return str(self) > str(other)
1833 b = B()
1834 b < "c" # True
1835 "0" < b # True
1836 ```
1839 Args:
1840 other: The other object to compare to.
1842 Returns:
1843 True if the string representation of this object is lexicographically
1844 greater than the string representation of `other`. For FeatureColumn
1845 objects, this looks like "<__main__.FeatureColumn object at 0xa>".
1846 """
1847 return str(self) > str(other)
1849 @property
1850 def _var_scope_name(self):
1851 """Returns string. Used for variable_scope. Defaults to self.name."""
1852 return self.name
1854 @abc.abstractmethod
1855 def _transform_feature(self, inputs):
1856 """Returns intermediate representation (usually a `Tensor`).
1858 Uses `inputs` to create an intermediate representation (usually a `Tensor`)
1859 that other feature columns can use.
1861 Example usage of `inputs`:
1862 Let's say a Feature column depends on raw feature ('raw') and another
1863 `_FeatureColumn` (input_fc). To access the corresponding `Tensor`s, `inputs`
1864 will be used as follows:
1866 ```python
1867 raw_tensor = inputs.get('raw')
1868 fc_tensor = inputs.get(input_fc)
1869 ```
1871 Args:
1872 inputs: A `_LazyBuilder` object to access inputs.
1874 Returns:
1875 Transformed feature `Tensor`.
1876 """
1877 pass
1879 @abc.abstractproperty
1880 def _parse_example_spec(self):
1881 """Returns a `tf.Example` parsing spec as dict.
1883 It is used to get the parsing spec for `tf.io.parse_example`. The returned
1884 spec is a dict from keys ('string') to `VarLenFeature`, `FixedLenFeature`,
1885 and other supported objects. Please check the documentation of
1886 `tf.io.parse_example` for all supported spec objects.
1888 Let's say a Feature column depends on raw feature ('raw') and another
1889 `_FeatureColumn` (input_fc). One possible implementation of
1890 _parse_example_spec is as follows:
1892 ```python
1893 spec = {'raw': tf.io.FixedLenFeature(...)}
1894 spec.update(input_fc._parse_example_spec)
1895 return spec
1896 ```
1897 """
1898 pass
1900 def _reset_config(self):
1901 """Resets the configuration in the column.
1903 Some feature columns, e.g. embedding or shared embedding columns, might
1904 have state that occasionally needs to be reset. Use this method in that
1905 scenario.
1906 """
1909class _DenseColumn(_FeatureColumn):
1910 """Represents a column which can be represented as `Tensor`.
1912 WARNING: Do not subclass this layer unless you know what you are doing:
1913 the API is subject to future changes.
1915 Some examples of this type are: numeric_column, embedding_column,
1916 indicator_column.
1917 """
1919 @abc.abstractproperty
1920 def _variable_shape(self):
1921 """`TensorShape` of `_get_dense_tensor`, without batch dimension."""
1922 pass
1924 @abc.abstractmethod
1925 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
1926 """Returns a `Tensor`.
1928 The output of this function will be used by model-builder-functions. For
1929 example the pseudo code of `input_layer` will be like:
1931 ```python
1932 def input_layer(features, feature_columns, ...):
1933 outputs = [fc._get_dense_tensor(...) for fc in feature_columns]
1934 return tf.concat(outputs)
1935 ```
1937 Args:
1938 inputs: A `_LazyBuilder` object to access inputs.
1939 weight_collections: List of graph collections to which Variables (if any
1940 are created) are added.
1941 trainable: If `True` also add variables to the graph collection
1942 `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
1944 Returns:
1945 `Tensor` of shape [batch_size] + `_variable_shape`.
1946 """
1947 pass
1950def _create_weighted_sum(column,
1951 builder,
1952 units,
1953 sparse_combiner,
1954 weight_collections,
1955 trainable,
1956 weight_var=None):
1957 """Creates a weighted sum for a dense/categorical column for linear_model."""
1958 if isinstance(column, _CategoricalColumn):
1959 return _create_categorical_column_weighted_sum(
1960 column=column,
1961 builder=builder,
1962 units=units,
1963 sparse_combiner=sparse_combiner,
1964 weight_collections=weight_collections,
1965 trainable=trainable,
1966 weight_var=weight_var)
1967 else:
1968 return _create_dense_column_weighted_sum(
1969 column=column,
1970 builder=builder,
1971 units=units,
1972 weight_collections=weight_collections,
1973 trainable=trainable,
1974 weight_var=weight_var)
1977def _create_dense_column_weighted_sum(column,
1978 builder,
1979 units,
1980 weight_collections,
1981 trainable,
1982 weight_var=None):
1983 """Create a weighted sum of a dense column for linear_model."""
1984 tensor = column._get_dense_tensor( # pylint: disable=protected-access
1985 builder,
1986 weight_collections=weight_collections,
1987 trainable=trainable)
1988 num_elements = column._variable_shape.num_elements() # pylint: disable=protected-access
1989 batch_size = array_ops.shape(tensor)[0]
1990 tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
1991 if weight_var is not None:
1992 weight = weight_var
1993 else:
1994 weight = variable_scope.get_variable(
1995 name='weights',
1996 shape=[num_elements, units],
1997 initializer=init_ops.zeros_initializer(),
1998 trainable=trainable,
1999 collections=weight_collections)
2000 return math_ops.matmul(tensor, weight, name='weighted_sum')
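# Editorial sketch (not part of the original file): the shape bookkeeping the
# helper above performs, in isolation. A dense column whose `_variable_shape`
# has N elements is flattened to (batch_size, N) and multiplied by an
# (N, units) weight matrix, yielding a (batch_size, units) result.
def _example_dense_weighted_sum(tensor, weight):
  batch_size = array_ops.shape(tensor)[0]
  flattened = array_ops.reshape(tensor, shape=(batch_size, -1))
  return math_ops.matmul(flattened, weight)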
2003class _CategoricalColumn(_FeatureColumn):
2004 """Represents a categorical feature.
2006 WARNING: Do not subclass this layer unless you know what you are doing:
2007 the API is subject to future changes.
2009 A categorical feature is typically handled with a `tf.sparse.SparseTensor`
2010 of IDs.
2011 """
2013 IdWeightPair = collections.namedtuple( # pylint: disable=invalid-name
2014 'IdWeightPair', ['id_tensor', 'weight_tensor'])
2016 @abc.abstractproperty
2017 def _num_buckets(self):
2018 """Returns number of buckets in this sparse feature."""
2019 pass
2021 @abc.abstractmethod
2022 def _get_sparse_tensors(self,
2023 inputs,
2024 weight_collections=None,
2025 trainable=None):
2026 """Returns an IdWeightPair.
2028 `IdWeightPair` is a pair of `SparseTensor`s which represents ids and
2029 weights.
2031 `IdWeightPair.id_tensor` is typically a `batch_size` x `num_buckets`
2032 `SparseTensor` of `int64`. `IdWeightPair.weight_tensor` is either a
2033 `SparseTensor` of `float` or `None` to indicate all weights should be
2034 taken to be 1. If specified, `weight_tensor` must have exactly the same
2035 shape and indices as `id_tensor`. The expected `SparseTensor` is the same
2036 as the parsing output of a `VarLenFeature`, which is a ragged matrix.
2038 Args:
2039 inputs: A `LazyBuilder` as a cache to get input tensors required to create
2040 `IdWeightPair`.
2041 weight_collections: List of graph collections to which variables (if any
2042 are created) are added.
2043 trainable: If `True` also add variables to the graph collection
2044 `GraphKeys.TRAINABLE_VARIABLES` (see `tf.compat.v1.get_variable`).
2045 """
2046 pass
2049def _create_categorical_column_weighted_sum(column,
2050 builder,
2051 units,
2052 sparse_combiner,
2053 weight_collections,
2054 trainable,
2055 weight_var=None):
2056 # pylint: disable=g-doc-return-or-yield,g-doc-args
2057 """Create a weighted sum of a categorical column for linear_model.
2059 Note to maintainers: as an implementation detail, the weighted sum is
2060 implemented via embedding_lookup_sparse for efficiency. Mathematically,
2061 the two are the same.
2063 To be specific: conceptually, a categorical column can be treated as a
2064 multi-hot vector. Say:
2066 ```python
2067 x = [0 0 1] # categorical column input
2068 w = [a b c] # weights
2069 ```
2070 The weighted sum is `c` in this case, which is the same as `w[2]`.
2072 Another example is
2074 ```python
2075 x = [0 1 1] # categorical column input
2076 w = [a b c] # weights
2077 ```
2078 The weighted sum is `b + c` in this case, which is the same as `w[1] + w[2]`.
2080 In both cases, we can implement the weighted sum via embedding_lookup_sparse
2081 with sparse_combiner = "sum".
2082 """
2084 sparse_tensors = column._get_sparse_tensors( # pylint: disable=protected-access
2085 builder,
2086 weight_collections=weight_collections,
2087 trainable=trainable)
2088 id_tensor = sparse_ops.sparse_reshape(
2089 sparse_tensors.id_tensor,
2090 [array_ops.shape(sparse_tensors.id_tensor)[0], -1])
2091 weight_tensor = sparse_tensors.weight_tensor
2092 if weight_tensor is not None:
2093 weight_tensor = sparse_ops.sparse_reshape(
2094 weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
2096 if weight_var is not None:
2097 weight = weight_var
2098 else:
2099 weight = variable_scope.get_variable(
2100 name='weights',
2101 shape=(column._num_buckets, units), # pylint: disable=protected-access
2102 initializer=init_ops.zeros_initializer(),
2103 trainable=trainable,
2104 collections=weight_collections)
2105 return embedding_ops.safe_embedding_lookup_sparse(
2106 weight,
2107 id_tensor,
2108 sparse_weights=weight_tensor,
2109 combiner=sparse_combiner,
2110 name='weighted_sum')
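# Editorial sketch (not part of the original file): the docstring's multi-hot
# claim in plain Python. Summing the weights at the active indices equals the
# dot product of the multi-hot vector with the weight vector, which is what
# embedding_lookup_sparse with a 'sum' combiner computes per example.
def _example_multi_hot_weighted_sum(active_ids, w):
  # E.g. active_ids=[1, 2] and w=[a, b, c] gives w[1] + w[2] == b + c.
  return sum(w[i] for i in active_ids)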
2113class _SequenceDenseColumn(_FeatureColumn):
2114 """Represents dense sequence data."""
2116 TensorSequenceLengthPair = collections.namedtuple( # pylint: disable=invalid-name
2117 'TensorSequenceLengthPair', ['dense_tensor', 'sequence_length'])
2119 @abc.abstractmethod
2120 def _get_sequence_dense_tensor(self,
2121 inputs,
2122 weight_collections=None,
2123 trainable=None):
2124 """Returns a `TensorSequenceLengthPair`."""
2125 pass
2128class _LazyBuilder(object):
2129 """Handles caching of transformations while building the model.
2131 `_FeatureColumn` specifies how to digest an input column to the network. Some
2132 feature columns require data transformations. This class caches those
2133 transformations.
2135 Some features may be used in more than one place. For example, one can use a
2136 bucketized feature both by itself and in a cross with it. In that case we
2137 should create only one bucketization op instead of creating ops for each
2138 feature column separately. To handle re-use of transformed columns,
2139 `_LazyBuilder` caches all previously transformed columns.
2141 Example:
2142 We're trying to use the following `_FeatureColumn`s:
2144 ```python
2145 bucketized_age = fc.bucketized_column(fc.numeric_column("age"), ...)
2146 keywords = fc.categorical_column_with_hash_bucket("keywords", ...)
2147 age_X_keywords = fc.crossed_column([bucketized_age, "keywords"], ...)
2148 ... = linear_model(features,
2149 [bucketized_age, keywords, age_X_keywords])
2150 ```
2152 If we transform each column independently, then we'll get duplication of
2153 bucketization (one for the cross, one for the bucketized column itself).
2154 The `_LazyBuilder` eliminates this duplication.
2155 """
2157 def __init__(self, features):
2158 """Creates a `_LazyBuilder`.
2160 Args:
2161 features: A mapping from feature column to objects that are `Tensor` or
2162 `SparseTensor`, or can be converted to same via
2163 `sparse_tensor.convert_to_tensor_or_sparse_tensor`. A `string` key
2164 signifies a base feature (not-transformed). A `_FeatureColumn` key means
2165 that this `Tensor` is the output of an existing `_FeatureColumn` which
2166 can be reused.
2167 """
2168 self._features = features.copy()
2169 self._feature_tensors = {}
2171 def get(self, key):
2172 """Returns a `Tensor` for the given key.
2174 A `str` key is used to access a base feature (not-transformed). When a
2175 `_FeatureColumn` is passed, the transformed feature is returned if it
2176 already exists; otherwise the given `_FeatureColumn` is asked to provide its
2177 transformed output, which is then cached.
2179 Args:
2180 key: a `str` or a `_FeatureColumn`.
2182 Returns:
2183 The transformed `Tensor` corresponding to the `key`.
2185 Raises:
2186 ValueError: if key is not found or a transformed `Tensor` cannot be
2187 computed.
2188 """
2189 if key in self._feature_tensors:
2190 # FeatureColumn is already transformed or converted.
2191 return self._feature_tensors[key]
2193 if key in self._features:
2194 feature_tensor = self._get_raw_feature_as_tensor(key)
2195 self._feature_tensors[key] = feature_tensor
2196 return feature_tensor
2198 if isinstance(key, six.string_types):
2199 raise ValueError('Feature {} is not in features dictionary.'.format(key))
2201 if not isinstance(key, _FeatureColumn):
2202 raise TypeError('"key" must be either a "str" or "_FeatureColumn". '
2203 'Provided: {}'.format(key))
2205 column = key
2206 logging.debug('Transforming feature_column %s.', column)
2207 transformed = column._transform_feature(self) # pylint: disable=protected-access
2208 if transformed is None:
2209 raise ValueError('Column {} is not supported.'.format(column.name))
2210 self._feature_tensors[column] = transformed
2211 return transformed
2213 def _get_raw_feature_as_tensor(self, key):
2214 """Gets the raw_feature (keyed by `key`) as `tensor`.
2216 The raw feature is converted to a (sparse) tensor, expanding dims if needed.
2218 For both `Tensor` and `SparseTensor`, the rank will be expanded (to 2) if
2219 the rank is 1. Dynamic rank is supported as well. A rank-0 raw feature will
2220 error out, as it is not supported.
2222 Args:
2223 key: A `str` key to access the raw feature.
2225 Returns:
2226 A `Tensor` or `SparseTensor`.
2228 Raises:
2229 ValueError: if the raw feature has rank 0.
2230 """
2231 raw_feature = self._features[key]
2232 feature_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
2233 raw_feature)
2235 def expand_dims(input_tensor):
2236 # Input_tensor must have rank 1.
2237 if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
2238 return sparse_ops.sparse_reshape(input_tensor,
2239 [array_ops.shape(input_tensor)[0], 1])
2240 else:
2241 return array_ops.expand_dims(input_tensor, -1)
2243 rank = feature_tensor.get_shape().ndims
2244 if rank is not None:
2245 if rank == 0:
2246 raise ValueError(
2247 'Feature (key: {}) cannot have rank 0. Given: {}'.format(
2248 key, feature_tensor))
2249 return feature_tensor if rank != 1 else expand_dims(feature_tensor)
2251 # Handle dynamic rank.
2252 with ops.control_dependencies([
2253 check_ops.assert_positive(
2254 array_ops.rank(feature_tensor),
2255 message='Feature (key: {}) cannot have rank 0. Given: {}'.format(
2256 key, feature_tensor))
2257 ]):
2258 return cond.cond(
2259 math_ops.equal(1, array_ops.rank(feature_tensor)),
2260 lambda: expand_dims(feature_tensor), lambda: feature_tensor)
2263# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
2264def _shape_offsets(shape):
2265 """Returns moving offset for each dimension given shape."""
2266 offsets = []
2267 for dim in reversed(shape):
2268 if offsets:
2269 offsets.append(dim * offsets[-1])
2270 else:
2271 offsets.append(dim)
2272 offsets.reverse()
2273 return offsets
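# Editorial sketch (not part of the original file): a worked example of the
# offsets computed above. Each entry is the product of that dimension's size
# and every size after it.
def _example_shape_offsets():
  assert _shape_offsets([2, 3, 4]) == [24, 12, 4]
  assert _shape_offsets([5]) == [5]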
2276# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
2277def _to_sparse_input_and_drop_ignore_values(input_tensor, ignore_value=None):
2278 """Converts a `Tensor` to a `SparseTensor`, dropping ignore_value cells.
2280 If `input_tensor` is already a `SparseTensor`, just return it.
2282 Args:
2283 input_tensor: A string or integer `Tensor`.
2284 ignore_value: Entries in `input_tensor` equal to this value will be absent
2285 from the resulting `SparseTensor`. If `None`, the default value of
2286 `input_tensor`'s dtype will be used ('' for `str`, -1 for `int`).
2288 Returns:
2289 A `SparseTensor` with the same shape as `input_tensor`.
2291 Raises:
2292 ValueError: when `input_tensor`'s rank is `None`.
2293 """
2294 input_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
2295 input_tensor)
2296 if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
2297 return input_tensor
2298 with ops.name_scope(None, 'to_sparse_input', (
2299 input_tensor,
2300 ignore_value,
2301 )):
2302 if ignore_value is None:
2303 if input_tensor.dtype == dtypes.string:
2304 # Special case: TF strings are converted to numpy objects by default.
2305 ignore_value = ''
2306 elif input_tensor.dtype.is_integer:
2307 ignore_value = -1 # -1 has a special meaning of missing feature
2308 else:
2309 # NOTE: `as_numpy_dtype` is a property, so with the parentheses this is
2310 # constructing a new numpy object of the given type, which yields the
2311 # default value for that type.
2312 ignore_value = input_tensor.dtype.as_numpy_dtype()
2313 ignore_value = math_ops.cast(
2314 ignore_value, input_tensor.dtype, name='ignore_value')
2315 indices = array_ops.where(
2316 math_ops.not_equal(input_tensor, ignore_value), name='indices')
2317 return sparse_tensor_lib.SparseTensor(
2318 indices=indices,
2319 values=array_ops.gather_nd(input_tensor, indices, name='values'),
2320 dense_shape=array_ops.shape(
2321 input_tensor, out_type=dtypes.int64, name='dense_shape'))
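# Editorial sketch (not part of the original file): for an integer input of
# [[1, -1, 2], [-1, -1, 3]], the default ignore_value of -1 yields a
# SparseTensor with indices [[0, 0], [0, 2], [1, 2]], values [1, 2, 3], and
# dense_shape [2, 3].
def _example_to_sparse(dense_int_tensor):
  # Entries equal to -1 (the integer default) are dropped from the result.
  return _to_sparse_input_and_drop_ignore_values(dense_int_tensor)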
2324def _normalize_feature_columns(feature_columns):
2325 """Normalizes the `feature_columns` input.
2327 This method converts `feature_columns` to list type as best it can. In
2328 addition, it verifies the type and other properties of `feature_columns`
2329 required by the downstream library.
2331 Args:
2332 feature_columns: The raw feature columns, usually passed by users.
2334 Returns:
2335 The normalized feature column list.
2337 Raises:
2338 ValueError: for any invalid inputs, such as empty, duplicated names, etc.
2339 """
2340 if isinstance(feature_columns, _FeatureColumn):
2341 feature_columns = [feature_columns]
2343 if isinstance(feature_columns, collections_abc.Iterator):
2344 feature_columns = list(feature_columns)
2346 if isinstance(feature_columns, dict):
2347 raise ValueError('Expected feature_columns to be iterable, found dict.')
2349 for column in feature_columns:
2350 if not isinstance(column, _FeatureColumn):
2351 raise ValueError('Each item of feature_columns must be a _FeatureColumn. '
2352 'Given (type {}): {}.'.format(type(column), column))
2353 if not feature_columns:
2354 raise ValueError('feature_columns must not be empty.')
2355 name_to_column = {}
2356 for column in feature_columns:
2357 if column.name in name_to_column:
2358 raise ValueError('Duplicate feature column name found for columns: {} '
2359 'and {}. This usually means that these columns refer to '
2360 'the same base feature. Either one must be discarded or a '
2361 'duplicated but renamed item must be inserted in the '
2362 'features dict.'.format(column,
2363 name_to_column[column.name]))
2364 name_to_column[column.name] = column
2366 return feature_columns
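# Editorial sketch (not part of the original file): the normalization above
# accepts a single column, a list, or an iterator, and always hands back a
# list of validated columns. It assumes the `_numeric_column` constructor
# defined earlier in this module; 'age' is a hypothetical feature key.
def _example_normalize_feature_columns():
  age = _numeric_column('age')
  assert _normalize_feature_columns(age) == [age]          # Single column.
  assert _normalize_feature_columns(iter([age])) == [age]  # Iterator.
  return _normalize_feature_columns([age])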
2369class _NumericColumn(
2370 _DenseColumn,
2371 collections.namedtuple(
2372 '_NumericColumn',
2373 ['key', 'shape', 'default_value', 'dtype', 'normalizer_fn'])):
2374 """see `numeric_column`."""
2376 @property
2377 def name(self):
2378 return self.key
2380 @property
2381 def _parse_example_spec(self):
2382 return {
2383 self.key:
2384 parsing_ops.FixedLenFeature(self.shape, self.dtype,
2385 self.default_value)
2386 }
2388 def _transform_feature(self, inputs):
2389 input_tensor = inputs.get(self.key)
2390 if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
2391 raise ValueError(
2392 'The corresponding Tensor of a numeric column must be a Tensor. '
2393 'SparseTensor is not supported. key: {}'.format(self.key))
2394 if self.normalizer_fn is not None:
2395 input_tensor = self.normalizer_fn(input_tensor)
2396 return math_ops.cast(input_tensor, dtypes.float32)
2398 @property
2399 def _variable_shape(self):
2400 return tensor_shape.TensorShape(self.shape)
2402 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
2403 """Returns dense `Tensor` representing numeric feature.
2405 Args:
2406 inputs: A `_LazyBuilder` object to access inputs.
2407 weight_collections: Unused `weight_collections` since no variables are
2408 created in this function.
2409 trainable: Unused `trainable` bool since no variables are created in this
2410 function.
2412 Returns:
2413 Dense `Tensor` created within `_transform_feature`.
2414 """
2415 # Do nothing with weight_collections and trainable since no variables are
2416 # created in this function.
2417 del weight_collections
2418 del trainable
2419 # Feature has been already transformed. Return the intermediate
2420 # representation created by _transform_feature.
2421 return inputs.get(self)
2424class _BucketizedColumn(_DenseColumn, _CategoricalColumn,
2425 collections.namedtuple('_BucketizedColumn',
2426 ['source_column', 'boundaries'])
2427 ):
2428 """See `bucketized_column`."""
2430 @property
2431 def name(self):
2432 return '{}_bucketized'.format(self.source_column.name)
2434 @property
2435 def _parse_example_spec(self):
2436 return self.source_column._parse_example_spec # pylint: disable=protected-access
2438 def _transform_feature(self, inputs):
2439 source_tensor = inputs.get(self.source_column)
2440 return math_ops._bucketize( # pylint: disable=protected-access
2441 source_tensor,
2442 boundaries=self.boundaries)
2444 @property
2445 def _variable_shape(self):
2446 return tensor_shape.TensorShape(
2447 tuple(self.source_column.shape) + (len(self.boundaries) + 1,))
2449 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
2450 del weight_collections
2451 del trainable
2452 input_tensor = inputs.get(self)
2453 return array_ops.one_hot(
2454 indices=math_ops.cast(input_tensor, dtypes.int64),
2455 depth=len(self.boundaries) + 1,
2456 on_value=1.,
2457 off_value=0.)
2459 @property
2460 def _num_buckets(self):
2461 # By construction, source_column is always one-dimensional.
2462 return (len(self.boundaries) + 1) * self.source_column.shape[0]
2464 def _get_sparse_tensors(self,
2465 inputs,
2466 weight_collections=None,
2467 trainable=None):
2468 """Converts dense inputs to SparseTensor so downstream code can use it."""
2469 input_tensor = inputs.get(self)
2470 batch_size = array_ops.shape(input_tensor)[0]
2471 # By construction, source_column is always one-dimensional.
2472 source_dimension = self.source_column.shape[0]
2474 i1 = array_ops.reshape(
2475 array_ops.tile(
2476 array_ops.expand_dims(math_ops.range(0, batch_size), 1),
2477 [1, source_dimension]), (-1,))
2478 i2 = array_ops.tile(math_ops.range(0, source_dimension), [batch_size])
2479 # Flatten the bucket indices and unique them across dimensions
2480 # E.g. 2nd dimension indices will range from k to 2*k-1 with k buckets
2481 bucket_indices = (
2482 array_ops.reshape(input_tensor,
2483 (-1,)) + (len(self.boundaries) + 1) * i2)
2485 indices = math_ops.cast(
2486 array_ops.transpose(array_ops_stack.stack((i1, i2))), dtypes.int64)
2487 dense_shape = math_ops.cast(
2488 array_ops_stack.stack([batch_size, source_dimension]), dtypes.int64)
2489 sparse_tensor = sparse_tensor_lib.SparseTensor(
2490 indices=indices, values=bucket_indices, dense_shape=dense_shape)
2491 return _CategoricalColumn.IdWeightPair(sparse_tensor, None)
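# Editorial sketch (not part of the original file): the index arithmetic used
# above, with concrete numbers. With two boundaries (k = 3 buckets) and a
# two-dimensional source column, per-dimension bucket indices [1, 2] are
# encoded as [1 + 3*0, 2 + 3*1] == [1, 5].
def _example_bucket_value_encoding(bucket_indices_per_dim, num_buckets):
  return [
      b + num_buckets * j for j, b in enumerate(bucket_indices_per_dim)
  ]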
2494class _EmbeddingColumn(
2495 _DenseColumn, _SequenceDenseColumn,
2496 collections.namedtuple(
2497 '_EmbeddingColumn',
2498 ('categorical_column', 'dimension', 'combiner', 'layer_creator',
2499 'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable',
2500 'use_safe_embedding_lookup'))):
2501 """See `embedding_column`."""
2503 def __new__(cls,
2504 categorical_column,
2505 dimension,
2506 combiner,
2507 layer_creator,
2508 ckpt_to_load_from,
2509 tensor_name_in_ckpt,
2510 max_norm,
2511 trainable,
2512 use_safe_embedding_lookup=True):
2513 return super(_EmbeddingColumn, cls).__new__(
2514 cls,
2515 categorical_column=categorical_column,
2516 dimension=dimension,
2517 combiner=combiner,
2518 layer_creator=layer_creator,
2519 ckpt_to_load_from=ckpt_to_load_from,
2520 tensor_name_in_ckpt=tensor_name_in_ckpt,
2521 max_norm=max_norm,
2522 trainable=trainable,
2523 use_safe_embedding_lookup=use_safe_embedding_lookup)
2525 @property
2526 def name(self):
2527 if not hasattr(self, '_name'):
2528 self._name = '{}_embedding'.format(self.categorical_column.name)
2529 return self._name
2531 @property
2532 def _parse_example_spec(self):
2533 return self.categorical_column._parse_example_spec # pylint: disable=protected-access
2535 def _transform_feature(self, inputs):
2536 return inputs.get(self.categorical_column)
2538 @property
2539 def _variable_shape(self):
2540 if not hasattr(self, '_shape'):
2541 self._shape = tensor_shape.TensorShape([self.dimension])
2542 return self._shape
2544 def _get_dense_tensor_internal(self,
2545 inputs,
2546 weight_collections=None,
2547 trainable=None):
2548 """Private method that follows the signature of _get_dense_tensor."""
2549 # Get sparse IDs and weights.
2550 sparse_tensors = self.categorical_column._get_sparse_tensors( # pylint: disable=protected-access
2551 inputs,
2552 weight_collections=weight_collections,
2553 trainable=trainable)
2554 sparse_ids = sparse_tensors.id_tensor
2555 sparse_weights = sparse_tensors.weight_tensor
2557 embedding_weights = self.layer_creator(
2558 weight_collections=weight_collections,
2559 scope=variable_scope.get_variable_scope())
2561 if self.ckpt_to_load_from is not None:
2562 to_restore = embedding_weights
2563 if isinstance(to_restore, variables.PartitionedVariable):
2564 to_restore = to_restore._get_variable_list() # pylint: disable=protected-access
2565 checkpoint_utils.init_from_checkpoint(
2566 self.ckpt_to_load_from, {self.tensor_name_in_ckpt: to_restore})
2568 sparse_id_rank = tensor_shape.dimension_value(
2569 sparse_ids.dense_shape.get_shape()[0])
2570 embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse
2571 if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and
2572 sparse_id_rank <= 2):
2573 embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse_v2
2574 # Return embedding lookup result.
2575 return embedding_lookup_sparse(
2576 embedding_weights,
2577 sparse_ids,
2578 sparse_weights,
2579 combiner=self.combiner,
2580 name='%s_weights' % self.name,
2581 max_norm=self.max_norm)
2583 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
2584 if isinstance(self.categorical_column, _SequenceCategoricalColumn):
2585 raise ValueError(
2586 'In embedding_column: {}. '
2587 'categorical_column must not be of type _SequenceCategoricalColumn. '
2588 'Suggested fix A: If you wish to use input_layer, use a '
2589 'non-sequence categorical_column_with_*. '
2590 'Suggested fix B: If you wish to create sequence input, use '
2591 'sequence_input_layer instead of input_layer. '
2592 'Given (type {}): {}'.format(self.name, type(self.categorical_column),
2593 self.categorical_column))
2594 return self._get_dense_tensor_internal(
2595 inputs=inputs,
2596 weight_collections=weight_collections,
2597 trainable=trainable)
2599 def _get_sequence_dense_tensor(self,
2600 inputs,
2601 weight_collections=None,
2602 trainable=None):
2603 if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
2604 raise ValueError(
2605 'In embedding_column: {}. '
2606 'categorical_column must be of type _SequenceCategoricalColumn '
2607 'to use sequence_input_layer. '
2608 'Suggested fix: Use one of sequence_categorical_column_with_*. '
2609 'Given (type {}): {}'.format(self.name, type(self.categorical_column),
2610 self.categorical_column))
2611 dense_tensor = self._get_dense_tensor_internal( # pylint: disable=protected-access
2612 inputs=inputs,
2613 weight_collections=weight_collections,
2614 trainable=trainable)
2616 sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access
2617 sequence_length = fc_utils.sequence_length_from_sparse_tensor(
2618 sparse_tensors.id_tensor)
2619 return _SequenceDenseColumn.TensorSequenceLengthPair(
2620 dense_tensor=dense_tensor, sequence_length=sequence_length)
2623def _get_graph_for_variable(var):
2624 if isinstance(var, variables.PartitionedVariable):
2625 return list(var)[0].graph
2626 else:
2627 return var.graph
2630class _SharedEmbeddingColumn(
2631 _DenseColumn, _SequenceDenseColumn,
2632 collections.namedtuple(
2633 '_SharedEmbeddingColumn',
2634 ('categorical_column', 'dimension', 'combiner', 'initializer',
2635 'shared_embedding_collection_name', 'ckpt_to_load_from',
2636 'tensor_name_in_ckpt', 'max_norm', 'trainable',
2637 'use_safe_embedding_lookup'))):
2638 """See `embedding_column`."""
2640 @property
2641 def name(self):
2642 if not hasattr(self, '_name'):
2643 self._name = '{}_shared_embedding'.format(self.categorical_column.name)
2644 return self._name
2646 @property
2647 def _var_scope_name(self):
2648 return self.shared_embedding_collection_name
2650 @property
2651 def _parse_example_spec(self):
2652 return self.categorical_column._parse_example_spec # pylint: disable=protected-access
2654 def _transform_feature(self, inputs):
2655 return inputs.get(self.categorical_column)
2657 @property
2658 def _variable_shape(self):
2659 if not hasattr(self, '_shape'):
2660 self._shape = tensor_shape.TensorShape([self.dimension])
2661 return self._shape
2663 def _get_dense_tensor_internal(self,
2664 inputs,
2665 weight_collections=None,
2666 trainable=None):
2667 """Private method that follows the signature of _get_dense_tensor."""
2668 # This method is called from a variable_scope with name _var_scope_name,
2669 # which is shared among all shared embeddings. Open a name_scope here, so
2670 # that the ops for different columns have distinct names.
2671 with ops.name_scope(None, default_name=self.name):
2672 # Get sparse IDs and weights.
2673 sparse_tensors = self.categorical_column._get_sparse_tensors( # pylint: disable=protected-access
2674 inputs,
2675 weight_collections=weight_collections,
2676 trainable=trainable)
2677 sparse_ids = sparse_tensors.id_tensor
2678 sparse_weights = sparse_tensors.weight_tensor
2680 embedding_shape = (self.categorical_column._num_buckets, self.dimension) # pylint: disable=protected-access
2681 shared_embedding_collection = ops.get_collection(
2682 self.shared_embedding_collection_name)
2683 if shared_embedding_collection:
2684 if len(shared_embedding_collection) > 1:
2685 raise ValueError(
2686 'Collection {} can only contain one variable. '
2687 'Suggested fix A: Choose a unique name for this collection. '
2688 'Suggested fix B: Do not add any variables to this collection. '
2689 'The feature_column library already adds a variable under the '
2690 'hood.'.format(shared_embedding_collection))
2691 embedding_weights = shared_embedding_collection[0]
2692 if embedding_weights.get_shape() != embedding_shape:
2693 raise ValueError(
2694 'Shared embedding collection {} contains variable {} of '
2695 'unexpected shape {}. Expected shape is {}. '
2696 'Suggested fix A: Choose a unique name for this collection. '
2697 'Suggested fix B: Do not add any variables to this collection. '
2698 'The feature_column library already adds a variable under the '
2699 'hood.'.format(self.shared_embedding_collection_name,
2700 embedding_weights.name,
2701 embedding_weights.get_shape(), embedding_shape))
2702 else:
2703 embedding_weights = variable_scope.get_variable(
2704 name='embedding_weights',
2705 shape=embedding_shape,
2706 dtype=dtypes.float32,
2707 initializer=self.initializer,
2708 trainable=self.trainable and trainable,
2709 collections=weight_collections)
2710 ops.add_to_collection(self.shared_embedding_collection_name,
2711 embedding_weights)
2712 if self.ckpt_to_load_from is not None:
2713 to_restore = embedding_weights
2714 if isinstance(to_restore, variables.PartitionedVariable):
2715 to_restore = to_restore._get_variable_list() # pylint: disable=protected-access
2716 checkpoint_utils.init_from_checkpoint(
2717 self.ckpt_to_load_from, {self.tensor_name_in_ckpt: to_restore})
2719 sparse_id_rank = tensor_shape.dimension_value(
2720 sparse_ids.dense_shape.get_shape()[0])
2721 embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse
2722 if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and
2723 sparse_id_rank <= 2):
2724 embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse_v2
2725 # Return embedding lookup result.
2726 return embedding_lookup_sparse(
2727 embedding_weights,
2728 sparse_ids,
2729 sparse_weights,
2730 combiner=self.combiner,
2731 name='%s_weights' % self.name,
2732 max_norm=self.max_norm)
2734 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
2735 if isinstance(self.categorical_column, _SequenceCategoricalColumn):
2736 raise ValueError(
2737 'In embedding_column: {}. '
2738 'categorical_column must not be of type _SequenceCategoricalColumn. '
2739 'Suggested fix A: If you wish to use input_layer, use a '
2740 'non-sequence categorical_column_with_*. '
2741 'Suggested fix B: If you wish to create sequence input, use '
2742 'sequence_input_layer instead of input_layer. '
2743 'Given (type {}): {}'.format(self.name, type(self.categorical_column),
2744 self.categorical_column))
2745 return self._get_dense_tensor_internal(
2746 inputs=inputs,
2747 weight_collections=weight_collections,
2748 trainable=trainable)
2750 def _get_sequence_dense_tensor(self,
2751 inputs,
2752 weight_collections=None,
2753 trainable=None):
2754 if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
2755 raise ValueError(
2756 'In embedding_column: {}. '
2757 'categorical_column must be of type _SequenceCategoricalColumn '
2758 'to use sequence_input_layer. '
2759 'Suggested fix: Use one of sequence_categorical_column_with_*. '
2760 'Given (type {}): {}'.format(self.name, type(self.categorical_column),
2761 self.categorical_column))
2762 dense_tensor = self._get_dense_tensor_internal( # pylint: disable=protected-access
2763 inputs=inputs,
2764 weight_collections=weight_collections,
2765 trainable=trainable)
2766 sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access
2767 sequence_length = fc_utils.sequence_length_from_sparse_tensor(
2768 sparse_tensors.id_tensor)
2769 return _SequenceDenseColumn.TensorSequenceLengthPair(
2770 dense_tensor=dense_tensor, sequence_length=sequence_length)
2773def _check_shape(shape, key):
2774 """Returns shape if it's valid, raises error otherwise."""
2775 assert shape is not None
2776 if not nest.is_nested(shape):
2777 shape = [shape]
2778 shape = tuple(shape)
2779 for dimension in shape:
2780 if not isinstance(dimension, six.integer_types):
2781 raise TypeError('shape dimensions must be integers. '
2782 'shape: {}, key: {}'.format(shape, key))
2783 if dimension < 1:
2784 raise ValueError('shape dimensions must be greater than 0. '
2785 'shape: {}, key: {}'.format(shape, key))
2786 return shape
2789class _HashedCategoricalColumn(_CategoricalColumn,
2790 collections.namedtuple(
2791 '_HashedCategoricalColumn',
2792 ['key', 'hash_bucket_size', 'dtype'])):
2793 """see `categorical_column_with_hash_bucket`."""
2795 @property
2796 def name(self):
2797 return self.key
2799 @property
2800 def _parse_example_spec(self):
2801 return {self.key: parsing_ops.VarLenFeature(self.dtype)}
2803 def _transform_feature(self, inputs):
2804 input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
2805 if not isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
2806 raise ValueError('SparseColumn input must be a SparseTensor.')
2808 fc_utils.assert_string_or_int(
2809 input_tensor.dtype,
2810 prefix='column_name: {} input_tensor'.format(self.key))
2812 if self.dtype.is_integer != input_tensor.dtype.is_integer:
2813 raise ValueError(
2814 'Column dtype and SparseTensors dtype must be compatible. '
2815 'key: {}, column dtype: {}, tensor dtype: {}'.format(
2816 self.key, self.dtype, input_tensor.dtype))
2818 if self.dtype == dtypes.string:
2819 sparse_values = input_tensor.values
2820 else:
2821 sparse_values = string_ops.as_string(input_tensor.values)
2823 sparse_id_values = string_ops.string_to_hash_bucket_fast(
2824 sparse_values, self.hash_bucket_size, name='lookup')
2825 return sparse_tensor_lib.SparseTensor(input_tensor.indices,
2826 sparse_id_values,
2827 input_tensor.dense_shape)
2829 @property
2830 def _num_buckets(self):
2831 """Returns number of buckets in this sparse feature."""
2832 return self.hash_bucket_size
2834 def _get_sparse_tensors(self,
2835 inputs,
2836 weight_collections=None,
2837 trainable=None):
2838 return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
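# Editorial sketch (not part of the original file): the hashing transform
# above applied to an integer tensor in isolation. Non-string values are
# stringified first, so the integer 75 and the string '75' land in the same
# bucket.
def _example_hash_bucket(int_values, hash_bucket_size):
  return string_ops.string_to_hash_bucket_fast(
      string_ops.as_string(int_values), hash_bucket_size)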
2841class _VocabularyFileCategoricalColumn(
2842 _CategoricalColumn,
2843 collections.namedtuple('_VocabularyFileCategoricalColumn',
2844 ('key', 'vocabulary_file', 'vocabulary_size',
2845 'num_oov_buckets', 'dtype', 'default_value'))):
2846 """See `categorical_column_with_vocabulary_file`."""
2848 @property
2849 def name(self):
2850 return self.key
2852 @property
2853 def _parse_example_spec(self):
2854 return {self.key: parsing_ops.VarLenFeature(self.dtype)}
2856 def _transform_feature(self, inputs):
2857 input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
2859 if self.dtype.is_integer != input_tensor.dtype.is_integer:
2860 raise ValueError(
2861 'Column dtype and SparseTensors dtype must be compatible. '
2862 'key: {}, column dtype: {}, tensor dtype: {}'.format(
2863 self.key, self.dtype, input_tensor.dtype))
2865 fc_utils.assert_string_or_int(
2866 input_tensor.dtype,
2867 prefix='column_name: {} input_tensor'.format(self.key))
2869 key_dtype = self.dtype
2870 if input_tensor.dtype.is_integer:
2871 # `index_table_from_file` requires 64-bit integer keys.
2872 key_dtype = dtypes.int64
2873 input_tensor = math_ops.cast(input_tensor, dtypes.int64)
2875 return lookup_ops.index_table_from_file(
2876 vocabulary_file=self.vocabulary_file,
2877 num_oov_buckets=self.num_oov_buckets,
2878 vocab_size=self.vocabulary_size,
2879 default_value=self.default_value,
2880 key_dtype=key_dtype,
2881 name='{}_lookup'.format(self.key)).lookup(input_tensor)
2883 @property
2884 def _num_buckets(self):
2885 """Returns number of buckets in this sparse feature."""
2886 return self.vocabulary_size + self.num_oov_buckets
2888 def _get_sparse_tensors(self,
2889 inputs,
2890 weight_collections=None,
2891 trainable=None):
2892 return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
2895class _VocabularyListCategoricalColumn(
2896 _CategoricalColumn,
2897 collections.namedtuple(
2898 '_VocabularyListCategoricalColumn',
2899 ('key', 'vocabulary_list', 'dtype', 'default_value', 'num_oov_buckets'))
2900):
2901 """See `categorical_column_with_vocabulary_list`."""
2903 @property
2904 def name(self):
2905 return self.key
2907 @property
2908 def _parse_example_spec(self):
2909 return {self.key: parsing_ops.VarLenFeature(self.dtype)}
2911 def _transform_feature(self, inputs):
2912 input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
2914 if self.dtype.is_integer != input_tensor.dtype.is_integer:
2915 raise ValueError(
2916 'Column dtype and SparseTensors dtype must be compatible. '
2917 'key: {}, column dtype: {}, tensor dtype: {}'.format(
2918 self.key, self.dtype, input_tensor.dtype))
2920 fc_utils.assert_string_or_int(
2921 input_tensor.dtype,
2922 prefix='column_name: {} input_tensor'.format(self.key))
2924 key_dtype = self.dtype
2925 if input_tensor.dtype.is_integer:
2926 # `index_table_from_tensor` requires 64-bit integer keys.
2927 key_dtype = dtypes.int64
2928 input_tensor = math_ops.cast(input_tensor, dtypes.int64)
2930 return lookup_ops.index_table_from_tensor(
2931 vocabulary_list=tuple(self.vocabulary_list),
2932 default_value=self.default_value,
2933 num_oov_buckets=self.num_oov_buckets,
2934 dtype=key_dtype,
2935 name='{}_lookup'.format(self.key)).lookup(input_tensor)
2937 @property
2938 def _num_buckets(self):
2939 """Returns number of buckets in this sparse feature."""
2940 return len(self.vocabulary_list) + self.num_oov_buckets
2942 def _get_sparse_tensors(self,
2943 inputs,
2944 weight_collections=None,
2945 trainable=None):
2946 return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
2949class _IdentityCategoricalColumn(_CategoricalColumn,
2950 collections.namedtuple(
2951 '_IdentityCategoricalColumn',
2952 ('key', 'num_buckets', 'default_value'))):
2953 """See `categorical_column_with_identity`."""
2955 @property
2956 def name(self):
2957 return self.key
2959 @property
2960 def _parse_example_spec(self):
2961 return {self.key: parsing_ops.VarLenFeature(dtypes.int64)}
2963 def _transform_feature(self, inputs):
2964 input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
2966 if not input_tensor.dtype.is_integer:
2967 raise ValueError('Invalid input, not integer. key: {} dtype: {}'.format(
2968 self.key, input_tensor.dtype))
2969 values = input_tensor.values
2970 if input_tensor.values.dtype != dtypes.int64:
2971 values = math_ops.cast(values, dtypes.int64, name='values')
2972 if self.default_value is not None:
2973 num_buckets = math_ops.cast(
2974 self.num_buckets, dtypes.int64, name='num_buckets')
2975 zero = math_ops.cast(0, dtypes.int64, name='zero')
2976 # Assign default for out-of-range values.
2977 values = array_ops.where(
2978 math_ops.logical_or(
2979 values < zero, values >= num_buckets, name='out_of_range'),
2980 array_ops.fill(
2981 dims=array_ops.shape(values),
2982 value=math_ops.cast(self.default_value, dtypes.int64),
2983 name='default_values'), values)
2984 return sparse_tensor_lib.SparseTensor(
2985 indices=input_tensor.indices,
2986 values=values,
2987 dense_shape=input_tensor.dense_shape)
2989 @property
2990 def _num_buckets(self):
2991 """Returns number of buckets in this sparse feature."""
2992 return self.num_buckets
2994 def _get_sparse_tensors(self,
2995 inputs,
2996 weight_collections=None,
2997 trainable=None):
2998 return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
3001class _WeightedCategoricalColumn(
3002 _CategoricalColumn,
3003 collections.namedtuple(
3004 '_WeightedCategoricalColumn',
3005 ('categorical_column', 'weight_feature_key', 'dtype'))):
3006 """See `weighted_categorical_column`."""
3008 @property
3009 def name(self):
3010 return '{}_weighted_by_{}'.format(self.categorical_column.name,
3011 self.weight_feature_key)
3013 @property
3014 def _parse_example_spec(self):
3015 config = self.categorical_column._parse_example_spec # pylint: disable=protected-access
3016 if self.weight_feature_key in config:
3017 raise ValueError('Parse config {} already exists for {}.'.format(
3018 config[self.weight_feature_key], self.weight_feature_key))
3019 config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype)
3020 return config
3022 @property
3023 def _num_buckets(self):
3024 return self.categorical_column._num_buckets # pylint: disable=protected-access
3026 def _transform_feature(self, inputs):
3027 weight_tensor = inputs.get(self.weight_feature_key)
3028 if weight_tensor is None:
3029 raise ValueError('Missing weights {}.'.format(self.weight_feature_key))
3030 weight_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
3031 weight_tensor)
3032 if self.dtype != weight_tensor.dtype.base_dtype:
3033 raise ValueError('Bad dtype, expected {}, but got {}.'.format(
3034 self.dtype, weight_tensor.dtype))
3035 if not isinstance(weight_tensor, sparse_tensor_lib.SparseTensor):
3036 # The weight tensor can be a regular Tensor. In this case, sparsify it.
3037 weight_tensor = _to_sparse_input_and_drop_ignore_values(
3038 weight_tensor, ignore_value=0.0)
3039 if not weight_tensor.dtype.is_floating:
3040 weight_tensor = math_ops.cast(weight_tensor, dtypes.float32)
3041 return (inputs.get(self.categorical_column), weight_tensor)
3043 def _get_sparse_tensors(self,
3044 inputs,
3045 weight_collections=None,
3046 trainable=None):
3047 del weight_collections
3048 del trainable
3049 tensors = inputs.get(self)
3050 return _CategoricalColumn.IdWeightPair(tensors[0], tensors[1])
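# Editor's note: an illustrative pairing of a categorical column with a
# per-id weight feature, mirroring the (ids, weights) tuple produced by
# _transform_feature above. The keys 'terms' and 'term_frequencies' are
# invented for the example.

import tensorflow as tf

terms = tf.feature_column.categorical_column_with_hash_bucket(
    key='terms', hash_bucket_size=10000)
weighted_terms = tf.feature_column.weighted_categorical_column(
    categorical_column=terms, weight_feature_key='term_frequencies')
# Parsing gains a VarLenFeature for 'term_frequencies', and
# _get_sparse_tensors returns IdWeightPair(ids, weights) rather than
# IdWeightPair(ids, None).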
3053class _CrossedColumn(
3054 _CategoricalColumn,
3055 collections.namedtuple('_CrossedColumn',
3056 ['keys', 'hash_bucket_size', 'hash_key'])):
3057 """See `crossed_column`."""
3059 @property
3060 def name(self):
3061 feature_names = []
3062 for key in _collect_leaf_level_keys(self):
3063 if isinstance(key, _FeatureColumn):
3064 feature_names.append(key.name)
3065 else: # key must be a string
3066 feature_names.append(key)
3067 return '_X_'.join(sorted(feature_names))
3069 @property
3070 def _parse_example_spec(self):
3071 config = {}
3072 for key in self.keys:
3073 if isinstance(key, _FeatureColumn):
3074 config.update(key._parse_example_spec) # pylint: disable=protected-access
3075 else: # key must be a string
3076 config.update({key: parsing_ops.VarLenFeature(dtypes.string)})
3077 return config
3079 def _transform_feature(self, inputs):
3080 feature_tensors = []
3081 for key in _collect_leaf_level_keys(self):
3082 if isinstance(key, six.string_types):
3083 feature_tensors.append(inputs.get(key))
3084 elif isinstance(key, _CategoricalColumn):
3085 ids_and_weights = key._get_sparse_tensors(inputs) # pylint: disable=protected-access
3086 if ids_and_weights.weight_tensor is not None:
3087 raise ValueError(
3088 'crossed_column does not support weight_tensor, but the given '
3089 'column populates weight_tensor. '
3090 'Given column: {}'.format(key.name))
3091 feature_tensors.append(ids_and_weights.id_tensor)
3092 else:
3093 raise ValueError('Unsupported column type. Given: {}'.format(key))
3094 return sparse_ops.sparse_cross_hashed(
3095 inputs=feature_tensors,
3096 num_buckets=self.hash_bucket_size,
3097 hash_key=self.hash_key)
3099 @property
3100 def _num_buckets(self):
3101 """Returns number of buckets in this sparse feature."""
3102 return self.hash_bucket_size
3104 def _get_sparse_tensors(self,
3105 inputs,
3106 weight_collections=None,
3107 trainable=None):
3108 return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
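# Editor's note: a minimal crossed_column sketch using the public API; the
# feature names are illustrative. hash_bucket_size bounds _num_buckets, as
# implemented above.

import tensorflow as tf

age_buckets = tf.feature_column.bucketized_column(
    tf.feature_column.numeric_column('age'), boundaries=[18, 25, 40, 65])
dept_x_age = tf.feature_column.crossed_column(
    keys=['department', age_buckets], hash_bucket_size=1000)
# Each (department, age_bucket) pair is hashed into one of 1000 buckets via
# sparse_cross_hashed; columns that populate weight_tensor are rejected, per
# the check in _transform_feature.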
3111def _collect_leaf_level_keys(cross):
3112 """Collects base keys by expanding all nested crosses.
3114 Args:
3115 cross: A `_CrossedColumn`.
3117 Returns:
3118 A list of strings or `_CategoricalColumn` instances.
3119 """
3120 leaf_level_keys = []
3121 for k in cross.keys:
3122 if isinstance(k, _CrossedColumn):
3123 leaf_level_keys.extend(_collect_leaf_level_keys(k))
3124 else:
3125 leaf_level_keys.append(k)
3126 return leaf_level_keys
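# Editor's note: a standalone sketch of the recursion above, using plain
# tuples in place of _CrossedColumn instances so it runs without TensorFlow;
# the helper name is hypothetical.

def _flatten_cross_sketch(keys):
  """Expands nested (tuple) crosses into a flat list of leaf keys."""
  leaves = []
  for k in keys:
    if isinstance(k, tuple):
      leaves.extend(_flatten_cross_sketch(k))
    else:
      leaves.append(k)
  return leaves

assert _flatten_cross_sketch(('a', ('b', ('c', 'd')))) == ['a', 'b', 'c', 'd']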
3129class _IndicatorColumn(_DenseColumn, _SequenceDenseColumn,
3130 collections.namedtuple('_IndicatorColumn',
3131 ['categorical_column'])):
3132 """Represents a one-hot column for use in deep networks.
3134 Args:
3135 categorical_column: A `_CategoricalColumn` which is created by a
3136 `categorical_column_with_*` function.
3137 """
3139 @property
3140 def name(self):
3141 return '{}_indicator'.format(self.categorical_column.name)
3143 def _transform_feature(self, inputs):
3144 """Returns dense `Tensor` representing feature.
3146 Args:
3147 inputs: A `_LazyBuilder` object to access inputs.
3149 Returns:
3150 Transformed feature `Tensor`.
3152 Raises:
3153 ValueError: if input rank is not known at graph building time.
3154 """
3155 id_weight_pair = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access
3156 id_tensor = id_weight_pair.id_tensor
3157 weight_tensor = id_weight_pair.weight_tensor
3159 # If the underlying column is weighted, return the input as a dense tensor.
3160 if weight_tensor is not None:
3161 weighted_column = sparse_ops.sparse_merge(
3162 sp_ids=id_tensor,
3163 sp_values=weight_tensor,
3164 vocab_size=int(self._variable_shape[-1]))
3165 # Remove (?, -1) index entries; ids of -1 fall outside the slice below.
3166 weighted_column = sparse_ops.sparse_slice(weighted_column, [0, 0],
3167 weighted_column.dense_shape)
3168 # Use scatter_nd instead of sparse_tensor_to_dense so that any
3169 # duplicated indices are merged (their values summed).
3170 return array_ops.scatter_nd(weighted_column.indices,
3171 weighted_column.values,
3172 weighted_column.dense_shape)
3174 dense_id_tensor = sparse_ops.sparse_tensor_to_dense(
3175 id_tensor, default_value=-1)
3177 # The one-hot tensor must be float so that tf.concat can combine it with
3178 # the other inputs to input_layer, which are all float32.
3179 one_hot_id_tensor = array_ops.one_hot(
3180 dense_id_tensor,
3181 depth=self._variable_shape[-1],
3182 on_value=1.0,
3183 off_value=0.0)
3185 # Reduce to get a multi-hot per example.
3186 return math_ops.reduce_sum(one_hot_id_tensor, axis=[-2])
3188 @property
3189 def _parse_example_spec(self):
3190 return self.categorical_column._parse_example_spec # pylint: disable=protected-access
3192 @property
3193 def _variable_shape(self):
3194 """Returns a `TensorShape` representing the shape of the dense `Tensor`."""
3195 return tensor_shape.TensorShape([1, self.categorical_column._num_buckets]) # pylint: disable=protected-access
3197 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
3198 """Returns dense `Tensor` representing feature.
3200 Args:
3201 inputs: A `_LazyBuilder` object to access inputs.
3202 weight_collections: Unused `weight_collections` since no variables are
3203 created in this function.
3204 trainable: Unused `trainable` bool since no variables are created in this
3205 function.
3207 Returns:
3208 Dense `Tensor` created within `_transform_feature`.
3210 Raises:
3211 ValueError: If `categorical_column` is a `_SequenceCategoricalColumn`.
3212 """
3213 # Do nothing with weight_collections and trainable since no variables are
3214 # created in this function.
3215 del weight_collections
3216 del trainable
3217 if isinstance(self.categorical_column, _SequenceCategoricalColumn):
3218 raise ValueError(
3219 'In indicator_column: {}. '
3220 'categorical_column must not be of type _SequenceCategoricalColumn. '
3221 'Suggested fix A: If you wish to use input_layer, use a '
3222 'non-sequence categorical_column_with_*. '
3223 'Suggested fix B: If you wish to create sequence input, use '
3224 'sequence_input_layer instead of input_layer. '
3225 'Given (type {}): {}'.format(self.name, type(self.categorical_column),
3226 self.categorical_column))
3227 # Feature has been already transformed. Return the intermediate
3228 # representation created by _transform_feature.
3229 return inputs.get(self)
3231 def _get_sequence_dense_tensor(self,
3232 inputs,
3233 weight_collections=None,
3234 trainable=None):
3235 # Do nothing with weight_collections and trainable since no variables are
3236 # created in this function.
3237 del weight_collections
3238 del trainable
3239 if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
3240 raise ValueError(
3241 'In indicator_column: {}. '
3242 'categorical_column must be of type _SequenceCategoricalColumn '
3243 'to use sequence_input_layer. '
3244 'Suggested fix: Use one of sequence_categorical_column_with_*. '
3245 'Given (type {}): {}'.format(self.name, type(self.categorical_column),
3246 self.categorical_column))
3247 # Feature has been already transformed. Return the intermediate
3248 # representation created by _transform_feature.
3249 dense_tensor = inputs.get(self)
3250 sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access
3251 sequence_length = fc_utils.sequence_length_from_sparse_tensor(
3252 sparse_tensors.id_tensor)
3253 return _SequenceDenseColumn.TensorSequenceLengthPair(
3254 dense_tensor=dense_tensor, sequence_length=sequence_length)
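# Editor's note: a hedged indicator_column sketch via the public API; the key
# and vocabulary are invented. Because _transform_feature sums one-hot
# vectors over the value dimension, a multivalent feature yields a multi-hot
# count vector per example.

import tensorflow as tf

colors = tf.feature_column.categorical_column_with_vocabulary_list(
    'colors', ('red', 'green', 'blue'))
colors_indicator = tf.feature_column.indicator_column(colors)
# An example with values ['red', 'red', 'blue'] encodes as [2., 0., 1.].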
3257def _verify_static_batch_size_equality(tensors, columns):
3258 """Validates that the first dim (batch size) of all tensors are equal or None.
3260 Args:
3261 tensors: list of tensors to check.
3262 columns: list of feature columns matching tensors. Will be used for error
3263 messaging.
3265 Raises:
3266 ValueError: if the tensors have mismatched static batch sizes.
3267 """
3268 # batch_size is a tf.compat.v1.Dimension object.
3269 expected_batch_size = None
3270 for i in range(len(tensors)):
3271 if tensors[i].shape.dims[0].value is not None:
3272 if expected_batch_size is None:
3273 batch_size_column_index = i
3274 expected_batch_size = tensors[i].shape.dims[0]
3275 elif not expected_batch_size.is_compatible_with(tensors[i].shape.dims[0]):
3276 raise ValueError(
3277 'Batch size (first dimension) of each feature must be the same. '
3278 'Batch size of columns ({}, {}): ({}, {})'.format(
3279 columns[batch_size_column_index].name, columns[i].name,
3280 expected_batch_size, tensors[i].shape.dims[0]))
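# Editor's note: a small illustration of the compatibility rule the function
# above relies on; the shapes are invented. A tensor whose static batch size
# is None is skipped entirely, so only known, differing sizes raise.

import tensorflow as tf

known_32 = tf.TensorShape([32, 10]).dims[0]
known_64 = tf.TensorShape([64, 10]).dims[0]
unknown = tf.TensorShape([None, 10]).dims[0]
assert not known_32.is_compatible_with(known_64)  # would raise ValueError
assert known_32.is_compatible_with(unknown)       # unknown sizes never raise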
3283class _SequenceCategoricalColumn(_CategoricalColumn,
3284 collections.namedtuple(
3285 '_SequenceCategoricalColumn',
3286 ['categorical_column'])):
3287 """Represents sequences of categorical data."""
3289 @property
3290 def name(self):
3291 return self.categorical_column.name
3293 @property
3294 def _parse_example_spec(self):
3295 return self.categorical_column._parse_example_spec # pylint: disable=protected-access
3297 def _transform_feature(self, inputs):
3298 return self.categorical_column._transform_feature(inputs) # pylint: disable=protected-access
3300 @property
3301 def _num_buckets(self):
3302 return self.categorical_column._num_buckets # pylint: disable=protected-access
3304 def _get_sparse_tensors(self,
3305 inputs,
3306 weight_collections=None,
3307 trainable=None):
3308 sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access
3309 id_tensor = sparse_tensors.id_tensor
3310 weight_tensor = sparse_tensors.weight_tensor
3312 # Expand a third dimension, if necessary, so that embeddings are not
3313 # combined during the embedding lookup. If the tensor is already 3-D,
3314 # it is left as-is.
3315 shape = array_ops.shape(id_tensor)
3316 # Compute the third dimension explicitly instead of setting it to -1:
3317 # -1 does not work for dynamically shaped tensors that turn out to have
3318 # length 0 at runtime, as happens for empty sequences.
3319 target_shape = [shape[0], shape[1], math_ops.reduce_prod(shape[2:])]
3320 id_tensor = sparse_ops.sparse_reshape(id_tensor, target_shape)
3321 if weight_tensor is not None:
3322 weight_tensor = sparse_ops.sparse_reshape(weight_tensor, target_shape)
3324 return _CategoricalColumn.IdWeightPair(id_tensor, weight_tensor)
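# Editor's note: a sketch of the reshape performed above, with invented
# values. For a 2-D [batch, max_length] id tensor, shape[2:] is empty and
# reduce_prod over it is 1, so the target shape is [batch, max_length, 1].

import tensorflow as tf

ids = tf.sparse.SparseTensor(
    indices=[[0, 0], [0, 1], [1, 0]],
    values=tf.constant([3, 7, 5], dtype=tf.int64),
    dense_shape=[2, 2])
ids_3d = tf.sparse.reshape(ids, [2, 2, 1])
# Each timestep now carries a trailing dimension of size 1, so a downstream
# embedding lookup returns one embedding per step instead of combining them.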