Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/feature_column/sequence_feature_column.py: 51%
105 statements
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-03 07:57 +0000
« prev ^ index » next coverage.py v7.4.0, created at 2024-01-03 07:57 +0000
1# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""This API defines FeatureColumn for sequential input.
17NOTE: This API is a work in progress and will likely be changing frequently.
18"""
20import collections
22from tensorflow.python.feature_column import feature_column_v2 as fc
23from tensorflow.python.feature_column import utils as fc_utils
24from tensorflow.python.framework import dtypes
25from tensorflow.python.framework import ops
26from tensorflow.python.framework import tensor_shape
27from tensorflow.python.ops import array_ops
28from tensorflow.python.ops import check_ops
29from tensorflow.python.ops import parsing_ops
30from tensorflow.python.ops import sparse_ops
31from tensorflow.python.util import deprecation
32from tensorflow.python.util.tf_export import tf_export
33from tensorflow.tools.docs import doc_controls
35_FEATURE_COLUMN_DEPRECATION_WARNING = """\
36 Warning: tf.feature_column is not recommended for new code. Instead,
37 feature preprocessing can be done directly using either [Keras preprocessing
38 layers](https://www.tensorflow.org/guide/migrate/migrating_feature_columns)
39 or through the one-stop utility [`tf.keras.utils.FeatureSpace`](https://www.tensorflow.org/api_docs/python/tf/keras/utils/FeatureSpace)
40 built on top of them. See the [migration guide](https://tensorflow.org/guide/migrate)
41 for details.
42 """
44_FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING = (
45 'Use Keras preprocessing layers instead, either directly or via the '
46 '`tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has '
47 'a functional equivalent in `tf.keras.layers` for feature preprocessing '
48 'when training a Keras model.')
51# pylint: disable=protected-access
def concatenate_context_input(context_input, sequence_input):
  """Appends `context_input` to every timestep of `sequence_input`.

  A length-1 axis is inserted at dimension 1 of `context_input`, the result is
  tiled along that axis up to the padded length of `sequence_input`, and the
  tiled tensor is concatenated onto `sequence_input` along dimension 2.

  Args:
    context_input: A `Tensor` of dtype `float32` and shape `[batch_size, d1]`.
    sequence_input: A `Tensor` of dtype `float32` and shape `[batch_size,
      padded_length, d0]`.

  Returns:
    A `Tensor` of dtype `float32` and shape `[batch_size, padded_length,
    d0 + d1]`.

  Raises:
    ValueError: If `sequence_input` does not have rank 3 or `context_input` does
      not have rank 2.
  """
  # Rank and dtype checks for both inputs; the tiling below is gated on them.
  input_checks = [
      check_ops.assert_rank(
          sequence_input,
          3,
          message='sequence_input must have rank 3',
          data=[array_ops.shape(sequence_input)]),
      check_ops.assert_type(
          sequence_input,
          dtypes.float32,
          message='sequence_input must have dtype float32; got {}.'.format(
              sequence_input.dtype)),
      check_ops.assert_rank(
          context_input,
          2,
          message='context_input must have rank 2',
          data=[array_ops.shape(context_input)]),
      check_ops.assert_type(
          context_input,
          dtypes.float32,
          message='context_input must have dtype float32; got {}.'.format(
              context_input.dtype)),
  ]
  with ops.control_dependencies(input_checks):
    num_timesteps = array_ops.shape(sequence_input)[1]
    # [batch_size, d1] -> [batch_size, 1, d1]
    expanded_context = array_ops.expand_dims(context_input, 1)
    # Repeat only along the newly inserted time axis.
    tile_multiples = array_ops.concat([[1], [num_timesteps], [1]], 0)
    tiled_context = array_ops.tile(expanded_context, tile_multiples)
  return array_ops.concat([sequence_input, tiled_context], 2)
@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export('feature_column.sequence_categorical_column_with_identity')
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def sequence_categorical_column_with_identity(key,
                                              num_buckets,
                                              default_value=None):
  """Builds a sequence column whose integer inputs are their own category ids.

  The result wraps `fc.categorical_column_with_identity` in a
  `SequenceCategoricalColumn`. Pass it to `embedding_column` or
  `indicator_column` to convert sequence categorical data into a dense
  representation for input to a sequence NN, such as an RNN.

  Example:

  ```python
  watches = sequence_categorical_column_with_identity(
      'watches', num_buckets=1000)
  watches_embedding = embedding_column(watches, dimension=10)
  columns = [watches_embedding]

  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input feature.
    num_buckets: Range of inputs. Namely, inputs are expected to be in the range
      `[0, num_buckets)`.
    default_value: If `None`, this column's graph operations will fail for
      out-of-range inputs. Otherwise, this value must be in the range `[0,
      num_buckets)`, and will replace out-of-range inputs.

  Returns:
    A `SequenceCategoricalColumn`.

  Raises:
    ValueError: if `num_buckets` is less than one.
    ValueError: if `default_value` is not in range `[0, num_buckets)`.
  """
  # All argument validation is delegated to the non-sequence constructor.
  identity_column = fc.categorical_column_with_identity(
      key=key, num_buckets=num_buckets, default_value=default_value)
  return fc.SequenceCategoricalColumn(identity_column)
@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export('feature_column.sequence_categorical_column_with_hash_bucket')
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def sequence_categorical_column_with_hash_bucket(key,
                                                 hash_bucket_size,
                                                 dtype=dtypes.string):
  """Builds a sequence column whose category ids come from hashing the input.

  The result wraps `fc.categorical_column_with_hash_bucket` in a
  `SequenceCategoricalColumn`. Pass it to `embedding_column` or
  `indicator_column` to convert sequence categorical data into a dense
  representation for input to a sequence NN, such as an RNN.

  Example:

  ```python
  tokens = sequence_categorical_column_with_hash_bucket(
      'tokens', hash_bucket_size=1000)
  tokens_embedding = embedding_column(tokens, dimension=10)
  columns = [tokens_embedding]

  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input feature.
    hash_bucket_size: An int > 1. The number of buckets.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `SequenceCategoricalColumn`.

  Raises:
    ValueError: `hash_bucket_size` is not greater than 1.
    ValueError: `dtype` is neither string nor integer.
  """
  # All argument validation is delegated to the non-sequence constructor.
  hashed_column = fc.categorical_column_with_hash_bucket(
      key=key, hash_bucket_size=hash_bucket_size, dtype=dtype)
  return fc.SequenceCategoricalColumn(hashed_column)
@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export('feature_column.sequence_categorical_column_with_vocabulary_file')
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def sequence_categorical_column_with_vocabulary_file(key,
                                                     vocabulary_file,
                                                     vocabulary_size=None,
                                                     num_oov_buckets=0,
                                                     default_value=None,
                                                     dtype=dtypes.string):
  """Builds a sequence column whose category ids come from a vocabulary file.

  The result wraps `fc.categorical_column_with_vocabulary_file` in a
  `SequenceCategoricalColumn`. Pass it to `embedding_column` or
  `indicator_column` to convert sequence categorical data into a dense
  representation for input to a sequence NN, such as an RNN.

  Example:

  ```python
  states = sequence_categorical_column_with_vocabulary_file(
      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
      num_oov_buckets=5)
  states_embedding = embedding_column(states, dimension=10)
  columns = [states_embedding]

  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input feature.
    vocabulary_file: The vocabulary file name.
    vocabulary_size: Number of the elements in the vocabulary. This must be no
      greater than length of `vocabulary_file`, if less than length, later
      values are ignored. If None, it is set to the length of `vocabulary_file`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
      `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
      the input value. A positive `num_oov_buckets` can not be specified with
      `default_value`.
    default_value: The integer ID value to return for out-of-vocabulary feature
      values, defaults to `-1`. This can not be specified with a positive
      `num_oov_buckets`.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `SequenceCategoricalColumn`.

  Raises:
    ValueError: `vocabulary_file` is missing or cannot be opened.
    ValueError: `vocabulary_size` is missing or < 1.
    ValueError: `num_oov_buckets` is a negative integer.
    ValueError: `num_oov_buckets` and `default_value` are both specified.
    ValueError: `dtype` is neither string nor integer.
  """
  # All argument validation is delegated to the non-sequence constructor.
  vocabulary_column = fc.categorical_column_with_vocabulary_file(
      key=key,
      vocabulary_file=vocabulary_file,
      vocabulary_size=vocabulary_size,
      num_oov_buckets=num_oov_buckets,
      default_value=default_value,
      dtype=dtype)
  return fc.SequenceCategoricalColumn(vocabulary_column)
@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export('feature_column.sequence_categorical_column_with_vocabulary_list')
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def sequence_categorical_column_with_vocabulary_list(key,
                                                     vocabulary_list,
                                                     dtype=None,
                                                     default_value=-1,
                                                     num_oov_buckets=0):
  """Builds a sequence column whose category ids come from an in-memory list.

  The result wraps `fc.categorical_column_with_vocabulary_list` in a
  `SequenceCategoricalColumn`. Pass it to `embedding_column` or
  `indicator_column` to convert sequence categorical data into a dense
  representation for input to a sequence NN, such as an RNN.

  Example:

  ```python
  colors = sequence_categorical_column_with_vocabulary_list(
      key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
      num_oov_buckets=2)
  colors_embedding = embedding_column(colors, dimension=3)
  columns = [colors_embedding]

  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input feature.
    vocabulary_list: An ordered iterable defining the vocabulary. Each feature
      is mapped to the index of its value (if present) in `vocabulary_list`.
      Must be castable to `dtype`.
    dtype: The type of features. Only string and integer types are supported. If
      `None`, it will be inferred from `vocabulary_list`.
    default_value: The integer ID value to return for out-of-vocabulary feature
      values, defaults to `-1`. This can not be specified with a positive
      `num_oov_buckets`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
      `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on a
      hash of the input value. A positive `num_oov_buckets` can not be specified
      with `default_value`.

  Returns:
    A `SequenceCategoricalColumn`.

  Raises:
    ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
    ValueError: `num_oov_buckets` is a negative integer.
    ValueError: `num_oov_buckets` and `default_value` are both specified.
    ValueError: if `dtype` is not integer or string.
  """
  # All argument validation is delegated to the non-sequence constructor.
  vocabulary_column = fc.categorical_column_with_vocabulary_list(
      key=key,
      vocabulary_list=vocabulary_list,
      dtype=dtype,
      default_value=default_value,
      num_oov_buckets=num_oov_buckets)
  return fc.SequenceCategoricalColumn(vocabulary_column)
@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export('feature_column.sequence_numeric_column')
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def sequence_numeric_column(key,
                            shape=(1,),
                            default_value=0.,
                            dtype=dtypes.float32,
                            normalizer_fn=None):
  """Builds a feature column for sequences of numeric values.

  Example:

  ```python
  temperature = sequence_numeric_column('temperature')
  columns = [temperature]

  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input features.
    shape: The shape of the input data per sequence id. E.g. if `shape=(2,)`,
      each example must contain `2 * sequence_length` values.
    default_value: A single value compatible with `dtype` that is used for
      padding the sparse data into a dense `Tensor`.
    dtype: The type of values. Must expose `is_integer` and `is_floating`, at
      least one of which must be true.
    normalizer_fn: If not `None`, a function that can be used to normalize the
      value of the tensor after `default_value` is applied for parsing.
      Normalizer function takes the input `Tensor` as its argument, and returns
      the output `Tensor`. (e.g. lambda x: (x - 3.0) / 4.2). Please note that
      even though the most common use case of this function is normalization, it
      can be used for any kind of Tensorflow transformations.

  Returns:
    A `SequenceNumericColumn`.

  Raises:
    TypeError: if any dimension in shape is not an int.
    TypeError: if `normalizer_fn` is given but is not callable.
    ValueError: if any dimension in shape is not a positive integer.
    ValueError: if `dtype` is not convertible to `tf.float32`.
  """
  # Shape validation (and normalisation) lives in the shared helper.
  checked_shape = fc._check_shape(shape=shape, key=key)

  # Only integer and floating dtypes are accepted.
  if not dtype.is_integer and not dtype.is_floating:
    raise ValueError('dtype must be convertible to float. '
                     'dtype: {}, key: {}'.format(dtype, key))

  normalizer_missing = normalizer_fn is None
  if not normalizer_missing and not callable(normalizer_fn):
    raise TypeError(
        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))

  return SequenceNumericColumn(
      key,
      shape=checked_shape,
      default_value=default_value,
      dtype=dtype,
      normalizer_fn=normalizer_fn)
def _assert_all_equal_and_return(tensors, name=None):
  """Returns the first of `tensors`, gated on all of them being equal.

  Args:
    tensors: A non-empty sequence of tensors to compare.
    name: Optional name for the enclosing name scope; `'assert_all_equal'` is
      used as the default scope name.

  Returns:
    `tensors[0]` itself when it is the only element. Otherwise an identity of
    `tensors[0]` that carries a control dependency on an equality assertion
    against each of the remaining tensors.
  """
  with ops.name_scope(name, 'assert_all_equal', values=tensors):
    # A single tensor has nothing to be compared against.
    if len(tensors) == 1:
      return tensors[0]
    reference = tensors[0]
    equality_checks = [
        check_ops.assert_equal(reference, other) for other in tensors[1:]
    ]
    with ops.control_dependencies(equality_checks):
      return array_ops.identity(reference)
class SequenceNumericColumn(
    fc.SequenceDenseColumn,
    collections.namedtuple(
        'SequenceNumericColumn',
        ('key', 'shape', 'default_value', 'dtype', 'normalizer_fn'))):
  """Represents sequences of numeric data.

  The column is an immutable named tuple of `key`, `shape`, `default_value`,
  `dtype` and `normalizer_fn`.
  """

  @property
  def _is_v2_column(self):
    """Always `True`."""
    return True

  @property
  def name(self):
    """See `FeatureColumn` base class. The name is the column's `key`."""
    return self.key

  @property
  def parse_example_spec(self):
    """See `FeatureColumn` base class.

    Returns:
      A single-entry dict mapping `key` to a `VarLenFeature` of this column's
      `dtype`.
    """
    var_len_feature = parsing_ops.VarLenFeature(self.dtype)
    return {self.key: var_len_feature}

  def transform_feature(self, transformation_cache, state_manager):
    """See `FeatureColumn` base class.

    Fetches the raw feature for `key` and, when a `normalizer_fn` is set,
    applies it.

    Args:
      transformation_cache: A `FeatureTransformationCache` object to access
        features.
      state_manager: A `StateManager` to create / access resources such as
        lookup tables.

    Returns:
      The input tensor, normalized if `normalizer_fn` is not `None`.
    """
    raw_tensor = transformation_cache.get(self.key, state_manager)
    normalize = self.normalizer_fn
    return raw_tensor if normalize is None else normalize(raw_tensor)

  @property
  def variable_shape(self):
    """Returns a `TensorShape` representing the shape of sequence input."""
    return tensor_shape.TensorShape(self.shape)

  def get_sequence_dense_tensor(self, transformation_cache, state_manager):
    """Returns a `TensorSequenceLengthPair`.

    The transformed sparse feature is densified using `default_value`,
    reshaped to `[batch_size, T, variable_shape]`, and paired with the sequence
    length derived from the sparse tensor.

    Args:
      transformation_cache: A `FeatureTransformationCache` object to access
        features.
      state_manager: A `StateManager` to create / access resources such as
        lookup tables.
    """
    sparse_input = transformation_cache.get(self, state_manager)
    padded = sparse_ops.sparse_tensor_to_dense(
        sparse_input, default_value=self.default_value)

    # Reshape into [batch_size, T, variable_shape]: keep the batch dimension,
    # let the time dimension be inferred (-1), append the per-step shape.
    batch_dim = array_ops.shape(padded)[:1]
    target_shape = array_ops.concat(
        [batch_dim, [-1], self.variable_shape], axis=0)
    padded = array_ops.reshape(padded, shape=target_shape)

    # Number of raw values that make up one timestep. For a 2D sparse input
    # the raw values are grouped according to the element count of
    # `variable_shape`; for the 3D case the grouping happens in the third
    # dimension, so the sequence length is not affected.
    values_per_step = (
        self.variable_shape.num_elements()
        if sparse_input.shape.ndims == 2 else 1)
    lengths = fc_utils.sequence_length_from_sparse_tensor(
        sparse_input, num_elements=values_per_step)

    return fc.SequenceDenseColumn.TensorSequenceLengthPair(
        dense_tensor=padded, sequence_length=lengths)

  @property
  def parents(self):
    """See `FeatureColumn` base class. The only parent is the raw `key`."""
    return [self.key]

  def get_config(self):
    """See `FeatureColumn` base class.

    Returns:
      A dict of the named-tuple fields, with `dtype` replaced by its name.
    """
    config = {field: value for field, value in zip(self._fields, self)}
    config['dtype'] = self.dtype.name
    return config

  @classmethod
  def from_config(cls, config, custom_objects=None, columns_by_name=None):
    """See `FeatureColumn` base class.

    Args:
      config: A dict keyed by this class's fields, with `dtype` given in a
        form accepted by `dtypes.as_dtype`.
      custom_objects: Unused by this column.
      columns_by_name: Unused by this column.

    Returns:
      A new instance of `cls` built from `config`.
    """
    fc._check_config_keys(config, cls._fields)
    init_kwargs = fc._standardize_and_copy_config(config)
    # Convert the stored dtype back into a `DType` object.
    init_kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
    return cls(**init_kwargs)
509# pylint: enable=protected-access