# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
15"""This API defines FeatureColumn abstraction.
17FeatureColumns provide a high level abstraction for ingesting and representing
18features. FeatureColumns are also the primary way of encoding features for
19canned `tf.estimator.Estimator`s.
21When using FeatureColumns with `Estimators`, the type of feature column you
22should choose depends on (1) the feature type and (2) the model type.
241. Feature type:
26 * Continuous features can be represented by `numeric_column`.
27 * Categorical features can be represented by any `categorical_column_with_*`
28 column:
29 - `categorical_column_with_vocabulary_list`
30 - `categorical_column_with_vocabulary_file`
31 - `categorical_column_with_hash_bucket`
32 - `categorical_column_with_identity`
33 - `weighted_categorical_column`
352. Model type:
37 * Deep neural network models (`DNNClassifier`, `DNNRegressor`).
39 Continuous features can be directly fed into deep neural network models.
41 age_column = numeric_column("age")
43 To feed sparse features into DNN models, wrap the column with
44 `embedding_column` or `indicator_column`. `indicator_column` is recommended
45 for features with only a few possible values. For features with many
46 possible values, to reduce the size of your model, `embedding_column` is
47 recommended.
49 embedded_dept_column = embedding_column(
50 categorical_column_with_vocabulary_list(
51 "department", ["math", "philosophy", ...]), dimension=10)
53 * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).
55 Sparse features can be fed directly into linear models. They behave like an
56 indicator column but with an efficient implementation.
58 dept_column = categorical_column_with_vocabulary_list("department",
59 ["math", "philosophy", "english"])
61 It is recommended that continuous features be bucketized before being
62 fed into linear models.
64 bucketized_age_column = bucketized_column(
65 source_column=age_column,
66 boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
68 Sparse features can be crossed (also known as conjuncted or combined) in
69 order to form non-linearities, and then fed into linear models.
71 cross_dept_age_column = crossed_column(
72 columns=["department", bucketized_age_column],
73 hash_bucket_size=1000)
75Example of building canned `Estimator`s using FeatureColumns:
77 ```python
78 # Define features and transformations
79 deep_feature_columns = [age_column, embedded_dept_column]
80 wide_feature_columns = [dept_column, bucketized_age_column,
81 cross_dept_age_column]
83 # Build deep model
84 estimator = DNNClassifier(
85 feature_columns=deep_feature_columns,
86 hidden_units=[500, 250, 50])
87 estimator.train(...)
89 # Or build a wide model
90 estimator = LinearClassifier(
91 feature_columns=wide_feature_columns)
92 estimator.train(...)
94 # Or build a wide and deep model!
95 estimator = DNNLinearCombinedClassifier(
96 linear_feature_columns=wide_feature_columns,
97 dnn_feature_columns=deep_feature_columns,
98 dnn_hidden_units=[500, 250, 50])
99 estimator.train(...)
100 ```
103FeatureColumns can also be transformed into a generic input layer for
104custom models using `input_layer`.
106Example of building model using FeatureColumns, this can be used in a
107`model_fn` which is given to the {tf.estimator.Estimator}:
109 ```python
110 # Building model via layers
112 deep_feature_columns = [age_column, embedded_dept_column]
113 columns_to_tensor = parse_feature_columns_from_examples(
114 serialized=my_data,
115 feature_columns=deep_feature_columns)
116 first_layer = input_layer(
117 features=columns_to_tensor,
118 feature_columns=deep_feature_columns)
119 second_layer = fully_connected(first_layer, ...)
120 ```
122NOTE: Functions prefixed with "_" indicate experimental or private parts of
123the API subject to change, and should not be relied upon!
124"""

import abc
import collections
import math
import re

import numpy as np
import six

from tensorflow.python.data.experimental.ops import lookup_ops as data_lookup_ops
from tensorflow.python.data.ops import readers
from tensorflow.python.eager import context
from tensorflow.python.feature_column import feature_column as fc_old
from tensorflow.python.feature_column import utils as fc_utils
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import array_ops_stack
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import cond
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import lookup_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import parsing_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.ops import string_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables
from tensorflow.python.platform import gfile
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.trackable import autotrackable
from tensorflow.python.trackable import base as trackable
from tensorflow.python.trackable import data_structures
from tensorflow.python.training import checkpoint_utils
from tensorflow.python.util import deprecation
from tensorflow.python.util import nest
from tensorflow.python.util import tf_inspect
from tensorflow.python.util.compat import collections_abc
from tensorflow.python.util.tf_export import tf_export
from tensorflow.tools.docs import doc_controls

_FEATURE_COLUMN_DEPRECATION_DATE = None
_FEATURE_COLUMN_DEPRECATION = ('The old _FeatureColumn APIs are being '
                               'deprecated. Please use the new FeatureColumn '
                               'APIs instead.')
_FEATURE_COLUMN_DEPRECATION_WARNING = """\
    Warning: tf.feature_column is not recommended for new code. Instead,
    feature preprocessing can be done directly using either [Keras preprocessing
    layers](https://www.tensorflow.org/guide/migrate/migrating_feature_columns)
    or through the one-stop utility [`tf.keras.utils.FeatureSpace`](https://www.tensorflow.org/api_docs/python/tf/keras/utils/FeatureSpace)
    built on top of them. See the [migration guide](https://tensorflow.org/guide/migrate)
    for details.
    """
_FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING = (
    'Use Keras preprocessing layers instead, either directly or via the '
    '`tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has '
    'a functional equivalent in `tf.keras.layers` for feature preprocessing '
    'when training a Keras model.')


class StateManager(object):
  """Manages the state associated with FeatureColumns.

  Some `FeatureColumn`s create variables or resources to assist their
  computation. The `StateManager` is responsible for creating and storing these
  objects since `FeatureColumn`s are supposed to be stateless configuration
  only.
  """

  def create_variable(self,
                      feature_column,
                      name,
                      shape,
                      dtype=None,
                      trainable=True,
                      use_resource=True,
                      initializer=None):
    """Creates a new variable.

    Args:
      feature_column: A `FeatureColumn` object this variable corresponds to.
      name: variable name.
      shape: variable shape.
      dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
      trainable: Whether this variable is trainable or not.
      use_resource: If true, we use resource variables. Otherwise we use
        RefVariable.
      initializer: initializer instance (callable).

    Returns:
      The created variable.
    """
    del feature_column, name, shape, dtype, trainable, use_resource, initializer
    raise NotImplementedError('StateManager.create_variable')

  def add_variable(self, feature_column, var):
    """Adds an existing variable to the state.

    Args:
      feature_column: A `FeatureColumn` object to associate this variable with.
      var: The variable.
    """
    del feature_column, var
    raise NotImplementedError('StateManager.add_variable')

  def get_variable(self, feature_column, name):
    """Returns an existing variable.

    Args:
      feature_column: A `FeatureColumn` object this variable corresponds to.
      name: variable name.
    """
    del feature_column, name
    raise NotImplementedError('StateManager.get_variable')

  def add_resource(self, feature_column, name, resource):
    """Adds a new resource.

    Resources can be things such as tables, variables, trackables, etc.

    Args:
      feature_column: A `FeatureColumn` object this resource corresponds to.
      name: Name of the resource.
      resource: The resource.

    Returns:
      The added resource.
    """
    del feature_column, name, resource
    raise NotImplementedError('StateManager.add_resource')

  def has_resource(self, feature_column, name):
    """Returns true iff a resource with the same name exists.

    Resources can be things such as tables, variables, trackables, etc.

    Args:
      feature_column: A `FeatureColumn` object this resource corresponds to.
      name: Name of the resource.
    """
    del feature_column, name
    raise NotImplementedError('StateManager.has_resource')

  def get_resource(self, feature_column, name):
    """Returns an already created resource.

    Resources can be things such as tables, variables, trackables, etc.

    Args:
      feature_column: A `FeatureColumn` object this resource corresponds to.
      name: Name of the resource.
    """
    del feature_column, name
    raise NotImplementedError('StateManager.get_resource')
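

# A minimal sketch of a concrete `StateManager`, for illustration only: it
# backs `create_variable`/`get_variable` with plain `Variable`s held in a
# dict instead of a host layer. The class name and the assumption that
# `initializer` is a callable taking `(shape, dtype)` (as `init_ops`
# initializers do) are illustrative, not part of the public API.
class _DictStateManagerSketch(StateManager):
  """Illustrative only: stores per-column variables in plain dicts."""

  def __init__(self, trainable=True):
    self._trainable = trainable
    self._vars = collections.defaultdict(dict)

  def create_variable(self,
                      feature_column,
                      name,
                      shape,
                      dtype=None,
                      trainable=True,
                      use_resource=True,
                      initializer=None):
    if name in self._vars[feature_column]:
      raise ValueError('Variable already exists.')
    # Assumes `initializer` follows the `init_ops` calling convention.
    var = variables.Variable(
        initial_value=initializer(shape=shape, dtype=dtype),
        trainable=self._trainable and trainable,
        name=name)
    self._vars[feature_column][name] = var
    return var

  def get_variable(self, feature_column, name):
    if name in self._vars[feature_column]:
      return self._vars[feature_column][name]
    raise ValueError('Variable does not exist.')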


@tf_export('__internal__.feature_column.StateManager', v1=[])
class _StateManagerImpl(StateManager):
  """Manages the state of DenseFeatures and LinearLayer.

  Some `FeatureColumn`s create variables or resources to assist their
  computation. The `StateManager` is responsible for creating and storing these
  objects since `FeatureColumn`s are supposed to be stateless configuration
  only.
  """

  def __init__(self, layer, trainable):
    """Creates an `_StateManagerImpl` object.

    Args:
      layer: The input layer this state manager is associated with.
      trainable: Whether, by default, variables created are trainable or not.
    """
    self._trainable = trainable
    self._layer = layer
    if self._layer is not None and not hasattr(self._layer, '_resources'):
      self._layer._resources = data_structures.Mapping()  # pylint: disable=protected-access
    self._cols_to_vars_map = collections.defaultdict(lambda: {})
    self._cols_to_resources_map = collections.defaultdict(lambda: {})

  def create_variable(self,
                      feature_column,
                      name,
                      shape,
                      dtype=None,
                      trainable=True,
                      use_resource=True,
                      initializer=None):
    """Creates a new variable.

    Args:
      feature_column: A `FeatureColumn` object this variable corresponds to.
      name: variable name.
      shape: variable shape.
      dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
      trainable: Whether this variable is trainable or not.
      use_resource: If true, we use resource variables. Otherwise we use
        RefVariable.
      initializer: initializer instance (callable).

    Returns:
      The created variable.
    """
    if name in self._cols_to_vars_map[feature_column]:
      raise ValueError('Variable already exists.')

    # We explicitly track these variables since `name` is not guaranteed to be
    # unique and disable manual tracking that the add_weight call does.
    with trackable.no_manual_dependency_tracking_scope(self._layer):
      var = self._layer.add_weight(
          name=name,
          shape=shape,
          dtype=dtype,
          initializer=initializer,
          trainable=self._trainable and trainable,
          use_resource=use_resource,
          # TODO(rohanj): Get rid of this hack once we have a mechanism for
          # specifying a default partitioner for an entire layer. In that case,
          # the default getter for Layers should work.
          getter=variable_scope.get_variable)
    if isinstance(var, variables.PartitionedVariable):
      for v in var:
        part_name = name + '/' + str(v._get_save_slice_info().var_offset[0])  # pylint: disable=protected-access
        self._layer._track_trackable(v, feature_column.name + '/' + part_name)  # pylint: disable=protected-access
    else:
      if isinstance(var, trackable.Trackable):
        self._layer._track_trackable(var, feature_column.name + '/' + name)  # pylint: disable=protected-access

    self._cols_to_vars_map[feature_column][name] = var
    return var

  def get_variable(self, feature_column, name):
    """Returns an existing variable.

    Args:
      feature_column: A `FeatureColumn` object this variable corresponds to.
      name: variable name.
    """
    if name in self._cols_to_vars_map[feature_column]:
      return self._cols_to_vars_map[feature_column][name]
    raise ValueError('Variable does not exist.')

  def add_resource(self, feature_column, resource_name, resource):
    """Adds a new resource.

    Resources can be things such as tables, variables, trackables, etc.

    Args:
      feature_column: A `FeatureColumn` object this resource corresponds to.
      resource_name: Name of the resource.
      resource: The resource.
    """
    self._cols_to_resources_map[feature_column][resource_name] = resource
    # pylint: disable=protected-access
    if self._layer is not None and isinstance(resource, trackable.Trackable):
      # Add trackable resources to the layer for serialization.
      if feature_column.name not in self._layer._resources:
        self._layer._resources[feature_column.name] = data_structures.Mapping()
      if resource_name not in self._layer._resources[feature_column.name]:
        self._layer._resources[feature_column.name][resource_name] = resource
    # pylint: enable=protected-access

  def has_resource(self, feature_column, resource_name):
    """Returns true iff a resource with the same name exists.

    Resources can be things such as tables, variables, trackables, etc.

    Args:
      feature_column: A `FeatureColumn` object this resource corresponds to.
      resource_name: Name of the resource.
    """
    return resource_name in self._cols_to_resources_map[feature_column]

  def get_resource(self, feature_column, resource_name):
    """Returns an already created resource.

    Resources can be things such as tables, variables, trackables, etc.

    Args:
      feature_column: A `FeatureColumn` object this resource corresponds to.
      resource_name: Name of the resource.
    """
    if (feature_column not in self._cols_to_resources_map or
        resource_name not in self._cols_to_resources_map[feature_column]):
      raise ValueError('Resource does not exist.')
    return self._cols_to_resources_map[feature_column][resource_name]
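

# A small usage sketch (illustrative, not part of the API): the resource
# methods above only need the host layer for trackable resources, so a plain
# dict round-trips through the manager without touching `add_weight`. The
# `_example_` name and the dummy resource are assumptions for illustration;
# `layer` would typically be the `DenseFeatures`/Keras layer owning the state.
def _example_state_manager_resources(layer):
  """Illustrative only: round-trips a resource through _StateManagerImpl."""
  manager = _StateManagerImpl(layer, trainable=True)
  price = numeric_column('price')
  manager.add_resource(price, 'stats', {'mean': 0.0})  # plain, non-trackable
  assert manager.has_resource(price, 'stats')
  return manager.get_resource(price, 'stats')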


def _transform_features_v2(features, feature_columns, state_manager):
  """Returns transformed features based on the feature columns passed in.

  Note that you most likely will not need to use this function. Check
  `input_layer` and `linear_model` to see whether they satisfy your use case.

  Example:

  ```python
  # Define features and transformations
  crosses_a_x_b = crossed_column(
      columns=["sparse_feature_a", "sparse_feature_b"], hash_bucket_size=10000)
  price_buckets = bucketized_column(
      source_column=numeric_column("price"), boundaries=[...])

  columns = [crosses_a_x_b, price_buckets]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  transformed = transform_features(features=features, feature_columns=columns)

  assertCountEqual(columns, transformed.keys())
  ```

  Args:
    features: A mapping from key to tensors. `FeatureColumn`s look up via these
      keys. For example `numeric_column('price')` will look at 'price' key in
      this dict. Values can be a `SparseTensor` or a `Tensor`, depending on the
      corresponding `FeatureColumn`.
    feature_columns: An iterable containing all the `FeatureColumn`s.
    state_manager: A StateManager object that holds the FeatureColumn state.

  Returns:
    A `dict` mapping `FeatureColumn` to `Tensor` and `SparseTensor` values.
  """
  feature_columns = _normalize_feature_columns(feature_columns)
  outputs = {}
  with ops.name_scope(
      None, default_name='transform_features', values=features.values()):
    transformation_cache = FeatureTransformationCache(features)
    for column in feature_columns:
      with ops.name_scope(
          None,
          default_name=_sanitize_column_name_for_variable_scope(column.name)):
        outputs[column] = transformation_cache.get(column, state_manager)
  return outputs
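

# A concrete sketch of the function above (the `_example_` helper is an
# illustrative assumption, not part of the module): numeric and bucketized
# columns create no state, so `state_manager` may be `None` here.
def _example_transform_features():
  """Illustrative only: transforms a dense feature and its bucketized view."""
  from tensorflow.python.framework import constant_op
  price = numeric_column('price')
  price_buckets = bucketized_column(price, boundaries=[10., 20.])
  features = {'price': constant_op.constant([[5.], [15.], [25.]])}
  outputs = _transform_features_v2(features, [price, price_buckets], None)
  # outputs[price] holds the dense tensor; outputs[price_buckets] holds the
  # bucket indices [[0], [1], [2]] under the boundaries above.
  return outputs[price_buckets]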


@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export(
    'feature_column.make_parse_example_spec',
    v1=[])
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def make_parse_example_spec_v2(feature_columns):
  """Creates a parsing spec dictionary from the input feature_columns.

  The returned dictionary can be used as the arg 'features' in
  `tf.io.parse_example`.

  Typical usage example:

  ```python
  # Define features and transformations
  feature_a = tf.feature_column.categorical_column_with_vocabulary_file(...)
  feature_b = tf.feature_column.numeric_column(...)
  feature_c_bucketized = tf.feature_column.bucketized_column(
      tf.feature_column.numeric_column("feature_c"), ...)
  feature_a_x_feature_c = tf.feature_column.crossed_column(
      columns=["feature_a", feature_c_bucketized], ...)

  feature_columns = set(
      [feature_b, feature_c_bucketized, feature_a_x_feature_c])
  features = tf.io.parse_example(
      serialized=serialized_examples,
      features=tf.feature_column.make_parse_example_spec(feature_columns))
  ```

  For the above example, make_parse_example_spec would return the dict:

  ```python
  {
      "feature_a": parsing_ops.VarLenFeature(tf.string),
      "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32),
      "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32)
  }
  ```

  Args:
    feature_columns: An iterable containing all feature columns. All items
      should be instances of classes derived from `FeatureColumn`.

  Returns:
    A dict mapping each feature key to a `FixedLenFeature` or `VarLenFeature`
    value.

  Raises:
    ValueError: If any of the given `feature_columns` is not a `FeatureColumn`
      instance.
  """
  result = {}
  for column in feature_columns:
    if not isinstance(column, FeatureColumn):
      raise ValueError('All feature_columns must be FeatureColumn instances. '
                       'Given: {}'.format(column))
    config = column.parse_example_spec
    for key, value in six.iteritems(config):
      if key in result and value != result[key]:
        raise ValueError('feature_columns contain different parse_spec for key '
                         '{}. Given {} and {}'.format(key, value, result[key]))
    result.update(config)
  return result
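

# A runnable sketch of the spec-merging behavior documented above (the
# `_example_` helper is illustrative, not part of the module): two columns
# reading the same raw 'price' feature collapse to a single spec entry.
def _example_make_parse_example_spec():
  """Illustrative only: specs are merged per raw feature key."""
  price = numeric_column('price')
  price_buckets = bucketized_column(price, boundaries=[10., 20.])
  spec = make_parse_example_spec_v2([price, price_buckets])
  # Both columns parse the same 'price' input, so the dict has one entry:
  # {'price': FixedLenFeature(shape=(1,), dtype=float32, ...)}
  return spec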


@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export('feature_column.embedding_column')
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def embedding_column(categorical_column,
                     dimension,
                     combiner='mean',
                     initializer=None,
                     ckpt_to_load_from=None,
                     tensor_name_in_ckpt=None,
                     max_norm=None,
                     trainable=True,
                     use_safe_embedding_lookup=True):
  """`DenseColumn` that converts from sparse, categorical input.

  Use this when your inputs are sparse, but you want to convert them to a dense
  representation (e.g., to feed to a DNN).

  Inputs must be a `CategoricalColumn` created by any of the
  `categorical_column_*` functions. Here is an example of using
  `embedding_column` with `DNNClassifier`:

  ```python
  video_id = categorical_column_with_identity(
      key='video_id', num_buckets=1000000, default_value=0)
  columns = [embedding_column(video_id, 9),...]

  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)

  label_column = ...
  def input_fn():
    features = tf.io.parse_example(
        ..., features=make_parse_example_spec(columns + [label_column]))
    labels = features.pop(label_column.name)
    return features, labels

  estimator.train(input_fn=input_fn, steps=100)
  ```

  Here is an example using `embedding_column` with model_fn:

  ```python
  def model_fn(features, ...):
    video_id = categorical_column_with_identity(
        key='video_id', num_buckets=1000000, default_value=0)
    columns = [embedding_column(video_id, 9),...]
    dense_tensor = input_layer(features, columns)
    # Form DNN layers, calculate loss, and return EstimatorSpec.
    ...
  ```

  Args:
    categorical_column: A `CategoricalColumn` created by a
      `categorical_column_with_*` function. This column produces the sparse IDs
      that are inputs to the embedding lookup.
    dimension: An integer specifying the dimension of the embedding; must be
      > 0.
    combiner: A string specifying how to reduce if there are multiple entries in
      a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an
      example-level normalization on the column. For more information, see
      `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `truncated_normal_initializer` with mean `0.0` and standard deviation
      `1/sqrt(dimension)`.
    ckpt_to_load_from: String representing checkpoint name/pattern from which to
      restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from which
      to restore the column weights. Required if `ckpt_to_load_from` is not
      `None`.
    max_norm: If not `None`, each embedding is clipped if its l2-norm is larger
      than this value, before combining.
    trainable: Whether or not the embedding is trainable. Default is True.
    use_safe_embedding_lookup: If true, uses safe_embedding_lookup_sparse
      instead of embedding_lookup_sparse. safe_embedding_lookup_sparse ensures
      there are no empty rows and all weights and ids are positive at the
      expense of extra compute cost. This only applies to rank 2 (NxM) shaped
      input tensors. Defaults to true; consider turning it off if the above
      checks are not needed. Note that having empty rows will not trigger any
      error, though the output result might be 0 or omitted.

  Returns:
    `DenseColumn` that converts from sparse input.

  Raises:
    ValueError: if `dimension` not > 0.
    ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt`
      is specified.
    ValueError: if `initializer` is specified and is not callable.
    RuntimeError: If eager execution is enabled.
  """
  if (dimension is None) or (dimension < 1):
    raise ValueError('Invalid dimension {}.'.format(dimension))
  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
    raise ValueError('Must specify both `ckpt_to_load_from` and '
                     '`tensor_name_in_ckpt` or none of them.')

  if (initializer is not None) and (not callable(initializer)):
    raise ValueError('initializer must be callable if specified. '
                     'Embedding of column_name: {}'.format(
                         categorical_column.name))
  if initializer is None:
    initializer = init_ops.truncated_normal_initializer(
        mean=0.0, stddev=1 / math.sqrt(dimension))

  return EmbeddingColumn(
      categorical_column=categorical_column,
      dimension=dimension,
      combiner=combiner,
      initializer=initializer,
      ckpt_to_load_from=ckpt_to_load_from,
      tensor_name_in_ckpt=tensor_name_in_ckpt,
      max_norm=max_norm,
      trainable=trainable,
      use_safe_embedding_lookup=use_safe_embedding_lookup)
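

# A short sketch of the default initializer scale above (the `_example_`
# helper is illustrative, not part of the module): with no `initializer`,
# the embedding weights are drawn from a truncated normal distribution with
# standard deviation `1/sqrt(dimension)`.
def _example_embedding_column_defaults():
  """Illustrative only: default embedding initializer scale."""
  keywords = categorical_column_with_hash_bucket('keywords', 10000)
  # dimension=16 -> truncated_normal_initializer(mean=0.0, stddev=0.25).
  return embedding_column(keywords, dimension=16)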


@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export(v1=['feature_column.shared_embedding_columns'])
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def shared_embedding_columns(categorical_columns,
                             dimension,
                             combiner='mean',
                             initializer=None,
                             shared_embedding_collection_name=None,
                             ckpt_to_load_from=None,
                             tensor_name_in_ckpt=None,
                             max_norm=None,
                             trainable=True,
                             use_safe_embedding_lookup=True):
  """List of dense columns that convert from sparse, categorical input.

  This is similar to `embedding_column`, except that it produces a list of
  embedding columns that share the same embedding weights.

  Use this when your inputs are sparse and of the same type (e.g. watched and
  impression video IDs that share the same vocabulary), and you want to convert
  them to a dense representation (e.g., to feed to a DNN).

  Inputs must be a list of categorical columns created by any of the
  `categorical_column_*` functions. They must all be of the same type and have
  the same arguments except `key`. E.g. they can be
  categorical_column_with_vocabulary_file with the same vocabulary_file. Some or
  all columns could also be weighted_categorical_column.

  Here is an example embedding of two features for a DNNClassifier model:

  ```python
  watched_video_id = categorical_column_with_vocabulary_file(
      'watched_video_id', video_vocabulary_file, video_vocabulary_size)
  impression_video_id = categorical_column_with_vocabulary_file(
      'impression_video_id', video_vocabulary_file, video_vocabulary_size)
  columns = shared_embedding_columns(
      [watched_video_id, impression_video_id], dimension=10)

  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)

  label_column = ...
  def input_fn():
    features = tf.io.parse_example(
        ..., features=make_parse_example_spec(columns + [label_column]))
    labels = features.pop(label_column.name)
    return features, labels

  estimator.train(input_fn=input_fn, steps=100)
  ```

  Here is an example using `shared_embedding_columns` with model_fn:

  ```python
  def model_fn(features, ...):
    watched_video_id = categorical_column_with_vocabulary_file(
        'watched_video_id', video_vocabulary_file, video_vocabulary_size)
    impression_video_id = categorical_column_with_vocabulary_file(
        'impression_video_id', video_vocabulary_file, video_vocabulary_size)
    columns = shared_embedding_columns(
        [watched_video_id, impression_video_id], dimension=10)
    dense_tensor = input_layer(features, columns)
    # Form DNN layers, calculate loss, and return EstimatorSpec.
    ...
  ```

  Args:
    categorical_columns: List of categorical columns created by a
      `categorical_column_with_*` function. These columns produce the sparse IDs
      that are inputs to the embedding lookup. All columns must be of the same
      type and have the same arguments except `key`. E.g. they can be
      categorical_column_with_vocabulary_file with the same vocabulary_file.
      Some or all columns could also be weighted_categorical_column.
    dimension: An integer specifying the dimension of the embedding; must be
      > 0.
    combiner: A string specifying how to reduce if there are multiple entries in
      a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an
      example-level normalization on the column. For more information, see
      `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `truncated_normal_initializer` with mean `0.0` and standard deviation
      `1/sqrt(dimension)`.
    shared_embedding_collection_name: Optional name of the collection where
      shared embedding weights are added. If not given, a reasonable name will
      be chosen based on the names of `categorical_columns`. This is also used
      in `variable_scope` when creating shared embedding weights.
    ckpt_to_load_from: String representing checkpoint name/pattern from which to
      restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from which
      to restore the column weights. Required if `ckpt_to_load_from` is not
      `None`.
    max_norm: If not `None`, each embedding is clipped if its l2-norm is larger
      than this value, before combining.
    trainable: Whether or not the embedding is trainable. Default is True.
    use_safe_embedding_lookup: If true, uses safe_embedding_lookup_sparse
      instead of embedding_lookup_sparse. safe_embedding_lookup_sparse ensures
      there are no empty rows and all weights and ids are positive at the
      expense of extra compute cost. This only applies to rank 2 (NxM) shaped
      input tensors. Defaults to true; consider turning it off if the above
      checks are not needed. Note that having empty rows will not trigger any
      error, though the output result might be 0 or omitted.

  Returns:
    A list of dense columns that converts from sparse input. The order of
    results follows the ordering of `categorical_columns`.

  Raises:
    ValueError: if `dimension` not > 0.
    ValueError: if any of the given `categorical_columns` is of different type
      or has different arguments than the others.
    ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt`
      is specified.
    ValueError: if `initializer` is specified and is not callable.
    RuntimeError: if eager execution is enabled.
  """
  if context.executing_eagerly():
    raise RuntimeError('shared_embedding_columns are not supported when eager '
                       'execution is enabled.')

  if (dimension is None) or (dimension < 1):
    raise ValueError('Invalid dimension {}.'.format(dimension))
  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
    raise ValueError('Must specify both `ckpt_to_load_from` and '
                     '`tensor_name_in_ckpt` or none of them.')

  if (initializer is not None) and (not callable(initializer)):
    raise ValueError('initializer must be callable if specified.')
  if initializer is None:
    initializer = init_ops.truncated_normal_initializer(
        mean=0.0, stddev=1. / math.sqrt(dimension))

  # Sort the columns so the default collection name is deterministic even if
  # the user passes columns from an unsorted collection, such as dict.values().
  sorted_columns = sorted(categorical_columns, key=lambda x: x.name)

  c0 = sorted_columns[0]
  num_buckets = c0._num_buckets  # pylint: disable=protected-access
  if not isinstance(c0, fc_old._CategoricalColumn):  # pylint: disable=protected-access
    raise ValueError(
        'All categorical_columns must be subclasses of _CategoricalColumn. '
        'Given: {}, of type: {}'.format(c0, type(c0)))
  while isinstance(
      c0,
      (
          fc_old._WeightedCategoricalColumn,  # pylint: disable=protected-access
          WeightedCategoricalColumn,
          fc_old._SequenceCategoricalColumn,  # pylint: disable=protected-access
          SequenceCategoricalColumn)):
    c0 = c0.categorical_column
  for c in sorted_columns[1:]:
    while isinstance(
        c,
        (
            fc_old._WeightedCategoricalColumn,  # pylint: disable=protected-access
            WeightedCategoricalColumn,
            fc_old._SequenceCategoricalColumn,  # pylint: disable=protected-access
            SequenceCategoricalColumn)):
      c = c.categorical_column
    if not isinstance(c, type(c0)):
      raise ValueError(
          'To use shared_embedding_column, all categorical_columns must have '
          'the same type, or be weighted_categorical_column or sequence column '
          'of the same type. Given column: {} of type: {} does not match given '
          'column: {} of type: {}'.format(c0, type(c0), c, type(c)))
    if num_buckets != c._num_buckets:  # pylint: disable=protected-access
      raise ValueError(
          'To use shared_embedding_column, all categorical_columns must have '
          'the same number of buckets. Given column: {} with buckets: {} does '
          'not match column: {} with buckets: {}'.format(
              c0, num_buckets, c, c._num_buckets))  # pylint: disable=protected-access

  if not shared_embedding_collection_name:
    shared_embedding_collection_name = '_'.join(c.name for c in sorted_columns)
    shared_embedding_collection_name += '_shared_embedding'

  result = []
  for column in categorical_columns:
    result.append(
        fc_old._SharedEmbeddingColumn(  # pylint: disable=protected-access
            categorical_column=column,
            initializer=initializer,
            dimension=dimension,
            combiner=combiner,
            shared_embedding_collection_name=shared_embedding_collection_name,
            ckpt_to_load_from=ckpt_to_load_from,
            tensor_name_in_ckpt=tensor_name_in_ckpt,
            max_norm=max_norm,
            trainable=trainable,
            use_safe_embedding_lookup=use_safe_embedding_lookup))

  return result


@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export(
    'feature_column.shared_embeddings',
    v1=[])
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def shared_embedding_columns_v2(categorical_columns,
                                dimension,
                                combiner='mean',
                                initializer=None,
                                shared_embedding_collection_name=None,
                                ckpt_to_load_from=None,
                                tensor_name_in_ckpt=None,
                                max_norm=None,
                                trainable=True,
                                use_safe_embedding_lookup=True):
  """List of dense columns that convert from sparse, categorical input.

  This is similar to `embedding_column`, except that it produces a list of
  embedding columns that share the same embedding weights.

  Use this when your inputs are sparse and of the same type (e.g. watched and
  impression video IDs that share the same vocabulary), and you want to convert
  them to a dense representation (e.g., to feed to a DNN).

  Inputs must be a list of categorical columns created by any of the
  `categorical_column_*` functions. They must all be of the same type and have
  the same arguments except `key`. E.g. they can be
  categorical_column_with_vocabulary_file with the same vocabulary_file. Some or
  all columns could also be weighted_categorical_column.

  Here is an example embedding of two features for a DNNClassifier model:

  ```python
  watched_video_id = categorical_column_with_vocabulary_file(
      'watched_video_id', video_vocabulary_file, video_vocabulary_size)
  impression_video_id = categorical_column_with_vocabulary_file(
      'impression_video_id', video_vocabulary_file, video_vocabulary_size)
  columns = shared_embedding_columns(
      [watched_video_id, impression_video_id], dimension=10)

  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)

  label_column = ...
  def input_fn():
    features = tf.io.parse_example(
        ..., features=make_parse_example_spec(columns + [label_column]))
    labels = features.pop(label_column.name)
    return features, labels

  estimator.train(input_fn=input_fn, steps=100)
  ```

  Here is an example using `shared_embedding_columns` with model_fn:

  ```python
  def model_fn(features, ...):
    watched_video_id = categorical_column_with_vocabulary_file(
        'watched_video_id', video_vocabulary_file, video_vocabulary_size)
    impression_video_id = categorical_column_with_vocabulary_file(
        'impression_video_id', video_vocabulary_file, video_vocabulary_size)
    columns = shared_embedding_columns(
        [watched_video_id, impression_video_id], dimension=10)
    dense_tensor = input_layer(features, columns)
    # Form DNN layers, calculate loss, and return EstimatorSpec.
    ...
  ```

  Args:
    categorical_columns: List of categorical columns created by a
      `categorical_column_with_*` function. These columns produce the sparse IDs
      that are inputs to the embedding lookup. All columns must be of the same
      type and have the same arguments except `key`. E.g. they can be
      categorical_column_with_vocabulary_file with the same vocabulary_file.
      Some or all columns could also be weighted_categorical_column.
    dimension: An integer specifying the dimension of the embedding; must be
      > 0.
    combiner: A string specifying how to reduce if there are multiple entries in
      a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an
      example-level normalization on the column. For more information, see
      `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `truncated_normal_initializer` with mean `0.0` and standard deviation
      `1/sqrt(dimension)`.
    shared_embedding_collection_name: Optional collective name of these columns.
      If not given, a reasonable name will be chosen based on the names of
      `categorical_columns`.
    ckpt_to_load_from: String representing checkpoint name/pattern from which to
      restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from which
      to restore the column weights. Required if `ckpt_to_load_from` is not
      `None`.
    max_norm: If not `None`, each embedding is clipped if its l2-norm is larger
      than this value, before combining.
    trainable: Whether or not the embedding is trainable. Default is True.
    use_safe_embedding_lookup: If true, uses safe_embedding_lookup_sparse
      instead of embedding_lookup_sparse. safe_embedding_lookup_sparse ensures
      there are no empty rows and all weights and ids are positive at the
      expense of extra compute cost. This only applies to rank 2 (NxM) shaped
      input tensors. Defaults to true; consider turning it off if the above
      checks are not needed. Note that having empty rows will not trigger any
      error, though the output result might be 0 or omitted.

  Returns:
    A list of dense columns that converts from sparse input. The order of
    results follows the ordering of `categorical_columns`.

  Raises:
    ValueError: if `dimension` not > 0.
    ValueError: if any of the given `categorical_columns` is of different type
      or has different arguments than the others.
    ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt`
      is specified.
    ValueError: if `initializer` is specified and is not callable.
    RuntimeError: if eager execution is enabled.
  """
  if context.executing_eagerly():
    raise RuntimeError('shared_embedding_columns are not supported when eager '
                       'execution is enabled.')

  if (dimension is None) or (dimension < 1):
    raise ValueError('Invalid dimension {}.'.format(dimension))
  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
    raise ValueError('Must specify both `ckpt_to_load_from` and '
                     '`tensor_name_in_ckpt` or none of them.')

  if (initializer is not None) and (not callable(initializer)):
    raise ValueError('initializer must be callable if specified.')
  if initializer is None:
    initializer = init_ops.truncated_normal_initializer(
        mean=0.0, stddev=1. / math.sqrt(dimension))

  # Sort the columns so the default collection name is deterministic even if
  # the user passes columns from an unsorted collection, such as dict.values().
  sorted_columns = sorted(categorical_columns, key=lambda x: x.name)

  c0 = sorted_columns[0]
  num_buckets = c0.num_buckets
  if not isinstance(c0, CategoricalColumn):
    raise ValueError(
        'All categorical_columns must be subclasses of CategoricalColumn. '
        'Given: {}, of type: {}'.format(c0, type(c0)))
  while isinstance(c0, (WeightedCategoricalColumn, SequenceCategoricalColumn)):
    c0 = c0.categorical_column
  for c in sorted_columns[1:]:
    while isinstance(c, (WeightedCategoricalColumn, SequenceCategoricalColumn)):
      c = c.categorical_column
    if not isinstance(c, type(c0)):
      raise ValueError(
          'To use shared_embedding_column, all categorical_columns must have '
          'the same type, or be weighted_categorical_column or sequence column '
          'of the same type. Given column: {} of type: {} does not match given '
          'column: {} of type: {}'.format(c0, type(c0), c, type(c)))
    if num_buckets != c.num_buckets:
      raise ValueError(
          'To use shared_embedding_column, all categorical_columns must have '
          'the same number of buckets. Given column: {} with buckets: {} does '
          'not match column: {} with buckets: {}'.format(
              c0, num_buckets, c, c.num_buckets))

  if not shared_embedding_collection_name:
    shared_embedding_collection_name = '_'.join(c.name for c in sorted_columns)
    shared_embedding_collection_name += '_shared_embedding'

  column_creator = SharedEmbeddingColumnCreator(
      dimension, initializer, ckpt_to_load_from, tensor_name_in_ckpt,
      num_buckets, trainable, shared_embedding_collection_name,
      use_safe_embedding_lookup)

  result = []
  for column in categorical_columns:
    result.append(
        column_creator(
            categorical_column=column, combiner=combiner, max_norm=max_norm))

  return result
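

# A pure-Python sketch of the default naming logic above (the `_example_`
# helper is illustrative, not part of the module): columns are sorted by name
# before the shared name is derived, so the result is order-independent.
def _example_shared_embedding_default_name():
  """Illustrative only: derives the default shared embedding name."""
  watched = categorical_column_with_hash_bucket('watched_video_id', 1000)
  impression = categorical_column_with_hash_bucket('impression_video_id', 1000)
  name = '_'.join(sorted([watched.name, impression.name]))
  name += '_shared_embedding'
  # -> 'impression_video_id_watched_video_id_shared_embedding'
  return name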


@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export('feature_column.numeric_column')
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def numeric_column(key,
                   shape=(1,),
                   default_value=None,
                   dtype=dtypes.float32,
                   normalizer_fn=None):
  """Represents real valued or numerical features.

  Example:

  Assume we have data with two features `a` and `b`.

  >>> data = {'a': [15, 9, 17, 19, 21, 18, 25, 30],
  ...         'b': [5.0, 6.4, 10.5, 13.6, 15.7, 19.9, 20.3, 0.0]}

  Let us represent the features `a` and `b` as numerical features.

  >>> a = tf.feature_column.numeric_column('a')
  >>> b = tf.feature_column.numeric_column('b')

  Feature columns describe a set of transformations to the inputs.

  For example, to "bucketize" feature `a`, wrap the `a` column in a
  `feature_column.bucketized_column`.
  Providing `5` bucket boundaries, the `bucketized_column` API
  will bucket this feature into a total of `6` buckets.

  >>> a_buckets = tf.feature_column.bucketized_column(a,
  ...    boundaries=[10, 15, 20, 25, 30])

  Create a `DenseFeatures` layer which will apply the transformations
  described by the set of `tf.feature_column` objects:

  >>> feature_layer = tf.keras.layers.DenseFeatures([a_buckets, b])
  >>> print(feature_layer(data))
  tf.Tensor(
  [[ 0.   0.   1.   0.   0.   0.   5. ]
   [ 1.   0.   0.   0.   0.   0.   6.4]
   [ 0.   0.   1.   0.   0.   0.  10.5]
   [ 0.   0.   1.   0.   0.   0.  13.6]
   [ 0.   0.   0.   1.   0.   0.  15.7]
   [ 0.   0.   1.   0.   0.   0.  19.9]
   [ 0.   0.   0.   0.   1.   0.  20.3]
   [ 0.   0.   0.   0.   0.   1.   0. ]], shape=(8, 7), dtype=float32)

  Args:
    key: A unique string identifying the input feature. It is used as the column
      name and the dictionary key for feature parsing configs, feature `Tensor`
      objects, and feature columns.
    shape: An iterable of integers that specifies the shape of the `Tensor`. An
      integer can be given, which means a single dimension `Tensor` with given
      width. The `Tensor` representing the column will have the shape of
      [batch_size] + `shape`.
    default_value: A single value compatible with `dtype` or an iterable of
      values compatible with `dtype` which the column takes on during
      `tf.Example` parsing if data is missing. A default value of `None` will
      cause `tf.io.parse_example` to fail if an example does not contain this
      column. If a single value is provided, the same value will be applied as
      the default value for every item. If an iterable of values is provided,
      the shape of the `default_value` should be equal to the given `shape`.
    dtype: defines the type of values. Default value is `tf.float32`. Must be a
      non-quantized, real integer or floating point type.
    normalizer_fn: If not `None`, a function that can be used to normalize the
      value of the tensor after `default_value` is applied for parsing.
      Normalizer function takes the input `Tensor` as its argument, and returns
      the output `Tensor` (e.g. lambda x: (x - 3.0) / 4.2). Please note that
      even though the most common use case of this function is normalization, it
      can be used for any kind of TensorFlow transformation.

  Returns:
    A `NumericColumn`.

  Raises:
    TypeError: if any dimension in shape is not an int.
    ValueError: if any dimension in shape is not a positive integer.
    TypeError: if `default_value` is an iterable but not compatible with
      `shape`.
    TypeError: if `default_value` is not compatible with `dtype`.
    ValueError: if `dtype` is not convertible to `tf.float32`.
  """
  shape = _check_shape(shape, key)
  if not (dtype.is_integer or dtype.is_floating):
    raise ValueError('dtype must be convertible to float. '
                     'dtype: {}, key: {}'.format(dtype, key))
  default_value = fc_utils.check_default_value(shape, default_value, dtype, key)

  if normalizer_fn is not None and not callable(normalizer_fn):
    raise TypeError(
        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))

  fc_utils.assert_key_is_string(key)
  return NumericColumn(
      key,
      shape=shape,
      default_value=default_value,
      dtype=dtype,
      normalizer_fn=normalizer_fn)
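

# A small sketch of `normalizer_fn` (the `_example_` helper is illustrative,
# not part of the module): the function is applied to the dense tensor after
# parsing defaults have been filled in.
def _example_numeric_column_normalizer():
  """Illustrative only: normalizes 'price' with (x - 3.0) / 4.2."""
  from tensorflow.python.framework import constant_op
  price = numeric_column('price', normalizer_fn=lambda x: (x - 3.0) / 4.2)
  features = {'price': constant_op.constant([[3.0], [7.2]])}
  outputs = _transform_features_v2(features, [price], None)
  return outputs[price]  # -> [[0.], [1.]]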


@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export('feature_column.bucketized_column')
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def bucketized_column(source_column, boundaries):
  """Represents discretized dense input bucketed by `boundaries`.

  Buckets include the left boundary, and exclude the right boundary. Namely,
  `boundaries=[0., 1., 2.]` generates buckets `(-inf, 0.)`, `[0., 1.)`,
  `[1., 2.)`, and `[2., +inf)`.

  For example, if the inputs are

  ```python
  boundaries = [0, 10, 100]
  input tensor = [[-5, 10000]
                  [150,   10]
                  [5,    100]]
  ```

  then the output will be

  ```python
  output = [[0, 3]
            [3, 2]
            [1, 3]]
  ```

  Example:

  ```python
  price = tf.feature_column.numeric_column('price')
  bucketized_price = tf.feature_column.bucketized_column(
      price, boundaries=[...])
  columns = [bucketized_price, ...]
  features = tf.io.parse_example(
      ..., features=tf.feature_column.make_parse_example_spec(columns))
  dense_tensor = tf.keras.layers.DenseFeatures(columns)(features)
  ```

  A `bucketized_column` can also be crossed with another categorical column
  using `crossed_column`:

  ```python
  price = tf.feature_column.numeric_column('price')
  # bucketized_column converts numerical feature to a categorical one.
  bucketized_price = tf.feature_column.bucketized_column(
      price, boundaries=[...])
  # 'keywords' is a string feature.
  price_x_keywords = tf.feature_column.crossed_column(
      [bucketized_price, 'keywords'], 50000)
  columns = [price_x_keywords, ...]
  features = tf.io.parse_example(
      ..., features=tf.feature_column.make_parse_example_spec(columns))
  dense_tensor = tf.keras.layers.DenseFeatures(columns)(features)
  linear_model = tf.keras.experimental.LinearModel(units=...)(dense_tensor)
  ```

  Args:
    source_column: A one-dimensional dense column which is generated with
      `numeric_column`.
    boundaries: A sorted list or tuple of floats specifying the boundaries.

  Returns:
    A `BucketizedColumn`.

  Raises:
    ValueError: If `source_column` is not a numeric column, or if it is not
      one-dimensional.
    ValueError: If `boundaries` is not a sorted list or tuple.
  """
  if not isinstance(source_column, (NumericColumn, fc_old._NumericColumn)):  # pylint: disable=protected-access
    raise ValueError(
        'source_column must be a column generated with numeric_column(). '
        'Given: {}'.format(source_column))
  if len(source_column.shape) > 1:
    raise ValueError('source_column must be one-dimensional column. '
                     'Given: {}'.format(source_column))
  if not boundaries:
    raise ValueError('boundaries must not be empty.')
  if not (isinstance(boundaries, list) or isinstance(boundaries, tuple)):
    raise ValueError('boundaries must be a sorted list.')
  for i in range(len(boundaries) - 1):
    if boundaries[i] >= boundaries[i + 1]:
      raise ValueError('boundaries must be a sorted list.')
  return BucketizedColumn(source_column, tuple(boundaries))
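

# The boundary semantics above match Python's `bisect.bisect_right`; this
# standalone sketch (illustrative only, not part of the module) reproduces
# the documented example without any TensorFlow ops.
def _example_bucket_semantics():
  """Illustrative only: left-inclusive, right-exclusive buckets."""
  import bisect
  boundaries = [0, 10, 100]
  inputs = [[-5, 10000], [150, 10], [5, 100]]
  output = [[bisect.bisect_right(boundaries, v) for v in row] for row in inputs]
  assert output == [[0, 3], [3, 2], [1, 3]]
  return output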


@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export('feature_column.categorical_column_with_hash_bucket')
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def categorical_column_with_hash_bucket(key,
                                        hash_bucket_size,
                                        dtype=dtypes.string):
  """Represents a sparse feature where ids are set by hashing.

  Use this when your sparse features are in string or integer format, and you
  want to distribute your inputs into a finite number of buckets by hashing:
  output_id = Hash(input_feature_string) % bucket_size for string type input.
  For int type input, the value is converted to its string representation first
  and then hashed by the same formula.

  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
  and `''` for string, which will be dropped by this feature column.

  Example:

  ```python
  import tensorflow as tf
  keywords = tf.feature_column.categorical_column_with_hash_bucket("keywords",
                                                                   10000)
  columns = [keywords]
  features = {'keywords': tf.constant([['Tensorflow', 'Keras', 'RNN', 'LSTM',
             'CNN'], ['LSTM', 'CNN', 'Tensorflow', 'Keras', 'RNN'], ['CNN',
             'Tensorflow', 'LSTM', 'Keras', 'RNN']])}
  linear_prediction = tf.compat.v1.feature_column.linear_model(features,
                                                               columns)

  # or
  import tensorflow as tf
  keywords = tf.feature_column.categorical_column_with_hash_bucket("keywords",
                                                                   10000)
  keywords_embedded = tf.feature_column.embedding_column(keywords, 16)
  columns = [keywords_embedded]
  features = {'keywords': tf.constant([['Tensorflow', 'Keras', 'RNN', 'LSTM',
             'CNN'], ['LSTM', 'CNN', 'Tensorflow', 'Keras', 'RNN'], ['CNN',
             'Tensorflow', 'LSTM', 'Keras', 'RNN']])}
  input_layer = tf.keras.layers.DenseFeatures(columns)
  dense_tensor = input_layer(features)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the column
      name and the dictionary key for feature parsing configs, feature `Tensor`
      objects, and feature columns.
    hash_bucket_size: An int >= 1. The number of buckets.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `HashedCategoricalColumn`.

  Raises:
    ValueError: `hash_bucket_size` is not set or is less than 1.
    ValueError: `dtype` is neither string nor integer.
  """
  if hash_bucket_size is None:
    raise ValueError('hash_bucket_size must be set. key: {}'.format(key))

  if hash_bucket_size < 1:
    raise ValueError('hash_bucket_size must be at least 1. '
                     'hash_bucket_size: {}, key: {}'.format(
                         hash_bucket_size, key))

  fc_utils.assert_key_is_string(key)
  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))

  return HashedCategoricalColumn(key, hash_bucket_size, dtype)
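

# A sketch of the hashing formula above (illustrative only): the public
# `string_to_hash_bucket_fast` op computes Hash(input) % num_buckets, which is
# the same family of op this column uses for string inputs; treat the exact
# ids as implementation-defined.
def _example_hash_bucket_ids():
  """Illustrative only: hashes raw strings into 10000 buckets."""
  return string_ops.string_to_hash_bucket_fast(
      ['Tensorflow', 'Keras', 'RNN'], num_buckets=10000)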


@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export(v1=['feature_column.categorical_column_with_vocabulary_file'])
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def categorical_column_with_vocabulary_file(key,
                                            vocabulary_file,
                                            vocabulary_size=None,
                                            num_oov_buckets=0,
                                            default_value=None,
                                            dtype=dtypes.string):
  """A `CategoricalColumn` with a vocabulary file.

  Use this when your inputs are in string or integer format, and you have a
  vocabulary file that maps each value to an integer ID. By default,
  out-of-vocabulary values are ignored. Use either (but not both) of
  `num_oov_buckets` and `default_value` to specify how to include
  out-of-vocabulary values.

  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
  and `''` for string, which will be dropped by this feature column.

  Example with `num_oov_buckets`:
  File `'/us/states.txt'` contains 50 lines, each with a 2-character U.S. state
  abbreviation. All inputs with values in that file are assigned an ID 0-49,
  corresponding to its line number. All other values are hashed and assigned an
  ID 50-54.

  ```python
  import tensorflow as tf
  states = tf.feature_column.categorical_column_with_vocabulary_file(
      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
      num_oov_buckets=5)
  columns = [states]
  features = {'states': tf.constant([['california', 'georgia', 'michigan',
              'texas', 'new york'], ['new york', 'georgia', 'california',
              'michigan', 'texas']])}
  linear_prediction = tf.compat.v1.feature_column.linear_model(features,
                                                               columns)
  ```

  Example with `default_value`:
  File `'/us/states.txt'` contains 51 lines - the first line is 'XX', and the
  other 50 each have a 2-character U.S. state abbreviation. Both a literal 'XX'
  in input, and other values missing from the file, will be assigned ID 0. All
  others are assigned the corresponding line number 1-50.

  ```python
  import tensorflow as tf
  states = tf.feature_column.categorical_column_with_vocabulary_file(
      key='states', vocabulary_file='/us/states.txt', vocabulary_size=51,
      default_value=0)
  columns = [states]
  features = {'states': tf.constant([['california', 'georgia', 'michigan',
              'texas', 'new york'], ['new york', 'georgia', 'california',
              'michigan', 'texas']])}
  linear_prediction = tf.compat.v1.feature_column.linear_model(features,
                                                               columns)
  ```

  And to make an embedding with either:

  ```python
  import tensorflow as tf
  states = tf.feature_column.categorical_column_with_vocabulary_file(
      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
      num_oov_buckets=5)
  columns = [tf.feature_column.embedding_column(states, 3)]
  features = {'states': tf.constant([['california', 'georgia', 'michigan',
              'texas', 'new york'], ['new york', 'georgia', 'california',
              'michigan', 'texas']])}
  input_layer = tf.keras.layers.DenseFeatures(columns)
  dense_tensor = input_layer(features)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the column
      name and the dictionary key for feature parsing configs, feature `Tensor`
      objects, and feature columns.
    vocabulary_file: The vocabulary file name.
    vocabulary_size: Number of the elements in the vocabulary. This must be no
      greater than the length of `vocabulary_file`; if less, later values are
      ignored. If None, it is set to the length of `vocabulary_file`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
      `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
      the input value. A positive `num_oov_buckets` cannot be specified with
      `default_value`.
    default_value: The integer ID value to return for out-of-vocabulary feature
      values, defaults to `-1`. This cannot be specified with a positive
      `num_oov_buckets`.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `CategoricalColumn` with a vocabulary file.

  Raises:
    ValueError: `vocabulary_file` is missing or cannot be opened.
    ValueError: `vocabulary_size` is missing or < 1.
    ValueError: `num_oov_buckets` is a negative integer.
    ValueError: `num_oov_buckets` and `default_value` are both specified.
    ValueError: `dtype` is neither string nor integer.
  """
  return categorical_column_with_vocabulary_file_v2(key, vocabulary_file,
                                                    vocabulary_size, dtype,
                                                    default_value,
                                                    num_oov_buckets)
1387@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
1388@tf_export(
1389 'feature_column.categorical_column_with_vocabulary_file',
1390 v1=[])
1391@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
1392def categorical_column_with_vocabulary_file_v2(key,
1393 vocabulary_file,
1394 vocabulary_size=None,
1395 dtype=dtypes.string,
1396 default_value=None,
1397 num_oov_buckets=0,
1398 file_format=None):
1399 """A `CategoricalColumn` with a vocabulary file.
1401 Use this when your inputs are in string or integer format, and you have a
1402 vocabulary file that maps each value to an integer ID. By default,
1403 out-of-vocabulary values are ignored. Use either (but not both) of
1404 `num_oov_buckets` and `default_value` to specify how to include
1405 out-of-vocabulary values.
1407 For input dictionary `features`, `features[key]` is either `Tensor` or
1408 `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
1409 and `''` for string, which will be dropped by this feature column.
1411 Example with `num_oov_buckets`:
1412 File `'/us/states.txt'` contains 50 lines, each with a 2-character U.S. state
1413 abbreviation. All inputs with values in that file are assigned an ID 0-49,
1414 corresponding to their line numbers. All other values are hashed and assigned
1415 an ID 50-54.
1417 ```python
1418 states = categorical_column_with_vocabulary_file(
1419 key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
1420 num_oov_buckets=5)
1421 columns = [states, ...]
1422 features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1423 linear_prediction = linear_model(features, columns)
1424 ```
1426 Example with `default_value`:
1427 File `'/us/states.txt'` contains 51 lines - the first line is `'XX'`, and the
1428 other 50 each have a 2-character U.S. state abbreviation. Both a literal
1429 `'XX'` in input, and other values missing from the file, will be assigned
1430 ID 0. All others are assigned the corresponding line number 1-50.
1432 ```python
1433 states = categorical_column_with_vocabulary_file(
1434 key='states', vocabulary_file='/us/states.txt', vocabulary_size=51,
1435 default_value=0)
1436 columns = [states, ...]
1437 features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1438 linear_prediction, _, _ = linear_model(features, columns)
1439 ```
1441 And to make an embedding with either:
1443 ```python
1444 columns = [embedding_column(states, 3),...]
1445 features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1446 dense_tensor = input_layer(features, columns)
1447 ```
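A gzipped TFRecord vocabulary is also supported, in which case `file_format`
is inferred from the file name; a minimal sketch (the file path is
hypothetical):

```python
states = categorical_column_with_vocabulary_file(
    key='states', vocabulary_file='/us/states.tfrecord.gz',
    num_oov_buckets=5)  # file_format is inferred as 'tfrecord_gzip'.
```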
1449 Args:
1450 key: A unique string identifying the input feature. It is used as the column
1451 name and the dictionary key for feature parsing configs, feature `Tensor`
1452 objects, and feature columns.
1453 vocabulary_file: The vocabulary file name.
1454 vocabulary_size: Number of elements in the vocabulary. This must be no
1455 greater than the length of `vocabulary_file`; if it is less, later values
1456 are ignored. If `None`, it is set to the length of `vocabulary_file`.
1457 dtype: The type of features. Only string and integer types are supported.
1458 default_value: The integer ID value to return for out-of-vocabulary feature
1459 values, defaults to `-1`. This cannot be specified with a positive
1460 `num_oov_buckets`.
1461 num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
1462 buckets. All out-of-vocabulary inputs will be assigned IDs in the range
1463 `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
1464 the input value. A positive `num_oov_buckets` cannot be specified with
1465 `default_value`.
1466 file_format: The format of the vocabulary file. Defaults to 'text' unless
1467 `vocabulary_file` is a string ending in 'tfrecord.gz', in which case it
1468 defaults to 'tfrecord_gzip'. The accepted alternative value is 'tfrecord_gzip'.
1470 Returns:
1471 A `CategoricalColumn` with a vocabulary file.
1473 Raises:
1474 ValueError: `vocabulary_file` is missing or cannot be opened.
1475 ValueError: `vocabulary_size` is missing or < 1.
1476 ValueError: `num_oov_buckets` is a negative integer.
1477 ValueError: `num_oov_buckets` and `default_value` are both specified.
1478 ValueError: `dtype` is neither string nor integer.
1479 """
1480 if not vocabulary_file:
1481 raise ValueError('Missing vocabulary_file in {}.'.format(key))
1483 if file_format is None and vocabulary_file.endswith('tfrecord.gz'):
1484 file_format = 'tfrecord_gzip'
1486 if vocabulary_size is None:
1487 if not gfile.Exists(vocabulary_file):
1488 raise ValueError('vocabulary_file in {} does not exist.'.format(key))
1490 if file_format == 'tfrecord_gzip':
1491 ds = readers.TFRecordDataset(vocabulary_file, 'GZIP')
1492 vocabulary_size = ds.reduce(0, lambda x, _: x + 1)
1493 if context.executing_eagerly():
1494 vocabulary_size = vocabulary_size.numpy()
1495 else:
1496 with gfile.GFile(vocabulary_file, mode='rb') as f:
1497 vocabulary_size = sum(1 for _ in f)
1498 logging.info(
1499 'vocabulary_size = %d in %s is inferred from the number of elements '
1500 'in the vocabulary_file %s.', vocabulary_size, key, vocabulary_file)
1502 # `vocabulary_size` isn't required for lookup, but it is for `_num_buckets`.
1503 if not isinstance(vocabulary_size, ops.Tensor) and vocabulary_size < 1:
1504 raise ValueError('Invalid vocabulary_size in {}.'.format(key))
1505 if num_oov_buckets:
1506 if default_value is not None:
1507 raise ValueError(
1508 'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
1509 key))
1510 if num_oov_buckets < 0:
1511 raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
1512 num_oov_buckets, key))
1513 fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
1514 fc_utils.assert_key_is_string(key)
1515 return VocabularyFileCategoricalColumn(
1516 key=key,
1517 vocabulary_file=vocabulary_file,
1518 vocabulary_size=vocabulary_size,
1519 num_oov_buckets=0 if num_oov_buckets is None else num_oov_buckets,
1520 default_value=-1 if default_value is None else default_value,
1521 dtype=dtype,
1522 file_format=file_format)
1525@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
1526@tf_export('feature_column.categorical_column_with_vocabulary_list')
1527@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
1528def categorical_column_with_vocabulary_list(key,
1529 vocabulary_list,
1530 dtype=None,
1531 default_value=-1,
1532 num_oov_buckets=0):
1533 """A `CategoricalColumn` with in-memory vocabulary.
1535 Use this when your inputs are in string or integer format, and you have an
1536 in-memory vocabulary mapping each value to an integer ID. By default,
1537 out-of-vocabulary values are ignored. Use either (but not both) of
1538 `num_oov_buckets` and `default_value` to specify how to include
1539 out-of-vocabulary values.
1541 For input dictionary `features`, `features[key]` is either `Tensor` or
1542 `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
1543 and `''` for string, which will be dropped by this feature column.
1545 Example with `num_oov_buckets`:
1546 In the following example, each input in `vocabulary_list` is assigned an ID
1547 0-3 corresponding to its index (e.g., input 'B' produces output 2). All other
1548 inputs are hashed and assigned an ID 4-5.
1550 ```python
1551 colors = categorical_column_with_vocabulary_list(
1552 key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
1553 num_oov_buckets=2)
1554 columns = [colors, ...]
1555 features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1556 linear_prediction, _, _ = linear_model(features, columns)
1557 ```
1559 Example with `default_value`:
1560 In the following example, each input in `vocabulary_list` is assigned an ID
1561 0-4 corresponding to its index (e.g., input 'B' produces output 3). All other
1562 inputs are assigned `default_value` 0.
1565 ```python
1566 colors = categorical_column_with_vocabulary_list(
1567 key='colors', vocabulary_list=('X', 'R', 'G', 'B', 'Y'), default_value=0)
1568 columns = [colors, ...]
1569 features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1570 linear_prediction, _, _ = linear_model(features, columns)
1571 ```
1573 And to make an embedding with either:
1575 ```python
1576 columns = [embedding_column(colors, 3),...]
1577 features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1578 dense_tensor = input_layer(features, columns)
1579 ```
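Under eager execution, the multi-hot encoding can also be materialized
directly; a minimal sketch (the feature values are illustrative):

```python
import tensorflow as tf
colors = tf.feature_column.categorical_column_with_vocabulary_list(
    key='colors', vocabulary_list=('R', 'G', 'B', 'Y'), num_oov_buckets=2)
features = {'colors': tf.constant([['R', 'G'], ['B', 'W']])}
input_layer = tf.keras.layers.DenseFeatures(
    [tf.feature_column.indicator_column(colors)])
dense_tensor = input_layer(features)  # 'W' hashes into an OOV bucket (ID 4-5).
```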
1581 Args:
1582 key: A unique string identifying the input feature. It is used as the column
1583 name and the dictionary key for feature parsing configs, feature `Tensor`
1584 objects, and feature columns.
1585 vocabulary_list: An ordered iterable defining the vocabulary. Each feature
1586 is mapped to the index of its value (if present) in `vocabulary_list`.
1587 Must be castable to `dtype`.
1588 dtype: The type of features. Only string and integer types are supported. If
1589 `None`, it will be inferred from `vocabulary_list`.
1590 default_value: The integer ID value to return for out-of-vocabulary feature
1591 values, defaults to `-1`. This cannot be specified with a positive
1592 `num_oov_buckets`.
1593 num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
1594 buckets. All out-of-vocabulary inputs will be assigned IDs in the range
1595 `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on a
1596 hash of the input value. A positive `num_oov_buckets` cannot be specified
1597 with `default_value`.
1599 Returns:
1600 A `CategoricalColumn` with in-memory vocabulary.
1602 Raises:
1603 ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
1604 ValueError: `num_oov_buckets` is a negative integer.
1605 ValueError: `num_oov_buckets` and `default_value` are both specified.
1606 ValueError: if `dtype` is not integer or string.
1607 """
1608 if (vocabulary_list is None) or (len(vocabulary_list) < 1):
1609 raise ValueError(
1610 'vocabulary_list {} must be non-empty, column_name: {}'.format(
1611 vocabulary_list, key))
1612 if len(set(vocabulary_list)) != len(vocabulary_list):
1613 raise ValueError(
1614 'Duplicate keys in vocabulary_list {}, column_name: {}'.format(
1615 vocabulary_list, key))
1616 vocabulary_dtype = dtypes.as_dtype(np.array(vocabulary_list).dtype)
1617 if num_oov_buckets:
1618 if default_value != -1:
1619 raise ValueError(
1620 'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
1621 key))
1622 if num_oov_buckets < 0:
1623 raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
1624 num_oov_buckets, key))
1625 fc_utils.assert_string_or_int(
1626 vocabulary_dtype, prefix='column_name: {} vocabulary'.format(key))
1627 if dtype is None:
1628 dtype = vocabulary_dtype
1629 elif dtype.is_integer != vocabulary_dtype.is_integer:
1630 raise ValueError(
1631 'dtype {} and vocabulary dtype {} do not match, column_name: {}'.format(
1632 dtype, vocabulary_dtype, key))
1633 fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
1634 fc_utils.assert_key_is_string(key)
1636 return VocabularyListCategoricalColumn(
1637 key=key,
1638 vocabulary_list=tuple(vocabulary_list),
1639 dtype=dtype,
1640 default_value=default_value,
1641 num_oov_buckets=num_oov_buckets)
1644@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
1645@tf_export('feature_column.categorical_column_with_identity')
1646@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
1647def categorical_column_with_identity(key, num_buckets, default_value=None):
1648 """A `CategoricalColumn` that returns identity values.
1650 Use this when your inputs are integers in the range `[0, num_buckets)`, and
1651 you want to use the input value itself as the categorical ID. Values outside
1652 this range will result in `default_value` if specified, otherwise the
1653 lookup will fail.
1655 Typically, this is used for contiguous ranges of integer indexes, but
1656 it doesn't have to be. This might be inefficient, however, if many IDs
1657 are unused. Consider `categorical_column_with_hash_bucket` in that case.
1659 For input dictionary `features`, `features[key]` is either `Tensor` or
1660 `SparseTensor`. If `Tensor`, missing values can be represented by `-1`,
1661 which will be dropped by this feature column (inputs here are integers).
1663 In the following examples, each input in the range `[0, 1000000)` is assigned
1664 an ID equal to its own value. All other inputs are assigned `default_value` 0.
1665 Note that a literal 0 in inputs will result in the same ID as the default.
1667 Linear model:
1669 ```python
1670 import tensorflow as tf
1671 video_id = tf.feature_column.categorical_column_with_identity(
1672 key='video_id', num_buckets=1000000, default_value=0)
1673 columns = [video_id]
1674 features = {'video_id': tf.sparse.from_dense([[2, 85, 0, 0, 0],
1675 [33,78, 2, 73, 1]])}
1676 linear_prediction = tf.compat.v1.feature_column.linear_model(features,
1677 columns)
1678 ```
1680 Embedding for a DNN model:
1682 ```python
1683 import tensorflow as tf
1684 video_id = tf.feature_column.categorical_column_with_identity(
1685 key='video_id', num_buckets=1000000, default_value=0)
1686 columns = [tf.feature_column.embedding_column(video_id, 9)]
1687 features = {'video_id': tf.sparse.from_dense([[2, 85, 0, 0, 0],
1688 [33,78, 2, 73, 1]])}
1689 input_layer = tf.keras.layers.DenseFeatures(columns)
1690 dense_tensor = input_layer(features)
1691 ```
1693 Args:
1694 key: A unique string identifying the input feature. It is used as the column
1695 name and the dictionary key for feature parsing configs, feature `Tensor`
1696 objects, and feature columns.
1697 num_buckets: Range of inputs and outputs is `[0, num_buckets)`.
1698 default_value: If set, values outside of range `[0, num_buckets)` will be
1699 replaced with this value. If not set, values >= num_buckets will cause a
1700 failure while values < 0 will be dropped.
1702 Returns:
1703 A `CategoricalColumn` that returns identity values.
1705 Raises:
1706 ValueError: if `num_buckets` is less than one.
1707 ValueError: if `default_value` is not in range `[0, num_buckets)`.
1708 """
1709 if num_buckets < 1:
1710 raise ValueError('num_buckets {} < 1, column_name {}'.format(
1711 num_buckets, key))
1712 if (default_value is not None) and ((default_value < 0) or
1713 (default_value >= num_buckets)):
1714 raise ValueError(
1715 'default_value {} not in range [0, {}), column_name {}'.format(
1716 default_value, num_buckets, key))
1717 fc_utils.assert_key_is_string(key)
1718 return IdentityCategoricalColumn(
1719 key=key, number_buckets=num_buckets, default_value=default_value)
1722@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
1723@tf_export('feature_column.indicator_column')
1724@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
1725def indicator_column(categorical_column):
1726 """Represents multi-hot representation of given categorical column.
1728 - For DNN model, `indicator_column` can be used to wrap any
1729 `categorical_column_*` (e.g., to feed to DNN). Consider to Use
1730 `embedding_column` if the number of buckets/unique(values) are large.
1732 - For Wide (aka linear) model, `indicator_column` is the internal
1733 representation for categorical column when passing categorical column
1734 directly (as any element in feature_columns) to `linear_model`. See
1735 `linear_model` for details.
1737 ```python
1738 name = indicator_column(categorical_column_with_vocabulary_list(
1739 'name', ['bob', 'george', 'wanda']))
1740 columns = [name, ...]
1741 features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1742 dense_tensor = input_layer(features, columns)
1744 dense_tensor == [[1, 0, 0]] # If "name" bytes_list is ["bob"]
1745 dense_tensor == [[1, 0, 1]] # If "name" bytes_list is ["bob", "wanda"]
1746 dense_tensor == [[2, 0, 0]] # If "name" bytes_list is ["bob", "bob"]
1747 ```
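Under eager execution, the output can be inspected directly; a minimal
sketch (the feature values are illustrative):

```python
import tensorflow as tf
name = tf.feature_column.categorical_column_with_vocabulary_list(
    'name', ['bob', 'george', 'wanda'])
columns = [tf.feature_column.indicator_column(name)]
features = {'name': tf.constant([['bob'], ['wanda']])}
dense_tensor = tf.keras.layers.DenseFeatures(columns)(features)
# dense_tensor == [[1., 0., 0.], [0., 0., 1.]]
```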
1749 Args:
1750 categorical_column: A `CategoricalColumn` which is created by
1751 `categorical_column_with_*` or `crossed_column` functions.
1753 Returns:
1754 An `IndicatorColumn`.
1756 Raises:
1757 ValueError: If `categorical_column` is not CategoricalColumn type.
1758 """
1759 if not isinstance(categorical_column,
1760 (CategoricalColumn, fc_old._CategoricalColumn)): # pylint: disable=protected-access
1761 raise ValueError(
1762 'Unsupported input type. Input must be a CategoricalColumn. '
1763 'Given: {}'.format(categorical_column))
1764 return IndicatorColumn(categorical_column)
1767@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
1768@tf_export('feature_column.weighted_categorical_column')
1769@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
1770def weighted_categorical_column(categorical_column,
1771 weight_feature_key,
1772 dtype=dtypes.float32):
1773 """Applies weight values to a `CategoricalColumn`.
1775 Use this when each of your sparse inputs has both an ID and a value. For
1776 example, if you're representing text documents as a collection of word
1777 frequencies, you can provide 2 parallel sparse input features ('terms' and
1778 'frequencies' below).
1780 Example:
1782 Input `tf.Example` objects:
1784 ```proto
1785 [
1786 features {
1787 feature {
1788 key: "terms"
1789 value {bytes_list {value: "very" value: "model"}}
1790 }
1791 feature {
1792 key: "frequencies"
1793 value {float_list {value: 0.3 value: 0.1}}
1794 }
1795 },
1796 features {
1797 feature {
1798 key: "terms"
1799 value {bytes_list {value: "when" value: "course" value: "human"}}
1800 }
1801 feature {
1802 key: "frequencies"
1803 value {float_list {value: 0.4 value: 0.1 value: 0.2}}
1804 }
1805 }
1806 ]
1807 ```
1809 ```python
1810 categorical_column = categorical_column_with_hash_bucket(
1811 column_name='terms', hash_bucket_size=1000)
1812 weighted_column = weighted_categorical_column(
1813 categorical_column=categorical_column, weight_feature_key='frequencies')
1814 columns = [weighted_column, ...]
1815 features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1816 linear_prediction, _, _ = linear_model(features, columns)
1817 ```
1819 This assumes the input dictionary contains a `SparseTensor` for key
1820 'terms', and a `SparseTensor` for key 'frequencies'. These 2 tensors must have
1821 the same indices and dense shape.
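The two parallel `SparseTensor`s can also be built by hand; a minimal sketch
(the values are illustrative):

```python
import tensorflow as tf
terms = tf.feature_column.categorical_column_with_hash_bucket(
    'terms', hash_bucket_size=1000)
weighted = tf.feature_column.weighted_categorical_column(
    categorical_column=terms, weight_feature_key='frequencies')
features = {
    'terms': tf.sparse.from_dense([['very', 'model'], ['when', 'course']]),
    'frequencies': tf.sparse.from_dense([[0.3, 0.1], [0.4, 0.1]]),
}
linear_prediction = tf.compat.v1.feature_column.linear_model(
    features, [weighted])
```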
1823 Args:
1824 categorical_column: A `CategoricalColumn` created by
1825 `categorical_column_with_*` functions.
1826 weight_feature_key: String key for weight values.
1827 dtype: Type of weights, such as `tf.float32`. Only float and integer weights
1828 are supported.
1830 Returns:
1831 A `CategoricalColumn` composed of two sparse features: one represents the
1832 IDs, the other represents the weight (value) of each ID in that example.
1834 Raises:
1835 ValueError: if `dtype` is not convertible to float.
1836 """
1837 if (dtype is None) or not (dtype.is_integer or dtype.is_floating):
1838 raise ValueError('dtype {} is not convertible to float.'.format(dtype))
1839 return WeightedCategoricalColumn(
1840 categorical_column=categorical_column,
1841 weight_feature_key=weight_feature_key,
1842 dtype=dtype)
1845@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
1846@tf_export('feature_column.crossed_column')
1847@deprecation.deprecated(
1848 None,
1849 'Use `tf.keras.layers.experimental.preprocessing.HashedCrossing` '
1850 'instead for feature crossing when preprocessing data to train a '
1851 'Keras model.')
1852def crossed_column(keys, hash_bucket_size, hash_key=None):
1853 """Returns a column for performing crosses of categorical features.
1855 Crossed features will be hashed according to `hash_bucket_size`. Conceptually,
1856 the transformation can be thought of as:
1857 Hash(cartesian product of features) % `hash_bucket_size`
1859 For example, if the input features are:
1861 * SparseTensor referred by first key:
1863 ```python
1864 shape = [2, 2]
1865 {
1866 [0, 0]: "a"
1867 [1, 0]: "b"
1868 [1, 1]: "c"
1869 }
1870 ```
1872 * SparseTensor referred by second key:
1874 ```python
1875 shape = [2, 1]
1876 {
1877 [0, 0]: "d"
1878 [1, 0]: "e"
1879 }
1880 ```
1882 then crossed feature will look like:
1884 ```python
1885 shape = [2, 2]
1886 {
1887 [0, 0]: Hash64("d", Hash64("a")) % hash_bucket_size
1888 [1, 0]: Hash64("e", Hash64("b")) % hash_bucket_size
1889 [1, 1]: Hash64("e", Hash64("c")) % hash_bucket_size
1890 }
1891 ```
1893 Here is an example to create a linear model with crosses of string features:
1895 ```python
1896 keywords_x_doc_terms = crossed_column(['keywords', 'doc_terms'], 50000)
1897 columns = [keywords_x_doc_terms, ...]
1898 features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1899 linear_prediction = linear_model(features, columns)
1900 ```
1902 You could also use vocabulary lookup before crossing:
1904 ```python
1905 keywords = categorical_column_with_vocabulary_file(
1906 'keywords', '/path/to/vocabulary/file', vocabulary_size=1000)
1907 keywords_x_doc_terms = crossed_column([keywords, 'doc_terms'], 50000)
1908 columns = [keywords_x_doc_terms, ...]
1909 features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1910 linear_prediction = linear_model(features, columns)
1911 ```
1913 If an input feature is of numeric type, you can use
1914 `categorical_column_with_identity`, or `bucketized_column`, as in the example:
1916 ```python
1917 # vertical_id is an integer categorical feature.
1918 vertical_id = categorical_column_with_identity('vertical_id', 10000)
1919 price = numeric_column('price')
1920 # bucketized_column converts numerical feature to a categorical one.
1921 bucketized_price = bucketized_column(price, boundaries=[...])
1922 vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50000)
1923 columns = [vertical_id_x_price, ...]
1924 features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
1925 linear_prediction = linear_model(features, columns)
1926 ```
1928 To use crossed column in DNN model, you need to add it in an embedding column
1929 as in this example:
1931 ```python
1932 vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50000)
1933 vertical_id_x_price_embedded = embedding_column(vertical_id_x_price, 10)
1934 dense_tensor = input_layer(features, [vertical_id_x_price_embedded, ...])
1935 ```
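As the deprecation notice suggests, new code should prefer the Keras
preprocessing replacement; a minimal sketch (the tensor values are
illustrative):

```python
import tensorflow as tf
crossing_layer = tf.keras.layers.experimental.preprocessing.HashedCrossing(
    num_bins=1000)
crossed_ids = crossing_layer(
    (tf.constant(['a', 'b', 'c']), tf.constant(['d', 'e', 'e'])))
```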
1937 Args:
1938 keys: An iterable identifying the features to be crossed. Each element can
1939 be either:
1940 * string: Will use the corresponding feature which must be of string type.
1941 * `CategoricalColumn`: Will use the transformed tensor produced by this
1942 column. Does not support hashed categorical column.
1943 hash_bucket_size: An int >= 1. The number of buckets.
1944 hash_key: Optional. Specifies the hash_key used by the `FingerprintCat64`
1945 function to combine the fingerprints of the crossed values in SparseCrossOp.
1947 Returns:
1948 A `CrossedColumn`.
1950 Raises:
1951 ValueError: If `len(keys) < 2`.
1952 ValueError: If any of the keys is neither a string nor `CategoricalColumn`.
1953 ValueError: If any of the keys is `HashedCategoricalColumn`.
1954 ValueError: If `hash_bucket_size < 1`.
1955 """
1956 if not hash_bucket_size or hash_bucket_size < 1:
1957 raise ValueError('hash_bucket_size must be at least 1. '
1958 'hash_bucket_size: {}'.format(hash_bucket_size))
1959 if not keys or len(keys) < 2:
1960 raise ValueError(
1961 'keys must be a list with length > 1. Given: {}'.format(keys))
1962 for key in keys:
1963 if (not isinstance(key, six.string_types) and
1964 not isinstance(key, (CategoricalColumn, fc_old._CategoricalColumn))): # pylint: disable=protected-access
1965 raise ValueError(
1966 'Unsupported key type. All keys must be either string, or '
1967 'categorical column except HashedCategoricalColumn. '
1968 'Given: {}'.format(key))
1969 if isinstance(key,
1970 (HashedCategoricalColumn, fc_old._HashedCategoricalColumn)): # pylint: disable=protected-access
1971 raise ValueError(
1972 'categorical_column_with_hash_bucket is not supported for crossing. '
1973 'Hashing before crossing will increase probability of collision. '
1974 'Instead, use the feature name as a string. Given: {}'.format(key))
1975 return CrossedColumn(
1976 keys=tuple(keys), hash_bucket_size=hash_bucket_size, hash_key=hash_key)
1981# TODO(b/181853833): Add a tf.type for instance type checking.
1982@tf_export('__internal__.feature_column.FeatureColumn', v1=[])
1983@six.add_metaclass(abc.ABCMeta)
1984class FeatureColumn(object):
1985 """Represents a feature column abstraction.
1987 WARNING: Do not subclass this class unless you know what you are doing:
1988 the API is subject to future changes.
1990 To distinguish between the concept of a feature family and a specific binary
1991 feature within a family, we refer to a feature family like "country" as a
1992 feature column. For example, we can have a feature in a `tf.Example` format:
1993 {key: "country", value: [ "US" ]}
1994 In this example the value of the feature is "US" and "country" refers to the
1995 column of the feature.
1997 This class is an abstract class. Users should not create instances of this.
1998 """
2000 @abc.abstractproperty
2001 def name(self):
2002 """Returns string. Used for naming."""
2003 pass
2005 def __lt__(self, other):
2006 """Allows feature columns to be sorted in Python 3 as they are in Python 2.
2008 Feature columns need to occasionally be sortable, for example when used as
2009 keys in a features dictionary passed to a layer.
2011 In CPython, `__lt__` must be defined for all objects in the
2012 sequence being sorted.
2014 If any objects in the sequence being sorted do not have an `__lt__` method
2015 compatible with feature column objects (such as strings), then CPython will
2016 fall back to using the `__gt__` method below.
2017 https://docs.python.org/3/library/stdtypes.html#list.sort
2019 Args:
2020 other: The other object to compare to.
2022 Returns:
2023 True if the string representation of this object is lexicographically less
2024 than the string representation of `other`. For FeatureColumn objects,
2025 this looks like "<__main__.FeatureColumn object at 0xa>".
2026 """
2027 return str(self) < str(other)
2029 def __gt__(self, other):
2030 """Allows feature columns to be sorted in Python 3 as they are in Python 2.
2032 Feature columns need to occasionally be sortable, for example when used as
2033 keys in a features dictionary passed to a layer.
2035 `__gt__` is called when the "other" object being compared during the sort
2036 does not have `__lt__` defined.
2037 Example:
2038 ```
2039 # __lt__ only class
2040 class A():
2041 def __lt__(self, other): return str(self) < str(other)
2043 a = A()
2044 a < "b" # True
2045 "0" < a # Error
2047 # __lt__ and __gt__ class
2048 class B():
2049 def __lt__(self, other): return str(self) < str(other)
2050 def __gt__(self, other): return str(self) > str(other)
2052 b = B()
2053 b < "c" # True
2054 "0" < b # True
2055 ```
2057 Args:
2058 other: The other object to compare to.
2060 Returns:
2061 True if the string representation of this object is lexicographically
2062 greater than the string representation of `other`. For FeatureColumn
2063 objects, this looks like "<__main__.FeatureColumn object at 0xa>".
2064 """
2065 return str(self) > str(other)
2067 @abc.abstractmethod
2068 def transform_feature(self, transformation_cache, state_manager):
2069 """Returns intermediate representation (usually a `Tensor`).
2071 Uses `transformation_cache` to create an intermediate representation
2072 (usually a `Tensor`) that other feature columns can use.
2074 Example usage of `transformation_cache`:
2075 Let's say a Feature column depends on raw feature ('raw') and another
2076 `FeatureColumn` (input_fc). To access corresponding `Tensor`s,
2077 transformation_cache will be used as follows:
2079 ```python
2080 raw_tensor = transformation_cache.get('raw', state_manager)
2081 fc_tensor = transformation_cache.get(input_fc, state_manager)
2082 ```
2084 Args:
2085 transformation_cache: A `FeatureTransformationCache` object to access
2086 features.
2087 state_manager: A `StateManager` to create / access resources such as
2088 lookup tables.
2090 Returns:
2091 Transformed feature `Tensor`.
2092 """
2093 pass
2095 @abc.abstractproperty
2096 def parse_example_spec(self):
2097 """Returns a `tf.Example` parsing spec as dict.
2099 It is used to generate the parsing spec for `tf.io.parse_example`. The
2100 returned spec is a dict from keys ('string') to `VarLenFeature`,
2101 `FixedLenFeature`, and other supported objects. Please check the
2102 documentation of `tf.io.parse_example` for all supported spec objects.
2104 Let's say a Feature column depends on raw feature ('raw') and another
2105 `FeatureColumn` (input_fc). One possible implementation of
2106 parse_example_spec is as follows:
2108 ```python
2109 spec = {'raw': tf.io.FixedLenFeature(...)}
2110 spec.update(input_fc.parse_example_spec)
2111 return spec
2112 ```
2113 """
2114 pass
2116 def create_state(self, state_manager):
2117 """Uses the `state_manager` to create state for the FeatureColumn.
2119 Args:
2120 state_manager: A `StateManager` to create / access resources such as
2121 lookup tables and variables.
2122 """
2123 pass
2125 @abc.abstractproperty
2126 def _is_v2_column(self):
2127 """Returns whether this FeatureColumn is fully conformant to the new API.
2129 This is needed for composition type cases where an EmbeddingColumn etc.
2130 might take in old categorical columns as input and then we want to use the
2131 old API.
2132 """
2133 pass
2135 @abc.abstractproperty
2136 def parents(self):
2137 """Returns a list of immediate raw feature and FeatureColumn dependencies.
2139 For example:
2140 ```python
2141 a = numeric_column('f1')
2142 b = bucketized_column(a, boundaries=[0.])
2143 c = crossed_column([b, 'f2'], hash_bucket_size=100)
2144 # a.parents == ['f1'], b.parents == [a], c.parents == [b, 'f2']
2145 ```
2146 """
2147 pass
2149 def get_config(self):
2150 """Returns the config of the feature column.
2152 A FeatureColumn config is a Python dictionary (serializable) containing the
2153 configuration of a FeatureColumn. The same FeatureColumn can be
2154 reinstantiated later from this configuration.
2156 The config of a feature column does not include information about feature
2157 columns depending on it nor the FeatureColumn class name.
2159 Example with (de)serialization practices followed in this file:
2160 ```python
2161 class SerializationExampleFeatureColumn(
2162 FeatureColumn, collections.namedtuple(
2163 'SerializationExampleFeatureColumn',
2164 ('dimension', 'parent', 'dtype', 'normalizer_fn'))):
2166 def get_config(self):
2167 # Create a dict from the namedtuple.
2168 # Python attribute literals can be directly copied from / to the config.
2169 # For example 'dimension', assuming it is an integer literal.
2170 config = dict(zip(self._fields, self))
2172 # (De)serialization of parent FeatureColumns should use the provided
2173 # (de)serialize_feature_column() methods that take care of de-duping.
2174 config['parent'] = serialize_feature_column(self.parent)
2176 # Many objects provide custom (de)serialization e.g: for tf.DType
2177 # tf.DType.name, tf.as_dtype() can be used.
2178 config['dtype'] = self.dtype.name
2180 # Non-trivial dependencies should be Keras-(de)serializable.
2181 config['normalizer_fn'] = generic_utils.serialize_keras_object(
2182 self.normalizer_fn)
2184 return config
2186 @classmethod
2187 def from_config(cls, config, custom_objects=None, columns_by_name=None):
2188 # This should do the inverse transform from `get_config` and construct
2189 # the namedtuple.
2190 kwargs = config.copy()
2191 kwargs['parent'] = deserialize_feature_column(
2192 config['parent'], custom_objects, columns_by_name)
2193 kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
2194 kwargs['normalizer_fn'] = generic_utils.deserialize_keras_object(
2195 config['normalizer_fn'], custom_objects=custom_objects)
2196 return cls(**kwargs)
2198 ```
2199 Returns:
2200 A serializable Dict that can be used to deserialize the object with
2201 from_config.
2202 """
2203 return self._get_config()
2205 def _get_config(self):
2206 raise NotImplementedError('Must be implemented in subclasses.')
2208 @classmethod
2209 def from_config(cls, config, custom_objects=None, columns_by_name=None):
2210 """Creates a FeatureColumn from its config.
2212 This method should be the reverse of `get_config`, capable of instantiating
2213 the same FeatureColumn from the config dictionary. See `get_config` for an
2214 example of common (de)serialization practices followed in this file.
2216 TODO(b/118939620): This is a private method until consensus is reached on
2217 supporting object deserialization deduping within Keras.
2219 Args:
2220 config: A Dict config acquired with `get_config`.
2221 custom_objects: Optional dictionary mapping names (strings) to custom
2222 classes or functions to be considered during deserialization.
2223 columns_by_name: A Dict[String, FeatureColumn] of existing columns in
2224 order to avoid duplication. Should be passed to any calls to
2225 deserialize_feature_column().
2227 Returns:
2228 A FeatureColumn for the input config.
2229 """
2230 return cls._from_config(config, custom_objects, columns_by_name)
2232 @classmethod
2233 def _from_config(cls, config, custom_objects=None, columns_by_name=None):
2234 raise NotImplementedError('Must be implemented in subclasses.')
2237# TODO(b/181853833): Add a tf.type for instance type checking.
2238@tf_export('__internal__.feature_column.DenseColumn', v1=[])
2239class DenseColumn(FeatureColumn):
2240 """Represents a column which can be represented as `Tensor`.
2242 Some examples of this type are: numeric_column, embedding_column,
2243 indicator_column.
2244 """
2246 @abc.abstractproperty
2247 def variable_shape(self):
2248 """`TensorShape` of `get_dense_tensor`, without batch dimension."""
2249 pass
2251 @abc.abstractmethod
2252 def get_dense_tensor(self, transformation_cache, state_manager):
2253 """Returns a `Tensor`.
2255 The output of this function will be used by model-builder-functions. For
2256 example the pseudo code of `input_layer` will be like:
2258 ```python
2259 def input_layer(features, feature_columns, ...):
2260 outputs = [fc.get_dense_tensor(...) for fc in feature_columns]
2261 return tf.concat(outputs)
2262 ```
2264 Args:
2265 transformation_cache: A `FeatureTransformationCache` object to access
2266 features.
2267 state_manager: A `StateManager` to create / access resources such as
2268 lookup tables.
2270 Returns:
2271 `Tensor` of shape [batch_size] + `variable_shape`.
2272 """
2273 pass
2276def is_feature_column_v2(feature_columns):
2277 """Returns True if all feature columns are V2."""
2278 for feature_column in feature_columns:
2279 if not isinstance(feature_column, FeatureColumn):
2280 return False
2281 if not feature_column._is_v2_column: # pylint: disable=protected-access
2282 return False
2283 return True
2286def _create_weighted_sum(column, transformation_cache, state_manager,
2287 sparse_combiner, weight_var):
2288 """Creates a weighted sum for a dense/categorical column for linear_model."""
2289 if isinstance(column, CategoricalColumn):
2290 return _create_categorical_column_weighted_sum(
2291 column=column,
2292 transformation_cache=transformation_cache,
2293 state_manager=state_manager,
2294 sparse_combiner=sparse_combiner,
2295 weight_var=weight_var)
2296 else:
2297 return _create_dense_column_weighted_sum(
2298 column=column,
2299 transformation_cache=transformation_cache,
2300 state_manager=state_manager,
2301 weight_var=weight_var)
2304def _create_dense_column_weighted_sum(column, transformation_cache,
2305 state_manager, weight_var):
2306 """Create a weighted sum of a dense column for linear_model."""
2307 tensor = column.get_dense_tensor(transformation_cache, state_manager)
2308 num_elements = column.variable_shape.num_elements()
2309 batch_size = array_ops.shape(tensor)[0]
2310 tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
2311 return math_ops.matmul(tensor, weight_var, name='weighted_sum')
2314class CategoricalColumn(FeatureColumn):
2315 """Represents a categorical feature.
2317 A categorical feature is typically handled with a `tf.sparse.SparseTensor` of
2318 IDs.
2319 """
2321 IdWeightPair = collections.namedtuple( # pylint: disable=invalid-name
2322 'IdWeightPair', ('id_tensor', 'weight_tensor'))
2324 @abc.abstractproperty
2325 def num_buckets(self):
2326 """Returns number of buckets in this sparse feature."""
2327 pass
2329 @abc.abstractmethod
2330 def get_sparse_tensors(self, transformation_cache, state_manager):
2331 """Returns an IdWeightPair.
2333 `IdWeightPair` is a pair of `SparseTensor`s which represents ids and
2334 weights.
2336 `IdWeightPair.id_tensor` is typically a `batch_size` x `num_buckets`
2337 `SparseTensor` of `int64`. `IdWeightPair.weight_tensor` is either a
2338 `SparseTensor` of `float` or `None` to indicate all weights should be
2339 taken to be 1. If specified, `weight_tensor` must have exactly the same
2340 shape and indices as `id_tensor`. The expected `SparseTensor` is the same as
2341 the parsing output of a `VarLenFeature`, which is a ragged matrix.
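A typical consumer unpacks the pair as follows; a minimal sketch (the names
are illustrative):

```python
sparse_tensors = column.get_sparse_tensors(transformation_cache, state_manager)
ids = sparse_tensors.id_tensor          # SparseTensor of int64 IDs.
weights = sparse_tensors.weight_tensor  # SparseTensor of weights, or None.
```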
2343 Args:
2344 transformation_cache: A `FeatureTransformationCache` object to access
2345 features.
2346 state_manager: A `StateManager` to create / access resources such as
2347 lookup tables.
2348 """
2349 pass
2352def _create_categorical_column_weighted_sum(column, transformation_cache,
2353 state_manager, sparse_combiner,
2354 weight_var):
2355 # pylint: disable=g-doc-return-or-yield,g-doc-args
2356 """Create a weighted sum of a categorical column for linear_model.
2358 Note to maintainers: As an implementation detail, the weighted sum is
2359 implemented via embedding_lookup_sparse for efficiency. Mathematically,
2360 they are the same.
2362 To be specific, a categorical column can conceptually be treated as a
2363 multi-hot vector. Say:
2365 ```python
2366 x = [0 0 1] # categorical column input
2367 w = [a b c] # weights
2368 ```
2369 The weighted sum is `c` in this case, which is the same as `w[2]`.
2371 Another example is
2373 ```python
2374 x = [0 1 1] # categorical column input
2375 w = [a b c] # weights
2376 ```
2377 The weighted sum is `b + c` in this case, which is the same as `w[1] + w[2]`.
2379 For both cases, we can implement the weighted sum via embedding_lookup_sparse
2380 with sparse_combiner = "sum".
2381 """
2383 sparse_tensors = column.get_sparse_tensors(transformation_cache,
2384 state_manager)
2385 id_tensor = sparse_ops.sparse_reshape(
2386 sparse_tensors.id_tensor,
2387 [array_ops.shape(sparse_tensors.id_tensor)[0], -1])
2388 weight_tensor = sparse_tensors.weight_tensor
2389 if weight_tensor is not None:
2390 weight_tensor = sparse_ops.sparse_reshape(
2391 weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
2393 return embedding_ops.safe_embedding_lookup_sparse(
2394 weight_var,
2395 id_tensor,
2396 sparse_weights=weight_tensor,
2397 combiner=sparse_combiner,
2398 name='weighted_sum')
2401# TODO(b/181853833): Add a tf.type for instance type checking.
2402@tf_export('__internal__.feature_column.SequenceDenseColumn', v1=[])
2403class SequenceDenseColumn(FeatureColumn):
2404 """Represents dense sequence data."""
2406 TensorSequenceLengthPair = collections.namedtuple( # pylint: disable=invalid-name
2407 'TensorSequenceLengthPair', ('dense_tensor', 'sequence_length'))
2409 @abc.abstractmethod
2410 def get_sequence_dense_tensor(self, transformation_cache, state_manager):
2411 """Returns a `TensorSequenceLengthPair`.
2413 Args:
2414 transformation_cache: A `FeatureTransformationCache` object to access
2415 features.
2416 state_manager: A `StateManager` to create / access resources such as
2417 lookup tables.
2418 """
2419 pass
2422@tf_export('__internal__.feature_column.FeatureTransformationCache', v1=[])
2423class FeatureTransformationCache(object):
2424 """Handles caching of transformations while building the model.
2426 `FeatureColumn` specifies how to digest an input column to the network. Some
2427 feature columns require data transformations. This class caches those
2428 transformations.
2430 Some features may be used in more than one place. For example, one can use a
2431 bucketized feature both by itself and in a cross with it. In that case we
2432 should create only one bucketization op instead of creating ops for each
2433 feature column separately. To handle re-use of transformed columns,
2434 `FeatureTransformationCache` caches all previously transformed columns.
2436 Example:
2437 We're trying to use the following `FeatureColumn`s:
2439 ```python
2440 bucketized_age = fc.bucketized_column(fc.numeric_column("age"), ...)
2441 keywords = fc.categorical_column_with_hash_bucket("keywords", ...)
2442 age_X_keywords = fc.crossed_column([bucketized_age, "keywords"], ...)
2443 ... = linear_model(features,
2444 [bucketized_age, keywords, age_X_keywords])
2445 ```
2447 If we transform each column independently, then we'll get duplication of
2448 bucketization (one for cross, one for bucketization itself).
2449 The `FeatureTransformationCache` eliminates this duplication.
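A minimal sketch of direct cache usage, reusing `bucketized_age` from the
example above (the state manager can be `None` here because raw features and
bucketization need no state):

```python
cache = FeatureTransformationCache({'age': tf.constant([[25.], [61.]])})
age_tensor = cache.get('age', state_manager=None)        # raw feature
buckets = cache.get(bucketized_age, state_manager=None)  # transformed once
buckets_again = cache.get(bucketized_age, state_manager=None)  # cache hit
```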
2450 """
2452 def __init__(self, features):
2453 """Creates a `FeatureTransformationCache`.
2455 Args:
2456 features: A mapping from feature keys to objects that are `Tensor` or
2457 `SparseTensor`, or can be converted to the same via
2458 `sparse_tensor.convert_to_tensor_or_sparse_tensor`. A `string` key
2459 signifies a base feature (not-transformed). A `FeatureColumn` key means
2460 that this `Tensor` is the output of an existing `FeatureColumn` which
2461 can be reused.
2462 """
2463 self._features = features.copy()
2464 self._feature_tensors = {}
2466 def get(self, key, state_manager, training=None):
2467 """Returns a `Tensor` for the given key.
2469 A `str` key is used to access a base feature (not-transformed). When a
2470 `FeatureColumn` is passed, the transformed feature is returned if it
2471 already exists, otherwise the given `FeatureColumn` is asked to provide its
2472 transformed output, which is then cached.
2474 Args:
2475 key: a `str` or a `FeatureColumn`.
2476 state_manager: A StateManager object that holds the FeatureColumn state.
2477 training: Boolean indicating whether the column is being used in
2478 training mode. This argument is passed to the transform_feature method
2479 of any `FeatureColumn` that takes a `training` argument. For example, if
2480 a `FeatureColumn` performed dropout, it could expose a `training`
2481 argument to control whether the dropout should be applied.
2483 Returns:
2484 The transformed `Tensor` corresponding to the `key`.
2486 Raises:
2487 ValueError: if key is not found or a transformed `Tensor` cannot be
2488 computed.
2489 """
2490 if key in self._feature_tensors:
2491 # FeatureColumn is already transformed or converted.
2492 return self._feature_tensors[key]
2494 if key in self._features:
2495 feature_tensor = self._get_raw_feature_as_tensor(key)
2496 self._feature_tensors[key] = feature_tensor
2497 return feature_tensor
2499 if isinstance(key, six.string_types):
2500 raise ValueError('Feature {} is not in features dictionary.'.format(key))
2502 if not isinstance(key, FeatureColumn):
2503 raise TypeError('"key" must be either a "str" or "FeatureColumn". '
2504 'Provided: {}'.format(key))
2506 column = key
2507 logging.debug('Transforming feature_column %s.', column)
2509 # Some columns may need information about whether the transformation is
2510 # happening in training or prediction mode, but not all columns expose this
2511 # argument.
2512 try:
2513 transformed = column.transform_feature(
2514 self, state_manager, training=training)
2515 except TypeError:
2516 transformed = column.transform_feature(self, state_manager)
2517 if transformed is None:
2518 raise ValueError('Column {} is not supported.'.format(column.name))
2519 self._feature_tensors[column] = transformed
2520 return transformed
2522 def _get_raw_feature_as_tensor(self, key):
2523 """Gets the raw_feature (keyed by `key`) as `tensor`.
2525 The raw feature is converted to (sparse) tensor and maybe expand dim.
2527 For both `Tensor` and `SparseTensor`, the rank will be expanded (to 2) if
2528 the rank is 1. This supports dynamic rank also. For rank 0 raw feature, will
2529 error out as it is not supported.
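For example, a rank-1 dense `Tensor` `[1, 2]` is expanded to `[[1], [2]]`, and
a rank-1 `SparseTensor` with dense shape `[3]` is reshaped to dense shape
`[3, 1]`.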
2531 Args:
2532 key: A `str` key to access the raw feature.
2534 Returns:
2535 A `Tensor` or `SparseTensor`.
2537 Raises:
2538 ValueError: if the raw feature has rank 0.
2539 """
2540 raw_feature = self._features[key]
2541 feature_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
2542 raw_feature)
2544 def expand_dims(input_tensor):
2545 # Input_tensor must have rank 1.
2546 if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
2547 return sparse_ops.sparse_reshape(input_tensor,
2548 [array_ops.shape(input_tensor)[0], 1])
2549 else:
2550 return array_ops.expand_dims(input_tensor, -1)
2552 rank = feature_tensor.get_shape().ndims
2553 if rank is not None:
2554 if rank == 0:
2555 raise ValueError(
2556 'Feature (key: {}) cannot have rank 0. Given: {}'.format(
2557 key, feature_tensor))
2558 return feature_tensor if rank != 1 else expand_dims(feature_tensor)
2560 # Handle dynamic rank.
2561 with ops.control_dependencies([
2562 check_ops.assert_positive(
2563 array_ops.rank(feature_tensor),
2564 message='Feature (key: {}) cannot have rank 0. Given: {}'.format(
2565 key, feature_tensor))
2566 ]):
2567 return cond.cond(
2568 math_ops.equal(1, array_ops.rank(feature_tensor)),
2569 lambda: expand_dims(feature_tensor), lambda: feature_tensor)
2572# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
2573def _to_sparse_input_and_drop_ignore_values(input_tensor, ignore_value=None):
2574 """Converts a `Tensor` to a `SparseTensor`, dropping ignore_value cells.
2576 If `input_tensor` is already a `SparseTensor`, just return it.
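For example, with the default `ignore_value` for strings, a minimal sketch:

```python
dense = tf.constant([['a', ''], ['b', 'c']])
sp = _to_sparse_input_and_drop_ignore_values(dense)
# sp.indices == [[0, 0], [1, 0], [1, 1]]; sp.values == [b'a', b'b', b'c']
```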
2578 Args:
2579 input_tensor: A string or integer `Tensor`.
2580 ignore_value: Entries in `input_tensor` equal to this value will be absent
2581 from the resulting `SparseTensor`. If `None`, the default value of
2582 `input_tensor`'s dtype will be used ('' for `str`, -1 for `int`).
2584 Returns:
2585 A `SparseTensor` with the same shape as `input_tensor`.
2587 Raises:
2588 ValueError: when `input_tensor`'s rank is `None`.
2589 """
2590 input_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
2591 input_tensor)
2592 if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
2593 return input_tensor
2594 with ops.name_scope(None, 'to_sparse_input', (
2595 input_tensor,
2596 ignore_value,
2597 )):
2598 if ignore_value is None:
2599 if input_tensor.dtype == dtypes.string:
2600 # Exception: TF strings are converted to numpy objects by default.
2601 ignore_value = ''
2602 elif input_tensor.dtype.is_integer:
2603 ignore_value = -1 # -1 has a special meaning of missing feature
2604 else:
2605 # NOTE: `as_numpy_dtype` is a property, so with the parentheses this is
2606 # constructing a new numpy object of the given type, which yields the
2607 # default value for that type.
2608 ignore_value = input_tensor.dtype.as_numpy_dtype()
2609 ignore_value = math_ops.cast(
2610 ignore_value, input_tensor.dtype, name='ignore_value')
2611 indices = array_ops.where_v2(
2612 math_ops.not_equal(input_tensor, ignore_value), name='indices')
2613 return sparse_tensor_lib.SparseTensor(
2614 indices=indices,
2615 values=array_ops.gather_nd(input_tensor, indices, name='values'),
2616 dense_shape=array_ops.shape(
2617 input_tensor, out_type=dtypes.int64, name='dense_shape'))
2620def _normalize_feature_columns(feature_columns):
2621 """Normalizes the `feature_columns` input.
2623 This method converts `feature_columns` to a list where possible. In
2624 addition, it verifies the type and other properties of `feature_columns`
2625 that are required by downstream libraries.
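A minimal sketch of the normalization:

```python
cols = _normalize_feature_columns(numeric_column('b'))
# -> [NumericColumn(key='b', ...)]: wrapped in a list and sorted by name.
```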
2627 Args:
2628 feature_columns: The raw feature columns, usually passed by users.
2630 Returns:
2631 The normalized feature column list.
2633 Raises:
2634 ValueError: for any invalid inputs, such as empty, duplicated names, etc.
2635 """
2636 if isinstance(feature_columns, FeatureColumn):
2637 feature_columns = [feature_columns]
2639 if isinstance(feature_columns, collections_abc.Iterator):
2640 feature_columns = list(feature_columns)
2642 if isinstance(feature_columns, dict):
2643 raise ValueError('Expected feature_columns to be iterable, found dict.')
2645 for column in feature_columns:
2646 if not isinstance(column, FeatureColumn):
2647 raise ValueError('Items of feature_columns must be a FeatureColumn. '
2648 'Given (type {}): {}.'.format(type(column), column))
2649 if not feature_columns:
2650 raise ValueError('feature_columns must not be empty.')
2651 name_to_column = {}
2652 for column in feature_columns:
2653 if column.name in name_to_column:
2654 raise ValueError('Duplicate feature column name found for columns: {} '
2655 'and {}. This usually means that these columns refer to '
2656 'the same base feature. Either one must be discarded or a '
2657 'duplicated but renamed item must be inserted in the '
2658 'features dict.'.format(column,
2659 name_to_column[column.name]))
2660 name_to_column[column.name] = column
2662 return sorted(feature_columns, key=lambda x: x.name)
2665class NumericColumn(
2666 DenseColumn,
2667 fc_old._DenseColumn, # pylint: disable=protected-access
2668 collections.namedtuple(
2669 'NumericColumn',
2670 ('key', 'shape', 'default_value', 'dtype', 'normalizer_fn'))):
2671 """see `numeric_column`."""
2673 @property
2674 def _is_v2_column(self):
2675 return True
2677 @property
2678 def name(self):
2679 """See `FeatureColumn` base class."""
2680 return self.key
2682 @property
2683 def parse_example_spec(self):
2684 """See `FeatureColumn` base class."""
2685 return {
2686 self.key:
2687 parsing_ops.FixedLenFeature(self.shape, self.dtype,
2688 self.default_value)
2689 }
2691 @property
2692 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
2693 _FEATURE_COLUMN_DEPRECATION)
2694 def _parse_example_spec(self):
2695 return self.parse_example_spec
2697 def _transform_input_tensor(self, input_tensor):
2698 if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
2699 raise ValueError(
2700 'The corresponding Tensor of a numeric column must be a Tensor. '
2701 'SparseTensor is not supported. key: {}'.format(self.key))
2702 if self.normalizer_fn is not None:
2703 input_tensor = self.normalizer_fn(input_tensor)
2704 return math_ops.cast(input_tensor, dtypes.float32)
2706 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
2707 _FEATURE_COLUMN_DEPRECATION)
2708 def _transform_feature(self, inputs):
2709 input_tensor = inputs.get(self.key)
2710 return self._transform_input_tensor(input_tensor)
2712 def transform_feature(self, transformation_cache, state_manager):
2713 """See `FeatureColumn` base class.
2715 In this case, we apply the `normalizer_fn` to the input tensor.
2717 Args:
2718 transformation_cache: A `FeatureTransformationCache` object to access
2719 features.
2720 state_manager: A `StateManager` to create / access resources such as
2721 lookup tables.
2723 Returns:
2724 Normalized input tensor.
2725 Raises:
2726 ValueError: If a SparseTensor is passed in.
2727 """
2728 input_tensor = transformation_cache.get(self.key, state_manager)
2729 return self._transform_input_tensor(input_tensor)
2731 @property
2732 def variable_shape(self):
2733 """See `DenseColumn` base class."""
2734 return tensor_shape.TensorShape(self.shape)
2736 @property
2737 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
2738 _FEATURE_COLUMN_DEPRECATION)
2739 def _variable_shape(self):
2740 return self.variable_shape
2742 def get_dense_tensor(self, transformation_cache, state_manager):
2743 """Returns dense `Tensor` representing numeric feature.
2745 Args:
2746 transformation_cache: A `FeatureTransformationCache` object to access
2747 features.
2748 state_manager: A `StateManager` to create / access resources such as
2749 lookup tables.
2751 Returns:
2752 Dense `Tensor` created within `transform_feature`.
2753 """
2754 # Feature has already been transformed. Return the intermediate
2755 # representation created by _transform_feature.
2756 return transformation_cache.get(self, state_manager)
2758 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
2759 _FEATURE_COLUMN_DEPRECATION)
2760 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
2761 del weight_collections
2762 del trainable
2763 return inputs.get(self)
2765 @property
2766 def parents(self):
2767 """See 'FeatureColumn` base class."""
2768 return [self.key]
2770 def get_config(self):
2771 """See 'FeatureColumn` base class."""
2772 config = dict(zip(self._fields, self))
2773 from tensorflow.python.feature_column import serialization # pylint: disable=g-import-not-at-top
2774 config['normalizer_fn'] = serialization._serialize_keras_object( # pylint: disable=protected-access
2775 self.normalizer_fn)
2776 config['dtype'] = self.dtype.name
2777 return config
2779 @classmethod
2780 def from_config(cls, config, custom_objects=None, columns_by_name=None):
2781 """See 'FeatureColumn` base class."""
2782 _check_config_keys(config, cls._fields)
2783 from tensorflow.python.feature_column import serialization # pylint: disable=g-import-not-at-top
2784 kwargs = _standardize_and_copy_config(config)
2785 kwargs['normalizer_fn'] = serialization._deserialize_keras_object( # pylint: disable=protected-access
2786 config['normalizer_fn'],
2787 custom_objects=custom_objects)
2788 kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
2790 return cls(**kwargs)
2793class BucketizedColumn(
2794 DenseColumn,
2795 CategoricalColumn,
2796 fc_old._DenseColumn, # pylint: disable=protected-access
2797 fc_old._CategoricalColumn, # pylint: disable=protected-access
2798 collections.namedtuple('BucketizedColumn',
2799 ('source_column', 'boundaries'))):
2800 """See `bucketized_column`."""
2802 @property
2803 def _is_v2_column(self):
2804 return (isinstance(self.source_column, FeatureColumn) and
2805 self.source_column._is_v2_column) # pylint: disable=protected-access
2807 @property
2808 def name(self):
2809 """See `FeatureColumn` base class."""
2810 return '{}_bucketized'.format(self.source_column.name)
2812 @property
2813 def parse_example_spec(self):
2814 """See `FeatureColumn` base class."""
2815 return self.source_column.parse_example_spec
2817 @property
2818 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
2819 _FEATURE_COLUMN_DEPRECATION)
2820 def _parse_example_spec(self):
2821 return self.source_column._parse_example_spec # pylint: disable=protected-access
2823 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
2824 _FEATURE_COLUMN_DEPRECATION)
2825 def _transform_feature(self, inputs):
2826 """Returns bucketized categorical `source_column` tensor."""
2827 source_tensor = inputs.get(self.source_column)
2828 return math_ops._bucketize( # pylint: disable=protected-access
2829 source_tensor,
2830 boundaries=self.boundaries)
2832 def transform_feature(self, transformation_cache, state_manager):
2833 """Returns bucketized categorical `source_column` tensor."""
2834 source_tensor = transformation_cache.get(self.source_column, state_manager)
2835 return math_ops._bucketize( # pylint: disable=protected-access
2836 source_tensor,
2837 boundaries=self.boundaries)
2839 @property
2840 def variable_shape(self):
2841 """See `DenseColumn` base class."""
2842 return tensor_shape.TensorShape(
2843 tuple(self.source_column.shape) + (len(self.boundaries) + 1,))
2845 @property
2846 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
2847 _FEATURE_COLUMN_DEPRECATION)
2848 def _variable_shape(self):
2849 return self.variable_shape
2851 def _get_dense_tensor_for_input_tensor(self, input_tensor):
2852 return array_ops.one_hot(
2853 indices=math_ops.cast(input_tensor, dtypes.int64),
2854 depth=len(self.boundaries) + 1,
2855 on_value=1.,
2856 off_value=0.)
2858 def get_dense_tensor(self, transformation_cache, state_manager):
2859 """Returns one hot encoded dense `Tensor`."""
2860 input_tensor = transformation_cache.get(self, state_manager)
2861 return self._get_dense_tensor_for_input_tensor(input_tensor)
2863 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
2864 _FEATURE_COLUMN_DEPRECATION)
2865 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
2866 del weight_collections
2867 del trainable
2868 input_tensor = inputs.get(self)
2869 return self._get_dense_tensor_for_input_tensor(input_tensor)
2871 @property
2872 def num_buckets(self):
2873 """See `CategoricalColumn` base class."""
2874 # By construction, source_column is always one-dimensional.
2875 return (len(self.boundaries) + 1) * self.source_column.shape[0]
2877 @property
2878 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
2879 _FEATURE_COLUMN_DEPRECATION)
2880 def _num_buckets(self):
2881 return self.num_buckets
2883 def _get_sparse_tensors_for_input_tensor(self, input_tensor):
2884 batch_size = array_ops.shape(input_tensor)[0]
2885 # By construction, source_column is always one-dimensional.
2886 source_dimension = self.source_column.shape[0]
2888 i1 = array_ops.reshape(
2889 array_ops.tile(
2890 array_ops.expand_dims(math_ops.range(0, batch_size), 1),
2891 [1, source_dimension]), (-1,))
2892 i2 = array_ops.tile(math_ops.range(0, source_dimension), [batch_size])
2893 # Flatten the bucket indices and offset them so they are unique across
2894 # dimensions. E.g., with k buckets, 2nd-dimension indices range from k to 2*k-1.
2895 bucket_indices = (
2896 array_ops.reshape(input_tensor,
2897 (-1,)) + (len(self.boundaries) + 1) * i2)
2899 indices = math_ops.cast(
2900 array_ops.transpose(array_ops_stack.stack((i1, i2))), dtypes.int64)
2901 dense_shape = math_ops.cast(
2902 array_ops_stack.stack([batch_size, source_dimension]), dtypes.int64)
2903 sparse_tensor = sparse_tensor_lib.SparseTensor(
2904 indices=indices, values=bucket_indices, dense_shape=dense_shape)
2905 return CategoricalColumn.IdWeightPair(sparse_tensor, None)
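# Illustrative worked example (not in the original source) for
# `_get_sparse_tensors_for_input_tensor` above: with batch_size=2,
# source_dimension=2 and boundaries=[0., 1.] (i.e. 3 buckets),
#   i1 = [0, 0, 1, 1]   (example/row index per value)
#   i2 = [0, 1, 0, 1]   (source dimension per value)
# and a bucketized input [[0, 2], [1, 0]] flattens to [0, 2, 1, 0], so
#   bucket_indices = [0, 2, 1, 0] + 3 * [0, 1, 0, 1] = [0, 5, 1, 3];
# dimension d owns the disjoint id range [3*d, 3*d + 2].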
2907 def get_sparse_tensors(self, transformation_cache, state_manager):
2908 """Converts dense inputs to SparseTensor so downstream code can use it."""
2909 input_tensor = transformation_cache.get(self, state_manager)
2910 return self._get_sparse_tensors_for_input_tensor(input_tensor)
2912 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
2913 _FEATURE_COLUMN_DEPRECATION)
2914 def _get_sparse_tensors(self,
2915 inputs,
2916 weight_collections=None,
2917 trainable=None):
2918 """Converts dense inputs to SparseTensor so downstream code can use it."""
2919 del weight_collections
2920 del trainable
2921 input_tensor = inputs.get(self)
2922 return self._get_sparse_tensors_for_input_tensor(input_tensor)
2924 @property
2925 def parents(self):
2926 """See 'FeatureColumn` base class."""
2927 return [self.source_column]
2929 def get_config(self):
2930 """See 'FeatureColumn` base class."""
2931 from tensorflow.python.feature_column.serialization import serialize_feature_column # pylint: disable=g-import-not-at-top
2932 config = dict(zip(self._fields, self))
2933 config['source_column'] = serialize_feature_column(self.source_column)
2934 return config
2936 @classmethod
2937 def from_config(cls, config, custom_objects=None, columns_by_name=None):
2938 """See 'FeatureColumn` base class."""
2939 from tensorflow.python.feature_column.serialization import deserialize_feature_column # pylint: disable=g-import-not-at-top
2940 _check_config_keys(config, cls._fields)
2941 kwargs = _standardize_and_copy_config(config)
2942 kwargs['source_column'] = deserialize_feature_column(
2943 config['source_column'], custom_objects, columns_by_name)
2944 return cls(**kwargs)
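# Usage sketch (illustrative, not in the original source): a
# `BucketizedColumn` is normally obtained through the public
# `bucketized_column` helper defined earlier in this module. Because it is
# both a `DenseColumn` (one-hot) and a `CategoricalColumn`, it can feed deep
# and linear models alike. Feature name and boundaries are illustrative.
#
#   age = numeric_column('age')
#   age_buckets = bucketized_column(age, boundaries=[30, 40, 50])
#   # 4 buckets: num_buckets == 4 and variable_shape == (1, 4).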
2947class EmbeddingColumn(
2948 DenseColumn,
2949 SequenceDenseColumn,
2950 fc_old._DenseColumn, # pylint: disable=protected-access
2951 fc_old._SequenceDenseColumn, # pylint: disable=protected-access
2952 collections.namedtuple(
2953 'EmbeddingColumn',
2954 ('categorical_column', 'dimension', 'combiner', 'initializer',
2955 'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable',
2956 'use_safe_embedding_lookup'))):
2957 """See `embedding_column`."""
2959 def __new__(cls,
2960 categorical_column,
2961 dimension,
2962 combiner,
2963 initializer,
2964 ckpt_to_load_from,
2965 tensor_name_in_ckpt,
2966 max_norm,
2967 trainable,
2968 use_safe_embedding_lookup=True):
2969 return super(EmbeddingColumn, cls).__new__(
2970 cls,
2971 categorical_column=categorical_column,
2972 dimension=dimension,
2973 combiner=combiner,
2974 initializer=initializer,
2975 ckpt_to_load_from=ckpt_to_load_from,
2976 tensor_name_in_ckpt=tensor_name_in_ckpt,
2977 max_norm=max_norm,
2978 trainable=trainable,
2979 use_safe_embedding_lookup=use_safe_embedding_lookup)
2981 @property
2982 def _is_v2_column(self):
2983 return (isinstance(self.categorical_column, FeatureColumn) and
2984 self.categorical_column._is_v2_column) # pylint: disable=protected-access
2986 @property
2987 def name(self):
2988 """See `FeatureColumn` base class."""
2989 return '{}_embedding'.format(self.categorical_column.name)
2991 @property
2992 def parse_example_spec(self):
2993 """See `FeatureColumn` base class."""
2994 return self.categorical_column.parse_example_spec
2996 @property
2997 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
2998 _FEATURE_COLUMN_DEPRECATION)
2999 def _parse_example_spec(self):
3000 return self.categorical_column._parse_example_spec # pylint: disable=protected-access
3002 def transform_feature(self, transformation_cache, state_manager):
3003 """Transforms underlying `categorical_column`."""
3004 return transformation_cache.get(self.categorical_column, state_manager)
3006 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
3007 _FEATURE_COLUMN_DEPRECATION)
3008 def _transform_feature(self, inputs):
3009 return inputs.get(self.categorical_column)
3011 @property
3012 def variable_shape(self):
3013 """See `DenseColumn` base class."""
3014 return tensor_shape.TensorShape([self.dimension])
3016 @property
3017 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
3018 _FEATURE_COLUMN_DEPRECATION)
3019 def _variable_shape(self):
3020 return self.variable_shape
3022 def create_state(self, state_manager):
3023 """Creates the embedding lookup variable."""
3024 default_num_buckets = (
3025 self.categorical_column.num_buckets
3026 if self._is_v2_column else self.categorical_column._num_buckets) # pylint: disable=protected-access
3027 num_buckets = getattr(self.categorical_column, 'num_buckets',
3028 default_num_buckets)
3029 embedding_shape = (num_buckets, self.dimension)
3030 state_manager.create_variable(
3031 self,
3032 name='embedding_weights',
3033 shape=embedding_shape,
3034 dtype=dtypes.float32,
3035 trainable=self.trainable,
3036 use_resource=True,
3037 initializer=self.initializer)
3039 def _get_dense_tensor_internal_helper(self, sparse_tensors,
3040 embedding_weights):
3041 sparse_ids = sparse_tensors.id_tensor
3042 sparse_weights = sparse_tensors.weight_tensor
3044 if self.ckpt_to_load_from is not None:
3045 to_restore = embedding_weights
3046 if isinstance(to_restore, variables.PartitionedVariable):
3047 to_restore = to_restore._get_variable_list() # pylint: disable=protected-access
3048 checkpoint_utils.init_from_checkpoint(
3049 self.ckpt_to_load_from, {self.tensor_name_in_ckpt: to_restore})
3051 sparse_id_rank = tensor_shape.dimension_value(
3052 sparse_ids.dense_shape.get_shape()[0])
3053 embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse
3054 if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and
3055 sparse_id_rank <= 2):
3056 embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse_v2
3057 # Return embedding lookup result.
3058 return embedding_lookup_sparse(
3059 embedding_weights,
3060 sparse_ids,
3061 sparse_weights,
3062 combiner=self.combiner,
3063 name='%s_weights' % self.name,
3064 max_norm=self.max_norm)
3066 def _get_dense_tensor_internal(self, sparse_tensors, state_manager):
3067 """Private method that follows the signature of get_dense_tensor."""
3068 embedding_weights = state_manager.get_variable(
3069 self, name='embedding_weights')
3070 return self._get_dense_tensor_internal_helper(sparse_tensors,
3071 embedding_weights)
3073 def _old_get_dense_tensor_internal(self, sparse_tensors, weight_collections,
3074 trainable):
3075 """Private method that follows the signature of _get_dense_tensor."""
3076 embedding_shape = (self.categorical_column._num_buckets, self.dimension) # pylint: disable=protected-access
3077 if (weight_collections and
3078 ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections):
3079 weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
3080 embedding_weights = variable_scope.get_variable(
3081 name='embedding_weights',
3082 shape=embedding_shape,
3083 dtype=dtypes.float32,
3084 initializer=self.initializer,
3085 trainable=self.trainable and trainable,
3086 collections=weight_collections)
3087 return self._get_dense_tensor_internal_helper(sparse_tensors,
3088 embedding_weights)
3090 def get_dense_tensor(self, transformation_cache, state_manager):
3091 """Returns tensor after doing the embedding lookup.
3093 Args:
3094 transformation_cache: A `FeatureTransformationCache` object to access
3095 features.
3096 state_manager: A `StateManager` to create / access resources such as
3097 lookup tables.
3099 Returns:
3100 Embedding lookup tensor.
3102 Raises:
3103 ValueError: `categorical_column` is SequenceCategoricalColumn.
3104 """
3105 if isinstance(self.categorical_column, SequenceCategoricalColumn):
3106 raise ValueError(
3107 'In embedding_column: {}. '
3108 'categorical_column must not be of type SequenceCategoricalColumn. '
3109 'Suggested fix A: If you wish to use DenseFeatures, use a '
3110 'non-sequence categorical_column_with_*. '
3111 'Suggested fix B: If you wish to create sequence input, use '
3112 'SequenceFeatures instead of DenseFeatures. '
3113 'Given (type {}): {}'.format(self.name, type(self.categorical_column),
3114 self.categorical_column))
3115 # Get sparse IDs and weights.
3116 sparse_tensors = self.categorical_column.get_sparse_tensors(
3117 transformation_cache, state_manager)
3118 return self._get_dense_tensor_internal(sparse_tensors, state_manager)
3120 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
3121 _FEATURE_COLUMN_DEPRECATION)
3122 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
3123 if isinstance(
3124 self.categorical_column,
3125 (SequenceCategoricalColumn, fc_old._SequenceCategoricalColumn)): # pylint: disable=protected-access
3126 raise ValueError(
3127 'In embedding_column: {}. '
3128 'categorical_column must not be of type _SequenceCategoricalColumn. '
3129 'Suggested fix A: If you wish to use DenseFeatures, use a '
3130 'non-sequence categorical_column_with_*. '
3131 'Suggested fix B: If you wish to create sequence input, use '
3132 'SequenceFeatures instead of DenseFeatures. '
3133 'Given (type {}): {}'.format(self.name, type(self.categorical_column),
3134 self.categorical_column))
3135 sparse_tensors = self.categorical_column._get_sparse_tensors( # pylint: disable=protected-access
3136 inputs, weight_collections, trainable)
3137 return self._old_get_dense_tensor_internal(sparse_tensors,
3138 weight_collections, trainable)
3140 def get_sequence_dense_tensor(self, transformation_cache, state_manager):
3141 """See `SequenceDenseColumn` base class."""
3142 if not isinstance(self.categorical_column, SequenceCategoricalColumn):
3143 raise ValueError(
3144 'In embedding_column: {}. '
3145 'categorical_column must be of type SequenceCategoricalColumn '
3146 'to use SequenceFeatures. '
3147 'Suggested fix: Use one of sequence_categorical_column_with_*. '
3148 'Given (type {}): {}'.format(self.name, type(self.categorical_column),
3149 self.categorical_column))
3150 sparse_tensors = self.categorical_column.get_sparse_tensors(
3151 transformation_cache, state_manager)
3152 dense_tensor = self._get_dense_tensor_internal(sparse_tensors,
3153 state_manager)
3154 sequence_length = fc_utils.sequence_length_from_sparse_tensor(
3155 sparse_tensors.id_tensor)
3156 return SequenceDenseColumn.TensorSequenceLengthPair(
3157 dense_tensor=dense_tensor, sequence_length=sequence_length)
3159 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
3160 _FEATURE_COLUMN_DEPRECATION)
3161 def _get_sequence_dense_tensor(self,
3162 inputs,
3163 weight_collections=None,
3164 trainable=None):
3165 if not isinstance(
3166 self.categorical_column,
3167 (SequenceCategoricalColumn, fc_old._SequenceCategoricalColumn)): # pylint: disable=protected-access
3168 raise ValueError(
3169 'In embedding_column: {}. '
3170 'categorical_column must be of type SequenceCategoricalColumn '
3171 'to use SequenceFeatures. '
3172 'Suggested fix: Use one of sequence_categorical_column_with_*. '
3173 'Given (type {}): {}'.format(self.name, type(self.categorical_column),
3174 self.categorical_column))
3175 sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access
3176 dense_tensor = self._old_get_dense_tensor_internal(
3177 sparse_tensors,
3178 weight_collections=weight_collections,
3179 trainable=trainable)
3180 sequence_length = fc_utils.sequence_length_from_sparse_tensor(
3181 sparse_tensors.id_tensor)
3182 return SequenceDenseColumn.TensorSequenceLengthPair(
3183 dense_tensor=dense_tensor, sequence_length=sequence_length)
3185 @property
3186 def parents(self):
3187 """See 'FeatureColumn` base class."""
3188 return [self.categorical_column]
3190 def get_config(self):
3191 """See 'FeatureColumn` base class."""
3192 from tensorflow.python.feature_column import serialization # pylint: disable=g-import-not-at-top
3193 config = dict(zip(self._fields, self))
3194 config['categorical_column'] = serialization.serialize_feature_column(
3195 self.categorical_column)
3196 config['initializer'] = serialization._serialize_keras_object( # pylint: disable=protected-access
3197 self.initializer)
3198 return config
3200 @classmethod
3201 def from_config(cls, config, custom_objects=None, columns_by_name=None):
3202 """See 'FeatureColumn` base class."""
3203 if 'use_safe_embedding_lookup' not in config:
3204 config['use_safe_embedding_lookup'] = True
3205 from tensorflow.python.feature_column import serialization # pylint: disable=g-import-not-at-top
3206 _check_config_keys(config, cls._fields)
3207 kwargs = _standardize_and_copy_config(config)
3208 kwargs['categorical_column'] = serialization.deserialize_feature_column(
3209 config['categorical_column'], custom_objects, columns_by_name)
3210 all_initializers = dict(tf_inspect.getmembers(init_ops, tf_inspect.isclass))
3211 kwargs['initializer'] = serialization._deserialize_keras_object( # pylint: disable=protected-access
3212 config['initializer'],
3213 module_objects=all_initializers,
3214 custom_objects=custom_objects)
3215 return cls(**kwargs)
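# Usage sketch (illustrative, not in the original source): an
# `EmbeddingColumn` is normally obtained through the public
# `embedding_column` helper defined earlier in this module. Vocabulary and
# dimension are illustrative.
#
#   dept = categorical_column_with_vocabulary_list(
#       'department', ['math', 'philosophy', 'english'])
#   dept_embedded = embedding_column(dept, dimension=8)
#   # variable_shape == (8,); the (num_buckets, 8) embedding table itself is
#   # created lazily by `create_state`.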
3218def _raise_shared_embedding_column_error():
3219 raise ValueError('SharedEmbeddingColumns are not supported in '
3220 '`linear_model` or `input_layer`. Please use '
3221 '`DenseFeatures` or `LinearModel` instead.')
3224class SharedEmbeddingColumnCreator(autotrackable.AutoTrackable):
3225 """Class that creates a `SharedEmbeddingColumn`."""
3227 def __init__(self,
3228 dimension,
3229 initializer,
3230 ckpt_to_load_from,
3231 tensor_name_in_ckpt,
3232 num_buckets,
3233 trainable,
3234 name='shared_embedding_column_creator',
3235 use_safe_embedding_lookup=True):
3236 self._dimension = dimension
3237 self._initializer = initializer
3238 self._ckpt_to_load_from = ckpt_to_load_from
3239 self._tensor_name_in_ckpt = tensor_name_in_ckpt
3240 self._num_buckets = num_buckets
3241 self._trainable = trainable
3242 self._name = name
3243 self._use_safe_embedding_lookup = use_safe_embedding_lookup
3244 # Map from graph keys to embedding_weight variables.
3245 self._embedding_weights = {}
3247 def __call__(self, categorical_column, combiner, max_norm):
3248 return SharedEmbeddingColumn(categorical_column, self, combiner, max_norm,
3249 self._use_safe_embedding_lookup)
3251 @property
3252 def embedding_weights(self):
3253 key = ops.get_default_graph()._graph_key # pylint: disable=protected-access
3254 if key not in self._embedding_weights:
3255 embedding_shape = (self._num_buckets, self._dimension)
3256 var = variable_scope.get_variable(
3257 name=self._name,
3258 shape=embedding_shape,
3259 dtype=dtypes.float32,
3260 initializer=self._initializer,
3261 trainable=self._trainable)
3263 if self._ckpt_to_load_from is not None:
3264 to_restore = var
3265 if isinstance(to_restore, variables.PartitionedVariable):
3266 to_restore = to_restore._get_variable_list() # pylint: disable=protected-access
3267 checkpoint_utils.init_from_checkpoint(
3268 self._ckpt_to_load_from, {self._tensor_name_in_ckpt: to_restore})
3269 self._embedding_weights[key] = var
3270 return self._embedding_weights[key]
3272 @property
3273 def dimension(self):
3274 return self._dimension
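# Usage sketch (illustrative; direct construction shown only for
# exposition): a single creator is shared by several columns so they look
# ids up in one weight matrix. Columns are normally obtained through this
# module's shared-embedding helper rather than by calling the creator
# directly; `categorical_column_a`/`categorical_column_b` assume two
# existing categorical columns over a combined vocabulary of 1000 ids.
#
#   creator = SharedEmbeddingColumnCreator(
#       dimension=8, initializer=None, ckpt_to_load_from=None,
#       tensor_name_in_ckpt=None, num_buckets=1000, trainable=True)
#   col_a = creator(categorical_column_a, combiner='mean', max_norm=None)
#   col_b = creator(categorical_column_b, combiner='mean', max_norm=None)
#   # col_a and col_b share creator.embedding_weights.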
3277class SharedEmbeddingColumn(
3278 DenseColumn,
3279 SequenceDenseColumn,
3280 fc_old._DenseColumn, # pylint: disable=protected-access
3281 fc_old._SequenceDenseColumn, # pylint: disable=protected-access
3282 collections.namedtuple(
3283 'SharedEmbeddingColumn',
3284 ('categorical_column', 'shared_embedding_column_creator', 'combiner',
3285 'max_norm', 'use_safe_embedding_lookup'))):
3286 """See `embedding_column`."""
3288 def __new__(cls,
3289 categorical_column,
3290 shared_embedding_column_creator,
3291 combiner,
3292 max_norm,
3293 use_safe_embedding_lookup=True):
3294 return super(SharedEmbeddingColumn, cls).__new__(
3295 cls,
3296 categorical_column=categorical_column,
3297 shared_embedding_column_creator=shared_embedding_column_creator,
3298 combiner=combiner,
3299 max_norm=max_norm,
3300 use_safe_embedding_lookup=use_safe_embedding_lookup)
3302 @property
3303 def _is_v2_column(self):
3304 return True
3306 @property
3307 def name(self):
3308 """See `FeatureColumn` base class."""
3309 return '{}_shared_embedding'.format(self.categorical_column.name)
3311 @property
3312 def parse_example_spec(self):
3313 """See `FeatureColumn` base class."""
3314 return self.categorical_column.parse_example_spec
3316 @property
3317 def _parse_example_spec(self):
3318 return _raise_shared_embedding_column_error()
3320 def transform_feature(self, transformation_cache, state_manager):
3321 """See `FeatureColumn` base class."""
3322 return transformation_cache.get(self.categorical_column, state_manager)
3324 def _transform_feature(self, inputs):
3325 return _raise_shared_embedding_column_error()
3327 @property
3328 def variable_shape(self):
3329 """See `DenseColumn` base class."""
3330 return tensor_shape.TensorShape(
3331 [self.shared_embedding_column_creator.dimension])
3333 @property
3334 def _variable_shape(self):
3335 return _raise_shared_embedding_column_error()
3337 def _get_dense_tensor_internal(self, transformation_cache, state_manager):
3338 """Private method that follows the signature of _get_dense_tensor."""
3339 # This method is called from a variable_scope with name _var_scope_name,
3340 # which is shared among all shared embeddings. Open a name_scope here, so
3341 # that the ops for different columns have distinct names.
3342 with ops.name_scope(None, default_name=self.name):
3343 # Get sparse IDs and weights.
3344 sparse_tensors = self.categorical_column.get_sparse_tensors(
3345 transformation_cache, state_manager)
3346 sparse_ids = sparse_tensors.id_tensor
3347 sparse_weights = sparse_tensors.weight_tensor
3349 embedding_weights = self.shared_embedding_column_creator.embedding_weights
3351 sparse_id_rank = tensor_shape.dimension_value(
3352 sparse_ids.dense_shape.get_shape()[0])
3353 embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse
3354 if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and
3355 sparse_id_rank <= 2):
3356 embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse_v2
3357 # Return embedding lookup result.
3358 return embedding_lookup_sparse(
3359 embedding_weights,
3360 sparse_ids,
3361 sparse_weights,
3362 combiner=self.combiner,
3363 name='%s_weights' % self.name,
3364 max_norm=self.max_norm)
3366 def get_dense_tensor(self, transformation_cache, state_manager):
3367 """Returns the embedding lookup result."""
3368 if isinstance(self.categorical_column, SequenceCategoricalColumn):
3369 raise ValueError(
3370 'In embedding_column: {}. '
3371 'categorical_column must not be of type SequenceCategoricalColumn. '
3372 'Suggested fix A: If you wish to use DenseFeatures, use a '
3373 'non-sequence categorical_column_with_*. '
3374 'Suggested fix B: If you wish to create sequence input, use '
3375 'SequenceFeatures instead of DenseFeatures. '
3376 'Given (type {}): {}'.format(self.name, type(self.categorical_column),
3377 self.categorical_column))
3378 return self._get_dense_tensor_internal(transformation_cache, state_manager)
3380 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
3381 return _raise_shared_embedding_column_error()
3383 def get_sequence_dense_tensor(self, transformation_cache, state_manager):
3384 """See `SequenceDenseColumn` base class."""
3385 if not isinstance(self.categorical_column, SequenceCategoricalColumn):
3386 raise ValueError(
3387 'In embedding_column: {}. '
3388 'categorical_column must be of type SequenceCategoricalColumn '
3389 'to use SequenceFeatures. '
3390 'Suggested fix: Use one of sequence_categorical_column_with_*. '
3391 'Given (type {}): {}'.format(self.name, type(self.categorical_column),
3392 self.categorical_column))
3393 dense_tensor = self._get_dense_tensor_internal(transformation_cache,
3394 state_manager)
3395 sparse_tensors = self.categorical_column.get_sparse_tensors(
3396 transformation_cache, state_manager)
3397 sequence_length = fc_utils.sequence_length_from_sparse_tensor(
3398 sparse_tensors.id_tensor)
3399 return SequenceDenseColumn.TensorSequenceLengthPair(
3400 dense_tensor=dense_tensor, sequence_length=sequence_length)
3402 def _get_sequence_dense_tensor(self,
3403 inputs,
3404 weight_collections=None,
3405 trainable=None):
3406 return _raise_shared_embedding_column_error()
3408 @property
3409 def parents(self):
3410 """See 'FeatureColumn` base class."""
3411 return [self.categorical_column]
3414def _check_shape(shape, key):
3415 """Returns shape if it's valid, raises error otherwise."""
3416 assert shape is not None
3417 if not nest.is_nested(shape):
3418 shape = [shape]
3419 shape = tuple(shape)
3420 for dimension in shape:
3421 if not isinstance(dimension, int):
3422 raise TypeError('shape dimensions must be integer. '
3423 'shape: {}, key: {}'.format(shape, key))
3424 if dimension < 1:
3425 raise ValueError('shape dimensions must be greater than 0. '
3426 'shape: {}, key: {}'.format(shape, key))
3427 return shape
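# Illustrative behavior of `_check_shape` (not in the original source): a
# bare int is normalized to a 1-tuple, and non-integer or non-positive
# dimensions are rejected.
#
#   _check_shape(3, key='age')       # -> (3,)
#   _check_shape([2, 4], key='img')  # -> (2, 4)
#   _check_shape([0], key='bad')     # raises ValueError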
3430class HashedCategoricalColumn(
3431 CategoricalColumn,
3432 fc_old._CategoricalColumn, # pylint: disable=protected-access
3433 collections.namedtuple('HashedCategoricalColumn',
3434 ('key', 'hash_bucket_size', 'dtype'))):
3435 """see `categorical_column_with_hash_bucket`."""
3437 @property
3438 def _is_v2_column(self):
3439 return True
3441 @property
3442 def name(self):
3443 """See `FeatureColumn` base class."""
3444 return self.key
3446 @property
3447 def parse_example_spec(self):
3448 """See `FeatureColumn` base class."""
3449 return {self.key: parsing_ops.VarLenFeature(self.dtype)}
3451 @property
3452 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
3453 _FEATURE_COLUMN_DEPRECATION)
3454 def _parse_example_spec(self):
3455 return self.parse_example_spec
3457 def _transform_input_tensor(self, input_tensor):
3458 """Hashes the values in the feature_column."""
3459 if not isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
3460 raise ValueError('HashedCategoricalColumn input must be a SparseTensor.')
3462 fc_utils.assert_string_or_int(
3463 input_tensor.dtype,
3464 prefix='column_name: {} input_tensor'.format(self.key))
3466 if self.dtype.is_integer != input_tensor.dtype.is_integer:
3467 raise ValueError(
3468 'Column dtype and SparseTensors dtype must be compatible. '
3469 'key: {}, column dtype: {}, tensor dtype: {}'.format(
3470 self.key, self.dtype, input_tensor.dtype))
3472 if self.dtype == dtypes.string:
3473 sparse_values = input_tensor.values
3474 else:
3475 sparse_values = string_ops.as_string(input_tensor.values)
3477 sparse_id_values = string_ops.string_to_hash_bucket_fast(
3478 sparse_values, self.hash_bucket_size, name='lookup')
3479 return sparse_tensor_lib.SparseTensor(input_tensor.indices,
3480 sparse_id_values,
3481 input_tensor.dense_shape)
3483 def transform_feature(self, transformation_cache, state_manager):
3484 """Hashes the values in the feature_column."""
3485 input_tensor = _to_sparse_input_and_drop_ignore_values(
3486 transformation_cache.get(self.key, state_manager))
3487 return self._transform_input_tensor(input_tensor)
3489 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
3490 _FEATURE_COLUMN_DEPRECATION)
3491 def _transform_feature(self, inputs):
3492 input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
3493 return self._transform_input_tensor(input_tensor)
3495 @property
3496 def num_buckets(self):
3497 """Returns number of buckets in this sparse feature."""
3498 return self.hash_bucket_size
3500 @property
3501 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
3502 _FEATURE_COLUMN_DEPRECATION)
3503 def _num_buckets(self):
3504 return self.num_buckets
3506 def get_sparse_tensors(self, transformation_cache, state_manager):
3507 """See `CategoricalColumn` base class."""
3508 return CategoricalColumn.IdWeightPair(
3509 transformation_cache.get(self, state_manager), None)
3511 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
3512 _FEATURE_COLUMN_DEPRECATION)
3513 def _get_sparse_tensors(self,
3514 inputs,
3515 weight_collections=None,
3516 trainable=None):
3517 del weight_collections
3518 del trainable
3519 return CategoricalColumn.IdWeightPair(inputs.get(self), None)
3521 @property
3522 def parents(self):
3523 """See 'FeatureColumn` base class."""
3524 return [self.key]
3526 def get_config(self):
3527 """See 'FeatureColumn` base class."""
3528 config = dict(zip(self._fields, self))
3529 config['dtype'] = self.dtype.name
3530 return config
3532 @classmethod
3533 def from_config(cls, config, custom_objects=None, columns_by_name=None):
3534 """See 'FeatureColumn` base class."""
3535 _check_config_keys(config, cls._fields)
3536 kwargs = _standardize_and_copy_config(config)
3537 kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
3538 return cls(**kwargs)
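# Usage sketch (illustrative, not in the original source): normally obtained
# through the public `categorical_column_with_hash_bucket` helper defined
# earlier in this module. Feature name and size are illustrative.
#
#   color = categorical_column_with_hash_bucket('color', hash_bucket_size=100)
#   # num_buckets == 100; values are hashed into [0, 100) with
#   # string_to_hash_bucket_fast, so distinct values may collide.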
3541class VocabularyFileCategoricalColumn(
3542 CategoricalColumn,
3543 fc_old._CategoricalColumn, # pylint: disable=protected-access
3544 collections.namedtuple(
3545 'VocabularyFileCategoricalColumn',
3546 ('key', 'vocabulary_file', 'vocabulary_size', 'num_oov_buckets',
3547 'dtype', 'default_value', 'file_format'))):
3548 """See `categorical_column_with_vocabulary_file`."""
3550 def __new__(cls,
3551 key,
3552 vocabulary_file,
3553 vocabulary_size,
3554 num_oov_buckets,
3555 dtype,
3556 default_value,
3557 file_format=None):
3558 return super(VocabularyFileCategoricalColumn, cls).__new__(
3559 cls,
3560 key=key,
3561 vocabulary_file=vocabulary_file,
3562 vocabulary_size=vocabulary_size,
3563 num_oov_buckets=num_oov_buckets,
3564 dtype=dtype,
3565 default_value=default_value,
3566 file_format=file_format)
3568 @property
3569 def _is_v2_column(self):
3570 return True
3572 @property
3573 def name(self):
3574 """See `FeatureColumn` base class."""
3575 return self.key
3577 @property
3578 def parse_example_spec(self):
3579 """See `FeatureColumn` base class."""
3580 return {self.key: parsing_ops.VarLenFeature(self.dtype)}
3582 @property
3583 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
3584 _FEATURE_COLUMN_DEPRECATION)
3585 def _parse_example_spec(self):
3586 return self.parse_example_spec
3588 def _make_table_from_tfrecord_gzip_file(self, key_dtype, name):
3589 dataset = readers.TFRecordDataset(
3590 self.vocabulary_file, compression_type='GZIP')
3592 def key_dtype_fn(key):
3593 return key if key_dtype is dtypes.string else string_ops.string_to_number(
3594 key, out_type=key_dtype)
3596 return data_lookup_ops.index_table_from_dataset(
3597 dataset.map(key_dtype_fn),
3598 num_oov_buckets=self.num_oov_buckets,
3599 vocab_size=self.vocabulary_size,
3600 default_value=self.default_value,
3601 key_dtype=key_dtype,
3602 name=name)
3604 def _make_table(self, key_dtype, state_manager):
3605 name = '{}_lookup'.format(self.key)
3606 if state_manager is None or not state_manager.has_resource(self, name):
3607 with ops.init_scope():
3608 if self.file_format == 'tfrecord_gzip':
3609 table = self._make_table_from_tfrecord_gzip_file(key_dtype, name)
3610 else:
3611 table = lookup_ops.index_table_from_file(
3612 vocabulary_file=self.vocabulary_file,
3613 num_oov_buckets=self.num_oov_buckets,
3614 vocab_size=self.vocabulary_size,
3615 default_value=self.default_value,
3616 key_dtype=key_dtype,
3617 name=name)
3618 if state_manager is not None:
3619 state_manager.add_resource(self, name, table)
3620 else:
3621 # Reuse the table already created for this column.
3622 table = state_manager.get_resource(self, name)
3623 return table
3625 def _transform_input_tensor(self, input_tensor, state_manager=None):
3626 """Creates a lookup table for the vocabulary."""
3627 if self.dtype.is_integer != input_tensor.dtype.is_integer:
3628 raise ValueError(
3629 'Column dtype and SparseTensors dtype must be compatible. '
3630 'key: {}, column dtype: {}, tensor dtype: {}'.format(
3631 self.key, self.dtype, input_tensor.dtype))
3633 fc_utils.assert_string_or_int(
3634 input_tensor.dtype,
3635 prefix='column_name: {} input_tensor'.format(self.key))
3637 key_dtype = self.dtype
3638 if input_tensor.dtype.is_integer:
3639 # `index_table_from_file` requires 64-bit integer keys.
3640 key_dtype = dtypes.int64
3641 input_tensor = math_ops.cast(input_tensor, dtypes.int64)
3642 return self._make_table(key_dtype, state_manager).lookup(input_tensor)
3644 def transform_feature(self, transformation_cache, state_manager):
3645 """Creates a lookup table for the vocabulary."""
3646 input_tensor = _to_sparse_input_and_drop_ignore_values(
3647 transformation_cache.get(self.key, state_manager))
3648 return self._transform_input_tensor(input_tensor, state_manager)
3650 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
3651 _FEATURE_COLUMN_DEPRECATION)
3652 def _transform_feature(self, inputs):
3653 input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
3654 return self._transform_input_tensor(input_tensor)
3656 @property
3657 def num_buckets(self):
3658 """Returns number of buckets in this sparse feature."""
3659 return self.vocabulary_size + self.num_oov_buckets
3661 @property
3662 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
3663 _FEATURE_COLUMN_DEPRECATION)
3664 def _num_buckets(self):
3665 return self.num_buckets
3667 def get_sparse_tensors(self, transformation_cache, state_manager):
3668 """See `CategoricalColumn` base class."""
3669 return CategoricalColumn.IdWeightPair(
3670 transformation_cache.get(self, state_manager), None)
3672 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
3673 _FEATURE_COLUMN_DEPRECATION)
3674 def _get_sparse_tensors(self,
3675 inputs,
3676 weight_collections=None,
3677 trainable=None):
3678 del weight_collections
3679 del trainable
3680 return CategoricalColumn.IdWeightPair(inputs.get(self), None)
3682 @property
3683 def parents(self):
3684 """See 'FeatureColumn` base class."""
3685 return [self.key]
3687 def get_config(self):
3688 """See 'FeatureColumn` base class."""
3689 config = dict(zip(self._fields, self))
3690 config['dtype'] = self.dtype.name
3691 return config
3693 @classmethod
3694 def from_config(cls, config, custom_objects=None, columns_by_name=None):
3695 """See 'FeatureColumn` base class."""
3696 _check_config_keys(config, cls._fields)
3697 kwargs = _standardize_and_copy_config(config)
3698 kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
3699 return cls(**kwargs)
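# Usage sketch (illustrative, not in the original source): normally obtained
# through the public `categorical_column_with_vocabulary_file` helper. The
# path and sizes are illustrative assumptions.
#
#   state = categorical_column_with_vocabulary_file(
#       key='state', vocabulary_file='/tmp/states.txt', vocabulary_size=50,
#       num_oov_buckets=5)
#   # num_buckets == 50 + 5: in-vocabulary ids come first, then OOV buckets.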
3702class VocabularyListCategoricalColumn(
3703 CategoricalColumn,
3704 fc_old._CategoricalColumn, # pylint: disable=protected-access
3705 collections.namedtuple(
3706 'VocabularyListCategoricalColumn',
3707 ('key', 'vocabulary_list', 'dtype', 'default_value', 'num_oov_buckets'))
3708):
3709 """See `categorical_column_with_vocabulary_list`."""
3711 @property
3712 def _is_v2_column(self):
3713 return True
3715 @property
3716 def name(self):
3717 """See `FeatureColumn` base class."""
3718 return self.key
3720 @property
3721 def parse_example_spec(self):
3722 """See `FeatureColumn` base class."""
3723 return {self.key: parsing_ops.VarLenFeature(self.dtype)}
3725 @property
3726 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
3727 _FEATURE_COLUMN_DEPRECATION)
3728 def _parse_example_spec(self):
3729 return self.parse_example_spec
3731 def _transform_input_tensor(self, input_tensor, state_manager=None):
3732 """Creates a lookup table for the vocabulary list."""
3733 if self.dtype.is_integer != input_tensor.dtype.is_integer:
3734 raise ValueError(
3735 'Column dtype and SparseTensors dtype must be compatible. '
3736 'key: {}, column dtype: {}, tensor dtype: {}'.format(
3737 self.key, self.dtype, input_tensor.dtype))
3739 fc_utils.assert_string_or_int(
3740 input_tensor.dtype,
3741 prefix='column_name: {} input_tensor'.format(self.key))
3743 key_dtype = self.dtype
3744 if input_tensor.dtype.is_integer:
3745 # `index_table_from_tensor` requires 64-bit integer keys.
3746 key_dtype = dtypes.int64
3747 input_tensor = math_ops.cast(input_tensor, dtypes.int64)
3749 name = '{}_lookup'.format(self.key)
3750 if state_manager is None or not state_manager.has_resource(self, name):
3751 with ops.init_scope():
3752 table = lookup_ops.index_table_from_tensor(
3753 vocabulary_list=tuple(self.vocabulary_list),
3754 default_value=self.default_value,
3755 num_oov_buckets=self.num_oov_buckets,
3756 dtype=key_dtype,
3757 name=name)
3758 if state_manager is not None:
3759 state_manager.add_resource(self, name, table)
3760 else:
3761 # Reuse the table already created for this column.
3762 table = state_manager.get_resource(self, name)
3763 return table.lookup(input_tensor)
3765 def transform_feature(self, transformation_cache, state_manager):
3766 """Creates a lookup table for the vocabulary list."""
3767 input_tensor = _to_sparse_input_and_drop_ignore_values(
3768 transformation_cache.get(self.key, state_manager))
3769 return self._transform_input_tensor(input_tensor, state_manager)
3771 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
3772 _FEATURE_COLUMN_DEPRECATION)
3773 def _transform_feature(self, inputs):
3774 input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
3775 return self._transform_input_tensor(input_tensor)
3777 @property
3778 def num_buckets(self):
3779 """Returns number of buckets in this sparse feature."""
3780 return len(self.vocabulary_list) + self.num_oov_buckets
3782 @property
3783 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
3784 _FEATURE_COLUMN_DEPRECATION)
3785 def _num_buckets(self):
3786 return self.num_buckets
3788 def get_sparse_tensors(self, transformation_cache, state_manager):
3789 """See `CategoricalColumn` base class."""
3790 return CategoricalColumn.IdWeightPair(
3791 transformation_cache.get(self, state_manager), None)
3793 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
3794 _FEATURE_COLUMN_DEPRECATION)
3795 def _get_sparse_tensors(self,
3796 inputs,
3797 weight_collections=None,
3798 trainable=None):
3799 del weight_collections
3800 del trainable
3801 return CategoricalColumn.IdWeightPair(inputs.get(self), None)
3803 @property
3804 def parents(self):
3805 """See 'FeatureColumn` base class."""
3806 return [self.key]
3808 def get_config(self):
3809 """See 'FeatureColumn` base class."""
3810 config = dict(zip(self._fields, self))
3811 config['dtype'] = self.dtype.name
3812 return config
3814 @classmethod
3815 def from_config(cls, config, custom_objects=None, columns_by_name=None):
3816 """See 'FeatureColumn` base class."""
3817 _check_config_keys(config, cls._fields)
3818 kwargs = _standardize_and_copy_config(config)
3819 kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
3820 return cls(**kwargs)
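# Usage sketch (illustrative, not in the original source): normally obtained
# through the public `categorical_column_with_vocabulary_list` helper. The
# vocabulary is illustrative.
#
#   dept = categorical_column_with_vocabulary_list(
#       'department', ['math', 'philosophy', 'english'], num_oov_buckets=2)
#   # num_buckets == len(vocabulary_list) + num_oov_buckets == 5.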
3823class IdentityCategoricalColumn(
3824 CategoricalColumn,
3825 fc_old._CategoricalColumn, # pylint: disable=protected-access
3826 collections.namedtuple('IdentityCategoricalColumn',
3827 ('key', 'number_buckets', 'default_value'))):
3828 """See `categorical_column_with_identity`."""
3830 @property
3831 def _is_v2_column(self):
3832 return True
3834 @property
3835 def name(self):
3836 """See `FeatureColumn` base class."""
3837 return self.key
3839 @property
3840 def parse_example_spec(self):
3841 """See `FeatureColumn` base class."""
3842 return {self.key: parsing_ops.VarLenFeature(dtypes.int64)}
3844 @property
3845 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
3846 _FEATURE_COLUMN_DEPRECATION)
3847 def _parse_example_spec(self):
3848 return self.parse_example_spec
3850 def _transform_input_tensor(self, input_tensor):
3851 """Returns a SparseTensor with identity values."""
3852 if not input_tensor.dtype.is_integer:
3853 raise ValueError('Invalid input, not integer. key: {} dtype: {}'.format(
3854 self.key, input_tensor.dtype))
3855 values = input_tensor.values
3856 if input_tensor.values.dtype != dtypes.int64:
3857 values = math_ops.cast(values, dtypes.int64, name='values')
3858 if self.default_value is not None:
3859 values = math_ops.cast(input_tensor.values, dtypes.int64, name='values')
3860 num_buckets = math_ops.cast(
3861 self.num_buckets, dtypes.int64, name='num_buckets')
3862 zero = math_ops.cast(0, dtypes.int64, name='zero')
3863 # Assign default for out-of-range values.
3864 values = array_ops.where_v2(
3865 math_ops.logical_or(
3866 values < zero, values >= num_buckets, name='out_of_range'),
3867 array_ops.fill(
3868 dims=array_ops.shape(values),
3869 value=math_ops.cast(self.default_value, dtypes.int64),
3870 name='default_values'), values)
3872 return sparse_tensor_lib.SparseTensor(
3873 indices=input_tensor.indices,
3874 values=values,
3875 dense_shape=input_tensor.dense_shape)
3877 def transform_feature(self, transformation_cache, state_manager):
3878 """Returns a SparseTensor with identity values."""
3879 input_tensor = _to_sparse_input_and_drop_ignore_values(
3880 transformation_cache.get(self.key, state_manager))
3881 return self._transform_input_tensor(input_tensor)
3883 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
3884 _FEATURE_COLUMN_DEPRECATION)
3885 def _transform_feature(self, inputs):
3886 input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
3887 return self._transform_input_tensor(input_tensor)
3889 @property
3890 def num_buckets(self):
3891 """Returns number of buckets in this sparse feature."""
3892 return self.number_buckets
3894 @property
3895 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
3896 _FEATURE_COLUMN_DEPRECATION)
3897 def _num_buckets(self):
3898 return self.num_buckets
3900 def get_sparse_tensors(self, transformation_cache, state_manager):
3901 """See `CategoricalColumn` base class."""
3902 return CategoricalColumn.IdWeightPair(
3903 transformation_cache.get(self, state_manager), None)
3905 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
3906 _FEATURE_COLUMN_DEPRECATION)
3907 def _get_sparse_tensors(self,
3908 inputs,
3909 weight_collections=None,
3910 trainable=None):
3911 del weight_collections
3912 del trainable
3913 return CategoricalColumn.IdWeightPair(inputs.get(self), None)
3915 @property
3916 def parents(self):
3917 """See 'FeatureColumn` base class."""
3918 return [self.key]
3920 def get_config(self):
3921 """See 'FeatureColumn` base class."""
3922 return dict(zip(self._fields, self))
3924 @classmethod
3925 def from_config(cls, config, custom_objects=None, columns_by_name=None):
3926 """See 'FeatureColumn` base class."""
3927 _check_config_keys(config, cls._fields)
3928 kwargs = _standardize_and_copy_config(config)
3929 return cls(**kwargs)
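# Usage sketch (illustrative, not in the original source): normally obtained
# through the public `categorical_column_with_identity` helper. With
# `default_value` set, out-of-range ids are replaced instead of rejected;
# name and sizes are illustrative.
#
#   video = categorical_column_with_identity(
#       'video_id', num_buckets=1000, default_value=0)
#   # ids in [0, 1000) pass through unchanged; an id such as 1005 is mapped
#   # to 0 by `_transform_input_tensor`.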
3932class WeightedCategoricalColumn(
3933 CategoricalColumn,
3934 fc_old._CategoricalColumn, # pylint: disable=protected-access
3935 collections.namedtuple(
3936 'WeightedCategoricalColumn',
3937 ('categorical_column', 'weight_feature_key', 'dtype'))):
3938 """See `weighted_categorical_column`."""
3940 @property
3941 def _is_v2_column(self):
3942 return (isinstance(self.categorical_column, FeatureColumn) and
3943 self.categorical_column._is_v2_column) # pylint: disable=protected-access
3945 @property
3946 def name(self):
3947 """See `FeatureColumn` base class."""
3948 return '{}_weighted_by_{}'.format(self.categorical_column.name,
3949 self.weight_feature_key)
3951 @property
3952 def parse_example_spec(self):
3953 """See `FeatureColumn` base class."""
3954 config = self.categorical_column.parse_example_spec
3955 if self.weight_feature_key in config:
3956 raise ValueError('Parse config {} already exists for {}.'.format(
3957 config[self.weight_feature_key], self.weight_feature_key))
3958 config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype)
3959 return config
3961 @property
3962 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
3963 _FEATURE_COLUMN_DEPRECATION)
3964 def _parse_example_spec(self):
3965 config = self.categorical_column._parse_example_spec # pylint: disable=protected-access
3966 if self.weight_feature_key in config:
3967 raise ValueError('Parse config {} already exists for {}.'.format(
3968 config[self.weight_feature_key], self.weight_feature_key))
3969 config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype)
3970 return config
3972 @property
3973 def num_buckets(self):
3974 """See `DenseColumn` base class."""
3975 return self.categorical_column.num_buckets
3977 @property
3978 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
3979 _FEATURE_COLUMN_DEPRECATION)
3980 def _num_buckets(self):
3981 return self.categorical_column._num_buckets # pylint: disable=protected-access
3983 def _transform_weight_tensor(self, weight_tensor):
3984 if weight_tensor is None:
3985 raise ValueError('Missing weights {}.'.format(self.weight_feature_key))
3986 weight_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
3987 weight_tensor)
3988 if self.dtype != weight_tensor.dtype.base_dtype:
3989 raise ValueError('Bad dtype, expected {}, but got {}.'.format(
3990 self.dtype, weight_tensor.dtype))
3991 if not isinstance(weight_tensor, sparse_tensor_lib.SparseTensor):
3992 # The weight tensor can be a regular Tensor. In this case, sparsify it.
3993 weight_tensor = _to_sparse_input_and_drop_ignore_values(
3994 weight_tensor, ignore_value=0.0)
3995 if not weight_tensor.dtype.is_floating:
3996 weight_tensor = math_ops.cast(weight_tensor, dtypes.float32)
3997 return weight_tensor
3999 def transform_feature(self, transformation_cache, state_manager):
4000 """Applies weights to tensor generated from `categorical_column`'."""
4001 weight_tensor = transformation_cache.get(self.weight_feature_key,
4002 state_manager)
4003 sparse_weight_tensor = self._transform_weight_tensor(weight_tensor)
4004 sparse_categorical_tensor = _to_sparse_input_and_drop_ignore_values(
4005 transformation_cache.get(self.categorical_column, state_manager))
4006 return (sparse_categorical_tensor, sparse_weight_tensor)
4008 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
4009 _FEATURE_COLUMN_DEPRECATION)
4010 def _transform_feature(self, inputs):
4011 """Applies weights to tensor generated from `categorical_column`'."""
4012 weight_tensor = inputs.get(self.weight_feature_key)
4013 weight_tensor = self._transform_weight_tensor(weight_tensor)
4014 return (inputs.get(self.categorical_column), weight_tensor)
4016 def get_sparse_tensors(self, transformation_cache, state_manager):
4017 """See `CategoricalColumn` base class."""
4018 tensors = transformation_cache.get(self, state_manager)
4019 return CategoricalColumn.IdWeightPair(tensors[0], tensors[1])
4021 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
4022 _FEATURE_COLUMN_DEPRECATION)
4023 def _get_sparse_tensors(self,
4024 inputs,
4025 weight_collections=None,
4026 trainable=None):
4027 del weight_collections
4028 del trainable
4029 tensors = inputs.get(self)
4030 return CategoricalColumn.IdWeightPair(tensors[0], tensors[1])
4032 @property
4033 def parents(self):
4034 """See 'FeatureColumn` base class."""
4035 return [self.categorical_column, self.weight_feature_key]
4037 def get_config(self):
4038 """See 'FeatureColumn` base class."""
4039 from tensorflow.python.feature_column.serialization import serialize_feature_column # pylint: disable=g-import-not-at-top
4040 config = dict(zip(self._fields, self))
4041 config['categorical_column'] = serialize_feature_column(
4042 self.categorical_column)
4043 config['dtype'] = self.dtype.name
4044 return config
4046 @classmethod
4047 def from_config(cls, config, custom_objects=None, columns_by_name=None):
4048 """See 'FeatureColumn` base class."""
4049 from tensorflow.python.feature_column.serialization import deserialize_feature_column # pylint: disable=g-import-not-at-top
4050 _check_config_keys(config, cls._fields)
4051 kwargs = _standardize_and_copy_config(config)
4052 kwargs['categorical_column'] = deserialize_feature_column(
4053 config['categorical_column'], custom_objects, columns_by_name)
4054 kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
4055 return cls(**kwargs)
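# Usage sketch (illustrative, not in the original source): normally obtained
# through the public `weighted_categorical_column` helper. The second
# feature key supplies a per-id weight; names are illustrative.
#
#   terms = categorical_column_with_vocabulary_list('terms', ['a', 'b', 'c'])
#   weighted_terms = weighted_categorical_column(
#       categorical_column=terms, weight_feature_key='term_frequencies')
#   # get_sparse_tensors now returns an IdWeightPair whose weight tensor is
#   # read from the 'term_frequencies' feature.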
4058class CrossedColumn(
4059 CategoricalColumn,
4060 fc_old._CategoricalColumn, # pylint: disable=protected-access
4061 collections.namedtuple('CrossedColumn',
4062 ('keys', 'hash_bucket_size', 'hash_key'))):
4063 """See `crossed_column`."""
4065 @property
4066 def _is_v2_column(self):
4067 for key in _collect_leaf_level_keys(self):
4068 if isinstance(key, six.string_types):
4069 continue
4070 if not isinstance(key, FeatureColumn):
4071 return False
4072 if not key._is_v2_column: # pylint: disable=protected-access
4073 return False
4074 return True
4076 @property
4077 def name(self):
4078 """See `FeatureColumn` base class."""
4079 feature_names = []
4080 for key in _collect_leaf_level_keys(self):
4081 if isinstance(key, (FeatureColumn, fc_old._FeatureColumn)): # pylint: disable=protected-access
4082 feature_names.append(key.name)
4083 else: # key must be a string
4084 feature_names.append(key)
4085 return '_X_'.join(sorted(feature_names))
4087 @property
4088 def parse_example_spec(self):
4089 """See `FeatureColumn` base class."""
4090 config = {}
4091 for key in self.keys:
4092 if isinstance(key, FeatureColumn):
4093 config.update(key.parse_example_spec)
4094 elif isinstance(key, fc_old._FeatureColumn): # pylint: disable=protected-access
4095 config.update(key._parse_example_spec) # pylint: disable=protected-access
4096 else: # key must be a string
4097 config.update({key: parsing_ops.VarLenFeature(dtypes.string)})
4098 return config
4100 @property
4101 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
4102 _FEATURE_COLUMN_DEPRECATION)
4103 def _parse_example_spec(self):
4104 return self.parse_example_spec
4106 def transform_feature(self, transformation_cache, state_manager):
4107 """Generates a hashed sparse cross from the input tensors."""
4108 feature_tensors = []
4109 for key in _collect_leaf_level_keys(self):
4110 if isinstance(key, six.string_types):
4111 feature_tensors.append(transformation_cache.get(key, state_manager))
4112 elif isinstance(key, (fc_old._CategoricalColumn, CategoricalColumn)): # pylint: disable=protected-access
4113 ids_and_weights = key.get_sparse_tensors(transformation_cache,
4114 state_manager)
4115 if ids_and_weights.weight_tensor is not None:
4116 raise ValueError(
4117 'crossed_column does not support weight_tensor, but the given '
4118 'column populates weight_tensor. '
4119 'Given column: {}'.format(key.name))
4120 feature_tensors.append(ids_and_weights.id_tensor)
4121 else:
4122 raise ValueError('Unsupported column type. Given: {}'.format(key))
4123 return sparse_ops.sparse_cross_hashed(
4124 inputs=feature_tensors,
4125 num_buckets=self.hash_bucket_size,
4126 hash_key=self.hash_key)
4128 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
4129 _FEATURE_COLUMN_DEPRECATION)
4130 def _transform_feature(self, inputs):
4131 """Generates a hashed sparse cross from the input tensors."""
4132 feature_tensors = []
4133 for key in _collect_leaf_level_keys(self):
4134 if isinstance(key, six.string_types):
4135 feature_tensors.append(inputs.get(key))
4136 elif isinstance(key, (CategoricalColumn, fc_old._CategoricalColumn)): # pylint: disable=protected-access
4137 ids_and_weights = key._get_sparse_tensors(inputs) # pylint: disable=protected-access
4138 if ids_and_weights.weight_tensor is not None:
4139 raise ValueError(
4140 'crossed_column does not support weight_tensor, but the given '
4141 'column populates weight_tensor. '
4142 'Given column: {}'.format(key.name))
4143 feature_tensors.append(ids_and_weights.id_tensor)
4144 else:
4145 raise ValueError('Unsupported column type. Given: {}'.format(key))
4146 return sparse_ops.sparse_cross_hashed(
4147 inputs=feature_tensors,
4148 num_buckets=self.hash_bucket_size,
4149 hash_key=self.hash_key)
4151 @property
4152 def num_buckets(self):
4153 """Returns number of buckets in this sparse feature."""
4154 return self.hash_bucket_size
4156 @property
4157 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
4158 _FEATURE_COLUMN_DEPRECATION)
4159 def _num_buckets(self):
4160 return self.num_buckets
4162 def get_sparse_tensors(self, transformation_cache, state_manager):
4163 """See `CategoricalColumn` base class."""
4164 return CategoricalColumn.IdWeightPair(
4165 transformation_cache.get(self, state_manager), None)
4167 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
4168 _FEATURE_COLUMN_DEPRECATION)
4169 def _get_sparse_tensors(self,
4170 inputs,
4171 weight_collections=None,
4172 trainable=None):
4173 """See `CategoricalColumn` base class."""
4174 del weight_collections
4175 del trainable
4176 return CategoricalColumn.IdWeightPair(inputs.get(self), None)
4178 @property
4179 def parents(self):
4180 """See 'FeatureColumn` base class."""
4181 return list(self.keys)
4183 def get_config(self):
4184 """See 'FeatureColumn` base class."""
4185 from tensorflow.python.feature_column.serialization import serialize_feature_column # pylint: disable=g-import-not-at-top
4186 config = dict(zip(self._fields, self))
4187 config['keys'] = tuple([serialize_feature_column(fc) for fc in self.keys])
4188 return config
4190 @classmethod
4191 def from_config(cls, config, custom_objects=None, columns_by_name=None):
4192 """See 'FeatureColumn` base class."""
4193 from tensorflow.python.feature_column.serialization import deserialize_feature_column # pylint: disable=g-import-not-at-top
4194 _check_config_keys(config, cls._fields)
4195 kwargs = _standardize_and_copy_config(config)
4196 kwargs['keys'] = tuple([
4197 deserialize_feature_column(c, custom_objects, columns_by_name)
4198 for c in config['keys']
4199 ])
4200 return cls(**kwargs)
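# Usage sketch (illustrative, not in the original source): normally obtained
# through the public `crossed_column` helper. Keys may be raw string feature
# keys or other categorical columns; names are illustrative and
# `age_buckets` assumes an existing `BucketizedColumn`.
#
#   cross = crossed_column(['department', age_buckets], hash_bucket_size=1000)
#   # Each (department, age-bucket) pair is hashed into [0, 1000), so
#   # num_buckets == 1000.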
4203def _collect_leaf_level_keys(cross):
4204 """Collects base keys by expanding all nested crosses.
4206 Args:
4207 cross: A `CrossedColumn`.
4209 Returns:
4210 A list of strings or `CategoricalColumn` instances.
4211 """
4212 leaf_level_keys = []
4213 for k in cross.keys:
4214 if isinstance(k, CrossedColumn):
4215 leaf_level_keys.extend(_collect_leaf_level_keys(k))
4216 else:
4217 leaf_level_keys.append(k)
4218 return leaf_level_keys
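# Illustrative expansion example (not in the original source): nested
# crosses are expanded recursively. With
#   cross_a = crossed_column(['a', 'b'], hash_bucket_size=10)
#   cross_b = crossed_column([cross_a, 'c'], hash_bucket_size=10)
# _collect_leaf_level_keys(cross_b) returns ['a', 'b', 'c'].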
4221def _prune_invalid_ids(sparse_ids, sparse_weights):
4222 """Prune invalid IDs (< 0) from the input ids and weights."""
4223 is_id_valid = math_ops.greater_equal(sparse_ids.values, 0)
4224 if sparse_weights is not None:
4225 is_id_valid = math_ops.logical_and(
4226 is_id_valid,
4227 array_ops.ones_like(sparse_weights.values, dtype=dtypes.bool))
4228 sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_id_valid)
4229 if sparse_weights is not None:
4230 sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_id_valid)
4231 return sparse_ids, sparse_weights
4234def _prune_invalid_weights(sparse_ids, sparse_weights):
4235 """Prune invalid weights (< 0) from the input ids and weights."""
4236 if sparse_weights is not None:
4237 is_weights_valid = math_ops.greater(sparse_weights.values, 0)
4238 sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_weights_valid)
4239 sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_weights_valid)
4240 return sparse_ids, sparse_weights
4243class IndicatorColumn(
4244 DenseColumn,
4245 SequenceDenseColumn,
4246 fc_old._DenseColumn, # pylint: disable=protected-access
4247 fc_old._SequenceDenseColumn, # pylint: disable=protected-access
4248 collections.namedtuple('IndicatorColumn', ('categorical_column'))):
4249 """Represents a one-hot column for use in deep networks.
4251 Args:
4252 categorical_column: A `CategoricalColumn` created by a
4253 `categorical_column_with_*` function.
4254 """
4256 @property
4257 def _is_v2_column(self):
4258 return (isinstance(self.categorical_column, FeatureColumn) and
4259 self.categorical_column._is_v2_column) # pylint: disable=protected-access
4261 @property
4262 def name(self):
4263 """See `FeatureColumn` base class."""
4264 return '{}_indicator'.format(self.categorical_column.name)
4266 def _transform_id_weight_pair(self, id_weight_pair, size):
4267 id_tensor = id_weight_pair.id_tensor
4268 weight_tensor = id_weight_pair.weight_tensor
4270 # If the underlying column is weighted, return the input as a dense tensor.
4271 if weight_tensor is not None:
4272 weighted_column = sparse_ops.sparse_merge(
4273 sp_ids=id_tensor, sp_values=weight_tensor, vocab_size=int(size))
4274 # Remove (?, -1) index.
4275 weighted_column = sparse_ops.sparse_slice(weighted_column, [0, 0],
4276 weighted_column.dense_shape)
4277 # Use scatter_nd instead of sparse_tensor_to_dense so that any
4278 # duplicated indices are merged (their values summed).
4279 return array_ops.scatter_nd(weighted_column.indices,
4280 weighted_column.values,
4281 weighted_column.dense_shape)
4283 dense_id_tensor = sparse_ops.sparse_tensor_to_dense(
4284 id_tensor, default_value=-1)
4286 # The one-hot tensor must be float so tf.concat can combine it with the
4287 # other float32 inputs to input_layer.
4288 one_hot_id_tensor = array_ops.one_hot(
4289 dense_id_tensor, depth=size, on_value=1.0, off_value=0.0)
4291 # Reduce to get a multi-hot per example.
4292 return math_ops.reduce_sum(one_hot_id_tensor, axis=[-2])
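# Illustrative worked example (not in the original source) for
# `_transform_id_weight_pair` above: with unweighted id rows
# [[1, 2], [2, 2]] and size=4, the one-hot tensors are summed over the id
# axis, giving multi-hot rows [[0., 1., 1., 0.], [0., 0., 2., 0.]] --
# repeated ids add up rather than saturating at 1.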
4294 def transform_feature(self, transformation_cache, state_manager):
4295 """Returns dense `Tensor` representing feature.
4297 Args:
4298 transformation_cache: A `FeatureTransformationCache` object to access
4299 features.
4300 state_manager: A `StateManager` to create / access resources such as
4301 lookup tables.
4303 Returns:
4304 Transformed feature `Tensor`.
4306 Raises:
4307 ValueError: if input rank is not known at graph building time.
4308 """
4309 id_weight_pair = self.categorical_column.get_sparse_tensors(
4310 transformation_cache, state_manager)
4311 return self._transform_id_weight_pair(id_weight_pair,
4312 self.variable_shape[-1])
4314 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
4315 _FEATURE_COLUMN_DEPRECATION)
4316 def _transform_feature(self, inputs):
4317 id_weight_pair = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access
4318 return self._transform_id_weight_pair(id_weight_pair,
4319 self._variable_shape[-1])
4321 @property
4322 def parse_example_spec(self):
4323 """See `FeatureColumn` base class."""
4324 return self.categorical_column.parse_example_spec
4326 @property
4327 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
4328 _FEATURE_COLUMN_DEPRECATION)
4329 def _parse_example_spec(self):
4330 return self.categorical_column._parse_example_spec # pylint: disable=protected-access
4332 @property
4333 def variable_shape(self):
4334 """Returns a `TensorShape` representing the shape of the dense `Tensor`."""
4335 if isinstance(self.categorical_column, FeatureColumn):
4336 return tensor_shape.TensorShape([1, self.categorical_column.num_buckets])
4337 else:
4338 return tensor_shape.TensorShape([1, self.categorical_column._num_buckets]) # pylint: disable=protected-access
4340 @property
4341 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
4342 _FEATURE_COLUMN_DEPRECATION)
4343 def _variable_shape(self):
4344 return tensor_shape.TensorShape([1, self.categorical_column._num_buckets]) # pylint: disable=protected-access
4346 def get_dense_tensor(self, transformation_cache, state_manager):
4347 """Returns dense `Tensor` representing feature.
4349 Args:
4350 transformation_cache: A `FeatureTransformationCache` object to access
4351 features.
4352 state_manager: A `StateManager` to create / access resources such as
4353 lookup tables.
4355 Returns:
4356 Dense `Tensor` created within `transform_feature`.
4358 Raises:
4359 ValueError: If `categorical_column` is a `SequenceCategoricalColumn`.
4360 """
4361 if isinstance(self.categorical_column, SequenceCategoricalColumn):
4362 raise ValueError(
4363 'In indicator_column: {}. '
4364 'categorical_column must not be of type SequenceCategoricalColumn. '
4365 'Suggested fix A: If you wish to use DenseFeatures, use a '
4366 'non-sequence categorical_column_with_*. '
4367 'Suggested fix B: If you wish to create sequence input, use '
4368 'SequenceFeatures instead of DenseFeatures. '
4369 'Given (type {}): {}'.format(self.name, type(self.categorical_column),
4370 self.categorical_column))
4371 # The feature has already been transformed. Return the intermediate
4372 # representation created by transform_feature.
4373 return transformation_cache.get(self, state_manager)
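
# Standalone sketch (not part of this module): driving get_dense_tensor above
# through the public API; the feature name and vocabulary are illustrative.
import tensorflow as tf

dept = tf.feature_column.categorical_column_with_vocabulary_list(
    'department', ['math', 'philosophy', 'english'])
indicator = tf.feature_column.indicator_column(dept)
layer = tf.keras.layers.DenseFeatures([indicator])
features = {'department': tf.constant([['math'], ['english']])}
dense = layer(features)  # float32 multi-hot of shape [2, 3]
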
4375 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
4376 _FEATURE_COLUMN_DEPRECATION)
4377 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
4378 del weight_collections
4379 del trainable
4380 if isinstance(
4381 self.categorical_column,
4382 (SequenceCategoricalColumn, fc_old._SequenceCategoricalColumn)): # pylint: disable=protected-access
4383 raise ValueError(
4384 'In indicator_column: {}. '
4385 'categorical_column must not be of type _SequenceCategoricalColumn. '
4386 'Suggested fix A: If you wish to use DenseFeatures, use a '
4387 'non-sequence categorical_column_with_*. '
4388 'Suggested fix B: If you wish to create sequence input, use '
4389 'SequenceFeatures instead of DenseFeatures. '
4390 'Given (type {}): {}'.format(self.name, type(self.categorical_column),
4391 self.categorical_column))
4392 # The feature has already been transformed. Return the intermediate
4393 # representation created by transform_feature.
4394 return inputs.get(self)
4396 def get_sequence_dense_tensor(self, transformation_cache, state_manager):
4397 """See `SequenceDenseColumn` base class."""
4398 if not isinstance(self.categorical_column, SequenceCategoricalColumn):
4399 raise ValueError(
4400 'In indicator_column: {}. '
4401 'categorical_column must be of type SequenceCategoricalColumn '
4402 'to use SequenceFeatures. '
4403 'Suggested fix: Use one of sequence_categorical_column_with_*. '
4404 'Given (type {}): {}'.format(self.name, type(self.categorical_column),
4405 self.categorical_column))
4406 # The feature has already been transformed. Return the intermediate
4407 # representation created by transform_feature.
4408 dense_tensor = transformation_cache.get(self, state_manager)
4409 sparse_tensors = self.categorical_column.get_sparse_tensors(
4410 transformation_cache, state_manager)
4411 sequence_length = fc_utils.sequence_length_from_sparse_tensor(
4412 sparse_tensors.id_tensor)
4413 return SequenceDenseColumn.TensorSequenceLengthPair(
4414 dense_tensor=dense_tensor, sequence_length=sequence_length)
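
# Standalone sketch (not part of this module): the sequence path above via
# SequenceFeatures; the feature values and shapes are illustrative.
import tensorflow as tf

seq_dept = tf.feature_column.sequence_categorical_column_with_vocabulary_list(
    'department', ['math', 'philosophy', 'english'])
seq_indicator = tf.feature_column.indicator_column(seq_dept)
layer = tf.keras.experimental.SequenceFeatures([seq_indicator])
features = {
    'department': tf.sparse.SparseTensor(
        indices=[[0, 0], [0, 1], [1, 0]],
        values=['math', 'english', 'philosophy'],
        dense_shape=[2, 2])
}
dense, seq_len = layer(features)  # dense: [2, 2, 3]; seq_len: [2, 1]
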
4416 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
4417 _FEATURE_COLUMN_DEPRECATION)
4418 def _get_sequence_dense_tensor(self,
4419 inputs,
4420 weight_collections=None,
4421 trainable=None):
4422 # Do nothing with weight_collections and trainable since no variables are
4423 # created in this function.
4424 del weight_collections
4425 del trainable
4426 if not isinstance(
4427 self.categorical_column,
4428 (SequenceCategoricalColumn, fc_old._SequenceCategoricalColumn)): # pylint: disable=protected-access
4429 raise ValueError(
4430 'In indicator_column: {}. '
4431 'categorical_column must be of type _SequenceCategoricalColumn '
4432 'to use SequenceFeatures. '
4433 'Suggested fix: Use one of sequence_categorical_column_with_*. '
4434 'Given (type {}): {}'.format(self.name, type(self.categorical_column),
4435 self.categorical_column))
4436 # The feature has already been transformed. Return the intermediate
4437 # representation created by _transform_feature.
4438 dense_tensor = inputs.get(self)
4439 sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access
4440 sequence_length = fc_utils.sequence_length_from_sparse_tensor(
4441 sparse_tensors.id_tensor)
4442 return SequenceDenseColumn.TensorSequenceLengthPair(
4443 dense_tensor=dense_tensor, sequence_length=sequence_length)
4445 @property
4446 def parents(self):
4447 """See 'FeatureColumn` base class."""
4448 return [self.categorical_column]
4450 def get_config(self):
4451 """See 'FeatureColumn` base class."""
4452 from tensorflow.python.feature_column.serialization import serialize_feature_column # pylint: disable=g-import-not-at-top
4453 config = dict(zip(self._fields, self))
4454 config['categorical_column'] = serialize_feature_column(
4455 self.categorical_column)
4456 return config
4458 @classmethod
4459 def from_config(cls, config, custom_objects=None, columns_by_name=None):
4460 """See 'FeatureColumn` base class."""
4461 from tensorflow.python.feature_column.serialization import deserialize_feature_column # pylint: disable=g-import-not-at-top
4462 _check_config_keys(config, cls._fields)
4463 kwargs = _standardize_and_copy_config(config)
4464 kwargs['categorical_column'] = deserialize_feature_column(
4465 config['categorical_column'], custom_objects, columns_by_name)
4466 return cls(**kwargs)
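
# Standalone sketch (not part of this module): round-tripping a column
# through get_config / from_config via the serialization helpers this module
# imports; identifiers are illustrative.
import tensorflow as tf
from tensorflow.python.feature_column import serialization

dept = tf.feature_column.categorical_column_with_vocabulary_list(
    'department', ['math', 'philosophy'])
indicator = tf.feature_column.indicator_column(dept)
config = serialization.serialize_feature_column(indicator)
restored = serialization.deserialize_feature_column(config)
# restored is an IndicatorColumn equal to the original field-by-field.
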
4469def _verify_static_batch_size_equality(tensors, columns):
4470 """Verify equality between static batch sizes.
4472 Args:
4473 tensors: iterable of input tensors.
4474 columns: Corresponding feature columns.
4476 Raises:
4477 ValueError: in case of mismatched batch sizes.
4478 """
4479 # batch_size is a Dimension object.
4480 expected_batch_size = None
4481 for i in range(0, len(tensors)):
4482 batch_size = tensor_shape.Dimension(
4483 tensor_shape.dimension_value(tensors[i].shape[0]))
4484 if batch_size.value is not None:
4485 if expected_batch_size is None:
4486 batch_size_column_index = i
4487 expected_batch_size = batch_size
4488 elif not expected_batch_size.is_compatible_with(batch_size):
4489 raise ValueError(
4490 'Batch size (first dimension) of each feature must be the same. '
4491 'Batch size of columns ({}, {}): ({}, {})'.format(
4492 columns[batch_size_column_index].name, columns[i].name,
4493 expected_batch_size, batch_size))
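
# Standalone sketch (not part of this module): the Dimension compatibility
# rule the check above relies on. Known sizes must match exactly, while an
# unknown (None) batch size is compatible with anything and is skipped.
from tensorflow.python.framework import tensor_shape

assert not tensor_shape.Dimension(2).is_compatible_with(
    tensor_shape.Dimension(3))     # mismatch -> ValueError above
assert tensor_shape.Dimension(2).is_compatible_with(
    tensor_shape.Dimension(None))  # unknown sizes never fail the check
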
4496class SequenceCategoricalColumn(
4497 CategoricalColumn,
4498 fc_old._SequenceCategoricalColumn, # pylint: disable=protected-access
4499 collections.namedtuple('SequenceCategoricalColumn',
4500 ('categorical_column',))):
4501 """Represents sequences of categorical data."""
4503 @property
4504 def _is_v2_column(self):
4505 return (isinstance(self.categorical_column, FeatureColumn) and
4506 self.categorical_column._is_v2_column) # pylint: disable=protected-access
4508 @property
4509 def name(self):
4510 """See `FeatureColumn` base class."""
4511 return self.categorical_column.name
4513 @property
4514 def parse_example_spec(self):
4515 """See `FeatureColumn` base class."""
4516 return self.categorical_column.parse_example_spec
4518 @property
4519 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
4520 _FEATURE_COLUMN_DEPRECATION)
4521 def _parse_example_spec(self):
4522 return self.categorical_column._parse_example_spec # pylint: disable=protected-access
4524 def transform_feature(self, transformation_cache, state_manager):
4525 """See `FeatureColumn` base class."""
4526 return self.categorical_column.transform_feature(transformation_cache,
4527 state_manager)
4529 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
4530 _FEATURE_COLUMN_DEPRECATION)
4531 def _transform_feature(self, inputs):
4532 return self.categorical_column._transform_feature(inputs) # pylint: disable=protected-access
4534 @property
4535 def num_buckets(self):
4536 """Returns number of buckets in this sparse feature."""
4537 return self.categorical_column.num_buckets
4539 @property
4540 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
4541 _FEATURE_COLUMN_DEPRECATION)
4542 def _num_buckets(self):
4543 return self.categorical_column._num_buckets # pylint: disable=protected-access
4545 def _get_sparse_tensors_helper(self, sparse_tensors):
4546 id_tensor = sparse_tensors.id_tensor
4547 weight_tensor = sparse_tensors.weight_tensor
4548 # Expand the third dimension, if necessary, so that embeddings are not
4549 # combined during the embedding lookup. If the tensor is already 3D,
4550 # leave it as-is.
4551 shape = array_ops.shape(id_tensor)
4552 # Compute the third dimension explicitly instead of setting it to -1,
4553 # since -1 fails for dynamically shaped tensors that have length 0 at
4554 # runtime, as happens for empty sequences.
4555 target_shape = [shape[0], shape[1], math_ops.reduce_prod(shape[2:])]
4556 id_tensor = sparse_ops.sparse_reshape(id_tensor, target_shape)
4557 if weight_tensor is not None:
4558 weight_tensor = sparse_ops.sparse_reshape(weight_tensor, target_shape)
4559 return CategoricalColumn.IdWeightPair(id_tensor, weight_tensor)
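
# Standalone sketch (not part of this module): the rank-3 expansion above on
# a [batch, max_len] id tensor with illustrative values. reduce_prod over the
# empty trailing dims yields 1, so the target shape is computed without -1,
# which would fail for sequences that are empty at runtime.
import tensorflow as tf

ids = tf.sparse.SparseTensor(
    indices=[[0, 0], [0, 1], [1, 0]], values=[3, 1, 2], dense_shape=[2, 2])
shape = tf.shape(ids, out_type=tf.int64)
target = [shape[0], shape[1], tf.reduce_prod(shape[2:])]
ids_3d = tf.sparse.reshape(ids, target)  # dense_shape becomes [2, 2, 1]
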
4561 def get_sparse_tensors(self, transformation_cache, state_manager):
4562 """Returns an IdWeightPair.
4564 `IdWeightPair` is a pair of `SparseTensor`s representing ids and
4565 weights.
4567 `IdWeightPair.id_tensor` is typically a `batch_size` x `num_buckets`
4568 `SparseTensor` of `int64`. `IdWeightPair.weight_tensor` is either a
4569 `SparseTensor` of `float` or `None`, in which case all weights are taken
4570 to be 1. If specified, `weight_tensor` must have exactly the same shape
4571 and indices as `id_tensor`. The expected `SparseTensor` is the same as the
4572 parsing output of a `VarLenFeature`, which is a ragged matrix.
4574 Args:
4575 transformation_cache: A `FeatureTransformationCache` object to access
4576 features.
4577 state_manager: A `StateManager` to create / access resources such as
4578 lookup tables.
4579 """
4580 sparse_tensors = self.categorical_column.get_sparse_tensors(
4581 transformation_cache, state_manager)
4582 return self._get_sparse_tensors_helper(sparse_tensors)
4584 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
4585 _FEATURE_COLUMN_DEPRECATION)
4586 def _get_sparse_tensors(self,
4587 inputs,
4588 weight_collections=None,
4589 trainable=None):
4590 sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access
4591 return self._get_sparse_tensors_helper(sparse_tensors)
4593 @property
4594 def parents(self):
4595 """See 'FeatureColumn` base class."""
4596 return [self.categorical_column]
4598 def get_config(self):
4599 """See 'FeatureColumn` base class."""
4600 from tensorflow.python.feature_column.serialization import serialize_feature_column # pylint: disable=g-import-not-at-top
4601 config = dict(zip(self._fields, self))
4602 config['categorical_column'] = serialize_feature_column(
4603 self.categorical_column)
4604 return config
4606 @classmethod
4607 def from_config(cls, config, custom_objects=None, columns_by_name=None):
4608 """See 'FeatureColumn` base class."""
4609 from tensorflow.python.feature_column.serialization import deserialize_feature_column # pylint: disable=g-import-not-at-top
4610 _check_config_keys(config, cls._fields)
4611 kwargs = _standardize_and_copy_config(config)
4612 kwargs['categorical_column'] = deserialize_feature_column(
4613 config['categorical_column'], custom_objects, columns_by_name)
4614 return cls(**kwargs)
4617def _check_config_keys(config, expected_keys):
4618 """Checks that a config has all expected_keys."""
4619 if set(config.keys()) != set(expected_keys):
4620 raise ValueError('Invalid config: {}, expected keys: {}'.format(
4621 config, expected_keys))
4624def _standardize_and_copy_config(config):
4625 """Returns a shallow copy of config with lists turned to tuples.
4627 Keras serialization uses nest to listify everything.
4628 This causes problems with the NumericColumn shape, which becomes
4629 unhashable. We could try to solve this on the Keras side, but that
4630 would require lots of tracking to avoid changing existing behavior.
4631 Instead, we ensure here that we revive correctly.
4633 Args:
4634 config: dict that will be used to revive a Feature Column
4636 Returns:
4637 Shallow copy of config with lists turned to tuples.
4638 """
4639 kwargs = config.copy()
4640 for k, v in kwargs.items():
4641 if isinstance(v, list):
4642 kwargs[k] = tuple(v)
4644 return kwargs
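
# Standalone sketch (not part of this module): why the list -> tuple pass
# above matters, using a hypothetical NumericColumn-style config. Keras nest
# serialization listifies the shape, and lists are unhashable; tuples restore
# hashability on revival.
config = {'shape': [2, 3], 'default_value': None}
kwargs = {k: tuple(v) if isinstance(v, list) else v for k, v in config.items()}
assert kwargs['shape'] == (2, 3)
hash(kwargs['shape'])  # OK for a tuple; hash([2, 3]) would raise TypeError
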
4647def _sanitize_column_name_for_variable_scope(name):
4648 """Sanitizes user-provided feature names for use as variable scopes."""
4649 invalid_char = re.compile('[^A-Za-z0-9_.\\-]')
4650 return invalid_char.sub('_', name)
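
# Standalone sketch (not part of this module): the sanitization above applied
# to hypothetical feature names. Every character outside [A-Za-z0-9_.-] is
# replaced with an underscore so the name is a valid variable-scope name.
import re

_invalid_char = re.compile('[^A-Za-z0-9_.\\-]')
assert _invalid_char.sub('_', 'user:age bucket') == 'user_age_bucket'
assert _invalid_char.sub('_', 'dept/name') == 'dept_name'
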