# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""This API defines the FeatureColumn abstraction.

FeatureColumns provide a high level abstraction for ingesting and representing
features. FeatureColumns are also the primary way of encoding features for
canned `tf.estimator.Estimator`s.

When using FeatureColumns with `Estimators`, the type of feature column you
should choose depends on (1) the feature type and (2) the model type.

1. Feature type:

  * Continuous features can be represented by `numeric_column`.
  * Categorical features can be represented by any `categorical_column_with_*`
    column:
    - `categorical_column_with_vocabulary_list`
    - `categorical_column_with_vocabulary_file`
    - `categorical_column_with_hash_bucket`
    - `categorical_column_with_identity`
    - `weighted_categorical_column`

2. Model type:

  * Deep neural network models (`DNNClassifier`, `DNNRegressor`).

    Continuous features can be directly fed into deep neural network models.

      age_column = numeric_column("age")

    To feed sparse features into DNN models, wrap the column with
    `embedding_column` or `indicator_column`. `indicator_column` is recommended
    for features with only a few possible values. For features with many
    possible values, `embedding_column` is recommended, to reduce the size of
    your model.

      embedded_dept_column = embedding_column(
          categorical_column_with_vocabulary_list(
              "department", ["math", "philosophy", ...]), dimension=10)

  * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).

    Sparse features can be fed directly into linear models. They behave like an
    indicator column but with an efficient implementation.

      dept_column = categorical_column_with_vocabulary_list("department",
          ["math", "philosophy", "english"])

    It is recommended that continuous features be bucketized before being
    fed into linear models.

      bucketized_age_column = bucketized_column(
          source_column=age_column,
          boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

    Sparse features can be crossed (also known as conjoined or combined) in
    order to form non-linearities, and then fed into linear models.

      cross_dept_age_column = crossed_column(
          columns=["department", bucketized_age_column],
          hash_bucket_size=1000)

Example of building canned `Estimator`s using FeatureColumns:

  ```python
  # Define features and transformations
  deep_feature_columns = [age_column, embedded_dept_column]
  wide_feature_columns = [dept_column, bucketized_age_column,
                          cross_dept_age_column]

  # Build deep model
  estimator = DNNClassifier(
      feature_columns=deep_feature_columns,
      hidden_units=[500, 250, 50])
  estimator.train(...)

  # Or build a wide model
  estimator = LinearClassifier(
      feature_columns=wide_feature_columns)
  estimator.train(...)

  # Or build a wide and deep model!
  estimator = DNNLinearCombinedClassifier(
      linear_feature_columns=wide_feature_columns,
      dnn_feature_columns=deep_feature_columns,
      dnn_hidden_units=[500, 250, 50])
  estimator.train(...)
  ```

FeatureColumns can also be transformed into a generic input layer for
custom models using `input_layer`.

Example of building a model using FeatureColumns; this can be used in a
`model_fn` which is given to the `tf.estimator.Estimator`:

  ```python
  # Building model via layers

  deep_feature_columns = [age_column, embedded_dept_column]
  columns_to_tensor = parse_feature_columns_from_examples(
      serialized=my_data,
      feature_columns=deep_feature_columns)
  first_layer = input_layer(
      features=columns_to_tensor,
      feature_columns=deep_feature_columns)
  second_layer = fully_connected(first_layer, ...)
  ```

NOTE: Functions prefixed with "_" indicate experimental or private parts of
the API subject to change, and should not be relied upon!

NOTE: The new feature columns are being developed in feature_column_v2.py and
largely duplicate the code here. Please make sure to update logic in both
places.
"""

import abc
import collections
import math

import numpy as np
import six

from tensorflow.python.eager import context
from tensorflow.python.feature_column import utils as fc_utils
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
from tensorflow.python.framework import tensor_shape
from tensorflow.python.layers import base
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import array_ops_stack
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import cond
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import lookup_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import parsing_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.ops import string_ops
from tensorflow.python.ops import template
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables
from tensorflow.python.platform import gfile
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training import checkpoint_utils
from tensorflow.python.util import deprecation
from tensorflow.python.util import nest
from tensorflow.python.util.compat import collections_abc
from tensorflow.python.util.tf_export import tf_export
from tensorflow.tools.docs import doc_controls

_FEATURE_COLUMN_DEPRECATION_WARNING = """\
    Warning: tf.feature_column is not recommended for new code. Instead,
    feature preprocessing can be done directly using either [Keras preprocessing
    layers](https://www.tensorflow.org/guide/migrate/migrating_feature_columns)
    or through the one-stop utility [`tf.keras.utils.FeatureSpace`](https://www.tensorflow.org/api_docs/python/tf/keras/utils/FeatureSpace)
    built on top of them. See the [migration guide](https://tensorflow.org/guide/migrate)
    for details.
    """

_FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING = (
    'Use Keras preprocessing layers instead, either directly or via the '
    '`tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has '
    'a functional equivalent in `tf.keras.layers` for feature preprocessing '
    'when training a Keras model.')
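
# For illustration only, per the migration pointers above: a minimal sketch of
# a Keras-layers replacement for an embedded categorical feature. The
# vocabulary and the `raw_department_strings` tensor are hypothetical, and
# this is one possible migration, not the only one:
#
#   lookup = tf.keras.layers.StringLookup(vocabulary=['math', 'philosophy'])
#   embed = tf.keras.layers.Embedding(
#       input_dim=lookup.vocabulary_size(), output_dim=10)
#   dense_features = embed(lookup(raw_department_strings))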


def _internal_input_layer(features,
                          feature_columns,
                          weight_collections=None,
                          trainable=True,
                          cols_to_vars=None,
                          scope=None,
                          cols_to_output_tensors=None,
                          from_template=False):
  """See input_layer. `scope` is a name or variable scope to use."""

  feature_columns = _normalize_feature_columns(feature_columns)
  for column in feature_columns:
    if not isinstance(column, _DenseColumn):
      raise ValueError(
          'Items of feature_columns must be a _DenseColumn. '
          'You can wrap a categorical column with an '
          'embedding_column or indicator_column. Given: {}'.format(column))
  weight_collections = list(weight_collections or [])
  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)

  def _get_logits():  # pylint: disable=missing-docstring
    builder = _LazyBuilder(features)
    output_tensors = []
    ordered_columns = []
    for column in sorted(feature_columns, key=lambda x: x.name):
      ordered_columns.append(column)
      with variable_scope.variable_scope(
          None, default_name=column._var_scope_name):  # pylint: disable=protected-access
        tensor = column._get_dense_tensor(  # pylint: disable=protected-access
            builder,
            weight_collections=weight_collections,
            trainable=trainable)
        num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
        batch_size = array_ops.shape(tensor)[0]
        output_tensor = array_ops.reshape(
            tensor, shape=(batch_size, num_elements))
        output_tensors.append(output_tensor)
        if cols_to_vars is not None:
          # Retrieve any variables created (some _DenseColumns don't create
          # variables, in which case an empty list is returned).
          cols_to_vars[column] = ops.get_collection(
              ops.GraphKeys.GLOBAL_VARIABLES,
              scope=variable_scope.get_variable_scope().name)
        if cols_to_output_tensors is not None:
          cols_to_output_tensors[column] = output_tensor
    _verify_static_batch_size_equality(output_tensors, ordered_columns)
    return array_ops.concat(output_tensors, 1)

  # If we're constructing from `make_template`, it by default adds a variable
  # scope with the name of the layer. In that case, we don't want to add
  # another `variable_scope`, as that would break checkpoints.
  if from_template:
    return _get_logits()
  else:
    with variable_scope.variable_scope(
        scope, default_name='input_layer', values=features.values()):
      return _get_logits()


@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export(v1=['feature_column.input_layer'])
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def input_layer(features,
                feature_columns,
                weight_collections=None,
                trainable=True,
                cols_to_vars=None,
                cols_to_output_tensors=None):
  """Returns a dense `Tensor` as input layer based on given `feature_columns`.

  Generally a single example in training data is described with
  FeatureColumns. At the first layer of the model, this column-oriented data
  should be converted to a single `Tensor`.

  Example:

  ```python
  price = numeric_column('price')
  keywords_embedded = embedding_column(
      categorical_column_with_hash_bucket("keywords", 10000), dimension=16)
  columns = [price, keywords_embedded, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  for units in [128, 64, 32]:
    dense_tensor = tf.compat.v1.layers.dense(dense_tensor, units, tf.nn.relu)
  prediction = tf.compat.v1.layers.dense(dense_tensor, 1)
  ```

  Args:
    features: A mapping from key to tensors. `_FeatureColumn`s look up via
      these keys. For example `numeric_column('price')` will look at the
      'price' key in this dict. Values can be a `SparseTensor` or a `Tensor`
      depending on the corresponding `_FeatureColumn`.
    feature_columns: An iterable containing the FeatureColumns to use as
      inputs to your model. All items should be instances of classes derived
      from `_DenseColumn` such as `numeric_column`, `embedding_column`,
      `bucketized_column`, `indicator_column`. If you have categorical
      features, you can wrap them with an `embedding_column` or
      `indicator_column`.
    weight_collections: A list of collection names to which the Variable will
      be added. Note that variables will also be added to collections
      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
    trainable: If `True` also add the variable to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    cols_to_vars: If not `None`, must be a dictionary that will be filled with
      a mapping from `_FeatureColumn` to list of `Variable`s. For example,
      after the call, we might have
      cols_to_vars = {
          _EmbeddingColumn(
              categorical_column=_HashedCategoricalColumn(
                  key='sparse_feature', hash_bucket_size=5, dtype=tf.string),
              dimension=10): [<tf.Variable 'some_variable:0' shape=(5, 10)>,
                              <tf.Variable 'some_variable:1' shape=(5, 10)>]}
      If a column creates no variables, its value will be an empty list.
    cols_to_output_tensors: If not `None`, must be a dictionary that will be
      filled with a mapping from '_FeatureColumn' to the associated output
      `Tensor`s.

  Returns:
    A `Tensor` which represents the input layer of a model. Its shape
    is (batch_size, first_layer_dimension) and its dtype is `float32`.
    first_layer_dimension is determined based on the given `feature_columns`.

  Raises:
    ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
  """
  return _internal_input_layer(
      features,
      feature_columns,
      weight_collections=weight_collections,
      trainable=trainable,
      cols_to_vars=cols_to_vars,
      cols_to_output_tensors=cols_to_output_tensors)


# TODO(akshayka): InputLayer should be a subclass of Layer, and it
# should implement the logic in input_layer using Layer's build-and-call
# paradigm; input_layer should create an instance of InputLayer and
# return the result of invoking its apply method, just as functional layers do.
class InputLayer(object):
  """An object-oriented version of `input_layer` that reuses variables."""

  def __init__(self,
               feature_columns,
               weight_collections=None,
               trainable=True,
               cols_to_vars=None,
               name='feature_column_input_layer',
               create_scope_now=True):
    """See `input_layer`."""

    self._feature_columns = feature_columns
    self._weight_collections = weight_collections
    self._trainable = trainable
    self._cols_to_vars = cols_to_vars
    self._name = name
    self._input_layer_template = template.make_template(
        self._name, _internal_input_layer, create_scope_now_=create_scope_now)
    self._scope = self._input_layer_template.variable_scope

  def __call__(self, features):
    return self._input_layer_template(
        features=features,
        feature_columns=self._feature_columns,
        weight_collections=self._weight_collections,
        trainable=self._trainable,
        cols_to_vars=None,
        from_template=True)

  @property
  def name(self):
    return self._name

  @property
  def non_trainable_variables(self):
    return self._input_layer_template.non_trainable_variables

  @property
  def non_trainable_weights(self):
    return self._input_layer_template.non_trainable_weights

  @property
  def trainable_variables(self):
    return self._input_layer_template.trainable_variables

  @property
  def trainable_weights(self):
    return self._input_layer_template.trainable_weights

  @property
  def variables(self):
    return self._input_layer_template.variables

  @property
  def weights(self):
    return self._input_layer_template.weights
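
# A minimal usage sketch (illustrative; the feature columns are elided):
# because `InputLayer` is backed by `make_template`, calling the same instance
# on different feature dicts reuses one set of variables:
#
#   layer = InputLayer(feature_columns=[...])
#   train_input = layer(train_features)
#   eval_input = layer(eval_features)  # reuses the same weight variables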


@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export(v1=['feature_column.linear_model'])
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def linear_model(features,
                 feature_columns,
                 units=1,
                 sparse_combiner='sum',
                 weight_collections=None,
                 trainable=True,
                 cols_to_vars=None):
  """Returns a linear prediction `Tensor` based on given `feature_columns`.

  This function generates a weighted sum based on output dimension `units`.
  Weighted sum refers to logits in classification problems. It refers to the
  prediction itself for linear regression problems.

  Note on supported columns: `linear_model` treats categorical columns as
  `indicator_column`s. To be specific, assume the input as `SparseTensor`
  looks like:

  ```python
  shape = [2, 2]
  {
      [0, 0]: "a"
      [1, 0]: "b"
      [1, 1]: "c"
  }
  ```
  `linear_model` assigns weights for the presence of "a", "b", "c" implicitly,
  just like `indicator_column`, while `input_layer` explicitly requires
  wrapping each categorical column with an `embedding_column` or an
  `indicator_column`.

  Example of usage:

  ```python
  price = numeric_column('price')
  price_buckets = bucketized_column(price, boundaries=[0., 10., 100., 1000.])
  keywords = categorical_column_with_hash_bucket("keywords", 10000)
  keywords_price = crossed_column(['keywords', price_buckets], ...)
  columns = [price_buckets, keywords, keywords_price, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  prediction = linear_model(features, columns)
  ```

  The `sparse_combiner` argument works as follows. For example, for two
  features represented as the categorical columns:

  ```python
  # Feature 1

  shape = [2, 2]
  {
      [0, 0]: "a"
      [0, 1]: "b"
      [1, 0]: "c"
  }

  # Feature 2

  shape = [2, 3]
  {
      [0, 0]: "d"
      [1, 0]: "e"
      [1, 1]: "f"
      [1, 2]: "f"
  }
  ```

  with `sparse_combiner` set to "mean", the resulting linear model outputs
  are:

  ```
  y_0 = 1.0 / 2.0 * ( w_a + w_b ) + w_d + b
  y_1 = w_c + 1.0 / 3.0 * ( w_e + 2.0 * w_f ) + b
  ```

  where `y_i` is the output, `b` is the bias, and `w_x` is the weight
  assigned to the presence of `x` in the input features.
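
  As an illustrative numeric check (with hypothetical weights, not values
  produced by the library): taking `w_a = 1.0`, `w_b = 3.0`, `w_d = 5.0` and
  `b = 0.0`, the first output above is
  `y_0 = 0.5 * (1.0 + 3.0) + 5.0 + 0.0 = 7.0`.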


  Args:
    features: A mapping from key to tensors. `_FeatureColumn`s look up via
      these keys. For example `numeric_column('price')` will look at the
      'price' key in this dict. Values are `Tensor` or `SparseTensor`
      depending on the corresponding `_FeatureColumn`.
    feature_columns: An iterable containing the FeatureColumns to use as
      inputs to your model. All items should be instances of classes derived
      from `_FeatureColumn`.
    units: An integer, dimensionality of the output space. Default value is 1.
    sparse_combiner: A string specifying how to reduce if a categorical column
      is multivalent. Except for `numeric_column`, almost all columns passed
      to `linear_model` are treated as categorical columns. It combines each
      categorical column independently. Currently "mean", "sqrtn" and "sum"
      are supported, with "sum" the default for linear model. "sqrtn" often
      achieves good accuracy, in particular with bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
    weight_collections: A list of collection names to which the Variable will
      be added. Note that variables will also be added to collections
      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
    trainable: If `True` also add the variable to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    cols_to_vars: If not `None`, must be a dictionary that will be filled with
      a mapping from `_FeatureColumn` to the associated list of `Variable`s.
      For example, after the call, we might have
      cols_to_vars = {
          _NumericColumn(key='numeric_feature1', shape=(1,)):
              [<tf.Variable 'linear_model/price2/weights:0' shape=(1, 1)>],
          'bias': [<tf.Variable 'linear_model/bias_weights:0' shape=(1,)>],
          _NumericColumn(key='numeric_feature2', shape=(2,)):
              [<tf.Variable 'linear_model/price1/weights:0' shape=(2, 1)>]}
      If a column creates no variables, its value will be an empty list. Note
      that cols_to_vars will also contain a string key 'bias' that maps to a
      list of Variables.

  Returns:
    A `Tensor` which represents predictions/logits of a linear model. Its
    shape is (batch_size, units) and its dtype is `float32`.

  Raises:
    ValueError: if an item in `feature_columns` is neither a `_DenseColumn`
      nor `_CategoricalColumn`.
  """
  with variable_scope.variable_scope(None, 'linear_model') as vs:
    model_name = _strip_leading_slashes(vs.name)
  linear_model_layer = _LinearModel(
      feature_columns=feature_columns,
      units=units,
      sparse_combiner=sparse_combiner,
      weight_collections=weight_collections,
      trainable=trainable,
      name=model_name)
  retval = linear_model_layer(features)  # pylint: disable=not-callable
  if cols_to_vars is not None:
    cols_to_vars.update(linear_model_layer.cols_to_vars())
  return retval
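
# A minimal usage sketch (illustrative; `features` and `columns` are as in the
# docstring example above): retrieving per-column weight variables after the
# call, including the 'bias' entry documented above.
#
#   cols_to_vars = {}
#   prediction = linear_model(features, columns, cols_to_vars=cols_to_vars)
#   bias_vars = cols_to_vars['bias']  # list containing the bias variable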


def _add_to_collections(var, weight_collections):
  """Adds a var to the list of weight_collections provided.

  Handles the case for partitioned and non-partitioned variables.

  Args:
    var: A variable or Partitioned Variable.
    weight_collections: List of collections to add variable to.
  """
  for weight_collection in weight_collections:
    # The layer self.add_variable call already adds it to GLOBAL_VARIABLES.
    if weight_collection == ops.GraphKeys.GLOBAL_VARIABLES:
      continue
    # TODO(rohanj): Explore adding a _get_variable_list method on `Variable`
    # so that we don't have to do this check.
    if isinstance(var, variables.PartitionedVariable):
      for constituent_var in list(var):
        ops.add_to_collection(weight_collection, constituent_var)
    else:
      ops.add_to_collection(weight_collection, var)


class _FCLinearWrapper(base.Layer):
  """Wraps a _FeatureColumn in a layer for use in a linear model.

  See `linear_model` above.
  """

  def __init__(self,
               feature_column,
               units=1,
               sparse_combiner='sum',
               weight_collections=None,
               trainable=True,
               name=None,
               **kwargs):
    super(_FCLinearWrapper, self).__init__(
        trainable=trainable, name=name, **kwargs)
    self._feature_column = feature_column
    self._units = units
    self._sparse_combiner = sparse_combiner
    self._weight_collections = weight_collections

  def build(self, _):
    if isinstance(self._feature_column, _CategoricalColumn):
      weight = self.add_variable(
          name='weights',
          shape=(self._feature_column._num_buckets, self._units),  # pylint: disable=protected-access
          initializer=init_ops.zeros_initializer(),
          trainable=self.trainable)
    else:
      num_elements = self._feature_column._variable_shape.num_elements()  # pylint: disable=protected-access
      weight = self.add_variable(
          name='weights',
          shape=[num_elements, self._units],
          initializer=init_ops.zeros_initializer(),
          trainable=self.trainable)
    _add_to_collections(weight, self._weight_collections)
    self._weight_var = weight
    self.built = True

  def call(self, builder):
    weighted_sum = _create_weighted_sum(
        column=self._feature_column,
        builder=builder,
        units=self._units,
        sparse_combiner=self._sparse_combiner,
        weight_collections=self._weight_collections,
        trainable=self.trainable,
        weight_var=self._weight_var)
    return weighted_sum


class _BiasLayer(base.Layer):
  """A layer for the bias term."""

  def __init__(self,
               units=1,
               trainable=True,
               weight_collections=None,
               name=None,
               **kwargs):
    super(_BiasLayer, self).__init__(trainable=trainable, name=name, **kwargs)
    self._units = units
    self._weight_collections = weight_collections

  def build(self, _):
    self._bias_variable = self.add_variable(
        'bias_weights',
        shape=[self._units],
        initializer=init_ops.zeros_initializer(),
        trainable=self.trainable)
    _add_to_collections(self._bias_variable, self._weight_collections)
    self.built = True

  def call(self, _):
    return self._bias_variable


def _get_expanded_variable_list(variable):
  if (isinstance(variable, variables.Variable) or
      resource_variable_ops.is_resource_variable(variable)):
    return [variable]  # Single variable case.
  else:  # Must be a PartitionedVariable, so convert into a list.
    return list(variable)


def _strip_leading_slashes(name):
  # Keep only the last path component, e.g. 'linear_model/scope' -> 'scope'.
  return name.rsplit('/', 1)[-1]


class _LinearModel(base.Layer):
  """Creates a linear model using feature columns.

  See `linear_model` for details.
  """

  def __init__(self,
               feature_columns,
               units=1,
               sparse_combiner='sum',
               weight_collections=None,
               trainable=True,
               name=None,
               **kwargs):
    super(_LinearModel, self).__init__(name=name, **kwargs)
    # We force the keras_style to be True here, as a workaround for not being
    # able to inherit from keras.layers.Layer as the base class. Setting this
    # will let us skip all the legacy behavior for base.Layer.
    # Also note that we use Layer as the base class, instead of Model, since
    # no Model-specific behavior is used, e.g. compile/fit.
    self._keras_style = True
    self._feature_columns = _normalize_feature_columns(feature_columns)
    self._weight_collections = list(weight_collections or [])
    if ops.GraphKeys.GLOBAL_VARIABLES not in self._weight_collections:
      self._weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
    if ops.GraphKeys.MODEL_VARIABLES not in self._weight_collections:
      self._weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)

    column_layers = {}
    for column in sorted(self._feature_columns, key=lambda x: x.name):
      with variable_scope.variable_scope(
          None, default_name=column._var_scope_name) as vs:  # pylint: disable=protected-access
        # Having the fully expressed variable scope name ends up doubly
        # expressing the outer scope (scope with which this method was called)
        # in the name of the variable that would get created.
        column_name = _strip_leading_slashes(vs.name)
        column_layer = _FCLinearWrapper(column, units, sparse_combiner,
                                        self._weight_collections, trainable,
                                        column_name, **kwargs)
        column_layers[column_name] = column_layer
    self._column_layers = self._add_layers(column_layers)
    self._bias_layer = _BiasLayer(
        units=units,
        trainable=trainable,
        weight_collections=self._weight_collections,
        name='bias_layer',
        **kwargs)
    self._cols_to_vars = {}

  def cols_to_vars(self):
    """Returns a dict mapping _FeatureColumns to variables.

    See `linear_model` for more information.
    This is not populated until `call` is invoked, i.e. until the layer is
    built.
    """
    return self._cols_to_vars

  def call(self, features):
    with variable_scope.variable_scope(self.name):
      for column in self._feature_columns:
        if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
          raise ValueError(
              'Items of feature_columns must be either a '
              '_DenseColumn or _CategoricalColumn. Given: {}'.format(column))
      weighted_sums = []
      ordered_columns = []
      builder = _LazyBuilder(features)
      for layer in sorted(self._column_layers.values(), key=lambda x: x.name):
        column = layer._feature_column  # pylint: disable=protected-access
        ordered_columns.append(column)
        weighted_sum = layer(builder)
        weighted_sums.append(weighted_sum)
        self._cols_to_vars[column] = ops.get_collection(
            ops.GraphKeys.GLOBAL_VARIABLES, scope=layer.scope_name)

      _verify_static_batch_size_equality(weighted_sums, ordered_columns)
      predictions_no_bias = math_ops.add_n(
          weighted_sums, name='weighted_sum_no_bias')
      predictions = nn_ops.bias_add(
          predictions_no_bias,
          self._bias_layer(  # pylint: disable=not-callable
              builder,
              scope=variable_scope.get_variable_scope()),  # pylint: disable=not-callable
          name='weighted_sum')
      bias = self._bias_layer.variables[0]
      self._cols_to_vars['bias'] = _get_expanded_variable_list(bias)
    return predictions

  def _add_layers(self, layers):
    # "Magic" required for keras.Model classes to track all the variables in
    # a list of layers.Layer objects.
    # TODO(ashankar): Figure out API so user code doesn't have to do this.
    for name, layer in layers.items():
      setattr(self, 'layer-%s' % name, layer)
    return layers


def _transform_features(features, feature_columns):
  """Returns transformed features based on the feature columns passed in.

  Note that you most likely will not need to use this function directly;
  check `input_layer` and `linear_model` first to see whether they satisfy
  your use case.

  Example:

  ```python
  # Define features and transformations
  crosses_a_x_b = crossed_column(
      columns=["sparse_feature_a", "sparse_feature_b"], hash_bucket_size=10000)
  price_buckets = bucketized_column(
      source_column=numeric_column("price"), boundaries=[...])

  columns = [crosses_a_x_b, price_buckets]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  transformed = transform_features(features=features, feature_columns=columns)

  assertCountEqual(columns, transformed.keys())
  ```

  Args:
    features: A mapping from key to tensors. `_FeatureColumn`s look up via
      these keys. For example `numeric_column('price')` will look at the
      'price' key in this dict. Values can be a `SparseTensor` or a `Tensor`
      depending on the corresponding `_FeatureColumn`.
    feature_columns: An iterable containing all the `_FeatureColumn`s.

  Returns:
    A `dict` mapping `_FeatureColumn` to `Tensor` and `SparseTensor` values.
  """
  feature_columns = _normalize_feature_columns(feature_columns)
  outputs = {}
  with ops.name_scope(
      None, default_name='transform_features', values=features.values()):
    builder = _LazyBuilder(features)
    for column in sorted(feature_columns, key=lambda x: x.name):
      with ops.name_scope(None, default_name=column.name):
        outputs[column] = builder.get(column)
  return outputs


@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export(v1=['feature_column.make_parse_example_spec'])
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def make_parse_example_spec(feature_columns):
  """Creates a parsing spec dictionary from the input feature_columns.

  The returned dictionary can be used as the arg 'features' in
  `tf.io.parse_example`.

  Typical usage example:

  ```python
  # Define features and transformations
  feature_a = categorical_column_with_vocabulary_file(...)
  feature_b = numeric_column(...)
  feature_c_bucketized = bucketized_column(numeric_column("feature_c"), ...)
  feature_a_x_feature_c = crossed_column(
      columns=["feature_a", feature_c_bucketized], ...)

  feature_columns = set(
      [feature_b, feature_c_bucketized, feature_a_x_feature_c])
  features = tf.io.parse_example(
      serialized=serialized_examples,
      features=make_parse_example_spec(feature_columns))
  ```

  For the above example, make_parse_example_spec would return the dict:

  ```python
  {
      "feature_a": parsing_ops.VarLenFeature(tf.string),
      "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32),
      "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32)
  }
  ```

  Args:
    feature_columns: An iterable containing all feature columns. All items
      should be instances of classes derived from `_FeatureColumn`.

  Returns:
    A dict mapping each feature key to a `FixedLenFeature` or `VarLenFeature`
    value.

  Raises:
    ValueError: If any of the given `feature_columns` is not a
      `_FeatureColumn` instance.
  """
  result = {}
  for column in feature_columns:
    if not isinstance(column, _FeatureColumn):
      raise ValueError('All feature_columns must be _FeatureColumn instances. '
                       'Given: {}'.format(column))
    config = column._parse_example_spec  # pylint: disable=protected-access
    for key, value in six.iteritems(config):
      if key in result and value != result[key]:
        raise ValueError('feature_columns contain different parse_spec for '
                         'key {}. Given {} and {}'.format(
                             key, value, result[key]))
    result.update(config)
  return result
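
# An illustrative note on the conflict check above (a hedged sketch): two
# columns that share a key must imply the same parse spec. For example, a
# hypothetical `numeric_column('x')` (a FixedLenFeature) combined with
# `categorical_column_with_hash_bucket('x', 10)` (a VarLenFeature) would
# trigger the ValueError above for key 'x'.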


def _embedding_column(categorical_column,
                      dimension,
                      combiner='mean',
                      initializer=None,
                      ckpt_to_load_from=None,
                      tensor_name_in_ckpt=None,
                      max_norm=None,
                      trainable=True,
                      use_safe_embedding_lookup=True):
  """`_DenseColumn` that converts from sparse, categorical input.

  Use this when your inputs are sparse, but you want to convert them to a
  dense representation (e.g., to feed to a DNN).

  Inputs must be a `_CategoricalColumn` created by any of the
  `categorical_column_*` functions. Here is an example of using
  `embedding_column` with `DNNClassifier`:

  ```python
  video_id = categorical_column_with_identity(
      key='video_id', num_buckets=1000000, default_value=0)
  columns = [embedding_column(video_id, 9),...]

  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)

  label_column = ...
  def input_fn():
    features = tf.io.parse_example(
        ..., features=make_parse_example_spec(columns + [label_column]))
    labels = features.pop(label_column.name)
    return features, labels

  estimator.train(input_fn=input_fn, steps=100)
  ```

  Here is an example of using `embedding_column` with a model_fn:

  ```python
  def model_fn(features, ...):
    video_id = categorical_column_with_identity(
        key='video_id', num_buckets=1000000, default_value=0)
    columns = [embedding_column(video_id, 9),...]
    dense_tensor = input_layer(features, columns)
    # Form DNN layers, calculate loss, and return EstimatorSpec.
    ...
  ```

  Args:
    categorical_column: A `_CategoricalColumn` created by a
      `categorical_column_with_*` function. This column produces the sparse
      IDs that are inputs to the embedding lookup.
    dimension: An integer specifying the dimension of the embedding; must
      be > 0.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an
      example-level normalization on the column. For more information, see
      `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `tf.compat.v1.truncated_normal_initializer` with mean `0.0` and standard
      deviation `1/sqrt(dimension)`.
    ckpt_to_load_from: String representing checkpoint name/pattern from which
      to restore column weights. Required if `tensor_name_in_ckpt` is not
      `None`.
    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
      which to restore the column weights. Required if `ckpt_to_load_from` is
      not `None`.
    max_norm: If not `None`, embedding values are l2-normalized to this value.
    trainable: Whether or not the embedding is trainable. Default is True.
    use_safe_embedding_lookup: If true, uses safe_embedding_lookup_sparse
      instead of embedding_lookup_sparse. safe_embedding_lookup_sparse ensures
      there are no empty rows and that all weights and ids are positive, at
      the expense of extra compute cost. This only applies to rank 2 (NxM)
      shaped input tensors. Defaults to true; consider turning it off if the
      above checks are not needed. Note that having empty rows will not
      trigger an error, though the output for those rows may be 0 or omitted.

  Returns:
    `_DenseColumn` that converts from sparse input.

  Raises:
    ValueError: if `dimension` not > 0.
    ValueError: if exactly one of `ckpt_to_load_from` and
      `tensor_name_in_ckpt` is specified.
    ValueError: if `initializer` is specified and is not callable.
    RuntimeError: If eager execution is enabled.
  """
  if (dimension is None) or (dimension < 1):
    raise ValueError('Invalid dimension {}.'.format(dimension))
  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
    raise ValueError('Must specify both `ckpt_to_load_from` and '
                     '`tensor_name_in_ckpt` or none of them.')

  if (initializer is not None) and (not callable(initializer)):
    raise ValueError('initializer must be callable if specified. '
                     'Embedding of column_name: {}'.format(
                         categorical_column.name))
  if initializer is None:
    initializer = init_ops.truncated_normal_initializer(
        mean=0.0, stddev=1 / math.sqrt(dimension))

  embedding_shape = categorical_column._num_buckets, dimension  # pylint: disable=protected-access

  def _creator(weight_collections, scope):
    embedding_column_layer = _EmbeddingColumnLayer(
        embedding_shape=embedding_shape,
        initializer=initializer,
        weight_collections=weight_collections,
        trainable=trainable,
        name='embedding_column_layer')
    return embedding_column_layer(None, scope=scope)  # pylint: disable=not-callable

  return _EmbeddingColumn(
      categorical_column=categorical_column,
      dimension=dimension,
      combiner=combiner,
      layer_creator=_creator,
      ckpt_to_load_from=ckpt_to_load_from,
      tensor_name_in_ckpt=tensor_name_in_ckpt,
      max_norm=max_norm,
      trainable=trainable,
      use_safe_embedding_lookup=use_safe_embedding_lookup)
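
# A minimal usage sketch (illustrative, using the module-private helpers
# defined in this file): the default initializer above scales with the
# embedding size, e.g. for dimension=9 the stddev is 1/sqrt(9) = 1/3.
#
#   video_id = _categorical_column_with_identity(
#       key='video_id', num_buckets=1000000, default_value=0)
#   embedded_video_id = _embedding_column(video_id, dimension=9)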


def _numeric_column(key,
                    shape=(1,),
                    default_value=None,
                    dtype=dtypes.float32,
                    normalizer_fn=None):
  """Represents real-valued or numerical features.

  Example:

  ```python
  price = numeric_column('price')
  columns = [price, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)

  # or
  bucketized_price = bucketized_column(price, boundaries=[...])
  columns = [bucketized_price, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    shape: An iterable of integers specifying the shape of the `Tensor`. An
      integer can be given, which means a single-dimension `Tensor` with the
      given width. The `Tensor` representing the column will have the shape
      of [batch_size] + `shape`.
    default_value: A single value compatible with `dtype` or an iterable of
      values compatible with `dtype` which the column takes on during
      `tf.Example` parsing if data is missing. A default value of `None` will
      cause `tf.io.parse_example` to fail if an example does not contain this
      column. If a single value is provided, the same value will be applied
      as the default value for every item. If an iterable of values is
      provided, the shape of the `default_value` should be equal to the given
      `shape`.
    dtype: defines the type of values. Default value is `tf.float32`. Must be
      a non-quantized, real integer or floating point type.
    normalizer_fn: If not `None`, a function that can be used to normalize
      the value of the tensor after `default_value` is applied for parsing.
      The normalizer function takes the input `Tensor` as its argument, and
      returns the output `Tensor` (e.g. lambda x: (x - 3.0) / 4.2). Please
      note that even though the most common use case of this function is
      normalization, it can be used for any kind of TensorFlow
      transformation.

  Returns:
    A `_NumericColumn`.

  Raises:
    TypeError: if any dimension in shape is not an int
    ValueError: if any dimension in shape is not a positive integer
    TypeError: if `default_value` is an iterable but not compatible with
      `shape`
    TypeError: if `default_value` is not compatible with `dtype`.
    ValueError: if `dtype` is not convertible to `tf.float32`.
  """
  shape = _check_shape(shape, key)
  if not (dtype.is_integer or dtype.is_floating):
    raise ValueError('dtype must be convertible to float. '
                     'dtype: {}, key: {}'.format(dtype, key))
  default_value = fc_utils.check_default_value(shape, default_value, dtype,
                                               key)

  if normalizer_fn is not None and not callable(normalizer_fn):
    raise TypeError(
        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))

  fc_utils.assert_key_is_string(key)
  return _NumericColumn(
      key,
      shape=shape,
      default_value=default_value,
      dtype=dtype,
      normalizer_fn=normalizer_fn)
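
# A minimal usage sketch (illustrative): a standardizing `normalizer_fn` for
# a 'price' feature. `PRICE_MEAN` and `PRICE_STD` are hypothetical
# precomputed statistics, not values defined in this module.
#
#   price = _numeric_column(
#       'price', normalizer_fn=lambda x: (x - PRICE_MEAN) / PRICE_STD)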


def _bucketized_column(source_column, boundaries):
  """Represents discretized dense input.

  Buckets include the left boundary, and exclude the right boundary. Namely,
  `boundaries=[0., 1., 2.]` generates buckets `(-inf, 0.)`, `[0., 1.)`,
  `[1., 2.)`, and `[2., +inf)`.

  For example, if the inputs are

  ```python
  boundaries = [0, 10, 100]
  input tensor = [[-5, 10000],
                  [150, 10],
                  [5, 100]]
  ```

  then the output will be

  ```python
  output = [[0, 3],
            [3, 2],
            [1, 3]]
  ```

  Example:

  ```python
  price = numeric_column('price')
  bucketized_price = bucketized_column(price, boundaries=[...])
  columns = [bucketized_price, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)

  # or
  columns = [bucketized_price, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  A `bucketized_column` can also be crossed with another categorical column
  using `crossed_column`:

  ```python
  price = numeric_column('price')
  # bucketized_column converts a numerical feature into a categorical one.
  bucketized_price = bucketized_column(price, boundaries=[...])
  # 'keywords' is a string feature.
  price_x_keywords = crossed_column([bucketized_price, 'keywords'], 50000)
  columns = [price_x_keywords, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Args:
    source_column: A one-dimensional dense column which is generated with
      `numeric_column`.
    boundaries: A sorted list or tuple of floats specifying the boundaries.

  Returns:
    A `_BucketizedColumn`.

  Raises:
    ValueError: If `source_column` is not a numeric column, or if it is not
      one-dimensional.
    ValueError: If `boundaries` is not a sorted list or tuple.
  """
  if not isinstance(source_column, _NumericColumn):
    raise ValueError(
        'source_column must be a column generated with numeric_column(). '
        'Given: {}'.format(source_column))
  if len(source_column.shape) > 1:
    raise ValueError('source_column must be one-dimensional column. '
                     'Given: {}'.format(source_column))
  if (not boundaries or
      not (isinstance(boundaries, list) or isinstance(boundaries, tuple))):
    raise ValueError('boundaries must be a sorted list.')
  for i in range(len(boundaries) - 1):
    if boundaries[i] >= boundaries[i + 1]:
      raise ValueError('boundaries must be a sorted list.')
  return _BucketizedColumn(source_column, tuple(boundaries))


def _categorical_column_with_hash_bucket(key,
                                         hash_bucket_size,
                                         dtype=dtypes.string):
  """Represents a sparse feature where ids are set by hashing.

  Use this when your sparse features are in string or integer format, and you
  want to distribute your inputs into a finite number of buckets by hashing:
  `output_id = Hash(input_feature_string) % bucket_size` for string-type
  input. For int-type input, the value is first converted to its string
  representation and then hashed by the same formula.
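
  For illustration (the exact bucket depends on the underlying fingerprint
  function, so this is a hedged sketch rather than a guaranteed value): with
  `hash_bucket_size=10`, an input string 'sports' is deterministically mapped
  to some fixed ID in `[0, 10)` on every call.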


  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for
  int and `''` for string, which will be dropped by this feature column.

  Example:

  ```python
  keywords = categorical_column_with_hash_bucket("keywords", 10000)
  columns = [keywords, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)

  # or
  keywords_embedded = embedding_column(keywords, 16)
  columns = [keywords_embedded, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    hash_bucket_size: An int > 1. The number of buckets.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `_HashedCategoricalColumn`.

  Raises:
    ValueError: `hash_bucket_size` is not greater than 1.
    ValueError: `dtype` is neither string nor integer.
  """
  if hash_bucket_size is None:
    raise ValueError('hash_bucket_size must be set. key: {}'.format(key))

  if hash_bucket_size < 1:
    raise ValueError('hash_bucket_size must be at least 1. '
                     'hash_bucket_size: {}, key: {}'.format(
                         hash_bucket_size, key))

  fc_utils.assert_key_is_string(key)
  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))

  return _HashedCategoricalColumn(key, hash_bucket_size, dtype)


def _categorical_column_with_vocabulary_file(key,
                                             vocabulary_file,
                                             vocabulary_size=None,
                                             num_oov_buckets=0,
                                             default_value=None,
                                             dtype=dtypes.string):
  """A `_CategoricalColumn` with a vocabulary file.

  Use this when your inputs are in string or integer format, and you have a
  vocabulary file that maps each value to an integer ID. By default,
  out-of-vocabulary values are ignored. Use either (but not both) of
  `num_oov_buckets` and `default_value` to specify how to include
  out-of-vocabulary values.

  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for
  int and `''` for string, which will be dropped by this feature column.

  Example with `num_oov_buckets`:
  File '/us/states.txt' contains 50 lines, each with a 2-character U.S. state
  abbreviation. All inputs with values in that file are assigned an ID 0-49,
  corresponding to their line numbers. All other values are hashed and
  assigned an ID 50-54.

  ```python
  states = categorical_column_with_vocabulary_file(
      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
      num_oov_buckets=5)
  columns = [states, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Example with `default_value`:
  File '/us/states.txt' contains 51 lines - the first line is 'XX', and the
  other 50 each have a 2-character U.S. state abbreviation. Both a literal
  'XX' in input, and other values missing from the file, will be assigned
  ID 0. All others are assigned the corresponding line number 1-50.

  ```python
  states = categorical_column_with_vocabulary_file(
      key='states', vocabulary_file='/us/states.txt', vocabulary_size=51,
      default_value=0)
  columns = [states, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  And to make an embedding with either:

  ```python
  columns = [embedding_column(states, 3),...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    vocabulary_file: The vocabulary file name.
    vocabulary_size: Number of the elements in the vocabulary. This must be
      no greater than the length of `vocabulary_file`; if less, later values
      are ignored. If None, it is set to the length of `vocabulary_file`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
      `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
      the input value. A positive `num_oov_buckets` can not be specified with
      `default_value`.
    default_value: The integer ID value to return for out-of-vocabulary
      feature values, defaults to `-1`. This can not be specified with a
      positive `num_oov_buckets`.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `_CategoricalColumn` with a vocabulary file.

  Raises:
    ValueError: `vocabulary_file` is missing or cannot be opened.
    ValueError: `vocabulary_size` is missing or < 1.
    ValueError: `num_oov_buckets` is a negative integer.
    ValueError: `num_oov_buckets` and `default_value` are both specified.
    ValueError: `dtype` is neither string nor integer.
  """
  if not vocabulary_file:
    raise ValueError('Missing vocabulary_file in {}.'.format(key))

  if vocabulary_size is None:
    if not gfile.Exists(vocabulary_file):
      raise ValueError('vocabulary_file in {} does not exist.'.format(key))

    with gfile.GFile(vocabulary_file) as f:
      vocabulary_size = sum(1 for _ in f)
    logging.info(
        'vocabulary_size = %d in %s is inferred from the number of elements '
        'in the vocabulary_file %s.', vocabulary_size, key, vocabulary_file)

  # `vocabulary_size` isn't required for lookup, but it is for `_num_buckets`.
  if vocabulary_size < 1:
    raise ValueError('Invalid vocabulary_size in {}.'.format(key))
  if num_oov_buckets:
    if default_value is not None:
      raise ValueError(
          'Can\'t specify both num_oov_buckets and default_value in {}.'
          .format(key))
    if num_oov_buckets < 0:
      raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
          num_oov_buckets, key))
  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
  fc_utils.assert_key_is_string(key)
  return _VocabularyFileCategoricalColumn(
      key=key,
      vocabulary_file=vocabulary_file,
      vocabulary_size=vocabulary_size,
      num_oov_buckets=0 if num_oov_buckets is None else num_oov_buckets,
      default_value=-1 if default_value is None else default_value,
      dtype=dtype)


def _categorical_column_with_vocabulary_list(key,
                                             vocabulary_list,
                                             dtype=None,
                                             default_value=-1,
                                             num_oov_buckets=0):
  """A `_CategoricalColumn` with an in-memory vocabulary.

  Use this when your inputs are in string or integer format, and you have an
  in-memory vocabulary mapping each value to an integer ID. By default,
  out-of-vocabulary values are ignored. Use either (but not both) of
  `num_oov_buckets` and `default_value` to specify how to include
  out-of-vocabulary values.

  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for
  int and `''` for string, which will be dropped by this feature column.

  Example with `num_oov_buckets`:
  In the following example, each input in `vocabulary_list` is assigned an ID
  0-3 corresponding to its index (e.g., input 'B' produces output 2). All
  other inputs are hashed and assigned an ID 4-5.

  ```python
  colors = categorical_column_with_vocabulary_list(
      key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
      num_oov_buckets=2)
  columns = [colors, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Example with `default_value`:
  In the following example, each input in `vocabulary_list` is assigned an ID
  0-4 corresponding to its index (e.g., input 'B' produces output 3). All
  other inputs are assigned `default_value` 0.

  ```python
  colors = categorical_column_with_vocabulary_list(
      key='colors', vocabulary_list=('X', 'R', 'G', 'B', 'Y'), default_value=0)
  columns = [colors, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  And to make an embedding with either:

  ```python
  columns = [embedding_column(colors, 3),...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    vocabulary_list: An ordered iterable defining the vocabulary. Each
      feature is mapped to the index of its value (if present) in
      `vocabulary_list`. Must be castable to `dtype`.
    dtype: The type of features. Only string and integer types are supported.
      If `None`, it will be inferred from `vocabulary_list`.
    default_value: The integer ID value to return for out-of-vocabulary
      feature values, defaults to `-1`. This can not be specified with a
      positive `num_oov_buckets`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
      `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on
      a hash of the input value. A positive `num_oov_buckets` can not be
      specified with `default_value`.

  Returns:
    A `_CategoricalColumn` with in-memory vocabulary.

  Raises:
    ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
    ValueError: `num_oov_buckets` is a negative integer.
    ValueError: `num_oov_buckets` and `default_value` are both specified.
    ValueError: if `dtype` is not integer or string.
  """
  if (vocabulary_list is None) or (len(vocabulary_list) < 1):
    raise ValueError(
        'vocabulary_list {} must be non-empty, column_name: {}'.format(
            vocabulary_list, key))
  if len(set(vocabulary_list)) != len(vocabulary_list):
    raise ValueError(
        'Duplicate keys in vocabulary_list {}, column_name: {}'.format(
            vocabulary_list, key))
  vocabulary_dtype = dtypes.as_dtype(np.array(vocabulary_list).dtype)
  if num_oov_buckets:
    if default_value != -1:
      raise ValueError(
          'Can\'t specify both num_oov_buckets and default_value in {}.'
          .format(key))
    if num_oov_buckets < 0:
      raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
          num_oov_buckets, key))
  fc_utils.assert_string_or_int(
      vocabulary_dtype, prefix='column_name: {} vocabulary'.format(key))
  if dtype is None:
    dtype = vocabulary_dtype
  elif dtype.is_integer != vocabulary_dtype.is_integer:
    raise ValueError(
        'dtype {} and vocabulary dtype {} do not match, column_name: '
        '{}'.format(dtype, vocabulary_dtype, key))
  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
  fc_utils.assert_key_is_string(key)

  return _VocabularyListCategoricalColumn(
      key=key,
      vocabulary_list=tuple(vocabulary_list),
      dtype=dtype,
      default_value=default_value,
      num_oov_buckets=num_oov_buckets)
1407 

1408 

1409def _categorical_column_with_identity(key, num_buckets, default_value=None): 

1410 """A `_CategoricalColumn` that returns identity values. 

1411 

1412 Use this when your inputs are integers in the range `[0, num_buckets)`, and 

1413 you want to use the input value itself as the categorical ID. Values outside 

1414 this range will result in `default_value` if specified, otherwise it will 

1415 fail. 

1416 

1417 Typically, this is used for contiguous ranges of integer indexes, but 

1418 it doesn't have to be. This might be inefficient, however, if many IDs 

1419 are unused. Consider `categorical_column_with_hash_bucket` in that case. 

1420 

1421 For input dictionary `features`, `features[key]` is either `Tensor` or 

1422 `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int 

1423 and `''` for string, which will be dropped by this feature column. 

1424 

1425 In the following examples, each input in the range `[0, 1000000)` is assigned 

1426 an ID equal to its own value. All other inputs are assigned `default_value` 0. 

1427 Note that a literal 0 in the inputs is indistinguishable from the default ID. 

1428 

1429 Linear model: 

1430 

1431 ```python 

1432 video_id = categorical_column_with_identity( 

1433 key='video_id', num_buckets=1000000, default_value=0) 

1434 columns = [video_id, ...] 

1435 features = tf.io.parse_example(..., features=make_parse_example_spec(columns)) 

1436 linear_prediction, _, _ = linear_model(features, columns) 

1437 ``` 

1438 

1439 Embedding for a DNN model: 

1440 

1441 ```python 

1442 columns = [embedding_column(video_id, 9),...] 

1443 features = tf.io.parse_example(..., features=make_parse_example_spec(columns)) 

1444 dense_tensor = input_layer(features, columns) 

1445 ``` 

1446 

1447 Args: 

1448 key: A unique string identifying the input feature. It is used as the column 

1449 name and the dictionary key for feature parsing configs, feature `Tensor` 

1450 objects, and feature columns. 

1451 num_buckets: Range of inputs and outputs is `[0, num_buckets)`. 

1452 default_value: If set, values outside of range `[0, num_buckets)` will be 

1453 replaced with this value. If not set, values >= num_buckets will cause a 

1454 failure while values < 0 will be dropped. 

1455 

1456 Returns: 

1457 A `_CategoricalColumn` that returns identity values. 

1458 

1459 Raises: 

1460 ValueError: if `num_buckets` is less than one. 

1461 ValueError: if `default_value` is not in range `[0, num_buckets)`. 

1462 """ 

1463 if num_buckets < 1: 

1464 raise ValueError('num_buckets {} < 1, column_name {}'.format( 

1465 num_buckets, key)) 

1466 if (default_value is not None) and ((default_value < 0) or 

1467 (default_value >= num_buckets)): 

1468 raise ValueError( 

1469 'default_value {} not in range [0, {}), column_name {}'.format( 

1470 default_value, num_buckets, key)) 

1471 fc_utils.assert_key_is_string(key) 

1472 return _IdentityCategoricalColumn( 

1473 key=key, num_buckets=num_buckets, default_value=default_value) 

1474 

1475 

1476def _indicator_column(categorical_column): 

1477 """Represents multi-hot representation of given categorical column. 

1478 

1479 - For DNN models, `indicator_column` can be used to wrap any 

1480 `categorical_column_*` (e.g., to feed to a DNN). Consider using 

1481 `embedding_column` if the number of buckets/unique values is large. 

1482 

1483 - For wide (aka linear) models, `indicator_column` is the internal 

1484 representation used when a categorical column is passed directly 

1485 (as any element in feature_columns) to `linear_model`. See 

1486 `linear_model` for details. 

1487 

1488 ```python 

1489 name = indicator_column(categorical_column_with_vocabulary_list( 

1490 'name', ['bob', 'george', 'wanda'])) 

1491 columns = [name, ...] 

1492 features = tf.io.parse_example(..., features=make_parse_example_spec(columns)) 

1493 dense_tensor = input_layer(features, columns) 

1494 

1495 dense_tensor == [[1, 0, 0]] # If "name" bytes_list is ["bob"] 

1496 dense_tensor == [[1, 0, 1]] # If "name" bytes_list is ["bob", "wanda"] 

1497 dense_tensor == [[2, 0, 0]] # If "name" bytes_list is ["bob", "bob"] 

1498 ``` 

1499 

1500 Args: 

1501 categorical_column: A `_CategoricalColumn` which is created by 

1502 `categorical_column_with_*` or `crossed_column` functions. 

1503 

1504 Returns: 

1505 An `_IndicatorColumn`. 

1506 """ 

1507 return _IndicatorColumn(categorical_column) 

1508 

1509 

1510def _weighted_categorical_column(categorical_column, 

1511 weight_feature_key, 

1512 dtype=dtypes.float32): 

1513 """Applies weight values to a `_CategoricalColumn`. 

1514 

1515 Use this when each of your sparse inputs has both an ID and a value. For 

1516 example, if you're representing text documents as a collection of word 

1517 frequencies, you can provide 2 parallel sparse input features ('terms' and 

1518 'frequencies' below). 

1519 

1520 Example: 

1521 

1522 Input `tf.Example` objects: 

1523 

1524 ```proto 

1525 [ 

1526 features { 

1527 feature { 

1528 key: "terms" 

1529 value {bytes_list {value: "very" value: "model"}} 

1530 } 

1531 feature { 

1532 key: "frequencies" 

1533 value {float_list {value: 0.3 value: 0.1}} 

1534 } 

1535 }, 

1536 features { 

1537 feature { 

1538 key: "terms" 

1539 value {bytes_list {value: "when" value: "course" value: "human"}} 

1540 } 

1541 feature { 

1542 key: "frequencies" 

1543 value {float_list {value: 0.4 value: 0.1 value: 0.2}} 

1544 } 

1545 } 

1546 ] 

1547 ``` 

1548 

1549 ```python 

1550 categorical_column = categorical_column_with_hash_bucket( 

1551 key='terms', hash_bucket_size=1000) 

1552 weighted_column = weighted_categorical_column( 

1553 categorical_column=categorical_column, weight_feature_key='frequencies') 

1554 columns = [weighted_column, ...] 

1555 features = tf.io.parse_example(..., features=make_parse_example_spec(columns)) 

1556 linear_prediction, _, _ = linear_model(features, columns) 

1557 ``` 

1558 

1559 This assumes the input dictionary contains a `SparseTensor` for key 

1560 'terms', and a `SparseTensor` for key 'frequencies'. These 2 tensors must have 

1561 the same indices and dense shape. 

1562 

1563 Args: 

1564 categorical_column: A `_CategoricalColumn` created by 

1565 `categorical_column_with_*` functions. 

1566 weight_feature_key: String key for weight values. 

1567 dtype: Type of weights, such as `tf.float32`. Only float and integer weights 

1568 are supported. 

1569 

1570 Returns: 

1571 A `_CategoricalColumn` composed of two sparse features: one represents the 

1572 ids, the other represents the weights (values) of the ids in each example. 

1573 

1574 Raises: 

1575 ValueError: if `dtype` is not convertible to float. 

1576 """ 

1577 if (dtype is None) or not (dtype.is_integer or dtype.is_floating): 

1578 raise ValueError('dtype {} is not convertible to float.'.format(dtype)) 

1579 return _WeightedCategoricalColumn( 

1580 categorical_column=categorical_column, 

1581 weight_feature_key=weight_feature_key, 

1582 dtype=dtype) 

1583 

1584 

1585def _crossed_column(keys, hash_bucket_size, hash_key=None): 

1586 """Returns a column for performing crosses of categorical features. 

1587 

1588 Crossed features are hashed according to `hash_bucket_size`. Conceptually, 

1589 the transformation can be thought of as: 

1590 Hash(cartesian product of features) % `hash_bucket_size` 

1591 

1592 For example, if the input features are: 

1593 

1594 * SparseTensor referred by first key: 

1595 

1596 ```python 

1597 shape = [2, 2] 

1598 { 

1599 [0, 0]: "a" 

1600 [1, 0]: "b" 

1601 [1, 1]: "c" 

1602 } 

1603 ``` 

1604 

1605 * SparseTensor referred by second key: 

1606 

1607 ```python 

1608 shape = [2, 1] 

1609 { 

1610 [0, 0]: "d" 

1611 [1, 0]: "e" 

1612 } 

1613 ``` 

1614 

1615 then crossed feature will look like: 

1616 

1617 ```python 

1618 shape = [2, 2] 

1619 { 

1620 [0, 0]: Hash64("d", Hash64("a")) % hash_bucket_size 

1621 [1, 0]: Hash64("e", Hash64("b")) % hash_bucket_size 

1622 [1, 1]: Hash64("e", Hash64("c")) % hash_bucket_size 

1623 } 

1624 ``` 

1625 

1626 Here is an example to create a linear model with crosses of string features: 

1627 

1628 ```python 

1629 keywords_x_doc_terms = crossed_column(['keywords', 'doc_terms'], 50000) 

1630 columns = [keywords_x_doc_terms, ...] 

1631 features = tf.io.parse_example(..., features=make_parse_example_spec(columns)) 

1632 linear_prediction = linear_model(features, columns) 

1633 ``` 

1634 

1635 You could also use vocabulary lookup before crossing: 

1636 

1637 ```python 

1638 keywords = categorical_column_with_vocabulary_file( 

1639 'keywords', '/path/to/vocabulary/file', vocabulary_size=1000) 

1640 keywords_x_doc_terms = crossed_column([keywords, 'doc_terms'], 50000) 

1641 columns = [keywords_x_doc_terms, ...] 

1642 features = tf.io.parse_example(..., features=make_parse_example_spec(columns)) 

1643 linear_prediction = linear_model(features, columns) 

1644 ``` 

1645 

1646 If an input feature is of numeric type, you can use 

1647 `categorical_column_with_identity`, or `bucketized_column`, as in the example: 

1648 

1649 ```python 

1650 # vertical_id is an integer categorical feature. 

1651 vertical_id = categorical_column_with_identity('vertical_id', 10000) 

1652 price = numeric_column('price') 

1653 # bucketized_column converts a numerical feature to a categorical one. 

1654 bucketized_price = bucketized_column(price, boundaries=[...]) 

1655 vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50000) 

1656 columns = [vertical_id_x_price, ...] 

1657 features = tf.io.parse_example(..., features=make_parse_example_spec(columns)) 

1658 linear_prediction = linear_model(features, columns) 

1659 ``` 

1660 

1661 To use a crossed column in a DNN model, you need to wrap it in an embedding 

1662 column, as in this example: 

1663 

1664 ```python 

1665 vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50000) 

1666 vertical_id_x_price_embedded = embedding_column(vertical_id_x_price, 10) 

1667 dense_tensor = input_layer(features, [vertical_id_x_price_embedded, ...]) 

1668 ``` 

1669 

1670 Args: 

1671 keys: An iterable identifying the features to be crossed. Each element can 

1672 be either: 

1673 * string: Uses the corresponding feature which must be of string type. 

1674 * `_CategoricalColumn`: Uses the transformed tensor produced by this 

1675 column. Does not support hashed categorical column. 

1676 hash_bucket_size: An int > 1. The number of buckets. 

1677 hash_key: Specify the hash_key that will be used by the `FingerprintCat64` 

1678 function to combine the fingerprints of the crossed values in `SparseCrossOp` (optional). 

1679 

1680 Returns: 

1681 A `_CrossedColumn`. 

1682 

1683 Raises: 

1684 ValueError: If `len(keys) < 2`. 

1685 ValueError: If any of the keys is neither a string nor `_CategoricalColumn`. 

1686 ValueError: If any of the keys is `_HashedCategoricalColumn`. 

1687 ValueError: If `hash_bucket_size < 1`. 

1688 """ 

1689 if not hash_bucket_size or hash_bucket_size < 1: 

1690 raise ValueError('hash_bucket_size must be at least 1. ' 

1691 'hash_bucket_size: {}'.format(hash_bucket_size)) 

1692 if not keys or len(keys) < 2: 

1693 raise ValueError( 

1694 'keys must be a list with length > 1. Given: {}'.format(keys)) 

1695 for key in keys: 

1696 if (not isinstance(key, six.string_types) and 

1697 not isinstance(key, _CategoricalColumn)): 

1698 raise ValueError( 

1699 'Unsupported key type. All keys must be either string, or ' 

1700 'categorical column except _HashedCategoricalColumn. ' 

1701 'Given: {}'.format(key)) 

1702 if isinstance(key, _HashedCategoricalColumn): 

1703 raise ValueError( 

1704 'categorical_column_with_hash_bucket is not supported for crossing. ' 

1705 'Hashing before crossing will increase probability of collision. ' 

1706 'Instead, use the feature name as a string. Given: {}'.format(key)) 

1707 return _CrossedColumn( 

1708 keys=tuple(keys), hash_bucket_size=hash_bucket_size, hash_key=hash_key) 

1709 

1710 

1711# TODO(rohanj): Clearly define semantics of this layer. 

1712class _EmbeddingColumnLayer(base.Layer): 

1713 """A layer that stores all the state required for a embedding column.""" 

1714 

1715 def __init__(self, 

1716 embedding_shape, 

1717 initializer, 

1718 weight_collections=None, 

1719 trainable=True, 

1720 name=None, 

1721 **kwargs): 

1722 """Constructor. 

1723 

1724 Args: 

1725 embedding_shape: Shape of the embedding variable used for lookup. 

1726 initializer: A variable initializer function to be used in embedding 

1727 variable initialization. 

1728 weight_collections: A list of collection names to which the Variable will 

1729 be added. Note that variables will also be added to the collections 

1730 `tf.GraphKeys.GLOBAL_VARIABLES` and `tf.GraphKeys.MODEL_VARIABLES`. 

1731 trainable: If `True` also add the variable to the graph collection 

1732 `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). 

1733 name: Name of the layer. 

1734 **kwargs: keyword named properties. 
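
Example (a minimal, illustrative sketch mirroring how the embedding column's 

layer creator uses this layer; `vocab_size` and `dimension` are assumed values): 

```python 

layer = _EmbeddingColumnLayer( 

    embedding_shape=(vocab_size, dimension), 

    initializer=init_ops.truncated_normal_initializer(stddev=1.0)) 

# Calling the layer builds the embedding variable (once) and returns it. 

embedding_weights = layer(None) 

``` 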

1735 """ 

1736 super(_EmbeddingColumnLayer, self).__init__( 

1737 trainable=trainable, name=name, **kwargs) 

1738 self._embedding_shape = embedding_shape 

1739 self._initializer = initializer 

1740 self._weight_collections = weight_collections 

1741 

1742 def set_weight_collections(self, weight_collections): 

1743 """Sets the weight collections for the layer. 

1744 

1745 Args: 

1746 weight_collections: A list of collection names to which the Variable will 

1747 be added. 

1748 """ 

1749 self._weight_collections = weight_collections 

1750 

1751 def build(self, _): 

1752 self._embedding_weight_var = self.add_variable( 

1753 name='embedding_weights', 

1754 shape=self._embedding_shape, 

1755 dtype=dtypes.float32, 

1756 initializer=self._initializer, 

1757 trainable=self.trainable) 

1758 if self._weight_collections and not context.executing_eagerly(): 

1759 _add_to_collections(self._embedding_weight_var, self._weight_collections) 

1760 self.built = True 

1761 

1762 def call(self, _): 

1763 return self._embedding_weight_var 

1764 

1765 

1766@six.add_metaclass(abc.ABCMeta) 

1767class _FeatureColumn(object): 

1768 """Represents a feature column abstraction. 

1769 

1770 WARNING: Do not subclass this layer unless you know what you are doing: 

1771 the API is subject to future changes. 

1772 

1773 To distinguish the concept of a feature family and a specific binary feature 

1774 within a family, we refer to a feature family like "country" as a feature 

1775 column. The following is an example feature in a `tf.Example` format: 

1776 {key: "country", value: [ "US" ]} 

1777 In this example the value of the feature is "US" and "country" refers to 

1778 the column of the feature. 

1779 

1780 This class is abstract. Users should not create instances of it. 

1781 """ 

1782 

1783 @abc.abstractproperty 

1784 def name(self): 

1785 """Returns string. Used for naming and for name_scope.""" 

1786 pass 

1787 

1788 def __lt__(self, other): 

1789 """Allows feature columns to be sorted in Python 3 as they are in Python 2. 

1790 

1791 Feature columns need to occasionally be sortable, for example when used as 

1792 keys in a features dictionary passed to a layer. 

1793 

1794 In CPython, `__lt__` must be defined for all objects in the 

1795 sequence being sorted. If any objects do not have an `__lt__` compatible 

1796 with feature column objects (such as strings), then CPython will fall back 

1797 to using the `__gt__` method below. 

1798 https://docs.python.org/3/library/stdtypes.html#list.sort 

1799 

1800 Args: 

1801 other: The other object to compare to. 

1802 

1803 Returns: 

1804 True if the string representation of this object is lexicographically less 

1805 than the string representation of `other`. For FeatureColumn objects, 

1806 this looks like "<__main__.FeatureColumn object at 0xa>". 

1807 """ 

1808 return str(self) < str(other) 

1809 

1810 def __gt__(self, other): 

1811 """Allows feature columns to be sorted in Python 3 as they are in Python 2. 

1812 

1813 Feature columns need to occasionally be sortable, for example when used as 

1814 keys in a features dictionary passed to a layer. 

1815 

1816 `__gt__` is called when the "other" object being compared during the sort 

1817 does not have `__lt__` defined. 

1818 Example: 

1819 ``` 

1820 # __lt__ only class 

1821 class A(): 

1822 def __lt__(self, other): return str(self) < str(other) 

1823 

1824 a = A() 

1825 a < "b" # True 

1826 "0" < a # Error 

1827 

1828 # __lt__ and __gt__ class 

1829 class B(): 

1830 def __lt__(self, other): return str(self) < str(other) 

1831 def __gt__(self, other): return str(self) > str(other) 

1832 

1833 b = B() 

1834 b < "c" # True 

1835 "0" < b # True 

1836 ``` 

1837 

1838 

1839 Args: 

1840 other: The other object to compare to. 

1841 

1842 Returns: 

1843 True if the string representation of this object is lexicographically 

1844 greater than the string representation of `other`. For FeatureColumn 

1845 objects, this looks like "<__main__.FeatureColumn object at 0xa>". 

1846 """ 

1847 return str(self) > str(other) 

1848 

1849 @property 

1850 def _var_scope_name(self): 

1851 """Returns string. Used for variable_scope. Defaults to self.name.""" 

1852 return self.name 

1853 

1854 @abc.abstractmethod 

1855 def _transform_feature(self, inputs): 

1856 """Returns intermediate representation (usually a `Tensor`). 

1857 

1858 Uses `inputs` to create an intermediate representation (usually a `Tensor`) 

1859 that other feature columns can use. 

1860 

1861 Example usage of `inputs`: 

1862 Let's say a Feature column depends on raw feature ('raw') and another 

1863 `_FeatureColumn` (input_fc). To access corresponding `Tensor`s, inputs will 

1864 be used as follows: 

1865 

1866 ```python 

1867 raw_tensor = inputs.get('raw') 

1868 fc_tensor = inputs.get(input_fc) 

1869 ``` 

1870 

1871 Args: 

1872 inputs: A `_LazyBuilder` object to access inputs. 

1873 

1874 Returns: 

1875 Transformed feature `Tensor`. 

1876 """ 

1877 pass 

1878 

1879 @abc.abstractproperty 

1880 def _parse_example_spec(self): 

1881 """Returns a `tf.Example` parsing spec as dict. 

1882 

1883 It is used as the parsing spec for `tf.io.parse_example`. The returned spec 

1884 is a dict from keys (`string`) to `VarLenFeature`, `FixedLenFeature`, and other 

1885 supported objects. Please check documentation of `tf.io.parse_example` for 

1886 all supported spec objects. 

1887 

1888 Let's say a Feature column depends on raw feature ('raw') and another 

1889 `_FeatureColumn` (input_fc). One possible implementation of 

1890 _parse_example_spec is as follows: 

1891 

1892 ```python 

1893 spec = {'raw': tf.io.FixedLenFeature(...)} 

1894 spec.update(input_fc._parse_example_spec) 

1895 return spec 

1896 ``` 

1897 """ 

1898 pass 

1899 

1900 def _reset_config(self): 

1901 """Resets the configuration in the column. 

1902 

1903 Some feature columns e.g. embedding or shared embedding columns might 

1904 have some state that is needed to be reset sometimes. Use this method 

1905 in that scenario. 

1906 """ 

1907 

1908 

1909class _DenseColumn(_FeatureColumn): 

1910 """Represents a column which can be represented as `Tensor`. 

1911 

1912 WARNING: Do not subclass this layer unless you know what you are doing: 

1913 the API is subject to future changes. 

1914 

1915 Some examples of this type are: numeric_column, embedding_column, 

1916 indicator_column. 

1917 """ 

1918 

1919 @abc.abstractproperty 

1920 def _variable_shape(self): 

1921 """`TensorShape` of `_get_dense_tensor`, without batch dimension.""" 

1922 pass 

1923 

1924 @abc.abstractmethod 

1925 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 

1926 """Returns a `Tensor`. 

1927 

1928 The output of this function will be used by model-builder-functions. For 

1929 example the pseudo code of `input_layer` will be like: 

1930 

1931 ```python 

1932 def input_layer(features, feature_columns, ...): 

1933 outputs = [fc._get_dense_tensor(...) for fc in feature_columns] 

1934 return tf.concat(outputs) 

1935 ``` 

1936 

1937 Args: 

1938 inputs: A `_LazyBuilder` object to access inputs. 

1939 weight_collections: List of graph collections to which Variables (if any 

1940 are created) are added. 

1941 trainable: If `True` also add variables to the graph collection 

1942 `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). 

1943 

1944 Returns: 

1945 `Tensor` of shape [batch_size] + `_variable_shape`. 

1946 """ 

1947 pass 

1948 

1949 

1950def _create_weighted_sum(column, 

1951 builder, 

1952 units, 

1953 sparse_combiner, 

1954 weight_collections, 

1955 trainable, 

1956 weight_var=None): 

1957 """Creates a weighted sum for a dense/categorical column for linear_model.""" 

1958 if isinstance(column, _CategoricalColumn): 

1959 return _create_categorical_column_weighted_sum( 

1960 column=column, 

1961 builder=builder, 

1962 units=units, 

1963 sparse_combiner=sparse_combiner, 

1964 weight_collections=weight_collections, 

1965 trainable=trainable, 

1966 weight_var=weight_var) 

1967 else: 

1968 return _create_dense_column_weighted_sum( 

1969 column=column, 

1970 builder=builder, 

1971 units=units, 

1972 weight_collections=weight_collections, 

1973 trainable=trainable, 

1974 weight_var=weight_var) 

1975 

1976 

1977def _create_dense_column_weighted_sum(column, 

1978 builder, 

1979 units, 

1980 weight_collections, 

1981 trainable, 

1982 weight_var=None): 

1983 """Create a weighted sum of a dense column for linear_model.""" 

1984 tensor = column._get_dense_tensor( # pylint: disable=protected-access 

1985 builder, 

1986 weight_collections=weight_collections, 

1987 trainable=trainable) 

1988 num_elements = column._variable_shape.num_elements() # pylint: disable=protected-access 

1989 batch_size = array_ops.shape(tensor)[0] 

1990 tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements)) 

1991 if weight_var is not None: 

1992 weight = weight_var 

1993 else: 

1994 weight = variable_scope.get_variable( 

1995 name='weights', 

1996 shape=[num_elements, units], 

1997 initializer=init_ops.zeros_initializer(), 

1998 trainable=trainable, 

1999 collections=weight_collections) 

2000 return math_ops.matmul(tensor, weight, name='weighted_sum') 

2001 

2002 

2003class _CategoricalColumn(_FeatureColumn): 

2004 """Represents a categorical feature. 

2005 

2006 WARNING: Do not subclass this layer unless you know what you are doing: 

2007 the API is subject to future changes. 

2008 

2009 A categorical feature is typically handled with a `tf.sparse.SparseTensor` of 

2010 IDs. 

2011 """ 

2012 

2013 IdWeightPair = collections.namedtuple( # pylint: disable=invalid-name 

2014 'IdWeightPair', ['id_tensor', 'weight_tensor']) 

2015 

2016 @abc.abstractproperty 

2017 def _num_buckets(self): 

2018 """Returns number of buckets in this sparse feature.""" 

2019 pass 

2020 

2021 @abc.abstractmethod 

2022 def _get_sparse_tensors(self, 

2023 inputs, 

2024 weight_collections=None, 

2025 trainable=None): 

2026 """Returns an IdWeightPair. 

2027 

2028 `IdWeightPair` is a pair of `SparseTensor`s which represents ids and 

2029 weights. 

2030 

2031 `IdWeightPair.id_tensor` is typically a `batch_size` x `num_buckets` 

2032 `SparseTensor` of `int64`. `IdWeightPair.weight_tensor` is either a 

2033 `SparseTensor` of `float` or `None` to indicate all weights should be 

2034 taken to be 1. If specified, `weight_tensor` must have exactly the same 

2035 shape and indices as `id_tensor`. The expected `SparseTensor` is the same 

2036 as the parsing output of a `VarLenFeature`, which is a ragged matrix. 
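
A minimal illustration (values assumed): for a batch of two examples whose 

ids are [3] and [1, 2], `id_tensor` is a `SparseTensor` with values 

[3, 1, 2], indices [[0, 0], [1, 0], [1, 1]], and dense_shape [2, 2]. 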

2037 

2038 Args: 

2039 inputs: A `LazyBuilder` as a cache to get input tensors required to create 

2040 `IdWeightPair`. 

2041 weight_collections: List of graph collections to which variables (if any 

2042 are created) are added. 

2043 trainable: If `True` also add variables to the graph collection 

2044 `GraphKeys.TRAINABLE_VARIABLES` (see `tf.compat.v1.get_variable`). 

2045 """ 

2046 pass 

2047 

2048 

2049def _create_categorical_column_weighted_sum(column, 

2050 builder, 

2051 units, 

2052 sparse_combiner, 

2053 weight_collections, 

2054 trainable, 

2055 weight_var=None): 

2056 # pylint: disable=g-doc-return-or-yield,g-doc-args 

2057 """Create a weighted sum of a categorical column for linear_model. 

2058 

2059 Note to maintainers: as an implementation detail, the weighted sum is 

2060 implemented via embedding_lookup_sparse for efficiency. Mathematically, 

2061 the two are the same. 

2062 

2063 Conceptually, a categorical column can be treated as a multi-hot 

2064 vector. Say: 

2065 

2066 ```python 

2067 x = [0 0 1] # categorical column input 

2068 w = [a b c] # weights 

2069 ``` 

2070 The weighted sum is `c` in this case, which is the same as `w[2]`. 

2071 

2072 Another example is 

2073 

2074 ```python 

2075 x = [0 1 1] # categorical column input 

2076 w = [a b c] # weights 

2077 ``` 

2078 The weighted sum is `b + c` in this case, which is the same as `w[1] + w[2]`. 

2079 

2080 For both cases, we can implement the weighted sum via embedding_lookup with 

2081 sparse_combiner = "sum". 

2082 """ 

2083 

2084 sparse_tensors = column._get_sparse_tensors( # pylint: disable=protected-access 

2085 builder, 

2086 weight_collections=weight_collections, 

2087 trainable=trainable) 

2088 id_tensor = sparse_ops.sparse_reshape( 

2089 sparse_tensors.id_tensor, 

2090 [array_ops.shape(sparse_tensors.id_tensor)[0], -1]) 
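# Flatten the ids to rank 2 ([batch_size, -1]) so that the embedding lookup 

# below treats each row as one example. 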

2091 weight_tensor = sparse_tensors.weight_tensor 

2092 if weight_tensor is not None: 

2093 weight_tensor = sparse_ops.sparse_reshape( 

2094 weight_tensor, [array_ops.shape(weight_tensor)[0], -1]) 

2095 

2096 if weight_var is not None: 

2097 weight = weight_var 

2098 else: 

2099 weight = variable_scope.get_variable( 

2100 name='weights', 

2101 shape=(column._num_buckets, units), # pylint: disable=protected-access 

2102 initializer=init_ops.zeros_initializer(), 

2103 trainable=trainable, 

2104 collections=weight_collections) 

2105 return embedding_ops.safe_embedding_lookup_sparse( 

2106 weight, 

2107 id_tensor, 

2108 sparse_weights=weight_tensor, 

2109 combiner=sparse_combiner, 

2110 name='weighted_sum') 

2111 

2112 

2113class _SequenceDenseColumn(_FeatureColumn): 

2114 """Represents dense sequence data.""" 

2115 

2116 TensorSequenceLengthPair = collections.namedtuple( # pylint: disable=invalid-name 

2117 'TensorSequenceLengthPair', ['dense_tensor', 'sequence_length']) 

2118 

2119 @abc.abstractmethod 

2120 def _get_sequence_dense_tensor(self, 

2121 inputs, 

2122 weight_collections=None, 

2123 trainable=None): 

2124 """Returns a `TensorSequenceLengthPair`.""" 

2125 pass 

2126 

2127 

2128class _LazyBuilder(object): 

2129 """Handles caching of transformations while building the model. 

2130 

2131 `_FeatureColumn` specifies how to digest an input column to the network. Some 

2132 feature columns require data transformations. This class caches those 

2133 transformations. 

2134 

2135 Some features may be used in more than one place. For example, one can use a 

2136 bucketized feature by itself and a cross with it. In that case we 

2137 should create only one bucketization op instead of creating ops for each 

2138 feature column separately. To handle re-use of transformed columns, 

2139 `_LazyBuilder` caches all previously transformed columns. 

2140 

2141 Example: 

2142 We're trying to use the following `_FeatureColumn`s: 

2143 

2144 ```python 

2145 bucketized_age = fc.bucketized_column(fc.numeric_column("age"), ...) 

2146 keywords = fc.categorical_column_with_hash_bucket("keywords", ...) 

2147 age_X_keywords = fc.crossed_column([bucketized_age, "keywords"], ...) 

2148 ... = linear_model(features, 

2149 [bucketized_age, keywords, age_X_keywords]) 

2150 ``` 

2151 

2152 If we transform each column independently, then we'll get duplication of 

2153 bucketization (one for the cross, one for the bucketized column itself). 

2154 The `_LazyBuilder` eliminates this duplication. 

2155 """ 

2156 

2157 def __init__(self, features): 

2158 """Creates a `_LazyBuilder`. 

2159 

2160 Args: 

2161 features: A mapping from feature column to objects that are `Tensor` or 

2162 `SparseTensor`, or can be converted to same via 

2163 `sparse_tensor.convert_to_tensor_or_sparse_tensor`. A `string` key 

2164 signifies a base feature (not-transformed). A `_FeatureColumn` key means 

2165 that this `Tensor` is the output of an existing `_FeatureColumn` which 

2166 can be reused. 

2167 """ 

2168 self._features = features.copy() 

2169 self._feature_tensors = {} 

2170 

2171 def get(self, key): 

2172 """Returns a `Tensor` for the given key. 

2173 

2174 A `str` key is used to access a base feature (not-transformed). When a 

2175 `_FeatureColumn` is passed, the transformed feature is returned if it 

2176 already exists, otherwise the given `_FeatureColumn` is asked to provide its 

2177 transformed output, which is then cached. 

2178 

2179 Args: 

2180 key: a `str` or a `_FeatureColumn`. 

2181 

2182 Returns: 

2183 The transformed `Tensor` corresponding to the `key`. 

2184 

2185 Raises: 

2186 ValueError: if key is not found or a transformed `Tensor` cannot be 

2187 computed. 
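
Example (a minimal sketch; assumes `bucketized_age` is a `_BucketizedColumn` 

built over the 'age' feature): 

```python 

builder = _LazyBuilder({'age': [[23.], [41.]]}) 

age_tensor = builder.get('age')          # raw feature lookup 

bucketized = builder.get(bucketized_age) # transformed output, then cached 

``` 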

2188 """ 

2189 if key in self._feature_tensors: 

2190 # FeatureColumn is already transformed or converted. 

2191 return self._feature_tensors[key] 

2192 

2193 if key in self._features: 

2194 feature_tensor = self._get_raw_feature_as_tensor(key) 

2195 self._feature_tensors[key] = feature_tensor 

2196 return feature_tensor 

2197 

2198 if isinstance(key, six.string_types): 

2199 raise ValueError('Feature {} is not in features dictionary.'.format(key)) 

2200 

2201 if not isinstance(key, _FeatureColumn): 

2202 raise TypeError('"key" must be either a "str" or "_FeatureColumn". ' 

2203 'Provided: {}'.format(key)) 

2204 

2205 column = key 

2206 logging.debug('Transforming feature_column %s.', column) 

2207 transformed = column._transform_feature(self) # pylint: disable=protected-access 

2208 if transformed is None: 

2209 raise ValueError('Column {} is not supported.'.format(column.name)) 

2210 self._feature_tensors[column] = transformed 

2211 return transformed 

2212 

2213 def _get_raw_feature_as_tensor(self, key): 

2214 """Gets the raw_feature (keyed by `key`) as `tensor`. 

2215 

2216 The raw feature is converted to a (sparse) tensor and its rank may be expanded. 

2217 

2218 For both `Tensor` and `SparseTensor`, the rank will be expanded (to 2) if 

2219 the rank is 1. Dynamic rank is also supported. A rank-0 raw feature is not 

2220 supported and will raise an error. 

2221 

2222 Args: 

2223 key: A `str` key to access the raw feature. 

2224 

2225 Returns: 

2226 A `Tensor` or `SparseTensor`. 

2227 

2228 Raises: 

2229 ValueError: if the raw feature has rank 0. 

2230 """ 

2231 raw_feature = self._features[key] 

2232 feature_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor( 

2233 raw_feature) 

2234 

2235 def expand_dims(input_tensor): 

2236 # Input_tensor must have rank 1. 

2237 if isinstance(input_tensor, sparse_tensor_lib.SparseTensor): 

2238 return sparse_ops.sparse_reshape(input_tensor, 

2239 [array_ops.shape(input_tensor)[0], 1]) 

2240 else: 

2241 return array_ops.expand_dims(input_tensor, -1) 
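# Illustrative effect: a `Tensor` of shape [batch] becomes [batch, 1]; a 

# `SparseTensor` with dense_shape [batch] becomes dense_shape [batch, 1]. 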

2242 

2243 rank = feature_tensor.get_shape().ndims 

2244 if rank is not None: 

2245 if rank == 0: 

2246 raise ValueError( 

2247 'Feature (key: {}) cannot have rank 0. Given: {}'.format( 

2248 key, feature_tensor)) 

2249 return feature_tensor if rank != 1 else expand_dims(feature_tensor) 

2250 

2251 # Handle dynamic rank. 

2252 with ops.control_dependencies([ 

2253 check_ops.assert_positive( 

2254 array_ops.rank(feature_tensor), 

2255 message='Feature (key: {}) cannot have rank 0. Given: {}'.format( 

2256 key, feature_tensor)) 

2257 ]): 

2258 return cond.cond( 

2259 math_ops.equal(1, array_ops.rank(feature_tensor)), 

2260 lambda: expand_dims(feature_tensor), lambda: feature_tensor) 

2261 

2262 

2263# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py 

2264def _shape_offsets(shape): 

2265 """Returns moving offset for each dimension given shape.""" 

2266 offsets = [] 

2267 for dim in reversed(shape): 

2268 if offsets: 

2269 offsets.append(dim * offsets[-1]) 

2270 else: 

2271 offsets.append(dim) 

2272 offsets.reverse() 

2273 return offsets 

2274 

2275 

2276# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py 

2277def _to_sparse_input_and_drop_ignore_values(input_tensor, ignore_value=None): 

2278 """Converts a `Tensor` to a `SparseTensor`, dropping ignore_value cells. 

2279 

2280 If `input_tensor` is already a `SparseTensor`, just return it. 

2281 

2282 Args: 

2283 input_tensor: A string or integer `Tensor`. 

2284 ignore_value: Entries in `dense_tensor` equal to this value will be absent 

2285 from the resulting `SparseTensor`. If `None`, default value of 

2286 `dense_tensor`'s dtype will be used ('' for `str`, -1 for `int`). 

2287 

2288 Returns: 

2289 A `SparseTensor` with the same shape as `input_tensor`. 

2290 

2291 Raises: 

2292 ValueError: when `input_tensor`'s rank is `None`. 
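
Example (a minimal sketch with assumed values): 

```python 

dense = [['a', ''], ['', 'b']] 

sparse = _to_sparse_input_and_drop_ignore_values(dense) 

# sparse.indices == [[0, 0], [1, 1]]; sparse.values == [b'a', b'b'] 

``` 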

2293 """ 

2294 input_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor( 

2295 input_tensor) 

2296 if isinstance(input_tensor, sparse_tensor_lib.SparseTensor): 

2297 return input_tensor 

2298 with ops.name_scope(None, 'to_sparse_input', ( 

2299 input_tensor, 

2300 ignore_value, 

2301 )): 

2302 if ignore_value is None: 

2303 if input_tensor.dtype == dtypes.string: 

2304 # Strings are special-cased because TF strings are converted to numpy objects by default. 

2305 ignore_value = '' 

2306 elif input_tensor.dtype.is_integer: 

2307 ignore_value = -1 # -1 has a special meaning of missing feature 

2308 else: 

2309 # NOTE: `as_numpy_dtype` is a property, so with the parentheses this is 

2310 # constructing a new numpy object of the given type, which yields the 

2311 # default value for that type. 

2312 ignore_value = input_tensor.dtype.as_numpy_dtype() 

2313 ignore_value = math_ops.cast( 

2314 ignore_value, input_tensor.dtype, name='ignore_value') 

2315 indices = array_ops.where( 

2316 math_ops.not_equal(input_tensor, ignore_value), name='indices') 

2317 return sparse_tensor_lib.SparseTensor( 

2318 indices=indices, 

2319 values=array_ops.gather_nd(input_tensor, indices, name='values'), 

2320 dense_shape=array_ops.shape( 

2321 input_tensor, out_type=dtypes.int64, name='dense_shape')) 

2322 

2323 

2324def _normalize_feature_columns(feature_columns): 

2325 """Normalizes the `feature_columns` input. 

2326 

2327 This method converts `feature_columns` to a list as best it can. In 

2328 addition, it verifies the type and other properties of `feature_columns` 

2329 required by downstream libraries. 

2330 

2331 Args: 

2332 feature_columns: The raw feature columns, usually passed by users. 

2333 

2334 Returns: 

2335 The normalized feature column list. 

2336 

2337 Raises: 

2338 ValueError: for any invalid inputs, such as empty, duplicated names, etc. 
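
Example (illustrative; uses the public column constructors for brevity): 

```python 

age = numeric_column('age') 

dept = categorical_column_with_hash_bucket('dept', hash_bucket_size=100) 

_normalize_feature_columns(age)          # -> [age] 

_normalize_feature_columns([age, dept])  # -> [age, dept] 

_normalize_feature_columns([age, age])   # raises ValueError (duplicate name) 

``` 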

2339 """ 

2340 if isinstance(feature_columns, _FeatureColumn): 

2341 feature_columns = [feature_columns] 

2342 

2343 if isinstance(feature_columns, collections_abc.Iterator): 

2344 feature_columns = list(feature_columns) 

2345 

2346 if isinstance(feature_columns, dict): 

2347 raise ValueError('Expected feature_columns to be iterable, found dict.') 

2348 

2349 for column in feature_columns: 

2350 if not isinstance(column, _FeatureColumn): 

2351 raise ValueError('Items of feature_columns must be a _FeatureColumn. ' 

2352 'Given (type {}): {}.'.format(type(column), column)) 

2353 if not feature_columns: 

2354 raise ValueError('feature_columns must not be empty.') 

2355 name_to_column = {} 

2356 for column in feature_columns: 

2357 if column.name in name_to_column: 

2358 raise ValueError('Duplicate feature column name found for columns: {} ' 

2359 'and {}. This usually means that these columns refer to ' 

2360 'the same base feature. Either one must be discarded or a ' 

2361 'duplicated but renamed item must be inserted in ' 

2362 'the features dict.'.format(column, 

2363 name_to_column[column.name])) 

2364 name_to_column[column.name] = column 

2365 

2366 return feature_columns 

2367 

2368 

2369class _NumericColumn( 

2370 _DenseColumn, 

2371 collections.namedtuple( 

2372 '_NumericColumn', 

2373 ['key', 'shape', 'default_value', 'dtype', 'normalizer_fn'])): 

2374 """see `numeric_column`.""" 

2375 

2376 @property 

2377 def name(self): 

2378 return self.key 

2379 

2380 @property 

2381 def _parse_example_spec(self): 

2382 return { 

2383 self.key: 

2384 parsing_ops.FixedLenFeature(self.shape, self.dtype, 

2385 self.default_value) 

2386 } 

2387 

2388 def _transform_feature(self, inputs): 

2389 input_tensor = inputs.get(self.key) 

2390 if isinstance(input_tensor, sparse_tensor_lib.SparseTensor): 

2391 raise ValueError( 

2392 'The corresponding Tensor of numerical column must be a Tensor. ' 

2393 'SparseTensor is not supported. key: {}'.format(self.key)) 

2394 if self.normalizer_fn is not None: 

2395 input_tensor = self.normalizer_fn(input_tensor) 

2396 return math_ops.cast(input_tensor, dtypes.float32) 

2397 

2398 @property 

2399 def _variable_shape(self): 

2400 return tensor_shape.TensorShape(self.shape) 

2401 

2402 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 

2403 """Returns dense `Tensor` representing numeric feature. 

2404 

2405 Args: 

2406 inputs: A `_LazyBuilder` object to access inputs. 

2407 weight_collections: Unused `weight_collections` since no variables are 

2408 created in this function. 

2409 trainable: Unused `trainable` bool since no variables are created in this 

2410 function. 

2411 

2412 Returns: 

2413 Dense `Tensor` created within `_transform_feature`. 

2414 """ 

2415 # Do nothing with weight_collections and trainable since no variables are 

2416 # created in this function. 

2417 del weight_collections 

2418 del trainable 

2419 # The feature has already been transformed. Return the intermediate 

2420 # representation created by _transform_feature. 

2421 return inputs.get(self) 

2422 

2423 

2424class _BucketizedColumn(_DenseColumn, _CategoricalColumn, 

2425 collections.namedtuple('_BucketizedColumn', 

2426 ['source_column', 'boundaries']) 

2427 ): 

2428 """See `bucketized_column`.""" 

2429 

2430 @property 

2431 def name(self): 

2432 return '{}_bucketized'.format(self.source_column.name) 

2433 

2434 @property 

2435 def _parse_example_spec(self): 

2436 return self.source_column._parse_example_spec # pylint: disable=protected-access 

2437 

2438 def _transform_feature(self, inputs): 

2439 source_tensor = inputs.get(self.source_column) 

2440 return math_ops._bucketize( # pylint: disable=protected-access 

2441 source_tensor, 

2442 boundaries=self.boundaries) 

2443 

2444 @property 

2445 def _variable_shape(self): 

2446 return tensor_shape.TensorShape( 

2447 tuple(self.source_column.shape) + (len(self.boundaries) + 1,)) 

2448 

2449 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 

2450 del weight_collections 

2451 del trainable 

2452 input_tensor = inputs.get(self) 

2453 return array_ops.one_hot( 

2454 indices=math_ops.cast(input_tensor, dtypes.int64), 

2455 depth=len(self.boundaries) + 1, 

2456 on_value=1., 

2457 off_value=0.) 

2458 

2459 @property 

2460 def _num_buckets(self): 

2461 # By construction, source_column is always one-dimensional. 

2462 return (len(self.boundaries) + 1) * self.source_column.shape[0] 

2463 

2464 def _get_sparse_tensors(self, 

2465 inputs, 

2466 weight_collections=None, 

2467 trainable=None): 

2468 """Converts dense inputs to SparseTensor so downstream code can use it.""" 

2469 input_tensor = inputs.get(self) 

2470 batch_size = array_ops.shape(input_tensor)[0] 

2471 # By construction, source_column is always one-dimensional. 

2472 source_dimension = self.source_column.shape[0] 

2473 

2474 i1 = array_ops.reshape( 

2475 array_ops.tile( 

2476 array_ops.expand_dims(math_ops.range(0, batch_size), 1), 

2477 [1, source_dimension]), (-1,)) 

2478 i2 = array_ops.tile(math_ops.range(0, source_dimension), [batch_size]) 

2479 # Flatten the bucket indices and unique them across dimensions 

2480 # E.g. 2nd dimension indices will range from k to 2*k-1 with k buckets 
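# Illustrative example (assumed values): with 2 boundaries (3 buckets) and 

# source_dimension=2, input_tensor=[[0, 2]] yields 

# bucket_indices = [0 + 3*0, 2 + 3*1] = [0, 5]. 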

2481 bucket_indices = ( 

2482 array_ops.reshape(input_tensor, 

2483 (-1,)) + (len(self.boundaries) + 1) * i2) 

2484 

2485 indices = math_ops.cast( 

2486 array_ops.transpose(array_ops_stack.stack((i1, i2))), dtypes.int64) 

2487 dense_shape = math_ops.cast( 

2488 array_ops_stack.stack([batch_size, source_dimension]), dtypes.int64) 

2489 sparse_tensor = sparse_tensor_lib.SparseTensor( 

2490 indices=indices, values=bucket_indices, dense_shape=dense_shape) 

2491 return _CategoricalColumn.IdWeightPair(sparse_tensor, None) 

2492 

2493 

2494class _EmbeddingColumn( 

2495 _DenseColumn, _SequenceDenseColumn, 

2496 collections.namedtuple( 

2497 '_EmbeddingColumn', 

2498 ('categorical_column', 'dimension', 'combiner', 'layer_creator', 

2499 'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable', 

2500 'use_safe_embedding_lookup'))): 

2501 """See `embedding_column`.""" 

2502 

2503 def __new__(cls, 

2504 categorical_column, 

2505 dimension, 

2506 combiner, 

2507 layer_creator, 

2508 ckpt_to_load_from, 

2509 tensor_name_in_ckpt, 

2510 max_norm, 

2511 trainable, 

2512 use_safe_embedding_lookup=True): 

2513 return super(_EmbeddingColumn, cls).__new__( 

2514 cls, 

2515 categorical_column=categorical_column, 

2516 dimension=dimension, 

2517 combiner=combiner, 

2518 layer_creator=layer_creator, 

2519 ckpt_to_load_from=ckpt_to_load_from, 

2520 tensor_name_in_ckpt=tensor_name_in_ckpt, 

2521 max_norm=max_norm, 

2522 trainable=trainable, 

2523 use_safe_embedding_lookup=use_safe_embedding_lookup) 

2524 

2525 @property 

2526 def name(self): 

2527 if not hasattr(self, '_name'): 

2528 self._name = '{}_embedding'.format(self.categorical_column.name) 

2529 return self._name 

2530 

2531 @property 

2532 def _parse_example_spec(self): 

2533 return self.categorical_column._parse_example_spec # pylint: disable=protected-access 

2534 

2535 def _transform_feature(self, inputs): 

2536 return inputs.get(self.categorical_column) 

2537 

2538 @property 

2539 def _variable_shape(self): 

2540 if not hasattr(self, '_shape'): 

2541 self._shape = tensor_shape.TensorShape([self.dimension]) 

2542 return self._shape 

2543 

2544 def _get_dense_tensor_internal(self, 

2545 inputs, 

2546 weight_collections=None, 

2547 trainable=None): 

2548 """Private method that follows the signature of _get_dense_tensor.""" 

2549 # Get sparse IDs and weights. 

2550 sparse_tensors = self.categorical_column._get_sparse_tensors( # pylint: disable=protected-access 

2551 inputs, 

2552 weight_collections=weight_collections, 

2553 trainable=trainable) 

2554 sparse_ids = sparse_tensors.id_tensor 

2555 sparse_weights = sparse_tensors.weight_tensor 

2556 

2557 embedding_weights = self.layer_creator( 

2558 weight_collections=weight_collections, 

2559 scope=variable_scope.get_variable_scope()) 

2560 

2561 if self.ckpt_to_load_from is not None: 

2562 to_restore = embedding_weights 

2563 if isinstance(to_restore, variables.PartitionedVariable): 

2564 to_restore = to_restore._get_variable_list() # pylint: disable=protected-access 

2565 checkpoint_utils.init_from_checkpoint( 

2566 self.ckpt_to_load_from, {self.tensor_name_in_ckpt: to_restore}) 

2567 

2568 sparse_id_rank = tensor_shape.dimension_value( 

2569 sparse_ids.dense_shape.get_shape()[0]) 

2570 embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse 

2571 if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and 

2572 sparse_id_rank <= 2): 

2573 embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse_v2 

2574 # Return embedding lookup result. 

2575 return embedding_lookup_sparse( 

2576 embedding_weights, 

2577 sparse_ids, 

2578 sparse_weights, 

2579 combiner=self.combiner, 

2580 name='%s_weights' % self.name, 

2581 max_norm=self.max_norm) 

2582 

2583 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 

2584 if isinstance(self.categorical_column, _SequenceCategoricalColumn): 

2585 raise ValueError( 

2586 'In embedding_column: {}. ' 

2587 'categorical_column must not be of type _SequenceCategoricalColumn. ' 

2588 'Suggested fix A: If you wish to use input_layer, use a ' 

2589 'non-sequence categorical_column_with_*. ' 

2590 'Suggested fix B: If you wish to create sequence input, use ' 

2591 'sequence_input_layer instead of input_layer. ' 

2592 'Given (type {}): {}'.format(self.name, type(self.categorical_column), 

2593 self.categorical_column)) 

2594 return self._get_dense_tensor_internal( 

2595 inputs=inputs, 

2596 weight_collections=weight_collections, 

2597 trainable=trainable) 

2598 

2599 def _get_sequence_dense_tensor(self, 

2600 inputs, 

2601 weight_collections=None, 

2602 trainable=None): 

2603 if not isinstance(self.categorical_column, _SequenceCategoricalColumn): 

2604 raise ValueError( 

2605 'In embedding_column: {}. ' 

2606 'categorical_column must be of type _SequenceCategoricalColumn ' 

2607 'to use sequence_input_layer. ' 

2608 'Suggested fix: Use one of sequence_categorical_column_with_*. ' 

2609 'Given (type {}): {}'.format(self.name, type(self.categorical_column), 

2610 self.categorical_column)) 

2611 dense_tensor = self._get_dense_tensor_internal( # pylint: disable=protected-access 

2612 inputs=inputs, 

2613 weight_collections=weight_collections, 

2614 trainable=trainable) 

2615 

2616 sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access 

2617 sequence_length = fc_utils.sequence_length_from_sparse_tensor( 

2618 sparse_tensors.id_tensor) 

2619 return _SequenceDenseColumn.TensorSequenceLengthPair( 

2620 dense_tensor=dense_tensor, sequence_length=sequence_length) 

2621 

2622 

2623def _get_graph_for_variable(var): 

2624 if isinstance(var, variables.PartitionedVariable): 

2625 return list(var)[0].graph 

2626 else: 

2627 return var.graph 

2628 

2629 

2630class _SharedEmbeddingColumn( 

2631 _DenseColumn, _SequenceDenseColumn, 

2632 collections.namedtuple( 

2633 '_SharedEmbeddingColumn', 

2634 ('categorical_column', 'dimension', 'combiner', 'initializer', 

2635 'shared_embedding_collection_name', 'ckpt_to_load_from', 

2636 'tensor_name_in_ckpt', 'max_norm', 'trainable', 

2637 'use_safe_embedding_lookup'))): 

2638 """See `embedding_column`.""" 

2639 

2640 @property 

2641 def name(self): 

2642 if not hasattr(self, '_name'): 

2643 self._name = '{}_shared_embedding'.format(self.categorical_column.name) 

2644 return self._name 

2645 

2646 @property 

2647 def _var_scope_name(self): 

2648 return self.shared_embedding_collection_name 

2649 

2650 @property 

2651 def _parse_example_spec(self): 

2652 return self.categorical_column._parse_example_spec # pylint: disable=protected-access 

2653 

2654 def _transform_feature(self, inputs): 

2655 return inputs.get(self.categorical_column) 

2656 

2657 @property 

2658 def _variable_shape(self): 

2659 if not hasattr(self, '_shape'): 

2660 self._shape = tensor_shape.TensorShape([self.dimension]) 

2661 return self._shape 

2662 

2663 def _get_dense_tensor_internal(self, 

2664 inputs, 

2665 weight_collections=None, 

2666 trainable=None): 

2667 """Private method that follows the signature of _get_dense_tensor.""" 

2668 # This method is called from a variable_scope with name _var_scope_name, 

2669 # which is shared among all shared embeddings. Open a name_scope here, so 

2670 # that the ops for different columns have distinct names. 

2671 with ops.name_scope(None, default_name=self.name): 

2672 # Get sparse IDs and weights. 

2673 sparse_tensors = self.categorical_column._get_sparse_tensors( # pylint: disable=protected-access 

2674 inputs, 

2675 weight_collections=weight_collections, 

2676 trainable=trainable) 

2677 sparse_ids = sparse_tensors.id_tensor 

2678 sparse_weights = sparse_tensors.weight_tensor 

2679 

2680 embedding_shape = (self.categorical_column._num_buckets, self.dimension) # pylint: disable=protected-access 

2681 shared_embedding_collection = ops.get_collection( 

2682 self.shared_embedding_collection_name) 

2683 if shared_embedding_collection: 

2684 if len(shared_embedding_collection) > 1: 

2685 raise ValueError( 

2686 'Collection {} can only contain one variable. ' 

2687 'Suggested fix A: Choose a unique name for this collection. ' 

2688 'Suggested fix B: Do not add any variables to this collection. ' 

2689 'The feature_column library already adds a variable under the ' 

2690 'hood.'.format(shared_embedding_collection)) 

2691 embedding_weights = shared_embedding_collection[0] 

2692 if embedding_weights.get_shape() != embedding_shape: 

2693 raise ValueError( 

2694 'Shared embedding collection {} contains variable {} of ' 

2695 'unexpected shape {}. Expected shape is {}. ' 

2696 'Suggested fix A: Choose a unique name for this collection. ' 

2697 'Suggested fix B: Do not add any variables to this collection. ' 

2698 'The feature_column library already adds a variable under the ' 

2699 'hood.'.format(self.shared_embedding_collection_name, 

2700 embedding_weights.name, 

2701 embedding_weights.get_shape(), embedding_shape)) 

2702 else: 

2703 embedding_weights = variable_scope.get_variable( 

2704 name='embedding_weights', 

2705 shape=embedding_shape, 

2706 dtype=dtypes.float32, 

2707 initializer=self.initializer, 

2708 trainable=self.trainable and trainable, 

2709 collections=weight_collections) 

2710 ops.add_to_collection(self.shared_embedding_collection_name, 

2711 embedding_weights) 

2712 if self.ckpt_to_load_from is not None: 

2713 to_restore = embedding_weights 

2714 if isinstance(to_restore, variables.PartitionedVariable): 

2715 to_restore = to_restore._get_variable_list() # pylint: disable=protected-access 

2716 checkpoint_utils.init_from_checkpoint( 

2717 self.ckpt_to_load_from, {self.tensor_name_in_ckpt: to_restore}) 

2718 

2719 sparse_id_rank = tensor_shape.dimension_value( 

2720 sparse_ids.dense_shape.get_shape()[0]) 

2721 embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse 

2722 if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and 

2723 sparse_id_rank <= 2): 

2724 embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse_v2 

2725 # Return embedding lookup result. 

2726 return embedding_lookup_sparse( 

2727 embedding_weights, 

2728 sparse_ids, 

2729 sparse_weights, 

2730 combiner=self.combiner, 

2731 name='%s_weights' % self.name, 

2732 max_norm=self.max_norm) 

2733 

2734 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 

2735 if isinstance(self.categorical_column, _SequenceCategoricalColumn): 

2736 raise ValueError( 

2737 'In embedding_column: {}. ' 

2738 'categorical_column must not be of type _SequenceCategoricalColumn. ' 

2739 'Suggested fix A: If you wish to use input_layer, use a ' 

2740 'non-sequence categorical_column_with_*. ' 

2741 'Suggested fix B: If you wish to create sequence input, use ' 

2742 'sequence_input_layer instead of input_layer. ' 

2743 'Given (type {}): {}'.format(self.name, type(self.categorical_column), 

2744 self.categorical_column)) 

2745 return self._get_dense_tensor_internal( 

2746 inputs=inputs, 

2747 weight_collections=weight_collections, 

2748 trainable=trainable) 

2749 

2750 def _get_sequence_dense_tensor(self, 

2751 inputs, 

2752 weight_collections=None, 

2753 trainable=None): 

2754 if not isinstance(self.categorical_column, _SequenceCategoricalColumn): 

2755 raise ValueError( 

2756 'In embedding_column: {}. ' 

2757 'categorical_column must be of type _SequenceCategoricalColumn ' 

2758 'to use sequence_input_layer. ' 

2759 'Suggested fix: Use one of sequence_categorical_column_with_*. ' 

2760 'Given (type {}): {}'.format(self.name, type(self.categorical_column), 

2761 self.categorical_column)) 

2762 dense_tensor = self._get_dense_tensor_internal( # pylint: disable=protected-access 

2763 inputs=inputs, 

2764 weight_collections=weight_collections, 

2765 trainable=trainable) 

2766 sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access 

2767 sequence_length = fc_utils.sequence_length_from_sparse_tensor( 

2768 sparse_tensors.id_tensor) 

2769 return _SequenceDenseColumn.TensorSequenceLengthPair( 

2770 dense_tensor=dense_tensor, sequence_length=sequence_length) 

2771 

2772 

2773def _check_shape(shape, key): 

2774 """Returns shape if it's valid, raises error otherwise.""" 

2775 assert shape is not None 

2776 if not nest.is_nested(shape): 

2777 shape = [shape] 

2778 shape = tuple(shape) 

2779 for dimension in shape: 

2780 if not isinstance(dimension, six.integer_types): 

2781 raise TypeError('shape dimensions must be integer. ' 

2782 'shape: {}, key: {}'.format(shape, key)) 

2783 if dimension < 1: 

2784 raise ValueError('shape dimensions must be greater than 0. ' 

2785 'shape: {}, key: {}'.format(shape, key)) 

2786 return shape 

2787 

2788 

2789class _HashedCategoricalColumn(_CategoricalColumn, 

2790 collections.namedtuple( 

2791 '_HashedCategoricalColumn', 

2792 ['key', 'hash_bucket_size', 'dtype'])): 

2793 """see `categorical_column_with_hash_bucket`.""" 

2794 

2795 @property 

2796 def name(self): 

2797 return self.key 

2798 

2799 @property 

2800 def _parse_example_spec(self): 

2801 return {self.key: parsing_ops.VarLenFeature(self.dtype)} 

2802 

2803 def _transform_feature(self, inputs): 

2804 input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key)) 

2805 if not isinstance(input_tensor, sparse_tensor_lib.SparseTensor): 

2806 raise ValueError('SparseColumn input must be a SparseTensor.') 

2807 

2808 fc_utils.assert_string_or_int( 

2809 input_tensor.dtype, 

2810 prefix='column_name: {} input_tensor'.format(self.key)) 

2811 

2812 if self.dtype.is_integer != input_tensor.dtype.is_integer: 

2813 raise ValueError( 

2814 'Column dtype and SparseTensors dtype must be compatible. ' 

2815 'key: {}, column dtype: {}, tensor dtype: {}'.format( 

2816 self.key, self.dtype, input_tensor.dtype)) 

2817 

2818 if self.dtype == dtypes.string: 

2819 sparse_values = input_tensor.values 

2820 else: 

2821 sparse_values = string_ops.as_string(input_tensor.values) 

2822 

2823 sparse_id_values = string_ops.string_to_hash_bucket_fast( 

2824 sparse_values, self.hash_bucket_size, name='lookup') 

2825 return sparse_tensor_lib.SparseTensor(input_tensor.indices, 

2826 sparse_id_values, 

2827 input_tensor.dense_shape) 

2828 

2829 @property 

2830 def _num_buckets(self): 

2831 """Returns number of buckets in this sparse feature.""" 

2832 return self.hash_bucket_size 

2833 

2834 def _get_sparse_tensors(self, 

2835 inputs, 

2836 weight_collections=None, 

2837 trainable=None): 

2838 return _CategoricalColumn.IdWeightPair(inputs.get(self), None) 

2839 

2840 
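# Editorial sketch (hedged, not original code): the hashed column stringifies
# non-string values and hashes them into `hash_bucket_size` buckets. With this
# module's own imports, the core mapping is:
#
#   values = string_ops.as_string([63, 7, 63])  # int inputs become strings
#   ids = string_ops.string_to_hash_bucket_fast(values, 100)
#   # ids are deterministic in [0, 100); equal inputs share a bucket, while
#   # distinct inputs may collide.
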

class _VocabularyFileCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple('_VocabularyFileCategoricalColumn',
                           ('key', 'vocabulary_file', 'vocabulary_size',
                            'num_oov_buckets', 'dtype', 'default_value'))):
  """See `categorical_column_with_vocabulary_file`."""

  @property
  def name(self):
    return self.key

  @property
  def _parse_example_spec(self):
    return {self.key: parsing_ops.VarLenFeature(self.dtype)}

  def _transform_feature(self, inputs):
    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))

    if self.dtype.is_integer != input_tensor.dtype.is_integer:
      raise ValueError(
          'Column dtype and SparseTensors dtype must be compatible. '
          'key: {}, column dtype: {}, tensor dtype: {}'.format(
              self.key, self.dtype, input_tensor.dtype))

    fc_utils.assert_string_or_int(
        input_tensor.dtype,
        prefix='column_name: {} input_tensor'.format(self.key))

    key_dtype = self.dtype
    if input_tensor.dtype.is_integer:
      # `index_table_from_file` requires 64-bit integer keys.
      key_dtype = dtypes.int64
      input_tensor = math_ops.cast(input_tensor, dtypes.int64)

    return lookup_ops.index_table_from_file(
        vocabulary_file=self.vocabulary_file,
        num_oov_buckets=self.num_oov_buckets,
        vocab_size=self.vocabulary_size,
        default_value=self.default_value,
        key_dtype=key_dtype,
        name='{}_lookup'.format(self.key)).lookup(input_tensor)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return self.vocabulary_size + self.num_oov_buckets

  def _get_sparse_tensors(self,
                          inputs,
                          weight_collections=None,
                          trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)

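# Editorial sketch (hedged, not original code): `_num_buckets` above is
# `vocabulary_size + num_oov_buckets` because in-vocabulary values map to
# their line index in the file, while out-of-vocabulary values are hashed
# into the trailing OOV range (or mapped to `default_value` when
# `num_oov_buckets` is 0). With a hypothetical 3-line vocabulary file:
#
#   table = lookup_ops.index_table_from_file(
#       vocabulary_file='/tmp/dept.txt', vocab_size=3, num_oov_buckets=2)
#   # known terms -> ids 0..2; unknown terms -> ids 3..4.
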

class _VocabularyListCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple(
        '_VocabularyListCategoricalColumn',
        ('key', 'vocabulary_list', 'dtype', 'default_value', 'num_oov_buckets'))
):
  """See `categorical_column_with_vocabulary_list`."""

  @property
  def name(self):
    return self.key

  @property
  def _parse_example_spec(self):
    return {self.key: parsing_ops.VarLenFeature(self.dtype)}

  def _transform_feature(self, inputs):
    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))

    if self.dtype.is_integer != input_tensor.dtype.is_integer:
      raise ValueError(
          'Column dtype and SparseTensors dtype must be compatible. '
          'key: {}, column dtype: {}, tensor dtype: {}'.format(
              self.key, self.dtype, input_tensor.dtype))

    fc_utils.assert_string_or_int(
        input_tensor.dtype,
        prefix='column_name: {} input_tensor'.format(self.key))

    key_dtype = self.dtype
    if input_tensor.dtype.is_integer:
      # `index_table_from_tensor` requires 64-bit integer keys.
      key_dtype = dtypes.int64
      input_tensor = math_ops.cast(input_tensor, dtypes.int64)

    return lookup_ops.index_table_from_tensor(
        vocabulary_list=tuple(self.vocabulary_list),
        default_value=self.default_value,
        num_oov_buckets=self.num_oov_buckets,
        dtype=key_dtype,
        name='{}_lookup'.format(self.key)).lookup(input_tensor)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return len(self.vocabulary_list) + self.num_oov_buckets

  def _get_sparse_tensors(self,
                          inputs,
                          weight_collections=None,
                          trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)

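# Editorial sketch (hedged, not original code): the in-memory analogue of the
# file-based column; the id space is `len(vocabulary_list) + num_oov_buckets`:
#
#   table = lookup_ops.index_table_from_tensor(
#       vocabulary_list=('math', 'philosophy', 'english'), num_oov_buckets=1)
#   # 'math' -> 0, 'english' -> 2, any other term -> 3 (the one OOV bucket).
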

class _IdentityCategoricalColumn(_CategoricalColumn,
                                 collections.namedtuple(
                                     '_IdentityCategoricalColumn',
                                     ('key', 'num_buckets', 'default_value'))):
  """See `categorical_column_with_identity`."""

  @property
  def name(self):
    return self.key

  @property
  def _parse_example_spec(self):
    return {self.key: parsing_ops.VarLenFeature(dtypes.int64)}

  def _transform_feature(self, inputs):
    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))

    if not input_tensor.dtype.is_integer:
      raise ValueError('Invalid input, not integer. key: {} dtype: {}'.format(
          self.key, input_tensor.dtype))
    values = input_tensor.values
    if input_tensor.values.dtype != dtypes.int64:
      values = math_ops.cast(values, dtypes.int64, name='values')
    if self.default_value is not None:
      num_buckets = math_ops.cast(
          self.num_buckets, dtypes.int64, name='num_buckets')
      zero = math_ops.cast(0, dtypes.int64, name='zero')
      # Assign default for out-of-range values.
      values = array_ops.where(
          math_ops.logical_or(
              values < zero, values >= num_buckets, name='out_of_range'),
          array_ops.fill(
              dims=array_ops.shape(values),
              value=math_ops.cast(self.default_value, dtypes.int64),
              name='default_values'), values)
    return sparse_tensor_lib.SparseTensor(
        indices=input_tensor.indices,
        values=values,
        dense_shape=input_tensor.dense_shape)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return self.num_buckets

  def _get_sparse_tensors(self,
                          inputs,
                          weight_collections=None,
                          trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)

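# Editorial illustration (hedged, not original code): identity columns use the
# raw integer as the bucket id. With num_buckets=4 and default_value=0, the
# where/fill logic above rewrites out-of-range ids:
#
#   input values:  [1, 5, -2, 3]
#   output values: [1, 0,  0, 3]   # 5 and -2 fall outside [0, 4)
#
# With default_value=None, values pass through unvalidated.
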

class _WeightedCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple(
        '_WeightedCategoricalColumn',
        ('categorical_column', 'weight_feature_key', 'dtype'))):
  """See `weighted_categorical_column`."""

  @property
  def name(self):
    return '{}_weighted_by_{}'.format(self.categorical_column.name,
                                      self.weight_feature_key)

  @property
  def _parse_example_spec(self):
    config = self.categorical_column._parse_example_spec  # pylint: disable=protected-access
    if self.weight_feature_key in config:
      raise ValueError('Parse config {} already exists for {}.'.format(
          config[self.weight_feature_key], self.weight_feature_key))
    config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype)
    return config

  @property
  def _num_buckets(self):
    return self.categorical_column._num_buckets  # pylint: disable=protected-access

  def _transform_feature(self, inputs):
    weight_tensor = inputs.get(self.weight_feature_key)
    if weight_tensor is None:
      raise ValueError('Missing weights {}.'.format(self.weight_feature_key))
    weight_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
        weight_tensor)
    if self.dtype != weight_tensor.dtype.base_dtype:
      raise ValueError('Bad dtype, expected {}, but got {}.'.format(
          self.dtype, weight_tensor.dtype))
    if not isinstance(weight_tensor, sparse_tensor_lib.SparseTensor):
      # The weight tensor can be a regular Tensor. In this case, sparsify it.
      weight_tensor = _to_sparse_input_and_drop_ignore_values(
          weight_tensor, ignore_value=0.0)
    if not weight_tensor.dtype.is_floating:
      weight_tensor = math_ops.cast(weight_tensor, dtypes.float32)
    return (inputs.get(self.categorical_column), weight_tensor)

  def _get_sparse_tensors(self,
                          inputs,
                          weight_collections=None,
                          trainable=None):
    del weight_collections
    del trainable
    tensors = inputs.get(self)
    return _CategoricalColumn.IdWeightPair(tensors[0], tensors[1])

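# Editorial sketch (hedged, not original code): a weighted column pairs each
# categorical id with a float weight read from a second feature, so its parse
# spec carries two keys (the names below are illustrative):
#
#   {'terms': parsing_ops.VarLenFeature(dtypes.string),
#    'frequencies': parsing_ops.VarLenFeature(dtypes.float32)}
#
# `_get_sparse_tensors` then returns IdWeightPair(ids, weights), letting
# linear models and embedding combiners scale each id's contribution.
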

class _CrossedColumn(
    _CategoricalColumn,
    collections.namedtuple('_CrossedColumn',
                           ['keys', 'hash_bucket_size', 'hash_key'])):
  """See `crossed_column`."""

  @property
  def name(self):
    feature_names = []
    for key in _collect_leaf_level_keys(self):
      if isinstance(key, _FeatureColumn):
        feature_names.append(key.name)
      else:  # key must be a string
        feature_names.append(key)
    return '_X_'.join(sorted(feature_names))

  @property
  def _parse_example_spec(self):
    config = {}
    for key in self.keys:
      if isinstance(key, _FeatureColumn):
        config.update(key._parse_example_spec)  # pylint: disable=protected-access
      else:  # key must be a string
        config.update({key: parsing_ops.VarLenFeature(dtypes.string)})
    return config

  def _transform_feature(self, inputs):
    feature_tensors = []
    for key in _collect_leaf_level_keys(self):
      if isinstance(key, six.string_types):
        feature_tensors.append(inputs.get(key))
      elif isinstance(key, _CategoricalColumn):
        ids_and_weights = key._get_sparse_tensors(inputs)  # pylint: disable=protected-access
        if ids_and_weights.weight_tensor is not None:
          raise ValueError(
              'crossed_column does not support weight_tensor, but the given '
              'column populates weight_tensor. '
              'Given column: {}'.format(key.name))
        feature_tensors.append(ids_and_weights.id_tensor)
      else:
        raise ValueError('Unsupported column type. Given: {}'.format(key))
    return sparse_ops.sparse_cross_hashed(
        inputs=feature_tensors,
        num_buckets=self.hash_bucket_size,
        hash_key=self.hash_key)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return self.hash_bucket_size

  def _get_sparse_tensors(self,
                          inputs,
                          weight_collections=None,
                          trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)

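# Editorial sketch (hedged, not original code): crossing 'department' with a
# bucketized age column named 'age_bucketized' yields the column name
# 'age_bucketized_X_department' (sorted leaf names), and the transform reduces
# to something like:
#
#   crossed_ids = sparse_ops.sparse_cross_hashed(
#       inputs=[dept_ids, age_bucket_ids],  # hypothetical id tensors
#       num_buckets=1000, hash_key=None)
#   # each (department, age_bucket) pair hashes to one id in [0, 1000).
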

def _collect_leaf_level_keys(cross):
  """Collects base keys by expanding all nested crosses.

  Args:
    cross: A `_CrossedColumn`.

  Returns:
    A list of strings or `_CategoricalColumn` instances.
  """
  leaf_level_keys = []
  for k in cross.keys:
    if isinstance(k, _CrossedColumn):
      leaf_level_keys.extend(_collect_leaf_level_keys(k))
    else:
      leaf_level_keys.append(k)
  return leaf_level_keys

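# Editorial illustration (hedged, not original code): nested crosses flatten
# to their leaves, so the name and parse spec of `_CrossedColumn` only ever
# see base keys or non-crossed columns:
#
#   inner = _CrossedColumn(keys=('b', 'c'), hash_bucket_size=10, hash_key=None)
#   outer = _CrossedColumn(keys=('a', inner), hash_bucket_size=10,
#                          hash_key=None)
#   _collect_leaf_level_keys(outer)  # -> ['a', 'b', 'c']
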

class _IndicatorColumn(_DenseColumn, _SequenceDenseColumn,
                       collections.namedtuple('_IndicatorColumn',
                                              ['categorical_column'])):
  """Represents a one-hot column for use in deep networks.

  Args:
    categorical_column: A `_CategoricalColumn` which is created by a
      `categorical_column_with_*` function.
  """

  @property
  def name(self):
    return '{}_indicator'.format(self.categorical_column.name)

  def _transform_feature(self, inputs):
    """Returns a dense `Tensor` representing the feature.

    Args:
      inputs: A `_LazyBuilder` object to access inputs.

    Returns:
      Transformed feature `Tensor`.

    Raises:
      ValueError: if input rank is not known at graph building time.
    """
    id_weight_pair = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
    id_tensor = id_weight_pair.id_tensor
    weight_tensor = id_weight_pair.weight_tensor

    # If the underlying column is weighted, return the input as a dense tensor.
    if weight_tensor is not None:
      weighted_column = sparse_ops.sparse_merge(
          sp_ids=id_tensor,
          sp_values=weight_tensor,
          vocab_size=int(self._variable_shape[-1]))
      # Remove (?, -1) index.
      weighted_column = sparse_ops.sparse_slice(weighted_column, [0, 0],
                                                weighted_column.dense_shape)
      # Use scatter_nd to merge duplicated indices, if any, instead of
      # sparse_tensor_to_dense.
      return array_ops.scatter_nd(weighted_column.indices,
                                  weighted_column.values,
                                  weighted_column.dense_shape)

    dense_id_tensor = sparse_ops.sparse_tensor_to_dense(
        id_tensor, default_value=-1)

    # The one-hot tensor must be float, since all other inputs to input_layer
    # are float32 and will be concatenated with it.
    one_hot_id_tensor = array_ops.one_hot(
        dense_id_tensor,
        depth=self._variable_shape[-1],
        on_value=1.0,
        off_value=0.0)

    # Reduce to get a multi-hot representation per example.
    return math_ops.reduce_sum(one_hot_id_tensor, axis=[-2])

  @property
  def _parse_example_spec(self):
    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access

  @property
  def _variable_shape(self):
    """Returns a `TensorShape` representing the shape of the dense `Tensor`."""
    return tensor_shape.TensorShape([1, self.categorical_column._num_buckets])  # pylint: disable=protected-access

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    """Returns a dense `Tensor` representing the feature.

    Args:
      inputs: A `_LazyBuilder` object to access inputs.
      weight_collections: Unused `weight_collections` since no variables are
        created in this function.
      trainable: Unused `trainable` bool since no variables are created in this
        function.

    Returns:
      Dense `Tensor` created within `_transform_feature`.

    Raises:
      ValueError: If `categorical_column` is a `_SequenceCategoricalColumn`.
    """
    # Do nothing with weight_collections and trainable since no variables are
    # created in this function.
    del weight_collections
    del trainable
    if isinstance(self.categorical_column, _SequenceCategoricalColumn):
      raise ValueError(
          'In indicator_column: {}. '
          'categorical_column must not be of type _SequenceCategoricalColumn. '
          'Suggested fix A: If you wish to use input_layer, use a '
          'non-sequence categorical_column_with_*. '
          'Suggested fix B: If you wish to create sequence input, use '
          'sequence_input_layer instead of input_layer. '
          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                       self.categorical_column))
    # The feature has already been transformed. Return the intermediate
    # representation created by _transform_feature.
    return inputs.get(self)

  def _get_sequence_dense_tensor(self,
                                 inputs,
                                 weight_collections=None,
                                 trainable=None):
    # Do nothing with weight_collections and trainable since no variables are
    # created in this function.
    del weight_collections
    del trainable
    if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
      raise ValueError(
          'In indicator_column: {}. '
          'categorical_column must be of type _SequenceCategoricalColumn '
          'to use sequence_input_layer. '
          'Suggested fix: Use one of sequence_categorical_column_with_*. '
          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                       self.categorical_column))
    # The feature has already been transformed. Return the intermediate
    # representation created by _transform_feature.
    dense_tensor = inputs.get(self)
    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
    sequence_length = fc_utils.sequence_length_from_sparse_tensor(
        sparse_tensors.id_tensor)
    return _SequenceDenseColumn.TensorSequenceLengthPair(
        dense_tensor=dense_tensor, sequence_length=sequence_length)

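# Editorial illustration (hedged, not original code): for an unweighted column
# with 3 buckets, the one_hot/reduce_sum pipeline above produces a per-example
# multi-hot count vector; missing entries become -1 in the dense id tensor and
# one-hot to all zeros:
#
#   dense ids: [[1, 2],     multi-hot: [[0., 1., 1.],
#               [2, -1]]                [0., 0., 1.]]
#
# Repeated ids accumulate, so [[2, 2]] would yield [[0., 0., 2.]].
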

def _verify_static_batch_size_equality(tensors, columns):
  """Validates that the first dim (batch size) of all tensors is equal or None.

  Args:
    tensors: list of tensors to check.
    columns: list of feature columns matching tensors. Will be used for error
      messaging.

  Raises:
    ValueError: if the tensors have different static batch sizes.
  """
  # expected_batch_size is a tf.compat.v1.Dimension object.
  expected_batch_size = None
  for i in range(len(tensors)):
    if tensors[i].shape.dims[0].value is not None:
      if expected_batch_size is None:
        batch_size_column_index = i
        expected_batch_size = tensors[i].shape.dims[0]
      elif not expected_batch_size.is_compatible_with(tensors[i].shape.dims[0]):
        raise ValueError(
            'Batch size (first dimension) of each feature must be the same. '
            'Batch size of columns ({}, {}): ({}, {})'.format(
                columns[batch_size_column_index].name, columns[i].name,
                expected_batch_size, tensors[i].shape.dims[0]))

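# Editorial illustration (hedged, not original code): only statically known
# batch dimensions are compared, so static shapes (None, 4) and (32, 4) pass,
# while (32, 4) and (16, 4) raise the ValueError above, naming both columns.
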

class _SequenceCategoricalColumn(_CategoricalColumn,
                                 collections.namedtuple(
                                     '_SequenceCategoricalColumn',
                                     ['categorical_column'])):
  """Represents sequences of categorical data."""

  @property
  def name(self):
    return self.categorical_column.name

  @property
  def _parse_example_spec(self):
    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access

  def _transform_feature(self, inputs):
    return self.categorical_column._transform_feature(inputs)  # pylint: disable=protected-access

  @property
  def _num_buckets(self):
    return self.categorical_column._num_buckets  # pylint: disable=protected-access

  def _get_sparse_tensors(self,
                          inputs,
                          weight_collections=None,
                          trainable=None):
    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
    id_tensor = sparse_tensors.id_tensor
    weight_tensor = sparse_tensors.weight_tensor

    # Expands the third dimension, if necessary, so that embeddings are not
    # combined during the embedding lookup. If the tensor is already 3D, it is
    # left as-is.
    shape = array_ops.shape(id_tensor)
    # Compute the third dimension explicitly instead of setting it to -1, as
    # -1 does not work for dynamically shaped tensors with 0 length at
    # runtime, which happens for empty sequences.
    target_shape = [shape[0], shape[1], math_ops.reduce_prod(shape[2:])]
    id_tensor = sparse_ops.sparse_reshape(id_tensor, target_shape)
    if weight_tensor is not None:
      weight_tensor = sparse_ops.sparse_reshape(weight_tensor, target_shape)

    return _CategoricalColumn.IdWeightPair(id_tensor, weight_tensor)
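
# Editorial illustration (hedged, not original code): the reshape above
# guarantees rank-3 ids of shape [batch, max_seq_len, num_elements] so the
# embedding lookup combines only over the last dimension:
#
#   dense_shape [2, 5]       -> [2, 5, 1]   (reduce_prod over no dims is 1)
#   dense_shape [2, 5, 3, 4] -> [2, 5, 12]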