# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""This API defines the FeatureColumn abstraction.

FeatureColumns provide a high level abstraction for ingesting and representing
features. FeatureColumns are also the primary way of encoding features for
canned `tf.estimator.Estimator`s.

When using FeatureColumns with `Estimators`, the type of feature column you
should choose depends on (1) the feature type and (2) the model type.

1. Feature type:

  * Continuous features can be represented by `numeric_column`.
  * Categorical features can be represented by any `categorical_column_with_*`
    column:
    - `categorical_column_with_vocabulary_list`
    - `categorical_column_with_vocabulary_file`
    - `categorical_column_with_hash_bucket`
    - `categorical_column_with_identity`
    - `weighted_categorical_column`

2. Model type:

  * Deep neural network models (`DNNClassifier`, `DNNRegressor`).

    Continuous features can be directly fed into deep neural network models.

      age_column = numeric_column("age")

    To feed sparse features into DNN models, wrap the column with
    `embedding_column` or `indicator_column`. `indicator_column` is recommended
    for features with only a few possible values. For features with many
    possible values, `embedding_column` is recommended, to reduce the size of
    your model.

      embedded_dept_column = embedding_column(
          categorical_column_with_vocabulary_list(
              "department", ["math", "philosophy", ...]), dimension=10)

  * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).

    Sparse features can be fed directly into linear models. They behave like an
    indicator column but with an efficient implementation.

      dept_column = categorical_column_with_vocabulary_list("department",
          ["math", "philosophy", "english"])

    It is recommended that continuous features be bucketized before being
    fed into linear models.

      bucketized_age_column = bucketized_column(
          source_column=age_column,
          boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

    Sparse features can be crossed (also known as conjoined or combined) in
    order to form non-linearities, and then fed into linear models.

      cross_dept_age_column = crossed_column(
          columns=["department", bucketized_age_column],
          hash_bucket_size=1000)

Example of building canned `Estimator`s using FeatureColumns:

  ```python
  # Define features and transformations
  deep_feature_columns = [age_column, embedded_dept_column]
  wide_feature_columns = [dept_column, bucketized_age_column,
                          cross_dept_age_column]

  # Build deep model
  estimator = DNNClassifier(
      feature_columns=deep_feature_columns,
      hidden_units=[500, 250, 50])
  estimator.train(...)

  # Or build a wide model
  estimator = LinearClassifier(
      feature_columns=wide_feature_columns)
  estimator.train(...)

  # Or build a wide and deep model!
  estimator = DNNLinearCombinedClassifier(
      linear_feature_columns=wide_feature_columns,
      dnn_feature_columns=deep_feature_columns,
      dnn_hidden_units=[500, 250, 50])
  estimator.train(...)
  ```

FeatureColumns can also be transformed into a generic input layer for
custom models using `input_layer`.

Example of building a model using FeatureColumns; this can be used in a
`model_fn` which is given to the `tf.estimator.Estimator`:

  ```python
  # Building model via layers

  deep_feature_columns = [age_column, embedded_dept_column]
  columns_to_tensor = parse_feature_columns_from_examples(
      serialized=my_data,
      feature_columns=deep_feature_columns)
  first_layer = input_layer(
      features=columns_to_tensor,
      feature_columns=deep_feature_columns)
  second_layer = fully_connected(first_layer, ...)
  ```

NOTE: Functions prefixed with "_" indicate experimental or private parts of
the API subject to change, and should not be relied upon!

NOTE: The new feature columns are being developed in feature_column_v2.py and
largely duplicate the code here. Please make sure to update logic in both
places.
"""

import abc
import collections
import math

import numpy as np
import six

from tensorflow.python.eager import context
from tensorflow.python.feature_column import utils as fc_utils
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
from tensorflow.python.framework import tensor_shape
from tensorflow.python.layers import base
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import array_ops_stack
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import cond
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import lookup_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import parsing_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.ops import string_ops
from tensorflow.python.ops import template
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables
from tensorflow.python.platform import gfile
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training import checkpoint_utils
from tensorflow.python.util import deprecation
from tensorflow.python.util import nest
from tensorflow.python.util.compat import collections_abc
from tensorflow.python.util.tf_export import tf_export
from tensorflow.tools.docs import doc_controls

_FEATURE_COLUMN_DEPRECATION_WARNING = """\
    Warning: tf.feature_column is not recommended for new code. Instead,
    feature preprocessing can be done directly using either [Keras preprocessing
    layers](https://www.tensorflow.org/guide/migrate/migrating_feature_columns)
    or through the one-stop utility [`tf.keras.utils.FeatureSpace`](https://www.tensorflow.org/api_docs/python/tf/keras/utils/FeatureSpace)
    built on top of them. See the [migration guide](https://tensorflow.org/guide/migrate)
    for details.
    """

_FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING = (
    'Use Keras preprocessing layers instead, either directly or via the '
    '`tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has '
    'a functional equivalent in `tf.keras.layers` for feature preprocessing '
    'when training a Keras model.')
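
# For illustration only, per the migration pointers above: a minimal sketch of
# a Keras-layers replacement for an embedded categorical feature. The
# vocabulary and the `raw_department_strings` tensor are hypothetical, and
# this is one possible migration, not the only one:
#
#   lookup = tf.keras.layers.StringLookup(vocabulary=['math', 'philosophy'])
#   embed = tf.keras.layers.Embedding(
#       input_dim=lookup.vocabulary_size(), output_dim=10)
#   dense_features = embed(lookup(raw_department_strings))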


def _internal_input_layer(features,
                          feature_columns,
                          weight_collections=None,
                          trainable=True,
                          cols_to_vars=None,
                          scope=None,
                          cols_to_output_tensors=None,
                          from_template=False):
  """See input_layer. `scope` is a name or variable scope to use."""

  feature_columns = _normalize_feature_columns(feature_columns)
  for column in feature_columns:
    if not isinstance(column, _DenseColumn):
      raise ValueError(
          'Items of feature_columns must be a _DenseColumn. '
          'You can wrap a categorical column with an '
          'embedding_column or indicator_column. Given: {}'.format(column))
  weight_collections = list(weight_collections or [])
  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)

  def _get_logits():  # pylint: disable=missing-docstring
    builder = _LazyBuilder(features)
    output_tensors = []
    ordered_columns = []
    for column in sorted(feature_columns, key=lambda x: x.name):
      ordered_columns.append(column)
      with variable_scope.variable_scope(
          None, default_name=column._var_scope_name):  # pylint: disable=protected-access
        tensor = column._get_dense_tensor(  # pylint: disable=protected-access
            builder,
            weight_collections=weight_collections,
            trainable=trainable)
        num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
        batch_size = array_ops.shape(tensor)[0]
        output_tensor = array_ops.reshape(
            tensor, shape=(batch_size, num_elements))
        output_tensors.append(output_tensor)
        if cols_to_vars is not None:
          # Retrieve any variables created (some _DenseColumns don't create
          # variables, in which case an empty list is returned).
          cols_to_vars[column] = ops.get_collection(
              ops.GraphKeys.GLOBAL_VARIABLES,
              scope=variable_scope.get_variable_scope().name)
        if cols_to_output_tensors is not None:
          cols_to_output_tensors[column] = output_tensor
    _verify_static_batch_size_equality(output_tensors, ordered_columns)
    return array_ops.concat(output_tensors, 1)

  # If we're constructing from `make_template`, it by default adds a variable
  # scope with the name of the layer. In that case, we don't want to add
  # another `variable_scope`, as that would break checkpoints.
  if from_template:
    return _get_logits()
  else:
    with variable_scope.variable_scope(
        scope, default_name='input_layer', values=features.values()):
      return _get_logits()


@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export(v1=['feature_column.input_layer'])
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def input_layer(features,
                feature_columns,
                weight_collections=None,
                trainable=True,
                cols_to_vars=None,
                cols_to_output_tensors=None):
  """Returns a dense `Tensor` as input layer based on given `feature_columns`.

  Generally a single example in training data is described with
  FeatureColumns. At the first layer of the model, this column-oriented data
  should be converted to a single `Tensor`.

  Example:

  ```python
  price = numeric_column('price')
  keywords_embedded = embedding_column(
      categorical_column_with_hash_bucket("keywords", 10000), dimension=16)
  columns = [price, keywords_embedded, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  for units in [128, 64, 32]:
    dense_tensor = tf.compat.v1.layers.dense(dense_tensor, units, tf.nn.relu)
  prediction = tf.compat.v1.layers.dense(dense_tensor, 1)
  ```

  Args:
    features: A mapping from key to tensors. `_FeatureColumn`s look up via
      these keys. For example `numeric_column('price')` will look at the
      'price' key in this dict. Values can be a `SparseTensor` or a `Tensor`
      depending on the corresponding `_FeatureColumn`.
    feature_columns: An iterable containing the FeatureColumns to use as
      inputs to your model. All items should be instances of classes derived
      from `_DenseColumn` such as `numeric_column`, `embedding_column`,
      `bucketized_column`, `indicator_column`. If you have categorical
      features, you can wrap them with an `embedding_column` or
      `indicator_column`.
    weight_collections: A list of collection names to which the Variable will
      be added. Note that variables will also be added to collections
      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
    trainable: If `True` also add the variable to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    cols_to_vars: If not `None`, must be a dictionary that will be filled with
      a mapping from `_FeatureColumn` to list of `Variable`s. For example,
      after the call, we might have
      cols_to_vars = {
          _EmbeddingColumn(
              categorical_column=_HashedCategoricalColumn(
                  key='sparse_feature', hash_bucket_size=5, dtype=tf.string),
              dimension=10): [<tf.Variable 'some_variable:0' shape=(5, 10)>,
                              <tf.Variable 'some_variable:1' shape=(5, 10)>]}
      If a column creates no variables, its value will be an empty list.
    cols_to_output_tensors: If not `None`, must be a dictionary that will be
      filled with a mapping from '_FeatureColumn' to the associated output
      `Tensor`s.

  Returns:
    A `Tensor` which represents the input layer of a model. Its shape
    is (batch_size, first_layer_dimension) and its dtype is `float32`.
    first_layer_dimension is determined based on the given `feature_columns`.

  Raises:
    ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
  """
  return _internal_input_layer(
      features,
      feature_columns,
      weight_collections=weight_collections,
      trainable=trainable,
      cols_to_vars=cols_to_vars,
      cols_to_output_tensors=cols_to_output_tensors)


# TODO(akshayka): InputLayer should be a subclass of Layer, and it
# should implement the logic in input_layer using Layer's build-and-call
# paradigm; input_layer should create an instance of InputLayer and
# return the result of invoking its apply method, just as functional layers do.
class InputLayer(object):
  """An object-oriented version of `input_layer` that reuses variables."""

  def __init__(self,
               feature_columns,
               weight_collections=None,
               trainable=True,
               cols_to_vars=None,
               name='feature_column_input_layer',
               create_scope_now=True):
    """See `input_layer`."""

    self._feature_columns = feature_columns
    self._weight_collections = weight_collections
    self._trainable = trainable
    self._cols_to_vars = cols_to_vars
    self._name = name
    self._input_layer_template = template.make_template(
        self._name, _internal_input_layer, create_scope_now_=create_scope_now)
    self._scope = self._input_layer_template.variable_scope

  def __call__(self, features):
    return self._input_layer_template(
        features=features,
        feature_columns=self._feature_columns,
        weight_collections=self._weight_collections,
        trainable=self._trainable,
        cols_to_vars=None,
        from_template=True)

  @property
  def name(self):
    return self._name

  @property
  def non_trainable_variables(self):
    return self._input_layer_template.non_trainable_variables

  @property
  def non_trainable_weights(self):
    return self._input_layer_template.non_trainable_weights

  @property
  def trainable_variables(self):
    return self._input_layer_template.trainable_variables

  @property
  def trainable_weights(self):
    return self._input_layer_template.trainable_weights

  @property
  def variables(self):
    return self._input_layer_template.variables

  @property
  def weights(self):
    return self._input_layer_template.weights
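
# A minimal usage sketch (illustrative; the feature columns are elided):
# because `InputLayer` is backed by `make_template`, calling the same instance
# on different feature dicts reuses one set of variables:
#
#   layer = InputLayer(feature_columns=[...])
#   train_input = layer(train_features)
#   eval_input = layer(eval_features)  # reuses the same weight variables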


@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export(v1=['feature_column.linear_model'])
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def linear_model(features,
                 feature_columns,
                 units=1,
                 sparse_combiner='sum',
                 weight_collections=None,
                 trainable=True,
                 cols_to_vars=None):
  """Returns a linear prediction `Tensor` based on given `feature_columns`.

  This function generates a weighted sum based on output dimension `units`.
  Weighted sum refers to logits in classification problems. It refers to the
  prediction itself for linear regression problems.

  Note on supported columns: `linear_model` treats categorical columns as
  `indicator_column`s. To be specific, assume the input as `SparseTensor`
  looks like:

  ```python
  shape = [2, 2]
  {
      [0, 0]: "a"
      [1, 0]: "b"
      [1, 1]: "c"
  }
  ```
  `linear_model` assigns weights for the presence of "a", "b", "c" implicitly,
  just like `indicator_column`, while `input_layer` explicitly requires
  wrapping each categorical column with an `embedding_column` or an
  `indicator_column`.

  Example of usage:

  ```python
  price = numeric_column('price')
  price_buckets = bucketized_column(price, boundaries=[0., 10., 100., 1000.])
  keywords = categorical_column_with_hash_bucket("keywords", 10000)
  keywords_price = crossed_column(['keywords', price_buckets], ...)
  columns = [price_buckets, keywords, keywords_price, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  prediction = linear_model(features, columns)
  ```

  The `sparse_combiner` argument works as follows. For example, for two
  features represented as the categorical columns:

  ```python
  # Feature 1

  shape = [2, 2]
  {
      [0, 0]: "a"
      [0, 1]: "b"
      [1, 0]: "c"
  }

  # Feature 2

  shape = [2, 3]
  {
      [0, 0]: "d"
      [1, 0]: "e"
      [1, 1]: "f"
      [1, 2]: "f"
  }
  ```

  with `sparse_combiner` set to "mean", the resulting linear model outputs
  are:

  ```
  y_0 = 1.0 / 2.0 * ( w_a + w_b ) + w_d + b
  y_1 = w_c + 1.0 / 3.0 * ( w_e + 2.0 * w_f ) + b
  ```

  where `y_i` is the output, `b` is the bias, and `w_x` is the weight
  assigned to the presence of `x` in the input features.
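
  As an illustrative numeric check (with hypothetical weights, not values
  produced by the library): taking `w_a = 1.0`, `w_b = 3.0`, `w_d = 5.0` and
  `b = 0.0`, the first output above is
  `y_0 = 0.5 * (1.0 + 3.0) + 5.0 + 0.0 = 7.0`.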


  Args:
    features: A mapping from key to tensors. `_FeatureColumn`s look up via
      these keys. For example `numeric_column('price')` will look at the
      'price' key in this dict. Values are `Tensor` or `SparseTensor`
      depending on the corresponding `_FeatureColumn`.
    feature_columns: An iterable containing the FeatureColumns to use as
      inputs to your model. All items should be instances of classes derived
      from `_FeatureColumn`.
    units: An integer, dimensionality of the output space. Default value is 1.
    sparse_combiner: A string specifying how to reduce if a categorical column
      is multivalent. Except for `numeric_column`, almost all columns passed
      to `linear_model` are treated as categorical columns. It combines each
      categorical column independently. Currently "mean", "sqrtn" and "sum"
      are supported, with "sum" the default for linear model. "sqrtn" often
      achieves good accuracy, in particular with bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
    weight_collections: A list of collection names to which the Variable will
      be added. Note that variables will also be added to collections
      `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
    trainable: If `True` also add the variable to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    cols_to_vars: If not `None`, must be a dictionary that will be filled with
      a mapping from `_FeatureColumn` to the associated list of `Variable`s.
      For example, after the call, we might have
      cols_to_vars = {
          _NumericColumn(key='numeric_feature1', shape=(1,)):
              [<tf.Variable 'linear_model/price2/weights:0' shape=(1, 1)>],
          'bias': [<tf.Variable 'linear_model/bias_weights:0' shape=(1,)>],
          _NumericColumn(key='numeric_feature2', shape=(2,)):
              [<tf.Variable 'linear_model/price1/weights:0' shape=(2, 1)>]}
      If a column creates no variables, its value will be an empty list. Note
      that cols_to_vars will also contain a string key 'bias' that maps to a
      list of Variables.

  Returns:
    A `Tensor` which represents predictions/logits of a linear model. Its
    shape is (batch_size, units) and its dtype is `float32`.

  Raises:
    ValueError: if an item in `feature_columns` is neither a `_DenseColumn`
      nor `_CategoricalColumn`.
  """
  with variable_scope.variable_scope(None, 'linear_model') as vs:
    model_name = _strip_leading_slashes(vs.name)
  linear_model_layer = _LinearModel(
      feature_columns=feature_columns,
      units=units,
      sparse_combiner=sparse_combiner,
      weight_collections=weight_collections,
      trainable=trainable,
      name=model_name)
  retval = linear_model_layer(features)  # pylint: disable=not-callable
  if cols_to_vars is not None:
    cols_to_vars.update(linear_model_layer.cols_to_vars())
  return retval
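
# A minimal usage sketch (illustrative; `features` and `columns` are as in the
# docstring example above): retrieving per-column weight variables after the
# call, including the 'bias' entry documented above.
#
#   cols_to_vars = {}
#   prediction = linear_model(features, columns, cols_to_vars=cols_to_vars)
#   bias_vars = cols_to_vars['bias']  # list containing the bias variable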


def _add_to_collections(var, weight_collections):
  """Adds a var to the list of weight_collections provided.

  Handles the case for partitioned and non-partitioned variables.

  Args:
    var: A variable or Partitioned Variable.
    weight_collections: List of collections to add variable to.
  """
  for weight_collection in weight_collections:
    # The layer self.add_variable call already adds it to GLOBAL_VARIABLES.
    if weight_collection == ops.GraphKeys.GLOBAL_VARIABLES:
      continue
    # TODO(rohanj): Explore adding a _get_variable_list method on `Variable`
    # so that we don't have to do this check.
    if isinstance(var, variables.PartitionedVariable):
      for constituent_var in list(var):
        ops.add_to_collection(weight_collection, constituent_var)
    else:
      ops.add_to_collection(weight_collection, var)


class _FCLinearWrapper(base.Layer):
  """Wraps a _FeatureColumn in a layer for use in a linear model.

  See `linear_model` above.
  """

  def __init__(self,
               feature_column,
               units=1,
               sparse_combiner='sum',
               weight_collections=None,
               trainable=True,
               name=None,
               **kwargs):
    super(_FCLinearWrapper, self).__init__(
        trainable=trainable, name=name, **kwargs)
    self._feature_column = feature_column
    self._units = units
    self._sparse_combiner = sparse_combiner
    self._weight_collections = weight_collections

  def build(self, _):
    if isinstance(self._feature_column, _CategoricalColumn):
      weight = self.add_variable(
          name='weights',
          shape=(self._feature_column._num_buckets, self._units),  # pylint: disable=protected-access
          initializer=init_ops.zeros_initializer(),
          trainable=self.trainable)
    else:
      num_elements = self._feature_column._variable_shape.num_elements()  # pylint: disable=protected-access
      weight = self.add_variable(
          name='weights',
          shape=[num_elements, self._units],
          initializer=init_ops.zeros_initializer(),
          trainable=self.trainable)
    _add_to_collections(weight, self._weight_collections)
    self._weight_var = weight
    self.built = True

  def call(self, builder):
    weighted_sum = _create_weighted_sum(
        column=self._feature_column,
        builder=builder,
        units=self._units,
        sparse_combiner=self._sparse_combiner,
        weight_collections=self._weight_collections,
        trainable=self.trainable,
        weight_var=self._weight_var)
    return weighted_sum


class _BiasLayer(base.Layer):
  """A layer for the bias term."""

  def __init__(self,
               units=1,
               trainable=True,
               weight_collections=None,
               name=None,
               **kwargs):
    super(_BiasLayer, self).__init__(trainable=trainable, name=name, **kwargs)
    self._units = units
    self._weight_collections = weight_collections

  def build(self, _):
    self._bias_variable = self.add_variable(
        'bias_weights',
        shape=[self._units],
        initializer=init_ops.zeros_initializer(),
        trainable=self.trainable)
    _add_to_collections(self._bias_variable, self._weight_collections)
    self.built = True

  def call(self, _):
    return self._bias_variable


def _get_expanded_variable_list(variable):
  if (isinstance(variable, variables.Variable) or
      resource_variable_ops.is_resource_variable(variable)):
    return [variable]  # Single variable case.
  else:  # Must be a PartitionedVariable, so convert into a list.
    return list(variable)


def _strip_leading_slashes(name):
  # Keep only the last path component, e.g. 'linear_model/scope' -> 'scope'.
  return name.rsplit('/', 1)[-1]


class _LinearModel(base.Layer):
  """Creates a linear model using feature columns.

  See `linear_model` for details.
  """

  def __init__(self,
               feature_columns,
               units=1,
               sparse_combiner='sum',
               weight_collections=None,
               trainable=True,
               name=None,
               **kwargs):
    super(_LinearModel, self).__init__(name=name, **kwargs)
    # We force the keras_style to be True here, as a workaround for not being
    # able to inherit from keras.layers.Layer as the base class. Setting this
    # will let us skip all the legacy behavior for base.Layer.
    # Also note that we use Layer as the base class, instead of Model, since
    # no Model-specific behavior is used, e.g. compile/fit.
    self._keras_style = True
    self._feature_columns = _normalize_feature_columns(feature_columns)
    self._weight_collections = list(weight_collections or [])
    if ops.GraphKeys.GLOBAL_VARIABLES not in self._weight_collections:
      self._weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
    if ops.GraphKeys.MODEL_VARIABLES not in self._weight_collections:
      self._weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)

    column_layers = {}
    for column in sorted(self._feature_columns, key=lambda x: x.name):
      with variable_scope.variable_scope(
          None, default_name=column._var_scope_name) as vs:  # pylint: disable=protected-access
        # Having the fully expressed variable scope name ends up doubly
        # expressing the outer scope (scope with which this method was called)
        # in the name of the variable that would get created.
        column_name = _strip_leading_slashes(vs.name)
        column_layer = _FCLinearWrapper(column, units, sparse_combiner,
                                        self._weight_collections, trainable,
                                        column_name, **kwargs)
        column_layers[column_name] = column_layer
    self._column_layers = self._add_layers(column_layers)
    self._bias_layer = _BiasLayer(
        units=units,
        trainable=trainable,
        weight_collections=self._weight_collections,
        name='bias_layer',
        **kwargs)
    self._cols_to_vars = {}

  def cols_to_vars(self):
    """Returns a dict mapping _FeatureColumns to variables.

    See `linear_model` for more information.
    This is not populated until `call` is invoked, i.e. until the layer is
    built.
    """
    return self._cols_to_vars

  def call(self, features):
    with variable_scope.variable_scope(self.name):
      for column in self._feature_columns:
        if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
          raise ValueError(
              'Items of feature_columns must be either a '
              '_DenseColumn or _CategoricalColumn. Given: {}'.format(column))
      weighted_sums = []
      ordered_columns = []
      builder = _LazyBuilder(features)
      for layer in sorted(self._column_layers.values(), key=lambda x: x.name):
        column = layer._feature_column  # pylint: disable=protected-access
        ordered_columns.append(column)
        weighted_sum = layer(builder)
        weighted_sums.append(weighted_sum)
        self._cols_to_vars[column] = ops.get_collection(
            ops.GraphKeys.GLOBAL_VARIABLES, scope=layer.scope_name)

      _verify_static_batch_size_equality(weighted_sums, ordered_columns)
      predictions_no_bias = math_ops.add_n(
          weighted_sums, name='weighted_sum_no_bias')
      predictions = nn_ops.bias_add(
          predictions_no_bias,
          self._bias_layer(  # pylint: disable=not-callable
              builder,
              scope=variable_scope.get_variable_scope()),  # pylint: disable=not-callable
          name='weighted_sum')
      bias = self._bias_layer.variables[0]
      self._cols_to_vars['bias'] = _get_expanded_variable_list(bias)
    return predictions

  def _add_layers(self, layers):
    # "Magic" required for keras.Model classes to track all the variables in
    # a list of layers.Layer objects.
    # TODO(ashankar): Figure out API so user code doesn't have to do this.
    for name, layer in layers.items():
      setattr(self, 'layer-%s' % name, layer)
    return layers


def _transform_features(features, feature_columns):
  """Returns transformed features based on the feature columns passed in.

  Note that you most likely will not need to use this function directly;
  check `input_layer` and `linear_model` first to see whether they satisfy
  your use case.

  Example:

  ```python
  # Define features and transformations
  crosses_a_x_b = crossed_column(
      columns=["sparse_feature_a", "sparse_feature_b"], hash_bucket_size=10000)
  price_buckets = bucketized_column(
      source_column=numeric_column("price"), boundaries=[...])

  columns = [crosses_a_x_b, price_buckets]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  transformed = transform_features(features=features, feature_columns=columns)

  assertCountEqual(columns, transformed.keys())
  ```

  Args:
    features: A mapping from key to tensors. `_FeatureColumn`s look up via
      these keys. For example `numeric_column('price')` will look at the
      'price' key in this dict. Values can be a `SparseTensor` or a `Tensor`
      depending on the corresponding `_FeatureColumn`.
    feature_columns: An iterable containing all the `_FeatureColumn`s.

  Returns:
    A `dict` mapping `_FeatureColumn` to `Tensor` and `SparseTensor` values.
  """
  feature_columns = _normalize_feature_columns(feature_columns)
  outputs = {}
  with ops.name_scope(
      None, default_name='transform_features', values=features.values()):
    builder = _LazyBuilder(features)
    for column in sorted(feature_columns, key=lambda x: x.name):
      with ops.name_scope(None, default_name=column.name):
        outputs[column] = builder.get(column)
  return outputs


@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export(v1=['feature_column.make_parse_example_spec'])
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def make_parse_example_spec(feature_columns):
  """Creates a parsing spec dictionary from the input feature_columns.

  The returned dictionary can be used as the arg 'features' in
  `tf.io.parse_example`.

  Typical usage example:

  ```python
  # Define features and transformations
  feature_a = categorical_column_with_vocabulary_file(...)
  feature_b = numeric_column(...)
  feature_c_bucketized = bucketized_column(numeric_column("feature_c"), ...)
  feature_a_x_feature_c = crossed_column(
      columns=["feature_a", feature_c_bucketized], ...)

  feature_columns = set(
      [feature_b, feature_c_bucketized, feature_a_x_feature_c])
  features = tf.io.parse_example(
      serialized=serialized_examples,
      features=make_parse_example_spec(feature_columns))
  ```

  For the above example, make_parse_example_spec would return the dict:

  ```python
  {
      "feature_a": parsing_ops.VarLenFeature(tf.string),
      "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32),
      "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32)
  }
  ```

  Args:
    feature_columns: An iterable containing all feature columns. All items
      should be instances of classes derived from `_FeatureColumn`.

  Returns:
    A dict mapping each feature key to a `FixedLenFeature` or `VarLenFeature`
    value.

  Raises:
    ValueError: If any of the given `feature_columns` is not a
      `_FeatureColumn` instance.
  """
  result = {}
  for column in feature_columns:
    if not isinstance(column, _FeatureColumn):
      raise ValueError('All feature_columns must be _FeatureColumn instances. '
                       'Given: {}'.format(column))
    config = column._parse_example_spec  # pylint: disable=protected-access
    for key, value in six.iteritems(config):
      if key in result and value != result[key]:
        raise ValueError('feature_columns contain different parse_spec for '
                         'key {}. Given {} and {}'.format(
                             key, value, result[key]))
    result.update(config)
  return result
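
# An illustrative note on the conflict check above (a hedged sketch): two
# columns that share a key must imply the same parse spec. For example, a
# hypothetical `numeric_column('x')` (a FixedLenFeature) combined with
# `categorical_column_with_hash_bucket('x', 10)` (a VarLenFeature) would
# trigger the ValueError above for key 'x'.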


def _embedding_column(categorical_column,
                      dimension,
                      combiner='mean',
                      initializer=None,
                      ckpt_to_load_from=None,
                      tensor_name_in_ckpt=None,
                      max_norm=None,
                      trainable=True,
                      use_safe_embedding_lookup=True):
  """`_DenseColumn` that converts from sparse, categorical input.

  Use this when your inputs are sparse, but you want to convert them to a
  dense representation (e.g., to feed to a DNN).

  Inputs must be a `_CategoricalColumn` created by any of the
  `categorical_column_*` functions. Here is an example of using
  `embedding_column` with `DNNClassifier`:

  ```python
  video_id = categorical_column_with_identity(
      key='video_id', num_buckets=1000000, default_value=0)
  columns = [embedding_column(video_id, 9),...]

  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)

  label_column = ...
  def input_fn():
    features = tf.io.parse_example(
        ..., features=make_parse_example_spec(columns + [label_column]))
    labels = features.pop(label_column.name)
    return features, labels

  estimator.train(input_fn=input_fn, steps=100)
  ```

  Here is an example of using `embedding_column` with a model_fn:

  ```python
  def model_fn(features, ...):
    video_id = categorical_column_with_identity(
        key='video_id', num_buckets=1000000, default_value=0)
    columns = [embedding_column(video_id, 9),...]
    dense_tensor = input_layer(features, columns)
    # Form DNN layers, calculate loss, and return EstimatorSpec.
    ...
  ```

  Args:
    categorical_column: A `_CategoricalColumn` created by a
      `categorical_column_with_*` function. This column produces the sparse
      IDs that are inputs to the embedding lookup.
    dimension: An integer specifying the dimension of the embedding; must
      be > 0.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an
      example-level normalization on the column. For more information, see
      `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `tf.compat.v1.truncated_normal_initializer` with mean `0.0` and standard
      deviation `1/sqrt(dimension)`.
    ckpt_to_load_from: String representing checkpoint name/pattern from which
      to restore column weights. Required if `tensor_name_in_ckpt` is not
      `None`.
    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
      which to restore the column weights. Required if `ckpt_to_load_from` is
      not `None`.
    max_norm: If not `None`, embedding values are l2-normalized to this value.
    trainable: Whether or not the embedding is trainable. Default is True.
    use_safe_embedding_lookup: If true, uses safe_embedding_lookup_sparse
      instead of embedding_lookup_sparse. safe_embedding_lookup_sparse ensures
      there are no empty rows and that all weights and ids are positive, at
      the expense of extra compute cost. This only applies to rank 2 (NxM)
      shaped input tensors. Defaults to true; consider turning it off if the
      above checks are not needed. Note that having empty rows will not
      trigger an error, though the output for those rows may be 0 or omitted.

  Returns:
    `_DenseColumn` that converts from sparse input.

  Raises:
    ValueError: if `dimension` not > 0.
    ValueError: if exactly one of `ckpt_to_load_from` and
      `tensor_name_in_ckpt` is specified.
    ValueError: if `initializer` is specified and is not callable.
    RuntimeError: If eager execution is enabled.
  """
  if (dimension is None) or (dimension < 1):
    raise ValueError('Invalid dimension {}.'.format(dimension))
  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
    raise ValueError('Must specify both `ckpt_to_load_from` and '
                     '`tensor_name_in_ckpt` or none of them.')

  if (initializer is not None) and (not callable(initializer)):
    raise ValueError('initializer must be callable if specified. '
                     'Embedding of column_name: {}'.format(
                         categorical_column.name))
  if initializer is None:
    initializer = init_ops.truncated_normal_initializer(
        mean=0.0, stddev=1 / math.sqrt(dimension))

  embedding_shape = categorical_column._num_buckets, dimension  # pylint: disable=protected-access

  def _creator(weight_collections, scope):
    embedding_column_layer = _EmbeddingColumnLayer(
        embedding_shape=embedding_shape,
        initializer=initializer,
        weight_collections=weight_collections,
        trainable=trainable,
        name='embedding_column_layer')
    return embedding_column_layer(None, scope=scope)  # pylint: disable=not-callable

  return _EmbeddingColumn(
      categorical_column=categorical_column,
      dimension=dimension,
      combiner=combiner,
      layer_creator=_creator,
      ckpt_to_load_from=ckpt_to_load_from,
      tensor_name_in_ckpt=tensor_name_in_ckpt,
      max_norm=max_norm,
      trainable=trainable,
      use_safe_embedding_lookup=use_safe_embedding_lookup)
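
# A minimal usage sketch (illustrative, using the module-private helpers
# defined in this file): the default initializer above scales with the
# embedding size, e.g. for dimension=9 the stddev is 1/sqrt(9) = 1/3.
#
#   video_id = _categorical_column_with_identity(
#       key='video_id', num_buckets=1000000, default_value=0)
#   embedded_video_id = _embedding_column(video_id, dimension=9)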


def _numeric_column(key,
                    shape=(1,),
                    default_value=None,
                    dtype=dtypes.float32,
                    normalizer_fn=None):
  """Represents real-valued or numerical features.

  Example:

  ```python
  price = numeric_column('price')
  columns = [price, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)

  # or
  bucketized_price = bucketized_column(price, boundaries=[...])
  columns = [bucketized_price, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    shape: An iterable of integers specifying the shape of the `Tensor`. An
      integer can be given, which means a single-dimension `Tensor` with the
      given width. The `Tensor` representing the column will have the shape
      of [batch_size] + `shape`.
    default_value: A single value compatible with `dtype` or an iterable of
      values compatible with `dtype` which the column takes on during
      `tf.Example` parsing if data is missing. A default value of `None` will
      cause `tf.io.parse_example` to fail if an example does not contain this
      column. If a single value is provided, the same value will be applied
      as the default value for every item. If an iterable of values is
      provided, the shape of the `default_value` should be equal to the given
      `shape`.
    dtype: defines the type of values. Default value is `tf.float32`. Must be
      a non-quantized, real integer or floating point type.
    normalizer_fn: If not `None`, a function that can be used to normalize
      the value of the tensor after `default_value` is applied for parsing.
      The normalizer function takes the input `Tensor` as its argument, and
      returns the output `Tensor` (e.g. lambda x: (x - 3.0) / 4.2). Please
      note that even though the most common use case of this function is
      normalization, it can be used for any kind of TensorFlow
      transformation.

  Returns:
    A `_NumericColumn`.

  Raises:
    TypeError: if any dimension in shape is not an int
    ValueError: if any dimension in shape is not a positive integer
    TypeError: if `default_value` is an iterable but not compatible with
      `shape`
    TypeError: if `default_value` is not compatible with `dtype`.
    ValueError: if `dtype` is not convertible to `tf.float32`.
  """
  shape = _check_shape(shape, key)
  if not (dtype.is_integer or dtype.is_floating):
    raise ValueError('dtype must be convertible to float. '
                     'dtype: {}, key: {}'.format(dtype, key))
  default_value = fc_utils.check_default_value(shape, default_value, dtype,
                                               key)

  if normalizer_fn is not None and not callable(normalizer_fn):
    raise TypeError(
        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))

  fc_utils.assert_key_is_string(key)
  return _NumericColumn(
      key,
      shape=shape,
      default_value=default_value,
      dtype=dtype,
      normalizer_fn=normalizer_fn)
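
# A minimal usage sketch (illustrative): a standardizing `normalizer_fn` for
# a 'price' feature. `PRICE_MEAN` and `PRICE_STD` are hypothetical
# precomputed statistics, not values defined in this module.
#
#   price = _numeric_column(
#       'price', normalizer_fn=lambda x: (x - PRICE_MEAN) / PRICE_STD)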


def _bucketized_column(source_column, boundaries):
  """Represents discretized dense input.

  Buckets include the left boundary, and exclude the right boundary. Namely,
  `boundaries=[0., 1., 2.]` generates buckets `(-inf, 0.)`, `[0., 1.)`,
  `[1., 2.)`, and `[2., +inf)`.

  For example, if the inputs are

  ```python
  boundaries = [0, 10, 100]
  input tensor = [[-5, 10000],
                  [150, 10],
                  [5, 100]]
  ```

  then the output will be

  ```python
  output = [[0, 3],
            [3, 2],
            [1, 3]]
  ```

  Example:

  ```python
  price = numeric_column('price')
  bucketized_price = bucketized_column(price, boundaries=[...])
  columns = [bucketized_price, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)

  # or
  columns = [bucketized_price, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  A `bucketized_column` can also be crossed with another categorical column
  using `crossed_column`:

  ```python
  price = numeric_column('price')
  # bucketized_column converts a numerical feature into a categorical one.
  bucketized_price = bucketized_column(price, boundaries=[...])
  # 'keywords' is a string feature.
  price_x_keywords = crossed_column([bucketized_price, 'keywords'], 50000)
  columns = [price_x_keywords, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Args:
    source_column: A one-dimensional dense column which is generated with
      `numeric_column`.
    boundaries: A sorted list or tuple of floats specifying the boundaries.

  Returns:
    A `_BucketizedColumn`.

  Raises:
    ValueError: If `source_column` is not a numeric column, or if it is not
      one-dimensional.
    ValueError: If `boundaries` is not a sorted list or tuple.
  """
  if not isinstance(source_column, _NumericColumn):
    raise ValueError(
        'source_column must be a column generated with numeric_column(). '
        'Given: {}'.format(source_column))
  if len(source_column.shape) > 1:
    raise ValueError('source_column must be one-dimensional column. '
                     'Given: {}'.format(source_column))
  if (not boundaries or
      not (isinstance(boundaries, list) or isinstance(boundaries, tuple))):
    raise ValueError('boundaries must be a sorted list.')
  for i in range(len(boundaries) - 1):
    if boundaries[i] >= boundaries[i + 1]:
      raise ValueError('boundaries must be a sorted list.')
  return _BucketizedColumn(source_column, tuple(boundaries))


def _categorical_column_with_hash_bucket(key,
                                         hash_bucket_size,
                                         dtype=dtypes.string):
  """Represents a sparse feature where ids are set by hashing.

  Use this when your sparse features are in string or integer format, and you
  want to distribute your inputs into a finite number of buckets by hashing:
  `output_id = Hash(input_feature_string) % bucket_size` for string-type
  input. For int-type input, the value is first converted to its string
  representation and then hashed by the same formula.
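
  For illustration (the exact bucket depends on the underlying fingerprint
  function, so this is a hedged sketch rather than a guaranteed value): with
  `hash_bucket_size=10`, an input string 'sports' is deterministically mapped
  to some fixed ID in `[0, 10)` on every call.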


  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for
  int and `''` for string, which will be dropped by this feature column.

  Example:

  ```python
  keywords = categorical_column_with_hash_bucket("keywords", 10000)
  columns = [keywords, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)

  # or
  keywords_embedded = embedding_column(keywords, 16)
  columns = [keywords_embedded, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    hash_bucket_size: An int > 1. The number of buckets.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `_HashedCategoricalColumn`.

  Raises:
    ValueError: `hash_bucket_size` is not greater than 1.
    ValueError: `dtype` is neither string nor integer.
  """
  if hash_bucket_size is None:
    raise ValueError('hash_bucket_size must be set. key: {}'.format(key))

  if hash_bucket_size < 1:
    raise ValueError('hash_bucket_size must be at least 1. '
                     'hash_bucket_size: {}, key: {}'.format(
                         hash_bucket_size, key))

  fc_utils.assert_key_is_string(key)
  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))

  return _HashedCategoricalColumn(key, hash_bucket_size, dtype)


def _categorical_column_with_vocabulary_file(key,
                                             vocabulary_file,
                                             vocabulary_size=None,
                                             num_oov_buckets=0,
                                             default_value=None,
                                             dtype=dtypes.string):
  """A `_CategoricalColumn` with a vocabulary file.

  Use this when your inputs are in string or integer format, and you have a
  vocabulary file that maps each value to an integer ID. By default,
  out-of-vocabulary values are ignored. Use either (but not both) of
  `num_oov_buckets` and `default_value` to specify how to include
  out-of-vocabulary values.

  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for
  int and `''` for string, which will be dropped by this feature column.

  Example with `num_oov_buckets`:
  File '/us/states.txt' contains 50 lines, each with a 2-character U.S. state
  abbreviation. All inputs with values in that file are assigned an ID 0-49,
  corresponding to their line numbers. All other values are hashed and
  assigned an ID 50-54.

  ```python
  states = categorical_column_with_vocabulary_file(
      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
      num_oov_buckets=5)
  columns = [states, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Example with `default_value`:
  File '/us/states.txt' contains 51 lines - the first line is 'XX', and the
  other 50 each have a 2-character U.S. state abbreviation. Both a literal
  'XX' in input, and other values missing from the file, will be assigned
  ID 0. All others are assigned the corresponding line number 1-50.

  ```python
  states = categorical_column_with_vocabulary_file(
      key='states', vocabulary_file='/us/states.txt', vocabulary_size=51,
      default_value=0)
  columns = [states, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  And to make an embedding with either:

  ```python
  columns = [embedding_column(states, 3),...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    vocabulary_file: The vocabulary file name.
    vocabulary_size: Number of the elements in the vocabulary. This must be
      no greater than the length of `vocabulary_file`; if less, later values
      are ignored. If None, it is set to the length of `vocabulary_file`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
      `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
      the input value. A positive `num_oov_buckets` can not be specified with
      `default_value`.
    default_value: The integer ID value to return for out-of-vocabulary
      feature values, defaults to `-1`. This can not be specified with a
      positive `num_oov_buckets`.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `_CategoricalColumn` with a vocabulary file.

  Raises:
    ValueError: `vocabulary_file` is missing or cannot be opened.
    ValueError: `vocabulary_size` is missing or < 1.
    ValueError: `num_oov_buckets` is a negative integer.
    ValueError: `num_oov_buckets` and `default_value` are both specified.
    ValueError: `dtype` is neither string nor integer.
  """
  if not vocabulary_file:
    raise ValueError('Missing vocabulary_file in {}.'.format(key))

  if vocabulary_size is None:
    if not gfile.Exists(vocabulary_file):
      raise ValueError('vocabulary_file in {} does not exist.'.format(key))

    with gfile.GFile(vocabulary_file) as f:
      vocabulary_size = sum(1 for _ in f)
    logging.info(
        'vocabulary_size = %d in %s is inferred from the number of elements '
        'in the vocabulary_file %s.', vocabulary_size, key, vocabulary_file)

  # `vocabulary_size` isn't required for lookup, but it is for `_num_buckets`.
  if vocabulary_size < 1:
    raise ValueError('Invalid vocabulary_size in {}.'.format(key))
  if num_oov_buckets:
    if default_value is not None:
      raise ValueError(
          'Can\'t specify both num_oov_buckets and default_value in {}.'
          .format(key))
    if num_oov_buckets < 0:
      raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
          num_oov_buckets, key))
  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
  fc_utils.assert_key_is_string(key)
  return _VocabularyFileCategoricalColumn(
      key=key,
      vocabulary_file=vocabulary_file,
      vocabulary_size=vocabulary_size,
      num_oov_buckets=0 if num_oov_buckets is None else num_oov_buckets,
      default_value=-1 if default_value is None else default_value,
      dtype=dtype)


def _categorical_column_with_vocabulary_list(key,
                                             vocabulary_list,
                                             dtype=None,
                                             default_value=-1,
                                             num_oov_buckets=0):
  """A `_CategoricalColumn` with an in-memory vocabulary.

  Use this when your inputs are in string or integer format, and you have an
  in-memory vocabulary mapping each value to an integer ID. By default,
  out-of-vocabulary values are ignored. Use either (but not both) of
  `num_oov_buckets` and `default_value` to specify how to include
  out-of-vocabulary values.

  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for
  int and `''` for string, which will be dropped by this feature column.

  Example with `num_oov_buckets`:
  In the following example, each input in `vocabulary_list` is assigned an ID
  0-3 corresponding to its index (e.g., input 'B' produces output 2). All
  other inputs are hashed and assigned an ID 4-5.

  ```python
  colors = categorical_column_with_vocabulary_list(
      key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
      num_oov_buckets=2)
  columns = [colors, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Example with `default_value`:
  In the following example, each input in `vocabulary_list` is assigned an ID
  0-4 corresponding to its index (e.g., input 'B' produces output 3). All
  other inputs are assigned `default_value` 0.

  ```python
  colors = categorical_column_with_vocabulary_list(
      key='colors', vocabulary_list=('X', 'R', 'G', 'B', 'Y'), default_value=0)
  columns = [colors, ...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  And to make an embedding with either:

  ```python
  columns = [embedding_column(colors, 3),...]
  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    vocabulary_list: An ordered iterable defining the vocabulary. Each
      feature is mapped to the index of its value (if present) in
      `vocabulary_list`. Must be castable to `dtype`.
    dtype: The type of features. Only string and integer types are supported.
      If `None`, it will be inferred from `vocabulary_list`.
    default_value: The integer ID value to return for out-of-vocabulary
      feature values, defaults to `-1`. This can not be specified with a
      positive `num_oov_buckets`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
      `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on
      a hash of the input value. A positive `num_oov_buckets` can not be
      specified with `default_value`.

  Returns:
    A `_CategoricalColumn` with in-memory vocabulary.

  Raises:
    ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
    ValueError: `num_oov_buckets` is a negative integer.
    ValueError: `num_oov_buckets` and `default_value` are both specified.
    ValueError: if `dtype` is not integer or string.
  """
  if (vocabulary_list is None) or (len(vocabulary_list) < 1):
    raise ValueError(
        'vocabulary_list {} must be non-empty, column_name: {}'.format(
            vocabulary_list, key))
  if len(set(vocabulary_list)) != len(vocabulary_list):
    raise ValueError(
        'Duplicate keys in vocabulary_list {}, column_name: {}'.format(
            vocabulary_list, key))
  vocabulary_dtype = dtypes.as_dtype(np.array(vocabulary_list).dtype)
  if num_oov_buckets:
    if default_value != -1:
      raise ValueError(
          'Can\'t specify both num_oov_buckets and default_value in {}.'
          .format(key))
    if num_oov_buckets < 0:
      raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
          num_oov_buckets, key))
  fc_utils.assert_string_or_int(
      vocabulary_dtype, prefix='column_name: {} vocabulary'.format(key))
  if dtype is None:
    dtype = vocabulary_dtype
  elif dtype.is_integer != vocabulary_dtype.is_integer:
    raise ValueError(
        'dtype {} and vocabulary dtype {} do not match, column_name: '
        '{}'.format(dtype, vocabulary_dtype, key))
  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
  fc_utils.assert_key_is_string(key)

  return _VocabularyListCategoricalColumn(
      key=key,
      vocabulary_list=tuple(vocabulary_list),
      dtype=dtype,
      default_value=default_value,
      num_oov_buckets=num_oov_buckets)
1407 

1408 

1409def _categorical_column_with_identity(key, num_buckets, default_value=None): 

1410 """A `_CategoricalColumn` that returns identity values. 

1411 

1412 Use this when your inputs are integers in the range `[0, num_buckets)`, and 

1413 you want to use the input value itself as the categorical ID. Values outside 

1414 this range will result in `default_value` if specified, otherwise it will 

1415 fail. 

1416 

1417 Typically, this is used for contiguous ranges of integer indexes, but 

1418 it doesn't have to be. This might be inefficient, however, if many IDs 

1419 are unused. Consider `categorical_column_with_hash_bucket` in that case. 

1420 

1421 For input dictionary `features`, `features[key]` is either `Tensor` or 

1422 `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int 

1423 and `''` for string, which will be dropped by this feature column. 

1424 

1425 In the following examples, each input in the range `[0, 1000000)` is assigned 

1426 an ID equal to its own value. All other inputs are assigned `default_value` 0. 

1427 Note that a literal 0 in the inputs is indistinguishable from the default ID. 

1428 

1429 Linear model: 

1430 

1431 ```python 

1432 video_id = categorical_column_with_identity( 

1433 key='video_id', num_buckets=1000000, default_value=0) 

1434 columns = [video_id, ...] 

1435 features = tf.io.parse_example(..., features=make_parse_example_spec(columns)) 

1436 linear_prediction, _, _ = linear_model(features, columns) 

1437 ``` 

1438 

1439 Embedding for a DNN model: 

1440 

1441 ```python 

1442 columns = [embedding_column(video_id, 9),...] 

1443 features = tf.io.parse_example(..., features=make_parse_example_spec(columns)) 

1444 dense_tensor = input_layer(features, columns) 

1445 ``` 

1446 

1447 Args: 

1448 key: A unique string identifying the input feature. It is used as the column 

1449 name and the dictionary key for feature parsing configs, feature `Tensor` 

1450 objects, and feature columns. 

1451 num_buckets: Range of inputs and outputs is `[0, num_buckets)`. 

1452 default_value: If set, values outside of range `[0, num_buckets)` will be 

1453 replaced with this value. If not set, values >= num_buckets will cause a 

1454 failure while values < 0 will be dropped. 

1455 

1456 Returns: 

1457 A `_CategoricalColumn` that returns identity values. 

1458 

1459 Raises: 

1460 ValueError: if `num_buckets` is less than one. 

1461 ValueError: if `default_value` is not in range `[0, num_buckets)`. 

1462 """ 

1463 if num_buckets < 1: 

1464 raise ValueError('num_buckets {} < 1, column_name {}'.format( 

1465 num_buckets, key)) 

1466 if (default_value is not None) and ((default_value < 0) or 

1467 (default_value >= num_buckets)): 

1468 raise ValueError( 

1469 'default_value {} not in range [0, {}), column_name {}'.format( 

1470 default_value, num_buckets, key)) 

1471 fc_utils.assert_key_is_string(key) 

1472 return _IdentityCategoricalColumn( 

1473 key=key, num_buckets=num_buckets, default_value=default_value) 

1474 

1475 

1476def _indicator_column(categorical_column): 

1477 """Represents multi-hot representation of given categorical column. 

1478 

1479 - For DNN models, `indicator_column` can be used to wrap any 

1480 `categorical_column_*` (e.g., to feed to a DNN). Consider using 

1481 `embedding_column` if the number of buckets/unique values is large. 

1482 

1483 - For wide (aka linear) models, `indicator_column` is the internal 

1484 representation used when a categorical column is passed directly 

1485 (as any element in feature_columns) to `linear_model`. See 

1486 `linear_model` for details. 

1487 

1488 ```python 

1489 name = indicator_column(categorical_column_with_vocabulary_list( 

1490 'name', ['bob', 'george', 'wanda'])) 

1491 columns = [name, ...] 

1492 features = tf.io.parse_example(..., features=make_parse_example_spec(columns)) 

1493 dense_tensor = input_layer(features, columns) 

1494 

1495 dense_tensor == [[1, 0, 0]] # If "name" bytes_list is ["bob"] 

1496 dense_tensor == [[1, 0, 1]] # If "name" bytes_list is ["bob", "wanda"] 

1497 dense_tensor == [[2, 0, 0]] # If "name" bytes_list is ["bob", "bob"] 

1498 ``` 

1499 

1500 Args: 

1501 categorical_column: A `_CategoricalColumn` which is created by 

1502 `categorical_column_with_*` or `crossed_column` functions. 

1503 

1504 Returns: 

1505 An `_IndicatorColumn`. 

1506 """ 

1507 return _IndicatorColumn(categorical_column) 

1508 

1509 

1510def _weighted_categorical_column(categorical_column, 

1511 weight_feature_key, 

1512 dtype=dtypes.float32): 

1513 """Applies weight values to a `_CategoricalColumn`. 

1514 

1515 Use this when each of your sparse inputs has both an ID and a value. For 

1516 example, if you're representing text documents as a collection of word 

1517 frequencies, you can provide 2 parallel sparse input features ('terms' and 

1518 'frequencies' below). 

1519 

1520 Example: 

1521 

1522 Input `tf.Example` objects: 

1523 

1524 ```proto 

1525 [ 

1526 features { 

1527 feature { 

1528 key: "terms" 

1529 value {bytes_list {value: "very" value: "model"}} 

1530 } 

1531 feature { 

1532 key: "frequencies" 

1533 value {float_list {value: 0.3 value: 0.1}} 

1534 } 

1535 }, 

1536 features { 

1537 feature { 

1538 key: "terms" 

1539 value {bytes_list {value: "when" value: "course" value: "human"}} 

1540 } 

1541 feature { 

1542 key: "frequencies" 

1543 value {float_list {value: 0.4 value: 0.1 value: 0.2}} 

1544 } 

1545 } 

1546 ] 

1547 ``` 

1548 

1549 ```python 

1550 categorical_column = categorical_column_with_hash_bucket( 

1551 key='terms', hash_bucket_size=1000) 

1552 weighted_column = weighted_categorical_column( 

1553 categorical_column=categorical_column, weight_feature_key='frequencies') 

1554 columns = [weighted_column, ...] 

1555 features = tf.io.parse_example(..., features=make_parse_example_spec(columns)) 

1556 linear_prediction, _, _ = linear_model(features, columns) 

1557 ``` 

1558 

1559 This assumes the input dictionary contains a `SparseTensor` for key 

1560 'terms', and a `SparseTensor` for key 'frequencies'. These 2 tensors must have 

1561 the same indices and dense shape. 

1562 

1563 Args: 

1564 categorical_column: A `_CategoricalColumn` created by 

1565 `categorical_column_with_*` functions. 

1566 weight_feature_key: String key for weight values. 

1567 dtype: Type of weights, such as `tf.float32`. Only float and integer weights 

1568 are supported. 

1569 

1570 Returns: 

1571 A `_CategoricalColumn` composed of two sparse features: one represents the 

1572 ids, the other represents the weights (values) of the ids in each example. 

1573 

1574 Raises: 

1575 ValueError: if `dtype` is not convertible to float. 

1576 """ 

1577 if (dtype is None) or not (dtype.is_integer or dtype.is_floating): 

1578 raise ValueError('dtype {} is not convertible to float.'.format(dtype)) 

1579 return _WeightedCategoricalColumn( 

1580 categorical_column=categorical_column, 

1581 weight_feature_key=weight_feature_key, 

1582 dtype=dtype) 

1583 

1584 

1585def _crossed_column(keys, hash_bucket_size, hash_key=None): 

1586 """Returns a column for performing crosses of categorical features. 

1587 

1588 Crossed features are hashed according to `hash_bucket_size`. Conceptually, 

1589 the transformation can be thought of as: 

1590 Hash(cartesian product of features) % `hash_bucket_size` 

1591 

1592 For example, if the input features are: 

1593 

1594 * SparseTensor referred by first key: 

1595 

1596 ```python 

1597 shape = [2, 2] 

1598 { 

1599 [0, 0]: "a" 

1600 [1, 0]: "b" 

1601 [1, 1]: "c" 

1602 } 

1603 ``` 

1604 

1605 * SparseTensor referred by second key: 

1606 

1607 ```python 

1608 shape = [2, 1] 

1609 { 

1610 [0, 0]: "d" 

1611 [1, 0]: "e" 

1612 } 

1613 ``` 

1614 

1615 then crossed feature will look like: 

1616 

1617 ```python 

1618 shape = [2, 2] 

1619 { 

1620 [0, 0]: Hash64("d", Hash64("a")) % hash_bucket_size 

1621 [1, 0]: Hash64("e", Hash64("b")) % hash_bucket_size 

1622 [1, 1]: Hash64("e", Hash64("c")) % hash_bucket_size 

1623 } 

1624 ``` 

1625 

1626 Here is an example to create a linear model with crosses of string features: 

1627 

1628 ```python 

1629 keywords_x_doc_terms = crossed_column(['keywords', 'doc_terms'], 50000) 

1630 columns = [keywords_x_doc_terms, ...] 

1631 features = tf.io.parse_example(..., features=make_parse_example_spec(columns)) 

1632 linear_prediction = linear_model(features, columns) 

1633 ``` 

1634 

1635 You could also use vocabulary lookup before crossing: 

1636 

1637 ```python 

1638 keywords = categorical_column_with_vocabulary_file( 

1639 'keywords', '/path/to/vocabulary/file', vocabulary_size=1000) 

1640 keywords_x_doc_terms = crossed_column([keywords, 'doc_terms'], 50000) 

1641 columns = [keywords_x_doc_terms, ...] 

1642 features = tf.io.parse_example(..., features=make_parse_example_spec(columns)) 

1643 linear_prediction = linear_model(features, columns) 

1644 ``` 

1645 

1646 If an input feature is of numeric type, you can use 

1647 `categorical_column_with_identity`, or `bucketized_column`, as in the example: 

1648 

1649 ```python 

1650 # vertical_id is an integer categorical feature. 

1651 vertical_id = categorical_column_with_identity('vertical_id', 10000) 

1652 price = numeric_column('price') 

1653 # bucketized_column converts a numerical feature to a categorical one. 

1654 bucketized_price = bucketized_column(price, boundaries=[...]) 

1655 vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50000) 

1656 columns = [vertical_id_x_price, ...] 

1657 features = tf.io.parse_example(..., features=make_parse_example_spec(columns)) 

1658 linear_prediction = linear_model(features, columns) 

1659 ``` 

1660 

1661 To use a crossed column in a DNN model, you need to wrap it in an embedding 

1662 column, as in this example: 

1663 

1664 ```python 

1665 vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50000) 

1666 vertical_id_x_price_embedded = embedding_column(vertical_id_x_price, 10) 

1667 dense_tensor = input_layer(features, [vertical_id_x_price_embedded, ...]) 

1668 ``` 

1669 

1670 Args: 

1671 keys: An iterable identifying the features to be crossed. Each element can 

1672 be either: 

1673 * string: Uses the corresponding feature which must be of string type. 

1674 * `_CategoricalColumn`: Uses the transformed tensor produced by this 

1675 column. Does not support hashed categorical column. 

1676 hash_bucket_size: An int > 1. The number of buckets. 

1677 hash_key: Specify the hash_key that will be used by the `FingerprintCat64` 

1678 function to combine the fingerprints of the crossed values in `SparseCrossOp` (optional). 

1679 

1680 Returns: 

1681 A `_CrossedColumn`. 

1682 

1683 Raises: 

1684 ValueError: If `len(keys) < 2`. 

1685 ValueError: If any of the keys is neither a string nor `_CategoricalColumn`. 

1686 ValueError: If any of the keys is `_HashedCategoricalColumn`. 

1687 ValueError: If `hash_bucket_size < 1`. 

1688 """ 

1689 if not hash_bucket_size or hash_bucket_size < 1: 

1690 raise ValueError('hash_bucket_size must be at least 1. ' 

1691 'hash_bucket_size: {}'.format(hash_bucket_size)) 

1692 if not keys or len(keys) < 2: 

1693 raise ValueError( 

1694 'keys must be a list with length > 1. Given: {}'.format(keys)) 

1695 for key in keys: 

1696 if (not isinstance(key, six.string_types) and 

1697 not isinstance(key, _CategoricalColumn)): 

1698 raise ValueError( 

1699 'Unsupported key type. All keys must be either string, or ' 

1700 'categorical column except _HashedCategoricalColumn. ' 

1701 'Given: {}'.format(key)) 

1702 if isinstance(key, _HashedCategoricalColumn): 

1703 raise ValueError( 

1704 'categorical_column_with_hash_bucket is not supported for crossing. ' 

1705 'Hashing before crossing will increase probability of collision. ' 

1706 'Instead, use the feature name as a string. Given: {}'.format(key)) 

1707 return _CrossedColumn( 

1708 keys=tuple(keys), hash_bucket_size=hash_bucket_size, hash_key=hash_key) 

1709 

1710 

1711# TODO(rohanj): Clearly define semantics of this layer. 

1712class _EmbeddingColumnLayer(base.Layer): 

1713 """A layer that stores all the state required for a embedding column.""" 

1714 

1715 def __init__(self, 

1716 embedding_shape, 

1717 initializer, 

1718 weight_collections=None, 

1719 trainable=True, 

1720 name=None, 

1721 **kwargs): 

1722 """Constructor. 

1723 

1724 Args: 

1725 embedding_shape: Shape of the embedding variable used for lookup. 

1726 initializer: A variable initializer function to be used in embedding 

1727 variable initialization. 

1728 weight_collections: A list of collection names to which the Variable will 

1729 be added. Note that variables will also be added to the collections 

1730 `tf.GraphKeys.GLOBAL_VARIABLES` and `tf.GraphKeys.MODEL_VARIABLES`. 

1731 trainable: If `True` also add the variable to the graph collection 

1732 `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). 

1733 name: Name of the layer. 

1734 **kwargs: keyword named properties. 
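
Example (a minimal, illustrative sketch mirroring how the embedding column's 

layer creator uses this layer; `vocab_size` and `dimension` are assumed values): 

```python 

layer = _EmbeddingColumnLayer( 

    embedding_shape=(vocab_size, dimension), 

    initializer=init_ops.truncated_normal_initializer(stddev=1.0)) 

# Calling the layer builds the embedding variable (once) and returns it. 

embedding_weights = layer(None) 

``` 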

1735 """ 

1736 super(_EmbeddingColumnLayer, self).__init__( 

1737 trainable=trainable, name=name, **kwargs) 

1738 self._embedding_shape = embedding_shape 

1739 self._initializer = initializer 

1740 self._weight_collections = weight_collections 

1741 

1742 def set_weight_collections(self, weight_collections): 

1743 """Sets the weight collections for the layer. 

1744 

1745 Args: 

1746 weight_collections: A list of collection names to which the Variable will 

1747 be added. 

1748 """ 

1749 self._weight_collections = weight_collections 

1750 

1751 def build(self, _): 

1752 self._embedding_weight_var = self.add_variable( 

1753 name='embedding_weights', 

1754 shape=self._embedding_shape, 

1755 dtype=dtypes.float32, 

1756 initializer=self._initializer, 

1757 trainable=self.trainable) 

1758 if self._weight_collections and not context.executing_eagerly(): 

1759 _add_to_collections(self._embedding_weight_var, self._weight_collections) 

1760 self.built = True 

1761 

1762 def call(self, _): 

1763 return self._embedding_weight_var 

1764 

1765 

1766@six.add_metaclass(abc.ABCMeta) 

1767class _FeatureColumn(object): 

1768 """Represents a feature column abstraction. 

1769 

1770 WARNING: Do not subclass this layer unless you know what you are doing: 

1771 the API is subject to future changes. 

1772 

1773 To distinguish the concept of a feature family and a specific binary feature 

1774 within a family, we refer to a feature family like "country" as a feature 

1775 column. The following is an example feature in a `tf.Example` format: 

1776 {key: "country", value: [ "US" ]} 

1777 In this example the value of the feature is "US" and "country" refers to 

1778 the column of the feature. 

1779 

1780 This class is abstract. Users should not create instances of it. 

1781 """ 

1782 

1783 @abc.abstractproperty 

1784 def name(self): 

1785 """Returns string. Used for naming and for name_scope.""" 

1786 pass 

1787 

1788 def __lt__(self, other): 

1789 """Allows feature columns to be sorted in Python 3 as they are in Python 2. 

1790 

1791 Feature columns need to occasionally be sortable, for example when used as 

1792 keys in a features dictionary passed to a layer. 

1793 

1794 In CPython, `__lt__` must be defined for all objects in the 

1795 sequence being sorted. If any objects do not have an `__lt__` compatible 

1796 with feature column objects (such as strings), then CPython will fall back 

1797 to using the `__gt__` method below. 

1798 https://docs.python.org/3/library/stdtypes.html#list.sort 

1799 

1800 Args: 

1801 other: The other object to compare to. 

1802 

1803 Returns: 

1804 True if the string representation of this object is lexicographically less 

1805 than the string representation of `other`. For FeatureColumn objects, 

1806 this looks like "<__main__.FeatureColumn object at 0xa>". 

1807 """ 

1808 return str(self) < str(other) 

1809 

1810 def __gt__(self, other): 

1811 """Allows feature columns to be sorted in Python 3 as they are in Python 2. 

1812 

1813 Feature columns need to occasionally be sortable, for example when used as 

1814 keys in a features dictionary passed to a layer. 

1815 

1816 `__gt__` is called when the "other" object being compared during the sort 

1817 does not have `__lt__` defined. 

1818 Example: 

1819 ``` 

1820 # __lt__ only class 

1821 class A(): 

1822 def __lt__(self, other): return str(self) < str(other) 

1823 

1824 a = A() 

1825 a < "b" # True 

1826 "0" < a # Error 

1827 

1828 # __lt__ and __gt__ class 

1829 class B(): 

1830 def __lt__(self, other): return str(self) < str(other) 

1831 def __gt__(self, other): return str(self) > str(other) 

1832 

1833 b = B() 

1834 b < "c" # True 

1835 "0" < b # True 

1836 ``` 

1837 

1838 

1839 Args: 

1840 other: The other object to compare to. 

1841 

1842 Returns: 

1843 True if the string representation of this object is lexicographically 

1844 greater than the string representation of `other`. For FeatureColumn 

1845 objects, this looks like "<__main__.FeatureColumn object at 0xa>". 

1846 """ 

1847 return str(self) > str(other) 

1848 

1849 @property 

1850 def _var_scope_name(self): 

1851 """Returns string. Used for variable_scope. Defaults to self.name.""" 

1852 return self.name 

1853 

1854 @abc.abstractmethod 

1855 def _transform_feature(self, inputs): 

1856 """Returns intermediate representation (usually a `Tensor`). 

1857 

1858 Uses `inputs` to create an intermediate representation (usually a `Tensor`) 

1859 that other feature columns can use. 

1860 

1861 Example usage of `inputs`: 

1862 Let's say a Feature column depends on raw feature ('raw') and another 

1863 `_FeatureColumn` (input_fc). To access corresponding `Tensor`s, inputs will 

1864 be used as follows: 

1865 

1866 ```python 

1867 raw_tensor = inputs.get('raw') 

1868 fc_tensor = inputs.get(input_fc) 

1869 ``` 

1870 

1871 Args: 

1872 inputs: A `_LazyBuilder` object to access inputs. 

1873 

1874 Returns: 

1875 Transformed feature `Tensor`. 

1876 """ 

1877 pass 

1878 

1879 @abc.abstractproperty 

1880 def _parse_example_spec(self): 

1881 """Returns a `tf.Example` parsing spec as dict. 

1882 

1883 It is used as the parsing spec for `tf.io.parse_example`. The returned spec 

1884 is a dict from keys (`string`) to `VarLenFeature`, `FixedLenFeature`, and other 

1885 supported objects. Please check documentation of `tf.io.parse_example` for 

1886 all supported spec objects. 

1887 

1888 Let's say a Feature column depends on raw feature ('raw') and another 

1889 `_FeatureColumn` (input_fc). One possible implementation of 

1890 _parse_example_spec is as follows: 

1891 

1892 ```python 

1893 spec = {'raw': tf.io.FixedLenFeature(...)} 

1894 spec.update(input_fc._parse_example_spec) 

1895 return spec 

1896 ``` 

1897 """ 

1898 pass 

1899 

1900 def _reset_config(self): 

1901 """Resets the configuration in the column. 

1902 

1903 Some feature columns e.g. embedding or shared embedding columns might 

1904 have some state that is needed to be reset sometimes. Use this method 

1905 in that scenario. 

1906 """ 

1907 

1908 

1909class _DenseColumn(_FeatureColumn): 

1910 """Represents a column which can be represented as `Tensor`. 

1911 

1912 WARNING: Do not subclass this layer unless you know what you are doing: 

1913 the API is subject to future changes. 

1914 

1915 Some examples of this type are: numeric_column, embedding_column, 

1916 indicator_column. 

1917 """ 

1918 

1919 @abc.abstractproperty 

1920 def _variable_shape(self): 

1921 """`TensorShape` of `_get_dense_tensor`, without batch dimension.""" 

1922 pass 

1923 

1924 @abc.abstractmethod 

1925 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 

1926 """Returns a `Tensor`. 

1927 

1928 The output of this function will be used by model-builder-functions. For 

1929 example the pseudo code of `input_layer` will be like: 

1930 

1931 ```python 

1932 def input_layer(features, feature_columns, ...): 

1933 outputs = [fc._get_dense_tensor(...) for fc in feature_columns] 

1934 return tf.concat(outputs) 

1935 ``` 

1936 

1937 Args: 

1938 inputs: A `_LazyBuilder` object to access inputs. 

1939 weight_collections: List of graph collections to which Variables (if any 

1940 are created) are added. 

1941 trainable: If `True` also add variables to the graph collection 

1942 `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). 

1943 

1944 Returns: 

1945 `Tensor` of shape [batch_size] + `_variable_shape`. 

1946 """ 

1947 pass 

1948 

1949 

1950def _create_weighted_sum(column, 

1951 builder, 

1952 units, 

1953 sparse_combiner, 

1954 weight_collections, 

1955 trainable, 

1956 weight_var=None): 

1957 """Creates a weighted sum for a dense/categorical column for linear_model.""" 

1958 if isinstance(column, _CategoricalColumn): 

1959 return _create_categorical_column_weighted_sum( 

1960 column=column, 

1961 builder=builder, 

1962 units=units, 

1963 sparse_combiner=sparse_combiner, 

1964 weight_collections=weight_collections, 

1965 trainable=trainable, 

1966 weight_var=weight_var) 

1967 else: 

1968 return _create_dense_column_weighted_sum( 

1969 column=column, 

1970 builder=builder, 

1971 units=units, 

1972 weight_collections=weight_collections, 

1973 trainable=trainable, 

1974 weight_var=weight_var) 

1975 

1976 

1977def _create_dense_column_weighted_sum(column, 

1978 builder, 

1979 units, 

1980 weight_collections, 

1981 trainable, 

1982 weight_var=None): 

1983 """Create a weighted sum of a dense column for linear_model.""" 

1984 tensor = column._get_dense_tensor( # pylint: disable=protected-access 

1985 builder, 

1986 weight_collections=weight_collections, 

1987 trainable=trainable) 

1988 num_elements = column._variable_shape.num_elements() # pylint: disable=protected-access 

1989 batch_size = array_ops.shape(tensor)[0] 

1990 tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements)) 

1991 if weight_var is not None: 

1992 weight = weight_var 

1993 else: 

1994 weight = variable_scope.get_variable( 

1995 name='weights', 

1996 shape=[num_elements, units], 

1997 initializer=init_ops.zeros_initializer(), 

1998 trainable=trainable, 

1999 collections=weight_collections) 

2000 return math_ops.matmul(tensor, weight, name='weighted_sum') 

2001 

2002 

2003class _CategoricalColumn(_FeatureColumn): 

2004 """Represents a categorical feature. 

2005 

2006 WARNING: Do not subclass this layer unless you know what you are doing: 

2007 the API is subject to future changes. 

2008 

2009 A categorical feature is typically handled with a `tf.sparse.SparseTensor` of 

2010 IDs. 

2011 """ 

2012 

2013 IdWeightPair = collections.namedtuple( # pylint: disable=invalid-name 

2014 'IdWeightPair', ['id_tensor', 'weight_tensor']) 

2015 

2016 @abc.abstractproperty 

2017 def _num_buckets(self): 

2018 """Returns number of buckets in this sparse feature.""" 

2019 pass 

2020 

2021 @abc.abstractmethod 

2022 def _get_sparse_tensors(self, 

2023 inputs, 

2024 weight_collections=None, 

2025 trainable=None): 

2026 """Returns an IdWeightPair. 

2027 

2028 `IdWeightPair` is a pair of `SparseTensor`s which represents ids and 

2029 weights. 

2030 

2031 `IdWeightPair.id_tensor` is typically a `batch_size` x `num_buckets` 

2032 `SparseTensor` of `int64`. `IdWeightPair.weight_tensor` is either a 

2033 `SparseTensor` of `float` or `None` to indicate all weights should be 

2034 taken to be 1. If specified, `weight_tensor` must have exactly the same 

2035 shape and indices as `id_tensor`. The expected `SparseTensor` is the same 

2036 as the parsing output of a `VarLenFeature`, which is a ragged matrix. 
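
A minimal illustration (values assumed): for a batch of two examples whose 

ids are [3] and [1, 2], `id_tensor` is a `SparseTensor` with values 

[3, 1, 2], indices [[0, 0], [1, 0], [1, 1]], and dense_shape [2, 2]. 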

2037 

2038 Args: 

2039 inputs: A `LazyBuilder` as a cache to get input tensors required to create 

2040 `IdWeightPair`. 

2041 weight_collections: List of graph collections to which variables (if any 

2042 are created) are added. 

2043 trainable: If `True` also add variables to the graph collection 

2044 `GraphKeys.TRAINABLE_VARIABLES` (see `tf.compat.v1.get_variable`). 

2045 """ 

2046 pass 

2047 

2048 

2049def _create_categorical_column_weighted_sum(column, 

2050 builder, 

2051 units, 

2052 sparse_combiner, 

2053 weight_collections, 

2054 trainable, 

2055 weight_var=None): 

2056 # pylint: disable=g-doc-return-or-yield,g-doc-args 

2057 """Create a weighted sum of a categorical column for linear_model. 

2058 

2059 Note to maintainers: as an implementation detail, the weighted sum is 

2060 implemented via embedding_lookup_sparse for efficiency. Mathematically, 

2061 the two are the same. 

2062 

2063 Conceptually, a categorical column can be treated as a multi-hot 

2064 vector. Say: 

2065 

2066 ```python 

2067 x = [0 0 1] # categorical column input 

2068 w = [a b c] # weights 

2069 ``` 

2070 The weighted sum is `c` in this case, which is the same as `w[2]`. 

2071 

2072 Another example is 

2073 

2074 ```python 

2075 x = [0 1 1] # categorical column input 

2076 w = [a b c] # weights 

2077 ``` 

2078 The weighted sum is `b + c` in this case, which is the same as `w[1] + w[2]`. 

2079 

2080 For both cases, we can implement the weighted sum via embedding_lookup with 

2081 sparse_combiner = "sum". 

2082 """ 

2083 

2084 sparse_tensors = column._get_sparse_tensors( # pylint: disable=protected-access 

2085 builder, 

2086 weight_collections=weight_collections, 

2087 trainable=trainable) 

2088 id_tensor = sparse_ops.sparse_reshape( 

2089 sparse_tensors.id_tensor, 

2090 [array_ops.shape(sparse_tensors.id_tensor)[0], -1]) 
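# Flatten the ids to rank 2 ([batch_size, -1]) so that the embedding lookup 

# below treats each row as one example. 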

2091 weight_tensor = sparse_tensors.weight_tensor 

2092 if weight_tensor is not None: 

2093 weight_tensor = sparse_ops.sparse_reshape( 

2094 weight_tensor, [array_ops.shape(weight_tensor)[0], -1]) 

2095 

2096 if weight_var is not None: 

2097 weight = weight_var 

2098 else: 

2099 weight = variable_scope.get_variable( 

2100 name='weights', 

2101 shape=(column._num_buckets, units), # pylint: disable=protected-access 

2102 initializer=init_ops.zeros_initializer(), 

2103 trainable=trainable, 

2104 collections=weight_collections) 

2105 return embedding_ops.safe_embedding_lookup_sparse( 

2106 weight, 

2107 id_tensor, 

2108 sparse_weights=weight_tensor, 

2109 combiner=sparse_combiner, 

2110 name='weighted_sum') 

2111 

2112 

2113class _SequenceDenseColumn(_FeatureColumn): 

2114 """Represents dense sequence data.""" 

2115 

2116 TensorSequenceLengthPair = collections.namedtuple( # pylint: disable=invalid-name 

2117 'TensorSequenceLengthPair', ['dense_tensor', 'sequence_length']) 

2118 

2119 @abc.abstractmethod 

2120 def _get_sequence_dense_tensor(self, 

2121 inputs, 

2122 weight_collections=None, 

2123 trainable=None): 

2124 """Returns a `TensorSequenceLengthPair`.""" 

2125 pass 

2126 

2127 

2128class _LazyBuilder(object): 

2129 """Handles caching of transformations while building the model. 

2130 

2131 `_FeatureColumn` specifies how to digest an input column to the network. Some 

2132 feature columns require data transformations. This class caches those 

2133 transformations. 

2134 

2135 Some features may be used in more than one place. For example, one can use a 

2136 bucketized feature by itself and a cross with it. In that case we 

2137 should create only one bucketization op instead of creating ops for each 

2138 feature column separately. To handle re-use of transformed columns, 

2139 `_LazyBuilder` caches all previously transformed columns. 

2140 

2141 Example: 

2142 We're trying to use the following `_FeatureColumn`s: 

2143 

2144 ```python 

2145 bucketized_age = fc.bucketized_column(fc.numeric_column("age"), ...) 

2146 keywords = fc.categorical_column_with_hash_bucket("keywords", ...) 

2147 age_X_keywords = fc.crossed_column([bucketized_age, "keywords"], ...) 

2148 ... = linear_model(features, 

2149 [bucketized_age, keywords, age_X_keywords]) 

2150 ``` 

2151 

2152 If we transform each column independently, then we'll get duplication of 

2153 bucketization (one for the cross, one for the bucketized column itself). 

2154 The `_LazyBuilder` eliminates this duplication. 

2155 """ 

2156 

2157 def __init__(self, features): 

2158 """Creates a `_LazyBuilder`. 

2159 

2160 Args: 

2161 features: A mapping from feature column to objects that are `Tensor` or 

2162 `SparseTensor`, or can be converted to same via 

2163 `sparse_tensor.convert_to_tensor_or_sparse_tensor`. A `string` key 

2164 signifies a base feature (not-transformed). A `_FeatureColumn` key means 

2165 that this `Tensor` is the output of an existing `_FeatureColumn` which 

2166 can be reused. 

2167 """ 

2168 self._features = features.copy() 

2169 self._feature_tensors = {} 

2170 

2171 def get(self, key): 

2172 """Returns a `Tensor` for the given key. 

2173 

2174 A `str` key is used to access a base feature (not-transformed). When a 

2175 `_FeatureColumn` is passed, the transformed feature is returned if it 

2176 already exists, otherwise the given `_FeatureColumn` is asked to provide its 

2177 transformed output, which is then cached. 

2178 

2179 Args: 

2180 key: a `str` or a `_FeatureColumn`. 

2181 

2182 Returns: 

2183 The transformed `Tensor` corresponding to the `key`. 

2184 

2185 Raises: 

2186 ValueError: if key is not found or a transformed `Tensor` cannot be 

2187 computed. 
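
Example (a minimal sketch; assumes `bucketized_age` is a `_BucketizedColumn` 

built over the 'age' feature): 

```python 

builder = _LazyBuilder({'age': [[23.], [41.]]}) 

age_tensor = builder.get('age')          # raw feature lookup 

bucketized = builder.get(bucketized_age) # transformed output, then cached 

``` 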

2188 """ 

2189 if key in self._feature_tensors: 

2190 # FeatureColumn is already transformed or converted. 

2191 return self._feature_tensors[key] 

2192 

2193 if key in self._features: 

2194 feature_tensor = self._get_raw_feature_as_tensor(key) 

2195 self._feature_tensors[key] = feature_tensor 

2196 return feature_tensor 

2197 

2198 if isinstance(key, six.string_types): 

2199 raise ValueError('Feature {} is not in features dictionary.'.format(key)) 

2200 

2201 if not isinstance(key, _FeatureColumn): 

2202 raise TypeError('"key" must be either a "str" or "_FeatureColumn". ' 

2203 'Provided: {}'.format(key)) 

2204 

2205 column = key 

2206 logging.debug('Transforming feature_column %s.', column) 

2207 transformed = column._transform_feature(self) # pylint: disable=protected-access 

2208 if transformed is None: 

2209 raise ValueError('Column {} is not supported.'.format(column.name)) 

2210 self._feature_tensors[column] = transformed 

2211 return transformed 

2212 

2213 def _get_raw_feature_as_tensor(self, key): 

2214 """Gets the raw_feature (keyed by `key`) as `tensor`. 

2215 

2216 The raw feature is converted to a (sparse) tensor and its rank may be expanded. 

2217 

2218 For both `Tensor` and `SparseTensor`, the rank will be expanded (to 2) if 

2219 the rank is 1. Dynamic rank is also supported. A rank-0 raw feature is not 

2220 supported and will raise an error. 

2221 

2222 Args: 

2223 key: A `str` key to access the raw feature. 

2224 

2225 Returns: 

2226 A `Tensor` or `SparseTensor`. 

2227 

2228 Raises: 

2229 ValueError: if the raw feature has rank 0. 

2230 """ 

2231 raw_feature = self._features[key] 

2232 feature_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor( 

2233 raw_feature) 

2234 

2235 def expand_dims(input_tensor): 

2236 # Input_tensor must have rank 1. 

2237 if isinstance(input_tensor, sparse_tensor_lib.SparseTensor): 

2238 return sparse_ops.sparse_reshape(input_tensor, 

2239 [array_ops.shape(input_tensor)[0], 1]) 

2240 else: 

2241 return array_ops.expand_dims(input_tensor, -1) 
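# Illustrative effect: a `Tensor` of shape [batch] becomes [batch, 1]; a 

# `SparseTensor` with dense_shape [batch] becomes dense_shape [batch, 1]. 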

2242 

2243 rank = feature_tensor.get_shape().ndims 

2244 if rank is not None: 

2245 if rank == 0: 

2246 raise ValueError( 

2247 'Feature (key: {}) cannot have rank 0. Given: {}'.format( 

2248 key, feature_tensor)) 

2249 return feature_tensor if rank != 1 else expand_dims(feature_tensor) 

2250 

2251 # Handle dynamic rank. 

2252 with ops.control_dependencies([ 

2253 check_ops.assert_positive( 

2254 array_ops.rank(feature_tensor), 

2255 message='Feature (key: {}) cannot have rank 0. Given: {}'.format( 

2256 key, feature_tensor)) 

2257 ]): 

2258 return cond.cond( 

2259 math_ops.equal(1, array_ops.rank(feature_tensor)), 

2260 lambda: expand_dims(feature_tensor), lambda: feature_tensor) 

2261 

2262 

2263# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py 

2264def _shape_offsets(shape): 

2265 """Returns moving offset for each dimension given shape.""" 

2266 offsets = [] 

2267 for dim in reversed(shape): 

2268 if offsets: 

2269 offsets.append(dim * offsets[-1]) 

2270 else: 

2271 offsets.append(dim) 

2272 offsets.reverse() 

2273 return offsets 

2274 

2275 

2276# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py 

2277def _to_sparse_input_and_drop_ignore_values(input_tensor, ignore_value=None): 

2278 """Converts a `Tensor` to a `SparseTensor`, dropping ignore_value cells. 

2279 

2280 If `input_tensor` is already a `SparseTensor`, just return it. 

2281 

2282 Args: 

2283 input_tensor: A string or integer `Tensor`. 

2284 ignore_value: Entries in `dense_tensor` equal to this value will be absent 

2285 from the resulting `SparseTensor`. If `None`, default value of 

2286 `dense_tensor`'s dtype will be used ('' for `str`, -1 for `int`). 

2287 

2288 Returns: 

2289 A `SparseTensor` with the same shape as `input_tensor`. 

2290 

2291 Raises: 

2292 ValueError: when `input_tensor`'s rank is `None`. 
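
Example (a minimal sketch with assumed values): 

```python 

dense = [['a', ''], ['', 'b']] 

sparse = _to_sparse_input_and_drop_ignore_values(dense) 

# sparse.indices == [[0, 0], [1, 1]]; sparse.values == [b'a', b'b'] 

``` 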

2293 """ 

2294 input_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor( 

2295 input_tensor) 

2296 if isinstance(input_tensor, sparse_tensor_lib.SparseTensor): 

2297 return input_tensor 

2298 with ops.name_scope(None, 'to_sparse_input', ( 

2299 input_tensor, 

2300 ignore_value, 

2301 )): 

2302 if ignore_value is None: 

2303 if input_tensor.dtype == dtypes.string: 

2304 # Strings are special-cased because TF strings are converted to numpy objects by default. 

2305 ignore_value = '' 

2306 elif input_tensor.dtype.is_integer: 

2307 ignore_value = -1 # -1 has a special meaning of missing feature 

2308 else: 

2309 # NOTE: `as_numpy_dtype` is a property, so with the parentheses this is 

2310 # constructing a new numpy object of the given type, which yields the 

2311 # default value for that type. 

2312 ignore_value = input_tensor.dtype.as_numpy_dtype() 

2313 ignore_value = math_ops.cast( 

2314 ignore_value, input_tensor.dtype, name='ignore_value') 

2315 indices = array_ops.where( 

2316 math_ops.not_equal(input_tensor, ignore_value), name='indices') 

2317 return sparse_tensor_lib.SparseTensor( 

2318 indices=indices, 

2319 values=array_ops.gather_nd(input_tensor, indices, name='values'), 

2320 dense_shape=array_ops.shape( 

2321 input_tensor, out_type=dtypes.int64, name='dense_shape')) 

2322 

2323 

2324def _normalize_feature_columns(feature_columns): 

2325 """Normalizes the `feature_columns` input. 

2326 

2327 This method converts `feature_columns` to a list as best it can. In 

2328 addition, it verifies the type and other properties of `feature_columns` 

2329 required by downstream libraries. 

2330 

2331 Args: 

2332 feature_columns: The raw feature columns, usually passed by users. 

2333 

2334 Returns: 

2335 The normalized feature column list. 

2336 

2337 Raises: 

2338 ValueError: for any invalid inputs, such as empty, duplicated names, etc. 
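
Example (illustrative; uses the public column constructors for brevity): 

```python 

age = numeric_column('age') 

dept = categorical_column_with_hash_bucket('dept', hash_bucket_size=100) 

_normalize_feature_columns(age)          # -> [age] 

_normalize_feature_columns([age, dept])  # -> [age, dept] 

_normalize_feature_columns([age, age])   # raises ValueError (duplicate name) 

``` 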

2339 """ 

2340 if isinstance(feature_columns, _FeatureColumn): 

2341 feature_columns = [feature_columns] 

2342 

2343 if isinstance(feature_columns, collections_abc.Iterator): 

2344 feature_columns = list(feature_columns) 

2345 

2346 if isinstance(feature_columns, dict): 

2347 raise ValueError('Expected feature_columns to be iterable, found dict.') 

2348 

2349 for column in feature_columns: 

2350 if not isinstance(column, _FeatureColumn): 

2351 raise ValueError('Items of feature_columns must be a _FeatureColumn. ' 

2352 'Given (type {}): {}.'.format(type(column), column)) 

2353 if not feature_columns: 

2354 raise ValueError('feature_columns must not be empty.') 

2355 name_to_column = {} 

2356 for column in feature_columns: 

2357 if column.name in name_to_column: 

2358 raise ValueError('Duplicate feature column name found for columns: {} ' 

2359 'and {}. This usually means that these columns refer to ' 

2360 'the same base feature. Either one must be discarded or a ' 

2361 'duplicated but renamed item must be inserted in ' 

2362 'the features dict.'.format(column, 

2363 name_to_column[column.name])) 

2364 name_to_column[column.name] = column 

2365 

2366 return feature_columns 

2367 

2368 

2369class _NumericColumn( 

2370 _DenseColumn, 

2371 collections.namedtuple( 

2372 '_NumericColumn', 

2373 ['key', 'shape', 'default_value', 'dtype', 'normalizer_fn'])): 

2374 """see `numeric_column`.""" 

2375 

2376 @property 

2377 def name(self): 

2378 return self.key 

2379 

2380 @property 

2381 def _parse_example_spec(self): 

2382 return { 

2383 self.key: 

2384 parsing_ops.FixedLenFeature(self.shape, self.dtype, 

2385 self.default_value) 

2386 } 

2387 

2388 def _transform_feature(self, inputs): 

2389 input_tensor = inputs.get(self.key) 

2390 if isinstance(input_tensor, sparse_tensor_lib.SparseTensor): 

2391 raise ValueError( 

2392 'The corresponding Tensor of numerical column must be a Tensor. ' 

2393 'SparseTensor is not supported. key: {}'.format(self.key)) 

2394 if self.normalizer_fn is not None: 

2395 input_tensor = self.normalizer_fn(input_tensor) 

2396 return math_ops.cast(input_tensor, dtypes.float32) 

2397 

2398 @property 

2399 def _variable_shape(self): 

2400 return tensor_shape.TensorShape(self.shape) 

2401 

2402 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 

2403 """Returns dense `Tensor` representing numeric feature. 

2404 

2405 Args: 

2406 inputs: A `_LazyBuilder` object to access inputs. 

2407 weight_collections: Unused `weight_collections` since no variables are 

2408 created in this function. 

2409 trainable: Unused `trainable` bool since no variables are created in this 

2410 function. 

2411 

2412 Returns: 

2413 Dense `Tensor` created within `_transform_feature`. 

2414 """ 

2415 # Do nothing with weight_collections and trainable since no variables are 

2416 # created in this function. 

2417 del weight_collections 

2418 del trainable 

2419 # The feature has already been transformed. Return the intermediate 

2420 # representation created by _transform_feature. 

2421 return inputs.get(self) 

2422 

2423 

2424class _BucketizedColumn(_DenseColumn, _CategoricalColumn, 

2425 collections.namedtuple('_BucketizedColumn', 

2426 ['source_column', 'boundaries']) 

2427 ): 

2428 """See `bucketized_column`.""" 

2429 

2430 @property 

2431 def name(self): 

2432 return '{}_bucketized'.format(self.source_column.name) 

2433 

2434 @property 

2435 def _parse_example_spec(self): 

2436 return self.source_column._parse_example_spec # pylint: disable=protected-access 

2437 

2438 def _transform_feature(self, inputs): 

2439 source_tensor = inputs.get(self.source_column) 

2440 return math_ops._bucketize( # pylint: disable=protected-access 

2441 source_tensor, 

2442 boundaries=self.boundaries) 

2443 

2444 @property 

2445 def _variable_shape(self): 

2446 return tensor_shape.TensorShape( 

2447 tuple(self.source_column.shape) + (len(self.boundaries) + 1,)) 

2448 

2449 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 

2450 del weight_collections 

2451 del trainable 

2452 input_tensor = inputs.get(self) 

2453 return array_ops.one_hot( 

2454 indices=math_ops.cast(input_tensor, dtypes.int64), 

2455 depth=len(self.boundaries) + 1, 

2456 on_value=1., 

2457 off_value=0.) 

2458 

2459 @property 

2460 def _num_buckets(self): 

2461 # By construction, source_column is always one-dimensional. 

2462 return (len(self.boundaries) + 1) * self.source_column.shape[0] 

2463 

2464 def _get_sparse_tensors(self, 

2465 inputs, 

2466 weight_collections=None, 

2467 trainable=None): 

2468 """Converts dense inputs to SparseTensor so downstream code can use it.""" 

2469 input_tensor = inputs.get(self) 

2470 batch_size = array_ops.shape(input_tensor)[0] 

2471 # By construction, source_column is always one-dimensional. 

2472 source_dimension = self.source_column.shape[0] 

2473 

2474 i1 = array_ops.reshape( 

2475 array_ops.tile( 

2476 array_ops.expand_dims(math_ops.range(0, batch_size), 1), 

2477 [1, source_dimension]), (-1,)) 

2478 i2 = array_ops.tile(math_ops.range(0, source_dimension), [batch_size]) 

2479 # Flatten the bucket indices and unique them across dimensions 

2480 # E.g. 2nd dimension indices will range from k to 2*k-1 with k buckets 
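# Illustrative example (assumed values): with 2 boundaries (3 buckets) and 

# source_dimension=2, input_tensor=[[0, 2]] yields 

# bucket_indices = [0 + 3*0, 2 + 3*1] = [0, 5]. 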

2481 bucket_indices = ( 

2482 array_ops.reshape(input_tensor, 

2483 (-1,)) + (len(self.boundaries) + 1) * i2) 

2484 

2485 indices = math_ops.cast( 

2486 array_ops.transpose(array_ops_stack.stack((i1, i2))), dtypes.int64) 

2487 dense_shape = math_ops.cast( 

2488 array_ops_stack.stack([batch_size, source_dimension]), dtypes.int64) 

2489 sparse_tensor = sparse_tensor_lib.SparseTensor( 

2490 indices=indices, values=bucket_indices, dense_shape=dense_shape) 

2491 return _CategoricalColumn.IdWeightPair(sparse_tensor, None) 

2492 

2493 

2494class _EmbeddingColumn( 

2495 _DenseColumn, _SequenceDenseColumn, 

2496 collections.namedtuple( 

2497 '_EmbeddingColumn', 

2498 ('categorical_column', 'dimension', 'combiner', 'layer_creator', 

2499 'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable', 

2500 'use_safe_embedding_lookup'))): 

2501 """See `embedding_column`.""" 

2502 

2503 def __new__(cls, 

2504 categorical_column, 

2505 dimension, 

2506 combiner, 

2507 layer_creator, 

2508 ckpt_to_load_from, 

2509 tensor_name_in_ckpt, 

2510 max_norm, 

2511 trainable, 

2512 use_safe_embedding_lookup=True): 

2513 return super(_EmbeddingColumn, cls).__new__( 

2514 cls, 

2515 categorical_column=categorical_column, 

2516 dimension=dimension, 

2517 combiner=combiner, 

2518 layer_creator=layer_creator, 

2519 ckpt_to_load_from=ckpt_to_load_from, 

2520 tensor_name_in_ckpt=tensor_name_in_ckpt, 

2521 max_norm=max_norm, 

2522 trainable=trainable, 

2523 use_safe_embedding_lookup=use_safe_embedding_lookup) 

2524 

2525 @property 

2526 def name(self): 

2527 if not hasattr(self, '_name'): 

2528 self._name = '{}_embedding'.format(self.categorical_column.name) 

2529 return self._name 

2530 

2531 @property 

2532 def _parse_example_spec(self): 

2533 return self.categorical_column._parse_example_spec # pylint: disable=protected-access 

2534 

2535 def _transform_feature(self, inputs): 

2536 return inputs.get(self.categorical_column) 

2537 

2538 @property 

2539 def _variable_shape(self): 

2540 if not hasattr(self, '_shape'): 

2541 self._shape = tensor_shape.TensorShape([self.dimension]) 

2542 return self._shape 

2543 

2544 def _get_dense_tensor_internal(self, 

2545 inputs, 

2546 weight_collections=None, 

2547 trainable=None): 

2548 """Private method that follows the signature of _get_dense_tensor.""" 

2549 # Get sparse IDs and weights. 

2550 sparse_tensors = self.categorical_column._get_sparse_tensors( # pylint: disable=protected-access 

2551 inputs, 

2552 weight_collections=weight_collections, 

2553 trainable=trainable) 

2554 sparse_ids = sparse_tensors.id_tensor 

2555 sparse_weights = sparse_tensors.weight_tensor 

2556 

2557 embedding_weights = self.layer_creator( 

2558 weight_collections=weight_collections, 

2559 scope=variable_scope.get_variable_scope()) 

2560 

2561 if self.ckpt_to_load_from is not None: 

2562 to_restore = embedding_weights 

2563 if isinstance(to_restore, variables.PartitionedVariable): 

2564 to_restore = to_restore._get_variable_list() # pylint: disable=protected-access 

2565 checkpoint_utils.init_from_checkpoint( 

2566 self.ckpt_to_load_from, {self.tensor_name_in_ckpt: to_restore}) 

2567 

2568 sparse_id_rank = tensor_shape.dimension_value( 

2569 sparse_ids.dense_shape.get_shape()[0]) 

2570 embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse 

2571 if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and 

2572 sparse_id_rank <= 2): 

2573 embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse_v2 

2574 # Return embedding lookup result. 

2575 return embedding_lookup_sparse( 

2576 embedding_weights, 

2577 sparse_ids, 

2578 sparse_weights, 

2579 combiner=self.combiner, 

2580 name='%s_weights' % self.name, 

2581 max_norm=self.max_norm) 

2582 

2583 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 

2584 if isinstance(self.categorical_column, _SequenceCategoricalColumn): 

2585 raise ValueError( 

2586 'In embedding_column: {}. ' 

2587 'categorical_column must not be of type _SequenceCategoricalColumn. ' 

2588 'Suggested fix A: If you wish to use input_layer, use a ' 

2589 'non-sequence categorical_column_with_*. ' 

2590 'Suggested fix B: If you wish to create sequence input, use ' 

2591 'sequence_input_layer instead of input_layer. ' 

2592 'Given (type {}): {}'.format(self.name, type(self.categorical_column), 

2593 self.categorical_column)) 

2594 return self._get_dense_tensor_internal( 

2595 inputs=inputs, 

2596 weight_collections=weight_collections, 

2597 trainable=trainable) 

2598 

2599 def _get_sequence_dense_tensor(self, 

2600 inputs, 

2601 weight_collections=None, 

2602 trainable=None): 

2603 if not isinstance(self.categorical_column, _SequenceCategoricalColumn): 

2604 raise ValueError( 

2605 'In embedding_column: {}. ' 

2606 'categorical_column must be of type _SequenceCategoricalColumn ' 

2607 'to use sequence_input_layer. ' 

2608 'Suggested fix: Use one of sequence_categorical_column_with_*. ' 

2609 'Given (type {}): {}'.format(self.name, type(self.categorical_column), 

2610 self.categorical_column)) 

2611 dense_tensor = self._get_dense_tensor_internal( # pylint: disable=protected-access 

2612 inputs=inputs, 

2613 weight_collections=weight_collections, 

2614 trainable=trainable) 

2615 

2616 sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access 

2617 sequence_length = fc_utils.sequence_length_from_sparse_tensor( 

2618 sparse_tensors.id_tensor) 

2619 return _SequenceDenseColumn.TensorSequenceLengthPair( 

2620 dense_tensor=dense_tensor, sequence_length=sequence_length) 

2621 

2622 

2623def _get_graph_for_variable(var): 

2624 if isinstance(var, variables.PartitionedVariable): 

2625 return list(var)[0].graph 

2626 else: 

2627 return var.graph 

2628 

2629 

2630class _SharedEmbeddingColumn( 

2631 _DenseColumn, _SequenceDenseColumn, 

2632 collections.namedtuple( 

2633 '_SharedEmbeddingColumn', 

2634 ('categorical_column', 'dimension', 'combiner', 'initializer', 

2635 'shared_embedding_collection_name', 'ckpt_to_load_from', 

2636 'tensor_name_in_ckpt', 'max_norm', 'trainable', 

2637 'use_safe_embedding_lookup'))): 

2638 """See `embedding_column`.""" 

2639 

2640 @property 

2641 def name(self): 

2642 if not hasattr(self, '_name'): 

2643 self._name = '{}_shared_embedding'.format(self.categorical_column.name) 

2644 return self._name 

2645 

2646 @property 

2647 def _var_scope_name(self): 

2648 return self.shared_embedding_collection_name 

2649 

2650 @property 

2651 def _parse_example_spec(self): 

2652 return self.categorical_column._parse_example_spec # pylint: disable=protected-access 

2653 

2654 def _transform_feature(self, inputs): 

2655 return inputs.get(self.categorical_column) 

2656 

2657 @property 

2658 def _variable_shape(self): 

2659 if not hasattr(self, '_shape'): 

2660 self._shape = tensor_shape.TensorShape([self.dimension]) 

2661 return self._shape 

2662 

2663 def _get_dense_tensor_internal(self, 

2664 inputs, 

2665 weight_collections=None, 

2666 trainable=None): 

2667 """Private method that follows the signature of _get_dense_tensor.""" 

2668 # This method is called from a variable_scope with name _var_scope_name, 

2669 # which is shared among all shared embeddings. Open a name_scope here, so 

2670 # that the ops for different columns have distinct names. 

2671 with ops.name_scope(None, default_name=self.name): 

2672 # Get sparse IDs and weights. 

2673 sparse_tensors = self.categorical_column._get_sparse_tensors( # pylint: disable=protected-access 

2674 inputs, 

2675 weight_collections=weight_collections, 

2676 trainable=trainable) 

2677 sparse_ids = sparse_tensors.id_tensor 

2678 sparse_weights = sparse_tensors.weight_tensor 

2679 

2680 embedding_shape = (self.categorical_column._num_buckets, self.dimension) # pylint: disable=protected-access 

2681 shared_embedding_collection = ops.get_collection( 

2682 self.shared_embedding_collection_name) 

2683 if shared_embedding_collection: 

2684 if len(shared_embedding_collection) > 1: 

2685 raise ValueError( 

2686 'Collection {} can only contain one variable. ' 

2687 'Suggested fix A: Choose a unique name for this collection. ' 

2688 'Suggested fix B: Do not add any variables to this collection. ' 

2689 'The feature_column library already adds a variable under the ' 

2690 'hood.'.format(shared_embedding_collection)) 

2691 embedding_weights = shared_embedding_collection[0] 

2692 if embedding_weights.get_shape() != embedding_shape: 

2693 raise ValueError( 

2694 'Shared embedding collection {} contains variable {} of ' 

2695 'unexpected shape {}. Expected shape is {}. ' 

2696 'Suggested fix A: Choose a unique name for this collection. ' 

2697 'Suggested fix B: Do not add any variables to this collection. ' 

2698 'The feature_column library already adds a variable under the ' 

2699 'hood.'.format(self.shared_embedding_collection_name, 

2700 embedding_weights.name, 

2701 embedding_weights.get_shape(), embedding_shape)) 

2702 else: 

2703 embedding_weights = variable_scope.get_variable( 

2704 name='embedding_weights', 

2705 shape=embedding_shape, 

2706 dtype=dtypes.float32, 

2707 initializer=self.initializer, 

2708 trainable=self.trainable and trainable, 

2709 collections=weight_collections) 

2710 ops.add_to_collection(self.shared_embedding_collection_name, 

2711 embedding_weights) 

2712 if self.ckpt_to_load_from is not None: 

2713 to_restore = embedding_weights 

2714 if isinstance(to_restore, variables.PartitionedVariable): 

2715 to_restore = to_restore._get_variable_list() # pylint: disable=protected-access 

2716 checkpoint_utils.init_from_checkpoint( 

2717 self.ckpt_to_load_from, {self.tensor_name_in_ckpt: to_restore}) 

2718 

2719 sparse_id_rank = tensor_shape.dimension_value( 

2720 sparse_ids.dense_shape.get_shape()[0]) 

2721 embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse 

2722 if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and 

2723 sparse_id_rank <= 2): 

2724 embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse_v2 

2725 # Return embedding lookup result. 

2726 return embedding_lookup_sparse( 

2727 embedding_weights, 

2728 sparse_ids, 

2729 sparse_weights, 

2730 combiner=self.combiner, 

2731 name='%s_weights' % self.name, 

2732 max_norm=self.max_norm) 

2733 

2734 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 

2735 if isinstance(self.categorical_column, _SequenceCategoricalColumn): 

2736 raise ValueError( 

2737 'In embedding_column: {}. ' 

2738 'categorical_column must not be of type _SequenceCategoricalColumn. ' 

2739 'Suggested fix A: If you wish to use input_layer, use a ' 

2740 'non-sequence categorical_column_with_*. ' 

2741 'Suggested fix B: If you wish to create sequence input, use ' 

2742 'sequence_input_layer instead of input_layer. ' 

2743 'Given (type {}): {}'.format(self.name, type(self.categorical_column), 

2744 self.categorical_column)) 

2745 return self._get_dense_tensor_internal( 

2746 inputs=inputs, 

2747 weight_collections=weight_collections, 

2748 trainable=trainable) 

2749 

2750 def _get_sequence_dense_tensor(self, 

2751 inputs, 

2752 weight_collections=None, 

2753 trainable=None): 

2754 if not isinstance(self.categorical_column, _SequenceCategoricalColumn): 

2755 raise ValueError( 

2756 'In embedding_column: {}. ' 

2757 'categorical_column must be of type _SequenceCategoricalColumn ' 

2758 'to use sequence_input_layer. ' 

2759 'Suggested fix: Use one of sequence_categorical_column_with_*. ' 

2760 'Given (type {}): {}'.format(self.name, type(self.categorical_column), 

2761 self.categorical_column)) 

2762 dense_tensor = self._get_dense_tensor_internal( # pylint: disable=protected-access 

2763 inputs=inputs, 

2764 weight_collections=weight_collections, 

2765 trainable=trainable) 

2766 sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access 

2767 sequence_length = fc_utils.sequence_length_from_sparse_tensor( 

2768 sparse_tensors.id_tensor) 

2769 return _SequenceDenseColumn.TensorSequenceLengthPair( 

2770 dense_tensor=dense_tensor, sequence_length=sequence_length) 

2771 

2772 

2773def _check_shape(shape, key): 

2774 """Returns shape if it's valid, raises error otherwise.""" 

2775 assert shape is not None 

2776 if not nest.is_nested(shape): 

2777 shape = [shape] 

2778 shape = tuple(shape) 

2779 for dimension in shape: 

2780 if not isinstance(dimension, six.integer_types): 

2781 raise TypeError('shape dimensions must be integer. ' 

2782 'shape: {}, key: {}'.format(shape, key)) 

2783 if dimension < 1: 

2784 raise ValueError('shape dimensions must be greater than 0. ' 

2785 'shape: {}, key: {}'.format(shape, key)) 

2786 return shape 

2787 

2788 

2789class _HashedCategoricalColumn(_CategoricalColumn, 

2790 collections.namedtuple( 

2791 '_HashedCategoricalColumn', 

2792 ['key', 'hash_bucket_size', 'dtype'])): 

2793 """see `categorical_column_with_hash_bucket`.""" 

2794 

2795 @property 

2796 def name(self): 

2797 return self.key 

2798 

2799 @property 

2800 def _parse_example_spec(self): 

2801 return {self.key: parsing_ops.VarLenFeature(self.dtype)} 

2802 

2803 def _transform_feature(self, inputs): 

2804 input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key)) 

2805 if not isinstance(input_tensor, sparse_tensor_lib.SparseTensor): 

2806 raise ValueError('SparseColumn input must be a SparseTensor.') 

2807 

2808 fc_utils.assert_string_or_int( 

2809 input_tensor.dtype, 

2810 prefix='column_name: {} input_tensor'.format(self.key)) 

2811 

2812 if self.dtype.is_integer != input_tensor.dtype.is_integer: 

2813 raise ValueError( 

2814 'Column dtype and SparseTensors dtype must be compatible. ' 

2815 'key: {}, column dtype: {}, tensor dtype: {}'.format( 

2816 self.key, self.dtype, input_tensor.dtype)) 

2817 

2818 if self.dtype == dtypes.string: 

2819 sparse_values = input_tensor.values 

2820 else: 

2821 sparse_values = string_ops.as_string(input_tensor.values) 

2822 

2823 sparse_id_values = string_ops.string_to_hash_bucket_fast( 

2824 sparse_values, self.hash_bucket_size, name='lookup') 

2825 return sparse_tensor_lib.SparseTensor(input_tensor.indices, 

2826 sparse_id_values, 

2827 input_tensor.dense_shape) 

2828 

2829 @property 

2830 def _num_buckets(self): 

2831 """Returns number of buckets in this sparse feature.""" 

2832 return self.hash_bucket_size 

2833 

2834 def _get_sparse_tensors(self, 

2835 inputs, 

2836 weight_collections=None, 

2837 trainable=None): 

2838 return _CategoricalColumn.IdWeightPair(inputs.get(self), None) 

2839 

2840 
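# Editorial sketch (hedged, not original code): the hashed column stringifies
# non-string values and hashes them into `hash_bucket_size` buckets. With this
# module's own imports, the core mapping is:
#
#   values = string_ops.as_string([63, 7, 63])  # int inputs become strings
#   ids = string_ops.string_to_hash_bucket_fast(values, 100)
#   # ids are deterministic in [0, 100); equal inputs share a bucket, while
#   # distinct inputs may collide.
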

class _VocabularyFileCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple('_VocabularyFileCategoricalColumn',
                           ('key', 'vocabulary_file', 'vocabulary_size',
                            'num_oov_buckets', 'dtype', 'default_value'))):
  """See `categorical_column_with_vocabulary_file`."""

  @property
  def name(self):
    return self.key

  @property
  def _parse_example_spec(self):
    return {self.key: parsing_ops.VarLenFeature(self.dtype)}

  def _transform_feature(self, inputs):
    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))

    if self.dtype.is_integer != input_tensor.dtype.is_integer:
      raise ValueError(
          'Column dtype and SparseTensors dtype must be compatible. '
          'key: {}, column dtype: {}, tensor dtype: {}'.format(
              self.key, self.dtype, input_tensor.dtype))

    fc_utils.assert_string_or_int(
        input_tensor.dtype,
        prefix='column_name: {} input_tensor'.format(self.key))

    key_dtype = self.dtype
    if input_tensor.dtype.is_integer:
      # `index_table_from_file` requires 64-bit integer keys.
      key_dtype = dtypes.int64
      input_tensor = math_ops.cast(input_tensor, dtypes.int64)

    return lookup_ops.index_table_from_file(
        vocabulary_file=self.vocabulary_file,
        num_oov_buckets=self.num_oov_buckets,
        vocab_size=self.vocabulary_size,
        default_value=self.default_value,
        key_dtype=key_dtype,
        name='{}_lookup'.format(self.key)).lookup(input_tensor)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return self.vocabulary_size + self.num_oov_buckets

  def _get_sparse_tensors(self,
                          inputs,
                          weight_collections=None,
                          trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)

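# Editorial sketch (hedged, not original code): `_num_buckets` above is
# `vocabulary_size + num_oov_buckets` because in-vocabulary values map to
# their line index in the file, while out-of-vocabulary values are hashed
# into the trailing OOV range (or mapped to `default_value` when
# `num_oov_buckets` is 0). With a hypothetical 3-line vocabulary file:
#
#   table = lookup_ops.index_table_from_file(
#       vocabulary_file='/tmp/dept.txt', vocab_size=3, num_oov_buckets=2)
#   # known terms -> ids 0..2; unknown terms -> ids 3..4.
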

class _VocabularyListCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple(
        '_VocabularyListCategoricalColumn',
        ('key', 'vocabulary_list', 'dtype', 'default_value', 'num_oov_buckets'))
):
  """See `categorical_column_with_vocabulary_list`."""

  @property
  def name(self):
    return self.key

  @property
  def _parse_example_spec(self):
    return {self.key: parsing_ops.VarLenFeature(self.dtype)}

  def _transform_feature(self, inputs):
    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))

    if self.dtype.is_integer != input_tensor.dtype.is_integer:
      raise ValueError(
          'Column dtype and SparseTensors dtype must be compatible. '
          'key: {}, column dtype: {}, tensor dtype: {}'.format(
              self.key, self.dtype, input_tensor.dtype))

    fc_utils.assert_string_or_int(
        input_tensor.dtype,
        prefix='column_name: {} input_tensor'.format(self.key))

    key_dtype = self.dtype
    if input_tensor.dtype.is_integer:
      # `index_table_from_tensor` requires 64-bit integer keys.
      key_dtype = dtypes.int64
      input_tensor = math_ops.cast(input_tensor, dtypes.int64)

    return lookup_ops.index_table_from_tensor(
        vocabulary_list=tuple(self.vocabulary_list),
        default_value=self.default_value,
        num_oov_buckets=self.num_oov_buckets,
        dtype=key_dtype,
        name='{}_lookup'.format(self.key)).lookup(input_tensor)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return len(self.vocabulary_list) + self.num_oov_buckets

  def _get_sparse_tensors(self,
                          inputs,
                          weight_collections=None,
                          trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)

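# Editorial sketch (hedged, not original code): the in-memory analogue of the
# file-based column; the id space is `len(vocabulary_list) + num_oov_buckets`:
#
#   table = lookup_ops.index_table_from_tensor(
#       vocabulary_list=('math', 'philosophy', 'english'), num_oov_buckets=1)
#   # 'math' -> 0, 'english' -> 2, any other term -> 3 (the one OOV bucket).
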

class _IdentityCategoricalColumn(_CategoricalColumn,
                                 collections.namedtuple(
                                     '_IdentityCategoricalColumn',
                                     ('key', 'num_buckets', 'default_value'))):
  """See `categorical_column_with_identity`."""

  @property
  def name(self):
    return self.key

  @property
  def _parse_example_spec(self):
    return {self.key: parsing_ops.VarLenFeature(dtypes.int64)}

  def _transform_feature(self, inputs):
    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))

    if not input_tensor.dtype.is_integer:
      raise ValueError('Invalid input, not integer. key: {} dtype: {}'.format(
          self.key, input_tensor.dtype))
    values = input_tensor.values
    if input_tensor.values.dtype != dtypes.int64:
      values = math_ops.cast(values, dtypes.int64, name='values')
    if self.default_value is not None:
      num_buckets = math_ops.cast(
          self.num_buckets, dtypes.int64, name='num_buckets')
      zero = math_ops.cast(0, dtypes.int64, name='zero')
      # Assign default for out-of-range values.
      values = array_ops.where(
          math_ops.logical_or(
              values < zero, values >= num_buckets, name='out_of_range'),
          array_ops.fill(
              dims=array_ops.shape(values),
              value=math_ops.cast(self.default_value, dtypes.int64),
              name='default_values'), values)
    return sparse_tensor_lib.SparseTensor(
        indices=input_tensor.indices,
        values=values,
        dense_shape=input_tensor.dense_shape)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return self.num_buckets

  def _get_sparse_tensors(self,
                          inputs,
                          weight_collections=None,
                          trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)

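# Editorial illustration (hedged, not original code): identity columns use the
# raw integer as the bucket id. With num_buckets=4 and default_value=0, the
# where/fill logic above rewrites out-of-range ids:
#
#   input values:  [1, 5, -2, 3]
#   output values: [1, 0,  0, 3]   # 5 and -2 fall outside [0, 4)
#
# With default_value=None, values pass through unvalidated.
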

class _WeightedCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple(
        '_WeightedCategoricalColumn',
        ('categorical_column', 'weight_feature_key', 'dtype'))):
  """See `weighted_categorical_column`."""

  @property
  def name(self):
    return '{}_weighted_by_{}'.format(self.categorical_column.name,
                                      self.weight_feature_key)

  @property
  def _parse_example_spec(self):
    config = self.categorical_column._parse_example_spec  # pylint: disable=protected-access
    if self.weight_feature_key in config:
      raise ValueError('Parse config {} already exists for {}.'.format(
          config[self.weight_feature_key], self.weight_feature_key))
    config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype)
    return config

  @property
  def _num_buckets(self):
    return self.categorical_column._num_buckets  # pylint: disable=protected-access

  def _transform_feature(self, inputs):
    weight_tensor = inputs.get(self.weight_feature_key)
    if weight_tensor is None:
      raise ValueError('Missing weights {}.'.format(self.weight_feature_key))
    weight_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
        weight_tensor)
    if self.dtype != weight_tensor.dtype.base_dtype:
      raise ValueError('Bad dtype, expected {}, but got {}.'.format(
          self.dtype, weight_tensor.dtype))
    if not isinstance(weight_tensor, sparse_tensor_lib.SparseTensor):
      # The weight tensor can be a regular Tensor. In this case, sparsify it.
      weight_tensor = _to_sparse_input_and_drop_ignore_values(
          weight_tensor, ignore_value=0.0)
    if not weight_tensor.dtype.is_floating:
      weight_tensor = math_ops.cast(weight_tensor, dtypes.float32)
    return (inputs.get(self.categorical_column), weight_tensor)

  def _get_sparse_tensors(self,
                          inputs,
                          weight_collections=None,
                          trainable=None):
    del weight_collections
    del trainable
    tensors = inputs.get(self)
    return _CategoricalColumn.IdWeightPair(tensors[0], tensors[1])

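# Editorial sketch (hedged, not original code): a weighted column pairs each
# categorical id with a float weight read from a second feature, so its parse
# spec carries two keys (the names below are illustrative):
#
#   {'terms': parsing_ops.VarLenFeature(dtypes.string),
#    'frequencies': parsing_ops.VarLenFeature(dtypes.float32)}
#
# `_get_sparse_tensors` then returns IdWeightPair(ids, weights), letting
# linear models and embedding combiners scale each id's contribution.
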

class _CrossedColumn(
    _CategoricalColumn,
    collections.namedtuple('_CrossedColumn',
                           ['keys', 'hash_bucket_size', 'hash_key'])):
  """See `crossed_column`."""

  @property
  def name(self):
    feature_names = []
    for key in _collect_leaf_level_keys(self):
      if isinstance(key, _FeatureColumn):
        feature_names.append(key.name)
      else:  # key must be a string
        feature_names.append(key)
    return '_X_'.join(sorted(feature_names))

  @property
  def _parse_example_spec(self):
    config = {}
    for key in self.keys:
      if isinstance(key, _FeatureColumn):
        config.update(key._parse_example_spec)  # pylint: disable=protected-access
      else:  # key must be a string
        config.update({key: parsing_ops.VarLenFeature(dtypes.string)})
    return config

  def _transform_feature(self, inputs):
    feature_tensors = []
    for key in _collect_leaf_level_keys(self):
      if isinstance(key, six.string_types):
        feature_tensors.append(inputs.get(key))
      elif isinstance(key, _CategoricalColumn):
        ids_and_weights = key._get_sparse_tensors(inputs)  # pylint: disable=protected-access
        if ids_and_weights.weight_tensor is not None:
          raise ValueError(
              'crossed_column does not support weight_tensor, but the given '
              'column populates weight_tensor. '
              'Given column: {}'.format(key.name))
        feature_tensors.append(ids_and_weights.id_tensor)
      else:
        raise ValueError('Unsupported column type. Given: {}'.format(key))
    return sparse_ops.sparse_cross_hashed(
        inputs=feature_tensors,
        num_buckets=self.hash_bucket_size,
        hash_key=self.hash_key)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return self.hash_bucket_size

  def _get_sparse_tensors(self,
                          inputs,
                          weight_collections=None,
                          trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)

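# Editorial sketch (hedged, not original code): crossing 'department' with a
# bucketized age column named 'age_bucketized' yields the column name
# 'age_bucketized_X_department' (sorted leaf names), and the transform reduces
# to something like:
#
#   crossed_ids = sparse_ops.sparse_cross_hashed(
#       inputs=[dept_ids, age_bucket_ids],  # hypothetical id tensors
#       num_buckets=1000, hash_key=None)
#   # each (department, age_bucket) pair hashes to one id in [0, 1000).
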

def _collect_leaf_level_keys(cross):
  """Collects base keys by expanding all nested crosses.

  Args:
    cross: A `_CrossedColumn`.

  Returns:
    A list of strings or `_CategoricalColumn` instances.
  """
  leaf_level_keys = []
  for k in cross.keys:
    if isinstance(k, _CrossedColumn):
      leaf_level_keys.extend(_collect_leaf_level_keys(k))
    else:
      leaf_level_keys.append(k)
  return leaf_level_keys

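# Editorial illustration (hedged, not original code): nested crosses flatten
# to their leaves, so the name and parse spec of `_CrossedColumn` only ever
# see base keys or non-crossed columns:
#
#   inner = _CrossedColumn(keys=('b', 'c'), hash_bucket_size=10, hash_key=None)
#   outer = _CrossedColumn(keys=('a', inner), hash_bucket_size=10,
#                          hash_key=None)
#   _collect_leaf_level_keys(outer)  # -> ['a', 'b', 'c']
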

class _IndicatorColumn(_DenseColumn, _SequenceDenseColumn,
                       collections.namedtuple('_IndicatorColumn',
                                              ['categorical_column'])):
  """Represents a one-hot column for use in deep networks.

  Args:
    categorical_column: A `_CategoricalColumn` which is created by a
      `categorical_column_with_*` function.
  """

  @property
  def name(self):
    return '{}_indicator'.format(self.categorical_column.name)

  def _transform_feature(self, inputs):
    """Returns a dense `Tensor` representing the feature.

    Args:
      inputs: A `_LazyBuilder` object to access inputs.

    Returns:
      Transformed feature `Tensor`.

    Raises:
      ValueError: if input rank is not known at graph building time.
    """
    id_weight_pair = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
    id_tensor = id_weight_pair.id_tensor
    weight_tensor = id_weight_pair.weight_tensor

    # If the underlying column is weighted, return the input as a dense tensor.
    if weight_tensor is not None:
      weighted_column = sparse_ops.sparse_merge(
          sp_ids=id_tensor,
          sp_values=weight_tensor,
          vocab_size=int(self._variable_shape[-1]))
      # Remove (?, -1) index.
      weighted_column = sparse_ops.sparse_slice(weighted_column, [0, 0],
                                                weighted_column.dense_shape)
      # Use scatter_nd to merge duplicated indices, if any, instead of
      # sparse_tensor_to_dense.
      return array_ops.scatter_nd(weighted_column.indices,
                                  weighted_column.values,
                                  weighted_column.dense_shape)

    dense_id_tensor = sparse_ops.sparse_tensor_to_dense(
        id_tensor, default_value=-1)

    # The one-hot tensor must be float, since all other inputs to input_layer
    # are float32 and will be concatenated with it.
    one_hot_id_tensor = array_ops.one_hot(
        dense_id_tensor,
        depth=self._variable_shape[-1],
        on_value=1.0,
        off_value=0.0)

    # Reduce to get a multi-hot representation per example.
    return math_ops.reduce_sum(one_hot_id_tensor, axis=[-2])

  @property
  def _parse_example_spec(self):
    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access

  @property
  def _variable_shape(self):
    """Returns a `TensorShape` representing the shape of the dense `Tensor`."""
    return tensor_shape.TensorShape([1, self.categorical_column._num_buckets])  # pylint: disable=protected-access

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    """Returns a dense `Tensor` representing the feature.

    Args:
      inputs: A `_LazyBuilder` object to access inputs.
      weight_collections: Unused `weight_collections` since no variables are
        created in this function.
      trainable: Unused `trainable` bool since no variables are created in this
        function.

    Returns:
      Dense `Tensor` created within `_transform_feature`.

    Raises:
      ValueError: If `categorical_column` is a `_SequenceCategoricalColumn`.
    """
    # Do nothing with weight_collections and trainable since no variables are
    # created in this function.
    del weight_collections
    del trainable
    if isinstance(self.categorical_column, _SequenceCategoricalColumn):
      raise ValueError(
          'In indicator_column: {}. '
          'categorical_column must not be of type _SequenceCategoricalColumn. '
          'Suggested fix A: If you wish to use input_layer, use a '
          'non-sequence categorical_column_with_*. '
          'Suggested fix B: If you wish to create sequence input, use '
          'sequence_input_layer instead of input_layer. '
          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                       self.categorical_column))
    # The feature has already been transformed. Return the intermediate
    # representation created by _transform_feature.
    return inputs.get(self)

  def _get_sequence_dense_tensor(self,
                                 inputs,
                                 weight_collections=None,
                                 trainable=None):
    # Do nothing with weight_collections and trainable since no variables are
    # created in this function.
    del weight_collections
    del trainable
    if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
      raise ValueError(
          'In indicator_column: {}. '
          'categorical_column must be of type _SequenceCategoricalColumn '
          'to use sequence_input_layer. '
          'Suggested fix: Use one of sequence_categorical_column_with_*. '
          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
                                       self.categorical_column))
    # The feature has already been transformed. Return the intermediate
    # representation created by _transform_feature.
    dense_tensor = inputs.get(self)
    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
    sequence_length = fc_utils.sequence_length_from_sparse_tensor(
        sparse_tensors.id_tensor)
    return _SequenceDenseColumn.TensorSequenceLengthPair(
        dense_tensor=dense_tensor, sequence_length=sequence_length)

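# Editorial illustration (hedged, not original code): for an unweighted column
# with 3 buckets, the one_hot/reduce_sum pipeline above produces a per-example
# multi-hot count vector; missing entries become -1 in the dense id tensor and
# one-hot to all zeros:
#
#   dense ids: [[1, 2],     multi-hot: [[0., 1., 1.],
#               [2, -1]]                [0., 0., 1.]]
#
# Repeated ids accumulate, so [[2, 2]] would yield [[0., 0., 2.]].
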

def _verify_static_batch_size_equality(tensors, columns):
  """Validates that the first dim (batch size) of all tensors is equal or None.

  Args:
    tensors: list of tensors to check.
    columns: list of feature columns matching tensors. Will be used for error
      messaging.

  Raises:
    ValueError: if the tensors have different static batch sizes.
  """
  # expected_batch_size is a tf.compat.v1.Dimension object.
  expected_batch_size = None
  for i in range(len(tensors)):
    if tensors[i].shape.dims[0].value is not None:
      if expected_batch_size is None:
        batch_size_column_index = i
        expected_batch_size = tensors[i].shape.dims[0]
      elif not expected_batch_size.is_compatible_with(tensors[i].shape.dims[0]):
        raise ValueError(
            'Batch size (first dimension) of each feature must be the same. '
            'Batch size of columns ({}, {}): ({}, {})'.format(
                columns[batch_size_column_index].name, columns[i].name,
                expected_batch_size, tensors[i].shape.dims[0]))

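# Editorial illustration (hedged, not original code): only statically known
# batch dimensions are compared, so static shapes (None, 4) and (32, 4) pass,
# while (32, 4) and (16, 4) raise the ValueError above, naming both columns.
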

class _SequenceCategoricalColumn(_CategoricalColumn,
                                 collections.namedtuple(
                                     '_SequenceCategoricalColumn',
                                     ['categorical_column'])):
  """Represents sequences of categorical data."""

  @property
  def name(self):
    return self.categorical_column.name

  @property
  def _parse_example_spec(self):
    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access

  def _transform_feature(self, inputs):
    return self.categorical_column._transform_feature(inputs)  # pylint: disable=protected-access

  @property
  def _num_buckets(self):
    return self.categorical_column._num_buckets  # pylint: disable=protected-access

  def _get_sparse_tensors(self,
                          inputs,
                          weight_collections=None,
                          trainable=None):
    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
    id_tensor = sparse_tensors.id_tensor
    weight_tensor = sparse_tensors.weight_tensor

    # Expands the third dimension, if necessary, so that embeddings are not
    # combined during the embedding lookup. If the tensor is already 3D, it is
    # left as-is.
    shape = array_ops.shape(id_tensor)
    # Compute the third dimension explicitly instead of setting it to -1, as
    # -1 does not work for dynamically shaped tensors with 0 length at
    # runtime, which happens for empty sequences.
    target_shape = [shape[0], shape[1], math_ops.reduce_prod(shape[2:])]
    id_tensor = sparse_ops.sparse_reshape(id_tensor, target_shape)
    if weight_tensor is not None:
      weight_tensor = sparse_ops.sparse_reshape(weight_tensor, target_shape)

    return _CategoricalColumn.IdWeightPair(id_tensor, weight_tensor)
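
# Editorial illustration (hedged, not original code): the reshape above
# guarantees rank-3 ids of shape [batch, max_seq_len, num_elements] so the
# embedding lookup combines only over the last dimension:
#
#   dense_shape [2, 5]       -> [2, 5, 1]   (reduce_prod over no dims is 1)
#   dense_shape [2, 5, 3, 4] -> [2, 5, 12]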