Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/tensorflow/python/feature_column/sequence_feature_column.py: 51%
105 statements (coverage.py v7.4.0, created at 2024-01-03 07:57 +0000)

# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""This API defines FeatureColumn for sequential input.

NOTE: This API is a work in progress and will likely be changing frequently.
"""

import collections

from tensorflow.python.feature_column import feature_column_v2 as fc
from tensorflow.python.feature_column import utils as fc_utils
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import parsing_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.util import deprecation
from tensorflow.python.util.tf_export import tf_export
from tensorflow.tools.docs import doc_controls

_FEATURE_COLUMN_DEPRECATION_WARNING = """\
    Warning: tf.feature_column is not recommended for new code. Instead,
    feature preprocessing can be done directly using either [Keras preprocessing
    layers](https://www.tensorflow.org/guide/migrate/migrating_feature_columns)
    or through the one-stop utility [`tf.keras.utils.FeatureSpace`](https://www.tensorflow.org/api_docs/python/tf/keras/utils/FeatureSpace)
    built on top of them. See the [migration guide](https://tensorflow.org/guide/migrate)
    for details.
    """

_FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING = (
    'Use Keras preprocessing layers instead, either directly or via the '
    '`tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has '
    'a functional equivalent in `tf.keras.layers` for feature preprocessing '
    'when training a Keras model.')

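
# A minimal sketch of the kind of Keras-preprocessing replacement the
# deprecation message above points to. This is an assumed, illustrative
# equivalence (it is not part of the original module): a vocabulary-based
# sequence categorical column followed by `embedding_column` roughly maps to
# `tf.keras.layers.StringLookup` followed by `tf.keras.layers.Embedding`.
def _example_keras_preprocessing_equivalent(vocabulary, embedding_dim=8):
  """Builds an illustrative StringLookup + Embedding pair."""
  # Lazy import so this sketch adds no module-level dependency.
  import tensorflow as tf  # pylint: disable=g-import-not-at-top

  lookup = tf.keras.layers.StringLookup(vocabulary=vocabulary)
  embedding = tf.keras.layers.Embedding(
      input_dim=lookup.vocabulary_size(), output_dim=embedding_dim)
  return lookup, embedding
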

# pylint: disable=protected-access
def concatenate_context_input(context_input, sequence_input):
  """Replicates `context_input` across all timesteps of `sequence_input`.

  Expands dimension 1 of `context_input`, then tiles it `padded_length` times.
  This value is appended to `sequence_input` on dimension 2 and the result is
  returned.

  Args:
    context_input: A `Tensor` of dtype `float32` and shape `[batch_size, d1]`.
    sequence_input: A `Tensor` of dtype `float32` and shape `[batch_size,
      padded_length, d0]`.

  Returns:
    A `Tensor` of dtype `float32` and shape `[batch_size, padded_length,
    d0 + d1]`.

  Raises:
    ValueError: If `sequence_input` does not have rank 3 or `context_input`
      does not have rank 2.
  """
  seq_rank_check = check_ops.assert_rank(
      sequence_input,
      3,
      message='sequence_input must have rank 3',
      data=[array_ops.shape(sequence_input)])
  seq_type_check = check_ops.assert_type(
      sequence_input,
      dtypes.float32,
      message='sequence_input must have dtype float32; got {}.'.format(
          sequence_input.dtype))
  ctx_rank_check = check_ops.assert_rank(
      context_input,
      2,
      message='context_input must have rank 2',
      data=[array_ops.shape(context_input)])
  ctx_type_check = check_ops.assert_type(
      context_input,
      dtypes.float32,
      message='context_input must have dtype float32; got {}.'.format(
          context_input.dtype))
  with ops.control_dependencies(
      [seq_rank_check, seq_type_check, ctx_rank_check, ctx_type_check]):
    padded_length = array_ops.shape(sequence_input)[1]
    tiled_context_input = array_ops.tile(
        array_ops.expand_dims(context_input, 1),
        array_ops.concat([[1], [padded_length], [1]], 0))
    return array_ops.concat([sequence_input, tiled_context_input], 2)

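
# A minimal shape sketch for `concatenate_context_input` (illustrative only;
# this helper is not part of the original module).
def _example_concatenate_context_input():
  """Shows how a [2, 3] context is tiled onto a [2, 5, 4] sequence input."""
  context = array_ops.ones([2, 3], dtype=dtypes.float32)      # [batch_size, d1]
  sequence = array_ops.ones([2, 5, 4], dtype=dtypes.float32)  # [batch_size, padded_length, d0]
  combined = concatenate_context_input(context, sequence)
  # `combined` has shape [2, 5, 7]: the context features are replicated at
  # every timestep and appended along the last dimension (d0 + d1).
  return combined
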

@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export('feature_column.sequence_categorical_column_with_identity')
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def sequence_categorical_column_with_identity(key,
                                              num_buckets,
                                              default_value=None):
  """Returns a feature column that represents sequences of integers.

  Pass this to `embedding_column` or `indicator_column` to convert sequence
  categorical data into dense representation for input to sequence NN, such as
  RNN.

  Example:

  ```python
  watches = sequence_categorical_column_with_identity(
      'watches', num_buckets=1000)
  watches_embedding = embedding_column(watches, dimension=10)
  columns = [watches_embedding]

  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input feature.
    num_buckets: Range of inputs. Namely, inputs are expected to be in the
      range `[0, num_buckets)`.
    default_value: If `None`, this column's graph operations will fail for
      out-of-range inputs. Otherwise, this value must be in the range
      `[0, num_buckets)`, and will replace out-of-range inputs.

  Returns:
    A `SequenceCategoricalColumn`.

  Raises:
    ValueError: if `num_buckets` is less than one.
    ValueError: if `default_value` is not in range `[0, num_buckets)`.
  """
  return fc.SequenceCategoricalColumn(
      fc.categorical_column_with_identity(
          key=key, num_buckets=num_buckets, default_value=default_value))


@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export('feature_column.sequence_categorical_column_with_hash_bucket')
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def sequence_categorical_column_with_hash_bucket(key,
                                                 hash_bucket_size,
                                                 dtype=dtypes.string):
  """A sequence of categorical terms where ids are set by hashing.

  Pass this to `embedding_column` or `indicator_column` to convert sequence
  categorical data into dense representation for input to sequence NN, such as
  RNN.

  Example:

  ```python
  tokens = sequence_categorical_column_with_hash_bucket(
      'tokens', hash_bucket_size=1000)
  tokens_embedding = embedding_column(tokens, dimension=10)
  columns = [tokens_embedding]

  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input feature.
    hash_bucket_size: An int > 1. The number of buckets.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `SequenceCategoricalColumn`.

  Raises:
    ValueError: `hash_bucket_size` is not greater than 1.
    ValueError: `dtype` is neither string nor integer.
  """
  return fc.SequenceCategoricalColumn(
      fc.categorical_column_with_hash_bucket(
          key=key, hash_bucket_size=hash_bucket_size, dtype=dtype))


@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export('feature_column.sequence_categorical_column_with_vocabulary_file')
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def sequence_categorical_column_with_vocabulary_file(key,
                                                     vocabulary_file,
                                                     vocabulary_size=None,
                                                     num_oov_buckets=0,
                                                     default_value=None,
                                                     dtype=dtypes.string):
  """A sequence of categorical terms where ids use a vocabulary file.

  Pass this to `embedding_column` or `indicator_column` to convert sequence
  categorical data into dense representation for input to sequence NN, such as
  RNN.

  Example:

  ```python
  states = sequence_categorical_column_with_vocabulary_file(
      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
      num_oov_buckets=5)
  states_embedding = embedding_column(states, dimension=10)
  columns = [states_embedding]

  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input feature.
    vocabulary_file: The vocabulary file name.
    vocabulary_size: Number of elements in the vocabulary. This must be no
      greater than the number of entries in `vocabulary_file`; if it is
      smaller, later values are ignored. If `None`, it is set to the length of
      `vocabulary_file`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
      `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
      the input value. A positive `num_oov_buckets` cannot be specified with
      `default_value`.
    default_value: The integer ID value to return for out-of-vocabulary feature
      values, defaults to `-1`. This cannot be specified with a positive
      `num_oov_buckets`.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `SequenceCategoricalColumn`.

  Raises:
    ValueError: `vocabulary_file` is missing or cannot be opened.
    ValueError: `vocabulary_size` is missing or < 1.
    ValueError: `num_oov_buckets` is a negative integer.
    ValueError: `num_oov_buckets` and `default_value` are both specified.
    ValueError: `dtype` is neither string nor integer.
  """
  return fc.SequenceCategoricalColumn(
      fc.categorical_column_with_vocabulary_file(
          key=key,
          vocabulary_file=vocabulary_file,
          vocabulary_size=vocabulary_size,
          num_oov_buckets=num_oov_buckets,
          default_value=default_value,
          dtype=dtype))


@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export('feature_column.sequence_categorical_column_with_vocabulary_list')
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def sequence_categorical_column_with_vocabulary_list(key,
                                                     vocabulary_list,
                                                     dtype=None,
                                                     default_value=-1,
                                                     num_oov_buckets=0):
  """A sequence of categorical terms where ids use an in-memory list.

  Pass this to `embedding_column` or `indicator_column` to convert sequence
  categorical data into dense representation for input to sequence NN, such as
  RNN.

  Example:

  ```python
  colors = sequence_categorical_column_with_vocabulary_list(
      key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
      num_oov_buckets=2)
  colors_embedding = embedding_column(colors, dimension=3)
  columns = [colors_embedding]

  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input feature.
    vocabulary_list: An ordered iterable defining the vocabulary. Each feature
      is mapped to the index of its value (if present) in `vocabulary_list`.
      Must be castable to `dtype`.
    dtype: The type of features. Only string and integer types are supported.
      If `None`, it will be inferred from `vocabulary_list`.
    default_value: The integer ID value to return for out-of-vocabulary feature
      values, defaults to `-1`. This cannot be specified with a positive
      `num_oov_buckets`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
      `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on a
      hash of the input value. A positive `num_oov_buckets` cannot be specified
      with `default_value`.

  Returns:
    A `SequenceCategoricalColumn`.

  Raises:
    ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
    ValueError: `num_oov_buckets` is a negative integer.
    ValueError: `num_oov_buckets` and `default_value` are both specified.
    ValueError: if `dtype` is not integer or string.
  """
  return fc.SequenceCategoricalColumn(
      fc.categorical_column_with_vocabulary_list(
          key=key,
          vocabulary_list=vocabulary_list,
          dtype=dtype,
          default_value=default_value,
          num_oov_buckets=num_oov_buckets))


@doc_controls.header(_FEATURE_COLUMN_DEPRECATION_WARNING)
@tf_export('feature_column.sequence_numeric_column')
@deprecation.deprecated(None, _FEATURE_COLUMN_DEPRECATION_RUNTIME_WARNING)
def sequence_numeric_column(key,
                            shape=(1,),
                            default_value=0.,
                            dtype=dtypes.float32,
                            normalizer_fn=None):
  """Returns a feature column that represents sequences of numeric data.

  Example:

  ```python
  temperature = sequence_numeric_column('temperature')
  columns = [temperature]

  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input features.
    shape: The shape of the input data per sequence id. E.g. if `shape=(2,)`,
      each example must contain `2 * sequence_length` values.
    default_value: A single value compatible with `dtype` that is used for
      padding the sparse data into a dense `Tensor`.
    dtype: The type of values.
    normalizer_fn: If not `None`, a function that can be used to normalize the
      value of the tensor after `default_value` is applied for parsing.
      Normalizer function takes the input `Tensor` as its argument, and returns
      the output `Tensor` (e.g. `lambda x: (x - 3.0) / 4.2`). Please note that
      even though the most common use case of this function is normalization,
      it can be used for any kind of TensorFlow transformation.

  Returns:
    A `SequenceNumericColumn`.

  Raises:
    TypeError: if any dimension in `shape` is not an int.
    ValueError: if any dimension in `shape` is not a positive integer.
    ValueError: if `dtype` is not convertible to `tf.float32`.
  """
  shape = fc._check_shape(shape=shape, key=key)
  if not (dtype.is_integer or dtype.is_floating):
    raise ValueError('dtype must be convertible to float. '
                     'dtype: {}, key: {}'.format(dtype, key))
  if normalizer_fn is not None and not callable(normalizer_fn):
    raise TypeError(
        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))

  return SequenceNumericColumn(
      key,
      shape=shape,
      default_value=default_value,
      dtype=dtype,
      normalizer_fn=normalizer_fn)

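
# A minimal usage sketch for `sequence_numeric_column` (illustrative only;
# this helper is not part of the original module). With `shape=(2,)`, each
# timestep contributes two values, so a sequence of T timesteps is parsed from
# `2 * T` raw values and later densified to `[batch_size, T, 2]`.
def _example_sequence_numeric_column():
  """Builds a sequence numeric column with a simple normalizer."""
  return sequence_numeric_column(
      'temperature',
      shape=(2,),
      default_value=0.,
      # Assumed normalization constants, purely for illustration.
      normalizer_fn=lambda x: (x - 3.0) / 4.2)
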

def _assert_all_equal_and_return(tensors, name=None):
  """Asserts that all tensors are equal and returns the first one."""
  with ops.name_scope(name, 'assert_all_equal', values=tensors):
    if len(tensors) == 1:
      return tensors[0]
    assert_equal_ops = []
    for t in tensors[1:]:
      assert_equal_ops.append(check_ops.assert_equal(tensors[0], t))
    with ops.control_dependencies(assert_equal_ops):
      return array_ops.identity(tensors[0])


class SequenceNumericColumn(
    fc.SequenceDenseColumn,
    collections.namedtuple(
        'SequenceNumericColumn',
        ('key', 'shape', 'default_value', 'dtype', 'normalizer_fn'))):
  """Represents sequences of numeric data."""

  @property
  def _is_v2_column(self):
    return True

  @property
  def name(self):
    """See `FeatureColumn` base class."""
    return self.key

  @property
  def parse_example_spec(self):
    """See `FeatureColumn` base class."""
    return {self.key: parsing_ops.VarLenFeature(self.dtype)}

  def transform_feature(self, transformation_cache, state_manager):
    """See `FeatureColumn` base class.

    In this case, we apply the `normalizer_fn` to the input tensor.

    Args:
      transformation_cache: A `FeatureTransformationCache` object to access
        features.
      state_manager: A `StateManager` to create / access resources such as
        lookup tables.

    Returns:
      Normalized input tensor.
    """
    input_tensor = transformation_cache.get(self.key, state_manager)
    if self.normalizer_fn is not None:
      input_tensor = self.normalizer_fn(input_tensor)
    return input_tensor

  @property
  def variable_shape(self):
    """Returns a `TensorShape` representing the shape of sequence input."""
    return tensor_shape.TensorShape(self.shape)

  def get_sequence_dense_tensor(self, transformation_cache, state_manager):
    """Returns a `TensorSequenceLengthPair`.

    Args:
      transformation_cache: A `FeatureTransformationCache` object to access
        features.
      state_manager: A `StateManager` to create / access resources such as
        lookup tables.
    """
    sp_tensor = transformation_cache.get(self, state_manager)
    dense_tensor = sparse_ops.sparse_tensor_to_dense(
        sp_tensor, default_value=self.default_value)
    # Reshape into [batch_size, T, variable_shape].
    dense_shape = array_ops.concat(
        [array_ops.shape(dense_tensor)[:1], [-1], self.variable_shape], axis=0)
    dense_tensor = array_ops.reshape(dense_tensor, shape=dense_shape)

    # Get the number of timesteps per example.
    # For the 2D case, the raw values are grouped according to num_elements;
    # for the 3D case, the grouping happens in the third dimension, and
    # sequence length is not affected.
    if sp_tensor.shape.ndims == 2:
      num_elements = self.variable_shape.num_elements()
    else:
      num_elements = 1
    seq_length = fc_utils.sequence_length_from_sparse_tensor(
        sp_tensor, num_elements=num_elements)

    return fc.SequenceDenseColumn.TensorSequenceLengthPair(
        dense_tensor=dense_tensor, sequence_length=seq_length)

  @property
  def parents(self):
    """See `FeatureColumn` base class."""
    return [self.key]

  def get_config(self):
    """See `FeatureColumn` base class."""
    config = dict(zip(self._fields, self))
    config['dtype'] = self.dtype.name
    return config

  @classmethod
  def from_config(cls, config, custom_objects=None, columns_by_name=None):
    """See `FeatureColumn` base class."""
    fc._check_config_keys(config, cls._fields)
    kwargs = fc._standardize_and_copy_config(config)
    kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
    return cls(**kwargs)


# pylint: enable=protected-access
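
# A minimal sketch of what `SequenceNumericColumn.get_sequence_dense_tensor`
# produces (illustrative only; this helper is not part of the original
# module). A ragged batch of scalar temperatures arrives as a SparseTensor;
# the column densifies it with `default_value` padding and reports the true
# length of each sequence.
def _example_sequence_numeric_dense_tensor():
  """Shows the padded dense tensor and per-example sequence lengths."""
  from tensorflow.python.framework import sparse_tensor  # pylint: disable=g-import-not-at-top

  column = sequence_numeric_column('temperature')
  # Example 0 has 3 timesteps, example 1 has 1 timestep.
  features = {
      'temperature': sparse_tensor.SparseTensor(
          indices=[[0, 0], [0, 1], [0, 2], [1, 0]],
          values=[10., 11., 12., 20.],
          dense_shape=[2, 3])
  }
  cache = fc.FeatureTransformationCache(features)
  dense_tensor, seq_length = column.get_sequence_dense_tensor(
      cache, state_manager=None)
  # `dense_tensor` has shape [2, 3, 1] with 0. padding; `seq_length` is [3, 1].
  return dense_tensor, seq_length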