Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/keras/src/layers/preprocessing/string_lookup.py: 50%
24 statements
coverage.py v7.4.0, created at 2024-01-03 07:57 +0000
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras string lookup preprocessing layer."""

import numpy as np
import tensorflow.compat.v2 as tf

from keras.src.engine import base_preprocessing_layer
from keras.src.layers.preprocessing import index_lookup

# isort: off
from tensorflow.python.util.tf_export import keras_export


@keras_export(
    "keras.layers.StringLookup",
    "keras.layers.experimental.preprocessing.StringLookup",
    v1=[],
)
class StringLookup(index_lookup.IndexLookup):
    """A preprocessing layer which maps string features to integer indices.

    This layer translates a set of arbitrary strings into integer output via a
    table-based vocabulary lookup. This layer will perform no splitting or
    transformation of input strings. For a layer that can split and tokenize
    natural language, see the `tf.keras.layers.TextVectorization` layer.

    The vocabulary for the layer must be either supplied on construction or
    learned via `adapt()`. During `adapt()`, the layer will analyze a data set,
    determine the frequency of individual string tokens, and create a
    vocabulary from them. If the vocabulary is capped in size, the most
    frequent tokens will be used to create the vocabulary and all others will
    be treated as out-of-vocabulary (OOV).

    There are two possible output modes for the layer. When `output_mode` is
    `"int"`, input strings are converted to their index in the vocabulary (an
    integer). When `output_mode` is `"multi_hot"`, `"count"`, or `"tf_idf"`,
    input strings are encoded into an array where each dimension corresponds
    to an element in the vocabulary.

    The vocabulary can optionally contain a mask token as well as an OOV token
    (which can optionally occupy multiple indices in the vocabulary, as set
    by `num_oov_indices`). The position of these tokens in the vocabulary is
    fixed. When `output_mode` is `"int"`, the vocabulary will begin with the
    mask token (if set), followed by OOV indices, followed by the rest of the
    vocabulary. When `output_mode` is `"multi_hot"`, `"count"`, or `"tf_idf"`
    the vocabulary will begin with OOV indices and instances of the mask token
    will be dropped.
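
    A small illustrative sketch of that ordering with a mask token (the
    `"[MASK]"` string is an arbitrary choice for this example, not a library
    default):

    >>> layer = tf.keras.layers.StringLookup(
    ...     vocabulary=["a", "b"], mask_token="[MASK]")
    >>> layer.get_vocabulary()
    ['[MASK]', '[UNK]', 'a', 'b']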

    For an overview and full list of preprocessing layers, see the
    preprocessing
    [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers).

    Args:
      max_tokens: Maximum size of the vocabulary for this layer. This should
        only be specified when adapting the vocabulary or when setting
        `pad_to_max_tokens=True`. If None, there is no cap on the size of the
        vocabulary. Note that this size includes the OOV and mask tokens.
        Defaults to `None`.
      num_oov_indices: The number of out-of-vocabulary tokens to use. If this
        value is more than 1, OOV inputs are hashed to determine their OOV
        value. If this value is 0, OOV inputs will cause an error when calling
        the layer. Defaults to `1`.
      mask_token: A token that represents masked inputs. When `output_mode` is
        `"int"`, the token is included in vocabulary and mapped to index 0. In
        other output modes, the token will not appear in the vocabulary and
        instances of the mask token in the input will be dropped. If set to
        None, no mask term will be added. Defaults to `None`.
      oov_token: Only used when `invert` is True. The token to return for OOV
        indices. Defaults to `"[UNK]"`.
      vocabulary: Optional. Either an array of strings or a string path to a
        text file. If passing an array, can pass a tuple, list, 1D numpy
        array, or 1D tensor containing the string vocabulary terms. If passing
        a file path, the file should contain one line per term in the
        vocabulary. If this argument is set, there is no need to `adapt()` the
        layer.
      idf_weights: Only valid when `output_mode` is `"tf_idf"`. A tuple, list,
        1D numpy array, or 1D tensor of the same length as the vocabulary,
        containing the floating point inverse document frequency weights,
        which will be multiplied by per sample term counts for the final
        `tf_idf` weight. If the `vocabulary` argument is set, and
        `output_mode` is `"tf_idf"`, this argument must be supplied.
      invert: Only valid when `output_mode` is `"int"`. If True, this layer
        will map indices to vocabulary items instead of mapping vocabulary
        items to indices. Defaults to `False`.
      output_mode: Specification for the output of the layer. Values can be
        `"int"`, `"one_hot"`, `"multi_hot"`, `"count"`, or `"tf_idf"`
        configuring the layer as follows:
          - `"int"`: Return the raw integer indices of the input tokens.
          - `"one_hot"`: Encodes each individual element in the input into an
            array the same size as the vocabulary, containing a 1 at the
            element index. If the last dimension is size 1, will encode on
            that dimension. If the last dimension is not size 1, will append
            a new dimension for the encoded output.
          - `"multi_hot"`: Encodes each sample in the input into a single
            array the same size as the vocabulary, containing a 1 for each
            vocabulary term present in the sample. Treats the last dimension
            as the sample dimension; if input shape is `(..., sample_length)`,
            output shape will be `(..., num_tokens)`.
          - `"count"`: As `"multi_hot"`, but the int array contains a count of
            the number of times the token at that index appeared in the
            sample.
          - `"tf_idf"`: As `"multi_hot"`, but the TF-IDF algorithm is applied
            to find the value in each token slot.
        For `"int"` output, any shape of input and output is supported. For
        all other output modes, currently only output up to rank 2 is
        supported. Defaults to `"int"`.
      pad_to_max_tokens: Only applicable when `output_mode` is `"multi_hot"`,
        `"count"`, or `"tf_idf"`. If True, the output will have its feature
        axis padded to `max_tokens` even if the number of unique tokens in the
        vocabulary is less than `max_tokens`, resulting in a tensor of shape
        `[batch_size, max_tokens]` regardless of vocabulary size. Defaults to
        `False`.
      sparse: Boolean. Only applicable when `output_mode` is `"multi_hot"`,
        `"count"`, or `"tf_idf"`. If True, returns a `SparseTensor` instead of
        a dense `Tensor`. Defaults to `False`.
      encoding: Optional. The text encoding to use to interpret the input
        strings. Defaults to `"utf-8"`.

    Examples:

    **Creating a lookup layer with a known vocabulary**

    This example creates a lookup layer with a pre-existing vocabulary.

    >>> vocab = ["a", "b", "c", "d"]
    >>> data = tf.constant([["a", "c", "d"], ["d", "z", "b"]])
    >>> layer = tf.keras.layers.StringLookup(vocabulary=vocab)
    >>> layer(data)
    <tf.Tensor: shape=(2, 3), dtype=int64, numpy=
    array([[1, 3, 4],
           [4, 0, 2]])>

    **Creating a lookup layer with an adapted vocabulary**

    This example creates a lookup layer and generates the vocabulary by
    analyzing the dataset.

    >>> data = tf.constant([["a", "c", "d"], ["d", "z", "b"]])
    >>> layer = tf.keras.layers.StringLookup()
    >>> layer.adapt(data)
    >>> layer.get_vocabulary()
    ['[UNK]', 'd', 'z', 'c', 'b', 'a']

    Note that the OOV token `"[UNK]"` has been added to the vocabulary.
    The remaining tokens are sorted by frequency
    (`"d"`, which has 2 occurrences, is first) then by inverse sort order.

    >>> data = tf.constant([["a", "c", "d"], ["d", "z", "b"]])
    >>> layer = tf.keras.layers.StringLookup()
    >>> layer.adapt(data)
    >>> layer(data)
    <tf.Tensor: shape=(2, 3), dtype=int64, numpy=
    array([[5, 3, 1],
           [1, 2, 4]])>

    **Lookups with multiple OOV indices**

    This example demonstrates how to use a lookup layer with multiple OOV
    indices. When a layer is created with more than one OOV index, any OOV
    values are hashed into the number of OOV buckets, distributing OOV values
    in a deterministic fashion across the set.

    >>> vocab = ["a", "b", "c", "d"]
    >>> data = tf.constant([["a", "c", "d"], ["m", "z", "b"]])
    >>> layer = tf.keras.layers.StringLookup(vocabulary=vocab,
    ...                                      num_oov_indices=2)
    >>> layer(data)
    <tf.Tensor: shape=(2, 3), dtype=int64, numpy=
    array([[2, 4, 5],
           [0, 1, 3]])>

    Note that the output for OOV value 'm' is 0, while the output for OOV
    value 'z' is 1. The in-vocab terms have their output index increased by 1
    from earlier examples (a maps to 2, etc.) in order to make space for the
    extra OOV value.

    **One-hot output**

    Configure the layer with `output_mode='one_hot'`. Note that the first
    `num_oov_indices` dimensions in the one_hot encoding represent OOV values.

    >>> vocab = ["a", "b", "c", "d"]
    >>> data = tf.constant(["a", "b", "c", "d", "z"])
    >>> layer = tf.keras.layers.StringLookup(
    ...     vocabulary=vocab, output_mode='one_hot')
    >>> layer(data)
    <tf.Tensor: shape=(5, 5), dtype=float32, numpy=
    array([[0., 1., 0., 0., 0.],
           [0., 0., 1., 0., 0.],
           [0., 0., 0., 1., 0.],
           [0., 0., 0., 0., 1.],
           [1., 0., 0., 0., 0.]], dtype=float32)>

    **Multi-hot output**

    Configure the layer with `output_mode='multi_hot'`. Note that the first
    `num_oov_indices` dimensions in the multi_hot encoding represent OOV
    values.

    >>> vocab = ["a", "b", "c", "d"]
    >>> data = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]])
    >>> layer = tf.keras.layers.StringLookup(
    ...     vocabulary=vocab, output_mode='multi_hot')
    >>> layer(data)
    <tf.Tensor: shape=(2, 5), dtype=float32, numpy=
    array([[0., 1., 0., 1., 1.],
           [1., 0., 1., 0., 1.]], dtype=float32)>

    **Token count output**

    Configure the layer with `output_mode='count'`. As with multi_hot output,
    the first `num_oov_indices` dimensions in the output represent OOV values.

    >>> vocab = ["a", "b", "c", "d"]
    >>> data = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]])
    >>> layer = tf.keras.layers.StringLookup(
    ...     vocabulary=vocab, output_mode='count')
    >>> layer(data)
    <tf.Tensor: shape=(2, 5), dtype=float32, numpy=
    array([[0., 1., 0., 1., 2.],
           [2., 0., 1., 0., 1.]], dtype=float32)>

    **TF-IDF output**

    Configure the layer with `output_mode="tf_idf"`. As with multi_hot output,
    the first `num_oov_indices` dimensions in the output represent OOV values.

    Each token bin will output `token_count * idf_weight`, where the idf
    weights are the inverse document frequency weights per token. These should
    be provided along with the vocabulary. Note that the `idf_weight` for OOV
    values will default to the average of all idf weights passed in.

    >>> vocab = ["a", "b", "c", "d"]
    >>> idf_weights = [0.25, 0.75, 0.6, 0.4]
    >>> data = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]])
    >>> layer = tf.keras.layers.StringLookup(output_mode="tf_idf")
    >>> layer.set_vocabulary(vocab, idf_weights=idf_weights)
    >>> layer(data)
    <tf.Tensor: shape=(2, 5), dtype=float32, numpy=
    array([[0.  , 0.25, 0.  , 0.6 , 0.8 ],
           [1.0 , 0.  , 0.75, 0.  , 0.4 ]], dtype=float32)>

    To specify the idf weights for OOV values, you will need to pass the
    entire vocabulary including the leading OOV token.

    >>> vocab = ["[UNK]", "a", "b", "c", "d"]
    >>> idf_weights = [0.9, 0.25, 0.75, 0.6, 0.4]
    >>> data = tf.constant([["a", "c", "d", "d"], ["d", "z", "b", "z"]])
    >>> layer = tf.keras.layers.StringLookup(output_mode="tf_idf")
    >>> layer.set_vocabulary(vocab, idf_weights=idf_weights)
    >>> layer(data)
    <tf.Tensor: shape=(2, 5), dtype=float32, numpy=
    array([[0.  , 0.25, 0.  , 0.6 , 0.8 ],
           [1.8 , 0.  , 0.75, 0.  , 0.4 ]], dtype=float32)>

    When adapting the layer in `"tf_idf"` mode, each input sample will be
    considered a document, and the IDF weight per token will be calculated as
    `log(1 + num_documents / (1 + token_document_count))`.
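
    For instance, with a hypothetical corpus of 2 documents and a token that
    appears in 1 of them, the learned weight would be
    `log(1 + 2 / (1 + 1)) = log(2)`:

    >>> import math
    >>> round(math.log(1 + 2 / (1 + 1)), 3)
    0.693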

    **Inverse lookup**

    This example demonstrates how to map indices to strings using this layer.
    (You can also learn the vocabulary via `adapt()` and construct the layer
    with `invert=True`, but for simplicity we'll pass the vocab in this
    example.)

    >>> vocab = ["a", "b", "c", "d"]
    >>> data = tf.constant([[1, 3, 4], [4, 0, 2]])
    >>> layer = tf.keras.layers.StringLookup(vocabulary=vocab, invert=True)
    >>> layer(data)
    <tf.Tensor: shape=(2, 3), dtype=string, numpy=
    array([[b'a', b'c', b'd'],
           [b'd', b'[UNK]', b'b']], dtype=object)>

    Note that the first index corresponds to the OOV token by default.

    **Forward and inverse lookup pairs**

    This example demonstrates how to use the vocabulary of a standard lookup
    layer to create an inverse lookup layer.

    >>> vocab = ["a", "b", "c", "d"]
    >>> data = tf.constant([["a", "c", "d"], ["d", "z", "b"]])
    >>> layer = tf.keras.layers.StringLookup(vocabulary=vocab)
    >>> i_layer = tf.keras.layers.StringLookup(vocabulary=vocab, invert=True)
    >>> int_data = layer(data)
    >>> i_layer(int_data)
    <tf.Tensor: shape=(2, 3), dtype=string, numpy=
    array([[b'a', b'c', b'd'],
           [b'd', b'[UNK]', b'b']], dtype=object)>

    In this example, the input value `"z"` resulted in an output of `"[UNK]"`,
    since `"z"` was not in the vocabulary - it got represented as an OOV, and
    all OOV values are returned as `"[UNK]"` in the inverse layer. Also, note
    that for the inverse to work, you must have already set the forward layer
    vocabulary either directly or via `adapt()` before calling
    `get_vocabulary()`.
    """

    def __init__(
        self,
        max_tokens=None,
        num_oov_indices=1,
        mask_token=None,
        oov_token="[UNK]",
        vocabulary=None,
        idf_weights=None,
        encoding="utf-8",
        invert=False,
        output_mode="int",
        sparse=False,
        pad_to_max_tokens=False,
        **kwargs
    ):
        # Legacy versions of the StringLookup layer set layer dtype to string,
        # instead of the output type. If we see this, clear it.
        if "dtype" in kwargs and (
            kwargs["dtype"] == tf.string or kwargs["dtype"] == "string"
        ):
            del kwargs["dtype"]

        self.encoding = encoding

        super().__init__(
            max_tokens=max_tokens,
            num_oov_indices=num_oov_indices,
            mask_token=mask_token,
            oov_token=oov_token,
            vocabulary=vocabulary,
            vocabulary_dtype=tf.string,
            idf_weights=idf_weights,
            invert=invert,
            output_mode=output_mode,
            sparse=sparse,
            pad_to_max_tokens=pad_to_max_tokens,
            **kwargs
        )
        base_preprocessing_layer.keras_kpl_gauge.get_cell("StringLookup").set(
            True
        )

    def get_config(self):
        config = {"encoding": self.encoding}
        base_config = super().get_config()
        # There is only one valid dtype for strings, so we don't expose this.
        del base_config["vocabulary_dtype"]
        return dict(list(base_config.items()) + list(config.items()))
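
    # Since `get_config` returns a plain dict of constructor arguments, a
    # layer can be round-tripped through it. A minimal illustrative sketch,
    # relying on the standard `Layer.from_config` behavior (not code from
    # this module):
    #   layer = StringLookup(vocabulary=["a", "b"])
    #   restored = StringLookup.from_config(layer.get_config())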

    # We override this method solely to generate a docstring.
    def adapt(self, data, batch_size=None, steps=None):
        """Computes a vocabulary of string terms from tokens in a dataset.

        Calling `adapt()` on a `StringLookup` layer is an alternative to
        passing in a precomputed vocabulary on construction via the
        `vocabulary` argument. A `StringLookup` layer should always be either
        adapted over a dataset or supplied with a vocabulary.

        During `adapt()`, the layer will build a vocabulary of all string
        tokens seen in the dataset, sorted by occurrence count, with ties
        broken by sort order of the tokens (high to low). At the end of
        `adapt()`, if `max_tokens` is set, the vocabulary will be truncated to
        `max_tokens` size. For example, adapting a layer with
        `max_tokens=1000` will compute the 1000 most frequent tokens occurring
        in the input dataset. If `output_mode='tf_idf'`, `adapt()` will also
        learn the document frequencies of each token in the input dataset.
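
        As a small illustrative sketch (reusing the toy data from the
        class-level examples, with a hypothetical `max_tokens=4` cap that
        includes the OOV token):

        >>> data = tf.constant([["a", "c", "d"], ["d", "z", "b"]])
        >>> layer = tf.keras.layers.StringLookup(max_tokens=4)
        >>> layer.adapt(data)
        >>> layer.get_vocabulary()
        ['[UNK]', 'd', 'z', 'c']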

        In order to make `StringLookup` efficient in any distribution context,
        the vocabulary is kept static with respect to any compiled `tf.Graph`s
        that call the layer. As a consequence, if the layer is adapted a
        second time, any models using the layer should be re-compiled. For
        more information see
        `tf.keras.layers.experimental.preprocessing.PreprocessingLayer.adapt`.

        `adapt()` is meant only as a single machine utility to compute layer
        state. To analyze a dataset that cannot fit on a single machine, see
        [TensorFlow Transform](
        https://www.tensorflow.org/tfx/transform/get_started) for a
        multi-machine, map-reduce solution.

        Arguments:
          data: The data to train on. It can be passed either as a
            `tf.data.Dataset`, or as a numpy array.
          batch_size: Integer or `None`. Number of samples per state update.
            If unspecified, `batch_size` will default to 32. Do not specify
            the `batch_size` if your data is in the form of datasets,
            generators, or `keras.utils.Sequence` instances (since they
            generate batches).
          steps: Integer or `None`. Total number of steps (batches of
            samples). When training with input tensors such as TensorFlow
            data tensors, the default `None` is equal to the number of
            samples in your dataset divided by the batch size, or 1 if that
            cannot be determined. If `data` is a `tf.data.Dataset`, and
            `steps` is `None`, the epoch will run until the input dataset is
            exhausted. When passing an infinitely repeating dataset, you must
            specify the `steps` argument. This argument is not supported with
            array inputs.
        """
        super().adapt(data, batch_size=batch_size, steps=steps)

    # Overridden methods from IndexLookup.
    def _tensor_vocab_to_numpy(self, vocabulary):
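        # Decode the vocabulary tensor's byte strings into Python `str`
        # values using the layer's configured `encoding`.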
        vocabulary = vocabulary.numpy()
        return np.array(
            [tf.compat.as_text(x, self.encoding) for x in vocabulary]
        )