# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for text input preprocessing.

Deprecated: `tf.keras.preprocessing.text` APIs are not recommended for new code.
Prefer `tf.keras.utils.text_dataset_from_directory` and
`tf.keras.layers.TextVectorization` which provide a more efficient approach
for preprocessing text input. For an introduction to these APIs, see
the [text loading tutorial]
(https://www.tensorflow.org/tutorials/load_data/text)
and [preprocessing layer guide]
(https://www.tensorflow.org/guide/keras/preprocessing_layers).
"""

import collections
import hashlib
import json
import warnings

import numpy as np

# isort: off
from tensorflow.python.util.tf_export import keras_export


@keras_export("keras.preprocessing.text.text_to_word_sequence")
def text_to_word_sequence(
    input_text,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
):
    r"""Converts a text to a sequence of words (or tokens).

    Deprecated: `tf.keras.preprocessing.text.text_to_word_sequence` does not
    operate on tensors and is not recommended for new code. Prefer
    `tf.strings.regex_replace` and `tf.strings.split` which provide equivalent
    functionality and accept `tf.Tensor` input. For an overview of text
    handling in TensorFlow, see the [text loading tutorial]
    (https://www.tensorflow.org/tutorials/load_data/text).

    This function transforms a string of text into a list of words
    while ignoring `filters`, which includes punctuation by default.

    >>> sample_text = 'This is a sample sentence.'
    >>> tf.keras.preprocessing.text.text_to_word_sequence(sample_text)
    ['this', 'is', 'a', 'sample', 'sentence']

    Args:
        input_text: Input text (string).
        filters: list (or concatenation) of characters to filter out, such as
            punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n'``,
            includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to convert the input to lowercase.
        split: str. Separator for word splitting.

    Returns:
        A list of words (or tokens).
    """
    if lower:
        input_text = input_text.lower()

    translate_dict = {c: split for c in filters}
    translate_map = str.maketrans(translate_dict)
    input_text = input_text.translate(translate_map)

    seq = input_text.split(split)
    return [i for i in seq if i]
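

# Illustrative usage sketch added for this annotated copy; it is not part of
# the upstream Keras source. It shows how `text_to_word_sequence` lowercases,
# strips the default punctuation filters, and splits on whitespace.
def _demo_text_to_word_sequence():
    sample = "Hello, world! Keras-style tokenization."
    # Characters in `filters` are replaced by `split`, then empty tokens are
    # dropped, giving:
    # ['hello', 'world', 'keras', 'style', 'tokenization']
    return text_to_word_sequence(sample)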


@keras_export("keras.preprocessing.text.one_hot")
def one_hot(
    input_text,
    n,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
    analyzer=None,
):
    r"""One-hot encodes a text into a list of word indexes of size `n`.

    Deprecated: `tf.keras.preprocessing.text.one_hot` does not operate on
    tensors and is not recommended for new code. Prefer
    `tf.keras.layers.Hashing` with `output_mode='one_hot'` which provides
    equivalent functionality through a layer which accepts `tf.Tensor` input.
    See the [preprocessing layer guide]
    (https://www.tensorflow.org/guide/keras/preprocessing_layers) for an
    overview of preprocessing layers.

    This function receives as input a string of text and returns a
    list of encoded integers each corresponding to a word (or token)
    in the given input string.

    Args:
        input_text: Input text (string).
        n: int. Size of vocabulary.
        filters: list (or concatenation) of characters to filter out, such as
            punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n'``,
            includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to set the text to lowercase.
        split: str. Separator for word splitting.
        analyzer: function. Custom analyzer to split the text.

    Returns:
        List of integers in `[1, n]`. Each integer encodes a word
        (uniqueness not guaranteed).
    """
    return hashing_trick(
        input_text,
        n,
        hash_function=hash,
        filters=filters,
        lower=lower,
        split=split,
        analyzer=analyzer,
    )
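

# Illustrative sketch added for this annotated copy; not part of the upstream
# Keras source. `one_hot` hashes each word into the range `[1, n]` with the
# builtin (per-process, unstable) `hash`, so only length and range are checked.
def _demo_one_hot():
    encoded = one_hot("the cat sat on the mat", n=50)
    # Six input words -> six indices; repeated words hash to the same index
    # within one run, but indices differ across Python processes.
    assert len(encoded) == 6
    assert all(1 <= index <= 50 for index in encoded)
    return encoded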


@keras_export("keras.preprocessing.text.hashing_trick")
def hashing_trick(
    text,
    n,
    hash_function=None,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
    analyzer=None,
):
    r"""Converts a text to a sequence of indexes in a fixed-size hashing space.

    Deprecated: `tf.keras.preprocessing.text.hashing_trick` does not operate on
    tensors and is not recommended for new code. Prefer
    `tf.keras.layers.Hashing` which provides equivalent functionality through a
    layer which accepts `tf.Tensor` input. See the [preprocessing layer guide](
    https://www.tensorflow.org/guide/keras/preprocessing_layers) for an
    overview of preprocessing layers.

    Args:
        text: Input text (string).
        n: Dimension of the hashing space.
        hash_function: defaults to the Python `hash` function; can be 'md5' or
            any function that takes a string as input and returns an int.
            Note that `hash` is not a stable hashing function, so
            it is not consistent across different runs, while 'md5'
            is a stable hashing function.
        filters: list (or concatenation) of characters to filter out, such as
            punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n'``,
            includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to set the text to lowercase.
        split: str. Separator for word splitting.
        analyzer: function. Custom analyzer to split the text.

    Returns:
        A list of integer word indices (uniqueness not guaranteed).
        `0` is a reserved index that won't be assigned to any word.
        Two or more words may be assigned to the same index, due to possible
        collisions by the hashing function.
        The [probability](
        https://en.wikipedia.org/wiki/Birthday_problem#Probability_table)
        of a collision depends on the dimension of the hashing space and
        the number of distinct objects.
    """
    if hash_function is None:
        hash_function = hash
    elif hash_function == "md5":
        hash_function = lambda w: int(hashlib.md5(w.encode()).hexdigest(), 16)

    if analyzer is None:
        seq = text_to_word_sequence(
            text, filters=filters, lower=lower, split=split
        )
    else:
        seq = analyzer(text)

    return [(hash_function(w) % (n - 1) + 1) for w in seq]
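

# Illustrative sketch added for this annotated copy; not part of the upstream
# Keras source. Passing hash_function="md5" makes the bucket assignment
# reproducible across runs, unlike the builtin `hash` used by default.
def _demo_hashing_trick():
    indices = hashing_trick(
        "the cat sat on the mat", n=10, hash_function="md5"
    )
    # Indices fall in [1, n - 1]; index 0 is reserved, and repeated words
    # ("the" at positions 0 and 4) always land in the same bucket.
    assert indices[0] == indices[4]
    assert all(1 <= index <= 9 for index in indices)
    return indices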


@keras_export("keras.preprocessing.text.Tokenizer")
class Tokenizer(object):
    """Text tokenization utility class.

    Deprecated: `tf.keras.preprocessing.text.Tokenizer` does not operate on
    tensors and is not recommended for new code. Prefer
    `tf.keras.layers.TextVectorization` which provides equivalent functionality
    through a layer which accepts `tf.Tensor` input. See the
    [text loading tutorial](https://www.tensorflow.org/tutorials/load_data/text)
    for an overview of the layer and text handling in TensorFlow.

    This class allows vectorizing a text corpus, by turning each
    text into either a sequence of integers (each integer being the index
    of a token in a dictionary) or into a vector where the coefficient
    for each token could be binary, based on word count, based on tf-idf...

    By default, all punctuation is removed, turning the texts into
    space-separated sequences of words
    (words may include the `'` character). These sequences are then
    split into lists of tokens. They will then be indexed or vectorized.

    `0` is a reserved index that won't be assigned to any word.

    Args:
        num_words: the maximum number of words to keep, based
            on word frequency. Only the most common `num_words-1` words will
            be kept.
        filters: a string where each element is a character that will be
            filtered from the texts. The default is all punctuation, plus
            tabs and line breaks, minus the `'` character.
        lower: boolean. Whether to convert the texts to lowercase.
        split: str. Separator for word splitting.
        char_level: if True, every character will be treated as a token.
        oov_token: if given, it will be added to `word_index` and used to
            replace out-of-vocabulary words during `texts_to_sequences` calls.
        analyzer: function. Custom analyzer to split the text.
            The default analyzer is `text_to_word_sequence`.
    """

    def __init__(
        self,
        num_words=None,
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        lower=True,
        split=" ",
        char_level=False,
        oov_token=None,
        analyzer=None,
        **kwargs
    ):
        # Legacy support
        if "nb_words" in kwargs:
            warnings.warn(
                "The `nb_words` argument in `Tokenizer` "
                "has been renamed `num_words`."
            )
            num_words = kwargs.pop("nb_words")
        document_count = kwargs.pop("document_count", 0)
        if kwargs:
            raise TypeError("Unrecognized keyword arguments: " + str(kwargs))

        self.word_counts = collections.OrderedDict()
        self.word_docs = collections.defaultdict(int)
        self.filters = filters
        self.split = split
        self.lower = lower
        self.num_words = num_words
        self.document_count = document_count
        self.char_level = char_level
        self.oov_token = oov_token
        self.index_docs = collections.defaultdict(int)
        self.word_index = {}
        self.index_word = {}
        self.analyzer = analyzer

    def fit_on_texts(self, texts):
        """Updates internal vocabulary based on a list of texts.

        In the case where `texts` contains lists,
        we assume each entry of the lists to be a token.

        Required before using `texts_to_sequences` or `texts_to_matrix`.

        Args:
            texts: can be a list of strings,
                a generator of strings (for memory-efficiency),
                or a list of lists of strings.
        """
        for text in texts:
            self.document_count += 1
            if self.char_level or isinstance(text, list):
                if self.lower:
                    if isinstance(text, list):
                        text = [text_elem.lower() for text_elem in text]
                    else:
                        text = text.lower()
                seq = text
            else:
                if self.analyzer is None:
                    seq = text_to_word_sequence(
                        text,
                        filters=self.filters,
                        lower=self.lower,
                        split=self.split,
                    )
                else:
                    seq = self.analyzer(text)
            for w in seq:
                if w in self.word_counts:
                    self.word_counts[w] += 1
                else:
                    self.word_counts[w] = 1
            for w in set(seq):
                # In how many documents each word occurs
                self.word_docs[w] += 1

        wcounts = list(self.word_counts.items())
        wcounts.sort(key=lambda x: x[1], reverse=True)
        # forcing the oov_token to index 1 if it exists
        if self.oov_token is None:
            sorted_voc = []
        else:
            sorted_voc = [self.oov_token]
        sorted_voc.extend(wc[0] for wc in wcounts)

        # note that index 0 is reserved, never assigned to an existing word
        self.word_index = dict(
            zip(sorted_voc, list(range(1, len(sorted_voc) + 1)))
        )

        self.index_word = {c: w for w, c in self.word_index.items()}

        for w, c in list(self.word_docs.items()):
            self.index_docs[self.word_index[w]] = c
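
    # Added note for this annotated copy (not in the upstream source): after
    # `fit_on_texts(["a b a", "a c"])` with no `oov_token`, `word_counts` is
    # {'a': 3, 'b': 1, 'c': 1} and `word_index` maps words to indices by
    # descending frequency, e.g. {'a': 1, 'b': 2, 'c': 3}. Index 0 stays
    # reserved, and when an `oov_token` is set it is forced to index 1.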

    def fit_on_sequences(self, sequences):
        """Updates internal vocabulary based on a list of sequences.

        Required before using `sequences_to_matrix`
        (if `fit_on_texts` was never called).

        Args:
            sequences: A list of sequences.
                A "sequence" is a list of integer word indices.
        """
        self.document_count += len(sequences)
        for seq in sequences:
            seq = set(seq)
            for i in seq:
                self.index_docs[i] += 1

    def texts_to_sequences(self, texts):
        """Transforms each text in `texts` to a sequence of integers.

        Only the top `num_words-1` most frequent words will be taken into
        account. Only words known by the tokenizer will be taken into account.

        Args:
            texts: A list of texts (strings).

        Returns:
            A list of sequences.
        """
        return list(self.texts_to_sequences_generator(texts))

    def texts_to_sequences_generator(self, texts):
        """Transforms each text in `texts` to a sequence of integers.

        Each item in `texts` can also be a list,
        in which case we assume each item of that list to be a token.

        Only the top `num_words-1` most frequent words will be taken into
        account. Only words known by the tokenizer will be taken into account.

        Args:
            texts: A list of texts (strings).

        Yields:
            Yields individual sequences.
        """
        num_words = self.num_words
        oov_token_index = self.word_index.get(self.oov_token)
        for text in texts:
            if self.char_level or isinstance(text, list):
                if self.lower:
                    if isinstance(text, list):
                        text = [text_elem.lower() for text_elem in text]
                    else:
                        text = text.lower()
                seq = text
            else:
                if self.analyzer is None:
                    seq = text_to_word_sequence(
                        text,
                        filters=self.filters,
                        lower=self.lower,
                        split=self.split,
                    )
                else:
                    seq = self.analyzer(text)
            vect = []
            for w in seq:
                i = self.word_index.get(w)
                if i is not None:
                    if num_words and i >= num_words:
                        if oov_token_index is not None:
                            vect.append(oov_token_index)
                    else:
                        vect.append(i)
                elif self.oov_token is not None:
                    vect.append(oov_token_index)
            yield vect
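
    # Added note for this annotated copy (not in the upstream source): words
    # whose index is >= `num_words` are treated as out of vocabulary here.
    # They are replaced by the `oov_token` index when one was configured at
    # construction time and silently dropped otherwise; completely unknown
    # words follow the same rule.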

    def sequences_to_texts(self, sequences):
        """Transforms each sequence into a text (string).

        Only the top `num_words-1` most frequent words will be taken into
        account. Only words known by the tokenizer will be taken into account.

        Args:
            sequences: A list of sequences (lists of integers).

        Returns:
            A list of texts (strings).
        """
        return list(self.sequences_to_texts_generator(sequences))

    def sequences_to_texts_generator(self, sequences):
        """Transforms each sequence in `sequences` to a text (string).

        Each sequence has to be a list of integers.
        In other words, `sequences` should be a list of sequences.

        Only the top `num_words-1` most frequent words will be taken into
        account. Only words known by the tokenizer will be taken into account.

        Args:
            sequences: A list of sequences.

        Yields:
            Yields individual texts.
        """
        num_words = self.num_words
        oov_token_index = self.word_index.get(self.oov_token)
        for seq in sequences:
            vect = []
            for num in seq:
                word = self.index_word.get(num)
                if word is not None:
                    if num_words and num >= num_words:
                        if oov_token_index is not None:
                            vect.append(self.index_word[oov_token_index])
                    else:
                        vect.append(word)
                elif self.oov_token is not None:
                    vect.append(self.index_word[oov_token_index])
            vect = " ".join(vect)
            yield vect

    def texts_to_matrix(self, texts, mode="binary"):
        """Converts a list of texts to a Numpy matrix.

        Args:
            texts: list of strings.
            mode: one of "binary", "count", "tfidf", "freq".

        Returns:
            A Numpy matrix.
        """
        sequences = self.texts_to_sequences(texts)
        return self.sequences_to_matrix(sequences, mode=mode)

    def sequences_to_matrix(self, sequences, mode="binary"):
        """Converts a list of sequences into a Numpy matrix.

        Args:
            sequences: list of sequences
                (a sequence is a list of integer word indices).
            mode: one of "binary", "count", "tfidf", "freq".

        Returns:
            A Numpy matrix.

        Raises:
            ValueError: In case of invalid `mode` argument,
                or if the Tokenizer has not yet been fit on sample data.
        """
        if not self.num_words:
            if self.word_index:
                num_words = len(self.word_index) + 1
            else:
                raise ValueError(
                    "Specify a dimension (`num_words` argument), "
                    "or fit on some text data first."
                )
        else:
            num_words = self.num_words

        if mode == "tfidf" and not self.document_count:
            raise ValueError(
                "Fit the Tokenizer on some data before using tfidf mode."
            )

        x = np.zeros((len(sequences), num_words))
        for i, seq in enumerate(sequences):
            if not seq:
                continue
            counts = collections.defaultdict(int)
            for j in seq:
                if j >= num_words:
                    continue
                counts[j] += 1
            for j, c in list(counts.items()):
                if mode == "count":
                    x[i][j] = c
                elif mode == "freq":
                    x[i][j] = c / len(seq)
                elif mode == "binary":
                    x[i][j] = 1
                elif mode == "tfidf":
                    # Use weighting scheme 2 in
                    # https://en.wikipedia.org/wiki/Tf%E2%80%93idf
                    tf = 1 + np.log(c)
                    idf = np.log(
                        1
                        + self.document_count / (1 + self.index_docs.get(j, 0))
                    )
                    x[i][j] = tf * idf
                else:
                    raise ValueError("Unknown vectorization mode:", mode)
        return x
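
    # Added note for this annotated copy (not in the upstream source): for a
    # tokenizer with `num_words=4`, `sequences_to_matrix([[1, 1, 2]])` yields
    # row [0., 2., 1., 0.] in "count" mode, [0., 1., 1., 0.] in "binary" mode,
    # and [0., 2/3, 1/3, 0.] in "freq" mode. In "tfidf" mode each cell is
    # (1 + log(count)) * log(1 + document_count / (1 + index_docs[j])).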

    def get_config(self):
        """Returns the tokenizer configuration as a Python dictionary.

        The word count dictionaries used by the tokenizer get serialized
        into plain JSON, so that the configuration can be read by other
        projects.

        Returns:
            A Python dictionary with the tokenizer configuration.
        """
        json_word_counts = json.dumps(self.word_counts)
        json_word_docs = json.dumps(self.word_docs)
        json_index_docs = json.dumps(self.index_docs)
        json_word_index = json.dumps(self.word_index)
        json_index_word = json.dumps(self.index_word)

        return {
            "num_words": self.num_words,
            "filters": self.filters,
            "lower": self.lower,
            "split": self.split,
            "char_level": self.char_level,
            "oov_token": self.oov_token,
            "document_count": self.document_count,
            "word_counts": json_word_counts,
            "word_docs": json_word_docs,
            "index_docs": json_index_docs,
            "index_word": json_index_word,
            "word_index": json_word_index,
        }

    def to_json(self, **kwargs):
        """Returns a JSON string containing the tokenizer configuration.

        To load a tokenizer from a JSON string, use
        `keras.preprocessing.text.tokenizer_from_json(json_string)`.

        Args:
            **kwargs: Additional keyword arguments
                to be passed to `json.dumps()`.

        Returns:
            A JSON string containing the tokenizer configuration.
        """
        config = self.get_config()
        tokenizer_config = {
            "class_name": self.__class__.__name__,
            "config": config,
        }
        return json.dumps(tokenizer_config, **kwargs)
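

# Illustrative end-to-end sketch added for this annotated copy; not part of
# the upstream Keras source. It walks through the typical Tokenizer workflow:
# fit on a corpus, convert texts to integer sequences, then to a matrix.
def _demo_tokenizer_workflow():
    corpus = ["the cat sat on the mat", "the dog ate my homework"]
    tokenizer = Tokenizer(num_words=10, oov_token="<unk>")
    tokenizer.fit_on_texts(corpus)

    # "<unk>" is forced to index 1; remaining indices follow word frequency.
    sequences = tokenizer.texts_to_sequences(["the cat ate the dog"])
    matrix = tokenizer.texts_to_matrix(corpus, mode="binary")

    # One row per input text, one column per retained word index.
    assert matrix.shape == (2, 10)
    return sequences, matrix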


@keras_export("keras.preprocessing.text.tokenizer_from_json")
def tokenizer_from_json(json_string):
    """Parses a JSON tokenizer configuration and returns a tokenizer instance.

    Deprecated: `tf.keras.preprocessing.text.Tokenizer` does not operate on
    tensors and is not recommended for new code. Prefer
    `tf.keras.layers.TextVectorization` which provides equivalent functionality
    through a layer which accepts `tf.Tensor` input. See the
    [text loading tutorial](https://www.tensorflow.org/tutorials/load_data/text)
    for an overview of the layer and text handling in TensorFlow.

    Args:
        json_string: JSON string encoding a tokenizer configuration.

    Returns:
        A Keras Tokenizer instance.
    """
    tokenizer_config = json.loads(json_string)
    config = tokenizer_config.get("config")

    word_counts = json.loads(config.pop("word_counts"))
    word_docs = json.loads(config.pop("word_docs"))
    index_docs = json.loads(config.pop("index_docs"))
    # Integer indices get converted to strings with json.dumps()
    index_docs = {int(k): v for k, v in index_docs.items()}
    index_word = json.loads(config.pop("index_word"))
    index_word = {int(k): v for k, v in index_word.items()}
    word_index = json.loads(config.pop("word_index"))

    tokenizer = Tokenizer(**config)
    tokenizer.word_counts = word_counts
    tokenizer.word_docs = word_docs
    tokenizer.index_docs = index_docs
    tokenizer.word_index = word_index
    tokenizer.index_word = index_word
    return tokenizer
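

# Illustrative round-trip sketch added for this annotated copy; not part of
# the upstream Keras source. It shows that a fitted Tokenizer survives
# serialization through `to_json` and `tokenizer_from_json`.
def _demo_tokenizer_json_round_trip():
    tokenizer = Tokenizer(num_words=10)
    tokenizer.fit_on_texts(["the cat sat on the mat"])

    restored = tokenizer_from_json(tokenizer.to_json())

    # The restored tokenizer reproduces the original word index and therefore
    # the same integer sequences.
    assert restored.word_index == tokenizer.word_index
    original = tokenizer.texts_to_sequences(["the cat"])
    assert restored.texts_to_sequences(["the cat"]) == original
    return restored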