# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for text input preprocessing.

Deprecated: `tf.keras.preprocessing.text` APIs are not recommended for new
code. Prefer `tf.keras.utils.text_dataset_from_directory` and
`tf.keras.layers.TextVectorization`, which provide a more efficient approach
for preprocessing text input. For an introduction to these APIs, see the
[text loading tutorial](https://www.tensorflow.org/tutorials/load_data/text)
and the [preprocessing layer guide](
https://www.tensorflow.org/guide/keras/preprocessing_layers).
"""


import collections
import hashlib
import json
import warnings

import numpy as np

# isort: off
from tensorflow.python.util.tf_export import keras_export


@keras_export("keras.preprocessing.text.text_to_word_sequence")
def text_to_word_sequence(
    input_text,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
):
    r"""Converts a text to a sequence of words (or tokens).

    Deprecated: `tf.keras.preprocessing.text.text_to_word_sequence` does not
    operate on tensors and is not recommended for new code. Prefer
    `tf.strings.regex_replace` and `tf.strings.split` which provide
    equivalent functionality and accept `tf.Tensor` input. For an overview
    of text handling in TensorFlow, see the [text loading tutorial](
    https://www.tensorflow.org/tutorials/load_data/text).

    This function transforms a string of text into a list of words
    while ignoring `filters`, which include punctuation by default.

    >>> sample_text = 'This is a sample sentence.'
    >>> tf.keras.preprocessing.text.text_to_word_sequence(sample_text)
    ['this', 'is', 'a', 'sample', 'sentence']
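
    The `lower` and `split` arguments control case folding and tokenization;
    for example, with case folding disabled:

    >>> tf.keras.preprocessing.text.text_to_word_sequence(
    ...     'Hello, world!', lower=False)
    ['Hello', 'world']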

    Args:
        input_text: Input text (string).
        filters: list (or concatenation) of characters to filter out, such as
            punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n'``,
            includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to convert the input to lowercase.
        split: str. Separator for word splitting.

    Returns:
        A list of words (or tokens).
    """
    if lower:
        input_text = input_text.lower()

    # Replace every filtered character with the split token, then split and
    # drop the empty strings produced by consecutive separators.
    translate_dict = {c: split for c in filters}
    translate_map = str.maketrans(translate_dict)
    input_text = input_text.translate(translate_map)

    seq = input_text.split(split)
    return [i for i in seq if i]


@keras_export("keras.preprocessing.text.one_hot")
def one_hot(
    input_text,
    n,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
    analyzer=None,
):
    r"""One-hot encodes a text into a list of word indexes of size `n`.

    Deprecated: `tf.keras.preprocessing.text.one_hot` does not operate on
    tensors and is not recommended for new code. Prefer
    `tf.keras.layers.Hashing` with `output_mode='one_hot'` which provides
    equivalent functionality through a layer which accepts `tf.Tensor` input.
    See the [preprocessing layer guide](
    https://www.tensorflow.org/guide/keras/preprocessing_layers) for an
    overview of preprocessing layers.

    This function receives as input a string of text and returns a
    list of encoded integers each corresponding to a word (or token)
    in the given input string.

    Args:
        input_text: Input text (string).
        n: int. Size of vocabulary.
        filters: list (or concatenation) of characters to filter out, such as
            punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n'``,
            includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to set the text to lowercase.
        split: str. Separator for word splitting.
        analyzer: function. Custom analyzer to split the text.

    Returns:
        List of integers in `[1, n)`. Each integer encodes a word
        (unicity non-guaranteed).
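
    Example (an illustrative sketch; the exact indices vary between runs
    because Python's built-in `hash` is randomized per process):

    >>> tf.keras.preprocessing.text.one_hot(
    ...     'The cat sat on the mat', 50)  # doctest: +SKIP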

123 """ 

124 return hashing_trick( 

125 input_text, 

126 n, 

127 hash_function=hash, 

128 filters=filters, 

129 lower=lower, 

130 split=split, 

131 analyzer=analyzer, 

132 ) 

133 

134 

@keras_export("keras.preprocessing.text.hashing_trick")
def hashing_trick(
    text,
    n,
    hash_function=None,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
    analyzer=None,
):
    r"""Converts a text to a sequence of indexes in a fixed-size hashing space.

    Deprecated: `tf.keras.preprocessing.text.hashing_trick` does not operate
    on tensors and is not recommended for new code. Prefer
    `tf.keras.layers.Hashing` which provides equivalent functionality through
    a layer which accepts `tf.Tensor` input. See the [preprocessing layer
    guide](https://www.tensorflow.org/guide/keras/preprocessing_layers) for
    an overview of preprocessing layers.

    Args:
        text: Input text (string).
        n: Dimension of the hashing space.
        hash_function: defaults to the Python `hash` function; can be 'md5'
            or any function that takes a string as input and returns an
            integer. Note that `hash` is not a stable hashing function, so
            it is not consistent across different runs, while 'md5' is a
            stable hashing function.
        filters: list (or concatenation) of characters to filter out, such as
            punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n'``,
            includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to set the text to lowercase.
        split: str. Separator for word splitting.
        analyzer: function. Custom analyzer to split the text.

    Returns:
        A list of integer word indices (unicity non-guaranteed).
        `0` is a reserved index that won't be assigned to any word.
        Two or more words may be assigned to the same index, due to possible
        collisions by the hashing function.
        The [probability](
        https://en.wikipedia.org/wiki/Birthday_problem#Probability_table)
        of a collision is in relation to the dimension of the hashing space
        and the number of distinct objects.
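
    Example (a minimal sketch; `hash_function='md5'` makes the indices
    stable across runs, though the specific values still depend on `n`):

    >>> tf.keras.preprocessing.text.hashing_trick(
    ...     'The cat sat on the mat', n=50,
    ...     hash_function='md5')  # doctest: +SKIP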

178 """ 

179 if hash_function is None: 

180 hash_function = hash 

181 elif hash_function == "md5": 

182 hash_function = lambda w: int(hashlib.md5(w.encode()).hexdigest(), 16) 

183 

184 if analyzer is None: 

185 seq = text_to_word_sequence( 

186 text, filters=filters, lower=lower, split=split 

187 ) 

188 else: 

189 seq = analyzer(text) 

190 

191 return [(hash_function(w) % (n - 1) + 1) for w in seq] 

192 

193 

@keras_export("keras.preprocessing.text.Tokenizer")
class Tokenizer(object):
    """Text tokenization utility class.

    Deprecated: `tf.keras.preprocessing.text.Tokenizer` does not operate on
    tensors and is not recommended for new code. Prefer
    `tf.keras.layers.TextVectorization` which provides equivalent
    functionality through a layer which accepts `tf.Tensor` input. See the
    [text loading tutorial](
    https://www.tensorflow.org/tutorials/load_data/text)
    for an overview of the layer and text handling in TensorFlow.

    This class allows vectorizing a text corpus by turning each text into
    either a sequence of integers (each integer being the index of a token
    in a dictionary) or into a vector where the coefficient for each token
    could be binary, based on word count, based on tf-idf...

    By default, all punctuation is removed, turning the texts into
    space-separated sequences of words
    (words may include the `'` character). These sequences are then
    split into lists of tokens. They will then be indexed or vectorized.

    `0` is a reserved index that won't be assigned to any word.
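
    Example (a small illustrative sketch; index assignment follows word
    frequency, most frequent word first):

    >>> tokenizer = Tokenizer()
    >>> tokenizer.fit_on_texts(['The cat sat on the mat.'])
    >>> tokenizer.texts_to_sequences(['The cat sat on the mat.'])
    [[1, 2, 3, 4, 1, 5]]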

    Args:
        num_words: the maximum number of words to keep, based
            on word frequency. Only the most common `num_words - 1` words
            will be kept.
        filters: a string where each element is a character that will be
            filtered from the texts. The default is all punctuation, plus
            tabs and line breaks, minus the `'` character.
        lower: boolean. Whether to convert the texts to lowercase.
        split: str. Separator for word splitting.
        char_level: if True, every character will be treated as a token.
        oov_token: if given, it will be added to `word_index` and used to
            replace out-of-vocabulary words during `texts_to_sequences`
            calls.
        analyzer: function. Custom analyzer to split the text.
            The default analyzer is `text_to_word_sequence`.
    """

    def __init__(
        self,
        num_words=None,
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        lower=True,
        split=" ",
        char_level=False,
        oov_token=None,
        analyzer=None,
        **kwargs
    ):
        # Legacy support
        if "nb_words" in kwargs:
            warnings.warn(
                "The `nb_words` argument in `Tokenizer` "
                "has been renamed `num_words`."
            )
            num_words = kwargs.pop("nb_words")
        document_count = kwargs.pop("document_count", 0)
        if kwargs:
            raise TypeError("Unrecognized keyword arguments: " + str(kwargs))

        self.word_counts = collections.OrderedDict()
        self.word_docs = collections.defaultdict(int)
        self.filters = filters
        self.split = split
        self.lower = lower
        self.num_words = num_words
        self.document_count = document_count
        self.char_level = char_level
        self.oov_token = oov_token
        self.index_docs = collections.defaultdict(int)
        self.word_index = {}
        self.index_word = {}
        self.analyzer = analyzer

    def fit_on_texts(self, texts):
        """Updates internal vocabulary based on a list of texts.

        In the case where texts contains lists,
        we assume each entry of the lists to be a token.

        Required before using `texts_to_sequences` or `texts_to_matrix`.

        Args:
            texts: can be a list of strings,
                a generator of strings (for memory-efficiency),
                or a list of lists of strings.
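
        Example (illustrative; indices are assigned by descending word
        frequency):

        >>> tokenizer = Tokenizer()
        >>> tokenizer.fit_on_texts(['The cat sat on the mat.'])
        >>> tokenizer.word_index
        {'the': 1, 'cat': 2, 'sat': 3, 'on': 4, 'mat': 5}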

281 """ 

282 for text in texts: 

283 self.document_count += 1 

284 if self.char_level or isinstance(text, list): 

285 if self.lower: 

286 if isinstance(text, list): 

287 text = [text_elem.lower() for text_elem in text] 

288 else: 

289 text = text.lower() 

290 seq = text 

291 else: 

292 if self.analyzer is None: 

293 seq = text_to_word_sequence( 

294 text, 

295 filters=self.filters, 

296 lower=self.lower, 

297 split=self.split, 

298 ) 

299 else: 

300 seq = self.analyzer(text) 

301 for w in seq: 

302 if w in self.word_counts: 

303 self.word_counts[w] += 1 

304 else: 

305 self.word_counts[w] = 1 

306 for w in set(seq): 

307 # In how many documents each word occurs 

308 self.word_docs[w] += 1 

309 

310 wcounts = list(self.word_counts.items()) 

311 wcounts.sort(key=lambda x: x[1], reverse=True) 

312 # forcing the oov_token to index 1 if it exists 

313 if self.oov_token is None: 

314 sorted_voc = [] 

315 else: 

316 sorted_voc = [self.oov_token] 

317 sorted_voc.extend(wc[0] for wc in wcounts) 

318 

319 # note that index 0 is reserved, never assigned to an existing word 

320 self.word_index = dict( 

321 zip(sorted_voc, list(range(1, len(sorted_voc) + 1))) 

322 ) 

323 

324 self.index_word = {c: w for w, c in self.word_index.items()} 

325 

326 for w, c in list(self.word_docs.items()): 

327 self.index_docs[self.word_index[w]] = c 

328 

    def fit_on_sequences(self, sequences):
        """Updates internal vocabulary based on a list of sequences.

        Required before using `sequences_to_matrix`
        (if `fit_on_texts` was never called).

        Args:
            sequences: A list of sequences.
                A "sequence" is a list of integer word indices.
        """
        self.document_count += len(sequences)
        for seq in sequences:
            seq = set(seq)
            for i in seq:
                self.index_docs[i] += 1

    def texts_to_sequences(self, texts):
        """Transforms each text in texts to a sequence of integers.

        Only the top `num_words - 1` most frequent words will be taken into
        account. Only words known by the tokenizer will be taken into
        account.

        Args:
            texts: A list of texts (strings).

        Returns:
            A list of sequences.
        """
        return list(self.texts_to_sequences_generator(texts))

    def texts_to_sequences_generator(self, texts):
        """Transforms each text in `texts` to a sequence of integers.

        Each item in texts can also be a list,
        in which case we assume each item of that list to be a token.

        Only the top `num_words - 1` most frequent words will be taken into
        account. Only words known by the tokenizer will be taken into
        account.

        Args:
            texts: A list of texts (strings).

        Yields:
            Yields individual sequences.
        """
        num_words = self.num_words
        oov_token_index = self.word_index.get(self.oov_token)
        for text in texts:
            if self.char_level or isinstance(text, list):
                if self.lower:
                    if isinstance(text, list):
                        text = [text_elem.lower() for text_elem in text]
                    else:
                        text = text.lower()
                seq = text
            else:
                if self.analyzer is None:
                    seq = text_to_word_sequence(
                        text,
                        filters=self.filters,
                        lower=self.lower,
                        split=self.split,
                    )
                else:
                    seq = self.analyzer(text)
            vect = []
            for w in seq:
                i = self.word_index.get(w)
                if i is not None:
                    if num_words and i >= num_words:
                        # Known word, but outside the `num_words` cap: map it
                        # to the OOV index if one exists, else drop it.
                        if oov_token_index is not None:
                            vect.append(oov_token_index)
                    else:
                        vect.append(i)
                elif self.oov_token is not None:
                    vect.append(oov_token_index)
            yield vect

    def sequences_to_texts(self, sequences):
        """Transforms each sequence into a list of text.

        Only the top `num_words - 1` most frequent words will be taken into
        account. Only words known by the tokenizer will be taken into
        account.

        Args:
            sequences: A list of sequences (list of integers).

        Returns:
            A list of texts (strings).
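
        Example (illustrative; assumes the tokenizer was fit as in the class
        docstring example):

        >>> tokenizer = Tokenizer()
        >>> tokenizer.fit_on_texts(['The cat sat on the mat.'])
        >>> tokenizer.sequences_to_texts([[1, 2, 3, 4, 1, 5]])
        ['the cat sat on the mat']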

418 """ 

419 return list(self.sequences_to_texts_generator(sequences)) 

420 

    def sequences_to_texts_generator(self, sequences):
        """Transforms each sequence in `sequences` to a list of texts
        (strings).

        Each sequence has to be a list of integers.
        In other words, sequences should be a list of sequences.

        Only the top `num_words - 1` most frequent words will be taken into
        account. Only words known by the tokenizer will be taken into
        account.

        Args:
            sequences: A list of sequences.

        Yields:
            Yields individual texts.
        """
        num_words = self.num_words
        oov_token_index = self.word_index.get(self.oov_token)
        for seq in sequences:
            vect = []
            for num in seq:
                word = self.index_word.get(num)
                if word is not None:
                    if num_words and num >= num_words:
                        # Index beyond the `num_words` cap: emit the OOV
                        # token if one exists, else drop the index.
                        if oov_token_index is not None:
                            vect.append(self.index_word[oov_token_index])
                    else:
                        vect.append(word)
                elif self.oov_token is not None:
                    vect.append(self.index_word[oov_token_index])
            vect = " ".join(vect)
            yield vect

    def texts_to_matrix(self, texts, mode="binary"):
        """Convert a list of texts to a Numpy matrix.

        Args:
            texts: list of strings.
            mode: one of "binary", "count", "tfidf", "freq".

        Returns:
            A Numpy matrix.
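
        Example (a small illustrative sketch; with `num_words=4`, only word
        indices below 4 contribute columns):

        >>> tokenizer = Tokenizer(num_words=4)
        >>> tokenizer.fit_on_texts(['a b b c'])
        >>> tokenizer.texts_to_matrix(['b c'], mode='count')
        array([[0., 1., 0., 1.]])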

462 """ 

463 sequences = self.texts_to_sequences(texts) 

464 return self.sequences_to_matrix(sequences, mode=mode) 

465 

    def sequences_to_matrix(self, sequences, mode="binary"):
        """Converts a list of sequences into a Numpy matrix.

        Args:
            sequences: list of sequences
                (a sequence is a list of integer word indices).
            mode: one of "binary", "count", "tfidf", "freq"

        Returns:
            A Numpy matrix.

        Raises:
            ValueError: In case of an invalid `mode` argument,
                or if the Tokenizer has not yet been fit on any data.
        """
        if not self.num_words:
            if self.word_index:
                num_words = len(self.word_index) + 1
            else:
                raise ValueError(
                    "Specify a dimension (`num_words` argument), "
                    "or fit on some text data first."
                )
        else:
            num_words = self.num_words

        if mode == "tfidf" and not self.document_count:
            raise ValueError(
                "Fit the Tokenizer on some data before using tfidf mode."
            )

        x = np.zeros((len(sequences), num_words))
        for i, seq in enumerate(sequences):
            if not seq:
                continue
            counts = collections.defaultdict(int)
            for j in seq:
                # Indices at or above `num_words` are ignored.
                if j >= num_words:
                    continue
                counts[j] += 1
            for j, c in list(counts.items()):
                if mode == "count":
                    x[i][j] = c
                elif mode == "freq":
                    x[i][j] = c / len(seq)
                elif mode == "binary":
                    x[i][j] = 1
                elif mode == "tfidf":
                    # Use weighting scheme 2 in
                    # https://en.wikipedia.org/wiki/Tf%E2%80%93idf
                    tf = 1 + np.log(c)
                    idf = np.log(
                        1
                        + self.document_count / (1 + self.index_docs.get(j, 0))
                    )
                    x[i][j] = tf * idf
                else:
                    raise ValueError(f"Unknown vectorization mode: {mode}")
        return x

    def get_config(self):
        """Returns the tokenizer configuration as a Python dictionary.

        The word count dictionaries used by the tokenizer get serialized
        into plain JSON, so that the configuration can be read by other
        projects.

        Returns:
            A Python dictionary with the tokenizer configuration.
        """
        json_word_counts = json.dumps(self.word_counts)
        json_word_docs = json.dumps(self.word_docs)
        json_index_docs = json.dumps(self.index_docs)
        json_word_index = json.dumps(self.word_index)
        json_index_word = json.dumps(self.index_word)

        return {
            "num_words": self.num_words,
            "filters": self.filters,
            "lower": self.lower,
            "split": self.split,
            "char_level": self.char_level,
            "oov_token": self.oov_token,
            "document_count": self.document_count,
            "word_counts": json_word_counts,
            "word_docs": json_word_docs,
            "index_docs": json_index_docs,
            "index_word": json_index_word,
            "word_index": json_word_index,
        }

    def to_json(self, **kwargs):
        """Returns a JSON string containing the tokenizer configuration.

        To load a tokenizer from a JSON string, use
        `keras.preprocessing.text.tokenizer_from_json(json_string)`.

        Args:
            **kwargs: Additional keyword arguments
                to be passed to `json.dumps()`.

        Returns:
            A JSON string containing the tokenizer configuration.
        """
        config = self.get_config()
        tokenizer_config = {
            "class_name": self.__class__.__name__,
            "config": config,
        }
        return json.dumps(tokenizer_config, **kwargs)


@keras_export("keras.preprocessing.text.tokenizer_from_json")
def tokenizer_from_json(json_string):
    """Parses a JSON tokenizer configuration and returns a tokenizer instance.

    Deprecated: `tf.keras.preprocessing.text.Tokenizer` does not operate on
    tensors and is not recommended for new code. Prefer
    `tf.keras.layers.TextVectorization` which provides equivalent
    functionality through a layer which accepts `tf.Tensor` input. See the
    [text loading tutorial](
    https://www.tensorflow.org/tutorials/load_data/text)
    for an overview of the layer and text handling in TensorFlow.

    Args:
        json_string: JSON string encoding a tokenizer configuration.

    Returns:
        A Keras Tokenizer instance.
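
    Example (a round-trip sketch; serialization preserves the learned
    vocabulary):

    >>> tokenizer = Tokenizer()
    >>> tokenizer.fit_on_texts(['The cat sat on the mat.'])
    >>> restored = tokenizer_from_json(tokenizer.to_json())
    >>> restored.word_index == tokenizer.word_index
    True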

594 """ 

595 tokenizer_config = json.loads(json_string) 

596 config = tokenizer_config.get("config") 

597 

598 word_counts = json.loads(config.pop("word_counts")) 

599 word_docs = json.loads(config.pop("word_docs")) 

600 index_docs = json.loads(config.pop("index_docs")) 

601 # Integer indexing gets converted to strings with json.dumps() 

602 index_docs = {int(k): v for k, v in index_docs.items()} 

603 index_word = json.loads(config.pop("index_word")) 

604 index_word = {int(k): v for k, v in index_word.items()} 

605 word_index = json.loads(config.pop("word_index")) 

606 

607 tokenizer = Tokenizer(**config) 

608 tokenizer.word_counts = word_counts 

609 tokenizer.word_docs = word_docs 

610 tokenizer.index_docs = index_docs 

611 tokenizer.word_index = word_index 

612 tokenizer.index_word = index_word 

613 return tokenizer 

614