# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for text input preprocessing.

Deprecated: `tf.keras.preprocessing.text` APIs are not recommended for new code.
Prefer `tf.keras.utils.text_dataset_from_directory` and
`tf.keras.layers.TextVectorization` which provide a more efficient approach
for preprocessing text input. For an introduction to these APIs, see
the [text loading tutorial]
(https://www.tensorflow.org/tutorials/load_data/text)
and [preprocessing layer guide]
(https://www.tensorflow.org/guide/keras/preprocessing_layers).
"""

import collections
import hashlib
import json
import warnings

import numpy as np

# isort: off
from tensorflow.python.util.tf_export import keras_export


@keras_export("keras.preprocessing.text.text_to_word_sequence")
def text_to_word_sequence(
    input_text,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
):
    r"""Converts a text to a sequence of words (or tokens).

    Deprecated: `tf.keras.preprocessing.text.text_to_word_sequence` does not
    operate on tensors and is not recommended for new code. Prefer
    `tf.strings.regex_replace` and `tf.strings.split` which provide equivalent
    functionality and accept `tf.Tensor` input. For an overview of text
    handling in TensorFlow, see the [text loading tutorial]
    (https://www.tensorflow.org/tutorials/load_data/text).

    This function transforms a string of text into a list of words
    while ignoring `filters`, which includes punctuation by default.

    >>> sample_text = 'This is a sample sentence.'
    >>> tf.keras.preprocessing.text.text_to_word_sequence(sample_text)
    ['this', 'is', 'a', 'sample', 'sentence']

    Args:
        input_text: Input text (string).
        filters: list (or concatenation) of characters to filter out, such as
            punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n'``,
            includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to convert the input to lowercase.
        split: str. Separator for word splitting.

    Returns:
        A list of words (or tokens).
    """
    if lower:
        input_text = input_text.lower()

    translate_dict = {c: split for c in filters}
    translate_map = str.maketrans(translate_dict)
    input_text = input_text.translate(translate_map)

    seq = input_text.split(split)
    return [i for i in seq if i]
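

# Illustrative usage sketch added for this annotated copy; it is not part of
# the upstream Keras source. It shows how `text_to_word_sequence` lowercases,
# strips the default punctuation filters, and splits on whitespace.
def _demo_text_to_word_sequence():
    sample = "Hello, world! Keras-style tokenization."
    # Characters in `filters` are replaced by `split`, then empty tokens are
    # dropped, giving:
    # ['hello', 'world', 'keras', 'style', 'tokenization']
    return text_to_word_sequence(sample)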


@keras_export("keras.preprocessing.text.one_hot")
def one_hot(
    input_text,
    n,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
    analyzer=None,
):
    r"""One-hot encodes a text into a list of word indexes of size `n`.

    Deprecated: `tf.keras.preprocessing.text.one_hot` does not operate on
    tensors and is not recommended for new code. Prefer
    `tf.keras.layers.Hashing` with `output_mode='one_hot'` which provides
    equivalent functionality through a layer which accepts `tf.Tensor` input.
    See the [preprocessing layer guide]
    (https://www.tensorflow.org/guide/keras/preprocessing_layers) for an
    overview of preprocessing layers.

    This function receives as input a string of text and returns a
    list of encoded integers each corresponding to a word (or token)
    in the given input string.

    Args:
        input_text: Input text (string).
        n: int. Size of vocabulary.
        filters: list (or concatenation) of characters to filter out, such as
            punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n'``,
            includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to set the text to lowercase.
        split: str. Separator for word splitting.
        analyzer: function. Custom analyzer to split the text.

    Returns:
        List of integers in `[1, n]`. Each integer encodes a word
        (uniqueness not guaranteed).
    """
    return hashing_trick(
        input_text,
        n,
        hash_function=hash,
        filters=filters,
        lower=lower,
        split=split,
        analyzer=analyzer,
    )
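

# Illustrative sketch added for this annotated copy; not part of the upstream
# Keras source. `one_hot` hashes each word into the range `[1, n]` with the
# builtin (per-process, unstable) `hash`, so only length and range are checked.
def _demo_one_hot():
    encoded = one_hot("the cat sat on the mat", n=50)
    # Six input words -> six indices; repeated words hash to the same index
    # within one run, but indices differ across Python processes.
    assert len(encoded) == 6
    assert all(1 <= index <= 50 for index in encoded)
    return encoded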


@keras_export("keras.preprocessing.text.hashing_trick")
def hashing_trick(
    text,
    n,
    hash_function=None,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
    analyzer=None,
):
    r"""Converts a text to a sequence of indexes in a fixed-size hashing space.

    Deprecated: `tf.keras.preprocessing.text.hashing_trick` does not operate on
    tensors and is not recommended for new code. Prefer
    `tf.keras.layers.Hashing` which provides equivalent functionality through a
    layer which accepts `tf.Tensor` input. See the [preprocessing layer guide](
    https://www.tensorflow.org/guide/keras/preprocessing_layers) for an
    overview of preprocessing layers.

    Args:
        text: Input text (string).
        n: Dimension of the hashing space.
        hash_function: defaults to the Python `hash` function; can be 'md5' or
            any function that takes a string as input and returns an int.
            Note that `hash` is not a stable hashing function, so
            it is not consistent across different runs, while 'md5'
            is a stable hashing function.
        filters: list (or concatenation) of characters to filter out, such as
            punctuation. Default: ``'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\\t\\n'``,
            includes basic punctuation, tabs, and newlines.
        lower: boolean. Whether to set the text to lowercase.
        split: str. Separator for word splitting.
        analyzer: function. Custom analyzer to split the text.

    Returns:
        A list of integer word indices (uniqueness not guaranteed).
        `0` is a reserved index that won't be assigned to any word.
        Two or more words may be assigned to the same index, due to possible
        collisions by the hashing function.
        The [probability](
        https://en.wikipedia.org/wiki/Birthday_problem#Probability_table)
        of a collision depends on the dimension of the hashing space and
        the number of distinct objects.
    """
    if hash_function is None:
        hash_function = hash
    elif hash_function == "md5":
        hash_function = lambda w: int(hashlib.md5(w.encode()).hexdigest(), 16)

    if analyzer is None:
        seq = text_to_word_sequence(
            text, filters=filters, lower=lower, split=split
        )
    else:
        seq = analyzer(text)

    return [(hash_function(w) % (n - 1) + 1) for w in seq]
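

# Illustrative sketch added for this annotated copy; not part of the upstream
# Keras source. Passing hash_function="md5" makes the bucket assignment
# reproducible across runs, unlike the builtin `hash` used by default.
def _demo_hashing_trick():
    indices = hashing_trick(
        "the cat sat on the mat", n=10, hash_function="md5"
    )
    # Indices fall in [1, n - 1]; index 0 is reserved, and repeated words
    # ("the" at positions 0 and 4) always land in the same bucket.
    assert indices[0] == indices[4]
    assert all(1 <= index <= 9 for index in indices)
    return indices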


@keras_export("keras.preprocessing.text.Tokenizer")
class Tokenizer(object):
    """Text tokenization utility class.

    Deprecated: `tf.keras.preprocessing.text.Tokenizer` does not operate on
    tensors and is not recommended for new code. Prefer
    `tf.keras.layers.TextVectorization` which provides equivalent functionality
    through a layer which accepts `tf.Tensor` input. See the
    [text loading tutorial](https://www.tensorflow.org/tutorials/load_data/text)
    for an overview of the layer and text handling in TensorFlow.

    This class allows vectorizing a text corpus, by turning each
    text into either a sequence of integers (each integer being the index
    of a token in a dictionary) or into a vector where the coefficient
    for each token could be binary, based on word count, based on tf-idf...

    By default, all punctuation is removed, turning the texts into
    space-separated sequences of words
    (words may include the `'` character). These sequences are then
    split into lists of tokens. They will then be indexed or vectorized.

    `0` is a reserved index that won't be assigned to any word.

    Args:
        num_words: the maximum number of words to keep, based
            on word frequency. Only the most common `num_words-1` words will
            be kept.
        filters: a string where each element is a character that will be
            filtered from the texts. The default is all punctuation, plus
            tabs and line breaks, minus the `'` character.
        lower: boolean. Whether to convert the texts to lowercase.
        split: str. Separator for word splitting.
        char_level: if True, every character will be treated as a token.
        oov_token: if given, it will be added to `word_index` and used to
            replace out-of-vocabulary words during `texts_to_sequences` calls.
        analyzer: function. Custom analyzer to split the text.
            The default analyzer is `text_to_word_sequence`.
    """

    def __init__(
        self,
        num_words=None,
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        lower=True,
        split=" ",
        char_level=False,
        oov_token=None,
        analyzer=None,
        **kwargs
    ):
        # Legacy support
        if "nb_words" in kwargs:
            warnings.warn(
                "The `nb_words` argument in `Tokenizer` "
                "has been renamed `num_words`."
            )
            num_words = kwargs.pop("nb_words")
        document_count = kwargs.pop("document_count", 0)
        if kwargs:
            raise TypeError("Unrecognized keyword arguments: " + str(kwargs))

        self.word_counts = collections.OrderedDict()
        self.word_docs = collections.defaultdict(int)
        self.filters = filters
        self.split = split
        self.lower = lower
        self.num_words = num_words
        self.document_count = document_count
        self.char_level = char_level
        self.oov_token = oov_token
        self.index_docs = collections.defaultdict(int)
        self.word_index = {}
        self.index_word = {}
        self.analyzer = analyzer

    def fit_on_texts(self, texts):
        """Updates internal vocabulary based on a list of texts.

        In the case where `texts` contains lists,
        we assume each entry of the lists to be a token.

        Required before using `texts_to_sequences` or `texts_to_matrix`.

        Args:
            texts: can be a list of strings,
                a generator of strings (for memory-efficiency),
                or a list of lists of strings.
        """
        for text in texts:
            self.document_count += 1
            if self.char_level or isinstance(text, list):
                if self.lower:
                    if isinstance(text, list):
                        text = [text_elem.lower() for text_elem in text]
                    else:
                        text = text.lower()
                seq = text
            else:
                if self.analyzer is None:
                    seq = text_to_word_sequence(
                        text,
                        filters=self.filters,
                        lower=self.lower,
                        split=self.split,
                    )
                else:
                    seq = self.analyzer(text)
            for w in seq:
                if w in self.word_counts:
                    self.word_counts[w] += 1
                else:
                    self.word_counts[w] = 1
            for w in set(seq):
                # In how many documents each word occurs
                self.word_docs[w] += 1

        wcounts = list(self.word_counts.items())
        wcounts.sort(key=lambda x: x[1], reverse=True)
        # forcing the oov_token to index 1 if it exists
        if self.oov_token is None:
            sorted_voc = []
        else:
            sorted_voc = [self.oov_token]
        sorted_voc.extend(wc[0] for wc in wcounts)

        # note that index 0 is reserved, never assigned to an existing word
        self.word_index = dict(
            zip(sorted_voc, list(range(1, len(sorted_voc) + 1)))
        )

        self.index_word = {c: w for w, c in self.word_index.items()}

        for w, c in list(self.word_docs.items()):
            self.index_docs[self.word_index[w]] = c
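
    # Added note for this annotated copy (not in the upstream source): after
    # `fit_on_texts(["a b a", "a c"])` with no `oov_token`, `word_counts` is
    # {'a': 3, 'b': 1, 'c': 1} and `word_index` maps words to indices by
    # descending frequency, e.g. {'a': 1, 'b': 2, 'c': 3}. Index 0 stays
    # reserved, and when an `oov_token` is set it is forced to index 1.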

    def fit_on_sequences(self, sequences):
        """Updates internal vocabulary based on a list of sequences.

        Required before using `sequences_to_matrix`
        (if `fit_on_texts` was never called).

        Args:
            sequences: A list of sequences.
                A "sequence" is a list of integer word indices.
        """
        self.document_count += len(sequences)
        for seq in sequences:
            seq = set(seq)
            for i in seq:
                self.index_docs[i] += 1

    def texts_to_sequences(self, texts):
        """Transforms each text in `texts` to a sequence of integers.

        Only the top `num_words-1` most frequent words will be taken into
        account. Only words known by the tokenizer will be taken into account.

        Args:
            texts: A list of texts (strings).

        Returns:
            A list of sequences.
        """
        return list(self.texts_to_sequences_generator(texts))

    def texts_to_sequences_generator(self, texts):
        """Transforms each text in `texts` to a sequence of integers.

        Each item in `texts` can also be a list,
        in which case we assume each item of that list to be a token.

        Only the top `num_words-1` most frequent words will be taken into
        account. Only words known by the tokenizer will be taken into account.

        Args:
            texts: A list of texts (strings).

        Yields:
            Yields individual sequences.
        """
        num_words = self.num_words
        oov_token_index = self.word_index.get(self.oov_token)
        for text in texts:
            if self.char_level or isinstance(text, list):
                if self.lower:
                    if isinstance(text, list):
                        text = [text_elem.lower() for text_elem in text]
                    else:
                        text = text.lower()
                seq = text
            else:
                if self.analyzer is None:
                    seq = text_to_word_sequence(
                        text,
                        filters=self.filters,
                        lower=self.lower,
                        split=self.split,
                    )
                else:
                    seq = self.analyzer(text)
            vect = []
            for w in seq:
                i = self.word_index.get(w)
                if i is not None:
                    if num_words and i >= num_words:
                        if oov_token_index is not None:
                            vect.append(oov_token_index)
                    else:
                        vect.append(i)
                elif self.oov_token is not None:
                    vect.append(oov_token_index)
            yield vect
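
    # Added note for this annotated copy (not in the upstream source): words
    # whose index is >= `num_words` are treated as out of vocabulary here.
    # They are replaced by the `oov_token` index when one was configured at
    # construction time and silently dropped otherwise; completely unknown
    # words follow the same rule.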

    def sequences_to_texts(self, sequences):
        """Transforms each sequence into a text (string).

        Only the top `num_words-1` most frequent words will be taken into
        account. Only words known by the tokenizer will be taken into account.

        Args:
            sequences: A list of sequences (lists of integers).

        Returns:
            A list of texts (strings).
        """
        return list(self.sequences_to_texts_generator(sequences))

    def sequences_to_texts_generator(self, sequences):
        """Transforms each sequence in `sequences` to a text (string).

        Each sequence has to be a list of integers.
        In other words, `sequences` should be a list of sequences.

        Only the top `num_words-1` most frequent words will be taken into
        account. Only words known by the tokenizer will be taken into account.

        Args:
            sequences: A list of sequences.

        Yields:
            Yields individual texts.
        """
        num_words = self.num_words
        oov_token_index = self.word_index.get(self.oov_token)
        for seq in sequences:
            vect = []
            for num in seq:
                word = self.index_word.get(num)
                if word is not None:
                    if num_words and num >= num_words:
                        if oov_token_index is not None:
                            vect.append(self.index_word[oov_token_index])
                    else:
                        vect.append(word)
                elif self.oov_token is not None:
                    vect.append(self.index_word[oov_token_index])
            vect = " ".join(vect)
            yield vect

    def texts_to_matrix(self, texts, mode="binary"):
        """Converts a list of texts to a Numpy matrix.

        Args:
            texts: list of strings.
            mode: one of "binary", "count", "tfidf", "freq".

        Returns:
            A Numpy matrix.
        """
        sequences = self.texts_to_sequences(texts)
        return self.sequences_to_matrix(sequences, mode=mode)

    def sequences_to_matrix(self, sequences, mode="binary"):
        """Converts a list of sequences into a Numpy matrix.

        Args:
            sequences: list of sequences
                (a sequence is a list of integer word indices).
            mode: one of "binary", "count", "tfidf", "freq".

        Returns:
            A Numpy matrix.

        Raises:
            ValueError: In case of invalid `mode` argument,
                or if the Tokenizer has not yet been fit on sample data.
        """
        if not self.num_words:
            if self.word_index:
                num_words = len(self.word_index) + 1
            else:
                raise ValueError(
                    "Specify a dimension (`num_words` argument), "
                    "or fit on some text data first."
                )
        else:
            num_words = self.num_words

        if mode == "tfidf" and not self.document_count:
            raise ValueError(
                "Fit the Tokenizer on some data before using tfidf mode."
            )

        x = np.zeros((len(sequences), num_words))
        for i, seq in enumerate(sequences):
            if not seq:
                continue
            counts = collections.defaultdict(int)
            for j in seq:
                if j >= num_words:
                    continue
                counts[j] += 1
            for j, c in list(counts.items()):
                if mode == "count":
                    x[i][j] = c
                elif mode == "freq":
                    x[i][j] = c / len(seq)
                elif mode == "binary":
                    x[i][j] = 1
                elif mode == "tfidf":
                    # Use weighting scheme 2 in
                    # https://en.wikipedia.org/wiki/Tf%E2%80%93idf
                    tf = 1 + np.log(c)
                    idf = np.log(
                        1
                        + self.document_count / (1 + self.index_docs.get(j, 0))
                    )
                    x[i][j] = tf * idf
                else:
                    raise ValueError("Unknown vectorization mode:", mode)
        return x
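
    # Added note for this annotated copy (not in the upstream source): for a
    # tokenizer with `num_words=4`, `sequences_to_matrix([[1, 1, 2]])` yields
    # row [0., 2., 1., 0.] in "count" mode, [0., 1., 1., 0.] in "binary" mode,
    # and [0., 2/3, 1/3, 0.] in "freq" mode. In "tfidf" mode each cell is
    # (1 + log(count)) * log(1 + document_count / (1 + index_docs[j])).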

    def get_config(self):
        """Returns the tokenizer configuration as a Python dictionary.

        The word count dictionaries used by the tokenizer get serialized
        into plain JSON, so that the configuration can be read by other
        projects.

        Returns:
            A Python dictionary with the tokenizer configuration.
        """
        json_word_counts = json.dumps(self.word_counts)
        json_word_docs = json.dumps(self.word_docs)
        json_index_docs = json.dumps(self.index_docs)
        json_word_index = json.dumps(self.word_index)
        json_index_word = json.dumps(self.index_word)

        return {
            "num_words": self.num_words,
            "filters": self.filters,
            "lower": self.lower,
            "split": self.split,
            "char_level": self.char_level,
            "oov_token": self.oov_token,
            "document_count": self.document_count,
            "word_counts": json_word_counts,
            "word_docs": json_word_docs,
            "index_docs": json_index_docs,
            "index_word": json_index_word,
            "word_index": json_word_index,
        }

    def to_json(self, **kwargs):
        """Returns a JSON string containing the tokenizer configuration.

        To load a tokenizer from a JSON string, use
        `keras.preprocessing.text.tokenizer_from_json(json_string)`.

        Args:
            **kwargs: Additional keyword arguments
                to be passed to `json.dumps()`.

        Returns:
            A JSON string containing the tokenizer configuration.
        """
        config = self.get_config()
        tokenizer_config = {
            "class_name": self.__class__.__name__,
            "config": config,
        }
        return json.dumps(tokenizer_config, **kwargs)
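

# Illustrative end-to-end sketch added for this annotated copy; not part of
# the upstream Keras source. It walks through the typical Tokenizer workflow:
# fit on a corpus, convert texts to integer sequences, then to a matrix.
def _demo_tokenizer_workflow():
    corpus = ["the cat sat on the mat", "the dog ate my homework"]
    tokenizer = Tokenizer(num_words=10, oov_token="<unk>")
    tokenizer.fit_on_texts(corpus)

    # "<unk>" is forced to index 1; remaining indices follow word frequency.
    sequences = tokenizer.texts_to_sequences(["the cat ate the dog"])
    matrix = tokenizer.texts_to_matrix(corpus, mode="binary")

    # One row per input text, one column per retained word index.
    assert matrix.shape == (2, 10)
    return sequences, matrix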


@keras_export("keras.preprocessing.text.tokenizer_from_json")
def tokenizer_from_json(json_string):
    """Parses a JSON tokenizer configuration and returns a tokenizer instance.

    Deprecated: `tf.keras.preprocessing.text.Tokenizer` does not operate on
    tensors and is not recommended for new code. Prefer
    `tf.keras.layers.TextVectorization` which provides equivalent functionality
    through a layer which accepts `tf.Tensor` input. See the
    [text loading tutorial](https://www.tensorflow.org/tutorials/load_data/text)
    for an overview of the layer and text handling in TensorFlow.

    Args:
        json_string: JSON string encoding a tokenizer configuration.

    Returns:
        A Keras Tokenizer instance.
    """
    tokenizer_config = json.loads(json_string)
    config = tokenizer_config.get("config")

    word_counts = json.loads(config.pop("word_counts"))
    word_docs = json.loads(config.pop("word_docs"))
    index_docs = json.loads(config.pop("index_docs"))
    # Integer indices get converted to strings with json.dumps()
    index_docs = {int(k): v for k, v in index_docs.items()}
    index_word = json.loads(config.pop("index_word"))
    index_word = {int(k): v for k, v in index_word.items()}
    word_index = json.loads(config.pop("word_index"))

    tokenizer = Tokenizer(**config)
    tokenizer.word_counts = word_counts
    tokenizer.word_docs = word_docs
    tokenizer.index_docs = index_docs
    tokenizer.word_index = word_index
    tokenizer.index_word = index_word
    return tokenizer
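

# Illustrative round-trip sketch added for this annotated copy; not part of
# the upstream Keras source. It shows that a fitted Tokenizer survives
# serialization through `to_json` and `tokenizer_from_json`.
def _demo_tokenizer_json_round_trip():
    tokenizer = Tokenizer(num_words=10)
    tokenizer.fit_on_texts(["the cat sat on the mat"])

    restored = tokenizer_from_json(tokenizer.to_json())

    # The restored tokenizer reproduces the original word index and therefore
    # the same integer sequences.
    assert restored.word_index == tokenizer.word_index
    original = tokenizer.texts_to_sequences(["the cat"])
    assert restored.texts_to_sequences(["the cat"]) == original
    return restored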