# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for preprocessing sequence data.

Deprecated: `tf.keras.preprocessing.sequence` APIs are not recommended for new
code. Prefer `tf.keras.utils.timeseries_dataset_from_array` and
the `tf.data` APIs, which provide much more flexible mechanisms for dealing
with sequences. See the [tf.data guide](https://www.tensorflow.org/guide/data)
for more details.
"""
import json
import random

import numpy as np

from keras.src.utils import data_utils

# isort: off
from tensorflow.python.util.tf_export import keras_export
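
# The module docstring above points new code at
# `tf.keras.utils.timeseries_dataset_from_array` and `tf.data`. A minimal
# sketch of that replacement (assumes TensorFlow is installed; the data and
# shapes below are made up purely for illustration):
#
#     import numpy as np
#     import tensorflow as tf
#
#     data = np.arange(100).reshape(50, 2)   # 50 timesteps, 2 features
#     targets = np.arange(50)                # targets[i] labels the window starting at i
#     dataset = tf.keras.utils.timeseries_dataset_from_array(
#         data, targets, sequence_length=10, sequence_stride=1, batch_size=8
#     )
#     for batch_x, batch_y in dataset.take(1):
#         print(batch_x.shape, batch_y.shape)  # (8, 10, 2) (8,)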


def _remove_long_seq(maxlen, seq, label):
    """Removes sequences that exceed the maximum length.

    Args:
        maxlen: Int, maximum length of the output sequences.
        seq: List of lists, where each sublist is a sequence.
        label: List where each element is an integer.

    Returns:
        new_seq, new_label: shortened lists for `seq` and `label`.
    """
    new_seq, new_label = [], []
    for x, y in zip(seq, label):
        if len(x) < maxlen:
            new_seq.append(x)
            new_label.append(y)
    return new_seq, new_label
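
# Illustrative sketch (not part of the upstream module): `_remove_long_seq`
# keeps only the sequences strictly shorter than `maxlen`, together with their
# labels, e.g.:
#
#     seqs = [[1, 2], [3, 4, 5, 6], [7]]
#     labels = [0, 1, 0]
#     new_seqs, new_labels = _remove_long_seq(3, seqs, labels)
#     # new_seqs == [[1, 2], [7]]; new_labels == [0, 0]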


@keras_export("keras.preprocessing.sequence.TimeseriesGenerator")
class TimeseriesGenerator(data_utils.Sequence):
    """Utility class for generating batches of temporal data.

    Deprecated: `tf.keras.preprocessing.sequence.TimeseriesGenerator` does not
    operate on tensors and is not recommended for new code. Prefer using a
    `tf.data.Dataset` which provides a more efficient and flexible mechanism
    for batching, shuffling, and windowing input. See the
    [tf.data guide](https://www.tensorflow.org/guide/data) for more details.

    This class takes in a sequence of data-points gathered at
    equal intervals, along with time series parameters such as
    stride, length of history, etc., to produce batches for
    training/validation.

    Arguments:
        data: Indexable generator (such as a list or Numpy array)
            containing consecutive data points (timesteps).
            The data should be 2D, and axis 0 is expected
            to be the time dimension.
        targets: Targets corresponding to timesteps in `data`.
            It should have the same length as `data`.
        length: Length of the output sequences (in number of timesteps).
        sampling_rate: Period between successive individual timesteps
            within sequences. For rate `r`, timesteps
            `data[i]`, `data[i-r]`, ... `data[i - length]`
            are used to create a sample sequence.
        stride: Period between successive output sequences.
            For stride `s`, consecutive output samples would
            be centered around `data[i]`, `data[i+s]`, `data[i+2*s]`, etc.
        start_index: Data points earlier than `start_index` will not be used
            in the output sequences. This is useful to reserve part of the
            data for test or validation.
        end_index: Data points later than `end_index` will not be used
            in the output sequences. This is useful to reserve part of the
            data for test or validation.
        shuffle: Whether to shuffle output samples,
            or instead draw them in chronological order.
        reverse: Boolean: if `True`, timesteps in each output sample will be
            in reverse chronological order.
        batch_size: Number of timeseries samples in each batch
            (except maybe the last one).

    Returns:
        A [Sequence](
        https://www.tensorflow.org/api_docs/python/tf/keras/utils/Sequence)
        instance.

    Examples:
    ```python
    from keras.src.preprocessing.sequence import TimeseriesGenerator
    import numpy as np
    data = np.array([[i] for i in range(50)])
    targets = np.array([[i] for i in range(50)])
    data_gen = TimeseriesGenerator(data, targets,
                                   length=10, sampling_rate=2,
                                   batch_size=2)
    assert len(data_gen) == 20
    batch_0 = data_gen[0]
    x, y = batch_0
    assert np.array_equal(x,
                          np.array([[[0], [2], [4], [6], [8]],
                                    [[1], [3], [5], [7], [9]]]))
    assert np.array_equal(y,
                          np.array([[10], [11]]))
    ```
    """

    def __init__(
        self,
        data,
        targets,
        length,
        sampling_rate=1,
        stride=1,
        start_index=0,
        end_index=None,
        shuffle=False,
        reverse=False,
        batch_size=128,
    ):
        if len(data) != len(targets):
            raise ValueError(
                "Data and targets have to be"
                + f" of same length. Data length is {len(data)}"
                + f" while target length is {len(targets)}"
            )

        self.data = data
        self.targets = targets
        self.length = length
        self.sampling_rate = sampling_rate
        self.stride = stride
        self.start_index = start_index + length
        if end_index is None:
            end_index = len(data) - 1
        self.end_index = end_index
        self.shuffle = shuffle
        self.reverse = reverse
        self.batch_size = batch_size

        if self.start_index > self.end_index:
            raise ValueError(
                "`start_index+length=%i > end_index=%i` "
                "is disallowed, as no part of the sequence "
                "would be left to be used as current step."
                % (self.start_index, self.end_index)
            )

    def __len__(self):
        return (
            self.end_index - self.start_index + self.batch_size * self.stride
        ) // (self.batch_size * self.stride)
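
    # Worked example of the length formula above, using the numbers from the
    # class docstring (50 timesteps, length=10, sampling_rate=2, stride=1,
    # batch_size=2): start_index = 0 + 10 = 10 and end_index = 49, so
    # (49 - 10 + 2 * 1) // (2 * 1) == 20, matching `assert len(data_gen) == 20`.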

    def __getitem__(self, index):
        if self.shuffle:
            rows = np.random.randint(
                self.start_index, self.end_index + 1, size=self.batch_size
            )
        else:
            i = self.start_index + self.batch_size * self.stride * index
            rows = np.arange(
                i,
                min(i + self.batch_size * self.stride, self.end_index + 1),
                self.stride,
            )

        samples = np.array(
            [
                self.data[row - self.length : row : self.sampling_rate]
                for row in rows
            ]
        )
        targets = np.array([self.targets[row] for row in rows])

        if self.reverse:
            return samples[:, ::-1, ...], targets
        return samples, targets
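
    # For that same docstring example, `data_gen[0]` selects rows [10, 11]; the
    # first sample is `self.data[0:10:2]` -> [[0], [2], [4], [6], [8]] and its
    # target is `self.targets[10]` -> [10], matching the asserts above.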

    def get_config(self):
        """Returns the TimeseriesGenerator configuration as a Python dictionary.

        Returns:
            A Python dictionary with the TimeseriesGenerator configuration.
        """
        data = self.data
        if type(self.data).__module__ == np.__name__:
            data = self.data.tolist()
        try:
            json_data = json.dumps(data)
        except TypeError as e:
            raise TypeError("Data not JSON Serializable:", data) from e

        targets = self.targets
        if type(self.targets).__module__ == np.__name__:
            targets = self.targets.tolist()
        try:
            json_targets = json.dumps(targets)
        except TypeError as e:
            raise TypeError("Targets not JSON Serializable:", targets) from e

        return {
            "data": json_data,
            "targets": json_targets,
            "length": self.length,
            "sampling_rate": self.sampling_rate,
            "stride": self.stride,
            "start_index": self.start_index,
            "end_index": self.end_index,
            "shuffle": self.shuffle,
            "reverse": self.reverse,
            "batch_size": self.batch_size,
        }

    def to_json(self, **kwargs):
        """Returns a JSON string containing the generator's configuration.

        Args:
            **kwargs: Additional keyword arguments to be passed
                to `json.dumps()`.

        Returns:
            A JSON string containing the generator configuration.
        """
        config = self.get_config()
        timeseries_generator_config = {
            "class_name": self.__class__.__name__,
            "config": config,
        }
        return json.dumps(timeseries_generator_config, **kwargs)
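
    # Illustrative serialization sketch (not part of the upstream module),
    # reusing the `data_gen` instance from the class docstring example:
    #
    #     config = data_gen.get_config()   # dict; "data"/"targets" are JSON strings
    #     json_string = data_gen.to_json(indent=2)
    #     restored = json.loads(json_string)
    #     assert restored["class_name"] == "TimeseriesGenerator"
    #     assert json.loads(restored["config"]["data"])[0] == [0]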


@keras_export("keras.preprocessing.sequence.make_sampling_table")
def make_sampling_table(size, sampling_factor=1e-5):
    """Generates a word rank-based probabilistic sampling table.

    Used for generating the `sampling_table` argument for `skipgrams`.
    `sampling_table[i]` is the probability of sampling the i-th most common
    word in a dataset (more common words should be sampled less frequently,
    for balance).

    The sampling probabilities are generated according
    to the sampling distribution used in word2vec:

    ```
    p(word) = (min(1, sqrt(word_frequency / sampling_factor) /
        (word_frequency / sampling_factor)))
    ```

    We assume that the word frequencies follow Zipf's law (s=1) to derive
    a numerical approximation of frequency(rank):

    `frequency(rank) ~ 1/(rank * (log(rank) + gamma) + 1/2 - 1/(12*rank))`
    where `gamma` is the Euler-Mascheroni constant.

    Args:
        size: Int, number of possible words to sample.
        sampling_factor: The sampling factor in the word2vec formula.

    Returns:
        A 1D Numpy array of length `size` where the ith entry
        is the probability that a word of rank i should be sampled.
    """
    gamma = 0.577
    rank = np.arange(size)
    rank[0] = 1
    inv_fq = rank * (np.log(rank) + gamma) + 0.5 - 1.0 / (12.0 * rank)
    f = sampling_factor * inv_fq

    return np.minimum(1.0, f / np.sqrt(f))
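
# Illustrative sketch (not part of the upstream module): table entries grow
# with rank, so rarer words are kept with higher probability when the table
# is passed as the `sampling_table` argument of `skipgrams`, e.g.:
#
#     table = make_sampling_table(size=1000)
#     # table[1] < table[100] < table[999], and every entry is <= 1.0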


@keras_export("keras.preprocessing.sequence.skipgrams")
def skipgrams(
    sequence,
    vocabulary_size,
    window_size=4,
    negative_samples=1.0,
    shuffle=True,
    categorical=False,
    sampling_table=None,
    seed=None,
):
    """Generates skipgram word pairs.

    This function transforms a sequence of word indexes (list of integers)
    into tuples of words of the form:

    - (word, word in the same window), with label 1 (positive samples).
    - (word, random word from the vocabulary), with label 0 (negative samples).

    Read more about Skipgram in this gnomic paper by Mikolov et al.:
    [Efficient Estimation of Word Representations in
    Vector Space](http://arxiv.org/pdf/1301.3781v3.pdf)

    Args:
        sequence: A word sequence (sentence), encoded as a list
            of word indices (integers). If using a `sampling_table`,
            word indices are expected to match the rank
            of the words in a reference dataset (e.g. 10 would encode
            the 10-th most frequently occurring token).
            Note that index 0 is expected to be a non-word and will be skipped.
        vocabulary_size: Int, maximum possible word index + 1.
        window_size: Int, size of sampling windows (technically half-window).
            The window of a word `w_i` will be
            `[i - window_size, i + window_size+1]`.
        negative_samples: Float >= 0. 0 for no negative (i.e. random) samples.
            1 for the same number as positive samples.
        shuffle: Whether to shuffle the word couples before returning them.
        categorical: bool. If `False`, labels will be
            integers (e.g. `[0, 1, 1 ...]`),
            if `True`, labels will be categorical, e.g.
            `[[1,0],[0,1],[0,1] ...]`.
        sampling_table: 1D array of size `vocabulary_size` where the entry i
            encodes the probability to sample a word of rank i.
        seed: Random seed.

    Returns:
        couples, labels: where `couples` are int pairs and
            `labels` are either 0 or 1.

    Note:
        By convention, index 0 in the vocabulary is
        a non-word and will be skipped.
    """
    couples = []
    labels = []
    for i, wi in enumerate(sequence):
        if not wi:
            continue
        if sampling_table is not None:
            if sampling_table[wi] < random.random():
                continue

        window_start = max(0, i - window_size)
        window_end = min(len(sequence), i + window_size + 1)
        for j in range(window_start, window_end):
            if j != i:
                wj = sequence[j]
                if not wj:
                    continue
                couples.append([wi, wj])
                if categorical:
                    labels.append([0, 1])
                else:
                    labels.append(1)

    if negative_samples > 0:
        num_negative_samples = int(len(labels) * negative_samples)
        words = [c[0] for c in couples]
        random.shuffle(words)

        couples += [
            [words[i % len(words)], random.randint(1, vocabulary_size - 1)]
            for i in range(num_negative_samples)
        ]
        if categorical:
            labels += [[1, 0]] * num_negative_samples
        else:
            labels += [0] * num_negative_samples

    if shuffle:
        if seed is None:
            seed = random.randint(0, int(10e6))
        random.seed(seed)
        random.shuffle(couples)
        random.seed(seed)
        random.shuffle(labels)

    return couples, labels
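
# Illustrative sketch (not part of the upstream module): with no sampling
# table, every non-zero index yields positive pairs from its window, plus an
# equal number of random negative pairs by default (`negative_samples=1.0`):
#
#     sentence = [1, 2, 3]   # word indices; index 0 would be skipped
#     couples, labels = skipgrams(sentence, vocabulary_size=4, window_size=1,
#                                 shuffle=False)
#     # positive pairs [1, 2], [2, 1], [2, 3], [3, 2] with label 1, followed
#     # by 4 random negative pairs with label 0.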