Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/keras/src/preprocessing/sequence.py: 17%

109 statements  

« prev     ^ index     » next       coverage.py v7.4.0, created at 2024-01-03 07:57 +0000

1# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); 

4# you may not use this file except in compliance with the License. 

5# You may obtain a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, 

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 

12# See the License for the specific language governing permissions and 

13# limitations under the License. 

14# ============================================================================== 

15"""Utilities for preprocessing sequence data. 

16 

17Deprecated: `tf.keras.preprocessing.sequence` APIs are not recommended for new 

18code. Prefer `tf.keras.utils.timeseries_dataset_from_array` and 

19the `tf.data` APIs which provide much more flexible mechanisms for dealing 

20with sequences. See the [tf.data guide](https://www.tensorflow.org/guide/data) 

21for more details. 

22""" 

23 

24 

25import json 

26import random 

27 

28import numpy as np 

29 

30from keras.src.utils import data_utils 

31 

32# isort: off 

33from tensorflow.python.util.tf_export import keras_export 

34 

35 

def _remove_long_seq(maxlen, seq, label):
    """Drops every sequence whose length is not strictly below `maxlen`.

    Sequences and labels are paired positionally; a pair is kept only when
    `len(sequence) < maxlen`, so a sequence of exactly `maxlen` items is
    dropped as well.

    Args:
        maxlen: Int, length bound; only sequences strictly shorter are kept.
        seq: List of lists, where each sublist is a sequence.
        label: List where each element is the label of the matching sequence.

    Returns:
        new_seq, new_label: two lists holding the surviving sequences and
        their labels, in the original order.
    """
    # Filter the (sequence, label) pairs in one pass, then split them back
    # into two parallel lists.
    survivors = [
        (candidate, tag)
        for candidate, tag in zip(seq, label)
        if len(candidate) < maxlen
    ]
    new_seq = [candidate for candidate, _ in survivors]
    new_label = [tag for _, tag in survivors]
    return new_seq, new_label

53 

54 

55@keras_export("keras.preprocessing.sequence.TimeseriesGenerator") 

class TimeseriesGenerator(data_utils.Sequence):
    """Utility class for generating batches of temporal data.

    Deprecated: `tf.keras.preprocessing.sequence.TimeseriesGenerator` does not
    operate on tensors and is not recommended for new code. Prefer using a
    `tf.data.Dataset` which provides a more efficient and flexible mechanism
    for batching, shuffling, and windowing input. See the
    [tf.data guide](https://www.tensorflow.org/guide/data) for more details.

    Given a series of data points gathered at equal intervals, this class
    slices it into (window, target) pairs according to time series
    parameters such as stride and history length, and serves those pairs
    in batches for training/validation.

    Args:
        data: Indexable container (such as a list or Numpy array)
            holding consecutive data points (timesteps).
            The data is expected to be 2D, with axis 0 being
            the time dimension.
        targets: Targets corresponding to timesteps in `data`.
            It must have the same length as `data`.
        length: Length of the history window (in number of timesteps)
            that precedes each target.
        sampling_rate: Period between successive individual timesteps
            within a window. For rate `r` and target position `i`, the
            sample is `data[i - length]`, `data[i - length + r]`, ...
            up to, but not including, `data[i]`.
        stride: Period between successive output samples.
            For stride `s`, consecutive samples use target positions
            `i`, `i + s`, `i + 2 * s`, etc.
        start_index: Data points earlier than `start_index` will not be
            used in the output sequences. This is useful to reserve part
            of the data for test or validation.
        end_index: Data points later than `end_index` will not be used
            in the output sequences. This is useful to reserve part of the
            data for test or validation. Defaults to the last index
            of `data`.
        shuffle: Whether to draw target positions at random for every
            batch, or instead walk through them in chronological order.
        reverse: Boolean: if `True`, timesteps in each output sample will
            be in reverse chronological order.
        batch_size: Number of timeseries samples in each batch
            (except maybe the last one).

    Returns:
        A [Sequence](
        https://www.tensorflow.org/api_docs/python/tf/keras/utils/Sequence)
        instance.

    Raises:
        ValueError: If `data` and `targets` differ in length, or if
            `start_index + length` is greater than `end_index`.

    Examples:
        ```python
        from keras.src.preprocessing.sequence import TimeseriesGenerator
        import numpy as np
        data = np.array([[i] for i in range(50)])
        targets = np.array([[i] for i in range(50)])
        data_gen = TimeseriesGenerator(data, targets,
                                       length=10, sampling_rate=2,
                                       batch_size=2)
        assert len(data_gen) == 20
        batch_0 = data_gen[0]
        x, y = batch_0
        assert np.array_equal(x,
                              np.array([[[0], [2], [4], [6], [8]],
                                        [[1], [3], [5], [7], [9]]]))
        assert np.array_equal(y,
                              np.array([[10], [11]]))
        ```
    """

    def __init__(
        self,
        data,
        targets,
        length,
        sampling_rate=1,
        stride=1,
        start_index=0,
        end_index=None,
        shuffle=False,
        reverse=False,
        batch_size=128,
    ):
        n_data = len(data)
        n_targets = len(targets)
        if n_data != n_targets:
            raise ValueError(
                "Data and targets have to be of same length. "
                f"Data length is {n_data} while target length is {n_targets}"
            )

        self.data = data
        self.targets = targets
        self.length = length
        self.sampling_rate = sampling_rate
        self.stride = stride
        self.shuffle = shuffle
        self.reverse = reverse
        self.batch_size = batch_size

        # The stored start index is the first usable *target* position:
        # it is shifted by `length` so that a full history window fits
        # in front of it.
        self.start_index = start_index + length
        self.end_index = n_data - 1 if end_index is None else end_index

        if self.start_index > self.end_index:
            raise ValueError(
                "`start_index+length=%i > end_index=%i` "
                "is disallowed, as no part of the sequence "
                "would be left to be used as current step."
                % (self.start_index, self.end_index)
            )

    def __len__(self):
        """Returns the number of batches the generator yields."""
        # Number of timesteps consumed by one full batch.
        batch_span = self.batch_size * self.stride
        usable = self.end_index - self.start_index
        return (usable + batch_span) // batch_span

    def __getitem__(self, index):
        """Builds the batch at position `index`.

        Args:
            index: Int, position of the batch. Ignored when `shuffle` is
                enabled, since target positions are then drawn at random.

        Returns:
            A `(samples, targets)` tuple of Numpy arrays.
        """
        if self.shuffle:
            # Random target positions within [start_index, end_index].
            rows = np.random.randint(
                self.start_index, self.end_index + 1, size=self.batch_size
            )
        else:
            batch_span = self.batch_size * self.stride
            first = self.start_index + batch_span * index
            # Clip the final batch so it never runs past `end_index`.
            stop = min(first + batch_span, self.end_index + 1)
            rows = np.arange(first, stop, self.stride)

        windows = []
        labels = []
        for row in rows:
            # History window ending just before `row`, subsampled.
            windows.append(
                self.data[row - self.length : row : self.sampling_rate]
            )
            labels.append(self.targets[row])
        samples = np.array(windows)
        targets = np.array(labels)

        if not self.reverse:
            return samples, targets
        # Flip the time axis of every sample.
        return samples[:, ::-1, ...], targets

    def get_config(self):
        """Returns the TimeseriesGenerator configuration as Python dictionary.

        `data` and `targets` are stored as JSON strings. Note that the
        reported `start_index` is the stored attribute, i.e. the constructor
        argument already offset by `length`.

        Returns:
            A Python dictionary with the TimeseriesGenerator configuration.

        Raises:
            TypeError: If `data` or `targets` cannot be serialized to JSON.
        """

        def _encode(values, failure_message):
            # Numpy containers are converted to plain Python lists first.
            if type(values).__module__ == np.__name__:
                values = values.tolist()
            try:
                return json.dumps(values)
            except TypeError as e:
                raise TypeError(failure_message, values) from e

        json_data = _encode(self.data, "Data not JSON Serializable:")
        json_targets = _encode(self.targets, "Targets not JSON Serializable:")

        config = {"data": json_data, "targets": json_targets}
        for attribute in (
            "length",
            "sampling_rate",
            "stride",
            "start_index",
            "end_index",
            "shuffle",
            "reverse",
            "batch_size",
        ):
            config[attribute] = getattr(self, attribute)
        return config

    def to_json(self, **kwargs):
        """Returns a JSON string containing the generator's configuration.

        Args:
            **kwargs: Additional keyword arguments to be passed
                to `json.dumps()`.

        Returns:
            A JSON string containing the class name and the generator
            configuration.
        """
        payload = {
            "class_name": self.__class__.__name__,
            "config": self.get_config(),
        }
        return json.dumps(payload, **kwargs)

246 

247 

248@keras_export("keras.preprocessing.sequence.make_sampling_table") 

def make_sampling_table(size, sampling_factor=1e-5):
    """Generates a word rank-based probabilistic sampling table.

    Used for generating the `sampling_table` argument for `skipgrams`.
    `sampling_table[i]` is the probability of sampling
    the i-th most common word in a dataset
    (more common words should be sampled less frequently, for balance).

    The sampling probabilities are generated according
    to the sampling distribution used in word2vec:

    ```
    p(word) = (min(1, sqrt(word_frequency / sampling_factor) /
        (word_frequency / sampling_factor)))
    ```

    We assume that the word frequencies follow Zipf's law (s=1) to derive
    a numerical approximation of frequency(rank):

    `frequency(rank) ~ 1/(rank * (log(rank) + gamma) + 1/2 - 1/(12*rank))`
    where `gamma` is the Euler-Mascheroni constant.

    Args:
        size: Int, number of possible words to sample. A size of 0 yields
            an empty table.
        sampling_factor: The sampling factor in the word2vec formula.

    Returns:
        A 1D Numpy array of length `size` where the ith entry
        is the probability that a word of rank i should be sampled.
    """
    # Truncated Euler-Mascheroni constant; kept at this precision so the
    # table values stay identical to previously generated ones.
    gamma = 0.577
    rank = np.arange(size)
    if rank.size:
        # Rank 0 is treated like rank 1 to avoid log(0) and division by
        # zero. The guard keeps an empty table from raising IndexError.
        rank[0] = 1
    inv_fq = rank * (np.log(rank) + gamma) + 0.5 - 1.0 / (12.0 * rank)
    f = sampling_factor * inv_fq

    # f / sqrt(f) equals sqrt(sampling_factor / frequency), capped at 1.
    return np.minimum(1.0, f / np.sqrt(f))

286 

287 

288@keras_export("keras.preprocessing.sequence.skipgrams") 

def skipgrams(
    sequence,
    vocabulary_size,
    window_size=4,
    negative_samples=1.0,
    shuffle=True,
    categorical=False,
    sampling_table=None,
    seed=None,
):
    """Generates skipgram word pairs.

    This function transforms a sequence of word indexes (list of integers)
    into tuples of words of the form:

    - (word, word in the same window), with label 1 (positive samples).
    - (word, random word from the vocabulary), with label 0 (negative samples).

    Read more about Skipgram in this gnomic paper by Mikolov et al.:
    [Efficient Estimation of Word Representations in
    Vector Space](http://arxiv.org/pdf/1301.3781v3.pdf)

    Args:
        sequence: A word sequence (sentence), encoded as a list
            of word indices (integers). If using a `sampling_table`,
            word indices are expected to match the rank
            of the words in a reference dataset (e.g. 10 would encode
            the 10-th most frequently occurring token).
            Note that index 0 is expected to be a non-word and will be
            skipped.
        vocabulary_size: Int, maximum possible word index + 1
        window_size: Int, size of sampling windows (technically half-window).
            The window of a word `w_i` will be
            `[i - window_size, i + window_size+1]`.
        negative_samples: Float >= 0. 0 for no negative (i.e. random) samples.
            1 for same number as positive samples.
        shuffle: Whether to shuffle the word couples before returning them.
        categorical: bool. if False, labels will be
            integers (eg. `[0, 1, 1 .. ]`),
            if `True`, labels will be categorical, e.g.
            `[[1,0],[0,1],[0,1] .. ]`.
        sampling_table: 1D array of size `vocabulary_size` where the entry i
            encodes the probability to sample a word of rank i.
        seed: Random seed used for the final shuffle. It is only applied
            when `shuffle` is enabled; subsampling and negative sampling
            draw from the global `random` state before it is set.

    Returns:
        couples, labels: where `couples` are int pairs and
            `labels` are either 0 or 1.

    Note:
        By convention, index 0 in the vocabulary is
        a non-word and will be skipped. When `shuffle` is enabled, the
        global `random` module is re-seeded as a side effect.
    """
    couples = []
    labels = []
    for i, wi in enumerate(sequence):
        if not wi:
            continue
        # Subsample frequent words according to the sampling table.
        if sampling_table is not None and sampling_table[wi] < random.random():
            continue

        window_start = max(0, i - window_size)
        window_end = min(len(sequence), i + window_size + 1)
        for j in range(window_start, window_end):
            if j == i:
                continue
            wj = sequence[j]
            if not wj:
                continue
            couples.append([wi, wj])
            labels.append([0, 1] if categorical else 1)

    if negative_samples > 0:
        num_negative_samples = int(len(labels) * negative_samples)
        words = [c[0] for c in couples]
        random.shuffle(words)

        # Pair shuffled target words with random vocabulary entries
        # (index 0 is excluded since it is a non-word).
        couples += [
            [words[i % len(words)], random.randint(1, vocabulary_size - 1)]
            for i in range(num_negative_samples)
        ]
        if categorical:
            labels += [[1, 0]] * num_negative_samples
        else:
            labels += [0] * num_negative_samples

    if shuffle:
        if seed is None:
            # The upper bound must be an int: `random.randint` rejects
            # float bounds such as `10e6` on recent Python versions.
            seed = random.randint(0, 10_000_000)
        # Re-seeding before each shuffle applies the same permutation to
        # both lists, keeping couples and labels aligned.
        random.seed(seed)
        random.shuffle(couples)
        random.seed(seed)
        random.shuffle(labels)

    return couples, labels

386