Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/keras/src/preprocessing/sequence.py: 17%

3# Licensed under the Apache License, Version 2.0 (the "License");

4# you may not use this file except in compliance with the License.

5# You may obtain a copy of the License at

7# http://www.apache.org/licenses/LICENSE-2.0

9# Unless required by applicable law or agreed to in writing, software

10# distributed under the License is distributed on an "AS IS" BASIS,

11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

12# See the License for the specific language governing permissions and

13# limitations under the License.

14# ==============================================================================

15"""Utilities for preprocessing sequence data.

17Deprecated: `tf.keras.preprocessing.sequence` APIs are not recommended for new

18code. Prefer `tf.keras.utils.timeseries_dataset_from_array` and

19the `tf.data` APIs which provide much more flexible mechanisms for dealing

20with sequences. See the [tf.data guide](https://www.tensorflow.org/guide/data)

21for more details.

22"""

25import json

26import random

28import numpy as np

30from keras.src.utils import data_utils

32# isort: off

33from tensorflow.python.util.tf_export import keras_export

36def _remove_long_seq(maxlen, seq, label):

37 """Removes sequences that exceed the maximum length.

39 Args:

40 maxlen: Int, maximum length of the output sequences.

41 seq: List of lists, where each sublist is a sequence.

42 label: List where each element is an integer.

44 Returns:

45 new_seq, new_label: shortened lists for `seq` and `label`.

46 """

47 new_seq, new_label = [], []

48 for x, y in zip(seq, label):

49 if len(x) < maxlen:

50 new_seq.append(x)

51 new_label.append(y)

52 return new_seq, new_label

@keras_export("keras.preprocessing.sequence.TimeseriesGenerator")
class TimeseriesGenerator(data_utils.Sequence):
    """Utility class for generating batches of temporal data.

    Deprecated: `tf.keras.preprocessing.sequence.TimeseriesGenerator` does not
    operate on tensors and is not recommended for new code. Prefer using a
    `tf.data.Dataset` which provides a more efficient and flexible mechanism for
    batching, shuffling, and windowing input. See the
    [tf.data guide](https://www.tensorflow.org/guide/data) for more details.

    This class takes in a sequence of data-points gathered at
    equal intervals, along with time series parameters such as
    stride, length of history, etc., to produce batches for
    training/validation.

    Args:
        data: Indexable generator (such as list or Numpy array)
            containing consecutive data points (timesteps).
            The data should be 2D, and axis 0 is expected
            to be the time dimension.
        targets: Targets corresponding to timesteps in `data`.
            It should have same length as `data`.
        length: Length of the output sequences (in number of timesteps).
        sampling_rate: Period between successive individual timesteps
            within sequences. For rate `r` and target position `i`,
            timesteps `data[i - length]`, `data[i - length + r]`, ...
            (up to but excluding `data[i]`) are used to create a
            sample sequence.
        stride: Period between successive output sequences.
            For stride `s`, consecutive output samples use the target
            positions `i`, `i + s`, `i + 2 * s`, etc.
        start_index: Data points earlier than `start_index` will not be used
            in the output sequences. This is useful to reserve part of the
            data for test or validation.
        end_index: Data points later than `end_index` will not be used
            in the output sequences. This is useful to reserve part of the
            data for test or validation. Defaults to `len(data) - 1`.
        shuffle: Whether to shuffle output samples,
            or instead draw them in chronological order.
        reverse: Boolean: if `true`, timesteps in each output sample will be
            in reverse chronological order.
        batch_size: Number of timeseries samples in each batch
            (except maybe the last one).

    Returns:
        A [Sequence](
        https://www.tensorflow.org/api_docs/python/tf/keras/utils/Sequence)
        instance.

    Raises:
        ValueError: If `data` and `targets` differ in length, or if
            `start_index + length > end_index`.

    Examples:
        ```python
        from keras.src.preprocessing.sequence import TimeseriesGenerator
        import numpy as np
        data = np.array([[i] for i in range(50)])
        targets = np.array([[i] for i in range(50)])
        data_gen = TimeseriesGenerator(data, targets,
                                       length=10, sampling_rate=2,
                                       batch_size=2)
        assert len(data_gen) == 20
        batch_0 = data_gen[0]
        x, y = batch_0
        assert np.array_equal(x,
                              np.array([[[0], [2], [4], [6], [8]],
                                        [[1], [3], [5], [7], [9]]]))
        assert np.array_equal(y,
                              np.array([[10], [11]]))
        ```
    """

    def __init__(
        self,
        data,
        targets,
        length,
        sampling_rate=1,
        stride=1,
        start_index=0,
        end_index=None,
        shuffle=False,
        reverse=False,
        batch_size=128,
    ):

        if len(data) != len(targets):
            raise ValueError(
                "Data and targets have to be"
                + f" of same length. Data length is {len(data)}"
                + f" while target length is {len(targets)}"
            )

        self.data = data
        self.targets = targets
        self.length = length
        self.sampling_rate = sampling_rate
        self.stride = stride
        # Stored shifted by `length`: this is the first usable *target*
        # position, since each sample needs `length` preceding timesteps.
        self.start_index = start_index + length
        if end_index is None:
            end_index = len(data) - 1
        self.end_index = end_index
        self.shuffle = shuffle
        self.reverse = reverse
        self.batch_size = batch_size

        if self.start_index > self.end_index:
            raise ValueError(
                "`start_index+length=%i > end_index=%i` "
                "is disallowed, as no part of the sequence "
                "would be left to be used as current step."
                % (self.start_index, self.end_index)
            )

    def __len__(self):
        """Returns the number of batches (the last one may be partial)."""
        # Ceil-style division of the usable target range by the number of
        # timesteps one batch advances (`batch_size * stride`).
        return (
            self.end_index - self.start_index + self.batch_size * self.stride
        ) // (self.batch_size * self.stride)

    def __getitem__(self, index):
        """Builds the batch at position `index`.

        Args:
            index: Int, batch position. Ignored when `shuffle` is set, in
                which case target positions are drawn at random.

        Returns:
            A `(samples, targets)` tuple of Numpy arrays.
        """
        if self.shuffle:
            # Random target positions, drawn with replacement from the
            # inclusive range [start_index, end_index].
            rows = np.random.randint(
                self.start_index, self.end_index + 1, size=self.batch_size
            )
        else:
            i = self.start_index + self.batch_size * self.stride * index
            rows = np.arange(
                i,
                min(i + self.batch_size * self.stride, self.end_index + 1),
                self.stride,
            )

        # Each sample is the window of `length` timesteps that precedes its
        # target position, subsampled every `sampling_rate` steps.
        samples = np.array(
            [
                self.data[row - self.length : row : self.sampling_rate]
                for row in rows
            ]
        )
        targets = np.array([self.targets[row] for row in rows])

        if self.reverse:
            return samples[:, ::-1, ...], targets
        return samples, targets

    def get_config(self):
        """Returns the TimeseriesGenerator configuration as Python dictionary.

        The `"start_index"` entry is the value originally passed to the
        constructor (not the internally shifted one), so that the returned
        configuration can be fed back to `TimeseriesGenerator(...)` without
        moving the start of the usable range.

        Returns:
            A Python dictionary with the TimeseriesGenerator configuration.
            `"data"` and `"targets"` are JSON-encoded strings.

        Raises:
            TypeError: If `data` or `targets` cannot be serialized to JSON.
        """
        data = self.data
        if type(self.data).__module__ == np.__name__:
            # Numpy containers are not JSON serializable; use plain lists.
            data = self.data.tolist()
        try:
            json_data = json.dumps(data)
        except TypeError as e:
            raise TypeError("Data not JSON Serializable:", data) from e

        targets = self.targets
        if type(self.targets).__module__ == np.__name__:
            targets = self.targets.tolist()
        try:
            json_targets = json.dumps(targets)
        except TypeError as e:
            raise TypeError("Targets not JSON Serializable:", targets) from e

        return {
            "data": json_data,
            "targets": json_targets,
            "length": self.length,
            "sampling_rate": self.sampling_rate,
            "stride": self.stride,
            # `self.start_index` holds `start_index + length` (see
            # `__init__`); undo the shift so the config round-trips.
            "start_index": self.start_index - self.length,
            "end_index": self.end_index,
            "shuffle": self.shuffle,
            "reverse": self.reverse,
            "batch_size": self.batch_size,
        }

    def to_json(self, **kwargs):
        """Returns a JSON string containing the generator's configuration.

        Args:
            **kwargs: Additional keyword arguments to be passed
                to `json.dumps()`.

        Returns:
            A JSON string containing the generator's class name and
            configuration.
        """
        config = self.get_config()
        timeseries_generator_config = {
            "class_name": self.__class__.__name__,
            "config": config,
        }
        return json.dumps(timeseries_generator_config, **kwargs)

246

247

248@keras_export("keras.preprocessing.sequence.make_sampling_table")

def make_sampling_table(size, sampling_factor=1e-5):
    """Generates a word rank-based probabilistic sampling table.

    Used for generating the `sampling_table` argument for `skipgrams`.
    `sampling_table[i]` is the probability of sampling
    the i-th most common word in a dataset
    (more common words should be sampled less frequently, for balance).

    The sampling probabilities are generated according
    to the sampling distribution used in word2vec:

    ```
    p(word) = (min(1, sqrt(word_frequency / sampling_factor) /
        (word_frequency / sampling_factor)))
    ```

    We assume that the word frequencies follow Zipf's law (s=1) to derive
    a numerical approximation of frequency(rank):

    `frequency(rank) ~ 1/(rank * (log(rank) + gamma) + 1/2 - 1/(12*rank))`
    where `gamma` is the Euler-Mascheroni constant.

    Args:
        size: Int, number of possible words to sample. A `size` of 0
            yields an empty table.
        sampling_factor: The sampling factor in the word2vec formula.

    Returns:
        A 1D Numpy array of length `size` where the ith entry
        is the probability that a word of rank i should be sampled.
    """
    gamma = 0.577
    rank = np.arange(size)
    # Rank 0 would give log(0) and a division by zero, so it is treated
    # like rank 1. Guarded so that an empty table does not raise IndexError.
    if rank.size:
        rank[0] = 1
    inv_fq = rank * (np.log(rank) + gamma) + 0.5 - 1.0 / (12.0 * rank)
    f = sampling_factor * inv_fq

    return np.minimum(1.0, f / np.sqrt(f))

286

287

288@keras_export("keras.preprocessing.sequence.skipgrams")

def skipgrams(
    sequence,
    vocabulary_size,
    window_size=4,
    negative_samples=1.0,
    shuffle=True,
    categorical=False,
    sampling_table=None,
    seed=None,
):
    """Generates skipgram word pairs.

    This function transforms a sequence of word indexes (list of integers)
    into tuples of words of the form:

    - (word, word in the same window), with label 1 (positive samples).
    - (word, random word from the vocabulary), with label 0 (negative samples).

    Read more about Skipgram in the paper by Mikolov et al.:
    [Efficient Estimation of Word Representations in
    Vector Space](http://arxiv.org/pdf/1301.3781v3.pdf)

    Args:
        sequence: A word sequence (sentence), encoded as a list
            of word indices (integers). If using a `sampling_table`,
            word indices are expected to match the rank
            of the words in a reference dataset (e.g. 10 would encode
            the 10-th most frequently occurring token).
            Note that index 0 is expected to be a non-word and will be skipped.
        vocabulary_size: Int, maximum possible word index + 1
        window_size: Int, size of sampling windows (technically half-window).
            The window of a word `w_i` will be
            `[i - window_size, i + window_size+1]`.
        negative_samples: Float >= 0. 0 for no negative (i.e. random) samples.
            1 for same number as positive samples.
        shuffle: Whether to shuffle the word couples before returning them.
        categorical: bool. if False, labels will be
            integers (eg. `[0, 1, 1 .. ]`),
            if `True`, labels will be categorical, e.g.
            `[[1,0],[0,1],[0,1] .. ]`.
        sampling_table: 1D array of size `vocabulary_size` where the entry i
            encodes the probability to sample a word of rank i.
        seed: Random seed used for the final shuffle of couples and labels.
            When `None`, a random seed is drawn.

    Returns:
        couples, labels: where `couples` are int pairs and
            `labels` are either 0 or 1.

    Note:
        By convention, index 0 in the vocabulary is
        a non-word and will be skipped.
    """
    positive_label = [0, 1] if categorical else 1
    couples = []
    labels = []
    for i, wi in enumerate(sequence):
        if not wi:
            continue
        # Randomly drop the target word according to its sampling probability.
        if sampling_table is not None and sampling_table[wi] < random.random():
            continue

        window_start = max(0, i - window_size)
        window_end = min(len(sequence), i + window_size + 1)
        for j in range(window_start, window_end):
            if j == i:
                continue
            wj = sequence[j]
            if not wj:
                continue
            couples.append([wi, wj])
            # A fresh list per label so categorical labels are not aliased.
            labels.append(list(positive_label) if categorical else 1)

    if negative_samples > 0:
        num_negative_samples = int(len(labels) * negative_samples)
        words = [c[0] for c in couples]
        random.shuffle(words)

        # Pair (cycled) target words with random non-zero vocabulary indices.
        couples += [
            [words[i % len(words)], random.randint(1, vocabulary_size - 1)]
            for i in range(num_negative_samples)
        ]
        if categorical:
            labels += [[1, 0]] * num_negative_samples
        else:
            labels += [0] * num_negative_samples

    if shuffle:
        if seed is None:
            # The upper bound must be an int: `random.randint` rejects float
            # bounds (such as `10e6`) on recent Python versions.
            seed = random.randint(0, 10_000_000)
        # Re-seeding before each shuffle applies the same permutation to
        # both lists, keeping couples and labels aligned.
        random.seed(seed)
        random.shuffle(couples)
        random.seed(seed)
        random.shuffle(labels)

    return couples, labels

386