# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras discretization preprocessing layer."""


import numpy as np
import tensorflow.compat.v2 as tf

from keras.src import backend
from keras.src.engine import base_preprocessing_layer
from keras.src.layers.preprocessing import preprocessing_utils as utils
from keras.src.utils import layer_utils
from keras.src.utils import tf_utils

# isort: off
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util.tf_export import keras_export

INT = utils.INT
MULTI_HOT = utils.MULTI_HOT
ONE_HOT = utils.ONE_HOT
COUNT = utils.COUNT


def summarize(values, epsilon):
    """Reduce a 1D sequence of values to a summary.

    This algorithm is based on numpy.quantile but modified to allow for
    intermediate steps between multiple data sets. It first finds the target
    number of bins as the reciprocal of epsilon and then takes the individual
    values spaced at appropriate intervals to arrive at that target.
    The final step is to return the corresponding counts between those
    values. If the target num_bins is larger than the size of values, the
    whole array is returned (with weights of 1).

    Args:
      values: 1D `np.ndarray` to be summarized.
      epsilon: A `float32` that determines the approximate desired precision.

    Returns:
      A 2D `np.ndarray` that is a summary of the inputs. First column is the
      interpolated partition values, the second is the weights (counts).
    """

    values = tf.reshape(values, [-1])
    values = tf.sort(values)
    elements = tf.cast(tf.size(values), tf.float32)
    num_buckets = 1.0 / epsilon
    increment = tf.cast(elements / num_buckets, tf.int32)
    start = increment
    step = tf.maximum(increment, 1)
    boundaries = values[start::step]
    weights = tf.ones_like(boundaries)
    weights = weights * tf.cast(step, tf.float32)
    return tf.stack([boundaries, weights])
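
# A minimal sketch of what `summarize` produces, assuming eager execution
# (the values below are illustrative, not taken from the library's tests):
#
#     values = tf.constant([4.0, 1.0, 3.0, 2.0])
#     summarize(values, epsilon=0.5)
#     # 1/epsilon = 2 target buckets over the 4 sorted values gives a step
#     # of 2, so the boundaries are the sorted values at indices [2::2],
#     # i.e. [3.0], each carrying a weight equal to the step:
#     # [[3.0], [2.0]]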


def compress(summary, epsilon):
    """Compress a summary to within `epsilon` accuracy.

    The compression step is needed to keep the summary sizes small after
    merging, and also used to return the final target boundaries. It finds
    the new bins based on interpolating cumulative weight percentages from
    the large summary. Taking the difference of the cumulative weights from
    the previous bin's cumulative weight will give the new weight for that
    bin.

    Args:
      summary: 2D `np.ndarray` summary to be compressed.
      epsilon: A `float32` that determines the approximate desired precision.

    Returns:
      A 2D `np.ndarray` that is a compressed summary. First column is the
      interpolated partition values, the second is the weights (counts).
    """
    # TODO(b/184863356): remove the numpy escape hatch here.
    return tf.numpy_function(
        lambda s: _compress_summary_numpy(s, epsilon), [summary], tf.float32
    )


def _compress_summary_numpy(summary, epsilon):
    """Compress a summary with numpy."""
    if summary.shape[1] * epsilon < 1:
        return summary

    percents = epsilon + np.arange(0.0, 1.0, epsilon)
    cum_weights = summary[1].cumsum()
    cum_weight_percents = cum_weights / cum_weights[-1]
    new_bins = np.interp(percents, cum_weight_percents, summary[0])
    cum_weights = np.interp(percents, cum_weight_percents, cum_weights)
    new_weights = cum_weights - np.concatenate(
        (np.array([0]), cum_weights[:-1])
    )
    summary = np.stack((new_bins, new_weights))
    return summary.astype(np.float32)
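
# A worked example of the compression step (hypothetical numbers, shown only
# to trace the interpolation above):
#
#     summary = np.array([[1.0, 2.0, 3.0, 4.0],   # bin values
#                         [1.0, 1.0, 1.0, 1.0]])  # weights
#     _compress_summary_numpy(summary, epsilon=0.5)
#     # percents = [0.5, 1.0]; cumulative weight percents = [0.25, 0.5,
#     # 0.75, 1.0]; interpolating gives bins [2.0, 4.0] with weights
#     # [2.0, 2.0]:
#     # [[2.0, 4.0], [2.0, 2.0]]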


def merge_summaries(prev_summary, next_summary, epsilon):
    """Weighted merge sort of summaries.

    Given two summaries of distinct data, this function merges (and
    compresses) them to stay within `epsilon` error tolerance.

    Args:
      prev_summary: 2D `np.ndarray` summary to be merged with `next_summary`.
      next_summary: 2D `np.ndarray` summary to be merged with `prev_summary`.
      epsilon: A float that determines the approximate desired precision.

    Returns:
      A 2D `np.ndarray` that is a merged summary. First column is the
      interpolated partition values, the second is the weights (counts).
    """
    merged = tf.concat((prev_summary, next_summary), axis=1)
    merged = tf.gather(merged, tf.argsort(merged[0]), axis=1)
    return compress(merged, epsilon)
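
# Sketch of a merge, continuing the hypothetical numbers above: the two
# summaries are interleaved by bin value and then re-compressed.
#
#     prev = tf.constant([[2.0], [2.0]])  # one bin at 2.0, weight 2
#     new = tf.constant([[1.0], [2.0]])   # one bin at 1.0, weight 2
#     merge_summaries(prev, new, epsilon=0.5)
#     # the sorted concatenation is [[1.0, 2.0], [2.0, 2.0]], which is
#     # already within epsilon accuracy, so compress returns it unchanged.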


def get_bin_boundaries(summary, num_bins):
    return compress(summary, 1.0 / num_bins)[0, :-1]
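
# Sketch: `num_bins` bins need `num_bins - 1` interior boundaries; compress
# yields one partition value per bin and the final (maximum) value is
# dropped (hypothetical numbers again):
#
#     summary = np.array([[1.0, 2.0, 3.0, 4.0], [1.0, 1.0, 1.0, 1.0]])
#     get_bin_boundaries(summary, num_bins=2)
#     # compress(summary, 0.5) -> [[2.0, 4.0], [2.0, 2.0]]; dropping the
#     # last column's value leaves the single boundary [2.0].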


@keras_export(
    "keras.layers.Discretization",
    "keras.layers.experimental.preprocessing.Discretization",
)
class Discretization(base_preprocessing_layer.PreprocessingLayer):
141 """A preprocessing layer which buckets continuous features by ranges.
143 This layer will place each element of its input data into one of several
144 contiguous ranges and output an integer index indicating which range each
145 element was placed in.
147 For an overview and full list of preprocessing layers, see the preprocessing
148 [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers).
150 Input shape:
151 Any `tf.Tensor` or `tf.RaggedTensor` of dimension 2 or higher.
153 Output shape:
154 Same as input shape.

    Arguments:
      bin_boundaries: A list of bin boundaries. The leftmost and rightmost
        bins will always extend to `-inf` and `inf`, so
        `bin_boundaries=[0., 1., 2.]` generates bins `(-inf, 0.)`,
        `[0., 1.)`, `[1., 2.)`, and `[2., +inf)`. If this option is set,
        `adapt()` should not be called.
      num_bins: The integer number of bins to compute. If this option is set,
        `adapt()` should be called to learn the bin boundaries.
      epsilon: Error tolerance, typically a small fraction close to zero
        (e.g. 0.01). Higher values of epsilon increase the quantile
        approximation error, and hence result in more unequal buckets, but
        can improve performance and lower resource consumption.
      output_mode: Specification for the output of the layer. Values can be
        `"int"`, `"one_hot"`, `"multi_hot"`, or `"count"`, configuring the
        layer as follows:
        - `"int"`: Return the discretized bin indices directly.
        - `"one_hot"`: Encodes each individual element in the input into an
          array the same size as `num_bins`, containing a 1 at the input's
          bin index. If the last dimension is size 1, will encode on that
          dimension. If the last dimension is not size 1, will append a new
          dimension for the encoded output.
        - `"multi_hot"`: Encodes each sample in the input into a single
          array the same size as `num_bins`, containing a 1 for each bin
          index present in the sample. Treats the last dimension as the
          sample dimension: if input shape is `(..., sample_length)`, output
          shape will be `(..., num_tokens)`.
        - `"count"`: As `"multi_hot"`, but the int array contains a count of
          the number of times the bin index appeared in the sample.
        Defaults to `"int"`.
      sparse: Boolean. Only applicable to `"one_hot"`, `"multi_hot"`, and
        `"count"` output modes. If `True`, returns a `SparseTensor` instead
        of a dense `Tensor`. Defaults to `False`.

    Examples:

    Bucketize float values based on provided buckets.

    >>> input = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]])
    >>> layer = tf.keras.layers.Discretization(bin_boundaries=[0., 1., 2.])
    >>> layer(input)
    <tf.Tensor: shape=(2, 4), dtype=int64, numpy=
    array([[0, 2, 3, 1],
           [1, 3, 2, 1]])>

    Bucketize float values based on a number of buckets to compute.

    >>> input = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]])
    >>> layer = tf.keras.layers.Discretization(num_bins=4, epsilon=0.01)
    >>> layer.adapt(input)
    >>> layer(input)
    <tf.Tensor: shape=(2, 4), dtype=int64, numpy=
    array([[0, 2, 3, 2],
           [1, 3, 3, 1]])>
    """

    def __init__(
        self,
        bin_boundaries=None,
        num_bins=None,
        epsilon=0.01,
        output_mode="int",
        sparse=False,
        **kwargs,
    ):
        # bins is a deprecated arg for setting bin_boundaries or num_bins
        # that still has some usage.
        if "bins" in kwargs:
            logging.warning(
                "bins is deprecated, "
                "please use bin_boundaries or num_bins instead."
            )
            if isinstance(kwargs["bins"], int) and num_bins is None:
                num_bins = kwargs["bins"]
            elif bin_boundaries is None:
                bin_boundaries = kwargs["bins"]
            del kwargs["bins"]
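
        # For illustration (hypothetical calls): the legacy
        # `Discretization(bins=4)` is treated as `num_bins=4`, while
        # `Discretization(bins=[0., 1.])` is treated as
        # `bin_boundaries=[0., 1.]`.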

        # By default, output int64 when output_mode='int' and floats
        # otherwise.
        if "dtype" not in kwargs or kwargs["dtype"] is None:
            kwargs["dtype"] = (
                tf.int64 if output_mode == INT else backend.floatx()
            )
        elif (
            output_mode == "int"
            and not tf.as_dtype(kwargs["dtype"]).is_integer
        ):
            # Compat for when dtype was always floating and ignored by the
            # layer.
            kwargs["dtype"] = tf.int64

        super().__init__(**kwargs)
        base_preprocessing_layer.keras_kpl_gauge.get_cell(
            "Discretization"
        ).set(True)

        # Check dtype only after base layer parses it; dtype parsing is
        # complex.
        if (
            output_mode == INT
            and not tf.as_dtype(self.compute_dtype).is_integer
        ):
            input_dtype = kwargs["dtype"]
            raise ValueError(
                "When `output_mode='int'`, `dtype` should be an integer "
                f"type. Received: dtype={input_dtype}"
            )

        # 'output_mode' must be one of (INT, ONE_HOT, MULTI_HOT, COUNT)
        layer_utils.validate_string_arg(
            output_mode,
            allowable_strings=(INT, ONE_HOT, MULTI_HOT, COUNT),
            layer_name=self.__class__.__name__,
            arg_name="output_mode",
        )

        if sparse and output_mode == INT:
            raise ValueError(
                "`sparse` may only be true if `output_mode` is "
                "`'one_hot'`, `'multi_hot'`, or `'count'`. "
                f"Received: sparse={sparse} and "
                f"output_mode={output_mode}"
            )

        if num_bins is not None and num_bins < 0:
            raise ValueError(
                "`num_bins` must be greater than or equal to 0. "
                "You passed `num_bins={}`".format(num_bins)
            )
        if num_bins is not None and bin_boundaries is not None:
            raise ValueError(
                "Both `num_bins` and `bin_boundaries` should not be "
                "set. You passed `num_bins={}` and "
                "`bin_boundaries={}`".format(num_bins, bin_boundaries)
            )
        bin_boundaries = utils.listify_tensors(bin_boundaries)
        self.input_bin_boundaries = bin_boundaries
        self.bin_boundaries = (
            bin_boundaries if bin_boundaries is not None else []
        )
        self.num_bins = num_bins
        self.epsilon = epsilon
        self.output_mode = output_mode
        self.sparse = sparse

    def build(self, input_shape):
        super().build(input_shape)

        if self.input_bin_boundaries is not None:
            return

        # Summary contains two equal length vectors of bins at index 0 and
        # weights at index 1.
        self.summary = self.add_weight(
            name="summary",
            shape=(2, None),
            dtype=tf.float32,
            initializer=lambda shape, dtype: [
                [],
                [],
            ],
            trainable=False,
        )

    # We override this method solely to generate a docstring.
    def adapt(self, data, batch_size=None, steps=None):
        """Computes bin boundaries from quantiles in an input dataset.

        Calling `adapt()` on a `Discretization` layer is an alternative to
        passing in a `bin_boundaries` argument during construction. A
        `Discretization` layer should always be either adapted over a
        dataset or passed `bin_boundaries`.

        During `adapt()`, the layer will estimate the quantile boundaries of
        the input dataset. The number of quantiles can be controlled via the
        `num_bins` argument, and the error tolerance for quantile boundaries
        can be controlled via the `epsilon` argument.

        In order to make `Discretization` efficient in any distribution
        context, the computed boundaries are kept static with respect to any
        compiled `tf.Graph`s that call the layer. As a consequence, if the
        layer is adapted a second time, any models using the layer should be
        re-compiled. For more information see
        `tf.keras.layers.experimental.preprocessing.PreprocessingLayer.adapt`.

        `adapt()` is meant only as a single machine utility to compute layer
        state. To analyze a dataset that cannot fit on a single machine, see
        [TensorFlow Transform](
        https://www.tensorflow.org/tfx/transform/get_started) for a
        multi-machine, map-reduce solution.

        Arguments:
          data: The data to train on. It can be passed either as a
            `tf.data.Dataset`, or as a numpy array.
          batch_size: Integer or `None`. Number of samples per state update.
            If unspecified, `batch_size` will default to 32. Do not specify
            the `batch_size` if your data is in the form of datasets,
            generators, or `keras.utils.Sequence` instances (since they
            generate batches).
          steps: Integer or `None`. Total number of steps (batches of
            samples). When training with input tensors such as TensorFlow
            data tensors, the default `None` is equal to the number of
            samples in your dataset divided by the batch size, or 1 if that
            cannot be determined. If `data` is a `tf.data` dataset and
            `steps` is `None`, the epoch will run until the input dataset is
            exhausted. When passing an infinitely repeating dataset, you
            must specify the `steps` argument. This argument is not
            supported with array inputs.
        """
        super().adapt(data, batch_size=batch_size, steps=steps)
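
    # A minimal sketch of adapting over a `tf.data.Dataset` (hypothetical
    # data; any batched float dataset works the same way):
    #
    #     ds = tf.data.Dataset.from_tensor_slices(
    #         np.random.uniform(size=(100, 4)).astype("float32")
    #     ).batch(32)
    #     layer = Discretization(num_bins=4)
    #     layer.adapt(ds)
    #     layer.bin_boundaries  # three learned boundaries -> four bins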

    def update_state(self, data):
        if self.input_bin_boundaries is not None:
            raise ValueError(
                "Cannot adapt a Discretization layer that has been "
                "initialized with `bin_boundaries`, use `num_bins` instead. "
                "You passed `bin_boundaries={}`.".format(
                    self.input_bin_boundaries
                )
            )

        if not self.built:
            raise RuntimeError("`build` must be called before `update_state`.")

        data = tf.convert_to_tensor(data)
        if data.dtype != tf.float32:
            data = tf.cast(data, tf.float32)
        summary = summarize(data, self.epsilon)
        self.summary.assign(
            merge_summaries(summary, self.summary, self.epsilon)
        )
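
    # Sketch of the incremental state: each `update_state` call summarizes
    # the incoming batch and folds it into the running summary, so the
    # quantile estimate covers all batches seen so far (illustrative only,
    # assuming the layer is already built with `num_bins` set):
    #
    #     layer.update_state(tf.constant([[1.0, 2.0]]))
    #     layer.update_state(tf.constant([[3.0, 4.0]]))
    #     # `layer.summary` now reflects all four values within `epsilon`.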

    def finalize_state(self):
        if self.input_bin_boundaries is not None or not self.built:
            return

        # The bucketize op only supports list boundaries.
        self.bin_boundaries = utils.listify_tensors(
            get_bin_boundaries(self.summary, self.num_bins)
        )

    def reset_state(self):
        if self.input_bin_boundaries is not None or not self.built:
            return

        self.summary.assign([[], []])

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "bin_boundaries": self.input_bin_boundaries,
                "num_bins": self.num_bins,
                "epsilon": self.epsilon,
                "output_mode": self.output_mode,
                "sparse": self.sparse,
            }
        )
        return config

    def compute_output_shape(self, input_shape):
        return input_shape

    def compute_output_signature(self, input_spec):
        output_shape = self.compute_output_shape(input_spec.shape.as_list())
        if isinstance(input_spec, tf.SparseTensorSpec):
            return tf.SparseTensorSpec(
                shape=output_shape, dtype=self.compute_dtype
            )
        return tf.TensorSpec(shape=output_shape, dtype=self.compute_dtype)

    def call(self, inputs):
        def bucketize(inputs):
            return tf.raw_ops.Bucketize(
                input=inputs, boundaries=self.bin_boundaries
            )

        if tf_utils.is_ragged(inputs):
            indices = tf.ragged.map_flat_values(bucketize, inputs)
        elif tf_utils.is_sparse(inputs):
            indices = tf.SparseTensor(
                indices=tf.identity(inputs.indices),
                values=bucketize(inputs.values),
                dense_shape=tf.identity(inputs.dense_shape),
            )
        else:
            indices = bucketize(inputs)

        return utils.encode_categorical_inputs(
            indices,
            output_mode=self.output_mode,
            depth=len(self.bin_boundaries) + 1,
            sparse=self.sparse,
            dtype=self.compute_dtype,
        )