# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Keras discretization preprocessing layer."""


import numpy as np
import tensorflow.compat.v2 as tf

from keras.src import backend
from keras.src.engine import base_preprocessing_layer
from keras.src.layers.preprocessing import preprocessing_utils as utils
from keras.src.utils import layer_utils
from keras.src.utils import tf_utils

# isort: off
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util.tf_export import keras_export

INT = utils.INT
MULTI_HOT = utils.MULTI_HOT
ONE_HOT = utils.ONE_HOT
COUNT = utils.COUNT


def summarize(values, epsilon):
    """Reduce a 1D sequence of values to a summary.

    This algorithm is based on numpy.quantile but modified to allow for
    intermediate steps between multiple data sets. It first finds the target
    number of bins as the reciprocal of epsilon and then takes the individual
    values spaced at appropriate intervals to arrive at that target.
    The final step is to return the corresponding counts between those
    values. If the target num_bins is larger than the size of values, the
    whole array is returned (with weights of 1).

    Args:
      values: 1D `np.ndarray` to be summarized.
      epsilon: A `float32` that determines the approximate desired precision.

    Returns:
      A 2D `np.ndarray` that is a summary of the inputs. First column is the
      interpolated partition values, the second is the weights (counts).
    """

    values = tf.reshape(values, [-1])
    values = tf.sort(values)
    elements = tf.cast(tf.size(values), tf.float32)
    num_buckets = 1.0 / epsilon
    increment = tf.cast(elements / num_buckets, tf.int32)
    start = increment
    step = tf.maximum(increment, 1)
    boundaries = values[start::step]
    weights = tf.ones_like(boundaries)
    weights = weights * tf.cast(step, tf.float32)
    return tf.stack([boundaries, weights])
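
# A minimal sketch of what `summarize` produces, assuming eager execution
# (the values below are illustrative, not taken from the library's tests):
#
#     values = tf.constant([4.0, 1.0, 3.0, 2.0])
#     summarize(values, epsilon=0.5)
#     # 1/epsilon = 2 target buckets over the 4 sorted values gives a step
#     # of 2, so the boundaries are the sorted values at indices [2::2],
#     # i.e. [3.0], each carrying a weight equal to the step:
#     # [[3.0], [2.0]]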


def compress(summary, epsilon):
    """Compress a summary to within `epsilon` accuracy.

    The compression step is needed to keep the summary sizes small after
    merging, and also used to return the final target boundaries. It finds
    the new bins based on interpolating cumulative weight percentages from
    the large summary. Taking the difference of the cumulative weights from
    the previous bin's cumulative weight will give the new weight for that
    bin.

    Args:
      summary: 2D `np.ndarray` summary to be compressed.
      epsilon: A `float32` that determines the approximate desired precision.

    Returns:
      A 2D `np.ndarray` that is a compressed summary. First column is the
      interpolated partition values, the second is the weights (counts).
    """
    # TODO(b/184863356): remove the numpy escape hatch here.
    return tf.numpy_function(
        lambda s: _compress_summary_numpy(s, epsilon), [summary], tf.float32
    )


def _compress_summary_numpy(summary, epsilon):
    """Compress a summary with numpy."""
    if summary.shape[1] * epsilon < 1:
        return summary

    percents = epsilon + np.arange(0.0, 1.0, epsilon)
    cum_weights = summary[1].cumsum()
    cum_weight_percents = cum_weights / cum_weights[-1]
    new_bins = np.interp(percents, cum_weight_percents, summary[0])
    cum_weights = np.interp(percents, cum_weight_percents, cum_weights)
    new_weights = cum_weights - np.concatenate(
        (np.array([0]), cum_weights[:-1])
    )
    summary = np.stack((new_bins, new_weights))
    return summary.astype(np.float32)
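
# A worked example of the compression step (hypothetical numbers, shown only
# to trace the interpolation above):
#
#     summary = np.array([[1.0, 2.0, 3.0, 4.0],   # bin values
#                         [1.0, 1.0, 1.0, 1.0]])  # weights
#     _compress_summary_numpy(summary, epsilon=0.5)
#     # percents = [0.5, 1.0]; cumulative weight percents = [0.25, 0.5,
#     # 0.75, 1.0]; interpolating gives bins [2.0, 4.0] with weights
#     # [2.0, 2.0]:
#     # [[2.0, 4.0], [2.0, 2.0]]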


def merge_summaries(prev_summary, next_summary, epsilon):
    """Weighted merge sort of summaries.

    Given two summaries of distinct data, this function merges (and
    compresses) them to stay within `epsilon` error tolerance.

    Args:
      prev_summary: 2D `np.ndarray` summary to be merged with `next_summary`.
      next_summary: 2D `np.ndarray` summary to be merged with `prev_summary`.
      epsilon: A float that determines the approximate desired precision.

    Returns:
      A 2D `np.ndarray` that is a merged summary. First column is the
      interpolated partition values, the second is the weights (counts).
    """
    merged = tf.concat((prev_summary, next_summary), axis=1)
    merged = tf.gather(merged, tf.argsort(merged[0]), axis=1)
    return compress(merged, epsilon)
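
# Sketch of a merge, continuing the hypothetical numbers above: the two
# summaries are interleaved by bin value and then re-compressed.
#
#     prev = tf.constant([[2.0], [2.0]])  # one bin at 2.0, weight 2
#     new = tf.constant([[1.0], [2.0]])   # one bin at 1.0, weight 2
#     merge_summaries(prev, new, epsilon=0.5)
#     # the sorted concatenation is [[1.0, 2.0], [2.0, 2.0]], which is
#     # already within epsilon accuracy, so compress returns it unchanged.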


def get_bin_boundaries(summary, num_bins):
    return compress(summary, 1.0 / num_bins)[0, :-1]
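
# Sketch: `num_bins` bins need `num_bins - 1` interior boundaries; compress
# yields one partition value per bin and the final (maximum) value is
# dropped (hypothetical numbers again):
#
#     summary = np.array([[1.0, 2.0, 3.0, 4.0], [1.0, 1.0, 1.0, 1.0]])
#     get_bin_boundaries(summary, num_bins=2)
#     # compress(summary, 0.5) -> [[2.0, 4.0], [2.0, 2.0]]; dropping the
#     # last column's value leaves the single boundary [2.0].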


@keras_export(
    "keras.layers.Discretization",
    "keras.layers.experimental.preprocessing.Discretization",
)
class Discretization(base_preprocessing_layer.PreprocessingLayer):
141 """A preprocessing layer which buckets continuous features by ranges.
143 This layer will place each element of its input data into one of several
144 contiguous ranges and output an integer index indicating which range each
145 element was placed in.
147 For an overview and full list of preprocessing layers, see the preprocessing
148 [guide](https://www.tensorflow.org/guide/keras/preprocessing_layers).
150 Input shape:
151 Any `tf.Tensor` or `tf.RaggedTensor` of dimension 2 or higher.
153 Output shape:
154 Same as input shape.

    Arguments:
      bin_boundaries: A list of bin boundaries. The leftmost and rightmost
        bins will always extend to `-inf` and `inf`, so
        `bin_boundaries=[0., 1., 2.]` generates bins `(-inf, 0.)`,
        `[0., 1.)`, `[1., 2.)`, and `[2., +inf)`. If this option is set,
        `adapt()` should not be called.
      num_bins: The integer number of bins to compute. If this option is set,
        `adapt()` should be called to learn the bin boundaries.
      epsilon: Error tolerance, typically a small fraction close to zero
        (e.g. 0.01). Higher values of epsilon increase the quantile
        approximation error, and hence result in more unequal buckets, but
        can improve performance and lower resource consumption.
      output_mode: Specification for the output of the layer. Values can be
        `"int"`, `"one_hot"`, `"multi_hot"`, or `"count"`, configuring the
        layer as follows:
        - `"int"`: Return the discretized bin indices directly.
        - `"one_hot"`: Encodes each individual element in the input into an
          array the same size as `num_bins`, containing a 1 at the input's
          bin index. If the last dimension is size 1, will encode on that
          dimension. If the last dimension is not size 1, will append a new
          dimension for the encoded output.
        - `"multi_hot"`: Encodes each sample in the input into a single
          array the same size as `num_bins`, containing a 1 for each bin
          index present in the sample. Treats the last dimension as the
          sample dimension: if input shape is `(..., sample_length)`, output
          shape will be `(..., num_tokens)`.
        - `"count"`: As `"multi_hot"`, but the int array contains a count of
          the number of times the bin index appeared in the sample.
        Defaults to `"int"`.
      sparse: Boolean. Only applicable to `"one_hot"`, `"multi_hot"`, and
        `"count"` output modes. If `True`, returns a `SparseTensor` instead
        of a dense `Tensor`. Defaults to `False`.

    Examples:

    Bucketize float values based on provided buckets.

    >>> input = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]])
    >>> layer = tf.keras.layers.Discretization(bin_boundaries=[0., 1., 2.])
    >>> layer(input)
    <tf.Tensor: shape=(2, 4), dtype=int64, numpy=
    array([[0, 2, 3, 1],
           [1, 3, 2, 1]])>

    Bucketize float values based on a number of buckets to compute.

    >>> input = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]])
    >>> layer = tf.keras.layers.Discretization(num_bins=4, epsilon=0.01)
    >>> layer.adapt(input)
    >>> layer(input)
    <tf.Tensor: shape=(2, 4), dtype=int64, numpy=
    array([[0, 2, 3, 2],
           [1, 3, 3, 1]])>
    """

    def __init__(
        self,
        bin_boundaries=None,
        num_bins=None,
        epsilon=0.01,
        output_mode="int",
        sparse=False,
        **kwargs,
    ):
        # bins is a deprecated arg for setting bin_boundaries or num_bins
        # that still has some usage.
        if "bins" in kwargs:
            logging.warning(
                "bins is deprecated, "
                "please use bin_boundaries or num_bins instead."
            )
            if isinstance(kwargs["bins"], int) and num_bins is None:
                num_bins = kwargs["bins"]
            elif bin_boundaries is None:
                bin_boundaries = kwargs["bins"]
            del kwargs["bins"]
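
        # For illustration (hypothetical calls): the legacy
        # `Discretization(bins=4)` is treated as `num_bins=4`, while
        # `Discretization(bins=[0., 1.])` is treated as
        # `bin_boundaries=[0., 1.]`.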

        # By default, output int64 when output_mode='int' and floats
        # otherwise.
        if "dtype" not in kwargs or kwargs["dtype"] is None:
            kwargs["dtype"] = (
                tf.int64 if output_mode == INT else backend.floatx()
            )
        elif (
            output_mode == "int"
            and not tf.as_dtype(kwargs["dtype"]).is_integer
        ):
            # Compat for when dtype was always floating and ignored by the
            # layer.
            kwargs["dtype"] = tf.int64

        super().__init__(**kwargs)
        base_preprocessing_layer.keras_kpl_gauge.get_cell(
            "Discretization"
        ).set(True)

        # Check dtype only after base layer parses it; dtype parsing is
        # complex.
        if (
            output_mode == INT
            and not tf.as_dtype(self.compute_dtype).is_integer
        ):
            input_dtype = kwargs["dtype"]
            raise ValueError(
                "When `output_mode='int'`, `dtype` should be an integer "
                f"type. Received: dtype={input_dtype}"
            )

        # 'output_mode' must be one of (INT, ONE_HOT, MULTI_HOT, COUNT)
        layer_utils.validate_string_arg(
            output_mode,
            allowable_strings=(INT, ONE_HOT, MULTI_HOT, COUNT),
            layer_name=self.__class__.__name__,
            arg_name="output_mode",
        )

        if sparse and output_mode == INT:
            raise ValueError(
                "`sparse` may only be true if `output_mode` is "
                "`'one_hot'`, `'multi_hot'`, or `'count'`. "
                f"Received: sparse={sparse} and "
                f"output_mode={output_mode}"
            )

        if num_bins is not None and num_bins < 0:
            raise ValueError(
                "`num_bins` must be greater than or equal to 0. "
                "You passed `num_bins={}`".format(num_bins)
            )
        if num_bins is not None and bin_boundaries is not None:
            raise ValueError(
                "Both `num_bins` and `bin_boundaries` should not be "
                "set. You passed `num_bins={}` and "
                "`bin_boundaries={}`".format(num_bins, bin_boundaries)
            )
        bin_boundaries = utils.listify_tensors(bin_boundaries)
        self.input_bin_boundaries = bin_boundaries
        self.bin_boundaries = (
            bin_boundaries if bin_boundaries is not None else []
        )
        self.num_bins = num_bins
        self.epsilon = epsilon
        self.output_mode = output_mode
        self.sparse = sparse

    def build(self, input_shape):
        super().build(input_shape)

        if self.input_bin_boundaries is not None:
            return

        # Summary contains two equal length vectors of bins at index 0 and
        # weights at index 1.
        self.summary = self.add_weight(
            name="summary",
            shape=(2, None),
            dtype=tf.float32,
            initializer=lambda shape, dtype: [
                [],
                [],
            ],
            trainable=False,
        )

    # We override this method solely to generate a docstring.
    def adapt(self, data, batch_size=None, steps=None):
        """Computes bin boundaries from quantiles in an input dataset.

        Calling `adapt()` on a `Discretization` layer is an alternative to
        passing in a `bin_boundaries` argument during construction. A
        `Discretization` layer should always be either adapted over a
        dataset or passed `bin_boundaries`.

        During `adapt()`, the layer will estimate the quantile boundaries of
        the input dataset. The number of quantiles can be controlled via the
        `num_bins` argument, and the error tolerance for quantile boundaries
        can be controlled via the `epsilon` argument.

        In order to make `Discretization` efficient in any distribution
        context, the computed boundaries are kept static with respect to any
        compiled `tf.Graph`s that call the layer. As a consequence, if the
        layer is adapted a second time, any models using the layer should be
        re-compiled. For more information see
        `tf.keras.layers.experimental.preprocessing.PreprocessingLayer.adapt`.

        `adapt()` is meant only as a single machine utility to compute layer
        state. To analyze a dataset that cannot fit on a single machine, see
        [TensorFlow Transform](
        https://www.tensorflow.org/tfx/transform/get_started) for a
        multi-machine, map-reduce solution.

        Arguments:
          data: The data to train on. It can be passed either as a
            `tf.data.Dataset`, or as a numpy array.
          batch_size: Integer or `None`. Number of samples per state update.
            If unspecified, `batch_size` will default to 32. Do not specify
            the `batch_size` if your data is in the form of datasets,
            generators, or `keras.utils.Sequence` instances (since they
            generate batches).
          steps: Integer or `None`. Total number of steps (batches of
            samples). When training with input tensors such as TensorFlow
            data tensors, the default `None` is equal to the number of
            samples in your dataset divided by the batch size, or 1 if that
            cannot be determined. If `data` is a `tf.data` dataset and
            `steps` is `None`, the epoch will run until the input dataset is
            exhausted. When passing an infinitely repeating dataset, you
            must specify the `steps` argument. This argument is not
            supported with array inputs.
        """
        super().adapt(data, batch_size=batch_size, steps=steps)
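
    # A minimal sketch of adapting over a `tf.data.Dataset` (hypothetical
    # data; any batched float dataset works the same way):
    #
    #     ds = tf.data.Dataset.from_tensor_slices(
    #         np.random.uniform(size=(100, 4)).astype("float32")
    #     ).batch(32)
    #     layer = Discretization(num_bins=4)
    #     layer.adapt(ds)
    #     layer.bin_boundaries  # three learned boundaries -> four bins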

    def update_state(self, data):
        if self.input_bin_boundaries is not None:
            raise ValueError(
                "Cannot adapt a Discretization layer that has been "
                "initialized with `bin_boundaries`, use `num_bins` instead. "
                "You passed `bin_boundaries={}`.".format(
                    self.input_bin_boundaries
                )
            )

        if not self.built:
            raise RuntimeError("`build` must be called before `update_state`.")

        data = tf.convert_to_tensor(data)
        if data.dtype != tf.float32:
            data = tf.cast(data, tf.float32)
        summary = summarize(data, self.epsilon)
        self.summary.assign(
            merge_summaries(summary, self.summary, self.epsilon)
        )
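
    # Sketch of the incremental state: each `update_state` call summarizes
    # the incoming batch and folds it into the running summary, so the
    # quantile estimate covers all batches seen so far (illustrative only,
    # assuming the layer is already built with `num_bins` set):
    #
    #     layer.update_state(tf.constant([[1.0, 2.0]]))
    #     layer.update_state(tf.constant([[3.0, 4.0]]))
    #     # `layer.summary` now reflects all four values within `epsilon`.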

    def finalize_state(self):
        if self.input_bin_boundaries is not None or not self.built:
            return

        # The bucketize op only supports list boundaries.
        self.bin_boundaries = utils.listify_tensors(
            get_bin_boundaries(self.summary, self.num_bins)
        )

    def reset_state(self):
        if self.input_bin_boundaries is not None or not self.built:
            return

        self.summary.assign([[], []])

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "bin_boundaries": self.input_bin_boundaries,
                "num_bins": self.num_bins,
                "epsilon": self.epsilon,
                "output_mode": self.output_mode,
                "sparse": self.sparse,
            }
        )
        return config

    def compute_output_shape(self, input_shape):
        return input_shape

    def compute_output_signature(self, input_spec):
        output_shape = self.compute_output_shape(input_spec.shape.as_list())
        if isinstance(input_spec, tf.SparseTensorSpec):
            return tf.SparseTensorSpec(
                shape=output_shape, dtype=self.compute_dtype
            )
        return tf.TensorSpec(shape=output_shape, dtype=self.compute_dtype)

    def call(self, inputs):
        def bucketize(inputs):
            return tf.raw_ops.Bucketize(
                input=inputs, boundaries=self.bin_boundaries
            )

        if tf_utils.is_ragged(inputs):
            indices = tf.ragged.map_flat_values(bucketize, inputs)
        elif tf_utils.is_sparse(inputs):
            indices = tf.SparseTensor(
                indices=tf.identity(inputs.indices),
                values=bucketize(inputs.values),
                dense_shape=tf.identity(inputs.dense_shape),
            )
        else:
            indices = bucketize(inputs)

        return utils.encode_categorical_inputs(
            indices,
            output_mode=self.output_mode,
            depth=len(self.bin_boundaries) + 1,
            sparse=self.sparse,
            dtype=self.compute_dtype,
        )