# Author: Henry Lin <hlin117@gmail.com>
#         Tom Dupré la Tour

# License: BSD


import warnings
from numbers import Integral

import numpy as np

from ..base import BaseEstimator, TransformerMixin, _fit_context
from ..utils import _safe_indexing
from ..utils._param_validation import Hidden, Interval, Options, StrOptions
from ..utils.stats import _weighted_percentile
from ..utils.validation import (
    _check_feature_names_in,
    _check_sample_weight,
    check_array,
    check_is_fitted,
    check_random_state,
)
from ._encoders import OneHotEncoder


class KBinsDiscretizer(TransformerMixin, BaseEstimator):
    """
    Bin continuous data into intervals.

    Read more in the :ref:`User Guide <preprocessing_discretization>`.

    .. versionadded:: 0.20

    Parameters
    ----------
    n_bins : int or array-like of shape (n_features,), default=5
        The number of bins to produce. Raises ValueError if ``n_bins < 2``.

    encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'
        Method used to encode the transformed result.

        - 'onehot': Encode the transformed result with one-hot encoding
          and return a sparse matrix. Ignored features are always
          stacked to the right.
        - 'onehot-dense': Encode the transformed result with one-hot encoding
          and return a dense array. Ignored features are always
          stacked to the right.
        - 'ordinal': Return the bin identifier encoded as an integer value.

    strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
        Strategy used to define the widths of the bins.

        - 'uniform': All bins in each feature have identical widths.
        - 'quantile': All bins in each feature have the same number of points.
        - 'kmeans': Values in each bin have the same nearest center of a 1D
          k-means cluster.

        For an example of the different strategies see:
        :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_strategies.py`.

    dtype : {np.float32, np.float64}, default=None
        The desired data-type for the output. If None, output dtype is
        consistent with input dtype. Only np.float32 and np.float64 are
        supported.

        .. versionadded:: 0.24

    subsample : int or None, default='warn'
        Maximum number of samples, used to fit the model, for computational
        efficiency. Defaults to 200_000 when `strategy='quantile'` and to `None`
        when `strategy='uniform'` or `strategy='kmeans'`.
        `subsample=None` means that all the training samples are used when
        computing the quantiles that determine the binning thresholds.
        Since quantile computation relies on sorting each column of `X`, which
        has an `n log(n)` time complexity, it is recommended to use subsampling
        on datasets with a very large number of samples.

        .. versionchanged:: 1.3
            The default value of `subsample` changed from `None` to `200_000` when
            `strategy="quantile"`.

        .. versionchanged:: 1.5
            The default value of `subsample` changed from `None` to `200_000` when
            `strategy="uniform"` or `strategy="kmeans"`.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for subsampling.
        Pass an int for reproducible results across multiple function calls.
        See the `subsample` parameter for more details.
        See :term:`Glossary <random_state>`.

        .. versionadded:: 1.1

    Attributes
    ----------
    bin_edges_ : ndarray of ndarray of shape (n_features,)
        The edges of each bin. Contains arrays of varying shapes
        ``(n_bins_[i] + 1,)``. Ignored features will have empty arrays.

    n_bins_ : ndarray of shape (n_features,), dtype=np.int64
        Number of bins per feature. Bins whose width is too small
        (i.e., <= 1e-8) are removed with a warning.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    Binarizer : Class used to bin values as ``0`` or
        ``1`` based on a parameter ``threshold``.

    Notes
    -----

    For a visualization of discretization on different datasets refer to
    :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_classification.py`.
    On the effect of discretization on linear models see:
    :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization.py`.
    In bin edges for feature ``i``, the first and last values are used only for
    ``inverse_transform``. During transform, bin edges are extended to::

      np.concatenate([[-np.inf], bin_edges_[i][1:-1], [np.inf]])
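
    For example, ``bin_edges_[i] = [-2., -1., 0., 1.]`` gives the search edges
    ``[-np.inf, -1., 0., np.inf]``: values below ``-1`` fall in the first bin
    and values at or above ``0`` fall in the last bin.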

    You can combine ``KBinsDiscretizer`` with
    :class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess
    part of the features.

    ``KBinsDiscretizer`` might produce constant features (e.g., when
    ``encode = 'onehot'`` and certain bins do not contain any data).
    These features can be removed with feature selection algorithms
    (e.g., :class:`~sklearn.feature_selection.VarianceThreshold`).

    Examples
    --------
    >>> from sklearn.preprocessing import KBinsDiscretizer
    >>> X = [[-2, 1, -4,   -1],
    ...      [-1, 2, -3, -0.5],
    ...      [ 0, 3, -2,  0.5],
    ...      [ 1, 4, -1,    2]]
    >>> est = KBinsDiscretizer(
    ...     n_bins=3, encode='ordinal', strategy='uniform', subsample=None
    ... )
    >>> est.fit(X)
    KBinsDiscretizer(...)
    >>> Xt = est.transform(X)
    >>> Xt  # doctest: +SKIP
    array([[ 0., 0., 0., 0.],
           [ 1., 1., 1., 0.],
           [ 2., 2., 2., 1.],
           [ 2., 2., 2., 2.]])
    ``inverse_transform`` converts the binned data back into the original
    feature space; each output value equals the midpoint of the two edges of
    its bin.

    >>> est.bin_edges_[0]
    array([-2., -1.,  0.,  1.])
    >>> est.inverse_transform(Xt)
    array([[-1.5,  1.5, -3.5, -0.5],
           [-0.5,  2.5, -2.5, -0.5],
           [ 0.5,  3.5, -1.5,  0.5],
           [ 0.5,  3.5, -1.5,  1.5]])
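
    One-hot encodings instead produce one output column per bin. For instance,
    refitting the same configuration with ``encode='onehot-dense'`` expands
    the 4 features into 3 columns each:

    >>> est_dense = KBinsDiscretizer(
    ...     n_bins=3, encode='onehot-dense', strategy='uniform', subsample=None
    ... ).fit(X)
    >>> est_dense.transform(X).shape
    (4, 12)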
    """

    _parameter_constraints: dict = {
        "n_bins": [Interval(Integral, 2, None, closed="left"), "array-like"],
        "encode": [StrOptions({"onehot", "onehot-dense", "ordinal"})],
        "strategy": [StrOptions({"uniform", "quantile", "kmeans"})],
        "dtype": [Options(type, {np.float64, np.float32}), None],
        "subsample": [
            Interval(Integral, 1, None, closed="left"),
            None,
            Hidden(StrOptions({"warn"})),
        ],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        n_bins=5,
        *,
        encode="onehot",
        strategy="quantile",
        dtype=None,
        subsample="warn",
        random_state=None,
    ):
        self.n_bins = n_bins
        self.encode = encode
        self.strategy = strategy
        self.dtype = dtype
        self.subsample = subsample
        self.random_state = random_state

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None, sample_weight=None):
        """
        Fit the estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        sample_weight : ndarray of shape (n_samples,), default=None
            Contains weight values to be associated with each sample.
            Cannot be used when `strategy` is set to `"uniform"`.

            .. versionadded:: 1.3

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X = self._validate_data(X, dtype="numeric")

        if self.dtype in (np.float64, np.float32):
            output_dtype = self.dtype
        else:  # self.dtype is None
            output_dtype = X.dtype

        n_samples, n_features = X.shape

        if sample_weight is not None and self.strategy == "uniform":
            raise ValueError(
                "`sample_weight` was provided but it cannot be "
                "used with strategy='uniform'. Got strategy="
                f"{self.strategy!r} instead."
            )

        if self.strategy in ("uniform", "kmeans") and self.subsample == "warn":
            warnings.warn(
                (
                    "In version 1.5 onwards, subsample=200_000 "
                    "will be used by default. Set subsample explicitly to "
                    "silence this warning in the meantime. Set "
                    "subsample=None to disable subsampling explicitly."
                ),
                FutureWarning,
            )

        subsample = self.subsample
        if subsample == "warn":
            subsample = 200_000 if self.strategy == "quantile" else None
        if subsample is not None and n_samples > subsample:
            rng = check_random_state(self.random_state)
            subsample_idx = rng.choice(n_samples, size=subsample, replace=False)
            X = _safe_indexing(X, subsample_idx)
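            # e.g. with n_samples == 1_000_000 and subsample == 200_000, the
            # bin edges below are estimated from a random 20% of the rows,
            # drawn without replacement.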

        n_features = X.shape[1]
        n_bins = self._validate_n_bins(n_features)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

        bin_edges = np.zeros(n_features, dtype=object)
        for jj in range(n_features):
            column = X[:, jj]
            col_min, col_max = column.min(), column.max()

            if col_min == col_max:
                warnings.warn(
                    "Feature %d is constant and will be replaced with 0." % jj
                )
                n_bins[jj] = 1
                bin_edges[jj] = np.array([-np.inf, np.inf])
                continue

            if self.strategy == "uniform":
                bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1)

            elif self.strategy == "quantile":
                quantiles = np.linspace(0, 100, n_bins[jj] + 1)
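                # e.g. n_bins[jj] == 4 gives the percentile grid
                # [0., 25., 50., 75., 100.]; using these percentiles as edges
                # puts roughly the same number of training points in each bin.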
                if sample_weight is None:
                    bin_edges[jj] = np.asarray(np.percentile(column, quantiles))
                else:
                    bin_edges[jj] = np.asarray(
                        [
                            _weighted_percentile(column, sample_weight, q)
                            for q in quantiles
                        ],
                        dtype=np.float64,
                    )
            elif self.strategy == "kmeans":
                from ..cluster import KMeans  # fixes import loops

                # Deterministic initialization with uniform spacing
                uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1)
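                # Midpoints of the uniform bins, e.g. uniform_edges
                # [0., 1., 2.] -> init [[0.5], [1.5]]; KMeans expects init of
                # shape (n_clusters, n_features), here (n_bins[jj], 1).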
                init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5

                # 1D k-means procedure
                km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
                centers = km.fit(
                    column[:, None], sample_weight=sample_weight
                ).cluster_centers_[:, 0]
                # Must sort, centers may be unsorted even with sorted init
                centers.sort()
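                # Interior edges are midpoints between consecutive sorted
                # centers, padded with the column min/max below, e.g. centers
                # [1., 3., 7.] -> edges [col_min, 2., 5., col_max].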
                bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5
                bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]

            # Remove bins whose width is too small (i.e., <= 1e-8)
            if self.strategy in ("quantile", "kmeans"):
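                # e.g. edges [0., 0., 1.] -> widths [inf, 0., 1.] -> mask
                # [True, False, True], dropping the duplicated edge;
                # to_begin=np.inf always keeps the first edge.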
                mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8
                bin_edges[jj] = bin_edges[jj][mask]
                if len(bin_edges[jj]) - 1 != n_bins[jj]:
                    warnings.warn(
                        "Bins whose width is too small (i.e., <= "
                        "1e-8) in feature %d are removed. Consider "
                        "decreasing the number of bins." % jj
                    )
                    n_bins[jj] = len(bin_edges[jj]) - 1

        self.bin_edges_ = bin_edges
        self.n_bins_ = n_bins

        if "onehot" in self.encode:
            self._encoder = OneHotEncoder(
                categories=[np.arange(i) for i in self.n_bins_],
                sparse_output=self.encode == "onehot",
                dtype=output_dtype,
            )
            # Fit the OneHotEncoder with a toy dataset
            # so that it's ready for use after the KBinsDiscretizer is fitted
            self._encoder.fit(np.zeros((1, len(self.n_bins_))))
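            # With `categories` given explicitly, a single all-zeros row is
            # enough: fit only sets up the output layout and does not infer
            # categories from the data.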

        return self

    def _validate_n_bins(self, n_features):
        """Returns n_bins_, the number of bins per feature."""
        orig_bins = self.n_bins
        if isinstance(orig_bins, Integral):
            return np.full(n_features, orig_bins, dtype=int)

        n_bins = check_array(orig_bins, dtype=int, copy=True, ensure_2d=False)

        if n_bins.ndim > 1 or n_bins.shape[0] != n_features:
            raise ValueError("n_bins must be a scalar or array of shape (n_features,).")

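        # The int cast above truncates, so comparing against the original
        # values also flags non-integral entries (e.g. 3.5 -> 3 != 3.5),
        # alongside any bin count below 2.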
        bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins)

        violating_indices = np.where(bad_nbins_value)[0]
        if violating_indices.shape[0] > 0:
            indices = ", ".join(str(i) for i in violating_indices)
            raise ValueError(
                "{} received an invalid number "
                "of bins at indices {}. Number of bins "
                "must be at least 2, and must be an int.".format(
                    KBinsDiscretizer.__name__, indices
                )
            )
        return n_bins

    def transform(self, X):
        """
        Discretize the data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        Returns
        -------
        Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}
            Data in the binned space. Will be a sparse matrix if
            `self.encode='onehot'` and ndarray otherwise.
        """
        check_is_fitted(self)

        # check input and attribute dtypes
        dtype = (np.float64, np.float32) if self.dtype is None else self.dtype
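        # A dtype tuple lets `_validate_data` keep a float32/float64 input
        # as-is and convert anything else to the first entry, np.float64.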
        Xt = self._validate_data(X, copy=True, dtype=dtype, reset=False)

        bin_edges = self.bin_edges_
        for jj in range(Xt.shape[1]):
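            # Rank each value against the interior edges only, so values
            # beyond the outermost edges land in the first or last bin, e.g.
            # edges [0., 1., 2., 3.] -> interior [1., 2.], and
            # np.searchsorted([1., 2.], [-5., 1., 9.], side="right")
            # returns bins [0, 1, 2].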
            Xt[:, jj] = np.searchsorted(bin_edges[jj][1:-1], Xt[:, jj], side="right")

        if self.encode == "ordinal":
            return Xt

        dtype_init = None
        if "onehot" in self.encode:
            dtype_init = self._encoder.dtype
            self._encoder.dtype = Xt.dtype
        try:
            Xt_enc = self._encoder.transform(Xt)
        finally:
            # revert the initial dtype to avoid modifying self.
            self._encoder.dtype = dtype_init
        return Xt_enc

    def inverse_transform(self, Xt):
        """
        Transform discretized data back to original feature space.

        Note that this function does not regenerate the original data
        due to discretization rounding.

        Parameters
        ----------
        Xt : array-like of shape (n_samples, n_features)
            Transformed data in the binned space.

        Returns
        -------
        Xinv : ndarray, dtype={np.float32, np.float64}
            Data in the original feature space.
        """
        check_is_fitted(self)

        if "onehot" in self.encode:
            Xt = self._encoder.inverse_transform(Xt)

        Xinv = check_array(Xt, copy=True, dtype=(np.float64, np.float32))
        n_features = self.n_bins_.shape[0]
        if Xinv.shape[1] != n_features:
            raise ValueError(
                "Incorrect number of features. Expecting {}, received {}.".format(
                    n_features, Xinv.shape[1]
                )
            )

        for jj in range(n_features):
            bin_edges = self.bin_edges_[jj]
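            # Each ordinal code indexes into the midpoints of consecutive
            # edges, e.g. edges [-2., -1., 0., 1.] -> centers
            # [-1.5, -0.5, 0.5].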
            bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5
            Xinv[:, jj] = bin_centers[(Xinv[:, jj]).astype(np.int64)]

        return Xinv

    def get_feature_names_out(self, input_features=None):
        """Get output feature names.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self, "n_features_in_")
        input_features = _check_feature_names_in(self, input_features)
        if hasattr(self, "_encoder"):
            return self._encoder.get_feature_names_out(input_features)

        # ordinal encoding
        return input_features