1# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
2# Mathieu Blondel <mathieu@mblondel.org>
3# Olivier Grisel <olivier.grisel@ensta.org>
4# Andreas Mueller <amueller@ais.uni-bonn.de>
5# Eric Martin <eric@ericmart.in>
6# Giorgio Patrini <giorgio.patrini@anu.edu.au>
7# Eric Chang <ericchang2017@u.northwestern.edu>
8# License: BSD 3 clause
9
10
11import warnings
12from numbers import Integral, Real
13
14import numpy as np
15from scipy import optimize, sparse, stats
16from scipy.special import boxcox
17
18from ..base import (
19 BaseEstimator,
20 ClassNamePrefixFeaturesOutMixin,
21 OneToOneFeatureMixin,
22 TransformerMixin,
23 _fit_context,
24)
25from ..utils import _array_api, check_array
26from ..utils._array_api import get_namespace
27from ..utils._param_validation import Interval, Options, StrOptions, validate_params
28from ..utils.extmath import _incremental_mean_and_var, row_norms
29from ..utils.sparsefuncs import (
30 incr_mean_variance_axis,
31 inplace_column_scale,
32 mean_variance_axis,
33 min_max_axis,
34)
35from ..utils.sparsefuncs_fast import (
36 inplace_csr_row_normalize_l1,
37 inplace_csr_row_normalize_l2,
38)
39from ..utils.validation import (
40 FLOAT_DTYPES,
41 _check_sample_weight,
42 check_is_fitted,
43 check_random_state,
44)
45from ._encoders import OneHotEncoder
46
47BOUNDS_THRESHOLD = 1e-7
48
49__all__ = [
50 "Binarizer",
51 "KernelCenterer",
52 "MinMaxScaler",
53 "MaxAbsScaler",
54 "Normalizer",
55 "OneHotEncoder",
56 "RobustScaler",
57 "StandardScaler",
58 "QuantileTransformer",
59 "PowerTransformer",
60 "add_dummy_feature",
61 "binarize",
62 "normalize",
63 "scale",
64 "robust_scale",
65 "maxabs_scale",
66 "minmax_scale",
67 "quantile_transform",
68 "power_transform",
69]
70
71
72def _is_constant_feature(var, mean, n_samples):
73 """Detect if a feature is indistinguishable from a constant feature.
74
75 The detection is based on its computed variance and on the theoretical
76 error bounds of the '2 pass algorithm' for variance computation.
77
78 See "Algorithms for computing the sample variance: analysis and
79 recommendations", by Chan, Golub, and LeVeque.
80 """
81 # In scikit-learn, variance is always computed using float64 accumulators.
82 eps = np.finfo(np.float64).eps
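    # Roughly, the first term of the bound below covers the relative rounding
    # error of the variance computation itself, while the second term covers
    # the cancellation error induced by a large mean (see the reference above).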
83
84 upper_bound = n_samples * eps * var + (n_samples * mean * eps) ** 2
85 return var <= upper_bound
86
87
88def _handle_zeros_in_scale(scale, copy=True, constant_mask=None):
89 """Set scales of near constant features to 1.
90
91 The goal is to avoid division by very small or zero values.
92
93 Near constant features are detected automatically by identifying
94 scales close to machine precision unless they are precomputed by
95 the caller and passed with the `constant_mask` kwarg.
96
97 Typically for standard scaling, the scales are the standard
98 deviation while near constant features are better detected on the
99 computed variances which are closer to machine precision by
100 construction.
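
    For instance, a zero scale is replaced by 1 while non-zero scales are
    left untouched:

    >>> import numpy as np
    >>> _handle_zeros_in_scale(np.array([0.0, 1.0, 2.0]))
    array([1., 1., 2.])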
101 """
102 # if we are fitting on 1D arrays, scale might be a scalar
103 if np.isscalar(scale):
104 if scale == 0.0:
105 scale = 1.0
106 return scale
107 # scale is an array
108 else:
109 xp, _ = get_namespace(scale)
110 if constant_mask is None:
111 # Detect near constant values to avoid dividing by a very small
112 # value that could lead to surprising results and numerical
113 # stability issues.
114 constant_mask = scale < 10 * xp.finfo(scale.dtype).eps
115
116 if copy:
117 # New array to avoid side-effects
118 scale = xp.asarray(scale, copy=True)
119 scale[constant_mask] = 1.0
120 return scale
121
122
123@validate_params(
124 {
125 "X": ["array-like", "sparse matrix"],
126 "axis": [Options(Integral, {0, 1})],
127 "with_mean": ["boolean"],
128 "with_std": ["boolean"],
129 "copy": ["boolean"],
130 },
131 prefer_skip_nested_validation=True,
132)
133def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True):
134 """Standardize a dataset along any axis.
135
136 Center to the mean and component wise scale to unit variance.
137
138 Read more in the :ref:`User Guide <preprocessing_scaler>`.
139
140 Parameters
141 ----------
142 X : {array-like, sparse matrix} of shape (n_samples, n_features)
143 The data to center and scale.
144
145 axis : {0, 1}, default=0
146 Axis used to compute the means and standard deviations along. If 0,
147 independently standardize each feature, otherwise (if 1) standardize
148 each sample.
149
150 with_mean : bool, default=True
151 If True, center the data before scaling.
152
153 with_std : bool, default=True
154 If True, scale the data to unit variance (or equivalently,
155 unit standard deviation).
156
157 copy : bool, default=True
158 If False, try to avoid a copy and scale in place.
159 This is not guaranteed to always work in place; e.g. if the data is
160 a numpy array with an int dtype, a copy will be returned even with
161 copy=False.
162
163 Returns
164 -------
165 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
166 The transformed data.
167
168 See Also
169 --------
170 StandardScaler : Performs scaling to unit variance using the Transformer
171 API (e.g. as part of a preprocessing
172 :class:`~sklearn.pipeline.Pipeline`).
173
174 Notes
175 -----
176 This implementation will refuse to center scipy.sparse matrices
177 since it would make them non-sparse and would potentially crash the
178 program with memory exhaustion problems.
179
180 Instead the caller is expected to either set explicitly
181 `with_mean=False` (in that case, only variance scaling will be
182 performed on the features of the CSC matrix) or to call `X.toarray()`
    if the materialized dense array is expected to fit in memory.
184
185 To avoid memory copy the caller should pass a CSC matrix.
186
187 NaNs are treated as missing values: disregarded to compute the statistics,
188 and maintained during the data transformation.
189
190 We use a biased estimator for the standard deviation, equivalent to
191 `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to
192 affect model performance.
193
194 For a comparison of the different scalers, transformers, and normalizers,
195 see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
196
197 .. warning:: Risk of data leak
198
199 Do not use :func:`~sklearn.preprocessing.scale` unless you know
200 what you are doing. A common mistake is to apply it to the entire data
201 *before* splitting into training and test sets. This will bias the
202 model evaluation because information would have leaked from the test
203 set to the training set.
204 In general, we recommend using
205 :class:`~sklearn.preprocessing.StandardScaler` within a
206 :ref:`Pipeline <pipeline>` in order to prevent most risks of data
207 leaking: `pipe = make_pipeline(StandardScaler(), LogisticRegression())`.
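
    Examples
    --------
    For instance, a small dense array can be standardized column-wise so that
    each feature ends up with zero mean and unit variance:

    >>> from sklearn.preprocessing import scale
    >>> X = [[-2, 1, 2], [-1, 0, 1]]
    >>> scale(X, axis=0)
    array([[-1.,  1.,  1.],
           [ 1., -1., -1.]])
    >>> scale(X, axis=0).mean(axis=0)
    array([0., 0., 0.])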
208 """ # noqa
209 X = check_array(
210 X,
211 accept_sparse="csc",
212 copy=copy,
213 ensure_2d=False,
214 estimator="the scale function",
215 dtype=FLOAT_DTYPES,
216 force_all_finite="allow-nan",
217 )
218 if sparse.issparse(X):
219 if with_mean:
220 raise ValueError(
221 "Cannot center sparse matrices: pass `with_mean=False` instead"
222 " See docstring for motivation and alternatives."
223 )
224 if axis != 0:
225 raise ValueError(
226 "Can only scale sparse matrix on axis=0, got axis=%d" % axis
227 )
228 if with_std:
229 _, var = mean_variance_axis(X, axis=0)
230 var = _handle_zeros_in_scale(var, copy=False)
231 inplace_column_scale(X, 1 / np.sqrt(var))
232 else:
233 X = np.asarray(X)
234 if with_mean:
235 mean_ = np.nanmean(X, axis)
236 if with_std:
237 scale_ = np.nanstd(X, axis)
238 # Xr is a view on the original array that enables easy use of
        # broadcasting on the axis in which we are interested
240 Xr = np.rollaxis(X, axis)
241 if with_mean:
242 Xr -= mean_
243 mean_1 = np.nanmean(Xr, axis=0)
244 # Verify that mean_1 is 'close to zero'. If X contains very
245 # large values, mean_1 can also be very large, due to a lack of
246 # precision of mean_. In this case, a pre-scaling of the
247 # concerned feature is efficient, for instance by its mean or
248 # maximum.
249 if not np.allclose(mean_1, 0):
250 warnings.warn(
251 "Numerical issues were encountered "
252 "when centering the data "
253 "and might not be solved. Dataset may "
254 "contain too large values. You may need "
255 "to prescale your features."
256 )
257 Xr -= mean_1
258 if with_std:
259 scale_ = _handle_zeros_in_scale(scale_, copy=False)
260 Xr /= scale_
261 if with_mean:
262 mean_2 = np.nanmean(Xr, axis=0)
263 # If mean_2 is not 'close to zero', it comes from the fact that
264 # scale_ is very small so that mean_2 = mean_1/scale_ > 0, even
265 # if mean_1 was close to zero. The problem is thus essentially
266 # due to the lack of precision of mean_. A solution is then to
267 # subtract the mean again:
268 if not np.allclose(mean_2, 0):
269 warnings.warn(
270 "Numerical issues were encountered "
271 "when scaling the data "
272 "and might not be solved. The standard "
273 "deviation of the data is probably "
274 "very close to 0. "
275 )
276 Xr -= mean_2
277 return X
278
279
280class MinMaxScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
281 """Transform features by scaling each feature to a given range.
282
283 This estimator scales and translates each feature individually such
284 that it is in the given range on the training set, e.g. between
285 zero and one.
286
287 The transformation is given by::
288
289 X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
290 X_scaled = X_std * (max - min) + min
291
292 where min, max = feature_range.
293
294 This transformation is often used as an alternative to zero mean,
295 unit variance scaling.
296
297 `MinMaxScaler` doesn't reduce the effect of outliers, but it linearly
298 scales them down into a fixed range, where the largest occurring data point
299 corresponds to the maximum value and the smallest one corresponds to the
300 minimum value. For an example visualization, refer to :ref:`Compare
301 MinMaxScaler with other scalers <plot_all_scaling_minmax_scaler_section>`.
302
303 Read more in the :ref:`User Guide <preprocessing_scaler>`.
304
305 Parameters
306 ----------
307 feature_range : tuple (min, max), default=(0, 1)
308 Desired range of transformed data.
309
310 copy : bool, default=True
311 Set to False to perform inplace row normalization and avoid a
312 copy (if the input is already a numpy array).
313
314 clip : bool, default=False
315 Set to True to clip transformed values of held-out data to
        provided `feature_range`.
317
318 .. versionadded:: 0.24
319
320 Attributes
321 ----------
322 min_ : ndarray of shape (n_features,)
323 Per feature adjustment for minimum. Equivalent to
324 ``min - X.min(axis=0) * self.scale_``
325
326 scale_ : ndarray of shape (n_features,)
327 Per feature relative scaling of the data. Equivalent to
328 ``(max - min) / (X.max(axis=0) - X.min(axis=0))``
329
330 .. versionadded:: 0.17
331 *scale_* attribute.
332
333 data_min_ : ndarray of shape (n_features,)
334 Per feature minimum seen in the data
335
336 .. versionadded:: 0.17
337 *data_min_*
338
339 data_max_ : ndarray of shape (n_features,)
340 Per feature maximum seen in the data
341
342 .. versionadded:: 0.17
343 *data_max_*
344
345 data_range_ : ndarray of shape (n_features,)
346 Per feature range ``(data_max_ - data_min_)`` seen in the data
347
348 .. versionadded:: 0.17
349 *data_range_*
350
351 n_features_in_ : int
352 Number of features seen during :term:`fit`.
353
354 .. versionadded:: 0.24
355
356 n_samples_seen_ : int
357 The number of samples processed by the estimator.
358 It will be reset on new calls to fit, but increments across
359 ``partial_fit`` calls.
360
361 feature_names_in_ : ndarray of shape (`n_features_in_`,)
362 Names of features seen during :term:`fit`. Defined only when `X`
363 has feature names that are all strings.
364
365 .. versionadded:: 1.0
366
367 See Also
368 --------
369 minmax_scale : Equivalent function without the estimator API.
370
371 Notes
372 -----
373 NaNs are treated as missing values: disregarded in fit, and maintained in
374 transform.
375
376 Examples
377 --------
378 >>> from sklearn.preprocessing import MinMaxScaler
379 >>> data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
380 >>> scaler = MinMaxScaler()
381 >>> print(scaler.fit(data))
382 MinMaxScaler()
383 >>> print(scaler.data_max_)
384 [ 1. 18.]
385 >>> print(scaler.transform(data))
386 [[0. 0. ]
387 [0.25 0.25]
388 [0.5 0.5 ]
389 [1. 1. ]]
390 >>> print(scaler.transform([[2, 2]]))
391 [[1.5 0. ]]
392 """
393
394 _parameter_constraints: dict = {
395 "feature_range": [tuple],
396 "copy": ["boolean"],
397 "clip": ["boolean"],
398 }
399
400 def __init__(self, feature_range=(0, 1), *, copy=True, clip=False):
401 self.feature_range = feature_range
402 self.copy = copy
403 self.clip = clip
404
405 def _reset(self):
406 """Reset internal data-dependent state of the scaler, if necessary.
407
408 __init__ parameters are not touched.
409 """
410 # Checking one attribute is enough, because they are all set together
411 # in partial_fit
412 if hasattr(self, "scale_"):
413 del self.scale_
414 del self.min_
415 del self.n_samples_seen_
416 del self.data_min_
417 del self.data_max_
418 del self.data_range_
419
420 def fit(self, X, y=None):
421 """Compute the minimum and maximum to be used for later scaling.
422
423 Parameters
424 ----------
425 X : array-like of shape (n_samples, n_features)
426 The data used to compute the per-feature minimum and maximum
427 used for later scaling along the features axis.
428
429 y : None
430 Ignored.
431
432 Returns
433 -------
434 self : object
435 Fitted scaler.
436 """
437 # Reset internal state before fitting
438 self._reset()
439 return self.partial_fit(X, y)
440
441 @_fit_context(prefer_skip_nested_validation=True)
442 def partial_fit(self, X, y=None):
443 """Online computation of min and max on X for later scaling.
444
445 All of X is processed as a single batch. This is intended for cases
        when :meth:`fit` is not feasible due to a very large number of
447 `n_samples` or because X is read from a continuous stream.
448
449 Parameters
450 ----------
451 X : array-like of shape (n_samples, n_features)
            The data used to compute the per-feature minimum and maximum
453 used for later scaling along the features axis.
454
455 y : None
456 Ignored.
457
458 Returns
459 -------
460 self : object
461 Fitted scaler.
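
        Examples
        --------
        For instance, the running per-feature minimum and maximum can be
        accumulated over two mini-batches and match a single full fit:

        >>> import numpy as np
        >>> from sklearn.preprocessing import MinMaxScaler
        >>> X = np.array([[-1.0, 2.0], [-0.5, 6.0], [0.0, 10.0], [1.0, 18.0]])
        >>> scaler = MinMaxScaler()
        >>> _ = scaler.partial_fit(X[:2]).partial_fit(X[2:])
        >>> scaler.data_min_
        array([-1.,  2.])
        >>> scaler.data_max_
        array([ 1., 18.])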
462 """
463 feature_range = self.feature_range
464 if feature_range[0] >= feature_range[1]:
465 raise ValueError(
466 "Minimum of desired feature range must be smaller than maximum. Got %s."
467 % str(feature_range)
468 )
469
470 if sparse.issparse(X):
471 raise TypeError(
472 "MinMaxScaler does not support sparse input. "
473 "Consider using MaxAbsScaler instead."
474 )
475
476 xp, _ = get_namespace(X)
477
478 first_pass = not hasattr(self, "n_samples_seen_")
479 X = self._validate_data(
480 X,
481 reset=first_pass,
482 dtype=_array_api.supported_float_dtypes(xp),
483 force_all_finite="allow-nan",
484 )
485
486 data_min = _array_api._nanmin(X, axis=0)
487 data_max = _array_api._nanmax(X, axis=0)
488
489 if first_pass:
490 self.n_samples_seen_ = X.shape[0]
491 else:
492 data_min = xp.minimum(self.data_min_, data_min)
493 data_max = xp.maximum(self.data_max_, data_max)
494 self.n_samples_seen_ += X.shape[0]
495
496 data_range = data_max - data_min
497 self.scale_ = (feature_range[1] - feature_range[0]) / _handle_zeros_in_scale(
498 data_range, copy=True
499 )
500 self.min_ = feature_range[0] - data_min * self.scale_
501 self.data_min_ = data_min
502 self.data_max_ = data_max
503 self.data_range_ = data_range
504 return self
505
506 def transform(self, X):
507 """Scale features of X according to feature_range.
508
509 Parameters
510 ----------
511 X : array-like of shape (n_samples, n_features)
512 Input data that will be transformed.
513
514 Returns
515 -------
516 Xt : ndarray of shape (n_samples, n_features)
517 Transformed data.
518 """
519 check_is_fitted(self)
520
521 xp, _ = get_namespace(X)
522
523 X = self._validate_data(
524 X,
525 copy=self.copy,
526 dtype=_array_api.supported_float_dtypes(xp),
527 force_all_finite="allow-nan",
528 reset=False,
529 )
530
531 X *= self.scale_
532 X += self.min_
533 if self.clip:
534 xp.clip(X, self.feature_range[0], self.feature_range[1], out=X)
535 return X
536
537 def inverse_transform(self, X):
538 """Undo the scaling of X according to feature_range.
539
540 Parameters
541 ----------
542 X : array-like of shape (n_samples, n_features)
543 Input data that will be transformed. It cannot be sparse.
544
545 Returns
546 -------
547 Xt : ndarray of shape (n_samples, n_features)
548 Transformed data.
549 """
550 check_is_fitted(self)
551
552 xp, _ = get_namespace(X)
553
554 X = check_array(
555 X,
556 copy=self.copy,
557 dtype=_array_api.supported_float_dtypes(xp),
558 force_all_finite="allow-nan",
559 )
560
561 X -= self.min_
562 X /= self.scale_
563 return X
564
565 def _more_tags(self):
566 return {"allow_nan": True}
567
568
569@validate_params(
570 {
571 "X": ["array-like"],
572 "axis": [Options(Integral, {0, 1})],
573 },
574 prefer_skip_nested_validation=False,
575)
576def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True):
577 """Transform features by scaling each feature to a given range.
578
579 This estimator scales and translates each feature individually such
    that it is in the given range on the training set, e.g. between
581 zero and one.
582
583 The transformation is given by (when ``axis=0``)::
584
585 X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
586 X_scaled = X_std * (max - min) + min
587
588 where min, max = feature_range.
589
590 The transformation is calculated as (when ``axis=0``)::
591
592 X_scaled = scale * X + min - X.min(axis=0) * scale
593 where scale = (max - min) / (X.max(axis=0) - X.min(axis=0))
594
595 This transformation is often used as an alternative to zero mean,
596 unit variance scaling.
597
598 Read more in the :ref:`User Guide <preprocessing_scaler>`.
599
600 .. versionadded:: 0.17
601 *minmax_scale* function interface
602 to :class:`~sklearn.preprocessing.MinMaxScaler`.
603
604 Parameters
605 ----------
606 X : array-like of shape (n_samples, n_features)
607 The data.
608
609 feature_range : tuple (min, max), default=(0, 1)
610 Desired range of transformed data.
611
612 axis : {0, 1}, default=0
613 Axis used to scale along. If 0, independently scale each feature,
614 otherwise (if 1) scale each sample.
615
616 copy : bool, default=True
617 If False, try to avoid a copy and scale in place.
618 This is not guaranteed to always work in place; e.g. if the data is
619 a numpy array with an int dtype, a copy will be returned even with
620 copy=False.
621
622 Returns
623 -------
624 X_tr : ndarray of shape (n_samples, n_features)
625 The transformed data.
626
627 .. warning:: Risk of data leak
628
629 Do not use :func:`~sklearn.preprocessing.minmax_scale` unless you know
630 what you are doing. A common mistake is to apply it to the entire data
631 *before* splitting into training and test sets. This will bias the
632 model evaluation because information would have leaked from the test
633 set to the training set.
634 In general, we recommend using
635 :class:`~sklearn.preprocessing.MinMaxScaler` within a
636 :ref:`Pipeline <pipeline>` in order to prevent most risks of data
637 leaking: `pipe = make_pipeline(MinMaxScaler(), LogisticRegression())`.
638
639 See Also
640 --------
641 MinMaxScaler : Performs scaling to a given range using the Transformer
642 API (e.g. as part of a preprocessing
643 :class:`~sklearn.pipeline.Pipeline`).
644
645 Notes
646 -----
647 For a comparison of the different scalers, transformers, and normalizers,
648 see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
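
    Examples
    --------
    For instance, each column of a small array can be mapped to the default
    [0, 1] range independently of the others:

    >>> from sklearn.preprocessing import minmax_scale
    >>> X = [[-2, 1, 2], [-1, 0, 1]]
    >>> minmax_scale(X, axis=0)
    array([[0., 1., 1.],
           [1., 0., 0.]])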
649 """
650 # Unlike the scaler object, this function allows 1d input.
651 # If copy is required, it will be done inside the scaler object.
652 X = check_array(
653 X, copy=False, ensure_2d=False, dtype=FLOAT_DTYPES, force_all_finite="allow-nan"
654 )
655 original_ndim = X.ndim
656
657 if original_ndim == 1:
658 X = X.reshape(X.shape[0], 1)
659
660 s = MinMaxScaler(feature_range=feature_range, copy=copy)
661 if axis == 0:
662 X = s.fit_transform(X)
663 else:
664 X = s.fit_transform(X.T).T
665
666 if original_ndim == 1:
667 X = X.ravel()
668
669 return X
670
671
672class StandardScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
673 """Standardize features by removing the mean and scaling to unit variance.
674
675 The standard score of a sample `x` is calculated as:
676
677 z = (x - u) / s
678
679 where `u` is the mean of the training samples or zero if `with_mean=False`,
680 and `s` is the standard deviation of the training samples or one if
681 `with_std=False`.
682
683 Centering and scaling happen independently on each feature by computing
684 the relevant statistics on the samples in the training set. Mean and
685 standard deviation are then stored to be used on later data using
686 :meth:`transform`.
687
688 Standardization of a dataset is a common requirement for many
689 machine learning estimators: they might behave badly if the
690 individual features do not more or less look like standard normally
691 distributed data (e.g. Gaussian with 0 mean and unit variance).
692
693 For instance many elements used in the objective function of
694 a learning algorithm (such as the RBF kernel of Support Vector
695 Machines or the L1 and L2 regularizers of linear models) assume that
696 all features are centered around 0 and have variance in the same
697 order. If a feature has a variance that is orders of magnitude larger
698 than others, it might dominate the objective function and make the
699 estimator unable to learn from other features correctly as expected.
700
701 `StandardScaler` is sensitive to outliers, and the features may scale
702 differently from each other in the presence of outliers. For an example
703 visualization, refer to :ref:`Compare StandardScaler with other scalers
704 <plot_all_scaling_standard_scaler_section>`.
705
706 This scaler can also be applied to sparse CSR or CSC matrices by passing
707 `with_mean=False` to avoid breaking the sparsity structure of the data.
708
709 Read more in the :ref:`User Guide <preprocessing_scaler>`.
710
711 Parameters
712 ----------
713 copy : bool, default=True
714 If False, try to avoid a copy and do inplace scaling instead.
715 This is not guaranteed to always work inplace; e.g. if the data is
716 not a NumPy array or scipy.sparse CSR matrix, a copy may still be
717 returned.
718
719 with_mean : bool, default=True
720 If True, center the data before scaling.
721 This does not work (and will raise an exception) when attempted on
722 sparse matrices, because centering them entails building a dense
723 matrix which in common use cases is likely to be too large to fit in
724 memory.
725
726 with_std : bool, default=True
727 If True, scale the data to unit variance (or equivalently,
728 unit standard deviation).
729
730 Attributes
731 ----------
732 scale_ : ndarray of shape (n_features,) or None
733 Per feature relative scaling of the data to achieve zero mean and unit
734 variance. Generally this is calculated using `np.sqrt(var_)`. If a
735 variance is zero, we can't achieve unit variance, and the data is left
736 as-is, giving a scaling factor of 1. `scale_` is equal to `None`
737 when `with_std=False`.
738
739 .. versionadded:: 0.17
740 *scale_*
741
742 mean_ : ndarray of shape (n_features,) or None
743 The mean value for each feature in the training set.
744 Equal to ``None`` when ``with_mean=False`` and ``with_std=False``.
745
746 var_ : ndarray of shape (n_features,) or None
747 The variance for each feature in the training set. Used to compute
748 `scale_`. Equal to ``None`` when ``with_mean=False`` and
749 ``with_std=False``.
750
751 n_features_in_ : int
752 Number of features seen during :term:`fit`.
753
754 .. versionadded:: 0.24
755
756 feature_names_in_ : ndarray of shape (`n_features_in_`,)
757 Names of features seen during :term:`fit`. Defined only when `X`
758 has feature names that are all strings.
759
760 .. versionadded:: 1.0
761
762 n_samples_seen_ : int or ndarray of shape (n_features,)
763 The number of samples processed by the estimator for each feature.
        If there are no missing samples, the ``n_samples_seen_`` will be an
        integer, otherwise it will be an array of dtype int. If
        `sample_weight` is used, it will be a float (if no missing data)
767 or an array of dtype float that sums the weights seen so far.
768 Will be reset on new calls to fit, but increments across
769 ``partial_fit`` calls.
770
771 See Also
772 --------
773 scale : Equivalent function without the estimator API.
774
775 :class:`~sklearn.decomposition.PCA` : Further removes the linear
776 correlation across features with 'whiten=True'.
777
778 Notes
779 -----
780 NaNs are treated as missing values: disregarded in fit, and maintained in
781 transform.
782
783 We use a biased estimator for the standard deviation, equivalent to
784 `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to
785 affect model performance.
786
787 Examples
788 --------
789 >>> from sklearn.preprocessing import StandardScaler
790 >>> data = [[0, 0], [0, 0], [1, 1], [1, 1]]
791 >>> scaler = StandardScaler()
792 >>> print(scaler.fit(data))
793 StandardScaler()
794 >>> print(scaler.mean_)
795 [0.5 0.5]
796 >>> print(scaler.transform(data))
797 [[-1. -1.]
798 [-1. -1.]
799 [ 1. 1.]
800 [ 1. 1.]]
801 >>> print(scaler.transform([[2, 2]]))
802 [[3. 3.]]
803 """
804
805 _parameter_constraints: dict = {
806 "copy": ["boolean"],
807 "with_mean": ["boolean"],
808 "with_std": ["boolean"],
809 }
810
811 def __init__(self, *, copy=True, with_mean=True, with_std=True):
812 self.with_mean = with_mean
813 self.with_std = with_std
814 self.copy = copy
815
816 def _reset(self):
817 """Reset internal data-dependent state of the scaler, if necessary.
818
819 __init__ parameters are not touched.
820 """
821 # Checking one attribute is enough, because they are all set together
822 # in partial_fit
823 if hasattr(self, "scale_"):
824 del self.scale_
825 del self.n_samples_seen_
826 del self.mean_
827 del self.var_
828
829 def fit(self, X, y=None, sample_weight=None):
830 """Compute the mean and std to be used for later scaling.
831
832 Parameters
833 ----------
834 X : {array-like, sparse matrix} of shape (n_samples, n_features)
835 The data used to compute the mean and standard deviation
836 used for later scaling along the features axis.
837
838 y : None
839 Ignored.
840
841 sample_weight : array-like of shape (n_samples,), default=None
842 Individual weights for each sample.
843
844 .. versionadded:: 0.24
845 parameter *sample_weight* support to StandardScaler.
846
847 Returns
848 -------
849 self : object
850 Fitted scaler.
851 """
852 # Reset internal state before fitting
853 self._reset()
854 return self.partial_fit(X, y, sample_weight)
855
856 @_fit_context(prefer_skip_nested_validation=True)
857 def partial_fit(self, X, y=None, sample_weight=None):
858 """Online computation of mean and std on X for later scaling.
859
860 All of X is processed as a single batch. This is intended for cases
        when :meth:`fit` is not feasible due to a very large number of
862 `n_samples` or because X is read from a continuous stream.
863
864 The algorithm for incremental mean and std is given in Equation 1.5a,b
865 in Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. "Algorithms
866 for computing the sample variance: Analysis and recommendations."
        The American Statistician 37.3 (1983): 242-247.
868
869 Parameters
870 ----------
871 X : {array-like, sparse matrix} of shape (n_samples, n_features)
872 The data used to compute the mean and standard deviation
873 used for later scaling along the features axis.
874
875 y : None
876 Ignored.
877
878 sample_weight : array-like of shape (n_samples,), default=None
879 Individual weights for each sample.
880
881 .. versionadded:: 0.24
882 parameter *sample_weight* support to StandardScaler.
883
884 Returns
885 -------
886 self : object
887 Fitted scaler.
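
        Examples
        --------
        For instance, the mean and variance accumulated over two mini-batches
        match the statistics of a single fit on the full data:

        >>> import numpy as np
        >>> from sklearn.preprocessing import StandardScaler
        >>> X = np.array([[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0]])
        >>> scaler = StandardScaler()
        >>> _ = scaler.partial_fit(X[:2]).partial_fit(X[2:])
        >>> scaler.mean_
        array([0.5, 0.5])
        >>> scaler.var_
        array([0.25, 0.25])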
888 """
889 first_call = not hasattr(self, "n_samples_seen_")
890 X = self._validate_data(
891 X,
892 accept_sparse=("csr", "csc"),
893 dtype=FLOAT_DTYPES,
894 force_all_finite="allow-nan",
895 reset=first_call,
896 )
897 n_features = X.shape[1]
898
899 if sample_weight is not None:
900 sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
901
902 # Even in the case of `with_mean=False`, we update the mean anyway
903 # This is needed for the incremental computation of the var
        # See incr_mean_variance_axis and _incremental_mean_and_var
905
906 # if n_samples_seen_ is an integer (i.e. no missing values), we need to
907 # transform it to a NumPy array of shape (n_features,) required by
        # incr_mean_variance_axis and _incremental_mean_and_var
909 dtype = np.int64 if sample_weight is None else X.dtype
910 if not hasattr(self, "n_samples_seen_"):
911 self.n_samples_seen_ = np.zeros(n_features, dtype=dtype)
912 elif np.size(self.n_samples_seen_) == 1:
913 self.n_samples_seen_ = np.repeat(self.n_samples_seen_, X.shape[1])
914 self.n_samples_seen_ = self.n_samples_seen_.astype(dtype, copy=False)
915
916 if sparse.issparse(X):
917 if self.with_mean:
918 raise ValueError(
919 "Cannot center sparse matrices: pass `with_mean=False` "
920 "instead. See docstring for motivation and alternatives."
921 )
922 sparse_constructor = (
923 sparse.csr_matrix if X.format == "csr" else sparse.csc_matrix
924 )
925
926 if self.with_std:
927 # First pass
928 if not hasattr(self, "scale_"):
929 self.mean_, self.var_, self.n_samples_seen_ = mean_variance_axis(
930 X, axis=0, weights=sample_weight, return_sum_weights=True
931 )
932 # Next passes
933 else:
934 (
935 self.mean_,
936 self.var_,
937 self.n_samples_seen_,
938 ) = incr_mean_variance_axis(
939 X,
940 axis=0,
941 last_mean=self.mean_,
942 last_var=self.var_,
943 last_n=self.n_samples_seen_,
944 weights=sample_weight,
945 )
946 # We force the mean and variance to float64 for large arrays
947 # See https://github.com/scikit-learn/scikit-learn/pull/12338
948 self.mean_ = self.mean_.astype(np.float64, copy=False)
949 self.var_ = self.var_.astype(np.float64, copy=False)
950 else:
951 self.mean_ = None # as with_mean must be False for sparse
952 self.var_ = None
953 weights = _check_sample_weight(sample_weight, X)
954 sum_weights_nan = weights @ sparse_constructor(
955 (np.isnan(X.data), X.indices, X.indptr), shape=X.shape
956 )
957 self.n_samples_seen_ += (np.sum(weights) - sum_weights_nan).astype(
958 dtype
959 )
960 else:
961 # First pass
962 if not hasattr(self, "scale_"):
963 self.mean_ = 0.0
964 if self.with_std:
965 self.var_ = 0.0
966 else:
967 self.var_ = None
968
969 if not self.with_mean and not self.with_std:
970 self.mean_ = None
971 self.var_ = None
972 self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0)
973
974 else:
975 self.mean_, self.var_, self.n_samples_seen_ = _incremental_mean_and_var(
976 X,
977 self.mean_,
978 self.var_,
979 self.n_samples_seen_,
980 sample_weight=sample_weight,
981 )
982
983 # for backward-compatibility, reduce n_samples_seen_ to an integer
984 # if the number of samples is the same for each feature (i.e. no
985 # missing values)
986 if np.ptp(self.n_samples_seen_) == 0:
987 self.n_samples_seen_ = self.n_samples_seen_[0]
988
989 if self.with_std:
990 # Extract the list of near constant features on the raw variances,
991 # before taking the square root.
992 constant_mask = _is_constant_feature(
993 self.var_, self.mean_, self.n_samples_seen_
994 )
995 self.scale_ = _handle_zeros_in_scale(
996 np.sqrt(self.var_), copy=False, constant_mask=constant_mask
997 )
998 else:
999 self.scale_ = None
1000
1001 return self
1002
1003 def transform(self, X, copy=None):
1004 """Perform standardization by centering and scaling.
1005
1006 Parameters
1007 ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
1009 The data used to scale along the features axis.
1010 copy : bool, default=None
1011 Copy the input X or not.
1012
1013 Returns
1014 -------
1015 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
1016 Transformed array.
1017 """
1018 check_is_fitted(self)
1019
1020 copy = copy if copy is not None else self.copy
1021 X = self._validate_data(
1022 X,
1023 reset=False,
1024 accept_sparse="csr",
1025 copy=copy,
1026 dtype=FLOAT_DTYPES,
1027 force_all_finite="allow-nan",
1028 )
1029
1030 if sparse.issparse(X):
1031 if self.with_mean:
1032 raise ValueError(
1033 "Cannot center sparse matrices: pass `with_mean=False` "
1034 "instead. See docstring for motivation and alternatives."
1035 )
1036 if self.scale_ is not None:
1037 inplace_column_scale(X, 1 / self.scale_)
1038 else:
1039 if self.with_mean:
1040 X -= self.mean_
1041 if self.with_std:
1042 X /= self.scale_
1043 return X
1044
1045 def inverse_transform(self, X, copy=None):
1046 """Scale back the data to the original representation.
1047
1048 Parameters
1049 ----------
1050 X : {array-like, sparse matrix} of shape (n_samples, n_features)
1051 The data used to scale along the features axis.
1052 copy : bool, default=None
1053 Copy the input X or not.
1054
1055 Returns
1056 -------
1057 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
1058 Transformed array.
1059 """
1060 check_is_fitted(self)
1061
1062 copy = copy if copy is not None else self.copy
1063 X = check_array(
1064 X,
1065 accept_sparse="csr",
1066 copy=copy,
1067 dtype=FLOAT_DTYPES,
1068 force_all_finite="allow-nan",
1069 )
1070
1071 if sparse.issparse(X):
1072 if self.with_mean:
1073 raise ValueError(
1074 "Cannot uncenter sparse matrices: pass `with_mean=False` "
1075 "instead See docstring for motivation and alternatives."
1076 )
1077 if self.scale_ is not None:
1078 inplace_column_scale(X, self.scale_)
1079 else:
1080 if self.with_std:
1081 X *= self.scale_
1082 if self.with_mean:
1083 X += self.mean_
1084 return X
1085
1086 def _more_tags(self):
1087 return {"allow_nan": True, "preserves_dtype": [np.float64, np.float32]}
1088
1089
1090class MaxAbsScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
1091 """Scale each feature by its maximum absolute value.
1092
    This estimator scales each feature individually such that the
    maximal absolute value of each feature in the training set will be 1.0.
    It does not shift/center the data, and thus does not destroy any sparsity.
1097
1098 This scaler can also be applied to sparse CSR or CSC matrices.
1099
1100 `MaxAbsScaler` doesn't reduce the effect of outliers; it only linearly
1101 scales them down. For an example visualization, refer to :ref:`Compare
1102 MaxAbsScaler with other scalers <plot_all_scaling_max_abs_scaler_section>`.
1103
1104 .. versionadded:: 0.17
1105
1106 Parameters
1107 ----------
1108 copy : bool, default=True
1109 Set to False to perform inplace scaling and avoid a copy (if the input
1110 is already a numpy array).
1111
1112 Attributes
1113 ----------
1114 scale_ : ndarray of shape (n_features,)
1115 Per feature relative scaling of the data.
1116
1117 .. versionadded:: 0.17
1118 *scale_* attribute.
1119
1120 max_abs_ : ndarray of shape (n_features,)
1121 Per feature maximum absolute value.
1122
1123 n_features_in_ : int
1124 Number of features seen during :term:`fit`.
1125
1126 .. versionadded:: 0.24
1127
1128 feature_names_in_ : ndarray of shape (`n_features_in_`,)
1129 Names of features seen during :term:`fit`. Defined only when `X`
1130 has feature names that are all strings.
1131
1132 .. versionadded:: 1.0
1133
1134 n_samples_seen_ : int
1135 The number of samples processed by the estimator. Will be reset on
1136 new calls to fit, but increments across ``partial_fit`` calls.
1137
1138 See Also
1139 --------
1140 maxabs_scale : Equivalent function without the estimator API.
1141
1142 Notes
1143 -----
1144 NaNs are treated as missing values: disregarded in fit, and maintained in
1145 transform.
1146
1147 Examples
1148 --------
1149 >>> from sklearn.preprocessing import MaxAbsScaler
1150 >>> X = [[ 1., -1., 2.],
1151 ... [ 2., 0., 0.],
1152 ... [ 0., 1., -1.]]
1153 >>> transformer = MaxAbsScaler().fit(X)
1154 >>> transformer
1155 MaxAbsScaler()
1156 >>> transformer.transform(X)
1157 array([[ 0.5, -1. , 1. ],
1158 [ 1. , 0. , 0. ],
1159 [ 0. , 1. , -0.5]])
1160 """
1161
1162 _parameter_constraints: dict = {"copy": ["boolean"]}
1163
1164 def __init__(self, *, copy=True):
1165 self.copy = copy
1166
1167 def _reset(self):
1168 """Reset internal data-dependent state of the scaler, if necessary.
1169
1170 __init__ parameters are not touched.
1171 """
1172 # Checking one attribute is enough, because they are all set together
1173 # in partial_fit
1174 if hasattr(self, "scale_"):
1175 del self.scale_
1176 del self.n_samples_seen_
1177 del self.max_abs_
1178
1179 def fit(self, X, y=None):
1180 """Compute the maximum absolute value to be used for later scaling.
1181
1182 Parameters
1183 ----------
1184 X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data used to compute the per-feature maximum absolute value
1186 used for later scaling along the features axis.
1187
1188 y : None
1189 Ignored.
1190
1191 Returns
1192 -------
1193 self : object
1194 Fitted scaler.
1195 """
1196 # Reset internal state before fitting
1197 self._reset()
1198 return self.partial_fit(X, y)
1199
1200 @_fit_context(prefer_skip_nested_validation=True)
1201 def partial_fit(self, X, y=None):
1202 """Online computation of max absolute value of X for later scaling.
1203
1204 All of X is processed as a single batch. This is intended for cases
        when :meth:`fit` is not feasible due to a very large number of
1206 `n_samples` or because X is read from a continuous stream.
1207
1208 Parameters
1209 ----------
1210 X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data used to compute the per-feature maximum absolute value
1212 used for later scaling along the features axis.
1213
1214 y : None
1215 Ignored.
1216
1217 Returns
1218 -------
1219 self : object
1220 Fitted scaler.
1221 """
1222 xp, _ = get_namespace(X)
1223
1224 first_pass = not hasattr(self, "n_samples_seen_")
1225 X = self._validate_data(
1226 X,
1227 reset=first_pass,
1228 accept_sparse=("csr", "csc"),
1229 dtype=_array_api.supported_float_dtypes(xp),
1230 force_all_finite="allow-nan",
1231 )
1232
1233 if sparse.issparse(X):
1234 mins, maxs = min_max_axis(X, axis=0, ignore_nan=True)
1235 max_abs = np.maximum(np.abs(mins), np.abs(maxs))
1236 else:
1237 max_abs = _array_api._nanmax(xp.abs(X), axis=0)
1238
1239 if first_pass:
1240 self.n_samples_seen_ = X.shape[0]
1241 else:
1242 max_abs = xp.maximum(self.max_abs_, max_abs)
1243 self.n_samples_seen_ += X.shape[0]
1244
1245 self.max_abs_ = max_abs
1246 self.scale_ = _handle_zeros_in_scale(max_abs, copy=True)
1247 return self
1248
1249 def transform(self, X):
1250 """Scale the data.
1251
1252 Parameters
1253 ----------
1254 X : {array-like, sparse matrix} of shape (n_samples, n_features)
1255 The data that should be scaled.
1256
1257 Returns
1258 -------
1259 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
1260 Transformed array.
1261 """
1262 check_is_fitted(self)
1263
1264 xp, _ = get_namespace(X)
1265
1266 X = self._validate_data(
1267 X,
1268 accept_sparse=("csr", "csc"),
1269 copy=self.copy,
1270 reset=False,
1271 dtype=_array_api.supported_float_dtypes(xp),
1272 force_all_finite="allow-nan",
1273 )
1274
1275 if sparse.issparse(X):
1276 inplace_column_scale(X, 1.0 / self.scale_)
1277 else:
1278 X /= self.scale_
1279 return X
1280
1281 def inverse_transform(self, X):
1282 """Scale back the data to the original representation.
1283
1284 Parameters
1285 ----------
1286 X : {array-like, sparse matrix} of shape (n_samples, n_features)
1287 The data that should be transformed back.
1288
1289 Returns
1290 -------
1291 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
1292 Transformed array.
1293 """
1294 check_is_fitted(self)
1295
1296 xp, _ = get_namespace(X)
1297
1298 X = check_array(
1299 X,
1300 accept_sparse=("csr", "csc"),
1301 copy=self.copy,
1302 dtype=_array_api.supported_float_dtypes(xp),
1303 force_all_finite="allow-nan",
1304 )
1305
1306 if sparse.issparse(X):
1307 inplace_column_scale(X, self.scale_)
1308 else:
1309 X *= self.scale_
1310 return X
1311
1312 def _more_tags(self):
1313 return {"allow_nan": True}
1314
1315
1316@validate_params(
1317 {
1318 "X": ["array-like", "sparse matrix"],
1319 "axis": [Options(Integral, {0, 1})],
1320 },
1321 prefer_skip_nested_validation=False,
1322)
1323def maxabs_scale(X, *, axis=0, copy=True):
1324 """Scale each feature to the [-1, 1] range without breaking the sparsity.
1325
1326 This estimator scales each feature individually such
1327 that the maximal absolute value of each feature in the
1328 training set will be 1.0.
1329
1330 This scaler can also be applied to sparse CSR or CSC matrices.
1331
1332 Parameters
1333 ----------
1334 X : {array-like, sparse matrix} of shape (n_samples, n_features)
1335 The data.
1336
1337 axis : {0, 1}, default=0
1338 Axis used to scale along. If 0, independently scale each feature,
1339 otherwise (if 1) scale each sample.
1340
1341 copy : bool, default=True
1342 If False, try to avoid a copy and scale in place.
1343 This is not guaranteed to always work in place; e.g. if the data is
1344 a numpy array with an int dtype, a copy will be returned even with
1345 copy=False.
1346
1347 Returns
1348 -------
1349 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
1350 The transformed data.
1351
1352 .. warning:: Risk of data leak
1353
1354 Do not use :func:`~sklearn.preprocessing.maxabs_scale` unless you know
1355 what you are doing. A common mistake is to apply it to the entire data
1356 *before* splitting into training and test sets. This will bias the
1357 model evaluation because information would have leaked from the test
1358 set to the training set.
1359 In general, we recommend using
1360 :class:`~sklearn.preprocessing.MaxAbsScaler` within a
1361 :ref:`Pipeline <pipeline>` in order to prevent most risks of data
1362 leaking: `pipe = make_pipeline(MaxAbsScaler(), LogisticRegression())`.
1363
1364 See Also
1365 --------
1366 MaxAbsScaler : Performs scaling to the [-1, 1] range using
1367 the Transformer API (e.g. as part of a preprocessing
1368 :class:`~sklearn.pipeline.Pipeline`).
1369
1370 Notes
1371 -----
1372 NaNs are treated as missing values: disregarded to compute the statistics,
1373 and maintained during the data transformation.
1374
1375 For a comparison of the different scalers, transformers, and normalizers,
1376 see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
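
    Examples
    --------
    For instance, each column of a small array can be divided by its maximum
    absolute value, preserving zeros and signs:

    >>> from sklearn.preprocessing import maxabs_scale
    >>> X = [[-2, 1, 2], [-1, 0, 1]]
    >>> maxabs_scale(X, axis=0)
    array([[-1. ,  1. ,  1. ],
           [-0.5,  0. ,  0.5]])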
1377 """
1378 # Unlike the scaler object, this function allows 1d input.
1379
1380 # If copy is required, it will be done inside the scaler object.
1381 X = check_array(
1382 X,
1383 accept_sparse=("csr", "csc"),
1384 copy=False,
1385 ensure_2d=False,
1386 dtype=FLOAT_DTYPES,
1387 force_all_finite="allow-nan",
1388 )
1389 original_ndim = X.ndim
1390
1391 if original_ndim == 1:
1392 X = X.reshape(X.shape[0], 1)
1393
1394 s = MaxAbsScaler(copy=copy)
1395 if axis == 0:
1396 X = s.fit_transform(X)
1397 else:
1398 X = s.fit_transform(X.T).T
1399
1400 if original_ndim == 1:
1401 X = X.ravel()
1402
1403 return X
1404
1405
1406class RobustScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
1407 """Scale features using statistics that are robust to outliers.
1408
1409 This Scaler removes the median and scales the data according to
1410 the quantile range (defaults to IQR: Interquartile Range).
1411 The IQR is the range between the 1st quartile (25th quantile)
1412 and the 3rd quartile (75th quantile).
1413
1414 Centering and scaling happen independently on each feature by
1415 computing the relevant statistics on the samples in the training
1416 set. Median and interquartile range are then stored to be used on
1417 later data using the :meth:`transform` method.
1418
1419 Standardization of a dataset is a common preprocessing for many machine
1420 learning estimators. Typically this is done by removing the mean and
1421 scaling to unit variance. However, outliers can often influence the sample
1422 mean / variance in a negative way. In such cases, using the median and the
    interquartile range often gives better results. For an example visualization
1424 and comparison to other scalers, refer to :ref:`Compare RobustScaler with
1425 other scalers <plot_all_scaling_robust_scaler_section>`.
1426
1427 .. versionadded:: 0.17
1428
1429 Read more in the :ref:`User Guide <preprocessing_scaler>`.
1430
1431 Parameters
1432 ----------
1433 with_centering : bool, default=True
1434 If `True`, center the data before scaling.
1435 This will cause :meth:`transform` to raise an exception when attempted
1436 on sparse matrices, because centering them entails building a dense
1437 matrix which in common use cases is likely to be too large to fit in
1438 memory.
1439
1440 with_scaling : bool, default=True
1441 If `True`, scale the data to interquartile range.
1442
1443 quantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0, \
1444 default=(25.0, 75.0)
1445 Quantile range used to calculate `scale_`. By default this is equal to
1446 the IQR, i.e., `q_min` is the first quantile and `q_max` is the third
1447 quantile.
1448
1449 .. versionadded:: 0.18
1450
1451 copy : bool, default=True
1452 If `False`, try to avoid a copy and do inplace scaling instead.
1453 This is not guaranteed to always work inplace; e.g. if the data is
1454 not a NumPy array or scipy.sparse CSR matrix, a copy may still be
1455 returned.
1456
1457 unit_variance : bool, default=False
1458 If `True`, scale data so that normally distributed features have a
1459 variance of 1. In general, if the difference between the x-values of
1460 `q_max` and `q_min` for a standard normal distribution is greater
1461 than 1, the dataset will be scaled down. If less than 1, the dataset
1462 will be scaled up.
1463
1464 .. versionadded:: 0.24
1465
1466 Attributes
1467 ----------
1468 center_ : array of floats
1469 The median value for each feature in the training set.
1470
1471 scale_ : array of floats
1472 The (scaled) interquartile range for each feature in the training set.
1473
1474 .. versionadded:: 0.17
1475 *scale_* attribute.
1476
1477 n_features_in_ : int
1478 Number of features seen during :term:`fit`.
1479
1480 .. versionadded:: 0.24
1481
1482 feature_names_in_ : ndarray of shape (`n_features_in_`,)
1483 Names of features seen during :term:`fit`. Defined only when `X`
1484 has feature names that are all strings.
1485
1486 .. versionadded:: 1.0
1487
1488 See Also
1489 --------
1490 robust_scale : Equivalent function without the estimator API.
1491 sklearn.decomposition.PCA : Further removes the linear correlation across
1492 features with 'whiten=True'.
1493
1494 Notes
1495 -----
1496
1497 https://en.wikipedia.org/wiki/Median
1498 https://en.wikipedia.org/wiki/Interquartile_range
1499
1500 Examples
1501 --------
1502 >>> from sklearn.preprocessing import RobustScaler
1503 >>> X = [[ 1., -2., 2.],
1504 ... [ -2., 1., 3.],
1505 ... [ 4., 1., -2.]]
1506 >>> transformer = RobustScaler().fit(X)
1507 >>> transformer
1508 RobustScaler()
1509 >>> transformer.transform(X)
1510 array([[ 0. , -2. , 0. ],
1511 [-1. , 0. , 0.4],
1512 [ 1. , 0. , -1.6]])
1513 """
1514
1515 _parameter_constraints: dict = {
1516 "with_centering": ["boolean"],
1517 "with_scaling": ["boolean"],
1518 "quantile_range": [tuple],
1519 "copy": ["boolean"],
1520 "unit_variance": ["boolean"],
1521 }
1522
1523 def __init__(
1524 self,
1525 *,
1526 with_centering=True,
1527 with_scaling=True,
1528 quantile_range=(25.0, 75.0),
1529 copy=True,
1530 unit_variance=False,
1531 ):
1532 self.with_centering = with_centering
1533 self.with_scaling = with_scaling
1534 self.quantile_range = quantile_range
1535 self.unit_variance = unit_variance
1536 self.copy = copy
1537
1538 @_fit_context(prefer_skip_nested_validation=True)
1539 def fit(self, X, y=None):
1540 """Compute the median and quantiles to be used for scaling.
1541
1542 Parameters
1543 ----------
1544 X : {array-like, sparse matrix} of shape (n_samples, n_features)
1545 The data used to compute the median and quantiles
1546 used for later scaling along the features axis.
1547
1548 y : Ignored
1549 Not used, present here for API consistency by convention.
1550
1551 Returns
1552 -------
1553 self : object
1554 Fitted scaler.
1555 """
1556 # at fit, convert sparse matrices to csc for optimized computation of
1557 # the quantiles
1558 X = self._validate_data(
1559 X,
1560 accept_sparse="csc",
1561 dtype=FLOAT_DTYPES,
1562 force_all_finite="allow-nan",
1563 )
1564
1565 q_min, q_max = self.quantile_range
1566 if not 0 <= q_min <= q_max <= 100:
1567 raise ValueError("Invalid quantile range: %s" % str(self.quantile_range))
1568
1569 if self.with_centering:
1570 if sparse.issparse(X):
1571 raise ValueError(
1572 "Cannot center sparse matrices: use `with_centering=False`"
1573 " instead. See docstring for motivation and alternatives."
1574 )
1575 self.center_ = np.nanmedian(X, axis=0)
1576 else:
1577 self.center_ = None
1578
1579 if self.with_scaling:
1580 quantiles = []
1581 for feature_idx in range(X.shape[1]):
1582 if sparse.issparse(X):
1583 column_nnz_data = X.data[
1584 X.indptr[feature_idx] : X.indptr[feature_idx + 1]
1585 ]
1586 column_data = np.zeros(shape=X.shape[0], dtype=X.dtype)
1587 column_data[: len(column_nnz_data)] = column_nnz_data
1588 else:
1589 column_data = X[:, feature_idx]
1590
1591 quantiles.append(np.nanpercentile(column_data, self.quantile_range))
1592
1593 quantiles = np.transpose(quantiles)
1594
1595 self.scale_ = quantiles[1] - quantiles[0]
1596 self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)
1597 if self.unit_variance:
1598 adjust = stats.norm.ppf(q_max / 100.0) - stats.norm.ppf(q_min / 100.0)
1599 self.scale_ = self.scale_ / adjust
1600 else:
1601 self.scale_ = None
1602
1603 return self
1604
1605 def transform(self, X):
1606 """Center and scale the data.
1607
1608 Parameters
1609 ----------
1610 X : {array-like, sparse matrix} of shape (n_samples, n_features)
1611 The data used to scale along the specified axis.
1612
1613 Returns
1614 -------
1615 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
1616 Transformed array.
1617 """
1618 check_is_fitted(self)
1619 X = self._validate_data(
1620 X,
1621 accept_sparse=("csr", "csc"),
1622 copy=self.copy,
1623 dtype=FLOAT_DTYPES,
1624 reset=False,
1625 force_all_finite="allow-nan",
1626 )
1627
1628 if sparse.issparse(X):
1629 if self.with_scaling:
1630 inplace_column_scale(X, 1.0 / self.scale_)
1631 else:
1632 if self.with_centering:
1633 X -= self.center_
1634 if self.with_scaling:
1635 X /= self.scale_
1636 return X
1637
1638 def inverse_transform(self, X):
1639 """Scale back the data to the original representation.
1640
1641 Parameters
1642 ----------
1643 X : {array-like, sparse matrix} of shape (n_samples, n_features)
1644 The rescaled data to be transformed back.
1645
1646 Returns
1647 -------
1648 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
1649 Transformed array.
1650 """
1651 check_is_fitted(self)
1652 X = check_array(
1653 X,
1654 accept_sparse=("csr", "csc"),
1655 copy=self.copy,
1656 dtype=FLOAT_DTYPES,
1657 force_all_finite="allow-nan",
1658 )
1659
1660 if sparse.issparse(X):
1661 if self.with_scaling:
1662 inplace_column_scale(X, self.scale_)
1663 else:
1664 if self.with_scaling:
1665 X *= self.scale_
1666 if self.with_centering:
1667 X += self.center_
1668 return X
1669
1670 def _more_tags(self):
1671 return {"allow_nan": True}
1672
1673
1674@validate_params(
1675 {"X": ["array-like", "sparse matrix"], "axis": [Options(Integral, {0, 1})]},
1676 prefer_skip_nested_validation=False,
1677)
1678def robust_scale(
1679 X,
1680 *,
1681 axis=0,
1682 with_centering=True,
1683 with_scaling=True,
1684 quantile_range=(25.0, 75.0),
1685 copy=True,
1686 unit_variance=False,
1687):
1688 """Standardize a dataset along any axis.
1689
1690 Center to the median and component wise scale
1691 according to the interquartile range.
1692
1693 Read more in the :ref:`User Guide <preprocessing_scaler>`.
1694
1695 Parameters
1696 ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
1698 The data to center and scale.
1699
    axis : {0, 1}, default=0
1701 Axis used to compute the medians and IQR along. If 0,
1702 independently scale each feature, otherwise (if 1) scale
1703 each sample.
1704
1705 with_centering : bool, default=True
1706 If `True`, center the data before scaling.
1707
1708 with_scaling : bool, default=True
1709 If `True`, scale the data to unit variance (or equivalently,
1710 unit standard deviation).
1711
1712 quantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0,\
1713 default=(25.0, 75.0)
1714 Quantile range used to calculate `scale_`. By default this is equal to
1715 the IQR, i.e., `q_min` is the first quantile and `q_max` is the third
1716 quantile.
1717
1718 .. versionadded:: 0.18
1719
1720 copy : bool, default=True
1721 If False, try to avoid a copy and scale in place.
1722 This is not guaranteed to always work in place; e.g. if the data is
1723 a numpy array with an int dtype, a copy will be returned even with
1724 copy=False.
1725
1726 unit_variance : bool, default=False
1727 If `True`, scale data so that normally distributed features have a
1728 variance of 1. In general, if the difference between the x-values of
1729 `q_max` and `q_min` for a standard normal distribution is greater
1730 than 1, the dataset will be scaled down. If less than 1, the dataset
1731 will be scaled up.
1732
1733 .. versionadded:: 0.24
1734
1735 Returns
1736 -------
1737 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
1738 The transformed data.
1739
1740 See Also
1741 --------
1742 RobustScaler : Performs centering and scaling using the Transformer API
1743 (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`).
1744
1745 Notes
1746 -----
1747 This implementation will refuse to center scipy.sparse matrices
1748 since it would make them non-sparse and would potentially crash the
1749 program with memory exhaustion problems.
1750
1751 Instead the caller is expected to either set explicitly
1752 `with_centering=False` (in that case, only variance scaling will be
1753 performed on the features of the CSR matrix) or to call `X.toarray()`
    if the materialized dense array is expected to fit in memory.
1755
1756 To avoid memory copy the caller should pass a CSR matrix.
1757
1758 For a comparison of the different scalers, transformers, and normalizers,
1759 see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
1760
1761 .. warning:: Risk of data leak
1762
1763 Do not use :func:`~sklearn.preprocessing.robust_scale` unless you know
1764 what you are doing. A common mistake is to apply it to the entire data
1765 *before* splitting into training and test sets. This will bias the
1766 model evaluation because information would have leaked from the test
1767 set to the training set.
1768 In general, we recommend using
1769 :class:`~sklearn.preprocessing.RobustScaler` within a
1770 :ref:`Pipeline <pipeline>` in order to prevent most risks of data
1771 leaking: `pipe = make_pipeline(RobustScaler(), LogisticRegression())`.
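
    Examples
    --------
    For instance, the same small array used in the :class:`RobustScaler`
    example gives the same result through the functional interface:

    >>> from sklearn.preprocessing import robust_scale
    >>> X = [[1.0, -2.0, 2.0], [-2.0, 1.0, 3.0], [4.0, 1.0, -2.0]]
    >>> robust_scale(X, axis=0)
    array([[ 0. , -2. ,  0. ],
           [-1. ,  0. ,  0.4],
           [ 1. ,  0. , -1.6]])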
1772 """
1773 X = check_array(
1774 X,
1775 accept_sparse=("csr", "csc"),
1776 copy=False,
1777 ensure_2d=False,
1778 dtype=FLOAT_DTYPES,
1779 force_all_finite="allow-nan",
1780 )
1781 original_ndim = X.ndim
1782
1783 if original_ndim == 1:
1784 X = X.reshape(X.shape[0], 1)
1785
1786 s = RobustScaler(
1787 with_centering=with_centering,
1788 with_scaling=with_scaling,
1789 quantile_range=quantile_range,
1790 unit_variance=unit_variance,
1791 copy=copy,
1792 )
1793 if axis == 0:
1794 X = s.fit_transform(X)
1795 else:
1796 X = s.fit_transform(X.T).T
1797
1798 if original_ndim == 1:
1799 X = X.ravel()
1800
1801 return X
1802
1803
1804@validate_params(
1805 {
1806 "X": ["array-like", "sparse matrix"],
1807 "norm": [StrOptions({"l1", "l2", "max"})],
1808 "axis": [Options(Integral, {0, 1})],
1809 "copy": ["boolean"],
1810 "return_norm": ["boolean"],
1811 },
1812 prefer_skip_nested_validation=True,
1813)
1814def normalize(X, norm="l2", *, axis=1, copy=True, return_norm=False):
1815 """Scale input vectors individually to unit norm (vector length).
1816
1817 Read more in the :ref:`User Guide <preprocessing_normalization>`.
1818
1819 Parameters
1820 ----------
1821 X : {array-like, sparse matrix} of shape (n_samples, n_features)
1822 The data to normalize, element by element.
1823 scipy.sparse matrices should be in CSR format to avoid an
        unnecessary copy.
1825
1826 norm : {'l1', 'l2', 'max'}, default='l2'
        The norm to use to normalize each non-zero sample (or each non-zero
        feature if axis is 0).
1829
1830 axis : {0, 1}, default=1
1831 Define axis used to normalize the data along. If 1, independently
1832 normalize each sample, otherwise (if 0) normalize each feature.
1833
1834 copy : bool, default=True
1835 If False, try to avoid a copy and normalize in place.
1836 This is not guaranteed to always work in place; e.g. if the data is
1837 a numpy array with an int dtype, a copy will be returned even with
1838 copy=False.
1839
1840 return_norm : bool, default=False
1841 Whether to return the computed norms.
1842
1843 Returns
1844 -------
1845 X : {ndarray, sparse matrix} of shape (n_samples, n_features)
1846 Normalized input X.
1847
1848 norms : ndarray of shape (n_samples, ) if axis=1 else (n_features, )
1849 An array of norms along given axis for X.
1850 When X is sparse, a NotImplementedError will be raised
1851 for norm 'l1' or 'l2'.
1852
1853 See Also
1854 --------
1855 Normalizer : Performs normalization using the Transformer API
1856 (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`).
1857
1858 Notes
1859 -----
1860 For a comparison of the different scalers, transformers, and normalizers,
1861 see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
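
    Examples
    --------
    A minimal illustration, reusing the data of the :class:`Normalizer`
    example; each row is divided by its l2 norm:

    >>> from sklearn.preprocessing import normalize
    >>> X = [[4, 1, 2, 2],
    ...      [1, 3, 9, 3],
    ...      [5, 7, 5, 1]]
    >>> normalize(X)
    array([[0.8, 0.2, 0.4, 0.4],
           [0.1, 0.3, 0.9, 0.3],
           [0.5, 0.7, 0.5, 0.1]])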
1862 """
1863 if axis == 0:
1864 sparse_format = "csc"
1865 else: # axis == 1:
1866 sparse_format = "csr"
1867
1868 xp, _ = get_namespace(X)
1869
1870 X = check_array(
1871 X,
1872 accept_sparse=sparse_format,
1873 copy=copy,
1874 estimator="the normalize function",
1875 dtype=_array_api.supported_float_dtypes(xp),
1876 )
1877 if axis == 0:
1878 X = X.T
1879
1880 if sparse.issparse(X):
1881 if return_norm and norm in ("l1", "l2"):
1882 raise NotImplementedError(
1883 "return_norm=True is not implemented "
1884 "for sparse matrices with norm 'l1' "
1885 "or norm 'l2'"
1886 )
1887 if norm == "l1":
1888 inplace_csr_row_normalize_l1(X)
1889 elif norm == "l2":
1890 inplace_csr_row_normalize_l2(X)
1891 elif norm == "max":
1892 mins, maxes = min_max_axis(X, 1)
1893 norms = np.maximum(abs(mins), maxes)
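            # Broadcast each row's max-abs norm to that row's stored entries;
            # np.diff(X.indptr) is the number of non-zeros per CSR row.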
1894 norms_elementwise = norms.repeat(np.diff(X.indptr))
1895 mask = norms_elementwise != 0
1896 X.data[mask] /= norms_elementwise[mask]
1897 else:
1898 if norm == "l1":
1899 norms = xp.sum(xp.abs(X), axis=1)
1900 elif norm == "l2":
1901 norms = row_norms(X)
1902 elif norm == "max":
1903 norms = xp.max(xp.abs(X), axis=1)
1904 norms = _handle_zeros_in_scale(norms, copy=False)
1905 X /= norms[:, None]
1906
1907 if axis == 0:
1908 X = X.T
1909
1910 if return_norm:
1911 return X, norms
1912 else:
1913 return X
1914
1915
1916class Normalizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
1917 """Normalize samples individually to unit norm.
1918
1919 Each sample (i.e. each row of the data matrix) with at least one
    non-zero component is rescaled independently of other samples so
1921 that its norm (l1, l2 or inf) equals one.
1922
1923 This transformer is able to work both with dense numpy arrays and
1924 scipy.sparse matrix (use CSR format if you want to avoid the burden of
1925 a copy / conversion).
1926
1927 Scaling inputs to unit norms is a common operation for text
    classification or clustering. For instance, the dot
1929 product of two l2-normalized TF-IDF vectors is the cosine similarity
1930 of the vectors and is the base similarity metric for the Vector
1931 Space Model commonly used by the Information Retrieval community.
1932
1933 For an example visualization, refer to :ref:`Compare Normalizer with other
1934 scalers <plot_all_scaling_normalizer_section>`.
1935
1936 Read more in the :ref:`User Guide <preprocessing_normalization>`.
1937
1938 Parameters
1939 ----------
1940 norm : {'l1', 'l2', 'max'}, default='l2'
        The norm to use to normalize each non-zero sample. If norm='max'
1942 is used, values will be rescaled by the maximum of the absolute
1943 values.
1944
1945 copy : bool, default=True
1946 Set to False to perform inplace row normalization and avoid a
1947 copy (if the input is already a numpy array or a scipy.sparse
1948 CSR matrix).
1949
1950 Attributes
1951 ----------
1952 n_features_in_ : int
1953 Number of features seen during :term:`fit`.
1954
1955 .. versionadded:: 0.24
1956
1957 feature_names_in_ : ndarray of shape (`n_features_in_`,)
1958 Names of features seen during :term:`fit`. Defined only when `X`
1959 has feature names that are all strings.
1960
1961 .. versionadded:: 1.0
1962
1963 See Also
1964 --------
1965 normalize : Equivalent function without the estimator API.
1966
1967 Notes
1968 -----
1969 This estimator is :term:`stateless` and does not need to be fitted.
    However, we recommend calling :meth:`fit_transform` instead of
1971 :meth:`transform`, as parameter validation is only performed in
1972 :meth:`fit`.
1973
1974 Examples
1975 --------
1976 >>> from sklearn.preprocessing import Normalizer
1977 >>> X = [[4, 1, 2, 2],
1978 ... [1, 3, 9, 3],
1979 ... [5, 7, 5, 1]]
1980 >>> transformer = Normalizer().fit(X) # fit does nothing.
1981 >>> transformer
1982 Normalizer()
1983 >>> transformer.transform(X)
1984 array([[0.8, 0.2, 0.4, 0.4],
1985 [0.1, 0.3, 0.9, 0.3],
1986 [0.5, 0.7, 0.5, 0.1]])
1987 """
1988
1989 _parameter_constraints: dict = {
1990 "norm": [StrOptions({"l1", "l2", "max"})],
1991 "copy": ["boolean"],
1992 }
1993
1994 def __init__(self, norm="l2", *, copy=True):
1995 self.norm = norm
1996 self.copy = copy
1997
1998 @_fit_context(prefer_skip_nested_validation=True)
1999 def fit(self, X, y=None):
2000 """Only validates estimator's parameters.
2001
        This method exists to (i) validate the estimator's parameters and
2003 (ii) be consistent with the scikit-learn transformer API.
2004
2005 Parameters
2006 ----------
2007 X : {array-like, sparse matrix} of shape (n_samples, n_features)
2008 The data to estimate the normalization parameters.
2009
2010 y : Ignored
2011 Not used, present here for API consistency by convention.
2012
2013 Returns
2014 -------
2015 self : object
2016 Fitted transformer.
2017 """
2018 self._validate_data(X, accept_sparse="csr")
2019 return self
2020
2021 def transform(self, X, copy=None):
        """Scale each non-zero row of X to unit norm.
2023
2024 Parameters
2025 ----------
2026 X : {array-like, sparse matrix} of shape (n_samples, n_features)
2027 The data to normalize, row by row. scipy.sparse matrices should be
            in CSR format to avoid an unnecessary copy.
2029
2030 copy : bool, default=None
2031 Copy the input X or not.
2032
2033 Returns
2034 -------
2035 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
2036 Transformed array.
2037 """
2038 copy = copy if copy is not None else self.copy
2039 X = self._validate_data(X, accept_sparse="csr", reset=False)
2040 return normalize(X, norm=self.norm, axis=1, copy=copy)
2041
2042 def _more_tags(self):
2043 return {"stateless": True, "array_api_support": True}
2044
2045
2046@validate_params(
2047 {
2048 "X": ["array-like", "sparse matrix"],
2049 "threshold": [Interval(Real, None, None, closed="neither")],
2050 "copy": ["boolean"],
2051 },
2052 prefer_skip_nested_validation=True,
2053)
2054def binarize(X, *, threshold=0.0, copy=True):
2055 """Boolean thresholding of array-like or scipy.sparse matrix.
2056
2057 Read more in the :ref:`User Guide <preprocessing_binarization>`.
2058
2059 Parameters
2060 ----------
2061 X : {array-like, sparse matrix} of shape (n_samples, n_features)
2062 The data to binarize, element by element.
2063 scipy.sparse matrices should be in CSR or CSC format to avoid an
        unnecessary copy.
2065
2066 threshold : float, default=0.0
2067 Feature values below or equal to this are replaced by 0, above it by 1.
2068 Threshold may not be less than 0 for operations on sparse matrices.
2069
2070 copy : bool, default=True
2071 If False, try to avoid a copy and binarize in place.
2072 This is not guaranteed to always work in place; e.g. if the data is
2073 a numpy array with an object dtype, a copy will be returned even with
2074 copy=False.
2075
2076 Returns
2077 -------
2078 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
2079 The transformed data.
2080
2081 See Also
2082 --------
2083 Binarizer : Performs binarization using the Transformer API
2084 (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`).
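
    Examples
    --------
    A small sketch; values strictly greater than ``threshold`` map to 1,
    all other values map to 0:

    >>> from sklearn.preprocessing import binarize
    >>> X = [[0.4, 0.6, 0.5], [0.6, 0.1, 0.2]]
    >>> binarize(X, threshold=0.5)
    array([[0., 1., 0.],
           [1., 0., 0.]])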
2085 """
2086 X = check_array(X, accept_sparse=["csr", "csc"], copy=copy)
2087 if sparse.issparse(X):
2088 if threshold < 0:
2089 raise ValueError("Cannot binarize a sparse matrix with threshold < 0")
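        # Only the explicitly stored entries are thresholded: implicit zeros
        # stay zero (hence the threshold >= 0 requirement above) and entries
        # set to 0 are pruned from the sparse structure by eliminate_zeros().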
2090 cond = X.data > threshold
2091 not_cond = np.logical_not(cond)
2092 X.data[cond] = 1
2093 X.data[not_cond] = 0
2094 X.eliminate_zeros()
2095 else:
2096 cond = X > threshold
2097 not_cond = np.logical_not(cond)
2098 X[cond] = 1
2099 X[not_cond] = 0
2100 return X
2101
2102
2103class Binarizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
2104 """Binarize data (set feature values to 0 or 1) according to a threshold.
2105
2106 Values greater than the threshold map to 1, while values less than
2107 or equal to the threshold map to 0. With the default threshold of 0,
2108 only positive values map to 1.
2109
2110 Binarization is a common operation on text count data where the
2111 analyst can decide to only consider the presence or absence of a
    feature rather than, for instance, a quantified number of occurrences.
2113
2114 It can also be used as a pre-processing step for estimators that
2115 consider boolean random variables (e.g. modelled using the Bernoulli
2116 distribution in a Bayesian setting).
2117
2118 Read more in the :ref:`User Guide <preprocessing_binarization>`.
2119
2120 Parameters
2121 ----------
2122 threshold : float, default=0.0
2123 Feature values below or equal to this are replaced by 0, above it by 1.
2124 Threshold may not be less than 0 for operations on sparse matrices.
2125
2126 copy : bool, default=True
2127 Set to False to perform inplace binarization and avoid a copy (if
2128 the input is already a numpy array or a scipy.sparse CSR matrix).
2129
2130 Attributes
2131 ----------
2132 n_features_in_ : int
2133 Number of features seen during :term:`fit`.
2134
2135 .. versionadded:: 0.24
2136
2137 feature_names_in_ : ndarray of shape (`n_features_in_`,)
2138 Names of features seen during :term:`fit`. Defined only when `X`
2139 has feature names that are all strings.
2140
2141 .. versionadded:: 1.0
2142
2143 See Also
2144 --------
2145 binarize : Equivalent function without the estimator API.
2146 KBinsDiscretizer : Bin continuous data into intervals.
2147 OneHotEncoder : Encode categorical features as a one-hot numeric array.
2148
2149 Notes
2150 -----
    If the input is a sparse matrix, only the non-zero values are updated
    by the :class:`Binarizer` class.
2153
2154 This estimator is :term:`stateless` and does not need to be fitted.
    However, we recommend calling :meth:`fit_transform` instead of
2156 :meth:`transform`, as parameter validation is only performed in
2157 :meth:`fit`.
2158
2159 Examples
2160 --------
2161 >>> from sklearn.preprocessing import Binarizer
2162 >>> X = [[ 1., -1., 2.],
2163 ... [ 2., 0., 0.],
2164 ... [ 0., 1., -1.]]
2165 >>> transformer = Binarizer().fit(X) # fit does nothing.
2166 >>> transformer
2167 Binarizer()
2168 >>> transformer.transform(X)
2169 array([[1., 0., 1.],
2170 [1., 0., 0.],
2171 [0., 1., 0.]])
2172 """
2173
2174 _parameter_constraints: dict = {
2175 "threshold": [Real],
2176 "copy": ["boolean"],
2177 }
2178
2179 def __init__(self, *, threshold=0.0, copy=True):
2180 self.threshold = threshold
2181 self.copy = copy
2182
2183 @_fit_context(prefer_skip_nested_validation=True)
2184 def fit(self, X, y=None):
2185 """Only validates estimator's parameters.
2186
        This method exists to (i) validate the estimator's parameters and
2188 (ii) be consistent with the scikit-learn transformer API.
2189
2190 Parameters
2191 ----------
2192 X : {array-like, sparse matrix} of shape (n_samples, n_features)
2193 The data.
2194
2195 y : None
2196 Ignored.
2197
2198 Returns
2199 -------
2200 self : object
2201 Fitted transformer.
2202 """
2203 self._validate_data(X, accept_sparse="csr")
2204 return self
2205
2206 def transform(self, X, copy=None):
2207 """Binarize each element of X.
2208
2209 Parameters
2210 ----------
2211 X : {array-like, sparse matrix} of shape (n_samples, n_features)
2212 The data to binarize, element by element.
2213 scipy.sparse matrices should be in CSR format to avoid an
            unnecessary copy.
2215
        copy : bool, default=None
2217 Copy the input X or not.
2218
2219 Returns
2220 -------
2221 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
2222 Transformed array.
2223 """
2224 copy = copy if copy is not None else self.copy
2225 # TODO: This should be refactored because binarize also calls
2226 # check_array
2227 X = self._validate_data(X, accept_sparse=["csr", "csc"], copy=copy, reset=False)
2228 return binarize(X, threshold=self.threshold, copy=False)
2229
2230 def _more_tags(self):
2231 return {"stateless": True}
2232
2233
2234class KernelCenterer(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
2235 r"""Center an arbitrary kernel matrix :math:`K`.
2236
    Let us define a kernel :math:`K` such that:
2238
2239 .. math::
2240 K(X, Y) = \phi(X) . \phi(Y)^{T}
2241
2242 :math:`\phi(X)` is a function mapping of rows of :math:`X` to a
2243 Hilbert space and :math:`K` is of shape `(n_samples, n_samples)`.
2244
    This class allows computing :math:`\tilde{K}(X, Y)` such that:
2246
2247 .. math::
        \tilde{K}(X, Y) = \tilde{\phi}(X) . \tilde{\phi}(Y)^{T}
2249
2250 :math:`\tilde{\phi}(X)` is the centered mapped data in the Hilbert
2251 space.
2252
2253 `KernelCenterer` centers the features without explicitly computing the
    mapping :math:`\phi(\cdot)`. Working with centered kernels is sometimes
    expected when dealing with algebraic computations such as eigendecomposition
2256 for :class:`~sklearn.decomposition.KernelPCA` for instance.
2257
2258 Read more in the :ref:`User Guide <kernel_centering>`.
2259
2260 Attributes
2261 ----------
2262 K_fit_rows_ : ndarray of shape (n_samples,)
2263 Average of each column of kernel matrix.
2264
2265 K_fit_all_ : float
2266 Average of kernel matrix.
2267
2268 n_features_in_ : int
2269 Number of features seen during :term:`fit`.
2270
2271 .. versionadded:: 0.24
2272
2273 feature_names_in_ : ndarray of shape (`n_features_in_`,)
2274 Names of features seen during :term:`fit`. Defined only when `X`
2275 has feature names that are all strings.
2276
2277 .. versionadded:: 1.0
2278
2279 See Also
2280 --------
2281 sklearn.kernel_approximation.Nystroem : Approximate a kernel map
2282 using a subset of the training data.
2283
2284 References
2285 ----------
2286 .. [1] `Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller.
2287 "Nonlinear component analysis as a kernel eigenvalue problem."
2288 Neural computation 10.5 (1998): 1299-1319.
2289 <https://www.mlpack.org/papers/kpca.pdf>`_
2290
2291 Examples
2292 --------
2293 >>> from sklearn.preprocessing import KernelCenterer
2294 >>> from sklearn.metrics.pairwise import pairwise_kernels
2295 >>> X = [[ 1., -2., 2.],
2296 ... [ -2., 1., 3.],
2297 ... [ 4., 1., -2.]]
2298 >>> K = pairwise_kernels(X, metric='linear')
2299 >>> K
2300 array([[ 9., 2., -2.],
2301 [ 2., 14., -13.],
2302 [ -2., -13., 21.]])
2303 >>> transformer = KernelCenterer().fit(K)
2304 >>> transformer
2305 KernelCenterer()
2306 >>> transformer.transform(K)
2307 array([[ 5., 0., -5.],
2308 [ 0., 14., -14.],
2309 [ -5., -14., 19.]])
2310 """
2311
2312 def __init__(self):
2313 # Needed for backported inspect.signature compatibility with PyPy
2314 pass
2315
2316 def fit(self, K, y=None):
2317 """Fit KernelCenterer.
2318
2319 Parameters
2320 ----------
2321 K : ndarray of shape (n_samples, n_samples)
2322 Kernel matrix.
2323
2324 y : None
2325 Ignored.
2326
2327 Returns
2328 -------
2329 self : object
2330 Returns the instance itself.
2331 """
2332 xp, _ = get_namespace(K)
2333
2334 K = self._validate_data(K, dtype=_array_api.supported_float_dtypes(xp))
2335
2336 if K.shape[0] != K.shape[1]:
2337 raise ValueError(
2338 "Kernel matrix must be a square matrix."
2339 " Input is a {}x{} matrix.".format(K.shape[0], K.shape[1])
2340 )
2341
2342 n_samples = K.shape[0]
2343 self.K_fit_rows_ = xp.sum(K, axis=0) / n_samples
2344 self.K_fit_all_ = xp.sum(self.K_fit_rows_) / n_samples
2345 return self
2346
2347 def transform(self, K, copy=True):
2348 """Center kernel matrix.
2349
2350 Parameters
2351 ----------
2352 K : ndarray of shape (n_samples1, n_samples2)
2353 Kernel matrix.
2354
2355 copy : bool, default=True
2356 Set to False to perform inplace computation.
2357
2358 Returns
2359 -------
2360 K_new : ndarray of shape (n_samples1, n_samples2)
            Centered kernel matrix.
2362 """
2363 check_is_fitted(self)
2364
2365 xp, _ = get_namespace(K)
2366
2367 K = self._validate_data(
2368 K, copy=copy, dtype=_array_api.supported_float_dtypes(xp), reset=False
2369 )
2370
2371 K_pred_cols = (xp.sum(K, axis=1) / self.K_fit_rows_.shape[0])[:, None]
2372
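        # Double centering: subtract from every row the column means of the
        # training kernel (K_fit_rows_), subtract from every column the row
        # means of K (K_pred_cols), then add back the overall mean of the
        # training kernel (K_fit_all_).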
2373 K -= self.K_fit_rows_
2374 K -= K_pred_cols
2375 K += self.K_fit_all_
2376
2377 return K
2378
2379 @property
2380 def _n_features_out(self):
2381 """Number of transformed output features."""
2382 # Used by ClassNamePrefixFeaturesOutMixin. This model preserves the
2383 # number of input features but this is not a one-to-one mapping in the
2384 # usual sense. Hence the choice not to use OneToOneFeatureMixin to
2385 # implement get_feature_names_out for this class.
2386 return self.n_features_in_
2387
2388 def _more_tags(self):
2389 return {"pairwise": True, "array_api_support": True}
2390
2391
2392@validate_params(
2393 {
2394 "X": ["array-like", "sparse matrix"],
2395 "value": [Interval(Real, None, None, closed="neither")],
2396 },
2397 prefer_skip_nested_validation=True,
2398)
2399def add_dummy_feature(X, value=1.0):
2400 """Augment dataset with an additional dummy feature.
2401
2402 This is useful for fitting an intercept term with implementations which
2403 cannot otherwise fit it directly.
2404
2405 Parameters
2406 ----------
2407 X : {array-like, sparse matrix} of shape (n_samples, n_features)
2408 Data.
2409
    value : float, default=1.0
2411 Value to use for the dummy feature.
2412
2413 Returns
2414 -------
2415 X : {ndarray, sparse matrix} of shape (n_samples, n_features + 1)
2416 Same data with dummy feature added as first column.
2417
2418 Examples
2419 --------
2420 >>> from sklearn.preprocessing import add_dummy_feature
2421 >>> add_dummy_feature([[0, 1], [1, 0]])
2422 array([[1., 0., 1.],
2423 [1., 1., 0.]])
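
    For sparse input the dummy column is prepended without densifying the
    matrix; a minimal sketch (converted to a dense array only for display):

    >>> from scipy import sparse
    >>> add_dummy_feature(sparse.csr_matrix([[0, 1], [1, 0]])).toarray()
    array([[1., 0., 1.],
           [1., 1., 0.]])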
2424 """
2425 X = check_array(X, accept_sparse=["csc", "csr", "coo"], dtype=FLOAT_DTYPES)
2426 n_samples, n_features = X.shape
2427 shape = (n_samples, n_features + 1)
2428 if sparse.issparse(X):
2429 if X.format == "coo":
2430 # Shift columns to the right.
2431 col = X.col + 1
2432 # Column indices of dummy feature are 0 everywhere.
2433 col = np.concatenate((np.zeros(n_samples), col))
2434 # Row indices of dummy feature are 0, ..., n_samples-1.
2435 row = np.concatenate((np.arange(n_samples), X.row))
2436 # Prepend the dummy feature n_samples times.
2437 data = np.concatenate((np.full(n_samples, value), X.data))
2438 return sparse.coo_matrix((data, (row, col)), shape)
2439 elif X.format == "csc":
2440 # Shift index pointers since we need to add n_samples elements.
2441 indptr = X.indptr + n_samples
2442 # indptr[0] must be 0.
2443 indptr = np.concatenate((np.array([0]), indptr))
2444 # Row indices of dummy feature are 0, ..., n_samples-1.
2445 indices = np.concatenate((np.arange(n_samples), X.indices))
2446 # Prepend the dummy feature n_samples times.
2447 data = np.concatenate((np.full(n_samples, value), X.data))
2448 return sparse.csc_matrix((data, indices, indptr), shape)
2449 else:
2450 klass = X.__class__
2451 return klass(add_dummy_feature(X.tocoo(), value))
2452 else:
2453 return np.hstack((np.full((n_samples, 1), value), X))
2454
2455
2456class QuantileTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
2457 """Transform features using quantiles information.
2458
2459 This method transforms the features to follow a uniform or a normal
2460 distribution. Therefore, for a given feature, this transformation tends
2461 to spread out the most frequent values. It also reduces the impact of
2462 (marginal) outliers: this is therefore a robust preprocessing scheme.
2463
2464 The transformation is applied on each feature independently. First an
2465 estimate of the cumulative distribution function of a feature is
2466 used to map the original values to a uniform distribution. The obtained
2467 values are then mapped to the desired output distribution using the
    associated quantile function. Feature values of new/unseen data that fall
2469 below or above the fitted range will be mapped to the bounds of the output
2470 distribution. Note that this transform is non-linear. It may distort linear
2471 correlations between variables measured at the same scale but renders
2472 variables measured at different scales more directly comparable.
2473
2474 For example visualizations, refer to :ref:`Compare QuantileTransformer with
2475 other scalers <plot_all_scaling_quantile_transformer_section>`.
2476
2477 Read more in the :ref:`User Guide <preprocessing_transformer>`.
2478
2479 .. versionadded:: 0.19
2480
2481 Parameters
2482 ----------
2483 n_quantiles : int, default=1000 or n_samples
2484 Number of quantiles to be computed. It corresponds to the number
2485 of landmarks used to discretize the cumulative distribution function.
2486 If n_quantiles is larger than the number of samples, n_quantiles is set
2487 to the number of samples as a larger number of quantiles does not give
2488 a better approximation of the cumulative distribution function
2489 estimator.
2490
2491 output_distribution : {'uniform', 'normal'}, default='uniform'
2492 Marginal distribution for the transformed data. The choices are
2493 'uniform' (default) or 'normal'.
2494
2495 ignore_implicit_zeros : bool, default=False
2496 Only applies to sparse matrices. If True, the sparse entries of the
2497 matrix are discarded to compute the quantile statistics. If False,
2498 these entries are treated as zeros.
2499
2500 subsample : int, default=10_000
2501 Maximum number of samples used to estimate the quantiles for
2502 computational efficiency. Note that the subsampling procedure may
2503 differ for value-identical sparse and dense matrices.
2504
2505 random_state : int, RandomState instance or None, default=None
2506 Determines random number generation for subsampling and smoothing
2507 noise.
2508 Please see ``subsample`` for more details.
2509 Pass an int for reproducible results across multiple function calls.
2510 See :term:`Glossary <random_state>`.
2511
2512 copy : bool, default=True
2513 Set to False to perform inplace transformation and avoid a copy (if the
2514 input is already a numpy array).
2515
2516 Attributes
2517 ----------
2518 n_quantiles_ : int
2519 The actual number of quantiles used to discretize the cumulative
2520 distribution function.
2521
2522 quantiles_ : ndarray of shape (n_quantiles, n_features)
        The values corresponding to the quantiles of reference.
2524
2525 references_ : ndarray of shape (n_quantiles, )
2526 Quantiles of references.
2527
2528 n_features_in_ : int
2529 Number of features seen during :term:`fit`.
2530
2531 .. versionadded:: 0.24
2532
2533 feature_names_in_ : ndarray of shape (`n_features_in_`,)
2534 Names of features seen during :term:`fit`. Defined only when `X`
2535 has feature names that are all strings.
2536
2537 .. versionadded:: 1.0
2538
2539 See Also
2540 --------
2541 quantile_transform : Equivalent function without the estimator API.
2542 PowerTransformer : Perform mapping to a normal distribution using a power
2543 transform.
2544 StandardScaler : Perform standardization that is faster, but less robust
2545 to outliers.
2546 RobustScaler : Perform robust standardization that removes the influence
2547 of outliers but does not put outliers and inliers on the same scale.
2548
2549 Notes
2550 -----
2551 NaNs are treated as missing values: disregarded in fit, and maintained in
2552 transform.
2553
2554 Examples
2555 --------
2556 >>> import numpy as np
2557 >>> from sklearn.preprocessing import QuantileTransformer
2558 >>> rng = np.random.RandomState(0)
2559 >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0)
2560 >>> qt = QuantileTransformer(n_quantiles=10, random_state=0)
2561 >>> qt.fit_transform(X)
2562 array([...])
2563 """
2564
2565 _parameter_constraints: dict = {
2566 "n_quantiles": [Interval(Integral, 1, None, closed="left")],
2567 "output_distribution": [StrOptions({"uniform", "normal"})],
2568 "ignore_implicit_zeros": ["boolean"],
2569 "subsample": [Interval(Integral, 1, None, closed="left")],
2570 "random_state": ["random_state"],
2571 "copy": ["boolean"],
2572 }
2573
2574 def __init__(
2575 self,
2576 *,
2577 n_quantiles=1000,
2578 output_distribution="uniform",
2579 ignore_implicit_zeros=False,
2580 subsample=10_000,
2581 random_state=None,
2582 copy=True,
2583 ):
2584 self.n_quantiles = n_quantiles
2585 self.output_distribution = output_distribution
2586 self.ignore_implicit_zeros = ignore_implicit_zeros
2587 self.subsample = subsample
2588 self.random_state = random_state
2589 self.copy = copy
2590
2591 def _dense_fit(self, X, random_state):
2592 """Compute percentiles for dense matrices.
2593
2594 Parameters
2595 ----------
2596 X : ndarray of shape (n_samples, n_features)
2597 The data used to scale along the features axis.
2598 """
2599 if self.ignore_implicit_zeros:
2600 warnings.warn(
2601 "'ignore_implicit_zeros' takes effect only with"
2602 " sparse matrix. This parameter has no effect."
2603 )
2604
2605 n_samples, n_features = X.shape
2606 references = self.references_ * 100
2607
2608 self.quantiles_ = []
2609 for col in X.T:
2610 if self.subsample < n_samples:
2611 subsample_idx = random_state.choice(
2612 n_samples, size=self.subsample, replace=False
2613 )
2614 col = col.take(subsample_idx, mode="clip")
2615 self.quantiles_.append(np.nanpercentile(col, references))
2616 self.quantiles_ = np.transpose(self.quantiles_)
2617 # Due to floating-point precision error in `np.nanpercentile`,
2618 # make sure that quantiles are monotonically increasing.
2619 # Upstream issue in numpy:
2620 # https://github.com/numpy/numpy/issues/14685
2621 self.quantiles_ = np.maximum.accumulate(self.quantiles_)
2622
2623 def _sparse_fit(self, X, random_state):
2624 """Compute percentiles for sparse matrices.
2625
2626 Parameters
2627 ----------
2628 X : sparse matrix of shape (n_samples, n_features)
2629 The data used to scale along the features axis. The sparse matrix
2630 needs to be nonnegative. If a sparse matrix is provided,
2631 it will be converted into a sparse ``csc_matrix``.
2632 """
2633 n_samples, n_features = X.shape
2634 references = self.references_ * 100
2635
2636 self.quantiles_ = []
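        # For each column, percentiles are computed on the explicitly stored
        # values, padded with zeros so that implicit zeros also contribute,
        # unless ignore_implicit_zeros is True.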
2637 for feature_idx in range(n_features):
2638 column_nnz_data = X.data[X.indptr[feature_idx] : X.indptr[feature_idx + 1]]
2639 if len(column_nnz_data) > self.subsample:
2640 column_subsample = self.subsample * len(column_nnz_data) // n_samples
2641 if self.ignore_implicit_zeros:
2642 column_data = np.zeros(shape=column_subsample, dtype=X.dtype)
2643 else:
2644 column_data = np.zeros(shape=self.subsample, dtype=X.dtype)
2645 column_data[:column_subsample] = random_state.choice(
2646 column_nnz_data, size=column_subsample, replace=False
2647 )
2648 else:
2649 if self.ignore_implicit_zeros:
2650 column_data = np.zeros(shape=len(column_nnz_data), dtype=X.dtype)
2651 else:
2652 column_data = np.zeros(shape=n_samples, dtype=X.dtype)
2653 column_data[: len(column_nnz_data)] = column_nnz_data
2654
2655 if not column_data.size:
2656 # if no nnz, an error will be raised for computing the
2657 # quantiles. Force the quantiles to be zeros.
2658 self.quantiles_.append([0] * len(references))
2659 else:
2660 self.quantiles_.append(np.nanpercentile(column_data, references))
2661 self.quantiles_ = np.transpose(self.quantiles_)
2662 # due to floating-point precision error in `np.nanpercentile`,
2663 # make sure the quantiles are monotonically increasing
2664 # Upstream issue in numpy:
2665 # https://github.com/numpy/numpy/issues/14685
2666 self.quantiles_ = np.maximum.accumulate(self.quantiles_)
2667
2668 @_fit_context(prefer_skip_nested_validation=True)
2669 def fit(self, X, y=None):
2670 """Compute the quantiles used for transforming.
2671
2672 Parameters
2673 ----------
2674 X : {array-like, sparse matrix} of shape (n_samples, n_features)
2675 The data used to scale along the features axis. If a sparse
2676 matrix is provided, it will be converted into a sparse
2677 ``csc_matrix``. Additionally, the sparse matrix needs to be
2678 nonnegative if `ignore_implicit_zeros` is False.
2679
2680 y : None
2681 Ignored.
2682
2683 Returns
2684 -------
2685 self : object
2686 Fitted transformer.
2687 """
2688 if self.n_quantiles > self.subsample:
2689 raise ValueError(
2690 "The number of quantiles cannot be greater than"
2691 " the number of samples used. Got {} quantiles"
2692 " and {} samples.".format(self.n_quantiles, self.subsample)
2693 )
2694
2695 X = self._check_inputs(X, in_fit=True, copy=False)
2696 n_samples = X.shape[0]
2697
2698 if self.n_quantiles > n_samples:
2699 warnings.warn(
2700 "n_quantiles (%s) is greater than the total number "
2701 "of samples (%s). n_quantiles is set to "
2702 "n_samples." % (self.n_quantiles, n_samples)
2703 )
2704 self.n_quantiles_ = max(1, min(self.n_quantiles, n_samples))
2705
2706 rng = check_random_state(self.random_state)
2707
2708 # Create the quantiles of reference
2709 self.references_ = np.linspace(0, 1, self.n_quantiles_, endpoint=True)
2710 if sparse.issparse(X):
2711 self._sparse_fit(X, rng)
2712 else:
2713 self._dense_fit(X, rng)
2714
2715 return self
2716
2717 def _transform_col(self, X_col, quantiles, inverse):
2718 """Private function to transform a single feature."""
2719
2720 output_distribution = self.output_distribution
2721
2722 if not inverse:
2723 lower_bound_x = quantiles[0]
2724 upper_bound_x = quantiles[-1]
2725 lower_bound_y = 0
2726 upper_bound_y = 1
2727 else:
2728 lower_bound_x = 0
2729 upper_bound_x = 1
2730 lower_bound_y = quantiles[0]
2731 upper_bound_y = quantiles[-1]
2732 # for inverse transform, match a uniform distribution
2733 with np.errstate(invalid="ignore"): # hide NaN comparison warnings
2734 if output_distribution == "normal":
2735 X_col = stats.norm.cdf(X_col)
2736 # else output distribution is already a uniform distribution
2737
2738 # find index for lower and higher bounds
2739 with np.errstate(invalid="ignore"): # hide NaN comparison warnings
2740 if output_distribution == "normal":
2741 lower_bounds_idx = X_col - BOUNDS_THRESHOLD < lower_bound_x
2742 upper_bounds_idx = X_col + BOUNDS_THRESHOLD > upper_bound_x
2743 if output_distribution == "uniform":
2744 lower_bounds_idx = X_col == lower_bound_x
2745 upper_bounds_idx = X_col == upper_bound_x
2746
2747 isfinite_mask = ~np.isnan(X_col)
2748 X_col_finite = X_col[isfinite_mask]
2749 if not inverse:
2750 # Interpolate in one direction and in the other and take the
2751 # mean. This is in case of repeated values in the features
2752 # and hence repeated quantiles
2753 #
2754 # If we don't do this, only one extreme of the duplicated is
2755 # used (the upper when we do ascending, and the
2756 # lower for descending). We take the mean of these two
2757 X_col[isfinite_mask] = 0.5 * (
2758 np.interp(X_col_finite, quantiles, self.references_)
2759 - np.interp(-X_col_finite, -quantiles[::-1], -self.references_[::-1])
2760 )
2761 else:
2762 X_col[isfinite_mask] = np.interp(X_col_finite, self.references_, quantiles)
2763
2764 X_col[upper_bounds_idx] = upper_bound_y
2765 X_col[lower_bounds_idx] = lower_bound_y
2766 # for forward transform, match the output distribution
2767 if not inverse:
2768 with np.errstate(invalid="ignore"): # hide NaN comparison warnings
2769 if output_distribution == "normal":
2770 X_col = stats.norm.ppf(X_col)
2771 # find the value to clip the data to avoid mapping to
2772 # infinity. Clip such that the inverse transform will be
2773 # consistent
2774 clip_min = stats.norm.ppf(BOUNDS_THRESHOLD - np.spacing(1))
2775 clip_max = stats.norm.ppf(1 - (BOUNDS_THRESHOLD - np.spacing(1)))
2776 X_col = np.clip(X_col, clip_min, clip_max)
2777 # else output distribution is uniform and the ppf is the
2778 # identity function so we let X_col unchanged
2779
2780 return X_col
2781
2782 def _check_inputs(self, X, in_fit, accept_sparse_negative=False, copy=False):
2783 """Check inputs before fit and transform."""
2784 X = self._validate_data(
2785 X,
2786 reset=in_fit,
2787 accept_sparse="csc",
2788 copy=copy,
2789 dtype=FLOAT_DTYPES,
2790 force_all_finite="allow-nan",
2791 )
        # We only accept non-negative sparse matrices when ignore_implicit_zeros
        # is False and we are calling fit or transform.
2794 with np.errstate(invalid="ignore"): # hide NaN comparison warnings
2795 if (
2796 not accept_sparse_negative
2797 and not self.ignore_implicit_zeros
2798 and (sparse.issparse(X) and np.any(X.data < 0))
2799 ):
2800 raise ValueError(
2801 "QuantileTransformer only accepts non-negative sparse matrices."
2802 )
2803
2804 return X
2805
2806 def _transform(self, X, inverse=False):
2807 """Forward and inverse transform.
2808
2809 Parameters
2810 ----------
2811 X : ndarray of shape (n_samples, n_features)
2812 The data used to scale along the features axis.
2813
2814 inverse : bool, default=False
2815 If False, apply forward transform. If True, apply
2816 inverse transform.
2817
2818 Returns
2819 -------
2820 X : ndarray of shape (n_samples, n_features)
2821 Projected data.
2822 """
2823 if sparse.issparse(X):
2824 for feature_idx in range(X.shape[1]):
2825 column_slice = slice(X.indptr[feature_idx], X.indptr[feature_idx + 1])
2826 X.data[column_slice] = self._transform_col(
2827 X.data[column_slice], self.quantiles_[:, feature_idx], inverse
2828 )
2829 else:
2830 for feature_idx in range(X.shape[1]):
2831 X[:, feature_idx] = self._transform_col(
2832 X[:, feature_idx], self.quantiles_[:, feature_idx], inverse
2833 )
2834
2835 return X
2836
2837 def transform(self, X):
2838 """Feature-wise transformation of the data.
2839
2840 Parameters
2841 ----------
2842 X : {array-like, sparse matrix} of shape (n_samples, n_features)
2843 The data used to scale along the features axis. If a sparse
2844 matrix is provided, it will be converted into a sparse
2845 ``csc_matrix``. Additionally, the sparse matrix needs to be
2846 nonnegative if `ignore_implicit_zeros` is False.
2847
2848 Returns
2849 -------
2850 Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)
2851 The projected data.
2852 """
2853 check_is_fitted(self)
2854 X = self._check_inputs(X, in_fit=False, copy=self.copy)
2855
2856 return self._transform(X, inverse=False)
2857
2858 def inverse_transform(self, X):
2859 """Back-projection to the original space.
2860
2861 Parameters
2862 ----------
2863 X : {array-like, sparse matrix} of shape (n_samples, n_features)
2864 The data used to scale along the features axis. If a sparse
2865 matrix is provided, it will be converted into a sparse
2866 ``csc_matrix``. Additionally, the sparse matrix needs to be
2867 nonnegative if `ignore_implicit_zeros` is False.
2868
2869 Returns
2870 -------
        Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)
2872 The projected data.
2873 """
2874 check_is_fitted(self)
2875 X = self._check_inputs(
2876 X, in_fit=False, accept_sparse_negative=True, copy=self.copy
2877 )
2878
2879 return self._transform(X, inverse=True)
2880
2881 def _more_tags(self):
2882 return {"allow_nan": True}
2883
2884
2885@validate_params(
2886 {"X": ["array-like", "sparse matrix"], "axis": [Options(Integral, {0, 1})]},
2887 prefer_skip_nested_validation=False,
2888)
2889def quantile_transform(
2890 X,
2891 *,
2892 axis=0,
2893 n_quantiles=1000,
2894 output_distribution="uniform",
2895 ignore_implicit_zeros=False,
2896 subsample=int(1e5),
2897 random_state=None,
2898 copy=True,
2899):
2900 """Transform features using quantiles information.
2901
2902 This method transforms the features to follow a uniform or a normal
2903 distribution. Therefore, for a given feature, this transformation tends
2904 to spread out the most frequent values. It also reduces the impact of
2905 (marginal) outliers: this is therefore a robust preprocessing scheme.
2906
2907 The transformation is applied on each feature independently. First an
2908 estimate of the cumulative distribution function of a feature is
2909 used to map the original values to a uniform distribution. The obtained
2910 values are then mapped to the desired output distribution using the
    associated quantile function. Feature values of new/unseen data that fall
2912 below or above the fitted range will be mapped to the bounds of the output
2913 distribution. Note that this transform is non-linear. It may distort linear
2914 correlations between variables measured at the same scale but renders
2915 variables measured at different scales more directly comparable.
2916
2917 Read more in the :ref:`User Guide <preprocessing_transformer>`.
2918
2919 Parameters
2920 ----------
2921 X : {array-like, sparse matrix} of shape (n_samples, n_features)
2922 The data to transform.
2923
2924 axis : int, default=0
2925 Axis used to compute the means and standard deviations along. If 0,
2926 transform each feature, otherwise (if 1) transform each sample.
2927
2928 n_quantiles : int, default=1000 or n_samples
2929 Number of quantiles to be computed. It corresponds to the number
2930 of landmarks used to discretize the cumulative distribution function.
2931 If n_quantiles is larger than the number of samples, n_quantiles is set
2932 to the number of samples as a larger number of quantiles does not give
2933 a better approximation of the cumulative distribution function
2934 estimator.
2935
2936 output_distribution : {'uniform', 'normal'}, default='uniform'
2937 Marginal distribution for the transformed data. The choices are
2938 'uniform' (default) or 'normal'.
2939
2940 ignore_implicit_zeros : bool, default=False
2941 Only applies to sparse matrices. If True, the sparse entries of the
2942 matrix are discarded to compute the quantile statistics. If False,
2943 these entries are treated as zeros.
2944
2945 subsample : int, default=1e5
2946 Maximum number of samples used to estimate the quantiles for
2947 computational efficiency. Note that the subsampling procedure may
2948 differ for value-identical sparse and dense matrices.
2949
2950 random_state : int, RandomState instance or None, default=None
2951 Determines random number generation for subsampling and smoothing
2952 noise.
2953 Please see ``subsample`` for more details.
2954 Pass an int for reproducible results across multiple function calls.
2955 See :term:`Glossary <random_state>`.
2956
2957 copy : bool, default=True
2958 If False, try to avoid a copy and transform in place.
2959 This is not guaranteed to always work in place; e.g. if the data is
2960 a numpy array with an int dtype, a copy will be returned even with
2961 copy=False.
2962
2963 .. versionchanged:: 0.23
2964 The default value of `copy` changed from False to True in 0.23.
2965
2966 Returns
2967 -------
2968 Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)
2969 The transformed data.
2970
2971 See Also
2972 --------
2973 QuantileTransformer : Performs quantile-based scaling using the
2974 Transformer API (e.g. as part of a preprocessing
2975 :class:`~sklearn.pipeline.Pipeline`).
2976 power_transform : Maps data to a normal distribution using a
2977 power transformation.
2978 scale : Performs standardization that is faster, but less robust
2979 to outliers.
2980 robust_scale : Performs robust standardization that removes the influence
2981 of outliers but does not put outliers and inliers on the same scale.
2982
2983 Notes
2984 -----
2985 NaNs are treated as missing values: disregarded in fit, and maintained in
2986 transform.
2987
2988 .. warning:: Risk of data leak
2989
2990 Do not use :func:`~sklearn.preprocessing.quantile_transform` unless
2991 you know what you are doing. A common mistake is to apply it
2992 to the entire data *before* splitting into training and
2993 test sets. This will bias the model evaluation because
2994 information would have leaked from the test set to the
2995 training set.
2996 In general, we recommend using
2997 :class:`~sklearn.preprocessing.QuantileTransformer` within a
2998 :ref:`Pipeline <pipeline>` in order to prevent most risks of data
        leaking: `pipe = make_pipeline(QuantileTransformer(),
3000 LogisticRegression())`.
3001
3002 For a comparison of the different scalers, transformers, and normalizers,
3003 see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
3004
3005 Examples
3006 --------
3007 >>> import numpy as np
3008 >>> from sklearn.preprocessing import quantile_transform
3009 >>> rng = np.random.RandomState(0)
3010 >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0)
3011 >>> quantile_transform(X, n_quantiles=10, random_state=0, copy=True)
3012 array([...])
3013 """
3014 n = QuantileTransformer(
3015 n_quantiles=n_quantiles,
3016 output_distribution=output_distribution,
3017 subsample=subsample,
3018 ignore_implicit_zeros=ignore_implicit_zeros,
3019 random_state=random_state,
3020 copy=copy,
3021 )
3022 if axis == 0:
3023 X = n.fit_transform(X)
3024 else: # axis == 1
3025 X = n.fit_transform(X.T).T
3026 return X
3027
3028
3029class PowerTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
3030 """Apply a power transform featurewise to make data more Gaussian-like.
3031
3032 Power transforms are a family of parametric, monotonic transformations
3033 that are applied to make data more Gaussian-like. This is useful for
3034 modeling issues related to heteroscedasticity (non-constant variance),
3035 or other situations where normality is desired.
3036
3037 Currently, PowerTransformer supports the Box-Cox transform and the
3038 Yeo-Johnson transform. The optimal parameter for stabilizing variance and
3039 minimizing skewness is estimated through maximum likelihood.
3040
3041 Box-Cox requires input data to be strictly positive, while Yeo-Johnson
    supports both positive and negative data.
3043
3044 By default, zero-mean, unit-variance normalization is applied to the
3045 transformed data.
3046
3047 For an example visualization, refer to :ref:`Compare PowerTransformer with
3048 other scalers <plot_all_scaling_power_transformer_section>`. To see the
3049 effect of Box-Cox and Yeo-Johnson transformations on different
3050 distributions, see:
3051 :ref:`sphx_glr_auto_examples_preprocessing_plot_map_data_to_normal.py`.
3052
3053 Read more in the :ref:`User Guide <preprocessing_transformer>`.
3054
3055 .. versionadded:: 0.20
3056
3057 Parameters
3058 ----------
3059 method : {'yeo-johnson', 'box-cox'}, default='yeo-johnson'
3060 The power transform method. Available methods are:
3061
3062 - 'yeo-johnson' [1]_, works with positive and negative values
3063 - 'box-cox' [2]_, only works with strictly positive values
3064
3065 standardize : bool, default=True
3066 Set to True to apply zero-mean, unit-variance normalization to the
3067 transformed output.
3068
3069 copy : bool, default=True
3070 Set to False to perform inplace computation during transformation.
3071
3072 Attributes
3073 ----------
3074 lambdas_ : ndarray of float of shape (n_features,)
3075 The parameters of the power transformation for the selected features.
3076
3077 n_features_in_ : int
3078 Number of features seen during :term:`fit`.
3079
3080 .. versionadded:: 0.24
3081
3082 feature_names_in_ : ndarray of shape (`n_features_in_`,)
3083 Names of features seen during :term:`fit`. Defined only when `X`
3084 has feature names that are all strings.
3085
3086 .. versionadded:: 1.0
3087
3088 See Also
3089 --------
3090 power_transform : Equivalent function without the estimator API.
3091
3092 QuantileTransformer : Maps data to a standard normal distribution with
3093 the parameter `output_distribution='normal'`.
3094
3095 Notes
3096 -----
3097 NaNs are treated as missing values: disregarded in ``fit``, and maintained
3098 in ``transform``.
3099
3100 References
3101 ----------
3102
3103 .. [1] :doi:`I.K. Yeo and R.A. Johnson, "A new family of power
3104 transformations to improve normality or symmetry." Biometrika,
3105 87(4), pp.954-959, (2000). <10.1093/biomet/87.4.954>`
3106
3107 .. [2] :doi:`G.E.P. Box and D.R. Cox, "An Analysis of Transformations",
3108 Journal of the Royal Statistical Society B, 26, 211-252 (1964).
3109 <10.1111/j.2517-6161.1964.tb00553.x>`
3110
3111 Examples
3112 --------
3113 >>> import numpy as np
3114 >>> from sklearn.preprocessing import PowerTransformer
3115 >>> pt = PowerTransformer()
3116 >>> data = [[1, 2], [3, 2], [4, 5]]
3117 >>> print(pt.fit(data))
3118 PowerTransformer()
3119 >>> print(pt.lambdas_)
3120 [ 1.386... -3.100...]
3121 >>> print(pt.transform(data))
3122 [[-1.316... -0.707...]
3123 [ 0.209... -0.707...]
3124 [ 1.106... 1.414...]]
3125 """
3126
3127 _parameter_constraints: dict = {
3128 "method": [StrOptions({"yeo-johnson", "box-cox"})],
3129 "standardize": ["boolean"],
3130 "copy": ["boolean"],
3131 }
3132
3133 def __init__(self, method="yeo-johnson", *, standardize=True, copy=True):
3134 self.method = method
3135 self.standardize = standardize
3136 self.copy = copy
3137
3138 @_fit_context(prefer_skip_nested_validation=True)
3139 def fit(self, X, y=None):
3140 """Estimate the optimal parameter lambda for each feature.
3141
3142 The optimal lambda parameter for minimizing skewness is estimated on
3143 each feature independently using maximum likelihood.
3144
3145 Parameters
3146 ----------
3147 X : array-like of shape (n_samples, n_features)
3148 The data used to estimate the optimal transformation parameters.
3149
3150 y : None
3151 Ignored.
3152
3153 Returns
3154 -------
3155 self : object
3156 Fitted transformer.
3157 """
3158 self._fit(X, y=y, force_transform=False)
3159 return self
3160
3161 @_fit_context(prefer_skip_nested_validation=True)
3162 def fit_transform(self, X, y=None):
3163 """Fit `PowerTransformer` to `X`, then transform `X`.
3164
3165 Parameters
3166 ----------
3167 X : array-like of shape (n_samples, n_features)
3168 The data used to estimate the optimal transformation parameters
3169 and to be transformed using a power transformation.
3170
3171 y : Ignored
3172 Not used, present for API consistency by convention.
3173
3174 Returns
3175 -------
3176 X_new : ndarray of shape (n_samples, n_features)
3177 Transformed data.
3178 """
3179 return self._fit(X, y, force_transform=True)
3180
3181 def _fit(self, X, y=None, force_transform=False):
3182 X = self._check_input(X, in_fit=True, check_positive=True)
3183
3184 if not self.copy and not force_transform: # if call from fit()
3185 X = X.copy() # force copy so that fit does not change X inplace
3186
3187 n_samples = X.shape[0]
3188 mean = np.mean(X, axis=0, dtype=np.float64)
3189 var = np.var(X, axis=0, dtype=np.float64)
3190
3191 optim_function = {
3192 "box-cox": self._box_cox_optimize,
3193 "yeo-johnson": self._yeo_johnson_optimize,
3194 }[self.method]
3195
3196 transform_function = {
3197 "box-cox": boxcox,
3198 "yeo-johnson": self._yeo_johnson_transform,
3199 }[self.method]
3200
3201 with np.errstate(invalid="ignore"): # hide NaN warnings
3202 self.lambdas_ = np.empty(X.shape[1], dtype=X.dtype)
3203 for i, col in enumerate(X.T):
3204 # For yeo-johnson, leave constant features unchanged
3205 # lambda=1 corresponds to the identity transformation
3206 is_constant_feature = _is_constant_feature(var[i], mean[i], n_samples)
3207 if self.method == "yeo-johnson" and is_constant_feature:
3208 self.lambdas_[i] = 1.0
3209 continue
3210
3211 self.lambdas_[i] = optim_function(col)
3212
3213 if self.standardize or force_transform:
3214 X[:, i] = transform_function(X[:, i], self.lambdas_[i])
3215
3216 if self.standardize:
3217 self._scaler = StandardScaler(copy=False).set_output(transform="default")
3218 if force_transform:
3219 X = self._scaler.fit_transform(X)
3220 else:
3221 self._scaler.fit(X)
3222
3223 return X
3224
3225 def transform(self, X):
3226 """Apply the power transform to each feature using the fitted lambdas.
3227
3228 Parameters
3229 ----------
3230 X : array-like of shape (n_samples, n_features)
3231 The data to be transformed using a power transformation.
3232
3233 Returns
3234 -------
3235 X_trans : ndarray of shape (n_samples, n_features)
3236 The transformed data.
3237 """
3238 check_is_fitted(self)
3239 X = self._check_input(X, in_fit=False, check_positive=True, check_shape=True)
3240
3241 transform_function = {
3242 "box-cox": boxcox,
3243 "yeo-johnson": self._yeo_johnson_transform,
3244 }[self.method]
3245 for i, lmbda in enumerate(self.lambdas_):
3246 with np.errstate(invalid="ignore"): # hide NaN warnings
3247 X[:, i] = transform_function(X[:, i], lmbda)
3248
3249 if self.standardize:
3250 X = self._scaler.transform(X)
3251
3252 return X
3253
3254 def inverse_transform(self, X):
3255 """Apply the inverse power transformation using the fitted lambdas.
3256
3257 The inverse of the Box-Cox transformation is given by::
3258
3259 if lambda_ == 0:
3260 X = exp(X_trans)
3261 else:
3262 X = (X_trans * lambda_ + 1) ** (1 / lambda_)
3263
3264 The inverse of the Yeo-Johnson transformation is given by::
3265
3266 if X >= 0 and lambda_ == 0:
3267 X = exp(X_trans) - 1
3268 elif X >= 0 and lambda_ != 0:
3269 X = (X_trans * lambda_ + 1) ** (1 / lambda_) - 1
3270 elif X < 0 and lambda_ != 2:
3271 X = 1 - (-(2 - lambda_) * X_trans + 1) ** (1 / (2 - lambda_))
3272 elif X < 0 and lambda_ == 2:
3273 X = 1 - exp(-X_trans)
3274
3275 Parameters
3276 ----------
3277 X : array-like of shape (n_samples, n_features)
3278 The transformed data.
3279
3280 Returns
3281 -------
3282 X : ndarray of shape (n_samples, n_features)
3283 The original data.
3284 """
3285 check_is_fitted(self)
3286 X = self._check_input(X, in_fit=False, check_shape=True)
3287
3288 if self.standardize:
3289 X = self._scaler.inverse_transform(X)
3290
3291 inv_fun = {
            "box-cox": self._box_cox_inverse_transform,
3293 "yeo-johnson": self._yeo_johnson_inverse_transform,
3294 }[self.method]
3295 for i, lmbda in enumerate(self.lambdas_):
3296 with np.errstate(invalid="ignore"): # hide NaN warnings
3297 X[:, i] = inv_fun(X[:, i], lmbda)
3298
3299 return X
3300
    def _box_cox_inverse_transform(self, x, lmbda):
3302 """Return inverse-transformed input x following Box-Cox inverse
3303 transform with parameter lambda.
3304 """
3305 if lmbda == 0:
3306 x_inv = np.exp(x)
3307 else:
3308 x_inv = (x * lmbda + 1) ** (1 / lmbda)
3309
3310 return x_inv
3311
3312 def _yeo_johnson_inverse_transform(self, x, lmbda):
3313 """Return inverse-transformed input x following Yeo-Johnson inverse
3314 transform with parameter lambda.
3315 """
3316 x_inv = np.zeros_like(x)
3317 pos = x >= 0
3318
3319 # when x >= 0
3320 if abs(lmbda) < np.spacing(1.0):
3321 x_inv[pos] = np.exp(x[pos]) - 1
3322 else: # lmbda != 0
3323 x_inv[pos] = np.power(x[pos] * lmbda + 1, 1 / lmbda) - 1
3324
3325 # when x < 0
3326 if abs(lmbda - 2) > np.spacing(1.0):
3327 x_inv[~pos] = 1 - np.power(-(2 - lmbda) * x[~pos] + 1, 1 / (2 - lmbda))
3328 else: # lmbda == 2
3329 x_inv[~pos] = 1 - np.exp(-x[~pos])
3330
3331 return x_inv
3332
3333 def _yeo_johnson_transform(self, x, lmbda):
3334 """Return transformed input x following Yeo-Johnson transform with
3335 parameter lambda.
3336 """
3337
3338 out = np.zeros_like(x)
3339 pos = x >= 0 # binary mask
3340
3341 # when x >= 0
3342 if abs(lmbda) < np.spacing(1.0):
3343 out[pos] = np.log1p(x[pos])
3344 else: # lmbda != 0
3345 out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda
3346
3347 # when x < 0
3348 if abs(lmbda - 2) > np.spacing(1.0):
3349 out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
3350 else: # lmbda == 2
3351 out[~pos] = -np.log1p(-x[~pos])
3352
3353 return out
3354
3355 def _box_cox_optimize(self, x):
3356 """Find and return optimal lambda parameter of the Box-Cox transform by
3357 MLE, for observed data x.
3358
        Here we use scipy builtins, which use the Brent optimizer.
3360 """
3361 mask = np.isnan(x)
3362 if np.all(mask):
3363 raise ValueError("Column must not be all nan.")
3364
3365 # the computation of lambda is influenced by NaNs so we need to
3366 # get rid of them
3367 _, lmbda = stats.boxcox(x[~mask], lmbda=None)
3368
3369 return lmbda
3370
3371 def _yeo_johnson_optimize(self, x):
3372 """Find and return optimal lambda parameter of the Yeo-Johnson
3373 transform by MLE, for observed data x.
3374
        As for Box-Cox, MLE is done via the Brent optimizer.
3376 """
3377 x_tiny = np.finfo(np.float64).tiny
3378
3379 def _neg_log_likelihood(lmbda):
3380 """Return the negative log likelihood of the observed data x as a
3381 function of lambda."""
3382 x_trans = self._yeo_johnson_transform(x, lmbda)
3383 n_samples = x.shape[0]
3384 x_trans_var = x_trans.var()
3385
3386 # Reject transformed data that would raise a RuntimeWarning in np.log
3387 if x_trans_var < x_tiny:
3388 return np.inf
3389
3390 log_var = np.log(x_trans_var)
3391 loglike = -n_samples / 2 * log_var
3392 loglike += (lmbda - 1) * (np.sign(x) * np.log1p(np.abs(x))).sum()
3393
3394 return -loglike
3395
3396 # the computation of lambda is influenced by NaNs so we need to
3397 # get rid of them
3398 x = x[~np.isnan(x)]
3399 # choosing bracket -2, 2 like for boxcox
3400 return optimize.brent(_neg_log_likelihood, brack=(-2, 2))
3401
3402 def _check_input(self, X, in_fit, check_positive=False, check_shape=False):
3403 """Validate the input before fit and transform.
3404
3405 Parameters
3406 ----------
3407 X : array-like of shape (n_samples, n_features)
3408
3409 in_fit : bool
3410 Whether or not `_check_input` is called from `fit` or other
3411 methods, e.g. `predict`, `transform`, etc.
3412
3413 check_positive : bool, default=False
3414 If True, check that all data is positive and non-zero (only if
3415 ``self.method=='box-cox'``).
3416
3417 check_shape : bool, default=False
            If True, check that n_features matches the length of `self.lambdas_`.
3419 """
3420 X = self._validate_data(
3421 X,
3422 ensure_2d=True,
3423 dtype=FLOAT_DTYPES,
3424 copy=self.copy,
3425 force_all_finite="allow-nan",
3426 reset=in_fit,
3427 )
3428
3429 with warnings.catch_warnings():
3430 warnings.filterwarnings("ignore", r"All-NaN (slice|axis) encountered")
3431 if check_positive and self.method == "box-cox" and np.nanmin(X) <= 0:
3432 raise ValueError(
3433 "The Box-Cox transformation can only be "
3434 "applied to strictly positive data"
3435 )
3436
3437 if check_shape and not X.shape[1] == len(self.lambdas_):
3438 raise ValueError(
3439 "Input data has a different number of features "
3440 "than fitting data. Should have {n}, data has {m}".format(
3441 n=len(self.lambdas_), m=X.shape[1]
3442 )
3443 )
3444
3445 return X
3446
3447 def _more_tags(self):
3448 return {"allow_nan": True}
3449
3450
3451@validate_params(
3452 {"X": ["array-like"]},
3453 prefer_skip_nested_validation=False,
3454)
3455def power_transform(X, method="yeo-johnson", *, standardize=True, copy=True):
3456 """Parametric, monotonic transformation to make data more Gaussian-like.
3457
3458 Power transforms are a family of parametric, monotonic transformations
3459 that are applied to make data more Gaussian-like. This is useful for
3460 modeling issues related to heteroscedasticity (non-constant variance),
3461 or other situations where normality is desired.
3462
3463 Currently, power_transform supports the Box-Cox transform and the
3464 Yeo-Johnson transform. The optimal parameter for stabilizing variance and
3465 minimizing skewness is estimated through maximum likelihood.
3466
3467 Box-Cox requires input data to be strictly positive, while Yeo-Johnson
    supports both positive and negative data.
3469
3470 By default, zero-mean, unit-variance normalization is applied to the
3471 transformed data.
3472
3473 Read more in the :ref:`User Guide <preprocessing_transformer>`.
3474
3475 Parameters
3476 ----------
3477 X : array-like of shape (n_samples, n_features)
3478 The data to be transformed using a power transformation.
3479
3480 method : {'yeo-johnson', 'box-cox'}, default='yeo-johnson'
3481 The power transform method. Available methods are:
3482
3483 - 'yeo-johnson' [1]_, works with positive and negative values
3484 - 'box-cox' [2]_, only works with strictly positive values
3485
3486 .. versionchanged:: 0.23
3487 The default value of the `method` parameter changed from
3488 'box-cox' to 'yeo-johnson' in 0.23.
3489
3490 standardize : bool, default=True
3491 Set to True to apply zero-mean, unit-variance normalization to the
3492 transformed output.
3493
3494 copy : bool, default=True
3495 If False, try to avoid a copy and transform in place.
3496 This is not guaranteed to always work in place; e.g. if the data is
3497 a numpy array with an int dtype, a copy will be returned even with
3498 copy=False.
3499
3500 Returns
3501 -------
3502 X_trans : ndarray of shape (n_samples, n_features)
3503 The transformed data.
3504
3505 See Also
3506 --------
3507 PowerTransformer : Equivalent transformation with the
3508 Transformer API (e.g. as part of a preprocessing
3509 :class:`~sklearn.pipeline.Pipeline`).
3510
3511 quantile_transform : Maps data to a standard normal distribution with
3512 the parameter `output_distribution='normal'`.
3513
3514 Notes
3515 -----
3516 NaNs are treated as missing values: disregarded in ``fit``, and maintained
3517 in ``transform``.
3518
3519 For a comparison of the different scalers, transformers, and normalizers,
3520 see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
3521
3522 References
3523 ----------
3524
3525 .. [1] I.K. Yeo and R.A. Johnson, "A new family of power transformations to
3526 improve normality or symmetry." Biometrika, 87(4), pp.954-959,
3527 (2000).
3528
3529 .. [2] G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal
3530 of the Royal Statistical Society B, 26, 211-252 (1964).
3531
3532 Examples
3533 --------
3534 >>> import numpy as np
3535 >>> from sklearn.preprocessing import power_transform
3536 >>> data = [[1, 2], [3, 2], [4, 5]]
3537 >>> print(power_transform(data, method='box-cox'))
3538 [[-1.332... -0.707...]
3539 [ 0.256... -0.707...]
3540 [ 1.076... 1.414...]]
3541
3542 .. warning:: Risk of data leak.
3543 Do not use :func:`~sklearn.preprocessing.power_transform` unless you
3544 know what you are doing. A common mistake is to apply it to the entire
3545 data *before* splitting into training and test sets. This will bias the
3546 model evaluation because information would have leaked from the test
3547 set to the training set.
3548 In general, we recommend using
3549 :class:`~sklearn.preprocessing.PowerTransformer` within a
3550 :ref:`Pipeline <pipeline>` in order to prevent most risks of data
3551 leaking, e.g.: `pipe = make_pipeline(PowerTransformer(),
3552 LogisticRegression())`.
3553 """
3554 pt = PowerTransformer(method=method, standardize=standardize, copy=copy)
3555 return pt.fit_transform(X)