1# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
2# Mathieu Blondel <mathieu@mblondel.org>
3# Olivier Grisel <olivier.grisel@ensta.org>
4# Andreas Mueller <amueller@ais.uni-bonn.de>
5# Eric Martin <eric@ericmart.in>
6# Giorgio Patrini <giorgio.patrini@anu.edu.au>
7# Eric Chang <ericchang2017@u.northwestern.edu>
8# License: BSD 3 clause
9
10
11import warnings
12from numbers import Integral, Real
13
14import numpy as np
15from scipy import optimize, sparse, stats
16from scipy.special import boxcox
17
18from ..base import (
19 BaseEstimator,
20 ClassNamePrefixFeaturesOutMixin,
21 OneToOneFeatureMixin,
22 TransformerMixin,
23 _fit_context,
24)
25from ..utils import _array_api, check_array
26from ..utils._array_api import get_namespace
27from ..utils._param_validation import Interval, Options, StrOptions, validate_params
28from ..utils.extmath import _incremental_mean_and_var, row_norms
29from ..utils.sparsefuncs import (
30 incr_mean_variance_axis,
31 inplace_column_scale,
32 mean_variance_axis,
33 min_max_axis,
34)
35from ..utils.sparsefuncs_fast import (
36 inplace_csr_row_normalize_l1,
37 inplace_csr_row_normalize_l2,
38)
39from ..utils.validation import (
40 FLOAT_DTYPES,
41 _check_sample_weight,
42 check_is_fitted,
43 check_random_state,
44)
45from ._encoders import OneHotEncoder
46
47BOUNDS_THRESHOLD = 1e-7
48
49__all__ = [
50 "Binarizer",
51 "KernelCenterer",
52 "MinMaxScaler",
53 "MaxAbsScaler",
54 "Normalizer",
55 "OneHotEncoder",
56 "RobustScaler",
57 "StandardScaler",
58 "QuantileTransformer",
59 "PowerTransformer",
60 "add_dummy_feature",
61 "binarize",
62 "normalize",
63 "scale",
64 "robust_scale",
65 "maxabs_scale",
66 "minmax_scale",
67 "quantile_transform",
68 "power_transform",
69]
70
71
72def _is_constant_feature(var, mean, n_samples):
73 """Detect if a feature is indistinguishable from a constant feature.
74
75 The detection is based on its computed variance and on the theoretical
76 error bounds of the '2 pass algorithm' for variance computation.
77
78 See "Algorithms for computing the sample variance: analysis and
79 recommendations", by Chan, Golub, and LeVeque.
80 """
81 # In scikit-learn, variance is always computed using float64 accumulators.
82 eps = np.finfo(np.float64).eps
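    # Roughly, the first term of the bound below covers the relative rounding
    # error of the variance computation itself, while the second term covers
    # the cancellation error induced by a large mean (see the reference above).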
83
84 upper_bound = n_samples * eps * var + (n_samples * mean * eps) ** 2
85 return var <= upper_bound
86
87
88def _handle_zeros_in_scale(scale, copy=True, constant_mask=None):
89 """Set scales of near constant features to 1.
90
91 The goal is to avoid division by very small or zero values.
92
93 Near constant features are detected automatically by identifying
94 scales close to machine precision unless they are precomputed by
95 the caller and passed with the `constant_mask` kwarg.
96
97 Typically for standard scaling, the scales are the standard
98 deviation while near constant features are better detected on the
99 computed variances which are closer to machine precision by
100 construction.
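
    For instance, a zero scale is replaced by 1 while non-zero scales are
    left untouched:

    >>> import numpy as np
    >>> _handle_zeros_in_scale(np.array([0.0, 1.0, 2.0]))
    array([1., 1., 2.])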
101 """
102 # if we are fitting on 1D arrays, scale might be a scalar
103 if np.isscalar(scale):
104 if scale == 0.0:
105 scale = 1.0
106 return scale
107 # scale is an array
108 else:
109 xp, _ = get_namespace(scale)
110 if constant_mask is None:
111 # Detect near constant values to avoid dividing by a very small
112 # value that could lead to surprising results and numerical
113 # stability issues.
114 constant_mask = scale < 10 * xp.finfo(scale.dtype).eps
115
116 if copy:
117 # New array to avoid side-effects
118 scale = xp.asarray(scale, copy=True)
119 scale[constant_mask] = 1.0
120 return scale
121
122
123@validate_params(
124 {
125 "X": ["array-like", "sparse matrix"],
126 "axis": [Options(Integral, {0, 1})],
127 "with_mean": ["boolean"],
128 "with_std": ["boolean"],
129 "copy": ["boolean"],
130 },
131 prefer_skip_nested_validation=True,
132)
133def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True):
134 """Standardize a dataset along any axis.
135
136 Center to the mean and component wise scale to unit variance.
137
138 Read more in the :ref:`User Guide <preprocessing_scaler>`.
139
140 Parameters
141 ----------
142 X : {array-like, sparse matrix} of shape (n_samples, n_features)
143 The data to center and scale.
144
145 axis : {0, 1}, default=0
146 Axis used to compute the means and standard deviations along. If 0,
147 independently standardize each feature, otherwise (if 1) standardize
148 each sample.
149
150 with_mean : bool, default=True
151 If True, center the data before scaling.
152
153 with_std : bool, default=True
154 If True, scale the data to unit variance (or equivalently,
155 unit standard deviation).
156
157 copy : bool, default=True
158 If False, try to avoid a copy and scale in place.
159 This is not guaranteed to always work in place; e.g. if the data is
160 a numpy array with an int dtype, a copy will be returned even with
161 copy=False.
162
163 Returns
164 -------
165 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
166 The transformed data.
167
168 See Also
169 --------
170 StandardScaler : Performs scaling to unit variance using the Transformer
171 API (e.g. as part of a preprocessing
172 :class:`~sklearn.pipeline.Pipeline`).
173
174 Notes
175 -----
176 This implementation will refuse to center scipy.sparse matrices
177 since it would make them non-sparse and would potentially crash the
178 program with memory exhaustion problems.
179
180 Instead the caller is expected to either set explicitly
181 `with_mean=False` (in that case, only variance scaling will be
182 performed on the features of the CSC matrix) or to call `X.toarray()`
    if the materialized dense array is expected to fit in memory.
184
185 To avoid memory copy the caller should pass a CSC matrix.
186
187 NaNs are treated as missing values: disregarded to compute the statistics,
188 and maintained during the data transformation.
189
190 We use a biased estimator for the standard deviation, equivalent to
191 `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to
192 affect model performance.
193
194 For a comparison of the different scalers, transformers, and normalizers,
195 see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
196
197 .. warning:: Risk of data leak
198
199 Do not use :func:`~sklearn.preprocessing.scale` unless you know
200 what you are doing. A common mistake is to apply it to the entire data
201 *before* splitting into training and test sets. This will bias the
202 model evaluation because information would have leaked from the test
203 set to the training set.
204 In general, we recommend using
205 :class:`~sklearn.preprocessing.StandardScaler` within a
206 :ref:`Pipeline <pipeline>` in order to prevent most risks of data
207 leaking: `pipe = make_pipeline(StandardScaler(), LogisticRegression())`.
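
    Examples
    --------
    For instance, a small dense array can be standardized column-wise so that
    each feature ends up with zero mean and unit variance:

    >>> from sklearn.preprocessing import scale
    >>> X = [[-2, 1, 2], [-1, 0, 1]]
    >>> scale(X, axis=0)
    array([[-1.,  1.,  1.],
           [ 1., -1., -1.]])
    >>> scale(X, axis=0).mean(axis=0)
    array([0., 0., 0.])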
208 """ # noqa
209 X = check_array(
210 X,
211 accept_sparse="csc",
212 copy=copy,
213 ensure_2d=False,
214 estimator="the scale function",
215 dtype=FLOAT_DTYPES,
216 force_all_finite="allow-nan",
217 )
218 if sparse.issparse(X):
219 if with_mean:
220 raise ValueError(
221 "Cannot center sparse matrices: pass `with_mean=False` instead"
222 " See docstring for motivation and alternatives."
223 )
224 if axis != 0:
225 raise ValueError(
226 "Can only scale sparse matrix on axis=0, got axis=%d" % axis
227 )
228 if with_std:
229 _, var = mean_variance_axis(X, axis=0)
230 var = _handle_zeros_in_scale(var, copy=False)
231 inplace_column_scale(X, 1 / np.sqrt(var))
232 else:
233 X = np.asarray(X)
234 if with_mean:
235 mean_ = np.nanmean(X, axis)
236 if with_std:
237 scale_ = np.nanstd(X, axis)
238 # Xr is a view on the original array that enables easy use of
        # broadcasting on the axis in which we are interested
240 Xr = np.rollaxis(X, axis)
241 if with_mean:
242 Xr -= mean_
243 mean_1 = np.nanmean(Xr, axis=0)
244 # Verify that mean_1 is 'close to zero'. If X contains very
245 # large values, mean_1 can also be very large, due to a lack of
246 # precision of mean_. In this case, a pre-scaling of the
247 # concerned feature is efficient, for instance by its mean or
248 # maximum.
249 if not np.allclose(mean_1, 0):
250 warnings.warn(
251 "Numerical issues were encountered "
252 "when centering the data "
253 "and might not be solved. Dataset may "
254 "contain too large values. You may need "
255 "to prescale your features."
256 )
257 Xr -= mean_1
258 if with_std:
259 scale_ = _handle_zeros_in_scale(scale_, copy=False)
260 Xr /= scale_
261 if with_mean:
262 mean_2 = np.nanmean(Xr, axis=0)
263 # If mean_2 is not 'close to zero', it comes from the fact that
264 # scale_ is very small so that mean_2 = mean_1/scale_ > 0, even
265 # if mean_1 was close to zero. The problem is thus essentially
266 # due to the lack of precision of mean_. A solution is then to
267 # subtract the mean again:
268 if not np.allclose(mean_2, 0):
269 warnings.warn(
270 "Numerical issues were encountered "
271 "when scaling the data "
272 "and might not be solved. The standard "
273 "deviation of the data is probably "
274 "very close to 0. "
275 )
276 Xr -= mean_2
277 return X
278
279
280class MinMaxScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
281 """Transform features by scaling each feature to a given range.
282
283 This estimator scales and translates each feature individually such
284 that it is in the given range on the training set, e.g. between
285 zero and one.
286
287 The transformation is given by::
288
289 X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
290 X_scaled = X_std * (max - min) + min
291
292 where min, max = feature_range.
293
294 This transformation is often used as an alternative to zero mean,
295 unit variance scaling.
296
297 `MinMaxScaler` doesn't reduce the effect of outliers, but it linearly
298 scales them down into a fixed range, where the largest occurring data point
299 corresponds to the maximum value and the smallest one corresponds to the
300 minimum value. For an example visualization, refer to :ref:`Compare
301 MinMaxScaler with other scalers <plot_all_scaling_minmax_scaler_section>`.
302
303 Read more in the :ref:`User Guide <preprocessing_scaler>`.
304
305 Parameters
306 ----------
307 feature_range : tuple (min, max), default=(0, 1)
308 Desired range of transformed data.
309
310 copy : bool, default=True
311 Set to False to perform inplace row normalization and avoid a
312 copy (if the input is already a numpy array).
313
314 clip : bool, default=False
315 Set to True to clip transformed values of held-out data to
        provided `feature_range`.
317
318 .. versionadded:: 0.24
319
320 Attributes
321 ----------
322 min_ : ndarray of shape (n_features,)
323 Per feature adjustment for minimum. Equivalent to
324 ``min - X.min(axis=0) * self.scale_``
325
326 scale_ : ndarray of shape (n_features,)
327 Per feature relative scaling of the data. Equivalent to
328 ``(max - min) / (X.max(axis=0) - X.min(axis=0))``
329
330 .. versionadded:: 0.17
331 *scale_* attribute.
332
333 data_min_ : ndarray of shape (n_features,)
334 Per feature minimum seen in the data
335
336 .. versionadded:: 0.17
337 *data_min_*
338
339 data_max_ : ndarray of shape (n_features,)
340 Per feature maximum seen in the data
341
342 .. versionadded:: 0.17
343 *data_max_*
344
345 data_range_ : ndarray of shape (n_features,)
346 Per feature range ``(data_max_ - data_min_)`` seen in the data
347
348 .. versionadded:: 0.17
349 *data_range_*
350
351 n_features_in_ : int
352 Number of features seen during :term:`fit`.
353
354 .. versionadded:: 0.24
355
356 n_samples_seen_ : int
357 The number of samples processed by the estimator.
358 It will be reset on new calls to fit, but increments across
359 ``partial_fit`` calls.
360
361 feature_names_in_ : ndarray of shape (`n_features_in_`,)
362 Names of features seen during :term:`fit`. Defined only when `X`
363 has feature names that are all strings.
364
365 .. versionadded:: 1.0
366
367 See Also
368 --------
369 minmax_scale : Equivalent function without the estimator API.
370
371 Notes
372 -----
373 NaNs are treated as missing values: disregarded in fit, and maintained in
374 transform.
375
376 Examples
377 --------
378 >>> from sklearn.preprocessing import MinMaxScaler
379 >>> data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
380 >>> scaler = MinMaxScaler()
381 >>> print(scaler.fit(data))
382 MinMaxScaler()
383 >>> print(scaler.data_max_)
384 [ 1. 18.]
385 >>> print(scaler.transform(data))
386 [[0. 0. ]
387 [0.25 0.25]
388 [0.5 0.5 ]
389 [1. 1. ]]
390 >>> print(scaler.transform([[2, 2]]))
391 [[1.5 0. ]]
392 """
393
394 _parameter_constraints: dict = {
395 "feature_range": [tuple],
396 "copy": ["boolean"],
397 "clip": ["boolean"],
398 }
399
400 def __init__(self, feature_range=(0, 1), *, copy=True, clip=False):
401 self.feature_range = feature_range
402 self.copy = copy
403 self.clip = clip
404
405 def _reset(self):
406 """Reset internal data-dependent state of the scaler, if necessary.
407
408 __init__ parameters are not touched.
409 """
410 # Checking one attribute is enough, because they are all set together
411 # in partial_fit
412 if hasattr(self, "scale_"):
413 del self.scale_
414 del self.min_
415 del self.n_samples_seen_
416 del self.data_min_
417 del self.data_max_
418 del self.data_range_
419
420 def fit(self, X, y=None):
421 """Compute the minimum and maximum to be used for later scaling.
422
423 Parameters
424 ----------
425 X : array-like of shape (n_samples, n_features)
426 The data used to compute the per-feature minimum and maximum
427 used for later scaling along the features axis.
428
429 y : None
430 Ignored.
431
432 Returns
433 -------
434 self : object
435 Fitted scaler.
436 """
437 # Reset internal state before fitting
438 self._reset()
439 return self.partial_fit(X, y)
440
441 @_fit_context(prefer_skip_nested_validation=True)
442 def partial_fit(self, X, y=None):
443 """Online computation of min and max on X for later scaling.
444
445 All of X is processed as a single batch. This is intended for cases
        when :meth:`fit` is not feasible due to a very large number of
447 `n_samples` or because X is read from a continuous stream.
448
449 Parameters
450 ----------
451 X : array-like of shape (n_samples, n_features)
            The data used to compute the per-feature minimum and maximum
453 used for later scaling along the features axis.
454
455 y : None
456 Ignored.
457
458 Returns
459 -------
460 self : object
461 Fitted scaler.
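
        Examples
        --------
        For instance, the running per-feature minimum and maximum can be
        accumulated over two mini-batches and match a single full fit:

        >>> import numpy as np
        >>> from sklearn.preprocessing import MinMaxScaler
        >>> X = np.array([[-1.0, 2.0], [-0.5, 6.0], [0.0, 10.0], [1.0, 18.0]])
        >>> scaler = MinMaxScaler()
        >>> _ = scaler.partial_fit(X[:2]).partial_fit(X[2:])
        >>> scaler.data_min_
        array([-1.,  2.])
        >>> scaler.data_max_
        array([ 1., 18.])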
462 """
463 feature_range = self.feature_range
464 if feature_range[0] >= feature_range[1]:
465 raise ValueError(
466 "Minimum of desired feature range must be smaller than maximum. Got %s."
467 % str(feature_range)
468 )
469
470 if sparse.issparse(X):
471 raise TypeError(
472 "MinMaxScaler does not support sparse input. "
473 "Consider using MaxAbsScaler instead."
474 )
475
476 xp, _ = get_namespace(X)
477
478 first_pass = not hasattr(self, "n_samples_seen_")
479 X = self._validate_data(
480 X,
481 reset=first_pass,
482 dtype=_array_api.supported_float_dtypes(xp),
483 force_all_finite="allow-nan",
484 )
485
486 data_min = _array_api._nanmin(X, axis=0)
487 data_max = _array_api._nanmax(X, axis=0)
488
489 if first_pass:
490 self.n_samples_seen_ = X.shape[0]
491 else:
492 data_min = xp.minimum(self.data_min_, data_min)
493 data_max = xp.maximum(self.data_max_, data_max)
494 self.n_samples_seen_ += X.shape[0]
495
496 data_range = data_max - data_min
497 self.scale_ = (feature_range[1] - feature_range[0]) / _handle_zeros_in_scale(
498 data_range, copy=True
499 )
500 self.min_ = feature_range[0] - data_min * self.scale_
501 self.data_min_ = data_min
502 self.data_max_ = data_max
503 self.data_range_ = data_range
504 return self
505
506 def transform(self, X):
507 """Scale features of X according to feature_range.
508
509 Parameters
510 ----------
511 X : array-like of shape (n_samples, n_features)
512 Input data that will be transformed.
513
514 Returns
515 -------
516 Xt : ndarray of shape (n_samples, n_features)
517 Transformed data.
518 """
519 check_is_fitted(self)
520
521 xp, _ = get_namespace(X)
522
523 X = self._validate_data(
524 X,
525 copy=self.copy,
526 dtype=_array_api.supported_float_dtypes(xp),
527 force_all_finite="allow-nan",
528 reset=False,
529 )
530
531 X *= self.scale_
532 X += self.min_
533 if self.clip:
534 xp.clip(X, self.feature_range[0], self.feature_range[1], out=X)
535 return X
536
537 def inverse_transform(self, X):
538 """Undo the scaling of X according to feature_range.
539
540 Parameters
541 ----------
542 X : array-like of shape (n_samples, n_features)
543 Input data that will be transformed. It cannot be sparse.
544
545 Returns
546 -------
547 Xt : ndarray of shape (n_samples, n_features)
548 Transformed data.
549 """
550 check_is_fitted(self)
551
552 xp, _ = get_namespace(X)
553
554 X = check_array(
555 X,
556 copy=self.copy,
557 dtype=_array_api.supported_float_dtypes(xp),
558 force_all_finite="allow-nan",
559 )
560
561 X -= self.min_
562 X /= self.scale_
563 return X
564
565 def _more_tags(self):
566 return {"allow_nan": True}
567
568
569@validate_params(
570 {
571 "X": ["array-like"],
572 "axis": [Options(Integral, {0, 1})],
573 },
574 prefer_skip_nested_validation=False,
575)
576def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True):
577 """Transform features by scaling each feature to a given range.
578
579 This estimator scales and translates each feature individually such
    that it is in the given range on the training set, e.g. between
581 zero and one.
582
583 The transformation is given by (when ``axis=0``)::
584
585 X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
586 X_scaled = X_std * (max - min) + min
587
588 where min, max = feature_range.
589
590 The transformation is calculated as (when ``axis=0``)::
591
592 X_scaled = scale * X + min - X.min(axis=0) * scale
593 where scale = (max - min) / (X.max(axis=0) - X.min(axis=0))
594
595 This transformation is often used as an alternative to zero mean,
596 unit variance scaling.
597
598 Read more in the :ref:`User Guide <preprocessing_scaler>`.
599
600 .. versionadded:: 0.17
601 *minmax_scale* function interface
602 to :class:`~sklearn.preprocessing.MinMaxScaler`.
603
604 Parameters
605 ----------
606 X : array-like of shape (n_samples, n_features)
607 The data.
608
609 feature_range : tuple (min, max), default=(0, 1)
610 Desired range of transformed data.
611
612 axis : {0, 1}, default=0
613 Axis used to scale along. If 0, independently scale each feature,
614 otherwise (if 1) scale each sample.
615
616 copy : bool, default=True
617 If False, try to avoid a copy and scale in place.
618 This is not guaranteed to always work in place; e.g. if the data is
619 a numpy array with an int dtype, a copy will be returned even with
620 copy=False.
621
622 Returns
623 -------
624 X_tr : ndarray of shape (n_samples, n_features)
625 The transformed data.
626
627 .. warning:: Risk of data leak
628
629 Do not use :func:`~sklearn.preprocessing.minmax_scale` unless you know
630 what you are doing. A common mistake is to apply it to the entire data
631 *before* splitting into training and test sets. This will bias the
632 model evaluation because information would have leaked from the test
633 set to the training set.
634 In general, we recommend using
635 :class:`~sklearn.preprocessing.MinMaxScaler` within a
636 :ref:`Pipeline <pipeline>` in order to prevent most risks of data
637 leaking: `pipe = make_pipeline(MinMaxScaler(), LogisticRegression())`.
638
639 See Also
640 --------
641 MinMaxScaler : Performs scaling to a given range using the Transformer
642 API (e.g. as part of a preprocessing
643 :class:`~sklearn.pipeline.Pipeline`).
644
645 Notes
646 -----
647 For a comparison of the different scalers, transformers, and normalizers,
648 see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
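
    Examples
    --------
    For instance, each column of a small array can be mapped to the default
    [0, 1] range independently of the others:

    >>> from sklearn.preprocessing import minmax_scale
    >>> X = [[-2, 1, 2], [-1, 0, 1]]
    >>> minmax_scale(X, axis=0)
    array([[0., 1., 1.],
           [1., 0., 0.]])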
649 """
650 # Unlike the scaler object, this function allows 1d input.
651 # If copy is required, it will be done inside the scaler object.
652 X = check_array(
653 X, copy=False, ensure_2d=False, dtype=FLOAT_DTYPES, force_all_finite="allow-nan"
654 )
655 original_ndim = X.ndim
656
657 if original_ndim == 1:
658 X = X.reshape(X.shape[0], 1)
659
660 s = MinMaxScaler(feature_range=feature_range, copy=copy)
661 if axis == 0:
662 X = s.fit_transform(X)
663 else:
664 X = s.fit_transform(X.T).T
665
666 if original_ndim == 1:
667 X = X.ravel()
668
669 return X
670
671
672class StandardScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
673 """Standardize features by removing the mean and scaling to unit variance.
674
675 The standard score of a sample `x` is calculated as:
676
677 z = (x - u) / s
678
679 where `u` is the mean of the training samples or zero if `with_mean=False`,
680 and `s` is the standard deviation of the training samples or one if
681 `with_std=False`.
682
683 Centering and scaling happen independently on each feature by computing
684 the relevant statistics on the samples in the training set. Mean and
685 standard deviation are then stored to be used on later data using
686 :meth:`transform`.
687
688 Standardization of a dataset is a common requirement for many
689 machine learning estimators: they might behave badly if the
690 individual features do not more or less look like standard normally
691 distributed data (e.g. Gaussian with 0 mean and unit variance).
692
693 For instance many elements used in the objective function of
694 a learning algorithm (such as the RBF kernel of Support Vector
695 Machines or the L1 and L2 regularizers of linear models) assume that
696 all features are centered around 0 and have variance in the same
697 order. If a feature has a variance that is orders of magnitude larger
698 than others, it might dominate the objective function and make the
699 estimator unable to learn from other features correctly as expected.
700
701 `StandardScaler` is sensitive to outliers, and the features may scale
702 differently from each other in the presence of outliers. For an example
703 visualization, refer to :ref:`Compare StandardScaler with other scalers
704 <plot_all_scaling_standard_scaler_section>`.
705
706 This scaler can also be applied to sparse CSR or CSC matrices by passing
707 `with_mean=False` to avoid breaking the sparsity structure of the data.
708
709 Read more in the :ref:`User Guide <preprocessing_scaler>`.
710
711 Parameters
712 ----------
713 copy : bool, default=True
714 If False, try to avoid a copy and do inplace scaling instead.
715 This is not guaranteed to always work inplace; e.g. if the data is
716 not a NumPy array or scipy.sparse CSR matrix, a copy may still be
717 returned.
718
719 with_mean : bool, default=True
720 If True, center the data before scaling.
721 This does not work (and will raise an exception) when attempted on
722 sparse matrices, because centering them entails building a dense
723 matrix which in common use cases is likely to be too large to fit in
724 memory.
725
726 with_std : bool, default=True
727 If True, scale the data to unit variance (or equivalently,
728 unit standard deviation).
729
730 Attributes
731 ----------
732 scale_ : ndarray of shape (n_features,) or None
733 Per feature relative scaling of the data to achieve zero mean and unit
734 variance. Generally this is calculated using `np.sqrt(var_)`. If a
735 variance is zero, we can't achieve unit variance, and the data is left
736 as-is, giving a scaling factor of 1. `scale_` is equal to `None`
737 when `with_std=False`.
738
739 .. versionadded:: 0.17
740 *scale_*
741
742 mean_ : ndarray of shape (n_features,) or None
743 The mean value for each feature in the training set.
744 Equal to ``None`` when ``with_mean=False`` and ``with_std=False``.
745
746 var_ : ndarray of shape (n_features,) or None
747 The variance for each feature in the training set. Used to compute
748 `scale_`. Equal to ``None`` when ``with_mean=False`` and
749 ``with_std=False``.
750
751 n_features_in_ : int
752 Number of features seen during :term:`fit`.
753
754 .. versionadded:: 0.24
755
756 feature_names_in_ : ndarray of shape (`n_features_in_`,)
757 Names of features seen during :term:`fit`. Defined only when `X`
758 has feature names that are all strings.
759
760 .. versionadded:: 1.0
761
762 n_samples_seen_ : int or ndarray of shape (n_features,)
763 The number of samples processed by the estimator for each feature.
        If there are no missing samples, the ``n_samples_seen_`` will be an
        integer, otherwise it will be an array of dtype int. If
        `sample_weight` is used, it will be a float (if no missing data)
767 or an array of dtype float that sums the weights seen so far.
768 Will be reset on new calls to fit, but increments across
769 ``partial_fit`` calls.
770
771 See Also
772 --------
773 scale : Equivalent function without the estimator API.
774
775 :class:`~sklearn.decomposition.PCA` : Further removes the linear
776 correlation across features with 'whiten=True'.
777
778 Notes
779 -----
780 NaNs are treated as missing values: disregarded in fit, and maintained in
781 transform.
782
783 We use a biased estimator for the standard deviation, equivalent to
784 `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to
785 affect model performance.
786
787 Examples
788 --------
789 >>> from sklearn.preprocessing import StandardScaler
790 >>> data = [[0, 0], [0, 0], [1, 1], [1, 1]]
791 >>> scaler = StandardScaler()
792 >>> print(scaler.fit(data))
793 StandardScaler()
794 >>> print(scaler.mean_)
795 [0.5 0.5]
796 >>> print(scaler.transform(data))
797 [[-1. -1.]
798 [-1. -1.]
799 [ 1. 1.]
800 [ 1. 1.]]
801 >>> print(scaler.transform([[2, 2]]))
802 [[3. 3.]]
803 """
804
805 _parameter_constraints: dict = {
806 "copy": ["boolean"],
807 "with_mean": ["boolean"],
808 "with_std": ["boolean"],
809 }
810
811 def __init__(self, *, copy=True, with_mean=True, with_std=True):
812 self.with_mean = with_mean
813 self.with_std = with_std
814 self.copy = copy
815
816 def _reset(self):
817 """Reset internal data-dependent state of the scaler, if necessary.
818
819 __init__ parameters are not touched.
820 """
821 # Checking one attribute is enough, because they are all set together
822 # in partial_fit
823 if hasattr(self, "scale_"):
824 del self.scale_
825 del self.n_samples_seen_
826 del self.mean_
827 del self.var_
828
829 def fit(self, X, y=None, sample_weight=None):
830 """Compute the mean and std to be used for later scaling.
831
832 Parameters
833 ----------
834 X : {array-like, sparse matrix} of shape (n_samples, n_features)
835 The data used to compute the mean and standard deviation
836 used for later scaling along the features axis.
837
838 y : None
839 Ignored.
840
841 sample_weight : array-like of shape (n_samples,), default=None
842 Individual weights for each sample.
843
844 .. versionadded:: 0.24
845 parameter *sample_weight* support to StandardScaler.
846
847 Returns
848 -------
849 self : object
850 Fitted scaler.
851 """
852 # Reset internal state before fitting
853 self._reset()
854 return self.partial_fit(X, y, sample_weight)
855
856 @_fit_context(prefer_skip_nested_validation=True)
857 def partial_fit(self, X, y=None, sample_weight=None):
858 """Online computation of mean and std on X for later scaling.
859
860 All of X is processed as a single batch. This is intended for cases
        when :meth:`fit` is not feasible due to a very large number of
862 `n_samples` or because X is read from a continuous stream.
863
864 The algorithm for incremental mean and std is given in Equation 1.5a,b
865 in Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. "Algorithms
866 for computing the sample variance: Analysis and recommendations."
        The American Statistician 37.3 (1983): 242-247.
868
869 Parameters
870 ----------
871 X : {array-like, sparse matrix} of shape (n_samples, n_features)
872 The data used to compute the mean and standard deviation
873 used for later scaling along the features axis.
874
875 y : None
876 Ignored.
877
878 sample_weight : array-like of shape (n_samples,), default=None
879 Individual weights for each sample.
880
881 .. versionadded:: 0.24
882 parameter *sample_weight* support to StandardScaler.
883
884 Returns
885 -------
886 self : object
887 Fitted scaler.
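
        Examples
        --------
        For instance, the mean and variance accumulated over two mini-batches
        match the statistics of a single fit on the full data:

        >>> import numpy as np
        >>> from sklearn.preprocessing import StandardScaler
        >>> X = np.array([[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0]])
        >>> scaler = StandardScaler()
        >>> _ = scaler.partial_fit(X[:2]).partial_fit(X[2:])
        >>> scaler.mean_
        array([0.5, 0.5])
        >>> scaler.var_
        array([0.25, 0.25])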
888 """
889 first_call = not hasattr(self, "n_samples_seen_")
890 X = self._validate_data(
891 X,
892 accept_sparse=("csr", "csc"),
893 dtype=FLOAT_DTYPES,
894 force_all_finite="allow-nan",
895 reset=first_call,
896 )
897 n_features = X.shape[1]
898
899 if sample_weight is not None:
900 sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
901
902 # Even in the case of `with_mean=False`, we update the mean anyway
903 # This is needed for the incremental computation of the var
        # See incr_mean_variance_axis and _incremental_mean_and_var
905
906 # if n_samples_seen_ is an integer (i.e. no missing values), we need to
907 # transform it to a NumPy array of shape (n_features,) required by
        # incr_mean_variance_axis and _incremental_mean_and_var
909 dtype = np.int64 if sample_weight is None else X.dtype
910 if not hasattr(self, "n_samples_seen_"):
911 self.n_samples_seen_ = np.zeros(n_features, dtype=dtype)
912 elif np.size(self.n_samples_seen_) == 1:
913 self.n_samples_seen_ = np.repeat(self.n_samples_seen_, X.shape[1])
914 self.n_samples_seen_ = self.n_samples_seen_.astype(dtype, copy=False)
915
916 if sparse.issparse(X):
917 if self.with_mean:
918 raise ValueError(
919 "Cannot center sparse matrices: pass `with_mean=False` "
920 "instead. See docstring for motivation and alternatives."
921 )
922 sparse_constructor = (
923 sparse.csr_matrix if X.format == "csr" else sparse.csc_matrix
924 )
925
926 if self.with_std:
927 # First pass
928 if not hasattr(self, "scale_"):
929 self.mean_, self.var_, self.n_samples_seen_ = mean_variance_axis(
930 X, axis=0, weights=sample_weight, return_sum_weights=True
931 )
932 # Next passes
933 else:
934 (
935 self.mean_,
936 self.var_,
937 self.n_samples_seen_,
938 ) = incr_mean_variance_axis(
939 X,
940 axis=0,
941 last_mean=self.mean_,
942 last_var=self.var_,
943 last_n=self.n_samples_seen_,
944 weights=sample_weight,
945 )
946 # We force the mean and variance to float64 for large arrays
947 # See https://github.com/scikit-learn/scikit-learn/pull/12338
948 self.mean_ = self.mean_.astype(np.float64, copy=False)
949 self.var_ = self.var_.astype(np.float64, copy=False)
950 else:
951 self.mean_ = None # as with_mean must be False for sparse
952 self.var_ = None
953 weights = _check_sample_weight(sample_weight, X)
954 sum_weights_nan = weights @ sparse_constructor(
955 (np.isnan(X.data), X.indices, X.indptr), shape=X.shape
956 )
957 self.n_samples_seen_ += (np.sum(weights) - sum_weights_nan).astype(
958 dtype
959 )
960 else:
961 # First pass
962 if not hasattr(self, "scale_"):
963 self.mean_ = 0.0
964 if self.with_std:
965 self.var_ = 0.0
966 else:
967 self.var_ = None
968
969 if not self.with_mean and not self.with_std:
970 self.mean_ = None
971 self.var_ = None
972 self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0)
973
974 else:
975 self.mean_, self.var_, self.n_samples_seen_ = _incremental_mean_and_var(
976 X,
977 self.mean_,
978 self.var_,
979 self.n_samples_seen_,
980 sample_weight=sample_weight,
981 )
982
983 # for backward-compatibility, reduce n_samples_seen_ to an integer
984 # if the number of samples is the same for each feature (i.e. no
985 # missing values)
986 if np.ptp(self.n_samples_seen_) == 0:
987 self.n_samples_seen_ = self.n_samples_seen_[0]
988
989 if self.with_std:
990 # Extract the list of near constant features on the raw variances,
991 # before taking the square root.
992 constant_mask = _is_constant_feature(
993 self.var_, self.mean_, self.n_samples_seen_
994 )
995 self.scale_ = _handle_zeros_in_scale(
996 np.sqrt(self.var_), copy=False, constant_mask=constant_mask
997 )
998 else:
999 self.scale_ = None
1000
1001 return self
1002
1003 def transform(self, X, copy=None):
1004 """Perform standardization by centering and scaling.
1005
1006 Parameters
1007 ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
1009 The data used to scale along the features axis.
1010 copy : bool, default=None
1011 Copy the input X or not.
1012
1013 Returns
1014 -------
1015 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
1016 Transformed array.
1017 """
1018 check_is_fitted(self)
1019
1020 copy = copy if copy is not None else self.copy
1021 X = self._validate_data(
1022 X,
1023 reset=False,
1024 accept_sparse="csr",
1025 copy=copy,
1026 dtype=FLOAT_DTYPES,
1027 force_all_finite="allow-nan",
1028 )
1029
1030 if sparse.issparse(X):
1031 if self.with_mean:
1032 raise ValueError(
1033 "Cannot center sparse matrices: pass `with_mean=False` "
1034 "instead. See docstring for motivation and alternatives."
1035 )
1036 if self.scale_ is not None:
1037 inplace_column_scale(X, 1 / self.scale_)
1038 else:
1039 if self.with_mean:
1040 X -= self.mean_
1041 if self.with_std:
1042 X /= self.scale_
1043 return X
1044
1045 def inverse_transform(self, X, copy=None):
1046 """Scale back the data to the original representation.
1047
1048 Parameters
1049 ----------
1050 X : {array-like, sparse matrix} of shape (n_samples, n_features)
1051 The data used to scale along the features axis.
1052 copy : bool, default=None
1053 Copy the input X or not.
1054
1055 Returns
1056 -------
1057 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
1058 Transformed array.
1059 """
1060 check_is_fitted(self)
1061
1062 copy = copy if copy is not None else self.copy
1063 X = check_array(
1064 X,
1065 accept_sparse="csr",
1066 copy=copy,
1067 dtype=FLOAT_DTYPES,
1068 force_all_finite="allow-nan",
1069 )
1070
1071 if sparse.issparse(X):
1072 if self.with_mean:
1073 raise ValueError(
1074 "Cannot uncenter sparse matrices: pass `with_mean=False` "
1075 "instead See docstring for motivation and alternatives."
1076 )
1077 if self.scale_ is not None:
1078 inplace_column_scale(X, self.scale_)
1079 else:
1080 if self.with_std:
1081 X *= self.scale_
1082 if self.with_mean:
1083 X += self.mean_
1084 return X
1085
1086 def _more_tags(self):
1087 return {"allow_nan": True, "preserves_dtype": [np.float64, np.float32]}
1088
1089
1090class MaxAbsScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
1091 """Scale each feature by its maximum absolute value.
1092
    This estimator scales each feature individually such that the
    maximal absolute value of each feature in the training set will be 1.0.
    It does not shift/center the data, and thus does not destroy any sparsity.
1097
1098 This scaler can also be applied to sparse CSR or CSC matrices.
1099
1100 `MaxAbsScaler` doesn't reduce the effect of outliers; it only linearly
1101 scales them down. For an example visualization, refer to :ref:`Compare
1102 MaxAbsScaler with other scalers <plot_all_scaling_max_abs_scaler_section>`.
1103
1104 .. versionadded:: 0.17
1105
1106 Parameters
1107 ----------
1108 copy : bool, default=True
1109 Set to False to perform inplace scaling and avoid a copy (if the input
1110 is already a numpy array).
1111
1112 Attributes
1113 ----------
1114 scale_ : ndarray of shape (n_features,)
1115 Per feature relative scaling of the data.
1116
1117 .. versionadded:: 0.17
1118 *scale_* attribute.
1119
1120 max_abs_ : ndarray of shape (n_features,)
1121 Per feature maximum absolute value.
1122
1123 n_features_in_ : int
1124 Number of features seen during :term:`fit`.
1125
1126 .. versionadded:: 0.24
1127
1128 feature_names_in_ : ndarray of shape (`n_features_in_`,)
1129 Names of features seen during :term:`fit`. Defined only when `X`
1130 has feature names that are all strings.
1131
1132 .. versionadded:: 1.0
1133
1134 n_samples_seen_ : int
1135 The number of samples processed by the estimator. Will be reset on
1136 new calls to fit, but increments across ``partial_fit`` calls.
1137
1138 See Also
1139 --------
1140 maxabs_scale : Equivalent function without the estimator API.
1141
1142 Notes
1143 -----
1144 NaNs are treated as missing values: disregarded in fit, and maintained in
1145 transform.
1146
1147 Examples
1148 --------
1149 >>> from sklearn.preprocessing import MaxAbsScaler
1150 >>> X = [[ 1., -1., 2.],
1151 ... [ 2., 0., 0.],
1152 ... [ 0., 1., -1.]]
1153 >>> transformer = MaxAbsScaler().fit(X)
1154 >>> transformer
1155 MaxAbsScaler()
1156 >>> transformer.transform(X)
1157 array([[ 0.5, -1. , 1. ],
1158 [ 1. , 0. , 0. ],
1159 [ 0. , 1. , -0.5]])
1160 """
1161
1162 _parameter_constraints: dict = {"copy": ["boolean"]}
1163
1164 def __init__(self, *, copy=True):
1165 self.copy = copy
1166
1167 def _reset(self):
1168 """Reset internal data-dependent state of the scaler, if necessary.
1169
1170 __init__ parameters are not touched.
1171 """
1172 # Checking one attribute is enough, because they are all set together
1173 # in partial_fit
1174 if hasattr(self, "scale_"):
1175 del self.scale_
1176 del self.n_samples_seen_
1177 del self.max_abs_
1178
1179 def fit(self, X, y=None):
1180 """Compute the maximum absolute value to be used for later scaling.
1181
1182 Parameters
1183 ----------
1184 X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data used to compute the per-feature maximum absolute value
1186 used for later scaling along the features axis.
1187
1188 y : None
1189 Ignored.
1190
1191 Returns
1192 -------
1193 self : object
1194 Fitted scaler.
1195 """
1196 # Reset internal state before fitting
1197 self._reset()
1198 return self.partial_fit(X, y)
1199
1200 @_fit_context(prefer_skip_nested_validation=True)
1201 def partial_fit(self, X, y=None):
1202 """Online computation of max absolute value of X for later scaling.
1203
1204 All of X is processed as a single batch. This is intended for cases
        when :meth:`fit` is not feasible due to a very large number of
1206 `n_samples` or because X is read from a continuous stream.
1207
1208 Parameters
1209 ----------
1210 X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data used to compute the per-feature maximum absolute value
1212 used for later scaling along the features axis.
1213
1214 y : None
1215 Ignored.
1216
1217 Returns
1218 -------
1219 self : object
1220 Fitted scaler.
1221 """
1222 xp, _ = get_namespace(X)
1223
1224 first_pass = not hasattr(self, "n_samples_seen_")
1225 X = self._validate_data(
1226 X,
1227 reset=first_pass,
1228 accept_sparse=("csr", "csc"),
1229 dtype=_array_api.supported_float_dtypes(xp),
1230 force_all_finite="allow-nan",
1231 )
1232
1233 if sparse.issparse(X):
1234 mins, maxs = min_max_axis(X, axis=0, ignore_nan=True)
1235 max_abs = np.maximum(np.abs(mins), np.abs(maxs))
1236 else:
1237 max_abs = _array_api._nanmax(xp.abs(X), axis=0)
1238
1239 if first_pass:
1240 self.n_samples_seen_ = X.shape[0]
1241 else:
1242 max_abs = xp.maximum(self.max_abs_, max_abs)
1243 self.n_samples_seen_ += X.shape[0]
1244
1245 self.max_abs_ = max_abs
1246 self.scale_ = _handle_zeros_in_scale(max_abs, copy=True)
1247 return self
1248
1249 def transform(self, X):
1250 """Scale the data.
1251
1252 Parameters
1253 ----------
1254 X : {array-like, sparse matrix} of shape (n_samples, n_features)
1255 The data that should be scaled.
1256
1257 Returns
1258 -------
1259 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
1260 Transformed array.
1261 """
1262 check_is_fitted(self)
1263
1264 xp, _ = get_namespace(X)
1265
1266 X = self._validate_data(
1267 X,
1268 accept_sparse=("csr", "csc"),
1269 copy=self.copy,
1270 reset=False,
1271 dtype=_array_api.supported_float_dtypes(xp),
1272 force_all_finite="allow-nan",
1273 )
1274
1275 if sparse.issparse(X):
1276 inplace_column_scale(X, 1.0 / self.scale_)
1277 else:
1278 X /= self.scale_
1279 return X
1280
1281 def inverse_transform(self, X):
1282 """Scale back the data to the original representation.
1283
1284 Parameters
1285 ----------
1286 X : {array-like, sparse matrix} of shape (n_samples, n_features)
1287 The data that should be transformed back.
1288
1289 Returns
1290 -------
1291 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
1292 Transformed array.
1293 """
1294 check_is_fitted(self)
1295
1296 xp, _ = get_namespace(X)
1297
1298 X = check_array(
1299 X,
1300 accept_sparse=("csr", "csc"),
1301 copy=self.copy,
1302 dtype=_array_api.supported_float_dtypes(xp),
1303 force_all_finite="allow-nan",
1304 )
1305
1306 if sparse.issparse(X):
1307 inplace_column_scale(X, self.scale_)
1308 else:
1309 X *= self.scale_
1310 return X
1311
1312 def _more_tags(self):
1313 return {"allow_nan": True}
1314
1315
1316@validate_params(
1317 {
1318 "X": ["array-like", "sparse matrix"],
1319 "axis": [Options(Integral, {0, 1})],
1320 },
1321 prefer_skip_nested_validation=False,
1322)
1323def maxabs_scale(X, *, axis=0, copy=True):
1324 """Scale each feature to the [-1, 1] range without breaking the sparsity.
1325
1326 This estimator scales each feature individually such
1327 that the maximal absolute value of each feature in the
1328 training set will be 1.0.
1329
1330 This scaler can also be applied to sparse CSR or CSC matrices.
1331
1332 Parameters
1333 ----------
1334 X : {array-like, sparse matrix} of shape (n_samples, n_features)
1335 The data.
1336
1337 axis : {0, 1}, default=0
1338 Axis used to scale along. If 0, independently scale each feature,
1339 otherwise (if 1) scale each sample.
1340
1341 copy : bool, default=True
1342 If False, try to avoid a copy and scale in place.
1343 This is not guaranteed to always work in place; e.g. if the data is
1344 a numpy array with an int dtype, a copy will be returned even with
1345 copy=False.
1346
1347 Returns
1348 -------
1349 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
1350 The transformed data.
1351
1352 .. warning:: Risk of data leak
1353
1354 Do not use :func:`~sklearn.preprocessing.maxabs_scale` unless you know
1355 what you are doing. A common mistake is to apply it to the entire data
1356 *before* splitting into training and test sets. This will bias the
1357 model evaluation because information would have leaked from the test
1358 set to the training set.
1359 In general, we recommend using
1360 :class:`~sklearn.preprocessing.MaxAbsScaler` within a
1361 :ref:`Pipeline <pipeline>` in order to prevent most risks of data
1362 leaking: `pipe = make_pipeline(MaxAbsScaler(), LogisticRegression())`.
1363
1364 See Also
1365 --------
1366 MaxAbsScaler : Performs scaling to the [-1, 1] range using
1367 the Transformer API (e.g. as part of a preprocessing
1368 :class:`~sklearn.pipeline.Pipeline`).
1369
1370 Notes
1371 -----
1372 NaNs are treated as missing values: disregarded to compute the statistics,
1373 and maintained during the data transformation.
1374
1375 For a comparison of the different scalers, transformers, and normalizers,
1376 see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
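
    Examples
    --------
    For instance, each column of a small array can be divided by its maximum
    absolute value, preserving zeros and signs:

    >>> from sklearn.preprocessing import maxabs_scale
    >>> X = [[-2, 1, 2], [-1, 0, 1]]
    >>> maxabs_scale(X, axis=0)
    array([[-1. ,  1. ,  1. ],
           [-0.5,  0. ,  0.5]])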
1377 """
1378 # Unlike the scaler object, this function allows 1d input.
1379
1380 # If copy is required, it will be done inside the scaler object.
1381 X = check_array(
1382 X,
1383 accept_sparse=("csr", "csc"),
1384 copy=False,
1385 ensure_2d=False,
1386 dtype=FLOAT_DTYPES,
1387 force_all_finite="allow-nan",
1388 )
1389 original_ndim = X.ndim
1390
1391 if original_ndim == 1:
1392 X = X.reshape(X.shape[0], 1)
1393
1394 s = MaxAbsScaler(copy=copy)
1395 if axis == 0:
1396 X = s.fit_transform(X)
1397 else:
1398 X = s.fit_transform(X.T).T
1399
1400 if original_ndim == 1:
1401 X = X.ravel()
1402
1403 return X
1404
1405
1406class RobustScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
1407 """Scale features using statistics that are robust to outliers.
1408
1409 This Scaler removes the median and scales the data according to
1410 the quantile range (defaults to IQR: Interquartile Range).
1411 The IQR is the range between the 1st quartile (25th quantile)
1412 and the 3rd quartile (75th quantile).
1413
1414 Centering and scaling happen independently on each feature by
1415 computing the relevant statistics on the samples in the training
1416 set. Median and interquartile range are then stored to be used on
1417 later data using the :meth:`transform` method.
1418
1419 Standardization of a dataset is a common preprocessing for many machine
1420 learning estimators. Typically this is done by removing the mean and
1421 scaling to unit variance. However, outliers can often influence the sample
1422 mean / variance in a negative way. In such cases, using the median and the
    interquartile range often gives better results. For an example visualization
1424 and comparison to other scalers, refer to :ref:`Compare RobustScaler with
1425 other scalers <plot_all_scaling_robust_scaler_section>`.
1426
1427 .. versionadded:: 0.17
1428
1429 Read more in the :ref:`User Guide <preprocessing_scaler>`.
1430
1431 Parameters
1432 ----------
1433 with_centering : bool, default=True
1434 If `True`, center the data before scaling.
1435 This will cause :meth:`transform` to raise an exception when attempted
1436 on sparse matrices, because centering them entails building a dense
1437 matrix which in common use cases is likely to be too large to fit in
1438 memory.
1439
1440 with_scaling : bool, default=True
1441 If `True`, scale the data to interquartile range.
1442
1443 quantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0, \
1444 default=(25.0, 75.0)
1445 Quantile range used to calculate `scale_`. By default this is equal to
1446 the IQR, i.e., `q_min` is the first quantile and `q_max` is the third
1447 quantile.
1448
1449 .. versionadded:: 0.18
1450
1451 copy : bool, default=True
1452 If `False`, try to avoid a copy and do inplace scaling instead.
1453 This is not guaranteed to always work inplace; e.g. if the data is
1454 not a NumPy array or scipy.sparse CSR matrix, a copy may still be
1455 returned.
1456
1457 unit_variance : bool, default=False
1458 If `True`, scale data so that normally distributed features have a
1459 variance of 1. In general, if the difference between the x-values of
1460 `q_max` and `q_min` for a standard normal distribution is greater
1461 than 1, the dataset will be scaled down. If less than 1, the dataset
1462 will be scaled up.
1463
1464 .. versionadded:: 0.24
1465
1466 Attributes
1467 ----------
1468 center_ : array of floats
1469 The median value for each feature in the training set.
1470
1471 scale_ : array of floats
1472 The (scaled) interquartile range for each feature in the training set.
1473
1474 .. versionadded:: 0.17
1475 *scale_* attribute.
1476
1477 n_features_in_ : int
1478 Number of features seen during :term:`fit`.
1479
1480 .. versionadded:: 0.24
1481
1482 feature_names_in_ : ndarray of shape (`n_features_in_`,)
1483 Names of features seen during :term:`fit`. Defined only when `X`
1484 has feature names that are all strings.
1485
1486 .. versionadded:: 1.0
1487
1488 See Also
1489 --------
1490 robust_scale : Equivalent function without the estimator API.
1491 sklearn.decomposition.PCA : Further removes the linear correlation across
1492 features with 'whiten=True'.
1493
1494 Notes
1495 -----
1496
1497 https://en.wikipedia.org/wiki/Median
1498 https://en.wikipedia.org/wiki/Interquartile_range
1499
1500 Examples
1501 --------
1502 >>> from sklearn.preprocessing import RobustScaler
1503 >>> X = [[ 1., -2., 2.],
1504 ... [ -2., 1., 3.],
1505 ... [ 4., 1., -2.]]
1506 >>> transformer = RobustScaler().fit(X)
1507 >>> transformer
1508 RobustScaler()
1509 >>> transformer.transform(X)
1510 array([[ 0. , -2. , 0. ],
1511 [-1. , 0. , 0.4],
1512 [ 1. , 0. , -1.6]])
1513 """
1514
1515 _parameter_constraints: dict = {
1516 "with_centering": ["boolean"],
1517 "with_scaling": ["boolean"],
1518 "quantile_range": [tuple],
1519 "copy": ["boolean"],
1520 "unit_variance": ["boolean"],
1521 }
1522
1523 def __init__(
1524 self,
1525 *,
1526 with_centering=True,
1527 with_scaling=True,
1528 quantile_range=(25.0, 75.0),
1529 copy=True,
1530 unit_variance=False,
1531 ):
1532 self.with_centering = with_centering
1533 self.with_scaling = with_scaling
1534 self.quantile_range = quantile_range
1535 self.unit_variance = unit_variance
1536 self.copy = copy
1537
1538 @_fit_context(prefer_skip_nested_validation=True)
1539 def fit(self, X, y=None):
1540 """Compute the median and quantiles to be used for scaling.
1541
1542 Parameters
1543 ----------
1544 X : {array-like, sparse matrix} of shape (n_samples, n_features)
1545 The data used to compute the median and quantiles
1546 used for later scaling along the features axis.
1547
1548 y : Ignored
1549 Not used, present here for API consistency by convention.
1550
1551 Returns
1552 -------
1553 self : object
1554 Fitted scaler.
1555 """
1556 # at fit, convert sparse matrices to csc for optimized computation of
1557 # the quantiles
1558 X = self._validate_data(
1559 X,
1560 accept_sparse="csc",
1561 dtype=FLOAT_DTYPES,
1562 force_all_finite="allow-nan",
1563 )
1564
1565 q_min, q_max = self.quantile_range
1566 if not 0 <= q_min <= q_max <= 100:
1567 raise ValueError("Invalid quantile range: %s" % str(self.quantile_range))
1568
1569 if self.with_centering:
1570 if sparse.issparse(X):
1571 raise ValueError(
1572 "Cannot center sparse matrices: use `with_centering=False`"
1573 " instead. See docstring for motivation and alternatives."
1574 )
1575 self.center_ = np.nanmedian(X, axis=0)
1576 else:
1577 self.center_ = None
1578
1579 if self.with_scaling:
1580 quantiles = []
1581 for feature_idx in range(X.shape[1]):
1582 if sparse.issparse(X):
1583 column_nnz_data = X.data[
1584 X.indptr[feature_idx] : X.indptr[feature_idx + 1]
1585 ]
1586 column_data = np.zeros(shape=X.shape[0], dtype=X.dtype)
1587 column_data[: len(column_nnz_data)] = column_nnz_data
1588 else:
1589 column_data = X[:, feature_idx]
1590
1591 quantiles.append(np.nanpercentile(column_data, self.quantile_range))
1592
1593 quantiles = np.transpose(quantiles)
1594
1595 self.scale_ = quantiles[1] - quantiles[0]
1596 self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)
1597 if self.unit_variance:
1598 adjust = stats.norm.ppf(q_max / 100.0) - stats.norm.ppf(q_min / 100.0)
1599 self.scale_ = self.scale_ / adjust
1600 else:
1601 self.scale_ = None
1602
1603 return self
1604
1605 def transform(self, X):
1606 """Center and scale the data.
1607
1608 Parameters
1609 ----------
1610 X : {array-like, sparse matrix} of shape (n_samples, n_features)
1611 The data used to scale along the specified axis.
1612
1613 Returns
1614 -------
1615 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
1616 Transformed array.
1617 """
1618 check_is_fitted(self)
1619 X = self._validate_data(
1620 X,
1621 accept_sparse=("csr", "csc"),
1622 copy=self.copy,
1623 dtype=FLOAT_DTYPES,
1624 reset=False,
1625 force_all_finite="allow-nan",
1626 )
1627
1628 if sparse.issparse(X):
1629 if self.with_scaling:
1630 inplace_column_scale(X, 1.0 / self.scale_)
1631 else:
1632 if self.with_centering:
1633 X -= self.center_
1634 if self.with_scaling:
1635 X /= self.scale_
1636 return X
1637
1638 def inverse_transform(self, X):
1639 """Scale back the data to the original representation.
1640
1641 Parameters
1642 ----------
1643 X : {array-like, sparse matrix} of shape (n_samples, n_features)
1644 The rescaled data to be transformed back.
1645
1646 Returns
1647 -------
1648 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
1649 Transformed array.
1650 """
1651 check_is_fitted(self)
1652 X = check_array(
1653 X,
1654 accept_sparse=("csr", "csc"),
1655 copy=self.copy,
1656 dtype=FLOAT_DTYPES,
1657 force_all_finite="allow-nan",
1658 )
1659
1660 if sparse.issparse(X):
1661 if self.with_scaling:
1662 inplace_column_scale(X, self.scale_)
1663 else:
1664 if self.with_scaling:
1665 X *= self.scale_
1666 if self.with_centering:
1667 X += self.center_
1668 return X
1669
1670 def _more_tags(self):
1671 return {"allow_nan": True}
1672
1673
1674@validate_params(
1675 {"X": ["array-like", "sparse matrix"], "axis": [Options(Integral, {0, 1})]},
1676 prefer_skip_nested_validation=False,
1677)
1678def robust_scale(
1679 X,
1680 *,
1681 axis=0,
1682 with_centering=True,
1683 with_scaling=True,
1684 quantile_range=(25.0, 75.0),
1685 copy=True,
1686 unit_variance=False,
1687):
1688 """Standardize a dataset along any axis.
1689
1690 Center to the median and component wise scale
1691 according to the interquartile range.
1692
1693 Read more in the :ref:`User Guide <preprocessing_scaler>`.
1694
1695 Parameters
1696 ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
1698 The data to center and scale.
1699
    axis : {0, 1}, default=0
1701 Axis used to compute the medians and IQR along. If 0,
1702 independently scale each feature, otherwise (if 1) scale
1703 each sample.
1704
1705 with_centering : bool, default=True
1706 If `True`, center the data before scaling.
1707
1708 with_scaling : bool, default=True
1709 If `True`, scale the data to unit variance (or equivalently,
1710 unit standard deviation).
1711
1712 quantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0,\
1713 default=(25.0, 75.0)
1714 Quantile range used to calculate `scale_`. By default this is equal to
1715 the IQR, i.e., `q_min` is the first quantile and `q_max` is the third
1716 quantile.
1717
1718 .. versionadded:: 0.18
1719
1720 copy : bool, default=True
1721 If False, try to avoid a copy and scale in place.
1722 This is not guaranteed to always work in place; e.g. if the data is
1723 a numpy array with an int dtype, a copy will be returned even with
1724 copy=False.
1725
1726 unit_variance : bool, default=False
1727 If `True`, scale data so that normally distributed features have a
1728 variance of 1. In general, if the difference between the x-values of
1729 `q_max` and `q_min` for a standard normal distribution is greater
1730 than 1, the dataset will be scaled down. If less than 1, the dataset
1731 will be scaled up.
1732
1733 .. versionadded:: 0.24
1734
1735 Returns
1736 -------
1737 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
1738 The transformed data.
1739
1740 See Also
1741 --------
1742 RobustScaler : Performs centering and scaling using the Transformer API
1743 (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`).
1744
1745 Notes
1746 -----
1747 This implementation will refuse to center scipy.sparse matrices
1748 since it would make them non-sparse and would potentially crash the
1749 program with memory exhaustion problems.
1750
1751 Instead the caller is expected to either set explicitly
1752 `with_centering=False` (in that case, only variance scaling will be
1753 performed on the features of the CSR matrix) or to call `X.toarray()`
    if the materialized dense array is expected to fit in memory.
1755
1756 To avoid memory copy the caller should pass a CSR matrix.
1757
1758 For a comparison of the different scalers, transformers, and normalizers,
1759 see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
1760
1761 .. warning:: Risk of data leak
1762
1763 Do not use :func:`~sklearn.preprocessing.robust_scale` unless you know
1764 what you are doing. A common mistake is to apply it to the entire data
1765 *before* splitting into training and test sets. This will bias the
1766 model evaluation because information would have leaked from the test
1767 set to the training set.
1768 In general, we recommend using
1769 :class:`~sklearn.preprocessing.RobustScaler` within a
1770 :ref:`Pipeline <pipeline>` in order to prevent most risks of data
1771 leaking: `pipe = make_pipeline(RobustScaler(), LogisticRegression())`.
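
    Examples
    --------
    For instance, the same small array used in the :class:`RobustScaler`
    example gives the same result through the functional interface:

    >>> from sklearn.preprocessing import robust_scale
    >>> X = [[1.0, -2.0, 2.0], [-2.0, 1.0, 3.0], [4.0, 1.0, -2.0]]
    >>> robust_scale(X, axis=0)
    array([[ 0. , -2. ,  0. ],
           [-1. ,  0. ,  0.4],
           [ 1. ,  0. , -1.6]])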
1772 """
1773 X = check_array(
1774 X,
1775 accept_sparse=("csr", "csc"),
1776 copy=False,
1777 ensure_2d=False,
1778 dtype=FLOAT_DTYPES,
1779 force_all_finite="allow-nan",
1780 )
1781 original_ndim = X.ndim
1782
1783 if original_ndim == 1:
1784 X = X.reshape(X.shape[0], 1)
1785
1786 s = RobustScaler(
1787 with_centering=with_centering,
1788 with_scaling=with_scaling,
1789 quantile_range=quantile_range,
1790 unit_variance=unit_variance,
1791 copy=copy,
1792 )
1793 if axis == 0:
1794 X = s.fit_transform(X)
1795 else:
1796 X = s.fit_transform(X.T).T
1797
1798 if original_ndim == 1:
1799 X = X.ravel()
1800
1801 return X
1802
1803
1804@validate_params(
1805 {
1806 "X": ["array-like", "sparse matrix"],
1807 "norm": [StrOptions({"l1", "l2", "max"})],
1808 "axis": [Options(Integral, {0, 1})],
1809 "copy": ["boolean"],
1810 "return_norm": ["boolean"],
1811 },
1812 prefer_skip_nested_validation=True,
1813)
1814def normalize(X, norm="l2", *, axis=1, copy=True, return_norm=False):
1815 """Scale input vectors individually to unit norm (vector length).
1816
1817 Read more in the :ref:`User Guide <preprocessing_normalization>`.
1818
1819 Parameters
1820 ----------
1821 X : {array-like, sparse matrix} of shape (n_samples, n_features)
1822 The data to normalize, element by element.
1823 scipy.sparse matrices should be in CSR format to avoid an
        unnecessary copy.
1825
1826 norm : {'l1', 'l2', 'max'}, default='l2'
        The norm to use to normalize each non-zero sample (or each non-zero
        feature if axis is 0).
1829
1830 axis : {0, 1}, default=1
1831 Define axis used to normalize the data along. If 1, independently
1832 normalize each sample, otherwise (if 0) normalize each feature.
1833
1834 copy : bool, default=True
1835 If False, try to avoid a copy and normalize in place.
1836 This is not guaranteed to always work in place; e.g. if the data is
1837 a numpy array with an int dtype, a copy will be returned even with
1838 copy=False.
1839
1840 return_norm : bool, default=False
1841 Whether to return the computed norms.
1842
1843 Returns
1844 -------
1845 X : {ndarray, sparse matrix} of shape (n_samples, n_features)
1846 Normalized input X.
1847
1848 norms : ndarray of shape (n_samples, ) if axis=1 else (n_features, )
1849 An array of norms along given axis for X.
1850 When X is sparse, a NotImplementedError will be raised
1851 for norm 'l1' or 'l2'.
1852
1853 See Also
1854 --------
1855 Normalizer : Performs normalization using the Transformer API
1856 (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`).
1857
1858 Notes
1859 -----
1860 For a comparison of the different scalers, transformers, and normalizers,
1861 see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
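
    Examples
    --------
    A minimal illustration, reusing the data of the :class:`Normalizer`
    example; each row is divided by its l2 norm:

    >>> from sklearn.preprocessing import normalize
    >>> X = [[4, 1, 2, 2],
    ...      [1, 3, 9, 3],
    ...      [5, 7, 5, 1]]
    >>> normalize(X)
    array([[0.8, 0.2, 0.4, 0.4],
           [0.1, 0.3, 0.9, 0.3],
           [0.5, 0.7, 0.5, 0.1]])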
1862 """
1863 if axis == 0:
1864 sparse_format = "csc"
1865 else: # axis == 1:
1866 sparse_format = "csr"
1867
1868 xp, _ = get_namespace(X)
1869
1870 X = check_array(
1871 X,
1872 accept_sparse=sparse_format,
1873 copy=copy,
1874 estimator="the normalize function",
1875 dtype=_array_api.supported_float_dtypes(xp),
1876 )
1877 if axis == 0:
1878 X = X.T
1879
1880 if sparse.issparse(X):
1881 if return_norm and norm in ("l1", "l2"):
1882 raise NotImplementedError(
1883 "return_norm=True is not implemented "
1884 "for sparse matrices with norm 'l1' "
1885 "or norm 'l2'"
1886 )
1887 if norm == "l1":
1888 inplace_csr_row_normalize_l1(X)
1889 elif norm == "l2":
1890 inplace_csr_row_normalize_l2(X)
1891 elif norm == "max":
1892 mins, maxes = min_max_axis(X, 1)
1893 norms = np.maximum(abs(mins), maxes)
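            # Broadcast each row's max-abs norm to that row's stored entries;
            # np.diff(X.indptr) is the number of non-zeros per CSR row.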
1894 norms_elementwise = norms.repeat(np.diff(X.indptr))
1895 mask = norms_elementwise != 0
1896 X.data[mask] /= norms_elementwise[mask]
1897 else:
1898 if norm == "l1":
1899 norms = xp.sum(xp.abs(X), axis=1)
1900 elif norm == "l2":
1901 norms = row_norms(X)
1902 elif norm == "max":
1903 norms = xp.max(xp.abs(X), axis=1)
1904 norms = _handle_zeros_in_scale(norms, copy=False)
1905 X /= norms[:, None]
1906
1907 if axis == 0:
1908 X = X.T
1909
1910 if return_norm:
1911 return X, norms
1912 else:
1913 return X
1914
1915
1916class Normalizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
1917 """Normalize samples individually to unit norm.
1918
1919 Each sample (i.e. each row of the data matrix) with at least one
    non-zero component is rescaled independently of other samples so
1921 that its norm (l1, l2 or inf) equals one.
1922
1923 This transformer is able to work both with dense numpy arrays and
1924 scipy.sparse matrix (use CSR format if you want to avoid the burden of
1925 a copy / conversion).
1926
1927 Scaling inputs to unit norms is a common operation for text
    classification or clustering. For instance, the dot
1929 product of two l2-normalized TF-IDF vectors is the cosine similarity
1930 of the vectors and is the base similarity metric for the Vector
1931 Space Model commonly used by the Information Retrieval community.
1932
1933 For an example visualization, refer to :ref:`Compare Normalizer with other
1934 scalers <plot_all_scaling_normalizer_section>`.
1935
1936 Read more in the :ref:`User Guide <preprocessing_normalization>`.
1937
1938 Parameters
1939 ----------
1940 norm : {'l1', 'l2', 'max'}, default='l2'
        The norm to use to normalize each non-zero sample. If norm='max'
1942 is used, values will be rescaled by the maximum of the absolute
1943 values.
1944
1945 copy : bool, default=True
1946 Set to False to perform inplace row normalization and avoid a
1947 copy (if the input is already a numpy array or a scipy.sparse
1948 CSR matrix).
1949
1950 Attributes
1951 ----------
1952 n_features_in_ : int
1953 Number of features seen during :term:`fit`.
1954
1955 .. versionadded:: 0.24
1956
1957 feature_names_in_ : ndarray of shape (`n_features_in_`,)
1958 Names of features seen during :term:`fit`. Defined only when `X`
1959 has feature names that are all strings.
1960
1961 .. versionadded:: 1.0
1962
1963 See Also
1964 --------
1965 normalize : Equivalent function without the estimator API.
1966
1967 Notes
1968 -----
1969 This estimator is :term:`stateless` and does not need to be fitted.
    However, we recommend calling :meth:`fit_transform` instead of
1971 :meth:`transform`, as parameter validation is only performed in
1972 :meth:`fit`.
1973
1974 Examples
1975 --------
1976 >>> from sklearn.preprocessing import Normalizer
1977 >>> X = [[4, 1, 2, 2],
1978 ... [1, 3, 9, 3],
1979 ... [5, 7, 5, 1]]
1980 >>> transformer = Normalizer().fit(X) # fit does nothing.
1981 >>> transformer
1982 Normalizer()
1983 >>> transformer.transform(X)
1984 array([[0.8, 0.2, 0.4, 0.4],
1985 [0.1, 0.3, 0.9, 0.3],
1986 [0.5, 0.7, 0.5, 0.1]])
1987 """
1988
1989 _parameter_constraints: dict = {
1990 "norm": [StrOptions({"l1", "l2", "max"})],
1991 "copy": ["boolean"],
1992 }
1993
1994 def __init__(self, norm="l2", *, copy=True):
1995 self.norm = norm
1996 self.copy = copy
1997
1998 @_fit_context(prefer_skip_nested_validation=True)
1999 def fit(self, X, y=None):
2000 """Only validates estimator's parameters.
2001
        This method exists to (i) validate the estimator's parameters and
2003 (ii) be consistent with the scikit-learn transformer API.
2004
2005 Parameters
2006 ----------
2007 X : {array-like, sparse matrix} of shape (n_samples, n_features)
2008 The data to estimate the normalization parameters.
2009
2010 y : Ignored
2011 Not used, present here for API consistency by convention.
2012
2013 Returns
2014 -------
2015 self : object
2016 Fitted transformer.
2017 """
2018 self._validate_data(X, accept_sparse="csr")
2019 return self
2020
2021 def transform(self, X, copy=None):
        """Scale each non-zero row of X to unit norm.
2023
2024 Parameters
2025 ----------
2026 X : {array-like, sparse matrix} of shape (n_samples, n_features)
2027 The data to normalize, row by row. scipy.sparse matrices should be
            in CSR format to avoid an unnecessary copy.
2029
2030 copy : bool, default=None
2031 Copy the input X or not.
2032
2033 Returns
2034 -------
2035 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
2036 Transformed array.
2037 """
2038 copy = copy if copy is not None else self.copy
2039 X = self._validate_data(X, accept_sparse="csr", reset=False)
2040 return normalize(X, norm=self.norm, axis=1, copy=copy)
2041
2042 def _more_tags(self):
2043 return {"stateless": True, "array_api_support": True}
2044
2045
2046@validate_params(
2047 {
2048 "X": ["array-like", "sparse matrix"],
2049 "threshold": [Interval(Real, None, None, closed="neither")],
2050 "copy": ["boolean"],
2051 },
2052 prefer_skip_nested_validation=True,
2053)
2054def binarize(X, *, threshold=0.0, copy=True):
2055 """Boolean thresholding of array-like or scipy.sparse matrix.
2056
2057 Read more in the :ref:`User Guide <preprocessing_binarization>`.
2058
2059 Parameters
2060 ----------
2061 X : {array-like, sparse matrix} of shape (n_samples, n_features)
2062 The data to binarize, element by element.
2063 scipy.sparse matrices should be in CSR or CSC format to avoid an
        unnecessary copy.
2065
2066 threshold : float, default=0.0
2067 Feature values below or equal to this are replaced by 0, above it by 1.
2068 Threshold may not be less than 0 for operations on sparse matrices.
2069
2070 copy : bool, default=True
2071 If False, try to avoid a copy and binarize in place.
2072 This is not guaranteed to always work in place; e.g. if the data is
2073 a numpy array with an object dtype, a copy will be returned even with
2074 copy=False.
2075
2076 Returns
2077 -------
2078 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
2079 The transformed data.
2080
2081 See Also
2082 --------
2083 Binarizer : Performs binarization using the Transformer API
2084 (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`).
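
    Examples
    --------
    A small sketch; values strictly greater than ``threshold`` map to 1,
    all other values map to 0:

    >>> from sklearn.preprocessing import binarize
    >>> X = [[0.4, 0.6, 0.5], [0.6, 0.1, 0.2]]
    >>> binarize(X, threshold=0.5)
    array([[0., 1., 0.],
           [1., 0., 0.]])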
2085 """
2086 X = check_array(X, accept_sparse=["csr", "csc"], copy=copy)
2087 if sparse.issparse(X):
2088 if threshold < 0:
2089 raise ValueError("Cannot binarize a sparse matrix with threshold < 0")
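        # Only the explicitly stored entries are thresholded: implicit zeros
        # stay zero (hence the threshold >= 0 requirement above) and entries
        # set to 0 are pruned from the sparse structure by eliminate_zeros().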
2090 cond = X.data > threshold
2091 not_cond = np.logical_not(cond)
2092 X.data[cond] = 1
2093 X.data[not_cond] = 0
2094 X.eliminate_zeros()
2095 else:
2096 cond = X > threshold
2097 not_cond = np.logical_not(cond)
2098 X[cond] = 1
2099 X[not_cond] = 0
2100 return X
2101
2102
2103class Binarizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
2104 """Binarize data (set feature values to 0 or 1) according to a threshold.
2105
2106 Values greater than the threshold map to 1, while values less than
2107 or equal to the threshold map to 0. With the default threshold of 0,
2108 only positive values map to 1.
2109
2110 Binarization is a common operation on text count data where the
2111 analyst can decide to only consider the presence or absence of a
    feature rather than, for instance, a quantified number of occurrences.
2113
2114 It can also be used as a pre-processing step for estimators that
2115 consider boolean random variables (e.g. modelled using the Bernoulli
2116 distribution in a Bayesian setting).
2117
2118 Read more in the :ref:`User Guide <preprocessing_binarization>`.
2119
2120 Parameters
2121 ----------
2122 threshold : float, default=0.0
2123 Feature values below or equal to this are replaced by 0, above it by 1.
2124 Threshold may not be less than 0 for operations on sparse matrices.
2125
2126 copy : bool, default=True
2127 Set to False to perform inplace binarization and avoid a copy (if
2128 the input is already a numpy array or a scipy.sparse CSR matrix).
2129
2130 Attributes
2131 ----------
2132 n_features_in_ : int
2133 Number of features seen during :term:`fit`.
2134
2135 .. versionadded:: 0.24
2136
2137 feature_names_in_ : ndarray of shape (`n_features_in_`,)
2138 Names of features seen during :term:`fit`. Defined only when `X`
2139 has feature names that are all strings.
2140
2141 .. versionadded:: 1.0
2142
2143 See Also
2144 --------
2145 binarize : Equivalent function without the estimator API.
2146 KBinsDiscretizer : Bin continuous data into intervals.
2147 OneHotEncoder : Encode categorical features as a one-hot numeric array.
2148
2149 Notes
2150 -----
    If the input is a sparse matrix, only the non-zero values are updated
    by the :class:`Binarizer` class.
2153
2154 This estimator is :term:`stateless` and does not need to be fitted.
    However, we recommend calling :meth:`fit_transform` instead of
2156 :meth:`transform`, as parameter validation is only performed in
2157 :meth:`fit`.
2158
2159 Examples
2160 --------
2161 >>> from sklearn.preprocessing import Binarizer
2162 >>> X = [[ 1., -1., 2.],
2163 ... [ 2., 0., 0.],
2164 ... [ 0., 1., -1.]]
2165 >>> transformer = Binarizer().fit(X) # fit does nothing.
2166 >>> transformer
2167 Binarizer()
2168 >>> transformer.transform(X)
2169 array([[1., 0., 1.],
2170 [1., 0., 0.],
2171 [0., 1., 0.]])
2172 """
2173
2174 _parameter_constraints: dict = {
2175 "threshold": [Real],
2176 "copy": ["boolean"],
2177 }
2178
2179 def __init__(self, *, threshold=0.0, copy=True):
2180 self.threshold = threshold
2181 self.copy = copy
2182
2183 @_fit_context(prefer_skip_nested_validation=True)
2184 def fit(self, X, y=None):
2185 """Only validates estimator's parameters.
2186
        This method exists to (i) validate the estimator's parameters and
2188 (ii) be consistent with the scikit-learn transformer API.
2189
2190 Parameters
2191 ----------
2192 X : {array-like, sparse matrix} of shape (n_samples, n_features)
2193 The data.
2194
2195 y : None
2196 Ignored.
2197
2198 Returns
2199 -------
2200 self : object
2201 Fitted transformer.
2202 """
2203 self._validate_data(X, accept_sparse="csr")
2204 return self
2205
2206 def transform(self, X, copy=None):
2207 """Binarize each element of X.
2208
2209 Parameters
2210 ----------
2211 X : {array-like, sparse matrix} of shape (n_samples, n_features)
2212 The data to binarize, element by element.
2213 scipy.sparse matrices should be in CSR format to avoid an
            unnecessary copy.
2215
        copy : bool, default=None
2217 Copy the input X or not.
2218
2219 Returns
2220 -------
2221 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
2222 Transformed array.
2223 """
2224 copy = copy if copy is not None else self.copy
2225 # TODO: This should be refactored because binarize also calls
2226 # check_array
2227 X = self._validate_data(X, accept_sparse=["csr", "csc"], copy=copy, reset=False)
2228 return binarize(X, threshold=self.threshold, copy=False)
2229
2230 def _more_tags(self):
2231 return {"stateless": True}
2232
2233
2234class KernelCenterer(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
2235 r"""Center an arbitrary kernel matrix :math:`K`.
2236
    Let us define a kernel :math:`K` such that:
2238
2239 .. math::
2240 K(X, Y) = \phi(X) . \phi(Y)^{T}
2241
2242 :math:`\phi(X)` is a function mapping of rows of :math:`X` to a
2243 Hilbert space and :math:`K` is of shape `(n_samples, n_samples)`.
2244
    This class allows computing :math:`\tilde{K}(X, Y)` such that:
2246
2247 .. math::
        \tilde{K}(X, Y) = \tilde{\phi}(X) . \tilde{\phi}(Y)^{T}
2249
2250 :math:`\tilde{\phi}(X)` is the centered mapped data in the Hilbert
2251 space.
2252
2253 `KernelCenterer` centers the features without explicitly computing the
    mapping :math:`\phi(\cdot)`. Working with centered kernels is sometimes
    expected when dealing with algebraic computations such as eigendecomposition
2256 for :class:`~sklearn.decomposition.KernelPCA` for instance.
2257
2258 Read more in the :ref:`User Guide <kernel_centering>`.
2259
2260 Attributes
2261 ----------
2262 K_fit_rows_ : ndarray of shape (n_samples,)
2263 Average of each column of kernel matrix.
2264
2265 K_fit_all_ : float
2266 Average of kernel matrix.
2267
2268 n_features_in_ : int
2269 Number of features seen during :term:`fit`.
2270
2271 .. versionadded:: 0.24
2272
2273 feature_names_in_ : ndarray of shape (`n_features_in_`,)
2274 Names of features seen during :term:`fit`. Defined only when `X`
2275 has feature names that are all strings.
2276
2277 .. versionadded:: 1.0
2278
2279 See Also
2280 --------
2281 sklearn.kernel_approximation.Nystroem : Approximate a kernel map
2282 using a subset of the training data.
2283
2284 References
2285 ----------
2286 .. [1] `Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller.
2287 "Nonlinear component analysis as a kernel eigenvalue problem."
2288 Neural computation 10.5 (1998): 1299-1319.
2289 <https://www.mlpack.org/papers/kpca.pdf>`_
2290
2291 Examples
2292 --------
2293 >>> from sklearn.preprocessing import KernelCenterer
2294 >>> from sklearn.metrics.pairwise import pairwise_kernels
2295 >>> X = [[ 1., -2., 2.],
2296 ... [ -2., 1., 3.],
2297 ... [ 4., 1., -2.]]
2298 >>> K = pairwise_kernels(X, metric='linear')
2299 >>> K
2300 array([[ 9., 2., -2.],
2301 [ 2., 14., -13.],
2302 [ -2., -13., 21.]])
2303 >>> transformer = KernelCenterer().fit(K)
2304 >>> transformer
2305 KernelCenterer()
2306 >>> transformer.transform(K)
2307 array([[ 5., 0., -5.],
2308 [ 0., 14., -14.],
2309 [ -5., -14., 19.]])
2310 """
2311
2312 def __init__(self):
2313 # Needed for backported inspect.signature compatibility with PyPy
2314 pass
2315
2316 def fit(self, K, y=None):
2317 """Fit KernelCenterer.
2318
2319 Parameters
2320 ----------
2321 K : ndarray of shape (n_samples, n_samples)
2322 Kernel matrix.
2323
2324 y : None
2325 Ignored.
2326
2327 Returns
2328 -------
2329 self : object
2330 Returns the instance itself.
2331 """
2332 xp, _ = get_namespace(K)
2333
2334 K = self._validate_data(K, dtype=_array_api.supported_float_dtypes(xp))
2335
2336 if K.shape[0] != K.shape[1]:
2337 raise ValueError(
2338 "Kernel matrix must be a square matrix."
2339 " Input is a {}x{} matrix.".format(K.shape[0], K.shape[1])
2340 )
2341
2342 n_samples = K.shape[0]
2343 self.K_fit_rows_ = xp.sum(K, axis=0) / n_samples
2344 self.K_fit_all_ = xp.sum(self.K_fit_rows_) / n_samples
2345 return self
2346
2347 def transform(self, K, copy=True):
2348 """Center kernel matrix.
2349
2350 Parameters
2351 ----------
2352 K : ndarray of shape (n_samples1, n_samples2)
2353 Kernel matrix.
2354
2355 copy : bool, default=True
2356 Set to False to perform inplace computation.
2357
2358 Returns
2359 -------
2360 K_new : ndarray of shape (n_samples1, n_samples2)
            Centered kernel matrix.
2362 """
2363 check_is_fitted(self)
2364
2365 xp, _ = get_namespace(K)
2366
2367 K = self._validate_data(
2368 K, copy=copy, dtype=_array_api.supported_float_dtypes(xp), reset=False
2369 )
2370
2371 K_pred_cols = (xp.sum(K, axis=1) / self.K_fit_rows_.shape[0])[:, None]
2372
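        # Double centering: subtract from every row the column means of the
        # training kernel (K_fit_rows_), subtract from every column the row
        # means of K (K_pred_cols), then add back the overall mean of the
        # training kernel (K_fit_all_).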
2373 K -= self.K_fit_rows_
2374 K -= K_pred_cols
2375 K += self.K_fit_all_
2376
2377 return K
2378
2379 @property
2380 def _n_features_out(self):
2381 """Number of transformed output features."""
2382 # Used by ClassNamePrefixFeaturesOutMixin. This model preserves the
2383 # number of input features but this is not a one-to-one mapping in the
2384 # usual sense. Hence the choice not to use OneToOneFeatureMixin to
2385 # implement get_feature_names_out for this class.
2386 return self.n_features_in_
2387
2388 def _more_tags(self):
2389 return {"pairwise": True, "array_api_support": True}
2390
2391
2392@validate_params(
2393 {
2394 "X": ["array-like", "sparse matrix"],
2395 "value": [Interval(Real, None, None, closed="neither")],
2396 },
2397 prefer_skip_nested_validation=True,
2398)
2399def add_dummy_feature(X, value=1.0):
2400 """Augment dataset with an additional dummy feature.
2401
2402 This is useful for fitting an intercept term with implementations which
2403 cannot otherwise fit it directly.
2404
2405 Parameters
2406 ----------
2407 X : {array-like, sparse matrix} of shape (n_samples, n_features)
2408 Data.
2409
    value : float, default=1.0
2411 Value to use for the dummy feature.
2412
2413 Returns
2414 -------
2415 X : {ndarray, sparse matrix} of shape (n_samples, n_features + 1)
2416 Same data with dummy feature added as first column.
2417
2418 Examples
2419 --------
2420 >>> from sklearn.preprocessing import add_dummy_feature
2421 >>> add_dummy_feature([[0, 1], [1, 0]])
2422 array([[1., 0., 1.],
2423 [1., 1., 0.]])
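
    For sparse input the dummy column is prepended without densifying the
    matrix; a minimal sketch (converted to a dense array only for display):

    >>> from scipy import sparse
    >>> add_dummy_feature(sparse.csr_matrix([[0, 1], [1, 0]])).toarray()
    array([[1., 0., 1.],
           [1., 1., 0.]])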
2424 """
2425 X = check_array(X, accept_sparse=["csc", "csr", "coo"], dtype=FLOAT_DTYPES)
2426 n_samples, n_features = X.shape
2427 shape = (n_samples, n_features + 1)
2428 if sparse.issparse(X):
2429 if X.format == "coo":
2430 # Shift columns to the right.
2431 col = X.col + 1
2432 # Column indices of dummy feature are 0 everywhere.
2433 col = np.concatenate((np.zeros(n_samples), col))
2434 # Row indices of dummy feature are 0, ..., n_samples-1.
2435 row = np.concatenate((np.arange(n_samples), X.row))
2436 # Prepend the dummy feature n_samples times.
2437 data = np.concatenate((np.full(n_samples, value), X.data))
2438 return sparse.coo_matrix((data, (row, col)), shape)
2439 elif X.format == "csc":
2440 # Shift index pointers since we need to add n_samples elements.
2441 indptr = X.indptr + n_samples
2442 # indptr[0] must be 0.
2443 indptr = np.concatenate((np.array([0]), indptr))
2444 # Row indices of dummy feature are 0, ..., n_samples-1.
2445 indices = np.concatenate((np.arange(n_samples), X.indices))
2446 # Prepend the dummy feature n_samples times.
2447 data = np.concatenate((np.full(n_samples, value), X.data))
2448 return sparse.csc_matrix((data, indices, indptr), shape)
2449 else:
2450 klass = X.__class__
2451 return klass(add_dummy_feature(X.tocoo(), value))
2452 else:
2453 return np.hstack((np.full((n_samples, 1), value), X))
2454
2455
2456class QuantileTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
2457 """Transform features using quantiles information.
2458
2459 This method transforms the features to follow a uniform or a normal
2460 distribution. Therefore, for a given feature, this transformation tends
2461 to spread out the most frequent values. It also reduces the impact of
2462 (marginal) outliers: this is therefore a robust preprocessing scheme.
2463
2464 The transformation is applied on each feature independently. First an
2465 estimate of the cumulative distribution function of a feature is
2466 used to map the original values to a uniform distribution. The obtained
2467 values are then mapped to the desired output distribution using the
    associated quantile function. Feature values of new/unseen data that fall
2469 below or above the fitted range will be mapped to the bounds of the output
2470 distribution. Note that this transform is non-linear. It may distort linear
2471 correlations between variables measured at the same scale but renders
2472 variables measured at different scales more directly comparable.
2473
2474 For example visualizations, refer to :ref:`Compare QuantileTransformer with
2475 other scalers <plot_all_scaling_quantile_transformer_section>`.
2476
2477 Read more in the :ref:`User Guide <preprocessing_transformer>`.
2478
2479 .. versionadded:: 0.19
2480
2481 Parameters
2482 ----------
2483 n_quantiles : int, default=1000 or n_samples
2484 Number of quantiles to be computed. It corresponds to the number
2485 of landmarks used to discretize the cumulative distribution function.
2486 If n_quantiles is larger than the number of samples, n_quantiles is set
2487 to the number of samples as a larger number of quantiles does not give
2488 a better approximation of the cumulative distribution function
2489 estimator.
2490
2491 output_distribution : {'uniform', 'normal'}, default='uniform'
2492 Marginal distribution for the transformed data. The choices are
2493 'uniform' (default) or 'normal'.
2494
2495 ignore_implicit_zeros : bool, default=False
2496 Only applies to sparse matrices. If True, the sparse entries of the
2497 matrix are discarded to compute the quantile statistics. If False,
2498 these entries are treated as zeros.
2499
2500 subsample : int, default=10_000
2501 Maximum number of samples used to estimate the quantiles for
2502 computational efficiency. Note that the subsampling procedure may
2503 differ for value-identical sparse and dense matrices.
2504
2505 random_state : int, RandomState instance or None, default=None
2506 Determines random number generation for subsampling and smoothing
2507 noise.
2508 Please see ``subsample`` for more details.
2509 Pass an int for reproducible results across multiple function calls.
2510 See :term:`Glossary <random_state>`.
2511
2512 copy : bool, default=True
2513 Set to False to perform inplace transformation and avoid a copy (if the
2514 input is already a numpy array).
2515
2516 Attributes
2517 ----------
2518 n_quantiles_ : int
2519 The actual number of quantiles used to discretize the cumulative
2520 distribution function.
2521
2522 quantiles_ : ndarray of shape (n_quantiles, n_features)
        The values corresponding to the quantiles of reference.
2524
2525 references_ : ndarray of shape (n_quantiles, )
2526 Quantiles of references.
2527
2528 n_features_in_ : int
2529 Number of features seen during :term:`fit`.
2530
2531 .. versionadded:: 0.24
2532
2533 feature_names_in_ : ndarray of shape (`n_features_in_`,)
2534 Names of features seen during :term:`fit`. Defined only when `X`
2535 has feature names that are all strings.
2536
2537 .. versionadded:: 1.0
2538
2539 See Also
2540 --------
2541 quantile_transform : Equivalent function without the estimator API.
2542 PowerTransformer : Perform mapping to a normal distribution using a power
2543 transform.
2544 StandardScaler : Perform standardization that is faster, but less robust
2545 to outliers.
2546 RobustScaler : Perform robust standardization that removes the influence
2547 of outliers but does not put outliers and inliers on the same scale.
2548
2549 Notes
2550 -----
2551 NaNs are treated as missing values: disregarded in fit, and maintained in
2552 transform.
2553
2554 Examples
2555 --------
2556 >>> import numpy as np
2557 >>> from sklearn.preprocessing import QuantileTransformer
2558 >>> rng = np.random.RandomState(0)
2559 >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0)
2560 >>> qt = QuantileTransformer(n_quantiles=10, random_state=0)
2561 >>> qt.fit_transform(X)
2562 array([...])
2563 """
2564
2565 _parameter_constraints: dict = {
2566 "n_quantiles": [Interval(Integral, 1, None, closed="left")],
2567 "output_distribution": [StrOptions({"uniform", "normal"})],
2568 "ignore_implicit_zeros": ["boolean"],
2569 "subsample": [Interval(Integral, 1, None, closed="left")],
2570 "random_state": ["random_state"],
2571 "copy": ["boolean"],
2572 }
2573
2574 def __init__(
2575 self,
2576 *,
2577 n_quantiles=1000,
2578 output_distribution="uniform",
2579 ignore_implicit_zeros=False,
2580 subsample=10_000,
2581 random_state=None,
2582 copy=True,
2583 ):
2584 self.n_quantiles = n_quantiles
2585 self.output_distribution = output_distribution
2586 self.ignore_implicit_zeros = ignore_implicit_zeros
2587 self.subsample = subsample
2588 self.random_state = random_state
2589 self.copy = copy
2590
2591 def _dense_fit(self, X, random_state):
2592 """Compute percentiles for dense matrices.
2593
2594 Parameters
2595 ----------
2596 X : ndarray of shape (n_samples, n_features)
2597 The data used to scale along the features axis.
2598 """
2599 if self.ignore_implicit_zeros:
2600 warnings.warn(
2601 "'ignore_implicit_zeros' takes effect only with"
2602 " sparse matrix. This parameter has no effect."
2603 )
2604
2605 n_samples, n_features = X.shape
2606 references = self.references_ * 100
2607
2608 self.quantiles_ = []
2609 for col in X.T:
2610 if self.subsample < n_samples:
2611 subsample_idx = random_state.choice(
2612 n_samples, size=self.subsample, replace=False
2613 )
2614 col = col.take(subsample_idx, mode="clip")
2615 self.quantiles_.append(np.nanpercentile(col, references))
2616 self.quantiles_ = np.transpose(self.quantiles_)
2617 # Due to floating-point precision error in `np.nanpercentile`,
2618 # make sure that quantiles are monotonically increasing.
2619 # Upstream issue in numpy:
2620 # https://github.com/numpy/numpy/issues/14685
2621 self.quantiles_ = np.maximum.accumulate(self.quantiles_)
2622
2623 def _sparse_fit(self, X, random_state):
2624 """Compute percentiles for sparse matrices.
2625
2626 Parameters
2627 ----------
2628 X : sparse matrix of shape (n_samples, n_features)
2629 The data used to scale along the features axis. The sparse matrix
2630 needs to be nonnegative. If a sparse matrix is provided,
2631 it will be converted into a sparse ``csc_matrix``.
2632 """
2633 n_samples, n_features = X.shape
2634 references = self.references_ * 100
2635
2636 self.quantiles_ = []
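        # For each column, percentiles are computed on the explicitly stored
        # values, padded with zeros so that implicit zeros also contribute,
        # unless ignore_implicit_zeros is True.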
2637 for feature_idx in range(n_features):
2638 column_nnz_data = X.data[X.indptr[feature_idx] : X.indptr[feature_idx + 1]]
2639 if len(column_nnz_data) > self.subsample:
2640 column_subsample = self.subsample * len(column_nnz_data) // n_samples
2641 if self.ignore_implicit_zeros:
2642 column_data = np.zeros(shape=column_subsample, dtype=X.dtype)
2643 else:
2644 column_data = np.zeros(shape=self.subsample, dtype=X.dtype)
2645 column_data[:column_subsample] = random_state.choice(
2646 column_nnz_data, size=column_subsample, replace=False
2647 )
2648 else:
2649 if self.ignore_implicit_zeros:
2650 column_data = np.zeros(shape=len(column_nnz_data), dtype=X.dtype)
2651 else:
2652 column_data = np.zeros(shape=n_samples, dtype=X.dtype)
2653 column_data[: len(column_nnz_data)] = column_nnz_data
2654
2655 if not column_data.size:
2656 # if no nnz, an error will be raised for computing the
2657 # quantiles. Force the quantiles to be zeros.
2658 self.quantiles_.append([0] * len(references))
2659 else:
2660 self.quantiles_.append(np.nanpercentile(column_data, references))
2661 self.quantiles_ = np.transpose(self.quantiles_)
2662 # due to floating-point precision error in `np.nanpercentile`,
2663 # make sure the quantiles are monotonically increasing
2664 # Upstream issue in numpy:
2665 # https://github.com/numpy/numpy/issues/14685
2666 self.quantiles_ = np.maximum.accumulate(self.quantiles_)
2667
2668 @_fit_context(prefer_skip_nested_validation=True)
2669 def fit(self, X, y=None):
2670 """Compute the quantiles used for transforming.
2671
2672 Parameters
2673 ----------
2674 X : {array-like, sparse matrix} of shape (n_samples, n_features)
2675 The data used to scale along the features axis. If a sparse
2676 matrix is provided, it will be converted into a sparse
2677 ``csc_matrix``. Additionally, the sparse matrix needs to be
2678 nonnegative if `ignore_implicit_zeros` is False.
2679
2680 y : None
2681 Ignored.
2682
2683 Returns
2684 -------
2685 self : object
2686 Fitted transformer.
2687 """
2688 if self.n_quantiles > self.subsample:
2689 raise ValueError(
2690 "The number of quantiles cannot be greater than"
2691 " the number of samples used. Got {} quantiles"
2692 " and {} samples.".format(self.n_quantiles, self.subsample)
2693 )
2694
2695 X = self._check_inputs(X, in_fit=True, copy=False)
2696 n_samples = X.shape[0]
2697
2698 if self.n_quantiles > n_samples:
2699 warnings.warn(
2700 "n_quantiles (%s) is greater than the total number "
2701 "of samples (%s). n_quantiles is set to "
2702 "n_samples." % (self.n_quantiles, n_samples)
2703 )
2704 self.n_quantiles_ = max(1, min(self.n_quantiles, n_samples))
2705
2706 rng = check_random_state(self.random_state)
2707
2708 # Create the quantiles of reference
2709 self.references_ = np.linspace(0, 1, self.n_quantiles_, endpoint=True)
2710 if sparse.issparse(X):
2711 self._sparse_fit(X, rng)
2712 else:
2713 self._dense_fit(X, rng)
2714
2715 return self
2716
2717 def _transform_col(self, X_col, quantiles, inverse):
2718 """Private function to transform a single feature."""
2719
2720 output_distribution = self.output_distribution
2721
2722 if not inverse:
2723 lower_bound_x = quantiles[0]
2724 upper_bound_x = quantiles[-1]
2725 lower_bound_y = 0
2726 upper_bound_y = 1
2727 else:
2728 lower_bound_x = 0
2729 upper_bound_x = 1
2730 lower_bound_y = quantiles[0]
2731 upper_bound_y = quantiles[-1]
2732 # for inverse transform, match a uniform distribution
2733 with np.errstate(invalid="ignore"): # hide NaN comparison warnings
2734 if output_distribution == "normal":
2735 X_col = stats.norm.cdf(X_col)
2736 # else output distribution is already a uniform distribution
2737
2738 # find index for lower and higher bounds
2739 with np.errstate(invalid="ignore"): # hide NaN comparison warnings
2740 if output_distribution == "normal":
2741 lower_bounds_idx = X_col - BOUNDS_THRESHOLD < lower_bound_x
2742 upper_bounds_idx = X_col + BOUNDS_THRESHOLD > upper_bound_x
2743 if output_distribution == "uniform":
2744 lower_bounds_idx = X_col == lower_bound_x
2745 upper_bounds_idx = X_col == upper_bound_x
2746
2747 isfinite_mask = ~np.isnan(X_col)
2748 X_col_finite = X_col[isfinite_mask]
2749 if not inverse:
2750 # Interpolate in one direction and in the other and take the
2751 # mean. This is in case of repeated values in the features
2752 # and hence repeated quantiles
2753 #
2754 # If we don't do this, only one extreme of the duplicated is
2755 # used (the upper when we do ascending, and the
2756 # lower for descending). We take the mean of these two
2757 X_col[isfinite_mask] = 0.5 * (
2758 np.interp(X_col_finite, quantiles, self.references_)
2759 - np.interp(-X_col_finite, -quantiles[::-1], -self.references_[::-1])
2760 )
2761 else:
2762 X_col[isfinite_mask] = np.interp(X_col_finite, self.references_, quantiles)
2763
2764 X_col[upper_bounds_idx] = upper_bound_y
2765 X_col[lower_bounds_idx] = lower_bound_y
2766 # for forward transform, match the output distribution
2767 if not inverse:
2768 with np.errstate(invalid="ignore"): # hide NaN comparison warnings
2769 if output_distribution == "normal":
2770 X_col = stats.norm.ppf(X_col)
2771 # find the value to clip the data to avoid mapping to
2772 # infinity. Clip such that the inverse transform will be
2773 # consistent
2774 clip_min = stats.norm.ppf(BOUNDS_THRESHOLD - np.spacing(1))
2775 clip_max = stats.norm.ppf(1 - (BOUNDS_THRESHOLD - np.spacing(1)))
2776 X_col = np.clip(X_col, clip_min, clip_max)
2777 # else output distribution is uniform and the ppf is the
2778 # identity function so we let X_col unchanged
2779
2780 return X_col
2781
2782 def _check_inputs(self, X, in_fit, accept_sparse_negative=False, copy=False):
2783 """Check inputs before fit and transform."""
2784 X = self._validate_data(
2785 X,
2786 reset=in_fit,
2787 accept_sparse="csc",
2788 copy=copy,
2789 dtype=FLOAT_DTYPES,
2790 force_all_finite="allow-nan",
2791 )
        # We only accept non-negative sparse matrices when ignore_implicit_zeros
        # is False and we are calling fit or transform.
2794 with np.errstate(invalid="ignore"): # hide NaN comparison warnings
2795 if (
2796 not accept_sparse_negative
2797 and not self.ignore_implicit_zeros
2798 and (sparse.issparse(X) and np.any(X.data < 0))
2799 ):
2800 raise ValueError(
2801 "QuantileTransformer only accepts non-negative sparse matrices."
2802 )
2803
2804 return X
2805
2806 def _transform(self, X, inverse=False):
2807 """Forward and inverse transform.
2808
2809 Parameters
2810 ----------
2811 X : ndarray of shape (n_samples, n_features)
2812 The data used to scale along the features axis.
2813
2814 inverse : bool, default=False
2815 If False, apply forward transform. If True, apply
2816 inverse transform.
2817
2818 Returns
2819 -------
2820 X : ndarray of shape (n_samples, n_features)
2821 Projected data.
2822 """
2823 if sparse.issparse(X):
2824 for feature_idx in range(X.shape[1]):
2825 column_slice = slice(X.indptr[feature_idx], X.indptr[feature_idx + 1])
2826 X.data[column_slice] = self._transform_col(
2827 X.data[column_slice], self.quantiles_[:, feature_idx], inverse
2828 )
2829 else:
2830 for feature_idx in range(X.shape[1]):
2831 X[:, feature_idx] = self._transform_col(
2832 X[:, feature_idx], self.quantiles_[:, feature_idx], inverse
2833 )
2834
2835 return X
2836
2837 def transform(self, X):
2838 """Feature-wise transformation of the data.
2839
2840 Parameters
2841 ----------
2842 X : {array-like, sparse matrix} of shape (n_samples, n_features)
2843 The data used to scale along the features axis. If a sparse
2844 matrix is provided, it will be converted into a sparse
2845 ``csc_matrix``. Additionally, the sparse matrix needs to be
2846 nonnegative if `ignore_implicit_zeros` is False.
2847
2848 Returns
2849 -------
2850 Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)
2851 The projected data.
2852 """
2853 check_is_fitted(self)
2854 X = self._check_inputs(X, in_fit=False, copy=self.copy)
2855
2856 return self._transform(X, inverse=False)
2857
2858 def inverse_transform(self, X):
2859 """Back-projection to the original space.
2860
2861 Parameters
2862 ----------
2863 X : {array-like, sparse matrix} of shape (n_samples, n_features)
2864 The data used to scale along the features axis. If a sparse
2865 matrix is provided, it will be converted into a sparse
2866 ``csc_matrix``. Additionally, the sparse matrix needs to be
2867 nonnegative if `ignore_implicit_zeros` is False.
2868
2869 Returns
2870 -------
        Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)
2872 The projected data.
2873 """
2874 check_is_fitted(self)
2875 X = self._check_inputs(
2876 X, in_fit=False, accept_sparse_negative=True, copy=self.copy
2877 )
2878
2879 return self._transform(X, inverse=True)
2880
2881 def _more_tags(self):
2882 return {"allow_nan": True}
2883
2884
2885@validate_params(
2886 {"X": ["array-like", "sparse matrix"], "axis": [Options(Integral, {0, 1})]},
2887 prefer_skip_nested_validation=False,
2888)
2889def quantile_transform(
2890 X,
2891 *,
2892 axis=0,
2893 n_quantiles=1000,
2894 output_distribution="uniform",
2895 ignore_implicit_zeros=False,
2896 subsample=int(1e5),
2897 random_state=None,
2898 copy=True,
2899):
2900 """Transform features using quantiles information.
2901
2902 This method transforms the features to follow a uniform or a normal
2903 distribution. Therefore, for a given feature, this transformation tends
2904 to spread out the most frequent values. It also reduces the impact of
2905 (marginal) outliers: this is therefore a robust preprocessing scheme.
2906
2907 The transformation is applied on each feature independently. First an
2908 estimate of the cumulative distribution function of a feature is
2909 used to map the original values to a uniform distribution. The obtained
2910 values are then mapped to the desired output distribution using the
    associated quantile function. Feature values of new/unseen data that fall
2912 below or above the fitted range will be mapped to the bounds of the output
2913 distribution. Note that this transform is non-linear. It may distort linear
2914 correlations between variables measured at the same scale but renders
2915 variables measured at different scales more directly comparable.
2916
2917 Read more in the :ref:`User Guide <preprocessing_transformer>`.
2918
2919 Parameters
2920 ----------
2921 X : {array-like, sparse matrix} of shape (n_samples, n_features)
2922 The data to transform.
2923
2924 axis : int, default=0
2925 Axis used to compute the means and standard deviations along. If 0,
2926 transform each feature, otherwise (if 1) transform each sample.
2927
2928 n_quantiles : int, default=1000 or n_samples
2929 Number of quantiles to be computed. It corresponds to the number
2930 of landmarks used to discretize the cumulative distribution function.
2931 If n_quantiles is larger than the number of samples, n_quantiles is set
2932 to the number of samples as a larger number of quantiles does not give
2933 a better approximation of the cumulative distribution function
2934 estimator.
2935
2936 output_distribution : {'uniform', 'normal'}, default='uniform'
2937 Marginal distribution for the transformed data. The choices are
2938 'uniform' (default) or 'normal'.
2939
2940 ignore_implicit_zeros : bool, default=False
2941 Only applies to sparse matrices. If True, the sparse entries of the
2942 matrix are discarded to compute the quantile statistics. If False,
2943 these entries are treated as zeros.
2944
2945 subsample : int, default=1e5
2946 Maximum number of samples used to estimate the quantiles for
2947 computational efficiency. Note that the subsampling procedure may
2948 differ for value-identical sparse and dense matrices.
2949
2950 random_state : int, RandomState instance or None, default=None
2951 Determines random number generation for subsampling and smoothing
2952 noise.
2953 Please see ``subsample`` for more details.
2954 Pass an int for reproducible results across multiple function calls.
2955 See :term:`Glossary <random_state>`.
2956
2957 copy : bool, default=True
2958 If False, try to avoid a copy and transform in place.
2959 This is not guaranteed to always work in place; e.g. if the data is
2960 a numpy array with an int dtype, a copy will be returned even with
2961 copy=False.
2962
2963 .. versionchanged:: 0.23
2964 The default value of `copy` changed from False to True in 0.23.
2965
2966 Returns
2967 -------
2968 Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)
2969 The transformed data.
2970
2971 See Also
2972 --------
2973 QuantileTransformer : Performs quantile-based scaling using the
2974 Transformer API (e.g. as part of a preprocessing
2975 :class:`~sklearn.pipeline.Pipeline`).
2976 power_transform : Maps data to a normal distribution using a
2977 power transformation.
2978 scale : Performs standardization that is faster, but less robust
2979 to outliers.
2980 robust_scale : Performs robust standardization that removes the influence
2981 of outliers but does not put outliers and inliers on the same scale.
2982
2983 Notes
2984 -----
2985 NaNs are treated as missing values: disregarded in fit, and maintained in
2986 transform.
2987
2988 .. warning:: Risk of data leak
2989
2990 Do not use :func:`~sklearn.preprocessing.quantile_transform` unless
2991 you know what you are doing. A common mistake is to apply it
2992 to the entire data *before* splitting into training and
2993 test sets. This will bias the model evaluation because
2994 information would have leaked from the test set to the
2995 training set.
2996 In general, we recommend using
2997 :class:`~sklearn.preprocessing.QuantileTransformer` within a
2998 :ref:`Pipeline <pipeline>` in order to prevent most risks of data
        leaking: `pipe = make_pipeline(QuantileTransformer(),
3000 LogisticRegression())`.
3001
3002 For a comparison of the different scalers, transformers, and normalizers,
3003 see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
3004
3005 Examples
3006 --------
3007 >>> import numpy as np
3008 >>> from sklearn.preprocessing import quantile_transform
3009 >>> rng = np.random.RandomState(0)
3010 >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0)
3011 >>> quantile_transform(X, n_quantiles=10, random_state=0, copy=True)
3012 array([...])
3013 """
3014 n = QuantileTransformer(
3015 n_quantiles=n_quantiles,
3016 output_distribution=output_distribution,
3017 subsample=subsample,
3018 ignore_implicit_zeros=ignore_implicit_zeros,
3019 random_state=random_state,
3020 copy=copy,
3021 )
3022 if axis == 0:
3023 X = n.fit_transform(X)
3024 else: # axis == 1
3025 X = n.fit_transform(X.T).T
3026 return X
3027
3028
3029class PowerTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
3030 """Apply a power transform featurewise to make data more Gaussian-like.
3031
3032 Power transforms are a family of parametric, monotonic transformations
3033 that are applied to make data more Gaussian-like. This is useful for
3034 modeling issues related to heteroscedasticity (non-constant variance),
3035 or other situations where normality is desired.
3036
3037 Currently, PowerTransformer supports the Box-Cox transform and the
3038 Yeo-Johnson transform. The optimal parameter for stabilizing variance and
3039 minimizing skewness is estimated through maximum likelihood.
3040
3041 Box-Cox requires input data to be strictly positive, while Yeo-Johnson
    supports both positive and negative data.
3043
3044 By default, zero-mean, unit-variance normalization is applied to the
3045 transformed data.
3046
3047 For an example visualization, refer to :ref:`Compare PowerTransformer with
3048 other scalers <plot_all_scaling_power_transformer_section>`. To see the
3049 effect of Box-Cox and Yeo-Johnson transformations on different
3050 distributions, see:
3051 :ref:`sphx_glr_auto_examples_preprocessing_plot_map_data_to_normal.py`.
3052
3053 Read more in the :ref:`User Guide <preprocessing_transformer>`.
3054
3055 .. versionadded:: 0.20
3056
3057 Parameters
3058 ----------
3059 method : {'yeo-johnson', 'box-cox'}, default='yeo-johnson'
3060 The power transform method. Available methods are:
3061
3062 - 'yeo-johnson' [1]_, works with positive and negative values
3063 - 'box-cox' [2]_, only works with strictly positive values
3064
3065 standardize : bool, default=True
3066 Set to True to apply zero-mean, unit-variance normalization to the
3067 transformed output.
3068
3069 copy : bool, default=True
3070 Set to False to perform inplace computation during transformation.
3071
3072 Attributes
3073 ----------
3074 lambdas_ : ndarray of float of shape (n_features,)
3075 The parameters of the power transformation for the selected features.
3076
3077 n_features_in_ : int
3078 Number of features seen during :term:`fit`.
3079
3080 .. versionadded:: 0.24
3081
3082 feature_names_in_ : ndarray of shape (`n_features_in_`,)
3083 Names of features seen during :term:`fit`. Defined only when `X`
3084 has feature names that are all strings.
3085
3086 .. versionadded:: 1.0
3087
3088 See Also
3089 --------
3090 power_transform : Equivalent function without the estimator API.
3091
3092 QuantileTransformer : Maps data to a standard normal distribution with
3093 the parameter `output_distribution='normal'`.
3094
3095 Notes
3096 -----
3097 NaNs are treated as missing values: disregarded in ``fit``, and maintained
3098 in ``transform``.
3099
3100 References
3101 ----------
3102
3103 .. [1] :doi:`I.K. Yeo and R.A. Johnson, "A new family of power
3104 transformations to improve normality or symmetry." Biometrika,
3105 87(4), pp.954-959, (2000). <10.1093/biomet/87.4.954>`
3106
3107 .. [2] :doi:`G.E.P. Box and D.R. Cox, "An Analysis of Transformations",
3108 Journal of the Royal Statistical Society B, 26, 211-252 (1964).
3109 <10.1111/j.2517-6161.1964.tb00553.x>`
3110
3111 Examples
3112 --------
3113 >>> import numpy as np
3114 >>> from sklearn.preprocessing import PowerTransformer
3115 >>> pt = PowerTransformer()
3116 >>> data = [[1, 2], [3, 2], [4, 5]]
3117 >>> print(pt.fit(data))
3118 PowerTransformer()
3119 >>> print(pt.lambdas_)
3120 [ 1.386... -3.100...]
3121 >>> print(pt.transform(data))
3122 [[-1.316... -0.707...]
3123 [ 0.209... -0.707...]
3124 [ 1.106... 1.414...]]
3125 """
3126
3127 _parameter_constraints: dict = {
3128 "method": [StrOptions({"yeo-johnson", "box-cox"})],
3129 "standardize": ["boolean"],
3130 "copy": ["boolean"],
3131 }
3132
3133 def __init__(self, method="yeo-johnson", *, standardize=True, copy=True):
3134 self.method = method
3135 self.standardize = standardize
3136 self.copy = copy
3137
3138 @_fit_context(prefer_skip_nested_validation=True)
3139 def fit(self, X, y=None):
3140 """Estimate the optimal parameter lambda for each feature.
3141
3142 The optimal lambda parameter for minimizing skewness is estimated on
3143 each feature independently using maximum likelihood.
3144
3145 Parameters
3146 ----------
3147 X : array-like of shape (n_samples, n_features)
3148 The data used to estimate the optimal transformation parameters.
3149
3150 y : None
3151 Ignored.
3152
3153 Returns
3154 -------
3155 self : object
3156 Fitted transformer.
3157 """
3158 self._fit(X, y=y, force_transform=False)
3159 return self
3160
3161 @_fit_context(prefer_skip_nested_validation=True)
3162 def fit_transform(self, X, y=None):
3163 """Fit `PowerTransformer` to `X`, then transform `X`.
3164
3165 Parameters
3166 ----------
3167 X : array-like of shape (n_samples, n_features)
3168 The data used to estimate the optimal transformation parameters
3169 and to be transformed using a power transformation.
3170
3171 y : Ignored
3172 Not used, present for API consistency by convention.
3173
3174 Returns
3175 -------
3176 X_new : ndarray of shape (n_samples, n_features)
3177 Transformed data.
3178 """
3179 return self._fit(X, y, force_transform=True)
3180
3181 def _fit(self, X, y=None, force_transform=False):
3182 X = self._check_input(X, in_fit=True, check_positive=True)
3183
3184 if not self.copy and not force_transform: # if call from fit()
3185 X = X.copy() # force copy so that fit does not change X inplace
3186
3187 n_samples = X.shape[0]
3188 mean = np.mean(X, axis=0, dtype=np.float64)
3189 var = np.var(X, axis=0, dtype=np.float64)
3190
3191 optim_function = {
3192 "box-cox": self._box_cox_optimize,
3193 "yeo-johnson": self._yeo_johnson_optimize,
3194 }[self.method]
3195
3196 transform_function = {
3197 "box-cox": boxcox,
3198 "yeo-johnson": self._yeo_johnson_transform,
3199 }[self.method]
3200
3201 with np.errstate(invalid="ignore"): # hide NaN warnings
3202 self.lambdas_ = np.empty(X.shape[1], dtype=X.dtype)
3203 for i, col in enumerate(X.T):
3204 # For yeo-johnson, leave constant features unchanged
3205 # lambda=1 corresponds to the identity transformation
3206 is_constant_feature = _is_constant_feature(var[i], mean[i], n_samples)
3207 if self.method == "yeo-johnson" and is_constant_feature:
3208 self.lambdas_[i] = 1.0
3209 continue
3210
3211 self.lambdas_[i] = optim_function(col)
3212
3213 if self.standardize or force_transform:
3214 X[:, i] = transform_function(X[:, i], self.lambdas_[i])
3215
3216 if self.standardize:
3217 self._scaler = StandardScaler(copy=False).set_output(transform="default")
3218 if force_transform:
3219 X = self._scaler.fit_transform(X)
3220 else:
3221 self._scaler.fit(X)
3222
3223 return X
3224
3225 def transform(self, X):
3226 """Apply the power transform to each feature using the fitted lambdas.
3227
3228 Parameters
3229 ----------
3230 X : array-like of shape (n_samples, n_features)
3231 The data to be transformed using a power transformation.
3232
3233 Returns
3234 -------
3235 X_trans : ndarray of shape (n_samples, n_features)
3236 The transformed data.
3237 """
3238 check_is_fitted(self)
3239 X = self._check_input(X, in_fit=False, check_positive=True, check_shape=True)
3240
3241 transform_function = {
3242 "box-cox": boxcox,
3243 "yeo-johnson": self._yeo_johnson_transform,
3244 }[self.method]
3245 for i, lmbda in enumerate(self.lambdas_):
3246 with np.errstate(invalid="ignore"): # hide NaN warnings
3247 X[:, i] = transform_function(X[:, i], lmbda)
3248
3249 if self.standardize:
3250 X = self._scaler.transform(X)
3251
3252 return X
3253
3254 def inverse_transform(self, X):
3255 """Apply the inverse power transformation using the fitted lambdas.
3256
3257 The inverse of the Box-Cox transformation is given by::
3258
3259 if lambda_ == 0:
3260 X = exp(X_trans)
3261 else:
3262 X = (X_trans * lambda_ + 1) ** (1 / lambda_)
3263
3264 The inverse of the Yeo-Johnson transformation is given by::
3265
3266 if X >= 0 and lambda_ == 0:
3267 X = exp(X_trans) - 1
3268 elif X >= 0 and lambda_ != 0:
3269 X = (X_trans * lambda_ + 1) ** (1 / lambda_) - 1
3270 elif X < 0 and lambda_ != 2:
3271 X = 1 - (-(2 - lambda_) * X_trans + 1) ** (1 / (2 - lambda_))
3272 elif X < 0 and lambda_ == 2:
3273 X = 1 - exp(-X_trans)
3274
3275 Parameters
3276 ----------
3277 X : array-like of shape (n_samples, n_features)
3278 The transformed data.
3279
3280 Returns
3281 -------
3282 X : ndarray of shape (n_samples, n_features)
3283 The original data.
3284 """
3285 check_is_fitted(self)
3286 X = self._check_input(X, in_fit=False, check_shape=True)
3287
3288 if self.standardize:
3289 X = self._scaler.inverse_transform(X)
3290
3291 inv_fun = {
            "box-cox": self._box_cox_inverse_transform,
3293 "yeo-johnson": self._yeo_johnson_inverse_transform,
3294 }[self.method]
3295 for i, lmbda in enumerate(self.lambdas_):
3296 with np.errstate(invalid="ignore"): # hide NaN warnings
3297 X[:, i] = inv_fun(X[:, i], lmbda)
3298
3299 return X
3300
    def _box_cox_inverse_transform(self, x, lmbda):
3302 """Return inverse-transformed input x following Box-Cox inverse
3303 transform with parameter lambda.
3304 """
3305 if lmbda == 0:
3306 x_inv = np.exp(x)
3307 else:
3308 x_inv = (x * lmbda + 1) ** (1 / lmbda)
3309
3310 return x_inv
3311
3312 def _yeo_johnson_inverse_transform(self, x, lmbda):
3313 """Return inverse-transformed input x following Yeo-Johnson inverse
3314 transform with parameter lambda.
3315 """
3316 x_inv = np.zeros_like(x)
3317 pos = x >= 0
3318
3319 # when x >= 0
3320 if abs(lmbda) < np.spacing(1.0):
3321 x_inv[pos] = np.exp(x[pos]) - 1
3322 else: # lmbda != 0
3323 x_inv[pos] = np.power(x[pos] * lmbda + 1, 1 / lmbda) - 1
3324
3325 # when x < 0
3326 if abs(lmbda - 2) > np.spacing(1.0):
3327 x_inv[~pos] = 1 - np.power(-(2 - lmbda) * x[~pos] + 1, 1 / (2 - lmbda))
3328 else: # lmbda == 2
3329 x_inv[~pos] = 1 - np.exp(-x[~pos])
3330
3331 return x_inv
3332
3333 def _yeo_johnson_transform(self, x, lmbda):
3334 """Return transformed input x following Yeo-Johnson transform with
3335 parameter lambda.
3336 """
3337
3338 out = np.zeros_like(x)
3339 pos = x >= 0 # binary mask
3340
3341 # when x >= 0
3342 if abs(lmbda) < np.spacing(1.0):
3343 out[pos] = np.log1p(x[pos])
3344 else: # lmbda != 0
3345 out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda
3346
3347 # when x < 0
3348 if abs(lmbda - 2) > np.spacing(1.0):
3349 out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
3350 else: # lmbda == 2
3351 out[~pos] = -np.log1p(-x[~pos])
3352
3353 return out
3354
3355 def _box_cox_optimize(self, x):
3356 """Find and return optimal lambda parameter of the Box-Cox transform by
3357 MLE, for observed data x.
3358
        Here we use scipy builtins, which use the Brent optimizer.
3360 """
3361 mask = np.isnan(x)
3362 if np.all(mask):
3363 raise ValueError("Column must not be all nan.")
3364
3365 # the computation of lambda is influenced by NaNs so we need to
3366 # get rid of them
3367 _, lmbda = stats.boxcox(x[~mask], lmbda=None)
3368
3369 return lmbda
3370
3371 def _yeo_johnson_optimize(self, x):
3372 """Find and return optimal lambda parameter of the Yeo-Johnson
3373 transform by MLE, for observed data x.
3374
        As for Box-Cox, MLE is done via the Brent optimizer.
3376 """
3377 x_tiny = np.finfo(np.float64).tiny
3378
3379 def _neg_log_likelihood(lmbda):
3380 """Return the negative log likelihood of the observed data x as a
3381 function of lambda."""
3382 x_trans = self._yeo_johnson_transform(x, lmbda)
3383 n_samples = x.shape[0]
3384 x_trans_var = x_trans.var()
3385
3386 # Reject transformed data that would raise a RuntimeWarning in np.log
3387 if x_trans_var < x_tiny:
3388 return np.inf
3389
3390 log_var = np.log(x_trans_var)
3391 loglike = -n_samples / 2 * log_var
3392 loglike += (lmbda - 1) * (np.sign(x) * np.log1p(np.abs(x))).sum()
3393
3394 return -loglike
3395
3396 # the computation of lambda is influenced by NaNs so we need to
3397 # get rid of them
3398 x = x[~np.isnan(x)]
3399 # choosing bracket -2, 2 like for boxcox
3400 return optimize.brent(_neg_log_likelihood, brack=(-2, 2))
3401
3402 def _check_input(self, X, in_fit, check_positive=False, check_shape=False):
3403 """Validate the input before fit and transform.
3404
3405 Parameters
3406 ----------
3407 X : array-like of shape (n_samples, n_features)
3408
3409 in_fit : bool
3410 Whether or not `_check_input` is called from `fit` or other
3411 methods, e.g. `predict`, `transform`, etc.
3412
3413 check_positive : bool, default=False
3414 If True, check that all data is positive and non-zero (only if
3415 ``self.method=='box-cox'``).
3416
3417 check_shape : bool, default=False
            If True, check that n_features matches the length of `self.lambdas_`.
3419 """
3420 X = self._validate_data(
3421 X,
3422 ensure_2d=True,
3423 dtype=FLOAT_DTYPES,
3424 copy=self.copy,
3425 force_all_finite="allow-nan",
3426 reset=in_fit,
3427 )
3428
3429 with warnings.catch_warnings():
3430 warnings.filterwarnings("ignore", r"All-NaN (slice|axis) encountered")
3431 if check_positive and self.method == "box-cox" and np.nanmin(X) <= 0:
3432 raise ValueError(
3433 "The Box-Cox transformation can only be "
3434 "applied to strictly positive data"
3435 )
3436
3437 if check_shape and not X.shape[1] == len(self.lambdas_):
3438 raise ValueError(
3439 "Input data has a different number of features "
3440 "than fitting data. Should have {n}, data has {m}".format(
3441 n=len(self.lambdas_), m=X.shape[1]
3442 )
3443 )
3444
3445 return X
3446
3447 def _more_tags(self):
3448 return {"allow_nan": True}
3449
3450
3451@validate_params(
3452 {"X": ["array-like"]},
3453 prefer_skip_nested_validation=False,
3454)
3455def power_transform(X, method="yeo-johnson", *, standardize=True, copy=True):
3456 """Parametric, monotonic transformation to make data more Gaussian-like.
3457
3458 Power transforms are a family of parametric, monotonic transformations
3459 that are applied to make data more Gaussian-like. This is useful for
3460 modeling issues related to heteroscedasticity (non-constant variance),
3461 or other situations where normality is desired.
3462
3463 Currently, power_transform supports the Box-Cox transform and the
3464 Yeo-Johnson transform. The optimal parameter for stabilizing variance and
3465 minimizing skewness is estimated through maximum likelihood.
3466
3467 Box-Cox requires input data to be strictly positive, while Yeo-Johnson
    supports both positive and negative data.
3469
3470 By default, zero-mean, unit-variance normalization is applied to the
3471 transformed data.
3472
3473 Read more in the :ref:`User Guide <preprocessing_transformer>`.
3474
3475 Parameters
3476 ----------
3477 X : array-like of shape (n_samples, n_features)
3478 The data to be transformed using a power transformation.
3479
3480 method : {'yeo-johnson', 'box-cox'}, default='yeo-johnson'
3481 The power transform method. Available methods are:
3482
3483 - 'yeo-johnson' [1]_, works with positive and negative values
3484 - 'box-cox' [2]_, only works with strictly positive values
3485
3486 .. versionchanged:: 0.23
3487 The default value of the `method` parameter changed from
3488 'box-cox' to 'yeo-johnson' in 0.23.
3489
3490 standardize : bool, default=True
3491 Set to True to apply zero-mean, unit-variance normalization to the
3492 transformed output.
3493
3494 copy : bool, default=True
3495 If False, try to avoid a copy and transform in place.
3496 This is not guaranteed to always work in place; e.g. if the data is
3497 a numpy array with an int dtype, a copy will be returned even with
3498 copy=False.
3499
3500 Returns
3501 -------
3502 X_trans : ndarray of shape (n_samples, n_features)
3503 The transformed data.
3504
3505 See Also
3506 --------
3507 PowerTransformer : Equivalent transformation with the
3508 Transformer API (e.g. as part of a preprocessing
3509 :class:`~sklearn.pipeline.Pipeline`).
3510
3511 quantile_transform : Maps data to a standard normal distribution with
3512 the parameter `output_distribution='normal'`.
3513
3514 Notes
3515 -----
3516 NaNs are treated as missing values: disregarded in ``fit``, and maintained
3517 in ``transform``.
3518
3519 For a comparison of the different scalers, transformers, and normalizers,
3520 see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
3521
3522 References
3523 ----------
3524
3525 .. [1] I.K. Yeo and R.A. Johnson, "A new family of power transformations to
3526 improve normality or symmetry." Biometrika, 87(4), pp.954-959,
3527 (2000).
3528
3529 .. [2] G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal
3530 of the Royal Statistical Society B, 26, 211-252 (1964).
3531
3532 Examples
3533 --------
3534 >>> import numpy as np
3535 >>> from sklearn.preprocessing import power_transform
3536 >>> data = [[1, 2], [3, 2], [4, 5]]
3537 >>> print(power_transform(data, method='box-cox'))
3538 [[-1.332... -0.707...]
3539 [ 0.256... -0.707...]
3540 [ 1.076... 1.414...]]
3541
3542 .. warning:: Risk of data leak.
3543 Do not use :func:`~sklearn.preprocessing.power_transform` unless you
3544 know what you are doing. A common mistake is to apply it to the entire
3545 data *before* splitting into training and test sets. This will bias the
3546 model evaluation because information would have leaked from the test
3547 set to the training set.
3548 In general, we recommend using
3549 :class:`~sklearn.preprocessing.PowerTransformer` within a
3550 :ref:`Pipeline <pipeline>` in order to prevent most risks of data
3551 leaking, e.g.: `pipe = make_pipeline(PowerTransformer(),
3552 LogisticRegression())`.
3553 """
3554 pt = PowerTransformer(method=method, standardize=standardize, copy=copy)
3555 return pt.fit_transform(X)