Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scikit_learn-1.4.dev0-py3.8-linux-x86_64.egg/sklearn/preprocessing/_data.py: 17%

730 statements  

coverage.py v7.3.2, created at 2023-12-12 06:31 +0000

1# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr> 

2# Mathieu Blondel <mathieu@mblondel.org> 

3# Olivier Grisel <olivier.grisel@ensta.org> 

4# Andreas Mueller <amueller@ais.uni-bonn.de> 

5# Eric Martin <eric@ericmart.in> 

6# Giorgio Patrini <giorgio.patrini@anu.edu.au> 

7# Eric Chang <ericchang2017@u.northwestern.edu> 

8# License: BSD 3 clause 

9 

10 

11import warnings 

12from numbers import Integral, Real 

13 

14import numpy as np 

15from scipy import optimize, sparse, stats 

16from scipy.special import boxcox 

17 

18from ..base import ( 

19 BaseEstimator, 

20 ClassNamePrefixFeaturesOutMixin, 

21 OneToOneFeatureMixin, 

22 TransformerMixin, 

23 _fit_context, 

24) 

25from ..utils import _array_api, check_array 

26from ..utils._array_api import get_namespace 

27from ..utils._param_validation import Interval, Options, StrOptions, validate_params 

28from ..utils.extmath import _incremental_mean_and_var, row_norms 

29from ..utils.sparsefuncs import ( 

30 incr_mean_variance_axis, 

31 inplace_column_scale, 

32 mean_variance_axis, 

33 min_max_axis, 

34) 

35from ..utils.sparsefuncs_fast import ( 

36 inplace_csr_row_normalize_l1, 

37 inplace_csr_row_normalize_l2, 

38) 

39from ..utils.validation import ( 

40 FLOAT_DTYPES, 

41 _check_sample_weight, 

42 check_is_fitted, 

43 check_random_state, 

44) 

45from ._encoders import OneHotEncoder 

46 

47BOUNDS_THRESHOLD = 1e-7 

48 

49__all__ = [ 

50 "Binarizer", 

51 "KernelCenterer", 

52 "MinMaxScaler", 

53 "MaxAbsScaler", 

54 "Normalizer", 

55 "OneHotEncoder", 

56 "RobustScaler", 

57 "StandardScaler", 

58 "QuantileTransformer", 

59 "PowerTransformer", 

60 "add_dummy_feature", 

61 "binarize", 

62 "normalize", 

63 "scale", 

64 "robust_scale", 

65 "maxabs_scale", 

66 "minmax_scale", 

67 "quantile_transform", 

68 "power_transform", 

69] 

70 

71 

72def _is_constant_feature(var, mean, n_samples): 

73 """Detect if a feature is indistinguishable from a constant feature. 

74 

75 The detection is based on its computed variance and on the theoretical 

76 error bounds of the '2 pass algorithm' for variance computation. 

77 

78 See "Algorithms for computing the sample variance: analysis and 

79 recommendations", by Chan, Golub, and LeVeque. 

80 """ 

81 # In scikit-learn, variance is always computed using float64 accumulators. 

82 eps = np.finfo(np.float64).eps 

83 

84 upper_bound = n_samples * eps * var + (n_samples * mean * eps) ** 2 

85 return var <= upper_bound 

86 
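# Minimal illustrative sketch (hypothetical `_example_*` values, added for
# exposition): a feature whose values differ only at the level of floating-point
# rounding has a computed variance far below the bound above, so it is flagged
# as constant even though its variance is not exactly zero.
_example_x = 1.0 + np.arange(100) * 1e-16  # jitter on the order of machine eps
_example_var = np.var(_example_x)
_example_is_constant = _is_constant_feature(_example_var, mean=1.0, n_samples=100)
# expected: True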

87 

88def _handle_zeros_in_scale(scale, copy=True, constant_mask=None): 

89 """Set scales of near constant features to 1. 

90 

91 The goal is to avoid division by very small or zero values. 

92 

93 Near constant features are detected automatically by identifying 

94 scales close to machine precision unless they are precomputed by 

95 the caller and passed with the `constant_mask` kwarg. 

96 

97 Typically, for standard scaling the scales are the standard

98 deviations, while near-constant features are better detected on the

99 computed variances, which are closer to machine precision by

100 construction.

101 """ 

102 # if we are fitting on 1D arrays, scale might be a scalar 

103 if np.isscalar(scale): 

104 if scale == 0.0: 

105 scale = 1.0 

106 return scale 

107 # scale is an array 

108 else: 

109 xp, _ = get_namespace(scale) 

110 if constant_mask is None: 

111 # Detect near constant values to avoid dividing by a very small 

112 # value that could lead to surprising results and numerical 

113 # stability issues. 

114 constant_mask = scale < 10 * xp.finfo(scale.dtype).eps 

115 

116 if copy: 

117 # New array to avoid side-effects 

118 scale = xp.asarray(scale, copy=True) 

119 scale[constant_mask] = 1.0 

120 return scale 

121 
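# Minimal illustrative sketch (hypothetical `_example_*` values): zero or
# near-machine-precision entries of a scale vector are replaced by 1.0, so that
# dividing by the returned scale leaves the corresponding constant features
# unchanged.
_example_scale = _handle_zeros_in_scale(np.array([0.0, 1e-20, 2.0]))
# expected: array([1., 1., 2.])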

122 

123@validate_params( 

124 { 

125 "X": ["array-like", "sparse matrix"], 

126 "axis": [Options(Integral, {0, 1})], 

127 "with_mean": ["boolean"], 

128 "with_std": ["boolean"], 

129 "copy": ["boolean"], 

130 }, 

131 prefer_skip_nested_validation=True, 

132) 

133def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True): 

134 """Standardize a dataset along any axis. 

135 

136 Center to the mean and component-wise scale to unit variance.

137 

138 Read more in the :ref:`User Guide <preprocessing_scaler>`. 

139 

140 Parameters 

141 ---------- 

142 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

143 The data to center and scale. 

144 

145 axis : {0, 1}, default=0 

146 Axis used to compute the means and standard deviations along. If 0, 

147 independently standardize each feature, otherwise (if 1) standardize 

148 each sample. 

149 

150 with_mean : bool, default=True 

151 If True, center the data before scaling. 

152 

153 with_std : bool, default=True 

154 If True, scale the data to unit variance (or equivalently, 

155 unit standard deviation). 

156 

157 copy : bool, default=True 

158 If False, try to avoid a copy and scale in place. 

159 This is not guaranteed to always work in place; e.g. if the data is 

160 a numpy array with an int dtype, a copy will be returned even with 

161 copy=False. 

162 

163 Returns 

164 ------- 

165 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) 

166 The transformed data. 

167 

168 See Also 

169 -------- 

170 StandardScaler : Performs scaling to unit variance using the Transformer 

171 API (e.g. as part of a preprocessing 

172 :class:`~sklearn.pipeline.Pipeline`). 

173 

174 Notes 

175 ----- 

176 This implementation will refuse to center scipy.sparse matrices 

177 since it would make them non-sparse and would potentially crash the 

178 program with memory exhaustion problems. 

179 

180 Instead the caller is expected to either set explicitly 

181 `with_mean=False` (in that case, only variance scaling will be 

182 performed on the features of the CSC matrix) or to call `X.toarray()` 

183 if they expect the materialized dense array to fit in memory.

184 

185 To avoid memory copy the caller should pass a CSC matrix. 

186 

187 NaNs are treated as missing values: disregarded to compute the statistics, 

188 and maintained during the data transformation. 

189 

190 We use a biased estimator for the standard deviation, equivalent to 

191 `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to 

192 affect model performance. 

193 

194 For a comparison of the different scalers, transformers, and normalizers, 

195 see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. 

196 

197 .. warning:: Risk of data leak 

198 

199 Do not use :func:`~sklearn.preprocessing.scale` unless you know 

200 what you are doing. A common mistake is to apply it to the entire data 

201 *before* splitting into training and test sets. This will bias the 

202 model evaluation because information would have leaked from the test 

203 set to the training set. 

204 In general, we recommend using 

205 :class:`~sklearn.preprocessing.StandardScaler` within a 

206 :ref:`Pipeline <pipeline>` in order to prevent most risks of data 

207 leaking: `pipe = make_pipeline(StandardScaler(), LogisticRegression())`. 

208 """ # noqa 

209 X = check_array( 

210 X, 

211 accept_sparse="csc", 

212 copy=copy, 

213 ensure_2d=False, 

214 estimator="the scale function", 

215 dtype=FLOAT_DTYPES, 

216 force_all_finite="allow-nan", 

217 ) 

218 if sparse.issparse(X): 

219 if with_mean: 

220 raise ValueError( 

221 "Cannot center sparse matrices: pass `with_mean=False` instead" 

222 " See docstring for motivation and alternatives." 

223 ) 

224 if axis != 0: 

225 raise ValueError( 

226 "Can only scale sparse matrix on axis=0, got axis=%d" % axis 

227 ) 

228 if with_std: 

229 _, var = mean_variance_axis(X, axis=0) 

230 var = _handle_zeros_in_scale(var, copy=False) 

231 inplace_column_scale(X, 1 / np.sqrt(var)) 

232 else: 

233 X = np.asarray(X) 

234 if with_mean: 

235 mean_ = np.nanmean(X, axis) 

236 if with_std: 

237 scale_ = np.nanstd(X, axis) 

238 # Xr is a view on the original array that enables easy use of 

239 # broadcasting on the axis we are interested in

240 Xr = np.rollaxis(X, axis) 

241 if with_mean: 

242 Xr -= mean_ 

243 mean_1 = np.nanmean(Xr, axis=0) 

244 # Verify that mean_1 is 'close to zero'. If X contains very 

245 # large values, mean_1 can also be very large, due to a lack of 

246 # precision of mean_. In this case, a pre-scaling of the 

247 # concerned feature is efficient, for instance by its mean or 

248 # maximum. 

249 if not np.allclose(mean_1, 0): 

250 warnings.warn( 

251 "Numerical issues were encountered " 

252 "when centering the data " 

253 "and might not be solved. Dataset may " 

254 "contain too large values. You may need " 

255 "to prescale your features." 

256 ) 

257 Xr -= mean_1 

258 if with_std: 

259 scale_ = _handle_zeros_in_scale(scale_, copy=False) 

260 Xr /= scale_ 

261 if with_mean: 

262 mean_2 = np.nanmean(Xr, axis=0) 

263 # If mean_2 is not 'close to zero', it comes from the fact that 

264 # scale_ is very small so that mean_2 = mean_1/scale_ > 0, even 

265 # if mean_1 was close to zero. The problem is thus essentially 

266 # due to the lack of precision of mean_. A solution is then to 

267 # subtract the mean again: 

268 if not np.allclose(mean_2, 0): 

269 warnings.warn( 

270 "Numerical issues were encountered " 

271 "when scaling the data " 

272 "and might not be solved. The standard " 

273 "deviation of the data is probably " 

274 "very close to 0. " 

275 ) 

276 Xr -= mean_2 

277 return X 

278 
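# Minimal usage sketch (hypothetical `_example_*` values): `scale` standardizes
# each column to zero mean and unit variance, using the biased (ddof=0)
# standard deviation described in the docstring above.
_example_X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
_example_X_tr = scale(_example_X)
# each column of _example_X_tr now has mean ~0 and standard deviation ~1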

279 

280class MinMaxScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): 

281 """Transform features by scaling each feature to a given range. 

282 

283 This estimator scales and translates each feature individually such 

284 that it is in the given range on the training set, e.g. between 

285 zero and one. 

286 

287 The transformation is given by:: 

288 

289 X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) 

290 X_scaled = X_std * (max - min) + min 

291 

292 where min, max = feature_range. 

293 

294 This transformation is often used as an alternative to zero mean, 

295 unit variance scaling. 

296 

297 `MinMaxScaler` doesn't reduce the effect of outliers, but it linearly 

298 scales them down into a fixed range, where the largest occurring data point 

299 corresponds to the maximum value and the smallest one corresponds to the 

300 minimum value. For an example visualization, refer to :ref:`Compare 

301 MinMaxScaler with other scalers <plot_all_scaling_minmax_scaler_section>`. 

302 

303 Read more in the :ref:`User Guide <preprocessing_scaler>`. 

304 

305 Parameters 

306 ---------- 

307 feature_range : tuple (min, max), default=(0, 1) 

308 Desired range of transformed data. 

309 

310 copy : bool, default=True 

311 Set to False to perform inplace row normalization and avoid a 

312 copy (if the input is already a numpy array). 

313 

314 clip : bool, default=False 

315 Set to True to clip transformed values of held-out data to 

316 provided `feature_range`.

317 

318 .. versionadded:: 0.24 

319 

320 Attributes 

321 ---------- 

322 min_ : ndarray of shape (n_features,) 

323 Per feature adjustment for minimum. Equivalent to 

324 ``min - X.min(axis=0) * self.scale_`` 

325 

326 scale_ : ndarray of shape (n_features,) 

327 Per feature relative scaling of the data. Equivalent to 

328 ``(max - min) / (X.max(axis=0) - X.min(axis=0))`` 

329 

330 .. versionadded:: 0.17 

331 *scale_* attribute. 

332 

333 data_min_ : ndarray of shape (n_features,) 

334 Per feature minimum seen in the data 

335 

336 .. versionadded:: 0.17 

337 *data_min_* 

338 

339 data_max_ : ndarray of shape (n_features,) 

340 Per feature maximum seen in the data 

341 

342 .. versionadded:: 0.17 

343 *data_max_* 

344 

345 data_range_ : ndarray of shape (n_features,) 

346 Per feature range ``(data_max_ - data_min_)`` seen in the data 

347 

348 .. versionadded:: 0.17 

349 *data_range_* 

350 

351 n_features_in_ : int 

352 Number of features seen during :term:`fit`. 

353 

354 .. versionadded:: 0.24 

355 

356 n_samples_seen_ : int 

357 The number of samples processed by the estimator. 

358 It will be reset on new calls to fit, but increments across 

359 ``partial_fit`` calls. 

360 

361 feature_names_in_ : ndarray of shape (`n_features_in_`,) 

362 Names of features seen during :term:`fit`. Defined only when `X` 

363 has feature names that are all strings. 

364 

365 .. versionadded:: 1.0 

366 

367 See Also 

368 -------- 

369 minmax_scale : Equivalent function without the estimator API. 

370 

371 Notes 

372 ----- 

373 NaNs are treated as missing values: disregarded in fit, and maintained in 

374 transform. 

375 

376 Examples 

377 -------- 

378 >>> from sklearn.preprocessing import MinMaxScaler 

379 >>> data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]] 

380 >>> scaler = MinMaxScaler() 

381 >>> print(scaler.fit(data)) 

382 MinMaxScaler() 

383 >>> print(scaler.data_max_) 

384 [ 1. 18.] 

385 >>> print(scaler.transform(data)) 

386 [[0. 0. ] 

387 [0.25 0.25] 

388 [0.5 0.5 ] 

389 [1. 1. ]] 

390 >>> print(scaler.transform([[2, 2]])) 

391 [[1.5 0. ]] 

392 """ 

393 

394 _parameter_constraints: dict = { 

395 "feature_range": [tuple], 

396 "copy": ["boolean"], 

397 "clip": ["boolean"], 

398 } 

399 

400 def __init__(self, feature_range=(0, 1), *, copy=True, clip=False): 

401 self.feature_range = feature_range 

402 self.copy = copy 

403 self.clip = clip 

404 

405 def _reset(self): 

406 """Reset internal data-dependent state of the scaler, if necessary. 

407 

408 __init__ parameters are not touched. 

409 """ 

410 # Checking one attribute is enough, because they are all set together 

411 # in partial_fit 

412 if hasattr(self, "scale_"): 

413 del self.scale_ 

414 del self.min_ 

415 del self.n_samples_seen_ 

416 del self.data_min_ 

417 del self.data_max_ 

418 del self.data_range_ 

419 

420 def fit(self, X, y=None): 

421 """Compute the minimum and maximum to be used for later scaling. 

422 

423 Parameters 

424 ---------- 

425 X : array-like of shape (n_samples, n_features) 

426 The data used to compute the per-feature minimum and maximum 

427 used for later scaling along the features axis. 

428 

429 y : None 

430 Ignored. 

431 

432 Returns 

433 ------- 

434 self : object 

435 Fitted scaler. 

436 """ 

437 # Reset internal state before fitting 

438 self._reset() 

439 return self.partial_fit(X, y) 

440 

441 @_fit_context(prefer_skip_nested_validation=True) 

442 def partial_fit(self, X, y=None): 

443 """Online computation of min and max on X for later scaling. 

444 

445 All of X is processed as a single batch. This is intended for cases 

446 when :meth:`fit` is not feasible due to a very large number of

447 `n_samples` or because X is read from a continuous stream. 

448 

449 Parameters 

450 ---------- 

451 X : array-like of shape (n_samples, n_features) 

452 The data used to compute the per-feature minimum and maximum

453 used for later scaling along the features axis. 

454 

455 y : None 

456 Ignored. 

457 

458 Returns 

459 ------- 

460 self : object 

461 Fitted scaler. 

462 """ 

463 feature_range = self.feature_range 

464 if feature_range[0] >= feature_range[1]: 

465 raise ValueError( 

466 "Minimum of desired feature range must be smaller than maximum. Got %s." 

467 % str(feature_range) 

468 ) 

469 

470 if sparse.issparse(X): 

471 raise TypeError( 

472 "MinMaxScaler does not support sparse input. " 

473 "Consider using MaxAbsScaler instead." 

474 ) 

475 

476 xp, _ = get_namespace(X) 

477 

478 first_pass = not hasattr(self, "n_samples_seen_") 

479 X = self._validate_data( 

480 X, 

481 reset=first_pass, 

482 dtype=_array_api.supported_float_dtypes(xp), 

483 force_all_finite="allow-nan", 

484 ) 

485 

486 data_min = _array_api._nanmin(X, axis=0) 

487 data_max = _array_api._nanmax(X, axis=0) 

488 

489 if first_pass: 

490 self.n_samples_seen_ = X.shape[0] 

491 else: 

492 data_min = xp.minimum(self.data_min_, data_min) 

493 data_max = xp.maximum(self.data_max_, data_max) 

494 self.n_samples_seen_ += X.shape[0] 

495 

496 data_range = data_max - data_min 

497 self.scale_ = (feature_range[1] - feature_range[0]) / _handle_zeros_in_scale( 

498 data_range, copy=True 

499 ) 

500 self.min_ = feature_range[0] - data_min * self.scale_ 

501 self.data_min_ = data_min 

502 self.data_max_ = data_max 

503 self.data_range_ = data_range 

504 return self 

505 

506 def transform(self, X): 

507 """Scale features of X according to feature_range. 

508 

509 Parameters 

510 ---------- 

511 X : array-like of shape (n_samples, n_features) 

512 Input data that will be transformed. 

513 

514 Returns 

515 ------- 

516 Xt : ndarray of shape (n_samples, n_features) 

517 Transformed data. 

518 """ 

519 check_is_fitted(self) 

520 

521 xp, _ = get_namespace(X) 

522 

523 X = self._validate_data( 

524 X, 

525 copy=self.copy, 

526 dtype=_array_api.supported_float_dtypes(xp), 

527 force_all_finite="allow-nan", 

528 reset=False, 

529 ) 

530 

531 X *= self.scale_ 

532 X += self.min_ 

533 if self.clip: 

534 xp.clip(X, self.feature_range[0], self.feature_range[1], out=X) 

535 return X 

536 

537 def inverse_transform(self, X): 

538 """Undo the scaling of X according to feature_range. 

539 

540 Parameters 

541 ---------- 

542 X : array-like of shape (n_samples, n_features) 

543 Input data that will be transformed. It cannot be sparse. 

544 

545 Returns 

546 ------- 

547 Xt : ndarray of shape (n_samples, n_features) 

548 Transformed data. 

549 """ 

550 check_is_fitted(self) 

551 

552 xp, _ = get_namespace(X) 

553 

554 X = check_array( 

555 X, 

556 copy=self.copy, 

557 dtype=_array_api.supported_float_dtypes(xp), 

558 force_all_finite="allow-nan", 

559 ) 

560 

561 X -= self.min_ 

562 X /= self.scale_ 

563 return X 

564 

565 def _more_tags(self): 

566 return {"allow_nan": True} 

567 
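# Minimal usage sketch (hypothetical `_example_*` values): fitting in two
# `partial_fit` batches tracks the running per-feature minimum and maximum, so
# the result matches a single `fit` on the concatenated data.
_example_batch_1 = np.array([[-1.0, 2.0], [-0.5, 6.0]])
_example_batch_2 = np.array([[0.0, 10.0], [1.0, 18.0]])
_example_scaler = MinMaxScaler().partial_fit(_example_batch_1).partial_fit(_example_batch_2)
# expected: _example_scaler.data_min_ == [-1., 2.] and _example_scaler.data_max_ == [1., 18.]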

568 

569@validate_params( 

570 { 

571 "X": ["array-like"], 

572 "axis": [Options(Integral, {0, 1})], 

573 }, 

574 prefer_skip_nested_validation=False, 

575) 

576def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True): 

577 """Transform features by scaling each feature to a given range. 

578 

579 This estimator scales and translates each feature individually such 

580 that it is in the given range on the training set, e.g. between

581 zero and one. 

582 

583 The transformation is given by (when ``axis=0``):: 

584 

585 X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0)) 

586 X_scaled = X_std * (max - min) + min 

587 

588 where min, max = feature_range. 

589 

590 The transformation is calculated as (when ``axis=0``):: 

591 

592 X_scaled = scale * X + min - X.min(axis=0) * scale 

593 where scale = (max - min) / (X.max(axis=0) - X.min(axis=0)) 

594 

595 This transformation is often used as an alternative to zero mean, 

596 unit variance scaling. 

597 

598 Read more in the :ref:`User Guide <preprocessing_scaler>`. 

599 

600 .. versionadded:: 0.17 

601 *minmax_scale* function interface 

602 to :class:`~sklearn.preprocessing.MinMaxScaler`. 

603 

604 Parameters 

605 ---------- 

606 X : array-like of shape (n_samples, n_features) 

607 The data. 

608 

609 feature_range : tuple (min, max), default=(0, 1) 

610 Desired range of transformed data. 

611 

612 axis : {0, 1}, default=0 

613 Axis used to scale along. If 0, independently scale each feature, 

614 otherwise (if 1) scale each sample. 

615 

616 copy : bool, default=True 

617 If False, try to avoid a copy and scale in place. 

618 This is not guaranteed to always work in place; e.g. if the data is 

619 a numpy array with an int dtype, a copy will be returned even with 

620 copy=False. 

621 

622 Returns 

623 ------- 

624 X_tr : ndarray of shape (n_samples, n_features) 

625 The transformed data. 

626 

627 .. warning:: Risk of data leak 

628 

629 Do not use :func:`~sklearn.preprocessing.minmax_scale` unless you know 

630 what you are doing. A common mistake is to apply it to the entire data 

631 *before* splitting into training and test sets. This will bias the 

632 model evaluation because information would have leaked from the test 

633 set to the training set. 

634 In general, we recommend using 

635 :class:`~sklearn.preprocessing.MinMaxScaler` within a 

636 :ref:`Pipeline <pipeline>` in order to prevent most risks of data 

637 leaking: `pipe = make_pipeline(MinMaxScaler(), LogisticRegression())`. 

638 

639 See Also 

640 -------- 

641 MinMaxScaler : Performs scaling to a given range using the Transformer 

642 API (e.g. as part of a preprocessing 

643 :class:`~sklearn.pipeline.Pipeline`). 

644 

645 Notes 

646 ----- 

647 For a comparison of the different scalers, transformers, and normalizers, 

648 see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. 

649 """ 

650 # Unlike the scaler object, this function allows 1d input. 

651 # If copy is required, it will be done inside the scaler object. 

652 X = check_array( 

653 X, copy=False, ensure_2d=False, dtype=FLOAT_DTYPES, force_all_finite="allow-nan" 

654 ) 

655 original_ndim = X.ndim 

656 

657 if original_ndim == 1: 

658 X = X.reshape(X.shape[0], 1) 

659 

660 s = MinMaxScaler(feature_range=feature_range, copy=copy) 

661 if axis == 0: 

662 X = s.fit_transform(X) 

663 else: 

664 X = s.fit_transform(X.T).T 

665 

666 if original_ndim == 1: 

667 X = X.ravel() 

668 

669 return X 

670 
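# Minimal illustrative sketch (hypothetical `_example_*` values): the two
# formulations given in the docstring above are algebraically equivalent.
_example_X = np.array([[1.0, -1.0], [2.0, 0.0], [3.0, 1.0]])
_example_scale = (1.0 - 0.0) / (_example_X.max(axis=0) - _example_X.min(axis=0))
_example_direct = _example_scale * _example_X + 0.0 - _example_X.min(axis=0) * _example_scale
# expected: np.allclose(minmax_scale(_example_X), _example_direct) is True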

671 

672class StandardScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): 

673 """Standardize features by removing the mean and scaling to unit variance. 

674 

675 The standard score of a sample `x` is calculated as: 

676 

677 z = (x - u) / s 

678 

679 where `u` is the mean of the training samples or zero if `with_mean=False`, 

680 and `s` is the standard deviation of the training samples or one if 

681 `with_std=False`. 

682 

683 Centering and scaling happen independently on each feature by computing 

684 the relevant statistics on the samples in the training set. Mean and 

685 standard deviation are then stored to be used on later data using 

686 :meth:`transform`. 

687 

688 Standardization of a dataset is a common requirement for many 

689 machine learning estimators: they might behave badly if the 

690 individual features do not more or less look like standard normally 

691 distributed data (e.g. Gaussian with 0 mean and unit variance). 

692 

693 For instance many elements used in the objective function of 

694 a learning algorithm (such as the RBF kernel of Support Vector 

695 Machines or the L1 and L2 regularizers of linear models) assume that 

696 all features are centered around 0 and have variance in the same 

697 order. If a feature has a variance that is orders of magnitude larger 

698 than others, it might dominate the objective function and make the 

699 estimator unable to learn from other features correctly as expected. 

700 

701 `StandardScaler` is sensitive to outliers, and the features may scale 

702 differently from each other in the presence of outliers. For an example 

703 visualization, refer to :ref:`Compare StandardScaler with other scalers 

704 <plot_all_scaling_standard_scaler_section>`. 

705 

706 This scaler can also be applied to sparse CSR or CSC matrices by passing 

707 `with_mean=False` to avoid breaking the sparsity structure of the data. 

708 

709 Read more in the :ref:`User Guide <preprocessing_scaler>`. 

710 

711 Parameters 

712 ---------- 

713 copy : bool, default=True 

714 If False, try to avoid a copy and do inplace scaling instead. 

715 This is not guaranteed to always work inplace; e.g. if the data is 

716 not a NumPy array or scipy.sparse CSR matrix, a copy may still be 

717 returned. 

718 

719 with_mean : bool, default=True 

720 If True, center the data before scaling. 

721 This does not work (and will raise an exception) when attempted on 

722 sparse matrices, because centering them entails building a dense 

723 matrix which in common use cases is likely to be too large to fit in 

724 memory. 

725 

726 with_std : bool, default=True 

727 If True, scale the data to unit variance (or equivalently, 

728 unit standard deviation). 

729 

730 Attributes 

731 ---------- 

732 scale_ : ndarray of shape (n_features,) or None 

733 Per feature relative scaling of the data to achieve zero mean and unit 

734 variance. Generally this is calculated using `np.sqrt(var_)`. If a 

735 variance is zero, we can't achieve unit variance, and the data is left 

736 as-is, giving a scaling factor of 1. `scale_` is equal to `None` 

737 when `with_std=False`. 

738 

739 .. versionadded:: 0.17 

740 *scale_* 

741 

742 mean_ : ndarray of shape (n_features,) or None 

743 The mean value for each feature in the training set. 

744 Equal to ``None`` when ``with_mean=False`` and ``with_std=False``. 

745 

746 var_ : ndarray of shape (n_features,) or None 

747 The variance for each feature in the training set. Used to compute 

748 `scale_`. Equal to ``None`` when ``with_mean=False`` and 

749 ``with_std=False``. 

750 

751 n_features_in_ : int 

752 Number of features seen during :term:`fit`. 

753 

754 .. versionadded:: 0.24 

755 

756 feature_names_in_ : ndarray of shape (`n_features_in_`,) 

757 Names of features seen during :term:`fit`. Defined only when `X` 

758 has feature names that are all strings. 

759 

760 .. versionadded:: 1.0 

761 

762 n_samples_seen_ : int or ndarray of shape (n_features,) 

763 The number of samples processed by the estimator for each feature. 

764 If there are no missing samples, ``n_samples_seen_`` will be an

765 integer, otherwise it will be an array of dtype int. If 

766 `sample_weights` are used it will be a float (if no missing data) 

767 or an array of dtype float that sums the weights seen so far. 

768 Will be reset on new calls to fit, but increments across 

769 ``partial_fit`` calls. 

770 

771 See Also 

772 -------- 

773 scale : Equivalent function without the estimator API. 

774 

775 :class:`~sklearn.decomposition.PCA` : Further removes the linear 

776 correlation across features with 'whiten=True'. 

777 

778 Notes 

779 ----- 

780 NaNs are treated as missing values: disregarded in fit, and maintained in 

781 transform. 

782 

783 We use a biased estimator for the standard deviation, equivalent to 

784 `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to 

785 affect model performance. 

786 

787 Examples 

788 -------- 

789 >>> from sklearn.preprocessing import StandardScaler 

790 >>> data = [[0, 0], [0, 0], [1, 1], [1, 1]] 

791 >>> scaler = StandardScaler() 

792 >>> print(scaler.fit(data)) 

793 StandardScaler() 

794 >>> print(scaler.mean_) 

795 [0.5 0.5] 

796 >>> print(scaler.transform(data)) 

797 [[-1. -1.] 

798 [-1. -1.] 

799 [ 1. 1.] 

800 [ 1. 1.]] 

801 >>> print(scaler.transform([[2, 2]])) 

802 [[3. 3.]] 

803 """ 

804 

805 _parameter_constraints: dict = { 

806 "copy": ["boolean"], 

807 "with_mean": ["boolean"], 

808 "with_std": ["boolean"], 

809 } 

810 

811 def __init__(self, *, copy=True, with_mean=True, with_std=True): 

812 self.with_mean = with_mean 

813 self.with_std = with_std 

814 self.copy = copy 

815 

816 def _reset(self): 

817 """Reset internal data-dependent state of the scaler, if necessary. 

818 

819 __init__ parameters are not touched. 

820 """ 

821 # Checking one attribute is enough, because they are all set together 

822 # in partial_fit 

823 if hasattr(self, "scale_"): 

824 del self.scale_ 

825 del self.n_samples_seen_ 

826 del self.mean_ 

827 del self.var_ 

828 

829 def fit(self, X, y=None, sample_weight=None): 

830 """Compute the mean and std to be used for later scaling. 

831 

832 Parameters 

833 ---------- 

834 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

835 The data used to compute the mean and standard deviation 

836 used for later scaling along the features axis. 

837 

838 y : None 

839 Ignored. 

840 

841 sample_weight : array-like of shape (n_samples,), default=None 

842 Individual weights for each sample. 

843 

844 .. versionadded:: 0.24 

845 parameter *sample_weight* support to StandardScaler. 

846 

847 Returns 

848 ------- 

849 self : object 

850 Fitted scaler. 

851 """ 

852 # Reset internal state before fitting 

853 self._reset() 

854 return self.partial_fit(X, y, sample_weight) 

855 

856 @_fit_context(prefer_skip_nested_validation=True) 

857 def partial_fit(self, X, y=None, sample_weight=None): 

858 """Online computation of mean and std on X for later scaling. 

859 

860 All of X is processed as a single batch. This is intended for cases 

861 when :meth:`fit` is not feasible due to a very large number of

862 `n_samples` or because X is read from a continuous stream. 

863 

864 The algorithm for incremental mean and std is given in Equation 1.5a,b 

865 in Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. "Algorithms 

866 for computing the sample variance: Analysis and recommendations." 

867 The American Statistician 37.3 (1983): 242-247: 

868 

869 Parameters 

870 ---------- 

871 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

872 The data used to compute the mean and standard deviation 

873 used for later scaling along the features axis. 

874 

875 y : None 

876 Ignored. 

877 

878 sample_weight : array-like of shape (n_samples,), default=None 

879 Individual weights for each sample. 

880 

881 .. versionadded:: 0.24 

882 parameter *sample_weight* support to StandardScaler. 

883 

884 Returns 

885 ------- 

886 self : object 

887 Fitted scaler. 

888 """ 

889 first_call = not hasattr(self, "n_samples_seen_") 

890 X = self._validate_data( 

891 X, 

892 accept_sparse=("csr", "csc"), 

893 dtype=FLOAT_DTYPES, 

894 force_all_finite="allow-nan", 

895 reset=first_call, 

896 ) 

897 n_features = X.shape[1] 

898 

899 if sample_weight is not None: 

900 sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) 

901 

902 # Even in the case of `with_mean=False`, we update the mean anyway 

903 # This is needed for the incremental computation of the var 

904 # See incr_mean_variance_axis and _incremental_mean_and_var

905 

906 # if n_samples_seen_ is an integer (i.e. no missing values), we need to 

907 # transform it to a NumPy array of shape (n_features,) required by 

908 # incr_mean_variance_axis and _incremental_mean_and_var

909 dtype = np.int64 if sample_weight is None else X.dtype 

910 if not hasattr(self, "n_samples_seen_"): 

911 self.n_samples_seen_ = np.zeros(n_features, dtype=dtype) 

912 elif np.size(self.n_samples_seen_) == 1: 

913 self.n_samples_seen_ = np.repeat(self.n_samples_seen_, X.shape[1]) 

914 self.n_samples_seen_ = self.n_samples_seen_.astype(dtype, copy=False) 

915 

916 if sparse.issparse(X): 

917 if self.with_mean: 

918 raise ValueError( 

919 "Cannot center sparse matrices: pass `with_mean=False` " 

920 "instead. See docstring for motivation and alternatives." 

921 ) 

922 sparse_constructor = ( 

923 sparse.csr_matrix if X.format == "csr" else sparse.csc_matrix 

924 ) 

925 

926 if self.with_std: 

927 # First pass 

928 if not hasattr(self, "scale_"): 

929 self.mean_, self.var_, self.n_samples_seen_ = mean_variance_axis( 

930 X, axis=0, weights=sample_weight, return_sum_weights=True 

931 ) 

932 # Next passes 

933 else: 

934 ( 

935 self.mean_, 

936 self.var_, 

937 self.n_samples_seen_, 

938 ) = incr_mean_variance_axis( 

939 X, 

940 axis=0, 

941 last_mean=self.mean_, 

942 last_var=self.var_, 

943 last_n=self.n_samples_seen_, 

944 weights=sample_weight, 

945 ) 

946 # We force the mean and variance to float64 for large arrays 

947 # See https://github.com/scikit-learn/scikit-learn/pull/12338 

948 self.mean_ = self.mean_.astype(np.float64, copy=False) 

949 self.var_ = self.var_.astype(np.float64, copy=False) 

950 else: 

951 self.mean_ = None # as with_mean must be False for sparse 

952 self.var_ = None 

953 weights = _check_sample_weight(sample_weight, X) 

954 sum_weights_nan = weights @ sparse_constructor( 

955 (np.isnan(X.data), X.indices, X.indptr), shape=X.shape 

956 ) 

957 self.n_samples_seen_ += (np.sum(weights) - sum_weights_nan).astype( 

958 dtype 

959 ) 

960 else: 

961 # First pass 

962 if not hasattr(self, "scale_"): 

963 self.mean_ = 0.0 

964 if self.with_std: 

965 self.var_ = 0.0 

966 else: 

967 self.var_ = None 

968 

969 if not self.with_mean and not self.with_std: 

970 self.mean_ = None 

971 self.var_ = None 

972 self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0) 

973 

974 else: 

975 self.mean_, self.var_, self.n_samples_seen_ = _incremental_mean_and_var( 

976 X, 

977 self.mean_, 

978 self.var_, 

979 self.n_samples_seen_, 

980 sample_weight=sample_weight, 

981 ) 

982 

983 # for backward-compatibility, reduce n_samples_seen_ to an integer 

984 # if the number of samples is the same for each feature (i.e. no 

985 # missing values) 

986 if np.ptp(self.n_samples_seen_) == 0: 

987 self.n_samples_seen_ = self.n_samples_seen_[0] 

988 

989 if self.with_std: 

990 # Extract the list of near constant features on the raw variances, 

991 # before taking the square root. 

992 constant_mask = _is_constant_feature( 

993 self.var_, self.mean_, self.n_samples_seen_ 

994 ) 

995 self.scale_ = _handle_zeros_in_scale( 

996 np.sqrt(self.var_), copy=False, constant_mask=constant_mask 

997 ) 

998 else: 

999 self.scale_ = None 

1000 

1001 return self 

1002 

1003 def transform(self, X, copy=None): 

1004 """Perform standardization by centering and scaling. 

1005 

1006 Parameters 

1007 ---------- 

1008 X : {array-like, sparse matrix} of shape (n_samples, n_features)

1009 The data used to scale along the features axis. 

1010 copy : bool, default=None 

1011 Copy the input X or not. 

1012 

1013 Returns 

1014 ------- 

1015 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) 

1016 Transformed array. 

1017 """ 

1018 check_is_fitted(self) 

1019 

1020 copy = copy if copy is not None else self.copy 

1021 X = self._validate_data( 

1022 X, 

1023 reset=False, 

1024 accept_sparse="csr", 

1025 copy=copy, 

1026 dtype=FLOAT_DTYPES, 

1027 force_all_finite="allow-nan", 

1028 ) 

1029 

1030 if sparse.issparse(X): 

1031 if self.with_mean: 

1032 raise ValueError( 

1033 "Cannot center sparse matrices: pass `with_mean=False` " 

1034 "instead. See docstring for motivation and alternatives." 

1035 ) 

1036 if self.scale_ is not None: 

1037 inplace_column_scale(X, 1 / self.scale_) 

1038 else: 

1039 if self.with_mean: 

1040 X -= self.mean_ 

1041 if self.with_std: 

1042 X /= self.scale_ 

1043 return X 

1044 

1045 def inverse_transform(self, X, copy=None): 

1046 """Scale back the data to the original representation. 

1047 

1048 Parameters 

1049 ---------- 

1050 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

1051 The data used to scale along the features axis. 

1052 copy : bool, default=None 

1053 Copy the input X or not. 

1054 

1055 Returns 

1056 ------- 

1057 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) 

1058 Transformed array. 

1059 """ 

1060 check_is_fitted(self) 

1061 

1062 copy = copy if copy is not None else self.copy 

1063 X = check_array( 

1064 X, 

1065 accept_sparse="csr", 

1066 copy=copy, 

1067 dtype=FLOAT_DTYPES, 

1068 force_all_finite="allow-nan", 

1069 ) 

1070 

1071 if sparse.issparse(X): 

1072 if self.with_mean: 

1073 raise ValueError( 

1074 "Cannot uncenter sparse matrices: pass `with_mean=False` " 

1075 "instead See docstring for motivation and alternatives." 

1076 ) 

1077 if self.scale_ is not None: 

1078 inplace_column_scale(X, self.scale_) 

1079 else: 

1080 if self.with_std: 

1081 X *= self.scale_ 

1082 if self.with_mean: 

1083 X += self.mean_ 

1084 return X 

1085 

1086 def _more_tags(self): 

1087 return {"allow_nan": True, "preserves_dtype": [np.float64, np.float32]} 

1088 
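# Minimal usage sketch (hypothetical `_example_*` values): two `partial_fit`
# calls update the running mean and variance with the incremental algorithm
# referenced above, matching a single `fit` on all rows.
_example_X = np.array([[0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0]])
_example_scaler = StandardScaler().partial_fit(_example_X[:2]).partial_fit(_example_X[2:])
# expected: _example_scaler.mean_ == [0.5, 0.5] and _example_scaler.var_ == [0.25, 0.25]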

1089 

1090class MaxAbsScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): 

1091 """Scale each feature by its maximum absolute value. 

1092 

1093 This estimator scales each feature individually such

1094 that the maximal absolute value of each feature in the 

1095 training set will be 1.0. It does not shift/center the data, and 

1096 thus does not destroy any sparsity. 

1097 

1098 This scaler can also be applied to sparse CSR or CSC matrices. 

1099 

1100 `MaxAbsScaler` doesn't reduce the effect of outliers; it only linearly 

1101 scales them down. For an example visualization, refer to :ref:`Compare 

1102 MaxAbsScaler with other scalers <plot_all_scaling_max_abs_scaler_section>`. 

1103 

1104 .. versionadded:: 0.17 

1105 

1106 Parameters 

1107 ---------- 

1108 copy : bool, default=True 

1109 Set to False to perform inplace scaling and avoid a copy (if the input 

1110 is already a numpy array). 

1111 

1112 Attributes 

1113 ---------- 

1114 scale_ : ndarray of shape (n_features,) 

1115 Per feature relative scaling of the data. 

1116 

1117 .. versionadded:: 0.17 

1118 *scale_* attribute. 

1119 

1120 max_abs_ : ndarray of shape (n_features,) 

1121 Per feature maximum absolute value. 

1122 

1123 n_features_in_ : int 

1124 Number of features seen during :term:`fit`. 

1125 

1126 .. versionadded:: 0.24 

1127 

1128 feature_names_in_ : ndarray of shape (`n_features_in_`,) 

1129 Names of features seen during :term:`fit`. Defined only when `X` 

1130 has feature names that are all strings. 

1131 

1132 .. versionadded:: 1.0 

1133 

1134 n_samples_seen_ : int 

1135 The number of samples processed by the estimator. Will be reset on 

1136 new calls to fit, but increments across ``partial_fit`` calls. 

1137 

1138 See Also 

1139 -------- 

1140 maxabs_scale : Equivalent function without the estimator API. 

1141 

1142 Notes 

1143 ----- 

1144 NaNs are treated as missing values: disregarded in fit, and maintained in 

1145 transform. 

1146 

1147 Examples 

1148 -------- 

1149 >>> from sklearn.preprocessing import MaxAbsScaler 

1150 >>> X = [[ 1., -1., 2.], 

1151 ... [ 2., 0., 0.], 

1152 ... [ 0., 1., -1.]] 

1153 >>> transformer = MaxAbsScaler().fit(X) 

1154 >>> transformer 

1155 MaxAbsScaler() 

1156 >>> transformer.transform(X) 

1157 array([[ 0.5, -1. , 1. ], 

1158 [ 1. , 0. , 0. ], 

1159 [ 0. , 1. , -0.5]]) 

1160 """ 

1161 

1162 _parameter_constraints: dict = {"copy": ["boolean"]} 

1163 

1164 def __init__(self, *, copy=True): 

1165 self.copy = copy 

1166 

1167 def _reset(self): 

1168 """Reset internal data-dependent state of the scaler, if necessary. 

1169 

1170 __init__ parameters are not touched. 

1171 """ 

1172 # Checking one attribute is enough, because they are all set together 

1173 # in partial_fit 

1174 if hasattr(self, "scale_"): 

1175 del self.scale_ 

1176 del self.n_samples_seen_ 

1177 del self.max_abs_ 

1178 

1179 def fit(self, X, y=None): 

1180 """Compute the maximum absolute value to be used for later scaling. 

1181 

1182 Parameters 

1183 ---------- 

1184 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

1185 The data used to compute the per-feature maximum absolute value

1186 used for later scaling along the features axis. 

1187 

1188 y : None 

1189 Ignored. 

1190 

1191 Returns 

1192 ------- 

1193 self : object 

1194 Fitted scaler. 

1195 """ 

1196 # Reset internal state before fitting 

1197 self._reset() 

1198 return self.partial_fit(X, y) 

1199 

1200 @_fit_context(prefer_skip_nested_validation=True) 

1201 def partial_fit(self, X, y=None): 

1202 """Online computation of max absolute value of X for later scaling. 

1203 

1204 All of X is processed as a single batch. This is intended for cases 

1205 when :meth:`fit` is not feasible due to a very large number of

1206 `n_samples` or because X is read from a continuous stream. 

1207 

1208 Parameters 

1209 ---------- 

1210 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

1211 The data used to compute the per-feature maximum absolute value

1212 used for later scaling along the features axis. 

1213 

1214 y : None 

1215 Ignored. 

1216 

1217 Returns 

1218 ------- 

1219 self : object 

1220 Fitted scaler. 

1221 """ 

1222 xp, _ = get_namespace(X) 

1223 

1224 first_pass = not hasattr(self, "n_samples_seen_") 

1225 X = self._validate_data( 

1226 X, 

1227 reset=first_pass, 

1228 accept_sparse=("csr", "csc"), 

1229 dtype=_array_api.supported_float_dtypes(xp), 

1230 force_all_finite="allow-nan", 

1231 ) 

1232 

1233 if sparse.issparse(X): 

1234 mins, maxs = min_max_axis(X, axis=0, ignore_nan=True) 

1235 max_abs = np.maximum(np.abs(mins), np.abs(maxs)) 

1236 else: 

1237 max_abs = _array_api._nanmax(xp.abs(X), axis=0) 

1238 

1239 if first_pass: 

1240 self.n_samples_seen_ = X.shape[0] 

1241 else: 

1242 max_abs = xp.maximum(self.max_abs_, max_abs) 

1243 self.n_samples_seen_ += X.shape[0] 

1244 

1245 self.max_abs_ = max_abs 

1246 self.scale_ = _handle_zeros_in_scale(max_abs, copy=True) 

1247 return self 

1248 

1249 def transform(self, X): 

1250 """Scale the data. 

1251 

1252 Parameters 

1253 ---------- 

1254 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

1255 The data that should be scaled. 

1256 

1257 Returns 

1258 ------- 

1259 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) 

1260 Transformed array. 

1261 """ 

1262 check_is_fitted(self) 

1263 

1264 xp, _ = get_namespace(X) 

1265 

1266 X = self._validate_data( 

1267 X, 

1268 accept_sparse=("csr", "csc"), 

1269 copy=self.copy, 

1270 reset=False, 

1271 dtype=_array_api.supported_float_dtypes(xp), 

1272 force_all_finite="allow-nan", 

1273 ) 

1274 

1275 if sparse.issparse(X): 

1276 inplace_column_scale(X, 1.0 / self.scale_) 

1277 else: 

1278 X /= self.scale_ 

1279 return X 

1280 

1281 def inverse_transform(self, X): 

1282 """Scale back the data to the original representation. 

1283 

1284 Parameters 

1285 ---------- 

1286 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

1287 The data that should be transformed back. 

1288 

1289 Returns 

1290 ------- 

1291 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) 

1292 Transformed array. 

1293 """ 

1294 check_is_fitted(self) 

1295 

1296 xp, _ = get_namespace(X) 

1297 

1298 X = check_array( 

1299 X, 

1300 accept_sparse=("csr", "csc"), 

1301 copy=self.copy, 

1302 dtype=_array_api.supported_float_dtypes(xp), 

1303 force_all_finite="allow-nan", 

1304 ) 

1305 

1306 if sparse.issparse(X): 

1307 inplace_column_scale(X, self.scale_) 

1308 else: 

1309 X *= self.scale_ 

1310 return X 

1311 

1312 def _more_tags(self): 

1313 return {"allow_nan": True} 

1314 
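# Minimal usage sketch (hypothetical `_example_*` values): because MaxAbsScaler
# only divides by the per-column maximum absolute value, it can be applied to a
# CSR matrix without densifying it.
_example_X_sparse = sparse.csr_matrix(np.array([[1.0, -2.0], [2.0, 0.0], [0.0, 1.0]]))
_example_X_scaled = MaxAbsScaler().fit_transform(_example_X_sparse)
# expected: a sparse CSR result with both columns divided by 2.0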

1315 

1316@validate_params( 

1317 { 

1318 "X": ["array-like", "sparse matrix"], 

1319 "axis": [Options(Integral, {0, 1})], 

1320 }, 

1321 prefer_skip_nested_validation=False, 

1322) 

1323def maxabs_scale(X, *, axis=0, copy=True): 

1324 """Scale each feature to the [-1, 1] range without breaking the sparsity. 

1325 

1326 This estimator scales each feature individually such 

1327 that the maximal absolute value of each feature in the 

1328 training set will be 1.0. 

1329 

1330 This scaler can also be applied to sparse CSR or CSC matrices. 

1331 

1332 Parameters 

1333 ---------- 

1334 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

1335 The data. 

1336 

1337 axis : {0, 1}, default=0 

1338 Axis used to scale along. If 0, independently scale each feature, 

1339 otherwise (if 1) scale each sample. 

1340 

1341 copy : bool, default=True 

1342 If False, try to avoid a copy and scale in place. 

1343 This is not guaranteed to always work in place; e.g. if the data is 

1344 a numpy array with an int dtype, a copy will be returned even with 

1345 copy=False. 

1346 

1347 Returns 

1348 ------- 

1349 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) 

1350 The transformed data. 

1351 

1352 .. warning:: Risk of data leak 

1353 

1354 Do not use :func:`~sklearn.preprocessing.maxabs_scale` unless you know 

1355 what you are doing. A common mistake is to apply it to the entire data 

1356 *before* splitting into training and test sets. This will bias the 

1357 model evaluation because information would have leaked from the test 

1358 set to the training set. 

1359 In general, we recommend using 

1360 :class:`~sklearn.preprocessing.MaxAbsScaler` within a 

1361 :ref:`Pipeline <pipeline>` in order to prevent most risks of data 

1362 leaking: `pipe = make_pipeline(MaxAbsScaler(), LogisticRegression())`. 

1363 

1364 See Also 

1365 -------- 

1366 MaxAbsScaler : Performs scaling to the [-1, 1] range using 

1367 the Transformer API (e.g. as part of a preprocessing 

1368 :class:`~sklearn.pipeline.Pipeline`). 

1369 

1370 Notes 

1371 ----- 

1372 NaNs are treated as missing values: disregarded to compute the statistics, 

1373 and maintained during the data transformation. 

1374 

1375 For a comparison of the different scalers, transformers, and normalizers, 

1376 see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. 

1377 """ 

1378 # Unlike the scaler object, this function allows 1d input. 

1379 

1380 # If copy is required, it will be done inside the scaler object. 

1381 X = check_array( 

1382 X, 

1383 accept_sparse=("csr", "csc"), 

1384 copy=False, 

1385 ensure_2d=False, 

1386 dtype=FLOAT_DTYPES, 

1387 force_all_finite="allow-nan", 

1388 ) 

1389 original_ndim = X.ndim 

1390 

1391 if original_ndim == 1: 

1392 X = X.reshape(X.shape[0], 1) 

1393 

1394 s = MaxAbsScaler(copy=copy) 

1395 if axis == 0: 

1396 X = s.fit_transform(X) 

1397 else: 

1398 X = s.fit_transform(X.T).T 

1399 

1400 if original_ndim == 1: 

1401 X = X.ravel() 

1402 

1403 return X 

1404 
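# Minimal illustrative sketch (hypothetical `_example_*` values): unlike the
# scaler class, the function also accepts 1d input, which is scaled as a single
# feature into the [-1, 1] range.
_example_ratios = maxabs_scale(np.array([-4.0, 2.0, 8.0]))
# expected: array([-0.5, 0.25, 1.])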

1405 

1406class RobustScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): 

1407 """Scale features using statistics that are robust to outliers. 

1408 

1409 This Scaler removes the median and scales the data according to 

1410 the quantile range (defaults to IQR: Interquartile Range). 

1411 The IQR is the range between the 1st quartile (25th quantile) 

1412 and the 3rd quartile (75th quantile). 

1413 

1414 Centering and scaling happen independently on each feature by 

1415 computing the relevant statistics on the samples in the training 

1416 set. Median and interquartile range are then stored to be used on 

1417 later data using the :meth:`transform` method. 

1418 

1419 Standardization of a dataset is a common preprocessing for many machine 

1420 learning estimators. Typically this is done by removing the mean and 

1421 scaling to unit variance. However, outliers can often influence the sample 

1422 mean / variance in a negative way. In such cases, using the median and the 

1423 interquartile range often gives better results. For an example visualization

1424 and comparison to other scalers, refer to :ref:`Compare RobustScaler with 

1425 other scalers <plot_all_scaling_robust_scaler_section>`. 

1426 

1427 .. versionadded:: 0.17 

1428 

1429 Read more in the :ref:`User Guide <preprocessing_scaler>`. 

1430 

1431 Parameters 

1432 ---------- 

1433 with_centering : bool, default=True 

1434 If `True`, center the data before scaling. 

1435 This will cause :meth:`transform` to raise an exception when attempted 

1436 on sparse matrices, because centering them entails building a dense 

1437 matrix which in common use cases is likely to be too large to fit in 

1438 memory. 

1439 

1440 with_scaling : bool, default=True 

1441 If `True`, scale the data to interquartile range. 

1442 

1443 quantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0, \ 

1444 default=(25.0, 75.0) 

1445 Quantile range used to calculate `scale_`. By default this is equal to 

1446 the IQR, i.e., `q_min` is the first quartile and `q_max` is the third

1447 quartile.

1448 

1449 .. versionadded:: 0.18 

1450 

1451 copy : bool, default=True 

1452 If `False`, try to avoid a copy and do inplace scaling instead. 

1453 This is not guaranteed to always work inplace; e.g. if the data is 

1454 not a NumPy array or scipy.sparse CSR matrix, a copy may still be 

1455 returned. 

1456 

1457 unit_variance : bool, default=False 

1458 If `True`, scale data so that normally distributed features have a 

1459 variance of 1. In general, if the difference between the x-values of 

1460 `q_max` and `q_min` for a standard normal distribution is greater 

1461 than 1, the dataset will be scaled down. If less than 1, the dataset 

1462 will be scaled up. 

1463 

1464 .. versionadded:: 0.24 

1465 

1466 Attributes 

1467 ---------- 

1468 center_ : array of floats 

1469 The median value for each feature in the training set. 

1470 

1471 scale_ : array of floats 

1472 The (scaled) interquartile range for each feature in the training set. 

1473 

1474 .. versionadded:: 0.17 

1475 *scale_* attribute. 

1476 

1477 n_features_in_ : int 

1478 Number of features seen during :term:`fit`. 

1479 

1480 .. versionadded:: 0.24 

1481 

1482 feature_names_in_ : ndarray of shape (`n_features_in_`,) 

1483 Names of features seen during :term:`fit`. Defined only when `X` 

1484 has feature names that are all strings. 

1485 

1486 .. versionadded:: 1.0 

1487 

1488 See Also 

1489 -------- 

1490 robust_scale : Equivalent function without the estimator API. 

1491 sklearn.decomposition.PCA : Further removes the linear correlation across 

1492 features with 'whiten=True'. 

1493 

1494 Notes 

1495 ----- 

1496 

1497 https://en.wikipedia.org/wiki/Median 

1498 https://en.wikipedia.org/wiki/Interquartile_range 

1499 

1500 Examples 

1501 -------- 

1502 >>> from sklearn.preprocessing import RobustScaler 

1503 >>> X = [[ 1., -2., 2.], 

1504 ... [ -2., 1., 3.], 

1505 ... [ 4., 1., -2.]] 

1506 >>> transformer = RobustScaler().fit(X) 

1507 >>> transformer 

1508 RobustScaler() 

1509 >>> transformer.transform(X) 

1510 array([[ 0. , -2. , 0. ], 

1511 [-1. , 0. , 0.4], 

1512 [ 1. , 0. , -1.6]]) 

1513 """ 

1514 

1515 _parameter_constraints: dict = { 

1516 "with_centering": ["boolean"], 

1517 "with_scaling": ["boolean"], 

1518 "quantile_range": [tuple], 

1519 "copy": ["boolean"], 

1520 "unit_variance": ["boolean"], 

1521 } 

1522 

1523 def __init__( 

1524 self, 

1525 *, 

1526 with_centering=True, 

1527 with_scaling=True, 

1528 quantile_range=(25.0, 75.0), 

1529 copy=True, 

1530 unit_variance=False, 

1531 ): 

1532 self.with_centering = with_centering 

1533 self.with_scaling = with_scaling 

1534 self.quantile_range = quantile_range 

1535 self.unit_variance = unit_variance 

1536 self.copy = copy 

1537 

1538 @_fit_context(prefer_skip_nested_validation=True) 

1539 def fit(self, X, y=None): 

1540 """Compute the median and quantiles to be used for scaling. 

1541 

1542 Parameters 

1543 ---------- 

1544 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

1545 The data used to compute the median and quantiles 

1546 used for later scaling along the features axis. 

1547 

1548 y : Ignored 

1549 Not used, present here for API consistency by convention. 

1550 

1551 Returns 

1552 ------- 

1553 self : object 

1554 Fitted scaler. 

1555 """ 

1556 # at fit, convert sparse matrices to csc for optimized computation of 

1557 # the quantiles 

1558 X = self._validate_data( 

1559 X, 

1560 accept_sparse="csc", 

1561 dtype=FLOAT_DTYPES, 

1562 force_all_finite="allow-nan", 

1563 ) 

1564 

1565 q_min, q_max = self.quantile_range 

1566 if not 0 <= q_min <= q_max <= 100: 

1567 raise ValueError("Invalid quantile range: %s" % str(self.quantile_range)) 

1568 

1569 if self.with_centering: 

1570 if sparse.issparse(X): 

1571 raise ValueError( 

1572 "Cannot center sparse matrices: use `with_centering=False`" 

1573 " instead. See docstring for motivation and alternatives." 

1574 ) 

1575 self.center_ = np.nanmedian(X, axis=0) 

1576 else: 

1577 self.center_ = None 

1578 

1579 if self.with_scaling: 

1580 quantiles = [] 

1581 for feature_idx in range(X.shape[1]): 

1582 if sparse.issparse(X): 

1583 column_nnz_data = X.data[ 

1584 X.indptr[feature_idx] : X.indptr[feature_idx + 1] 

1585 ] 

1586 column_data = np.zeros(shape=X.shape[0], dtype=X.dtype) 

1587 column_data[: len(column_nnz_data)] = column_nnz_data 

1588 else: 

1589 column_data = X[:, feature_idx] 

1590 

1591 quantiles.append(np.nanpercentile(column_data, self.quantile_range)) 

1592 

1593 quantiles = np.transpose(quantiles) 

1594 

1595 self.scale_ = quantiles[1] - quantiles[0] 

1596 self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False) 

1597 if self.unit_variance: 

1598 adjust = stats.norm.ppf(q_max / 100.0) - stats.norm.ppf(q_min / 100.0) 

1599 self.scale_ = self.scale_ / adjust 

1600 else: 

1601 self.scale_ = None 

1602 

1603 return self 

1604 

1605 def transform(self, X): 

1606 """Center and scale the data. 

1607 

1608 Parameters 

1609 ---------- 

1610 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

1611 The data used to scale along the specified axis. 

1612 

1613 Returns 

1614 ------- 

1615 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) 

1616 Transformed array. 

1617 """ 

1618 check_is_fitted(self) 

1619 X = self._validate_data( 

1620 X, 

1621 accept_sparse=("csr", "csc"), 

1622 copy=self.copy, 

1623 dtype=FLOAT_DTYPES, 

1624 reset=False, 

1625 force_all_finite="allow-nan", 

1626 ) 

1627 

1628 if sparse.issparse(X): 

1629 if self.with_scaling: 

1630 inplace_column_scale(X, 1.0 / self.scale_) 

1631 else: 

1632 if self.with_centering: 

1633 X -= self.center_ 

1634 if self.with_scaling: 

1635 X /= self.scale_ 

1636 return X 

1637 

1638 def inverse_transform(self, X): 

1639 """Scale back the data to the original representation. 

1640 

1641 Parameters 

1642 ---------- 

1643 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

1644 The rescaled data to be transformed back. 

1645 

1646 Returns 

1647 ------- 

1648 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) 

1649 Transformed array. 

1650 """ 

1651 check_is_fitted(self) 

1652 X = check_array( 

1653 X, 

1654 accept_sparse=("csr", "csc"), 

1655 copy=self.copy, 

1656 dtype=FLOAT_DTYPES, 

1657 force_all_finite="allow-nan", 

1658 ) 

1659 

1660 if sparse.issparse(X): 

1661 if self.with_scaling: 

1662 inplace_column_scale(X, self.scale_) 

1663 else: 

1664 if self.with_scaling: 

1665 X *= self.scale_ 

1666 if self.with_centering: 

1667 X += self.center_ 

1668 return X 

1669 

1670 def _more_tags(self): 

1671 return {"allow_nan": True} 

1672 
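# Minimal illustrative sketch (hypothetical `_example_*` values): because the
# center and scale come from the median and the IQR, a single extreme outlier
# barely changes how the remaining samples are transformed.
_example_X = np.array([[1.0], [2.0], [3.0], [4.0], [1000.0]])
_example_X_tr = RobustScaler().fit_transform(_example_X)
# the median is 3.0 and the IQR is 2.0, so the first four rows map to -1., -0.5, 0., 0.5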

1673 

1674@validate_params( 

1675 {"X": ["array-like", "sparse matrix"], "axis": [Options(Integral, {0, 1})]}, 

1676 prefer_skip_nested_validation=False, 

1677) 

1678def robust_scale( 

1679 X, 

1680 *, 

1681 axis=0, 

1682 with_centering=True, 

1683 with_scaling=True, 

1684 quantile_range=(25.0, 75.0), 

1685 copy=True, 

1686 unit_variance=False, 

1687): 

1688 """Standardize a dataset along any axis. 

1689 

1690 Center to the median and component-wise scale

1691 according to the interquartile range. 

1692 

1693 Read more in the :ref:`User Guide <preprocessing_scaler>`. 

1694 

1695 Parameters 

1696 ---------- 

1697 X : {array-like, sparse matrix} of shape (n_samples, n_features)

1698 The data to center and scale. 

1699 

1700 axis : int, default=0 

1701 Axis used to compute the medians and IQR along. If 0, 

1702 independently scale each feature, otherwise (if 1) scale 

1703 each sample. 

1704 

1705 with_centering : bool, default=True 

1706 If `True`, center the data before scaling. 

1707 

1708 with_scaling : bool, default=True 

1709 If `True`, scale the data to unit variance (or equivalently, 

1710 unit standard deviation). 

1711 

1712 quantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0,\ 

1713 default=(25.0, 75.0) 

1714 Quantile range used to calculate `scale_`. By default this is equal to 

1715 the IQR, i.e., `q_min` is the first quartile and `q_max` is the third

1716 quartile.

1717 

1718 .. versionadded:: 0.18 

1719 

1720 copy : bool, default=True 

1721 If False, try to avoid a copy and scale in place. 

1722 This is not guaranteed to always work in place; e.g. if the data is 

1723 a numpy array with an int dtype, a copy will be returned even with 

1724 copy=False. 

1725 

1726 unit_variance : bool, default=False 

1727 If `True`, scale data so that normally distributed features have a 

1728 variance of 1. In general, if the difference between the x-values of 

1729 `q_max` and `q_min` for a standard normal distribution is greater 

1730 than 1, the dataset will be scaled down. If less than 1, the dataset 

1731 will be scaled up. 

1732 

1733 .. versionadded:: 0.24 

1734 

1735 Returns 

1736 ------- 

1737 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) 

1738 The transformed data. 

1739 

1740 See Also 

1741 -------- 

1742 RobustScaler : Performs centering and scaling using the Transformer API 

1743 (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`). 

1744 

1745 Notes 

1746 ----- 

1747 This implementation will refuse to center scipy.sparse matrices 

1748 since it would make them non-sparse and would potentially crash the 

1749 program with memory exhaustion problems. 

1750 

1751 Instead, the caller is expected either to explicitly set 

1752 `with_centering=False` (in that case, only variance scaling will be 

1753 performed on the features of the CSR matrix) or to call `X.toarray()` 

1754 if the materialized dense array is expected to fit in memory. 

1755 

1756 To avoid a memory copy, the caller should pass a CSR matrix. 

1757 

1758 For a comparison of the different scalers, transformers, and normalizers, 

1759 see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. 

1760 

1761 .. warning:: Risk of data leak 

1762 

1763 Do not use :func:`~sklearn.preprocessing.robust_scale` unless you know 

1764 what you are doing. A common mistake is to apply it to the entire data 

1765 *before* splitting into training and test sets. This will bias the 

1766 model evaluation because information would have leaked from the test 

1767 set to the training set. 

1768 In general, we recommend using 

1769 :class:`~sklearn.preprocessing.RobustScaler` within a 

1770 :ref:`Pipeline <pipeline>` in order to prevent most risks of data 

1771 leaking: `pipe = make_pipeline(RobustScaler(), LogisticRegression())`. 
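    Examples
    --------
    A minimal sketch on arbitrary toy data, centering each column on its
    median and scaling it by its interquartile range:

    >>> from sklearn.preprocessing import robust_scale
    >>> X = [[-2.0, 2.0], [-1.0, 3.0], [0.0, 10.0], [100.0, 4.0]]
    >>> robust_scale(X)
    array([...])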

1772 """ 

1773 X = check_array( 

1774 X, 

1775 accept_sparse=("csr", "csc"), 

1776 copy=False, 

1777 ensure_2d=False, 

1778 dtype=FLOAT_DTYPES, 

1779 force_all_finite="allow-nan", 

1780 ) 

1781 original_ndim = X.ndim 

1782 

1783 if original_ndim == 1: 

1784 X = X.reshape(X.shape[0], 1) 

1785 

1786 s = RobustScaler( 

1787 with_centering=with_centering, 

1788 with_scaling=with_scaling, 

1789 quantile_range=quantile_range, 

1790 unit_variance=unit_variance, 

1791 copy=copy, 

1792 ) 

1793 if axis == 0: 

1794 X = s.fit_transform(X) 

1795 else: 

1796 X = s.fit_transform(X.T).T 

1797 

1798 if original_ndim == 1: 

1799 X = X.ravel() 

1800 

1801 return X 

1802 

1803 

1804@validate_params( 

1805 { 

1806 "X": ["array-like", "sparse matrix"], 

1807 "norm": [StrOptions({"l1", "l2", "max"})], 

1808 "axis": [Options(Integral, {0, 1})], 

1809 "copy": ["boolean"], 

1810 "return_norm": ["boolean"], 

1811 }, 

1812 prefer_skip_nested_validation=True, 

1813) 

1814def normalize(X, norm="l2", *, axis=1, copy=True, return_norm=False): 

1815 """Scale input vectors individually to unit norm (vector length). 

1816 

1817 Read more in the :ref:`User Guide <preprocessing_normalization>`. 

1818 

1819 Parameters 

1820 ---------- 

1821 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

1822 The data to normalize, element by element. 

1823 scipy.sparse matrices should be in CSR format to avoid an 

1824 unnecessary copy. 

1825 

1826 norm : {'l1', 'l2', 'max'}, default='l2' 

1827 The norm to use to normalize each non zero sample (or each non-zero 

1828 feature if axis is 0). 

1829 

1830 axis : {0, 1}, default=1 

1831 Define axis used to normalize the data along. If 1, independently 

1832 normalize each sample, otherwise (if 0) normalize each feature. 

1833 

1834 copy : bool, default=True 

1835 If False, try to avoid a copy and normalize in place. 

1836 This is not guaranteed to always work in place; e.g. if the data is 

1837 a numpy array with an int dtype, a copy will be returned even with 

1838 copy=False. 

1839 

1840 return_norm : bool, default=False 

1841 Whether to return the computed norms. 

1842 

1843 Returns 

1844 ------- 

1845 X : {ndarray, sparse matrix} of shape (n_samples, n_features) 

1846 Normalized input X. 

1847 

1848 norms : ndarray of shape (n_samples, ) if axis=1 else (n_features, ) 

1849 An array of norms along given axis for X. 

1850 When X is sparse, a NotImplementedError will be raised 

1851 for norm 'l1' or 'l2'. 

1852 

1853 See Also 

1854 -------- 

1855 Normalizer : Performs normalization using the Transformer API 

1856 (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`). 

1857 

1858 Notes 

1859 ----- 

1860 For a comparison of the different scalers, transformers, and normalizers, 

1861 see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. 
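    Examples
    --------
    A minimal sketch on a small dense array; by default each row is
    rescaled to unit l2 norm:

    >>> from sklearn.preprocessing import normalize
    >>> X = [[3.0, 4.0], [1.0, 0.0]]
    >>> normalize(X)
    array([[0.6, 0.8],
           [1. , 0. ]])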

1862 """ 

1863 if axis == 0: 

1864 sparse_format = "csc" 

1865 else: # axis == 1: 

1866 sparse_format = "csr" 

1867 

1868 xp, _ = get_namespace(X) 

1869 

1870 X = check_array( 

1871 X, 

1872 accept_sparse=sparse_format, 

1873 copy=copy, 

1874 estimator="the normalize function", 

1875 dtype=_array_api.supported_float_dtypes(xp), 

1876 ) 

1877 if axis == 0: 

1878 X = X.T 

1879 

1880 if sparse.issparse(X): 

1881 if return_norm and norm in ("l1", "l2"): 

1882 raise NotImplementedError( 

1883 "return_norm=True is not implemented " 

1884 "for sparse matrices with norm 'l1' " 

1885 "or norm 'l2'" 

1886 ) 

1887 if norm == "l1": 

1888 inplace_csr_row_normalize_l1(X) 

1889 elif norm == "l2": 

1890 inplace_csr_row_normalize_l2(X) 

1891 elif norm == "max": 

1892 mins, maxes = min_max_axis(X, 1) 

1893 norms = np.maximum(abs(mins), maxes) 
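            # Broadcast each row's max-abs norm to all of its stored entries:
            # for CSR data, np.diff(X.indptr) is the number of non-zeros per
            # row, so `repeat` yields one norm value per element of X.data.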

1894 norms_elementwise = norms.repeat(np.diff(X.indptr)) 

1895 mask = norms_elementwise != 0 

1896 X.data[mask] /= norms_elementwise[mask] 

1897 else: 

1898 if norm == "l1": 

1899 norms = xp.sum(xp.abs(X), axis=1) 

1900 elif norm == "l2": 

1901 norms = row_norms(X) 

1902 elif norm == "max": 

1903 norms = xp.max(xp.abs(X), axis=1) 

1904 norms = _handle_zeros_in_scale(norms, copy=False) 

1905 X /= norms[:, None] 

1906 

1907 if axis == 0: 

1908 X = X.T 

1909 

1910 if return_norm: 

1911 return X, norms 

1912 else: 

1913 return X 

1914 

1915 

1916class Normalizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): 

1917 """Normalize samples individually to unit norm. 

1918 

1919 Each sample (i.e. each row of the data matrix) with at least one 

1920 non zero component is rescaled independently of other samples so 

1921 that its norm (l1, l2 or inf) equals one. 

1922 

1923 This transformer is able to work both with dense numpy arrays and 

1924 scipy.sparse matrix (use CSR format if you want to avoid the burden of 

1925 a copy / conversion). 

1926 

1927 Scaling inputs to unit norms is a common operation for text 

1928 classification or clustering for instance. For instance the dot 

1929 product of two l2-normalized TF-IDF vectors is the cosine similarity 

1930 of the vectors and is the base similarity metric for the Vector 

1931 Space Model commonly used by the Information Retrieval community. 

1932 

1933 For an example visualization, refer to :ref:`Compare Normalizer with other 

1934 scalers <plot_all_scaling_normalizer_section>`. 

1935 

1936 Read more in the :ref:`User Guide <preprocessing_normalization>`. 

1937 

1938 Parameters 

1939 ---------- 

1940 norm : {'l1', 'l2', 'max'}, default='l2' 

1941 The norm to use to normalize each non zero sample. If norm='max' 

1942 is used, values will be rescaled by the maximum of the absolute 

1943 values. 

1944 

1945 copy : bool, default=True 

1946 Set to False to perform inplace row normalization and avoid a 

1947 copy (if the input is already a numpy array or a scipy.sparse 

1948 CSR matrix). 

1949 

1950 Attributes 

1951 ---------- 

1952 n_features_in_ : int 

1953 Number of features seen during :term:`fit`. 

1954 

1955 .. versionadded:: 0.24 

1956 

1957 feature_names_in_ : ndarray of shape (`n_features_in_`,) 

1958 Names of features seen during :term:`fit`. Defined only when `X` 

1959 has feature names that are all strings. 

1960 

1961 .. versionadded:: 1.0 

1962 

1963 See Also 

1964 -------- 

1965 normalize : Equivalent function without the estimator API. 

1966 

1967 Notes 

1968 ----- 

1969 This estimator is :term:`stateless` and does not need to be fitted. 

1970 However, we recommend calling :meth:`fit_transform` instead of 

1971 :meth:`transform`, as parameter validation is only performed in 

1972 :meth:`fit`. 

1973 

1974 Examples 

1975 -------- 

1976 >>> from sklearn.preprocessing import Normalizer 

1977 >>> X = [[4, 1, 2, 2], 

1978 ... [1, 3, 9, 3], 

1979 ... [5, 7, 5, 1]] 

1980 >>> transformer = Normalizer().fit(X) # fit does nothing. 

1981 >>> transformer 

1982 Normalizer() 

1983 >>> transformer.transform(X) 

1984 array([[0.8, 0.2, 0.4, 0.4], 

1985 [0.1, 0.3, 0.9, 0.3], 

1986 [0.5, 0.7, 0.5, 0.1]]) 

1987 """ 

1988 

1989 _parameter_constraints: dict = { 

1990 "norm": [StrOptions({"l1", "l2", "max"})], 

1991 "copy": ["boolean"], 

1992 } 

1993 

1994 def __init__(self, norm="l2", *, copy=True): 

1995 self.norm = norm 

1996 self.copy = copy 

1997 

1998 @_fit_context(prefer_skip_nested_validation=True) 

1999 def fit(self, X, y=None): 

2000 """Only validates estimator's parameters. 

2001 

2002 This method exists to (i) validate the estimator's parameters and 

2003 (ii) remain consistent with the scikit-learn transformer API. 

2004 

2005 Parameters 

2006 ---------- 

2007 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

2008 The data to estimate the normalization parameters. 

2009 

2010 y : Ignored 

2011 Not used, present here for API consistency by convention. 

2012 

2013 Returns 

2014 ------- 

2015 self : object 

2016 Fitted transformer. 

2017 """ 

2018 self._validate_data(X, accept_sparse="csr") 

2019 return self 

2020 

2021 def transform(self, X, copy=None): 

2022 """Scale each non zero row of X to unit norm. 

2023 

2024 Parameters 

2025 ---------- 

2026 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

2027 The data to normalize, row by row. scipy.sparse matrices should be 

2028 in CSR format to avoid an un-necessary copy. 

2029 

2030 copy : bool, default=None 

2031 Copy the input X or not. 

2032 

2033 Returns 

2034 ------- 

2035 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) 

2036 Transformed array. 

2037 """ 

2038 copy = copy if copy is not None else self.copy 

2039 X = self._validate_data(X, accept_sparse="csr", reset=False) 

2040 return normalize(X, norm=self.norm, axis=1, copy=copy) 

2041 

2042 def _more_tags(self): 

2043 return {"stateless": True, "array_api_support": True} 

2044 

2045 

2046@validate_params( 

2047 { 

2048 "X": ["array-like", "sparse matrix"], 

2049 "threshold": [Interval(Real, None, None, closed="neither")], 

2050 "copy": ["boolean"], 

2051 }, 

2052 prefer_skip_nested_validation=True, 

2053) 

2054def binarize(X, *, threshold=0.0, copy=True): 

2055 """Boolean thresholding of array-like or scipy.sparse matrix. 

2056 

2057 Read more in the :ref:`User Guide <preprocessing_binarization>`. 

2058 

2059 Parameters 

2060 ---------- 

2061 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

2062 The data to binarize, element by element. 

2063 scipy.sparse matrices should be in CSR or CSC format to avoid an 

2064 unnecessary copy. 

2065 

2066 threshold : float, default=0.0 

2067 Feature values below or equal to this are replaced by 0, above it by 1. 

2068 Threshold may not be less than 0 for operations on sparse matrices. 

2069 

2070 copy : bool, default=True 

2071 If False, try to avoid a copy and binarize in place. 

2072 This is not guaranteed to always work in place; e.g. if the data is 

2073 a numpy array with an object dtype, a copy will be returned even with 

2074 copy=False. 

2075 

2076 Returns 

2077 ------- 

2078 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) 

2079 The transformed data. 

2080 

2081 See Also 

2082 -------- 

2083 Binarizer : Performs binarization using the Transformer API 

2084 (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`). 
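    Examples
    --------
    A minimal sketch, reusing the toy matrix from the :class:`Binarizer`
    docstring; values strictly greater than the threshold become 1:

    >>> from sklearn.preprocessing import binarize
    >>> X = [[ 1., -1.,  2.],
    ...      [ 2.,  0.,  0.],
    ...      [ 0.,  1., -1.]]
    >>> binarize(X)
    array([[1., 0., 1.],
           [1., 0., 0.],
           [0., 1., 0.]])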

2085 """ 

2086 X = check_array(X, accept_sparse=["csr", "csc"], copy=copy) 

2087 if sparse.issparse(X): 

2088 if threshold < 0: 

2089 raise ValueError("Cannot binarize a sparse matrix with threshold < 0") 

2090 cond = X.data > threshold 

2091 not_cond = np.logical_not(cond) 

2092 X.data[cond] = 1 

2093 X.data[not_cond] = 0 

2094 X.eliminate_zeros() 

2095 else: 

2096 cond = X > threshold 

2097 not_cond = np.logical_not(cond) 

2098 X[cond] = 1 

2099 X[not_cond] = 0 

2100 return X 

2101 

2102 

2103class Binarizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): 

2104 """Binarize data (set feature values to 0 or 1) according to a threshold. 

2105 

2106 Values greater than the threshold map to 1, while values less than 

2107 or equal to the threshold map to 0. With the default threshold of 0, 

2108 only positive values map to 1. 

2109 

2110 Binarization is a common operation on text count data where the 

2111 analyst may decide, for instance, to consider only the presence or 

2112 absence of a feature rather than the number of occurrences. 

2113 

2114 It can also be used as a pre-processing step for estimators that 

2115 consider boolean random variables (e.g. modelled using the Bernoulli 

2116 distribution in a Bayesian setting). 

2117 

2118 Read more in the :ref:`User Guide <preprocessing_binarization>`. 

2119 

2120 Parameters 

2121 ---------- 

2122 threshold : float, default=0.0 

2123 Feature values below or equal to this are replaced by 0, above it by 1. 

2124 Threshold may not be less than 0 for operations on sparse matrices. 

2125 

2126 copy : bool, default=True 

2127 Set to False to perform inplace binarization and avoid a copy (if 

2128 the input is already a numpy array or a scipy.sparse CSR matrix). 

2129 

2130 Attributes 

2131 ---------- 

2132 n_features_in_ : int 

2133 Number of features seen during :term:`fit`. 

2134 

2135 .. versionadded:: 0.24 

2136 

2137 feature_names_in_ : ndarray of shape (`n_features_in_`,) 

2138 Names of features seen during :term:`fit`. Defined only when `X` 

2139 has feature names that are all strings. 

2140 

2141 .. versionadded:: 1.0 

2142 

2143 See Also 

2144 -------- 

2145 binarize : Equivalent function without the estimator API. 

2146 KBinsDiscretizer : Bin continuous data into intervals. 

2147 OneHotEncoder : Encode categorical features as a one-hot numeric array. 

2148 

2149 Notes 

2150 ----- 

2151 If the input is a sparse matrix, only the non-zero values are subject 

2152 to update by the :class:`Binarizer` class. 

2153 

2154 This estimator is :term:`stateless` and does not need to be fitted. 

2155 However, we recommend calling :meth:`fit_transform` instead of 

2156 :meth:`transform`, as parameter validation is only performed in 

2157 :meth:`fit`. 

2158 

2159 Examples 

2160 -------- 

2161 >>> from sklearn.preprocessing import Binarizer 

2162 >>> X = [[ 1., -1., 2.], 

2163 ... [ 2., 0., 0.], 

2164 ... [ 0., 1., -1.]] 

2165 >>> transformer = Binarizer().fit(X) # fit does nothing. 

2166 >>> transformer 

2167 Binarizer() 

2168 >>> transformer.transform(X) 

2169 array([[1., 0., 1.], 

2170 [1., 0., 0.], 

2171 [0., 1., 0.]]) 

2172 """ 

2173 

2174 _parameter_constraints: dict = { 

2175 "threshold": [Real], 

2176 "copy": ["boolean"], 

2177 } 

2178 

2179 def __init__(self, *, threshold=0.0, copy=True): 

2180 self.threshold = threshold 

2181 self.copy = copy 

2182 

2183 @_fit_context(prefer_skip_nested_validation=True) 

2184 def fit(self, X, y=None): 

2185 """Only validates estimator's parameters. 

2186 

2187 This method exists to (i) validate the estimator's parameters and 

2188 (ii) remain consistent with the scikit-learn transformer API. 

2189 

2190 Parameters 

2191 ---------- 

2192 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

2193 The data. 

2194 

2195 y : None 

2196 Ignored. 

2197 

2198 Returns 

2199 ------- 

2200 self : object 

2201 Fitted transformer. 

2202 """ 

2203 self._validate_data(X, accept_sparse="csr") 

2204 return self 

2205 

2206 def transform(self, X, copy=None): 

2207 """Binarize each element of X. 

2208 

2209 Parameters 

2210 ---------- 

2211 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

2212 The data to binarize, element by element. 

2213 scipy.sparse matrices should be in CSR format to avoid an 

2214 unnecessary copy. 

2215 

2216 copy : bool 

2217 Copy the input X or not. 

2218 

2219 Returns 

2220 ------- 

2221 X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features) 

2222 Transformed array. 

2223 """ 

2224 copy = copy if copy is not None else self.copy 

2225 # TODO: This should be refactored because binarize also calls 

2226 # check_array 

2227 X = self._validate_data(X, accept_sparse=["csr", "csc"], copy=copy, reset=False) 

2228 return binarize(X, threshold=self.threshold, copy=False) 

2229 

2230 def _more_tags(self): 

2231 return {"stateless": True} 

2232 

2233 

2234class KernelCenterer(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator): 

2235 r"""Center an arbitrary kernel matrix :math:`K`. 

2236 

2237 Let us define a kernel :math:`K` such that: 

2238 

2239 .. math:: 

2240 K(X, Y) = \phi(X) . \phi(Y)^{T} 

2241 

2242 :math:`\phi(X)` is a function mapping the rows of :math:`X` to a 

2243 Hilbert space and :math:`K` is of shape `(n_samples, n_samples)`. 

2244 

2245 This class allows computing :math:`\tilde{K}(X, Y)` such that: 

2246 

2247 .. math:: 

2248 \tilde{K}(X, Y) = \tilde{\phi}(X) . \tilde{\phi}(Y)^{T} 

2249 

2250 :math:`\tilde{\phi}(X)` is the centered mapped data in the Hilbert 

2251 space. 

2252 

2253 `KernelCenterer` centers the features without explicitly computing the 

2254 mapping :math:`\phi(\cdot)`. Working with centered kernels is sometimes 

2255 expected when dealing with algebraic computations such as the 

2256 eigendecomposition in :class:`~sklearn.decomposition.KernelPCA`, for instance. 

2257 

2258 Read more in the :ref:`User Guide <kernel_centering>`. 

2259 

2260 Attributes 

2261 ---------- 

2262 K_fit_rows_ : ndarray of shape (n_samples,) 

2263 Average of each column of kernel matrix. 

2264 

2265 K_fit_all_ : float 

2266 Average of kernel matrix. 

2267 

2268 n_features_in_ : int 

2269 Number of features seen during :term:`fit`. 

2270 

2271 .. versionadded:: 0.24 

2272 

2273 feature_names_in_ : ndarray of shape (`n_features_in_`,) 

2274 Names of features seen during :term:`fit`. Defined only when `X` 

2275 has feature names that are all strings. 

2276 

2277 .. versionadded:: 1.0 

2278 

2279 See Also 

2280 -------- 

2281 sklearn.kernel_approximation.Nystroem : Approximate a kernel map 

2282 using a subset of the training data. 

2283 

2284 References 

2285 ---------- 

2286 .. [1] `Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller. 

2287 "Nonlinear component analysis as a kernel eigenvalue problem." 

2288 Neural computation 10.5 (1998): 1299-1319. 

2289 <https://www.mlpack.org/papers/kpca.pdf>`_ 

2290 

2291 Examples 

2292 -------- 

2293 >>> from sklearn.preprocessing import KernelCenterer 

2294 >>> from sklearn.metrics.pairwise import pairwise_kernels 

2295 >>> X = [[ 1., -2., 2.], 

2296 ... [ -2., 1., 3.], 

2297 ... [ 4., 1., -2.]] 

2298 >>> K = pairwise_kernels(X, metric='linear') 

2299 >>> K 

2300 array([[ 9., 2., -2.], 

2301 [ 2., 14., -13.], 

2302 [ -2., -13., 21.]]) 

2303 >>> transformer = KernelCenterer().fit(K) 

2304 >>> transformer 

2305 KernelCenterer() 

2306 >>> transformer.transform(K) 

2307 array([[ 5., 0., -5.], 

2308 [ 0., 14., -14.], 

2309 [ -5., -14., 19.]]) 

2310 """ 

2311 

2312 def __init__(self): 

2313 # Needed for backported inspect.signature compatibility with PyPy 

2314 pass 

2315 

2316 def fit(self, K, y=None): 

2317 """Fit KernelCenterer. 

2318 

2319 Parameters 

2320 ---------- 

2321 K : ndarray of shape (n_samples, n_samples) 

2322 Kernel matrix. 

2323 

2324 y : None 

2325 Ignored. 

2326 

2327 Returns 

2328 ------- 

2329 self : object 

2330 Returns the instance itself. 

2331 """ 

2332 xp, _ = get_namespace(K) 

2333 

2334 K = self._validate_data(K, dtype=_array_api.supported_float_dtypes(xp)) 

2335 

2336 if K.shape[0] != K.shape[1]: 

2337 raise ValueError( 

2338 "Kernel matrix must be a square matrix." 

2339 " Input is a {}x{} matrix.".format(K.shape[0], K.shape[1]) 

2340 ) 

2341 

2342 n_samples = K.shape[0] 
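        # Per-column means of the training kernel and its overall mean; both
        # statistics are reused in `transform` to double-center new kernels.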

2343 self.K_fit_rows_ = xp.sum(K, axis=0) / n_samples 

2344 self.K_fit_all_ = xp.sum(self.K_fit_rows_) / n_samples 

2345 return self 

2346 

2347 def transform(self, K, copy=True): 

2348 """Center kernel matrix. 

2349 

2350 Parameters 

2351 ---------- 

2352 K : ndarray of shape (n_samples1, n_samples2) 

2353 Kernel matrix. 

2354 

2355 copy : bool, default=True 

2356 Set to False to perform inplace computation. 

2357 

2358 Returns 

2359 ------- 

2360 K_new : ndarray of shape (n_samples1, n_samples2) 

2361 The centered kernel matrix. 

2362 """ 

2363 check_is_fitted(self) 

2364 

2365 xp, _ = get_namespace(K) 

2366 

2367 K = self._validate_data( 

2368 K, copy=copy, dtype=_array_api.supported_float_dtypes(xp), reset=False 

2369 ) 

2370 
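        # Double centering: subtract the per-column means of the training
        # kernel (K_fit_rows_), subtract the per-row means of K against the
        # training samples (K_pred_cols), then add back the overall training
        # mean (K_fit_all_). This is equivalent to computing the kernel on
        # mean-centered features without forming the feature map explicitly.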

2371 K_pred_cols = (xp.sum(K, axis=1) / self.K_fit_rows_.shape[0])[:, None] 

2372 

2373 K -= self.K_fit_rows_ 

2374 K -= K_pred_cols 

2375 K += self.K_fit_all_ 

2376 

2377 return K 

2378 

2379 @property 

2380 def _n_features_out(self): 

2381 """Number of transformed output features.""" 

2382 # Used by ClassNamePrefixFeaturesOutMixin. This model preserves the 

2383 # number of input features but this is not a one-to-one mapping in the 

2384 # usual sense. Hence the choice not to use OneToOneFeatureMixin to 

2385 # implement get_feature_names_out for this class. 

2386 return self.n_features_in_ 

2387 

2388 def _more_tags(self): 

2389 return {"pairwise": True, "array_api_support": True} 

2390 

2391 

2392@validate_params( 

2393 { 

2394 "X": ["array-like", "sparse matrix"], 

2395 "value": [Interval(Real, None, None, closed="neither")], 

2396 }, 

2397 prefer_skip_nested_validation=True, 

2398) 

2399def add_dummy_feature(X, value=1.0): 

2400 """Augment dataset with an additional dummy feature. 

2401 

2402 This is useful for fitting an intercept term with implementations which 

2403 cannot otherwise fit it directly. 

2404 

2405 Parameters 

2406 ---------- 

2407 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

2408 Data. 

2409 

2410 value : float 

2411 Value to use for the dummy feature. 

2412 

2413 Returns 

2414 ------- 

2415 X : {ndarray, sparse matrix} of shape (n_samples, n_features + 1) 

2416 Same data with dummy feature added as first column. 

2417 

2418 Examples 

2419 -------- 

2420 >>> from sklearn.preprocessing import add_dummy_feature 

2421 >>> add_dummy_feature([[0, 1], [1, 0]]) 

2422 array([[1., 0., 1.], 

2423 [1., 1., 0.]]) 

2424 """ 

2425 X = check_array(X, accept_sparse=["csc", "csr", "coo"], dtype=FLOAT_DTYPES) 

2426 n_samples, n_features = X.shape 

2427 shape = (n_samples, n_features + 1) 

2428 if sparse.issparse(X): 

2429 if X.format == "coo": 

2430 # Shift columns to the right. 

2431 col = X.col + 1 

2432 # Column indices of dummy feature are 0 everywhere. 

2433 col = np.concatenate((np.zeros(n_samples), col)) 

2434 # Row indices of dummy feature are 0, ..., n_samples-1. 

2435 row = np.concatenate((np.arange(n_samples), X.row)) 

2436 # Prepend the dummy feature n_samples times. 

2437 data = np.concatenate((np.full(n_samples, value), X.data)) 

2438 return sparse.coo_matrix((data, (row, col)), shape) 

2439 elif X.format == "csc": 

2440 # Shift index pointers since we need to add n_samples elements. 

2441 indptr = X.indptr + n_samples 

2442 # indptr[0] must be 0. 

2443 indptr = np.concatenate((np.array([0]), indptr)) 

2444 # Row indices of dummy feature are 0, ..., n_samples-1. 

2445 indices = np.concatenate((np.arange(n_samples), X.indices)) 

2446 # Prepend the dummy feature n_samples times. 

2447 data = np.concatenate((np.full(n_samples, value), X.data)) 

2448 return sparse.csc_matrix((data, indices, indptr), shape) 

2449 else: 

2450 klass = X.__class__ 

2451 return klass(add_dummy_feature(X.tocoo(), value)) 

2452 else: 

2453 return np.hstack((np.full((n_samples, 1), value), X)) 

2454 

2455 

2456class QuantileTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): 

2457 """Transform features using quantiles information. 

2458 

2459 This method transforms the features to follow a uniform or a normal 

2460 distribution. Therefore, for a given feature, this transformation tends 

2461 to spread out the most frequent values. It also reduces the impact of 

2462 (marginal) outliers: this is therefore a robust preprocessing scheme. 

2463 

2464 The transformation is applied on each feature independently. First an 

2465 estimate of the cumulative distribution function of a feature is 

2466 used to map the original values to a uniform distribution. The obtained 

2467 values are then mapped to the desired output distribution using the 

2468 associated quantile function. Feature values of new/unseen data that fall 

2469 below or above the fitted range will be mapped to the bounds of the output 

2470 distribution. Note that this transform is non-linear. It may distort linear 

2471 correlations between variables measured at the same scale but renders 

2472 variables measured at different scales more directly comparable. 

2473 

2474 For example visualizations, refer to :ref:`Compare QuantileTransformer with 

2475 other scalers <plot_all_scaling_quantile_transformer_section>`. 

2476 

2477 Read more in the :ref:`User Guide <preprocessing_transformer>`. 

2478 

2479 .. versionadded:: 0.19 

2480 

2481 Parameters 

2482 ---------- 

2483 n_quantiles : int, default=1000 or n_samples 

2484 Number of quantiles to be computed. It corresponds to the number 

2485 of landmarks used to discretize the cumulative distribution function. 

2486 If n_quantiles is larger than the number of samples, n_quantiles is set 

2487 to the number of samples as a larger number of quantiles does not give 

2488 a better approximation of the cumulative distribution function 

2489 estimator. 

2490 

2491 output_distribution : {'uniform', 'normal'}, default='uniform' 

2492 Marginal distribution for the transformed data. The choices are 

2493 'uniform' (default) or 'normal'. 

2494 

2495 ignore_implicit_zeros : bool, default=False 

2496 Only applies to sparse matrices. If True, the sparse entries of the 

2497 matrix are discarded to compute the quantile statistics. If False, 

2498 these entries are treated as zeros. 

2499 

2500 subsample : int, default=10_000 

2501 Maximum number of samples used to estimate the quantiles for 

2502 computational efficiency. Note that the subsampling procedure may 

2503 differ for value-identical sparse and dense matrices. 

2504 

2505 random_state : int, RandomState instance or None, default=None 

2506 Determines random number generation for subsampling and smoothing 

2507 noise. 

2508 Please see ``subsample`` for more details. 

2509 Pass an int for reproducible results across multiple function calls. 

2510 See :term:`Glossary <random_state>`. 

2511 

2512 copy : bool, default=True 

2513 Set to False to perform inplace transformation and avoid a copy (if the 

2514 input is already a numpy array). 

2515 

2516 Attributes 

2517 ---------- 

2518 n_quantiles_ : int 

2519 The actual number of quantiles used to discretize the cumulative 

2520 distribution function. 

2521 

2522 quantiles_ : ndarray of shape (n_quantiles, n_features) 

2523 The values corresponding to the quantiles of reference. 

2524 

2525 references_ : ndarray of shape (n_quantiles, ) 

2526 The reference quantile levels, evenly spaced between 0 and 1. 

2527 

2528 n_features_in_ : int 

2529 Number of features seen during :term:`fit`. 

2530 

2531 .. versionadded:: 0.24 

2532 

2533 feature_names_in_ : ndarray of shape (`n_features_in_`,) 

2534 Names of features seen during :term:`fit`. Defined only when `X` 

2535 has feature names that are all strings. 

2536 

2537 .. versionadded:: 1.0 

2538 

2539 See Also 

2540 -------- 

2541 quantile_transform : Equivalent function without the estimator API. 

2542 PowerTransformer : Perform mapping to a normal distribution using a power 

2543 transform. 

2544 StandardScaler : Perform standardization that is faster, but less robust 

2545 to outliers. 

2546 RobustScaler : Perform robust standardization that removes the influence 

2547 of outliers but does not put outliers and inliers on the same scale. 

2548 

2549 Notes 

2550 ----- 

2551 NaNs are treated as missing values: disregarded in fit, and maintained in 

2552 transform. 

2553 

2554 Examples 

2555 -------- 

2556 >>> import numpy as np 

2557 >>> from sklearn.preprocessing import QuantileTransformer 

2558 >>> rng = np.random.RandomState(0) 

2559 >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0) 

2560 >>> qt = QuantileTransformer(n_quantiles=10, random_state=0) 

2561 >>> qt.fit_transform(X) 

2562 array([...]) 

2563 """ 

2564 

2565 _parameter_constraints: dict = { 

2566 "n_quantiles": [Interval(Integral, 1, None, closed="left")], 

2567 "output_distribution": [StrOptions({"uniform", "normal"})], 

2568 "ignore_implicit_zeros": ["boolean"], 

2569 "subsample": [Interval(Integral, 1, None, closed="left")], 

2570 "random_state": ["random_state"], 

2571 "copy": ["boolean"], 

2572 } 

2573 

2574 def __init__( 

2575 self, 

2576 *, 

2577 n_quantiles=1000, 

2578 output_distribution="uniform", 

2579 ignore_implicit_zeros=False, 

2580 subsample=10_000, 

2581 random_state=None, 

2582 copy=True, 

2583 ): 

2584 self.n_quantiles = n_quantiles 

2585 self.output_distribution = output_distribution 

2586 self.ignore_implicit_zeros = ignore_implicit_zeros 

2587 self.subsample = subsample 

2588 self.random_state = random_state 

2589 self.copy = copy 

2590 

2591 def _dense_fit(self, X, random_state): 

2592 """Compute percentiles for dense matrices. 

2593 

2594 Parameters 

2595 ---------- 

2596 X : ndarray of shape (n_samples, n_features) 

2597 The data used to scale along the features axis. 

2598 """ 

2599 if self.ignore_implicit_zeros: 

2600 warnings.warn( 

2601 "'ignore_implicit_zeros' takes effect only with" 

2602 " sparse matrix. This parameter has no effect." 

2603 ) 

2604 

2605 n_samples, n_features = X.shape 

2606 references = self.references_ * 100 

2607 

2608 self.quantiles_ = [] 

2609 for col in X.T: 

2610 if self.subsample < n_samples: 

2611 subsample_idx = random_state.choice( 

2612 n_samples, size=self.subsample, replace=False 

2613 ) 

2614 col = col.take(subsample_idx, mode="clip") 

2615 self.quantiles_.append(np.nanpercentile(col, references)) 

2616 self.quantiles_ = np.transpose(self.quantiles_) 

2617 # Due to floating-point precision error in `np.nanpercentile`, 

2618 # make sure that quantiles are monotonically increasing. 

2619 # Upstream issue in numpy: 

2620 # https://github.com/numpy/numpy/issues/14685 

2621 self.quantiles_ = np.maximum.accumulate(self.quantiles_) 

2622 

2623 def _sparse_fit(self, X, random_state): 

2624 """Compute percentiles for sparse matrices. 

2625 

2626 Parameters 

2627 ---------- 

2628 X : sparse matrix of shape (n_samples, n_features) 

2629 The data used to scale along the features axis. The sparse matrix 

2630 needs to be nonnegative. If a sparse matrix is provided, 

2631 it will be converted into a sparse ``csc_matrix``. 

2632 """ 

2633 n_samples, n_features = X.shape 

2634 references = self.references_ * 100 

2635 

2636 self.quantiles_ = [] 

2637 for feature_idx in range(n_features): 

2638 column_nnz_data = X.data[X.indptr[feature_idx] : X.indptr[feature_idx + 1]] 

2639 if len(column_nnz_data) > self.subsample: 
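                # Subsample the stored (non-zero) values in proportion to
                # their share of the full column: a dense subsample of
                # `subsample` rows would be expected to contain roughly
                # subsample * nnz / n_samples non-zeros, the remaining
                # entries being implicit zeros.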

2640 column_subsample = self.subsample * len(column_nnz_data) // n_samples 

2641 if self.ignore_implicit_zeros: 

2642 column_data = np.zeros(shape=column_subsample, dtype=X.dtype) 

2643 else: 

2644 column_data = np.zeros(shape=self.subsample, dtype=X.dtype) 

2645 column_data[:column_subsample] = random_state.choice( 

2646 column_nnz_data, size=column_subsample, replace=False 

2647 ) 

2648 else: 

2649 if self.ignore_implicit_zeros: 

2650 column_data = np.zeros(shape=len(column_nnz_data), dtype=X.dtype) 

2651 else: 

2652 column_data = np.zeros(shape=n_samples, dtype=X.dtype) 

2653 column_data[: len(column_nnz_data)] = column_nnz_data 

2654 

2655 if not column_data.size: 

2656 # if no nnz, an error will be raised for computing the 

2657 # quantiles. Force the quantiles to be zeros. 

2658 self.quantiles_.append([0] * len(references)) 

2659 else: 

2660 self.quantiles_.append(np.nanpercentile(column_data, references)) 

2661 self.quantiles_ = np.transpose(self.quantiles_) 

2662 # due to floating-point precision error in `np.nanpercentile`, 

2663 # make sure the quantiles are monotonically increasing 

2664 # Upstream issue in numpy: 

2665 # https://github.com/numpy/numpy/issues/14685 

2666 self.quantiles_ = np.maximum.accumulate(self.quantiles_) 

2667 

2668 @_fit_context(prefer_skip_nested_validation=True) 

2669 def fit(self, X, y=None): 

2670 """Compute the quantiles used for transforming. 

2671 

2672 Parameters 

2673 ---------- 

2674 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

2675 The data used to scale along the features axis. If a sparse 

2676 matrix is provided, it will be converted into a sparse 

2677 ``csc_matrix``. Additionally, the sparse matrix needs to be 

2678 nonnegative if `ignore_implicit_zeros` is False. 

2679 

2680 y : None 

2681 Ignored. 

2682 

2683 Returns 

2684 ------- 

2685 self : object 

2686 Fitted transformer. 

2687 """ 

2688 if self.n_quantiles > self.subsample: 

2689 raise ValueError( 

2690 "The number of quantiles cannot be greater than" 

2691 " the number of samples used. Got {} quantiles" 

2692 " and {} samples.".format(self.n_quantiles, self.subsample) 

2693 ) 

2694 

2695 X = self._check_inputs(X, in_fit=True, copy=False) 

2696 n_samples = X.shape[0] 

2697 

2698 if self.n_quantiles > n_samples: 

2699 warnings.warn( 

2700 "n_quantiles (%s) is greater than the total number " 

2701 "of samples (%s). n_quantiles is set to " 

2702 "n_samples." % (self.n_quantiles, n_samples) 

2703 ) 

2704 self.n_quantiles_ = max(1, min(self.n_quantiles, n_samples)) 

2705 

2706 rng = check_random_state(self.random_state) 

2707 

2708 # Create the quantiles of reference 

2709 self.references_ = np.linspace(0, 1, self.n_quantiles_, endpoint=True) 

2710 if sparse.issparse(X): 

2711 self._sparse_fit(X, rng) 

2712 else: 

2713 self._dense_fit(X, rng) 

2714 

2715 return self 

2716 

2717 def _transform_col(self, X_col, quantiles, inverse): 

2718 """Private function to transform a single feature.""" 

2719 

2720 output_distribution = self.output_distribution 

2721 

2722 if not inverse: 

2723 lower_bound_x = quantiles[0] 

2724 upper_bound_x = quantiles[-1] 

2725 lower_bound_y = 0 

2726 upper_bound_y = 1 

2727 else: 

2728 lower_bound_x = 0 

2729 upper_bound_x = 1 

2730 lower_bound_y = quantiles[0] 

2731 upper_bound_y = quantiles[-1] 

2732 # for inverse transform, match a uniform distribution 

2733 with np.errstate(invalid="ignore"): # hide NaN comparison warnings 

2734 if output_distribution == "normal": 

2735 X_col = stats.norm.cdf(X_col) 

2736 # else output distribution is already a uniform distribution 

2737 

2738 # find index for lower and higher bounds 

2739 with np.errstate(invalid="ignore"): # hide NaN comparison warnings 

2740 if output_distribution == "normal": 

2741 lower_bounds_idx = X_col - BOUNDS_THRESHOLD < lower_bound_x 

2742 upper_bounds_idx = X_col + BOUNDS_THRESHOLD > upper_bound_x 

2743 if output_distribution == "uniform": 

2744 lower_bounds_idx = X_col == lower_bound_x 

2745 upper_bounds_idx = X_col == upper_bound_x 

2746 

2747 isfinite_mask = ~np.isnan(X_col) 

2748 X_col_finite = X_col[isfinite_mask] 

2749 if not inverse: 

2750 # Interpolate in one direction and in the other and take the 

2751 # mean. This is in case of repeated values in the features 

2752 # and hence repeated quantiles 

2753 # 

2754 # If we don't do this, only one extreme of the duplicated is 

2755 # used (the upper when we do ascending, and the 

2756 # lower for descending). We take the mean of these two 

2757 X_col[isfinite_mask] = 0.5 * ( 

2758 np.interp(X_col_finite, quantiles, self.references_) 

2759 - np.interp(-X_col_finite, -quantiles[::-1], -self.references_[::-1]) 

2760 ) 

2761 else: 

2762 X_col[isfinite_mask] = np.interp(X_col_finite, self.references_, quantiles) 

2763 

2764 X_col[upper_bounds_idx] = upper_bound_y 

2765 X_col[lower_bounds_idx] = lower_bound_y 

2766 # for forward transform, match the output distribution 

2767 if not inverse: 

2768 with np.errstate(invalid="ignore"): # hide NaN comparison warnings 

2769 if output_distribution == "normal": 

2770 X_col = stats.norm.ppf(X_col) 

2771 # find the value to clip the data to avoid mapping to 

2772 # infinity. Clip such that the inverse transform will be 

2773 # consistent 

2774 clip_min = stats.norm.ppf(BOUNDS_THRESHOLD - np.spacing(1)) 

2775 clip_max = stats.norm.ppf(1 - (BOUNDS_THRESHOLD - np.spacing(1))) 

2776 X_col = np.clip(X_col, clip_min, clip_max) 

2777 # else output distribution is uniform and the ppf is the 

2778 # identity function so we let X_col unchanged 

2779 

2780 return X_col 

2781 

2782 def _check_inputs(self, X, in_fit, accept_sparse_negative=False, copy=False): 

2783 """Check inputs before fit and transform.""" 

2784 X = self._validate_data( 

2785 X, 

2786 reset=in_fit, 

2787 accept_sparse="csc", 

2788 copy=copy, 

2789 dtype=FLOAT_DTYPES, 

2790 force_all_finite="allow-nan", 

2791 ) 

2792 # We only accept non-negative sparse matrices when ignore_implicit_zeros is 

2793 # False and we are in fit or transform (inverse_transform relaxes this). 

2794 with np.errstate(invalid="ignore"): # hide NaN comparison warnings 

2795 if ( 

2796 not accept_sparse_negative 

2797 and not self.ignore_implicit_zeros 

2798 and (sparse.issparse(X) and np.any(X.data < 0)) 

2799 ): 

2800 raise ValueError( 

2801 "QuantileTransformer only accepts non-negative sparse matrices." 

2802 ) 

2803 

2804 return X 

2805 

2806 def _transform(self, X, inverse=False): 

2807 """Forward and inverse transform. 

2808 

2809 Parameters 

2810 ---------- 

2811 X : ndarray of shape (n_samples, n_features) 

2812 The data used to scale along the features axis. 

2813 

2814 inverse : bool, default=False 

2815 If False, apply forward transform. If True, apply 

2816 inverse transform. 

2817 

2818 Returns 

2819 ------- 

2820 X : ndarray of shape (n_samples, n_features) 

2821 Projected data. 

2822 """ 

2823 if sparse.issparse(X): 
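            # For CSC input, only the explicitly stored values of each column
            # are transformed: X.data[X.indptr[j]:X.indptr[j + 1]] holds
            # column j's non-zero entries.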

2824 for feature_idx in range(X.shape[1]): 

2825 column_slice = slice(X.indptr[feature_idx], X.indptr[feature_idx + 1]) 

2826 X.data[column_slice] = self._transform_col( 

2827 X.data[column_slice], self.quantiles_[:, feature_idx], inverse 

2828 ) 

2829 else: 

2830 for feature_idx in range(X.shape[1]): 

2831 X[:, feature_idx] = self._transform_col( 

2832 X[:, feature_idx], self.quantiles_[:, feature_idx], inverse 

2833 ) 

2834 

2835 return X 

2836 

2837 def transform(self, X): 

2838 """Feature-wise transformation of the data. 

2839 

2840 Parameters 

2841 ---------- 

2842 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

2843 The data used to scale along the features axis. If a sparse 

2844 matrix is provided, it will be converted into a sparse 

2845 ``csc_matrix``. Additionally, the sparse matrix needs to be 

2846 nonnegative if `ignore_implicit_zeros` is False. 

2847 

2848 Returns 

2849 ------- 

2850 Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) 

2851 The projected data. 

2852 """ 

2853 check_is_fitted(self) 

2854 X = self._check_inputs(X, in_fit=False, copy=self.copy) 

2855 

2856 return self._transform(X, inverse=False) 

2857 

2858 def inverse_transform(self, X): 

2859 """Back-projection to the original space. 

2860 

2861 Parameters 

2862 ---------- 

2863 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

2864 The data used to scale along the features axis. If a sparse 

2865 matrix is provided, it will be converted into a sparse 

2866 ``csc_matrix``. Additionally, the sparse matrix needs to be 

2867 nonnegative if `ignore_implicit_zeros` is False. 

2868 

2869 Returns 

2870 ------- 

2871 Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) 

2872 The projected data. 

2873 """ 

2874 check_is_fitted(self) 

2875 X = self._check_inputs( 

2876 X, in_fit=False, accept_sparse_negative=True, copy=self.copy 

2877 ) 

2878 

2879 return self._transform(X, inverse=True) 

2880 

2881 def _more_tags(self): 

2882 return {"allow_nan": True} 

2883 

2884 

2885@validate_params( 

2886 {"X": ["array-like", "sparse matrix"], "axis": [Options(Integral, {0, 1})]}, 

2887 prefer_skip_nested_validation=False, 

2888) 

2889def quantile_transform( 

2890 X, 

2891 *, 

2892 axis=0, 

2893 n_quantiles=1000, 

2894 output_distribution="uniform", 

2895 ignore_implicit_zeros=False, 

2896 subsample=int(1e5), 

2897 random_state=None, 

2898 copy=True, 

2899): 

2900 """Transform features using quantiles information. 

2901 

2902 This method transforms the features to follow a uniform or a normal 

2903 distribution. Therefore, for a given feature, this transformation tends 

2904 to spread out the most frequent values. It also reduces the impact of 

2905 (marginal) outliers: this is therefore a robust preprocessing scheme. 

2906 

2907 The transformation is applied on each feature independently. First an 

2908 estimate of the cumulative distribution function of a feature is 

2909 used to map the original values to a uniform distribution. The obtained 

2910 values are then mapped to the desired output distribution using the 

2911 associated quantile function. Feature values of new/unseen data that fall 

2912 below or above the fitted range will be mapped to the bounds of the output 

2913 distribution. Note that this transform is non-linear. It may distort linear 

2914 correlations between variables measured at the same scale but renders 

2915 variables measured at different scales more directly comparable. 

2916 

2917 Read more in the :ref:`User Guide <preprocessing_transformer>`. 

2918 

2919 Parameters 

2920 ---------- 

2921 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

2922 The data to transform. 

2923 

2924 axis : int, default=0 

2925 Axis used to compute the quantiles along. If 0, 

2926 transform each feature, otherwise (if 1) transform each sample. 

2927 

2928 n_quantiles : int, default=1000 or n_samples 

2929 Number of quantiles to be computed. It corresponds to the number 

2930 of landmarks used to discretize the cumulative distribution function. 

2931 If n_quantiles is larger than the number of samples, n_quantiles is set 

2932 to the number of samples as a larger number of quantiles does not give 

2933 a better approximation of the cumulative distribution function 

2934 estimator. 

2935 

2936 output_distribution : {'uniform', 'normal'}, default='uniform' 

2937 Marginal distribution for the transformed data. The choices are 

2938 'uniform' (default) or 'normal'. 

2939 

2940 ignore_implicit_zeros : bool, default=False 

2941 Only applies to sparse matrices. If True, the sparse entries of the 

2942 matrix are discarded to compute the quantile statistics. If False, 

2943 these entries are treated as zeros. 

2944 

2945 subsample : int, default=1e5 

2946 Maximum number of samples used to estimate the quantiles for 

2947 computational efficiency. Note that the subsampling procedure may 

2948 differ for value-identical sparse and dense matrices. 

2949 

2950 random_state : int, RandomState instance or None, default=None 

2951 Determines random number generation for subsampling and smoothing 

2952 noise. 

2953 Please see ``subsample`` for more details. 

2954 Pass an int for reproducible results across multiple function calls. 

2955 See :term:`Glossary <random_state>`. 

2956 

2957 copy : bool, default=True 

2958 If False, try to avoid a copy and transform in place. 

2959 This is not guaranteed to always work in place; e.g. if the data is 

2960 a numpy array with an int dtype, a copy will be returned even with 

2961 copy=False. 

2962 

2963 .. versionchanged:: 0.23 

2964 The default value of `copy` changed from False to True in 0.23. 

2965 

2966 Returns 

2967 ------- 

2968 Xt : {ndarray, sparse matrix} of shape (n_samples, n_features) 

2969 The transformed data. 

2970 

2971 See Also 

2972 -------- 

2973 QuantileTransformer : Performs quantile-based scaling using the 

2974 Transformer API (e.g. as part of a preprocessing 

2975 :class:`~sklearn.pipeline.Pipeline`). 

2976 power_transform : Maps data to a normal distribution using a 

2977 power transformation. 

2978 scale : Performs standardization that is faster, but less robust 

2979 to outliers. 

2980 robust_scale : Performs robust standardization that removes the influence 

2981 of outliers but does not put outliers and inliers on the same scale. 

2982 

2983 Notes 

2984 ----- 

2985 NaNs are treated as missing values: disregarded in fit, and maintained in 

2986 transform. 

2987 

2988 .. warning:: Risk of data leak 

2989 

2990 Do not use :func:`~sklearn.preprocessing.quantile_transform` unless 

2991 you know what you are doing. A common mistake is to apply it 

2992 to the entire data *before* splitting into training and 

2993 test sets. This will bias the model evaluation because 

2994 information would have leaked from the test set to the 

2995 training set. 

2996 In general, we recommend using 

2997 :class:`~sklearn.preprocessing.QuantileTransformer` within a 

2998 :ref:`Pipeline <pipeline>` in order to prevent most risks of data 

2999 leaking: `pipe = make_pipeline(QuantileTransformer(), 

3000 LogisticRegression())`. 

3001 

3002 For a comparison of the different scalers, transformers, and normalizers, 

3003 see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. 

3004 

3005 Examples 

3006 -------- 

3007 >>> import numpy as np 

3008 >>> from sklearn.preprocessing import quantile_transform 

3009 >>> rng = np.random.RandomState(0) 

3010 >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0) 

3011 >>> quantile_transform(X, n_quantiles=10, random_state=0, copy=True) 

3012 array([...]) 

3013 """ 

3014 n = QuantileTransformer( 

3015 n_quantiles=n_quantiles, 

3016 output_distribution=output_distribution, 

3017 subsample=subsample, 

3018 ignore_implicit_zeros=ignore_implicit_zeros, 

3019 random_state=random_state, 

3020 copy=copy, 

3021 ) 

3022 if axis == 0: 

3023 X = n.fit_transform(X) 

3024 else: # axis == 1 

3025 X = n.fit_transform(X.T).T 

3026 return X 

3027 

3028 

3029class PowerTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator): 

3030 """Apply a power transform featurewise to make data more Gaussian-like. 

3031 

3032 Power transforms are a family of parametric, monotonic transformations 

3033 that are applied to make data more Gaussian-like. This is useful for 

3034 modeling issues related to heteroscedasticity (non-constant variance), 

3035 or other situations where normality is desired. 

3036 

3037 Currently, PowerTransformer supports the Box-Cox transform and the 

3038 Yeo-Johnson transform. The optimal parameter for stabilizing variance and 

3039 minimizing skewness is estimated through maximum likelihood. 

3040 

3041 Box-Cox requires input data to be strictly positive, while Yeo-Johnson 

3042 supports both positive and negative data. 

3043 

3044 By default, zero-mean, unit-variance normalization is applied to the 

3045 transformed data. 

3046 

3047 For an example visualization, refer to :ref:`Compare PowerTransformer with 

3048 other scalers <plot_all_scaling_power_transformer_section>`. To see the 

3049 effect of Box-Cox and Yeo-Johnson transformations on different 

3050 distributions, see: 

3051 :ref:`sphx_glr_auto_examples_preprocessing_plot_map_data_to_normal.py`. 

3052 

3053 Read more in the :ref:`User Guide <preprocessing_transformer>`. 

3054 

3055 .. versionadded:: 0.20 

3056 

3057 Parameters 

3058 ---------- 

3059 method : {'yeo-johnson', 'box-cox'}, default='yeo-johnson' 

3060 The power transform method. Available methods are: 

3061 

3062 - 'yeo-johnson' [1]_, works with positive and negative values 

3063 - 'box-cox' [2]_, only works with strictly positive values 

3064 

3065 standardize : bool, default=True 

3066 Set to True to apply zero-mean, unit-variance normalization to the 

3067 transformed output. 

3068 

3069 copy : bool, default=True 

3070 Set to False to perform inplace computation during transformation. 

3071 

3072 Attributes 

3073 ---------- 

3074 lambdas_ : ndarray of float of shape (n_features,) 

3075 The parameters of the power transformation for the selected features. 

3076 

3077 n_features_in_ : int 

3078 Number of features seen during :term:`fit`. 

3079 

3080 .. versionadded:: 0.24 

3081 

3082 feature_names_in_ : ndarray of shape (`n_features_in_`,) 

3083 Names of features seen during :term:`fit`. Defined only when `X` 

3084 has feature names that are all strings. 

3085 

3086 .. versionadded:: 1.0 

3087 

3088 See Also 

3089 -------- 

3090 power_transform : Equivalent function without the estimator API. 

3091 

3092 QuantileTransformer : Maps data to a standard normal distribution with 

3093 the parameter `output_distribution='normal'`. 

3094 

3095 Notes 

3096 ----- 

3097 NaNs are treated as missing values: disregarded in ``fit``, and maintained 

3098 in ``transform``. 

3099 

3100 References 

3101 ---------- 

3102 

3103 .. [1] :doi:`I.K. Yeo and R.A. Johnson, "A new family of power 

3104 transformations to improve normality or symmetry." Biometrika, 

3105 87(4), pp.954-959, (2000). <10.1093/biomet/87.4.954>` 

3106 

3107 .. [2] :doi:`G.E.P. Box and D.R. Cox, "An Analysis of Transformations", 

3108 Journal of the Royal Statistical Society B, 26, 211-252 (1964). 

3109 <10.1111/j.2517-6161.1964.tb00553.x>` 

3110 

3111 Examples 

3112 -------- 

3113 >>> import numpy as np 

3114 >>> from sklearn.preprocessing import PowerTransformer 

3115 >>> pt = PowerTransformer() 

3116 >>> data = [[1, 2], [3, 2], [4, 5]] 

3117 >>> print(pt.fit(data)) 

3118 PowerTransformer() 

3119 >>> print(pt.lambdas_) 

3120 [ 1.386... -3.100...] 

3121 >>> print(pt.transform(data)) 

3122 [[-1.316... -0.707...] 

3123 [ 0.209... -0.707...] 

3124 [ 1.106... 1.414...]] 

3125 """ 

3126 

3127 _parameter_constraints: dict = { 

3128 "method": [StrOptions({"yeo-johnson", "box-cox"})], 

3129 "standardize": ["boolean"], 

3130 "copy": ["boolean"], 

3131 } 

3132 

3133 def __init__(self, method="yeo-johnson", *, standardize=True, copy=True): 

3134 self.method = method 

3135 self.standardize = standardize 

3136 self.copy = copy 

3137 

3138 @_fit_context(prefer_skip_nested_validation=True) 

3139 def fit(self, X, y=None): 

3140 """Estimate the optimal parameter lambda for each feature. 

3141 

3142 The optimal lambda parameter for minimizing skewness is estimated on 

3143 each feature independently using maximum likelihood. 

3144 

3145 Parameters 

3146 ---------- 

3147 X : array-like of shape (n_samples, n_features) 

3148 The data used to estimate the optimal transformation parameters. 

3149 

3150 y : None 

3151 Ignored. 

3152 

3153 Returns 

3154 ------- 

3155 self : object 

3156 Fitted transformer. 

3157 """ 

3158 self._fit(X, y=y, force_transform=False) 

3159 return self 

3160 

3161 @_fit_context(prefer_skip_nested_validation=True) 

3162 def fit_transform(self, X, y=None): 

3163 """Fit `PowerTransformer` to `X`, then transform `X`. 

3164 

3165 Parameters 

3166 ---------- 

3167 X : array-like of shape (n_samples, n_features) 

3168 The data used to estimate the optimal transformation parameters 

3169 and to be transformed using a power transformation. 

3170 

3171 y : Ignored 

3172 Not used, present for API consistency by convention. 

3173 

3174 Returns 

3175 ------- 

3176 X_new : ndarray of shape (n_samples, n_features) 

3177 Transformed data. 

3178 """ 

3179 return self._fit(X, y, force_transform=True) 

3180 

3181 def _fit(self, X, y=None, force_transform=False): 

3182 X = self._check_input(X, in_fit=True, check_positive=True) 

3183 

3184 if not self.copy and not force_transform: # if call from fit() 

3185 X = X.copy() # force copy so that fit does not change X inplace 

3186 

3187 n_samples = X.shape[0] 

3188 mean = np.mean(X, axis=0, dtype=np.float64) 

3189 var = np.var(X, axis=0, dtype=np.float64) 

3190 

3191 optim_function = { 

3192 "box-cox": self._box_cox_optimize, 

3193 "yeo-johnson": self._yeo_johnson_optimize, 

3194 }[self.method] 

3195 

3196 transform_function = { 

3197 "box-cox": boxcox, 

3198 "yeo-johnson": self._yeo_johnson_transform, 

3199 }[self.method] 

3200 

3201 with np.errstate(invalid="ignore"): # hide NaN warnings 

3202 self.lambdas_ = np.empty(X.shape[1], dtype=X.dtype) 

3203 for i, col in enumerate(X.T): 

3204 # For yeo-johnson, leave constant features unchanged 

3205 # lambda=1 corresponds to the identity transformation 

3206 is_constant_feature = _is_constant_feature(var[i], mean[i], n_samples) 

3207 if self.method == "yeo-johnson" and is_constant_feature: 

3208 self.lambdas_[i] = 1.0 

3209 continue 

3210 

3211 self.lambdas_[i] = optim_function(col) 

3212 
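                # Even when called from plain `fit`, the columns are
                # power-transformed whenever standardize=True, because the
                # internal StandardScaler below must be fitted on the
                # transformed data.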

3213 if self.standardize or force_transform: 

3214 X[:, i] = transform_function(X[:, i], self.lambdas_[i]) 

3215 

3216 if self.standardize: 

3217 self._scaler = StandardScaler(copy=False).set_output(transform="default") 

3218 if force_transform: 

3219 X = self._scaler.fit_transform(X) 

3220 else: 

3221 self._scaler.fit(X) 

3222 

3223 return X 

3224 

3225 def transform(self, X): 

3226 """Apply the power transform to each feature using the fitted lambdas. 

3227 

3228 Parameters 

3229 ---------- 

3230 X : array-like of shape (n_samples, n_features) 

3231 The data to be transformed using a power transformation. 

3232 

3233 Returns 

3234 ------- 

3235 X_trans : ndarray of shape (n_samples, n_features) 

3236 The transformed data. 

3237 """ 

3238 check_is_fitted(self) 

3239 X = self._check_input(X, in_fit=False, check_positive=True, check_shape=True) 

3240 

3241 transform_function = { 

3242 "box-cox": boxcox, 

3243 "yeo-johnson": self._yeo_johnson_transform, 

3244 }[self.method] 

3245 for i, lmbda in enumerate(self.lambdas_): 

3246 with np.errstate(invalid="ignore"): # hide NaN warnings 

3247 X[:, i] = transform_function(X[:, i], lmbda) 

3248 

3249 if self.standardize: 

3250 X = self._scaler.transform(X) 

3251 

3252 return X 

3253 

3254 def inverse_transform(self, X): 

3255 """Apply the inverse power transformation using the fitted lambdas. 

3256 

3257 The inverse of the Box-Cox transformation is given by:: 

3258 

3259 if lambda_ == 0: 

3260 X = exp(X_trans) 

3261 else: 

3262 X = (X_trans * lambda_ + 1) ** (1 / lambda_) 

3263 

3264 The inverse of the Yeo-Johnson transformation is given by:: 

3265 

3266 if X >= 0 and lambda_ == 0: 

3267 X = exp(X_trans) - 1 

3268 elif X >= 0 and lambda_ != 0: 

3269 X = (X_trans * lambda_ + 1) ** (1 / lambda_) - 1 

3270 elif X < 0 and lambda_ != 2: 

3271 X = 1 - (-(2 - lambda_) * X_trans + 1) ** (1 / (2 - lambda_)) 

3272 elif X < 0 and lambda_ == 2: 

3273 X = 1 - exp(-X_trans) 

3274 

3275 Parameters 

3276 ---------- 

3277 X : array-like of shape (n_samples, n_features) 

3278 The transformed data. 

3279 

3280 Returns 

3281 ------- 

3282 X : ndarray of shape (n_samples, n_features) 

3283 The original data. 

3284 """ 

3285 check_is_fitted(self) 

3286 X = self._check_input(X, in_fit=False, check_shape=True) 

3287 

3288 if self.standardize: 

3289 X = self._scaler.inverse_transform(X) 

3290 

3291 inv_fun = { 

3292 "box-cox": self._box_cox_inverse_tranform, 

3293 "yeo-johnson": self._yeo_johnson_inverse_transform, 

3294 }[self.method] 

3295 for i, lmbda in enumerate(self.lambdas_): 

3296 with np.errstate(invalid="ignore"): # hide NaN warnings 

3297 X[:, i] = inv_fun(X[:, i], lmbda) 

3298 

3299 return X 

3300 
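# Illustrative sketch (not part of the library source): transform and
# inverse_transform are exact inverses, so a fitted transformer should recover
# the original data up to floating point error:
#
# >>> import numpy as np
# >>> from sklearn.preprocessing import PowerTransformer
# >>> X = np.array([[1.0, 2.0], [3.0, 2.0], [4.0, 5.0]])
# >>> pt = PowerTransformer(method="box-cox")
# >>> X_trans = pt.fit_transform(X)
# >>> bool(np.allclose(pt.inverse_transform(X_trans), X))
# True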

3301 def _box_cox_inverse_transform(self, x, lmbda): 

3302 """Return inverse-transformed input x following Box-Cox inverse 

3303 transform with parameter lambda. 

3304 """ 

3305 if lmbda == 0: 

3306 x_inv = np.exp(x) 

3307 else: 

3308 x_inv = (x * lmbda + 1) ** (1 / lmbda) 

3309 

3310 return x_inv 

3311 

3312 def _yeo_johnson_inverse_transform(self, x, lmbda): 

3313 """Return inverse-transformed input x following Yeo-Johnson inverse 

3314 transform with parameter lambda. 

3315 """ 

3316 x_inv = np.zeros_like(x) 

3317 pos = x >= 0 

3318 

3319 # when x >= 0 

3320 if abs(lmbda) < np.spacing(1.0): 

3321 x_inv[pos] = np.exp(x[pos]) - 1 

3322 else: # lmbda != 0 

3323 x_inv[pos] = np.power(x[pos] * lmbda + 1, 1 / lmbda) - 1 

3324 

3325 # when x < 0 

3326 if abs(lmbda - 2) > np.spacing(1.0): 

3327 x_inv[~pos] = 1 - np.power(-(2 - lmbda) * x[~pos] + 1, 1 / (2 - lmbda)) 

3328 else: # lmbda == 2 

3329 x_inv[~pos] = 1 - np.exp(-x[~pos]) 

3330 

3331 return x_inv 

3332 

3333 def _yeo_johnson_transform(self, x, lmbda): 

3334 """Return transformed input x following Yeo-Johnson transform with 

3335 parameter lambda. 

3336 """ 

3337 

3338 out = np.zeros_like(x) 

3339 pos = x >= 0 # binary mask 

3340 

3341 # when x >= 0 

3342 if abs(lmbda) < np.spacing(1.0): 

3343 out[pos] = np.log1p(x[pos]) 

3344 else: # lmbda != 0 

3345 out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda 

3346 

3347 # when x < 0 

3348 if abs(lmbda - 2) > np.spacing(1.0): 

3349 out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda) 

3350 else: # lmbda == 2 

3351 out[~pos] = -np.log1p(-x[~pos]) 

3352 

3353 return out 

3354 
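# Illustrative sketch (not part of the library source): the piecewise formula
# above reduces to familiar special cases, which makes a quick sanity check
# possible. With lmbda = 1 the positive branch is ((x + 1)**1 - 1) / 1 = x
# (identity); with lmbda = 0 it is log1p; with lmbda = 2 the negative branch
# is -log1p(-x):
#
# >>> import numpy as np
# >>> np.log1p(np.array([0.0, np.e - 1.0]))      # positive branch, lmbda == 0
# array([0., 1.])
# >>> -np.log1p(-np.array([-1.0]))               # negative branch, lmbda == 2
# array([-0.6931...])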

3355 def _box_cox_optimize(self, x): 

3356 """Find and return optimal lambda parameter of the Box-Cox transform by 

3357 MLE, for observed data x. 

3358 

3359 Here we use the scipy builtin ``stats.boxcox``, which uses Brent's optimizer. 

3360 """ 

3361 mask = np.isnan(x) 

3362 if np.all(mask): 

3363 raise ValueError("Column must not be all nan.") 

3364 

3365 # the computation of lambda is influenced by NaNs so we need to 

3366 # get rid of them 

3367 _, lmbda = stats.boxcox(x[~mask], lmbda=None) 

3368 

3369 return lmbda 

3370 
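# Illustrative sketch (not part of the library source): with ``lmbda=None``,
# the scipy call used above returns both the transformed data and the MLE of
# lambda, e.g. on strictly positive (here lognormal) samples:
#
# >>> import numpy as np
# >>> from scipy import stats
# >>> rng = np.random.RandomState(0)
# >>> x = rng.lognormal(size=100)
# >>> _, lmbda = stats.boxcox(x, lmbda=None)
# >>> # lmbda is a plain float; for lognormal data it tends to lie close to 0,
# >>> # since a plain log transform already makes such data roughly Gaussian.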

3371 def _yeo_johnson_optimize(self, x): 

3372 """Find and return optimal lambda parameter of the Yeo-Johnson 

3373 transform by MLE, for observed data x. 

3374 

3375 As for Box-Cox, the MLE is computed via Brent's optimizer. 

3376 """ 

3377 x_tiny = np.finfo(np.float64).tiny 

3378 

3379 def _neg_log_likelihood(lmbda): 

3380 """Return the negative log likelihood of the observed data x as a 

3381 function of lambda.""" 

3382 x_trans = self._yeo_johnson_transform(x, lmbda) 

3383 n_samples = x.shape[0] 

3384 x_trans_var = x_trans.var() 

3385 

3386 # Reject transformed data that would raise a RuntimeWarning in np.log 

3387 if x_trans_var < x_tiny: 

3388 return np.inf 

3389 

3390 log_var = np.log(x_trans_var) 

3391 loglike = -n_samples / 2 * log_var 

3392 loglike += (lmbda - 1) * (np.sign(x) * np.log1p(np.abs(x))).sum() 

3393 

3394 return -loglike 

3395 

3396 # the computation of lambda is influenced by NaNs so we need to 

3397 # get rid of them 

3398 x = x[~np.isnan(x)] 

3399 # choosing bracket -2, 2 like for boxcox 

3400 return optimize.brent(_neg_log_likelihood, brack=(-2, 2)) 

3401 
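# Illustrative sketch (not part of the library source): ``optimize.brent`` is a
# one-dimensional scalar minimizer, so the call above simply searches for the
# lambda minimizing the negative log-likelihood, starting from the bracket
# (-2, 2). On a toy convex objective it behaves as expected:
#
# >>> from scipy import optimize
# >>> round(float(optimize.brent(lambda lmbda: (lmbda - 0.5) ** 2, brack=(-2, 2))), 6)
# 0.5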

3402 def _check_input(self, X, in_fit, check_positive=False, check_shape=False): 

3403 """Validate the input before fit and transform. 

3404 

3405 Parameters 

3406 ---------- 

3407 X : array-like of shape (n_samples, n_features) 

3408 

3409 in_fit : bool 

3410 Whether `_check_input` is called from `fit` or from other 

3411 methods, e.g. `predict`, `transform`, etc. 

3412 

3413 check_positive : bool, default=False 

3414 If True, check that all data is strictly positive (only if 

3415 ``self.method=='box-cox'``). 

3416 

3417 check_shape : bool, default=False 

3418 If True, check that n_features matches the length of self.lambdas_ 

3419 """ 

3420 X = self._validate_data( 

3421 X, 

3422 ensure_2d=True, 

3423 dtype=FLOAT_DTYPES, 

3424 copy=self.copy, 

3425 force_all_finite="allow-nan", 

3426 reset=in_fit, 

3427 ) 

3428 

3429 with warnings.catch_warnings(): 

3430 warnings.filterwarnings("ignore", r"All-NaN (slice|axis) encountered") 

3431 if check_positive and self.method == "box-cox" and np.nanmin(X) <= 0: 

3432 raise ValueError( 

3433 "The Box-Cox transformation can only be " 

3434 "applied to strictly positive data" 

3435 ) 

3436 

3437 if check_shape and not X.shape[1] == len(self.lambdas_): 

3438 raise ValueError( 

3439 "Input data has a different number of features " 

3440 "than fitting data. Should have {n}, data has {m}".format( 

3441 n=len(self.lambdas_), m=X.shape[1] 

3442 ) 

3443 ) 

3444 

3445 return X 

3446 
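# Illustrative sketch (not part of the library source): because of the
# positivity check above, fitting a box-cox transformer on data containing
# zeros or negative values fails early with an explicit error:
#
# >>> from sklearn.preprocessing import PowerTransformer
# >>> PowerTransformer(method="box-cox").fit([[0.0], [1.0]])
# Traceback (most recent call last):
#     ...
# ValueError: The Box-Cox transformation can only be applied to strictly positive data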

3447 def _more_tags(self): 

3448 return {"allow_nan": True} 

3449 

3450 

3451@validate_params( 

3452 {"X": ["array-like"]}, 

3453 prefer_skip_nested_validation=False, 

3454) 

3455def power_transform(X, method="yeo-johnson", *, standardize=True, copy=True): 

3456 """Parametric, monotonic transformation to make data more Gaussian-like. 

3457 

3458 Power transforms are a family of parametric, monotonic transformations 

3459 that are applied to make data more Gaussian-like. This is useful for 

3460 modeling issues related to heteroscedasticity (non-constant variance), 

3461 or other situations where normality is desired. 

3462 

3463 Currently, power_transform supports the Box-Cox transform and the 

3464 Yeo-Johnson transform. The optimal parameter for stabilizing variance and 

3465 minimizing skewness is estimated through maximum likelihood. 

3466 

3467 Box-Cox requires input data to be strictly positive, while Yeo-Johnson 

3468 supports both positive and negative data. 

3469 

3470 By default, zero-mean, unit-variance normalization is applied to the 

3471 transformed data. 

3472 

3473 Read more in the :ref:`User Guide <preprocessing_transformer>`. 

3474 

3475 Parameters 

3476 ---------- 

3477 X : array-like of shape (n_samples, n_features) 

3478 The data to be transformed using a power transformation. 

3479 

3480 method : {'yeo-johnson', 'box-cox'}, default='yeo-johnson' 

3481 The power transform method. Available methods are: 

3482 

3483 - 'yeo-johnson' [1]_, works with positive and negative values 

3484 - 'box-cox' [2]_, only works with strictly positive values 

3485 

3486 .. versionchanged:: 0.23 

3487 The default value of the `method` parameter changed from 

3488 'box-cox' to 'yeo-johnson' in 0.23. 

3489 

3490 standardize : bool, default=True 

3491 Set to True to apply zero-mean, unit-variance normalization to the 

3492 transformed output. 

3493 

3494 copy : bool, default=True 

3495 If False, try to avoid a copy and transform in place. 

3496 This is not guaranteed to always work in place; e.g. if the data is 

3497 a numpy array with an int dtype, a copy will be returned even with 

3498 copy=False. 

3499 

3500 Returns 

3501 ------- 

3502 X_trans : ndarray of shape (n_samples, n_features) 

3503 The transformed data. 

3504 

3505 See Also 

3506 -------- 

3507 PowerTransformer : Equivalent transformation with the 

3508 Transformer API (e.g. as part of a preprocessing 

3509 :class:`~sklearn.pipeline.Pipeline`). 

3510 

3511 quantile_transform : Maps data to a standard normal distribution with 

3512 the parameter `output_distribution='normal'`. 

3513 

3514 Notes 

3515 ----- 

3516 NaNs are treated as missing values: disregarded in ``fit``, and maintained 

3517 in ``transform``. 

3518 

3519 For a comparison of the different scalers, transformers, and normalizers, 

3520 see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`. 

3521 

3522 References 

3523 ---------- 

3524 

3525 .. [1] I.K. Yeo and R.A. Johnson, "A new family of power transformations to 

3526 improve normality or symmetry." Biometrika, 87(4), pp.954-959, 

3527 (2000). 

3528 

3529 .. [2] G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal 

3530 of the Royal Statistical Society B, 26, 211-252 (1964). 

3531 

3532 Examples 

3533 -------- 

3534 >>> import numpy as np 

3535 >>> from sklearn.preprocessing import power_transform 

3536 >>> data = [[1, 2], [3, 2], [4, 5]] 

3537 >>> print(power_transform(data, method='box-cox')) 

3538 [[-1.332... -0.707...] 

3539 [ 0.256... -0.707...] 

3540 [ 1.076... 1.414...]] 

3541 

3542 .. warning:: Risk of data leakage. 

3543 Do not use :func:`~sklearn.preprocessing.power_transform` unless you 

3544 know what you are doing. A common mistake is to apply it to the entire 

3545 data *before* splitting into training and test sets. This will bias the 

3546 model evaluation because information from the test set would leak into 

3547 the training set. 

3548 In general, we recommend using 

3549 :class:`~sklearn.preprocessing.PowerTransformer` within a 

3550 :ref:`Pipeline <pipeline>` in order to prevent most risks of data 

3551 leakage, e.g.: `pipe = make_pipeline(PowerTransformer(), 

3552 LogisticRegression())`. 

3553 """ 

3554 pt = PowerTransformer(method=method, standardize=standardize, copy=copy) 

3555 return pt.fit_transform(X)
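# Illustrative sketch (not part of the library source): unlike box-cox, the
# default yeo-johnson method accepts negative values, and NaNs are ignored
# during fitting and kept in the output, so the one-shot helper can be applied
# directly to such data (the shape is preserved):
#
# >>> import numpy as np
# >>> from sklearn.preprocessing import power_transform
# >>> X = np.array([[-1.0, 2.0], [0.0, np.nan], [1.0, 5.0]])
# >>> power_transform(X, method="yeo-johnson").shape
# (3, 2)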