# Author: Henry Lin <hlin117@gmail.com>
#         Tom Dupré la Tour

# License: BSD


import warnings
from numbers import Integral

import numpy as np

from ..base import BaseEstimator, TransformerMixin, _fit_context
from ..utils import _safe_indexing
from ..utils._param_validation import Hidden, Interval, Options, StrOptions
from ..utils.stats import _weighted_percentile
from ..utils.validation import (
    _check_feature_names_in,
    _check_sample_weight,
    check_array,
    check_is_fitted,
    check_random_state,
)
from ._encoders import OneHotEncoder


class KBinsDiscretizer(TransformerMixin, BaseEstimator):
    """
    Bin continuous data into intervals.

    Read more in the :ref:`User Guide <preprocessing_discretization>`.

    .. versionadded:: 0.20

    Parameters
    ----------
    n_bins : int or array-like of shape (n_features,), default=5
        The number of bins to produce. Raises ValueError if ``n_bins < 2``.

    encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'
        Method used to encode the transformed result.

        - 'onehot': Encode the transformed result with one-hot encoding
          and return a sparse matrix. Ignored features are always
          stacked to the right.
        - 'onehot-dense': Encode the transformed result with one-hot encoding
          and return a dense array. Ignored features are always
          stacked to the right.
        - 'ordinal': Return the bin identifier encoded as an integer value.

    strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
        Strategy used to define the widths of the bins.

        - 'uniform': All bins in each feature have identical widths.
        - 'quantile': All bins in each feature have the same number of points.
        - 'kmeans': Values in each bin have the same nearest center of a 1D
          k-means cluster.

        For an example of the different strategies see:
        :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_strategies.py`.

    dtype : {np.float32, np.float64}, default=None
        The desired data-type for the output. If None, output dtype is
        consistent with input dtype. Only np.float32 and np.float64 are
        supported.

        .. versionadded:: 0.24

    subsample : int or None, default='warn'
        Maximum number of samples, used to fit the model, for computational
        efficiency. Defaults to 200_000 when `strategy='quantile'` and to `None`
        when `strategy='uniform'` or `strategy='kmeans'`.
        `subsample=None` means that all the training samples are used when
        computing the quantiles that determine the binning thresholds.
        Since quantile computation relies on sorting each column of `X`, which
        has an `n log(n)` time complexity, it is recommended to use subsampling
        on datasets with a very large number of samples.

        .. versionchanged:: 1.3
            The default value of `subsample` changed from `None` to `200_000` when
            `strategy="quantile"`.

        .. versionchanged:: 1.5
            The default value of `subsample` changed from `None` to `200_000` when
            `strategy="uniform"` or `strategy="kmeans"`.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for subsampling.
        Pass an int for reproducible results across multiple function calls.
        See the `subsample` parameter for more details.
        See :term:`Glossary <random_state>`.

        .. versionadded:: 1.1

    Attributes
    ----------
    bin_edges_ : ndarray of ndarray of shape (n_features,)
        The edges of each bin. Contains arrays of varying shapes
        ``(n_bins_[i] + 1,)``. Ignored features will have empty arrays.

    n_bins_ : ndarray of shape (n_features,), dtype=np.int64
        Number of bins per feature. Bins whose width is too small
        (i.e., <= 1e-8) are removed with a warning.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    Binarizer : Class used to bin values as ``0`` or
        ``1`` based on a parameter ``threshold``.

    Notes
    -----

    For a visualization of discretization on different datasets refer to
    :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_classification.py`.
    On the effect of discretization on linear models see:
    :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization.py`.
    In bin edges for feature ``i``, the first and last values are used only for
    ``inverse_transform``. During transform, bin edges are extended to::

      np.concatenate([[-np.inf], bin_edges_[i][1:-1], [np.inf]])
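
    For example, ``bin_edges_[i] = [-2., -1., 0., 1.]`` gives the search edges
    ``[-np.inf, -1., 0., np.inf]``: values below ``-1`` fall in the first bin
    and values at or above ``0`` fall in the last bin.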

    You can combine ``KBinsDiscretizer`` with
    :class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess
    part of the features.

    ``KBinsDiscretizer`` might produce constant features (e.g., when
    ``encode = 'onehot'`` and certain bins do not contain any data).
    These features can be removed with feature selection algorithms
    (e.g., :class:`~sklearn.feature_selection.VarianceThreshold`).

    Examples
    --------
    >>> from sklearn.preprocessing import KBinsDiscretizer
    >>> X = [[-2, 1, -4,   -1],
    ...      [-1, 2, -3, -0.5],
    ...      [ 0, 3, -2,  0.5],
    ...      [ 1, 4, -1,    2]]
    >>> est = KBinsDiscretizer(
    ...     n_bins=3, encode='ordinal', strategy='uniform', subsample=None
    ... )
    >>> est.fit(X)
    KBinsDiscretizer(...)
    >>> Xt = est.transform(X)
    >>> Xt  # doctest: +SKIP
    array([[ 0., 0., 0., 0.],
           [ 1., 1., 1., 0.],
           [ 2., 2., 2., 1.],
           [ 2., 2., 2., 2.]])
    ``inverse_transform`` converts the binned data back into the original
    feature space; each output value equals the midpoint of the two edges of
    its bin.

    >>> est.bin_edges_[0]
    array([-2., -1.,  0.,  1.])
    >>> est.inverse_transform(Xt)
    array([[-1.5,  1.5, -3.5, -0.5],
           [-0.5,  2.5, -2.5, -0.5],
           [ 0.5,  3.5, -1.5,  0.5],
           [ 0.5,  3.5, -1.5,  1.5]])
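
    One-hot encodings instead produce one output column per bin. For instance,
    refitting the same configuration with ``encode='onehot-dense'`` expands
    the 4 features into 3 columns each:

    >>> est_dense = KBinsDiscretizer(
    ...     n_bins=3, encode='onehot-dense', strategy='uniform', subsample=None
    ... ).fit(X)
    >>> est_dense.transform(X).shape
    (4, 12)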
    """

    _parameter_constraints: dict = {
        "n_bins": [Interval(Integral, 2, None, closed="left"), "array-like"],
        "encode": [StrOptions({"onehot", "onehot-dense", "ordinal"})],
        "strategy": [StrOptions({"uniform", "quantile", "kmeans"})],
        "dtype": [Options(type, {np.float64, np.float32}), None],
        "subsample": [
            Interval(Integral, 1, None, closed="left"),
            None,
            Hidden(StrOptions({"warn"})),
        ],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        n_bins=5,
        *,
        encode="onehot",
        strategy="quantile",
        dtype=None,
        subsample="warn",
        random_state=None,
    ):
        self.n_bins = n_bins
        self.encode = encode
        self.strategy = strategy
        self.dtype = dtype
        self.subsample = subsample
        self.random_state = random_state

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None, sample_weight=None):
        """
        Fit the estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        sample_weight : ndarray of shape (n_samples,), default=None
            Contains weight values to be associated with each sample.
            Cannot be used when `strategy` is set to `"uniform"`.

            .. versionadded:: 1.3

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X = self._validate_data(X, dtype="numeric")

        if self.dtype in (np.float64, np.float32):
            output_dtype = self.dtype
        else:  # self.dtype is None
            output_dtype = X.dtype

        n_samples, n_features = X.shape

        if sample_weight is not None and self.strategy == "uniform":
            raise ValueError(
                "`sample_weight` was provided but it cannot be "
                "used with strategy='uniform'. Got strategy="
                f"{self.strategy!r} instead."
            )

        if self.strategy in ("uniform", "kmeans") and self.subsample == "warn":
            warnings.warn(
                (
                    "In version 1.5 onwards, subsample=200_000 "
                    "will be used by default. Set subsample explicitly to "
                    "silence this warning in the meantime. Set "
                    "subsample=None to disable subsampling explicitly."
                ),
                FutureWarning,
            )

        subsample = self.subsample
        if subsample == "warn":
            subsample = 200_000 if self.strategy == "quantile" else None
        if subsample is not None and n_samples > subsample:
            rng = check_random_state(self.random_state)
            subsample_idx = rng.choice(n_samples, size=subsample, replace=False)
            X = _safe_indexing(X, subsample_idx)
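            # e.g. with n_samples == 1_000_000 and subsample == 200_000, the
            # bin edges below are estimated from a random 20% of the rows,
            # drawn without replacement.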

        n_features = X.shape[1]
        n_bins = self._validate_n_bins(n_features)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

        bin_edges = np.zeros(n_features, dtype=object)
        for jj in range(n_features):
            column = X[:, jj]
            col_min, col_max = column.min(), column.max()

            if col_min == col_max:
                warnings.warn(
                    "Feature %d is constant and will be replaced with 0." % jj
                )
                n_bins[jj] = 1
                bin_edges[jj] = np.array([-np.inf, np.inf])
                continue

            if self.strategy == "uniform":
                bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1)

            elif self.strategy == "quantile":
                quantiles = np.linspace(0, 100, n_bins[jj] + 1)
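                # e.g. n_bins[jj] == 4 gives the percentile grid
                # [0., 25., 50., 75., 100.]; using these percentiles as edges
                # puts roughly the same number of training points in each bin.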
                if sample_weight is None:
                    bin_edges[jj] = np.asarray(np.percentile(column, quantiles))
                else:
                    bin_edges[jj] = np.asarray(
                        [
                            _weighted_percentile(column, sample_weight, q)
                            for q in quantiles
                        ],
                        dtype=np.float64,
                    )
            elif self.strategy == "kmeans":
                from ..cluster import KMeans  # fixes import loops

                # Deterministic initialization with uniform spacing
                uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1)
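                # Midpoints of the uniform bins, e.g. uniform_edges
                # [0., 1., 2.] -> init [[0.5], [1.5]]; KMeans expects init of
                # shape (n_clusters, n_features), here (n_bins[jj], 1).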
                init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5

                # 1D k-means procedure
                km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
                centers = km.fit(
                    column[:, None], sample_weight=sample_weight
                ).cluster_centers_[:, 0]
                # Must sort, centers may be unsorted even with sorted init
                centers.sort()
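                # Interior edges are midpoints between consecutive sorted
                # centers, padded with the column min/max below, e.g. centers
                # [1., 3., 7.] -> edges [col_min, 2., 5., col_max].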
                bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5
                bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]

            # Remove bins whose width is too small (i.e., <= 1e-8)
            if self.strategy in ("quantile", "kmeans"):
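                # e.g. edges [0., 0., 1.] -> widths [inf, 0., 1.] -> mask
                # [True, False, True], dropping the duplicated edge;
                # to_begin=np.inf always keeps the first edge.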
                mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8
                bin_edges[jj] = bin_edges[jj][mask]
                if len(bin_edges[jj]) - 1 != n_bins[jj]:
                    warnings.warn(
                        "Bins whose width is too small (i.e., <= "
                        "1e-8) in feature %d are removed. Consider "
                        "decreasing the number of bins." % jj
                    )
                    n_bins[jj] = len(bin_edges[jj]) - 1

        self.bin_edges_ = bin_edges
        self.n_bins_ = n_bins

        if "onehot" in self.encode:
            self._encoder = OneHotEncoder(
                categories=[np.arange(i) for i in self.n_bins_],
                sparse_output=self.encode == "onehot",
                dtype=output_dtype,
            )
            # Fit the OneHotEncoder with a toy dataset
            # so that it's ready for use after the KBinsDiscretizer is fitted
            self._encoder.fit(np.zeros((1, len(self.n_bins_))))
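            # With `categories` given explicitly, a single all-zeros row is
            # enough: fit only sets up the output layout and does not infer
            # categories from the data.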

        return self

    def _validate_n_bins(self, n_features):
        """Returns n_bins_, the number of bins per feature."""
        orig_bins = self.n_bins
        if isinstance(orig_bins, Integral):
            return np.full(n_features, orig_bins, dtype=int)

        n_bins = check_array(orig_bins, dtype=int, copy=True, ensure_2d=False)

        if n_bins.ndim > 1 or n_bins.shape[0] != n_features:
            raise ValueError("n_bins must be a scalar or array of shape (n_features,).")

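        # The int cast above truncates, so comparing against the original
        # values also flags non-integral entries (e.g. 3.5 -> 3 != 3.5),
        # alongside any bin count below 2.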
        bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins)

        violating_indices = np.where(bad_nbins_value)[0]
        if violating_indices.shape[0] > 0:
            indices = ", ".join(str(i) for i in violating_indices)
            raise ValueError(
                "{} received an invalid number "
                "of bins at indices {}. Number of bins "
                "must be at least 2, and must be an int.".format(
                    KBinsDiscretizer.__name__, indices
                )
            )
        return n_bins

    def transform(self, X):
        """
        Discretize the data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        Returns
        -------
        Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}
            Data in the binned space. Will be a sparse matrix if
            `self.encode='onehot'` and ndarray otherwise.
        """
        check_is_fitted(self)

        # check input and attribute dtypes
        dtype = (np.float64, np.float32) if self.dtype is None else self.dtype
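        # A dtype tuple lets `_validate_data` keep a float32/float64 input
        # as-is and convert anything else to the first entry, np.float64.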
        Xt = self._validate_data(X, copy=True, dtype=dtype, reset=False)

        bin_edges = self.bin_edges_
        for jj in range(Xt.shape[1]):
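            # Rank each value against the interior edges only, so values
            # beyond the outermost edges land in the first or last bin, e.g.
            # edges [0., 1., 2., 3.] -> interior [1., 2.], and
            # np.searchsorted([1., 2.], [-5., 1., 9.], side="right")
            # returns bins [0, 1, 2].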
            Xt[:, jj] = np.searchsorted(bin_edges[jj][1:-1], Xt[:, jj], side="right")

        if self.encode == "ordinal":
            return Xt

        dtype_init = None
        if "onehot" in self.encode:
            dtype_init = self._encoder.dtype
            self._encoder.dtype = Xt.dtype
        try:
            Xt_enc = self._encoder.transform(Xt)
        finally:
            # revert the initial dtype to avoid modifying self.
            self._encoder.dtype = dtype_init
        return Xt_enc

    def inverse_transform(self, Xt):
        """
        Transform discretized data back to original feature space.

        Note that this function does not regenerate the original data
        due to discretization rounding.

        Parameters
        ----------
        Xt : array-like of shape (n_samples, n_features)
            Transformed data in the binned space.

        Returns
        -------
        Xinv : ndarray, dtype={np.float32, np.float64}
            Data in the original feature space.
        """
        check_is_fitted(self)

        if "onehot" in self.encode:
            Xt = self._encoder.inverse_transform(Xt)

        Xinv = check_array(Xt, copy=True, dtype=(np.float64, np.float32))
        n_features = self.n_bins_.shape[0]
        if Xinv.shape[1] != n_features:
            raise ValueError(
                "Incorrect number of features. Expecting {}, received {}.".format(
                    n_features, Xinv.shape[1]
                )
            )

        for jj in range(n_features):
            bin_edges = self.bin_edges_[jj]
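            # Each ordinal code indexes into the midpoints of consecutive
            # edges, e.g. edges [-2., -1., 0., 1.] -> centers
            # [-1.5, -0.5, 0.5].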
            bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5
            Xinv[:, jj] = bin_centers[(Xinv[:, jj]).astype(np.int64)]

        return Xinv

    def get_feature_names_out(self, input_features=None):
        """Get output feature names.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self, "n_features_in_")
        input_features = _check_feature_names_in(self, input_features)
        if hasattr(self, "_encoder"):
            return self._encoder.get_feature_names_out(input_features)

        # ordinal encoding
        return input_features