Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scikit_learn-1.4.dev0-py3.8-linux-x86_64.egg/sklearn/preprocessing/_discretization.py: 14%

126 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-12 06:31 +0000

1# Author: Henry Lin <hlin117@gmail.com> 

2# Tom Dupré la Tour 

3 

4# License: BSD 

5 

6 

7import warnings 

8from numbers import Integral 

9 

10import numpy as np 

11 

12from ..base import BaseEstimator, TransformerMixin, _fit_context 

13from ..utils import _safe_indexing 

14from ..utils._param_validation import Hidden, Interval, Options, StrOptions 

15from ..utils.stats import _weighted_percentile 

16from ..utils.validation import ( 

17 _check_feature_names_in, 

18 _check_sample_weight, 

19 check_array, 

20 check_is_fitted, 

21 check_random_state, 

22) 

23from ._encoders import OneHotEncoder 

24 

25 

26class KBinsDiscretizer(TransformerMixin, BaseEstimator): 

27 """ 

28 Bin continuous data into intervals. 

29 

30 Read more in the :ref:`User Guide <preprocessing_discretization>`. 

31 

32 .. versionadded:: 0.20 

33 

34 Parameters 

35 ---------- 

36 n_bins : int or array-like of shape (n_features,), default=5 

37 The number of bins to produce. Raises ValueError if ``n_bins < 2``. 

38 

39 encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot' 

40 Method used to encode the transformed result. 

41 

42 - 'onehot': Encode the transformed result with one-hot encoding 

43 and return a sparse matrix. Ignored features are always 

44 stacked to the right. 

45 - 'onehot-dense': Encode the transformed result with one-hot encoding 

46 and return a dense array. Ignored features are always 

47 stacked to the right. 

48 - 'ordinal': Return the bin identifier encoded as an integer value. 

49 

50 strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile' 

51 Strategy used to define the widths of the bins. 

52 

53 - 'uniform': All bins in each feature have identical widths. 

54 - 'quantile': All bins in each feature have the same number of points. 

55 - 'kmeans': Values in each bin have the same nearest center of a 1D 

56 k-means cluster. 

57 

58 For an example of the different strategies see: 

59 :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_strategies.py`. 

60 

61 dtype : {np.float32, np.float64}, default=None 

62 The desired data-type for the output. If None, output dtype is 

63 consistent with input dtype. Only np.float32 and np.float64 are 

64 supported. 

65 

66 .. versionadded:: 0.24 

67 

68 subsample : int or None, default='warn' 

69 Maximum number of samples, used to fit the model, for computational 

70 efficiency. Defaults to 200_000 when `strategy='quantile'` and to `None` 

71 when `strategy='uniform'` or `strategy='kmeans'`. 

72 `subsample=None` means that all the training samples are used when 

73 computing the quantiles that determine the binning thresholds. 

74 Since quantile computation relies on sorting each column of `X` and 

75 that sorting has an `n log(n)` time complexity, 

76 it is recommended to use subsampling on datasets with a 

77 very large number of samples. 

78 

79 .. versionchanged:: 1.3 

80 The default value of `subsample` changed from `None` to `200_000` when 

81 `strategy="quantile"`. 

82 

83 .. versionchanged:: 1.5 

84 The default value of `subsample` changed from `None` to `200_000` when 

85 `strategy="uniform"` or `strategy="kmeans"`. 

86 

87 random_state : int, RandomState instance or None, default=None 

88 Determines random number generation for subsampling. 

89 Pass an int for reproducible results across multiple function calls. 

90 See the `subsample` parameter for more details. 

91 See :term:`Glossary <random_state>`. 

92 

93 .. versionadded:: 1.1 

94 

95 Attributes 

96 ---------- 

97 bin_edges_ : ndarray of ndarray of shape (n_features,) 

98 The edges of each bin. Contain arrays of varying shapes ``(n_bins_, )`` 

99 Ignored features will have empty arrays. 

100 

101 n_bins_ : ndarray of shape (n_features,), dtype=np.int64 

102 Number of bins per feature. Bins whose width are too small 

103 (i.e., <= 1e-8) are removed with a warning. 

104 

105 n_features_in_ : int 

106 Number of features seen during :term:`fit`. 

107 

108 .. versionadded:: 0.24 

109 

110 feature_names_in_ : ndarray of shape (`n_features_in_`,) 

111 Names of features seen during :term:`fit`. Defined only when `X` 

112 has feature names that are all strings. 

113 

114 .. versionadded:: 1.0 

115 

116 See Also 

117 -------- 

118 Binarizer : Class used to bin values as ``0`` or 

119 ``1`` based on a parameter ``threshold``. 

120 

121 Notes 

122 ----- 

123 

124 For a visualization of discretization on different datasets refer to 

125 :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_classification.py`. 

126 On the effect of discretization on linear models see: 

127 :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization.py`. 

128 

129 In bin edges for feature ``i``, the first and last values are used only for 

130 ``inverse_transform``. During transform, bin edges are extended to:: 

131 

132 np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf]) 

133 

134 You can combine ``KBinsDiscretizer`` with 

135 :class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess 

136 part of the features. 

137 

138 ``KBinsDiscretizer`` might produce constant features (e.g., when 

139 ``encode = 'onehot'`` and certain bins do not contain any data). 

140 These features can be removed with feature selection algorithms 

141 (e.g., :class:`~sklearn.feature_selection.VarianceThreshold`). 

142 

143 Examples 

144 -------- 

145 >>> from sklearn.preprocessing import KBinsDiscretizer 

146 >>> X = [[-2, 1, -4, -1], 

147 ... [-1, 2, -3, -0.5], 

148 ... [ 0, 3, -2, 0.5], 

149 ... [ 1, 4, -1, 2]] 

150 >>> est = KBinsDiscretizer( 

151 ... n_bins=3, encode='ordinal', strategy='uniform', subsample=None 

152 ... ) 

153 >>> est.fit(X) 

154 KBinsDiscretizer(...) 

155 >>> Xt = est.transform(X) 

156 >>> Xt # doctest: +SKIP 

157 array([[ 0., 0., 0., 0.], 

158 [ 1., 1., 1., 0.], 

159 [ 2., 2., 2., 1.], 

160 [ 2., 2., 2., 2.]]) 

161 

162 Sometimes it may be useful to convert the data back into the original 

163 feature space. The ``inverse_transform`` function converts the binned 

164 data into the original feature space. Each value will be equal to the mean 

165 of the two bin edges. 

166 

167 >>> est.bin_edges_[0] 

168 array([-2., -1., 0., 1.]) 

169 >>> est.inverse_transform(Xt) 

170 array([[-1.5, 1.5, -3.5, -0.5], 

171 [-0.5, 2.5, -2.5, -0.5], 

172 [ 0.5, 3.5, -1.5, 0.5], 

173 [ 0.5, 3.5, -1.5, 1.5]]) 

174 """ 

175 

176 _parameter_constraints: dict = { 

177 "n_bins": [Interval(Integral, 2, None, closed="left"), "array-like"], 

178 "encode": [StrOptions({"onehot", "onehot-dense", "ordinal"})], 

179 "strategy": [StrOptions({"uniform", "quantile", "kmeans"})], 

180 "dtype": [Options(type, {np.float64, np.float32}), None], 

181 "subsample": [ 

182 Interval(Integral, 1, None, closed="left"), 

183 None, 

184 Hidden(StrOptions({"warn"})), 

185 ], 

186 "random_state": ["random_state"], 

187 } 

188 

189 def __init__( 

190 self, 

191 n_bins=5, 

192 *, 

193 encode="onehot", 

194 strategy="quantile", 

195 dtype=None, 

196 subsample="warn", 

197 random_state=None, 

198 ): 

199 self.n_bins = n_bins 

200 self.encode = encode 

201 self.strategy = strategy 

202 self.dtype = dtype 

203 self.subsample = subsample 

204 self.random_state = random_state 

205 

206 @_fit_context(prefer_skip_nested_validation=True) 

207 def fit(self, X, y=None, sample_weight=None): 

208 """ 

209 Fit the estimator. 

210 

211 Parameters 

212 ---------- 

213 X : array-like of shape (n_samples, n_features) 

214 Data to be discretized. 

215 

216 y : None 

217 Ignored. This parameter exists only for compatibility with 

218 :class:`~sklearn.pipeline.Pipeline`. 

219 

220 sample_weight : ndarray of shape (n_samples,) 

221 Contains weight values to be associated with each sample. 

222 Only possible when `strategy` is set to `"quantile"`. 

223 

224 .. versionadded:: 1.3 

225 

226 Returns 

227 ------- 

228 self : object 

229 Returns the instance itself. 

230 """ 

231 X = self._validate_data(X, dtype="numeric") 

232 

233 if self.dtype in (np.float64, np.float32): 

234 output_dtype = self.dtype 

235 else: # self.dtype is None 

236 output_dtype = X.dtype 

237 

238 n_samples, n_features = X.shape 

239 

240 if sample_weight is not None and self.strategy == "uniform": 

241 raise ValueError( 

242 "`sample_weight` was provided but it cannot be " 

243 "used with strategy='uniform'. Got strategy=" 

244 f"{self.strategy!r} instead." 

245 ) 

246 

247 if self.strategy in ("uniform", "kmeans") and self.subsample == "warn": 

248 warnings.warn( 

249 ( 

250 "In version 1.5 onwards, subsample=200_000 " 

251 "will be used by default. Set subsample explicitly to " 

252 "silence this warning in the mean time. Set " 

253 "subsample=None to disable subsampling explicitly." 

254 ), 

255 FutureWarning, 

256 ) 

257 

258 subsample = self.subsample 

259 if subsample == "warn": 

260 subsample = 200000 if self.strategy == "quantile" else None 

261 if subsample is not None and n_samples > subsample: 

262 rng = check_random_state(self.random_state) 

263 subsample_idx = rng.choice(n_samples, size=subsample, replace=False) 

264 X = _safe_indexing(X, subsample_idx) 

265 

266 n_features = X.shape[1] 

267 n_bins = self._validate_n_bins(n_features) 

268 

269 if sample_weight is not None: 

270 sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype) 

271 

272 bin_edges = np.zeros(n_features, dtype=object) 

273 for jj in range(n_features): 

274 column = X[:, jj] 

275 col_min, col_max = column.min(), column.max() 

276 

277 if col_min == col_max: 

278 warnings.warn( 

279 "Feature %d is constant and will be replaced with 0." % jj 

280 ) 

281 n_bins[jj] = 1 

282 bin_edges[jj] = np.array([-np.inf, np.inf]) 

283 continue 

284 

285 if self.strategy == "uniform": 

286 bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1) 

287 

288 elif self.strategy == "quantile": 

289 quantiles = np.linspace(0, 100, n_bins[jj] + 1) 

290 if sample_weight is None: 

291 bin_edges[jj] = np.asarray(np.percentile(column, quantiles)) 

292 else: 

293 bin_edges[jj] = np.asarray( 

294 [ 

295 _weighted_percentile(column, sample_weight, q) 

296 for q in quantiles 

297 ], 

298 dtype=np.float64, 

299 ) 

300 elif self.strategy == "kmeans": 

301 from ..cluster import KMeans # fixes import loops 

302 

303 # Deterministic initialization with uniform spacing 

304 uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1) 

305 init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5 

306 

307 # 1D k-means procedure 

308 km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1) 

309 centers = km.fit( 

310 column[:, None], sample_weight=sample_weight 

311 ).cluster_centers_[:, 0] 

312 # Must sort, centers may be unsorted even with sorted init 

313 centers.sort() 

314 bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5 

315 bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max] 

316 

317 # Remove bins whose width are too small (i.e., <= 1e-8) 

318 if self.strategy in ("quantile", "kmeans"): 

319 mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8 

320 bin_edges[jj] = bin_edges[jj][mask] 

321 if len(bin_edges[jj]) - 1 != n_bins[jj]: 

322 warnings.warn( 

323 "Bins whose width are too small (i.e., <= " 

324 "1e-8) in feature %d are removed. Consider " 

325 "decreasing the number of bins." % jj 

326 ) 

327 n_bins[jj] = len(bin_edges[jj]) - 1 

328 

329 self.bin_edges_ = bin_edges 

330 self.n_bins_ = n_bins 

331 

332 if "onehot" in self.encode: 

333 self._encoder = OneHotEncoder( 

334 categories=[np.arange(i) for i in self.n_bins_], 

335 sparse_output=self.encode == "onehot", 

336 dtype=output_dtype, 

337 ) 

338 # Fit the OneHotEncoder with toy datasets 

339 # so that it's ready for use after the KBinsDiscretizer is fitted 

340 self._encoder.fit(np.zeros((1, len(self.n_bins_)))) 

341 

342 return self 

343 

344 def _validate_n_bins(self, n_features): 

345 """Returns n_bins_, the number of bins per feature.""" 

346 orig_bins = self.n_bins 

347 if isinstance(orig_bins, Integral): 

348 return np.full(n_features, orig_bins, dtype=int) 

349 

350 n_bins = check_array(orig_bins, dtype=int, copy=True, ensure_2d=False) 

351 

352 if n_bins.ndim > 1 or n_bins.shape[0] != n_features: 

353 raise ValueError("n_bins must be a scalar or array of shape (n_features,).") 

354 

355 bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins) 

356 

357 violating_indices = np.where(bad_nbins_value)[0] 

358 if violating_indices.shape[0] > 0: 

359 indices = ", ".join(str(i) for i in violating_indices) 

360 raise ValueError( 

361 "{} received an invalid number " 

362 "of bins at indices {}. Number of bins " 

363 "must be at least 2, and must be an int.".format( 

364 KBinsDiscretizer.__name__, indices 

365 ) 

366 ) 

367 return n_bins 

368 

369 def transform(self, X): 

370 """ 

371 Discretize the data. 

372 

373 Parameters 

374 ---------- 

375 X : array-like of shape (n_samples, n_features) 

376 Data to be discretized. 

377 

378 Returns 

379 ------- 

380 Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64} 

381 Data in the binned space. Will be a sparse matrix if 

382 `self.encode='onehot'` and ndarray otherwise. 

383 """ 

384 check_is_fitted(self) 

385 

386 # check input and attribute dtypes 

387 dtype = (np.float64, np.float32) if self.dtype is None else self.dtype 

388 Xt = self._validate_data(X, copy=True, dtype=dtype, reset=False) 

389 

390 bin_edges = self.bin_edges_ 

391 for jj in range(Xt.shape[1]): 

392 Xt[:, jj] = np.searchsorted(bin_edges[jj][1:-1], Xt[:, jj], side="right") 

393 

394 if self.encode == "ordinal": 

395 return Xt 

396 

397 dtype_init = None 

398 if "onehot" in self.encode: 

399 dtype_init = self._encoder.dtype 

400 self._encoder.dtype = Xt.dtype 

401 try: 

402 Xt_enc = self._encoder.transform(Xt) 

403 finally: 

404 # revert the initial dtype to avoid modifying self. 

405 self._encoder.dtype = dtype_init 

406 return Xt_enc 

407 

408 def inverse_transform(self, Xt): 

409 """ 

410 Transform discretized data back to original feature space. 

411 

412 Note that this function does not regenerate the original data 

413 due to discretization rounding. 

414 

415 Parameters 

416 ---------- 

417 Xt : array-like of shape (n_samples, n_features) 

418 Transformed data in the binned space. 

419 

420 Returns 

421 ------- 

422 Xinv : ndarray, dtype={np.float32, np.float64} 

423 Data in the original feature space. 

424 """ 

425 check_is_fitted(self) 

426 

427 if "onehot" in self.encode: 

428 Xt = self._encoder.inverse_transform(Xt) 

429 

430 Xinv = check_array(Xt, copy=True, dtype=(np.float64, np.float32)) 

431 n_features = self.n_bins_.shape[0] 

432 if Xinv.shape[1] != n_features: 

433 raise ValueError( 

434 "Incorrect number of features. Expecting {}, received {}.".format( 

435 n_features, Xinv.shape[1] 

436 ) 

437 ) 

438 

439 for jj in range(n_features): 

440 bin_edges = self.bin_edges_[jj] 

441 bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5 

442 Xinv[:, jj] = bin_centers[(Xinv[:, jj]).astype(np.int64)] 

443 

444 return Xinv 

445 

446 def get_feature_names_out(self, input_features=None): 

447 """Get output feature names. 

448 

449 Parameters 

450 ---------- 

451 input_features : array-like of str or None, default=None 

452 Input features. 

453 

454 - If `input_features` is `None`, then `feature_names_in_` is 

455 used as feature names in. If `feature_names_in_` is not defined, 

456 then the following input feature names are generated: 

457 `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. 

458 - If `input_features` is an array-like, then `input_features` must 

459 match `feature_names_in_` if `feature_names_in_` is defined. 

460 

461 Returns 

462 ------- 

463 feature_names_out : ndarray of str objects 

464 Transformed feature names. 

465 """ 

466 check_is_fitted(self, "n_features_in_") 

467 input_features = _check_feature_names_in(self, input_features) 

468 if hasattr(self, "_encoder"): 

469 return self._encoder.get_feature_names_out(input_features) 

470 

471 # ordinal encoding 

472 return input_features