from numbers import Integral, Real

import numpy as np

from ..base import OneToOneFeatureMixin, _fit_context
from ..utils._param_validation import Interval, StrOptions
from ..utils.multiclass import type_of_target
from ..utils.validation import (
    _check_feature_names_in,
    _check_y,
    check_consistent_length,
    check_is_fitted,
)
from ._encoders import _BaseEncoder
from ._target_encoder_fast import _fit_encoding_fast, _fit_encoding_fast_auto_smooth


class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder):
    """Target Encoder for regression and classification targets.

    Each category is encoded based on a shrunk estimate of the average target
    values for observations belonging to the category. The encoding scheme mixes
    the global target mean with the target mean conditioned on the value of the
    category (see [MIC]_).

    When the target type is "multiclass", encodings are based
    on the conditional probability estimate for each class. The target is first
    binarized using the "one-vs-all" scheme via
    :class:`~sklearn.preprocessing.LabelBinarizer`, then the average target
    value for each class and each category is used for encoding, resulting in
    `n_features` * `n_classes` encoded output features.

    :class:`TargetEncoder` considers missing values, such as `np.nan` or `None`,
    as another category and encodes them like any other category. Categories
    that are not seen during :meth:`fit` are encoded with the target mean, i.e.
    `target_mean_`.

    For a demo on the importance of the `TargetEncoder` internal cross-fitting,
    see
    :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder_cross_val.py`.
    For a comparison of different encoders, refer to
    :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`. Read
    more in the :ref:`User Guide <target_encoder>`.

    .. note::
        `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
        :term:`cross fitting` scheme is used in `fit_transform` for encoding.
        See the :ref:`User Guide <target_encoder>` for details.

    .. versionadded:: 1.3

    Parameters
    ----------
    categories : "auto" or list of shape (n_features,) of array-like, default="auto"
        Categories (unique values) per feature:

        - `"auto"` : Determine categories automatically from the training data.
        - list : `categories[i]` holds the categories expected in the i-th column. The
          passed categories should not mix strings and numeric values within a single
          feature, and should be sorted in case of numeric values.

        The used categories are stored in the `categories_` fitted attribute.

    target_type : {"auto", "continuous", "binary", "multiclass"}, default="auto"
        Type of target.

        - `"auto"` : Type of target is inferred with
          :func:`~sklearn.utils.multiclass.type_of_target`.
        - `"continuous"` : Continuous target
        - `"binary"` : Binary target
        - `"multiclass"` : Multiclass target

        .. note::
            The type of target inferred with `"auto"` may not be the desired target
            type used for modeling. For example, if the target consisted of integers
            between 0 and 100, then :func:`~sklearn.utils.multiclass.type_of_target`
            will infer the target as `"multiclass"`. In this case, setting
            `target_type="continuous"` will specify the target as a regression
            problem. The `target_type_` attribute gives the target type used by the
            encoder.

        .. versionchanged:: 1.4
           Added the option 'multiclass'.

    smooth : "auto" or float, default="auto"
        The amount of mixing of the target mean conditioned on the value of the
        category with the global target mean. A larger `smooth` value will put
        more weight on the global target mean.
        If `"auto"`, then `smooth` is set to an empirical Bayes estimate.

    cv : int, default=5
        Determines the number of folds in the :term:`cross fitting` strategy used in
        :meth:`fit_transform`. For classification targets, `StratifiedKFold` is used
        and for continuous targets, `KFold` is used.

    shuffle : bool, default=True
        Whether to shuffle the data in :meth:`fit_transform` before splitting into
        folds. Note that the samples within each split will not be shuffled.

    random_state : int, RandomState instance or None, default=None
        When `shuffle` is True, `random_state` affects the ordering of the
        indices, which controls the randomness of each fold. Otherwise, this
        parameter has no effect.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    encodings_ : list of shape (n_features,) or (n_features * n_classes) of \
        ndarray
        Encodings learnt on all of `X`.
        For feature `i`, `encodings_[i]` are the encodings matching the
        categories listed in `categories_[i]`. When `target_type_` is
        "multiclass", the encoding for feature `i` and class `j` is stored in
        `encodings_[j + (i * len(classes_))]`. E.g., for 2 features (f) and
        3 classes (c), encodings are ordered:
        f0_c0, f0_c1, f0_c2, f1_c0, f1_c1, f1_c2.

    categories_ : list of shape (n_features,) of ndarray
        The categories of each input feature determined during fitting or
        specified in `categories`
        (in order of the features in `X` and corresponding with the output
        of :meth:`transform`).

    target_type_ : str
        Type of target.

    target_mean_ : float
        The overall mean of the target. This value is only used in :meth:`transform`
        to encode categories.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

    classes_ : ndarray or None
        If `target_type_` is 'binary' or 'multiclass', holds the label for each class,
        otherwise `None`.

    See Also
    --------
    OrdinalEncoder : Performs an ordinal (integer) encoding of the categorical
        features. Contrary to TargetEncoder, this encoding is not supervised.
        Treating the resulting encoding as a numerical feature therefore leads
        to arbitrarily ordered values and typically to lower predictive
        performance when used as preprocessing for a classifier or regressor.
    OneHotEncoder : Performs a one-hot encoding of categorical features. This
        unsupervised encoding is better suited for low cardinality categorical
        variables as it generates one new feature per unique category.

    References
    ----------
    .. [MIC] :doi:`Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality
       categorical attributes in classification and prediction problems"
       SIGKDD Explor. Newsl. 3, 1 (July 2001), 27–32. <10.1145/507533.507538>`

    Examples
    --------
    With `smooth="auto"`, the smoothing parameter is set to an empirical Bayes estimate:

    >>> import numpy as np
    >>> from sklearn.preprocessing import TargetEncoder
    >>> X = np.array([["dog"] * 20 + ["cat"] * 30 + ["snake"] * 38], dtype=object).T
    >>> y = [90.3] * 5 + [80.1] * 15 + [20.4] * 5 + [20.1] * 25 + [21.2] * 8 + [49] * 30
    >>> enc_auto = TargetEncoder(smooth="auto")
    >>> X_trans = enc_auto.fit_transform(X, y)

    >>> # A high `smooth` parameter puts more weight on global mean on the categorical
    >>> # encodings:
    >>> enc_high_smooth = TargetEncoder(smooth=5000.0).fit(X, y)
    >>> enc_high_smooth.target_mean_
    44...
    >>> enc_high_smooth.encodings_
    [array([44..., 44..., 44...])]

    >>> # On the other hand, a low `smooth` parameter puts more weight on target
    >>> # conditioned on the value of the categorical:
    >>> enc_low_smooth = TargetEncoder(smooth=1.0).fit(X, y)
    >>> enc_low_smooth.encodings_
    [array([20..., 80..., 43...])]
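
    >>> # An illustrative multiclass example (the labels in `y_multi` are made
    >>> # up for this demo): one encoded column is produced per
    >>> # (feature, class) pair, i.e. 1 feature * 3 classes -> 3 columns:
    >>> y_multi = ["low"] * 20 + ["mid"] * 30 + ["high"] * 38
    >>> enc_multi = TargetEncoder(target_type="multiclass")
    >>> enc_multi.fit_transform(X, y_multi).shape
    (88, 3)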

184 """ 

185 

186 _parameter_constraints: dict = { 

187 "categories": [StrOptions({"auto"}), list], 

188 "target_type": [StrOptions({"auto", "continuous", "binary", "multiclass"})], 

189 "smooth": [StrOptions({"auto"}), Interval(Real, 0, None, closed="left")], 

190 "cv": [Interval(Integral, 2, None, closed="left")], 

191 "shuffle": ["boolean"], 

192 "random_state": ["random_state"], 

193 } 

194 

195 def __init__( 

196 self, 

197 categories="auto", 

198 target_type="auto", 

199 smooth="auto", 

200 cv=5, 

201 shuffle=True, 

202 random_state=None, 

203 ): 

204 self.categories = categories 

205 self.smooth = smooth 

206 self.target_type = target_type 

207 self.cv = cv 

208 self.shuffle = shuffle 

209 self.random_state = random_state 

210 

211 @_fit_context(prefer_skip_nested_validation=True) 

212 def fit(self, X, y): 

213 """Fit the :class:`TargetEncoder` to X and y. 

214 

215 Parameters 

216 ---------- 

217 X : array-like of shape (n_samples, n_features) 

218 The data to determine the categories of each feature. 

219 

220 y : array-like of shape (n_samples,) 

221 The target data used to encode the categories. 

222 

223 Returns 

224 ------- 

225 self : object 

226 Fitted encoder. 

227 """ 

228 self._fit_encodings_all(X, y) 

229 return self 

230 

231 @_fit_context(prefer_skip_nested_validation=True) 

232 def fit_transform(self, X, y): 

233 """Fit :class:`TargetEncoder` and transform X with the target encoding. 

234 

235 .. note:: 

236 `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a 

237 :term:`cross fitting` scheme is used in `fit_transform` for encoding. 

            See the :ref:`User Guide <target_encoder>` for details.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to determine the categories of each feature.

        y : array-like of shape (n_samples,)
            The target data used to encode the categories.

        Returns
        -------
        X_trans : ndarray of shape (n_samples, n_features) or \
                (n_samples, (n_features * n_classes))
            Transformed input.
        """
        from ..model_selection import KFold, StratifiedKFold  # avoid circular import

        X_ordinal, X_known_mask, y_encoded, n_categories = self._fit_encodings_all(X, y)

        # The cv splitter is voluntarily restricted to *KFold to enforce non
        # overlapping validation folds, otherwise the fit_transform output will
        # not be well-specified.
        if self.target_type_ == "continuous":
            cv = KFold(self.cv, shuffle=self.shuffle, random_state=self.random_state)
        else:
            cv = StratifiedKFold(
                self.cv, shuffle=self.shuffle, random_state=self.random_state
            )

        # If 'multiclass' multiply axis=1 by num classes else keep shape the same
        if self.target_type_ == "multiclass":
            X_out = np.empty(
                (X_ordinal.shape[0], X_ordinal.shape[1] * len(self.classes_)),
                dtype=np.float64,
            )
        else:
            X_out = np.empty_like(X_ordinal, dtype=np.float64)

        for train_idx, test_idx in cv.split(X, y):
            X_train, y_train = X_ordinal[train_idx, :], y_encoded[train_idx]
            y_train_mean = np.mean(y_train, axis=0)
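            # Encodings below are learnt on the train split only and applied
            # to the test split, so a sample's own target value never leaks
            # into its encoding (the cross-fitting scheme).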

            if self.target_type_ == "multiclass":
                encodings = self._fit_encoding_multiclass(
                    X_train,
                    y_train,
                    n_categories,
                    y_train_mean,
                )
            else:
                encodings = self._fit_encoding_binary_or_continuous(
                    X_train,
                    y_train,
                    n_categories,
                    y_train_mean,
                )
            self._transform_X_ordinal(
                X_out,
                X_ordinal,
                ~X_known_mask,
                test_idx,
                encodings,
                y_train_mean,
            )
        return X_out

    def transform(self, X):
        """Transform X with the target encoding.

        .. note::
            `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
            :term:`cross fitting` scheme is used in `fit_transform` for encoding.
            See the :ref:`User Guide <target_encoder>` for details.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to determine the categories of each feature.

        Returns
        -------
        X_trans : ndarray of shape (n_samples, n_features) or \
                (n_samples, (n_features * n_classes))
            Transformed input.
        """
        X_ordinal, X_known_mask = self._transform(
            X, handle_unknown="ignore", force_all_finite="allow-nan"
        )

        # If 'multiclass' multiply axis=1 by num of classes else keep shape the same
        if self.target_type_ == "multiclass":
            X_out = np.empty(
                (X_ordinal.shape[0], X_ordinal.shape[1] * len(self.classes_)),
                dtype=np.float64,
            )
        else:
            X_out = np.empty_like(X_ordinal, dtype=np.float64)

        self._transform_X_ordinal(
            X_out,
            X_ordinal,
            ~X_known_mask,
            slice(None),
            self.encodings_,
            self.target_mean_,
        )
        return X_out

    def _fit_encodings_all(self, X, y):
        """Fit a target encoding with all the data."""
        # avoid circular import
        from ..preprocessing import (
            LabelBinarizer,
            LabelEncoder,
        )

        check_consistent_length(X, y)
        self._fit(X, handle_unknown="ignore", force_all_finite="allow-nan")

        if self.target_type == "auto":
            accepted_target_types = ("binary", "multiclass", "continuous")
            inferred_type_of_target = type_of_target(y, input_name="y")
            if inferred_type_of_target not in accepted_target_types:
                raise ValueError(
                    "Unknown label type: Target type was inferred to be "
                    f"{inferred_type_of_target!r}. Only {accepted_target_types} are "
                    "supported."
                )
            self.target_type_ = inferred_type_of_target
        else:
            self.target_type_ = self.target_type

        self.classes_ = None
        if self.target_type_ == "binary":
            label_encoder = LabelEncoder()
            y = label_encoder.fit_transform(y)
            self.classes_ = label_encoder.classes_
        elif self.target_type_ == "multiclass":
            label_binarizer = LabelBinarizer()
            y = label_binarizer.fit_transform(y)
            self.classes_ = label_binarizer.classes_
        else:  # continuous
            y = _check_y(y, y_numeric=True, estimator=self)

        self.target_mean_ = np.mean(y, axis=0)
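        # For multiclass targets, y was one-hot encoded above, so this is a
        # vector of per-class frequencies; otherwise it is a scalar mean.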

        X_ordinal, X_known_mask = self._transform(
            X, handle_unknown="ignore", force_all_finite="allow-nan"
        )
        n_categories = np.fromiter(
            (len(category_for_feature) for category_for_feature in self.categories_),
            dtype=np.int64,
            count=len(self.categories_),
        )
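        # One entry per feature: the number of categories seen during fit,
        # i.e. len(self.categories_[i]) for feature i.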

        if self.target_type_ == "multiclass":
            encodings = self._fit_encoding_multiclass(
                X_ordinal,
                y,
                n_categories,
                self.target_mean_,
            )
        else:
            encodings = self._fit_encoding_binary_or_continuous(
                X_ordinal,
                y,
                n_categories,
                self.target_mean_,
            )
        self.encodings_ = encodings

        return X_ordinal, X_known_mask, y, n_categories

    def _fit_encoding_binary_or_continuous(
        self, X_ordinal, y, n_categories, target_mean
    ):
        """Learn target encodings."""

        if self.smooth == "auto":
            y_variance = np.var(y)
            encodings = _fit_encoding_fast_auto_smooth(
                X_ordinal,
                y,
                n_categories,
                target_mean,
                y_variance,
            )
        else:
            encodings = _fit_encoding_fast(
                X_ordinal,
                y,
                n_categories,
                self.smooth,
                target_mean,
            )
        return encodings

    def _fit_encoding_multiclass(self, X_ordinal, y, n_categories, target_mean):
        """Learn multiclass encodings.

        Learn encodings for each class (c) then reorder encodings such that
        the same features (f) are grouped together. `reorder_index` enables
        converting from:
        f0_c0, f1_c0, f0_c1, f1_c1, f0_c2, f1_c2
        to:
        f0_c0, f0_c1, f0_c2, f1_c0, f1_c1, f1_c2
        """
        n_features = self.n_features_in_
        n_classes = len(self.classes_)
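        # y is one-hot encoded here, so each column is a binary target; fit
        # one encoding per class by reusing the binary/continuous code path.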

        encodings = []
        for i in range(n_classes):
            y_class = y[:, i]
            encoding = self._fit_encoding_binary_or_continuous(
                X_ordinal,
                y_class,
                n_categories,
                target_mean[i],
            )
            encodings.extend(encoding)

        reorder_index = (
            idx
            for start in range(n_features)
            for idx in range(start, (n_classes * n_features), n_features)
        )
        return [encodings[idx] for idx in reorder_index]

    def _transform_X_ordinal(
        self,
        X_out,
        X_ordinal,
        X_unknown_mask,
        row_indices,
        encodings,
        target_mean,
    ):
        """Transform X_ordinal using encodings.

        In the multiclass case, `X_ordinal` and `X_unknown_mask` have column
        (axis=1) size `n_features`, while `encodings` has length of size
        `n_features * n_classes`. `feat_idx` deals with this by repeating
480 0,0,1,1,2,2 

481 

482 Additionally, `target_mean` is of shape (`n_classes`,) so `mean_idx` 

483 cycles through 0 to `n_classes` - 1, `n_features` times. 

484 """ 

485 if self.target_type_ == "multiclass": 

486 n_classes = len(self.classes_) 

487 for e_idx, encoding in enumerate(encodings): 

488 # Repeat feature indices by n_classes 

489 feat_idx = e_idx // n_classes 

490 # Cycle through each class 

491 mean_idx = e_idx % n_classes 

492 X_out[row_indices, e_idx] = encoding[X_ordinal[row_indices, feat_idx]] 

493 X_out[X_unknown_mask[:, feat_idx], e_idx] = target_mean[mean_idx] 

494 else: 
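            # Binary/continuous case: one encoding per input feature; rows
            # with unknown categories fall back to the provided target mean.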

            for e_idx, encoding in enumerate(encodings):
                X_out[row_indices, e_idx] = encoding[X_ordinal[row_indices, e_idx]]
                X_out[X_unknown_mask[:, e_idx], e_idx] = target_mean

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Not used, present here for API consistency by convention.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names. `feature_names_in_` is used unless it is
            not defined, in which case the following input feature names are
            generated: `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            When `target_type_` is "multiclass" the names are of the format
            '<feature_name>_<class_name>'.
515 """ 

516 check_is_fitted(self, "n_features_in_") 

517 feature_names = _check_feature_names_in(self, input_features) 

518 if self.target_type_ == "multiclass": 

519 feature_names = [ 

520 f"{feature_name}_{class_name}" 

521 for feature_name in feature_names 

522 for class_name in self.classes_ 

523 ] 

524 return np.asarray(feature_names, dtype=object) 

525 else: 

526 return feature_names 

527 

528 def _more_tags(self): 

529 return { 

530 "requires_y": True, 

531 }