Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scikit_learn-1.4.dev0-py3.8-linux-x86_64.egg/sklearn/preprocessing/_target

1from numbers import Integral, Real

3import numpy as np

5from ..base import OneToOneFeatureMixin, _fit_context

6from ..utils._param_validation import Interval, StrOptions

7from ..utils.multiclass import type_of_target

8from ..utils.validation import (

9 _check_feature_names_in,

10 _check_y,

11 check_consistent_length,

12 check_is_fitted,

13)

14from ._encoders import _BaseEncoder

15from ._target_encoder_fast import _fit_encoding_fast, _fit_encoding_fast_auto_smooth

class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder):
    """Target Encoder for regression and classification targets.

    Each category is encoded based on a shrunk estimate of the average target
    values for observations belonging to the category. The encoding scheme mixes
    the global target mean with the target mean conditioned on the value of the
    category (see [MIC]_).

    When the target type is "multiclass", encodings are based
    on the conditional probability estimate for each class. The target is first
    binarized using the "one-vs-all" scheme via
    :class:`~sklearn.preprocessing.LabelBinarizer`, then the average target
    value for each class and each category is used for encoding, resulting in
    `n_features` * `n_classes` encoded output features.

    :class:`TargetEncoder` considers missing values, such as `np.nan` or `None`,
    as another category and encodes them like any other category. Categories
    that are not seen during :meth:`fit` are encoded with the target mean, i.e.
    `target_mean_`.

    For a demo on the importance of the `TargetEncoder` internal cross-fitting,
    see
    :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder_cross_val.py`.
    For a comparison of different encoders, refer to
    :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`. Read
    more in the :ref:`User Guide <target_encoder>`.

    .. note::
        `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
        :term:`cross fitting` scheme is used in `fit_transform` for encoding.
        See the :ref:`User Guide <target_encoder>` for details.

    .. versionadded:: 1.3

    Parameters
    ----------
    categories : "auto" or list of shape (n_features,) of array-like, default="auto"
        Categories (unique values) per feature:

        - `"auto"` : Determine categories automatically from the training data.
        - list : `categories[i]` holds the categories expected in the i-th column. The
          passed categories should not mix strings and numeric values within a single
          feature, and should be sorted in case of numeric values.

        The used categories are stored in the `categories_` fitted attribute.

    target_type : {"auto", "continuous", "binary", "multiclass"}, default="auto"
        Type of target.

        - `"auto"` : Type of target is inferred with
          :func:`~sklearn.utils.multiclass.type_of_target`.
        - `"continuous"` : Continuous target
        - `"binary"` : Binary target
        - `"multiclass"` : Multiclass target

        .. note::
            The type of target inferred with `"auto"` may not be the desired target
            type used for modeling. For example, if the target consisted of integers
            between 0 and 100, then :func:`~sklearn.utils.multiclass.type_of_target`
            will infer the target as `"multiclass"`. In this case, setting
            `target_type="continuous"` will specify the target as a regression
            problem. The `target_type_` attribute gives the target type used by the
            encoder.

        .. versionchanged:: 1.4
           Added the option 'multiclass'.

    smooth : "auto" or float, default="auto"
        The amount of mixing of the target mean conditioned on the value of the
        category with the global target mean. A larger `smooth` value will put
        more weight on the global target mean.
        If `"auto"`, then `smooth` is set to an empirical Bayes estimate.

    cv : int, default=5
        Determines the number of folds in the :term:`cross fitting` strategy used in
        :meth:`fit_transform`. For classification targets, `StratifiedKFold` is used
        and for continuous targets, `KFold` is used.

    shuffle : bool, default=True
        Whether to shuffle the data in :meth:`fit_transform` before splitting into
        folds. Note that the samples within each split will not be shuffled.

    random_state : int, RandomState instance or None, default=None
        When `shuffle` is True, `random_state` affects the ordering of the
        indices, which controls the randomness of each fold. Otherwise, this
        parameter has no effect.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    encodings_ : list of shape (n_features,) or (n_features * n_classes) of \
                    ndarray
        Encodings learnt on all of `X`.
        For feature `i`, `encodings_[i]` are the encodings matching the
        categories listed in `categories_[i]`. When `target_type_` is
        "multiclass", the encoding for feature `i` and class `j` is stored in
        `encodings_[j + (i * len(classes_))]`. E.g., for 2 features (f) and
        3 classes (c), encodings are ordered:
        f0_c0, f0_c1, f0_c2, f1_c0, f1_c1, f1_c2.

    categories_ : list of shape (n_features,) of ndarray
        The categories of each input feature determined during fitting or
        specified in `categories`
        (in order of the features in `X` and corresponding with the output
        of :meth:`transform`).

    target_type_ : str
        Type of target.

    target_mean_ : float or ndarray of shape (n_classes,)
        The overall mean of the target. When `target_type_` is "multiclass",
        it holds one mean per column of the binarized target. In
        :meth:`transform`, this value is used to encode categories that were
        not seen during :meth:`fit`.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

    classes_ : ndarray or None
        If `target_type_` is 'binary' or 'multiclass', holds the label for each class,
        otherwise `None`.

    See Also
    --------
    OrdinalEncoder : Performs an ordinal (integer) encoding of the categorical features.
        Contrary to TargetEncoder, this encoding is not supervised. Treating the
        resulting encoding as numerical features therefore leads to arbitrarily
        ordered values and therefore typically leads to lower predictive performance
        when used as preprocessing for a classifier or regressor.
    OneHotEncoder : Performs a one-hot encoding of categorical features. This
        unsupervised encoding is better suited for low cardinality categorical
        variables as it generates one new feature per unique category.

    References
    ----------
    .. [MIC] :doi:`Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality
       categorical attributes in classification and prediction problems"
       SIGKDD Explor. Newsl. 3, 1 (July 2001), 27–32. <10.1145/507533.507538>`

    Examples
    --------
    With `smooth="auto"`, the smoothing parameter is set to an empirical Bayes estimate:

    >>> import numpy as np
    >>> from sklearn.preprocessing import TargetEncoder
    >>> X = np.array([["dog"] * 20 + ["cat"] * 30 + ["snake"] * 38], dtype=object).T
    >>> y = [90.3] * 5 + [80.1] * 15 + [20.4] * 5 + [20.1] * 25 + [21.2] * 8 + [49] * 30
    >>> enc_auto = TargetEncoder(smooth="auto")
    >>> X_trans = enc_auto.fit_transform(X, y)

    >>> # A high `smooth` parameter puts more weight on global mean on the categorical
    >>> # encodings:
    >>> enc_high_smooth = TargetEncoder(smooth=5000.0).fit(X, y)
    >>> enc_high_smooth.target_mean_
    44...
    >>> enc_high_smooth.encodings_
    [array([44..., 44..., 44...])]

    >>> # On the other hand, a low `smooth` parameter puts more weight on target
    >>> # conditioned on the value of the categorical:
    >>> enc_low_smooth = TargetEncoder(smooth=1.0).fit(X, y)
    >>> enc_low_smooth.encodings_
    [array([20..., 80..., 43...])]
    """

    _parameter_constraints: dict = {
        "categories": [StrOptions({"auto"}), list],
        "target_type": [StrOptions({"auto", "continuous", "binary", "multiclass"})],
        "smooth": [StrOptions({"auto"}), Interval(Real, 0, None, closed="left")],
        "cv": [Interval(Integral, 2, None, closed="left")],
        "shuffle": ["boolean"],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        categories="auto",
        target_type="auto",
        smooth="auto",
        cv=5,
        shuffle=True,
        random_state=None,
    ):
        self.categories = categories
        self.target_type = target_type
        self.smooth = smooth
        self.cv = cv
        self.shuffle = shuffle
        self.random_state = random_state

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y):
        """Fit the :class:`TargetEncoder` to X and y.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to determine the categories of each feature.

        y : array-like of shape (n_samples,)
            The target data used to encode the categories.

        Returns
        -------
        self : object
            Fitted encoder.
        """
        self._fit_encodings_all(X, y)
        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def fit_transform(self, X, y):
        """Fit :class:`TargetEncoder` and transform X with the target encoding.

        .. note::
            `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
            :term:`cross fitting` scheme is used in `fit_transform` for encoding.
            See the :ref:`User Guide <target_encoder>` for details.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to determine the categories of each feature.

        y : array-like of shape (n_samples,)
            The target data used to encode the categories.

        Returns
        -------
        X_trans : ndarray of shape (n_samples, n_features) or \
                    (n_samples, (n_features * n_classes))
            Transformed input.
        """
        from ..model_selection import KFold, StratifiedKFold  # avoid circular import

        X_ordinal, X_known_mask, y_encoded, n_categories = self._fit_encodings_all(X, y)

        # The cv splitter is voluntarily restricted to *KFold to enforce non
        # overlapping validation folds, otherwise the fit_transform output will
        # not be well-specified.
        if self.target_type_ == "continuous":
            cv = KFold(self.cv, shuffle=self.shuffle, random_state=self.random_state)
        else:
            cv = StratifiedKFold(
                self.cv, shuffle=self.shuffle, random_state=self.random_state
            )

        # If 'multiclass' multiply axis=1 by num classes else keep shape the same
        if self.target_type_ == "multiclass":
            X_out = np.empty(
                (X_ordinal.shape[0], X_ordinal.shape[1] * len(self.classes_)),
                dtype=np.float64,
            )
        else:
            X_out = np.empty_like(X_ordinal, dtype=np.float64)

        for train_idx, test_idx in cv.split(X, y):
            # Encodings are learnt on the training part of the fold only and
            # then applied to the held-out rows (`test_idx`).
            X_train, y_train = X_ordinal[train_idx, :], y_encoded[train_idx]
            y_train_mean = np.mean(y_train, axis=0)

            if self.target_type_ == "multiclass":
                encodings = self._fit_encoding_multiclass(
                    X_train,
                    y_train,
                    n_categories,
                    y_train_mean,
                )
            else:
                encodings = self._fit_encoding_binary_or_continuous(
                    X_train,
                    y_train,
                    n_categories,
                    y_train_mean,
                )
            self._transform_X_ordinal(
                X_out,
                X_ordinal,
                ~X_known_mask,
                test_idx,
                encodings,
                y_train_mean,
            )
        return X_out

    def transform(self, X):
        """Transform X with the target encoding.

        .. note::
            `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
            :term:`cross fitting` scheme is used in `fit_transform` for encoding.
            See the :ref:`User Guide <target_encoder>` for details.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to encode.

        Returns
        -------
        X_trans : ndarray of shape (n_samples, n_features) or \
                    (n_samples, (n_features * n_classes))
            Transformed input.
        """
        X_ordinal, X_known_mask = self._transform(
            X, handle_unknown="ignore", force_all_finite="allow-nan"
        )

        # If 'multiclass' multiply axis=1 by num of classes else keep shape the same
        if self.target_type_ == "multiclass":
            X_out = np.empty(
                (X_ordinal.shape[0], X_ordinal.shape[1] * len(self.classes_)),
                dtype=np.float64,
            )
        else:
            X_out = np.empty_like(X_ordinal, dtype=np.float64)

        # `slice(None)` selects every row: the encodings learnt on the full
        # training data are applied to all samples of `X`.
        self._transform_X_ordinal(
            X_out,
            X_ordinal,
            ~X_known_mask,
            slice(None),
            self.encodings_,
            self.target_mean_,
        )
        return X_out

    def _fit_encodings_all(self, X, y):
        """Fit a target encoding with all the data.

        Sets `target_type_`, `classes_`, `target_mean_` and `encodings_`, and
        returns `(X_ordinal, X_known_mask, y, n_categories)` where `y` is the
        encoded target (label-encoded for "binary", binarized for
        "multiclass", validated numeric for "continuous").

        Raises
        ------
        ValueError
            If `target_type="auto"` and the inferred target type is not one of
            "binary", "multiclass" or "continuous".
        """
        # avoid circular import
        from ..preprocessing import (
            LabelBinarizer,
            LabelEncoder,
        )

        check_consistent_length(X, y)
        self._fit(X, handle_unknown="ignore", force_all_finite="allow-nan")

        if self.target_type == "auto":
            accepted_target_types = ("binary", "multiclass", "continuous")
            inferred_type_of_target = type_of_target(y, input_name="y")
            if inferred_type_of_target not in accepted_target_types:
                raise ValueError(
                    "Unknown label type: Target type was inferred to be "
                    f"{inferred_type_of_target!r}. Only {accepted_target_types} are "
                    "supported."
                )
            self.target_type_ = inferred_type_of_target
        else:
            self.target_type_ = self.target_type

        self.classes_ = None
        if self.target_type_ == "binary":
            label_encoder = LabelEncoder()
            y = label_encoder.fit_transform(y)
            self.classes_ = label_encoder.classes_
        elif self.target_type_ == "multiclass":
            label_binarizer = LabelBinarizer()
            y = label_binarizer.fit_transform(y)
            self.classes_ = label_binarizer.classes_
        else:  # continuous
            y = _check_y(y, y_numeric=True, estimator=self)

        # With a binarized (2D) multiclass target, `axis=0` yields one mean
        # per class column.
        self.target_mean_ = np.mean(y, axis=0)

        X_ordinal, X_known_mask = self._transform(
            X, handle_unknown="ignore", force_all_finite="allow-nan"
        )
        n_categories = np.fromiter(
            (len(category_for_feature) for category_for_feature in self.categories_),
            dtype=np.int64,
            count=len(self.categories_),
        )
        if self.target_type_ == "multiclass":
            encodings = self._fit_encoding_multiclass(
                X_ordinal,
                y,
                n_categories,
                self.target_mean_,
            )
        else:
            encodings = self._fit_encoding_binary_or_continuous(
                X_ordinal,
                y,
                n_categories,
                self.target_mean_,
            )
        self.encodings_ = encodings

        return X_ordinal, X_known_mask, y, n_categories

    def _fit_encoding_binary_or_continuous(
        self, X_ordinal, y, n_categories, target_mean
    ):
        """Learn target encodings.

        Dispatches to `_fit_encoding_fast_auto_smooth` when `smooth="auto"`
        (passing the variance of `y`), otherwise to `_fit_encoding_fast` with
        the user-provided `smooth` value.
        """
        if self.smooth == "auto":
            y_variance = np.var(y)
            encodings = _fit_encoding_fast_auto_smooth(
                X_ordinal,
                y,
                n_categories,
                target_mean,
                y_variance,
            )
        else:
            encodings = _fit_encoding_fast(
                X_ordinal,
                y,
                n_categories,
                self.smooth,
                target_mean,
            )
        return encodings

    def _fit_encoding_multiclass(self, X_ordinal, y, n_categories, target_mean):
        """Learn multiclass encodings.

        Learn encodings for each class (c) then reorder encodings such that
        the same features (f) are grouped together. `reorder_index` enables
        converting from:
        f0_c0, f1_c0, f0_c1, f1_c1, f0_c2, f1_c2
        to:
        f0_c0, f0_c1, f0_c2, f1_c0, f1_c1, f1_c2
        """
        n_features = self.n_features_in_
        n_classes = len(self.classes_)

        encodings = []
        for i in range(n_classes):
            y_class = y[:, i]
            encoding = self._fit_encoding_binary_or_continuous(
                X_ordinal,
                y_class,
                n_categories,
                target_mean[i],
            )
            encodings.extend(encoding)

        reorder_index = (
            idx
            for start in range(n_features)
            for idx in range(start, (n_classes * n_features), n_features)
        )
        return [encodings[idx] for idx in reorder_index]

    def _transform_X_ordinal(
        self,
        X_out,
        X_ordinal,
        X_unknown_mask,
        row_indices,
        encodings,
        target_mean,
    ):
        """Transform X_ordinal using encodings.

        Only the rows selected by `row_indices` are written to `X_out`. Among
        those rows, entries flagged in `X_unknown_mask` receive `target_mean`
        instead of a category encoding. Restricting the fallback to
        `row_indices` matters for :meth:`fit_transform`: each held-out row must
        only see the target mean of its own training folds, not the mean of
        whichever fold happens to be processed last.

        In the multiclass case, `X_ordinal` and `X_unknown_mask` have column
        (axis=1) size `n_features`, while `encodings` has length of size
        `n_features * n_classes`. `feat_idx` deals with this by repeating
        feature indices by `n_classes` E.g., for 3 features, 2 classes:
        0,0,1,1,2,2

        Additionally, `target_mean` is of shape (`n_classes`,) so `mean_idx`
        cycles through 0 to `n_classes` - 1, `n_features` times.
        """
        if self.target_type_ == "multiclass":
            n_classes = len(self.classes_)
            for e_idx, encoding in enumerate(encodings):
                # Repeat feature indices by n_classes
                feat_idx = e_idx // n_classes
                # Cycle through each class
                mean_idx = e_idx % n_classes
                X_out[row_indices, e_idx] = np.where(
                    X_unknown_mask[row_indices, feat_idx],
                    target_mean[mean_idx],
                    encoding[X_ordinal[row_indices, feat_idx]],
                )
        else:
            for e_idx, encoding in enumerate(encodings):
                X_out[row_indices, e_idx] = np.where(
                    X_unknown_mask[row_indices, e_idx],
                    target_mean,
                    encoding[X_ordinal[row_indices, e_idx]],
                )

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input feature names, forwarded to `_check_feature_names_in`; the
            names it returns are the ones used to build the output.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names. `feature_names_in_` is used unless it is
            not defined, in which case the following input feature names are
            generated: `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            When `target_type_` is "multiclass" the names are of the format
            '<feature_name>_<class_name>'.
        """
        check_is_fitted(self, "n_features_in_")
        feature_names = _check_feature_names_in(self, input_features)
        if self.target_type_ == "multiclass":
            # Same ordering as `encodings_`: classes vary fastest within a feature.
            feature_names = [
                f"{feature_name}_{class_name}"
                for feature_name in feature_names
                for class_name in self.classes_
            ]
            return np.asarray(feature_names, dtype=object)
        else:
            return feature_names

    def _more_tags(self):
        return {
            "requires_y": True,
        }

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scikit_learn-1.4.dev0-py3.8-linux-x86_64.egg/sklearn/preprocessing/_target_encoder.py: 20%

111 statements