1from numbers import Integral, Real
2
3import numpy as np
4
5from ..base import OneToOneFeatureMixin, _fit_context
6from ..utils._param_validation import Interval, StrOptions
7from ..utils.multiclass import type_of_target
8from ..utils.validation import (
9 _check_feature_names_in,
10 _check_y,
11 check_consistent_length,
12 check_is_fitted,
13)
14from ._encoders import _BaseEncoder
15from ._target_encoder_fast import _fit_encoding_fast, _fit_encoding_fast_auto_smooth
16
17
class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder):
    """Target Encoder for regression and classification targets.

    Each category is encoded based on a shrunk estimate of the average target
    values for observations belonging to the category. The encoding scheme mixes
    the global target mean with the target mean conditioned on the value of the
    category (see [MIC]_).

    When the target type is "multiclass", encodings are based
    on the conditional probability estimate for each class. The target is first
    binarized using the "one-vs-all" scheme via
    :class:`~sklearn.preprocessing.LabelBinarizer`, then the average target
    value for each class and each category is used for encoding, resulting in
    `n_features` * `n_classes` encoded output features.

    :class:`TargetEncoder` considers missing values, such as `np.nan` or `None`,
    as another category and encodes them like any other category. Categories
    that are not seen during :meth:`fit` are encoded with the target mean, i.e.
    `target_mean_`.

    For a demo on the importance of the `TargetEncoder` internal cross-fitting,
    see
    :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder_cross_val.py`.
    For a comparison of different encoders, refer to
    :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`. Read
    more in the :ref:`User Guide <target_encoder>`.

    .. note::
        `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
        :term:`cross fitting` scheme is used in `fit_transform` for encoding.
        See the :ref:`User Guide <target_encoder>` for details.

    .. versionadded:: 1.3

    Parameters
    ----------
    categories : "auto" or list of shape (n_features,) of array-like, default="auto"
        Categories (unique values) per feature:

        - `"auto"` : Determine categories automatically from the training data.
        - list : `categories[i]` holds the categories expected in the i-th column. The
          passed categories should not mix strings and numeric values within a single
          feature, and should be sorted in case of numeric values.

        The used categories are stored in the `categories_` fitted attribute.

    target_type : {"auto", "continuous", "binary", "multiclass"}, default="auto"
        Type of target.

        - `"auto"` : Type of target is inferred with
          :func:`~sklearn.utils.multiclass.type_of_target`.
        - `"continuous"` : Continuous target
        - `"binary"` : Binary target
        - `"multiclass"` : Multiclass target

        .. note::
            The type of target inferred with `"auto"` may not be the desired target
            type used for modeling. For example, if the target consisted of integers
            between 0 and 100, then :func:`~sklearn.utils.multiclass.type_of_target`
            will infer the target as `"multiclass"`. In this case, setting
            `target_type="continuous"` will specify the target as a regression
            problem. The `target_type_` attribute gives the target type used by the
            encoder.

        .. versionchanged:: 1.4
           Added the option 'multiclass'.

    smooth : "auto" or float, default="auto"
        The amount of mixing of the target mean conditioned on the value of the
        category with the global target mean. A larger `smooth` value will put
        more weight on the global target mean.
        If `"auto"`, then `smooth` is set to an empirical Bayes estimate.

    cv : int, default=5
        Determines the number of folds in the :term:`cross fitting` strategy used in
        :meth:`fit_transform`. For classification targets, `StratifiedKFold` is used
        and for continuous targets, `KFold` is used.

    shuffle : bool, default=True
        Whether to shuffle the data in :meth:`fit_transform` before splitting into
        folds. Note that the samples within each split will not be shuffled.

    random_state : int, RandomState instance or None, default=None
        When `shuffle` is True, `random_state` affects the ordering of the
        indices, which controls the randomness of each fold. Otherwise, this
        parameter has no effect.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    encodings_ : list of shape (n_features,) or (n_features * n_classes) of \
                    ndarray
        Encodings learnt on all of `X`.
        For feature `i`, `encodings_[i]` are the encodings matching the
        categories listed in `categories_[i]`. When `target_type_` is
        "multiclass", the encoding for feature `i` and class `j` is stored in
        `encodings_[j + (i * len(classes_))]`. E.g., for 2 features (f) and
        3 classes (c), encodings are ordered:
        f0_c0, f0_c1, f0_c2, f1_c0, f1_c1, f1_c2,

    categories_ : list of shape (n_features,) of ndarray
        The categories of each input feature determined during fitting or
        specified in `categories`
        (in order of the features in `X` and corresponding with the output
        of :meth:`transform`).

    target_type_ : str
        Type of target.

    target_mean_ : float or ndarray of shape (n_classes,)
        The overall mean of the target, computed along the first axis of the
        encoded target (one value per class when `target_type_` is
        "multiclass"). This value is only used in :meth:`transform` to encode
        categories that were not seen during :meth:`fit`.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

    classes_ : ndarray or None
        If `target_type_` is 'binary' or 'multiclass', holds the label for each class,
        otherwise `None`.

    See Also
    --------
    OrdinalEncoder : Performs an ordinal (integer) encoding of the categorical features.
        Contrary to TargetEncoder, this encoding is not supervised. Treating the
        resulting encoding as a numerical feature therefore leads to arbitrarily
        ordered values and therefore typically leads to lower predictive performance
        when used as preprocessing for a classifier or regressor.
    OneHotEncoder : Performs a one-hot encoding of categorical features. This
        unsupervised encoding is better suited for low cardinality categorical
        variables as it generates one new feature per unique category.

    References
    ----------
    .. [MIC] :doi:`Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality
       categorical attributes in classification and prediction problems"
       SIGKDD Explor. Newsl. 3, 1 (July 2001), 27–32. <10.1145/507533.507538>`

    Examples
    --------
    With `smooth="auto"`, the smoothing parameter is set to an empirical Bayes estimate:

    >>> import numpy as np
    >>> from sklearn.preprocessing import TargetEncoder
    >>> X = np.array([["dog"] * 20 + ["cat"] * 30 + ["snake"] * 38], dtype=object).T
    >>> y = [90.3] * 5 + [80.1] * 15 + [20.4] * 5 + [20.1] * 25 + [21.2] * 8 + [49] * 30
    >>> enc_auto = TargetEncoder(smooth="auto")
    >>> X_trans = enc_auto.fit_transform(X, y)

    >>> # A high `smooth` parameter puts more weight on global mean on the categorical
    >>> # encodings:
    >>> enc_high_smooth = TargetEncoder(smooth=5000.0).fit(X, y)
    >>> enc_high_smooth.target_mean_
    44...
    >>> enc_high_smooth.encodings_
    [array([44..., 44..., 44...])]

    >>> # On the other hand, a low `smooth` parameter puts more weight on target
    >>> # conditioned on the value of the categorical:
    >>> enc_low_smooth = TargetEncoder(smooth=1.0).fit(X, y)
    >>> enc_low_smooth.encodings_
    [array([20..., 80..., 43...])]
    """

    _parameter_constraints: dict = {
        "categories": [StrOptions({"auto"}), list],
        "target_type": [StrOptions({"auto", "continuous", "binary", "multiclass"})],
        "smooth": [StrOptions({"auto"}), Interval(Real, 0, None, closed="left")],
        "cv": [Interval(Integral, 2, None, closed="left")],
        "shuffle": ["boolean"],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        categories="auto",
        target_type="auto",
        smooth="auto",
        cv=5,
        shuffle=True,
        random_state=None,
    ):
        # Hyper-parameters are stored verbatim; they are only interpreted when
        # the encoder is fitted.
        self.categories = categories
        self.smooth = smooth
        self.target_type = target_type
        self.cv = cv
        self.shuffle = shuffle
        self.random_state = random_state

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y):
        """Fit the :class:`TargetEncoder` to X and y.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to determine the categories of each feature.

        y : array-like of shape (n_samples,)
            The target data used to encode the categories.

        Returns
        -------
        self : object
            Fitted encoder.
        """
        self._fit_encodings_all(X, y)
        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def fit_transform(self, X, y):
        """Fit :class:`TargetEncoder` and transform X with the target encoding.

        .. note::
            `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
            :term:`cross fitting` scheme is used in `fit_transform` for encoding.
            See the :ref:`User Guide <target_encoder>` for details.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to determine the categories of each feature.

        y : array-like of shape (n_samples,)
            The target data used to encode the categories.

        Returns
        -------
        X_trans : ndarray of shape (n_samples, n_features) or \
                    (n_samples, (n_features * n_classes))
            Transformed input.
        """
        from ..model_selection import KFold, StratifiedKFold  # avoid circular import

        X_ordinal, X_known_mask, y_encoded, n_categories = self._fit_encodings_all(X, y)

        # The cv splitter is voluntarily restricted to *KFold to enforce non
        # overlapping validation folds, otherwise the fit_transform output will
        # not be well-specified.
        if self.target_type_ == "continuous":
            splitter_cls = KFold
        else:
            splitter_cls = StratifiedKFold

        # The *KFold splitters raise when a seed is given together with
        # `shuffle=False`. `random_state` is documented as having no effect in
        # that case, so only forward it when shuffling is requested.
        random_state = self.random_state if self.shuffle else None
        cv = splitter_cls(self.cv, shuffle=self.shuffle, random_state=random_state)

        if self.target_type_ == "multiclass":
            fit_encodings = self._fit_encoding_multiclass
        else:
            fit_encodings = self._fit_encoding_binary_or_continuous

        X_out = self._allocate_output(X_ordinal)
        X_unknown_mask = ~X_known_mask

        for train_idx, test_idx in cv.split(X, y):
            X_train, y_train = X_ordinal[train_idx, :], y_encoded[train_idx]
            y_train_mean = np.mean(y_train, axis=0)

            # Encodings are learnt on the training part of the fold only and
            # then written to the rows of the held-out part.
            encodings = fit_encodings(X_train, y_train, n_categories, y_train_mean)
            self._transform_X_ordinal(
                X_out,
                X_ordinal,
                X_unknown_mask,
                test_idx,
                encodings,
                y_train_mean,
            )
        return X_out

    def transform(self, X):
        """Transform X with the target encoding.

        .. note::
            `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
            :term:`cross fitting` scheme is used in `fit_transform` for encoding.
            See the :ref:`User Guide <target_encoder>` for details.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to encode.

        Returns
        -------
        X_trans : ndarray of shape (n_samples, n_features) or \
                    (n_samples, (n_features * n_classes))
            Transformed input.
        """
        # Fail early with a NotFittedError instead of an attribute lookup
        # failure deeper in the call chain.
        check_is_fitted(self)

        X_ordinal, X_known_mask = self._transform(
            X, handle_unknown="ignore", force_all_finite="allow-nan"
        )

        X_out = self._allocate_output(X_ordinal)
        self._transform_X_ordinal(
            X_out,
            X_ordinal,
            ~X_known_mask,
            slice(None),
            self.encodings_,
            self.target_mean_,
        )
        return X_out

    def _allocate_output(self, X_ordinal):
        """Allocate the uninitialized float64 output buffer for `X_ordinal`.

        In the multiclass case the number of columns is multiplied by the
        number of classes, otherwise the shape of `X_ordinal` is kept.
        """
        if self.target_type_ == "multiclass":
            n_samples, n_features = X_ordinal.shape
            return np.empty(
                (n_samples, n_features * len(self.classes_)), dtype=np.float64
            )
        return np.empty_like(X_ordinal, dtype=np.float64)

    def _fit_encodings_all(self, X, y):
        """Fit a target encoding with all the data.

        Sets `target_type_`, `classes_`, `target_mean_` and `encodings_` (and
        whatever `self._fit` sets) and returns
        `(X_ordinal, X_known_mask, y, n_categories)` where `y` is the encoded
        target and `n_categories` holds the number of categories per feature.
        """
        # avoid circular import
        from ..preprocessing import (
            LabelBinarizer,
            LabelEncoder,
        )

        check_consistent_length(X, y)
        self._fit(X, handle_unknown="ignore", force_all_finite="allow-nan")

        if self.target_type == "auto":
            accepted_target_types = ("binary", "multiclass", "continuous")
            inferred_type_of_target = type_of_target(y, input_name="y")
            if inferred_type_of_target not in accepted_target_types:
                raise ValueError(
                    "Unknown label type: Target type was inferred to be "
                    f"{inferred_type_of_target!r}. Only {accepted_target_types} are "
                    "supported."
                )
            self.target_type_ = inferred_type_of_target
        else:
            self.target_type_ = self.target_type

        self.classes_ = None
        if self.target_type_ == "binary":
            label_encoder = LabelEncoder()
            y = label_encoder.fit_transform(y)
            self.classes_ = label_encoder.classes_
        elif self.target_type_ == "multiclass":
            label_binarizer = LabelBinarizer()
            y = label_binarizer.fit_transform(y)
            self.classes_ = label_binarizer.classes_
        else:  # continuous
            y = _check_y(y, y_numeric=True, estimator=self)

        self.target_mean_ = np.mean(y, axis=0)

        X_ordinal, X_known_mask = self._transform(
            X, handle_unknown="ignore", force_all_finite="allow-nan"
        )
        n_categories = np.fromiter(
            (len(category_for_feature) for category_for_feature in self.categories_),
            dtype=np.int64,
            count=len(self.categories_),
        )

        if self.target_type_ == "multiclass":
            fit_encodings = self._fit_encoding_multiclass
        else:
            fit_encodings = self._fit_encoding_binary_or_continuous
        self.encodings_ = fit_encodings(X_ordinal, y, n_categories, self.target_mean_)

        return X_ordinal, X_known_mask, y, n_categories

    def _fit_encoding_binary_or_continuous(
        self, X_ordinal, y, n_categories, target_mean
    ):
        """Learn target encodings for a single (1d) target.

        Dispatches to the automatic-smoothing routine when `smooth="auto"`,
        which additionally receives the variance of `y`, and to the
        fixed-smoothing routine otherwise.
        """
        if self.smooth == "auto":
            y_variance = np.var(y)
            return _fit_encoding_fast_auto_smooth(
                X_ordinal,
                y,
                n_categories,
                target_mean,
                y_variance,
            )
        return _fit_encoding_fast(
            X_ordinal,
            y,
            n_categories,
            self.smooth,
            target_mean,
        )

    def _fit_encoding_multiclass(self, X_ordinal, y, n_categories, target_mean):
        """Learn multiclass encodings.

        Learn encodings for each class (c) then reorder encodings such that
        the same features (f) are grouped together, i.e. convert from:
        f0_c0, f1_c0, f0_c1, f1_c1, f0_c2, f1_c2
        to:
        f0_c0, f0_c1, f0_c2, f1_c0, f1_c1, f1_c2
        """
        n_features = self.n_features_in_
        n_classes = len(self.classes_)

        # One list of per-feature encodings for every binarized class column.
        per_class = [
            self._fit_encoding_binary_or_continuous(
                X_ordinal,
                y[:, class_idx],
                n_categories,
                target_mean[class_idx],
            )
            for class_idx in range(n_classes)
        ]

        # Feature-major ordering: all classes of feature 0, then feature 1, ...
        return [
            per_class[class_idx][feat_idx]
            for feat_idx in range(n_features)
            for class_idx in range(n_classes)
        ]

    def _transform_X_ordinal(
        self,
        X_out,
        X_ordinal,
        X_unknown_mask,
        row_indices,
        encodings,
        target_mean,
    ):
        """Transform X_ordinal using encodings, writing into `X_out` in place.

        Only the rows selected by `row_indices` are written: known categories
        receive their encoding and unknown categories receive `target_mean`.
        Restricting the unknown-category fallback to `row_indices` matters for
        cross fitting, where each fold must only be filled from statistics
        learnt on the other folds.

        In the multiclass case, `X_ordinal` and `X_unknown_mask` have column
        (axis=1) size `n_features`, while `encodings` has length of size
        `n_features * n_classes`. `feat_idx` deals with this by repeating
        feature indices by `n_classes` E.g., for 3 features, 2 classes:
        0,0,1,1,2,2

        Additionally, `target_mean` is of shape (`n_classes`,) so `mean_idx`
        cycles through 0 to `n_classes` - 1, `n_features` times.
        """
        is_multiclass = self.target_type_ == "multiclass"
        n_classes = len(self.classes_) if is_multiclass else 1

        for e_idx, encoding in enumerate(encodings):
            if is_multiclass:
                # Repeat feature indices by n_classes
                feat_idx = e_idx // n_classes
                # Cycle through each class
                mean_idx = e_idx % n_classes
                fallback = target_mean[mean_idx]
            else:
                feat_idx = e_idx
                fallback = target_mean

            # Integer-array indexing returns a fresh array, so the fallback can
            # be patched in before the column slice is written to `X_out`.
            encoded = encoding[X_ordinal[row_indices, feat_idx]]
            encoded[X_unknown_mask[row_indices, feat_idx]] = fallback
            X_out[row_indices, e_idx] = encoded

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input feature names. They are forwarded to the shared
            `_check_feature_names_in` helper, which resolves the names used
            as the basis of the output names.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names. `feature_names_in_` is used unless it is
            not defined, in which case the following input feature names are
            generated: `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            When `target_type_` is "multiclass" the names are of the format
            '<feature_name>_<class_name>'.
        """
        check_is_fitted(self, "n_features_in_")
        feature_names = _check_feature_names_in(self, input_features)
        if self.target_type_ != "multiclass":
            return feature_names

        # One output column per (feature, class) pair, grouped by feature to
        # match the ordering of `encodings_`.
        feature_names = [
            f"{feature_name}_{class_name}"
            for feature_name in feature_names
            for class_name in self.classes_
        ]
        return np.asarray(feature_names, dtype=object)

    def _more_tags(self):
        return {
            "requires_y": True,
        }