# Authors: Andreas Mueller <amueller@ais.uni-bonn.de>
#          Joris Van den Bossche <jorisvandenbossche@gmail.com>
# License: BSD 3 clause

import numbers
import warnings
from numbers import Integral

import numpy as np
from scipy import sparse

from ..base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin, _fit_context
from ..utils import _safe_indexing, check_array, is_scalar_nan
from ..utils._encode import _check_unknown, _encode, _get_counts, _unique
from ..utils._mask import _get_mask
from ..utils._param_validation import Interval, RealNotInt, StrOptions
from ..utils._set_output import _get_output_config
from ..utils.validation import _check_feature_names_in, check_is_fitted

__all__ = ["OneHotEncoder", "OrdinalEncoder"]


class _BaseEncoder(TransformerMixin, BaseEstimator):
    """
    Base class for encoders that includes the code to categorize and
    transform the input features.

    """

    def _check_X(self, X, force_all_finite=True):
        """
        Perform custom check_array:
        - convert list of strings to object dtype
        - check for missing values for object dtype data (check_array does
          not do that)
        - return list of features (arrays): this list of features is
          constructed feature by feature to preserve the data types
          of pandas DataFrame columns, as otherwise information is lost
          and cannot be used, e.g. for the `categories_` attribute.

        """
        if not (hasattr(X, "iloc") and getattr(X, "ndim", 0) == 2):
            # if not a dataframe, do normal check_array validation
            X_temp = check_array(X, dtype=None, force_all_finite=force_all_finite)
            if not hasattr(X, "dtype") and np.issubdtype(X_temp.dtype, np.str_):
                X = check_array(X, dtype=object, force_all_finite=force_all_finite)
            else:
                X = X_temp
            needs_validation = False
        else:
            # pandas dataframe, do validation later column by column, in order
            # to keep the dtype information to be used in the encoder.
            needs_validation = force_all_finite

        n_samples, n_features = X.shape
        X_columns = []

        for i in range(n_features):
            Xi = _safe_indexing(X, indices=i, axis=1)
            Xi = check_array(
                Xi, ensure_2d=False, dtype=None, force_all_finite=needs_validation
            )
            X_columns.append(Xi)

        return X_columns, n_samples, n_features

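    # Editorial note (a sketch, not executed by the library): for a
    # hypothetical pandas DataFrame with one object column and one integer
    # column, `_check_X` keeps the per-column dtypes that a plain
    # `check_array` call would collapse into a single object array:
    #
    #   df = pd.DataFrame({"g": ["a", "b"], "n": [1, 2]})
    #   X_list, n_samples, n_features = enc._check_X(df)
    #   [xi.dtype for xi in X_list]  # -> [dtype('O'), dtype('int64')]
    #   (n_samples, n_features)      # -> (2, 2)
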
    def _fit(
        self,
        X,
        handle_unknown="error",
        force_all_finite=True,
        return_counts=False,
        return_and_ignore_missing_for_infrequent=False,
    ):
        self._check_infrequent_enabled()
        self._check_n_features(X, reset=True)
        self._check_feature_names(X, reset=True)
        X_list, n_samples, n_features = self._check_X(
            X, force_all_finite=force_all_finite
        )
        self.n_features_in_ = n_features

        if self.categories != "auto":
            if len(self.categories) != n_features:
                raise ValueError(
                    "Shape mismatch: if categories is an array,"
                    " it has to be of shape (n_features,)."
                )

        self.categories_ = []
        category_counts = []
        compute_counts = return_counts or self._infrequent_enabled

        for i in range(n_features):
            Xi = X_list[i]

            if self.categories == "auto":
                result = _unique(Xi, return_counts=compute_counts)
                if compute_counts:
                    cats, counts = result
                    category_counts.append(counts)
                else:
                    cats = result
            else:
                if np.issubdtype(Xi.dtype, np.str_):
                    # Always convert string categories to objects to avoid
                    # unexpected string truncation for longer category labels
                    # passed in the constructor.
                    Xi_dtype = object
                else:
                    Xi_dtype = Xi.dtype

                cats = np.array(self.categories[i], dtype=Xi_dtype)
                if (
                    cats.dtype == object
                    and isinstance(cats[0], bytes)
                    and Xi.dtype.kind != "S"
                ):
                    msg = (
                        f"In column {i}, the predefined categories have type 'bytes'"
                        " which is incompatible with values of type"
                        f" '{type(Xi[0]).__name__}'."
                    )
                    raise ValueError(msg)

                # `nan` must be the last stated category
                for category in cats[:-1]:
                    if is_scalar_nan(category):
                        raise ValueError(
                            "Nan should be the last element in user"
                            f" provided categories, see categories {cats}"
                            f" in column #{i}"
                        )

                if cats.size != len(_unique(cats)):
                    msg = (
                        f"In column {i}, the predefined categories"
                        " contain duplicate elements."
                    )
                    raise ValueError(msg)

                if Xi.dtype.kind not in "OUS":
                    sorted_cats = np.sort(cats)
                    error_msg = (
                        "Unsorted categories are not supported for numerical categories"
                    )
                    # if there are nans, nan should be the last element
                    stop_idx = -1 if np.isnan(sorted_cats[-1]) else None
                    if np.any(sorted_cats[:stop_idx] != cats[:stop_idx]):
                        raise ValueError(error_msg)

                if handle_unknown == "error":
                    diff = _check_unknown(Xi, cats)
                    if diff:
                        msg = (
                            "Found unknown categories {0} in column {1}"
                            " during fit".format(diff, i)
                        )
                        raise ValueError(msg)
                if compute_counts:
                    category_counts.append(_get_counts(Xi, cats))

            self.categories_.append(cats)

        output = {"n_samples": n_samples}
        if return_counts:
            output["category_counts"] = category_counts

        missing_indices = {}
        if return_and_ignore_missing_for_infrequent:
            for feature_idx, categories_for_idx in enumerate(self.categories_):
                if is_scalar_nan(categories_for_idx[-1]):
                    # `nan` values can only be placed in the last position
                    missing_indices[feature_idx] = categories_for_idx.size - 1
            output["missing_indices"] = missing_indices

        if self._infrequent_enabled:
            self._fit_infrequent_category_mapping(
                n_samples,
                category_counts,
                missing_indices,
            )
        return output

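    # Editorial sketch of `_fit`'s return value (hypothetical call): fitting
    # X = [['Male', 1], ['Female', 3], ['Female', 2]] with
    # `return_counts=True` and `return_and_ignore_missing_for_infrequent=True`
    # would yield something like:
    #
    #   {"n_samples": 3,
    #    "category_counts": [array([2, 1]), array([1, 1, 1])],
    #    "missing_indices": {}}
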
    def _transform(
        self,
        X,
        handle_unknown="error",
        force_all_finite=True,
        warn_on_unknown=False,
        ignore_category_indices=None,
    ):
        X_list, n_samples, n_features = self._check_X(
            X, force_all_finite=force_all_finite
        )
        self._check_feature_names(X, reset=False)
        self._check_n_features(X, reset=False)

        X_int = np.zeros((n_samples, n_features), dtype=int)
        X_mask = np.ones((n_samples, n_features), dtype=bool)

        columns_with_unknown = []
        for i in range(n_features):
            Xi = X_list[i]
            diff, valid_mask = _check_unknown(Xi, self.categories_[i], return_mask=True)

            if not np.all(valid_mask):
                if handle_unknown == "error":
                    msg = (
                        "Found unknown categories {0} in column {1}"
                        " during transform".format(diff, i)
                    )
                    raise ValueError(msg)
                else:
                    if warn_on_unknown:
                        columns_with_unknown.append(i)
                    # Set the problematic rows to an acceptable value and
                    # continue. The rows are marked in `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    # cast Xi into the largest string type necessary
                    # to handle different lengths of numpy strings
                    if (
                        self.categories_[i].dtype.kind in ("U", "S")
                        and self.categories_[i].itemsize > Xi.itemsize
                    ):
                        Xi = Xi.astype(self.categories_[i].dtype)
                    elif self.categories_[i].dtype.kind == "O" and Xi.dtype.kind == "U":
                        # categories are objects and Xi are numpy strings.
                        # Cast Xi to an object dtype to prevent truncation
                        # when setting invalid values.
                        Xi = Xi.astype("O")
                    else:
                        Xi = Xi.copy()

                    Xi[~valid_mask] = self.categories_[i][0]
            # We use check_unknown=False, since _check_unknown was
            # already called above.
            X_int[:, i] = _encode(Xi, uniques=self.categories_[i], check_unknown=False)
        if columns_with_unknown:
            warnings.warn(
                (
                    "Found unknown categories in columns "
                    f"{columns_with_unknown} during transform. These "
                    "unknown categories will be encoded as all zeros"
                ),
                UserWarning,
            )

        self._map_infrequent_categories(X_int, X_mask, ignore_category_indices)
        return X_int, X_mask

    @property
    def infrequent_categories_(self):
        """Infrequent categories for each feature."""
        # raises an AttributeError if `_infrequent_indices` is not defined
        infrequent_indices = self._infrequent_indices
        return [
            None if indices is None else category[indices]
            for category, indices in zip(self.categories_, infrequent_indices)
        ]

    def _check_infrequent_enabled(self):
        """
        Check whether infrequent-category grouping is enabled and set the
        `_infrequent_enabled` attribute accordingly.
        This has to be called after parameter validation in the fit function.
        """
        max_categories = getattr(self, "max_categories", None)
        min_frequency = getattr(self, "min_frequency", None)
        self._infrequent_enabled = (
            max_categories is not None and max_categories >= 1
        ) or min_frequency is not None

    def _identify_infrequent(self, category_count, n_samples, col_idx):
        """Compute the infrequent indices.

        Parameters
        ----------
        category_count : ndarray of shape (n_cardinality,)
            Category counts.

        n_samples : int
            Number of samples.

        col_idx : int
            Index of the current feature. Only used for the error message.

        Returns
        -------
        output : ndarray of shape (n_infrequent_categories,) or None
            If there are infrequent categories, indices of infrequent
            categories. Otherwise None.
        """
        if isinstance(self.min_frequency, numbers.Integral):
            infrequent_mask = category_count < self.min_frequency
        elif isinstance(self.min_frequency, numbers.Real):
            min_frequency_abs = n_samples * self.min_frequency
            infrequent_mask = category_count < min_frequency_abs
        else:
            infrequent_mask = np.zeros(category_count.shape[0], dtype=bool)

        n_current_features = category_count.size - infrequent_mask.sum() + 1
        if self.max_categories is not None and self.max_categories < n_current_features:
            # max_categories includes the one infrequent category
            frequent_category_count = self.max_categories - 1
            if frequent_category_count == 0:
                # All categories are infrequent
                infrequent_mask[:] = True
            else:
                # stable sort to preserve original count order
                smallest_levels = np.argsort(category_count, kind="mergesort")[
                    :-frequent_category_count
                ]
                infrequent_mask[smallest_levels] = True

        output = np.flatnonzero(infrequent_mask)
        return output if output.size > 0 else None

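    # Worked example for `_identify_infrequent` (editorial sketch with
    # made-up counts): with `category_count = array([5, 20, 10, 3])`,
    # `n_samples=38` and `min_frequency=6`, counts 5 and 3 fall below the
    # threshold, so the method returns `array([0, 3])`. With
    # `max_categories=3` instead, one output slot is reserved for the grouped
    # infrequent category, the two smallest counts are pushed into it, and
    # the result is again `array([0, 3])`.
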
    def _fit_infrequent_category_mapping(
        self, n_samples, category_counts, missing_indices
    ):
        """Fit infrequent categories.

        Defines the private attribute: `_default_to_infrequent_mappings`. For
        feature `i`, `_default_to_infrequent_mappings[i]` defines the mapping
        from the integer encoding returned by `super().transform()` into
        infrequent categories. If `_default_to_infrequent_mappings[i]` is None,
        there were no infrequent categories in the training set.

        For example if categories 0, 2 and 4 were frequent, while categories
        1, 3, 5 were infrequent for feature 7, then these categories are mapped
        to a single output:
        `_default_to_infrequent_mappings[7] = array([0, 3, 1, 3, 2, 3])`

        Defines private attribute: `_infrequent_indices`. `_infrequent_indices[i]`
        is an array of indices such that
        `categories_[i][_infrequent_indices[i]]` are all the infrequent category
        labels. If the feature `i` has no infrequent categories
        `_infrequent_indices[i]` is None.

        .. versionadded:: 1.1

        Parameters
        ----------
        n_samples : int
            Number of samples in training set.
        category_counts : list of ndarray
            `category_counts[i]` is the category counts corresponding to
            `self.categories_[i]`.
        missing_indices : dict
            Dict mapping from feature_idx to category index with a missing value.
        """
        # Remove missing value from counts, so it is not considered as infrequent
        if missing_indices:
            category_counts_ = []
            for feature_idx, count in enumerate(category_counts):
                if feature_idx in missing_indices:
                    category_counts_.append(
                        np.delete(count, missing_indices[feature_idx])
                    )
                else:
                    category_counts_.append(count)
        else:
            category_counts_ = category_counts

        self._infrequent_indices = [
            self._identify_infrequent(category_count, n_samples, col_idx)
            for col_idx, category_count in enumerate(category_counts_)
        ]

        # compute mapping from default mapping to infrequent mapping
        self._default_to_infrequent_mappings = []

        for feature_idx, infreq_idx in enumerate(self._infrequent_indices):
            cats = self.categories_[feature_idx]
            # no infrequent categories
            if infreq_idx is None:
                self._default_to_infrequent_mappings.append(None)
                continue

            n_cats = len(cats)
            if feature_idx in missing_indices:
                # Missing index was removed from this category when computing
                # infrequent indices, thus we need to decrease the number of
                # total categories when considering the infrequent mapping.
                n_cats -= 1

            # infrequent indices exist
            mapping = np.empty(n_cats, dtype=np.int64)
            n_infrequent_cats = infreq_idx.size

            # infrequent categories are mapped to the last element.
            n_frequent_cats = n_cats - n_infrequent_cats
            mapping[infreq_idx] = n_frequent_cats

            frequent_indices = np.setdiff1d(np.arange(n_cats), infreq_idx)
            mapping[frequent_indices] = np.arange(n_frequent_cats)

            self._default_to_infrequent_mappings.append(mapping)

    def _map_infrequent_categories(self, X_int, X_mask, ignore_category_indices):
        """Map infrequent categories to the integer representing the infrequent
        category.

        This modifies X_int in-place. Values that were invalid based on `X_mask`
        are mapped to the infrequent category if there was an infrequent
        category for that feature.

        Parameters
        ----------
        X_int : ndarray of shape (n_samples, n_features)
            Integer encoded categories.

        X_mask : ndarray of shape (n_samples, n_features)
            Bool mask for valid values in `X_int`.

        ignore_category_indices : dict
            Dictionary mapping from feature_idx to category index to ignore.
            Ignored indices will not be grouped and the original ordinal encoding
            will remain.
        """
        if not self._infrequent_enabled:
            return

        ignore_category_indices = ignore_category_indices or {}

        for col_idx in range(X_int.shape[1]):
            infrequent_idx = self._infrequent_indices[col_idx]
            if infrequent_idx is None:
                continue

            X_int[~X_mask[:, col_idx], col_idx] = infrequent_idx[0]
            if self.handle_unknown == "infrequent_if_exist":
                # All the unknown values are now mapped to the
                # infrequent_idx[0], which makes the unknown values valid.
                # This is needed in `transform` when the encoding is formed
                # using `X_mask`.
                X_mask[:, col_idx] = True

        # Remaps encoding in `X_int` where the infrequent categories are
        # grouped together.
        for i, mapping in enumerate(self._default_to_infrequent_mappings):
            if mapping is None:
                continue

            if i in ignore_category_indices:
                # Update rows that are **not** ignored
                rows_to_update = X_int[:, i] != ignore_category_indices[i]
            else:
                rows_to_update = slice(None)

            X_int[rows_to_update, i] = np.take(mapping, X_int[rows_to_update, i])

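    # Editorial sketch of the remapping step above (hypothetical values): with
    # `mapping = np.array([0, 3, 1, 3, 2, 3])` (categories 1, 3 and 5
    # infrequent) and a column `X_int[:, i] = [0, 1, 5, 2]`,
    # `np.take(mapping, X_int[:, i])` yields `[0, 3, 3, 1]`, i.e. every
    # infrequent category collapses onto the shared code 3.
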
    def _more_tags(self):
        return {"X_types": ["2darray", "categorical"], "allow_nan": True}


class OneHotEncoder(_BaseEncoder):
    """
    Encode categorical features as a one-hot numeric array.

    The input to this transformer should be an array-like of integers or
    strings, denoting the values taken on by categorical (discrete) features.
    The features are encoded using a one-hot (aka 'one-of-K' or 'dummy')
    encoding scheme. This creates a binary column for each category and
    returns a sparse matrix or dense array (depending on the ``sparse_output``
    parameter).

    By default, the encoder derives the categories based on the unique values
    in each feature. Alternatively, you can also specify the `categories`
    manually.

    This encoding is needed for feeding categorical data to many scikit-learn
    estimators, notably linear models and SVMs with the standard kernels.

    Note: a one-hot encoding of y labels should use a LabelBinarizer
    instead.

    Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
    For a comparison of different encoders, refer to:
    :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`.

    Parameters
    ----------
    categories : 'auto' or a list of array-like, default='auto'
        Categories (unique values) per feature:

        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories should not mix strings and numeric
          values within a single feature, and should be sorted in case of
          numeric values.

        The used categories can be found in the ``categories_`` attribute.

        .. versionadded:: 0.20

    drop : {'first', 'if_binary'} or an array-like of shape (n_features,), \
            default=None
        Specifies a methodology to use to drop one of the categories per
        feature. This is useful in situations where perfectly collinear
        features cause problems, such as when feeding the resulting data
        into an unregularized linear regression model.

        However, dropping one category breaks the symmetry of the original
        representation and can therefore induce a bias in downstream models,
        for instance for penalized linear classification or regression models.

        - None : retain all features (the default).
        - 'first' : drop the first category in each feature. If only one
          category is present, the feature will be dropped entirely.
        - 'if_binary' : drop the first category in each feature with two
          categories. Features with 1 or more than 2 categories are
          left intact.
        - array : ``drop[i]`` is the category in feature ``X[:, i]`` that
          should be dropped.

        When `max_categories` or `min_frequency` is configured to group
        infrequent categories, the dropping behavior is handled after the
        grouping.

        .. versionadded:: 0.21
            The parameter `drop` was added in 0.21.

        .. versionchanged:: 0.23
            The option `drop='if_binary'` was added in 0.23.

        .. versionchanged:: 1.1
            Support for dropping infrequent categories.

    sparse_output : bool, default=True
        When ``True``, it returns a :class:`scipy.sparse.csr_matrix`,
        i.e. a sparse matrix in "Compressed Sparse Row" (CSR) format.

        .. versionadded:: 1.2
            `sparse` was renamed to `sparse_output`

    dtype : number type, default=np.float64
        Desired dtype of output.

    handle_unknown : {'error', 'ignore', 'infrequent_if_exist'}, \
            default='error'
        Specifies the way unknown categories are handled during :meth:`transform`.

        - 'error' : Raise an error if an unknown category is present during transform.
        - 'ignore' : When an unknown category is encountered during
          transform, the resulting one-hot encoded columns for this feature
          will be all zeros. In the inverse transform, an unknown category
          will be denoted as None.
        - 'infrequent_if_exist' : When an unknown category is encountered
          during transform, the resulting one-hot encoded columns for this
          feature will map to the infrequent category if it exists. The
          infrequent category will be mapped to the last position in the
          encoding. During inverse transform, an unknown category will be
          mapped to the category denoted `'infrequent'` if it exists. If the
          `'infrequent'` category does not exist, then :meth:`transform` and
          :meth:`inverse_transform` will handle an unknown category as with
          `handle_unknown='ignore'`. Infrequent categories exist based on
          `min_frequency` and `max_categories`. Read more in the
          :ref:`User Guide <encoder_infrequent_categories>`.

        .. versionchanged:: 1.1
            `'infrequent_if_exist'` was added to automatically handle unknown
            categories and infrequent categories.

    min_frequency : int or float, default=None
        Specifies the minimum frequency below which a category will be
        considered infrequent.

        - If `int`, categories with a smaller cardinality will be considered
          infrequent.

        - If `float`, categories with a smaller cardinality than
          `min_frequency * n_samples` will be considered infrequent.

        .. versionadded:: 1.1
            Read more in the :ref:`User Guide <encoder_infrequent_categories>`.

    max_categories : int, default=None
        Specifies an upper limit to the number of output features for each input
        feature when considering infrequent categories. If there are infrequent
        categories, `max_categories` includes the category representing the
        infrequent categories along with the frequent categories. If `None`,
        there is no limit to the number of output features.

        .. versionadded:: 1.1
            Read more in the :ref:`User Guide <encoder_infrequent_categories>`.

    feature_name_combiner : "concat" or callable, default="concat"
        Callable with signature `def callable(input_feature, category)` that returns a
        string. This is used to create feature names to be returned by
        :meth:`get_feature_names_out`.

        `"concat"` concatenates encoded feature name and category with
        `feature + "_" + str(category)`. E.g. feature X with values 1, 6, 7
        creates feature names `X_1, X_6, X_7`.

        .. versionadded:: 1.3

    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during fitting
        (in order of the features in X and corresponding with the output
        of ``transform``). This includes the category specified in ``drop``
        (if any).

    drop_idx_ : array of shape (n_features,)
        - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category
          to be dropped for each feature.
        - ``drop_idx_[i] = None`` if no category is to be dropped from the
          feature with index ``i``, e.g. when `drop='if_binary'` and the
          feature isn't binary.
        - ``drop_idx_ = None`` if all the transformed features will be
          retained.

        If infrequent categories are enabled by setting `min_frequency` or
        `max_categories` to a non-default value and `drop_idx_[i]` corresponds
        to an infrequent category, then the entire infrequent category is
        dropped.

        .. versionchanged:: 0.23
            Added the possibility to contain `None` values.

    infrequent_categories_ : list of ndarray
        Defined only if infrequent categories are enabled by setting
        `min_frequency` or `max_categories` to a non-default value.
        `infrequent_categories_[i]` are the infrequent categories for feature
        `i`. If the feature `i` has no infrequent categories
        `infrequent_categories_[i]` is None.

        .. versionadded:: 1.1

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 1.0

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    feature_name_combiner : callable or None
        Callable with signature `def callable(input_feature, category)` that returns a
        string. This is used to create feature names to be returned by
        :meth:`get_feature_names_out`.

        .. versionadded:: 1.3

    See Also
    --------
    OrdinalEncoder : Performs an ordinal (integer)
        encoding of the categorical features.
    TargetEncoder : Encodes categorical features using the target.
    sklearn.feature_extraction.DictVectorizer : Performs a one-hot encoding of
        dictionary items (also handles string-valued features).
    sklearn.feature_extraction.FeatureHasher : Performs an approximate one-hot
        encoding of dictionary items or strings.
    LabelBinarizer : Binarizes labels in a one-vs-all
        fashion.
    MultiLabelBinarizer : Transforms between iterable of
        iterables and a multilabel format, e.g. a (samples x classes) binary
        matrix indicating the presence of a class label.

    Examples
    --------
    Given a dataset with two features, we let the encoder find the unique
    values per feature and transform the data to a binary one-hot encoding.

    >>> from sklearn.preprocessing import OneHotEncoder

    One can discard categories not seen during `fit`:

    >>> enc = OneHotEncoder(handle_unknown='ignore')
    >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
    >>> enc.fit(X)
    OneHotEncoder(handle_unknown='ignore')
    >>> enc.categories_
    [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
    >>> enc.transform([['Female', 1], ['Male', 4]]).toarray()
    array([[1., 0., 1., 0., 0.],
           [0., 1., 0., 0., 0.]])
    >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])
    array([['Male', 1],
           [None, 2]], dtype=object)
    >>> enc.get_feature_names_out(['gender', 'group'])
    array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'], ...)

    One can always drop the first column for each feature:

    >>> drop_enc = OneHotEncoder(drop='first').fit(X)
    >>> drop_enc.categories_
    [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
    >>> drop_enc.transform([['Female', 1], ['Male', 2]]).toarray()
    array([[0., 0., 0.],
           [1., 1., 0.]])

    Or drop a column for features having only 2 categories:

    >>> drop_binary_enc = OneHotEncoder(drop='if_binary').fit(X)
    >>> drop_binary_enc.transform([['Female', 1], ['Male', 2]]).toarray()
    array([[0., 1., 0., 0.],
           [1., 0., 1., 0.]])

    One can change the way feature names are created.

    >>> def custom_combiner(feature, category):
    ...     return str(feature) + "_" + type(category).__name__ + "_" + str(category)
    >>> custom_fnames_enc = OneHotEncoder(feature_name_combiner=custom_combiner).fit(X)
    >>> custom_fnames_enc.get_feature_names_out()
    array(['x0_str_Female', 'x0_str_Male', 'x1_int_1', 'x1_int_2', 'x1_int_3'],
          dtype=object)

    Infrequent categories are enabled by setting `max_categories` or `min_frequency`.

    >>> import numpy as np
    >>> X = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object).T
    >>> ohe = OneHotEncoder(max_categories=3, sparse_output=False).fit(X)
    >>> ohe.infrequent_categories_
    [array(['a', 'd'], dtype=object)]
    >>> ohe.transform([["a"], ["b"]])
    array([[0., 0., 1.],
           [1., 0., 0.]])
    """

    _parameter_constraints: dict = {
        "categories": [StrOptions({"auto"}), list],
        "drop": [StrOptions({"first", "if_binary"}), "array-like", None],
        "dtype": "no_validation",  # validation delegated to numpy
        "handle_unknown": [StrOptions({"error", "ignore", "infrequent_if_exist"})],
        "max_categories": [Interval(Integral, 1, None, closed="left"), None],
        "min_frequency": [
            Interval(Integral, 1, None, closed="left"),
            Interval(RealNotInt, 0, 1, closed="neither"),
            None,
        ],
        "sparse_output": ["boolean"],
        "feature_name_combiner": [StrOptions({"concat"}), callable],
    }

    def __init__(
        self,
        *,
        categories="auto",
        drop=None,
        sparse_output=True,
        dtype=np.float64,
        handle_unknown="error",
        min_frequency=None,
        max_categories=None,
        feature_name_combiner="concat",
    ):
        self.categories = categories
        self.sparse_output = sparse_output
        self.dtype = dtype
        self.handle_unknown = handle_unknown
        self.drop = drop
        self.min_frequency = min_frequency
        self.max_categories = max_categories
        self.feature_name_combiner = feature_name_combiner

    def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx):
        """Convert `drop_idx` into the index for infrequent categories.

        If there are no infrequent categories, then `drop_idx` is
        returned. This method is called in `_set_drop_idx` when the `drop`
        parameter is an array-like.
        """
        if not self._infrequent_enabled:
            return drop_idx

        default_to_infrequent = self._default_to_infrequent_mappings[feature_idx]
        if default_to_infrequent is None:
            return drop_idx

        # Raise error when explicitly dropping a category that is infrequent
        infrequent_indices = self._infrequent_indices[feature_idx]
        if infrequent_indices is not None and drop_idx in infrequent_indices:
            categories = self.categories_[feature_idx]
            raise ValueError(
                f"Unable to drop category {categories[drop_idx].item()!r} from"
                f" feature {feature_idx} because it is infrequent"
            )
        return default_to_infrequent[drop_idx]

    def _set_drop_idx(self):
        """Compute the drop indices associated with `self.categories_`.

        If `self.drop` is:
        - `None`, No categories have been dropped.
        - `'first'`, All zeros to drop the first category.
        - `'if_binary'`, All zeros if the category is binary and `None`
          otherwise.
        - array-like, The indices of the categories that match the
          categories in `self.drop`. If the dropped category is an infrequent
          category, then the index for the infrequent category is used. This
          means that the entire infrequent category is dropped.

        This method defines a public `drop_idx_` and a private
        `_drop_idx_after_grouping`.

        - `drop_idx_`: Public facing API that references the drop category in
          `self.categories_`.
        - `_drop_idx_after_grouping`: Used internally to drop categories *after* the
          infrequent categories are grouped together.

        If there are no infrequent categories or drop is `None`, then
        `drop_idx_=_drop_idx_after_grouping`.
        """
        if self.drop is None:
            drop_idx_after_grouping = None
        elif isinstance(self.drop, str):
            if self.drop == "first":
                drop_idx_after_grouping = np.zeros(len(self.categories_), dtype=object)
            elif self.drop == "if_binary":
                n_features_out_no_drop = [len(cat) for cat in self.categories_]
                if self._infrequent_enabled:
                    for i, infreq_idx in enumerate(self._infrequent_indices):
                        if infreq_idx is None:
                            continue
                        n_features_out_no_drop[i] -= infreq_idx.size - 1

                drop_idx_after_grouping = np.array(
                    [
                        0 if n_features_out == 2 else None
                        for n_features_out in n_features_out_no_drop
                    ],
                    dtype=object,
                )

        else:
            drop_array = np.asarray(self.drop, dtype=object)
            droplen = len(drop_array)

            if droplen != len(self.categories_):
                msg = (
                    "`drop` should have length equal to the number "
                    "of features ({}), got {}"
                )
                raise ValueError(msg.format(len(self.categories_), droplen))
            missing_drops = []
            drop_indices = []
            for feature_idx, (drop_val, cat_list) in enumerate(
                zip(drop_array, self.categories_)
            ):
                if not is_scalar_nan(drop_val):
                    drop_idx = np.where(cat_list == drop_val)[0]
                    if drop_idx.size:  # found drop idx
                        drop_indices.append(
                            self._map_drop_idx_to_infrequent(feature_idx, drop_idx[0])
                        )
                    else:
                        missing_drops.append((feature_idx, drop_val))
                    continue

                # drop_val is nan, find nan in categories manually
                if is_scalar_nan(cat_list[-1]):
                    drop_indices.append(
                        self._map_drop_idx_to_infrequent(feature_idx, cat_list.size - 1)
                    )
                else:  # nan is missing
                    missing_drops.append((feature_idx, drop_val))

            if any(missing_drops):
                msg = (
                    "The following categories were supposed to be "
                    "dropped, but were not found in the training "
                    "data.\n{}".format(
                        "\n".join(
                            [
                                "Category: {}, Feature: {}".format(category, feature)
                                for feature, category in missing_drops
                            ]
                        )
                    )
                )
                raise ValueError(msg)
            drop_idx_after_grouping = np.array(drop_indices, dtype=object)

        # `_drop_idx_after_grouping` are the categories to drop *after* the infrequent
        # categories are grouped together. If needed, we remap `drop_idx` back
        # to the categories seen in `self.categories_`.
        self._drop_idx_after_grouping = drop_idx_after_grouping

        if not self._infrequent_enabled or drop_idx_after_grouping is None:
            self.drop_idx_ = self._drop_idx_after_grouping
        else:
            drop_idx_ = []
            for feature_idx, drop_idx in enumerate(drop_idx_after_grouping):
                default_to_infrequent = self._default_to_infrequent_mappings[
                    feature_idx
                ]
                if drop_idx is None or default_to_infrequent is None:
                    orig_drop_idx = drop_idx
                else:
                    orig_drop_idx = np.flatnonzero(default_to_infrequent == drop_idx)[0]

                drop_idx_.append(orig_drop_idx)

            self.drop_idx_ = np.asarray(drop_idx_, dtype=object)

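    # Editorial sketch (hypothetical values): if
    # `default_to_infrequent = np.array([0, 3, 1, 3, 2, 3])` and the grouped
    # code `drop_idx = 1` is dropped, then
    # `np.flatnonzero(default_to_infrequent == 1)[0] == 2`, so `drop_idx_`
    # reports index 2 of `categories_`: the original category that became
    # output column 1 after grouping.
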
    def _compute_transformed_categories(self, i, remove_dropped=True):
        """Compute the transformed categories used for column `i`.

        1. If there are infrequent categories, the category is named
           'infrequent_sklearn'.
        2. Dropped columns are removed when remove_dropped=True.
        """
        cats = self.categories_[i]

        if self._infrequent_enabled:
            infreq_map = self._default_to_infrequent_mappings[i]
            if infreq_map is not None:
                frequent_mask = infreq_map < infreq_map.max()
                infrequent_cat = "infrequent_sklearn"
                # infrequent category is always at the end
                cats = np.concatenate(
                    (cats[frequent_mask], np.array([infrequent_cat], dtype=object))
                )

        if remove_dropped:
            cats = self._remove_dropped_categories(cats, i)
        return cats

    def _remove_dropped_categories(self, categories, i):
        """Remove dropped categories."""
        if (
            self._drop_idx_after_grouping is not None
            and self._drop_idx_after_grouping[i] is not None
        ):
            return np.delete(categories, self._drop_idx_after_grouping[i])
        return categories

    def _compute_n_features_outs(self):
        """Compute the n_features_out for each input feature."""
        output = [len(cats) for cats in self.categories_]

        if self._drop_idx_after_grouping is not None:
            for i, drop_idx in enumerate(self._drop_idx_after_grouping):
                if drop_idx is not None:
                    output[i] -= 1

        if not self._infrequent_enabled:
            return output

        # infrequent is enabled, the number of features out are reduced
        # because the infrequent categories are grouped together
        for i, infreq_idx in enumerate(self._infrequent_indices):
            if infreq_idx is None:
                continue
            output[i] -= infreq_idx.size - 1

        return output

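    # Editorial sketch (hypothetical values): with `categories_` of sizes
    # [2, 3] and `drop='first'`, one output is removed per feature, giving
    # [1, 2]; if feature 1 additionally groups two infrequent categories
    # together, its count shrinks by `infreq_idx.size - 1 == 1`, giving
    # [1, 1].
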
    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """
        Fit OneHotEncoder to X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to determine the categories of each feature.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        Returns
        -------
        self
            Fitted encoder.
        """
        self._fit(
            X,
            handle_unknown=self.handle_unknown,
            force_all_finite="allow-nan",
        )
        self._set_drop_idx()
        self._n_features_outs = self._compute_n_features_outs()
        return self

    def transform(self, X):
        """
        Transform X using one-hot encoding.

        If `sparse_output=True` (default), it returns an instance of
        :class:`scipy.sparse.csr_matrix` (CSR format).

        If there are infrequent categories for a feature, set by specifying
        `max_categories` or `min_frequency`, the infrequent categories are
        grouped into a single category.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to encode.

        Returns
        -------
        X_out : {ndarray, sparse matrix} of shape \
                (n_samples, n_encoded_features)
            Transformed input. If `sparse_output=True`, a sparse matrix will be
            returned.
        """
        check_is_fitted(self)
        transform_output = _get_output_config("transform", estimator=self)["dense"]
        if transform_output != "default" and self.sparse_output:
            capitalize_transform_output = transform_output.capitalize()
            raise ValueError(
                f"{capitalize_transform_output} output does not support sparse data."
                f" Set sparse_output=False to output {transform_output} dataframes or"
                f" disable {capitalize_transform_output} output via"
                ' `ohe.set_output(transform="default")`.'
            )

        # validation of X happens in _check_X called by _transform
        warn_on_unknown = self.drop is not None and self.handle_unknown in {
            "ignore",
            "infrequent_if_exist",
        }
        X_int, X_mask = self._transform(
            X,
            handle_unknown=self.handle_unknown,
            force_all_finite="allow-nan",
            warn_on_unknown=warn_on_unknown,
        )

        n_samples, n_features = X_int.shape

        if self._drop_idx_after_grouping is not None:
            to_drop = self._drop_idx_after_grouping.copy()
            # We remove all the dropped categories from mask, and decrement all
            # categories that occur after them to avoid an empty column.
            keep_cells = X_int != to_drop
            for i, cats in enumerate(self.categories_):
                # drop='if_binary' but feature isn't binary
                if to_drop[i] is None:
                    # set to cardinality to not drop from X_int
                    to_drop[i] = len(cats)

            to_drop = to_drop.reshape(1, -1)
            X_int[X_int > to_drop] -= 1
            X_mask &= keep_cells

        mask = X_mask.ravel()
        feature_indices = np.cumsum([0] + self._n_features_outs)
        indices = (X_int + feature_indices[:-1]).ravel()[mask]

        indptr = np.empty(n_samples + 1, dtype=int)
        indptr[0] = 0
        np.sum(X_mask, axis=1, out=indptr[1:], dtype=indptr.dtype)
        np.cumsum(indptr[1:], out=indptr[1:])
        data = np.ones(indptr[-1])

        out = sparse.csr_matrix(
            (data, indices, indptr),
            shape=(n_samples, feature_indices[-1]),
            dtype=self.dtype,
        )
        if not self.sparse_output:
            return out.toarray()
        else:
            return out

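    # Editorial sketch of the CSR assembly above (hypothetical values): with
    # `_n_features_outs = [2, 3]`, `feature_indices = [0, 2, 5]`. A row with
    # `X_int = [1, 0]` and an all-True `X_mask` contributes column indices
    # `[1 + 0, 0 + 2] = [1, 2]`, and `indptr` grows by the number of valid
    # cells in that row (here 2), so each row of the (n_samples, 5) output
    # holds at most one 1 per input feature.
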
    def inverse_transform(self, X):
        """
        Convert the data back to the original representation.

        When unknown categories are encountered (all zeros in the
        one-hot encoding), ``None`` is used to represent this category. If the
        feature with the unknown category has a dropped category, the dropped
        category will be its inverse.

        For a given input feature, if there is an infrequent category,
        'infrequent_sklearn' will be used to represent the infrequent category.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape \
                (n_samples, n_encoded_features)
            The transformed data.

        Returns
        -------
        X_tr : ndarray of shape (n_samples, n_features)
            Inverse transformed array.
        """
        check_is_fitted(self)
        X = check_array(X, accept_sparse="csr")

        n_samples, _ = X.shape
        n_features = len(self.categories_)

        n_features_out = np.sum(self._n_features_outs)

        # validate shape of passed X
        msg = (
            "Shape of the passed X data is not correct. Expected {0} columns, got {1}."
        )
        if X.shape[1] != n_features_out:
            raise ValueError(msg.format(n_features_out, X.shape[1]))

        transformed_features = [
            self._compute_transformed_categories(i, remove_dropped=False)
            for i, _ in enumerate(self.categories_)
        ]

        # create resulting array of appropriate dtype
        dt = np.result_type(*[cat.dtype for cat in transformed_features])
        X_tr = np.empty((n_samples, n_features), dtype=dt)

        j = 0
        found_unknown = {}

        if self._infrequent_enabled:
            infrequent_indices = self._infrequent_indices
        else:
            infrequent_indices = [None] * n_features

        for i in range(n_features):
            cats_wo_dropped = self._remove_dropped_categories(
                transformed_features[i], i
            )
            n_categories = cats_wo_dropped.shape[0]

            # Only happens if there was a column with a unique
            # category. In this case we just fill the column with this
            # unique category value.
            if n_categories == 0:
                X_tr[:, i] = self.categories_[i][self._drop_idx_after_grouping[i]]
                j += n_categories
                continue
            sub = X[:, j : j + n_categories]
            # for sparse X argmax returns 2D matrix, ensure 1D array
            labels = np.asarray(sub.argmax(axis=1)).flatten()
            X_tr[:, i] = cats_wo_dropped[labels]

            if self.handle_unknown == "ignore" or (
                self.handle_unknown == "infrequent_if_exist"
                and infrequent_indices[i] is None
            ):
                unknown = np.asarray(sub.sum(axis=1) == 0).flatten()
                # ignored unknown categories: we have a row of all zero
                if unknown.any():
                    # if categories were dropped then unknown categories will
                    # be mapped to the dropped category
                    if (
                        self._drop_idx_after_grouping is None
                        or self._drop_idx_after_grouping[i] is None
                    ):
                        found_unknown[i] = unknown
                    else:
                        X_tr[unknown, i] = self.categories_[i][
                            self._drop_idx_after_grouping[i]
                        ]
            else:
                dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
                if dropped.any():
                    if self._drop_idx_after_grouping is None:
                        all_zero_samples = np.flatnonzero(dropped)
                        raise ValueError(
                            f"Samples {all_zero_samples} can not be inverted "
                            "when drop=None and handle_unknown='error' "
                            "because they contain all zeros"
                        )
                    # we can safely assume that all of the nulls in each column
                    # are the dropped value
                    drop_idx = self._drop_idx_after_grouping[i]
                    X_tr[dropped, i] = transformed_features[i][drop_idx]

            j += n_categories

        # if ignored are found: potentially need to upcast result to
        # insert None values
        if found_unknown:
            if X_tr.dtype != object:
                X_tr = X_tr.astype(object)

            for idx, mask in found_unknown.items():
                X_tr[mask, idx] = None

        return X_tr

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self)
        input_features = _check_feature_names_in(self, input_features)
        cats = [
            self._compute_transformed_categories(i)
            for i, _ in enumerate(self.categories_)
        ]

        name_combiner = self._check_get_feature_name_combiner()
        feature_names = []
        for i in range(len(cats)):
            names = [name_combiner(input_features[i], t) for t in cats[i]]
            feature_names.extend(names)

        return np.array(feature_names, dtype=object)

    def _check_get_feature_name_combiner(self):
        if self.feature_name_combiner == "concat":
            return lambda feature, category: feature + "_" + str(category)
        else:  # callable
            dry_run_combiner = self.feature_name_combiner("feature", "category")
            if not isinstance(dry_run_combiner, str):
                raise TypeError(
                    "When `feature_name_combiner` is a callable, it should return a "
                    f"Python string. Got {type(dry_run_combiner)} instead."
                )
            return self.feature_name_combiner


class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder):
    """
    Encode categorical features as an integer array.

    The input to this transformer should be an array-like of integers or
    strings, denoting the values taken on by categorical (discrete) features.
    The features are converted to ordinal integers. This results in
    a single column of integers (0 to n_categories - 1) per feature.

    Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
    For a comparison of different encoders, refer to:
    :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`.

    .. versionadded:: 0.20

    Parameters
    ----------
    categories : 'auto' or a list of array-like, default='auto'
        Categories (unique values) per feature:

        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories should not mix strings and numeric
          values, and should be sorted in case of numeric values.

        The used categories can be found in the ``categories_`` attribute.

    dtype : number type, default=np.float64
        Desired dtype of output.

    handle_unknown : {'error', 'use_encoded_value'}, default='error'
        When set to 'error' an error will be raised in case an unknown
        categorical feature is present during transform. When set to
        'use_encoded_value', the encoded value of unknown categories will be
        set to the value given for the parameter `unknown_value`. In
        :meth:`inverse_transform`, an unknown category will be denoted as None.

        .. versionadded:: 0.24

    unknown_value : int or np.nan, default=None
        When the parameter handle_unknown is set to 'use_encoded_value', this
        parameter is required and will set the encoded value of unknown
        categories. It has to be distinct from the values used to encode any of
        the categories in `fit`. If set to np.nan, the `dtype` parameter must
        be a float dtype.

        .. versionadded:: 0.24

    encoded_missing_value : int or np.nan, default=np.nan
        Encoded value of missing categories. If set to `np.nan`, then the `dtype`
        parameter must be a float dtype.

        .. versionadded:: 1.1

    min_frequency : int or float, default=None
        Specifies the minimum frequency below which a category will be
        considered infrequent.

        - If `int`, categories with a smaller cardinality will be considered
          infrequent.

        - If `float`, categories with a smaller cardinality than
          `min_frequency * n_samples` will be considered infrequent.

        .. versionadded:: 1.3
            Read more in the :ref:`User Guide <encoder_infrequent_categories>`.

    max_categories : int, default=None
        Specifies an upper limit to the number of output categories for each input
        feature when considering infrequent categories. If there are infrequent
        categories, `max_categories` includes the category representing the
        infrequent categories along with the frequent categories. If `None`,
        there is no limit to the number of output features.

        `max_categories` does **not** take into account missing or unknown
        categories. Setting `unknown_value` or `encoded_missing_value` to an
        integer will increase the number of unique integer codes by one each.
        This can result in up to `max_categories + 2` integer codes.

        .. versionadded:: 1.3
            Read more in the :ref:`User Guide <encoder_infrequent_categories>`.

    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during ``fit`` (in order of
        the features in X and corresponding with the output of ``transform``).
        This does not include categories that weren't seen during ``fit``.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 1.0

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    infrequent_categories_ : list of ndarray
        Defined only if infrequent categories are enabled by setting
        `min_frequency` or `max_categories` to a non-default value.
        `infrequent_categories_[i]` are the infrequent categories for feature
        `i`. If the feature `i` has no infrequent categories
        `infrequent_categories_[i]` is None.

        .. versionadded:: 1.3

    See Also
    --------
    OneHotEncoder : Performs a one-hot encoding of categorical features. This encoding
        is suitable for low to medium cardinality categorical variables, both in
        supervised and unsupervised settings.
    TargetEncoder : Encodes categorical features using supervised signal
        in a classification or regression pipeline. This encoding is typically
        suitable for high cardinality categorical variables.
    LabelEncoder : Encodes target labels with values between 0 and
        ``n_classes-1``.

    Notes
    -----
    With a high proportion of `nan` values, inferring categories becomes slow with
    Python versions before 3.10. The handling of `nan` values was improved
    from Python 3.10 onwards, (c.f.
    `bpo-43475 <https://github.com/python/cpython/issues/87641>`_).

    Examples
    --------
    Given a dataset with two features, we let the encoder find the unique
    values per feature and transform the data to an ordinal encoding.

    >>> from sklearn.preprocessing import OrdinalEncoder
    >>> enc = OrdinalEncoder()
    >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
    >>> enc.fit(X)
    OrdinalEncoder()
    >>> enc.categories_
    [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
    >>> enc.transform([['Female', 3], ['Male', 1]])
    array([[0., 2.],
           [1., 0.]])

    >>> enc.inverse_transform([[1, 0], [0, 1]])
    array([['Male', 1],
           ['Female', 2]], dtype=object)

    By default, :class:`OrdinalEncoder` is lenient towards missing values by
    propagating them.

    >>> import numpy as np
    >>> X = [['Male', 1], ['Female', 3], ['Female', np.nan]]
    >>> enc.fit_transform(X)
    array([[ 1.,  0.],
           [ 0.,  1.],
           [ 0., nan]])

    You can use the parameter `encoded_missing_value` to encode missing values.

    >>> enc.set_params(encoded_missing_value=-1).fit_transform(X)
    array([[ 1.,  0.],
           [ 0.,  1.],
           [ 0., -1.]])

    Infrequent categories are enabled by setting `max_categories` or `min_frequency`.
    In the following example, "a" and "d" are considered infrequent and grouped
    together into a single category, "b" and "c" are their own categories, unknown
    values are encoded as 3 and missing values are encoded as 4.

    >>> X_train = np.array(
    ...     [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3 + [np.nan]],
    ...     dtype=object).T
    >>> enc = OrdinalEncoder(
    ...     handle_unknown="use_encoded_value", unknown_value=3,
    ...     max_categories=3, encoded_missing_value=4)
    >>> _ = enc.fit(X_train)
    >>> X_test = np.array([["a"], ["b"], ["c"], ["d"], ["e"], [np.nan]], dtype=object)
    >>> enc.transform(X_test)
    array([[2.],
           [0.],
           [1.],
           [2.],
           [3.],
           [4.]])
    """

    _parameter_constraints: dict = {
        "categories": [StrOptions({"auto"}), list],
        "dtype": "no_validation",  # validation delegated to numpy
        "encoded_missing_value": [Integral, type(np.nan)],
        "handle_unknown": [StrOptions({"error", "use_encoded_value"})],
        "unknown_value": [Integral, type(np.nan), None],
        "max_categories": [Interval(Integral, 1, None, closed="left"), None],
        "min_frequency": [
            Interval(Integral, 1, None, closed="left"),
            Interval(RealNotInt, 0, 1, closed="neither"),
            None,
        ],
    }

    def __init__(
        self,
        *,
        categories="auto",
        dtype=np.float64,
        handle_unknown="error",
        unknown_value=None,
        encoded_missing_value=np.nan,
        min_frequency=None,
        max_categories=None,
    ):
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown
        self.unknown_value = unknown_value
        self.encoded_missing_value = encoded_missing_value
        self.min_frequency = min_frequency
        self.max_categories = max_categories

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """
        Fit the OrdinalEncoder to X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to determine the categories of each feature.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        Returns
        -------
        self : object
            Fitted encoder.
        """
        if self.handle_unknown == "use_encoded_value":
            if is_scalar_nan(self.unknown_value):
                if np.dtype(self.dtype).kind != "f":
                    raise ValueError(
                        "When unknown_value is np.nan, the dtype "
                        "parameter should be "
                        f"a float dtype. Got {self.dtype}."
                    )
            elif not isinstance(self.unknown_value, numbers.Integral):
                raise TypeError(
                    "unknown_value should be an integer or "
                    "np.nan when "
                    "handle_unknown is 'use_encoded_value', "
                    f"got {self.unknown_value}."
                )
        elif self.unknown_value is not None:
            raise TypeError(
                "unknown_value should only be set when "
                "handle_unknown is 'use_encoded_value', "
                f"got {self.unknown_value}."
            )

        # `_fit` will only raise an error when `self.handle_unknown="error"`
        fit_results = self._fit(
            X,
            handle_unknown=self.handle_unknown,
            force_all_finite="allow-nan",
            return_and_ignore_missing_for_infrequent=True,
        )
        self._missing_indices = fit_results["missing_indices"]

        cardinalities = [len(categories) for categories in self.categories_]
        if self._infrequent_enabled:
            # Cardinality decreases because the infrequent categories are grouped
            # together
            for feature_idx, infrequent in enumerate(self.infrequent_categories_):
                if infrequent is not None:
                    cardinalities[feature_idx] -= len(infrequent)

        # missing values are not considered part of the cardinality
        # when considering unknown categories or encoded_missing_value
        for cat_idx, categories_for_idx in enumerate(self.categories_):
            if is_scalar_nan(categories_for_idx[-1]):
                cardinalities[cat_idx] -= 1

        if self.handle_unknown == "use_encoded_value":
            for cardinality in cardinalities:
                if 0 <= self.unknown_value < cardinality:
                    raise ValueError(
                        "The used value for unknown_value "
                        f"{self.unknown_value} is one of the "
                        "values already used for encoding the "
                        "seen categories."
                    )

        if self._missing_indices:
            if np.dtype(self.dtype).kind != "f" and is_scalar_nan(
                self.encoded_missing_value
            ):
                raise ValueError(
                    "There are missing values in features "
                    f"{list(self._missing_indices)}. For OrdinalEncoder to "
                    f"encode missing values with dtype: {self.dtype}, set "
                    "encoded_missing_value to a non-nan value, or "
                    "set dtype to a float"
                )

            if not is_scalar_nan(self.encoded_missing_value):
                # Features are invalid when they contain a missing category
                # and encoded_missing_value was already used to encode a
                # known category
                invalid_features = [
                    cat_idx
                    for cat_idx, cardinality in enumerate(cardinalities)
                    if cat_idx in self._missing_indices
                    and 0 <= self.encoded_missing_value < cardinality
                ]

                if invalid_features:
                    # Use feature names if they are available
                    if hasattr(self, "feature_names_in_"):
                        invalid_features = self.feature_names_in_[invalid_features]
                    raise ValueError(
                        f"encoded_missing_value ({self.encoded_missing_value}) "
                        "is already used to encode a known category in features: "
                        f"{invalid_features}"
                    )

        return self

    def transform(self, X):
        """
        Transform X to ordinal codes.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to encode.

        Returns
        -------
        X_out : ndarray of shape (n_samples, n_features)
            Transformed input.
        """
        check_is_fitted(self, "categories_")
        X_int, X_mask = self._transform(
            X,
            handle_unknown=self.handle_unknown,
            force_all_finite="allow-nan",
            ignore_category_indices=self._missing_indices,
        )
        X_trans = X_int.astype(self.dtype, copy=False)

        for cat_idx, missing_idx in self._missing_indices.items():
            X_missing_mask = X_int[:, cat_idx] == missing_idx
            X_trans[X_missing_mask, cat_idx] = self.encoded_missing_value

        # create separate category for unknown values
        if self.handle_unknown == "use_encoded_value":
            X_trans[~X_mask] = self.unknown_value
        return X_trans

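    # Editorial sketch (hypothetical values): with categories ['a', 'b', nan]
    # for feature 0, `_missing_indices = {0: 2}`, `encoded_missing_value=-1`
    # and `unknown_value=9`, a column `X_int[:, 0] = [0, 2, 1]` with
    # `X_mask[:, 0] = [True, True, False]` becomes `[0., -1., 9.]`: the
    # missing code 2 is rewritten first, then the masked-out unknown.
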
    def inverse_transform(self, X):
        """
        Convert the data back to the original representation.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_encoded_features)
            The transformed data.

        Returns
        -------
        X_tr : ndarray of shape (n_samples, n_features)
            Inverse transformed array.
        """
        check_is_fitted(self)
        X = check_array(X, force_all_finite="allow-nan")

        n_samples, _ = X.shape
        n_features = len(self.categories_)

        # validate shape of passed X
        msg = (
            "Shape of the passed X data is not correct. Expected {0} columns, got {1}."
        )
        if X.shape[1] != n_features:
            raise ValueError(msg.format(n_features, X.shape[1]))

        # create resulting array of appropriate dtype
        dt = np.result_type(*[cat.dtype for cat in self.categories_])
        X_tr = np.empty((n_samples, n_features), dtype=dt)

        found_unknown = {}
        infrequent_masks = {}

        infrequent_indices = getattr(self, "_infrequent_indices", None)

        for i in range(n_features):
            labels = X[:, i]

            # replace values of X[:, i] that were nan with actual indices
            if i in self._missing_indices:
                X_i_mask = _get_mask(labels, self.encoded_missing_value)
                labels[X_i_mask] = self._missing_indices[i]

            rows_to_update = slice(None)
            categories = self.categories_[i]

            if infrequent_indices is not None and infrequent_indices[i] is not None:
                # Compute mask for frequent categories
                infrequent_encoding_value = len(categories) - len(infrequent_indices[i])
                infrequent_masks[i] = labels == infrequent_encoding_value
                rows_to_update = ~infrequent_masks[i]

                # Remap categories to be only frequent categories. The infrequent
                # categories will be mapped to "infrequent_sklearn" later
                frequent_categories_mask = np.ones_like(categories, dtype=bool)
                frequent_categories_mask[infrequent_indices[i]] = False
                categories = categories[frequent_categories_mask]

            if self.handle_unknown == "use_encoded_value":
                unknown_labels = _get_mask(labels, self.unknown_value)
                found_unknown[i] = unknown_labels

                known_labels = ~unknown_labels
                if isinstance(rows_to_update, np.ndarray):
                    rows_to_update &= known_labels
                else:
                    rows_to_update = known_labels

            labels_int = labels[rows_to_update].astype("int64", copy=False)
            X_tr[rows_to_update, i] = categories[labels_int]

        if found_unknown or infrequent_masks:
            X_tr = X_tr.astype(object, copy=False)

        # insert None values for unknown values
        if found_unknown:
            for idx, mask in found_unknown.items():
                X_tr[mask, idx] = None

        if infrequent_masks:
            for idx, mask in infrequent_masks.items():
                X_tr[mask, idx] = "infrequent_sklearn"

        return X_tr