# Authors: Andreas Mueller <amueller@ais.uni-bonn.de>
#          Joris Van den Bossche <jorisvandenbossche@gmail.com>
# License: BSD 3 clause

import numbers
import warnings
from numbers import Integral

import numpy as np
from scipy import sparse

from ..base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin, _fit_context
from ..utils import _safe_indexing, check_array, is_scalar_nan
from ..utils._encode import _check_unknown, _encode, _get_counts, _unique
from ..utils._mask import _get_mask
from ..utils._param_validation import Interval, RealNotInt, StrOptions
from ..utils._set_output import _get_output_config
from ..utils.validation import _check_feature_names_in, check_is_fitted

__all__ = ["OneHotEncoder", "OrdinalEncoder"]


class _BaseEncoder(TransformerMixin, BaseEstimator):
    """
    Base class for encoders that includes the code to categorize and
    transform the input features.

    """

    def _check_X(self, X, force_all_finite=True):
        """
        Perform custom check_array:
        - convert list of strings to object dtype
        - check for missing values for object dtype data (check_array does
          not do that)
        - return list of features (arrays): this list of features is
          constructed feature by feature to preserve the data types
          of pandas DataFrame columns, as otherwise information is lost
          and cannot be used, e.g. for the `categories_` attribute.

        """
        if not (hasattr(X, "iloc") and getattr(X, "ndim", 0) == 2):
            # if not a dataframe, do normal check_array validation
            X_temp = check_array(X, dtype=None, force_all_finite=force_all_finite)
            if not hasattr(X, "dtype") and np.issubdtype(X_temp.dtype, np.str_):
                X = check_array(X, dtype=object, force_all_finite=force_all_finite)
            else:
                X = X_temp
            needs_validation = False
        else:
            # pandas dataframe, do validation later column by column, in order
            # to keep the dtype information to be used in the encoder.
            needs_validation = force_all_finite

        n_samples, n_features = X.shape
        X_columns = []

        for i in range(n_features):
            Xi = _safe_indexing(X, indices=i, axis=1)
            Xi = check_array(
                Xi, ensure_2d=False, dtype=None, force_all_finite=needs_validation
            )
            X_columns.append(Xi)

        return X_columns, n_samples, n_features

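    # Editorial note (a sketch, not executed by the library): for a
    # hypothetical pandas DataFrame with one object column and one integer
    # column, `_check_X` keeps the per-column dtypes that a plain
    # `check_array` call would collapse into a single object array:
    #
    #   df = pd.DataFrame({"g": ["a", "b"], "n": [1, 2]})
    #   X_list, n_samples, n_features = enc._check_X(df)
    #   [xi.dtype for xi in X_list]  # -> [dtype('O'), dtype('int64')]
    #   (n_samples, n_features)      # -> (2, 2)
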
    def _fit(
        self,
        X,
        handle_unknown="error",
        force_all_finite=True,
        return_counts=False,
        return_and_ignore_missing_for_infrequent=False,
    ):
        self._check_infrequent_enabled()
        self._check_n_features(X, reset=True)
        self._check_feature_names(X, reset=True)
        X_list, n_samples, n_features = self._check_X(
            X, force_all_finite=force_all_finite
        )
        self.n_features_in_ = n_features

        if self.categories != "auto":
            if len(self.categories) != n_features:
                raise ValueError(
                    "Shape mismatch: if categories is an array,"
                    " it has to be of shape (n_features,)."
                )

        self.categories_ = []
        category_counts = []
        compute_counts = return_counts or self._infrequent_enabled

        for i in range(n_features):
            Xi = X_list[i]

            if self.categories == "auto":
                result = _unique(Xi, return_counts=compute_counts)
                if compute_counts:
                    cats, counts = result
                    category_counts.append(counts)
                else:
                    cats = result
            else:
                if np.issubdtype(Xi.dtype, np.str_):
                    # Always convert string categories to objects to avoid
                    # unexpected string truncation for longer category labels
                    # passed in the constructor.
                    Xi_dtype = object
                else:
                    Xi_dtype = Xi.dtype

                cats = np.array(self.categories[i], dtype=Xi_dtype)
                if (
                    cats.dtype == object
                    and isinstance(cats[0], bytes)
                    and Xi.dtype.kind != "S"
                ):
                    msg = (
                        f"In column {i}, the predefined categories have type 'bytes'"
                        " which is incompatible with values of type"
                        f" '{type(Xi[0]).__name__}'."
                    )
                    raise ValueError(msg)

                # `nan` must be the last stated category
                for category in cats[:-1]:
                    if is_scalar_nan(category):
                        raise ValueError(
                            "Nan should be the last element in user"
                            f" provided categories, see categories {cats}"
                            f" in column #{i}"
                        )

                if cats.size != len(_unique(cats)):
                    msg = (
                        f"In column {i}, the predefined categories"
                        " contain duplicate elements."
                    )
                    raise ValueError(msg)

                if Xi.dtype.kind not in "OUS":
                    sorted_cats = np.sort(cats)
                    error_msg = (
                        "Unsorted categories are not supported for numerical categories"
                    )
                    # if there are nans, nan should be the last element
                    stop_idx = -1 if np.isnan(sorted_cats[-1]) else None
                    if np.any(sorted_cats[:stop_idx] != cats[:stop_idx]):
                        raise ValueError(error_msg)

                if handle_unknown == "error":
                    diff = _check_unknown(Xi, cats)
                    if diff:
                        msg = (
                            "Found unknown categories {0} in column {1}"
                            " during fit".format(diff, i)
                        )
                        raise ValueError(msg)
                if compute_counts:
                    category_counts.append(_get_counts(Xi, cats))

            self.categories_.append(cats)

        output = {"n_samples": n_samples}
        if return_counts:
            output["category_counts"] = category_counts

        missing_indices = {}
        if return_and_ignore_missing_for_infrequent:
            for feature_idx, categories_for_idx in enumerate(self.categories_):
                if is_scalar_nan(categories_for_idx[-1]):
                    # `nan` values can only be placed in the last position
                    missing_indices[feature_idx] = categories_for_idx.size - 1
            output["missing_indices"] = missing_indices

        if self._infrequent_enabled:
            self._fit_infrequent_category_mapping(
                n_samples,
                category_counts,
                missing_indices,
            )
        return output

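    # Editorial sketch of `_fit`'s return value (hypothetical call): fitting
    # X = [['Male', 1], ['Female', 3], ['Female', 2]] with
    # `return_counts=True` and `return_and_ignore_missing_for_infrequent=True`
    # would yield something like:
    #
    #   {"n_samples": 3,
    #    "category_counts": [array([2, 1]), array([1, 1, 1])],
    #    "missing_indices": {}}
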
    def _transform(
        self,
        X,
        handle_unknown="error",
        force_all_finite=True,
        warn_on_unknown=False,
        ignore_category_indices=None,
    ):
        X_list, n_samples, n_features = self._check_X(
            X, force_all_finite=force_all_finite
        )
        self._check_feature_names(X, reset=False)
        self._check_n_features(X, reset=False)

        X_int = np.zeros((n_samples, n_features), dtype=int)
        X_mask = np.ones((n_samples, n_features), dtype=bool)

        columns_with_unknown = []
        for i in range(n_features):
            Xi = X_list[i]
            diff, valid_mask = _check_unknown(Xi, self.categories_[i], return_mask=True)

            if not np.all(valid_mask):
                if handle_unknown == "error":
                    msg = (
                        "Found unknown categories {0} in column {1}"
                        " during transform".format(diff, i)
                    )
                    raise ValueError(msg)
                else:
                    if warn_on_unknown:
                        columns_with_unknown.append(i)
                    # Set the problematic rows to an acceptable value and
                    # continue. The rows are marked in `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    # cast Xi into the largest string type necessary
                    # to handle different lengths of numpy strings
                    if (
                        self.categories_[i].dtype.kind in ("U", "S")
                        and self.categories_[i].itemsize > Xi.itemsize
                    ):
                        Xi = Xi.astype(self.categories_[i].dtype)
                    elif self.categories_[i].dtype.kind == "O" and Xi.dtype.kind == "U":
                        # categories are objects and Xi are numpy strings.
                        # Cast Xi to an object dtype to prevent truncation
                        # when setting invalid values.
                        Xi = Xi.astype("O")
                    else:
                        Xi = Xi.copy()

                    Xi[~valid_mask] = self.categories_[i][0]
            # We use check_unknown=False, since _check_unknown was
            # already called above.
            X_int[:, i] = _encode(Xi, uniques=self.categories_[i], check_unknown=False)
        if columns_with_unknown:
            warnings.warn(
                (
                    "Found unknown categories in columns "
                    f"{columns_with_unknown} during transform. These "
                    "unknown categories will be encoded as all zeros"
                ),
                UserWarning,
            )

        self._map_infrequent_categories(X_int, X_mask, ignore_category_indices)
        return X_int, X_mask

    @property
    def infrequent_categories_(self):
        """Infrequent categories for each feature."""
        # raises an AttributeError if `_infrequent_indices` is not defined
        infrequent_indices = self._infrequent_indices
        return [
            None if indices is None else category[indices]
            for category, indices in zip(self.categories_, infrequent_indices)
        ]

    def _check_infrequent_enabled(self):
        """
        Check whether infrequent-category grouping is enabled and set the
        `_infrequent_enabled` attribute accordingly.
        This has to be called after parameter validation in the fit function.
        """
        max_categories = getattr(self, "max_categories", None)
        min_frequency = getattr(self, "min_frequency", None)
        self._infrequent_enabled = (
            max_categories is not None and max_categories >= 1
        ) or min_frequency is not None

    def _identify_infrequent(self, category_count, n_samples, col_idx):
        """Compute the infrequent indices.

        Parameters
        ----------
        category_count : ndarray of shape (n_cardinality,)
            Category counts.

        n_samples : int
            Number of samples.

        col_idx : int
            Index of the current feature. Only used for the error message.

        Returns
        -------
        output : ndarray of shape (n_infrequent_categories,) or None
            If there are infrequent categories, indices of infrequent
            categories. Otherwise None.
        """
        if isinstance(self.min_frequency, numbers.Integral):
            infrequent_mask = category_count < self.min_frequency
        elif isinstance(self.min_frequency, numbers.Real):
            min_frequency_abs = n_samples * self.min_frequency
            infrequent_mask = category_count < min_frequency_abs
        else:
            infrequent_mask = np.zeros(category_count.shape[0], dtype=bool)

        n_current_features = category_count.size - infrequent_mask.sum() + 1
        if self.max_categories is not None and self.max_categories < n_current_features:
            # max_categories includes the one infrequent category
            frequent_category_count = self.max_categories - 1
            if frequent_category_count == 0:
                # All categories are infrequent
                infrequent_mask[:] = True
            else:
                # stable sort to preserve original count order
                smallest_levels = np.argsort(category_count, kind="mergesort")[
                    :-frequent_category_count
                ]
                infrequent_mask[smallest_levels] = True

        output = np.flatnonzero(infrequent_mask)
        return output if output.size > 0 else None

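    # Worked example for `_identify_infrequent` (editorial sketch with
    # made-up counts): with `category_count = array([5, 20, 10, 3])`,
    # `n_samples=38` and `min_frequency=6`, counts 5 and 3 fall below the
    # threshold, so the method returns `array([0, 3])`. With
    # `max_categories=3` instead, one output slot is reserved for the grouped
    # infrequent category, the two smallest counts are pushed into it, and
    # the result is again `array([0, 3])`.
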
    def _fit_infrequent_category_mapping(
        self, n_samples, category_counts, missing_indices
    ):
        """Fit infrequent categories.

        Defines the private attribute: `_default_to_infrequent_mappings`. For
        feature `i`, `_default_to_infrequent_mappings[i]` defines the mapping
        from the integer encoding returned by `super().transform()` into
        infrequent categories. If `_default_to_infrequent_mappings[i]` is None,
        there were no infrequent categories in the training set.

        For example if categories 0, 2 and 4 were frequent, while categories
        1, 3, 5 were infrequent for feature 7, then these categories are mapped
        to a single output:
        `_default_to_infrequent_mappings[7] = array([0, 3, 1, 3, 2, 3])`

        Defines private attribute: `_infrequent_indices`. `_infrequent_indices[i]`
        is an array of indices such that
        `categories_[i][_infrequent_indices[i]]` are all the infrequent category
        labels. If the feature `i` has no infrequent categories
        `_infrequent_indices[i]` is None.

        .. versionadded:: 1.1

        Parameters
        ----------
        n_samples : int
            Number of samples in training set.
        category_counts : list of ndarray
            `category_counts[i]` is the category counts corresponding to
            `self.categories_[i]`.
        missing_indices : dict
            Dict mapping from feature_idx to category index with a missing value.
        """
        # Remove missing value from counts, so it is not considered as infrequent
        if missing_indices:
            category_counts_ = []
            for feature_idx, count in enumerate(category_counts):
                if feature_idx in missing_indices:
                    category_counts_.append(
                        np.delete(count, missing_indices[feature_idx])
                    )
                else:
                    category_counts_.append(count)
        else:
            category_counts_ = category_counts

        self._infrequent_indices = [
            self._identify_infrequent(category_count, n_samples, col_idx)
            for col_idx, category_count in enumerate(category_counts_)
        ]

        # compute mapping from default mapping to infrequent mapping
        self._default_to_infrequent_mappings = []

        for feature_idx, infreq_idx in enumerate(self._infrequent_indices):
            cats = self.categories_[feature_idx]
            # no infrequent categories
            if infreq_idx is None:
                self._default_to_infrequent_mappings.append(None)
                continue

            n_cats = len(cats)
            if feature_idx in missing_indices:
                # Missing index was removed from this category when computing
                # infrequent indices, thus we need to decrease the number of
                # total categories when considering the infrequent mapping.
                n_cats -= 1

            # infrequent indices exist
            mapping = np.empty(n_cats, dtype=np.int64)
            n_infrequent_cats = infreq_idx.size

            # infrequent categories are mapped to the last element.
            n_frequent_cats = n_cats - n_infrequent_cats
            mapping[infreq_idx] = n_frequent_cats

            frequent_indices = np.setdiff1d(np.arange(n_cats), infreq_idx)
            mapping[frequent_indices] = np.arange(n_frequent_cats)

            self._default_to_infrequent_mappings.append(mapping)

    def _map_infrequent_categories(self, X_int, X_mask, ignore_category_indices):
        """Map infrequent categories to the integer representing the infrequent
        category.

        This modifies X_int in-place. Values that were invalid based on `X_mask`
        are mapped to the infrequent category if there was an infrequent
        category for that feature.

        Parameters
        ----------
        X_int : ndarray of shape (n_samples, n_features)
            Integer encoded categories.

        X_mask : ndarray of shape (n_samples, n_features)
            Bool mask for valid values in `X_int`.

        ignore_category_indices : dict
            Dictionary mapping from feature_idx to category index to ignore.
            Ignored indices will not be grouped and the original ordinal encoding
            will remain.
        """
        if not self._infrequent_enabled:
            return

        ignore_category_indices = ignore_category_indices or {}

        for col_idx in range(X_int.shape[1]):
            infrequent_idx = self._infrequent_indices[col_idx]
            if infrequent_idx is None:
                continue

            X_int[~X_mask[:, col_idx], col_idx] = infrequent_idx[0]
            if self.handle_unknown == "infrequent_if_exist":
                # All the unknown values are now mapped to the
                # infrequent_idx[0], which makes the unknown values valid.
                # This is needed in `transform` when the encoding is formed
                # using `X_mask`.
                X_mask[:, col_idx] = True

        # Remaps encoding in `X_int` where the infrequent categories are
        # grouped together.
        for i, mapping in enumerate(self._default_to_infrequent_mappings):
            if mapping is None:
                continue

            if i in ignore_category_indices:
                # Update rows that are **not** ignored
                rows_to_update = X_int[:, i] != ignore_category_indices[i]
            else:
                rows_to_update = slice(None)

            X_int[rows_to_update, i] = np.take(mapping, X_int[rows_to_update, i])

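    # Editorial sketch of the remapping step above (hypothetical values): with
    # `mapping = np.array([0, 3, 1, 3, 2, 3])` (categories 1, 3 and 5
    # infrequent) and a column `X_int[:, i] = [0, 1, 5, 2]`,
    # `np.take(mapping, X_int[:, i])` yields `[0, 3, 3, 1]`, i.e. every
    # infrequent category collapses onto the shared code 3.
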
    def _more_tags(self):
        return {"X_types": ["2darray", "categorical"], "allow_nan": True}


class OneHotEncoder(_BaseEncoder):
    """
    Encode categorical features as a one-hot numeric array.

    The input to this transformer should be an array-like of integers or
    strings, denoting the values taken on by categorical (discrete) features.
    The features are encoded using a one-hot (aka 'one-of-K' or 'dummy')
    encoding scheme. This creates a binary column for each category and
    returns a sparse matrix or dense array (depending on the ``sparse_output``
    parameter).

    By default, the encoder derives the categories based on the unique values
    in each feature. Alternatively, you can also specify the `categories`
    manually.

    This encoding is needed for feeding categorical data to many scikit-learn
    estimators, notably linear models and SVMs with the standard kernels.

    Note: a one-hot encoding of y labels should use a LabelBinarizer
    instead.

    Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
    For a comparison of different encoders, refer to:
    :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`.

    Parameters
    ----------
    categories : 'auto' or a list of array-like, default='auto'
        Categories (unique values) per feature:

        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories should not mix strings and numeric
          values within a single feature, and should be sorted in case of
          numeric values.

        The used categories can be found in the ``categories_`` attribute.

        .. versionadded:: 0.20

    drop : {'first', 'if_binary'} or an array-like of shape (n_features,), \
            default=None
        Specifies a methodology to use to drop one of the categories per
        feature. This is useful in situations where perfectly collinear
        features cause problems, such as when feeding the resulting data
        into an unregularized linear regression model.

        However, dropping one category breaks the symmetry of the original
        representation and can therefore induce a bias in downstream models,
        for instance for penalized linear classification or regression models.

        - None : retain all features (the default).
        - 'first' : drop the first category in each feature. If only one
          category is present, the feature will be dropped entirely.
        - 'if_binary' : drop the first category in each feature with two
          categories. Features with 1 or more than 2 categories are
          left intact.
        - array : ``drop[i]`` is the category in feature ``X[:, i]`` that
          should be dropped.

        When `max_categories` or `min_frequency` is configured to group
        infrequent categories, the dropping behavior is handled after the
        grouping.

        .. versionadded:: 0.21
            The parameter `drop` was added in 0.21.

        .. versionchanged:: 0.23
            The option `drop='if_binary'` was added in 0.23.

        .. versionchanged:: 1.1
            Support for dropping infrequent categories.

    sparse_output : bool, default=True
        When ``True``, it returns a :class:`scipy.sparse.csr_matrix`,
        i.e. a sparse matrix in "Compressed Sparse Row" (CSR) format.

        .. versionadded:: 1.2
            `sparse` was renamed to `sparse_output`

    dtype : number type, default=np.float64
        Desired dtype of output.

    handle_unknown : {'error', 'ignore', 'infrequent_if_exist'}, \
            default='error'
        Specifies the way unknown categories are handled during :meth:`transform`.

        - 'error' : Raise an error if an unknown category is present during transform.
        - 'ignore' : When an unknown category is encountered during
          transform, the resulting one-hot encoded columns for this feature
          will be all zeros. In the inverse transform, an unknown category
          will be denoted as None.
        - 'infrequent_if_exist' : When an unknown category is encountered
          during transform, the resulting one-hot encoded columns for this
          feature will map to the infrequent category if it exists. The
          infrequent category will be mapped to the last position in the
          encoding. During inverse transform, an unknown category will be
          mapped to the category denoted `'infrequent'` if it exists. If the
          `'infrequent'` category does not exist, then :meth:`transform` and
          :meth:`inverse_transform` will handle an unknown category as with
          `handle_unknown='ignore'`. Infrequent categories exist based on
          `min_frequency` and `max_categories`. Read more in the
          :ref:`User Guide <encoder_infrequent_categories>`.

        .. versionchanged:: 1.1
            `'infrequent_if_exist'` was added to automatically handle unknown
            categories and infrequent categories.

    min_frequency : int or float, default=None
        Specifies the minimum frequency below which a category will be
        considered infrequent.

        - If `int`, categories with a smaller cardinality will be considered
          infrequent.

        - If `float`, categories with a smaller cardinality than
          `min_frequency * n_samples` will be considered infrequent.

        .. versionadded:: 1.1
            Read more in the :ref:`User Guide <encoder_infrequent_categories>`.

    max_categories : int, default=None
        Specifies an upper limit to the number of output features for each input
        feature when considering infrequent categories. If there are infrequent
        categories, `max_categories` includes the category representing the
        infrequent categories along with the frequent categories. If `None`,
        there is no limit to the number of output features.

        .. versionadded:: 1.1
            Read more in the :ref:`User Guide <encoder_infrequent_categories>`.

    feature_name_combiner : "concat" or callable, default="concat"
        Callable with signature `def callable(input_feature, category)` that returns a
        string. This is used to create feature names to be returned by
        :meth:`get_feature_names_out`.

        `"concat"` concatenates encoded feature name and category with
        `feature + "_" + str(category)`. E.g. feature X with values 1, 6, 7
        creates feature names `X_1, X_6, X_7`.

        .. versionadded:: 1.3

    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during fitting
        (in order of the features in X and corresponding with the output
        of ``transform``). This includes the category specified in ``drop``
        (if any).

    drop_idx_ : array of shape (n_features,)
        - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category
          to be dropped for each feature.
        - ``drop_idx_[i] = None`` if no category is to be dropped from the
          feature with index ``i``, e.g. when `drop='if_binary'` and the
          feature isn't binary.
        - ``drop_idx_ = None`` if all the transformed features will be
          retained.

        If infrequent categories are enabled by setting `min_frequency` or
        `max_categories` to a non-default value and `drop_idx_[i]` corresponds
        to an infrequent category, then the entire infrequent category is
        dropped.

        .. versionchanged:: 0.23
            Added the possibility to contain `None` values.

    infrequent_categories_ : list of ndarray
        Defined only if infrequent categories are enabled by setting
        `min_frequency` or `max_categories` to a non-default value.
        `infrequent_categories_[i]` are the infrequent categories for feature
        `i`. If the feature `i` has no infrequent categories
        `infrequent_categories_[i]` is None.

        .. versionadded:: 1.1

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 1.0

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    feature_name_combiner : callable or None
        Callable with signature `def callable(input_feature, category)` that returns a
        string. This is used to create feature names to be returned by
        :meth:`get_feature_names_out`.

        .. versionadded:: 1.3

    See Also
    --------
    OrdinalEncoder : Performs an ordinal (integer)
        encoding of the categorical features.
    TargetEncoder : Encodes categorical features using the target.
    sklearn.feature_extraction.DictVectorizer : Performs a one-hot encoding of
        dictionary items (also handles string-valued features).
    sklearn.feature_extraction.FeatureHasher : Performs an approximate one-hot
        encoding of dictionary items or strings.
    LabelBinarizer : Binarizes labels in a one-vs-all
        fashion.
    MultiLabelBinarizer : Transforms between iterable of
        iterables and a multilabel format, e.g. a (samples x classes) binary
        matrix indicating the presence of a class label.

    Examples
    --------
    Given a dataset with two features, we let the encoder find the unique
    values per feature and transform the data to a binary one-hot encoding.

    >>> from sklearn.preprocessing import OneHotEncoder

    One can discard categories not seen during `fit`:

    >>> enc = OneHotEncoder(handle_unknown='ignore')
    >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
    >>> enc.fit(X)
    OneHotEncoder(handle_unknown='ignore')
    >>> enc.categories_
    [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
    >>> enc.transform([['Female', 1], ['Male', 4]]).toarray()
    array([[1., 0., 1., 0., 0.],
           [0., 1., 0., 0., 0.]])
    >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])
    array([['Male', 1],
           [None, 2]], dtype=object)
    >>> enc.get_feature_names_out(['gender', 'group'])
    array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'], ...)

    One can always drop the first column for each feature:

    >>> drop_enc = OneHotEncoder(drop='first').fit(X)
    >>> drop_enc.categories_
    [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
    >>> drop_enc.transform([['Female', 1], ['Male', 2]]).toarray()
    array([[0., 0., 0.],
           [1., 1., 0.]])

    Or drop a column for features having only 2 categories:

    >>> drop_binary_enc = OneHotEncoder(drop='if_binary').fit(X)
    >>> drop_binary_enc.transform([['Female', 1], ['Male', 2]]).toarray()
    array([[0., 1., 0., 0.],
           [1., 0., 1., 0.]])

    One can change the way feature names are created.

    >>> def custom_combiner(feature, category):
    ...     return str(feature) + "_" + type(category).__name__ + "_" + str(category)
    >>> custom_fnames_enc = OneHotEncoder(feature_name_combiner=custom_combiner).fit(X)
    >>> custom_fnames_enc.get_feature_names_out()
    array(['x0_str_Female', 'x0_str_Male', 'x1_int_1', 'x1_int_2', 'x1_int_3'],
          dtype=object)

    Infrequent categories are enabled by setting `max_categories` or `min_frequency`.

    >>> import numpy as np
    >>> X = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object).T
    >>> ohe = OneHotEncoder(max_categories=3, sparse_output=False).fit(X)
    >>> ohe.infrequent_categories_
    [array(['a', 'd'], dtype=object)]
    >>> ohe.transform([["a"], ["b"]])
    array([[0., 0., 1.],
           [1., 0., 0.]])
    """

    _parameter_constraints: dict = {
        "categories": [StrOptions({"auto"}), list],
        "drop": [StrOptions({"first", "if_binary"}), "array-like", None],
        "dtype": "no_validation",  # validation delegated to numpy
        "handle_unknown": [StrOptions({"error", "ignore", "infrequent_if_exist"})],
        "max_categories": [Interval(Integral, 1, None, closed="left"), None],
        "min_frequency": [
            Interval(Integral, 1, None, closed="left"),
            Interval(RealNotInt, 0, 1, closed="neither"),
            None,
        ],
        "sparse_output": ["boolean"],
        "feature_name_combiner": [StrOptions({"concat"}), callable],
    }

    def __init__(
        self,
        *,
        categories="auto",
        drop=None,
        sparse_output=True,
        dtype=np.float64,
        handle_unknown="error",
        min_frequency=None,
        max_categories=None,
        feature_name_combiner="concat",
    ):
        self.categories = categories
        self.sparse_output = sparse_output
        self.dtype = dtype
        self.handle_unknown = handle_unknown
        self.drop = drop
        self.min_frequency = min_frequency
        self.max_categories = max_categories
        self.feature_name_combiner = feature_name_combiner

    def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx):
        """Convert `drop_idx` into the index for infrequent categories.

        If there are no infrequent categories, then `drop_idx` is
        returned. This method is called in `_set_drop_idx` when the `drop`
        parameter is an array-like.
        """
        if not self._infrequent_enabled:
            return drop_idx

        default_to_infrequent = self._default_to_infrequent_mappings[feature_idx]
        if default_to_infrequent is None:
            return drop_idx

        # Raise error when explicitly dropping a category that is infrequent
        infrequent_indices = self._infrequent_indices[feature_idx]
        if infrequent_indices is not None and drop_idx in infrequent_indices:
            categories = self.categories_[feature_idx]
            raise ValueError(
                f"Unable to drop category {categories[drop_idx].item()!r} from"
                f" feature {feature_idx} because it is infrequent"
            )
        return default_to_infrequent[drop_idx]

    def _set_drop_idx(self):
        """Compute the drop indices associated with `self.categories_`.

        If `self.drop` is:
        - `None`, No categories have been dropped.
        - `'first'`, All zeros to drop the first category.
        - `'if_binary'`, All zeros if the category is binary and `None`
          otherwise.
        - array-like, The indices of the categories that match the
          categories in `self.drop`. If the dropped category is an infrequent
          category, then the index for the infrequent category is used. This
          means that the entire infrequent category is dropped.

        This method defines a public `drop_idx_` and a private
        `_drop_idx_after_grouping`.

        - `drop_idx_`: Public facing API that references the drop category in
          `self.categories_`.
        - `_drop_idx_after_grouping`: Used internally to drop categories *after* the
          infrequent categories are grouped together.

        If there are no infrequent categories or drop is `None`, then
        `drop_idx_=_drop_idx_after_grouping`.
        """
        if self.drop is None:
            drop_idx_after_grouping = None
        elif isinstance(self.drop, str):
            if self.drop == "first":
                drop_idx_after_grouping = np.zeros(len(self.categories_), dtype=object)
            elif self.drop == "if_binary":
                n_features_out_no_drop = [len(cat) for cat in self.categories_]
                if self._infrequent_enabled:
                    for i, infreq_idx in enumerate(self._infrequent_indices):
                        if infreq_idx is None:
                            continue
                        n_features_out_no_drop[i] -= infreq_idx.size - 1

                drop_idx_after_grouping = np.array(
                    [
                        0 if n_features_out == 2 else None
                        for n_features_out in n_features_out_no_drop
                    ],
                    dtype=object,
                )

        else:
            drop_array = np.asarray(self.drop, dtype=object)
            droplen = len(drop_array)

            if droplen != len(self.categories_):
                msg = (
                    "`drop` should have length equal to the number "
                    "of features ({}), got {}"
                )
                raise ValueError(msg.format(len(self.categories_), droplen))
            missing_drops = []
            drop_indices = []
            for feature_idx, (drop_val, cat_list) in enumerate(
                zip(drop_array, self.categories_)
            ):
                if not is_scalar_nan(drop_val):
                    drop_idx = np.where(cat_list == drop_val)[0]
                    if drop_idx.size:  # found drop idx
                        drop_indices.append(
                            self._map_drop_idx_to_infrequent(feature_idx, drop_idx[0])
                        )
                    else:
                        missing_drops.append((feature_idx, drop_val))
                    continue

                # drop_val is nan, find nan in categories manually
                if is_scalar_nan(cat_list[-1]):
                    drop_indices.append(
                        self._map_drop_idx_to_infrequent(feature_idx, cat_list.size - 1)
                    )
                else:  # nan is missing
                    missing_drops.append((feature_idx, drop_val))

            if any(missing_drops):
                msg = (
                    "The following categories were supposed to be "
                    "dropped, but were not found in the training "
                    "data.\n{}".format(
                        "\n".join(
                            [
                                "Category: {}, Feature: {}".format(category, feature)
                                for feature, category in missing_drops
                            ]
                        )
                    )
                )
                raise ValueError(msg)
            drop_idx_after_grouping = np.array(drop_indices, dtype=object)

        # `_drop_idx_after_grouping` are the categories to drop *after* the infrequent
        # categories are grouped together. If needed, we remap `drop_idx` back
        # to the categories seen in `self.categories_`.
        self._drop_idx_after_grouping = drop_idx_after_grouping

        if not self._infrequent_enabled or drop_idx_after_grouping is None:
            self.drop_idx_ = self._drop_idx_after_grouping
        else:
            drop_idx_ = []
            for feature_idx, drop_idx in enumerate(drop_idx_after_grouping):
                default_to_infrequent = self._default_to_infrequent_mappings[
                    feature_idx
                ]
                if drop_idx is None or default_to_infrequent is None:
                    orig_drop_idx = drop_idx
                else:
                    orig_drop_idx = np.flatnonzero(default_to_infrequent == drop_idx)[0]

                drop_idx_.append(orig_drop_idx)

            self.drop_idx_ = np.asarray(drop_idx_, dtype=object)

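    # Editorial sketch (hypothetical values): if
    # `default_to_infrequent = np.array([0, 3, 1, 3, 2, 3])` and the grouped
    # code `drop_idx = 1` is dropped, then
    # `np.flatnonzero(default_to_infrequent == 1)[0] == 2`, so `drop_idx_`
    # reports index 2 of `categories_`: the original category that became
    # output column 1 after grouping.
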
    def _compute_transformed_categories(self, i, remove_dropped=True):
        """Compute the transformed categories used for column `i`.

        1. If there are infrequent categories, the category is named
           'infrequent_sklearn'.
        2. Dropped columns are removed when remove_dropped=True.
        """
        cats = self.categories_[i]

        if self._infrequent_enabled:
            infreq_map = self._default_to_infrequent_mappings[i]
            if infreq_map is not None:
                frequent_mask = infreq_map < infreq_map.max()
                infrequent_cat = "infrequent_sklearn"
                # infrequent category is always at the end
                cats = np.concatenate(
                    (cats[frequent_mask], np.array([infrequent_cat], dtype=object))
                )

        if remove_dropped:
            cats = self._remove_dropped_categories(cats, i)
        return cats

    def _remove_dropped_categories(self, categories, i):
        """Remove dropped categories."""
        if (
            self._drop_idx_after_grouping is not None
            and self._drop_idx_after_grouping[i] is not None
        ):
            return np.delete(categories, self._drop_idx_after_grouping[i])
        return categories

    def _compute_n_features_outs(self):
        """Compute the n_features_out for each input feature."""
        output = [len(cats) for cats in self.categories_]

        if self._drop_idx_after_grouping is not None:
            for i, drop_idx in enumerate(self._drop_idx_after_grouping):
                if drop_idx is not None:
                    output[i] -= 1

        if not self._infrequent_enabled:
            return output

        # infrequent is enabled, the number of features out are reduced
        # because the infrequent categories are grouped together
        for i, infreq_idx in enumerate(self._infrequent_indices):
            if infreq_idx is None:
                continue
            output[i] -= infreq_idx.size - 1

        return output

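    # Editorial sketch (hypothetical values): with `categories_` of sizes
    # [2, 3] and `drop='first'`, one output is removed per feature, giving
    # [1, 2]; if feature 1 additionally groups two infrequent categories
    # together, its count shrinks by `infreq_idx.size - 1 == 1`, giving
    # [1, 1].
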
    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """
        Fit OneHotEncoder to X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to determine the categories of each feature.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        Returns
        -------
        self
            Fitted encoder.
        """
        self._fit(
            X,
            handle_unknown=self.handle_unknown,
            force_all_finite="allow-nan",
        )
        self._set_drop_idx()
        self._n_features_outs = self._compute_n_features_outs()
        return self

    def transform(self, X):
        """
        Transform X using one-hot encoding.

        If `sparse_output=True` (default), it returns an instance of
        :class:`scipy.sparse.csr_matrix` (CSR format).

        If there are infrequent categories for a feature, set by specifying
        `max_categories` or `min_frequency`, the infrequent categories are
        grouped into a single category.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to encode.

        Returns
        -------
        X_out : {ndarray, sparse matrix} of shape \
                (n_samples, n_encoded_features)
            Transformed input. If `sparse_output=True`, a sparse matrix will be
            returned.
        """
        check_is_fitted(self)
        transform_output = _get_output_config("transform", estimator=self)["dense"]
        if transform_output != "default" and self.sparse_output:
            capitalize_transform_output = transform_output.capitalize()
            raise ValueError(
                f"{capitalize_transform_output} output does not support sparse data."
                f" Set sparse_output=False to output {transform_output} dataframes or"
                f" disable {capitalize_transform_output} output via"
                ' `ohe.set_output(transform="default")`.'
            )

        # validation of X happens in _check_X called by _transform
        warn_on_unknown = self.drop is not None and self.handle_unknown in {
            "ignore",
            "infrequent_if_exist",
        }
        X_int, X_mask = self._transform(
            X,
            handle_unknown=self.handle_unknown,
            force_all_finite="allow-nan",
            warn_on_unknown=warn_on_unknown,
        )

        n_samples, n_features = X_int.shape

        if self._drop_idx_after_grouping is not None:
            to_drop = self._drop_idx_after_grouping.copy()
            # We remove all the dropped categories from mask, and decrement all
            # categories that occur after them to avoid an empty column.
            keep_cells = X_int != to_drop
            for i, cats in enumerate(self.categories_):
                # drop='if_binary' but feature isn't binary
                if to_drop[i] is None:
                    # set to cardinality to not drop from X_int
                    to_drop[i] = len(cats)

            to_drop = to_drop.reshape(1, -1)
            X_int[X_int > to_drop] -= 1
            X_mask &= keep_cells

        mask = X_mask.ravel()
        feature_indices = np.cumsum([0] + self._n_features_outs)
        indices = (X_int + feature_indices[:-1]).ravel()[mask]

        indptr = np.empty(n_samples + 1, dtype=int)
        indptr[0] = 0
        np.sum(X_mask, axis=1, out=indptr[1:], dtype=indptr.dtype)
        np.cumsum(indptr[1:], out=indptr[1:])
        data = np.ones(indptr[-1])

        out = sparse.csr_matrix(
            (data, indices, indptr),
            shape=(n_samples, feature_indices[-1]),
            dtype=self.dtype,
        )
        if not self.sparse_output:
            return out.toarray()
        else:
            return out

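    # Editorial sketch of the CSR assembly above (hypothetical values): with
    # `_n_features_outs = [2, 3]`, `feature_indices = [0, 2, 5]`. A row with
    # `X_int = [1, 0]` and an all-True `X_mask` contributes column indices
    # `[1 + 0, 0 + 2] = [1, 2]`, and `indptr` grows by the number of valid
    # cells in that row (here 2), so each row of the (n_samples, 5) output
    # holds at most one 1 per input feature.
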
    def inverse_transform(self, X):
        """
        Convert the data back to the original representation.

        When unknown categories are encountered (all zeros in the
        one-hot encoding), ``None`` is used to represent this category. If the
        feature with the unknown category has a dropped category, the dropped
        category will be its inverse.

        For a given input feature, if there is an infrequent category,
        'infrequent_sklearn' will be used to represent the infrequent category.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape \
                (n_samples, n_encoded_features)
            The transformed data.

        Returns
        -------
        X_tr : ndarray of shape (n_samples, n_features)
            Inverse transformed array.
        """
        check_is_fitted(self)
        X = check_array(X, accept_sparse="csr")

        n_samples, _ = X.shape
        n_features = len(self.categories_)

        n_features_out = np.sum(self._n_features_outs)

        # validate shape of passed X
        msg = (
            "Shape of the passed X data is not correct. Expected {0} columns, got {1}."
        )
        if X.shape[1] != n_features_out:
            raise ValueError(msg.format(n_features_out, X.shape[1]))

        transformed_features = [
            self._compute_transformed_categories(i, remove_dropped=False)
            for i, _ in enumerate(self.categories_)
        ]

        # create resulting array of appropriate dtype
        dt = np.result_type(*[cat.dtype for cat in transformed_features])
        X_tr = np.empty((n_samples, n_features), dtype=dt)

        j = 0
        found_unknown = {}

        if self._infrequent_enabled:
            infrequent_indices = self._infrequent_indices
        else:
            infrequent_indices = [None] * n_features

        for i in range(n_features):
            cats_wo_dropped = self._remove_dropped_categories(
                transformed_features[i], i
            )
            n_categories = cats_wo_dropped.shape[0]

            # Only happens if there was a column with a unique
            # category. In this case we just fill the column with this
            # unique category value.
            if n_categories == 0:
                X_tr[:, i] = self.categories_[i][self._drop_idx_after_grouping[i]]
                j += n_categories
                continue
            sub = X[:, j : j + n_categories]
            # for sparse X argmax returns 2D matrix, ensure 1D array
            labels = np.asarray(sub.argmax(axis=1)).flatten()
            X_tr[:, i] = cats_wo_dropped[labels]

            if self.handle_unknown == "ignore" or (
                self.handle_unknown == "infrequent_if_exist"
                and infrequent_indices[i] is None
            ):
                unknown = np.asarray(sub.sum(axis=1) == 0).flatten()
                # ignored unknown categories: we have a row of all zero
                if unknown.any():
                    # if categories were dropped then unknown categories will
                    # be mapped to the dropped category
                    if (
                        self._drop_idx_after_grouping is None
                        or self._drop_idx_after_grouping[i] is None
                    ):
                        found_unknown[i] = unknown
                    else:
                        X_tr[unknown, i] = self.categories_[i][
                            self._drop_idx_after_grouping[i]
                        ]
            else:
                dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
                if dropped.any():
                    if self._drop_idx_after_grouping is None:
                        all_zero_samples = np.flatnonzero(dropped)
                        raise ValueError(
                            f"Samples {all_zero_samples} can not be inverted "
                            "when drop=None and handle_unknown='error' "
                            "because they contain all zeros"
                        )
                    # we can safely assume that all of the nulls in each column
                    # are the dropped value
                    drop_idx = self._drop_idx_after_grouping[i]
                    X_tr[dropped, i] = transformed_features[i][drop_idx]

            j += n_categories

        # if ignored are found: potentially need to upcast result to
        # insert None values
        if found_unknown:
            if X_tr.dtype != object:
                X_tr = X_tr.astype(object)

            for idx, mask in found_unknown.items():
                X_tr[mask, idx] = None

        return X_tr

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self)
        input_features = _check_feature_names_in(self, input_features)
        cats = [
            self._compute_transformed_categories(i)
            for i, _ in enumerate(self.categories_)
        ]

        name_combiner = self._check_get_feature_name_combiner()
        feature_names = []
        for i in range(len(cats)):
            names = [name_combiner(input_features[i], t) for t in cats[i]]
            feature_names.extend(names)

        return np.array(feature_names, dtype=object)

    def _check_get_feature_name_combiner(self):
        if self.feature_name_combiner == "concat":
            return lambda feature, category: feature + "_" + str(category)
        else:  # callable
            dry_run_combiner = self.feature_name_combiner("feature", "category")
            if not isinstance(dry_run_combiner, str):
                raise TypeError(
                    "When `feature_name_combiner` is a callable, it should return a "
                    f"Python string. Got {type(dry_run_combiner)} instead."
                )
            return self.feature_name_combiner


class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder):
    """
    Encode categorical features as an integer array.

    The input to this transformer should be an array-like of integers or
    strings, denoting the values taken on by categorical (discrete) features.
    The features are converted to ordinal integers. This results in
    a single column of integers (0 to n_categories - 1) per feature.

    Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
    For a comparison of different encoders, refer to:
    :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`.

    .. versionadded:: 0.20

    Parameters
    ----------
    categories : 'auto' or a list of array-like, default='auto'
        Categories (unique values) per feature:

        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories should not mix strings and numeric
          values, and should be sorted in case of numeric values.

        The used categories can be found in the ``categories_`` attribute.

    dtype : number type, default=np.float64
        Desired dtype of output.

    handle_unknown : {'error', 'use_encoded_value'}, default='error'
        When set to 'error' an error will be raised in case an unknown
        categorical feature is present during transform. When set to
        'use_encoded_value', the encoded value of unknown categories will be
        set to the value given for the parameter `unknown_value`. In
        :meth:`inverse_transform`, an unknown category will be denoted as None.

        .. versionadded:: 0.24

    unknown_value : int or np.nan, default=None
        When the parameter handle_unknown is set to 'use_encoded_value', this
        parameter is required and will set the encoded value of unknown
        categories. It has to be distinct from the values used to encode any of
        the categories in `fit`. If set to np.nan, the `dtype` parameter must
        be a float dtype.

        .. versionadded:: 0.24

    encoded_missing_value : int or np.nan, default=np.nan
        Encoded value of missing categories. If set to `np.nan`, then the `dtype`
        parameter must be a float dtype.

        .. versionadded:: 1.1

    min_frequency : int or float, default=None
        Specifies the minimum frequency below which a category will be
        considered infrequent.

        - If `int`, categories with a smaller cardinality will be considered
          infrequent.

        - If `float`, categories with a smaller cardinality than
          `min_frequency * n_samples` will be considered infrequent.

        .. versionadded:: 1.3
            Read more in the :ref:`User Guide <encoder_infrequent_categories>`.

    max_categories : int, default=None
        Specifies an upper limit to the number of output categories for each input
        feature when considering infrequent categories. If there are infrequent
        categories, `max_categories` includes the category representing the
        infrequent categories along with the frequent categories. If `None`,
        there is no limit to the number of output features.

        `max_categories` does **not** take into account missing or unknown
        categories. Setting `unknown_value` or `encoded_missing_value` to an
        integer will increase the number of unique integer codes by one each.
        This can result in up to `max_categories + 2` integer codes.

        .. versionadded:: 1.3
            Read more in the :ref:`User Guide <encoder_infrequent_categories>`.

    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during ``fit`` (in order of
        the features in X and corresponding with the output of ``transform``).
        This does not include categories that weren't seen during ``fit``.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 1.0

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    infrequent_categories_ : list of ndarray
        Defined only if infrequent categories are enabled by setting
        `min_frequency` or `max_categories` to a non-default value.
        `infrequent_categories_[i]` are the infrequent categories for feature
        `i`. If the feature `i` has no infrequent categories
        `infrequent_categories_[i]` is None.

        .. versionadded:: 1.3

    See Also
    --------
    OneHotEncoder : Performs a one-hot encoding of categorical features. This encoding
        is suitable for low to medium cardinality categorical variables, both in
        supervised and unsupervised settings.
    TargetEncoder : Encodes categorical features using supervised signal
        in a classification or regression pipeline. This encoding is typically
        suitable for high cardinality categorical variables.
    LabelEncoder : Encodes target labels with values between 0 and
        ``n_classes-1``.

    Notes
    -----
    With a high proportion of `nan` values, inferring categories becomes slow with
    Python versions before 3.10. The handling of `nan` values was improved
    from Python 3.10 onwards, (c.f.
    `bpo-43475 <https://github.com/python/cpython/issues/87641>`_).

    Examples
    --------
    Given a dataset with two features, we let the encoder find the unique
    values per feature and transform the data to an ordinal encoding.

    >>> from sklearn.preprocessing import OrdinalEncoder
    >>> enc = OrdinalEncoder()
    >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
    >>> enc.fit(X)
    OrdinalEncoder()
    >>> enc.categories_
    [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
    >>> enc.transform([['Female', 3], ['Male', 1]])
    array([[0., 2.],
           [1., 0.]])

    >>> enc.inverse_transform([[1, 0], [0, 1]])
    array([['Male', 1],
           ['Female', 2]], dtype=object)

    By default, :class:`OrdinalEncoder` is lenient towards missing values by
    propagating them.

    >>> import numpy as np
    >>> X = [['Male', 1], ['Female', 3], ['Female', np.nan]]
    >>> enc.fit_transform(X)
    array([[ 1.,  0.],
           [ 0.,  1.],
           [ 0., nan]])

    You can use the parameter `encoded_missing_value` to encode missing values.

    >>> enc.set_params(encoded_missing_value=-1).fit_transform(X)
    array([[ 1.,  0.],
           [ 0.,  1.],
           [ 0., -1.]])

    Infrequent categories are enabled by setting `max_categories` or `min_frequency`.
    In the following example, "a" and "d" are considered infrequent and grouped
    together into a single category, "b" and "c" are their own categories, unknown
    values are encoded as 3 and missing values are encoded as 4.

    >>> X_train = np.array(
    ...     [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3 + [np.nan]],
    ...     dtype=object).T
    >>> enc = OrdinalEncoder(
    ...     handle_unknown="use_encoded_value", unknown_value=3,
    ...     max_categories=3, encoded_missing_value=4)
    >>> _ = enc.fit(X_train)
    >>> X_test = np.array([["a"], ["b"], ["c"], ["d"], ["e"], [np.nan]], dtype=object)
    >>> enc.transform(X_test)
    array([[2.],
           [0.],
           [1.],
           [2.],
           [3.],
           [4.]])
    """

    _parameter_constraints: dict = {
        "categories": [StrOptions({"auto"}), list],
        "dtype": "no_validation",  # validation delegated to numpy
        "encoded_missing_value": [Integral, type(np.nan)],
        "handle_unknown": [StrOptions({"error", "use_encoded_value"})],
        "unknown_value": [Integral, type(np.nan), None],
        "max_categories": [Interval(Integral, 1, None, closed="left"), None],
        "min_frequency": [
            Interval(Integral, 1, None, closed="left"),
            Interval(RealNotInt, 0, 1, closed="neither"),
            None,
        ],
    }

    def __init__(
        self,
        *,
        categories="auto",
        dtype=np.float64,
        handle_unknown="error",
        unknown_value=None,
        encoded_missing_value=np.nan,
        min_frequency=None,
        max_categories=None,
    ):
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown
        self.unknown_value = unknown_value
        self.encoded_missing_value = encoded_missing_value
        self.min_frequency = min_frequency
        self.max_categories = max_categories

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """
        Fit the OrdinalEncoder to X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to determine the categories of each feature.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        Returns
        -------
        self : object
            Fitted encoder.
        """
        if self.handle_unknown == "use_encoded_value":
            if is_scalar_nan(self.unknown_value):
                if np.dtype(self.dtype).kind != "f":
                    raise ValueError(
                        "When unknown_value is np.nan, the dtype "
                        "parameter should be "
                        f"a float dtype. Got {self.dtype}."
                    )
            elif not isinstance(self.unknown_value, numbers.Integral):
                raise TypeError(
                    "unknown_value should be an integer or "
                    "np.nan when "
                    "handle_unknown is 'use_encoded_value', "
                    f"got {self.unknown_value}."
                )
        elif self.unknown_value is not None:
            raise TypeError(
                "unknown_value should only be set when "
                "handle_unknown is 'use_encoded_value', "
                f"got {self.unknown_value}."
            )

        # `_fit` will only raise an error when `self.handle_unknown="error"`
        fit_results = self._fit(
            X,
            handle_unknown=self.handle_unknown,
            force_all_finite="allow-nan",
            return_and_ignore_missing_for_infrequent=True,
        )
        self._missing_indices = fit_results["missing_indices"]

        cardinalities = [len(categories) for categories in self.categories_]
        if self._infrequent_enabled:
            # Cardinality decreases because the infrequent categories are grouped
            # together
            for feature_idx, infrequent in enumerate(self.infrequent_categories_):
                if infrequent is not None:
                    cardinalities[feature_idx] -= len(infrequent)

        # missing values are not considered part of the cardinality
        # when considering unknown categories or encoded_missing_value
        for cat_idx, categories_for_idx in enumerate(self.categories_):
            if is_scalar_nan(categories_for_idx[-1]):
                cardinalities[cat_idx] -= 1

        if self.handle_unknown == "use_encoded_value":
            for cardinality in cardinalities:
                if 0 <= self.unknown_value < cardinality:
                    raise ValueError(
                        "The used value for unknown_value "
                        f"{self.unknown_value} is one of the "
                        "values already used for encoding the "
                        "seen categories."
                    )

        if self._missing_indices:
            if np.dtype(self.dtype).kind != "f" and is_scalar_nan(
                self.encoded_missing_value
            ):
                raise ValueError(
                    "There are missing values in features "
                    f"{list(self._missing_indices)}. For OrdinalEncoder to "
                    f"encode missing values with dtype: {self.dtype}, set "
                    "encoded_missing_value to a non-nan value, or "
                    "set dtype to a float"
                )

            if not is_scalar_nan(self.encoded_missing_value):
                # Features are invalid when they contain a missing category
                # and encoded_missing_value was already used to encode a
                # known category
                invalid_features = [
                    cat_idx
                    for cat_idx, cardinality in enumerate(cardinalities)
                    if cat_idx in self._missing_indices
                    and 0 <= self.encoded_missing_value < cardinality
                ]

                if invalid_features:
                    # Use feature names if they are available
                    if hasattr(self, "feature_names_in_"):
                        invalid_features = self.feature_names_in_[invalid_features]
                    raise ValueError(
                        f"encoded_missing_value ({self.encoded_missing_value}) "
                        "is already used to encode a known category in features: "
                        f"{invalid_features}"
                    )

        return self

    def transform(self, X):
        """
        Transform X to ordinal codes.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to encode.

        Returns
        -------
        X_out : ndarray of shape (n_samples, n_features)
            Transformed input.
        """
        check_is_fitted(self, "categories_")
        X_int, X_mask = self._transform(
            X,
            handle_unknown=self.handle_unknown,
            force_all_finite="allow-nan",
            ignore_category_indices=self._missing_indices,
        )
        X_trans = X_int.astype(self.dtype, copy=False)

        for cat_idx, missing_idx in self._missing_indices.items():
            X_missing_mask = X_int[:, cat_idx] == missing_idx
            X_trans[X_missing_mask, cat_idx] = self.encoded_missing_value

        # create separate category for unknown values
        if self.handle_unknown == "use_encoded_value":
            X_trans[~X_mask] = self.unknown_value
        return X_trans

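    # Editorial sketch (hypothetical values): with categories ['a', 'b', nan]
    # for feature 0, `_missing_indices = {0: 2}`, `encoded_missing_value=-1`
    # and `unknown_value=9`, a column `X_int[:, 0] = [0, 2, 1]` with
    # `X_mask[:, 0] = [True, True, False]` becomes `[0., -1., 9.]`: the
    # missing code 2 is rewritten first, then the masked-out unknown.
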
    def inverse_transform(self, X):
        """
        Convert the data back to the original representation.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_encoded_features)
            The transformed data.

        Returns
        -------
        X_tr : ndarray of shape (n_samples, n_features)
            Inverse transformed array.
        """
        check_is_fitted(self)
        X = check_array(X, force_all_finite="allow-nan")

        n_samples, _ = X.shape
        n_features = len(self.categories_)

        # validate shape of passed X
        msg = (
            "Shape of the passed X data is not correct. Expected {0} columns, got {1}."
        )
        if X.shape[1] != n_features:
            raise ValueError(msg.format(n_features, X.shape[1]))

        # create resulting array of appropriate dtype
        dt = np.result_type(*[cat.dtype for cat in self.categories_])
        X_tr = np.empty((n_samples, n_features), dtype=dt)

        found_unknown = {}
        infrequent_masks = {}

        infrequent_indices = getattr(self, "_infrequent_indices", None)

        for i in range(n_features):
            labels = X[:, i]

            # replace values of X[:, i] that were nan with actual indices
            if i in self._missing_indices:
                X_i_mask = _get_mask(labels, self.encoded_missing_value)
                labels[X_i_mask] = self._missing_indices[i]

            rows_to_update = slice(None)
            categories = self.categories_[i]

            if infrequent_indices is not None and infrequent_indices[i] is not None:
                # Compute mask for frequent categories
                infrequent_encoding_value = len(categories) - len(infrequent_indices[i])
                infrequent_masks[i] = labels == infrequent_encoding_value
                rows_to_update = ~infrequent_masks[i]

                # Remap categories to be only frequent categories. The infrequent
                # categories will be mapped to "infrequent_sklearn" later
                frequent_categories_mask = np.ones_like(categories, dtype=bool)
                frequent_categories_mask[infrequent_indices[i]] = False
                categories = categories[frequent_categories_mask]

            if self.handle_unknown == "use_encoded_value":
                unknown_labels = _get_mask(labels, self.unknown_value)
                found_unknown[i] = unknown_labels

                known_labels = ~unknown_labels
                if isinstance(rows_to_update, np.ndarray):
                    rows_to_update &= known_labels
                else:
                    rows_to_update = known_labels

            labels_int = labels[rows_to_update].astype("int64", copy=False)
            X_tr[rows_to_update, i] = categories[labels_int]

        if found_unknown or infrequent_masks:
            X_tr = X_tr.astype(object, copy=False)

        # insert None values for unknown values
        if found_unknown:
            for idx, mask in found_unknown.items():
                X_tr[mask, idx] = None

        if infrequent_masks:
            for idx, mask in infrequent_masks.items():
                X_tr[mask, idx] = "infrequent_sklearn"

        return X_tr