
# Authors: Andreas Mueller <amueller@ais.uni-bonn.de>
#          Joris Van den Bossche <jorisvandenbossche@gmail.com>
# License: BSD 3 clause

import numbers
import warnings
from numbers import Integral

import numpy as np
from scipy import sparse

from ..base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin, _fit_context
from ..utils import _safe_indexing, check_array, is_scalar_nan
from ..utils._encode import _check_unknown, _encode, _get_counts, _unique
from ..utils._mask import _get_mask
from ..utils._param_validation import Interval, RealNotInt, StrOptions
from ..utils._set_output import _get_output_config
from ..utils.validation import _check_feature_names_in, check_is_fitted

__all__ = ["OneHotEncoder", "OrdinalEncoder"]


class _BaseEncoder(TransformerMixin, BaseEstimator):
    """
    Base class for encoders that includes the code to categorize and
    transform the input features.

    """

    def _check_X(self, X, force_all_finite=True):
        """
        Perform custom check_array:
        - convert list of strings to object dtype
        - check for missing values for object dtype data (check_array does
          not do that)
        - return list of features (arrays): this list of features is
          constructed feature by feature to preserve the data types
          of pandas DataFrame columns, as otherwise information is lost
          and cannot be used, e.g. for the `categories_` attribute.

        """
        if not (hasattr(X, "iloc") and getattr(X, "ndim", 0) == 2):
            # if not a dataframe, do normal check_array validation
            X_temp = check_array(X, dtype=None, force_all_finite=force_all_finite)
            if not hasattr(X, "dtype") and np.issubdtype(X_temp.dtype, np.str_):
                X = check_array(X, dtype=object, force_all_finite=force_all_finite)
            else:
                X = X_temp
            needs_validation = False
        else:
            # pandas dataframe, do validation later column by column, in order
            # to keep the dtype information to be used in the encoder.
            needs_validation = force_all_finite

        n_samples, n_features = X.shape
        X_columns = []

        for i in range(n_features):
            Xi = _safe_indexing(X, indices=i, axis=1)
            Xi = check_array(
                Xi, ensure_2d=False, dtype=None, force_all_finite=needs_validation
            )
            X_columns.append(Xi)

        return X_columns, n_samples, n_features
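
    # A sketch of why this matters (hypothetical input): for a DataFrame like
    # pd.DataFrame({"cat": ["a", "b"], "num": [1.0, np.nan]}), validating
    # column by column keeps "cat" as object and "num" as float64, whereas a
    # single check_array call on the whole frame would upcast both to object.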

    def _fit(
        self,
        X,
        handle_unknown="error",
        force_all_finite=True,
        return_counts=False,
        return_and_ignore_missing_for_infrequent=False,
    ):
        self._check_infrequent_enabled()
        self._check_n_features(X, reset=True)
        self._check_feature_names(X, reset=True)
        X_list, n_samples, n_features = self._check_X(
            X, force_all_finite=force_all_finite
        )
        self.n_features_in_ = n_features

        if self.categories != "auto":
            if len(self.categories) != n_features:
                raise ValueError(
                    "Shape mismatch: if categories is an array,"
                    " it has to be of shape (n_features,)."
                )

        self.categories_ = []
        category_counts = []
        compute_counts = return_counts or self._infrequent_enabled

        for i in range(n_features):
            Xi = X_list[i]

            if self.categories == "auto":
                result = _unique(Xi, return_counts=compute_counts)
                if compute_counts:
                    cats, counts = result
                    category_counts.append(counts)
                else:
                    cats = result
            else:
                if np.issubdtype(Xi.dtype, np.str_):
                    # Always convert string categories to objects to avoid
                    # unexpected string truncation for longer category labels
                    # passed in the constructor.
                    Xi_dtype = object
                else:
                    Xi_dtype = Xi.dtype

                cats = np.array(self.categories[i], dtype=Xi_dtype)
                if (
                    cats.dtype == object
                    and isinstance(cats[0], bytes)
                    and Xi.dtype.kind != "S"
                ):
                    msg = (
                        f"In column {i}, the predefined categories have type 'bytes'"
                        " which is incompatible with values of type"
                        f" '{type(Xi[0]).__name__}'."
                    )
                    raise ValueError(msg)

                # `nan` must be the last stated category
                for category in cats[:-1]:
                    if is_scalar_nan(category):
                        raise ValueError(
                            "Nan should be the last element in user"
                            f" provided categories, see categories {cats}"
                            f" in column #{i}"
                        )

                if cats.size != len(_unique(cats)):
                    msg = (
                        f"In column {i}, the predefined categories"
                        " contain duplicate elements."
                    )
                    raise ValueError(msg)

                if Xi.dtype.kind not in "OUS":
                    sorted_cats = np.sort(cats)
                    error_msg = (
                        "Unsorted categories are not supported for numerical categories"
                    )
                    # if there are nans, nan should be the last element
                    stop_idx = -1 if np.isnan(sorted_cats[-1]) else None
                    if np.any(sorted_cats[:stop_idx] != cats[:stop_idx]):
                        raise ValueError(error_msg)

                if handle_unknown == "error":
                    diff = _check_unknown(Xi, cats)
                    if diff:
                        msg = (
                            "Found unknown categories {0} in column {1}"
                            " during fit".format(diff, i)
                        )
                        raise ValueError(msg)
                if compute_counts:
                    category_counts.append(_get_counts(Xi, cats))

            self.categories_.append(cats)

        output = {"n_samples": n_samples}
        if return_counts:
            output["category_counts"] = category_counts

        missing_indices = {}
        if return_and_ignore_missing_for_infrequent:
            for feature_idx, categories_for_idx in enumerate(self.categories_):
                if is_scalar_nan(categories_for_idx[-1]):
                    # `nan` values can only be placed in the last position
                    missing_indices[feature_idx] = categories_for_idx.size - 1
            output["missing_indices"] = missing_indices

        if self._infrequent_enabled:
            self._fit_infrequent_category_mapping(
                n_samples,
                category_counts,
                missing_indices,
            )
        return output

    def _transform(
        self,
        X,
        handle_unknown="error",
        force_all_finite=True,
        warn_on_unknown=False,
        ignore_category_indices=None,
    ):
        X_list, n_samples, n_features = self._check_X(
            X, force_all_finite=force_all_finite
        )
        self._check_feature_names(X, reset=False)
        self._check_n_features(X, reset=False)

        X_int = np.zeros((n_samples, n_features), dtype=int)
        X_mask = np.ones((n_samples, n_features), dtype=bool)

        columns_with_unknown = []
        for i in range(n_features):
            Xi = X_list[i]
            diff, valid_mask = _check_unknown(Xi, self.categories_[i], return_mask=True)

            if not np.all(valid_mask):
                if handle_unknown == "error":
                    msg = (
                        "Found unknown categories {0} in column {1}"
                        " during transform".format(diff, i)
                    )
                    raise ValueError(msg)
                else:
                    if warn_on_unknown:
                        columns_with_unknown.append(i)
                    # Set the problematic rows to an acceptable value and
                    # continue. The rows are marked in `X_mask` and will be
                    # removed later.
                    X_mask[:, i] = valid_mask
                    # cast Xi into the largest string type necessary
                    # to handle different lengths of numpy strings
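                    # (e.g., assuming categories_[i] has dtype '<U10' while Xi
                    # is only '<U2', writing a longer replacement value into
                    # Xi below would otherwise be silently truncated)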

                    if (
                        self.categories_[i].dtype.kind in ("U", "S")
                        and self.categories_[i].itemsize > Xi.itemsize
                    ):
                        Xi = Xi.astype(self.categories_[i].dtype)
                    elif self.categories_[i].dtype.kind == "O" and Xi.dtype.kind == "U":
                        # categories are objects and Xi are numpy strings.
                        # Cast Xi to an object dtype to prevent truncation
                        # when setting invalid values.
                        Xi = Xi.astype("O")
                    else:
                        Xi = Xi.copy()

                    Xi[~valid_mask] = self.categories_[i][0]
            # We use check_unknown=False, since _check_unknown was
            # already called above.
            X_int[:, i] = _encode(Xi, uniques=self.categories_[i], check_unknown=False)
        if columns_with_unknown:
            warnings.warn(
                (
                    "Found unknown categories in columns "
                    f"{columns_with_unknown} during transform. These "
                    "unknown categories will be encoded as all zeros"
                ),
                UserWarning,
            )

        self._map_infrequent_categories(X_int, X_mask, ignore_category_indices)
        return X_int, X_mask

    @property
    def infrequent_categories_(self):
        """Infrequent categories for each feature."""
        # raises an AttributeError if `_infrequent_indices` is not defined
        infrequent_indices = self._infrequent_indices
        return [
            None if indices is None else category[indices]
            for category, indices in zip(self.categories_, infrequent_indices)
        ]

    def _check_infrequent_enabled(self):
        """
        This function checks whether _infrequent_enabled is True or False.
        This has to be called after parameter validation in the fit function.
        """
        max_categories = getattr(self, "max_categories", None)
        min_frequency = getattr(self, "min_frequency", None)
        self._infrequent_enabled = (
            max_categories is not None and max_categories >= 1
        ) or min_frequency is not None

    def _identify_infrequent(self, category_count, n_samples, col_idx):
        """Compute the infrequent indices.

        Parameters
        ----------
        category_count : ndarray of shape (n_cardinality,)
            Category counts.

        n_samples : int
            Number of samples.

        col_idx : int
            Index of the current feature. Only used for the error message.

        Returns
        -------
        output : ndarray of shape (n_infrequent_categories,) or None
            If there are infrequent categories, indices of infrequent
            categories. Otherwise None.
        """
        if isinstance(self.min_frequency, numbers.Integral):
            infrequent_mask = category_count < self.min_frequency
        elif isinstance(self.min_frequency, numbers.Real):
            min_frequency_abs = n_samples * self.min_frequency
            infrequent_mask = category_count < min_frequency_abs
        else:
            infrequent_mask = np.zeros(category_count.shape[0], dtype=bool)

        n_current_features = category_count.size - infrequent_mask.sum() + 1
        if self.max_categories is not None and self.max_categories < n_current_features:
            # max_categories includes the one infrequent category
            frequent_category_count = self.max_categories - 1
            if frequent_category_count == 0:
                # All categories are infrequent
                infrequent_mask[:] = True
            else:
                # stable sort to preserve original count order
                smallest_levels = np.argsort(category_count, kind="mergesort")[
                    :-frequent_category_count
                ]
                infrequent_mask[smallest_levels] = True

        output = np.flatnonzero(infrequent_mask)
        return output if output.size > 0 else None
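
    # Sketch of the selection above with illustrative numbers: for
    # category_count = [5, 20, 10, 3], n_samples = 38, min_frequency = 6 and
    # max_categories = None, counts 5 and 3 fall below the threshold, so the
    # returned infrequent indices are [0, 3].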

    def _fit_infrequent_category_mapping(
        self, n_samples, category_counts, missing_indices
    ):
        """Fit infrequent categories.

        Defines the private attribute: `_default_to_infrequent_mappings`. For
        feature `i`, `_default_to_infrequent_mappings[i]` defines the mapping
        from the integer encoding returned by `super().transform()` into
        infrequent categories. If `_default_to_infrequent_mappings[i]` is None,
        there were no infrequent categories in the training set.

        For example if categories 0, 2 and 4 were frequent, while categories
        1, 3, 5 were infrequent for feature 7, then these categories are mapped
        to a single output:
        `_default_to_infrequent_mappings[7] = array([0, 3, 1, 3, 2, 3])`

        Defines private attribute: `_infrequent_indices`. `_infrequent_indices[i]`
        is an array of indices such that
        `categories_[i][_infrequent_indices[i]]` are all the infrequent category
        labels. If the feature `i` has no infrequent categories
        `_infrequent_indices[i]` is None.

        .. versionadded:: 1.1

        Parameters
        ----------
        n_samples : int
            Number of samples in training set.
        category_counts : list of ndarray
            `category_counts[i]` is the category counts corresponding to
            `self.categories_[i]`.
        missing_indices : dict
            Dict mapping from feature_idx to category index with a missing value.
        """
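        # Sketch with illustrative numbers: for counts [5, 20, 10, 3] and
        # min_frequency=6, `_identify_infrequent` returns [0, 3], and the
        # mapping built below is [2, 0, 1, 2]: the two frequent categories
        # keep codes 0 and 1 while the grouped infrequent ones share code 2.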

        # Remove missing value from counts, so it is not considered as infrequent
        if missing_indices:
            category_counts_ = []
            for feature_idx, count in enumerate(category_counts):
                if feature_idx in missing_indices:
                    category_counts_.append(
                        np.delete(count, missing_indices[feature_idx])
                    )
                else:
                    category_counts_.append(count)
        else:
            category_counts_ = category_counts

        self._infrequent_indices = [
            self._identify_infrequent(category_count, n_samples, col_idx)
            for col_idx, category_count in enumerate(category_counts_)
        ]

        # compute mapping from default mapping to infrequent mapping
        self._default_to_infrequent_mappings = []

        for feature_idx, infreq_idx in enumerate(self._infrequent_indices):
            cats = self.categories_[feature_idx]
            # no infrequent categories
            if infreq_idx is None:
                self._default_to_infrequent_mappings.append(None)
                continue

            n_cats = len(cats)
            if feature_idx in missing_indices:
                # Missing index was removed from this category when computing
                # infrequent indices, thus we need to decrease the number of
                # total categories when considering the infrequent mapping.
                n_cats -= 1

            # infrequent indices exist
            mapping = np.empty(n_cats, dtype=np.int64)
            n_infrequent_cats = infreq_idx.size

            # infrequent categories are mapped to the last element.
            n_frequent_cats = n_cats - n_infrequent_cats
            mapping[infreq_idx] = n_frequent_cats

            frequent_indices = np.setdiff1d(np.arange(n_cats), infreq_idx)
            mapping[frequent_indices] = np.arange(n_frequent_cats)

            self._default_to_infrequent_mappings.append(mapping)

    def _map_infrequent_categories(self, X_int, X_mask, ignore_category_indices):
        """Map infrequent categories to integer representing the infrequent category.

        This modifies X_int in-place. Values that were invalid based on `X_mask`
        are mapped to the infrequent category if there was an infrequent
        category for that feature.

        Parameters
        ----------
        X_int : ndarray of shape (n_samples, n_features)
            Integer encoded categories.

        X_mask : ndarray of shape (n_samples, n_features)
            Bool mask for valid values in `X_int`.

        ignore_category_indices : dict
            Dictionary mapping from feature_idx to category index to ignore.
            Ignored indexes will not be grouped and the original ordinal encoding
            will remain.
        """
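        # Sketch with illustrative numbers: if
        # _default_to_infrequent_mappings[i] = [0, 2, 1, 2] (categories 1 and
        # 3 are infrequent), a column X_int[:, i] = [3, 0, 1] is remapped via
        # np.take below to [2, 0, 2].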

        if not self._infrequent_enabled:
            return

        ignore_category_indices = ignore_category_indices or {}

        for col_idx in range(X_int.shape[1]):
            infrequent_idx = self._infrequent_indices[col_idx]
            if infrequent_idx is None:
                continue

            X_int[~X_mask[:, col_idx], col_idx] = infrequent_idx[0]
            if self.handle_unknown == "infrequent_if_exist":
                # All the unknown values are now mapped to the
                # infrequent_idx[0], which makes the unknown values valid
                # This is needed in `transform` when the encoding is formed
                # using `X_mask`.
                X_mask[:, col_idx] = True

        # Remaps encoding in `X_int` where the infrequent categories are
        # grouped together.
        for i, mapping in enumerate(self._default_to_infrequent_mappings):
            if mapping is None:
                continue

            if i in ignore_category_indices:
                # Update rows that are **not** ignored
                rows_to_update = X_int[:, i] != ignore_category_indices[i]
            else:
                rows_to_update = slice(None)

            X_int[rows_to_update, i] = np.take(mapping, X_int[rows_to_update, i])

    def _more_tags(self):
        return {"X_types": ["2darray", "categorical"], "allow_nan": True}


class OneHotEncoder(_BaseEncoder):
    """
    Encode categorical features as a one-hot numeric array.

    The input to this transformer should be an array-like of integers or
    strings, denoting the values taken on by categorical (discrete) features.
    The features are encoded using a one-hot (aka 'one-of-K' or 'dummy')
    encoding scheme. This creates a binary column for each category and
    returns a sparse matrix or dense array (depending on the ``sparse_output``
    parameter).

    By default, the encoder derives the categories based on the unique values
    in each feature. Alternatively, you can also specify the `categories`
    manually.

    This encoding is needed for feeding categorical data to many scikit-learn
    estimators, notably linear models and SVMs with the standard kernels.

    Note: a one-hot encoding of y labels should use a LabelBinarizer
    instead.

    Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
    For a comparison of different encoders, refer to:
    :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`.

    Parameters
    ----------
    categories : 'auto' or a list of array-like, default='auto'
        Categories (unique values) per feature:

        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories should not mix strings and numeric
          values within a single feature, and should be sorted in case of
          numeric values.

        The used categories can be found in the ``categories_`` attribute.

        .. versionadded:: 0.20

    drop : {'first', 'if_binary'} or an array-like of shape (n_features,), \
            default=None
        Specifies a methodology to use to drop one of the categories per
        feature. This is useful in situations where perfectly collinear
        features cause problems, such as when feeding the resulting data
        into an unregularized linear regression model.

        However, dropping one category breaks the symmetry of the original
        representation and can therefore induce a bias in downstream models,
        for instance for penalized linear classification or regression models.

        - None : retain all features (the default).
        - 'first' : drop the first category in each feature. If only one
          category is present, the feature will be dropped entirely.
        - 'if_binary' : drop the first category in each feature with two
          categories. Features with 1 or more than 2 categories are
          left intact.
        - array : ``drop[i]`` is the category in feature ``X[:, i]`` that
          should be dropped.

        When `max_categories` or `min_frequency` is configured to group
        infrequent categories, the dropping behavior is handled after the
        grouping.

        .. versionadded:: 0.21
           The parameter `drop` was added in 0.21.

        .. versionchanged:: 0.23
           The option `drop='if_binary'` was added in 0.23.

        .. versionchanged:: 1.1
           Support for dropping infrequent categories.

    sparse_output : bool, default=True
        When ``True``, it returns a :class:`scipy.sparse.csr_matrix`,
        i.e. a sparse matrix in "Compressed Sparse Row" (CSR) format.

        .. versionadded:: 1.2
           `sparse` was renamed to `sparse_output`.

    dtype : number type, default=np.float64
        Desired dtype of output.

    handle_unknown : {'error', 'ignore', 'infrequent_if_exist'}, \
            default='error'
        Specifies the way unknown categories are handled during :meth:`transform`.

        - 'error' : Raise an error if an unknown category is present during transform.
        - 'ignore' : When an unknown category is encountered during
          transform, the resulting one-hot encoded columns for this feature
          will be all zeros. In the inverse transform, an unknown category
          will be denoted as None.
        - 'infrequent_if_exist' : When an unknown category is encountered
          during transform, the resulting one-hot encoded columns for this
          feature will map to the infrequent category if it exists. The
          infrequent category will be mapped to the last position in the
          encoding. During inverse transform, an unknown category will be
          mapped to the category denoted `'infrequent'` if it exists. If the
          `'infrequent'` category does not exist, then :meth:`transform` and
          :meth:`inverse_transform` will handle an unknown category as with
          `handle_unknown='ignore'`. Infrequent categories exist based on
          `min_frequency` and `max_categories`. Read more in the
          :ref:`User Guide <encoder_infrequent_categories>`.

        .. versionchanged:: 1.1
           `'infrequent_if_exist'` was added to automatically handle unknown
           categories and infrequent categories.

    min_frequency : int or float, default=None
        Specifies the minimum frequency below which a category will be
        considered infrequent.

        - If `int`, categories with a smaller cardinality will be considered
          infrequent.

        - If `float`, categories with a smaller cardinality than
          `min_frequency * n_samples` will be considered infrequent.

        .. versionadded:: 1.1
           Read more in the :ref:`User Guide <encoder_infrequent_categories>`.

    max_categories : int, default=None
        Specifies an upper limit to the number of output features for each input
        feature when considering infrequent categories. If there are infrequent
        categories, `max_categories` includes the category representing the
        infrequent categories along with the frequent categories. If `None`,
        there is no limit to the number of output features.

        .. versionadded:: 1.1
           Read more in the :ref:`User Guide <encoder_infrequent_categories>`.

    feature_name_combiner : "concat" or callable, default="concat"
        Callable with signature `def callable(input_feature, category)` that returns a
        string. This is used to create feature names to be returned by
        :meth:`get_feature_names_out`.

        `"concat"` concatenates encoded feature name and category with
        `feature + "_" + str(category)`. E.g. feature X with values 1, 6, 7
        creates feature names `X_1, X_6, X_7`.

        .. versionadded:: 1.3

    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during fitting
        (in order of the features in X and corresponding with the output
        of ``transform``). This includes the category specified in ``drop``
        (if any).

    drop_idx_ : array of shape (n_features,)
        - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category
          to be dropped for each feature.
        - ``drop_idx_[i] = None`` if no category is to be dropped from the
          feature with index ``i``, e.g. when `drop='if_binary'` and the
          feature isn't binary.
        - ``drop_idx_ = None`` if all the transformed features will be
          retained.

        If infrequent categories are enabled by setting `min_frequency` or
        `max_categories` to a non-default value and `drop_idx_[i]` corresponds
        to an infrequent category, then the entire infrequent category is
        dropped.

        .. versionchanged:: 0.23
           Added the possibility to contain `None` values.

    infrequent_categories_ : list of ndarray
        Defined only if infrequent categories are enabled by setting
        `min_frequency` or `max_categories` to a non-default value.
        `infrequent_categories_[i]` are the infrequent categories for feature
        `i`. If the feature `i` has no infrequent categories
        `infrequent_categories_[i]` is None.

        .. versionadded:: 1.1

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 1.0

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    feature_name_combiner : callable or None
        Callable with signature `def callable(input_feature, category)` that returns a
        string. This is used to create feature names to be returned by
        :meth:`get_feature_names_out`.

        .. versionadded:: 1.3

    See Also
    --------
    OrdinalEncoder : Performs an ordinal (integer)
        encoding of the categorical features.
    TargetEncoder : Encodes categorical features using the target.
    sklearn.feature_extraction.DictVectorizer : Performs a one-hot encoding of
        dictionary items (also handles string-valued features).
    sklearn.feature_extraction.FeatureHasher : Performs an approximate one-hot
        encoding of dictionary items or strings.
    LabelBinarizer : Binarizes labels in a one-vs-all
        fashion.
    MultiLabelBinarizer : Transforms between iterable of
        iterables and a multilabel format, e.g. a (samples x classes) binary
        matrix indicating the presence of a class label.

    Examples
    --------
    Given a dataset with two features, we let the encoder find the unique
    values per feature and transform the data to a binary one-hot encoding.

    >>> from sklearn.preprocessing import OneHotEncoder

    One can discard categories not seen during `fit`:

    >>> enc = OneHotEncoder(handle_unknown='ignore')
    >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
    >>> enc.fit(X)
    OneHotEncoder(handle_unknown='ignore')
    >>> enc.categories_
    [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
    >>> enc.transform([['Female', 1], ['Male', 4]]).toarray()
    array([[1., 0., 1., 0., 0.],
           [0., 1., 0., 0., 0.]])
    >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])
    array([['Male', 1],
           [None, 2]], dtype=object)
    >>> enc.get_feature_names_out(['gender', 'group'])
    array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'], ...)

    One can always drop the first column for each feature:

    >>> drop_enc = OneHotEncoder(drop='first').fit(X)
    >>> drop_enc.categories_
    [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
    >>> drop_enc.transform([['Female', 1], ['Male', 2]]).toarray()
    array([[0., 0., 0.],
           [1., 1., 0.]])

    Or drop a column for features having only 2 categories:

    >>> drop_binary_enc = OneHotEncoder(drop='if_binary').fit(X)
    >>> drop_binary_enc.transform([['Female', 1], ['Male', 2]]).toarray()
    array([[0., 1., 0., 0.],
           [1., 0., 1., 0.]])

    One can change the way feature names are created.

    >>> def custom_combiner(feature, category):
    ...     return str(feature) + "_" + type(category).__name__ + "_" + str(category)
    >>> custom_fnames_enc = OneHotEncoder(feature_name_combiner=custom_combiner).fit(X)
    >>> custom_fnames_enc.get_feature_names_out()
    array(['x0_str_Female', 'x0_str_Male', 'x1_int_1', 'x1_int_2', 'x1_int_3'],
          dtype=object)

    Infrequent categories are enabled by setting `max_categories` or `min_frequency`.

    >>> import numpy as np
    >>> X = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object).T
    >>> ohe = OneHotEncoder(max_categories=3, sparse_output=False).fit(X)
    >>> ohe.infrequent_categories_
    [array(['a', 'd'], dtype=object)]
    >>> ohe.transform([["a"], ["b"]])
    array([[0., 0., 1.],
           [1., 0., 0.]])
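
    When an infrequent category exists, `handle_unknown="infrequent_if_exist"`
    routes unknown categories to it at transform time (per the parameter
    description above); a small sketch reusing `X` from the previous example:

    >>> ohe = OneHotEncoder(max_categories=3, sparse_output=False,
    ...                     handle_unknown="infrequent_if_exist").fit(X)
    >>> ohe.transform([["e"]])
    array([[0., 0., 1.]])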

    """

    _parameter_constraints: dict = {
        "categories": [StrOptions({"auto"}), list],
        "drop": [StrOptions({"first", "if_binary"}), "array-like", None],
        "dtype": "no_validation",  # validation delegated to numpy
        "handle_unknown": [StrOptions({"error", "ignore", "infrequent_if_exist"})],
        "max_categories": [Interval(Integral, 1, None, closed="left"), None],
        "min_frequency": [
            Interval(Integral, 1, None, closed="left"),
            Interval(RealNotInt, 0, 1, closed="neither"),
            None,
        ],
        "sparse_output": ["boolean"],
        "feature_name_combiner": [StrOptions({"concat"}), callable],
    }

    def __init__(
        self,
        *,
        categories="auto",
        drop=None,
        sparse_output=True,
        dtype=np.float64,
        handle_unknown="error",
        min_frequency=None,
        max_categories=None,
        feature_name_combiner="concat",
    ):
        self.categories = categories
        self.sparse_output = sparse_output
        self.dtype = dtype
        self.handle_unknown = handle_unknown
        self.drop = drop
        self.min_frequency = min_frequency
        self.max_categories = max_categories
        self.feature_name_combiner = feature_name_combiner

    def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx):
        """Convert `drop_idx` into the index for infrequent categories.

        If there are no infrequent categories, then `drop_idx` is
        returned. This method is called in `_set_drop_idx` when the `drop`
        parameter is an array-like.
        """
        if not self._infrequent_enabled:
            return drop_idx

        default_to_infrequent = self._default_to_infrequent_mappings[feature_idx]
        if default_to_infrequent is None:
            return drop_idx

        # Raise error when explicitly dropping a category that is infrequent
        infrequent_indices = self._infrequent_indices[feature_idx]
        if infrequent_indices is not None and drop_idx in infrequent_indices:
            categories = self.categories_[feature_idx]
            raise ValueError(
                f"Unable to drop category {categories[drop_idx].item()!r} from"
                f" feature {feature_idx} because it is infrequent"
            )
        return default_to_infrequent[drop_idx]

    def _set_drop_idx(self):
        """Compute the drop indices associated with `self.categories_`.

        If `self.drop` is:
        - `None`, No categories have been dropped.
        - `'first'`, All zeros to drop the first category.
        - `'if_binary'`, All zeros if the feature is binary and `None`
          otherwise.
        - array-like, The indices of the categories that match the
          categories in `self.drop`. If the dropped category is an infrequent
          category, then the index for the infrequent category is used. This
          means that the entire infrequent category is dropped.

        This method defines a public `drop_idx_` and a private
        `_drop_idx_after_grouping`.

        - `drop_idx_`: Public facing API that references the drop category in
          `self.categories_`.
        - `_drop_idx_after_grouping`: Used internally to drop categories *after* the
          infrequent categories are grouped together.

        If there are no infrequent categories or drop is `None`, then
        `drop_idx_=_drop_idx_after_grouping`.
        """
        if self.drop is None:
            drop_idx_after_grouping = None
        elif isinstance(self.drop, str):
            if self.drop == "first":
                drop_idx_after_grouping = np.zeros(len(self.categories_), dtype=object)
            elif self.drop == "if_binary":
                n_features_out_no_drop = [len(cat) for cat in self.categories_]
                if self._infrequent_enabled:
                    for i, infreq_idx in enumerate(self._infrequent_indices):
                        if infreq_idx is None:
                            continue
                        n_features_out_no_drop[i] -= infreq_idx.size - 1

                drop_idx_after_grouping = np.array(
                    [
                        0 if n_features_out == 2 else None
                        for n_features_out in n_features_out_no_drop
                    ],
                    dtype=object,
                )

        else:
            drop_array = np.asarray(self.drop, dtype=object)
            droplen = len(drop_array)

            if droplen != len(self.categories_):
                msg = (
                    "`drop` should have length equal to the number "
                    "of features ({}), got {}"
                )
                raise ValueError(msg.format(len(self.categories_), droplen))
            missing_drops = []
            drop_indices = []
            for feature_idx, (drop_val, cat_list) in enumerate(
                zip(drop_array, self.categories_)
            ):
                if not is_scalar_nan(drop_val):
                    drop_idx = np.where(cat_list == drop_val)[0]
                    if drop_idx.size:  # found drop idx
                        drop_indices.append(
                            self._map_drop_idx_to_infrequent(feature_idx, drop_idx[0])
                        )
                    else:
                        missing_drops.append((feature_idx, drop_val))
                    continue

                # drop_val is nan, find nan in categories manually
                if is_scalar_nan(cat_list[-1]):
                    drop_indices.append(
                        self._map_drop_idx_to_infrequent(feature_idx, cat_list.size - 1)
                    )
                else:  # nan is missing
                    missing_drops.append((feature_idx, drop_val))

            if any(missing_drops):
                msg = (
                    "The following categories were supposed to be "
                    "dropped, but were not found in the training "
                    "data.\n{}".format(
                        "\n".join(
                            [
                                "Category: {}, Feature: {}".format(val, idx)
                                for idx, val in missing_drops
                            ]
                        )
                    )
                )
                raise ValueError(msg)
            drop_idx_after_grouping = np.array(drop_indices, dtype=object)

        # `_drop_idx_after_grouping` are the categories to drop *after* the infrequent
        # categories are grouped together. If needed, we remap `drop_idx` back
        # to the categories seen in `self.categories_`.
        self._drop_idx_after_grouping = drop_idx_after_grouping

        if not self._infrequent_enabled or drop_idx_after_grouping is None:
            self.drop_idx_ = self._drop_idx_after_grouping
        else:
            drop_idx_ = []
            for feature_idx, drop_idx in enumerate(drop_idx_after_grouping):
                default_to_infrequent = self._default_to_infrequent_mappings[
                    feature_idx
                ]
                if drop_idx is None or default_to_infrequent is None:
                    orig_drop_idx = drop_idx
                else:
                    orig_drop_idx = np.flatnonzero(default_to_infrequent == drop_idx)[0]

                drop_idx_.append(orig_drop_idx)

            self.drop_idx_ = np.asarray(drop_idx_, dtype=object)

    def _compute_transformed_categories(self, i, remove_dropped=True):
        """Compute the transformed categories used for column `i`.

        1. If there are infrequent categories, the category is named
           'infrequent_sklearn'.
        2. Dropped columns are removed when remove_dropped=True.
        """
        cats = self.categories_[i]

        if self._infrequent_enabled:
            infreq_map = self._default_to_infrequent_mappings[i]
            if infreq_map is not None:
                frequent_mask = infreq_map < infreq_map.max()
                infrequent_cat = "infrequent_sklearn"
                # infrequent category is always at the end
                cats = np.concatenate(
                    (cats[frequent_mask], np.array([infrequent_cat], dtype=object))
                )

        if remove_dropped:
            cats = self._remove_dropped_categories(cats, i)
        return cats
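
    # Sketch with illustrative values: for categories_[i] = ['a', 'b', 'c',
    # 'd'] and _default_to_infrequent_mappings[i] = [2, 0, 1, 2] ('a' and 'd'
    # infrequent), the transformed categories are
    # ['b', 'c', 'infrequent_sklearn'].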

    def _remove_dropped_categories(self, categories, i):
        """Remove dropped categories."""
        if (
            self._drop_idx_after_grouping is not None
            and self._drop_idx_after_grouping[i] is not None
        ):
            return np.delete(categories, self._drop_idx_after_grouping[i])
        return categories

    def _compute_n_features_outs(self):
        """Compute the n_features_out for each input feature."""
        output = [len(cats) for cats in self.categories_]

        if self._drop_idx_after_grouping is not None:
            for i, drop_idx in enumerate(self._drop_idx_after_grouping):
                if drop_idx is not None:
                    output[i] -= 1

        if not self._infrequent_enabled:
            return output

        # infrequent is enabled, the number of features out is reduced
        # because the infrequent categories are grouped together
        for i, infreq_idx in enumerate(self._infrequent_indices):
            if infreq_idx is None:
                continue
            output[i] -= infreq_idx.size - 1

        return output

955 

956 @_fit_context(prefer_skip_nested_validation=True) 

957 def fit(self, X, y=None): 

958 """ 

959 Fit OneHotEncoder to X. 

960 

961 Parameters 

962 ---------- 

963 X : array-like of shape (n_samples, n_features) 

964 The data to determine the categories of each feature. 

965 

966 y : None 

967 Ignored. This parameter exists only for compatibility with 

968 :class:`~sklearn.pipeline.Pipeline`. 

969 

970 Returns 

971 ------- 

972 self 

973 Fitted encoder. 

974 """ 

975 self._fit( 

976 X, 

977 handle_unknown=self.handle_unknown, 

978 force_all_finite="allow-nan", 

979 ) 

980 self._set_drop_idx() 

981 self._n_features_outs = self._compute_n_features_outs() 

982 return self 

983 

    def transform(self, X):
        """
        Transform X using one-hot encoding.

        If `sparse_output=True` (default), it returns an instance of
        :class:`scipy.sparse.csr_matrix` (CSR format).

        If there are infrequent categories for a feature, set by specifying
        `max_categories` or `min_frequency`, the infrequent categories are
        grouped into a single category.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to encode.

        Returns
        -------
        X_out : {ndarray, sparse matrix} of shape \
                (n_samples, n_encoded_features)
            Transformed input. If `sparse_output=True`, a sparse matrix will be
            returned.
        """
        check_is_fitted(self)
        transform_output = _get_output_config("transform", estimator=self)["dense"]
        if transform_output != "default" and self.sparse_output:
            capitalize_transform_output = transform_output.capitalize()
            raise ValueError(
                f"{capitalize_transform_output} output does not support sparse data."
                f" Set sparse_output=False to output {transform_output} dataframes or"
                f" disable {capitalize_transform_output} output via"
                ' `ohe.set_output(transform="default")`.'
            )

        # validation of X happens in _check_X called by _transform
        warn_on_unknown = self.drop is not None and self.handle_unknown in {
            "ignore",
            "infrequent_if_exist",
        }
        X_int, X_mask = self._transform(
            X,
            handle_unknown=self.handle_unknown,
            force_all_finite="allow-nan",
            warn_on_unknown=warn_on_unknown,
        )

        n_samples, n_features = X_int.shape

        if self._drop_idx_after_grouping is not None:
            to_drop = self._drop_idx_after_grouping.copy()
            # We remove all the dropped categories from mask, and decrement all
            # categories that occur after them to avoid an empty column.
            keep_cells = X_int != to_drop
            for i, cats in enumerate(self.categories_):
                # drop='if_binary' but feature isn't binary
                if to_drop[i] is None:
                    # set to cardinality so that nothing is dropped from X_int
                    to_drop[i] = len(cats)

            to_drop = to_drop.reshape(1, -1)
            X_int[X_int > to_drop] -= 1
            X_mask &= keep_cells

        mask = X_mask.ravel()
        feature_indices = np.cumsum([0] + self._n_features_outs)
        indices = (X_int + feature_indices[:-1]).ravel()[mask]

        indptr = np.empty(n_samples + 1, dtype=int)
        indptr[0] = 0
        np.sum(X_mask, axis=1, out=indptr[1:], dtype=indptr.dtype)
        np.cumsum(indptr[1:], out=indptr[1:])
        data = np.ones(indptr[-1])
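
        # Illustration of the CSR assembly (assumed small input): with two
        # features having 2 and 3 output columns, X_int = [[0, 2], [1, 0]]
        # and an all-True X_mask, feature_indices = [0, 2, 5],
        # indices = [0, 4, 1, 2] and indptr = [0, 2, 4]; each row of the
        # resulting (2, 5) CSR matrix then has ones at its two active columns.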

        out = sparse.csr_matrix(
            (data, indices, indptr),
            shape=(n_samples, feature_indices[-1]),
            dtype=self.dtype,
        )
        if not self.sparse_output:
            return out.toarray()
        else:
            return out

    def inverse_transform(self, X):
        """
        Convert the data back to the original representation.

        When unknown categories are encountered (all zeros in the
        one-hot encoding), ``None`` is used to represent this category. If the
        feature with the unknown category has a dropped category, the dropped
        category will be its inverse.

        For a given input feature, if there is an infrequent category,
        'infrequent_sklearn' will be used to represent the infrequent category.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape \
                (n_samples, n_encoded_features)
            The transformed data.

        Returns
        -------
        X_tr : ndarray of shape (n_samples, n_features)
            Inverse transformed array.
        """
        check_is_fitted(self)
        X = check_array(X, accept_sparse="csr")

        n_samples, _ = X.shape
        n_features = len(self.categories_)

        n_features_out = np.sum(self._n_features_outs)

        # validate shape of passed X
        msg = (
            "Shape of the passed X data is not correct. Expected {0} columns, got {1}."
        )
        if X.shape[1] != n_features_out:
            raise ValueError(msg.format(n_features_out, X.shape[1]))

        transformed_features = [
            self._compute_transformed_categories(i, remove_dropped=False)
            for i, _ in enumerate(self.categories_)
        ]

        # create resulting array of appropriate dtype
        dt = np.result_type(*[cat.dtype for cat in transformed_features])
        X_tr = np.empty((n_samples, n_features), dtype=dt)

        j = 0
        found_unknown = {}

        if self._infrequent_enabled:
            infrequent_indices = self._infrequent_indices
        else:
            infrequent_indices = [None] * n_features

        for i in range(n_features):
            cats_wo_dropped = self._remove_dropped_categories(
                transformed_features[i], i
            )
            n_categories = cats_wo_dropped.shape[0]

            # Only happens if there was a column with a unique
            # category. In this case we just fill the column with this
            # unique category value.
            if n_categories == 0:
                X_tr[:, i] = self.categories_[i][self._drop_idx_after_grouping[i]]
                j += n_categories
                continue
            sub = X[:, j : j + n_categories]
            # for sparse X argmax returns 2D matrix, ensure 1D array
            labels = np.asarray(sub.argmax(axis=1)).flatten()
            X_tr[:, i] = cats_wo_dropped[labels]
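            # e.g. (sketch): for cats_wo_dropped = ['Female', 'Male'] and a
            # one-hot block sub = [[1, 0], [0, 1]], argmax yields labels
            # [0, 1], decoding the column to ['Female', 'Male']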

            if self.handle_unknown == "ignore" or (
                self.handle_unknown == "infrequent_if_exist"
                and infrequent_indices[i] is None
            ):
                unknown = np.asarray(sub.sum(axis=1) == 0).flatten()
                # ignored unknown categories: we have a row of all zero
                if unknown.any():
                    # if categories were dropped then unknown categories will
                    # be mapped to the dropped category
                    if (
                        self._drop_idx_after_grouping is None
                        or self._drop_idx_after_grouping[i] is None
                    ):
                        found_unknown[i] = unknown
                    else:
                        X_tr[unknown, i] = self.categories_[i][
                            self._drop_idx_after_grouping[i]
                        ]
            else:
                dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
                if dropped.any():
                    if self._drop_idx_after_grouping is None:
                        all_zero_samples = np.flatnonzero(dropped)
                        raise ValueError(
                            f"Samples {all_zero_samples} can not be inverted "
                            "when drop=None and handle_unknown='error' "
                            "because they contain all zeros"
                        )
                    # we can safely assume that all of the nulls in each column
                    # are the dropped value
                    drop_idx = self._drop_idx_after_grouping[i]
                    X_tr[dropped, i] = transformed_features[i][drop_idx]

            j += n_categories

        # if ignored are found: potentially need to upcast result to
        # insert None values
        if found_unknown:
            if X_tr.dtype != object:
                X_tr = X_tr.astype(object)

            for idx, mask in found_unknown.items():
                X_tr[mask, idx] = None

        return X_tr

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self)
        input_features = _check_feature_names_in(self, input_features)
        cats = [
            self._compute_transformed_categories(i)
            for i, _ in enumerate(self.categories_)
        ]

        name_combiner = self._check_get_feature_name_combiner()
        feature_names = []
        for i in range(len(cats)):
            names = [name_combiner(input_features[i], t) for t in cats[i]]
            feature_names.extend(names)

        return np.array(feature_names, dtype=object)

    def _check_get_feature_name_combiner(self):
        if self.feature_name_combiner == "concat":
            return lambda feature, category: feature + "_" + str(category)
        else:  # callable
            dry_run_combiner = self.feature_name_combiner("feature", "category")
            if not isinstance(dry_run_combiner, str):
                raise TypeError(
                    "When `feature_name_combiner` is a callable, it should return a "
                    f"Python string. Got {type(dry_run_combiner)} instead."
                )
            return self.feature_name_combiner


class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder):
    """
    Encode categorical features as an integer array.

    The input to this transformer should be an array-like of integers or
    strings, denoting the values taken on by categorical (discrete) features.
    The features are converted to ordinal integers. This results in
    a single column of integers (0 to n_categories - 1) per feature.

    Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
    For a comparison of different encoders, refer to:
    :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`.

    .. versionadded:: 0.20

    Parameters
    ----------
    categories : 'auto' or a list of array-like, default='auto'
        Categories (unique values) per feature:

        - 'auto' : Determine categories automatically from the training data.
        - list : ``categories[i]`` holds the categories expected in the ith
          column. The passed categories should not mix strings and numeric
          values, and should be sorted in case of numeric values.

        The used categories can be found in the ``categories_`` attribute.

    dtype : number type, default=np.float64
        Desired dtype of output.

    handle_unknown : {'error', 'use_encoded_value'}, default='error'
        When set to 'error' an error will be raised in case an unknown
        categorical feature is present during transform. When set to
        'use_encoded_value', the encoded value of unknown categories will be
        set to the value given for the parameter `unknown_value`. In
        :meth:`inverse_transform`, an unknown category will be denoted as None.

        .. versionadded:: 0.24

    unknown_value : int or np.nan, default=None
        When the parameter handle_unknown is set to 'use_encoded_value', this
        parameter is required and will set the encoded value of unknown
        categories. It has to be distinct from the values used to encode any of
        the categories in `fit`. If set to np.nan, the `dtype` parameter must
        be a float dtype.

        .. versionadded:: 0.24

    encoded_missing_value : int or np.nan, default=np.nan
        Encoded value of missing categories. If set to `np.nan`, then the `dtype`
        parameter must be a float dtype.

        .. versionadded:: 1.1

    min_frequency : int or float, default=None
        Specifies the minimum frequency below which a category will be
        considered infrequent.

        - If `int`, categories with a smaller cardinality will be considered
          infrequent.

        - If `float`, categories with a smaller cardinality than
          `min_frequency * n_samples` will be considered infrequent.

        .. versionadded:: 1.3
           Read more in the :ref:`User Guide <encoder_infrequent_categories>`.

    max_categories : int, default=None
        Specifies an upper limit to the number of output categories for each input
        feature when considering infrequent categories. If there are infrequent
        categories, `max_categories` includes the category representing the
        infrequent categories along with the frequent categories. If `None`,
        there is no limit to the number of output features.

        `max_categories` does **not** take into account missing or unknown
        categories. Setting `unknown_value` or `encoded_missing_value` to an
        integer will increase the number of unique integer codes by one each.
        This can result in up to `max_categories + 2` integer codes.

        .. versionadded:: 1.3
           Read more in the :ref:`User Guide <encoder_infrequent_categories>`.

    Attributes
    ----------
    categories_ : list of arrays
        The categories of each feature determined during ``fit`` (in order of
        the features in X and corresponding with the output of ``transform``).
        This does not include categories that weren't seen during ``fit``.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 1.0

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    infrequent_categories_ : list of ndarray
        Defined only if infrequent categories are enabled by setting
        `min_frequency` or `max_categories` to a non-default value.
        `infrequent_categories_[i]` are the infrequent categories for feature
        `i`. If the feature `i` has no infrequent categories
        `infrequent_categories_[i]` is None.

        .. versionadded:: 1.3

    See Also
    --------
    OneHotEncoder : Performs a one-hot encoding of categorical features. This encoding
        is suitable for low to medium cardinality categorical variables, both in
        supervised and unsupervised settings.
    TargetEncoder : Encodes categorical features using supervised signal
        in a classification or regression pipeline. This encoding is typically
        suitable for high cardinality categorical variables.
    LabelEncoder : Encodes target labels with values between 0 and
        ``n_classes-1``.

    Notes
    -----
    With a high proportion of `nan` values, inferring categories becomes slow with
    Python versions before 3.10. The handling of `nan` values was improved
    from Python 3.10 onwards (cf.
    `bpo-43475 <https://github.com/python/cpython/issues/87641>`_).

    Examples
    --------
    Given a dataset with two features, we let the encoder find the unique
    values per feature and transform the data to an ordinal encoding.

    >>> from sklearn.preprocessing import OrdinalEncoder
    >>> enc = OrdinalEncoder()
    >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
    >>> enc.fit(X)
    OrdinalEncoder()
    >>> enc.categories_
    [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
    >>> enc.transform([['Female', 3], ['Male', 1]])
    array([[0., 2.],
           [1., 0.]])

    >>> enc.inverse_transform([[1, 0], [0, 1]])
    array([['Male', 1],
           ['Female', 2]], dtype=object)
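
    Unknown categories seen at transform time can be encoded to a dedicated
    value instead of raising an error (a sketch following the
    `handle_unknown='use_encoded_value'` description above):

    >>> unk_enc = OrdinalEncoder(handle_unknown='use_encoded_value',
    ...                          unknown_value=-1).fit(X)
    >>> unk_enc.transform([['Female', 4]])
    array([[ 0., -1.]])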

    By default, :class:`OrdinalEncoder` is lenient towards missing values by
    propagating them.

    >>> import numpy as np
    >>> X = [['Male', 1], ['Female', 3], ['Female', np.nan]]
    >>> enc.fit_transform(X)
    array([[ 1.,  0.],
           [ 0.,  1.],
           [ 0., nan]])

    You can use the parameter `encoded_missing_value` to encode missing values.

    >>> enc.set_params(encoded_missing_value=-1).fit_transform(X)
    array([[ 1.,  0.],
           [ 0.,  1.],
           [ 0., -1.]])

    Infrequent categories are enabled by setting `max_categories` or `min_frequency`.
    In the following example, "a" and "d" are considered infrequent and grouped
    together into a single category, "b" and "c" are their own categories, unknown
    values are encoded as 3 and missing values are encoded as 4.

    >>> X_train = np.array(
    ...     [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3 + [np.nan]],
    ...     dtype=object).T
    >>> enc = OrdinalEncoder(
    ...     handle_unknown="use_encoded_value", unknown_value=3,
    ...     max_categories=3, encoded_missing_value=4)
    >>> _ = enc.fit(X_train)
    >>> X_test = np.array([["a"], ["b"], ["c"], ["d"], ["e"], [np.nan]], dtype=object)
    >>> enc.transform(X_test)
    array([[2.],
           [0.],
           [1.],
           [2.],
           [3.],
           [4.]])
    """

    _parameter_constraints: dict = {
        "categories": [StrOptions({"auto"}), list],
        "dtype": "no_validation",  # validation delegated to numpy
        "encoded_missing_value": [Integral, type(np.nan)],
        "handle_unknown": [StrOptions({"error", "use_encoded_value"})],
        "unknown_value": [Integral, type(np.nan), None],
        "max_categories": [Interval(Integral, 1, None, closed="left"), None],
        "min_frequency": [
            Interval(Integral, 1, None, closed="left"),
            Interval(RealNotInt, 0, 1, closed="neither"),
            None,
        ],
    }

    def __init__(
        self,
        *,
        categories="auto",
        dtype=np.float64,
        handle_unknown="error",
        unknown_value=None,
        encoded_missing_value=np.nan,
        min_frequency=None,
        max_categories=None,
    ):
        self.categories = categories
        self.dtype = dtype
        self.handle_unknown = handle_unknown
        self.unknown_value = unknown_value
        self.encoded_missing_value = encoded_missing_value
        self.min_frequency = min_frequency
        self.max_categories = max_categories

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """
        Fit the OrdinalEncoder to X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to determine the categories of each feature.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        Returns
        -------
        self : object
            Fitted encoder.
        """
        if self.handle_unknown == "use_encoded_value":
            if is_scalar_nan(self.unknown_value):
                if np.dtype(self.dtype).kind != "f":
                    raise ValueError(
                        "When unknown_value is np.nan, the dtype "
                        "parameter should be "
                        f"a float dtype. Got {self.dtype}."
                    )
            elif not isinstance(self.unknown_value, numbers.Integral):
                raise TypeError(
                    "unknown_value should be an integer or "
                    "np.nan when "
                    "handle_unknown is 'use_encoded_value', "
                    f"got {self.unknown_value}."
                )
        elif self.unknown_value is not None:
            raise TypeError(
                "unknown_value should only be set when "
                "handle_unknown is 'use_encoded_value', "
                f"got {self.unknown_value}."
            )

        # `_fit` will only raise an error when `self.handle_unknown="error"`
        fit_results = self._fit(
            X,
            handle_unknown=self.handle_unknown,
            force_all_finite="allow-nan",
            return_and_ignore_missing_for_infrequent=True,
        )
        self._missing_indices = fit_results["missing_indices"]

        cardinalities = [len(categories) for categories in self.categories_]
        if self._infrequent_enabled:
            # Cardinality decreases because the infrequent categories are grouped
            # together
            for feature_idx, infrequent in enumerate(self.infrequent_categories_):
                if infrequent is not None:
                    cardinalities[feature_idx] -= len(infrequent)

        # missing values are not considered part of the cardinality
        # when considering unknown categories or encoded_missing_value
        for cat_idx, categories_for_idx in enumerate(self.categories_):
            if is_scalar_nan(categories_for_idx[-1]):
                cardinalities[cat_idx] -= 1

        if self.handle_unknown == "use_encoded_value":
            for cardinality in cardinalities:
                if 0 <= self.unknown_value < cardinality:
                    raise ValueError(
                        "The used value for unknown_value "
                        f"{self.unknown_value} is one of the "
                        "values already used for encoding the "
                        "seen categories."
                    )

        if self._missing_indices:
            if np.dtype(self.dtype).kind != "f" and is_scalar_nan(
                self.encoded_missing_value
            ):
                raise ValueError(
                    "There are missing values in features "
                    f"{list(self._missing_indices)}. For OrdinalEncoder to "
                    f"encode missing values with dtype: {self.dtype}, set "
                    "encoded_missing_value to a non-nan value, or "
                    "set dtype to a float"
                )

            if not is_scalar_nan(self.encoded_missing_value):
                # Features are invalid when they contain a missing category
                # and encoded_missing_value was already used to encode a
                # known category
                invalid_features = [
                    cat_idx
                    for cat_idx, cardinality in enumerate(cardinalities)
                    if cat_idx in self._missing_indices
                    and 0 <= self.encoded_missing_value < cardinality
                ]

                if invalid_features:
                    # Use feature names if they are available
                    if hasattr(self, "feature_names_in_"):
                        invalid_features = self.feature_names_in_[invalid_features]
                    raise ValueError(
                        f"encoded_missing_value ({self.encoded_missing_value}) "
                        "is already used to encode a known category in features: "
                        f"{invalid_features}"
                    )

        return self

    def transform(self, X):
        """
        Transform X to ordinal codes.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to encode.

        Returns
        -------
        X_out : ndarray of shape (n_samples, n_features)
            Transformed input.
        """
        check_is_fitted(self, "categories_")
        X_int, X_mask = self._transform(
            X,
            handle_unknown=self.handle_unknown,
            force_all_finite="allow-nan",
            ignore_category_indices=self._missing_indices,
        )
        X_trans = X_int.astype(self.dtype, copy=False)

        for cat_idx, missing_idx in self._missing_indices.items():
            X_missing_mask = X_int[:, cat_idx] == missing_idx
            X_trans[X_missing_mask, cat_idx] = self.encoded_missing_value

        # create separate category for unknown values
        if self.handle_unknown == "use_encoded_value":
            X_trans[~X_mask] = self.unknown_value
        return X_trans

    def inverse_transform(self, X):
        """
        Convert the data back to the original representation.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_encoded_features)
            The transformed data.

        Returns
        -------
        X_tr : ndarray of shape (n_samples, n_features)
            Inverse transformed array.
        """
        check_is_fitted(self)
        X = check_array(X, force_all_finite="allow-nan")

        n_samples, _ = X.shape
        n_features = len(self.categories_)

        # validate shape of passed X
        msg = (
            "Shape of the passed X data is not correct. Expected {0} columns, got {1}."
        )
        if X.shape[1] != n_features:
            raise ValueError(msg.format(n_features, X.shape[1]))

        # create resulting array of appropriate dtype
        dt = np.result_type(*[cat.dtype for cat in self.categories_])
        X_tr = np.empty((n_samples, n_features), dtype=dt)

        found_unknown = {}
        infrequent_masks = {}

        infrequent_indices = getattr(self, "_infrequent_indices", None)

        for i in range(n_features):
            labels = X[:, i]

            # replace values of X[:, i] that were nan with actual indices
            if i in self._missing_indices:
                X_i_mask = _get_mask(labels, self.encoded_missing_value)
                labels[X_i_mask] = self._missing_indices[i]

            rows_to_update = slice(None)
            categories = self.categories_[i]

            if infrequent_indices is not None and infrequent_indices[i] is not None:
                # Compute mask for frequent categories
                infrequent_encoding_value = len(categories) - len(infrequent_indices[i])
                infrequent_masks[i] = labels == infrequent_encoding_value
                rows_to_update = ~infrequent_masks[i]

                # Remap categories to be only frequent categories. The infrequent
                # categories will be mapped to "infrequent_sklearn" later
                frequent_categories_mask = np.ones_like(categories, dtype=bool)
                frequent_categories_mask[infrequent_indices[i]] = False
                categories = categories[frequent_categories_mask]
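
                # Sketch with illustrative values: for categories
                # ['a', 'b', 'c', 'd'] with infrequent indices [0, 3], codes
                # 0 and 1 decode to 'b' and 'c', while code 2 (the
                # infrequent_encoding_value) is filled in below as
                # 'infrequent_sklearn'.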

            if self.handle_unknown == "use_encoded_value":
                unknown_labels = _get_mask(labels, self.unknown_value)
                found_unknown[i] = unknown_labels

                known_labels = ~unknown_labels
                if isinstance(rows_to_update, np.ndarray):
                    rows_to_update &= known_labels
                else:
                    rows_to_update = known_labels

            labels_int = labels[rows_to_update].astype("int64", copy=False)
            X_tr[rows_to_update, i] = categories[labels_int]

        if found_unknown or infrequent_masks:
            X_tr = X_tr.astype(object, copy=False)

        # insert None values for unknown values
        if found_unknown:
            for idx, mask in found_unknown.items():
                X_tr[mask, idx] = None

        if infrequent_masks:
            for idx, mask in infrequent_masks.items():
                X_tr[mask, idx] = "infrequent_sklearn"

        return X_tr