Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scikit_learn-1.4.dev0-py3.8-linux-x86_64.egg/sklearn/preprocessing/_label.py: 17%

275 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-12 06:31 +0000

1# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr> 

2# Mathieu Blondel <mathieu@mblondel.org> 

3# Olivier Grisel <olivier.grisel@ensta.org> 

4# Andreas Mueller <amueller@ais.uni-bonn.de> 

5# Joel Nothman <joel.nothman@gmail.com> 

6# Hamzeh Alsalhi <ha258@cornell.edu> 

7# License: BSD 3 clause 

8 

9import array 

10import itertools 

11import warnings 

12from collections import defaultdict 

13from numbers import Integral 

14 

15import numpy as np 

16import scipy.sparse as sp 

17 

18from ..base import BaseEstimator, TransformerMixin, _fit_context 

19from ..utils import column_or_1d 

20from ..utils._encode import _encode, _unique 

21from ..utils._param_validation import Interval, validate_params 

22from ..utils.multiclass import type_of_target, unique_labels 

23from ..utils.sparsefuncs import min_max_axis 

24from ..utils.validation import _num_samples, check_array, check_is_fitted 

25 

26__all__ = [ 

27 "label_binarize", 

28 "LabelBinarizer", 

29 "LabelEncoder", 

30 "MultiLabelBinarizer", 

31] 

32 

33 

class LabelEncoder(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):
    """Encode target labels with value between 0 and n_classes-1.

    This transformer is meant for target values, *i.e.* `y`, rather than
    for the input `X`.

    Read more in the :ref:`User Guide <preprocessing_targets>`.

    .. versionadded:: 0.12

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        Holds the label for each class.

    See Also
    --------
    OrdinalEncoder : Encode categorical features using an ordinal encoding
        scheme.
    OneHotEncoder : Encode categorical features as a one-hot numeric array.

    Examples
    --------
    `LabelEncoder` can be used to normalize labels.

    >>> from sklearn.preprocessing import LabelEncoder
    >>> le = LabelEncoder()
    >>> le.fit([1, 2, 2, 6])
    LabelEncoder()
    >>> le.classes_
    array([1, 2, 6])
    >>> le.transform([1, 1, 2, 6])
    array([0, 0, 1, 2]...)
    >>> le.inverse_transform([0, 0, 1, 2])
    array([1, 1, 2, 6])

    It can also be used to transform non-numerical labels (as long as they are
    hashable and comparable) to numerical labels.

    >>> le = LabelEncoder()
    >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
    LabelEncoder()
    >>> list(le.classes_)
    ['amsterdam', 'paris', 'tokyo']
    >>> le.transform(["tokyo", "tokyo", "paris"])
    array([2, 2, 1]...)
    >>> list(le.inverse_transform([2, 2, 1]))
    ['tokyo', 'tokyo', 'paris']
    """

    def fit(self, y):
        """Learn the set of distinct labels present in `y`.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : returns an instance of self.
            Fitted label encoder.
        """
        labels = column_or_1d(y, warn=True)
        self.classes_ = _unique(labels)
        return self

    def fit_transform(self, y):
        """Learn the distinct labels of `y` and return `y` encoded.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        y : array-like of shape (n_samples,)
            Encoded labels.
        """
        labels = column_or_1d(y, warn=True)
        # A single pass yields both the unique labels and the code of each sample.
        uniques, codes = _unique(labels, return_inverse=True)
        self.classes_ = uniques
        return codes

    def transform(self, y):
        """Encode labels using the classes stored in `classes_`.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        y : array-like of shape (n_samples,)
            Labels as normalized encodings.
        """
        check_is_fitted(self)
        labels = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
        if _num_samples(labels) != 0:
            return _encode(labels, uniques=self.classes_)
        # Empty input maps to an empty array.
        return np.array([])

    def inverse_transform(self, y):
        """Map encoded labels back to the original labels.

        Parameters
        ----------
        y : ndarray of shape (n_samples,)
            Target values.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            Original encoding.

        Raises
        ------
        ValueError
            If `y` contains a value outside ``range(len(classes_))``.
        """
        check_is_fitted(self)
        codes = column_or_1d(y, warn=True)
        # Empty input maps to an empty array.
        if _num_samples(codes) == 0:
            return np.array([])

        valid_codes = np.arange(len(self.classes_))
        unseen = np.setdiff1d(codes, valid_codes)
        if len(unseen):
            raise ValueError(f"y contains previously unseen labels: {unseen!s}")
        return self.classes_[np.asarray(codes)]

    def _more_tags(self):
        return {"X_types": ["1dlabels"]}

166 

167 

class LabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):
    """Binarize labels in a one-vs-all fashion.

    Several regression and binary classification algorithms are
    available in scikit-learn. A simple way to extend these algorithms
    to the multi-class classification case is to use the so-called
    one-vs-all scheme.

    At learning time, this simply consists in learning one regressor
    or binary classifier per class. In doing so, one needs to convert
    multi-class labels to binary labels (belong or does not belong
    to the class). `LabelBinarizer` makes this process easy with the
    transform method.

    At prediction time, one assigns the class for which the corresponding
    model gave the greatest confidence. `LabelBinarizer` makes this easy
    with the :meth:`inverse_transform` method.

    Read more in the :ref:`User Guide <preprocessing_targets>`.

    Parameters
    ----------
    neg_label : int, default=0
        Value with which negative labels must be encoded.

    pos_label : int, default=1
        Value with which positive labels must be encoded.

    sparse_output : bool, default=False
        True if the returned array from transform is desired to be in sparse
        CSR format.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        Holds the label for each class.

    y_type_ : str
        Represents the type of the target data as evaluated by
        :func:`~sklearn.utils.multiclass.type_of_target`. Possible type are
        'continuous', 'continuous-multioutput', 'binary', 'multiclass',
        'multiclass-multioutput', 'multilabel-indicator', and 'unknown'.

    sparse_input_ : bool
        `True` if the input data to transform is given as a sparse matrix,
        `False` otherwise.

    See Also
    --------
    label_binarize : Function to perform the transform operation of
        LabelBinarizer with fixed classes.
    OneHotEncoder : Encode categorical features using a one-hot aka one-of-K
        scheme.

    Examples
    --------
    >>> from sklearn.preprocessing import LabelBinarizer
    >>> lb = LabelBinarizer()
    >>> lb.fit([1, 2, 6, 4, 2])
    LabelBinarizer()
    >>> lb.classes_
    array([1, 2, 4, 6])
    >>> lb.transform([1, 6])
    array([[1, 0, 0, 0],
           [0, 0, 0, 1]])

    Binary targets transform to a column vector

    >>> lb = LabelBinarizer()
    >>> lb.fit_transform(['yes', 'no', 'no', 'yes'])
    array([[1],
           [0],
           [0],
           [1]])

    Passing a 2D matrix for multilabel classification

    >>> import numpy as np
    >>> lb.fit(np.array([[0, 1, 1], [1, 0, 0]]))
    LabelBinarizer()
    >>> lb.classes_
    array([0, 1, 2])
    >>> lb.transform([0, 1, 2, 1])
    array([[1, 0, 0],
           [0, 1, 0],
           [0, 0, 1],
           [0, 1, 0]])
    """

    _parameter_constraints: dict = {
        "neg_label": [Integral],
        "pos_label": [Integral],
        "sparse_output": ["boolean"],
    }

    def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False):
        self.neg_label = neg_label
        self.pos_label = pos_label
        self.sparse_output = sparse_output

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, y):
        """Fit label binarizer.

        Parameters
        ----------
        y : ndarray of shape (n_samples,) or (n_samples, n_classes)
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification.

        Returns
        -------
        self : object
            Returns the instance itself.

        Raises
        ------
        ValueError
            If ``neg_label >= pos_label``, if sparse output is requested with
            a zero `pos_label` or non-zero `neg_label`, if `y` is a
            multioutput target, or if `y` has no samples.
        """
        neg, pos = self.neg_label, self.pos_label

        if neg >= pos:
            raise ValueError(
                f"neg_label={neg} must be strictly less than pos_label={pos}."
            )

        # A sparse result can only leave the negative value implicit (zero).
        sparse_unsupported = pos == 0 or neg != 0
        if self.sparse_output and sparse_unsupported:
            raise ValueError(
                "Sparse binarization is only supported with non "
                "zero pos_label and zero neg_label, got "
                f"pos_label={pos} and neg_label={neg}"
            )

        self.y_type_ = type_of_target(y, input_name="y")
        if "multioutput" in self.y_type_:
            raise ValueError(
                "Multioutput target data is not supported with label binarization"
            )

        if _num_samples(y) == 0:
            raise ValueError("y has 0 samples: %r" % y)

        self.sparse_input_ = sp.issparse(y)
        self.classes_ = unique_labels(y)
        return self

    def fit_transform(self, y):
        """Fit label binarizer/transform multi-class labels to binary labels.

        The output of transform is sometimes referred to as
        the 1-of-K coding scheme.

        Parameters
        ----------
        y : {ndarray, sparse matrix} of shape (n_samples,) or \
                (n_samples, n_classes)
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification. Sparse matrix can be
            CSR, CSC, COO, DOK, or LIL.

        Returns
        -------
        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            Shape will be (n_samples, 1) for binary problems. Sparse matrix
            will be of CSR format.
        """
        fitted = self.fit(y)
        return fitted.transform(y)

    def transform(self, y):
        """Transform multi-class labels to binary labels.

        The output of transform is sometimes referred to by some authors as
        the 1-of-K coding scheme.

        Parameters
        ----------
        y : {array, sparse matrix} of shape (n_samples,) or \
                (n_samples, n_classes)
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification. Sparse matrix can be
            CSR, CSC, COO, DOK, or LIL.

        Returns
        -------
        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            Shape will be (n_samples, 1) for binary problems. Sparse matrix
            will be of CSR format.

        Raises
        ------
        ValueError
            If `y` is multilabel but the binarizer was fitted on
            non-multilabel data.
        """
        check_is_fitted(self)

        if type_of_target(y).startswith("multilabel"):
            if not self.y_type_.startswith("multilabel"):
                raise ValueError("The object was not fitted with multilabel input.")

        # The actual encoding is delegated to the functional API.
        return label_binarize(
            y,
            classes=self.classes_,
            pos_label=self.pos_label,
            neg_label=self.neg_label,
            sparse_output=self.sparse_output,
        )

    def inverse_transform(self, Y, threshold=None):
        """Transform binary labels back to multi-class labels.

        Parameters
        ----------
        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            Target values. All sparse matrices are converted to CSR before
            inverse transformation.

        threshold : float, default=None
            Threshold used in the binary and multi-label cases.

            Use 0 when ``Y`` contains the output of :term:`decision_function`
            (classifier).
            Use 0.5 when ``Y`` contains the output of :term:`predict_proba`.

            If None, the threshold is assumed to be half way between
            neg_label and pos_label.

        Returns
        -------
        y : {ndarray, sparse matrix} of shape (n_samples,)
            Target values. Sparse matrix will be of CSR format.

        Notes
        -----
        In the case when the binary labels are fractional
        (probabilistic), :meth:`inverse_transform` chooses the class with the
        greatest value. Typically, this allows to use the output of a
        linear model's :term:`decision_function` method directly as the input
        of :meth:`inverse_transform`.
        """
        check_is_fitted(self)

        if threshold is None:
            # Default cut-off: midpoint between the two encoding values.
            threshold = (self.pos_label + self.neg_label) / 2.0

        if self.y_type_ != "multiclass":
            y_inv = _inverse_binarize_thresholding(
                Y, self.y_type_, self.classes_, threshold
            )
        else:
            y_inv = _inverse_binarize_multiclass(Y, self.classes_)

        # Return the same container kind (sparse or dense) that fit received.
        if self.sparse_input_:
            return sp.csr_matrix(y_inv)
        if sp.issparse(y_inv):
            return y_inv.toarray()
        return y_inv

    def _more_tags(self):
        return {"X_types": ["1dlabels"]}

418 

419 

@validate_params(
    {
        "y": ["array-like"],
        "classes": ["array-like"],
        "neg_label": [Interval(Integral, None, None, closed="neither")],
        "pos_label": [Interval(Integral, None, None, closed="neither")],
        "sparse_output": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False):
    """Binarize labels in a one-vs-all fashion.

    Several regression and binary classification algorithms are
    available in scikit-learn. A simple way to extend these algorithms
    to the multi-class classification case is to use the so-called
    one-vs-all scheme.

    This function makes it possible to compute this transformation for a
    fixed set of class labels known ahead of time.

    Parameters
    ----------
    y : array-like
        Sequence of integer labels or multilabel data to encode.

    classes : array-like of shape (n_classes,)
        Uniquely holds the label for each class.

    neg_label : int, default=0
        Value with which negative labels must be encoded.

    pos_label : int, default=1
        Value with which positive labels must be encoded.

    sparse_output : bool, default=False
        Set to true if output binary array is desired in CSR sparse format.

    Returns
    -------
    Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
        Shape will be (n_samples, 1) for binary problems. Sparse matrix will
        be of CSR format.

    See Also
    --------
    LabelBinarizer : Class used to wrap the functionality of label_binarize and
        allow for fitting to classes independently of the transform operation.

    Examples
    --------
    >>> from sklearn.preprocessing import label_binarize
    >>> label_binarize([1, 6], classes=[1, 2, 4, 6])
    array([[1, 0, 0, 0],
           [0, 0, 0, 1]])

    The class ordering is preserved:

    >>> label_binarize([1, 6], classes=[1, 6, 4, 2])
    array([[1, 0, 0, 0],
           [0, 1, 0, 0]])

    Binary targets transform to a column vector

    >>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes'])
    array([[1],
           [0],
           [0],
           [1]])
    """
    if isinstance(y, list):
        if _num_samples(y) == 0:
            raise ValueError("y has 0 samples: %r" % y)
    else:
        # XXX Workaround that will be removed when list of list format is
        # dropped
        y = check_array(
            y, input_name="y", accept_sparse="csr", ensure_2d=False, dtype=None
        )

    if neg_label >= pos_label:
        raise ValueError(
            f"neg_label={neg_label} must be strictly less than "
            f"pos_label={pos_label}."
        )

    if sparse_output and (pos_label == 0 or neg_label != 0):
        raise ValueError(
            "Sparse binarization is only supported with non "
            "zero pos_label and zero neg_label, got "
            f"pos_label={pos_label} and neg_label={neg_label}"
        )

    # A zero pos_label cannot be stored as a sparse entry, so in the dense
    # case a stand-in value is used and swapped back to 0 at the end.
    pos_switch = pos_label == 0
    if pos_switch:
        pos_label = -neg_label

    y_type = type_of_target(y)
    if "multioutput" in y_type:
        raise ValueError(
            "Multioutput target data is not supported with label binarization"
        )
    if y_type == "unknown":
        raise ValueError("The type of target data is not known")

    n_samples = y.shape[0] if sp.issparse(y) else len(y)
    n_classes = len(classes)
    classes = np.asarray(classes)

    if y_type == "binary":
        if n_classes == 1:
            # Single known class: every sample is encoded as negative.
            if sparse_output:
                return sp.csr_matrix((n_samples, 1), dtype=int)
            return np.full((len(y), 1), neg_label, dtype=int)
        if n_classes >= 3:
            # Binary-looking data but more classes requested: one column each.
            y_type = "multiclass"

    sorted_class = np.sort(classes)

    if y_type == "multilabel-indicator":
        y_n_classes = y.shape[1] if hasattr(y, "shape") else len(y[0])
        if classes.size != y_n_classes:
            raise ValueError(
                f"classes {classes} mismatch with the labels "
                f"{unique_labels(y)} found in the data"
            )

    if y_type in ("binary", "multiclass"):
        y = column_or_1d(y)

        # Keep only the labels that belong to `classes`; rows with other
        # labels end up with no stored entry.
        known_mask = np.isin(y, classes)
        known_labels = y[known_mask]
        col_indices = np.searchsorted(sorted_class, known_labels)
        row_pointers = np.hstack((0, np.cumsum(known_mask)))

        values = np.empty_like(col_indices)
        values.fill(pos_label)
        Y = sp.csr_matrix(
            (values, col_indices, row_pointers), shape=(n_samples, n_classes)
        )
    elif y_type == "multilabel-indicator":
        Y = sp.csr_matrix(y)
        if pos_label != 1:
            values = np.empty_like(Y.data)
            values.fill(pos_label)
            Y.data = values
    else:
        raise ValueError(
            f"{y_type} target data is not supported with label binarization"
        )

    if sparse_output:
        Y.data = Y.data.astype(int, copy=False)
    else:
        Y = Y.toarray()
        Y = Y.astype(int, copy=False)

        if neg_label != 0:
            Y[Y == 0] = neg_label

        if pos_switch:
            Y[Y == pos_label] = 0

    # preserve label ordering
    if np.any(classes != sorted_class):
        column_order = np.searchsorted(sorted_class, classes)
        Y = Y[:, column_order]

    if y_type == "binary":
        # Binary problems are reported as a single column (the last class).
        if sparse_output:
            Y = Y.getcol(-1)
        else:
            Y = Y[:, -1].reshape((-1, 1))

    return Y

599 

600 

601def _inverse_binarize_multiclass(y, classes): 

602 """Inverse label binarization transformation for multiclass. 

603 

604 Multiclass uses the maximal score instead of a threshold. 

605 """ 

606 classes = np.asarray(classes) 

607 

608 if sp.issparse(y): 

609 # Find the argmax for each row in y where y is a CSR matrix 

610 

611 y = y.tocsr() 

612 n_samples, n_outputs = y.shape 

613 outputs = np.arange(n_outputs) 

614 row_max = min_max_axis(y, 1)[1] 

615 row_nnz = np.diff(y.indptr) 

616 

617 y_data_repeated_max = np.repeat(row_max, row_nnz) 

618 # picks out all indices obtaining the maximum per row 

619 y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data) 

620 

621 # For corner case where last row has a max of 0 

622 if row_max[-1] == 0: 

623 y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)]) 

624 

625 # Gets the index of the first argmax in each row from y_i_all_argmax 

626 index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1]) 

627 # first argmax of each row 

628 y_ind_ext = np.append(y.indices, [0]) 

629 y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]] 

630 # Handle rows of all 0 

631 y_i_argmax[np.where(row_nnz == 0)[0]] = 0 

632 

633 # Handles rows with max of 0 that contain negative numbers 

634 samples = np.arange(n_samples)[(row_nnz > 0) & (row_max.ravel() == 0)] 

635 for i in samples: 

636 ind = y.indices[y.indptr[i] : y.indptr[i + 1]] 

637 y_i_argmax[i] = classes[np.setdiff1d(outputs, ind)][0] 

638 

639 return classes[y_i_argmax] 

640 else: 

641 return classes.take(y.argmax(axis=1), mode="clip") 

642 

643 

644def _inverse_binarize_thresholding(y, output_type, classes, threshold): 

645 """Inverse label binarization transformation using thresholding.""" 

646 

647 if output_type == "binary" and y.ndim == 2 and y.shape[1] > 2: 

648 raise ValueError("output_type='binary', but y.shape = {0}".format(y.shape)) 

649 

650 if output_type != "binary" and y.shape[1] != len(classes): 

651 raise ValueError( 

652 "The number of class is not equal to the number of dimension of y." 

653 ) 

654 

655 classes = np.asarray(classes) 

656 

657 # Perform thresholding 

658 if sp.issparse(y): 

659 if threshold > 0: 

660 if y.format not in ("csr", "csc"): 

661 y = y.tocsr() 

662 y.data = np.array(y.data > threshold, dtype=int) 

663 y.eliminate_zeros() 

664 else: 

665 y = np.array(y.toarray() > threshold, dtype=int) 

666 else: 

667 y = np.array(y > threshold, dtype=int) 

668 

669 # Inverse transform data 

670 if output_type == "binary": 

671 if sp.issparse(y): 

672 y = y.toarray() 

673 if y.ndim == 2 and y.shape[1] == 2: 

674 return classes[y[:, 1]] 

675 else: 

676 if len(classes) == 1: 

677 return np.repeat(classes[0], len(y)) 

678 else: 

679 return classes[y.ravel()] 

680 

681 elif output_type == "multilabel-indicator": 

682 return y 

683 

684 else: 

685 raise ValueError("{0} format is not supported".format(output_type)) 

686 

687 

class MultiLabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):
    """Transform between iterable of iterables and a multilabel format.

    Although a list of sets or tuples is a very intuitive format for multilabel
    data, it is unwieldy to process. This transformer converts between this
    intuitive format and the supported multilabel format: a (samples x classes)
    binary matrix indicating the presence of a class label.

    Parameters
    ----------
    classes : array-like of shape (n_classes,), default=None
        Indicates an ordering for the class labels.
        All entries should be unique (cannot contain duplicate classes).

    sparse_output : bool, default=False
        Set to True if output binary array is desired in CSR sparse format.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        A copy of the `classes` parameter when provided.
        Otherwise it corresponds to the sorted set of classes found
        when fitting.

    See Also
    --------
    OneHotEncoder : Encode categorical features using a one-hot aka one-of-K
        scheme.

    Examples
    --------
    >>> from sklearn.preprocessing import MultiLabelBinarizer
    >>> mlb = MultiLabelBinarizer()
    >>> mlb.fit_transform([(1, 2), (3,)])
    array([[1, 1, 0],
           [0, 0, 1]])
    >>> mlb.classes_
    array([1, 2, 3])

    >>> mlb.fit_transform([{'sci-fi', 'thriller'}, {'comedy'}])
    array([[0, 1, 1],
           [1, 0, 0]])
    >>> list(mlb.classes_)
    ['comedy', 'sci-fi', 'thriller']

    A common mistake is to pass in a list, which leads to the following issue:

    >>> mlb = MultiLabelBinarizer()
    >>> mlb.fit(['sci-fi', 'thriller', 'comedy'])
    MultiLabelBinarizer()
    >>> mlb.classes_
    array(['-', 'c', 'd', 'e', 'f', 'h', 'i', 'l', 'm', 'o', 'r', 's', 't',
        'y'], dtype=object)

    To correct this, the list of labels should be passed in as:

    >>> mlb = MultiLabelBinarizer()
    >>> mlb.fit([['sci-fi', 'thriller', 'comedy']])
    MultiLabelBinarizer()
    >>> mlb.classes_
    array(['comedy', 'sci-fi', 'thriller'], dtype=object)
    """

    _parameter_constraints: dict = {
        "classes": ["array-like", None],
        "sparse_output": ["boolean"],
    }

    def __init__(self, *, classes=None, sparse_output=False):
        self.classes = classes
        self.sparse_output = sparse_output

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, y):
        """Fit the label sets binarizer, storing :term:`classes_`.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample. If the `classes` parameter is set, `y` will not be
            iterated.

        Returns
        -------
        self : object
            Fitted estimator.

        Raises
        ------
        ValueError
            If the `classes` parameter contains duplicates.
        """
        # Any previously built label -> column lookup is stale after a refit.
        self._cached_dict = None

        if self.classes is None:
            classes = sorted(set(itertools.chain.from_iterable(y)))
        elif len(set(self.classes)) < len(self.classes):
            raise ValueError(
                "The classes argument contains duplicate "
                "classes. Remove these duplicates before passing "
                "them to MultiLabelBinarizer."
            )
        else:
            classes = self.classes

        # Allocate first and fill by slice so that tuple labels are stored as
        # single objects instead of being expanded into extra dimensions.
        dtype = int if all(isinstance(c, int) for c in classes) else object
        self.classes_ = np.empty(len(classes), dtype=dtype)
        self.classes_[:] = classes
        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def fit_transform(self, y):
        """Fit the label sets binarizer and transform the given label sets.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample.

        Returns
        -------
        y_indicator : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]`
            is in `y[i]`, and 0 otherwise. Sparse matrix will be of CSR
            format.
        """
        if self.classes is not None:
            return self.fit(y).transform(y)

        self._cached_dict = None

        # Automatically increment on new class: a missing label is assigned
        # the current size of the mapping as its column index.
        class_mapping = defaultdict(int)
        class_mapping.default_factory = class_mapping.__len__
        yt = self._transform(y, class_mapping)

        # sort classes and reorder columns
        tmp = sorted(class_mapping, key=class_mapping.get)

        # (make safe for tuples)
        dtype = int if all(isinstance(c, int) for c in tmp) else object
        class_mapping = np.empty(len(tmp), dtype=dtype)
        class_mapping[:] = tmp
        self.classes_, inverse = np.unique(class_mapping, return_inverse=True)
        # Ensure yt.indices keeps its current dtype. `np.asarray` copies only
        # when the cast requires it, whereas `np.array(..., copy=False)`
        # raises under NumPy >= 2.0 whenever a copy cannot be avoided.
        yt.indices = np.asarray(inverse[yt.indices], dtype=yt.indices.dtype)

        if not self.sparse_output:
            yt = yt.toarray()

        return yt

    def transform(self, y):
        """Transform the given label sets.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample. Labels absent from `classes_` are ignored with a warning.

        Returns
        -------
        y_indicator : array or CSR matrix, shape (n_samples, n_classes)
            A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` is in
            `y[i]`, and 0 otherwise.
        """
        check_is_fitted(self)

        class_to_index = self._build_cache()
        yt = self._transform(y, class_to_index)

        if not self.sparse_output:
            yt = yt.toarray()

        return yt

    def _build_cache(self):
        """Return the label -> column index dict, building it on first use."""
        if self._cached_dict is None:
            self._cached_dict = dict(zip(self.classes_, range(len(self.classes_))))

        return self._cached_dict

    def _transform(self, y, class_mapping):
        """Transforms the label sets with a given mapping.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample.

        class_mapping : Mapping
            Maps from label to column index in label indicator matrix.

        Returns
        -------
        y_indicator : sparse matrix of shape (n_samples, n_classes)
            Label indicator matrix. Will be of CSR format.
        """
        indices = array.array("i")
        indptr = array.array("i", [0])
        unknown = set()
        for labels in y:
            # A set removes repeated labels within one sample.
            index = set()
            for label in labels:
                try:
                    index.add(class_mapping[label])
                except KeyError:
                    unknown.add(label)
            indices.extend(index)
            indptr.append(len(indices))
        if unknown:
            warnings.warn(
                "unknown class(es) {0} will be ignored".format(sorted(unknown, key=str))
            )
        data = np.ones(len(indices), dtype=int)

        return sp.csr_matrix(
            (data, indices, indptr), shape=(len(indptr) - 1, len(class_mapping))
        )

    def inverse_transform(self, yt):
        """Transform the given indicator matrix into label sets.

        Parameters
        ----------
        yt : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            A matrix containing only 1s and 0s.

        Returns
        -------
        y : list of tuples
            The set of labels for each sample such that `y[i]` consists of
            `classes_[j]` for each `yt[i, j] == 1`.

        Raises
        ------
        ValueError
            If the number of columns of `yt` differs from the number of
            classes, or if `yt` contains values other than 0 and 1.
        """
        check_is_fitted(self)

        if yt.shape[1] != len(self.classes_):
            raise ValueError(
                "Expected indicator for {0} classes, but got {1}".format(
                    len(self.classes_), yt.shape[1]
                )
            )

        if sp.issparse(yt):
            yt = yt.tocsr()
            if len(yt.data) != 0 and len(np.setdiff1d(yt.data, [0, 1])) > 0:
                raise ValueError("Expected only 0s and 1s in label indicator.")
            # Each CSR row slice lists the column indices stored for a sample.
            return [
                tuple(self.classes_.take(yt.indices[start:end]))
                for start, end in zip(yt.indptr[:-1], yt.indptr[1:])
            ]
        else:
            unexpected = np.setdiff1d(yt, [0, 1])
            if len(unexpected) > 0:
                raise ValueError(
                    "Expected only 0s and 1s in label indicator. Also got {0}".format(
                        unexpected
                    )
                )
            return [tuple(self.classes_.compress(indicators)) for indicators in yt]

    def _more_tags(self):
        return {"X_types": ["2dlabels"]}