Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scikit_learn-1.4.dev0-py3.8-linux-x86_64.egg/sklearn/preprocessing/

1# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>

2# Mathieu Blondel <mathieu@mblondel.org>

3# Olivier Grisel <olivier.grisel@ensta.org>

4# Andreas Mueller <amueller@ais.uni-bonn.de>

5# Joel Nothman <joel.nothman@gmail.com>

6# Hamzeh Alsalhi <ha258@cornell.edu>

7# License: BSD 3 clause

9import array

10import itertools

11import warnings

12from collections import defaultdict

13from numbers import Integral

15import numpy as np

16import scipy.sparse as sp

18from ..base import BaseEstimator, TransformerMixin, _fit_context

19from ..utils import column_or_1d

20from ..utils._encode import _encode, _unique

21from ..utils._param_validation import Interval, validate_params

22from ..utils.multiclass import type_of_target, unique_labels

23from ..utils.sparsefuncs import min_max_axis

24from ..utils.validation import _num_samples, check_array, check_is_fitted

# Names exported by ``from <this module> import *``: the three transformer
# classes and the functional ``label_binarize`` entry point defined below.
__all__ = [
    "label_binarize",
    "LabelBinarizer",
    "LabelEncoder",
    "MultiLabelBinarizer",
]

class LabelEncoder(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):
    """Encode target labels with value between 0 and n_classes-1.

    This transformer is meant for target values, *i.e.* `y`, and not for
    the input `X`.

    Read more in the :ref:`User Guide <preprocessing_targets>`.

    .. versionadded:: 0.12

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        Holds the label for each class.

    See Also
    --------
    OrdinalEncoder : Encode categorical features using an ordinal encoding
        scheme.
    OneHotEncoder : Encode categorical features as a one-hot numeric array.

    Examples
    --------
    `LabelEncoder` can be used to normalize labels.

    >>> from sklearn.preprocessing import LabelEncoder
    >>> le = LabelEncoder()
    >>> le.fit([1, 2, 2, 6])
    LabelEncoder()
    >>> le.classes_
    array([1, 2, 6])
    >>> le.transform([1, 1, 2, 6])
    array([0, 0, 1, 2]...)
    >>> le.inverse_transform([0, 0, 1, 2])
    array([1, 1, 2, 6])

    It can also be used to transform non-numerical labels (as long as they are
    hashable and comparable) to numerical labels.

    >>> le = LabelEncoder()
    >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
    LabelEncoder()
    >>> list(le.classes_)
    ['amsterdam', 'paris', 'tokyo']
    >>> le.transform(["tokyo", "tokyo", "paris"])
    array([2, 2, 1]...)
    >>> list(le.inverse_transform([2, 2, 1]))
    ['tokyo', 'tokyo', 'paris']
    """

    def fit(self, y):
        """Learn the set of distinct labels found in `y`.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : returns an instance of self.
            Fitted label encoder.
        """
        self.classes_ = _unique(column_or_1d(y, warn=True))
        return self

    def fit_transform(self, y):
        """Learn the distinct labels of `y` and return `y` encoded.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        y : array-like of shape (n_samples,)
            Encoded labels.
        """
        labels = column_or_1d(y, warn=True)
        uniques, codes = _unique(labels, return_inverse=True)
        self.classes_ = uniques
        return codes

    def transform(self, y):
        """Map labels to their normalized encoding.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        y : array-like of shape (n_samples,)
            Labels as normalized encodings.
        """
        check_is_fitted(self)
        labels = column_or_1d(y, dtype=self.classes_.dtype, warn=True)

        if _num_samples(labels) != 0:
            return _encode(labels, uniques=self.classes_)

        # Nothing to encode: an empty input maps to an empty array.
        return np.array([])

    def inverse_transform(self, y):
        """Map encoded labels back to the original labels.

        Parameters
        ----------
        y : ndarray of shape (n_samples,)
            Target values.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            Original encoding.

        Raises
        ------
        ValueError
            If `y` contains a value outside ``range(len(classes_))``.
        """
        check_is_fitted(self)
        codes = column_or_1d(y, warn=True)

        # Nothing to decode: an empty input maps to an empty array.
        if _num_samples(codes) == 0:
            return np.array([])

        valid_codes = np.arange(len(self.classes_))
        unseen = np.setdiff1d(codes, valid_codes)
        if len(unseen):
            raise ValueError("y contains previously unseen labels: %s" % str(unseen))

        return self.classes_[np.asarray(codes)]

    def _more_tags(self):
        return {"X_types": ["1dlabels"]}

166

167

class LabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):
    """Binarize labels in a one-vs-all fashion.

    Several regression and binary classification algorithms are
    available in scikit-learn. A simple way to extend these algorithms
    to the multi-class classification case is to use the so-called
    one-vs-all scheme.

    At learning time, this simply consists in learning one regressor
    or binary classifier per class. In doing so, one needs to convert
    multi-class labels to binary labels (belong or does not belong
    to the class). `LabelBinarizer` makes this process easy with the
    transform method.

    At prediction time, one assigns the class for which the corresponding
    model gave the greatest confidence. `LabelBinarizer` makes this easy
    with the :meth:`inverse_transform` method.

    Read more in the :ref:`User Guide <preprocessing_targets>`.

    Parameters
    ----------
    neg_label : int, default=0
        Value with which negative labels must be encoded.

    pos_label : int, default=1
        Value with which positive labels must be encoded.

    sparse_output : bool, default=False
        True if the returned array from transform is desired to be in sparse
        CSR format.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        Holds the label for each class.

    y_type_ : str
        Represents the type of the target data as evaluated by
        :func:`~sklearn.utils.multiclass.type_of_target`. Possible type are
        'continuous', 'continuous-multioutput', 'binary', 'multiclass',
        'multiclass-multioutput', 'multilabel-indicator', and 'unknown'.

    sparse_input_ : bool
        `True` if the input data to transform is given as a sparse matrix,
        `False` otherwise.

    See Also
    --------
    label_binarize : Function to perform the transform operation of
        LabelBinarizer with fixed classes.
    OneHotEncoder : Encode categorical features using a one-hot aka one-of-K
        scheme.

    Examples
    --------
    >>> from sklearn.preprocessing import LabelBinarizer
    >>> lb = LabelBinarizer()
    >>> lb.fit([1, 2, 6, 4, 2])
    LabelBinarizer()
    >>> lb.classes_
    array([1, 2, 4, 6])
    >>> lb.transform([1, 6])
    array([[1, 0, 0, 0],
           [0, 0, 0, 1]])

    Binary targets transform to a column vector

    >>> lb = LabelBinarizer()
    >>> lb.fit_transform(['yes', 'no', 'no', 'yes'])
    array([[1],
           [0],
           [0],
           [1]])

    Passing a 2D matrix for multilabel classification

    >>> import numpy as np
    >>> lb.fit(np.array([[0, 1, 1], [1, 0, 0]]))
    LabelBinarizer()
    >>> lb.classes_
    array([0, 1, 2])
    >>> lb.transform([0, 1, 2, 1])
    array([[1, 0, 0],
           [0, 1, 0],
           [0, 0, 1],
           [0, 1, 0]])
    """

    _parameter_constraints: dict = {
        "neg_label": [Integral],
        "pos_label": [Integral],
        "sparse_output": ["boolean"],
    }

    def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False):
        self.neg_label = neg_label
        self.pos_label = pos_label
        self.sparse_output = sparse_output

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, y):
        """Fit label binarizer.

        Parameters
        ----------
        y : ndarray of shape (n_samples,) or (n_samples, n_classes)
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification.

        Returns
        -------
        self : object
            Returns the instance itself.

        Raises
        ------
        ValueError
            If `neg_label >= pos_label`, if `sparse_output` is requested
            with a zero `pos_label` or a non-zero `neg_label`, if `y` is a
            multioutput target, or if `y` has no samples.
        """
        if self.neg_label >= self.pos_label:
            raise ValueError(
                f"neg_label={self.neg_label} must be strictly less than "
                f"pos_label={self.pos_label}."
            )

        if self.sparse_output and (self.pos_label == 0 or self.neg_label != 0):
            raise ValueError(
                "Sparse binarization is only supported with non "
                "zero pos_label and zero neg_label, got "
                f"pos_label={self.pos_label} and neg_label={self.neg_label}"
            )

        self.y_type_ = type_of_target(y, input_name="y")

        if "multioutput" in self.y_type_:
            raise ValueError(
                "Multioutput target data is not supported with label binarization"
            )
        if _num_samples(y) == 0:
            # An f-string is used instead of ``"%r" % y``: with %-formatting
            # an empty tuple `y` would be unpacked as the (missing) format
            # arguments and raise TypeError instead of this ValueError.
            raise ValueError(f"y has 0 samples: {y!r}")

        self.sparse_input_ = sp.issparse(y)
        self.classes_ = unique_labels(y)
        return self

    def fit_transform(self, y):
        """Fit label binarizer/transform multi-class labels to binary labels.

        The output of transform is sometimes referred to as
        the 1-of-K coding scheme.

        Parameters
        ----------
        y : {ndarray, sparse matrix} of shape (n_samples,) or \
                (n_samples, n_classes)
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification. Sparse matrix can be
            CSR, CSC, COO, DOK, or LIL.

        Returns
        -------
        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            Shape will be (n_samples, 1) for binary problems. Sparse matrix
            will be of CSR format.
        """
        return self.fit(y).transform(y)

    def transform(self, y):
        """Transform multi-class labels to binary labels.

        The output of transform is sometimes referred to by some authors as
        the 1-of-K coding scheme.

        Parameters
        ----------
        y : {array, sparse matrix} of shape (n_samples,) or \
                (n_samples, n_classes)
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification. Sparse matrix can be
            CSR, CSC, COO, DOK, or LIL.

        Returns
        -------
        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            Shape will be (n_samples, 1) for binary problems. Sparse matrix
            will be of CSR format.

        Raises
        ------
        ValueError
            If `y` is multilabel but the binarizer was fitted on
            non-multilabel data.
        """
        check_is_fitted(self)

        y_is_multilabel = type_of_target(y).startswith("multilabel")
        if y_is_multilabel and not self.y_type_.startswith("multilabel"):
            raise ValueError("The object was not fitted with multilabel input.")

        return label_binarize(
            y,
            classes=self.classes_,
            pos_label=self.pos_label,
            neg_label=self.neg_label,
            sparse_output=self.sparse_output,
        )

    def inverse_transform(self, Y, threshold=None):
        """Transform binary labels back to multi-class labels.

        Parameters
        ----------
        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            Target values. All sparse matrices are converted to CSR before
            inverse transformation.

        threshold : float, default=None
            Threshold used in the binary and multi-label cases.

            Use 0 when ``Y`` contains the output of :term:`decision_function`
            (classifier).
            Use 0.5 when ``Y`` contains the output of :term:`predict_proba`.

            If None, the threshold is assumed to be half way between
            neg_label and pos_label.

        Returns
        -------
        y : {ndarray, sparse matrix} of shape (n_samples,)
            Target values. Sparse matrix will be of CSR format.

        Notes
        -----
        In the case when the binary labels are fractional
        (probabilistic), :meth:`inverse_transform` chooses the class with the
        greatest value. Typically, this allows to use the output of a
        linear model's :term:`decision_function` method directly as the input
        of :meth:`inverse_transform`.
        """
        check_is_fitted(self)

        if threshold is None:
            threshold = (self.pos_label + self.neg_label) / 2.0

        # Multiclass targets are decoded by arg-max; every other target type
        # seen at fit time goes through thresholding.
        if self.y_type_ == "multiclass":
            y_inv = _inverse_binarize_multiclass(Y, self.classes_)
        else:
            y_inv = _inverse_binarize_thresholding(
                Y, self.y_type_, self.classes_, threshold
            )

        # Return the same container kind (sparse / dense) that was seen at
        # fit time.
        if self.sparse_input_:
            y_inv = sp.csr_matrix(y_inv)
        elif sp.issparse(y_inv):
            y_inv = y_inv.toarray()

        return y_inv

    def _more_tags(self):
        return {"X_types": ["1dlabels"]}

418

419

@validate_params(
    {
        "y": ["array-like"],
        "classes": ["array-like"],
        "neg_label": [Interval(Integral, None, None, closed="neither")],
        "pos_label": [Interval(Integral, None, None, closed="neither")],
        "sparse_output": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False):
    """Binarize labels in a one-vs-all fashion.

    Several regression and binary classification algorithms are
    available in scikit-learn. A simple way to extend these algorithms
    to the multi-class classification case is to use the so-called
    one-vs-all scheme.

    This function makes it possible to compute this transformation for a
    fixed set of class labels known ahead of time.

    Parameters
    ----------
    y : array-like
        Sequence of integer labels or multilabel data to encode.

    classes : array-like of shape (n_classes,)
        Uniquely holds the label for each class.

    neg_label : int, default=0
        Value with which negative labels must be encoded.

    pos_label : int, default=1
        Value with which positive labels must be encoded.

    sparse_output : bool, default=False
        Set to true if output binary array is desired in CSR sparse format.

    Returns
    -------
    Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
        Shape will be (n_samples, 1) for binary problems. Sparse matrix will
        be of CSR format.

    Raises
    ------
    ValueError
        If `y` is an empty list, if `neg_label >= pos_label`, if
        `sparse_output` is requested with a zero `pos_label` or a non-zero
        `neg_label`, if the target type of `y` is multioutput, unknown or
        otherwise unsupported, or if multilabel `y` does not have one column
        per entry of `classes`.

    See Also
    --------
    LabelBinarizer : Class used to wrap the functionality of label_binarize and
        allow for fitting to classes independently of the transform operation.

    Examples
    --------
    >>> from sklearn.preprocessing import label_binarize
    >>> label_binarize([1, 6], classes=[1, 2, 4, 6])
    array([[1, 0, 0, 0],
           [0, 0, 0, 1]])

    The class ordering is preserved:

    >>> label_binarize([1, 6], classes=[1, 6, 4, 2])
    array([[1, 0, 0, 0],
           [0, 1, 0, 0]])

    Binary targets transform to a column vector

    >>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes'])
    array([[1],
           [0],
           [0],
           [1]])
    """
    if not isinstance(y, list):
        # XXX Workaround that will be removed when list of list format is
        # dropped
        y = check_array(
            y, input_name="y", accept_sparse="csr", ensure_2d=False, dtype=None
        )
    else:
        if _num_samples(y) == 0:
            raise ValueError(f"y has 0 samples: {y!r}")
    if neg_label >= pos_label:
        raise ValueError(
            f"neg_label={neg_label} must be strictly less than pos_label={pos_label}."
        )

    if sparse_output and (pos_label == 0 or neg_label != 0):
        raise ValueError(
            "Sparse binarization is only supported with non "
            "zero pos_label and zero neg_label, got "
            f"pos_label={pos_label} and neg_label={neg_label}"
        )

    # To account for pos_label == 0 in the dense case: positives are first
    # written with a temporary non-zero marker and reset to 0 at the end, so
    # they can be told apart from the implicit zeros of the sparse matrix.
    pos_switch = pos_label == 0
    if pos_switch:
        pos_label = -neg_label

    y_type = type_of_target(y)
    if "multioutput" in y_type:
        raise ValueError(
            "Multioutput target data is not supported with label binarization"
        )
    if y_type == "unknown":
        raise ValueError("The type of target data is not known")

    n_samples = y.shape[0] if sp.issparse(y) else len(y)
    n_classes = len(classes)
    classes = np.asarray(classes)

    if y_type == "binary":
        if n_classes == 1:
            # Single known class: every sample is encoded as negative.
            if sparse_output:
                return sp.csr_matrix((n_samples, 1), dtype=int)
            else:
                Y = np.zeros((n_samples, 1), dtype=int)
                Y += neg_label
                return Y
        elif n_classes >= 3:
            # Binary-looking y but three or more declared classes: emit one
            # column per class instead of a single column.
            y_type = "multiclass"

    sorted_class = np.sort(classes)
    if y_type == "multilabel-indicator":
        y_n_classes = y.shape[1] if hasattr(y, "shape") else len(y[0])
        if classes.size != y_n_classes:
            raise ValueError(
                f"classes {classes} mismatch with the labels "
                f"{unique_labels(y)} found in the data"
            )

    if y_type in ("binary", "multiclass"):
        y = column_or_1d(y)

        # pick out the known labels from y
        y_in_classes = np.isin(y, classes)
        y_seen = y[y_in_classes]
        indices = np.searchsorted(sorted_class, y_seen)
        # Each sample contributes one stored entry if its label is known and
        # none otherwise, so the running count of known labels is the CSR
        # row-pointer array.
        indptr = np.hstack((0, np.cumsum(y_in_classes)))

        data = np.empty_like(indices)
        data.fill(pos_label)
        Y = sp.csr_matrix((data, indices, indptr), shape=(n_samples, n_classes))
    elif y_type == "multilabel-indicator":
        Y = sp.csr_matrix(y)
        if pos_label != 1:
            data = np.empty_like(Y.data)
            data.fill(pos_label)
            Y.data = data
    else:
        raise ValueError(
            f"{y_type} target data is not supported with label binarization"
        )

    if not sparse_output:
        Y = Y.toarray()
        Y = Y.astype(int, copy=False)

        if neg_label != 0:
            Y[Y == 0] = neg_label

        if pos_switch:
            Y[Y == pos_label] = 0
    else:
        Y.data = Y.data.astype(int, copy=False)

    # preserve label ordering: columns were built in sorted-class order, so
    # reorder them to the order in which `classes` was given.
    if np.any(classes != sorted_class):
        indices = np.searchsorted(sorted_class, classes)
        Y = Y[:, indices]

    if y_type == "binary":
        # Binary problems keep only the last column.
        if sparse_output:
            Y = Y.getcol(-1)
        else:
            Y = Y[:, -1].reshape((-1, 1))

    return Y

599

600

601def _inverse_binarize_multiclass(y, classes):

602 """Inverse label binarization transformation for multiclass.

603

604 Multiclass uses the maximal score instead of a threshold.

605 """

606 classes = np.asarray(classes)

607

608 if sp.issparse(y):

609 # Find the argmax for each row in y where y is a CSR matrix

610

611 y = y.tocsr()

612 n_samples, n_outputs = y.shape

613 outputs = np.arange(n_outputs)

614 row_max = min_max_axis(y, 1)[1]

615 row_nnz = np.diff(y.indptr)

616

617 y_data_repeated_max = np.repeat(row_max, row_nnz)

618 # picks out all indices obtaining the maximum per row

619 y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data)

620

621 # For corner case where last row has a max of 0

622 if row_max[-1] == 0:

623 y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)])

624

625 # Gets the index of the first argmax in each row from y_i_all_argmax

626 index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1])

627 # first argmax of each row

628 y_ind_ext = np.append(y.indices, [0])

629 y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]]

630 # Handle rows of all 0

631 y_i_argmax[np.where(row_nnz == 0)[0]] = 0

632

633 # Handles rows with max of 0 that contain negative numbers

634 samples = np.arange(n_samples)[(row_nnz > 0) & (row_max.ravel() == 0)]

635 for i in samples:

636 ind = y.indices[y.indptr[i] : y.indptr[i + 1]]

637 y_i_argmax[i] = classes[np.setdiff1d(outputs, ind)][0]

638

639 return classes[y_i_argmax]

640 else:

641 return classes.take(y.argmax(axis=1), mode="clip")

642

643

644def _inverse_binarize_thresholding(y, output_type, classes, threshold):

645 """Inverse label binarization transformation using thresholding."""

646

647 if output_type == "binary" and y.ndim == 2 and y.shape[1] > 2:

648 raise ValueError("output_type='binary', but y.shape = {0}".format(y.shape))

649

650 if output_type != "binary" and y.shape[1] != len(classes):

651 raise ValueError(

652 "The number of class is not equal to the number of dimension of y."

653 )

654

655 classes = np.asarray(classes)

656

657 # Perform thresholding

658 if sp.issparse(y):

659 if threshold > 0:

660 if y.format not in ("csr", "csc"):

661 y = y.tocsr()

662 y.data = np.array(y.data > threshold, dtype=int)

663 y.eliminate_zeros()

664 else:

665 y = np.array(y.toarray() > threshold, dtype=int)

666 else:

667 y = np.array(y > threshold, dtype=int)

668

669 # Inverse transform data

670 if output_type == "binary":

671 if sp.issparse(y):

672 y = y.toarray()

673 if y.ndim == 2 and y.shape[1] == 2:

674 return classes[y[:, 1]]

675 else:

676 if len(classes) == 1:

677 return np.repeat(classes[0], len(y))

678 else:

679 return classes[y.ravel()]

680

681 elif output_type == "multilabel-indicator":

682 return y

683

684 else:

685 raise ValueError("{0} format is not supported".format(output_type))

686

687

class MultiLabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):
    """Transform between iterable of iterables and a multilabel format.

    Although a list of sets or tuples is a very intuitive format for multilabel
    data, it is unwieldy to process. This transformer converts between this
    intuitive format and the supported multilabel format: a (samples x classes)
    binary matrix indicating the presence of a class label.

    Parameters
    ----------
    classes : array-like of shape (n_classes,), default=None
        Indicates an ordering for the class labels.
        All entries should be unique (cannot contain duplicate classes).

    sparse_output : bool, default=False
        Set to True if output binary array is desired in CSR sparse format.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        A copy of the `classes` parameter when provided.
        Otherwise it corresponds to the sorted set of classes found
        when fitting.

    See Also
    --------
    OneHotEncoder : Encode categorical features using a one-hot aka one-of-K
        scheme.

    Examples
    --------
    >>> from sklearn.preprocessing import MultiLabelBinarizer
    >>> mlb = MultiLabelBinarizer()
    >>> mlb.fit_transform([(1, 2), (3,)])
    array([[1, 1, 0],
           [0, 0, 1]])
    >>> mlb.classes_
    array([1, 2, 3])

    >>> mlb.fit_transform([{'sci-fi', 'thriller'}, {'comedy'}])
    array([[0, 1, 1],
           [1, 0, 0]])
    >>> list(mlb.classes_)
    ['comedy', 'sci-fi', 'thriller']

    A common mistake is to pass in a list, which leads to the following issue:

    >>> mlb = MultiLabelBinarizer()
    >>> mlb.fit(['sci-fi', 'thriller', 'comedy'])
    MultiLabelBinarizer()
    >>> mlb.classes_
    array(['-', 'c', 'd', 'e', 'f', 'h', 'i', 'l', 'm', 'o', 'r', 's', 't',
        'y'], dtype=object)

    To correct this, the list of labels should be passed in as:

    >>> mlb = MultiLabelBinarizer()
    >>> mlb.fit([['sci-fi', 'thriller', 'comedy']])
    MultiLabelBinarizer()
    >>> mlb.classes_
    array(['comedy', 'sci-fi', 'thriller'], dtype=object)
    """

    _parameter_constraints: dict = {
        "classes": ["array-like", None],
        "sparse_output": ["boolean"],
    }

    def __init__(self, *, classes=None, sparse_output=False):
        self.classes = classes
        self.sparse_output = sparse_output

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, y):
        """Fit the label sets binarizer, storing :term:`classes_`.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample. If the `classes` parameter is set, `y` will not be
            iterated.

        Returns
        -------
        self : object
            Fitted estimator.

        Raises
        ------
        ValueError
            If the `classes` parameter contains duplicates.
        """
        # Drop any label -> column cache built for a previous fit.
        self._cached_dict = None

        if self.classes is None:
            classes = sorted(set(itertools.chain.from_iterable(y)))
        elif len(set(self.classes)) < len(self.classes):
            raise ValueError(
                "The classes argument contains duplicate "
                "classes. Remove these duplicates before passing "
                "them to MultiLabelBinarizer."
            )
        else:
            classes = self.classes
        # Allocate first and fill by slice assignment so that sequence-like
        # labels (e.g. tuples) remain single elements of an object array.
        dtype = int if all(isinstance(c, int) for c in classes) else object
        self.classes_ = np.empty(len(classes), dtype=dtype)
        self.classes_[:] = classes
        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def fit_transform(self, y):
        """Fit the label sets binarizer and transform the given label sets.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample.

        Returns
        -------
        y_indicator : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]`
            is in `y[i]`, and 0 otherwise. Sparse matrix will be of CSR
            format.
        """
        if self.classes is not None:
            return self.fit(y).transform(y)

        self._cached_dict = None

        # Automatically increment on new class: looking up an unseen label
        # assigns it the next free column index, so a single pass over `y`
        # both discovers the classes and encodes the samples.
        class_mapping = defaultdict(int)
        class_mapping.default_factory = class_mapping.__len__
        yt = self._transform(y, class_mapping)

        # sort classes and reorder columns
        tmp = sorted(class_mapping, key=class_mapping.get)

        # (make safe for tuples)
        dtype = int if all(isinstance(c, int) for c in tmp) else object
        class_mapping = np.empty(len(tmp), dtype=dtype)
        class_mapping[:] = tmp
        self.classes_, inverse = np.unique(class_mapping, return_inverse=True)
        # ensure yt.indices keeps its current dtype. `np.asarray` converts
        # (copying when needed); `np.array(..., copy=False)` raises under
        # NumPy 2 whenever the cast cannot be done without a copy.
        yt.indices = np.asarray(inverse[yt.indices], dtype=yt.indices.dtype)

        if not self.sparse_output:
            yt = yt.toarray()

        return yt

    def transform(self, y):
        """Transform the given label sets.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample. Labels not present in `classes_` are ignored with a
            warning.

        Returns
        -------
        y_indicator : array or CSR matrix, shape (n_samples, n_classes)
            A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` is in
            `y[i]`, and 0 otherwise.
        """
        check_is_fitted(self)

        class_to_index = self._build_cache()
        yt = self._transform(y, class_to_index)

        if not self.sparse_output:
            yt = yt.toarray()

        return yt

    def _build_cache(self):
        """Return the label -> column-index dict, building it on first use."""
        if self._cached_dict is None:
            self._cached_dict = dict(zip(self.classes_, range(len(self.classes_))))

        return self._cached_dict

    def _transform(self, y, class_mapping):
        """Transforms the label sets with a given mapping.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample.

        class_mapping : Mapping
            Maps from label to column index in label indicator matrix.

        Returns
        -------
        y_indicator : sparse matrix of shape (n_samples, n_classes)
            Label indicator matrix. Will be of CSR format.
        """
        indices = array.array("i")
        indptr = array.array("i", [0])
        unknown = set()
        for labels in y:
            # A set de-duplicates labels repeated within one sample.
            index = set()
            for label in labels:
                try:
                    index.add(class_mapping[label])
                except KeyError:
                    unknown.add(label)
            indices.extend(index)
            indptr.append(len(indices))
        if unknown:
            warnings.warn(
                "unknown class(es) {0} will be ignored".format(sorted(unknown, key=str))
            )
        data = np.ones(len(indices), dtype=int)

        return sp.csr_matrix(
            (data, indices, indptr), shape=(len(indptr) - 1, len(class_mapping))
        )

    def inverse_transform(self, yt):
        """Transform the given indicator matrix into label sets.

        Parameters
        ----------
        yt : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            A matrix containing only 1s and 0s.

        Returns
        -------
        y : list of tuples
            The set of labels for each sample such that `y[i]` consists of
            `classes_[j]` for each `yt[i, j] == 1`.

        Raises
        ------
        ValueError
            If `yt` does not have one column per class, or contains values
            other than 0 and 1.
        """
        check_is_fitted(self)

        if yt.shape[1] != len(self.classes_):
            raise ValueError(
                "Expected indicator for {0} classes, but got {1}".format(
                    len(self.classes_), yt.shape[1]
                )
            )

        if sp.issparse(yt):
            yt = yt.tocsr()
            if len(yt.data) != 0 and len(np.setdiff1d(yt.data, [0, 1])) > 0:
                raise ValueError("Expected only 0s and 1s in label indicator.")
            return [
                tuple(self.classes_.take(yt.indices[start:end]))
                for start, end in zip(yt.indptr[:-1], yt.indptr[1:])
            ]
        else:
            unexpected = np.setdiff1d(yt, [0, 1])
            if len(unexpected) > 0:
                raise ValueError(
                    "Expected only 0s and 1s in label indicator. Also got {0}".format(
                        unexpected
                    )
                )
            return [tuple(self.classes_.compress(indicators)) for indicators in yt]

    def _more_tags(self):
        return {"X_types": ["2dlabels"]}

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scikit_learn-1.4.dev0-py3.8-linux-x86_64.egg/sklearn/preprocessing/_label.py: 17%

275 statements