1# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
2# Mathieu Blondel <mathieu@mblondel.org>
3# Olivier Grisel <olivier.grisel@ensta.org>
4# Andreas Mueller <amueller@ais.uni-bonn.de>
5# Joel Nothman <joel.nothman@gmail.com>
6# Hamzeh Alsalhi <ha258@cornell.edu>
7# License: BSD 3 clause
8
9import array
10import itertools
11import warnings
12from collections import defaultdict
13from numbers import Integral
14
15import numpy as np
16import scipy.sparse as sp
17
18from ..base import BaseEstimator, TransformerMixin, _fit_context
19from ..utils import column_or_1d
20from ..utils._encode import _encode, _unique
21from ..utils._param_validation import Interval, validate_params
22from ..utils.multiclass import type_of_target, unique_labels
23from ..utils.sparsefuncs import min_max_axis
24from ..utils.validation import _num_samples, check_array, check_is_fitted
25
# Names exported as the public API of this module; each one is defined below.
__all__ = [
    "label_binarize",
    "LabelBinarizer",
    "LabelEncoder",
    "MultiLabelBinarizer",
]
32
33
class LabelEncoder(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):
    """Encode target labels with value between 0 and n_classes-1.

    Use this transformer on target values, *i.e.* `y`; it is not meant for
    the input `X`.

    Read more in the :ref:`User Guide <preprocessing_targets>`.

    .. versionadded:: 0.12

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        Holds the label for each class.

    See Also
    --------
    OrdinalEncoder : Encode categorical features using an ordinal encoding
        scheme.
    OneHotEncoder : Encode categorical features as a one-hot numeric array.

    Examples
    --------
    `LabelEncoder` can be used to normalize labels.

    >>> from sklearn.preprocessing import LabelEncoder
    >>> le = LabelEncoder()
    >>> le.fit([1, 2, 2, 6])
    LabelEncoder()
    >>> le.classes_
    array([1, 2, 6])
    >>> le.transform([1, 1, 2, 6])
    array([0, 0, 1, 2]...)
    >>> le.inverse_transform([0, 0, 1, 2])
    array([1, 1, 2, 6])

    It can also be used to transform non-numerical labels (as long as they are
    hashable and comparable) to numerical labels.

    >>> le = LabelEncoder()
    >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
    LabelEncoder()
    >>> list(le.classes_)
    ['amsterdam', 'paris', 'tokyo']
    >>> le.transform(["tokyo", "tokyo", "paris"])
    array([2, 2, 1]...)
    >>> list(le.inverse_transform([2, 2, 1]))
    ['tokyo', 'tokyo', 'paris']
    """

    def fit(self, y):
        """Learn the set of distinct labels present in `y`.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : returns an instance of self.
            Fitted label encoder.
        """
        self.classes_ = _unique(column_or_1d(y, warn=True))
        return self

    def fit_transform(self, y):
        """Learn the distinct labels of `y` and return `y` encoded.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        y : array-like of shape (n_samples,)
            Encoded labels.
        """
        labels = column_or_1d(y, warn=True)
        uniques, codes = _unique(labels, return_inverse=True)
        self.classes_ = uniques
        return codes

    def transform(self, y):
        """Map labels to their normalized encoding.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        y : array-like of shape (n_samples,)
            Labels as normalized encodings.
        """
        check_is_fitted(self)
        labels = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
        if _num_samples(labels) != 0:
            return _encode(labels, uniques=self.classes_)

        # An empty input is mapped to an empty array.
        return np.array([])

    def inverse_transform(self, y):
        """Map encoded labels back to the original labels.

        Parameters
        ----------
        y : ndarray of shape (n_samples,)
            Target values.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            Original encoding.

        Raises
        ------
        ValueError
            If `y` holds a value outside ``range(len(classes_))``.
        """
        check_is_fitted(self)
        codes = column_or_1d(y, warn=True)
        # An empty input is mapped to an empty array.
        if _num_samples(codes) == 0:
            return np.array([])

        valid_codes = np.arange(len(self.classes_))
        unseen = np.setdiff1d(codes, valid_codes)
        if len(unseen):
            raise ValueError("y contains previously unseen labels: %s" % str(unseen))
        return self.classes_[np.asarray(codes)]

    def _more_tags(self):
        return {"X_types": ["1dlabels"]}
166
167
class LabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):
    """Binarize labels in a one-vs-all fashion.

    scikit-learn provides several regression and binary classification
    algorithms. The so-called one-vs-all scheme is a simple way to extend
    them to the multi-class classification case.

    At learning time, one regressor or binary classifier is learned per
    class, which requires converting multi-class labels to binary labels
    (belong or does not belong to the class). `LabelBinarizer` makes this
    process easy with the transform method.

    At prediction time, the class whose model gave the greatest confidence
    is assigned. `LabelBinarizer` makes this easy with the
    :meth:`inverse_transform` method.

    Read more in the :ref:`User Guide <preprocessing_targets>`.

    Parameters
    ----------
    neg_label : int, default=0
        Value with which negative labels must be encoded.

    pos_label : int, default=1
        Value with which positive labels must be encoded.

    sparse_output : bool, default=False
        True if the returned array from transform is desired to be in sparse
        CSR format.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        Holds the label for each class.

    y_type_ : str
        Represents the type of the target data as evaluated by
        :func:`~sklearn.utils.multiclass.type_of_target`. Possible type are
        'continuous', 'continuous-multioutput', 'binary', 'multiclass',
        'multiclass-multioutput', 'multilabel-indicator', and 'unknown'.

    sparse_input_ : bool
        `True` if the input data to transform is given as a sparse matrix,
        `False` otherwise.

    See Also
    --------
    label_binarize : Function to perform the transform operation of
        LabelBinarizer with fixed classes.
    OneHotEncoder : Encode categorical features using a one-hot aka one-of-K
        scheme.

    Examples
    --------
    >>> from sklearn.preprocessing import LabelBinarizer
    >>> lb = LabelBinarizer()
    >>> lb.fit([1, 2, 6, 4, 2])
    LabelBinarizer()
    >>> lb.classes_
    array([1, 2, 4, 6])
    >>> lb.transform([1, 6])
    array([[1, 0, 0, 0],
           [0, 0, 0, 1]])

    Binary targets transform to a column vector

    >>> lb = LabelBinarizer()
    >>> lb.fit_transform(['yes', 'no', 'no', 'yes'])
    array([[1],
           [0],
           [0],
           [1]])

    Passing a 2D matrix for multilabel classification

    >>> import numpy as np
    >>> lb.fit(np.array([[0, 1, 1], [1, 0, 0]]))
    LabelBinarizer()
    >>> lb.classes_
    array([0, 1, 2])
    >>> lb.transform([0, 1, 2, 1])
    array([[1, 0, 0],
           [0, 1, 0],
           [0, 0, 1],
           [0, 1, 0]])
    """

    _parameter_constraints: dict = {
        "neg_label": [Integral],
        "pos_label": [Integral],
        "sparse_output": ["boolean"],
    }

    def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False):
        self.neg_label = neg_label
        self.pos_label = pos_label
        self.sparse_output = sparse_output

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, y):
        """Fit label binarizer.

        Parameters
        ----------
        y : ndarray of shape (n_samples,) or (n_samples, n_classes)
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification.

        Returns
        -------
        self : object
            Returns the instance itself.

        Raises
        ------
        ValueError
            If `neg_label` is not strictly below `pos_label`, if sparse
            output is requested with an unsupported label pair, if `y` is a
            multioutput target, or if `y` has no samples.
        """
        if self.neg_label >= self.pos_label:
            raise ValueError(
                f"neg_label={self.neg_label} must be strictly less than "
                f"pos_label={self.pos_label}."
            )

        # Sparse output can only represent neg_label as the implicit zero.
        sparse_compatible = self.pos_label != 0 and self.neg_label == 0
        if self.sparse_output and not sparse_compatible:
            raise ValueError(
                "Sparse binarization is only supported with non "
                "zero pos_label and zero neg_label, got "
                f"pos_label={self.pos_label} and neg_label={self.neg_label}"
            )

        self.y_type_ = target_type = type_of_target(y, input_name="y")
        if "multioutput" in target_type:
            raise ValueError(
                "Multioutput target data is not supported with label binarization"
            )

        if _num_samples(y) == 0:
            raise ValueError("y has 0 samples: %r" % y)

        self.sparse_input_ = sp.issparse(y)
        self.classes_ = unique_labels(y)
        return self

    def fit_transform(self, y):
        """Fit label binarizer/transform multi-class labels to binary labels.

        The output of transform is sometimes referred to as
        the 1-of-K coding scheme.

        Parameters
        ----------
        y : {ndarray, sparse matrix} of shape (n_samples,) or \
                (n_samples, n_classes)
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification. Sparse matrix can be
            CSR, CSC, COO, DOK, or LIL.

        Returns
        -------
        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            Shape will be (n_samples, 1) for binary problems. Sparse matrix
            will be of CSR format.
        """
        fitted = self.fit(y)
        return fitted.transform(y)

    def transform(self, y):
        """Transform multi-class labels to binary labels.

        The output of transform is sometimes referred to by some authors as
        the 1-of-K coding scheme.

        Parameters
        ----------
        y : {array, sparse matrix} of shape (n_samples,) or \
                (n_samples, n_classes)
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification. Sparse matrix can be
            CSR, CSC, COO, DOK, or LIL.

        Returns
        -------
        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            Shape will be (n_samples, 1) for binary problems. Sparse matrix
            will be of CSR format.

        Raises
        ------
        ValueError
            If `y` is multilabel but the binarizer was fitted on a target
            that was not.
        """
        check_is_fitted(self)

        if type_of_target(y).startswith("multilabel"):
            if not self.y_type_.startswith("multilabel"):
                raise ValueError("The object was not fitted with multilabel input.")

        binarize_kwargs = {
            "classes": self.classes_,
            "pos_label": self.pos_label,
            "neg_label": self.neg_label,
            "sparse_output": self.sparse_output,
        }
        return label_binarize(y, **binarize_kwargs)

    def inverse_transform(self, Y, threshold=None):
        """Transform binary labels back to multi-class labels.

        Parameters
        ----------
        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            Target values. All sparse matrices are converted to CSR before
            inverse transformation.

        threshold : float, default=None
            Threshold used in the binary and multi-label cases.

            Use 0 when ``Y`` contains the output of :term:`decision_function`
            (classifier).
            Use 0.5 when ``Y`` contains the output of :term:`predict_proba`.

            If None, the threshold is assumed to be half way between
            neg_label and pos_label.

        Returns
        -------
        y : {ndarray, sparse matrix} of shape (n_samples,)
            Target values. Sparse matrix will be of CSR format.

        Notes
        -----
        In the case when the binary labels are fractional
        (probabilistic), :meth:`inverse_transform` chooses the class with the
        greatest value. Typically, this allows to use the output of a
        linear model's :term:`decision_function` method directly as the input
        of :meth:`inverse_transform`.
        """
        check_is_fitted(self)

        cutoff = threshold
        if cutoff is None:
            cutoff = (self.pos_label + self.neg_label) / 2.0

        if self.y_type_ == "multiclass":
            decoded = _inverse_binarize_multiclass(Y, self.classes_)
        else:
            decoded = _inverse_binarize_thresholding(
                Y, self.y_type_, self.classes_, cutoff
            )

        # The container type of the result mirrors what was seen in `fit`.
        if self.sparse_input_:
            return sp.csr_matrix(decoded)
        if sp.issparse(decoded):
            return decoded.toarray()
        return decoded

    def _more_tags(self):
        return {"X_types": ["1dlabels"]}
418
419
@validate_params(
    {
        "y": ["array-like"],
        "classes": ["array-like"],
        "neg_label": [Interval(Integral, None, None, closed="neither")],
        "pos_label": [Interval(Integral, None, None, closed="neither")],
        "sparse_output": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False):
    """Binarize labels in a one-vs-all fashion.

    scikit-learn provides several regression and binary classification
    algorithms. The so-called one-vs-all scheme is a simple way to extend
    them to the multi-class classification case.

    This function computes that transformation for a fixed set of class
    labels known ahead of time.

    Parameters
    ----------
    y : array-like
        Sequence of integer labels or multilabel data to encode.

    classes : array-like of shape (n_classes,)
        Uniquely holds the label for each class.

    neg_label : int, default=0
        Value with which negative labels must be encoded.

    pos_label : int, default=1
        Value with which positive labels must be encoded.

    sparse_output : bool, default=False
        Set to true if output binary array is desired in CSR sparse format.

    Returns
    -------
    Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
        Shape will be (n_samples, 1) for binary problems. Sparse matrix will
        be of CSR format.

    See Also
    --------
    LabelBinarizer : Class used to wrap the functionality of label_binarize and
        allow for fitting to classes independently of the transform operation.

    Examples
    --------
    >>> from sklearn.preprocessing import label_binarize
    >>> label_binarize([1, 6], classes=[1, 2, 4, 6])
    array([[1, 0, 0, 0],
           [0, 0, 0, 1]])

    The class ordering is preserved:

    >>> label_binarize([1, 6], classes=[1, 6, 4, 2])
    array([[1, 0, 0, 0],
           [0, 1, 0, 0]])

    Binary targets transform to a column vector

    >>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes'])
    array([[1],
           [0],
           [0],
           [1]])
    """
    if isinstance(y, list):
        if _num_samples(y) == 0:
            raise ValueError("y has 0 samples: %r" % y)
    else:
        # XXX Lists skip check_array; this workaround will be removed when
        # the list of list format is dropped.
        y = check_array(
            y, input_name="y", accept_sparse="csr", ensure_2d=False, dtype=None
        )

    if neg_label >= pos_label:
        raise ValueError(
            "neg_label={0} must be strictly less than pos_label={1}.".format(
                neg_label, pos_label
            )
        )

    if sparse_output and (pos_label == 0 or neg_label != 0):
        raise ValueError(
            "Sparse binarization is only supported with non "
            "zero pos_label and zero neg_label, got "
            "pos_label={0} and neg_label={1}"
            "".format(pos_label, neg_label)
        )

    # A zero pos_label cannot be told apart from empty cells while the
    # matrix is being built, so a stand-in value is used and swapped back to
    # zero once the dense array exists.
    zero_is_positive = pos_label == 0
    if zero_is_positive:
        pos_label = -neg_label

    y_type = type_of_target(y)
    if "multioutput" in y_type:
        raise ValueError(
            "Multioutput target data is not supported with label binarization"
        )
    if y_type == "unknown":
        raise ValueError("The type of target data is not known")

    n_samples = y.shape[0] if sp.issparse(y) else len(y)
    n_classes = len(classes)
    classes = np.asarray(classes)

    if y_type == "binary":
        if n_classes == 1:
            # A single known class: every sample is encoded as negative.
            if sparse_output:
                return sp.csr_matrix((n_samples, 1), dtype=int)
            Y = np.zeros((len(y), 1), dtype=int)
            Y += neg_label
            return Y
        if len(classes) >= 3:
            y_type = "multiclass"

    ordered_classes = np.sort(classes)

    if y_type == "multilabel-indicator":
        n_columns = y.shape[1] if hasattr(y, "shape") else len(y[0])
        if classes.size != n_columns:
            raise ValueError(
                "classes {0} mismatch with the labels {1} found in the data".format(
                    classes, unique_labels(y)
                )
            )

        Y = sp.csr_matrix(y)
        if pos_label != 1:
            values = np.empty_like(Y.data)
            values.fill(pos_label)
            Y.data = values
    elif y_type in ("binary", "multiclass"):
        y = column_or_1d(y)

        # Only labels listed in `classes` get an entry; other rows stay empty.
        is_known = np.isin(y, classes)
        known_labels = y[is_known]
        indices = np.searchsorted(ordered_classes, known_labels)
        indptr = np.hstack((0, np.cumsum(is_known)))

        values = np.empty_like(indices)
        values.fill(pos_label)
        Y = sp.csr_matrix((values, indices, indptr), shape=(n_samples, n_classes))
    else:
        raise ValueError(
            "%s target data is not supported with label binarization" % y_type
        )

    if sparse_output:
        Y.data = Y.data.astype(int, copy=False)
    else:
        Y = Y.toarray()
        Y = Y.astype(int, copy=False)

        if neg_label != 0:
            Y[Y == 0] = neg_label

        if zero_is_positive:
            Y[Y == pos_label] = 0

    # Columns were laid out in sorted class order; restore the caller's order.
    if np.any(classes != ordered_classes):
        column_order = np.searchsorted(ordered_classes, classes)
        Y = Y[:, column_order]

    if y_type == "binary":
        Y = Y.getcol(-1) if sparse_output else Y[:, -1].reshape((-1, 1))

    return Y
599
600
601def _inverse_binarize_multiclass(y, classes):
602 """Inverse label binarization transformation for multiclass.
603
604 Multiclass uses the maximal score instead of a threshold.
605 """
606 classes = np.asarray(classes)
607
608 if sp.issparse(y):
609 # Find the argmax for each row in y where y is a CSR matrix
610
611 y = y.tocsr()
612 n_samples, n_outputs = y.shape
613 outputs = np.arange(n_outputs)
614 row_max = min_max_axis(y, 1)[1]
615 row_nnz = np.diff(y.indptr)
616
617 y_data_repeated_max = np.repeat(row_max, row_nnz)
618 # picks out all indices obtaining the maximum per row
619 y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data)
620
621 # For corner case where last row has a max of 0
622 if row_max[-1] == 0:
623 y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)])
624
625 # Gets the index of the first argmax in each row from y_i_all_argmax
626 index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1])
627 # first argmax of each row
628 y_ind_ext = np.append(y.indices, [0])
629 y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]]
630 # Handle rows of all 0
631 y_i_argmax[np.where(row_nnz == 0)[0]] = 0
632
633 # Handles rows with max of 0 that contain negative numbers
634 samples = np.arange(n_samples)[(row_nnz > 0) & (row_max.ravel() == 0)]
635 for i in samples:
636 ind = y.indices[y.indptr[i] : y.indptr[i + 1]]
637 y_i_argmax[i] = classes[np.setdiff1d(outputs, ind)][0]
638
639 return classes[y_i_argmax]
640 else:
641 return classes.take(y.argmax(axis=1), mode="clip")
642
643
644def _inverse_binarize_thresholding(y, output_type, classes, threshold):
645 """Inverse label binarization transformation using thresholding."""
646
647 if output_type == "binary" and y.ndim == 2 and y.shape[1] > 2:
648 raise ValueError("output_type='binary', but y.shape = {0}".format(y.shape))
649
650 if output_type != "binary" and y.shape[1] != len(classes):
651 raise ValueError(
652 "The number of class is not equal to the number of dimension of y."
653 )
654
655 classes = np.asarray(classes)
656
657 # Perform thresholding
658 if sp.issparse(y):
659 if threshold > 0:
660 if y.format not in ("csr", "csc"):
661 y = y.tocsr()
662 y.data = np.array(y.data > threshold, dtype=int)
663 y.eliminate_zeros()
664 else:
665 y = np.array(y.toarray() > threshold, dtype=int)
666 else:
667 y = np.array(y > threshold, dtype=int)
668
669 # Inverse transform data
670 if output_type == "binary":
671 if sp.issparse(y):
672 y = y.toarray()
673 if y.ndim == 2 and y.shape[1] == 2:
674 return classes[y[:, 1]]
675 else:
676 if len(classes) == 1:
677 return np.repeat(classes[0], len(y))
678 else:
679 return classes[y.ravel()]
680
681 elif output_type == "multilabel-indicator":
682 return y
683
684 else:
685 raise ValueError("{0} format is not supported".format(output_type))
686
687
class MultiLabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):
    """Transform between iterable of iterables and a multilabel format.

    Although a list of sets or tuples is a very intuitive format for multilabel
    data, it is unwieldy to process. This transformer converts between this
    intuitive format and the supported multilabel format: a (samples x classes)
    binary matrix indicating the presence of a class label.

    Parameters
    ----------
    classes : array-like of shape (n_classes,), default=None
        Indicates an ordering for the class labels.
        All entries should be unique (cannot contain duplicate classes).

    sparse_output : bool, default=False
        Set to True if output binary array is desired in CSR sparse format.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        A copy of the `classes` parameter when provided.
        Otherwise it corresponds to the sorted set of classes found
        when fitting.

    See Also
    --------
    OneHotEncoder : Encode categorical features using a one-hot aka one-of-K
        scheme.

    Examples
    --------
    >>> from sklearn.preprocessing import MultiLabelBinarizer
    >>> mlb = MultiLabelBinarizer()
    >>> mlb.fit_transform([(1, 2), (3,)])
    array([[1, 1, 0],
           [0, 0, 1]])
    >>> mlb.classes_
    array([1, 2, 3])

    >>> mlb.fit_transform([{'sci-fi', 'thriller'}, {'comedy'}])
    array([[0, 1, 1],
           [1, 0, 0]])
    >>> list(mlb.classes_)
    ['comedy', 'sci-fi', 'thriller']

    A common mistake is to pass in a list, which leads to the following issue:

    >>> mlb = MultiLabelBinarizer()
    >>> mlb.fit(['sci-fi', 'thriller', 'comedy'])
    MultiLabelBinarizer()
    >>> mlb.classes_
    array(['-', 'c', 'd', 'e', 'f', 'h', 'i', 'l', 'm', 'o', 'r', 's', 't',
        'y'], dtype=object)

    To correct this, the list of labels should be passed in as:

    >>> mlb = MultiLabelBinarizer()
    >>> mlb.fit([['sci-fi', 'thriller', 'comedy']])
    MultiLabelBinarizer()
    >>> mlb.classes_
    array(['comedy', 'sci-fi', 'thriller'], dtype=object)
    """

    _parameter_constraints: dict = {
        "classes": ["array-like", None],
        "sparse_output": ["boolean"],
    }

    def __init__(self, *, classes=None, sparse_output=False):
        self.classes = classes
        self.sparse_output = sparse_output

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, y):
        """Fit the label sets binarizer, storing :term:`classes_`.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample. If the `classes` parameter is set, `y` will not be
            iterated.

        Returns
        -------
        self : object
            Fitted estimator.

        Raises
        ------
        ValueError
            If the `classes` parameter contains duplicates.
        """
        # Any label-to-column cache from a previous fit is now stale.
        self._cached_dict = None

        if self.classes is None:
            classes = sorted(set(itertools.chain.from_iterable(y)))
        elif len(set(self.classes)) < len(self.classes):
            raise ValueError(
                "The classes argument contains duplicate "
                "classes. Remove these duplicates before passing "
                "them to MultiLabelBinarizer."
            )
        else:
            classes = self.classes
        # Fill a pre-allocated array element-wise so that labels which are
        # themselves sequences (e.g. tuples) are stored as single objects.
        dtype = int if all(isinstance(c, int) for c in classes) else object
        self.classes_ = np.empty(len(classes), dtype=dtype)
        self.classes_[:] = classes
        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def fit_transform(self, y):
        """Fit the label sets binarizer and transform the given label sets.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample.

        Returns
        -------
        y_indicator : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]`
            is in `y[i]`, and 0 otherwise. Sparse matrix will be of CSR
            format.
        """
        if self.classes is not None:
            return self.fit(y).transform(y)

        self._cached_dict = None

        # Automatically increment on new class: an unseen label is assigned
        # the current size of the mapping as its provisional column index.
        class_mapping = defaultdict(int)
        class_mapping.default_factory = class_mapping.__len__
        yt = self._transform(y, class_mapping)

        # Labels ordered by provisional column index (first-seen order).
        first_seen = sorted(class_mapping, key=class_mapping.get)

        # (make safe for tuples)
        dtype = int if all(isinstance(c, int) for c in first_seen) else object
        first_seen_arr = np.empty(len(first_seen), dtype=dtype)
        first_seen_arr[:] = first_seen
        # sort classes and reorder columns: `inverse` maps each provisional
        # column index to its position in the sorted `classes_`.
        self.classes_, inverse = np.unique(first_seen_arr, return_inverse=True)
        # Ensure yt.indices keeps its current dtype. `np.asarray` copies only
        # when the cast requires it; `np.array(..., copy=False)` would raise
        # under NumPy 2 whenever a copy cannot be avoided.
        yt.indices = np.asarray(inverse[yt.indices], dtype=yt.indices.dtype)

        if not self.sparse_output:
            yt = yt.toarray()

        return yt

    def transform(self, y):
        """Transform the given label sets.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample. Labels that are not in `classes_` are ignored with a
            warning.

        Returns
        -------
        y_indicator : array or CSR matrix, shape (n_samples, n_classes)
            A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` is in
            `y[i]`, and 0 otherwise.
        """
        check_is_fitted(self)

        class_to_index = self._build_cache()
        yt = self._transform(y, class_to_index)

        if not self.sparse_output:
            yt = yt.toarray()

        return yt

    def _build_cache(self):
        """Return the label-to-column mapping, building it on first use."""
        if self._cached_dict is None:
            self._cached_dict = dict(zip(self.classes_, range(len(self.classes_))))

        return self._cached_dict

    def _transform(self, y, class_mapping):
        """Transforms the label sets with a given mapping.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample.

        class_mapping : Mapping
            Maps from label to column index in label indicator matrix.
            A lookup raising ``KeyError`` marks the label as unknown.

        Returns
        -------
        y_indicator : sparse matrix of shape (n_samples, n_classes)
            Label indicator matrix. Will be of CSR format.
        """
        indices = array.array("i")
        indptr = array.array("i", [0])
        unknown = set()
        for labels in y:
            # A set so that a label repeated within one sample counts once.
            index = set()
            for label in labels:
                try:
                    index.add(class_mapping[label])
                except KeyError:
                    unknown.add(label)
            indices.extend(index)
            indptr.append(len(indices))
        if unknown:
            warnings.warn(
                "unknown class(es) {0} will be ignored".format(sorted(unknown, key=str))
            )
        data = np.ones(len(indices), dtype=int)

        return sp.csr_matrix(
            (data, indices, indptr), shape=(len(indptr) - 1, len(class_mapping))
        )

    def inverse_transform(self, yt):
        """Transform the given indicator matrix into label sets.

        Parameters
        ----------
        yt : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            A matrix containing only 1s and 0s.

        Returns
        -------
        y : list of tuples
            The set of labels for each sample such that `y[i]` consists of
            `classes_[j]` for each `yt[i, j] == 1`.

        Raises
        ------
        ValueError
            If `yt` does not have one column per class, or if it contains
            values other than 0 and 1.
        """
        check_is_fitted(self)

        if yt.shape[1] != len(self.classes_):
            raise ValueError(
                "Expected indicator for {0} classes, but got {1}".format(
                    len(self.classes_), yt.shape[1]
                )
            )

        if sp.issparse(yt):
            yt = yt.tocsr()
            if len(yt.data) != 0 and len(np.setdiff1d(yt.data, [0, 1])) > 0:
                raise ValueError("Expected only 0s and 1s in label indicator.")
            return [
                tuple(self.classes_.take(yt.indices[start:end]))
                for start, end in zip(yt.indptr[:-1], yt.indptr[1:])
            ]
        else:
            unexpected = np.setdiff1d(yt, [0, 1])
            if len(unexpected) > 0:
                raise ValueError(
                    "Expected only 0s and 1s in label indicator. Also got {0}".format(
                        unexpected
                    )
                )
            return [tuple(self.classes_.compress(indicators)) for indicators in yt]

    def _more_tags(self):
        return {"X_types": ["2dlabels"]}