1"""
2The :mod:`sklearn.utils.multiclass` module includes utilities to handle
3multiclass/multioutput target in classifiers.
4"""

# Author: Arnaud Joly, Joel Nothman, Hamzeh Alsalhi
#
# License: BSD 3 clause
import warnings
from collections.abc import Sequence
from itertools import chain

import numpy as np
from scipy.sparse import issparse

from ..utils._array_api import get_namespace
from ..utils.fixes import VisibleDeprecationWarning
from .validation import _assert_all_finite, check_array


def _unique_multiclass(y):
    xp, is_array_api_compliant = get_namespace(y)
    if hasattr(y, "__array__") or is_array_api_compliant:
        return xp.unique_values(xp.asarray(y))
    else:
        return set(y)


def _unique_indicator(y):
    xp, _ = get_namespace(y)
    return xp.arange(
        check_array(y, input_name="y", accept_sparse=["csr", "csc", "coo"]).shape[1]
    )


_FN_UNIQUE_LABELS = {
    "binary": _unique_multiclass,
    "multiclass": _unique_multiclass,
    "multilabel-indicator": _unique_indicator,
}


def unique_labels(*ys):
    """Extract an ordered array of unique labels.

    We don't allow:
        - mix of multilabel and multiclass (single label) targets
        - mix of label indicator matrix and anything else,
          because there are no explicit labels
        - mix of label indicator matrices of different sizes
        - mix of string and integer labels

    At the moment, we also don't allow "multiclass-multioutput" input type.

    Parameters
    ----------
    *ys : array-likes
        Label values.

    Returns
    -------
    out : ndarray of shape (n_unique_labels,)
        An ordered array of unique labels.

    Examples
    --------
    >>> from sklearn.utils.multiclass import unique_labels
    >>> unique_labels([3, 5, 5, 5, 7, 7])
    array([3, 5, 7])
    >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4])
    array([1, 2, 3, 4])
    >>> unique_labels([1, 2, 10], [5, 11])
    array([ 1,  2,  5, 10, 11])
    """
    xp, is_array_api_compliant = get_namespace(*ys)
    if not ys:
        raise ValueError("No argument has been passed.")
    # Check that we don't mix label format

    ys_types = set(type_of_target(x) for x in ys)
    if ys_types == {"binary", "multiclass"}:
        ys_types = {"multiclass"}

    if len(ys_types) > 1:
        raise ValueError("Mix type of y not allowed, got types %s" % ys_types)

    label_type = ys_types.pop()

    # Check consistency for the indicator format
    if (
        label_type == "multilabel-indicator"
        and len(
            set(
                check_array(y, accept_sparse=["csr", "csc", "coo"]).shape[1] for y in ys
            )
        )
        > 1
    ):
        raise ValueError(
            "Multi-label binary indicator input with different numbers of labels"
        )

    # Get the unique set of labels
    _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None)
    if not _unique_labels:
        raise ValueError("Unknown label type: %s" % repr(ys))

    if is_array_api_compliant:
        # array_api does not allow for mixed dtypes
        unique_ys = xp.concat([_unique_labels(y) for y in ys])
        return xp.unique_values(unique_ys)

    ys_labels = set(chain.from_iterable((i for i in _unique_labels(y)) for y in ys))
    # Check that we don't mix string type with number type
    if len(set(isinstance(label, str) for label in ys_labels)) > 1:
        raise ValueError("Mix of label input types (string and number)")

    return xp.asarray(sorted(ys_labels))


def _is_integral_float(y):
    xp, is_array_api_compliant = get_namespace(y)
    return xp.isdtype(y.dtype, "real floating") and bool(
        xp.all(xp.astype((xp.astype(y, xp.int64)), y.dtype) == y)
    )


def is_multilabel(y):
    """Check if ``y`` is in a multilabel format.

    Parameters
    ----------
    y : ndarray of shape (n_samples,)
        Target values.

    Returns
    -------
    out : bool
        Return ``True`` if ``y`` is in a multilabel format, else ``False``.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils.multiclass import is_multilabel
    >>> is_multilabel([0, 1, 0, 1])
    False
    >>> is_multilabel([[1], [0, 2], []])
    False
    >>> is_multilabel(np.array([[1, 0], [0, 0]]))
    True
    >>> is_multilabel(np.array([[1], [0], [0]]))
    False
    >>> is_multilabel(np.array([[1, 0, 0]]))
    True
    """
    xp, is_array_api_compliant = get_namespace(y)
    if hasattr(y, "__array__") or isinstance(y, Sequence) or is_array_api_compliant:
        # DeprecationWarning will be replaced by ValueError, see NEP 34
        # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html
        check_y_kwargs = dict(
            accept_sparse=True,
            allow_nd=True,
            force_all_finite=False,
            ensure_2d=False,
            ensure_min_samples=0,
            ensure_min_features=0,
        )
        with warnings.catch_warnings():
            warnings.simplefilter("error", VisibleDeprecationWarning)
            try:
                y = check_array(y, dtype=None, **check_y_kwargs)
            except (VisibleDeprecationWarning, ValueError) as e:
                if str(e).startswith("Complex data not supported"):
                    raise

                # dtype=object should be provided explicitly for ragged arrays,
                # see NEP 34
                y = check_array(y, dtype=object, **check_y_kwargs)

    if not (hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1):
        return False

    if issparse(y):
        if y.format in ("dok", "lil"):
            y = y.tocsr()
        labels = xp.unique_values(y.data)
        return (
            len(y.data) == 0
            or (labels.size == 1 or (labels.size == 2) and (0 in labels))
            and (y.dtype.kind in "biu" or _is_integral_float(labels))  # bool, int, uint
        )
    else:
        labels = xp.unique_values(y)

        return labels.shape[0] < 3 and (
            xp.isdtype(y.dtype, ("bool", "signed integer", "unsigned integer"))
            or _is_integral_float(labels)
        )


def check_classification_targets(y):
    """Ensure that target y is of a non-regression type.

    Only the following target types (as defined in type_of_target) are allowed:
        'binary', 'multiclass', 'multiclass-multioutput',
        'multilabel-indicator', 'multilabel-sequences'

    Parameters
    ----------
    y : array-like
        Target values.
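
    Examples
    --------
    A couple of illustrative calls (the toy targets below are made up for
    this example, not taken from elsewhere); valid classification targets
    pass silently, while a continuous target would raise a ``ValueError``:

    >>> from sklearn.utils.multiclass import check_classification_targets
    >>> check_classification_targets([0, 1, 2, 2, 1])
    >>> check_classification_targets(["spam", "ham", "spam"])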
212 """
213 y_type = type_of_target(y, input_name="y")
214 if y_type not in [
215 "binary",
216 "multiclass",
217 "multiclass-multioutput",
218 "multilabel-indicator",
219 "multilabel-sequences",
220 ]:
221 raise ValueError(
222 f"Unknown label type: {y_type}. Maybe you are trying to fit a "
223 "classifier, which expects discrete classes on a "
224 "regression target with continuous values."
225 )
226
227
def type_of_target(y, input_name=""):
    """Determine the type of data indicated by the target.

    Note that this type is the most specific type that can be inferred.
    For example:

        * ``binary`` is more specific but compatible with ``multiclass``.
        * ``multiclass`` of integers is more specific but compatible with
          ``continuous``.
        * ``multilabel-indicator`` is more specific but compatible with
          ``multiclass-multioutput``.

    Parameters
    ----------
    y : {array-like, sparse matrix}
        Target values. If a sparse matrix, `y` is expected to be a
        CSR/CSC matrix.

    input_name : str, default=""
        The data name used to construct the error message.

        .. versionadded:: 1.1.0

    Returns
    -------
    target_type : str
        One of:

        * 'continuous': `y` is an array-like of floats that are not all
          integers, and is 1d or a column vector.
        * 'continuous-multioutput': `y` is a 2d array of floats that are
          not all integers, and both dimensions are of size > 1.
        * 'binary': `y` contains <= 2 discrete values and is 1d or a column
          vector.
        * 'multiclass': `y` contains more than two discrete values, is not a
          sequence of sequences, and is 1d or a column vector.
        * 'multiclass-multioutput': `y` is a 2d array that contains more
          than two discrete values, is not a sequence of sequences, and both
          dimensions are of size > 1.
        * 'multilabel-indicator': `y` is a label indicator matrix, an array
          of two dimensions with at least two columns, and at most 2 unique
          values.
        * 'unknown': `y` is array-like but none of the above, such as a 3d
          array, sequence of sequences, or an array of non-sequence objects.

    Examples
    --------
    >>> from sklearn.utils.multiclass import type_of_target
    >>> import numpy as np
    >>> type_of_target([0.1, 0.6])
    'continuous'
    >>> type_of_target([1, -1, -1, 1])
    'binary'
    >>> type_of_target(['a', 'b', 'a'])
    'binary'
    >>> type_of_target([1.0, 2.0])
    'binary'
    >>> type_of_target([1, 0, 2])
    'multiclass'
    >>> type_of_target([1.0, 0.0, 3.0])
    'multiclass'
    >>> type_of_target(['a', 'b', 'c'])
    'multiclass'
    >>> type_of_target(np.array([[1, 2], [3, 1]]))
    'multiclass-multioutput'
    >>> type_of_target([[1, 2]])
    'multilabel-indicator'
    >>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]]))
    'continuous-multioutput'
    >>> type_of_target(np.array([[0, 1], [1, 1]]))
    'multilabel-indicator'
    """
    xp, is_array_api_compliant = get_namespace(y)
    valid = (
        (isinstance(y, Sequence) or issparse(y) or hasattr(y, "__array__"))
        and not isinstance(y, str)
        or is_array_api_compliant
    )

    if not valid:
        raise ValueError(
            "Expected array-like (array or non-string sequence), got %r" % y
        )

    sparse_pandas = y.__class__.__name__ in ["SparseSeries", "SparseArray"]
    if sparse_pandas:
        raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'")

    if is_multilabel(y):
        return "multilabel-indicator"

    # DeprecationWarning will be replaced by ValueError, see NEP 34
    # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html
    # We therefore catch both the deprecation warning (NumPy < 1.24) and the
    # value error (NumPy >= 1.24).
    check_y_kwargs = dict(
        accept_sparse=True,
        allow_nd=True,
        force_all_finite=False,
        ensure_2d=False,
        ensure_min_samples=0,
        ensure_min_features=0,
    )

    with warnings.catch_warnings():
        warnings.simplefilter("error", VisibleDeprecationWarning)
        if not issparse(y):
            try:
                y = check_array(y, dtype=None, **check_y_kwargs)
            except (VisibleDeprecationWarning, ValueError) as e:
                if str(e).startswith("Complex data not supported"):
                    raise

                # dtype=object should be provided explicitly for ragged arrays,
                # see NEP 34
                y = check_array(y, dtype=object, **check_y_kwargs)

    # The old sequence of sequences format
    try:
        first_row = y[[0], :] if issparse(y) else y[0]
        if (
            not hasattr(first_row, "__array__")
            and isinstance(first_row, Sequence)
            and not isinstance(first_row, str)
        ):
            raise ValueError(
                "You appear to be using a legacy multi-label data"
                " representation. Sequence of sequences are no"
                " longer supported; use a binary array or sparse"
                " matrix instead - the MultiLabelBinarizer"
                " transformer can convert to this format."
            )
    except IndexError:
        pass

    # Invalid inputs
    if y.ndim not in (1, 2):
        # Number of dimensions greater than 2: [[[1, 2]]]
        return "unknown"
    if not min(y.shape):
        # Empty ndarray: []/[[]]
        if y.ndim == 1:
            # 1-D empty array: []
            return "binary"  # []
        # 2-D empty array: [[]]
        return "unknown"
    if not issparse(y) and y.dtype == object and not isinstance(y.flat[0], str):
        # [obj_1] and not ["label_1"]
        return "unknown"

    # Check if multioutput
    if y.ndim == 2 and y.shape[1] > 1:
        suffix = "-multioutput"  # [[1, 2], [1, 2]]
    else:
        suffix = ""  # [1, 2, 3] or [[1], [2], [3]]

    # Check float and contains non-integer float values
    if xp.isdtype(y.dtype, "real floating"):
        # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
        data = y.data if issparse(y) else y
        if xp.any(data != xp.astype(data, int)):
            _assert_all_finite(data, input_name=input_name)
            return "continuous" + suffix

    # Check multiclass
    if issparse(first_row):
        first_row = first_row.data
    if xp.unique_values(y).shape[0] > 2 or (y.ndim == 2 and len(first_row) > 1):
        # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
        return "multiclass" + suffix
    else:
        return "binary"  # [1, 2] or [["a"], ["b"]]


def _check_partial_fit_first_call(clf, classes=None):
    """Private helper function for factorizing common classes param logic.

    Estimators that implement the ``partial_fit`` API need to be provided with
    the list of possible classes at the first call to partial_fit.

    Subsequent calls to partial_fit should check that ``classes`` is still
    consistent with a previous value of ``clf.classes_`` when provided.

    This function returns True if it detects that this was the first call to
    ``partial_fit`` on ``clf``. In that case the ``classes_`` attribute is also
    set on ``clf``.

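    A minimal sketch of the intended call pattern from inside an estimator's
    ``partial_fit`` (``clf``, ``X``, ``y`` and ``classes`` are placeholder
    names for this illustration, not part of this module)::

        if _check_partial_fit_first_call(clf, classes):
            # First call: ``clf.classes_`` has just been set from ``classes``;
            # allocate any per-class state here.
            pass
        # On later calls, ``classes`` may be omitted, or it must match
        # ``clf.classes_``; otherwise a ValueError is raised.
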
415 """
416 if getattr(clf, "classes_", None) is None and classes is None:
417 raise ValueError("classes must be passed on the first call to partial_fit.")
418
419 elif classes is not None:
420 if getattr(clf, "classes_", None) is not None:
421 if not np.array_equal(clf.classes_, unique_labels(classes)):
422 raise ValueError(
423 "`classes=%r` is not the same as on last call "
424 "to partial_fit, was: %r" % (classes, clf.classes_)
425 )
426
427 else:
428 # This is the first call to partial_fit
429 clf.classes_ = unique_labels(classes)
430 return True
431
432 # classes is None and clf.classes_ has already previously been set:
433 # nothing to do
434 return False
435
436
def class_distribution(y, sample_weight=None):
    """Compute class priors from multioutput-multiclass target data.

    Parameters
    ----------
    y : {array-like, sparse matrix} of size (n_samples, n_outputs)
        The labels for each example.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    classes : list of size n_outputs of ndarray of size (n_classes,)
        List of classes for each column.

    n_classes : list of int of size n_outputs
        Number of classes in each column.

    class_prior : list of size n_outputs of ndarray of size (n_classes,)
        Class distribution of each column.
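
    Examples
    --------
    A small illustrative two-output target (the values below are made up
    purely for this example):

    >>> import numpy as np
    >>> from sklearn.utils.multiclass import class_distribution
    >>> y = np.array([[2, 0], [1, 0], [2, 1]])
    >>> classes, n_classes, class_prior = class_distribution(y)
    >>> n_classes
    [2, 2]
    >>> classes[0]
    array([1, 2])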
458 """
459 classes = []
460 n_classes = []
461 class_prior = []
462
463 n_samples, n_outputs = y.shape
464 if sample_weight is not None:
465 sample_weight = np.asarray(sample_weight)
466
467 if issparse(y):
468 y = y.tocsc()
469 y_nnz = np.diff(y.indptr)
470
471 for k in range(n_outputs):
472 col_nonzero = y.indices[y.indptr[k] : y.indptr[k + 1]]
473 # separate sample weights for zero and non-zero elements
474 if sample_weight is not None:
475 nz_samp_weight = sample_weight[col_nonzero]
476 zeros_samp_weight_sum = np.sum(sample_weight) - np.sum(nz_samp_weight)
477 else:
478 nz_samp_weight = None
479 zeros_samp_weight_sum = y.shape[0] - y_nnz[k]
480
481 classes_k, y_k = np.unique(
482 y.data[y.indptr[k] : y.indptr[k + 1]], return_inverse=True
483 )
484 class_prior_k = np.bincount(y_k, weights=nz_samp_weight)
485
486 # An explicit zero was found, combine its weight with the weight
487 # of the implicit zeros
488 if 0 in classes_k:
489 class_prior_k[classes_k == 0] += zeros_samp_weight_sum
490
491 # If an there is an implicit zero and it is not in classes and
492 # class_prior, make an entry for it
493 if 0 not in classes_k and y_nnz[k] < y.shape[0]:
494 classes_k = np.insert(classes_k, 0, 0)
495 class_prior_k = np.insert(class_prior_k, 0, zeros_samp_weight_sum)
496
497 classes.append(classes_k)
498 n_classes.append(classes_k.shape[0])
499 class_prior.append(class_prior_k / class_prior_k.sum())
500 else:
501 for k in range(n_outputs):
502 classes_k, y_k = np.unique(y[:, k], return_inverse=True)
503 classes.append(classes_k)
504 n_classes.append(classes_k.shape[0])
505 class_prior_k = np.bincount(y_k, weights=sample_weight)
506 class_prior.append(class_prior_k / class_prior_k.sum())
507
508 return (classes, n_classes, class_prior)
509
510
def _ovr_decision_function(predictions, confidences, n_classes):
    """Compute a continuous, tie-breaking OvR decision function from OvO.

    It is important to include a continuous value, not only votes,
    to make computing AUC or calibration meaningful.

    Parameters
    ----------
    predictions : array-like of shape (n_samples, n_classifiers)
        Predicted classes for each binary classifier.

    confidences : array-like of shape (n_samples, n_classifiers)
        Decision functions or predicted probabilities for positive class
        for each binary classifier.

    n_classes : int
        Number of classes. n_classifiers must be
        ``n_classes * (n_classes - 1) / 2``.
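
    Examples
    --------
    A minimal sketch with ``n_classes=3``, i.e. three pairwise classifiers
    ordered as (0, 1), (0, 2), (1, 2); the predictions and confidences below
    are made up purely for illustration:

    >>> import numpy as np
    >>> from sklearn.utils.multiclass import _ovr_decision_function
    >>> predictions = np.array([[0, 1, 1]])
    >>> confidences = np.array([[-0.5, 0.7, 0.3]])
    >>> _ovr_decision_function(predictions, confidences, 3).argmax(axis=1)
    array([2])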
529 """
530 n_samples = predictions.shape[0]
531 votes = np.zeros((n_samples, n_classes))
532 sum_of_confidences = np.zeros((n_samples, n_classes))
533
534 k = 0
535 for i in range(n_classes):
536 for j in range(i + 1, n_classes):
537 sum_of_confidences[:, i] -= confidences[:, k]
538 sum_of_confidences[:, j] += confidences[:, k]
539 votes[predictions[:, k] == 0, i] += 1
540 votes[predictions[:, k] == 1, j] += 1
541 k += 1
542
543 # Monotonically transform the sum_of_confidences to (-1/3, 1/3)
544 # and add it with votes. The monotonic transformation is
545 # f: x -> x / (3 * (|x| + 1)), it uses 1/3 instead of 1/2
546 # to ensure that we won't reach the limits and change vote order.
547 # The motivation is to use confidence levels as a way to break ties in
548 # the votes without switching any decision made based on a difference
549 # of 1 vote.
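    # For example, even a very large margin such as x = 1e6 maps to
    # 1e6 / (3 * (1e6 + 1)) ~= 0.333 < 1/3, so confidences can never
    # overturn a one-vote difference.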
    transformed_confidences = sum_of_confidences / (
        3 * (np.abs(sum_of_confidences) + 1)
    )
    return votes + transformed_confidences