1"""
2The :mod:`sklearn.utils.multiclass` module includes utilities to handle
3multiclass/multioutput target in classifiers.
4"""

# Author: Arnaud Joly, Joel Nothman, Hamzeh Alsalhi
#
# License: BSD 3 clause
import warnings
from collections.abc import Sequence
from itertools import chain

import numpy as np
from scipy.sparse import issparse

from ..utils._array_api import get_namespace
from ..utils.fixes import VisibleDeprecationWarning
from .validation import _assert_all_finite, check_array


def _unique_multiclass(y):
    xp, is_array_api_compliant = get_namespace(y)
    if hasattr(y, "__array__") or is_array_api_compliant:
        return xp.unique_values(xp.asarray(y))
    else:
        return set(y)


def _unique_indicator(y):
    xp, _ = get_namespace(y)
    return xp.arange(
        check_array(y, input_name="y", accept_sparse=["csr", "csc", "coo"]).shape[1]
    )


_FN_UNIQUE_LABELS = {
    "binary": _unique_multiclass,
    "multiclass": _unique_multiclass,
    "multilabel-indicator": _unique_indicator,
}


def unique_labels(*ys):
    """Extract an ordered array of unique labels.

    We don't allow:
        - mix of multilabel and multiclass (single label) targets
        - mix of label indicator matrix and anything else,
          because there are no explicit labels
        - mix of label indicator matrices of different sizes
        - mix of string and integer labels

    At the moment, we also don't allow "multiclass-multioutput" input type.

    Parameters
    ----------
    *ys : array-likes
        Label values.

    Returns
    -------
    out : ndarray of shape (n_unique_labels,)
        An ordered array of unique labels.

    Examples
    --------
    >>> from sklearn.utils.multiclass import unique_labels
    >>> unique_labels([3, 5, 5, 5, 7, 7])
    array([3, 5, 7])
    >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4])
    array([1, 2, 3, 4])
    >>> unique_labels([1, 2, 10], [5, 11])
    array([ 1,  2,  5, 10, 11])
    """
    xp, is_array_api_compliant = get_namespace(*ys)
    if not ys:
        raise ValueError("No argument has been passed.")
    # Check that we don't mix label format

    ys_types = set(type_of_target(x) for x in ys)
    if ys_types == {"binary", "multiclass"}:
        ys_types = {"multiclass"}

    if len(ys_types) > 1:
        raise ValueError("Mix type of y not allowed, got types %s" % ys_types)

    label_type = ys_types.pop()

    # Check consistency for the indicator format
    if (
        label_type == "multilabel-indicator"
        and len(
            set(
                check_array(y, accept_sparse=["csr", "csc", "coo"]).shape[1] for y in ys
            )
        )
        > 1
    ):
        raise ValueError(
            "Multi-label binary indicator input with different numbers of labels"
        )

    # Get the unique set of labels
    _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None)
    if not _unique_labels:
        raise ValueError("Unknown label type: %s" % repr(ys))

    if is_array_api_compliant:
        # array_api does not allow for mixed dtypes
        unique_ys = xp.concat([_unique_labels(y) for y in ys])
        return xp.unique_values(unique_ys)

    ys_labels = set(chain.from_iterable((i for i in _unique_labels(y)) for y in ys))
    # Check that we don't mix string type with number type
    if len(set(isinstance(label, str) for label in ys_labels)) > 1:
        raise ValueError("Mix of label input types (string and number)")

    return xp.asarray(sorted(ys_labels))


def _is_integral_float(y):
    xp, is_array_api_compliant = get_namespace(y)
    return xp.isdtype(y.dtype, "real floating") and bool(
        xp.all(xp.astype((xp.astype(y, xp.int64)), y.dtype) == y)
    )


def is_multilabel(y):
    """Check if ``y`` is in a multilabel format.

    Parameters
    ----------
    y : ndarray of shape (n_samples,)
        Target values.

    Returns
    -------
    out : bool
        Return ``True`` if ``y`` is in a multilabel format, else ``False``.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils.multiclass import is_multilabel
    >>> is_multilabel([0, 1, 0, 1])
    False
    >>> is_multilabel([[1], [0, 2], []])
    False
    >>> is_multilabel(np.array([[1, 0], [0, 0]]))
    True
    >>> is_multilabel(np.array([[1], [0], [0]]))
    False
    >>> is_multilabel(np.array([[1, 0, 0]]))
    True
    """
    xp, is_array_api_compliant = get_namespace(y)
    if hasattr(y, "__array__") or isinstance(y, Sequence) or is_array_api_compliant:
        # DeprecationWarning will be replaced by ValueError, see NEP 34
        # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html
        check_y_kwargs = dict(
            accept_sparse=True,
            allow_nd=True,
            force_all_finite=False,
            ensure_2d=False,
            ensure_min_samples=0,
            ensure_min_features=0,
        )
        with warnings.catch_warnings():
            warnings.simplefilter("error", VisibleDeprecationWarning)
            try:
                y = check_array(y, dtype=None, **check_y_kwargs)
            except (VisibleDeprecationWarning, ValueError) as e:
                if str(e).startswith("Complex data not supported"):
                    raise

                # dtype=object should be provided explicitly for ragged arrays,
                # see NEP 34
                y = check_array(y, dtype=object, **check_y_kwargs)

    if not (hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1):
        return False

    if issparse(y):
        if y.format in ("dok", "lil"):
            y = y.tocsr()
        labels = xp.unique_values(y.data)
        return (
            len(y.data) == 0
            or (labels.size == 1 or (labels.size == 2) and (0 in labels))
            and (y.dtype.kind in "biu" or _is_integral_float(labels))  # bool, int, uint
        )
    else:
        labels = xp.unique_values(y)

        return labels.shape[0] < 3 and (
            xp.isdtype(y.dtype, ("bool", "signed integer", "unsigned integer"))
            or _is_integral_float(labels)
        )


def check_classification_targets(y):
    """Ensure that target y is of a non-regression type.

    Only the following target types (as defined in type_of_target) are allowed:
        'binary', 'multiclass', 'multiclass-multioutput',
        'multilabel-indicator', 'multilabel-sequences'

    Parameters
    ----------
    y : array-like
        Target values.
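
    Examples
    --------
    A couple of illustrative calls (the toy targets below are made up for
    this example, not taken from elsewhere); valid classification targets
    pass silently, while a continuous target would raise a ``ValueError``:

    >>> from sklearn.utils.multiclass import check_classification_targets
    >>> check_classification_targets([0, 1, 2, 2, 1])
    >>> check_classification_targets(["spam", "ham", "spam"])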
212 """
213 y_type = type_of_target(y, input_name="y")
214 if y_type not in [
215 "binary",
216 "multiclass",
217 "multiclass-multioutput",
218 "multilabel-indicator",
219 "multilabel-sequences",
220 ]:
221 raise ValueError(
222 f"Unknown label type: {y_type}. Maybe you are trying to fit a "
223 "classifier, which expects discrete classes on a "
224 "regression target with continuous values."
225 )
226
227
def type_of_target(y, input_name=""):
    """Determine the type of data indicated by the target.

    Note that this type is the most specific type that can be inferred.
    For example:

        * ``binary`` is more specific but compatible with ``multiclass``.
        * ``multiclass`` of integers is more specific but compatible with
          ``continuous``.
        * ``multilabel-indicator`` is more specific but compatible with
          ``multiclass-multioutput``.

    Parameters
    ----------
    y : {array-like, sparse matrix}
        Target values. If a sparse matrix, `y` is expected to be a
        CSR/CSC matrix.

    input_name : str, default=""
        The data name used to construct the error message.

        .. versionadded:: 1.1.0

    Returns
    -------
    target_type : str
        One of:

        * 'continuous': `y` is an array-like of floats that are not all
          integers, and is 1d or a column vector.
        * 'continuous-multioutput': `y` is a 2d array of floats that are
          not all integers, and both dimensions are of size > 1.
        * 'binary': `y` contains <= 2 discrete values and is 1d or a column
          vector.
        * 'multiclass': `y` contains more than two discrete values, is not a
          sequence of sequences, and is 1d or a column vector.
        * 'multiclass-multioutput': `y` is a 2d array that contains more
          than two discrete values, is not a sequence of sequences, and both
          dimensions are of size > 1.
        * 'multilabel-indicator': `y` is a label indicator matrix, an array
          of two dimensions with at least two columns, and at most 2 unique
          values.
        * 'unknown': `y` is array-like but none of the above, such as a 3d
          array, sequence of sequences, or an array of non-sequence objects.

    Examples
    --------
    >>> from sklearn.utils.multiclass import type_of_target
    >>> import numpy as np
    >>> type_of_target([0.1, 0.6])
    'continuous'
    >>> type_of_target([1, -1, -1, 1])
    'binary'
    >>> type_of_target(['a', 'b', 'a'])
    'binary'
    >>> type_of_target([1.0, 2.0])
    'binary'
    >>> type_of_target([1, 0, 2])
    'multiclass'
    >>> type_of_target([1.0, 0.0, 3.0])
    'multiclass'
    >>> type_of_target(['a', 'b', 'c'])
    'multiclass'
    >>> type_of_target(np.array([[1, 2], [3, 1]]))
    'multiclass-multioutput'
    >>> type_of_target([[1, 2]])
    'multilabel-indicator'
    >>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]]))
    'continuous-multioutput'
    >>> type_of_target(np.array([[0, 1], [1, 1]]))
    'multilabel-indicator'
    """
    xp, is_array_api_compliant = get_namespace(y)
    valid = (
        (isinstance(y, Sequence) or issparse(y) or hasattr(y, "__array__"))
        and not isinstance(y, str)
        or is_array_api_compliant
    )

    if not valid:
        raise ValueError(
            "Expected array-like (array or non-string sequence), got %r" % y
        )

    sparse_pandas = y.__class__.__name__ in ["SparseSeries", "SparseArray"]
    if sparse_pandas:
        raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'")

    if is_multilabel(y):
        return "multilabel-indicator"

    # DeprecationWarning will be replaced by ValueError, see NEP 34
    # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html
    # We therefore catch both the deprecation warning (NumPy < 1.24) and the
    # value error (NumPy >= 1.24).
    check_y_kwargs = dict(
        accept_sparse=True,
        allow_nd=True,
        force_all_finite=False,
        ensure_2d=False,
        ensure_min_samples=0,
        ensure_min_features=0,
    )

    with warnings.catch_warnings():
        warnings.simplefilter("error", VisibleDeprecationWarning)
        if not issparse(y):
            try:
                y = check_array(y, dtype=None, **check_y_kwargs)
            except (VisibleDeprecationWarning, ValueError) as e:
                if str(e).startswith("Complex data not supported"):
                    raise

                # dtype=object should be provided explicitly for ragged arrays,
                # see NEP 34
                y = check_array(y, dtype=object, **check_y_kwargs)

    # The old sequence of sequences format
    try:
        first_row = y[[0], :] if issparse(y) else y[0]
        if (
            not hasattr(first_row, "__array__")
            and isinstance(first_row, Sequence)
            and not isinstance(first_row, str)
        ):
            raise ValueError(
                "You appear to be using a legacy multi-label data"
                " representation. Sequence of sequences are no"
                " longer supported; use a binary array or sparse"
                " matrix instead - the MultiLabelBinarizer"
                " transformer can convert to this format."
            )
    except IndexError:
        pass

    # Invalid inputs
    if y.ndim not in (1, 2):
        # Number of dimensions greater than 2: [[[1, 2]]]
        return "unknown"
    if not min(y.shape):
        # Empty ndarray: []/[[]]
        if y.ndim == 1:
            # 1-D empty array: []
            return "binary"  # []
        # 2-D empty array: [[]]
        return "unknown"
    if not issparse(y) and y.dtype == object and not isinstance(y.flat[0], str):
        # [obj_1] and not ["label_1"]
        return "unknown"

    # Check if multioutput
    if y.ndim == 2 and y.shape[1] > 1:
        suffix = "-multioutput"  # [[1, 2], [1, 2]]
    else:
        suffix = ""  # [1, 2, 3] or [[1], [2], [3]]

    # Check float and contains non-integer float values
    if xp.isdtype(y.dtype, "real floating"):
        # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
        data = y.data if issparse(y) else y
        if xp.any(data != xp.astype(data, int)):
            _assert_all_finite(data, input_name=input_name)
            return "continuous" + suffix

    # Check multiclass
    if issparse(first_row):
        first_row = first_row.data
    if xp.unique_values(y).shape[0] > 2 or (y.ndim == 2 and len(first_row) > 1):
        # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
        return "multiclass" + suffix
    else:
        return "binary"  # [1, 2] or [["a"], ["b"]]


def _check_partial_fit_first_call(clf, classes=None):
    """Private helper function for factorizing common classes param logic.

    Estimators that implement the ``partial_fit`` API need to be provided with
    the list of possible classes at the first call to partial_fit.

    Subsequent calls to partial_fit should check that ``classes`` is still
    consistent with a previous value of ``clf.classes_`` when provided.

    This function returns True if it detects that this was the first call to
    ``partial_fit`` on ``clf``. In that case the ``classes_`` attribute is also
    set on ``clf``.

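    A minimal sketch of the intended call pattern from inside an estimator's
    ``partial_fit`` (``clf``, ``X``, ``y`` and ``classes`` are placeholder
    names for this illustration, not part of this module)::

        if _check_partial_fit_first_call(clf, classes):
            # First call: ``clf.classes_`` has just been set from ``classes``;
            # allocate any per-class state here.
            pass
        # On later calls, ``classes`` may be omitted, or it must match
        # ``clf.classes_``; otherwise a ValueError is raised.
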
415 """
416 if getattr(clf, "classes_", None) is None and classes is None:
417 raise ValueError("classes must be passed on the first call to partial_fit.")
418
419 elif classes is not None:
420 if getattr(clf, "classes_", None) is not None:
421 if not np.array_equal(clf.classes_, unique_labels(classes)):
422 raise ValueError(
423 "`classes=%r` is not the same as on last call "
424 "to partial_fit, was: %r" % (classes, clf.classes_)
425 )
426
427 else:
428 # This is the first call to partial_fit
429 clf.classes_ = unique_labels(classes)
430 return True
431
432 # classes is None and clf.classes_ has already previously been set:
433 # nothing to do
434 return False
435
436
def class_distribution(y, sample_weight=None):
    """Compute class priors from multioutput-multiclass target data.

    Parameters
    ----------
    y : {array-like, sparse matrix} of size (n_samples, n_outputs)
        The labels for each example.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    classes : list of size n_outputs of ndarray of size (n_classes,)
        List of classes for each column.

    n_classes : list of int of size n_outputs
        Number of classes in each column.

    class_prior : list of size n_outputs of ndarray of size (n_classes,)
        Class distribution of each column.
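
    Examples
    --------
    A small illustrative two-output target (the values below are made up
    purely for this example):

    >>> import numpy as np
    >>> from sklearn.utils.multiclass import class_distribution
    >>> y = np.array([[2, 0], [1, 0], [2, 1]])
    >>> classes, n_classes, class_prior = class_distribution(y)
    >>> n_classes
    [2, 2]
    >>> classes[0]
    array([1, 2])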
458 """
459 classes = []
460 n_classes = []
461 class_prior = []
462
463 n_samples, n_outputs = y.shape
464 if sample_weight is not None:
465 sample_weight = np.asarray(sample_weight)
466
467 if issparse(y):
468 y = y.tocsc()
469 y_nnz = np.diff(y.indptr)
470
471 for k in range(n_outputs):
472 col_nonzero = y.indices[y.indptr[k] : y.indptr[k + 1]]
473 # separate sample weights for zero and non-zero elements
474 if sample_weight is not None:
475 nz_samp_weight = sample_weight[col_nonzero]
476 zeros_samp_weight_sum = np.sum(sample_weight) - np.sum(nz_samp_weight)
477 else:
478 nz_samp_weight = None
479 zeros_samp_weight_sum = y.shape[0] - y_nnz[k]
480
481 classes_k, y_k = np.unique(
482 y.data[y.indptr[k] : y.indptr[k + 1]], return_inverse=True
483 )
484 class_prior_k = np.bincount(y_k, weights=nz_samp_weight)
485
486 # An explicit zero was found, combine its weight with the weight
487 # of the implicit zeros
488 if 0 in classes_k:
489 class_prior_k[classes_k == 0] += zeros_samp_weight_sum
490
491 # If an there is an implicit zero and it is not in classes and
492 # class_prior, make an entry for it
493 if 0 not in classes_k and y_nnz[k] < y.shape[0]:
494 classes_k = np.insert(classes_k, 0, 0)
495 class_prior_k = np.insert(class_prior_k, 0, zeros_samp_weight_sum)
496
497 classes.append(classes_k)
498 n_classes.append(classes_k.shape[0])
499 class_prior.append(class_prior_k / class_prior_k.sum())
500 else:
501 for k in range(n_outputs):
502 classes_k, y_k = np.unique(y[:, k], return_inverse=True)
503 classes.append(classes_k)
504 n_classes.append(classes_k.shape[0])
505 class_prior_k = np.bincount(y_k, weights=sample_weight)
506 class_prior.append(class_prior_k / class_prior_k.sum())
507
508 return (classes, n_classes, class_prior)
509
510
def _ovr_decision_function(predictions, confidences, n_classes):
    """Compute a continuous, tie-breaking OvR decision function from OvO.

    It is important to include a continuous value, not only votes,
    to make computing AUC or calibration meaningful.

    Parameters
    ----------
    predictions : array-like of shape (n_samples, n_classifiers)
        Predicted classes for each binary classifier.

    confidences : array-like of shape (n_samples, n_classifiers)
        Decision functions or predicted probabilities for positive class
        for each binary classifier.

    n_classes : int
        Number of classes. n_classifiers must be
        ``n_classes * (n_classes - 1) / 2``.
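
    Examples
    --------
    A minimal sketch with ``n_classes=3``, i.e. three pairwise classifiers
    ordered as (0, 1), (0, 2), (1, 2); the predictions and confidences below
    are made up purely for illustration:

    >>> import numpy as np
    >>> from sklearn.utils.multiclass import _ovr_decision_function
    >>> predictions = np.array([[0, 1, 1]])
    >>> confidences = np.array([[-0.5, 0.7, 0.3]])
    >>> _ovr_decision_function(predictions, confidences, 3).argmax(axis=1)
    array([2])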
529 """
530 n_samples = predictions.shape[0]
531 votes = np.zeros((n_samples, n_classes))
532 sum_of_confidences = np.zeros((n_samples, n_classes))
533
534 k = 0
535 for i in range(n_classes):
536 for j in range(i + 1, n_classes):
537 sum_of_confidences[:, i] -= confidences[:, k]
538 sum_of_confidences[:, j] += confidences[:, k]
539 votes[predictions[:, k] == 0, i] += 1
540 votes[predictions[:, k] == 1, j] += 1
541 k += 1
542
543 # Monotonically transform the sum_of_confidences to (-1/3, 1/3)
544 # and add it with votes. The monotonic transformation is
545 # f: x -> x / (3 * (|x| + 1)), it uses 1/3 instead of 1/2
546 # to ensure that we won't reach the limits and change vote order.
547 # The motivation is to use confidence levels as a way to break ties in
548 # the votes without switching any decision made based on a difference
549 # of 1 vote.
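    # For example, even a very large margin such as x = 1e6 maps to
    # 1e6 / (3 * (1e6 + 1)) ~= 0.333 < 1/3, so confidences can never
    # overturn a one-vote difference.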
    transformed_confidences = sum_of_confidences / (
        3 * (np.abs(sum_of_confidences) + 1)
    )
    return votes + transformed_confidences