Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scikit_learn-1.4.dev0-py3.8-linux-x86_64.egg/sklearn/utils/multiclass.py: 11%

172 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-12 06:31 +0000

1""" 

2The :mod:`sklearn.utils.multiclass` module includes utilities to handle 

3multiclass/multioutput target in classifiers. 

4""" 

5 

6# Author: Arnaud Joly, Joel Nothman, Hamzeh Alsalhi 

7# 

8# License: BSD 3 clause 

9import warnings 

10from collections.abc import Sequence 

11from itertools import chain 

12 

13import numpy as np 

14from scipy.sparse import issparse 

15 

16from ..utils._array_api import get_namespace 

17from ..utils.fixes import VisibleDeprecationWarning 

18from .validation import _assert_all_finite, check_array 

19 

20 

def _unique_multiclass(y):
    """Return the distinct labels of a binary or multiclass target.

    Inputs exposing ``__array__`` or reported as array API compliant by
    ``get_namespace`` are reduced with the namespace's ``unique_values``;
    any other iterable is collapsed into a plain ``set``.
    """
    xp, is_array_api_compliant = get_namespace(y)
    treat_as_array = hasattr(y, "__array__") or is_array_api_compliant
    if not treat_as_array:
        return set(y)
    return xp.unique_values(xp.asarray(y))

27 

28 

def _unique_indicator(y):
    """Return ``arange(n_columns)`` for a label indicator matrix ``y``.

    ``y`` is first validated with ``check_array`` (CSR, CSC and COO sparse
    formats are accepted); the labels are the column indices of the result.
    """
    xp, _ = get_namespace(y)
    validated = check_array(y, input_name="y", accept_sparse=["csr", "csc", "coo"])
    n_labels = validated.shape[1]
    return xp.arange(n_labels)

34 

35 

# Dispatch table used by ``unique_labels``: maps a target type name to the
# helper that extracts the unique labels for that kind of target.
_FN_UNIQUE_LABELS = {
    **dict.fromkeys(("binary", "multiclass"), _unique_multiclass),
    "multilabel-indicator": _unique_indicator,
}

41 

42 

def unique_labels(*ys):
    """Extract an ordered array of unique labels.

    We don't allow:
        - mix of multilabel and multiclass (single label) targets
        - mix of label indicator matrix and anything else,
          because there are no explicit labels)
        - mix of label indicator matrices of different sizes
        - mix of string and integer labels

    At the moment, we also don't allow "multiclass-multioutput" input type.

    Parameters
    ----------
    *ys : array-likes
        Label values.

    Returns
    -------
    out : ndarray of shape (n_unique_labels,)
        An ordered array of unique labels.

    Raises
    ------
    ValueError
        If no argument is passed, if the targets are of different types
        (other than a binary/multiclass mix), if label indicator matrices
        have different numbers of columns, if the target type has no
        registered label extractor, or if string and non-string labels are
        mixed.

    Examples
    --------
    >>> from sklearn.utils.multiclass import unique_labels
    >>> unique_labels([3, 5, 5, 5, 7, 7])
    array([3, 5, 7])
    >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4])
    array([1, 2, 3, 4])
    >>> unique_labels([1, 2, 10], [5, 11])
    array([ 1,  2,  5, 10, 11])
    """
    # Reject an empty call before inferring the array namespace, so that the
    # documented ValueError does not depend on how namespace inference behaves
    # when it is given no arrays at all.
    if not ys:
        raise ValueError("No argument has been passed.")
    xp, is_array_api_compliant = get_namespace(*ys)

    # Check that we don't mix label format
    ys_types = {type_of_target(x) for x in ys}
    if ys_types == {"binary", "multiclass"}:
        # binary is a special case of multiclass: the mix is allowed
        ys_types = {"multiclass"}

    if len(ys_types) > 1:
        raise ValueError("Mix type of y not allowed, got types %s" % ys_types)

    label_type = ys_types.pop()

    # Check consistency for the indicator format: every matrix must have the
    # same number of columns
    if (
        label_type == "multilabel-indicator"
        and len(
            {check_array(y, accept_sparse=["csr", "csc", "coo"]).shape[1] for y in ys}
        )
        > 1
    ):
        raise ValueError(
            "Multi-label binary indicator input with different numbers of labels"
        )

    # Get the unique set of labels
    _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None)
    if _unique_labels is None:
        raise ValueError("Unknown label type: %s" % repr(ys))

    if is_array_api_compliant:
        # array_api does not allow for mixed dtypes
        unique_ys = xp.concat([_unique_labels(y) for y in ys])
        return xp.unique_values(unique_ys)

    ys_labels = set(chain.from_iterable(_unique_labels(y) for y in ys))
    # Check that we don't mix string type with number type
    if len({isinstance(label, str) for label in ys_labels}) > 1:
        raise ValueError("Mix of label input types (string and number)")

    return xp.asarray(sorted(ys_labels))

119 

120 

def _is_integral_float(y):
    """Tell whether ``y`` has a real floating dtype and holds only whole numbers.

    The values are cast to ``int64`` and back to the original dtype; ``y`` is
    integral when that round trip leaves every element unchanged.
    """
    xp, _ = get_namespace(y)
    is_real_float = xp.isdtype(y.dtype, "real floating")
    if not is_real_float:
        return is_real_float
    round_tripped = xp.astype(xp.astype(y, xp.int64), y.dtype)
    return bool(xp.all(round_tripped == y))

126 

127 

def is_multilabel(y):
    """Check if ``y`` is in a multilabel format.

    Parameters
    ----------
    y : {array-like, sparse matrix}
        Target values.

    Returns
    -------
    out : bool
        ``True`` if ``y`` is in a multilabel format, ``False`` otherwise.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils.multiclass import is_multilabel
    >>> is_multilabel([0, 1, 0, 1])
    False
    >>> is_multilabel([[1], [0, 2], []])
    False
    >>> is_multilabel(np.array([[1, 0], [0, 0]]))
    True
    >>> is_multilabel(np.array([[1], [0], [0]]))
    False
    >>> is_multilabel(np.array([[1, 0, 0]]))
    True
    """
    xp, is_array_api_compliant = get_namespace(y)
    if hasattr(y, "__array__") or isinstance(y, Sequence) or is_array_api_compliant:
        # Ragged input makes NumPy emit a VisibleDeprecationWarning or raise a
        # ValueError (see NEP 34,
        # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html); the
        # warning is promoted to an error so both cases share one handler.
        conversion_options = {
            "accept_sparse": True,
            "allow_nd": True,
            "force_all_finite": False,
            "ensure_2d": False,
            "ensure_min_samples": 0,
            "ensure_min_features": 0,
        }
        with warnings.catch_warnings():
            warnings.simplefilter("error", VisibleDeprecationWarning)
            try:
                y = check_array(y, dtype=None, **conversion_options)
            except (VisibleDeprecationWarning, ValueError) as exc:
                if str(exc).startswith("Complex data not supported"):
                    raise

                # Ragged arrays need an explicit dtype=object, see NEP 34
                y = check_array(y, dtype=object, **conversion_options)

    # A multilabel indicator is two-dimensional with at least two columns
    has_several_columns = hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1
    if not has_several_columns:
        return False

    if not issparse(y):
        labels = xp.unique_values(y)

        return labels.shape[0] < 3 and (
            xp.isdtype(y.dtype, ("bool", "signed integer", "unsigned integer"))
            or _is_integral_float(labels)
        )

    if y.format in ("dok", "lil"):
        y = y.tocsr()
    labels = xp.unique_values(y.data)
    if len(y.data) == 0:
        # No stored entry at all
        return True
    # Stored entries hold one value, or two values one of which is zero
    few_stored_values = labels.size == 1 or (labels.size == 2 and 0 in labels)
    return few_stored_values and (
        y.dtype.kind in "biu" or _is_integral_float(labels)  # bool, int, uint
    )

199 

200 

def check_classification_targets(y):
    """Ensure that target y is of a non-regression type.

    The type reported by ``type_of_target`` must be one of:
    'binary', 'multiclass', 'multiclass-multioutput',
    'multilabel-indicator', 'multilabel-sequences'.

    Parameters
    ----------
    y : array-like
        Target values.

    Raises
    ------
    ValueError
        If the target type is not one of the types listed above.
    """
    accepted_types = (
        "binary",
        "multiclass",
        "multiclass-multioutput",
        "multilabel-indicator",
        "multilabel-sequences",
    )
    y_type = type_of_target(y, input_name="y")
    if y_type in accepted_types:
        return

    message = (
        f"Unknown label type: {y_type}. Maybe you are trying to fit a "
        "classifier, which expects discrete classes on a "
        "regression target with continuous values."
    )
    raise ValueError(message)

226 

227 

def type_of_target(y, input_name=""):
    """Determine the type of data indicated by the target.

    Note that this type is the most specific type that can be inferred.
    For example:

    * ``binary`` is more specific but compatible with ``multiclass``.
    * ``multiclass`` of integers is more specific but compatible with
      ``continuous``.
    * ``multilabel-indicator`` is more specific but compatible with
      ``multiclass-multioutput``.

    Parameters
    ----------
    y : {array-like, sparse matrix}
        Target values. If a sparse matrix, `y` is expected to be a
        CSR/CSC matrix.

    input_name : str, default=""
        The data name used to construct the error message.

        .. versionadded:: 1.1.0

    Returns
    -------
    target_type : str
        One of:

        * 'continuous': `y` is an array-like of floats that are not all
          integers, and is 1d or a column vector.
        * 'continuous-multioutput': `y` is a 2d array of floats that are
          not all integers, and both dimensions are of size > 1.
        * 'binary': `y` contains <= 2 discrete values and is 1d or a column
          vector.
        * 'multiclass': `y` contains more than two discrete values, is not a
          sequence of sequences, and is 1d or a column vector.
        * 'multiclass-multioutput': `y` is a 2d array that contains more
          than two discrete values, is not a sequence of sequences, and both
          dimensions are of size > 1.
        * 'multilabel-indicator': `y` is a label indicator matrix, an array
          of two dimensions with at least two columns, and at most 2 unique
          values.
        * 'unknown': `y` is array-like but none of the above, such as a 3d
          array, sequence of sequences, or an array of non-sequence objects.

    Raises
    ------
    ValueError
        If `y` is a string or is neither a sequence, a sparse matrix, an
        object exposing ``__array__`` nor array API compliant; if the class
        of `y` is named 'SparseSeries' or 'SparseArray'; if the first row of
        the converted `y` is itself a non-string sequence (legacy
        sequence-of-sequences representation); or if ``check_array`` fails
        with a message starting with "Complex data not supported".

    Examples
    --------
    >>> from sklearn.utils.multiclass import type_of_target
    >>> import numpy as np
    >>> type_of_target([0.1, 0.6])
    'continuous'
    >>> type_of_target([1, -1, -1, 1])
    'binary'
    >>> type_of_target(['a', 'b', 'a'])
    'binary'
    >>> type_of_target([1.0, 2.0])
    'binary'
    >>> type_of_target([1, 0, 2])
    'multiclass'
    >>> type_of_target([1.0, 0.0, 3.0])
    'multiclass'
    >>> type_of_target(['a', 'b', 'c'])
    'multiclass'
    >>> type_of_target(np.array([[1, 2], [3, 1]]))
    'multiclass-multioutput'
    >>> type_of_target([[1, 2]])
    'multilabel-indicator'
    >>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]]))
    'continuous-multioutput'
    >>> type_of_target(np.array([[0, 1], [1, 1]]))
    'multilabel-indicator'
    """
    xp, is_array_api_compliant = get_namespace(y)
    # ``and`` binds tighter than ``or``: array API compliant inputs are always
    # accepted; anything else must be a sequence, a sparse matrix or expose
    # ``__array__``, and must not be a string.
    valid = (
        (isinstance(y, Sequence) or issparse(y) or hasattr(y, "__array__"))
        and not isinstance(y, str)
        or is_array_api_compliant
    )

    if not valid:
        raise ValueError(
            "Expected array-like (array or non-string sequence), got %r" % y
        )

    # Detected by class name only, so no import of the defining library is
    # needed here.
    sparse_pandas = y.__class__.__name__ in ["SparseSeries", "SparseArray"]
    if sparse_pandas:
        raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'")

    # The multilabel check runs first, on the raw input, before any conversion
    # done below.
    if is_multilabel(y):
        return "multilabel-indicator"

    # DeprecationWarning will be replaced by ValueError, see NEP 34
    # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html
    # We therefore catch both deprecation (NumPy < 1.24) warning and
    # value error (NumPy >= 1.24).
    check_y_kwargs = dict(
        accept_sparse=True,
        allow_nd=True,
        force_all_finite=False,
        ensure_2d=False,
        ensure_min_samples=0,
        ensure_min_features=0,
    )

    with warnings.catch_warnings():
        # Promote the warning to an exception so that both NumPy behaviours
        # end up in the same ``except`` clause.
        warnings.simplefilter("error", VisibleDeprecationWarning)
        if not issparse(y):
            try:
                y = check_array(y, dtype=None, **check_y_kwargs)
            except (VisibleDeprecationWarning, ValueError) as e:
                # Complex input is a genuine error: let it propagate instead
                # of retrying with dtype=object.
                if str(e).startswith("Complex data not supported"):
                    raise

                # dtype=object should be provided explicitly for ragged arrays,
                # see NEP 34
                y = check_array(y, dtype=object, **check_y_kwargs)

    # The old sequence of sequences format
    # NOTE(review): when IndexError is swallowed below, ``first_row`` is never
    # bound; the empty / wrong-ndim early returns that follow appear to cover
    # those inputs before ``first_row`` is read again - confirm.
    try:
        first_row = y[[0], :] if issparse(y) else y[0]
        if (
            not hasattr(first_row, "__array__")
            and isinstance(first_row, Sequence)
            and not isinstance(first_row, str)
        ):
            raise ValueError(
                "You appear to be using a legacy multi-label data"
                " representation. Sequence of sequences are no"
                " longer supported; use a binary array or sparse"
                " matrix instead - the MultiLabelBinarizer"
                " transformer can convert to this format."
            )
    except IndexError:
        pass

    # Invalid inputs
    if y.ndim not in (1, 2):
        # Number of dimension greater than 2: [[[1, 2]]]
        return "unknown"
    if not min(y.shape):
        # Empty ndarray: []/[[]]
        if y.ndim == 1:
            # 1-D empty array: []
            return "binary"  # []
        # 2-D empty array: [[]]
        return "unknown"
    # Only the first element of an object array is inspected here.
    if not issparse(y) and y.dtype == object and not isinstance(y.flat[0], str):
        # [obj_1] and not ["label_1"]
        return "unknown"

    # Check if multioutput
    if y.ndim == 2 and y.shape[1] > 1:
        suffix = "-multioutput"  # [[1, 2], [1, 2]]
    else:
        suffix = ""  # [1, 2, 3] or [[1], [2], [3]]

    # Check float and contains non-integer float values
    if xp.isdtype(y.dtype, "real floating"):
        # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
        # For sparse input only the stored entries are examined.
        data = y.data if issparse(y) else y
        if xp.any(data != xp.astype(data, int)):
            # presumably raises on NaN/inf, using ``input_name`` in the
            # message - verify against ``_assert_all_finite``
            _assert_all_finite(data, input_name=input_name)
            return "continuous" + suffix

    # Check multiclass
    if issparse(first_row):
        # Compare on the stored entries of the first row
        first_row = first_row.data
    if xp.unique_values(y).shape[0] > 2 or (y.ndim == 2 and len(first_row) > 1):
        # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
        return "multiclass" + suffix
    else:
        # Note: no suffix is appended on this branch
        return "binary"  # [1, 2] or [["a"], ["b"]]

400 

401 

def _check_partial_fit_first_call(clf, classes=None):
    """Private helper function for factorizing common classes param logic.

    Estimators implementing the ``partial_fit`` API must be given the list of
    possible classes on the first call to ``partial_fit``. On later calls,
    ``classes`` (when provided) must match the ``clf.classes_`` stored earlier.

    Returns True when this is detected to be the first call to
    ``partial_fit`` on ``clf``; in that case ``clf.classes_`` is set from
    ``classes``. Returns False otherwise.

    Raises ValueError when neither ``classes`` nor ``clf.classes_`` is
    available, or when ``classes`` disagrees with ``clf.classes_``.
    """
    known_classes = getattr(clf, "classes_", None)

    if classes is None:
        if known_classes is None:
            raise ValueError("classes must be passed on the first call to partial_fit.")
        # classes_ was set by an earlier call and nothing new was passed:
        # nothing to do
        return False

    if known_classes is None:
        # This is the first call to partial_fit
        clf.classes_ = unique_labels(classes)
        return True

    if not np.array_equal(known_classes, unique_labels(classes)):
        raise ValueError(
            "`classes=%r` is not the same as on last call "
            "to partial_fit, was: %r" % (classes, known_classes)
        )
    return False

435 

436 

def class_distribution(y, sample_weight=None):
    """Compute class priors from multioutput-multiclass target data.

    Parameters
    ----------
    y : {array-like, sparse matrix} of size (n_samples, n_outputs)
        The labels for each example. Dense input is converted with
        ``np.asarray``; sparse input is converted to CSC.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    classes : list of size n_outputs of ndarray of size (n_classes,)
        List of classes for each column.

    n_classes : list of int of size n_outputs
        Number of classes in each column.

    class_prior : list of size n_outputs of ndarray of size (n_classes,)
        Class distribution of each column.
    """
    classes = []
    n_classes = []
    class_prior = []

    if not issparse(y):
        # Honour the documented "array-like" contract: nested lists have
        # neither ``.shape`` nor 2-D slicing.
        y = np.asarray(y)

    n_samples, n_outputs = y.shape
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)

    if issparse(y):
        y = y.tocsc()
        # Number of explicitly stored entries in each column
        y_nnz = np.diff(y.indptr)
        # The total weight does not depend on the column: compute it once
        total_weight = None if sample_weight is None else np.sum(sample_weight)

        for k in range(n_outputs):
            col_start, col_stop = y.indptr[k], y.indptr[k + 1]
            col_nonzero = y.indices[col_start:col_stop]
            # separate sample weights for zero and non-zero elements
            if sample_weight is not None:
                nz_samp_weight = sample_weight[col_nonzero]
                zeros_samp_weight_sum = total_weight - np.sum(nz_samp_weight)
            else:
                nz_samp_weight = None
                zeros_samp_weight_sum = n_samples - y_nnz[k]

            classes_k, y_k = np.unique(y.data[col_start:col_stop], return_inverse=True)
            class_prior_k = np.bincount(y_k, weights=nz_samp_weight)

            # An explicit zero was found, combine its weight with the weight
            # of the implicit zeros
            if 0 in classes_k:
                class_prior_k[classes_k == 0] += zeros_samp_weight_sum

            # If there is an implicit zero and it is not in classes and
            # class_prior, make an entry for it
            if 0 not in classes_k and y_nnz[k] < n_samples:
                classes_k = np.insert(classes_k, 0, 0)
                class_prior_k = np.insert(class_prior_k, 0, zeros_samp_weight_sum)

            classes.append(classes_k)
            n_classes.append(classes_k.shape[0])
            class_prior.append(class_prior_k / class_prior_k.sum())
    else:
        for k in range(n_outputs):
            classes_k, y_k = np.unique(y[:, k], return_inverse=True)
            classes.append(classes_k)
            n_classes.append(classes_k.shape[0])
            class_prior_k = np.bincount(y_k, weights=sample_weight)
            class_prior.append(class_prior_k / class_prior_k.sum())

    return (classes, n_classes, class_prior)

509 

510 

def _ovr_decision_function(predictions, confidences, n_classes):
    """Compute a continuous, tie-breaking OvR decision function from OvO.

    A continuous value is included on top of the votes so that computing
    AUC or calibration is meaningful.

    Parameters
    ----------
    predictions : array-like of shape (n_samples, n_classifiers)
        Predicted classes for each binary classifier.

    confidences : array-like of shape (n_samples, n_classifiers)
        Decision functions or predicted probabilities for positive class
        for each binary classifier.

    n_classes : int
        Number of classes. n_classifiers must be
        ``n_classes * (n_classes - 1 ) / 2``.
    """
    out_shape = (predictions.shape[0], n_classes)
    vote_counts = np.zeros(out_shape)
    confidence_sums = np.zeros(out_shape)

    # Column ``col`` of the inputs belongs to the classifier separating class
    # ``neg`` (prediction 0) from class ``pos`` (prediction 1), with the pairs
    # enumerated in lexicographic order.
    class_pairs = [
        (neg, pos) for neg in range(n_classes) for pos in range(neg + 1, n_classes)
    ]
    for col, (neg, pos) in enumerate(class_pairs):
        pair_confidence = confidences[:, col]
        pair_prediction = predictions[:, col]
        confidence_sums[:, neg] -= pair_confidence
        confidence_sums[:, pos] += pair_confidence
        vote_counts[pair_prediction == 0, neg] += 1
        vote_counts[pair_prediction == 1, pos] += 1

    # Squash the summed confidences into (-1/3, 1/3) with the monotonic map
    # f: x -> x / (3 * (|x| + 1)) before adding them to the votes. Using 1/3
    # rather than 1/2 keeps the result away from the limits, so confidences
    # only break ties and never overturn a decision that rests on a
    # difference of one vote.
    scale = 3 * (np.abs(confidence_sums) + 1)
    tie_breaker = confidence_sums / scale
    return vote_counts + tie_breaker