
1""" 

2The :mod:`sklearn.utils` module includes various utilities. 

3""" 

4 

5import math 

6import numbers 

7import platform 

8import struct 

9import timeit 

10import warnings 

11from collections.abc import Sequence 

12from contextlib import contextmanager, suppress 

13from itertools import compress, islice 

14 

15import numpy as np 

16from scipy.sparse import issparse 

17 

18from .. import get_config 

19from ..exceptions import DataConversionWarning 

20from . import _joblib, metadata_routing 

21from ._bunch import Bunch 

22from ._estimator_html_repr import estimator_html_repr 

23from ._param_validation import Integral, Interval, validate_params 

24from .class_weight import compute_class_weight, compute_sample_weight 

25from .deprecation import deprecated 

26from .discovery import all_estimators 

27from .fixes import parse_version, threadpool_info 

28from .murmurhash import murmurhash3_32 

29from .validation import ( 

30 _is_arraylike_not_scalar, 

31 _is_pandas_df, 

32 _is_polars_df, 

33 _use_interchange_protocol, 

34 as_float_array, 

35 assert_all_finite, 

36 check_array, 

37 check_consistent_length, 

38 check_random_state, 

39 check_scalar, 

40 check_symmetric, 

41 check_X_y, 

42 column_or_1d, 

43 indexable, 

44) 

45 

# Do not deprecate parallel_backend and register_parallel_backend as they are
# needed to tune `scikit-learn` behavior and have a different effect if called
# from the vendored version or the site-packages version. The others are
# utilities that are independent of scikit-learn, so they are not part of the
# scikit-learn public API.
parallel_backend = _joblib.parallel_backend
register_parallel_backend = _joblib.register_parallel_backend

__all__ = [
    "murmurhash3_32",
    "as_float_array",
    "assert_all_finite",
    "check_array",
    "check_random_state",
    "compute_class_weight",
    "compute_sample_weight",
    "column_or_1d",
    "check_consistent_length",
    "check_X_y",
    "check_scalar",
    "indexable",
    "check_symmetric",
    "indices_to_mask",
    "deprecated",
    "parallel_backend",
    "register_parallel_backend",
    "resample",
    "shuffle",
    "check_matplotlib_support",
    "all_estimators",
    "DataConversionWarning",
    "estimator_html_repr",
    "Bunch",
    "metadata_routing",
]

IS_PYPY = platform.python_implementation() == "PyPy"
_IS_32BIT = 8 * struct.calcsize("P") == 32
_IS_WASM = platform.machine() in ["wasm32", "wasm64"]


def _in_unstable_openblas_configuration():
    """Return True if in an unstable configuration for OpenBLAS."""

    # Import libraries which might load OpenBLAS.
    import numpy  # noqa
    import scipy  # noqa

    modules_info = threadpool_info()

    open_blas_used = any(info["internal_api"] == "openblas" for info in modules_info)
    if not open_blas_used:
        return False

    # OpenBLAS 0.3.16 fixed instability for arm64, see:
    # https://github.com/xianyi/OpenBLAS/blob/1b6db3dbba672b4f8af935bd43a1ff6cff4d20b7/Changelog.txt#L56-L58  # noqa
    openblas_arm64_stable_version = parse_version("0.3.16")
    for info in modules_info:
        if info["internal_api"] != "openblas":
            continue
        openblas_version = info.get("version")
        openblas_architecture = info.get("architecture")
        if openblas_version is None or openblas_architecture is None:
            # Cannot be sure that OpenBLAS is good enough. Assume unstable:
            return True
        if (
            openblas_architecture == "neoversen1"
            and parse_version(openblas_version) < openblas_arm64_stable_version
        ):
            # See discussions in https://github.com/numpy/numpy/issues/19411
            return True
    return False


@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "mask": ["array-like"],
    },
    prefer_skip_nested_validation=True,
)
def safe_mask(X, mask):
    """Return a mask which is safe to use on X.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Data on which to apply mask.

    mask : array-like
        Mask to be used on X.

    Returns
    -------
    mask : ndarray
        Array that is safe to use on X.
    """
    mask = np.asarray(mask)
    if np.issubdtype(mask.dtype, np.signedinteger):
        return mask

    if hasattr(X, "toarray"):
        ind = np.arange(mask.shape[0])
        mask = ind[mask]
    return mask
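
# Illustrative sketch (assumes NumPy/SciPy as imported above; hypothetical
# values): a boolean mask on a dense array passes through unchanged, while
# for a sparse input it is converted to integer indices, which older SciPy
# versions require for row indexing.
#
#   >>> from scipy.sparse import csr_matrix
#   >>> safe_mask(np.eye(3), np.array([False, True, True]))
#   array([False,  True,  True])
#   >>> safe_mask(csr_matrix(np.eye(3)), np.array([False, True, True]))
#   array([1, 2])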


def axis0_safe_slice(X, mask, len_mask):
    """Slice X along axis 0 in a way that is safer than using safe_mask alone.

    This is safer than applying safe_mask directly since it returns an
    empty array when a sparse matrix is sliced with a boolean mask
    with all False, instead of raising an unhelpful error in older
    versions of SciPy.

    See: https://github.com/scipy/scipy/issues/5361

    Also note that we could avoid the dot product in
    _huber_loss_and_gradient by checking that len_mask is not zero, but
    that is not going to be the bottleneck, since the numbers of outliers
    and non-outliers are typically non-zero, and it would make the code
    harder to follow.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Data on which to apply mask.

    mask : ndarray
        Mask to be used on X.

    len_mask : int
        The length of the mask.

    Returns
    -------
    X_subset : {ndarray, sparse matrix}
        The rows of X selected by mask, or an empty 2D array when
        len_mask is zero.
    """
    if len_mask != 0:
        return X[safe_mask(X, mask), :]
    return np.zeros(shape=(0, X.shape[1]))
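
# Illustrative sketch (hypothetical values): with `len_mask == 0`, an empty
# 2D array with the right column count is returned instead of slicing the
# sparse matrix with an all-False boolean mask.
#
#   >>> from scipy.sparse import csr_matrix
#   >>> X_sparse = csr_matrix(np.ones((4, 2)))
#   >>> axis0_safe_slice(X_sparse, np.zeros(4, dtype=bool), 0).shape
#   (0, 2)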


def _array_indexing(array, key, key_dtype, axis):
    """Index an array or scipy.sparse matrix consistently across NumPy versions."""
    if issparse(array) and key_dtype == "bool":
        key = np.asarray(key)
    if isinstance(key, tuple):
        key = list(key)
    return array[key, ...] if axis == 0 else array[:, key]


def _pandas_indexing(X, key, key_dtype, axis):
    """Index a pandas dataframe or a series."""
    if _is_arraylike_not_scalar(key):
        key = np.asarray(key)

    if key_dtype == "int" and not (isinstance(key, slice) or np.isscalar(key)):
        # using take() instead of iloc[] ensures the return value is a "proper"
        # copy that will not raise SettingWithCopyWarning
        return X.take(key, axis=axis)
    else:
        # check whether we should index with loc or iloc
        indexer = X.iloc if key_dtype == "int" else X.loc
        return indexer[:, key] if axis else indexer[key]


def _list_indexing(X, key, key_dtype):
    """Index a Python list."""
    if np.isscalar(key) or isinstance(key, slice):
        # key is a slice or a scalar
        return X[key]
    if key_dtype == "bool":
        # key is a boolean array-like
        return list(compress(X, key))
    # key is an integer array-like
    return [X[idx] for idx in key]


def _polars_indexing(X, key, key_dtype, axis):
    """Index X with the polars interchange protocol."""
    # Polars behavior is more consistent with lists
    if isinstance(key, np.ndarray):
        key = key.tolist()

    if axis == 1:
        return X[:, key]
    else:
        return X[key]


def _determine_key_type(key, accept_slice=True):
    """Determine the data type of key.

    Parameters
    ----------
    key : scalar, slice or array-like
        The key from which we want to infer the data type.

    accept_slice : bool, default=True
        Whether or not to raise an error if the key is a slice.

    Returns
    -------
    dtype : {'int', 'str', 'bool', None}
        Returns the data type of key.
    """
    err_msg = (
        "No valid specification of the columns. Only a scalar, list or "
        "slice of all integers or all strings, or boolean mask is "
        "allowed"
    )

    dtype_to_str = {int: "int", str: "str", bool: "bool", np.bool_: "bool"}
    array_dtype_to_str = {
        "i": "int",
        "u": "int",
        "b": "bool",
        "O": "str",
        "U": "str",
        "S": "str",
    }

    if key is None:
        return None
    if isinstance(key, tuple(dtype_to_str.keys())):
        try:
            return dtype_to_str[type(key)]
        except KeyError:
            raise ValueError(err_msg)
    if isinstance(key, slice):
        if not accept_slice:
            raise TypeError(
                "Only array-like or scalar are supported. A Python slice was given."
            )
        if key.start is None and key.stop is None:
            return None
        key_start_type = _determine_key_type(key.start)
        key_stop_type = _determine_key_type(key.stop)
        if key_start_type is not None and key_stop_type is not None:
            if key_start_type != key_stop_type:
                raise ValueError(err_msg)
        if key_start_type is not None:
            return key_start_type
        return key_stop_type
    if isinstance(key, (list, tuple)):
        unique_key = set(key)
        key_type = {_determine_key_type(elt) for elt in unique_key}
        if not key_type:
            return None
        if len(key_type) != 1:
            raise ValueError(err_msg)
        return key_type.pop()
    if hasattr(key, "dtype"):
        try:
            return array_dtype_to_str[key.dtype.kind]
        except KeyError:
            raise ValueError(err_msg)
    raise ValueError(err_msg)
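
# Illustrative sketch (hypothetical values): the inferred key type for a few
# representative keys.
#
#   >>> _determine_key_type(0), _determine_key_type("a")
#   ('int', 'str')
#   >>> _determine_key_type([True, False])
#   'bool'
#   >>> _determine_key_type(slice(0, 2))
#   'int'
#   >>> _determine_key_type(np.array(["a", "b"]))
#   'str'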


def _safe_indexing(X, indices, *, axis=0):
    """Return rows, items or columns of X using indices.

    .. warning::

        This utility is documented, but **private**. This means that
        backward compatibility might be broken without any deprecation
        cycle.

    Parameters
    ----------
    X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series
        Data from which to sample rows, items or columns. `list` is only
        supported when `axis=0`.
    indices : bool, int, str, slice, array-like
        - If `axis=0`, boolean and integer array-like, integer slice,
          and scalar integer are supported.
        - If `axis=1`:
            - to select a single column, `indices` can be of `int` type for
              all `X` types and `str` only for dataframe. The selected subset
              will be 1D, unless `X` is a sparse matrix in which case it will
              be 2D.
            - to select multiple columns, `indices` can be one of the
              following: `list`, `array`, `slice`. The type used in
              these containers can be one of the following: `int`, 'bool' and
              `str`. However, `str` is only supported when `X` is a dataframe.
              The selected subset will be 2D.
    axis : int, default=0
        The axis along which `X` will be subsampled. `axis=0` will select
        rows while `axis=1` will select columns.

    Returns
    -------
    subset
        Subset of X on axis 0 or 1.

    Notes
    -----
    CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are
    not supported.
    """
    if indices is None:
        return X

    if axis not in (0, 1):
        raise ValueError(
            "'axis' should be either 0 (to index rows) or 1 (to index "
            "columns). Got {} instead.".format(axis)
        )

    indices_dtype = _determine_key_type(indices)

    if axis == 0 and indices_dtype == "str":
        raise ValueError("String indexing is not supported with 'axis=0'")

    if axis == 1 and hasattr(X, "ndim") and X.ndim != 2:
        raise ValueError(
            "'X' should be a 2D NumPy array, 2D sparse matrix or pandas "
            "dataframe when indexing the columns (i.e. 'axis=1'). "
            "Got {} instead with {} dimension(s).".format(type(X), X.ndim)
        )

    if (
        axis == 1
        and indices_dtype == "str"
        and not (_is_pandas_df(X) or _use_interchange_protocol(X))
    ):
        raise ValueError(
            "Specifying the columns using strings is only supported for dataframes."
        )

    if hasattr(X, "iloc"):
        # TODO: we should probably use _is_pandas_df(X) instead but this would
        # require updating some tests such as test_train_test_split_mock_pandas.
        return _pandas_indexing(X, indices, indices_dtype, axis=axis)
    elif _is_polars_df(X):
        return _polars_indexing(X, indices, indices_dtype, axis=axis)
    elif hasattr(X, "shape"):
        return _array_indexing(X, indices, indices_dtype, axis=axis)
    else:
        return _list_indexing(X, indices, indices_dtype)
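
# Illustrative sketch (hypothetical values): the same call dispatches to the
# container-specific helpers above, so arrays and plain lists behave alike.
#
#   >>> _safe_indexing(np.array([[1, 2], [3, 4], [5, 6]]), [0, 2], axis=0)
#   array([[1, 2],
#          [5, 6]])
#   >>> _safe_indexing([[1, 2], [3, 4], [5, 6]], [0, 2])
#   [[1, 2], [5, 6]]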


def _safe_assign(X, values, *, row_indexer=None, column_indexer=None):
    """Safe assignment to a numpy array, sparse matrix, or pandas dataframe.

    Parameters
    ----------
    X : {ndarray, sparse-matrix, dataframe}
        Array to be modified. It is expected to be 2-dimensional.

    values : ndarray
        The values to be assigned to `X`.

    row_indexer : array-like, dtype={int, bool}, default=None
        A 1-dimensional array to select the rows of interest. If `None`, all
        rows are selected.

    column_indexer : array-like, dtype={int, bool}, default=None
        A 1-dimensional array to select the columns of interest. If `None`, all
        columns are selected.
    """
    row_indexer = slice(None, None, None) if row_indexer is None else row_indexer
    column_indexer = (
        slice(None, None, None) if column_indexer is None else column_indexer
    )

    if hasattr(X, "iloc"):  # pandas dataframe
        with warnings.catch_warnings():
            # pandas >= 1.5 raises a warning when using iloc to set values in a column
            # that does not have the same type as the column being set. It happens
            # for instance when setting a categorical column with a string.
            # In the future the behavior won't change and the warning should disappear.
            # TODO(1.3): check if the warning is still raised or remove the filter.
            warnings.simplefilter("ignore", FutureWarning)
            X.iloc[row_indexer, column_indexer] = values
    else:  # numpy array or sparse matrix
        X[row_indexer, column_indexer] = values


def _get_column_indices_for_bool_or_int(key, n_columns):
    # Convert key into list of positive integer indexes
    try:
        idx = _safe_indexing(np.arange(n_columns), key)
    except IndexError as e:
        raise ValueError(
            f"all features must be in [0, {n_columns - 1}] or [-{n_columns}, 0]"
        ) from e
    return np.atleast_1d(idx).tolist()
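
# Illustrative sketch (hypothetical values): negative indices and boolean
# masks are both normalized to positive integer positions.
#
#   >>> _get_column_indices_for_bool_or_int([0, -1], 3)
#   [0, 2]
#   >>> _get_column_indices_for_bool_or_int([True, False, True], 3)
#   [0, 2]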


def _get_column_indices(X, key):
    """Get feature column indices for input data X and key.

    For accepted values of `key`, see the docstring of
    :func:`_safe_indexing`.
    """
    key_dtype = _determine_key_type(key)
    if _use_interchange_protocol(X):
        return _get_column_indices_interchange(X.__dataframe__(), key, key_dtype)

    n_columns = X.shape[1]
    if isinstance(key, (list, tuple)) and not key:
        # we get an empty list
        return []
    elif key_dtype in ("bool", "int"):
        return _get_column_indices_for_bool_or_int(key, n_columns)
    else:
        try:
            all_columns = X.columns
        except AttributeError:
            raise ValueError(
                "Specifying the columns using strings is only supported for dataframes."
            )
        if isinstance(key, str):
            columns = [key]
        elif isinstance(key, slice):
            start, stop = key.start, key.stop
            if start is not None:
                start = all_columns.get_loc(start)
            if stop is not None:
                # pandas indexing with strings is endpoint included
                stop = all_columns.get_loc(stop) + 1
            else:
                stop = n_columns + 1
            return list(islice(range(n_columns), start, stop))
        else:
            columns = list(key)

        try:
            column_indices = []
            for col in columns:
                col_idx = all_columns.get_loc(col)
                if not isinstance(col_idx, numbers.Integral):
                    raise ValueError(
                        f"Selected columns, {columns}, are not unique in dataframe"
                    )
                column_indices.append(col_idx)

        except KeyError as e:
            raise ValueError("A given column is not a column of the dataframe") from e

        return column_indices
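
# Illustrative sketch (assumes pandas is available; hypothetical values):
# string keys resolve against the dataframe columns, and string slices
# include their endpoint.
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"a": [1], "b": [2], "c": [3]})
#   >>> _get_column_indices(df, ["a", "c"])
#   [0, 2]
#   >>> _get_column_indices(df, slice("a", "b"))
#   [0, 1]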


def _get_column_indices_interchange(X_interchange, key, key_dtype):
    """Same as _get_column_indices but for X with __dataframe__ protocol."""

    n_columns = X_interchange.num_columns()

    if isinstance(key, (list, tuple)) and not key:
        # we get an empty list
        return []
    elif key_dtype in ("bool", "int"):
        return _get_column_indices_for_bool_or_int(key, n_columns)
    else:
        column_names = list(X_interchange.column_names())

        if isinstance(key, slice):
            if key.step not in [1, None]:
                raise NotImplementedError("key.step must be 1 or None")
            start, stop = key.start, key.stop
            if start is not None:
                start = column_names.index(start)

            if stop is not None:
                stop = column_names.index(stop) + 1
            else:
                stop = n_columns + 1
            return list(islice(range(n_columns), start, stop))

        selected_columns = [key] if np.isscalar(key) else key

        try:
            return [column_names.index(col) for col in selected_columns]
        except ValueError as e:
            raise ValueError("A given column is not a column of the dataframe") from e


@validate_params(
    {
        "replace": ["boolean"],
        "n_samples": [Interval(numbers.Integral, 1, None, closed="left"), None],
        "random_state": ["random_state"],
        "stratify": ["array-like", None],
    },
    prefer_skip_nested_validation=True,
)
def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify=None):
    """Resample arrays or sparse matrices in a consistent way.

    The default strategy implements one step of the bootstrapping
    procedure.

    Parameters
    ----------
    *arrays : sequence of array-like of shape (n_samples,) or \
            (n_samples, n_outputs)
        Indexable data-structures can be arrays, lists, dataframes or scipy
        sparse matrices with consistent first dimension.

    replace : bool, default=True
        Implements resampling with replacement. If False, this will implement
        (sliced) random permutations.

    n_samples : int, default=None
        Number of samples to generate. If left to None this is
        automatically set to the first dimension of the arrays.
        If replace is False it should not be larger than the length of
        arrays.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for shuffling
        the data.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    stratify : array-like of shape (n_samples,) or (n_samples, n_outputs), \
            default=None
        If not None, data is split in a stratified fashion, using this as
        the class labels.

    Returns
    -------
    resampled_arrays : sequence of array-like of shape (n_samples,) or \
            (n_samples, n_outputs)
        Sequence of resampled copies of the collections. The original arrays
        are not impacted.

    See Also
    --------
    shuffle : Shuffle arrays or sparse matrices in a consistent way.

    Examples
    --------
    It is possible to mix sparse and dense arrays in the same run::

      >>> import numpy as np
      >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
      >>> y = np.array([0, 1, 2])

      >>> from scipy.sparse import coo_matrix
      >>> X_sparse = coo_matrix(X)

      >>> from sklearn.utils import resample
      >>> X, X_sparse, y = resample(X, X_sparse, y, random_state=0)
      >>> X
      array([[1., 0.],
             [2., 1.],
             [1., 0.]])

      >>> X_sparse
      <3x2 sparse matrix of type '<... 'numpy.float64'>'
          with 4 stored elements in Compressed Sparse Row format>

      >>> X_sparse.toarray()
      array([[1., 0.],
             [2., 1.],
             [1., 0.]])

      >>> y
      array([0, 1, 0])

      >>> resample(y, n_samples=2, random_state=0)
      array([0, 1])

    Example using stratification::

      >>> y = [0, 0, 1, 1, 1, 1, 1, 1, 1]
      >>> resample(y, n_samples=5, replace=False, stratify=y,
      ...          random_state=0)
      [1, 1, 1, 0, 1]
    """
    max_n_samples = n_samples
    random_state = check_random_state(random_state)

    if len(arrays) == 0:
        return None

    first = arrays[0]
    n_samples = first.shape[0] if hasattr(first, "shape") else len(first)

    if max_n_samples is None:
        max_n_samples = n_samples
    elif (max_n_samples > n_samples) and (not replace):
        raise ValueError(
            "Cannot sample %d out of arrays with dim %d when replace is False"
            % (max_n_samples, n_samples)
        )

    check_consistent_length(*arrays)

    if stratify is None:
        if replace:
            indices = random_state.randint(0, n_samples, size=(max_n_samples,))
        else:
            indices = np.arange(n_samples)
            random_state.shuffle(indices)
            indices = indices[:max_n_samples]
    else:
        # Code adapted from StratifiedShuffleSplit()
        y = check_array(stratify, ensure_2d=False, dtype=None)
        if y.ndim == 2:
            # for multi-label y, map each distinct row to a string repr
            # using join because str(row) uses an ellipsis if len(row) > 1000
            y = np.array([" ".join(row.astype("str")) for row in y])

        classes, y_indices = np.unique(y, return_inverse=True)
        n_classes = classes.shape[0]

        class_counts = np.bincount(y_indices)

        # Find the sorted list of instances for each class:
        # (np.unique above performs a sort, so code is O(n logn) already)
        class_indices = np.split(
            np.argsort(y_indices, kind="mergesort"), np.cumsum(class_counts)[:-1]
        )

        n_i = _approximate_mode(class_counts, max_n_samples, random_state)

        indices = []

        for i in range(n_classes):
            indices_i = random_state.choice(class_indices[i], n_i[i], replace=replace)
            indices.extend(indices_i)

        indices = random_state.permutation(indices)

    # convert sparse matrices to CSR for row-based indexing
    arrays = [a.tocsr() if issparse(a) else a for a in arrays]
    resampled_arrays = [_safe_indexing(a, indices) for a in arrays]
    if len(resampled_arrays) == 1:
        # syntactic sugar for the unit argument case
        return resampled_arrays[0]
    else:
        return resampled_arrays


def shuffle(*arrays, random_state=None, n_samples=None):
    """Shuffle arrays or sparse matrices in a consistent way.

    This is a convenience alias to ``resample(*arrays, replace=False)`` to do
    random permutations of the collections.

    Parameters
    ----------
    *arrays : sequence of indexable data-structures
        Indexable data-structures can be arrays, lists, dataframes or scipy
        sparse matrices with consistent first dimension.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for shuffling
        the data.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    n_samples : int, default=None
        Number of samples to generate. If left to None this is
        automatically set to the first dimension of the arrays. It should
        not be larger than the length of arrays.

    Returns
    -------
    shuffled_arrays : sequence of indexable data-structures
        Sequence of shuffled copies of the collections. The original arrays
        are not impacted.

    See Also
    --------
    resample : Resample arrays or sparse matrices in a consistent way.

    Examples
    --------
    It is possible to mix sparse and dense arrays in the same run::

      >>> import numpy as np
      >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
      >>> y = np.array([0, 1, 2])

      >>> from scipy.sparse import coo_matrix
      >>> X_sparse = coo_matrix(X)

      >>> from sklearn.utils import shuffle
      >>> X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0)
      >>> X
      array([[0., 0.],
             [2., 1.],
             [1., 0.]])

      >>> X_sparse
      <3x2 sparse matrix of type '<... 'numpy.float64'>'
          with 3 stored elements in Compressed Sparse Row format>

      >>> X_sparse.toarray()
      array([[0., 0.],
             [2., 1.],
             [1., 0.]])

      >>> y
      array([2, 1, 0])

      >>> shuffle(y, n_samples=2, random_state=0)
      array([0, 1])
    """
    return resample(
        *arrays, replace=False, n_samples=n_samples, random_state=random_state
    )


def safe_sqr(X, *, copy=True):
    """Element-wise squaring of array-likes and sparse matrices.

    Parameters
    ----------
    X : {array-like, ndarray, sparse matrix}
        The input to square element-wise.

    copy : bool, default=True
        Whether to create a copy of X and operate on it or to perform
        inplace computation (default behaviour).

    Returns
    -------
    X ** 2 : element-wise square
        Return the element-wise square of the input.
    """
    X = check_array(X, accept_sparse=["csr", "csc", "coo"], ensure_2d=False)
    if issparse(X):
        if copy:
            X = X.copy()
        X.data **= 2
    else:
        if copy:
            X = X**2
        else:
            X **= 2
    return X
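
# Illustrative sketch (hypothetical values): squaring is element-wise for
# dense inputs and is applied to the `.data` buffer of sparse inputs.
#
#   >>> safe_sqr(np.array([-2, 3]))
#   array([4, 9])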


def _chunk_generator(gen, chunksize):
    """Chunk generator ``gen`` into lists of length ``chunksize``. The last
    chunk may have a length less than ``chunksize``."""
    while True:
        chunk = list(islice(gen, chunksize))
        if chunk:
            yield chunk
        else:
            return

796 

797@validate_params( 

798 { 

799 "n": [Interval(numbers.Integral, 1, None, closed="left")], 

800 "batch_size": [Interval(numbers.Integral, 1, None, closed="left")], 

801 "min_batch_size": [Interval(numbers.Integral, 0, None, closed="left")], 

802 }, 

803 prefer_skip_nested_validation=True, 

804) 

805def gen_batches(n, batch_size, *, min_batch_size=0): 

806 """Generator to create slices containing `batch_size` elements from 0 to `n`. 

807 

808 The last slice may contain less than `batch_size` elements, when 

809 `batch_size` does not divide `n`. 

810 

811 Parameters 

812 ---------- 

813 n : int 

814 Size of the sequence. 

815 batch_size : int 

816 Number of elements in each batch. 

817 min_batch_size : int, default=0 

818 Minimum number of elements in each batch. 

819 

820 Yields 

821 ------ 

822 slice of `batch_size` elements 

823 

824 See Also 

825 -------- 

826 gen_even_slices: Generator to create n_packs slices going up to n. 

827 

828 Examples 

829 -------- 

830 >>> from sklearn.utils import gen_batches 

831 >>> list(gen_batches(7, 3)) 

832 [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)] 

833 >>> list(gen_batches(6, 3)) 

834 [slice(0, 3, None), slice(3, 6, None)] 

835 >>> list(gen_batches(2, 3)) 

836 [slice(0, 2, None)] 

837 >>> list(gen_batches(7, 3, min_batch_size=0)) 

838 [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)] 

839 >>> list(gen_batches(7, 3, min_batch_size=2)) 

840 [slice(0, 3, None), slice(3, 7, None)] 

841 """ 

842 start = 0 

843 for _ in range(int(n // batch_size)): 

844 end = start + batch_size 

845 if end + min_batch_size > n: 

846 continue 

847 yield slice(start, end) 

848 start = end 

849 if start < n: 

850 yield slice(start, n) 


@validate_params(
    {
        "n": [Interval(Integral, 1, None, closed="left")],
        "n_packs": [Interval(Integral, 1, None, closed="left")],
        "n_samples": [Interval(Integral, 1, None, closed="left"), None],
    },
    prefer_skip_nested_validation=True,
)
def gen_even_slices(n, n_packs, *, n_samples=None):
    """Generator to create `n_packs` evenly spaced slices going up to `n`.

    If `n_packs` does not divide `n`, the first `n % n_packs` slices each
    contain one more element than the remaining slices.

    Parameters
    ----------
    n : int
        Size of the sequence.
    n_packs : int
        Number of slices to generate.
    n_samples : int, default=None
        Number of samples. Pass `n_samples` when the slices are to be used for
        sparse matrix indexing; slicing off-the-end raises an exception, while
        it works for NumPy arrays.

    Yields
    ------
    `slice` representing a set of indices from 0 to n.

    See Also
    --------
    gen_batches: Generator to create slices containing batch_size elements
        from 0 to n.

    Examples
    --------
    >>> from sklearn.utils import gen_even_slices
    >>> list(gen_even_slices(10, 1))
    [slice(0, 10, None)]
    >>> list(gen_even_slices(10, 10))
    [slice(0, 1, None), slice(1, 2, None), ..., slice(9, 10, None)]
    >>> list(gen_even_slices(10, 5))
    [slice(0, 2, None), slice(2, 4, None), ..., slice(8, 10, None)]
    >>> list(gen_even_slices(10, 3))
    [slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)]
    """
    start = 0
    for pack_num in range(n_packs):
        this_n = n // n_packs
        if pack_num < n % n_packs:
            this_n += 1
        if this_n > 0:
            end = start + this_n
            if n_samples is not None:
                end = min(n_samples, end)
            yield slice(start, end, None)
            start = end


def tosequence(x):
    """Cast iterable x to a Sequence, avoiding a copy if possible.

    Parameters
    ----------
    x : iterable
        The iterable to be converted.

    Returns
    -------
    x : Sequence
        If `x` is a NumPy array, it is returned as an `ndarray`. If `x`
        is a `Sequence`, `x` is returned as-is. If `x` is of any other
        type, `x` is returned cast to a list.
    """
    if isinstance(x, np.ndarray):
        return np.asarray(x)
    elif isinstance(x, Sequence):
        return x
    else:
        return list(x)
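
# Illustrative sketch (hypothetical values): a generator is materialized into
# a list, while NumPy arrays and existing sequences pass through uncopied.
#
#   >>> tosequence(x * x for x in range(3))
#   [0, 1, 4]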


def _to_object_array(sequence):
    """Convert sequence to a 1-D NumPy array of object dtype.

    The numpy.array constructor has a similar use, but its output
    is ambiguous: it can be a 1-D NumPy array of object dtype if
    the input is a ragged array, but if the input is a list of
    equal-length arrays, then the output is a 2-D numpy.array.
    _to_object_array resolves this ambiguity by guaranteeing that
    the output is a 1-D NumPy array of objects for any input.

    Parameters
    ----------
    sequence : array-like of shape (n_elements,)
        The sequence to be converted.

    Returns
    -------
    out : ndarray of shape (n_elements,), dtype=object
        The sequence converted into a 1-D NumPy array of object dtype.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils import _to_object_array
    >>> _to_object_array([np.array([0]), np.array([1])])
    array([array([0]), array([1])], dtype=object)
    >>> _to_object_array([np.array([0]), np.array([1, 2])])
    array([array([0]), array([1, 2])], dtype=object)
    """
    out = np.empty(len(sequence), dtype=object)
    out[:] = sequence
    return out


def indices_to_mask(indices, mask_length):
    """Convert a list of indices to a boolean mask.

    Parameters
    ----------
    indices : list-like
        List of integers treated as indices.
    mask_length : int
        Length of the boolean mask to be generated.
        This parameter must be greater than max(indices).

    Returns
    -------
    mask : 1d boolean nd-array
        Boolean array that is True where indices are present, else False.

    Examples
    --------
    >>> from sklearn.utils import indices_to_mask
    >>> indices = [1, 2, 3, 4]
    >>> indices_to_mask(indices, 5)
    array([False,  True,  True,  True,  True])
    """
    if mask_length <= np.max(indices):
        raise ValueError("mask_length must be greater than max(indices)")

    mask = np.zeros(mask_length, dtype=bool)
    mask[indices] = True

    return mask


def _message_with_time(source, message, time):
    """Create a one-line message for logging purposes.

    Parameters
    ----------
    source : str
        String indicating the source or the reference of the message.

    message : str
        Short message.

    time : int
        Time in seconds.
    """
    start_message = "[%s] " % source

    # adapted from joblib.logger.short_format_time without the Windows -.1s
    # adjustment
    if time > 60:
        time_str = "%4.1fmin" % (time / 60)
    else:
        time_str = " %5.1fs" % time
    end_message = " %s, total=%s" % (message, time_str)
    dots_len = 70 - len(start_message) - len(end_message)
    return "%s%s%s" % (start_message, dots_len * ".", end_message)
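
# Illustrative sketch (hypothetical values): the message is padded with dots
# so that the formatted line is always 70 characters wide.
#
#   >>> msg = _message_with_time("Pipeline", "done", 1.0)
#   >>> len(msg), msg.endswith("done, total=   1.0s")
#   (70, True)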


@contextmanager
def _print_elapsed_time(source, message=None):
    """Log elapsed time to stdout when the context is exited.

    Parameters
    ----------
    source : str
        String indicating the source or the reference of the message.

    message : str, default=None
        Short message. If None, nothing will be printed.

    Returns
    -------
    context_manager
        Prints elapsed time upon exit if verbose.
    """
    if message is None:
        yield
    else:
        start = timeit.default_timer()
        yield
        print(_message_with_time(source, message, timeit.default_timer() - start))


def get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None):
    """Calculate how many rows can be processed within `working_memory`.

    Parameters
    ----------
    row_bytes : int
        The expected number of bytes of memory that will be consumed
        during the processing of each row.
    max_n_rows : int, default=None
        The maximum return value.
    working_memory : int or float, default=None
        The number of rows to fit inside this number of MiB will be
        returned. When None (default), the value of
        ``sklearn.get_config()['working_memory']`` is used.

    Returns
    -------
    int
        The number of rows which can be processed within `working_memory`.

    Warns
    -----
    Issues a UserWarning if ``row_bytes`` exceeds ``working_memory`` MiB.
    """

    if working_memory is None:
        working_memory = get_config()["working_memory"]

    chunk_n_rows = int(working_memory * (2**20) // row_bytes)
    if max_n_rows is not None:
        chunk_n_rows = min(chunk_n_rows, max_n_rows)
    if chunk_n_rows < 1:
        warnings.warn(
            "Could not adhere to working_memory config. "
            "Currently %.0fMiB, %.0fMiB required."
            % (working_memory, np.ceil(row_bytes * 2**-20))
        )
        chunk_n_rows = 1
    return chunk_n_rows
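
# Illustrative sketch (hypothetical values): with 1 MiB of working memory and
# rows of 1024 bytes each, 1024 rows fit in one chunk.
#
#   >>> get_chunk_n_rows(row_bytes=1024, working_memory=1)
#   1024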


def _is_pandas_na(x):
    """Test if x is pandas.NA.

    We intentionally do not use this function to return `True` for `pd.NA` in
    `is_scalar_nan`, because estimators that support `pd.NA` are the exception
    rather than the rule at the moment. When `pd.NA` is more universally
    supported, we may reconsider this decision.

    Parameters
    ----------
    x : any type

    Returns
    -------
    boolean
    """
    with suppress(ImportError):
        from pandas import NA

        return x is NA

    return False


def is_scalar_nan(x):
    """Test if x is NaN.

    This function is meant to overcome the issue that np.isnan does not allow
    non-numerical types as input, and that np.nan is not float('nan').

    Parameters
    ----------
    x : any type
        Any scalar value.

    Returns
    -------
    bool
        Returns True if x is NaN, and False otherwise.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils import is_scalar_nan
    >>> is_scalar_nan(np.nan)
    True
    >>> is_scalar_nan(float("nan"))
    True
    >>> is_scalar_nan(None)
    False
    >>> is_scalar_nan("")
    False
    >>> is_scalar_nan([np.nan])
    False
    """
    return (
        not isinstance(x, numbers.Integral)
        and isinstance(x, numbers.Real)
        and math.isnan(x)
    )


def _approximate_mode(class_counts, n_draws, rng):
    """Compute the approximate mode of a multivariate hypergeometric.

    This is an approximation to the mode of the multivariate
    hypergeometric given by class_counts and n_draws.
    It shouldn't be off by more than one.

    It is the most likely outcome of drawing n_draws many
    samples from the population given by class_counts.

    Parameters
    ----------
    class_counts : ndarray of int
        Population per class.
    n_draws : int
        Number of draws (samples to draw) from the overall population.
    rng : random state
        Used to break ties.

    Returns
    -------
    sampled_classes : ndarray of int
        Number of samples drawn from each class.
        np.sum(sampled_classes) == n_draws

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils import _approximate_mode
    >>> _approximate_mode(class_counts=np.array([4, 2]), n_draws=3, rng=0)
    array([2, 1])
    >>> _approximate_mode(class_counts=np.array([5, 2]), n_draws=4, rng=0)
    array([3, 1])
    >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
    ...                   n_draws=2, rng=0)
    array([0, 1, 1, 0])
    >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
    ...                   n_draws=2, rng=42)
    array([1, 1, 0, 0])
    """
    rng = check_random_state(rng)
    # this computes a bad approximation to the mode of the
    # multivariate hypergeometric given by class_counts and n_draws
    continuous = class_counts / class_counts.sum() * n_draws
    # floored means we don't overshoot n_samples, but probably undershoot
    floored = np.floor(continuous)
    # we add samples according to how much "left over" probability
    # they had, until we arrive at n_samples
    need_to_add = int(n_draws - floored.sum())
    if need_to_add > 0:
        remainder = continuous - floored
        values = np.sort(np.unique(remainder))[::-1]
        # add according to remainder, but break ties
        # randomly to avoid biases
        for value in values:
            (inds,) = np.where(remainder == value)
            # if we need_to_add less than what's in inds
            # we draw randomly from them.
            # if we need to add more, we add them all and
            # go to the next value
            add_now = min(len(inds), need_to_add)
            inds = rng.choice(inds, size=add_now, replace=False)
            floored[inds] += 1
            need_to_add -= add_now
            if need_to_add == 0:
                break
    return floored.astype(int)


def check_matplotlib_support(caller_name):
    """Raise ImportError with detailed error message if mpl is not installed.

    Plot utilities like any of the Display's plotting functions should lazily import
    matplotlib and call this helper before any computation.

    Parameters
    ----------
    caller_name : str
        The name of the caller that requires matplotlib.
    """
    try:
        import matplotlib  # noqa
    except ImportError as e:
        raise ImportError(
            "{} requires matplotlib. You can install matplotlib with "
            "`pip install matplotlib`".format(caller_name)
        ) from e


def check_pandas_support(caller_name):
    """Raise ImportError with detailed error message if pandas is not installed.

    Utilities like :func:`fetch_openml` should lazily import
    pandas and call this helper before any computation.

    Parameters
    ----------
    caller_name : str
        The name of the caller that requires pandas.

    Returns
    -------
    pandas
        The pandas package.
    """
    try:
        import pandas  # noqa

        return pandas
    except ImportError as e:
        raise ImportError("{} requires pandas.".format(caller_name)) from e