1"""
2The :mod:`sklearn.utils` module includes various utilities.
3"""
4
5import math
6import numbers
7import platform
8import struct
9import timeit
10import warnings
11from collections.abc import Sequence
12from contextlib import contextmanager, suppress
13from itertools import compress, islice
14
15import numpy as np
16from scipy.sparse import issparse
17
18from .. import get_config
19from ..exceptions import DataConversionWarning
20from . import _joblib, metadata_routing
21from ._bunch import Bunch
22from ._estimator_html_repr import estimator_html_repr
23from ._param_validation import Integral, Interval, validate_params
24from .class_weight import compute_class_weight, compute_sample_weight
25from .deprecation import deprecated
26from .discovery import all_estimators
27from .fixes import parse_version, threadpool_info
28from .murmurhash import murmurhash3_32
29from .validation import (
30 _is_arraylike_not_scalar,
31 _is_pandas_df,
32 _is_polars_df,
33 _use_interchange_protocol,
34 as_float_array,
35 assert_all_finite,
36 check_array,
37 check_consistent_length,
38 check_random_state,
39 check_scalar,
40 check_symmetric,
41 check_X_y,
42 column_or_1d,
43 indexable,
44)
45
# Do not deprecate parallel_backend and register_parallel_backend as they are
# needed to tune `scikit-learn` behavior and have a different effect when
# called from the vendored version than from the site-package version. The
# others are utilities that are independent of scikit-learn, so they are not
# part of the scikit-learn public API.
parallel_backend = _joblib.parallel_backend
register_parallel_backend = _joblib.register_parallel_backend

__all__ = [
    "murmurhash3_32",
    "as_float_array",
    "assert_all_finite",
    "check_array",
    "check_random_state",
    "compute_class_weight",
    "compute_sample_weight",
    "column_or_1d",
    "check_consistent_length",
    "check_X_y",
    "check_scalar",
    "indexable",
    "check_symmetric",
    "indices_to_mask",
    "deprecated",
    "parallel_backend",
    "register_parallel_backend",
    "resample",
    "shuffle",
    "check_matplotlib_support",
    "all_estimators",
    "DataConversionWarning",
    "estimator_html_repr",
    "Bunch",
    "metadata_routing",
]

IS_PYPY = platform.python_implementation() == "PyPy"
_IS_32BIT = 8 * struct.calcsize("P") == 32
_IS_WASM = platform.machine() in ["wasm32", "wasm64"]


def _in_unstable_openblas_configuration():
    """Return True if in an unstable configuration for OpenBLAS"""

    # Import libraries which might load OpenBLAS.
    import numpy  # noqa
    import scipy  # noqa

    modules_info = threadpool_info()

    open_blas_used = any(info["internal_api"] == "openblas" for info in modules_info)
    if not open_blas_used:
        return False

    # OpenBLAS 0.3.16 fixed instability for arm64, see:
    # https://github.com/xianyi/OpenBLAS/blob/1b6db3dbba672b4f8af935bd43a1ff6cff4d20b7/Changelog.txt#L56-L58 # noqa
    openblas_arm64_stable_version = parse_version("0.3.16")
    for info in modules_info:
        if info["internal_api"] != "openblas":
            continue
        openblas_version = info.get("version")
        openblas_architecture = info.get("architecture")
        if openblas_version is None or openblas_architecture is None:
            # Cannot be sure that OpenBLAS is good enough. Assume unstable:
            return True
        if (
            openblas_architecture == "neoversen1"
            and parse_version(openblas_version) < openblas_arm64_stable_version
        ):
            # See discussions in https://github.com/numpy/numpy/issues/19411
            return True
    return False


@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "mask": ["array-like"],
    },
    prefer_skip_nested_validation=True,
)
def safe_mask(X, mask):
    """Return a mask which is safe to use on X.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Data on which to apply mask.

    mask : array-like
        Mask to be used on X.

    Returns
    -------
    mask : ndarray
        Array that is safe to use on X.
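
    Examples
    --------
    A small illustration: a boolean mask is converted to integer indices
    so that it can safely index a sparse matrix.

    >>> from scipy.sparse import csr_matrix
    >>> from sklearn.utils import safe_mask
    >>> data = csr_matrix([[1], [2], [3], [4], [5]])
    >>> condition = [False, True, True, False, True]
    >>> mask = safe_mask(data, condition)
    >>> data[mask].toarray()
    array([[2],
           [3],
           [5]])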
142 """
143 mask = np.asarray(mask)
144 if np.issubdtype(mask.dtype, np.signedinteger):
145 return mask
146
147 if hasattr(X, "toarray"):
148 ind = np.arange(mask.shape[0])
149 mask = ind[mask]
150 return mask
151
152
153def axis0_safe_slice(X, mask, len_mask):
154 """Return a mask which is safer to use on X than safe_mask.
155
156 This mask is safer than safe_mask since it returns an
157 empty array, when a sparse matrix is sliced with a boolean mask
158 with all False, instead of raising an unhelpful error in older
159 versions of SciPy.
160
161 See: https://github.com/scipy/scipy/issues/5361
162
163 Also note that we can avoid doing the dot product by checking if
164 the len_mask is not zero in _huber_loss_and_gradient but this
165 is not going to be the bottleneck, since the number of outliers
166 and non_outliers are typically non-zero and it makes the code
167 tougher to follow.
168
169 Parameters
170 ----------
171 X : {array-like, sparse matrix}
172 Data on which to apply mask.
173
174 mask : ndarray
175 Mask to be used on X.
176
177 len_mask : int
178 The length of the mask.
179
180 Returns
181 -------
182 mask : ndarray
183 Array that is safe to use on X.
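
    Examples
    --------
    A minimal sketch on a dense array (any non-zero `len_mask` selects
    the masked rows):

    >>> import numpy as np
    >>> from sklearn.utils import axis0_safe_slice
    >>> X = np.array([[1., 2.], [3., 4.], [5., 6.]])
    >>> mask = np.array([True, False, True])
    >>> axis0_safe_slice(X, mask, 3)
    array([[1., 2.],
           [5., 6.]])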
184 """
185 if len_mask != 0:
186 return X[safe_mask(X, mask), :]
187 return np.zeros(shape=(0, X.shape[1]))
188
189
190def _array_indexing(array, key, key_dtype, axis):
191 """Index an array or scipy.sparse consistently across NumPy version."""
    if issparse(array) and key_dtype == "bool":
        key = np.asarray(key)
    if isinstance(key, tuple):
        key = list(key)
    return array[key, ...] if axis == 0 else array[:, key]


def _pandas_indexing(X, key, key_dtype, axis):
    """Index a pandas dataframe or a series."""
    if _is_arraylike_not_scalar(key):
        key = np.asarray(key)

    if key_dtype == "int" and not (isinstance(key, slice) or np.isscalar(key)):
        # using take() instead of iloc[] ensures the return value is a "proper"
        # copy that will not raise SettingWithCopyWarning
        return X.take(key, axis=axis)
    else:
        # check whether we should index with loc or iloc
        indexer = X.iloc if key_dtype == "int" else X.loc
        return indexer[:, key] if axis else indexer[key]


def _list_indexing(X, key, key_dtype):
215 """Index a Python list."""
    if np.isscalar(key) or isinstance(key, slice):
        # key is a slice or a scalar
        return X[key]
    if key_dtype == "bool":
        # key is a boolean array-like
        return list(compress(X, key))
    # key is an integer array-like
    return [X[idx] for idx in key]


def _polars_indexing(X, key, key_dtype, axis):
    """Index X with the polars interchange protocol."""
    # Polars behavior is more consistent with lists
    if isinstance(key, np.ndarray):
        key = key.tolist()

    if axis == 1:
        return X[:, key]
    else:
        return X[key]


def _determine_key_type(key, accept_slice=True):
    """Determine the data type of key.

    Parameters
    ----------
    key : scalar, slice or array-like
        The key from which we want to infer the data type.

    accept_slice : bool, default=True
        Whether to accept `key` being a slice. If False, a TypeError is
        raised when `key` is a slice.

    Returns
    -------
    dtype : {'int', 'str', 'bool', None}
        Returns the data type of key.
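
    Examples
    --------
    A few illustrative calls:

    >>> from sklearn.utils import _determine_key_type
    >>> _determine_key_type(0)
    'int'
    >>> _determine_key_type("a")
    'str'
    >>> _determine_key_type([True, False, True])
    'bool'
    >>> _determine_key_type(slice(0, 2))
    'int'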
253 """
254 err_msg = (
255 "No valid specification of the columns. Only a scalar, list or "
256 "slice of all integers or all strings, or boolean mask is "
257 "allowed"
258 )
259
260 dtype_to_str = {int: "int", str: "str", bool: "bool", np.bool_: "bool"}
261 array_dtype_to_str = {
262 "i": "int",
263 "u": "int",
264 "b": "bool",
265 "O": "str",
266 "U": "str",
267 "S": "str",
268 }
269
270 if key is None:
271 return None
272 if isinstance(key, tuple(dtype_to_str.keys())):
273 try:
274 return dtype_to_str[type(key)]
275 except KeyError:
276 raise ValueError(err_msg)
277 if isinstance(key, slice):
278 if not accept_slice:
279 raise TypeError(
280 "Only array-like or scalar are supported. A Python slice was given."
281 )
282 if key.start is None and key.stop is None:
283 return None
284 key_start_type = _determine_key_type(key.start)
285 key_stop_type = _determine_key_type(key.stop)
286 if key_start_type is not None and key_stop_type is not None:
287 if key_start_type != key_stop_type:
288 raise ValueError(err_msg)
289 if key_start_type is not None:
290 return key_start_type
291 return key_stop_type
292 if isinstance(key, (list, tuple)):
293 unique_key = set(key)
294 key_type = {_determine_key_type(elt) for elt in unique_key}
295 if not key_type:
296 return None
297 if len(key_type) != 1:
298 raise ValueError(err_msg)
299 return key_type.pop()
300 if hasattr(key, "dtype"):
301 try:
302 return array_dtype_to_str[key.dtype.kind]
303 except KeyError:
304 raise ValueError(err_msg)
305 raise ValueError(err_msg)
306
307
def _safe_indexing(X, indices, *, axis=0):
    """Return rows, items or columns of X using indices.

    .. warning::

        This utility is documented, but **private**. This means that
        backward compatibility might be broken without any deprecation
        cycle.

    Parameters
    ----------
    X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series
        Data from which to sample rows, items or columns. `list` is only
        supported when `axis=0`.
    indices : bool, int, str, slice, array-like
        - If `axis=0`, boolean and integer array-like, integer slice,
          and scalar integer are supported.
        - If `axis=1`:
            - to select a single column, `indices` can be of `int` type for
              all `X` types and `str` only for dataframe. The selected subset
              will be 1D, unless `X` is a sparse matrix in which case it will
              be 2D.
            - to select multiple columns, `indices` can be one of the
              following: `list`, `array`, `slice`. The type used in
              these containers can be one of the following: `int`, `bool` and
              `str`. However, `str` is only supported when `X` is a dataframe.
              The selected subset will be 2D.
    axis : int, default=0
        The axis along which `X` will be subsampled. `axis=0` will select
        rows while `axis=1` will select columns.

    Returns
    -------
    subset
        Subset of X on axis 0 or 1.

    Notes
    -----
    CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are
    not supported.
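
    Examples
    --------
    A short illustration on a NumPy array:

    >>> import numpy as np
    >>> from sklearn.utils import _safe_indexing
    >>> data = np.array([[1, 2], [3, 4], [5, 6]])
    >>> _safe_indexing(data, 0, axis=0)  # select the first row
    array([1, 2])
    >>> _safe_indexing(data, 0, axis=1)  # select the first column
    array([1, 3, 5])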
348 """
349 if indices is None:
350 return X
351
352 if axis not in (0, 1):
        raise ValueError(
            "'axis' should be either 0 (to index rows) or 1 (to index "
            "columns). Got {} instead.".format(axis)
        )

    indices_dtype = _determine_key_type(indices)

    if axis == 0 and indices_dtype == "str":
        raise ValueError("String indexing is not supported with 'axis=0'")

    if axis == 1 and hasattr(X, "ndim") and X.ndim != 2:
        raise ValueError(
            "'X' should be a 2D NumPy array, 2D sparse matrix or pandas "
            "dataframe when indexing the columns (i.e. 'axis=1'). "
            "Got {} instead with {} dimension(s).".format(type(X), X.ndim)
        )

    if (
        axis == 1
        and indices_dtype == "str"
        and not (_is_pandas_df(X) or _use_interchange_protocol(X))
    ):
        raise ValueError(
            "Specifying the columns using strings is only supported for dataframes."
        )

    if hasattr(X, "iloc"):
        # TODO: we should probably use _is_pandas_df(X) instead but this would
        # require updating some tests such as test_train_test_split_mock_pandas.
        return _pandas_indexing(X, indices, indices_dtype, axis=axis)
    elif _is_polars_df(X):
        return _polars_indexing(X, indices, indices_dtype, axis=axis)
    elif hasattr(X, "shape"):
        return _array_indexing(X, indices, indices_dtype, axis=axis)
    else:
        return _list_indexing(X, indices, indices_dtype)


def _safe_assign(X, values, *, row_indexer=None, column_indexer=None):
    """Safe assignment to a numpy array, sparse matrix, or pandas dataframe.

    Parameters
    ----------
    X : {ndarray, sparse-matrix, dataframe}
        Array to be modified. It is expected to be 2-dimensional.

    values : ndarray
        The values to be assigned to `X`.

    row_indexer : array-like, dtype={int, bool}, default=None
        A 1-dimensional array to select the rows of interest. If `None`, all
        rows are selected.

    column_indexer : array-like, dtype={int, bool}, default=None
        A 1-dimensional array to select the columns of interest. If `None`, all
        columns are selected.
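
    Examples
    --------
    A minimal sketch on a NumPy array, assigning one row in place:

    >>> import numpy as np
    >>> from sklearn.utils import _safe_assign
    >>> X = np.zeros((3, 2))
    >>> _safe_assign(X, np.array([[10., 20.]]), row_indexer=[1])
    >>> X
    array([[ 0.,  0.],
           [10., 20.],
           [ 0.,  0.]])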
409 """
410 row_indexer = slice(None, None, None) if row_indexer is None else row_indexer
411 column_indexer = (
412 slice(None, None, None) if column_indexer is None else column_indexer
413 )
414
415 if hasattr(X, "iloc"): # pandas dataframe
416 with warnings.catch_warnings():
417 # pandas >= 1.5 raises a warning when using iloc to set values in a column
418 # that does not have the same type as the column being set. It happens
419 # for instance when setting a categorical column with a string.
420 # In the future the behavior won't change and the warning should disappear.
421 # TODO(1.3): check if the warning is still raised or remove the filter.
422 warnings.simplefilter("ignore", FutureWarning)
423 X.iloc[row_indexer, column_indexer] = values
424 else: # numpy array or sparse matrix
425 X[row_indexer, column_indexer] = values
426
427
428def _get_column_indices_for_bool_or_int(key, n_columns):
429 # Convert key into list of positive integer indexes
430 try:
431 idx = _safe_indexing(np.arange(n_columns), key)
432 except IndexError as e:
433 raise ValueError(
434 f"all features must be in [0, {n_columns - 1}] or [-{n_columns}, 0]"
435 ) from e
436 return np.atleast_1d(idx).tolist()
437
438
439def _get_column_indices(X, key):
440 """Get feature column indices for input data X and key.
441
442 For accepted values of `key`, see the docstring of
443 :func:`_safe_indexing`.
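
    Examples
    --------
    A short illustration, assuming pandas is installed:

    >>> import pandas as pd
    >>> from sklearn.utils import _get_column_indices
    >>> X = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
    >>> _get_column_indices(X, key=["a", "c"])
    [0, 2]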
444 """
445 key_dtype = _determine_key_type(key)
446 if _use_interchange_protocol(X):
447 return _get_column_indices_interchange(X.__dataframe__(), key, key_dtype)
448
449 n_columns = X.shape[1]
450 if isinstance(key, (list, tuple)) and not key:
451 # we get an empty list
452 return []
453 elif key_dtype in ("bool", "int"):
454 return _get_column_indices_for_bool_or_int(key, n_columns)
455 else:
456 try:
457 all_columns = X.columns
458 except AttributeError:
459 raise ValueError(
460 "Specifying the columns using strings is only supported for dataframes."
461 )
462 if isinstance(key, str):
463 columns = [key]
464 elif isinstance(key, slice):
465 start, stop = key.start, key.stop
466 if start is not None:
467 start = all_columns.get_loc(start)
468 if stop is not None:
469 # pandas indexing with strings is endpoint included
470 stop = all_columns.get_loc(stop) + 1
471 else:
472 stop = n_columns + 1
473 return list(islice(range(n_columns), start, stop))
474 else:
475 columns = list(key)
476
477 try:
478 column_indices = []
479 for col in columns:
480 col_idx = all_columns.get_loc(col)
481 if not isinstance(col_idx, numbers.Integral):
482 raise ValueError(
483 f"Selected columns, {columns}, are not unique in dataframe"
484 )
485 column_indices.append(col_idx)
486
487 except KeyError as e:
488 raise ValueError("A given column is not a column of the dataframe") from e
489
490 return column_indices
491
492
493def _get_column_indices_interchange(X_interchange, key, key_dtype):
494 """Same as _get_column_indices but for X with __dataframe__ protocol."""
495
496 n_columns = X_interchange.num_columns()
497
498 if isinstance(key, (list, tuple)) and not key:
499 # we get an empty list
500 return []
501 elif key_dtype in ("bool", "int"):
502 return _get_column_indices_for_bool_or_int(key, n_columns)
503 else:
504 column_names = list(X_interchange.column_names())
505
506 if isinstance(key, slice):
507 if key.step not in [1, None]:
508 raise NotImplementedError("key.step must be 1 or None")
509 start, stop = key.start, key.stop
510 if start is not None:
511 start = column_names.index(start)
512
513 if stop is not None:
514 stop = column_names.index(stop) + 1
515 else:
516 stop = n_columns + 1
517 return list(islice(range(n_columns), start, stop))
518
519 selected_columns = [key] if np.isscalar(key) else key
520
521 try:
522 return [column_names.index(col) for col in selected_columns]
523 except ValueError as e:
524 raise ValueError("A given column is not a column of the dataframe") from e
525
526
527@validate_params(
528 {
529 "replace": ["boolean"],
530 "n_samples": [Interval(numbers.Integral, 1, None, closed="left"), None],
531 "random_state": ["random_state"],
532 "stratify": ["array-like", None],
533 },
534 prefer_skip_nested_validation=True,
535)
536def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify=None):
537 """Resample arrays or sparse matrices in a consistent way.
538
539 The default strategy implements one step of the bootstrapping
540 procedure.
541
542 Parameters
543 ----------
544 *arrays : sequence of array-like of shape (n_samples,) or \
545 (n_samples, n_outputs)
546 Indexable data-structures can be arrays, lists, dataframes or scipy
547 sparse matrices with consistent first dimension.
548
549 replace : bool, default=True
550 Implements resampling with replacement. If False, this will implement
551 (sliced) random permutations.
552
553 n_samples : int, default=None
554 Number of samples to generate. If left to None this is
555 automatically set to the first dimension of the arrays.
556 If replace is False it should not be larger than the length of
557 arrays.
558
559 random_state : int, RandomState instance or None, default=None
560 Determines random number generation for shuffling
561 the data.
562 Pass an int for reproducible results across multiple function calls.
563 See :term:`Glossary <random_state>`.
564
565 stratify : array-like of shape (n_samples,) or (n_samples, n_outputs), \
566 default=None
567 If not None, data is split in a stratified fashion, using this as
568 the class labels.
569
570 Returns
571 -------
572 resampled_arrays : sequence of array-like of shape (n_samples,) or \
573 (n_samples, n_outputs)
574 Sequence of resampled copies of the collections. The original arrays
575 are not impacted.
576
577 See Also
578 --------
579 shuffle : Shuffle arrays or sparse matrices in a consistent way.
580
581 Examples
582 --------
583 It is possible to mix sparse and dense arrays in the same run::
584
585 >>> import numpy as np
586 >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
587 >>> y = np.array([0, 1, 2])
588
589 >>> from scipy.sparse import coo_matrix
590 >>> X_sparse = coo_matrix(X)
591
592 >>> from sklearn.utils import resample
593 >>> X, X_sparse, y = resample(X, X_sparse, y, random_state=0)
594 >>> X
595 array([[1., 0.],
596 [2., 1.],
597 [1., 0.]])
598
599 >>> X_sparse
600 <3x2 sparse matrix of type '<... 'numpy.float64'>'
601 with 4 stored elements in Compressed Sparse Row format>
602
603 >>> X_sparse.toarray()
604 array([[1., 0.],
605 [2., 1.],
606 [1., 0.]])
607
608 >>> y
609 array([0, 1, 0])
610
611 >>> resample(y, n_samples=2, random_state=0)
612 array([0, 1])
613
614 Example using stratification::
615
616 >>> y = [0, 0, 1, 1, 1, 1, 1, 1, 1]
617 >>> resample(y, n_samples=5, replace=False, stratify=y,
618 ... random_state=0)
619 [1, 1, 1, 0, 1]
620 """
621 max_n_samples = n_samples
622 random_state = check_random_state(random_state)
623
624 if len(arrays) == 0:
625 return None
626
627 first = arrays[0]
628 n_samples = first.shape[0] if hasattr(first, "shape") else len(first)
629
630 if max_n_samples is None:
631 max_n_samples = n_samples
632 elif (max_n_samples > n_samples) and (not replace):
633 raise ValueError(
634 "Cannot sample %d out of arrays with dim %d when replace is False"
635 % (max_n_samples, n_samples)
636 )
637
638 check_consistent_length(*arrays)
639
640 if stratify is None:
641 if replace:
642 indices = random_state.randint(0, n_samples, size=(max_n_samples,))
643 else:
644 indices = np.arange(n_samples)
645 random_state.shuffle(indices)
646 indices = indices[:max_n_samples]
647 else:
648 # Code adapted from StratifiedShuffleSplit()
649 y = check_array(stratify, ensure_2d=False, dtype=None)
650 if y.ndim == 2:
651 # for multi-label y, map each distinct row to a string repr
652 # using join because str(row) uses an ellipsis if len(row) > 1000
653 y = np.array([" ".join(row.astype("str")) for row in y])
654
655 classes, y_indices = np.unique(y, return_inverse=True)
656 n_classes = classes.shape[0]
657
658 class_counts = np.bincount(y_indices)
659
660 # Find the sorted list of instances for each class:
        # (np.unique above performs a sort, so the code is O(n log n) already)
        class_indices = np.split(
            np.argsort(y_indices, kind="mergesort"), np.cumsum(class_counts)[:-1]
        )

        n_i = _approximate_mode(class_counts, max_n_samples, random_state)

        indices = []

        for i in range(n_classes):
            indices_i = random_state.choice(class_indices[i], n_i[i], replace=replace)
            indices.extend(indices_i)

        indices = random_state.permutation(indices)

    # convert sparse matrices to CSR for row-based indexing
    arrays = [a.tocsr() if issparse(a) else a for a in arrays]
    resampled_arrays = [_safe_indexing(a, indices) for a in arrays]
    if len(resampled_arrays) == 1:
        # syntactic sugar for the unit argument case
        return resampled_arrays[0]
    else:
        return resampled_arrays


def shuffle(*arrays, random_state=None, n_samples=None):
    """Shuffle arrays or sparse matrices in a consistent way.

    This is a convenience alias to ``resample(*arrays, replace=False)`` to do
    random permutations of the collections.

    Parameters
    ----------
    *arrays : sequence of indexable data-structures
        Indexable data-structures can be arrays, lists, dataframes or scipy
        sparse matrices with consistent first dimension.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for shuffling
        the data.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    n_samples : int, default=None
        Number of samples to generate. If left to None this is
        automatically set to the first dimension of the arrays. It should
        not be larger than the length of arrays.

    Returns
    -------
    shuffled_arrays : sequence of indexable data-structures
        Sequence of shuffled copies of the collections. The original arrays
        are not impacted.

    See Also
    --------
    resample : Resample arrays or sparse matrices in a consistent way.

    Examples
    --------
    It is possible to mix sparse and dense arrays in the same run::

      >>> import numpy as np
      >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
      >>> y = np.array([0, 1, 2])

      >>> from scipy.sparse import coo_matrix
      >>> X_sparse = coo_matrix(X)

      >>> from sklearn.utils import shuffle
      >>> X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0)
      >>> X
      array([[0., 0.],
             [2., 1.],
             [1., 0.]])

      >>> X_sparse
      <3x2 sparse matrix of type '<... 'numpy.float64'>'
          with 3 stored elements in Compressed Sparse Row format>

      >>> X_sparse.toarray()
      array([[0., 0.],
             [2., 1.],
             [1., 0.]])

      >>> y
      array([2, 1, 0])

      >>> shuffle(y, n_samples=2, random_state=0)
      array([0, 1])
    """
    return resample(
        *arrays, replace=False, n_samples=n_samples, random_state=random_state
    )


def safe_sqr(X, *, copy=True):
    """Element wise squaring of array-likes and sparse matrices.

    Parameters
    ----------
    X : {array-like, ndarray, sparse matrix}

    copy : bool, default=True
        Whether to create a copy of X and operate on it (the default
        behaviour) or to perform the computation inplace.

    Returns
    -------
    X ** 2 : element wise square
        Return the element-wise square of the input.
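
    Examples
    --------
    A quick illustration on a dense integer array:

    >>> import numpy as np
    >>> from sklearn.utils import safe_sqr
    >>> safe_sqr(np.array([1, 2, 3]))
    array([1, 4, 9])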
772 """
773 X = check_array(X, accept_sparse=["csr", "csc", "coo"], ensure_2d=False)
774 if issparse(X):
775 if copy:
776 X = X.copy()
777 X.data **= 2
778 else:
779 if copy:
780 X = X**2
781 else:
782 X **= 2
783 return X
784
785
786def _chunk_generator(gen, chunksize):
787 """Chunk generator, ``gen`` into lists of length ``chunksize``. The last
788 chunk may have a length less than ``chunksize``."""
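
    Examples
    --------
    A small illustration:

    >>> from sklearn.utils import _chunk_generator
    >>> list(_chunk_generator(iter(range(7)), 3))
    [[0, 1, 2], [3, 4, 5], [6]]
    """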
    while True:
        chunk = list(islice(gen, chunksize))
        if chunk:
            yield chunk
        else:
            return


@validate_params(
    {
        "n": [Interval(numbers.Integral, 1, None, closed="left")],
        "batch_size": [Interval(numbers.Integral, 1, None, closed="left")],
        "min_batch_size": [Interval(numbers.Integral, 0, None, closed="left")],
    },
    prefer_skip_nested_validation=True,
)
def gen_batches(n, batch_size, *, min_batch_size=0):
    """Generator to create slices containing `batch_size` elements from 0 to `n`.

    The last slice may contain fewer than `batch_size` elements, when
    `batch_size` does not divide `n`.

    Parameters
    ----------
    n : int
        Size of the sequence.
    batch_size : int
        Number of elements in each batch.
    min_batch_size : int, default=0
        Minimum number of elements in each batch.

    Yields
    ------
    slice of `batch_size` elements

    See Also
    --------
    gen_even_slices: Generator to create n_packs slices going up to n.

    Examples
    --------
    >>> from sklearn.utils import gen_batches
    >>> list(gen_batches(7, 3))
    [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]
    >>> list(gen_batches(6, 3))
    [slice(0, 3, None), slice(3, 6, None)]
    >>> list(gen_batches(2, 3))
    [slice(0, 2, None)]
    >>> list(gen_batches(7, 3, min_batch_size=0))
    [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]
    >>> list(gen_batches(7, 3, min_batch_size=2))
    [slice(0, 3, None), slice(3, 7, None)]
    """
    start = 0
    for _ in range(int(n // batch_size)):
        end = start + batch_size
        if end + min_batch_size > n:
            continue
        yield slice(start, end)
        start = end
    if start < n:
        yield slice(start, n)


@validate_params(
    {
        "n": [Interval(Integral, 1, None, closed="left")],
        "n_packs": [Interval(Integral, 1, None, closed="left")],
        "n_samples": [Interval(Integral, 1, None, closed="left"), None],
    },
    prefer_skip_nested_validation=True,
)
def gen_even_slices(n, n_packs, *, n_samples=None):
    """Generator to create `n_packs` evenly spaced slices going up to `n`.

    If `n_packs` does not divide `n`, the first `n % n_packs` slices contain
    one more element than the remaining slices.

    Parameters
    ----------
    n : int
        Size of the sequence.
    n_packs : int
        Number of slices to generate.
    n_samples : int, default=None
        Number of samples. Pass `n_samples` when the slices are to be used for
        sparse matrix indexing; slicing off-the-end raises an exception, while
        it works for NumPy arrays.

    Yields
    ------
    `slice` representing a set of indices from 0 to n.

    See Also
    --------
    gen_batches: Generator to create slices containing batch_size elements
        from 0 to n.

    Examples
    --------
    >>> from sklearn.utils import gen_even_slices
    >>> list(gen_even_slices(10, 1))
    [slice(0, 10, None)]
    >>> list(gen_even_slices(10, 10))
    [slice(0, 1, None), slice(1, 2, None), ..., slice(9, 10, None)]
    >>> list(gen_even_slices(10, 5))
    [slice(0, 2, None), slice(2, 4, None), ..., slice(8, 10, None)]
    >>> list(gen_even_slices(10, 3))
    [slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)]
    """
    start = 0
    for pack_num in range(n_packs):
        this_n = n // n_packs
        if pack_num < n % n_packs:
            this_n += 1
        if this_n > 0:
            end = start + this_n
            if n_samples is not None:
                end = min(n_samples, end)
            yield slice(start, end, None)
            start = end


def tosequence(x):
913 """Cast iterable x to a Sequence, avoiding a copy if possible.
914
915 Parameters
916 ----------
917 x : iterable
918 The iterable to be converted.
919
920 Returns
921 -------
922 x : Sequence
923 If `x` is a NumPy array, it returns it as a `ndarray`. If `x`
924 is a `Sequence`, `x` is returned as-is. If `x` is from any other
925 type, `x` is returned casted as a list.
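
    Examples
    --------
    A small sketch: a generator (not a Sequence) is materialized as a list.

    >>> from sklearn.utils import tosequence
    >>> tosequence(x**2 for x in range(3))
    [0, 1, 4]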
926 """
927 if isinstance(x, np.ndarray):
928 return np.asarray(x)
929 elif isinstance(x, Sequence):
930 return x
931 else:
932 return list(x)
933
934
935def _to_object_array(sequence):
936 """Convert sequence to a 1-D NumPy array of object dtype.
937
    The numpy.array constructor has a similar use, but its output
    is ambiguous: it can be a 1-D NumPy array of object dtype if
    the input is a ragged array, but if the input is a list of
    equal length arrays, then the output is a 2D numpy.array.
    _to_object_array solves this ambiguity by guaranteeing that
    the output is a 1-D NumPy array of objects for any input.

    Parameters
    ----------
    sequence : array-like of shape (n_elements,)
        The sequence to be converted.

    Returns
    -------
    out : ndarray of shape (n_elements,), dtype=object
        The converted sequence into a 1-D NumPy array of object dtype.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils import _to_object_array
    >>> _to_object_array([np.array([0]), np.array([1])])
    array([array([0]), array([1])], dtype=object)
    >>> _to_object_array([np.array([0]), np.array([1, 2])])
    array([array([0]), array([1, 2])], dtype=object)
965 """
966 out = np.empty(len(sequence), dtype=object)
967 out[:] = sequence
968 return out
969
970
971def indices_to_mask(indices, mask_length):
972 """Convert list of indices to boolean mask.
973
974 Parameters
975 ----------
976 indices : list-like
977 List of integers treated as indices.
978 mask_length : int
979 Length of boolean mask to be generated.
980 This parameter must be greater than max(indices).
981
982 Returns
983 -------
984 mask : 1d boolean nd-array
985 Boolean array that is True where indices are present, else False.
986
987 Examples
988 --------
989 >>> from sklearn.utils import indices_to_mask
    >>> indices = [1, 2, 3, 4]
    >>> indices_to_mask(indices, 5)
    array([False,  True,  True,  True,  True])
993 """
994 if mask_length <= np.max(indices):
995 raise ValueError("mask_length must be greater than max(indices)")
996
997 mask = np.zeros(mask_length, dtype=bool)
998 mask[indices] = True
999
1000 return mask
1001
1002
1003def _message_with_time(source, message, time):
1004 """Create one line message for logging purposes.
1005
1006 Parameters
1007 ----------
1008 source : str
1009 String indicating the source or the reference of the message.
1010
1011 message : str
1012 Short message.
1013
    time : float
        Time in seconds.
1016 """
1017 start_message = "[%s] " % source
1018
1019 # adapted from joblib.logger.short_format_time without the Windows -.1s
1020 # adjustment
1021 if time > 60:
1022 time_str = "%4.1fmin" % (time / 60)
1023 else:
1024 time_str = " %5.1fs" % time
1025 end_message = " %s, total=%s" % (message, time_str)
1026 dots_len = 70 - len(start_message) - len(end_message)
1027 return "%s%s%s" % (start_message, dots_len * ".", end_message)
1028
1029
1030@contextmanager
1031def _print_elapsed_time(source, message=None):
1032 """Log elapsed time to stdout when the context is exited.
1033
1034 Parameters
1035 ----------
1036 source : str
1037 String indicating the source or the reference of the message.
1038
1039 message : str, default=None
1040 Short message. If None, nothing will be printed.
1041
1042 Returns
1043 -------
1044 context_manager
1045 Prints elapsed time upon exit if verbose.
1046 """
1047 if message is None:
1048 yield
1049 else:
1050 start = timeit.default_timer()
1051 yield
1052 print(_message_with_time(source, message, timeit.default_timer() - start))
1053
1054
1055def get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None):
1056 """Calculate how many rows can be processed within `working_memory`.
1057
1058 Parameters
1059 ----------
1060 row_bytes : int
1061 The expected number of bytes of memory that will be consumed
1062 during the processing of each row.
1063 max_n_rows : int, default=None
1064 The maximum return value.
1065 working_memory : int or float, default=None
1066 The number of rows to fit inside this number of MiB will be
1067 returned. When None (default), the value of
1068 ``sklearn.get_config()['working_memory']`` is used.
1069
1070 Returns
1071 -------
1072 int
1073 The number of rows which can be processed within `working_memory`.
1074
1075 Warns
1076 -----
    Issues a UserWarning if `row_bytes` exceeds `working_memory` MiB.
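
    Examples
    --------
    A quick sketch: with 1 MiB of working memory and 1 KiB per row,
    1024 rows fit in a chunk.

    >>> from sklearn.utils import get_chunk_n_rows
    >>> get_chunk_n_rows(row_bytes=1024, working_memory=1)
    1024
    >>> get_chunk_n_rows(row_bytes=1024, max_n_rows=100, working_memory=1)
    100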
1078 """
1079
1080 if working_memory is None:
1081 working_memory = get_config()["working_memory"]
1082
1083 chunk_n_rows = int(working_memory * (2**20) // row_bytes)
1084 if max_n_rows is not None:
1085 chunk_n_rows = min(chunk_n_rows, max_n_rows)
1086 if chunk_n_rows < 1:
1087 warnings.warn(
1088 "Could not adhere to working_memory config. "
1089 "Currently %.0fMiB, %.0fMiB required."
1090 % (working_memory, np.ceil(row_bytes * 2**-20))
1091 )
1092 chunk_n_rows = 1
1093 return chunk_n_rows
1094
1095
1096def _is_pandas_na(x):
1097 """Test if x is pandas.NA.
1098
1099 We intentionally do not use this function to return `True` for `pd.NA` in
1100 `is_scalar_nan`, because estimators that support `pd.NA` are the exception
1101 rather than the rule at the moment. When `pd.NA` is more universally
1102 supported, we may reconsider this decision.
1103
1104 Parameters
1105 ----------
1106 x : any type
1107
1108 Returns
1109 -------
1110 boolean
1111 """
1112 with suppress(ImportError):
1113 from pandas import NA
1114
1115 return x is NA
1116
1117 return False
1118
1119
1120def is_scalar_nan(x):
1121 """Test if x is NaN.
1122
1123 This function is meant to overcome the issue that np.isnan does not allow
1124 non-numerical types as input, and that np.nan is not float('nan').
1125
1126 Parameters
1127 ----------
1128 x : any type
1129 Any scalar value.
1130
1131 Returns
1132 -------
1133 bool
1134 Returns true if x is NaN, and false otherwise.
1135
1136 Examples
1137 --------
1138 >>> import numpy as np
1139 >>> from sklearn.utils import is_scalar_nan
1140 >>> is_scalar_nan(np.nan)
1141 True
1142 >>> is_scalar_nan(float("nan"))
1143 True
1144 >>> is_scalar_nan(None)
1145 False
1146 >>> is_scalar_nan("")
1147 False
1148 >>> is_scalar_nan([np.nan])
1149 False
1150 """
1151 return (
1152 not isinstance(x, numbers.Integral)
1153 and isinstance(x, numbers.Real)
1154 and math.isnan(x)
1155 )
1156
1157
1158def _approximate_mode(class_counts, n_draws, rng):
1159 """Computes approximate mode of multivariate hypergeometric.
1160
1161 This is an approximation to the mode of the multivariate
1162 hypergeometric given by class_counts and n_draws.
1163 It shouldn't be off by more than one.
1164
    It is the most likely outcome of drawing n_draws samples
    from the population given by class_counts.

    Parameters
    ----------
    class_counts : ndarray of int
        Population per class.
    n_draws : int
        Number of draws (samples to draw) from the overall population.
    rng : random state
        Used to break ties.

    Returns
    -------
    sampled_classes : ndarray of int
        Number of samples drawn from each class.
        np.sum(sampled_classes) == n_draws

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils import _approximate_mode
    >>> _approximate_mode(class_counts=np.array([4, 2]), n_draws=3, rng=0)
    array([2, 1])
    >>> _approximate_mode(class_counts=np.array([5, 2]), n_draws=4, rng=0)
    array([3, 1])
    >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
    ...                   n_draws=2, rng=0)
    array([0, 1, 1, 0])
    >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
    ...                   n_draws=2, rng=42)
    array([1, 1, 0, 0])
    """
    rng = check_random_state(rng)
    # this computes a bad approximation to the mode of the
    # multivariate hypergeometric given by class_counts and n_draws
    continuous = class_counts / class_counts.sum() * n_draws
    # floored means we don't overshoot n_samples, but probably undershoot
    floored = np.floor(continuous)
    # we add samples according to how much "left over" probability
    # they had, until we arrive at n_samples
    need_to_add = int(n_draws - floored.sum())
    if need_to_add > 0:
        remainder = continuous - floored
        values = np.sort(np.unique(remainder))[::-1]
        # add according to remainder, but break ties
        # randomly to avoid biases
        for value in values:
            (inds,) = np.where(remainder == value)
            # if we need to add fewer than what's in inds,
            # we draw randomly from them.
            # if we need to add more, we add them all and
            # go to the next value
            add_now = min(len(inds), need_to_add)
            inds = rng.choice(inds, size=add_now, replace=False)
            floored[inds] += 1
            need_to_add -= add_now
            if need_to_add == 0:
                break
    return floored.astype(int)


def check_matplotlib_support(caller_name):
    """Raise ImportError with detailed error message if mpl is not installed.

    Plot utilities like any of the Display's plotting functions should lazily import
    matplotlib and call this helper before any computation.

    Parameters
    ----------
    caller_name : str
        The name of the caller that requires matplotlib.
    """
    try:
        import matplotlib  # noqa
    except ImportError as e:
        raise ImportError(
            "{} requires matplotlib. You can install matplotlib with "
            "`pip install matplotlib`".format(caller_name)
        ) from e


def check_pandas_support(caller_name):
    """Raise ImportError with detailed error message if pandas is not installed.

    Utilities like :func:`fetch_openml` should lazily import
    pandas and call this helper before any computation.

    Parameters
    ----------
    caller_name : str
        The name of the caller that requires pandas.

    Returns
    -------
    pandas
        The pandas package.
    """
    try:
        import pandas  # noqa

        return pandas
    except ImportError as e:
        raise ImportError("{} requires pandas.".format(caller_name)) from e