1"""
2The :mod:`sklearn.utils` module includes various utilities.
3"""
4
5import math
6import numbers
7import platform
8import struct
9import timeit
10import warnings
11from collections.abc import Sequence
12from contextlib import contextmanager, suppress
13from itertools import compress, islice
14
15import numpy as np
16from scipy.sparse import issparse
17
18from .. import get_config
19from ..exceptions import DataConversionWarning
20from . import _joblib, metadata_routing
21from ._bunch import Bunch
22from ._estimator_html_repr import estimator_html_repr
23from ._param_validation import Integral, Interval, validate_params
24from .class_weight import compute_class_weight, compute_sample_weight
25from .deprecation import deprecated
26from .discovery import all_estimators
27from .fixes import parse_version, threadpool_info
28from .murmurhash import murmurhash3_32
29from .validation import (
30 _is_arraylike_not_scalar,
31 _is_pandas_df,
32 _is_polars_df,
33 _use_interchange_protocol,
34 as_float_array,
35 assert_all_finite,
36 check_array,
37 check_consistent_length,
38 check_random_state,
39 check_scalar,
40 check_symmetric,
41 check_X_y,
42 column_or_1d,
43 indexable,
44)
45
46# Do not deprecate parallel_backend and register_parallel_backend as they are
47# needed to tune `scikit-learn` behavior and have different effect if called
48# from the vendored version or or the site-package version. The other are
49# utilities that are independent of scikit-learn so they are not part of
50# scikit-learn public API.
51parallel_backend = _joblib.parallel_backend
52register_parallel_backend = _joblib.register_parallel_backend
53
54__all__ = [
55 "murmurhash3_32",
56 "as_float_array",
57 "assert_all_finite",
58 "check_array",
59 "check_random_state",
60 "compute_class_weight",
61 "compute_sample_weight",
62 "column_or_1d",
63 "check_consistent_length",
64 "check_X_y",
65 "check_scalar",
66 "indexable",
67 "check_symmetric",
68 "indices_to_mask",
69 "deprecated",
70 "parallel_backend",
71 "register_parallel_backend",
72 "resample",
73 "shuffle",
74 "check_matplotlib_support",
75 "all_estimators",
76 "DataConversionWarning",
77 "estimator_html_repr",
78 "Bunch",
79 "metadata_routing",
80]
81
82IS_PYPY = platform.python_implementation() == "PyPy"
83_IS_32BIT = 8 * struct.calcsize("P") == 32
84_IS_WASM = platform.machine() in ["wasm32", "wasm64"]
85
86
87def _in_unstable_openblas_configuration():
88 """Return True if in an unstable configuration for OpenBLAS"""
89
90 # Import libraries which might load OpenBLAS.
91 import numpy # noqa
92 import scipy # noqa
93
94 modules_info = threadpool_info()
95
96 open_blas_used = any(info["internal_api"] == "openblas" for info in modules_info)
97 if not open_blas_used:
98 return False
99
100 # OpenBLAS 0.3.16 fixed instability for arm64, see:
101 # https://github.com/xianyi/OpenBLAS/blob/1b6db3dbba672b4f8af935bd43a1ff6cff4d20b7/Changelog.txt#L56-L58 # noqa
102 openblas_arm64_stable_version = parse_version("0.3.16")
103 for info in modules_info:
104 if info["internal_api"] != "openblas":
105 continue
106 openblas_version = info.get("version")
107 openblas_architecture = info.get("architecture")
108 if openblas_version is None or openblas_architecture is None:
109 # Cannot be sure that OpenBLAS is good enough. Assume unstable:
110 return True
111 if (
112 openblas_architecture == "neoversen1"
113 and parse_version(openblas_version) < openblas_arm64_stable_version
114 ):
115 # See discussions in https://github.com/numpy/numpy/issues/19411
116 return True
117 return False
118
119
120@validate_params(
121 {
122 "X": ["array-like", "sparse matrix"],
123 "mask": ["array-like"],
124 },
125 prefer_skip_nested_validation=True,
126)
127def safe_mask(X, mask):
128 """Return a mask which is safe to use on X.
129
130 Parameters
131 ----------
132 X : {array-like, sparse matrix}
133 Data on which to apply mask.
134
135 mask : array-like
136 Mask to be used on X.
137
138 Returns
139 -------
140 mask : ndarray
141 Array that is safe to use on X.
142 """
143 mask = np.asarray(mask)
144 if np.issubdtype(mask.dtype, np.signedinteger):
145 return mask
146
147 if hasattr(X, "toarray"):
148 ind = np.arange(mask.shape[0])
149 mask = ind[mask]
150 return mask
151
152
153def axis0_safe_slice(X, mask, len_mask):
154 """Return a mask which is safer to use on X than safe_mask.
155
156 This mask is safer than safe_mask since it returns an
157 empty array, when a sparse matrix is sliced with a boolean mask
158 with all False, instead of raising an unhelpful error in older
159 versions of SciPy.
160
161 See: https://github.com/scipy/scipy/issues/5361
162
163 Also note that we can avoid doing the dot product by checking if
164 the len_mask is not zero in _huber_loss_and_gradient but this
165 is not going to be the bottleneck, since the number of outliers
166 and non_outliers are typically non-zero and it makes the code
167 tougher to follow.
168
169 Parameters
170 ----------
171 X : {array-like, sparse matrix}
172 Data on which to apply mask.
173
174 mask : ndarray
175 Mask to be used on X.
176
177 len_mask : int
178 The length of the mask.
179
180 Returns
181 -------
182 mask : ndarray
183 Array that is safe to use on X.
184 """
185 if len_mask != 0:
186 return X[safe_mask(X, mask), :]
187 return np.zeros(shape=(0, X.shape[1]))
188
189
190def _array_indexing(array, key, key_dtype, axis):
191 """Index an array or scipy.sparse consistently across NumPy version."""
192 if issparse(array) and key_dtype == "bool":
193 key = np.asarray(key)
194 if isinstance(key, tuple):
195 key = list(key)
196 return array[key, ...] if axis == 0 else array[:, key]
197
198
199def _pandas_indexing(X, key, key_dtype, axis):
200 """Index a pandas dataframe or a series."""
201 if _is_arraylike_not_scalar(key):
202 key = np.asarray(key)
203
204 if key_dtype == "int" and not (isinstance(key, slice) or np.isscalar(key)):
205 # using take() instead of iloc[] ensures the return value is a "proper"
206 # copy that will not raise SettingWithCopyWarning
207 return X.take(key, axis=axis)
208 else:
209 # check whether we should index with loc or iloc
210 indexer = X.iloc if key_dtype == "int" else X.loc
211 return indexer[:, key] if axis else indexer[key]
212
213
214def _list_indexing(X, key, key_dtype):
215 """Index a Python list."""
216 if np.isscalar(key) or isinstance(key, slice):
217 # key is a slice or a scalar
218 return X[key]
219 if key_dtype == "bool":
220 # key is a boolean array-like
221 return list(compress(X, key))
222 # key is a integer array-like of key
223 return [X[idx] for idx in key]
224
225
226def _polars_indexing(X, key, key_dtype, axis):
227 """Indexing X with polars interchange protocol."""
228 # Polars behavior is more consistent with lists
229 if isinstance(key, np.ndarray):
230 key = key.tolist()
231
232 if axis == 1:
233 return X[:, key]
234 else:
235 return X[key]
236
237
238def _determine_key_type(key, accept_slice=True):
239 """Determine the data type of key.
240
241 Parameters
242 ----------
243 key : scalar, slice or array-like
244 The key from which we want to infer the data type.
245
246 accept_slice : bool, default=True
247 Whether or not to raise an error if the key is a slice.
248
249 Returns
250 -------
251 dtype : {'int', 'str', 'bool', None}
252 Returns the data type of key.
253 """
254 err_msg = (
255 "No valid specification of the columns. Only a scalar, list or "
256 "slice of all integers or all strings, or boolean mask is "
257 "allowed"
258 )
259
260 dtype_to_str = {int: "int", str: "str", bool: "bool", np.bool_: "bool"}
261 array_dtype_to_str = {
262 "i": "int",
263 "u": "int",
264 "b": "bool",
265 "O": "str",
266 "U": "str",
267 "S": "str",
268 }
269
270 if key is None:
271 return None
272 if isinstance(key, tuple(dtype_to_str.keys())):
273 try:
274 return dtype_to_str[type(key)]
275 except KeyError:
276 raise ValueError(err_msg)
277 if isinstance(key, slice):
278 if not accept_slice:
279 raise TypeError(
280 "Only array-like or scalar are supported. A Python slice was given."
281 )
282 if key.start is None and key.stop is None:
283 return None
284 key_start_type = _determine_key_type(key.start)
285 key_stop_type = _determine_key_type(key.stop)
286 if key_start_type is not None and key_stop_type is not None:
287 if key_start_type != key_stop_type:
288 raise ValueError(err_msg)
289 if key_start_type is not None:
290 return key_start_type
291 return key_stop_type
292 if isinstance(key, (list, tuple)):
293 unique_key = set(key)
294 key_type = {_determine_key_type(elt) for elt in unique_key}
295 if not key_type:
296 return None
297 if len(key_type) != 1:
298 raise ValueError(err_msg)
299 return key_type.pop()
300 if hasattr(key, "dtype"):
301 try:
302 return array_dtype_to_str[key.dtype.kind]
303 except KeyError:
304 raise ValueError(err_msg)
305 raise ValueError(err_msg)
306
307
308def _safe_indexing(X, indices, *, axis=0):
309 """Return rows, items or columns of X using indices.
310
311 .. warning::
312
313 This utility is documented, but **private**. This means that
314 backward compatibility might be broken without any deprecation
315 cycle.
316
317 Parameters
318 ----------
319 X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series
320 Data from which to sample rows, items or columns. `list` are only
321 supported when `axis=0`.
322 indices : bool, int, str, slice, array-like
323 - If `axis=0`, boolean and integer array-like, integer slice,
324 and scalar integer are supported.
325 - If `axis=1`:
326 - to select a single column, `indices` can be of `int` type for
327 all `X` types and `str` only for dataframe. The selected subset
328 will be 1D, unless `X` is a sparse matrix in which case it will
329 be 2D.
330 - to select multiples columns, `indices` can be one of the
331 following: `list`, `array`, `slice`. The type used in
332 these containers can be one of the following: `int`, 'bool' and
333 `str`. However, `str` is only supported when `X` is a dataframe.
334 The selected subset will be 2D.
335 axis : int, default=0
336 The axis along which `X` will be subsampled. `axis=0` will select
337 rows while `axis=1` will select columns.
338
339 Returns
340 -------
341 subset
342 Subset of X on axis 0 or 1.
343
344 Notes
345 -----
346 CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are
347 not supported.
348 """
349 if indices is None:
350 return X
351
352 if axis not in (0, 1):
353 raise ValueError(
354 "'axis' should be either 0 (to index rows) or 1 (to index "
355 " column). Got {} instead.".format(axis)
356 )
357
358 indices_dtype = _determine_key_type(indices)
359
360 if axis == 0 and indices_dtype == "str":
361 raise ValueError("String indexing is not supported with 'axis=0'")
362
363 if axis == 1 and hasattr(X, "ndim") and X.ndim != 2:
364 raise ValueError(
365 "'X' should be a 2D NumPy array, 2D sparse matrix or pandas "
366 "dataframe when indexing the columns (i.e. 'axis=1'). "
367 "Got {} instead with {} dimension(s).".format(type(X), X.ndim)
368 )
369
370 if (
371 axis == 1
372 and indices_dtype == "str"
373 and not (_is_pandas_df(X) or _use_interchange_protocol(X))
374 ):
375 raise ValueError(
376 "Specifying the columns using strings is only supported for dataframes."
377 )
378
379 if hasattr(X, "iloc"):
380 # TODO: we should probably use _is_pandas_df(X) instead but this would
381 # require updating some tests such as test_train_test_split_mock_pandas.
382 return _pandas_indexing(X, indices, indices_dtype, axis=axis)
383 elif _is_polars_df(X):
384 return _polars_indexing(X, indices, indices_dtype, axis=axis)
385 elif hasattr(X, "shape"):
386 return _array_indexing(X, indices, indices_dtype, axis=axis)
387 else:
388 return _list_indexing(X, indices, indices_dtype)
389
390
391def _safe_assign(X, values, *, row_indexer=None, column_indexer=None):
392 """Safe assignment to a numpy array, sparse matrix, or pandas dataframe.
393
394 Parameters
395 ----------
396 X : {ndarray, sparse-matrix, dataframe}
397 Array to be modified. It is expected to be 2-dimensional.
398
399 values : ndarray
400 The values to be assigned to `X`.
401
402 row_indexer : array-like, dtype={int, bool}, default=None
403 A 1-dimensional array to select the rows of interest. If `None`, all
404 rows are selected.
405
406 column_indexer : array-like, dtype={int, bool}, default=None
407 A 1-dimensional array to select the columns of interest. If `None`, all
408 columns are selected.
409 """
410 row_indexer = slice(None, None, None) if row_indexer is None else row_indexer
411 column_indexer = (
412 slice(None, None, None) if column_indexer is None else column_indexer
413 )
414
415 if hasattr(X, "iloc"): # pandas dataframe
416 with warnings.catch_warnings():
417 # pandas >= 1.5 raises a warning when using iloc to set values in a column
418 # that does not have the same type as the column being set. It happens
419 # for instance when setting a categorical column with a string.
420 # In the future the behavior won't change and the warning should disappear.
421 # TODO(1.3): check if the warning is still raised or remove the filter.
422 warnings.simplefilter("ignore", FutureWarning)
423 X.iloc[row_indexer, column_indexer] = values
424 else: # numpy array or sparse matrix
425 X[row_indexer, column_indexer] = values
426
427
428def _get_column_indices_for_bool_or_int(key, n_columns):
429 # Convert key into list of positive integer indexes
430 try:
431 idx = _safe_indexing(np.arange(n_columns), key)
432 except IndexError as e:
433 raise ValueError(
434 f"all features must be in [0, {n_columns - 1}] or [-{n_columns}, 0]"
435 ) from e
436 return np.atleast_1d(idx).tolist()
437
438
439def _get_column_indices(X, key):
440 """Get feature column indices for input data X and key.
441
442 For accepted values of `key`, see the docstring of
443 :func:`_safe_indexing`.
444 """
445 key_dtype = _determine_key_type(key)
446 if _use_interchange_protocol(X):
447 return _get_column_indices_interchange(X.__dataframe__(), key, key_dtype)
448
449 n_columns = X.shape[1]
450 if isinstance(key, (list, tuple)) and not key:
451 # we get an empty list
452 return []
453 elif key_dtype in ("bool", "int"):
454 return _get_column_indices_for_bool_or_int(key, n_columns)
455 else:
456 try:
457 all_columns = X.columns
458 except AttributeError:
459 raise ValueError(
460 "Specifying the columns using strings is only supported for dataframes."
461 )
462 if isinstance(key, str):
463 columns = [key]
464 elif isinstance(key, slice):
465 start, stop = key.start, key.stop
466 if start is not None:
467 start = all_columns.get_loc(start)
468 if stop is not None:
469 # pandas indexing with strings is endpoint included
470 stop = all_columns.get_loc(stop) + 1
471 else:
472 stop = n_columns + 1
473 return list(islice(range(n_columns), start, stop))
474 else:
475 columns = list(key)
476
477 try:
478 column_indices = []
479 for col in columns:
480 col_idx = all_columns.get_loc(col)
481 if not isinstance(col_idx, numbers.Integral):
482 raise ValueError(
483 f"Selected columns, {columns}, are not unique in dataframe"
484 )
485 column_indices.append(col_idx)
486
487 except KeyError as e:
488 raise ValueError("A given column is not a column of the dataframe") from e
489
490 return column_indices
491
492
493def _get_column_indices_interchange(X_interchange, key, key_dtype):
494 """Same as _get_column_indices but for X with __dataframe__ protocol."""
495
496 n_columns = X_interchange.num_columns()
497
498 if isinstance(key, (list, tuple)) and not key:
499 # we get an empty list
500 return []
501 elif key_dtype in ("bool", "int"):
502 return _get_column_indices_for_bool_or_int(key, n_columns)
503 else:
504 column_names = list(X_interchange.column_names())
505
506 if isinstance(key, slice):
507 if key.step not in [1, None]:
508 raise NotImplementedError("key.step must be 1 or None")
509 start, stop = key.start, key.stop
510 if start is not None:
511 start = column_names.index(start)
512
513 if stop is not None:
514 stop = column_names.index(stop) + 1
515 else:
516 stop = n_columns + 1
517 return list(islice(range(n_columns), start, stop))
518
519 selected_columns = [key] if np.isscalar(key) else key
520
521 try:
522 return [column_names.index(col) for col in selected_columns]
523 except ValueError as e:
524 raise ValueError("A given column is not a column of the dataframe") from e
525
526
527@validate_params(
528 {
529 "replace": ["boolean"],
530 "n_samples": [Interval(numbers.Integral, 1, None, closed="left"), None],
531 "random_state": ["random_state"],
532 "stratify": ["array-like", None],
533 },
534 prefer_skip_nested_validation=True,
535)
536def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify=None):
537 """Resample arrays or sparse matrices in a consistent way.
538
539 The default strategy implements one step of the bootstrapping
540 procedure.
541
542 Parameters
543 ----------
544 *arrays : sequence of array-like of shape (n_samples,) or \
545 (n_samples, n_outputs)
546 Indexable data-structures can be arrays, lists, dataframes or scipy
547 sparse matrices with consistent first dimension.
548
549 replace : bool, default=True
550 Implements resampling with replacement. If False, this will implement
551 (sliced) random permutations.
552
553 n_samples : int, default=None
554 Number of samples to generate. If left to None this is
555 automatically set to the first dimension of the arrays.
556 If replace is False it should not be larger than the length of
557 arrays.
558
559 random_state : int, RandomState instance or None, default=None
560 Determines random number generation for shuffling
561 the data.
562 Pass an int for reproducible results across multiple function calls.
563 See :term:`Glossary <random_state>`.
564
565 stratify : array-like of shape (n_samples,) or (n_samples, n_outputs), \
566 default=None
567 If not None, data is split in a stratified fashion, using this as
568 the class labels.
569
570 Returns
571 -------
572 resampled_arrays : sequence of array-like of shape (n_samples,) or \
573 (n_samples, n_outputs)
574 Sequence of resampled copies of the collections. The original arrays
575 are not impacted.
576
577 See Also
578 --------
579 shuffle : Shuffle arrays or sparse matrices in a consistent way.
580
581 Examples
582 --------
583 It is possible to mix sparse and dense arrays in the same run::
584
585 >>> import numpy as np
586 >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
587 >>> y = np.array([0, 1, 2])
588
589 >>> from scipy.sparse import coo_matrix
590 >>> X_sparse = coo_matrix(X)
591
592 >>> from sklearn.utils import resample
593 >>> X, X_sparse, y = resample(X, X_sparse, y, random_state=0)
594 >>> X
595 array([[1., 0.],
596 [2., 1.],
597 [1., 0.]])
598
599 >>> X_sparse
600 <3x2 sparse matrix of type '<... 'numpy.float64'>'
601 with 4 stored elements in Compressed Sparse Row format>
602
603 >>> X_sparse.toarray()
604 array([[1., 0.],
605 [2., 1.],
606 [1., 0.]])
607
608 >>> y
609 array([0, 1, 0])
610
611 >>> resample(y, n_samples=2, random_state=0)
612 array([0, 1])
613
614 Example using stratification::
615
616 >>> y = [0, 0, 1, 1, 1, 1, 1, 1, 1]
617 >>> resample(y, n_samples=5, replace=False, stratify=y,
618 ... random_state=0)
619 [1, 1, 1, 0, 1]
620 """
621 max_n_samples = n_samples
622 random_state = check_random_state(random_state)
623
624 if len(arrays) == 0:
625 return None
626
627 first = arrays[0]
628 n_samples = first.shape[0] if hasattr(first, "shape") else len(first)
629
630 if max_n_samples is None:
631 max_n_samples = n_samples
632 elif (max_n_samples > n_samples) and (not replace):
633 raise ValueError(
634 "Cannot sample %d out of arrays with dim %d when replace is False"
635 % (max_n_samples, n_samples)
636 )
637
638 check_consistent_length(*arrays)
639
640 if stratify is None:
641 if replace:
642 indices = random_state.randint(0, n_samples, size=(max_n_samples,))
643 else:
644 indices = np.arange(n_samples)
645 random_state.shuffle(indices)
646 indices = indices[:max_n_samples]
647 else:
648 # Code adapted from StratifiedShuffleSplit()
649 y = check_array(stratify, ensure_2d=False, dtype=None)
650 if y.ndim == 2:
651 # for multi-label y, map each distinct row to a string repr
652 # using join because str(row) uses an ellipsis if len(row) > 1000
653 y = np.array([" ".join(row.astype("str")) for row in y])
654
655 classes, y_indices = np.unique(y, return_inverse=True)
656 n_classes = classes.shape[0]
657
658 class_counts = np.bincount(y_indices)
659
660 # Find the sorted list of instances for each class:
661 # (np.unique above performs a sort, so code is O(n logn) already)
662 class_indices = np.split(
663 np.argsort(y_indices, kind="mergesort"), np.cumsum(class_counts)[:-1]
664 )
665
666 n_i = _approximate_mode(class_counts, max_n_samples, random_state)
667
668 indices = []
669
670 for i in range(n_classes):
671 indices_i = random_state.choice(class_indices[i], n_i[i], replace=replace)
672 indices.extend(indices_i)
673
674 indices = random_state.permutation(indices)
675
676 # convert sparse matrices to CSR for row-based indexing
677 arrays = [a.tocsr() if issparse(a) else a for a in arrays]
678 resampled_arrays = [_safe_indexing(a, indices) for a in arrays]
679 if len(resampled_arrays) == 1:
680 # syntactic sugar for the unit argument case
681 return resampled_arrays[0]
682 else:
683 return resampled_arrays
684
685
686def shuffle(*arrays, random_state=None, n_samples=None):
687 """Shuffle arrays or sparse matrices in a consistent way.
688
689 This is a convenience alias to ``resample(*arrays, replace=False)`` to do
690 random permutations of the collections.
691
692 Parameters
693 ----------
694 *arrays : sequence of indexable data-structures
695 Indexable data-structures can be arrays, lists, dataframes or scipy
696 sparse matrices with consistent first dimension.
697
698 random_state : int, RandomState instance or None, default=None
699 Determines random number generation for shuffling
700 the data.
701 Pass an int for reproducible results across multiple function calls.
702 See :term:`Glossary <random_state>`.
703
704 n_samples : int, default=None
705 Number of samples to generate. If left to None this is
706 automatically set to the first dimension of the arrays. It should
707 not be larger than the length of arrays.
708
709 Returns
710 -------
711 shuffled_arrays : sequence of indexable data-structures
712 Sequence of shuffled copies of the collections. The original arrays
713 are not impacted.
714
715 See Also
716 --------
717 resample : Resample arrays or sparse matrices in a consistent way.
718
719 Examples
720 --------
721 It is possible to mix sparse and dense arrays in the same run::
722
723 >>> import numpy as np
724 >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
725 >>> y = np.array([0, 1, 2])
726
727 >>> from scipy.sparse import coo_matrix
728 >>> X_sparse = coo_matrix(X)
729
730 >>> from sklearn.utils import shuffle
731 >>> X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0)
732 >>> X
733 array([[0., 0.],
734 [2., 1.],
735 [1., 0.]])
736
737 >>> X_sparse
738 <3x2 sparse matrix of type '<... 'numpy.float64'>'
739 with 3 stored elements in Compressed Sparse Row format>
740
741 >>> X_sparse.toarray()
742 array([[0., 0.],
743 [2., 1.],
744 [1., 0.]])
745
746 >>> y
747 array([2, 1, 0])
748
749 >>> shuffle(y, n_samples=2, random_state=0)
750 array([0, 1])
751 """
752 return resample(
753 *arrays, replace=False, n_samples=n_samples, random_state=random_state
754 )
755
756
757def safe_sqr(X, *, copy=True):
758 """Element wise squaring of array-likes and sparse matrices.
759
760 Parameters
761 ----------
762 X : {array-like, ndarray, sparse matrix}
763
764 copy : bool, default=True
765 Whether to create a copy of X and operate on it or to perform
766 inplace computation (default behaviour).
767
768 Returns
769 -------
770 X ** 2 : element wise square
771 Return the element-wise square of the input.
772 """
773 X = check_array(X, accept_sparse=["csr", "csc", "coo"], ensure_2d=False)
774 if issparse(X):
775 if copy:
776 X = X.copy()
777 X.data **= 2
778 else:
779 if copy:
780 X = X**2
781 else:
782 X **= 2
783 return X
784
785
786def _chunk_generator(gen, chunksize):
787 """Chunk generator, ``gen`` into lists of length ``chunksize``. The last
788 chunk may have a length less than ``chunksize``."""
789 while True:
790 chunk = list(islice(gen, chunksize))
791 if chunk:
792 yield chunk
793 else:
794 return
795
796
797@validate_params(
798 {
799 "n": [Interval(numbers.Integral, 1, None, closed="left")],
800 "batch_size": [Interval(numbers.Integral, 1, None, closed="left")],
801 "min_batch_size": [Interval(numbers.Integral, 0, None, closed="left")],
802 },
803 prefer_skip_nested_validation=True,
804)
805def gen_batches(n, batch_size, *, min_batch_size=0):
806 """Generator to create slices containing `batch_size` elements from 0 to `n`.
807
808 The last slice may contain less than `batch_size` elements, when
809 `batch_size` does not divide `n`.
810
811 Parameters
812 ----------
813 n : int
814 Size of the sequence.
815 batch_size : int
816 Number of elements in each batch.
817 min_batch_size : int, default=0
818 Minimum number of elements in each batch.
819
820 Yields
821 ------
822 slice of `batch_size` elements
823
824 See Also
825 --------
826 gen_even_slices: Generator to create n_packs slices going up to n.
827
828 Examples
829 --------
830 >>> from sklearn.utils import gen_batches
831 >>> list(gen_batches(7, 3))
832 [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]
833 >>> list(gen_batches(6, 3))
834 [slice(0, 3, None), slice(3, 6, None)]
835 >>> list(gen_batches(2, 3))
836 [slice(0, 2, None)]
837 >>> list(gen_batches(7, 3, min_batch_size=0))
838 [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]
839 >>> list(gen_batches(7, 3, min_batch_size=2))
840 [slice(0, 3, None), slice(3, 7, None)]
841 """
842 start = 0
843 for _ in range(int(n // batch_size)):
844 end = start + batch_size
845 if end + min_batch_size > n:
846 continue
847 yield slice(start, end)
848 start = end
849 if start < n:
850 yield slice(start, n)
851
852
853@validate_params(
854 {
855 "n": [Interval(Integral, 1, None, closed="left")],
856 "n_packs": [Interval(Integral, 1, None, closed="left")],
857 "n_samples": [Interval(Integral, 1, None, closed="left"), None],
858 },
859 prefer_skip_nested_validation=True,
860)
861def gen_even_slices(n, n_packs, *, n_samples=None):
862 """Generator to create `n_packs` evenly spaced slices going up to `n`.
863
864 If `n_packs` does not divide `n`, except for the first `n % n_packs`
865 slices, remaining slices may contain fewer elements.
866
867 Parameters
868 ----------
869 n : int
870 Size of the sequence.
871 n_packs : int
872 Number of slices to generate.
873 n_samples : int, default=None
874 Number of samples. Pass `n_samples` when the slices are to be used for
875 sparse matrix indexing; slicing off-the-end raises an exception, while
876 it works for NumPy arrays.
877
878 Yields
879 ------
880 `slice` representing a set of indices from 0 to n.
881
882 See Also
883 --------
884 gen_batches: Generator to create slices containing batch_size elements
885 from 0 to n.
886
887 Examples
888 --------
889 >>> from sklearn.utils import gen_even_slices
890 >>> list(gen_even_slices(10, 1))
891 [slice(0, 10, None)]
892 >>> list(gen_even_slices(10, 10))
893 [slice(0, 1, None), slice(1, 2, None), ..., slice(9, 10, None)]
894 >>> list(gen_even_slices(10, 5))
895 [slice(0, 2, None), slice(2, 4, None), ..., slice(8, 10, None)]
896 >>> list(gen_even_slices(10, 3))
897 [slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)]
898 """
899 start = 0
900 for pack_num in range(n_packs):
901 this_n = n // n_packs
902 if pack_num < n % n_packs:
903 this_n += 1
904 if this_n > 0:
905 end = start + this_n
906 if n_samples is not None:
907 end = min(n_samples, end)
908 yield slice(start, end, None)
909 start = end
910
911
912def tosequence(x):
913 """Cast iterable x to a Sequence, avoiding a copy if possible.
914
915 Parameters
916 ----------
917 x : iterable
918 The iterable to be converted.
919
920 Returns
921 -------
922 x : Sequence
923 If `x` is a NumPy array, it returns it as a `ndarray`. If `x`
924 is a `Sequence`, `x` is returned as-is. If `x` is from any other
925 type, `x` is returned casted as a list.
926 """
927 if isinstance(x, np.ndarray):
928 return np.asarray(x)
929 elif isinstance(x, Sequence):
930 return x
931 else:
932 return list(x)
933
934
935def _to_object_array(sequence):
936 """Convert sequence to a 1-D NumPy array of object dtype.
937
938 numpy.array constructor has a similar use but it's output
939 is ambiguous. It can be 1-D NumPy array of object dtype if
940 the input is a ragged array, but if the input is a list of
941 equal length arrays, then the output is a 2D numpy.array.
942 _to_object_array solves this ambiguity by guarantying that
943 the output is a 1-D NumPy array of objects for any input.
944
945 Parameters
946 ----------
947 sequence : array-like of shape (n_elements,)
948 The sequence to be converted.
949
950 Returns
951 -------
952 out : ndarray of shape (n_elements,), dtype=object
953 The converted sequence into a 1-D NumPy array of object dtype.
954
955 Examples
956 --------
957 >>> import numpy as np
958 >>> from sklearn.utils import _to_object_array
959 >>> _to_object_array([np.array([0]), np.array([1])])
960 array([array([0]), array([1])], dtype=object)
961 >>> _to_object_array([np.array([0]), np.array([1, 2])])
962 array([array([0]), array([1, 2])], dtype=object)
963 >>> _to_object_array([np.array([0]), np.array([1, 2])])
964 array([array([0]), array([1, 2])], dtype=object)
965 """
966 out = np.empty(len(sequence), dtype=object)
967 out[:] = sequence
968 return out
969
970
971def indices_to_mask(indices, mask_length):
972 """Convert list of indices to boolean mask.
973
974 Parameters
975 ----------
976 indices : list-like
977 List of integers treated as indices.
978 mask_length : int
979 Length of boolean mask to be generated.
980 This parameter must be greater than max(indices).
981
982 Returns
983 -------
984 mask : 1d boolean nd-array
985 Boolean array that is True where indices are present, else False.
986
987 Examples
988 --------
989 >>> from sklearn.utils import indices_to_mask
990 >>> indices = [1, 2 , 3, 4]
991 >>> indices_to_mask(indices, 5)
992 array([False, True, True, True, True])
993 """
994 if mask_length <= np.max(indices):
995 raise ValueError("mask_length must be greater than max(indices)")
996
997 mask = np.zeros(mask_length, dtype=bool)
998 mask[indices] = True
999
1000 return mask
1001
1002
1003def _message_with_time(source, message, time):
1004 """Create one line message for logging purposes.
1005
1006 Parameters
1007 ----------
1008 source : str
1009 String indicating the source or the reference of the message.
1010
1011 message : str
1012 Short message.
1013
1014 time : int
1015 Time in seconds.
1016 """
1017 start_message = "[%s] " % source
1018
1019 # adapted from joblib.logger.short_format_time without the Windows -.1s
1020 # adjustment
1021 if time > 60:
1022 time_str = "%4.1fmin" % (time / 60)
1023 else:
1024 time_str = " %5.1fs" % time
1025 end_message = " %s, total=%s" % (message, time_str)
1026 dots_len = 70 - len(start_message) - len(end_message)
1027 return "%s%s%s" % (start_message, dots_len * ".", end_message)
1028
1029
1030@contextmanager
1031def _print_elapsed_time(source, message=None):
1032 """Log elapsed time to stdout when the context is exited.
1033
1034 Parameters
1035 ----------
1036 source : str
1037 String indicating the source or the reference of the message.
1038
1039 message : str, default=None
1040 Short message. If None, nothing will be printed.
1041
1042 Returns
1043 -------
1044 context_manager
1045 Prints elapsed time upon exit if verbose.
1046 """
1047 if message is None:
1048 yield
1049 else:
1050 start = timeit.default_timer()
1051 yield
1052 print(_message_with_time(source, message, timeit.default_timer() - start))
1053
1054
1055def get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None):
1056 """Calculate how many rows can be processed within `working_memory`.
1057
1058 Parameters
1059 ----------
1060 row_bytes : int
1061 The expected number of bytes of memory that will be consumed
1062 during the processing of each row.
1063 max_n_rows : int, default=None
1064 The maximum return value.
1065 working_memory : int or float, default=None
1066 The number of rows to fit inside this number of MiB will be
1067 returned. When None (default), the value of
1068 ``sklearn.get_config()['working_memory']`` is used.
1069
1070 Returns
1071 -------
1072 int
1073 The number of rows which can be processed within `working_memory`.
1074
1075 Warns
1076 -----
1077 Issues a UserWarning if `row_bytes exceeds `working_memory` MiB.
1078 """
1079
1080 if working_memory is None:
1081 working_memory = get_config()["working_memory"]
1082
1083 chunk_n_rows = int(working_memory * (2**20) // row_bytes)
1084 if max_n_rows is not None:
1085 chunk_n_rows = min(chunk_n_rows, max_n_rows)
1086 if chunk_n_rows < 1:
1087 warnings.warn(
1088 "Could not adhere to working_memory config. "
1089 "Currently %.0fMiB, %.0fMiB required."
1090 % (working_memory, np.ceil(row_bytes * 2**-20))
1091 )
1092 chunk_n_rows = 1
1093 return chunk_n_rows
1094
1095
1096def _is_pandas_na(x):
1097 """Test if x is pandas.NA.
1098
1099 We intentionally do not use this function to return `True` for `pd.NA` in
1100 `is_scalar_nan`, because estimators that support `pd.NA` are the exception
1101 rather than the rule at the moment. When `pd.NA` is more universally
1102 supported, we may reconsider this decision.
1103
1104 Parameters
1105 ----------
1106 x : any type
1107
1108 Returns
1109 -------
1110 boolean
1111 """
1112 with suppress(ImportError):
1113 from pandas import NA
1114
1115 return x is NA
1116
1117 return False
1118
1119
1120def is_scalar_nan(x):
1121 """Test if x is NaN.
1122
1123 This function is meant to overcome the issue that np.isnan does not allow
1124 non-numerical types as input, and that np.nan is not float('nan').
1125
1126 Parameters
1127 ----------
1128 x : any type
1129 Any scalar value.
1130
1131 Returns
1132 -------
1133 bool
1134 Returns true if x is NaN, and false otherwise.
1135
1136 Examples
1137 --------
1138 >>> import numpy as np
1139 >>> from sklearn.utils import is_scalar_nan
1140 >>> is_scalar_nan(np.nan)
1141 True
1142 >>> is_scalar_nan(float("nan"))
1143 True
1144 >>> is_scalar_nan(None)
1145 False
1146 >>> is_scalar_nan("")
1147 False
1148 >>> is_scalar_nan([np.nan])
1149 False
1150 """
1151 return (
1152 not isinstance(x, numbers.Integral)
1153 and isinstance(x, numbers.Real)
1154 and math.isnan(x)
1155 )
1156
1157
1158def _approximate_mode(class_counts, n_draws, rng):
1159 """Computes approximate mode of multivariate hypergeometric.
1160
1161 This is an approximation to the mode of the multivariate
1162 hypergeometric given by class_counts and n_draws.
1163 It shouldn't be off by more than one.
1164
1165 It is the mostly likely outcome of drawing n_draws many
1166 samples from the population given by class_counts.
1167
1168 Parameters
1169 ----------
1170 class_counts : ndarray of int
1171 Population per class.
1172 n_draws : int
1173 Number of draws (samples to draw) from the overall population.
1174 rng : random state
1175 Used to break ties.
1176
1177 Returns
1178 -------
1179 sampled_classes : ndarray of int
1180 Number of samples drawn from each class.
1181 np.sum(sampled_classes) == n_draws
1182
1183 Examples
1184 --------
1185 >>> import numpy as np
1186 >>> from sklearn.utils import _approximate_mode
1187 >>> _approximate_mode(class_counts=np.array([4, 2]), n_draws=3, rng=0)
1188 array([2, 1])
1189 >>> _approximate_mode(class_counts=np.array([5, 2]), n_draws=4, rng=0)
1190 array([3, 1])
1191 >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
1192 ... n_draws=2, rng=0)
1193 array([0, 1, 1, 0])
1194 >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
1195 ... n_draws=2, rng=42)
1196 array([1, 1, 0, 0])
1197 """
1198 rng = check_random_state(rng)
1199 # this computes a bad approximation to the mode of the
1200 # multivariate hypergeometric given by class_counts and n_draws
1201 continuous = class_counts / class_counts.sum() * n_draws
1202 # floored means we don't overshoot n_samples, but probably undershoot
1203 floored = np.floor(continuous)
1204 # we add samples according to how much "left over" probability
1205 # they had, until we arrive at n_samples
1206 need_to_add = int(n_draws - floored.sum())
1207 if need_to_add > 0:
1208 remainder = continuous - floored
1209 values = np.sort(np.unique(remainder))[::-1]
1210 # add according to remainder, but break ties
1211 # randomly to avoid biases
1212 for value in values:
1213 (inds,) = np.where(remainder == value)
1214 # if we need_to_add less than what's in inds
1215 # we draw randomly from them.
1216 # if we need to add more, we add them all and
1217 # go to the next value
1218 add_now = min(len(inds), need_to_add)
1219 inds = rng.choice(inds, size=add_now, replace=False)
1220 floored[inds] += 1
1221 need_to_add -= add_now
1222 if need_to_add == 0:
1223 break
1224 return floored.astype(int)
1225
1226
1227def check_matplotlib_support(caller_name):
1228 """Raise ImportError with detailed error message if mpl is not installed.
1229
1230 Plot utilities like any of the Display's plotting functions should lazily import
1231 matplotlib and call this helper before any computation.
1232
1233 Parameters
1234 ----------
1235 caller_name : str
1236 The name of the caller that requires matplotlib.
1237 """
1238 try:
1239 import matplotlib # noqa
1240 except ImportError as e:
1241 raise ImportError(
1242 "{} requires matplotlib. You can install matplotlib with "
1243 "`pip install matplotlib`".format(caller_name)
1244 ) from e
1245
1246
1247def check_pandas_support(caller_name):
1248 """Raise ImportError with detailed error message if pandas is not installed.
1249
1250 Plot utilities like :func:`fetch_openml` should lazily import
1251 pandas and call this helper before any computation.
1252
1253 Parameters
1254 ----------
1255 caller_name : str
1256 The name of the caller that requires pandas.
1257
1258 Returns
1259 -------
1260 pandas
1261 The pandas package.
1262 """
1263 try:
1264 import pandas # noqa
1265
1266 return pandas
1267 except ImportError as e:
1268 raise ImportError("{} requires pandas.".format(caller_name)) from e