1"""
2The :mod:`sklearn.utils.validation` module includes functions to validate
3input and parameters within scikit-learn estimators.
4"""
5
6# Authors: Olivier Grisel
7# Gael Varoquaux
8# Andreas Mueller
9# Lars Buitinck
10# Alexandre Gramfort
11# Nicolas Tresegnie
12# Sylvain Marie
13# License: BSD 3 clause
14
15import numbers
16import operator
17import sys
18import warnings
19from contextlib import suppress
20from functools import reduce, wraps
21from inspect import Parameter, isclass, signature
22
23import joblib
24import numpy as np
25import scipy.sparse as sp
26
27from .. import get_config as _get_config
28from ..exceptions import DataConversionWarning, NotFittedError, PositiveSpectrumWarning
29from ..utils._array_api import _asarray_with_order, _is_numpy_namespace, get_namespace
30from ..utils.fixes import ComplexWarning, _preserve_dia_indices_dtype
31from ._isfinite import FiniteStatus, cy_isfinite
32from .fixes import _object_dtype_isnan
33
34FLOAT_DTYPES = (np.float64, np.float32, np.float16)
35
36
37# This function is not used anymore at this moment in the code base but we keep it in
38# case that we merge a new public function without kwarg only by mistake, which would
39# require a deprecation cycle to fix.
40def _deprecate_positional_args(func=None, *, version="1.3"):
41 """Decorator for methods that issues warnings for positional arguments.
42
43 Using the keyword-only argument syntax in pep 3102, arguments after the
44 * will issue a warning when passed as a positional argument.
45
46 Parameters
47 ----------
48 func : callable, default=None
49 Function to check arguments on.
50 version : callable, default="1.3"
51 The version when positional arguments will result in error.
52 """
53
54 def _inner_deprecate_positional_args(f):
55 sig = signature(f)
56 kwonly_args = []
57 all_args = []
58
59 for name, param in sig.parameters.items():
60 if param.kind == Parameter.POSITIONAL_OR_KEYWORD:
61 all_args.append(name)
62 elif param.kind == Parameter.KEYWORD_ONLY:
63 kwonly_args.append(name)
64
65 @wraps(f)
66 def inner_f(*args, **kwargs):
67 extra_args = len(args) - len(all_args)
68 if extra_args <= 0:
69 return f(*args, **kwargs)
70
71 # extra_args > 0
72 args_msg = [
73 "{}={}".format(name, arg)
74 for name, arg in zip(kwonly_args[:extra_args], args[-extra_args:])
75 ]
76 args_msg = ", ".join(args_msg)
77 warnings.warn(
78 (
79 f"Pass {args_msg} as keyword args. From version "
80 f"{version} passing these as positional arguments "
81 "will result in an error"
82 ),
83 FutureWarning,
84 )
85 kwargs.update(zip(sig.parameters, args))
86 return f(**kwargs)
87
88 return inner_f
89
90 if func is not None:
91 return _inner_deprecate_positional_args(func)
92
93 return _inner_deprecate_positional_args
94
95
96def _assert_all_finite(
97 X, allow_nan=False, msg_dtype=None, estimator_name=None, input_name=""
98):
99 """Like assert_all_finite, but only for ndarray."""
100
101 xp, _ = get_namespace(X)
102
103 if _get_config()["assume_finite"]:
104 return
105
106 X = xp.asarray(X)
107
108 # for object dtype data, we only check for NaNs (GH-13254)
109 if X.dtype == np.dtype("object") and not allow_nan:
110 if _object_dtype_isnan(X).any():
111 raise ValueError("Input contains NaN")
112
113 # We need only consider float arrays, hence can early return for all else.
114 if not xp.isdtype(X.dtype, ("real floating", "complex floating")):
115 return
116
117 # First try an O(n) time, O(1) space solution for the common case that
118 # everything is finite; fall back to O(n) space `np.isinf/isnan` or custom
119 # Cython implementation to prevent false positives and provide a detailed
120 # error message.
121 with np.errstate(over="ignore"):
122 first_pass_isfinite = xp.isfinite(xp.sum(X))
123 if first_pass_isfinite:
124 return
125
126 _assert_all_finite_element_wise(
127 X,
128 xp=xp,
129 allow_nan=allow_nan,
130 msg_dtype=msg_dtype,
131 estimator_name=estimator_name,
132 input_name=input_name,
133 )
134
135
136def _assert_all_finite_element_wise(
137 X, *, xp, allow_nan, msg_dtype=None, estimator_name=None, input_name=""
138):
139 # Cython implementation doesn't support FP16 or complex numbers
140 use_cython = (
141 xp is np and X.data.contiguous and X.dtype.type in {np.float32, np.float64}
142 )
143 if use_cython:
144 out = cy_isfinite(X.reshape(-1), allow_nan=allow_nan)
145 has_nan_error = False if allow_nan else out == FiniteStatus.has_nan
146 has_inf = out == FiniteStatus.has_infinite
147 else:
148 has_inf = xp.any(xp.isinf(X))
149 has_nan_error = False if allow_nan else xp.any(xp.isnan(X))
150 if has_inf or has_nan_error:
151 if has_nan_error:
152 type_err = "NaN"
153 else:
154 msg_dtype = msg_dtype if msg_dtype is not None else X.dtype
155 type_err = f"infinity or a value too large for {msg_dtype!r}"
156 padded_input_name = input_name + " " if input_name else ""
157 msg_err = f"Input {padded_input_name}contains {type_err}."
158 if estimator_name and input_name == "X" and has_nan_error:
159 # Improve the error message on how to handle missing values in
160 # scikit-learn.
161 msg_err += (
162 f"\n{estimator_name} does not accept missing values"
163 " encoded as NaN natively. For supervised learning, you might want"
164 " to consider sklearn.ensemble.HistGradientBoostingClassifier and"
165 " Regressor which accept missing values encoded as NaNs natively."
166 " Alternatively, it is possible to preprocess the data, for"
167 " instance by using an imputer transformer in a pipeline or drop"
168 " samples with missing values. See"
169 " https://scikit-learn.org/stable/modules/impute.html"
170 " You can find a list of all estimators that handle NaN values"
171 " at the following page:"
172 " https://scikit-learn.org/stable/modules/impute.html"
173 "#estimators-that-handle-nan-values"
174 )
175 raise ValueError(msg_err)
176
177
178def assert_all_finite(
179 X,
180 *,
181 allow_nan=False,
182 estimator_name=None,
183 input_name="",
184):
185 """Throw a ValueError if X contains NaN or infinity.
186
187 Parameters
188 ----------
189 X : {ndarray, sparse matrix}
190 The input data.
191
192 allow_nan : bool, default=False
193 If True, do not throw error when `X` contains NaN.
194
195 estimator_name : str, default=None
196 The estimator name, used to construct the error message.
197
198 input_name : str, default=""
199 The data name used to construct the error message. In particular
200 if `input_name` is "X" and the data has NaN values and
201 allow_nan is False, the error message will link to the imputer
202 documentation.
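
    Examples
    --------
    A minimal sketch of the expected behavior (illustrative values):

    >>> import numpy as np
    >>> from sklearn.utils import assert_all_finite
    >>> assert_all_finite(np.array([1.0, 2.0, 3.0]))  # finite: passes silently
    >>> try:
    ...     assert_all_finite(np.array([1.0, np.nan, 3.0]))
    ... except ValueError as exc:
    ...     print(exc)
    Input contains NaN.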
    """
    _assert_all_finite(
        X.data if sp.issparse(X) else X,
        allow_nan=allow_nan,
        estimator_name=estimator_name,
        input_name=input_name,
    )


def as_float_array(X, *, copy=True, force_all_finite=True):
    """Convert an array-like to an array of floats.

    The new dtype will be np.float32 or np.float64, depending on the original
    type. The function can create a copy or modify the argument depending
    on the argument `copy`.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        The input data.

    copy : bool, default=True
        If True, a copy of X will be created. If False, a copy may still be
        returned if X's dtype is not a floating point type.

    force_all_finite : bool or 'allow-nan', default=True
        Whether to raise an error on np.inf, np.nan, pd.NA in X. The
        possibilities are:

        - True: Force all values of X to be finite.
        - False: accepts np.inf, np.nan, pd.NA in X.
        - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot
          be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`.

    Returns
    -------
    XT : {ndarray, sparse matrix}
        An array of type float.
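
    Examples
    --------
    An illustrative sketch; integers of at most 32 bits are upcast to
    ``np.float32``, wider integers to ``np.float64``:

    >>> import numpy as np
    >>> from sklearn.utils import as_float_array
    >>> as_float_array(np.array([0, 1, 2], dtype=np.int64))
    array([0., 1., 2.])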
    """
    if isinstance(X, np.matrix) or (
        not isinstance(X, np.ndarray) and not sp.issparse(X)
    ):
        return check_array(
            X,
            accept_sparse=["csr", "csc", "coo"],
            dtype=np.float64,
            copy=copy,
            force_all_finite=force_all_finite,
            ensure_2d=False,
        )
    elif sp.issparse(X) and X.dtype in [np.float32, np.float64]:
        return X.copy() if copy else X
    elif X.dtype in [np.float32, np.float64]:  # is numpy array
        return X.copy("F" if X.flags["F_CONTIGUOUS"] else "C") if copy else X
    else:
        if X.dtype.kind in "uib" and X.dtype.itemsize <= 4:
            return_dtype = np.float32
        else:
            return_dtype = np.float64
        return X.astype(return_dtype)


def _is_arraylike(x):
    """Returns whether the input is array-like."""
    return hasattr(x, "__len__") or hasattr(x, "shape") or hasattr(x, "__array__")


def _is_arraylike_not_scalar(array):
    """Return True if array is array-like and not a scalar."""
    return _is_arraylike(array) and not np.isscalar(array)


def _use_interchange_protocol(X):
    """Use interchange protocol for non-pandas dataframes that follow the protocol.

    Note: at this point we chose not to use the interchange API on pandas
    dataframes to ensure strict behavioral backward compatibility with older
    versions of scikit-learn.
    """
    return not _is_pandas_df(X) and hasattr(X, "__dataframe__")


def _num_features(X):
    """Return the number of features in an array-like X.

    This helper function tries hard to avoid materializing an array version
    of X unless necessary. For instance, if X is a list of lists,
    this function will return the length of the first element, assuming
    that subsequent elements are all lists of the same length without
    checking.

    Parameters
    ----------
    X : array-like
        array-like to get the number of features.

    Returns
    -------
    features : int
        Number of features.
    """
    type_ = type(X)
    if type_.__module__ == "builtins":
        type_name = type_.__qualname__
    else:
        type_name = f"{type_.__module__}.{type_.__qualname__}"
    message = f"Unable to find the number of features from X of type {type_name}"
    if not hasattr(X, "__len__") and not hasattr(X, "shape"):
        if not hasattr(X, "__array__"):
            raise TypeError(message)
        # Only convert X to a numpy array if there is no cheaper, heuristic
        # option.
        X = np.asarray(X)

    if hasattr(X, "shape"):
        if not hasattr(X.shape, "__len__") or len(X.shape) <= 1:
            message += f" with shape {X.shape}"
            raise TypeError(message)
        return X.shape[1]

    first_sample = X[0]

    # Do not consider an array-like of strings or dicts to be a 2D array
    if isinstance(first_sample, (str, bytes, dict)):
        message += f" where the samples are of type {type(first_sample).__qualname__}"
        raise TypeError(message)

    try:
        # If X is a list of lists, for instance, we assume that all nested
        # lists have the same length without checking or converting to
        # a numpy array to keep this function call as cheap as possible.
        return len(first_sample)
    except Exception as err:
        raise TypeError(message) from err


def _num_samples(x):
    """Return the number of samples in array-like x."""
    message = "Expected sequence or array-like, got %s" % type(x)
    if hasattr(x, "fit") and callable(x.fit):
        # Don't get num_samples from an ensemble's length!
        raise TypeError(message)

    if _use_interchange_protocol(x):
        return x.__dataframe__().num_rows()

    if not hasattr(x, "__len__") and not hasattr(x, "shape"):
        if hasattr(x, "__array__"):
            x = np.asarray(x)
        else:
            raise TypeError(message)

    if hasattr(x, "shape") and x.shape is not None:
        if len(x.shape) == 0:
            raise TypeError(
                "Singleton array %r cannot be considered a valid collection." % x
            )
        # Check that shape is returning an integer or default to len
        # Dask dataframes may not return numeric shape[0] value
        if isinstance(x.shape[0], numbers.Integral):
            return x.shape[0]

    try:
        return len(x)
    except TypeError as type_error:
        raise TypeError(message) from type_error


def check_memory(memory):
    """Check that ``memory`` is joblib.Memory-like.

    joblib.Memory-like means that ``memory`` can be converted into a
    joblib.Memory instance (typically a str denoting the ``location``)
    or has the same interface (has a ``cache`` method).

    Parameters
    ----------
    memory : None, str or object with the joblib.Memory interface
        - If string, the location where to create the `joblib.Memory` interface.
        - If None, no caching is done and the Memory object is completely transparent.

    Returns
    -------
    memory : object with the joblib.Memory interface
        A correct joblib.Memory object.

    Raises
    ------
    ValueError
        If ``memory`` is not joblib.Memory-like.
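
    Examples
    --------
    A minimal sketch; `None` yields a transparent (no-op) `joblib.Memory`:

    >>> from sklearn.utils.validation import check_memory
    >>> memory = check_memory(None)
    >>> hasattr(memory, "cache")
    True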
    """
    if memory is None or isinstance(memory, str):
        memory = joblib.Memory(location=memory, verbose=0)
    elif not hasattr(memory, "cache"):
        raise ValueError(
            "'memory' should be None, a string or have the same"
            " interface as joblib.Memory."
            " Got memory='{}' instead.".format(memory)
        )
    return memory


def check_consistent_length(*arrays):
    """Check that all arrays have consistent first dimensions.

    Checks whether all objects in arrays have the same shape or length.

    Parameters
    ----------
    *arrays : list or tuple of input objects.
        Objects that will be checked for consistent length.
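
    Examples
    --------
    An illustrative sketch of the passing and failing cases:

    >>> from sklearn.utils.validation import check_consistent_length
    >>> check_consistent_length([1, 2, 3], [4, 5, 6])  # same length: no error
    >>> try:
    ...     check_consistent_length([1, 2], [1, 2, 3])
    ... except ValueError as e:
    ...     print(e)
    Found input variables with inconsistent numbers of samples: [2, 3]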
    """

    lengths = [_num_samples(X) for X in arrays if X is not None]
    uniques = np.unique(lengths)
    if len(uniques) > 1:
        raise ValueError(
            "Found input variables with inconsistent numbers of samples: %r"
            % [int(length) for length in lengths]
        )


def _make_indexable(iterable):
    """Ensure iterable supports indexing or convert to an indexable variant.

    Convert sparse matrices to csr and other non-indexable iterables to arrays.
    Let `None` and indexable objects (e.g. pandas dataframes) pass unchanged.

    Parameters
    ----------
    iterable : {list, dataframe, ndarray, sparse matrix} or None
        Object to be converted to an indexable iterable.
    """
    if sp.issparse(iterable):
        return iterable.tocsr()
    elif hasattr(iterable, "__getitem__") or hasattr(iterable, "iloc"):
        return iterable
    elif iterable is None:
        return iterable
    return np.array(iterable)


def indexable(*iterables):
    """Make arrays indexable for cross-validation.

    Checks consistent length, passes through None, and ensures that everything
    can be indexed by converting sparse matrices to csr and converting
    non-iterable objects to arrays.

    Parameters
    ----------
    *iterables : {lists, dataframes, ndarrays, sparse matrices}
        List of objects to ensure sliceability.

    Returns
    -------
    result : list of {ndarray, sparse matrix, dataframe} or None
        Returns a list containing indexable arrays (i.e. NumPy array,
        sparse matrix, or dataframe) or `None`.
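
    Examples
    --------
    A minimal sketch; a sparse matrix is converted to CSR, while already
    indexable inputs pass through unchanged:

    >>> import numpy as np
    >>> from scipy.sparse import csr_matrix
    >>> from sklearn.utils import indexable
    >>> X, y = indexable(csr_matrix(np.eye(3)), [0, 1, 2])
    >>> X.format
    'csr'
    >>> y
    [0, 1, 2]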
    """

    result = [_make_indexable(X) for X in iterables]
    check_consistent_length(*result)
    return result


def _ensure_sparse_format(
    sparse_container,
    accept_sparse,
    dtype,
    copy,
    force_all_finite,
    accept_large_sparse,
    estimator_name=None,
    input_name="",
):
    """Convert a sparse container to a given format.

    Checks the sparse format of `sparse_container` and converts if necessary.

    Parameters
    ----------
    sparse_container : sparse matrix or array
        Input to validate and convert.

    accept_sparse : str, bool or list/tuple of str
        String[s] representing allowed sparse matrix formats ('csc',
        'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'). If the input is sparse but
        not in the allowed format, it will be converted to the first listed
        format. True allows the input to be any format. False means
        that a sparse matrix input will raise an error.

    dtype : str, type or None
        Data type of result. If None, the dtype of the input is preserved.

    copy : bool
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : bool or 'allow-nan'
        Whether to raise an error on np.inf, np.nan, pd.NA in X. The
        possibilities are:

        - True: Force all values of X to be finite.
        - False: accepts np.inf, np.nan, pd.NA in X.
        - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot
          be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`.

    estimator_name : str, default=None
        The estimator name, used to construct the error message.

    input_name : str, default=""
        The data name used to construct the error message. In particular
        if `input_name` is "X" and the data has NaN values and
        allow_nan is False, the error message will link to the imputer
        documentation.

    Returns
    -------
    sparse_container_converted : sparse matrix or array
        Sparse container (matrix/array) that is ensured to have an allowed type.
    """
    if dtype is None:
        dtype = sparse_container.dtype

    changed_format = False
    sparse_container_type_name = type(sparse_container).__name__

    if isinstance(accept_sparse, str):
        accept_sparse = [accept_sparse]

    # Indices dtype validation
    _check_large_sparse(sparse_container, accept_large_sparse)

    if accept_sparse is False:
        padded_input = " for " + input_name if input_name else ""
        raise TypeError(
            f"Sparse data was passed{padded_input}, but dense data is required. "
            "Use '.toarray()' to convert to a dense numpy array."
        )
    elif isinstance(accept_sparse, (list, tuple)):
        if len(accept_sparse) == 0:
            raise ValueError(
                "When providing 'accept_sparse' as a tuple or list, it must contain at "
                "least one string value."
            )
        # ensure correct sparse format
        if sparse_container.format not in accept_sparse:
            # create new with correct sparse
            sparse_container = sparse_container.asformat(accept_sparse[0])
            changed_format = True
    elif accept_sparse is not True:
        # any other type
        raise ValueError(
            "Parameter 'accept_sparse' should be a string, boolean or list of strings."
            f" You provided 'accept_sparse={accept_sparse}'."
        )

    if dtype != sparse_container.dtype:
        # convert dtype
        sparse_container = sparse_container.astype(dtype)
    elif copy and not changed_format:
        # force copy
        sparse_container = sparse_container.copy()

    if force_all_finite:
        if not hasattr(sparse_container, "data"):
            warnings.warn(
                f"Can't check {sparse_container.format} sparse matrix for nan or inf.",
                stacklevel=2,
            )
        else:
            _assert_all_finite(
                sparse_container.data,
                allow_nan=force_all_finite == "allow-nan",
                estimator_name=estimator_name,
                input_name=input_name,
            )

    # TODO: Remove when the minimum version of SciPy supported is 1.12
    # With SciPy sparse arrays, conversion from DIA format to COO, CSR, or BSR
    # triggers the use of `np.int64` indices even if the data is such that it could
    # be more efficiently represented with `np.int32` indices.
    # https://github.com/scipy/scipy/issues/19245 Since not all scikit-learn
    # algorithms support large indices, the following code downcasts to `np.int32`
    # indices when it's safe to do so.
    if changed_format:
        # accept_sparse is specified to a specific format and a conversion occurred
        requested_sparse_format = accept_sparse[0]
        _preserve_dia_indices_dtype(
            sparse_container, sparse_container_type_name, requested_sparse_format
        )

    return sparse_container


def _ensure_no_complex_data(array):
    if (
        hasattr(array, "dtype")
        and array.dtype is not None
        and hasattr(array.dtype, "kind")
        and array.dtype.kind == "c"
    ):
        raise ValueError("Complex data not supported\n{}\n".format(array))


def _check_estimator_name(estimator):
    if estimator is not None:
        if isinstance(estimator, str):
            return estimator
        else:
            return estimator.__class__.__name__
    return None


def _pandas_dtype_needs_early_conversion(pd_dtype):
    """Return True if pandas extension pd_dtype needs to be converted early."""
    # Check these early for pandas versions without extension dtypes
    from pandas import SparseDtype
    from pandas.api.types import (
        is_bool_dtype,
        is_float_dtype,
        is_integer_dtype,
    )

    if is_bool_dtype(pd_dtype):
        # bool and extension booleans need early conversion because __array__
        # converts mixed dtype dataframes into object dtypes
        return True

    if isinstance(pd_dtype, SparseDtype):
        # Sparse arrays will be converted later in `check_array`
        return False

    try:
        from pandas.api.types import is_extension_array_dtype
    except ImportError:
        return False

    if not is_extension_array_dtype(pd_dtype):
        # Only handle extension arrays for integers and floats; sparse dtypes
        # were already handled above.
        return False
    elif is_float_dtype(pd_dtype):
        # Float ndarrays can normally support nans. They need to be converted
        # first to map pd.NA to np.nan
        return True
    elif is_integer_dtype(pd_dtype):
        # XXX: Warn when converting from a high integer to a float
        return True

    return False


def _is_extension_array_dtype(array):
    # Pandas extension arrays have a dtype with an na_value
    return hasattr(array, "dtype") and hasattr(array.dtype, "na_value")


def check_array(
    array,
    accept_sparse=False,
    *,
    accept_large_sparse=True,
    dtype="numeric",
    order=None,
    copy=False,
    force_all_finite=True,
    ensure_2d=True,
    allow_nd=False,
    ensure_min_samples=1,
    ensure_min_features=1,
    estimator=None,
    input_name="",
):
    """Input validation on an array, list, sparse matrix or similar.

    By default, the input is checked to be a non-empty 2D array containing
    only finite values. If the dtype of the array is object, attempt
    converting to float, raising on failure.

    Parameters
    ----------
    array : object
        Input object to check / convert.

    accept_sparse : str, bool or list/tuple of str, default=False
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.

    accept_large_sparse : bool, default=True
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse=False will cause it to be accepted
        only if its indices are stored with a 32-bit dtype.

        .. versionadded:: 0.20

    dtype : 'numeric', type, list of type or None, default='numeric'
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.

    order : {'F', 'C'} or None, default=None
        Whether an array will be forced to be fortran or c-style.
        When order is None (default), then if copy=False, nothing is ensured
        about the memory layout of the output array; otherwise (copy=True)
        the memory layout of the returned array is kept as close as possible
        to the original array.

    copy : bool, default=False
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : bool or 'allow-nan', default=True
        Whether to raise an error on np.inf, np.nan, pd.NA in array. The
        possibilities are:

        - True: Force all values of array to be finite.
        - False: accepts np.inf, np.nan, pd.NA in array.
        - 'allow-nan': accepts only np.nan and pd.NA values in array. Values
          cannot be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`.

    ensure_2d : bool, default=True
        Whether to raise a value error if array is not 2D.

    allow_nd : bool, default=False
        Whether to allow array.ndim > 2.

    ensure_min_samples : int, default=1
        Make sure that the array has a minimum number of samples in its first
        axis (rows for a 2D array). Setting to 0 disables this check.

    ensure_min_features : int, default=1
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when the input data has effectively 2
        dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
        disables this check.

    estimator : str or estimator instance, default=None
        If passed, include the name of the estimator in warning messages.

    input_name : str, default=""
        The data name used to construct the error message. In particular
        if `input_name` is "X" and the data has NaN values and
        allow_nan is False, the error message will link to the imputer
        documentation.

        .. versionadded:: 1.1.0

    Returns
    -------
    array_converted : object
        The converted and validated array.
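
    Examples
    --------
    A minimal sketch; a nested list is validated and converted to a 2D
    ndarray:

    >>> from sklearn.utils import check_array
    >>> X = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
    >>> check_array(X)
    array([[1., 2., 3.],
           [4., 5., 6.]])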
    """
    if isinstance(array, np.matrix):
        raise TypeError(
            "np.matrix is not supported. Please convert to a numpy array with "
            "np.asarray. For more information see: "
            "https://numpy.org/doc/stable/reference/generated/numpy.matrix.html"
        )

    xp, is_array_api_compliant = get_namespace(array)

    # store reference to original array to check if copy is needed when
    # function returns
    array_orig = array

    # store whether originally we wanted numeric dtype
    dtype_numeric = isinstance(dtype, str) and dtype == "numeric"

    dtype_orig = getattr(array, "dtype", None)
    if not is_array_api_compliant and not hasattr(dtype_orig, "kind"):
        # not a data type (e.g. a column named dtype in a pandas DataFrame)
        dtype_orig = None

    # check if the object contains several dtypes (typically a pandas
    # DataFrame), and store them. If not, store None.
    dtypes_orig = None
    pandas_requires_conversion = False
    if hasattr(array, "dtypes") and hasattr(array.dtypes, "__array__"):
        # throw warning if columns are sparse. If all columns are sparse, then
        # array.sparse exists and sparsity will be preserved (later).
        with suppress(ImportError):
            from pandas import SparseDtype

            def is_sparse(dtype):
                return isinstance(dtype, SparseDtype)

            if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
                warnings.warn(
                    "pandas.DataFrame with sparse columns found. "
                    "It will be converted to a dense numpy array."
                )

        dtypes_orig = list(array.dtypes)
        pandas_requires_conversion = any(
            _pandas_dtype_needs_early_conversion(i) for i in dtypes_orig
        )
        if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig):
            dtype_orig = np.result_type(*dtypes_orig)
        elif pandas_requires_conversion and any(d == object for d in dtypes_orig):
            # Force object if any of the dtypes is an object
            dtype_orig = object

    elif (_is_extension_array_dtype(array) or hasattr(array, "iloc")) and hasattr(
        array, "dtype"
    ):
        # array is a pandas series
        pandas_requires_conversion = _pandas_dtype_needs_early_conversion(array.dtype)
        if isinstance(array.dtype, np.dtype):
            dtype_orig = array.dtype
        else:
            # Set to None to let array.astype work out the best dtype
            dtype_orig = None

    if dtype_numeric:
        if (
            dtype_orig is not None
            and hasattr(dtype_orig, "kind")
            and dtype_orig.kind == "O"
        ):
            # if input is object, convert to float.
            dtype = xp.float64
        else:
            dtype = None

    if isinstance(dtype, (list, tuple)):
        if dtype_orig is not None and dtype_orig in dtype:
            # no dtype conversion required
            dtype = None
        else:
            # dtype conversion required. Let's select the first element of the
            # list of accepted types.
            dtype = dtype[0]

    if pandas_requires_conversion:
        # pandas dataframe requires conversion earlier to handle extension dtypes with
        # nans
        # Use the original dtype for conversion if dtype is None
        new_dtype = dtype_orig if dtype is None else dtype
        array = array.astype(new_dtype)
        # Since we converted here, we do not need to convert again later
        dtype = None

    if force_all_finite not in (True, False, "allow-nan"):
        raise ValueError(
            'force_all_finite should be a bool or "allow-nan". Got {!r} instead'.format(
                force_all_finite
            )
        )

    if dtype is not None and _is_numpy_namespace(xp):
        # convert to dtype object to conform to the Array API, so that we can
        # use `xp.isdtype` later
        dtype = np.dtype(dtype)

    estimator_name = _check_estimator_name(estimator)
    context = " by %s" % estimator_name if estimator is not None else ""

    # When all dataframe columns are sparse, convert to a sparse array
    if hasattr(array, "sparse") and array.ndim > 1:
        with suppress(ImportError):
            from pandas import SparseDtype  # noqa: F811

            def is_sparse(dtype):
                return isinstance(dtype, SparseDtype)

            if array.dtypes.apply(is_sparse).all():
                # DataFrame.sparse only supports `to_coo`
                array = array.sparse.to_coo()
                if array.dtype == np.dtype("object"):
                    unique_dtypes = set([dt.subtype.name for dt in array_orig.dtypes])
                    if len(unique_dtypes) > 1:
                        raise ValueError(
                            "Pandas DataFrame with mixed sparse extension arrays "
                            "generated a sparse matrix with object dtype which "
                            "can not be converted to a scipy sparse matrix. "
                            "Sparse extension arrays should all have the same "
                            "numeric type."
                        )

    if sp.issparse(array):
        _ensure_no_complex_data(array)
        array = _ensure_sparse_format(
            array,
            accept_sparse=accept_sparse,
            dtype=dtype,
            copy=copy,
            force_all_finite=force_all_finite,
            accept_large_sparse=accept_large_sparse,
            estimator_name=estimator_name,
            input_name=input_name,
        )
    else:
        # If np.array(..) gives ComplexWarning, then we convert the warning
        # to an error. This is needed because specifying a non complex
        # dtype to the function converts complex to real dtype,
        # thereby passing the test made in the lines following the scope
        # of warnings context manager.
        with warnings.catch_warnings():
            try:
                warnings.simplefilter("error", ComplexWarning)
                if dtype is not None and xp.isdtype(dtype, "integral"):
                    # Conversion float -> int should not contain NaN or
                    # inf (numpy#14412). We cannot use casting='safe' because
                    # then conversion float -> int would be disallowed.
                    array = _asarray_with_order(array, order=order, xp=xp)
                    if xp.isdtype(array.dtype, ("real floating", "complex floating")):
                        _assert_all_finite(
                            array,
                            allow_nan=False,
                            msg_dtype=dtype,
                            estimator_name=estimator_name,
                            input_name=input_name,
                        )
                    array = xp.astype(array, dtype, copy=False)
                else:
                    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            except ComplexWarning as complex_warning:
                raise ValueError(
                    "Complex data not supported\n{}\n".format(array)
                ) from complex_warning

        # It is possible that the np.array(..) gave no warning. This happens
        # when no dtype conversion happened, for example dtype = None. The
        # result is that np.array(..) produces an array of complex dtype
        # and we need to catch and raise exception for such cases.
        _ensure_no_complex_data(array)

        if ensure_2d:
            # If input is scalar raise error
            if array.ndim == 0:
                raise ValueError(
                    "Expected 2D array, got scalar array instead:\narray={}.\n"
                    "Reshape your data either using array.reshape(-1, 1) if "
                    "your data has a single feature or array.reshape(1, -1) "
                    "if it contains a single sample.".format(array)
                )
            # If input is 1D raise error
            if array.ndim == 1:
                raise ValueError(
                    "Expected 2D array, got 1D array instead:\narray={}.\n"
                    "Reshape your data either using array.reshape(-1, 1) if "
                    "your data has a single feature or array.reshape(1, -1) "
                    "if it contains a single sample.".format(array)
                )

        if dtype_numeric and hasattr(array.dtype, "kind") and array.dtype.kind in "USV":
            raise ValueError(
                "dtype='numeric' is not compatible with arrays of bytes/strings. "
                "Convert your data to numeric values explicitly instead."
            )
        if not allow_nd and array.ndim >= 3:
            raise ValueError(
                "Found array with dim %d. %s expected <= 2."
                % (array.ndim, estimator_name)
            )

        if force_all_finite:
            _assert_all_finite(
                array,
                input_name=input_name,
                estimator_name=estimator_name,
                allow_nan=force_all_finite == "allow-nan",
            )

        if copy:
            if _is_numpy_namespace(xp):
                # only make a copy if `array` and `array_orig` may share memory
                if np.may_share_memory(array, array_orig):
                    array = _asarray_with_order(
                        array, dtype=dtype, order=order, copy=True, xp=xp
                    )
            else:
                # always make a copy for non-numpy arrays
                array = _asarray_with_order(
                    array, dtype=dtype, order=order, copy=True, xp=xp
                )

    if ensure_min_samples > 0:
        n_samples = _num_samples(array)
        if n_samples < ensure_min_samples:
            raise ValueError(
                "Found array with %d sample(s) (shape=%s) while a"
                " minimum of %d is required%s."
                % (n_samples, array.shape, ensure_min_samples, context)
            )

    if ensure_min_features > 0 and array.ndim == 2:
        n_features = array.shape[1]
        if n_features < ensure_min_features:
            raise ValueError(
                "Found array with %d feature(s) (shape=%s) while"
                " a minimum of %d is required%s."
                % (n_features, array.shape, ensure_min_features, context)
            )

    return array


def _check_large_sparse(X, accept_large_sparse=False):
    """Raise a ValueError if X has 64bit indices and accept_large_sparse=False."""
    if not accept_large_sparse:
        supported_indices = ["int32"]
        if X.format == "coo":
            index_keys = ["col", "row"]
        elif X.format in ["csr", "csc", "bsr"]:
            index_keys = ["indices", "indptr"]
        else:
            return
        for key in index_keys:
            indices_datatype = getattr(X, key).dtype
            if indices_datatype not in supported_indices:
                raise ValueError(
                    "Only sparse matrices with 32-bit integer indices are accepted."
                    f" Got {indices_datatype} indices. Please do report a minimal"
                    " reproducer on scikit-learn issue tracker so that support for"
                    " your use-case can be studied by maintainers. See:"
                    " https://scikit-learn.org/dev/developers/minimal_reproducer.html"
                )


def check_X_y(
    X,
    y,
    accept_sparse=False,
    *,
    accept_large_sparse=True,
    dtype="numeric",
    order=None,
    copy=False,
    force_all_finite=True,
    ensure_2d=True,
    allow_nd=False,
    multi_output=False,
    ensure_min_samples=1,
    ensure_min_features=1,
    y_numeric=False,
    estimator=None,
):
    """Input validation for standard estimators.

    Checks X and y for consistent length, enforces X to be 2D and y 1D. By
    default, X is checked to be non-empty and containing only finite values.
    Standard input checks are also applied to y, such as checking that y
    does not have np.nan or np.inf targets. For multi-label y, set
    multi_output=True to allow 2D and sparse y. If the dtype of X is
    object, attempt converting to float, raising on failure.

    Parameters
    ----------
    X : {ndarray, list, sparse matrix}
        Input data.

    y : {ndarray, list, sparse matrix}
        Labels.

    accept_sparse : str, bool or list of str, default=False
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.

    accept_large_sparse : bool, default=True
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse will cause it to be accepted only
        if its indices are stored with a 32-bit dtype.

        .. versionadded:: 0.20

    dtype : 'numeric', type, list of type or None, default='numeric'
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.

    order : {'F', 'C'}, default=None
        Whether an array will be forced to be fortran or c-style. If
        `None`, then the input data's order is preserved when possible.

    copy : bool, default=False
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : bool or 'allow-nan', default=True
        Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter
        does not influence whether y can have np.inf, np.nan, pd.NA values.
        The possibilities are:

        - True: Force all values of X to be finite.
        - False: accepts np.inf, np.nan, pd.NA in X.
        - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot
          be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`.

    ensure_2d : bool, default=True
        Whether to raise a value error if X is not 2D.

    allow_nd : bool, default=False
        Whether to allow X.ndim > 2.

    multi_output : bool, default=False
        Whether to allow 2D y (array or sparse matrix). If false, y will be
        validated as a vector. y cannot have np.nan or np.inf values if
        multi_output=True.

    ensure_min_samples : int, default=1
        Make sure that X has a minimum number of samples in its first
        axis (rows for a 2D array).

    ensure_min_features : int, default=1
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when X has effectively 2 dimensions or
        is originally 1D and ``ensure_2d`` is True. Setting to 0 disables
        this check.

    y_numeric : bool, default=False
        Whether to ensure that y has a numeric type. If dtype of y is object,
        it is converted to float64. Should only be used for regression
        algorithms.

    estimator : str or estimator instance, default=None
        If passed, include the name of the estimator in warning messages.

    Returns
    -------
    X_converted : object
        The converted and validated X.

    y_converted : object
        The converted and validated y.
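
    Examples
    --------
    A minimal sketch; X is coerced to a 2D array and y to a 1D array:

    >>> from sklearn.utils.validation import check_X_y
    >>> X = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]
    >>> y = [1, 0, 1]
    >>> X, y = check_X_y(X, y)
    >>> X
    array([[1., 2.],
           [3., 4.],
           [5., 6.]])
    >>> y
    array([1, 0, 1])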
    """
    if y is None:
        if estimator is None:
            estimator_name = "estimator"
        else:
            estimator_name = _check_estimator_name(estimator)
        raise ValueError(
            f"{estimator_name} requires y to be passed, but the target y is None"
        )

    X = check_array(
        X,
        accept_sparse=accept_sparse,
        accept_large_sparse=accept_large_sparse,
        dtype=dtype,
        order=order,
        copy=copy,
        force_all_finite=force_all_finite,
        ensure_2d=ensure_2d,
        allow_nd=allow_nd,
        ensure_min_samples=ensure_min_samples,
        ensure_min_features=ensure_min_features,
        estimator=estimator,
        input_name="X",
    )

    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)

    check_consistent_length(X, y)

    return X, y


def _check_y(y, multi_output=False, y_numeric=False, estimator=None):
    """Isolated part of check_X_y dedicated to y validation."""
    if multi_output:
        y = check_array(
            y,
            accept_sparse="csr",
            force_all_finite=True,
            ensure_2d=False,
            dtype=None,
            input_name="y",
            estimator=estimator,
        )
    else:
        estimator_name = _check_estimator_name(estimator)
        y = column_or_1d(y, warn=True)
        _assert_all_finite(y, input_name="y", estimator_name=estimator_name)
        _ensure_no_complex_data(y)
    if y_numeric and y.dtype.kind == "O":
        y = y.astype(np.float64)

    return y


def column_or_1d(y, *, dtype=None, warn=False):
    """Ravel column or 1d numpy array, else raises an error.

    Parameters
    ----------
    y : array-like
        Input data.

    dtype : data-type, default=None
        Data type for `y`.

        .. versionadded:: 1.2

    warn : bool, default=False
        To control display of warnings.

    Returns
    -------
    y : ndarray
        Output data.

    Raises
    ------
    ValueError
        If `y` is not a 1D array or a 2D array with a single row or column.
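
    Examples
    --------
    A minimal sketch; a 1D input is returned as a contiguous 1D array:

    >>> from sklearn.utils.validation import column_or_1d
    >>> column_or_1d([1, 1])
    array([1, 1])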
    """
    xp, _ = get_namespace(y)
    y = check_array(
        y,
        ensure_2d=False,
        dtype=dtype,
        input_name="y",
        force_all_finite=False,
        ensure_min_samples=0,
    )

    shape = y.shape
    if len(shape) == 1:
        return _asarray_with_order(xp.reshape(y, (-1,)), order="C", xp=xp)
    if len(shape) == 2 and shape[1] == 1:
        if warn:
            warnings.warn(
                (
                    "A column-vector y was passed when a 1d array was"
                    " expected. Please change the shape of y to "
                    "(n_samples, ), for example using ravel()."
                ),
                DataConversionWarning,
                stacklevel=2,
            )
        return _asarray_with_order(xp.reshape(y, (-1,)), order="C", xp=xp)

    raise ValueError(
        "y should be a 1d array, got an array of shape {} instead.".format(shape)
    )


def check_random_state(seed):
    """Turn seed into a np.random.RandomState instance.

    Parameters
    ----------
    seed : None, int or instance of RandomState
        If seed is None, return the RandomState singleton used by np.random.
        If seed is an int, return a new RandomState instance seeded with seed.
        If seed is already a RandomState instance, return it.
        Otherwise raise ValueError.

    Returns
    -------
    :class:`numpy:numpy.random.RandomState`
        The random state object based on `seed` parameter.
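
    Examples
    --------
    An illustrative sketch of the accepted inputs:

    >>> import numpy as np
    >>> from sklearn.utils.validation import check_random_state
    >>> rng = check_random_state(42)
    >>> isinstance(rng, np.random.RandomState)
    True
    >>> check_random_state(rng) is rng
    True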
    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, numbers.Integral):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    raise ValueError(
        "%r cannot be used to seed a numpy.random.RandomState instance" % seed
    )


def has_fit_parameter(estimator, parameter):
    """Check whether the estimator's fit method supports the given parameter.

    Parameters
    ----------
    estimator : object
        An estimator to inspect.

    parameter : str
        The searched parameter.

    Returns
    -------
    is_parameter : bool
        Whether the parameter was found to be a named parameter of the
        estimator's fit method.

    Examples
    --------
    >>> from sklearn.svm import SVC
    >>> from sklearn.utils.validation import has_fit_parameter
    >>> has_fit_parameter(SVC(), "sample_weight")
    True
    """
    return parameter in signature(estimator.fit).parameters


def check_symmetric(array, *, tol=1e-10, raise_warning=True, raise_exception=False):
    """Make sure that array is 2D, square and symmetric.

    If the array is not symmetric, then a symmetrized version is returned.
    Optionally, a warning or exception is raised if the matrix is not
    symmetric.

    Parameters
    ----------
    array : {ndarray, sparse matrix}
        Input object to check / convert. Must be two-dimensional and square,
        otherwise a ValueError will be raised.

    tol : float, default=1e-10
        Absolute tolerance for equivalence of arrays.

    raise_warning : bool, default=True
        If True then raise a warning if conversion is required.

    raise_exception : bool, default=False
        If True then raise an exception if array is not symmetric.

    Returns
    -------
    array_sym : {ndarray, sparse matrix}
        Symmetrized version of the input array, i.e. the average of array
        and array.transpose(). If sparse, then duplicate entries are first
        summed and zeros are eliminated.
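
    Examples
    --------
    A minimal sketch; a symmetric input is returned unchanged:

    >>> import numpy as np
    >>> from sklearn.utils.validation import check_symmetric
    >>> check_symmetric(np.array([[0.0, 1.0], [1.0, 0.0]]))
    array([[0., 1.],
           [1., 0.]])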
    """
    if (array.ndim != 2) or (array.shape[0] != array.shape[1]):
        raise ValueError(
            "array must be 2-dimensional and square. shape = {0}".format(array.shape)
        )

    if sp.issparse(array):
        diff = array - array.T
        # only csr, csc, and coo have `data` attribute
        if diff.format not in ["csr", "csc", "coo"]:
            diff = diff.tocsr()
        symmetric = np.all(abs(diff.data) < tol)
    else:
        symmetric = np.allclose(array, array.T, atol=tol)

    if not symmetric:
        if raise_exception:
            raise ValueError("Array must be symmetric")
        if raise_warning:
            warnings.warn(
                (
                    "Array is not symmetric, and will be converted "
                    "to symmetric by average with its transpose."
                ),
                stacklevel=2,
            )
        if sp.issparse(array):
            conversion = "to" + array.format
            array = getattr(0.5 * (array + array.T), conversion)()
        else:
            array = 0.5 * (array + array.T)

    return array


def _is_fitted(estimator, attributes=None, all_or_any=all):
    """Determine if an estimator is fitted.

    Parameters
    ----------
    estimator : estimator instance
        Estimator instance for which the check is performed.

    attributes : str, list or tuple of str, default=None
        Attribute name(s) given as string or a list/tuple of strings
        Eg.: ``["coef_", "estimator_", ...], "coef_"``

        If `None`, `estimator` is considered fitted if there exists an
        attribute that ends with an underscore and does not start with a
        double underscore.

    all_or_any : callable, {all, any}, default=all
        Specify whether all or any of the given attributes must exist.

    Returns
    -------
    fitted : bool
        Whether the estimator is fitted.
    """
    if attributes is not None:
        if not isinstance(attributes, (list, tuple)):
            attributes = [attributes]
        return all_or_any([hasattr(estimator, attr) for attr in attributes])

    if hasattr(estimator, "__sklearn_is_fitted__"):
        return estimator.__sklearn_is_fitted__()

    fitted_attrs = [
        v for v in vars(estimator) if v.endswith("_") and not v.startswith("__")
    ]
    return len(fitted_attrs) > 0


def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all):
    """Perform is_fitted validation for estimator.

    Checks if the estimator is fitted by verifying the presence of
    fitted attributes (ending with a trailing underscore) and otherwise
    raises a NotFittedError with the given message.

    If an estimator does not set any attributes with a trailing underscore, it
    can define a ``__sklearn_is_fitted__`` method returning a boolean to
    specify if the estimator is fitted or not. See
    :ref:`sphx_glr_auto_examples_developing_estimators_sklearn_is_fitted.py`
    for an example on how to use the API.

    Parameters
    ----------
    estimator : estimator instance
        Estimator instance for which the check is performed.

    attributes : str, list or tuple of str, default=None
        Attribute name(s) given as string or a list/tuple of strings
        Eg.: ``["coef_", "estimator_", ...], "coef_"``

        If `None`, `estimator` is considered fitted if there exists an
        attribute that ends with an underscore and does not start with a
        double underscore.

    msg : str, default=None
        The default error message is, "This %(name)s instance is not fitted
        yet. Call 'fit' with appropriate arguments before using this
        estimator."

        For custom messages if "%(name)s" is present in the message string,
        it is substituted for the estimator name.

        Eg. : "Estimator, %(name)s, must be fitted before sparsifying".

    all_or_any : callable, {all, any}, default=all
        Specify whether all or any of the given attributes must exist.

    Raises
    ------
    TypeError
        If the estimator is a class or not an estimator instance.

    NotFittedError
        If the attributes are not found.
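
    Examples
    --------
    An illustrative sketch with an unfitted and then a fitted estimator:

    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.exceptions import NotFittedError
    >>> from sklearn.utils.validation import check_is_fitted
    >>> lr = LogisticRegression()
    >>> try:
    ...     check_is_fitted(lr)
    ... except NotFittedError:
    ...     print("Model is not fitted yet.")
    Model is not fitted yet.
    >>> lr.fit([[1, 2], [3, 4]], [0, 1])
    LogisticRegression()
    >>> check_is_fitted(lr)  # passes silently once fitted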
    """
    if isclass(estimator):
        raise TypeError("{} is a class, not an instance.".format(estimator))
    if msg is None:
        msg = (
            "This %(name)s instance is not fitted yet. Call 'fit' with "
            "appropriate arguments before using this estimator."
        )

    if not hasattr(estimator, "fit"):
        raise TypeError("%s is not an estimator instance." % (estimator))

    if not _is_fitted(estimator, attributes, all_or_any):
        raise NotFittedError(msg % {"name": type(estimator).__name__})


def check_non_negative(X, whom):
    """
    Check if there is any negative value in an array.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Input data.

    whom : str
        Who passed X to this function.
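
    Examples
    --------
    A minimal sketch of the passing and failing cases:

    >>> import numpy as np
    >>> from sklearn.utils.validation import check_non_negative
    >>> check_non_negative(np.array([[1, 2], [3, 4]]), "MyEstimator")  # no error
    >>> try:
    ...     check_non_negative(np.array([[-1, 2]]), "MyEstimator")
    ... except ValueError as e:
    ...     print(e)
    Negative values in data passed to MyEstimator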
    """
    xp, _ = get_namespace(X)
    # avoid X.min() on sparse matrix since it also sorts the indices
    if sp.issparse(X):
        if X.format in ["lil", "dok"]:
            X = X.tocsr()
        if X.data.size == 0:
            X_min = 0
        else:
            X_min = X.data.min()
    else:
        X_min = xp.min(X)

    if X_min < 0:
        raise ValueError("Negative values in data passed to %s" % whom)


def check_scalar(
    x,
    name,
    target_type,
    *,
    min_val=None,
    max_val=None,
    include_boundaries="both",
):
    """Validate scalar parameters type and value.

    Parameters
    ----------
    x : object
        The scalar parameter to validate.

    name : str
        The name of the parameter to be printed in error messages.

    target_type : type or tuple
        Acceptable data types for the parameter.

    min_val : float or int, default=None
        The minimum valid value the parameter can take. If None (default) it
        is implied that the parameter does not have a lower bound.

    max_val : float or int, default=None
        The maximum valid value the parameter can take. If None (default) it
        is implied that the parameter does not have an upper bound.

    include_boundaries : {"left", "right", "both", "neither"}, default="both"
        Whether the interval defined by `min_val` and `max_val` should include
        the boundaries. Possible choices are:

        - `"left"`: only `min_val` is included in the valid interval.
          It is equivalent to the interval `[ min_val, max_val )`.
        - `"right"`: only `max_val` is included in the valid interval.
          It is equivalent to the interval `( min_val, max_val ]`.
        - `"both"`: `min_val` and `max_val` are included in the valid interval.
          It is equivalent to the interval `[ min_val, max_val ]`.
        - `"neither"`: neither `min_val` nor `max_val` are included in the
          valid interval. It is equivalent to the interval `( min_val, max_val )`.

    Returns
    -------
    x : numbers.Number
        The validated number.

    Raises
    ------
    TypeError
        If the parameter's type does not match the desired type.

    ValueError
        If the parameter's value violates the given bounds.
        If `min_val`, `max_val` and `include_boundaries` are inconsistent.
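
    Examples
    --------
    A minimal sketch; the validated value is returned unchanged:

    >>> from sklearn.utils.validation import check_scalar
    >>> check_scalar(10, "x", int, min_val=1, max_val=20)
    10
    >>> try:
    ...     check_scalar(0, "x", int, min_val=1)
    ... except ValueError as e:
    ...     print(e)
    x == 0, must be >= 1.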
1577 """
1578
1579 def type_name(t):
1580 """Convert type into humman readable string."""
1581 module = t.__module__
1582 qualname = t.__qualname__
1583 if module == "builtins":
1584 return qualname
1585 elif t == numbers.Real:
1586 return "float"
1587 elif t == numbers.Integral:
1588 return "int"
1589 return f"{module}.{qualname}"
1590
1591 if not isinstance(x, target_type):
1592 if isinstance(target_type, tuple):
1593 types_str = ", ".join(type_name(t) for t in target_type)
1594 target_type_str = f"{{{types_str}}}"
1595 else:
1596 target_type_str = type_name(target_type)
1597
1598 raise TypeError(
1599 f"{name} must be an instance of {target_type_str}, not"
1600 f" {type(x).__qualname__}."
1601 )
1602
1603 expected_include_boundaries = ("left", "right", "both", "neither")
1604 if include_boundaries not in expected_include_boundaries:
1605 raise ValueError(
1606 f"Unknown value for `include_boundaries`: {repr(include_boundaries)}. "
1607 f"Possible values are: {expected_include_boundaries}."
1608 )
1609
1610 if max_val is None and include_boundaries == "right":
1611 raise ValueError(
1612 "`include_boundaries`='right' without specifying explicitly `max_val` "
1613 "is inconsistent."
1614 )
1615
1616 if min_val is None and include_boundaries == "left":
1617 raise ValueError(
1618 "`include_boundaries`='left' without specifying explicitly `min_val` "
1619 "is inconsistent."
1620 )
1621
1622 comparison_operator = (
1623 operator.lt if include_boundaries in ("left", "both") else operator.le
1624 )
1625 if min_val is not None and comparison_operator(x, min_val):
1626 raise ValueError(
1627 f"{name} == {x}, must be"
1628 f" {'>=' if include_boundaries in ('left', 'both') else '>'} {min_val}."
1629 )
1630
1631 comparison_operator = (
1632 operator.gt if include_boundaries in ("right", "both") else operator.ge
1633 )
1634 if max_val is not None and comparison_operator(x, max_val):
1635 raise ValueError(
1636 f"{name} == {x}, must be"
1637 f" {'<=' if include_boundaries in ('right', 'both') else '<'} {max_val}."
1638 )
1639
1640 return x
1641
1642
def _check_psd_eigenvalues(lambdas, enable_warnings=False):
    """Check the eigenvalues of a positive semidefinite (PSD) matrix.

    Checks the provided array of PSD matrix eigenvalues for numerical or
    conditioning issues and returns a fixed validated version. This method
    should typically be used if the PSD matrix is user-provided (e.g. a
    Gram matrix) or computed using a user-provided dissimilarity metric
    (e.g. kernel function), or if the decomposition process uses approximation
    methods (randomized SVD, etc.).

    It checks for three things:

    - that there are no significant imaginary parts in eigenvalues (more than
      1e-5 times the maximum real part). If this check fails, it raises a
      ``ValueError``. Otherwise all non-significant imaginary parts that may
      remain are set to zero. This operation is traced with a
      ``PositiveSpectrumWarning`` when ``enable_warnings=True``.

    - that eigenvalues are not all negative. If this check fails, it raises a
      ``ValueError``.

    - that there are no significant negative eigenvalues with absolute value
      more than 1e-10 (1e-6) and more than 1e-5 (5e-3) times the largest
      positive eigenvalue in double (single) precision. If this check fails,
      it raises a ``ValueError``. Otherwise all negative eigenvalues that may
      remain are set to zero. This operation is traced with a
      ``PositiveSpectrumWarning`` when ``enable_warnings=True``.

    Finally, all the positive eigenvalues that are too small (with a value
    smaller than the maximum eigenvalue multiplied by 1e-12 (2e-7)) are set to
    zero. This operation is traced with a ``PositiveSpectrumWarning`` when
    ``enable_warnings=True``.

    Parameters
    ----------
    lambdas : array-like of shape (n_eigenvalues,)
        Array of eigenvalues to check / fix.

    enable_warnings : bool, default=False
        When this is set to ``True``, a ``PositiveSpectrumWarning`` will be
        raised when there are imaginary parts, negative eigenvalues, or
        extremely small non-zero eigenvalues. Otherwise no warning will be
        raised. In both cases, imaginary parts, negative eigenvalues, and
        extremely small non-zero eigenvalues will be set to zero.

    Returns
    -------
    lambdas_fixed : ndarray of shape (n_eigenvalues,)
        A fixed validated copy of the array of eigenvalues.

    Examples
    --------
    >>> from sklearn.utils.validation import _check_psd_eigenvalues
    >>> _check_psd_eigenvalues([1, 2])  # nominal case
    array([1, 2])
    >>> _check_psd_eigenvalues([5, 5j])  # significant imag part
    Traceback (most recent call last):
        ...
    ValueError: There are significant imaginary parts in eigenvalues (1
        of the maximum real part). Either the matrix is not PSD, or there was
        an issue while computing the eigendecomposition of the matrix.
    >>> _check_psd_eigenvalues([5, 5e-5j])  # insignificant imag part
    array([5., 0.])
    >>> _check_psd_eigenvalues([-5, -1])  # all negative
    Traceback (most recent call last):
        ...
    ValueError: All eigenvalues are negative (maximum is -1). Either the
        matrix is not PSD, or there was an issue while computing the
        eigendecomposition of the matrix.
    >>> _check_psd_eigenvalues([5, -1])  # significant negative
    Traceback (most recent call last):
        ...
    ValueError: There are significant negative eigenvalues (0.2 of the
        maximum positive). Either the matrix is not PSD, or there was an issue
        while computing the eigendecomposition of the matrix.
    >>> _check_psd_eigenvalues([5, -5e-5])  # insignificant negative
    array([5., 0.])
    >>> _check_psd_eigenvalues([5, 4e-12])  # bad conditioning (too small)
    array([5., 0.])

    """

    lambdas = np.array(lambdas)
    is_double_precision = lambdas.dtype == np.float64

    # note: the minimum value available is
    # - single-precision: np.finfo('float32').eps = 1.2e-07
    # - double-precision: np.finfo('float64').eps = 2.2e-16

    # The various thresholds used for validation; we may wish to adjust them
    # according to the precision.
    significant_imag_ratio = 1e-5
    significant_neg_ratio = 1e-5 if is_double_precision else 5e-3
    significant_neg_value = 1e-10 if is_double_precision else 1e-6
    small_pos_ratio = 1e-12 if is_double_precision else 2e-7

    # Check that there are no significant imaginary parts
    if not np.isreal(lambdas).all():
        max_imag_abs = np.abs(np.imag(lambdas)).max()
        max_real_abs = np.abs(np.real(lambdas)).max()
        if max_imag_abs > significant_imag_ratio * max_real_abs:
            raise ValueError(
                "There are significant imaginary parts in eigenvalues (%g "
                "of the maximum real part). Either the matrix is not PSD, or "
                "there was an issue while computing the eigendecomposition "
                "of the matrix." % (max_imag_abs / max_real_abs)
            )

        # warn about imaginary parts being removed
        if enable_warnings:
            warnings.warn(
                "There are imaginary parts in eigenvalues (%g "
                "of the maximum real part). Either the matrix is not"
                " PSD, or there was an issue while computing the "
                "eigendecomposition of the matrix. Only the real "
                "parts will be kept." % (max_imag_abs / max_real_abs),
                PositiveSpectrumWarning,
            )

    # Remove all imaginary parts (even if zero)
    lambdas = np.real(lambdas)

    # Check that there are no significant negative eigenvalues
    max_eig = lambdas.max()
    if max_eig < 0:
        raise ValueError(
            "All eigenvalues are negative (maximum is %g). "
            "Either the matrix is not PSD, or there was an "
            "issue while computing the eigendecomposition of "
            "the matrix." % max_eig
        )

    else:
        min_eig = lambdas.min()
        if (
            min_eig < -significant_neg_ratio * max_eig
            and min_eig < -significant_neg_value
        ):
            raise ValueError(
                "There are significant negative eigenvalues (%g"
                " of the maximum positive). Either the matrix is "
                "not PSD, or there was an issue while computing "
                "the eigendecomposition of the matrix." % (-min_eig / max_eig)
            )
        elif min_eig < 0:
            # Remove all negative values and warn about it
            if enable_warnings:
                warnings.warn(
                    "There are negative eigenvalues (%g of the "
                    "maximum positive). Either the matrix is not "
                    "PSD, or there was an issue while computing the"
                    " eigendecomposition of the matrix. Negative "
                    "eigenvalues will be replaced with 0." % (-min_eig / max_eig),
                    PositiveSpectrumWarning,
                )
            lambdas[lambdas < 0] = 0

    # Check for conditioning (small positive non-zeros)
    too_small_lambdas = (0 < lambdas) & (lambdas < small_pos_ratio * max_eig)
    if too_small_lambdas.any():
        if enable_warnings:
            warnings.warn(
                "Badly conditioned PSD matrix spectrum: the largest "
                "eigenvalue is more than %g times the smallest. "
                "Small eigenvalues will be replaced with 0."
                "" % (1 / small_pos_ratio),
                PositiveSpectrumWarning,
            )
        lambdas[too_small_lambdas] = 0

    return lambdas


def _check_sample_weight(
    sample_weight, X, dtype=None, copy=False, only_non_negative=False
):
    """Validate sample weights.

    Note that passing sample_weight=None will output an array of ones.
    Therefore, in some cases, you may want to protect the call with:
    if sample_weight is not None:
        sample_weight = _check_sample_weight(...)

    Parameters
    ----------
    sample_weight : {ndarray, Number or None}, shape (n_samples,)
        Input sample weights.

    X : {ndarray, list, sparse matrix}
        Input data.

    dtype : dtype, default=None
        dtype of the validated `sample_weight`.
        If None, and the input `sample_weight` is an array, the dtype of the
        input is preserved; otherwise an array with the default numpy dtype
        is allocated. If `dtype` is not one of `float32`, `float64`, `None`,
        the output will be of dtype `float64`.

    copy : bool, default=False
        If True, a copy of sample_weight will be created.

    only_non_negative : bool, default=False
        Whether or not the weights are expected to be non-negative.

        .. versionadded:: 1.0

    Returns
    -------
    sample_weight : ndarray of shape (n_samples,)
        Validated sample weight. It is guaranteed to be "C" contiguous.
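
    Examples
    --------
    A minimal sketch of the three input paths (`None`, scalar, array); the
    outputs shown assume NumPy's default print options:

    >>> import numpy as np
    >>> from sklearn.utils.validation import _check_sample_weight
    >>> X = np.array([[1, 2], [3, 4], [5, 6]])
    >>> _check_sample_weight(None, X)  # None becomes uniform weights
    array([1., 1., 1.])
    >>> _check_sample_weight(2.0, X)  # a scalar is broadcast to n_samples
    array([2., 2., 2.])
    >>> _check_sample_weight(np.array([0.5, 1.0, 1.5]), X)
    array([0.5, 1. , 1.5])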
    """
    n_samples = _num_samples(X)

    if dtype is not None and dtype not in [np.float32, np.float64]:
        dtype = np.float64

    if sample_weight is None:
        sample_weight = np.ones(n_samples, dtype=dtype)
    elif isinstance(sample_weight, numbers.Number):
        sample_weight = np.full(n_samples, sample_weight, dtype=dtype)
    else:
        if dtype is None:
            dtype = [np.float64, np.float32]
        sample_weight = check_array(
            sample_weight,
            accept_sparse=False,
            ensure_2d=False,
            dtype=dtype,
            order="C",
            copy=copy,
            input_name="sample_weight",
        )
        if sample_weight.ndim != 1:
            raise ValueError("Sample weights must be 1D array or scalar")

        if sample_weight.shape != (n_samples,):
            raise ValueError(
                "sample_weight.shape == {}, expected {}!".format(
                    sample_weight.shape, (n_samples,)
                )
            )

    if only_non_negative:
        check_non_negative(sample_weight, "`sample_weight`")

    return sample_weight


def _allclose_dense_sparse(x, y, rtol=1e-7, atol=1e-9):
    """Check allclose for sparse and dense data.

    Both x and y need to be either sparse or dense, they
    can't be mixed.

    Parameters
    ----------
    x : {array-like, sparse matrix}
        First array to compare.

    y : {array-like, sparse matrix}
        Second array to compare.

    rtol : float, default=1e-7
        Relative tolerance; see numpy.allclose.

    atol : float, default=1e-9
        Absolute tolerance; see numpy.allclose. Note that the default here is
        more tolerant than the default for numpy.testing.assert_allclose, where
        atol=0.

    Returns
    -------
    bool
        True if the two arrays are equal within the given tolerances.
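
    Examples
    --------
    A small illustration of the dense and sparse code paths (assuming SciPy's
    ``csr_matrix``; the outputs follow from the implementation below):

    >>> import numpy as np
    >>> from scipy import sparse
    >>> from sklearn.utils.validation import _allclose_dense_sparse
    >>> x = np.array([[1.0, 0.0], [0.0, 2.0]])
    >>> _allclose_dense_sparse(x, x + 1e-10)  # dense vs. dense
    True
    >>> _allclose_dense_sparse(sparse.csr_matrix(x), sparse.csr_matrix(x))
    True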
    """
    if sp.issparse(x) and sp.issparse(y):
        x = x.tocsr()
        y = y.tocsr()
        x.sum_duplicates()
        y.sum_duplicates()
        return (
            np.array_equal(x.indices, y.indices)
            and np.array_equal(x.indptr, y.indptr)
            and np.allclose(x.data, y.data, rtol=rtol, atol=atol)
        )
    elif not sp.issparse(x) and not sp.issparse(y):
        return np.allclose(x, y, rtol=rtol, atol=atol)
    raise ValueError(
        "Can only compare two sparse matrices, not a sparse matrix and an array"
    )


def _check_response_method(estimator, response_method):
    """Check if `response_method` is available in estimator and return it.

    .. versionadded:: 1.3

    Parameters
    ----------
    estimator : estimator instance
        Classifier or regressor to check.

    response_method : {"predict_proba", "predict_log_proba", "decision_function",
            "predict"} or list of such str
        Specifies the response method used to get predictions from an estimator
        (i.e. :term:`predict_proba`, :term:`predict_log_proba`,
        :term:`decision_function` or :term:`predict`). Possible choices are:

        - if `str`, it corresponds to the name of the method to return;
        - if a list of `str`, it provides the method names in order of
          preference. The method returned is the first one in the list that
          is implemented by `estimator`.

    Returns
    -------
    prediction_method : callable
        Prediction method of estimator.

    Raises
    ------
    AttributeError
        If `response_method` is not available in `estimator`.
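
    Examples
    --------
    A sketch of the preference-order lookup, assuming `LogisticRegression`
    (which exposes both methods as attributes even before fitting):

    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.utils.validation import _check_response_method
    >>> method = _check_response_method(
    ...     LogisticRegression(), ["decision_function", "predict"]
    ... )
    >>> method.__name__
    'decision_function'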
    """
    if isinstance(response_method, str):
        list_methods = [response_method]
    else:
        list_methods = response_method

    prediction_method = [getattr(estimator, method, None) for method in list_methods]
    prediction_method = reduce(lambda x, y: x or y, prediction_method)
    if prediction_method is None:
        raise AttributeError(
            f"{estimator.__class__.__name__} has none of the following attributes: "
            f"{', '.join(list_methods)}."
        )

    return prediction_method


def _check_method_params(X, params, indices=None):
    """Check and validate the parameters passed to a specific
    method like `fit`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data array.

    params : dict
        Dictionary containing the parameters passed to the method.

    indices : array-like of shape (n_samples,), default=None
        Indices to be selected if the parameter has the same size as `X`.

    Returns
    -------
    method_params_validated : dict
        Validated parameters. We ensure that the values support indexing.
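
    Examples
    --------
    A small sketch: a parameter aligned with the samples of `X` is indexed,
    while anything else is passed through unchanged (names are illustrative):

    >>> import numpy as np
    >>> from sklearn.utils.validation import _check_method_params
    >>> X = np.array([[1, 2], [3, 4], [5, 6]])
    >>> params = {"sample_weight": np.array([1.0, 2.0, 3.0]), "classes": [0, 1]}
    >>> validated = _check_method_params(X, params, indices=[0, 2])
    >>> validated["sample_weight"]
    array([1., 3.])
    >>> validated["classes"]  # length differs from n_samples: pass-through
    [0, 1]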
    """
    from . import _safe_indexing

    method_params_validated = {}
    for param_key, param_value in params.items():
        if not _is_arraylike(param_value) or _num_samples(param_value) != _num_samples(
            X
        ):
            # Non-indexable pass-through (for now for backward-compatibility).
            # https://github.com/scikit-learn/scikit-learn/issues/15805
            method_params_validated[param_key] = param_value
        else:
            # Any other method_params should support indexing
            # (e.g. for cross-validation).
            method_params_validated[param_key] = _make_indexable(param_value)
            method_params_validated[param_key] = _safe_indexing(
                method_params_validated[param_key], indices
            )

    return method_params_validated


def _is_pandas_df(X):
    """Return True if X is a pandas dataframe."""
    if hasattr(X, "columns") and hasattr(X, "iloc"):
        # Likely a pandas DataFrame, we explicitly check the type to confirm.
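        # Look up sys.modules rather than importing pandas: if pandas has not
        # been imported by the caller, X cannot be a pandas DataFrame.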
        try:
            pd = sys.modules["pandas"]
        except KeyError:
            return False
        return isinstance(X, pd.DataFrame)
    return False


def _is_polars_df(X):
    """Return True if X is a polars dataframe."""
    if hasattr(X, "columns") and hasattr(X, "schema"):
        # Likely a polars DataFrame, we explicitly check the type to confirm.
        try:
            pl = sys.modules["polars"]
        except KeyError:
            return False
        return isinstance(X, pl.DataFrame)
    return False


def _get_feature_names(X):
    """Get feature names from X.

    Support for other array containers should be implemented here.

    Parameters
    ----------
    X : {ndarray, dataframe} of shape (n_samples, n_features)
        Array container to extract feature names.

        - pandas dataframe : The columns will be considered to be feature
          names. If the dataframe contains non-string feature names, `None` is
          returned.
        - All other array containers will return `None`.

    Returns
    -------
    names : ndarray or None
        Feature names of `X`. Unrecognized array containers will return `None`.
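
    Examples
    --------
    A brief sketch, assuming pandas is installed (plain ndarrays carry no
    feature names, so `None` is returned for them):

    >>> import numpy as np
    >>> import pandas as pd
    >>> from sklearn.utils.validation import _get_feature_names
    >>> _get_feature_names(pd.DataFrame({"a": [1], "b": [2]}))
    array(['a', 'b'], dtype=object)
    >>> _get_feature_names(np.array([[1, 2]])) is None
    True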
    """
    feature_names = None

    # extract feature names for supported array containers
    if _is_pandas_df(X):
        # Make sure we can inspect column names from pandas, even with
        # versions too old to expose a working implementation of
        # __dataframe__.column_names() and avoid introducing any
        # additional copy.
        # TODO: remove the pandas-specific branch once the minimum supported
        # version of pandas has a working implementation of
        # __dataframe__.column_names() that is guaranteed to not introduce any
        # additional copy of the data without having to impose allow_copy=False
        # that could fail with other libraries. Note: in the longer term, we
        # could decide to instead rely on the __dataframe_namespace__ API once
        # adopted by our minimally supported pandas version.
        feature_names = np.asarray(X.columns, dtype=object)
    elif hasattr(X, "__dataframe__"):
        df_protocol = X.__dataframe__()
        feature_names = np.asarray(list(df_protocol.column_names()), dtype=object)

    if feature_names is None or len(feature_names) == 0:
        return

    types = sorted(t.__qualname__ for t in set(type(v) for v in feature_names))

    # mixed type of string and non-string is not supported
    if len(types) > 1 and "str" in types:
        raise TypeError(
            "Feature names are only supported if all input features have string names, "
            f"but your input has {types} as feature name / column name types. "
            "If you want feature names to be stored and validated, you must convert "
            "them all to strings, by using X.columns = X.columns.astype(str) for "
            "example. Otherwise you can remove feature / column names from your input "
            "data, or convert them all to a non-string data type."
        )

    # Only feature names of all strings are supported
    if len(types) == 1 and types[0] == "str":
        return feature_names


def _check_feature_names_in(estimator, input_features=None, *, generate_names=True):
    """Check `input_features` and generate names if needed.

    Commonly used in :term:`get_feature_names_out`.

    Parameters
    ----------
    estimator : estimator instance
        Estimator to check the feature names against.

    input_features : array-like of str or None, default=None
        Input features.

        - If `input_features` is `None`, then `feature_names_in_` is
          used as feature names in. If `feature_names_in_` is not defined,
          then the following input feature names are generated:
          `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
        - If `input_features` is an array-like, then `input_features` must
          match `feature_names_in_` if `feature_names_in_` is defined.

    generate_names : bool, default=True
        Whether to generate names when `input_features` is `None` and
        `estimator.feature_names_in_` is not defined. This is useful for
        transformers that validate `input_features` but do not require them in
        :term:`get_feature_names_out` e.g. `PCA`.

    Returns
    -------
    feature_names_in : ndarray of str or `None`
        Feature names in.
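
    Examples
    --------
    A minimal sketch using a hypothetical stand-in object that only carries
    the fitted `n_features_in_` attribute:

    >>> from sklearn.utils.validation import _check_feature_names_in
    >>> class _Fitted:  # hypothetical, stands in for a fitted estimator
    ...     n_features_in_ = 3
    >>> _check_feature_names_in(_Fitted())
    array(['x0', 'x1', 'x2'], dtype=object)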
    """

    feature_names_in_ = getattr(estimator, "feature_names_in_", None)
    n_features_in_ = getattr(estimator, "n_features_in_", None)

    if input_features is not None:
        input_features = np.asarray(input_features, dtype=object)
        if feature_names_in_ is not None and not np.array_equal(
            feature_names_in_, input_features
        ):
            raise ValueError("input_features is not equal to feature_names_in_")

        if n_features_in_ is not None and len(input_features) != n_features_in_:
            raise ValueError(
                "input_features should have length equal to number of "
                f"features ({n_features_in_}), got {len(input_features)}"
            )
        return input_features

    if feature_names_in_ is not None:
        return feature_names_in_

    if not generate_names:
        return

    # Generates feature names if `n_features_in_` is defined
    if n_features_in_ is None:
        raise ValueError("Unable to generate feature names without n_features_in_")

    return np.asarray([f"x{i}" for i in range(n_features_in_)], dtype=object)


def _generate_get_feature_names_out(estimator, n_features_out, input_features=None):
    """Generate feature names out for estimator using the estimator name as the prefix.

    The `input_features` names are validated but not used. This function is
    useful for estimators that generate their own names based on
    `n_features_out`, e.g. `PCA`.

    Parameters
    ----------
    estimator : estimator instance
        Estimator producing output feature names.

    n_features_out : int
        Number of feature names out.

    input_features : array-like of str or None, default=None
        Only used to validate feature names with `estimator.feature_names_in_`.

    Returns
    -------
    feature_names_out : ndarray of str
        Feature names out, prefixed by the lowercased estimator class name.
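
    Examples
    --------
    A short illustration with an (unfitted) `PCA` instance; the prefix comes
    from the class name:

    >>> from sklearn.decomposition import PCA
    >>> from sklearn.utils.validation import _generate_get_feature_names_out
    >>> _generate_get_feature_names_out(PCA(), 2)
    array(['pca0', 'pca1'], dtype=object)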
    """
    _check_feature_names_in(estimator, input_features, generate_names=False)
    estimator_name = estimator.__class__.__name__.lower()
    return np.asarray(
        [f"{estimator_name}{i}" for i in range(n_features_out)], dtype=object
    )


def _check_monotonic_cst(estimator, monotonic_cst=None):
    """Check the monotonic constraints and return the corresponding array.

    This helper function should be used in the `fit` method of an estimator
    that supports monotonic constraints and called after the estimator has
    introspected input data to set the `n_features_in_` and optionally the
    `feature_names_in_` attributes.

    .. versionadded:: 1.2

    Parameters
    ----------
    estimator : estimator instance

    monotonic_cst : array-like of int, dict of str or None, default=None
        Monotonic constraints for the features.

        - If array-like, then it should contain only -1, 0 or 1. Each value
          will be checked to be in [-1, 0, 1]. A value of -1 requires the
          corresponding feature to be monotonically decreasing, 1 requires it
          to be monotonically increasing, and 0 imposes no constraint.
        - If dict, then the keys should be the feature names occurring in
          `estimator.feature_names_in_` and the values should be -1, 0 or 1.
        - If None, then an array of 0s will be allocated.

    Returns
    -------
    monotonic_cst : ndarray of int
        Monotonic constraints for each feature.
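
    Examples
    --------
    A sketch of the dict code path, using a hypothetical stand-in for a
    fitted estimator; features missing from the dict default to 0:

    >>> import numpy as np
    >>> from sklearn.utils.validation import _check_monotonic_cst
    >>> class _Fitted:  # hypothetical fitted-estimator stand-in
    ...     n_features_in_ = 3
    ...     feature_names_in_ = np.array(["a", "b", "c"], dtype=object)
    >>> _check_monotonic_cst(_Fitted(), {"a": 1, "c": -1})
    array([ 1,  0, -1], dtype=int8)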
    """
    original_monotonic_cst = monotonic_cst
    if monotonic_cst is None or isinstance(monotonic_cst, dict):
        monotonic_cst = np.full(
            shape=estimator.n_features_in_,
            fill_value=0,
            dtype=np.int8,
        )
        if isinstance(original_monotonic_cst, dict):
            if not hasattr(estimator, "feature_names_in_"):
                raise ValueError(
                    f"{estimator.__class__.__name__} was not fitted on data "
                    "with feature names. Pass monotonic_cst as an integer "
                    "array instead."
                )
            unexpected_feature_names = list(
                set(original_monotonic_cst) - set(estimator.feature_names_in_)
            )
            unexpected_feature_names.sort()  # deterministic error message
            n_unexpected = len(unexpected_feature_names)
            if unexpected_feature_names:
                if len(unexpected_feature_names) > 5:
                    unexpected_feature_names = unexpected_feature_names[:5]
                    unexpected_feature_names.append("...")
                raise ValueError(
                    f"monotonic_cst contains {n_unexpected} unexpected feature "
                    f"names: {unexpected_feature_names}."
                )
            for feature_idx, feature_name in enumerate(estimator.feature_names_in_):
                if feature_name in original_monotonic_cst:
                    cst = original_monotonic_cst[feature_name]
                    if cst not in [-1, 0, 1]:
                        raise ValueError(
                            f"monotonic_cst['{feature_name}'] must be either "
                            f"-1, 0 or 1. Got {cst!r}."
                        )
                    monotonic_cst[feature_idx] = cst
    else:
        unexpected_cst = np.setdiff1d(monotonic_cst, [-1, 0, 1])
        if unexpected_cst.shape[0]:
            raise ValueError(
                "monotonic_cst must be an array-like of -1, 0 or 1. Observed "
                f"values: {unexpected_cst.tolist()}."
            )

        monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8)
        if monotonic_cst.shape[0] != estimator.n_features_in_:
            raise ValueError(
                f"monotonic_cst has shape {monotonic_cst.shape} but the input data "
                f"X has {estimator.n_features_in_} features."
            )
    return monotonic_cst


def _check_pos_label_consistency(pos_label, y_true):
    """Check if `pos_label` needs to be specified or not.

    In binary classification, we fix `pos_label=1` if the labels are in the set
    {-1, 1} or {0, 1}. Otherwise, we raise an error asking to specify the
    `pos_label` parameter.

    Parameters
    ----------
    pos_label : int, float, bool, str or None
        The positive label.
    y_true : ndarray of shape (n_samples,)
        The target vector.

    Returns
    -------
    pos_label : int, float, bool or str
        If `pos_label` can be inferred, it will be returned.

    Raises
    ------
    ValueError
        If `pos_label` is not specified and `y_true` does not have labels
        in {-1, 1} or {0, 1}.
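
    Examples
    --------
    A brief sketch of the two outcomes when `pos_label` is left unspecified
    versus given explicitly:

    >>> import numpy as np
    >>> from sklearn.utils.validation import _check_pos_label_consistency
    >>> _check_pos_label_consistency(None, np.array([0, 1, 1]))  # inferred
    1
    >>> _check_pos_label_consistency("yes", np.array(["yes", "no"]))
    'yes'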
    """
    # ensure binary classification if pos_label is not specified
    # classes.dtype.kind in ('O', 'U', 'S') is required to avoid
    # triggering a FutureWarning by calling np.array_equal(a, b)
    # when elements in the two arrays are not comparable.
    classes = np.unique(y_true)
    if pos_label is None and (
        classes.dtype.kind in "OUS"
        or not (
            np.array_equal(classes, [0, 1])
            or np.array_equal(classes, [-1, 1])
            or np.array_equal(classes, [0])
            or np.array_equal(classes, [-1])
            or np.array_equal(classes, [1])
        )
    ):
        classes_repr = ", ".join([repr(c) for c in classes.tolist()])
        raise ValueError(
            f"y_true takes value in {{{classes_repr}}} and pos_label is not "
            "specified: either make y_true take value in {0, 1} or "
            "{-1, 1} or pass pos_label explicitly."
        )
    elif pos_label is None:
        pos_label = 1

    return pos_label