1"""
2The :mod:`sklearn.utils.validation` module includes functions to validate
3input and parameters within scikit-learn estimators.
4"""
5
6# Authors: Olivier Grisel
7# Gael Varoquaux
8# Andreas Mueller
9# Lars Buitinck
10# Alexandre Gramfort
11# Nicolas Tresegnie
12# Sylvain Marie
13# License: BSD 3 clause
14
15import numbers
16import operator
17import sys
18import warnings
19from contextlib import suppress
20from functools import reduce, wraps
21from inspect import Parameter, isclass, signature
22
23import joblib
24import numpy as np
25import scipy.sparse as sp
26
27from .. import get_config as _get_config
28from ..exceptions import DataConversionWarning, NotFittedError, PositiveSpectrumWarning
29from ..utils._array_api import _asarray_with_order, _is_numpy_namespace, get_namespace
30from ..utils.fixes import ComplexWarning, _preserve_dia_indices_dtype
31from ._isfinite import FiniteStatus, cy_isfinite
32from .fixes import _object_dtype_isnan
33
34FLOAT_DTYPES = (np.float64, np.float32, np.float16)
35
36
37# This function is not used anymore at this moment in the code base but we keep it in
38# case that we merge a new public function without kwarg only by mistake, which would
39# require a deprecation cycle to fix.
40def _deprecate_positional_args(func=None, *, version="1.3"):
41 """Decorator for methods that issues warnings for positional arguments.
42
43 Using the keyword-only argument syntax in pep 3102, arguments after the
44 * will issue a warning when passed as a positional argument.
45
46 Parameters
47 ----------
48 func : callable, default=None
49 Function to check arguments on.
50 version : callable, default="1.3"
51 The version when positional arguments will result in error.
52 """
53
54 def _inner_deprecate_positional_args(f):
55 sig = signature(f)
56 kwonly_args = []
57 all_args = []
58
59 for name, param in sig.parameters.items():
60 if param.kind == Parameter.POSITIONAL_OR_KEYWORD:
61 all_args.append(name)
62 elif param.kind == Parameter.KEYWORD_ONLY:
63 kwonly_args.append(name)
64
65 @wraps(f)
66 def inner_f(*args, **kwargs):
67 extra_args = len(args) - len(all_args)
68 if extra_args <= 0:
69 return f(*args, **kwargs)
70
71 # extra_args > 0
72 args_msg = [
73 "{}={}".format(name, arg)
74 for name, arg in zip(kwonly_args[:extra_args], args[-extra_args:])
75 ]
76 args_msg = ", ".join(args_msg)
77 warnings.warn(
78 (
79 f"Pass {args_msg} as keyword args. From version "
80 f"{version} passing these as positional arguments "
81 "will result in an error"
82 ),
83 FutureWarning,
84 )
85 kwargs.update(zip(sig.parameters, args))
86 return f(**kwargs)
87
88 return inner_f
89
90 if func is not None:
91 return _inner_deprecate_positional_args(func)
92
93 return _inner_deprecate_positional_args
94
95
96def _assert_all_finite(
97 X, allow_nan=False, msg_dtype=None, estimator_name=None, input_name=""
98):
99 """Like assert_all_finite, but only for ndarray."""
100
101 xp, _ = get_namespace(X)
102
103 if _get_config()["assume_finite"]:
104 return
105
106 X = xp.asarray(X)
107
108 # for object dtype data, we only check for NaNs (GH-13254)
109 if X.dtype == np.dtype("object") and not allow_nan:
110 if _object_dtype_isnan(X).any():
111 raise ValueError("Input contains NaN")
112
113 # We need only consider float arrays, hence can early return for all else.
114 if not xp.isdtype(X.dtype, ("real floating", "complex floating")):
115 return
116
117 # First try an O(n) time, O(1) space solution for the common case that
118 # everything is finite; fall back to O(n) space `np.isinf/isnan` or custom
119 # Cython implementation to prevent false positives and provide a detailed
120 # error message.
121 with np.errstate(over="ignore"):
122 first_pass_isfinite = xp.isfinite(xp.sum(X))
123 if first_pass_isfinite:
124 return
125
126 _assert_all_finite_element_wise(
127 X,
128 xp=xp,
129 allow_nan=allow_nan,
130 msg_dtype=msg_dtype,
131 estimator_name=estimator_name,
132 input_name=input_name,
133 )
134
135
136def _assert_all_finite_element_wise(
137 X, *, xp, allow_nan, msg_dtype=None, estimator_name=None, input_name=""
138):
139 # Cython implementation doesn't support FP16 or complex numbers
140 use_cython = (
141 xp is np and X.data.contiguous and X.dtype.type in {np.float32, np.float64}
142 )
143 if use_cython:
144 out = cy_isfinite(X.reshape(-1), allow_nan=allow_nan)
145 has_nan_error = False if allow_nan else out == FiniteStatus.has_nan
146 has_inf = out == FiniteStatus.has_infinite
147 else:
148 has_inf = xp.any(xp.isinf(X))
149 has_nan_error = False if allow_nan else xp.any(xp.isnan(X))
150 if has_inf or has_nan_error:
151 if has_nan_error:
152 type_err = "NaN"
153 else:
154 msg_dtype = msg_dtype if msg_dtype is not None else X.dtype
155 type_err = f"infinity or a value too large for {msg_dtype!r}"
156 padded_input_name = input_name + " " if input_name else ""
157 msg_err = f"Input {padded_input_name}contains {type_err}."
158 if estimator_name and input_name == "X" and has_nan_error:
159 # Improve the error message on how to handle missing values in
160 # scikit-learn.
161 msg_err += (
162 f"\n{estimator_name} does not accept missing values"
163 " encoded as NaN natively. For supervised learning, you might want"
164 " to consider sklearn.ensemble.HistGradientBoostingClassifier and"
165 " Regressor which accept missing values encoded as NaNs natively."
166 " Alternatively, it is possible to preprocess the data, for"
167 " instance by using an imputer transformer in a pipeline or drop"
168 " samples with missing values. See"
169 " https://scikit-learn.org/stable/modules/impute.html"
170 " You can find a list of all estimators that handle NaN values"
171 " at the following page:"
172 " https://scikit-learn.org/stable/modules/impute.html"
173 "#estimators-that-handle-nan-values"
174 )
175 raise ValueError(msg_err)
176
177
178def assert_all_finite(
179 X,
180 *,
181 allow_nan=False,
182 estimator_name=None,
183 input_name="",
184):
185 """Throw a ValueError if X contains NaN or infinity.
186
187 Parameters
188 ----------
189 X : {ndarray, sparse matrix}
190 The input data.
191
192 allow_nan : bool, default=False
193 If True, do not throw error when `X` contains NaN.
194
195 estimator_name : str, default=None
196 The estimator name, used to construct the error message.
197
198 input_name : str, default=""
199 The data name used to construct the error message. In particular
200 if `input_name` is "X" and the data has NaN values and
201 allow_nan is False, the error message will link to the imputer
202 documentation.
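
    Examples
    --------
    A minimal sketch of the expected behavior (illustrative values):

    >>> import numpy as np
    >>> from sklearn.utils import assert_all_finite
    >>> assert_all_finite(np.array([1.0, 2.0, 3.0]))  # finite: passes silently
    >>> try:
    ...     assert_all_finite(np.array([1.0, np.nan, 3.0]))
    ... except ValueError as exc:
    ...     print(exc)
    Input contains NaN.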
    """
    _assert_all_finite(
        X.data if sp.issparse(X) else X,
        allow_nan=allow_nan,
        estimator_name=estimator_name,
        input_name=input_name,
    )


def as_float_array(X, *, copy=True, force_all_finite=True):
    """Convert an array-like to an array of floats.

    The new dtype will be np.float32 or np.float64, depending on the original
    type. The function can create a copy or modify the argument depending
    on the argument `copy`.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        The input data.

    copy : bool, default=True
        If True, a copy of X will be created. If False, a copy may still be
        returned if X's dtype is not a floating point type.

    force_all_finite : bool or 'allow-nan', default=True
        Whether to raise an error on np.inf, np.nan, pd.NA in X. The
        possibilities are:

        - True: Force all values of X to be finite.
        - False: accepts np.inf, np.nan, pd.NA in X.
        - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot
          be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`.

    Returns
    -------
    XT : {ndarray, sparse matrix}
        An array of type float.
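
    Examples
    --------
    An illustrative sketch; integers of at most 32 bits are upcast to
    ``np.float32``, wider integers to ``np.float64``:

    >>> import numpy as np
    >>> from sklearn.utils import as_float_array
    >>> as_float_array(np.array([0, 1, 2], dtype=np.int64))
    array([0., 1., 2.])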
    """
    if isinstance(X, np.matrix) or (
        not isinstance(X, np.ndarray) and not sp.issparse(X)
    ):
        return check_array(
            X,
            accept_sparse=["csr", "csc", "coo"],
            dtype=np.float64,
            copy=copy,
            force_all_finite=force_all_finite,
            ensure_2d=False,
        )
    elif sp.issparse(X) and X.dtype in [np.float32, np.float64]:
        return X.copy() if copy else X
    elif X.dtype in [np.float32, np.float64]:  # is numpy array
        return X.copy("F" if X.flags["F_CONTIGUOUS"] else "C") if copy else X
    else:
        if X.dtype.kind in "uib" and X.dtype.itemsize <= 4:
            return_dtype = np.float32
        else:
            return_dtype = np.float64
        return X.astype(return_dtype)


def _is_arraylike(x):
    """Returns whether the input is array-like."""
    return hasattr(x, "__len__") or hasattr(x, "shape") or hasattr(x, "__array__")


def _is_arraylike_not_scalar(array):
    """Return True if array is array-like and not a scalar."""
    return _is_arraylike(array) and not np.isscalar(array)


def _use_interchange_protocol(X):
    """Use interchange protocol for non-pandas dataframes that follow the protocol.

    Note: at this point we chose not to use the interchange API on pandas
    dataframes to ensure strict behavioral backward compatibility with older
    versions of scikit-learn.
    """
    return not _is_pandas_df(X) and hasattr(X, "__dataframe__")


def _num_features(X):
    """Return the number of features in an array-like X.

    This helper function tries hard to avoid materializing an array version
    of X unless necessary. For instance, if X is a list of lists,
    this function will return the length of the first element, assuming
    that subsequent elements are all lists of the same length without
    checking.

    Parameters
    ----------
    X : array-like
        array-like to get the number of features.

    Returns
    -------
    features : int
        Number of features.
    """
    type_ = type(X)
    if type_.__module__ == "builtins":
        type_name = type_.__qualname__
    else:
        type_name = f"{type_.__module__}.{type_.__qualname__}"
    message = f"Unable to find the number of features from X of type {type_name}"
    if not hasattr(X, "__len__") and not hasattr(X, "shape"):
        if not hasattr(X, "__array__"):
            raise TypeError(message)
        # Only convert X to a numpy array if there is no cheaper, heuristic
        # option.
        X = np.asarray(X)

    if hasattr(X, "shape"):
        if not hasattr(X.shape, "__len__") or len(X.shape) <= 1:
            message += f" with shape {X.shape}"
            raise TypeError(message)
        return X.shape[1]

    first_sample = X[0]

    # Do not consider an array-like of strings or dicts to be a 2D array
    if isinstance(first_sample, (str, bytes, dict)):
        message += f" where the samples are of type {type(first_sample).__qualname__}"
        raise TypeError(message)

    try:
        # If X is a list of lists, for instance, we assume that all nested
        # lists have the same length without checking or converting to
        # a numpy array to keep this function call as cheap as possible.
        return len(first_sample)
    except Exception as err:
        raise TypeError(message) from err


def _num_samples(x):
    """Return the number of samples in array-like x."""
    message = "Expected sequence or array-like, got %s" % type(x)
    if hasattr(x, "fit") and callable(x.fit):
        # Don't get num_samples from an ensemble's length!
        raise TypeError(message)

    if _use_interchange_protocol(x):
        return x.__dataframe__().num_rows()

    if not hasattr(x, "__len__") and not hasattr(x, "shape"):
        if hasattr(x, "__array__"):
            x = np.asarray(x)
        else:
            raise TypeError(message)

    if hasattr(x, "shape") and x.shape is not None:
        if len(x.shape) == 0:
            raise TypeError(
                "Singleton array %r cannot be considered a valid collection." % x
            )
        # Check that shape is returning an integer or default to len
        # Dask dataframes may not return numeric shape[0] value
        if isinstance(x.shape[0], numbers.Integral):
            return x.shape[0]

    try:
        return len(x)
    except TypeError as type_error:
        raise TypeError(message) from type_error


def check_memory(memory):
    """Check that ``memory`` is joblib.Memory-like.

    joblib.Memory-like means that ``memory`` can be converted into a
    joblib.Memory instance (typically a str denoting the ``location``)
    or has the same interface (has a ``cache`` method).

    Parameters
    ----------
    memory : None, str or object with the joblib.Memory interface
        - If string, the location where to create the `joblib.Memory` interface.
        - If None, no caching is done and the Memory object is completely transparent.

    Returns
    -------
    memory : object with the joblib.Memory interface
        A correct joblib.Memory object.

    Raises
    ------
    ValueError
        If ``memory`` is not joblib.Memory-like.
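
    Examples
    --------
    A minimal sketch; `None` yields a transparent (no-op) `joblib.Memory`:

    >>> from sklearn.utils.validation import check_memory
    >>> memory = check_memory(None)
    >>> hasattr(memory, "cache")
    True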
    """
    if memory is None or isinstance(memory, str):
        memory = joblib.Memory(location=memory, verbose=0)
    elif not hasattr(memory, "cache"):
        raise ValueError(
            "'memory' should be None, a string or have the same"
            " interface as joblib.Memory."
            " Got memory='{}' instead.".format(memory)
        )
    return memory


def check_consistent_length(*arrays):
    """Check that all arrays have consistent first dimensions.

    Checks whether all objects in arrays have the same shape or length.

    Parameters
    ----------
    *arrays : list or tuple of input objects.
        Objects that will be checked for consistent length.
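
    Examples
    --------
    An illustrative sketch of the passing and failing cases:

    >>> from sklearn.utils.validation import check_consistent_length
    >>> check_consistent_length([1, 2, 3], [4, 5, 6])  # same length: no error
    >>> try:
    ...     check_consistent_length([1, 2], [1, 2, 3])
    ... except ValueError as e:
    ...     print(e)
    Found input variables with inconsistent numbers of samples: [2, 3]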
    """

    lengths = [_num_samples(X) for X in arrays if X is not None]
    uniques = np.unique(lengths)
    if len(uniques) > 1:
        raise ValueError(
            "Found input variables with inconsistent numbers of samples: %r"
            % [int(length) for length in lengths]
        )


def _make_indexable(iterable):
    """Ensure iterable supports indexing or convert to an indexable variant.

    Convert sparse matrices to csr and other non-indexable iterables to arrays.
    Let `None` and indexable objects (e.g. pandas dataframes) pass unchanged.

    Parameters
    ----------
    iterable : {list, dataframe, ndarray, sparse matrix} or None
        Object to be converted to an indexable iterable.
    """
    if sp.issparse(iterable):
        return iterable.tocsr()
    elif hasattr(iterable, "__getitem__") or hasattr(iterable, "iloc"):
        return iterable
    elif iterable is None:
        return iterable
    return np.array(iterable)


def indexable(*iterables):
    """Make arrays indexable for cross-validation.

    Checks consistent length, passes through None, and ensures that everything
    can be indexed by converting sparse matrices to csr and converting
    non-iterable objects to arrays.

    Parameters
    ----------
    *iterables : {lists, dataframes, ndarrays, sparse matrices}
        List of objects to ensure sliceability.

    Returns
    -------
    result : list of {ndarray, sparse matrix, dataframe} or None
        Returns a list containing indexable arrays (i.e. NumPy array,
        sparse matrix, or dataframe) or `None`.
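
    Examples
    --------
    A minimal sketch; a sparse matrix is converted to CSR, while already
    indexable inputs pass through unchanged:

    >>> import numpy as np
    >>> from scipy.sparse import csr_matrix
    >>> from sklearn.utils import indexable
    >>> X, y = indexable(csr_matrix(np.eye(3)), [0, 1, 2])
    >>> X.format
    'csr'
    >>> y
    [0, 1, 2]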
    """

    result = [_make_indexable(X) for X in iterables]
    check_consistent_length(*result)
    return result


def _ensure_sparse_format(
    sparse_container,
    accept_sparse,
    dtype,
    copy,
    force_all_finite,
    accept_large_sparse,
    estimator_name=None,
    input_name="",
):
    """Convert a sparse container to a given format.

    Checks the sparse format of `sparse_container` and converts if necessary.

    Parameters
    ----------
    sparse_container : sparse matrix or array
        Input to validate and convert.

    accept_sparse : str, bool or list/tuple of str
        String[s] representing allowed sparse matrix formats ('csc',
        'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'). If the input is sparse but
        not in the allowed format, it will be converted to the first listed
        format. True allows the input to be any format. False means
        that a sparse matrix input will raise an error.

    dtype : str, type or None
        Data type of result. If None, the dtype of the input is preserved.

    copy : bool
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : bool or 'allow-nan'
        Whether to raise an error on np.inf, np.nan, pd.NA in X. The
        possibilities are:

        - True: Force all values of X to be finite.
        - False: accepts np.inf, np.nan, pd.NA in X.
        - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot
          be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`.

    estimator_name : str, default=None
        The estimator name, used to construct the error message.

    input_name : str, default=""
        The data name used to construct the error message. In particular
        if `input_name` is "X" and the data has NaN values and
        allow_nan is False, the error message will link to the imputer
        documentation.

    Returns
    -------
    sparse_container_converted : sparse matrix or array
        Sparse container (matrix/array) that is ensured to have an allowed type.
    """
    if dtype is None:
        dtype = sparse_container.dtype

    changed_format = False
    sparse_container_type_name = type(sparse_container).__name__

    if isinstance(accept_sparse, str):
        accept_sparse = [accept_sparse]

    # Indices dtype validation
    _check_large_sparse(sparse_container, accept_large_sparse)

    if accept_sparse is False:
        padded_input = " for " + input_name if input_name else ""
        raise TypeError(
            f"Sparse data was passed{padded_input}, but dense data is required. "
            "Use '.toarray()' to convert to a dense numpy array."
        )
    elif isinstance(accept_sparse, (list, tuple)):
        if len(accept_sparse) == 0:
            raise ValueError(
                "When providing 'accept_sparse' as a tuple or list, it must contain at "
                "least one string value."
            )
        # ensure correct sparse format
        if sparse_container.format not in accept_sparse:
            # create new with correct sparse
            sparse_container = sparse_container.asformat(accept_sparse[0])
            changed_format = True
    elif accept_sparse is not True:
        # any other type
        raise ValueError(
            "Parameter 'accept_sparse' should be a string, boolean or list of strings."
            f" You provided 'accept_sparse={accept_sparse}'."
        )

    if dtype != sparse_container.dtype:
        # convert dtype
        sparse_container = sparse_container.astype(dtype)
    elif copy and not changed_format:
        # force copy
        sparse_container = sparse_container.copy()

    if force_all_finite:
        if not hasattr(sparse_container, "data"):
            warnings.warn(
                f"Can't check {sparse_container.format} sparse matrix for nan or inf.",
                stacklevel=2,
            )
        else:
            _assert_all_finite(
                sparse_container.data,
                allow_nan=force_all_finite == "allow-nan",
                estimator_name=estimator_name,
                input_name=input_name,
            )

    # TODO: Remove when the minimum version of SciPy supported is 1.12
    # With SciPy sparse arrays, conversion from DIA format to COO, CSR, or BSR
    # triggers the use of `np.int64` indices even if the data is such that it could
    # be more efficiently represented with `np.int32` indices.
    # https://github.com/scipy/scipy/issues/19245 Since not all scikit-learn
    # algorithms support large indices, the following code downcasts to `np.int32`
    # indices when it's safe to do so.
    if changed_format:
        # accept_sparse is specified to a specific format and a conversion occurred
        requested_sparse_format = accept_sparse[0]
        _preserve_dia_indices_dtype(
            sparse_container, sparse_container_type_name, requested_sparse_format
        )

    return sparse_container


def _ensure_no_complex_data(array):
    if (
        hasattr(array, "dtype")
        and array.dtype is not None
        and hasattr(array.dtype, "kind")
        and array.dtype.kind == "c"
    ):
        raise ValueError("Complex data not supported\n{}\n".format(array))


def _check_estimator_name(estimator):
    if estimator is not None:
        if isinstance(estimator, str):
            return estimator
        else:
            return estimator.__class__.__name__
    return None


def _pandas_dtype_needs_early_conversion(pd_dtype):
    """Return True if pandas extension pd_dtype needs to be converted early."""
    # Check these early for pandas versions without extension dtypes
    from pandas import SparseDtype
    from pandas.api.types import (
        is_bool_dtype,
        is_float_dtype,
        is_integer_dtype,
    )

    if is_bool_dtype(pd_dtype):
        # bool and extension booleans need early conversion because __array__
        # converts mixed dtype dataframes into object dtypes
        return True

    if isinstance(pd_dtype, SparseDtype):
        # Sparse arrays will be converted later in `check_array`
        return False

    try:
        from pandas.api.types import is_extension_array_dtype
    except ImportError:
        return False

    if not is_extension_array_dtype(pd_dtype):
        # Only handle extension arrays for integers and floats; sparse dtypes
        # were already handled above.
        return False
    elif is_float_dtype(pd_dtype):
        # Float ndarrays can normally support nans. They need to be converted
        # first to map pd.NA to np.nan
        return True
    elif is_integer_dtype(pd_dtype):
        # XXX: Warn when converting from a high integer to a float
        return True

    return False


def _is_extension_array_dtype(array):
    # Pandas extension arrays have a dtype with an na_value
    return hasattr(array, "dtype") and hasattr(array.dtype, "na_value")


def check_array(
    array,
    accept_sparse=False,
    *,
    accept_large_sparse=True,
    dtype="numeric",
    order=None,
    copy=False,
    force_all_finite=True,
    ensure_2d=True,
    allow_nd=False,
    ensure_min_samples=1,
    ensure_min_features=1,
    estimator=None,
    input_name="",
):
    """Input validation on an array, list, sparse matrix or similar.

    By default, the input is checked to be a non-empty 2D array containing
    only finite values. If the dtype of the array is object, attempt
    converting to float, raising on failure.

    Parameters
    ----------
    array : object
        Input object to check / convert.

    accept_sparse : str, bool or list/tuple of str, default=False
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.

    accept_large_sparse : bool, default=True
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse=False will cause it to be accepted
        only if its indices are stored with a 32-bit dtype.

        .. versionadded:: 0.20

    dtype : 'numeric', type, list of type or None, default='numeric'
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.

    order : {'F', 'C'} or None, default=None
        Whether an array will be forced to be fortran or c-style.
        When order is None (default), then if copy=False, nothing is ensured
        about the memory layout of the output array; otherwise (copy=True)
        the memory layout of the returned array is kept as close as possible
        to the original array.

    copy : bool, default=False
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : bool or 'allow-nan', default=True
        Whether to raise an error on np.inf, np.nan, pd.NA in array. The
        possibilities are:

        - True: Force all values of array to be finite.
        - False: accepts np.inf, np.nan, pd.NA in array.
        - 'allow-nan': accepts only np.nan and pd.NA values in array. Values
          cannot be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`.

    ensure_2d : bool, default=True
        Whether to raise a value error if array is not 2D.

    allow_nd : bool, default=False
        Whether to allow array.ndim > 2.

    ensure_min_samples : int, default=1
        Make sure that the array has a minimum number of samples in its first
        axis (rows for a 2D array). Setting to 0 disables this check.

    ensure_min_features : int, default=1
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when the input data has effectively 2
        dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
        disables this check.

    estimator : str or estimator instance, default=None
        If passed, include the name of the estimator in warning messages.

    input_name : str, default=""
        The data name used to construct the error message. In particular
        if `input_name` is "X" and the data has NaN values and
        allow_nan is False, the error message will link to the imputer
        documentation.

        .. versionadded:: 1.1.0

    Returns
    -------
    array_converted : object
        The converted and validated array.
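
    Examples
    --------
    A minimal sketch; a nested list is validated and converted to a 2D
    ndarray:

    >>> from sklearn.utils import check_array
    >>> X = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
    >>> check_array(X)
    array([[1., 2., 3.],
           [4., 5., 6.]])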
    """
    if isinstance(array, np.matrix):
        raise TypeError(
            "np.matrix is not supported. Please convert to a numpy array with "
            "np.asarray. For more information see: "
            "https://numpy.org/doc/stable/reference/generated/numpy.matrix.html"
        )

    xp, is_array_api_compliant = get_namespace(array)

    # store reference to original array to check if copy is needed when
    # function returns
    array_orig = array

    # store whether originally we wanted numeric dtype
    dtype_numeric = isinstance(dtype, str) and dtype == "numeric"

    dtype_orig = getattr(array, "dtype", None)
    if not is_array_api_compliant and not hasattr(dtype_orig, "kind"):
        # not a data type (e.g. a column named dtype in a pandas DataFrame)
        dtype_orig = None

    # check if the object contains several dtypes (typically a pandas
    # DataFrame), and store them. If not, store None.
    dtypes_orig = None
    pandas_requires_conversion = False
    if hasattr(array, "dtypes") and hasattr(array.dtypes, "__array__"):
        # throw warning if columns are sparse. If all columns are sparse, then
        # array.sparse exists and sparsity will be preserved (later).
        with suppress(ImportError):
            from pandas import SparseDtype

            def is_sparse(dtype):
                return isinstance(dtype, SparseDtype)

            if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
                warnings.warn(
                    "pandas.DataFrame with sparse columns found. "
                    "It will be converted to a dense numpy array."
                )

        dtypes_orig = list(array.dtypes)
        pandas_requires_conversion = any(
            _pandas_dtype_needs_early_conversion(i) for i in dtypes_orig
        )
        if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig):
            dtype_orig = np.result_type(*dtypes_orig)
        elif pandas_requires_conversion and any(d == object for d in dtypes_orig):
            # Force object if any of the dtypes is an object
            dtype_orig = object

    elif (_is_extension_array_dtype(array) or hasattr(array, "iloc")) and hasattr(
        array, "dtype"
    ):
        # array is a pandas series
        pandas_requires_conversion = _pandas_dtype_needs_early_conversion(array.dtype)
        if isinstance(array.dtype, np.dtype):
            dtype_orig = array.dtype
        else:
            # Set to None to let array.astype work out the best dtype
            dtype_orig = None

    if dtype_numeric:
        if (
            dtype_orig is not None
            and hasattr(dtype_orig, "kind")
            and dtype_orig.kind == "O"
        ):
            # if input is object, convert to float.
            dtype = xp.float64
        else:
            dtype = None

    if isinstance(dtype, (list, tuple)):
        if dtype_orig is not None and dtype_orig in dtype:
            # no dtype conversion required
            dtype = None
        else:
            # dtype conversion required. Let's select the first element of the
            # list of accepted types.
            dtype = dtype[0]

    if pandas_requires_conversion:
        # pandas dataframe requires conversion earlier to handle extension dtypes with
        # nans
        # Use the original dtype for conversion if dtype is None
        new_dtype = dtype_orig if dtype is None else dtype
        array = array.astype(new_dtype)
        # Since we converted here, we do not need to convert again later
        dtype = None

    if force_all_finite not in (True, False, "allow-nan"):
        raise ValueError(
            'force_all_finite should be a bool or "allow-nan". Got {!r} instead'.format(
                force_all_finite
            )
        )

    if dtype is not None and _is_numpy_namespace(xp):
        # convert to dtype object to conform to the Array API, so that we can
        # use `xp.isdtype` later
        dtype = np.dtype(dtype)

    estimator_name = _check_estimator_name(estimator)
    context = " by %s" % estimator_name if estimator is not None else ""

    # When all dataframe columns are sparse, convert to a sparse array
    if hasattr(array, "sparse") and array.ndim > 1:
        with suppress(ImportError):
            from pandas import SparseDtype  # noqa: F811

            def is_sparse(dtype):
                return isinstance(dtype, SparseDtype)

            if array.dtypes.apply(is_sparse).all():
                # DataFrame.sparse only supports `to_coo`
                array = array.sparse.to_coo()
                if array.dtype == np.dtype("object"):
                    unique_dtypes = set([dt.subtype.name for dt in array_orig.dtypes])
                    if len(unique_dtypes) > 1:
                        raise ValueError(
                            "Pandas DataFrame with mixed sparse extension arrays "
                            "generated a sparse matrix with object dtype which "
                            "can not be converted to a scipy sparse matrix. "
                            "Sparse extension arrays should all have the same "
                            "numeric type."
                        )

    if sp.issparse(array):
        _ensure_no_complex_data(array)
        array = _ensure_sparse_format(
            array,
            accept_sparse=accept_sparse,
            dtype=dtype,
            copy=copy,
            force_all_finite=force_all_finite,
            accept_large_sparse=accept_large_sparse,
            estimator_name=estimator_name,
            input_name=input_name,
        )
    else:
        # If np.array(..) gives ComplexWarning, then we convert the warning
        # to an error. This is needed because specifying a non complex
        # dtype to the function converts complex to real dtype,
        # thereby passing the test made in the lines following the scope
        # of warnings context manager.
        with warnings.catch_warnings():
            try:
                warnings.simplefilter("error", ComplexWarning)
                if dtype is not None and xp.isdtype(dtype, "integral"):
                    # Conversion float -> int should not contain NaN or
                    # inf (numpy#14412). We cannot use casting='safe' because
                    # then conversion float -> int would be disallowed.
                    array = _asarray_with_order(array, order=order, xp=xp)
                    if xp.isdtype(array.dtype, ("real floating", "complex floating")):
                        _assert_all_finite(
                            array,
                            allow_nan=False,
                            msg_dtype=dtype,
                            estimator_name=estimator_name,
                            input_name=input_name,
                        )
                    array = xp.astype(array, dtype, copy=False)
                else:
                    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            except ComplexWarning as complex_warning:
                raise ValueError(
                    "Complex data not supported\n{}\n".format(array)
                ) from complex_warning

        # It is possible that the np.array(..) gave no warning. This happens
        # when no dtype conversion happened, for example dtype = None. The
        # result is that np.array(..) produces an array of complex dtype
        # and we need to catch and raise exception for such cases.
        _ensure_no_complex_data(array)

        if ensure_2d:
            # If input is scalar raise error
            if array.ndim == 0:
                raise ValueError(
                    "Expected 2D array, got scalar array instead:\narray={}.\n"
                    "Reshape your data either using array.reshape(-1, 1) if "
                    "your data has a single feature or array.reshape(1, -1) "
                    "if it contains a single sample.".format(array)
                )
            # If input is 1D raise error
            if array.ndim == 1:
                raise ValueError(
                    "Expected 2D array, got 1D array instead:\narray={}.\n"
                    "Reshape your data either using array.reshape(-1, 1) if "
                    "your data has a single feature or array.reshape(1, -1) "
                    "if it contains a single sample.".format(array)
                )

        if dtype_numeric and hasattr(array.dtype, "kind") and array.dtype.kind in "USV":
            raise ValueError(
                "dtype='numeric' is not compatible with arrays of bytes/strings. "
                "Convert your data to numeric values explicitly instead."
            )
        if not allow_nd and array.ndim >= 3:
            raise ValueError(
                "Found array with dim %d. %s expected <= 2."
                % (array.ndim, estimator_name)
            )

        if force_all_finite:
            _assert_all_finite(
                array,
                input_name=input_name,
                estimator_name=estimator_name,
                allow_nan=force_all_finite == "allow-nan",
            )

        if copy:
            if _is_numpy_namespace(xp):
                # only make a copy if `array` and `array_orig` may share memory
                if np.may_share_memory(array, array_orig):
                    array = _asarray_with_order(
                        array, dtype=dtype, order=order, copy=True, xp=xp
                    )
            else:
                # always make a copy for non-numpy arrays
                array = _asarray_with_order(
                    array, dtype=dtype, order=order, copy=True, xp=xp
                )

    if ensure_min_samples > 0:
        n_samples = _num_samples(array)
        if n_samples < ensure_min_samples:
            raise ValueError(
                "Found array with %d sample(s) (shape=%s) while a"
                " minimum of %d is required%s."
                % (n_samples, array.shape, ensure_min_samples, context)
            )

    if ensure_min_features > 0 and array.ndim == 2:
        n_features = array.shape[1]
        if n_features < ensure_min_features:
            raise ValueError(
                "Found array with %d feature(s) (shape=%s) while"
                " a minimum of %d is required%s."
                % (n_features, array.shape, ensure_min_features, context)
            )

    return array


def _check_large_sparse(X, accept_large_sparse=False):
    """Raise a ValueError if X has 64bit indices and accept_large_sparse=False."""
    if not accept_large_sparse:
        supported_indices = ["int32"]
        if X.format == "coo":
            index_keys = ["col", "row"]
        elif X.format in ["csr", "csc", "bsr"]:
            index_keys = ["indices", "indptr"]
        else:
            return
        for key in index_keys:
            indices_datatype = getattr(X, key).dtype
            if indices_datatype not in supported_indices:
                raise ValueError(
                    "Only sparse matrices with 32-bit integer indices are accepted."
                    f" Got {indices_datatype} indices. Please do report a minimal"
                    " reproducer on scikit-learn issue tracker so that support for"
                    " your use-case can be studied by maintainers. See:"
                    " https://scikit-learn.org/dev/developers/minimal_reproducer.html"
                )


def check_X_y(
    X,
    y,
    accept_sparse=False,
    *,
    accept_large_sparse=True,
    dtype="numeric",
    order=None,
    copy=False,
    force_all_finite=True,
    ensure_2d=True,
    allow_nd=False,
    multi_output=False,
    ensure_min_samples=1,
    ensure_min_features=1,
    y_numeric=False,
    estimator=None,
):
    """Input validation for standard estimators.

    Checks X and y for consistent length, enforces X to be 2D and y 1D. By
    default, X is checked to be non-empty and containing only finite values.
    Standard input checks are also applied to y, such as checking that y
    does not have np.nan or np.inf targets. For multi-label y, set
    multi_output=True to allow 2D and sparse y. If the dtype of X is
    object, attempt converting to float, raising on failure.

    Parameters
    ----------
    X : {ndarray, list, sparse matrix}
        Input data.

    y : {ndarray, list, sparse matrix}
        Labels.

    accept_sparse : str, bool or list of str, default=False
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.

    accept_large_sparse : bool, default=True
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse will cause it to be accepted only
        if its indices are stored with a 32-bit dtype.

        .. versionadded:: 0.20

    dtype : 'numeric', type, list of type or None, default='numeric'
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.

    order : {'F', 'C'}, default=None
        Whether an array will be forced to be fortran or c-style. If
        `None`, then the input data's order is preserved when possible.

    copy : bool, default=False
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : bool or 'allow-nan', default=True
        Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter
        does not influence whether y can have np.inf, np.nan, pd.NA values.
        The possibilities are:

        - True: Force all values of X to be finite.
        - False: accepts np.inf, np.nan, pd.NA in X.
        - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot
          be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`.

    ensure_2d : bool, default=True
        Whether to raise a value error if X is not 2D.

    allow_nd : bool, default=False
        Whether to allow X.ndim > 2.

    multi_output : bool, default=False
        Whether to allow 2D y (array or sparse matrix). If false, y will be
        validated as a vector. y cannot have np.nan or np.inf values if
        multi_output=True.

    ensure_min_samples : int, default=1
        Make sure that X has a minimum number of samples in its first
        axis (rows for a 2D array).

    ensure_min_features : int, default=1
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when X has effectively 2 dimensions or
        is originally 1D and ``ensure_2d`` is True. Setting to 0 disables
        this check.

    y_numeric : bool, default=False
        Whether to ensure that y has a numeric type. If dtype of y is object,
        it is converted to float64. Should only be used for regression
        algorithms.

    estimator : str or estimator instance, default=None
        If passed, include the name of the estimator in warning messages.

    Returns
    -------
    X_converted : object
        The converted and validated X.

    y_converted : object
        The converted and validated y.
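
    Examples
    --------
    A minimal sketch; X is coerced to a 2D array and y to a 1D array:

    >>> from sklearn.utils.validation import check_X_y
    >>> X = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]
    >>> y = [1, 0, 1]
    >>> X, y = check_X_y(X, y)
    >>> X
    array([[1., 2.],
           [3., 4.],
           [5., 6.]])
    >>> y
    array([1, 0, 1])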
    """
    if y is None:
        if estimator is None:
            estimator_name = "estimator"
        else:
            estimator_name = _check_estimator_name(estimator)
        raise ValueError(
            f"{estimator_name} requires y to be passed, but the target y is None"
        )

    X = check_array(
        X,
        accept_sparse=accept_sparse,
        accept_large_sparse=accept_large_sparse,
        dtype=dtype,
        order=order,
        copy=copy,
        force_all_finite=force_all_finite,
        ensure_2d=ensure_2d,
        allow_nd=allow_nd,
        ensure_min_samples=ensure_min_samples,
        ensure_min_features=ensure_min_features,
        estimator=estimator,
        input_name="X",
    )

    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)

    check_consistent_length(X, y)

    return X, y


def _check_y(y, multi_output=False, y_numeric=False, estimator=None):
    """Isolated part of check_X_y dedicated to y validation."""
    if multi_output:
        y = check_array(
            y,
            accept_sparse="csr",
            force_all_finite=True,
            ensure_2d=False,
            dtype=None,
            input_name="y",
            estimator=estimator,
        )
    else:
        estimator_name = _check_estimator_name(estimator)
        y = column_or_1d(y, warn=True)
        _assert_all_finite(y, input_name="y", estimator_name=estimator_name)
        _ensure_no_complex_data(y)
    if y_numeric and y.dtype.kind == "O":
        y = y.astype(np.float64)

    return y


def column_or_1d(y, *, dtype=None, warn=False):
    """Ravel column or 1d numpy array, else raises an error.

    Parameters
    ----------
    y : array-like
        Input data.

    dtype : data-type, default=None
        Data type for `y`.

        .. versionadded:: 1.2

    warn : bool, default=False
        To control display of warnings.

    Returns
    -------
    y : ndarray
        Output data.

    Raises
    ------
    ValueError
        If `y` is not a 1D array or a 2D array with a single row or column.
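
    Examples
    --------
    A minimal sketch; a 1D input is returned as a contiguous 1D array:

    >>> from sklearn.utils.validation import column_or_1d
    >>> column_or_1d([1, 1])
    array([1, 1])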
    """
    xp, _ = get_namespace(y)
    y = check_array(
        y,
        ensure_2d=False,
        dtype=dtype,
        input_name="y",
        force_all_finite=False,
        ensure_min_samples=0,
    )

    shape = y.shape
    if len(shape) == 1:
        return _asarray_with_order(xp.reshape(y, (-1,)), order="C", xp=xp)
    if len(shape) == 2 and shape[1] == 1:
        if warn:
            warnings.warn(
                (
                    "A column-vector y was passed when a 1d array was"
                    " expected. Please change the shape of y to "
                    "(n_samples, ), for example using ravel()."
                ),
                DataConversionWarning,
                stacklevel=2,
            )
        return _asarray_with_order(xp.reshape(y, (-1,)), order="C", xp=xp)

    raise ValueError(
        "y should be a 1d array, got an array of shape {} instead.".format(shape)
    )


def check_random_state(seed):
    """Turn seed into a np.random.RandomState instance.

    Parameters
    ----------
    seed : None, int or instance of RandomState
        If seed is None, return the RandomState singleton used by np.random.
        If seed is an int, return a new RandomState instance seeded with seed.
        If seed is already a RandomState instance, return it.
        Otherwise raise ValueError.

    Returns
    -------
    :class:`numpy:numpy.random.RandomState`
        The random state object based on `seed` parameter.
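
    Examples
    --------
    An illustrative sketch of the accepted inputs:

    >>> import numpy as np
    >>> from sklearn.utils.validation import check_random_state
    >>> rng = check_random_state(42)
    >>> isinstance(rng, np.random.RandomState)
    True
    >>> check_random_state(rng) is rng
    True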
    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, numbers.Integral):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    raise ValueError(
        "%r cannot be used to seed a numpy.random.RandomState instance" % seed
    )


def has_fit_parameter(estimator, parameter):
    """Check whether the estimator's fit method supports the given parameter.

    Parameters
    ----------
    estimator : object
        An estimator to inspect.

    parameter : str
        The searched parameter.

    Returns
    -------
    is_parameter : bool
        Whether the parameter was found to be a named parameter of the
        estimator's fit method.

    Examples
    --------
    >>> from sklearn.svm import SVC
    >>> from sklearn.utils.validation import has_fit_parameter
    >>> has_fit_parameter(SVC(), "sample_weight")
    True
    """
    return parameter in signature(estimator.fit).parameters


def check_symmetric(array, *, tol=1e-10, raise_warning=True, raise_exception=False):
    """Make sure that array is 2D, square and symmetric.

    If the array is not symmetric, then a symmetrized version is returned.
    Optionally, a warning or exception is raised if the matrix is not
    symmetric.

    Parameters
    ----------
    array : {ndarray, sparse matrix}
        Input object to check / convert. Must be two-dimensional and square,
        otherwise a ValueError will be raised.

    tol : float, default=1e-10
        Absolute tolerance for equivalence of arrays.

    raise_warning : bool, default=True
        If True then raise a warning if conversion is required.

    raise_exception : bool, default=False
        If True then raise an exception if array is not symmetric.

    Returns
    -------
    array_sym : {ndarray, sparse matrix}
        Symmetrized version of the input array, i.e. the average of array
        and array.transpose(). If sparse, then duplicate entries are first
        summed and zeros are eliminated.
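
    Examples
    --------
    A minimal sketch; a symmetric input is returned unchanged:

    >>> import numpy as np
    >>> from sklearn.utils.validation import check_symmetric
    >>> check_symmetric(np.array([[0.0, 1.0], [1.0, 0.0]]))
    array([[0., 1.],
           [1., 0.]])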
    """
    if (array.ndim != 2) or (array.shape[0] != array.shape[1]):
        raise ValueError(
            "array must be 2-dimensional and square. shape = {0}".format(array.shape)
        )

    if sp.issparse(array):
        diff = array - array.T
        # only csr, csc, and coo have `data` attribute
        if diff.format not in ["csr", "csc", "coo"]:
            diff = diff.tocsr()
        symmetric = np.all(abs(diff.data) < tol)
    else:
        symmetric = np.allclose(array, array.T, atol=tol)

    if not symmetric:
        if raise_exception:
            raise ValueError("Array must be symmetric")
        if raise_warning:
            warnings.warn(
                (
                    "Array is not symmetric, and will be converted "
                    "to symmetric by average with its transpose."
                ),
                stacklevel=2,
            )
        if sp.issparse(array):
            conversion = "to" + array.format
            array = getattr(0.5 * (array + array.T), conversion)()
        else:
            array = 0.5 * (array + array.T)

    return array


def _is_fitted(estimator, attributes=None, all_or_any=all):
    """Determine if an estimator is fitted.

    Parameters
    ----------
    estimator : estimator instance
        Estimator instance for which the check is performed.

    attributes : str, list or tuple of str, default=None
        Attribute name(s) given as string or a list/tuple of strings
        Eg.: ``["coef_", "estimator_", ...], "coef_"``

        If `None`, `estimator` is considered fitted if there exists an
        attribute that ends with an underscore and does not start with a
        double underscore.

    all_or_any : callable, {all, any}, default=all
        Specify whether all or any of the given attributes must exist.

    Returns
    -------
    fitted : bool
        Whether the estimator is fitted.
    """
    if attributes is not None:
        if not isinstance(attributes, (list, tuple)):
            attributes = [attributes]
        return all_or_any([hasattr(estimator, attr) for attr in attributes])

    if hasattr(estimator, "__sklearn_is_fitted__"):
        return estimator.__sklearn_is_fitted__()

    fitted_attrs = [
        v for v in vars(estimator) if v.endswith("_") and not v.startswith("__")
    ]
    return len(fitted_attrs) > 0


def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all):
    """Perform is_fitted validation for estimator.

    Checks if the estimator is fitted by verifying the presence of
    fitted attributes (ending with a trailing underscore) and otherwise
    raises a NotFittedError with the given message.

    If an estimator does not set any attributes with a trailing underscore, it
    can define a ``__sklearn_is_fitted__`` method returning a boolean to
    specify if the estimator is fitted or not. See
    :ref:`sphx_glr_auto_examples_developing_estimators_sklearn_is_fitted.py`
    for an example on how to use the API.

    Parameters
    ----------
    estimator : estimator instance
        Estimator instance for which the check is performed.

    attributes : str, list or tuple of str, default=None
        Attribute name(s) given as string or a list/tuple of strings
        Eg.: ``["coef_", "estimator_", ...], "coef_"``

        If `None`, `estimator` is considered fitted if there exists an
        attribute that ends with an underscore and does not start with a
        double underscore.

    msg : str, default=None
        The default error message is, "This %(name)s instance is not fitted
        yet. Call 'fit' with appropriate arguments before using this
        estimator."

        For custom messages if "%(name)s" is present in the message string,
        it is substituted for the estimator name.

        Eg. : "Estimator, %(name)s, must be fitted before sparsifying".

    all_or_any : callable, {all, any}, default=all
        Specify whether all or any of the given attributes must exist.

    Raises
    ------
    TypeError
        If the estimator is a class or not an estimator instance.

    NotFittedError
        If the attributes are not found.
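
    Examples
    --------
    An illustrative sketch with an unfitted and then a fitted estimator:

    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.exceptions import NotFittedError
    >>> from sklearn.utils.validation import check_is_fitted
    >>> lr = LogisticRegression()
    >>> try:
    ...     check_is_fitted(lr)
    ... except NotFittedError:
    ...     print("Model is not fitted yet.")
    Model is not fitted yet.
    >>> lr.fit([[1, 2], [3, 4]], [0, 1])
    LogisticRegression()
    >>> check_is_fitted(lr)  # passes silently once fitted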
    """
    if isclass(estimator):
        raise TypeError("{} is a class, not an instance.".format(estimator))
    if msg is None:
        msg = (
            "This %(name)s instance is not fitted yet. Call 'fit' with "
            "appropriate arguments before using this estimator."
        )

    if not hasattr(estimator, "fit"):
        raise TypeError("%s is not an estimator instance." % (estimator))

    if not _is_fitted(estimator, attributes, all_or_any):
        raise NotFittedError(msg % {"name": type(estimator).__name__})


def check_non_negative(X, whom):
    """
    Check if there is any negative value in an array.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Input data.

    whom : str
        Who passed X to this function.
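
    Examples
    --------
    A minimal sketch of the passing and failing cases:

    >>> import numpy as np
    >>> from sklearn.utils.validation import check_non_negative
    >>> check_non_negative(np.array([[1, 2], [3, 4]]), "MyEstimator")  # no error
    >>> try:
    ...     check_non_negative(np.array([[-1, 2]]), "MyEstimator")
    ... except ValueError as e:
    ...     print(e)
    Negative values in data passed to MyEstimator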
    """
    xp, _ = get_namespace(X)
    # avoid X.min() on sparse matrix since it also sorts the indices
    if sp.issparse(X):
        if X.format in ["lil", "dok"]:
            X = X.tocsr()
        if X.data.size == 0:
            X_min = 0
        else:
            X_min = X.data.min()
    else:
        X_min = xp.min(X)

    if X_min < 0:
        raise ValueError("Negative values in data passed to %s" % whom)


def check_scalar(
    x,
    name,
    target_type,
    *,
    min_val=None,
    max_val=None,
    include_boundaries="both",
):
    """Validate scalar parameters type and value.

    Parameters
    ----------
    x : object
        The scalar parameter to validate.

    name : str
        The name of the parameter to be printed in error messages.

    target_type : type or tuple
        Acceptable data types for the parameter.

    min_val : float or int, default=None
        The minimum valid value the parameter can take. If None (default) it
        is implied that the parameter does not have a lower bound.

    max_val : float or int, default=None
        The maximum valid value the parameter can take. If None (default) it
        is implied that the parameter does not have an upper bound.

    include_boundaries : {"left", "right", "both", "neither"}, default="both"
        Whether the interval defined by `min_val` and `max_val` should include
        the boundaries. Possible choices are:

        - `"left"`: only `min_val` is included in the valid interval.
          It is equivalent to the interval `[ min_val, max_val )`.
        - `"right"`: only `max_val` is included in the valid interval.
          It is equivalent to the interval `( min_val, max_val ]`.
        - `"both"`: `min_val` and `max_val` are included in the valid interval.
          It is equivalent to the interval `[ min_val, max_val ]`.
        - `"neither"`: neither `min_val` nor `max_val` are included in the
          valid interval. It is equivalent to the interval `( min_val, max_val )`.

    Returns
    -------
    x : numbers.Number
        The validated number.

    Raises
    ------
    TypeError
        If the parameter's type does not match the desired type.

    ValueError
        If the parameter's value violates the given bounds.
        If `min_val`, `max_val` and `include_boundaries` are inconsistent.
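
    Examples
    --------
    A minimal sketch; the validated value is returned unchanged:

    >>> from sklearn.utils.validation import check_scalar
    >>> check_scalar(10, "x", int, min_val=1, max_val=20)
    10
    >>> try:
    ...     check_scalar(0, "x", int, min_val=1)
    ... except ValueError as e:
    ...     print(e)
    x == 0, must be >= 1.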
1577 """
1578
1579 def type_name(t):
1580 """Convert type into humman readable string."""
1581 module = t.__module__
1582 qualname = t.__qualname__
1583 if module == "builtins":
1584 return qualname
1585 elif t == numbers.Real:
1586 return "float"
1587 elif t == numbers.Integral:
1588 return "int"
1589 return f"{module}.{qualname}"
1590
1591 if not isinstance(x, target_type):
1592 if isinstance(target_type, tuple):
1593 types_str = ", ".join(type_name(t) for t in target_type)
1594 target_type_str = f"{{{types_str}}}"
1595 else:
1596 target_type_str = type_name(target_type)
1597
1598 raise TypeError(
1599 f"{name} must be an instance of {target_type_str}, not"
1600 f" {type(x).__qualname__}."
1601 )
1602
1603 expected_include_boundaries = ("left", "right", "both", "neither")
1604 if include_boundaries not in expected_include_boundaries:
1605 raise ValueError(
1606 f"Unknown value for `include_boundaries`: {repr(include_boundaries)}. "
1607 f"Possible values are: {expected_include_boundaries}."
1608 )
1609
1610 if max_val is None and include_boundaries == "right":
1611 raise ValueError(
1612 "`include_boundaries`='right' without specifying explicitly `max_val` "
1613 "is inconsistent."
1614 )
1615
1616 if min_val is None and include_boundaries == "left":
1617 raise ValueError(
1618 "`include_boundaries`='left' without specifying explicitly `min_val` "
1619 "is inconsistent."
1620 )
1621
1622 comparison_operator = (
1623 operator.lt if include_boundaries in ("left", "both") else operator.le
1624 )
1625 if min_val is not None and comparison_operator(x, min_val):
1626 raise ValueError(
1627 f"{name} == {x}, must be"
1628 f" {'>=' if include_boundaries in ('left', 'both') else '>'} {min_val}."
1629 )
1630
1631 comparison_operator = (
1632 operator.gt if include_boundaries in ("right", "both") else operator.ge
1633 )
1634 if max_val is not None and comparison_operator(x, max_val):
1635 raise ValueError(
1636 f"{name} == {x}, must be"
1637 f" {'<=' if include_boundaries in ('right', 'both') else '<'} {max_val}."
1638 )
1639
1640 return x
1641
1642
def _check_psd_eigenvalues(lambdas, enable_warnings=False):
    """Check the eigenvalues of a positive semidefinite (PSD) matrix.

    Checks the provided array of PSD matrix eigenvalues for numerical or
    conditioning issues and returns a fixed validated version. This method
    should typically be used if the PSD matrix is user-provided (e.g. a
    Gram matrix) or computed using a user-provided dissimilarity metric
    (e.g. kernel function), or if the decomposition process uses approximation
    methods (randomized SVD, etc.).

    It checks for three things:

    - that there are no significant imaginary parts in eigenvalues (more than
      1e-5 times the maximum real part). If this check fails, it raises a
      ``ValueError``. Otherwise all non-significant imaginary parts that may
      remain are set to zero. This operation is traced with a
      ``PositiveSpectrumWarning`` when ``enable_warnings=True``.

    - that eigenvalues are not all negative. If this check fails, it raises a
      ``ValueError``.

    - that there are no significant negative eigenvalues with absolute value
      more than 1e-10 (1e-6) and more than 1e-5 (5e-3) times the largest
      positive eigenvalue in double (single) precision. If this check fails,
      it raises a ``ValueError``. Otherwise all negative eigenvalues that may
      remain are set to zero. This operation is traced with a
      ``PositiveSpectrumWarning`` when ``enable_warnings=True``.

    Finally, all the positive eigenvalues that are too small (with a value
    smaller than the maximum eigenvalue multiplied by 1e-12 (2e-7)) are set to
    zero. This operation is traced with a ``PositiveSpectrumWarning`` when
    ``enable_warnings=True``.

    Parameters
    ----------
    lambdas : array-like of shape (n_eigenvalues,)
        Array of eigenvalues to check / fix.

    enable_warnings : bool, default=False
        When this is set to ``True``, a ``PositiveSpectrumWarning`` will be
        raised when there are imaginary parts, negative eigenvalues, or
        extremely small non-zero eigenvalues. Otherwise no warning will be
        raised. In both cases, imaginary parts, negative eigenvalues, and
        extremely small non-zero eigenvalues will be set to zero.

    Returns
    -------
    lambdas_fixed : ndarray of shape (n_eigenvalues,)
        A fixed validated copy of the array of eigenvalues.

    Examples
    --------
    >>> from sklearn.utils.validation import _check_psd_eigenvalues
    >>> _check_psd_eigenvalues([1, 2])  # nominal case
    array([1, 2])
    >>> _check_psd_eigenvalues([5, 5j])  # significant imag part
    Traceback (most recent call last):
        ...
    ValueError: There are significant imaginary parts in eigenvalues (1
        of the maximum real part). Either the matrix is not PSD, or there was
        an issue while computing the eigendecomposition of the matrix.
    >>> _check_psd_eigenvalues([5, 5e-5j])  # insignificant imag part
    array([5., 0.])
    >>> _check_psd_eigenvalues([-5, -1])  # all negative
    Traceback (most recent call last):
        ...
    ValueError: All eigenvalues are negative (maximum is -1). Either the
        matrix is not PSD, or there was an issue while computing the
        eigendecomposition of the matrix.
    >>> _check_psd_eigenvalues([5, -1])  # significant negative
    Traceback (most recent call last):
        ...
    ValueError: There are significant negative eigenvalues (0.2 of the
        maximum positive). Either the matrix is not PSD, or there was an issue
        while computing the eigendecomposition of the matrix.
    >>> _check_psd_eigenvalues([5, -5e-5])  # insignificant negative
    array([5., 0.])
    >>> _check_psd_eigenvalues([5, 4e-12])  # bad conditioning (too small)
    array([5., 0.])

    """

    lambdas = np.array(lambdas)
    is_double_precision = lambdas.dtype == np.float64

    # note: the minimum value available is
    # - single-precision: np.finfo('float32').eps = 1.2e-07
    # - double-precision: np.finfo('float64').eps = 2.2e-16

    # The various thresholds used for validation; we may wish to adjust them
    # according to the precision.
    significant_imag_ratio = 1e-5
    significant_neg_ratio = 1e-5 if is_double_precision else 5e-3
    significant_neg_value = 1e-10 if is_double_precision else 1e-6
    small_pos_ratio = 1e-12 if is_double_precision else 2e-7

    # Check that there are no significant imaginary parts
    if not np.isreal(lambdas).all():
        max_imag_abs = np.abs(np.imag(lambdas)).max()
        max_real_abs = np.abs(np.real(lambdas)).max()
        if max_imag_abs > significant_imag_ratio * max_real_abs:
            raise ValueError(
                "There are significant imaginary parts in eigenvalues (%g "
                "of the maximum real part). Either the matrix is not PSD, or "
                "there was an issue while computing the eigendecomposition "
                "of the matrix." % (max_imag_abs / max_real_abs)
            )

        # warn about imaginary parts being removed
        if enable_warnings:
            warnings.warn(
                "There are imaginary parts in eigenvalues (%g "
                "of the maximum real part). Either the matrix is not"
                " PSD, or there was an issue while computing the "
                "eigendecomposition of the matrix. Only the real "
                "parts will be kept." % (max_imag_abs / max_real_abs),
                PositiveSpectrumWarning,
            )

    # Remove all imaginary parts (even if zero)
    lambdas = np.real(lambdas)

    # Check that there are no significant negative eigenvalues
    max_eig = lambdas.max()
    if max_eig < 0:
        raise ValueError(
            "All eigenvalues are negative (maximum is %g). "
            "Either the matrix is not PSD, or there was an "
            "issue while computing the eigendecomposition of "
            "the matrix." % max_eig
        )

    else:
        min_eig = lambdas.min()
        if (
            min_eig < -significant_neg_ratio * max_eig
            and min_eig < -significant_neg_value
        ):
            raise ValueError(
                "There are significant negative eigenvalues (%g"
                " of the maximum positive). Either the matrix is "
                "not PSD, or there was an issue while computing "
                "the eigendecomposition of the matrix." % (-min_eig / max_eig)
            )
        elif min_eig < 0:
            # Remove all negative values and warn about it
            if enable_warnings:
                warnings.warn(
                    "There are negative eigenvalues (%g of the "
                    "maximum positive). Either the matrix is not "
                    "PSD, or there was an issue while computing the"
                    " eigendecomposition of the matrix. Negative "
                    "eigenvalues will be replaced with 0." % (-min_eig / max_eig),
                    PositiveSpectrumWarning,
                )
            lambdas[lambdas < 0] = 0

    # Check for conditioning (small positive non-zeros)
    too_small_lambdas = (0 < lambdas) & (lambdas < small_pos_ratio * max_eig)
    if too_small_lambdas.any():
        if enable_warnings:
            warnings.warn(
                "Badly conditioned PSD matrix spectrum: the largest "
                "eigenvalue is more than %g times the smallest. "
                "Small eigenvalues will be replaced with 0."
                "" % (1 / small_pos_ratio),
                PositiveSpectrumWarning,
            )
        lambdas[too_small_lambdas] = 0

    return lambdas


def _check_sample_weight(
    sample_weight, X, dtype=None, copy=False, only_non_negative=False
):
    """Validate sample weights.

    Note that passing sample_weight=None will output an array of ones.
    Therefore, in some cases, you may want to protect the call with:
    if sample_weight is not None:
        sample_weight = _check_sample_weight(...)

    Parameters
    ----------
    sample_weight : {ndarray, Number or None}, shape (n_samples,)
        Input sample weights.

    X : {ndarray, list, sparse matrix}
        Input data.

    dtype : dtype, default=None
        dtype of the validated `sample_weight`.
        If None, and the input `sample_weight` is an array, the dtype of the
        input is preserved; otherwise an array with the default numpy dtype
        is allocated. If `dtype` is not one of `float32`, `float64`, `None`,
        the output will be of dtype `float64`.

    copy : bool, default=False
        If True, a copy of sample_weight will be created.

    only_non_negative : bool, default=False
        Whether or not the weights are expected to be non-negative.

        .. versionadded:: 1.0

    Returns
    -------
    sample_weight : ndarray of shape (n_samples,)
        Validated sample weight. It is guaranteed to be "C" contiguous.
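
    Examples
    --------
    A minimal sketch of the three input paths (`None`, scalar, array); the
    outputs shown assume NumPy's default print options:

    >>> import numpy as np
    >>> from sklearn.utils.validation import _check_sample_weight
    >>> X = np.array([[1, 2], [3, 4], [5, 6]])
    >>> _check_sample_weight(None, X)  # None becomes uniform weights
    array([1., 1., 1.])
    >>> _check_sample_weight(2.0, X)  # a scalar is broadcast to n_samples
    array([2., 2., 2.])
    >>> _check_sample_weight(np.array([0.5, 1.0, 1.5]), X)
    array([0.5, 1. , 1.5])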
    """
    n_samples = _num_samples(X)

    if dtype is not None and dtype not in [np.float32, np.float64]:
        dtype = np.float64

    if sample_weight is None:
        sample_weight = np.ones(n_samples, dtype=dtype)
    elif isinstance(sample_weight, numbers.Number):
        sample_weight = np.full(n_samples, sample_weight, dtype=dtype)
    else:
        if dtype is None:
            dtype = [np.float64, np.float32]
        sample_weight = check_array(
            sample_weight,
            accept_sparse=False,
            ensure_2d=False,
            dtype=dtype,
            order="C",
            copy=copy,
            input_name="sample_weight",
        )
        if sample_weight.ndim != 1:
            raise ValueError("Sample weights must be 1D array or scalar")

        if sample_weight.shape != (n_samples,):
            raise ValueError(
                "sample_weight.shape == {}, expected {}!".format(
                    sample_weight.shape, (n_samples,)
                )
            )

    if only_non_negative:
        check_non_negative(sample_weight, "`sample_weight`")

    return sample_weight


def _allclose_dense_sparse(x, y, rtol=1e-7, atol=1e-9):
    """Check allclose for sparse and dense data.

    Both x and y need to be either sparse or dense, they
    can't be mixed.

    Parameters
    ----------
    x : {array-like, sparse matrix}
        First array to compare.

    y : {array-like, sparse matrix}
        Second array to compare.

    rtol : float, default=1e-7
        Relative tolerance; see numpy.allclose.

    atol : float, default=1e-9
        Absolute tolerance; see numpy.allclose. Note that the default here is
        more tolerant than the default for numpy.testing.assert_allclose, where
        atol=0.

    Returns
    -------
    bool
        True if the two arrays are equal within the given tolerances.
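
    Examples
    --------
    A small illustration of the dense and sparse code paths (assuming SciPy's
    ``csr_matrix``; the outputs follow from the implementation below):

    >>> import numpy as np
    >>> from scipy import sparse
    >>> from sklearn.utils.validation import _allclose_dense_sparse
    >>> x = np.array([[1.0, 0.0], [0.0, 2.0]])
    >>> _allclose_dense_sparse(x, x + 1e-10)  # dense vs. dense
    True
    >>> _allclose_dense_sparse(sparse.csr_matrix(x), sparse.csr_matrix(x))
    True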
    """
    if sp.issparse(x) and sp.issparse(y):
        x = x.tocsr()
        y = y.tocsr()
        x.sum_duplicates()
        y.sum_duplicates()
        return (
            np.array_equal(x.indices, y.indices)
            and np.array_equal(x.indptr, y.indptr)
            and np.allclose(x.data, y.data, rtol=rtol, atol=atol)
        )
    elif not sp.issparse(x) and not sp.issparse(y):
        return np.allclose(x, y, rtol=rtol, atol=atol)
    raise ValueError(
        "Can only compare two sparse matrices, not a sparse matrix and an array"
    )


def _check_response_method(estimator, response_method):
    """Check if `response_method` is available in estimator and return it.

    .. versionadded:: 1.3

    Parameters
    ----------
    estimator : estimator instance
        Classifier or regressor to check.

    response_method : {"predict_proba", "predict_log_proba", "decision_function",
            "predict"} or list of such str
        Specifies the response method used to get predictions from an estimator
        (i.e. :term:`predict_proba`, :term:`predict_log_proba`,
        :term:`decision_function` or :term:`predict`). Possible choices are:

        - if `str`, it corresponds to the name of the method to return;
        - if a list of `str`, it provides the method names in order of
          preference. The method returned is the first one in the list that
          is implemented by `estimator`.

    Returns
    -------
    prediction_method : callable
        Prediction method of estimator.

    Raises
    ------
    AttributeError
        If `response_method` is not available in `estimator`.
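
    Examples
    --------
    A sketch of the preference-order lookup, assuming `LogisticRegression`
    (which exposes both methods as attributes even before fitting):

    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.utils.validation import _check_response_method
    >>> method = _check_response_method(
    ...     LogisticRegression(), ["decision_function", "predict"]
    ... )
    >>> method.__name__
    'decision_function'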
    """
    if isinstance(response_method, str):
        list_methods = [response_method]
    else:
        list_methods = response_method

    prediction_method = [getattr(estimator, method, None) for method in list_methods]
    prediction_method = reduce(lambda x, y: x or y, prediction_method)
    if prediction_method is None:
        raise AttributeError(
            f"{estimator.__class__.__name__} has none of the following attributes: "
            f"{', '.join(list_methods)}."
        )

    return prediction_method


def _check_method_params(X, params, indices=None):
    """Check and validate the parameters passed to a specific
    method like `fit`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data array.

    params : dict
        Dictionary containing the parameters passed to the method.

    indices : array-like of shape (n_samples,), default=None
        Indices to be selected if the parameter has the same size as `X`.

    Returns
    -------
    method_params_validated : dict
        Validated parameters. We ensure that the values support indexing.
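
    Examples
    --------
    A small sketch: a parameter aligned with the samples of `X` is indexed,
    while anything else is passed through unchanged (names are illustrative):

    >>> import numpy as np
    >>> from sklearn.utils.validation import _check_method_params
    >>> X = np.array([[1, 2], [3, 4], [5, 6]])
    >>> params = {"sample_weight": np.array([1.0, 2.0, 3.0]), "classes": [0, 1]}
    >>> validated = _check_method_params(X, params, indices=[0, 2])
    >>> validated["sample_weight"]
    array([1., 3.])
    >>> validated["classes"]  # length differs from n_samples: pass-through
    [0, 1]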
    """
    from . import _safe_indexing

    method_params_validated = {}
    for param_key, param_value in params.items():
        if not _is_arraylike(param_value) or _num_samples(param_value) != _num_samples(
            X
        ):
            # Non-indexable pass-through (for now for backward-compatibility).
            # https://github.com/scikit-learn/scikit-learn/issues/15805
            method_params_validated[param_key] = param_value
        else:
            # Any other method_params should support indexing
            # (e.g. for cross-validation).
            method_params_validated[param_key] = _make_indexable(param_value)
            method_params_validated[param_key] = _safe_indexing(
                method_params_validated[param_key], indices
            )

    return method_params_validated


def _is_pandas_df(X):
    """Return True if X is a pandas dataframe."""
    if hasattr(X, "columns") and hasattr(X, "iloc"):
        # Likely a pandas DataFrame, we explicitly check the type to confirm.
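        # Look up sys.modules rather than importing pandas: if pandas has not
        # been imported by the caller, X cannot be a pandas DataFrame.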
        try:
            pd = sys.modules["pandas"]
        except KeyError:
            return False
        return isinstance(X, pd.DataFrame)
    return False


def _is_polars_df(X):
    """Return True if X is a polars dataframe."""
    if hasattr(X, "columns") and hasattr(X, "schema"):
        # Likely a polars DataFrame, we explicitly check the type to confirm.
        try:
            pl = sys.modules["polars"]
        except KeyError:
            return False
        return isinstance(X, pl.DataFrame)
    return False


def _get_feature_names(X):
    """Get feature names from X.

    Support for other array containers should be implemented here.

    Parameters
    ----------
    X : {ndarray, dataframe} of shape (n_samples, n_features)
        Array container to extract feature names.

        - pandas dataframe : The columns will be considered to be feature
          names. If the dataframe contains non-string feature names, `None` is
          returned.
        - All other array containers will return `None`.

    Returns
    -------
    names : ndarray or None
        Feature names of `X`. Unrecognized array containers will return `None`.
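
    Examples
    --------
    A brief sketch, assuming pandas is installed (plain ndarrays carry no
    feature names, so `None` is returned for them):

    >>> import numpy as np
    >>> import pandas as pd
    >>> from sklearn.utils.validation import _get_feature_names
    >>> _get_feature_names(pd.DataFrame({"a": [1], "b": [2]}))
    array(['a', 'b'], dtype=object)
    >>> _get_feature_names(np.array([[1, 2]])) is None
    True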
    """
    feature_names = None

    # extract feature names for supported array containers
    if _is_pandas_df(X):
        # Make sure we can inspect column names from pandas, even with
        # versions too old to expose a working implementation of
        # __dataframe__.column_names() and avoid introducing any
        # additional copy.
        # TODO: remove the pandas-specific branch once the minimum supported
        # version of pandas has a working implementation of
        # __dataframe__.column_names() that is guaranteed to not introduce any
        # additional copy of the data without having to impose allow_copy=False
        # that could fail with other libraries. Note: in the longer term, we
        # could decide to instead rely on the __dataframe_namespace__ API once
        # adopted by our minimally supported pandas version.
        feature_names = np.asarray(X.columns, dtype=object)
    elif hasattr(X, "__dataframe__"):
        df_protocol = X.__dataframe__()
        feature_names = np.asarray(list(df_protocol.column_names()), dtype=object)

    if feature_names is None or len(feature_names) == 0:
        return

    types = sorted(t.__qualname__ for t in set(type(v) for v in feature_names))

    # mixed type of string and non-string is not supported
    if len(types) > 1 and "str" in types:
        raise TypeError(
            "Feature names are only supported if all input features have string names, "
            f"but your input has {types} as feature name / column name types. "
            "If you want feature names to be stored and validated, you must convert "
            "them all to strings, by using X.columns = X.columns.astype(str) for "
            "example. Otherwise you can remove feature / column names from your input "
            "data, or convert them all to a non-string data type."
        )

    # Only feature names of all strings are supported
    if len(types) == 1 and types[0] == "str":
        return feature_names


def _check_feature_names_in(estimator, input_features=None, *, generate_names=True):
    """Check `input_features` and generate names if needed.

    Commonly used in :term:`get_feature_names_out`.

    Parameters
    ----------
    estimator : estimator instance
        Estimator to check the feature names against.

    input_features : array-like of str or None, default=None
        Input features.

        - If `input_features` is `None`, then `feature_names_in_` is
          used as feature names in. If `feature_names_in_` is not defined,
          then the following input feature names are generated:
          `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
        - If `input_features` is an array-like, then `input_features` must
          match `feature_names_in_` if `feature_names_in_` is defined.

    generate_names : bool, default=True
        Whether to generate names when `input_features` is `None` and
        `estimator.feature_names_in_` is not defined. This is useful for
        transformers that validate `input_features` but do not require them in
        :term:`get_feature_names_out` e.g. `PCA`.

    Returns
    -------
    feature_names_in : ndarray of str or `None`
        Feature names in.
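
    Examples
    --------
    A minimal sketch using a hypothetical stand-in object that only carries
    the fitted `n_features_in_` attribute:

    >>> from sklearn.utils.validation import _check_feature_names_in
    >>> class _Fitted:  # hypothetical, stands in for a fitted estimator
    ...     n_features_in_ = 3
    >>> _check_feature_names_in(_Fitted())
    array(['x0', 'x1', 'x2'], dtype=object)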
    """

    feature_names_in_ = getattr(estimator, "feature_names_in_", None)
    n_features_in_ = getattr(estimator, "n_features_in_", None)

    if input_features is not None:
        input_features = np.asarray(input_features, dtype=object)
        if feature_names_in_ is not None and not np.array_equal(
            feature_names_in_, input_features
        ):
            raise ValueError("input_features is not equal to feature_names_in_")

        if n_features_in_ is not None and len(input_features) != n_features_in_:
            raise ValueError(
                "input_features should have length equal to number of "
                f"features ({n_features_in_}), got {len(input_features)}"
            )
        return input_features

    if feature_names_in_ is not None:
        return feature_names_in_

    if not generate_names:
        return

    # Generates feature names if `n_features_in_` is defined
    if n_features_in_ is None:
        raise ValueError("Unable to generate feature names without n_features_in_")

    return np.asarray([f"x{i}" for i in range(n_features_in_)], dtype=object)


def _generate_get_feature_names_out(estimator, n_features_out, input_features=None):
    """Generate feature names out for estimator using the estimator name as the prefix.

    The `input_features` names are validated but not used. This function is
    useful for estimators that generate their own names based on
    `n_features_out`, e.g. `PCA`.

    Parameters
    ----------
    estimator : estimator instance
        Estimator producing output feature names.

    n_features_out : int
        Number of feature names out.

    input_features : array-like of str or None, default=None
        Only used to validate feature names with `estimator.feature_names_in_`.

    Returns
    -------
    feature_names_out : ndarray of str
        Feature names out, prefixed by the lowercased estimator class name.
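
    Examples
    --------
    A short illustration with an (unfitted) `PCA` instance; the prefix comes
    from the class name:

    >>> from sklearn.decomposition import PCA
    >>> from sklearn.utils.validation import _generate_get_feature_names_out
    >>> _generate_get_feature_names_out(PCA(), 2)
    array(['pca0', 'pca1'], dtype=object)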
    """
    _check_feature_names_in(estimator, input_features, generate_names=False)
    estimator_name = estimator.__class__.__name__.lower()
    return np.asarray(
        [f"{estimator_name}{i}" for i in range(n_features_out)], dtype=object
    )


def _check_monotonic_cst(estimator, monotonic_cst=None):
    """Check the monotonic constraints and return the corresponding array.

    This helper function should be used in the `fit` method of an estimator
    that supports monotonic constraints and called after the estimator has
    introspected input data to set the `n_features_in_` and optionally the
    `feature_names_in_` attributes.

    .. versionadded:: 1.2

    Parameters
    ----------
    estimator : estimator instance

    monotonic_cst : array-like of int, dict of str or None, default=None
        Monotonic constraints for the features.

        - If array-like, then it should contain only -1, 0 or 1. Each value
          will be checked to be in [-1, 0, 1]. A value of -1 requires the
          corresponding feature to be monotonically decreasing, 1 requires it
          to be monotonically increasing, and 0 imposes no constraint.
        - If dict, then the keys should be the feature names occurring in
          `estimator.feature_names_in_` and the values should be -1, 0 or 1.
        - If None, then an array of 0s will be allocated.

    Returns
    -------
    monotonic_cst : ndarray of int
        Monotonic constraints for each feature.
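
    Examples
    --------
    A sketch of the dict code path, using a hypothetical stand-in for a
    fitted estimator; features missing from the dict default to 0:

    >>> import numpy as np
    >>> from sklearn.utils.validation import _check_monotonic_cst
    >>> class _Fitted:  # hypothetical fitted-estimator stand-in
    ...     n_features_in_ = 3
    ...     feature_names_in_ = np.array(["a", "b", "c"], dtype=object)
    >>> _check_monotonic_cst(_Fitted(), {"a": 1, "c": -1})
    array([ 1,  0, -1], dtype=int8)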
    """
    original_monotonic_cst = monotonic_cst
    if monotonic_cst is None or isinstance(monotonic_cst, dict):
        monotonic_cst = np.full(
            shape=estimator.n_features_in_,
            fill_value=0,
            dtype=np.int8,
        )
        if isinstance(original_monotonic_cst, dict):
            if not hasattr(estimator, "feature_names_in_"):
                raise ValueError(
                    f"{estimator.__class__.__name__} was not fitted on data "
                    "with feature names. Pass monotonic_cst as an integer "
                    "array instead."
                )
            unexpected_feature_names = list(
                set(original_monotonic_cst) - set(estimator.feature_names_in_)
            )
            unexpected_feature_names.sort()  # deterministic error message
            n_unexpected = len(unexpected_feature_names)
            if unexpected_feature_names:
                if len(unexpected_feature_names) > 5:
                    unexpected_feature_names = unexpected_feature_names[:5]
                    unexpected_feature_names.append("...")
                raise ValueError(
                    f"monotonic_cst contains {n_unexpected} unexpected feature "
                    f"names: {unexpected_feature_names}."
                )
            for feature_idx, feature_name in enumerate(estimator.feature_names_in_):
                if feature_name in original_monotonic_cst:
                    cst = original_monotonic_cst[feature_name]
                    if cst not in [-1, 0, 1]:
                        raise ValueError(
                            f"monotonic_cst['{feature_name}'] must be either "
                            f"-1, 0 or 1. Got {cst!r}."
                        )
                    monotonic_cst[feature_idx] = cst
    else:
        unexpected_cst = np.setdiff1d(monotonic_cst, [-1, 0, 1])
        if unexpected_cst.shape[0]:
            raise ValueError(
                "monotonic_cst must be an array-like of -1, 0 or 1. Observed "
                f"values: {unexpected_cst.tolist()}."
            )

        monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8)
        if monotonic_cst.shape[0] != estimator.n_features_in_:
            raise ValueError(
                f"monotonic_cst has shape {monotonic_cst.shape} but the input data "
                f"X has {estimator.n_features_in_} features."
            )
    return monotonic_cst


def _check_pos_label_consistency(pos_label, y_true):
    """Check if `pos_label` needs to be specified or not.

    In binary classification, we fix `pos_label=1` if the labels are in the set
    {-1, 1} or {0, 1}. Otherwise, we raise an error asking to specify the
    `pos_label` parameter.

    Parameters
    ----------
    pos_label : int, float, bool, str or None
        The positive label.
    y_true : ndarray of shape (n_samples,)
        The target vector.

    Returns
    -------
    pos_label : int, float, bool or str
        If `pos_label` can be inferred, it will be returned.

    Raises
    ------
    ValueError
        If `pos_label` is not specified and `y_true` does not have labels
        in {-1, 1} or {0, 1}.
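
    Examples
    --------
    A brief sketch of the two outcomes when `pos_label` is left unspecified
    versus given explicitly:

    >>> import numpy as np
    >>> from sklearn.utils.validation import _check_pos_label_consistency
    >>> _check_pos_label_consistency(None, np.array([0, 1, 1]))  # inferred
    1
    >>> _check_pos_label_consistency("yes", np.array(["yes", "no"]))
    'yes'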
    """
    # ensure binary classification if pos_label is not specified
    # classes.dtype.kind in ('O', 'U', 'S') is required to avoid
    # triggering a FutureWarning by calling np.array_equal(a, b)
    # when elements in the two arrays are not comparable.
    classes = np.unique(y_true)
    if pos_label is None and (
        classes.dtype.kind in "OUS"
        or not (
            np.array_equal(classes, [0, 1])
            or np.array_equal(classes, [-1, 1])
            or np.array_equal(classes, [0])
            or np.array_equal(classes, [-1])
            or np.array_equal(classes, [1])
        )
    ):
        classes_repr = ", ".join([repr(c) for c in classes.tolist()])
        raise ValueError(
            f"y_true takes value in {{{classes_repr}}} and pos_label is not "
            "specified: either make y_true take value in {0, 1} or "
            "{-1, 1} or pass pos_label explicitly."
        )
    elif pos_label is None:
        pos_label = 1

    return pos_label