Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scikit_learn-1.4.dev0-py3.8-linux-x86_64.egg/sklearn/utils/validation.py: 25% (612 statements)

coverage.py v7.3.2, created at 2023-12-12 06:31 +0000

1""" 

2The :mod:`sklearn.utils.validation` module includes functions to validate 

3input and parameters within scikit-learn estimators. 

4""" 

5 

6# Authors: Olivier Grisel 

7# Gael Varoquaux 

8# Andreas Mueller 

9# Lars Buitinck 

10# Alexandre Gramfort 

11# Nicolas Tresegnie 

12# Sylvain Marie 

13# License: BSD 3 clause 

14 

15import numbers 

16import operator 

17import sys 

18import warnings 

19from contextlib import suppress 

20from functools import reduce, wraps 

21from inspect import Parameter, isclass, signature 

22 

23import joblib 

24import numpy as np 

25import scipy.sparse as sp 

26 

27from .. import get_config as _get_config 

28from ..exceptions import DataConversionWarning, NotFittedError, PositiveSpectrumWarning 

29from ..utils._array_api import _asarray_with_order, _is_numpy_namespace, get_namespace 

30from ..utils.fixes import ComplexWarning, _preserve_dia_indices_dtype 

31from ._isfinite import FiniteStatus, cy_isfinite 

32from .fixes import _object_dtype_isnan 

33 

34FLOAT_DTYPES = (np.float64, np.float32, np.float16) 

35 

36 

# This function is not used anymore at this moment in the code base, but we keep
# it in case we merge a new public function without keyword-only arguments by
# mistake, which would require a deprecation cycle to fix.
def _deprecate_positional_args(func=None, *, version="1.3"):
    """Decorator for methods that issues warnings for positional arguments.

    Using the keyword-only argument syntax in pep 3102, arguments after the
    * will issue a warning when passed as a positional argument.

    Parameters
    ----------
    func : callable, default=None
        Function to check arguments on.
    version : str, default="1.3"
        The version when positional arguments will result in error.
    """

    def _inner_deprecate_positional_args(f):
        sig = signature(f)
        kwonly_args = []
        all_args = []

        for name, param in sig.parameters.items():
            if param.kind == Parameter.POSITIONAL_OR_KEYWORD:
                all_args.append(name)
            elif param.kind == Parameter.KEYWORD_ONLY:
                kwonly_args.append(name)

        @wraps(f)
        def inner_f(*args, **kwargs):
            extra_args = len(args) - len(all_args)
            if extra_args <= 0:
                return f(*args, **kwargs)

            # extra_args > 0
            args_msg = [
                "{}={}".format(name, arg)
                for name, arg in zip(kwonly_args[:extra_args], args[-extra_args:])
            ]
            args_msg = ", ".join(args_msg)
            warnings.warn(
                (
                    f"Pass {args_msg} as keyword args. From version "
                    f"{version} passing these as positional arguments "
                    "will result in an error"
                ),
                FutureWarning,
            )
            kwargs.update(zip(sig.parameters, args))
            return f(**kwargs)

        return inner_f

    if func is not None:
        return _inner_deprecate_positional_args(func)

    return _inner_deprecate_positional_args

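# Illustrative usage of the decorator above (not part of the original module;
# the decorated function and the version string are hypothetical):
#
#     @_deprecate_positional_args(version="1.5")
#     def train(X, *, shuffle=True):
#         ...
#
#     train([[0]], False)          # FutureWarning: "Pass shuffle=False as keyword args..."
#     train([[0]], shuffle=False)  # no warning
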

def _assert_all_finite(
    X, allow_nan=False, msg_dtype=None, estimator_name=None, input_name=""
):
    """Like assert_all_finite, but only for ndarray."""

    xp, _ = get_namespace(X)

    if _get_config()["assume_finite"]:
        return

    X = xp.asarray(X)

    # for object dtype data, we only check for NaNs (GH-13254)
    if X.dtype == np.dtype("object") and not allow_nan:
        if _object_dtype_isnan(X).any():
            raise ValueError("Input contains NaN")

    # We need only consider float arrays, hence can early return for all else.
    if not xp.isdtype(X.dtype, ("real floating", "complex floating")):
        return

    # First try an O(n) time, O(1) space solution for the common case that
    # everything is finite; fall back to O(n) space `np.isinf/isnan` or custom
    # Cython implementation to prevent false positives and provide a detailed
    # error message.
    with np.errstate(over="ignore"):
        first_pass_isfinite = xp.isfinite(xp.sum(X))
    if first_pass_isfinite:
        return

    _assert_all_finite_element_wise(
        X,
        xp=xp,
        allow_nan=allow_nan,
        msg_dtype=msg_dtype,
        estimator_name=estimator_name,
        input_name=input_name,
    )


def _assert_all_finite_element_wise(
    X, *, xp, allow_nan, msg_dtype=None, estimator_name=None, input_name=""
):
    # Cython implementation doesn't support FP16 or complex numbers
    use_cython = (
        xp is np and X.data.contiguous and X.dtype.type in {np.float32, np.float64}
    )
    if use_cython:
        out = cy_isfinite(X.reshape(-1), allow_nan=allow_nan)
        has_nan_error = False if allow_nan else out == FiniteStatus.has_nan
        has_inf = out == FiniteStatus.has_infinite
    else:
        has_inf = xp.any(xp.isinf(X))
        has_nan_error = False if allow_nan else xp.any(xp.isnan(X))
    if has_inf or has_nan_error:
        if has_nan_error:
            type_err = "NaN"
        else:
            msg_dtype = msg_dtype if msg_dtype is not None else X.dtype
            type_err = f"infinity or a value too large for {msg_dtype!r}"
        padded_input_name = input_name + " " if input_name else ""
        msg_err = f"Input {padded_input_name}contains {type_err}."
        if estimator_name and input_name == "X" and has_nan_error:
            # Improve the error message on how to handle missing values in
            # scikit-learn.
            msg_err += (
                f"\n{estimator_name} does not accept missing values"
                " encoded as NaN natively. For supervised learning, you might want"
                " to consider sklearn.ensemble.HistGradientBoostingClassifier and"
                " Regressor which accept missing values encoded as NaNs natively."
                " Alternatively, it is possible to preprocess the data, for"
                " instance by using an imputer transformer in a pipeline or to"
                " drop samples with missing values. See"
                " https://scikit-learn.org/stable/modules/impute.html"
                " You can find a list of all estimators that handle NaN values"
                " at the following page:"
                " https://scikit-learn.org/stable/modules/impute.html"
                "#estimators-that-handle-nan-values"
            )
        raise ValueError(msg_err)


def assert_all_finite(
    X,
    *,
    allow_nan=False,
    estimator_name=None,
    input_name="",
):
    """Throw a ValueError if X contains NaN or infinity.

    Parameters
    ----------
    X : {ndarray, sparse matrix}
        The input data.

    allow_nan : bool, default=False
        If True, do not throw error when `X` contains NaN.

    estimator_name : str, default=None
        The estimator name, used to construct the error message.

    input_name : str, default=""
        The data name used to construct the error message. In particular
        if `input_name` is "X" and the data has NaN values and
        allow_nan is False, the error message will link to the imputer
        documentation.
    """
    _assert_all_finite(
        X.data if sp.issparse(X) else X,
        allow_nan=allow_nan,
        estimator_name=estimator_name,
        input_name=input_name,
    )

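# Illustrative usage (not part of the original module; a minimal sketch using
# the module's own np import):
#
#     assert_all_finite(np.array([1.0, 2.0]))                     # passes silently
#     assert_all_finite(np.array([1.0, np.nan]))                  # raises ValueError
#     assert_all_finite(np.array([1.0, np.nan]), allow_nan=True)  # passes; np.inf would still raise
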

def as_float_array(X, *, copy=True, force_all_finite=True):
    """Convert an array-like to an array of floats.

    The new dtype will be np.float32 or np.float64, depending on the original
    type. The function can create a copy or modify the argument depending
    on the argument copy.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        The input data.

    copy : bool, default=True
        If True, a copy of X will be created. If False, a copy may still be
        returned if X's dtype is not a floating point type.

    force_all_finite : bool or 'allow-nan', default=True
        Whether to raise an error on np.inf, np.nan, pd.NA in X. The
        possibilities are:

        - True: Force all values of X to be finite.
        - False: accepts np.inf, np.nan, pd.NA in X.
        - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot
          be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`

    Returns
    -------
    XT : {ndarray, sparse matrix}
        An array of type float.
    """
    if isinstance(X, np.matrix) or (
        not isinstance(X, np.ndarray) and not sp.issparse(X)
    ):
        return check_array(
            X,
            accept_sparse=["csr", "csc", "coo"],
            dtype=np.float64,
            copy=copy,
            force_all_finite=force_all_finite,
            ensure_2d=False,
        )
    elif sp.issparse(X) and X.dtype in [np.float32, np.float64]:
        return X.copy() if copy else X
    elif X.dtype in [np.float32, np.float64]:  # is numpy array
        return X.copy("F" if X.flags["F_CONTIGUOUS"] else "C") if copy else X
    else:
        if X.dtype.kind in "uib" and X.dtype.itemsize <= 4:
            return_dtype = np.float32
        else:
            return_dtype = np.float64
        return X.astype(return_dtype)

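# Illustrative behavior (not part of the original module; a sketch assuming
# the np import above):
#
#     as_float_array(np.arange(3, dtype=np.int32)).dtype  # -> float32 (ints of <= 4 bytes)
#     as_float_array(np.arange(3, dtype=np.int64)).dtype  # -> float64
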

def _is_arraylike(x):
    """Returns whether the input is array-like."""
    return hasattr(x, "__len__") or hasattr(x, "shape") or hasattr(x, "__array__")


def _is_arraylike_not_scalar(array):
    """Return True if array is array-like and not a scalar"""
    return _is_arraylike(array) and not np.isscalar(array)


def _use_interchange_protocol(X):
    """Use interchange protocol for non-pandas dataframes that follow the protocol.

    Note: at this point we chose not to use the interchange API on pandas dataframe
    to ensure strict behavioral backward compatibility with older versions of
    scikit-learn.
    """
    return not _is_pandas_df(X) and hasattr(X, "__dataframe__")


def _num_features(X):
    """Return the number of features in an array-like X.

    This helper function tries hard to avoid materializing an array version
    of X unless necessary. For instance, if X is a list of lists, this
    function will return the length of the first element, assuming that
    subsequent elements are all lists of the same length without checking.

    Parameters
    ----------
    X : array-like
        array-like to get the number of features.

    Returns
    -------
    features : int
        Number of features
    """
    type_ = type(X)
    if type_.__module__ == "builtins":
        type_name = type_.__qualname__
    else:
        type_name = f"{type_.__module__}.{type_.__qualname__}"
    message = f"Unable to find the number of features from X of type {type_name}"
    if not hasattr(X, "__len__") and not hasattr(X, "shape"):
        if not hasattr(X, "__array__"):
            raise TypeError(message)
        # Only convert X to a numpy array if there is no cheaper, heuristic
        # option.
        X = np.asarray(X)

    if hasattr(X, "shape"):
        if not hasattr(X.shape, "__len__") or len(X.shape) <= 1:
            message += f" with shape {X.shape}"
            raise TypeError(message)
        return X.shape[1]

    first_sample = X[0]

    # Do not consider an array-like of strings or dicts to be a 2D array
    if isinstance(first_sample, (str, bytes, dict)):
        message += f" where the samples are of type {type(first_sample).__qualname__}"
        raise TypeError(message)

    try:
        # If X is a list of lists, for instance, we assume that all nested
        # lists have the same length without checking or converting to
        # a numpy array to keep this function call as cheap as possible.
        return len(first_sample)
    except Exception as err:
        raise TypeError(message) from err


def _num_samples(x):
    """Return number of samples in array-like x."""
    message = "Expected sequence or array-like, got %s" % type(x)
    if hasattr(x, "fit") and callable(x.fit):
        # Don't get num_samples from an ensemble's length!
        raise TypeError(message)

    if _use_interchange_protocol(x):
        return x.__dataframe__().num_rows()

    if not hasattr(x, "__len__") and not hasattr(x, "shape"):
        if hasattr(x, "__array__"):
            x = np.asarray(x)
        else:
            raise TypeError(message)

    if hasattr(x, "shape") and x.shape is not None:
        if len(x.shape) == 0:
            raise TypeError(
                "Singleton array %r cannot be considered a valid collection." % x
            )
        # Check that shape is returning an integer or default to len
        # Dask dataframes may not return numeric shape[0] value
        if isinstance(x.shape[0], numbers.Integral):
            return x.shape[0]

    try:
        return len(x)
    except TypeError as type_error:
        raise TypeError(message) from type_error

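# Illustrative behavior of the two helpers above (not part of the original
# module):
#
#     _num_features([[1, 2, 3], [4, 5, 6]])  # -> 3, the length of the first sample
#     _num_samples(np.zeros((5, 2)))         # -> 5, the size of the first axis
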

def check_memory(memory):
    """Check that ``memory`` is joblib.Memory-like.

    joblib.Memory-like means that ``memory`` can be converted into a
    joblib.Memory instance (typically a str denoting the ``location``)
    or has the same interface (has a ``cache`` method).

    Parameters
    ----------
    memory : None, str or object with the joblib.Memory interface
        - If string, the location where to create the `joblib.Memory` interface.
        - If None, no caching is done and the Memory object is completely transparent.

    Returns
    -------
    memory : object with the joblib.Memory interface
        A correct joblib.Memory object.

    Raises
    ------
    ValueError
        If ``memory`` is not joblib.Memory-like.
    """
    if memory is None or isinstance(memory, str):
        memory = joblib.Memory(location=memory, verbose=0)
    elif not hasattr(memory, "cache"):
        raise ValueError(
            "'memory' should be None, a string or have the same"
            " interface as joblib.Memory."
            " Got memory='{}' instead.".format(memory)
        )
    return memory

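# Illustrative usage (not part of the original module; the cache directory
# below is hypothetical):
#
#     mem = check_memory(None)                  # transparent joblib.Memory, no caching
#     mem = check_memory("/tmp/sklearn_cache")  # joblib.Memory caching at that location
#     check_memory(42)                          # raises ValueError: no `cache` method
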

def check_consistent_length(*arrays):
    """Check that all arrays have consistent first dimensions.

    Checks whether all objects in arrays have the same shape or length.

    Parameters
    ----------
    *arrays : list or tuple of input objects.
        Objects that will be checked for consistent length.
    """

    lengths = [_num_samples(X) for X in arrays if X is not None]
    uniques = np.unique(lengths)
    if len(uniques) > 1:
        raise ValueError(
            "Found input variables with inconsistent numbers of samples: %r"
            % [int(l) for l in lengths]
        )

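# Illustrative usage (not part of the original module):
#
#     check_consistent_length([1, 2, 3], np.zeros(3))  # passes silently; None inputs are skipped
#     check_consistent_length([1, 2], [1, 2, 3])       # raises ValueError
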

def _make_indexable(iterable):
    """Ensure iterable supports indexing or convert to an indexable variant.

    Convert sparse matrices to csr and other non-indexable iterable to arrays.
    Let `None` and indexable objects (e.g. pandas dataframes) pass unchanged.

    Parameters
    ----------
    iterable : {list, dataframe, ndarray, sparse matrix} or None
        Object to be converted to an indexable iterable.
    """
    if sp.issparse(iterable):
        return iterable.tocsr()
    elif hasattr(iterable, "__getitem__") or hasattr(iterable, "iloc"):
        return iterable
    elif iterable is None:
        return iterable
    return np.array(iterable)


def indexable(*iterables):
    """Make arrays indexable for cross-validation.

    Checks consistent length, passes through None, and ensures that everything
    can be indexed by converting sparse matrices to csr and converting
    non-indexable objects to arrays.

    Parameters
    ----------
    *iterables : {lists, dataframes, ndarrays, sparse matrices}
        List of objects to ensure sliceability.

    Returns
    -------
    result : list of {ndarray, sparse matrix, dataframe} or None
        Returns a list containing indexable arrays (i.e. NumPy array,
        sparse matrix, or dataframe) or `None`.
    """

    result = [_make_indexable(X) for X in iterables]
    check_consistent_length(*result)
    return result

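# Illustrative usage (not part of the original module; a sketch using the
# module's own np/sp imports):
#
#     X, y = indexable(sp.coo_matrix(np.eye(3)), [0, 1, 2])
#     # X is converted to CSR so rows can be indexed; the list passes through
#     # unchanged, and both inputs are checked for consistent length.
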

def _ensure_sparse_format(
    sparse_container,
    accept_sparse,
    dtype,
    copy,
    force_all_finite,
    accept_large_sparse,
    estimator_name=None,
    input_name="",
):
    """Convert a sparse container to a given format.

    Checks the sparse format of `sparse_container` and converts if necessary.

    Parameters
    ----------
    sparse_container : sparse matrix or array
        Input to validate and convert.

    accept_sparse : str, bool or list/tuple of str
        String[s] representing allowed sparse matrix formats ('csc',
        'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'). If the input is sparse but
        not in the allowed format, it will be converted to the first listed
        format. True allows the input to be any format. False means
        that a sparse matrix input will raise an error.

    dtype : str, type or None
        Data type of result. If None, the dtype of the input is preserved.

    copy : bool
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : bool or 'allow-nan'
        Whether to raise an error on np.inf, np.nan, pd.NA in X. The
        possibilities are:

        - True: Force all values of X to be finite.
        - False: accepts np.inf, np.nan, pd.NA in X.
        - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot
          be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`

    estimator_name : str, default=None
        The estimator name, used to construct the error message.

    input_name : str, default=""
        The data name used to construct the error message. In particular
        if `input_name` is "X" and the data has NaN values and
        allow_nan is False, the error message will link to the imputer
        documentation.

    Returns
    -------
    sparse_container_converted : sparse matrix or array
        Sparse container (matrix/array) that is ensured to have an allowed type.
    """
    if dtype is None:
        dtype = sparse_container.dtype

    changed_format = False
    sparse_container_type_name = type(sparse_container).__name__

    if isinstance(accept_sparse, str):
        accept_sparse = [accept_sparse]

    # Indices dtype validation
    _check_large_sparse(sparse_container, accept_large_sparse)

    if accept_sparse is False:
        padded_input = " for " + input_name if input_name else ""
        raise TypeError(
            f"Sparse data was passed{padded_input}, but dense data is required. "
            "Use '.toarray()' to convert to a dense numpy array."
        )
    elif isinstance(accept_sparse, (list, tuple)):
        if len(accept_sparse) == 0:
            raise ValueError(
                "When providing 'accept_sparse' as a tuple or list, it must contain at "
                "least one string value."
            )
        # ensure correct sparse format
        if sparse_container.format not in accept_sparse:
            # create new with correct sparse
            sparse_container = sparse_container.asformat(accept_sparse[0])
            changed_format = True
    elif accept_sparse is not True:
        # any other type
        raise ValueError(
            "Parameter 'accept_sparse' should be a string, boolean or list of strings."
            f" You provided 'accept_sparse={accept_sparse}'."
        )

    if dtype != sparse_container.dtype:
        # convert dtype
        sparse_container = sparse_container.astype(dtype)
    elif copy and not changed_format:
        # force copy
        sparse_container = sparse_container.copy()

    if force_all_finite:
        if not hasattr(sparse_container, "data"):
            warnings.warn(
                f"Can't check {sparse_container.format} sparse matrix for nan or inf.",
                stacklevel=2,
            )
        else:
            _assert_all_finite(
                sparse_container.data,
                allow_nan=force_all_finite == "allow-nan",
                estimator_name=estimator_name,
                input_name=input_name,
            )

    # TODO: Remove when the minimum version of SciPy supported is 1.12
    # With SciPy sparse arrays, conversion from DIA format to COO, CSR, or BSR
    # triggers the use of `np.int64` indices even if the data is such that it could
    # be more efficiently represented with `np.int32` indices.
    # https://github.com/scipy/scipy/issues/19245
    # Since not all scikit-learn algorithms support large indices, the following
    # code downcasts to `np.int32` indices when it's safe to do so.
    if changed_format:
        # accept_sparse is specified to a specific format and a conversion occurred
        requested_sparse_format = accept_sparse[0]
        _preserve_dia_indices_dtype(
            sparse_container, sparse_container_type_name, requested_sparse_format
        )

    return sparse_container


def _ensure_no_complex_data(array):
    if (
        hasattr(array, "dtype")
        and array.dtype is not None
        and hasattr(array.dtype, "kind")
        and array.dtype.kind == "c"
    ):
        raise ValueError("Complex data not supported\n{}\n".format(array))


def _check_estimator_name(estimator):
    if estimator is not None:
        if isinstance(estimator, str):
            return estimator
        else:
            return estimator.__class__.__name__
    return None


def _pandas_dtype_needs_early_conversion(pd_dtype):
    """Return True if pandas extension pd_dtype needs to be converted early."""
    # Check these early for pandas versions without extension dtypes
    from pandas import SparseDtype
    from pandas.api.types import (
        is_bool_dtype,
        is_float_dtype,
        is_integer_dtype,
    )

    if is_bool_dtype(pd_dtype):
        # bool and extension booleans need early conversion because __array__
        # converts mixed dtype dataframes into object dtypes
        return True

    if isinstance(pd_dtype, SparseDtype):
        # Sparse arrays will be converted later in `check_array`
        return False

    try:
        from pandas.api.types import is_extension_array_dtype
    except ImportError:
        return False

    if isinstance(pd_dtype, SparseDtype) or not is_extension_array_dtype(pd_dtype):
        # Sparse arrays will be converted later in `check_array`
        # Only handle extension arrays for integer and floats
        return False
    elif is_float_dtype(pd_dtype):
        # Float ndarrays can normally support nans. They need to be converted
        # first to map pd.NA to np.nan
        return True
    elif is_integer_dtype(pd_dtype):
        # XXX: Warn when converting from a high integer to a float
        return True

    return False


def _is_extension_array_dtype(array):
    # Pandas extension arrays have a dtype with an na_value
    return hasattr(array, "dtype") and hasattr(array.dtype, "na_value")


def check_array(
    array,
    accept_sparse=False,
    *,
    accept_large_sparse=True,
    dtype="numeric",
    order=None,
    copy=False,
    force_all_finite=True,
    ensure_2d=True,
    allow_nd=False,
    ensure_min_samples=1,
    ensure_min_features=1,
    estimator=None,
    input_name="",
):
    """Input validation on an array, list, sparse matrix or similar.

    By default, the input is checked to be a non-empty 2D array containing
    only finite values. If the dtype of the array is object, attempt
    converting to float, raising on failure.

    Parameters
    ----------
    array : object
        Input object to check / convert.

    accept_sparse : str, bool or list/tuple of str, default=False
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.

    accept_large_sparse : bool, default=True
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse=False will cause it to be accepted
        only if its indices are stored with a 32-bit dtype.

        .. versionadded:: 0.20

    dtype : 'numeric', type, list of type or None, default='numeric'
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.

    order : {'F', 'C'} or None, default=None
        Whether an array will be forced to be fortran or c-style.
        When order is None (default), then if copy=False, nothing is ensured
        about the memory layout of the output array; otherwise (copy=True)
        the memory layout of the returned array is kept as close as possible
        to the original array.

    copy : bool, default=False
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : bool or 'allow-nan', default=True
        Whether to raise an error on np.inf, np.nan, pd.NA in array. The
        possibilities are:

        - True: Force all values of array to be finite.
        - False: accepts np.inf, np.nan, pd.NA in array.
        - 'allow-nan': accepts only np.nan and pd.NA values in array. Values
          cannot be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`

    ensure_2d : bool, default=True
        Whether to raise a value error if array is not 2D.

    allow_nd : bool, default=False
        Whether to allow array.ndim > 2.

    ensure_min_samples : int, default=1
        Make sure that the array has a minimum number of samples in its first
        axis (rows for a 2D array). Setting to 0 disables this check.

    ensure_min_features : int, default=1
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when the input data has effectively 2
        dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
        disables this check.

    estimator : str or estimator instance, default=None
        If passed, include the name of the estimator in warning messages.

    input_name : str, default=""
        The data name used to construct the error message. In particular
        if `input_name` is "X" and the data has NaN values and
        allow_nan is False, the error message will link to the imputer
        documentation.

        .. versionadded:: 1.1.0

    Returns
    -------
    array_converted : object
        The converted and validated array.
    """
    if isinstance(array, np.matrix):
        raise TypeError(
            "np.matrix is not supported. Please convert to a numpy array with "
            "np.asarray. For more information see: "
            "https://numpy.org/doc/stable/reference/generated/numpy.matrix.html"
        )

    xp, is_array_api_compliant = get_namespace(array)

    # store reference to original array to check if copy is needed when
    # function returns
    array_orig = array

    # store whether originally we wanted numeric dtype
    dtype_numeric = isinstance(dtype, str) and dtype == "numeric"

    dtype_orig = getattr(array, "dtype", None)
    if not is_array_api_compliant and not hasattr(dtype_orig, "kind"):
        # not a data type (e.g. a column named dtype in a pandas DataFrame)
        dtype_orig = None

    # check if the object contains several dtypes (typically a pandas
    # DataFrame), and store them. If not, store None.
    dtypes_orig = None
    pandas_requires_conversion = False
    if hasattr(array, "dtypes") and hasattr(array.dtypes, "__array__"):
        # throw warning if columns are sparse. If all columns are sparse, then
        # array.sparse exists and sparsity will be preserved (later).
        with suppress(ImportError):
            from pandas import SparseDtype

            def is_sparse(dtype):
                return isinstance(dtype, SparseDtype)

            if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
                warnings.warn(
                    "pandas.DataFrame with sparse columns found. "
                    "It will be converted to a dense numpy array."
                )

        dtypes_orig = list(array.dtypes)
        pandas_requires_conversion = any(
            _pandas_dtype_needs_early_conversion(i) for i in dtypes_orig
        )
        if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig):
            dtype_orig = np.result_type(*dtypes_orig)
        elif pandas_requires_conversion and any(d == object for d in dtypes_orig):
            # Force object if any of the dtypes is an object
            dtype_orig = object

    elif (_is_extension_array_dtype(array) or hasattr(array, "iloc")) and hasattr(
        array, "dtype"
    ):
        # array is a pandas series
        pandas_requires_conversion = _pandas_dtype_needs_early_conversion(array.dtype)
        if isinstance(array.dtype, np.dtype):
            dtype_orig = array.dtype
        else:
            # Set to None to let array.astype work out the best dtype
            dtype_orig = None

    if dtype_numeric:
        if (
            dtype_orig is not None
            and hasattr(dtype_orig, "kind")
            and dtype_orig.kind == "O"
        ):
            # if input is object, convert to float.
            dtype = xp.float64
        else:
            dtype = None

    if isinstance(dtype, (list, tuple)):
        if dtype_orig is not None and dtype_orig in dtype:
            # no dtype conversion required
            dtype = None
        else:
            # dtype conversion required. Let's select the first element of the
            # list of accepted types.
            dtype = dtype[0]

    if pandas_requires_conversion:
        # pandas dataframe requires conversion earlier to handle extension dtypes with
        # nans
        # Use the original dtype for conversion if dtype is None
        new_dtype = dtype_orig if dtype is None else dtype
        array = array.astype(new_dtype)
        # Since we converted here, we do not need to convert again later
        dtype = None

    if force_all_finite not in (True, False, "allow-nan"):
        raise ValueError(
            'force_all_finite should be a bool or "allow-nan". Got {!r} instead'.format(
                force_all_finite
            )
        )

    if dtype is not None and _is_numpy_namespace(xp):
        # convert to dtype object to conform to Array API to be able to use
        # `xp.isdtype` later
        dtype = np.dtype(dtype)

    estimator_name = _check_estimator_name(estimator)
    context = " by %s" % estimator_name if estimator is not None else ""

    # When all dataframe columns are sparse, convert to a sparse array
    if hasattr(array, "sparse") and array.ndim > 1:
        with suppress(ImportError):
            from pandas import SparseDtype  # noqa: F811

            def is_sparse(dtype):
                return isinstance(dtype, SparseDtype)

            if array.dtypes.apply(is_sparse).all():
                # DataFrame.sparse only supports `to_coo`
                array = array.sparse.to_coo()
                if array.dtype == np.dtype("object"):
                    unique_dtypes = set([dt.subtype.name for dt in array_orig.dtypes])
                    if len(unique_dtypes) > 1:
                        raise ValueError(
                            "Pandas DataFrame with mixed sparse extension arrays "
                            "generated a sparse matrix with object dtype which "
                            "can not be converted to a scipy sparse matrix. "
                            "Sparse extension arrays should all have the same "
                            "numeric type."
                        )

    if sp.issparse(array):
        _ensure_no_complex_data(array)
        array = _ensure_sparse_format(
            array,
            accept_sparse=accept_sparse,
            dtype=dtype,
            copy=copy,
            force_all_finite=force_all_finite,
            accept_large_sparse=accept_large_sparse,
            estimator_name=estimator_name,
            input_name=input_name,
        )
    else:
        # If np.array(..) gives ComplexWarning, then we convert the warning
        # to an error. This is needed because specifying a non complex
        # dtype to the function converts complex to real dtype,
        # thereby passing the test made in the lines following the scope
        # of warnings context manager.
        with warnings.catch_warnings():
            try:
                warnings.simplefilter("error", ComplexWarning)
                if dtype is not None and xp.isdtype(dtype, "integral"):
                    # Conversion float -> int should not contain NaN or
                    # inf (numpy#14412). We cannot use casting='safe' because
                    # then conversion float -> int would be disallowed.
                    array = _asarray_with_order(array, order=order, xp=xp)
                    if xp.isdtype(array.dtype, ("real floating", "complex floating")):
                        _assert_all_finite(
                            array,
                            allow_nan=False,
                            msg_dtype=dtype,
                            estimator_name=estimator_name,
                            input_name=input_name,
                        )
                    array = xp.astype(array, dtype, copy=False)
                else:
                    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            except ComplexWarning as complex_warning:
                raise ValueError(
                    "Complex data not supported\n{}\n".format(array)
                ) from complex_warning

        # It is possible that the np.array(..) gave no warning. This happens
        # when no dtype conversion happened, for example dtype = None. The
        # result is that np.array(..) produces an array of complex dtype
        # and we need to catch and raise exception for such cases.
        _ensure_no_complex_data(array)

        if ensure_2d:
            # If input is scalar raise error
            if array.ndim == 0:
                raise ValueError(
                    "Expected 2D array, got scalar array instead:\narray={}.\n"
                    "Reshape your data either using array.reshape(-1, 1) if "
                    "your data has a single feature or array.reshape(1, -1) "
                    "if it contains a single sample.".format(array)
                )
            # If input is 1D raise error
            if array.ndim == 1:
                raise ValueError(
                    "Expected 2D array, got 1D array instead:\narray={}.\n"
                    "Reshape your data either using array.reshape(-1, 1) if "
                    "your data has a single feature or array.reshape(1, -1) "
                    "if it contains a single sample.".format(array)
                )

        if dtype_numeric and hasattr(array.dtype, "kind") and array.dtype.kind in "USV":
            raise ValueError(
                "dtype='numeric' is not compatible with arrays of bytes/strings. "
                "Convert your data to numeric values explicitly instead."
            )
        if not allow_nd and array.ndim >= 3:
            raise ValueError(
                "Found array with dim %d. %s expected <= 2."
                % (array.ndim, estimator_name)
            )

        if force_all_finite:
            _assert_all_finite(
                array,
                input_name=input_name,
                estimator_name=estimator_name,
                allow_nan=force_all_finite == "allow-nan",
            )

        if copy:
            if _is_numpy_namespace(xp):
                # only make a copy if `array` and `array_orig` may share memory
                if np.may_share_memory(array, array_orig):
                    array = _asarray_with_order(
                        array, dtype=dtype, order=order, copy=True, xp=xp
                    )
            else:
                # always make a copy for non-numpy arrays
                array = _asarray_with_order(
                    array, dtype=dtype, order=order, copy=True, xp=xp
                )

    if ensure_min_samples > 0:
        n_samples = _num_samples(array)
        if n_samples < ensure_min_samples:
            raise ValueError(
                "Found array with %d sample(s) (shape=%s) while a"
                " minimum of %d is required%s."
                % (n_samples, array.shape, ensure_min_samples, context)
            )

    if ensure_min_features > 0 and array.ndim == 2:
        n_features = array.shape[1]
        if n_features < ensure_min_features:
            raise ValueError(
                "Found array with %d feature(s) (shape=%s) while"
                " a minimum of %d is required%s."
                % (n_features, array.shape, ensure_min_features, context)
            )

    return array

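# Illustrative usage (not part of the original module; a sketch using the
# module's own np import):
#
#     X = check_array([[1.0, 2.0], [3.0, 4.0]])  # -> validated 2D float ndarray
#     check_array([1, 2, 3])                     # raises ValueError: expected 2D array
#     check_array([[1.0, np.nan]], force_all_finite="allow-nan")  # NaN tolerated
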

def _check_large_sparse(X, accept_large_sparse=False):
    """Raise a ValueError if X has 64bit indices and accept_large_sparse=False"""
    if not accept_large_sparse:
        supported_indices = ["int32"]
        if X.format == "coo":
            index_keys = ["col", "row"]
        elif X.format in ["csr", "csc", "bsr"]:
            index_keys = ["indices", "indptr"]
        else:
            return
        for key in index_keys:
            indices_datatype = getattr(X, key).dtype
            if indices_datatype not in supported_indices:
                raise ValueError(
                    "Only sparse matrices with 32-bit integer indices are accepted."
                    f" Got {indices_datatype} indices. Please do report a minimal"
                    " reproducer on scikit-learn issue tracker so that support for"
                    " your use-case can be studied by maintainers. See:"
                    " https://scikit-learn.org/dev/developers/minimal_reproducer.html"
                )


def check_X_y(
    X,
    y,
    accept_sparse=False,
    *,
    accept_large_sparse=True,
    dtype="numeric",
    order=None,
    copy=False,
    force_all_finite=True,
    ensure_2d=True,
    allow_nd=False,
    multi_output=False,
    ensure_min_samples=1,
    ensure_min_features=1,
    y_numeric=False,
    estimator=None,
):
    """Input validation for standard estimators.

    Checks X and y for consistent length, enforces X to be 2D and y 1D. By
    default, X is checked to be non-empty and containing only finite values.
    Standard input checks are also applied to y, such as checking that y
    does not have np.nan or np.inf targets. For multi-label y, set
    multi_output=True to allow 2D and sparse y. If the dtype of X is
    object, attempt converting to float, raising on failure.

    Parameters
    ----------
    X : {ndarray, list, sparse matrix}
        Input data.

    y : {ndarray, list, sparse matrix}
        Labels.

    accept_sparse : str, bool or list of str, default=False
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.

    accept_large_sparse : bool, default=True
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse will cause it to be accepted only
        if its indices are stored with a 32-bit dtype.

        .. versionadded:: 0.20

    dtype : 'numeric', type, list of type or None, default='numeric'
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.

    order : {'F', 'C'}, default=None
        Whether an array will be forced to be fortran or c-style. If
        `None`, then the input data's order is preserved when possible.

    copy : bool, default=False
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : bool or 'allow-nan', default=True
        Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter
        does not influence whether y can have np.inf, np.nan, pd.NA values.
        The possibilities are:

        - True: Force all values of X to be finite.
        - False: accepts np.inf, np.nan, pd.NA in X.
        - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot
          be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`

    ensure_2d : bool, default=True
        Whether to raise a value error if X is not 2D.

    allow_nd : bool, default=False
        Whether to allow X.ndim > 2.

    multi_output : bool, default=False
        Whether to allow 2D y (array or sparse matrix). If false, y will be
        validated as a vector. y cannot have np.nan or np.inf values if
        multi_output=True.

    ensure_min_samples : int, default=1
        Make sure that X has a minimum number of samples in its first
        axis (rows for a 2D array).

    ensure_min_features : int, default=1
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when X has effectively 2 dimensions or
        is originally 1D and ``ensure_2d`` is True. Setting to 0 disables
        this check.

    y_numeric : bool, default=False
        Whether to ensure that y has a numeric type. If dtype of y is object,
        it is converted to float64. Should only be used for regression
        algorithms.

    estimator : str or estimator instance, default=None
        If passed, include the name of the estimator in warning messages.

    Returns
    -------
    X_converted : object
        The converted and validated X.

    y_converted : object
        The converted and validated y.
    """
    if y is None:
        if estimator is None:
            estimator_name = "estimator"
        else:
            estimator_name = _check_estimator_name(estimator)
        raise ValueError(
            f"{estimator_name} requires y to be passed, but the target y is None"
        )

    X = check_array(
        X,
        accept_sparse=accept_sparse,
        accept_large_sparse=accept_large_sparse,
        dtype=dtype,
        order=order,
        copy=copy,
        force_all_finite=force_all_finite,
        ensure_2d=ensure_2d,
        allow_nd=allow_nd,
        ensure_min_samples=ensure_min_samples,
        ensure_min_features=ensure_min_features,
        estimator=estimator,
        input_name="X",
    )

    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)

    check_consistent_length(X, y)

    return X, y

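# Illustrative usage (not part of the original module):
#
#     X, y = check_X_y([[1, 2], [3, 4]], [0, 1])
#     # X -> ndarray of shape (2, 2); y -> 1D ndarray of shape (2,)
#     check_X_y([[1, 2]], None)  # raises ValueError: the target y is required
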

def _check_y(y, multi_output=False, y_numeric=False, estimator=None):
    """Isolated part of check_X_y dedicated to y validation"""
    if multi_output:
        y = check_array(
            y,
            accept_sparse="csr",
            force_all_finite=True,
            ensure_2d=False,
            dtype=None,
            input_name="y",
            estimator=estimator,
        )
    else:
        estimator_name = _check_estimator_name(estimator)
        y = column_or_1d(y, warn=True)
        _assert_all_finite(y, input_name="y", estimator_name=estimator_name)
        _ensure_no_complex_data(y)
        if y_numeric and y.dtype.kind == "O":
            y = y.astype(np.float64)

    return y


def column_or_1d(y, *, dtype=None, warn=False):
    """Ravel column or 1d numpy array, else raises an error.

    Parameters
    ----------
    y : array-like
        Input data.

    dtype : data-type, default=None
        Data type for `y`.

        .. versionadded:: 1.2

    warn : bool, default=False
        To control display of warnings.

    Returns
    -------
    y : ndarray
        Output data.

    Raises
    ------
    ValueError
        If `y` is not a 1D array or a 2D array with a single row or column.
    """
    xp, _ = get_namespace(y)
    y = check_array(
        y,
        ensure_2d=False,
        dtype=dtype,
        input_name="y",
        force_all_finite=False,
        ensure_min_samples=0,
    )

    shape = y.shape
    if len(shape) == 1:
        return _asarray_with_order(xp.reshape(y, (-1,)), order="C", xp=xp)
    if len(shape) == 2 and shape[1] == 1:
        if warn:
            warnings.warn(
                (
                    "A column-vector y was passed when a 1d array was"
                    " expected. Please change the shape of y to "
                    "(n_samples, ), for example using ravel()."
                ),
                DataConversionWarning,
                stacklevel=2,
            )
        return _asarray_with_order(xp.reshape(y, (-1,)), order="C", xp=xp)

    raise ValueError(
        "y should be a 1d array, got an array of shape {} instead.".format(shape)
    )

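# Illustrative usage (not part of the original module; assumes the np import
# above):
#
#     column_or_1d(np.array([[1], [2], [3]]), warn=True)
#     # -> array([1, 2, 3]), with a DataConversionWarning about the column vector
#     column_or_1d(np.ones((2, 2)))  # raises ValueError: not 1d or a single column
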

def check_random_state(seed):
    """Turn seed into a np.random.RandomState instance.

    Parameters
    ----------
    seed : None, int or instance of RandomState
        If seed is None, return the RandomState singleton used by np.random.
        If seed is an int, return a new RandomState instance seeded with seed.
        If seed is already a RandomState instance, return it.
        Otherwise raise ValueError.

    Returns
    -------
    :class:`numpy:numpy.random.RandomState`
        The random state object based on `seed` parameter.
    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, numbers.Integral):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    raise ValueError(
        "%r cannot be used to seed a numpy.random.RandomState instance" % seed
    )

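# Illustrative usage (not part of the original module):
#
#     rng = check_random_state(0)      # new RandomState seeded with 0
#     check_random_state(rng) is rng   # -> True; instances pass through unchanged
#     check_random_state(None)         # -> the global np.random singleton
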

def has_fit_parameter(estimator, parameter):
    """Check whether the estimator's fit method supports the given parameter.

    Parameters
    ----------
    estimator : object
        An estimator to inspect.

    parameter : str
        The searched parameter.

    Returns
    -------
    is_parameter : bool
        Whether the parameter was found to be a named parameter of the
        estimator's fit method.

    Examples
    --------
    >>> from sklearn.svm import SVC
    >>> from sklearn.utils.validation import has_fit_parameter
    >>> has_fit_parameter(SVC(), "sample_weight")
    True
    """
    return parameter in signature(estimator.fit).parameters


def check_symmetric(array, *, tol=1e-10, raise_warning=True, raise_exception=False):
    """Make sure that array is 2D, square and symmetric.

    If the array is not symmetric, then a symmetrized version is returned.
    Optionally, a warning or exception is raised if the matrix is not
    symmetric.

    Parameters
    ----------
    array : {ndarray, sparse matrix}
        Input object to check / convert. Must be two-dimensional and square,
        otherwise a ValueError will be raised.

    tol : float, default=1e-10
        Absolute tolerance for equivalence of arrays. Default = 1E-10.

    raise_warning : bool, default=True
        If True then raise a warning if conversion is required.

    raise_exception : bool, default=False
        If True then raise an exception if array is not symmetric.

    Returns
    -------
    array_sym : {ndarray, sparse matrix}
        Symmetrized version of the input array, i.e. the average of array
        and array.transpose(). If sparse, then duplicate entries are first
        summed and zeros are eliminated.
    """
    if (array.ndim != 2) or (array.shape[0] != array.shape[1]):
        raise ValueError(
            "array must be 2-dimensional and square. shape = {0}".format(array.shape)
        )

    if sp.issparse(array):
        diff = array - array.T
        # only csr, csc, and coo have `data` attribute
        if diff.format not in ["csr", "csc", "coo"]:
            diff = diff.tocsr()
        symmetric = np.all(abs(diff.data) < tol)
    else:
        symmetric = np.allclose(array, array.T, atol=tol)

    if not symmetric:
        if raise_exception:
            raise ValueError("Array must be symmetric")
        if raise_warning:
            warnings.warn(
                (
                    "Array is not symmetric, and will be converted "
                    "to symmetric by average with its transpose."
                ),
                stacklevel=2,
            )
        if sp.issparse(array):
            conversion = "to" + array.format
            array = getattr(0.5 * (array + array.T), conversion)()
        else:
            array = 0.5 * (array + array.T)

    return array

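# Illustrative usage (not part of the original module; assumes the np import
# above):
#
#     A = np.array([[0.0, 1.0], [2.0, 0.0]])
#     check_symmetric(A, raise_warning=False)
#     # -> array([[0. , 1.5], [1.5, 0. ]]), the average of A and A.T
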

def _is_fitted(estimator, attributes=None, all_or_any=all):
    """Determine if an estimator is fitted.

    Parameters
    ----------
    estimator : estimator instance
        Estimator instance for which the check is performed.

    attributes : str, list or tuple of str, default=None
        Attribute name(s) given as string or a list/tuple of strings
        Eg.: ``["coef_", "estimator_", ...], "coef_"``

        If `None`, `estimator` is considered fitted if there exists an
        attribute that ends with an underscore and does not start with a
        double underscore.

    all_or_any : callable, {all, any}, default=all
        Specify whether all or any of the given attributes must exist.

    Returns
    -------
    fitted : bool
        Whether the estimator is fitted.
    """
    if attributes is not None:
        if not isinstance(attributes, (list, tuple)):
            attributes = [attributes]
        return all_or_any([hasattr(estimator, attr) for attr in attributes])

    if hasattr(estimator, "__sklearn_is_fitted__"):
        return estimator.__sklearn_is_fitted__()

    fitted_attrs = [
        v for v in vars(estimator) if v.endswith("_") and not v.startswith("__")
    ]
    return len(fitted_attrs) > 0


def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all):
    """Perform is_fitted validation for estimator.

    Checks if the estimator is fitted by verifying the presence of
    fitted attributes (ending with a trailing underscore) and otherwise
    raises a NotFittedError with the given message.

    If an estimator does not set any attributes with a trailing underscore, it
    can define a ``__sklearn_is_fitted__`` method returning a boolean to
    specify if the estimator is fitted or not. See
    :ref:`sphx_glr_auto_examples_developing_estimators_sklearn_is_fitted.py`
    for an example on how to use the API.

    Parameters
    ----------
    estimator : estimator instance
        Estimator instance for which the check is performed.

    attributes : str, list or tuple of str, default=None
        Attribute name(s) given as string or a list/tuple of strings
        Eg.: ``["coef_", "estimator_", ...], "coef_"``

        If `None`, `estimator` is considered fitted if there exists an
        attribute that ends with an underscore and does not start with a
        double underscore.

    msg : str, default=None
        The default error message is, "This %(name)s instance is not fitted
        yet. Call 'fit' with appropriate arguments before using this
        estimator."

        For custom messages if "%(name)s" is present in the message string,
        it is substituted for the estimator name.

        Eg. : "Estimator, %(name)s, must be fitted before sparsifying".

    all_or_any : callable, {all, any}, default=all
        Specify whether all or any of the given attributes must exist.

    Raises
    ------
    TypeError
        If the estimator is a class or not an estimator instance

    NotFittedError
        If the attributes are not found.
    """
    if isclass(estimator):
        raise TypeError("{} is a class, not an instance.".format(estimator))
    if msg is None:
        msg = (
            "This %(name)s instance is not fitted yet. Call 'fit' with "
            "appropriate arguments before using this estimator."
        )

    if not hasattr(estimator, "fit"):
        raise TypeError("%s is not an estimator instance." % (estimator))

    if not _is_fitted(estimator, attributes, all_or_any):
        raise NotFittedError(msg % {"name": type(estimator).__name__})

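# Illustrative usage (not part of the original module; a minimal sketch):
#
#     from sklearn.linear_model import LinearRegression
#     est = LinearRegression()
#     check_is_fitted(est)         # raises NotFittedError
#     est.fit([[1], [2]], [1, 2])
#     check_is_fitted(est)         # passes: fitted attributes like `coef_` now exist
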

def check_non_negative(X, whom):
    """
    Check if there is any negative value in an array.

    Parameters
    ----------
    X : {array-like, sparse matrix}
        Input data.

    whom : str
        Who passed X to this function.
    """
    xp, _ = get_namespace(X)
    # avoid X.min() on sparse matrix since it also sorts the indices
    if sp.issparse(X):
        if X.format in ["lil", "dok"]:
            X = X.tocsr()
        if X.data.size == 0:
            X_min = 0
        else:
            X_min = X.data.min()
    else:
        X_min = xp.min(X)

    if X_min < 0:
        raise ValueError("Negative values in data passed to %s" % whom)

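# Illustrative usage (not part of the original module; the caller name is
# hypothetical):
#
#     check_non_negative(np.array([[0, 1], [2, 3]]), "MyEstimator")  # passes
#     check_non_negative(np.array([-1.0, 2.0]), "MyEstimator")
#     # raises ValueError: Negative values in data passed to MyEstimator
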

1521def check_scalar( 

1522 x, 

1523 name, 

1524 target_type, 

1525 *, 

1526 min_val=None, 

1527 max_val=None, 

1528 include_boundaries="both", 

1529): 

1530 """Validate scalar parameters type and value. 

1531 

1532 Parameters 

1533 ---------- 

1534 x : object 

1535 The scalar parameter to validate. 

1536 

1537 name : str 

1538 The name of the parameter to be printed in error messages. 

1539 

1540 target_type : type or tuple 

1541 Acceptable data types for the parameter. 

1542 

1543 min_val : float or int, default=None 

1544 The minimum valid value the parameter can take. If None (default) it 

1545 is implied that the parameter does not have a lower bound. 

1546 

1547 max_val : float or int, default=None 

1548 The maximum valid value the parameter can take. If None (default) it 

1549 is implied that the parameter does not have an upper bound. 

1550 

1551 include_boundaries : {"left", "right", "both", "neither"}, default="both" 

1552 Whether the interval defined by `min_val` and `max_val` should include 

1553 the boundaries. Possible choices are: 

1554 

1555 - `"left"`: only `min_val` is included in the valid interval. 

1556 It is equivalent to the interval `[ min_val, max_val )`. 

1557 - `"right"`: only `max_val` is included in the valid interval. 

1558 It is equivalent to the interval `( min_val, max_val ]`. 

1559 - `"both"`: `min_val` and `max_val` are included in the valid interval. 

1560 It is equivalent to the interval `[ min_val, max_val ]`. 

1561 - `"neither"`: neither `min_val` nor `max_val` are included in the 

1562 valid interval. It is equivalent to the interval `( min_val, max_val )`. 

1563 

1564 Returns 

1565 ------- 

1566 x : numbers.Number 

1567 The validated number. 

1568 

1569 Raises 

1570 ------ 

1571 TypeError 

1572 If the parameter's type does not match the desired type. 

1573 

1574 ValueError 

1575 If the parameter's value violates the given bounds. 

1576 If `min_val`, `max_val` and `include_boundaries` are inconsistent. 

1577 """ 

1578 

1579 def type_name(t): 

1580 """Convert type into humman readable string.""" 

1581 module = t.__module__ 

1582 qualname = t.__qualname__ 

1583 if module == "builtins": 

1584 return qualname 

1585 elif t == numbers.Real: 

1586 return "float" 

1587 elif t == numbers.Integral: 

1588 return "int" 

1589 return f"{module}.{qualname}" 

1590 

1591 if not isinstance(x, target_type): 

1592 if isinstance(target_type, tuple): 

1593 types_str = ", ".join(type_name(t) for t in target_type) 

1594 target_type_str = f"{{{types_str}}}" 

1595 else: 

1596 target_type_str = type_name(target_type) 

1597 

1598 raise TypeError( 

1599 f"{name} must be an instance of {target_type_str}, not" 

1600 f" {type(x).__qualname__}." 

1601 ) 

1602 

1603 expected_include_boundaries = ("left", "right", "both", "neither") 

1604 if include_boundaries not in expected_include_boundaries: 

1605 raise ValueError( 

1606 f"Unknown value for `include_boundaries`: {repr(include_boundaries)}. " 

1607 f"Possible values are: {expected_include_boundaries}." 

1608 ) 

1609 

1610 if max_val is None and include_boundaries == "right": 

1611 raise ValueError( 

1612 "`include_boundaries`='right' without specifying explicitly `max_val` " 

1613 "is inconsistent." 

1614 ) 

1615 

1616 if min_val is None and include_boundaries == "left": 

1617 raise ValueError( 

1618 "`include_boundaries`='left' without specifying explicitly `min_val` " 

1619 "is inconsistent." 

1620 ) 

1621 

1622 comparison_operator = ( 

1623 operator.lt if include_boundaries in ("left", "both") else operator.le 

1624 ) 

1625 if min_val is not None and comparison_operator(x, min_val): 

1626 raise ValueError( 

1627 f"{name} == {x}, must be" 

1628 f" {'>=' if include_boundaries in ('left', 'both') else '>'} {min_val}." 

1629 ) 

1630 

1631 comparison_operator = ( 

1632 operator.gt if include_boundaries in ("right", "both") else operator.ge 

1633 ) 

1634 if max_val is not None and comparison_operator(x, max_val): 

1635 raise ValueError( 

1636 f"{name} == {x}, must be" 

1637 f" {'<=' if include_boundaries in ('right', 'both') else '<'} {max_val}." 

1638 ) 

1639 

1640 return x 

1641 

1642 
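# A minimal usage sketch of `check_scalar`, assuming scikit-learn is installed.
import numbers
from sklearn.utils.validation import check_scalar

check_scalar(0.5, "alpha", numbers.Real, min_val=0.0, max_val=1.0)  # returns 0.5
# Out-of-range or wrongly typed values raise, e.g.:
# check_scalar(2.0, "alpha", numbers.Real, max_val=1.0)
#   -> ValueError: alpha == 2.0, must be <= 1.0.
# check_scalar("1", "alpha", numbers.Real)
#   -> TypeError: alpha must be an instance of float, not str.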

1643def _check_psd_eigenvalues(lambdas, enable_warnings=False): 

1644 """Check the eigenvalues of a positive semidefinite (PSD) matrix. 

1645 

1646 Checks the provided array of PSD matrix eigenvalues for numerical or 

1647 conditioning issues and returns a fixed validated version. This method 

1648 should typically be used if the PSD matrix is user-provided (e.g. a 

1649 Gram matrix) or computed using a user-provided dissimilarity metric 

1650 (e.g. kernel function), or if the decomposition process uses approximation 

1651 methods (randomized SVD, etc.). 

1652 

1653 It checks for three things: 

1654 

1655 - that there are no significant imaginary parts in eigenvalues (more than 

1656 1e-5 times the maximum real part). If this check fails, it raises a 

1657 ``ValueError``. Otherwise all non-significant imaginary parts that may 

1658 remain are set to zero. This operation is traced with a 

1659 ``PositiveSpectrumWarning`` when ``enable_warnings=True``. 

1660 

1661 - that eigenvalues are not all negative. If this check fails, it raises a 

1662 ``ValueError`` 

1663 

1664 - that there are no significant negative eigenvalues with absolute value 

1665 more than 1e-10 (1e-6) and more than 1e-5 (5e-3) times the largest 

1666 positive eigenvalue in double (single) precision. If this check fails, 

1667 it raises a ``ValueError``. Otherwise all negative eigenvalues that may 

1668 remain are set to zero. This operation is traced with a 

1669 ``PositiveSpectrumWarning`` when ``enable_warnings=True``. 

1670 

1671 Finally, all the positive eigenvalues that are too small (with a value 

1672 smaller than the maximum eigenvalue multiplied by 1e-12 (2e-7)) are set to 

1673 zero. This operation is traced with a ``PositiveSpectrumWarning`` when 

1674 ``enable_warnings=True``. 

1675 

1676 Parameters 

1677 ---------- 

1678 lambdas : array-like of shape (n_eigenvalues,) 

1679 Array of eigenvalues to check / fix. 

1680 

1681 enable_warnings : bool, default=False 

1682 When this is set to ``True``, a ``PositiveSpectrumWarning`` will be 

1683 raised when there are imaginary parts, negative eigenvalues, or 

1684 extremely small non-zero eigenvalues. Otherwise no warning will be 

1685 raised. In both cases, imaginary parts, negative eigenvalues, and 

1686 extremely small non-zero eigenvalues will be set to zero. 

1687 

1688 Returns 

1689 ------- 

1690 lambdas_fixed : ndarray of shape (n_eigenvalues,) 

1691 A fixed validated copy of the array of eigenvalues. 

1692 

1693 Examples 

1694 -------- 

1695 >>> from sklearn.utils.validation import _check_psd_eigenvalues 

1696 >>> _check_psd_eigenvalues([1, 2]) # nominal case 

1697 array([1, 2]) 

1698 >>> _check_psd_eigenvalues([5, 5j]) # significant imag part 

1699 Traceback (most recent call last): 

1700 ... 

1701 ValueError: There are significant imaginary parts in eigenvalues (1 

1702 of the maximum real part). Either the matrix is not PSD, or there was 

1703 an issue while computing the eigendecomposition of the matrix. 

1704 >>> _check_psd_eigenvalues([5, 5e-5j]) # insignificant imag part 

1705 array([5., 0.]) 

1706 >>> _check_psd_eigenvalues([-5, -1]) # all negative 

1707 Traceback (most recent call last): 

1708 ... 

1709 ValueError: All eigenvalues are negative (maximum is -1). Either the 

1710 matrix is not PSD, or there was an issue while computing the 

1711 eigendecomposition of the matrix. 

1712 >>> _check_psd_eigenvalues([5, -1]) # significant negative 

1713 Traceback (most recent call last): 

1714 ... 

1715 ValueError: There are significant negative eigenvalues (0.2 of the 

1716 maximum positive). Either the matrix is not PSD, or there was an issue 

1717 while computing the eigendecomposition of the matrix. 

1718 >>> _check_psd_eigenvalues([5, -5e-5]) # insignificant negative 

1719 array([5., 0.]) 

1720 >>> _check_psd_eigenvalues([5, 4e-12]) # bad conditioning (too small) 

1721 array([5., 0.]) 

1722 

1723 """ 

1724 

1725 lambdas = np.array(lambdas) 

1726 is_double_precision = lambdas.dtype == np.float64 

1727 

1728 # note: the minimum value available is 

1729 # - single-precision: np.finfo('float32').eps = 1.2e-07 

1730 # - double-precision: np.finfo('float64').eps = 2.2e-16 

1731 

1732 # the various thresholds used for validation 

1733 # we may wish to change the value according to precision. 

1734 significant_imag_ratio = 1e-5 

1735 significant_neg_ratio = 1e-5 if is_double_precision else 5e-3 

1736 significant_neg_value = 1e-10 if is_double_precision else 1e-6 

1737 small_pos_ratio = 1e-12 if is_double_precision else 2e-7 

1738 

1739 # Check that there are no significant imaginary parts 

1740 if not np.isreal(lambdas).all(): 

1741 max_imag_abs = np.abs(np.imag(lambdas)).max() 

1742 max_real_abs = np.abs(np.real(lambdas)).max() 

1743 if max_imag_abs > significant_imag_ratio * max_real_abs: 

1744 raise ValueError( 

1745 "There are significant imaginary parts in eigenvalues (%g " 

1746 "of the maximum real part). Either the matrix is not PSD, or " 

1747 "there was an issue while computing the eigendecomposition " 

1748 "of the matrix." % (max_imag_abs / max_real_abs) 

1749 ) 

1750 

1751 # warn about imaginary parts being removed 

1752 if enable_warnings: 

1753 warnings.warn( 

1754 "There are imaginary parts in eigenvalues (%g " 

1755 "of the maximum real part). Either the matrix is not" 

1756 " PSD, or there was an issue while computing the " 

1757 "eigendecomposition of the matrix. Only the real " 

1758 "parts will be kept." % (max_imag_abs / max_real_abs), 

1759 PositiveSpectrumWarning, 

1760 ) 

1761 

1762 # Remove all imaginary parts (even if zero) 

1763 lambdas = np.real(lambdas) 

1764 

1765 # Check that there are no significant negative eigenvalues 

1766 max_eig = lambdas.max() 

1767 if max_eig < 0: 

1768 raise ValueError( 

1769 "All eigenvalues are negative (maximum is %g). " 

1770 "Either the matrix is not PSD, or there was an " 

1771 "issue while computing the eigendecomposition of " 

1772 "the matrix." % max_eig 

1773 ) 

1774 

1775 else: 

1776 min_eig = lambdas.min() 

1777 if ( 

1778 min_eig < -significant_neg_ratio * max_eig 

1779 and min_eig < -significant_neg_value 

1780 ): 

1781 raise ValueError( 

1782 "There are significant negative eigenvalues (%g" 

1783 " of the maximum positive). Either the matrix is " 

1784 "not PSD, or there was an issue while computing " 

1785 "the eigendecomposition of the matrix." % (-min_eig / max_eig) 

1786 ) 

1787 elif min_eig < 0: 

1788 # Remove all negative values and warn about it 

1789 if enable_warnings: 

1790 warnings.warn( 

1791 "There are negative eigenvalues (%g of the " 

1792 "maximum positive). Either the matrix is not " 

1793 "PSD, or there was an issue while computing the" 

1794 " eigendecomposition of the matrix. Negative " 

1795 "eigenvalues will be replaced with 0." % (-min_eig / max_eig), 

1796 PositiveSpectrumWarning, 

1797 ) 

1798 lambdas[lambdas < 0] = 0 

1799 

1800 # Check for conditioning (small positive non-zeros) 

1801 too_small_lambdas = (0 < lambdas) & (lambdas < small_pos_ratio * max_eig) 

1802 if too_small_lambdas.any(): 

1803 if enable_warnings: 

1804 warnings.warn( 

1805 "Badly conditioned PSD matrix spectrum: the largest " 

1806 "eigenvalue is more than %g times the smallest. " 

1807 "Small eigenvalues will be replaced with 0." 

1808 "" % (1 / small_pos_ratio), 

1809 PositiveSpectrumWarning, 

1810 ) 

1811 lambdas[too_small_lambdas] = 0 

1812 

1813 return lambdas 

1814 

1815 

1816def _check_sample_weight( 

1817 sample_weight, X, dtype=None, copy=False, only_non_negative=False 

1818): 

1819 """Validate sample weights. 

1820 

1821 Note that passing sample_weight=None will output an array of ones. 

1822 Therefore, in some cases, you may want to protect the call with: 

1823 if sample_weight is not None: 

1824 sample_weight = _check_sample_weight(...) 

1825 

1826 Parameters 

1827 ---------- 

1828 sample_weight : {ndarray, Number or None}, shape (n_samples,) 

1829 Input sample weights. 

1830 

1831 X : {ndarray, list, sparse matrix} 

1832 Input data. 

1833 

1834 only_non_negative : bool, default=False 

1835 Whether or not the weights are expected to be non-negative. 

1836 

1837 .. versionadded:: 1.0 

1838 

1839 dtype : dtype, default=None 

1840 dtype of the validated `sample_weight`. 

1841 If None, and the input `sample_weight` is an array, the dtype of the 

1842 input is preserved; otherwise an array with the default numpy dtype 

1843 is allocated. If `dtype` is not one of `float32`, `float64`, 

1844 `None`, the output will be of dtype `float64`. 

1845 

1846 copy : bool, default=False 

1847 If True, a copy of sample_weight will be created. 

1848 

1849 Returns 

1850 ------- 

1851 sample_weight : ndarray of shape (n_samples,) 

1852 Validated sample weight. It is guaranteed to be "C" contiguous. 

1853 """ 

1854 n_samples = _num_samples(X) 

1855 

1856 if dtype is not None and dtype not in [np.float32, np.float64]: 

1857 dtype = np.float64 

1858 

1859 if sample_weight is None: 

1860 sample_weight = np.ones(n_samples, dtype=dtype) 

1861 elif isinstance(sample_weight, numbers.Number): 

1862 sample_weight = np.full(n_samples, sample_weight, dtype=dtype) 

1863 else: 

1864 if dtype is None: 

1865 dtype = [np.float64, np.float32] 

1866 sample_weight = check_array( 

1867 sample_weight, 

1868 accept_sparse=False, 

1869 ensure_2d=False, 

1870 dtype=dtype, 

1871 order="C", 

1872 copy=copy, 

1873 input_name="sample_weight", 

1874 ) 

1875 if sample_weight.ndim != 1: 

1876 raise ValueError("Sample weights must be 1D array or scalar") 

1877 

1878 if sample_weight.shape != (n_samples,): 

1879 raise ValueError( 

1880 "sample_weight.shape == {}, expected {}!".format( 

1881 sample_weight.shape, (n_samples,) 

1882 ) 

1883 ) 

1884 

1885 if only_non_negative: 

1886 check_non_negative(sample_weight, "`sample_weight`") 

1887 

1888 return sample_weight 

1889 

1890 
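# A minimal usage sketch of the private helper `_check_sample_weight`
# (internal API, shown for illustration; assumes scikit-learn is installed).
import numpy as np
from sklearn.utils.validation import _check_sample_weight

X = np.zeros((3, 2))
_check_sample_weight(None, X)                       # array([1., 1., 1.])
_check_sample_weight(2.0, X)                        # array([2., 2., 2.])
_check_sample_weight(np.array([1.0, 0.5, 0.0]), X)  # validated, C-contiguous
# A length mismatch raises:
# _check_sample_weight(np.ones(4), X)
#   -> ValueError: sample_weight.shape == (4,), expected (3,)!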

1891def _allclose_dense_sparse(x, y, rtol=1e-7, atol=1e-9): 

1892 """Check allclose for sparse and dense data. 

1893 

1894 Both x and y need to be either sparse or dense, they 

1895 can't be mixed. 

1896 

1897 Parameters 

1898 ---------- 

1899 x : {array-like, sparse matrix} 

1900 First array to compare. 

1901 

1902 y : {array-like, sparse matrix} 

1903 Second array to compare. 

1904 

1905 rtol : float, default=1e-7 

1906 Relative tolerance; see numpy.allclose. 

1907 

1908 atol : float, default=1e-9 

1909 Absolute tolerance; see numpy.allclose. Note that the default here is 

1910 more tolerant than the default for numpy.testing.assert_allclose, where 

1911 atol=0. 

1912 """ 

1913 if sp.issparse(x) and sp.issparse(y): 

1914 x = x.tocsr() 

1915 y = y.tocsr() 

1916 x.sum_duplicates() 

1917 y.sum_duplicates() 

1918 return ( 

1919 np.array_equal(x.indices, y.indices) 

1920 and np.array_equal(x.indptr, y.indptr) 

1921 and np.allclose(x.data, y.data, rtol=rtol, atol=atol) 

1922 ) 

1923 elif not sp.issparse(x) and not sp.issparse(y): 

1924 return np.allclose(x, y, rtol=rtol, atol=atol) 

1925 raise ValueError( 

1926 "Can only compare two sparse matrices, not a sparse matrix and an array" 

1927 ) 

1928 

1929 
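# A minimal usage sketch of `_allclose_dense_sparse` (private helper; assumes
# scikit-learn and scipy are installed).
import numpy as np
import scipy.sparse as sp
from sklearn.utils.validation import _allclose_dense_sparse

a = sp.csr_matrix(np.eye(2))
b = sp.csr_matrix(np.eye(2))
_allclose_dense_sparse(a, b)                  # True (same structure and data)
_allclose_dense_sparse(np.eye(2), np.eye(2))  # True (dense path)
# Mixing a sparse matrix with a dense array raises ValueError, as documented above.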

1930def _check_response_method(estimator, response_method): 

1931 """Check if `response_method` is available in estimator and return it. 

1932 

1933 .. versionadded:: 1.3 

1934 

1935 Parameters 

1936 ---------- 

1937 estimator : estimator instance 

1938 Classifier or regressor to check. 

1939 

1940 response_method : {"predict_proba", "predict_log_proba", "decision_function", 

1941 "predict"} or list of such str 

1942 Specifies the response method used to get predictions from an estimator 

1943 (i.e. :term:`predict_proba`, :term:`predict_log_proba`, 

1944 :term:`decision_function` or :term:`predict`). Possible choices are: 

1945 - if `str`, it corresponds to the name of the method to return; 

1946 - if a list of `str`, it provides the method names in order of 

1947 preference. The method returned is the first method in the 

1948 list that is implemented by `estimator`. 

1949 

1950 Returns 

1951 ------- 

1952 prediction_method : callable 

1953 Prediction method of estimator. 

1954 

1955 Raises 

1956 ------ 

1957 AttributeError 

1958 If `response_method` is not available in `estimator`. 

1959 """ 

1960 if isinstance(response_method, str): 

1961 list_methods = [response_method] 

1962 else: 

1963 list_methods = response_method 

1964 

1965 prediction_method = [getattr(estimator, method, None) for method in list_methods] 

1966 prediction_method = reduce(lambda x, y: x or y, prediction_method) 

1967 if prediction_method is None: 

1968 raise AttributeError( 

1969 f"{estimator.__class__.__name__} has none of the following attributes: " 

1970 f"{', '.join(list_methods)}." 

1971 ) 

1972 

1973 return prediction_method 

1974 

1975 
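# A minimal usage sketch of `_check_response_method` (private helper; assumes
# scikit-learn is installed).
from sklearn.linear_model import LogisticRegression
from sklearn.utils.validation import _check_response_method

est = LogisticRegression()
# With a list, the first method implemented by the estimator wins:
method = _check_response_method(est, ["decision_function", "predict_proba"])
print(method.__name__)  # decision_function
# An unavailable name raises:
# _check_response_method(est, "transform")
#   -> AttributeError: LogisticRegression has none of the following
#      attributes: transform.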

1976def _check_method_params(X, params, indices=None): 

1977 """Check and validate the parameters passed to a specific 

1978 method like `fit`. 

1979 

1980 Parameters 

1981 ---------- 

1982 X : array-like of shape (n_samples, n_features) 

1983 Data array. 

1984 

1985 params : dict 

1986 Dictionary containing the parameters passed to the method. 

1987 

1988 indices : array-like of shape (n_samples,), default=None 

1989 Indices to be selected if the parameter has the same size as `X`. 

1990 

1991 Returns 

1992 ------- 

1993 method_params_validated : dict 

1994 Validated parameters. We ensure that the values support indexing. 

1995 """ 

1996 from . import _safe_indexing 

1997 

1998 method_params_validated = {} 

1999 for param_key, param_value in params.items(): 

2000 if not _is_arraylike(param_value) or _num_samples(param_value) != _num_samples( 

2001 X 

2002 ): 

2003 # Non-indexable pass-through (for now for backward-compatibility). 

2004 # https://github.com/scikit-learn/scikit-learn/issues/15805 

2005 method_params_validated[param_key] = param_value 

2006 else: 

2007 # Any other method_params should support indexing 

2008 # (e.g. for cross-validation). 

2009 method_params_validated[param_key] = _make_indexable(param_value) 

2010 method_params_validated[param_key] = _safe_indexing( 

2011 method_params_validated[param_key], indices 

2012 ) 

2013 

2014 return method_params_validated 

2015 

2016 
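# A minimal usage sketch of `_check_method_params` (private helper; assumes
# scikit-learn is installed). Array-like params matching `X` in length are
# indexed; everything else is passed through unchanged.
import numpy as np
from sklearn.utils.validation import _check_method_params

X = np.zeros((3, 2))
params = {"sample_weight": np.array([0.1, 0.2, 0.3]), "verbose": 1}
_check_method_params(X, params, indices=[0, 2])
# {'sample_weight': array([0.1, 0.3]), 'verbose': 1}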

2017def _is_pandas_df(X): 

2018 """Return True if the X is a pandas dataframe.""" 

2019 if hasattr(X, "columns") and hasattr(X, "iloc"): 

2020 # Likely a pandas DataFrame, we explicitly check the type to confirm. 

2021 try: 

2022 pd = sys.modules["pandas"] 

2023 except KeyError: 

2024 return False 

2025 return isinstance(X, pd.DataFrame) 

2026 return False 

2027 

2028 

2029def _is_polars_df(X): 

2030 """Return True if the X is a polars dataframe.""" 

2031 if hasattr(X, "columns") and hasattr(X, "schema"): 

2032 # Likely a polars DataFrame, we explicitly check the type to confirm. 

2033 try: 

2034 pl = sys.modules["polars"] 

2035 except KeyError: 

2036 return False 

2037 return isinstance(X, pl.DataFrame) 

2038 return False 

2039 

2040 

2041def _get_feature_names(X): 

2042 """Get feature names from X. 

2043 

2044 Support for other array containers should be implemented here. 

2045 

2046 Parameters 

2047 ---------- 

2048 X : {ndarray, dataframe} of shape (n_samples, n_features) 

2049 Array container to extract feature names. 

2050 

2051 - pandas dataframe : The columns will be considered to be feature 

2052 names. If the dataframe contains non-string feature names, `None` is 

2053 returned. 

2054 - All other array containers will return `None`. 

2055 

2056 Returns 

2057 ------- 

2058 names: ndarray or None 

2059 Feature names of `X`. Unrecognized array containers will return `None`. 

2060 """ 

2061 feature_names = None 

2062 

2063 # extract feature names for supported array containers 

2064 if _is_pandas_df(X): 

2065 # Make sure we can inspect columns names from pandas, even with 

2066 # versions too old to expose a working implementation of 

2067 # __dataframe__.column_names() and avoid introducing any 

2068 # additional copy. 

2069 # TODO: remove the pandas-specific branch once the minimum supported 

2070 # version of pandas has a working implementation of 

2071 # __dataframe__.column_names() that is guaranteed to not introduce any 

2072 # additional copy of the data without having to impose allow_copy=False 

2073 # that could fail with other libraries. Note: in the longer term, we 

2074 # could decide to instead rely on the __dataframe_namespace__ API once 

2075 # adopted by our minimally supported pandas version. 

2076 feature_names = np.asarray(X.columns, dtype=object) 

2077 elif hasattr(X, "__dataframe__"): 

2078 df_protocol = X.__dataframe__() 

2079 feature_names = np.asarray(list(df_protocol.column_names()), dtype=object) 

2080 

2081 if feature_names is None or len(feature_names) == 0: 

2082 return 

2083 

2084 types = sorted(t.__qualname__ for t in set(type(v) for v in feature_names)) 

2085 

2086 # mixed type of string and non-string is not supported 

2087 if len(types) > 1 and "str" in types: 

2088 raise TypeError( 

2089 "Feature names are only supported if all input features have string names, " 

2090 f"but your input has {types} as feature name / column name types. " 

2091 "If you want feature names to be stored and validated, you must convert " 

2092 "them all to strings, by using X.columns = X.columns.astype(str) for " 

2093 "example. Otherwise you can remove feature / column names from your input " 

2094 "data, or convert them all to a non-string data type." 

2095 ) 

2096 

2097 # Only feature names of all strings are supported 

2098 if len(types) == 1 and types[0] == "str": 

2099 return feature_names 

2100 

2101 
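# A minimal usage sketch of `_get_feature_names` (private helper; assumes
# scikit-learn and pandas are installed).
import pandas as pd
from sklearn.utils.validation import _get_feature_names

df = pd.DataFrame({"age": [1, 2], "height": [3, 4]})
_get_feature_names(df)         # array(['age', 'height'], dtype=object)
_get_feature_names(df.values)  # None: plain ndarrays carry no feature names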

2102def _check_feature_names_in(estimator, input_features=None, *, generate_names=True): 

2103 """Check `input_features` and generate names if needed. 

2104 

2105 Commonly used in :term:`get_feature_names_out`. 

2106 

2107 Parameters 

2108 ---------- 

2109 input_features : array-like of str or None, default=None 

2110 Input features. 

2111 

2112 - If `input_features` is `None`, then `feature_names_in_` is 

2113 used as feature names in. If `feature_names_in_` is not defined, 

2114 then the following input feature names are generated: 

2115 `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. 

2116 - If `input_features` is an array-like, then `input_features` must 

2117 match `feature_names_in_` if `feature_names_in_` is defined. 

2118 

2119 generate_names : bool, default=True 

2120 Whether to generate names when `input_features` is `None` and 

2121 `estimator.feature_names_in_` is not defined. This is useful for transformers 

2122 that validate `input_features` but do not require them in 

2123 :term:`get_feature_names_out` e.g. `PCA`. 

2124 

2125 Returns 

2126 ------- 

2127 feature_names_in : ndarray of str or `None` 

2128 Feature names in. 

2129 """ 

2130 

2131 feature_names_in_ = getattr(estimator, "feature_names_in_", None) 

2132 n_features_in_ = getattr(estimator, "n_features_in_", None) 

2133 

2134 if input_features is not None: 

2135 input_features = np.asarray(input_features, dtype=object) 

2136 if feature_names_in_ is not None and not np.array_equal( 

2137 feature_names_in_, input_features 

2138 ): 

2139 raise ValueError("input_features is not equal to feature_names_in_") 

2140 

2141 if n_features_in_ is not None and len(input_features) != n_features_in_: 

2142 raise ValueError( 

2143 "input_features should have length equal to number of " 

2144 f"features ({n_features_in_}), got {len(input_features)}" 

2145 ) 

2146 return input_features 

2147 

2148 if feature_names_in_ is not None: 

2149 return feature_names_in_ 

2150 

2151 if not generate_names: 

2152 return 

2153 

2154 # Generates feature names if `n_features_in_` is defined 

2155 if n_features_in_ is None: 

2156 raise ValueError("Unable to generate feature names without n_features_in_") 

2157 

2158 return np.asarray([f"x{i}" for i in range(n_features_in_)], dtype=object) 

2159 

2160 
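# A minimal usage sketch of `_check_feature_names_in` (private helper; assumes
# scikit-learn is installed). A stand-in object mimics a fitted estimator that
# saw 3 unnamed features.
from types import SimpleNamespace
from sklearn.utils.validation import _check_feature_names_in

est = SimpleNamespace(n_features_in_=3)
_check_feature_names_in(est)  # array(['x0', 'x1', 'x2'], dtype=object)
# If the estimator stored names at fit time, they are returned instead:
est = SimpleNamespace(feature_names_in_=["a", "b"], n_features_in_=2)
_check_feature_names_in(est)  # ['a', 'b']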

2161def _generate_get_feature_names_out(estimator, n_features_out, input_features=None): 

2162 """Generate feature names out for estimator using the estimator name as the prefix. 

2163 

2164 The `input_features` names are validated but not used. This function is useful 

2165 for estimators that generate their own names based on `n_features_out`, e.g. PCA. 

2166 

2167 Parameters 

2168 ---------- 

2169 estimator : estimator instance 

2170 Estimator producing output feature names. 

2171 

2172 n_features_out : int 

2173 Number of feature names out. 

2174 

2175 input_features : array-like of str or None, default=None 

2176 Only used to validate feature names with `estimator.feature_names_in_`. 

2177 

2178 Returns 

2179 ------- 

2180 feature_names_out : ndarray of str 

2181 Feature names out, generated from the estimator name and `n_features_out`. 

2182 """ 

2183 _check_feature_names_in(estimator, input_features, generate_names=False) 

2184 estimator_name = estimator.__class__.__name__.lower() 

2185 return np.asarray( 

2186 [f"{estimator_name}{i}" for i in range(n_features_out)], dtype=object 

2187 ) 

2188 

2189 
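# A minimal usage sketch of `_generate_get_feature_names_out` (private helper;
# assumes scikit-learn is installed). The lowercased class name becomes the prefix.
from sklearn.utils.validation import _generate_get_feature_names_out

class MyReducer:  # hypothetical stand-in for a fitted transformer
    n_features_in_ = 4

_generate_get_feature_names_out(MyReducer(), n_features_out=2)
# array(['myreducer0', 'myreducer1'], dtype=object)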

2190def _check_monotonic_cst(estimator, monotonic_cst=None): 

2191 """Check the monotonic constraints and return the corresponding array. 

2192 

2193 This helper function should be used in the `fit` method of an estimator 

2194 that supports monotonic constraints and called after the estimator has 

2195 introspected input data to set the `n_features_in_` and optionally the 

2196 `feature_names_in_` attributes. 

2197 

2198 .. versionadded:: 1.2 

2199 

2200 Parameters 

2201 ---------- 

2202 estimator : estimator instance 

2203 

2204 monotonic_cst : array-like of int, dict of str or None, default=None 

2205 Monotonic constraints for the features. 

2206 

2207 - If array-like, then it should contain only -1, 0 or 1: -1 requires 

2208 the corresponding feature to be monotonically decreasing, 1 requires 

2209 it to be monotonically increasing, and 0 imposes no constraint. 

2210 - If dict, then it the keys should be the feature names occurring in 

2211 `estimator.feature_names_in_` and the values should be -1, 0 or 1. 

2212 - If None, then an array of 0s will be allocated. 

2213 

2214 Returns 

2215 ------- 

2216 monotonic_cst : ndarray of int 

2217 Monotonic constraints for each feature. 

2218 """ 

2219 original_monotonic_cst = monotonic_cst 

2220 if monotonic_cst is None or isinstance(monotonic_cst, dict): 

2221 monotonic_cst = np.full( 

2222 shape=estimator.n_features_in_, 

2223 fill_value=0, 

2224 dtype=np.int8, 

2225 ) 

2226 if isinstance(original_monotonic_cst, dict): 

2227 if not hasattr(estimator, "feature_names_in_"): 

2228 raise ValueError( 

2229 f"{estimator.__class__.__name__} was not fitted on data " 

2230 "with feature names. Pass monotonic_cst as an integer " 

2231 "array instead." 

2232 ) 

2233 unexpected_feature_names = list( 

2234 set(original_monotonic_cst) - set(estimator.feature_names_in_) 

2235 ) 

2236 unexpected_feature_names.sort() # deterministic error message 

2237 n_unexpected = len(unexpected_feature_names) 

2238 if unexpected_feature_names: 

2239 if len(unexpected_feature_names) > 5: 

2240 unexpected_feature_names = unexpected_feature_names[:5] 

2241 unexpected_feature_names.append("...") 

2242 raise ValueError( 

2243 f"monotonic_cst contains {n_unexpeced} unexpected feature " 

2244 f"names: {unexpected_feature_names}." 

2245 ) 

2246 for feature_idx, feature_name in enumerate(estimator.feature_names_in_): 

2247 if feature_name in original_monotonic_cst: 

2248 cst = original_monotonic_cst[feature_name] 

2249 if cst not in [-1, 0, 1]: 

2250 raise ValueError( 

2251 f"monotonic_cst['{feature_name}'] must be either " 

2252 f"-1, 0 or 1. Got {cst!r}." 

2253 ) 

2254 monotonic_cst[feature_idx] = cst 

2255 else: 

2256 unexpected_cst = np.setdiff1d(monotonic_cst, [-1, 0, 1]) 

2257 if unexpected_cst.shape[0]: 

2258 raise ValueError( 

2259 "monotonic_cst must be an array-like of -1, 0 or 1. Observed " 

2260 f"values: {unexpected_cst.tolist()}." 

2261 ) 

2262 

2263 monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8) 

2264 if monotonic_cst.shape[0] != estimator.n_features_in_: 

2265 raise ValueError( 

2266 f"monotonic_cst has shape {monotonic_cst.shape} but the input data " 

2267 f"X has {estimator.n_features_in_} features." 

2268 ) 

2269 return monotonic_cst 

2270 

2271 
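# A minimal usage sketch of `_check_monotonic_cst` (private helper; assumes
# scikit-learn is installed). A stand-in object mimics an estimator fitted on
# a dataframe with named columns.
import numpy as np
from types import SimpleNamespace
from sklearn.utils.validation import _check_monotonic_cst

est = SimpleNamespace(
    n_features_in_=3, feature_names_in_=np.array(["a", "b", "c"])
)
_check_monotonic_cst(est, {"a": 1, "c": -1})  # array([ 1,  0, -1], dtype=int8)
_check_monotonic_cst(est, [0, 1, 0])          # array([0, 1, 0], dtype=int8)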

2272def _check_pos_label_consistency(pos_label, y_true): 

2273 """Check if `pos_label` need to be specified or not. 

2274 

2275 In binary classification, we fix `pos_label=1` if the labels are in the set 

2276 {-1, 1} or {0, 1}. Otherwise, we raise an error asking to specify the 

2277 `pos_label` parameter. 

2278 

2279 Parameters 

2280 ---------- 

2281 pos_label : int, float, bool, str or None 

2282 The positive label. 

2283 y_true : ndarray of shape (n_samples,) 

2284 The target vector. 

2285 

2286 Returns 

2287 ------- 

2288 pos_label : int, float, bool or str 

2289 If `pos_label` can be inferred, it will be returned. 

2290 

2291 Raises 

2292 ------ 

2293 ValueError 

2294 In the case that `y_true` does not have label in {-1, 1} or {0, 1}, 

2295 it will raise a `ValueError`. 

2296 """ 

2297 # ensure binary classification if pos_label is not specified 

2298 # classes.dtype.kind in ('O', 'U', 'S') is required to avoid 

2299 # triggering a FutureWarning by calling np.array_equal(a, b) 

2300 # when elements in the two arrays are not comparable. 

2301 classes = np.unique(y_true) 

2302 if pos_label is None and ( 

2303 classes.dtype.kind in "OUS" 

2304 or not ( 

2305 np.array_equal(classes, [0, 1]) 

2306 or np.array_equal(classes, [-1, 1]) 

2307 or np.array_equal(classes, [0]) 

2308 or np.array_equal(classes, [-1]) 

2309 or np.array_equal(classes, [1]) 

2310 ) 

2311 ): 

2312 classes_repr = ", ".join([repr(c) for c in classes.tolist()]) 

2313 raise ValueError( 

2314 f"y_true takes value in {{{classes_repr}}} and pos_label is not " 

2315 "specified: either make y_true take value in {0, 1} or " 

2316 "{-1, 1} or pass pos_label explicitly." 

2317 ) 

2318 elif pos_label is None: 

2319 pos_label = 1 

2320 

2321 return pos_label
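
# A minimal usage sketch of `_check_pos_label_consistency` (private helper;
# assumes scikit-learn is installed).
import numpy as np
from sklearn.utils.validation import _check_pos_label_consistency

_check_pos_label_consistency(None, np.array([0, 1, 1, 0]))    # inferred: 1
_check_pos_label_consistency("yes", np.array(["no", "yes"]))  # passed through
# String labels with pos_label=None raise, e.g.:
# _check_pos_label_consistency(None, np.array(["no", "yes"]))
#   -> ValueError: y_true takes value in {'no', 'yes'} and pos_label is not
#      specified: either make y_true take value in {0, 1} or {-1, 1} or pass
#      pos_label explicitly.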