1"""Base classes for all estimators.""" 

2 

3# Author: Gael Varoquaux <gael.varoquaux@normalesup.org> 

4# License: BSD 3 clause 

5 

6import copy 

7import functools 

8import inspect 

9import platform 

10import re 

11import warnings 

12from collections import defaultdict 

13 

14import numpy as np 

15 

16from . import __version__ 

17from ._config import config_context, get_config 

18from .exceptions import InconsistentVersionWarning 

19from .utils import _IS_32BIT 

20from .utils._estimator_html_repr import _HTMLDocumentationLinkMixin, estimator_html_repr 

21from .utils._metadata_requests import _MetadataRequester, _routing_enabled 

22from .utils._param_validation import validate_parameter_constraints 

23from .utils._set_output import _SetOutputMixin 

24from .utils._tags import ( 

25 _DEFAULT_TAGS, 

26) 

27from .utils.validation import ( 

28 _check_feature_names_in, 

29 _check_y, 

30 _generate_get_feature_names_out, 

31 _get_feature_names, 

32 _is_fitted, 

33 _num_features, 

34 check_array, 

35 check_is_fitted, 

36 check_X_y, 

37) 

38 

39 

40def clone(estimator, *, safe=True): 

41 """Construct a new unfitted estimator with the same parameters. 

42 

43 Clone does a deep copy of the model in an estimator 

44 without actually copying attached data. It returns a new estimator 

45 with the same parameters that has not been fitted on any data. 

46 

47 .. versionchanged:: 1.3 

48 Delegates to `estimator.__sklearn_clone__` if the method exists. 

49 

50 Parameters 

51 ---------- 

52 estimator : {list, tuple, set} of estimator instance or a single \ 

53 estimator instance 

54 The estimator or group of estimators to be cloned. 

55 safe : bool, default=True 

56 If safe is False, clone will fall back to a deep copy on objects 

57 that are not estimators. Ignored if `estimator.__sklearn_clone__` 

58 exists. 

59 

60 Returns 

61 ------- 

62 estimator : object 

63 The deep copy of the input, an estimator if input is an estimator. 

64 

65 Notes 

66 ----- 

67 If the estimator's `random_state` parameter is an integer (or if the 

68 estimator doesn't have a `random_state` parameter), an *exact clone* is 

69 returned: the clone and the original estimator will give the exact same 

70 results. Otherwise, *statistical clone* is returned: the clone might 

71 return different results from the original estimator. More details can be 

72 found in :ref:`randomness`. 
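
    Examples
    --------
    A minimal sketch of typical usage
    (:class:`~sklearn.linear_model.LogisticRegression` is used here purely as
    an illustrative estimator):

    >>> from sklearn.base import clone
    >>> from sklearn.linear_model import LogisticRegression
    >>> original = LogisticRegression(C=0.5)
    >>> cloned = clone(original)
    >>> cloned.get_params()["C"]
    0.5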

73 """ 

74 if hasattr(estimator, "__sklearn_clone__") and not inspect.isclass(estimator): 

75 return estimator.__sklearn_clone__() 

76 return _clone_parametrized(estimator, safe=safe) 


def _clone_parametrized(estimator, *, safe=True):
    """Default implementation of clone. See :func:`sklearn.base.clone` for details."""

    estimator_type = type(estimator)
    if estimator_type is dict:
        return {k: clone(v, safe=safe) for k, v in estimator.items()}
    elif estimator_type in (list, tuple, set, frozenset):
        return estimator_type([clone(e, safe=safe) for e in estimator])
    elif not hasattr(estimator, "get_params") or isinstance(estimator, type):
        if not safe:
            return copy.deepcopy(estimator)
        else:
            if isinstance(estimator, type):
                raise TypeError(
                    "Cannot clone object. "
                    + "You should provide an instance of "
                    + "scikit-learn estimator instead of a class."
                )
            else:
                raise TypeError(
                    "Cannot clone object '%s' (type %s): "
                    "it does not seem to be a scikit-learn "
                    "estimator as it does not implement a "
                    "'get_params' method." % (repr(estimator), type(estimator))
                )

    klass = estimator.__class__
    new_object_params = estimator.get_params(deep=False)
    for name, param in new_object_params.items():
        new_object_params[name] = clone(param, safe=False)

    new_object = klass(**new_object_params)
    try:
        new_object._metadata_request = copy.deepcopy(estimator._metadata_request)
    except AttributeError:
        pass

    params_set = new_object.get_params(deep=False)

    # quick sanity check of the parameters of the clone
    for name in new_object_params:
        param1 = new_object_params[name]
        param2 = params_set[name]
        if param1 is not param2:
            raise RuntimeError(
                "Cannot clone object %s, as the constructor "
                "either does not set or modifies parameter %s" % (estimator, name)
            )

    # _sklearn_output_config is used by `set_output` to configure the output
    # container of an estimator.
    if hasattr(estimator, "_sklearn_output_config"):
        new_object._sklearn_output_config = copy.deepcopy(
            estimator._sklearn_output_config
        )
    return new_object


class BaseEstimator(_HTMLDocumentationLinkMixin, _MetadataRequester):
    """Base class for all estimators in scikit-learn.

    Notes
    -----
    All estimators should specify all the parameters that can be set
    at the class level in their ``__init__`` as explicit keyword
    arguments (no ``*args`` or ``**kwargs``).
    """

    @classmethod
    def _get_param_names(cls):
        """Get parameter names for the estimator."""
        # fetch the constructor or the original constructor before
        # deprecation wrapping if any
        init = getattr(cls.__init__, "deprecated_original", cls.__init__)
        if init is object.__init__:
            # No explicit constructor to introspect
            return []

        # introspect the constructor arguments to find the model parameters
        # to represent
        init_signature = inspect.signature(init)
        # Consider the constructor parameters excluding 'self'
        parameters = [
            p
            for p in init_signature.parameters.values()
            if p.name != "self" and p.kind != p.VAR_KEYWORD
        ]
        for p in parameters:
            if p.kind == p.VAR_POSITIONAL:
                raise RuntimeError(
                    "scikit-learn estimators should always "
                    "specify their parameters in the signature "
                    "of their __init__ (no varargs). "
                    "%s with constructor %s doesn't "
                    "follow this convention." % (cls, init_signature)
                )
        # Extract and sort argument names excluding 'self'
        return sorted([p.name for p in parameters])

    def get_params(self, deep=True):
        """
        Get parameters for this estimator.

        Parameters
        ----------
        deep : bool, default=True
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : dict
            Parameter names mapped to their values.
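
        Examples
        --------
        A minimal sketch (:class:`~sklearn.linear_model.LogisticRegression`
        is used here purely as an illustrative estimator):

        >>> from sklearn.linear_model import LogisticRegression
        >>> est = LogisticRegression(C=2.0)
        >>> est.get_params()["C"]
        2.0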

192 """ 

193 out = dict() 

194 for key in self._get_param_names(): 

195 value = getattr(self, key) 

196 if deep and hasattr(value, "get_params") and not isinstance(value, type): 

197 deep_items = value.get_params().items() 

198 out.update((key + "__" + k, val) for k, val in deep_items) 

199 out[key] = value 

200 return out 

201 

202 def set_params(self, **params): 

203 """Set the parameters of this estimator. 

204 

205 The method works on simple estimators as well as on nested objects 

206 (such as :class:`~sklearn.pipeline.Pipeline`). The latter have 

207 parameters of the form ``<component>__<parameter>`` so that it's 

208 possible to update each component of a nested object. 

209 

210 Parameters 

211 ---------- 

212 **params : dict 

213 Estimator parameters. 

214 

215 Returns 

216 ------- 

217 self : estimator instance 

218 Estimator instance. 
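
        Examples
        --------
        A minimal sketch (again using
        :class:`~sklearn.linear_model.LogisticRegression` purely as an
        illustration):

        >>> from sklearn.linear_model import LogisticRegression
        >>> est = LogisticRegression()
        >>> est = est.set_params(C=0.1)
        >>> est.C
        0.1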

219 """ 

220 if not params: 

221 # Simple optimization to gain speed (inspect is slow) 

222 return self 

223 valid_params = self.get_params(deep=True) 

224 

225 nested_params = defaultdict(dict) # grouped by prefix 

226 for key, value in params.items(): 

227 key, delim, sub_key = key.partition("__") 

228 if key not in valid_params: 

229 local_valid_params = self._get_param_names() 

230 raise ValueError( 

231 f"Invalid parameter {key!r} for estimator {self}. " 

232 f"Valid parameters are: {local_valid_params!r}." 

233 ) 

234 

235 if delim: 

236 nested_params[key][sub_key] = value 

237 else: 

238 setattr(self, key, value) 

239 valid_params[key] = value 

240 

241 for key, sub_params in nested_params.items(): 

242 valid_params[key].set_params(**sub_params) 

243 

244 return self 

245 

246 def __sklearn_clone__(self): 

247 return _clone_parametrized(self) 

248 

249 def __repr__(self, N_CHAR_MAX=700): 

250 # N_CHAR_MAX is the (approximate) maximum number of non-blank 

251 # characters to render. We pass it as an optional parameter to ease 

252 # the tests. 

253 

254 from .utils._pprint import _EstimatorPrettyPrinter 

255 

256 N_MAX_ELEMENTS_TO_SHOW = 30 # number of elements to show in sequences 

257 

258 # use ellipsis for sequences with a lot of elements 

259 pp = _EstimatorPrettyPrinter( 

260 compact=True, 

261 indent=1, 

262 indent_at_name=True, 

263 n_max_elements_to_show=N_MAX_ELEMENTS_TO_SHOW, 

264 ) 

265 

266 repr_ = pp.pformat(self) 

267 

268 # Use bruteforce ellipsis when there are a lot of non-blank characters 

269 n_nonblank = len("".join(repr_.split())) 

270 if n_nonblank > N_CHAR_MAX: 

271 lim = N_CHAR_MAX // 2 # apprx number of chars to keep on both ends 

272 regex = r"^(\s*\S){%d}" % lim 

273 # The regex '^(\s*\S){%d}' % n 

274 # matches from the start of the string until the nth non-blank 

275 # character: 

276 # - ^ matches the start of string 

277 # - (pattern){n} matches n repetitions of pattern 

278 # - \s*\S matches a non-blank char following zero or more blanks 

279 left_lim = re.match(regex, repr_).end() 

280 right_lim = re.match(regex, repr_[::-1]).end() 

281 

282 if "\n" in repr_[left_lim:-right_lim]: 

283 # The left side and right side aren't on the same line. 

284 # To avoid weird cuts, e.g.: 

285 # categoric...ore', 

286 # we need to start the right side with an appropriate newline 

287 # character so that it renders properly as: 

288 # categoric... 

289 # handle_unknown='ignore', 

290 # so we add [^\n]*\n which matches until the next \n 

291 regex += r"[^\n]*\n" 

292 right_lim = re.match(regex, repr_[::-1]).end() 

293 

294 ellipsis = "..." 

295 if left_lim + len(ellipsis) < len(repr_) - right_lim: 

296 # Only add ellipsis if it results in a shorter repr 

297 repr_ = repr_[:left_lim] + "..." + repr_[-right_lim:] 

298 

299 return repr_ 

    def __getstate__(self):
        if getattr(self, "__slots__", None):
            raise TypeError(
                "You cannot use `__slots__` in objects inheriting from "
                "`sklearn.base.BaseEstimator`."
            )

        try:
            state = super().__getstate__()
            if state is None:
                # For Python 3.11+, an empty instance (no `__slots__` and an
                # empty `__dict__`) returns a state equal to `None`.
                state = self.__dict__.copy()
        except AttributeError:
            # Python < 3.11
            state = self.__dict__.copy()

        if type(self).__module__.startswith("sklearn."):
            return dict(state.items(), _sklearn_version=__version__)
        else:
            return state

    def __setstate__(self, state):
        if type(self).__module__.startswith("sklearn."):
            pickle_version = state.pop("_sklearn_version", "pre-0.18")
            if pickle_version != __version__:
                warnings.warn(
                    InconsistentVersionWarning(
                        estimator_name=self.__class__.__name__,
                        current_sklearn_version=__version__,
                        original_sklearn_version=pickle_version,
                    ),
                )
        try:
            super().__setstate__(state)
        except AttributeError:
            self.__dict__.update(state)

    def _more_tags(self):
        return _DEFAULT_TAGS

    def _get_tags(self):
        collected_tags = {}
        for base_class in reversed(inspect.getmro(self.__class__)):
            if hasattr(base_class, "_more_tags"):
                # need the if because mixins might not have _more_tags
                # but might do redundant work in estimators
                # (i.e. calling more tags on BaseEstimator multiple times)
                more_tags = base_class._more_tags(self)
                collected_tags.update(more_tags)
        return collected_tags

    def _check_n_features(self, X, reset):
        """Set the `n_features_in_` attribute, or check against it.

        Parameters
        ----------
        X : {ndarray, sparse matrix} of shape (n_samples, n_features)
            The input samples.
        reset : bool
            If True, the `n_features_in_` attribute is set to `X.shape[1]`.
            If False and the attribute exists, then check that it is equal to
            `X.shape[1]`. If False and the attribute does *not* exist, then
            the check is skipped.

            .. note::
               It is recommended to call `reset=True` in `fit` and in the first
               call to `partial_fit`. All other methods that validate `X`
               should set `reset=False`.
        """
        try:
            n_features = _num_features(X)
        except TypeError as e:
            if not reset and hasattr(self, "n_features_in_"):
                raise ValueError(
                    "X does not contain any features, but "
                    f"{self.__class__.__name__} is expecting "
                    f"{self.n_features_in_} features"
                ) from e
            # If the number of features is not defined and reset=True,
            # then we skip this check
            return

        if reset:
            self.n_features_in_ = n_features
            return

        if not hasattr(self, "n_features_in_"):
            # Skip this check if the expected number of input features was not
            # recorded by calling fit first. This is typically the case for
            # stateless transformers.
            return

        if n_features != self.n_features_in_:
            raise ValueError(
                f"X has {n_features} features, but {self.__class__.__name__} "
                f"is expecting {self.n_features_in_} features as input."
            )

    def _check_feature_names(self, X, *, reset):
        """Set or check the `feature_names_in_` attribute.

        .. versionadded:: 1.0

        Parameters
        ----------
        X : {ndarray, dataframe} of shape (n_samples, n_features)
            The input samples.

        reset : bool
            Whether to reset the `feature_names_in_` attribute.
            If False, the input will be checked for consistency with
            feature names of data provided when reset was last True.

            .. note::
               It is recommended to call `reset=True` in `fit` and in the first
               call to `partial_fit`. All other methods that validate `X`
               should set `reset=False`.
        """

        if reset:
            feature_names_in = _get_feature_names(X)
            if feature_names_in is not None:
                self.feature_names_in_ = feature_names_in
            elif hasattr(self, "feature_names_in_"):
                # Delete the attribute when the estimator is fitted on a new
                # dataset that has no feature names.
                delattr(self, "feature_names_in_")
            return

        fitted_feature_names = getattr(self, "feature_names_in_", None)
        X_feature_names = _get_feature_names(X)

        if fitted_feature_names is None and X_feature_names is None:
            # no feature names seen in fit and in X
            return

        if X_feature_names is not None and fitted_feature_names is None:
            warnings.warn(
                f"X has feature names, but {self.__class__.__name__} was fitted without"
                " feature names"
            )
            return

        if X_feature_names is None and fitted_feature_names is not None:
            warnings.warn(
                "X does not have valid feature names, but"
                f" {self.__class__.__name__} was fitted with feature names"
            )
            return

        # validate the feature names against the `feature_names_in_` attribute
        if len(fitted_feature_names) != len(X_feature_names) or np.any(
            fitted_feature_names != X_feature_names
        ):
            message = (
                "The feature names should match those that were passed during fit.\n"
            )
            fitted_feature_names_set = set(fitted_feature_names)
            X_feature_names_set = set(X_feature_names)

            unexpected_names = sorted(X_feature_names_set - fitted_feature_names_set)
            missing_names = sorted(fitted_feature_names_set - X_feature_names_set)

            def add_names(names):
                output = ""
                max_n_names = 5
                for i, name in enumerate(names):
                    if i >= max_n_names:
                        output += "- ...\n"
                        break
                    output += f"- {name}\n"
                return output

            if unexpected_names:
                message += "Feature names unseen at fit time:\n"
                message += add_names(unexpected_names)

            if missing_names:
                message += "Feature names seen at fit time, yet now missing:\n"
                message += add_names(missing_names)

            if not missing_names and not unexpected_names:
                message += (
                    "Feature names must be in the same order as they were in fit.\n"
                )

            raise ValueError(message)

    def _validate_data(
        self,
        X="no_validation",
        y="no_validation",
        reset=True,
        validate_separately=False,
        cast_to_ndarray=True,
        **check_params,
    ):
        """Validate input data and set or check the `n_features_in_` attribute.

        Parameters
        ----------
        X : {array-like, sparse matrix, dataframe} of shape \
                (n_samples, n_features), default='no_validation'
            The input samples.
            If `'no_validation'`, no validation is performed on `X`. This is
            useful for meta-estimators which can delegate input validation to
            their underlying estimator(s). In that case `y` must be passed and
            the only accepted `check_params` are `multi_output` and
            `y_numeric`.

        y : array-like of shape (n_samples,), default='no_validation'
            The targets.

            - If `None`, `check_array` is called on `X`. If the estimator's
              requires_y tag is True, then an error will be raised.
            - If `'no_validation'`, `check_array` is called on `X` and the
              estimator's requires_y tag is ignored. This is a default
              placeholder and is never meant to be explicitly set. In that case
              `X` must be passed.
            - Otherwise, only `y` with `_check_y` or both `X` and `y` are
              checked with either `check_array` or `check_X_y` depending on
              `validate_separately`.

        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute.
            If False, the input will be checked for consistency with data
            provided when reset was last True.

            .. note::
               It is recommended to call `reset=True` in `fit` and in the first
               call to `partial_fit`. All other methods that validate `X`
               should set `reset=False`.

        validate_separately : False or tuple of dicts, default=False
            Only used if `y` is not None.
            If False, call `check_X_y()`. Else, it must be a tuple of kwargs
            to be used for calling `check_array()` on `X` and `y` respectively.

            `estimator=self` is automatically added to these dicts to generate
            more informative error messages in case of invalid input data.

        cast_to_ndarray : bool, default=True
            Cast `X` and `y` to ndarray with checks in `check_params`. If
            `False`, `X` and `y` are unchanged and only `feature_names_in_` and
            `n_features_in_` are checked.

        **check_params : kwargs
            Parameters passed to :func:`sklearn.utils.check_array` or
            :func:`sklearn.utils.check_X_y`. Ignored if validate_separately
            is not False.

            `estimator=self` is automatically added to these params to generate
            more informative error messages in case of invalid input data.

        Returns
        -------
        out : {ndarray, sparse matrix} or tuple of these
            The validated input. A tuple is returned if both `X` and `y` are
            validated.
        """
        self._check_feature_names(X, reset=reset)

        if y is None and self._get_tags()["requires_y"]:
            raise ValueError(
                f"This {self.__class__.__name__} estimator "
                "requires y to be passed, but the target y is None."
            )

        no_val_X = isinstance(X, str) and X == "no_validation"
        no_val_y = y is None or isinstance(y, str) and y == "no_validation"

        if no_val_X and no_val_y:
            raise ValueError("Validation should be done on X, y or both.")

        default_check_params = {"estimator": self}
        check_params = {**default_check_params, **check_params}

        if not cast_to_ndarray:
            if not no_val_X and no_val_y:
                out = X
            elif no_val_X and not no_val_y:
                out = y
            else:
                out = X, y
        elif not no_val_X and no_val_y:
            out = check_array(X, input_name="X", **check_params)
        elif no_val_X and not no_val_y:
            out = _check_y(y, **check_params)
        else:
            if validate_separately:
                # We need this because some estimators validate X and y
                # separately, and in general, separately calling check_array()
                # on X and y isn't equivalent to just calling check_X_y()
                # :(
                check_X_params, check_y_params = validate_separately
                if "estimator" not in check_X_params:
                    check_X_params = {**default_check_params, **check_X_params}
                X = check_array(X, input_name="X", **check_X_params)
                if "estimator" not in check_y_params:
                    check_y_params = {**default_check_params, **check_y_params}
                y = check_array(y, input_name="y", **check_y_params)
            else:
                X, y = check_X_y(X, y, **check_params)
            out = X, y

        if not no_val_X and check_params.get("ensure_2d", True):
            self._check_n_features(X, reset=reset)

        return out

    def _validate_params(self):
        """Validate types and values of constructor parameters.

        The expected type and values must be defined in the `_parameter_constraints`
        class attribute, which is a dictionary `param_name: list of constraints`. See
        the docstring of `validate_parameter_constraints` for a description of the
        accepted constraints.
        """
        validate_parameter_constraints(
            self._parameter_constraints,
            self.get_params(deep=False),
            caller_name=self.__class__.__name__,
        )

    @property
    def _repr_html_(self):
        """HTML representation of estimator.

        This is redundant with the logic of `_repr_mimebundle_`. The latter
        should be favored in the long term, `_repr_html_` is only
        implemented for consumers who do not interpret `_repr_mimebundle_`.
        """
        if get_config()["display"] != "diagram":
            raise AttributeError(
                "_repr_html_ is only defined when the "
                "'display' configuration option is set to "
                "'diagram'"
            )
        return self._repr_html_inner

    def _repr_html_inner(self):
        """This function is returned by the @property `_repr_html_` to make
        `hasattr(estimator, "_repr_html_")` return `True` or `False` depending
        on `get_config()["display"]`.
        """
        return estimator_html_repr(self)

    def _repr_mimebundle_(self, **kwargs):
        """Mime bundle used by jupyter kernels to display estimator."""
        output = {"text/plain": repr(self)}
        if get_config()["display"] == "diagram":
            output["text/html"] = estimator_html_repr(self)
        return output

653 

654class ClassifierMixin: 

655 """Mixin class for all classifiers in scikit-learn.""" 

656 

657 _estimator_type = "classifier" 

658 

659 def score(self, X, y, sample_weight=None): 

660 """ 

661 Return the mean accuracy on the given test data and labels. 

662 

663 In multi-label classification, this is the subset accuracy 

664 which is a harsh metric since you require for each sample that 

665 each label set be correctly predicted. 

666 

667 Parameters 

668 ---------- 

669 X : array-like of shape (n_samples, n_features) 

670 Test samples. 

671 

672 y : array-like of shape (n_samples,) or (n_samples, n_outputs) 

673 True labels for `X`. 

674 

675 sample_weight : array-like of shape (n_samples,), default=None 

676 Sample weights. 

677 

678 Returns 

679 ------- 

680 score : float 

681 Mean accuracy of ``self.predict(X)`` w.r.t. `y`. 
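
        Examples
        --------
        A minimal sketch (:class:`~sklearn.dummy.DummyClassifier` is used
        here only because it is cheap to fit):

        >>> from sklearn.dummy import DummyClassifier
        >>> X, y = [[0], [1], [2], [3]], [0, 0, 0, 1]
        >>> clf = DummyClassifier(strategy="most_frequent").fit(X, y)
        >>> clf.score(X, y)
        0.75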

682 """ 

683 from .metrics import accuracy_score 

684 

685 return accuracy_score(y, self.predict(X), sample_weight=sample_weight) 

686 

687 def _more_tags(self): 

688 return {"requires_y": True} 

689 

690 

691class RegressorMixin: 

692 """Mixin class for all regression estimators in scikit-learn.""" 

693 

694 _estimator_type = "regressor" 

695 

696 def score(self, X, y, sample_weight=None): 

697 """Return the coefficient of determination of the prediction. 

698 

699 The coefficient of determination :math:`R^2` is defined as 

700 :math:`(1 - \\frac{u}{v})`, where :math:`u` is the residual 

701 sum of squares ``((y_true - y_pred)** 2).sum()`` and :math:`v` 

702 is the total sum of squares ``((y_true - y_true.mean()) ** 2).sum()``. 

703 The best possible score is 1.0 and it can be negative (because the 

704 model can be arbitrarily worse). A constant model that always predicts 

705 the expected value of `y`, disregarding the input features, would get 

706 a :math:`R^2` score of 0.0. 

707 

708 Parameters 

709 ---------- 

710 X : array-like of shape (n_samples, n_features) 

711 Test samples. For some estimators this may be a precomputed 

712 kernel matrix or a list of generic objects instead with shape 

713 ``(n_samples, n_samples_fitted)``, where ``n_samples_fitted`` 

714 is the number of samples used in the fitting for the estimator. 

715 

716 y : array-like of shape (n_samples,) or (n_samples, n_outputs) 

717 True values for `X`. 

718 

719 sample_weight : array-like of shape (n_samples,), default=None 

720 Sample weights. 

721 

722 Returns 

723 ------- 

724 score : float 

725 :math:`R^2` of ``self.predict(X)`` w.r.t. `y`. 

726 

727 Notes 

728 ----- 

729 The :math:`R^2` score used when calling ``score`` on a regressor uses 

730 ``multioutput='uniform_average'`` from version 0.23 to keep consistent 

731 with default value of :func:`~sklearn.metrics.r2_score`. 

732 This influences the ``score`` method of all the multioutput 

733 regressors (except for 

734 :class:`~sklearn.multioutput.MultiOutputRegressor`). 
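
        Examples
        --------
        A minimal sketch (:class:`~sklearn.linear_model.LinearRegression` is
        used here purely as an illustrative regressor; the targets are exactly
        linear in `X`, so the score is essentially perfect):

        >>> from sklearn.linear_model import LinearRegression
        >>> X, y = [[1], [2], [3]], [1.0, 2.0, 3.0]
        >>> reg = LinearRegression().fit(X, y)
        >>> round(reg.score(X, y), 2)
        1.0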

735 """ 

736 

737 from .metrics import r2_score 

738 

739 y_pred = self.predict(X) 

740 return r2_score(y, y_pred, sample_weight=sample_weight) 

741 

742 def _more_tags(self): 

743 return {"requires_y": True} 

744 

745 

746class ClusterMixin: 

747 """Mixin class for all cluster estimators in scikit-learn.""" 

748 

749 _estimator_type = "clusterer" 

750 

751 def fit_predict(self, X, y=None, **kwargs): 

752 """ 

753 Perform clustering on `X` and returns cluster labels. 

754 

755 Parameters 

756 ---------- 

757 X : array-like of shape (n_samples, n_features) 

758 Input data. 

759 

760 y : Ignored 

761 Not used, present for API consistency by convention. 

762 

763 **kwargs : dict 

764 Arguments to be passed to ``fit``. 

765 

766 .. versionadded:: 1.4 

767 

768 Returns 

769 ------- 

770 labels : ndarray of shape (n_samples,), dtype=np.int64 

771 Cluster labels. 
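
        Examples
        --------
        A minimal sketch (:class:`~sklearn.cluster.KMeans` is used here
        purely as an illustrative clusterer):

        >>> from sklearn.cluster import KMeans
        >>> X = [[0.0], [0.1], [10.0], [10.1]]
        >>> labels = KMeans(n_clusters=2, n_init="auto", random_state=0).fit_predict(X)
        >>> labels.shape
        (4,)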

772 """ 

773 # non-optimized default implementation; override when a better 

774 # method is possible for a given clustering algorithm 

775 self.fit(X, **kwargs) 

776 return self.labels_ 

777 

778 def _more_tags(self): 

779 return {"preserves_dtype": []} 

780 

781 

782class BiclusterMixin: 

783 """Mixin class for all bicluster estimators in scikit-learn.""" 

784 

785 @property 

786 def biclusters_(self): 

787 """Convenient way to get row and column indicators together. 

788 

789 Returns the ``rows_`` and ``columns_`` members. 

790 """ 

791 return self.rows_, self.columns_ 

792 

793 def get_indices(self, i): 

794 """Row and column indices of the `i`'th bicluster. 

795 

796 Only works if ``rows_`` and ``columns_`` attributes exist. 

797 

798 Parameters 

799 ---------- 

800 i : int 

801 The index of the cluster. 

802 

803 Returns 

804 ------- 

805 row_ind : ndarray, dtype=np.intp 

806 Indices of rows in the dataset that belong to the bicluster. 

807 col_ind : ndarray, dtype=np.intp 

808 Indices of columns in the dataset that belong to the bicluster. 

809 """ 

810 rows = self.rows_[i] 

811 columns = self.columns_[i] 

812 return np.nonzero(rows)[0], np.nonzero(columns)[0] 

813 

814 def get_shape(self, i): 

815 """Shape of the `i`'th bicluster. 

816 

817 Parameters 

818 ---------- 

819 i : int 

820 The index of the cluster. 

821 

822 Returns 

823 ------- 

824 n_rows : int 

825 Number of rows in the bicluster. 

826 

827 n_cols : int 

828 Number of columns in the bicluster. 

829 """ 

830 indices = self.get_indices(i) 

831 return tuple(len(i) for i in indices) 

832 

833 def get_submatrix(self, i, data): 

834 """Return the submatrix corresponding to bicluster `i`. 

835 

836 Parameters 

837 ---------- 

838 i : int 

839 The index of the cluster. 

840 data : array-like of shape (n_samples, n_features) 

841 The data. 

842 

843 Returns 

844 ------- 

845 submatrix : ndarray of shape (n_rows, n_cols) 

846 The submatrix corresponding to bicluster `i`. 

847 

848 Notes 

849 ----- 

850 Works with sparse matrices. Only works if ``rows_`` and 

851 ``columns_`` attributes exist. 
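
        Examples
        --------
        A minimal sketch using a hypothetical ``DummyBiclustering`` class
        whose indicator attributes are set by hand:

        >>> import numpy as np
        >>> from sklearn.base import BiclusterMixin
        >>> class DummyBiclustering(BiclusterMixin):
        ...     rows_ = np.array([[True, True, False]])
        ...     columns_ = np.array([[False, True, True]])
        >>> data = np.arange(9).reshape(3, 3)
        >>> DummyBiclustering().get_submatrix(0, data)
        array([[1, 2],
               [4, 5]])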

852 """ 

853 from .utils.validation import check_array 

854 

855 data = check_array(data, accept_sparse="csr") 

856 row_ind, col_ind = self.get_indices(i) 

857 return data[row_ind[:, np.newaxis], col_ind] 

858 

859 

860class TransformerMixin(_SetOutputMixin): 

861 """Mixin class for all transformers in scikit-learn. 

862 

863 If :term:`get_feature_names_out` is defined, then :class:`BaseEstimator` will 

864 automatically wrap `transform` and `fit_transform` to follow the `set_output` 

865 API. See the :ref:`developer_api_set_output` for details. 

866 

867 :class:`OneToOneFeatureMixin` and 

868 :class:`ClassNamePrefixFeaturesOutMixin` are helpful mixins for 

869 defining :term:`get_feature_names_out`. 

870 """ 

871 

872 def fit_transform(self, X, y=None, **fit_params): 

873 """ 

874 Fit to data, then transform it. 

875 

876 Fits transformer to `X` and `y` with optional parameters `fit_params` 

877 and returns a transformed version of `X`. 

878 

879 Parameters 

880 ---------- 

881 X : array-like of shape (n_samples, n_features) 

882 Input samples. 

883 

884 y : array-like of shape (n_samples,) or (n_samples, n_outputs), \ 

885 default=None 

886 Target values (None for unsupervised transformations). 

887 

888 **fit_params : dict 

889 Additional fit parameters. 

890 

891 Returns 

892 ------- 

893 X_new : ndarray array of shape (n_samples, n_features_new) 

894 Transformed array. 
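
        Examples
        --------
        A minimal sketch (:class:`~sklearn.preprocessing.StandardScaler` is
        used here purely as an illustrative transformer):

        >>> from sklearn.preprocessing import StandardScaler
        >>> X = [[0.0], [2.0]]
        >>> StandardScaler().fit_transform(X)
        array([[-1.],
               [ 1.]])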

895 """ 

896 # non-optimized default implementation; override when a better 

897 # method is possible for a given clustering algorithm 

898 

899 # we do not route parameters here, since consumers don't route. But 

900 # since it's possible for a `transform` method to also consume 

901 # metadata, we check if that's the case, and we raise a warning telling 

902 # users that they should implement a custom `fit_transform` method 

903 # to forward metadata to `transform` as well. 

904 # 

905 # For that, we calculate routing and check if anything would be routed 

906 # to `transform` if we were to route them. 

907 if _routing_enabled(): 

908 transform_params = self.get_metadata_routing().consumes( 

909 method="transform", params=fit_params.keys() 

910 ) 

911 if transform_params: 

912 warnings.warn( 

913 ( 

914 f"This object ({self.__class__.__name__}) has a `transform`" 

915 " method which consumes metadata, but `fit_transform` does not" 

916 " forward metadata to `transform`. Please implement a custom" 

917 " `fit_transform` method to forward metadata to `transform` as" 

918 " well. Alternatively, you can explicitly do" 

919 " `set_transform_request`and set all values to `False` to" 

920 " disable metadata routed to `transform`, if that's an option." 

921 ), 

922 UserWarning, 

923 ) 

924 

925 if y is None: 

926 # fit method of arity 1 (unsupervised transformation) 

927 return self.fit(X, **fit_params).transform(X) 

928 else: 

929 # fit method of arity 2 (supervised transformation) 

930 return self.fit(X, y, **fit_params).transform(X) 

931 

932 

933class OneToOneFeatureMixin: 

934 """Provides `get_feature_names_out` for simple transformers. 

935 

936 This mixin assumes there's a 1-to-1 correspondence between input features 

937 and output features, such as :class:`~sklearn.preprocessing.StandardScaler`. 

938 """ 

939 

940 def get_feature_names_out(self, input_features=None): 

941 """Get output feature names for transformation. 

942 

943 Parameters 

944 ---------- 

945 input_features : array-like of str or None, default=None 

946 Input features. 

947 

948 - If `input_features` is `None`, then `feature_names_in_` is 

949 used as feature names in. If `feature_names_in_` is not defined, 

950 then the following input feature names are generated: 

951 `["x0", "x1", ..., "x(n_features_in_ - 1)"]`. 

952 - If `input_features` is an array-like, then `input_features` must 

953 match `feature_names_in_` if `feature_names_in_` is defined. 

954 

955 Returns 

956 ------- 

957 feature_names_out : ndarray of str objects 

958 Same as input features. 
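
        Examples
        --------
        A minimal sketch (:class:`~sklearn.preprocessing.StandardScaler` is
        used here only because it inherits this mixin; the input has no
        feature names, so generated names are returned):

        >>> from sklearn.preprocessing import StandardScaler
        >>> scaler = StandardScaler().fit([[1.0, 2.0], [3.0, 4.0]])
        >>> scaler.get_feature_names_out()
        array(['x0', 'x1'], dtype=object)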

959 """ 

960 check_is_fitted(self, "n_features_in_") 

961 return _check_feature_names_in(self, input_features) 

962 

963 

964class ClassNamePrefixFeaturesOutMixin: 

965 """Mixin class for transformers that generate their own names by prefixing. 

966 

967 This mixin is useful when the transformer needs to generate its own feature 

968 names out, such as :class:`~sklearn.decomposition.PCA`. For example, if 

969 :class:`~sklearn.decomposition.PCA` outputs 3 features, then the generated feature 

970 names out are: `["pca0", "pca1", "pca2"]`. 

971 

972 This mixin assumes that a `_n_features_out` attribute is defined when the 

973 transformer is fitted. `_n_features_out` is the number of output features 

974 that the transformer will return in `transform` of `fit_transform`. 

975 """ 

976 

977 def get_feature_names_out(self, input_features=None): 

978 """Get output feature names for transformation. 

979 

980 The feature names out will prefixed by the lowercased class name. For 

981 example, if the transformer outputs 3 features, then the feature names 

982 out are: `["class_name0", "class_name1", "class_name2"]`. 

983 

984 Parameters 

985 ---------- 

986 input_features : array-like of str or None, default=None 

987 Only used to validate feature names with the names seen in `fit`. 

988 

989 Returns 

990 ------- 

991 feature_names_out : ndarray of str objects 

992 Transformed feature names. 
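
        Examples
        --------
        A minimal sketch (:class:`~sklearn.decomposition.PCA` is used here
        only because it inherits this mixin):

        >>> from sklearn.decomposition import PCA
        >>> X = [[1.0, 2.0, 3.0], [3.0, 2.0, 1.0], [5.0, 4.0, 3.0]]
        >>> pca = PCA(n_components=2).fit(X)
        >>> pca.get_feature_names_out()
        array(['pca0', 'pca1'], dtype=object)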

993 """ 

994 check_is_fitted(self, "_n_features_out") 

995 return _generate_get_feature_names_out( 

996 self, self._n_features_out, input_features=input_features 

997 ) 

998 

999 

1000class DensityMixin: 

1001 """Mixin class for all density estimators in scikit-learn.""" 

1002 

1003 _estimator_type = "DensityEstimator" 

1004 

1005 def score(self, X, y=None): 

1006 """Return the score of the model on the data `X`. 

1007 

1008 Parameters 

1009 ---------- 

1010 X : array-like of shape (n_samples, n_features) 

1011 Test samples. 

1012 

1013 y : Ignored 

1014 Not used, present for API consistency by convention. 

1015 

1016 Returns 

1017 ------- 

1018 score : float 

1019 """ 

1020 pass 

1021 

1022 

1023class OutlierMixin: 

1024 """Mixin class for all outlier detection estimators in scikit-learn.""" 

1025 

1026 _estimator_type = "outlier_detector" 

1027 

1028 def fit_predict(self, X, y=None, **kwargs): 

1029 """Perform fit on X and returns labels for X. 

1030 

1031 Returns -1 for outliers and 1 for inliers. 

1032 

1033 Parameters 

1034 ---------- 

1035 X : {array-like, sparse matrix} of shape (n_samples, n_features) 

1036 The input samples. 

1037 

1038 y : Ignored 

1039 Not used, present for API consistency by convention. 

1040 

1041 **kwargs : dict 

1042 Arguments to be passed to ``fit``. 

1043 

1044 .. versionadded:: 1.4 

1045 

1046 Returns 

1047 ------- 

1048 y : ndarray of shape (n_samples,) 

1049 1 for inliers, -1 for outliers. 
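
        Examples
        --------
        A minimal sketch (:class:`~sklearn.ensemble.IsolationForest` is used
        here purely as an illustrative detector):

        >>> from sklearn.ensemble import IsolationForest
        >>> X = [[-1.1], [0.3], [0.5], [100.0]]
        >>> labels = IsolationForest(random_state=0).fit_predict(X)
        >>> labels.shape
        (4,)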

1050 """ 

1051 # we do not route parameters here, since consumers don't route. But 

1052 # since it's possible for a `predict` method to also consume 

1053 # metadata, we check if that's the case, and we raise a warning telling 

1054 # users that they should implement a custom `fit_predict` method 

1055 # to forward metadata to `predict` as well. 

1056 # 

1057 # For that, we calculate routing and check if anything would be routed 

1058 # to `predict` if we were to route them. 

1059 if _routing_enabled(): 

1060 transform_params = self.get_metadata_routing().consumes( 

1061 method="predict", params=kwargs.keys() 

1062 ) 

1063 if transform_params: 

1064 warnings.warn( 

1065 ( 

1066 f"This object ({self.__class__.__name__}) has a `predict` " 

1067 "method which consumes metadata, but `fit_predict` does not " 

1068 "forward metadata to `predict`. Please implement a custom " 

1069 "`fit_predict` method to forward metadata to `predict` as well." 

1070 "Alternatively, you can explicitly do `set_predict_request`" 

1071 "and set all values to `False` to disable metadata routed to " 

1072 "`predict`, if that's an option." 

1073 ), 

1074 UserWarning, 

1075 ) 

1076 

1077 # override for transductive outlier detectors like LocalOulierFactor 

1078 return self.fit(X, **kwargs).predict(X) 

1079 

1080 

1081class MetaEstimatorMixin: 

1082 _required_parameters = ["estimator"] 

1083 """Mixin class for all meta estimators in scikit-learn.""" 

1084 

1085 

1086class MultiOutputMixin: 

1087 """Mixin to mark estimators that support multioutput.""" 

1088 

1089 def _more_tags(self): 

1090 return {"multioutput": True} 

1091 

1092 

1093class _UnstableArchMixin: 

1094 """Mark estimators that are non-determinstic on 32bit or PowerPC""" 

1095 

1096 def _more_tags(self): 

1097 return { 

1098 "non_deterministic": _IS_32BIT or platform.machine().startswith( 

1099 ("ppc", "powerpc") 

1100 ) 

1101 } 

1102 

1103 

1104def is_classifier(estimator): 

1105 """Return True if the given estimator is (probably) a classifier. 

1106 

1107 Parameters 

1108 ---------- 

1109 estimator : object 

1110 Estimator object to test. 

1111 

1112 Returns 

1113 ------- 

1114 out : bool 

1115 True if estimator is a classifier and False otherwise. 
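
    Examples
    --------
    A minimal sketch (:class:`~sklearn.svm.SVC` and :class:`~sklearn.svm.SVR`
    are used here purely as illustrative estimators):

    >>> from sklearn.base import is_classifier
    >>> from sklearn.svm import SVC, SVR
    >>> is_classifier(SVC())
    True
    >>> is_classifier(SVR())
    False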

1116 """ 

1117 return getattr(estimator, "_estimator_type", None) == "classifier" 

1118 

1119 

1120def is_regressor(estimator): 

1121 """Return True if the given estimator is (probably) a regressor. 

1122 

1123 Parameters 

1124 ---------- 

1125 estimator : estimator instance 

1126 Estimator object to test. 

1127 

1128 Returns 

1129 ------- 

1130 out : bool 

1131 True if estimator is a regressor and False otherwise. 

1132 """ 

1133 return getattr(estimator, "_estimator_type", None) == "regressor" 

1134 

1135 

1136def is_outlier_detector(estimator): 

1137 """Return True if the given estimator is (probably) an outlier detector. 

1138 

1139 Parameters 

1140 ---------- 

1141 estimator : estimator instance 

1142 Estimator object to test. 

1143 

1144 Returns 

1145 ------- 

1146 out : bool 

1147 True if estimator is an outlier detector and False otherwise. 

1148 """ 

1149 return getattr(estimator, "_estimator_type", None) == "outlier_detector" 

1150 

1151 

1152def _fit_context(*, prefer_skip_nested_validation): 

1153 """Decorator to run the fit methods of estimators within context managers. 

1154 

1155 Parameters 

1156 ---------- 

1157 prefer_skip_nested_validation : bool 

1158 If True, the validation of parameters of inner estimators or functions 

1159 called during fit will be skipped. 

1160 

1161 This is useful to avoid validating many times the parameters passed by the 

1162 user from the public facing API. It's also useful to avoid validating 

1163 parameters that we pass internally to inner functions that are guaranteed to 

1164 be valid by the test suite. 

1165 

1166 It should be set to True for most estimators, except for those that receive 

1167 non-validated objects as parameters, such as meta-estimators that are given 

1168 estimator objects. 

1169 

1170 Returns 

1171 ------- 

1172 decorated_fit : method 

1173 The decorated fit method. 
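
    Examples
    --------
    A minimal sketch of how the decorator is typically applied
    (``MyEstimator`` is a hypothetical estimator defined only for this
    illustration):

    >>> class MyEstimator(BaseEstimator):
    ...     _parameter_constraints = {}
    ...
    ...     @_fit_context(prefer_skip_nested_validation=True)
    ...     def fit(self, X, y=None):
    ...         return self
    >>> _ = MyEstimator().fit([[1, 2], [3, 4]])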

1174 """ 

1175 

1176 def decorator(fit_method): 

1177 @functools.wraps(fit_method) 

1178 def wrapper(estimator, *args, **kwargs): 

1179 global_skip_validation = get_config()["skip_parameter_validation"] 

1180 

1181 # we don't want to validate again for each call to partial_fit 

1182 partial_fit_and_fitted = ( 

1183 fit_method.__name__ == "partial_fit" and _is_fitted(estimator) 

1184 ) 

1185 

1186 if not global_skip_validation and not partial_fit_and_fitted: 

1187 estimator._validate_params() 

1188 

1189 with config_context( 

1190 skip_parameter_validation=( 

1191 prefer_skip_nested_validation or global_skip_validation 

1192 ) 

1193 ): 

1194 return fit_method(estimator, *args, **kwargs) 

1195 

1196 return wrapper 

1197 

1198 return decorator