1"""Base classes for all estimators."""
3# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
4# License: BSD 3 clause
6import copy
7import functools
8import inspect
9import platform
10import re
11import warnings
12from collections import defaultdict
14import numpy as np
16from . import __version__
17from ._config import config_context, get_config
18from .exceptions import InconsistentVersionWarning
19from .utils import _IS_32BIT
20from .utils._estimator_html_repr import _HTMLDocumentationLinkMixin, estimator_html_repr
21from .utils._metadata_requests import _MetadataRequester, _routing_enabled
22from .utils._param_validation import validate_parameter_constraints
23from .utils._set_output import _SetOutputMixin
24from .utils._tags import (
25 _DEFAULT_TAGS,
26)
27from .utils.validation import (
28 _check_feature_names_in,
29 _check_y,
30 _generate_get_feature_names_out,
31 _get_feature_names,
32 _is_fitted,
33 _num_features,
34 check_array,
35 check_is_fitted,
36 check_X_y,
37)


def clone(estimator, *, safe=True):
    """Construct a new unfitted estimator with the same parameters.

    Clone does a deep copy of the model in an estimator
    without actually copying attached data. It returns a new estimator
    with the same parameters that has not been fitted on any data.

    .. versionchanged:: 1.3
        Delegates to `estimator.__sklearn_clone__` if the method exists.

    Parameters
    ----------
    estimator : {list, tuple, set} of estimator instance or a single \
            estimator instance
        The estimator or group of estimators to be cloned.
    safe : bool, default=True
        If safe is False, clone will fall back to a deep copy on objects
        that are not estimators. Ignored if `estimator.__sklearn_clone__`
        exists.

    Returns
    -------
    estimator : object
        The deep copy of the input, an estimator if input is an estimator.

    Notes
    -----
    If the estimator's `random_state` parameter is an integer (or if the
    estimator doesn't have a `random_state` parameter), an *exact clone* is
    returned: the clone and the original estimator will give the exact same
    results. Otherwise, a *statistical clone* is returned: the clone might
    return different results from the original estimator. More details can be
    found in :ref:`randomness`.
    """
    if hasattr(estimator, "__sklearn_clone__") and not inspect.isclass(estimator):
        return estimator.__sklearn_clone__()
    return _clone_parametrized(estimator, safe=safe)
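

# A minimal usage sketch (illustrative only; ``LogisticRegression`` is just a
# convenient example estimator, not a dependency of this module). Cloning
# keeps the constructor parameters but drops any fitted state:
#
#   >>> from sklearn.base import clone
#   >>> from sklearn.linear_model import LogisticRegression
#   >>> est = LogisticRegression(C=2.0).fit([[0.0], [1.0]], [0, 1])
#   >>> cloned = clone(est)
#   >>> cloned.C
#   2.0
#   >>> hasattr(cloned, "coef_")
#   False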


def _clone_parametrized(estimator, *, safe=True):
    """Default implementation of clone. See :func:`sklearn.base.clone` for details."""

    estimator_type = type(estimator)
    if estimator_type is dict:
        return {k: clone(v, safe=safe) for k, v in estimator.items()}
    elif estimator_type in (list, tuple, set, frozenset):
        return estimator_type([clone(e, safe=safe) for e in estimator])
    elif not hasattr(estimator, "get_params") or isinstance(estimator, type):
        if not safe:
            return copy.deepcopy(estimator)
        else:
            if isinstance(estimator, type):
                raise TypeError(
                    "Cannot clone object. "
                    "You should provide an instance of "
                    "scikit-learn estimator instead of a class."
                )
            else:
                raise TypeError(
                    "Cannot clone object '%s' (type %s): "
                    "it does not seem to be a scikit-learn "
                    "estimator as it does not implement a "
                    "'get_params' method." % (repr(estimator), type(estimator))
                )

    klass = estimator.__class__
    new_object_params = estimator.get_params(deep=False)
    for name, param in new_object_params.items():
        new_object_params[name] = clone(param, safe=False)

    new_object = klass(**new_object_params)
    try:
        new_object._metadata_request = copy.deepcopy(estimator._metadata_request)
    except AttributeError:
        pass

    params_set = new_object.get_params(deep=False)

    # quick sanity check of the parameters of the clone
    for name in new_object_params:
        param1 = new_object_params[name]
        param2 = params_set[name]
        if param1 is not param2:
            raise RuntimeError(
                "Cannot clone object %s, as the constructor "
                "either does not set or modifies parameter %s" % (estimator, name)
            )

    # _sklearn_output_config is used by `set_output` to configure the output
    # container of an estimator.
    if hasattr(estimator, "_sklearn_output_config"):
        new_object._sklearn_output_config = copy.deepcopy(
            estimator._sklearn_output_config
        )
    return new_object


class BaseEstimator(_HTMLDocumentationLinkMixin, _MetadataRequester):
    """Base class for all estimators in scikit-learn.

    Notes
    -----
    All estimators should specify all the parameters that can be set
    at the class level in their ``__init__`` as explicit keyword
    arguments (no ``*args`` or ``**kwargs``).
    """

    @classmethod
    def _get_param_names(cls):
        """Get parameter names for the estimator."""
        # fetch the constructor or the original constructor before
        # deprecation wrapping if any
        init = getattr(cls.__init__, "deprecated_original", cls.__init__)
        if init is object.__init__:
            # No explicit constructor to introspect
            return []

        # introspect the constructor arguments to find the model parameters
        # to represent
        init_signature = inspect.signature(init)
        # Consider the constructor parameters excluding 'self'
        parameters = [
            p
            for p in init_signature.parameters.values()
            if p.name != "self" and p.kind != p.VAR_KEYWORD
        ]
        for p in parameters:
            if p.kind == p.VAR_POSITIONAL:
                raise RuntimeError(
                    "scikit-learn estimators should always "
                    "specify their parameters in the signature"
                    " of their __init__ (no varargs)."
                    " %s with constructor %s doesn't"
                    " follow this convention." % (cls, init_signature)
                )
        # Extract and sort argument names excluding 'self'
        return sorted([p.name for p in parameters])

    def get_params(self, deep=True):
        """
        Get parameters for this estimator.

        Parameters
        ----------
        deep : bool, default=True
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : dict
            Parameter names mapped to their values.
        """
        out = dict()
        for key in self._get_param_names():
            value = getattr(self, key)
            if deep and hasattr(value, "get_params") and not isinstance(value, type):
                deep_items = value.get_params().items()
                out.update((key + "__" + k, val) for k, val in deep_items)
            out[key] = value
        return out
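
    # A small illustrative sketch (``Pipeline`` and ``StandardScaler`` are
    # only examples): with ``deep=True``, parameters of nested estimators are
    # flattened into ``<component>__<parameter>`` keys.
    #
    #   >>> from sklearn.pipeline import Pipeline
    #   >>> from sklearn.preprocessing import StandardScaler
    #   >>> pipe = Pipeline([("scale", StandardScaler())])
    #   >>> "scale__with_mean" in pipe.get_params(deep=True)
    #   True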

    def set_params(self, **params):
        """Set the parameters of this estimator.

        The method works on simple estimators as well as on nested objects
        (such as :class:`~sklearn.pipeline.Pipeline`). The latter have
        parameters of the form ``<component>__<parameter>`` so that it's
        possible to update each component of a nested object.

        Parameters
        ----------
        **params : dict
            Estimator parameters.

        Returns
        -------
        self : estimator instance
            Estimator instance.
        """
        if not params:
            # Simple optimization to gain speed (inspect is slow)
            return self
        valid_params = self.get_params(deep=True)

        nested_params = defaultdict(dict)  # grouped by prefix
        for key, value in params.items():
            key, delim, sub_key = key.partition("__")
            if key not in valid_params:
                local_valid_params = self._get_param_names()
                raise ValueError(
                    f"Invalid parameter {key!r} for estimator {self}. "
                    f"Valid parameters are: {local_valid_params!r}."
                )

            if delim:
                nested_params[key][sub_key] = value
            else:
                setattr(self, key, value)
                valid_params[key] = value

        for key, sub_params in nested_params.items():
            valid_params[key].set_params(**sub_params)

        return self
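
    # An illustrative sketch of nested parameter setting (``Pipeline`` and
    # ``StandardScaler`` are only examples): the ``__`` delimiter routes the
    # value to the named sub-estimator.
    #
    #   >>> pipe = Pipeline([("scale", StandardScaler())])
    #   >>> _ = pipe.set_params(scale__with_mean=False)
    #   >>> pipe.get_params()["scale__with_mean"]
    #   False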

    def __sklearn_clone__(self):
        return _clone_parametrized(self)

    def __repr__(self, N_CHAR_MAX=700):
        # N_CHAR_MAX is the (approximate) maximum number of non-blank
        # characters to render. We pass it as an optional parameter to ease
        # the tests.

        from .utils._pprint import _EstimatorPrettyPrinter

        N_MAX_ELEMENTS_TO_SHOW = 30  # number of elements to show in sequences

        # use ellipsis for sequences with a lot of elements
        pp = _EstimatorPrettyPrinter(
            compact=True,
            indent=1,
            indent_at_name=True,
            n_max_elements_to_show=N_MAX_ELEMENTS_TO_SHOW,
        )

        repr_ = pp.pformat(self)

        # Use bruteforce ellipsis when there are a lot of non-blank characters
        n_nonblank = len("".join(repr_.split()))
        if n_nonblank > N_CHAR_MAX:
            lim = N_CHAR_MAX // 2  # approx number of chars to keep on both ends
            regex = r"^(\s*\S){%d}" % lim
            # The regex '^(\s*\S){%d}' % n
            # matches from the start of the string until the nth non-blank
            # character:
            # - ^ matches the start of string
            # - (pattern){n} matches n repetitions of pattern
            # - \s*\S matches a non-blank char following zero or more blanks
            left_lim = re.match(regex, repr_).end()
            right_lim = re.match(regex, repr_[::-1]).end()

            if "\n" in repr_[left_lim:-right_lim]:
                # The left side and right side aren't on the same line.
                # To avoid weird cuts, e.g.:
                # categoric...ore',
                # we need to start the right side with an appropriate newline
                # character so that it renders properly as:
                # categoric...
                # handle_unknown='ignore',
                # so we add [^\n]*\n which matches until the next \n
                regex += r"[^\n]*\n"
                right_lim = re.match(regex, repr_[::-1]).end()

            ellipsis = "..."
            if left_lim + len(ellipsis) < len(repr_) - right_lim:
                # Only add ellipsis if it results in a shorter repr
                repr_ = repr_[:left_lim] + "..." + repr_[-right_lim:]

        return repr_

    def __getstate__(self):
        if getattr(self, "__slots__", None):
            raise TypeError(
                "You cannot use `__slots__` in objects inheriting from "
                "`sklearn.base.BaseEstimator`."
            )

        try:
            state = super().__getstate__()
            if state is None:
                # For Python 3.11+, an empty instance (no `__slots__` and an
                # empty `__dict__`) will return a state equal to `None`.
                state = self.__dict__.copy()
        except AttributeError:
            # Python < 3.11
            state = self.__dict__.copy()

        if type(self).__module__.startswith("sklearn."):
            return dict(state.items(), _sklearn_version=__version__)
        else:
            return state

    def __setstate__(self, state):
        if type(self).__module__.startswith("sklearn."):
            pickle_version = state.pop("_sklearn_version", "pre-0.18")
            if pickle_version != __version__:
                warnings.warn(
                    InconsistentVersionWarning(
                        estimator_name=self.__class__.__name__,
                        current_sklearn_version=__version__,
                        original_sklearn_version=pickle_version,
                    ),
                )
        try:
            super().__setstate__(state)
        except AttributeError:
            self.__dict__.update(state)

    def _more_tags(self):
        return _DEFAULT_TAGS

    def _get_tags(self):
        collected_tags = {}
        for base_class in reversed(inspect.getmro(self.__class__)):
            if hasattr(base_class, "_more_tags"):
                # need the if because mixins might not have _more_tags
                # but might do redundant work in estimators
                # (i.e. calling more tags on BaseEstimator multiple times)
                more_tags = base_class._more_tags(self)
                collected_tags.update(more_tags)
        return collected_tags
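
    # A sketch of how tags compose (the ``MyEstimator`` class is hypothetical,
    # for illustration only): subclasses override ``_more_tags`` and
    # ``_get_tags`` merges the dicts along the MRO, with the most-derived
    # class winning.
    #
    #   >>> class MyEstimator(BaseEstimator):
    #   ...     def _more_tags(self):
    #   ...         return {"allow_nan": True}
    #   >>> MyEstimator()._get_tags()["allow_nan"]
    #   True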

    def _check_n_features(self, X, reset):
        """Set the `n_features_in_` attribute, or check against it.

        Parameters
        ----------
        X : {ndarray, sparse matrix} of shape (n_samples, n_features)
            The input samples.
        reset : bool
            If True, the `n_features_in_` attribute is set to `X.shape[1]`.
            If False and the attribute exists, then check that it is equal to
            `X.shape[1]`. If False and the attribute does *not* exist, then
            the check is skipped.

            .. note::
               It is recommended to call `reset=True` in `fit` and in the
               first call to `partial_fit`. All other methods that validate
               `X` should set `reset=False`.
        """
        try:
            n_features = _num_features(X)
        except TypeError as e:
            if not reset and hasattr(self, "n_features_in_"):
                raise ValueError(
                    "X does not contain any features, but "
                    f"{self.__class__.__name__} is expecting "
                    f"{self.n_features_in_} features"
                ) from e
            # If the number of features is not defined and reset=True,
            # then we skip this check
            return

        if reset:
            self.n_features_in_ = n_features
            return

        if not hasattr(self, "n_features_in_"):
            # Skip this check if the expected number of input features
            # was not recorded by calling fit first. This is typically the
            # case for stateless transformers.
            return

        if n_features != self.n_features_in_:
            raise ValueError(
                f"X has {n_features} features, but {self.__class__.__name__} "
                f"is expecting {self.n_features_in_} features as input."
            )

    def _check_feature_names(self, X, *, reset):
        """Set or check the `feature_names_in_` attribute.

        .. versionadded:: 1.0

        Parameters
        ----------
        X : {ndarray, dataframe} of shape (n_samples, n_features)
            The input samples.

        reset : bool
            Whether to reset the `feature_names_in_` attribute.
            If False, the input will be checked for consistency with
            feature names of data provided when reset was last True.

            .. note::
               It is recommended to call `reset=True` in `fit` and in the
               first call to `partial_fit`. All other methods that validate
               `X` should set `reset=False`.
        """

        if reset:
            feature_names_in = _get_feature_names(X)
            if feature_names_in is not None:
                self.feature_names_in_ = feature_names_in
            elif hasattr(self, "feature_names_in_"):
                # Delete the attribute when the estimator is fitted on a new
                # dataset that has no feature names.
                delattr(self, "feature_names_in_")
            return

        fitted_feature_names = getattr(self, "feature_names_in_", None)
        X_feature_names = _get_feature_names(X)

        if fitted_feature_names is None and X_feature_names is None:
            # no feature names seen in fit and in X
            return

        if X_feature_names is not None and fitted_feature_names is None:
            warnings.warn(
                f"X has feature names, but {self.__class__.__name__} was fitted"
                " without feature names"
            )
            return

        if X_feature_names is None and fitted_feature_names is not None:
            warnings.warn(
                "X does not have valid feature names, but"
                f" {self.__class__.__name__} was fitted with feature names"
            )
            return

        # validate the feature names against the `feature_names_in_` attribute
        if len(fitted_feature_names) != len(X_feature_names) or np.any(
            fitted_feature_names != X_feature_names
        ):
            message = (
                "The feature names should match those that were passed during fit.\n"
            )
            fitted_feature_names_set = set(fitted_feature_names)
            X_feature_names_set = set(X_feature_names)

            unexpected_names = sorted(X_feature_names_set - fitted_feature_names_set)
            missing_names = sorted(fitted_feature_names_set - X_feature_names_set)

            def add_names(names):
                output = ""
                max_n_names = 5
                for i, name in enumerate(names):
                    if i >= max_n_names:
                        output += "- ...\n"
                        break
                    output += f"- {name}\n"
                return output

            if unexpected_names:
                message += "Feature names unseen at fit time:\n"
                message += add_names(unexpected_names)

            if missing_names:
                message += "Feature names seen at fit time, yet now missing:\n"
                message += add_names(missing_names)

            if not missing_names and not unexpected_names:
                message += (
                    "Feature names must be in the same order as they were in fit.\n"
                )

            raise ValueError(message)

    def _validate_data(
        self,
        X="no_validation",
        y="no_validation",
        reset=True,
        validate_separately=False,
        cast_to_ndarray=True,
        **check_params,
    ):
        """Validate input data and set or check the `n_features_in_` attribute.

        Parameters
        ----------
        X : {array-like, sparse matrix, dataframe} of shape \
                (n_samples, n_features), default='no_validation'
            The input samples.
            If `'no_validation'`, no validation is performed on `X`. This is
            useful for meta-estimators which can delegate input validation to
            their underlying estimator(s). In that case `y` must be passed and
            the only accepted `check_params` are `multi_output` and
            `y_numeric`.

        y : array-like of shape (n_samples,), default='no_validation'
            The targets.

            - If `None`, `check_array` is called on `X`. If the estimator's
              `requires_y` tag is True, then an error will be raised.
            - If `'no_validation'`, `check_array` is called on `X` and the
              estimator's `requires_y` tag is ignored. This is a default
              placeholder and is never meant to be explicitly set. In that
              case `X` must be passed.
            - Otherwise, only `y` with `_check_y` or both `X` and `y` are
              checked with either `check_array` or `check_X_y` depending on
              `validate_separately`.

        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute.
            If False, the input will be checked for consistency with data
            provided when reset was last True.

            .. note::
               It is recommended to call `reset=True` in `fit` and in the
               first call to `partial_fit`. All other methods that validate
               `X` should set `reset=False`.

        validate_separately : False or tuple of dicts, default=False
            Only used if `y` is not None.
            If False, call `check_X_y()`. Else, it must be a tuple of kwargs
            to be used for calling `check_array()` on `X` and `y` respectively.

            `estimator=self` is automatically added to these dicts to generate
            more informative error messages in case of invalid input data.

        cast_to_ndarray : bool, default=True
            Cast `X` and `y` to ndarray with checks in `check_params`. If
            `False`, `X` and `y` are unchanged and only `feature_names_in_`
            and `n_features_in_` are checked.

        **check_params : kwargs
            Parameters passed to :func:`sklearn.utils.check_array` or
            :func:`sklearn.utils.check_X_y`. Ignored if `validate_separately`
            is not False.

            `estimator=self` is automatically added to these params to
            generate more informative error messages in case of invalid input
            data.

        Returns
        -------
        out : {ndarray, sparse matrix} or tuple of these
            The validated input. A tuple is returned if both `X` and `y` are
            validated.
        """
        self._check_feature_names(X, reset=reset)

        if y is None and self._get_tags()["requires_y"]:
            raise ValueError(
                f"This {self.__class__.__name__} estimator "
                "requires y to be passed, but the target y is None."
            )

        no_val_X = isinstance(X, str) and X == "no_validation"
        no_val_y = y is None or isinstance(y, str) and y == "no_validation"

        if no_val_X and no_val_y:
            raise ValueError("Validation should be done on X, y or both.")

        default_check_params = {"estimator": self}
        check_params = {**default_check_params, **check_params}

        if not cast_to_ndarray:
            if not no_val_X and no_val_y:
                out = X
            elif no_val_X and not no_val_y:
                out = y
            else:
                out = X, y
        elif not no_val_X and no_val_y:
            out = check_array(X, input_name="X", **check_params)
        elif no_val_X and not no_val_y:
            out = _check_y(y, **check_params)
        else:
            if validate_separately:
                # We need this because some estimators validate X and y
                # separately, and in general, separately calling check_array()
                # on X and y isn't equivalent to just calling check_X_y()
                # :(
                check_X_params, check_y_params = validate_separately
                if "estimator" not in check_X_params:
                    check_X_params = {**default_check_params, **check_X_params}
                X = check_array(X, input_name="X", **check_X_params)
                if "estimator" not in check_y_params:
                    check_y_params = {**default_check_params, **check_y_params}
                y = check_array(y, input_name="y", **check_y_params)
            else:
                X, y = check_X_y(X, y, **check_params)
            out = X, y

        if not no_val_X and check_params.get("ensure_2d", True):
            self._check_n_features(X, reset=reset)

        return out
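
    # A sketch of typical use inside a custom estimator's ``fit`` (the
    # estimator and its methods here are hypothetical):
    #
    #   class MyEstimator(BaseEstimator):
    #       def fit(self, X, y):
    #           # sets n_features_in_ / feature_names_in_ and validates X, y
    #           X, y = self._validate_data(X, y, reset=True)
    #           ...
    #           return self
    #
    #       def predict(self, X):
    #           # checks consistency with what was seen in fit
    #           X = self._validate_data(X, reset=False)
    #           ...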

    def _validate_params(self):
        """Validate types and values of constructor parameters.

        The expected type and values must be defined in the
        `_parameter_constraints` class attribute, which is a dictionary
        `param_name: list of constraints`. See the docstring of
        `validate_parameter_constraints` for a description of the accepted
        constraints.
        """
        validate_parameter_constraints(
            self._parameter_constraints,
            self.get_params(deep=False),
            caller_name=self.__class__.__name__,
        )
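
    # A sketch of a `_parameter_constraints` declaration (the estimator and
    # its parameters are hypothetical; `Interval` and `StrOptions` are the
    # constraint helpers from `sklearn.utils._param_validation`):
    #
    #   from numbers import Real
    #   from sklearn.utils._param_validation import Interval, StrOptions
    #
    #   class MyEstimator(BaseEstimator):
    #       _parameter_constraints = {
    #           "alpha": [Interval(Real, 0, None, closed="left")],
    #           "solver": [StrOptions({"lbfgs", "sgd"})],
    #       }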

    @property
    def _repr_html_(self):
        """HTML representation of estimator.

        This is redundant with the logic of `_repr_mimebundle_`. The latter
        should be favored in the long term, `_repr_html_` is only
        implemented for consumers who do not interpret `_repr_mimebundle_`.
        """
        if get_config()["display"] != "diagram":
            raise AttributeError(
                "_repr_html_ is only defined when the "
                "'display' configuration option is set to "
                "'diagram'"
            )
        return self._repr_html_inner

    def _repr_html_inner(self):
        """This function is returned by the @property `_repr_html_` to make
        `hasattr(estimator, "_repr_html_")` return `True` or `False` depending
        on `get_config()["display"]`.
        """
        return estimator_html_repr(self)

    def _repr_mimebundle_(self, **kwargs):
        """Mime bundle used by jupyter kernels to display estimator."""
        output = {"text/plain": repr(self)}
        if get_config()["display"] == "diagram":
            output["text/html"] = estimator_html_repr(self)
        return output


class ClassifierMixin:
    """Mixin class for all classifiers in scikit-learn."""

    _estimator_type = "classifier"

    def score(self, X, y, sample_weight=None):
        """
        Return the mean accuracy on the given test data and labels.

        In multi-label classification, this is the subset accuracy
        which is a harsh metric since you require for each sample that
        each label set be correctly predicted.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True labels for `X`.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            Mean accuracy of ``self.predict(X)`` w.r.t. `y`.
        """
        from .metrics import accuracy_score

        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)

    def _more_tags(self):
        return {"requires_y": True}
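

# A minimal custom classifier sketch (entirely hypothetical) showing what the
# mixin contributes: `score` and the `requires_y` tag come for free once
# `fit` and `predict` are implemented.
#
#   >>> import numpy as np
#   >>> class MajorityClassifier(ClassifierMixin, BaseEstimator):
#   ...     def fit(self, X, y):
#   ...         values, counts = np.unique(y, return_counts=True)
#   ...         self.majority_ = values[np.argmax(counts)]
#   ...         return self
#   ...     def predict(self, X):
#   ...         return np.full(len(X), self.majority_)
#   >>> MajorityClassifier().fit([[0], [1], [2]], [1, 1, 0]).score([[3]], [1])
#   1.0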


class RegressorMixin:
    """Mixin class for all regression estimators in scikit-learn."""

    _estimator_type = "regressor"

    def score(self, X, y, sample_weight=None):
        """Return the coefficient of determination of the prediction.

        The coefficient of determination :math:`R^2` is defined as
        :math:`(1 - \\frac{u}{v})`, where :math:`u` is the residual
        sum of squares ``((y_true - y_pred) ** 2).sum()`` and :math:`v`
        is the total sum of squares ``((y_true - y_true.mean()) ** 2).sum()``.
        The best possible score is 1.0 and it can be negative (because the
        model can be arbitrarily worse). A constant model that always predicts
        the expected value of `y`, disregarding the input features, would get
        an :math:`R^2` score of 0.0.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples. For some estimators this may be a precomputed
            kernel matrix or a list of generic objects instead with shape
            ``(n_samples, n_samples_fitted)``, where ``n_samples_fitted``
            is the number of samples used in the fitting for the estimator.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True values for `X`.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            :math:`R^2` of ``self.predict(X)`` w.r.t. `y`.

        Notes
        -----
        The :math:`R^2` score used when calling ``score`` on a regressor uses
        ``multioutput='uniform_average'`` from version 0.23 to keep consistent
        with default value of :func:`~sklearn.metrics.r2_score`.
        This influences the ``score`` method of all the multioutput
        regressors (except for
        :class:`~sklearn.multioutput.MultiOutputRegressor`).
        """

        from .metrics import r2_score

        y_pred = self.predict(X)
        return r2_score(y, y_pred, sample_weight=sample_weight)

    def _more_tags(self):
        return {"requires_y": True}
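

# A numeric sketch of the R^2 definition above (values chosen only for
# illustration): with y_true = [1, 2, 3] and y_pred = [1, 2, 2], the residual
# sum of squares is u = 1.0 and the total sum of squares is v = 2.0, so
# R^2 = 1 - u / v = 0.5.
#
#   >>> from sklearn.metrics import r2_score
#   >>> r2_score([1, 2, 3], [1, 2, 2])
#   0.5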


class ClusterMixin:
    """Mixin class for all cluster estimators in scikit-learn."""

    _estimator_type = "clusterer"

    def fit_predict(self, X, y=None, **kwargs):
        """
        Perform clustering on `X` and return cluster labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data.

        y : Ignored
            Not used, present for API consistency by convention.

        **kwargs : dict
            Arguments to be passed to ``fit``.

            .. versionadded:: 1.4

        Returns
        -------
        labels : ndarray of shape (n_samples,), dtype=np.int64
            Cluster labels.
        """
        # non-optimized default implementation; override when a better
        # method is possible for a given clustering algorithm
        self.fit(X, **kwargs)
        return self.labels_

    def _more_tags(self):
        return {"preserves_dtype": []}


class BiclusterMixin:
    """Mixin class for all bicluster estimators in scikit-learn."""

    @property
    def biclusters_(self):
        """Convenient way to get row and column indicators together.

        Returns the ``rows_`` and ``columns_`` members.
        """
        return self.rows_, self.columns_

    def get_indices(self, i):
        """Row and column indices of the `i`'th bicluster.

        Only works if ``rows_`` and ``columns_`` attributes exist.

        Parameters
        ----------
        i : int
            The index of the cluster.

        Returns
        -------
        row_ind : ndarray, dtype=np.intp
            Indices of rows in the dataset that belong to the bicluster.
        col_ind : ndarray, dtype=np.intp
            Indices of columns in the dataset that belong to the bicluster.
        """
        rows = self.rows_[i]
        columns = self.columns_[i]
        return np.nonzero(rows)[0], np.nonzero(columns)[0]

    def get_shape(self, i):
        """Shape of the `i`'th bicluster.

        Parameters
        ----------
        i : int
            The index of the cluster.

        Returns
        -------
        n_rows : int
            Number of rows in the bicluster.

        n_cols : int
            Number of columns in the bicluster.
        """
        indices = self.get_indices(i)
        return tuple(len(i) for i in indices)

    def get_submatrix(self, i, data):
        """Return the submatrix corresponding to bicluster `i`.

        Parameters
        ----------
        i : int
            The index of the cluster.
        data : array-like of shape (n_samples, n_features)
            The data.

        Returns
        -------
        submatrix : ndarray of shape (n_rows, n_cols)
            The submatrix corresponding to bicluster `i`.

        Notes
        -----
        Works with sparse matrices. Only works if ``rows_`` and
        ``columns_`` attributes exist.
        """
        from .utils.validation import check_array

        data = check_array(data, accept_sparse="csr")
        row_ind, col_ind = self.get_indices(i)
        return data[row_ind[:, np.newaxis], col_ind]


class TransformerMixin(_SetOutputMixin):
    """Mixin class for all transformers in scikit-learn.

    If :term:`get_feature_names_out` is defined, then :class:`BaseEstimator` will
    automatically wrap `transform` and `fit_transform` to follow the `set_output`
    API. See the :ref:`developer_api_set_output` for details.

    :class:`OneToOneFeatureMixin` and
    :class:`ClassNamePrefixFeaturesOutMixin` are helpful mixins for
    defining :term:`get_feature_names_out`.
    """

    def fit_transform(self, X, y=None, **fit_params):
        """
        Fit to data, then transform it.

        Fits transformer to `X` and `y` with optional parameters `fit_params`
        and returns a transformed version of `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs), \
                default=None
            Target values (None for unsupervised transformations).

        **fit_params : dict
            Additional fit parameters.

        Returns
        -------
        X_new : ndarray of shape (n_samples, n_features_new)
            Transformed array.
        """
        # non-optimized default implementation; override when a better
        # method is possible for a given transformer

        # we do not route parameters here, since consumers don't route. But
        # since it's possible for a `transform` method to also consume
        # metadata, we check if that's the case, and we raise a warning telling
        # users that they should implement a custom `fit_transform` method
        # to forward metadata to `transform` as well.
        #
        # For that, we calculate routing and check if anything would be routed
        # to `transform` if we were to route them.
        if _routing_enabled():
            transform_params = self.get_metadata_routing().consumes(
                method="transform", params=fit_params.keys()
            )
            if transform_params:
                warnings.warn(
                    (
                        f"This object ({self.__class__.__name__}) has a `transform`"
                        " method which consumes metadata, but `fit_transform` does not"
                        " forward metadata to `transform`. Please implement a custom"
                        " `fit_transform` method to forward metadata to `transform` as"
                        " well. Alternatively, you can explicitly do"
                        " `set_transform_request` and set all values to `False` to"
                        " disable metadata routed to `transform`, if that's an option."
                    ),
                    UserWarning,
                )

        if y is None:
            # fit method of arity 1 (unsupervised transformation)
            return self.fit(X, **fit_params).transform(X)
        else:
            # fit method of arity 2 (supervised transformation)
            return self.fit(X, y, **fit_params).transform(X)
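

# A minimal custom transformer sketch (entirely hypothetical): implementing
# `fit` and `transform` is enough for the mixin to provide `fit_transform`.
#
#   >>> import numpy as np
#   >>> class MeanCenterer(TransformerMixin, BaseEstimator):
#   ...     def fit(self, X, y=None):
#   ...         self.mean_ = np.asarray(X).mean(axis=0)
#   ...         return self
#   ...     def transform(self, X):
#   ...         return np.asarray(X) - self.mean_
#   >>> MeanCenterer().fit_transform([[1.0], [3.0]])
#   array([[-1.],
#          [ 1.]])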


class OneToOneFeatureMixin:
    """Provides `get_feature_names_out` for simple transformers.

    This mixin assumes there's a 1-to-1 correspondence between input features
    and output features, such as :class:`~sklearn.preprocessing.StandardScaler`.
    """

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Same as input features.
        """
        check_is_fitted(self, "n_features_in_")
        return _check_feature_names_in(self, input_features)


class ClassNamePrefixFeaturesOutMixin:
    """Mixin class for transformers that generate their own names by prefixing.

    This mixin is useful when the transformer needs to generate its own feature
    names out, such as :class:`~sklearn.decomposition.PCA`. For example, if
    :class:`~sklearn.decomposition.PCA` outputs 3 features, then the generated
    feature names out are: `["pca0", "pca1", "pca2"]`.

    This mixin assumes that a `_n_features_out` attribute is defined when the
    transformer is fitted. `_n_features_out` is the number of output features
    that the transformer will return in `transform` or `fit_transform`.
    """

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        The feature names out will be prefixed by the lowercased class name.
        For example, if the transformer outputs 3 features, then the feature
        names out are: `["class_name0", "class_name1", "class_name2"]`.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Only used to validate feature names with the names seen in `fit`.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self, "_n_features_out")
        return _generate_get_feature_names_out(
            self, self._n_features_out, input_features=input_features
        )
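

# An illustrative sketch (using `PCA`, which relies on this mixin): the
# generated names are the lowercased class name plus an index.
#
#   >>> import numpy as np
#   >>> from sklearn.decomposition import PCA
#   >>> X = np.random.RandomState(0).rand(10, 4)
#   >>> PCA(n_components=2).fit(X).get_feature_names_out()
#   array(['pca0', 'pca1'], dtype=object)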


class DensityMixin:
    """Mixin class for all density estimators in scikit-learn."""

    _estimator_type = "DensityEstimator"

    def score(self, X, y=None):
        """Return the score of the model on the data `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        score : float
            Score of the model.
        """
        pass


class OutlierMixin:
    """Mixin class for all outlier detection estimators in scikit-learn."""

    _estimator_type = "outlier_detector"

    def fit_predict(self, X, y=None, **kwargs):
        """Perform fit on X and return labels for X.

        Returns -1 for outliers and 1 for inliers.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        **kwargs : dict
            Arguments to be passed to ``fit``.

            .. versionadded:: 1.4

        Returns
        -------
        y : ndarray of shape (n_samples,)
            1 for inliers, -1 for outliers.
        """
        # we do not route parameters here, since consumers don't route. But
        # since it's possible for a `predict` method to also consume
        # metadata, we check if that's the case, and we raise a warning telling
        # users that they should implement a custom `fit_predict` method
        # to forward metadata to `predict` as well.
        #
        # For that, we calculate routing and check if anything would be routed
        # to `predict` if we were to route them.
        if _routing_enabled():
            transform_params = self.get_metadata_routing().consumes(
                method="predict", params=kwargs.keys()
            )
            if transform_params:
                warnings.warn(
                    (
                        f"This object ({self.__class__.__name__}) has a `predict` "
                        "method which consumes metadata, but `fit_predict` does not "
                        "forward metadata to `predict`. Please implement a custom "
                        "`fit_predict` method to forward metadata to `predict` as "
                        "well. Alternatively, you can explicitly do "
                        "`set_predict_request` and set all values to `False` to "
                        "disable metadata routed to `predict`, if that's an option."
                    ),
                    UserWarning,
                )

        # override for transductive outlier detectors like LocalOutlierFactor
        return self.fit(X, **kwargs).predict(X)


class MetaEstimatorMixin:
    """Mixin class for all meta estimators in scikit-learn."""

    _required_parameters = ["estimator"]


class MultiOutputMixin:
    """Mixin to mark estimators that support multioutput."""

    def _more_tags(self):
        return {"multioutput": True}


class _UnstableArchMixin:
    """Mark estimators that are non-deterministic on 32bit or PowerPC."""

    def _more_tags(self):
        return {
            "non_deterministic": _IS_32BIT or platform.machine().startswith(
                ("ppc", "powerpc")
            )
        }


def is_classifier(estimator):
    """Return True if the given estimator is (probably) a classifier.

    Parameters
    ----------
    estimator : object
        Estimator object to test.

    Returns
    -------
    out : bool
        True if estimator is a classifier and False otherwise.
    """
    return getattr(estimator, "_estimator_type", None) == "classifier"


def is_regressor(estimator):
    """Return True if the given estimator is (probably) a regressor.

    Parameters
    ----------
    estimator : estimator instance
        Estimator object to test.

    Returns
    -------
    out : bool
        True if estimator is a regressor and False otherwise.
    """
    return getattr(estimator, "_estimator_type", None) == "regressor"


def is_outlier_detector(estimator):
    """Return True if the given estimator is (probably) an outlier detector.

    Parameters
    ----------
    estimator : estimator instance
        Estimator object to test.

    Returns
    -------
    out : bool
        True if estimator is an outlier detector and False otherwise.
    """
    return getattr(estimator, "_estimator_type", None) == "outlier_detector"
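

# These helpers just inspect the `_estimator_type` attribute set by the
# mixins above. A quick illustrative check (`SVC` and `SVR` are only
# examples):
#
#   >>> from sklearn.svm import SVC, SVR
#   >>> is_classifier(SVC()), is_regressor(SVR())
#   (True, True)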


def _fit_context(*, prefer_skip_nested_validation):
    """Decorator to run the fit methods of estimators within context managers.

    Parameters
    ----------
    prefer_skip_nested_validation : bool
        If True, the validation of parameters of inner estimators or functions
        called during fit will be skipped.

        This is useful to avoid validating the parameters passed by the user
        from the public facing API many times over. It's also useful to avoid
        validating parameters that we pass internally to inner functions that
        are guaranteed to be valid by the test suite.

        It should be set to True for most estimators, except for those that
        receive non-validated objects as parameters, such as meta-estimators
        that are given estimator objects.

    Returns
    -------
    decorated_fit : method
        The decorated fit method.
    """

    def decorator(fit_method):
        @functools.wraps(fit_method)
        def wrapper(estimator, *args, **kwargs):
            global_skip_validation = get_config()["skip_parameter_validation"]

            # we don't want to validate again for each call to partial_fit
            partial_fit_and_fitted = (
                fit_method.__name__ == "partial_fit" and _is_fitted(estimator)
            )

            if not global_skip_validation and not partial_fit_and_fitted:
                estimator._validate_params()

            with config_context(
                skip_parameter_validation=(
                    prefer_skip_nested_validation or global_skip_validation
                )
            ):
                return fit_method(estimator, *args, **kwargs)

        return wrapper

    return decorator
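

# A sketch of how the decorator is applied (the estimator is hypothetical;
# `Interval` and `Real` come from `sklearn.utils._param_validation` and
# `numbers` as in the `_parameter_constraints` sketch above):
# `_validate_params` runs against `_parameter_constraints` before the
# decorated `fit` body executes, and nested validation is skipped inside it.
#
#   class MyEstimator(BaseEstimator):
#       _parameter_constraints = {
#           "alpha": [Interval(Real, 0, None, closed="left")],
#       }
#
#       def __init__(self, alpha=1.0):
#           self.alpha = alpha
#
#       @_fit_context(prefer_skip_nested_validation=True)
#       def fit(self, X, y=None):
#           X = self._validate_data(X)
#           ...
#           return self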