1"""
2This file contains preprocessing tools based on polynomials.
3"""
4import collections
5from itertools import chain, combinations
6from itertools import combinations_with_replacement as combinations_w_r
7from numbers import Integral
8
9import numpy as np
10from scipy import sparse
11from scipy.interpolate import BSpline
12from scipy.special import comb
13
14from ..base import BaseEstimator, TransformerMixin, _fit_context
15from ..utils import check_array
16from ..utils._param_validation import Interval, StrOptions
17from ..utils.fixes import parse_version, sp_version
18from ..utils.stats import _weighted_percentile
19from ..utils.validation import (
20 FLOAT_DTYPES,
21 _check_feature_names_in,
22 _check_sample_weight,
23 check_is_fitted,
24)
25from ._csr_polynomial_expansion import (
26 _calc_expanded_nnz,
27 _calc_total_nnz,
28 _csr_polynomial_expansion,
29)
30
31__all__ = [
32 "PolynomialFeatures",
33 "SplineTransformer",
34]
35
36
37def _create_expansion(X, interaction_only, deg, n_features, cumulative_size=0):
38 """Helper function for creating and appending sparse expansion matrices"""
39
40 total_nnz = _calc_total_nnz(X.indptr, interaction_only, deg)
41 expanded_col = _calc_expanded_nnz(n_features, interaction_only, deg)
42
43 if expanded_col == 0:
44 return None
    # This only checks whether each block needs 64-bit integers upon
    # expansion. We prefer to keep int32 indexing where we can, since
    # SciPy's CSR construction currently downcasts when possible, and we
    # want to avoid an unnecessary cast. The dtype may still change in
    # the concatenation process if needed.
    # See: https://github.com/scipy/scipy/issues/16569
    max_indices = expanded_col - 1
    max_indptr = total_nnz
    max_int32 = np.iinfo(np.int32).max
    needs_int64 = max(max_indices, max_indptr) > max_int32
    index_dtype = np.int64 if needs_int64 else np.int32
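    # For instance (illustration): if `total_nnz` exceeds 2**31 - 1
    # (`max_int32`), the indptr values no longer fit into int32, so int64
    # indexing is selected for this block.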

    # This is a pretty specific bug that is hard for a user to work around,
    # hence we do not detail the entire bug and all possible avoidance
    # mechanisms. Instead we recommend upgrading scipy or shrinking their data.
    cumulative_size += expanded_col
    if (
        sp_version < parse_version("1.8.0")
        and cumulative_size - 1 > max_int32
        and not needs_int64
    ):
        raise ValueError(
            "In scipy versions `<1.8.0`, the function `scipy.sparse.hstack`"
            " sometimes produces negative columns when the output shape contains"
            " `n_cols` too large to be represented by a 32bit signed"
            " integer. To avoid this error, either use a version"
            " of scipy `>=1.8.0` or alter the `PolynomialFeatures`"
            " transformer to produce fewer than 2^31 output features."
        )

    # Result of the expansion, modified in place by the
    # `_csr_polynomial_expansion` routine.
    expanded_data = np.empty(shape=total_nnz, dtype=X.data.dtype)
    expanded_indices = np.empty(shape=total_nnz, dtype=index_dtype)
    expanded_indptr = np.empty(shape=X.indptr.shape[0], dtype=index_dtype)
    _csr_polynomial_expansion(
        X.data,
        X.indices,
        X.indptr,
        X.shape[1],
        expanded_data,
        expanded_indices,
        expanded_indptr,
        interaction_only,
        deg,
    )
    return sparse.csr_matrix(
        (expanded_data, expanded_indices, expanded_indptr),
        shape=(X.indptr.shape[0] - 1, expanded_col),
        dtype=X.dtype,
    )


class PolynomialFeatures(TransformerMixin, BaseEstimator):
    """Generate polynomial and interaction features.

    Generate a new feature matrix consisting of all polynomial combinations
    of the features with degree less than or equal to the specified degree.
    For example, if an input sample is two dimensional and of the form
    [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2].

    Read more in the :ref:`User Guide <polynomial_features>`.

    Parameters
    ----------
    degree : int or tuple (min_degree, max_degree), default=2
        If a single int is given, it specifies the maximal degree of the
        polynomial features. If a tuple `(min_degree, max_degree)` is passed,
        then `min_degree` is the minimum and `max_degree` is the maximum
        polynomial degree of the generated features. Note that `min_degree=0`
        and `min_degree=1` are equivalent as outputting the degree zero term is
        determined by `include_bias`.

    interaction_only : bool, default=False
        If `True`, only interaction features are produced: features that are
        products of at most `degree` *distinct* input features, i.e. terms with
        power of 2 or higher of the same input feature are excluded:

        - included: `x[0]`, `x[1]`, `x[0] * x[1]`, etc.
        - excluded: `x[0] ** 2`, `x[0] ** 2 * x[1]`, etc.

    include_bias : bool, default=True
        If `True` (default), then include a bias column, the feature in which
        all polynomial powers are zero (i.e. a column of ones - acts as an
        intercept term in a linear model).

    order : {'C', 'F'}, default='C'
        Order of output array in the dense case. `'F'` order is faster to
        compute, but may slow down subsequent estimators.

        .. versionadded:: 0.21

    Attributes
    ----------
    powers_ : ndarray of shape (`n_output_features_`, `n_features_in_`)
        `powers_[i, j]` is the exponent of the jth input in the ith output.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_output_features_ : int
        The total number of polynomial output features. The number of output
        features is computed by iterating over all suitably sized combinations
        of input features.

    See Also
    --------
    SplineTransformer : Transformer that generates univariate B-spline bases
        for features.

    Notes
    -----
    Be aware that the number of features in the output array scales
    polynomially in the number of features of the input array, and
    exponentially in the degree. High degrees can cause overfitting.

    See :ref:`examples/linear_model/plot_polynomial_interpolation.py
    <sphx_glr_auto_examples_linear_model_plot_polynomial_interpolation.py>`

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.preprocessing import PolynomialFeatures
    >>> X = np.arange(6).reshape(3, 2)
    >>> X
    array([[0, 1],
           [2, 3],
           [4, 5]])
    >>> poly = PolynomialFeatures(2)
    >>> poly.fit_transform(X)
    array([[ 1.,  0.,  1.,  0.,  0.,  1.],
           [ 1.,  2.,  3.,  4.,  6.,  9.],
           [ 1.,  4.,  5., 16., 20., 25.]])
    >>> poly = PolynomialFeatures(interaction_only=True)
    >>> poly.fit_transform(X)
    array([[ 1.,  0.,  1.,  0.],
           [ 1.,  2.,  3.,  6.],
           [ 1.,  4.,  5., 20.]])
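
    A tuple `degree=(min_degree, max_degree)` keeps only the terms within the
    given degree range; a small sketch on the same `X`:

    >>> poly = PolynomialFeatures(degree=(2, 2), include_bias=False)
    >>> poly.fit_transform(X)
    array([[ 0.,  0.,  1.],
           [ 4.,  6.,  9.],
           [16., 20., 25.]])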
191 """
192
193 _parameter_constraints: dict = {
194 "degree": [Interval(Integral, 0, None, closed="left"), "array-like"],
195 "interaction_only": ["boolean"],
196 "include_bias": ["boolean"],
197 "order": [StrOptions({"C", "F"})],
198 }
199
200 def __init__(
201 self, degree=2, *, interaction_only=False, include_bias=True, order="C"
202 ):
203 self.degree = degree
204 self.interaction_only = interaction_only
205 self.include_bias = include_bias
206 self.order = order
207
208 @staticmethod
209 def _combinations(
210 n_features, min_degree, max_degree, interaction_only, include_bias
211 ):
212 comb = combinations if interaction_only else combinations_w_r
213 start = max(1, min_degree)
214 iter = chain.from_iterable(
215 comb(range(n_features), i) for i in range(start, max_degree + 1)
216 )
217 if include_bias:
218 iter = chain(comb(range(n_features), 0), iter)
219 return iter
220
221 @staticmethod
222 def _num_combinations(
223 n_features, min_degree, max_degree, interaction_only, include_bias
224 ):
225 """Calculate number of terms in polynomial expansion
226
227 This should be equivalent to counting the number of terms returned by
228 _combinations(...) but much faster.
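
        For example, with `n_features=2`, `max_degree=2`,
        `interaction_only=False` and `include_bias=True`, this evaluates to
        `comb(2 + 2, 2, exact=True) - 1 + 1 = 6` terms:
        `1, a, b, a^2, ab, b^2`.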
229 """
230
231 if interaction_only:
232 combinations = sum(
233 [
234 comb(n_features, i, exact=True)
235 for i in range(max(1, min_degree), min(max_degree, n_features) + 1)
236 ]
237 )
238 else:
239 combinations = comb(n_features + max_degree, max_degree, exact=True) - 1
240 if min_degree > 0:
241 d = min_degree - 1
242 combinations -= comb(n_features + d, d, exact=True) - 1
243
244 if include_bias:
245 combinations += 1
246
247 return combinations

    @property
    def powers_(self):
        """Exponent for each of the inputs in the output."""
        check_is_fitted(self)

        combinations = self._combinations(
            n_features=self.n_features_in_,
            min_degree=self._min_degree,
            max_degree=self._max_degree,
            interaction_only=self.interaction_only,
            include_bias=self.include_bias,
        )
        return np.vstack(
            [np.bincount(c, minlength=self.n_features_in_) for c in combinations]
        )

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features is None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
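
        Examples
        --------
        A small sketch of the naming scheme for a degree-2 expansion of two
        features:

        >>> import numpy as np
        >>> from sklearn.preprocessing import PolynomialFeatures
        >>> X = np.arange(6).reshape(3, 2)
        >>> poly = PolynomialFeatures(degree=2).fit(X)
        >>> poly.get_feature_names_out()
        array(['1', 'x0', 'x1', 'x0^2', 'x0 x1', 'x1^2'], dtype=object)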
284 """
285 powers = self.powers_
286 input_features = _check_feature_names_in(self, input_features)
287 feature_names = []
288 for row in powers:
289 inds = np.where(row)[0]
290 if len(inds):
291 name = " ".join(
292 (
293 "%s^%d" % (input_features[ind], exp)
294 if exp != 1
295 else input_features[ind]
296 )
297 for ind, exp in zip(inds, row[inds])
298 )
299 else:
300 name = "1"
301 feature_names.append(name)
302 return np.asarray(feature_names, dtype=object)

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """
        Compute number of output features.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            Fitted transformer.
        """
        _, n_features = self._validate_data(X, accept_sparse=True).shape

        if isinstance(self.degree, Integral):
            if self.degree == 0 and not self.include_bias:
                raise ValueError(
                    "Setting degree to zero and include_bias to False would result in"
                    " an empty output array."
                )

            self._min_degree = 0
            self._max_degree = self.degree
        elif (
            isinstance(self.degree, collections.abc.Iterable) and len(self.degree) == 2
        ):
            self._min_degree, self._max_degree = self.degree
            if not (
                isinstance(self._min_degree, Integral)
                and isinstance(self._max_degree, Integral)
                and self._min_degree >= 0
                and self._min_degree <= self._max_degree
            ):
                raise ValueError(
                    "degree=(min_degree, max_degree) must "
                    "be non-negative integers that fulfil "
                    "min_degree <= max_degree, got "
                    f"{self.degree}."
                )
            elif self._max_degree == 0 and not self.include_bias:
                raise ValueError(
                    "Setting both min_degree and max_degree to zero and include_bias to"
                    " False would result in an empty output array."
                )
        else:
            raise ValueError(
                "degree must be a non-negative int or tuple "
                "(min_degree, max_degree), got "
                f"{self.degree}."
            )

        self.n_output_features_ = self._num_combinations(
            n_features=n_features,
            min_degree=self._min_degree,
            max_degree=self._max_degree,
            interaction_only=self.interaction_only,
            include_bias=self.include_bias,
        )
        if self.n_output_features_ > np.iinfo(np.intp).max:
            msg = (
                "The output that would result from the current configuration would"
                f" have {self.n_output_features_} features which is too large to be"
                f" indexed by {np.intp().dtype.name}. Please change some or all of the"
                " following:\n- The number of features in the input, currently"
                f" {n_features=}\n- The range of degrees to calculate, currently"
                f" [{self._min_degree}, {self._max_degree}]\n- Whether to include only"
                f" interaction terms, currently {self.interaction_only}\n- Whether to"
                f" include a bias term, currently {self.include_bias}."
            )
            if (
                np.intp == np.int32
                and self.n_output_features_ <= np.iinfo(np.int64).max
            ):  # pragma: nocover
                msg += (
                    "\nNote that the current Python runtime has a limited 32 bit "
                    "address space and that this configuration would have been "
                    "admissible if run on a 64 bit Python runtime."
                )
            raise ValueError(msg)
        # We also record the number of output features for
        # _min_degree = 0
        self._n_out_full = self._num_combinations(
            n_features=n_features,
            min_degree=0,
            max_degree=self._max_degree,
            interaction_only=self.interaction_only,
            include_bias=self.include_bias,
        )

        return self

    def transform(self, X):
        """Transform data to polynomial features.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data to transform, row by row.

            Prefer CSR over CSC for sparse input (for speed), but CSC is
            required if the degree is 4 or higher. If the degree is less than
            4 and the input format is CSC, it will be converted to CSR, have
            its polynomial features generated, then converted back to CSC.

            If the degree is 2 or 3, the method described in "Leveraging
            Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices
            Using K-Simplex Numbers" by Andrew Nystrom and John Hughes is
            used, which is much faster than the method used on CSC input. For
            this reason, a CSC input will be converted to CSR, and the output
            will be converted back to CSC prior to being returned, hence the
            preference of CSR.

        Returns
        -------
        XP : {ndarray, sparse matrix} of shape (n_samples, NP)
            The matrix of features, where `NP` is the number of polynomial
            features generated from the combination of inputs. If a sparse
            matrix is provided, it will be converted into a sparse
            `csr_matrix`.
        """
        check_is_fitted(self)

        X = self._validate_data(
            X, order="F", dtype=FLOAT_DTYPES, reset=False, accept_sparse=("csr", "csc")
        )

        n_samples, n_features = X.shape
        max_int32 = np.iinfo(np.int32).max
        if sparse.issparse(X) and X.format == "csr":
            if self._max_degree > 3:
                return self.transform(X.tocsc()).tocsr()
            to_stack = []
            if self.include_bias:
                to_stack.append(
                    sparse.csr_matrix(np.ones(shape=(n_samples, 1), dtype=X.dtype))
                )
            if self._min_degree <= 1 and self._max_degree > 0:
                to_stack.append(X)

            cumulative_size = sum(mat.shape[1] for mat in to_stack)
            for deg in range(max(2, self._min_degree), self._max_degree + 1):
                expanded = _create_expansion(
                    X=X,
                    interaction_only=self.interaction_only,
                    deg=deg,
                    n_features=n_features,
                    cumulative_size=cumulative_size,
                )
                if expanded is not None:
                    to_stack.append(expanded)
                    cumulative_size += expanded.shape[1]
            if len(to_stack) == 0:
                # edge case: deal with empty matrix
                XP = sparse.csr_matrix((n_samples, 0), dtype=X.dtype)
            else:
                # `scipy.sparse.hstack` breaks in scipy<1.9.2
                # when `n_output_features_ > max_int32`
                all_int32 = all(mat.indices.dtype == np.int32 for mat in to_stack)
                if (
                    sp_version < parse_version("1.9.2")
                    and self.n_output_features_ > max_int32
                    and all_int32
                ):
                    raise ValueError(  # pragma: no cover
                        "In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`"
                        " produces negative columns when:\n1. The output shape contains"
                        " `n_cols` too large to be represented by a 32bit signed"
                        " integer.\n2. All sub-matrices to be stacked have indices of"
                        " dtype `np.int32`.\nTo avoid this error, either use a version"
                        " of scipy `>=1.9.2` or alter the `PolynomialFeatures`"
                        " transformer to produce fewer than 2^31 output features"
                    )
                XP = sparse.hstack(to_stack, dtype=X.dtype, format="csr")
        elif sparse.issparse(X) and X.format == "csc" and self._max_degree < 4:
            return self.transform(X.tocsr()).tocsc()
        elif sparse.issparse(X):
            combinations = self._combinations(
                n_features=n_features,
                min_degree=self._min_degree,
                max_degree=self._max_degree,
                interaction_only=self.interaction_only,
                include_bias=self.include_bias,
            )
            columns = []
            for combi in combinations:
                if combi:
                    out_col = 1
                    for col_idx in combi:
                        out_col = X[:, [col_idx]].multiply(out_col)
                    columns.append(out_col)
                else:
                    bias = sparse.csc_matrix(np.ones((X.shape[0], 1)))
                    columns.append(bias)
            XP = sparse.hstack(columns, dtype=X.dtype).tocsc()
        else:
            # Do as if _min_degree = 0 and cut down array after the
            # computation, i.e. use _n_out_full instead of n_output_features_.
            XP = np.empty(
                shape=(n_samples, self._n_out_full), dtype=X.dtype, order=self.order
            )

            # What follows is a faster implementation of:
            # for i, comb in enumerate(combinations):
            #     XP[:, i] = X[:, comb].prod(1)
            # This implementation uses two optimisations.
            # First one is broadcasting,
            # multiply ([X1, ..., Xn], X1) -> [X1 X1, ..., Xn X1]
            # multiply ([X2, ..., Xn], X2) -> [X2 X2, ..., Xn X2]
            # ...
            # multiply (X[:, start:end], X[:, start]) -> ...
            # Second optimisation happens for degrees >= 3.
            # Xi^3 is computed reusing previous computation:
            # Xi^3 = Xi^2 * Xi.
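            # For example (illustration): with n_features=2 and degree 2, the
            # degree-1 block [X0, X1] is multiplied by X0 to produce
            # [X0^2, X0 X1], then by X1 (starting one column later, so X0 X1
            # is not recomputed) to produce [X1^2].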

            # degree 0 term
            if self.include_bias:
                XP[:, 0] = 1
                current_col = 1
            else:
                current_col = 0

            if self._max_degree == 0:
                return XP

            # degree 1 term
            XP[:, current_col : current_col + n_features] = X
            index = list(range(current_col, current_col + n_features))
            current_col += n_features
            index.append(current_col)

            # loop over degree >= 2 terms
            for _ in range(2, self._max_degree + 1):
                new_index = []
                end = index[-1]
                for feature_idx in range(n_features):
                    start = index[feature_idx]
                    new_index.append(current_col)
                    if self.interaction_only:
                        start += index[feature_idx + 1] - index[feature_idx]
                    next_col = current_col + end - start
                    if next_col <= current_col:
                        break
                    # XP[:, start:end] are terms of degree d - 1
                    # that exclude feature #feature_idx.
                    np.multiply(
                        XP[:, start:end],
                        X[:, feature_idx : feature_idx + 1],
                        out=XP[:, current_col:next_col],
                        casting="no",
                    )
                    current_col = next_col

                new_index.append(current_col)
                index = new_index

            if self._min_degree > 1:
                n_XP, n_Xout = self._n_out_full, self.n_output_features_
                if self.include_bias:
                    Xout = np.empty(
                        shape=(n_samples, n_Xout), dtype=XP.dtype, order=self.order
                    )
                    Xout[:, 0] = 1
                    Xout[:, 1:] = XP[:, n_XP - n_Xout + 1 :]
                else:
                    Xout = XP[:, n_XP - n_Xout :].copy()
                XP = Xout
        return XP


class SplineTransformer(TransformerMixin, BaseEstimator):
    """Generate univariate B-spline bases for features.

    Generate a new feature matrix consisting of
    `n_splines=n_knots + degree - 1` (`n_knots - 1` for
    `extrapolation="periodic"`) spline basis functions
    (B-splines) of polynomial order=`degree` for each feature.

    To learn more about the `SplineTransformer` class, see
    :ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py`.

    Read more in the :ref:`User Guide <spline_transformer>`.

    .. versionadded:: 1.0

    Parameters
    ----------
    n_knots : int, default=5
        Number of knots of the splines if `knots` equals one of
        {'uniform', 'quantile'}. Must be larger or equal to 2. Ignored if
        `knots` is array-like.

    degree : int, default=3
        The polynomial degree of the spline basis. Must be a non-negative
        integer.

    knots : {'uniform', 'quantile'} or array-like of shape \
        (n_knots, n_features), default='uniform'
        Set knot positions such that first knot <= features <= last knot.

        - If 'uniform', `n_knots` number of knots are distributed uniformly
          from min to max values of the features.
        - If 'quantile', they are distributed uniformly along the quantiles of
          the features.
        - If an array-like is given, it directly specifies the sorted knot
          positions including the boundary knots. Note that, internally,
          `degree` number of knots are added before the first knot, the same
          after the last knot.

    extrapolation : {'error', 'constant', 'linear', 'continue', 'periodic'}, \
        default='constant'
        If 'error', values outside the min and max values of the training
        features raise a `ValueError`. If 'constant', the value of the
        splines at minimum and maximum value of the features is used as
        constant extrapolation. If 'linear', a linear extrapolation is used.
        If 'continue', the splines are extrapolated as is, i.e. option
        `extrapolate=True` in :class:`scipy.interpolate.BSpline`. If
        'periodic', periodic splines with a periodicity equal to the distance
        between the first and last knot are used. Periodic splines enforce
        equal function values and derivatives at the first and last knot.
        For example, this makes it possible to avoid introducing an arbitrary
        jump between Dec 31st and Jan 1st in spline features derived from a
        naturally periodic "day-of-year" input feature. In this case it is
        recommended to manually set the knot values to control the period.

    include_bias : bool, default=True
        If False, then the last spline element inside the data range
        of a feature is dropped. As B-splines sum to one over the spline basis
        functions for each data point, they implicitly include a bias term,
        i.e. a column of ones. It acts as an intercept term in a linear model.

    order : {'C', 'F'}, default='C'
        Order of output array in the dense case. `'F'` order is faster to compute, but
        may slow down subsequent estimators.

    sparse_output : bool, default=False
        Will return sparse CSR matrix if set True else will return an array. This
        option is only available with `scipy>=1.8`.

        .. versionadded:: 1.2

    Attributes
    ----------
    bsplines_ : list of shape (n_features,)
        List of BSpline objects, one for each feature.

    n_features_in_ : int
        The total number of input features.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_features_out_ : int
        The total number of output features, which is computed as
        `n_features * n_splines`, where `n_splines` is
        the number of basis elements of the B-splines,
        `n_knots + degree - 1` for non-periodic splines and
        `n_knots - 1` for periodic ones.
        If `include_bias=False`, then it is only
        `n_features * (n_splines - 1)`.

    See Also
    --------
    KBinsDiscretizer : Transformer that bins continuous data into intervals.

    PolynomialFeatures : Transformer that generates polynomial and interaction
        features.

    Notes
    -----
    High degrees and a high number of knots can cause overfitting.

    See :ref:`examples/linear_model/plot_polynomial_interpolation.py
    <sphx_glr_auto_examples_linear_model_plot_polynomial_interpolation.py>`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.preprocessing import SplineTransformer
    >>> X = np.arange(6).reshape(6, 1)
    >>> spline = SplineTransformer(degree=2, n_knots=3)
    >>> spline.fit_transform(X)
    array([[0.5 , 0.5 , 0.  , 0.  ],
           [0.18, 0.74, 0.08, 0.  ],
           [0.02, 0.66, 0.32, 0.  ],
           [0.  , 0.32, 0.66, 0.02],
           [0.  , 0.08, 0.74, 0.18],
           [0.  , 0.  , 0.5 , 0.5 ]])
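
    With `extrapolation="periodic"`, the basis wraps around and there are only
    `n_knots - 1` basis elements per feature; a small sketch showing just the
    output shape:

    >>> spline = SplineTransformer(degree=2, n_knots=3, extrapolation="periodic")
    >>> spline.fit_transform(X).shape
    (6, 2)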
700 """
701
702 _parameter_constraints: dict = {
703 "n_knots": [Interval(Integral, 2, None, closed="left")],
704 "degree": [Interval(Integral, 0, None, closed="left")],
705 "knots": [StrOptions({"uniform", "quantile"}), "array-like"],
706 "extrapolation": [
707 StrOptions({"error", "constant", "linear", "continue", "periodic"})
708 ],
709 "include_bias": ["boolean"],
710 "order": [StrOptions({"C", "F"})],
711 "sparse_output": ["boolean"],
712 }
713
714 def __init__(
715 self,
716 n_knots=5,
717 degree=3,
718 *,
719 knots="uniform",
720 extrapolation="constant",
721 include_bias=True,
722 order="C",
723 sparse_output=False,
724 ):
725 self.n_knots = n_knots
726 self.degree = degree
727 self.knots = knots
728 self.extrapolation = extrapolation
729 self.include_bias = include_bias
730 self.order = order
731 self.sparse_output = sparse_output
732
733 @staticmethod
734 def _get_base_knot_positions(X, n_knots=10, knots="uniform", sample_weight=None):
735 """Calculate base knot positions.
736
737 Base knots such that first knot <= feature <= last knot. For the
738 B-spline construction with scipy.interpolate.BSpline, 2*degree knots
739 beyond the base interval are added.
740
741 Returns
742 -------
743 knots : ndarray of shape (n_knots, n_features), dtype=np.float64
744 Knot positions (points) of base interval.
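
        For example, with `knots="uniform"`, `n_knots=3` and a single feature
        ranging from 0 to 5, the base knots are `[0.0, 2.5, 5.0]`.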
745 """
746 if knots == "quantile":
747 percentiles = 100 * np.linspace(
748 start=0, stop=1, num=n_knots, dtype=np.float64
749 )
750
751 if sample_weight is None:
752 knots = np.percentile(X, percentiles, axis=0)
753 else:
754 knots = np.array(
755 [
756 _weighted_percentile(X, sample_weight, percentile)
757 for percentile in percentiles
758 ]
759 )
760
761 else:
762 # knots == 'uniform':
763 # Note that the variable `knots` has already been validated and
764 # `else` is therefore safe.
765 # Disregard observations with zero weight.
766 mask = slice(None, None, 1) if sample_weight is None else sample_weight > 0
767 x_min = np.amin(X[mask], axis=0)
768 x_max = np.amax(X[mask], axis=0)
769
770 knots = np.linspace(
771 start=x_min,
772 stop=x_max,
773 num=n_knots,
774 endpoint=True,
775 dtype=np.float64,
776 )
777
778 return knots

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
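
        Examples
        --------
        A brief sketch of the naming pattern for a single feature with four
        basis elements:

        >>> import numpy as np
        >>> from sklearn.preprocessing import SplineTransformer
        >>> X = np.arange(6).reshape(6, 1)
        >>> spline = SplineTransformer(degree=2, n_knots=3).fit(X)
        >>> spline.get_feature_names_out()
        array(['x0_sp_0', 'x0_sp_1', 'x0_sp_2', 'x0_sp_3'], dtype=object)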
799 """
800 check_is_fitted(self, "n_features_in_")
801 n_splines = self.bsplines_[0].c.shape[1]
802
803 input_features = _check_feature_names_in(self, input_features)
804 feature_names = []
805 for i in range(self.n_features_in_):
806 for j in range(n_splines - 1 + self.include_bias):
807 feature_names.append(f"{input_features[i]}_sp_{j}")
808 return np.asarray(feature_names, dtype=object)

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None, sample_weight=None):
        """Compute knot positions of splines.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data.

        y : None
            Ignored.

        sample_weight : array-like of shape (n_samples,), default=None
            Individual weights for each sample. Used to calculate quantiles if
            `knots="quantile"`. For `knots="uniform"`, zero weighted
            observations are ignored for finding the min and max of `X`.

        Returns
        -------
        self : object
            Fitted transformer.
        """
        X = self._validate_data(
            X,
            reset=True,
            accept_sparse=False,
            ensure_min_samples=2,
            ensure_2d=True,
        )
        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

        _, n_features = X.shape

        if isinstance(self.knots, str):
            base_knots = self._get_base_knot_positions(
                X, n_knots=self.n_knots, knots=self.knots, sample_weight=sample_weight
            )
        else:
            base_knots = check_array(self.knots, dtype=np.float64)
            if base_knots.shape[0] < 2:
                raise ValueError("Number of knots, knots.shape[0], must be >= 2.")
            elif base_knots.shape[1] != n_features:
                raise ValueError("knots.shape[1] == n_features is violated.")
            elif not np.all(np.diff(base_knots, axis=0) > 0):
                raise ValueError("knots must be sorted without duplicates.")

        if self.sparse_output and sp_version < parse_version("1.8.0"):
            raise ValueError(
                "Option sparse_output=True is only available with scipy>=1.8.0, "
                f"but here scipy=={sp_version} is used."
            )

        # number of knots for base interval
        n_knots = base_knots.shape[0]

        if self.extrapolation == "periodic" and n_knots <= self.degree:
            raise ValueError(
                "Periodic splines require degree < n_knots. Got n_knots="
                f"{n_knots} and degree={self.degree}."
            )

        # number of spline basis functions
        if self.extrapolation != "periodic":
            n_splines = n_knots + self.degree - 1
        else:
            # periodic splines have self.degree less degrees of freedom
            n_splines = n_knots - 1

        degree = self.degree
        n_out = n_features * n_splines
        # We have to add degree number of knots below, and degree number knots
        # above the base knots in order to make the spline basis complete.
        if self.extrapolation == "periodic":
            # For periodic splines the spacing of the first / last degree knots
            # needs to be a continuation of the spacing of the last / first
            # base knots.
            period = base_knots[-1] - base_knots[0]
            knots = np.r_[
                base_knots[-(degree + 1) : -1] - period,
                base_knots,
                base_knots[1 : (degree + 1)] + period,
            ]
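            # For example (illustration): degree=2 and a single feature with
            # base knots [0, 2.5, 5] (period 5) give
            # knots = [-5, -2.5, 0, 2.5, 5, 7.5, 10].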

        else:
            # Eilers & Marx in "Flexible smoothing with B-splines and
            # penalties" https://doi.org/10.1214/ss/1038425655 advise
            # against repeating first and last knot several times, which
            # would have inferior behaviour at boundaries if combined with
            # a penalty (hence P-Spline). We follow this advice even if our
            # splines are unpenalized. Meaning we do not:
            # knots = np.r_[
            #     np.tile(base_knots.min(axis=0), reps=[degree, 1]),
            #     base_knots,
            #     np.tile(base_knots.max(axis=0), reps=[degree, 1])
            # ]
            # Instead, we reuse the distance of the 2 first/last knots.
            dist_min = base_knots[1] - base_knots[0]
            dist_max = base_knots[-1] - base_knots[-2]

            knots = np.r_[
                np.linspace(
                    base_knots[0] - degree * dist_min,
                    base_knots[0] - dist_min,
                    num=degree,
                ),
                base_knots,
                np.linspace(
                    base_knots[-1] + dist_max,
                    base_knots[-1] + degree * dist_max,
                    num=degree,
                ),
            ]

        # With a diagonal coefficient matrix, we get back the spline basis
        # elements, i.e. the design matrix of the spline.
        # Note that BSpline expects a C-contiguous float64 array as c=coef.
        coef = np.eye(n_splines, dtype=np.float64)
        if self.extrapolation == "periodic":
            coef = np.concatenate((coef, coef[:degree, :]))
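            # For example (illustration): n_splines=2 and degree=2 give a
            # coefficient matrix of shape (4, 2); repeating the first `degree`
            # rows makes the last basis functions wrap around to the first.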

        extrapolate = self.extrapolation in ["periodic", "continue"]

        bsplines = [
            BSpline.construct_fast(
                knots[:, i], coef, self.degree, extrapolate=extrapolate
            )
            for i in range(n_features)
        ]
        self.bsplines_ = bsplines

        self.n_features_out_ = n_out - n_features * (1 - self.include_bias)
        return self

    def transform(self, X):
        """Transform each feature data to B-splines.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to transform.

        Returns
        -------
        XBS : {ndarray, sparse matrix} of shape (n_samples, n_features * n_splines)
            The matrix of features, where n_splines is the number of basis
            elements of the B-splines, n_knots + degree - 1.
        """
        check_is_fitted(self)

        X = self._validate_data(X, reset=False, accept_sparse=False, ensure_2d=True)

        n_samples, n_features = X.shape
        n_splines = self.bsplines_[0].c.shape[1]
        degree = self.degree

        # TODO: Remove this condition once scipy 1.10 is the minimum version.
        # Only scipy >= 1.10 supports design_matrix(.., extrapolate=..).
        # The default (implicit in scipy < 1.10) is extrapolate=False.
        scipy_1_10 = sp_version >= parse_version("1.10.0")
        # Note: self.bsplines_[0].extrapolate is True for extrapolation in
        # ["periodic", "continue"]
        if scipy_1_10:
            use_sparse = self.sparse_output
            kwargs_extrapolate = {"extrapolate": self.bsplines_[0].extrapolate}
        else:
            use_sparse = self.sparse_output and not self.bsplines_[0].extrapolate
            kwargs_extrapolate = dict()

        # Note that scipy BSpline returns float64 arrays and converts input
        # x=X[:, i] to c-contiguous float64.
        n_out = self.n_features_out_ + n_features * (1 - self.include_bias)
        if X.dtype in FLOAT_DTYPES:
            dtype = X.dtype
        else:
            dtype = np.float64
        if use_sparse:
            output_list = []
        else:
            XBS = np.zeros((n_samples, n_out), dtype=dtype, order=self.order)

        for i in range(n_features):
            spl = self.bsplines_[i]

            if self.extrapolation in ("continue", "error", "periodic"):
                if self.extrapolation == "periodic":
                    # With periodic extrapolation we map x to the segment
                    # [spl.t[k], spl.t[n]].
                    # This is equivalent to BSpline(.., extrapolate="periodic")
                    # for scipy>=1.0.0.
                    n = spl.t.size - spl.k - 1
                    # Assign to new array to avoid inplace operation
                    x = spl.t[spl.k] + (X[:, i] - spl.t[spl.k]) % (
                        spl.t[n] - spl.t[spl.k]
                    )
                else:
                    x = X[:, i]

                if use_sparse:
                    XBS_sparse = BSpline.design_matrix(
                        x, spl.t, spl.k, **kwargs_extrapolate
                    )
                    if self.extrapolation == "periodic":
                        # See the construction of coef in fit. We need to add the last
                        # degree spline basis function to the first degree ones and
                        # then drop the last ones.
                        # Note: See comment about SparseEfficiencyWarning below.
                        XBS_sparse = XBS_sparse.tolil()
                        XBS_sparse[:, :degree] += XBS_sparse[:, -degree:]
                        XBS_sparse = XBS_sparse[:, :-degree]
                else:
                    XBS[:, (i * n_splines) : ((i + 1) * n_splines)] = spl(x)
            else:  # extrapolation in ("constant", "linear")
                xmin, xmax = spl.t[degree], spl.t[-degree - 1]
                # spline values at boundaries
                f_min, f_max = spl(xmin), spl(xmax)
                mask = (xmin <= X[:, i]) & (X[:, i] <= xmax)
                if use_sparse:
                    mask_inv = ~mask
                    x = X[:, i].copy()
                    # Set some arbitrary values outside boundary that will be reassigned
                    # later.
                    x[mask_inv] = spl.t[self.degree]
                    XBS_sparse = BSpline.design_matrix(x, spl.t, spl.k)
                    # Note: Without converting to lil_matrix we would get:
                    # scipy.sparse._base.SparseEfficiencyWarning: Changing the sparsity
                    # structure of a csr_matrix is expensive. lil_matrix is more
                    # efficient.
                    if np.any(mask_inv):
                        XBS_sparse = XBS_sparse.tolil()
                        XBS_sparse[mask_inv, :] = 0
                else:
                    XBS[mask, (i * n_splines) : ((i + 1) * n_splines)] = spl(X[mask, i])

            # Note for extrapolation:
            # 'continue' is already returned as is by scipy BSplines
            if self.extrapolation == "error":
                # BSpline with extrapolate=False does not raise an error, but
                # outputs np.nan.
                if (use_sparse and np.any(np.isnan(XBS_sparse.data))) or (
                    not use_sparse
                    and np.any(
                        np.isnan(XBS[:, (i * n_splines) : ((i + 1) * n_splines)])
                    )
                ):
                    raise ValueError(
                        "X contains values beyond the limits of the knots."
                    )
            elif self.extrapolation == "constant":
                # Set all values beyond xmin and xmax to the value of the
                # spline basis functions at those two positions.
                # Only the first degree and last degree number of splines
                # have non-zero values at the boundaries.

                mask = X[:, i] < xmin
                if np.any(mask):
                    if use_sparse:
                        # Note: See comment about SparseEfficiencyWarning above.
                        XBS_sparse = XBS_sparse.tolil()
                        XBS_sparse[mask, :degree] = f_min[:degree]

                    else:
                        XBS[mask, (i * n_splines) : (i * n_splines + degree)] = f_min[
                            :degree
                        ]

                mask = X[:, i] > xmax
                if np.any(mask):
                    if use_sparse:
                        # Note: See comment about SparseEfficiencyWarning above.
                        XBS_sparse = XBS_sparse.tolil()
                        XBS_sparse[mask, -degree:] = f_max[-degree:]
                    else:
                        XBS[
                            mask,
                            ((i + 1) * n_splines - degree) : ((i + 1) * n_splines),
                        ] = f_max[-degree:]

            elif self.extrapolation == "linear":
                # Continue the degree first and degree last spline bases
                # linearly beyond the boundaries, with slope = derivative at
                # the boundary.
                # Note that all others have derivative = value = 0 at the
                # boundaries.

                # spline derivatives = slopes at boundaries
                fp_min, fp_max = spl(xmin, nu=1), spl(xmax, nu=1)
                # Compute the linear continuation.
                if degree <= 1:
                    # For degree=1, the derivative of 2nd spline is not zero at
                    # boundary. For degree=0 it is the same as 'constant'.
                    degree += 1
                for j in range(degree):
                    mask = X[:, i] < xmin
                    if np.any(mask):
                        linear_extr = f_min[j] + (X[mask, i] - xmin) * fp_min[j]
                        if use_sparse:
                            # Note: See comment about SparseEfficiencyWarning above.
                            XBS_sparse = XBS_sparse.tolil()
                            XBS_sparse[mask, j] = linear_extr
                        else:
                            XBS[mask, i * n_splines + j] = linear_extr

                    mask = X[:, i] > xmax
                    if np.any(mask):
                        k = n_splines - 1 - j
                        linear_extr = f_max[k] + (X[mask, i] - xmax) * fp_max[k]
                        if use_sparse:
                            # Note: See comment about SparseEfficiencyWarning above.
                            XBS_sparse = XBS_sparse.tolil()
                            XBS_sparse[mask, k : k + 1] = linear_extr[:, None]
                        else:
                            XBS[mask, i * n_splines + k] = linear_extr

            if use_sparse:
                XBS_sparse = XBS_sparse.tocsr()
                output_list.append(XBS_sparse)

        if use_sparse:
            # TODO: Remove this conditional error when the minimum supported version of
            # SciPy is 1.9.2
            # `scipy.sparse.hstack` breaks in scipy<1.9.2
            # when `n_features_out_ > max_int32`
            max_int32 = np.iinfo(np.int32).max
            all_int32 = True
            for mat in output_list:
                all_int32 &= mat.indices.dtype == np.int32
            if (
                sp_version < parse_version("1.9.2")
                and self.n_features_out_ > max_int32
                and all_int32
            ):
                raise ValueError(
                    "In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`"
                    " produces negative columns when:\n1. The output shape contains"
                    " `n_cols` too large to be represented by a 32bit signed"
                    " integer.\n2. All sub-matrices to be stacked have indices of"
                    " dtype `np.int32`.\nTo avoid this error, either use a version"
                    " of scipy `>=1.9.2` or alter the `SplineTransformer`"
                    " transformer to produce fewer than 2^31 output features"
                )
            XBS = sparse.hstack(output_list, format="csr")
        elif self.sparse_output:
            # TODO: Remove once scipy 1.10 is the minimum version. See comments above.
            XBS = sparse.csr_matrix(XBS)

        if self.include_bias:
            return XBS
        else:
            # We throw away one spline basis per feature.
            # We chose the last one.
            indices = [j for j in range(XBS.shape[1]) if (j + 1) % n_splines != 0]
            return XBS[:, indices]

    def _more_tags(self):
        return {
            "_xfail_checks": {
                "check_estimators_pickle": (
                    "Current Scipy implementation of _bsplines does not"
                    " support const memory views."
                ),
            }
        }