1""" 

2This file contains preprocessing tools based on polynomials. 

3""" 

4import collections 

5from itertools import chain, combinations 

6from itertools import combinations_with_replacement as combinations_w_r 

7from numbers import Integral 

8 

9import numpy as np 

10from scipy import sparse 

11from scipy.interpolate import BSpline 

12from scipy.special import comb 

13 

14from ..base import BaseEstimator, TransformerMixin, _fit_context 

15from ..utils import check_array 

16from ..utils._param_validation import Interval, StrOptions 

17from ..utils.fixes import parse_version, sp_version 

18from ..utils.stats import _weighted_percentile 

19from ..utils.validation import ( 

20 FLOAT_DTYPES, 

21 _check_feature_names_in, 

22 _check_sample_weight, 

23 check_is_fitted, 

24) 

25from ._csr_polynomial_expansion import ( 

26 _calc_expanded_nnz, 

27 _calc_total_nnz, 

28 _csr_polynomial_expansion, 

29) 

30 

31__all__ = [ 

32 "PolynomialFeatures", 

33 "SplineTransformer", 

34] 

35 

36 

def _create_expansion(X, interaction_only, deg, n_features, cumulative_size=0):
    """Helper function for creating and appending sparse expansion matrices"""

    total_nnz = _calc_total_nnz(X.indptr, interaction_only, deg)
    expanded_col = _calc_expanded_nnz(n_features, interaction_only, deg)

    if expanded_col == 0:
        return None
    # This only checks whether each block needs 64bit integers upon
    # expansion. We prefer to keep int32 indexing where we can,
    # since currently SciPy's CSR construction downcasts when possible,
    # so we prefer to avoid an unnecessary cast. The dtype may still
    # change in the concatenation process if needed.
    # See: https://github.com/scipy/scipy/issues/16569
    max_indices = expanded_col - 1
    max_indptr = total_nnz
    max_int32 = np.iinfo(np.int32).max
    needs_int64 = max(max_indices, max_indptr) > max_int32
    index_dtype = np.int64 if needs_int64 else np.int32

    # This is a pretty specific bug that is hard to work around by a user,
    # hence we do not detail the entire bug and all possible avoidance
    # mechanisms. Instead we recommend upgrading scipy or shrinking their data.
    cumulative_size += expanded_col
    if (
        sp_version < parse_version("1.8.0")
        and cumulative_size - 1 > max_int32
        and not needs_int64
    ):
        raise ValueError(
            "In scipy versions `<1.8.0`, the function `scipy.sparse.hstack`"
            " sometimes produces negative columns when the output shape contains"
            " `n_cols` too large to be represented by a 32bit signed"
            " integer. To avoid this error, either use a version"
            " of scipy `>=1.8.0` or alter the `PolynomialFeatures`"
            " transformer to produce fewer than 2^31 output features."
        )

    # Result of the expansion, modified in place by the
    # `_csr_polynomial_expansion` routine.
    expanded_data = np.empty(shape=total_nnz, dtype=X.data.dtype)
    expanded_indices = np.empty(shape=total_nnz, dtype=index_dtype)
    expanded_indptr = np.empty(shape=X.indptr.shape[0], dtype=index_dtype)
    _csr_polynomial_expansion(
        X.data,
        X.indices,
        X.indptr,
        X.shape[1],
        expanded_data,
        expanded_indices,
        expanded_indptr,
        interaction_only,
        deg,
    )
    return sparse.csr_matrix(
        (expanded_data, expanded_indices, expanded_indptr),
        shape=(X.indptr.shape[0] - 1, expanded_col),
        dtype=X.dtype,
    )
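
# For illustration (not part of the library): the column count returned by
# `_calc_expanded_nnz` for a single degree follows standard monomial counting.
# Assuming n_features=3 and deg=2, the degree-2 block has comb(3, 2) = 3
# columns with interaction_only=True (x0*x1, x0*x2, x1*x2) and
# comb(3 + 2 - 1, 2) = 6 columns otherwise (the three squares in addition).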

class PolynomialFeatures(TransformerMixin, BaseEstimator):
    """Generate polynomial and interaction features.

    Generate a new feature matrix consisting of all polynomial combinations
    of the features with degree less than or equal to the specified degree.
    For example, if an input sample is two dimensional and of the form
    [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2].

    Read more in the :ref:`User Guide <polynomial_features>`.

    Parameters
    ----------
    degree : int or tuple (min_degree, max_degree), default=2
        If a single int is given, it specifies the maximal degree of the
        polynomial features. If a tuple `(min_degree, max_degree)` is passed,
        then `min_degree` is the minimum and `max_degree` is the maximum
        polynomial degree of the generated features. Note that `min_degree=0`
        and `min_degree=1` are equivalent as outputting the degree zero term is
        determined by `include_bias`.

    interaction_only : bool, default=False
        If `True`, only interaction features are produced: features that are
        products of at most `degree` *distinct* input features, i.e. terms with
        power of 2 or higher of the same input feature are excluded:

        - included: `x[0]`, `x[1]`, `x[0] * x[1]`, etc.
        - excluded: `x[0] ** 2`, `x[0] ** 2 * x[1]`, etc.

    include_bias : bool, default=True
        If `True` (default), then include a bias column, the feature in which
        all polynomial powers are zero (i.e. a column of ones - acts as an
        intercept term in a linear model).

    order : {'C', 'F'}, default='C'
        Order of output array in the dense case. `'F'` order is faster to
        compute, but may slow down subsequent estimators.

        .. versionadded:: 0.21

    Attributes
    ----------
    powers_ : ndarray of shape (`n_output_features_`, `n_features_in_`)
        `powers_[i, j]` is the exponent of the jth input in the ith output.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_output_features_ : int
        The total number of polynomial output features. The number of output
        features is computed by iterating over all suitably sized combinations
        of input features.

    See Also
    --------
    SplineTransformer : Transformer that generates univariate B-spline bases
        for features.

    Notes
    -----
    Be aware that the number of features in the output array scales
    polynomially in the number of features of the input array, and
    exponentially in the degree. High degrees can cause overfitting.

    See :ref:`examples/linear_model/plot_polynomial_interpolation.py
    <sphx_glr_auto_examples_linear_model_plot_polynomial_interpolation.py>`

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.preprocessing import PolynomialFeatures
    >>> X = np.arange(6).reshape(3, 2)
    >>> X
    array([[0, 1],
           [2, 3],
           [4, 5]])
    >>> poly = PolynomialFeatures(2)
    >>> poly.fit_transform(X)
    array([[ 1.,  0.,  1.,  0.,  0.,  1.],
           [ 1.,  2.,  3.,  4.,  6.,  9.],
           [ 1.,  4.,  5., 16., 20., 25.]])
    >>> poly = PolynomialFeatures(interaction_only=True)
    >>> poly.fit_transform(X)
    array([[ 1.,  0.,  1.,  0.],
           [ 1.,  2.,  3.,  6.],
           [ 1.,  4.,  5., 20.]])
    """

    _parameter_constraints: dict = {
        "degree": [Interval(Integral, 0, None, closed="left"), "array-like"],
        "interaction_only": ["boolean"],
        "include_bias": ["boolean"],
        "order": [StrOptions({"C", "F"})],
    }

    def __init__(
        self, degree=2, *, interaction_only=False, include_bias=True, order="C"
    ):
        self.degree = degree
        self.interaction_only = interaction_only
        self.include_bias = include_bias
        self.order = order

    @staticmethod
    def _combinations(
        n_features, min_degree, max_degree, interaction_only, include_bias
    ):
        comb = combinations if interaction_only else combinations_w_r
        start = max(1, min_degree)
        iter = chain.from_iterable(
            comb(range(n_features), i) for i in range(start, max_degree + 1)
        )
        if include_bias:
            iter = chain(comb(range(n_features), 0), iter)
        return iter
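
    # For illustration (not part of the library): with n_features=2,
    # min_degree=0, max_degree=2, interaction_only=False and include_bias=True,
    # `_combinations` yields the index tuples
    #     (), (0,), (1,), (0, 0), (0, 1), (1, 1)
    # which correspond to the monomials 1, x0, x1, x0^2, x0*x1, x1^2.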

    @staticmethod
    def _num_combinations(
        n_features, min_degree, max_degree, interaction_only, include_bias
    ):
        """Calculate number of terms in polynomial expansion

        This should be equivalent to counting the number of terms returned by
        _combinations(...) but much faster.
        """

        if interaction_only:
            combinations = sum(
                [
                    comb(n_features, i, exact=True)
                    for i in range(max(1, min_degree), min(max_degree, n_features) + 1)
                ]
            )
        else:
            combinations = comb(n_features + max_degree, max_degree, exact=True) - 1
            if min_degree > 0:
                d = min_degree - 1
                combinations -= comb(n_features + d, d, exact=True) - 1

        if include_bias:
            combinations += 1

        return combinations
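
    # For illustration (not part of the library): the closed form above is a
    # "stars and bars" count. With n_features=2, max_degree=2, min_degree=0 and
    # interaction_only=False, comb(2 + 2, 2, exact=True) - 1 = 5 counts the
    # monomials of degree 1 or 2; adding 1 for the bias gives the 6 output
    # columns of the class docstring example.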

    @property
    def powers_(self):
        """Exponent for each of the inputs in the output."""
        check_is_fitted(self)

        combinations = self._combinations(
            n_features=self.n_features_in_,
            min_degree=self._min_degree,
            max_degree=self._max_degree,
            interaction_only=self.interaction_only,
            include_bias=self.include_bias,
        )
        return np.vstack(
            [np.bincount(c, minlength=self.n_features_in_) for c in combinations]
        )
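
    # For illustration (not part of the library): `np.bincount` turns each
    # index combination into a row of exponents, e.g.
    # np.bincount((0, 1), minlength=2) gives [1, 1] (the monomial x0*x1) and
    # np.bincount((0, 0), minlength=2) gives [2, 0] (the monomial x0^2).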

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features is None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        powers = self.powers_
        input_features = _check_feature_names_in(self, input_features)
        feature_names = []
        for row in powers:
            inds = np.where(row)[0]
            if len(inds):
                name = " ".join(
                    (
                        "%s^%d" % (input_features[ind], exp)
                        if exp != 1
                        else input_features[ind]
                    )
                    for ind, exp in zip(inds, row[inds])
                )
            else:
                name = "1"
            feature_names.append(name)
        return np.asarray(feature_names, dtype=object)
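
    # For illustration (not part of the library): with two input features named
    # "a" and "b" and degree=2, the loop above produces the names
    # ["1", "a", "b", "a^2", "a b", "b^2"].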

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """
        Compute number of output features.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            Fitted transformer.
        """
        _, n_features = self._validate_data(X, accept_sparse=True).shape

        if isinstance(self.degree, Integral):
            if self.degree == 0 and not self.include_bias:
                raise ValueError(
                    "Setting degree to zero and include_bias to False would result in"
                    " an empty output array."
                )

            self._min_degree = 0
            self._max_degree = self.degree
        elif (
            isinstance(self.degree, collections.abc.Iterable) and len(self.degree) == 2
        ):
            self._min_degree, self._max_degree = self.degree
            if not (
                isinstance(self._min_degree, Integral)
                and isinstance(self._max_degree, Integral)
                and self._min_degree >= 0
                and self._min_degree <= self._max_degree
            ):
                raise ValueError(
                    "degree=(min_degree, max_degree) must "
                    "be non-negative integers that fulfil "
                    "min_degree <= max_degree, got "
                    f"{self.degree}."
                )
            elif self._max_degree == 0 and not self.include_bias:
                raise ValueError(
                    "Setting both min_degree and max_degree to zero and include_bias to"
                    " False would result in an empty output array."
                )
        else:
            raise ValueError(
                "degree must be a non-negative int or tuple "
                "(min_degree, max_degree), got "
                f"{self.degree}."
            )

        self.n_output_features_ = self._num_combinations(
            n_features=n_features,
            min_degree=self._min_degree,
            max_degree=self._max_degree,
            interaction_only=self.interaction_only,
            include_bias=self.include_bias,
        )
        if self.n_output_features_ > np.iinfo(np.intp).max:
            msg = (
                "The output that would result from the current configuration would"
                f" have {self.n_output_features_} features which is too large to be"
                f" indexed by {np.intp().dtype.name}. Please change some or all of the"
                " following:\n- The number of features in the input, currently"
                f" {n_features=}\n- The range of degrees to calculate, currently"
                f" [{self._min_degree}, {self._max_degree}]\n- Whether to include only"
                f" interaction terms, currently {self.interaction_only}\n- Whether to"
                f" include a bias term, currently {self.include_bias}."
            )
            if (
                np.intp == np.int32
                and self.n_output_features_ <= np.iinfo(np.int64).max
            ):  # pragma: nocover
                msg += (
                    "\nNote that the current Python runtime has a limited 32 bit "
                    "address space and that this configuration would have been "
                    "admissible if run on a 64 bit Python runtime."
                )
            raise ValueError(msg)
        # We also record the number of output features for
        # _min_degree = 0, i.e. the full expansion up to _max_degree.
        self._n_out_full = self._num_combinations(
            n_features=n_features,
            min_degree=0,
            max_degree=self._max_degree,
            interaction_only=self.interaction_only,
            include_bias=self.include_bias,
        )

        return self
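
    # For illustration (not part of the library): fitting with degree=(2, 3) on
    # two features sets _min_degree=2 and _max_degree=3, so n_output_features_
    # is 8 (bias, x0^2, x0*x1, x1^2, x0^3, x0^2*x1, x0*x1^2, x1^3), while
    # _n_out_full counts all 10 columns of the full degree-0..3 expansion.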

    def transform(self, X):
        """Transform data to polynomial features.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data to transform, row by row.

            Prefer CSR over CSC for sparse input (for speed), but CSC is
            required if the degree is 4 or higher. If the degree is less than
            4 and the input format is CSC, it will be converted to CSR, have
            its polynomial features generated, then converted back to CSC.

            If the degree is 2 or 3, the method described in "Leveraging
            Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices
            Using K-Simplex Numbers" by Andrew Nystrom and John Hughes is
            used, which is much faster than the method used on CSC input. For
            this reason, a CSC input will be converted to CSR, and the output
            will be converted back to CSC prior to being returned, hence the
            preference of CSR.

        Returns
        -------
        XP : {ndarray, sparse matrix} of shape (n_samples, NP)
            The matrix of features, where `NP` is the number of polynomial
            features generated from the combination of inputs. If a sparse
            matrix is provided, it will be converted into a sparse
            `csr_matrix`.
        """
        check_is_fitted(self)

        X = self._validate_data(
            X, order="F", dtype=FLOAT_DTYPES, reset=False, accept_sparse=("csr", "csc")
        )

        n_samples, n_features = X.shape
        max_int32 = np.iinfo(np.int32).max
        if sparse.issparse(X) and X.format == "csr":
            if self._max_degree > 3:
                return self.transform(X.tocsc()).tocsr()
            to_stack = []
            if self.include_bias:
                to_stack.append(
                    sparse.csr_matrix(np.ones(shape=(n_samples, 1), dtype=X.dtype))
                )
            if self._min_degree <= 1 and self._max_degree > 0:
                to_stack.append(X)

            cumulative_size = sum(mat.shape[1] for mat in to_stack)
            for deg in range(max(2, self._min_degree), self._max_degree + 1):
                expanded = _create_expansion(
                    X=X,
                    interaction_only=self.interaction_only,
                    deg=deg,
                    n_features=n_features,
                    cumulative_size=cumulative_size,
                )
                if expanded is not None:
                    to_stack.append(expanded)
                    cumulative_size += expanded.shape[1]
            if len(to_stack) == 0:
                # edge case: deal with empty matrix
                XP = sparse.csr_matrix((n_samples, 0), dtype=X.dtype)
            else:
                # `scipy.sparse.hstack` breaks in scipy<1.9.2
                # when `n_output_features_ > max_int32`
                all_int32 = all(mat.indices.dtype == np.int32 for mat in to_stack)
                if (
                    sp_version < parse_version("1.9.2")
                    and self.n_output_features_ > max_int32
                    and all_int32
                ):
                    raise ValueError(  # pragma: no cover
                        "In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`"
                        " produces negative columns when:\n1. The output shape contains"
                        " `n_cols` too large to be represented by a 32bit signed"
                        " integer.\n2. All sub-matrices to be stacked have indices of"
                        " dtype `np.int32`.\nTo avoid this error, either use a version"
                        " of scipy `>=1.9.2` or alter the `PolynomialFeatures`"
                        " transformer to produce fewer than 2^31 output features"
                    )
                XP = sparse.hstack(to_stack, dtype=X.dtype, format="csr")
        elif sparse.issparse(X) and X.format == "csc" and self._max_degree < 4:
            return self.transform(X.tocsr()).tocsc()
        elif sparse.issparse(X):
            combinations = self._combinations(
                n_features=n_features,
                min_degree=self._min_degree,
                max_degree=self._max_degree,
                interaction_only=self.interaction_only,
                include_bias=self.include_bias,
            )
            columns = []
            for combi in combinations:
                if combi:
                    out_col = 1
                    for col_idx in combi:
                        out_col = X[:, [col_idx]].multiply(out_col)
                    columns.append(out_col)
                else:
                    bias = sparse.csc_matrix(np.ones((X.shape[0], 1)))
                    columns.append(bias)
            XP = sparse.hstack(columns, dtype=X.dtype).tocsc()
        else:
            # Do as if _min_degree = 0 and cut down array after the
            # computation, i.e. use _n_out_full instead of n_output_features_.
            XP = np.empty(
                shape=(n_samples, self._n_out_full), dtype=X.dtype, order=self.order
            )

            # What follows is a faster implementation of:
            # for i, comb in enumerate(combinations):
            #     XP[:, i] = X[:, comb].prod(1)
            # This implementation uses two optimisations.
            # The first one is broadcasting:
            # multiply ([X1, ..., Xn], X1) -> [X1 X1, ..., Xn X1]
            # multiply ([X2, ..., Xn], X2) -> [X2 X2, ..., Xn X2]
            # ...
            # multiply ([X[:, start:end], X[:, start]) -> ...
            # The second optimisation happens for degrees >= 3:
            # Xi^3 is computed by reusing the previous computation,
            # Xi^3 = Xi^2 * Xi.
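
            # For illustration (not part of the library): with n_features=2
            # and degree=2, the degree-1 block is [x0, x1]; multiplying the
            # slice [x0, x1] by x0 appends [x0^2, x0*x1] and multiplying the
            # slice [x1] by x1 appends [x1^2], filling the degree-2 block
            # without materialising the index combinations.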

            # degree 0 term
            if self.include_bias:
                XP[:, 0] = 1
                current_col = 1
            else:
                current_col = 0

            if self._max_degree == 0:
                return XP

            # degree 1 term
            XP[:, current_col : current_col + n_features] = X
            index = list(range(current_col, current_col + n_features))
            current_col += n_features
            index.append(current_col)

            # loop over degree >= 2 terms
            for _ in range(2, self._max_degree + 1):
                new_index = []
                end = index[-1]
                for feature_idx in range(n_features):
                    start = index[feature_idx]
                    new_index.append(current_col)
                    if self.interaction_only:
                        start += index[feature_idx + 1] - index[feature_idx]
                    next_col = current_col + end - start
                    if next_col <= current_col:
                        break
                    # XP[:, start:end] are terms of degree d - 1
                    # that exclude feature #feature_idx.
                    np.multiply(
                        XP[:, start:end],
                        X[:, feature_idx : feature_idx + 1],
                        out=XP[:, current_col:next_col],
                        casting="no",
                    )
                    current_col = next_col

                new_index.append(current_col)
                index = new_index

            if self._min_degree > 1:
                n_XP, n_Xout = self._n_out_full, self.n_output_features_
                if self.include_bias:
                    Xout = np.empty(
                        shape=(n_samples, n_Xout), dtype=XP.dtype, order=self.order
                    )
                    Xout[:, 0] = 1
                    Xout[:, 1:] = XP[:, n_XP - n_Xout + 1 :]
                else:
                    Xout = XP[:, n_XP - n_Xout :].copy()
                XP = Xout
        return XP

class SplineTransformer(TransformerMixin, BaseEstimator):
    """Generate univariate B-spline bases for features.

    Generate a new feature matrix consisting of
    `n_splines=n_knots + degree - 1` (`n_knots - 1` for
    `extrapolation="periodic"`) spline basis functions
    (B-splines) of polynomial order=`degree` for each feature.

    To learn more about the `SplineTransformer` class, see:
    :ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py`

    Read more in the :ref:`User Guide <spline_transformer>`.

    .. versionadded:: 1.0

    Parameters
    ----------
    n_knots : int, default=5
        Number of knots of the splines if `knots` equals one of
        {'uniform', 'quantile'}. Must be at least 2. Ignored if `knots`
        is array-like.

    degree : int, default=3
        The polynomial degree of the spline basis. Must be a non-negative
        integer.

    knots : {'uniform', 'quantile'} or array-like of shape \
        (n_knots, n_features), default='uniform'
        Set knot positions such that first knot <= features <= last knot.

        - If 'uniform', `n_knots` number of knots are distributed uniformly
          from min to max values of the features.
        - If 'quantile', they are distributed uniformly along the quantiles of
          the features.
        - If an array-like is given, it directly specifies the sorted knot
          positions including the boundary knots. Note that, internally,
          `degree` number of knots are added before the first knot, the same
          after the last knot.

    extrapolation : {'error', 'constant', 'linear', 'continue', 'periodic'}, \
        default='constant'
        If 'error', values outside the min and max values of the training
        features raise a `ValueError`. If 'constant', the value of the
        splines at the minimum and maximum value of the features is used as
        constant extrapolation. If 'linear', a linear extrapolation is used.
        If 'continue', the splines are extrapolated as is, i.e. option
        `extrapolate=True` in :class:`scipy.interpolate.BSpline`. If
        'periodic', periodic splines with a periodicity equal to the distance
        between the first and last knot are used. Periodic splines enforce
        equal function values and derivatives at the first and last knot.
        For example, this makes it possible to avoid introducing an arbitrary
        jump between Dec 31st and Jan 1st in spline features derived from a
        naturally periodic "day-of-year" input feature. In this case it is
        recommended to manually set the knot values to control the period.

    include_bias : bool, default=True
        If False, then the last spline element inside the data range
        of a feature is dropped. As B-splines sum to one over the spline basis
        functions for each data point, they implicitly include a bias term,
        i.e. a column of ones. It acts as an intercept term in a linear model.

    order : {'C', 'F'}, default='C'
        Order of output array in the dense case. `'F'` order is faster to compute, but
        may slow down subsequent estimators.

    sparse_output : bool, default=False
        If True, returns a sparse CSR matrix, else returns an array. This
        option is only available with `scipy>=1.8`.

        .. versionadded:: 1.2

    Attributes
    ----------
    bsplines_ : list of shape (n_features,)
        List of BSpline objects, one for each feature.

    n_features_in_ : int
        The total number of input features.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_features_out_ : int
        The total number of output features, which is computed as
        `n_features * n_splines`, where `n_splines` is
        the number of basis elements of the B-splines,
        `n_knots + degree - 1` for non-periodic splines and
        `n_knots - 1` for periodic ones.
        If `include_bias=False`, then it is only
        `n_features * (n_splines - 1)`.

    See Also
    --------
    KBinsDiscretizer : Transformer that bins continuous data into intervals.

    PolynomialFeatures : Transformer that generates polynomial and interaction
        features.

    Notes
    -----
    High degrees and a high number of knots can cause overfitting.

    See :ref:`examples/linear_model/plot_polynomial_interpolation.py
    <sphx_glr_auto_examples_linear_model_plot_polynomial_interpolation.py>`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.preprocessing import SplineTransformer
    >>> X = np.arange(6).reshape(6, 1)
    >>> spline = SplineTransformer(degree=2, n_knots=3)
    >>> spline.fit_transform(X)
    array([[0.5 , 0.5 , 0.  , 0.  ],
           [0.18, 0.74, 0.08, 0.  ],
           [0.02, 0.66, 0.32, 0.  ],
           [0.  , 0.32, 0.66, 0.02],
           [0.  , 0.08, 0.74, 0.18],
           [0.  , 0.  , 0.5 , 0.5 ]])
    """

    _parameter_constraints: dict = {
        "n_knots": [Interval(Integral, 2, None, closed="left")],
        "degree": [Interval(Integral, 0, None, closed="left")],
        "knots": [StrOptions({"uniform", "quantile"}), "array-like"],
        "extrapolation": [
            StrOptions({"error", "constant", "linear", "continue", "periodic"})
        ],
        "include_bias": ["boolean"],
        "order": [StrOptions({"C", "F"})],
        "sparse_output": ["boolean"],
    }

    def __init__(
        self,
        n_knots=5,
        degree=3,
        *,
        knots="uniform",
        extrapolation="constant",
        include_bias=True,
        order="C",
        sparse_output=False,
    ):
        self.n_knots = n_knots
        self.degree = degree
        self.knots = knots
        self.extrapolation = extrapolation
        self.include_bias = include_bias
        self.order = order
        self.sparse_output = sparse_output

    @staticmethod
    def _get_base_knot_positions(X, n_knots=10, knots="uniform", sample_weight=None):
        """Calculate base knot positions.

        Base knots such that first knot <= feature <= last knot. For the
        B-spline construction with scipy.interpolate.BSpline, 2*degree knots
        beyond the base interval are added.

        Returns
        -------
        knots : ndarray of shape (n_knots, n_features), dtype=np.float64
            Knot positions (points) of base interval.
        """
        if knots == "quantile":
            percentiles = 100 * np.linspace(
                start=0, stop=1, num=n_knots, dtype=np.float64
            )

            if sample_weight is None:
                knots = np.percentile(X, percentiles, axis=0)
            else:
                knots = np.array(
                    [
                        _weighted_percentile(X, sample_weight, percentile)
                        for percentile in percentiles
                    ]
                )

        else:
            # knots == 'uniform':
            # Note that the variable `knots` has already been validated and
            # `else` is therefore safe.
            # Disregard observations with zero weight.
            mask = slice(None, None, 1) if sample_weight is None else sample_weight > 0
            x_min = np.amin(X[mask], axis=0)
            x_max = np.amax(X[mask], axis=0)

            knots = np.linspace(
                start=x_min,
                stop=x_max,
                num=n_knots,
                endpoint=True,
                dtype=np.float64,
            )

        return knots
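
    # For illustration (not part of the library): for a single feature with
    # values spanning [0, 5] and n_knots=3, knots="uniform" places the base
    # knots at [[0.0], [2.5], [5.0]], while knots="quantile" uses the 0th,
    # 50th and 100th percentiles of the observed values instead.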

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self, "n_features_in_")
        n_splines = self.bsplines_[0].c.shape[1]

        input_features = _check_feature_names_in(self, input_features)
        feature_names = []
        for i in range(self.n_features_in_):
            for j in range(n_splines - 1 + self.include_bias):
                feature_names.append(f"{input_features[i]}_sp_{j}")
        return np.asarray(feature_names, dtype=object)
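
    # For illustration (not part of the library): one input feature named "x0"
    # with n_splines=4 and include_bias=True yields
    # ["x0_sp_0", "x0_sp_1", "x0_sp_2", "x0_sp_3"]; with include_bias=False
    # the last name is dropped.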

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None, sample_weight=None):
        """Compute knot positions of splines.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data.

        y : None
            Ignored.

        sample_weight : array-like of shape (n_samples,), default = None
            Individual weights for each sample. Used to calculate quantiles if
            `knots="quantile"`. For `knots="uniform"`, zero weighted
            observations are ignored for finding the min and max of `X`.

        Returns
        -------
        self : object
            Fitted transformer.
        """
        X = self._validate_data(
            X,
            reset=True,
            accept_sparse=False,
            ensure_min_samples=2,
            ensure_2d=True,
        )
        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

        _, n_features = X.shape

        if isinstance(self.knots, str):
            base_knots = self._get_base_knot_positions(
                X, n_knots=self.n_knots, knots=self.knots, sample_weight=sample_weight
            )
        else:
            base_knots = check_array(self.knots, dtype=np.float64)
            if base_knots.shape[0] < 2:
                raise ValueError("Number of knots, knots.shape[0], must be >= 2.")
            elif base_knots.shape[1] != n_features:
                raise ValueError("knots.shape[1] == n_features is violated.")
            elif not np.all(np.diff(base_knots, axis=0) > 0):
                raise ValueError("knots must be sorted without duplicates.")

        if self.sparse_output and sp_version < parse_version("1.8.0"):
            raise ValueError(
                "Option sparse_output=True is only available with scipy>=1.8.0, "
                f"but here scipy=={sp_version} is used."
            )

        # number of knots for base interval
        n_knots = base_knots.shape[0]

        if self.extrapolation == "periodic" and n_knots <= self.degree:
            raise ValueError(
                "Periodic splines require degree < n_knots. Got n_knots="
                f"{n_knots} and degree={self.degree}."
            )

        # number of spline basis functions
        if self.extrapolation != "periodic":
            n_splines = n_knots + self.degree - 1
        else:
            # periodic splines have self.degree fewer degrees of freedom
            n_splines = n_knots - 1

        degree = self.degree
        n_out = n_features * n_splines
        # We have to add degree number of knots below, and degree number knots
        # above the base knots in order to make the spline basis complete.
        if self.extrapolation == "periodic":
            # For periodic splines the spacing of the first / last degree knots
            # needs to be a continuation of the spacing of the last / first
            # base knots.
            period = base_knots[-1] - base_knots[0]
            knots = np.r_[
                base_knots[-(degree + 1) : -1] - period,
                base_knots,
                base_knots[1 : (degree + 1)] + period,
            ]

        else:
            # Eilers & Marx in "Flexible smoothing with B-splines and
            # penalties" https://doi.org/10.1214/ss/1038425655 advise
            # against repeating first and last knot several times, which
            # would have inferior behaviour at boundaries if combined with
            # a penalty (hence P-Spline). We follow this advice even if our
            # splines are unpenalized. Meaning we do not:
            # knots = np.r_[
            #     np.tile(base_knots.min(axis=0), reps=[degree, 1]),
            #     base_knots,
            #     np.tile(base_knots.max(axis=0), reps=[degree, 1])
            # ]
            # Instead, we reuse the distance of the 2 first/last knots.
            dist_min = base_knots[1] - base_knots[0]
            dist_max = base_knots[-1] - base_knots[-2]

            knots = np.r_[
                np.linspace(
                    base_knots[0] - degree * dist_min,
                    base_knots[0] - dist_min,
                    num=degree,
                ),
                base_knots,
                np.linspace(
                    base_knots[-1] + dist_max,
                    base_knots[-1] + degree * dist_max,
                    num=degree,
                ),
            ]

        # With a diagonal coefficient matrix, we get back the spline basis
        # elements, i.e. the design matrix of the spline.
        # Note, BSpline appreciates C-contiguous float64 arrays as c=coef.
        coef = np.eye(n_splines, dtype=np.float64)
        if self.extrapolation == "periodic":
            coef = np.concatenate((coef, coef[:degree, :]))

        extrapolate = self.extrapolation in ["periodic", "continue"]

        bsplines = [
            BSpline.construct_fast(
                knots[:, i], coef, self.degree, extrapolate=extrapolate
            )
            for i in range(n_features)
        ]
        self.bsplines_ = bsplines

        self.n_features_out_ = n_out - n_features * (1 - self.include_bias)
        return self
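
    # For illustration (not part of the library): with base knots [0, 2.5, 5]
    # and degree=2, the non-periodic branch above extends the knot vector to
    # [-5, -2.5, 0, 2.5, 5, 7.5, 10], i.e. `degree` extra knots on each side
    # spaced like the adjacent base knots, giving n_splines = 3 + 2 - 1 = 4
    # basis functions per feature.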

    def transform(self, X):
        """Transform each feature data to B-splines.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to transform.

        Returns
        -------
        XBS : {ndarray, sparse matrix} of shape (n_samples, n_features * n_splines)
            The matrix of features, where n_splines is the number of basis
            elements of the B-splines, n_knots + degree - 1.
        """
        check_is_fitted(self)

        X = self._validate_data(X, reset=False, accept_sparse=False, ensure_2d=True)

        n_samples, n_features = X.shape
        n_splines = self.bsplines_[0].c.shape[1]
        degree = self.degree

        # TODO: Remove this condition, once scipy 1.10 is the minimum version.
        # Only scipy >= 1.10 supports design_matrix(.., extrapolate=..).
        # The default (implicit in scipy < 1.10) is extrapolate=False.
        scipy_1_10 = sp_version >= parse_version("1.10.0")
        # Note: self.bsplines_[0].extrapolate is True for extrapolation in
        # ["periodic", "continue"]
        if scipy_1_10:
            use_sparse = self.sparse_output
            kwargs_extrapolate = {"extrapolate": self.bsplines_[0].extrapolate}
        else:
            use_sparse = self.sparse_output and not self.bsplines_[0].extrapolate
            kwargs_extrapolate = dict()

        # Note that scipy BSpline returns float64 arrays and converts input
        # x=X[:, i] to c-contiguous float64.
        n_out = self.n_features_out_ + n_features * (1 - self.include_bias)
        if X.dtype in FLOAT_DTYPES:
            dtype = X.dtype
        else:
            dtype = np.float64
        if use_sparse:
            output_list = []
        else:
            XBS = np.zeros((n_samples, n_out), dtype=dtype, order=self.order)

        for i in range(n_features):
            spl = self.bsplines_[i]

            if self.extrapolation in ("continue", "error", "periodic"):
                if self.extrapolation == "periodic":
                    # With periodic extrapolation we map x to the segment
                    # [spl.t[k], spl.t[n]].
                    # This is equivalent to BSpline(.., extrapolate="periodic")
                    # for scipy>=1.0.0.
                    n = spl.t.size - spl.k - 1
                    # Assign to new array to avoid inplace operation
                    x = spl.t[spl.k] + (X[:, i] - spl.t[spl.k]) % (
                        spl.t[n] - spl.t[spl.k]
                    )
                else:
                    x = X[:, i]

                if use_sparse:
                    XBS_sparse = BSpline.design_matrix(
                        x, spl.t, spl.k, **kwargs_extrapolate
                    )
                    if self.extrapolation == "periodic":
                        # See the construction of coef in fit. We need to add the last
                        # degree spline basis function to the first degree ones and
                        # then drop the last ones.
                        # Note: See comment about SparseEfficiencyWarning below.
                        XBS_sparse = XBS_sparse.tolil()
                        XBS_sparse[:, :degree] += XBS_sparse[:, -degree:]
                        XBS_sparse = XBS_sparse[:, :-degree]
                else:
                    XBS[:, (i * n_splines) : ((i + 1) * n_splines)] = spl(x)
            else:  # extrapolation in ("constant", "linear")
                xmin, xmax = spl.t[degree], spl.t[-degree - 1]
                # spline values at boundaries
                f_min, f_max = spl(xmin), spl(xmax)
                mask = (xmin <= X[:, i]) & (X[:, i] <= xmax)
                if use_sparse:
                    mask_inv = ~mask
                    x = X[:, i].copy()
                    # Set some arbitrary values outside boundary that will be reassigned
                    # later.
                    x[mask_inv] = spl.t[self.degree]
                    XBS_sparse = BSpline.design_matrix(x, spl.t, spl.k)
                    # Note: Without converting to lil_matrix we would get:
                    # scipy.sparse._base.SparseEfficiencyWarning: Changing the sparsity
                    # structure of a csr_matrix is expensive. lil_matrix is more
                    # efficient.
                    if np.any(mask_inv):
                        XBS_sparse = XBS_sparse.tolil()
                        XBS_sparse[mask_inv, :] = 0
                else:
                    XBS[mask, (i * n_splines) : ((i + 1) * n_splines)] = spl(X[mask, i])

            # Note for extrapolation:
            # 'continue' is already returned as is by scipy BSplines
            if self.extrapolation == "error":
                # BSpline with extrapolate=False does not raise an error, but
                # outputs np.nan.
                if (use_sparse and np.any(np.isnan(XBS_sparse.data))) or (
                    not use_sparse
                    and np.any(
                        np.isnan(XBS[:, (i * n_splines) : ((i + 1) * n_splines)])
                    )
                ):
                    raise ValueError(
                        "X contains values beyond the limits of the knots."
                    )
            elif self.extrapolation == "constant":
                # Set all values beyond xmin and xmax to the value of the
                # spline basis functions at those two positions.
                # Only the first degree and last degree number of splines
                # have non-zero values at the boundaries.

                mask = X[:, i] < xmin
                if np.any(mask):
                    if use_sparse:
                        # Note: See comment about SparseEfficiencyWarning above.
                        XBS_sparse = XBS_sparse.tolil()
                        XBS_sparse[mask, :degree] = f_min[:degree]

                    else:
                        XBS[mask, (i * n_splines) : (i * n_splines + degree)] = f_min[
                            :degree
                        ]

                mask = X[:, i] > xmax
                if np.any(mask):
                    if use_sparse:
                        # Note: See comment about SparseEfficiencyWarning above.
                        XBS_sparse = XBS_sparse.tolil()
                        XBS_sparse[mask, -degree:] = f_max[-degree:]
                    else:
                        XBS[
                            mask,
                            ((i + 1) * n_splines - degree) : ((i + 1) * n_splines),
                        ] = f_max[-degree:]

            elif self.extrapolation == "linear":
                # Continue the degree first and degree last spline bases
                # linearly beyond the boundaries, with slope = derivative at
                # the boundary.
                # Note that all others have derivative = value = 0 at the
                # boundaries.

                # spline derivatives = slopes at boundaries
                fp_min, fp_max = spl(xmin, nu=1), spl(xmax, nu=1)
                # Compute the linear continuation.
                if degree <= 1:
                    # For degree=1, the derivative of 2nd spline is not zero at
                    # boundary. For degree=0 it is the same as 'constant'.
                    degree += 1
                for j in range(degree):
                    mask = X[:, i] < xmin
                    if np.any(mask):
                        linear_extr = f_min[j] + (X[mask, i] - xmin) * fp_min[j]
                        if use_sparse:
                            # Note: See comment about SparseEfficiencyWarning above.
                            XBS_sparse = XBS_sparse.tolil()
                            XBS_sparse[mask, j] = linear_extr
                        else:
                            XBS[mask, i * n_splines + j] = linear_extr

                    mask = X[:, i] > xmax
                    if np.any(mask):
                        k = n_splines - 1 - j
                        linear_extr = f_max[k] + (X[mask, i] - xmax) * fp_max[k]
                        if use_sparse:
                            # Note: See comment about SparseEfficiencyWarning above.
                            XBS_sparse = XBS_sparse.tolil()
                            XBS_sparse[mask, k : k + 1] = linear_extr[:, None]
                        else:
                            XBS[mask, i * n_splines + k] = linear_extr

            if use_sparse:
                XBS_sparse = XBS_sparse.tocsr()
                output_list.append(XBS_sparse)

        if use_sparse:
            # TODO: Remove this conditional error when the minimum supported version of
            # SciPy is 1.9.2
            # `scipy.sparse.hstack` breaks in scipy<1.9.2
            # when `n_features_out_ > max_int32`
            max_int32 = np.iinfo(np.int32).max
            all_int32 = True
            for mat in output_list:
                all_int32 &= mat.indices.dtype == np.int32
            if (
                sp_version < parse_version("1.9.2")
                and self.n_features_out_ > max_int32
                and all_int32
            ):
                raise ValueError(
                    "In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`"
                    " produces negative columns when:\n1. The output shape contains"
                    " `n_cols` too large to be represented by a 32bit signed"
                    " integer.\n2. All sub-matrices to be stacked have indices of"
                    " dtype `np.int32`.\nTo avoid this error, either use a version"
                    " of scipy `>=1.9.2` or alter the `SplineTransformer`"
                    " transformer to produce fewer than 2^31 output features"
                )
            XBS = sparse.hstack(output_list, format="csr")
        elif self.sparse_output:
            # TODO: Remove once scipy 1.10 is the minimum version. See comments above.
            XBS = sparse.csr_matrix(XBS)

        if self.include_bias:
            return XBS
        else:
            # We throw away one spline basis per feature.
            # We choose the last one.
            indices = [j for j in range(XBS.shape[1]) if (j + 1) % n_splines != 0]
            return XBS[:, indices]
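
    # For illustration (not part of the library): with include_bias=False and
    # n_splines=4 per feature, the filter above keeps columns
    # [0, 1, 2, 4, 5, 6, ...], dropping every fourth column, i.e. the last
    # spline basis element of each feature.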

    def _more_tags(self):
        return {
            "_xfail_checks": {
                "check_estimators_pickle": (
                    "Current Scipy implementation of _bsplines does not"
                    " support const memory views."
                ),
            }
        }