1"""
2This file contains preprocessing tools based on polynomials.
3"""
4import collections
5from itertools import chain, combinations
6from itertools import combinations_with_replacement as combinations_w_r
7from numbers import Integral
8
9import numpy as np
10from scipy import sparse
11from scipy.interpolate import BSpline
12from scipy.special import comb
13
14from ..base import BaseEstimator, TransformerMixin, _fit_context
15from ..utils import check_array
16from ..utils._param_validation import Interval, StrOptions
17from ..utils.fixes import parse_version, sp_version
18from ..utils.stats import _weighted_percentile
19from ..utils.validation import (
20 FLOAT_DTYPES,
21 _check_feature_names_in,
22 _check_sample_weight,
23 check_is_fitted,
24)
25from ._csr_polynomial_expansion import (
26 _calc_expanded_nnz,
27 _calc_total_nnz,
28 _csr_polynomial_expansion,
29)
30
31__all__ = [
32 "PolynomialFeatures",
33 "SplineTransformer",
34]
35
36
37def _create_expansion(X, interaction_only, deg, n_features, cumulative_size=0):
38 """Helper function for creating and appending sparse expansion matrices"""
39
40 total_nnz = _calc_total_nnz(X.indptr, interaction_only, deg)
41 expanded_col = _calc_expanded_nnz(n_features, interaction_only, deg)
42
43 if expanded_col == 0:
44 return None
    # This only checks whether each block needs 64-bit integers upon
    # expansion. We prefer to keep int32 indexing where we can, since
    # SciPy's CSR construction currently downcasts when possible, and we
    # want to avoid an unnecessary cast. The dtype may still change in
    # the concatenation process if needed.
    # See: https://github.com/scipy/scipy/issues/16569
    max_indices = expanded_col - 1
    max_indptr = total_nnz
    max_int32 = np.iinfo(np.int32).max
    needs_int64 = max(max_indices, max_indptr) > max_int32
    index_dtype = np.int64 if needs_int64 else np.int32
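    # For instance (illustration): if `total_nnz` exceeds 2**31 - 1
    # (`max_int32`), the indptr values no longer fit into int32, so int64
    # indexing is selected for this block.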

    # This is a pretty specific bug that is hard for a user to work around,
    # hence we do not detail the entire bug and all possible avoidance
    # mechanisms. Instead we recommend upgrading scipy or shrinking their data.
    cumulative_size += expanded_col
    if (
        sp_version < parse_version("1.8.0")
        and cumulative_size - 1 > max_int32
        and not needs_int64
    ):
        raise ValueError(
            "In scipy versions `<1.8.0`, the function `scipy.sparse.hstack`"
            " sometimes produces negative columns when the output shape contains"
            " `n_cols` too large to be represented by a 32bit signed"
            " integer. To avoid this error, either use a version"
            " of scipy `>=1.8.0` or alter the `PolynomialFeatures`"
            " transformer to produce fewer than 2^31 output features."
        )

    # Result of the expansion, modified in place by the
    # `_csr_polynomial_expansion` routine.
    expanded_data = np.empty(shape=total_nnz, dtype=X.data.dtype)
    expanded_indices = np.empty(shape=total_nnz, dtype=index_dtype)
    expanded_indptr = np.empty(shape=X.indptr.shape[0], dtype=index_dtype)
    _csr_polynomial_expansion(
        X.data,
        X.indices,
        X.indptr,
        X.shape[1],
        expanded_data,
        expanded_indices,
        expanded_indptr,
        interaction_only,
        deg,
    )
    return sparse.csr_matrix(
        (expanded_data, expanded_indices, expanded_indptr),
        shape=(X.indptr.shape[0] - 1, expanded_col),
        dtype=X.dtype,
    )


class PolynomialFeatures(TransformerMixin, BaseEstimator):
    """Generate polynomial and interaction features.

    Generate a new feature matrix consisting of all polynomial combinations
    of the features with degree less than or equal to the specified degree.
    For example, if an input sample is two dimensional and of the form
    [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2].

    Read more in the :ref:`User Guide <polynomial_features>`.

    Parameters
    ----------
    degree : int or tuple (min_degree, max_degree), default=2
        If a single int is given, it specifies the maximal degree of the
        polynomial features. If a tuple `(min_degree, max_degree)` is passed,
        then `min_degree` is the minimum and `max_degree` is the maximum
        polynomial degree of the generated features. Note that `min_degree=0`
        and `min_degree=1` are equivalent as outputting the degree zero term is
        determined by `include_bias`.

    interaction_only : bool, default=False
        If `True`, only interaction features are produced: features that are
        products of at most `degree` *distinct* input features, i.e. terms with
        power of 2 or higher of the same input feature are excluded:

        - included: `x[0]`, `x[1]`, `x[0] * x[1]`, etc.
        - excluded: `x[0] ** 2`, `x[0] ** 2 * x[1]`, etc.

    include_bias : bool, default=True
        If `True` (default), then include a bias column, the feature in which
        all polynomial powers are zero (i.e. a column of ones - acts as an
        intercept term in a linear model).

    order : {'C', 'F'}, default='C'
        Order of output array in the dense case. `'F'` order is faster to
        compute, but may slow down subsequent estimators.

        .. versionadded:: 0.21

    Attributes
    ----------
    powers_ : ndarray of shape (`n_output_features_`, `n_features_in_`)
        `powers_[i, j]` is the exponent of the jth input in the ith output.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_output_features_ : int
        The total number of polynomial output features. The number of output
        features is computed by iterating over all suitably sized combinations
        of input features.

    See Also
    --------
    SplineTransformer : Transformer that generates univariate B-spline bases
        for features.

    Notes
    -----
    Be aware that the number of features in the output array scales
    polynomially in the number of features of the input array, and
    exponentially in the degree. High degrees can cause overfitting.

    See :ref:`examples/linear_model/plot_polynomial_interpolation.py
    <sphx_glr_auto_examples_linear_model_plot_polynomial_interpolation.py>`

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.preprocessing import PolynomialFeatures
    >>> X = np.arange(6).reshape(3, 2)
    >>> X
    array([[0, 1],
           [2, 3],
           [4, 5]])
    >>> poly = PolynomialFeatures(2)
    >>> poly.fit_transform(X)
    array([[ 1.,  0.,  1.,  0.,  0.,  1.],
           [ 1.,  2.,  3.,  4.,  6.,  9.],
           [ 1.,  4.,  5., 16., 20., 25.]])
    >>> poly = PolynomialFeatures(interaction_only=True)
    >>> poly.fit_transform(X)
    array([[ 1.,  0.,  1.,  0.],
           [ 1.,  2.,  3.,  6.],
           [ 1.,  4.,  5., 20.]])
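
    A tuple `degree=(min_degree, max_degree)` keeps only the terms within the
    given degree range; a small sketch on the same `X`:

    >>> poly = PolynomialFeatures(degree=(2, 2), include_bias=False)
    >>> poly.fit_transform(X)
    array([[ 0.,  0.,  1.],
           [ 4.,  6.,  9.],
           [16., 20., 25.]])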
191 """
192
193 _parameter_constraints: dict = {
194 "degree": [Interval(Integral, 0, None, closed="left"), "array-like"],
195 "interaction_only": ["boolean"],
196 "include_bias": ["boolean"],
197 "order": [StrOptions({"C", "F"})],
198 }
199
200 def __init__(
201 self, degree=2, *, interaction_only=False, include_bias=True, order="C"
202 ):
203 self.degree = degree
204 self.interaction_only = interaction_only
205 self.include_bias = include_bias
206 self.order = order
207
208 @staticmethod
209 def _combinations(
210 n_features, min_degree, max_degree, interaction_only, include_bias
211 ):
212 comb = combinations if interaction_only else combinations_w_r
213 start = max(1, min_degree)
214 iter = chain.from_iterable(
215 comb(range(n_features), i) for i in range(start, max_degree + 1)
216 )
217 if include_bias:
218 iter = chain(comb(range(n_features), 0), iter)
219 return iter
220
221 @staticmethod
222 def _num_combinations(
223 n_features, min_degree, max_degree, interaction_only, include_bias
224 ):
225 """Calculate number of terms in polynomial expansion
226
227 This should be equivalent to counting the number of terms returned by
228 _combinations(...) but much faster.
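
        For example, with `n_features=2`, `max_degree=2`,
        `interaction_only=False` and `include_bias=True`, this evaluates to
        `comb(2 + 2, 2, exact=True) - 1 + 1 = 6` terms:
        `1, a, b, a^2, ab, b^2`.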
229 """
230
231 if interaction_only:
232 combinations = sum(
233 [
234 comb(n_features, i, exact=True)
235 for i in range(max(1, min_degree), min(max_degree, n_features) + 1)
236 ]
237 )
238 else:
239 combinations = comb(n_features + max_degree, max_degree, exact=True) - 1
240 if min_degree > 0:
241 d = min_degree - 1
242 combinations -= comb(n_features + d, d, exact=True) - 1
243
244 if include_bias:
245 combinations += 1
246
247 return combinations

    @property
    def powers_(self):
        """Exponent for each of the inputs in the output."""
        check_is_fitted(self)

        combinations = self._combinations(
            n_features=self.n_features_in_,
            min_degree=self._min_degree,
            max_degree=self._max_degree,
            interaction_only=self.interaction_only,
            include_bias=self.include_bias,
        )
        return np.vstack(
            [np.bincount(c, minlength=self.n_features_in_) for c in combinations]
        )

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features is None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
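
        Examples
        --------
        A small sketch of the naming scheme for a degree-2 expansion of two
        features:

        >>> import numpy as np
        >>> from sklearn.preprocessing import PolynomialFeatures
        >>> X = np.arange(6).reshape(3, 2)
        >>> poly = PolynomialFeatures(degree=2).fit(X)
        >>> poly.get_feature_names_out()
        array(['1', 'x0', 'x1', 'x0^2', 'x0 x1', 'x1^2'], dtype=object)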
284 """
285 powers = self.powers_
286 input_features = _check_feature_names_in(self, input_features)
287 feature_names = []
288 for row in powers:
289 inds = np.where(row)[0]
290 if len(inds):
291 name = " ".join(
292 (
293 "%s^%d" % (input_features[ind], exp)
294 if exp != 1
295 else input_features[ind]
296 )
297 for ind, exp in zip(inds, row[inds])
298 )
299 else:
300 name = "1"
301 feature_names.append(name)
302 return np.asarray(feature_names, dtype=object)

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """
        Compute number of output features.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            Fitted transformer.
        """
        _, n_features = self._validate_data(X, accept_sparse=True).shape

        if isinstance(self.degree, Integral):
            if self.degree == 0 and not self.include_bias:
                raise ValueError(
                    "Setting degree to zero and include_bias to False would result in"
                    " an empty output array."
                )

            self._min_degree = 0
            self._max_degree = self.degree
        elif (
            isinstance(self.degree, collections.abc.Iterable) and len(self.degree) == 2
        ):
            self._min_degree, self._max_degree = self.degree
            if not (
                isinstance(self._min_degree, Integral)
                and isinstance(self._max_degree, Integral)
                and self._min_degree >= 0
                and self._min_degree <= self._max_degree
            ):
                raise ValueError(
                    "degree=(min_degree, max_degree) must "
                    "be non-negative integers that fulfil "
                    "min_degree <= max_degree, got "
                    f"{self.degree}."
                )
            elif self._max_degree == 0 and not self.include_bias:
                raise ValueError(
                    "Setting both min_degree and max_degree to zero and include_bias to"
                    " False would result in an empty output array."
                )
        else:
            raise ValueError(
                "degree must be a non-negative int or tuple "
                "(min_degree, max_degree), got "
                f"{self.degree}."
            )

        self.n_output_features_ = self._num_combinations(
            n_features=n_features,
            min_degree=self._min_degree,
            max_degree=self._max_degree,
            interaction_only=self.interaction_only,
            include_bias=self.include_bias,
        )
        if self.n_output_features_ > np.iinfo(np.intp).max:
            msg = (
                "The output that would result from the current configuration would"
                f" have {self.n_output_features_} features which is too large to be"
                f" indexed by {np.intp().dtype.name}. Please change some or all of the"
                " following:\n- The number of features in the input, currently"
                f" {n_features=}\n- The range of degrees to calculate, currently"
                f" [{self._min_degree}, {self._max_degree}]\n- Whether to include only"
                f" interaction terms, currently {self.interaction_only}\n- Whether to"
                f" include a bias term, currently {self.include_bias}."
            )
            if (
                np.intp == np.int32
                and self.n_output_features_ <= np.iinfo(np.int64).max
            ):  # pragma: nocover
                msg += (
                    "\nNote that the current Python runtime has a limited 32 bit "
                    "address space and that this configuration would have been "
                    "admissible if run on a 64 bit Python runtime."
                )
            raise ValueError(msg)
        # We also record the number of output features for
        # _min_degree = 0
        self._n_out_full = self._num_combinations(
            n_features=n_features,
            min_degree=0,
            max_degree=self._max_degree,
            interaction_only=self.interaction_only,
            include_bias=self.include_bias,
        )

        return self

    def transform(self, X):
        """Transform data to polynomial features.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data to transform, row by row.

            Prefer CSR over CSC for sparse input (for speed), but CSC is
            required if the degree is 4 or higher. If the degree is less than
            4 and the input format is CSC, it will be converted to CSR, have
            its polynomial features generated, then converted back to CSC.

            If the degree is 2 or 3, the method described in "Leveraging
            Sparsity to Speed Up Polynomial Feature Expansions of CSR Matrices
            Using K-Simplex Numbers" by Andrew Nystrom and John Hughes is
            used, which is much faster than the method used on CSC input. For
            this reason, a CSC input will be converted to CSR, and the output
            will be converted back to CSC prior to being returned, hence the
            preference of CSR.

        Returns
        -------
        XP : {ndarray, sparse matrix} of shape (n_samples, NP)
            The matrix of features, where `NP` is the number of polynomial
            features generated from the combination of inputs. If a sparse
            matrix is provided, it will be converted into a sparse
            `csr_matrix`.
        """
        check_is_fitted(self)

        X = self._validate_data(
            X, order="F", dtype=FLOAT_DTYPES, reset=False, accept_sparse=("csr", "csc")
        )

        n_samples, n_features = X.shape
        max_int32 = np.iinfo(np.int32).max
        if sparse.issparse(X) and X.format == "csr":
            if self._max_degree > 3:
                return self.transform(X.tocsc()).tocsr()
            to_stack = []
            if self.include_bias:
                to_stack.append(
                    sparse.csr_matrix(np.ones(shape=(n_samples, 1), dtype=X.dtype))
                )
            if self._min_degree <= 1 and self._max_degree > 0:
                to_stack.append(X)

            cumulative_size = sum(mat.shape[1] for mat in to_stack)
            for deg in range(max(2, self._min_degree), self._max_degree + 1):
                expanded = _create_expansion(
                    X=X,
                    interaction_only=self.interaction_only,
                    deg=deg,
                    n_features=n_features,
                    cumulative_size=cumulative_size,
                )
                if expanded is not None:
                    to_stack.append(expanded)
                    cumulative_size += expanded.shape[1]
            if len(to_stack) == 0:
                # edge case: deal with empty matrix
                XP = sparse.csr_matrix((n_samples, 0), dtype=X.dtype)
            else:
                # `scipy.sparse.hstack` breaks in scipy<1.9.2
                # when `n_output_features_ > max_int32`
                all_int32 = all(mat.indices.dtype == np.int32 for mat in to_stack)
                if (
                    sp_version < parse_version("1.9.2")
                    and self.n_output_features_ > max_int32
                    and all_int32
                ):
                    raise ValueError(  # pragma: no cover
                        "In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`"
                        " produces negative columns when:\n1. The output shape contains"
                        " `n_cols` too large to be represented by a 32bit signed"
                        " integer.\n2. All sub-matrices to be stacked have indices of"
                        " dtype `np.int32`.\nTo avoid this error, either use a version"
                        " of scipy `>=1.9.2` or alter the `PolynomialFeatures`"
                        " transformer to produce fewer than 2^31 output features"
                    )
                XP = sparse.hstack(to_stack, dtype=X.dtype, format="csr")
        elif sparse.issparse(X) and X.format == "csc" and self._max_degree < 4:
            return self.transform(X.tocsr()).tocsc()
        elif sparse.issparse(X):
            combinations = self._combinations(
                n_features=n_features,
                min_degree=self._min_degree,
                max_degree=self._max_degree,
                interaction_only=self.interaction_only,
                include_bias=self.include_bias,
            )
            columns = []
            for combi in combinations:
                if combi:
                    out_col = 1
                    for col_idx in combi:
                        out_col = X[:, [col_idx]].multiply(out_col)
                    columns.append(out_col)
                else:
                    bias = sparse.csc_matrix(np.ones((X.shape[0], 1)))
                    columns.append(bias)
            XP = sparse.hstack(columns, dtype=X.dtype).tocsc()
        else:
            # Do as if _min_degree = 0 and cut down array after the
            # computation, i.e. use _n_out_full instead of n_output_features_.
            XP = np.empty(
                shape=(n_samples, self._n_out_full), dtype=X.dtype, order=self.order
            )

            # What follows is a faster implementation of:
            # for i, comb in enumerate(combinations):
            #     XP[:, i] = X[:, comb].prod(1)
            # This implementation uses two optimisations.
            # First one is broadcasting,
            # multiply ([X1, ..., Xn], X1) -> [X1 X1, ..., Xn X1]
            # multiply ([X2, ..., Xn], X2) -> [X2 X2, ..., Xn X2]
            # ...
            # multiply (X[:, start:end], X[:, start]) -> ...
            # Second optimisation happens for degrees >= 3.
            # Xi^3 is computed reusing previous computation:
            # Xi^3 = Xi^2 * Xi.
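            # For example (illustration): with n_features=2 and degree 2, the
            # degree-1 block [X0, X1] is multiplied by X0 to produce
            # [X0^2, X0 X1], then by X1 (starting one column later, so X0 X1
            # is not recomputed) to produce [X1^2].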

            # degree 0 term
            if self.include_bias:
                XP[:, 0] = 1
                current_col = 1
            else:
                current_col = 0

            if self._max_degree == 0:
                return XP

            # degree 1 term
            XP[:, current_col : current_col + n_features] = X
            index = list(range(current_col, current_col + n_features))
            current_col += n_features
            index.append(current_col)

            # loop over degree >= 2 terms
            for _ in range(2, self._max_degree + 1):
                new_index = []
                end = index[-1]
                for feature_idx in range(n_features):
                    start = index[feature_idx]
                    new_index.append(current_col)
                    if self.interaction_only:
                        start += index[feature_idx + 1] - index[feature_idx]
                    next_col = current_col + end - start
                    if next_col <= current_col:
                        break
                    # XP[:, start:end] are terms of degree d - 1
                    # that exclude feature #feature_idx.
                    np.multiply(
                        XP[:, start:end],
                        X[:, feature_idx : feature_idx + 1],
                        out=XP[:, current_col:next_col],
                        casting="no",
                    )
                    current_col = next_col

                new_index.append(current_col)
                index = new_index

            if self._min_degree > 1:
                n_XP, n_Xout = self._n_out_full, self.n_output_features_
                if self.include_bias:
                    Xout = np.empty(
                        shape=(n_samples, n_Xout), dtype=XP.dtype, order=self.order
                    )
                    Xout[:, 0] = 1
                    Xout[:, 1:] = XP[:, n_XP - n_Xout + 1 :]
                else:
                    Xout = XP[:, n_XP - n_Xout :].copy()
                XP = Xout
        return XP


class SplineTransformer(TransformerMixin, BaseEstimator):
    """Generate univariate B-spline bases for features.

    Generate a new feature matrix consisting of
    `n_splines=n_knots + degree - 1` (`n_knots - 1` for
    `extrapolation="periodic"`) spline basis functions
    (B-splines) of polynomial order=`degree` for each feature.

    To learn more about the `SplineTransformer` class, see
    :ref:`sphx_glr_auto_examples_applications_plot_cyclical_feature_engineering.py`.

    Read more in the :ref:`User Guide <spline_transformer>`.

    .. versionadded:: 1.0

    Parameters
    ----------
    n_knots : int, default=5
        Number of knots of the splines if `knots` equals one of
        {'uniform', 'quantile'}. Must be larger or equal to 2. Ignored if
        `knots` is array-like.

    degree : int, default=3
        The polynomial degree of the spline basis. Must be a non-negative
        integer.

    knots : {'uniform', 'quantile'} or array-like of shape \
        (n_knots, n_features), default='uniform'
        Set knot positions such that first knot <= features <= last knot.

        - If 'uniform', `n_knots` number of knots are distributed uniformly
          from min to max values of the features.
        - If 'quantile', they are distributed uniformly along the quantiles of
          the features.
        - If an array-like is given, it directly specifies the sorted knot
          positions including the boundary knots. Note that, internally,
          `degree` number of knots are added before the first knot, the same
          after the last knot.

    extrapolation : {'error', 'constant', 'linear', 'continue', 'periodic'}, \
        default='constant'
        If 'error', values outside the min and max values of the training
        features raise a `ValueError`. If 'constant', the value of the
        splines at minimum and maximum value of the features is used as
        constant extrapolation. If 'linear', a linear extrapolation is used.
        If 'continue', the splines are extrapolated as is, i.e. option
        `extrapolate=True` in :class:`scipy.interpolate.BSpline`. If
        'periodic', periodic splines with a periodicity equal to the distance
        between the first and last knot are used. Periodic splines enforce
        equal function values and derivatives at the first and last knot.
        For example, this makes it possible to avoid introducing an arbitrary
        jump between Dec 31st and Jan 1st in spline features derived from a
        naturally periodic "day-of-year" input feature. In this case it is
        recommended to manually set the knot values to control the period.

    include_bias : bool, default=True
        If False, then the last spline element inside the data range
        of a feature is dropped. As B-splines sum to one over the spline basis
        functions for each data point, they implicitly include a bias term,
        i.e. a column of ones. It acts as an intercept term in a linear model.

    order : {'C', 'F'}, default='C'
        Order of output array in the dense case. `'F'` order is faster to compute, but
        may slow down subsequent estimators.

    sparse_output : bool, default=False
        Will return sparse CSR matrix if set True else will return an array. This
        option is only available with `scipy>=1.8`.

        .. versionadded:: 1.2

    Attributes
    ----------
    bsplines_ : list of shape (n_features,)
        List of BSpline objects, one for each feature.

    n_features_in_ : int
        The total number of input features.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_features_out_ : int
        The total number of output features, which is computed as
        `n_features * n_splines`, where `n_splines` is
        the number of basis elements of the B-splines,
        `n_knots + degree - 1` for non-periodic splines and
        `n_knots - 1` for periodic ones.
        If `include_bias=False`, then it is only
        `n_features * (n_splines - 1)`.

    See Also
    --------
    KBinsDiscretizer : Transformer that bins continuous data into intervals.

    PolynomialFeatures : Transformer that generates polynomial and interaction
        features.

    Notes
    -----
    High degrees and a high number of knots can cause overfitting.

    See :ref:`examples/linear_model/plot_polynomial_interpolation.py
    <sphx_glr_auto_examples_linear_model_plot_polynomial_interpolation.py>`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.preprocessing import SplineTransformer
    >>> X = np.arange(6).reshape(6, 1)
    >>> spline = SplineTransformer(degree=2, n_knots=3)
    >>> spline.fit_transform(X)
    array([[0.5 , 0.5 , 0.  , 0.  ],
           [0.18, 0.74, 0.08, 0.  ],
           [0.02, 0.66, 0.32, 0.  ],
           [0.  , 0.32, 0.66, 0.02],
           [0.  , 0.08, 0.74, 0.18],
           [0.  , 0.  , 0.5 , 0.5 ]])
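
    With `extrapolation="periodic"`, the basis wraps around and there are only
    `n_knots - 1` basis elements per feature; a small sketch showing just the
    output shape:

    >>> spline = SplineTransformer(degree=2, n_knots=3, extrapolation="periodic")
    >>> spline.fit_transform(X).shape
    (6, 2)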
700 """
701
702 _parameter_constraints: dict = {
703 "n_knots": [Interval(Integral, 2, None, closed="left")],
704 "degree": [Interval(Integral, 0, None, closed="left")],
705 "knots": [StrOptions({"uniform", "quantile"}), "array-like"],
706 "extrapolation": [
707 StrOptions({"error", "constant", "linear", "continue", "periodic"})
708 ],
709 "include_bias": ["boolean"],
710 "order": [StrOptions({"C", "F"})],
711 "sparse_output": ["boolean"],
712 }
713
714 def __init__(
715 self,
716 n_knots=5,
717 degree=3,
718 *,
719 knots="uniform",
720 extrapolation="constant",
721 include_bias=True,
722 order="C",
723 sparse_output=False,
724 ):
725 self.n_knots = n_knots
726 self.degree = degree
727 self.knots = knots
728 self.extrapolation = extrapolation
729 self.include_bias = include_bias
730 self.order = order
731 self.sparse_output = sparse_output
732
733 @staticmethod
734 def _get_base_knot_positions(X, n_knots=10, knots="uniform", sample_weight=None):
735 """Calculate base knot positions.
736
737 Base knots such that first knot <= feature <= last knot. For the
738 B-spline construction with scipy.interpolate.BSpline, 2*degree knots
739 beyond the base interval are added.
740
741 Returns
742 -------
743 knots : ndarray of shape (n_knots, n_features), dtype=np.float64
744 Knot positions (points) of base interval.
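
        For example, with `knots="uniform"`, `n_knots=3` and a single feature
        ranging from 0 to 5, the base knots are `[0.0, 2.5, 5.0]`.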
745 """
746 if knots == "quantile":
747 percentiles = 100 * np.linspace(
748 start=0, stop=1, num=n_knots, dtype=np.float64
749 )
750
751 if sample_weight is None:
752 knots = np.percentile(X, percentiles, axis=0)
753 else:
754 knots = np.array(
755 [
756 _weighted_percentile(X, sample_weight, percentile)
757 for percentile in percentiles
758 ]
759 )
760
761 else:
762 # knots == 'uniform':
763 # Note that the variable `knots` has already been validated and
764 # `else` is therefore safe.
765 # Disregard observations with zero weight.
766 mask = slice(None, None, 1) if sample_weight is None else sample_weight > 0
767 x_min = np.amin(X[mask], axis=0)
768 x_max = np.amax(X[mask], axis=0)
769
770 knots = np.linspace(
771 start=x_min,
772 stop=x_max,
773 num=n_knots,
774 endpoint=True,
775 dtype=np.float64,
776 )
777
778 return knots

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
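
        Examples
        --------
        A brief sketch of the naming pattern for a single feature with four
        basis elements:

        >>> import numpy as np
        >>> from sklearn.preprocessing import SplineTransformer
        >>> X = np.arange(6).reshape(6, 1)
        >>> spline = SplineTransformer(degree=2, n_knots=3).fit(X)
        >>> spline.get_feature_names_out()
        array(['x0_sp_0', 'x0_sp_1', 'x0_sp_2', 'x0_sp_3'], dtype=object)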
799 """
800 check_is_fitted(self, "n_features_in_")
801 n_splines = self.bsplines_[0].c.shape[1]
802
803 input_features = _check_feature_names_in(self, input_features)
804 feature_names = []
805 for i in range(self.n_features_in_):
806 for j in range(n_splines - 1 + self.include_bias):
807 feature_names.append(f"{input_features[i]}_sp_{j}")
808 return np.asarray(feature_names, dtype=object)

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None, sample_weight=None):
        """Compute knot positions of splines.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data.

        y : None
            Ignored.

        sample_weight : array-like of shape (n_samples,), default=None
            Individual weights for each sample. Used to calculate quantiles if
            `knots="quantile"`. For `knots="uniform"`, zero weighted
            observations are ignored for finding the min and max of `X`.

        Returns
        -------
        self : object
            Fitted transformer.
        """
        X = self._validate_data(
            X,
            reset=True,
            accept_sparse=False,
            ensure_min_samples=2,
            ensure_2d=True,
        )
        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

        _, n_features = X.shape

        if isinstance(self.knots, str):
            base_knots = self._get_base_knot_positions(
                X, n_knots=self.n_knots, knots=self.knots, sample_weight=sample_weight
            )
        else:
            base_knots = check_array(self.knots, dtype=np.float64)
            if base_knots.shape[0] < 2:
                raise ValueError("Number of knots, knots.shape[0], must be >= 2.")
            elif base_knots.shape[1] != n_features:
                raise ValueError("knots.shape[1] == n_features is violated.")
            elif not np.all(np.diff(base_knots, axis=0) > 0):
                raise ValueError("knots must be sorted without duplicates.")

        if self.sparse_output and sp_version < parse_version("1.8.0"):
            raise ValueError(
                "Option sparse_output=True is only available with scipy>=1.8.0, "
                f"but here scipy=={sp_version} is used."
            )

        # number of knots for base interval
        n_knots = base_knots.shape[0]

        if self.extrapolation == "periodic" and n_knots <= self.degree:
            raise ValueError(
                "Periodic splines require degree < n_knots. Got n_knots="
                f"{n_knots} and degree={self.degree}."
            )

        # number of spline basis functions
        if self.extrapolation != "periodic":
            n_splines = n_knots + self.degree - 1
        else:
            # periodic splines have self.degree less degrees of freedom
            n_splines = n_knots - 1

        degree = self.degree
        n_out = n_features * n_splines
        # We have to add degree number of knots below, and degree number knots
        # above the base knots in order to make the spline basis complete.
        if self.extrapolation == "periodic":
            # For periodic splines the spacing of the first / last degree knots
            # needs to be a continuation of the spacing of the last / first
            # base knots.
            period = base_knots[-1] - base_knots[0]
            knots = np.r_[
                base_knots[-(degree + 1) : -1] - period,
                base_knots,
                base_knots[1 : (degree + 1)] + period,
            ]
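            # For example (illustration): degree=2 and a single feature with
            # base knots [0, 2.5, 5] (period 5) give
            # knots = [-5, -2.5, 0, 2.5, 5, 7.5, 10].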

        else:
            # Eilers & Marx in "Flexible smoothing with B-splines and
            # penalties" https://doi.org/10.1214/ss/1038425655 advise
            # against repeating first and last knot several times, which
            # would have inferior behaviour at boundaries if combined with
            # a penalty (hence P-Spline). We follow this advice even if our
            # splines are unpenalized. Meaning we do not:
            # knots = np.r_[
            #     np.tile(base_knots.min(axis=0), reps=[degree, 1]),
            #     base_knots,
            #     np.tile(base_knots.max(axis=0), reps=[degree, 1])
            # ]
            # Instead, we reuse the distance of the 2 first/last knots.
            dist_min = base_knots[1] - base_knots[0]
            dist_max = base_knots[-1] - base_knots[-2]

            knots = np.r_[
                np.linspace(
                    base_knots[0] - degree * dist_min,
                    base_knots[0] - dist_min,
                    num=degree,
                ),
                base_knots,
                np.linspace(
                    base_knots[-1] + dist_max,
                    base_knots[-1] + degree * dist_max,
                    num=degree,
                ),
            ]

        # With a diagonal coefficient matrix, we get back the spline basis
        # elements, i.e. the design matrix of the spline.
        # Note that BSpline expects a C-contiguous float64 array as c=coef.
        coef = np.eye(n_splines, dtype=np.float64)
        if self.extrapolation == "periodic":
            coef = np.concatenate((coef, coef[:degree, :]))
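            # For example (illustration): n_splines=2 and degree=2 give a
            # coefficient matrix of shape (4, 2); repeating the first `degree`
            # rows makes the last basis functions wrap around to the first.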

        extrapolate = self.extrapolation in ["periodic", "continue"]

        bsplines = [
            BSpline.construct_fast(
                knots[:, i], coef, self.degree, extrapolate=extrapolate
            )
            for i in range(n_features)
        ]
        self.bsplines_ = bsplines

        self.n_features_out_ = n_out - n_features * (1 - self.include_bias)
        return self

    def transform(self, X):
        """Transform each feature data to B-splines.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to transform.

        Returns
        -------
        XBS : {ndarray, sparse matrix} of shape (n_samples, n_features * n_splines)
            The matrix of features, where n_splines is the number of basis
            elements of the B-splines, n_knots + degree - 1.
        """
        check_is_fitted(self)

        X = self._validate_data(X, reset=False, accept_sparse=False, ensure_2d=True)

        n_samples, n_features = X.shape
        n_splines = self.bsplines_[0].c.shape[1]
        degree = self.degree

        # TODO: Remove this condition once scipy 1.10 is the minimum version.
        # Only scipy >= 1.10 supports design_matrix(.., extrapolate=..).
        # The default (implicit in scipy < 1.10) is extrapolate=False.
        scipy_1_10 = sp_version >= parse_version("1.10.0")
        # Note: self.bsplines_[0].extrapolate is True for extrapolation in
        # ["periodic", "continue"]
        if scipy_1_10:
            use_sparse = self.sparse_output
            kwargs_extrapolate = {"extrapolate": self.bsplines_[0].extrapolate}
        else:
            use_sparse = self.sparse_output and not self.bsplines_[0].extrapolate
            kwargs_extrapolate = dict()

        # Note that scipy BSpline returns float64 arrays and converts input
        # x=X[:, i] to c-contiguous float64.
        n_out = self.n_features_out_ + n_features * (1 - self.include_bias)
        if X.dtype in FLOAT_DTYPES:
            dtype = X.dtype
        else:
            dtype = np.float64
        if use_sparse:
            output_list = []
        else:
            XBS = np.zeros((n_samples, n_out), dtype=dtype, order=self.order)

        for i in range(n_features):
            spl = self.bsplines_[i]

            if self.extrapolation in ("continue", "error", "periodic"):
                if self.extrapolation == "periodic":
                    # With periodic extrapolation we map x to the segment
                    # [spl.t[k], spl.t[n]].
                    # This is equivalent to BSpline(.., extrapolate="periodic")
                    # for scipy>=1.0.0.
                    n = spl.t.size - spl.k - 1
                    # Assign to new array to avoid inplace operation
                    x = spl.t[spl.k] + (X[:, i] - spl.t[spl.k]) % (
                        spl.t[n] - spl.t[spl.k]
                    )
                else:
                    x = X[:, i]

                if use_sparse:
                    XBS_sparse = BSpline.design_matrix(
                        x, spl.t, spl.k, **kwargs_extrapolate
                    )
                    if self.extrapolation == "periodic":
                        # See the construction of coef in fit. We need to add the last
                        # degree spline basis function to the first degree ones and
                        # then drop the last ones.
                        # Note: See comment about SparseEfficiencyWarning below.
                        XBS_sparse = XBS_sparse.tolil()
                        XBS_sparse[:, :degree] += XBS_sparse[:, -degree:]
                        XBS_sparse = XBS_sparse[:, :-degree]
                else:
                    XBS[:, (i * n_splines) : ((i + 1) * n_splines)] = spl(x)
            else:  # extrapolation in ("constant", "linear")
                xmin, xmax = spl.t[degree], spl.t[-degree - 1]
                # spline values at boundaries
                f_min, f_max = spl(xmin), spl(xmax)
                mask = (xmin <= X[:, i]) & (X[:, i] <= xmax)
                if use_sparse:
                    mask_inv = ~mask
                    x = X[:, i].copy()
                    # Set some arbitrary values outside boundary that will be reassigned
                    # later.
                    x[mask_inv] = spl.t[self.degree]
                    XBS_sparse = BSpline.design_matrix(x, spl.t, spl.k)
                    # Note: Without converting to lil_matrix we would get:
                    # scipy.sparse._base.SparseEfficiencyWarning: Changing the sparsity
                    # structure of a csr_matrix is expensive. lil_matrix is more
                    # efficient.
                    if np.any(mask_inv):
                        XBS_sparse = XBS_sparse.tolil()
                        XBS_sparse[mask_inv, :] = 0
                else:
                    XBS[mask, (i * n_splines) : ((i + 1) * n_splines)] = spl(X[mask, i])

            # Note for extrapolation:
            # 'continue' is already returned as is by scipy BSplines
            if self.extrapolation == "error":
                # BSpline with extrapolate=False does not raise an error, but
                # outputs np.nan.
                if (use_sparse and np.any(np.isnan(XBS_sparse.data))) or (
                    not use_sparse
                    and np.any(
                        np.isnan(XBS[:, (i * n_splines) : ((i + 1) * n_splines)])
                    )
                ):
                    raise ValueError(
                        "X contains values beyond the limits of the knots."
                    )
            elif self.extrapolation == "constant":
                # Set all values beyond xmin and xmax to the value of the
                # spline basis functions at those two positions.
                # Only the first degree and last degree number of splines
                # have non-zero values at the boundaries.

                mask = X[:, i] < xmin
                if np.any(mask):
                    if use_sparse:
                        # Note: See comment about SparseEfficiencyWarning above.
                        XBS_sparse = XBS_sparse.tolil()
                        XBS_sparse[mask, :degree] = f_min[:degree]

                    else:
                        XBS[mask, (i * n_splines) : (i * n_splines + degree)] = f_min[
                            :degree
                        ]

                mask = X[:, i] > xmax
                if np.any(mask):
                    if use_sparse:
                        # Note: See comment about SparseEfficiencyWarning above.
                        XBS_sparse = XBS_sparse.tolil()
                        XBS_sparse[mask, -degree:] = f_max[-degree:]
                    else:
                        XBS[
                            mask,
                            ((i + 1) * n_splines - degree) : ((i + 1) * n_splines),
                        ] = f_max[-degree:]

            elif self.extrapolation == "linear":
                # Continue the degree first and degree last spline bases
                # linearly beyond the boundaries, with slope = derivative at
                # the boundary.
                # Note that all others have derivative = value = 0 at the
                # boundaries.

                # spline derivatives = slopes at boundaries
                fp_min, fp_max = spl(xmin, nu=1), spl(xmax, nu=1)
                # Compute the linear continuation.
                if degree <= 1:
                    # For degree=1, the derivative of 2nd spline is not zero at
                    # boundary. For degree=0 it is the same as 'constant'.
                    degree += 1
                for j in range(degree):
                    mask = X[:, i] < xmin
                    if np.any(mask):
                        linear_extr = f_min[j] + (X[mask, i] - xmin) * fp_min[j]
                        if use_sparse:
                            # Note: See comment about SparseEfficiencyWarning above.
                            XBS_sparse = XBS_sparse.tolil()
                            XBS_sparse[mask, j] = linear_extr
                        else:
                            XBS[mask, i * n_splines + j] = linear_extr

                    mask = X[:, i] > xmax
                    if np.any(mask):
                        k = n_splines - 1 - j
                        linear_extr = f_max[k] + (X[mask, i] - xmax) * fp_max[k]
                        if use_sparse:
                            # Note: See comment about SparseEfficiencyWarning above.
                            XBS_sparse = XBS_sparse.tolil()
                            XBS_sparse[mask, k : k + 1] = linear_extr[:, None]
                        else:
                            XBS[mask, i * n_splines + k] = linear_extr

            if use_sparse:
                XBS_sparse = XBS_sparse.tocsr()
                output_list.append(XBS_sparse)

        if use_sparse:
            # TODO: Remove this conditional error when the minimum supported version of
            # SciPy is 1.9.2
            # `scipy.sparse.hstack` breaks in scipy<1.9.2
            # when `n_features_out_ > max_int32`
            max_int32 = np.iinfo(np.int32).max
            all_int32 = True
            for mat in output_list:
                all_int32 &= mat.indices.dtype == np.int32
            if (
                sp_version < parse_version("1.9.2")
                and self.n_features_out_ > max_int32
                and all_int32
            ):
                raise ValueError(
                    "In scipy versions `<1.9.2`, the function `scipy.sparse.hstack`"
                    " produces negative columns when:\n1. The output shape contains"
                    " `n_cols` too large to be represented by a 32bit signed"
                    " integer.\n2. All sub-matrices to be stacked have indices of"
                    " dtype `np.int32`.\nTo avoid this error, either use a version"
                    " of scipy `>=1.9.2` or alter the `SplineTransformer`"
                    " transformer to produce fewer than 2^31 output features"
                )
            XBS = sparse.hstack(output_list, format="csr")
        elif self.sparse_output:
            # TODO: Remove once scipy 1.10 is the minimum version. See comments above.
            XBS = sparse.csr_matrix(XBS)

        if self.include_bias:
            return XBS
        else:
            # We throw away one spline basis per feature.
            # We chose the last one.
            indices = [j for j in range(XBS.shape[1]) if (j + 1) % n_splines != 0]
            return XBS[:, indices]

    def _more_tags(self):
        return {
            "_xfail_checks": {
                "check_estimators_pickle": (
                    "Current Scipy implementation of _bsplines does not"
                    " support const memory views."
                ),
            }
        }