Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scikit_learn-1.4.dev0-py3.8-linux-x86

1"""

2The :mod:`sklearn.utils.sparsefuncs` module includes a collection of utilities to

3work with sparse matrices and arrays.

4"""

6# Authors: Manoj Kumar

7# Thomas Unterthiner

8# Giorgio Patrini

10# License: BSD 3 clause

11import numpy as np

12import scipy.sparse as sp

13from scipy.sparse.linalg import LinearOperator

15from ..utils.fixes import _sparse_min_max, _sparse_nan_min_max

16from ..utils.validation import _check_sample_weight

17from .sparsefuncs_fast import (

18 csc_mean_variance_axis0 as _csc_mean_var_axis0,

19)

20from .sparsefuncs_fast import (

21 csr_mean_variance_axis0 as _csr_mean_var_axis0,

22)

23from .sparsefuncs_fast import (

24 incr_mean_variance_axis0 as _incr_mean_var_axis0,

25)

28def _raise_typeerror(X):

29 """Raises a TypeError if X is not a CSR or CSC matrix"""

30 input_type = X.format if sp.issparse(X) else type(X)

31 err = "Expected a CSR or CSC sparse matrix, got %s." % input_type

32 raise TypeError(err)

35def _raise_error_wrong_axis(axis):

36 if axis not in (0, 1):

37 raise ValueError(

38 "Unknown axis value: %d. Use 0 for rows, or 1 for columns" % axis

39 )

42def inplace_csr_column_scale(X, scale):

43 """Inplace column scaling of a CSR matrix.

45 Scale each feature of the data matrix by multiplying with specific scale

46 provided by the caller assuming a (n_samples, n_features) shape.

48 Parameters

49 ----------

50 X : sparse matrix of shape (n_samples, n_features)

51 Matrix to normalize using the variance of the features.

52 It should be of CSR format.

54 scale : ndarray of shape (n_features,), dtype={np.float32, np.float64}

55 Array of precomputed feature-wise values to use for scaling.

56 """

57 assert scale.shape[0] == X.shape[1]

58 X.data *= scale.take(X.indices, mode="clip")

61def inplace_csr_row_scale(X, scale):

62 """Inplace row scaling of a CSR matrix.

64 Scale each sample of the data matrix by multiplying with specific scale

65 provided by the caller assuming a (n_samples, n_features) shape.

67 Parameters

68 ----------

69 X : sparse matrix of shape (n_samples, n_features)

70 Matrix to be scaled. It should be of CSR format.

72 scale : ndarray of float of shape (n_samples,)

73 Array of precomputed sample-wise values to use for scaling.

74 """

75 assert scale.shape[0] == X.shape[0]

76 X.data *= np.repeat(scale, np.diff(X.indptr))

79def mean_variance_axis(X, axis, weights=None, return_sum_weights=False):

80 """Compute mean and variance along an axis on a CSR or CSC matrix.

82 Parameters

83 ----------

84 X : sparse matrix of shape (n_samples, n_features)

85 Input data. It can be of CSR or CSC format.

87 axis : {0, 1}

88 Axis along which the axis should be computed.

90 weights : ndarray of shape (n_samples,) or (n_features,), default=None

91 If axis is set to 0 shape is (n_samples,) or

92 if axis is set to 1 shape is (n_features,).

93 If it is set to None, then samples are equally weighted.

95 .. versionadded:: 0.24

97 return_sum_weights : bool, default=False

98 If True, returns the sum of weights seen for each feature

99 if `axis=0` or each sample if `axis=1`.

100

101 .. versionadded:: 0.24

102

103 Returns

104 -------

105

106 means : ndarray of shape (n_features,), dtype=floating

107 Feature-wise means.

108

109 variances : ndarray of shape (n_features,), dtype=floating

110 Feature-wise variances.

111

112 sum_weights : ndarray of shape (n_features,), dtype=floating

113 Returned if `return_sum_weights` is `True`.

114 """

115 _raise_error_wrong_axis(axis)

116

117 if sp.issparse(X) and X.format == "csr":

118 if axis == 0:

119 return _csr_mean_var_axis0(

120 X, weights=weights, return_sum_weights=return_sum_weights

121 )

122 else:

123 return _csc_mean_var_axis0(

124 X.T, weights=weights, return_sum_weights=return_sum_weights

125 )

126 elif sp.issparse(X) and X.format == "csc":

127 if axis == 0:

128 return _csc_mean_var_axis0(

129 X, weights=weights, return_sum_weights=return_sum_weights

130 )

131 else:

132 return _csr_mean_var_axis0(

133 X.T, weights=weights, return_sum_weights=return_sum_weights

134 )

135 else:

136 _raise_typeerror(X)

137

138

139def incr_mean_variance_axis(X, *, axis, last_mean, last_var, last_n, weights=None):

140 """Compute incremental mean and variance along an axis on a CSR or CSC matrix.

141

142 last_mean, last_var are the statistics computed at the last step by this

143 function. Both must be initialized to 0-arrays of the proper size, i.e.

144 the number of features in X. last_n is the number of samples encountered

145 until now.

146

147 Parameters

148 ----------

149 X : CSR or CSC sparse matrix of shape (n_samples, n_features)

150 Input data.

151

152 axis : {0, 1}

153 Axis along which the axis should be computed.

154

155 last_mean : ndarray of shape (n_features,) or (n_samples,), dtype=floating

156 Array of means to update with the new data X.

157 Should be of shape (n_features,) if axis=0 or (n_samples,) if axis=1.

158

159 last_var : ndarray of shape (n_features,) or (n_samples,), dtype=floating

160 Array of variances to update with the new data X.

161 Should be of shape (n_features,) if axis=0 or (n_samples,) if axis=1.

162

163 last_n : float or ndarray of shape (n_features,) or (n_samples,), \

164 dtype=floating

165 Sum of the weights seen so far, excluding the current weights

166 If not float, it should be of shape (n_features,) if

167 axis=0 or (n_samples,) if axis=1. If float it corresponds to

168 having same weights for all samples (or features).

169

170 weights : ndarray of shape (n_samples,) or (n_features,), default=None

171 If axis is set to 0 shape is (n_samples,) or

172 if axis is set to 1 shape is (n_features,).

173 If it is set to None, then samples are equally weighted.

174

175 .. versionadded:: 0.24

176

177 Returns

178 -------

179 means : ndarray of shape (n_features,) or (n_samples,), dtype=floating

180 Updated feature-wise means if axis = 0 or

181 sample-wise means if axis = 1.

182

183 variances : ndarray of shape (n_features,) or (n_samples,), dtype=floating

184 Updated feature-wise variances if axis = 0 or

185 sample-wise variances if axis = 1.

186

187 n : ndarray of shape (n_features,) or (n_samples,), dtype=integral

188 Updated number of seen samples per feature if axis=0

189 or number of seen features per sample if axis=1.

190

191 If weights is not None, n is a sum of the weights of the seen

192 samples or features instead of the actual number of seen

193 samples or features.

194

195 Notes

196 -----

197 NaNs are ignored in the algorithm.

198 """

199 _raise_error_wrong_axis(axis)

200

201 if not (sp.issparse(X) and X.format in ("csc", "csr")):

202 _raise_typeerror(X)

203

204 if np.size(last_n) == 1:

205 last_n = np.full(last_mean.shape, last_n, dtype=last_mean.dtype)

206

207 if not (np.size(last_mean) == np.size(last_var) == np.size(last_n)):

208 raise ValueError("last_mean, last_var, last_n do not have the same shapes.")

209

210 if axis == 1:

211 if np.size(last_mean) != X.shape[0]:

212 raise ValueError(

213 "If axis=1, then last_mean, last_n, last_var should be of "

214 f"size n_samples {X.shape[0]} (Got {np.size(last_mean)})."

215 )

216 else: # axis == 0

217 if np.size(last_mean) != X.shape[1]:

218 raise ValueError(

219 "If axis=0, then last_mean, last_n, last_var should be of "

220 f"size n_features {X.shape[1]} (Got {np.size(last_mean)})."

221 )

222

223 X = X.T if axis == 1 else X

224

225 if weights is not None:

226 weights = _check_sample_weight(weights, X, dtype=X.dtype)

227

228 return _incr_mean_var_axis0(

229 X, last_mean=last_mean, last_var=last_var, last_n=last_n, weights=weights

230 )

231

232

233def inplace_column_scale(X, scale):

234 """Inplace column scaling of a CSC/CSR matrix.

235

236 Scale each feature of the data matrix by multiplying with specific scale

237 provided by the caller assuming a (n_samples, n_features) shape.

238

239 Parameters

240 ----------

241 X : sparse matrix of shape (n_samples, n_features)

242 Matrix to normalize using the variance of the features. It should be

243 of CSC or CSR format.

244

245 scale : ndarray of shape (n_features,), dtype={np.float32, np.float64}

246 Array of precomputed feature-wise values to use for scaling.

247 """

248 if sp.issparse(X) and X.format == "csc":

249 inplace_csr_row_scale(X.T, scale)

250 elif sp.issparse(X) and X.format == "csr":

251 inplace_csr_column_scale(X, scale)

252 else:

253 _raise_typeerror(X)

254

255

256def inplace_row_scale(X, scale):

257 """Inplace row scaling of a CSR or CSC matrix.

258

259 Scale each row of the data matrix by multiplying with specific scale

260 provided by the caller assuming a (n_samples, n_features) shape.

261

262 Parameters

263 ----------

264 X : sparse matrix of shape (n_samples, n_features)

265 Matrix to be scaled. It should be of CSR or CSC format.

266

267 scale : ndarray of shape (n_features,), dtype={np.float32, np.float64}

268 Array of precomputed sample-wise values to use for scaling.

269 """

270 if sp.issparse(X) and X.format == "csc":

271 inplace_csr_column_scale(X.T, scale)

272 elif sp.issparse(X) and X.format == "csr":

273 inplace_csr_row_scale(X, scale)

274 else:

275 _raise_typeerror(X)

276

277

278def inplace_swap_row_csc(X, m, n):

279 """Swap two rows of a CSC matrix in-place.

280

281 Parameters

282 ----------

283 X : sparse matrix of shape (n_samples, n_features)

284 Matrix whose two rows are to be swapped. It should be of

285 CSC format.

286

287 m : int

288 Index of the row of X to be swapped.

289

290 n : int

291 Index of the row of X to be swapped.

292 """

293 for t in [m, n]:

294 if isinstance(t, np.ndarray):

295 raise TypeError("m and n should be valid integers")

296

297 if m < 0:

298 m += X.shape[0]

299 if n < 0:

300 n += X.shape[0]

301

302 m_mask = X.indices == m

303 X.indices[X.indices == n] = m

304 X.indices[m_mask] = n

305

306

307def inplace_swap_row_csr(X, m, n):

308 """Swap two rows of a CSR matrix in-place.

309

310 Parameters

311 ----------

312 X : sparse matrix of shape (n_samples, n_features)

313 Matrix whose two rows are to be swapped. It should be of

314 CSR format.

315

316 m : int

317 Index of the row of X to be swapped.

318

319 n : int

320 Index of the row of X to be swapped.

321 """

322 for t in [m, n]:

323 if isinstance(t, np.ndarray):

324 raise TypeError("m and n should be valid integers")

325

326 if m < 0:

327 m += X.shape[0]

328 if n < 0:

329 n += X.shape[0]

330

331 # The following swapping makes life easier since m is assumed to be the

332 # smaller integer below.

333 if m > n:

334 m, n = n, m

335

336 indptr = X.indptr

337 m_start = indptr[m]

338 m_stop = indptr[m + 1]

339 n_start = indptr[n]

340 n_stop = indptr[n + 1]

341 nz_m = m_stop - m_start

342 nz_n = n_stop - n_start

343

344 if nz_m != nz_n:

345 # Modify indptr first

346 X.indptr[m + 2 : n] += nz_n - nz_m

347 X.indptr[m + 1] = m_start + nz_n

348 X.indptr[n] = n_stop - nz_m

349

350 X.indices = np.concatenate(

351 [

352 X.indices[:m_start],

353 X.indices[n_start:n_stop],

354 X.indices[m_stop:n_start],

355 X.indices[m_start:m_stop],

356 X.indices[n_stop:],

357 ]

358 )

359 X.data = np.concatenate(

360 [

361 X.data[:m_start],

362 X.data[n_start:n_stop],

363 X.data[m_stop:n_start],

364 X.data[m_start:m_stop],

365 X.data[n_stop:],

366 ]

367 )

368

369

370def inplace_swap_row(X, m, n):

371 """

372 Swap two rows of a CSC/CSR matrix in-place.

373

374 Parameters

375 ----------

376 X : sparse matrix of shape (n_samples, n_features)

377 Matrix whose two rows are to be swapped. It should be of CSR or

378 CSC format.

379

380 m : int

381 Index of the row of X to be swapped.

382

383 n : int

384 Index of the row of X to be swapped.

385 """

386 if sp.issparse(X) and X.format == "csc":

387 inplace_swap_row_csc(X, m, n)

388 elif sp.issparse(X) and X.format == "csr":

389 inplace_swap_row_csr(X, m, n)

390 else:

391 _raise_typeerror(X)

392

393

394def inplace_swap_column(X, m, n):

395 """

396 Swap two columns of a CSC/CSR matrix in-place.

397

398 Parameters

399 ----------

400 X : sparse matrix of shape (n_samples, n_features)

401 Matrix whose two columns are to be swapped. It should be of

402 CSR or CSC format.

403

404 m : int

405 Index of the column of X to be swapped.

406

407 n : int

408 Index of the column of X to be swapped.

409 """

410 if m < 0:

411 m += X.shape[1]

412 if n < 0:

413 n += X.shape[1]

414 if sp.issparse(X) and X.format == "csc":

415 inplace_swap_row_csr(X, m, n)

416 elif sp.issparse(X) and X.format == "csr":

417 inplace_swap_row_csc(X, m, n)

418 else:

419 _raise_typeerror(X)

420

421

422def min_max_axis(X, axis, ignore_nan=False):

423 """Compute minimum and maximum along an axis on a CSR or CSC matrix.

424

425 Optionally ignore NaN values.

426

427 Parameters

428 ----------

429 X : sparse matrix of shape (n_samples, n_features)

430 Input data. It should be of CSR or CSC format.

431

432 axis : {0, 1}

433 Axis along which the axis should be computed.

434

435 ignore_nan : bool, default=False

436 Ignore or passing through NaN values.

437

438 .. versionadded:: 0.20

439

440 Returns

441 -------

442

443 mins : ndarray of shape (n_features,), dtype={np.float32, np.float64}

444 Feature-wise minima.

445

446 maxs : ndarray of shape (n_features,), dtype={np.float32, np.float64}

447 Feature-wise maxima.

448 """

449 if sp.issparse(X) and X.format in ("csr", "csc"):

450 if ignore_nan:

451 return _sparse_nan_min_max(X, axis=axis)

452 else:

453 return _sparse_min_max(X, axis=axis)

454 else:

455 _raise_typeerror(X)

456

457

458def count_nonzero(X, axis=None, sample_weight=None):

459 """A variant of X.getnnz() with extension to weighting on axis 0.

460

461 Useful in efficiently calculating multilabel metrics.

462

463 Parameters

464 ----------

465 X : sparse matrix of shape (n_samples, n_labels)

466 Input data. It should be of CSR format.

467

468 axis : {0, 1}, default=None

469 The axis on which the data is aggregated.

470

471 sample_weight : array-like of shape (n_samples,), default=None

472 Weight for each row of X.

473

474 Returns

475 -------

476 nnz : int, float, ndarray of shape (n_samples,) or ndarray of shape (n_features,)

477 Number of non-zero values in the array along a given axis. Otherwise,

478 the total number of non-zero values in the array is returned.

479 """

480 if axis == -1:

481 axis = 1

482 elif axis == -2:

483 axis = 0

484 elif X.format != "csr":

485 raise TypeError("Expected CSR sparse format, got {0}".format(X.format))

486

487 # We rely here on the fact that np.diff(Y.indptr) for a CSR

488 # will return the number of nonzero entries in each row.

489 # A bincount over Y.indices will return the number of nonzeros

490 # in each column. See ``csr_matrix.getnnz`` in scipy >= 0.14.

491 if axis is None:

492 if sample_weight is None:

493 return X.nnz

494 else:

495 return np.dot(np.diff(X.indptr), sample_weight)

496 elif axis == 1:

497 out = np.diff(X.indptr)

498 if sample_weight is None:

499 # astype here is for consistency with axis=0 dtype

500 return out.astype("intp")

501 return out * sample_weight

502 elif axis == 0:

503 if sample_weight is None:

504 return np.bincount(X.indices, minlength=X.shape[1])

505 else:

506 weights = np.repeat(sample_weight, np.diff(X.indptr))

507 return np.bincount(X.indices, minlength=X.shape[1], weights=weights)

508 else:

509 raise ValueError("Unsupported axis: {0}".format(axis))

510

511

512def _get_median(data, n_zeros):

513 """Compute the median of data with n_zeros additional zeros.

514

515 This function is used to support sparse matrices; it modifies data

516 in-place.

517 """

518 n_elems = len(data) + n_zeros

519 if not n_elems:

520 return np.nan

521 n_negative = np.count_nonzero(data < 0)

522 middle, is_odd = divmod(n_elems, 2)

523 data.sort()

524

525 if is_odd:

526 return _get_elem_at_rank(middle, data, n_negative, n_zeros)

527

528 return (

529 _get_elem_at_rank(middle - 1, data, n_negative, n_zeros)

530 + _get_elem_at_rank(middle, data, n_negative, n_zeros)

531 ) / 2.0

532

533

534def _get_elem_at_rank(rank, data, n_negative, n_zeros):

535 """Find the value in data augmented with n_zeros for the given rank"""

536 if rank < n_negative:

537 return data[rank]

538 if rank - n_negative < n_zeros:

539 return 0

540 return data[rank - n_zeros]

541

542

543def csc_median_axis_0(X):

544 """Find the median across axis 0 of a CSC matrix.

545

546 It is equivalent to doing np.median(X, axis=0).

547

548 Parameters

549 ----------

550 X : sparse matrix of shape (n_samples, n_features)

551 Input data. It should be of CSC format.

552

553 Returns

554 -------

555 median : ndarray of shape (n_features,)

556 Median.

557 """

558 if not (sp.issparse(X) and X.format == "csc"):

559 raise TypeError("Expected matrix of CSC format, got %s" % X.format)

560

561 indptr = X.indptr

562 n_samples, n_features = X.shape

563 median = np.zeros(n_features)

564

565 for f_ind, (start, end) in enumerate(zip(indptr[:-1], indptr[1:])):

566 # Prevent modifying X in place

567 data = np.copy(X.data[start:end])

568 nz = n_samples - data.size

569 median[f_ind] = _get_median(data, nz)

570

571 return median

572

573

574def _implicit_column_offset(X, offset):

575 """Create an implicitly offset linear operator.

576

577 This is used by PCA on sparse data to avoid densifying the whole data

578 matrix.

579

580 Params

581 ------

582 X : sparse matrix of shape (n_samples, n_features)

583 offset : ndarray of shape (n_features,)

584

585 Returns

586 -------

587 centered : LinearOperator

588 """

589 offset = offset[None, :]

590 XT = X.T

591 return LinearOperator(

592 matvec=lambda x: X @ x - offset @ x,

593 matmat=lambda x: X @ x - offset @ x,

594 rmatvec=lambda x: XT @ x - (offset * x.sum()),

595 rmatmat=lambda x: XT @ x - offset.T @ x.sum(axis=0)[None, :],

596 dtype=X.dtype,

597 shape=X.shape,

598 )

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scikit_learn-1.4.dev0-py3.8-linux-x86_64.egg/sklearn/utils/sparsefuncs.py: 15%

170 statements