Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/scikit_learn-1.4.dev0-py3.8-linux-x86_64.egg/sklearn/utils/sparsefuncs.py: 15%

170 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-12 06:31 +0000

1""" 

2The :mod:`sklearn.utils.sparsefuncs` module includes a collection of utilities to 

3work with sparse matrices and arrays. 

4""" 

5 

6# Authors: Manoj Kumar 

7# Thomas Unterthiner 

8# Giorgio Patrini 

9# 

10# License: BSD 3 clause 

11import numpy as np 

12import scipy.sparse as sp 

13from scipy.sparse.linalg import LinearOperator 

14 

15from ..utils.fixes import _sparse_min_max, _sparse_nan_min_max 

16from ..utils.validation import _check_sample_weight 

17from .sparsefuncs_fast import ( 

18 csc_mean_variance_axis0 as _csc_mean_var_axis0, 

19) 

20from .sparsefuncs_fast import ( 

21 csr_mean_variance_axis0 as _csr_mean_var_axis0, 

22) 

23from .sparsefuncs_fast import ( 

24 incr_mean_variance_axis0 as _incr_mean_var_axis0, 

25) 

26 

27 

28def _raise_typeerror(X): 

29 """Raises a TypeError if X is not a CSR or CSC matrix""" 

30 input_type = X.format if sp.issparse(X) else type(X) 

31 err = "Expected a CSR or CSC sparse matrix, got %s." % input_type 

32 raise TypeError(err) 

33 

34 

35def _raise_error_wrong_axis(axis): 

36 if axis not in (0, 1): 

37 raise ValueError( 

38 "Unknown axis value: %d. Use 0 for rows, or 1 for columns" % axis 

39 ) 

40 

41 

42def inplace_csr_column_scale(X, scale): 

43 """Inplace column scaling of a CSR matrix. 

44 

45 Scale each feature of the data matrix by multiplying with specific scale 

46 provided by the caller assuming a (n_samples, n_features) shape. 

47 

48 Parameters 

49 ---------- 

50 X : sparse matrix of shape (n_samples, n_features) 

51 Matrix to normalize using the variance of the features. 

52 It should be of CSR format. 

53 

54 scale : ndarray of shape (n_features,), dtype={np.float32, np.float64} 

55 Array of precomputed feature-wise values to use for scaling. 

56 """ 

57 assert scale.shape[0] == X.shape[1] 

58 X.data *= scale.take(X.indices, mode="clip") 

59 

60 

61def inplace_csr_row_scale(X, scale): 

62 """Inplace row scaling of a CSR matrix. 

63 

64 Scale each sample of the data matrix by multiplying with specific scale 

65 provided by the caller assuming a (n_samples, n_features) shape. 

66 

67 Parameters 

68 ---------- 

69 X : sparse matrix of shape (n_samples, n_features) 

70 Matrix to be scaled. It should be of CSR format. 

71 

72 scale : ndarray of float of shape (n_samples,) 

73 Array of precomputed sample-wise values to use for scaling. 

74 """ 

75 assert scale.shape[0] == X.shape[0] 

76 X.data *= np.repeat(scale, np.diff(X.indptr)) 

77 

78 

79def mean_variance_axis(X, axis, weights=None, return_sum_weights=False): 

80 """Compute mean and variance along an axis on a CSR or CSC matrix. 

81 

82 Parameters 

83 ---------- 

84 X : sparse matrix of shape (n_samples, n_features) 

85 Input data. It can be of CSR or CSC format. 

86 

87 axis : {0, 1} 

88 Axis along which the axis should be computed. 

89 

90 weights : ndarray of shape (n_samples,) or (n_features,), default=None 

91 If axis is set to 0 shape is (n_samples,) or 

92 if axis is set to 1 shape is (n_features,). 

93 If it is set to None, then samples are equally weighted. 

94 

95 .. versionadded:: 0.24 

96 

97 return_sum_weights : bool, default=False 

98 If True, returns the sum of weights seen for each feature 

99 if `axis=0` or each sample if `axis=1`. 

100 

101 .. versionadded:: 0.24 

102 

103 Returns 

104 ------- 

105 

106 means : ndarray of shape (n_features,), dtype=floating 

107 Feature-wise means. 

108 

109 variances : ndarray of shape (n_features,), dtype=floating 

110 Feature-wise variances. 

111 

112 sum_weights : ndarray of shape (n_features,), dtype=floating 

113 Returned if `return_sum_weights` is `True`. 

114 """ 

115 _raise_error_wrong_axis(axis) 

116 

117 if sp.issparse(X) and X.format == "csr": 

118 if axis == 0: 

119 return _csr_mean_var_axis0( 

120 X, weights=weights, return_sum_weights=return_sum_weights 

121 ) 

122 else: 

123 return _csc_mean_var_axis0( 

124 X.T, weights=weights, return_sum_weights=return_sum_weights 

125 ) 

126 elif sp.issparse(X) and X.format == "csc": 

127 if axis == 0: 

128 return _csc_mean_var_axis0( 

129 X, weights=weights, return_sum_weights=return_sum_weights 

130 ) 

131 else: 

132 return _csr_mean_var_axis0( 

133 X.T, weights=weights, return_sum_weights=return_sum_weights 

134 ) 

135 else: 

136 _raise_typeerror(X) 

137 

138 

139def incr_mean_variance_axis(X, *, axis, last_mean, last_var, last_n, weights=None): 

140 """Compute incremental mean and variance along an axis on a CSR or CSC matrix. 

141 

142 last_mean, last_var are the statistics computed at the last step by this 

143 function. Both must be initialized to 0-arrays of the proper size, i.e. 

144 the number of features in X. last_n is the number of samples encountered 

145 until now. 

146 

147 Parameters 

148 ---------- 

149 X : CSR or CSC sparse matrix of shape (n_samples, n_features) 

150 Input data. 

151 

152 axis : {0, 1} 

153 Axis along which the axis should be computed. 

154 

155 last_mean : ndarray of shape (n_features,) or (n_samples,), dtype=floating 

156 Array of means to update with the new data X. 

157 Should be of shape (n_features,) if axis=0 or (n_samples,) if axis=1. 

158 

159 last_var : ndarray of shape (n_features,) or (n_samples,), dtype=floating 

160 Array of variances to update with the new data X. 

161 Should be of shape (n_features,) if axis=0 or (n_samples,) if axis=1. 

162 

163 last_n : float or ndarray of shape (n_features,) or (n_samples,), \ 

164 dtype=floating 

165 Sum of the weights seen so far, excluding the current weights 

166 If not float, it should be of shape (n_features,) if 

167 axis=0 or (n_samples,) if axis=1. If float it corresponds to 

168 having same weights for all samples (or features). 

169 

170 weights : ndarray of shape (n_samples,) or (n_features,), default=None 

171 If axis is set to 0 shape is (n_samples,) or 

172 if axis is set to 1 shape is (n_features,). 

173 If it is set to None, then samples are equally weighted. 

174 

175 .. versionadded:: 0.24 

176 

177 Returns 

178 ------- 

179 means : ndarray of shape (n_features,) or (n_samples,), dtype=floating 

180 Updated feature-wise means if axis = 0 or 

181 sample-wise means if axis = 1. 

182 

183 variances : ndarray of shape (n_features,) or (n_samples,), dtype=floating 

184 Updated feature-wise variances if axis = 0 or 

185 sample-wise variances if axis = 1. 

186 

187 n : ndarray of shape (n_features,) or (n_samples,), dtype=integral 

188 Updated number of seen samples per feature if axis=0 

189 or number of seen features per sample if axis=1. 

190 

191 If weights is not None, n is a sum of the weights of the seen 

192 samples or features instead of the actual number of seen 

193 samples or features. 

194 

195 Notes 

196 ----- 

197 NaNs are ignored in the algorithm. 

198 """ 

199 _raise_error_wrong_axis(axis) 

200 

201 if not (sp.issparse(X) and X.format in ("csc", "csr")): 

202 _raise_typeerror(X) 

203 

204 if np.size(last_n) == 1: 

205 last_n = np.full(last_mean.shape, last_n, dtype=last_mean.dtype) 

206 

207 if not (np.size(last_mean) == np.size(last_var) == np.size(last_n)): 

208 raise ValueError("last_mean, last_var, last_n do not have the same shapes.") 

209 

210 if axis == 1: 

211 if np.size(last_mean) != X.shape[0]: 

212 raise ValueError( 

213 "If axis=1, then last_mean, last_n, last_var should be of " 

214 f"size n_samples {X.shape[0]} (Got {np.size(last_mean)})." 

215 ) 

216 else: # axis == 0 

217 if np.size(last_mean) != X.shape[1]: 

218 raise ValueError( 

219 "If axis=0, then last_mean, last_n, last_var should be of " 

220 f"size n_features {X.shape[1]} (Got {np.size(last_mean)})." 

221 ) 

222 

223 X = X.T if axis == 1 else X 

224 

225 if weights is not None: 

226 weights = _check_sample_weight(weights, X, dtype=X.dtype) 

227 

228 return _incr_mean_var_axis0( 

229 X, last_mean=last_mean, last_var=last_var, last_n=last_n, weights=weights 

230 ) 

231 

232 

233def inplace_column_scale(X, scale): 

234 """Inplace column scaling of a CSC/CSR matrix. 

235 

236 Scale each feature of the data matrix by multiplying with specific scale 

237 provided by the caller assuming a (n_samples, n_features) shape. 

238 

239 Parameters 

240 ---------- 

241 X : sparse matrix of shape (n_samples, n_features) 

242 Matrix to normalize using the variance of the features. It should be 

243 of CSC or CSR format. 

244 

245 scale : ndarray of shape (n_features,), dtype={np.float32, np.float64} 

246 Array of precomputed feature-wise values to use for scaling. 

247 """ 

248 if sp.issparse(X) and X.format == "csc": 

249 inplace_csr_row_scale(X.T, scale) 

250 elif sp.issparse(X) and X.format == "csr": 

251 inplace_csr_column_scale(X, scale) 

252 else: 

253 _raise_typeerror(X) 

254 

255 

256def inplace_row_scale(X, scale): 

257 """Inplace row scaling of a CSR or CSC matrix. 

258 

259 Scale each row of the data matrix by multiplying with specific scale 

260 provided by the caller assuming a (n_samples, n_features) shape. 

261 

262 Parameters 

263 ---------- 

264 X : sparse matrix of shape (n_samples, n_features) 

265 Matrix to be scaled. It should be of CSR or CSC format. 

266 

267 scale : ndarray of shape (n_features,), dtype={np.float32, np.float64} 

268 Array of precomputed sample-wise values to use for scaling. 

269 """ 

270 if sp.issparse(X) and X.format == "csc": 

271 inplace_csr_column_scale(X.T, scale) 

272 elif sp.issparse(X) and X.format == "csr": 

273 inplace_csr_row_scale(X, scale) 

274 else: 

275 _raise_typeerror(X) 

276 

277 

278def inplace_swap_row_csc(X, m, n): 

279 """Swap two rows of a CSC matrix in-place. 

280 

281 Parameters 

282 ---------- 

283 X : sparse matrix of shape (n_samples, n_features) 

284 Matrix whose two rows are to be swapped. It should be of 

285 CSC format. 

286 

287 m : int 

288 Index of the row of X to be swapped. 

289 

290 n : int 

291 Index of the row of X to be swapped. 

292 """ 

293 for t in [m, n]: 

294 if isinstance(t, np.ndarray): 

295 raise TypeError("m and n should be valid integers") 

296 

297 if m < 0: 

298 m += X.shape[0] 

299 if n < 0: 

300 n += X.shape[0] 

301 

302 m_mask = X.indices == m 

303 X.indices[X.indices == n] = m 

304 X.indices[m_mask] = n 

305 

306 

307def inplace_swap_row_csr(X, m, n): 

308 """Swap two rows of a CSR matrix in-place. 

309 

310 Parameters 

311 ---------- 

312 X : sparse matrix of shape (n_samples, n_features) 

313 Matrix whose two rows are to be swapped. It should be of 

314 CSR format. 

315 

316 m : int 

317 Index of the row of X to be swapped. 

318 

319 n : int 

320 Index of the row of X to be swapped. 

321 """ 

322 for t in [m, n]: 

323 if isinstance(t, np.ndarray): 

324 raise TypeError("m and n should be valid integers") 

325 

326 if m < 0: 

327 m += X.shape[0] 

328 if n < 0: 

329 n += X.shape[0] 

330 

331 # The following swapping makes life easier since m is assumed to be the 

332 # smaller integer below. 

333 if m > n: 

334 m, n = n, m 

335 

336 indptr = X.indptr 

337 m_start = indptr[m] 

338 m_stop = indptr[m + 1] 

339 n_start = indptr[n] 

340 n_stop = indptr[n + 1] 

341 nz_m = m_stop - m_start 

342 nz_n = n_stop - n_start 

343 

344 if nz_m != nz_n: 

345 # Modify indptr first 

346 X.indptr[m + 2 : n] += nz_n - nz_m 

347 X.indptr[m + 1] = m_start + nz_n 

348 X.indptr[n] = n_stop - nz_m 

349 

350 X.indices = np.concatenate( 

351 [ 

352 X.indices[:m_start], 

353 X.indices[n_start:n_stop], 

354 X.indices[m_stop:n_start], 

355 X.indices[m_start:m_stop], 

356 X.indices[n_stop:], 

357 ] 

358 ) 

359 X.data = np.concatenate( 

360 [ 

361 X.data[:m_start], 

362 X.data[n_start:n_stop], 

363 X.data[m_stop:n_start], 

364 X.data[m_start:m_stop], 

365 X.data[n_stop:], 

366 ] 

367 ) 

368 

369 

370def inplace_swap_row(X, m, n): 

371 """ 

372 Swap two rows of a CSC/CSR matrix in-place. 

373 

374 Parameters 

375 ---------- 

376 X : sparse matrix of shape (n_samples, n_features) 

377 Matrix whose two rows are to be swapped. It should be of CSR or 

378 CSC format. 

379 

380 m : int 

381 Index of the row of X to be swapped. 

382 

383 n : int 

384 Index of the row of X to be swapped. 

385 """ 

386 if sp.issparse(X) and X.format == "csc": 

387 inplace_swap_row_csc(X, m, n) 

388 elif sp.issparse(X) and X.format == "csr": 

389 inplace_swap_row_csr(X, m, n) 

390 else: 

391 _raise_typeerror(X) 

392 

393 

394def inplace_swap_column(X, m, n): 

395 """ 

396 Swap two columns of a CSC/CSR matrix in-place. 

397 

398 Parameters 

399 ---------- 

400 X : sparse matrix of shape (n_samples, n_features) 

401 Matrix whose two columns are to be swapped. It should be of 

402 CSR or CSC format. 

403 

404 m : int 

405 Index of the column of X to be swapped. 

406 

407 n : int 

408 Index of the column of X to be swapped. 

409 """ 

410 if m < 0: 

411 m += X.shape[1] 

412 if n < 0: 

413 n += X.shape[1] 

414 if sp.issparse(X) and X.format == "csc": 

415 inplace_swap_row_csr(X, m, n) 

416 elif sp.issparse(X) and X.format == "csr": 

417 inplace_swap_row_csc(X, m, n) 

418 else: 

419 _raise_typeerror(X) 

420 

421 

422def min_max_axis(X, axis, ignore_nan=False): 

423 """Compute minimum and maximum along an axis on a CSR or CSC matrix. 

424 

425 Optionally ignore NaN values. 

426 

427 Parameters 

428 ---------- 

429 X : sparse matrix of shape (n_samples, n_features) 

430 Input data. It should be of CSR or CSC format. 

431 

432 axis : {0, 1} 

433 Axis along which the axis should be computed. 

434 

435 ignore_nan : bool, default=False 

436 Ignore or passing through NaN values. 

437 

438 .. versionadded:: 0.20 

439 

440 Returns 

441 ------- 

442 

443 mins : ndarray of shape (n_features,), dtype={np.float32, np.float64} 

444 Feature-wise minima. 

445 

446 maxs : ndarray of shape (n_features,), dtype={np.float32, np.float64} 

447 Feature-wise maxima. 

448 """ 

449 if sp.issparse(X) and X.format in ("csr", "csc"): 

450 if ignore_nan: 

451 return _sparse_nan_min_max(X, axis=axis) 

452 else: 

453 return _sparse_min_max(X, axis=axis) 

454 else: 

455 _raise_typeerror(X) 

456 

457 

458def count_nonzero(X, axis=None, sample_weight=None): 

459 """A variant of X.getnnz() with extension to weighting on axis 0. 

460 

461 Useful in efficiently calculating multilabel metrics. 

462 

463 Parameters 

464 ---------- 

465 X : sparse matrix of shape (n_samples, n_labels) 

466 Input data. It should be of CSR format. 

467 

468 axis : {0, 1}, default=None 

469 The axis on which the data is aggregated. 

470 

471 sample_weight : array-like of shape (n_samples,), default=None 

472 Weight for each row of X. 

473 

474 Returns 

475 ------- 

476 nnz : int, float, ndarray of shape (n_samples,) or ndarray of shape (n_features,) 

477 Number of non-zero values in the array along a given axis. Otherwise, 

478 the total number of non-zero values in the array is returned. 

479 """ 

480 if axis == -1: 

481 axis = 1 

482 elif axis == -2: 

483 axis = 0 

484 elif X.format != "csr": 

485 raise TypeError("Expected CSR sparse format, got {0}".format(X.format)) 

486 

487 # We rely here on the fact that np.diff(Y.indptr) for a CSR 

488 # will return the number of nonzero entries in each row. 

489 # A bincount over Y.indices will return the number of nonzeros 

490 # in each column. See ``csr_matrix.getnnz`` in scipy >= 0.14. 

491 if axis is None: 

492 if sample_weight is None: 

493 return X.nnz 

494 else: 

495 return np.dot(np.diff(X.indptr), sample_weight) 

496 elif axis == 1: 

497 out = np.diff(X.indptr) 

498 if sample_weight is None: 

499 # astype here is for consistency with axis=0 dtype 

500 return out.astype("intp") 

501 return out * sample_weight 

502 elif axis == 0: 

503 if sample_weight is None: 

504 return np.bincount(X.indices, minlength=X.shape[1]) 

505 else: 

506 weights = np.repeat(sample_weight, np.diff(X.indptr)) 

507 return np.bincount(X.indices, minlength=X.shape[1], weights=weights) 

508 else: 

509 raise ValueError("Unsupported axis: {0}".format(axis)) 

510 

511 

512def _get_median(data, n_zeros): 

513 """Compute the median of data with n_zeros additional zeros. 

514 

515 This function is used to support sparse matrices; it modifies data 

516 in-place. 

517 """ 

518 n_elems = len(data) + n_zeros 

519 if not n_elems: 

520 return np.nan 

521 n_negative = np.count_nonzero(data < 0) 

522 middle, is_odd = divmod(n_elems, 2) 

523 data.sort() 

524 

525 if is_odd: 

526 return _get_elem_at_rank(middle, data, n_negative, n_zeros) 

527 

528 return ( 

529 _get_elem_at_rank(middle - 1, data, n_negative, n_zeros) 

530 + _get_elem_at_rank(middle, data, n_negative, n_zeros) 

531 ) / 2.0 

532 

533 

534def _get_elem_at_rank(rank, data, n_negative, n_zeros): 

535 """Find the value in data augmented with n_zeros for the given rank""" 

536 if rank < n_negative: 

537 return data[rank] 

538 if rank - n_negative < n_zeros: 

539 return 0 

540 return data[rank - n_zeros] 

541 

542 

543def csc_median_axis_0(X): 

544 """Find the median across axis 0 of a CSC matrix. 

545 

546 It is equivalent to doing np.median(X, axis=0). 

547 

548 Parameters 

549 ---------- 

550 X : sparse matrix of shape (n_samples, n_features) 

551 Input data. It should be of CSC format. 

552 

553 Returns 

554 ------- 

555 median : ndarray of shape (n_features,) 

556 Median. 

557 """ 

558 if not (sp.issparse(X) and X.format == "csc"): 

559 raise TypeError("Expected matrix of CSC format, got %s" % X.format) 

560 

561 indptr = X.indptr 

562 n_samples, n_features = X.shape 

563 median = np.zeros(n_features) 

564 

565 for f_ind, (start, end) in enumerate(zip(indptr[:-1], indptr[1:])): 

566 # Prevent modifying X in place 

567 data = np.copy(X.data[start:end]) 

568 nz = n_samples - data.size 

569 median[f_ind] = _get_median(data, nz) 

570 

571 return median 

572 

573 

574def _implicit_column_offset(X, offset): 

575 """Create an implicitly offset linear operator. 

576 

577 This is used by PCA on sparse data to avoid densifying the whole data 

578 matrix. 

579 

580 Params 

581 ------ 

582 X : sparse matrix of shape (n_samples, n_features) 

583 offset : ndarray of shape (n_features,) 

584 

585 Returns 

586 ------- 

587 centered : LinearOperator 

588 """ 

589 offset = offset[None, :] 

590 XT = X.T 

591 return LinearOperator( 

592 matvec=lambda x: X @ x - offset @ x, 

593 matmat=lambda x: X @ x - offset @ x, 

594 rmatvec=lambda x: XT @ x - (offset * x.sum()), 

595 rmatmat=lambda x: XT @ x - offset.T @ x.sum(axis=0)[None, :], 

596 dtype=X.dtype, 

597 shape=X.shape, 

598 )