1"""
2The :mod:`sklearn.utils.sparsefuncs` module includes a collection of utilities to
3work with sparse matrices and arrays.
4"""
5
6# Authors: Manoj Kumar
7# Thomas Unterthiner
8# Giorgio Patrini
9#
10# License: BSD 3 clause
11import numpy as np
12import scipy.sparse as sp
13from scipy.sparse.linalg import LinearOperator
14
15from ..utils.fixes import _sparse_min_max, _sparse_nan_min_max
16from ..utils.validation import _check_sample_weight
17from .sparsefuncs_fast import (
18 csc_mean_variance_axis0 as _csc_mean_var_axis0,
19)
20from .sparsefuncs_fast import (
21 csr_mean_variance_axis0 as _csr_mean_var_axis0,
22)
23from .sparsefuncs_fast import (
24 incr_mean_variance_axis0 as _incr_mean_var_axis0,
25)
26
27
28def _raise_typeerror(X):
29 """Raises a TypeError if X is not a CSR or CSC matrix"""
30 input_type = X.format if sp.issparse(X) else type(X)
31 err = "Expected a CSR or CSC sparse matrix, got %s." % input_type
32 raise TypeError(err)
33
34
35def _raise_error_wrong_axis(axis):
36 if axis not in (0, 1):
37 raise ValueError(
38 "Unknown axis value: %d. Use 0 for rows, or 1 for columns" % axis
39 )
40
41
42def inplace_csr_column_scale(X, scale):
43 """Inplace column scaling of a CSR matrix.
44
45 Scale each feature of the data matrix by multiplying with specific scale
46 provided by the caller assuming a (n_samples, n_features) shape.
47
48 Parameters
49 ----------
50 X : sparse matrix of shape (n_samples, n_features)
51 Matrix to normalize using the variance of the features.
52 It should be of CSR format.
53
54 scale : ndarray of shape (n_features,), dtype={np.float32, np.float64}
55 Array of precomputed feature-wise values to use for scaling.
56 """
57 assert scale.shape[0] == X.shape[1]
58 X.data *= scale.take(X.indices, mode="clip")
59
60
61def inplace_csr_row_scale(X, scale):
62 """Inplace row scaling of a CSR matrix.
63
64 Scale each sample of the data matrix by multiplying with specific scale
65 provided by the caller assuming a (n_samples, n_features) shape.
66
67 Parameters
68 ----------
69 X : sparse matrix of shape (n_samples, n_features)
70 Matrix to be scaled. It should be of CSR format.
71
72 scale : ndarray of float of shape (n_samples,)
73 Array of precomputed sample-wise values to use for scaling.
74 """
75 assert scale.shape[0] == X.shape[0]
76 X.data *= np.repeat(scale, np.diff(X.indptr))
77
78
79def mean_variance_axis(X, axis, weights=None, return_sum_weights=False):
80 """Compute mean and variance along an axis on a CSR or CSC matrix.
81
82 Parameters
83 ----------
84 X : sparse matrix of shape (n_samples, n_features)
85 Input data. It can be of CSR or CSC format.
86
87 axis : {0, 1}
88 Axis along which the axis should be computed.
89
90 weights : ndarray of shape (n_samples,) or (n_features,), default=None
91 If axis is set to 0 shape is (n_samples,) or
92 if axis is set to 1 shape is (n_features,).
93 If it is set to None, then samples are equally weighted.
94
95 .. versionadded:: 0.24
96
97 return_sum_weights : bool, default=False
98 If True, returns the sum of weights seen for each feature
99 if `axis=0` or each sample if `axis=1`.
100
101 .. versionadded:: 0.24
102
103 Returns
104 -------
105
106 means : ndarray of shape (n_features,), dtype=floating
107 Feature-wise means.
108
109 variances : ndarray of shape (n_features,), dtype=floating
110 Feature-wise variances.
111
112 sum_weights : ndarray of shape (n_features,), dtype=floating
113 Returned if `return_sum_weights` is `True`.
114 """
115 _raise_error_wrong_axis(axis)
116
117 if sp.issparse(X) and X.format == "csr":
118 if axis == 0:
119 return _csr_mean_var_axis0(
120 X, weights=weights, return_sum_weights=return_sum_weights
121 )
122 else:
123 return _csc_mean_var_axis0(
124 X.T, weights=weights, return_sum_weights=return_sum_weights
125 )
126 elif sp.issparse(X) and X.format == "csc":
127 if axis == 0:
128 return _csc_mean_var_axis0(
129 X, weights=weights, return_sum_weights=return_sum_weights
130 )
131 else:
132 return _csr_mean_var_axis0(
133 X.T, weights=weights, return_sum_weights=return_sum_weights
134 )
135 else:
136 _raise_typeerror(X)
137
138
139def incr_mean_variance_axis(X, *, axis, last_mean, last_var, last_n, weights=None):
140 """Compute incremental mean and variance along an axis on a CSR or CSC matrix.
141
142 last_mean, last_var are the statistics computed at the last step by this
143 function. Both must be initialized to 0-arrays of the proper size, i.e.
144 the number of features in X. last_n is the number of samples encountered
145 until now.
146
147 Parameters
148 ----------
149 X : CSR or CSC sparse matrix of shape (n_samples, n_features)
150 Input data.
151
152 axis : {0, 1}
153 Axis along which the axis should be computed.
154
155 last_mean : ndarray of shape (n_features,) or (n_samples,), dtype=floating
156 Array of means to update with the new data X.
157 Should be of shape (n_features,) if axis=0 or (n_samples,) if axis=1.
158
159 last_var : ndarray of shape (n_features,) or (n_samples,), dtype=floating
160 Array of variances to update with the new data X.
161 Should be of shape (n_features,) if axis=0 or (n_samples,) if axis=1.
162
163 last_n : float or ndarray of shape (n_features,) or (n_samples,), \
164 dtype=floating
165 Sum of the weights seen so far, excluding the current weights
166 If not float, it should be of shape (n_features,) if
167 axis=0 or (n_samples,) if axis=1. If float it corresponds to
168 having same weights for all samples (or features).
169
170 weights : ndarray of shape (n_samples,) or (n_features,), default=None
171 If axis is set to 0 shape is (n_samples,) or
172 if axis is set to 1 shape is (n_features,).
173 If it is set to None, then samples are equally weighted.
174
175 .. versionadded:: 0.24
176
177 Returns
178 -------
179 means : ndarray of shape (n_features,) or (n_samples,), dtype=floating
180 Updated feature-wise means if axis = 0 or
181 sample-wise means if axis = 1.
182
183 variances : ndarray of shape (n_features,) or (n_samples,), dtype=floating
184 Updated feature-wise variances if axis = 0 or
185 sample-wise variances if axis = 1.
186
187 n : ndarray of shape (n_features,) or (n_samples,), dtype=integral
188 Updated number of seen samples per feature if axis=0
189 or number of seen features per sample if axis=1.
190
191 If weights is not None, n is a sum of the weights of the seen
192 samples or features instead of the actual number of seen
193 samples or features.
194
195 Notes
196 -----
197 NaNs are ignored in the algorithm.
198 """
199 _raise_error_wrong_axis(axis)
200
201 if not (sp.issparse(X) and X.format in ("csc", "csr")):
202 _raise_typeerror(X)
203
204 if np.size(last_n) == 1:
205 last_n = np.full(last_mean.shape, last_n, dtype=last_mean.dtype)
206
207 if not (np.size(last_mean) == np.size(last_var) == np.size(last_n)):
208 raise ValueError("last_mean, last_var, last_n do not have the same shapes.")
209
210 if axis == 1:
211 if np.size(last_mean) != X.shape[0]:
212 raise ValueError(
213 "If axis=1, then last_mean, last_n, last_var should be of "
214 f"size n_samples {X.shape[0]} (Got {np.size(last_mean)})."
215 )
216 else: # axis == 0
217 if np.size(last_mean) != X.shape[1]:
218 raise ValueError(
219 "If axis=0, then last_mean, last_n, last_var should be of "
220 f"size n_features {X.shape[1]} (Got {np.size(last_mean)})."
221 )
222
223 X = X.T if axis == 1 else X
224
225 if weights is not None:
226 weights = _check_sample_weight(weights, X, dtype=X.dtype)
227
228 return _incr_mean_var_axis0(
229 X, last_mean=last_mean, last_var=last_var, last_n=last_n, weights=weights
230 )
231
232
233def inplace_column_scale(X, scale):
234 """Inplace column scaling of a CSC/CSR matrix.
235
236 Scale each feature of the data matrix by multiplying with specific scale
237 provided by the caller assuming a (n_samples, n_features) shape.
238
239 Parameters
240 ----------
241 X : sparse matrix of shape (n_samples, n_features)
242 Matrix to normalize using the variance of the features. It should be
243 of CSC or CSR format.
244
245 scale : ndarray of shape (n_features,), dtype={np.float32, np.float64}
246 Array of precomputed feature-wise values to use for scaling.
247 """
248 if sp.issparse(X) and X.format == "csc":
249 inplace_csr_row_scale(X.T, scale)
250 elif sp.issparse(X) and X.format == "csr":
251 inplace_csr_column_scale(X, scale)
252 else:
253 _raise_typeerror(X)
254
255
256def inplace_row_scale(X, scale):
257 """Inplace row scaling of a CSR or CSC matrix.
258
259 Scale each row of the data matrix by multiplying with specific scale
260 provided by the caller assuming a (n_samples, n_features) shape.
261
262 Parameters
263 ----------
264 X : sparse matrix of shape (n_samples, n_features)
265 Matrix to be scaled. It should be of CSR or CSC format.
266
267 scale : ndarray of shape (n_features,), dtype={np.float32, np.float64}
268 Array of precomputed sample-wise values to use for scaling.
269 """
270 if sp.issparse(X) and X.format == "csc":
271 inplace_csr_column_scale(X.T, scale)
272 elif sp.issparse(X) and X.format == "csr":
273 inplace_csr_row_scale(X, scale)
274 else:
275 _raise_typeerror(X)
276
277
278def inplace_swap_row_csc(X, m, n):
279 """Swap two rows of a CSC matrix in-place.
280
281 Parameters
282 ----------
283 X : sparse matrix of shape (n_samples, n_features)
284 Matrix whose two rows are to be swapped. It should be of
285 CSC format.
286
287 m : int
288 Index of the row of X to be swapped.
289
290 n : int
291 Index of the row of X to be swapped.
292 """
293 for t in [m, n]:
294 if isinstance(t, np.ndarray):
295 raise TypeError("m and n should be valid integers")
296
297 if m < 0:
298 m += X.shape[0]
299 if n < 0:
300 n += X.shape[0]
301
302 m_mask = X.indices == m
303 X.indices[X.indices == n] = m
304 X.indices[m_mask] = n
305
306
307def inplace_swap_row_csr(X, m, n):
308 """Swap two rows of a CSR matrix in-place.
309
310 Parameters
311 ----------
312 X : sparse matrix of shape (n_samples, n_features)
313 Matrix whose two rows are to be swapped. It should be of
314 CSR format.
315
316 m : int
317 Index of the row of X to be swapped.
318
319 n : int
320 Index of the row of X to be swapped.
321 """
322 for t in [m, n]:
323 if isinstance(t, np.ndarray):
324 raise TypeError("m and n should be valid integers")
325
326 if m < 0:
327 m += X.shape[0]
328 if n < 0:
329 n += X.shape[0]
330
331 # The following swapping makes life easier since m is assumed to be the
332 # smaller integer below.
333 if m > n:
334 m, n = n, m
335
336 indptr = X.indptr
337 m_start = indptr[m]
338 m_stop = indptr[m + 1]
339 n_start = indptr[n]
340 n_stop = indptr[n + 1]
341 nz_m = m_stop - m_start
342 nz_n = n_stop - n_start
343
344 if nz_m != nz_n:
345 # Modify indptr first
346 X.indptr[m + 2 : n] += nz_n - nz_m
347 X.indptr[m + 1] = m_start + nz_n
348 X.indptr[n] = n_stop - nz_m
349
350 X.indices = np.concatenate(
351 [
352 X.indices[:m_start],
353 X.indices[n_start:n_stop],
354 X.indices[m_stop:n_start],
355 X.indices[m_start:m_stop],
356 X.indices[n_stop:],
357 ]
358 )
359 X.data = np.concatenate(
360 [
361 X.data[:m_start],
362 X.data[n_start:n_stop],
363 X.data[m_stop:n_start],
364 X.data[m_start:m_stop],
365 X.data[n_stop:],
366 ]
367 )
368
369
370def inplace_swap_row(X, m, n):
371 """
372 Swap two rows of a CSC/CSR matrix in-place.
373
374 Parameters
375 ----------
376 X : sparse matrix of shape (n_samples, n_features)
377 Matrix whose two rows are to be swapped. It should be of CSR or
378 CSC format.
379
380 m : int
381 Index of the row of X to be swapped.
382
383 n : int
384 Index of the row of X to be swapped.
385 """
386 if sp.issparse(X) and X.format == "csc":
387 inplace_swap_row_csc(X, m, n)
388 elif sp.issparse(X) and X.format == "csr":
389 inplace_swap_row_csr(X, m, n)
390 else:
391 _raise_typeerror(X)
392
393
394def inplace_swap_column(X, m, n):
395 """
396 Swap two columns of a CSC/CSR matrix in-place.
397
398 Parameters
399 ----------
400 X : sparse matrix of shape (n_samples, n_features)
401 Matrix whose two columns are to be swapped. It should be of
402 CSR or CSC format.
403
404 m : int
405 Index of the column of X to be swapped.
406
407 n : int
408 Index of the column of X to be swapped.
409 """
410 if m < 0:
411 m += X.shape[1]
412 if n < 0:
413 n += X.shape[1]
414 if sp.issparse(X) and X.format == "csc":
415 inplace_swap_row_csr(X, m, n)
416 elif sp.issparse(X) and X.format == "csr":
417 inplace_swap_row_csc(X, m, n)
418 else:
419 _raise_typeerror(X)
420
421
422def min_max_axis(X, axis, ignore_nan=False):
423 """Compute minimum and maximum along an axis on a CSR or CSC matrix.
424
425 Optionally ignore NaN values.
426
427 Parameters
428 ----------
429 X : sparse matrix of shape (n_samples, n_features)
430 Input data. It should be of CSR or CSC format.
431
432 axis : {0, 1}
433 Axis along which the axis should be computed.
434
435 ignore_nan : bool, default=False
436 Ignore or passing through NaN values.
437
438 .. versionadded:: 0.20
439
440 Returns
441 -------
442
443 mins : ndarray of shape (n_features,), dtype={np.float32, np.float64}
444 Feature-wise minima.
445
446 maxs : ndarray of shape (n_features,), dtype={np.float32, np.float64}
447 Feature-wise maxima.
448 """
449 if sp.issparse(X) and X.format in ("csr", "csc"):
450 if ignore_nan:
451 return _sparse_nan_min_max(X, axis=axis)
452 else:
453 return _sparse_min_max(X, axis=axis)
454 else:
455 _raise_typeerror(X)
456
457
458def count_nonzero(X, axis=None, sample_weight=None):
459 """A variant of X.getnnz() with extension to weighting on axis 0.
460
461 Useful in efficiently calculating multilabel metrics.
462
463 Parameters
464 ----------
465 X : sparse matrix of shape (n_samples, n_labels)
466 Input data. It should be of CSR format.
467
468 axis : {0, 1}, default=None
469 The axis on which the data is aggregated.
470
471 sample_weight : array-like of shape (n_samples,), default=None
472 Weight for each row of X.
473
474 Returns
475 -------
476 nnz : int, float, ndarray of shape (n_samples,) or ndarray of shape (n_features,)
477 Number of non-zero values in the array along a given axis. Otherwise,
478 the total number of non-zero values in the array is returned.
479 """
480 if axis == -1:
481 axis = 1
482 elif axis == -2:
483 axis = 0
484 elif X.format != "csr":
485 raise TypeError("Expected CSR sparse format, got {0}".format(X.format))
486
487 # We rely here on the fact that np.diff(Y.indptr) for a CSR
488 # will return the number of nonzero entries in each row.
489 # A bincount over Y.indices will return the number of nonzeros
490 # in each column. See ``csr_matrix.getnnz`` in scipy >= 0.14.
491 if axis is None:
492 if sample_weight is None:
493 return X.nnz
494 else:
495 return np.dot(np.diff(X.indptr), sample_weight)
496 elif axis == 1:
497 out = np.diff(X.indptr)
498 if sample_weight is None:
499 # astype here is for consistency with axis=0 dtype
500 return out.astype("intp")
501 return out * sample_weight
502 elif axis == 0:
503 if sample_weight is None:
504 return np.bincount(X.indices, minlength=X.shape[1])
505 else:
506 weights = np.repeat(sample_weight, np.diff(X.indptr))
507 return np.bincount(X.indices, minlength=X.shape[1], weights=weights)
508 else:
509 raise ValueError("Unsupported axis: {0}".format(axis))
510
511
512def _get_median(data, n_zeros):
513 """Compute the median of data with n_zeros additional zeros.
514
515 This function is used to support sparse matrices; it modifies data
516 in-place.
517 """
518 n_elems = len(data) + n_zeros
519 if not n_elems:
520 return np.nan
521 n_negative = np.count_nonzero(data < 0)
522 middle, is_odd = divmod(n_elems, 2)
523 data.sort()
524
525 if is_odd:
526 return _get_elem_at_rank(middle, data, n_negative, n_zeros)
527
528 return (
529 _get_elem_at_rank(middle - 1, data, n_negative, n_zeros)
530 + _get_elem_at_rank(middle, data, n_negative, n_zeros)
531 ) / 2.0
532
533
534def _get_elem_at_rank(rank, data, n_negative, n_zeros):
535 """Find the value in data augmented with n_zeros for the given rank"""
536 if rank < n_negative:
537 return data[rank]
538 if rank - n_negative < n_zeros:
539 return 0
540 return data[rank - n_zeros]
541
542
543def csc_median_axis_0(X):
544 """Find the median across axis 0 of a CSC matrix.
545
546 It is equivalent to doing np.median(X, axis=0).
547
548 Parameters
549 ----------
550 X : sparse matrix of shape (n_samples, n_features)
551 Input data. It should be of CSC format.
552
553 Returns
554 -------
555 median : ndarray of shape (n_features,)
556 Median.
557 """
558 if not (sp.issparse(X) and X.format == "csc"):
559 raise TypeError("Expected matrix of CSC format, got %s" % X.format)
560
561 indptr = X.indptr
562 n_samples, n_features = X.shape
563 median = np.zeros(n_features)
564
565 for f_ind, (start, end) in enumerate(zip(indptr[:-1], indptr[1:])):
566 # Prevent modifying X in place
567 data = np.copy(X.data[start:end])
568 nz = n_samples - data.size
569 median[f_ind] = _get_median(data, nz)
570
571 return median
572
573
574def _implicit_column_offset(X, offset):
575 """Create an implicitly offset linear operator.
576
577 This is used by PCA on sparse data to avoid densifying the whole data
578 matrix.
579
580 Params
581 ------
582 X : sparse matrix of shape (n_samples, n_features)
583 offset : ndarray of shape (n_features,)
584
585 Returns
586 -------
587 centered : LinearOperator
588 """
589 offset = offset[None, :]
590 XT = X.T
591 return LinearOperator(
592 matvec=lambda x: X @ x - offset @ x,
593 matmat=lambda x: X @ x - offset @ x,
594 rmatvec=lambda x: XT @ x - (offset * x.sum()),
595 rmatmat=lambda x: XT @ x - offset.T @ x.sum(axis=0)[None, :],
596 dtype=X.dtype,
597 shape=X.shape,
598 )