1"""Sparse accessor"""
2from __future__ import annotations
3
4from typing import TYPE_CHECKING
5
6import numpy as np
7
8from pandas.compat._optional import import_optional_dependency
9
10from pandas.core.dtypes.cast import find_common_type
11from pandas.core.dtypes.dtypes import SparseDtype
12
13from pandas.core.accessor import (
14 PandasDelegate,
15 delegate_names,
16)
17from pandas.core.arrays.sparse.array import SparseArray
18
19if TYPE_CHECKING:
20 from pandas import (
21 DataFrame,
22 Series,
23 )
24
25
class BaseAccessor:
    """
    Shared base for the ``.sparse`` accessors.

    Constructing an accessor stores the wrapped object on ``_parent`` and
    then passes it to ``_validate``. The base implementation of ``_validate``
    always raises, so concrete accessors must override it.
    """

    # Message subclasses use when the wrapped object is rejected.
    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def __init__(self, data=None) -> None:
        # Record the wrapped object first, then let the subclass hook decide
        # whether it is acceptable.
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """Validation hook; not implemented on the base class."""
        raise NotImplementedError
35
36
@delegate_names(
    SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property"
)
class SparseAccessor(BaseAccessor, PandasDelegate):
    """
    Accessor for Series with sparse values.

    Exposes the ``npoints``, ``density``, ``fill_value`` and ``sp_values``
    properties of the underlying SparseArray, and converts to and from other
    sparse matrix data types.

    Examples
    --------
    >>> ser = pd.Series([0, 0, 2, 2, 2], dtype="Sparse[int]")
    >>> ser.sparse.density
    0.6
    >>> ser.sparse.sp_values
    array([2, 2, 2])
    """

    def _validate(self, data) -> None:
        """Raise AttributeError unless ``data.dtype`` is a SparseDtype."""
        if not isinstance(data.dtype, SparseDtype):
            raise AttributeError(self._validation_msg)

    def _delegate_property_get(self, name: str, *args, **kwargs):
        # Delegated properties are read straight off the backing array.
        return getattr(self._parent.array, name)

    def _delegate_method(self, name: str, *args, **kwargs):
        """
        Dispatch a delegated method call by name.

        Only ``from_coo`` and ``to_coo`` are supported.

        Raises
        ------
        ValueError
            If ``name`` is not one of the supported methods.
        """
        if name == "from_coo":
            return self.from_coo(*args, **kwargs)
        if name == "to_coo":
            return self.to_coo(*args, **kwargs)
        # Name the offending method so the failure is diagnosable.
        raise ValueError(f"Unsupported delegated method: {name!r}")

    @classmethod
    def from_coo(cls, A, dense_index: bool = False) -> Series:
        """
        Create a Series with sparse values from a scipy.sparse.coo_matrix.

        Parameters
        ----------
        A : scipy.sparse.coo_matrix
        dense_index : bool, default False
            If False (default), the index consists of only the
            coords of the non-null entries of the original coo_matrix.
            If True, the index consists of the full sorted
            (row, col) coordinates of the coo_matrix.

        Returns
        -------
        s : Series
            A Series with sparse values.

        Examples
        --------
        >>> from scipy import sparse

        >>> A = sparse.coo_matrix(
        ...     ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)
        ... )
        >>> A
        <COOrdinate sparse matrix of dtype 'float64'
            with 3 stored elements and shape (3, 4)>

        >>> A.todense()
        matrix([[0., 0., 1., 2.],
                [3., 0., 0., 0.],
                [0., 0., 0., 0.]])

        >>> ss = pd.Series.sparse.from_coo(A)
        >>> ss
        0  2    1.0
           3    2.0
        1  0    3.0
        dtype: Sparse[float64, nan]
        """
        from pandas import Series
        from pandas.core.arrays.sparse.scipy_sparse import coo_to_sparse_series

        converted = coo_to_sparse_series(A, dense_index=dense_index)
        # Rebuild the Series from the converted array and index only,
        # without copying the data.
        return Series(converted.array, index=converted.index, copy=False)

    def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False):
        """
        Create a scipy.sparse.coo_matrix from a Series with MultiIndex.

        Use row_levels and column_levels to determine the row and column
        coordinates respectively. row_levels and column_levels are the names
        (labels) or numbers of the levels. {row_levels, column_levels} must be
        a partition of the MultiIndex level names (or numbers).

        Parameters
        ----------
        row_levels : tuple/list
        column_levels : tuple/list
        sort_labels : bool, default False
            Sort the row and column labels before forming the sparse matrix.
            When `row_levels` and/or `column_levels` refer to a single level,
            set to `True` for a faster execution.

        Returns
        -------
        y : scipy.sparse.coo_matrix
        rows : list (row labels)
        columns : list (column labels)

        Examples
        --------
        >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
        >>> s.index = pd.MultiIndex.from_tuples(
        ...     [
        ...         (1, 2, "a", 0),
        ...         (1, 2, "a", 1),
        ...         (1, 1, "b", 0),
        ...         (1, 1, "b", 1),
        ...         (2, 1, "b", 0),
        ...         (2, 1, "b", 1)
        ...     ],
        ...     names=["A", "B", "C", "D"],
        ... )
        >>> s
        A  B  C  D
        1  2  a  0    3.0
                 1    NaN
           1  b  0    1.0
                 1    3.0
        2  1  b  0    NaN
                 1    NaN
        dtype: float64

        >>> ss = s.astype("Sparse")
        >>> ss
        A  B  C  D
        1  2  a  0    3.0
                 1    NaN
           1  b  0    1.0
                 1    3.0
        2  1  b  0    NaN
                 1    NaN
        dtype: Sparse[float64, nan]

        >>> A, rows, columns = ss.sparse.to_coo(
        ...     row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True
        ... )
        >>> A
        <COOrdinate sparse matrix of dtype 'float64'
            with 3 stored elements and shape (3, 4)>
        >>> A.todense()
        matrix([[0., 0., 1., 3.],
                [3., 0., 0., 0.],
                [0., 0., 0., 0.]])

        >>> rows
        [(1, 1), (1, 2), (2, 1)]
        >>> columns
        [('a', 0), ('a', 1), ('b', 0), ('b', 1)]
        """
        from pandas.core.arrays.sparse.scipy_sparse import sparse_series_to_coo

        A, rows, columns = sparse_series_to_coo(
            self._parent, row_levels, column_levels, sort_labels=sort_labels
        )
        return A, rows, columns

    def to_dense(self) -> Series:
        """
        Convert a Series from sparse values to dense.

        Returns
        -------
        Series:
            A Series with the same values, stored as a dense array.

        Examples
        --------
        >>> series = pd.Series(pd.arrays.SparseArray([0, 1, 0]))
        >>> series
        0    0
        1    1
        2    0
        dtype: Sparse[int64, 0]

        >>> series.sparse.to_dense()
        0    0
        1    1
        2    0
        dtype: int64
        """
        from pandas import Series

        # Index and name are carried over from the wrapped Series.
        return Series(
            self._parent.array.to_dense(),
            index=self._parent.index,
            name=self._parent.name,
            copy=False,
        )
232
233
class SparseFrameAccessor(BaseAccessor, PandasDelegate):
    """
    DataFrame accessor for sparse data.

    Examples
    --------
    >>> df = pd.DataFrame({"a": [1, 2, 0, 0],
    ...                   "b": [3, 0, 0, 4]}, dtype="Sparse[int]")
    >>> df.sparse.density
    0.5
    """

    def _validate(self, data) -> None:
        """Raise AttributeError unless every column dtype is a SparseDtype."""
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(self._validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
        """
        Create a new DataFrame from a scipy sparse matrix.

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a
            :class:`arrays.SparseArray`.

        Raises
        ------
        ValueError
            If the length of ``index`` or ``columns`` does not match the
            shape of ``data``.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3, dtype=float)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0    0    0
        1    0  1.0    0
        2    0    0  1.0
        """
        from pandas._libs.sparse import IntIndex

        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        n_rows, n_columns = data.shape
        # We need to make sure indices are sorted, as we create
        # IntIndex with no input validation (i.e. check_integrity=False ).
        # Indices may already be sorted in scipy in which case this adds
        # a small overhead.
        data.sort_indices()
        indices = data.indices
        indptr = data.indptr
        array_data = data.data
        dtype = SparseDtype(array_data.dtype, 0)
        arrays = []
        for i in range(n_columns):
            # indptr[i]:indptr[i + 1] delimits the stored entries of column i.
            sl = slice(indptr[i], indptr[i + 1])
            idx = IntIndex(n_rows, indices[sl], check_integrity=False)
            arr = SparseArray._simple_new(array_data[sl], idx, dtype)
            arrays.append(arr)
        return DataFrame._from_arrays(
            arrays, columns=columns, index=index, verify_integrity=False
        )

    def to_dense(self) -> DataFrame:
        """
        Convert a DataFrame with sparse values to dense.

        Columns are converted by position, so frames with duplicate column
        labels keep each column's own data.

        Returns
        -------
        DataFrame
            A DataFrame with the same values stored as dense arrays.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])})
        >>> df.sparse.to_dense()
           A
        0  0
        1  1
        2  0
        """
        from pandas import DataFrame

        # Key the dense arrays by position rather than by label: duplicate
        # column labels would otherwise collapse into a single dict entry and
        # every duplicated column would end up with the last column's data.
        dense = {
            pos: ser.array.to_dense()
            for pos, (_, ser) in enumerate(self._parent.items())
        }
        result = DataFrame(dense, index=self._parent.index)
        # Put the original column labels back (duplicates included).
        result.columns = self._parent.columns
        return result

    def to_coo(self):
        """
        Return the contents of the frame as a sparse SciPy COO matrix.

        Returns
        -------
        scipy.sparse.spmatrix
            If the caller is heterogeneous and contains booleans or objects,
            the result will be of dtype=object. See Notes.

        Raises
        ------
        ValueError
            If any column has a fill value other than 0.

        Notes
        -----
        The dtype will be the lowest-common-denominator type (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. By numpy.find_common_type convention, mixing int64 and
        uint64 will result in a float64 dtype.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})
        >>> df.sparse.to_coo()
        <COOrdinate sparse matrix of dtype 'int64'
            with 2 stored elements and shape (4, 1)>
        """
        import_optional_dependency("scipy")
        from scipy.sparse import coo_matrix

        dtype = find_common_type(self._parent.dtypes.to_list())
        if isinstance(dtype, SparseDtype):
            # The matrix needs the underlying value dtype, not the sparse wrapper.
            dtype = dtype.subtype

        cols, rows, data = [], [], []
        for col, (_, ser) in enumerate(self._parent.items()):
            sp_arr = ser.array
            if sp_arr.fill_value != 0:
                raise ValueError("fill value must be 0 when converting to COO matrix")

            # Row positions of the stored values in this column, paired with
            # the column position repeated once per stored value.
            row = sp_arr.sp_index.indices
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            data.append(sp_arr.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        data = np.concatenate(data)
        return coo_matrix((data, (rows, cols)), shape=self._parent.shape)

    @property
    def density(self) -> float:
        """
        Ratio of non-sparse points to total (dense) data points.

        Computed as the mean of the per-column densities.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})
        >>> df.sparse.density
        0.5
        """
        return np.mean([column.array.density for _, column in self._parent.items()])

    @staticmethod
    def _prep_index(data, index, columns):
        """
        Resolve row and column labels for ``data`` and check their lengths.

        A label argument left as None becomes a default index of the matching
        dimension of ``data.shape``; anything else is passed through
        ``ensure_index``.

        Raises
        ------
        ValueError
            If the resulting column or index length differs from the
            corresponding dimension of ``data.shape``.
        """
        from pandas.core.indexes.api import (
            default_index,
            ensure_index,
        )

        N, K = data.shape
        if index is None:
            index = default_index(N)
        else:
            index = ensure_index(index)
        if columns is None:
            columns = default_index(K)
        else:
            columns = ensure_index(columns)

        if len(columns) != K:
            raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}")
        if len(index) != N:
            raise ValueError(f"Index length mismatch: {len(index)} vs. {N}")
        return index, columns