1"""Sparse accessor"""
2from __future__ import annotations
3
4from typing import TYPE_CHECKING
5
6import numpy as np
7
8from pandas.compat._optional import import_optional_dependency
9
10from pandas.core.dtypes.cast import find_common_type
11
12from pandas.core.accessor import (
13 PandasDelegate,
14 delegate_names,
15)
16from pandas.core.arrays.sparse.array import SparseArray
17from pandas.core.arrays.sparse.dtype import SparseDtype
18
19if TYPE_CHECKING:
20 from pandas import (
21 DataFrame,
22 Series,
23 )
24
25
class BaseAccessor:
    """
    Common base for the ``.sparse`` accessors.

    On construction the wrapped object is stored on ``_parent`` and then
    handed to ``_validate``; subclasses override ``_validate`` to perform
    the actual check.
    """

    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def __init__(self, data=None) -> None:
        # Store the reference before validating so that an overriding
        # ``_validate`` can already see ``self._parent``.
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """Validate ``data``; the base implementation always raises."""
        raise NotImplementedError
35
36
@delegate_names(
    SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property"
)
class SparseAccessor(BaseAccessor, PandasDelegate):
    """
    Accessor for sparse data on a Series.

    Exposes the ``npoints``, ``density``, ``fill_value`` and ``sp_values``
    properties of the underlying SparseArray, and provides conversions to
    and from scipy COO matrices as well as to a dense Series.
    """

    def _validate(self, data):
        """Raise AttributeError unless ``data`` has a SparseDtype."""
        if not isinstance(data.dtype, SparseDtype):
            raise AttributeError(self._validation_msg)

    def _delegate_property_get(self, name, *args, **kwargs):
        """Read the delegated property ``name`` from the parent's array."""
        return getattr(self._parent.array, name)

    def _delegate_method(self, name, *args, **kwargs):
        """
        Dispatch a delegated method call by name.

        Only ``from_coo`` and ``to_coo`` are supported.

        Raises
        ------
        ValueError
            If ``name`` is not one of the supported methods.
        """
        if name == "from_coo":
            return self.from_coo(*args, **kwargs)
        if name == "to_coo":
            return self.to_coo(*args, **kwargs)
        # Name the offending method so the failure is diagnosable.
        raise ValueError(
            f"'{name}' is not a method supported by the sparse accessor; "
            "expected 'from_coo' or 'to_coo'"
        )

    @classmethod
    def from_coo(cls, A, dense_index: bool = False) -> Series:
        """
        Create a Series with sparse values from a scipy.sparse.coo_matrix.

        Parameters
        ----------
        A : scipy.sparse.coo_matrix
        dense_index : bool, default False
            If False (default), the index consists of only the
            coords of the non-null entries of the original coo_matrix.
            If True, the index consists of the full sorted
            (row, col) coordinates of the coo_matrix.

        Returns
        -------
        s : Series
            A Series with sparse values.

        Examples
        --------
        >>> from scipy import sparse

        >>> A = sparse.coo_matrix(
        ...     ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)
        ... )
        >>> A
        <3x4 sparse matrix of type '<class 'numpy.float64'>'
        with 3 stored elements in COOrdinate format>

        >>> A.todense()
        matrix([[0., 0., 1., 2.],
        [3., 0., 0., 0.],
        [0., 0., 0., 0.]])

        >>> ss = pd.Series.sparse.from_coo(A)
        >>> ss
        0  2    1.0
           3    2.0
        1  0    3.0
        dtype: Sparse[float64, nan]
        """
        from pandas import Series
        from pandas.core.arrays.sparse.scipy_sparse import coo_to_sparse_series

        result = coo_to_sparse_series(A, dense_index=dense_index)
        # Re-wrap the array and index in a fresh Series without copying.
        result = Series(result.array, index=result.index, copy=False)

        return result

    def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False):
        """
        Create a scipy.sparse.coo_matrix from a Series with MultiIndex.

        Use row_levels and column_levels to determine the row and column
        coordinates respectively. row_levels and column_levels are the names
        (labels) or numbers of the levels. {row_levels, column_levels} must be
        a partition of the MultiIndex level names (or numbers).

        Parameters
        ----------
        row_levels : tuple/list
        column_levels : tuple/list
        sort_labels : bool, default False
            Sort the row and column labels before forming the sparse matrix.
            When `row_levels` and/or `column_levels` refer to a single level,
            set to `True` for a faster execution.

        Returns
        -------
        y : scipy.sparse.coo_matrix
        rows : list (row labels)
        columns : list (column labels)

        Examples
        --------
        >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
        >>> s.index = pd.MultiIndex.from_tuples(
        ...     [
        ...         (1, 2, "a", 0),
        ...         (1, 2, "a", 1),
        ...         (1, 1, "b", 0),
        ...         (1, 1, "b", 1),
        ...         (2, 1, "b", 0),
        ...         (2, 1, "b", 1)
        ...     ],
        ...     names=["A", "B", "C", "D"],
        ... )
        >>> s
        A  B  C  D
        1  2  a  0    3.0
                 1    NaN
           1  b  0    1.0
                 1    3.0
        2  1  b  0    NaN
                 1    NaN
        dtype: float64

        >>> ss = s.astype("Sparse")
        >>> ss
        A  B  C  D
        1  2  a  0    3.0
                 1    NaN
           1  b  0    1.0
                 1    3.0
        2  1  b  0    NaN
                 1    NaN
        dtype: Sparse[float64, nan]

        >>> A, rows, columns = ss.sparse.to_coo(
        ...     row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True
        ... )
        >>> A
        <3x4 sparse matrix of type '<class 'numpy.float64'>'
        with 3 stored elements in COOrdinate format>
        >>> A.todense()
        matrix([[0., 0., 1., 3.],
        [3., 0., 0., 0.],
        [0., 0., 0., 0.]])

        >>> rows
        [(1, 1), (1, 2), (2, 1)]
        >>> columns
        [('a', 0), ('a', 1), ('b', 0), ('b', 1)]
        """
        from pandas.core.arrays.sparse.scipy_sparse import sparse_series_to_coo

        A, rows, columns = sparse_series_to_coo(
            self._parent, row_levels, column_levels, sort_labels=sort_labels
        )
        return A, rows, columns

    def to_dense(self) -> Series:
        """
        Convert a Series from sparse values to dense.

        Returns
        -------
        Series:
            A Series with the same values, stored as a dense array.

        Examples
        --------
        >>> series = pd.Series(pd.arrays.SparseArray([0, 1, 0]))
        >>> series
        0    0
        1    1
        2    0
        dtype: Sparse[int64, 0]

        >>> series.sparse.to_dense()
        0    0
        1    1
        2    0
        dtype: int64
        """
        from pandas import Series

        # Index and name are carried over from the parent unchanged.
        return Series(
            self._parent.array.to_dense(),
            index=self._parent.index,
            name=self._parent.name,
            copy=False,
        )
224
225
class SparseFrameAccessor(BaseAccessor, PandasDelegate):
    """
    DataFrame accessor for sparse data.
    """

    def _validate(self, data):
        """Raise AttributeError unless every column has a SparseDtype."""
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(self._validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
        """
        Create a new DataFrame from a scipy sparse matrix.

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a
            :class:`arrays.SparseArray`.

        Raises
        ------
        ValueError
            If the length of ``index`` or ``columns`` does not match the
            shape of ``data``.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0  0.0  0.0
        1  0.0  1.0  0.0
        2  0.0  0.0  1.0
        """
        from pandas._libs.sparse import IntIndex

        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        n_rows, n_columns = data.shape
        # We need to make sure indices are sorted, as we create
        # IntIndex with no input validation (i.e. check_integrity=False ).
        # Indices may already be sorted in scipy in which case this adds
        # a small overhead.
        data.sort_indices()
        indices = data.indices
        indptr = data.indptr
        array_data = data.data
        dtype = SparseDtype(array_data.dtype, 0)
        arrays = []
        for i in range(n_columns):
            # indptr[i]:indptr[i + 1] delimits column i in the CSC buffers.
            sl = slice(indptr[i], indptr[i + 1])
            idx = IntIndex(n_rows, indices[sl], check_integrity=False)
            arr = SparseArray._simple_new(array_data[sl], idx, dtype)
            arrays.append(arr)
        return DataFrame._from_arrays(
            arrays, columns=columns, index=index, verify_integrity=False
        )

    def to_dense(self) -> DataFrame:
        """
        Convert a DataFrame with sparse values to dense.

        Returns
        -------
        DataFrame
            A DataFrame with the same values stored as dense arrays.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])})
        >>> df.sparse.to_dense()
           A
        0  0
        1  1
        2  0
        """
        from pandas import DataFrame

        # Key the dense arrays by position rather than by label: keying by
        # label collapses duplicate column names into a single entry, which
        # would give every duplicate column the data of the last one.
        dense_columns = [col.array.to_dense() for _, col in self._parent.items()]
        result = DataFrame(dict(enumerate(dense_columns)), index=self._parent.index)
        # Restore the original column labels, duplicates included.
        result.columns = self._parent.columns
        return result

    def to_coo(self):
        """
        Return the contents of the frame as a sparse SciPy COO matrix.

        Returns
        -------
        scipy.sparse.spmatrix
            If the caller is heterogeneous and contains booleans or objects,
            the result will be of dtype=object. See Notes.

        Raises
        ------
        ValueError
            If any column has a fill value other than 0.

        Notes
        -----
        The dtype will be the lowest-common-denominator type (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. By numpy.find_common_type convention, mixing int64 and
        uint64 will result in a float64 dtype.
        """
        import_optional_dependency("scipy")
        from scipy.sparse import coo_matrix

        dtype = find_common_type(self._parent.dtypes.to_list())
        if isinstance(dtype, SparseDtype):
            # The COO matrix stores plain values, so use the subtype.
            dtype = dtype.subtype

        cols, rows, data = [], [], []
        for col, (_, ser) in enumerate(self._parent.items()):
            sp_arr = ser.array
            if sp_arr.fill_value != 0:
                raise ValueError("fill value must be 0 when converting to COO matrix")

            row = sp_arr.sp_index.indices
            # One column coordinate per stored value of this column.
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            data.append(sp_arr.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        data = np.concatenate(data)
        return coo_matrix((data, (rows, cols)), shape=self._parent.shape)

    @property
    def density(self) -> float:
        """
        Ratio of non-sparse points to total (dense) data points.
        """
        # Computed as the mean of the per-column densities.
        tmp = np.mean([column.array.density for _, column in self._parent.items()])
        return tmp

    @staticmethod
    def _prep_index(data, index, columns):
        """
        Build row and column indexes matching the shape of ``data``.

        A missing ``index`` or ``columns`` is replaced by a default index of
        the matching length; a supplied one is converted with
        ``ensure_index``.

        Raises
        ------
        ValueError
            If the length of ``columns`` or ``index`` does not match the
            corresponding dimension of ``data``.
        """
        from pandas.core.indexes.api import (
            default_index,
            ensure_index,
        )

        N, K = data.shape
        if index is None:
            index = default_index(N)
        else:
            index = ensure_index(index)
        if columns is None:
            columns = default_index(K)
        else:
            columns = ensure_index(columns)

        if len(columns) != K:
            raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}")
        if len(index) != N:
            raise ValueError(f"Index length mismatch: {len(index)} vs. {N}")
        return index, columns