Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/arrays/sparse/accessor.py: 29%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

110 statements  

1"""Sparse accessor""" 

2from __future__ import annotations 

3 

4from typing import TYPE_CHECKING 

5 

6import numpy as np 

7 

8from pandas.compat._optional import import_optional_dependency 

9 

10from pandas.core.dtypes.cast import find_common_type 

11from pandas.core.dtypes.dtypes import SparseDtype 

12 

13from pandas.core.accessor import ( 

14 PandasDelegate, 

15 delegate_names, 

16) 

17from pandas.core.arrays.sparse.array import SparseArray 

18 

19if TYPE_CHECKING: 

20 from pandas import ( 

21 DataFrame, 

22 Series, 

23 ) 

24 

25 

class BaseAccessor:
    """
    Common base for the ``.sparse`` accessors.

    Stores the wrapped object on ``_parent`` and immediately passes it to
    ``_validate``, which concrete accessors are expected to override.
    """

    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def __init__(self, data=None) -> None:
        # The parent is recorded first, then validated; a failing validation
        # propagates out of the constructor.
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """Validate ``data``; the base implementation always raises."""
        raise NotImplementedError()

35 

36 

@delegate_names(
    SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property"
)
class SparseAccessor(BaseAccessor, PandasDelegate):
    """
    Accessor for sparse data stored in a Series.

    Exposes the ``npoints``, ``density``, ``fill_value`` and ``sp_values``
    properties of the underlying array, and converts from and to other
    sparse matrix data types.

    Examples
    --------
    >>> ser = pd.Series([0, 0, 2, 2, 2], dtype="Sparse[int]")
    >>> ser.sparse.density
    0.6
    >>> ser.sparse.sp_values
    array([2, 2, 2])
    """

    def _validate(self, data):
        """Raise ``AttributeError`` unless ``data`` has a ``SparseDtype``."""
        if not isinstance(data.dtype, SparseDtype):
            raise AttributeError(self._validation_msg)

    def _delegate_property_get(self, name: str, *args, **kwargs):
        """Read the delegated property ``name`` from the parent's array."""
        return getattr(self._parent.array, name)

    def _delegate_method(self, name: str, *args, **kwargs):
        """
        Dispatch a delegated method call by name.

        Only ``from_coo`` and ``to_coo`` are handled.

        Raises
        ------
        ValueError
            If ``name`` is not one of the handled method names.
        """
        if name == "from_coo":
            return self.from_coo(*args, **kwargs)
        elif name == "to_coo":
            return self.to_coo(*args, **kwargs)
        else:
            # Name the offending method so a failed delegation is diagnosable.
            raise ValueError(
                f"Cannot delegate method {name!r}; "
                "only 'from_coo' and 'to_coo' are supported."
            )

    @classmethod
    def from_coo(cls, A, dense_index: bool = False) -> Series:
        """
        Create a Series with sparse values from a scipy.sparse.coo_matrix.

        Parameters
        ----------
        A : scipy.sparse.coo_matrix
        dense_index : bool, default False
            If False (default), the index consists of only the
            coords of the non-null entries of the original coo_matrix.
            If True, the index consists of the full sorted
            (row, col) coordinates of the coo_matrix.

        Returns
        -------
        s : Series
            A Series with sparse values.

        Examples
        --------
        >>> from scipy import sparse

        >>> A = sparse.coo_matrix(
        ...     ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)
        ... )
        >>> A
        <COOrdinate sparse matrix of dtype 'float64'
            with 3 stored elements and shape (3, 4)>

        >>> A.todense()
        matrix([[0., 0., 1., 2.],
                [3., 0., 0., 0.],
                [0., 0., 0., 0.]])

        >>> ss = pd.Series.sparse.from_coo(A)
        >>> ss
        0  2    1.0
           3    2.0
        1  0    3.0
        dtype: Sparse[float64, nan]
        """
        from pandas import Series
        from pandas.core.arrays.sparse.scipy_sparse import coo_to_sparse_series

        result = coo_to_sparse_series(A, dense_index=dense_index)
        # Re-wrap the converted array and index in a plain Series without copying.
        result = Series(result.array, index=result.index, copy=False)

        return result

    def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False):
        """
        Create a scipy.sparse.coo_matrix from a Series with MultiIndex.

        Use row_levels and column_levels to determine the row and column
        coordinates respectively. row_levels and column_levels are the names
        (labels) or numbers of the levels. {row_levels, column_levels} must be
        a partition of the MultiIndex level names (or numbers).

        Parameters
        ----------
        row_levels : tuple/list
        column_levels : tuple/list
        sort_labels : bool, default False
            Sort the row and column labels before forming the sparse matrix.
            When `row_levels` and/or `column_levels` refer to a single level,
            set to `True` for a faster execution.

        Returns
        -------
        y : scipy.sparse.coo_matrix
        rows : list (row labels)
        columns : list (column labels)

        Examples
        --------
        >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
        >>> s.index = pd.MultiIndex.from_tuples(
        ...     [
        ...         (1, 2, "a", 0),
        ...         (1, 2, "a", 1),
        ...         (1, 1, "b", 0),
        ...         (1, 1, "b", 1),
        ...         (2, 1, "b", 0),
        ...         (2, 1, "b", 1)
        ...     ],
        ...     names=["A", "B", "C", "D"],
        ... )
        >>> s
        A  B  C  D
        1  2  a  0    3.0
                 1    NaN
           1  b  0    1.0
                 1    3.0
        2  1  b  0    NaN
                 1    NaN
        dtype: float64

        >>> ss = s.astype("Sparse")
        >>> ss
        A  B  C  D
        1  2  a  0    3.0
                 1    NaN
           1  b  0    1.0
                 1    3.0
        2  1  b  0    NaN
                 1    NaN
        dtype: Sparse[float64, nan]

        >>> A, rows, columns = ss.sparse.to_coo(
        ...     row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True
        ... )
        >>> A
        <COOrdinate sparse matrix of dtype 'float64'
            with 3 stored elements and shape (3, 4)>
        >>> A.todense()
        matrix([[0., 0., 1., 3.],
                [3., 0., 0., 0.],
                [0., 0., 0., 0.]])

        >>> rows
        [(1, 1), (1, 2), (2, 1)]
        >>> columns
        [('a', 0), ('a', 1), ('b', 0), ('b', 1)]
        """
        from pandas.core.arrays.sparse.scipy_sparse import sparse_series_to_coo

        A, rows, columns = sparse_series_to_coo(
            self._parent, row_levels, column_levels, sort_labels=sort_labels
        )
        return A, rows, columns

    def to_dense(self) -> Series:
        """
        Convert a Series from sparse values to dense.

        Returns
        -------
        Series:
            A Series with the same values, stored as a dense array.

        Examples
        --------
        >>> series = pd.Series(pd.arrays.SparseArray([0, 1, 0]))
        >>> series
        0    0
        1    1
        2    0
        dtype: Sparse[int64, 0]

        >>> series.sparse.to_dense()
        0    0
        1    1
        2    0
        dtype: int64
        """
        from pandas import Series

        # Carry the parent's index and name over to the dense result.
        return Series(
            self._parent.array.to_dense(),
            index=self._parent.index,
            name=self._parent.name,
            copy=False,
        )

232 

233 

class SparseFrameAccessor(BaseAccessor, PandasDelegate):
    """
    DataFrame accessor for sparse data.

    Examples
    --------
    >>> df = pd.DataFrame({"a": [1, 2, 0, 0],
    ...                   "b": [3, 0, 0, 4]}, dtype="Sparse[int]")
    >>> df.sparse.density
    0.5
    """

    def _validate(self, data):
        """Raise ``AttributeError`` unless every column has a ``SparseDtype``."""
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(self._validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
        """
        Create a new DataFrame from a scipy sparse matrix.

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a
            :class:`arrays.SparseArray`.

        Raises
        ------
        ValueError
            If the length of ``index`` or ``columns`` does not match the
            shape of ``data``.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3, dtype=float)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0    0    0
        1    0  1.0    0
        2    0    0  1.0
        """
        from pandas._libs.sparse import IntIndex

        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        n_rows, n_columns = data.shape
        # We need to make sure indices are sorted, as we create
        # IntIndex with no input validation (i.e. check_integrity=False ).
        # Indices may already be sorted in scipy in which case this adds
        # a small overhead.
        data.sort_indices()
        indices = data.indices
        indptr = data.indptr
        array_data = data.data
        dtype = SparseDtype(array_data.dtype, 0)
        arrays = []
        for i in range(n_columns):
            # indptr[i]:indptr[i + 1] delimits the stored entries of column i.
            sl = slice(indptr[i], indptr[i + 1])
            idx = IntIndex(n_rows, indices[sl], check_integrity=False)
            arr = SparseArray._simple_new(array_data[sl], idx, dtype)
            arrays.append(arr)
        return DataFrame._from_arrays(
            arrays, columns=columns, index=index, verify_integrity=False
        )

    def to_dense(self) -> DataFrame:
        """
        Convert a DataFrame with sparse values to dense.

        Columns are converted by position, so frames with duplicate column
        labels keep each column's own values.

        Returns
        -------
        DataFrame
            A DataFrame with the same values stored as dense arrays.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])})
        >>> df.sparse.to_dense()
           A
        0  0
        1  1
        2  0
        """
        from pandas import DataFrame

        # Key the dense arrays by position rather than by label: a dict keyed
        # by label would collapse columns that share a label.
        data = {
            pos: col.array.to_dense()
            for pos, (_, col) in enumerate(self._parent.items())
        }
        result = DataFrame(data, index=self._parent.index)
        # Restore the original labels (duplicates included) in one step.
        result.columns = self._parent.columns
        return result

    def to_coo(self):
        """
        Return the contents of the frame as a sparse SciPy COO matrix.

        Returns
        -------
        scipy.sparse.spmatrix
            If the caller is heterogeneous and contains booleans or objects,
            the result will be of dtype=object. See Notes.

        Raises
        ------
        ValueError
            If any column has a fill value other than 0.

        Notes
        -----
        The dtype will be the lowest-common-denominator type (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. By numpy.find_common_type convention, mixing int64 and
        uint64 will result in a float64 dtype.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})
        >>> df.sparse.to_coo()
        <COOrdinate sparse matrix of dtype 'int64'
            with 2 stored elements and shape (4, 1)>
        """
        import_optional_dependency("scipy")
        from scipy.sparse import coo_matrix

        dtype = find_common_type(self._parent.dtypes.to_list())
        if isinstance(dtype, SparseDtype):
            # The stored values are cast to the subtype of the common dtype.
            dtype = dtype.subtype

        cols, rows, data = [], [], []
        for col, (_, ser) in enumerate(self._parent.items()):
            sp_arr = ser.array
            if sp_arr.fill_value != 0:
                raise ValueError("fill value must be 0 when converting to COO matrix")

            row = sp_arr.sp_index.indices
            # One column coordinate per stored value of this column.
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            data.append(sp_arr.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        data = np.concatenate(data)
        return coo_matrix((data, (rows, cols)), shape=self._parent.shape)

    @property
    def density(self) -> float:
        """
        Ratio of non-sparse points to total (dense) data points.

        Computed as the mean of the per-column densities.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})
        >>> df.sparse.density
        0.5
        """
        return np.mean([column.array.density for _, column in self._parent.items()])

    @staticmethod
    def _prep_index(data, index, columns):
        """
        Build and check the row and column labels for ``data``.

        A missing ``index`` or ``columns`` is replaced by a default index of
        the matching length taken from ``data.shape``; supplied labels are
        passed through ``ensure_index``.

        Raises
        ------
        ValueError
            If the number of column or row labels does not match
            ``data.shape``.
        """
        from pandas.core.indexes.api import (
            default_index,
            ensure_index,
        )

        N, K = data.shape
        if index is None:
            index = default_index(N)
        else:
            index = ensure_index(index)
        if columns is None:
            columns = default_index(K)
        else:
            columns = ensure_index(columns)

        if len(columns) != K:
            raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}")
        if len(index) != N:
            raise ValueError(f"Index length mismatch: {len(index)} vs. {N}")
        return index, columns

414 return index, columns