Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/arrays/sparse/accessor.py: 29%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

110 statements  

1"""Sparse accessor""" 

2from __future__ import annotations 

3 

4from typing import TYPE_CHECKING 

5 

6import numpy as np 

7 

8from pandas.compat._optional import import_optional_dependency 

9 

10from pandas.core.dtypes.cast import find_common_type 

11 

12from pandas.core.accessor import ( 

13 PandasDelegate, 

14 delegate_names, 

15) 

16from pandas.core.arrays.sparse.array import SparseArray 

17from pandas.core.arrays.sparse.dtype import SparseDtype 

18 

19if TYPE_CHECKING: 

20 from pandas import ( 

21 DataFrame, 

22 Series, 

23 ) 

24 

25 

class BaseAccessor:
    """
    Common base class for the ``.sparse`` accessors.

    The wrapped object is stored on ``_parent`` and then handed to
    :meth:`_validate`. Subclasses are expected to override ``_validate``;
    the implementation here always raises ``NotImplementedError``.
    """

    # Message used by the subclasses in this module when the wrapped data
    # is not sparse.
    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def __init__(self, data=None) -> None:
        # The parent is assigned before validation runs, so ``_parent`` is
        # already set if ``_validate`` raises.
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """
        Check that ``data`` can be used with this accessor.

        Raises
        ------
        NotImplementedError
            Always, in this base class; subclasses provide the real check.
        """
        raise NotImplementedError

35 

36 

@delegate_names(
    SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property"
)
class SparseAccessor(BaseAccessor, PandasDelegate):
    """
    Accessor for sparse data in a Series.

    Exposes the delegated ``SparseArray`` properties ``npoints``,
    ``density``, ``fill_value`` and ``sp_values``, and provides conversions
    from and to other sparse matrix data types.
    """

    def _validate(self, data):
        """
        Ensure ``data`` has a sparse dtype.

        Raises
        ------
        AttributeError
            If ``data.dtype`` is not a ``SparseDtype``.
        """
        if not isinstance(data.dtype, SparseDtype):
            raise AttributeError(self._validation_msg)

    def _delegate_property_get(self, name, *args, **kwargs):
        """Read the delegated property ``name`` from the parent's array."""
        return getattr(self._parent.array, name)

    def _delegate_method(self, name, *args, **kwargs):
        """
        Dispatch a delegated method call by name.

        Only ``"from_coo"`` and ``"to_coo"`` are supported.

        Raises
        ------
        ValueError
            If ``name`` is not one of the supported method names.
        """
        if name == "from_coo":
            return self.from_coo(*args, **kwargs)
        elif name == "to_coo":
            return self.to_coo(*args, **kwargs)
        else:
            # Name the offending method so the failure is diagnosable.
            raise ValueError(f"Unsupported delegated method: {name!r}")

    @classmethod
    def from_coo(cls, A, dense_index: bool = False) -> Series:
        """
        Create a Series with sparse values from a scipy.sparse.coo_matrix.

        Parameters
        ----------
        A : scipy.sparse.coo_matrix
        dense_index : bool, default False
            If False (default), the index consists of only the
            coords of the non-null entries of the original coo_matrix.
            If True, the index consists of the full sorted
            (row, col) coordinates of the coo_matrix.

        Returns
        -------
        s : Series
            A Series with sparse values.

        Examples
        --------
        >>> from scipy import sparse

        >>> A = sparse.coo_matrix(
        ...     ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)
        ... )
        >>> A
        <3x4 sparse matrix of type '<class 'numpy.float64'>'
        with 3 stored elements in COOrdinate format>

        >>> A.todense()
        matrix([[0., 0., 1., 2.],
                [3., 0., 0., 0.],
                [0., 0., 0., 0.]])

        >>> ss = pd.Series.sparse.from_coo(A)
        >>> ss
        0  2    1.0
           3    2.0
        1  0    3.0
        dtype: Sparse[float64, nan]
        """
        from pandas import Series
        from pandas.core.arrays.sparse.scipy_sparse import coo_to_sparse_series

        result = coo_to_sparse_series(A, dense_index=dense_index)
        # Re-wrap the array and index in a plain Series without copying.
        result = Series(result.array, index=result.index, copy=False)

        return result

    def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False):
        """
        Create a scipy.sparse.coo_matrix from a Series with MultiIndex.

        Use row_levels and column_levels to determine the row and column
        coordinates respectively. row_levels and column_levels are the names
        (labels) or numbers of the levels. {row_levels, column_levels} must be
        a partition of the MultiIndex level names (or numbers).

        Parameters
        ----------
        row_levels : tuple/list
        column_levels : tuple/list
        sort_labels : bool, default False
            Sort the row and column labels before forming the sparse matrix.
            When `row_levels` and/or `column_levels` refer to a single level,
            set to `True` for a faster execution.

        Returns
        -------
        y : scipy.sparse.coo_matrix
        rows : list (row labels)
        columns : list (column labels)

        Examples
        --------
        >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
        >>> s.index = pd.MultiIndex.from_tuples(
        ...     [
        ...         (1, 2, "a", 0),
        ...         (1, 2, "a", 1),
        ...         (1, 1, "b", 0),
        ...         (1, 1, "b", 1),
        ...         (2, 1, "b", 0),
        ...         (2, 1, "b", 1)
        ...     ],
        ...     names=["A", "B", "C", "D"],
        ... )
        >>> s
        A  B  C  D
        1  2  a  0    3.0
                 1    NaN
           1  b  0    1.0
                 1    3.0
        2  1  b  0    NaN
                 1    NaN
        dtype: float64

        >>> ss = s.astype("Sparse")
        >>> ss
        A  B  C  D
        1  2  a  0    3.0
                 1    NaN
           1  b  0    1.0
                 1    3.0
        2  1  b  0    NaN
                 1    NaN
        dtype: Sparse[float64, nan]

        >>> A, rows, columns = ss.sparse.to_coo(
        ...     row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True
        ... )
        >>> A
        <3x4 sparse matrix of type '<class 'numpy.float64'>'
        with 3 stored elements in COOrdinate format>
        >>> A.todense()
        matrix([[0., 0., 1., 3.],
                [3., 0., 0., 0.],
                [0., 0., 0., 0.]])

        >>> rows
        [(1, 1), (1, 2), (2, 1)]
        >>> columns
        [('a', 0), ('a', 1), ('b', 0), ('b', 1)]
        """
        from pandas.core.arrays.sparse.scipy_sparse import sparse_series_to_coo

        A, rows, columns = sparse_series_to_coo(
            self._parent, row_levels, column_levels, sort_labels=sort_labels
        )
        return A, rows, columns

    def to_dense(self) -> Series:
        """
        Convert a Series from sparse values to dense.

        Returns
        -------
        Series
            A Series with the same values, stored as a dense array.

        Examples
        --------
        >>> series = pd.Series(pd.arrays.SparseArray([0, 1, 0]))
        >>> series
        0    0
        1    1
        2    0
        dtype: Sparse[int64, 0]

        >>> series.sparse.to_dense()
        0    0
        1    1
        2    0
        dtype: int64
        """
        from pandas import Series

        # Keep the parent's index and name; only the values are densified.
        return Series(
            self._parent.array.to_dense(),
            index=self._parent.index,
            name=self._parent.name,
            copy=False,
        )

224 

225 

class SparseFrameAccessor(BaseAccessor, PandasDelegate):
    """
    DataFrame accessor for sparse data.
    """

    def _validate(self, data):
        """
        Ensure every column of ``data`` has a sparse dtype.

        Raises
        ------
        AttributeError
            If any entry of ``data.dtypes`` is not a ``SparseDtype``.
        """
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(self._validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
        """
        Create a new DataFrame from a scipy sparse matrix.

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a
            :class:`arrays.SparseArray`.

        Raises
        ------
        ValueError
            If ``index`` or ``columns`` does not match the matrix shape.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0  0.0  0.0
        1  0.0  1.0  0.0
        2  0.0  0.0  1.0
        """
        from pandas._libs.sparse import IntIndex

        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        n_rows, n_columns = data.shape
        # We need to make sure indices are sorted, as we create
        # IntIndex with no input validation (i.e. check_integrity=False ).
        # Indices may already be sorted in scipy in which case this adds
        # a small overhead.
        # NOTE(review): if ``data`` was already CSC, ``tocsc()`` may hand back
        # the same object, in which case this sorts the caller's matrix in
        # place -- confirm that is acceptable.
        data.sort_indices()
        indices = data.indices
        indptr = data.indptr
        array_data = data.data
        dtype = SparseDtype(array_data.dtype, 0)
        arrays = []
        for i in range(n_columns):
            # indptr[i]:indptr[i + 1] delimits column i's stored entries.
            sl = slice(indptr[i], indptr[i + 1])
            idx = IntIndex(n_rows, indices[sl], check_integrity=False)
            arr = SparseArray._simple_new(array_data[sl], idx, dtype)
            arrays.append(arr)
        return DataFrame._from_arrays(
            arrays, columns=columns, index=index, verify_integrity=False
        )

    def to_dense(self) -> DataFrame:
        """
        Convert a DataFrame with sparse values to dense.

        Returns
        -------
        DataFrame
            A DataFrame with the same values stored as dense arrays.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])})
        >>> df.sparse.to_dense()
           A
        0  0
        1  1
        2  0
        """
        from pandas import DataFrame

        # Densify by position rather than by label: keying a dict on the
        # column labels would collapse duplicate labels onto a single entry
        # and silently return the wrong values for those columns.
        dense_columns = [col.array.to_dense() for _, col in self._parent.items()]
        result = DataFrame(dict(enumerate(dense_columns)), index=self._parent.index)
        # Re-attach the original column labels (duplicates included).
        result.columns = self._parent.columns
        return result

    def to_coo(self):
        """
        Return the contents of the frame as a sparse SciPy COO matrix.

        Returns
        -------
        scipy.sparse.spmatrix
            If the caller is heterogeneous and contains booleans or objects,
            the result will be of dtype=object. See Notes.

        Raises
        ------
        ValueError
            If any column has a fill value other than 0.

        Notes
        -----
        The dtype will be the lowest-common-denominator type (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. By numpy.find_common_type convention, mixing int64 and
        uint64 will result in a float64 dtype.
        """
        import_optional_dependency("scipy")
        from scipy.sparse import coo_matrix

        dtype = find_common_type(self._parent.dtypes.to_list())
        if isinstance(dtype, SparseDtype):
            # The COO matrix needs the underlying value dtype.
            dtype = dtype.subtype

        cols, rows, data = [], [], []
        for col, (_, ser) in enumerate(self._parent.items()):
            sp_arr = ser.array
            if sp_arr.fill_value != 0:
                raise ValueError("fill value must be 0 when converting to COO matrix")

            row = sp_arr.sp_index.indices
            # Every stored value of this column shares the same column number.
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            data.append(sp_arr.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        data = np.concatenate(data)
        return coo_matrix((data, (rows, cols)), shape=self._parent.shape)

    @property
    def density(self) -> float:
        """
        Ratio of non-sparse points to total (dense) data points.

        Computed as the mean of the per-column array densities.
        """
        tmp = np.mean([column.array.density for _, column in self._parent.items()])
        return tmp

    @staticmethod
    def _prep_index(data, index, columns):
        """
        Build row and column indexes matching ``data.shape``.

        ``None`` is replaced by a default index of the right length; anything
        else is passed through ``ensure_index``.

        Raises
        ------
        ValueError
            If the length of ``columns`` or ``index`` does not match the
            corresponding dimension of ``data``.
        """
        from pandas.core.indexes.api import (
            default_index,
            ensure_index,
        )

        N, K = data.shape
        if index is None:
            index = default_index(N)
        else:
            index = ensure_index(index)
        if columns is None:
            columns = default_index(K)
        else:
            columns = ensure_index(columns)

        if len(columns) != K:
            raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}")
        if len(index) != N:
            raise ValueError(f"Index length mismatch: {len(index)} vs. {N}")
        return index, columns