Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/arrays/sparse/accessor.py: 29%

1"""Sparse accessor"""

2from __future__ import annotations

4from typing import TYPE_CHECKING

6import numpy as np

8from pandas.compat._optional import import_optional_dependency

10from pandas.core.dtypes.cast import find_common_type

12from pandas.core.accessor import (

13 PandasDelegate,

14 delegate_names,

15)

16from pandas.core.arrays.sparse.array import SparseArray

17from pandas.core.arrays.sparse.dtype import SparseDtype

19if TYPE_CHECKING:

20 from pandas import (

21 DataFrame,

22 Series,

23 )

class BaseAccessor:
    """
    Shared base for the ``.sparse`` accessors.

    The constructor stores the wrapped object on ``_parent`` and then hands
    it to ``_validate``; the implementation here always raises
    ``NotImplementedError``, so concrete accessors must override it.
    """

    # Error text raised by the concrete accessors when the data is not sparse.
    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def __init__(self, data=None) -> None:
        # Keep the reference first, then validate that same object.
        self._parent = data
        self._validate(self._parent)

    def _validate(self, data):
        """Validate ``data``; must be overridden by subclasses."""
        raise NotImplementedError()

@delegate_names(
    SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property"
)
class SparseAccessor(BaseAccessor, PandasDelegate):
    """
    Accessor for a Series holding sparse values.

    Exposes the ``npoints``, ``density``, ``fill_value`` and ``sp_values``
    properties of the underlying :class:`arrays.SparseArray`, and provides
    conversion to and from ``scipy.sparse.coo_matrix`` as well as to a dense
    Series.
    """

    def _validate(self, data):
        """Raise ``AttributeError`` unless ``data.dtype`` is a SparseDtype."""
        if not isinstance(data.dtype, SparseDtype):
            raise AttributeError(self._validation_msg)

    def _delegate_property_get(self, name, *args, **kwargs):
        """Read the delegated property ``name`` from the parent's array."""
        return getattr(self._parent.array, name)

    def _delegate_method(self, name, *args, **kwargs):
        """
        Dispatch a delegated method call by name.

        Only ``"from_coo"`` and ``"to_coo"`` are supported; any other name
        raises ``ValueError``.
        """
        if name == "from_coo":
            return self.from_coo(*args, **kwargs)
        if name == "to_coo":
            return self.to_coo(*args, **kwargs)
        # Name the offending method so the failure can be diagnosed.
        raise ValueError(f"Unsupported delegated method: {name!r}")

    @classmethod
    def from_coo(cls, A, dense_index: bool = False) -> Series:
        """
        Create a Series with sparse values from a scipy.sparse.coo_matrix.

        Parameters
        ----------
        A : scipy.sparse.coo_matrix
        dense_index : bool, default False
            If False (default), the index consists of only the
            coords of the non-null entries of the original coo_matrix.
            If True, the index consists of the full sorted
            (row, col) coordinates of the coo_matrix.

        Returns
        -------
        s : Series
            A Series with sparse values.

        Examples
        --------
        >>> from scipy import sparse

        >>> A = sparse.coo_matrix(
        ...     ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)
        ... )
        >>> A
        <3x4 sparse matrix of type '<class 'numpy.float64'>'
        with 3 stored elements in COOrdinate format>

        >>> A.todense()
        matrix([[0., 0., 1., 2.],
        [3., 0., 0., 0.],
        [0., 0., 0., 0.]])

        >>> ss = pd.Series.sparse.from_coo(A)
        >>> ss
        0  2    1.0
           3    2.0
        1  0    3.0
        dtype: Sparse[float64, nan]
        """
        from pandas import Series
        from pandas.core.arrays.sparse.scipy_sparse import coo_to_sparse_series

        result = coo_to_sparse_series(A, dense_index=dense_index)
        # Re-wrap the converted array and index in a plain Series without copying.
        result = Series(result.array, index=result.index, copy=False)

        return result

    def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False):
        """
        Create a scipy.sparse.coo_matrix from a Series with MultiIndex.

        Use row_levels and column_levels to determine the row and column
        coordinates respectively. row_levels and column_levels are the names
        (labels) or numbers of the levels. {row_levels, column_levels} must be
        a partition of the MultiIndex level names (or numbers).

        Parameters
        ----------
        row_levels : tuple/list
        column_levels : tuple/list
        sort_labels : bool, default False
            Sort the row and column labels before forming the sparse matrix.
            When `row_levels` and/or `column_levels` refer to a single level,
            set to `True` for a faster execution.

        Returns
        -------
        y : scipy.sparse.coo_matrix
        rows : list (row labels)
        columns : list (column labels)

        Examples
        --------
        >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])
        >>> s.index = pd.MultiIndex.from_tuples(
        ...     [
        ...         (1, 2, "a", 0),
        ...         (1, 2, "a", 1),
        ...         (1, 1, "b", 0),
        ...         (1, 1, "b", 1),
        ...         (2, 1, "b", 0),
        ...         (2, 1, "b", 1)
        ...     ],
        ...     names=["A", "B", "C", "D"],
        ... )
        >>> s
        A  B  C  D
        1  2  a  0    3.0
                 1    NaN
           1  b  0    1.0
                 1    3.0
        2  1  b  0    NaN
                 1    NaN
        dtype: float64

        >>> ss = s.astype("Sparse")
        >>> ss
        A  B  C  D
        1  2  a  0    3.0
                 1    NaN
           1  b  0    1.0
                 1    3.0
        2  1  b  0    NaN
                 1    NaN
        dtype: Sparse[float64, nan]

        >>> A, rows, columns = ss.sparse.to_coo(
        ...     row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True
        ... )
        >>> A
        <3x4 sparse matrix of type '<class 'numpy.float64'>'
        with 3 stored elements in COOrdinate format>
        >>> A.todense()
        matrix([[0., 0., 1., 3.],
        [3., 0., 0., 0.],
        [0., 0., 0., 0.]])

        >>> rows
        [(1, 1), (1, 2), (2, 1)]
        >>> columns
        [('a', 0), ('a', 1), ('b', 0), ('b', 1)]
        """
        from pandas.core.arrays.sparse.scipy_sparse import sparse_series_to_coo

        A, rows, columns = sparse_series_to_coo(
            self._parent, row_levels, column_levels, sort_labels=sort_labels
        )
        return A, rows, columns

    def to_dense(self) -> Series:
        """
        Convert a Series from sparse values to dense.

        Returns
        -------
        Series
            A Series with the same values, stored as a dense array.

        Examples
        --------
        >>> series = pd.Series(pd.arrays.SparseArray([0, 1, 0]))
        >>> series
        0    0
        1    1
        2    0
        dtype: Sparse[int64, 0]

        >>> series.sparse.to_dense()
        0    0
        1    1
        2    0
        dtype: int64
        """
        from pandas import Series

        parent = self._parent
        # The parent's index and name are carried over to the dense result.
        return Series(
            parent.array.to_dense(),
            index=parent.index,
            name=parent.name,
            copy=False,
        )

224

225

class SparseFrameAccessor(BaseAccessor, PandasDelegate):
    """
    DataFrame accessor for sparse data.
    """

    def _validate(self, data):
        """Raise ``AttributeError`` unless every column dtype is a SparseDtype."""
        dtypes = data.dtypes
        if not all(isinstance(t, SparseDtype) for t in dtypes):
            raise AttributeError(self._validation_msg)

    @classmethod
    def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:
        """
        Create a new DataFrame from a scipy sparse matrix.

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            Must be convertible to csc format.
        index, columns : Index, optional
            Row and column labels to use for the resulting DataFrame.
            Defaults to a RangeIndex.

        Returns
        -------
        DataFrame
            Each column of the DataFrame is stored as a
            :class:`arrays.SparseArray`.

        Raises
        ------
        ValueError
            If the length of ``index`` or ``columns`` does not match the
            shape of ``data``.

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.eye(3)
        >>> pd.DataFrame.sparse.from_spmatrix(mat)
             0    1    2
        0  1.0  0.0  0.0
        1  0.0  1.0  0.0
        2  0.0  0.0  1.0
        """
        from pandas._libs.sparse import IntIndex

        from pandas import DataFrame

        data = data.tocsc()
        index, columns = cls._prep_index(data, index, columns)
        n_rows, n_columns = data.shape
        # We need to make sure indices are sorted, as we create
        # IntIndex with no input validation (i.e. check_integrity=False ).
        # Indices may already be sorted in scipy in which case this adds
        # a small overhead.
        data.sort_indices()
        indices = data.indices
        indptr = data.indptr
        array_data = data.data
        dtype = SparseDtype(array_data.dtype, 0)
        arrays = []
        for i in range(n_columns):
            # indptr[i]:indptr[i + 1] delimits column i inside the csc buffers.
            sl = slice(indptr[i], indptr[i + 1])
            idx = IntIndex(n_rows, indices[sl], check_integrity=False)
            arr = SparseArray._simple_new(array_data[sl], idx, dtype)
            arrays.append(arr)
        return DataFrame._from_arrays(
            arrays, columns=columns, index=index, verify_integrity=False
        )

    def to_dense(self) -> DataFrame:
        """
        Convert a DataFrame with sparse values to dense.

        Returns
        -------
        DataFrame
            A DataFrame with the same values stored as dense arrays.

        Examples
        --------
        >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])})
        >>> df.sparse.to_dense()
           A
        0  0
        1  1
        2  0
        """
        from pandas import DataFrame

        parent = self._parent
        # Key the dense arrays by position rather than by label: a dict keyed
        # by label would collapse duplicated column labels into one entry and
        # lose data. The original labels are attached afterwards.
        dense = {
            pos: ser.array.to_dense() for pos, (_, ser) in enumerate(parent.items())
        }
        result = DataFrame(dense, index=parent.index)
        result.columns = parent.columns
        return result

    def to_coo(self):
        """
        Return the contents of the frame as a sparse SciPy COO matrix.

        Returns
        -------
        scipy.sparse.spmatrix
            If the caller is heterogeneous and contains booleans or objects,
            the result will be of dtype=object. See Notes.

        Raises
        ------
        ValueError
            If any column has a fill value other than 0.

        Notes
        -----
        The dtype will be the lowest-common-denominator type (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. By numpy.find_common_type convention, mixing int64 and
        uint64 will result in a float64 dtype.
        """
        import_optional_dependency("scipy")
        from scipy.sparse import coo_matrix

        dtype = find_common_type(self._parent.dtypes.to_list())
        if isinstance(dtype, SparseDtype):
            # The stored values are cast to the underlying subtype.
            dtype = dtype.subtype

        cols, rows, data = [], [], []
        for col, (_, ser) in enumerate(self._parent.items()):
            sp_arr = ser.array
            if sp_arr.fill_value != 0:
                raise ValueError("fill value must be 0 when converting to COO matrix")

            row = sp_arr.sp_index.indices
            # One column coordinate per stored value in this column.
            cols.append(np.repeat(col, len(row)))
            rows.append(row)
            data.append(sp_arr.sp_values.astype(dtype, copy=False))

        cols = np.concatenate(cols)
        rows = np.concatenate(rows)
        data = np.concatenate(data)
        return coo_matrix((data, (rows, cols)), shape=self._parent.shape)

    @property
    def density(self) -> float:
        """
        Ratio of non-sparse points to total (dense) data points.
        """
        # Mean of the per-column densities.
        return np.mean([column.array.density for _, column in self._parent.items()])

    @staticmethod
    def _prep_index(data, index, columns):
        """
        Build the row and column labels for a frame of ``data.shape``.

        ``None`` is replaced by a default index of the matching length;
        anything else is passed through ``ensure_index``. Raises
        ``ValueError`` when a resulting length does not match the shape
        (columns are checked before the index).
        """
        from pandas.core.indexes.api import (
            default_index,
            ensure_index,
        )

        N, K = data.shape
        if index is None:
            index = default_index(N)
        else:
            index = ensure_index(index)
        if columns is None:
            columns = default_index(K)
        else:
            columns = ensure_index(columns)

        if len(columns) != K:
            raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}")
        if len(index) != N:
            raise ValueError(f"Index length mismatch: {len(index)} vs. {N}")
        return index, columns