Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/arrays/sparse/accessor.py: 29%

1"""Sparse accessor"""

2from __future__ import annotations

4from typing import TYPE_CHECKING

6import numpy as np

8from pandas.compat._optional import import_optional_dependency

10from pandas.core.dtypes.cast import find_common_type

11from pandas.core.dtypes.dtypes import SparseDtype

13from pandas.core.accessor import (

14 PandasDelegate,

15 delegate_names,

16)

17from pandas.core.arrays.sparse.array import SparseArray

19if TYPE_CHECKING:

20 from pandas import (

21 DataFrame,

22 Series,

23 )

class BaseAccessor:
    """
    Common base for the ``.sparse`` accessors.

    The wrapped object is stored on ``_parent`` and passed to ``_validate``
    during construction. The base class does not implement ``_validate``;
    subclasses provide it.
    """

    _validation_msg = "Can only use the '.sparse' accessor with Sparse data."

    def __init__(self, data=None) -> None:
        # Store the wrapped object before validating it, so ``_parent`` is
        # already set when ``_validate`` runs.
        self._parent = data
        self._validate(data)

    def _validate(self, data):
        """Validation hook; not implemented on the base class."""
        raise NotImplementedError()

37@delegate_names(

38 SparseArray, ["npoints", "density", "fill_value", "sp_values"], typ="property"

39)

40class SparseAccessor(BaseAccessor, PandasDelegate):

41 """

42 Accessor for SparseSparse from other sparse matrix data types.

44 Examples

45 --------

46 >>> ser = pd.Series([0, 0, 2, 2, 2], dtype="Sparse[int]")

47 >>> ser.sparse.density

48 0.6

49 >>> ser.sparse.sp_values

50 array([2, 2, 2])

51 """

53 def _validate(self, data):

54 if not isinstance(data.dtype, SparseDtype):

55 raise AttributeError(self._validation_msg)

57 def _delegate_property_get(self, name: str, *args, **kwargs):

58 return getattr(self._parent.array, name)

60 def _delegate_method(self, name: str, *args, **kwargs):

61 if name == "from_coo":

62 return self.from_coo(*args, **kwargs)

63 elif name == "to_coo":

64 return self.to_coo(*args, **kwargs)

65 else:

66 raise ValueError

68 @classmethod

69 def from_coo(cls, A, dense_index: bool = False) -> Series:

70 """

71 Create a Series with sparse values from a scipy.sparse.coo_matrix.

73 Parameters

74 ----------

75 A : scipy.sparse.coo_matrix

76 dense_index : bool, default False

77 If False (default), the index consists of only the

78 coords of the non-null entries of the original coo_matrix.

79 If True, the index consists of the full sorted

80 (row, col) coordinates of the coo_matrix.

82 Returns

83 -------

84 s : Series

85 A Series with sparse values.

87 Examples

88 --------

89 >>> from scipy import sparse

91 >>> A = sparse.coo_matrix(

92 ... ([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4)

93 ... )

94 >>> A

95 <COOrdinate sparse matrix of dtype 'float64'

96 with 3 stored elements and shape (3, 4)>

98 >>> A.todense()

99 matrix([[0., 0., 1., 2.],

100 [3., 0., 0., 0.],

101 [0., 0., 0., 0.]])

102

103 >>> ss = pd.Series.sparse.from_coo(A)

104 >>> ss

105 0 2 1.0

106 3 2.0

107 1 0 3.0

108 dtype: Sparse[float64, nan]

109 """

110 from pandas import Series

111 from pandas.core.arrays.sparse.scipy_sparse import coo_to_sparse_series

112

113 result = coo_to_sparse_series(A, dense_index=dense_index)

114 result = Series(result.array, index=result.index, copy=False)

115

116 return result

117

118 def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels: bool = False):

119 """

120 Create a scipy.sparse.coo_matrix from a Series with MultiIndex.

121

122 Use row_levels and column_levels to determine the row and column

123 coordinates respectively. row_levels and column_levels are the names

124 (labels) or numbers of the levels. {row_levels, column_levels} must be

125 a partition of the MultiIndex level names (or numbers).

126

127 Parameters

128 ----------

129 row_levels : tuple/list

130 column_levels : tuple/list

131 sort_labels : bool, default False

132 Sort the row and column labels before forming the sparse matrix.

133 When `row_levels` and/or `column_levels` refer to a single level,

134 set to `True` for a faster execution.

135

136 Returns

137 -------

138 y : scipy.sparse.coo_matrix

139 rows : list (row labels)

140 columns : list (column labels)

141

142 Examples

143 --------

144 >>> s = pd.Series([3.0, np.nan, 1.0, 3.0, np.nan, np.nan])

145 >>> s.index = pd.MultiIndex.from_tuples(

146 ... [

147 ... (1, 2, "a", 0),

148 ... (1, 2, "a", 1),

149 ... (1, 1, "b", 0),

150 ... (1, 1, "b", 1),

151 ... (2, 1, "b", 0),

152 ... (2, 1, "b", 1)

153 ... ],

154 ... names=["A", "B", "C", "D"],

155 ... )

156 >>> s

157 A B C D

158 1 2 a 0 3.0

159 1 NaN

160 1 b 0 1.0

161 1 3.0

162 2 1 b 0 NaN

163 1 NaN

164 dtype: float64

165

166 >>> ss = s.astype("Sparse")

167 >>> ss

168 A B C D

169 1 2 a 0 3.0

170 1 NaN

171 1 b 0 1.0

172 1 3.0

173 2 1 b 0 NaN

174 1 NaN

175 dtype: Sparse[float64, nan]

176

177 >>> A, rows, columns = ss.sparse.to_coo(

178 ... row_levels=["A", "B"], column_levels=["C", "D"], sort_labels=True

179 ... )

180 >>> A

181 <COOrdinate sparse matrix of dtype 'float64'

182 with 3 stored elements and shape (3, 4)>

183 >>> A.todense()

184 matrix([[0., 0., 1., 3.],

185 [3., 0., 0., 0.],

186 [0., 0., 0., 0.]])

187

188 >>> rows

189 [(1, 1), (1, 2), (2, 1)]

190 >>> columns

191 [('a', 0), ('a', 1), ('b', 0), ('b', 1)]

192 """

193 from pandas.core.arrays.sparse.scipy_sparse import sparse_series_to_coo

194

195 A, rows, columns = sparse_series_to_coo(

196 self._parent, row_levels, column_levels, sort_labels=sort_labels

197 )

198 return A, rows, columns

199

200 def to_dense(self) -> Series:

201 """

202 Convert a Series from sparse values to dense.

203

204 Returns

205 -------

206 Series:

207 A Series with the same values, stored as a dense array.

208

209 Examples

210 --------

211 >>> series = pd.Series(pd.arrays.SparseArray([0, 1, 0]))

212 >>> series

213 0 0

214 1 1

215 2 0

216 dtype: Sparse[int64, 0]

217

218 >>> series.sparse.to_dense()

219 0 0

220 1 1

221 2 0

222 dtype: int64

223 """

224 from pandas import Series

225

226 return Series(

227 self._parent.array.to_dense(),

228 index=self._parent.index,

229 name=self._parent.name,

230 copy=False,

231 )

232

233

234class SparseFrameAccessor(BaseAccessor, PandasDelegate):

235 """

236 DataFrame accessor for sparse data.

237

238 Examples

239 --------

240 >>> df = pd.DataFrame({"a": [1, 2, 0, 0],

241 ... "b": [3, 0, 0, 4]}, dtype="Sparse[int]")

242 >>> df.sparse.density

243 0.5

244 """

245

246 def _validate(self, data):

247 dtypes = data.dtypes

248 if not all(isinstance(t, SparseDtype) for t in dtypes):

249 raise AttributeError(self._validation_msg)

250

251 @classmethod

252 def from_spmatrix(cls, data, index=None, columns=None) -> DataFrame:

253 """

254 Create a new DataFrame from a scipy sparse matrix.

255

256 Parameters

257 ----------

258 data : scipy.sparse.spmatrix

259 Must be convertible to csc format.

260 index, columns : Index, optional

261 Row and column labels to use for the resulting DataFrame.

262 Defaults to a RangeIndex.

263

264 Returns

265 -------

266 DataFrame

267 Each column of the DataFrame is stored as a

268 :class:`arrays.SparseArray`.

269

270 Examples

271 --------

272 >>> import scipy.sparse

273 >>> mat = scipy.sparse.eye(3, dtype=float)

274 >>> pd.DataFrame.sparse.from_spmatrix(mat)

275 0 1 2

276 0 1.0 0 0

277 1 0 1.0 0

278 2 0 0 1.0

279 """

280 from pandas._libs.sparse import IntIndex

281

282 from pandas import DataFrame

283

284 data = data.tocsc()

285 index, columns = cls._prep_index(data, index, columns)

286 n_rows, n_columns = data.shape

287 # We need to make sure indices are sorted, as we create

288 # IntIndex with no input validation (i.e. check_integrity=False ).

289 # Indices may already be sorted in scipy in which case this adds

290 # a small overhead.

291 data.sort_indices()

292 indices = data.indices

293 indptr = data.indptr

294 array_data = data.data

295 dtype = SparseDtype(array_data.dtype, 0)

296 arrays = []

297 for i in range(n_columns):

298 sl = slice(indptr[i], indptr[i + 1])

299 idx = IntIndex(n_rows, indices[sl], check_integrity=False)

300 arr = SparseArray._simple_new(array_data[sl], idx, dtype)

301 arrays.append(arr)

302 return DataFrame._from_arrays(

303 arrays, columns=columns, index=index, verify_integrity=False

304 )

305

306 def to_dense(self) -> DataFrame:

307 """

308 Convert a DataFrame with sparse values to dense.

309

310 Returns

311 -------

312 DataFrame

313 A DataFrame with the same values stored as dense arrays.

314

315 Examples

316 --------

317 >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0])})

318 >>> df.sparse.to_dense()

319 A

320 0 0

321 1 1

322 2 0

323 """

324 from pandas import DataFrame

325

326 data = {k: v.array.to_dense() for k, v in self._parent.items()}

327 return DataFrame(data, index=self._parent.index, columns=self._parent.columns)

328

329 def to_coo(self):

330 """

331 Return the contents of the frame as a sparse SciPy COO matrix.

332

333 Returns

334 -------

335 scipy.sparse.spmatrix

336 If the caller is heterogeneous and contains booleans or objects,

337 the result will be of dtype=object. See Notes.

338

339 Notes

340 -----

341 The dtype will be the lowest-common-denominator type (implicit

342 upcasting); that is to say if the dtypes (even of numeric types)

343 are mixed, the one that accommodates all will be chosen.

344

345 e.g. If the dtypes are float16 and float32, dtype will be upcast to

346 float32. By numpy.find_common_type convention, mixing int64 and

347 and uint64 will result in a float64 dtype.

348

349 Examples

350 --------

351 >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})

352 >>> df.sparse.to_coo()

353 <COOrdinate sparse matrix of dtype 'int64'

354 with 2 stored elements and shape (4, 1)>

355 """

356 import_optional_dependency("scipy")

357 from scipy.sparse import coo_matrix

358

359 dtype = find_common_type(self._parent.dtypes.to_list())

360 if isinstance(dtype, SparseDtype):

361 dtype = dtype.subtype

362

363 cols, rows, data = [], [], []

364 for col, (_, ser) in enumerate(self._parent.items()):

365 sp_arr = ser.array

366 if sp_arr.fill_value != 0:

367 raise ValueError("fill value must be 0 when converting to COO matrix")

368

369 row = sp_arr.sp_index.indices

370 cols.append(np.repeat(col, len(row)))

371 rows.append(row)

372 data.append(sp_arr.sp_values.astype(dtype, copy=False))

373

374 cols = np.concatenate(cols)

375 rows = np.concatenate(rows)

376 data = np.concatenate(data)

377 return coo_matrix((data, (rows, cols)), shape=self._parent.shape)

378

379 @property

380 def density(self) -> float:

381 """

382 Ratio of non-sparse points to total (dense) data points.

383

384 Examples

385 --------

386 >>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})

387 >>> df.sparse.density

388 0.5

389 """

390 tmp = np.mean([column.array.density for _, column in self._parent.items()])

391 return tmp

392

393 @staticmethod

394 def _prep_index(data, index, columns):

395 from pandas.core.indexes.api import (

396 default_index,

397 ensure_index,

398 )

399

400 N, K = data.shape

401 if index is None:

402 index = default_index(N)

403 else:

404 index = ensure_index(index)

405 if columns is None:

406 columns = default_index(K)

407 else:

408 columns = ensure_index(columns)

409

410 if len(columns) != K:

411 raise ValueError(f"Column length mismatch: {len(columns)} vs. {K}")

412 if len(index) != N:

413 raise ValueError(f"Index length mismatch: {len(index)} vs. {N}")

414 return index, columns