Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/arrays/sparse/array.py: 18%


794 statements  

1""" 

2SparseArray data structure 

3""" 

4from __future__ import annotations 

5 

6from collections import abc 

7import numbers 

8import operator 

9from typing import ( 

10 TYPE_CHECKING, 

11 Any, 

12 Callable, 

13 Literal, 

14 cast, 

15 overload, 

16) 

17import warnings 

18 

19import numpy as np 

20 

21from pandas._libs import lib 

22import pandas._libs.sparse as splib 

23from pandas._libs.sparse import ( 

24 BlockIndex, 

25 IntIndex, 

26 SparseIndex, 

27) 

28from pandas._libs.tslibs import NaT 

29from pandas.compat.numpy import function as nv 

30from pandas.errors import PerformanceWarning 

31from pandas.util._decorators import doc 

32from pandas.util._exceptions import find_stack_level 

33from pandas.util._validators import ( 

34 validate_bool_kwarg, 

35 validate_insert_loc, 

36) 

37 

38from pandas.core.dtypes.astype import astype_array 

39from pandas.core.dtypes.cast import ( 

40 construct_1d_arraylike_from_scalar, 

41 find_common_type, 

42 maybe_box_datetimelike, 

43) 

44from pandas.core.dtypes.common import ( 

45 is_bool_dtype, 

46 is_integer, 

47 is_list_like, 

48 is_object_dtype, 

49 is_scalar, 

50 is_string_dtype, 

51 pandas_dtype, 

52) 

53from pandas.core.dtypes.dtypes import ( 

54 DatetimeTZDtype, 

55 SparseDtype, 

56) 

57from pandas.core.dtypes.generic import ( 

58 ABCIndex, 

59 ABCSeries, 

60) 

61from pandas.core.dtypes.missing import ( 

62 isna, 

63 na_value_for_dtype, 

64 notna, 

65) 

66 

67from pandas.core import arraylike 

68import pandas.core.algorithms as algos 

69from pandas.core.arraylike import OpsMixin 

70from pandas.core.arrays import ExtensionArray 

71from pandas.core.base import PandasObject 

72import pandas.core.common as com 

73from pandas.core.construction import ( 

74 ensure_wrapped_if_datetimelike, 

75 extract_array, 

76 sanitize_array, 

77) 

78from pandas.core.indexers import ( 

79 check_array_indexer, 

80 unpack_tuple_and_ellipses, 

81) 

82from pandas.core.nanops import check_below_min_count 

83 

84from pandas.io.formats import printing 

85 

86# See https://github.com/python/typing/issues/684 

87if TYPE_CHECKING: 

88 from collections.abc import Sequence 

89 from enum import Enum 

90 

91 class ellipsis(Enum): 

92 Ellipsis = "..." 

93 

94 Ellipsis = ellipsis.Ellipsis 

95 

96 from scipy.sparse import spmatrix 

97 

98 from pandas._typing import ( 

99 FillnaOptions, 

100 NumpySorter, 

101 ) 

102 

103 SparseIndexKind = Literal["integer", "block"] 

104 

105 from pandas._typing import ( 

106 ArrayLike, 

107 AstypeArg, 

108 Axis, 

109 AxisInt, 

110 Dtype, 

111 NpDtype, 

112 PositionalIndexer, 

113 Scalar, 

114 ScalarIndexer, 

115 Self, 

116 SequenceIndexer, 

117 npt, 

118 ) 

119 

120 from pandas import Series 

121 

122else: 

123 ellipsis = type(Ellipsis) 

124 

125 

126# ---------------------------------------------------------------------------- 

127# Array 

128 

129_sparray_doc_kwargs = {"klass": "SparseArray"} 

130 

131 

132def _get_fill(arr: SparseArray) -> np.ndarray: 

133 """ 

134 Create a 0-dim ndarray containing the fill value 

135 

136 Parameters 

137 ---------- 

138 arr : SparseArray 

139 

140 Returns 

141 ------- 

142 fill_value : ndarray 

143 0-dim ndarray with just the fill value. 

144 

145 Notes 

146 ----- 

147 coerce fill_value to arr dtype if possible 

148 int64 SparseArray can have NaN as fill_value if there is no missing 

149 """ 

150 try: 

151 return np.asarray(arr.fill_value, dtype=arr.dtype.subtype) 

152 except ValueError: 

153 return np.asarray(arr.fill_value) 

154 

155 

156def _sparse_array_op( 

157 left: SparseArray, right: SparseArray, op: Callable, name: str 

158) -> SparseArray: 

159 """ 

160 Perform a binary operation between two arrays. 

161 

162 Parameters 

163 ---------- 

164 left : Union[SparseArray, ndarray] 

165 right : Union[SparseArray, ndarray] 

166 op : Callable 

167 The binary operation to perform 

168 name str 

169 Name of the callable. 

170 

171 Returns 

172 ------- 

173 SparseArray 

174 """ 

175 if name.startswith("__"): 

176 # For lookups in _libs.sparse we need non-dunder op name 

177 name = name[2:-2] 

178 

179 # dtype used to find corresponding sparse method 

180 ltype = left.dtype.subtype 

181 rtype = right.dtype.subtype 

182 

183 if ltype != rtype: 

184 subtype = find_common_type([ltype, rtype]) 

185 ltype = SparseDtype(subtype, left.fill_value) 

186 rtype = SparseDtype(subtype, right.fill_value) 

187 

188 left = left.astype(ltype, copy=False) 

189 right = right.astype(rtype, copy=False) 

190 dtype = ltype.subtype 

191 else: 

192 dtype = ltype 

193 

194 # dtype the result must have 

195 result_dtype = None 

196 

197 if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0: 

198 with np.errstate(all="ignore"): 

199 result = op(left.to_dense(), right.to_dense()) 

200 fill = op(_get_fill(left), _get_fill(right)) 

201 

202 if left.sp_index.ngaps == 0: 

203 index = left.sp_index 

204 else: 

205 index = right.sp_index 

206 elif left.sp_index.equals(right.sp_index): 

207 with np.errstate(all="ignore"): 

208 result = op(left.sp_values, right.sp_values) 

209 fill = op(_get_fill(left), _get_fill(right)) 

210 index = left.sp_index 

211 else: 

212 if name[0] == "r": 

213 left, right = right, left 

214 name = name[1:] 

215 

216 if name in ("and", "or", "xor") and dtype == "bool": 

217 opname = f"sparse_{name}_uint8" 

218 # to make template simple, cast here 

219 left_sp_values = left.sp_values.view(np.uint8) 

220 right_sp_values = right.sp_values.view(np.uint8) 

221 result_dtype = bool 

222 else: 

223 opname = f"sparse_{name}_{dtype}" 

224 left_sp_values = left.sp_values 

225 right_sp_values = right.sp_values 

226 

227 if ( 

228 name in ["floordiv", "mod"] 

229 and (right == 0).any() 

230 and left.dtype.kind in "iu" 

231 ): 

232 # Match the non-Sparse Series behavior 

233 opname = f"sparse_{name}_float64" 

234 left_sp_values = left_sp_values.astype("float64") 

235 right_sp_values = right_sp_values.astype("float64") 

236 

237 sparse_op = getattr(splib, opname) 

238 

239 with np.errstate(all="ignore"): 

240 result, index, fill = sparse_op( 

241 left_sp_values, 

242 left.sp_index, 

243 left.fill_value, 

244 right_sp_values, 

245 right.sp_index, 

246 right.fill_value, 

247 ) 

248 

249 if name == "divmod": 

250 # result is a 2-tuple 

251 # error: Incompatible return value type (got "Tuple[SparseArray, 

252 # SparseArray]", expected "SparseArray") 

253 return ( # type: ignore[return-value] 

254 _wrap_result(name, result[0], index, fill[0], dtype=result_dtype), 

255 _wrap_result(name, result[1], index, fill[1], dtype=result_dtype), 

256 ) 

257 

258 if result_dtype is None: 

259 result_dtype = result.dtype 

260 

261 return _wrap_result(name, result, index, fill, dtype=result_dtype) 

262 
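
# Illustrative sketch (not part of the original module): how _sparse_array_op
# behaves when the two sparse indexes differ -- it dispatches to the splib
# kernel named after the op and dtype rather than densifying. A hedged
# example using only names defined above; wrapped in a hypothetical function
# so importing the module stays side-effect free.
def _demo_sparse_array_op() -> None:
    left = SparseArray([1, 0, 0, 2])   # fill_value 0, stored at [0, 3]
    right = SparseArray([0, 0, 3, 4])  # fill_value 0, stored at [2, 3]
    # Mismatched sp_index -> falls through to splib.sparse_add_int64.
    out = _sparse_array_op(left, right, operator.add, "add")
    assert list(np.asarray(out)) == [1, 0, 3, 6]
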


def _wrap_result(
    name: str, data, sparse_index, fill_value, dtype: Dtype | None = None
) -> SparseArray:
    """
    wrap op result to have correct dtype
    """
    if name.startswith("__"):
        # e.g. __eq__ --> eq
        name = name[2:-2]

    if name in ("eq", "ne", "lt", "gt", "le", "ge"):
        dtype = bool

    fill_value = lib.item_from_zerodim(fill_value)

    if is_bool_dtype(dtype):
        # fill_value may be np.bool_
        fill_value = bool(fill_value)
    return SparseArray(
        data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype
    )


class SparseArray(OpsMixin, PandasObject, ExtensionArray):
    """
    An ExtensionArray for storing sparse data.

    Parameters
    ----------
    data : array-like or scalar
        A dense array of values to store in the SparseArray. This may contain
        `fill_value`.
    sparse_index : SparseIndex, optional
    fill_value : scalar, optional
        Elements in data that are ``fill_value`` are not stored in the
        SparseArray. For memory savings, this should be the most common value
        in `data`. By default, `fill_value` depends on the dtype of `data`:

        =========== ==========
        data.dtype  na_value
        =========== ==========
        float       ``np.nan``
        int         ``0``
        bool        False
        datetime64  ``pd.NaT``
        timedelta64 ``pd.NaT``
        =========== ==========

        The fill value is potentially specified in three ways. In order of
        precedence, these are

        1. The `fill_value` argument
        2. ``dtype.fill_value`` if `fill_value` is None and `dtype` is
           a ``SparseDtype``
        3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype`
           is not a ``SparseDtype`` and `data` is a ``SparseArray``.

    kind : str
        Can be 'integer' or 'block', default is 'integer'.
        The type of storage for sparse locations.

        * 'block': Stores a `block` and `block_length` for each
          contiguous *span* of sparse values. This is best when
          sparse data tends to be clumped together, with large
          regions of ``fill-value`` values between sparse values.
        * 'integer': uses an integer to store the location of
          each sparse value.

    dtype : np.dtype or SparseDtype, optional
        The dtype to use for the SparseArray. For numpy dtypes, this
        determines the dtype of ``self.sp_values``. For SparseDtype,
        this determines ``self.sp_values`` and ``self.fill_value``.
    copy : bool, default False
        Whether to explicitly copy the incoming `data` array.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> from pandas.arrays import SparseArray
    >>> arr = SparseArray([0, 0, 1, 2])
    >>> arr
    [0, 0, 1, 2]
    Fill: 0
    IntIndex
    Indices: array([2, 3], dtype=int32)
    """

    _subtyp = "sparse_array"  # register ABCSparseArray
    _hidden_attrs = PandasObject._hidden_attrs | frozenset([])
    _sparse_index: SparseIndex
    _sparse_values: np.ndarray
    _dtype: SparseDtype

    def __init__(
        self,
        data,
        sparse_index=None,
        fill_value=None,
        kind: SparseIndexKind = "integer",
        dtype: Dtype | None = None,
        copy: bool = False,
    ) -> None:
        if fill_value is None and isinstance(dtype, SparseDtype):
            fill_value = dtype.fill_value

        if isinstance(data, type(self)):
            # disable normal inference on dtype, sparse_index, & fill_value
            if sparse_index is None:
                sparse_index = data.sp_index
            if fill_value is None:
                fill_value = data.fill_value
            if dtype is None:
                dtype = data.dtype
            # TODO: make kind=None, and use data.kind?
            data = data.sp_values

        # Handle user-provided dtype
        if isinstance(dtype, str):
            # Two options: dtype='int', regular numpy dtype
            # or dtype='Sparse[int]', a sparse dtype
            try:
                dtype = SparseDtype.construct_from_string(dtype)
            except TypeError:
                dtype = pandas_dtype(dtype)

        if isinstance(dtype, SparseDtype):
            if fill_value is None:
                fill_value = dtype.fill_value
            dtype = dtype.subtype

        if is_scalar(data):
            warnings.warn(
                f"Constructing {type(self).__name__} with scalar data is deprecated "
                "and will raise in a future version. Pass a sequence instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
            if sparse_index is None:
                npoints = 1
            else:
                npoints = sparse_index.length

            data = construct_1d_arraylike_from_scalar(data, npoints, dtype=None)
            dtype = data.dtype

        if dtype is not None:
            dtype = pandas_dtype(dtype)

        # TODO: disentangle the fill_value dtype inference from
        # dtype inference
        if data is None:
            # TODO: What should the empty dtype be? Object or float?

            # error: Argument "dtype" to "array" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], None]"; expected "Union[dtype[Any],
            # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any,
            # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
            data = np.array([], dtype=dtype)  # type: ignore[arg-type]

        try:
            data = sanitize_array(data, index=None)
        except ValueError:
            # NumPy may raise a ValueError on data like [1, []]
            # we retry with object dtype here.
            if dtype is None:
                dtype = np.dtype(object)
                data = np.atleast_1d(np.asarray(data, dtype=dtype))
            else:
                raise

        if copy:
            # TODO: avoid double copy when dtype forces cast.
            data = data.copy()

        if fill_value is None:
            fill_value_dtype = data.dtype if dtype is None else dtype
            if fill_value_dtype is None:
                fill_value = np.nan
            else:
                fill_value = na_value_for_dtype(fill_value_dtype)

        if isinstance(data, type(self)) and sparse_index is None:
            sparse_index = data._sparse_index
            # error: Argument "dtype" to "asarray" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], None]"; expected "None"
            sparse_values = np.asarray(
                data.sp_values, dtype=dtype  # type: ignore[arg-type]
            )
        elif sparse_index is None:
            data = extract_array(data, extract_numpy=True)
            if not isinstance(data, np.ndarray):
                # EA
                if isinstance(data.dtype, DatetimeTZDtype):
                    warnings.warn(
                        f"Creating SparseArray from {data.dtype} data "
                        "loses timezone information. Cast to object before "
                        "sparse to retain timezone information.",
                        UserWarning,
                        stacklevel=find_stack_level(),
                    )
                    data = np.asarray(data, dtype="datetime64[ns]")
                    if fill_value is NaT:
                        fill_value = np.datetime64("NaT", "ns")
                data = np.asarray(data)
            sparse_values, sparse_index, fill_value = _make_sparse(
                # error: Argument "dtype" to "_make_sparse" has incompatible type
                # "Union[ExtensionDtype, dtype[Any], None]"; expected
                # "Optional[dtype[Any]]"
                data,
                kind=kind,
                fill_value=fill_value,
                dtype=dtype,  # type: ignore[arg-type]
            )
        else:
            # error: Argument "dtype" to "asarray" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], None]"; expected "None"
            sparse_values = np.asarray(data, dtype=dtype)  # type: ignore[arg-type]
            if len(sparse_values) != sparse_index.npoints:
                raise AssertionError(
                    f"Non array-like type {type(sparse_values)} must "
                    "have the same length as the index"
                )
        self._sparse_index = sparse_index
        self._sparse_values = sparse_values
        self._dtype = SparseDtype(sparse_values.dtype, fill_value)
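
    # Illustrative sketch (not part of the pandas API): the fill_value
    # precedence rules from the class docstring above. A hypothetical demo
    # staticmethod added here only for illustration; an explicit `fill_value`
    # argument (case 1) wins over the dtype's fill_value (case 2).
    @staticmethod
    def _demo_fill_value_precedence() -> None:
        spa_dtype = SparseDtype(np.int64, fill_value=0)
        arr = SparseArray([0, 1, 1], dtype=spa_dtype)  # case 2: dtype.fill_value
        assert arr.fill_value == 0
        arr2 = SparseArray([0, 1, 1], fill_value=1, dtype=np.int64)  # case 1
        assert arr2.fill_value == 1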


    @classmethod
    def _simple_new(
        cls,
        sparse_array: np.ndarray,
        sparse_index: SparseIndex,
        dtype: SparseDtype,
    ) -> Self:
        new = object.__new__(cls)
        new._sparse_index = sparse_index
        new._sparse_values = sparse_array
        new._dtype = dtype
        return new

    @classmethod
    def from_spmatrix(cls, data: spmatrix) -> Self:
        """
        Create a SparseArray from a scipy.sparse matrix.

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            This should be a SciPy sparse matrix where the size
            of the second dimension is 1. In other words, a
            sparse matrix with a single column.

        Returns
        -------
        SparseArray

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.coo_matrix((4, 1))
        >>> pd.arrays.SparseArray.from_spmatrix(mat)
        [0.0, 0.0, 0.0, 0.0]
        Fill: 0.0
        IntIndex
        Indices: array([], dtype=int32)
        """
        length, ncol = data.shape

        if ncol != 1:
            raise ValueError(f"'data' must have a single column, not '{ncol}'")

        # our sparse index classes require that the positions be strictly
        # increasing. So we need to sort loc, and arr accordingly.
        data = data.tocsc()
        data.sort_indices()
        arr = data.data
        idx = data.indices

        zero = np.array(0, dtype=arr.dtype).item()
        dtype = SparseDtype(arr.dtype, zero)
        index = IntIndex(length, idx)

        return cls._simple_new(arr, index, dtype)

    def __array__(
        self, dtype: NpDtype | None = None, copy: bool | None = None
    ) -> np.ndarray:
        fill_value = self.fill_value

        if self.sp_index.ngaps == 0:
            # Compat for na dtype and int values.
            return self.sp_values
        if dtype is None:
            # Can NumPy represent this type?
            # If not, `np.result_type` will raise. We catch that
            # and return object.
            if self.sp_values.dtype.kind == "M":
                # However, we *do* special-case the common case of
                # a datetime64 with pandas NaT.
                if fill_value is NaT:
                    # Can't put pd.NaT in a datetime64[ns]
                    fill_value = np.datetime64("NaT")
            try:
                dtype = np.result_type(self.sp_values.dtype, type(fill_value))
            except TypeError:
                dtype = object

        out = np.full(self.shape, fill_value, dtype=dtype)
        out[self.sp_index.indices] = self.sp_values
        return out
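
    # Illustrative sketch (not part of the pandas API): densification through
    # np.asarray uses the dtype-resolution logic above. With the default NaN
    # fill, the gaps come back as NaN in a float64 ndarray. A hypothetical
    # demo staticmethod, added only for illustration.
    @staticmethod
    def _demo_densify() -> None:
        arr = SparseArray([1.0, np.nan, 2.0])
        dense = np.asarray(arr)
        assert dense.dtype == np.float64
        assert np.isnan(dense[1])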


    def __setitem__(self, key, value) -> None:
        # I suppose we could allow setting of non-fill_value elements.
        # TODO(SparseArray.__setitem__): remove special cases in
        # ExtensionBlock.where
        msg = "SparseArray does not support item assignment via setitem"
        raise TypeError(msg)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        return cls(scalars, dtype=dtype)

    @classmethod
    def _from_factorized(cls, values, original):
        return cls(values, dtype=original.dtype)

    # ------------------------------------------------------------------------
    # Data
    # ------------------------------------------------------------------------
    @property
    def sp_index(self) -> SparseIndex:
        """
        The SparseIndex containing the location of non- ``fill_value`` points.
        """
        return self._sparse_index

    @property
    def sp_values(self) -> np.ndarray:
        """
        An ndarray containing the non- ``fill_value`` values.

        Examples
        --------
        >>> from pandas.arrays import SparseArray
        >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0)
        >>> s.sp_values
        array([1, 2])
        """
        return self._sparse_values

    @property
    def dtype(self) -> SparseDtype:
        return self._dtype

    @property
    def fill_value(self):
        """
        Elements in `data` that are `fill_value` are not stored.

        For memory savings, this should be the most common value in the array.

        Examples
        --------
        >>> ser = pd.Series([0, 0, 2, 2, 2], dtype="Sparse[int]")
        >>> ser.sparse.fill_value
        0
        >>> spa_dtype = pd.SparseDtype(dtype=np.int32, fill_value=2)
        >>> ser = pd.Series([0, 0, 2, 2, 2], dtype=spa_dtype)
        >>> ser.sparse.fill_value
        2
        """
        return self.dtype.fill_value

    @fill_value.setter
    def fill_value(self, value) -> None:
        self._dtype = SparseDtype(self.dtype.subtype, value)

    @property
    def kind(self) -> SparseIndexKind:
        """
        The kind of sparse index for this array. One of {'integer', 'block'}.
        """
        if isinstance(self.sp_index, IntIndex):
            return "integer"
        else:
            return "block"

    @property
    def _valid_sp_values(self) -> np.ndarray:
        sp_vals = self.sp_values
        mask = notna(sp_vals)
        return sp_vals[mask]

    def __len__(self) -> int:
        return self.sp_index.length

    @property
    def _null_fill_value(self) -> bool:
        return self._dtype._is_na_fill_value

    def _fill_value_matches(self, fill_value) -> bool:
        if self._null_fill_value:
            return isna(fill_value)
        else:
            return self.fill_value == fill_value

    @property
    def nbytes(self) -> int:
        return self.sp_values.nbytes + self.sp_index.nbytes

    @property
    def density(self) -> float:
        """
        The percent of non- ``fill_value`` points, as decimal.

        Examples
        --------
        >>> from pandas.arrays import SparseArray
        >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
        >>> s.density
        0.6
        """
        return self.sp_index.npoints / self.sp_index.length

    @property
    def npoints(self) -> int:
        """
        The number of non- ``fill_value`` points.

        Examples
        --------
        >>> from pandas.arrays import SparseArray
        >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
        >>> s.npoints
        3
        """
        return self.sp_index.npoints

    # error: Return type "SparseArray" of "isna" incompatible with return type
    # "ndarray[Any, Any] | ExtensionArraySupportsAnyAll" in supertype "ExtensionArray"
    def isna(self) -> Self:  # type: ignore[override]
        # If null fill value, we want SparseDtype[bool, true]
        # to preserve the same memory usage.
        dtype = SparseDtype(bool, self._null_fill_value)
        if self._null_fill_value:
            return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
        mask = np.full(len(self), False, dtype=np.bool_)
        mask[self.sp_index.indices] = isna(self.sp_values)
        return type(self)(mask, fill_value=False, dtype=dtype)

    def _pad_or_backfill(  # pylint: disable=useless-parent-delegation
        self,
        *,
        method: FillnaOptions,
        limit: int | None = None,
        limit_area: Literal["inside", "outside"] | None = None,
        copy: bool = True,
    ) -> Self:
        # TODO(3.0): We can remove this method once deprecation for fillna method
        # keyword is enforced.
        return super()._pad_or_backfill(
            method=method, limit=limit, limit_area=limit_area, copy=copy
        )

    def fillna(
        self,
        value=None,
        method: FillnaOptions | None = None,
        limit: int | None = None,
        copy: bool = True,
    ) -> Self:
        """
        Fill missing values with `value`.

        Parameters
        ----------
        value : scalar, optional
        method : str, optional

            .. warning::

               Using 'method' will result in high memory use,
               as all `fill_value` methods will be converted to
               an in-memory ndarray

        limit : int, optional

        copy : bool, default True
            Ignored for SparseArray.

        Returns
        -------
        SparseArray

        Notes
        -----
        When `value` is specified, the result's ``fill_value`` depends on
        ``self.fill_value``. The goal is to maintain low-memory use.

        If ``self.fill_value`` is NA, the result dtype will be
        ``SparseDtype(self.dtype, fill_value=value)``. This will preserve
        the amount of memory used before and after filling.

        When ``self.fill_value`` is not NA, the result dtype will be
        ``self.dtype``. Again, this preserves the amount of memory used.
        """
        if (method is None and value is None) or (
            method is not None and value is not None
        ):
            raise ValueError("Must specify one of 'method' or 'value'.")

        if method is not None:
            return super().fillna(method=method, limit=limit)

        else:
            new_values = np.where(isna(self.sp_values), value, self.sp_values)

            if self._null_fill_value:
                # This is essentially just updating the dtype.
                new_dtype = SparseDtype(self.dtype.subtype, fill_value=value)
            else:
                new_dtype = self.dtype

        return self._simple_new(new_values, self._sparse_index, new_dtype)
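
    # Illustrative sketch (not part of the pandas API): the dtype rule spelled
    # out in the Notes above. With an NA fill_value, filling just swaps the
    # dtype's fill_value and reuses the sparse index, so memory use is flat.
    # A hypothetical demo staticmethod; the identity check on sp_index is an
    # implementation detail of the _simple_new call above.
    @staticmethod
    def _demo_fillna_dtype() -> None:
        arr = SparseArray([1.0, np.nan, 2.0])  # fill_value is NaN
        filled = arr.fillna(0.0)
        assert filled.fill_value == 0.0
        assert filled.sp_index is arr.sp_index  # the index object is reused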


    def shift(self, periods: int = 1, fill_value=None) -> Self:
        if not len(self) or periods == 0:
            return self.copy()

        if isna(fill_value):
            fill_value = self.dtype.na_value

        subtype = np.result_type(fill_value, self.dtype.subtype)

        if subtype != self.dtype.subtype:
            # just coerce up front
            arr = self.astype(SparseDtype(subtype, self.fill_value))
        else:
            arr = self

        empty = self._from_sequence(
            [fill_value] * min(abs(periods), len(self)), dtype=arr.dtype
        )

        if periods > 0:
            a = empty
            b = arr[:-periods]
        else:
            a = arr[abs(periods) :]
            b = empty
        return arr._concat_same_type([a, b])

    def _first_fill_value_loc(self):
        """
        Get the location of the first fill value.

        Returns
        -------
        int
        """
        if len(self) == 0 or self.sp_index.npoints == len(self):
            return -1

        indices = self.sp_index.indices
        if not len(indices) or indices[0] > 0:
            return 0

        # Append a number larger than 1 to the diffs so that the case where
        # the fill value appears only in the tail of the array is still caught.
        diff = np.r_[np.diff(indices), 2]
        return indices[(diff > 1).argmax()] + 1
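
    # Illustrative worked example (not part of the pandas API) for the
    # diff/argmax trick above: stored indices [0, 1, 3] give diff [1, 2, 2];
    # the first gap follows stored position 1, so the first fill value sits
    # at location 2. A hypothetical demo staticmethod.
    @staticmethod
    def _demo_first_fill_value_loc() -> None:
        arr = SparseArray([9, 9, 0, 9, 0], fill_value=0)  # stored at [0, 1, 3]
        assert arr._first_fill_value_loc() == 2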


    @doc(ExtensionArray.duplicated)
    def duplicated(
        self, keep: Literal["first", "last", False] = "first"
    ) -> npt.NDArray[np.bool_]:
        values = np.asarray(self)
        mask = np.asarray(self.isna())
        return algos.duplicated(values, keep=keep, mask=mask)

    def unique(self) -> Self:
        uniques = algos.unique(self.sp_values)
        if len(self.sp_values) != len(self):
            fill_loc = self._first_fill_value_loc()
            # To align with the behavior of pd.unique and pd.Series.unique,
            # we keep the original order, using unique again here to find
            # the insertion place. Since sp_values is not large, the minor
            # performance hit is worth the correctness.
            insert_loc = len(algos.unique(self.sp_values[:fill_loc]))
            uniques = np.insert(uniques, insert_loc, self.fill_value)
        return type(self)._from_sequence(uniques, dtype=self.dtype)

    def _values_for_factorize(self):
        # Still override this for hash_pandas_object
        return np.asarray(self), self.fill_value

    def factorize(
        self,
        use_na_sentinel: bool = True,
    ) -> tuple[np.ndarray, SparseArray]:
        # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
        # The sparsity on this is backwards from what Sparse would want. Want
        # ExtensionArray.factorize -> Tuple[EA, EA]
        # Given that we have to return a dense array of codes, why bother
        # implementing an efficient factorize?
        codes, uniques = algos.factorize(
            np.asarray(self), use_na_sentinel=use_na_sentinel
        )
        uniques_sp = SparseArray(uniques, dtype=self.dtype)
        return codes, uniques_sp

    def value_counts(self, dropna: bool = True) -> Series:
        """
        Returns a Series containing counts of unique values.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of NaN, even if NaN is in sp_values.

        Returns
        -------
        counts : Series
        """
        from pandas import (
            Index,
            Series,
        )

        keys, counts, _ = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
        fcounts = self.sp_index.ngaps
        if fcounts > 0 and (not self._null_fill_value or not dropna):
            mask = isna(keys) if self._null_fill_value else keys == self.fill_value
            if mask.any():
                counts[mask] += fcounts
            else:
                # error: Argument 1 to "insert" has incompatible type "Union[
                # ExtensionArray,ndarray[Any, Any]]"; expected "Union[
                # _SupportsArray[dtype[Any]], Sequence[_SupportsArray[dtype
                # [Any]]], Sequence[Sequence[_SupportsArray[dtype[Any]]]],
                # Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]], Sequence
                # [Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]]]]"
                keys = np.insert(keys, 0, self.fill_value)  # type: ignore[arg-type]
                counts = np.insert(counts, 0, fcounts)

        if not isinstance(keys, ABCIndex):
            index = Index(keys)
        else:
            index = keys
        return Series(counts, index=index, copy=False)

    # --------
    # Indexing
    # --------
    @overload
    def __getitem__(self, key: ScalarIndexer) -> Any:
        ...

    @overload
    def __getitem__(
        self,
        key: SequenceIndexer | tuple[int | ellipsis, ...],
    ) -> Self:
        ...

    def __getitem__(
        self,
        key: PositionalIndexer | tuple[int | ellipsis, ...],
    ) -> Self | Any:
        if isinstance(key, tuple):
            key = unpack_tuple_and_ellipses(key)
            if key is Ellipsis:
                raise ValueError("Cannot slice with Ellipsis")

        if is_integer(key):
            return self._get_val_at(key)
        elif isinstance(key, tuple):
            # error: Invalid index type "Tuple[Union[int, ellipsis], ...]"
            # for "ndarray[Any, Any]"; expected type
            # "Union[SupportsIndex, _SupportsArray[dtype[Union[bool_,
            # integer[Any]]]], _NestedSequence[_SupportsArray[dtype[
            # Union[bool_, integer[Any]]]]], _NestedSequence[Union[
            # bool, int]], Tuple[Union[SupportsIndex, _SupportsArray[
            # dtype[Union[bool_, integer[Any]]]], _NestedSequence[
            # _SupportsArray[dtype[Union[bool_, integer[Any]]]]],
            # _NestedSequence[Union[bool, int]]], ...]]"
            data_slice = self.to_dense()[key]  # type: ignore[index]
        elif isinstance(key, slice):
            # Avoid densifying when handling contiguous slices
            if key.step is None or key.step == 1:
                start = 0 if key.start is None else key.start
                if start < 0:
                    start += len(self)

                end = len(self) if key.stop is None else key.stop
                if end < 0:
                    end += len(self)

                indices = self.sp_index.indices
                keep_inds = np.flatnonzero((indices >= start) & (indices < end))
                sp_vals = self.sp_values[keep_inds]

                sp_index = indices[keep_inds].copy()

                # If we've sliced to not include the start of the array, all our
                # indices should be shifted. NB: here we are careful to also not
                # shift by a negative value for a case like [0, 1][-100:] where
                # the start index should be treated like 0
                if start > 0:
                    sp_index -= start

                # Length of our result should match applying this slice to a range
                # of the length of our original array
                new_len = len(range(len(self))[key])
                new_sp_index = make_sparse_index(new_len, sp_index, self.kind)
                return type(self)._simple_new(sp_vals, new_sp_index, self.dtype)
            else:
                indices = np.arange(len(self), dtype=np.int32)[key]
                return self.take(indices)

        elif not is_list_like(key):
            # e.g. "foo" or 2.5
            # exception message copied from numpy
            raise IndexError(
                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                r"(`None`) and integer or boolean arrays are valid indices"
            )

        else:
            if isinstance(key, SparseArray):
                # NOTE: If we guarantee that SparseDtype(bool)
                # has only fill_value - true, false or nan
                # (see GH PR 44955)
                # we can apply mask very fast:
                if is_bool_dtype(key):
                    if isna(key.fill_value):
                        return self.take(key.sp_index.indices[key.sp_values])
                    if not key.fill_value:
                        return self.take(key.sp_index.indices)
                    n = len(self)
                    mask = np.full(n, True, dtype=np.bool_)
                    mask[key.sp_index.indices] = False
                    return self.take(np.arange(n)[mask])
                else:
                    key = np.asarray(key)

            key = check_array_indexer(self, key)

            if com.is_bool_indexer(key):
                # mypy doesn't know we have an array here
                key = cast(np.ndarray, key)
                return self.take(np.arange(len(key), dtype=np.int32)[key])
            elif hasattr(key, "__len__"):
                return self.take(key)
            else:
                raise ValueError(f"Cannot slice with '{key}'")

        return type(self)(data_slice, kind=self.kind)
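
    # Illustrative sketch (not part of the pandas API): contiguous slices take
    # the fast path above and never densify -- the kept indices are simply
    # shifted by the slice start. A hypothetical demo staticmethod.
    @staticmethod
    def _demo_slice_fast_path() -> None:
        arr = SparseArray([0, 1, 0, 2, 0], fill_value=0)  # stored at [1, 3]
        sub = arr[1:4]
        assert list(sub.sp_index.indices) == [0, 2]
        assert list(np.asarray(sub)) == [1, 0, 2]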


    def _get_val_at(self, loc):
        loc = validate_insert_loc(loc, len(self))

        sp_loc = self.sp_index.lookup(loc)
        if sp_loc == -1:
            return self.fill_value
        else:
            val = self.sp_values[sp_loc]
            val = maybe_box_datetimelike(val, self.sp_values.dtype)
            return val

    def take(self, indices, *, allow_fill: bool = False, fill_value=None) -> Self:
        if is_scalar(indices):
            raise ValueError(f"'indices' must be an array, not a scalar '{indices}'.")
        indices = np.asarray(indices, dtype=np.int32)

        dtype = None
        if indices.size == 0:
            result = np.array([], dtype="object")
            dtype = self.dtype
        elif allow_fill:
            result = self._take_with_fill(indices, fill_value=fill_value)
        else:
            return self._take_without_fill(indices)

        return type(self)(
            result, fill_value=self.fill_value, kind=self.kind, dtype=dtype
        )

    def _take_with_fill(self, indices, fill_value=None) -> np.ndarray:
        if fill_value is None:
            fill_value = self.dtype.na_value

        if indices.min() < -1:
            raise ValueError(
                "Invalid value in 'indices'. Must be between -1 "
                "and the length of the array."
            )

        if indices.max() >= len(self):
            raise IndexError("out of bounds value in 'indices'.")

        if len(self) == 0:
            # Empty... Allow taking only if all empty
            if (indices == -1).all():
                dtype = np.result_type(self.sp_values, type(fill_value))
                taken = np.empty_like(indices, dtype=dtype)
                taken.fill(fill_value)
                return taken
            else:
                raise IndexError("cannot do a non-empty take from an empty axes.")

        # sp_indexer may be -1 for two reasons
        # 1.) we took for an index of -1 (new)
        # 2.) we took a value that was self.fill_value (old)
        sp_indexer = self.sp_index.lookup_array(indices)
        new_fill_indices = indices == -1
        old_fill_indices = (sp_indexer == -1) & ~new_fill_indices

        if self.sp_index.npoints == 0 and old_fill_indices.all():
            # We've looked up all valid points on an all-sparse array.
            taken = np.full(
                sp_indexer.shape, fill_value=self.fill_value, dtype=self.dtype.subtype
            )

        elif self.sp_index.npoints == 0:
            # Use the old fill_value unless we took for an index of -1
            _dtype = np.result_type(self.dtype.subtype, type(fill_value))
            taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype)
            taken[old_fill_indices] = self.fill_value
        else:
            taken = self.sp_values.take(sp_indexer)

            # Fill in two steps.
            # Old fill values
            # New fill values
            # potentially coercing to a new dtype at each stage.

            m0 = sp_indexer[old_fill_indices] < 0
            m1 = sp_indexer[new_fill_indices] < 0

            result_type = taken.dtype

            if m0.any():
                result_type = np.result_type(result_type, type(self.fill_value))
                taken = taken.astype(result_type)
                taken[old_fill_indices] = self.fill_value

            if m1.any():
                result_type = np.result_type(result_type, type(fill_value))
                taken = taken.astype(result_type)
                taken[new_fill_indices] = fill_value

        return taken

    def _take_without_fill(self, indices) -> Self:
        to_shift = indices < 0

        n = len(self)

        if (indices.max() >= n) or (indices.min() < -n):
            if n == 0:
                raise IndexError("cannot do a non-empty take from an empty axes.")
            raise IndexError("out of bounds value in 'indices'.")

        if to_shift.any():
            indices = indices.copy()
            indices[to_shift] += n

        sp_indexer = self.sp_index.lookup_array(indices)
        value_mask = sp_indexer != -1
        new_sp_values = self.sp_values[sp_indexer[value_mask]]

        value_indices = np.flatnonzero(value_mask).astype(np.int32, copy=False)

        new_sp_index = make_sparse_index(len(indices), value_indices, kind=self.kind)
        return type(self)._simple_new(new_sp_values, new_sp_index, dtype=self.dtype)

    def searchsorted(
        self,
        v: ArrayLike | object,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter | None = None,
    ) -> npt.NDArray[np.intp] | np.intp:
        msg = "searchsorted requires high memory usage."
        warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())
        v = np.asarray(v)
        return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter)

    def copy(self) -> Self:
        values = self.sp_values.copy()
        return self._simple_new(values, self.sp_index, self.dtype)

    @classmethod
    def _concat_same_type(cls, to_concat: Sequence[Self]) -> Self:
        fill_value = to_concat[0].fill_value

        values = []
        length = 0

        if to_concat:
            sp_kind = to_concat[0].kind
        else:
            sp_kind = "integer"

        sp_index: SparseIndex
        if sp_kind == "integer":
            indices = []

            for arr in to_concat:
                int_idx = arr.sp_index.indices.copy()
                int_idx += length  # TODO: wraparound
                length += arr.sp_index.length

                values.append(arr.sp_values)
                indices.append(int_idx)

            data = np.concatenate(values)
            indices_arr = np.concatenate(indices)
            # error: Argument 2 to "IntIndex" has incompatible type
            # "ndarray[Any, dtype[signedinteger[_32Bit]]]";
            # expected "Sequence[int]"
            sp_index = IntIndex(length, indices_arr)  # type: ignore[arg-type]

        else:
            # when concatenating block indices, we don't claim that you'll
            # get an identical index as concatenating the values and then
            # creating a new index. We don't want to spend the time trying
            # to merge blocks across arrays in `to_concat`, so the resulting
            # BlockIndex may have more blocks.
            blengths = []
            blocs = []

            for arr in to_concat:
                block_idx = arr.sp_index.to_block_index()

                values.append(arr.sp_values)
                blocs.append(block_idx.blocs.copy() + length)
                blengths.append(block_idx.blengths)
                length += arr.sp_index.length

            data = np.concatenate(values)
            blocs_arr = np.concatenate(blocs)
            blengths_arr = np.concatenate(blengths)

            sp_index = BlockIndex(length, blocs_arr, blengths_arr)

        return cls(data, sparse_index=sp_index, fill_value=fill_value)
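
    # Illustrative sketch (not part of the pandas API): for integer-kind
    # inputs, _concat_same_type just shifts each array's indices by the
    # running length, as coded above. A hypothetical demo staticmethod.
    @staticmethod
    def _demo_concat() -> None:
        a = SparseArray([0, 1], fill_value=0)  # stored at [1]
        b = SparseArray([2, 0], fill_value=0)  # stored at [0]
        out = SparseArray._concat_same_type([a, b])
        assert list(out.sp_index.indices) == [1, 2]
        assert len(out) == 4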


    def astype(self, dtype: AstypeArg | None = None, copy: bool = True):
        """
        Change the dtype of a SparseArray.

        The output will always be a SparseArray. To convert to a dense
        ndarray with a certain dtype, use :meth:`numpy.asarray`.

        Parameters
        ----------
        dtype : np.dtype or ExtensionDtype
            For SparseDtype, this changes the dtype of
            ``self.sp_values`` and the ``self.fill_value``.

            For other dtypes, this only changes the dtype of
            ``self.sp_values``.

        copy : bool, default True
            Whether to ensure a copy is made, even if not necessary.

        Returns
        -------
        SparseArray

        Examples
        --------
        >>> arr = pd.arrays.SparseArray([0, 0, 1, 2])
        >>> arr
        [0, 0, 1, 2]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        >>> arr.astype(SparseDtype(np.dtype('int32')))
        [0, 0, 1, 2]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        Using a NumPy dtype with a different kind (e.g. float) will coerce
        just ``self.sp_values``.

        >>> arr.astype(SparseDtype(np.dtype('float64')))
        ... # doctest: +NORMALIZE_WHITESPACE
        [nan, nan, 1.0, 2.0]
        Fill: nan
        IntIndex
        Indices: array([2, 3], dtype=int32)

        Using a SparseDtype, you can also change the fill value as well.

        >>> arr.astype(SparseDtype("float64", fill_value=0.0))
        ... # doctest: +NORMALIZE_WHITESPACE
        [0.0, 0.0, 1.0, 2.0]
        Fill: 0.0
        IntIndex
        Indices: array([2, 3], dtype=int32)
        """
        if dtype == self._dtype:
            if not copy:
                return self
            else:
                return self.copy()

        future_dtype = pandas_dtype(dtype)
        if not isinstance(future_dtype, SparseDtype):
            # GH#34457
            values = np.asarray(self)
            values = ensure_wrapped_if_datetimelike(values)
            return astype_array(values, dtype=future_dtype, copy=False)

        dtype = self.dtype.update_dtype(dtype)
        subtype = pandas_dtype(dtype._subtype_with_str)
        subtype = cast(np.dtype, subtype)  # ensured by update_dtype
        values = ensure_wrapped_if_datetimelike(self.sp_values)
        sp_values = astype_array(values, subtype, copy=copy)
        sp_values = np.asarray(sp_values)

        return self._simple_new(sp_values, self.sp_index, dtype)

    def map(self, mapper, na_action=None) -> Self:
        """
        Map categories using an input mapping or function.

        Parameters
        ----------
        mapper : dict, Series, callable
            The correspondence from old values to new.
        na_action : {None, 'ignore'}, default None
            If 'ignore', propagate NA values, without passing them to the
            mapping correspondence.

        Returns
        -------
        SparseArray
            The output array will have the same density as the input.
            The output fill value will be the result of applying the
            mapping to ``self.fill_value``

        Examples
        --------
        >>> arr = pd.arrays.SparseArray([0, 1, 2])
        >>> arr.map(lambda x: x + 10)
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)

        >>> arr.map({0: 10, 1: 11, 2: 12})
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)

        >>> arr.map(pd.Series([10, 11, 12], index=[0, 1, 2]))
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)
        """
        is_map = isinstance(mapper, (abc.Mapping, ABCSeries))

        fill_val = self.fill_value

        if na_action is None or notna(fill_val):
            fill_val = mapper.get(fill_val, fill_val) if is_map else mapper(fill_val)

        def func(sp_val):
            new_sp_val = mapper.get(sp_val, None) if is_map else mapper(sp_val)
            # check identity and equality because nans are not equal to each other
            if new_sp_val is fill_val or new_sp_val == fill_val:
                msg = "fill value in the sparse values not supported"
                raise ValueError(msg)
            return new_sp_val

        sp_values = [func(x) for x in self.sp_values]

        return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_val)

    def to_dense(self) -> np.ndarray:
        """
        Convert SparseArray to a NumPy array.

        Returns
        -------
        arr : NumPy array
        """
        return np.asarray(self, dtype=self.sp_values.dtype)

    def _where(self, mask, value):
        # NB: may not preserve dtype, e.g. result may be Sparse[float64]
        # while self is Sparse[int64]
        naive_implementation = np.where(mask, self, value)
        dtype = SparseDtype(naive_implementation.dtype, fill_value=self.fill_value)
        result = type(self)._from_sequence(naive_implementation, dtype=dtype)
        return result

    # ------------------------------------------------------------------------
    # IO
    # ------------------------------------------------------------------------
    def __setstate__(self, state) -> None:
        """Necessary for making this object picklable"""
        if isinstance(state, tuple):
            # Compat for pandas < 0.24.0
            nd_state, (fill_value, sp_index) = state
            sparse_values = np.array([])
            sparse_values.__setstate__(nd_state)

            self._sparse_values = sparse_values
            self._sparse_index = sp_index
            self._dtype = SparseDtype(sparse_values.dtype, fill_value)
        else:
            self.__dict__.update(state)

    def nonzero(self) -> tuple[npt.NDArray[np.int32]]:
        if self.fill_value == 0:
            return (self.sp_index.indices,)
        else:
            return (self.sp_index.indices[self.sp_values != 0],)

    # ------------------------------------------------------------------------
    # Reductions
    # ------------------------------------------------------------------------

    def _reduce(
        self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
    ):
        method = getattr(self, name, None)

        if method is None:
            raise TypeError(f"cannot perform {name} with type {self.dtype}")

        if skipna:
            arr = self
        else:
            arr = self.dropna()

        result = getattr(arr, name)(**kwargs)

        if keepdims:
            return type(self)([result], dtype=self.dtype)
        else:
            return result

    def all(self, axis=None, *args, **kwargs):
        """
        Tests whether all elements evaluate True

        Returns
        -------
        all : bool

        See Also
        --------
        numpy.all
        """
        nv.validate_all(args, kwargs)

        values = self.sp_values

        if len(values) != len(self) and not np.all(self.fill_value):
            return False

        return values.all()

    def any(self, axis: AxisInt = 0, *args, **kwargs) -> bool:
        """
        Tests whether at least one element evaluates True

        Returns
        -------
        any : bool

        See Also
        --------
        numpy.any
        """
        nv.validate_any(args, kwargs)

        values = self.sp_values

        if len(values) != len(self) and np.any(self.fill_value):
            return True

        return values.any().item()

    def sum(
        self,
        axis: AxisInt = 0,
        min_count: int = 0,
        skipna: bool = True,
        *args,
        **kwargs,
    ) -> Scalar:
        """
        Sum of non-NA/null values

        Parameters
        ----------
        axis : int, default 0
            Not Used. NumPy compatibility.
        min_count : int, default 0
            The required number of valid values to perform the summation. If fewer
            than ``min_count`` valid values are present, the result will be the
            missing value indicator for the subarray type.
        *args, **kwargs
            Not Used. NumPy compatibility.

        Returns
        -------
        scalar
        """
        nv.validate_sum(args, kwargs)
        valid_vals = self._valid_sp_values
        sp_sum = valid_vals.sum()
        has_na = self.sp_index.ngaps > 0 and not self._null_fill_value

        if has_na and not skipna:
            return na_value_for_dtype(self.dtype.subtype, compat=False)

        if self._null_fill_value:
            if check_below_min_count(valid_vals.shape, None, min_count):
                return na_value_for_dtype(self.dtype.subtype, compat=False)
            return sp_sum
        else:
            nsparse = self.sp_index.ngaps
            if check_below_min_count(valid_vals.shape, None, min_count - nsparse):
                return na_value_for_dtype(self.dtype.subtype, compat=False)
            return sp_sum + self.fill_value * nsparse
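
    # Illustrative sketch (not part of the pandas API): with a non-NA
    # fill_value, the min_count check above is relaxed by ngaps, because the
    # gaps count as valid (non-missing) values. A hypothetical demo
    # staticmethod.
    @staticmethod
    def _demo_sum_min_count() -> None:
        arr = SparseArray([0, 0, 1, 2], fill_value=0)  # 2 stored, 2 gaps
        assert arr.sum() == 3
        assert arr.sum(min_count=4) == 3   # 2 stored + 2 gaps >= 4
        assert isna(arr.sum(min_count=5))  # below min_count -> NA result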


    def cumsum(self, axis: AxisInt = 0, *args, **kwargs) -> SparseArray:
        """
        Cumulative sum of non-NA/null values.

        When performing the cumulative summation, NA/null values are
        skipped. The resulting SparseArray will preserve the locations of
        NaN values, but the fill value will be `np.nan` regardless.

        Parameters
        ----------
        axis : int or None
            Axis over which to perform the cumulative summation. If None,
            perform cumulative summation over flattened array.

        Returns
        -------
        cumsum : SparseArray
        """
        nv.validate_cumsum(args, kwargs)

        if axis is not None and axis >= self.ndim:  # Mimic ndarray behaviour.
            raise ValueError(f"axis(={axis}) out of bounds")

        if not self._null_fill_value:
            return SparseArray(self.to_dense()).cumsum()

        return SparseArray(
            self.sp_values.cumsum(),
            sparse_index=self.sp_index,
            fill_value=self.fill_value,
        )

    def mean(self, axis: Axis = 0, *args, **kwargs):
        """
        Mean of non-NA/null values

        Returns
        -------
        mean : float
        """
        nv.validate_mean(args, kwargs)
        valid_vals = self._valid_sp_values
        sp_sum = valid_vals.sum()
        ct = len(valid_vals)

        if self._null_fill_value:
            return sp_sum / ct
        else:
            nsparse = self.sp_index.ngaps
            return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)

    def max(self, *, axis: AxisInt | None = None, skipna: bool = True):
        """
        Max of array values, ignoring NA values if specified.

        Parameters
        ----------
        axis : int, default 0
            Not Used. NumPy compatibility.
        skipna : bool, default True
            Whether to ignore NA values.

        Returns
        -------
        scalar
        """
        nv.validate_minmax_axis(axis, self.ndim)
        return self._min_max("max", skipna=skipna)

    def min(self, *, axis: AxisInt | None = None, skipna: bool = True):
        """
        Min of array values, ignoring NA values if specified.

        Parameters
        ----------
        axis : int, default 0
            Not Used. NumPy compatibility.
        skipna : bool, default True
            Whether to ignore NA values.

        Returns
        -------
        scalar
        """
        nv.validate_minmax_axis(axis, self.ndim)
        return self._min_max("min", skipna=skipna)

    def _min_max(self, kind: Literal["min", "max"], skipna: bool) -> Scalar:
        """
        Min/max of non-NA/null values

        Parameters
        ----------
        kind : {"min", "max"}
        skipna : bool

        Returns
        -------
        scalar
        """
        valid_vals = self._valid_sp_values
        has_nonnull_fill_vals = not self._null_fill_value and self.sp_index.ngaps > 0

        if len(valid_vals) > 0:
            sp_min_max = getattr(valid_vals, kind)()

            # If a non-null fill value is currently present, it might be the min/max
            if has_nonnull_fill_vals:
                func = max if kind == "max" else min
                return func(sp_min_max, self.fill_value)
            elif skipna:
                return sp_min_max
            elif self.sp_index.ngaps == 0:
                # No NAs present
                return sp_min_max
            else:
                return na_value_for_dtype(self.dtype.subtype, compat=False)
        elif has_nonnull_fill_vals:
            return self.fill_value
        else:
            return na_value_for_dtype(self.dtype.subtype, compat=False)

    def _argmin_argmax(self, kind: Literal["argmin", "argmax"]) -> int:
        values = self._sparse_values
        index = self._sparse_index.indices
        mask = np.asarray(isna(values))
        func = np.argmax if kind == "argmax" else np.argmin

        idx = np.arange(values.shape[0])
        non_nans = values[~mask]
        non_nan_idx = idx[~mask]

        _candidate = non_nan_idx[func(non_nans)]
        candidate = index[_candidate]

        if isna(self.fill_value):
            return candidate
        if kind == "argmin" and self[candidate] < self.fill_value:
            return candidate
        if kind == "argmax" and self[candidate] > self.fill_value:
            return candidate
        _loc = self._first_fill_value_loc()
        if _loc == -1:
            # fill_value doesn't exist
            return candidate
        else:
            return _loc

    def argmax(self, skipna: bool = True) -> int:
        validate_bool_kwarg(skipna, "skipna")
        if not skipna and self._hasna:
            raise NotImplementedError
        return self._argmin_argmax("argmax")

    def argmin(self, skipna: bool = True) -> int:
        validate_bool_kwarg(skipna, "skipna")
        if not skipna and self._hasna:
            raise NotImplementedError
        return self._argmin_argmax("argmin")

    # ------------------------------------------------------------------------
    # Ufuncs
    # ------------------------------------------------------------------------

    _HANDLED_TYPES = (np.ndarray, numbers.Number)

    def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
        out = kwargs.get("out", ())

        for x in inputs + out:
            if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)):
                return NotImplemented

        # for binary ops, use our custom dunder methods
        result = arraylike.maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        if "out" in kwargs:
            # e.g. tests.arrays.sparse.test_arithmetics.test_ndarray_inplace
            res = arraylike.dispatch_ufunc_with_out(
                self, ufunc, method, *inputs, **kwargs
            )
            return res

        if method == "reduce":
            result = arraylike.dispatch_reduction_ufunc(
                self, ufunc, method, *inputs, **kwargs
            )
            if result is not NotImplemented:
                # e.g. tests.series.test_ufunc.TestNumpyReductions
                return result

        if len(inputs) == 1:
            # No alignment necessary.
            sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
            fill_value = getattr(ufunc, method)(self.fill_value, **kwargs)

            if ufunc.nout > 1:
                # multiple outputs. e.g. modf
                arrays = tuple(
                    self._simple_new(
                        sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv)
                    )
                    for sp_value, fv in zip(sp_values, fill_value)
                )
                return arrays
            elif method == "reduce":
                # e.g. reductions
                return sp_values

            return self._simple_new(
                sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value)
            )

        new_inputs = tuple(np.asarray(x) for x in inputs)
        result = getattr(ufunc, method)(*new_inputs, **kwargs)
        if out:
            if len(out) == 1:
                out = out[0]
            return out

        if ufunc.nout > 1:
            return tuple(type(self)(x) for x in result)
        elif method == "at":
            # no return value
            return None
        else:
            return type(self)(result)

    # ------------------------------------------------------------------------
    # Ops
    # ------------------------------------------------------------------------

    def _arith_method(self, other, op):
        op_name = op.__name__

        if isinstance(other, SparseArray):
            return _sparse_array_op(self, other, op, op_name)

        elif is_scalar(other):
            with np.errstate(all="ignore"):
                fill = op(_get_fill(self), np.asarray(other))
                result = op(self.sp_values, other)

            if op_name == "divmod":
                left, right = result
                lfill, rfill = fill
                return (
                    _wrap_result(op_name, left, self.sp_index, lfill),
                    _wrap_result(op_name, right, self.sp_index, rfill),
                )

            return _wrap_result(op_name, result, self.sp_index, fill)

        else:
            other = np.asarray(other)
            with np.errstate(all="ignore"):
                if len(self) != len(other):
                    raise AssertionError(
                        f"length mismatch: {len(self)} vs. {len(other)}"
                    )
                if not isinstance(other, SparseArray):
                    dtype = getattr(other, "dtype", None)
                    other = SparseArray(other, fill_value=self.fill_value, dtype=dtype)
                return _sparse_array_op(self, other, op, op_name)

    def _cmp_method(self, other, op) -> SparseArray:
        if not is_scalar(other) and not isinstance(other, type(self)):
            # convert list-like to ndarray
            other = np.asarray(other)

        if isinstance(other, np.ndarray):
            # TODO: make this more flexible than just ndarray...
            other = SparseArray(other, fill_value=self.fill_value)

        if isinstance(other, SparseArray):
            if len(self) != len(other):
                raise ValueError(
                    f"operands have mismatched length {len(self)} and {len(other)}"
                )

            op_name = op.__name__.strip("_")
            return _sparse_array_op(self, other, op, op_name)
        else:
            # scalar
            fill_value = op(self.fill_value, other)
            result = np.full(len(self), fill_value, dtype=np.bool_)
            result[self.sp_index.indices] = op(self.sp_values, other)

            return type(self)(
                result,
                fill_value=fill_value,
                dtype=np.bool_,
            )

    _logical_method = _cmp_method

    def _unary_method(self, op) -> SparseArray:
        fill_value = op(np.array(self.fill_value)).item()
        dtype = SparseDtype(self.dtype.subtype, fill_value)
        # NOTE: if fill_value doesn't change
        # we just have to apply op to sp_values
        if isna(self.fill_value) or fill_value == self.fill_value:
            values = op(self.sp_values)
            return type(self)._simple_new(values, self.sp_index, self.dtype)
        # In the other case we have to recalc indexes
        return type(self)(op(self.to_dense()), dtype=dtype)

    def __pos__(self) -> SparseArray:
        return self._unary_method(operator.pos)

    def __neg__(self) -> SparseArray:
        return self._unary_method(operator.neg)

    def __invert__(self) -> SparseArray:
        return self._unary_method(operator.invert)

    def __abs__(self) -> SparseArray:
        return self._unary_method(operator.abs)

    # ----------
    # Formatting
    # ----------
    def __repr__(self) -> str:
        pp_str = printing.pprint_thing(self)
        pp_fill = printing.pprint_thing(self.fill_value)
        pp_index = printing.pprint_thing(self.sp_index)
        return f"{pp_str}\nFill: {pp_fill}\n{pp_index}"

    def _formatter(self, boxed: bool = False):
        # Defer to the formatter from the GenericArrayFormatter calling us.
        # This will infer the correct formatter from the dtype of the values.
        return None


def _make_sparse(
    arr: np.ndarray,
    kind: SparseIndexKind = "block",
    fill_value=None,
    dtype: np.dtype | None = None,
):
    """
    Convert ndarray to sparse format

    Parameters
    ----------
    arr : ndarray
    kind : {'block', 'integer'}
    fill_value : NaN or another value
    dtype : np.dtype, optional

    Returns
    -------
    (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
    """
    assert isinstance(arr, np.ndarray)

    if arr.ndim > 1:
        raise TypeError("expected dimension <= 1 data")

    if fill_value is None:
        fill_value = na_value_for_dtype(arr.dtype)

    if isna(fill_value):
        mask = notna(arr)
    else:
        # cast to object comparison to be safe
        if is_string_dtype(arr.dtype):
            arr = arr.astype(object)

        if is_object_dtype(arr.dtype):
            # NumPy's element-wise equality check does not distinguish element
            # types, e.g. 0, 0.0, and False all compare equal, so we have to
            # check both the type and the value of each element.
            mask = splib.make_mask_object_ndarray(arr, fill_value)
        else:
            mask = arr != fill_value

    length = len(arr)
    if length != len(mask):
        # the arr is a SparseArray
        indices = mask.sp_index.indices
    else:
        indices = mask.nonzero()[0].astype(np.int32)

    index = make_sparse_index(length, indices, kind)
    sparsified_values = arr[mask]
    if dtype is not None:
        sparsified_values = ensure_wrapped_if_datetimelike(sparsified_values)
        sparsified_values = astype_array(sparsified_values, dtype=dtype)
        sparsified_values = np.asarray(sparsified_values)

    # TODO: copy
    return sparsified_values, index, fill_value


@overload
def make_sparse_index(length: int, indices, kind: Literal["block"]) -> BlockIndex:
    ...


@overload
def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntIndex:
    ...


def make_sparse_index(length: int, indices, kind: SparseIndexKind) -> SparseIndex:
    index: SparseIndex
    if kind == "block":
        locs, lens = splib.get_blocks(indices)
        index = BlockIndex(length, locs, lens)
    elif kind == "integer":
        index = IntIndex(length, indices)
    else:  # pragma: no cover
        raise ValueError("must be block or integer type")
    return index
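

# Illustrative sketch (not part of the original module): "block" kind groups
# consecutive positions into (location, run-length) pairs, while "integer"
# stores every position individually. A hedged example wrapped in a
# hypothetical function so importing the module stays side-effect free.
def _demo_make_sparse_index() -> None:
    idx = np.array([2, 3, 4, 7], dtype=np.int32)
    block = make_sparse_index(10, idx, kind="block")
    assert list(block.blocs) == [2, 7]       # runs start at 2 and 7
    assert list(block.blengths) == [3, 1]    # of lengths 3 and 1
    integer = make_sparse_index(10, idx, kind="integer")
    assert list(integer.indices) == [2, 3, 4, 7]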