
1""" 

2SparseArray data structure 

3""" 

4from __future__ import annotations 

5 

6from collections import abc 

7import numbers 

8import operator 

9from typing import ( 

10 TYPE_CHECKING, 

11 Any, 

12 Callable, 

13 Literal, 

14 Sequence, 

15 TypeVar, 

16 cast, 

17 overload, 

18) 

19import warnings 

20 

21import numpy as np 

22 

23from pandas._libs import lib 

24import pandas._libs.sparse as splib 

25from pandas._libs.sparse import ( 

26 BlockIndex, 

27 IntIndex, 

28 SparseIndex, 

29) 

30from pandas._libs.tslibs import NaT 

31from pandas._typing import ( 

32 ArrayLike, 

33 AstypeArg, 

34 Axis, 

35 AxisInt, 

36 Dtype, 

37 NpDtype, 

38 PositionalIndexer, 

39 Scalar, 

40 ScalarIndexer, 

41 SequenceIndexer, 

42 npt, 

43) 

44from pandas.compat.numpy import function as nv 

45from pandas.errors import PerformanceWarning 

46from pandas.util._exceptions import find_stack_level 

47from pandas.util._validators import ( 

48 validate_bool_kwarg, 

49 validate_insert_loc, 

50) 

51 

52from pandas.core.dtypes.astype import astype_array 

53from pandas.core.dtypes.cast import ( 

54 construct_1d_arraylike_from_scalar, 

55 find_common_type, 

56 maybe_box_datetimelike, 

57) 

58from pandas.core.dtypes.common import ( 

59 is_array_like, 

60 is_bool_dtype, 

61 is_datetime64_any_dtype, 

62 is_datetime64tz_dtype, 

63 is_dtype_equal, 

64 is_integer, 

65 is_list_like, 

66 is_object_dtype, 

67 is_scalar, 

68 is_string_dtype, 

69 pandas_dtype, 

70) 

71from pandas.core.dtypes.generic import ( 

72 ABCIndex, 

73 ABCSeries, 

74) 

75from pandas.core.dtypes.missing import ( 

76 isna, 

77 na_value_for_dtype, 

78 notna, 

79) 

80 

81from pandas.core import ( 

82 arraylike, 

83 ops, 

84) 

85import pandas.core.algorithms as algos 

86from pandas.core.arraylike import OpsMixin 

87from pandas.core.arrays import ExtensionArray 

88from pandas.core.arrays.sparse.dtype import SparseDtype 

89from pandas.core.base import PandasObject 

90import pandas.core.common as com 

91from pandas.core.construction import ( 

92 ensure_wrapped_if_datetimelike, 

93 extract_array, 

94 sanitize_array, 

95) 

96from pandas.core.indexers import ( 

97 check_array_indexer, 

98 unpack_tuple_and_ellipses, 

99) 

100from pandas.core.missing import interpolate_2d 

101from pandas.core.nanops import check_below_min_count 

102 

103from pandas.io.formats import printing 

104 

105# See https://github.com/python/typing/issues/684 

106if TYPE_CHECKING: 

107 from enum import Enum 

108 

109 class ellipsis(Enum): 

110 Ellipsis = "..." 

111 

112 Ellipsis = ellipsis.Ellipsis 

113 

114 from scipy.sparse import spmatrix 

115 

116 from pandas._typing import ( 

117 FillnaOptions, 

118 NumpySorter, 

119 ) 

120 

121 SparseIndexKind = Literal["integer", "block"] 

122 

123 from pandas import Series 

124 

125else: 

126 ellipsis = type(Ellipsis) 

127 

128 

129# ---------------------------------------------------------------------------- 

130# Array 

131 

132SparseArrayT = TypeVar("SparseArrayT", bound="SparseArray") 

133 

134_sparray_doc_kwargs = {"klass": "SparseArray"} 

135 

136 

137def _get_fill(arr: SparseArray) -> np.ndarray: 

138 """ 

139 Create a 0-dim ndarray containing the fill value 

140 

141 Parameters 

142 ---------- 

143 arr : SparseArray 

144 

145 Returns 

146 ------- 

147 fill_value : ndarray 

148 0-dim ndarray with just the fill value. 

149 

150 Notes 

151 ----- 

152 coerce fill_value to arr dtype if possible 

153 int64 SparseArray can have NaN as fill_value if there is no missing 

154 """ 

155 try: 

156 return np.asarray(arr.fill_value, dtype=arr.dtype.subtype) 

157 except ValueError: 

158 return np.asarray(arr.fill_value) 
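
# A hedged illustration (not part of the original module) of the coercion
# above: with an int64 subtype and an integer fill the ``try`` branch
# succeeds, while a NaN fill on an int64 subtype cannot be cast and falls
# through to the bare ``np.asarray`` branch.
#
#     arr = SparseArray([1, 0, 2], fill_value=0)
#     _get_fill(arr)   # 0-dim int64 array: array(0)
#
#     arr = SparseArray([1, 2, 3], fill_value=np.nan)  # Sparse[int64, nan]
#     _get_fill(arr)   # int64 cast of NaN raises -> float64 array(nan)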



def _sparse_array_op(
    left: SparseArray, right: SparseArray, op: Callable, name: str
) -> SparseArray:
    """
    Perform a binary operation between two arrays.

    Parameters
    ----------
    left : SparseArray
    right : SparseArray
    op : Callable
        The binary operation to perform.
    name : str
        Name of the callable.

    Returns
    -------
    SparseArray
    """
    if name.startswith("__"):
        # For lookups in _libs.sparse we need non-dunder op name
        name = name[2:-2]

    # dtype used to find corresponding sparse method
    ltype = left.dtype.subtype
    rtype = right.dtype.subtype

    if not is_dtype_equal(ltype, rtype):
        subtype = find_common_type([ltype, rtype])
        ltype = SparseDtype(subtype, left.fill_value)
        rtype = SparseDtype(subtype, right.fill_value)

        left = left.astype(ltype, copy=False)
        right = right.astype(rtype, copy=False)
        dtype = ltype.subtype
    else:
        dtype = ltype

    # dtype the result must have
    result_dtype = None

    if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
        with np.errstate(all="ignore"):
            result = op(left.to_dense(), right.to_dense())
            fill = op(_get_fill(left), _get_fill(right))

        if left.sp_index.ngaps == 0:
            index = left.sp_index
        else:
            index = right.sp_index
    elif left.sp_index.equals(right.sp_index):
        with np.errstate(all="ignore"):
            result = op(left.sp_values, right.sp_values)
            fill = op(_get_fill(left), _get_fill(right))
        index = left.sp_index
    else:
        if name[0] == "r":
            left, right = right, left
            name = name[1:]

        if name in ("and", "or", "xor") and dtype == "bool":
            opname = f"sparse_{name}_uint8"
            # to make template simple, cast here
            left_sp_values = left.sp_values.view(np.uint8)
            right_sp_values = right.sp_values.view(np.uint8)
            result_dtype = bool
        else:
            opname = f"sparse_{name}_{dtype}"
            left_sp_values = left.sp_values
            right_sp_values = right.sp_values

        if (
            name in ["floordiv", "mod"]
            and (right == 0).any()
            and left.dtype.kind in ["i", "u"]
        ):
            # Match the non-Sparse Series behavior
            opname = f"sparse_{name}_float64"
            left_sp_values = left_sp_values.astype("float64")
            right_sp_values = right_sp_values.astype("float64")

        sparse_op = getattr(splib, opname)

        with np.errstate(all="ignore"):
            result, index, fill = sparse_op(
                left_sp_values,
                left.sp_index,
                left.fill_value,
                right_sp_values,
                right.sp_index,
                right.fill_value,
            )

    if name == "divmod":
        # result is a 2-tuple
        # error: Incompatible return value type (got "Tuple[SparseArray,
        # SparseArray]", expected "SparseArray")
        return (  # type: ignore[return-value]
            _wrap_result(name, result[0], index, fill[0], dtype=result_dtype),
            _wrap_result(name, result[1], index, fill[1], dtype=result_dtype),
        )

    if result_dtype is None:
        result_dtype = result.dtype

    return _wrap_result(name, result, index, fill, dtype=result_dtype)
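
# A hedged sketch (not in the original module) of the index-mismatch path
# above: the operands share neither an all-dense layout nor an equal sparse
# index, so the computation dispatches to the generated
# ``splib.sparse_add_int64`` kernel.
#
#     left = SparseArray([0, 1, 2], fill_value=0)    # indices [1, 2]
#     right = SparseArray([0, 0, 3], fill_value=0)   # indices [2]
#     _sparse_array_op(left, right, operator.add, "add")
#     # dense equivalent [0, 1, 5]; result fill_value is 0 + 0 == 0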



def _wrap_result(
    name: str, data, sparse_index, fill_value, dtype: Dtype | None = None
) -> SparseArray:
    """
    wrap op result to have correct dtype
    """
    if name.startswith("__"):
        # e.g. __eq__ --> eq
        name = name[2:-2]

    if name in ("eq", "ne", "lt", "gt", "le", "ge"):
        dtype = bool

    fill_value = lib.item_from_zerodim(fill_value)

    if is_bool_dtype(dtype):
        # fill_value may be np.bool_
        fill_value = bool(fill_value)
    return SparseArray(
        data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype
    )


class SparseArray(OpsMixin, PandasObject, ExtensionArray):
    """
    An ExtensionArray for storing sparse data.

    Parameters
    ----------
    data : array-like or scalar
        A dense array of values to store in the SparseArray. This may contain
        `fill_value`.
    sparse_index : SparseIndex, optional
    fill_value : scalar, optional
        Elements in data that are ``fill_value`` are not stored in the
        SparseArray. For memory savings, this should be the most common value
        in `data`. By default, `fill_value` depends on the dtype of `data`:

        =========== ==========
        data.dtype  na_value
        =========== ==========
        float       ``np.nan``
        int         ``0``
        bool        False
        datetime64  ``pd.NaT``
        timedelta64 ``pd.NaT``
        =========== ==========

        The fill value is potentially specified in three ways. In order of
        precedence, these are

        1. The `fill_value` argument
        2. ``dtype.fill_value`` if `fill_value` is None and `dtype` is
           a ``SparseDtype``
        3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype`
           is not a ``SparseDtype`` and `data` is a ``SparseArray``.

    kind : str
        Can be 'integer' or 'block', default is 'integer'.
        The type of storage for sparse locations.

        * 'block': Stores a `block` and `block_length` for each
          contiguous *span* of sparse values. This is best when
          sparse data tends to be clumped together, with large
          regions of ``fill_value`` values between sparse values.
        * 'integer': uses an integer to store the location of
          each sparse value.

    dtype : np.dtype or SparseDtype, optional
        The dtype to use for the SparseArray. For numpy dtypes, this
        determines the dtype of ``self.sp_values``. For SparseDtype,
        this determines ``self.sp_values`` and ``self.fill_value``.
    copy : bool, default False
        Whether to explicitly copy the incoming `data` array.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> from pandas.arrays import SparseArray
    >>> arr = SparseArray([0, 0, 1, 2])
    >>> arr
    [0, 0, 1, 2]
    Fill: 0
    IntIndex
    Indices: array([2, 3], dtype=int32)
    """

    _subtyp = "sparse_array"  # register ABCSparseArray
    _hidden_attrs = PandasObject._hidden_attrs | frozenset([])
    _sparse_index: SparseIndex
    _sparse_values: np.ndarray
    _dtype: SparseDtype

    def __init__(
        self,
        data,
        sparse_index=None,
        fill_value=None,
        kind: SparseIndexKind = "integer",
        dtype: Dtype | None = None,
        copy: bool = False,
    ) -> None:
        if fill_value is None and isinstance(dtype, SparseDtype):
            fill_value = dtype.fill_value

        if isinstance(data, type(self)):
            # disable normal inference on dtype, sparse_index, & fill_value
            if sparse_index is None:
                sparse_index = data.sp_index
            if fill_value is None:
                fill_value = data.fill_value
            if dtype is None:
                dtype = data.dtype
            # TODO: make kind=None, and use data.kind?
            data = data.sp_values

        # Handle user-provided dtype
        if isinstance(dtype, str):
            # Two options: dtype='int', regular numpy dtype
            # or dtype='Sparse[int]', a sparse dtype
            try:
                dtype = SparseDtype.construct_from_string(dtype)
            except TypeError:
                dtype = pandas_dtype(dtype)

        if isinstance(dtype, SparseDtype):
            if fill_value is None:
                fill_value = dtype.fill_value
            dtype = dtype.subtype

        if is_scalar(data):
            if sparse_index is None:
                npoints = 1
            else:
                npoints = sparse_index.length

            data = construct_1d_arraylike_from_scalar(data, npoints, dtype=None)
            dtype = data.dtype

        if dtype is not None:
            dtype = pandas_dtype(dtype)

        # TODO: disentangle the fill_value dtype inference from
        # dtype inference
        if data is None:
            # TODO: What should the empty dtype be? Object or float?

            # error: Argument "dtype" to "array" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], None]"; expected "Union[dtype[Any],
            # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any,
            # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
            data = np.array([], dtype=dtype)  # type: ignore[arg-type]

        if not is_array_like(data):
            try:
                # probably shared code in sanitize_series

                data = sanitize_array(data, index=None)
            except ValueError:
                # NumPy may raise a ValueError on data like [1, []]
                # we retry with object dtype here.
                if dtype is None:
                    dtype = np.dtype(object)
                    data = np.atleast_1d(np.asarray(data, dtype=dtype))
                else:
                    raise

        if copy:
            # TODO: avoid double copy when dtype forces cast.
            data = data.copy()

        if fill_value is None:
            fill_value_dtype = data.dtype if dtype is None else dtype
            if fill_value_dtype is None:
                fill_value = np.nan
            else:
                fill_value = na_value_for_dtype(fill_value_dtype)

        if isinstance(data, type(self)) and sparse_index is None:
            sparse_index = data._sparse_index
            # error: Argument "dtype" to "asarray" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], None]"; expected "None"
            sparse_values = np.asarray(
                data.sp_values, dtype=dtype  # type: ignore[arg-type]
            )
        elif sparse_index is None:
            data = extract_array(data, extract_numpy=True)
            if not isinstance(data, np.ndarray):
                # EA
                if is_datetime64tz_dtype(data.dtype):
                    warnings.warn(
                        f"Creating SparseArray from {data.dtype} data "
                        "loses timezone information. Cast to object before "
                        "sparse to retain timezone information.",
                        UserWarning,
                        stacklevel=find_stack_level(),
                    )
                    data = np.asarray(data, dtype="datetime64[ns]")
                    if fill_value is NaT:
                        fill_value = np.datetime64("NaT", "ns")
                data = np.asarray(data)
            sparse_values, sparse_index, fill_value = _make_sparse(
                # error: Argument "dtype" to "_make_sparse" has incompatible type
                # "Union[ExtensionDtype, dtype[Any], None]"; expected
                # "Optional[dtype[Any]]"
                data,
                kind=kind,
                fill_value=fill_value,
                dtype=dtype,  # type: ignore[arg-type]
            )
        else:
            # error: Argument "dtype" to "asarray" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], None]"; expected "None"
            sparse_values = np.asarray(data, dtype=dtype)  # type: ignore[arg-type]
            if len(sparse_values) != sparse_index.npoints:
                raise AssertionError(
                    f"Non array-like type {type(sparse_values)} must "
                    "have the same length as the index"
                )
        self._sparse_index = sparse_index
        self._sparse_values = sparse_values
        self._dtype = SparseDtype(sparse_values.dtype, fill_value)
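
    # A hedged illustration (not part of the original module) of the
    # fill_value precedence documented in the class docstring:
    #
    #     SparseArray([0, 0, 1], fill_value=0).fill_value              # 0 (argument)
    #     SparseArray([0, 0, 1], dtype=SparseDtype("int64", 0)).fill_value  # 0 (dtype)
    #     SparseArray([0.0, 1.0]).fill_value                           # nan (float default)
    #     SparseArray([0, 0, 1]).fill_value                            # 0 (int default)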


    @classmethod
    def _simple_new(
        cls: type[SparseArrayT],
        sparse_array: np.ndarray,
        sparse_index: SparseIndex,
        dtype: SparseDtype,
    ) -> SparseArrayT:
        new = object.__new__(cls)
        new._sparse_index = sparse_index
        new._sparse_values = sparse_array
        new._dtype = dtype
        return new

    @classmethod
    def from_spmatrix(cls: type[SparseArrayT], data: spmatrix) -> SparseArrayT:
        """
        Create a SparseArray from a scipy.sparse matrix.

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            This should be a SciPy sparse matrix where the size
            of the second dimension is 1. In other words, a
            sparse matrix with a single column.

        Returns
        -------
        SparseArray

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.coo_matrix((4, 1))
        >>> pd.arrays.SparseArray.from_spmatrix(mat)
        [0.0, 0.0, 0.0, 0.0]
        Fill: 0.0
        IntIndex
        Indices: array([], dtype=int32)
        """
        length, ncol = data.shape

        if ncol != 1:
            raise ValueError(f"'data' must have a single column, not '{ncol}'")

        # our sparse index classes require that the positions be strictly
        # increasing. So we need to sort loc, and arr accordingly.
        data = data.tocsc()
        data.sort_indices()
        arr = data.data
        idx = data.indices

        zero = np.array(0, dtype=arr.dtype).item()
        dtype = SparseDtype(arr.dtype, zero)
        index = IntIndex(length, idx)

        return cls._simple_new(arr, index, dtype)

    def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
        fill_value = self.fill_value

        if self.sp_index.ngaps == 0:
            # Compat for na dtype and int values.
            return self.sp_values
        if dtype is None:
            # Can NumPy represent this type?
            # If not, `np.result_type` will raise. We catch that
            # and return object.
            if is_datetime64_any_dtype(self.sp_values.dtype):
                # However, we *do* special-case the common case of
                # a datetime64 with pandas NaT.
                if fill_value is NaT:
                    # Can't put pd.NaT in a datetime64[ns]
                    fill_value = np.datetime64("NaT")
            try:
                dtype = np.result_type(self.sp_values.dtype, type(fill_value))
            except TypeError:
                dtype = object

        out = np.full(self.shape, fill_value, dtype=dtype)
        out[self.sp_index.indices] = self.sp_values
        return out

    def __setitem__(self, key, value):
        # I suppose we could allow setting of non-fill_value elements.
        # TODO(SparseArray.__setitem__): remove special cases in
        # ExtensionBlock.where
        msg = "SparseArray does not support item assignment via setitem"
        raise TypeError(msg)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        return cls(scalars, dtype=dtype)

    @classmethod
    def _from_factorized(cls, values, original):
        return cls(values, dtype=original.dtype)

    # ------------------------------------------------------------------------
    # Data
    # ------------------------------------------------------------------------
    @property
    def sp_index(self) -> SparseIndex:
        """
        The SparseIndex containing the location of non- ``fill_value`` points.
        """
        return self._sparse_index

    @property
    def sp_values(self) -> np.ndarray:
        """
        An ndarray containing the non- ``fill_value`` values.

        Examples
        --------
        >>> from pandas.arrays import SparseArray
        >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0)
        >>> s.sp_values
        array([1, 2])
        """
        return self._sparse_values

    @property
    def dtype(self) -> SparseDtype:
        return self._dtype

    @property
    def fill_value(self):
        """
        Elements in `data` that are `fill_value` are not stored.

        For memory savings, this should be the most common value in the array.
        """
        return self.dtype.fill_value

    @fill_value.setter
    def fill_value(self, value) -> None:
        self._dtype = SparseDtype(self.dtype.subtype, value)

    @property
    def kind(self) -> SparseIndexKind:
        """
        The kind of sparse index for this array. One of {'integer', 'block'}.
        """
        if isinstance(self.sp_index, IntIndex):
            return "integer"
        else:
            return "block"

    @property
    def _valid_sp_values(self) -> np.ndarray:
        sp_vals = self.sp_values
        mask = notna(sp_vals)
        return sp_vals[mask]

    def __len__(self) -> int:
        return self.sp_index.length

    @property
    def _null_fill_value(self) -> bool:
        return self._dtype._is_na_fill_value

    def _fill_value_matches(self, fill_value) -> bool:
        if self._null_fill_value:
            return isna(fill_value)
        else:
            return self.fill_value == fill_value

    @property
    def nbytes(self) -> int:
        return self.sp_values.nbytes + self.sp_index.nbytes

    @property
    def density(self) -> float:
        """
        The percent of non- ``fill_value`` points, as a decimal.

        Examples
        --------
        >>> from pandas.arrays import SparseArray
        >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
        >>> s.density
        0.6
        """
        return self.sp_index.npoints / self.sp_index.length

    @property
    def npoints(self) -> int:
        """
        The number of non- ``fill_value`` points.

        Examples
        --------
        >>> from pandas.arrays import SparseArray
        >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
        >>> s.npoints
        3
        """
        return self.sp_index.npoints

    def isna(self):
        # If null fill value, we want SparseDtype[bool, true]
        # to preserve the same memory usage.
        dtype = SparseDtype(bool, self._null_fill_value)
        if self._null_fill_value:
            return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
        mask = np.full(len(self), False, dtype=np.bool_)
        mask[self.sp_index.indices] = isna(self.sp_values)
        return type(self)(mask, fill_value=False, dtype=dtype)
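
    # A hedged illustration (not part of the original module) of the
    # memory-preserving branch above: with a NaN fill, the NA positions are
    # exactly the gaps, so the same sparse index is reused with fill True.
    #
    #     SparseArray([0.0, np.nan, 1.0]).isna()
    #     # dense equivalent [False, True, False], dtype Sparse[bool, True];
    #     # the stored values isna([0.0, 1.0]) are all False, and the gap at
    #     # position 1 is covered by the fill value True.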


    def fillna(
        self: SparseArrayT,
        value=None,
        method: FillnaOptions | None = None,
        limit: int | None = None,
    ) -> SparseArrayT:
        """
        Fill missing values with `value`.

        Parameters
        ----------
        value : scalar, optional
        method : str, optional

            .. warning::

               Using 'method' will result in high memory use, as the
               sparse array is converted to an in-memory ndarray before
               filling.

        limit : int, optional

        Returns
        -------
        SparseArray

        Notes
        -----
        When `value` is specified, the result's ``fill_value`` depends on
        ``self.fill_value``. The goal is to maintain low-memory use.

        If ``self.fill_value`` is NA, the result dtype will be
        ``SparseDtype(self.dtype, fill_value=value)``. This preserves the
        amount of memory used before and after filling.

        When ``self.fill_value`` is not NA, the result dtype will be
        ``self.dtype``. Again, this preserves the amount of memory used.
        """
        if (method is None and value is None) or (
            method is not None and value is not None
        ):
            raise ValueError("Must specify one of 'method' or 'value'.")

        if method is not None:
            msg = "fillna with 'method' requires high memory usage."
            warnings.warn(
                msg,
                PerformanceWarning,
                stacklevel=find_stack_level(),
            )
            new_values = np.asarray(self)
            # interpolate_2d modifies new_values inplace
            interpolate_2d(new_values, method=method, limit=limit)
            return type(self)(new_values, fill_value=self.fill_value)

        else:
            new_values = np.where(isna(self.sp_values), value, self.sp_values)

            if self._null_fill_value:
                # This is essentially just updating the dtype.
                new_dtype = SparseDtype(self.dtype.subtype, fill_value=value)
            else:
                new_dtype = self.dtype

        return self._simple_new(new_values, self._sparse_index, new_dtype)
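
    # A hedged illustration (not part of the original module) of the Notes
    # above:
    #
    #     s = SparseArray([1.0, np.nan, 2.0])      # fill_value is nan
    #     s.fillna(0.0).dtype                      # Sparse[float64, 0.0]
    #
    #     t = SparseArray([0, np.nan, 1], fill_value=0)
    #     t.fillna(-1).dtype                       # unchanged: Sparse[float64, 0.0]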


    def shift(self: SparseArrayT, periods: int = 1, fill_value=None) -> SparseArrayT:
        if not len(self) or periods == 0:
            return self.copy()

        if isna(fill_value):
            fill_value = self.dtype.na_value

        subtype = np.result_type(fill_value, self.dtype.subtype)

        if subtype != self.dtype.subtype:
            # just coerce up front
            arr = self.astype(SparseDtype(subtype, self.fill_value))
        else:
            arr = self

        empty = self._from_sequence(
            [fill_value] * min(abs(periods), len(self)), dtype=arr.dtype
        )

        if periods > 0:
            a = empty
            b = arr[:-periods]
        else:
            a = arr[abs(periods) :]
            b = empty
        return arr._concat_same_type([a, b])

    def _first_fill_value_loc(self):
        """
        Get the location of the first fill value.

        Returns
        -------
        int
        """
        if len(self) == 0 or self.sp_index.npoints == len(self):
            return -1

        indices = self.sp_index.indices
        if not len(indices) or indices[0] > 0:
            return 0

        # append a number larger than 1 to the diffs so that the case where
        # the fill value appears only in the tail of the array is detected
        diff = np.r_[np.diff(indices), 2]
        return indices[(diff > 1).argmax()] + 1
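
    # A hedged illustration (not part of the original module):
    #
    #     SparseArray([1, 0, 0, 2], fill_value=0)._first_fill_value_loc()  # 1
    #     SparseArray([0, 1, 2], fill_value=0)._first_fill_value_loc()     # 0
    #     SparseArray([1, 2], fill_value=0)._first_fill_value_loc()        # -1
    #
    # In the first case the stored indices are [0, 3], the padded diff is
    # [3, 2], and the first gap starts right after index 0.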


    def unique(self: SparseArrayT) -> SparseArrayT:
        uniques = algos.unique(self.sp_values)
        if len(self.sp_values) != len(self):
            fill_loc = self._first_fill_value_loc()
            # To align with the behavior of pd.unique / pd.Series.unique we
            # must keep the original order, so we use unique again on the
            # leading slice to find the insertion place. Since sp_values is
            # not large, the minor performance cost is worthwhile for the
            # correctness.
            insert_loc = len(algos.unique(self.sp_values[:fill_loc]))
            uniques = np.insert(uniques, insert_loc, self.fill_value)
        return type(self)._from_sequence(uniques, dtype=self.dtype)

    def _values_for_factorize(self):
        # Still override this for hash_pandas_object
        return np.asarray(self), self.fill_value

    def factorize(
        self,
        use_na_sentinel: bool = True,
    ) -> tuple[np.ndarray, SparseArray]:
        # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
        # The sparsity on this is backwards from what Sparse would want. Want
        # ExtensionArray.factorize -> Tuple[EA, EA]
        # Given that we have to return a dense array of codes, why bother
        # implementing an efficient factorize?
        codes, uniques = algos.factorize(
            np.asarray(self), use_na_sentinel=use_na_sentinel
        )
        uniques_sp = SparseArray(uniques, dtype=self.dtype)
        return codes, uniques_sp

    def value_counts(self, dropna: bool = True) -> Series:
        """
        Returns a Series containing counts of unique values.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of NaN, even if NaN is in sp_values.

        Returns
        -------
        counts : Series
        """
        from pandas import (
            Index,
            Series,
        )

        keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
        fcounts = self.sp_index.ngaps
        if fcounts > 0 and (not self._null_fill_value or not dropna):
            mask = isna(keys) if self._null_fill_value else keys == self.fill_value
            if mask.any():
                counts[mask] += fcounts
            else:
                # error: Argument 1 to "insert" has incompatible type "Union[
                # ExtensionArray,ndarray[Any, Any]]"; expected "Union[
                # _SupportsArray[dtype[Any]], Sequence[_SupportsArray[dtype
                # [Any]]], Sequence[Sequence[_SupportsArray[dtype[Any]]]],
                # Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]], Sequence
                # [Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]]]]"
                keys = np.insert(keys, 0, self.fill_value)  # type: ignore[arg-type]
                counts = np.insert(counts, 0, fcounts)

        if not isinstance(keys, ABCIndex):
            index = Index(keys)
        else:
            index = keys
        return Series(counts, index=index, copy=False)
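
    # A hedged illustration (not part of the original module): the stored
    # values are counted directly, and the fill value's count is recovered
    # from the number of gaps in the sparse index.
    #
    #     SparseArray([0, 0, 1, 1, 1], fill_value=0).value_counts()
    #     # 0    2   <- fcounts == sp_index.ngaps
    #     # 1    3   <- counted from sp_values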


    # --------
    # Indexing
    # --------
    @overload
    def __getitem__(self, key: ScalarIndexer) -> Any:
        ...

    @overload
    def __getitem__(
        self: SparseArrayT,
        key: SequenceIndexer | tuple[int | ellipsis, ...],
    ) -> SparseArrayT:
        ...

    def __getitem__(
        self: SparseArrayT,
        key: PositionalIndexer | tuple[int | ellipsis, ...],
    ) -> SparseArrayT | Any:
        if isinstance(key, tuple):
            key = unpack_tuple_and_ellipses(key)
            if key is Ellipsis:
                raise ValueError("Cannot slice with Ellipsis")

        if is_integer(key):
            return self._get_val_at(key)
        elif isinstance(key, tuple):
            # error: Invalid index type "Tuple[Union[int, ellipsis], ...]"
            # for "ndarray[Any, Any]"; expected type
            # "Union[SupportsIndex, _SupportsArray[dtype[Union[bool_,
            # integer[Any]]]], _NestedSequence[_SupportsArray[dtype[
            # Union[bool_, integer[Any]]]]], _NestedSequence[Union[
            # bool, int]], Tuple[Union[SupportsIndex, _SupportsArray[
            # dtype[Union[bool_, integer[Any]]]], _NestedSequence[
            # _SupportsArray[dtype[Union[bool_, integer[Any]]]]],
            # _NestedSequence[Union[bool, int]]], ...]]"
            data_slice = self.to_dense()[key]  # type: ignore[index]
        elif isinstance(key, slice):
            # Avoid densifying when handling contiguous slices
            if key.step is None or key.step == 1:
                start = 0 if key.start is None else key.start
                if start < 0:
                    start += len(self)

                end = len(self) if key.stop is None else key.stop
                if end < 0:
                    end += len(self)

                indices = self.sp_index.indices
                keep_inds = np.flatnonzero((indices >= start) & (indices < end))
                sp_vals = self.sp_values[keep_inds]

                sp_index = indices[keep_inds].copy()

                # If we've sliced to not include the start of the array, all our
                # indices should be shifted. NB: here we are careful to also not
                # shift by a negative value for a case like [0, 1][-100:] where
                # the start index should be treated like 0
                if start > 0:
                    sp_index -= start

                # Length of our result should match applying this slice to a
                # range of the length of our original array
                new_len = len(range(len(self))[key])
                new_sp_index = make_sparse_index(new_len, sp_index, self.kind)
                return type(self)._simple_new(sp_vals, new_sp_index, self.dtype)
            else:
                indices = np.arange(len(self), dtype=np.int32)[key]
                return self.take(indices)

        elif not is_list_like(key):
            # e.g. "foo" or 2.5
            # exception message copied from numpy
            raise IndexError(
                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                r"(`None`) and integer or boolean arrays are valid indices"
            )

        else:
            if isinstance(key, SparseArray):
                # NOTE: If we guarantee that SparseDtype(bool)
                # has only fill_value - true, false or nan
                # (see GH PR 44955)
                # we can apply mask very fast:
                if is_bool_dtype(key):
                    if isna(key.fill_value):
                        return self.take(key.sp_index.indices[key.sp_values])
                    if not key.fill_value:
                        return self.take(key.sp_index.indices)
                    n = len(self)
                    mask = np.full(n, True, dtype=np.bool_)
                    mask[key.sp_index.indices] = False
                    return self.take(np.arange(n)[mask])
                else:
                    key = np.asarray(key)

            key = check_array_indexer(self, key)

            if com.is_bool_indexer(key):
                # mypy doesn't know we have an array here
                key = cast(np.ndarray, key)
                return self.take(np.arange(len(key), dtype=np.int32)[key])
            elif hasattr(key, "__len__"):
                return self.take(key)
            else:
                raise ValueError(f"Cannot slice with '{key}'")

        return type(self)(data_slice, kind=self.kind)
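
    # A hedged illustration (not part of the original module) of the
    # contiguous-slice fast path above:
    #
    #     arr = SparseArray([0, 1, 0, 2, 0], fill_value=0)  # indices [1, 3]
    #     arr[1:4]   # keeps stored indices in [1, 4), shifts them by
    #                # start=1 to [0, 2]; no densification occurs
    #     arr[::2]   # a stepped slice falls back to take()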


    def _get_val_at(self, loc):
        loc = validate_insert_loc(loc, len(self))

        sp_loc = self.sp_index.lookup(loc)
        if sp_loc == -1:
            return self.fill_value
        else:
            val = self.sp_values[sp_loc]
            val = maybe_box_datetimelike(val, self.sp_values.dtype)
            return val

    def take(
        self: SparseArrayT, indices, *, allow_fill: bool = False, fill_value=None
    ) -> SparseArrayT:
        if is_scalar(indices):
            raise ValueError(f"'indices' must be an array, not a scalar '{indices}'.")
        indices = np.asarray(indices, dtype=np.int32)

        dtype = None
        if indices.size == 0:
            result = np.array([], dtype="object")
            dtype = self.dtype
        elif allow_fill:
            result = self._take_with_fill(indices, fill_value=fill_value)
        else:
            return self._take_without_fill(indices)

        return type(self)(
            result, fill_value=self.fill_value, kind=self.kind, dtype=dtype
        )

    def _take_with_fill(self, indices, fill_value=None) -> np.ndarray:
        if fill_value is None:
            fill_value = self.dtype.na_value

        if indices.min() < -1:
            raise ValueError(
                "Invalid value in 'indices'. Must be between -1 "
                "and the length of the array."
            )

        if indices.max() >= len(self):
            raise IndexError("out of bounds value in 'indices'.")

        if len(self) == 0:
            # Empty... Allow taking only if all empty
            if (indices == -1).all():
                dtype = np.result_type(self.sp_values, type(fill_value))
                taken = np.empty_like(indices, dtype=dtype)
                taken.fill(fill_value)
                return taken
            else:
                raise IndexError("cannot do a non-empty take from an empty axes.")

        # sp_indexer may be -1 for two reasons
        # 1.) we took for an index of -1 (new)
        # 2.) we took a value that was self.fill_value (old)
        sp_indexer = self.sp_index.lookup_array(indices)
        new_fill_indices = indices == -1
        old_fill_indices = (sp_indexer == -1) & ~new_fill_indices

        if self.sp_index.npoints == 0 and old_fill_indices.all():
            # We've looked up all valid points on an all-sparse array.
            taken = np.full(
                sp_indexer.shape, fill_value=self.fill_value, dtype=self.dtype.subtype
            )

        elif self.sp_index.npoints == 0:
            # Avoid taking from the empty self.sp_values
            _dtype = np.result_type(self.dtype.subtype, type(fill_value))
            taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype)
        else:
            taken = self.sp_values.take(sp_indexer)

            # Fill in two steps.
            # Old fill values
            # New fill values
            # potentially coercing to a new dtype at each stage.

            m0 = sp_indexer[old_fill_indices] < 0
            m1 = sp_indexer[new_fill_indices] < 0

            result_type = taken.dtype

            if m0.any():
                result_type = np.result_type(result_type, type(self.fill_value))
                taken = taken.astype(result_type)
                taken[old_fill_indices] = self.fill_value

            if m1.any():
                result_type = np.result_type(result_type, type(fill_value))
                taken = taken.astype(result_type)
                taken[new_fill_indices] = fill_value

        return taken
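
    # A hedged illustration (not part of the original module) of the two
    # meanings of -1 documented above:
    #
    #     arr = SparseArray([0, 1, 2], fill_value=0)
    #     arr.take([0, -1], allow_fill=True)
    #     # position 0 is an "old" fill (value 0) and -1 requests a "new"
    #     # fill (NaN by default) -> dense equivalent [0.0, nan]
    #     arr.take([0, -1], allow_fill=False)
    #     # -1 wraps around to the last element -> dense equivalent [0, 2]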


    def _take_without_fill(self: SparseArrayT, indices) -> SparseArrayT:
        to_shift = indices < 0

        n = len(self)

        if (indices.max() >= n) or (indices.min() < -n):
            if n == 0:
                raise IndexError("cannot do a non-empty take from an empty axes.")
            raise IndexError("out of bounds value in 'indices'.")

        if to_shift.any():
            indices = indices.copy()
            indices[to_shift] += n

        sp_indexer = self.sp_index.lookup_array(indices)
        value_mask = sp_indexer != -1
        new_sp_values = self.sp_values[sp_indexer[value_mask]]

        value_indices = np.flatnonzero(value_mask).astype(np.int32, copy=False)

        new_sp_index = make_sparse_index(len(indices), value_indices, kind=self.kind)
        return type(self)._simple_new(new_sp_values, new_sp_index, dtype=self.dtype)

    def searchsorted(
        self,
        v: ArrayLike | object,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter = None,
    ) -> npt.NDArray[np.intp] | np.intp:
        msg = "searchsorted requires high memory usage."
        warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())
        if not is_scalar(v):
            v = np.asarray(v)
        v = np.asarray(v)
        return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter)

    def copy(self: SparseArrayT) -> SparseArrayT:
        values = self.sp_values.copy()
        return self._simple_new(values, self.sp_index, self.dtype)

    @classmethod
    def _concat_same_type(
        cls: type[SparseArrayT], to_concat: Sequence[SparseArrayT]
    ) -> SparseArrayT:
        fill_value = to_concat[0].fill_value

        values = []
        length = 0

        if to_concat:
            sp_kind = to_concat[0].kind
        else:
            sp_kind = "integer"

        sp_index: SparseIndex
        if sp_kind == "integer":
            indices = []

            for arr in to_concat:
                int_idx = arr.sp_index.indices.copy()
                int_idx += length  # TODO: wraparound
                length += arr.sp_index.length

                values.append(arr.sp_values)
                indices.append(int_idx)

            data = np.concatenate(values)
            indices_arr = np.concatenate(indices)
            # error: Argument 2 to "IntIndex" has incompatible type
            # "ndarray[Any, dtype[signedinteger[_32Bit]]]";
            # expected "Sequence[int]"
            sp_index = IntIndex(length, indices_arr)  # type: ignore[arg-type]

        else:
            # when concatenating block indices, we don't claim that you'll
            # get an identical index as concatenating the values and then
            # creating a new index. We don't want to spend the time trying
            # to merge blocks across arrays in `to_concat`, so the resulting
            # BlockIndex may have more blocks.
            blengths = []
            blocs = []

            for arr in to_concat:
                block_idx = arr.sp_index.to_block_index()

                values.append(arr.sp_values)
                blocs.append(block_idx.blocs.copy() + length)
                blengths.append(block_idx.blengths)
                length += arr.sp_index.length

            data = np.concatenate(values)
            blocs_arr = np.concatenate(blocs)
            blengths_arr = np.concatenate(blengths)

            sp_index = BlockIndex(length, blocs_arr, blengths_arr)

        return cls(data, sparse_index=sp_index, fill_value=fill_value)
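
    # A hedged illustration (not part of the original module) of the
    # integer-kind branch above: each array's stored indices are shifted by
    # the running length before concatenation.
    #
    #     a = SparseArray([1, 0], fill_value=0)   # indices [0]
    #     b = SparseArray([0, 2], fill_value=0)   # indices [1]
    #     SparseArray._concat_same_type([a, b])
    #     # length 4, sp_values [1, 2], indices [0, 3];
    #     # dense equivalent [1, 0, 0, 2]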


    def astype(self, dtype: AstypeArg | None = None, copy: bool = True):
        """
        Change the dtype of a SparseArray.

        The output will always be a SparseArray. To convert to a dense
        ndarray with a certain dtype, use :meth:`numpy.asarray`.

        Parameters
        ----------
        dtype : np.dtype or ExtensionDtype
            For SparseDtype, this changes the dtype of
            ``self.sp_values`` and the ``self.fill_value``.

            For other dtypes, this only changes the dtype of
            ``self.sp_values``.

        copy : bool, default True
            Whether to ensure a copy is made, even if not necessary.

        Returns
        -------
        SparseArray

        Examples
        --------
        >>> arr = pd.arrays.SparseArray([0, 0, 1, 2])
        >>> arr
        [0, 0, 1, 2]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        >>> arr.astype(SparseDtype(np.dtype('int32')))
        [0, 0, 1, 2]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        Using a NumPy dtype with a different kind (e.g. float) will coerce
        just ``self.sp_values``.

        >>> arr.astype(SparseDtype(np.dtype('float64')))
        ... # doctest: +NORMALIZE_WHITESPACE
        [nan, nan, 1.0, 2.0]
        Fill: nan
        IntIndex
        Indices: array([2, 3], dtype=int32)

        Using a SparseDtype, you can also change the fill value as well.

        >>> arr.astype(SparseDtype("float64", fill_value=0.0))
        ... # doctest: +NORMALIZE_WHITESPACE
        [0.0, 0.0, 1.0, 2.0]
        Fill: 0.0
        IntIndex
        Indices: array([2, 3], dtype=int32)
        """
        if is_dtype_equal(dtype, self._dtype):
            if not copy:
                return self
            else:
                return self.copy()

        future_dtype = pandas_dtype(dtype)
        if not isinstance(future_dtype, SparseDtype):
            # GH#34457
            values = np.asarray(self)
            values = ensure_wrapped_if_datetimelike(values)
            return astype_array(values, dtype=future_dtype, copy=False)

        dtype = self.dtype.update_dtype(dtype)
        subtype = pandas_dtype(dtype._subtype_with_str)
        subtype = cast(np.dtype, subtype)  # ensured by update_dtype
        values = ensure_wrapped_if_datetimelike(self.sp_values)
        sp_values = astype_array(values, subtype, copy=copy)
        sp_values = np.asarray(sp_values)

        return self._simple_new(sp_values, self.sp_index, dtype)

    def map(self: SparseArrayT, mapper) -> SparseArrayT:
        """
        Map values using an input mapping or function.

        Parameters
        ----------
        mapper : dict, Series, callable
            The correspondence from old values to new.

        Returns
        -------
        SparseArray
            The output array will have the same density as the input.
            The output fill value will be the result of applying the
            mapping to ``self.fill_value``.

        Examples
        --------
        >>> arr = pd.arrays.SparseArray([0, 1, 2])
        >>> arr.map(lambda x: x + 10)
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)

        >>> arr.map({0: 10, 1: 11, 2: 12})
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)

        >>> arr.map(pd.Series([10, 11, 12], index=[0, 1, 2]))
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)
        """
        # this is used in apply.
        # We get hit since we're an "is_extension_array_dtype" but regular extension
        # types are not hit. This may be worth adding to the interface.
        if isinstance(mapper, ABCSeries):
            mapper = mapper.to_dict()

        if isinstance(mapper, abc.Mapping):
            fill_value = mapper.get(self.fill_value, self.fill_value)
            sp_values = [mapper.get(x, None) for x in self.sp_values]
        else:
            fill_value = mapper(self.fill_value)
            sp_values = [mapper(x) for x in self.sp_values]

        return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_value)

    def to_dense(self) -> np.ndarray:
        """
        Convert SparseArray to a NumPy array.

        Returns
        -------
        arr : NumPy array
        """
        return np.asarray(self, dtype=self.sp_values.dtype)

    def _where(self, mask, value):
        # NB: may not preserve dtype, e.g. result may be Sparse[float64]
        # while self is Sparse[int64]
        naive_implementation = np.where(mask, self, value)
        dtype = SparseDtype(naive_implementation.dtype, fill_value=self.fill_value)
        result = type(self)._from_sequence(naive_implementation, dtype=dtype)
        return result

    # ------------------------------------------------------------------------
    # IO
    # ------------------------------------------------------------------------
    def __setstate__(self, state) -> None:
        """Necessary for making this object picklable"""
        if isinstance(state, tuple):
            # Compat for pandas < 0.24.0
            nd_state, (fill_value, sp_index) = state
            sparse_values = np.array([])
            sparse_values.__setstate__(nd_state)

            self._sparse_values = sparse_values
            self._sparse_index = sp_index
            self._dtype = SparseDtype(sparse_values.dtype, fill_value)
        else:
            self.__dict__.update(state)

    def nonzero(self) -> tuple[npt.NDArray[np.int32]]:
        if self.fill_value == 0:
            return (self.sp_index.indices,)
        else:
            return (self.sp_index.indices[self.sp_values != 0],)

    # ------------------------------------------------------------------------
    # Reductions
    # ------------------------------------------------------------------------

    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
        method = getattr(self, name, None)

        if method is None:
            raise TypeError(f"cannot perform {name} with type {self.dtype}")

        if skipna:
            arr = self
        else:
            arr = self.dropna()

        return getattr(arr, name)(**kwargs)

    def all(self, axis=None, *args, **kwargs):
        """
        Tests whether all elements evaluate True

        Returns
        -------
        all : bool

        See Also
        --------
        numpy.all
        """
        nv.validate_all(args, kwargs)

        values = self.sp_values

        if len(values) != len(self) and not np.all(self.fill_value):
            return False

        return values.all()

    def any(self, axis: AxisInt = 0, *args, **kwargs):
        """
        Tests whether at least one element evaluates True

        Returns
        -------
        any : bool

        See Also
        --------
        numpy.any
        """
        nv.validate_any(args, kwargs)

        values = self.sp_values

        if len(values) != len(self) and np.any(self.fill_value):
            return True

        return values.any().item()

    def sum(
        self,
        axis: AxisInt = 0,
        min_count: int = 0,
        skipna: bool = True,
        *args,
        **kwargs,
    ) -> Scalar:
        """
        Sum of non-NA/null values

        Parameters
        ----------
        axis : int, default 0
            Not Used. NumPy compatibility.
        min_count : int, default 0
            The required number of valid values to perform the summation. If
            fewer than ``min_count`` valid values are present, the result
            will be the missing value indicator for the subarray type.
        *args, **kwargs
            Not Used. NumPy compatibility.

        Returns
        -------
        scalar
        """
        nv.validate_sum(args, kwargs)
        valid_vals = self._valid_sp_values
        sp_sum = valid_vals.sum()
        has_na = self.sp_index.ngaps > 0 and not self._null_fill_value

        if has_na and not skipna:
            return na_value_for_dtype(self.dtype.subtype, compat=False)

        if self._null_fill_value:
            if check_below_min_count(valid_vals.shape, None, min_count):
                return na_value_for_dtype(self.dtype.subtype, compat=False)
            return sp_sum
        else:
            nsparse = self.sp_index.ngaps
            if check_below_min_count(valid_vals.shape, None, min_count - nsparse):
                return na_value_for_dtype(self.dtype.subtype, compat=False)
            return sp_sum + self.fill_value * nsparse
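
    # A hedged illustration (not part of the original module) of the two
    # branches above:
    #
    #     SparseArray([1.0, np.nan, 2.0]).sum()              # 3.0
    #     SparseArray([1.0, np.nan, 2.0]).sum(min_count=3)   # nan (2 < min_count)
    #     SparseArray([1, 0, 2], fill_value=0).sum()         # 3 + 0 * ngaps == 3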


    def cumsum(self, axis: AxisInt = 0, *args, **kwargs) -> SparseArray:
        """
        Cumulative sum of non-NA/null values.

        When performing the cumulative summation, any NA/null values are
        skipped. The resulting SparseArray will preserve the locations of
        NaN values, but the fill value will be `np.nan` regardless.

        Parameters
        ----------
        axis : int or None
            Axis over which to perform the cumulative summation. If None,
            perform cumulative summation over flattened array.

        Returns
        -------
        cumsum : SparseArray
        """
        nv.validate_cumsum(args, kwargs)

        if axis is not None and axis >= self.ndim:  # Mimic ndarray behaviour.
            raise ValueError(f"axis(={axis}) out of bounds")

        if not self._null_fill_value:
            return SparseArray(self.to_dense()).cumsum()

        return SparseArray(
            self.sp_values.cumsum(),
            sparse_index=self.sp_index,
            fill_value=self.fill_value,
        )
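
    # A hedged illustration (not part of the original module) of the
    # null-fill branch above: only the stored values are summed, and the
    # NaN gaps keep their positions.
    #
    #     SparseArray([1.0, np.nan, 2.0]).cumsum()
    #     # sp_values [1.0, 2.0] -> cumsum [1.0, 3.0], same sparse index;
    #     # dense equivalent [1.0, nan, 3.0], fill nan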


    def mean(self, axis: Axis = 0, *args, **kwargs):
        """
        Mean of non-NA/null values

        Returns
        -------
        mean : float
        """
        nv.validate_mean(args, kwargs)
        valid_vals = self._valid_sp_values
        sp_sum = valid_vals.sum()
        ct = len(valid_vals)

        if self._null_fill_value:
            return sp_sum / ct
        else:
            nsparse = self.sp_index.ngaps
            return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)

    def max(self, *, axis: AxisInt | None = None, skipna: bool = True):
        """
        Max of array values, ignoring NA values if specified.

        Parameters
        ----------
        axis : int, default 0
            Not Used. NumPy compatibility.
        skipna : bool, default True
            Whether to ignore NA values.

        Returns
        -------
        scalar
        """
        nv.validate_minmax_axis(axis, self.ndim)
        return self._min_max("max", skipna=skipna)

    def min(self, *, axis: AxisInt | None = None, skipna: bool = True):
        """
        Min of array values, ignoring NA values if specified.

        Parameters
        ----------
        axis : int, default 0
            Not Used. NumPy compatibility.
        skipna : bool, default True
            Whether to ignore NA values.

        Returns
        -------
        scalar
        """
        nv.validate_minmax_axis(axis, self.ndim)
        return self._min_max("min", skipna=skipna)

    def _min_max(self, kind: Literal["min", "max"], skipna: bool) -> Scalar:
        """
        Min/max of non-NA/null values

        Parameters
        ----------
        kind : {"min", "max"}
        skipna : bool

        Returns
        -------
        scalar
        """
        valid_vals = self._valid_sp_values
        has_nonnull_fill_vals = not self._null_fill_value and self.sp_index.ngaps > 0

        if len(valid_vals) > 0:
            sp_min_max = getattr(valid_vals, kind)()

            # If a non-null fill value is currently present, it might be the min/max
            if has_nonnull_fill_vals:
                func = max if kind == "max" else min
                return func(sp_min_max, self.fill_value)
            elif skipna:
                return sp_min_max
            elif self.sp_index.ngaps == 0:
                # No NAs present
                return sp_min_max
            else:
                return na_value_for_dtype(self.dtype.subtype, compat=False)
        elif has_nonnull_fill_vals:
            return self.fill_value
        else:
            return na_value_for_dtype(self.dtype.subtype, compat=False)

    def _argmin_argmax(self, kind: Literal["argmin", "argmax"]) -> int:
        values = self._sparse_values
        index = self._sparse_index.indices
        mask = np.asarray(isna(values))
        func = np.argmax if kind == "argmax" else np.argmin

        idx = np.arange(values.shape[0])
        non_nans = values[~mask]
        non_nan_idx = idx[~mask]

        _candidate = non_nan_idx[func(non_nans)]
        candidate = index[_candidate]

        if isna(self.fill_value):
            return candidate
        if kind == "argmin" and self[candidate] < self.fill_value:
            return candidate
        if kind == "argmax" and self[candidate] > self.fill_value:
            return candidate
        _loc = self._first_fill_value_loc()
        if _loc == -1:
            # fill_value doesn't exist
            return candidate
        else:
            return _loc

    def argmax(self, skipna: bool = True) -> int:
        validate_bool_kwarg(skipna, "skipna")
        if not skipna and self._hasna:
            raise NotImplementedError
        return self._argmin_argmax("argmax")

    def argmin(self, skipna: bool = True) -> int:
        validate_bool_kwarg(skipna, "skipna")
        if not skipna and self._hasna:
            raise NotImplementedError
        return self._argmin_argmax("argmin")

    # ------------------------------------------------------------------------
    # Ufuncs
    # ------------------------------------------------------------------------

    _HANDLED_TYPES = (np.ndarray, numbers.Number)

    def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
        out = kwargs.get("out", ())

        for x in inputs + out:
            if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)):
                return NotImplemented

        # for binary ops, use our custom dunder methods
        result = ops.maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        if "out" in kwargs:
            # e.g. tests.arrays.sparse.test_arithmetics.test_ndarray_inplace
            res = arraylike.dispatch_ufunc_with_out(
                self, ufunc, method, *inputs, **kwargs
            )
            return res

        if method == "reduce":
            result = arraylike.dispatch_reduction_ufunc(
                self, ufunc, method, *inputs, **kwargs
            )
            if result is not NotImplemented:
                # e.g. tests.series.test_ufunc.TestNumpyReductions
                return result

        if len(inputs) == 1:
            # No alignment necessary.
            sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
            fill_value = getattr(ufunc, method)(self.fill_value, **kwargs)

            if ufunc.nout > 1:
                # multiple outputs. e.g. modf
                arrays = tuple(
                    self._simple_new(
                        sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv)
                    )
                    for sp_value, fv in zip(sp_values, fill_value)
                )
                return arrays
            elif method == "reduce":
                # e.g. reductions
                return sp_values

            return self._simple_new(
                sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value)
            )

        new_inputs = tuple(np.asarray(x) for x in inputs)
        result = getattr(ufunc, method)(*new_inputs, **kwargs)
        if out:
            if len(out) == 1:
                out = out[0]
            return out

        if ufunc.nout > 1:
            return tuple(type(self)(x) for x in result)
        elif method == "at":
            # no return value
            return None
        else:
            return type(self)(result)

    # ------------------------------------------------------------------------
    # Ops
    # ------------------------------------------------------------------------

    def _arith_method(self, other, op):
        op_name = op.__name__

        if isinstance(other, SparseArray):
            return _sparse_array_op(self, other, op, op_name)

        elif is_scalar(other):
            with np.errstate(all="ignore"):
                fill = op(_get_fill(self), np.asarray(other))
                result = op(self.sp_values, other)

            if op_name == "divmod":
                left, right = result
                lfill, rfill = fill
                return (
                    _wrap_result(op_name, left, self.sp_index, lfill),
                    _wrap_result(op_name, right, self.sp_index, rfill),
                )

            return _wrap_result(op_name, result, self.sp_index, fill)

        else:
            other = np.asarray(other)
            with np.errstate(all="ignore"):
                if len(self) != len(other):
                    raise AssertionError(
                        f"length mismatch: {len(self)} vs. {len(other)}"
                    )
                if not isinstance(other, SparseArray):
                    dtype = getattr(other, "dtype", None)
                    other = SparseArray(other, fill_value=self.fill_value, dtype=dtype)
                return _sparse_array_op(self, other, op, op_name)

    def _cmp_method(self, other, op) -> SparseArray:
        if not is_scalar(other) and not isinstance(other, type(self)):
            # convert list-like to ndarray
            other = np.asarray(other)

        if isinstance(other, np.ndarray):
            # TODO: make this more flexible than just ndarray...
            other = SparseArray(other, fill_value=self.fill_value)

        if isinstance(other, SparseArray):
            if len(self) != len(other):
                raise ValueError(
                    f"operands have mismatched length {len(self)} and {len(other)}"
                )

            op_name = op.__name__.strip("_")
            return _sparse_array_op(self, other, op, op_name)
        else:
            # scalar
            with np.errstate(all="ignore"):
                fill_value = op(self.fill_value, other)
                result = np.full(len(self), fill_value, dtype=np.bool_)
                result[self.sp_index.indices] = op(self.sp_values, other)

            return type(self)(
                result,
                fill_value=fill_value,
                dtype=np.bool_,
            )

    _logical_method = _cmp_method

    def _unary_method(self, op) -> SparseArray:
        fill_value = op(np.array(self.fill_value)).item()
        dtype = SparseDtype(self.dtype.subtype, fill_value)
        # NOTE: if fill_value doesn't change
        # we just have to apply op to sp_values
        if isna(self.fill_value) or fill_value == self.fill_value:
            values = op(self.sp_values)
            return type(self)._simple_new(values, self.sp_index, self.dtype)
        # In the other case we have to recalc indexes
        return type(self)(op(self.to_dense()), dtype=dtype)

    def __pos__(self) -> SparseArray:
        return self._unary_method(operator.pos)

    def __neg__(self) -> SparseArray:
        return self._unary_method(operator.neg)

    def __invert__(self) -> SparseArray:
        return self._unary_method(operator.invert)

    def __abs__(self) -> SparseArray:
        return self._unary_method(operator.abs)

    # ----------
    # Formatting
    # ----------
    def __repr__(self) -> str:
        pp_str = printing.pprint_thing(self)
        pp_fill = printing.pprint_thing(self.fill_value)
        pp_index = printing.pprint_thing(self.sp_index)
        return f"{pp_str}\nFill: {pp_fill}\n{pp_index}"

    def _formatter(self, boxed: bool = False):
        # Defer to the formatter from the GenericArrayFormatter calling us.
        # This will infer the correct formatter from the dtype of the values.
        return None


def _make_sparse(
    arr: np.ndarray,
    kind: SparseIndexKind = "block",
    fill_value=None,
    dtype: np.dtype | None = None,
):
    """
    Convert ndarray to sparse format

    Parameters
    ----------
    arr : ndarray
    kind : {'block', 'integer'}
    fill_value : NaN or another value
    dtype : np.dtype, optional

    Returns
    -------
    (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
    """
    assert isinstance(arr, np.ndarray)

    if arr.ndim > 1:
        raise TypeError("expected dimension <= 1 data")

    if fill_value is None:
        fill_value = na_value_for_dtype(arr.dtype)

    if isna(fill_value):
        mask = notna(arr)
    else:
        # cast to object comparison to be safe
        if is_string_dtype(arr.dtype):
            arr = arr.astype(object)

        if is_object_dtype(arr.dtype):
            # the element-wise equality check in numpy does not distinguish
            # element types, e.g. 0, 0.0, and False are treated as the same,
            # so we have to check both the type and the value.
            mask = splib.make_mask_object_ndarray(arr, fill_value)
        else:
            mask = arr != fill_value

    length = len(arr)
    if length != len(mask):
        # the arr is a SparseArray
        indices = mask.sp_index.indices
    else:
        indices = mask.nonzero()[0].astype(np.int32)

    index = make_sparse_index(length, indices, kind)
    sparsified_values = arr[mask]
    if dtype is not None:
        sparsified_values = ensure_wrapped_if_datetimelike(sparsified_values)
        sparsified_values = astype_array(sparsified_values, dtype=dtype)
        sparsified_values = np.asarray(sparsified_values)

    # TODO: copy
    return sparsified_values, index, fill_value
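
# A hedged illustration (not part of the original module):
#
#     _make_sparse(np.array([0, 1, 0, 2]), kind="integer", fill_value=0)
#     # mask [False, True, False, True] -> sparse values array([1, 2]),
#     # IntIndex(4, indices=[1, 3]), fill_value 0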



@overload
def make_sparse_index(length: int, indices, kind: Literal["block"]) -> BlockIndex:
    ...


@overload
def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntIndex:
    ...


def make_sparse_index(length: int, indices, kind: SparseIndexKind) -> SparseIndex:
    index: SparseIndex
    if kind == "block":
        locs, lens = splib.get_blocks(indices)
        index = BlockIndex(length, locs, lens)
    elif kind == "integer":
        index = IntIndex(length, indices)
    else:  # pragma: no cover
        raise ValueError("must be block or integer type")
    return index
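
# A hedged illustration (not part of the original module) of the two index
# kinds:
#
#     idx = np.array([2, 3, 4, 8], dtype=np.int32)
#     make_sparse_index(10, idx, kind="block")
#     # BlockIndex: block starts [2, 8] with lengths [3, 1]
#     make_sparse_index(10, idx, kind="integer")
#     # IntIndex: the positions [2, 3, 4, 8] stored directly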