1"""
2SparseArray data structure
3"""
4from __future__ import annotations
5
6from collections import abc
7import numbers
8import operator
9from typing import (
10 TYPE_CHECKING,
11 Any,
12 Callable,
13 Literal,
14 cast,
15 overload,
16)
17import warnings
18
19import numpy as np
20
21from pandas._libs import lib
22import pandas._libs.sparse as splib
23from pandas._libs.sparse import (
24 BlockIndex,
25 IntIndex,
26 SparseIndex,
27)
28from pandas._libs.tslibs import NaT
29from pandas.compat.numpy import function as nv
30from pandas.errors import PerformanceWarning
31from pandas.util._decorators import doc
32from pandas.util._exceptions import find_stack_level
33from pandas.util._validators import (
34 validate_bool_kwarg,
35 validate_insert_loc,
36)
37
38from pandas.core.dtypes.astype import astype_array
39from pandas.core.dtypes.cast import (
40 construct_1d_arraylike_from_scalar,
41 find_common_type,
42 maybe_box_datetimelike,
43)
44from pandas.core.dtypes.common import (
45 is_bool_dtype,
46 is_integer,
47 is_list_like,
48 is_object_dtype,
49 is_scalar,
50 is_string_dtype,
51 pandas_dtype,
52)
53from pandas.core.dtypes.dtypes import (
54 DatetimeTZDtype,
55 SparseDtype,
56)
57from pandas.core.dtypes.generic import (
58 ABCIndex,
59 ABCSeries,
60)
61from pandas.core.dtypes.missing import (
62 isna,
63 na_value_for_dtype,
64 notna,
65)
66
67from pandas.core import arraylike
68import pandas.core.algorithms as algos
69from pandas.core.arraylike import OpsMixin
70from pandas.core.arrays import ExtensionArray
71from pandas.core.base import PandasObject
72import pandas.core.common as com
73from pandas.core.construction import (
74 ensure_wrapped_if_datetimelike,
75 extract_array,
76 sanitize_array,
77)
78from pandas.core.indexers import (
79 check_array_indexer,
80 unpack_tuple_and_ellipses,
81)
82from pandas.core.nanops import check_below_min_count
83
84from pandas.io.formats import printing
85
86# See https://github.com/python/typing/issues/684
87if TYPE_CHECKING:
88 from collections.abc import Sequence
89 from enum import Enum
90
91 class ellipsis(Enum):
92 Ellipsis = "..."
93
94 Ellipsis = ellipsis.Ellipsis
95
96 from scipy.sparse import spmatrix
97
98 from pandas._typing import (
99 FillnaOptions,
100 NumpySorter,
101 )
102
103 SparseIndexKind = Literal["integer", "block"]
104
105 from pandas._typing import (
106 ArrayLike,
107 AstypeArg,
108 Axis,
109 AxisInt,
110 Dtype,
111 NpDtype,
112 PositionalIndexer,
113 Scalar,
114 ScalarIndexer,
115 Self,
116 SequenceIndexer,
117 npt,
118 )
119
120 from pandas import Series
121
122else:
123 ellipsis = type(Ellipsis)
124
125
126# ----------------------------------------------------------------------------
127# Array
128
129_sparray_doc_kwargs = {"klass": "SparseArray"}
130
131
132def _get_fill(arr: SparseArray) -> np.ndarray:
133 """
134 Create a 0-dim ndarray containing the fill value
135
136 Parameters
137 ----------
138 arr : SparseArray
139
140 Returns
141 -------
142 fill_value : ndarray
143 0-dim ndarray with just the fill value.
144
145 Notes
146 -----
147 coerce fill_value to arr dtype if possible
148 int64 SparseArray can have NaN as fill_value if there is no missing
149 """
150 try:
151 return np.asarray(arr.fill_value, dtype=arr.dtype.subtype)
152 except ValueError:
153 return np.asarray(arr.fill_value)
154
155
156def _sparse_array_op(
157 left: SparseArray, right: SparseArray, op: Callable, name: str
158) -> SparseArray:
159 """
160 Perform a binary operation between two arrays.
161
162 Parameters
163 ----------
164 left : Union[SparseArray, ndarray]
165 right : Union[SparseArray, ndarray]
166 op : Callable
167 The binary operation to perform
168 name str
169 Name of the callable.
170
171 Returns
172 -------
173 SparseArray
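
    Examples
    --------
    An illustrative sketch using ``operator.add`` (the output shows the
    default integer sparse index):

    >>> import operator
    >>> left = SparseArray([0, 1, 0])
    >>> right = SparseArray([0, 0, 1])
    >>> _sparse_array_op(left, right, operator.add, "add")
    [0, 1, 1]
    Fill: 0
    IntIndex
    Indices: array([1, 2], dtype=int32)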
174 """
175 if name.startswith("__"):
176 # For lookups in _libs.sparse we need non-dunder op name
177 name = name[2:-2]
178
179 # dtype used to find corresponding sparse method
180 ltype = left.dtype.subtype
181 rtype = right.dtype.subtype
182
183 if ltype != rtype:
184 subtype = find_common_type([ltype, rtype])
185 ltype = SparseDtype(subtype, left.fill_value)
186 rtype = SparseDtype(subtype, right.fill_value)
187
188 left = left.astype(ltype, copy=False)
189 right = right.astype(rtype, copy=False)
190 dtype = ltype.subtype
191 else:
192 dtype = ltype
193
194 # dtype the result must have
195 result_dtype = None
196
197 if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
198 with np.errstate(all="ignore"):
199 result = op(left.to_dense(), right.to_dense())
200 fill = op(_get_fill(left), _get_fill(right))
201
202 if left.sp_index.ngaps == 0:
203 index = left.sp_index
204 else:
205 index = right.sp_index
206 elif left.sp_index.equals(right.sp_index):
207 with np.errstate(all="ignore"):
208 result = op(left.sp_values, right.sp_values)
209 fill = op(_get_fill(left), _get_fill(right))
210 index = left.sp_index
211 else:
212 if name[0] == "r":
213 left, right = right, left
214 name = name[1:]
215
216 if name in ("and", "or", "xor") and dtype == "bool":
217 opname = f"sparse_{name}_uint8"
218 # to make template simple, cast here
219 left_sp_values = left.sp_values.view(np.uint8)
220 right_sp_values = right.sp_values.view(np.uint8)
221 result_dtype = bool
222 else:
223 opname = f"sparse_{name}_{dtype}"
224 left_sp_values = left.sp_values
225 right_sp_values = right.sp_values
226
227 if (
228 name in ["floordiv", "mod"]
229 and (right == 0).any()
230 and left.dtype.kind in "iu"
231 ):
232 # Match the non-Sparse Series behavior
233 opname = f"sparse_{name}_float64"
234 left_sp_values = left_sp_values.astype("float64")
235 right_sp_values = right_sp_values.astype("float64")
236
237 sparse_op = getattr(splib, opname)
238
239 with np.errstate(all="ignore"):
240 result, index, fill = sparse_op(
241 left_sp_values,
242 left.sp_index,
243 left.fill_value,
244 right_sp_values,
245 right.sp_index,
246 right.fill_value,
247 )
248
249 if name == "divmod":
250 # result is a 2-tuple
251 # error: Incompatible return value type (got "Tuple[SparseArray,
252 # SparseArray]", expected "SparseArray")
253 return ( # type: ignore[return-value]
254 _wrap_result(name, result[0], index, fill[0], dtype=result_dtype),
255 _wrap_result(name, result[1], index, fill[1], dtype=result_dtype),
256 )
257
258 if result_dtype is None:
259 result_dtype = result.dtype
260
261 return _wrap_result(name, result, index, fill, dtype=result_dtype)
262
263
264def _wrap_result(
265 name: str, data, sparse_index, fill_value, dtype: Dtype | None = None
266) -> SparseArray:
267 """
268 wrap op result to have correct dtype
269 """
270 if name.startswith("__"):
271 # e.g. __eq__ --> eq
272 name = name[2:-2]
273
274 if name in ("eq", "ne", "lt", "gt", "le", "ge"):
275 dtype = bool
276
277 fill_value = lib.item_from_zerodim(fill_value)
278
279 if is_bool_dtype(dtype):
280 # fill_value may be np.bool_
281 fill_value = bool(fill_value)
282 return SparseArray(
283 data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype
284 )
285
286
287class SparseArray(OpsMixin, PandasObject, ExtensionArray):
288 """
289 An ExtensionArray for storing sparse data.
290
291 Parameters
292 ----------
293 data : array-like or scalar
294 A dense array of values to store in the SparseArray. This may contain
295 `fill_value`.
296 sparse_index : SparseIndex, optional
297 fill_value : scalar, optional
298 Elements in data that are ``fill_value`` are not stored in the
299 SparseArray. For memory savings, this should be the most common value
300 in `data`. By default, `fill_value` depends on the dtype of `data`:
301
302 =========== ==========
303 data.dtype na_value
304 =========== ==========
305 float ``np.nan``
306 int ``0``
307 bool False
308 datetime64 ``pd.NaT``
309 timedelta64 ``pd.NaT``
310 =========== ==========
311
312 The fill value is potentially specified in three ways. In order of
313 precedence, these are
314
315 1. The `fill_value` argument
316 2. ``dtype.fill_value`` if `fill_value` is None and `dtype` is
317 a ``SparseDtype``
318 3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype`
319 is not a ``SparseDtype`` and `data` is a ``SparseArray``.
320
321 kind : str
322 Can be 'integer' or 'block', default is 'integer'.
323 The type of storage for sparse locations.
324
325 * 'block': Stores a `block` and `block_length` for each
326 contiguous *span* of sparse values. This is best when
327 sparse data tends to be clumped together, with large
328 regions of ``fill-value`` values between sparse values.
329 * 'integer': uses an integer to store the location of
330 each sparse value.
331
332 dtype : np.dtype or SparseDtype, optional
333 The dtype to use for the SparseArray. For numpy dtypes, this
334 determines the dtype of ``self.sp_values``. For SparseDtype,
335 this determines ``self.sp_values`` and ``self.fill_value``.
336 copy : bool, default False
337 Whether to explicitly copy the incoming `data` array.
338
339 Attributes
340 ----------
341 None
342
343 Methods
344 -------
345 None
346
347 Examples
348 --------
349 >>> from pandas.arrays import SparseArray
350 >>> arr = SparseArray([0, 0, 1, 2])
351 >>> arr
352 [0, 0, 1, 2]
353 Fill: 0
354 IntIndex
355 Indices: array([2, 3], dtype=int32)
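
    An explicit ``fill_value`` takes precedence over the dtype's default
    (an illustrative sketch; the default for float data would be ``np.nan``):

    >>> SparseArray([0.0, 1.0, 2.0], fill_value=0.0)
    [0.0, 1.0, 2.0]
    Fill: 0.0
    IntIndex
    Indices: array([1, 2], dtype=int32)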
356 """
357
358 _subtyp = "sparse_array" # register ABCSparseArray
359 _hidden_attrs = PandasObject._hidden_attrs | frozenset([])
360 _sparse_index: SparseIndex
361 _sparse_values: np.ndarray
362 _dtype: SparseDtype
363
364 def __init__(
365 self,
366 data,
367 sparse_index=None,
368 fill_value=None,
369 kind: SparseIndexKind = "integer",
370 dtype: Dtype | None = None,
371 copy: bool = False,
372 ) -> None:
373 if fill_value is None and isinstance(dtype, SparseDtype):
374 fill_value = dtype.fill_value
375
376 if isinstance(data, type(self)):
377 # disable normal inference on dtype, sparse_index, & fill_value
378 if sparse_index is None:
379 sparse_index = data.sp_index
380 if fill_value is None:
381 fill_value = data.fill_value
382 if dtype is None:
383 dtype = data.dtype
384 # TODO: make kind=None, and use data.kind?
385 data = data.sp_values
386
387 # Handle use-provided dtype
388 if isinstance(dtype, str):
389 # Two options: dtype='int', regular numpy dtype
390 # or dtype='Sparse[int]', a sparse dtype
391 try:
392 dtype = SparseDtype.construct_from_string(dtype)
393 except TypeError:
394 dtype = pandas_dtype(dtype)
395
396 if isinstance(dtype, SparseDtype):
397 if fill_value is None:
398 fill_value = dtype.fill_value
399 dtype = dtype.subtype
400
401 if is_scalar(data):
402 warnings.warn(
403 f"Constructing {type(self).__name__} with scalar data is deprecated "
404 "and will raise in a future version. Pass a sequence instead.",
405 FutureWarning,
406 stacklevel=find_stack_level(),
407 )
408 if sparse_index is None:
409 npoints = 1
410 else:
411 npoints = sparse_index.length
412
413 data = construct_1d_arraylike_from_scalar(data, npoints, dtype=None)
414 dtype = data.dtype
415
416 if dtype is not None:
417 dtype = pandas_dtype(dtype)
418
419 # TODO: disentangle the fill_value dtype inference from
420 # dtype inference
421 if data is None:
422 # TODO: What should the empty dtype be? Object or float?
423
424 # error: Argument "dtype" to "array" has incompatible type
425 # "Union[ExtensionDtype, dtype[Any], None]"; expected "Union[dtype[Any],
426 # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any,
427 # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
428 data = np.array([], dtype=dtype) # type: ignore[arg-type]
429
430 try:
431 data = sanitize_array(data, index=None)
432 except ValueError:
433 # NumPy may raise a ValueError on data like [1, []]
434 # we retry with object dtype here.
435 if dtype is None:
436 dtype = np.dtype(object)
437 data = np.atleast_1d(np.asarray(data, dtype=dtype))
438 else:
439 raise
440
441 if copy:
442 # TODO: avoid double copy when dtype forces cast.
443 data = data.copy()
444
445 if fill_value is None:
446 fill_value_dtype = data.dtype if dtype is None else dtype
447 if fill_value_dtype is None:
448 fill_value = np.nan
449 else:
450 fill_value = na_value_for_dtype(fill_value_dtype)
451
452 if isinstance(data, type(self)) and sparse_index is None:
453 sparse_index = data._sparse_index
454 # error: Argument "dtype" to "asarray" has incompatible type
455 # "Union[ExtensionDtype, dtype[Any], None]"; expected "None"
456 sparse_values = np.asarray(
457 data.sp_values, dtype=dtype # type: ignore[arg-type]
458 )
459 elif sparse_index is None:
460 data = extract_array(data, extract_numpy=True)
461 if not isinstance(data, np.ndarray):
462 # EA
463 if isinstance(data.dtype, DatetimeTZDtype):
464 warnings.warn(
465 f"Creating SparseArray from {data.dtype} data "
466 "loses timezone information. Cast to object before "
467 "sparse to retain timezone information.",
468 UserWarning,
469 stacklevel=find_stack_level(),
470 )
471 data = np.asarray(data, dtype="datetime64[ns]")
472 if fill_value is NaT:
473 fill_value = np.datetime64("NaT", "ns")
474 data = np.asarray(data)
475 sparse_values, sparse_index, fill_value = _make_sparse(
476 # error: Argument "dtype" to "_make_sparse" has incompatible type
477 # "Union[ExtensionDtype, dtype[Any], None]"; expected
478 # "Optional[dtype[Any]]"
479 data,
480 kind=kind,
481 fill_value=fill_value,
482 dtype=dtype, # type: ignore[arg-type]
483 )
484 else:
485 # error: Argument "dtype" to "asarray" has incompatible type
486 # "Union[ExtensionDtype, dtype[Any], None]"; expected "None"
487 sparse_values = np.asarray(data, dtype=dtype) # type: ignore[arg-type]
488 if len(sparse_values) != sparse_index.npoints:
489 raise AssertionError(
490 f"Non array-like type {type(sparse_values)} must "
491 "have the same length as the index"
492 )
493 self._sparse_index = sparse_index
494 self._sparse_values = sparse_values
495 self._dtype = SparseDtype(sparse_values.dtype, fill_value)
496
497 @classmethod
498 def _simple_new(
499 cls,
500 sparse_array: np.ndarray,
501 sparse_index: SparseIndex,
502 dtype: SparseDtype,
503 ) -> Self:
504 new = object.__new__(cls)
505 new._sparse_index = sparse_index
506 new._sparse_values = sparse_array
507 new._dtype = dtype
508 return new
509
510 @classmethod
511 def from_spmatrix(cls, data: spmatrix) -> Self:
512 """
513 Create a SparseArray from a scipy.sparse matrix.
514
515 Parameters
516 ----------
517 data : scipy.sparse.sp_matrix
518 This should be a SciPy sparse matrix where the size
519 of the second dimension is 1. In other words, a
520 sparse matrix with a single column.
521
522 Returns
523 -------
524 SparseArray
525
526 Examples
527 --------
528 >>> import scipy.sparse
529 >>> mat = scipy.sparse.coo_matrix((4, 1))
530 >>> pd.arrays.SparseArray.from_spmatrix(mat)
531 [0.0, 0.0, 0.0, 0.0]
532 Fill: 0.0
533 IntIndex
534 Indices: array([], dtype=int32)
535 """
536 length, ncol = data.shape
537
538 if ncol != 1:
539 raise ValueError(f"'data' must have a single column, not '{ncol}'")
540
541 # our sparse index classes require that the positions be strictly
542 # increasing. So we need to sort loc, and arr accordingly.
543 data = data.tocsc()
544 data.sort_indices()
545 arr = data.data
546 idx = data.indices
547
548 zero = np.array(0, dtype=arr.dtype).item()
549 dtype = SparseDtype(arr.dtype, zero)
550 index = IntIndex(length, idx)
551
552 return cls._simple_new(arr, index, dtype)
553
554 def __array__(
555 self, dtype: NpDtype | None = None, copy: bool | None = None
556 ) -> np.ndarray:
557 fill_value = self.fill_value
558
559 if self.sp_index.ngaps == 0:
560 # Compat for na dtype and int values.
561 return self.sp_values
562 if dtype is None:
563 # Can NumPy represent this type?
564 # If not, `np.result_type` will raise. We catch that
565 # and return object.
566 if self.sp_values.dtype.kind == "M":
567 # However, we *do* special-case the common case of
568 # a datetime64 with pandas NaT.
569 if fill_value is NaT:
570 # Can't put pd.NaT in a datetime64[ns]
571 fill_value = np.datetime64("NaT")
572 try:
573 dtype = np.result_type(self.sp_values.dtype, type(fill_value))
574 except TypeError:
575 dtype = object
576
577 out = np.full(self.shape, fill_value, dtype=dtype)
578 out[self.sp_index.indices] = self.sp_values
579 return out
580
581 def __setitem__(self, key, value) -> None:
582 # I suppose we could allow setting of non-fill_value elements.
583 # TODO(SparseArray.__setitem__): remove special cases in
584 # ExtensionBlock.where
585 msg = "SparseArray does not support item assignment via setitem"
586 raise TypeError(msg)
587
588 @classmethod
589 def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
590 return cls(scalars, dtype=dtype)
591
592 @classmethod
593 def _from_factorized(cls, values, original):
594 return cls(values, dtype=original.dtype)
595
596 # ------------------------------------------------------------------------
597 # Data
598 # ------------------------------------------------------------------------
599 @property
600 def sp_index(self) -> SparseIndex:
601 """
602 The SparseIndex containing the location of non- ``fill_value`` points.
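
        Examples
        --------
        Illustrative, using the default ``'integer'`` kind:

        >>> from pandas.arrays import SparseArray
        >>> SparseArray([0, 0, 1, 2]).sp_index
        IntIndex
        Indices: array([2, 3], dtype=int32)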
603 """
604 return self._sparse_index
605
606 @property
607 def sp_values(self) -> np.ndarray:
608 """
609 An ndarray containing the non- ``fill_value`` values.
610
611 Examples
612 --------
613 >>> from pandas.arrays import SparseArray
614 >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0)
615 >>> s.sp_values
616 array([1, 2])
617 """
618 return self._sparse_values
619
620 @property
621 def dtype(self) -> SparseDtype:
622 return self._dtype
623
624 @property
625 def fill_value(self):
626 """
627 Elements in `data` that are `fill_value` are not stored.
628
629 For memory savings, this should be the most common value in the array.
630
631 Examples
632 --------
633 >>> ser = pd.Series([0, 0, 2, 2, 2], dtype="Sparse[int]")
634 >>> ser.sparse.fill_value
635 0
636 >>> spa_dtype = pd.SparseDtype(dtype=np.int32, fill_value=2)
637 >>> ser = pd.Series([0, 0, 2, 2, 2], dtype=spa_dtype)
638 >>> ser.sparse.fill_value
639 2
640 """
641 return self.dtype.fill_value
642
643 @fill_value.setter
644 def fill_value(self, value) -> None:
645 self._dtype = SparseDtype(self.dtype.subtype, value)
646
647 @property
648 def kind(self) -> SparseIndexKind:
649 """
650 The kind of sparse index for this array. One of {'integer', 'block'}.
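
        Examples
        --------
        Illustrative:

        >>> from pandas.arrays import SparseArray
        >>> SparseArray([0, 1], kind="integer").kind
        'integer'
        >>> SparseArray([0, 1], kind="block").kind
        'block'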
651 """
652 if isinstance(self.sp_index, IntIndex):
653 return "integer"
654 else:
655 return "block"
656
657 @property
658 def _valid_sp_values(self) -> np.ndarray:
659 sp_vals = self.sp_values
660 mask = notna(sp_vals)
661 return sp_vals[mask]
662
663 def __len__(self) -> int:
664 return self.sp_index.length
665
666 @property
667 def _null_fill_value(self) -> bool:
668 return self._dtype._is_na_fill_value
669
670 def _fill_value_matches(self, fill_value) -> bool:
671 if self._null_fill_value:
672 return isna(fill_value)
673 else:
674 return self.fill_value == fill_value
675
676 @property
677 def nbytes(self) -> int:
678 return self.sp_values.nbytes + self.sp_index.nbytes
679
680 @property
681 def density(self) -> float:
682 """
683 The percent of non- ``fill_value`` points, as decimal.
684
685 Examples
686 --------
687 >>> from pandas.arrays import SparseArray
688 >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
689 >>> s.density
690 0.6
691 """
692 return self.sp_index.npoints / self.sp_index.length
693
694 @property
695 def npoints(self) -> int:
696 """
697 The number of non- ``fill_value`` points.
698
699 Examples
700 --------
701 >>> from pandas.arrays import SparseArray
702 >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
703 >>> s.npoints
704 3
705 """
706 return self.sp_index.npoints
707
708 # error: Return type "SparseArray" of "isna" incompatible with return type
709 # "ndarray[Any, Any] | ExtensionArraySupportsAnyAll" in supertype "ExtensionArray"
710 def isna(self) -> Self: # type: ignore[override]
711 # If null fill value, we want SparseDtype[bool, true]
712 # to preserve the same memory usage.
713 dtype = SparseDtype(bool, self._null_fill_value)
714 if self._null_fill_value:
715 return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
716 mask = np.full(len(self), False, dtype=np.bool_)
717 mask[self.sp_index.indices] = isna(self.sp_values)
718 return type(self)(mask, fill_value=False, dtype=dtype)
719
720 def _pad_or_backfill( # pylint: disable=useless-parent-delegation
721 self,
722 *,
723 method: FillnaOptions,
724 limit: int | None = None,
725 limit_area: Literal["inside", "outside"] | None = None,
726 copy: bool = True,
727 ) -> Self:
728 # TODO(3.0): We can remove this method once deprecation for fillna method
729 # keyword is enforced.
730 return super()._pad_or_backfill(
731 method=method, limit=limit, limit_area=limit_area, copy=copy
732 )
733
734 def fillna(
735 self,
736 value=None,
737 method: FillnaOptions | None = None,
738 limit: int | None = None,
739 copy: bool = True,
740 ) -> Self:
741 """
742 Fill missing values with `value`.
743
744 Parameters
745 ----------
746 value : scalar, optional
747 method : str, optional
748
749 .. warning::
750
751 Using 'method' will result in high memory use,
752 as all `fill_value` methods will be converted to
753 an in-memory ndarray
754
755 limit : int, optional
756
757 copy: bool, default True
758 Ignored for SparseArray.
759
760 Returns
761 -------
762 SparseArray
763
764 Notes
765 -----
766 When `value` is specified, the result's ``fill_value`` depends on
767 ``self.fill_value``. The goal is to maintain low-memory use.
768
769 If ``self.fill_value`` is NA, the result dtype will be
770 ``SparseDtype(self.dtype, fill_value=value)``. This will preserve
771 amount of memory used before and after filling.
772
773 When ``self.fill_value`` is not NA, the result dtype will be
774 ``self.dtype``. Again, this preserves the amount of memory used.
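
        Examples
        --------
        An illustrative sketch of the dtype behavior described above, with
        an NA ``fill_value`` replaced by the fill value ``0.0``:

        >>> arr = pd.arrays.SparseArray([np.nan, 1.0])
        >>> arr.fillna(0.0)
        [0.0, 1.0]
        Fill: 0.0
        IntIndex
        Indices: array([1], dtype=int32)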
775 """
776 if (method is None and value is None) or (
777 method is not None and value is not None
778 ):
779 raise ValueError("Must specify one of 'method' or 'value'.")
780
781 if method is not None:
782 return super().fillna(method=method, limit=limit)
783
784 else:
785 new_values = np.where(isna(self.sp_values), value, self.sp_values)
786
787 if self._null_fill_value:
788 # This is essentially just updating the dtype.
789 new_dtype = SparseDtype(self.dtype.subtype, fill_value=value)
790 else:
791 new_dtype = self.dtype
792
793 return self._simple_new(new_values, self._sparse_index, new_dtype)
794
795 def shift(self, periods: int = 1, fill_value=None) -> Self:
796 if not len(self) or periods == 0:
797 return self.copy()
798
799 if isna(fill_value):
800 fill_value = self.dtype.na_value
801
802 subtype = np.result_type(fill_value, self.dtype.subtype)
803
804 if subtype != self.dtype.subtype:
805 # just coerce up front
806 arr = self.astype(SparseDtype(subtype, self.fill_value))
807 else:
808 arr = self
809
810 empty = self._from_sequence(
811 [fill_value] * min(abs(periods), len(self)), dtype=arr.dtype
812 )
813
814 if periods > 0:
815 a = empty
816 b = arr[:-periods]
817 else:
818 a = arr[abs(periods) :]
819 b = empty
820 return arr._concat_same_type([a, b])
821
822 def _first_fill_value_loc(self):
823 """
824 Get the location of the first fill value.
825
826 Returns
827 -------
828 int
829 """
830 if len(self) == 0 or self.sp_index.npoints == len(self):
831 return -1
832
833 indices = self.sp_index.indices
834 if not len(indices) or indices[0] > 0:
835 return 0
836
837 # a number larger than 1 should be appended to
838 # the last in case of fill value only appears
839 # in the tail of array
840 diff = np.r_[np.diff(indices), 2]
841 return indices[(diff > 1).argmax()] + 1
842
843 @doc(ExtensionArray.duplicated)
844 def duplicated(
845 self, keep: Literal["first", "last", False] = "first"
846 ) -> npt.NDArray[np.bool_]:
847 values = np.asarray(self)
848 mask = np.asarray(self.isna())
849 return algos.duplicated(values, keep=keep, mask=mask)
850
851 def unique(self) -> Self:
852 uniques = algos.unique(self.sp_values)
853 if len(self.sp_values) != len(self):
854 fill_loc = self._first_fill_value_loc()
855 # Inorder to align the behavior of pd.unique or
856 # pd.Series.unique, we should keep the original
857 # order, here we use unique again to find the
858 # insertion place. Since the length of sp_values
859 # is not large, maybe minor performance hurt
860 # is worthwhile to the correctness.
861 insert_loc = len(algos.unique(self.sp_values[:fill_loc]))
862 uniques = np.insert(uniques, insert_loc, self.fill_value)
863 return type(self)._from_sequence(uniques, dtype=self.dtype)
864
865 def _values_for_factorize(self):
866 # Still override this for hash_pandas_object
867 return np.asarray(self), self.fill_value
868
869 def factorize(
870 self,
871 use_na_sentinel: bool = True,
872 ) -> tuple[np.ndarray, SparseArray]:
873 # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
874 # The sparsity on this is backwards from what Sparse would want. Want
875 # ExtensionArray.factorize -> Tuple[EA, EA]
876 # Given that we have to return a dense array of codes, why bother
877 # implementing an efficient factorize?
878 codes, uniques = algos.factorize(
879 np.asarray(self), use_na_sentinel=use_na_sentinel
880 )
881 uniques_sp = SparseArray(uniques, dtype=self.dtype)
882 return codes, uniques_sp
883
884 def value_counts(self, dropna: bool = True) -> Series:
885 """
886 Returns a Series containing counts of unique values.
887
888 Parameters
889 ----------
890 dropna : bool, default True
891 Don't include counts of NaN, even if NaN is in sp_values.
892
893 Returns
894 -------
895 counts : Series
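
        Examples
        --------
        Illustrative:

        >>> pd.arrays.SparseArray([0, 0, 1, 1, 1]).value_counts()
        0    2
        1    3
        dtype: int64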
896 """
897 from pandas import (
898 Index,
899 Series,
900 )
901
902 keys, counts, _ = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
903 fcounts = self.sp_index.ngaps
904 if fcounts > 0 and (not self._null_fill_value or not dropna):
905 mask = isna(keys) if self._null_fill_value else keys == self.fill_value
906 if mask.any():
907 counts[mask] += fcounts
908 else:
909 # error: Argument 1 to "insert" has incompatible type "Union[
910 # ExtensionArray,ndarray[Any, Any]]"; expected "Union[
911 # _SupportsArray[dtype[Any]], Sequence[_SupportsArray[dtype
912 # [Any]]], Sequence[Sequence[_SupportsArray[dtype[Any]]]],
913 # Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]], Sequence
914 # [Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]]]]"
915 keys = np.insert(keys, 0, self.fill_value) # type: ignore[arg-type]
916 counts = np.insert(counts, 0, fcounts)
917
918 if not isinstance(keys, ABCIndex):
919 index = Index(keys)
920 else:
921 index = keys
922 return Series(counts, index=index, copy=False)
923
924 # --------
925 # Indexing
926 # --------
927 @overload
928 def __getitem__(self, key: ScalarIndexer) -> Any:
929 ...
930
931 @overload
932 def __getitem__(
933 self,
934 key: SequenceIndexer | tuple[int | ellipsis, ...],
935 ) -> Self:
936 ...
937
938 def __getitem__(
939 self,
940 key: PositionalIndexer | tuple[int | ellipsis, ...],
941 ) -> Self | Any:
942 if isinstance(key, tuple):
943 key = unpack_tuple_and_ellipses(key)
944 if key is Ellipsis:
945 raise ValueError("Cannot slice with Ellipsis")
946
947 if is_integer(key):
948 return self._get_val_at(key)
949 elif isinstance(key, tuple):
950 # error: Invalid index type "Tuple[Union[int, ellipsis], ...]"
951 # for "ndarray[Any, Any]"; expected type
952 # "Union[SupportsIndex, _SupportsArray[dtype[Union[bool_,
953 # integer[Any]]]], _NestedSequence[_SupportsArray[dtype[
954 # Union[bool_, integer[Any]]]]], _NestedSequence[Union[
955 # bool, int]], Tuple[Union[SupportsIndex, _SupportsArray[
956 # dtype[Union[bool_, integer[Any]]]], _NestedSequence[
957 # _SupportsArray[dtype[Union[bool_, integer[Any]]]]],
958 # _NestedSequence[Union[bool, int]]], ...]]"
959 data_slice = self.to_dense()[key] # type: ignore[index]
960 elif isinstance(key, slice):
961 # Avoid densifying when handling contiguous slices
962 if key.step is None or key.step == 1:
963 start = 0 if key.start is None else key.start
964 if start < 0:
965 start += len(self)
966
967 end = len(self) if key.stop is None else key.stop
968 if end < 0:
969 end += len(self)
970
971 indices = self.sp_index.indices
972 keep_inds = np.flatnonzero((indices >= start) & (indices < end))
973 sp_vals = self.sp_values[keep_inds]
974
975 sp_index = indices[keep_inds].copy()
976
977 # If we've sliced to not include the start of the array, all our indices
978 # should be shifted. NB: here we are careful to also not shift by a
979 # negative value for a case like [0, 1][-100:] where the start index
980 # should be treated like 0
981 if start > 0:
982 sp_index -= start
983
984 # Length of our result should match applying this slice to a range
985 # of the length of our original array
986 new_len = len(range(len(self))[key])
987 new_sp_index = make_sparse_index(new_len, sp_index, self.kind)
988 return type(self)._simple_new(sp_vals, new_sp_index, self.dtype)
989 else:
990 indices = np.arange(len(self), dtype=np.int32)[key]
991 return self.take(indices)
992
993 elif not is_list_like(key):
994 # e.g. "foo" or 2.5
995 # exception message copied from numpy
996 raise IndexError(
997 r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
998 r"(`None`) and integer or boolean arrays are valid indices"
999 )
1000
1001 else:
1002 if isinstance(key, SparseArray):
1003 # NOTE: If we guarantee that SparseDType(bool)
1004 # has only fill_value - true, false or nan
1005 # (see GH PR 44955)
1006 # we can apply mask very fast:
1007 if is_bool_dtype(key):
1008 if isna(key.fill_value):
1009 return self.take(key.sp_index.indices[key.sp_values])
1010 if not key.fill_value:
1011 return self.take(key.sp_index.indices)
1012 n = len(self)
1013 mask = np.full(n, True, dtype=np.bool_)
1014 mask[key.sp_index.indices] = False
1015 return self.take(np.arange(n)[mask])
1016 else:
1017 key = np.asarray(key)
1018
1019 key = check_array_indexer(self, key)
1020
1021 if com.is_bool_indexer(key):
1022 # mypy doesn't know we have an array here
1023 key = cast(np.ndarray, key)
1024 return self.take(np.arange(len(key), dtype=np.int32)[key])
1025 elif hasattr(key, "__len__"):
1026 return self.take(key)
1027 else:
1028 raise ValueError(f"Cannot slice with '{key}'")
1029
1030 return type(self)(data_slice, kind=self.kind)
1031
1032 def _get_val_at(self, loc):
1033 loc = validate_insert_loc(loc, len(self))
1034
1035 sp_loc = self.sp_index.lookup(loc)
1036 if sp_loc == -1:
1037 return self.fill_value
1038 else:
1039 val = self.sp_values[sp_loc]
1040 val = maybe_box_datetimelike(val, self.sp_values.dtype)
1041 return val
1042
1043 def take(self, indices, *, allow_fill: bool = False, fill_value=None) -> Self:
1044 if is_scalar(indices):
1045 raise ValueError(f"'indices' must be an array, not a scalar '{indices}'.")
1046 indices = np.asarray(indices, dtype=np.int32)
1047
1048 dtype = None
1049 if indices.size == 0:
1050 result = np.array([], dtype="object")
1051 dtype = self.dtype
1052 elif allow_fill:
1053 result = self._take_with_fill(indices, fill_value=fill_value)
1054 else:
1055 return self._take_without_fill(indices)
1056
1057 return type(self)(
1058 result, fill_value=self.fill_value, kind=self.kind, dtype=dtype
1059 )
1060
1061 def _take_with_fill(self, indices, fill_value=None) -> np.ndarray:
1062 if fill_value is None:
1063 fill_value = self.dtype.na_value
1064
1065 if indices.min() < -1:
1066 raise ValueError(
1067 "Invalid value in 'indices'. Must be between -1 "
1068 "and the length of the array."
1069 )
1070
1071 if indices.max() >= len(self):
1072 raise IndexError("out of bounds value in 'indices'.")
1073
1074 if len(self) == 0:
1075 # Empty... Allow taking only if all empty
1076 if (indices == -1).all():
1077 dtype = np.result_type(self.sp_values, type(fill_value))
1078 taken = np.empty_like(indices, dtype=dtype)
1079 taken.fill(fill_value)
1080 return taken
1081 else:
1082 raise IndexError("cannot do a non-empty take from an empty axes.")
1083
1084 # sp_indexer may be -1 for two reasons
1085 # 1.) we took for an index of -1 (new)
1086 # 2.) we took a value that was self.fill_value (old)
1087 sp_indexer = self.sp_index.lookup_array(indices)
1088 new_fill_indices = indices == -1
1089 old_fill_indices = (sp_indexer == -1) & ~new_fill_indices
1090
1091 if self.sp_index.npoints == 0 and old_fill_indices.all():
1092 # We've looked up all valid points on an all-sparse array.
1093 taken = np.full(
1094 sp_indexer.shape, fill_value=self.fill_value, dtype=self.dtype.subtype
1095 )
1096
1097 elif self.sp_index.npoints == 0:
1098 # Use the old fill_value unless we took for an index of -1
1099 _dtype = np.result_type(self.dtype.subtype, type(fill_value))
1100 taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype)
1101 taken[old_fill_indices] = self.fill_value
1102 else:
1103 taken = self.sp_values.take(sp_indexer)
1104
1105 # Fill in two steps.
1106 # Old fill values
1107 # New fill values
1108 # potentially coercing to a new dtype at each stage.
1109
1110 m0 = sp_indexer[old_fill_indices] < 0
1111 m1 = sp_indexer[new_fill_indices] < 0
1112
1113 result_type = taken.dtype
1114
1115 if m0.any():
1116 result_type = np.result_type(result_type, type(self.fill_value))
1117 taken = taken.astype(result_type)
1118 taken[old_fill_indices] = self.fill_value
1119
1120 if m1.any():
1121 result_type = np.result_type(result_type, type(fill_value))
1122 taken = taken.astype(result_type)
1123 taken[new_fill_indices] = fill_value
1124
1125 return taken
1126
1127 def _take_without_fill(self, indices) -> Self:
1128 to_shift = indices < 0
1129
1130 n = len(self)
1131
1132 if (indices.max() >= n) or (indices.min() < -n):
1133 if n == 0:
1134 raise IndexError("cannot do a non-empty take from an empty axes.")
1135 raise IndexError("out of bounds value in 'indices'.")
1136
1137 if to_shift.any():
1138 indices = indices.copy()
1139 indices[to_shift] += n
1140
1141 sp_indexer = self.sp_index.lookup_array(indices)
1142 value_mask = sp_indexer != -1
1143 new_sp_values = self.sp_values[sp_indexer[value_mask]]
1144
1145 value_indices = np.flatnonzero(value_mask).astype(np.int32, copy=False)
1146
1147 new_sp_index = make_sparse_index(len(indices), value_indices, kind=self.kind)
1148 return type(self)._simple_new(new_sp_values, new_sp_index, dtype=self.dtype)
1149
1150 def searchsorted(
1151 self,
1152 v: ArrayLike | object,
1153 side: Literal["left", "right"] = "left",
1154 sorter: NumpySorter | None = None,
1155 ) -> npt.NDArray[np.intp] | np.intp:
1156 msg = "searchsorted requires high memory usage."
1157 warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())
1158 v = np.asarray(v)
1159 return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter)
1160
1161 def copy(self) -> Self:
1162 values = self.sp_values.copy()
1163 return self._simple_new(values, self.sp_index, self.dtype)
1164
1165 @classmethod
1166 def _concat_same_type(cls, to_concat: Sequence[Self]) -> Self:
1167 fill_value = to_concat[0].fill_value
1168
1169 values = []
1170 length = 0
1171
1172 if to_concat:
1173 sp_kind = to_concat[0].kind
1174 else:
1175 sp_kind = "integer"
1176
1177 sp_index: SparseIndex
1178 if sp_kind == "integer":
1179 indices = []
1180
1181 for arr in to_concat:
1182 int_idx = arr.sp_index.indices.copy()
1183 int_idx += length # TODO: wraparound
1184 length += arr.sp_index.length
1185
1186 values.append(arr.sp_values)
1187 indices.append(int_idx)
1188
1189 data = np.concatenate(values)
1190 indices_arr = np.concatenate(indices)
1191 # error: Argument 2 to "IntIndex" has incompatible type
1192 # "ndarray[Any, dtype[signedinteger[_32Bit]]]";
1193 # expected "Sequence[int]"
1194 sp_index = IntIndex(length, indices_arr) # type: ignore[arg-type]
1195
1196 else:
1197 # when concatenating block indices, we don't claim that you'll
1198 # get an identical index as concatenating the values and then
1199 # creating a new index. We don't want to spend the time trying
1200 # to merge blocks across arrays in `to_concat`, so the resulting
1201 # BlockIndex may have more blocks.
1202 blengths = []
1203 blocs = []
1204
1205 for arr in to_concat:
1206 block_idx = arr.sp_index.to_block_index()
1207
1208 values.append(arr.sp_values)
1209 blocs.append(block_idx.blocs.copy() + length)
1210 blengths.append(block_idx.blengths)
1211 length += arr.sp_index.length
1212
1213 data = np.concatenate(values)
1214 blocs_arr = np.concatenate(blocs)
1215 blengths_arr = np.concatenate(blengths)
1216
1217 sp_index = BlockIndex(length, blocs_arr, blengths_arr)
1218
1219 return cls(data, sparse_index=sp_index, fill_value=fill_value)
1220
1221 def astype(self, dtype: AstypeArg | None = None, copy: bool = True):
1222 """
1223 Change the dtype of a SparseArray.
1224
1225 The output will always be a SparseArray. To convert to a dense
1226 ndarray with a certain dtype, use :meth:`numpy.asarray`.
1227
1228 Parameters
1229 ----------
1230 dtype : np.dtype or ExtensionDtype
1231 For SparseDtype, this changes the dtype of
1232 ``self.sp_values`` and the ``self.fill_value``.
1233
1234 For other dtypes, this only changes the dtype of
1235 ``self.sp_values``.
1236
1237 copy : bool, default True
1238 Whether to ensure a copy is made, even if not necessary.
1239
1240 Returns
1241 -------
1242 SparseArray
1243
1244 Examples
1245 --------
1246 >>> arr = pd.arrays.SparseArray([0, 0, 1, 2])
1247 >>> arr
1248 [0, 0, 1, 2]
1249 Fill: 0
1250 IntIndex
1251 Indices: array([2, 3], dtype=int32)
1252
1253 >>> arr.astype(SparseDtype(np.dtype('int32')))
1254 [0, 0, 1, 2]
1255 Fill: 0
1256 IntIndex
1257 Indices: array([2, 3], dtype=int32)
1258
1259 Using a NumPy dtype with a different kind (e.g. float) will coerce
1260 just ``self.sp_values``.
1261
1262 >>> arr.astype(SparseDtype(np.dtype('float64')))
1263 ... # doctest: +NORMALIZE_WHITESPACE
1264 [nan, nan, 1.0, 2.0]
1265 Fill: nan
1266 IntIndex
1267 Indices: array([2, 3], dtype=int32)
1268
1269 Using a SparseDtype, you can also change the fill value as well.
1270
1271 >>> arr.astype(SparseDtype("float64", fill_value=0.0))
1272 ... # doctest: +NORMALIZE_WHITESPACE
1273 [0.0, 0.0, 1.0, 2.0]
1274 Fill: 0.0
1275 IntIndex
1276 Indices: array([2, 3], dtype=int32)
1277 """
1278 if dtype == self._dtype:
1279 if not copy:
1280 return self
1281 else:
1282 return self.copy()
1283
1284 future_dtype = pandas_dtype(dtype)
1285 if not isinstance(future_dtype, SparseDtype):
1286 # GH#34457
1287 values = np.asarray(self)
1288 values = ensure_wrapped_if_datetimelike(values)
1289 return astype_array(values, dtype=future_dtype, copy=False)
1290
1291 dtype = self.dtype.update_dtype(dtype)
1292 subtype = pandas_dtype(dtype._subtype_with_str)
1293 subtype = cast(np.dtype, subtype) # ensured by update_dtype
1294 values = ensure_wrapped_if_datetimelike(self.sp_values)
1295 sp_values = astype_array(values, subtype, copy=copy)
1296 sp_values = np.asarray(sp_values)
1297
1298 return self._simple_new(sp_values, self.sp_index, dtype)
1299
1300 def map(self, mapper, na_action=None) -> Self:
1301 """
1302 Map categories using an input mapping or function.
1303
1304 Parameters
1305 ----------
1306 mapper : dict, Series, callable
1307 The correspondence from old values to new.
1308 na_action : {None, 'ignore'}, default None
1309 If 'ignore', propagate NA values, without passing them to the
1310 mapping correspondence.
1311
1312 Returns
1313 -------
1314 SparseArray
1315 The output array will have the same density as the input.
1316 The output fill value will be the result of applying the
1317 mapping to ``self.fill_value``
1318
1319 Examples
1320 --------
1321 >>> arr = pd.arrays.SparseArray([0, 1, 2])
1322 >>> arr.map(lambda x: x + 10)
1323 [10, 11, 12]
1324 Fill: 10
1325 IntIndex
1326 Indices: array([1, 2], dtype=int32)
1327
1328 >>> arr.map({0: 10, 1: 11, 2: 12})
1329 [10, 11, 12]
1330 Fill: 10
1331 IntIndex
1332 Indices: array([1, 2], dtype=int32)
1333
1334 >>> arr.map(pd.Series([10, 11, 12], index=[0, 1, 2]))
1335 [10, 11, 12]
1336 Fill: 10
1337 IntIndex
1338 Indices: array([1, 2], dtype=int32)
1339 """
1340 is_map = isinstance(mapper, (abc.Mapping, ABCSeries))
1341
1342 fill_val = self.fill_value
1343
1344 if na_action is None or notna(fill_val):
1345 fill_val = mapper.get(fill_val, fill_val) if is_map else mapper(fill_val)
1346
1347 def func(sp_val):
1348 new_sp_val = mapper.get(sp_val, None) if is_map else mapper(sp_val)
1349 # check identity and equality because nans are not equal to each other
1350 if new_sp_val is fill_val or new_sp_val == fill_val:
1351 msg = "fill value in the sparse values not supported"
1352 raise ValueError(msg)
1353 return new_sp_val
1354
1355 sp_values = [func(x) for x in self.sp_values]
1356
1357 return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_val)
1358
1359 def to_dense(self) -> np.ndarray:
1360 """
1361 Convert SparseArray to a NumPy array.
1362
1363 Returns
1364 -------
1365 arr : NumPy array
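
        Examples
        --------
        Illustrative:

        >>> pd.arrays.SparseArray([0, 1, 0]).to_dense()
        array([0, 1, 0])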
1366 """
1367 return np.asarray(self, dtype=self.sp_values.dtype)
1368
1369 def _where(self, mask, value):
1370 # NB: may not preserve dtype, e.g. result may be Sparse[float64]
1371 # while self is Sparse[int64]
1372 naive_implementation = np.where(mask, self, value)
1373 dtype = SparseDtype(naive_implementation.dtype, fill_value=self.fill_value)
1374 result = type(self)._from_sequence(naive_implementation, dtype=dtype)
1375 return result
1376
1377 # ------------------------------------------------------------------------
1378 # IO
1379 # ------------------------------------------------------------------------
1380 def __setstate__(self, state) -> None:
1381 """Necessary for making this object picklable"""
1382 if isinstance(state, tuple):
1383 # Compat for pandas < 0.24.0
1384 nd_state, (fill_value, sp_index) = state
1385 sparse_values = np.array([])
1386 sparse_values.__setstate__(nd_state)
1387
1388 self._sparse_values = sparse_values
1389 self._sparse_index = sp_index
1390 self._dtype = SparseDtype(sparse_values.dtype, fill_value)
1391 else:
1392 self.__dict__.update(state)
1393
1394 def nonzero(self) -> tuple[npt.NDArray[np.int32]]:
1395 if self.fill_value == 0:
1396 return (self.sp_index.indices,)
1397 else:
1398 return (self.sp_index.indices[self.sp_values != 0],)
1399
1400 # ------------------------------------------------------------------------
1401 # Reductions
1402 # ------------------------------------------------------------------------
1403
1404 def _reduce(
1405 self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
1406 ):
1407 method = getattr(self, name, None)
1408
1409 if method is None:
1410 raise TypeError(f"cannot perform {name} with type {self.dtype}")
1411
1412 if skipna:
1413 arr = self
1414 else:
1415 arr = self.dropna()
1416
1417 result = getattr(arr, name)(**kwargs)
1418
1419 if keepdims:
1420 return type(self)([result], dtype=self.dtype)
1421 else:
1422 return result
1423
1424 def all(self, axis=None, *args, **kwargs):
1425 """
1426 Tests whether all elements evaluate True
1427
1428 Returns
1429 -------
1430 all : bool
1431
1432 See Also
1433 --------
1434 numpy.all
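
        Examples
        --------
        Illustrative:

        >>> pd.arrays.SparseArray([True, True]).all()
        True
        >>> pd.arrays.SparseArray([True, False]).all()
        False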
1435 """
1436 nv.validate_all(args, kwargs)
1437
1438 values = self.sp_values
1439
1440 if len(values) != len(self) and not np.all(self.fill_value):
1441 return False
1442
1443 return values.all()
1444
1445 def any(self, axis: AxisInt = 0, *args, **kwargs) -> bool:
1446 """
1447 Tests whether at least one of elements evaluate True
1448
1449 Returns
1450 -------
1451 any : bool
1452
1453 See Also
1454 --------
1455 numpy.any
1456 """
1457 nv.validate_any(args, kwargs)
1458
1459 values = self.sp_values
1460
1461 if len(values) != len(self) and np.any(self.fill_value):
1462 return True
1463
1464 return values.any().item()
1465
1466 def sum(
1467 self,
1468 axis: AxisInt = 0,
1469 min_count: int = 0,
1470 skipna: bool = True,
1471 *args,
1472 **kwargs,
1473 ) -> Scalar:
1474 """
1475 Sum of non-NA/null values
1476
1477 Parameters
1478 ----------
1479 axis : int, default 0
1480 Not Used. NumPy compatibility.
1481 min_count : int, default 0
1482 The required number of valid values to perform the summation. If fewer
1483 than ``min_count`` valid values are present, the result will be the missing
1484 value indicator for subarray type.
1485 *args, **kwargs
1486 Not Used. NumPy compatibility.
1487
1488 Returns
1489 -------
1490 scalar
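
        Examples
        --------
        Illustrative:

        >>> pd.arrays.SparseArray([0, 1, 2]).sum()
        3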
1491 """
1492 nv.validate_sum(args, kwargs)
1493 valid_vals = self._valid_sp_values
1494 sp_sum = valid_vals.sum()
1495 has_na = self.sp_index.ngaps > 0 and not self._null_fill_value
1496
1497 if has_na and not skipna:
1498 return na_value_for_dtype(self.dtype.subtype, compat=False)
1499
1500 if self._null_fill_value:
1501 if check_below_min_count(valid_vals.shape, None, min_count):
1502 return na_value_for_dtype(self.dtype.subtype, compat=False)
1503 return sp_sum
1504 else:
1505 nsparse = self.sp_index.ngaps
1506 if check_below_min_count(valid_vals.shape, None, min_count - nsparse):
1507 return na_value_for_dtype(self.dtype.subtype, compat=False)
1508 return sp_sum + self.fill_value * nsparse
1509
1510 def cumsum(self, axis: AxisInt = 0, *args, **kwargs) -> SparseArray:
1511 """
1512 Cumulative sum of non-NA/null values.
1513
1514 When performing the cumulative summation, any non-NA/null values will
1515 be skipped. The resulting SparseArray will preserve the locations of
1516 NaN values, but the fill value will be `np.nan` regardless.
1517
1518 Parameters
1519 ----------
1520 axis : int or None
1521 Axis over which to perform the cumulative summation. If None,
1522 perform cumulative summation over flattened array.
1523
1524 Returns
1525 -------
1526 cumsum : SparseArray
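
        Examples
        --------
        An illustrative sketch with a null fill value:

        >>> arr = pd.arrays.SparseArray([1.0, np.nan, 2.0])
        >>> arr.cumsum()
        [1.0, nan, 3.0]
        Fill: nan
        IntIndex
        Indices: array([0, 2], dtype=int32)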
1527 """
1528 nv.validate_cumsum(args, kwargs)
1529
1530 if axis is not None and axis >= self.ndim: # Mimic ndarray behaviour.
1531 raise ValueError(f"axis(={axis}) out of bounds")
1532
1533 if not self._null_fill_value:
1534 return SparseArray(self.to_dense()).cumsum()
1535
1536 return SparseArray(
1537 self.sp_values.cumsum(),
1538 sparse_index=self.sp_index,
1539 fill_value=self.fill_value,
1540 )
1541
1542 def mean(self, axis: Axis = 0, *args, **kwargs):
1543 """
1544 Mean of non-NA/null values
1545
1546 Returns
1547 -------
1548 mean : float
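
        Examples
        --------
        Illustrative:

        >>> pd.arrays.SparseArray([0, 1, 2]).mean()
        1.0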
1549 """
1550 nv.validate_mean(args, kwargs)
1551 valid_vals = self._valid_sp_values
1552 sp_sum = valid_vals.sum()
1553 ct = len(valid_vals)
1554
1555 if self._null_fill_value:
1556 return sp_sum / ct
1557 else:
1558 nsparse = self.sp_index.ngaps
1559 return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)
1560
1561 def max(self, *, axis: AxisInt | None = None, skipna: bool = True):
1562 """
1563 Max of array values, ignoring NA values if specified.
1564
1565 Parameters
1566 ----------
1567 axis : int, default 0
1568 Not Used. NumPy compatibility.
1569 skipna : bool, default True
1570 Whether to ignore NA values.
1571
1572 Returns
1573 -------
1574 scalar
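
        Examples
        --------
        Illustrative:

        >>> pd.arrays.SparseArray([0, 1, 2]).max()
        2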
1575 """
1576 nv.validate_minmax_axis(axis, self.ndim)
1577 return self._min_max("max", skipna=skipna)
1578
1579 def min(self, *, axis: AxisInt | None = None, skipna: bool = True):
1580 """
1581 Min of array values, ignoring NA values if specified.
1582
1583 Parameters
1584 ----------
1585 axis : int, default 0
1586 Not Used. NumPy compatibility.
1587 skipna : bool, default True
1588 Whether to ignore NA values.
1589
1590 Returns
1591 -------
1592 scalar
1593 """
1594 nv.validate_minmax_axis(axis, self.ndim)
1595 return self._min_max("min", skipna=skipna)
1596
1597 def _min_max(self, kind: Literal["min", "max"], skipna: bool) -> Scalar:
1598 """
1599 Min/max of non-NA/null values
1600
1601 Parameters
1602 ----------
1603 kind : {"min", "max"}
1604 skipna : bool
1605
1606 Returns
1607 -------
1608 scalar
1609 """
1610 valid_vals = self._valid_sp_values
1611 has_nonnull_fill_vals = not self._null_fill_value and self.sp_index.ngaps > 0
1612
1613 if len(valid_vals) > 0:
1614 sp_min_max = getattr(valid_vals, kind)()
1615
1616 # If a non-null fill value is currently present, it might be the min/max
1617 if has_nonnull_fill_vals:
1618 func = max if kind == "max" else min
1619 return func(sp_min_max, self.fill_value)
1620 elif skipna:
1621 return sp_min_max
1622 elif self.sp_index.ngaps == 0:
1623 # No NAs present
1624 return sp_min_max
1625 else:
1626 return na_value_for_dtype(self.dtype.subtype, compat=False)
1627 elif has_nonnull_fill_vals:
1628 return self.fill_value
1629 else:
1630 return na_value_for_dtype(self.dtype.subtype, compat=False)
1631
1632 def _argmin_argmax(self, kind: Literal["argmin", "argmax"]) -> int:
1633 values = self._sparse_values
1634 index = self._sparse_index.indices
1635 mask = np.asarray(isna(values))
1636 func = np.argmax if kind == "argmax" else np.argmin
1637
1638 idx = np.arange(values.shape[0])
1639 non_nans = values[~mask]
1640 non_nan_idx = idx[~mask]
1641
1642 _candidate = non_nan_idx[func(non_nans)]
1643 candidate = index[_candidate]
1644
1645 if isna(self.fill_value):
1646 return candidate
1647 if kind == "argmin" and self[candidate] < self.fill_value:
1648 return candidate
1649 if kind == "argmax" and self[candidate] > self.fill_value:
1650 return candidate
1651 _loc = self._first_fill_value_loc()
1652 if _loc == -1:
1653 # fill_value doesn't exist
1654 return candidate
1655 else:
1656 return _loc
1657
1658 def argmax(self, skipna: bool = True) -> int:
1659 validate_bool_kwarg(skipna, "skipna")
1660 if not skipna and self._hasna:
1661 raise NotImplementedError
1662 return self._argmin_argmax("argmax")
1663
1664 def argmin(self, skipna: bool = True) -> int:
1665 validate_bool_kwarg(skipna, "skipna")
1666 if not skipna and self._hasna:
1667 raise NotImplementedError
1668 return self._argmin_argmax("argmin")
1669
1670 # ------------------------------------------------------------------------
1671 # Ufuncs
1672 # ------------------------------------------------------------------------
1673
1674 _HANDLED_TYPES = (np.ndarray, numbers.Number)
1675
1676 def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
1677 out = kwargs.get("out", ())
1678
1679 for x in inputs + out:
1680 if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)):
1681 return NotImplemented
1682
1683 # for binary ops, use our custom dunder methods
1684 result = arraylike.maybe_dispatch_ufunc_to_dunder_op(
1685 self, ufunc, method, *inputs, **kwargs
1686 )
1687 if result is not NotImplemented:
1688 return result
1689
1690 if "out" in kwargs:
1691 # e.g. tests.arrays.sparse.test_arithmetics.test_ndarray_inplace
1692 res = arraylike.dispatch_ufunc_with_out(
1693 self, ufunc, method, *inputs, **kwargs
1694 )
1695 return res
1696
1697 if method == "reduce":
1698 result = arraylike.dispatch_reduction_ufunc(
1699 self, ufunc, method, *inputs, **kwargs
1700 )
1701 if result is not NotImplemented:
1702 # e.g. tests.series.test_ufunc.TestNumpyReductions
1703 return result
1704
1705 if len(inputs) == 1:
1706 # No alignment necessary.
1707 sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
1708 fill_value = getattr(ufunc, method)(self.fill_value, **kwargs)
1709
1710 if ufunc.nout > 1:
1711 # multiple outputs. e.g. modf
1712 arrays = tuple(
1713 self._simple_new(
1714 sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv)
1715 )
1716 for sp_value, fv in zip(sp_values, fill_value)
1717 )
1718 return arrays
1719 elif method == "reduce":
1720 # e.g. reductions
1721 return sp_values
1722
1723 return self._simple_new(
1724 sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value)
1725 )
1726
1727 new_inputs = tuple(np.asarray(x) for x in inputs)
1728 result = getattr(ufunc, method)(*new_inputs, **kwargs)
1729 if out:
1730 if len(out) == 1:
1731 out = out[0]
1732 return out
1733
1734 if ufunc.nout > 1:
1735 return tuple(type(self)(x) for x in result)
1736 elif method == "at":
1737 # no return value
1738 return None
1739 else:
1740 return type(self)(result)
1741
1742 # ------------------------------------------------------------------------
1743 # Ops
1744 # ------------------------------------------------------------------------
1745
1746 def _arith_method(self, other, op):
1747 op_name = op.__name__
1748
1749 if isinstance(other, SparseArray):
1750 return _sparse_array_op(self, other, op, op_name)
1751
1752 elif is_scalar(other):
1753 with np.errstate(all="ignore"):
1754 fill = op(_get_fill(self), np.asarray(other))
1755 result = op(self.sp_values, other)
1756
1757 if op_name == "divmod":
1758 left, right = result
1759 lfill, rfill = fill
1760 return (
1761 _wrap_result(op_name, left, self.sp_index, lfill),
1762 _wrap_result(op_name, right, self.sp_index, rfill),
1763 )
1764
1765 return _wrap_result(op_name, result, self.sp_index, fill)
1766
1767 else:
1768 other = np.asarray(other)
1769 with np.errstate(all="ignore"):
1770 if len(self) != len(other):
1771 raise AssertionError(
1772 f"length mismatch: {len(self)} vs. {len(other)}"
1773 )
1774 if not isinstance(other, SparseArray):
1775 dtype = getattr(other, "dtype", None)
1776 other = SparseArray(other, fill_value=self.fill_value, dtype=dtype)
1777 return _sparse_array_op(self, other, op, op_name)
1778
1779 def _cmp_method(self, other, op) -> SparseArray:
1780 if not is_scalar(other) and not isinstance(other, type(self)):
1781 # convert list-like to ndarray
1782 other = np.asarray(other)
1783
1784 if isinstance(other, np.ndarray):
1785 # TODO: make this more flexible than just ndarray...
1786 other = SparseArray(other, fill_value=self.fill_value)
1787
1788 if isinstance(other, SparseArray):
1789 if len(self) != len(other):
1790 raise ValueError(
1791 f"operands have mismatched length {len(self)} and {len(other)}"
1792 )
1793
1794 op_name = op.__name__.strip("_")
1795 return _sparse_array_op(self, other, op, op_name)
1796 else:
1797 # scalar
1798 fill_value = op(self.fill_value, other)
1799 result = np.full(len(self), fill_value, dtype=np.bool_)
1800 result[self.sp_index.indices] = op(self.sp_values, other)
1801
1802 return type(self)(
1803 result,
1804 fill_value=fill_value,
1805 dtype=np.bool_,
1806 )
1807
1808 _logical_method = _cmp_method
1809
1810 def _unary_method(self, op) -> SparseArray:
1811 fill_value = op(np.array(self.fill_value)).item()
1812 dtype = SparseDtype(self.dtype.subtype, fill_value)
1813 # NOTE: if fill_value doesn't change
1814 # we just have to apply op to sp_values
1815 if isna(self.fill_value) or fill_value == self.fill_value:
1816 values = op(self.sp_values)
1817 return type(self)._simple_new(values, self.sp_index, self.dtype)
1818 # In the other case we have to recalc indexes
1819 return type(self)(op(self.to_dense()), dtype=dtype)
1820
1821 def __pos__(self) -> SparseArray:
1822 return self._unary_method(operator.pos)
1823
1824 def __neg__(self) -> SparseArray:
1825 return self._unary_method(operator.neg)
1826
1827 def __invert__(self) -> SparseArray:
1828 return self._unary_method(operator.invert)
1829
1830 def __abs__(self) -> SparseArray:
1831 return self._unary_method(operator.abs)
1832
1833 # ----------
1834 # Formatting
1835 # -----------
1836 def __repr__(self) -> str:
1837 pp_str = printing.pprint_thing(self)
1838 pp_fill = printing.pprint_thing(self.fill_value)
1839 pp_index = printing.pprint_thing(self.sp_index)
1840 return f"{pp_str}\nFill: {pp_fill}\n{pp_index}"
1841
1842 def _formatter(self, boxed: bool = False):
1843 # Defer to the formatter from the GenericArrayFormatter calling us.
1844 # This will infer the correct formatter from the dtype of the values.
1845 return None
1846
1847
1848def _make_sparse(
1849 arr: np.ndarray,
1850 kind: SparseIndexKind = "block",
1851 fill_value=None,
1852 dtype: np.dtype | None = None,
1853):
1854 """
1855 Convert ndarray to sparse format
1856
1857 Parameters
1858 ----------
1859 arr : ndarray
1860 kind : {'block', 'integer'}
1861 fill_value : NaN or another value
1862 dtype : np.dtype, optional
1863 copy : bool, default False
1864
1865 Returns
1866 -------
1867 (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
1868 """
1869 assert isinstance(arr, np.ndarray)
1870
1871 if arr.ndim > 1:
1872 raise TypeError("expected dimension <= 1 data")
1873
1874 if fill_value is None:
1875 fill_value = na_value_for_dtype(arr.dtype)
1876
1877 if isna(fill_value):
1878 mask = notna(arr)
1879 else:
1880 # cast to object comparison to be safe
1881 if is_string_dtype(arr.dtype):
1882 arr = arr.astype(object)
1883
1884 if is_object_dtype(arr.dtype):
1885 # element-wise equality check method in numpy doesn't treat
1886 # each element type, eg. 0, 0.0, and False are treated as
1887 # same. So we have to check the both of its type and value.
1888 mask = splib.make_mask_object_ndarray(arr, fill_value)
1889 else:
1890 mask = arr != fill_value
1891
1892 length = len(arr)
1893 if length != len(mask):
1894 # the arr is a SparseArray
1895 indices = mask.sp_index.indices
1896 else:
1897 indices = mask.nonzero()[0].astype(np.int32)
1898
1899 index = make_sparse_index(length, indices, kind)
1900 sparsified_values = arr[mask]
1901 if dtype is not None:
1902 sparsified_values = ensure_wrapped_if_datetimelike(sparsified_values)
1903 sparsified_values = astype_array(sparsified_values, dtype=dtype)
1904 sparsified_values = np.asarray(sparsified_values)
1905
1906 # TODO: copy
1907 return sparsified_values, index, fill_value
1908
1909
1910@overload
1911def make_sparse_index(length: int, indices, kind: Literal["block"]) -> BlockIndex:
1912 ...
1913
1914
1915@overload
1916def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntIndex:
1917 ...
1918
1919
1920def make_sparse_index(length: int, indices, kind: SparseIndexKind) -> SparseIndex:
1921 index: SparseIndex
1922 if kind == "block":
1923 locs, lens = splib.get_blocks(indices)
1924 index = BlockIndex(length, locs, lens)
1925 elif kind == "integer":
1926 index = IntIndex(length, indices)
1927 else: # pragma: no cover
1928 raise ValueError("must be block or integer type")
1929 return index
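

# Illustrative usage of ``make_sparse_index`` (a comment-only sketch, not
# executed at import; the expected repr assumes the int32 indices shown):
#
#   >>> make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="integer")
#   IntIndex
#   Indices: array([2, 3], dtype=int32)
#
# Passing ``kind="block"`` instead returns a BlockIndex describing the same
# positions as contiguous (location, length) blocks.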