1"""
2SparseArray data structure
3"""
4from __future__ import annotations
5
6from collections import abc
7import numbers
8import operator
9from typing import (
10 TYPE_CHECKING,
11 Any,
12 Callable,
13 Literal,
14 Sequence,
15 TypeVar,
16 cast,
17 overload,
18)
19import warnings
20
21import numpy as np
22
23from pandas._libs import lib
24import pandas._libs.sparse as splib
25from pandas._libs.sparse import (
26 BlockIndex,
27 IntIndex,
28 SparseIndex,
29)
30from pandas._libs.tslibs import NaT
31from pandas._typing import (
32 ArrayLike,
33 AstypeArg,
34 Axis,
35 AxisInt,
36 Dtype,
37 NpDtype,
38 PositionalIndexer,
39 Scalar,
40 ScalarIndexer,
41 SequenceIndexer,
42 npt,
43)
44from pandas.compat.numpy import function as nv
45from pandas.errors import PerformanceWarning
46from pandas.util._exceptions import find_stack_level
47from pandas.util._validators import (
48 validate_bool_kwarg,
49 validate_insert_loc,
50)
51
52from pandas.core.dtypes.astype import astype_array
53from pandas.core.dtypes.cast import (
54 construct_1d_arraylike_from_scalar,
55 find_common_type,
56 maybe_box_datetimelike,
57)
58from pandas.core.dtypes.common import (
59 is_array_like,
60 is_bool_dtype,
61 is_datetime64_any_dtype,
62 is_datetime64tz_dtype,
63 is_dtype_equal,
64 is_integer,
65 is_list_like,
66 is_object_dtype,
67 is_scalar,
68 is_string_dtype,
69 pandas_dtype,
70)
71from pandas.core.dtypes.generic import (
72 ABCIndex,
73 ABCSeries,
74)
75from pandas.core.dtypes.missing import (
76 isna,
77 na_value_for_dtype,
78 notna,
79)
80
81from pandas.core import (
82 arraylike,
83 ops,
84)
85import pandas.core.algorithms as algos
86from pandas.core.arraylike import OpsMixin
87from pandas.core.arrays import ExtensionArray
88from pandas.core.arrays.sparse.dtype import SparseDtype
89from pandas.core.base import PandasObject
90import pandas.core.common as com
91from pandas.core.construction import (
92 ensure_wrapped_if_datetimelike,
93 extract_array,
94 sanitize_array,
95)
96from pandas.core.indexers import (
97 check_array_indexer,
98 unpack_tuple_and_ellipses,
99)
100from pandas.core.missing import interpolate_2d
101from pandas.core.nanops import check_below_min_count
102
103from pandas.io.formats import printing
104
# See https://github.com/python/typing/issues/684
if TYPE_CHECKING:
    from enum import Enum

    class ellipsis(Enum):
        Ellipsis = "..."

    Ellipsis = ellipsis.Ellipsis

    from scipy.sparse import spmatrix

    from pandas._typing import (
        FillnaOptions,
        NumpySorter,
    )

    SparseIndexKind = Literal["integer", "block"]

    from pandas import Series

else:
    ellipsis = type(Ellipsis)


# ----------------------------------------------------------------------------
# Array

SparseArrayT = TypeVar("SparseArrayT", bound="SparseArray")

_sparray_doc_kwargs = {"klass": "SparseArray"}


def _get_fill(arr: SparseArray) -> np.ndarray:
    """
    Create a 0-dim ndarray containing the fill value

    Parameters
    ----------
    arr : SparseArray

    Returns
    -------
    fill_value : ndarray
        0-dim ndarray with just the fill value.

    Notes
    -----
    coerce fill_value to arr dtype if possible
    int64 SparseArray can have NaN as fill_value if there are no missing values
    """
    try:
        return np.asarray(arr.fill_value, dtype=arr.dtype.subtype)
    except ValueError:
        return np.asarray(arr.fill_value)


def _sparse_array_op(
    left: SparseArray, right: SparseArray, op: Callable, name: str
) -> SparseArray:
    """
    Perform a binary operation between two sparse arrays.

    Parameters
    ----------
    left : SparseArray
    right : SparseArray
    op : Callable
        The binary operation to perform
    name : str
        Name of the callable.

    Returns
    -------
    SparseArray
    """
    if name.startswith("__"):
        # For lookups in _libs.sparse we need non-dunder op name
        name = name[2:-2]

    # dtype used to find corresponding sparse method
    ltype = left.dtype.subtype
    rtype = right.dtype.subtype

    if not is_dtype_equal(ltype, rtype):
        subtype = find_common_type([ltype, rtype])
        ltype = SparseDtype(subtype, left.fill_value)
        rtype = SparseDtype(subtype, right.fill_value)

        left = left.astype(ltype, copy=False)
        right = right.astype(rtype, copy=False)
        dtype = ltype.subtype
    else:
        dtype = ltype

    # dtype the result must have
    result_dtype = None

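    # Three cases, cheapest first:
    # 1. one side has no gaps (fully dense): operate densely and reuse
    #    that side's sparse index
    # 2. both sides share the same sparse index: operate on sp_values directly
    # 3. otherwise: dispatch to a sparse kernel in pandas._libs.sparse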
    if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
        with np.errstate(all="ignore"):
            result = op(left.to_dense(), right.to_dense())
            fill = op(_get_fill(left), _get_fill(right))

        if left.sp_index.ngaps == 0:
            index = left.sp_index
        else:
            index = right.sp_index
    elif left.sp_index.equals(right.sp_index):
        with np.errstate(all="ignore"):
            result = op(left.sp_values, right.sp_values)
            fill = op(_get_fill(left), _get_fill(right))
        index = left.sp_index
    else:
        if name[0] == "r":
            left, right = right, left
            name = name[1:]

        if name in ("and", "or", "xor") and dtype == "bool":
            opname = f"sparse_{name}_uint8"
            # to make template simple, cast here
            left_sp_values = left.sp_values.view(np.uint8)
            right_sp_values = right.sp_values.view(np.uint8)
            result_dtype = bool
        else:
            opname = f"sparse_{name}_{dtype}"
            left_sp_values = left.sp_values
            right_sp_values = right.sp_values

        if (
            name in ["floordiv", "mod"]
            and (right == 0).any()
            and left.dtype.kind in ["i", "u"]
        ):
            # Match the non-Sparse Series behavior
            opname = f"sparse_{name}_float64"
            left_sp_values = left_sp_values.astype("float64")
            right_sp_values = right_sp_values.astype("float64")

        sparse_op = getattr(splib, opname)

        with np.errstate(all="ignore"):
            result, index, fill = sparse_op(
                left_sp_values,
                left.sp_index,
                left.fill_value,
                right_sp_values,
                right.sp_index,
                right.fill_value,
            )

    if name == "divmod":
        # result is a 2-tuple
        # error: Incompatible return value type (got "Tuple[SparseArray,
        # SparseArray]", expected "SparseArray")
        return (  # type: ignore[return-value]
            _wrap_result(name, result[0], index, fill[0], dtype=result_dtype),
            _wrap_result(name, result[1], index, fill[1], dtype=result_dtype),
        )

    if result_dtype is None:
        result_dtype = result.dtype

    return _wrap_result(name, result, index, fill, dtype=result_dtype)


def _wrap_result(
    name: str, data, sparse_index, fill_value, dtype: Dtype | None = None
) -> SparseArray:
    """
    wrap op result to have correct dtype
    """
    if name.startswith("__"):
        # e.g. __eq__ --> eq
        name = name[2:-2]

    if name in ("eq", "ne", "lt", "gt", "le", "ge"):
        dtype = bool

    fill_value = lib.item_from_zerodim(fill_value)

    if is_bool_dtype(dtype):
        # fill_value may be np.bool_
        fill_value = bool(fill_value)
    return SparseArray(
        data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype
    )


class SparseArray(OpsMixin, PandasObject, ExtensionArray):
    """
    An ExtensionArray for storing sparse data.

    Parameters
    ----------
    data : array-like or scalar
        A dense array of values to store in the SparseArray. This may contain
        `fill_value`.
    sparse_index : SparseIndex, optional
    fill_value : scalar, optional
        Elements in data that are ``fill_value`` are not stored in the
        SparseArray. For memory savings, this should be the most common value
        in `data`. By default, `fill_value` depends on the dtype of `data`:

        =========== ==========
        data.dtype  na_value
        =========== ==========
        float       ``np.nan``
        int         ``0``
        bool        ``False``
        datetime64  ``pd.NaT``
        timedelta64 ``pd.NaT``
        =========== ==========

        The fill value is potentially specified in three ways. In order of
        precedence, these are

        1. The `fill_value` argument
        2. ``dtype.fill_value`` if `fill_value` is None and `dtype` is
           a ``SparseDtype``
        3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype`
           is not a ``SparseDtype`` and `data` is a ``SparseArray``.

    kind : str
        Can be 'integer' or 'block', default is 'integer'.
        The type of storage for sparse locations.

        * 'block': Stores a `block` and `block_length` for each
          contiguous *span* of sparse values. This is best when
          sparse data tends to be clumped together, with large
          regions of ``fill_value`` values between sparse values.
        * 'integer': uses an integer to store the location of
          each sparse value.

    dtype : np.dtype or SparseDtype, optional
        The dtype to use for the SparseArray. For numpy dtypes, this
        determines the dtype of ``self.sp_values``. For SparseDtype,
        this determines ``self.sp_values`` and ``self.fill_value``.
    copy : bool, default False
        Whether to explicitly copy the incoming `data` array.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> from pandas.arrays import SparseArray
    >>> arr = SparseArray([0, 0, 1, 2])
    >>> arr
    [0, 0, 1, 2]
    Fill: 0
    IntIndex
    Indices: array([2, 3], dtype=int32)
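
    A ``kind='block'`` index stores runs of non-fill values instead of
    individual positions (illustrative; the exact repr can vary slightly
    across versions):

    >>> arr = SparseArray([0, 0, 1, 2], kind="block")
    >>> arr.sp_index
    BlockIndex
    Block locations: array([2], dtype=int32)
    Block lengths: array([2], dtype=int32)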
    """

    _subtyp = "sparse_array"  # register ABCSparseArray
    _hidden_attrs = PandasObject._hidden_attrs | frozenset([])
    _sparse_index: SparseIndex
    _sparse_values: np.ndarray
    _dtype: SparseDtype

    def __init__(
        self,
        data,
        sparse_index=None,
        fill_value=None,
        kind: SparseIndexKind = "integer",
        dtype: Dtype | None = None,
        copy: bool = False,
    ) -> None:
        if fill_value is None and isinstance(dtype, SparseDtype):
            fill_value = dtype.fill_value

        if isinstance(data, type(self)):
            # disable normal inference on dtype, sparse_index, & fill_value
            if sparse_index is None:
                sparse_index = data.sp_index
            if fill_value is None:
                fill_value = data.fill_value
            if dtype is None:
                dtype = data.dtype
            # TODO: make kind=None, and use data.kind?
            data = data.sp_values

        # Handle user-provided dtype
        if isinstance(dtype, str):
            # Two options: dtype='int', regular numpy dtype
            # or dtype='Sparse[int]', a sparse dtype
            try:
                dtype = SparseDtype.construct_from_string(dtype)
            except TypeError:
                dtype = pandas_dtype(dtype)

        if isinstance(dtype, SparseDtype):
            if fill_value is None:
                fill_value = dtype.fill_value
            dtype = dtype.subtype

        if is_scalar(data):
            if sparse_index is None:
                npoints = 1
            else:
                npoints = sparse_index.length

            data = construct_1d_arraylike_from_scalar(data, npoints, dtype=None)
            dtype = data.dtype

        if dtype is not None:
            dtype = pandas_dtype(dtype)

        # TODO: disentangle the fill_value dtype inference from
        # dtype inference
        if data is None:
            # TODO: What should the empty dtype be? Object or float?

            # error: Argument "dtype" to "array" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], None]"; expected "Union[dtype[Any],
            # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any,
            # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
            data = np.array([], dtype=dtype)  # type: ignore[arg-type]

        if not is_array_like(data):
            try:
                # probably shared code in sanitize_series

                data = sanitize_array(data, index=None)
            except ValueError:
                # NumPy may raise a ValueError on data like [1, []]
                # we retry with object dtype here.
                if dtype is None:
                    dtype = np.dtype(object)
                    data = np.atleast_1d(np.asarray(data, dtype=dtype))
                else:
                    raise

        if copy:
            # TODO: avoid double copy when dtype forces cast.
            data = data.copy()

        if fill_value is None:
            fill_value_dtype = data.dtype if dtype is None else dtype
            if fill_value_dtype is None:
                fill_value = np.nan
            else:
                fill_value = na_value_for_dtype(fill_value_dtype)

        if isinstance(data, type(self)) and sparse_index is None:
            sparse_index = data._sparse_index
            # error: Argument "dtype" to "asarray" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], None]"; expected "None"
            sparse_values = np.asarray(
                data.sp_values, dtype=dtype  # type: ignore[arg-type]
            )
        elif sparse_index is None:
            data = extract_array(data, extract_numpy=True)
            if not isinstance(data, np.ndarray):
                # EA
                if is_datetime64tz_dtype(data.dtype):
                    warnings.warn(
                        f"Creating SparseArray from {data.dtype} data "
                        "loses timezone information. Cast to object before "
                        "sparse to retain timezone information.",
                        UserWarning,
                        stacklevel=find_stack_level(),
                    )
                    data = np.asarray(data, dtype="datetime64[ns]")
                    if fill_value is NaT:
                        fill_value = np.datetime64("NaT", "ns")
                data = np.asarray(data)
            sparse_values, sparse_index, fill_value = _make_sparse(
                # error: Argument "dtype" to "_make_sparse" has incompatible type
                # "Union[ExtensionDtype, dtype[Any], None]"; expected
                # "Optional[dtype[Any]]"
                data,
                kind=kind,
                fill_value=fill_value,
                dtype=dtype,  # type: ignore[arg-type]
            )
        else:
            # error: Argument "dtype" to "asarray" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], None]"; expected "None"
            sparse_values = np.asarray(data, dtype=dtype)  # type: ignore[arg-type]
            if len(sparse_values) != sparse_index.npoints:
                raise AssertionError(
                    f"Non array-like type {type(sparse_values)} must "
                    "have the same length as the index"
                )
        self._sparse_index = sparse_index
        self._sparse_values = sparse_values
        self._dtype = SparseDtype(sparse_values.dtype, fill_value)

    @classmethod
    def _simple_new(
        cls: type[SparseArrayT],
        sparse_array: np.ndarray,
        sparse_index: SparseIndex,
        dtype: SparseDtype,
    ) -> SparseArrayT:
        new = object.__new__(cls)
        new._sparse_index = sparse_index
        new._sparse_values = sparse_array
        new._dtype = dtype
        return new

    @classmethod
    def from_spmatrix(cls: type[SparseArrayT], data: spmatrix) -> SparseArrayT:
        """
        Create a SparseArray from a scipy.sparse matrix.

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            This should be a SciPy sparse matrix where the size
            of the second dimension is 1. In other words, a
            sparse matrix with a single column.

        Returns
        -------
        SparseArray

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.coo_matrix((4, 1))
        >>> pd.arrays.SparseArray.from_spmatrix(mat)
        [0.0, 0.0, 0.0, 0.0]
        Fill: 0.0
        IntIndex
        Indices: array([], dtype=int32)
        """
        length, ncol = data.shape

        if ncol != 1:
            raise ValueError(f"'data' must have a single column, not '{ncol}'")

        # our sparse index classes require that the positions be strictly
        # increasing, so we need to sort the indices and values accordingly
        data = data.tocsc()
        data.sort_indices()
        arr = data.data
        idx = data.indices

        zero = np.array(0, dtype=arr.dtype).item()
        dtype = SparseDtype(arr.dtype, zero)
        index = IntIndex(length, idx)

        return cls._simple_new(arr, index, dtype)

    def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
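        # Densify to a plain ndarray: place sp_values at their indices on a
        # background of fill_value. E.g. (roughly) np.asarray(
        # SparseArray([0, 0, 1], fill_value=0)) gives array([0, 0, 1]).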
        fill_value = self.fill_value

        if self.sp_index.ngaps == 0:
            # Compat for na dtype and int values.
            return self.sp_values
        if dtype is None:
            # Can NumPy represent this type?
            # If not, `np.result_type` will raise. We catch that
            # and return object.
            if is_datetime64_any_dtype(self.sp_values.dtype):
                # However, we *do* special-case the common case of
                # a datetime64 with pandas NaT.
                if fill_value is NaT:
                    # Can't put pd.NaT in a datetime64[ns]
                    fill_value = np.datetime64("NaT")
            try:
                dtype = np.result_type(self.sp_values.dtype, type(fill_value))
            except TypeError:
                dtype = object

        out = np.full(self.shape, fill_value, dtype=dtype)
        out[self.sp_index.indices] = self.sp_values
        return out

    def __setitem__(self, key, value):
        # I suppose we could allow setting of non-fill_value elements.
        # TODO(SparseArray.__setitem__): remove special cases in
        # ExtensionBlock.where
        msg = "SparseArray does not support item assignment via setitem"
        raise TypeError(msg)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        return cls(scalars, dtype=dtype)

    @classmethod
    def _from_factorized(cls, values, original):
        return cls(values, dtype=original.dtype)

    # ------------------------------------------------------------------------
    # Data
    # ------------------------------------------------------------------------
    @property
    def sp_index(self) -> SparseIndex:
        """
        The SparseIndex containing the location of non- ``fill_value`` points.
        """
        return self._sparse_index

    @property
    def sp_values(self) -> np.ndarray:
        """
        An ndarray containing the non- ``fill_value`` values.

        Examples
        --------
        >>> from pandas.arrays import SparseArray
        >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0)
        >>> s.sp_values
        array([1, 2])
        """
        return self._sparse_values

    @property
    def dtype(self) -> SparseDtype:
        return self._dtype

    @property
    def fill_value(self):
        """
        Elements in `data` that are `fill_value` are not stored.

        For memory savings, this should be the most common value in the array.
        """
        return self.dtype.fill_value

    @fill_value.setter
    def fill_value(self, value) -> None:
        self._dtype = SparseDtype(self.dtype.subtype, value)

    @property
    def kind(self) -> SparseIndexKind:
        """
        The kind of sparse index for this array. One of {'integer', 'block'}.
        """
        if isinstance(self.sp_index, IntIndex):
            return "integer"
        else:
            return "block"

    @property
    def _valid_sp_values(self) -> np.ndarray:
        sp_vals = self.sp_values
        mask = notna(sp_vals)
        return sp_vals[mask]

    def __len__(self) -> int:
        return self.sp_index.length

    @property
    def _null_fill_value(self) -> bool:
        return self._dtype._is_na_fill_value

    def _fill_value_matches(self, fill_value) -> bool:
        if self._null_fill_value:
            return isna(fill_value)
        else:
            return self.fill_value == fill_value

    @property
    def nbytes(self) -> int:
        return self.sp_values.nbytes + self.sp_index.nbytes

    @property
    def density(self) -> float:
        """
        The percent of non- ``fill_value`` points, as decimal.

        Examples
        --------
        >>> from pandas.arrays import SparseArray
        >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
        >>> s.density
        0.6
        """
        return self.sp_index.npoints / self.sp_index.length

    @property
    def npoints(self) -> int:
        """
        The number of non- ``fill_value`` points.

        Examples
        --------
        >>> from pandas.arrays import SparseArray
        >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
        >>> s.npoints
        3
        """
        return self.sp_index.npoints

    def isna(self):
        # If null fill value, we want SparseDtype[bool, true]
        # to preserve the same memory usage.
        dtype = SparseDtype(bool, self._null_fill_value)
        if self._null_fill_value:
            return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
        mask = np.full(len(self), False, dtype=np.bool_)
        mask[self.sp_index.indices] = isna(self.sp_values)
        return type(self)(mask, fill_value=False, dtype=dtype)

    def fillna(
        self: SparseArrayT,
        value=None,
        method: FillnaOptions | None = None,
        limit: int | None = None,
    ) -> SparseArrayT:
        """
        Fill missing values with `value`.

        Parameters
        ----------
        value : scalar, optional
        method : str, optional

            .. warning::

               Using 'method' will result in high memory use,
               as the entire array is first converted to an
               in-memory ndarray.

        limit : int, optional

        Returns
        -------
        SparseArray

        Notes
        -----
        When `value` is specified, the result's ``fill_value`` depends on
        ``self.fill_value``. The goal is to maintain low-memory use.

        If ``self.fill_value`` is NA, the result dtype will be
        ``SparseDtype(self.dtype, fill_value=value)``. This will preserve
        the amount of memory used before and after filling.

        When ``self.fill_value`` is not NA, the result dtype will be
        ``self.dtype``. Again, this preserves the amount of memory used.
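
        Examples
        --------
        A quick sketch with a null fill value (the filled value becomes the
        new fill value; output shown as of recent pandas versions):

        >>> arr = pd.arrays.SparseArray([np.nan, 0.0, 1.0])
        >>> arr.fillna(0.0)
        [0.0, 0.0, 1.0]
        Fill: 0.0
        IntIndex
        Indices: array([1, 2], dtype=int32)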
        """
        if (method is None and value is None) or (
            method is not None and value is not None
        ):
            raise ValueError("Must specify one of 'method' or 'value'.")

        if method is not None:
            msg = "fillna with 'method' requires high memory usage."
            warnings.warn(
                msg,
                PerformanceWarning,
                stacklevel=find_stack_level(),
            )
            new_values = np.asarray(self)
            # interpolate_2d modifies new_values inplace
            interpolate_2d(new_values, method=method, limit=limit)
            return type(self)(new_values, fill_value=self.fill_value)

        else:
            new_values = np.where(isna(self.sp_values), value, self.sp_values)

            if self._null_fill_value:
                # This is essentially just updating the dtype.
                new_dtype = SparseDtype(self.dtype.subtype, fill_value=value)
            else:
                new_dtype = self.dtype

        return self._simple_new(new_values, self._sparse_index, new_dtype)

    def shift(self: SparseArrayT, periods: int = 1, fill_value=None) -> SparseArrayT:
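        # Roughly: shift by stitching together `periods` fill values and a
        # truncated copy of self, e.g. a shift of 1 on [1, 2, 3] gives
        # [fill_value, 1, 2] (a sketch of the concat-based approach below).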
        if not len(self) or periods == 0:
            return self.copy()

        if isna(fill_value):
            fill_value = self.dtype.na_value

        subtype = np.result_type(fill_value, self.dtype.subtype)

        if subtype != self.dtype.subtype:
            # just coerce up front
            arr = self.astype(SparseDtype(subtype, self.fill_value))
        else:
            arr = self

        empty = self._from_sequence(
            [fill_value] * min(abs(periods), len(self)), dtype=arr.dtype
        )

        if periods > 0:
            a = empty
            b = arr[:-periods]
        else:
            a = arr[abs(periods) :]
            b = empty
        return arr._concat_same_type([a, b])

    def _first_fill_value_loc(self):
        """
        Get the location of the first fill value.

        Returns
        -------
        int
        """
        if len(self) == 0 or self.sp_index.npoints == len(self):
            return -1

        indices = self.sp_index.indices
        if not len(indices) or indices[0] > 0:
            return 0

        # Append a value larger than 1 so that the case where the fill
        # value appears only in the tail of the array is also caught.
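        # E.g. (roughly) for length 5 with stored indices [0, 1, 3]:
        # diff = [1, 2, 2]; the first diff > 1 is at position 1, and
        # indices[1] + 1 == 2 is the first fill-value location.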
        diff = np.r_[np.diff(indices), 2]
        return indices[(diff > 1).argmax()] + 1

    def unique(self: SparseArrayT) -> SparseArrayT:
        uniques = algos.unique(self.sp_values)
        if len(self.sp_values) != len(self):
            fill_loc = self._first_fill_value_loc()
            # In order to align with the behavior of pd.unique or
            # pd.Series.unique, we should keep the original order;
            # here we use unique again to find the insertion place.
            # Since the length of sp_values is not large, the minor
            # performance hit is worth the correctness.
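            # E.g. (roughly) [1, 0, 0, 2] with fill value 0: sp_values are
            # [1, 2], the first fill value is at position 1, and one unique
            # stored value precedes it, so 0 is inserted at slot 1,
            # giving [1, 0, 2] -- matching pd.unique([1, 0, 0, 2]).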
            insert_loc = len(algos.unique(self.sp_values[:fill_loc]))
            uniques = np.insert(uniques, insert_loc, self.fill_value)
        return type(self)._from_sequence(uniques, dtype=self.dtype)

    def _values_for_factorize(self):
        # Still override this for hash_pandas_object
        return np.asarray(self), self.fill_value

    def factorize(
        self,
        use_na_sentinel: bool = True,
    ) -> tuple[np.ndarray, SparseArray]:
        # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
        # The sparsity on this is backwards from what Sparse would want. Want
        # ExtensionArray.factorize -> Tuple[EA, EA]
        # Given that we have to return a dense array of codes, why bother
        # implementing an efficient factorize?
        codes, uniques = algos.factorize(
            np.asarray(self), use_na_sentinel=use_na_sentinel
        )
        uniques_sp = SparseArray(uniques, dtype=self.dtype)
        return codes, uniques_sp

    def value_counts(self, dropna: bool = True) -> Series:
        """
        Returns a Series containing counts of unique values.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of NaN, even if NaN is in sp_values.

        Returns
        -------
        counts : Series
        """
        from pandas import (
            Index,
            Series,
        )

        keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
        fcounts = self.sp_index.ngaps
        if fcounts > 0 and (not self._null_fill_value or not dropna):
            mask = isna(keys) if self._null_fill_value else keys == self.fill_value
            if mask.any():
                counts[mask] += fcounts
            else:
                # error: Argument 1 to "insert" has incompatible type "Union[
                # ExtensionArray,ndarray[Any, Any]]"; expected "Union[
                # _SupportsArray[dtype[Any]], Sequence[_SupportsArray[dtype
                # [Any]]], Sequence[Sequence[_SupportsArray[dtype[Any]]]],
                # Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]], Sequence
                # [Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]]]]"
                keys = np.insert(keys, 0, self.fill_value)  # type: ignore[arg-type]
                counts = np.insert(counts, 0, fcounts)

        if not isinstance(keys, ABCIndex):
            index = Index(keys)
        else:
            index = keys
        return Series(counts, index=index, copy=False)

    # --------
    # Indexing
    # --------
    @overload
    def __getitem__(self, key: ScalarIndexer) -> Any:
        ...

    @overload
    def __getitem__(
        self: SparseArrayT,
        key: SequenceIndexer | tuple[int | ellipsis, ...],
    ) -> SparseArrayT:
        ...

    def __getitem__(
        self: SparseArrayT,
        key: PositionalIndexer | tuple[int | ellipsis, ...],
    ) -> SparseArrayT | Any:
        if isinstance(key, tuple):
            key = unpack_tuple_and_ellipses(key)
            if key is Ellipsis:
                raise ValueError("Cannot slice with Ellipsis")

        if is_integer(key):
            return self._get_val_at(key)
        elif isinstance(key, tuple):
            # error: Invalid index type "Tuple[Union[int, ellipsis], ...]"
            # for "ndarray[Any, Any]"; expected type
            # "Union[SupportsIndex, _SupportsArray[dtype[Union[bool_,
            # integer[Any]]]], _NestedSequence[_SupportsArray[dtype[
            # Union[bool_, integer[Any]]]]], _NestedSequence[Union[
            # bool, int]], Tuple[Union[SupportsIndex, _SupportsArray[
            # dtype[Union[bool_, integer[Any]]]], _NestedSequence[
            # _SupportsArray[dtype[Union[bool_, integer[Any]]]]],
            # _NestedSequence[Union[bool, int]]], ...]]"
            data_slice = self.to_dense()[key]  # type: ignore[index]
        elif isinstance(key, slice):
            # Avoid densifying when handling contiguous slices
            if key.step is None or key.step == 1:
                start = 0 if key.start is None else key.start
                if start < 0:
                    start += len(self)

                end = len(self) if key.stop is None else key.stop
                if end < 0:
                    end += len(self)

                indices = self.sp_index.indices
                keep_inds = np.flatnonzero((indices >= start) & (indices < end))
                sp_vals = self.sp_values[keep_inds]

                sp_index = indices[keep_inds].copy()

                # If we've sliced to not include the start of the array, all our indices
                # should be shifted. NB: here we are careful to also not shift by a
                # negative value for a case like [0, 1][-100:] where the start index
                # should be treated like 0
                if start > 0:
                    sp_index -= start

                # Length of our result should match applying this slice to a range
                # of the length of our original array
                new_len = len(range(len(self))[key])
                new_sp_index = make_sparse_index(new_len, sp_index, self.kind)
                return type(self)._simple_new(sp_vals, new_sp_index, self.dtype)
            else:
                indices = np.arange(len(self), dtype=np.int32)[key]
                return self.take(indices)

        elif not is_list_like(key):
            # e.g. "foo" or 2.5
            # exception message copied from numpy
            raise IndexError(
                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                r"(`None`) and integer or boolean arrays are valid indices"
            )

        else:
            if isinstance(key, SparseArray):
                # NOTE: If we guarantee that SparseDtype(bool)
                # has only fill_value True, False, or NaN
                # (see GH PR 44955)
                # we can apply the mask very fast:
                if is_bool_dtype(key):
                    if isna(key.fill_value):
                        return self.take(key.sp_index.indices[key.sp_values])
                    if not key.fill_value:
                        return self.take(key.sp_index.indices)
                    n = len(self)
                    mask = np.full(n, True, dtype=np.bool_)
                    mask[key.sp_index.indices] = False
                    return self.take(np.arange(n)[mask])
                else:
                    key = np.asarray(key)

            key = check_array_indexer(self, key)

            if com.is_bool_indexer(key):
                # mypy doesn't know we have an array here
                key = cast(np.ndarray, key)
                return self.take(np.arange(len(key), dtype=np.int32)[key])
            elif hasattr(key, "__len__"):
                return self.take(key)
            else:
                raise ValueError(f"Cannot slice with '{key}'")

        return type(self)(data_slice, kind=self.kind)

    def _get_val_at(self, loc):
        loc = validate_insert_loc(loc, len(self))

        sp_loc = self.sp_index.lookup(loc)
        if sp_loc == -1:
            return self.fill_value
        else:
            val = self.sp_values[sp_loc]
            val = maybe_box_datetimelike(val, self.sp_values.dtype)
            return val

    def take(
        self: SparseArrayT, indices, *, allow_fill: bool = False, fill_value=None
    ) -> SparseArrayT:
        if is_scalar(indices):
            raise ValueError(f"'indices' must be an array, not a scalar '{indices}'.")
        indices = np.asarray(indices, dtype=np.int32)

        dtype = None
        if indices.size == 0:
            result = np.array([], dtype="object")
            dtype = self.dtype
        elif allow_fill:
            result = self._take_with_fill(indices, fill_value=fill_value)
        else:
            return self._take_without_fill(indices)

        return type(self)(
            result, fill_value=self.fill_value, kind=self.kind, dtype=dtype
        )

    def _take_with_fill(self, indices, fill_value=None) -> np.ndarray:
        if fill_value is None:
            fill_value = self.dtype.na_value

        if indices.min() < -1:
            raise ValueError(
                "Invalid value in 'indices'. Must be between -1 "
                "and the length of the array."
            )

        if indices.max() >= len(self):
            raise IndexError("out of bounds value in 'indices'.")

        if len(self) == 0:
            # Empty... Allow taking only if all empty
            if (indices == -1).all():
                dtype = np.result_type(self.sp_values, type(fill_value))
                taken = np.empty_like(indices, dtype=dtype)
                taken.fill(fill_value)
                return taken
            else:
                raise IndexError("cannot do a non-empty take from an empty axes.")

        # sp_indexer may be -1 for two reasons
        # 1.) we took for an index of -1 (new)
        # 2.) we took a value that was self.fill_value (old)
        sp_indexer = self.sp_index.lookup_array(indices)
        new_fill_indices = indices == -1
        old_fill_indices = (sp_indexer == -1) & ~new_fill_indices

        if self.sp_index.npoints == 0 and old_fill_indices.all():
            # We've looked up all valid points on an all-sparse array.
            taken = np.full(
                sp_indexer.shape, fill_value=self.fill_value, dtype=self.dtype.subtype
            )

        elif self.sp_index.npoints == 0:
            # Avoid taking from the empty self.sp_values
            _dtype = np.result_type(self.dtype.subtype, type(fill_value))
            taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype)
        else:
            taken = self.sp_values.take(sp_indexer)

            # Fill in two steps.
            # Old fill values
            # New fill values
            # potentially coercing to a new dtype at each stage.

            m0 = sp_indexer[old_fill_indices] < 0
            m1 = sp_indexer[new_fill_indices] < 0

            result_type = taken.dtype

            if m0.any():
                result_type = np.result_type(result_type, type(self.fill_value))
                taken = taken.astype(result_type)
                taken[old_fill_indices] = self.fill_value

            if m1.any():
                result_type = np.result_type(result_type, type(fill_value))
                taken = taken.astype(result_type)
                taken[new_fill_indices] = fill_value

        return taken

    def _take_without_fill(self: SparseArrayT, indices) -> SparseArrayT:
        to_shift = indices < 0

        n = len(self)

        if (indices.max() >= n) or (indices.min() < -n):
            if n == 0:
                raise IndexError("cannot do a non-empty take from an empty axes.")
            raise IndexError("out of bounds value in 'indices'.")

        if to_shift.any():
            indices = indices.copy()
            indices[to_shift] += n

        sp_indexer = self.sp_index.lookup_array(indices)
        value_mask = sp_indexer != -1
        new_sp_values = self.sp_values[sp_indexer[value_mask]]

        value_indices = np.flatnonzero(value_mask).astype(np.int32, copy=False)

        new_sp_index = make_sparse_index(len(indices), value_indices, kind=self.kind)
        return type(self)._simple_new(new_sp_values, new_sp_index, dtype=self.dtype)

    def searchsorted(
        self,
        v: ArrayLike | object,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter = None,
    ) -> npt.NDArray[np.intp] | np.intp:
        msg = "searchsorted requires high memory usage."
        warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())
        v = np.asarray(v)
        return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter)

    def copy(self: SparseArrayT) -> SparseArrayT:
        values = self.sp_values.copy()
        return self._simple_new(values, self.sp_index, self.dtype)

    @classmethod
    def _concat_same_type(
        cls: type[SparseArrayT], to_concat: Sequence[SparseArrayT]
    ) -> SparseArrayT:
        fill_value = to_concat[0].fill_value

        values = []
        length = 0

        if to_concat:
            sp_kind = to_concat[0].kind
        else:
            sp_kind = "integer"

        sp_index: SparseIndex
        if sp_kind == "integer":
            indices = []

            for arr in to_concat:
                int_idx = arr.sp_index.indices.copy()
                int_idx += length  # TODO: wraparound
                length += arr.sp_index.length

                values.append(arr.sp_values)
                indices.append(int_idx)

            data = np.concatenate(values)
            indices_arr = np.concatenate(indices)
            # error: Argument 2 to "IntIndex" has incompatible type
            # "ndarray[Any, dtype[signedinteger[_32Bit]]]";
            # expected "Sequence[int]"
            sp_index = IntIndex(length, indices_arr)  # type: ignore[arg-type]

        else:
            # when concatenating block indices, we don't claim that you'll
            # get an identical index as concatenating the values and then
            # creating a new index. We don't want to spend the time trying
            # to merge blocks across arrays in `to_concat`, so the resulting
            # BlockIndex may have more blocks.
            blengths = []
            blocs = []

            for arr in to_concat:
                block_idx = arr.sp_index.to_block_index()

                values.append(arr.sp_values)
                blocs.append(block_idx.blocs.copy() + length)
                blengths.append(block_idx.blengths)
                length += arr.sp_index.length

            data = np.concatenate(values)
            blocs_arr = np.concatenate(blocs)
            blengths_arr = np.concatenate(blengths)

            sp_index = BlockIndex(length, blocs_arr, blengths_arr)

        return cls(data, sparse_index=sp_index, fill_value=fill_value)

    def astype(self, dtype: AstypeArg | None = None, copy: bool = True):
        """
        Change the dtype of a SparseArray.

        The output will always be a SparseArray. To convert to a dense
        ndarray with a certain dtype, use :meth:`numpy.asarray`.

        Parameters
        ----------
        dtype : np.dtype or ExtensionDtype
            For SparseDtype, this changes the dtype of
            ``self.sp_values`` and the ``self.fill_value``.

            For other dtypes, this only changes the dtype of
            ``self.sp_values``.

        copy : bool, default True
            Whether to ensure a copy is made, even if not necessary.

        Returns
        -------
        SparseArray

        Examples
        --------
        >>> arr = pd.arrays.SparseArray([0, 0, 1, 2])
        >>> arr
        [0, 0, 1, 2]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        >>> arr.astype(SparseDtype(np.dtype('int32')))
        [0, 0, 1, 2]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        Using a NumPy dtype with a different kind (e.g. float) will coerce
        just ``self.sp_values``.

        >>> arr.astype(SparseDtype(np.dtype('float64')))
        ... # doctest: +NORMALIZE_WHITESPACE
        [nan, nan, 1.0, 2.0]
        Fill: nan
        IntIndex
        Indices: array([2, 3], dtype=int32)

        Using a SparseDtype, you can also change the fill value.

        >>> arr.astype(SparseDtype("float64", fill_value=0.0))
        ... # doctest: +NORMALIZE_WHITESPACE
        [0.0, 0.0, 1.0, 2.0]
        Fill: 0.0
        IntIndex
        Indices: array([2, 3], dtype=int32)
        """
        if is_dtype_equal(dtype, self._dtype):
            if not copy:
                return self
            else:
                return self.copy()

        future_dtype = pandas_dtype(dtype)
        if not isinstance(future_dtype, SparseDtype):
            # GH#34457
            values = np.asarray(self)
            values = ensure_wrapped_if_datetimelike(values)
            return astype_array(values, dtype=future_dtype, copy=False)

        dtype = self.dtype.update_dtype(dtype)
        subtype = pandas_dtype(dtype._subtype_with_str)
        subtype = cast(np.dtype, subtype)  # ensured by update_dtype
        values = ensure_wrapped_if_datetimelike(self.sp_values)
        sp_values = astype_array(values, subtype, copy=copy)
        sp_values = np.asarray(sp_values)

        return self._simple_new(sp_values, self.sp_index, dtype)

    def map(self: SparseArrayT, mapper) -> SparseArrayT:
        """
        Map values using an input mapping or function.

        Parameters
        ----------
        mapper : dict, Series, callable
            The correspondence from old values to new.

        Returns
        -------
        SparseArray
            The output array will have the same density as the input.
            The output fill value will be the result of applying the
            mapping to ``self.fill_value``.

        Examples
        --------
        >>> arr = pd.arrays.SparseArray([0, 1, 2])
        >>> arr.map(lambda x: x + 10)
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)

        >>> arr.map({0: 10, 1: 11, 2: 12})
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)

        >>> arr.map(pd.Series([10, 11, 12], index=[0, 1, 2]))
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)
        """
        # this is used in apply.
        # We get hit since we're an "is_extension_array_dtype" but regular extension
        # types are not hit. This may be worth adding to the interface.
        if isinstance(mapper, ABCSeries):
            mapper = mapper.to_dict()

        if isinstance(mapper, abc.Mapping):
            fill_value = mapper.get(self.fill_value, self.fill_value)
            sp_values = [mapper.get(x, None) for x in self.sp_values]
        else:
            fill_value = mapper(self.fill_value)
            sp_values = [mapper(x) for x in self.sp_values]

        return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_value)

    def to_dense(self) -> np.ndarray:
        """
        Convert SparseArray to a NumPy array.

        Returns
        -------
        arr : NumPy array
        """
        return np.asarray(self, dtype=self.sp_values.dtype)

    def _where(self, mask, value):
        # NB: may not preserve dtype, e.g. result may be Sparse[float64]
        # while self is Sparse[int64]
        naive_implementation = np.where(mask, self, value)
        dtype = SparseDtype(naive_implementation.dtype, fill_value=self.fill_value)
        result = type(self)._from_sequence(naive_implementation, dtype=dtype)
        return result

    # ------------------------------------------------------------------------
    # IO
    # ------------------------------------------------------------------------
    def __setstate__(self, state) -> None:
        """Necessary for making this object picklable"""
        if isinstance(state, tuple):
            # Compat for pandas < 0.24.0
            nd_state, (fill_value, sp_index) = state
            sparse_values = np.array([])
            sparse_values.__setstate__(nd_state)

            self._sparse_values = sparse_values
            self._sparse_index = sp_index
            self._dtype = SparseDtype(sparse_values.dtype, fill_value)
        else:
            self.__dict__.update(state)

    def nonzero(self) -> tuple[npt.NDArray[np.int32]]:
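        # Indices of the nonzero stored values; when fill_value is zero the
        # gaps are zeros too, so sp_index.indices is exactly the answer.
        # E.g. (roughly) SparseArray([0, 1, 0, 2]).nonzero()
        # -> (array([1, 3], dtype=int32),)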
        if self.fill_value == 0:
            return (self.sp_index.indices,)
        else:
            return (self.sp_index.indices[self.sp_values != 0],)

    # ------------------------------------------------------------------------
    # Reductions
    # ------------------------------------------------------------------------

    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
        method = getattr(self, name, None)

        if method is None:
            raise TypeError(f"cannot perform {name} with type {self.dtype}")

        if skipna:
            arr = self
        else:
            arr = self.dropna()

        return getattr(arr, name)(**kwargs)

    def all(self, axis=None, *args, **kwargs):
        """
        Tests whether all elements evaluate to True

        Returns
        -------
        all : bool

        See Also
        --------
        numpy.all
        """
        nv.validate_all(args, kwargs)

        values = self.sp_values

        if len(values) != len(self) and not np.all(self.fill_value):
            return False

        return values.all()

    def any(self, axis: AxisInt = 0, *args, **kwargs):
        """
        Tests whether at least one element evaluates to True

        Returns
        -------
        any : bool

        See Also
        --------
        numpy.any
        """
        nv.validate_any(args, kwargs)

        values = self.sp_values

        if len(values) != len(self) and np.any(self.fill_value):
            return True

        return values.any().item()

    def sum(
        self,
        axis: AxisInt = 0,
        min_count: int = 0,
        skipna: bool = True,
        *args,
        **kwargs,
    ) -> Scalar:
        """
        Sum of non-NA/null values

        Parameters
        ----------
        axis : int, default 0
            Not Used. NumPy compatibility.
        min_count : int, default 0
            The required number of valid values to perform the summation. If fewer
            than ``min_count`` valid values are present, the result will be the
            missing value indicator for the subarray type.
        *args, **kwargs
            Not Used. NumPy compatibility.

        Returns
        -------
        scalar
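
        Examples
        --------
        A quick sketch; fill values count toward the sum when the fill
        value is not NA:

        >>> from pandas.arrays import SparseArray
        >>> SparseArray([1, 0, 2]).sum()
        3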
        """
        nv.validate_sum(args, kwargs)
        valid_vals = self._valid_sp_values
        sp_sum = valid_vals.sum()
        has_na = self.sp_index.ngaps > 0 and not self._null_fill_value

        if has_na and not skipna:
            return na_value_for_dtype(self.dtype.subtype, compat=False)

        if self._null_fill_value:
            if check_below_min_count(valid_vals.shape, None, min_count):
                return na_value_for_dtype(self.dtype.subtype, compat=False)
            return sp_sum
        else:
            nsparse = self.sp_index.ngaps
            if check_below_min_count(valid_vals.shape, None, min_count - nsparse):
                return na_value_for_dtype(self.dtype.subtype, compat=False)
            return sp_sum + self.fill_value * nsparse

    def cumsum(self, axis: AxisInt = 0, *args, **kwargs) -> SparseArray:
        """
        Cumulative sum of non-NA/null values.

        When performing the cumulative summation, any NA/null values will
        be skipped. The resulting SparseArray will preserve the locations of
        NaN values, but the fill value will be `np.nan` regardless.

        Parameters
        ----------
        axis : int or None
            Axis over which to perform the cumulative summation. If None,
            perform cumulative summation over flattened array.

        Returns
        -------
        cumsum : SparseArray
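
        Examples
        --------
        An illustrative sketch (with a null fill value, only the stored
        values are accumulated and the NaN location is preserved):

        >>> pd.arrays.SparseArray([1.0, np.nan, 2.0]).cumsum()
        [1.0, nan, 3.0]
        Fill: nan
        IntIndex
        Indices: array([0, 2], dtype=int32)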
        """
        nv.validate_cumsum(args, kwargs)

        if axis is not None and axis >= self.ndim:  # Mimic ndarray behaviour.
            raise ValueError(f"axis(={axis}) out of bounds")

        if not self._null_fill_value:
            return SparseArray(self.to_dense()).cumsum()

        return SparseArray(
            self.sp_values.cumsum(),
            sparse_index=self.sp_index,
            fill_value=self.fill_value,
        )

    def mean(self, axis: Axis = 0, *args, **kwargs):
        """
        Mean of non-NA/null values

        Returns
        -------
        mean : float
        """
        nv.validate_mean(args, kwargs)
        valid_vals = self._valid_sp_values
        sp_sum = valid_vals.sum()
        ct = len(valid_vals)

        if self._null_fill_value:
            return sp_sum / ct
        else:
            nsparse = self.sp_index.ngaps
            return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)

    def max(self, *, axis: AxisInt | None = None, skipna: bool = True):
        """
        Max of array values, ignoring NA values if specified.

        Parameters
        ----------
        axis : int, default None
            Not Used. NumPy compatibility.
        skipna : bool, default True
            Whether to ignore NA values.

        Returns
        -------
        scalar
        """
        nv.validate_minmax_axis(axis, self.ndim)
        return self._min_max("max", skipna=skipna)

    def min(self, *, axis: AxisInt | None = None, skipna: bool = True):
        """
        Min of array values, ignoring NA values if specified.

        Parameters
        ----------
        axis : int, default None
            Not Used. NumPy compatibility.
        skipna : bool, default True
            Whether to ignore NA values.

        Returns
        -------
        scalar
        """
        nv.validate_minmax_axis(axis, self.ndim)
        return self._min_max("min", skipna=skipna)

    def _min_max(self, kind: Literal["min", "max"], skipna: bool) -> Scalar:
        """
        Min/max of non-NA/null values

        Parameters
        ----------
        kind : {"min", "max"}
        skipna : bool

        Returns
        -------
        scalar
        """
        valid_vals = self._valid_sp_values
        has_nonnull_fill_vals = not self._null_fill_value and self.sp_index.ngaps > 0

        if len(valid_vals) > 0:
            sp_min_max = getattr(valid_vals, kind)()

            # If a non-null fill value is currently present, it might be the min/max
            if has_nonnull_fill_vals:
                func = max if kind == "max" else min
                return func(sp_min_max, self.fill_value)
            elif skipna:
                return sp_min_max
            elif self.sp_index.ngaps == 0:
                # No NAs present
                return sp_min_max
            else:
                return na_value_for_dtype(self.dtype.subtype, compat=False)
        elif has_nonnull_fill_vals:
            return self.fill_value
        else:
            return na_value_for_dtype(self.dtype.subtype, compat=False)

    def _argmin_argmax(self, kind: Literal["argmin", "argmax"]) -> int:
        values = self._sparse_values
        index = self._sparse_index.indices
        mask = np.asarray(isna(values))
        func = np.argmax if kind == "argmax" else np.argmin

        idx = np.arange(values.shape[0])
        non_nans = values[~mask]
        non_nan_idx = idx[~mask]

        _candidate = non_nan_idx[func(non_nans)]
        candidate = index[_candidate]

        if isna(self.fill_value):
            return candidate
        if kind == "argmin" and self[candidate] < self.fill_value:
            return candidate
        if kind == "argmax" and self[candidate] > self.fill_value:
            return candidate
        _loc = self._first_fill_value_loc()
        if _loc == -1:
            # fill_value doesn't exist
            return candidate
        else:
            return _loc

    def argmax(self, skipna: bool = True) -> int:
        validate_bool_kwarg(skipna, "skipna")
        if not skipna and self._hasna:
            raise NotImplementedError
        return self._argmin_argmax("argmax")

    def argmin(self, skipna: bool = True) -> int:
        validate_bool_kwarg(skipna, "skipna")
        if not skipna and self._hasna:
            raise NotImplementedError
        return self._argmin_argmax("argmin")

    # ------------------------------------------------------------------------
    # Ufuncs
    # ------------------------------------------------------------------------

    _HANDLED_TYPES = (np.ndarray, numbers.Number)

    def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
        out = kwargs.get("out", ())

        for x in inputs + out:
            if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)):
                return NotImplemented

        # for binary ops, use our custom dunder methods
        result = ops.maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        if "out" in kwargs:
            # e.g. tests.arrays.sparse.test_arithmetics.test_ndarray_inplace
            res = arraylike.dispatch_ufunc_with_out(
                self, ufunc, method, *inputs, **kwargs
            )
            return res

        if method == "reduce":
            result = arraylike.dispatch_reduction_ufunc(
                self, ufunc, method, *inputs, **kwargs
            )
            if result is not NotImplemented:
                # e.g. tests.series.test_ufunc.TestNumpyReductions
                return result

        if len(inputs) == 1:
            # No alignment necessary.
            sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
            fill_value = getattr(ufunc, method)(self.fill_value, **kwargs)

            if ufunc.nout > 1:
                # multiple outputs. e.g. modf
                arrays = tuple(
                    self._simple_new(
                        sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv)
                    )
                    for sp_value, fv in zip(sp_values, fill_value)
                )
                return arrays
            elif method == "reduce":
                # e.g. reductions
                return sp_values

            return self._simple_new(
                sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value)
            )

        new_inputs = tuple(np.asarray(x) for x in inputs)
        result = getattr(ufunc, method)(*new_inputs, **kwargs)
        if out:
            if len(out) == 1:
                out = out[0]
            return out

        if ufunc.nout > 1:
            return tuple(type(self)(x) for x in result)
        elif method == "at":
            # no return value
            return None
        else:
            return type(self)(result)

    # ------------------------------------------------------------------------
    # Ops
    # ------------------------------------------------------------------------

    def _arith_method(self, other, op):
        op_name = op.__name__

        if isinstance(other, SparseArray):
            return _sparse_array_op(self, other, op, op_name)

        elif is_scalar(other):
            with np.errstate(all="ignore"):
                fill = op(_get_fill(self), np.asarray(other))
                result = op(self.sp_values, other)

            if op_name == "divmod":
                left, right = result
                lfill, rfill = fill
                return (
                    _wrap_result(op_name, left, self.sp_index, lfill),
                    _wrap_result(op_name, right, self.sp_index, rfill),
                )

            return _wrap_result(op_name, result, self.sp_index, fill)

        else:
            other = np.asarray(other)
            with np.errstate(all="ignore"):
                if len(self) != len(other):
                    raise AssertionError(
                        f"length mismatch: {len(self)} vs. {len(other)}"
                    )
                if not isinstance(other, SparseArray):
                    dtype = getattr(other, "dtype", None)
                    other = SparseArray(other, fill_value=self.fill_value, dtype=dtype)
                return _sparse_array_op(self, other, op, op_name)

    def _cmp_method(self, other, op) -> SparseArray:
        if not is_scalar(other) and not isinstance(other, type(self)):
            # convert list-like to ndarray
            other = np.asarray(other)

        if isinstance(other, np.ndarray):
            # TODO: make this more flexible than just ndarray...
            other = SparseArray(other, fill_value=self.fill_value)

        if isinstance(other, SparseArray):
            if len(self) != len(other):
                raise ValueError(
                    f"operands have mismatched length {len(self)} and {len(other)}"
                )

            op_name = op.__name__.strip("_")
            return _sparse_array_op(self, other, op, op_name)
        else:
            # scalar
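            # Compare fill_value against the scalar once, broadcast that
            # result densely, then overwrite the positions of stored points.
            # E.g. (roughly) SparseArray([0, 1]) > 0 -> [False, True] with
            # fill_value False.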
            with np.errstate(all="ignore"):
                fill_value = op(self.fill_value, other)
                result = np.full(len(self), fill_value, dtype=np.bool_)
                result[self.sp_index.indices] = op(self.sp_values, other)

            return type(self)(
                result,
                fill_value=fill_value,
                dtype=np.bool_,
            )

    _logical_method = _cmp_method

    def _unary_method(self, op) -> SparseArray:
        fill_value = op(np.array(self.fill_value)).item()
        dtype = SparseDtype(self.dtype.subtype, fill_value)
        # NOTE: if fill_value doesn't change
        # we just have to apply op to sp_values
        if isna(self.fill_value) or fill_value == self.fill_value:
            values = op(self.sp_values)
            return type(self)._simple_new(values, self.sp_index, self.dtype)
        # In the other case we have to recalc indexes
        return type(self)(op(self.to_dense()), dtype=dtype)

    def __pos__(self) -> SparseArray:
        return self._unary_method(operator.pos)

    def __neg__(self) -> SparseArray:
        return self._unary_method(operator.neg)

    def __invert__(self) -> SparseArray:
        return self._unary_method(operator.invert)

    def __abs__(self) -> SparseArray:
        return self._unary_method(operator.abs)

    # ----------
    # Formatting
    # ----------
    def __repr__(self) -> str:
        pp_str = printing.pprint_thing(self)
        pp_fill = printing.pprint_thing(self.fill_value)
        pp_index = printing.pprint_thing(self.sp_index)
        return f"{pp_str}\nFill: {pp_fill}\n{pp_index}"

    def _formatter(self, boxed: bool = False):
        # Defer to the formatter from the GenericArrayFormatter calling us.
        # This will infer the correct formatter from the dtype of the values.
        return None


def _make_sparse(
    arr: np.ndarray,
    kind: SparseIndexKind = "block",
    fill_value=None,
    dtype: np.dtype | None = None,
):
    """
    Convert ndarray to sparse format

    Parameters
    ----------
    arr : ndarray
    kind : {'block', 'integer'}
    fill_value : NaN or another value
    dtype : np.dtype, optional

    Returns
    -------
    (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
    """
    assert isinstance(arr, np.ndarray)

    if arr.ndim > 1:
        raise TypeError("expected dimension <= 1 data")

    if fill_value is None:
        fill_value = na_value_for_dtype(arr.dtype)

    if isna(fill_value):
        mask = notna(arr)
    else:
        # cast to object for a safe equality comparison
        if is_string_dtype(arr.dtype):
            arr = arr.astype(object)

        if is_object_dtype(arr.dtype):
            # NumPy's element-wise equality does not distinguish element
            # types, e.g. 0, 0.0, and False compare equal, so we have to
            # check both the type and the value.
            mask = splib.make_mask_object_ndarray(arr, fill_value)
        else:
            mask = arr != fill_value

    length = len(arr)
    if length != len(mask):
        # the arr is a SparseArray
        indices = mask.sp_index.indices
    else:
        indices = mask.nonzero()[0].astype(np.int32)

    index = make_sparse_index(length, indices, kind)
    sparsified_values = arr[mask]
    if dtype is not None:
        sparsified_values = ensure_wrapped_if_datetimelike(sparsified_values)
        sparsified_values = astype_array(sparsified_values, dtype=dtype)
        sparsified_values = np.asarray(sparsified_values)

    # TODO: copy
    return sparsified_values, index, fill_value


@overload
def make_sparse_index(length: int, indices, kind: Literal["block"]) -> BlockIndex:
    ...


@overload
def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntIndex:
    ...


def make_sparse_index(length: int, indices, kind: SparseIndexKind) -> SparseIndex:
    index: SparseIndex
    if kind == "block":
        locs, lens = splib.get_blocks(indices)
        index = BlockIndex(length, locs, lens)
    elif kind == "integer":
        index = IntIndex(length, indices)
    else:  # pragma: no cover
        raise ValueError("must be block or integer type")
    return index
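

# Example (illustrative) of the two index kinds:
# make_sparse_index(4, np.array([0, 1, 3], dtype=np.int32), "integer")
# -> IntIndex(4, [0, 1, 3]), while "block" groups the run [0, 1] into a
# single block: BlockIndex(4, blocs=[0, 3], blengths=[2, 1]).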