1"""
2SparseArray data structure
3"""
4from __future__ import annotations
5
6from collections import abc
7import numbers
8import operator
9from typing import (
10 TYPE_CHECKING,
11 Any,
12 Callable,
13 Literal,
14 Sequence,
15 TypeVar,
16 cast,
17 overload,
18)
19import warnings
20
21import numpy as np
22
23from pandas._libs import lib
24import pandas._libs.sparse as splib
25from pandas._libs.sparse import (
26 BlockIndex,
27 IntIndex,
28 SparseIndex,
29)
30from pandas._libs.tslibs import NaT
31from pandas._typing import (
32 ArrayLike,
33 AstypeArg,
34 Axis,
35 AxisInt,
36 Dtype,
37 NpDtype,
38 PositionalIndexer,
39 Scalar,
40 ScalarIndexer,
41 SequenceIndexer,
42 npt,
43)
44from pandas.compat.numpy import function as nv
45from pandas.errors import PerformanceWarning
46from pandas.util._exceptions import find_stack_level
47from pandas.util._validators import (
48 validate_bool_kwarg,
49 validate_insert_loc,
50)
51
52from pandas.core.dtypes.astype import astype_array
53from pandas.core.dtypes.cast import (
54 construct_1d_arraylike_from_scalar,
55 find_common_type,
56 maybe_box_datetimelike,
57)
58from pandas.core.dtypes.common import (
59 is_array_like,
60 is_bool_dtype,
61 is_datetime64_any_dtype,
62 is_datetime64tz_dtype,
63 is_dtype_equal,
64 is_integer,
65 is_list_like,
66 is_object_dtype,
67 is_scalar,
68 is_string_dtype,
69 pandas_dtype,
70)
71from pandas.core.dtypes.generic import (
72 ABCIndex,
73 ABCSeries,
74)
75from pandas.core.dtypes.missing import (
76 isna,
77 na_value_for_dtype,
78 notna,
79)
80
81from pandas.core import (
82 arraylike,
83 ops,
84)
85import pandas.core.algorithms as algos
86from pandas.core.arraylike import OpsMixin
87from pandas.core.arrays import ExtensionArray
88from pandas.core.arrays.sparse.dtype import SparseDtype
89from pandas.core.base import PandasObject
90import pandas.core.common as com
91from pandas.core.construction import (
92 ensure_wrapped_if_datetimelike,
93 extract_array,
94 sanitize_array,
95)
96from pandas.core.indexers import (
97 check_array_indexer,
98 unpack_tuple_and_ellipses,
99)
100from pandas.core.missing import interpolate_2d
101from pandas.core.nanops import check_below_min_count
102
103from pandas.io.formats import printing
104
# See https://github.com/python/typing/issues/684
if TYPE_CHECKING:
    from enum import Enum

    class ellipsis(Enum):
        Ellipsis = "..."

    Ellipsis = ellipsis.Ellipsis

    from scipy.sparse import spmatrix

    from pandas._typing import (
        FillnaOptions,
        NumpySorter,
    )

    SparseIndexKind = Literal["integer", "block"]

    from pandas import Series

else:
    ellipsis = type(Ellipsis)


# ----------------------------------------------------------------------------
# Array

SparseArrayT = TypeVar("SparseArrayT", bound="SparseArray")

_sparray_doc_kwargs = {"klass": "SparseArray"}


def _get_fill(arr: SparseArray) -> np.ndarray:
    """
    Create a 0-dim ndarray containing the fill value

    Parameters
    ----------
    arr : SparseArray

    Returns
    -------
    fill_value : ndarray
        0-dim ndarray with just the fill value.

    Notes
    -----
    coerce fill_value to arr dtype if possible
    int64 SparseArray can have NaN as fill_value if there are no missing values
    """
    try:
        return np.asarray(arr.fill_value, dtype=arr.dtype.subtype)
    except ValueError:
        return np.asarray(arr.fill_value)


def _sparse_array_op(
    left: SparseArray, right: SparseArray, op: Callable, name: str
) -> SparseArray:
    """
    Perform a binary operation between two sparse arrays.

    Parameters
    ----------
    left : SparseArray
    right : SparseArray
    op : Callable
        The binary operation to perform
    name : str
        Name of the callable.

    Returns
    -------
    SparseArray
    """
    if name.startswith("__"):
        # For lookups in _libs.sparse we need non-dunder op name
        name = name[2:-2]

    # dtype used to find corresponding sparse method
    ltype = left.dtype.subtype
    rtype = right.dtype.subtype

    if not is_dtype_equal(ltype, rtype):
        subtype = find_common_type([ltype, rtype])
        ltype = SparseDtype(subtype, left.fill_value)
        rtype = SparseDtype(subtype, right.fill_value)

        left = left.astype(ltype, copy=False)
        right = right.astype(rtype, copy=False)
        dtype = ltype.subtype
    else:
        dtype = ltype

    # dtype the result must have
    result_dtype = None

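    # Three cases, cheapest first:
    # 1. one side has no gaps (fully dense): operate densely and reuse
    #    that side's sparse index
    # 2. both sides share the same sparse index: operate on sp_values directly
    # 3. otherwise: dispatch to a sparse kernel in pandas._libs.sparse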
    if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
        with np.errstate(all="ignore"):
            result = op(left.to_dense(), right.to_dense())
            fill = op(_get_fill(left), _get_fill(right))

        if left.sp_index.ngaps == 0:
            index = left.sp_index
        else:
            index = right.sp_index
    elif left.sp_index.equals(right.sp_index):
        with np.errstate(all="ignore"):
            result = op(left.sp_values, right.sp_values)
            fill = op(_get_fill(left), _get_fill(right))
        index = left.sp_index
    else:
        if name[0] == "r":
            left, right = right, left
            name = name[1:]

        if name in ("and", "or", "xor") and dtype == "bool":
            opname = f"sparse_{name}_uint8"
            # to make template simple, cast here
            left_sp_values = left.sp_values.view(np.uint8)
            right_sp_values = right.sp_values.view(np.uint8)
            result_dtype = bool
        else:
            opname = f"sparse_{name}_{dtype}"
            left_sp_values = left.sp_values
            right_sp_values = right.sp_values

        if (
            name in ["floordiv", "mod"]
            and (right == 0).any()
            and left.dtype.kind in ["i", "u"]
        ):
            # Match the non-Sparse Series behavior
            opname = f"sparse_{name}_float64"
            left_sp_values = left_sp_values.astype("float64")
            right_sp_values = right_sp_values.astype("float64")

        sparse_op = getattr(splib, opname)

        with np.errstate(all="ignore"):
            result, index, fill = sparse_op(
                left_sp_values,
                left.sp_index,
                left.fill_value,
                right_sp_values,
                right.sp_index,
                right.fill_value,
            )

    if name == "divmod":
        # result is a 2-tuple
        # error: Incompatible return value type (got "Tuple[SparseArray,
        # SparseArray]", expected "SparseArray")
        return (  # type: ignore[return-value]
            _wrap_result(name, result[0], index, fill[0], dtype=result_dtype),
            _wrap_result(name, result[1], index, fill[1], dtype=result_dtype),
        )

    if result_dtype is None:
        result_dtype = result.dtype

    return _wrap_result(name, result, index, fill, dtype=result_dtype)


def _wrap_result(
    name: str, data, sparse_index, fill_value, dtype: Dtype | None = None
) -> SparseArray:
    """
    wrap op result to have correct dtype
    """
    if name.startswith("__"):
        # e.g. __eq__ --> eq
        name = name[2:-2]

    if name in ("eq", "ne", "lt", "gt", "le", "ge"):
        dtype = bool

    fill_value = lib.item_from_zerodim(fill_value)

    if is_bool_dtype(dtype):
        # fill_value may be np.bool_
        fill_value = bool(fill_value)
    return SparseArray(
        data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype
    )


class SparseArray(OpsMixin, PandasObject, ExtensionArray):
    """
    An ExtensionArray for storing sparse data.

    Parameters
    ----------
    data : array-like or scalar
        A dense array of values to store in the SparseArray. This may contain
        `fill_value`.
    sparse_index : SparseIndex, optional
    fill_value : scalar, optional
        Elements in data that are ``fill_value`` are not stored in the
        SparseArray. For memory savings, this should be the most common value
        in `data`. By default, `fill_value` depends on the dtype of `data`:

        =========== ==========
        data.dtype  na_value
        =========== ==========
        float       ``np.nan``
        int         ``0``
        bool        ``False``
        datetime64  ``pd.NaT``
        timedelta64 ``pd.NaT``
        =========== ==========

        The fill value is potentially specified in three ways. In order of
        precedence, these are

        1. The `fill_value` argument
        2. ``dtype.fill_value`` if `fill_value` is None and `dtype` is
           a ``SparseDtype``
        3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype`
           is not a ``SparseDtype`` and `data` is a ``SparseArray``.

    kind : str
        Can be 'integer' or 'block', default is 'integer'.
        The type of storage for sparse locations.

        * 'block': Stores a `block` and `block_length` for each
          contiguous *span* of sparse values. This is best when
          sparse data tends to be clumped together, with large
          regions of ``fill_value`` values between sparse values.
        * 'integer': uses an integer to store the location of
          each sparse value.

    dtype : np.dtype or SparseDtype, optional
        The dtype to use for the SparseArray. For numpy dtypes, this
        determines the dtype of ``self.sp_values``. For SparseDtype,
        this determines ``self.sp_values`` and ``self.fill_value``.
    copy : bool, default False
        Whether to explicitly copy the incoming `data` array.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> from pandas.arrays import SparseArray
    >>> arr = SparseArray([0, 0, 1, 2])
    >>> arr
    [0, 0, 1, 2]
    Fill: 0
    IntIndex
    Indices: array([2, 3], dtype=int32)
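
    A ``kind='block'`` index stores runs of non-fill values instead of
    individual positions (illustrative; the exact repr can vary slightly
    across versions):

    >>> arr = SparseArray([0, 0, 1, 2], kind="block")
    >>> arr.sp_index
    BlockIndex
    Block locations: array([2], dtype=int32)
    Block lengths: array([2], dtype=int32)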
    """

    _subtyp = "sparse_array"  # register ABCSparseArray
    _hidden_attrs = PandasObject._hidden_attrs | frozenset([])
    _sparse_index: SparseIndex
    _sparse_values: np.ndarray
    _dtype: SparseDtype

    def __init__(
        self,
        data,
        sparse_index=None,
        fill_value=None,
        kind: SparseIndexKind = "integer",
        dtype: Dtype | None = None,
        copy: bool = False,
    ) -> None:
        if fill_value is None and isinstance(dtype, SparseDtype):
            fill_value = dtype.fill_value

        if isinstance(data, type(self)):
            # disable normal inference on dtype, sparse_index, & fill_value
            if sparse_index is None:
                sparse_index = data.sp_index
            if fill_value is None:
                fill_value = data.fill_value
            if dtype is None:
                dtype = data.dtype
            # TODO: make kind=None, and use data.kind?
            data = data.sp_values

        # Handle user-provided dtype
        if isinstance(dtype, str):
            # Two options: dtype='int', regular numpy dtype
            # or dtype='Sparse[int]', a sparse dtype
            try:
                dtype = SparseDtype.construct_from_string(dtype)
            except TypeError:
                dtype = pandas_dtype(dtype)

        if isinstance(dtype, SparseDtype):
            if fill_value is None:
                fill_value = dtype.fill_value
            dtype = dtype.subtype

        if is_scalar(data):
            if sparse_index is None:
                npoints = 1
            else:
                npoints = sparse_index.length

            data = construct_1d_arraylike_from_scalar(data, npoints, dtype=None)
            dtype = data.dtype

        if dtype is not None:
            dtype = pandas_dtype(dtype)

        # TODO: disentangle the fill_value dtype inference from
        # dtype inference
        if data is None:
            # TODO: What should the empty dtype be? Object or float?

            # error: Argument "dtype" to "array" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], None]"; expected "Union[dtype[Any],
            # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any,
            # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
            data = np.array([], dtype=dtype)  # type: ignore[arg-type]

        if not is_array_like(data):
            try:
                # probably shared code in sanitize_series

                data = sanitize_array(data, index=None)
            except ValueError:
                # NumPy may raise a ValueError on data like [1, []]
                # we retry with object dtype here.
                if dtype is None:
                    dtype = np.dtype(object)
                    data = np.atleast_1d(np.asarray(data, dtype=dtype))
                else:
                    raise

        if copy:
            # TODO: avoid double copy when dtype forces cast.
            data = data.copy()

        if fill_value is None:
            fill_value_dtype = data.dtype if dtype is None else dtype
            if fill_value_dtype is None:
                fill_value = np.nan
            else:
                fill_value = na_value_for_dtype(fill_value_dtype)

        if isinstance(data, type(self)) and sparse_index is None:
            sparse_index = data._sparse_index
            # error: Argument "dtype" to "asarray" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], None]"; expected "None"
            sparse_values = np.asarray(
                data.sp_values, dtype=dtype  # type: ignore[arg-type]
            )
        elif sparse_index is None:
            data = extract_array(data, extract_numpy=True)
            if not isinstance(data, np.ndarray):
                # EA
                if is_datetime64tz_dtype(data.dtype):
                    warnings.warn(
                        f"Creating SparseArray from {data.dtype} data "
                        "loses timezone information. Cast to object before "
                        "sparse to retain timezone information.",
                        UserWarning,
                        stacklevel=find_stack_level(),
                    )
                    data = np.asarray(data, dtype="datetime64[ns]")
                    if fill_value is NaT:
                        fill_value = np.datetime64("NaT", "ns")
                data = np.asarray(data)
            sparse_values, sparse_index, fill_value = _make_sparse(
                # error: Argument "dtype" to "_make_sparse" has incompatible type
                # "Union[ExtensionDtype, dtype[Any], None]"; expected
                # "Optional[dtype[Any]]"
                data,
                kind=kind,
                fill_value=fill_value,
                dtype=dtype,  # type: ignore[arg-type]
            )
        else:
            # error: Argument "dtype" to "asarray" has incompatible type
            # "Union[ExtensionDtype, dtype[Any], None]"; expected "None"
            sparse_values = np.asarray(data, dtype=dtype)  # type: ignore[arg-type]
            if len(sparse_values) != sparse_index.npoints:
                raise AssertionError(
                    f"Non array-like type {type(sparse_values)} must "
                    "have the same length as the index"
                )
        self._sparse_index = sparse_index
        self._sparse_values = sparse_values
        self._dtype = SparseDtype(sparse_values.dtype, fill_value)

    @classmethod
    def _simple_new(
        cls: type[SparseArrayT],
        sparse_array: np.ndarray,
        sparse_index: SparseIndex,
        dtype: SparseDtype,
    ) -> SparseArrayT:
        new = object.__new__(cls)
        new._sparse_index = sparse_index
        new._sparse_values = sparse_array
        new._dtype = dtype
        return new

    @classmethod
    def from_spmatrix(cls: type[SparseArrayT], data: spmatrix) -> SparseArrayT:
        """
        Create a SparseArray from a scipy.sparse matrix.

        Parameters
        ----------
        data : scipy.sparse.spmatrix
            This should be a SciPy sparse matrix where the size
            of the second dimension is 1. In other words, a
            sparse matrix with a single column.

        Returns
        -------
        SparseArray

        Examples
        --------
        >>> import scipy.sparse
        >>> mat = scipy.sparse.coo_matrix((4, 1))
        >>> pd.arrays.SparseArray.from_spmatrix(mat)
        [0.0, 0.0, 0.0, 0.0]
        Fill: 0.0
        IntIndex
        Indices: array([], dtype=int32)
        """
        length, ncol = data.shape

        if ncol != 1:
            raise ValueError(f"'data' must have a single column, not '{ncol}'")

        # our sparse index classes require that the positions be strictly
        # increasing, so we need to sort the indices and values accordingly
        data = data.tocsc()
        data.sort_indices()
        arr = data.data
        idx = data.indices

        zero = np.array(0, dtype=arr.dtype).item()
        dtype = SparseDtype(arr.dtype, zero)
        index = IntIndex(length, idx)

        return cls._simple_new(arr, index, dtype)

    def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
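        # Densify to a plain ndarray: place sp_values at their indices on a
        # background of fill_value. E.g. (roughly) np.asarray(
        # SparseArray([0, 0, 1], fill_value=0)) gives array([0, 0, 1]).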
        fill_value = self.fill_value

        if self.sp_index.ngaps == 0:
            # Compat for na dtype and int values.
            return self.sp_values
        if dtype is None:
            # Can NumPy represent this type?
            # If not, `np.result_type` will raise. We catch that
            # and return object.
            if is_datetime64_any_dtype(self.sp_values.dtype):
                # However, we *do* special-case the common case of
                # a datetime64 with pandas NaT.
                if fill_value is NaT:
                    # Can't put pd.NaT in a datetime64[ns]
                    fill_value = np.datetime64("NaT")
            try:
                dtype = np.result_type(self.sp_values.dtype, type(fill_value))
            except TypeError:
                dtype = object

        out = np.full(self.shape, fill_value, dtype=dtype)
        out[self.sp_index.indices] = self.sp_values
        return out

    def __setitem__(self, key, value):
        # I suppose we could allow setting of non-fill_value elements.
        # TODO(SparseArray.__setitem__): remove special cases in
        # ExtensionBlock.where
        msg = "SparseArray does not support item assignment via setitem"
        raise TypeError(msg)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        return cls(scalars, dtype=dtype)

    @classmethod
    def _from_factorized(cls, values, original):
        return cls(values, dtype=original.dtype)

    # ------------------------------------------------------------------------
    # Data
    # ------------------------------------------------------------------------
    @property
    def sp_index(self) -> SparseIndex:
        """
        The SparseIndex containing the location of non- ``fill_value`` points.
        """
        return self._sparse_index

    @property
    def sp_values(self) -> np.ndarray:
        """
        An ndarray containing the non- ``fill_value`` values.

        Examples
        --------
        >>> from pandas.arrays import SparseArray
        >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0)
        >>> s.sp_values
        array([1, 2])
        """
        return self._sparse_values

    @property
    def dtype(self) -> SparseDtype:
        return self._dtype

    @property
    def fill_value(self):
        """
        Elements in `data` that are `fill_value` are not stored.

        For memory savings, this should be the most common value in the array.
        """
        return self.dtype.fill_value

    @fill_value.setter
    def fill_value(self, value) -> None:
        self._dtype = SparseDtype(self.dtype.subtype, value)

    @property
    def kind(self) -> SparseIndexKind:
        """
        The kind of sparse index for this array. One of {'integer', 'block'}.
        """
        if isinstance(self.sp_index, IntIndex):
            return "integer"
        else:
            return "block"

    @property
    def _valid_sp_values(self) -> np.ndarray:
        sp_vals = self.sp_values
        mask = notna(sp_vals)
        return sp_vals[mask]

    def __len__(self) -> int:
        return self.sp_index.length

    @property
    def _null_fill_value(self) -> bool:
        return self._dtype._is_na_fill_value

    def _fill_value_matches(self, fill_value) -> bool:
        if self._null_fill_value:
            return isna(fill_value)
        else:
            return self.fill_value == fill_value

    @property
    def nbytes(self) -> int:
        return self.sp_values.nbytes + self.sp_index.nbytes

    @property
    def density(self) -> float:
        """
        The percent of non- ``fill_value`` points, as decimal.

        Examples
        --------
        >>> from pandas.arrays import SparseArray
        >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
        >>> s.density
        0.6
        """
        return self.sp_index.npoints / self.sp_index.length

    @property
    def npoints(self) -> int:
        """
        The number of non- ``fill_value`` points.

        Examples
        --------
        >>> from pandas.arrays import SparseArray
        >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
        >>> s.npoints
        3
        """
        return self.sp_index.npoints

    def isna(self):
        # If null fill value, we want SparseDtype[bool, true]
        # to preserve the same memory usage.
        dtype = SparseDtype(bool, self._null_fill_value)
        if self._null_fill_value:
            return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
        mask = np.full(len(self), False, dtype=np.bool_)
        mask[self.sp_index.indices] = isna(self.sp_values)
        return type(self)(mask, fill_value=False, dtype=dtype)

    def fillna(
        self: SparseArrayT,
        value=None,
        method: FillnaOptions | None = None,
        limit: int | None = None,
    ) -> SparseArrayT:
        """
        Fill missing values with `value`.

        Parameters
        ----------
        value : scalar, optional
        method : str, optional

            .. warning::

               Using 'method' will result in high memory use,
               as the entire array is first converted to an
               in-memory ndarray.

        limit : int, optional

        Returns
        -------
        SparseArray

        Notes
        -----
        When `value` is specified, the result's ``fill_value`` depends on
        ``self.fill_value``. The goal is to maintain low-memory use.

        If ``self.fill_value`` is NA, the result dtype will be
        ``SparseDtype(self.dtype, fill_value=value)``. This will preserve
        the amount of memory used before and after filling.

        When ``self.fill_value`` is not NA, the result dtype will be
        ``self.dtype``. Again, this preserves the amount of memory used.
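
        Examples
        --------
        A quick sketch with a null fill value (the filled value becomes the
        new fill value; output shown as of recent pandas versions):

        >>> arr = pd.arrays.SparseArray([np.nan, 0.0, 1.0])
        >>> arr.fillna(0.0)
        [0.0, 0.0, 1.0]
        Fill: 0.0
        IntIndex
        Indices: array([1, 2], dtype=int32)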
        """
        if (method is None and value is None) or (
            method is not None and value is not None
        ):
            raise ValueError("Must specify one of 'method' or 'value'.")

        if method is not None:
            msg = "fillna with 'method' requires high memory usage."
            warnings.warn(
                msg,
                PerformanceWarning,
                stacklevel=find_stack_level(),
            )
            new_values = np.asarray(self)
            # interpolate_2d modifies new_values inplace
            interpolate_2d(new_values, method=method, limit=limit)
            return type(self)(new_values, fill_value=self.fill_value)

        else:
            new_values = np.where(isna(self.sp_values), value, self.sp_values)

            if self._null_fill_value:
                # This is essentially just updating the dtype.
                new_dtype = SparseDtype(self.dtype.subtype, fill_value=value)
            else:
                new_dtype = self.dtype

        return self._simple_new(new_values, self._sparse_index, new_dtype)

    def shift(self: SparseArrayT, periods: int = 1, fill_value=None) -> SparseArrayT:
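        # Roughly: shift by stitching together `periods` fill values and a
        # truncated copy of self, e.g. a shift of 1 on [1, 2, 3] gives
        # [fill_value, 1, 2] (a sketch of the concat-based approach below).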
        if not len(self) or periods == 0:
            return self.copy()

        if isna(fill_value):
            fill_value = self.dtype.na_value

        subtype = np.result_type(fill_value, self.dtype.subtype)

        if subtype != self.dtype.subtype:
            # just coerce up front
            arr = self.astype(SparseDtype(subtype, self.fill_value))
        else:
            arr = self

        empty = self._from_sequence(
            [fill_value] * min(abs(periods), len(self)), dtype=arr.dtype
        )

        if periods > 0:
            a = empty
            b = arr[:-periods]
        else:
            a = arr[abs(periods) :]
            b = empty
        return arr._concat_same_type([a, b])

    def _first_fill_value_loc(self):
        """
        Get the location of the first fill value.

        Returns
        -------
        int
        """
        if len(self) == 0 or self.sp_index.npoints == len(self):
            return -1

        indices = self.sp_index.indices
        if not len(indices) or indices[0] > 0:
            return 0

        # Append a value larger than 1 so that the case where the fill
        # value appears only in the tail of the array is also caught.
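        # E.g. (roughly) for length 5 with stored indices [0, 1, 3]:
        # diff = [1, 2, 2]; the first diff > 1 is at position 1, and
        # indices[1] + 1 == 2 is the first fill-value location.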
        diff = np.r_[np.diff(indices), 2]
        return indices[(diff > 1).argmax()] + 1

    def unique(self: SparseArrayT) -> SparseArrayT:
        uniques = algos.unique(self.sp_values)
        if len(self.sp_values) != len(self):
            fill_loc = self._first_fill_value_loc()
            # In order to align with the behavior of pd.unique or
            # pd.Series.unique, we should keep the original order;
            # here we use unique again to find the insertion place.
            # Since the length of sp_values is not large, the minor
            # performance hit is worth the correctness.
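            # E.g. (roughly) [1, 0, 0, 2] with fill value 0: sp_values are
            # [1, 2], the first fill value is at position 1, and one unique
            # stored value precedes it, so 0 is inserted at slot 1,
            # giving [1, 0, 2] -- matching pd.unique([1, 0, 0, 2]).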
            insert_loc = len(algos.unique(self.sp_values[:fill_loc]))
            uniques = np.insert(uniques, insert_loc, self.fill_value)
        return type(self)._from_sequence(uniques, dtype=self.dtype)

    def _values_for_factorize(self):
        # Still override this for hash_pandas_object
        return np.asarray(self), self.fill_value

    def factorize(
        self,
        use_na_sentinel: bool = True,
    ) -> tuple[np.ndarray, SparseArray]:
        # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
        # The sparsity on this is backwards from what Sparse would want. Want
        # ExtensionArray.factorize -> Tuple[EA, EA]
        # Given that we have to return a dense array of codes, why bother
        # implementing an efficient factorize?
        codes, uniques = algos.factorize(
            np.asarray(self), use_na_sentinel=use_na_sentinel
        )
        uniques_sp = SparseArray(uniques, dtype=self.dtype)
        return codes, uniques_sp

    def value_counts(self, dropna: bool = True) -> Series:
        """
        Returns a Series containing counts of unique values.

        Parameters
        ----------
        dropna : bool, default True
            Don't include counts of NaN, even if NaN is in sp_values.

        Returns
        -------
        counts : Series
        """
        from pandas import (
            Index,
            Series,
        )

        keys, counts = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
        fcounts = self.sp_index.ngaps
        if fcounts > 0 and (not self._null_fill_value or not dropna):
            mask = isna(keys) if self._null_fill_value else keys == self.fill_value
            if mask.any():
                counts[mask] += fcounts
            else:
                # error: Argument 1 to "insert" has incompatible type "Union[
                # ExtensionArray,ndarray[Any, Any]]"; expected "Union[
                # _SupportsArray[dtype[Any]], Sequence[_SupportsArray[dtype
                # [Any]]], Sequence[Sequence[_SupportsArray[dtype[Any]]]],
                # Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]], Sequence
                # [Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]]]]"
                keys = np.insert(keys, 0, self.fill_value)  # type: ignore[arg-type]
                counts = np.insert(counts, 0, fcounts)

        if not isinstance(keys, ABCIndex):
            index = Index(keys)
        else:
            index = keys
        return Series(counts, index=index, copy=False)

    # --------
    # Indexing
    # --------
    @overload
    def __getitem__(self, key: ScalarIndexer) -> Any:
        ...

    @overload
    def __getitem__(
        self: SparseArrayT,
        key: SequenceIndexer | tuple[int | ellipsis, ...],
    ) -> SparseArrayT:
        ...

    def __getitem__(
        self: SparseArrayT,
        key: PositionalIndexer | tuple[int | ellipsis, ...],
    ) -> SparseArrayT | Any:
        if isinstance(key, tuple):
            key = unpack_tuple_and_ellipses(key)
            if key is Ellipsis:
                raise ValueError("Cannot slice with Ellipsis")

        if is_integer(key):
            return self._get_val_at(key)
        elif isinstance(key, tuple):
            # error: Invalid index type "Tuple[Union[int, ellipsis], ...]"
            # for "ndarray[Any, Any]"; expected type
            # "Union[SupportsIndex, _SupportsArray[dtype[Union[bool_,
            # integer[Any]]]], _NestedSequence[_SupportsArray[dtype[
            # Union[bool_, integer[Any]]]]], _NestedSequence[Union[
            # bool, int]], Tuple[Union[SupportsIndex, _SupportsArray[
            # dtype[Union[bool_, integer[Any]]]], _NestedSequence[
            # _SupportsArray[dtype[Union[bool_, integer[Any]]]]],
            # _NestedSequence[Union[bool, int]]], ...]]"
            data_slice = self.to_dense()[key]  # type: ignore[index]
        elif isinstance(key, slice):
            # Avoid densifying when handling contiguous slices
            if key.step is None or key.step == 1:
                start = 0 if key.start is None else key.start
                if start < 0:
                    start += len(self)

                end = len(self) if key.stop is None else key.stop
                if end < 0:
                    end += len(self)

                indices = self.sp_index.indices
                keep_inds = np.flatnonzero((indices >= start) & (indices < end))
                sp_vals = self.sp_values[keep_inds]

                sp_index = indices[keep_inds].copy()

                # If we've sliced to not include the start of the array, all our indices
                # should be shifted. NB: here we are careful to also not shift by a
                # negative value for a case like [0, 1][-100:] where the start index
                # should be treated like 0
                if start > 0:
                    sp_index -= start

                # Length of our result should match applying this slice to a range
                # of the length of our original array
                new_len = len(range(len(self))[key])
                new_sp_index = make_sparse_index(new_len, sp_index, self.kind)
                return type(self)._simple_new(sp_vals, new_sp_index, self.dtype)
            else:
                indices = np.arange(len(self), dtype=np.int32)[key]
                return self.take(indices)

        elif not is_list_like(key):
            # e.g. "foo" or 2.5
            # exception message copied from numpy
            raise IndexError(
                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
                r"(`None`) and integer or boolean arrays are valid indices"
            )

        else:
            if isinstance(key, SparseArray):
                # NOTE: If we guarantee that SparseDtype(bool)
                # has only fill_value True, False, or NaN
                # (see GH PR 44955)
                # we can apply the mask very fast:
                if is_bool_dtype(key):
                    if isna(key.fill_value):
                        return self.take(key.sp_index.indices[key.sp_values])
                    if not key.fill_value:
                        return self.take(key.sp_index.indices)
                    n = len(self)
                    mask = np.full(n, True, dtype=np.bool_)
                    mask[key.sp_index.indices] = False
                    return self.take(np.arange(n)[mask])
                else:
                    key = np.asarray(key)

            key = check_array_indexer(self, key)

            if com.is_bool_indexer(key):
                # mypy doesn't know we have an array here
                key = cast(np.ndarray, key)
                return self.take(np.arange(len(key), dtype=np.int32)[key])
            elif hasattr(key, "__len__"):
                return self.take(key)
            else:
                raise ValueError(f"Cannot slice with '{key}'")

        return type(self)(data_slice, kind=self.kind)

    def _get_val_at(self, loc):
        loc = validate_insert_loc(loc, len(self))

        sp_loc = self.sp_index.lookup(loc)
        if sp_loc == -1:
            return self.fill_value
        else:
            val = self.sp_values[sp_loc]
            val = maybe_box_datetimelike(val, self.sp_values.dtype)
            return val

    def take(
        self: SparseArrayT, indices, *, allow_fill: bool = False, fill_value=None
    ) -> SparseArrayT:
        if is_scalar(indices):
            raise ValueError(f"'indices' must be an array, not a scalar '{indices}'.")
        indices = np.asarray(indices, dtype=np.int32)

        dtype = None
        if indices.size == 0:
            result = np.array([], dtype="object")
            dtype = self.dtype
        elif allow_fill:
            result = self._take_with_fill(indices, fill_value=fill_value)
        else:
            return self._take_without_fill(indices)

        return type(self)(
            result, fill_value=self.fill_value, kind=self.kind, dtype=dtype
        )

    def _take_with_fill(self, indices, fill_value=None) -> np.ndarray:
        if fill_value is None:
            fill_value = self.dtype.na_value

        if indices.min() < -1:
            raise ValueError(
                "Invalid value in 'indices'. Must be between -1 "
                "and the length of the array."
            )

        if indices.max() >= len(self):
            raise IndexError("out of bounds value in 'indices'.")

        if len(self) == 0:
            # Empty... Allow taking only if all empty
            if (indices == -1).all():
                dtype = np.result_type(self.sp_values, type(fill_value))
                taken = np.empty_like(indices, dtype=dtype)
                taken.fill(fill_value)
                return taken
            else:
                raise IndexError("cannot do a non-empty take from an empty axes.")

        # sp_indexer may be -1 for two reasons
        # 1.) we took for an index of -1 (new)
        # 2.) we took a value that was self.fill_value (old)
        sp_indexer = self.sp_index.lookup_array(indices)
        new_fill_indices = indices == -1
        old_fill_indices = (sp_indexer == -1) & ~new_fill_indices

        if self.sp_index.npoints == 0 and old_fill_indices.all():
            # We've looked up all valid points on an all-sparse array.
            taken = np.full(
                sp_indexer.shape, fill_value=self.fill_value, dtype=self.dtype.subtype
            )

        elif self.sp_index.npoints == 0:
            # Avoid taking from the empty self.sp_values
            _dtype = np.result_type(self.dtype.subtype, type(fill_value))
            taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype)
        else:
            taken = self.sp_values.take(sp_indexer)

            # Fill in two steps.
            # Old fill values
            # New fill values
            # potentially coercing to a new dtype at each stage.

            m0 = sp_indexer[old_fill_indices] < 0
            m1 = sp_indexer[new_fill_indices] < 0

            result_type = taken.dtype

            if m0.any():
                result_type = np.result_type(result_type, type(self.fill_value))
                taken = taken.astype(result_type)
                taken[old_fill_indices] = self.fill_value

            if m1.any():
                result_type = np.result_type(result_type, type(fill_value))
                taken = taken.astype(result_type)
                taken[new_fill_indices] = fill_value

        return taken

    def _take_without_fill(self: SparseArrayT, indices) -> SparseArrayT:
        to_shift = indices < 0

        n = len(self)

        if (indices.max() >= n) or (indices.min() < -n):
            if n == 0:
                raise IndexError("cannot do a non-empty take from an empty axes.")
            raise IndexError("out of bounds value in 'indices'.")

        if to_shift.any():
            indices = indices.copy()
            indices[to_shift] += n

        sp_indexer = self.sp_index.lookup_array(indices)
        value_mask = sp_indexer != -1
        new_sp_values = self.sp_values[sp_indexer[value_mask]]

        value_indices = np.flatnonzero(value_mask).astype(np.int32, copy=False)

        new_sp_index = make_sparse_index(len(indices), value_indices, kind=self.kind)
        return type(self)._simple_new(new_sp_values, new_sp_index, dtype=self.dtype)

    def searchsorted(
        self,
        v: ArrayLike | object,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter = None,
    ) -> npt.NDArray[np.intp] | np.intp:
        msg = "searchsorted requires high memory usage."
        warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())
        v = np.asarray(v)
        return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter)

    def copy(self: SparseArrayT) -> SparseArrayT:
        values = self.sp_values.copy()
        return self._simple_new(values, self.sp_index, self.dtype)

    @classmethod
    def _concat_same_type(
        cls: type[SparseArrayT], to_concat: Sequence[SparseArrayT]
    ) -> SparseArrayT:
        fill_value = to_concat[0].fill_value

        values = []
        length = 0

        if to_concat:
            sp_kind = to_concat[0].kind
        else:
            sp_kind = "integer"

        sp_index: SparseIndex
        if sp_kind == "integer":
            indices = []

            for arr in to_concat:
                int_idx = arr.sp_index.indices.copy()
                int_idx += length  # TODO: wraparound
                length += arr.sp_index.length

                values.append(arr.sp_values)
                indices.append(int_idx)

            data = np.concatenate(values)
            indices_arr = np.concatenate(indices)
            # error: Argument 2 to "IntIndex" has incompatible type
            # "ndarray[Any, dtype[signedinteger[_32Bit]]]";
            # expected "Sequence[int]"
            sp_index = IntIndex(length, indices_arr)  # type: ignore[arg-type]

        else:
            # when concatenating block indices, we don't claim that you'll
            # get an identical index as concatenating the values and then
            # creating a new index. We don't want to spend the time trying
            # to merge blocks across arrays in `to_concat`, so the resulting
            # BlockIndex may have more blocks.
            blengths = []
            blocs = []

            for arr in to_concat:
                block_idx = arr.sp_index.to_block_index()

                values.append(arr.sp_values)
                blocs.append(block_idx.blocs.copy() + length)
                blengths.append(block_idx.blengths)
                length += arr.sp_index.length

            data = np.concatenate(values)
            blocs_arr = np.concatenate(blocs)
            blengths_arr = np.concatenate(blengths)

            sp_index = BlockIndex(length, blocs_arr, blengths_arr)

        return cls(data, sparse_index=sp_index, fill_value=fill_value)

    def astype(self, dtype: AstypeArg | None = None, copy: bool = True):
        """
        Change the dtype of a SparseArray.

        The output will always be a SparseArray. To convert to a dense
        ndarray with a certain dtype, use :meth:`numpy.asarray`.

        Parameters
        ----------
        dtype : np.dtype or ExtensionDtype
            For SparseDtype, this changes the dtype of
            ``self.sp_values`` and the ``self.fill_value``.

            For other dtypes, this only changes the dtype of
            ``self.sp_values``.

        copy : bool, default True
            Whether to ensure a copy is made, even if not necessary.

        Returns
        -------
        SparseArray

        Examples
        --------
        >>> arr = pd.arrays.SparseArray([0, 0, 1, 2])
        >>> arr
        [0, 0, 1, 2]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        >>> arr.astype(SparseDtype(np.dtype('int32')))
        [0, 0, 1, 2]
        Fill: 0
        IntIndex
        Indices: array([2, 3], dtype=int32)

        Using a NumPy dtype with a different kind (e.g. float) will coerce
        just ``self.sp_values``.

        >>> arr.astype(SparseDtype(np.dtype('float64')))
        ... # doctest: +NORMALIZE_WHITESPACE
        [nan, nan, 1.0, 2.0]
        Fill: nan
        IntIndex
        Indices: array([2, 3], dtype=int32)

        Using a SparseDtype, you can also change the fill value.

        >>> arr.astype(SparseDtype("float64", fill_value=0.0))
        ... # doctest: +NORMALIZE_WHITESPACE
        [0.0, 0.0, 1.0, 2.0]
        Fill: 0.0
        IntIndex
        Indices: array([2, 3], dtype=int32)
        """
        if is_dtype_equal(dtype, self._dtype):
            if not copy:
                return self
            else:
                return self.copy()

        future_dtype = pandas_dtype(dtype)
        if not isinstance(future_dtype, SparseDtype):
            # GH#34457
            values = np.asarray(self)
            values = ensure_wrapped_if_datetimelike(values)
            return astype_array(values, dtype=future_dtype, copy=False)

        dtype = self.dtype.update_dtype(dtype)
        subtype = pandas_dtype(dtype._subtype_with_str)
        subtype = cast(np.dtype, subtype)  # ensured by update_dtype
        values = ensure_wrapped_if_datetimelike(self.sp_values)
        sp_values = astype_array(values, subtype, copy=copy)
        sp_values = np.asarray(sp_values)

        return self._simple_new(sp_values, self.sp_index, dtype)

    def map(self: SparseArrayT, mapper) -> SparseArrayT:
        """
        Map values using an input mapping or function.

        Parameters
        ----------
        mapper : dict, Series, callable
            The correspondence from old values to new.

        Returns
        -------
        SparseArray
            The output array will have the same density as the input.
            The output fill value will be the result of applying the
            mapping to ``self.fill_value``.

        Examples
        --------
        >>> arr = pd.arrays.SparseArray([0, 1, 2])
        >>> arr.map(lambda x: x + 10)
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)

        >>> arr.map({0: 10, 1: 11, 2: 12})
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)

        >>> arr.map(pd.Series([10, 11, 12], index=[0, 1, 2]))
        [10, 11, 12]
        Fill: 10
        IntIndex
        Indices: array([1, 2], dtype=int32)
        """
        # this is used in apply.
        # We get hit since we're an "is_extension_array_dtype" but regular extension
        # types are not hit. This may be worth adding to the interface.
        if isinstance(mapper, ABCSeries):
            mapper = mapper.to_dict()

        if isinstance(mapper, abc.Mapping):
            fill_value = mapper.get(self.fill_value, self.fill_value)
            sp_values = [mapper.get(x, None) for x in self.sp_values]
        else:
            fill_value = mapper(self.fill_value)
            sp_values = [mapper(x) for x in self.sp_values]

        return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_value)

    def to_dense(self) -> np.ndarray:
        """
        Convert SparseArray to a NumPy array.

        Returns
        -------
        arr : NumPy array
        """
        return np.asarray(self, dtype=self.sp_values.dtype)

    def _where(self, mask, value):
        # NB: may not preserve dtype, e.g. result may be Sparse[float64]
        # while self is Sparse[int64]
        naive_implementation = np.where(mask, self, value)
        dtype = SparseDtype(naive_implementation.dtype, fill_value=self.fill_value)
        result = type(self)._from_sequence(naive_implementation, dtype=dtype)
        return result

    # ------------------------------------------------------------------------
    # IO
    # ------------------------------------------------------------------------
    def __setstate__(self, state) -> None:
        """Necessary for making this object picklable"""
        if isinstance(state, tuple):
            # Compat for pandas < 0.24.0
            nd_state, (fill_value, sp_index) = state
            sparse_values = np.array([])
            sparse_values.__setstate__(nd_state)

            self._sparse_values = sparse_values
            self._sparse_index = sp_index
            self._dtype = SparseDtype(sparse_values.dtype, fill_value)
        else:
            self.__dict__.update(state)

    def nonzero(self) -> tuple[npt.NDArray[np.int32]]:
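        # Indices of the nonzero stored values; when fill_value is zero the
        # gaps are zeros too, so sp_index.indices is exactly the answer.
        # E.g. (roughly) SparseArray([0, 1, 0, 2]).nonzero()
        # -> (array([1, 3], dtype=int32),)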
        if self.fill_value == 0:
            return (self.sp_index.indices,)
        else:
            return (self.sp_index.indices[self.sp_values != 0],)

    # ------------------------------------------------------------------------
    # Reductions
    # ------------------------------------------------------------------------

    def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
        method = getattr(self, name, None)

        if method is None:
            raise TypeError(f"cannot perform {name} with type {self.dtype}")

        if skipna:
            arr = self
        else:
            arr = self.dropna()

        return getattr(arr, name)(**kwargs)

    def all(self, axis=None, *args, **kwargs):
        """
        Tests whether all elements evaluate to True

        Returns
        -------
        all : bool

        See Also
        --------
        numpy.all
        """
        nv.validate_all(args, kwargs)

        values = self.sp_values

        if len(values) != len(self) and not np.all(self.fill_value):
            return False

        return values.all()

    def any(self, axis: AxisInt = 0, *args, **kwargs):
        """
        Tests whether at least one element evaluates to True

        Returns
        -------
        any : bool

        See Also
        --------
        numpy.any
        """
        nv.validate_any(args, kwargs)

        values = self.sp_values

        if len(values) != len(self) and np.any(self.fill_value):
            return True

        return values.any().item()

    def sum(
        self,
        axis: AxisInt = 0,
        min_count: int = 0,
        skipna: bool = True,
        *args,
        **kwargs,
    ) -> Scalar:
        """
        Sum of non-NA/null values

        Parameters
        ----------
        axis : int, default 0
            Not Used. NumPy compatibility.
        min_count : int, default 0
            The required number of valid values to perform the summation. If fewer
            than ``min_count`` valid values are present, the result will be the
            missing value indicator for the subarray type.
        *args, **kwargs
            Not Used. NumPy compatibility.

        Returns
        -------
        scalar
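
        Examples
        --------
        A quick sketch; fill values count toward the sum when the fill
        value is not NA:

        >>> from pandas.arrays import SparseArray
        >>> SparseArray([1, 0, 2]).sum()
        3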
        """
        nv.validate_sum(args, kwargs)
        valid_vals = self._valid_sp_values
        sp_sum = valid_vals.sum()
        has_na = self.sp_index.ngaps > 0 and not self._null_fill_value

        if has_na and not skipna:
            return na_value_for_dtype(self.dtype.subtype, compat=False)

        if self._null_fill_value:
            if check_below_min_count(valid_vals.shape, None, min_count):
                return na_value_for_dtype(self.dtype.subtype, compat=False)
            return sp_sum
        else:
            nsparse = self.sp_index.ngaps
            if check_below_min_count(valid_vals.shape, None, min_count - nsparse):
                return na_value_for_dtype(self.dtype.subtype, compat=False)
            return sp_sum + self.fill_value * nsparse

    def cumsum(self, axis: AxisInt = 0, *args, **kwargs) -> SparseArray:
        """
        Cumulative sum of non-NA/null values.

        When performing the cumulative summation, any NA/null values will
        be skipped. The resulting SparseArray will preserve the locations of
        NaN values, but the fill value will be `np.nan` regardless.

        Parameters
        ----------
        axis : int or None
            Axis over which to perform the cumulative summation. If None,
            perform cumulative summation over flattened array.

        Returns
        -------
        cumsum : SparseArray
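
        Examples
        --------
        An illustrative sketch (with a null fill value, only the stored
        values are accumulated and the NaN location is preserved):

        >>> pd.arrays.SparseArray([1.0, np.nan, 2.0]).cumsum()
        [1.0, nan, 3.0]
        Fill: nan
        IntIndex
        Indices: array([0, 2], dtype=int32)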
        """
        nv.validate_cumsum(args, kwargs)

        if axis is not None and axis >= self.ndim:  # Mimic ndarray behaviour.
            raise ValueError(f"axis(={axis}) out of bounds")

        if not self._null_fill_value:
            return SparseArray(self.to_dense()).cumsum()

        return SparseArray(
            self.sp_values.cumsum(),
            sparse_index=self.sp_index,
            fill_value=self.fill_value,
        )

    def mean(self, axis: Axis = 0, *args, **kwargs):
        """
        Mean of non-NA/null values

        Returns
        -------
        mean : float
        """
        nv.validate_mean(args, kwargs)
        valid_vals = self._valid_sp_values
        sp_sum = valid_vals.sum()
        ct = len(valid_vals)

        if self._null_fill_value:
            return sp_sum / ct
        else:
            nsparse = self.sp_index.ngaps
            return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)

    def max(self, *, axis: AxisInt | None = None, skipna: bool = True):
        """
        Max of array values, ignoring NA values if specified.

        Parameters
        ----------
        axis : int, default None
            Not Used. NumPy compatibility.
        skipna : bool, default True
            Whether to ignore NA values.

        Returns
        -------
        scalar
        """
        nv.validate_minmax_axis(axis, self.ndim)
        return self._min_max("max", skipna=skipna)

    def min(self, *, axis: AxisInt | None = None, skipna: bool = True):
        """
        Min of array values, ignoring NA values if specified.

        Parameters
        ----------
        axis : int, default None
            Not Used. NumPy compatibility.
        skipna : bool, default True
            Whether to ignore NA values.

        Returns
        -------
        scalar
        """
        nv.validate_minmax_axis(axis, self.ndim)
        return self._min_max("min", skipna=skipna)

    def _min_max(self, kind: Literal["min", "max"], skipna: bool) -> Scalar:
        """
        Min/max of non-NA/null values

        Parameters
        ----------
        kind : {"min", "max"}
        skipna : bool

        Returns
        -------
        scalar
        """
        valid_vals = self._valid_sp_values
        has_nonnull_fill_vals = not self._null_fill_value and self.sp_index.ngaps > 0

        if len(valid_vals) > 0:
            sp_min_max = getattr(valid_vals, kind)()

            # If a non-null fill value is currently present, it might be the min/max
            if has_nonnull_fill_vals:
                func = max if kind == "max" else min
                return func(sp_min_max, self.fill_value)
            elif skipna:
                return sp_min_max
            elif self.sp_index.ngaps == 0:
                # No NAs present
                return sp_min_max
            else:
                return na_value_for_dtype(self.dtype.subtype, compat=False)
        elif has_nonnull_fill_vals:
            return self.fill_value
        else:
            return na_value_for_dtype(self.dtype.subtype, compat=False)

    def _argmin_argmax(self, kind: Literal["argmin", "argmax"]) -> int:
        values = self._sparse_values
        index = self._sparse_index.indices
        mask = np.asarray(isna(values))
        func = np.argmax if kind == "argmax" else np.argmin

        idx = np.arange(values.shape[0])
        non_nans = values[~mask]
        non_nan_idx = idx[~mask]

        _candidate = non_nan_idx[func(non_nans)]
        candidate = index[_candidate]

        if isna(self.fill_value):
            return candidate
        if kind == "argmin" and self[candidate] < self.fill_value:
            return candidate
        if kind == "argmax" and self[candidate] > self.fill_value:
            return candidate
        _loc = self._first_fill_value_loc()
        if _loc == -1:
            # fill_value doesn't exist
            return candidate
        else:
            return _loc

    def argmax(self, skipna: bool = True) -> int:
        validate_bool_kwarg(skipna, "skipna")
        if not skipna and self._hasna:
            raise NotImplementedError
        return self._argmin_argmax("argmax")

    def argmin(self, skipna: bool = True) -> int:
        validate_bool_kwarg(skipna, "skipna")
        if not skipna and self._hasna:
            raise NotImplementedError
        return self._argmin_argmax("argmin")

    # ------------------------------------------------------------------------
    # Ufuncs
    # ------------------------------------------------------------------------

    _HANDLED_TYPES = (np.ndarray, numbers.Number)

    def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
        out = kwargs.get("out", ())

        for x in inputs + out:
            if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)):
                return NotImplemented

        # for binary ops, use our custom dunder methods
        result = ops.maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        if "out" in kwargs:
            # e.g. tests.arrays.sparse.test_arithmetics.test_ndarray_inplace
            res = arraylike.dispatch_ufunc_with_out(
                self, ufunc, method, *inputs, **kwargs
            )
            return res

        if method == "reduce":
            result = arraylike.dispatch_reduction_ufunc(
                self, ufunc, method, *inputs, **kwargs
            )
            if result is not NotImplemented:
                # e.g. tests.series.test_ufunc.TestNumpyReductions
                return result

        if len(inputs) == 1:
            # No alignment necessary.
            sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
            fill_value = getattr(ufunc, method)(self.fill_value, **kwargs)

            if ufunc.nout > 1:
                # multiple outputs. e.g. modf
                arrays = tuple(
                    self._simple_new(
                        sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv)
                    )
                    for sp_value, fv in zip(sp_values, fill_value)
                )
                return arrays
            elif method == "reduce":
                # e.g. reductions
                return sp_values

            return self._simple_new(
                sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value)
            )

        new_inputs = tuple(np.asarray(x) for x in inputs)
        result = getattr(ufunc, method)(*new_inputs, **kwargs)
        if out:
            if len(out) == 1:
                out = out[0]
            return out

        if ufunc.nout > 1:
            return tuple(type(self)(x) for x in result)
        elif method == "at":
            # no return value
            return None
        else:
            return type(self)(result)

    # ------------------------------------------------------------------------
    # Ops
    # ------------------------------------------------------------------------

    def _arith_method(self, other, op):
        op_name = op.__name__

        if isinstance(other, SparseArray):
            return _sparse_array_op(self, other, op, op_name)

        elif is_scalar(other):
            with np.errstate(all="ignore"):
                fill = op(_get_fill(self), np.asarray(other))
                result = op(self.sp_values, other)

            if op_name == "divmod":
                left, right = result
                lfill, rfill = fill
                return (
                    _wrap_result(op_name, left, self.sp_index, lfill),
                    _wrap_result(op_name, right, self.sp_index, rfill),
                )

            return _wrap_result(op_name, result, self.sp_index, fill)

        else:
            other = np.asarray(other)
            with np.errstate(all="ignore"):
                if len(self) != len(other):
                    raise AssertionError(
                        f"length mismatch: {len(self)} vs. {len(other)}"
                    )
                if not isinstance(other, SparseArray):
                    dtype = getattr(other, "dtype", None)
                    other = SparseArray(other, fill_value=self.fill_value, dtype=dtype)
                return _sparse_array_op(self, other, op, op_name)

    def _cmp_method(self, other, op) -> SparseArray:
        if not is_scalar(other) and not isinstance(other, type(self)):
            # convert list-like to ndarray
            other = np.asarray(other)

        if isinstance(other, np.ndarray):
            # TODO: make this more flexible than just ndarray...
            other = SparseArray(other, fill_value=self.fill_value)

        if isinstance(other, SparseArray):
            if len(self) != len(other):
                raise ValueError(
                    f"operands have mismatched length {len(self)} and {len(other)}"
                )

            op_name = op.__name__.strip("_")
            return _sparse_array_op(self, other, op, op_name)
        else:
            # scalar
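            # Compare fill_value against the scalar once, broadcast that
            # result densely, then overwrite the positions of stored points.
            # E.g. (roughly) SparseArray([0, 1]) > 0 -> [False, True] with
            # fill_value False.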
            with np.errstate(all="ignore"):
                fill_value = op(self.fill_value, other)
                result = np.full(len(self), fill_value, dtype=np.bool_)
                result[self.sp_index.indices] = op(self.sp_values, other)

            return type(self)(
                result,
                fill_value=fill_value,
                dtype=np.bool_,
            )

    _logical_method = _cmp_method

    def _unary_method(self, op) -> SparseArray:
        fill_value = op(np.array(self.fill_value)).item()
        dtype = SparseDtype(self.dtype.subtype, fill_value)
        # NOTE: if fill_value doesn't change
        # we just have to apply op to sp_values
        if isna(self.fill_value) or fill_value == self.fill_value:
            values = op(self.sp_values)
            return type(self)._simple_new(values, self.sp_index, self.dtype)
        # In the other case we have to recalc indexes
        return type(self)(op(self.to_dense()), dtype=dtype)

    def __pos__(self) -> SparseArray:
        return self._unary_method(operator.pos)

    def __neg__(self) -> SparseArray:
        return self._unary_method(operator.neg)

    def __invert__(self) -> SparseArray:
        return self._unary_method(operator.invert)

    def __abs__(self) -> SparseArray:
        return self._unary_method(operator.abs)

    # ----------
    # Formatting
    # ----------
    def __repr__(self) -> str:
        pp_str = printing.pprint_thing(self)
        pp_fill = printing.pprint_thing(self.fill_value)
        pp_index = printing.pprint_thing(self.sp_index)
        return f"{pp_str}\nFill: {pp_fill}\n{pp_index}"

    def _formatter(self, boxed: bool = False):
        # Defer to the formatter from the GenericArrayFormatter calling us.
        # This will infer the correct formatter from the dtype of the values.
        return None


def _make_sparse(
    arr: np.ndarray,
    kind: SparseIndexKind = "block",
    fill_value=None,
    dtype: np.dtype | None = None,
):
    """
    Convert ndarray to sparse format

    Parameters
    ----------
    arr : ndarray
    kind : {'block', 'integer'}
    fill_value : NaN or another value
    dtype : np.dtype, optional

    Returns
    -------
    (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
    """
    assert isinstance(arr, np.ndarray)

    if arr.ndim > 1:
        raise TypeError("expected dimension <= 1 data")

    if fill_value is None:
        fill_value = na_value_for_dtype(arr.dtype)

    if isna(fill_value):
        mask = notna(arr)
    else:
        # cast to object for a safe equality comparison
        if is_string_dtype(arr.dtype):
            arr = arr.astype(object)

        if is_object_dtype(arr.dtype):
            # NumPy's element-wise equality does not distinguish element
            # types, e.g. 0, 0.0, and False compare equal, so we have to
            # check both the type and the value.
            mask = splib.make_mask_object_ndarray(arr, fill_value)
        else:
            mask = arr != fill_value

    length = len(arr)
    if length != len(mask):
        # the arr is a SparseArray
        indices = mask.sp_index.indices
    else:
        indices = mask.nonzero()[0].astype(np.int32)

    index = make_sparse_index(length, indices, kind)
    sparsified_values = arr[mask]
    if dtype is not None:
        sparsified_values = ensure_wrapped_if_datetimelike(sparsified_values)
        sparsified_values = astype_array(sparsified_values, dtype=dtype)
        sparsified_values = np.asarray(sparsified_values)

    # TODO: copy
    return sparsified_values, index, fill_value


@overload
def make_sparse_index(length: int, indices, kind: Literal["block"]) -> BlockIndex:
    ...


@overload
def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntIndex:
    ...


def make_sparse_index(length: int, indices, kind: SparseIndexKind) -> SparseIndex:
    index: SparseIndex
    if kind == "block":
        locs, lens = splib.get_blocks(indices)
        index = BlockIndex(length, locs, lens)
    elif kind == "integer":
        index = IntIndex(length, indices)
    else:  # pragma: no cover
        raise ValueError("must be block or integer type")
    return index
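

# Example (illustrative) of the two index kinds:
# make_sparse_index(4, np.array([0, 1, 3], dtype=np.int32), "integer")
# -> IntIndex(4, [0, 1, 3]), while "block" groups the run [0, 1] into a
# single block: BlockIndex(4, blocs=[0, 3], blengths=[2, 1]).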