1"""
2SparseArray data structure
3"""
4from __future__ import annotations
5
6from collections import abc
7import numbers
8import operator
9from typing import (
10 TYPE_CHECKING,
11 Any,
12 Callable,
13 Literal,
14 cast,
15 overload,
16)
17import warnings
18
19import numpy as np
20
21from pandas._libs import lib
22import pandas._libs.sparse as splib
23from pandas._libs.sparse import (
24 BlockIndex,
25 IntIndex,
26 SparseIndex,
27)
28from pandas._libs.tslibs import NaT
29from pandas.compat.numpy import function as nv
30from pandas.errors import PerformanceWarning
31from pandas.util._decorators import doc
32from pandas.util._exceptions import find_stack_level
33from pandas.util._validators import (
34 validate_bool_kwarg,
35 validate_insert_loc,
36)
37
38from pandas.core.dtypes.astype import astype_array
39from pandas.core.dtypes.cast import (
40 construct_1d_arraylike_from_scalar,
41 find_common_type,
42 maybe_box_datetimelike,
43)
44from pandas.core.dtypes.common import (
45 is_bool_dtype,
46 is_integer,
47 is_list_like,
48 is_object_dtype,
49 is_scalar,
50 is_string_dtype,
51 pandas_dtype,
52)
53from pandas.core.dtypes.dtypes import (
54 DatetimeTZDtype,
55 SparseDtype,
56)
57from pandas.core.dtypes.generic import (
58 ABCIndex,
59 ABCSeries,
60)
61from pandas.core.dtypes.missing import (
62 isna,
63 na_value_for_dtype,
64 notna,
65)
66
67from pandas.core import arraylike
68import pandas.core.algorithms as algos
69from pandas.core.arraylike import OpsMixin
70from pandas.core.arrays import ExtensionArray
71from pandas.core.base import PandasObject
72import pandas.core.common as com
73from pandas.core.construction import (
74 ensure_wrapped_if_datetimelike,
75 extract_array,
76 sanitize_array,
77)
78from pandas.core.indexers import (
79 check_array_indexer,
80 unpack_tuple_and_ellipses,
81)
82from pandas.core.nanops import check_below_min_count
83
84from pandas.io.formats import printing
85
86# See https://github.com/python/typing/issues/684
87if TYPE_CHECKING:
88 from collections.abc import Sequence
89 from enum import Enum
90
91 class ellipsis(Enum):
92 Ellipsis = "..."
93
94 Ellipsis = ellipsis.Ellipsis
95
96 from scipy.sparse import spmatrix
97
98 from pandas._typing import (
99 FillnaOptions,
100 NumpySorter,
101 )
102
103 SparseIndexKind = Literal["integer", "block"]
104
105 from pandas._typing import (
106 ArrayLike,
107 AstypeArg,
108 Axis,
109 AxisInt,
110 Dtype,
111 NpDtype,
112 PositionalIndexer,
113 Scalar,
114 ScalarIndexer,
115 Self,
116 SequenceIndexer,
117 npt,
118 )
119
120 from pandas import Series
121
122else:
123 ellipsis = type(Ellipsis)
124
125
126# ----------------------------------------------------------------------------
127# Array
128
129_sparray_doc_kwargs = {"klass": "SparseArray"}
130
131
132def _get_fill(arr: SparseArray) -> np.ndarray:
133 """
134 Create a 0-dim ndarray containing the fill value
135
136 Parameters
137 ----------
138 arr : SparseArray
139
140 Returns
141 -------
142 fill_value : ndarray
143 0-dim ndarray with just the fill value.
144
145 Notes
146 -----
147 coerce fill_value to arr dtype if possible
148 int64 SparseArray can have NaN as fill_value if there is no missing
149 """
150 try:
151 return np.asarray(arr.fill_value, dtype=arr.dtype.subtype)
152 except ValueError:
153 return np.asarray(arr.fill_value)
154
155
156def _sparse_array_op(
157 left: SparseArray, right: SparseArray, op: Callable, name: str
158) -> SparseArray:
159 """
160 Perform a binary operation between two arrays.
161
162 Parameters
163 ----------
164 left : Union[SparseArray, ndarray]
165 right : Union[SparseArray, ndarray]
166 op : Callable
167 The binary operation to perform
168 name str
169 Name of the callable.
170
171 Returns
172 -------
173 SparseArray
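
    Examples
    --------
    An illustrative sketch using ``operator.add`` (the output shows the
    default integer sparse index):

    >>> import operator
    >>> left = SparseArray([0, 1, 0])
    >>> right = SparseArray([0, 0, 1])
    >>> _sparse_array_op(left, right, operator.add, "add")
    [0, 1, 1]
    Fill: 0
    IntIndex
    Indices: array([1, 2], dtype=int32)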
174 """
175 if name.startswith("__"):
176 # For lookups in _libs.sparse we need non-dunder op name
177 name = name[2:-2]
178
179 # dtype used to find corresponding sparse method
180 ltype = left.dtype.subtype
181 rtype = right.dtype.subtype
182
183 if ltype != rtype:
184 subtype = find_common_type([ltype, rtype])
185 ltype = SparseDtype(subtype, left.fill_value)
186 rtype = SparseDtype(subtype, right.fill_value)
187
188 left = left.astype(ltype, copy=False)
189 right = right.astype(rtype, copy=False)
190 dtype = ltype.subtype
191 else:
192 dtype = ltype
193
194 # dtype the result must have
195 result_dtype = None
196
197 if left.sp_index.ngaps == 0 or right.sp_index.ngaps == 0:
198 with np.errstate(all="ignore"):
199 result = op(left.to_dense(), right.to_dense())
200 fill = op(_get_fill(left), _get_fill(right))
201
202 if left.sp_index.ngaps == 0:
203 index = left.sp_index
204 else:
205 index = right.sp_index
206 elif left.sp_index.equals(right.sp_index):
207 with np.errstate(all="ignore"):
208 result = op(left.sp_values, right.sp_values)
209 fill = op(_get_fill(left), _get_fill(right))
210 index = left.sp_index
211 else:
212 if name[0] == "r":
213 left, right = right, left
214 name = name[1:]
215
216 if name in ("and", "or", "xor") and dtype == "bool":
217 opname = f"sparse_{name}_uint8"
218 # to make template simple, cast here
219 left_sp_values = left.sp_values.view(np.uint8)
220 right_sp_values = right.sp_values.view(np.uint8)
221 result_dtype = bool
222 else:
223 opname = f"sparse_{name}_{dtype}"
224 left_sp_values = left.sp_values
225 right_sp_values = right.sp_values
226
227 if (
228 name in ["floordiv", "mod"]
229 and (right == 0).any()
230 and left.dtype.kind in "iu"
231 ):
232 # Match the non-Sparse Series behavior
233 opname = f"sparse_{name}_float64"
234 left_sp_values = left_sp_values.astype("float64")
235 right_sp_values = right_sp_values.astype("float64")
236
237 sparse_op = getattr(splib, opname)
238
239 with np.errstate(all="ignore"):
240 result, index, fill = sparse_op(
241 left_sp_values,
242 left.sp_index,
243 left.fill_value,
244 right_sp_values,
245 right.sp_index,
246 right.fill_value,
247 )
248
249 if name == "divmod":
250 # result is a 2-tuple
251 # error: Incompatible return value type (got "Tuple[SparseArray,
252 # SparseArray]", expected "SparseArray")
253 return ( # type: ignore[return-value]
254 _wrap_result(name, result[0], index, fill[0], dtype=result_dtype),
255 _wrap_result(name, result[1], index, fill[1], dtype=result_dtype),
256 )
257
258 if result_dtype is None:
259 result_dtype = result.dtype
260
261 return _wrap_result(name, result, index, fill, dtype=result_dtype)
262
263
264def _wrap_result(
265 name: str, data, sparse_index, fill_value, dtype: Dtype | None = None
266) -> SparseArray:
267 """
268 wrap op result to have correct dtype
269 """
270 if name.startswith("__"):
271 # e.g. __eq__ --> eq
272 name = name[2:-2]
273
274 if name in ("eq", "ne", "lt", "gt", "le", "ge"):
275 dtype = bool
276
277 fill_value = lib.item_from_zerodim(fill_value)
278
279 if is_bool_dtype(dtype):
280 # fill_value may be np.bool_
281 fill_value = bool(fill_value)
282 return SparseArray(
283 data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype
284 )
285
286
287class SparseArray(OpsMixin, PandasObject, ExtensionArray):
288 """
289 An ExtensionArray for storing sparse data.
290
291 Parameters
292 ----------
293 data : array-like or scalar
294 A dense array of values to store in the SparseArray. This may contain
295 `fill_value`.
296 sparse_index : SparseIndex, optional
297 fill_value : scalar, optional
298 Elements in data that are ``fill_value`` are not stored in the
299 SparseArray. For memory savings, this should be the most common value
300 in `data`. By default, `fill_value` depends on the dtype of `data`:
301
302 =========== ==========
303 data.dtype na_value
304 =========== ==========
305 float ``np.nan``
306 int ``0``
307 bool False
308 datetime64 ``pd.NaT``
309 timedelta64 ``pd.NaT``
310 =========== ==========
311
312 The fill value is potentially specified in three ways. In order of
313 precedence, these are
314
315 1. The `fill_value` argument
316 2. ``dtype.fill_value`` if `fill_value` is None and `dtype` is
317 a ``SparseDtype``
318 3. ``data.dtype.fill_value`` if `fill_value` is None and `dtype`
319 is not a ``SparseDtype`` and `data` is a ``SparseArray``.
320
321 kind : str
322 Can be 'integer' or 'block', default is 'integer'.
323 The type of storage for sparse locations.
324
325 * 'block': Stores a `block` and `block_length` for each
326 contiguous *span* of sparse values. This is best when
327 sparse data tends to be clumped together, with large
328 regions of ``fill-value`` values between sparse values.
329 * 'integer': uses an integer to store the location of
330 each sparse value.
331
332 dtype : np.dtype or SparseDtype, optional
333 The dtype to use for the SparseArray. For numpy dtypes, this
334 determines the dtype of ``self.sp_values``. For SparseDtype,
335 this determines ``self.sp_values`` and ``self.fill_value``.
336 copy : bool, default False
337 Whether to explicitly copy the incoming `data` array.
338
339 Attributes
340 ----------
341 None
342
343 Methods
344 -------
345 None
346
347 Examples
348 --------
349 >>> from pandas.arrays import SparseArray
350 >>> arr = SparseArray([0, 0, 1, 2])
351 >>> arr
352 [0, 0, 1, 2]
353 Fill: 0
354 IntIndex
355 Indices: array([2, 3], dtype=int32)
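
    An explicit ``fill_value`` takes precedence over the dtype's default
    (an illustrative sketch; the default for float data would be ``np.nan``):

    >>> SparseArray([0.0, 1.0, 2.0], fill_value=0.0)
    [0.0, 1.0, 2.0]
    Fill: 0.0
    IntIndex
    Indices: array([1, 2], dtype=int32)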
356 """
357
358 _subtyp = "sparse_array" # register ABCSparseArray
359 _hidden_attrs = PandasObject._hidden_attrs | frozenset([])
360 _sparse_index: SparseIndex
361 _sparse_values: np.ndarray
362 _dtype: SparseDtype
363
364 def __init__(
365 self,
366 data,
367 sparse_index=None,
368 fill_value=None,
369 kind: SparseIndexKind = "integer",
370 dtype: Dtype | None = None,
371 copy: bool = False,
372 ) -> None:
373 if fill_value is None and isinstance(dtype, SparseDtype):
374 fill_value = dtype.fill_value
375
376 if isinstance(data, type(self)):
377 # disable normal inference on dtype, sparse_index, & fill_value
378 if sparse_index is None:
379 sparse_index = data.sp_index
380 if fill_value is None:
381 fill_value = data.fill_value
382 if dtype is None:
383 dtype = data.dtype
384 # TODO: make kind=None, and use data.kind?
385 data = data.sp_values
386
387 # Handle use-provided dtype
388 if isinstance(dtype, str):
389 # Two options: dtype='int', regular numpy dtype
390 # or dtype='Sparse[int]', a sparse dtype
391 try:
392 dtype = SparseDtype.construct_from_string(dtype)
393 except TypeError:
394 dtype = pandas_dtype(dtype)
395
396 if isinstance(dtype, SparseDtype):
397 if fill_value is None:
398 fill_value = dtype.fill_value
399 dtype = dtype.subtype
400
401 if is_scalar(data):
402 warnings.warn(
403 f"Constructing {type(self).__name__} with scalar data is deprecated "
404 "and will raise in a future version. Pass a sequence instead.",
405 FutureWarning,
406 stacklevel=find_stack_level(),
407 )
408 if sparse_index is None:
409 npoints = 1
410 else:
411 npoints = sparse_index.length
412
413 data = construct_1d_arraylike_from_scalar(data, npoints, dtype=None)
414 dtype = data.dtype
415
416 if dtype is not None:
417 dtype = pandas_dtype(dtype)
418
419 # TODO: disentangle the fill_value dtype inference from
420 # dtype inference
421 if data is None:
422 # TODO: What should the empty dtype be? Object or float?
423
424 # error: Argument "dtype" to "array" has incompatible type
425 # "Union[ExtensionDtype, dtype[Any], None]"; expected "Union[dtype[Any],
426 # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any,
427 # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]"
428 data = np.array([], dtype=dtype) # type: ignore[arg-type]
429
430 try:
431 data = sanitize_array(data, index=None)
432 except ValueError:
433 # NumPy may raise a ValueError on data like [1, []]
434 # we retry with object dtype here.
435 if dtype is None:
436 dtype = np.dtype(object)
437 data = np.atleast_1d(np.asarray(data, dtype=dtype))
438 else:
439 raise
440
441 if copy:
442 # TODO: avoid double copy when dtype forces cast.
443 data = data.copy()
444
445 if fill_value is None:
446 fill_value_dtype = data.dtype if dtype is None else dtype
447 if fill_value_dtype is None:
448 fill_value = np.nan
449 else:
450 fill_value = na_value_for_dtype(fill_value_dtype)
451
452 if isinstance(data, type(self)) and sparse_index is None:
453 sparse_index = data._sparse_index
454 # error: Argument "dtype" to "asarray" has incompatible type
455 # "Union[ExtensionDtype, dtype[Any], None]"; expected "None"
456 sparse_values = np.asarray(
457 data.sp_values, dtype=dtype # type: ignore[arg-type]
458 )
459 elif sparse_index is None:
460 data = extract_array(data, extract_numpy=True)
461 if not isinstance(data, np.ndarray):
462 # EA
463 if isinstance(data.dtype, DatetimeTZDtype):
464 warnings.warn(
465 f"Creating SparseArray from {data.dtype} data "
466 "loses timezone information. Cast to object before "
467 "sparse to retain timezone information.",
468 UserWarning,
469 stacklevel=find_stack_level(),
470 )
471 data = np.asarray(data, dtype="datetime64[ns]")
472 if fill_value is NaT:
473 fill_value = np.datetime64("NaT", "ns")
474 data = np.asarray(data)
475 sparse_values, sparse_index, fill_value = _make_sparse(
476 # error: Argument "dtype" to "_make_sparse" has incompatible type
477 # "Union[ExtensionDtype, dtype[Any], None]"; expected
478 # "Optional[dtype[Any]]"
479 data,
480 kind=kind,
481 fill_value=fill_value,
482 dtype=dtype, # type: ignore[arg-type]
483 )
484 else:
485 # error: Argument "dtype" to "asarray" has incompatible type
486 # "Union[ExtensionDtype, dtype[Any], None]"; expected "None"
487 sparse_values = np.asarray(data, dtype=dtype) # type: ignore[arg-type]
488 if len(sparse_values) != sparse_index.npoints:
489 raise AssertionError(
490 f"Non array-like type {type(sparse_values)} must "
491 "have the same length as the index"
492 )
493 self._sparse_index = sparse_index
494 self._sparse_values = sparse_values
495 self._dtype = SparseDtype(sparse_values.dtype, fill_value)
496
497 @classmethod
498 def _simple_new(
499 cls,
500 sparse_array: np.ndarray,
501 sparse_index: SparseIndex,
502 dtype: SparseDtype,
503 ) -> Self:
504 new = object.__new__(cls)
505 new._sparse_index = sparse_index
506 new._sparse_values = sparse_array
507 new._dtype = dtype
508 return new
509
510 @classmethod
511 def from_spmatrix(cls, data: spmatrix) -> Self:
512 """
513 Create a SparseArray from a scipy.sparse matrix.
514
515 Parameters
516 ----------
517 data : scipy.sparse.sp_matrix
518 This should be a SciPy sparse matrix where the size
519 of the second dimension is 1. In other words, a
520 sparse matrix with a single column.
521
522 Returns
523 -------
524 SparseArray
525
526 Examples
527 --------
528 >>> import scipy.sparse
529 >>> mat = scipy.sparse.coo_matrix((4, 1))
530 >>> pd.arrays.SparseArray.from_spmatrix(mat)
531 [0.0, 0.0, 0.0, 0.0]
532 Fill: 0.0
533 IntIndex
534 Indices: array([], dtype=int32)
535 """
536 length, ncol = data.shape
537
538 if ncol != 1:
539 raise ValueError(f"'data' must have a single column, not '{ncol}'")
540
541 # our sparse index classes require that the positions be strictly
542 # increasing. So we need to sort loc, and arr accordingly.
543 data = data.tocsc()
544 data.sort_indices()
545 arr = data.data
546 idx = data.indices
547
548 zero = np.array(0, dtype=arr.dtype).item()
549 dtype = SparseDtype(arr.dtype, zero)
550 index = IntIndex(length, idx)
551
552 return cls._simple_new(arr, index, dtype)
553
554 def __array__(
555 self, dtype: NpDtype | None = None, copy: bool | None = None
556 ) -> np.ndarray:
557 fill_value = self.fill_value
558
559 if self.sp_index.ngaps == 0:
560 # Compat for na dtype and int values.
561 return self.sp_values
562 if dtype is None:
563 # Can NumPy represent this type?
564 # If not, `np.result_type` will raise. We catch that
565 # and return object.
566 if self.sp_values.dtype.kind == "M":
567 # However, we *do* special-case the common case of
568 # a datetime64 with pandas NaT.
569 if fill_value is NaT:
570 # Can't put pd.NaT in a datetime64[ns]
571 fill_value = np.datetime64("NaT")
572 try:
573 dtype = np.result_type(self.sp_values.dtype, type(fill_value))
574 except TypeError:
575 dtype = object
576
577 out = np.full(self.shape, fill_value, dtype=dtype)
578 out[self.sp_index.indices] = self.sp_values
579 return out
580
581 def __setitem__(self, key, value) -> None:
582 # I suppose we could allow setting of non-fill_value elements.
583 # TODO(SparseArray.__setitem__): remove special cases in
584 # ExtensionBlock.where
585 msg = "SparseArray does not support item assignment via setitem"
586 raise TypeError(msg)
587
588 @classmethod
589 def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
590 return cls(scalars, dtype=dtype)
591
592 @classmethod
593 def _from_factorized(cls, values, original):
594 return cls(values, dtype=original.dtype)
595
596 # ------------------------------------------------------------------------
597 # Data
598 # ------------------------------------------------------------------------
599 @property
600 def sp_index(self) -> SparseIndex:
601 """
602 The SparseIndex containing the location of non- ``fill_value`` points.
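
        Examples
        --------
        Illustrative, using the default ``'integer'`` kind:

        >>> from pandas.arrays import SparseArray
        >>> SparseArray([0, 0, 1, 2]).sp_index
        IntIndex
        Indices: array([2, 3], dtype=int32)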
603 """
604 return self._sparse_index
605
606 @property
607 def sp_values(self) -> np.ndarray:
608 """
609 An ndarray containing the non- ``fill_value`` values.
610
611 Examples
612 --------
613 >>> from pandas.arrays import SparseArray
614 >>> s = SparseArray([0, 0, 1, 0, 2], fill_value=0)
615 >>> s.sp_values
616 array([1, 2])
617 """
618 return self._sparse_values
619
620 @property
621 def dtype(self) -> SparseDtype:
622 return self._dtype
623
624 @property
625 def fill_value(self):
626 """
627 Elements in `data` that are `fill_value` are not stored.
628
629 For memory savings, this should be the most common value in the array.
630
631 Examples
632 --------
633 >>> ser = pd.Series([0, 0, 2, 2, 2], dtype="Sparse[int]")
634 >>> ser.sparse.fill_value
635 0
636 >>> spa_dtype = pd.SparseDtype(dtype=np.int32, fill_value=2)
637 >>> ser = pd.Series([0, 0, 2, 2, 2], dtype=spa_dtype)
638 >>> ser.sparse.fill_value
639 2
640 """
641 return self.dtype.fill_value
642
643 @fill_value.setter
644 def fill_value(self, value) -> None:
645 self._dtype = SparseDtype(self.dtype.subtype, value)
646
647 @property
648 def kind(self) -> SparseIndexKind:
649 """
650 The kind of sparse index for this array. One of {'integer', 'block'}.
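
        Examples
        --------
        Illustrative:

        >>> from pandas.arrays import SparseArray
        >>> SparseArray([0, 1], kind="integer").kind
        'integer'
        >>> SparseArray([0, 1], kind="block").kind
        'block'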
651 """
652 if isinstance(self.sp_index, IntIndex):
653 return "integer"
654 else:
655 return "block"
656
657 @property
658 def _valid_sp_values(self) -> np.ndarray:
659 sp_vals = self.sp_values
660 mask = notna(sp_vals)
661 return sp_vals[mask]
662
663 def __len__(self) -> int:
664 return self.sp_index.length
665
666 @property
667 def _null_fill_value(self) -> bool:
668 return self._dtype._is_na_fill_value
669
670 def _fill_value_matches(self, fill_value) -> bool:
671 if self._null_fill_value:
672 return isna(fill_value)
673 else:
674 return self.fill_value == fill_value
675
676 @property
677 def nbytes(self) -> int:
678 return self.sp_values.nbytes + self.sp_index.nbytes
679
680 @property
681 def density(self) -> float:
682 """
683 The percent of non- ``fill_value`` points, as decimal.
684
685 Examples
686 --------
687 >>> from pandas.arrays import SparseArray
688 >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
689 >>> s.density
690 0.6
691 """
692 return self.sp_index.npoints / self.sp_index.length
693
694 @property
695 def npoints(self) -> int:
696 """
697 The number of non- ``fill_value`` points.
698
699 Examples
700 --------
701 >>> from pandas.arrays import SparseArray
702 >>> s = SparseArray([0, 0, 1, 1, 1], fill_value=0)
703 >>> s.npoints
704 3
705 """
706 return self.sp_index.npoints
707
708 # error: Return type "SparseArray" of "isna" incompatible with return type
709 # "ndarray[Any, Any] | ExtensionArraySupportsAnyAll" in supertype "ExtensionArray"
710 def isna(self) -> Self: # type: ignore[override]
711 # If null fill value, we want SparseDtype[bool, true]
712 # to preserve the same memory usage.
713 dtype = SparseDtype(bool, self._null_fill_value)
714 if self._null_fill_value:
715 return type(self)._simple_new(isna(self.sp_values), self.sp_index, dtype)
716 mask = np.full(len(self), False, dtype=np.bool_)
717 mask[self.sp_index.indices] = isna(self.sp_values)
718 return type(self)(mask, fill_value=False, dtype=dtype)
719
720 def _pad_or_backfill( # pylint: disable=useless-parent-delegation
721 self,
722 *,
723 method: FillnaOptions,
724 limit: int | None = None,
725 limit_area: Literal["inside", "outside"] | None = None,
726 copy: bool = True,
727 ) -> Self:
728 # TODO(3.0): We can remove this method once deprecation for fillna method
729 # keyword is enforced.
730 return super()._pad_or_backfill(
731 method=method, limit=limit, limit_area=limit_area, copy=copy
732 )
733
734 def fillna(
735 self,
736 value=None,
737 method: FillnaOptions | None = None,
738 limit: int | None = None,
739 copy: bool = True,
740 ) -> Self:
741 """
742 Fill missing values with `value`.
743
744 Parameters
745 ----------
746 value : scalar, optional
747 method : str, optional
748
749 .. warning::
750
751 Using 'method' will result in high memory use,
752 as all `fill_value` methods will be converted to
753 an in-memory ndarray
754
755 limit : int, optional
756
757 copy: bool, default True
758 Ignored for SparseArray.
759
760 Returns
761 -------
762 SparseArray
763
764 Notes
765 -----
766 When `value` is specified, the result's ``fill_value`` depends on
767 ``self.fill_value``. The goal is to maintain low-memory use.
768
769 If ``self.fill_value`` is NA, the result dtype will be
770 ``SparseDtype(self.dtype, fill_value=value)``. This will preserve
771 amount of memory used before and after filling.
772
773 When ``self.fill_value`` is not NA, the result dtype will be
774 ``self.dtype``. Again, this preserves the amount of memory used.
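
        Examples
        --------
        An illustrative sketch of the dtype behavior described above, with
        an NA ``fill_value`` replaced by the fill value ``0.0``:

        >>> arr = pd.arrays.SparseArray([np.nan, 1.0])
        >>> arr.fillna(0.0)
        [0.0, 1.0]
        Fill: 0.0
        IntIndex
        Indices: array([1], dtype=int32)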
775 """
776 if (method is None and value is None) or (
777 method is not None and value is not None
778 ):
779 raise ValueError("Must specify one of 'method' or 'value'.")
780
781 if method is not None:
782 return super().fillna(method=method, limit=limit)
783
784 else:
785 new_values = np.where(isna(self.sp_values), value, self.sp_values)
786
787 if self._null_fill_value:
788 # This is essentially just updating the dtype.
789 new_dtype = SparseDtype(self.dtype.subtype, fill_value=value)
790 else:
791 new_dtype = self.dtype
792
793 return self._simple_new(new_values, self._sparse_index, new_dtype)
794
795 def shift(self, periods: int = 1, fill_value=None) -> Self:
796 if not len(self) or periods == 0:
797 return self.copy()
798
799 if isna(fill_value):
800 fill_value = self.dtype.na_value
801
802 subtype = np.result_type(fill_value, self.dtype.subtype)
803
804 if subtype != self.dtype.subtype:
805 # just coerce up front
806 arr = self.astype(SparseDtype(subtype, self.fill_value))
807 else:
808 arr = self
809
810 empty = self._from_sequence(
811 [fill_value] * min(abs(periods), len(self)), dtype=arr.dtype
812 )
813
814 if periods > 0:
815 a = empty
816 b = arr[:-periods]
817 else:
818 a = arr[abs(periods) :]
819 b = empty
820 return arr._concat_same_type([a, b])
821
822 def _first_fill_value_loc(self):
823 """
824 Get the location of the first fill value.
825
826 Returns
827 -------
828 int
829 """
830 if len(self) == 0 or self.sp_index.npoints == len(self):
831 return -1
832
833 indices = self.sp_index.indices
834 if not len(indices) or indices[0] > 0:
835 return 0
836
837 # a number larger than 1 should be appended to
838 # the last in case of fill value only appears
839 # in the tail of array
840 diff = np.r_[np.diff(indices), 2]
841 return indices[(diff > 1).argmax()] + 1
842
843 @doc(ExtensionArray.duplicated)
844 def duplicated(
845 self, keep: Literal["first", "last", False] = "first"
846 ) -> npt.NDArray[np.bool_]:
847 values = np.asarray(self)
848 mask = np.asarray(self.isna())
849 return algos.duplicated(values, keep=keep, mask=mask)
850
851 def unique(self) -> Self:
852 uniques = algos.unique(self.sp_values)
853 if len(self.sp_values) != len(self):
854 fill_loc = self._first_fill_value_loc()
855 # Inorder to align the behavior of pd.unique or
856 # pd.Series.unique, we should keep the original
857 # order, here we use unique again to find the
858 # insertion place. Since the length of sp_values
859 # is not large, maybe minor performance hurt
860 # is worthwhile to the correctness.
861 insert_loc = len(algos.unique(self.sp_values[:fill_loc]))
862 uniques = np.insert(uniques, insert_loc, self.fill_value)
863 return type(self)._from_sequence(uniques, dtype=self.dtype)
864
865 def _values_for_factorize(self):
866 # Still override this for hash_pandas_object
867 return np.asarray(self), self.fill_value
868
869 def factorize(
870 self,
871 use_na_sentinel: bool = True,
872 ) -> tuple[np.ndarray, SparseArray]:
873 # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
874 # The sparsity on this is backwards from what Sparse would want. Want
875 # ExtensionArray.factorize -> Tuple[EA, EA]
876 # Given that we have to return a dense array of codes, why bother
877 # implementing an efficient factorize?
878 codes, uniques = algos.factorize(
879 np.asarray(self), use_na_sentinel=use_na_sentinel
880 )
881 uniques_sp = SparseArray(uniques, dtype=self.dtype)
882 return codes, uniques_sp
883
884 def value_counts(self, dropna: bool = True) -> Series:
885 """
886 Returns a Series containing counts of unique values.
887
888 Parameters
889 ----------
890 dropna : bool, default True
891 Don't include counts of NaN, even if NaN is in sp_values.
892
893 Returns
894 -------
895 counts : Series
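
        Examples
        --------
        Illustrative:

        >>> pd.arrays.SparseArray([0, 0, 1, 1, 1]).value_counts()
        0    2
        1    3
        dtype: int64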
896 """
897 from pandas import (
898 Index,
899 Series,
900 )
901
902 keys, counts, _ = algos.value_counts_arraylike(self.sp_values, dropna=dropna)
903 fcounts = self.sp_index.ngaps
904 if fcounts > 0 and (not self._null_fill_value or not dropna):
905 mask = isna(keys) if self._null_fill_value else keys == self.fill_value
906 if mask.any():
907 counts[mask] += fcounts
908 else:
909 # error: Argument 1 to "insert" has incompatible type "Union[
910 # ExtensionArray,ndarray[Any, Any]]"; expected "Union[
911 # _SupportsArray[dtype[Any]], Sequence[_SupportsArray[dtype
912 # [Any]]], Sequence[Sequence[_SupportsArray[dtype[Any]]]],
913 # Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]], Sequence
914 # [Sequence[Sequence[Sequence[_SupportsArray[dtype[Any]]]]]]]"
915 keys = np.insert(keys, 0, self.fill_value) # type: ignore[arg-type]
916 counts = np.insert(counts, 0, fcounts)
917
918 if not isinstance(keys, ABCIndex):
919 index = Index(keys)
920 else:
921 index = keys
922 return Series(counts, index=index, copy=False)
923
924 # --------
925 # Indexing
926 # --------
927 @overload
928 def __getitem__(self, key: ScalarIndexer) -> Any:
929 ...
930
931 @overload
932 def __getitem__(
933 self,
934 key: SequenceIndexer | tuple[int | ellipsis, ...],
935 ) -> Self:
936 ...
937
938 def __getitem__(
939 self,
940 key: PositionalIndexer | tuple[int | ellipsis, ...],
941 ) -> Self | Any:
942 if isinstance(key, tuple):
943 key = unpack_tuple_and_ellipses(key)
944 if key is Ellipsis:
945 raise ValueError("Cannot slice with Ellipsis")
946
947 if is_integer(key):
948 return self._get_val_at(key)
949 elif isinstance(key, tuple):
950 # error: Invalid index type "Tuple[Union[int, ellipsis], ...]"
951 # for "ndarray[Any, Any]"; expected type
952 # "Union[SupportsIndex, _SupportsArray[dtype[Union[bool_,
953 # integer[Any]]]], _NestedSequence[_SupportsArray[dtype[
954 # Union[bool_, integer[Any]]]]], _NestedSequence[Union[
955 # bool, int]], Tuple[Union[SupportsIndex, _SupportsArray[
956 # dtype[Union[bool_, integer[Any]]]], _NestedSequence[
957 # _SupportsArray[dtype[Union[bool_, integer[Any]]]]],
958 # _NestedSequence[Union[bool, int]]], ...]]"
959 data_slice = self.to_dense()[key] # type: ignore[index]
960 elif isinstance(key, slice):
961 # Avoid densifying when handling contiguous slices
962 if key.step is None or key.step == 1:
963 start = 0 if key.start is None else key.start
964 if start < 0:
965 start += len(self)
966
967 end = len(self) if key.stop is None else key.stop
968 if end < 0:
969 end += len(self)
970
971 indices = self.sp_index.indices
972 keep_inds = np.flatnonzero((indices >= start) & (indices < end))
973 sp_vals = self.sp_values[keep_inds]
974
975 sp_index = indices[keep_inds].copy()
976
977 # If we've sliced to not include the start of the array, all our indices
978 # should be shifted. NB: here we are careful to also not shift by a
979 # negative value for a case like [0, 1][-100:] where the start index
980 # should be treated like 0
981 if start > 0:
982 sp_index -= start
983
984 # Length of our result should match applying this slice to a range
985 # of the length of our original array
986 new_len = len(range(len(self))[key])
987 new_sp_index = make_sparse_index(new_len, sp_index, self.kind)
988 return type(self)._simple_new(sp_vals, new_sp_index, self.dtype)
989 else:
990 indices = np.arange(len(self), dtype=np.int32)[key]
991 return self.take(indices)
992
993 elif not is_list_like(key):
994 # e.g. "foo" or 2.5
995 # exception message copied from numpy
996 raise IndexError(
997 r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
998 r"(`None`) and integer or boolean arrays are valid indices"
999 )
1000
1001 else:
1002 if isinstance(key, SparseArray):
1003 # NOTE: If we guarantee that SparseDType(bool)
1004 # has only fill_value - true, false or nan
1005 # (see GH PR 44955)
1006 # we can apply mask very fast:
1007 if is_bool_dtype(key):
1008 if isna(key.fill_value):
1009 return self.take(key.sp_index.indices[key.sp_values])
1010 if not key.fill_value:
1011 return self.take(key.sp_index.indices)
1012 n = len(self)
1013 mask = np.full(n, True, dtype=np.bool_)
1014 mask[key.sp_index.indices] = False
1015 return self.take(np.arange(n)[mask])
1016 else:
1017 key = np.asarray(key)
1018
1019 key = check_array_indexer(self, key)
1020
1021 if com.is_bool_indexer(key):
1022 # mypy doesn't know we have an array here
1023 key = cast(np.ndarray, key)
1024 return self.take(np.arange(len(key), dtype=np.int32)[key])
1025 elif hasattr(key, "__len__"):
1026 return self.take(key)
1027 else:
1028 raise ValueError(f"Cannot slice with '{key}'")
1029
1030 return type(self)(data_slice, kind=self.kind)
1031
1032 def _get_val_at(self, loc):
1033 loc = validate_insert_loc(loc, len(self))
1034
1035 sp_loc = self.sp_index.lookup(loc)
1036 if sp_loc == -1:
1037 return self.fill_value
1038 else:
1039 val = self.sp_values[sp_loc]
1040 val = maybe_box_datetimelike(val, self.sp_values.dtype)
1041 return val
1042
1043 def take(self, indices, *, allow_fill: bool = False, fill_value=None) -> Self:
1044 if is_scalar(indices):
1045 raise ValueError(f"'indices' must be an array, not a scalar '{indices}'.")
1046 indices = np.asarray(indices, dtype=np.int32)
1047
1048 dtype = None
1049 if indices.size == 0:
1050 result = np.array([], dtype="object")
1051 dtype = self.dtype
1052 elif allow_fill:
1053 result = self._take_with_fill(indices, fill_value=fill_value)
1054 else:
1055 return self._take_without_fill(indices)
1056
1057 return type(self)(
1058 result, fill_value=self.fill_value, kind=self.kind, dtype=dtype
1059 )
1060
1061 def _take_with_fill(self, indices, fill_value=None) -> np.ndarray:
1062 if fill_value is None:
1063 fill_value = self.dtype.na_value
1064
1065 if indices.min() < -1:
1066 raise ValueError(
1067 "Invalid value in 'indices'. Must be between -1 "
1068 "and the length of the array."
1069 )
1070
1071 if indices.max() >= len(self):
1072 raise IndexError("out of bounds value in 'indices'.")
1073
1074 if len(self) == 0:
1075 # Empty... Allow taking only if all empty
1076 if (indices == -1).all():
1077 dtype = np.result_type(self.sp_values, type(fill_value))
1078 taken = np.empty_like(indices, dtype=dtype)
1079 taken.fill(fill_value)
1080 return taken
1081 else:
1082 raise IndexError("cannot do a non-empty take from an empty axes.")
1083
1084 # sp_indexer may be -1 for two reasons
1085 # 1.) we took for an index of -1 (new)
1086 # 2.) we took a value that was self.fill_value (old)
1087 sp_indexer = self.sp_index.lookup_array(indices)
1088 new_fill_indices = indices == -1
1089 old_fill_indices = (sp_indexer == -1) & ~new_fill_indices
1090
1091 if self.sp_index.npoints == 0 and old_fill_indices.all():
1092 # We've looked up all valid points on an all-sparse array.
1093 taken = np.full(
1094 sp_indexer.shape, fill_value=self.fill_value, dtype=self.dtype.subtype
1095 )
1096
1097 elif self.sp_index.npoints == 0:
1098 # Use the old fill_value unless we took for an index of -1
1099 _dtype = np.result_type(self.dtype.subtype, type(fill_value))
1100 taken = np.full(sp_indexer.shape, fill_value=fill_value, dtype=_dtype)
1101 taken[old_fill_indices] = self.fill_value
1102 else:
1103 taken = self.sp_values.take(sp_indexer)
1104
1105 # Fill in two steps.
1106 # Old fill values
1107 # New fill values
1108 # potentially coercing to a new dtype at each stage.
1109
1110 m0 = sp_indexer[old_fill_indices] < 0
1111 m1 = sp_indexer[new_fill_indices] < 0
1112
1113 result_type = taken.dtype
1114
1115 if m0.any():
1116 result_type = np.result_type(result_type, type(self.fill_value))
1117 taken = taken.astype(result_type)
1118 taken[old_fill_indices] = self.fill_value
1119
1120 if m1.any():
1121 result_type = np.result_type(result_type, type(fill_value))
1122 taken = taken.astype(result_type)
1123 taken[new_fill_indices] = fill_value
1124
1125 return taken
1126
1127 def _take_without_fill(self, indices) -> Self:
1128 to_shift = indices < 0
1129
1130 n = len(self)
1131
1132 if (indices.max() >= n) or (indices.min() < -n):
1133 if n == 0:
1134 raise IndexError("cannot do a non-empty take from an empty axes.")
1135 raise IndexError("out of bounds value in 'indices'.")
1136
1137 if to_shift.any():
1138 indices = indices.copy()
1139 indices[to_shift] += n
1140
1141 sp_indexer = self.sp_index.lookup_array(indices)
1142 value_mask = sp_indexer != -1
1143 new_sp_values = self.sp_values[sp_indexer[value_mask]]
1144
1145 value_indices = np.flatnonzero(value_mask).astype(np.int32, copy=False)
1146
1147 new_sp_index = make_sparse_index(len(indices), value_indices, kind=self.kind)
1148 return type(self)._simple_new(new_sp_values, new_sp_index, dtype=self.dtype)
1149
1150 def searchsorted(
1151 self,
1152 v: ArrayLike | object,
1153 side: Literal["left", "right"] = "left",
1154 sorter: NumpySorter | None = None,
1155 ) -> npt.NDArray[np.intp] | np.intp:
1156 msg = "searchsorted requires high memory usage."
1157 warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level())
1158 v = np.asarray(v)
1159 return np.asarray(self, dtype=self.dtype.subtype).searchsorted(v, side, sorter)
1160
1161 def copy(self) -> Self:
1162 values = self.sp_values.copy()
1163 return self._simple_new(values, self.sp_index, self.dtype)
1164
1165 @classmethod
1166 def _concat_same_type(cls, to_concat: Sequence[Self]) -> Self:
1167 fill_value = to_concat[0].fill_value
1168
1169 values = []
1170 length = 0
1171
1172 if to_concat:
1173 sp_kind = to_concat[0].kind
1174 else:
1175 sp_kind = "integer"
1176
1177 sp_index: SparseIndex
1178 if sp_kind == "integer":
1179 indices = []
1180
1181 for arr in to_concat:
1182 int_idx = arr.sp_index.indices.copy()
1183 int_idx += length # TODO: wraparound
1184 length += arr.sp_index.length
1185
1186 values.append(arr.sp_values)
1187 indices.append(int_idx)
1188
1189 data = np.concatenate(values)
1190 indices_arr = np.concatenate(indices)
1191 # error: Argument 2 to "IntIndex" has incompatible type
1192 # "ndarray[Any, dtype[signedinteger[_32Bit]]]";
1193 # expected "Sequence[int]"
1194 sp_index = IntIndex(length, indices_arr) # type: ignore[arg-type]
1195
1196 else:
1197 # when concatenating block indices, we don't claim that you'll
1198 # get an identical index as concatenating the values and then
1199 # creating a new index. We don't want to spend the time trying
1200 # to merge blocks across arrays in `to_concat`, so the resulting
1201 # BlockIndex may have more blocks.
1202 blengths = []
1203 blocs = []
1204
1205 for arr in to_concat:
1206 block_idx = arr.sp_index.to_block_index()
1207
1208 values.append(arr.sp_values)
1209 blocs.append(block_idx.blocs.copy() + length)
1210 blengths.append(block_idx.blengths)
1211 length += arr.sp_index.length
1212
1213 data = np.concatenate(values)
1214 blocs_arr = np.concatenate(blocs)
1215 blengths_arr = np.concatenate(blengths)
1216
1217 sp_index = BlockIndex(length, blocs_arr, blengths_arr)
1218
1219 return cls(data, sparse_index=sp_index, fill_value=fill_value)
1220
1221 def astype(self, dtype: AstypeArg | None = None, copy: bool = True):
1222 """
1223 Change the dtype of a SparseArray.
1224
1225 The output will always be a SparseArray. To convert to a dense
1226 ndarray with a certain dtype, use :meth:`numpy.asarray`.
1227
1228 Parameters
1229 ----------
1230 dtype : np.dtype or ExtensionDtype
1231 For SparseDtype, this changes the dtype of
1232 ``self.sp_values`` and the ``self.fill_value``.
1233
1234 For other dtypes, this only changes the dtype of
1235 ``self.sp_values``.
1236
1237 copy : bool, default True
1238 Whether to ensure a copy is made, even if not necessary.
1239
1240 Returns
1241 -------
1242 SparseArray
1243
1244 Examples
1245 --------
1246 >>> arr = pd.arrays.SparseArray([0, 0, 1, 2])
1247 >>> arr
1248 [0, 0, 1, 2]
1249 Fill: 0
1250 IntIndex
1251 Indices: array([2, 3], dtype=int32)
1252
1253 >>> arr.astype(SparseDtype(np.dtype('int32')))
1254 [0, 0, 1, 2]
1255 Fill: 0
1256 IntIndex
1257 Indices: array([2, 3], dtype=int32)
1258
1259 Using a NumPy dtype with a different kind (e.g. float) will coerce
1260 just ``self.sp_values``.
1261
1262 >>> arr.astype(SparseDtype(np.dtype('float64')))
1263 ... # doctest: +NORMALIZE_WHITESPACE
1264 [nan, nan, 1.0, 2.0]
1265 Fill: nan
1266 IntIndex
1267 Indices: array([2, 3], dtype=int32)
1268
1269 Using a SparseDtype, you can also change the fill value as well.
1270
1271 >>> arr.astype(SparseDtype("float64", fill_value=0.0))
1272 ... # doctest: +NORMALIZE_WHITESPACE
1273 [0.0, 0.0, 1.0, 2.0]
1274 Fill: 0.0
1275 IntIndex
1276 Indices: array([2, 3], dtype=int32)
1277 """
1278 if dtype == self._dtype:
1279 if not copy:
1280 return self
1281 else:
1282 return self.copy()
1283
1284 future_dtype = pandas_dtype(dtype)
1285 if not isinstance(future_dtype, SparseDtype):
1286 # GH#34457
1287 values = np.asarray(self)
1288 values = ensure_wrapped_if_datetimelike(values)
1289 return astype_array(values, dtype=future_dtype, copy=False)
1290
1291 dtype = self.dtype.update_dtype(dtype)
1292 subtype = pandas_dtype(dtype._subtype_with_str)
1293 subtype = cast(np.dtype, subtype) # ensured by update_dtype
1294 values = ensure_wrapped_if_datetimelike(self.sp_values)
1295 sp_values = astype_array(values, subtype, copy=copy)
1296 sp_values = np.asarray(sp_values)
1297
1298 return self._simple_new(sp_values, self.sp_index, dtype)
1299
1300 def map(self, mapper, na_action=None) -> Self:
1301 """
1302 Map categories using an input mapping or function.
1303
1304 Parameters
1305 ----------
1306 mapper : dict, Series, callable
1307 The correspondence from old values to new.
1308 na_action : {None, 'ignore'}, default None
1309 If 'ignore', propagate NA values, without passing them to the
1310 mapping correspondence.
1311
1312 Returns
1313 -------
1314 SparseArray
1315 The output array will have the same density as the input.
1316 The output fill value will be the result of applying the
1317 mapping to ``self.fill_value``
1318
1319 Examples
1320 --------
1321 >>> arr = pd.arrays.SparseArray([0, 1, 2])
1322 >>> arr.map(lambda x: x + 10)
1323 [10, 11, 12]
1324 Fill: 10
1325 IntIndex
1326 Indices: array([1, 2], dtype=int32)
1327
1328 >>> arr.map({0: 10, 1: 11, 2: 12})
1329 [10, 11, 12]
1330 Fill: 10
1331 IntIndex
1332 Indices: array([1, 2], dtype=int32)
1333
1334 >>> arr.map(pd.Series([10, 11, 12], index=[0, 1, 2]))
1335 [10, 11, 12]
1336 Fill: 10
1337 IntIndex
1338 Indices: array([1, 2], dtype=int32)
1339 """
1340 is_map = isinstance(mapper, (abc.Mapping, ABCSeries))
1341
1342 fill_val = self.fill_value
1343
1344 if na_action is None or notna(fill_val):
1345 fill_val = mapper.get(fill_val, fill_val) if is_map else mapper(fill_val)
1346
1347 def func(sp_val):
1348 new_sp_val = mapper.get(sp_val, None) if is_map else mapper(sp_val)
1349 # check identity and equality because nans are not equal to each other
1350 if new_sp_val is fill_val or new_sp_val == fill_val:
1351 msg = "fill value in the sparse values not supported"
1352 raise ValueError(msg)
1353 return new_sp_val
1354
1355 sp_values = [func(x) for x in self.sp_values]
1356
1357 return type(self)(sp_values, sparse_index=self.sp_index, fill_value=fill_val)
1358
1359 def to_dense(self) -> np.ndarray:
1360 """
1361 Convert SparseArray to a NumPy array.
1362
1363 Returns
1364 -------
1365 arr : NumPy array
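
        Examples
        --------
        Illustrative:

        >>> pd.arrays.SparseArray([0, 1, 0]).to_dense()
        array([0, 1, 0])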
1366 """
1367 return np.asarray(self, dtype=self.sp_values.dtype)
1368
1369 def _where(self, mask, value):
1370 # NB: may not preserve dtype, e.g. result may be Sparse[float64]
1371 # while self is Sparse[int64]
1372 naive_implementation = np.where(mask, self, value)
1373 dtype = SparseDtype(naive_implementation.dtype, fill_value=self.fill_value)
1374 result = type(self)._from_sequence(naive_implementation, dtype=dtype)
1375 return result
1376
1377 # ------------------------------------------------------------------------
1378 # IO
1379 # ------------------------------------------------------------------------
1380 def __setstate__(self, state) -> None:
1381 """Necessary for making this object picklable"""
1382 if isinstance(state, tuple):
1383 # Compat for pandas < 0.24.0
1384 nd_state, (fill_value, sp_index) = state
1385 sparse_values = np.array([])
1386 sparse_values.__setstate__(nd_state)
1387
1388 self._sparse_values = sparse_values
1389 self._sparse_index = sp_index
1390 self._dtype = SparseDtype(sparse_values.dtype, fill_value)
1391 else:
1392 self.__dict__.update(state)
1393
1394 def nonzero(self) -> tuple[npt.NDArray[np.int32]]:
1395 if self.fill_value == 0:
1396 return (self.sp_index.indices,)
1397 else:
1398 return (self.sp_index.indices[self.sp_values != 0],)
1399
1400 # ------------------------------------------------------------------------
1401 # Reductions
1402 # ------------------------------------------------------------------------
1403
1404 def _reduce(
1405 self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
1406 ):
1407 method = getattr(self, name, None)
1408
1409 if method is None:
1410 raise TypeError(f"cannot perform {name} with type {self.dtype}")
1411
1412 if skipna:
1413 arr = self
1414 else:
1415 arr = self.dropna()
1416
1417 result = getattr(arr, name)(**kwargs)
1418
1419 if keepdims:
1420 return type(self)([result], dtype=self.dtype)
1421 else:
1422 return result
1423
1424 def all(self, axis=None, *args, **kwargs):
1425 """
1426 Tests whether all elements evaluate True
1427
1428 Returns
1429 -------
1430 all : bool
1431
1432 See Also
1433 --------
1434 numpy.all
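
        Examples
        --------
        Illustrative:

        >>> pd.arrays.SparseArray([True, True]).all()
        True
        >>> pd.arrays.SparseArray([True, False]).all()
        False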
1435 """
1436 nv.validate_all(args, kwargs)
1437
1438 values = self.sp_values
1439
1440 if len(values) != len(self) and not np.all(self.fill_value):
1441 return False
1442
1443 return values.all()
1444
1445 def any(self, axis: AxisInt = 0, *args, **kwargs) -> bool:
1446 """
1447 Tests whether at least one of elements evaluate True
1448
1449 Returns
1450 -------
1451 any : bool
1452
1453 See Also
1454 --------
1455 numpy.any
1456 """
1457 nv.validate_any(args, kwargs)
1458
1459 values = self.sp_values
1460
1461 if len(values) != len(self) and np.any(self.fill_value):
1462 return True
1463
1464 return values.any().item()
1465
1466 def sum(
1467 self,
1468 axis: AxisInt = 0,
1469 min_count: int = 0,
1470 skipna: bool = True,
1471 *args,
1472 **kwargs,
1473 ) -> Scalar:
1474 """
1475 Sum of non-NA/null values
1476
1477 Parameters
1478 ----------
1479 axis : int, default 0
1480 Not Used. NumPy compatibility.
1481 min_count : int, default 0
1482 The required number of valid values to perform the summation. If fewer
1483 than ``min_count`` valid values are present, the result will be the missing
1484 value indicator for subarray type.
1485 *args, **kwargs
1486 Not Used. NumPy compatibility.
1487
1488 Returns
1489 -------
1490 scalar
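
        Examples
        --------
        Illustrative:

        >>> pd.arrays.SparseArray([0, 1, 2]).sum()
        3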
1491 """
1492 nv.validate_sum(args, kwargs)
1493 valid_vals = self._valid_sp_values
1494 sp_sum = valid_vals.sum()
1495 has_na = self.sp_index.ngaps > 0 and not self._null_fill_value
1496
1497 if has_na and not skipna:
1498 return na_value_for_dtype(self.dtype.subtype, compat=False)
1499
1500 if self._null_fill_value:
1501 if check_below_min_count(valid_vals.shape, None, min_count):
1502 return na_value_for_dtype(self.dtype.subtype, compat=False)
1503 return sp_sum
1504 else:
1505 nsparse = self.sp_index.ngaps
1506 if check_below_min_count(valid_vals.shape, None, min_count - nsparse):
1507 return na_value_for_dtype(self.dtype.subtype, compat=False)
1508 return sp_sum + self.fill_value * nsparse
1509
1510 def cumsum(self, axis: AxisInt = 0, *args, **kwargs) -> SparseArray:
1511 """
1512 Cumulative sum of non-NA/null values.
1513
1514 When performing the cumulative summation, any non-NA/null values will
1515 be skipped. The resulting SparseArray will preserve the locations of
1516 NaN values, but the fill value will be `np.nan` regardless.
1517
1518 Parameters
1519 ----------
1520 axis : int or None
1521 Axis over which to perform the cumulative summation. If None,
1522 perform cumulative summation over flattened array.
1523
1524 Returns
1525 -------
1526 cumsum : SparseArray
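
        Examples
        --------
        An illustrative sketch with a null fill value:

        >>> arr = pd.arrays.SparseArray([1.0, np.nan, 2.0])
        >>> arr.cumsum()
        [1.0, nan, 3.0]
        Fill: nan
        IntIndex
        Indices: array([0, 2], dtype=int32)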
1527 """
1528 nv.validate_cumsum(args, kwargs)
1529
1530 if axis is not None and axis >= self.ndim: # Mimic ndarray behaviour.
1531 raise ValueError(f"axis(={axis}) out of bounds")
1532
1533 if not self._null_fill_value:
1534 return SparseArray(self.to_dense()).cumsum()
1535
1536 return SparseArray(
1537 self.sp_values.cumsum(),
1538 sparse_index=self.sp_index,
1539 fill_value=self.fill_value,
1540 )
1541
1542 def mean(self, axis: Axis = 0, *args, **kwargs):
1543 """
1544 Mean of non-NA/null values
1545
1546 Returns
1547 -------
1548 mean : float
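
        Examples
        --------
        Illustrative:

        >>> pd.arrays.SparseArray([0, 1, 2]).mean()
        1.0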
1549 """
1550 nv.validate_mean(args, kwargs)
1551 valid_vals = self._valid_sp_values
1552 sp_sum = valid_vals.sum()
1553 ct = len(valid_vals)
1554
1555 if self._null_fill_value:
1556 return sp_sum / ct
1557 else:
1558 nsparse = self.sp_index.ngaps
1559 return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)
1560
1561 def max(self, *, axis: AxisInt | None = None, skipna: bool = True):
1562 """
1563 Max of array values, ignoring NA values if specified.
1564
1565 Parameters
1566 ----------
1567 axis : int, default 0
1568 Not Used. NumPy compatibility.
1569 skipna : bool, default True
1570 Whether to ignore NA values.
1571
1572 Returns
1573 -------
1574 scalar
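
        Examples
        --------
        Illustrative:

        >>> pd.arrays.SparseArray([0, 1, 2]).max()
        2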
1575 """
1576 nv.validate_minmax_axis(axis, self.ndim)
1577 return self._min_max("max", skipna=skipna)
1578
1579 def min(self, *, axis: AxisInt | None = None, skipna: bool = True):
1580 """
1581 Min of array values, ignoring NA values if specified.
1582
1583 Parameters
1584 ----------
1585 axis : int, default 0
1586 Not Used. NumPy compatibility.
1587 skipna : bool, default True
1588 Whether to ignore NA values.
1589
1590 Returns
1591 -------
1592 scalar
1593 """
1594 nv.validate_minmax_axis(axis, self.ndim)
1595 return self._min_max("min", skipna=skipna)
1596
1597 def _min_max(self, kind: Literal["min", "max"], skipna: bool) -> Scalar:
1598 """
1599 Min/max of non-NA/null values
1600
1601 Parameters
1602 ----------
1603 kind : {"min", "max"}
1604 skipna : bool
1605
1606 Returns
1607 -------
1608 scalar
1609 """
1610 valid_vals = self._valid_sp_values
1611 has_nonnull_fill_vals = not self._null_fill_value and self.sp_index.ngaps > 0
1612
1613 if len(valid_vals) > 0:
1614 sp_min_max = getattr(valid_vals, kind)()
1615
1616 # If a non-null fill value is currently present, it might be the min/max
1617 if has_nonnull_fill_vals:
1618 func = max if kind == "max" else min
1619 return func(sp_min_max, self.fill_value)
1620 elif skipna:
1621 return sp_min_max
1622 elif self.sp_index.ngaps == 0:
1623 # No NAs present
1624 return sp_min_max
1625 else:
1626 return na_value_for_dtype(self.dtype.subtype, compat=False)
1627 elif has_nonnull_fill_vals:
1628 return self.fill_value
1629 else:
1630 return na_value_for_dtype(self.dtype.subtype, compat=False)
1631
1632 def _argmin_argmax(self, kind: Literal["argmin", "argmax"]) -> int:
1633 values = self._sparse_values
1634 index = self._sparse_index.indices
1635 mask = np.asarray(isna(values))
1636 func = np.argmax if kind == "argmax" else np.argmin
1637
1638 idx = np.arange(values.shape[0])
1639 non_nans = values[~mask]
1640 non_nan_idx = idx[~mask]
1641
1642 _candidate = non_nan_idx[func(non_nans)]
1643 candidate = index[_candidate]
1644
1645 if isna(self.fill_value):
1646 return candidate
1647 if kind == "argmin" and self[candidate] < self.fill_value:
1648 return candidate
1649 if kind == "argmax" and self[candidate] > self.fill_value:
1650 return candidate
1651 _loc = self._first_fill_value_loc()
1652 if _loc == -1:
1653 # fill_value doesn't exist
1654 return candidate
1655 else:
1656 return _loc
1657
1658 def argmax(self, skipna: bool = True) -> int:
1659 validate_bool_kwarg(skipna, "skipna")
1660 if not skipna and self._hasna:
1661 raise NotImplementedError
1662 return self._argmin_argmax("argmax")
1663
1664 def argmin(self, skipna: bool = True) -> int:
1665 validate_bool_kwarg(skipna, "skipna")
1666 if not skipna and self._hasna:
1667 raise NotImplementedError
1668 return self._argmin_argmax("argmin")
1669
1670 # ------------------------------------------------------------------------
1671 # Ufuncs
1672 # ------------------------------------------------------------------------
1673
1674 _HANDLED_TYPES = (np.ndarray, numbers.Number)
1675
1676 def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
1677 out = kwargs.get("out", ())
1678
1679 for x in inputs + out:
1680 if not isinstance(x, self._HANDLED_TYPES + (SparseArray,)):
1681 return NotImplemented
1682
1683 # for binary ops, use our custom dunder methods
1684 result = arraylike.maybe_dispatch_ufunc_to_dunder_op(
1685 self, ufunc, method, *inputs, **kwargs
1686 )
1687 if result is not NotImplemented:
1688 return result
1689
1690 if "out" in kwargs:
1691 # e.g. tests.arrays.sparse.test_arithmetics.test_ndarray_inplace
1692 res = arraylike.dispatch_ufunc_with_out(
1693 self, ufunc, method, *inputs, **kwargs
1694 )
1695 return res
1696
1697 if method == "reduce":
1698 result = arraylike.dispatch_reduction_ufunc(
1699 self, ufunc, method, *inputs, **kwargs
1700 )
1701 if result is not NotImplemented:
1702 # e.g. tests.series.test_ufunc.TestNumpyReductions
1703 return result
1704
1705 if len(inputs) == 1:
1706 # No alignment necessary.
1707 sp_values = getattr(ufunc, method)(self.sp_values, **kwargs)
1708 fill_value = getattr(ufunc, method)(self.fill_value, **kwargs)
1709
1710 if ufunc.nout > 1:
1711 # multiple outputs. e.g. modf
1712 arrays = tuple(
1713 self._simple_new(
1714 sp_value, self.sp_index, SparseDtype(sp_value.dtype, fv)
1715 )
1716 for sp_value, fv in zip(sp_values, fill_value)
1717 )
1718 return arrays
1719 elif method == "reduce":
1720 # e.g. reductions
1721 return sp_values
1722
1723 return self._simple_new(
1724 sp_values, self.sp_index, SparseDtype(sp_values.dtype, fill_value)
1725 )
1726
1727 new_inputs = tuple(np.asarray(x) for x in inputs)
1728 result = getattr(ufunc, method)(*new_inputs, **kwargs)
1729 if out:
1730 if len(out) == 1:
1731 out = out[0]
1732 return out
1733
1734 if ufunc.nout > 1:
1735 return tuple(type(self)(x) for x in result)
1736 elif method == "at":
1737 # no return value
1738 return None
1739 else:
1740 return type(self)(result)
1741
1742 # ------------------------------------------------------------------------
1743 # Ops
1744 # ------------------------------------------------------------------------
1745
1746 def _arith_method(self, other, op):
1747 op_name = op.__name__
1748
1749 if isinstance(other, SparseArray):
1750 return _sparse_array_op(self, other, op, op_name)
1751
1752 elif is_scalar(other):
1753 with np.errstate(all="ignore"):
1754 fill = op(_get_fill(self), np.asarray(other))
1755 result = op(self.sp_values, other)
1756
1757 if op_name == "divmod":
1758 left, right = result
1759 lfill, rfill = fill
1760 return (
1761 _wrap_result(op_name, left, self.sp_index, lfill),
1762 _wrap_result(op_name, right, self.sp_index, rfill),
1763 )
1764
1765 return _wrap_result(op_name, result, self.sp_index, fill)
1766
1767 else:
1768 other = np.asarray(other)
1769 with np.errstate(all="ignore"):
1770 if len(self) != len(other):
1771 raise AssertionError(
1772 f"length mismatch: {len(self)} vs. {len(other)}"
1773 )
1774 if not isinstance(other, SparseArray):
1775 dtype = getattr(other, "dtype", None)
1776 other = SparseArray(other, fill_value=self.fill_value, dtype=dtype)
1777 return _sparse_array_op(self, other, op, op_name)
1778
1779 def _cmp_method(self, other, op) -> SparseArray:
1780 if not is_scalar(other) and not isinstance(other, type(self)):
1781 # convert list-like to ndarray
1782 other = np.asarray(other)
1783
1784 if isinstance(other, np.ndarray):
1785 # TODO: make this more flexible than just ndarray...
1786 other = SparseArray(other, fill_value=self.fill_value)
1787
1788 if isinstance(other, SparseArray):
1789 if len(self) != len(other):
1790 raise ValueError(
1791 f"operands have mismatched length {len(self)} and {len(other)}"
1792 )
1793
1794 op_name = op.__name__.strip("_")
1795 return _sparse_array_op(self, other, op, op_name)
1796 else:
1797 # scalar
1798 fill_value = op(self.fill_value, other)
1799 result = np.full(len(self), fill_value, dtype=np.bool_)
1800 result[self.sp_index.indices] = op(self.sp_values, other)
1801
1802 return type(self)(
1803 result,
1804 fill_value=fill_value,
1805 dtype=np.bool_,
1806 )
1807
1808 _logical_method = _cmp_method
1809
1810 def _unary_method(self, op) -> SparseArray:
1811 fill_value = op(np.array(self.fill_value)).item()
1812 dtype = SparseDtype(self.dtype.subtype, fill_value)
1813 # NOTE: if fill_value doesn't change
1814 # we just have to apply op to sp_values
1815 if isna(self.fill_value) or fill_value == self.fill_value:
1816 values = op(self.sp_values)
1817 return type(self)._simple_new(values, self.sp_index, self.dtype)
1818 # In the other case we have to recalc indexes
1819 return type(self)(op(self.to_dense()), dtype=dtype)
1820
1821 def __pos__(self) -> SparseArray:
1822 return self._unary_method(operator.pos)
1823
1824 def __neg__(self) -> SparseArray:
1825 return self._unary_method(operator.neg)
1826
1827 def __invert__(self) -> SparseArray:
1828 return self._unary_method(operator.invert)
1829
1830 def __abs__(self) -> SparseArray:
1831 return self._unary_method(operator.abs)
1832
1833 # ----------
1834 # Formatting
1835 # -----------
1836 def __repr__(self) -> str:
1837 pp_str = printing.pprint_thing(self)
1838 pp_fill = printing.pprint_thing(self.fill_value)
1839 pp_index = printing.pprint_thing(self.sp_index)
1840 return f"{pp_str}\nFill: {pp_fill}\n{pp_index}"
1841
1842 def _formatter(self, boxed: bool = False):
1843 # Defer to the formatter from the GenericArrayFormatter calling us.
1844 # This will infer the correct formatter from the dtype of the values.
1845 return None
1846
1847
1848def _make_sparse(
1849 arr: np.ndarray,
1850 kind: SparseIndexKind = "block",
1851 fill_value=None,
1852 dtype: np.dtype | None = None,
1853):
1854 """
1855 Convert ndarray to sparse format
1856
1857 Parameters
1858 ----------
1859 arr : ndarray
1860 kind : {'block', 'integer'}
1861 fill_value : NaN or another value
1862 dtype : np.dtype, optional
1863 copy : bool, default False
1864
1865 Returns
1866 -------
1867 (sparse_values, index, fill_value) : (ndarray, SparseIndex, Scalar)
1868 """
1869 assert isinstance(arr, np.ndarray)
1870
1871 if arr.ndim > 1:
1872 raise TypeError("expected dimension <= 1 data")
1873
1874 if fill_value is None:
1875 fill_value = na_value_for_dtype(arr.dtype)
1876
1877 if isna(fill_value):
1878 mask = notna(arr)
1879 else:
1880 # cast to object comparison to be safe
1881 if is_string_dtype(arr.dtype):
1882 arr = arr.astype(object)
1883
1884 if is_object_dtype(arr.dtype):
1885 # element-wise equality check method in numpy doesn't treat
1886 # each element type, eg. 0, 0.0, and False are treated as
1887 # same. So we have to check the both of its type and value.
1888 mask = splib.make_mask_object_ndarray(arr, fill_value)
1889 else:
1890 mask = arr != fill_value
1891
1892 length = len(arr)
1893 if length != len(mask):
1894 # the arr is a SparseArray
1895 indices = mask.sp_index.indices
1896 else:
1897 indices = mask.nonzero()[0].astype(np.int32)
1898
1899 index = make_sparse_index(length, indices, kind)
1900 sparsified_values = arr[mask]
1901 if dtype is not None:
1902 sparsified_values = ensure_wrapped_if_datetimelike(sparsified_values)
1903 sparsified_values = astype_array(sparsified_values, dtype=dtype)
1904 sparsified_values = np.asarray(sparsified_values)
1905
1906 # TODO: copy
1907 return sparsified_values, index, fill_value
1908
1909
1910@overload
1911def make_sparse_index(length: int, indices, kind: Literal["block"]) -> BlockIndex:
1912 ...
1913
1914
1915@overload
1916def make_sparse_index(length: int, indices, kind: Literal["integer"]) -> IntIndex:
1917 ...
1918
1919
1920def make_sparse_index(length: int, indices, kind: SparseIndexKind) -> SparseIndex:
1921 index: SparseIndex
1922 if kind == "block":
1923 locs, lens = splib.get_blocks(indices)
1924 index = BlockIndex(length, locs, lens)
1925 elif kind == "integer":
1926 index = IntIndex(length, indices)
1927 else: # pragma: no cover
1928 raise ValueError("must be block or integer type")
1929 return index
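

# Illustrative usage of ``make_sparse_index`` (a comment-only sketch, not
# executed at import; the expected repr assumes the int32 indices shown):
#
#   >>> make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="integer")
#   IntIndex
#   Indices: array([2, 3], dtype=int32)
#
# Passing ``kind="block"`` instead returns a BlockIndex describing the same
# positions as contiguous (location, length) blocks.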