1"""
2Base and utility classes for pandas objects.
3"""
5from __future__ import annotations
7import textwrap
8from typing import (
9 TYPE_CHECKING,
10 Any,
11 Generic,
12 Hashable,
13 Iterator,
14 Literal,
15 TypeVar,
16 cast,
17 final,
18 overload,
19)
21import numpy as np
23from pandas._config import using_copy_on_write
25from pandas._libs import lib
26from pandas._typing import (
27 Axis,
28 AxisInt,
29 DtypeObj,
30 IndexLabel,
31 NDFrameT,
32 Shape,
33 npt,
34)
35from pandas.compat import PYPY
36from pandas.compat.numpy import function as nv
37from pandas.errors import AbstractMethodError
38from pandas.util._decorators import (
39 cache_readonly,
40 doc,
41)
43from pandas.core.dtypes.cast import can_hold_element
44from pandas.core.dtypes.common import (
45 is_categorical_dtype,
46 is_dict_like,
47 is_extension_array_dtype,
48 is_object_dtype,
49 is_scalar,
50)
51from pandas.core.dtypes.generic import (
52 ABCDataFrame,
53 ABCIndex,
54 ABCSeries,
55)
56from pandas.core.dtypes.missing import (
57 isna,
58 remove_na_arraylike,
59)
61from pandas.core import (
62 algorithms,
63 nanops,
64 ops,
65)
66from pandas.core.accessor import DirNamesMixin
67from pandas.core.arraylike import OpsMixin
68from pandas.core.arrays import ExtensionArray
69from pandas.core.construction import (
70 ensure_wrapped_if_datetimelike,
71 extract_array,
72)
74if TYPE_CHECKING:
75 from pandas._typing import (
76 DropKeep,
77 NumpySorter,
78 NumpyValueArrayLike,
79 ScalarLike_co,
80 )
82 from pandas import (
83 Categorical,
84 Index,
85 Series,
86 )
89_shared_docs: dict[str, str] = {}
90_indexops_doc_kwargs = {
91 "klass": "IndexOpsMixin",
92 "inplace": "",
93 "unique": "IndexOpsMixin",
94 "duplicated": "IndexOpsMixin",
95}
97_T = TypeVar("_T", bound="IndexOpsMixin")


class PandasObject(DirNamesMixin):
    """
    Base class for various pandas objects.
    """

    # results from calls to methods decorated with cache_readonly get added to _cache
    _cache: dict[str, Any]

    @property
    def _constructor(self):
        """
        Class constructor (for this class it's just `__class__`).
        """
        return type(self)

    def __repr__(self) -> str:
        """
        Return a string representation for a particular object.
        """
        # Should be overwritten by base classes
        return object.__repr__(self)

    def _reset_cache(self, key: str | None = None) -> None:
        """
        Reset cached properties. If ``key`` is passed, only clears that key.
        """
        if not hasattr(self, "_cache"):
            return
        if key is None:
            self._cache.clear()
        else:
            self._cache.pop(key, None)

    def __sizeof__(self) -> int:
        """
        Generates the total memory usage for an object that returns
        either a value or a Series of values.
        """
        memory_usage = getattr(self, "memory_usage", None)
        if memory_usage:
            mem = memory_usage(deep=True)  # pylint: disable=not-callable
            return int(mem if is_scalar(mem) else mem.sum())

        # no memory_usage attribute, so fall back to object's 'sizeof'
        return super().__sizeof__()
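

# Illustrative sketch, not part of the original pandas source: how results
# cached by ``cache_readonly`` land in ``_cache`` and how ``_reset_cache``
# clears them. ``_CachedDemo`` and ``_demo_reset_cache`` are hypothetical
# names used only for this example.
def _demo_reset_cache() -> None:
    class _CachedDemo(PandasObject):
        @cache_readonly
        def answer(self) -> int:
            # computed once; the result is stored in self._cache["answer"]
            return 42

    obj = _CachedDemo()
    assert obj.answer == 42
    assert "answer" in obj._cache  # populated on first access
    obj._reset_cache("answer")  # clears only this key
    assert "answer" not in obj._cache
    obj._reset_cache()  # no key: clears the whole cache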


class NoNewAttributesMixin:
    """
    Mixin which prevents adding new attributes.

    Prevents additional attributes via xxx.attribute = "something" after a
    call to `self._freeze()`. Mainly used to prevent the user from using
    wrong attributes on an accessor (`Series.cat/.str/.dt`).

    If you really want to add a new attribute at a later time, you need to use
    `object.__setattr__(self, key, value)`.
    """

    def _freeze(self) -> None:
        """
        Prevents setting additional attributes.
        """
        object.__setattr__(self, "__frozen", True)

    # prevent adding any attribute via s.xxx.new_attribute = ...
    def __setattr__(self, key: str, value) -> None:
        # _cache is used by a decorator
        # We need to check both 1.) cls.__dict__ and 2.) getattr(self, key)
        # because
        # 1.) getattr is false for attributes that raise errors
        # 2.) cls.__dict__ doesn't traverse into base classes
        if getattr(self, "__frozen", False) and not (
            key == "_cache"
            or key in type(self).__dict__
            or getattr(self, key, None) is not None
        ):
            raise AttributeError(f"You cannot add any new attribute '{key}'")
        object.__setattr__(self, key, value)
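

# Illustrative sketch, not part of the original pandas source: a minimal
# demonstration of the freeze behavior for a hypothetical subclass.
# ``_FrozenDemo`` and ``_demo_freeze`` are names used only for this example.
def _demo_freeze() -> None:
    class _FrozenDemo(NoNewAttributesMixin):
        def __init__(self) -> None:
            self.allowed = 1  # setting attributes is fine before freezing
            self._freeze()

    obj = _FrozenDemo()
    obj.allowed = 2  # existing attributes can still be reassigned
    try:
        obj.typo = 3  # new attributes raise after _freeze()
    except AttributeError:
        pass
    # the documented escape hatch still works:
    object.__setattr__(obj, "typo", 3)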


class SelectionMixin(Generic[NDFrameT]):
    """
    Mixin implementing the selection & aggregation interface on a group-like
    object. Sub-classes need to define: obj, exclusions.
    """

    obj: NDFrameT
    _selection: IndexLabel | None = None
    exclusions: frozenset[Hashable]
    _internal_names = ["_cache", "__setstate__"]
    _internal_names_set = set(_internal_names)

    @final
    @property
    def _selection_list(self):
        if not isinstance(
            self._selection, (list, tuple, ABCSeries, ABCIndex, np.ndarray)
        ):
            return [self._selection]
        return self._selection

    @cache_readonly
    def _selected_obj(self):
        if self._selection is None or isinstance(self.obj, ABCSeries):
            return self.obj
        else:
            return self.obj[self._selection]

    @final
    @cache_readonly
    def ndim(self) -> int:
        return self._selected_obj.ndim

    @final
    @cache_readonly
    def _obj_with_exclusions(self):
        if isinstance(self.obj, ABCSeries):
            return self.obj

        if self._selection is not None:
            return self.obj._getitem_nocopy(self._selection_list)

        if len(self.exclusions) > 0:
            # equivalent to `self.obj.drop(self.exclusions, axis=1)`
            # but this avoids consolidating and making a copy
            # TODO: following GH#45287 can we now use .drop directly without
            #  making a copy?
            return self.obj._drop_axis(self.exclusions, axis=1, only_slice=True)
        else:
            return self.obj

    def __getitem__(self, key):
        if self._selection is not None:
            raise IndexError(f"Column(s) {self._selection} already selected")

        if isinstance(key, (list, tuple, ABCSeries, ABCIndex, np.ndarray)):
            if len(self.obj.columns.intersection(key)) != len(set(key)):
                bad_keys = list(set(key).difference(self.obj.columns))
                raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}")
            return self._gotitem(list(key), ndim=2)

        else:
            if key not in self.obj:
                raise KeyError(f"Column not found: {key}")
            ndim = self.obj[key].ndim
            return self._gotitem(key, ndim=ndim)

    def _gotitem(self, key, ndim: int, subset=None):
        """
        Sub-classes to define. Return a sliced object.

        Parameters
        ----------
        key : str / list of selections
        ndim : {1, 2}
            requested ndim of result
        subset : object, default None
            subset to act on
        """
        raise AbstractMethodError(self)

    def aggregate(self, func, *args, **kwargs):
        raise AbstractMethodError(self)

    agg = aggregate
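

# Illustrative sketch, not part of the original pandas source: SelectionMixin
# is the machinery behind ``df.groupby(...)["col"]``-style selection. A minimal
# hypothetical subclass (``_Selector``, illustration only) needs ``obj``,
# ``exclusions`` and ``_gotitem``.
def _demo_selection() -> None:
    from pandas import DataFrame

    class _Selector(SelectionMixin):
        def __init__(self, obj) -> None:
            self.obj = obj
            self.exclusions = frozenset()

        def _gotitem(self, key, ndim: int, subset=None):
            # real subclasses return a new grouper-like object here;
            # for illustration we just echo the selection
            return key

    sel = _Selector(DataFrame({"a": [1], "b": [2]}))
    assert sel["a"] == "a"  # single column -> ndim 1
    assert sel[["a", "b"]] == ["a", "b"]  # list of columns -> ndim 2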


class IndexOpsMixin(OpsMixin):
    """
    Common ops mixin to support a unified interface / docs for Series / Index
    """

    # ndarray compatibility
    __array_priority__ = 1000
    _hidden_attrs: frozenset[str] = frozenset(
        ["tolist"]  # tolist is not deprecated, just suppressed in the __dir__
    )

    @property
    def dtype(self) -> DtypeObj:
        # must be defined here as a property for mypy
        raise AbstractMethodError(self)

    @property
    def _values(self) -> ExtensionArray | np.ndarray:
        # must be defined here as a property for mypy
        raise AbstractMethodError(self)

    @final
    def transpose(self: _T, *args, **kwargs) -> _T:
        """
        Return the transpose, which is by definition self.

        Returns
        -------
        %(klass)s
        """
        nv.validate_transpose(args, kwargs)
        return self

    T = property(
        transpose,
        doc="""
        Return the transpose, which is by definition self.
        """,
    )

    @property
    def shape(self) -> Shape:
        """
        Return a tuple of the shape of the underlying data.

        Examples
        --------
        >>> s = pd.Series([1, 2, 3])
        >>> s.shape
        (3,)
        """
        return self._values.shape

    def __len__(self) -> int:
        # We need this defined here for mypy
        raise AbstractMethodError(self)

    @property
    def ndim(self) -> Literal[1]:
        """
        Number of dimensions of the underlying data, by definition 1.
        """
        return 1

    @final
    def item(self):
        """
        Return the first element of the underlying data as a Python scalar.

        Returns
        -------
        scalar
            The first element of %(klass)s.

        Raises
        ------
        ValueError
            If the data is not length-1.
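
        Examples
        --------
        For a length-1 Series or Index, the single element is returned:

        >>> s = pd.Series([1])
        >>> s.item()
        1

        >>> idx = pd.Index([1])
        >>> idx.item()
        1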
347 """
348 if len(self) == 1:
349 return next(iter(self))
350 raise ValueError("can only convert an array of size 1 to a Python scalar")
352 @property
353 def nbytes(self) -> int:
354 """
355 Return the number of bytes in the underlying data.
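
        Examples
        --------
        For three int64 values, eight bytes each:

        >>> s = pd.Series([1, 2, 3], dtype="int64")
        >>> s.nbytes
        24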
356 """
357 return self._values.nbytes
359 @property
360 def size(self) -> int:
361 """
362 Return the number of elements in the underlying data.
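
        Examples
        --------
        >>> s = pd.Series([1, 2, 3])
        >>> s.size
        3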
363 """
364 return len(self._values)
366 @property
367 def array(self) -> ExtensionArray:
368 """
369 The ExtensionArray of the data backing this Series or Index.
371 Returns
372 -------
373 ExtensionArray
374 An ExtensionArray of the values stored within. For extension
375 types, this is the actual array. For NumPy native types, this
376 is a thin (no copy) wrapper around :class:`numpy.ndarray`.
378 ``.array`` differs ``.values`` which may require converting the
379 data to a different form.
381 See Also
382 --------
383 Index.to_numpy : Similar method that always returns a NumPy array.
384 Series.to_numpy : Similar method that always returns a NumPy array.
386 Notes
387 -----
388 This table lays out the different array types for each extension
389 dtype within pandas.
391 ================== =============================
392 dtype array type
393 ================== =============================
394 category Categorical
395 period PeriodArray
396 interval IntervalArray
397 IntegerNA IntegerArray
398 string StringArray
399 boolean BooleanArray
400 datetime64[ns, tz] DatetimeArray
401 ================== =============================
403 For any 3rd-party extension types, the array type will be an
404 ExtensionArray.
406 For all remaining dtypes ``.array`` will be a
407 :class:`arrays.NumpyExtensionArray` wrapping the actual ndarray
408 stored within. If you absolutely need a NumPy array (possibly with
409 copying / coercing data), then use :meth:`Series.to_numpy` instead.
411 Examples
412 --------
413 For regular NumPy types like int, and float, a PandasArray
414 is returned.
416 >>> pd.Series([1, 2, 3]).array
417 <PandasArray>
418 [1, 2, 3]
419 Length: 3, dtype: int64
421 For extension types, like Categorical, the actual ExtensionArray
422 is returned
424 >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
425 >>> ser.array
426 ['a', 'b', 'a']
427 Categories (2, object): ['a', 'b']
428 """
429 raise AbstractMethodError(self)

    @final
    def to_numpy(
        self,
        dtype: npt.DTypeLike | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
        **kwargs,
    ) -> np.ndarray:
        """
        A NumPy ndarray representing the values in this Series or Index.

        Parameters
        ----------
        dtype : str or numpy.dtype, optional
            The dtype to pass to :meth:`numpy.asarray`.
        copy : bool, default False
            Whether to ensure that the returned value is not a view on
            another array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
            a copy is made, even if not strictly necessary.
        na_value : Any, optional
            The value to use for missing values. The default value depends
            on `dtype` and the type of the array.
        **kwargs
            Additional keywords passed through to the ``to_numpy`` method
            of the underlying array (for extension arrays).

        Returns
        -------
        numpy.ndarray

        See Also
        --------
        Series.array : Get the actual data stored within.
        Index.array : Get the actual data stored within.
        DataFrame.to_numpy : Similar method for DataFrame.

        Notes
        -----
        The returned array will be the same up to equality (values equal
        in `self` will be equal in the returned array; likewise for values
        that are not equal). When `self` contains an ExtensionArray, the
        dtype may be different. For example, for a category-dtype Series,
        ``to_numpy()`` will return a NumPy array and the categorical dtype
        will be lost.

        For NumPy dtypes, this will be a reference to the actual data stored
        in this Series or Index (assuming ``copy=False``). Modifying the result
        in place will modify the data stored in the Series or Index (not that
        we recommend doing that).

        For extension types, ``to_numpy()`` *may* require copying data and
        coercing the result to a NumPy type (possibly object), which may be
        expensive. When you need a no-copy reference to the underlying data,
        :attr:`Series.array` should be used instead.

        This table lays out the different dtypes and default return types of
        ``to_numpy()`` for various dtypes within pandas.

        ================== ================================
        dtype              array type
        ================== ================================
        category[T]        ndarray[T] (same dtype as input)
        period             ndarray[object] (Periods)
        interval           ndarray[object] (Intervals)
        IntegerNA          ndarray[object]
        datetime64[ns]     datetime64[ns]
        datetime64[ns, tz] ndarray[object] (Timestamps)
        ================== ================================

        Examples
        --------
        >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
        >>> ser.to_numpy()
        array(['a', 'b', 'a'], dtype=object)

        Specify the `dtype` to control how datetime-aware data is represented.
        Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp`
        objects, each with the correct ``tz``.

        >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
        >>> ser.to_numpy(dtype=object)
        array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'),
               Timestamp('2000-01-02 00:00:00+0100', tz='CET')],
              dtype=object)

        Or ``dtype='datetime64[ns]'`` to return an ndarray of native
        datetime64 values. The values are converted to UTC and the timezone
        info is dropped.

        >>> ser.to_numpy(dtype="datetime64[ns]")
        ... # doctest: +ELLIPSIS
        array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
              dtype='datetime64[ns]')
        """
        if is_extension_array_dtype(self.dtype):
            return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs)
        elif kwargs:
            bad_keys = list(kwargs.keys())[0]
            raise TypeError(
                f"to_numpy() got an unexpected keyword argument '{bad_keys}'"
            )

        if na_value is not lib.no_default:
            values = self._values
            if not can_hold_element(values, na_value):
                # if we can't hold the na_value asarray either makes a copy or we
                # error before modifying values. The asarray later on thus won't make
                # another copy
                values = np.asarray(values, dtype=dtype)
            else:
                values = values.copy()

            values[np.asanyarray(self.isna())] = na_value
        else:
            values = self._values

        result = np.asarray(values, dtype=dtype)

        if (copy and na_value is lib.no_default) or (
            not copy and using_copy_on_write()
        ):
            if np.shares_memory(self._values[:2], result[:2]):
                # Take slices to improve performance of check
                if using_copy_on_write() and not copy:
                    result = result.view()
                    result.flags.writeable = False
                else:
                    result = result.copy()

        return result

    @final
    @property
    def empty(self) -> bool:
        return not self.size

    def max(self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs):
        """
        Return the maximum value of the Index.

        Parameters
        ----------
        axis : int, optional
            For compatibility with NumPy. Only 0 or None are allowed.
        skipna : bool, default True
            Exclude NA/null values when showing the result.
        *args, **kwargs
            Additional arguments and keywords for compatibility with NumPy.

        Returns
        -------
        scalar
            Maximum value.

        See Also
        --------
        Index.min : Return the minimum value in an Index.
        Series.max : Return the maximum value in a Series.
        DataFrame.max : Return the maximum values in a DataFrame.

        Examples
        --------
        >>> idx = pd.Index([3, 2, 1])
        >>> idx.max()
        3

        >>> idx = pd.Index(['c', 'b', 'a'])
        >>> idx.max()
        'c'

        For a MultiIndex, the maximum is determined lexicographically.

        >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])
        >>> idx.max()
        ('b', 2)
        """
        nv.validate_minmax_axis(axis)
        nv.validate_max(args, kwargs)
        return nanops.nanmax(self._values, skipna=skipna)

    @doc(op="max", oppose="min", value="largest")
    def argmax(
        self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs
    ) -> int:
        """
        Return int position of the {value} value in the Series.

        If the {op}imum is achieved in multiple locations,
        the first row position is returned.

        Parameters
        ----------
        axis : {{None}}
            Unused. Parameter needed for compatibility with DataFrame.
        skipna : bool, default True
            Exclude NA/null values when showing the result.
        *args, **kwargs
            Additional arguments and keywords for compatibility with NumPy.

        Returns
        -------
        int
            Row position of the {op}imum value.

        See Also
        --------
        Series.arg{op} : Return position of the {op}imum value.
        Series.arg{oppose} : Return position of the {oppose}imum value.
        numpy.ndarray.arg{op} : Equivalent method for numpy arrays.
        Series.idxmax : Return index label of the maximum values.
        Series.idxmin : Return index label of the minimum values.

        Examples
        --------
        Consider dataset containing cereal calories

        >>> s = pd.Series({{'Corn Flakes': 100.0, 'Almond Delight': 110.0,
        ...                'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0}})
        >>> s
        Corn Flakes              100.0
        Almond Delight           110.0
        Cinnamon Toast Crunch    120.0
        Cocoa Puff               110.0
        dtype: float64

        >>> s.argmax()
        2
        >>> s.argmin()
        0

        The maximum cereal calories is the third element and
        the minimum cereal calories is the first element,
        since series is zero-indexed.
        """
        delegate = self._values
        nv.validate_minmax_axis(axis)
        skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs)

        if isinstance(delegate, ExtensionArray):
            if not skipna and delegate.isna().any():
                return -1
            else:
                return delegate.argmax()
        else:
            # error: Incompatible return value type (got "Union[int, ndarray]", expected
            # "int")
            return nanops.nanargmax(  # type: ignore[return-value]
                delegate, skipna=skipna
            )

    def min(self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs):
        """
        Return the minimum value of the Index.

        Parameters
        ----------
        axis : {None}
            Dummy argument for consistency with Series.
        skipna : bool, default True
            Exclude NA/null values when showing the result.
        *args, **kwargs
            Additional arguments and keywords for compatibility with NumPy.

        Returns
        -------
        scalar
            Minimum value.

        See Also
        --------
        Index.max : Return the maximum value of the object.
        Series.min : Return the minimum value in a Series.
        DataFrame.min : Return the minimum values in a DataFrame.

        Examples
        --------
        >>> idx = pd.Index([3, 2, 1])
        >>> idx.min()
        1

        >>> idx = pd.Index(['c', 'b', 'a'])
        >>> idx.min()
        'a'

        For a MultiIndex, the minimum is determined lexicographically.

        >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])
        >>> idx.min()
        ('a', 1)
        """
        nv.validate_minmax_axis(axis)
        nv.validate_min(args, kwargs)
        return nanops.nanmin(self._values, skipna=skipna)

    @doc(argmax, op="min", oppose="max", value="smallest")
    def argmin(
        self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs
    ) -> int:
        delegate = self._values
        nv.validate_minmax_axis(axis)
        skipna = nv.validate_argmin_with_skipna(skipna, args, kwargs)

        if isinstance(delegate, ExtensionArray):
            if not skipna and delegate.isna().any():
                return -1
            else:
                return delegate.argmin()
        else:
            # error: Incompatible return value type (got "Union[int, ndarray]", expected
            # "int")
            return nanops.nanargmin(  # type: ignore[return-value]
                delegate, skipna=skipna
            )

    def tolist(self):
        """
        Return a list of the values.

        These are each a scalar type, which is a Python scalar
        (for str, int, float) or a pandas scalar
        (for Timestamp/Timedelta/Interval/Period).

        Returns
        -------
        list

        See Also
        --------
        numpy.ndarray.tolist : Return the array as an a.ndim-levels deep
            nested list of Python scalars.
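
        Examples
        --------
        >>> s = pd.Series([1, 2, 3])
        >>> s.tolist()
        [1, 2, 3]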
762 """
763 return self._values.tolist()
765 to_list = tolist
767 def __iter__(self) -> Iterator:
768 """
769 Return an iterator of the values.
771 These are each a scalar type, which is a Python scalar
772 (for str, int, float) or a pandas scalar
773 (for Timestamp/Timedelta/Interval/Period)
775 Returns
776 -------
777 iterator
778 """
779 # We are explicitly making element iterators.
780 if not isinstance(self._values, np.ndarray):
781 # Check type instead of dtype to catch DTA/TDA
782 return iter(self._values)
783 else:
784 return map(self._values.item, range(self._values.size))
786 @cache_readonly
787 def hasnans(self) -> bool:
788 """
789 Return True if there are any NaNs.
791 Enables various performance speedups.
793 Returns
794 -------
795 bool
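
        Examples
        --------
        >>> s = pd.Series([1, 2, 3, None])
        >>> s.hasnans
        True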
796 """
797 # error: Item "bool" of "Union[bool, ndarray[Any, dtype[bool_]], NDFrame]"
798 # has no attribute "any"
799 return bool(isna(self).any()) # type: ignore[union-attr]
801 def isna(self) -> npt.NDArray[np.bool_]:
802 return isna(self._values)
804 def _reduce(
805 self,
806 op,
807 name: str,
808 *,
809 axis: Axis = 0,
810 skipna: bool = True,
811 numeric_only=None,
812 filter_type=None,
813 **kwds,
814 ):
815 """
816 Perform the reduction type operation if we can.
817 """
818 func = getattr(self, name, None)
819 if func is None:
820 raise TypeError(
821 f"{type(self).__name__} cannot perform the operation {name}"
822 )
823 return func(skipna=skipna, **kwds)
825 @final
826 def _map_values(self, mapper, na_action=None):
827 """
828 An internal function that maps values using the input
829 correspondence (which can be a dict, Series, or function).
831 Parameters
832 ----------
833 mapper : function, dict, or Series
834 The input correspondence object
835 na_action : {None, 'ignore'}
836 If 'ignore', propagate NA values, without passing them to the
837 mapping function
839 Returns
840 -------
841 Union[Index, MultiIndex], inferred
842 The output of the mapping function applied to the index.
843 If the function returns a tuple with more than one element
844 a MultiIndex will be returned.
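
        Examples
        --------
        Exercised via the public ``Series.map`` wrapper; keys missing from a
        dict mapper become NaN:

        >>> pd.Series(['cat', 'dog']).map({'cat': 'kitten'})
        0    kitten
        1       NaN
        dtype: object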
845 """
846 # we can fastpath dict/Series to an efficient map
847 # as we know that we are not going to have to yield
848 # python types
849 if is_dict_like(mapper):
850 if isinstance(mapper, dict) and hasattr(mapper, "__missing__"):
851 # If a dictionary subclass defines a default value method,
852 # convert mapper to a lookup function (GH #15999).
853 dict_with_default = mapper
854 mapper = lambda x: dict_with_default[
855 np.nan if isinstance(x, float) and np.isnan(x) else x
856 ]
857 else:
858 # Dictionary does not have a default. Thus it's safe to
859 # convert to an Series for efficiency.
860 # we specify the keys here to handle the
861 # possibility that they are tuples
863 # The return value of mapping with an empty mapper is
864 # expected to be pd.Series(np.nan, ...). As np.nan is
865 # of dtype float64 the return value of this method should
866 # be float64 as well
867 from pandas import Series
869 if len(mapper) == 0:
870 mapper = Series(mapper, dtype=np.float64)
871 else:
872 mapper = Series(mapper)
874 if isinstance(mapper, ABCSeries):
875 if na_action not in (None, "ignore"):
876 msg = (
877 "na_action must either be 'ignore' or None, "
878 f"{na_action} was passed"
879 )
880 raise ValueError(msg)
882 if na_action == "ignore":
883 mapper = mapper[mapper.index.notna()]
885 # Since values were input this means we came from either
886 # a dict or a series and mapper should be an index
887 if is_categorical_dtype(self.dtype):
888 # use the built in categorical series mapper which saves
889 # time by mapping the categories instead of all values
891 cat = cast("Categorical", self._values)
892 return cat.map(mapper)
894 values = self._values
896 indexer = mapper.index.get_indexer(values)
897 new_values = algorithms.take_nd(mapper._values, indexer)
899 return new_values
901 # we must convert to python types
902 if is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"):
903 # GH#23179 some EAs do not have `map`
904 values = self._values
905 if na_action is not None:
906 raise NotImplementedError
907 map_f = lambda values, f: values.map(f)
908 else:
909 values = self._values.astype(object)
910 if na_action == "ignore":
911 map_f = lambda values, f: lib.map_infer_mask(
912 values, f, isna(values).view(np.uint8)
913 )
914 elif na_action is None:
915 map_f = lib.map_infer
916 else:
917 msg = (
918 "na_action must either be 'ignore' or None, "
919 f"{na_action} was passed"
920 )
921 raise ValueError(msg)
923 # mapper is a function
924 new_values = map_f(values, mapper)
926 return new_values

    @final
    def value_counts(
        self,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        bins=None,
        dropna: bool = True,
    ) -> Series:
        """
        Return a Series containing counts of unique values.

        The resulting object will be in descending order so that the
        first element is the most frequently-occurring element.
        Excludes NA values by default.

        Parameters
        ----------
        normalize : bool, default False
            If True then the object returned will contain the relative
            frequencies of the unique values.
        sort : bool, default True
            Sort by frequencies.
        ascending : bool, default False
            Sort in ascending order.
        bins : int, optional
            Rather than count values, group them into half-open bins,
            a convenience for ``pd.cut``, only works with numeric data.
        dropna : bool, default True
            Don't include counts of NaN.

        Returns
        -------
        Series

        See Also
        --------
        Series.count: Number of non-NA elements in a Series.
        DataFrame.count: Number of non-NA elements in a DataFrame.
        DataFrame.value_counts: Equivalent method on DataFrames.

        Examples
        --------
        >>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
        >>> index.value_counts()
        3.0    2
        1.0    1
        2.0    1
        4.0    1
        Name: count, dtype: int64

        With `normalize` set to `True`, returns the relative frequency by
        dividing all values by the sum of values.

        >>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
        >>> s.value_counts(normalize=True)
        3.0    0.4
        1.0    0.2
        2.0    0.2
        4.0    0.2
        Name: proportion, dtype: float64

        **bins**

        Bins can be useful for going from a continuous variable to a
        categorical variable; instead of counting unique
        apparitions of values, divide the index in the specified
        number of half-open bins.

        >>> s.value_counts(bins=3)
        (0.996, 2.0]    2
        (2.0, 3.0]      2
        (3.0, 4.0]      1
        Name: count, dtype: int64

        **dropna**

        With `dropna` set to `False` we can also see NaN index values.

        >>> s.value_counts(dropna=False)
        3.0    2
        1.0    1
        2.0    1
        4.0    1
        NaN    1
        Name: count, dtype: int64
        """
        return algorithms.value_counts(
            self,
            sort=sort,
            ascending=ascending,
            normalize=normalize,
            bins=bins,
            dropna=dropna,
        )

    def unique(self):
        values = self._values
        if not isinstance(values, np.ndarray):
            # i.e. ExtensionArray
            result = values.unique()
        else:
            result = algorithms.unique1d(values)
        return result

    @final
    def nunique(self, dropna: bool = True) -> int:
        """
        Return number of unique elements in the object.

        Excludes NA values by default.

        Parameters
        ----------
        dropna : bool, default True
            Don't include NaN in the count.

        Returns
        -------
        int

        See Also
        --------
        DataFrame.nunique: Method nunique for DataFrame.
        Series.count: Count non-NA/null observations in the Series.

        Examples
        --------
        >>> s = pd.Series([1, 3, 5, 7, 7])
        >>> s
        0    1
        1    3
        2    5
        3    7
        4    7
        dtype: int64

        >>> s.nunique()
        4
        """
        uniqs = self.unique()
        if dropna:
            uniqs = remove_na_arraylike(uniqs)
        return len(uniqs)

    @property
    def is_unique(self) -> bool:
        """
        Return boolean if values in the object are unique.

        Returns
        -------
        bool
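
        Examples
        --------
        >>> s = pd.Series([1, 2, 3])
        >>> s.is_unique
        True

        >>> s = pd.Series([1, 2, 3, 1])
        >>> s.is_unique
        False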
1081 """
1082 return self.nunique(dropna=False) == len(self)
1084 @property
1085 def is_monotonic_increasing(self) -> bool:
1086 """
1087 Return boolean if values in the object are monotonically increasing.
1089 Returns
1090 -------
1091 bool
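
        Examples
        --------
        Repeated values are still monotonic (the check is non-strict):

        >>> s = pd.Series([1, 2, 2])
        >>> s.is_monotonic_increasing
        True

        >>> s = pd.Series([3, 2, 1])
        >>> s.is_monotonic_increasing
        False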
1092 """
1093 from pandas import Index
1095 return Index(self).is_monotonic_increasing
1097 @property
1098 def is_monotonic_decreasing(self) -> bool:
1099 """
1100 Return boolean if values in the object are monotonically decreasing.
1102 Returns
1103 -------
1104 bool
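
        Examples
        --------
        Repeated values are still monotonic (the check is non-strict):

        >>> s = pd.Series([3, 2, 2])
        >>> s.is_monotonic_decreasing
        True

        >>> s = pd.Series([1, 2, 3])
        >>> s.is_monotonic_decreasing
        False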
1105 """
1106 from pandas import Index
1108 return Index(self).is_monotonic_decreasing
1110 @final
1111 def _memory_usage(self, deep: bool = False) -> int:
1112 """
1113 Memory usage of the values.
1115 Parameters
1116 ----------
1117 deep : bool, default False
1118 Introspect the data deeply, interrogate
1119 `object` dtypes for system-level memory consumption.
1121 Returns
1122 -------
1123 bytes used
1125 See Also
1126 --------
1127 numpy.ndarray.nbytes : Total bytes consumed by the elements of the
1128 array.
1130 Notes
1131 -----
1132 Memory usage does not include memory consumed by elements that
1133 are not components of the array if deep=False or if used on PyPy
1134 """
1135 if hasattr(self.array, "memory_usage"):
1136 return self.array.memory_usage( # pyright: ignore[reportGeneralTypeIssues]
1137 deep=deep,
1138 )
1140 v = self.array.nbytes
1141 if deep and is_object_dtype(self) and not PYPY:
1142 values = cast(np.ndarray, self._values)
1143 v += lib.memory_usage_of_objects(values)
1144 return v
1146 @doc(
1147 algorithms.factorize,
1148 values="",
1149 order="",
1150 size_hint="",
1151 sort=textwrap.dedent(
1152 """\
1153 sort : bool, default False
1154 Sort `uniques` and shuffle `codes` to maintain the
1155 relationship.
1156 """
1157 ),
1158 )
1159 def factorize(
1160 self,
1161 sort: bool = False,
1162 use_na_sentinel: bool = True,
1163 ) -> tuple[npt.NDArray[np.intp], Index]:
1164 codes, uniques = algorithms.factorize(
1165 self._values, sort=sort, use_na_sentinel=use_na_sentinel
1166 )
1167 if uniques.dtype == np.float16:
1168 uniques = uniques.astype(np.float32)
1170 if isinstance(self, ABCIndex):
1171 # preserve e.g. MultiIndex
1172 uniques = self._constructor(uniques)
1173 else:
1174 from pandas import Index
1176 uniques = Index(uniques)
1177 return codes, uniques
1179 _shared_docs[
1180 "searchsorted"
1181 ] = """
1182 Find indices where elements should be inserted to maintain order.
1184 Find the indices into a sorted {klass} `self` such that, if the
1185 corresponding elements in `value` were inserted before the indices,
1186 the order of `self` would be preserved.
1188 .. note::
1190 The {klass} *must* be monotonically sorted, otherwise
1191 wrong locations will likely be returned. Pandas does *not*
1192 check this for you.
1194 Parameters
1195 ----------
1196 value : array-like or scalar
1197 Values to insert into `self`.
1198 side : {{'left', 'right'}}, optional
1199 If 'left', the index of the first suitable location found is given.
1200 If 'right', return the last such index. If there is no suitable
1201 index, return either 0 or N (where N is the length of `self`).
1202 sorter : 1-D array-like, optional
1203 Optional array of integer indices that sort `self` into ascending
1204 order. They are typically the result of ``np.argsort``.
1206 Returns
1207 -------
1208 int or array of int
1209 A scalar or array of insertion points with the
1210 same shape as `value`.
1212 See Also
1213 --------
1214 sort_values : Sort by the values along either axis.
1215 numpy.searchsorted : Similar method from NumPy.
1217 Notes
1218 -----
1219 Binary search is used to find the required insertion points.
1221 Examples
1222 --------
1223 >>> ser = pd.Series([1, 2, 3])
1224 >>> ser
1225 0 1
1226 1 2
1227 2 3
1228 dtype: int64
1230 >>> ser.searchsorted(4)
1231 3
1233 >>> ser.searchsorted([0, 4])
1234 array([0, 3])
1236 >>> ser.searchsorted([1, 3], side='left')
1237 array([0, 2])
1239 >>> ser.searchsorted([1, 3], side='right')
1240 array([1, 3])
1242 >>> ser = pd.Series(pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000']))
1243 >>> ser
1244 0 2000-03-11
1245 1 2000-03-12
1246 2 2000-03-13
1247 dtype: datetime64[ns]
1249 >>> ser.searchsorted('3/14/2000')
1250 3
1252 >>> ser = pd.Categorical(
1253 ... ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True
1254 ... )
1255 >>> ser
1256 ['apple', 'bread', 'bread', 'cheese', 'milk']
1257 Categories (4, object): ['apple' < 'bread' < 'cheese' < 'milk']
1259 >>> ser.searchsorted('bread')
1260 1
1262 >>> ser.searchsorted(['bread'], side='right')
1263 array([3])
1265 If the values are not monotonically sorted, wrong locations
1266 may be returned:
1268 >>> ser = pd.Series([2, 1, 3])
1269 >>> ser
1270 0 2
1271 1 1
1272 2 3
1273 dtype: int64
1275 >>> ser.searchsorted(1) # doctest: +SKIP
1276 0 # wrong result, correct would be 1
1277 """
1279 # This overload is needed so that the call to searchsorted in
1280 # pandas.core.resample.TimeGrouper._get_period_bins picks the correct result
1282 @overload
1283 # The following ignore is also present in numpy/__init__.pyi
1284 # Possibly a mypy bug??
1285 # error: Overloaded function signatures 1 and 2 overlap with incompatible
1286 # return types [misc]
1287 def searchsorted( # type: ignore[misc]
1288 self,
1289 value: ScalarLike_co,
1290 side: Literal["left", "right"] = ...,
1291 sorter: NumpySorter = ...,
1292 ) -> np.intp:
1293 ...
1295 @overload
1296 def searchsorted(
1297 self,
1298 value: npt.ArrayLike | ExtensionArray,
1299 side: Literal["left", "right"] = ...,
1300 sorter: NumpySorter = ...,
1301 ) -> npt.NDArray[np.intp]:
1302 ...
1304 @doc(_shared_docs["searchsorted"], klass="Index")
1305 def searchsorted(
1306 self,
1307 value: NumpyValueArrayLike | ExtensionArray,
1308 side: Literal["left", "right"] = "left",
1309 sorter: NumpySorter = None,
1310 ) -> npt.NDArray[np.intp] | np.intp:
1311 if isinstance(value, ABCDataFrame):
1312 msg = (
1313 "Value must be 1-D array-like or scalar, "
1314 f"{type(value).__name__} is not supported"
1315 )
1316 raise ValueError(msg)
1318 values = self._values
1319 if not isinstance(values, np.ndarray):
1320 # Going through EA.searchsorted directly improves performance GH#38083
1321 return values.searchsorted(value, side=side, sorter=sorter)
1323 return algorithms.searchsorted(
1324 values,
1325 value,
1326 side=side,
1327 sorter=sorter,
1328 )
1330 def drop_duplicates(self, *, keep: DropKeep = "first"):
1331 duplicated = self._duplicated(keep=keep)
1332 # error: Value of type "IndexOpsMixin" is not indexable
1333 return self[~duplicated] # type: ignore[index]
1335 @final
1336 def _duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]:
1337 return algorithms.duplicated(self._values, keep=keep)
1339 def _arith_method(self, other, op):
1340 res_name = ops.get_op_result_name(self, other)
1342 lvalues = self._values
1343 rvalues = extract_array(other, extract_numpy=True, extract_range=True)
1344 rvalues = ops.maybe_prepare_scalar_for_op(rvalues, lvalues.shape)
1345 rvalues = ensure_wrapped_if_datetimelike(rvalues)
1347 with np.errstate(all="ignore"):
1348 result = ops.arithmetic_op(lvalues, rvalues, op)
1350 return self._construct_result(result, name=res_name)
1352 def _construct_result(self, result, name):
1353 """
1354 Construct an appropriately-wrapped result from the ArrayLike result
1355 of an arithmetic-like operation.
1356 """
1357 raise AbstractMethodError(self)