# pyright: reportPropertyTypeMismatch=false
from __future__ import annotations

import collections
from copy import deepcopy
import datetime as dt
from functools import partial
import gc
from json import loads
import operator
import pickle
import re
import sys
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    ClassVar,
    Literal,
    NoReturn,
    cast,
    final,
    overload,
)
import warnings
import weakref

import numpy as np

from pandas._config import (
    config,
    using_copy_on_write,
    warn_copy_on_write,
)

from pandas._libs import lib
from pandas._libs.lib import is_range_indexer
from pandas._libs.tslibs import (
    Period,
    Tick,
    Timestamp,
    to_offset,
)
from pandas._libs.tslibs.dtypes import freq_to_period_freqstr
from pandas._typing import (
    AlignJoin,
    AnyArrayLike,
    ArrayLike,
    Axes,
    Axis,
    AxisInt,
    CompressionOptions,
    DtypeArg,
    DtypeBackend,
    DtypeObj,
    FilePath,
    FillnaOptions,
    FloatFormatType,
    FormattersType,
    Frequency,
    IgnoreRaise,
    IndexKeyFunc,
    IndexLabel,
    InterpolateOptions,
    IntervalClosedType,
    JSONSerializable,
    Level,
    Manager,
    NaPosition,
    NDFrameT,
    OpenFileErrors,
    RandomState,
    ReindexMethod,
    Renamer,
    Scalar,
    Self,
    SequenceNotStr,
    SortKind,
    StorageOptions,
    Suffixes,
    T,
    TimeAmbiguous,
    TimedeltaConvertibleTypes,
    TimeNonexistent,
    TimestampConvertibleTypes,
    TimeUnit,
    ValueKeyFunc,
    WriteBuffer,
    WriteExcelBuffer,
    npt,
)
from pandas.compat import PYPY
from pandas.compat._constants import REF_COUNT
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.errors import (
    AbstractMethodError,
    ChainedAssignmentError,
    InvalidIndexError,
    SettingWithCopyError,
    SettingWithCopyWarning,
    _chained_assignment_method_msg,
    _chained_assignment_warning_method_msg,
    _check_cacher,
)
from pandas.util._decorators import (
    deprecate_nonkeyword_arguments,
    doc,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import (
    check_dtype_backend,
    validate_ascending,
    validate_bool_kwarg,
    validate_fillna_kwargs,
    validate_inclusive,
)

from pandas.core.dtypes.astype import astype_is_view
from pandas.core.dtypes.common import (
    ensure_object,
    ensure_platform_int,
    ensure_str,
    is_bool,
    is_bool_dtype,
    is_dict_like,
    is_extension_array_dtype,
    is_list_like,
    is_number,
    is_numeric_dtype,
    is_re_compilable,
    is_scalar,
    pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
    DatetimeTZDtype,
    ExtensionDtype,
)
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCSeries,
)
from pandas.core.dtypes.inference import (
    is_hashable,
    is_nested_list_like,
)
from pandas.core.dtypes.missing import (
    isna,
    notna,
)

from pandas.core import (
    algorithms as algos,
    arraylike,
    common,
    indexing,
    missing,
    nanops,
    sample,
)
from pandas.core.array_algos.replace import should_use_regex
from pandas.core.arrays import ExtensionArray
from pandas.core.base import PandasObject
from pandas.core.construction import extract_array
from pandas.core.flags import Flags
from pandas.core.indexes.api import (
    DatetimeIndex,
    Index,
    MultiIndex,
    PeriodIndex,
    RangeIndex,
    default_index,
    ensure_index,
)
from pandas.core.internals import (
    ArrayManager,
    BlockManager,
    SingleArrayManager,
)
from pandas.core.internals.construction import (
    mgr_to_mgr,
    ndarray_to_mgr,
)
from pandas.core.methods.describe import describe_ndframe
from pandas.core.missing import (
    clean_fill_method,
    clean_reindex_fill_method,
    find_valid_index,
)
from pandas.core.reshape.concat import concat
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import get_indexer_indexer
from pandas.core.window import (
    Expanding,
    ExponentialMovingWindow,
    Rolling,
    Window,
)

from pandas.io.formats.format import (
    DataFrameFormatter,
    DataFrameRenderer,
)
from pandas.io.formats.printing import pprint_thing

if TYPE_CHECKING:
    from collections.abc import (
        Hashable,
        Iterator,
        Mapping,
        Sequence,
    )

    from pandas._libs.tslibs import BaseOffset

    from pandas import (
        DataFrame,
        ExcelWriter,
        HDFStore,
        Series,
    )
    from pandas.core.indexers.objects import BaseIndexer
    from pandas.core.resample import Resampler

# goal is to be able to define the docs close to function, while still being
# able to share
_shared_docs = {**_shared_docs}
_shared_doc_kwargs = {
    "axes": "keywords for axes",
    "klass": "Series/DataFrame",
    "axes_single_arg": "{0 or 'index'} for Series, {0 or 'index', 1 or 'columns'} for DataFrame",  # noqa: E501
    "inplace": """
    inplace : bool, default False
        If True, performs operation inplace and returns None.""",
    "optional_by": """
        by : str or list of str
            Name or list of names to sort by""",
}


bool_t = bool  # Need alias because NDFrame has def bool:
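
# A brief illustration of the alias (hypothetical signature, not defined in
# this module): an annotation such as ``def copy(self, deep: bool_t = True)``
# must spell the builtin as ``bool_t``, because within the class body the name
# ``bool`` refers to the ``NDFrame.bool`` method defined below.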


class NDFrame(PandasObject, indexing.IndexingMixin):
    """
    N-dimensional analogue of DataFrame. Store multi-dimensional data in a
    size-mutable, labeled data structure.

    Parameters
    ----------
    data : BlockManager
    axes : list
    copy : bool, default False
    """

    _internal_names: list[str] = [
        "_mgr",
        "_cacher",
        "_item_cache",
        "_cache",
        "_is_copy",
        "_name",
        "_metadata",
        "_flags",
    ]
    _internal_names_set: set[str] = set(_internal_names)
    _accessors: set[str] = set()
    _hidden_attrs: frozenset[str] = frozenset([])
    _metadata: list[str] = []
    _is_copy: weakref.ReferenceType[NDFrame] | str | None = None
    _mgr: Manager
    _attrs: dict[Hashable, Any]
    _typ: str

    # ----------------------------------------------------------------------
    # Constructors

    def __init__(self, data: Manager) -> None:
        object.__setattr__(self, "_is_copy", None)
        object.__setattr__(self, "_mgr", data)
        object.__setattr__(self, "_item_cache", {})
        object.__setattr__(self, "_attrs", {})
        object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True))

    @final
    @classmethod
    def _init_mgr(
        cls,
        mgr: Manager,
        axes: dict[Literal["index", "columns"], Axes | None],
        dtype: DtypeObj | None = None,
        copy: bool_t = False,
    ) -> Manager:
        """passed a manager and an axes dict"""
        for a, axe in axes.items():
            if axe is not None:
                axe = ensure_index(axe)
                bm_axis = cls._get_block_manager_axis(a)
                mgr = mgr.reindex_axis(axe, axis=bm_axis)

        # make a copy if explicitly requested
        if copy:
            mgr = mgr.copy()
        if dtype is not None:
            # avoid further copies if we can
            if (
                isinstance(mgr, BlockManager)
                and len(mgr.blocks) == 1
                and mgr.blocks[0].values.dtype == dtype
            ):
                pass
            else:
                mgr = mgr.astype(dtype=dtype)
        return mgr
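
    # Shape sketch for the ``axes`` argument above (an assumption based on the
    # signature, for illustration only): something like
    #   {"index": Index([...]), "columns": None}
    # where a None entry leaves that axis of the manager untouched.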

    @final
    def _as_manager(self, typ: str, copy: bool_t = True) -> Self:
        """
        Private helper function to create a DataFrame with specific manager.

        Parameters
        ----------
        typ : {"block", "array"}
        copy : bool, default True
            Only controls whether the conversion from Block->ArrayManager
            copies the 1D arrays (to ensure proper/contiguous memory layout).

        Returns
        -------
        DataFrame
            New DataFrame using specified manager type. The result is not
            guaranteed to be a copy.
        """
        new_mgr: Manager
        new_mgr = mgr_to_mgr(self._mgr, typ=typ, copy=copy)
        # fastpath of passing a manager doesn't check the option/manager class
        return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self)
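
    # Minimal usage sketch of this private helper (assuming a BlockManager-backed
    # DataFrame ``df``):
    #   arr_df = df._as_manager("array")     # equivalent frame on an ArrayManager
    #   blk_df = arr_df._as_manager("block")  # and back to a BlockManager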

    @final
    @classmethod
    def _from_mgr(cls, mgr: Manager, axes: list[Index]) -> Self:
        """
        Construct a new object of this type from a Manager object and axes.

        Parameters
        ----------
        mgr : Manager
            Must have the same ndim as cls.
        axes : list[Index]

        Notes
        -----
        The axes must match mgr.axes, but are required for future-proofing
        in the event that axes are refactored out of the Manager objects.
        """
        obj = cls.__new__(cls)
        NDFrame.__init__(obj, mgr)
        return obj

    # ----------------------------------------------------------------------
    # attrs and flags

    @property
    def attrs(self) -> dict[Hashable, Any]:
        """
        Dictionary of global attributes of this dataset.

        .. warning::

           attrs is experimental and may change without warning.

        See Also
        --------
        DataFrame.flags : Global flags applying to this object.

        Notes
        -----
        Many operations that create new datasets will copy ``attrs``. Copies
        are always deep so that changing ``attrs`` will only affect the
        present dataset. ``pandas.concat`` copies ``attrs`` only if all input
        datasets have the same ``attrs``.

        Examples
        --------
        For Series:

        >>> ser = pd.Series([1, 2, 3])
        >>> ser.attrs = {"A": [10, 20, 30]}
        >>> ser.attrs
        {'A': [10, 20, 30]}

        For DataFrame:

        >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
        >>> df.attrs = {"A": [10, 20, 30]}
        >>> df.attrs
        {'A': [10, 20, 30]}
        """
        return self._attrs

    @attrs.setter
    def attrs(self, value: Mapping[Hashable, Any]) -> None:
        self._attrs = dict(value)

    @final
    @property
    def flags(self) -> Flags:
        """
        Get the properties associated with this pandas object.

        The available flags are

        * :attr:`Flags.allows_duplicate_labels`

        See Also
        --------
        Flags : Flags that apply to pandas objects.
        DataFrame.attrs : Global metadata applying to this dataset.

        Notes
        -----
        "Flags" differ from "metadata". Flags reflect properties of the
        pandas object (the Series or DataFrame). Metadata refer to properties
        of the dataset, and should be stored in :attr:`DataFrame.attrs`.

        Examples
        --------
        >>> df = pd.DataFrame({"A": [1, 2]})
        >>> df.flags
        <Flags(allows_duplicate_labels=True)>

        Flags can be read or set using ``.``

        >>> df.flags.allows_duplicate_labels
        True
        >>> df.flags.allows_duplicate_labels = False

        Or by slicing with a key

        >>> df.flags["allows_duplicate_labels"]
        False
        >>> df.flags["allows_duplicate_labels"] = True
        """
        return self._flags

    @final
    def set_flags(
        self,
        *,
        copy: bool_t = False,
        allows_duplicate_labels: bool_t | None = None,
    ) -> Self:
        """
        Return a new object with updated flags.

        Parameters
        ----------
        copy : bool, default False
            Specify if a copy of the object should be made.

            .. note::
                The `copy` keyword will change behavior in pandas 3.0.
                `Copy-on-Write
                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
                will be enabled by default, which means that all methods with a
                `copy` keyword will use a lazy copy mechanism to defer the copy and
                ignore the `copy` keyword. The `copy` keyword will be removed in a
                future version of pandas.

                You can already get the future behavior and improvements by
                enabling copy-on-write: ``pd.options.mode.copy_on_write = True``
        allows_duplicate_labels : bool, optional
            Whether the returned object allows duplicate labels.

        Returns
        -------
        Series or DataFrame
            The same type as the caller.

        See Also
        --------
        DataFrame.attrs : Global metadata applying to this dataset.
        DataFrame.flags : Global flags applying to this object.

        Notes
        -----
        This method returns a new object that's a view on the same data
        as the input. Mutating the input or the output values will be reflected
        in the other.

        This method is intended to be used in method chains.

        "Flags" differ from "metadata". Flags reflect properties of the
        pandas object (the Series or DataFrame). Metadata refer to properties
        of the dataset, and should be stored in :attr:`DataFrame.attrs`.

        Examples
        --------
        >>> df = pd.DataFrame({"A": [1, 2]})
        >>> df.flags.allows_duplicate_labels
        True
        >>> df2 = df.set_flags(allows_duplicate_labels=False)
        >>> df2.flags.allows_duplicate_labels
        False
        """
        df = self.copy(deep=copy and not using_copy_on_write())
        if allows_duplicate_labels is not None:
            df.flags["allows_duplicate_labels"] = allows_duplicate_labels
        return df

    @final
    @classmethod
    def _validate_dtype(cls, dtype) -> DtypeObj | None:
        """validate the passed dtype"""
        if dtype is not None:
            dtype = pandas_dtype(dtype)

            # a compound dtype
            if dtype.kind == "V":
                raise NotImplementedError(
                    "compound dtypes are not implemented "
                    f"in the {cls.__name__} constructor"
                )

        return dtype

    # ----------------------------------------------------------------------
    # Construction

    @property
    def _constructor(self) -> Callable[..., Self]:
        """
        Used when a manipulation result has the same dimensions as the
        original.
        """
        raise AbstractMethodError(self)

    # ----------------------------------------------------------------------
    # Internals

    @final
    @property
    def _data(self):
        # GH#33054 retained because some downstream packages use this,
        # e.g. fastparquet
        # GH#33333
        warnings.warn(
            f"{type(self).__name__}._data is deprecated and will be removed in "
            "a future version. Use public APIs instead.",
            DeprecationWarning,
            stacklevel=find_stack_level(),
        )
        return self._mgr

    # ----------------------------------------------------------------------
    # Axis
    _AXIS_ORDERS: list[Literal["index", "columns"]]
    _AXIS_TO_AXIS_NUMBER: dict[Axis, AxisInt] = {0: 0, "index": 0, "rows": 0}
    _info_axis_number: int
    _info_axis_name: Literal["index", "columns"]
    _AXIS_LEN: int

    @final
    def _construct_axes_dict(self, axes: Sequence[Axis] | None = None, **kwargs):
        """Return an axes dictionary for myself."""
        d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)}
        # error: Argument 1 to "update" of "MutableMapping" has incompatible type
        # "Dict[str, Any]"; expected "SupportsKeysAndGetItem[Union[int, str], Any]"
        d.update(kwargs)  # type: ignore[arg-type]
        return d
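
    # For a DataFrame ``df`` this returns, e.g.,
    #   {"index": df.index, "columns": df.columns}
    # with any keyword arguments merged into the same dict.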

    @final
    @classmethod
    def _get_axis_number(cls, axis: Axis) -> AxisInt:
        try:
            return cls._AXIS_TO_AXIS_NUMBER[axis]
        except KeyError:
            raise ValueError(f"No axis named {axis} for object type {cls.__name__}")

    @final
    @classmethod
    def _get_axis_name(cls, axis: Axis) -> Literal["index", "columns"]:
        axis_number = cls._get_axis_number(axis)
        return cls._AXIS_ORDERS[axis_number]

    @final
    def _get_axis(self, axis: Axis) -> Index:
        axis_number = self._get_axis_number(axis)
        assert axis_number in {0, 1}
        return self.index if axis_number == 0 else self.columns

    @final
    @classmethod
    def _get_block_manager_axis(cls, axis: Axis) -> AxisInt:
        """Map the axis to the block_manager axis."""
        axis = cls._get_axis_number(axis)
        ndim = cls._AXIS_LEN
        if ndim == 2:
            # i.e. DataFrame
            return 1 - axis
        return axis
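
    # Example of the mapping above for a DataFrame (ndim == 2), where the
    # block manager's axes are reversed relative to the user-facing axes:
    #   _get_block_manager_axis(0) -> 1          # rows
    #   _get_block_manager_axis("columns") -> 0
    # For a Series the axis number is returned unchanged.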

    @final
    def _get_axis_resolvers(self, axis: str) -> dict[str, Series | MultiIndex]:
        # index or columns
        axis_index = getattr(self, axis)
        d = {}
        prefix = axis[0]

        for i, name in enumerate(axis_index.names):
            if name is not None:
                key = level = name
            else:
                # prefix with 'i' or 'c' depending on the input axis
                # e.g., you must do ilevel_0 for the 0th level of an unnamed
                # MultiIndex
                key = f"{prefix}level_{i}"
                level = i

            level_values = axis_index.get_level_values(level)
            s = level_values.to_series()
            s.index = axis_index
            d[key] = s

        # put the index/columns itself in the dict
        if isinstance(axis_index, MultiIndex):
            dindex = axis_index
        else:
            dindex = axis_index.to_series()

        d[axis] = dindex
        return d
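
    # As a sketch: for a frame whose index is an unnamed two-level MultiIndex,
    # the dict built above contains the keys "ilevel_0", "ilevel_1" and
    # "index" (or the corresponding "clevel_*"/"columns" entries when called
    # with axis="columns"), which is what ``query``/``eval`` resolve against.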

    @final
    def _get_index_resolvers(self) -> dict[Hashable, Series | MultiIndex]:
        from pandas.core.computation.parsing import clean_column_name

        d: dict[str, Series | MultiIndex] = {}
        for axis_name in self._AXIS_ORDERS:
            d.update(self._get_axis_resolvers(axis_name))

        return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)}

    @final
    def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]:
        """
        Return the special-character-free column resolvers of a DataFrame.

        Column names with special characters are 'cleaned up' so that they can
        be referred to by backtick quoting.
        Used in :meth:`DataFrame.eval`.
        """
        from pandas.core.computation.parsing import clean_column_name
        from pandas.core.series import Series

        if isinstance(self, ABCSeries):
            return {clean_column_name(self.name): self}

        return {
            clean_column_name(k): Series(
                v, copy=False, index=self.index, name=k, dtype=self.dtypes[k]
            ).__finalize__(self)
            for k, v in zip(self.columns, self._iter_column_arrays())
            if not isinstance(k, int)
        }

    @final
    @property
    def _info_axis(self) -> Index:
        return getattr(self, self._info_axis_name)

    def _is_view_after_cow_rules(self):
        # Only to be used in cases of chained assignment checks, this is a
        # simplified check that assumes that either the whole object is a view
        # or a copy
        if len(self._mgr.blocks) == 0:  # type: ignore[union-attr]
            return False
        return self._mgr.blocks[0].refs.has_reference()  # type: ignore[union-attr]

    @property
    def shape(self) -> tuple[int, ...]:
        """
        Return a tuple of axis dimensions.
        """
        return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS)

    @property
    def axes(self) -> list[Index]:
        """
        Return index label(s) of the internal NDFrame.
        """
        # we do it this way because if we have reversed axes, then
        # the block manager shows them reversed
        return [self._get_axis(a) for a in self._AXIS_ORDERS]

    @final
    @property
    def ndim(self) -> int:
        """
        Return an int representing the number of axes / array dimensions.

        Return 1 if Series. Otherwise return 2 if DataFrame.

        See Also
        --------
        ndarray.ndim : Number of array dimensions.

        Examples
        --------
        >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
        >>> s.ndim
        1

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.ndim
        2
        """
        return self._mgr.ndim

    @final
    @property
    def size(self) -> int:
        """
        Return an int representing the number of elements in this object.

        Return the number of rows if Series. Otherwise return the number of
        rows times number of columns if DataFrame.

        See Also
        --------
        ndarray.size : Number of elements in the array.

        Examples
        --------
        >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
        >>> s.size
        3

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.size
        4
        """

        return int(np.prod(self.shape))

    def set_axis(
        self,
        labels,
        *,
        axis: Axis = 0,
        copy: bool_t | None = None,
    ) -> Self:
        """
        Assign desired index to given axis.

        Indexes for%(extended_summary_sub)s row labels can be changed by assigning
        a list-like or Index.

        Parameters
        ----------
        labels : list-like, Index
            The values for the new index.

        axis : %(axes_single_arg)s, default 0
            The axis to update. The value 0 identifies the rows. For `Series`
            this parameter is unused and defaults to 0.

        copy : bool, default True
            Whether to make a copy of the underlying data.

            .. note::
                The `copy` keyword will change behavior in pandas 3.0.
                `Copy-on-Write
                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
                will be enabled by default, which means that all methods with a
                `copy` keyword will use a lazy copy mechanism to defer the copy and
                ignore the `copy` keyword. The `copy` keyword will be removed in a
                future version of pandas.

                You can already get the future behavior and improvements by
                enabling copy-on-write: ``pd.options.mode.copy_on_write = True``

        Returns
        -------
        %(klass)s
            An object of type %(klass)s.

        See Also
        --------
        %(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s.
        """
        return self._set_axis_nocheck(labels, axis, inplace=False, copy=copy)

    @final
    def _set_axis_nocheck(
        self, labels, axis: Axis, inplace: bool_t, copy: bool_t | None
    ):
        if inplace:
            setattr(self, self._get_axis_name(axis), labels)
        else:
            # With copy=False, we create a new object but don't copy the
            # underlying data.
            obj = self.copy(deep=copy and not using_copy_on_write())
            setattr(obj, obj._get_axis_name(axis), labels)
            return obj

    @final
    def _set_axis(self, axis: AxisInt, labels: AnyArrayLike | list) -> None:
        """
        This is called from the cython code when we set the `index` attribute
        directly, e.g. `series.index = [1, 2, 3]`.
        """
        labels = ensure_index(labels)
        self._mgr.set_axis(axis, labels)
        self._clear_item_cache()

    @final
    def swapaxes(self, axis1: Axis, axis2: Axis, copy: bool_t | None = None) -> Self:
        """
        Interchange axes, swapping the underlying values appropriately.

        .. deprecated:: 2.1.0
            ``swapaxes`` is deprecated and will be removed.
            Please use ``transpose`` instead.

        Returns
        -------
        same as input

        Examples
        --------
        Please see examples for :meth:`DataFrame.transpose`.
        """
        warnings.warn(
            # GH#51946
            f"'{type(self).__name__}.swapaxes' is deprecated and "
            "will be removed in a future version. "
            f"Please use '{type(self).__name__}.transpose' instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

        i = self._get_axis_number(axis1)
        j = self._get_axis_number(axis2)

        if i == j:
            return self.copy(deep=copy and not using_copy_on_write())

        mapping = {i: j, j: i}

        new_axes = [self._get_axis(mapping.get(k, k)) for k in range(self._AXIS_LEN)]
        new_values = self._values.swapaxes(i, j)  # type: ignore[union-attr]
        if self._mgr.is_single_block and isinstance(self._mgr, BlockManager):
            # This should only get hit in case of having a single block, otherwise a
            # copy is made, we don't have to set up references.
            new_mgr = ndarray_to_mgr(
                new_values,
                new_axes[0],
                new_axes[1],
                dtype=None,
                copy=False,
                typ="block",
            )
            assert isinstance(new_mgr, BlockManager)
            assert isinstance(self._mgr, BlockManager)
            new_mgr.blocks[0].refs = self._mgr.blocks[0].refs
            new_mgr.blocks[0].refs.add_reference(new_mgr.blocks[0])
            if not using_copy_on_write() and copy is not False:
                new_mgr = new_mgr.copy(deep=True)

            out = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
            return out.__finalize__(self, method="swapaxes")

        return self._constructor(
            new_values,
            *new_axes,
            # The no-copy case for CoW is handled above
            copy=False,
        ).__finalize__(self, method="swapaxes")

    @final
    @doc(klass=_shared_doc_kwargs["klass"])
    def droplevel(self, level: IndexLabel, axis: Axis = 0) -> Self:
        """
        Return {klass} with requested index / column level(s) removed.

        Parameters
        ----------
        level : int, str, or list-like
            If a string is given, it must be the name of a level.
            If list-like, elements must be names or positional indexes
            of levels.

        axis : {{0 or 'index', 1 or 'columns'}}, default 0
            Axis along which the level(s) is removed:

            * 0 or 'index': remove the level(s) from the row index.
            * 1 or 'columns': remove the level(s) from the column index.

            For `Series` this parameter is unused and defaults to 0.

        Returns
        -------
        {klass}
            {klass} with requested index / column level(s) removed.

        Examples
        --------
        >>> df = pd.DataFrame([
        ...     [1, 2, 3, 4],
        ...     [5, 6, 7, 8],
        ...     [9, 10, 11, 12]
        ... ]).set_index([0, 1]).rename_axis(['a', 'b'])

        >>> df.columns = pd.MultiIndex.from_tuples([
        ...     ('c', 'e'), ('d', 'f')
        ... ], names=['level_1', 'level_2'])

        >>> df
        level_1   c   d
        level_2   e   f
        a b
        1 2      3   4
        5 6      7   8
        9 10    11  12

        >>> df.droplevel('a')
        level_1   c   d
        level_2   e   f
        b
        2        3   4
        6        7   8
        10      11  12

        >>> df.droplevel('level_2', axis=1)
        level_1   c   d
        a b
        1 2      3   4
        5 6      7   8
        9 10    11  12
        """
        labels = self._get_axis(axis)
        new_labels = labels.droplevel(level)
        return self.set_axis(new_labels, axis=axis, copy=None)

    def pop(self, item: Hashable) -> Series | Any:
        result = self[item]
        del self[item]

        return result
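
    # Usage sketch (the user-facing docstring lives on the subclasses):
    #   col = df.pop("b")   # removes column "b" from ``df`` and returns it
    # A missing label raises KeyError, as with ``self[item]`` itself.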

    @final
    def squeeze(self, axis: Axis | None = None):
        """
        Squeeze 1-dimensional axis objects into scalars.

        Series or DataFrames with a single element are squeezed to a scalar.
        DataFrames with a single column or a single row are squeezed to a
        Series. Otherwise the object is unchanged.

        This method is most useful when you don't know if your
        object is a Series or DataFrame, but you do know it has just a single
        column. In that case you can safely call `squeeze` to ensure you have a
        Series.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns', None}, default None
            A specific axis to squeeze. By default, all length-1 axes are
            squeezed. For `Series` this parameter is unused and defaults to `None`.

        Returns
        -------
        DataFrame, Series, or scalar
            The projection after squeezing `axis` or all the axes.

        See Also
        --------
        Series.iloc : Integer-location based indexing for selecting scalars.
        DataFrame.iloc : Integer-location based indexing for selecting Series.
        Series.to_frame : Inverse of DataFrame.squeeze for a
            single-column DataFrame.

        Examples
        --------
        >>> primes = pd.Series([2, 3, 5, 7])

        Slicing might produce a Series with a single value:

        >>> even_primes = primes[primes % 2 == 0]
        >>> even_primes
        0    2
        dtype: int64

        >>> even_primes.squeeze()
        2

        Squeezing objects with more than one value in every axis does nothing:

        >>> odd_primes = primes[primes % 2 == 1]
        >>> odd_primes
        1    3
        2    5
        3    7
        dtype: int64

        >>> odd_primes.squeeze()
        1    3
        2    5
        3    7
        dtype: int64

        Squeezing is even more effective when used with DataFrames.

        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
        >>> df
           a  b
        0  1  2
        1  3  4

        Slicing a single column will produce a DataFrame with the columns
        having only one value:

        >>> df_a = df[['a']]
        >>> df_a
           a
        0  1
        1  3

        So the columns can be squeezed down, resulting in a Series:

        >>> df_a.squeeze('columns')
        0    1
        1    3
        Name: a, dtype: int64

        Slicing a single row from a single column will produce a single
        scalar DataFrame:

        >>> df_0a = df.loc[df.index < 1, ['a']]
        >>> df_0a
           a
        0  1

        Squeezing the rows produces a single scalar Series:

        >>> df_0a.squeeze('rows')
        a    1
        Name: 0, dtype: int64

        Squeezing all axes will project directly into a scalar:

        >>> df_0a.squeeze()
        1
        """
        axes = range(self._AXIS_LEN) if axis is None else (self._get_axis_number(axis),)
        result = self.iloc[
            tuple(
                0 if i in axes and len(a) == 1 else slice(None)
                for i, a in enumerate(self.axes)
            )
        ]
        if isinstance(result, NDFrame):
            result = result.__finalize__(self, method="squeeze")
        return result

    # ----------------------------------------------------------------------
    # Rename

    @final
    def _rename(
        self,
        mapper: Renamer | None = None,
        *,
        index: Renamer | None = None,
        columns: Renamer | None = None,
        axis: Axis | None = None,
        copy: bool_t | None = None,
        inplace: bool_t = False,
        level: Level | None = None,
        errors: str = "ignore",
    ) -> Self | None:
        # called by Series.rename and DataFrame.rename

        if mapper is None and index is None and columns is None:
            raise TypeError("must pass an index to rename")

        if index is not None or columns is not None:
            if axis is not None:
                raise TypeError(
                    "Cannot specify both 'axis' and any of 'index' or 'columns'"
                )
            if mapper is not None:
                raise TypeError(
                    "Cannot specify both 'mapper' and any of 'index' or 'columns'"
                )
        else:
            # use the mapper argument
            if axis and self._get_axis_number(axis) == 1:
                columns = mapper
            else:
                index = mapper

        self._check_inplace_and_allows_duplicate_labels(inplace)
        result = self if inplace else self.copy(deep=copy and not using_copy_on_write())

        for axis_no, replacements in enumerate((index, columns)):
            if replacements is None:
                continue

            ax = self._get_axis(axis_no)
            f = common.get_rename_function(replacements)

            if level is not None:
                level = ax._get_level_number(level)

            # GH 13473
            if not callable(replacements):
                if ax._is_multi and level is not None:
                    indexer = ax.get_level_values(level).get_indexer_for(replacements)
                else:
                    indexer = ax.get_indexer_for(replacements)

                if errors == "raise" and len(indexer[indexer == -1]):
                    missing_labels = [
                        label
                        for index, label in enumerate(replacements)
                        if indexer[index] == -1
                    ]
                    raise KeyError(f"{missing_labels} not found in axis")

            new_index = ax._transform_index(f, level=level)
            result._set_axis_nocheck(new_index, axis=axis_no, inplace=True, copy=False)
            result._clear_item_cache()

        if inplace:
            self._update_inplace(result)
            return None
        else:
            return result.__finalize__(self, method="rename")

    @overload
    def rename_axis(
        self,
        mapper: IndexLabel | lib.NoDefault = ...,
        *,
        index=...,
        columns=...,
        axis: Axis = ...,
        copy: bool_t | None = ...,
        inplace: Literal[False] = ...,
    ) -> Self:
        ...

    @overload
    def rename_axis(
        self,
        mapper: IndexLabel | lib.NoDefault = ...,
        *,
        index=...,
        columns=...,
        axis: Axis = ...,
        copy: bool_t | None = ...,
        inplace: Literal[True],
    ) -> None:
        ...

    @overload
    def rename_axis(
        self,
        mapper: IndexLabel | lib.NoDefault = ...,
        *,
        index=...,
        columns=...,
        axis: Axis = ...,
        copy: bool_t | None = ...,
        inplace: bool_t = ...,
    ) -> Self | None:
        ...

    def rename_axis(
        self,
        mapper: IndexLabel | lib.NoDefault = lib.no_default,
        *,
        index=lib.no_default,
        columns=lib.no_default,
        axis: Axis = 0,
        copy: bool_t | None = None,
        inplace: bool_t = False,
    ) -> Self | None:
        """
        Set the name of the axis for the index or columns.

        Parameters
        ----------
        mapper : scalar, list-like, optional
            Value to set the axis name attribute.
        index, columns : scalar, list-like, dict-like or function, optional
            A scalar, list-like, dict-like or function transformations to
            apply to that axis' values.
            Note that the ``columns`` parameter is not allowed if the
            object is a Series. This parameter only applies to DataFrame
            objects.

            Use either ``mapper`` and ``axis`` to
            specify the axis to target with ``mapper``, or ``index``
            and/or ``columns``.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to rename. For `Series` this parameter is unused and defaults to 0.
        copy : bool, default None
            Also copy underlying data.

            .. note::
                The `copy` keyword will change behavior in pandas 3.0.
                `Copy-on-Write
                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
                will be enabled by default, which means that all methods with a
                `copy` keyword will use a lazy copy mechanism to defer the copy and
                ignore the `copy` keyword. The `copy` keyword will be removed in a
                future version of pandas.

                You can already get the future behavior and improvements by
                enabling copy-on-write: ``pd.options.mode.copy_on_write = True``
        inplace : bool, default False
            Modifies the object directly, instead of creating a new Series
            or DataFrame.

        Returns
        -------
        Series, DataFrame, or None
            The same type as the caller or None if ``inplace=True``.

        See Also
        --------
        Series.rename : Alter Series index labels or name.
        DataFrame.rename : Alter DataFrame index labels or name.
        Index.rename : Set new names on index.

        Notes
        -----
        ``DataFrame.rename_axis`` supports two calling conventions

        * ``(index=index_mapper, columns=columns_mapper, ...)``
        * ``(mapper, axis={'index', 'columns'}, ...)``

        The first calling convention will only modify the names of
        the index and/or the names of the Index object that is the columns.
        In this case, the parameter ``copy`` is ignored.

        The second calling convention will modify the names of the
        corresponding index if mapper is a list or a scalar.
        However, if mapper is dict-like or a function, it will use the
        deprecated behavior of modifying the axis *labels*.

        We *highly* recommend using keyword arguments to clarify your
        intent.

        Examples
        --------
        **Series**

        >>> s = pd.Series(["dog", "cat", "monkey"])
        >>> s
        0       dog
        1       cat
        2    monkey
        dtype: object
        >>> s.rename_axis("animal")
        animal
        0    dog
        1    cat
        2    monkey
        dtype: object

        **DataFrame**

        >>> df = pd.DataFrame({"num_legs": [4, 4, 2],
        ...                    "num_arms": [0, 0, 2]},
        ...                   ["dog", "cat", "monkey"])
        >>> df
                num_legs  num_arms
        dog            4         0
        cat            4         0
        monkey         2         2
        >>> df = df.rename_axis("animal")
        >>> df
                num_legs  num_arms
        animal
        dog            4         0
        cat            4         0
        monkey         2         2
        >>> df = df.rename_axis("limbs", axis="columns")
        >>> df
        limbs   num_legs  num_arms
        animal
        dog            4         0
        cat            4         0
        monkey         2         2

        **MultiIndex**

        >>> df.index = pd.MultiIndex.from_product([['mammal'],
        ...                                        ['dog', 'cat', 'monkey']],
        ...                                       names=['type', 'name'])
        >>> df
        limbs          num_legs  num_arms
        type   name
        mammal dog            4         0
               cat            4         0
               monkey         2         2

        >>> df.rename_axis(index={'type': 'class'})
        limbs          num_legs  num_arms
        class  name
        mammal dog            4         0
               cat            4         0
               monkey         2         2

        >>> df.rename_axis(columns=str.upper)
        LIMBS          num_legs  num_arms
        type   name
        mammal dog            4         0
               cat            4         0
               monkey         2         2
        """
        axes = {"index": index, "columns": columns}

        if axis is not None:
            axis = self._get_axis_number(axis)

        inplace = validate_bool_kwarg(inplace, "inplace")

        if copy and using_copy_on_write():
            copy = False

        if mapper is not lib.no_default:
            # Use v0.23 behavior if a scalar or list
            non_mapper = is_scalar(mapper) or (
                is_list_like(mapper) and not is_dict_like(mapper)
            )
            if non_mapper:
                return self._set_axis_name(
                    mapper, axis=axis, inplace=inplace, copy=copy
                )
            else:
                raise ValueError("Use `.rename` to alter labels with a mapper.")
        else:
            # Use new behavior. Means that index and/or columns
            # is specified
            result = self if inplace else self.copy(deep=copy)

            for axis in range(self._AXIS_LEN):
                v = axes.get(self._get_axis_name(axis))
                if v is lib.no_default:
                    continue
                non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v))
                if non_mapper:
                    newnames = v
                else:
                    f = common.get_rename_function(v)
                    curnames = self._get_axis(axis).names
                    newnames = [f(name) for name in curnames]
                result._set_axis_name(newnames, axis=axis, inplace=True, copy=copy)
            if not inplace:
                return result
            return None

    @final
    def _set_axis_name(
        self, name, axis: Axis = 0, inplace: bool_t = False, copy: bool_t | None = True
    ):
        """
        Set the name(s) of the axis.

        Parameters
        ----------
        name : str or list of str
            Name(s) to set.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to set the label. The value 0 or 'index' specifies index,
            and the value 1 or 'columns' specifies columns.
        inplace : bool, default False
            If `True`, do operation inplace and return None.
        copy : bool, default True
            Whether to make a copy of the result.

        Returns
        -------
        Series, DataFrame, or None
            The same type as the caller or `None` if `inplace` is `True`.

        See Also
        --------
        DataFrame.rename : Alter the axis labels of :class:`DataFrame`.
        Series.rename : Alter the index labels or set the index name
            of :class:`Series`.
        Index.rename : Set the name of :class:`Index` or :class:`MultiIndex`.

        Examples
        --------
        >>> df = pd.DataFrame({"num_legs": [4, 4, 2]},
        ...                   ["dog", "cat", "monkey"])
        >>> df
                num_legs
        dog            4
        cat            4
        monkey         2
        >>> df._set_axis_name("animal")
                num_legs
        animal
        dog            4
        cat            4
        monkey         2
        >>> df.index = pd.MultiIndex.from_product(
        ...                [["mammal"], ['dog', 'cat', 'monkey']])
        >>> df._set_axis_name(["type", "name"])
                       num_legs
        type   name
        mammal dog            4
               cat            4
               monkey         2
        """
        axis = self._get_axis_number(axis)
        idx = self._get_axis(axis).set_names(name)

        inplace = validate_bool_kwarg(inplace, "inplace")
        renamed = self if inplace else self.copy(deep=copy)
        if axis == 0:
            renamed.index = idx
        else:
            renamed.columns = idx

        if not inplace:
            return renamed

    # ----------------------------------------------------------------------
    # Comparison Methods

    @final
    def _indexed_same(self, other) -> bool_t:
        return all(
            self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS
        )

    @final
    def equals(self, other: object) -> bool_t:
        """
        Test whether two objects contain the same elements.

        This function allows two Series or DataFrames to be compared against
        each other to see if they have the same shape and elements. NaNs in
        the same location are considered equal.

        The row/column indexes do not need to have the same type, as long
        as the values are considered equal. Corresponding columns and
        index must be of the same dtype.

        Parameters
        ----------
        other : Series or DataFrame
            The other Series or DataFrame to be compared with the first.

        Returns
        -------
        bool
            True if all elements are the same in both objects, False
            otherwise.

        See Also
        --------
        Series.eq : Compare two Series objects of the same length
            and return a Series where each element is True if the element
            in each Series is equal, False otherwise.
        DataFrame.eq : Compare two DataFrame objects of the same shape and
            return a DataFrame where each element is True if the respective
            element in each DataFrame is equal, False otherwise.
        testing.assert_series_equal : Raises an AssertionError if left and
            right are not equal. Provides an easy interface to ignore
            inequality in dtypes, indexes and precision among others.
        testing.assert_frame_equal : Like assert_series_equal, but targets
            DataFrames.
        numpy.array_equal : Return True if two arrays have the same shape
            and elements, False otherwise.

        Examples
        --------
        >>> df = pd.DataFrame({1: [10], 2: [20]})
        >>> df
            1   2
        0  10  20

        DataFrames df and exactly_equal have the same types and values for
        their elements and column labels, which will return True.

        >>> exactly_equal = pd.DataFrame({1: [10], 2: [20]})
        >>> exactly_equal
            1   2
        0  10  20
        >>> df.equals(exactly_equal)
        True

        DataFrames df and different_column_type have the same element
        types and values, but have different types for the column labels,
        which will still return True.

        >>> different_column_type = pd.DataFrame({1.0: [10], 2.0: [20]})
        >>> different_column_type
           1.0  2.0
        0   10   20
        >>> df.equals(different_column_type)
        True

        DataFrames df and different_data_type have different types for the
        same values for their elements, and will return False even though
        their column labels are the same values and types.

        >>> different_data_type = pd.DataFrame({1: [10.0], 2: [20.0]})
        >>> different_data_type
              1     2
        0  10.0  20.0
        >>> df.equals(different_data_type)
        False
        """
        if not (isinstance(other, type(self)) or isinstance(self, type(other))):
            return False
        other = cast(NDFrame, other)
        return self._mgr.equals(other._mgr)

    # -------------------------------------------------------------------------
    # Unary Methods

    @final
    def __neg__(self) -> Self:
        def blk_func(values: ArrayLike):
            if is_bool_dtype(values.dtype):
                # error: Argument 1 to "inv" has incompatible type "Union
                # [ExtensionArray, ndarray[Any, Any]]"; expected
                # "_SupportsInversion[ndarray[Any, dtype[bool_]]]"
                return operator.inv(values)  # type: ignore[arg-type]
            else:
                # error: Argument 1 to "neg" has incompatible type "Union
                # [ExtensionArray, ndarray[Any, Any]]"; expected
                # "_SupportsNeg[ndarray[Any, dtype[Any]]]"
                return operator.neg(values)  # type: ignore[arg-type]

        new_data = self._mgr.apply(blk_func)
        res = self._constructor_from_mgr(new_data, axes=new_data.axes)
        return res.__finalize__(self, method="__neg__")

    @final
    def __pos__(self) -> Self:
        def blk_func(values: ArrayLike):
            if is_bool_dtype(values.dtype):
                return values.copy()
            else:
                # error: Argument 1 to "pos" has incompatible type "Union
                # [ExtensionArray, ndarray[Any, Any]]"; expected
                # "_SupportsPos[ndarray[Any, dtype[Any]]]"
                return operator.pos(values)  # type: ignore[arg-type]

        new_data = self._mgr.apply(blk_func)
        res = self._constructor_from_mgr(new_data, axes=new_data.axes)
        return res.__finalize__(self, method="__pos__")

    @final
    def __invert__(self) -> Self:
        if not self.size:
            # inv fails with 0 len
            return self.copy(deep=False)

        new_data = self._mgr.apply(operator.invert)
        res = self._constructor_from_mgr(new_data, axes=new_data.axes)
        return res.__finalize__(self, method="__invert__")

    @final
    def __nonzero__(self) -> NoReturn:
        raise ValueError(
            f"The truth value of a {type(self).__name__} is ambiguous. "
            "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
        )

    __bool__ = __nonzero__
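
    # Consequently, truth-testing any Series/DataFrame raises, e.g.:
    #   if pd.Series([True]):   # ValueError: truth value ... is ambiguous
    #       ...
    # Use ``.any()``, ``.all()``, ``.empty`` or ``.item()`` instead.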

    @final
    def bool(self) -> bool_t:
        """
        Return the bool of a single element Series or DataFrame.

        .. deprecated:: 2.1.0

           bool is deprecated and will be removed in a future version of pandas.
           For ``Series`` use ``pandas.Series.item``.

        This must be a boolean scalar value, either True or False. It will raise a
        ValueError if the Series or DataFrame does not have exactly 1 element, or if
        that element is not boolean (integer values 0 and 1 will also raise an
        exception).

        Returns
        -------
        bool
            The value in the Series or DataFrame.

        See Also
        --------
        Series.astype : Change the data type of a Series, including to boolean.
        DataFrame.astype : Change the data type of a DataFrame, including to boolean.
        numpy.bool_ : NumPy boolean data type, used by pandas for boolean values.

        Examples
        --------
        The method will only work for single element objects with a boolean value:

        >>> pd.Series([True]).bool()  # doctest: +SKIP
        True
        >>> pd.Series([False]).bool()  # doctest: +SKIP
        False

        >>> pd.DataFrame({'col': [True]}).bool()  # doctest: +SKIP
        True
        >>> pd.DataFrame({'col': [False]}).bool()  # doctest: +SKIP
        False

        This is an alternative method and will only work
        for single element objects with a boolean value:

        >>> pd.Series([True]).item()  # doctest: +SKIP
        True
        >>> pd.Series([False]).item()  # doctest: +SKIP
        False
        """

        warnings.warn(
            f"{type(self).__name__}.bool is now deprecated and will be removed "
            "in a future version of pandas",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        v = self.squeeze()
        if isinstance(v, (bool, np.bool_)):
            return bool(v)
        elif is_scalar(v):
            raise ValueError(
                "bool cannot act on a non-boolean single element "
                f"{type(self).__name__}"
            )

        self.__nonzero__()
        # for mypy (__nonzero__ raises)
        return True

    @final
    def abs(self) -> Self:
        """
        Return a Series/DataFrame with absolute numeric value of each element.

        This function only applies to elements that are all numeric.

        Returns
        -------
        abs
            Series/DataFrame containing the absolute value of each element.

        See Also
        --------
        numpy.absolute : Calculate the absolute value element-wise.

        Notes
        -----
        For ``complex`` inputs, ``1.2 + 1j``, the absolute value is
        :math:`\\sqrt{ a^2 + b^2 }`.

        Examples
        --------
        Absolute numeric values in a Series.

        >>> s = pd.Series([-1.10, 2, -3.33, 4])
        >>> s.abs()
        0    1.10
        1    2.00
        2    3.33
        3    4.00
        dtype: float64

        Absolute numeric values in a Series with complex numbers.

        >>> s = pd.Series([1.2 + 1j])
        >>> s.abs()
        0    1.56205
        dtype: float64

        Absolute numeric values in a Series with a Timedelta element.

        >>> s = pd.Series([pd.Timedelta('1 days')])
        >>> s.abs()
        0   1 days
        dtype: timedelta64[ns]

        Select rows with data closest to certain value using argsort (from
        `StackOverflow <https://stackoverflow.com/a/17758115>`__).

        >>> df = pd.DataFrame({
        ...     'a': [4, 5, 6, 7],
        ...     'b': [10, 20, 30, 40],
        ...     'c': [100, 50, -30, -50]
        ... })
        >>> df
           a   b    c
        0  4  10  100
        1  5  20   50
        2  6  30  -30
        3  7  40  -50
        >>> df.loc[(df.c - 43).abs().argsort()]
           a   b    c
        1  5  20   50
        0  4  10  100
        2  6  30  -30
        3  7  40  -50
        """
        res_mgr = self._mgr.apply(np.abs)
        return self._constructor_from_mgr(res_mgr, axes=res_mgr.axes).__finalize__(
            self, name="abs"
        )

    @final
    def __abs__(self) -> Self:
        return self.abs()

    @final
    def __round__(self, decimals: int = 0) -> Self:
        return self.round(decimals).__finalize__(self, method="__round__")

    # -------------------------------------------------------------------------
    # Label or Level Combination Helpers
    #
    # A collection of helper methods for DataFrame/Series operations that
    # accept a combination of column/index labels and levels. All such
    # operations should utilize/extend these methods when possible so that we
    # have consistent precedence and validation logic throughout the library.

    @final
    def _is_level_reference(self, key: Level, axis: Axis = 0) -> bool_t:
        """
        Test whether a key is a level reference for a given axis.

        To be considered a level reference, `key` must be a string that:
          - (axis=0): Matches the name of an index level and does NOT match
            a column label.
          - (axis=1): Matches the name of a column level and does NOT match
            an index label.

        Parameters
        ----------
        key : Hashable
            Potential level name for the given axis
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        is_level : bool
        """
        axis_int = self._get_axis_number(axis)

        return (
            key is not None
            and is_hashable(key)
            and key in self.axes[axis_int].names
            and not self._is_label_reference(key, axis=axis_int)
        )

    @final
    def _is_label_reference(self, key: Level, axis: Axis = 0) -> bool_t:
        """
        Test whether a key is a label reference for a given axis.

        To be considered a label reference, `key` must be a string that:
          - (axis=0): Matches a column label
          - (axis=1): Matches an index label

        Parameters
        ----------
        key : Hashable
            Potential label name, i.e. Index entry.
        axis : int, default 0
            Axis perpendicular to the axis that labels are associated with
            (0 means search for column labels, 1 means search for index labels)

        Returns
        -------
        is_label : bool
        """
        axis_int = self._get_axis_number(axis)
        other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis_int)

        return (
            key is not None
            and is_hashable(key)
            and any(key in self.axes[ax] for ax in other_axes)
        )

    @final
    def _is_label_or_level_reference(self, key: Level, axis: AxisInt = 0) -> bool_t:
        """
        Test whether a key is a label or level reference for a given axis.

        To be considered either a label or a level reference, `key` must be a
        string that:
          - (axis=0): Matches a column label or an index level
          - (axis=1): Matches an index label or a column level

        Parameters
        ----------
        key : Hashable
            Potential label or level name
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        bool
        """
        return self._is_level_reference(key, axis=axis) or self._is_label_reference(
            key, axis=axis
        )
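
    # Sketch of the label/level distinction (hypothetical frame): given
    #   df = pd.DataFrame({"col": [1]}, index=pd.Index([10], name="lvl"))
    # ``df._is_level_reference("lvl")`` and ``df._is_label_reference("col")``
    # are both True, and ``_is_label_or_level_reference`` accepts either key.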

    @final
    def _check_label_or_level_ambiguity(self, key: Level, axis: Axis = 0) -> None:
        """
        Check whether `key` is ambiguous.

        By ambiguous, we mean that it matches both a level of the input
        `axis` and a label of the other axis.

        Parameters
        ----------
        key : Hashable
            Label or level name.
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns).

        Raises
        ------
        ValueError
            If `key` is ambiguous.
        """

        axis_int = self._get_axis_number(axis)
        other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis_int)

        if (
            key is not None
            and is_hashable(key)
            and key in self.axes[axis_int].names
            and any(key in self.axes[ax] for ax in other_axes)
        ):
            # Build an informative and grammatical warning
            level_article, level_type = (
                ("an", "index") if axis_int == 0 else ("a", "column")
            )

            label_article, label_type = (
                ("a", "column") if axis_int == 0 else ("an", "index")
            )

            msg = (
                f"'{key}' is both {level_article} {level_type} level and "
                f"{label_article} {label_type} label, which is ambiguous."
            )
            raise ValueError(msg)

    @final
    def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike:
        """
        Return a 1-D array of values associated with `key`, a label or level
        from the given `axis`.

        Retrieval logic:
          - (axis=0): Return column values if `key` matches a column label.
            Otherwise return index level values if `key` matches an index
            level.
          - (axis=1): Return row values if `key` matches an index label.
            Otherwise return column level values if `key` matches a column
            level.

        Parameters
        ----------
        key : Hashable
            Label or level name.
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        np.ndarray or ExtensionArray

        Raises
        ------
        KeyError
            if `key` matches neither a label nor a level
        ValueError
            if `key` matches multiple labels
        """
        axis = self._get_axis_number(axis)
        other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis]

        if self._is_label_reference(key, axis=axis):
            self._check_label_or_level_ambiguity(key, axis=axis)
            values = self.xs(key, axis=other_axes[0])._values
        elif self._is_level_reference(key, axis=axis):
            values = self.axes[axis].get_level_values(key)._values
        else:
            raise KeyError(key)

        # Check for duplicates
        if values.ndim > 1:
            if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex):
                multi_message = (
                    "\n"
                    "For a multi-index, the label must be a "
                    "tuple with elements corresponding to each level."
                )
            else:
                multi_message = ""

            label_axis_name = "column" if axis == 0 else "index"
            raise ValueError(
                f"The {label_axis_name} label '{key}' is not unique.{multi_message}"
            )

        return values
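
    # Continuing the sketch above: ``df._get_label_or_level_values("col")``
    # returns the column's values, while ``"lvl"`` returns the index level
    # values; a key matching neither raises KeyError.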

    @final
    def _drop_labels_or_levels(self, keys, axis: AxisInt = 0):
        """
        Drop labels and/or levels for the given `axis`.

        For each key in `keys`:
          - (axis=0): If key matches a column label then drop the column.
            Otherwise if key matches an index level then drop the level.
          - (axis=1): If key matches an index label then drop the row.
            Otherwise if key matches a column level then drop the level.

        Parameters
        ----------
        keys : str or list of str
            labels or levels to drop
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        dropped : DataFrame

        Raises
        ------
        ValueError
            if any `keys` match neither a label nor a level
        """
        axis = self._get_axis_number(axis)

        # Validate keys
        keys = common.maybe_make_list(keys)
        invalid_keys = [
            k for k in keys if not self._is_label_or_level_reference(k, axis=axis)
        ]

        if invalid_keys:
            raise ValueError(
                "The following keys are not valid labels or "
                f"levels for axis {axis}: {invalid_keys}"
            )

        # Compute levels and labels to drop
        levels_to_drop = [k for k in keys if self._is_level_reference(k, axis=axis)]

        labels_to_drop = [k for k in keys if not self._is_level_reference(k, axis=axis)]

        # Perform copy upfront and then use inplace operations below.
        # This ensures that we always perform exactly one copy.
        # ``copy`` and/or ``inplace`` options could be added in the future.
        dropped = self.copy(deep=False)

        if axis == 0:
            # Handle dropping index levels
            if levels_to_drop:
                dropped.reset_index(levels_to_drop, drop=True, inplace=True)

            # Handle dropping column labels
            if labels_to_drop:
                dropped.drop(labels_to_drop, axis=1, inplace=True)
        else:
            # Handle dropping column levels
            if levels_to_drop:
                if isinstance(dropped.columns, MultiIndex):
                    # Drop the specified levels from the MultiIndex
                    dropped.columns = dropped.columns.droplevel(levels_to_drop)
                else:
                    # Drop the last level of Index by replacing with
                    # a RangeIndex
                    dropped.columns = RangeIndex(dropped.columns.size)

            # Handle dropping index labels
            if labels_to_drop:
                dropped.drop(labels_to_drop, axis=0, inplace=True)

        return dropped
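
    # Rough usage sketch: with an index level "lvl" and a column "col",
    # ``df._drop_labels_or_levels(["lvl", "col"])`` drops the level (via
    # ``reset_index(drop=True)``) and the column on a single shallow copy.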
2006
2007 # ----------------------------------------------------------------------
2008 # Iteration
2009
2010 # https://github.com/python/typeshed/issues/2148#issuecomment-520783318
2011 # Incompatible types in assignment (expression has type "None", base class
2012 # "object" defined the type as "Callable[[object], int]")
2013 __hash__: ClassVar[None] # type: ignore[assignment]
2014
2015 def __iter__(self) -> Iterator:
2016 """
2017 Iterate over info axis.
2018
2019 Returns
2020 -------
2021 iterator
2022 Info axis as iterator.
2023
2024 Examples
2025 --------
2026 >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
2027 >>> for x in df:
2028 ... print(x)
2029 A
2030 B
2031 """
2032 return iter(self._info_axis)
2033
    # TODO: give a better explanation of the "info axis" here
2035 def keys(self) -> Index:
2036 """
2037 Get the 'info axis' (see Indexing for more).
2038
2039 This is index for Series, columns for DataFrame.
2040
2041 Returns
2042 -------
2043 Index
2044 Info axis.
2045
2046 Examples
2047 --------
2048 >>> d = pd.DataFrame(data={'A': [1, 2, 3], 'B': [0, 4, 8]},
2049 ... index=['a', 'b', 'c'])
2050 >>> d
2051 A B
2052 a 1 0
2053 b 2 4
2054 c 3 8
2055 >>> d.keys()
2056 Index(['A', 'B'], dtype='object')
2057 """
2058 return self._info_axis
2059
2060 def items(self):
2061 """
        Iterate over (label, values) pairs on the info axis.
2063
2064 This is index for Series and columns for DataFrame.
2065
2066 Returns
2067 -------
2068 Generator
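
        Examples
        --------
        Iterating over a DataFrame yields each column label together with the
        column itself as a Series:

        >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
        >>> for label, values in df.items():
        ...     print(label)
        A
        B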
2069 """
2070 for h in self._info_axis:
2071 yield h, self[h]
2072
2073 def __len__(self) -> int:
2074 """Returns length of info axis"""
2075 return len(self._info_axis)
2076
2077 @final
2078 def __contains__(self, key) -> bool_t:
2079 """True if the key is in the info axis"""
2080 return key in self._info_axis
2081
2082 @property
2083 def empty(self) -> bool_t:
2084 """
2085 Indicator whether Series/DataFrame is empty.
2086
2087 True if Series/DataFrame is entirely empty (no items), meaning any of the
2088 axes are of length 0.
2089
2090 Returns
2091 -------
2092 bool
            If Series/DataFrame is empty, return True; if not, return False.
2094
2095 See Also
2096 --------
2097 Series.dropna : Return series without null values.
2098 DataFrame.dropna : Return DataFrame with labels on given axis omitted
2099 where (all or any) data are missing.
2100
2101 Notes
2102 -----
2103 If Series/DataFrame contains only NaNs, it is still not considered empty. See
2104 the example below.
2105
2106 Examples
2107 --------
2108 An example of an actual empty DataFrame. Notice the index is empty:
2109
2110 >>> df_empty = pd.DataFrame({'A' : []})
2111 >>> df_empty
2112 Empty DataFrame
2113 Columns: [A]
2114 Index: []
2115 >>> df_empty.empty
2116 True
2117
2118 If we only have NaNs in our DataFrame, it is not considered empty! We
2119 will need to drop the NaNs to make the DataFrame empty:
2120
2121 >>> df = pd.DataFrame({'A' : [np.nan]})
2122 >>> df
2123 A
2124 0 NaN
2125 >>> df.empty
2126 False
2127 >>> df.dropna().empty
2128 True
2129
2130 >>> ser_empty = pd.Series({'A' : []})
2131 >>> ser_empty
2132 A []
2133 dtype: object
2134 >>> ser_empty.empty
2135 False
2136 >>> ser_empty = pd.Series()
2137 >>> ser_empty.empty
2138 True
2139 """
2140 return any(len(self._get_axis(a)) == 0 for a in self._AXIS_ORDERS)
2141
2142 # ----------------------------------------------------------------------
2143 # Array Interface
2144
2145 # This is also set in IndexOpsMixin
2146 # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented
2147 __array_priority__: int = 1000
2148
2149 def __array__(
2150 self, dtype: npt.DTypeLike | None = None, copy: bool_t | None = None
2151 ) -> np.ndarray:
2152 values = self._values
2153 arr = np.asarray(values, dtype=dtype)
2154 if (
2155 astype_is_view(values.dtype, arr.dtype)
2156 and using_copy_on_write()
2157 and self._mgr.is_single_block
2158 ):
2159 # Check if both conversions can be done without a copy
2160 if astype_is_view(self.dtypes.iloc[0], values.dtype) and astype_is_view(
2161 values.dtype, arr.dtype
2162 ):
2163 arr = arr.view()
2164 arr.flags.writeable = False
2165 return arr
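
    # Illustrative sketch (not part of the API surface): under copy-on-write,
    # ``np.asarray`` on a single-block frame needing no dtype conversion
    # returns a read-only view, e.g.
    #
    #   df = pd.DataFrame({"a": [1, 2]})
    #   arr = np.asarray(df)
    #   arr.flags.writeable  # False when copy-on-write is enabled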
2166
2167 @final
2168 def __array_ufunc__(
2169 self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any
2170 ):
2171 return arraylike.array_ufunc(self, ufunc, method, *inputs, **kwargs)
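
    # Illustrative note: NumPy ufuncs such as ``np.exp(ser)`` or
    # ``np.add(df, 1)`` dispatch through this hook, so results keep their
    # pandas type and alignment semantics.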
2172
2173 # ----------------------------------------------------------------------
2174 # Picklability
2175
2176 @final
2177 def __getstate__(self) -> dict[str, Any]:
2178 meta = {k: getattr(self, k, None) for k in self._metadata}
2179 return {
2180 "_mgr": self._mgr,
2181 "_typ": self._typ,
2182 "_metadata": self._metadata,
2183 "attrs": self.attrs,
2184 "_flags": {k: self.flags[k] for k in self.flags._keys},
2185 **meta,
2186 }
2187
2188 @final
2189 def __setstate__(self, state) -> None:
2190 if isinstance(state, BlockManager):
2191 self._mgr = state
2192 elif isinstance(state, dict):
2193 if "_data" in state and "_mgr" not in state:
2194 # compat for older pickles
2195 state["_mgr"] = state.pop("_data")
2196 typ = state.get("_typ")
2197 if typ is not None:
2198 attrs = state.get("_attrs", {})
2199 if attrs is None: # should not happen, but better be on the safe side
2200 attrs = {}
2201 object.__setattr__(self, "_attrs", attrs)
2202 flags = state.get("_flags", {"allows_duplicate_labels": True})
2203 object.__setattr__(self, "_flags", Flags(self, **flags))
2204
            # set in the order of internal names first, to avoid
            # definitional recursion (e.g. ``fill_value`` may need
            # ``_mgr`` to already be defined)
2209 meta = set(self._internal_names + self._metadata)
2210 for k in list(meta):
2211 if k in state and k != "_flags":
2212 v = state[k]
2213 object.__setattr__(self, k, v)
2214
2215 for k, v in state.items():
2216 if k not in meta:
2217 object.__setattr__(self, k, v)
2218
2219 else:
2220 raise NotImplementedError("Pre-0.12 pickles are no longer supported")
2221 elif len(state) == 2:
2222 raise NotImplementedError("Pre-0.12 pickles are no longer supported")
2223
2224 self._item_cache: dict[Hashable, Series] = {}
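
    # Illustrative round trip: ``pickle.loads(pickle.dumps(obj))`` exercises
    # ``__getstate__``/``__setstate__`` above, restoring the manager along
    # with ``attrs``, ``flags`` and any ``_metadata`` fields.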
2225
2226 # ----------------------------------------------------------------------
2227 # Rendering Methods
2228
2229 def __repr__(self) -> str:
2230 # string representation based upon iterating over self
2231 # (since, by definition, `PandasContainers` are iterable)
2232 prepr = f"[{','.join(map(pprint_thing, self))}]"
2233 return f"{type(self).__name__}({prepr})"
2234
2235 @final
2236 def _repr_latex_(self):
2237 """
2238 Returns a LaTeX representation for a particular object.
2239 Mainly for use with nbconvert (jupyter notebook conversion to pdf).
2240 """
2241 if config.get_option("styler.render.repr") == "latex":
2242 return self.to_latex()
2243 else:
2244 return None
2245
2246 @final
2247 def _repr_data_resource_(self):
2248 """
2249 Not a real Jupyter special repr method, but we use the same
2250 naming convention.
2251 """
2252 if config.get_option("display.html.table_schema"):
2253 data = self.head(config.get_option("display.max_rows"))
2254
2255 as_json = data.to_json(orient="table")
2256 as_json = cast(str, as_json)
2257 return loads(as_json, object_pairs_hook=collections.OrderedDict)
2258
2259 # ----------------------------------------------------------------------
2260 # I/O Methods
2261
2262 @final
2263 @deprecate_nonkeyword_arguments(
2264 version="3.0", allowed_args=["self", "excel_writer"], name="to_excel"
2265 )
2266 @doc(
2267 klass="object",
2268 storage_options=_shared_docs["storage_options"],
2269 storage_options_versionadded="1.2.0",
2270 )
2271 def to_excel(
2272 self,
2273 excel_writer: FilePath | WriteExcelBuffer | ExcelWriter,
2274 sheet_name: str = "Sheet1",
2275 na_rep: str = "",
2276 float_format: str | None = None,
2277 columns: Sequence[Hashable] | None = None,
2278 header: Sequence[Hashable] | bool_t = True,
2279 index: bool_t = True,
2280 index_label: IndexLabel | None = None,
2281 startrow: int = 0,
2282 startcol: int = 0,
2283 engine: Literal["openpyxl", "xlsxwriter"] | None = None,
2284 merge_cells: bool_t = True,
2285 inf_rep: str = "inf",
2286 freeze_panes: tuple[int, int] | None = None,
2287 storage_options: StorageOptions | None = None,
2288 engine_kwargs: dict[str, Any] | None = None,
2289 ) -> None:
2290 """
2291 Write {klass} to an Excel sheet.
2292
2293 To write a single {klass} to an Excel .xlsx file it is only necessary to
2294 specify a target file name. To write to multiple sheets it is necessary to
2295 create an `ExcelWriter` object with a target file name, and specify a sheet
2296 in the file to write to.
2297
        Multiple sheets may be written to by specifying a unique `sheet_name`
        for each one. Once all data has been written to the file, the changes
        must be saved.
2300 Note that creating an `ExcelWriter` object with a file name that already
2301 exists will result in the contents of the existing file being erased.
2302
2303 Parameters
2304 ----------
2305 excel_writer : path-like, file-like, or ExcelWriter object
2306 File path or existing ExcelWriter.
2307 sheet_name : str, default 'Sheet1'
2308 Name of sheet which will contain DataFrame.
2309 na_rep : str, default ''
2310 Missing data representation.
2311 float_format : str, optional
2312 Format string for floating point numbers. For example
2313 ``float_format="%.2f"`` will format 0.1234 to 0.12.
2314 columns : sequence or list of str, optional
2315 Columns to write.
        header : bool or list of str, default True
            Write out the column names. If a list of strings is given, it is
            assumed to be aliases for the column names.
2319 index : bool, default True
2320 Write row names (index).
2321 index_label : str or sequence, optional
2322 Column label for index column(s) if desired. If not specified, and
2323 `header` and `index` are True, then the index names are used. A
2324 sequence should be given if the DataFrame uses MultiIndex.
2325 startrow : int, default 0
2326 Upper left cell row to dump data frame.
2327 startcol : int, default 0
2328 Upper left cell column to dump data frame.
2329 engine : str, optional
2330 Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this
2331 via the options ``io.excel.xlsx.writer`` or
2332 ``io.excel.xlsm.writer``.
2333
2334 merge_cells : bool, default True
2335 Write MultiIndex and Hierarchical Rows as merged cells.
2336 inf_rep : str, default 'inf'
2337 Representation for infinity (there is no native representation for
2338 infinity in Excel).
2339 freeze_panes : tuple of int (length 2), optional
2340 Specifies the one-based bottommost row and rightmost column that
2341 is to be frozen.
2342 {storage_options}
2343
2344 .. versionadded:: {storage_options_versionadded}
2345 engine_kwargs : dict, optional
2346 Arbitrary keyword arguments passed to excel engine.
2347
2348 See Also
2349 --------
2350 to_csv : Write DataFrame to a comma-separated values (csv) file.
2351 ExcelWriter : Class for writing DataFrame objects into excel sheets.
2352 read_excel : Read an Excel file into a pandas DataFrame.
2353 read_csv : Read a comma-separated values (csv) file into DataFrame.
2354 io.formats.style.Styler.to_excel : Add styles to Excel sheet.
2355
2356 Notes
2357 -----
2358 For compatibility with :meth:`~DataFrame.to_csv`,
2359 to_excel serializes lists and dicts to strings before writing.
2360
2361 Once a workbook has been saved it is not possible to write further
2362 data without rewriting the whole workbook.
2363
2364 Examples
2365 --------
2366
2367 Create, write to and save a workbook:
2368
2369 >>> df1 = pd.DataFrame([['a', 'b'], ['c', 'd']],
2370 ... index=['row 1', 'row 2'],
2371 ... columns=['col 1', 'col 2'])
2372 >>> df1.to_excel("output.xlsx") # doctest: +SKIP
2373
2374 To specify the sheet name:
2375
2376 >>> df1.to_excel("output.xlsx",
2377 ... sheet_name='Sheet_name_1') # doctest: +SKIP
2378
2379 If you wish to write to more than one sheet in the workbook, it is
2380 necessary to specify an ExcelWriter object:
2381
2382 >>> df2 = df1.copy()
2383 >>> with pd.ExcelWriter('output.xlsx') as writer: # doctest: +SKIP
2384 ... df1.to_excel(writer, sheet_name='Sheet_name_1')
2385 ... df2.to_excel(writer, sheet_name='Sheet_name_2')
2386
2387 ExcelWriter can also be used to append to an existing Excel file:
2388
2389 >>> with pd.ExcelWriter('output.xlsx',
2390 ... mode='a') as writer: # doctest: +SKIP
2391 ... df1.to_excel(writer, sheet_name='Sheet_name_3')
2392
2393 To set the library that is used to write the Excel file,
2394 you can pass the `engine` keyword (the default engine is
2395 automatically chosen depending on the file extension):
2396
2397 >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') # doctest: +SKIP
2398 """
2399 if engine_kwargs is None:
2400 engine_kwargs = {}
2401
2402 df = self if isinstance(self, ABCDataFrame) else self.to_frame()
2403
2404 from pandas.io.formats.excel import ExcelFormatter
2405
2406 formatter = ExcelFormatter(
2407 df,
2408 na_rep=na_rep,
2409 cols=columns,
2410 header=header,
2411 float_format=float_format,
2412 index=index,
2413 index_label=index_label,
2414 merge_cells=merge_cells,
2415 inf_rep=inf_rep,
2416 )
2417 formatter.write(
2418 excel_writer,
2419 sheet_name=sheet_name,
2420 startrow=startrow,
2421 startcol=startcol,
2422 freeze_panes=freeze_panes,
2423 engine=engine,
2424 storage_options=storage_options,
2425 engine_kwargs=engine_kwargs,
2426 )
2427
2428 @final
2429 @deprecate_nonkeyword_arguments(
2430 version="3.0", allowed_args=["self", "path_or_buf"], name="to_json"
2431 )
2432 @doc(
2433 storage_options=_shared_docs["storage_options"],
2434 compression_options=_shared_docs["compression_options"] % "path_or_buf",
2435 )
2436 def to_json(
2437 self,
2438 path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
2439 orient: Literal["split", "records", "index", "table", "columns", "values"]
2440 | None = None,
2441 date_format: str | None = None,
2442 double_precision: int = 10,
2443 force_ascii: bool_t = True,
2444 date_unit: TimeUnit = "ms",
2445 default_handler: Callable[[Any], JSONSerializable] | None = None,
2446 lines: bool_t = False,
2447 compression: CompressionOptions = "infer",
2448 index: bool_t | None = None,
2449 indent: int | None = None,
2450 storage_options: StorageOptions | None = None,
2451 mode: Literal["a", "w"] = "w",
2452 ) -> str | None:
2453 """
2454 Convert the object to a JSON string.
2455
        Note that NaN values and None will be converted to null and datetime
        objects will be converted to UNIX timestamps.
2458
2459 Parameters
2460 ----------
2461 path_or_buf : str, path object, file-like object, or None, default None
2462 String, path object (implementing os.PathLike[str]), or file-like
2463 object implementing a write() function. If None, the result is
2464 returned as a string.
2465 orient : str
2466 Indication of expected JSON string format.
2467
2468 * Series:
2469
2470 - default is 'index'
2471 - allowed values are: {{'split', 'records', 'index', 'table'}}.
2472
2473 * DataFrame:
2474
2475 - default is 'columns'
2476 - allowed values are: {{'split', 'records', 'index', 'columns',
2477 'values', 'table'}}.
2478
2479 * The format of the JSON string:
2480
2481 - 'split' : dict like {{'index' -> [index], 'columns' -> [columns],
2482 'data' -> [values]}}
2483 - 'records' : list like [{{column -> value}}, ... , {{column -> value}}]
2484 - 'index' : dict like {{index -> {{column -> value}}}}
2485 - 'columns' : dict like {{column -> {{index -> value}}}}
2486 - 'values' : just the values array
2487 - 'table' : dict like {{'schema': {{schema}}, 'data': {{data}}}}
2488
2489 Describing the data, where data component is like ``orient='records'``.
2490
2491 date_format : {{None, 'epoch', 'iso'}}
2492 Type of date conversion. 'epoch' = epoch milliseconds,
2493 'iso' = ISO8601. The default depends on the `orient`. For
2494 ``orient='table'``, the default is 'iso'. For all other orients,
2495 the default is 'epoch'.
2496 double_precision : int, default 10
2497 The number of decimal places to use when encoding
2498 floating point values. The possible maximal value is 15.
2499 Passing double_precision greater than 15 will raise a ValueError.
2500 force_ascii : bool, default True
2501 Force encoded string to be ASCII.
2502 date_unit : str, default 'ms' (milliseconds)
2503 The time unit to encode to, governs timestamp and ISO8601
2504 precision. One of 's', 'ms', 'us', 'ns' for second, millisecond,
2505 microsecond, and nanosecond respectively.
2506 default_handler : callable, default None
2507 Handler to call if object cannot otherwise be converted to a
2508 suitable format for JSON. Should receive a single argument which is
2509 the object to convert and return a serialisable object.
        lines : bool, default False
            If 'orient' is 'records', write out line-delimited JSON. A
            ValueError is raised for any other 'orient', since the other
            formats are not list-like.
2514 {compression_options}
2515
2516 .. versionchanged:: 1.4.0 Zstandard support.
2517
        index : bool or None, default None
            The index is only used when 'orient' is 'split', 'index',
            'columns', or 'table'. Of these, 'index' and 'columns' do not
            support `index=False`.
2522
2523 indent : int, optional
2524 Length of whitespace used to indent each record.
2525
2526 {storage_options}
2527
2528 mode : str, default 'w' (writing)
2529 Specify the IO mode for output when supplying a path_or_buf.
2530 Accepted args are 'w' (writing) and 'a' (append) only.
2531 mode='a' is only supported when lines is True and orient is 'records'.
2532
2533 Returns
2534 -------
2535 None or str
2536 If path_or_buf is None, returns the resulting json format as a
2537 string. Otherwise returns None.
2538
2539 See Also
2540 --------
2541 read_json : Convert a JSON string to pandas object.
2542
2543 Notes
2544 -----
2545 The behavior of ``indent=0`` varies from the stdlib, which does not
2546 indent the output but does insert newlines. Currently, ``indent=0``
2547 and the default ``indent=None`` are equivalent in pandas, though this
2548 may change in a future release.
2549
2550 ``orient='table'`` contains a 'pandas_version' field under 'schema'.
2551 This stores the version of `pandas` used in the latest revision of the
2552 schema.
2553
2554 Examples
2555 --------
2556 >>> from json import loads, dumps
2557 >>> df = pd.DataFrame(
2558 ... [["a", "b"], ["c", "d"]],
2559 ... index=["row 1", "row 2"],
2560 ... columns=["col 1", "col 2"],
2561 ... )
2562
2563 >>> result = df.to_json(orient="split")
2564 >>> parsed = loads(result)
2565 >>> dumps(parsed, indent=4) # doctest: +SKIP
2566 {{
2567 "columns": [
2568 "col 1",
2569 "col 2"
2570 ],
2571 "index": [
2572 "row 1",
2573 "row 2"
2574 ],
2575 "data": [
2576 [
2577 "a",
2578 "b"
2579 ],
2580 [
2581 "c",
2582 "d"
2583 ]
2584 ]
2585 }}
2586
2587 Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
2588 Note that index labels are not preserved with this encoding.
2589
2590 >>> result = df.to_json(orient="records")
2591 >>> parsed = loads(result)
2592 >>> dumps(parsed, indent=4) # doctest: +SKIP
2593 [
2594 {{
2595 "col 1": "a",
2596 "col 2": "b"
2597 }},
2598 {{
2599 "col 1": "c",
2600 "col 2": "d"
2601 }}
2602 ]
2603
2604 Encoding/decoding a Dataframe using ``'index'`` formatted JSON:
2605
2606 >>> result = df.to_json(orient="index")
2607 >>> parsed = loads(result)
2608 >>> dumps(parsed, indent=4) # doctest: +SKIP
2609 {{
2610 "row 1": {{
2611 "col 1": "a",
2612 "col 2": "b"
2613 }},
2614 "row 2": {{
2615 "col 1": "c",
2616 "col 2": "d"
2617 }}
2618 }}
2619
2620 Encoding/decoding a Dataframe using ``'columns'`` formatted JSON:
2621
2622 >>> result = df.to_json(orient="columns")
2623 >>> parsed = loads(result)
2624 >>> dumps(parsed, indent=4) # doctest: +SKIP
2625 {{
2626 "col 1": {{
2627 "row 1": "a",
2628 "row 2": "c"
2629 }},
2630 "col 2": {{
2631 "row 1": "b",
2632 "row 2": "d"
2633 }}
2634 }}
2635
2636 Encoding/decoding a Dataframe using ``'values'`` formatted JSON:
2637
2638 >>> result = df.to_json(orient="values")
2639 >>> parsed = loads(result)
2640 >>> dumps(parsed, indent=4) # doctest: +SKIP
2641 [
2642 [
2643 "a",
2644 "b"
2645 ],
2646 [
2647 "c",
2648 "d"
2649 ]
2650 ]
2651
2652 Encoding with Table Schema:
2653
2654 >>> result = df.to_json(orient="table")
2655 >>> parsed = loads(result)
2656 >>> dumps(parsed, indent=4) # doctest: +SKIP
2657 {{
2658 "schema": {{
2659 "fields": [
2660 {{
2661 "name": "index",
2662 "type": "string"
2663 }},
2664 {{
2665 "name": "col 1",
2666 "type": "string"
2667 }},
2668 {{
2669 "name": "col 2",
2670 "type": "string"
2671 }}
2672 ],
2673 "primaryKey": [
2674 "index"
2675 ],
2676 "pandas_version": "1.4.0"
2677 }},
2678 "data": [
2679 {{
2680 "index": "row 1",
2681 "col 1": "a",
2682 "col 2": "b"
2683 }},
2684 {{
2685 "index": "row 2",
2686 "col 1": "c",
2687 "col 2": "d"
2688 }}
2689 ]
2690 }}
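
        Encoding line-delimited JSON (``lines=True`` requires
        ``orient='records'``); a short sketch:

        >>> print(df.to_json(orient="records", lines=True))  # doctest: +SKIP
        {{"col 1":"a","col 2":"b"}}
        {{"col 1":"c","col 2":"d"}}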
2691 """
2692 from pandas.io import json
2693
2694 if date_format is None and orient == "table":
2695 date_format = "iso"
2696 elif date_format is None:
2697 date_format = "epoch"
2698
2699 config.is_nonnegative_int(indent)
2700 indent = indent or 0
2701
2702 return json.to_json(
2703 path_or_buf=path_or_buf,
2704 obj=self,
2705 orient=orient,
2706 date_format=date_format,
2707 double_precision=double_precision,
2708 force_ascii=force_ascii,
2709 date_unit=date_unit,
2710 default_handler=default_handler,
2711 lines=lines,
2712 compression=compression,
2713 index=index,
2714 indent=indent,
2715 storage_options=storage_options,
2716 mode=mode,
2717 )
2718
2719 @final
2720 @deprecate_nonkeyword_arguments(
2721 version="3.0", allowed_args=["self", "path_or_buf"], name="to_hdf"
2722 )
2723 def to_hdf(
2724 self,
2725 path_or_buf: FilePath | HDFStore,
2726 key: str,
2727 mode: Literal["a", "w", "r+"] = "a",
2728 complevel: int | None = None,
2729 complib: Literal["zlib", "lzo", "bzip2", "blosc"] | None = None,
2730 append: bool_t = False,
2731 format: Literal["fixed", "table"] | None = None,
2732 index: bool_t = True,
2733 min_itemsize: int | dict[str, int] | None = None,
2734 nan_rep=None,
2735 dropna: bool_t | None = None,
2736 data_columns: Literal[True] | list[str] | None = None,
2737 errors: OpenFileErrors = "strict",
2738 encoding: str = "UTF-8",
2739 ) -> None:
2740 """
2741 Write the contained data to an HDF5 file using HDFStore.
2742
2743 Hierarchical Data Format (HDF) is self-describing, allowing an
2744 application to interpret the structure and contents of a file with
2745 no outside information. One HDF file can hold a mix of related objects
2746 which can be accessed as a group or as individual objects.
2747
        To add another DataFrame or Series to an existing HDF file,
        please use append mode and a different key.
2750
2751 .. warning::
2752
2753 One can store a subclass of ``DataFrame`` or ``Series`` to HDF5,
2754 but the type of the subclass is lost upon storing.
2755
2756 For more information see the :ref:`user guide <io.hdf5>`.
2757
2758 Parameters
2759 ----------
2760 path_or_buf : str or pandas.HDFStore
2761 File path or HDFStore object.
2762 key : str
2763 Identifier for the group in the store.
2764 mode : {'a', 'w', 'r+'}, default 'a'
2765 Mode to open file:
2766
2767 - 'w': write, a new file is created (an existing file with
2768 the same name would be deleted).
2769 - 'a': append, an existing file is opened for reading and
2770 writing, and if the file does not exist it is created.
2771 - 'r+': similar to 'a', but the file must already exist.
2772 complevel : {0-9}, default None
2773 Specifies a compression level for data.
2774 A value of 0 or None disables compression.
2775 complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
2776 Specifies the compression library to be used.
2777 These additional compressors for Blosc are supported
2778 (default if no compressor specified: 'blosc:blosclz'):
2779 {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
2780 'blosc:zlib', 'blosc:zstd'}.
            Specifying a compression library which is not available raises
            a ValueError.
        append : bool, default False
            For Table formats, append the input data to the existing data.
2785 format : {'fixed', 'table', None}, default 'fixed'
2786 Possible values:
2787
2788 - 'fixed': Fixed format. Fast writing/reading. Not-appendable,
2789 nor searchable.
2790 - 'table': Table format. Write as a PyTables Table structure
2791 which may perform worse but allow more flexible operations
2792 like searching / selecting subsets of the data.
2793 - If None, pd.get_option('io.hdf.default_format') is checked,
2794 followed by fallback to "fixed".
2795 index : bool, default True
2796 Write DataFrame index as a column.
2797 min_itemsize : dict or int, optional
2798 Map column names to minimum string sizes for columns.
2799 nan_rep : Any, optional
2800 How to represent null values as str.
2801 Not allowed with append=True.
        dropna : bool, default False
2803 Remove missing values.
2804 data_columns : list of columns or True, optional
2805 List of columns to create as indexed data columns for on-disk
2806 queries, or True to use all columns. By default only the axes
2807 of the object are indexed. See
            :ref:`Query via data columns<io.hdf5-query-data-columns>` for
2809 more information.
2810 Applicable only to format='table'.
2811 errors : str, default 'strict'
2812 Specifies how encoding and decoding errors are to be handled.
2813 See the errors argument for :func:`open` for a full list
2814 of options.
        encoding : str, default "UTF-8"
            Character encoding for string data stored in the file.
2816
2817 See Also
2818 --------
2819 read_hdf : Read from HDF file.
2820 DataFrame.to_orc : Write a DataFrame to the binary orc format.
2821 DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
2822 DataFrame.to_sql : Write to a SQL table.
2823 DataFrame.to_feather : Write out feather-format for DataFrames.
2824 DataFrame.to_csv : Write out to a csv file.
2825
2826 Examples
2827 --------
2828 >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
2829 ... index=['a', 'b', 'c']) # doctest: +SKIP
2830 >>> df.to_hdf('data.h5', key='df', mode='w') # doctest: +SKIP
2831
2832 We can add another object to the same file:
2833
2834 >>> s = pd.Series([1, 2, 3, 4]) # doctest: +SKIP
2835 >>> s.to_hdf('data.h5', key='s') # doctest: +SKIP
2836
2837 Reading from HDF file:
2838
2839 >>> pd.read_hdf('data.h5', 'df') # doctest: +SKIP
2840 A B
2841 a 1 4
2842 b 2 5
2843 c 3 6
2844 >>> pd.read_hdf('data.h5', 's') # doctest: +SKIP
2845 0 1
2846 1 2
2847 2 3
2848 3 4
2849 dtype: int64
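
        Appending to an existing key requires the table format (a sketch;
        paths are illustrative):

        >>> df.to_hdf('data.h5', key='df2', format='table') # doctest: +SKIP
        >>> df.to_hdf('data.h5', key='df2', append=True,
        ...           format='table') # doctest: +SKIP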
2850 """
2851 from pandas.io import pytables
2852
2853 # Argument 3 to "to_hdf" has incompatible type "NDFrame"; expected
2854 # "Union[DataFrame, Series]" [arg-type]
2855 pytables.to_hdf(
2856 path_or_buf,
2857 key,
2858 self, # type: ignore[arg-type]
2859 mode=mode,
2860 complevel=complevel,
2861 complib=complib,
2862 append=append,
2863 format=format,
2864 index=index,
2865 min_itemsize=min_itemsize,
2866 nan_rep=nan_rep,
2867 dropna=dropna,
2868 data_columns=data_columns,
2869 errors=errors,
2870 encoding=encoding,
2871 )
2872
2873 @final
2874 @deprecate_nonkeyword_arguments(
2875 version="3.0", allowed_args=["self", "name", "con"], name="to_sql"
2876 )
2877 def to_sql(
2878 self,
2879 name: str,
2880 con,
2881 schema: str | None = None,
2882 if_exists: Literal["fail", "replace", "append"] = "fail",
2883 index: bool_t = True,
2884 index_label: IndexLabel | None = None,
2885 chunksize: int | None = None,
2886 dtype: DtypeArg | None = None,
2887 method: Literal["multi"] | Callable | None = None,
2888 ) -> int | None:
2889 """
2890 Write records stored in a DataFrame to a SQL database.
2891
2892 Databases supported by SQLAlchemy [1]_ are supported. Tables can be
2893 newly created, appended to, or overwritten.
2894
2895 Parameters
2896 ----------
2897 name : str
2898 Name of SQL table.
2899 con : sqlalchemy.engine.(Engine or Connection) or sqlite3.Connection
2900 Using SQLAlchemy makes it possible to use any DB supported by that
2901 library. Legacy support is provided for sqlite3.Connection objects. The user
2902 is responsible for engine disposal and connection closure for the SQLAlchemy
2903 connectable. See `here \
2904 <https://docs.sqlalchemy.org/en/20/core/connections.html>`_.
2905 If passing a sqlalchemy.engine.Connection which is already in a transaction,
2906 the transaction will not be committed. If passing a sqlite3.Connection,
2907 it will not be possible to roll back the record insertion.
2908
2909 schema : str, optional
2910 Specify the schema (if database flavor supports this). If None, use
2911 default schema.
2912 if_exists : {'fail', 'replace', 'append'}, default 'fail'
2913 How to behave if the table already exists.
2914
2915 * fail: Raise a ValueError.
2916 * replace: Drop the table before inserting new values.
2917 * append: Insert new values to the existing table.
2918
2919 index : bool, default True
2920 Write DataFrame index as a column. Uses `index_label` as the column
2921 name in the table. Creates a table index for this column.
2922 index_label : str or sequence, default None
2923 Column label for index column(s). If None is given (default) and
2924 `index` is True, then the index names are used.
2925 A sequence should be given if the DataFrame uses MultiIndex.
2926 chunksize : int, optional
2927 Specify the number of rows in each batch to be written at a time.
2928 By default, all rows will be written at once.
2929 dtype : dict or scalar, optional
2930 Specifying the datatype for columns. If a dictionary is used, the
2931 keys should be the column names and the values should be the
2932 SQLAlchemy types or strings for the sqlite3 legacy mode. If a
2933 scalar is provided, it will be applied to all columns.
2934 method : {None, 'multi', callable}, optional
2935 Controls the SQL insertion clause used:
2936
2937 * None : Uses standard SQL ``INSERT`` clause (one per row).
2938 * 'multi': Pass multiple values in a single ``INSERT`` clause.
2939 * callable with signature ``(pd_table, conn, keys, data_iter)``.
2940
2941 Details and a sample callable implementation can be found in the
2942 section :ref:`insert method <io.sql.method>`.
2943
2944 Returns
2945 -------
2946 None or int
2947 Number of rows affected by to_sql. None is returned if the callable
2948 passed into ``method`` does not return an integer number of rows.
2949
            The number of returned rows affected is the sum of the ``rowcount``
            attribute of ``sqlite3.Cursor`` or the SQLAlchemy connectable, which
            may not reflect the exact number of written rows as stipulated in the
            `sqlite3 <https://docs.python.org/3/library/sqlite3.html#sqlite3.Cursor.rowcount>`__ or
            `SQLAlchemy <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.CursorResult.rowcount>`__
            documentation.
2955
2956 .. versionadded:: 1.4.0
2957
2958 Raises
2959 ------
2960 ValueError
2961 When the table already exists and `if_exists` is 'fail' (the
2962 default).
2963
2964 See Also
2965 --------
2966 read_sql : Read a DataFrame from a table.
2967
2968 Notes
2969 -----
2970 Timezone aware datetime columns will be written as
2971 ``Timestamp with timezone`` type with SQLAlchemy if supported by the
2972 database. Otherwise, the datetimes will be stored as timezone unaware
2973 timestamps local to the original timezone.
2974
2975 Not all datastores support ``method="multi"``. Oracle, for example,
2976 does not support multi-value insert.
2977
2978 References
2979 ----------
2980 .. [1] https://docs.sqlalchemy.org
2981 .. [2] https://www.python.org/dev/peps/pep-0249/
2982
2983 Examples
2984 --------
2985 Create an in-memory SQLite database.
2986
2987 >>> from sqlalchemy import create_engine
2988 >>> engine = create_engine('sqlite://', echo=False)
2989
2990 Create a table from scratch with 3 rows.
2991
2992 >>> df = pd.DataFrame({'name' : ['User 1', 'User 2', 'User 3']})
2993 >>> df
2994 name
2995 0 User 1
2996 1 User 2
2997 2 User 3
2998
2999 >>> df.to_sql(name='users', con=engine)
3000 3
3001 >>> from sqlalchemy import text
3002 >>> with engine.connect() as conn:
3003 ... conn.execute(text("SELECT * FROM users")).fetchall()
3004 [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')]
3005
3006 An `sqlalchemy.engine.Connection` can also be passed to `con`:
3007
3008 >>> with engine.begin() as connection:
3009 ... df1 = pd.DataFrame({'name' : ['User 4', 'User 5']})
3010 ... df1.to_sql(name='users', con=connection, if_exists='append')
3011 2
3012
3013 This is allowed to support operations that require that the same
3014 DBAPI connection is used for the entire operation.
3015
3016 >>> df2 = pd.DataFrame({'name' : ['User 6', 'User 7']})
3017 >>> df2.to_sql(name='users', con=engine, if_exists='append')
3018 2
3019 >>> with engine.connect() as conn:
3020 ... conn.execute(text("SELECT * FROM users")).fetchall()
3021 [(0, 'User 1'), (1, 'User 2'), (2, 'User 3'),
3022 (0, 'User 4'), (1, 'User 5'), (0, 'User 6'),
3023 (1, 'User 7')]
3024
3025 Overwrite the table with just ``df2``.
3026
3027 >>> df2.to_sql(name='users', con=engine, if_exists='replace',
3028 ... index_label='id')
3029 2
3030 >>> with engine.connect() as conn:
3031 ... conn.execute(text("SELECT * FROM users")).fetchall()
3032 [(0, 'User 6'), (1, 'User 7')]
3033
3034 Use ``method`` to define a callable insertion method to do nothing
3035 if there's a primary key conflict on a table in a PostgreSQL database.
3036
3037 >>> from sqlalchemy.dialects.postgresql import insert
3038 >>> def insert_on_conflict_nothing(table, conn, keys, data_iter):
3039 ... # "a" is the primary key in "conflict_table"
3040 ... data = [dict(zip(keys, row)) for row in data_iter]
3041 ... stmt = insert(table.table).values(data).on_conflict_do_nothing(index_elements=["a"])
3042 ... result = conn.execute(stmt)
3043 ... return result.rowcount
3044 >>> df_conflict.to_sql(name="conflict_table", con=conn, if_exists="append", method=insert_on_conflict_nothing) # doctest: +SKIP
3045 0
3046
3047 For MySQL, a callable to update columns ``b`` and ``c`` if there's a conflict
3048 on a primary key.
3049
3050 >>> from sqlalchemy.dialects.mysql import insert
3051 >>> def insert_on_conflict_update(table, conn, keys, data_iter):
3052 ... # update columns "b" and "c" on primary key conflict
3053 ... data = [dict(zip(keys, row)) for row in data_iter]
3054 ... stmt = (
3055 ... insert(table.table)
3056 ... .values(data)
3057 ... )
3058 ... stmt = stmt.on_duplicate_key_update(b=stmt.inserted.b, c=stmt.inserted.c)
3059 ... result = conn.execute(stmt)
3060 ... return result.rowcount
3061 >>> df_conflict.to_sql(name="conflict_table", con=conn, if_exists="append", method=insert_on_conflict_update) # doctest: +SKIP
3062 2
3063
3064 Specify the dtype (especially useful for integers with missing values).
3065 Notice that while pandas is forced to store the data as floating point,
3066 the database supports nullable integers. When fetching the data with
3067 Python, we get back integer scalars.
3068
3069 >>> df = pd.DataFrame({"A": [1, None, 2]})
3070 >>> df
3071 A
3072 0 1.0
3073 1 NaN
3074 2 2.0
3075
3076 >>> from sqlalchemy.types import Integer
3077 >>> df.to_sql(name='integers', con=engine, index=False,
3078 ... dtype={"A": Integer()})
3079 3
3080
3081 >>> with engine.connect() as conn:
3082 ... conn.execute(text("SELECT * FROM integers")).fetchall()
3083 [(1,), (None,), (2,)]
3084 """ # noqa: E501
3085 from pandas.io import sql
3086
3087 return sql.to_sql(
3088 self,
3089 name,
3090 con,
3091 schema=schema,
3092 if_exists=if_exists,
3093 index=index,
3094 index_label=index_label,
3095 chunksize=chunksize,
3096 dtype=dtype,
3097 method=method,
3098 )
3099
3100 @final
3101 @deprecate_nonkeyword_arguments(
3102 version="3.0", allowed_args=["self", "path"], name="to_pickle"
3103 )
3104 @doc(
3105 storage_options=_shared_docs["storage_options"],
3106 compression_options=_shared_docs["compression_options"] % "path",
3107 )
3108 def to_pickle(
3109 self,
3110 path: FilePath | WriteBuffer[bytes],
3111 compression: CompressionOptions = "infer",
3112 protocol: int = pickle.HIGHEST_PROTOCOL,
3113 storage_options: StorageOptions | None = None,
3114 ) -> None:
3115 """
3116 Pickle (serialize) object to file.
3117
3118 Parameters
3119 ----------
3120 path : str, path object, or file-like object
3121 String, path object (implementing ``os.PathLike[str]``), or file-like
3122 object implementing a binary ``write()`` function. File path where
3123 the pickled object will be stored.
3124 {compression_options}
3125 protocol : int
3126 Int which indicates which protocol should be used by the pickler,
3127 default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible
3128 values are 0, 1, 2, 3, 4, 5. A negative value for the protocol
3129 parameter is equivalent to setting its value to HIGHEST_PROTOCOL.
3130
3131 .. [1] https://docs.python.org/3/library/pickle.html.
3132
3133 {storage_options}
3134
3135 See Also
3136 --------
3137 read_pickle : Load pickled pandas object (or any object) from file.
3138 DataFrame.to_hdf : Write DataFrame to an HDF5 file.
3139 DataFrame.to_sql : Write DataFrame to a SQL database.
3140 DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
3141
3142 Examples
3143 --------
3144 >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) # doctest: +SKIP
3145 >>> original_df # doctest: +SKIP
3146 foo bar
3147 0 0 5
3148 1 1 6
3149 2 2 7
3150 3 3 8
3151 4 4 9
3152 >>> original_df.to_pickle("./dummy.pkl") # doctest: +SKIP
3153
3154 >>> unpickled_df = pd.read_pickle("./dummy.pkl") # doctest: +SKIP
3155 >>> unpickled_df # doctest: +SKIP
3156 foo bar
3157 0 0 5
3158 1 1 6
3159 2 2 7
3160 3 3 8
3161 4 4 9
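
        Compression is inferred from the file extension, e.g. a ``.gz``
        suffix selects gzip (paths are illustrative):

        >>> original_df.to_pickle("./dummy.pkl.gz") # doctest: +SKIP
        >>> pd.read_pickle("./dummy.pkl.gz") # doctest: +SKIP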
3162 """ # noqa: E501
3163 from pandas.io.pickle import to_pickle
3164
3165 to_pickle(
3166 self,
3167 path,
3168 compression=compression,
3169 protocol=protocol,
3170 storage_options=storage_options,
3171 )
3172
3173 @final
3174 @deprecate_nonkeyword_arguments(
3175 version="3.0", allowed_args=["self"], name="to_clipboard"
3176 )
3177 def to_clipboard(
3178 self, excel: bool_t = True, sep: str | None = None, **kwargs
3179 ) -> None:
3180 r"""
3181 Copy object to the system clipboard.
3182
3183 Write a text representation of object to the system clipboard.
3184 This can be pasted into Excel, for example.
3185
3186 Parameters
3187 ----------
3188 excel : bool, default True
3189 Produce output in a csv format for easy pasting into excel.
3190
3191 - True, use the provided separator for csv pasting.
3192 - False, write a string representation of the object to the clipboard.
3193
3194 sep : str, default ``'\t'``
3195 Field delimiter.
3196 **kwargs
3197 These parameters will be passed to DataFrame.to_csv.
3198
3199 See Also
3200 --------
3201 DataFrame.to_csv : Write a DataFrame to a comma-separated values
3202 (csv) file.
3203 read_clipboard : Read text from clipboard and pass to read_csv.
3204
3205 Notes
3206 -----
        Requirements for your platform:
3208
3209 - Linux : `xclip`, or `xsel` (with `PyQt4` modules)
3210 - Windows : none
3211 - macOS : none
3212
        This method uses the processes developed for the `pyperclip` package.
        A solution for rendering any output string format is given in the
        examples.
3215
3216 Examples
3217 --------
3218 Copy the contents of a DataFrame to the clipboard.
3219
3220 >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
3221
3222 >>> df.to_clipboard(sep=',') # doctest: +SKIP
3223 ... # Wrote the following to the system clipboard:
3224 ... # ,A,B,C
3225 ... # 0,1,2,3
3226 ... # 1,4,5,6
3227
3228 We can omit the index by passing the keyword `index` and setting
3229 it to false.
3230
3231 >>> df.to_clipboard(sep=',', index=False) # doctest: +SKIP
3232 ... # Wrote the following to the system clipboard:
3233 ... # A,B,C
3234 ... # 1,2,3
3235 ... # 4,5,6
3236
3237 Using the original `pyperclip` package for any string output format.
3238
3239 .. code-block:: python
3240
3241 import pyperclip
3242 html = df.style.to_html()
3243 pyperclip.copy(html)
3244 """
3245 from pandas.io import clipboards
3246
3247 clipboards.to_clipboard(self, excel=excel, sep=sep, **kwargs)
3248
3249 @final
3250 def to_xarray(self):
3251 """
3252 Return an xarray object from the pandas object.
3253
3254 Returns
3255 -------
3256 xarray.DataArray or xarray.Dataset
3257 Data in the pandas structure converted to Dataset if the object is
3258 a DataFrame, or a DataArray if the object is a Series.
3259
3260 See Also
3261 --------
3262 DataFrame.to_hdf : Write DataFrame to an HDF5 file.
3263 DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
3264
3265 Notes
3266 -----
        See the `xarray docs <https://xarray.pydata.org/en/stable/>`__ for
        more information.
3268
3269 Examples
3270 --------
3271 >>> df = pd.DataFrame([('falcon', 'bird', 389.0, 2),
3272 ... ('parrot', 'bird', 24.0, 2),
3273 ... ('lion', 'mammal', 80.5, 4),
3274 ... ('monkey', 'mammal', np.nan, 4)],
3275 ... columns=['name', 'class', 'max_speed',
3276 ... 'num_legs'])
3277 >>> df
3278 name class max_speed num_legs
3279 0 falcon bird 389.0 2
3280 1 parrot bird 24.0 2
3281 2 lion mammal 80.5 4
3282 3 monkey mammal NaN 4
3283
3284 >>> df.to_xarray() # doctest: +SKIP
3285 <xarray.Dataset>
3286 Dimensions: (index: 4)
3287 Coordinates:
3288 * index (index) int64 32B 0 1 2 3
3289 Data variables:
3290 name (index) object 32B 'falcon' 'parrot' 'lion' 'monkey'
3291 class (index) object 32B 'bird' 'bird' 'mammal' 'mammal'
3292 max_speed (index) float64 32B 389.0 24.0 80.5 nan
3293 num_legs (index) int64 32B 2 2 4 4
3294
3295 >>> df['max_speed'].to_xarray() # doctest: +SKIP
3296 <xarray.DataArray 'max_speed' (index: 4)>
3297 array([389. , 24. , 80.5, nan])
3298 Coordinates:
3299 * index (index) int64 0 1 2 3
3300
3301 >>> dates = pd.to_datetime(['2018-01-01', '2018-01-01',
3302 ... '2018-01-02', '2018-01-02'])
3303 >>> df_multiindex = pd.DataFrame({'date': dates,
3304 ... 'animal': ['falcon', 'parrot',
3305 ... 'falcon', 'parrot'],
3306 ... 'speed': [350, 18, 361, 15]})
3307 >>> df_multiindex = df_multiindex.set_index(['date', 'animal'])
3308
3309 >>> df_multiindex
3310 speed
3311 date animal
3312 2018-01-01 falcon 350
3313 parrot 18
3314 2018-01-02 falcon 361
3315 parrot 15
3316
3317 >>> df_multiindex.to_xarray() # doctest: +SKIP
3318 <xarray.Dataset>
3319 Dimensions: (date: 2, animal: 2)
3320 Coordinates:
3321 * date (date) datetime64[ns] 2018-01-01 2018-01-02
3322 * animal (animal) object 'falcon' 'parrot'
3323 Data variables:
3324 speed (date, animal) int64 350 18 361 15
3325 """
3326 xarray = import_optional_dependency("xarray")
3327
3328 if self.ndim == 1:
3329 return xarray.DataArray.from_series(self)
3330 else:
3331 return xarray.Dataset.from_dataframe(self)
3332
3333 @overload
3334 def to_latex(
3335 self,
3336 buf: None = ...,
3337 columns: Sequence[Hashable] | None = ...,
3338 header: bool_t | SequenceNotStr[str] = ...,
3339 index: bool_t = ...,
3340 na_rep: str = ...,
3341 formatters: FormattersType | None = ...,
3342 float_format: FloatFormatType | None = ...,
3343 sparsify: bool_t | None = ...,
3344 index_names: bool_t = ...,
3345 bold_rows: bool_t = ...,
3346 column_format: str | None = ...,
3347 longtable: bool_t | None = ...,
3348 escape: bool_t | None = ...,
3349 encoding: str | None = ...,
3350 decimal: str = ...,
3351 multicolumn: bool_t | None = ...,
3352 multicolumn_format: str | None = ...,
3353 multirow: bool_t | None = ...,
3354 caption: str | tuple[str, str] | None = ...,
3355 label: str | None = ...,
3356 position: str | None = ...,
3357 ) -> str:
3358 ...
3359
3360 @overload
3361 def to_latex(
3362 self,
3363 buf: FilePath | WriteBuffer[str],
3364 columns: Sequence[Hashable] | None = ...,
3365 header: bool_t | SequenceNotStr[str] = ...,
3366 index: bool_t = ...,
3367 na_rep: str = ...,
3368 formatters: FormattersType | None = ...,
3369 float_format: FloatFormatType | None = ...,
3370 sparsify: bool_t | None = ...,
3371 index_names: bool_t = ...,
3372 bold_rows: bool_t = ...,
3373 column_format: str | None = ...,
3374 longtable: bool_t | None = ...,
3375 escape: bool_t | None = ...,
3376 encoding: str | None = ...,
3377 decimal: str = ...,
3378 multicolumn: bool_t | None = ...,
3379 multicolumn_format: str | None = ...,
3380 multirow: bool_t | None = ...,
3381 caption: str | tuple[str, str] | None = ...,
3382 label: str | None = ...,
3383 position: str | None = ...,
3384 ) -> None:
3385 ...
3386
3387 @final
3388 @deprecate_nonkeyword_arguments(
3389 version="3.0", allowed_args=["self", "buf"], name="to_latex"
3390 )
3391 def to_latex(
3392 self,
3393 buf: FilePath | WriteBuffer[str] | None = None,
3394 columns: Sequence[Hashable] | None = None,
3395 header: bool_t | SequenceNotStr[str] = True,
3396 index: bool_t = True,
3397 na_rep: str = "NaN",
3398 formatters: FormattersType | None = None,
3399 float_format: FloatFormatType | None = None,
3400 sparsify: bool_t | None = None,
3401 index_names: bool_t = True,
3402 bold_rows: bool_t = False,
3403 column_format: str | None = None,
3404 longtable: bool_t | None = None,
3405 escape: bool_t | None = None,
3406 encoding: str | None = None,
3407 decimal: str = ".",
3408 multicolumn: bool_t | None = None,
3409 multicolumn_format: str | None = None,
3410 multirow: bool_t | None = None,
3411 caption: str | tuple[str, str] | None = None,
3412 label: str | None = None,
3413 position: str | None = None,
3414 ) -> str | None:
3415 r"""
3416 Render object to a LaTeX tabular, longtable, or nested table.
3417
3418 Requires ``\usepackage{{booktabs}}``. The output can be copy/pasted
3419 into a main LaTeX document or read from an external file
3420 with ``\input{{table.tex}}``.
3421
3422 .. versionchanged:: 2.0.0
3423 Refactored to use the Styler implementation via jinja2 templating.
3424
3425 Parameters
3426 ----------
3427 buf : str, Path or StringIO-like, optional, default None
3428 Buffer to write to. If None, the output is returned as a string.
3429 columns : list of label, optional
3430 The subset of columns to write. Writes all columns by default.
3431 header : bool or list of str, default True
3432 Write out the column names. If a list of strings is given,
3433 it is assumed to be aliases for the column names.
3434 index : bool, default True
3435 Write row names (index).
3436 na_rep : str, default 'NaN'
3437 Missing data representation.
3438 formatters : list of functions or dict of {{str: function}}, optional
3439 Formatter functions to apply to columns' elements by position or
3440 name. The result of each function must be a unicode string.
3441 List must be of length equal to the number of columns.
3442 float_format : one-parameter function or str, optional, default None
3443 Formatter for floating point numbers. For example
3444 ``float_format="%.2f"`` and ``float_format="{{:0.2f}}".format`` will
3445 both result in 0.1234 being formatted as 0.12.
3446 sparsify : bool, optional
3447 Set to False for a DataFrame with a hierarchical index to print
3448 every multiindex key at each row. By default, the value will be
3449 read from the config module.
3450 index_names : bool, default True
3451 Prints the names of the indexes.
3452 bold_rows : bool, default False
3453 Make the row labels bold in the output.
3454 column_format : str, optional
3455 The columns format as specified in `LaTeX table format
3456 <https://en.wikibooks.org/wiki/LaTeX/Tables>`__ e.g. 'rcl' for 3
3457 columns. By default, 'l' will be used for all columns except
3458 columns of numbers, which default to 'r'.
3459 longtable : bool, optional
3460 Use a longtable environment instead of tabular. Requires
3461 adding a \usepackage{{longtable}} to your LaTeX preamble.
3462 By default, the value will be read from the pandas config
3463 module, and set to `True` if the option ``styler.latex.environment`` is
3464 `"longtable"`.
3465
3466 .. versionchanged:: 2.0.0
3467 The pandas option affecting this argument has changed.
        escape : bool, optional
            By default, the value will be read from the pandas config
            module and set to `True` if the option ``styler.format.escape`` is
            `"latex"`. When set to False, LaTeX special characters in the
            column names are not escaped.
3473
3474 .. versionchanged:: 2.0.0
3475 The pandas option affecting this argument has changed, as has the
3476 default value to `False`.
3477 encoding : str, optional
3478 A string representing the encoding to use in the output file,
3479 defaults to 'utf-8'.
3480 decimal : str, default '.'
3481 Character recognized as decimal separator, e.g. ',' in Europe.
3482 multicolumn : bool, default True
3483 Use \multicolumn to enhance MultiIndex columns.
3484 The default will be read from the config module, and is set
3485 as the option ``styler.sparse.columns``.
3486
3487 .. versionchanged:: 2.0.0
3488 The pandas option affecting this argument has changed.
3489 multicolumn_format : str, default 'r'
            The alignment for multicolumns, similar to `column_format`.
3491 The default will be read from the config module, and is set as the option
3492 ``styler.latex.multicol_align``.
3493
3494 .. versionchanged:: 2.0.0
3495 The pandas option affecting this argument has changed, as has the
3496 default value to "r".
3497 multirow : bool, default True
3498 Use \multirow to enhance MultiIndex rows. Requires adding a
3499 \usepackage{{multirow}} to your LaTeX preamble. Will print
3500 centered labels (instead of top-aligned) across the contained
3501 rows, separating groups via clines. The default will be read
3502 from the pandas config module, and is set as the option
3503 ``styler.sparse.index``.
3504
3505 .. versionchanged:: 2.0.0
3506 The pandas option affecting this argument has changed, as has the
3507 default value to `True`.
3508 caption : str or tuple, optional
3509 Tuple (full_caption, short_caption),
3510 which results in ``\caption[short_caption]{{full_caption}}``;
3511 if a single string is passed, no short caption will be set.
3512 label : str, optional
3513 The LaTeX label to be placed inside ``\label{{}}`` in the output.
3514 This is used with ``\ref{{}}`` in the main ``.tex`` file.
3515
3516 position : str, optional
3517 The LaTeX positional argument for tables, to be placed after
3518 ``\begin{{}}`` in the output.
3519
3520 Returns
3521 -------
3522 str or None
3523 If buf is None, returns the result as a string. Otherwise returns None.
3524
3525 See Also
3526 --------
3527 io.formats.style.Styler.to_latex : Render a DataFrame to LaTeX
3528 with conditional formatting.
3529 DataFrame.to_string : Render a DataFrame to a console-friendly
3530 tabular output.
3531 DataFrame.to_html : Render a DataFrame as an HTML table.
3532
3533 Notes
3534 -----
3535 As of v2.0.0 this method has changed to use the Styler implementation as
3536 part of :meth:`.Styler.to_latex` via ``jinja2`` templating. This means
3537 that ``jinja2`` is a requirement, and needs to be installed, for this method
3538 to function. It is advised that users switch to using Styler, since that
3539 implementation is more frequently updated and contains much more
3540 flexibility with the output.
3541
3542 Examples
3543 --------
3544 Convert a general DataFrame to LaTeX with formatting:
3545
3546 >>> df = pd.DataFrame(dict(name=['Raphael', 'Donatello'],
3547 ... age=[26, 45],
3548 ... height=[181.23, 177.65]))
3549 >>> print(df.to_latex(index=False,
3550 ... formatters={"name": str.upper},
3551 ... float_format="{:.1f}".format,
3552 ... )) # doctest: +SKIP
3553 \begin{tabular}{lrr}
3554 \toprule
3555 name & age & height \\
3556 \midrule
3557 RAPHAEL & 26 & 181.2 \\
3558 DONATELLO & 45 & 177.7 \\
3559 \bottomrule
3560 \end{tabular}
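
        A caption and label can be attached to the table (output omitted;
        the values are illustrative):

        >>> print(df.to_latex(caption="Turtle data",
        ...                   label="tab:turtles")) # doctest: +SKIP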
3561 """
3562 # Get defaults from the pandas config
3563 if self.ndim == 1:
3564 self = self.to_frame()
3565 if longtable is None:
3566 longtable = config.get_option("styler.latex.environment") == "longtable"
3567 if escape is None:
3568 escape = config.get_option("styler.format.escape") == "latex"
3569 if multicolumn is None:
3570 multicolumn = config.get_option("styler.sparse.columns")
3571 if multicolumn_format is None:
3572 multicolumn_format = config.get_option("styler.latex.multicol_align")
3573 if multirow is None:
3574 multirow = config.get_option("styler.sparse.index")
3575
3576 if column_format is not None and not isinstance(column_format, str):
            raise ValueError("`column_format` must be a string")
3578 length = len(self.columns) if columns is None else len(columns)
3579 if isinstance(header, (list, tuple)) and len(header) != length:
3580 raise ValueError(f"Writing {length} cols but got {len(header)} aliases")
3581
3582 # Refactor formatters/float_format/decimal/na_rep/escape to Styler structure
3583 base_format_ = {
3584 "na_rep": na_rep,
3585 "escape": "latex" if escape else None,
3586 "decimal": decimal,
3587 }
3588 index_format_: dict[str, Any] = {"axis": 0, **base_format_}
3589 column_format_: dict[str, Any] = {"axis": 1, **base_format_}
3590
3591 if isinstance(float_format, str):
3592 float_format_: Callable | None = lambda x: float_format % x
3593 else:
3594 float_format_ = float_format
3595
        def _wrap(x, alt_format_):
            # format floats/complex with ``float_format_`` when it is given,
            # deferring to the column's own formatter for everything else
            if isinstance(x, (float, complex)) and float_format_ is not None:
                return float_format_(x)
            else:
                return alt_format_(x)
3601
3602 formatters_: list | tuple | dict | Callable | None = None
3603 if isinstance(formatters, list):
3604 formatters_ = {
3605 c: partial(_wrap, alt_format_=formatters[i])
3606 for i, c in enumerate(self.columns)
3607 }
        elif isinstance(formatters, dict):
            # copy so that popping the special keys (and the float-format
            # updates below) does not mutate the caller's dict
            formatters = formatters.copy()
            index_formatter = formatters.pop("__index__", None)
            column_formatter = formatters.pop("__columns__", None)
            if index_formatter is not None:
                index_format_.update({"formatter": index_formatter})
            if column_formatter is not None:
                column_format_.update({"formatter": column_formatter})

            formatters_ = formatters
            float_columns = self.select_dtypes(include="float").columns
            for col in float_columns:
                if col not in formatters:
                    formatters_.update({col: float_format_})
3621 elif formatters is None and float_format is not None:
3622 formatters_ = partial(_wrap, alt_format_=lambda v: v)
3623 format_index_ = [index_format_, column_format_]
3624
3625 # Deal with hiding indexes and relabelling column names
3626 hide_: list[dict] = []
3627 relabel_index_: list[dict] = []
3628 if columns:
3629 hide_.append(
3630 {
3631 "subset": [c for c in self.columns if c not in columns],
3632 "axis": "columns",
3633 }
3634 )
3635 if header is False:
3636 hide_.append({"axis": "columns"})
3637 elif isinstance(header, (list, tuple)):
3638 relabel_index_.append({"labels": header, "axis": "columns"})
            format_index_ = [index_format_]  # column formatting is superseded
3640
3641 if index is False:
3642 hide_.append({"axis": "index"})
3643 if index_names is False:
3644 hide_.append({"names": True, "axis": "index"})
3645
3646 render_kwargs_ = {
3647 "hrules": True,
3648 "sparse_index": sparsify,
3649 "sparse_columns": sparsify,
3650 "environment": "longtable" if longtable else None,
3651 "multicol_align": multicolumn_format
3652 if multicolumn
3653 else f"naive-{multicolumn_format}",
3654 "multirow_align": "t" if multirow else "naive",
3655 "encoding": encoding,
3656 "caption": caption,
3657 "label": label,
3658 "position": position,
3659 "column_format": column_format,
3660 "clines": "skip-last;data"
3661 if (multirow and isinstance(self.index, MultiIndex))
3662 else None,
3663 "bold_rows": bold_rows,
3664 }
3665
3666 return self._to_latex_via_styler(
3667 buf,
3668 hide=hide_,
3669 relabel_index=relabel_index_,
3670 format={"formatter": formatters_, **base_format_},
3671 format_index=format_index_,
3672 render_kwargs=render_kwargs_,
3673 )
3674
3675 @final
3676 def _to_latex_via_styler(
3677 self,
3678 buf=None,
3679 *,
3680 hide: dict | list[dict] | None = None,
3681 relabel_index: dict | list[dict] | None = None,
3682 format: dict | list[dict] | None = None,
3683 format_index: dict | list[dict] | None = None,
3684 render_kwargs: dict | None = None,
3685 ):
3686 """
3687 Render object to a LaTeX tabular, longtable, or nested table.
3688
3689 Uses the ``Styler`` implementation with the following, ordered, method chaining:
3690
3691 .. code-block:: python
3692 styler = Styler(DataFrame)
3693 styler.hide(**hide)
3694 styler.relabel_index(**relabel_index)
3695 styler.format(**format)
3696 styler.format_index(**format_index)
3697 styler.to_latex(buf=buf, **render_kwargs)
3698
3699 Parameters
3700 ----------
3701 buf : str, Path or StringIO-like, optional, default None
3702 Buffer to write to. If None, the output is returned as a string.
3703 hide : dict, list of dict
3704 Keyword args to pass to the method call of ``Styler.hide``. If a list will
3705 call the method numerous times.
3706 relabel_index : dict, list of dict
3707 Keyword args to pass to the method of ``Styler.relabel_index``. If a list
3708 will call the method numerous times.
3709 format : dict, list of dict
3710 Keyword args to pass to the method call of ``Styler.format``. If a list will
3711 call the method numerous times.
3712 format_index : dict, list of dict
3713 Keyword args to pass to the method call of ``Styler.format_index``. If a
3714 list will call the method numerous times.
3715 render_kwargs : dict
3716 Keyword args to pass to the method call of ``Styler.to_latex``.
3717
3718 Returns
3719 -------
3720 str or None
3721 If buf is None, returns the result as a string. Otherwise returns None.
3722 """
3723 from pandas.io.formats.style import Styler
3724
3725 self = cast("DataFrame", self)
3726 styler = Styler(self, uuid="")
3727
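        # Apply each Styler method either once (a single dict of kwargs) or
        # several times (a list of dicts), preserving the documented order.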
3728 for kw_name in ["hide", "relabel_index", "format", "format_index"]:
3729 kw = vars()[kw_name]
3730 if isinstance(kw, dict):
3731 getattr(styler, kw_name)(**kw)
3732 elif isinstance(kw, list):
3733 for sub_kw in kw:
3734 getattr(styler, kw_name)(**sub_kw)
3735
3736 # bold_rows is not a direct kwarg of Styler.to_latex
3737 render_kwargs = {} if render_kwargs is None else render_kwargs
        if render_kwargs.pop("bold_rows", False):
3739 styler.map_index(lambda v: "textbf:--rwrap;")
3740
3741 return styler.to_latex(buf=buf, **render_kwargs)
3742
3743 @overload
3744 def to_csv(
3745 self,
3746 path_or_buf: None = ...,
3747 sep: str = ...,
3748 na_rep: str = ...,
3749 float_format: str | Callable | None = ...,
3750 columns: Sequence[Hashable] | None = ...,
3751 header: bool_t | list[str] = ...,
3752 index: bool_t = ...,
3753 index_label: IndexLabel | None = ...,
3754 mode: str = ...,
3755 encoding: str | None = ...,
3756 compression: CompressionOptions = ...,
3757 quoting: int | None = ...,
3758 quotechar: str = ...,
3759 lineterminator: str | None = ...,
3760 chunksize: int | None = ...,
3761 date_format: str | None = ...,
3762 doublequote: bool_t = ...,
3763 escapechar: str | None = ...,
3764 decimal: str = ...,
3765 errors: OpenFileErrors = ...,
3766 storage_options: StorageOptions = ...,
3767 ) -> str:
3768 ...
3769
3770 @overload
3771 def to_csv(
3772 self,
3773 path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str],
3774 sep: str = ...,
3775 na_rep: str = ...,
3776 float_format: str | Callable | None = ...,
3777 columns: Sequence[Hashable] | None = ...,
3778 header: bool_t | list[str] = ...,
3779 index: bool_t = ...,
3780 index_label: IndexLabel | None = ...,
3781 mode: str = ...,
3782 encoding: str | None = ...,
3783 compression: CompressionOptions = ...,
3784 quoting: int | None = ...,
3785 quotechar: str = ...,
3786 lineterminator: str | None = ...,
3787 chunksize: int | None = ...,
3788 date_format: str | None = ...,
3789 doublequote: bool_t = ...,
3790 escapechar: str | None = ...,
3791 decimal: str = ...,
3792 errors: OpenFileErrors = ...,
3793 storage_options: StorageOptions = ...,
3794 ) -> None:
3795 ...
3796
3797 @final
3798 @deprecate_nonkeyword_arguments(
3799 version="3.0", allowed_args=["self", "path_or_buf"], name="to_csv"
3800 )
3801 @doc(
3802 storage_options=_shared_docs["storage_options"],
3803 compression_options=_shared_docs["compression_options"] % "path_or_buf",
3804 )
3805 def to_csv(
3806 self,
3807 path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
3808 sep: str = ",",
3809 na_rep: str = "",
3810 float_format: str | Callable | None = None,
3811 columns: Sequence[Hashable] | None = None,
3812 header: bool_t | list[str] = True,
3813 index: bool_t = True,
3814 index_label: IndexLabel | None = None,
3815 mode: str = "w",
3816 encoding: str | None = None,
3817 compression: CompressionOptions = "infer",
3818 quoting: int | None = None,
3819 quotechar: str = '"',
3820 lineterminator: str | None = None,
3821 chunksize: int | None = None,
3822 date_format: str | None = None,
3823 doublequote: bool_t = True,
3824 escapechar: str | None = None,
3825 decimal: str = ".",
3826 errors: OpenFileErrors = "strict",
3827 storage_options: StorageOptions | None = None,
3828 ) -> str | None:
3829 r"""
3830 Write object to a comma-separated values (csv) file.
3831
3832 Parameters
3833 ----------
3834 path_or_buf : str, path object, file-like object, or None, default None
3835 String, path object (implementing os.PathLike[str]), or file-like
3836 object implementing a write() function. If None, the result is
3837 returned as a string. If a non-binary file object is passed, it should
3838 be opened with `newline=''`, disabling universal newlines. If a binary
3839 file object is passed, `mode` might need to contain a `'b'`.
3840 sep : str, default ','
3841 String of length 1. Field delimiter for the output file.
3842 na_rep : str, default ''
3843 Missing data representation.
3844 float_format : str, Callable, default None
3845 Format string for floating point numbers. If a Callable is given, it takes
3846 precedence over other numeric formatting parameters, like decimal.
3847 columns : sequence, optional
3848 Columns to write.
3849 header : bool or list of str, default True
3850 Write out the column names. If a list of strings is given it is
3851 assumed to be aliases for the column names.
3852 index : bool, default True
3853 Write row names (index).
3854 index_label : str or sequence, or False, default None
3855 Column label for index column(s) if desired. If None is given, and
3856 `header` and `index` are True, then the index names are used. A
3857 sequence should be given if the object uses MultiIndex. If
3858 False do not print fields for index names. Use index_label=False
3859 for easier importing in R.
3860 mode : {{'w', 'x', 'a'}}, default 'w'
3861 Forwarded to either `open(mode=)` or `fsspec.open(mode=)` to control
3862 the file opening. Typical values include:
3863
3864 - 'w', truncate the file first.
3865 - 'x', exclusive creation, failing if the file already exists.
3866 - 'a', append to the end of file if it exists.
3867
3868 encoding : str, optional
3869 A string representing the encoding to use in the output file,
3870 defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
3871 is a non-binary file object.
3872 {compression_options}
3873
3874 May be a dict with key 'method' as compression mode
3875 and other entries as additional compression options if
3876 compression mode is 'zip'.
3877
3878 Passing compression options as keys in dict is
3879 supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'.
3880 quoting : optional constant from csv module
3881 Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
3882 then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
3883 will treat them as non-numeric.
3884 quotechar : str, default '\"'
3885 String of length 1. Character used to quote fields.
3886 lineterminator : str, optional
3887 The newline character or character sequence to use in the output
3888 file. Defaults to `os.linesep`, which depends on the OS in which
            this method is called (e.g. '\\n' for Linux, '\\r\\n' for Windows).
3890
3891 .. versionchanged:: 1.5.0
3892
3893 Previously was line_terminator, changed for consistency with
3894 read_csv and the standard library 'csv' module.
3895
3896 chunksize : int or None
3897 Rows to write at a time.
3898 date_format : str, default None
3899 Format string for datetime objects.
3900 doublequote : bool, default True
3901 Control quoting of `quotechar` inside a field.
3902 escapechar : str, default None
3903 String of length 1. Character used to escape `sep` and `quotechar`
3904 when appropriate.
3905 decimal : str, default '.'
3906 Character recognized as decimal separator. E.g. use ',' for
3907 European data.
3908 errors : str, default 'strict'
3909 Specifies how encoding and decoding errors are to be handled.
3910 See the errors argument for :func:`open` for a full list
3911 of options.
3912
3913 {storage_options}
3914
3915 Returns
3916 -------
3917 None or str
3918 If path_or_buf is None, returns the resulting csv format as a
3919 string. Otherwise returns None.
3920
3921 See Also
3922 --------
3923 read_csv : Load a CSV file into a DataFrame.
3924 to_excel : Write DataFrame to an Excel file.
3925
3926 Examples
3927 --------
3928 Create 'out.csv' containing 'df' without indices
3929
3930 >>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'],
3931 ... 'mask': ['red', 'purple'],
3932 ... 'weapon': ['sai', 'bo staff']}})
3933 >>> df.to_csv('out.csv', index=False) # doctest: +SKIP
3934
        If `path_or_buf` is None, the CSV output is returned as a string

        >>> df.to_csv(index=False)
        'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'

        Create 'out.zip' containing 'out.csv'

3939 >>> compression_opts = dict(method='zip',
3940 ... archive_name='out.csv') # doctest: +SKIP
3941 >>> df.to_csv('out.zip', index=False,
3942 ... compression=compression_opts) # doctest: +SKIP
3943
3944 To write a csv file to a new folder or nested folder you will first
3945 need to create it using either Pathlib or os:
3946
3947 >>> from pathlib import Path # doctest: +SKIP
3948 >>> filepath = Path('folder/subfolder/out.csv') # doctest: +SKIP
3949 >>> filepath.parent.mkdir(parents=True, exist_ok=True) # doctest: +SKIP
3950 >>> df.to_csv(filepath) # doctest: +SKIP
3951
3952 >>> import os # doctest: +SKIP
3953 >>> os.makedirs('folder/subfolder', exist_ok=True) # doctest: +SKIP
3954 >>> df.to_csv('folder/subfolder/out.csv') # doctest: +SKIP
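
        Using a callable ``float_format`` (illustrative; a callable takes
        precedence over ``decimal``):

        >>> df_float = pd.DataFrame({{'price': [1.0, 2.345]}})
        >>> df_float.to_csv(float_format='{{:.2f}}'.format)  # doctest: +SKIP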
3955 """
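        # A Series is routed through a one-column DataFrame so that the CSV
        # formatting machinery only needs to handle the two-dimensional case.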
3956 df = self if isinstance(self, ABCDataFrame) else self.to_frame()
3957
3958 formatter = DataFrameFormatter(
3959 frame=df,
3960 header=header,
3961 index=index,
3962 na_rep=na_rep,
3963 float_format=float_format,
3964 decimal=decimal,
3965 )
3966
3967 return DataFrameRenderer(formatter).to_csv(
3968 path_or_buf,
3969 lineterminator=lineterminator,
3970 sep=sep,
3971 encoding=encoding,
3972 errors=errors,
3973 compression=compression,
3974 quoting=quoting,
3975 columns=columns,
3976 index_label=index_label,
3977 mode=mode,
3978 chunksize=chunksize,
3979 quotechar=quotechar,
3980 date_format=date_format,
3981 doublequote=doublequote,
3982 escapechar=escapechar,
3983 storage_options=storage_options,
3984 )
3985
3986 # ----------------------------------------------------------------------
3987 # Lookup Caching
3988
3989 def _reset_cacher(self) -> None:
3990 """
3991 Reset the cacher.
3992 """
3993 raise AbstractMethodError(self)
3994
3995 def _maybe_update_cacher(
3996 self,
3997 clear: bool_t = False,
3998 verify_is_copy: bool_t = True,
3999 inplace: bool_t = False,
4000 ) -> None:
        """
        See if we need to update our parent cacher; if ``clear``, also clear
        our item cache.
4004
4005 Parameters
4006 ----------
4007 clear : bool, default False
4008 Clear the item cache.
4009 verify_is_copy : bool, default True
4010 Provide is_copy checks.
4011 """
4012 if using_copy_on_write():
4013 return
4014
4015 if verify_is_copy:
4016 self._check_setitem_copy(t="referent")
4017
4018 if clear:
4019 self._clear_item_cache()
4020
4021 def _clear_item_cache(self) -> None:
4022 raise AbstractMethodError(self)
4023
4024 # ----------------------------------------------------------------------
4025 # Indexing Methods
4026
4027 @final
4028 def take(self, indices, axis: Axis = 0, **kwargs) -> Self:
4029 """
4030 Return the elements in the given *positional* indices along an axis.
4031
4032 This means that we are not indexing according to actual values in
4033 the index attribute of the object. We are indexing according to the
4034 actual position of the element in the object.
4035
4036 Parameters
4037 ----------
4038 indices : array-like
4039 An array of ints indicating which positions to take.
4040 axis : {0 or 'index', 1 or 'columns', None}, default 0
4041 The axis on which to select elements. ``0`` means that we are
4042 selecting rows, ``1`` means that we are selecting columns.
4043 For `Series` this parameter is unused and defaults to 0.
4044 **kwargs
4045 For compatibility with :meth:`numpy.take`. Has no effect on the
4046 output.
4047
4048 Returns
4049 -------
4050 same type as caller
4051 An array-like containing the elements taken from the object.
4052
4053 See Also
4054 --------
4055 DataFrame.loc : Select a subset of a DataFrame by labels.
4056 DataFrame.iloc : Select a subset of a DataFrame by positions.
4057 numpy.take : Take elements from an array along an axis.
4058
4059 Examples
4060 --------
4061 >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
4062 ... ('parrot', 'bird', 24.0),
4063 ... ('lion', 'mammal', 80.5),
4064 ... ('monkey', 'mammal', np.nan)],
4065 ... columns=['name', 'class', 'max_speed'],
4066 ... index=[0, 2, 3, 1])
4067 >>> df
4068 name class max_speed
4069 0 falcon bird 389.0
4070 2 parrot bird 24.0
4071 3 lion mammal 80.5
4072 1 monkey mammal NaN
4073
4074 Take elements at positions 0 and 3 along the axis 0 (default).
4075
4076 Note how the actual indices selected (0 and 1) do not correspond to
4077 our selected indices 0 and 3. That's because we are selecting the 0th
4078 and 3rd rows, not rows whose indices equal 0 and 3.
4079
4080 >>> df.take([0, 3])
4081 name class max_speed
4082 0 falcon bird 389.0
4083 1 monkey mammal NaN
4084
4085 Take elements at indices 1 and 2 along the axis 1 (column selection).
4086
4087 >>> df.take([1, 2], axis=1)
4088 class max_speed
4089 0 bird 389.0
4090 2 bird 24.0
4091 3 mammal 80.5
4092 1 mammal NaN
4093
        We may take elements using negative integers, which count from the end
        of the object, just like with Python lists.
4096
4097 >>> df.take([-1, -2])
4098 name class max_speed
4099 1 monkey mammal NaN
4100 3 lion mammal 80.5
4101 """
4102
4103 nv.validate_take((), kwargs)
4104
4105 if not isinstance(indices, slice):
4106 indices = np.asarray(indices, dtype=np.intp)
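            # Under Copy-on-Write, taking rows with a no-op indexer
            # (0, 1, ..., len(self) - 1) can return a lazy copy instead of
            # materializing a new take.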
4107 if (
4108 axis == 0
4109 and indices.ndim == 1
4110 and using_copy_on_write()
4111 and is_range_indexer(indices, len(self))
4112 ):
4113 return self.copy(deep=None)
4114 elif self.ndim == 1:
4115 raise TypeError(
4116 f"{type(self).__name__}.take requires a sequence of integers, "
4117 "not slice."
4118 )
4119 else:
4120 warnings.warn(
4121 # GH#51539
4122 f"Passing a slice to {type(self).__name__}.take is deprecated "
4123 "and will raise in a future version. Use `obj[slicer]` or pass "
4124 "a sequence of integers instead.",
4125 FutureWarning,
4126 stacklevel=find_stack_level(),
4127 )
4128 # We can get here with a slice via DataFrame.__getitem__
4129 indices = np.arange(
4130 indices.start, indices.stop, indices.step, dtype=np.intp
4131 )
4132
4133 new_data = self._mgr.take(
4134 indices,
4135 axis=self._get_block_manager_axis(axis),
4136 verify=True,
4137 )
4138 return self._constructor_from_mgr(new_data, axes=new_data.axes).__finalize__(
4139 self, method="take"
4140 )
4141
4142 @final
4143 def _take_with_is_copy(self, indices, axis: Axis = 0) -> Self:
4144 """
4145 Internal version of the `take` method that sets the `_is_copy`
        attribute to keep track of the parent dataframe (used in indexing
        for the SettingWithCopyWarning).
4148
4149 For Series this does the same as the public take (it never sets `_is_copy`).
4150
4151 See the docstring of `take` for full explanation of the parameters.
4152 """
4153 result = self.take(indices=indices, axis=axis)
4154 # Maybe set copy if we didn't actually change the index.
4155 if self.ndim == 2 and not result._get_axis(axis).equals(self._get_axis(axis)):
4156 result._set_is_copy(self)
4157 return result
4158
4159 @final
4160 def xs(
4161 self,
4162 key: IndexLabel,
4163 axis: Axis = 0,
4164 level: IndexLabel | None = None,
4165 drop_level: bool_t = True,
4166 ) -> Self:
4167 """
4168 Return cross-section from the Series/DataFrame.
4169
4170 This method takes a `key` argument to select data at a particular
4171 level of a MultiIndex.
4172
4173 Parameters
4174 ----------
4175 key : label or tuple of label
4176 Label contained in the index, or partially in a MultiIndex.
4177 axis : {0 or 'index', 1 or 'columns'}, default 0
4178 Axis to retrieve cross-section on.
4179 level : object, defaults to first n levels (n=1 or len(key))
4180 In case of a key partially contained in a MultiIndex, indicate
4181 which levels are used. Levels can be referred by label or position.
4182 drop_level : bool, default True
4183 If False, returns object with same levels as self.
4184
4185 Returns
4186 -------
4187 Series or DataFrame
4188 Cross-section from the original Series or DataFrame
4189 corresponding to the selected index levels.
4190
4191 See Also
4192 --------
4193 DataFrame.loc : Access a group of rows and columns
4194 by label(s) or a boolean array.
4195 DataFrame.iloc : Purely integer-location based indexing
4196 for selection by position.
4197
4198 Notes
4199 -----
        `xs` cannot be used to set values.

        MultiIndex slicers are a generic way to get/set values on
        any level or levels.
        They are a superset of `xs` functionality, see
        :ref:`MultiIndex Slicers <advanced.mi_slicers>`.
4206
4207 Examples
4208 --------
4209 >>> d = {'num_legs': [4, 4, 2, 2],
4210 ... 'num_wings': [0, 0, 2, 2],
4211 ... 'class': ['mammal', 'mammal', 'mammal', 'bird'],
4212 ... 'animal': ['cat', 'dog', 'bat', 'penguin'],
4213 ... 'locomotion': ['walks', 'walks', 'flies', 'walks']}
4214 >>> df = pd.DataFrame(data=d)
4215 >>> df = df.set_index(['class', 'animal', 'locomotion'])
4216 >>> df
4217 num_legs num_wings
4218 class animal locomotion
4219 mammal cat walks 4 0
4220 dog walks 4 0
4221 bat flies 2 2
4222 bird penguin walks 2 2
4223
4224 Get values at specified index
4225
4226 >>> df.xs('mammal')
4227 num_legs num_wings
4228 animal locomotion
4229 cat walks 4 0
4230 dog walks 4 0
4231 bat flies 2 2
4232
4233 Get values at several indexes
4234
4235 >>> df.xs(('mammal', 'dog', 'walks'))
4236 num_legs 4
4237 num_wings 0
4238 Name: (mammal, dog, walks), dtype: int64
4239
4240 Get values at specified index and level
4241
4242 >>> df.xs('cat', level=1)
4243 num_legs num_wings
4244 class locomotion
4245 mammal walks 4 0
4246
4247 Get values at several indexes and levels
4248
4249 >>> df.xs(('bird', 'walks'),
4250 ... level=[0, 'locomotion'])
4251 num_legs num_wings
4252 animal
4253 penguin 2 2
4254
4255 Get values at specified column and axis
4256
4257 >>> df.xs('num_wings', axis=1)
4258 class animal locomotion
4259 mammal cat walks 0
4260 dog walks 0
4261 bat flies 2
4262 bird penguin walks 2
4263 Name: num_wings, dtype: int64
4264 """
4265 axis = self._get_axis_number(axis)
4266 labels = self._get_axis(axis)
4267
4268 if isinstance(key, list):
4269 raise TypeError("list keys are not supported in xs, pass a tuple instead")
4270
4271 if level is not None:
4272 if not isinstance(labels, MultiIndex):
4273 raise TypeError("Index must be a MultiIndex")
4274 loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level)
4275
4276 # create the tuple of the indexer
4277 _indexer = [slice(None)] * self.ndim
4278 _indexer[axis] = loc
4279 indexer = tuple(_indexer)
4280
4281 result = self.iloc[indexer]
4282 setattr(result, result._get_axis_name(axis), new_ax)
4283 return result
4284
4285 if axis == 1:
4286 if drop_level:
4287 return self[key]
4288 index = self.columns
4289 else:
4290 index = self.index
4291
4292 if isinstance(index, MultiIndex):
4293 loc, new_index = index._get_loc_level(key, level=0)
4294 if not drop_level:
4295 if lib.is_integer(loc):
4296 # Slice index must be an integer or None
4297 new_index = index[loc : loc + 1]
4298 else:
4299 new_index = index[loc]
4300 else:
4301 loc = index.get_loc(key)
4302
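            # Index.get_loc may return an integer (unique label), a slice
            # (monotonic duplicates), or an ndarray mask (non-monotonic
            # duplicates); the ndarray cases go through take below.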
4303 if isinstance(loc, np.ndarray):
4304 if loc.dtype == np.bool_:
4305 (inds,) = loc.nonzero()
4306 return self._take_with_is_copy(inds, axis=axis)
4307 else:
4308 return self._take_with_is_copy(loc, axis=axis)
4309
4310 if not is_scalar(loc):
4311 new_index = index[loc]
4312
4313 if is_scalar(loc) and axis == 0:
4314 # In this case loc should be an integer
4315 if self.ndim == 1:
4316 # if we encounter an array-like and we only have 1 dim
                # that means that there are list/ndarrays inside the Series!
4318 # so just return them (GH 6394)
4319 return self._values[loc]
4320
4321 new_mgr = self._mgr.fast_xs(loc)
4322
4323 result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes)
4324 result._name = self.index[loc]
4325 result = result.__finalize__(self)
4326 elif is_scalar(loc):
4327 result = self.iloc[:, slice(loc, loc + 1)]
4328 elif axis == 1:
4329 result = self.iloc[:, loc]
4330 else:
4331 result = self.iloc[loc]
4332 result.index = new_index
4333
4334 # this could be a view
4335 # but only in a single-dtyped view sliceable case
4336 result._set_is_copy(self, copy=not result._is_view)
4337 return result
4338
4339 def __getitem__(self, item):
4340 raise AbstractMethodError(self)
4341
4342 @final
4343 def _getitem_slice(self, key: slice) -> Self:
4344 """
4345 __getitem__ for the case where the key is a slice object.
4346 """
4347 # _convert_slice_indexer to determine if this slice is positional
4348 # or label based, and if the latter, convert to positional
4349 slobj = self.index._convert_slice_indexer(key, kind="getitem")
4350 if isinstance(slobj, np.ndarray):
4351 # reachable with DatetimeIndex
4352 indexer = lib.maybe_indices_to_slice(
4353 slobj.astype(np.intp, copy=False), len(self)
4354 )
4355 if isinstance(indexer, np.ndarray):
4356 # GH#43223 If we can not convert, use take
4357 return self.take(indexer, axis=0)
4358 slobj = indexer
4359 return self._slice(slobj)
4360
4361 def _slice(self, slobj: slice, axis: AxisInt = 0) -> Self:
4362 """
4363 Construct a slice of this container.
4364
4365 Slicing with this method is *always* positional.
4366 """
4367 assert isinstance(slobj, slice), type(slobj)
4368 axis = self._get_block_manager_axis(axis)
4369 new_mgr = self._mgr.get_slice(slobj, axis=axis)
4370 result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
4371 result = result.__finalize__(self)
4372
4373 # this could be a view
4374 # but only in a single-dtyped view sliceable case
4375 is_copy = axis != 0 or result._is_view
4376 result._set_is_copy(self, copy=is_copy)
4377 return result
4378
4379 @final
4380 def _set_is_copy(self, ref: NDFrame, copy: bool_t = True) -> None:
4381 if not copy:
4382 self._is_copy = None
4383 else:
4384 assert ref is not None
4385 self._is_copy = weakref.ref(ref)
4386
4387 def _check_is_chained_assignment_possible(self) -> bool_t:
4388 """
4389 Check if we are a view, have a cacher, and are of mixed type.
4390 If so, then force a setitem_copy check.
4391
        Should be called just prior to setting a value.

        Will return True if we are a view that is cached but single-dtyped,
        meaning that the cacher should be updated following the setting.
4397 """
4398 if self._is_copy:
4399 self._check_setitem_copy(t="referent")
4400 return False
4401
4402 @final
4403 def _check_setitem_copy(self, t: str = "setting", force: bool_t = False):
        """
        Validate if we are doing a setitem on a chained copy.

        Parameters
        ----------
        t : str, the type of setting error
        force : bool, default False
            If True, then force showing an error.

        Notes
        -----
        It is technically possible to figure out that we are setting on
        a copy even WITH a multi-dtyped pandas object. In other words, some
        blocks may be views while others are not. Currently _is_view will ALWAYS
        return False for multi-blocks to avoid having to handle this case.

        df = DataFrame(np.arange(0, 9), columns=['count'])
        df['group'] = 'b'

        # This technically need not raise SettingWithCopy if both are views
        # (which is not generally guaranteed, but is usually True). However,
        # this is in general not a good practice and we recommend using .loc.
        df.iloc[0:5]['group'] = 'a'
        """
4428 if using_copy_on_write() or warn_copy_on_write():
4429 return
4430
4431 # return early if the check is not needed
4432 if not (force or self._is_copy):
4433 return
4434
4435 value = config.get_option("mode.chained_assignment")
4436 if value is None:
4437 return
4438
        # see if the copy is not actually referenced any more; if so, then
        # dissolve the copy weakref
4441 if self._is_copy is not None and not isinstance(self._is_copy, str):
4442 r = self._is_copy()
4443 if not gc.get_referents(r) or (r is not None and r.shape == self.shape):
4444 self._is_copy = None
4445 return
4446
4447 # a custom message
4448 if isinstance(self._is_copy, str):
4449 t = self._is_copy
4450
4451 elif t == "referent":
4452 t = (
4453 "\n"
4454 "A value is trying to be set on a copy of a slice from a "
4455 "DataFrame\n\n"
4456 "See the caveats in the documentation: "
4457 "https://pandas.pydata.org/pandas-docs/stable/user_guide/"
4458 "indexing.html#returning-a-view-versus-a-copy"
4459 )
4460
4461 else:
4462 t = (
4463 "\n"
4464 "A value is trying to be set on a copy of a slice from a "
4465 "DataFrame.\n"
4466 "Try using .loc[row_indexer,col_indexer] = value "
4467 "instead\n\nSee the caveats in the documentation: "
4468 "https://pandas.pydata.org/pandas-docs/stable/user_guide/"
4469 "indexing.html#returning-a-view-versus-a-copy"
4470 )
4471
4472 if value == "raise":
4473 raise SettingWithCopyError(t)
4474 if value == "warn":
4475 warnings.warn(t, SettingWithCopyWarning, stacklevel=find_stack_level())
4476
4477 @final
4478 def __delitem__(self, key) -> None:
4479 """
4480 Delete item
4481 """
4482 deleted = False
4483
4484 maybe_shortcut = False
4485 if self.ndim == 2 and isinstance(self.columns, MultiIndex):
4486 try:
4487 # By using engine's __contains__ we effectively
4488 # restrict to same-length tuples
4489 maybe_shortcut = key not in self.columns._engine
4490 except TypeError:
4491 pass
4492
4493 if maybe_shortcut:
4494 # Allow shorthand to delete all columns whose first len(key)
4495 # elements match key:
4496 if not isinstance(key, tuple):
4497 key = (key,)
4498 for col in self.columns:
4499 if isinstance(col, tuple) and col[: len(key)] == key:
4500 del self[col]
4501 deleted = True
4502 if not deleted:
4503 # If the above loop ran and didn't delete anything because
4504 # there was no match, this call should raise the appropriate
4505 # exception:
4506 loc = self.axes[-1].get_loc(key)
4507 self._mgr = self._mgr.idelete(loc)
4508
4509 # delete from the caches
4510 try:
4511 del self._item_cache[key]
4512 except KeyError:
4513 pass
4514
4515 # ----------------------------------------------------------------------
4516 # Unsorted
4517
4518 @final
4519 def _check_inplace_and_allows_duplicate_labels(self, inplace: bool_t):
4520 if inplace and not self.flags.allows_duplicate_labels:
4521 raise ValueError(
4522 "Cannot specify 'inplace=True' when "
4523 "'self.flags.allows_duplicate_labels' is False."
4524 )
4525
4526 @final
4527 def get(self, key, default=None):
4528 """
4529 Get item from object for given key (ex: DataFrame column).
4530
4531 Returns default value if not found.
4532
4533 Parameters
4534 ----------
        key : object
            Key (e.g. a column label) for which to retrieve the item.
        default : object, default None
            Value to return if the key is not found.

4537 Returns
4538 -------
4539 same type as items contained in object
4540
4541 Examples
4542 --------
4543 >>> df = pd.DataFrame(
4544 ... [
4545 ... [24.3, 75.7, "high"],
4546 ... [31, 87.8, "high"],
4547 ... [22, 71.6, "medium"],
4548 ... [35, 95, "medium"],
4549 ... ],
4550 ... columns=["temp_celsius", "temp_fahrenheit", "windspeed"],
4551 ... index=pd.date_range(start="2014-02-12", end="2014-02-15", freq="D"),
4552 ... )
4553
4554 >>> df
4555 temp_celsius temp_fahrenheit windspeed
4556 2014-02-12 24.3 75.7 high
4557 2014-02-13 31.0 87.8 high
4558 2014-02-14 22.0 71.6 medium
4559 2014-02-15 35.0 95.0 medium
4560
4561 >>> df.get(["temp_celsius", "windspeed"])
4562 temp_celsius windspeed
4563 2014-02-12 24.3 high
4564 2014-02-13 31.0 high
4565 2014-02-14 22.0 medium
4566 2014-02-15 35.0 medium
4567
4568 >>> ser = df['windspeed']
4569 >>> ser.get('2014-02-13')
4570 'high'
4571
4572 If the key isn't found, the default value will be used.
4573
4574 >>> df.get(["temp_celsius", "temp_kelvin"], default="default_value")
4575 'default_value'
4576
4577 >>> ser.get('2014-02-10', '[unknown]')
4578 '[unknown]'
4579 """
4580 try:
4581 return self[key]
4582 except (KeyError, ValueError, IndexError):
4583 return default
4584
4585 @final
4586 @property
4587 def _is_view(self) -> bool_t:
4588 """Return boolean indicating if self is view of another array"""
4589 return self._mgr.is_view
4590
4591 @final
4592 def reindex_like(
4593 self,
4594 other,
4595 method: Literal["backfill", "bfill", "pad", "ffill", "nearest"] | None = None,
4596 copy: bool_t | None = None,
4597 limit: int | None = None,
4598 tolerance=None,
4599 ) -> Self:
4600 """
        Return an object with indices matching those of another object.
4602
4603 Conform the object to the same index on all axes. Optional
4604 filling logic, placing NaN in locations having no value
4605 in the previous index. A new object is produced unless the
4606 new index is equivalent to the current one and copy=False.
4607
4608 Parameters
4609 ----------
4610 other : Object of the same data type
4611 Its row and column indices are used to define the new indices
4612 of this object.
4613 method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}
4614 Method to use for filling holes in reindexed DataFrame.
4615 Please note: this is only applicable to DataFrames/Series with a
4616 monotonically increasing/decreasing index.
4617
4618 * None (default): don't fill gaps
4619 * pad / ffill: propagate last valid observation forward to next
4620 valid
4621 * backfill / bfill: use next valid observation to fill gap
4622 * nearest: use nearest valid observations to fill gap.
4623
4624 copy : bool, default True
4625 Return a new object, even if the passed indexes are the same.
4626
4627 .. note::
4628 The `copy` keyword will change behavior in pandas 3.0.
4629 `Copy-on-Write
4630 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
4631 will be enabled by default, which means that all methods with a
4632 `copy` keyword will use a lazy copy mechanism to defer the copy and
4633 ignore the `copy` keyword. The `copy` keyword will be removed in a
4634 future version of pandas.
4635
                You can already get the future behavior and improvements by
                enabling copy-on-write: ``pd.options.mode.copy_on_write = True``.
4638 limit : int, default None
4639 Maximum number of consecutive labels to fill for inexact matches.
4640 tolerance : optional
4641 Maximum distance between original and new labels for inexact
4642 matches. The values of the index at the matching locations must
4643 satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
4644
4645 Tolerance may be a scalar value, which applies the same tolerance
4646 to all values, or list-like, which applies variable tolerance per
4647 element. List-like includes list, tuple, array, Series, and must be
4648 the same size as the index and its dtype must exactly match the
4649 index's type.
4650
4651 Returns
4652 -------
4653 Series or DataFrame
4654 Same type as caller, but with changed indices on each axis.
4655
4656 See Also
4657 --------
4658 DataFrame.set_index : Set row labels.
4659 DataFrame.reset_index : Remove row labels or move them to new columns.
4660 DataFrame.reindex : Change to new indices or expand indices.
4661
4662 Notes
4663 -----
4664 Same as calling
4665 ``.reindex(index=other.index, columns=other.columns,...)``.
4666
4667 Examples
4668 --------
4669 >>> df1 = pd.DataFrame([[24.3, 75.7, 'high'],
4670 ... [31, 87.8, 'high'],
4671 ... [22, 71.6, 'medium'],
4672 ... [35, 95, 'medium']],
4673 ... columns=['temp_celsius', 'temp_fahrenheit',
4674 ... 'windspeed'],
4675 ... index=pd.date_range(start='2014-02-12',
4676 ... end='2014-02-15', freq='D'))
4677
4678 >>> df1
4679 temp_celsius temp_fahrenheit windspeed
4680 2014-02-12 24.3 75.7 high
4681 2014-02-13 31.0 87.8 high
4682 2014-02-14 22.0 71.6 medium
4683 2014-02-15 35.0 95.0 medium
4684
4685 >>> df2 = pd.DataFrame([[28, 'low'],
4686 ... [30, 'low'],
4687 ... [35.1, 'medium']],
4688 ... columns=['temp_celsius', 'windspeed'],
4689 ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13',
4690 ... '2014-02-15']))
4691
4692 >>> df2
4693 temp_celsius windspeed
4694 2014-02-12 28.0 low
4695 2014-02-13 30.0 low
4696 2014-02-15 35.1 medium
4697
4698 >>> df2.reindex_like(df1)
4699 temp_celsius temp_fahrenheit windspeed
4700 2014-02-12 28.0 NaN low
4701 2014-02-13 30.0 NaN low
4702 2014-02-14 NaN NaN NaN
4703 2014-02-15 35.1 NaN medium
4704 """
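        # Build the reindex keyword arguments from ``other``'s axes (the
        # index, plus columns for 2-D objects) together with the fill options,
        # then delegate to ``reindex``.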
4705 d = other._construct_axes_dict(
4706 axes=self._AXIS_ORDERS,
4707 method=method,
4708 copy=copy,
4709 limit=limit,
4710 tolerance=tolerance,
4711 )
4712
4713 return self.reindex(**d)
4714
4715 @overload
4716 def drop(
4717 self,
4718 labels: IndexLabel = ...,
4719 *,
4720 axis: Axis = ...,
4721 index: IndexLabel = ...,
4722 columns: IndexLabel = ...,
4723 level: Level | None = ...,
4724 inplace: Literal[True],
4725 errors: IgnoreRaise = ...,
4726 ) -> None:
4727 ...
4728
4729 @overload
4730 def drop(
4731 self,
4732 labels: IndexLabel = ...,
4733 *,
4734 axis: Axis = ...,
4735 index: IndexLabel = ...,
4736 columns: IndexLabel = ...,
4737 level: Level | None = ...,
4738 inplace: Literal[False] = ...,
4739 errors: IgnoreRaise = ...,
4740 ) -> Self:
4741 ...
4742
4743 @overload
4744 def drop(
4745 self,
4746 labels: IndexLabel = ...,
4747 *,
4748 axis: Axis = ...,
4749 index: IndexLabel = ...,
4750 columns: IndexLabel = ...,
4751 level: Level | None = ...,
4752 inplace: bool_t = ...,
4753 errors: IgnoreRaise = ...,
4754 ) -> Self | None:
4755 ...
4756
4757 def drop(
4758 self,
4759 labels: IndexLabel | None = None,
4760 *,
4761 axis: Axis = 0,
4762 index: IndexLabel | None = None,
4763 columns: IndexLabel | None = None,
4764 level: Level | None = None,
4765 inplace: bool_t = False,
4766 errors: IgnoreRaise = "raise",
4767 ) -> Self | None:
4768 inplace = validate_bool_kwarg(inplace, "inplace")
4769
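        # Resolve the two calling conventions: positional ``labels`` combined
        # with ``axis``, or the explicit ``index``/``columns`` keywords.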
4770 if labels is not None:
4771 if index is not None or columns is not None:
4772 raise ValueError("Cannot specify both 'labels' and 'index'/'columns'")
4773 axis_name = self._get_axis_name(axis)
4774 axes = {axis_name: labels}
4775 elif index is not None or columns is not None:
4776 axes = {"index": index}
4777 if self.ndim == 2:
4778 axes["columns"] = columns
4779 else:
4780 raise ValueError(
4781 "Need to specify at least one of 'labels', 'index' or 'columns'"
4782 )
4783
4784 obj = self
4785
4786 for axis, labels in axes.items():
4787 if labels is not None:
4788 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
4789
4790 if inplace:
4791 self._update_inplace(obj)
4792 return None
4793 else:
4794 return obj
4795
4796 @final
4797 def _drop_axis(
4798 self,
4799 labels,
4800 axis,
4801 level=None,
4802 errors: IgnoreRaise = "raise",
4803 only_slice: bool_t = False,
4804 ) -> Self:
4805 """
4806 Drop labels from specified axis. Used in the ``drop`` method
4807 internally.
4808
4809 Parameters
4810 ----------
4811 labels : single label or list-like
4812 axis : int or axis name
4813 level : int or level name, default None
4814 For MultiIndex
4815 errors : {'ignore', 'raise'}, default 'raise'
            If 'ignore', suppress the error; only existing labels are dropped.
4817 only_slice : bool, default False
4818 Whether indexing along columns should be view-only.
4819
4820 """
4821 axis_num = self._get_axis_number(axis)
4822 axis = self._get_axis(axis)
4823
4824 if axis.is_unique:
4825 if level is not None:
4826 if not isinstance(axis, MultiIndex):
4827 raise AssertionError("axis must be a MultiIndex")
4828 new_axis = axis.drop(labels, level=level, errors=errors)
4829 else:
4830 new_axis = axis.drop(labels, errors=errors)
4831 indexer = axis.get_indexer(new_axis)
4832
4833 # Case for non-unique axis
4834 else:
4835 is_tuple_labels = is_nested_list_like(labels) or isinstance(labels, tuple)
4836 labels = ensure_object(common.index_labels_to_array(labels))
4837 if level is not None:
4838 if not isinstance(axis, MultiIndex):
4839 raise AssertionError("axis must be a MultiIndex")
4840 mask = ~axis.get_level_values(level).isin(labels)
4841
4842 # GH 18561 MultiIndex.drop should raise if label is absent
4843 if errors == "raise" and mask.all():
4844 raise KeyError(f"{labels} not found in axis")
4845 elif (
4846 isinstance(axis, MultiIndex)
4847 and labels.dtype == "object"
4848 and not is_tuple_labels
4849 ):
4850 # Set level to zero in case of MultiIndex and label is string,
4851 # because isin can't handle strings for MultiIndexes GH#36293
4852 # In case of tuples we get dtype object but have to use isin GH#42771
4853 mask = ~axis.get_level_values(0).isin(labels)
4854 else:
4855 mask = ~axis.isin(labels)
4856 # Check if label doesn't exist along axis
4857 labels_missing = (axis.get_indexer_for(labels) == -1).any()
4858 if errors == "raise" and labels_missing:
4859 raise KeyError(f"{labels} not found in axis")
4860
4861 if isinstance(mask.dtype, ExtensionDtype):
4862 # GH#45860
4863 mask = mask.to_numpy(dtype=bool)
4864
4865 indexer = mask.nonzero()[0]
4866 new_axis = axis.take(indexer)
4867
4868 bm_axis = self.ndim - axis_num - 1
4869 new_mgr = self._mgr.reindex_indexer(
4870 new_axis,
4871 indexer,
4872 axis=bm_axis,
4873 allow_dups=True,
4874 copy=None,
4875 only_slice=only_slice,
4876 )
4877 result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
4878 if self.ndim == 1:
4879 result._name = self.name
4880
4881 return result.__finalize__(self)
4882
4883 @final
4884 def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None:
4885 """
4886 Replace self internals with result.
4887
4888 Parameters
4889 ----------
4890 result : same type as self
4891 verify_is_copy : bool, default True
4892 Provide is_copy checks.
4893 """
4894 # NOTE: This does *not* call __finalize__ and that's an explicit
4895 # decision that we may revisit in the future.
4896 self._reset_cache()
4897 self._clear_item_cache()
4898 self._mgr = result._mgr
4899 self._maybe_update_cacher(verify_is_copy=verify_is_copy, inplace=True)
4900
4901 @final
4902 def add_prefix(self, prefix: str, axis: Axis | None = None) -> Self:
4903 """
4904 Prefix labels with string `prefix`.
4905
4906 For Series, the row labels are prefixed.
4907 For DataFrame, the column labels are prefixed.
4908
4909 Parameters
4910 ----------
4911 prefix : str
4912 The string to add before each label.
4913 axis : {0 or 'index', 1 or 'columns', None}, default None
            Axis to add prefix on.
4915
4916 .. versionadded:: 2.0.0
4917
4918 Returns
4919 -------
4920 Series or DataFrame
4921 New Series or DataFrame with updated labels.
4922
4923 See Also
4924 --------
4925 Series.add_suffix: Suffix row labels with string `suffix`.
4926 DataFrame.add_suffix: Suffix column labels with string `suffix`.
4927
4928 Examples
4929 --------
4930 >>> s = pd.Series([1, 2, 3, 4])
4931 >>> s
4932 0 1
4933 1 2
4934 2 3
4935 3 4
4936 dtype: int64
4937
4938 >>> s.add_prefix('item_')
4939 item_0 1
4940 item_1 2
4941 item_2 3
4942 item_3 4
4943 dtype: int64
4944
4945 >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
4946 >>> df
4947 A B
4948 0 1 3
4949 1 2 4
4950 2 3 5
4951 3 4 6
4952
4953 >>> df.add_prefix('col_')
4954 col_A col_B
4955 0 1 3
4956 1 2 4
4957 2 3 5
4958 3 4 6
4959 """
4960 f = lambda x: f"{prefix}{x}"
4961
4962 axis_name = self._info_axis_name
4963 if axis is not None:
4964 axis_name = self._get_axis_name(axis)
4965
4966 mapper = {axis_name: f}
4967
4968 # error: Incompatible return value type (got "Optional[Self]",
4969 # expected "Self")
4970 # error: Argument 1 to "rename" of "NDFrame" has incompatible type
4971 # "**Dict[str, partial[str]]"; expected "Union[str, int, None]"
4972 # error: Keywords must be strings
4973 return self._rename(**mapper) # type: ignore[return-value, arg-type, misc]
4974
4975 @final
4976 def add_suffix(self, suffix: str, axis: Axis | None = None) -> Self:
4977 """
4978 Suffix labels with string `suffix`.
4979
4980 For Series, the row labels are suffixed.
4981 For DataFrame, the column labels are suffixed.
4982
4983 Parameters
4984 ----------
4985 suffix : str
4986 The string to add after each label.
4987 axis : {0 or 'index', 1 or 'columns', None}, default None
            Axis to add suffix on.
4989
4990 .. versionadded:: 2.0.0
4991
4992 Returns
4993 -------
4994 Series or DataFrame
4995 New Series or DataFrame with updated labels.
4996
4997 See Also
4998 --------
4999 Series.add_prefix: Prefix row labels with string `prefix`.
5000 DataFrame.add_prefix: Prefix column labels with string `prefix`.
5001
5002 Examples
5003 --------
5004 >>> s = pd.Series([1, 2, 3, 4])
5005 >>> s
5006 0 1
5007 1 2
5008 2 3
5009 3 4
5010 dtype: int64
5011
5012 >>> s.add_suffix('_item')
5013 0_item 1
5014 1_item 2
5015 2_item 3
5016 3_item 4
5017 dtype: int64
5018
5019 >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
5020 >>> df
5021 A B
5022 0 1 3
5023 1 2 4
5024 2 3 5
5025 3 4 6
5026
5027 >>> df.add_suffix('_col')
5028 A_col B_col
5029 0 1 3
5030 1 2 4
5031 2 3 5
5032 3 4 6
5033 """
5034 f = lambda x: f"{x}{suffix}"
5035
5036 axis_name = self._info_axis_name
5037 if axis is not None:
5038 axis_name = self._get_axis_name(axis)
5039
5040 mapper = {axis_name: f}
5041 # error: Incompatible return value type (got "Optional[Self]",
5042 # expected "Self")
5043 # error: Argument 1 to "rename" of "NDFrame" has incompatible type
5044 # "**Dict[str, partial[str]]"; expected "Union[str, int, None]"
5045 # error: Keywords must be strings
5046 return self._rename(**mapper) # type: ignore[return-value, arg-type, misc]
5047
5048 @overload
5049 def sort_values(
5050 self,
5051 *,
5052 axis: Axis = ...,
5053 ascending: bool_t | Sequence[bool_t] = ...,
5054 inplace: Literal[False] = ...,
5055 kind: SortKind = ...,
5056 na_position: NaPosition = ...,
5057 ignore_index: bool_t = ...,
5058 key: ValueKeyFunc = ...,
5059 ) -> Self:
5060 ...
5061
5062 @overload
5063 def sort_values(
5064 self,
5065 *,
5066 axis: Axis = ...,
5067 ascending: bool_t | Sequence[bool_t] = ...,
5068 inplace: Literal[True],
5069 kind: SortKind = ...,
5070 na_position: NaPosition = ...,
5071 ignore_index: bool_t = ...,
5072 key: ValueKeyFunc = ...,
5073 ) -> None:
5074 ...
5075
5076 @overload
5077 def sort_values(
5078 self,
5079 *,
5080 axis: Axis = ...,
5081 ascending: bool_t | Sequence[bool_t] = ...,
5082 inplace: bool_t = ...,
5083 kind: SortKind = ...,
5084 na_position: NaPosition = ...,
5085 ignore_index: bool_t = ...,
5086 key: ValueKeyFunc = ...,
5087 ) -> Self | None:
5088 ...
5089
5090 def sort_values(
5091 self,
5092 *,
5093 axis: Axis = 0,
5094 ascending: bool_t | Sequence[bool_t] = True,
5095 inplace: bool_t = False,
5096 kind: SortKind = "quicksort",
5097 na_position: NaPosition = "last",
5098 ignore_index: bool_t = False,
5099 key: ValueKeyFunc | None = None,
5100 ) -> Self | None:
5101 """
5102 Sort by the values along either axis.
5103
5104 Parameters
5105 ----------%(optional_by)s
5106 axis : %(axes_single_arg)s, default 0
5107 Axis to be sorted.
5108 ascending : bool or list of bool, default True
5109 Sort ascending vs. descending. Specify list for multiple sort
            orders. If this is a list of bools, it must match the length of
            `by`.
5112 inplace : bool, default False
5113 If True, perform operation in-place.
5114 kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
5115 Choice of sorting algorithm. See also :func:`numpy.sort` for more
5116 information. `mergesort` and `stable` are the only stable algorithms. For
5117 DataFrames, this option is only applied when sorting on a single
5118 column or label.
5119 na_position : {'first', 'last'}, default 'last'
5120 Puts NaNs at the beginning if `first`; `last` puts NaNs at the
5121 end.
5122 ignore_index : bool, default False
5123 If True, the resulting axis will be labeled 0, 1, …, n - 1.
5124 key : callable, optional
5125 Apply the key function to the values
5126 before sorting. This is similar to the `key` argument in the
5127 builtin :meth:`sorted` function, with the notable difference that
5128 this `key` function should be *vectorized*. It should expect a
5129 ``Series`` and return a Series with the same shape as the input.
5130 It will be applied to each column in `by` independently.
5131
5132 Returns
5133 -------
5134 DataFrame or None
5135 DataFrame with sorted values or None if ``inplace=True``.
5136
5137 See Also
5138 --------
5139 DataFrame.sort_index : Sort a DataFrame by the index.
5140 Series.sort_values : Similar method for a Series.
5141
5142 Examples
5143 --------
5144 >>> df = pd.DataFrame({
5145 ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'],
5146 ... 'col2': [2, 1, 9, 8, 7, 4],
5147 ... 'col3': [0, 1, 9, 4, 2, 3],
5148 ... 'col4': ['a', 'B', 'c', 'D', 'e', 'F']
5149 ... })
5150 >>> df
5151 col1 col2 col3 col4
5152 0 A 2 0 a
5153 1 A 1 1 B
5154 2 B 9 9 c
5155 3 NaN 8 4 D
5156 4 D 7 2 e
5157 5 C 4 3 F
5158
5159 Sort by col1
5160
5161 >>> df.sort_values(by=['col1'])
5162 col1 col2 col3 col4
5163 0 A 2 0 a
5164 1 A 1 1 B
5165 2 B 9 9 c
5166 5 C 4 3 F
5167 4 D 7 2 e
5168 3 NaN 8 4 D
5169
5170 Sort by multiple columns
5171
5172 >>> df.sort_values(by=['col1', 'col2'])
5173 col1 col2 col3 col4
5174 1 A 1 1 B
5175 0 A 2 0 a
5176 2 B 9 9 c
5177 5 C 4 3 F
5178 4 D 7 2 e
5179 3 NaN 8 4 D
5180
5181 Sort Descending
5182
5183 >>> df.sort_values(by='col1', ascending=False)
5184 col1 col2 col3 col4
5185 4 D 7 2 e
5186 5 C 4 3 F
5187 2 B 9 9 c
5188 0 A 2 0 a
5189 1 A 1 1 B
5190 3 NaN 8 4 D
5191
5192 Putting NAs first
5193
5194 >>> df.sort_values(by='col1', ascending=False, na_position='first')
5195 col1 col2 col3 col4
5196 3 NaN 8 4 D
5197 4 D 7 2 e
5198 5 C 4 3 F
5199 2 B 9 9 c
5200 0 A 2 0 a
5201 1 A 1 1 B
5202
5203 Sorting with a key function
5204
5205 >>> df.sort_values(by='col4', key=lambda col: col.str.lower())
5206 col1 col2 col3 col4
5207 0 A 2 0 a
5208 1 A 1 1 B
5209 2 B 9 9 c
5210 3 NaN 8 4 D
5211 4 D 7 2 e
5212 5 C 4 3 F
5213
5214 Natural sort with the key argument,
        using the `natsort <https://github.com/SethMMorton/natsort>`__ package.
5216
5217 >>> df = pd.DataFrame({
5218 ... "time": ['0hr', '128hr', '72hr', '48hr', '96hr'],
5219 ... "value": [10, 20, 30, 40, 50]
5220 ... })
5221 >>> df
5222 time value
5223 0 0hr 10
5224 1 128hr 20
5225 2 72hr 30
5226 3 48hr 40
5227 4 96hr 50
5228 >>> from natsort import index_natsorted
5229 >>> df.sort_values(
5230 ... by="time",
5231 ... key=lambda x: np.argsort(index_natsorted(df["time"]))
5232 ... )
5233 time value
5234 0 0hr 10
5235 3 48hr 40
5236 2 72hr 30
5237 4 96hr 50
5238 1 128hr 20
5239 """
5240 raise AbstractMethodError(self)
5241
5242 @overload
5243 def sort_index(
5244 self,
5245 *,
5246 axis: Axis = ...,
5247 level: IndexLabel = ...,
5248 ascending: bool_t | Sequence[bool_t] = ...,
5249 inplace: Literal[True],
5250 kind: SortKind = ...,
5251 na_position: NaPosition = ...,
5252 sort_remaining: bool_t = ...,
5253 ignore_index: bool_t = ...,
5254 key: IndexKeyFunc = ...,
5255 ) -> None:
5256 ...
5257
5258 @overload
5259 def sort_index(
5260 self,
5261 *,
5262 axis: Axis = ...,
5263 level: IndexLabel = ...,
5264 ascending: bool_t | Sequence[bool_t] = ...,
5265 inplace: Literal[False] = ...,
5266 kind: SortKind = ...,
5267 na_position: NaPosition = ...,
5268 sort_remaining: bool_t = ...,
5269 ignore_index: bool_t = ...,
5270 key: IndexKeyFunc = ...,
5271 ) -> Self:
5272 ...
5273
5274 @overload
5275 def sort_index(
5276 self,
5277 *,
5278 axis: Axis = ...,
5279 level: IndexLabel = ...,
5280 ascending: bool_t | Sequence[bool_t] = ...,
5281 inplace: bool_t = ...,
5282 kind: SortKind = ...,
5283 na_position: NaPosition = ...,
5284 sort_remaining: bool_t = ...,
5285 ignore_index: bool_t = ...,
5286 key: IndexKeyFunc = ...,
5287 ) -> Self | None:
5288 ...
5289
5290 def sort_index(
5291 self,
5292 *,
5293 axis: Axis = 0,
5294 level: IndexLabel | None = None,
5295 ascending: bool_t | Sequence[bool_t] = True,
5296 inplace: bool_t = False,
5297 kind: SortKind = "quicksort",
5298 na_position: NaPosition = "last",
5299 sort_remaining: bool_t = True,
5300 ignore_index: bool_t = False,
5301 key: IndexKeyFunc | None = None,
5302 ) -> Self | None:
5303 inplace = validate_bool_kwarg(inplace, "inplace")
5304 axis = self._get_axis_number(axis)
5305 ascending = validate_ascending(ascending)
5306
5307 target = self._get_axis(axis)
5308
5309 indexer = get_indexer_indexer(
5310 target, level, ascending, kind, na_position, sort_remaining, key
5311 )
5312
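        # A ``None`` indexer means the axis is already sorted as requested,
        # so the take can be skipped entirely.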
5313 if indexer is None:
5314 if inplace:
5315 result = self
5316 else:
5317 result = self.copy(deep=None)
5318
5319 if ignore_index:
5320 result.index = default_index(len(self))
5321 if inplace:
5322 return None
5323 else:
5324 return result
5325
5326 baxis = self._get_block_manager_axis(axis)
5327 new_data = self._mgr.take(indexer, axis=baxis, verify=False)
5328
5329 # reconstruct axis if needed
5330 if not ignore_index:
5331 new_axis = new_data.axes[baxis]._sort_levels_monotonic()
5332 else:
5333 new_axis = default_index(len(indexer))
5334 new_data.set_axis(baxis, new_axis)
5335
5336 result = self._constructor_from_mgr(new_data, axes=new_data.axes)
5337
5338 if inplace:
5339 return self._update_inplace(result)
5340 else:
5341 return result.__finalize__(self, method="sort_index")
5342
5343 @doc(
5344 klass=_shared_doc_kwargs["klass"],
5345 optional_reindex="",
5346 )
5347 def reindex(
5348 self,
5349 labels=None,
5350 *,
5351 index=None,
5352 columns=None,
5353 axis: Axis | None = None,
5354 method: ReindexMethod | None = None,
5355 copy: bool_t | None = None,
5356 level: Level | None = None,
5357 fill_value: Scalar | None = np.nan,
5358 limit: int | None = None,
5359 tolerance=None,
5360 ) -> Self:
5361 """
5362 Conform {klass} to new index with optional filling logic.
5363
5364 Places NA/NaN in locations having no value in the previous index. A new object
5365 is produced unless the new index is equivalent to the current one and
5366 ``copy=False``.
5367
5368 Parameters
5369 ----------
5370 {optional_reindex}
5371 method : {{None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}}
5372 Method to use for filling holes in reindexed DataFrame.
5373 Please note: this is only applicable to DataFrames/Series with a
5374 monotonically increasing/decreasing index.
5375
5376 * None (default): don't fill gaps
5377 * pad / ffill: Propagate last valid observation forward to next
5378 valid.
5379 * backfill / bfill: Use next valid observation to fill gap.
5380 * nearest: Use nearest valid observations to fill gap.
5381
5382 copy : bool, default True
5383 Return a new object, even if the passed indexes are the same.
5384
5385 .. note::
5386 The `copy` keyword will change behavior in pandas 3.0.
5387 `Copy-on-Write
5388 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
5389 will be enabled by default, which means that all methods with a
5390 `copy` keyword will use a lazy copy mechanism to defer the copy and
5391 ignore the `copy` keyword. The `copy` keyword will be removed in a
5392 future version of pandas.
5393
                You can already get the future behavior and improvements by
                enabling copy-on-write: ``pd.options.mode.copy_on_write = True``.
5396 level : int or name
5397 Broadcast across a level, matching Index values on the
5398 passed MultiIndex level.
5399 fill_value : scalar, default np.nan
5400 Value to use for missing values. Defaults to NaN, but can be any
5401 "compatible" value.
5402 limit : int, default None
5403 Maximum number of consecutive elements to forward or backward fill.
5404 tolerance : optional
5405 Maximum distance between original and new labels for inexact
            matches. The values of the index at the matching locations must
5407 satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
5408
5409 Tolerance may be a scalar value, which applies the same tolerance
5410 to all values, or list-like, which applies variable tolerance per
5411 element. List-like includes list, tuple, array, Series, and must be
5412 the same size as the index and its dtype must exactly match the
5413 index's type.
5414
5415 Returns
5416 -------
5417 {klass} with changed index.
5418
5419 See Also
5420 --------
5421 DataFrame.set_index : Set row labels.
5422 DataFrame.reset_index : Remove row labels or move them to new columns.
5423 DataFrame.reindex_like : Change to same indices as other DataFrame.
5424
5425 Examples
5426 --------
5427 ``DataFrame.reindex`` supports two calling conventions
5428
5429 * ``(index=index_labels, columns=column_labels, ...)``
5430 * ``(labels, axis={{'index', 'columns'}}, ...)``
5431
5432 We *highly* recommend using keyword arguments to clarify your
5433 intent.
5434
5435 Create a dataframe with some fictional data.
5436
5437 >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
5438 >>> df = pd.DataFrame({{'http_status': [200, 200, 404, 404, 301],
5439 ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}},
5440 ... index=index)
5441 >>> df
5442 http_status response_time
5443 Firefox 200 0.04
5444 Chrome 200 0.02
5445 Safari 404 0.07
5446 IE10 404 0.08
5447 Konqueror 301 1.00
5448
5449 Create a new index and reindex the dataframe. By default
5450 values in the new index that do not have corresponding
5451 records in the dataframe are assigned ``NaN``.
5452
5453 >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10',
5454 ... 'Chrome']
5455 >>> df.reindex(new_index)
5456 http_status response_time
5457 Safari 404.0 0.07
5458 Iceweasel NaN NaN
5459 Comodo Dragon NaN NaN
5460 IE10 404.0 0.08
5461 Chrome 200.0 0.02
5462
5463 We can fill in the missing values by passing a value to
5464 the keyword ``fill_value``. Because the index is not monotonically
5465 increasing or decreasing, we cannot use arguments to the keyword
5466 ``method`` to fill the ``NaN`` values.
5467
5468 >>> df.reindex(new_index, fill_value=0)
5469 http_status response_time
5470 Safari 404 0.07
5471 Iceweasel 0 0.00
5472 Comodo Dragon 0 0.00
5473 IE10 404 0.08
5474 Chrome 200 0.02
5475
5476 >>> df.reindex(new_index, fill_value='missing')
5477 http_status response_time
5478 Safari 404 0.07
5479 Iceweasel missing missing
5480 Comodo Dragon missing missing
5481 IE10 404 0.08
5482 Chrome 200 0.02
5483
5484 We can also reindex the columns.
5485
5486 >>> df.reindex(columns=['http_status', 'user_agent'])
5487 http_status user_agent
5488 Firefox 200 NaN
5489 Chrome 200 NaN
5490 Safari 404 NaN
5491 IE10 404 NaN
5492 Konqueror 301 NaN
5493
5494 Or we can use "axis-style" keyword arguments
5495
5496 >>> df.reindex(['http_status', 'user_agent'], axis="columns")
5497 http_status user_agent
5498 Firefox 200 NaN
5499 Chrome 200 NaN
5500 Safari 404 NaN
5501 IE10 404 NaN
5502 Konqueror 301 NaN
5503
5504 To further illustrate the filling functionality in
5505 ``reindex``, we will create a dataframe with a
5506 monotonically increasing index (for example, a sequence
5507 of dates).
5508
5509 >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D')
5510 >>> df2 = pd.DataFrame({{"prices": [100, 101, np.nan, 100, 89, 88]}},
5511 ... index=date_index)
5512 >>> df2
5513 prices
5514 2010-01-01 100.0
5515 2010-01-02 101.0
5516 2010-01-03 NaN
5517 2010-01-04 100.0
5518 2010-01-05 89.0
5519 2010-01-06 88.0
5520
5521 Suppose we decide to expand the dataframe to cover a wider
5522 date range.
5523
5524 >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D')
5525 >>> df2.reindex(date_index2)
5526 prices
5527 2009-12-29 NaN
5528 2009-12-30 NaN
5529 2009-12-31 NaN
5530 2010-01-01 100.0
5531 2010-01-02 101.0
5532 2010-01-03 NaN
5533 2010-01-04 100.0
5534 2010-01-05 89.0
5535 2010-01-06 88.0
5536 2010-01-07 NaN
5537
5538 The index entries that did not have a value in the original data frame
5539 (for example, '2009-12-29') are by default filled with ``NaN``.
5540 If desired, we can fill in the missing values using one of several
5541 options.
5542
        For example, to propagate the next valid value backwards in order to
        fill the ``NaN`` values, pass ``bfill`` as an argument to the
        ``method`` keyword.
5545
5546 >>> df2.reindex(date_index2, method='bfill')
5547 prices
5548 2009-12-29 100.0
5549 2009-12-30 100.0
5550 2009-12-31 100.0
5551 2010-01-01 100.0
5552 2010-01-02 101.0
5553 2010-01-03 NaN
5554 2010-01-04 100.0
5555 2010-01-05 89.0
5556 2010-01-06 88.0
5557 2010-01-07 NaN
5558
5559 Please note that the ``NaN`` value present in the original dataframe
5560 (at index value 2010-01-03) will not be filled by any of the
5561 value propagation schemes. This is because filling while reindexing
5562 does not look at dataframe values, but only compares the original and
5563 desired indexes. If you do want to fill in the ``NaN`` values present
5564 in the original dataframe, use the ``fillna()`` method.
5565
5566 See the :ref:`user guide <basics.reindexing>` for more.
5567 """
5568 # TODO: Decide if we care about having different examples for different
5569 # kinds
5570
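        # Resolve the two calling conventions (positional ``labels`` with
        # ``axis``, or the explicit ``index``/``columns`` keywords) into a
        # single axes mapping.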
5571 if index is not None and columns is not None and labels is not None:
5572 raise TypeError("Cannot specify all of 'labels', 'index', 'columns'.")
5573 elif index is not None or columns is not None:
5574 if axis is not None:
5575 raise TypeError(
5576 "Cannot specify both 'axis' and any of 'index' or 'columns'"
5577 )
5578 if labels is not None:
5579 if index is not None:
5580 columns = labels
5581 else:
5582 index = labels
5583 else:
5584 if axis and self._get_axis_number(axis) == 1:
5585 columns = labels
5586 else:
5587 index = labels
5588 axes: dict[Literal["index", "columns"], Any] = {
5589 "index": index,
5590 "columns": columns,
5591 }
5592 method = clean_reindex_fill_method(method)
5593
        # If all axes requested to be reindexed are identical to the existing
        # axes (matching in both values and index names), then we only copy if
        # explicitly indicated.
5596 if copy and using_copy_on_write():
5597 copy = False
5598 if all(
5599 self._get_axis(axis_name).identical(ax)
5600 for axis_name, ax in axes.items()
5601 if ax is not None
5602 ):
5603 return self.copy(deep=copy)
5604
5605 # check if we are a multi reindex
5606 if self._needs_reindex_multi(axes, method, level):
5607 return self._reindex_multi(axes, copy, fill_value)
5608
5609 # perform the reindex on the axes
5610 return self._reindex_axes(
5611 axes, level, limit, tolerance, method, fill_value, copy
5612 ).__finalize__(self, method="reindex")
5613
5614 @final
5615 def _reindex_axes(
5616 self,
5617 axes,
5618 level: Level | None,
5619 limit: int | None,
5620 tolerance,
5621 method,
5622 fill_value: Scalar | None,
5623 copy: bool_t | None,
5624 ) -> Self:
5625 """Perform the reindex for all the axes."""
5626 obj = self
5627 for a in self._AXIS_ORDERS:
5628 labels = axes[a]
5629 if labels is None:
5630 continue
5631
5632 ax = self._get_axis(a)
5633 new_index, indexer = ax.reindex(
5634 labels, level=level, limit=limit, tolerance=tolerance, method=method
5635 )
5636
5637 axis = self._get_axis_number(a)
5638 obj = obj._reindex_with_indexers(
5639 {axis: [new_index, indexer]},
5640 fill_value=fill_value,
5641 copy=copy,
5642 allow_dups=False,
5643 )
5644 # If we've made a copy once, no need to make another one
5645 copy = False
5646
5647 return obj
5648
5649 def _needs_reindex_multi(self, axes, method, level: Level | None) -> bool_t:
5650 """Check if we do need a multi reindex."""
5651 return (
5652 (common.count_not_none(*axes.values()) == self._AXIS_LEN)
5653 and method is None
5654 and level is None
5655 # reindex_multi calls self.values, so we only want to go
5656 # down that path when doing so is cheap.
5657 and self._can_fast_transpose
5658 )
5659
5660 def _reindex_multi(self, axes, copy, fill_value):
5661 raise AbstractMethodError(self)
5662
5663 @final
5664 def _reindex_with_indexers(
5665 self,
5666 reindexers,
5667 fill_value=None,
5668 copy: bool_t | None = False,
5669 allow_dups: bool_t = False,
5670 ) -> Self:
5671 """allow_dups indicates an internal call here"""
5672 # reindex doing multiple operations on different axes if indicated
5673 new_data = self._mgr
5674 for axis in sorted(reindexers.keys()):
5675 index, indexer = reindexers[axis]
5676 baxis = self._get_block_manager_axis(axis)
5677
5678 if index is None:
5679 continue
5680
5681 index = ensure_index(index)
5682 if indexer is not None:
5683 indexer = ensure_platform_int(indexer)
5684
5685 # TODO: speed up on homogeneous DataFrame objects (see _reindex_multi)
5686 new_data = new_data.reindex_indexer(
5687 index,
5688 indexer,
5689 axis=baxis,
5690 fill_value=fill_value,
5691 allow_dups=allow_dups,
5692 copy=copy,
5693 )
5694 # If we've made a copy once, no need to make another one
5695 copy = False
5696
5697 if (
5698 (copy or copy is None)
5699 and new_data is self._mgr
5700 and not using_copy_on_write()
5701 ):
5702 new_data = new_data.copy(deep=copy)
5703 elif using_copy_on_write() and new_data is self._mgr:
5704 new_data = new_data.copy(deep=False)
5705
5706 return self._constructor_from_mgr(new_data, axes=new_data.axes).__finalize__(
5707 self
5708 )
5709
5710 def filter(
5711 self,
5712 items=None,
5713 like: str | None = None,
5714 regex: str | None = None,
5715 axis: Axis | None = None,
5716 ) -> Self:
5717 """
5718 Subset the dataframe rows or columns according to the specified index labels.
5719
5720 Note that this routine does not filter a dataframe on its
5721 contents. The filter is applied to the labels of the index.
5722
5723 Parameters
5724 ----------
5725 items : list-like
5726 Keep labels from axis which are in items.
5727 like : str
5728 Keep labels from axis for which "like in label == True".
5729 regex : str (regular expression)
5730 Keep labels from axis for which re.search(regex, label) == True.
5731 axis : {0 or 'index', 1 or 'columns', None}, default None
5732 The axis to filter on, expressed either as an index (int)
5733 or axis name (str). By default this is the info axis, 'columns' for
5734 DataFrame. For `Series` this parameter is unused and defaults to `None`.
5735
5736 Returns
5737 -------
5738 same type as input object
5739
5740 See Also
5741 --------
5742 DataFrame.loc : Access a group of rows and columns
5743 by label(s) or a boolean array.
5744
5745 Notes
5746 -----
5747 The ``items``, ``like``, and ``regex`` parameters are
5748 enforced to be mutually exclusive.
5749
5750 ``axis`` defaults to the info axis that is used when indexing
5751 with ``[]``.
5752
5753 Examples
5754 --------
5755 >>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])),
5756 ... index=['mouse', 'rabbit'],
5757 ... columns=['one', 'two', 'three'])
5758 >>> df
5759 one two three
5760 mouse 1 2 3
5761 rabbit 4 5 6
5762
5763 >>> # select columns by name
5764 >>> df.filter(items=['one', 'three'])
5765 one three
5766 mouse 1 3
5767 rabbit 4 6
5768
5769 >>> # select columns by regular expression
5770 >>> df.filter(regex='e$', axis=1)
5771 one three
5772 mouse 1 3
5773 rabbit 4 6
5774
5775 >>> # select rows containing 'bbi'
5776 >>> df.filter(like='bbi', axis=0)
5777 one two three
5778 rabbit 4 5 6
5779 """
5780 nkw = common.count_not_none(items, like, regex)
5781 if nkw > 1:
5782 raise TypeError(
5783 "Keyword arguments `items`, `like`, or `regex` "
5784 "are mutually exclusive"
5785 )
5786
5787 if axis is None:
5788 axis = self._info_axis_name
5789 labels = self._get_axis(axis)
5790
5791 if items is not None:
5792 name = self._get_axis_name(axis)
5793 items = Index(items).intersection(labels)
5794 if len(items) == 0:
5795 # Keep the dtype of labels when we are empty
5796 items = items.astype(labels.dtype)
5797 # error: Keywords must be strings
5798 return self.reindex(**{name: items}) # type: ignore[misc]
5799 elif like:
5800
5801 def f(x) -> bool_t:
5802 assert like is not None # needed for mypy
5803 return like in ensure_str(x)
5804
5805 values = labels.map(f)
5806 return self.loc(axis=axis)[values]
5807 elif regex:
5808
5809 def f(x) -> bool_t:
5810 return matcher.search(ensure_str(x)) is not None
5811
5812 matcher = re.compile(regex)
5813 values = labels.map(f)
5814 return self.loc(axis=axis)[values]
5815 else:
5816 raise TypeError("Must pass either `items`, `like`, or `regex`")
5817
5818 @final
5819 def head(self, n: int = 5) -> Self:
5820 """
5821 Return the first `n` rows.
5822
5823 This function returns the first `n` rows for the object based
5824 on position. It is useful for quickly testing if your object
5825 has the right type of data in it.
5826
5827 For negative values of `n`, this function returns all rows except
5828 the last `|n|` rows, equivalent to ``df[:n]``.
5829
5830 If n is larger than the number of rows, this function returns all rows.
5831
5832 Parameters
5833 ----------
5834 n : int, default 5
5835 Number of rows to select.
5836
5837 Returns
5838 -------
5839 same type as caller
5840 The first `n` rows of the caller object.
5841
5842 See Also
5843 --------
5844 DataFrame.tail: Returns the last `n` rows.
5845
5846 Examples
5847 --------
5848 >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
5849 ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']})
5850 >>> df
5851 animal
5852 0 alligator
5853 1 bee
5854 2 falcon
5855 3 lion
5856 4 monkey
5857 5 parrot
5858 6 shark
5859 7 whale
5860 8 zebra
5861
5862 Viewing the first 5 lines
5863
5864 >>> df.head()
5865 animal
5866 0 alligator
5867 1 bee
5868 2 falcon
5869 3 lion
5870 4 monkey
5871
5872 Viewing the first `n` lines (three in this case)
5873
5874 >>> df.head(3)
5875 animal
5876 0 alligator
5877 1 bee
5878 2 falcon
5879
5880 For negative values of `n`
5881
5882 >>> df.head(-3)
5883 animal
5884 0 alligator
5885 1 bee
5886 2 falcon
5887 3 lion
5888 4 monkey
5889 5 parrot
5890 """
5891 if using_copy_on_write():
5892 return self.iloc[:n].copy()
5893 return self.iloc[:n]
5894
5895 @final
5896 def tail(self, n: int = 5) -> Self:
5897 """
5898 Return the last `n` rows.
5899
This function returns the last `n` rows from the object based on
5901 position. It is useful for quickly verifying data, for example,
5902 after sorting or appending rows.
5903
5904 For negative values of `n`, this function returns all rows except
5905 the first `|n|` rows, equivalent to ``df[|n|:]``.
5906
5907 If n is larger than the number of rows, this function returns all rows.
5908
5909 Parameters
5910 ----------
5911 n : int, default 5
5912 Number of rows to select.
5913
5914 Returns
5915 -------
5916 type of caller
5917 The last `n` rows of the caller object.
5918
5919 See Also
5920 --------
5921 DataFrame.head : The first `n` rows of the caller object.
5922
5923 Examples
5924 --------
5925 >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
5926 ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']})
5927 >>> df
5928 animal
5929 0 alligator
5930 1 bee
5931 2 falcon
5932 3 lion
5933 4 monkey
5934 5 parrot
5935 6 shark
5936 7 whale
5937 8 zebra
5938
5939 Viewing the last 5 lines
5940
5941 >>> df.tail()
5942 animal
5943 4 monkey
5944 5 parrot
5945 6 shark
5946 7 whale
5947 8 zebra
5948
5949 Viewing the last `n` lines (three in this case)
5950
5951 >>> df.tail(3)
5952 animal
5953 6 shark
5954 7 whale
5955 8 zebra
5956
5957 For negative values of `n`
5958
5959 >>> df.tail(-3)
5960 animal
5961 3 lion
5962 4 monkey
5963 5 parrot
5964 6 shark
5965 7 whale
5966 8 zebra
5967 """
5968 if using_copy_on_write():
5969 if n == 0:
5970 return self.iloc[0:0].copy()
5971 return self.iloc[-n:].copy()
5972 if n == 0:
5973 return self.iloc[0:0]
5974 return self.iloc[-n:]
5975
5976 @final
5977 def sample(
5978 self,
5979 n: int | None = None,
5980 frac: float | None = None,
5981 replace: bool_t = False,
5982 weights=None,
5983 random_state: RandomState | None = None,
5984 axis: Axis | None = None,
5985 ignore_index: bool_t = False,
5986 ) -> Self:
5987 """
5988 Return a random sample of items from an axis of object.
5989
5990 You can use `random_state` for reproducibility.
5991
5992 Parameters
5993 ----------
5994 n : int, optional
5995 Number of items from axis to return. Cannot be used with `frac`.
Default is 1 if `frac` is None.
5997 frac : float, optional
5998 Fraction of axis items to return. Cannot be used with `n`.
5999 replace : bool, default False
6000 Allow or disallow sampling of the same row more than once.
6001 weights : str or ndarray-like, optional
Default ``None`` results in equal probability weighting.
6003 If passed a Series, will align with target object on index. Index
6004 values in weights not found in sampled object will be ignored and
6005 index values in sampled object not in weights will be assigned
6006 weights of zero.
6007 If called on a DataFrame, will accept the name of a column
6008 when axis = 0.
Unless weights are a Series, weights must be the same length as the
axis being sampled.
6011 If weights do not sum to 1, they will be normalized to sum to 1.
6012 Missing values in the weights column will be treated as zero.
Infinite values are not allowed.
6014 random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional
6015 If int, array-like, or BitGenerator, seed for random number generator.
6016 If np.random.RandomState or np.random.Generator, use as given.
6017
6018 .. versionchanged:: 1.4.0
6019
6020 np.random.Generator objects now accepted
6021
6022 axis : {0 or 'index', 1 or 'columns', None}, default None
Axis to sample. Accepts axis number or name. Default is the stat axis
for the given data type. For `Series` this parameter is unused and defaults to `None`.
6025 ignore_index : bool, default False
6026 If True, the resulting index will be labeled 0, 1, …, n - 1.
6027
6028 .. versionadded:: 1.3.0
6029
6030 Returns
6031 -------
6032 Series or DataFrame
6033 A new object of same type as caller containing `n` items randomly
6034 sampled from the caller object.
6035
6036 See Also
6037 --------
6038 DataFrameGroupBy.sample: Generates random samples from each group of a
6039 DataFrame object.
6040 SeriesGroupBy.sample: Generates random samples from each group of a
6041 Series object.
6042 numpy.random.choice: Generates a random sample from a given 1-D numpy
6043 array.
6044
6045 Notes
6046 -----
If `frac` > 1, `replace` should be set to `True`.
6048
6049 Examples
6050 --------
6051 >>> df = pd.DataFrame({'num_legs': [2, 4, 8, 0],
6052 ... 'num_wings': [2, 0, 0, 0],
6053 ... 'num_specimen_seen': [10, 2, 1, 8]},
6054 ... index=['falcon', 'dog', 'spider', 'fish'])
6055 >>> df
6056 num_legs num_wings num_specimen_seen
6057 falcon 2 2 10
6058 dog 4 0 2
6059 spider 8 0 1
6060 fish 0 0 8
6061
6062 Extract 3 random elements from the ``Series`` ``df['num_legs']``:
6063 Note that we use `random_state` to ensure the reproducibility of
6064 the examples.
6065
6066 >>> df['num_legs'].sample(n=3, random_state=1)
6067 fish 0
6068 spider 8
6069 falcon 2
6070 Name: num_legs, dtype: int64
6071
6072 A random 50% sample of the ``DataFrame`` with replacement:
6073
6074 >>> df.sample(frac=0.5, replace=True, random_state=1)
6075 num_legs num_wings num_specimen_seen
6076 dog 4 0 2
6077 fish 0 0 8
6078
An upsampled sample of the ``DataFrame`` with replacement:
Note that the `replace` parameter has to be `True` when `frac` > 1.
6081
6082 >>> df.sample(frac=2, replace=True, random_state=1)
6083 num_legs num_wings num_specimen_seen
6084 dog 4 0 2
6085 fish 0 0 8
6086 falcon 2 2 10
6087 falcon 2 2 10
6088 fish 0 0 8
6089 dog 4 0 2
6090 fish 0 0 8
6091 dog 4 0 2
6092
6093 Using a DataFrame column as weights. Rows with larger value in the
6094 `num_specimen_seen` column are more likely to be sampled.
6095
6096 >>> df.sample(n=2, weights='num_specimen_seen', random_state=1)
6097 num_legs num_wings num_specimen_seen
6098 falcon 2 2 10
6099 fish 0 0 8
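
Columns can be sampled instead of rows by passing ``axis=1``; the
output is omitted here because which column is drawn is a detail of
the underlying sampler:

>>> df.sample(n=1, axis=1, random_state=1)  # doctest: +SKIP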
6100 """ # noqa: E501
6101 if axis is None:
6102 axis = 0
6103
6104 axis = self._get_axis_number(axis)
6105 obj_len = self.shape[axis]
6106
6107 # Process random_state argument
6108 rs = common.random_state(random_state)
6109
6110 size = sample.process_sampling_size(n, frac, replace)
6111 if size is None:
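# process_sampling_size returned None, meaning `frac` (not `n`) was
# given; convert the fraction of the axis length into an integer count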
6112 assert frac is not None
6113 size = round(frac * obj_len)
6114
6115 if weights is not None:
6116 weights = sample.preprocess_weights(self, weights, axis)
6117
6118 sampled_indices = sample.sample(obj_len, size, replace, weights, rs)
6119 result = self.take(sampled_indices, axis=axis)
6120
6121 if ignore_index:
6122 result.index = default_index(len(result))
6123
6124 return result
6125
6126 @final
6127 @doc(klass=_shared_doc_kwargs["klass"])
6128 def pipe(
6129 self,
6130 func: Callable[..., T] | tuple[Callable[..., T], str],
6131 *args,
6132 **kwargs,
6133 ) -> T:
6134 r"""
6135 Apply chainable functions that expect Series or DataFrames.
6136
6137 Parameters
6138 ----------
6139 func : function
6140 Function to apply to the {klass}.
6141 ``args``, and ``kwargs`` are passed into ``func``.
6142 Alternatively a ``(callable, data_keyword)`` tuple where
6143 ``data_keyword`` is a string indicating the keyword of
6144 ``callable`` that expects the {klass}.
6145 *args : iterable, optional
6146 Positional arguments passed into ``func``.
6147 **kwargs : mapping, optional
6148 A dictionary of keyword arguments passed into ``func``.
6149
6150 Returns
6151 -------
6152 the return type of ``func``.
6153
6154 See Also
6155 --------
6156 DataFrame.apply : Apply a function along input axis of DataFrame.
6157 DataFrame.map : Apply a function elementwise on a whole DataFrame.
6158 Series.map : Apply a mapping correspondence on a
6159 :class:`~pandas.Series`.
6160
6161 Notes
6162 -----
6163 Use ``.pipe`` when chaining together functions that expect
6164 Series, DataFrames or GroupBy objects.
6165
6166 Examples
6167 --------
Constructing an income DataFrame from a dictionary.
6169
6170 >>> data = [[8000, 1000], [9500, np.nan], [5000, 2000]]
6171 >>> df = pd.DataFrame(data, columns=['Salary', 'Others'])
6172 >>> df
6173 Salary Others
6174 0 8000 1000.0
6175 1 9500 NaN
6176 2 5000 2000.0
6177
6178 Functions that perform tax reductions on an income DataFrame.
6179
6180 >>> def subtract_federal_tax(df):
6181 ... return df * 0.9
6182 >>> def subtract_state_tax(df, rate):
6183 ... return df * (1 - rate)
6184 >>> def subtract_national_insurance(df, rate, rate_increase):
6185 ... new_rate = rate + rate_increase
6186 ... return df * (1 - new_rate)
6187
6188 Instead of writing
6189
6190 >>> subtract_national_insurance(
6191 ... subtract_state_tax(subtract_federal_tax(df), rate=0.12),
6192 ... rate=0.05,
6193 ... rate_increase=0.02) # doctest: +SKIP
6194
6195 You can write
6196
6197 >>> (
6198 ... df.pipe(subtract_federal_tax)
6199 ... .pipe(subtract_state_tax, rate=0.12)
6200 ... .pipe(subtract_national_insurance, rate=0.05, rate_increase=0.02)
6201 ... )
6202 Salary Others
6203 0 5892.48 736.56
6204 1 6997.32 NaN
6205 2 3682.80 1473.12
6206
6207 If you have a function that takes the data as (say) the second
6208 argument, pass a tuple indicating which keyword expects the
6209 data. For example, suppose ``national_insurance`` takes its data as ``df``
6210 in the second argument:
6211
6212 >>> def subtract_national_insurance(rate, df, rate_increase):
6213 ... new_rate = rate + rate_increase
6214 ... return df * (1 - new_rate)
6215 >>> (
6216 ... df.pipe(subtract_federal_tax)
6217 ... .pipe(subtract_state_tax, rate=0.12)
6218 ... .pipe(
6219 ... (subtract_national_insurance, 'df'),
6220 ... rate=0.05,
6221 ... rate_increase=0.02
6222 ... )
6223 ... )
6224 Salary Others
6225 0 5892.48 736.56
6226 1 6997.32 NaN
6227 2 3682.80 1473.12
6228 """
6229 if using_copy_on_write():
6230 return common.pipe(self.copy(deep=None), func, *args, **kwargs)
6231 return common.pipe(self, func, *args, **kwargs)
6232
6233 # ----------------------------------------------------------------------
6234 # Attribute access
6235
6236 @final
6237 def __finalize__(self, other, method: str | None = None, **kwargs) -> Self:
6238 """
6239 Propagate metadata from other to self.
6240
6241 Parameters
6242 ----------
6243 other : the object from which to get the attributes that we are going
6244 to propagate
6245 method : str, optional
6246 A passed method name providing context on where ``__finalize__``
6247 was called.
6248
6249 .. warning::
6250
The value passed as `method` is not currently considered
stable across pandas releases.
6253 """
6254 if isinstance(other, NDFrame):
6255 if other.attrs:
6256 # We want attrs propagation to have minimal performance
6257 # impact if attrs are not used; i.e. attrs is an empty dict.
6258 # One could make the deepcopy unconditionally, but a deepcopy
6259 # of an empty dict is 50x more expensive than the empty check.
6260 self.attrs = deepcopy(other.attrs)
6261
6262 self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels
6263 # For subclasses using _metadata.
6264 for name in set(self._metadata) & set(other._metadata):
6265 assert isinstance(name, str)
6266 object.__setattr__(self, name, getattr(other, name, None))
6267
6268 if method == "concat":
6269 # propagate attrs only if all concat arguments have the same attrs
6270 if all(bool(obj.attrs) for obj in other.objs):
6271 # all concatenate arguments have non-empty attrs
6272 attrs = other.objs[0].attrs
6273 have_same_attrs = all(obj.attrs == attrs for obj in other.objs[1:])
6274 if have_same_attrs:
6275 self.attrs = deepcopy(attrs)
6276
6277 allows_duplicate_labels = all(
6278 x.flags.allows_duplicate_labels for x in other.objs
6279 )
6280 self.flags.allows_duplicate_labels = allows_duplicate_labels
6281
6282 return self
6283
6284 @final
6285 def __getattr__(self, name: str):
6286 """
After regular attribute access, try looking up the name.
This allows simpler access to columns for interactive use.
6289 """
6290 # Note: obj.x will always call obj.__getattribute__('x') prior to
6291 # calling obj.__getattr__('x').
6292 if (
6293 name not in self._internal_names_set
6294 and name not in self._metadata
6295 and name not in self._accessors
6296 and self._info_axis._can_hold_identifiers_and_holds_name(name)
6297 ):
6298 return self[name]
6299 return object.__getattribute__(self, name)
6300
6301 @final
6302 def __setattr__(self, name: str, value) -> None:
6303 """
After regular attribute access, try setting the name.
This allows simpler access to columns for interactive use.
6306 """
6307 # first try regular attribute access via __getattribute__, so that
6308 # e.g. ``obj.x`` and ``obj.x = 4`` will always reference/modify
6309 # the same attribute.
6310
6311 try:
6312 object.__getattribute__(self, name)
6313 return object.__setattr__(self, name, value)
6314 except AttributeError:
6315 pass
6316
6317 # if this fails, go on to more involved attribute setting
6318 # (note that this matches __getattr__, above).
6319 if name in self._internal_names_set:
6320 object.__setattr__(self, name, value)
6321 elif name in self._metadata:
6322 object.__setattr__(self, name, value)
6323 else:
6324 try:
6325 existing = getattr(self, name)
6326 if isinstance(existing, Index):
6327 object.__setattr__(self, name, value)
6328 elif name in self._info_axis:
6329 self[name] = value
6330 else:
6331 object.__setattr__(self, name, value)
6332 except (AttributeError, TypeError):
6333 if isinstance(self, ABCDataFrame) and (is_list_like(value)):
6334 warnings.warn(
6335 "Pandas doesn't allow columns to be "
6336 "created via a new attribute name - see "
6337 "https://pandas.pydata.org/pandas-docs/"
6338 "stable/indexing.html#attribute-access",
6339 stacklevel=find_stack_level(),
6340 )
6341 object.__setattr__(self, name, value)
6342
6343 @final
6344 def _dir_additions(self) -> set[str]:
6345 """
6346 add the string-like attributes from the info_axis.
6347 If info_axis is a MultiIndex, its first level values are used.
6348 """
6349 additions = super()._dir_additions()
6350 if self._info_axis._can_hold_strings:
6351 additions.update(self._info_axis._dir_additions_for_owner)
6352 return additions
6353
6354 # ----------------------------------------------------------------------
6355 # Consolidation of internals
6356
6357 @final
6358 def _protect_consolidate(self, f):
6359 """
6360 Consolidate _mgr -- if the blocks have changed, then clear the
6361 cache
6362 """
6363 if isinstance(self._mgr, (ArrayManager, SingleArrayManager)):
6364 return f()
6365 blocks_before = len(self._mgr.blocks)
6366 result = f()
6367 if len(self._mgr.blocks) != blocks_before:
6368 self._clear_item_cache()
6369 return result
6370
6371 @final
6372 def _consolidate_inplace(self) -> None:
6373 """Consolidate data in place and return None"""
6374
6375 def f() -> None:
6376 self._mgr = self._mgr.consolidate()
6377
6378 self._protect_consolidate(f)
6379
6380 @final
6381 def _consolidate(self):
6382 """
6383 Compute NDFrame with "consolidated" internals (data of each dtype
6384 grouped together in a single ndarray).
6385
6386 Returns
6387 -------
6388 consolidated : same type as caller
6389 """
6390 f = lambda: self._mgr.consolidate()
6391 cons_data = self._protect_consolidate(f)
6392 return self._constructor_from_mgr(cons_data, axes=cons_data.axes).__finalize__(
6393 self
6394 )
6395
6396 @final
6397 @property
6398 def _is_mixed_type(self) -> bool_t:
6399 if self._mgr.is_single_block:
6400 # Includes all Series cases
6401 return False
6402
6403 if self._mgr.any_extension_types:
6404 # Even if they have the same dtype, we can't consolidate them,
# so we pretend this is "mixed"
6406 return True
6407
6408 return self.dtypes.nunique() > 1
6409
6410 @final
6411 def _get_numeric_data(self) -> Self:
6412 new_mgr = self._mgr.get_numeric_data()
6413 return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self)
6414
6415 @final
6416 def _get_bool_data(self):
6417 new_mgr = self._mgr.get_bool_data()
6418 return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self)
6419
6420 # ----------------------------------------------------------------------
6421 # Internal Interface Methods
6422
6423 @property
6424 def values(self):
6425 raise AbstractMethodError(self)
6426
6427 @property
6428 def _values(self) -> ArrayLike:
6429 """internal implementation"""
6430 raise AbstractMethodError(self)
6431
6432 @property
6433 def dtypes(self):
6434 """
6435 Return the dtypes in the DataFrame.
6436
6437 This returns a Series with the data type of each column.
6438 The result's index is the original DataFrame's columns. Columns
6439 with mixed types are stored with the ``object`` dtype. See
6440 :ref:`the User Guide <basics.dtypes>` for more.
6441
6442 Returns
6443 -------
6444 pandas.Series
6445 The data type of each column.
6446
6447 Examples
6448 --------
6449 >>> df = pd.DataFrame({'float': [1.0],
6450 ... 'int': [1],
6451 ... 'datetime': [pd.Timestamp('20180310')],
6452 ... 'string': ['foo']})
6453 >>> df.dtypes
6454 float float64
6455 int int64
6456 datetime datetime64[ns]
6457 string object
6458 dtype: object
6459 """
6460 data = self._mgr.get_dtypes()
6461 return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_)
6462
6463 @final
6464 def astype(
6465 self, dtype, copy: bool_t | None = None, errors: IgnoreRaise = "raise"
6466 ) -> Self:
6467 """
6468 Cast a pandas object to a specified dtype ``dtype``.
6469
6470 Parameters
6471 ----------
6472 dtype : str, data type, Series or Mapping of column name -> data type
6473 Use a str, numpy.dtype, pandas.ExtensionDtype or Python type to
6474 cast entire pandas object to the same type. Alternatively, use a
6475 mapping, e.g. {col: dtype, ...}, where col is a column label and dtype is
6476 a numpy.dtype or Python type to cast one or more of the DataFrame's
6477 columns to column-specific types.
6478 copy : bool, default True
6479 Return a copy when ``copy=True`` (be very careful setting
6480 ``copy=False`` as changes to values then may propagate to other
6481 pandas objects).
6482
6483 .. note::
6484 The `copy` keyword will change behavior in pandas 3.0.
6485 `Copy-on-Write
6486 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
6487 will be enabled by default, which means that all methods with a
6488 `copy` keyword will use a lazy copy mechanism to defer the copy and
6489 ignore the `copy` keyword. The `copy` keyword will be removed in a
6490 future version of pandas.
6491
6492 You can already get the future behavior and improvements through
6493 enabling copy on write ``pd.options.mode.copy_on_write = True``
6494 errors : {'raise', 'ignore'}, default 'raise'
6495 Control raising of exceptions on invalid data for provided dtype.
6496
6497 - ``raise`` : allow exceptions to be raised
6498 - ``ignore`` : suppress exceptions. On error return original object.
6499
6500 Returns
6501 -------
6502 same type as caller
6503
6504 See Also
6505 --------
6506 to_datetime : Convert argument to datetime.
6507 to_timedelta : Convert argument to timedelta.
6508 to_numeric : Convert argument to a numeric type.
6509 numpy.ndarray.astype : Cast a numpy array to a specified type.
6510
6511 Notes
6512 -----
6513 .. versionchanged:: 2.0.0
6514
6515 Using ``astype`` to convert from timezone-naive dtype to
6516 timezone-aware dtype will raise an exception.
6517 Use :meth:`Series.dt.tz_localize` instead.
6518
6519 Examples
6520 --------
6521 Create a DataFrame:
6522
6523 >>> d = {'col1': [1, 2], 'col2': [3, 4]}
6524 >>> df = pd.DataFrame(data=d)
6525 >>> df.dtypes
6526 col1 int64
6527 col2 int64
6528 dtype: object
6529
6530 Cast all columns to int32:
6531
6532 >>> df.astype('int32').dtypes
6533 col1 int32
6534 col2 int32
6535 dtype: object
6536
6537 Cast col1 to int32 using a dictionary:
6538
6539 >>> df.astype({'col1': 'int32'}).dtypes
6540 col1 int32
6541 col2 int64
6542 dtype: object
6543
6544 Create a series:
6545
6546 >>> ser = pd.Series([1, 2], dtype='int32')
6547 >>> ser
6548 0 1
6549 1 2
6550 dtype: int32
6551 >>> ser.astype('int64')
6552 0 1
6553 1 2
6554 dtype: int64
6555
6556 Convert to categorical type:
6557
6558 >>> ser.astype('category')
6559 0 1
6560 1 2
6561 dtype: category
6562 Categories (2, int32): [1, 2]
6563
6564 Convert to ordered categorical type with custom ordering:
6565
6566 >>> from pandas.api.types import CategoricalDtype
6567 >>> cat_dtype = CategoricalDtype(
6568 ... categories=[2, 1], ordered=True)
6569 >>> ser.astype(cat_dtype)
6570 0 1
6571 1 2
6572 dtype: category
6573 Categories (2, int64): [2 < 1]
6574
6575 Create a series of dates:
6576
6577 >>> ser_date = pd.Series(pd.date_range('20200101', periods=3))
6578 >>> ser_date
6579 0 2020-01-01
6580 1 2020-01-02
6581 2 2020-01-03
6582 dtype: datetime64[ns]
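
With ``errors='ignore'`` a failed cast returns the original object
unchanged (a minimal sketch of the documented passthrough behavior):

>>> ser = pd.Series(['1', 'two'])
>>> ser.astype('int64', errors='ignore')  # doctest: +SKIP
0 1
1 two
dtype: object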
6583 """
6584 if copy and using_copy_on_write():
6585 copy = False
6586
6587 if is_dict_like(dtype):
6588 if self.ndim == 1: # i.e. Series
6589 if len(dtype) > 1 or self.name not in dtype:
6590 raise KeyError(
6591 "Only the Series name can be used for "
6592 "the key in Series dtype mappings."
6593 )
6594 new_type = dtype[self.name]
6595 return self.astype(new_type, copy, errors)
6596
# GH#44417 cast to Series so we can use .iat below, which will be
# robust in case we have duplicate column names
6599 from pandas import Series
6600
6601 dtype_ser = Series(dtype, dtype=object)
6602
6603 for col_name in dtype_ser.index:
6604 if col_name not in self:
6605 raise KeyError(
6606 "Only a column name can be used for the "
6607 "key in a dtype mappings argument. "
6608 f"'{col_name}' not found in columns."
6609 )
6610
6611 dtype_ser = dtype_ser.reindex(self.columns, fill_value=None, copy=False)
6612
6613 results = []
6614 for i, (col_name, col) in enumerate(self.items()):
6615 cdt = dtype_ser.iat[i]
6616 if isna(cdt):
6617 res_col = col.copy(deep=copy)
6618 else:
6619 try:
6620 res_col = col.astype(dtype=cdt, copy=copy, errors=errors)
6621 except ValueError as ex:
6622 ex.args = (
6623 f"{ex}: Error while type casting for column '{col_name}'",
6624 )
6625 raise
6626 results.append(res_col)
6627
6628 elif is_extension_array_dtype(dtype) and self.ndim > 1:
6629 # TODO(EA2D): special case not needed with 2D EAs
6630 dtype = pandas_dtype(dtype)
6631 if isinstance(dtype, ExtensionDtype) and all(
6632 arr.dtype == dtype for arr in self._mgr.arrays
6633 ):
6634 return self.copy(deep=copy)
6635 # GH 18099/22869: columnwise conversion to extension dtype
6636 # GH 24704: self.items handles duplicate column names
6637 results = [
6638 ser.astype(dtype, copy=copy, errors=errors) for _, ser in self.items()
6639 ]
6640
6641 else:
6642 # else, only a single dtype is given
6643 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
6644 res = self._constructor_from_mgr(new_data, axes=new_data.axes)
6645 return res.__finalize__(self, method="astype")
6646
6647 # GH 33113: handle empty frame or series
6648 if not results:
6649 return self.copy(deep=None)
6650
6651 # GH 19920: retain column metadata after concat
6652 result = concat(results, axis=1, copy=False)
6653 # GH#40810 retain subclass
6654 # error: Incompatible types in assignment
6655 # (expression has type "Self", variable has type "DataFrame")
6656 result = self._constructor(result) # type: ignore[assignment]
6657 result.columns = self.columns
6658 result = result.__finalize__(self, method="astype")
6659 # https://github.com/python/mypy/issues/8354
6660 return cast(Self, result)
6661
6662 @final
6663 def copy(self, deep: bool_t | None = True) -> Self:
6664 """
6665 Make a copy of this object's indices and data.
6666
6667 When ``deep=True`` (default), a new object will be created with a
6668 copy of the calling object's data and indices. Modifications to
6669 the data or indices of the copy will not be reflected in the
6670 original object (see notes below).
6671
6672 When ``deep=False``, a new object will be created without copying
6673 the calling object's data or index (only references to the data
6674 and index are copied). Any changes to the data of the original
6675 will be reflected in the shallow copy (and vice versa).
6676
6677 .. note::
6678 The ``deep=False`` behaviour as described above will change
6679 in pandas 3.0. `Copy-on-Write
6680 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
6681 will be enabled by default, which means that the "shallow" copy
that is returned with ``deep=False`` will still avoid making
6683 an eager copy, but changes to the data of the original will *no*
6684 longer be reflected in the shallow copy (or vice versa). Instead,
6685 it makes use of a lazy (deferred) copy mechanism that will copy
the data only when any changes to the original or shallow copy are
6687 made.
6688
6689 You can already get the future behavior and improvements through
6690 enabling copy on write ``pd.options.mode.copy_on_write = True``
6691
6692 Parameters
6693 ----------
6694 deep : bool, default True
6695 Make a deep copy, including a copy of the data and the indices.
6696 With ``deep=False`` neither the indices nor the data are copied.
6697
6698 Returns
6699 -------
6700 Series or DataFrame
6701 Object type matches caller.
6702
6703 Notes
6704 -----
6705 When ``deep=True``, data is copied but actual Python objects
6706 will not be copied recursively, only the reference to the object.
6707 This is in contrast to `copy.deepcopy` in the Standard Library,
6708 which recursively copies object data (see examples below).
6709
6710 While ``Index`` objects are copied when ``deep=True``, the underlying
6711 numpy array is not copied for performance reasons. Since ``Index`` is
6712 immutable, the underlying data can be safely shared and a copy
6713 is not needed.
6714
6715 Since pandas is not thread safe, see the
6716 :ref:`gotchas <gotchas.thread-safety>` when copying in a threading
6717 environment.
6718
6719 When ``copy_on_write`` in pandas config is set to ``True``, the
6720 ``copy_on_write`` config takes effect even when ``deep=False``.
6721 This means that any changes to the copied data would make a new copy
6722 of the data upon write (and vice versa). Changes made to either the
6723 original or copied variable would not be reflected in the counterpart.
6724 See :ref:`Copy_on_Write <copy_on_write>` for more information.
6725
6726 Examples
6727 --------
6728 >>> s = pd.Series([1, 2], index=["a", "b"])
6729 >>> s
6730 a 1
6731 b 2
6732 dtype: int64
6733
6734 >>> s_copy = s.copy()
6735 >>> s_copy
6736 a 1
6737 b 2
6738 dtype: int64
6739
6740 **Shallow copy versus default (deep) copy:**
6741
6742 >>> s = pd.Series([1, 2], index=["a", "b"])
6743 >>> deep = s.copy()
6744 >>> shallow = s.copy(deep=False)
6745
6746 Shallow copy shares data and index with original.
6747
6748 >>> s is shallow
6749 False
6750 >>> s.values is shallow.values and s.index is shallow.index
6751 True
6752
6753 Deep copy has own copy of data and index.
6754
6755 >>> s is deep
6756 False
6757 >>> s.values is deep.values or s.index is deep.index
6758 False
6759
Updates to the data shared by shallow copy and original are reflected
6761 in both (NOTE: this will no longer be true for pandas >= 3.0);
6762 deep copy remains unchanged.
6763
6764 >>> s.iloc[0] = 3
6765 >>> shallow.iloc[1] = 4
6766 >>> s
6767 a 3
6768 b 4
6769 dtype: int64
6770 >>> shallow
6771 a 3
6772 b 4
6773 dtype: int64
6774 >>> deep
6775 a 1
6776 b 2
6777 dtype: int64
6778
6779 Note that when copying an object containing Python objects, a deep copy
6780 will copy the data, but will not do so recursively. Updating a nested
6781 data object will be reflected in the deep copy.
6782
6783 >>> s = pd.Series([[1, 2], [3, 4]])
6784 >>> deep = s.copy()
6785 >>> s[0][0] = 10
6786 >>> s
6787 0 [10, 2]
6788 1 [3, 4]
6789 dtype: object
6790 >>> deep
6791 0 [10, 2]
6792 1 [3, 4]
6793 dtype: object
6794
**When Copy-on-Write is set to true**, the shallow copy is not modified
6796 when the original data is changed:
6797
6798 >>> with pd.option_context("mode.copy_on_write", True):
6799 ... s = pd.Series([1, 2], index=["a", "b"])
6800 ... copy = s.copy(deep=False)
6801 ... s.iloc[0] = 100
6802 ... s
6803 a 100
6804 b 2
6805 dtype: int64
6806 >>> copy
6807 a 1
6808 b 2
6809 dtype: int64
6810 """
6811 data = self._mgr.copy(deep=deep)
6812 self._clear_item_cache()
6813 return self._constructor_from_mgr(data, axes=data.axes).__finalize__(
6814 self, method="copy"
6815 )
6816
6817 @final
6818 def __copy__(self, deep: bool_t = True) -> Self:
6819 return self.copy(deep=deep)
6820
6821 @final
6822 def __deepcopy__(self, memo=None) -> Self:
6823 """
6824 Parameters
6825 ----------
6826 memo, default None
6827 Standard signature. Unused
6828 """
6829 return self.copy(deep=True)
6830
6831 @final
6832 def infer_objects(self, copy: bool_t | None = None) -> Self:
6833 """
6834 Attempt to infer better dtypes for object columns.
6835
6836 Attempts soft conversion of object-dtyped
6837 columns, leaving non-object and unconvertible
6838 columns unchanged. The inference rules are the
6839 same as during normal Series/DataFrame construction.
6840
6841 Parameters
6842 ----------
6843 copy : bool, default True
6844 Whether to make a copy for non-object or non-inferable columns
6845 or Series.
6846
6847 .. note::
6848 The `copy` keyword will change behavior in pandas 3.0.
6849 `Copy-on-Write
6850 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
6851 will be enabled by default, which means that all methods with a
6852 `copy` keyword will use a lazy copy mechanism to defer the copy and
6853 ignore the `copy` keyword. The `copy` keyword will be removed in a
6854 future version of pandas.
6855
6856 You can already get the future behavior and improvements through
6857 enabling copy on write ``pd.options.mode.copy_on_write = True``
6858
6859 Returns
6860 -------
6861 same type as input object
6862
6863 See Also
6864 --------
6865 to_datetime : Convert argument to datetime.
6866 to_timedelta : Convert argument to timedelta.
6867 to_numeric : Convert argument to numeric type.
6868 convert_dtypes : Convert argument to best possible dtype.
6869
6870 Examples
6871 --------
6872 >>> df = pd.DataFrame({"A": ["a", 1, 2, 3]})
6873 >>> df = df.iloc[1:]
6874 >>> df
6875 A
6876 1 1
6877 2 2
6878 3 3
6879
6880 >>> df.dtypes
6881 A object
6882 dtype: object
6883
6884 >>> df.infer_objects().dtypes
6885 A int64
6886 dtype: object
6887 """
6888 new_mgr = self._mgr.convert(copy=copy)
6889 res = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
6890 return res.__finalize__(self, method="infer_objects")
6891
6892 @final
6893 def convert_dtypes(
6894 self,
6895 infer_objects: bool_t = True,
6896 convert_string: bool_t = True,
6897 convert_integer: bool_t = True,
6898 convert_boolean: bool_t = True,
6899 convert_floating: bool_t = True,
6900 dtype_backend: DtypeBackend = "numpy_nullable",
6901 ) -> Self:
6902 """
6903 Convert columns to the best possible dtypes using dtypes supporting ``pd.NA``.
6904
6905 Parameters
6906 ----------
6907 infer_objects : bool, default True
6908 Whether object dtypes should be converted to the best possible types.
6909 convert_string : bool, default True
6910 Whether object dtypes should be converted to ``StringDtype()``.
6911 convert_integer : bool, default True
6912 Whether, if possible, conversion can be done to integer extension types.
convert_boolean : bool, default True
Whether object dtypes should be converted to ``BooleanDtype()``.
convert_floating : bool, default True
Whether, if possible, conversion can be done to floating extension types.
If `convert_integer` is also True, preference will be given to integer
dtypes if the floats can be faithfully cast to integers.
6919 dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
6920 Back-end data type applied to the resultant :class:`DataFrame`
6921 (still experimental). Behaviour is as follows:
6922
6923 * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
6924 (default).
6925 * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
6926 DataFrame.
6927
6928 .. versionadded:: 2.0
6929
6930 Returns
6931 -------
6932 Series or DataFrame
6933 Copy of input object with new dtype.
6934
6935 See Also
6936 --------
6937 infer_objects : Infer dtypes of objects.
6938 to_datetime : Convert argument to datetime.
6939 to_timedelta : Convert argument to timedelta.
6940 to_numeric : Convert argument to a numeric type.
6941
6942 Notes
6943 -----
6944 By default, ``convert_dtypes`` will attempt to convert a Series (or each
6945 Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options
6946 ``convert_string``, ``convert_integer``, ``convert_boolean`` and
6947 ``convert_floating``, it is possible to turn off individual conversions
6948 to ``StringDtype``, the integer extension types, ``BooleanDtype``
6949 or floating extension types, respectively.
6950
6951 For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference
6952 rules as during normal Series/DataFrame construction. Then, if possible,
6953 convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer
6954 or floating extension type, otherwise leave as ``object``.
6955
6956 If the dtype is integer, convert to an appropriate integer extension type.
6957
6958 If the dtype is numeric, and consists of all integers, convert to an
6959 appropriate integer extension type. Otherwise, convert to an
6960 appropriate floating extension type.
6961
6962 In the future, as new dtypes are added that support ``pd.NA``, the results
6963 of this method will change to support those new dtypes.
6964
6965 Examples
6966 --------
6967 >>> df = pd.DataFrame(
6968 ... {
6969 ... "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
6970 ... "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
6971 ... "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")),
6972 ... "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")),
6973 ... "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")),
6974 ... "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
6975 ... }
6976 ... )
6977
6978 Start with a DataFrame with default dtypes.
6979
6980 >>> df
6981 a b c d e f
6982 0 1 x True h 10.0 NaN
6983 1 2 y False i NaN 100.5
6984 2 3 z NaN NaN 20.0 200.0
6985
6986 >>> df.dtypes
6987 a int32
6988 b object
6989 c object
6990 d object
6991 e float64
6992 f float64
6993 dtype: object
6994
6995 Convert the DataFrame to use best possible dtypes.
6996
6997 >>> dfn = df.convert_dtypes()
6998 >>> dfn
6999 a b c d e f
7000 0 1 x True h 10 <NA>
7001 1 2 y False i <NA> 100.5
7002 2 3 z <NA> <NA> 20 200.0
7003
7004 >>> dfn.dtypes
7005 a Int32
7006 b string[python]
7007 c boolean
7008 d string[python]
7009 e Int64
7010 f Float64
7011 dtype: object
7012
7013 Start with a Series of strings and missing data represented by ``np.nan``.
7014
7015 >>> s = pd.Series(["a", "b", np.nan])
7016 >>> s
7017 0 a
7018 1 b
7019 2 NaN
7020 dtype: object
7021
7022 Obtain a Series with dtype ``StringDtype``.
7023
7024 >>> s.convert_dtypes()
7025 0 a
7026 1 b
7027 2 <NA>
7028 dtype: string
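
To get pyarrow-backed dtypes instead, pass ``dtype_backend="pyarrow"``
(a sketch only; this requires the optional ``pyarrow`` dependency):

>>> s.convert_dtypes(dtype_backend="pyarrow")  # doctest: +SKIP
0 a
1 b
2 <NA>
dtype: string[pyarrow]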
7029 """
7030 check_dtype_backend(dtype_backend)
7031 new_mgr = self._mgr.convert_dtypes( # type: ignore[union-attr]
7032 infer_objects=infer_objects,
7033 convert_string=convert_string,
7034 convert_integer=convert_integer,
7035 convert_boolean=convert_boolean,
7036 convert_floating=convert_floating,
7037 dtype_backend=dtype_backend,
7038 )
7039 res = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
7040 return res.__finalize__(self, method="convert_dtypes")
7041
7042 # ----------------------------------------------------------------------
7043 # Filling NA's
7044
7045 def _deprecate_downcast(self, downcast, method_name: str):
7046 # GH#40988
7047 if downcast is not lib.no_default:
7048 warnings.warn(
7049 f"The 'downcast' keyword in {method_name} is deprecated and "
7050 "will be removed in a future version. Use "
7051 "res.infer_objects(copy=False) to infer non-object dtype, or "
7052 "pd.to_numeric with the 'downcast' keyword to downcast numeric "
7053 "results.",
7054 FutureWarning,
7055 stacklevel=find_stack_level(),
7056 )
7057 else:
7058 downcast = None
7059 return downcast
7060
7061 @final
7062 def _pad_or_backfill(
7063 self,
7064 method: Literal["ffill", "bfill", "pad", "backfill"],
7065 *,
7066 axis: None | Axis = None,
7067 inplace: bool_t = False,
7068 limit: None | int = None,
7069 limit_area: Literal["inside", "outside"] | None = None,
7070 downcast: dict | None = None,
7071 ):
7072 if axis is None:
7073 axis = 0
7074 axis = self._get_axis_number(axis)
7075 method = clean_fill_method(method)
7076
7077 if not self._mgr.is_single_block and axis == 1:
7078 # e.g. test_align_fill_method
7079 # TODO(3.0): once downcast is removed, we can do the .T
# in all axis=1 cases, and remove the axis kwarg from mgr.pad_or_backfill.
7081 if inplace:
7082 raise NotImplementedError()
7083 result = self.T._pad_or_backfill(
7084 method=method, limit=limit, limit_area=limit_area
7085 ).T
7086
7087 return result
7088
7089 new_mgr = self._mgr.pad_or_backfill(
7090 method=method,
7091 axis=self._get_block_manager_axis(axis),
7092 limit=limit,
7093 limit_area=limit_area,
7094 inplace=inplace,
7095 downcast=downcast,
7096 )
7097 result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
7098 if inplace:
7099 return self._update_inplace(result)
7100 else:
7101 return result.__finalize__(self, method="fillna")
7102
7103 @overload
7104 def fillna(
7105 self,
7106 value: Hashable | Mapping | Series | DataFrame = ...,
7107 *,
7108 method: FillnaOptions | None = ...,
7109 axis: Axis | None = ...,
7110 inplace: Literal[False] = ...,
7111 limit: int | None = ...,
7112 downcast: dict | None = ...,
7113 ) -> Self:
7114 ...
7115
7116 @overload
7117 def fillna(
7118 self,
7119 value: Hashable | Mapping | Series | DataFrame = ...,
7120 *,
7121 method: FillnaOptions | None = ...,
7122 axis: Axis | None = ...,
7123 inplace: Literal[True],
7124 limit: int | None = ...,
7125 downcast: dict | None = ...,
7126 ) -> None:
7127 ...
7128
7129 @overload
7130 def fillna(
7131 self,
7132 value: Hashable | Mapping | Series | DataFrame = ...,
7133 *,
7134 method: FillnaOptions | None = ...,
7135 axis: Axis | None = ...,
7136 inplace: bool_t = ...,
7137 limit: int | None = ...,
7138 downcast: dict | None = ...,
7139 ) -> Self | None:
7140 ...
7141
7142 @final
7143 @doc(
7144 klass=_shared_doc_kwargs["klass"],
7145 axes_single_arg=_shared_doc_kwargs["axes_single_arg"],
7146 )
7147 def fillna(
7148 self,
7149 value: Hashable | Mapping | Series | DataFrame | None = None,
7150 *,
7151 method: FillnaOptions | None = None,
7152 axis: Axis | None = None,
7153 inplace: bool_t = False,
7154 limit: int | None = None,
7155 downcast: dict | None | lib.NoDefault = lib.no_default,
7156 ) -> Self | None:
7157 """
7158 Fill NA/NaN values using the specified method.
7159
7160 Parameters
7161 ----------
7162 value : scalar, dict, Series, or DataFrame
7163 Value to use to fill holes (e.g. 0), alternately a
7164 dict/Series/DataFrame of values specifying which value to use for
7165 each index (for a Series) or column (for a DataFrame). Values not
7166 in the dict/Series/DataFrame will not be filled. This value cannot
7167 be a list.
7168 method : {{'backfill', 'bfill', 'ffill', None}}, default None
7169 Method to use for filling holes in reindexed Series:
7170
7171 * ffill: propagate last valid observation forward to next valid.
7172 * backfill / bfill: use next valid observation to fill gap.
7173
7174 .. deprecated:: 2.1.0
7175 Use ffill or bfill instead.
7176
7177 axis : {axes_single_arg}
7178 Axis along which to fill missing values. For `Series`
7179 this parameter is unused and defaults to 0.
7180 inplace : bool, default False
7181 If True, fill in-place. Note: this will modify any
7182 other views on this object (e.g., a no-copy slice for a column in a
7183 DataFrame).
7184 limit : int, default None
7185 If method is specified, this is the maximum number of consecutive
7186 NaN values to forward/backward fill. In other words, if there is
7187 a gap with more than this number of consecutive NaNs, it will only
7188 be partially filled. If method is not specified, this is the
7189 maximum number of entries along the entire axis where NaNs will be
7190 filled. Must be greater than 0 if not None.
downcast : dict, default None
7192 A dict of item->dtype of what to downcast if possible,
7193 or the string 'infer' which will try to downcast to an appropriate
7194 equal type (e.g. float64 to int64 if possible).
7195
7196 .. deprecated:: 2.2.0
7197
7198 Returns
7199 -------
7200 {klass} or None
7201 Object with missing values filled or None if ``inplace=True``.
7202
7203 See Also
7204 --------
7205 ffill : Fill values by propagating the last valid observation to next valid.
7206 bfill : Fill values by using the next valid observation to fill the gap.
7207 interpolate : Fill NaN values using interpolation.
7208 reindex : Conform object to new index.
7209 asfreq : Convert TimeSeries to specified frequency.
7210
7211 Examples
7212 --------
7213 >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0],
7214 ... [3, 4, np.nan, 1],
7215 ... [np.nan, np.nan, np.nan, np.nan],
7216 ... [np.nan, 3, np.nan, 4]],
7217 ... columns=list("ABCD"))
7218 >>> df
7219 A B C D
7220 0 NaN 2.0 NaN 0.0
7221 1 3.0 4.0 NaN 1.0
7222 2 NaN NaN NaN NaN
7223 3 NaN 3.0 NaN 4.0
7224
7225 Replace all NaN elements with 0s.
7226
7227 >>> df.fillna(0)
7228 A B C D
7229 0 0.0 2.0 0.0 0.0
7230 1 3.0 4.0 0.0 1.0
7231 2 0.0 0.0 0.0 0.0
7232 3 0.0 3.0 0.0 4.0
7233
7234 Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1,
7235 2, and 3 respectively.
7236
7237 >>> values = {{"A": 0, "B": 1, "C": 2, "D": 3}}
7238 >>> df.fillna(value=values)
7239 A B C D
7240 0 0.0 2.0 2.0 0.0
7241 1 3.0 4.0 2.0 1.0
7242 2 0.0 1.0 2.0 3.0
7243 3 0.0 3.0 2.0 4.0
7244
7245 Only replace the first NaN element.
7246
7247 >>> df.fillna(value=values, limit=1)
7248 A B C D
7249 0 0.0 2.0 2.0 0.0
7250 1 3.0 4.0 NaN 1.0
7251 2 NaN 1.0 NaN 3.0
7252 3 NaN 3.0 NaN 4.0
7253
7254 When filling using a DataFrame, replacement happens along
the same column names and same indices.
7256
7257 >>> df2 = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCE"))
7258 >>> df.fillna(df2)
7259 A B C D
7260 0 0.0 2.0 0.0 0.0
7261 1 3.0 4.0 0.0 1.0
7262 2 0.0 0.0 0.0 NaN
7263 3 0.0 3.0 0.0 4.0
7264
7265 Note that column D is not affected since it is not present in df2.
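
For a ``Series``, a dict maps index labels to fill values; labels not
in the dict are left unfilled:

>>> ser = pd.Series([1.0, np.nan, np.nan], index=["a", "b", "c"])
>>> ser.fillna({"b": 2.0})
a 1.0
b 2.0
c NaN
dtype: float64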
7266 """
7267 inplace = validate_bool_kwarg(inplace, "inplace")
7268 if inplace:
7269 if not PYPY and using_copy_on_write():
7270 if sys.getrefcount(self) <= REF_COUNT:
7271 warnings.warn(
7272 _chained_assignment_method_msg,
7273 ChainedAssignmentError,
7274 stacklevel=2,
7275 )
7276 elif (
7277 not PYPY
7278 and not using_copy_on_write()
7279 and self._is_view_after_cow_rules()
7280 ):
7281 ctr = sys.getrefcount(self)
7282 ref_count = REF_COUNT
7283 if isinstance(self, ABCSeries) and _check_cacher(self):
7284 # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221
7285 ref_count += 1
7286 if ctr <= ref_count:
7287 warnings.warn(
7288 _chained_assignment_warning_method_msg,
7289 FutureWarning,
7290 stacklevel=2,
7291 )
7292
7293 value, method = validate_fillna_kwargs(value, method)
7294 if method is not None:
7295 warnings.warn(
7296 f"{type(self).__name__}.fillna with 'method' is deprecated and "
7297 "will raise in a future version. Use obj.ffill() or obj.bfill() "
7298 "instead.",
7299 FutureWarning,
7300 stacklevel=find_stack_level(),
7301 )
7302
7303 was_no_default = downcast is lib.no_default
7304 downcast = self._deprecate_downcast(downcast, "fillna")
7305
# set the default here, so functions examining the signature
7307 # can detect if something was set (e.g. in groupby) (GH9221)
7308 if axis is None:
7309 axis = 0
7310 axis = self._get_axis_number(axis)
7311
7312 if value is None:
7313 return self._pad_or_backfill(
7314 # error: Argument 1 to "_pad_or_backfill" of "NDFrame" has
7315 # incompatible type "Optional[Literal['backfill', 'bfill', 'ffill',
7316 # 'pad']]"; expected "Literal['ffill', 'bfill', 'pad', 'backfill']"
7317 method, # type: ignore[arg-type]
7318 axis=axis,
7319 limit=limit,
7320 inplace=inplace,
7321 # error: Argument "downcast" to "_fillna_with_method" of "NDFrame"
7322 # has incompatible type "Union[Dict[Any, Any], None,
7323 # Literal[_NoDefault.no_default]]"; expected
7324 # "Optional[Dict[Any, Any]]"
7325 downcast=downcast, # type: ignore[arg-type]
7326 )
7327 else:
7328 if self.ndim == 1:
7329 if isinstance(value, (dict, ABCSeries)):
7330 if not len(value):
7331 # test_fillna_nonscalar
7332 if inplace:
7333 return None
7334 return self.copy(deep=None)
7335 from pandas import Series
7336
7337 value = Series(value)
7338 value = value.reindex(self.index, copy=False)
7339 value = value._values
7340 elif not is_list_like(value):
7341 pass
7342 else:
7343 raise TypeError(
7344 '"value" parameter must be a scalar, dict '
7345 "or Series, but you passed a "
7346 f'"{type(value).__name__}"'
7347 )
7348
7349 new_data = self._mgr.fillna(
7350 value=value, limit=limit, inplace=inplace, downcast=downcast
7351 )
7352
7353 elif isinstance(value, (dict, ABCSeries)):
7354 if axis == 1:
7355 raise NotImplementedError(
7356 "Currently only can fill "
7357 "with dict/Series column "
7358 "by column"
7359 )
7360 if using_copy_on_write():
7361 result = self.copy(deep=None)
7362 else:
7363 result = self if inplace else self.copy()
7364 is_dict = isinstance(downcast, dict)
7365 for k, v in value.items():
7366 if k not in result:
7367 continue
7368
7369 if was_no_default:
7370 downcast_k = lib.no_default
7371 else:
7372 downcast_k = (
7373 # error: Incompatible types in assignment (expression
7374 # has type "Union[Dict[Any, Any], None,
7375 # Literal[_NoDefault.no_default], Any]", variable has
7376 # type "_NoDefault")
7377 downcast # type: ignore[assignment]
7378 if not is_dict
7379 # error: Item "None" of "Optional[Dict[Any, Any]]" has
7380 # no attribute "get"
7381 else downcast.get(k) # type: ignore[union-attr]
7382 )
7383
7384 res_k = result[k].fillna(v, limit=limit, downcast=downcast_k)
7385
7386 if not inplace:
7387 result[k] = res_k
7388 else:
7389 # We can write into our existing column(s) iff dtype
7390 # was preserved.
7391 if isinstance(res_k, ABCSeries):
7392 # i.e. 'k' only shows up once in self.columns
7393 if res_k.dtype == result[k].dtype:
7394 result.loc[:, k] = res_k
7395 else:
7396 # Different dtype -> no way to do inplace.
7397 result[k] = res_k
7398 else:
7399 # see test_fillna_dict_inplace_nonunique_columns
7400 locs = result.columns.get_loc(k)
7401 if isinstance(locs, slice):
7402 locs = np.arange(self.shape[1])[locs]
7403 elif (
7404 isinstance(locs, np.ndarray) and locs.dtype.kind == "b"
7405 ):
7406 locs = locs.nonzero()[0]
7407 elif not (
7408 isinstance(locs, np.ndarray) and locs.dtype.kind == "i"
7409 ):
7410 # Should never be reached, but let's cover our bases
7411 raise NotImplementedError(
7412 "Unexpected get_loc result, please report a bug at "
7413 "https://github.com/pandas-dev/pandas"
7414 )
7415
7416 for i, loc in enumerate(locs):
7417 res_loc = res_k.iloc[:, i]
7418 target = self.iloc[:, loc]
7419
7420 if res_loc.dtype == target.dtype:
7421 result.iloc[:, loc] = res_loc
7422 else:
7423 result.isetitem(loc, res_loc)
7424 if inplace:
7425 return self._update_inplace(result)
7426 else:
7427 return result
7428
7429 elif not is_list_like(value):
7430 if axis == 1:
7431 result = self.T.fillna(value=value, limit=limit).T
7432 new_data = result._mgr
7433 else:
7434 new_data = self._mgr.fillna(
7435 value=value, limit=limit, inplace=inplace, downcast=downcast
7436 )
7437 elif isinstance(value, ABCDataFrame) and self.ndim == 2:
7438 new_data = self.where(self.notna(), value)._mgr
7439 else:
7440 raise ValueError(f"invalid fill value with a {type(value)}")
7441
7442 result = self._constructor_from_mgr(new_data, axes=new_data.axes)
7443 if inplace:
7444 return self._update_inplace(result)
7445 else:
7446 return result.__finalize__(self, method="fillna")
7447
7448 @overload
7449 def ffill(
7450 self,
7451 *,
7452 axis: None | Axis = ...,
7453 inplace: Literal[False] = ...,
7454 limit: None | int = ...,
7455 limit_area: Literal["inside", "outside"] | None = ...,
7456 downcast: dict | None | lib.NoDefault = ...,
7457 ) -> Self:
7458 ...
7459
7460 @overload
7461 def ffill(
7462 self,
7463 *,
7464 axis: None | Axis = ...,
7465 inplace: Literal[True],
7466 limit: None | int = ...,
7467 limit_area: Literal["inside", "outside"] | None = ...,
7468 downcast: dict | None | lib.NoDefault = ...,
7469 ) -> None:
7470 ...
7471
7472 @overload
7473 def ffill(
7474 self,
7475 *,
7476 axis: None | Axis = ...,
7477 inplace: bool_t = ...,
7478 limit: None | int = ...,
7479 limit_area: Literal["inside", "outside"] | None = ...,
7480 downcast: dict | None | lib.NoDefault = ...,
7481 ) -> Self | None:
7482 ...
7483
7484 @final
7485 @doc(
7486 klass=_shared_doc_kwargs["klass"],
7487 axes_single_arg=_shared_doc_kwargs["axes_single_arg"],
7488 )
7489 def ffill(
7490 self,
7491 *,
7492 axis: None | Axis = None,
7493 inplace: bool_t = False,
7494 limit: None | int = None,
7495 limit_area: Literal["inside", "outside"] | None = None,
7496 downcast: dict | None | lib.NoDefault = lib.no_default,
7497 ) -> Self | None:
7498 """
        Fill NA/NaN values by propagating the last valid observation to the next valid.
7500
7501 Parameters
7502 ----------
7503 axis : {axes_single_arg}
7504 Axis along which to fill missing values. For `Series`
7505 this parameter is unused and defaults to 0.
7506 inplace : bool, default False
7507 If True, fill in-place. Note: this will modify any
7508 other views on this object (e.g., a no-copy slice for a column in a
7509 DataFrame).
        limit : int, default None
            Maximum number of consecutive NaN values to forward fill. In other
            words, if there is a gap with more than this number of consecutive
            NaNs, it will only be partially filled. Must be greater than 0 if
            not None.
        limit_area : {{`None`, 'inside', 'outside'}}, default None
            Consecutive NaNs will be filled with this
            restriction.
7520
7521 * ``None``: No fill restriction.
7522 * 'inside': Only fill NaNs surrounded by valid values
7523 (interpolate).
7524 * 'outside': Only fill NaNs outside valid values (extrapolate).
7525
7526 .. versionadded:: 2.2.0
7527
7528 downcast : dict, default is None
7529 A dict of item->dtype of what to downcast if possible,
7530 or the string 'infer' which will try to downcast to an appropriate
7531 equal type (e.g. float64 to int64 if possible).
7532
7533 .. deprecated:: 2.2.0
7534
7535 Returns
7536 -------
7537 {klass} or None
7538 Object with missing values filled or None if ``inplace=True``.
7539
7540 Examples
7541 --------
7542 >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0],
7543 ... [3, 4, np.nan, 1],
7544 ... [np.nan, np.nan, np.nan, np.nan],
7545 ... [np.nan, 3, np.nan, 4]],
7546 ... columns=list("ABCD"))
7547 >>> df
7548 A B C D
7549 0 NaN 2.0 NaN 0.0
7550 1 3.0 4.0 NaN 1.0
7551 2 NaN NaN NaN NaN
7552 3 NaN 3.0 NaN 4.0
7553
7554 >>> df.ffill()
7555 A B C D
7556 0 NaN 2.0 NaN 0.0
7557 1 3.0 4.0 NaN 1.0
7558 2 3.0 4.0 NaN 1.0
7559 3 3.0 3.0 NaN 4.0
7560
7561 >>> ser = pd.Series([1, np.nan, 2, 3])
7562 >>> ser.ffill()
7563 0 1.0
7564 1 1.0
7565 2 2.0
7566 3 3.0
7567 dtype: float64
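
        As a sketch of ``limit_area``, restricting the fill to gaps between
        valid values (output assumes the default float64 inference):

        >>> ser = pd.Series([np.nan, 1, np.nan, np.nan, 3, np.nan])
        >>> ser.ffill(limit_area="inside")
        0    NaN
        1    1.0
        2    1.0
        3    1.0
        4    3.0
        5    NaN
        dtype: float64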
7568 """
7569 downcast = self._deprecate_downcast(downcast, "ffill")
7570 inplace = validate_bool_kwarg(inplace, "inplace")
7571 if inplace:
7572 if not PYPY and using_copy_on_write():
7573 if sys.getrefcount(self) <= REF_COUNT:
7574 warnings.warn(
7575 _chained_assignment_method_msg,
7576 ChainedAssignmentError,
7577 stacklevel=2,
7578 )
7579 elif (
7580 not PYPY
7581 and not using_copy_on_write()
7582 and self._is_view_after_cow_rules()
7583 ):
7584 ctr = sys.getrefcount(self)
7585 ref_count = REF_COUNT
7586 if isinstance(self, ABCSeries) and _check_cacher(self):
7587 # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221
7588 ref_count += 1
7589 if ctr <= ref_count:
7590 warnings.warn(
7591 _chained_assignment_warning_method_msg,
7592 FutureWarning,
7593 stacklevel=2,
7594 )
7595
7596 return self._pad_or_backfill(
7597 "ffill",
7598 axis=axis,
7599 inplace=inplace,
7600 limit=limit,
7601 limit_area=limit_area,
7602 # error: Argument "downcast" to "_fillna_with_method" of "NDFrame"
7603 # has incompatible type "Union[Dict[Any, Any], None,
7604 # Literal[_NoDefault.no_default]]"; expected "Optional[Dict[Any, Any]]"
7605 downcast=downcast, # type: ignore[arg-type]
7606 )
7607
7608 @final
7609 @doc(klass=_shared_doc_kwargs["klass"])
7610 def pad(
7611 self,
7612 *,
7613 axis: None | Axis = None,
7614 inplace: bool_t = False,
7615 limit: None | int = None,
7616 downcast: dict | None | lib.NoDefault = lib.no_default,
7617 ) -> Self | None:
7618 """
        Fill NA/NaN values by propagating the last valid observation to the next valid.
7620
7621 .. deprecated:: 2.0
7622
7623 {klass}.pad is deprecated. Use {klass}.ffill instead.
7624
7625 Returns
7626 -------
7627 {klass} or None
7628 Object with missing values filled or None if ``inplace=True``.
7629
7630 Examples
7631 --------
7632 Please see examples for :meth:`DataFrame.ffill` or :meth:`Series.ffill`.
7633 """
7634 warnings.warn(
7635 "DataFrame.pad/Series.pad is deprecated. Use "
7636 "DataFrame.ffill/Series.ffill instead",
7637 FutureWarning,
7638 stacklevel=find_stack_level(),
7639 )
7640 return self.ffill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
7641
7642 @overload
7643 def bfill(
7644 self,
7645 *,
7646 axis: None | Axis = ...,
7647 inplace: Literal[False] = ...,
7648 limit: None | int = ...,
7649 limit_area: Literal["inside", "outside"] | None = ...,
7650 downcast: dict | None | lib.NoDefault = ...,
7651 ) -> Self:
7652 ...
7653
7654 @overload
7655 def bfill(
7656 self,
7657 *,
7658 axis: None | Axis = ...,
7659 inplace: Literal[True],
        limit: None | int = ...,
        limit_area: Literal["inside", "outside"] | None = ...,
7661 downcast: dict | None | lib.NoDefault = ...,
7662 ) -> None:
7663 ...
7664
7665 @overload
7666 def bfill(
7667 self,
7668 *,
7669 axis: None | Axis = ...,
7670 inplace: bool_t = ...,
7671 limit: None | int = ...,
7672 limit_area: Literal["inside", "outside"] | None = ...,
7673 downcast: dict | None | lib.NoDefault = ...,
7674 ) -> Self | None:
7675 ...
7676
7677 @final
7678 @doc(
7679 klass=_shared_doc_kwargs["klass"],
7680 axes_single_arg=_shared_doc_kwargs["axes_single_arg"],
7681 )
7682 def bfill(
7683 self,
7684 *,
7685 axis: None | Axis = None,
7686 inplace: bool_t = False,
7687 limit: None | int = None,
7688 limit_area: Literal["inside", "outside"] | None = None,
7689 downcast: dict | None | lib.NoDefault = lib.no_default,
7690 ) -> Self | None:
7691 """
7692 Fill NA/NaN values by using the next valid observation to fill the gap.
7693
7694 Parameters
7695 ----------
7696 axis : {axes_single_arg}
7697 Axis along which to fill missing values. For `Series`
7698 this parameter is unused and defaults to 0.
7699 inplace : bool, default False
7700 If True, fill in-place. Note: this will modify any
7701 other views on this object (e.g., a no-copy slice for a column in a
7702 DataFrame).
        limit : int, default None
            Maximum number of consecutive NaN values to backward fill. In other
            words, if there is a gap with more than this number of consecutive
            NaNs, it will only be partially filled. Must be greater than 0 if
            not None.
        limit_area : {{`None`, 'inside', 'outside'}}, default None
            Consecutive NaNs will be filled with this
            restriction.
7713
7714 * ``None``: No fill restriction.
7715 * 'inside': Only fill NaNs surrounded by valid values
7716 (interpolate).
7717 * 'outside': Only fill NaNs outside valid values (extrapolate).
7718
7719 .. versionadded:: 2.2.0
7720
7721 downcast : dict, default is None
7722 A dict of item->dtype of what to downcast if possible,
7723 or the string 'infer' which will try to downcast to an appropriate
7724 equal type (e.g. float64 to int64 if possible).
7725
7726 .. deprecated:: 2.2.0
7727
7728 Returns
7729 -------
7730 {klass} or None
7731 Object with missing values filled or None if ``inplace=True``.
7732
7733 Examples
7734 --------
7735 For Series:
7736
7737 >>> s = pd.Series([1, None, None, 2])
7738 >>> s.bfill()
7739 0 1.0
7740 1 2.0
7741 2 2.0
7742 3 2.0
7743 dtype: float64
7744 >>> s.bfill(limit=1)
7745 0 1.0
7746 1 NaN
7747 2 2.0
7748 3 2.0
7749 dtype: float64
7750
7751 With DataFrame:
7752
7753 >>> df = pd.DataFrame({{'A': [1, None, None, 4], 'B': [None, 5, None, 7]}})
7754 >>> df
7755 A B
7756 0 1.0 NaN
7757 1 NaN 5.0
7758 2 NaN NaN
7759 3 4.0 7.0
7760 >>> df.bfill()
7761 A B
7762 0 1.0 5.0
7763 1 4.0 5.0
7764 2 4.0 7.0
7765 3 4.0 7.0
7766 >>> df.bfill(limit=1)
7767 A B
7768 0 1.0 5.0
7769 1 NaN 5.0
7770 2 4.0 7.0
7771 3 4.0 7.0
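
        As a sketch of ``limit_area``, filling only outside the first and last
        valid values:

        >>> s = pd.Series([np.nan, 1, np.nan, np.nan, 3, np.nan])
        >>> s.bfill(limit_area="outside")
        0    1.0
        1    1.0
        2    NaN
        3    NaN
        4    3.0
        5    NaN
        dtype: float64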
7772 """
7773 downcast = self._deprecate_downcast(downcast, "bfill")
7774 inplace = validate_bool_kwarg(inplace, "inplace")
7775 if inplace:
7776 if not PYPY and using_copy_on_write():
7777 if sys.getrefcount(self) <= REF_COUNT:
7778 warnings.warn(
7779 _chained_assignment_method_msg,
7780 ChainedAssignmentError,
7781 stacklevel=2,
7782 )
7783 elif (
7784 not PYPY
7785 and not using_copy_on_write()
7786 and self._is_view_after_cow_rules()
7787 ):
7788 ctr = sys.getrefcount(self)
7789 ref_count = REF_COUNT
7790 if isinstance(self, ABCSeries) and _check_cacher(self):
7791 # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221
7792 ref_count += 1
7793 if ctr <= ref_count:
7794 warnings.warn(
7795 _chained_assignment_warning_method_msg,
7796 FutureWarning,
7797 stacklevel=2,
7798 )
7799
7800 return self._pad_or_backfill(
7801 "bfill",
7802 axis=axis,
7803 inplace=inplace,
7804 limit=limit,
7805 limit_area=limit_area,
7806 # error: Argument "downcast" to "_fillna_with_method" of "NDFrame"
7807 # has incompatible type "Union[Dict[Any, Any], None,
7808 # Literal[_NoDefault.no_default]]"; expected "Optional[Dict[Any, Any]]"
7809 downcast=downcast, # type: ignore[arg-type]
7810 )
7811
7812 @final
7813 @doc(klass=_shared_doc_kwargs["klass"])
7814 def backfill(
7815 self,
7816 *,
7817 axis: None | Axis = None,
7818 inplace: bool_t = False,
7819 limit: None | int = None,
7820 downcast: dict | None | lib.NoDefault = lib.no_default,
7821 ) -> Self | None:
7822 """
7823 Fill NA/NaN values by using the next valid observation to fill the gap.
7824
7825 .. deprecated:: 2.0
7826
7827 {klass}.backfill is deprecated. Use {klass}.bfill instead.
7828
7829 Returns
7830 -------
7831 {klass} or None
7832 Object with missing values filled or None if ``inplace=True``.
7833
7834 Examples
7835 --------
7836 Please see examples for :meth:`DataFrame.bfill` or :meth:`Series.bfill`.
7837 """
7838 warnings.warn(
7839 "DataFrame.backfill/Series.backfill is deprecated. Use "
7840 "DataFrame.bfill/Series.bfill instead",
7841 FutureWarning,
7842 stacklevel=find_stack_level(),
7843 )
7844 return self.bfill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
7845
7846 @overload
7847 def replace(
7848 self,
7849 to_replace=...,
7850 value=...,
7851 *,
7852 inplace: Literal[False] = ...,
7853 limit: int | None = ...,
7854 regex: bool_t = ...,
7855 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
7856 ) -> Self:
7857 ...
7858
7859 @overload
7860 def replace(
7861 self,
7862 to_replace=...,
7863 value=...,
7864 *,
7865 inplace: Literal[True],
7866 limit: int | None = ...,
7867 regex: bool_t = ...,
7868 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
7869 ) -> None:
7870 ...
7871
7872 @overload
7873 def replace(
7874 self,
7875 to_replace=...,
7876 value=...,
7877 *,
7878 inplace: bool_t = ...,
7879 limit: int | None = ...,
7880 regex: bool_t = ...,
7881 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
7882 ) -> Self | None:
7883 ...
7884
7885 @final
7886 @doc(
7887 _shared_docs["replace"],
7888 klass=_shared_doc_kwargs["klass"],
7889 inplace=_shared_doc_kwargs["inplace"],
7890 )
7891 def replace(
7892 self,
7893 to_replace=None,
7894 value=lib.no_default,
7895 *,
7896 inplace: bool_t = False,
7897 limit: int | None = None,
7898 regex: bool_t = False,
7899 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = lib.no_default,
7900 ) -> Self | None:
7901 if method is not lib.no_default:
7902 warnings.warn(
7903 # GH#33302
7904 f"The 'method' keyword in {type(self).__name__}.replace is "
7905 "deprecated and will be removed in a future version.",
7906 FutureWarning,
7907 stacklevel=find_stack_level(),
7908 )
7909 elif limit is not None:
7910 warnings.warn(
7911 # GH#33302
7912 f"The 'limit' keyword in {type(self).__name__}.replace is "
7913 "deprecated and will be removed in a future version.",
7914 FutureWarning,
7915 stacklevel=find_stack_level(),
7916 )
7917 if (
7918 value is lib.no_default
7919 and method is lib.no_default
7920 and not is_dict_like(to_replace)
7921 and regex is False
7922 ):
7923 # case that goes through _replace_single and defaults to method="pad"
7924 warnings.warn(
7925 # GH#33302
7926 f"{type(self).__name__}.replace without 'value' and with "
7927 "non-dict-like 'to_replace' is deprecated "
7928 "and will raise in a future version. "
7929 "Explicitly specify the new values instead.",
7930 FutureWarning,
7931 stacklevel=find_stack_level(),
7932 )
7933
7934 if not (
7935 is_scalar(to_replace)
7936 or is_re_compilable(to_replace)
7937 or is_list_like(to_replace)
7938 ):
7939 raise TypeError(
7940 "Expecting 'to_replace' to be either a scalar, array-like, "
7941 "dict or None, got invalid type "
7942 f"{repr(type(to_replace).__name__)}"
7943 )
7944
7945 inplace = validate_bool_kwarg(inplace, "inplace")
7946 if inplace:
7947 if not PYPY and using_copy_on_write():
7948 if sys.getrefcount(self) <= REF_COUNT:
7949 warnings.warn(
7950 _chained_assignment_method_msg,
7951 ChainedAssignmentError,
7952 stacklevel=2,
7953 )
7954 elif (
7955 not PYPY
7956 and not using_copy_on_write()
7957 and self._is_view_after_cow_rules()
7958 ):
7959 ctr = sys.getrefcount(self)
7960 ref_count = REF_COUNT
7961 if isinstance(self, ABCSeries) and _check_cacher(self):
7962 # in non-CoW mode, chained Series access will populate the
7963 # `_item_cache` which results in an increased ref count not below
7964 # the threshold, while we still need to warn. We detect this case
7965 # of a Series derived from a DataFrame through the presence of
7966 # checking the `_cacher`
7967 ref_count += 1
7968 if ctr <= ref_count:
7969 warnings.warn(
7970 _chained_assignment_warning_method_msg,
7971 FutureWarning,
7972 stacklevel=2,
7973 )
7974
7975 if not is_bool(regex) and to_replace is not None:
7976 raise ValueError("'to_replace' must be 'None' if 'regex' is not a bool")
7977
7978 if value is lib.no_default or method is not lib.no_default:
7979 # GH#36984 if the user explicitly passes value=None we want to
7980 # respect that. We have the corner case where the user explicitly
7981 # passes value=None *and* a method, which we interpret as meaning
7982 # they want the (documented) default behavior.
7983 if method is lib.no_default:
7984 # TODO: get this to show up as the default in the docs?
7985 method = "pad"
7986
7987 # passing a single value that is scalar like
7988 # when value is None (GH5319), for compat
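            # e.g. (a sketch of this deprecated path) pd.Series([1, 2, 3]).replace(2)
            # pads the previous value forward, giving [1, 1, 3]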
7989 if not is_dict_like(to_replace) and not is_dict_like(regex):
7990 to_replace = [to_replace]
7991
7992 if isinstance(to_replace, (tuple, list)):
                # TODO: Consider copy-on-write for non-replaced columns here
7994 if isinstance(self, ABCDataFrame):
7995 from pandas import Series
7996
7997 result = self.apply(
7998 Series._replace_single,
7999 args=(to_replace, method, inplace, limit),
8000 )
8001 if inplace:
8002 return None
8003 return result
8004 return self._replace_single(to_replace, method, inplace, limit)
8005
8006 if not is_dict_like(to_replace):
8007 if not is_dict_like(regex):
8008 raise TypeError(
8009 'If "to_replace" and "value" are both None '
8010 'and "to_replace" is not a list, then '
8011 "regex must be a mapping"
8012 )
8013 to_replace = regex
8014 regex = True
8015
8016 items = list(to_replace.items())
8017 if items:
8018 keys, values = zip(*items)
8019 else:
8020 # error: Incompatible types in assignment (expression has type
8021 # "list[Never]", variable has type "tuple[Any, ...]")
8022 keys, values = ([], []) # type: ignore[assignment]
8023
8024 are_mappings = [is_dict_like(v) for v in values]
8025
8026 if any(are_mappings):
8027 if not all(are_mappings):
8028 raise TypeError(
8029 "If a nested mapping is passed, all values "
8030 "of the top level mapping must be mappings"
8031 )
8032 # passed a nested dict/Series
8033 to_rep_dict = {}
8034 value_dict = {}
8035
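                # e.g. (sketch) to_replace={"A": {0: 100, 4: 400}} splits into
                # to_rep_dict={"A": [0, 4]} and value_dict={"A": [100, 400]}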
8036 for k, v in items:
8037 # error: Incompatible types in assignment (expression has type
8038 # "list[Never]", variable has type "tuple[Any, ...]")
8039 keys, values = list(zip(*v.items())) or ( # type: ignore[assignment]
8040 [],
8041 [],
8042 )
8043
8044 to_rep_dict[k] = list(keys)
8045 value_dict[k] = list(values)
8046
8047 to_replace, value = to_rep_dict, value_dict
8048 else:
8049 to_replace, value = keys, values
8050
8051 return self.replace(
8052 to_replace, value, inplace=inplace, limit=limit, regex=regex
8053 )
8054 else:
8055 # need a non-zero len on all axes
8056 if not self.size:
8057 if inplace:
8058 return None
8059 return self.copy(deep=None)
8060
8061 if is_dict_like(to_replace):
8062 if is_dict_like(value): # {'A' : NA} -> {'A' : 0}
                    # Note: Checking below for `in foo.keys()` instead of
                    # `in foo` is needed for when we have a Series and not a dict
8065 mapping = {
8066 col: (to_replace[col], value[col])
8067 for col in to_replace.keys()
8068 if col in value.keys() and col in self
8069 }
8070 return self._replace_columnwise(mapping, inplace, regex)
8071
8072 # {'A': NA} -> 0
8073 elif not is_list_like(value):
8074 # Operate column-wise
8075 if self.ndim == 1:
8076 raise ValueError(
8077 "Series.replace cannot use dict-like to_replace "
8078 "and non-None value"
8079 )
8080 mapping = {
8081 col: (to_rep, value) for col, to_rep in to_replace.items()
8082 }
8083 return self._replace_columnwise(mapping, inplace, regex)
8084 else:
8085 raise TypeError("value argument must be scalar, dict, or Series")
8086
8087 elif is_list_like(to_replace):
8088 if not is_list_like(value):
8089 # e.g. to_replace = [NA, ''] and value is 0,
8090 # so we replace NA with 0 and then replace '' with 0
8091 value = [value] * len(to_replace)
8092
8093 # e.g. we have to_replace = [NA, ''] and value = [0, 'missing']
8094 if len(to_replace) != len(value):
8095 raise ValueError(
8096 f"Replacement lists must match in length. "
8097 f"Expecting {len(to_replace)} got {len(value)} "
8098 )
8099 new_data = self._mgr.replace_list(
8100 src_list=to_replace,
8101 dest_list=value,
8102 inplace=inplace,
8103 regex=regex,
8104 )
8105
8106 elif to_replace is None:
8107 if not (
8108 is_re_compilable(regex)
8109 or is_list_like(regex)
8110 or is_dict_like(regex)
8111 ):
8112 raise TypeError(
8113 f"'regex' must be a string or a compiled regular expression "
8114 f"or a list or dict of strings or regular expressions, "
8115 f"you passed a {repr(type(regex).__name__)}"
8116 )
8117 return self.replace(
8118 regex, value, inplace=inplace, limit=limit, regex=True
8119 )
8120 else:
8121 # dest iterable dict-like
8122 if is_dict_like(value): # NA -> {'A' : 0, 'B' : -1}
8123 # Operate column-wise
8124 if self.ndim == 1:
8125 raise ValueError(
8126 "Series.replace cannot use dict-value and "
8127 "non-None to_replace"
8128 )
8129 mapping = {col: (to_replace, val) for col, val in value.items()}
8130 return self._replace_columnwise(mapping, inplace, regex)
8131
8132 elif not is_list_like(value): # NA -> 0
8133 regex = should_use_regex(regex, to_replace)
8134 if regex:
8135 new_data = self._mgr.replace_regex(
8136 to_replace=to_replace,
8137 value=value,
8138 inplace=inplace,
8139 )
8140 else:
8141 new_data = self._mgr.replace(
8142 to_replace=to_replace, value=value, inplace=inplace
8143 )
8144 else:
8145 raise TypeError(
8146 f'Invalid "to_replace" type: {repr(type(to_replace).__name__)}'
8147 )
8148
8149 result = self._constructor_from_mgr(new_data, axes=new_data.axes)
8150 if inplace:
8151 return self._update_inplace(result)
8152 else:
8153 return result.__finalize__(self, method="replace")
8154
8155 @overload
8156 def interpolate(
8157 self,
8158 method: InterpolateOptions = ...,
8159 *,
8160 axis: Axis = ...,
8161 limit: int | None = ...,
8162 inplace: Literal[False] = ...,
8163 limit_direction: Literal["forward", "backward", "both"] | None = ...,
8164 limit_area: Literal["inside", "outside"] | None = ...,
8165 downcast: Literal["infer"] | None | lib.NoDefault = ...,
8166 **kwargs,
8167 ) -> Self:
8168 ...
8169
8170 @overload
8171 def interpolate(
8172 self,
8173 method: InterpolateOptions = ...,
8174 *,
8175 axis: Axis = ...,
8176 limit: int | None = ...,
8177 inplace: Literal[True],
8178 limit_direction: Literal["forward", "backward", "both"] | None = ...,
8179 limit_area: Literal["inside", "outside"] | None = ...,
8180 downcast: Literal["infer"] | None | lib.NoDefault = ...,
8181 **kwargs,
8182 ) -> None:
8183 ...
8184
8185 @overload
8186 def interpolate(
8187 self,
8188 method: InterpolateOptions = ...,
8189 *,
8190 axis: Axis = ...,
8191 limit: int | None = ...,
8192 inplace: bool_t = ...,
8193 limit_direction: Literal["forward", "backward", "both"] | None = ...,
8194 limit_area: Literal["inside", "outside"] | None = ...,
8195 downcast: Literal["infer"] | None | lib.NoDefault = ...,
8196 **kwargs,
8197 ) -> Self | None:
8198 ...
8199
8200 @final
8201 def interpolate(
8202 self,
8203 method: InterpolateOptions = "linear",
8204 *,
8205 axis: Axis = 0,
8206 limit: int | None = None,
8207 inplace: bool_t = False,
8208 limit_direction: Literal["forward", "backward", "both"] | None = None,
8209 limit_area: Literal["inside", "outside"] | None = None,
8210 downcast: Literal["infer"] | None | lib.NoDefault = lib.no_default,
8211 **kwargs,
8212 ) -> Self | None:
8213 """
8214 Fill NaN values using an interpolation method.
8215
8216 Please note that only ``method='linear'`` is supported for
8217 DataFrame/Series with a MultiIndex.
8218
8219 Parameters
8220 ----------
8221 method : str, default 'linear'
8222 Interpolation technique to use. One of:
8223
8224 * 'linear': Ignore the index and treat the values as equally
8225 spaced. This is the only method supported on MultiIndexes.
            * 'time': Works on daily and higher resolution data to interpolate
              over a given length of interval.
8228 * 'index', 'values': use the actual numerical values of the index.
8229 * 'pad': Fill in NaNs using existing values.
8230 * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
8231 'barycentric', 'polynomial': Passed to
8232 `scipy.interpolate.interp1d`, whereas 'spline' is passed to
8233 `scipy.interpolate.UnivariateSpline`. These methods use the numerical
8234 values of the index. Both 'polynomial' and 'spline' require that
8235 you also specify an `order` (int), e.g.
              ``df.interpolate(method='polynomial', order=5)``. Note that the
              `slinear` method in pandas refers to the SciPy first order
              `spline`, not the pandas first order `spline`.
8239 * 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima',
8240 'cubicspline': Wrappers around the SciPy interpolation methods of
8241 similar names. See `Notes`.
8242 * 'from_derivatives': Refers to
8243 `scipy.interpolate.BPoly.from_derivatives`.
8244
        axis : {{0 or 'index', 1 or 'columns', None}}, default 0
            Axis to interpolate along. For `Series` this parameter is unused
            and defaults to 0.
8248 limit : int, optional
8249 Maximum number of consecutive NaNs to fill. Must be greater than
8250 0.
8251 inplace : bool, default False
8252 Update the data in place if possible.
        limit_direction : {{'forward', 'backward', 'both'}}, optional
            Consecutive NaNs will be filled in this direction.

            If limit is specified:
                * If 'method' is 'pad' or 'ffill', 'limit_direction' must be 'forward'.
                * If 'method' is 'backfill' or 'bfill', 'limit_direction' must be
                  'backward'.

            If 'limit' is not specified:
                * If 'method' is 'backfill' or 'bfill', the default is 'backward'
                * else the default is 'forward'

            Raises ValueError if `limit_direction` is 'forward' or 'both' and
            method is 'backfill' or 'bfill'.
            Raises ValueError if `limit_direction` is 'backward' or 'both' and
            method is 'pad' or 'ffill'.
8269
        limit_area : {{`None`, 'inside', 'outside'}}, default None
            Consecutive NaNs will be filled with this restriction.
8273
8274 * ``None``: No fill restriction.
8275 * 'inside': Only fill NaNs surrounded by valid values
8276 (interpolate).
8277 * 'outside': Only fill NaNs outside valid values (extrapolate).
8278
        downcast : 'infer' or None, default None
8280 Downcast dtypes if possible.
8281
8282 .. deprecated:: 2.1.0
8283
8284 ``**kwargs`` : optional
8285 Keyword arguments to pass on to the interpolating function.
8286
8287 Returns
8288 -------
8289 Series or DataFrame or None
8290 Returns the same object type as the caller, interpolated at
8291 some or all ``NaN`` values or None if ``inplace=True``.
8292
8293 See Also
8294 --------
8295 fillna : Fill missing values using different methods.
8296 scipy.interpolate.Akima1DInterpolator : Piecewise cubic polynomials
8297 (Akima interpolator).
8298 scipy.interpolate.BPoly.from_derivatives : Piecewise polynomial in the
8299 Bernstein basis.
8300 scipy.interpolate.interp1d : Interpolate a 1-D function.
8301 scipy.interpolate.KroghInterpolator : Interpolate polynomial (Krogh
8302 interpolator).
8303 scipy.interpolate.PchipInterpolator : PCHIP 1-d monotonic cubic
8304 interpolation.
8305 scipy.interpolate.CubicSpline : Cubic spline data interpolator.
8306
8307 Notes
8308 -----
8309 The 'krogh', 'piecewise_polynomial', 'spline', 'pchip' and 'akima'
8310 methods are wrappers around the respective SciPy implementations of
8311 similar names. These use the actual numerical values of the index.
8312 For more information on their behavior, see the
8313 `SciPy documentation
8314 <https://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation>`__.
8315
8316 Examples
8317 --------
8318 Filling in ``NaN`` in a :class:`~pandas.Series` via linear
8319 interpolation.
8320
8321 >>> s = pd.Series([0, 1, np.nan, 3])
8322 >>> s
8323 0 0.0
8324 1 1.0
8325 2 NaN
8326 3 3.0
8327 dtype: float64
8328 >>> s.interpolate()
8329 0 0.0
8330 1 1.0
8331 2 2.0
8332 3 3.0
8333 dtype: float64
8334
8335 Filling in ``NaN`` in a Series via polynomial interpolation or splines:
8336 Both 'polynomial' and 'spline' methods require that you also specify
8337 an ``order`` (int).
8338
8339 >>> s = pd.Series([0, 2, np.nan, 8])
8340 >>> s.interpolate(method='polynomial', order=2)
8341 0 0.000000
8342 1 2.000000
8343 2 4.666667
8344 3 8.000000
8345 dtype: float64
8346
8347 Fill the DataFrame forward (that is, going down) along each column
8348 using linear interpolation.
8349
8350 Note how the last entry in column 'a' is interpolated differently,
8351 because there is no entry after it to use for interpolation.
8352 Note how the first entry in column 'b' remains ``NaN``, because there
8353 is no entry before it to use for interpolation.
8354
8355 >>> df = pd.DataFrame([(0.0, np.nan, -1.0, 1.0),
8356 ... (np.nan, 2.0, np.nan, np.nan),
8357 ... (2.0, 3.0, np.nan, 9.0),
8358 ... (np.nan, 4.0, -4.0, 16.0)],
8359 ... columns=list('abcd'))
8360 >>> df
8361 a b c d
8362 0 0.0 NaN -1.0 1.0
8363 1 NaN 2.0 NaN NaN
8364 2 2.0 3.0 NaN 9.0
8365 3 NaN 4.0 -4.0 16.0
8366 >>> df.interpolate(method='linear', limit_direction='forward', axis=0)
8367 a b c d
8368 0 0.0 NaN -1.0 1.0
8369 1 1.0 2.0 -2.0 5.0
8370 2 2.0 3.0 -3.0 9.0
8371 3 2.0 4.0 -4.0 16.0
8372
8373 Using polynomial interpolation.
8374
8375 >>> df['d'].interpolate(method='polynomial', order=2)
8376 0 1.0
8377 1 4.0
8378 2 9.0
8379 3 16.0
8380 Name: d, dtype: float64
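
        As a sketch, ``limit`` caps how many consecutive ``NaN`` values are
        filled (here in the default forward direction):

        >>> s = pd.Series([np.nan, 1.0, np.nan, np.nan, np.nan, 5.0])
        >>> s.interpolate(limit=1)
        0    NaN
        1    1.0
        2    2.0
        3    NaN
        4    NaN
        5    5.0
        dtype: float64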
8381 """
8382 if downcast is not lib.no_default:
8383 # GH#40988
8384 warnings.warn(
8385 f"The 'downcast' keyword in {type(self).__name__}.interpolate "
8386 "is deprecated and will be removed in a future version. "
8387 "Call result.infer_objects(copy=False) on the result instead.",
8388 FutureWarning,
8389 stacklevel=find_stack_level(),
8390 )
8391 else:
8392 downcast = None
8393 if downcast is not None and downcast != "infer":
8394 raise ValueError("downcast must be either None or 'infer'")
8395
8396 inplace = validate_bool_kwarg(inplace, "inplace")
8397
8398 if inplace:
8399 if not PYPY and using_copy_on_write():
8400 if sys.getrefcount(self) <= REF_COUNT:
8401 warnings.warn(
8402 _chained_assignment_method_msg,
8403 ChainedAssignmentError,
8404 stacklevel=2,
8405 )
8406 elif (
8407 not PYPY
8408 and not using_copy_on_write()
8409 and self._is_view_after_cow_rules()
8410 ):
8411 ctr = sys.getrefcount(self)
8412 ref_count = REF_COUNT
8413 if isinstance(self, ABCSeries) and _check_cacher(self):
8414 # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221
8415 ref_count += 1
8416 if ctr <= ref_count:
8417 warnings.warn(
8418 _chained_assignment_warning_method_msg,
8419 FutureWarning,
8420 stacklevel=2,
8421 )
8422
8423 axis = self._get_axis_number(axis)
8424
8425 if self.empty:
8426 if inplace:
8427 return None
8428 return self.copy()
8429
8430 if not isinstance(method, str):
8431 raise ValueError("'method' should be a string, not None.")
8432
8433 fillna_methods = ["ffill", "bfill", "pad", "backfill"]
8434 if method.lower() in fillna_methods:
8435 # GH#53581
8436 warnings.warn(
8437 f"{type(self).__name__}.interpolate with method={method} is "
8438 "deprecated and will raise in a future version. "
8439 "Use obj.ffill() or obj.bfill() instead.",
8440 FutureWarning,
8441 stacklevel=find_stack_level(),
8442 )
8443 obj, should_transpose = self, False
8444 else:
8445 obj, should_transpose = (self.T, True) if axis == 1 else (self, False)
8446 if np.any(obj.dtypes == object):
8447 # GH#53631
8448 if not (obj.ndim == 2 and np.all(obj.dtypes == object)):
8449 # don't warn in cases that already raise
8450 warnings.warn(
8451 f"{type(self).__name__}.interpolate with object dtype is "
8452 "deprecated and will raise in a future version. Call "
8453 "obj.infer_objects(copy=False) before interpolating instead.",
8454 FutureWarning,
8455 stacklevel=find_stack_level(),
8456 )
8457
8458 if method in fillna_methods and "fill_value" in kwargs:
8459 raise ValueError(
8460 "'fill_value' is not a valid keyword for "
8461 f"{type(self).__name__}.interpolate with method from "
8462 f"{fillna_methods}"
8463 )
8464
8465 if isinstance(obj.index, MultiIndex) and method != "linear":
8466 raise ValueError(
8467 "Only `method=linear` interpolation is supported on MultiIndexes."
8468 )
8469
8470 limit_direction = missing.infer_limit_direction(limit_direction, method)
8471
8472 if obj.ndim == 2 and np.all(obj.dtypes == object):
8473 raise TypeError(
8474 "Cannot interpolate with all object-dtype columns "
8475 "in the DataFrame. Try setting at least one "
8476 "column to a numeric dtype."
8477 )
8478
8479 if method.lower() in fillna_methods:
8480 # TODO(3.0): remove this case
8481 # TODO: warn/raise on limit_direction or kwargs which are ignored?
8482 # as of 2023-06-26 no tests get here with either
8483 if not self._mgr.is_single_block and axis == 1:
8484 # GH#53898
8485 if inplace:
8486 raise NotImplementedError()
8487 obj, axis, should_transpose = self.T, 1 - axis, True
8488
8489 new_data = obj._mgr.pad_or_backfill(
8490 method=method,
8491 axis=self._get_block_manager_axis(axis),
8492 limit=limit,
8493 limit_area=limit_area,
8494 inplace=inplace,
8495 downcast=downcast,
8496 )
8497 else:
8498 index = missing.get_interp_index(method, obj.index)
8499 new_data = obj._mgr.interpolate(
8500 method=method,
8501 index=index,
8502 limit=limit,
8503 limit_direction=limit_direction,
8504 limit_area=limit_area,
8505 inplace=inplace,
8506 downcast=downcast,
8507 **kwargs,
8508 )
8509
8510 result = self._constructor_from_mgr(new_data, axes=new_data.axes)
8511 if should_transpose:
8512 result = result.T
8513 if inplace:
8514 return self._update_inplace(result)
8515 else:
8516 return result.__finalize__(self, method="interpolate")
8517
8518 # ----------------------------------------------------------------------
    # Timeseries Methods
8520
8521 @final
8522 def asof(self, where, subset=None):
8523 """
8524 Return the last row(s) without any NaNs before `where`.
8525
        The last row (for each element in `where`, if list) without any
        NaN is taken.
        In case of a :class:`~pandas.DataFrame`, the last row without NaN
        is taken, considering only the subset of columns (if not `None`).

        If there is no good value, NaN is returned for a Series or
        a Series of NaN values for a DataFrame.
8533
8534 Parameters
8535 ----------
8536 where : date or array-like of dates
8537 Date(s) before which the last row(s) are returned.
8538 subset : str or array-like of str, default `None`
8539 For DataFrame, if not `None`, only use these columns to
8540 check for NaNs.
8541
8542 Returns
8543 -------
8544 scalar, Series, or DataFrame
8545
8546 The return can be:
8547
8548 * scalar : when `self` is a Series and `where` is a scalar
8549 * Series: when `self` is a Series and `where` is an array-like,
8550 or when `self` is a DataFrame and `where` is a scalar
8551 * DataFrame : when `self` is a DataFrame and `where` is an
8552 array-like
8553
8554 See Also
8555 --------
8556 merge_asof : Perform an asof merge. Similar to left join.
8557
8558 Notes
8559 -----
8560 Dates are assumed to be sorted. Raises if this is not the case.
8561
8562 Examples
8563 --------
8564 A Series and a scalar `where`.
8565
8566 >>> s = pd.Series([1, 2, np.nan, 4], index=[10, 20, 30, 40])
8567 >>> s
8568 10 1.0
8569 20 2.0
8570 30 NaN
8571 40 4.0
8572 dtype: float64
8573
8574 >>> s.asof(20)
8575 2.0
8576
8577 For a sequence `where`, a Series is returned. The first value is
8578 NaN, because the first element of `where` is before the first
8579 index value.
8580
8581 >>> s.asof([5, 20])
8582 5 NaN
8583 20 2.0
8584 dtype: float64
8585
8586 Missing values are not considered. The following is ``2.0``, not
8587 NaN, even though NaN is at the index location for ``30``.
8588
8589 >>> s.asof(30)
8590 2.0
8591
8592 Take all columns into consideration
8593
8594 >>> df = pd.DataFrame({'a': [10., 20., 30., 40., 50.],
8595 ... 'b': [None, None, None, None, 500]},
8596 ... index=pd.DatetimeIndex(['2018-02-27 09:01:00',
8597 ... '2018-02-27 09:02:00',
8598 ... '2018-02-27 09:03:00',
8599 ... '2018-02-27 09:04:00',
8600 ... '2018-02-27 09:05:00']))
8601 >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30',
8602 ... '2018-02-27 09:04:30']))
8603 a b
8604 2018-02-27 09:03:30 NaN NaN
8605 2018-02-27 09:04:30 NaN NaN
8606
8607 Take a single column into consideration
8608
8609 >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30',
8610 ... '2018-02-27 09:04:30']),
8611 ... subset=['a'])
8612 a b
8613 2018-02-27 09:03:30 30.0 NaN
8614 2018-02-27 09:04:30 40.0 NaN
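
        As a sketch, a scalar `where` returns the row as a Series (using the
        frame above):

        >>> df.asof(pd.Timestamp('2018-02-27 09:03:30'), subset=['a'])
        a    30.0
        b     NaN
        Name: 2018-02-27 09:03:30, dtype: float64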
8615 """
8616 if isinstance(where, str):
8617 where = Timestamp(where)
8618
8619 if not self.index.is_monotonic_increasing:
8620 raise ValueError("asof requires a sorted index")
8621
8622 is_series = isinstance(self, ABCSeries)
8623 if is_series:
8624 if subset is not None:
8625 raise ValueError("subset is not valid for Series")
8626 else:
8627 if subset is None:
8628 subset = self.columns
8629 if not is_list_like(subset):
8630 subset = [subset]
8631
8632 is_list = is_list_like(where)
8633 if not is_list:
8634 start = self.index[0]
8635 if isinstance(self.index, PeriodIndex):
8636 where = Period(where, freq=self.index.freq)
8637
8638 if where < start:
8639 if not is_series:
8640 return self._constructor_sliced(
8641 index=self.columns, name=where, dtype=np.float64
8642 )
8643 return np.nan
8644
8645 # It's always much faster to use a *while* loop here for
8646 # Series than pre-computing all the NAs. However a
8647 # *while* loop is extremely expensive for DataFrame
8648 # so we later pre-compute all the NAs and use the same
8649 # code path whether *where* is a scalar or list.
8650 # See PR: https://github.com/pandas-dev/pandas/pull/14476
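        # e.g. (illustrative) index=[10, 20, 30], values=[1.0, NaN, 3.0],
        # where=25: searchsorted gives loc=2, decremented to 1; the while
        # loop below then skips the NaN at loc=1 and returns values[0] == 1.0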
8651 if is_series:
8652 loc = self.index.searchsorted(where, side="right")
8653 if loc > 0:
8654 loc -= 1
8655
8656 values = self._values
8657 while loc > 0 and isna(values[loc]):
8658 loc -= 1
8659 return values[loc]
8660
8661 if not isinstance(where, Index):
8662 where = Index(where) if is_list else Index([where])
8663
8664 nulls = self.isna() if is_series else self[subset].isna().any(axis=1)
8665 if nulls.all():
8666 if is_series:
8667 self = cast("Series", self)
8668 return self._constructor(np.nan, index=where, name=self.name)
8669 elif is_list:
8670 self = cast("DataFrame", self)
8671 return self._constructor(np.nan, index=where, columns=self.columns)
8672 else:
8673 self = cast("DataFrame", self)
8674 return self._constructor_sliced(
8675 np.nan, index=self.columns, name=where[0]
8676 )
8677
8678 locs = self.index.asof_locs(where, ~(nulls._values))
8679
8680 # mask the missing
8681 mask = locs == -1
8682 data = self.take(locs)
8683 data.index = where
8684 if mask.any():
8685 # GH#16063 only do this setting when necessary, otherwise
8686 # we'd cast e.g. bools to floats
8687 data.loc[mask] = np.nan
8688 return data if is_list else data.iloc[-1]
8689
8690 # ----------------------------------------------------------------------
8691 # Action Methods
8692
8693 @doc(klass=_shared_doc_kwargs["klass"])
8694 def isna(self) -> Self:
8695 """
8696 Detect missing values.
8697
        Return a boolean same-sized object indicating if the values are NA.
        NA values, such as None or :attr:`numpy.NaN`, get mapped to True
        values.
8701 Everything else gets mapped to False values. Characters such as empty
8702 strings ``''`` or :attr:`numpy.inf` are not considered NA values
8703 (unless you set ``pandas.options.mode.use_inf_as_na = True``).
8704
8705 Returns
8706 -------
8707 {klass}
8708 Mask of bool values for each element in {klass} that
8709 indicates whether an element is an NA value.
8710
8711 See Also
8712 --------
8713 {klass}.isnull : Alias of isna.
8714 {klass}.notna : Boolean inverse of isna.
8715 {klass}.dropna : Omit axes labels with missing values.
8716 isna : Top-level isna.
8717
8718 Examples
8719 --------
8720 Show which entries in a DataFrame are NA.
8721
8722 >>> df = pd.DataFrame(dict(age=[5, 6, np.nan],
8723 ... born=[pd.NaT, pd.Timestamp('1939-05-27'),
8724 ... pd.Timestamp('1940-04-25')],
8725 ... name=['Alfred', 'Batman', ''],
8726 ... toy=[None, 'Batmobile', 'Joker']))
8727 >>> df
8728 age born name toy
8729 0 5.0 NaT Alfred None
8730 1 6.0 1939-05-27 Batman Batmobile
8731 2 NaN 1940-04-25 Joker
8732
8733 >>> df.isna()
8734 age born name toy
8735 0 False True False True
8736 1 False False False False
8737 2 True False False False
8738
8739 Show which entries in a Series are NA.
8740
8741 >>> ser = pd.Series([5, 6, np.nan])
8742 >>> ser
8743 0 5.0
8744 1 6.0
8745 2 NaN
8746 dtype: float64
8747
8748 >>> ser.isna()
8749 0 False
8750 1 False
8751 2 True
8752 dtype: bool
8753 """
8754 return isna(self).__finalize__(self, method="isna")
8755
8756 @doc(isna, klass=_shared_doc_kwargs["klass"])
8757 def isnull(self) -> Self:
8758 return isna(self).__finalize__(self, method="isnull")
8759
8760 @doc(klass=_shared_doc_kwargs["klass"])
8761 def notna(self) -> Self:
8762 """
8763 Detect existing (non-missing) values.
8764
8765 Return a boolean same-sized object indicating if the values are not NA.
8766 Non-missing values get mapped to True. Characters such as empty
8767 strings ``''`` or :attr:`numpy.inf` are not considered NA values
8768 (unless you set ``pandas.options.mode.use_inf_as_na = True``).
8769 NA values, such as None or :attr:`numpy.NaN`, get mapped to False
8770 values.
8771
8772 Returns
8773 -------
8774 {klass}
8775 Mask of bool values for each element in {klass} that
8776 indicates whether an element is not an NA value.
8777
8778 See Also
8779 --------
8780 {klass}.notnull : Alias of notna.
8781 {klass}.isna : Boolean inverse of notna.
8782 {klass}.dropna : Omit axes labels with missing values.
8783 notna : Top-level notna.
8784
8785 Examples
8786 --------
8787 Show which entries in a DataFrame are not NA.
8788
8789 >>> df = pd.DataFrame(dict(age=[5, 6, np.nan],
8790 ... born=[pd.NaT, pd.Timestamp('1939-05-27'),
8791 ... pd.Timestamp('1940-04-25')],
8792 ... name=['Alfred', 'Batman', ''],
8793 ... toy=[None, 'Batmobile', 'Joker']))
8794 >>> df
8795 age born name toy
8796 0 5.0 NaT Alfred None
8797 1 6.0 1939-05-27 Batman Batmobile
8798 2 NaN 1940-04-25 Joker
8799
8800 >>> df.notna()
8801 age born name toy
8802 0 True False True False
8803 1 True True True True
8804 2 False True True True
8805
8806 Show which entries in a Series are not NA.
8807
8808 >>> ser = pd.Series([5, 6, np.nan])
8809 >>> ser
8810 0 5.0
8811 1 6.0
8812 2 NaN
8813 dtype: float64
8814
8815 >>> ser.notna()
8816 0 True
8817 1 True
8818 2 False
8819 dtype: bool
8820 """
8821 return notna(self).__finalize__(self, method="notna")
8822
8823 @doc(notna, klass=_shared_doc_kwargs["klass"])
8824 def notnull(self) -> Self:
8825 return notna(self).__finalize__(self, method="notnull")
8826
8827 @final
8828 def _clip_with_scalar(self, lower, upper, inplace: bool_t = False):
8829 if (lower is not None and np.any(isna(lower))) or (
8830 upper is not None and np.any(isna(upper))
8831 ):
8832 raise ValueError("Cannot use an NA value as a clip threshold")
8833
8834 result = self
8835 mask = self.isna()
8836
        if lower is not None:
            cond = mask | (self >= lower)
            result = result.where(cond, lower, inplace=inplace)  # type: ignore[assignment]
        if upper is not None:
            cond = mask | (self <= upper)
            result = self if inplace else result
            result = result.where(cond, upper, inplace=inplace)  # type: ignore[assignment]
8848
8849 return result
8850
8851 @final
8852 def _clip_with_one_bound(self, threshold, method, axis, inplace):
8853 if axis is not None:
8854 axis = self._get_axis_number(axis)
8855
8856 # method is self.le for upper bound and self.ge for lower bound
8857 if is_scalar(threshold) and is_number(threshold):
8858 if method.__name__ == "le":
8859 return self._clip_with_scalar(None, threshold, inplace=inplace)
8860 return self._clip_with_scalar(threshold, None, inplace=inplace)
8861
8862 # GH #15390
8863 # In order for where method to work, the threshold must
8864 # be transformed to NDFrame from other array like structure.
8865 if (not isinstance(threshold, ABCSeries)) and is_list_like(threshold):
8866 if isinstance(self, ABCSeries):
8867 threshold = self._constructor(threshold, index=self.index)
8868 else:
8869 threshold = self._align_for_op(threshold, axis, flex=None)[1]
8870
8871 # GH 40420
8872 # Treat missing thresholds as no bounds, not clipping the values
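        # e.g. (sketch) for a lower bound (method is self.ge), lower=[2, NaN]
        # becomes [2, -inf], so the row with the missing threshold is unclipped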
8873 if is_list_like(threshold):
8874 fill_value = np.inf if method.__name__ == "le" else -np.inf
8875 threshold_inf = threshold.fillna(fill_value)
8876 else:
8877 threshold_inf = threshold
8878
8879 subset = method(threshold_inf, axis=axis) | isna(self)
8880
8881 # GH 40420
8882 return self.where(subset, threshold, axis=axis, inplace=inplace)
8883
8884 @overload
8885 def clip(
8886 self,
8887 lower=...,
8888 upper=...,
8889 *,
8890 axis: Axis | None = ...,
8891 inplace: Literal[False] = ...,
8892 **kwargs,
8893 ) -> Self:
8894 ...
8895
8896 @overload
8897 def clip(
8898 self,
8899 lower=...,
8900 upper=...,
8901 *,
8902 axis: Axis | None = ...,
8903 inplace: Literal[True],
8904 **kwargs,
8905 ) -> None:
8906 ...
8907
8908 @overload
8909 def clip(
8910 self,
8911 lower=...,
8912 upper=...,
8913 *,
8914 axis: Axis | None = ...,
8915 inplace: bool_t = ...,
8916 **kwargs,
8917 ) -> Self | None:
8918 ...
8919
8920 @final
8921 def clip(
8922 self,
8923 lower=None,
8924 upper=None,
8925 *,
8926 axis: Axis | None = None,
8927 inplace: bool_t = False,
8928 **kwargs,
8929 ) -> Self | None:
8930 """
8931 Trim values at input threshold(s).
8932
        Assigns values outside the boundaries to the boundary values. Thresholds
        can be scalar values or array-like, and in the latter case the clipping
        is performed element-wise along the specified axis.
8936
8937 Parameters
8938 ----------
        lower : float or array-like, default None
            Minimum threshold value. All values below this
            threshold will be set to it. A missing
            threshold (e.g. `NA`) will not clip the value.
        upper : float or array-like, default None
            Maximum threshold value. All values above this
            threshold will be set to it. A missing
            threshold (e.g. `NA`) will not clip the value.
8947 axis : {{0 or 'index', 1 or 'columns', None}}, default None
8948 Align object with lower and upper along the given axis.
8949 For `Series` this parameter is unused and defaults to `None`.
8950 inplace : bool, default False
8951 Whether to perform the operation in place on the data.
        **kwargs
8953 Additional keywords have no effect but might be accepted
8954 for compatibility with numpy.
8955
8956 Returns
8957 -------
8958 Series or DataFrame or None
8959 Same type as calling object with the values outside the
8960 clip boundaries replaced or None if ``inplace=True``.
8961
8962 See Also
8963 --------
8964 Series.clip : Trim values at input threshold in series.
8965 DataFrame.clip : Trim values at input threshold in dataframe.
8966 numpy.clip : Clip (limit) the values in an array.
8967
8968 Examples
8969 --------
8970 >>> data = {'col_0': [9, -3, 0, -1, 5], 'col_1': [-2, -7, 6, 8, -5]}
8971 >>> df = pd.DataFrame(data)
8972 >>> df
8973 col_0 col_1
8974 0 9 -2
8975 1 -3 -7
8976 2 0 6
8977 3 -1 8
8978 4 5 -5
8979
8980 Clips per column using lower and upper thresholds:
8981
8982 >>> df.clip(-4, 6)
8983 col_0 col_1
8984 0 6 -2
8985 1 -3 -4
8986 2 0 6
8987 3 -1 6
8988 4 5 -4
8989
8990 Clips using specific lower and upper thresholds per column:
8991
8992 >>> df.clip([-2, -1], [4, 5])
8993 col_0 col_1
8994 0 4 -1
8995 1 -2 -1
8996 2 0 5
8997 3 -1 5
8998 4 4 -1
8999
9000 Clips using specific lower and upper thresholds per column element:
9001
9002 >>> t = pd.Series([2, -4, -1, 6, 3])
9003 >>> t
9004 0 2
9005 1 -4
9006 2 -1
9007 3 6
9008 4 3
9009 dtype: int64
9010
9011 >>> df.clip(t, t + 4, axis=0)
9012 col_0 col_1
9013 0 6 2
9014 1 -3 -4
9015 2 0 3
9016 3 6 8
9017 4 5 3
9018
9019 Clips using specific lower threshold per column element, with missing values:
9020
9021 >>> t = pd.Series([2, -4, np.nan, 6, 3])
9022 >>> t
9023 0 2.0
9024 1 -4.0
9025 2 NaN
9026 3 6.0
9027 4 3.0
9028 dtype: float64
9029
9030 >>> df.clip(t, axis=0)
9031 col_0 col_1
9032 0 9 2
9033 1 -3 -4
9034 2 0 6
9035 3 6 8
9036 4 5 3
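
        A sketch with a Series and scalar bounds (the integer dtype is
        preserved since no NaNs are introduced):

        >>> pd.Series([1, 8, -5]).clip(lower=0, upper=5)
        0    1
        1    5
        2    0
        dtype: int64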
9037 """
9038 inplace = validate_bool_kwarg(inplace, "inplace")
9039
9040 if inplace:
9041 if not PYPY and using_copy_on_write():
9042 if sys.getrefcount(self) <= REF_COUNT:
9043 warnings.warn(
9044 _chained_assignment_method_msg,
9045 ChainedAssignmentError,
9046 stacklevel=2,
9047 )
9048 elif (
9049 not PYPY
9050 and not using_copy_on_write()
9051 and self._is_view_after_cow_rules()
9052 ):
9053 ctr = sys.getrefcount(self)
9054 ref_count = REF_COUNT
9055 if isinstance(self, ABCSeries) and hasattr(self, "_cacher"):
9056 # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221
9057 ref_count += 1
9058 if ctr <= ref_count:
9059 warnings.warn(
9060 _chained_assignment_warning_method_msg,
9061 FutureWarning,
9062 stacklevel=2,
9063 )
9064
9065 axis = nv.validate_clip_with_axis(axis, (), kwargs)
9066 if axis is not None:
9067 axis = self._get_axis_number(axis)
9068
9069 # GH 17276
9070 # numpy doesn't like NaN as a clip value
9071 # so ignore
9072 # GH 19992
9073 # numpy doesn't drop a list-like bound containing NaN
9074 isna_lower = isna(lower)
9075 if not is_list_like(lower):
9076 if np.any(isna_lower):
9077 lower = None
9078 elif np.all(isna_lower):
9079 lower = None
9080 isna_upper = isna(upper)
9081 if not is_list_like(upper):
9082 if np.any(isna_upper):
9083 upper = None
9084 elif np.all(isna_upper):
9085 upper = None
9086
9087 # GH 2747 (arguments were reversed)
9088 if (
9089 lower is not None
9090 and upper is not None
9091 and is_scalar(lower)
9092 and is_scalar(upper)
9093 ):
9094 lower, upper = min(lower, upper), max(lower, upper)
9095
9096 # fast-path for scalars
9097 if (lower is None or is_number(lower)) and (upper is None or is_number(upper)):
9098 return self._clip_with_scalar(lower, upper, inplace=inplace)
9099
9100 result = self
9101 if lower is not None:
9102 result = result._clip_with_one_bound(
9103 lower, method=self.ge, axis=axis, inplace=inplace
9104 )
9105 if upper is not None:
9106 if inplace:
9107 result = self
9108 result = result._clip_with_one_bound(
9109 upper, method=self.le, axis=axis, inplace=inplace
9110 )
9111
9112 return result
9113
9114 @final
9115 @doc(klass=_shared_doc_kwargs["klass"])
9116 def asfreq(
9117 self,
9118 freq: Frequency,
9119 method: FillnaOptions | None = None,
9120 how: Literal["start", "end"] | None = None,
9121 normalize: bool_t = False,
9122 fill_value: Hashable | None = None,
9123 ) -> Self:
9124 """
9125 Convert time series to specified frequency.
9126
9127 Returns the original data conformed to a new index with the specified
9128 frequency.
9129
9130 If the index of this {klass} is a :class:`~pandas.PeriodIndex`, the new index
9131 is the result of transforming the original index with
9132 :meth:`PeriodIndex.asfreq <pandas.PeriodIndex.asfreq>` (so the original index
9133 will map one-to-one to the new index).
9134
9135 Otherwise, the new index will be equivalent to ``pd.date_range(start, end,
9136 freq=freq)`` where ``start`` and ``end`` are, respectively, the first and
9137 last entries in the original index (see :func:`pandas.date_range`). The
9138 values corresponding to any timesteps in the new index which were not present
9139 in the original index will be null (``NaN``), unless a method for filling
9140 such unknowns is provided (see the ``method`` parameter below).
9141
9142 The :meth:`resample` method is more appropriate if an operation on each group of
9143 timesteps (such as an aggregate) is necessary to represent the data at the new
9144 frequency.
9145
9146 Parameters
9147 ----------
9148 freq : DateOffset or str
9149 Frequency DateOffset or string.
9150 method : {{'backfill'/'bfill', 'pad'/'ffill'}}, default None
9151 Method to use for filling holes in reindexed Series (note this
9152 does not fill NaNs that already were present):
9153
9154 * 'pad' / 'ffill': propagate last valid observation forward to next
9155 valid
9156 * 'backfill' / 'bfill': use NEXT valid observation to fill.
        how : {{'start', 'end'}}, default 'end'
9158 For PeriodIndex only (see PeriodIndex.asfreq).
9159 normalize : bool, default False
9160 Whether to reset output index to midnight.
9161 fill_value : scalar, optional
9162 Value to use for missing values, applied during upsampling (note
9163 this does not fill NaNs that already were present).
9164
9165 Returns
9166 -------
9167 {klass}
9168 {klass} object reindexed to the specified frequency.
9169
9170 See Also
9171 --------
9172 reindex : Conform DataFrame to new index with optional filling logic.
9173
9174 Notes
9175 -----
9176 To learn more about the frequency strings, please see `this link
9177 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
9178
9179 Examples
9180 --------
9181 Start by creating a series with 4 one minute timestamps.
9182
9183 >>> index = pd.date_range('1/1/2000', periods=4, freq='min')
9184 >>> series = pd.Series([0.0, None, 2.0, 3.0], index=index)
9185 >>> df = pd.DataFrame({{'s': series}})
9186 >>> df
9187 s
9188 2000-01-01 00:00:00 0.0
9189 2000-01-01 00:01:00 NaN
9190 2000-01-01 00:02:00 2.0
9191 2000-01-01 00:03:00 3.0
9192
9193 Upsample the series into 30 second bins.
9194
9195 >>> df.asfreq(freq='30s')
9196 s
9197 2000-01-01 00:00:00 0.0
9198 2000-01-01 00:00:30 NaN
9199 2000-01-01 00:01:00 NaN
9200 2000-01-01 00:01:30 NaN
9201 2000-01-01 00:02:00 2.0
9202 2000-01-01 00:02:30 NaN
9203 2000-01-01 00:03:00 3.0
9204
        Upsample again, providing a ``fill_value``.
9206
9207 >>> df.asfreq(freq='30s', fill_value=9.0)
9208 s
9209 2000-01-01 00:00:00 0.0
9210 2000-01-01 00:00:30 9.0
9211 2000-01-01 00:01:00 NaN
9212 2000-01-01 00:01:30 9.0
9213 2000-01-01 00:02:00 2.0
9214 2000-01-01 00:02:30 9.0
9215 2000-01-01 00:03:00 3.0
9216
9217 Upsample again, providing a ``method``.
9218
9219 >>> df.asfreq(freq='30s', method='bfill')
9220 s
9221 2000-01-01 00:00:00 0.0
9222 2000-01-01 00:00:30 NaN
9223 2000-01-01 00:01:00 NaN
9224 2000-01-01 00:01:30 2.0
9225 2000-01-01 00:02:00 2.0
9226 2000-01-01 00:02:30 3.0
9227 2000-01-01 00:03:00 3.0
9228 """
9229 from pandas.core.resample import asfreq
9230
9231 return asfreq(
9232 self,
9233 freq,
9234 method=method,
9235 how=how,
9236 normalize=normalize,
9237 fill_value=fill_value,
9238 )
9239
9240 @final
9241 def at_time(self, time, asof: bool_t = False, axis: Axis | None = None) -> Self:
9242 """
        Select values at a particular time of day (e.g., 9:30 AM).
9244
9245 Parameters
9246 ----------
9247 time : datetime.time or str
9248 The values to select.
9249 axis : {0 or 'index', 1 or 'columns'}, default 0
9250 For `Series` this parameter is unused and defaults to 0.
9251
9252 Returns
9253 -------
9254 Series or DataFrame
9255
9256 Raises
9257 ------
9258 TypeError
9259 If the index is not a :class:`DatetimeIndex`
9260
9261 See Also
9262 --------
9263 between_time : Select values between particular times of the day.
9264 first : Select initial periods of time series based on a date offset.
9265 last : Select final periods of time series based on a date offset.
9266 DatetimeIndex.indexer_at_time : Get just the index locations for
9267 values at particular time of the day.
9268
9269 Examples
9270 --------
9271 >>> i = pd.date_range('2018-04-09', periods=4, freq='12h')
9272 >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
9273 >>> ts
9274 A
9275 2018-04-09 00:00:00 1
9276 2018-04-09 12:00:00 2
9277 2018-04-10 00:00:00 3
9278 2018-04-10 12:00:00 4
9279
9280 >>> ts.at_time('12:00')
9281 A
9282 2018-04-09 12:00:00 2
9283 2018-04-10 12:00:00 4
9284 """
9285 if axis is None:
9286 axis = 0
9287 axis = self._get_axis_number(axis)
9288
9289 index = self._get_axis(axis)
9290
9291 if not isinstance(index, DatetimeIndex):
9292 raise TypeError("Index must be DatetimeIndex")
9293
9294 indexer = index.indexer_at_time(time, asof=asof)
9295 return self._take_with_is_copy(indexer, axis=axis)
9296
9297 @final
9298 def between_time(
9299 self,
9300 start_time,
9301 end_time,
9302 inclusive: IntervalClosedType = "both",
9303 axis: Axis | None = None,
9304 ) -> Self:
9305 """
9306 Select values between particular times of the day (e.g., 9:00-9:30 AM).
9307
9308 By setting ``start_time`` to be later than ``end_time``,
9309 you can get the times that are *not* between the two times.
9310
9311 Parameters
9312 ----------
9313 start_time : datetime.time or str
9314 Initial time as a time filter limit.
9315 end_time : datetime.time or str
9316 End time as a time filter limit.
9317 inclusive : {"both", "neither", "left", "right"}, default "both"
9318 Include boundaries; whether to set each bound as closed or open.
9319 axis : {0 or 'index', 1 or 'columns'}, default 0
            Determine the time range on the index or columns.
9321 For `Series` this parameter is unused and defaults to 0.
9322
9323 Returns
9324 -------
9325 Series or DataFrame
9326 Data from the original object filtered to the specified dates range.
9327
9328 Raises
9329 ------
9330 TypeError
9331 If the index is not a :class:`DatetimeIndex`
9332
9333 See Also
9334 --------
9335 at_time : Select values at a particular time of the day.
9336 first : Select initial periods of time series based on a date offset.
9337 last : Select final periods of time series based on a date offset.
9338 DatetimeIndex.indexer_between_time : Get just the index locations for
9339 values between particular times of the day.
9340
9341 Examples
9342 --------
9343 >>> i = pd.date_range('2018-04-09', periods=4, freq='1D20min')
9344 >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
9345 >>> ts
9346 A
9347 2018-04-09 00:00:00 1
9348 2018-04-10 00:20:00 2
9349 2018-04-11 00:40:00 3
9350 2018-04-12 01:00:00 4
9351
9352 >>> ts.between_time('0:15', '0:45')
9353 A
9354 2018-04-10 00:20:00 2
9355 2018-04-11 00:40:00 3
9356
9357 You get the times that are *not* between two times by setting
9358 ``start_time`` later than ``end_time``:
9359
9360 >>> ts.between_time('0:45', '0:15')
9361 A
9362 2018-04-09 00:00:00 1
9363 2018-04-12 01:00:00 4
9364 """
9365 if axis is None:
9366 axis = 0
9367 axis = self._get_axis_number(axis)
9368
9369 index = self._get_axis(axis)
9370 if not isinstance(index, DatetimeIndex):
9371 raise TypeError("Index must be DatetimeIndex")
9372
9373 left_inclusive, right_inclusive = validate_inclusive(inclusive)
9374 indexer = index.indexer_between_time(
9375 start_time,
9376 end_time,
9377 include_start=left_inclusive,
9378 include_end=right_inclusive,
9379 )
9380 return self._take_with_is_copy(indexer, axis=axis)
9381
9382 @final
9383 @doc(klass=_shared_doc_kwargs["klass"])
9384 def resample(
9385 self,
9386 rule,
9387 axis: Axis | lib.NoDefault = lib.no_default,
9388 closed: Literal["right", "left"] | None = None,
9389 label: Literal["right", "left"] | None = None,
9390 convention: Literal["start", "end", "s", "e"] | lib.NoDefault = lib.no_default,
9391 kind: Literal["timestamp", "period"] | None | lib.NoDefault = lib.no_default,
9392 on: Level | None = None,
9393 level: Level | None = None,
9394 origin: str | TimestampConvertibleTypes = "start_day",
9395 offset: TimedeltaConvertibleTypes | None = None,
9396 group_keys: bool_t = False,
9397 ) -> Resampler:
9398 """
9399 Resample time-series data.
9400
9401 Convenience method for frequency conversion and resampling of time series.
9402 The object must have a datetime-like index (`DatetimeIndex`, `PeriodIndex`,
9403 or `TimedeltaIndex`), or the caller must pass the label of a datetime-like
9404 series/index to the ``on``/``level`` keyword parameter.
9405
9406 Parameters
9407 ----------
9408 rule : DateOffset, Timedelta or str
9409 The offset string or object representing target conversion.
9410 axis : {{0 or 'index', 1 or 'columns'}}, default 0
Which axis to use for up- or down-sampling. For `Series` this parameter
is unused and defaults to 0. The index of the chosen axis must be a
`DatetimeIndex`, `TimedeltaIndex` or `PeriodIndex`.
9414
9415 .. deprecated:: 2.0.0
9416 Use frame.T.resample(...) instead.
9417 closed : {{'right', 'left'}}, default None
9418 Which side of bin interval is closed. The default is 'left'
for all frequency offsets except for 'ME', 'YE', 'QE', 'BME',
'BYE', 'BQE', and 'W', which all have a default of 'right'.
9421 label : {{'right', 'left'}}, default None
9422 Which bin edge label to label bucket with. The default is 'left'
for all frequency offsets except for 'ME', 'YE', 'QE', 'BME',
'BYE', 'BQE', and 'W', which all have a default of 'right'.
9425 convention : {{'start', 'end', 's', 'e'}}, default 'start'
9426 For `PeriodIndex` only, controls whether to use the start or
9427 end of `rule`.
9428
9429 .. deprecated:: 2.2.0
9430 Convert PeriodIndex to DatetimeIndex before resampling instead.
9431 kind : {{'timestamp', 'period'}}, optional, default None
9432 Pass 'timestamp' to convert the resulting index to a
`DatetimeIndex` or 'period' to convert it to a `PeriodIndex`.
9434 By default the input representation is retained.
9435
9436 .. deprecated:: 2.2.0
9437 Convert index to desired type explicitly instead.
9438
9439 on : str, optional
9440 For a DataFrame, column to use instead of index for resampling.
9441 Column must be datetime-like.
9442 level : str or int, optional
9443 For a MultiIndex, level (name or number) to use for
9444 resampling. `level` must be datetime-like.
9445 origin : Timestamp or str, default 'start_day'
9446 The timestamp on which to adjust the grouping. The timezone of origin
9447 must match the timezone of the index.
9448 If string, must be one of the following:
9449
9450 - 'epoch': `origin` is 1970-01-01
9451 - 'start': `origin` is the first value of the timeseries
9452 - 'start_day': `origin` is the first day at midnight of the timeseries
9453
9454 - 'end': `origin` is the last value of the timeseries
9455 - 'end_day': `origin` is the ceiling midnight of the last day
9456
9457 .. versionadded:: 1.3.0
9458
9459 .. note::
9460
9461 Only takes effect for Tick-frequencies (i.e. fixed frequencies like
9462 days, hours, and minutes, rather than months or quarters).
offset : Timedelta or str, default None
9464 An offset timedelta added to the origin.
9465
9466 group_keys : bool, default False
9467 Whether to include the group keys in the result index when using
9468 ``.apply()`` on the resampled object.
9469
9470 .. versionadded:: 1.5.0
9471
9472 Not specifying ``group_keys`` will retain values-dependent behavior
9473 from pandas 1.4 and earlier (see :ref:`pandas 1.5.0 Release notes
9474 <whatsnew_150.enhancements.resample_group_keys>` for examples).
9475
9476 .. versionchanged:: 2.0.0
9477
9478 ``group_keys`` now defaults to ``False``.
9479
9480 Returns
9481 -------
9482 pandas.api.typing.Resampler
9483 :class:`~pandas.core.Resampler` object.
9484
9485 See Also
9486 --------
9487 Series.resample : Resample a Series.
9488 DataFrame.resample : Resample a DataFrame.
9489 groupby : Group {klass} by mapping, function, label, or list of labels.
9490 asfreq : Reindex a {klass} with the given frequency without grouping.
9491
9492 Notes
9493 -----
9494 See the `user guide
9495 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#resampling>`__
9496 for more.
9497
9498 To learn more about the offset strings, please see `this link
9499 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects>`__.
9500
9501 Examples
9502 --------
Start by creating a series with 9 one-minute timestamps.
9504
9505 >>> index = pd.date_range('1/1/2000', periods=9, freq='min')
9506 >>> series = pd.Series(range(9), index=index)
9507 >>> series
9508 2000-01-01 00:00:00 0
9509 2000-01-01 00:01:00 1
9510 2000-01-01 00:02:00 2
9511 2000-01-01 00:03:00 3
9512 2000-01-01 00:04:00 4
9513 2000-01-01 00:05:00 5
9514 2000-01-01 00:06:00 6
9515 2000-01-01 00:07:00 7
9516 2000-01-01 00:08:00 8
9517 Freq: min, dtype: int64
9518
9519 Downsample the series into 3 minute bins and sum the values
9520 of the timestamps falling into a bin.
9521
9522 >>> series.resample('3min').sum()
9523 2000-01-01 00:00:00 3
9524 2000-01-01 00:03:00 12
9525 2000-01-01 00:06:00 21
9526 Freq: 3min, dtype: int64
9527
Downsample the series into 3 minute bins as above, but label each
bin using the right edge instead of the left. Note that the value in
the bucket used as the label is not included in the bucket it labels.
For example, in the original series the bucket ``2000-01-01 00:03:00``
contains the value 3, but the summed value in the resampled bucket
with the label ``2000-01-01 00:03:00`` does not include 3 (if it did,
the summed value would be 6, not 3).
9535
9536 >>> series.resample('3min', label='right').sum()
9537 2000-01-01 00:03:00 3
9538 2000-01-01 00:06:00 12
9539 2000-01-01 00:09:00 21
9540 Freq: 3min, dtype: int64
9541
To include this value, close the right side of the bin interval,
as shown below.
9544
9545 >>> series.resample('3min', label='right', closed='right').sum()
9546 2000-01-01 00:00:00 0
9547 2000-01-01 00:03:00 6
9548 2000-01-01 00:06:00 15
9549 2000-01-01 00:09:00 15
9550 Freq: 3min, dtype: int64
9551
9552 Upsample the series into 30 second bins.
9553
9554 >>> series.resample('30s').asfreq()[0:5] # Select first 5 rows
9555 2000-01-01 00:00:00 0.0
9556 2000-01-01 00:00:30 NaN
9557 2000-01-01 00:01:00 1.0
9558 2000-01-01 00:01:30 NaN
9559 2000-01-01 00:02:00 2.0
9560 Freq: 30s, dtype: float64
9561
9562 Upsample the series into 30 second bins and fill the ``NaN``
9563 values using the ``ffill`` method.
9564
9565 >>> series.resample('30s').ffill()[0:5]
9566 2000-01-01 00:00:00 0
9567 2000-01-01 00:00:30 0
9568 2000-01-01 00:01:00 1
9569 2000-01-01 00:01:30 1
9570 2000-01-01 00:02:00 2
9571 Freq: 30s, dtype: int64
9572
9573 Upsample the series into 30 second bins and fill the
9574 ``NaN`` values using the ``bfill`` method.
9575
9576 >>> series.resample('30s').bfill()[0:5]
9577 2000-01-01 00:00:00 0
9578 2000-01-01 00:00:30 1
9579 2000-01-01 00:01:00 1
9580 2000-01-01 00:01:30 2
9581 2000-01-01 00:02:00 2
9582 Freq: 30s, dtype: int64
9583
Pass a custom function via ``apply``:
9585
9586 >>> def custom_resampler(arraylike):
9587 ... return np.sum(arraylike) + 5
9588 ...
9589 >>> series.resample('3min').apply(custom_resampler)
9590 2000-01-01 00:00:00 8
9591 2000-01-01 00:03:00 17
9592 2000-01-01 00:06:00 26
9593 Freq: 3min, dtype: int64
9594
9595 For DataFrame objects, the keyword `on` can be used to specify the
9596 column instead of the index for resampling.
9597
9598 >>> d = {{'price': [10, 11, 9, 13, 14, 18, 17, 19],
9599 ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}}
9600 >>> df = pd.DataFrame(d)
9601 >>> df['week_starting'] = pd.date_range('01/01/2018',
9602 ... periods=8,
9603 ... freq='W')
9604 >>> df
9605 price volume week_starting
9606 0 10 50 2018-01-07
9607 1 11 60 2018-01-14
9608 2 9 40 2018-01-21
9609 3 13 100 2018-01-28
9610 4 14 50 2018-02-04
9611 5 18 100 2018-02-11
9612 6 17 40 2018-02-18
9613 7 19 50 2018-02-25
9614 >>> df.resample('ME', on='week_starting').mean()
9615 price volume
9616 week_starting
9617 2018-01-31 10.75 62.5
9618 2018-02-28 17.00 60.0
9619
9620 For a DataFrame with MultiIndex, the keyword `level` can be used to
9621 specify on which level the resampling needs to take place.
9622
9623 >>> days = pd.date_range('1/1/2000', periods=4, freq='D')
9624 >>> d2 = {{'price': [10, 11, 9, 13, 14, 18, 17, 19],
9625 ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}}
9626 >>> df2 = pd.DataFrame(
9627 ... d2,
9628 ... index=pd.MultiIndex.from_product(
9629 ... [days, ['morning', 'afternoon']]
9630 ... )
9631 ... )
9632 >>> df2
9633 price volume
9634 2000-01-01 morning 10 50
9635 afternoon 11 60
9636 2000-01-02 morning 9 40
9637 afternoon 13 100
9638 2000-01-03 morning 14 50
9639 afternoon 18 100
9640 2000-01-04 morning 17 40
9641 afternoon 19 50
9642 >>> df2.resample('D', level=0).sum()
9643 price volume
9644 2000-01-01 21 110
9645 2000-01-02 22 140
9646 2000-01-03 32 150
9647 2000-01-04 36 90
9648
9649 If you want to adjust the start of the bins based on a fixed timestamp:
9650
9651 >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00'
9652 >>> rng = pd.date_range(start, end, freq='7min')
9653 >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
9654 >>> ts
9655 2000-10-01 23:30:00 0
9656 2000-10-01 23:37:00 3
9657 2000-10-01 23:44:00 6
9658 2000-10-01 23:51:00 9
9659 2000-10-01 23:58:00 12
9660 2000-10-02 00:05:00 15
9661 2000-10-02 00:12:00 18
9662 2000-10-02 00:19:00 21
9663 2000-10-02 00:26:00 24
9664 Freq: 7min, dtype: int64
9665
9666 >>> ts.resample('17min').sum()
9667 2000-10-01 23:14:00 0
9668 2000-10-01 23:31:00 9
9669 2000-10-01 23:48:00 21
9670 2000-10-02 00:05:00 54
9671 2000-10-02 00:22:00 24
9672 Freq: 17min, dtype: int64
9673
9674 >>> ts.resample('17min', origin='epoch').sum()
9675 2000-10-01 23:18:00 0
9676 2000-10-01 23:35:00 18
9677 2000-10-01 23:52:00 27
9678 2000-10-02 00:09:00 39
9679 2000-10-02 00:26:00 24
9680 Freq: 17min, dtype: int64
9681
9682 >>> ts.resample('17min', origin='2000-01-01').sum()
9683 2000-10-01 23:24:00 3
9684 2000-10-01 23:41:00 15
9685 2000-10-01 23:58:00 45
9686 2000-10-02 00:15:00 45
9687 Freq: 17min, dtype: int64
9688
9689 If you want to adjust the start of the bins with an `offset` Timedelta, the two
9690 following lines are equivalent:
9691
9692 >>> ts.resample('17min', origin='start').sum()
9693 2000-10-01 23:30:00 9
9694 2000-10-01 23:47:00 21
9695 2000-10-02 00:04:00 54
9696 2000-10-02 00:21:00 24
9697 Freq: 17min, dtype: int64
9698
9699 >>> ts.resample('17min', offset='23h30min').sum()
9700 2000-10-01 23:30:00 9
9701 2000-10-01 23:47:00 21
9702 2000-10-02 00:04:00 54
9703 2000-10-02 00:21:00 24
9704 Freq: 17min, dtype: int64
9705
9706 If you want to take the largest Timestamp as the end of the bins:
9707
9708 >>> ts.resample('17min', origin='end').sum()
9709 2000-10-01 23:35:00 0
9710 2000-10-01 23:52:00 18
9711 2000-10-02 00:09:00 27
9712 2000-10-02 00:26:00 63
9713 Freq: 17min, dtype: int64
9714
In contrast with `start_day`, you can use `end_day` to take the ceiling
midnight of the largest Timestamp as the end of the bins and drop the bins
not containing data:
9718
9719 >>> ts.resample('17min', origin='end_day').sum()
9720 2000-10-01 23:38:00 3
9721 2000-10-01 23:55:00 15
9722 2000-10-02 00:12:00 45
9723 2000-10-02 00:29:00 45
9724 Freq: 17min, dtype: int64
9725 """
9726 from pandas.core.resample import get_resampler
9727
9728 if axis is not lib.no_default:
9729 axis = self._get_axis_number(axis)
9730 if axis == 1:
9731 warnings.warn(
9732 "DataFrame.resample with axis=1 is deprecated. Do "
9733 "`frame.T.resample(...)` without axis instead.",
9734 FutureWarning,
9735 stacklevel=find_stack_level(),
9736 )
9737 else:
9738 warnings.warn(
9739 f"The 'axis' keyword in {type(self).__name__}.resample is "
9740 "deprecated and will be removed in a future version.",
9741 FutureWarning,
9742 stacklevel=find_stack_level(),
9743 )
9744 else:
9745 axis = 0
9746
9747 if kind is not lib.no_default:
9748 # GH#55895
9749 warnings.warn(
9750 f"The 'kind' keyword in {type(self).__name__}.resample is "
9751 "deprecated and will be removed in a future version. "
9752 "Explicitly cast the index to the desired type instead",
9753 FutureWarning,
9754 stacklevel=find_stack_level(),
9755 )
9756 else:
9757 kind = None
9758
9759 if convention is not lib.no_default:
9760 warnings.warn(
9761 f"The 'convention' keyword in {type(self).__name__}.resample is "
9762 "deprecated and will be removed in a future version. "
9763 "Explicitly cast PeriodIndex to DatetimeIndex before resampling "
9764 "instead.",
9765 FutureWarning,
9766 stacklevel=find_stack_level(),
9767 )
9768 else:
9769 convention = "start"
9770
9771 return get_resampler(
9772 cast("Series | DataFrame", self),
9773 freq=rule,
9774 label=label,
9775 closed=closed,
9776 axis=axis,
9777 kind=kind,
9778 convention=convention,
9779 key=on,
9780 level=level,
9781 origin=origin,
9782 offset=offset,
9783 group_keys=group_keys,
9784 )
9785
9786 @final
9787 def first(self, offset) -> Self:
9788 """
9789 Select initial periods of time series data based on a date offset.
9790
9791 .. deprecated:: 2.1
9792 :meth:`.first` is deprecated and will be removed in a future version.
9793 Please create a mask and filter using `.loc` instead.
9794
9795 For a DataFrame with a sorted DatetimeIndex, this function can
9796 select the first few rows based on a date offset.
9797
9798 Parameters
9799 ----------
9800 offset : str, DateOffset or dateutil.relativedelta
The offset length of the data that will be selected. For instance,
'1ME' will select all the rows whose index falls within the first month.
9803
9804 Returns
9805 -------
9806 Series or DataFrame
9807 A subset of the caller.
9808
9809 Raises
9810 ------
9811 TypeError
9812 If the index is not a :class:`DatetimeIndex`
9813
9814 See Also
9815 --------
9816 last : Select final periods of time series based on a date offset.
9817 at_time : Select values at a particular time of the day.
9818 between_time : Select values between particular times of the day.
9819
9820 Examples
9821 --------
9822 >>> i = pd.date_range('2018-04-09', periods=4, freq='2D')
9823 >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
9824 >>> ts
9825 A
9826 2018-04-09 1
9827 2018-04-11 2
9828 2018-04-13 3
9829 2018-04-15 4
9830
9831 Get the rows for the first 3 days:
9832
9833 >>> ts.first('3D')
9834 A
9835 2018-04-09 1
9836 2018-04-11 2
9837
Notice that the data for the first 3 calendar days was returned, not
the first 3 days observed in the dataset, and therefore data for
2018-04-13 was not returned.
9841 """
9842 warnings.warn(
9843 "first is deprecated and will be removed in a future version. "
9844 "Please create a mask and filter using `.loc` instead",
9845 FutureWarning,
9846 stacklevel=find_stack_level(),
9847 )
9848 if not isinstance(self.index, DatetimeIndex):
9849 raise TypeError("'first' only supports a DatetimeIndex index")
9850
9851 if len(self.index) == 0:
9852 return self.copy(deep=False)
9853
9854 offset = to_offset(offset)
9855 if not isinstance(offset, Tick) and offset.is_on_offset(self.index[0]):
9856 # GH#29623 if first value is end of period, remove offset with n = 1
9857 # before adding the real offset
9858 end_date = end = self.index[0] - offset.base + offset
9859 else:
9860 end_date = end = self.index[0] + offset
9861
9862 # Tick-like, e.g. 3 weeks
9863 if isinstance(offset, Tick) and end_date in self.index:
9864 end = self.index.searchsorted(end_date, side="left")
9865 return self.iloc[:end]
9866
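# unlike the positional .iloc slice above, label-based .loc slicing
# is inclusive of `end`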
9867 return self.loc[:end]
9868
9869 @final
9870 def last(self, offset) -> Self:
9871 """
9872 Select final periods of time series data based on a date offset.
9873
9874 .. deprecated:: 2.1
9875 :meth:`.last` is deprecated and will be removed in a future version.
9876 Please create a mask and filter using `.loc` instead.
9877
9878 For a DataFrame with a sorted DatetimeIndex, this function
9879 selects the last few rows based on a date offset.
9880
9881 Parameters
9882 ----------
9883 offset : str, DateOffset, dateutil.relativedelta
The offset length of the data that will be selected. For instance,
'3D' will select all the rows whose index falls within the last 3 days.
9886
9887 Returns
9888 -------
9889 Series or DataFrame
9890 A subset of the caller.
9891
9892 Raises
9893 ------
9894 TypeError
9895 If the index is not a :class:`DatetimeIndex`
9896
9897 See Also
9898 --------
9899 first : Select initial periods of time series based on a date offset.
9900 at_time : Select values at a particular time of the day.
9901 between_time : Select values between particular times of the day.
9902
9903 Notes
9904 -----
9905 .. deprecated:: 2.1.0
9906 Please create a mask and filter using `.loc` instead
9907
9908 Examples
9909 --------
9910 >>> i = pd.date_range('2018-04-09', periods=4, freq='2D')
9911 >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
9912 >>> ts
9913 A
9914 2018-04-09 1
9915 2018-04-11 2
9916 2018-04-13 3
9917 2018-04-15 4
9918
9919 Get the rows for the last 3 days:
9920
9921 >>> ts.last('3D') # doctest: +SKIP
9922 A
9923 2018-04-13 3
9924 2018-04-15 4
9925
Notice that the data for the last 3 calendar days was returned, not
the last 3 observed days in the dataset, and therefore data for
2018-04-11 was not returned.
9929 """
9930 warnings.warn(
9931 "last is deprecated and will be removed in a future version. "
9932 "Please create a mask and filter using `.loc` instead",
9933 FutureWarning,
9934 stacklevel=find_stack_level(),
9935 )
9936
9937 if not isinstance(self.index, DatetimeIndex):
9938 raise TypeError("'last' only supports a DatetimeIndex index")
9939
9940 if len(self.index) == 0:
9941 return self.copy(deep=False)
9942
9943 offset = to_offset(offset)
9944
9945 start_date = self.index[-1] - offset
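# side="right" excludes rows falling exactly on start_date, keeping
# only rows strictly inside the trailing window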
9946 start = self.index.searchsorted(start_date, side="right")
9947 return self.iloc[start:]
9948
9949 @final
9950 def rank(
9951 self,
9952 axis: Axis = 0,
9953 method: Literal["average", "min", "max", "first", "dense"] = "average",
9954 numeric_only: bool_t = False,
9955 na_option: Literal["keep", "top", "bottom"] = "keep",
9956 ascending: bool_t = True,
9957 pct: bool_t = False,
9958 ) -> Self:
9959 """
9960 Compute numerical data ranks (1 through n) along axis.
9961
9962 By default, equal values are assigned a rank that is the average of the
9963 ranks of those values.
9964
9965 Parameters
9966 ----------
9967 axis : {0 or 'index', 1 or 'columns'}, default 0
9968 Index to direct ranking.
9969 For `Series` this parameter is unused and defaults to 0.
9970 method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
9971 How to rank the group of records that have the same value (i.e. ties):
9972
9973 * average: average rank of the group
9974 * min: lowest rank in the group
9975 * max: highest rank in the group
9976 * first: ranks assigned in order they appear in the array
9977 * dense: like 'min', but rank always increases by 1 between groups.
9978
9979 numeric_only : bool, default False
9980 For DataFrame objects, rank only numeric columns if set to True.
9981
9982 .. versionchanged:: 2.0.0
9983 The default value of ``numeric_only`` is now ``False``.
9984
9985 na_option : {'keep', 'top', 'bottom'}, default 'keep'
9986 How to rank NaN values:
9987
9988 * keep: assign NaN rank to NaN values
9989 * top: assign lowest rank to NaN values
9990 * bottom: assign highest rank to NaN values
9991
9992 ascending : bool, default True
9993 Whether or not the elements should be ranked in ascending order.
9994 pct : bool, default False
9995 Whether or not to display the returned rankings in percentile
9996 form.
9997
9998 Returns
9999 -------
10000 same type as caller
10001 Return a Series or DataFrame with data ranks as values.
10002
10003 See Also
10004 --------
10005 core.groupby.DataFrameGroupBy.rank : Rank of values within each group.
10006 core.groupby.SeriesGroupBy.rank : Rank of values within each group.
10007
10008 Examples
10009 --------
10010 >>> df = pd.DataFrame(data={'Animal': ['cat', 'penguin', 'dog',
10011 ... 'spider', 'snake'],
10012 ... 'Number_legs': [4, 2, 4, 8, np.nan]})
10013 >>> df
10014 Animal Number_legs
10015 0 cat 4.0
10016 1 penguin 2.0
10017 2 dog 4.0
10018 3 spider 8.0
10019 4 snake NaN
10020
10021 Ties are assigned the mean of the ranks (by default) for the group.
10022
10023 >>> s = pd.Series(range(5), index=list("abcde"))
10024 >>> s["d"] = s["b"]
10025 >>> s.rank()
10026 a 1.0
10027 b 2.5
10028 c 4.0
10029 d 2.5
10030 e 5.0
10031 dtype: float64
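
With ``method='dense'``, tied values share a rank and the rank
increases by exactly 1 between groups of ties:

>>> s.rank(method='dense')
a    1.0
b    2.0
c    3.0
d    2.0
e    4.0
dtype: float64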
10032
10033 The following example shows how the method behaves with the above
10034 parameters:
10035
10036 * default_rank: this is the default behaviour obtained without using
10037 any parameter.
* max_rank: with ``method='max'``, the records that have the same
  values are ranked using the highest rank (e.g., since 'cat' and 'dog'
  are both in the 2nd and 3rd position, rank 3 is assigned).
* NA_bottom: with ``na_option='bottom'``, records with NaN values are
  placed at the bottom of the ranking.
10043 * pct_rank: when setting ``pct = True``, the ranking is expressed as
10044 percentile rank.
10045
10046 >>> df['default_rank'] = df['Number_legs'].rank()
10047 >>> df['max_rank'] = df['Number_legs'].rank(method='max')
10048 >>> df['NA_bottom'] = df['Number_legs'].rank(na_option='bottom')
10049 >>> df['pct_rank'] = df['Number_legs'].rank(pct=True)
10050 >>> df
10051 Animal Number_legs default_rank max_rank NA_bottom pct_rank
10052 0 cat 4.0 2.5 3.0 2.5 0.625
10053 1 penguin 2.0 1.0 1.0 1.0 0.250
10054 2 dog 4.0 2.5 3.0 2.5 0.625
10055 3 spider 8.0 4.0 4.0 4.0 1.000
10056 4 snake NaN NaN NaN 5.0 NaN
10057 """
10058 axis_int = self._get_axis_number(axis)
10059
10060 if na_option not in {"keep", "top", "bottom"}:
10061 msg = "na_option must be one of 'keep', 'top', or 'bottom'"
10062 raise ValueError(msg)
10063
10064 def ranker(data):
10065 if data.ndim == 2:
10066 # i.e. DataFrame, we cast to ndarray
10067 values = data.values
10068 else:
10069 # i.e. Series, can dispatch to EA
10070 values = data._values
10071
10072 if isinstance(values, ExtensionArray):
10073 ranks = values._rank(
10074 axis=axis_int,
10075 method=method,
10076 ascending=ascending,
10077 na_option=na_option,
10078 pct=pct,
10079 )
10080 else:
10081 ranks = algos.rank(
10082 values,
10083 axis=axis_int,
10084 method=method,
10085 ascending=ascending,
10086 na_option=na_option,
10087 pct=pct,
10088 )
10089
10090 ranks_obj = self._constructor(ranks, **data._construct_axes_dict())
10091 return ranks_obj.__finalize__(self, method="rank")
10092
10093 if numeric_only:
10094 if self.ndim == 1 and not is_numeric_dtype(self.dtype):
10095 # GH#47500
10096 raise TypeError(
10097 "Series.rank does not allow numeric_only=True with "
10098 "non-numeric dtype."
10099 )
10100 data = self._get_numeric_data()
10101 else:
10102 data = self
10103
10104 return ranker(data)
10105
10106 @doc(_shared_docs["compare"], klass=_shared_doc_kwargs["klass"])
10107 def compare(
10108 self,
10109 other,
10110 align_axis: Axis = 1,
10111 keep_shape: bool_t = False,
10112 keep_equal: bool_t = False,
10113 result_names: Suffixes = ("self", "other"),
10114 ):
10115 if type(self) is not type(other):
10116 cls_self, cls_other = type(self).__name__, type(other).__name__
10117 raise TypeError(
10118 f"can only compare '{cls_self}' (not '{cls_other}') with '{cls_self}'"
10119 )
10120
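# mask of positions where self and other differ; two NaNs compare
# equal here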
10121 mask = ~((self == other) | (self.isna() & other.isna()))
10122 mask.fillna(True, inplace=True)
10123
10124 if not keep_equal:
10125 self = self.where(mask)
10126 other = other.where(mask)
10127
10128 if not keep_shape:
10129 if isinstance(self, ABCDataFrame):
10130 cmask = mask.any()
10131 rmask = mask.any(axis=1)
10132 self = self.loc[rmask, cmask]
10133 other = other.loc[rmask, cmask]
10134 else:
10135 self = self[mask]
10136 other = other[mask]
10137 if not isinstance(result_names, tuple):
10138 raise TypeError(
10139 f"Passing 'result_names' as a {type(result_names)} is not "
10140 "supported. Provide 'result_names' as a tuple instead."
10141 )
10142
10143 if align_axis in (1, "columns"): # This is needed for Series
10144 axis = 1
10145 else:
10146 axis = self._get_axis_number(align_axis)
10147
10148 # error: List item 0 has incompatible type "NDFrame"; expected
10149 # "Union[Series, DataFrame]"
10150 diff = concat(
10151 [self, other], # type: ignore[list-item]
10152 axis=axis,
10153 keys=result_names,
10154 )
10155
10156 if axis >= self.ndim:
10157 # No need to reorganize data if stacking on new axis
10158 # This currently applies for stacking two Series on columns
10159 return diff
10160
10161 ax = diff._get_axis(axis)
10162 ax_names = np.array(ax.names)
10163
10164 # set index names to positions to avoid confusion
10165 ax.names = np.arange(len(ax_names))
10166
10167 # bring self-other to inner level
10168 order = list(range(1, ax.nlevels)) + [0]
10169 if isinstance(diff, ABCDataFrame):
10170 diff = diff.reorder_levels(order, axis=axis)
10171 else:
10172 diff = diff.reorder_levels(order)
10173
10174 # restore the index names in order
10175 diff._get_axis(axis=axis).names = ax_names[order]
10176
10177 # reorder axis to keep things organized
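# interleave self/other so each compared pair ends up adjacent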
10178 indices = (
10179 np.arange(diff.shape[axis]).reshape([2, diff.shape[axis] // 2]).T.flatten()
10180 )
10181 diff = diff.take(indices, axis=axis)
10182
10183 return diff
10184
10185 @final
10186 @doc(
10187 klass=_shared_doc_kwargs["klass"],
10188 axes_single_arg=_shared_doc_kwargs["axes_single_arg"],
10189 )
10190 def align(
10191 self,
10192 other: NDFrameT,
10193 join: AlignJoin = "outer",
10194 axis: Axis | None = None,
10195 level: Level | None = None,
10196 copy: bool_t | None = None,
10197 fill_value: Hashable | None = None,
10198 method: FillnaOptions | None | lib.NoDefault = lib.no_default,
10199 limit: int | None | lib.NoDefault = lib.no_default,
10200 fill_axis: Axis | lib.NoDefault = lib.no_default,
10201 broadcast_axis: Axis | None | lib.NoDefault = lib.no_default,
10202 ) -> tuple[Self, NDFrameT]:
10203 """
10204 Align two objects on their axes with the specified join method.
10205
10206 Join method is specified for each axis Index.
10207
10208 Parameters
10209 ----------
10210 other : DataFrame or Series
10211 join : {{'outer', 'inner', 'left', 'right'}}, default 'outer'
10212 Type of alignment to be performed.
10213
10214 * left: use only keys from left frame, preserve key order.
10215 * right: use only keys from right frame, preserve key order.
10216 * outer: use union of keys from both frames, sort keys lexicographically.
10217 * inner: use intersection of keys from both frames,
10218 preserve the order of the left keys.
10219
10220 axis : allowed axis of the other object, default None
10221 Align on index (0), columns (1), or both (None).
10222 level : int or level name, default None
10223 Broadcast across a level, matching Index values on the
10224 passed MultiIndex level.
10225 copy : bool, default True
10226 Always returns new objects. If copy=False and no reindexing is
10227 required then original objects are returned.
10228
10229 .. note::
10230 The `copy` keyword will change behavior in pandas 3.0.
10231 `Copy-on-Write
10232 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
10233 will be enabled by default, which means that all methods with a
10234 `copy` keyword will use a lazy copy mechanism to defer the copy and
10235 ignore the `copy` keyword. The `copy` keyword will be removed in a
10236 future version of pandas.
10237
You can already get the future behavior and improvements by
enabling copy-on-write: ``pd.options.mode.copy_on_write = True``.
10240 fill_value : scalar, default np.nan
10241 Value to use for missing values. Defaults to NaN, but can be any
10242 "compatible" value.
10243 method : {{'backfill', 'bfill', 'pad', 'ffill', None}}, default None
10244 Method to use for filling holes in reindexed Series:
10245
10246 - pad / ffill: propagate last valid observation forward to next valid.
10247 - backfill / bfill: use NEXT valid observation to fill gap.
10248
10249 .. deprecated:: 2.1
10250
10251 limit : int, default None
10252 If method is specified, this is the maximum number of consecutive
10253 NaN values to forward/backward fill. In other words, if there is
10254 a gap with more than this number of consecutive NaNs, it will only
10255 be partially filled. If method is not specified, this is the
10256 maximum number of entries along the entire axis where NaNs will be
10257 filled. Must be greater than 0 if not None.
10258
10259 .. deprecated:: 2.1
10260
10261 fill_axis : {axes_single_arg}, default 0
10262 Filling axis, method and limit.
10263
10264 .. deprecated:: 2.1
10265
10266 broadcast_axis : {axes_single_arg}, default None
10267 Broadcast values along this axis, if aligning two objects of
10268 different dimensions.
10269
10270 .. deprecated:: 2.1
10271
10272 Returns
10273 -------
10274 tuple of ({klass}, type of other)
10275 Aligned objects.
10276
10277 Examples
10278 --------
10279 >>> df = pd.DataFrame(
10280 ... [[1, 2, 3, 4], [6, 7, 8, 9]], columns=["D", "B", "E", "A"], index=[1, 2]
10281 ... )
10282 >>> other = pd.DataFrame(
10283 ... [[10, 20, 30, 40], [60, 70, 80, 90], [600, 700, 800, 900]],
10284 ... columns=["A", "B", "C", "D"],
10285 ... index=[2, 3, 4],
10286 ... )
10287 >>> df
10288 D B E A
10289 1 1 2 3 4
10290 2 6 7 8 9
10291 >>> other
10292 A B C D
10293 2 10 20 30 40
10294 3 60 70 80 90
10295 4 600 700 800 900
10296
10297 Align on columns:
10298
10299 >>> left, right = df.align(other, join="outer", axis=1)
10300 >>> left
10301 A B C D E
10302 1 4 2 NaN 1 3
10303 2 9 7 NaN 6 8
10304 >>> right
10305 A B C D E
10306 2 10 20 30 40 NaN
10307 3 60 70 80 90 NaN
10308 4 600 700 800 900 NaN
10309
10310 We can also align on the index:
10311
10312 >>> left, right = df.align(other, join="outer", axis=0)
10313 >>> left
10314 D B E A
10315 1 1.0 2.0 3.0 4.0
10316 2 6.0 7.0 8.0 9.0
10317 3 NaN NaN NaN NaN
10318 4 NaN NaN NaN NaN
10319 >>> right
10320 A B C D
10321 1 NaN NaN NaN NaN
10322 2 10.0 20.0 30.0 40.0
10323 3 60.0 70.0 80.0 90.0
10324 4 600.0 700.0 800.0 900.0
10325
10326 Finally, the default `axis=None` will align on both index and columns:
10327
10328 >>> left, right = df.align(other, join="outer", axis=None)
10329 >>> left
10330 A B C D E
10331 1 4.0 2.0 NaN 1.0 3.0
10332 2 9.0 7.0 NaN 6.0 8.0
10333 3 NaN NaN NaN NaN NaN
10334 4 NaN NaN NaN NaN NaN
10335 >>> right
10336 A B C D E
10337 1 NaN NaN NaN NaN NaN
10338 2 10.0 20.0 30.0 40.0 NaN
10339 3 60.0 70.0 80.0 90.0 NaN
10340 4 600.0 700.0 800.0 900.0 NaN
10341 """
10342 if (
10343 method is not lib.no_default
10344 or limit is not lib.no_default
10345 or fill_axis is not lib.no_default
10346 ):
10347 # GH#51856
10348 warnings.warn(
10349 "The 'method', 'limit', and 'fill_axis' keywords in "
10350 f"{type(self).__name__}.align are deprecated and will be removed "
10351 "in a future version. Call fillna directly on the returned objects "
10352 "instead.",
10353 FutureWarning,
10354 stacklevel=find_stack_level(),
10355 )
10356 if fill_axis is lib.no_default:
10357 fill_axis = 0
10358 if method is lib.no_default:
10359 method = None
10360 if limit is lib.no_default:
10361 limit = None
10362
10363 if method is not None:
10364 method = clean_fill_method(method)
10365
10366 if broadcast_axis is not lib.no_default:
10367 # GH#51856
10368 # TODO(3.0): enforcing this deprecation will close GH#13194
10369 msg = (
10370 f"The 'broadcast_axis' keyword in {type(self).__name__}.align is "
10371 "deprecated and will be removed in a future version."
10372 )
10373 if broadcast_axis is not None:
10374 if self.ndim == 1 and other.ndim == 2:
10375 msg += (
10376 " Use left = DataFrame({col: left for col in right.columns}, "
10377 "index=right.index) before calling `left.align(right)` instead."
10378 )
10379 elif self.ndim == 2 and other.ndim == 1:
10380 msg += (
10381 " Use right = DataFrame({col: right for col in left.columns}, "
10382 "index=left.index) before calling `left.align(right)` instead"
10383 )
10384 warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
10385 else:
10386 broadcast_axis = None
10387
10388 if broadcast_axis == 1 and self.ndim != other.ndim:
10389 if isinstance(self, ABCSeries):
10390 # this means other is a DataFrame, and we need to broadcast
10391 # self
10392 cons = self._constructor_expanddim
10393 df = cons(
10394 {c: self for c in other.columns}, **other._construct_axes_dict()
10395 )
10396 # error: Incompatible return value type (got "Tuple[DataFrame,
10397 # DataFrame]", expected "Tuple[Self, NDFrameT]")
10398 return df._align_frame( # type: ignore[return-value]
10399 other, # type: ignore[arg-type]
10400 join=join,
10401 axis=axis,
10402 level=level,
10403 copy=copy,
10404 fill_value=fill_value,
10405 method=method,
10406 limit=limit,
10407 fill_axis=fill_axis,
10408 )[:2]
10409 elif isinstance(other, ABCSeries):
10410 # this means self is a DataFrame, and we need to broadcast
10411 # other
10412 cons = other._constructor_expanddim
10413 df = cons(
10414 {c: other for c in self.columns}, **self._construct_axes_dict()
10415 )
10416 # error: Incompatible return value type (got "Tuple[NDFrameT,
10417 # DataFrame]", expected "Tuple[Self, NDFrameT]")
10418 return self._align_frame( # type: ignore[return-value]
10419 df,
10420 join=join,
10421 axis=axis,
10422 level=level,
10423 copy=copy,
10424 fill_value=fill_value,
10425 method=method,
10426 limit=limit,
10427 fill_axis=fill_axis,
10428 )[:2]
10429
10430 _right: DataFrame | Series
10431 if axis is not None:
10432 axis = self._get_axis_number(axis)
10433 if isinstance(other, ABCDataFrame):
10434 left, _right, join_index = self._align_frame(
10435 other,
10436 join=join,
10437 axis=axis,
10438 level=level,
10439 copy=copy,
10440 fill_value=fill_value,
10441 method=method,
10442 limit=limit,
10443 fill_axis=fill_axis,
10444 )
10445
10446 elif isinstance(other, ABCSeries):
10447 left, _right, join_index = self._align_series(
10448 other,
10449 join=join,
10450 axis=axis,
10451 level=level,
10452 copy=copy,
10453 fill_value=fill_value,
10454 method=method,
10455 limit=limit,
10456 fill_axis=fill_axis,
10457 )
10458 else: # pragma: no cover
10459 raise TypeError(f"unsupported type: {type(other)}")
10460
10461 right = cast(NDFrameT, _right)
10462 if self.ndim == 1 or axis == 0:
10463 # If we are aligning timezone-aware DatetimeIndexes and the timezones
10464 # do not match, convert both to UTC.
10465 if isinstance(left.index.dtype, DatetimeTZDtype):
10466 if left.index.tz != right.index.tz:
10467 if join_index is not None:
10468 # GH#33671 copy to ensure we don't change the index on
10469 # our original Series
10470 left = left.copy(deep=False)
10471 right = right.copy(deep=False)
10472 left.index = join_index
10473 right.index = join_index
10474
10475 left = left.__finalize__(self)
10476 right = right.__finalize__(other)
10477 return left, right
10478
10479 @final
10480 def _align_frame(
10481 self,
10482 other: DataFrame,
10483 join: AlignJoin = "outer",
10484 axis: Axis | None = None,
10485 level=None,
10486 copy: bool_t | None = None,
10487 fill_value=None,
10488 method=None,
10489 limit: int | None = None,
10490 fill_axis: Axis = 0,
10491 ) -> tuple[Self, DataFrame, Index | None]:
10492 # defaults
10493 join_index, join_columns = None, None
10494 ilidx, iridx = None, None
10495 clidx, cridx = None, None
10496
10497 is_series = isinstance(self, ABCSeries)
10498
10499 if (axis is None or axis == 0) and not self.index.equals(other.index):
10500 join_index, ilidx, iridx = self.index.join(
10501 other.index, how=join, level=level, return_indexers=True
10502 )
10503
10504 if (
10505 (axis is None or axis == 1)
10506 and not is_series
10507 and not self.columns.equals(other.columns)
10508 ):
10509 join_columns, clidx, cridx = self.columns.join(
10510 other.columns, how=join, level=level, return_indexers=True
10511 )
10512
10513 if is_series:
10514 reindexers = {0: [join_index, ilidx]}
10515 else:
10516 reindexers = {0: [join_index, ilidx], 1: [join_columns, clidx]}
10517
10518 left = self._reindex_with_indexers(
10519 reindexers, copy=copy, fill_value=fill_value, allow_dups=True
10520 )
10521 # other must be always DataFrame
10522 right = other._reindex_with_indexers(
10523 {0: [join_index, iridx], 1: [join_columns, cridx]},
10524 copy=copy,
10525 fill_value=fill_value,
10526 allow_dups=True,
10527 )
10528
10529 if method is not None:
10530 left = left._pad_or_backfill(method, axis=fill_axis, limit=limit)
10531 right = right._pad_or_backfill(method, axis=fill_axis, limit=limit)
10532
10533 return left, right, join_index
10534
10535 @final
10536 def _align_series(
10537 self,
10538 other: Series,
10539 join: AlignJoin = "outer",
10540 axis: Axis | None = None,
10541 level=None,
10542 copy: bool_t | None = None,
10543 fill_value=None,
10544 method=None,
10545 limit: int | None = None,
10546 fill_axis: Axis = 0,
10547 ) -> tuple[Self, Series, Index | None]:
10548 is_series = isinstance(self, ABCSeries)
10549 if copy and using_copy_on_write():
10550 copy = False
10551
10552 if (not is_series and axis is None) or axis not in [None, 0, 1]:
10553 raise ValueError("Must specify axis=0 or 1")
10554
10555 if is_series and axis == 1:
10556 raise ValueError("cannot align series to a series other than axis 0")
10557
10558 # series/series compat, other must always be a Series
10559 if not axis:
10560 # equal
10561 if self.index.equals(other.index):
10562 join_index, lidx, ridx = None, None, None
10563 else:
10564 join_index, lidx, ridx = self.index.join(
10565 other.index, how=join, level=level, return_indexers=True
10566 )
10567
10568 if is_series:
10569 left = self._reindex_indexer(join_index, lidx, copy)
10570 elif lidx is None or join_index is None:
10571 left = self.copy(deep=copy)
10572 else:
10573 new_mgr = self._mgr.reindex_indexer(join_index, lidx, axis=1, copy=copy)
10574 left = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
10575
10576 right = other._reindex_indexer(join_index, ridx, copy)
10577
10578 else:
10579 # one has > 1 ndim
10580 fdata = self._mgr
10581 join_index = self.axes[1]
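# axis=1: align this DataFrame's columns (axes[1]) with the
# Series' index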
10582 lidx, ridx = None, None
10583 if not join_index.equals(other.index):
10584 join_index, lidx, ridx = join_index.join(
10585 other.index, how=join, level=level, return_indexers=True
10586 )
10587
10588 if lidx is not None:
10589 bm_axis = self._get_block_manager_axis(1)
10590 fdata = fdata.reindex_indexer(join_index, lidx, axis=bm_axis)
10591
10592 if copy and fdata is self._mgr:
10593 fdata = fdata.copy()
10594
10595 left = self._constructor_from_mgr(fdata, axes=fdata.axes)
10596
10597 if ridx is None:
10598 right = other.copy(deep=copy)
10599 else:
10600 right = other.reindex(join_index, level=level)
10601
10602 # fill
10603 fill_na = notna(fill_value) or (method is not None)
10604 if fill_na:
10605 fill_value, method = validate_fillna_kwargs(fill_value, method)
10606 if method is not None:
10607 left = left._pad_or_backfill(method, limit=limit, axis=fill_axis)
10608 right = right._pad_or_backfill(method, limit=limit)
10609 else:
10610 left = left.fillna(fill_value, limit=limit, axis=fill_axis)
10611 right = right.fillna(fill_value, limit=limit)
10612
10613 return left, right, join_index
10614
10615 @final
10616 def _where(
10617 self,
10618 cond,
10619 other=lib.no_default,
10620 inplace: bool_t = False,
10621 axis: Axis | None = None,
10622 level=None,
10623 warn: bool_t = True,
10624 ):
10625 """
10626 Equivalent to public method `where`, except that `other` is not
10627 applied as a function even if callable. Used in __setitem__.
10628 """
10629 inplace = validate_bool_kwarg(inplace, "inplace")
10630
10631 if axis is not None:
10632 axis = self._get_axis_number(axis)
10633
10634 # align the cond to same shape as myself
10635 cond = common.apply_if_callable(cond, self)
10636 if isinstance(cond, NDFrame):
10637 # CoW: Make sure reference is not kept alive
10638 if cond.ndim == 1 and self.ndim == 2:
10639 cond = cond._constructor_expanddim(
10640 {i: cond for i in range(len(self.columns))},
10641 copy=False,
10642 )
10643 cond.columns = self.columns
10644 cond = cond.align(self, join="right", copy=False)[0]
10645 else:
10646 if not hasattr(cond, "shape"):
10647 cond = np.asanyarray(cond)
10648 if cond.shape != self.shape:
10649 raise ValueError("Array conditional must be same shape as self")
10650 cond = self._constructor(cond, **self._construct_axes_dict(), copy=False)
10651
10652 # make sure we are boolean
10653 fill_value = bool(inplace)
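# fill NA entries in cond: False for the `where` path, True for the
# inplace path (which inverts cond before calling putmask below)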
10654 with warnings.catch_warnings():
10655 warnings.filterwarnings(
10656 "ignore",
10657 "Downcasting object dtype arrays",
10658 category=FutureWarning,
10659 )
10660 cond = cond.fillna(fill_value)
10661 cond = cond.infer_objects(copy=False)
10662
10663 msg = "Boolean array expected for the condition, not {dtype}"
10664
10665 if not cond.empty:
10666 if not isinstance(cond, ABCDataFrame):
10667 # This is a single-dimensional object.
10668 if not is_bool_dtype(cond):
10669 raise ValueError(msg.format(dtype=cond.dtype))
10670 else:
10671 for _dt in cond.dtypes:
10672 if not is_bool_dtype(_dt):
10673 raise ValueError(msg.format(dtype=_dt))
10674 if cond._mgr.any_extension_types:
10675 # GH51574: avoid object ndarray conversion later on
10676 cond = cond._constructor(
10677 cond.to_numpy(dtype=bool, na_value=fill_value),
10678 **cond._construct_axes_dict(),
10679 )
10680 else:
10681 # GH#21947 we have an empty DataFrame/Series, could be object-dtype
10682 cond = cond.astype(bool)
10683
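# putmask (used for the inplace path) writes where the mask is True,
# so invert the condition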
cond = ~cond if inplace else cond
10685 cond = cond.reindex(self._info_axis, axis=self._info_axis_number, copy=False)
10686
10687 # try to align with other
10688 if isinstance(other, NDFrame):
10689 # align with me
10690 if other.ndim <= self.ndim:
10691 # CoW: Make sure reference is not kept alive
10692 other = self.align(
10693 other,
10694 join="left",
10695 axis=axis,
10696 level=level,
10697 fill_value=None,
10698 copy=False,
10699 )[1]
10700
10701 # if we are NOT aligned, raise as we cannot where index
10702 if axis is None and not other._indexed_same(self):
10703 raise InvalidIndexError
10704
10705 if other.ndim < self.ndim:
10706 # TODO(EA2D): avoid object-dtype cast in EA case GH#38729
10707 other = other._values
10708 if axis == 0:
10709 other = np.reshape(other, (-1, 1))
10710 elif axis == 1:
10711 other = np.reshape(other, (1, -1))
10712
10713 other = np.broadcast_to(other, self.shape)
10714
10715 # slice me out of the other
10716 else:
10717 raise NotImplementedError(
10718 "cannot align with a higher dimensional NDFrame"
10719 )
10720
10721 elif not isinstance(other, (MultiIndex, NDFrame)):
10722 # mainly just catching Index here
10723 other = extract_array(other, extract_numpy=True)
10724
10725 if isinstance(other, (np.ndarray, ExtensionArray)):
10726 if other.shape != self.shape:
10727 if self.ndim != 1:
10728 # In the ndim == 1 case we may have
10729 # other length 1, which we treat as scalar (GH#2745, GH#4192)
10730 # or len(other) == icond.sum(), which we treat like
10731 # __setitem__ (GH#3235)
10732 raise ValueError(
10733 "other must be the same shape as self when an ndarray"
10734 )
10735
10736 # we are the same shape, so create an actual object for alignment
10737 else:
10738 other = self._constructor(
10739 other, **self._construct_axes_dict(), copy=False
10740 )
10741
10742 if axis is None:
10743 axis = 0
10744
10745 if self.ndim == getattr(other, "ndim", 0):
10746 align = True
10747 else:
10748 align = self._get_axis_number(axis) == 1
10749
10750 if inplace:
10751 # we may have different type blocks come out of putmask, so
10752 # reconstruct the block manager
10753
10754 new_data = self._mgr.putmask(mask=cond, new=other, align=align, warn=warn)
10755 result = self._constructor_from_mgr(new_data, axes=new_data.axes)
10756 return self._update_inplace(result)
10757
10758 else:
10759 new_data = self._mgr.where(
10760 other=other,
10761 cond=cond,
10762 align=align,
10763 )
10764 result = self._constructor_from_mgr(new_data, axes=new_data.axes)
10765 return result.__finalize__(self)
10766
10767 @overload
10768 def where(
10769 self,
10770 cond,
10771 other=...,
10772 *,
10773 inplace: Literal[False] = ...,
10774 axis: Axis | None = ...,
10775 level: Level = ...,
10776 ) -> Self:
10777 ...
10778
10779 @overload
10780 def where(
10781 self,
10782 cond,
10783 other=...,
10784 *,
10785 inplace: Literal[True],
10786 axis: Axis | None = ...,
10787 level: Level = ...,
10788 ) -> None:
10789 ...
10790
10791 @overload
10792 def where(
10793 self,
10794 cond,
10795 other=...,
10796 *,
10797 inplace: bool_t = ...,
10798 axis: Axis | None = ...,
10799 level: Level = ...,
10800 ) -> Self | None:
10801 ...
10802
10803 @final
10804 @doc(
10805 klass=_shared_doc_kwargs["klass"],
10806 cond="True",
10807 cond_rev="False",
10808 name="where",
10809 name_other="mask",
10810 )
10811 def where(
10812 self,
10813 cond,
10814 other=np.nan,
10815 *,
10816 inplace: bool_t = False,
10817 axis: Axis | None = None,
10818 level: Level | None = None,
10819 ) -> Self | None:
10820 """
10821 Replace values where the condition is {cond_rev}.
10822
10823 Parameters
10824 ----------
10825 cond : bool {klass}, array-like, or callable
10826 Where `cond` is {cond}, keep the original value. Where
10827 {cond_rev}, replace with corresponding value from `other`.
10828 If `cond` is callable, it is computed on the {klass} and
10829 should return boolean {klass} or array. The callable must
10830 not change input {klass} (though pandas doesn't check it).
10831 other : scalar, {klass}, or callable
10832 Entries where `cond` is {cond_rev} are replaced with
10833 corresponding value from `other`.
10834 If other is callable, it is computed on the {klass} and
10835 should return scalar or {klass}. The callable must not
10836 change input {klass} (though pandas doesn't check it).
10837 If not specified, entries will be filled with the corresponding
10838 NULL value (``np.nan`` for numpy dtypes, ``pd.NA`` for extension
10839 dtypes).
10840 inplace : bool, default False
10841 Whether to perform the operation in place on the data.
10842 axis : int, default None
10843 Alignment axis if needed. For `Series` this parameter is
10844 unused and defaults to 0.
10845 level : int, default None
10846 Alignment level if needed.
10847
10848 Returns
10849 -------
10850 Same type as caller or None if ``inplace=True``.
10851
10852 See Also
10853 --------
10854 :func:`DataFrame.{name_other}` : Return an object of same shape as
10855 self.
10856
10857 Notes
10858 -----
10859 The {name} method is an application of the if-then idiom. For each
10860 element in the calling DataFrame, if ``cond`` is ``{cond}`` the
10861 element is used; otherwise the corresponding element from the DataFrame
10862 ``other`` is used. If the axis of ``other`` does not align with axis of
10863 ``cond`` {klass}, the misaligned index positions will be filled with
10864 {cond_rev}.
10865
10866 The signature for :func:`DataFrame.where` differs from
10867 :func:`numpy.where`. Roughly ``df1.where(m, df2)`` is equivalent to
10868 ``np.where(m, df1, df2)``.
10869
10870 For further details and examples see the ``{name}`` documentation in
10871 :ref:`indexing <indexing.where_mask>`.
10872
The dtype of the object takes precedence. The fill value is cast to
the object's dtype, if this can be done losslessly.
10875
10876 Examples
10877 --------
10878 >>> s = pd.Series(range(5))
10879 >>> s.where(s > 0)
10880 0 NaN
10881 1 1.0
10882 2 2.0
10883 3 3.0
10884 4 4.0
10885 dtype: float64
10886 >>> s.mask(s > 0)
10887 0 0.0
10888 1 NaN
10889 2 NaN
10890 3 NaN
10891 4 NaN
10892 dtype: float64
10893
10894 >>> s = pd.Series(range(5))
10895 >>> t = pd.Series([True, False])
10896 >>> s.where(t, 99)
10897 0 0
10898 1 99
10899 2 99
10900 3 99
10901 4 99
10902 dtype: int64
10903 >>> s.mask(t, 99)
10904 0 99
10905 1 1
10906 2 99
10907 3 99
10908 4 99
10909 dtype: int64
10910
10911 >>> s.where(s > 1, 10)
10912 0 10
10913 1 10
10914 2 2
10915 3 3
10916 4 4
10917 dtype: int64
10918 >>> s.mask(s > 1, 10)
10919 0 0
10920 1 1
10921 2 10
10922 3 10
10923 4 10
10924 dtype: int64
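
Both ``cond`` and ``other`` may be callables, computed on the {klass}:

>>> s.where(lambda x: x > 1, lambda x: -x)
0    0
1   -1
2    2
3    3
4    4
dtype: int64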
10925
10926 >>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
10927 >>> df
10928 A B
10929 0 0 1
10930 1 2 3
10931 2 4 5
10932 3 6 7
10933 4 8 9
10934 >>> m = df % 3 == 0
10935 >>> df.where(m, -df)
10936 A B
10937 0 0 -1
10938 1 -2 3
10939 2 -4 -5
10940 3 6 -7
10941 4 -8 9
10942 >>> df.where(m, -df) == np.where(m, df, -df)
10943 A B
10944 0 True True
10945 1 True True
10946 2 True True
10947 3 True True
10948 4 True True
10949 >>> df.where(m, -df) == df.mask(~m, -df)
10950 A B
10951 0 True True
10952 1 True True
10953 2 True True
10954 3 True True
10955 4 True True
10956 """
10957 inplace = validate_bool_kwarg(inplace, "inplace")
10958 if inplace:
10959 if not PYPY and using_copy_on_write():
10960 if sys.getrefcount(self) <= REF_COUNT:
10961 warnings.warn(
10962 _chained_assignment_method_msg,
10963 ChainedAssignmentError,
10964 stacklevel=2,
10965 )
10966 elif (
10967 not PYPY
10968 and not using_copy_on_write()
10969 and self._is_view_after_cow_rules()
10970 ):
10971 ctr = sys.getrefcount(self)
10972 ref_count = REF_COUNT
10973 if isinstance(self, ABCSeries) and hasattr(self, "_cacher"):
10974 # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221
10975 ref_count += 1
10976 if ctr <= ref_count:
10977 warnings.warn(
10978 _chained_assignment_warning_method_msg,
10979 FutureWarning,
10980 stacklevel=2,
10981 )
10982
10983 other = common.apply_if_callable(other, self)
10984 return self._where(cond, other, inplace, axis, level)
10985
10986 @overload
10987 def mask(
10988 self,
10989 cond,
10990 other=...,
10991 *,
10992 inplace: Literal[False] = ...,
10993 axis: Axis | None = ...,
10994 level: Level = ...,
10995 ) -> Self:
10996 ...
10997
10998 @overload
10999 def mask(
11000 self,
11001 cond,
11002 other=...,
11003 *,
11004 inplace: Literal[True],
11005 axis: Axis | None = ...,
11006 level: Level = ...,
11007 ) -> None:
11008 ...
11009
11010 @overload
11011 def mask(
11012 self,
11013 cond,
11014 other=...,
11015 *,
11016 inplace: bool_t = ...,
11017 axis: Axis | None = ...,
11018 level: Level = ...,
11019 ) -> Self | None:
11020 ...
11021
11022 @final
11023 @doc(
11024 where,
11025 klass=_shared_doc_kwargs["klass"],
11026 cond="False",
11027 cond_rev="True",
11028 name="mask",
11029 name_other="where",
11030 )
11031 def mask(
11032 self,
11033 cond,
11034 other=lib.no_default,
11035 *,
11036 inplace: bool_t = False,
11037 axis: Axis | None = None,
11038 level: Level | None = None,
11039 ) -> Self | None:
11040 inplace = validate_bool_kwarg(inplace, "inplace")
11041 if inplace:
11042 if not PYPY and using_copy_on_write():
11043 if sys.getrefcount(self) <= REF_COUNT:
11044 warnings.warn(
11045 _chained_assignment_method_msg,
11046 ChainedAssignmentError,
11047 stacklevel=2,
11048 )
11049 elif (
11050 not PYPY
11051 and not using_copy_on_write()
11052 and self._is_view_after_cow_rules()
11053 ):
11054 ctr = sys.getrefcount(self)
11055 ref_count = REF_COUNT
11056 if isinstance(self, ABCSeries) and hasattr(self, "_cacher"):
11057 # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221
11058 ref_count += 1
11059 if ctr <= ref_count:
11060 warnings.warn(
11061 _chained_assignment_warning_method_msg,
11062 FutureWarning,
11063 stacklevel=2,
11064 )
11065
11066 cond = common.apply_if_callable(cond, self)
11067 other = common.apply_if_callable(other, self)
11068
11069 # see gh-21891
11070 if not hasattr(cond, "__invert__"):
11071 cond = np.array(cond)
11072
11073 return self._where(
11074 ~cond,
11075 other=other,
11076 inplace=inplace,
11077 axis=axis,
11078 level=level,
11079 )
11080
11081 @doc(klass=_shared_doc_kwargs["klass"])
11082 def shift(
11083 self,
11084 periods: int | Sequence[int] = 1,
11085 freq=None,
11086 axis: Axis = 0,
11087 fill_value: Hashable = lib.no_default,
11088 suffix: str | None = None,
11089 ) -> Self | DataFrame:
11090 """
11091 Shift index by desired number of periods with an optional time `freq`.
11092
When `freq` is not passed, the data is shifted and the index is left unchanged.
11094 If `freq` is passed (in this case, the index must be date or datetime,
11095 or it will raise a `NotImplementedError`), the index will be
11096 increased using the periods and the `freq`. `freq` can be inferred
11097 when specified as "infer" as long as either freq or inferred_freq
11098 attribute is set in the index.
11099
11100 Parameters
11101 ----------
11102 periods : int or Sequence
11103 Number of periods to shift. Can be positive or negative.
11104 If an iterable of ints, the data will be shifted once by each int.
11105 This is equivalent to shifting by one value at a time and
11106 concatenating all resulting frames. The resulting columns will have
11107 the shift suffixed to their column names. For multiple periods,
11108 axis must not be 1.
11109 freq : DateOffset, tseries.offsets, timedelta, or str, optional
11110 Offset to use from the tseries module or time rule (e.g. 'EOM').
11111 If `freq` is specified then the index values are shifted but the
11112 data is not realigned. That is, use `freq` if you would like to
11113 extend the index when shifting and preserve the original data.
11114 If `freq` is specified as "infer" then it will be inferred from
11115 the freq or inferred_freq attributes of the index. If neither of
11116 those attributes exist, a ValueError is thrown.
axis : {{0 or 'index', 1 or 'columns', None}}, default 0
11118 Shift direction. For `Series` this parameter is unused and defaults to 0.
11119 fill_value : object, optional
11120 The scalar value to use for newly introduced missing values.
The default depends on the dtype of `self`.
11122 For numeric data, ``np.nan`` is used.
11123 For datetime, timedelta, or period data, etc. :attr:`NaT` is used.
11124 For extension dtypes, ``self.dtype.na_value`` is used.
11125 suffix : str, optional
11126 If str and periods is an iterable, this is added after the column
11127 name and before the shift value for each shifted column name.
11128
11129 Returns
11130 -------
11131 {klass}
11132 Copy of input object, shifted.
11133
11134 See Also
11135 --------
11136 Index.shift : Shift values of Index.
11137 DatetimeIndex.shift : Shift values of DatetimeIndex.
11138 PeriodIndex.shift : Shift values of PeriodIndex.
11139
11140 Examples
11141 --------
11142 >>> df = pd.DataFrame({{"Col1": [10, 20, 15, 30, 45],
11143 ... "Col2": [13, 23, 18, 33, 48],
11144 ... "Col3": [17, 27, 22, 37, 52]}},
11145 ... index=pd.date_range("2020-01-01", "2020-01-05"))
11146 >>> df
11147 Col1 Col2 Col3
11148 2020-01-01 10 13 17
11149 2020-01-02 20 23 27
11150 2020-01-03 15 18 22
11151 2020-01-04 30 33 37
11152 2020-01-05 45 48 52
11153
11154 >>> df.shift(periods=3)
11155 Col1 Col2 Col3
11156 2020-01-01 NaN NaN NaN
11157 2020-01-02 NaN NaN NaN
11158 2020-01-03 NaN NaN NaN
11159 2020-01-04 10.0 13.0 17.0
11160 2020-01-05 20.0 23.0 27.0
11161
11162 >>> df.shift(periods=1, axis="columns")
11163 Col1 Col2 Col3
11164 2020-01-01 NaN 10 13
11165 2020-01-02 NaN 20 23
11166 2020-01-03 NaN 15 18
11167 2020-01-04 NaN 30 33
11168 2020-01-05 NaN 45 48
11169
11170 >>> df.shift(periods=3, fill_value=0)
11171 Col1 Col2 Col3
11172 2020-01-01 0 0 0
11173 2020-01-02 0 0 0
11174 2020-01-03 0 0 0
11175 2020-01-04 10 13 17
11176 2020-01-05 20 23 27
11177
11178 >>> df.shift(periods=3, freq="D")
11179 Col1 Col2 Col3
11180 2020-01-04 10 13 17
11181 2020-01-05 20 23 27
11182 2020-01-06 15 18 22
11183 2020-01-07 30 33 37
11184 2020-01-08 45 48 52
11185
11186 >>> df.shift(periods=3, freq="infer")
11187 Col1 Col2 Col3
11188 2020-01-04 10 13 17
11189 2020-01-05 20 23 27
11190 2020-01-06 15 18 22
11191 2020-01-07 30 33 37
11192 2020-01-08 45 48 52
11193
11194 >>> df['Col1'].shift(periods=[0, 1, 2])
11195 Col1_0 Col1_1 Col1_2
11196 2020-01-01 10 NaN NaN
11197 2020-01-02 20 10.0 NaN
11198 2020-01-03 15 20.0 10.0
11199 2020-01-04 30 15.0 20.0
11200 2020-01-05 45 30.0 15.0
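
If ``suffix`` is given, it is inserted between the column name and each
shift value in the resulting column names:

>>> df['Col1'].shift(periods=[0, 1, 2], suffix='_suffix')
            Col1_suffix_0  Col1_suffix_1  Col1_suffix_2
2020-01-01             10            NaN            NaN
2020-01-02             20           10.0            NaN
2020-01-03             15           20.0           10.0
2020-01-04             30           15.0           20.0
2020-01-05             45           30.0           15.0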
11201 """
11202 axis = self._get_axis_number(axis)
11203
11204 if freq is not None and fill_value is not lib.no_default:
11205 # GH#53832
11206 warnings.warn(
11207 "Passing a 'freq' together with a 'fill_value' silently ignores "
11208 "the fill_value and is deprecated. This will raise in a future "
11209 "version.",
11210 FutureWarning,
11211 stacklevel=find_stack_level(),
11212 )
11213 fill_value = lib.no_default
11214
11215 if periods == 0:
11216 return self.copy(deep=None)
11217
11218 if is_list_like(periods) and isinstance(self, ABCSeries):
11219 return self.to_frame().shift(
11220 periods=periods, freq=freq, axis=axis, fill_value=fill_value
11221 )
11222 periods = cast(int, periods)
11223
11224 if freq is None:
11225 # when freq is None, data is shifted, index is not
11226 axis = self._get_axis_number(axis)
11227 assert axis == 0 # axis == 1 cases handled in DataFrame.shift
11228 new_data = self._mgr.shift(periods=periods, fill_value=fill_value)
11229 return self._constructor_from_mgr(
11230 new_data, axes=new_data.axes
11231 ).__finalize__(self, method="shift")
11232
11233 return self._shift_with_freq(periods, axis, freq)
11234
11235 @final
11236 def _shift_with_freq(self, periods: int, axis: int, freq) -> Self:
11237 # see shift.__doc__
11238 # when freq is given, index is shifted, data is not
11239 index = self._get_axis(axis)
11240
11241 if freq == "infer":
11242 freq = getattr(index, "freq", None)
11243
11244 if freq is None:
11245 freq = getattr(index, "inferred_freq", None)
11246
11247 if freq is None:
11248 msg = "Freq was not set in the index hence cannot be inferred"
11249 raise ValueError(msg)
11250
11251 elif isinstance(freq, str):
11252 is_period = isinstance(index, PeriodIndex)
11253 freq = to_offset(freq, is_period=is_period)
11254
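        # a PeriodIndex can only be shifted in multiples of its own freq,
        # so a mismatching freq is rejected rather than silently coerced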
11255 if isinstance(index, PeriodIndex):
11256 orig_freq = to_offset(index.freq)
11257 if freq != orig_freq:
11258 assert orig_freq is not None # for mypy
11259 raise ValueError(
11260 f"Given freq {freq_to_period_freqstr(freq.n, freq.name)} "
11261 f"does not match PeriodIndex freq "
11262 f"{freq_to_period_freqstr(orig_freq.n, orig_freq.name)}"
11263 )
11264 new_ax = index.shift(periods)
11265 else:
11266 new_ax = index.shift(periods, freq)
11267
11268 result = self.set_axis(new_ax, axis=axis)
11269 return result.__finalize__(self, method="shift")
11270
11271 @final
11272 def truncate(
11273 self,
11274 before=None,
11275 after=None,
11276 axis: Axis | None = None,
11277 copy: bool_t | None = None,
11278 ) -> Self:
11279 """
11280 Truncate a Series or DataFrame before and after some index value.
11281
11282 This is a useful shorthand for boolean indexing based on index
11283 values above or below certain thresholds.
11284
11285 Parameters
11286 ----------
11287 before : date, str, int
11288 Truncate all rows before this index value.
11289 after : date, str, int
11290 Truncate all rows after this index value.
11291 axis : {0 or 'index', 1 or 'columns'}, optional
11292 Axis to truncate. Truncates the index (rows) by default.
11293 For `Series` this parameter is unused and defaults to 0.
        copy : bool, default True
11295 Return a copy of the truncated section.
11296
11297 .. note::
11298 The `copy` keyword will change behavior in pandas 3.0.
11299 `Copy-on-Write
11300 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
11301 will be enabled by default, which means that all methods with a
11302 `copy` keyword will use a lazy copy mechanism to defer the copy and
11303 ignore the `copy` keyword. The `copy` keyword will be removed in a
11304 future version of pandas.
11305
11306 You can already get the future behavior and improvements through
11307 enabling copy on write ``pd.options.mode.copy_on_write = True``
11308
11309 Returns
11310 -------
11311 type of caller
11312 The truncated Series or DataFrame.
11313
11314 See Also
11315 --------
11316 DataFrame.loc : Select a subset of a DataFrame by label.
11317 DataFrame.iloc : Select a subset of a DataFrame by position.
11318
11319 Notes
11320 -----
11321 If the index being truncated contains only datetime values,
11322 `before` and `after` may be specified as strings instead of
11323 Timestamps.
11324
11325 Examples
11326 --------
11327 >>> df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e'],
11328 ... 'B': ['f', 'g', 'h', 'i', 'j'],
11329 ... 'C': ['k', 'l', 'm', 'n', 'o']},
11330 ... index=[1, 2, 3, 4, 5])
11331 >>> df
11332 A B C
11333 1 a f k
11334 2 b g l
11335 3 c h m
11336 4 d i n
11337 5 e j o
11338
11339 >>> df.truncate(before=2, after=4)
11340 A B C
11341 2 b g l
11342 3 c h m
11343 4 d i n
11344
11345 The columns of a DataFrame can be truncated.
11346
11347 >>> df.truncate(before="A", after="B", axis="columns")
11348 A B
11349 1 a f
11350 2 b g
11351 3 c h
11352 4 d i
11353 5 e j
11354
11355 For Series, only rows can be truncated.
11356
11357 >>> df['A'].truncate(before=2, after=4)
11358 2 b
11359 3 c
11360 4 d
11361 Name: A, dtype: object
11362
11363 The index values in ``truncate`` can be datetimes or string
11364 dates.
11365
11366 >>> dates = pd.date_range('2016-01-01', '2016-02-01', freq='s')
11367 >>> df = pd.DataFrame(index=dates, data={'A': 1})
11368 >>> df.tail()
11369 A
11370 2016-01-31 23:59:56 1
11371 2016-01-31 23:59:57 1
11372 2016-01-31 23:59:58 1
11373 2016-01-31 23:59:59 1
11374 2016-02-01 00:00:00 1
11375
11376 >>> df.truncate(before=pd.Timestamp('2016-01-05'),
11377 ... after=pd.Timestamp('2016-01-10')).tail()
11378 A
11379 2016-01-09 23:59:56 1
11380 2016-01-09 23:59:57 1
11381 2016-01-09 23:59:58 1
11382 2016-01-09 23:59:59 1
11383 2016-01-10 00:00:00 1
11384
11385 Because the index is a DatetimeIndex containing only dates, we can
11386 specify `before` and `after` as strings. They will be coerced to
11387 Timestamps before truncation.
11388
11389 >>> df.truncate('2016-01-05', '2016-01-10').tail()
11390 A
11391 2016-01-09 23:59:56 1
11392 2016-01-09 23:59:57 1
11393 2016-01-09 23:59:58 1
11394 2016-01-09 23:59:59 1
11395 2016-01-10 00:00:00 1
11396
11397 Note that ``truncate`` assumes a 0 value for any unspecified time
11398 component (midnight). This differs from partial string slicing, which
11399 returns any partially matching dates.
11400
11401 >>> df.loc['2016-01-05':'2016-01-10', :].tail()
11402 A
11403 2016-01-10 23:59:55 1
11404 2016-01-10 23:59:56 1
11405 2016-01-10 23:59:57 1
11406 2016-01-10 23:59:58 1
11407 2016-01-10 23:59:59 1
11408 """
11409 if axis is None:
11410 axis = 0
11411 axis = self._get_axis_number(axis)
11412 ax = self._get_axis(axis)
11413
11414 # GH 17935
11415 # Check that index is sorted
11416 if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing:
11417 raise ValueError("truncate requires a sorted index")
11418
11419 # if we have a date index, convert to dates, otherwise
11420 # treat like a slice
11421 if ax._is_all_dates:
11422 from pandas.core.tools.datetimes import to_datetime
11423
11424 before = to_datetime(before)
11425 after = to_datetime(after)
11426
11427 if before is not None and after is not None and before > after:
11428 raise ValueError(f"Truncate: {after} must be after {before}")
11429
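        # for a monotonically decreasing index, swap the bounds so the
        # label-based slice below selects the intended window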
11430 if len(ax) > 1 and ax.is_monotonic_decreasing and ax.nunique() > 1:
11431 before, after = after, before
11432
11433 slicer = [slice(None, None)] * self._AXIS_LEN
11434 slicer[axis] = slice(before, after)
11435 result = self.loc[tuple(slicer)]
11436
11437 if isinstance(ax, MultiIndex):
11438 setattr(result, self._get_axis_name(axis), ax.truncate(before, after))
11439
11440 result = result.copy(deep=copy and not using_copy_on_write())
11441
11442 return result
11443
11444 @final
11445 @doc(klass=_shared_doc_kwargs["klass"])
11446 def tz_convert(
11447 self, tz, axis: Axis = 0, level=None, copy: bool_t | None = None
11448 ) -> Self:
11449 """
11450 Convert tz-aware axis to target time zone.
11451
11452 Parameters
11453 ----------
11454 tz : str or tzinfo object or None
11455 Target time zone. Passing ``None`` will convert to
11456 UTC and remove the timezone information.
11457 axis : {{0 or 'index', 1 or 'columns'}}, default 0
            The axis to convert.
11459 level : int, str, default None
11460 If axis is a MultiIndex, convert a specific level. Otherwise
11461 must be None.
11462 copy : bool, default True
11463 Also make a copy of the underlying data.
11464
11465 .. note::
11466 The `copy` keyword will change behavior in pandas 3.0.
11467 `Copy-on-Write
11468 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
11469 will be enabled by default, which means that all methods with a
11470 `copy` keyword will use a lazy copy mechanism to defer the copy and
11471 ignore the `copy` keyword. The `copy` keyword will be removed in a
11472 future version of pandas.
11473
11474 You can already get the future behavior and improvements through
11475 enabling copy on write ``pd.options.mode.copy_on_write = True``
11476
11477 Returns
11478 -------
11479 {klass}
11480 Object with time zone converted axis.
11481
11482 Raises
11483 ------
11484 TypeError
11485 If the axis is tz-naive.
11486
11487 Examples
11488 --------
11489 Change to another time zone:
11490
11491 >>> s = pd.Series(
11492 ... [1],
11493 ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00']),
11494 ... )
11495 >>> s.tz_convert('Asia/Shanghai')
11496 2018-09-15 07:30:00+08:00 1
11497 dtype: int64
11498
11499 Pass None to convert to UTC and get a tz-naive index:
11500
11501 >>> s = pd.Series([1],
11502 ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00']))
11503 >>> s.tz_convert(None)
11504 2018-09-14 23:30:00 1
11505 dtype: int64
11506 """
11507 axis = self._get_axis_number(axis)
11508 ax = self._get_axis(axis)
11509
11510 def _tz_convert(ax, tz):
11511 if not hasattr(ax, "tz_convert"):
11512 if len(ax) > 0:
11513 ax_name = self._get_axis_name(axis)
11514 raise TypeError(
11515 f"{ax_name} is not a valid DatetimeIndex or PeriodIndex"
11516 )
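                # empty non-datetime axis: substitute an empty DatetimeIndex
                # with the target tz instead of raising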
11517 ax = DatetimeIndex([], tz=tz)
11518 else:
11519 ax = ax.tz_convert(tz)
11520 return ax
11521
11522 # if a level is given it must be a MultiIndex level or
11523 # equivalent to the axis name
11524 if isinstance(ax, MultiIndex):
11525 level = ax._get_level_number(level)
11526 new_level = _tz_convert(ax.levels[level], tz)
11527 ax = ax.set_levels(new_level, level=level)
11528 else:
11529 if level not in (None, 0, ax.name):
11530 raise ValueError(f"The level {level} is not valid")
11531 ax = _tz_convert(ax, tz)
11532
11533 result = self.copy(deep=copy and not using_copy_on_write())
11534 result = result.set_axis(ax, axis=axis, copy=False)
11535 return result.__finalize__(self, method="tz_convert")
11536
11537 @final
11538 @doc(klass=_shared_doc_kwargs["klass"])
11539 def tz_localize(
11540 self,
11541 tz,
11542 axis: Axis = 0,
11543 level=None,
11544 copy: bool_t | None = None,
11545 ambiguous: TimeAmbiguous = "raise",
11546 nonexistent: TimeNonexistent = "raise",
11547 ) -> Self:
11548 """
11549 Localize tz-naive index of a Series or DataFrame to target time zone.
11550
11551 This operation localizes the Index. To localize the values in a
11552 timezone-naive Series, use :meth:`Series.dt.tz_localize`.
11553
11554 Parameters
11555 ----------
11556 tz : str or tzinfo or None
11557 Time zone to localize. Passing ``None`` will remove the
11558 time zone information and preserve local time.
11559 axis : {{0 or 'index', 1 or 'columns'}}, default 0
            The axis to localize.
        level : int, str, default None
            If axis is a MultiIndex, localize a specific level. Otherwise
            must be None.
11564 copy : bool, default True
11565 Also make a copy of the underlying data.
11566
11567 .. note::
11568 The `copy` keyword will change behavior in pandas 3.0.
11569 `Copy-on-Write
11570 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
11571 will be enabled by default, which means that all methods with a
11572 `copy` keyword will use a lazy copy mechanism to defer the copy and
11573 ignore the `copy` keyword. The `copy` keyword will be removed in a
11574 future version of pandas.
11575
11576 You can already get the future behavior and improvements through
11577 enabling copy on write ``pd.options.mode.copy_on_write = True``
11578 ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise'
11579 When clocks moved backward due to DST, ambiguous times may arise.
11580 For example in Central European Time (UTC+01), when going from
11581 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at
11582 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the
11583 `ambiguous` parameter dictates how ambiguous times should be
11584 handled.
11585
11586 - 'infer' will attempt to infer fall dst-transition hours based on
11587 order
11588 - bool-ndarray where True signifies a DST time, False designates
11589 a non-DST time (note that this flag is only applicable for
11590 ambiguous times)
11591 - 'NaT' will return NaT where there are ambiguous times
11592 - 'raise' will raise an AmbiguousTimeError if there are ambiguous
11593 times.
11594 nonexistent : str, default 'raise'
11595 A nonexistent time does not exist in a particular timezone
11596 where clocks moved forward due to DST. Valid values are:
11597
11598 - 'shift_forward' will shift the nonexistent time forward to the
11599 closest existing time
11600 - 'shift_backward' will shift the nonexistent time backward to the
11601 closest existing time
11602 - 'NaT' will return NaT where there are nonexistent times
11603 - timedelta objects will shift nonexistent times by the timedelta
            - 'raise' will raise a NonExistentTimeError if there are
              nonexistent times.
11606
11607 Returns
11608 -------
11609 {klass}
11610 Same type as the input.
11611
11612 Raises
11613 ------
11614 TypeError
11615 If the TimeSeries is tz-aware and tz is not None.
11616
11617 Examples
11618 --------
11619 Localize local times:
11620
11621 >>> s = pd.Series(
11622 ... [1],
11623 ... index=pd.DatetimeIndex(['2018-09-15 01:30:00']),
11624 ... )
11625 >>> s.tz_localize('CET')
11626 2018-09-15 01:30:00+02:00 1
11627 dtype: int64
11628
11629 Pass None to convert to tz-naive index and preserve local time:
11630
11631 >>> s = pd.Series([1],
11632 ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00']))
11633 >>> s.tz_localize(None)
11634 2018-09-15 01:30:00 1
11635 dtype: int64
11636
11637 Be careful with DST changes. When there is sequential data, pandas
11638 can infer the DST time:
11639
11640 >>> s = pd.Series(range(7),
11641 ... index=pd.DatetimeIndex(['2018-10-28 01:30:00',
11642 ... '2018-10-28 02:00:00',
11643 ... '2018-10-28 02:30:00',
11644 ... '2018-10-28 02:00:00',
11645 ... '2018-10-28 02:30:00',
11646 ... '2018-10-28 03:00:00',
11647 ... '2018-10-28 03:30:00']))
11648 >>> s.tz_localize('CET', ambiguous='infer')
11649 2018-10-28 01:30:00+02:00 0
11650 2018-10-28 02:00:00+02:00 1
11651 2018-10-28 02:30:00+02:00 2
11652 2018-10-28 02:00:00+01:00 3
11653 2018-10-28 02:30:00+01:00 4
11654 2018-10-28 03:00:00+01:00 5
11655 2018-10-28 03:30:00+01:00 6
11656 dtype: int64
11657
        In some cases, inferring the DST is impossible. In such cases, you
        can pass an ndarray to the `ambiguous` parameter to set the DST
        explicitly:
11660
11661 >>> s = pd.Series(range(3),
11662 ... index=pd.DatetimeIndex(['2018-10-28 01:20:00',
11663 ... '2018-10-28 02:36:00',
11664 ... '2018-10-28 03:46:00']))
11665 >>> s.tz_localize('CET', ambiguous=np.array([True, True, False]))
11666 2018-10-28 01:20:00+02:00 0
11667 2018-10-28 02:36:00+02:00 1
11668 2018-10-28 03:46:00+01:00 2
11669 dtype: int64
11670
11671 If the DST transition causes nonexistent times, you can shift these
11672 dates forward or backward with a timedelta object or `'shift_forward'`
11673 or `'shift_backward'`.
11674
11675 >>> s = pd.Series(range(2),
11676 ... index=pd.DatetimeIndex(['2015-03-29 02:30:00',
11677 ... '2015-03-29 03:30:00']))
11678 >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_forward')
11679 2015-03-29 03:00:00+02:00 0
11680 2015-03-29 03:30:00+02:00 1
11681 dtype: int64
11682 >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_backward')
11683 2015-03-29 01:59:59.999999999+01:00 0
11684 2015-03-29 03:30:00+02:00 1
11685 dtype: int64
11686 >>> s.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1h'))
11687 2015-03-29 03:30:00+02:00 0
11688 2015-03-29 03:30:00+02:00 1
11689 dtype: int64
11690 """
11691 nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward")
11692 if nonexistent not in nonexistent_options and not isinstance(
11693 nonexistent, dt.timedelta
11694 ):
11695 raise ValueError(
11696 "The nonexistent argument must be one of 'raise', "
11697 "'NaT', 'shift_forward', 'shift_backward' or "
11698 "a timedelta object"
11699 )
11700
11701 axis = self._get_axis_number(axis)
11702 ax = self._get_axis(axis)
11703
11704 def _tz_localize(ax, tz, ambiguous, nonexistent):
11705 if not hasattr(ax, "tz_localize"):
11706 if len(ax) > 0:
11707 ax_name = self._get_axis_name(axis)
11708 raise TypeError(
11709 f"{ax_name} is not a valid DatetimeIndex or PeriodIndex"
11710 )
11711 ax = DatetimeIndex([], tz=tz)
11712 else:
11713 ax = ax.tz_localize(tz, ambiguous=ambiguous, nonexistent=nonexistent)
11714 return ax
11715
11716 # if a level is given it must be a MultiIndex level or
11717 # equivalent to the axis name
11718 if isinstance(ax, MultiIndex):
11719 level = ax._get_level_number(level)
11720 new_level = _tz_localize(ax.levels[level], tz, ambiguous, nonexistent)
11721 ax = ax.set_levels(new_level, level=level)
11722 else:
11723 if level not in (None, 0, ax.name):
11724 raise ValueError(f"The level {level} is not valid")
11725 ax = _tz_localize(ax, tz, ambiguous, nonexistent)
11726
11727 result = self.copy(deep=copy and not using_copy_on_write())
11728 result = result.set_axis(ax, axis=axis, copy=False)
11729 return result.__finalize__(self, method="tz_localize")
11730
11731 # ----------------------------------------------------------------------
11732 # Numeric Methods
11733
11734 @final
11735 def describe(
11736 self,
11737 percentiles=None,
11738 include=None,
11739 exclude=None,
11740 ) -> Self:
11741 """
11742 Generate descriptive statistics.
11743
11744 Descriptive statistics include those that summarize the central
11745 tendency, dispersion and shape of a
11746 dataset's distribution, excluding ``NaN`` values.
11747
11748 Analyzes both numeric and object series, as well
11749 as ``DataFrame`` column sets of mixed data types. The output
11750 will vary depending on what is provided. Refer to the notes
11751 below for more detail.
11752
11753 Parameters
11754 ----------
11755 percentiles : list-like of numbers, optional
11756 The percentiles to include in the output. All should
11757 fall between 0 and 1. The default is
11758 ``[.25, .5, .75]``, which returns the 25th, 50th, and
11759 75th percentiles.
11760 include : 'all', list-like of dtypes or None (default), optional
11761 A white list of data types to include in the result. Ignored
11762 for ``Series``. Here are the options:
11763
11764 - 'all' : All columns of the input will be included in the output.
11765 - A list-like of dtypes : Limits the results to the
11766 provided data types.
11767 To limit the result to numeric types submit
            ``numpy.number``. To limit it instead to object columns submit
            the ``object`` data type. Strings
11770 can also be used in the style of
11771 ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
11772 select pandas categorical columns, use ``'category'``
11773 - None (default) : The result will include all numeric columns.
        exclude : list-like of dtypes or None (default), optional
11775 A black list of data types to omit from the result. Ignored
11776 for ``Series``. Here are the options:
11777
11778 - A list-like of dtypes : Excludes the provided data types
11779 from the result. To exclude numeric types submit
            ``numpy.number``. To exclude object columns submit the data
            type ``object``. Strings can also be used in the style of
11782 ``select_dtypes`` (e.g. ``df.describe(exclude=['O'])``). To
11783 exclude pandas categorical columns, use ``'category'``
11784 - None (default) : The result will exclude nothing.
11785
11786 Returns
11787 -------
11788 Series or DataFrame
            Summary statistics of the Series or DataFrame provided.
11790
11791 See Also
11792 --------
11793 DataFrame.count: Count number of non-NA/null observations.
11794 DataFrame.max: Maximum of the values in the object.
11795 DataFrame.min: Minimum of the values in the object.
11796 DataFrame.mean: Mean of the values.
11797 DataFrame.std: Standard deviation of the observations.
11798 DataFrame.select_dtypes: Subset of a DataFrame including/excluding
11799 columns based on their dtype.
11800
11801 Notes
11802 -----
11803 For numeric data, the result's index will include ``count``,
11804 ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and
11805 upper percentiles. By default the lower percentile is ``25`` and the
11806 upper percentile is ``75``. The ``50`` percentile is the
11807 same as the median.
11808
11809 For object data (e.g. strings or timestamps), the result's index
11810 will include ``count``, ``unique``, ``top``, and ``freq``. The ``top``
11811 is the most common value. The ``freq`` is the most common value's
11812 frequency. Timestamps also include the ``first`` and ``last`` items.
11813
        If multiple object values tie for the highest count, then the
        ``top`` result will be arbitrarily chosen from among those with
        the highest count.
11817
11818 For mixed data types provided via a ``DataFrame``, the default is to
11819 return only an analysis of numeric columns. If the dataframe consists
11820 only of object and categorical data without any numeric columns, the
11821 default is to return an analysis of both the object and categorical
11822 columns. If ``include='all'`` is provided as an option, the result
11823 will include a union of attributes of each type.
11824
11825 The `include` and `exclude` parameters can be used to limit
11826 which columns in a ``DataFrame`` are analyzed for the output.
11827 The parameters are ignored when analyzing a ``Series``.
11828
11829 Examples
11830 --------
11831 Describing a numeric ``Series``.
11832
11833 >>> s = pd.Series([1, 2, 3])
11834 >>> s.describe()
11835 count 3.0
11836 mean 2.0
11837 std 1.0
11838 min 1.0
11839 25% 1.5
11840 50% 2.0
11841 75% 2.5
11842 max 3.0
11843 dtype: float64
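
        Describing a numeric ``Series`` with custom percentiles (the median
        is always included; output illustrative):

        >>> s.describe(percentiles=[0.1, 0.9])  # doctest: +SKIP
        count    3.0
        mean     2.0
        std      1.0
        min      1.0
        10%      1.2
        50%      2.0
        90%      2.8
        max      3.0
        dtype: float64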
11844
11845 Describing a categorical ``Series``.
11846
11847 >>> s = pd.Series(['a', 'a', 'b', 'c'])
11848 >>> s.describe()
11849 count 4
11850 unique 3
11851 top a
11852 freq 2
11853 dtype: object
11854
11855 Describing a timestamp ``Series``.
11856
11857 >>> s = pd.Series([
11858 ... np.datetime64("2000-01-01"),
11859 ... np.datetime64("2010-01-01"),
11860 ... np.datetime64("2010-01-01")
11861 ... ])
11862 >>> s.describe()
11863 count 3
11864 mean 2006-09-01 08:00:00
11865 min 2000-01-01 00:00:00
11866 25% 2004-12-31 12:00:00
11867 50% 2010-01-01 00:00:00
11868 75% 2010-01-01 00:00:00
11869 max 2010-01-01 00:00:00
11870 dtype: object
11871
11872 Describing a ``DataFrame``. By default only numeric fields
11873 are returned.
11874
11875 >>> df = pd.DataFrame({'categorical': pd.Categorical(['d', 'e', 'f']),
11876 ... 'numeric': [1, 2, 3],
11877 ... 'object': ['a', 'b', 'c']
11878 ... })
11879 >>> df.describe()
11880 numeric
11881 count 3.0
11882 mean 2.0
11883 std 1.0
11884 min 1.0
11885 25% 1.5
11886 50% 2.0
11887 75% 2.5
11888 max 3.0
11889
11890 Describing all columns of a ``DataFrame`` regardless of data type.
11891
11892 >>> df.describe(include='all') # doctest: +SKIP
11893 categorical numeric object
11894 count 3 3.0 3
11895 unique 3 NaN 3
11896 top f NaN a
11897 freq 1 NaN 1
11898 mean NaN 2.0 NaN
11899 std NaN 1.0 NaN
11900 min NaN 1.0 NaN
11901 25% NaN 1.5 NaN
11902 50% NaN 2.0 NaN
11903 75% NaN 2.5 NaN
11904 max NaN 3.0 NaN
11905
11906 Describing a column from a ``DataFrame`` by accessing it as
11907 an attribute.
11908
11909 >>> df.numeric.describe()
11910 count 3.0
11911 mean 2.0
11912 std 1.0
11913 min 1.0
11914 25% 1.5
11915 50% 2.0
11916 75% 2.5
11917 max 3.0
11918 Name: numeric, dtype: float64
11919
11920 Including only numeric columns in a ``DataFrame`` description.
11921
11922 >>> df.describe(include=[np.number])
11923 numeric
11924 count 3.0
11925 mean 2.0
11926 std 1.0
11927 min 1.0
11928 25% 1.5
11929 50% 2.0
11930 75% 2.5
11931 max 3.0
11932
11933 Including only string columns in a ``DataFrame`` description.
11934
11935 >>> df.describe(include=[object]) # doctest: +SKIP
11936 object
11937 count 3
11938 unique 3
11939 top a
11940 freq 1
11941
11942 Including only categorical columns from a ``DataFrame`` description.
11943
11944 >>> df.describe(include=['category'])
11945 categorical
11946 count 3
11947 unique 3
11948 top d
11949 freq 1
11950
11951 Excluding numeric columns from a ``DataFrame`` description.
11952
11953 >>> df.describe(exclude=[np.number]) # doctest: +SKIP
11954 categorical object
11955 count 3 3
11956 unique 3 3
11957 top f a
11958 freq 1 1
11959
11960 Excluding object columns from a ``DataFrame`` description.
11961
11962 >>> df.describe(exclude=[object]) # doctest: +SKIP
11963 categorical numeric
11964 count 3 3.0
11965 unique 3 NaN
11966 top f NaN
11967 freq 1 NaN
11968 mean NaN 2.0
11969 std NaN 1.0
11970 min NaN 1.0
11971 25% NaN 1.5
11972 50% NaN 2.0
11973 75% NaN 2.5
11974 max NaN 3.0
11975 """
11976 return describe_ndframe(
11977 obj=self,
11978 include=include,
11979 exclude=exclude,
11980 percentiles=percentiles,
11981 ).__finalize__(self, method="describe")
11982
11983 @final
11984 def pct_change(
11985 self,
11986 periods: int = 1,
11987 fill_method: FillnaOptions | None | lib.NoDefault = lib.no_default,
11988 limit: int | None | lib.NoDefault = lib.no_default,
11989 freq=None,
11990 **kwargs,
11991 ) -> Self:
11992 """
11993 Fractional change between the current and a prior element.
11994
11995 Computes the fractional change from the immediately previous row by
11996 default. This is useful in comparing the fraction of change in a time
11997 series of elements.
11998
11999 .. note::
12000
12001 Despite the name of this method, it calculates fractional change
12002 (also known as per unit change or relative change) and not
12003 percentage change. If you need the percentage change, multiply
12004 these values by 100.
12005
12006 Parameters
12007 ----------
12008 periods : int, default 1
12009 Periods to shift for forming percent change.
12010 fill_method : {'backfill', 'bfill', 'pad', 'ffill', None}, default 'pad'
12011 How to handle NAs **before** computing percent changes.
12012
12013 .. deprecated:: 2.1
12014 All options of `fill_method` are deprecated except `fill_method=None`.
12015
12016 limit : int, default None
12017 The number of consecutive NAs to fill before stopping.
12018
12019 .. deprecated:: 2.1
12020
12021 freq : DateOffset, timedelta, or str, optional
12022 Increment to use from time series API (e.g. 'ME' or BDay()).
12023 **kwargs
12024 Additional keyword arguments are passed into
12025 `DataFrame.shift` or `Series.shift`.
12026
12027 Returns
12028 -------
12029 Series or DataFrame
12030 The same type as the calling object.
12031
12032 See Also
12033 --------
12034 Series.diff : Compute the difference of two elements in a Series.
12035 DataFrame.diff : Compute the difference of two elements in a DataFrame.
12036 Series.shift : Shift the index by some number of periods.
12037 DataFrame.shift : Shift the index by some number of periods.
12038
12039 Examples
12040 --------
12041 **Series**
12042
12043 >>> s = pd.Series([90, 91, 85])
12044 >>> s
12045 0 90
12046 1 91
12047 2 85
12048 dtype: int64
12049
12050 >>> s.pct_change()
12051 0 NaN
12052 1 0.011111
12053 2 -0.065934
12054 dtype: float64
12055
12056 >>> s.pct_change(periods=2)
12057 0 NaN
12058 1 NaN
12059 2 -0.055556
12060 dtype: float64
12061
        Compute the percentage change in a Series after filling NAs with the
        last valid observation (forward fill).
12064
12065 >>> s = pd.Series([90, 91, None, 85])
12066 >>> s
12067 0 90.0
12068 1 91.0
12069 2 NaN
12070 3 85.0
12071 dtype: float64
12072
12073 >>> s.ffill().pct_change()
12074 0 NaN
12075 1 0.011111
12076 2 0.000000
12077 3 -0.065934
12078 dtype: float64
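
        With ``fill_method=None``, NAs are left in place and propagate into
        the result (output illustrative):

        >>> s.pct_change(fill_method=None)  # doctest: +SKIP
        0         NaN
        1    0.011111
        2         NaN
        3         NaN
        dtype: float64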
12079
12080 **DataFrame**
12081
12082 Percentage change in French franc, Deutsche Mark, and Italian lira from
12083 1980-01-01 to 1980-03-01.
12084
12085 >>> df = pd.DataFrame({
12086 ... 'FR': [4.0405, 4.0963, 4.3149],
12087 ... 'GR': [1.7246, 1.7482, 1.8519],
12088 ... 'IT': [804.74, 810.01, 860.13]},
12089 ... index=['1980-01-01', '1980-02-01', '1980-03-01'])
12090 >>> df
12091 FR GR IT
12092 1980-01-01 4.0405 1.7246 804.74
12093 1980-02-01 4.0963 1.7482 810.01
12094 1980-03-01 4.3149 1.8519 860.13
12095
12096 >>> df.pct_change()
12097 FR GR IT
12098 1980-01-01 NaN NaN NaN
12099 1980-02-01 0.013810 0.013684 0.006549
12100 1980-03-01 0.053365 0.059318 0.061876
12101
        Percentage change in GOOG and APPL stock volume. This shows how to
        compute the percentage change between columns.
12104
12105 >>> df = pd.DataFrame({
12106 ... '2016': [1769950, 30586265],
12107 ... '2015': [1500923, 40912316],
12108 ... '2014': [1371819, 41403351]},
12109 ... index=['GOOG', 'APPL'])
12110 >>> df
12111 2016 2015 2014
12112 GOOG 1769950 1500923 1371819
12113 APPL 30586265 40912316 41403351
12114
12115 >>> df.pct_change(axis='columns', periods=-1)
12116 2016 2015 2014
12117 GOOG 0.179241 0.094112 NaN
12118 APPL -0.252395 -0.011860 NaN
12119 """
12120 # GH#53491
12121 if fill_method not in (lib.no_default, None) or limit is not lib.no_default:
12122 warnings.warn(
12123 "The 'fill_method' keyword being not None and the 'limit' keyword in "
12124 f"{type(self).__name__}.pct_change are deprecated and will be removed "
12125 "in a future version. Either fill in any non-leading NA values prior "
12126 "to calling pct_change or specify 'fill_method=None' to not fill NA "
12127 "values.",
12128 FutureWarning,
12129 stacklevel=find_stack_level(),
12130 )
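        # emulate the deprecated defaults (pad fill, no limit), warning only
        # when a fillable (non-leading) NA is actually present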
12131 if fill_method is lib.no_default:
12132 if limit is lib.no_default:
12133 cols = self.items() if self.ndim == 2 else [(None, self)]
12134 for _, col in cols:
12135 if len(col) > 0:
12136 mask = col.isna().values
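                        # drop leading NAs: only NAs after the first valid
                        # value would actually be filled by the default 'pad'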
12137 mask = mask[np.argmax(~mask) :]
12138 if mask.any():
12139 warnings.warn(
12140 "The default fill_method='pad' in "
12141 f"{type(self).__name__}.pct_change is deprecated and "
12142 "will be removed in a future version. Either fill in "
12143 "any non-leading NA values prior to calling pct_change "
12144 "or specify 'fill_method=None' to not fill NA values.",
12145 FutureWarning,
12146 stacklevel=find_stack_level(),
12147 )
12148 break
12149 fill_method = "pad"
12150 if limit is lib.no_default:
12151 limit = None
12152
12153 axis = self._get_axis_number(kwargs.pop("axis", "index"))
12154 if fill_method is None:
12155 data = self
12156 else:
12157 data = self._pad_or_backfill(fill_method, axis=axis, limit=limit)
12158
12159 shifted = data.shift(periods=periods, freq=freq, axis=axis, **kwargs)
12160 # Unsupported left operand type for / ("Self")
12161 rs = data / shifted - 1 # type: ignore[operator]
12162 if freq is not None:
12163 # Shift method is implemented differently when freq is not None
12164 # We want to restore the original index
12165 rs = rs.loc[~rs.index.duplicated()]
12166 rs = rs.reindex_like(data)
12167 return rs.__finalize__(self, method="pct_change")
12168
12169 @final
12170 def _logical_func(
12171 self,
12172 name: str,
12173 func,
12174 axis: Axis | None = 0,
12175 bool_only: bool_t = False,
12176 skipna: bool_t = True,
12177 **kwargs,
12178 ) -> Series | bool_t:
12179 nv.validate_logical_func((), kwargs, fname=name)
12180 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
12181
12182 if self.ndim > 1 and axis is None:
12183 # Reduce along one dimension then the other, to simplify DataFrame._reduce
12184 res = self._logical_func(
12185 name, func, axis=0, bool_only=bool_only, skipna=skipna, **kwargs
12186 )
12187 # error: Item "bool" of "Series | bool" has no attribute "_logical_func"
12188 return res._logical_func( # type: ignore[union-attr]
12189 name, func, skipna=skipna, **kwargs
12190 )
12191 elif axis is None:
12192 axis = 0
12193
12194 if (
12195 self.ndim > 1
12196 and axis == 1
12197 and len(self._mgr.arrays) > 1
12198 # TODO(EA2D): special-case not needed
12199 and all(x.ndim == 2 for x in self._mgr.arrays)
12200 and not kwargs
12201 ):
12202 # Fastpath avoiding potentially expensive transpose
12203 obj = self
12204 if bool_only:
12205 obj = self._get_bool_data()
12206 return obj._reduce_axis1(name, func, skipna=skipna)
12207
12208 return self._reduce(
12209 func,
12210 name=name,
12211 axis=axis,
12212 skipna=skipna,
12213 numeric_only=bool_only,
12214 filter_type="bool",
12215 )
12216
12217 def any(
12218 self,
12219 axis: Axis | None = 0,
12220 bool_only: bool_t = False,
12221 skipna: bool_t = True,
12222 **kwargs,
12223 ) -> Series | bool_t:
12224 return self._logical_func(
12225 "any", nanops.nanany, axis, bool_only, skipna, **kwargs
12226 )
12227
12228 def all(
12229 self,
12230 axis: Axis = 0,
12231 bool_only: bool_t = False,
12232 skipna: bool_t = True,
12233 **kwargs,
12234 ) -> Series | bool_t:
12235 return self._logical_func(
12236 "all", nanops.nanall, axis, bool_only, skipna, **kwargs
12237 )
12238
12239 @final
12240 def _accum_func(
12241 self,
12242 name: str,
12243 func,
12244 axis: Axis | None = None,
12245 skipna: bool_t = True,
12246 *args,
12247 **kwargs,
12248 ):
12249 skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name)
12250 if axis is None:
12251 axis = 0
12252 else:
12253 axis = self._get_axis_number(axis)
12254
12255 if axis == 1:
12256 return self.T._accum_func(
12257 name, func, axis=0, skipna=skipna, *args, **kwargs # noqa: B026
12258 ).T
12259
12260 def block_accum_func(blk_values):
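            # 2D blocks store values transposed relative to the frame, so
            # flip them, accumulate along the frame's rows, and flip back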
12261 values = blk_values.T if hasattr(blk_values, "T") else blk_values
12262
12263 result: np.ndarray | ExtensionArray
12264 if isinstance(values, ExtensionArray):
12265 result = values._accumulate(name, skipna=skipna, **kwargs)
12266 else:
12267 result = nanops.na_accum_func(values, func, skipna=skipna)
12268
12269 result = result.T if hasattr(result, "T") else result
12270 return result
12271
12272 result = self._mgr.apply(block_accum_func)
12273
12274 return self._constructor_from_mgr(result, axes=result.axes).__finalize__(
12275 self, method=name
12276 )
12277
12278 def cummax(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
12279 return self._accum_func(
12280 "cummax", np.maximum.accumulate, axis, skipna, *args, **kwargs
12281 )
12282
12283 def cummin(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
12284 return self._accum_func(
12285 "cummin", np.minimum.accumulate, axis, skipna, *args, **kwargs
12286 )
12287
12288 def cumsum(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
12289 return self._accum_func("cumsum", np.cumsum, axis, skipna, *args, **kwargs)
12290
12291 def cumprod(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
12292 return self._accum_func("cumprod", np.cumprod, axis, skipna, *args, **kwargs)
12293
12294 @final
12295 def _stat_function_ddof(
12296 self,
12297 name: str,
12298 func,
12299 axis: Axis | None | lib.NoDefault = lib.no_default,
12300 skipna: bool_t = True,
12301 ddof: int = 1,
12302 numeric_only: bool_t = False,
12303 **kwargs,
12304 ) -> Series | float:
12305 nv.validate_stat_ddof_func((), kwargs, fname=name)
12306 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
12307
12308 if axis is None:
12309 if self.ndim > 1:
12310 warnings.warn(
12311 f"The behavior of {type(self).__name__}.{name} with axis=None "
12312 "is deprecated, in a future version this will reduce over both "
12313 "axes and return a scalar. To retain the old behavior, pass "
12314 "axis=0 (or do not pass axis)",
12315 FutureWarning,
12316 stacklevel=find_stack_level(),
12317 )
12318 axis = 0
12319 elif axis is lib.no_default:
12320 axis = 0
12321
12322 return self._reduce(
12323 func, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof
12324 )
12325
12326 def sem(
12327 self,
12328 axis: Axis | None = 0,
12329 skipna: bool_t = True,
12330 ddof: int = 1,
12331 numeric_only: bool_t = False,
12332 **kwargs,
12333 ) -> Series | float:
12334 return self._stat_function_ddof(
12335 "sem", nanops.nansem, axis, skipna, ddof, numeric_only, **kwargs
12336 )
12337
12338 def var(
12339 self,
12340 axis: Axis | None = 0,
12341 skipna: bool_t = True,
12342 ddof: int = 1,
12343 numeric_only: bool_t = False,
12344 **kwargs,
12345 ) -> Series | float:
12346 return self._stat_function_ddof(
12347 "var", nanops.nanvar, axis, skipna, ddof, numeric_only, **kwargs
12348 )
12349
12350 def std(
12351 self,
12352 axis: Axis | None = 0,
12353 skipna: bool_t = True,
12354 ddof: int = 1,
12355 numeric_only: bool_t = False,
12356 **kwargs,
12357 ) -> Series | float:
12358 return self._stat_function_ddof(
12359 "std", nanops.nanstd, axis, skipna, ddof, numeric_only, **kwargs
12360 )
12361
12362 @final
12363 def _stat_function(
12364 self,
12365 name: str,
12366 func,
12367 axis: Axis | None = 0,
12368 skipna: bool_t = True,
12369 numeric_only: bool_t = False,
12370 **kwargs,
12371 ):
12372 assert name in ["median", "mean", "min", "max", "kurt", "skew"], name
12373 nv.validate_func(name, (), kwargs)
12374
12375 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
12376
12377 return self._reduce(
12378 func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
12379 )
12380
12381 def min(
12382 self,
12383 axis: Axis | None = 0,
12384 skipna: bool_t = True,
12385 numeric_only: bool_t = False,
12386 **kwargs,
12387 ):
12388 return self._stat_function(
12389 "min",
12390 nanops.nanmin,
12391 axis,
12392 skipna,
12393 numeric_only,
12394 **kwargs,
12395 )
12396
12397 def max(
12398 self,
12399 axis: Axis | None = 0,
12400 skipna: bool_t = True,
12401 numeric_only: bool_t = False,
12402 **kwargs,
12403 ):
12404 return self._stat_function(
12405 "max",
12406 nanops.nanmax,
12407 axis,
12408 skipna,
12409 numeric_only,
12410 **kwargs,
12411 )
12412
12413 def mean(
12414 self,
12415 axis: Axis | None = 0,
12416 skipna: bool_t = True,
12417 numeric_only: bool_t = False,
12418 **kwargs,
12419 ) -> Series | float:
12420 return self._stat_function(
12421 "mean", nanops.nanmean, axis, skipna, numeric_only, **kwargs
12422 )
12423
12424 def median(
12425 self,
12426 axis: Axis | None = 0,
12427 skipna: bool_t = True,
12428 numeric_only: bool_t = False,
12429 **kwargs,
12430 ) -> Series | float:
12431 return self._stat_function(
12432 "median", nanops.nanmedian, axis, skipna, numeric_only, **kwargs
12433 )
12434
12435 def skew(
12436 self,
12437 axis: Axis | None = 0,
12438 skipna: bool_t = True,
12439 numeric_only: bool_t = False,
12440 **kwargs,
12441 ) -> Series | float:
12442 return self._stat_function(
12443 "skew", nanops.nanskew, axis, skipna, numeric_only, **kwargs
12444 )
12445
12446 def kurt(
12447 self,
12448 axis: Axis | None = 0,
12449 skipna: bool_t = True,
12450 numeric_only: bool_t = False,
12451 **kwargs,
12452 ) -> Series | float:
12453 return self._stat_function(
12454 "kurt", nanops.nankurt, axis, skipna, numeric_only, **kwargs
12455 )
12456
12457 kurtosis = kurt
12458
12459 @final
12460 def _min_count_stat_function(
12461 self,
12462 name: str,
12463 func,
12464 axis: Axis | None | lib.NoDefault = lib.no_default,
12465 skipna: bool_t = True,
12466 numeric_only: bool_t = False,
12467 min_count: int = 0,
12468 **kwargs,
12469 ):
12470 assert name in ["sum", "prod"], name
12471 nv.validate_func(name, (), kwargs)
12472
12473 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
12474
12475 if axis is None:
12476 if self.ndim > 1:
12477 warnings.warn(
12478 f"The behavior of {type(self).__name__}.{name} with axis=None "
12479 "is deprecated, in a future version this will reduce over both "
12480 "axes and return a scalar. To retain the old behavior, pass "
12481 "axis=0 (or do not pass axis)",
12482 FutureWarning,
12483 stacklevel=find_stack_level(),
12484 )
12485 axis = 0
12486 elif axis is lib.no_default:
12487 axis = 0
12488
12489 return self._reduce(
12490 func,
12491 name=name,
12492 axis=axis,
12493 skipna=skipna,
12494 numeric_only=numeric_only,
12495 min_count=min_count,
12496 )
12497
12498 def sum(
12499 self,
12500 axis: Axis | None = 0,
12501 skipna: bool_t = True,
12502 numeric_only: bool_t = False,
12503 min_count: int = 0,
12504 **kwargs,
12505 ):
12506 return self._min_count_stat_function(
12507 "sum", nanops.nansum, axis, skipna, numeric_only, min_count, **kwargs
12508 )
12509
12510 def prod(
12511 self,
12512 axis: Axis | None = 0,
12513 skipna: bool_t = True,
12514 numeric_only: bool_t = False,
12515 min_count: int = 0,
12516 **kwargs,
12517 ):
12518 return self._min_count_stat_function(
12519 "prod",
12520 nanops.nanprod,
12521 axis,
12522 skipna,
12523 numeric_only,
12524 min_count,
12525 **kwargs,
12526 )
12527
12528 product = prod
12529
12530 @final
12531 @doc(Rolling)
12532 def rolling(
12533 self,
12534 window: int | dt.timedelta | str | BaseOffset | BaseIndexer,
12535 min_periods: int | None = None,
12536 center: bool_t = False,
12537 win_type: str | None = None,
12538 on: str | None = None,
12539 axis: Axis | lib.NoDefault = lib.no_default,
12540 closed: IntervalClosedType | None = None,
12541 step: int | None = None,
12542 method: str = "single",
12543 ) -> Window | Rolling:
12544 if axis is not lib.no_default:
12545 axis = self._get_axis_number(axis)
12546 name = "rolling"
12547 if axis == 1:
12548 warnings.warn(
12549 f"Support for axis=1 in {type(self).__name__}.{name} is "
12550 "deprecated and will be removed in a future version. "
12551 f"Use obj.T.{name}(...) instead",
12552 FutureWarning,
12553 stacklevel=find_stack_level(),
12554 )
12555 else:
12556 warnings.warn(
12557 f"The 'axis' keyword in {type(self).__name__}.{name} is "
12558 "deprecated and will be removed in a future version. "
12559 "Call the method without the axis keyword instead.",
12560 FutureWarning,
12561 stacklevel=find_stack_level(),
12562 )
12563 else:
12564 axis = 0
12565
12566 if win_type is not None:
12567 return Window(
12568 self,
12569 window=window,
12570 min_periods=min_periods,
12571 center=center,
12572 win_type=win_type,
12573 on=on,
12574 axis=axis,
12575 closed=closed,
12576 step=step,
12577 method=method,
12578 )
12579
12580 return Rolling(
12581 self,
12582 window=window,
12583 min_periods=min_periods,
12584 center=center,
12585 win_type=win_type,
12586 on=on,
12587 axis=axis,
12588 closed=closed,
12589 step=step,
12590 method=method,
12591 )
12592
12593 @final
12594 @doc(Expanding)
12595 def expanding(
12596 self,
12597 min_periods: int = 1,
12598 axis: Axis | lib.NoDefault = lib.no_default,
12599 method: Literal["single", "table"] = "single",
12600 ) -> Expanding:
12601 if axis is not lib.no_default:
12602 axis = self._get_axis_number(axis)
12603 name = "expanding"
12604 if axis == 1:
12605 warnings.warn(
12606 f"Support for axis=1 in {type(self).__name__}.{name} is "
12607 "deprecated and will be removed in a future version. "
12608 f"Use obj.T.{name}(...) instead",
12609 FutureWarning,
12610 stacklevel=find_stack_level(),
12611 )
12612 else:
12613 warnings.warn(
12614 f"The 'axis' keyword in {type(self).__name__}.{name} is "
12615 "deprecated and will be removed in a future version. "
12616 "Call the method without the axis keyword instead.",
12617 FutureWarning,
12618 stacklevel=find_stack_level(),
12619 )
12620 else:
12621 axis = 0
12622 return Expanding(self, min_periods=min_periods, axis=axis, method=method)
12623
12624 @final
12625 @doc(ExponentialMovingWindow)
12626 def ewm(
12627 self,
12628 com: float | None = None,
12629 span: float | None = None,
12630 halflife: float | TimedeltaConvertibleTypes | None = None,
12631 alpha: float | None = None,
12632 min_periods: int | None = 0,
12633 adjust: bool_t = True,
12634 ignore_na: bool_t = False,
12635 axis: Axis | lib.NoDefault = lib.no_default,
12636 times: np.ndarray | DataFrame | Series | None = None,
12637 method: Literal["single", "table"] = "single",
12638 ) -> ExponentialMovingWindow:
12639 if axis is not lib.no_default:
12640 axis = self._get_axis_number(axis)
12641 name = "ewm"
12642 if axis == 1:
12643 warnings.warn(
12644 f"Support for axis=1 in {type(self).__name__}.{name} is "
12645 "deprecated and will be removed in a future version. "
12646 f"Use obj.T.{name}(...) instead",
12647 FutureWarning,
12648 stacklevel=find_stack_level(),
12649 )
12650 else:
12651 warnings.warn(
12652 f"The 'axis' keyword in {type(self).__name__}.{name} is "
12653 "deprecated and will be removed in a future version. "
12654 "Call the method without the axis keyword instead.",
12655 FutureWarning,
12656 stacklevel=find_stack_level(),
12657 )
12658 else:
12659 axis = 0
12660
12661 return ExponentialMovingWindow(
12662 self,
12663 com=com,
12664 span=span,
12665 halflife=halflife,
12666 alpha=alpha,
12667 min_periods=min_periods,
12668 adjust=adjust,
12669 ignore_na=ignore_na,
12670 axis=axis,
12671 times=times,
12672 method=method,
12673 )
12674
12675 # ----------------------------------------------------------------------
12676 # Arithmetic Methods
12677
12678 @final
12679 def _inplace_method(self, other, op) -> Self:
12680 """
12681 Wrap arithmetic method to operate inplace.
12682 """
12683 warn = True
12684 if not PYPY and warn_copy_on_write():
12685 if sys.getrefcount(self) <= REF_COUNT + 2:
12686 # we are probably in an inplace setitem context (e.g. df['a'] += 1)
12687 warn = False
12688
12689 result = op(self, other)
12690
12691 if (
12692 self.ndim == 1
12693 and result._indexed_same(self)
12694 and result.dtype == self.dtype
12695 and not using_copy_on_write()
12696 and not (warn_copy_on_write() and not warn)
12697 ):
12698 # GH#36498 this inplace op can _actually_ be inplace.
12699 # Item "ArrayManager" of "Union[ArrayManager, SingleArrayManager,
12700 # BlockManager, SingleBlockManager]" has no attribute "setitem_inplace"
12701 self._mgr.setitem_inplace( # type: ignore[union-attr]
12702 slice(None), result._values, warn=warn
12703 )
12704 return self
12705
12706 # Delete cacher
12707 self._reset_cacher()
12708
12709 # this makes sure that we are aligned like the input
12710 # we are updating inplace so we want to ignore is_copy
12711 self._update_inplace(
12712 result.reindex_like(self, copy=False), verify_is_copy=False
12713 )
12714 return self
12715
12716 @final
12717 def __iadd__(self, other) -> Self:
12718 # error: Unsupported left operand type for + ("Type[NDFrame]")
12719 return self._inplace_method(other, type(self).__add__) # type: ignore[operator]
12720
12721 @final
12722 def __isub__(self, other) -> Self:
12723 # error: Unsupported left operand type for - ("Type[NDFrame]")
12724 return self._inplace_method(other, type(self).__sub__) # type: ignore[operator]
12725
12726 @final
12727 def __imul__(self, other) -> Self:
12728 # error: Unsupported left operand type for * ("Type[NDFrame]")
12729 return self._inplace_method(other, type(self).__mul__) # type: ignore[operator]
12730
12731 @final
12732 def __itruediv__(self, other) -> Self:
12733 # error: Unsupported left operand type for / ("Type[NDFrame]")
12734 return self._inplace_method(
12735 other, type(self).__truediv__ # type: ignore[operator]
12736 )
12737
12738 @final
12739 def __ifloordiv__(self, other) -> Self:
12740 # error: Unsupported left operand type for // ("Type[NDFrame]")
12741 return self._inplace_method(
12742 other, type(self).__floordiv__ # type: ignore[operator]
12743 )
12744
12745 @final
12746 def __imod__(self, other) -> Self:
12747 # error: Unsupported left operand type for % ("Type[NDFrame]")
12748 return self._inplace_method(other, type(self).__mod__) # type: ignore[operator]
12749
12750 @final
12751 def __ipow__(self, other) -> Self:
12752 # error: Unsupported left operand type for ** ("Type[NDFrame]")
12753 return self._inplace_method(other, type(self).__pow__) # type: ignore[operator]
12754
12755 @final
12756 def __iand__(self, other) -> Self:
12757 # error: Unsupported left operand type for & ("Type[NDFrame]")
12758 return self._inplace_method(other, type(self).__and__) # type: ignore[operator]
12759
12760 @final
12761 def __ior__(self, other) -> Self:
12762 return self._inplace_method(other, type(self).__or__)
12763
12764 @final
12765 def __ixor__(self, other) -> Self:
12766 # error: Unsupported left operand type for ^ ("Type[NDFrame]")
12767 return self._inplace_method(other, type(self).__xor__) # type: ignore[operator]
12768
12769 # ----------------------------------------------------------------------
12770 # Misc methods
12771
12772 @final
12773 def _find_valid_index(self, *, how: str) -> Hashable | None:
12774 """
12775 Retrieves the index of the first valid value.
12776
12777 Parameters
12778 ----------
12779 how : {'first', 'last'}
12780 Use this parameter to change between the first or last valid index.
12781
12782 Returns
12783 -------
12784 idx_first_valid : type of index
12785 """
12786 is_valid = self.notna().values
12787 idxpos = find_valid_index(how=how, is_valid=is_valid)
12788 if idxpos is None:
12789 return None
12790 return self.index[idxpos]
12791
12792 @final
12793 @doc(position="first", klass=_shared_doc_kwargs["klass"])
12794 def first_valid_index(self) -> Hashable | None:
12795 """
12796 Return index for {position} non-NA value or None, if no non-NA value is found.
12797
12798 Returns
12799 -------
12800 type of index
12801
12802 Examples
12803 --------
12804 For Series:
12805
12806 >>> s = pd.Series([None, 3, 4])
12807 >>> s.first_valid_index()
12808 1
12809 >>> s.last_valid_index()
12810 2
12811
12812 >>> s = pd.Series([None, None])
12813 >>> print(s.first_valid_index())
12814 None
12815 >>> print(s.last_valid_index())
12816 None
12817
12818 If all elements in Series are NA/null, returns None.
12819
12820 >>> s = pd.Series()
12821 >>> print(s.first_valid_index())
12822 None
12823 >>> print(s.last_valid_index())
12824 None
12825
12826 If Series is empty, returns None.
12827
12828 For DataFrame:
12829
12830 >>> df = pd.DataFrame({{'A': [None, None, 2], 'B': [None, 3, 4]}})
12831 >>> df
12832 A B
12833 0 NaN NaN
12834 1 NaN 3.0
12835 2 2.0 4.0
12836 >>> df.first_valid_index()
12837 1
12838 >>> df.last_valid_index()
12839 2
12840
12841 >>> df = pd.DataFrame({{'A': [None, None, None], 'B': [None, None, None]}})
12842 >>> df
12843 A B
12844 0 None None
12845 1 None None
12846 2 None None
12847 >>> print(df.first_valid_index())
12848 None
12849 >>> print(df.last_valid_index())
12850 None
12851
12852 If all elements in DataFrame are NA/null, returns None.
12853
12854 >>> df = pd.DataFrame()
12855 >>> df
12856 Empty DataFrame
12857 Columns: []
12858 Index: []
12859 >>> print(df.first_valid_index())
12860 None
12861 >>> print(df.last_valid_index())
12862 None
12863
12864 If DataFrame is empty, returns None.
12865 """
12866 return self._find_valid_index(how="first")
12867
12868 @final
12869 @doc(first_valid_index, position="last", klass=_shared_doc_kwargs["klass"])
12870 def last_valid_index(self) -> Hashable | None:
12871 return self._find_valid_index(how="last")
12872
12873
12874_num_doc = """
12875{desc}
12876
12877Parameters
12878----------
12879axis : {axis_descr}
12880 Axis for the function to be applied on.
12881 For `Series` this parameter is unused and defaults to 0.
12882
12883 For DataFrames, specifying ``axis=None`` will apply the aggregation
12884 across both axes.
12885
12886 .. versionadded:: 2.0.0
12887
12888skipna : bool, default True
12889 Exclude NA/null values when computing the result.
12890numeric_only : bool, default False
12891 Include only float, int, boolean columns. Not implemented for Series.
12892
12893{min_count}\
12894**kwargs
12895 Additional keyword arguments to be passed to the function.
12896
12897Returns
12898-------
12899{name1} or scalar\
12900{see_also}\
12901{examples}
12902"""
12903
12904_sum_prod_doc = """
12905{desc}
12906
12907Parameters
12908----------
12909axis : {axis_descr}
12910 Axis for the function to be applied on.
12911 For `Series` this parameter is unused and defaults to 0.
12912
12913 .. warning::
12914
        The behavior of DataFrame.{name} with ``axis=None`` is deprecated;
        in a future version this will reduce over both axes and return a scalar.
        To retain the old behavior, pass axis=0 (or do not pass axis).
12918
12919 .. versionadded:: 2.0.0
12920
12921skipna : bool, default True
12922 Exclude NA/null values when computing the result.
12923numeric_only : bool, default False
12924 Include only float, int, boolean columns. Not implemented for Series.
12925
12926{min_count}\
12927**kwargs
12928 Additional keyword arguments to be passed to the function.
12929
12930Returns
12931-------
12932{name1} or scalar\
12933{see_also}\
12934{examples}
12935"""
12936
12937_num_ddof_doc = """
12938{desc}
12939
12940Parameters
12941----------
12942axis : {axis_descr}
12943 For `Series` this parameter is unused and defaults to 0.
12944
12945 .. warning::
12946
        The behavior of DataFrame.{name} with ``axis=None`` is deprecated;
        in a future version this will reduce over both axes and return a scalar.
        To retain the old behavior, pass axis=0 (or do not pass axis).
12950
12951skipna : bool, default True
12952 Exclude NA/null values. If an entire row/column is NA, the result
12953 will be NA.
12954ddof : int, default 1
12955 Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
12956 where N represents the number of elements.
12957numeric_only : bool, default False
12958 Include only float, int, boolean columns. Not implemented for Series.
12959
12960Returns
12961-------
12962{name1} or {name2} (if level specified) \
12963{notes}\
12964{examples}
12965"""
12966
12967_std_notes = """
12968
12969Notes
12970-----
To have the same behavior as `numpy.std`, use `ddof=0` (instead of the
default `ddof=1`)."""
12973
12974_std_examples = """
12975
12976Examples
12977--------
12978>>> df = pd.DataFrame({'person_id': [0, 1, 2, 3],
12979... 'age': [21, 25, 62, 43],
12980... 'height': [1.61, 1.87, 1.49, 2.01]}
12981... ).set_index('person_id')
12982>>> df
12983 age height
12984person_id
129850 21 1.61
129861 25 1.87
129872 62 1.49
129883 43 2.01
12989
12990The standard deviation of the columns can be found as follows:
12991
12992>>> df.std()
12993age 18.786076
12994height 0.237417
12995dtype: float64
12996
12997Alternatively, `ddof=0` can be set to normalize by N instead of N-1:
12998
12999>>> df.std(ddof=0)
13000age 16.269219
13001height 0.205609
13002dtype: float64"""
13003
13004_var_examples = """
13005
13006Examples
13007--------
13008>>> df = pd.DataFrame({'person_id': [0, 1, 2, 3],
13009... 'age': [21, 25, 62, 43],
13010... 'height': [1.61, 1.87, 1.49, 2.01]}
13011... ).set_index('person_id')
13012>>> df
13013 age height
13014person_id
130150 21 1.61
130161 25 1.87
130172 62 1.49
130183 43 2.01
13019
13020>>> df.var()
13021age 352.916667
13022height 0.056367
13023dtype: float64
13024
13025Alternatively, ``ddof=0`` can be set to normalize by N instead of N-1:
13026
13027>>> df.var(ddof=0)
13028age 264.687500
13029height 0.042275
13030dtype: float64"""
13031
13032_bool_doc = """
13033{desc}
13034
13035Parameters
13036----------
13037axis : {{0 or 'index', 1 or 'columns', None}}, default 0
13038 Indicate which axis or axes should be reduced. For `Series` this parameter
13039 is unused and defaults to 0.
13040
13041 * 0 / 'index' : reduce the index, return a Series whose index is the
13042 original column labels.
13043 * 1 / 'columns' : reduce the columns, return a Series whose index is the
13044 original index.
13045 * None : reduce all axes, return a scalar.
13046
13047bool_only : bool, default False
13048 Include only boolean columns. Not implemented for Series.
13049skipna : bool, default True
13050 Exclude NA/null values. If the entire row/column is NA and skipna is
13051 True, then the result will be {empty_value}, as for an empty row/column.
    If skipna is False, then NA values are treated as True, because they
    are not equal to zero.
13054**kwargs : any, default None
13055 Additional keywords have no effect but might be accepted for
13056 compatibility with NumPy.
13057
13058Returns
13059-------
13060{name1} or {name2}
    If level is specified, then {name2} is returned; otherwise, {name1}
    is returned.
13063
13064{see_also}
13065{examples}"""
13066
13067_all_desc = """\
13068Return whether all elements are True, potentially over an axis.
13069
Returns True unless there is at least one element within a Series or
along a DataFrame axis that is False or equivalent (e.g. zero or
empty)."""
13073
13074_all_examples = """\
13075Examples
13076--------
13077**Series**
13078
13079>>> pd.Series([True, True]).all()
13080True
13081>>> pd.Series([True, False]).all()
13082False
13083>>> pd.Series([], dtype="float64").all()
13084True
13085>>> pd.Series([np.nan]).all()
13086True
13087>>> pd.Series([np.nan]).all(skipna=False)
13088True
13089
13090**DataFrames**
13091
13092Create a dataframe from a dictionary.
13093
13094>>> df = pd.DataFrame({'col1': [True, True], 'col2': [True, False]})
13095>>> df
13096 col1 col2
130970 True True
130981 True False
13099
Default behavior checks if values in each column all return True.
13101
13102>>> df.all()
13103col1 True
13104col2 False
13105dtype: bool
13106
13107Specify ``axis='columns'`` to check if values in each row all return True.
13108
13109>>> df.all(axis='columns')
131100 True
131111 False
13112dtype: bool
13113
13114Or ``axis=None`` for whether every value is True.
13115
13116>>> df.all(axis=None)
13117False
13118"""

_all_see_also = """\
See Also
--------
Series.all : Return True if all elements are True.
DataFrame.any : Return True if one (or more) elements are True.
"""

_cnum_doc = """
Return cumulative {desc} over a DataFrame or Series axis.

Returns a DataFrame or Series of the same size containing the cumulative
{desc}.

Parameters
----------
axis : {{0 or 'index', 1 or 'columns'}}, default 0
    The index or the name of the axis. 0 is equivalent to None or 'index'.
    For `Series` this parameter is unused and defaults to 0.
skipna : bool, default True
    Exclude NA/null values. If an entire row/column is NA, the result
    will be NA.
*args, **kwargs
    Additional keywords have no effect but might be accepted for
    compatibility with NumPy.

Returns
-------
{name1} or {name2}
    Return cumulative {desc} of {name1} or {name2}.

See Also
--------
core.window.expanding.Expanding.{accum_func_name} : Similar functionality
    but ignores ``NaN`` values.
{name2}.{accum_func_name} : Return the {desc} over
    {name2} axis.
{name2}.cummax : Return cumulative maximum over {name2} axis.
{name2}.cummin : Return cumulative minimum over {name2} axis.
{name2}.cumsum : Return cumulative sum over {name2} axis.
{name2}.cumprod : Return cumulative product over {name2} axis.

{examples}"""
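
# Hedged illustration (comment only, mirroring how ``make_doc`` below
# consumes this template): each cumulative method substitutes its own
# ``desc`` and ``accum_func_name``, e.g.
#
#     >>> doc = make_doc("cumsum", ndim=2)
#     >>> "Return cumulative sum over a DataFrame or Series axis." in doc
#     True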

_cummin_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cummin()
0    2.0
1    NaN
2    2.0
3   -1.0
4   -1.0
dtype: float64

To include NA values in the operation, use ``skipna=False``:

>>> s.cummin(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the minimum
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cummin()
     A    B
0  2.0  1.0
1  2.0  NaN
2  1.0  0.0

To iterate over columns and find the minimum in each row,
use ``axis=1``:

>>> df.cummin(axis=1)
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0
"""

_cumsum_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cumsum()
0    2.0
1    NaN
2    7.0
3    6.0
4    6.0
dtype: float64

To include NA values in the operation, use ``skipna=False``:

>>> s.cumsum(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the sum
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cumsum()
     A    B
0  2.0  1.0
1  5.0  NaN
2  6.0  1.0

To iterate over columns and find the sum in each row,
use ``axis=1``:

>>> df.cumsum(axis=1)
     A    B
0  2.0  3.0
1  3.0  NaN
2  1.0  1.0
"""

_cumprod_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cumprod()
0     2.0
1     NaN
2    10.0
3   -10.0
4    -0.0
dtype: float64

To include NA values in the operation, use ``skipna=False``:

>>> s.cumprod(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the product
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cumprod()
     A    B
0  2.0  1.0
1  6.0  NaN
2  6.0  0.0

To iterate over columns and find the product in each row,
use ``axis=1``:

>>> df.cumprod(axis=1)
     A    B
0  2.0  2.0
1  3.0  NaN
2  1.0  0.0
"""

_cummax_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cummax()
0    2.0
1    NaN
2    5.0
3    5.0
4    5.0
dtype: float64

To include NA values in the operation, use ``skipna=False``:

>>> s.cummax(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the maximum
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cummax()
     A    B
0  2.0  1.0
1  3.0  NaN
2  3.0  1.0

To iterate over columns and find the maximum in each row,
use ``axis=1``:

>>> df.cummax(axis=1)
     A    B
0  2.0  2.0
1  3.0  NaN
2  1.0  1.0
"""

_any_see_also = """\
See Also
--------
numpy.any : NumPy version of this method.
Series.any : Return whether any element is True.
Series.all : Return whether all elements are True.
DataFrame.any : Return whether any element is True over requested axis.
DataFrame.all : Return whether all elements are True over requested axis.
"""

_any_desc = """\
Return whether any element is True, potentially over an axis.

Returns False unless there is at least one element within a series or
along a DataFrame axis that is True or equivalent (e.g. non-zero or
non-empty)."""

_any_examples = """\
Examples
--------
**Series**

For Series input, the output is a scalar indicating whether any element
is True.

>>> pd.Series([False, False]).any()
False
>>> pd.Series([True, False]).any()
True
>>> pd.Series([], dtype="float64").any()
False
>>> pd.Series([np.nan]).any()
False
>>> pd.Series([np.nan]).any(skipna=False)
True

**DataFrame**

Whether each column contains at least one True element (the default).

>>> df = pd.DataFrame({"A": [1, 2], "B": [0, 2], "C": [0, 0]})
>>> df
   A  B  C
0  1  0  0
1  2  2  0

>>> df.any()
A     True
B     True
C    False
dtype: bool

Aggregating over the columns.

>>> df = pd.DataFrame({"A": [True, False], "B": [1, 2]})
>>> df
       A  B
0   True  1
1  False  2

>>> df.any(axis='columns')
0    True
1    True
dtype: bool

>>> df = pd.DataFrame({"A": [True, False], "B": [1, 0]})
>>> df
       A  B
0   True  1
1  False  0

>>> df.any(axis='columns')
0     True
1    False
dtype: bool

Aggregating over the entire DataFrame with ``axis=None``.

>>> df.any(axis=None)
True

``any`` for an empty DataFrame is an empty Series.

>>> pd.DataFrame([]).any()
Series([], dtype: bool)
"""

_shared_docs[
    "stat_func_example"
] = """

Examples
--------
>>> idx = pd.MultiIndex.from_arrays([
...     ['warm', 'warm', 'cold', 'cold'],
...     ['dog', 'falcon', 'fish', 'spider']],
...     names=['blooded', 'animal'])
>>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
>>> s
blooded  animal
warm     dog       4
         falcon    2
cold     fish      0
         spider    8
Name: legs, dtype: int64

>>> s.{stat_func}()
{default_output}"""
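
# Hedged note (comment only): ``str.format`` silently ignores keyword
# arguments with no matching placeholder, so the extra ``verb`` and
# ``level_output_*`` keys passed below (apparently left over from the removed
# ``level`` examples) are harmless:
#
#     >>> "{stat_func}".format(stat_func="sum", verb="Sum")
#     'sum'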

_sum_examples = _shared_docs["stat_func_example"].format(
    stat_func="sum", verb="Sum", default_output=14, level_output_0=6, level_output_1=8
)

_sum_examples += """

By default, the sum of an empty or all-NA Series is ``0``.

>>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
0.0

This can be controlled with the ``min_count`` parameter. For example, if
you'd like the sum of an empty series to be NaN, pass ``min_count=1``.

>>> pd.Series([], dtype="float64").sum(min_count=1)
nan

Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
empty series identically.

>>> pd.Series([np.nan]).sum()
0.0

>>> pd.Series([np.nan]).sum(min_count=1)
nan"""
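
# Hedged aside (comment only): ``min_count`` also applies to non-empty
# input; with fewer valid values than the threshold the reduction is NA,
# e.g. in a REPL (assuming the usual ``pd``/``np`` imports):
#
#     >>> pd.Series([1.0, 2.0, np.nan]).sum(min_count=3)
#     nan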

_max_examples: str = _shared_docs["stat_func_example"].format(
    stat_func="max", verb="Max", default_output=8, level_output_0=4, level_output_1=8
)

_min_examples: str = _shared_docs["stat_func_example"].format(
    stat_func="min", verb="Min", default_output=0, level_output_0=2, level_output_1=0
)

_stat_func_see_also = """

See Also
--------
Series.sum : Return the sum.
Series.min : Return the minimum.
Series.max : Return the maximum.
Series.idxmin : Return the index of the minimum.
Series.idxmax : Return the index of the maximum.
DataFrame.sum : Return the sum over the requested axis.
DataFrame.min : Return the minimum over the requested axis.
DataFrame.max : Return the maximum over the requested axis.
DataFrame.idxmin : Return the index of the minimum over the requested axis.
DataFrame.idxmax : Return the index of the maximum over the requested axis."""

_prod_examples = """

Examples
--------
By default, the product of an empty or all-NA Series is ``1``.

>>> pd.Series([], dtype="float64").prod()
1.0

This can be controlled with the ``min_count`` parameter.

>>> pd.Series([], dtype="float64").prod(min_count=1)
nan

Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
empty series identically.

>>> pd.Series([np.nan]).prod()
1.0

>>> pd.Series([np.nan]).prod(min_count=1)
nan"""

_min_count_stub = """\
min_count : int, default 0
    The required number of valid values to perform the operation. If fewer than
    ``min_count`` non-NA values are present the result will be NA.
"""


def make_doc(name: str, ndim: int) -> str:
    """
    Generate the docstring for a Series/DataFrame reduction.
    """
    if ndim == 1:
        name1 = "scalar"
        name2 = "Series"
        axis_descr = "{index (0)}"
    else:
        name1 = "Series"
        name2 = "DataFrame"
        axis_descr = "{index (0), columns (1)}"

    if name == "any":
        base_doc = _bool_doc
        desc = _any_desc
        see_also = _any_see_also
        examples = _any_examples
        kwargs = {"empty_value": "False"}
    elif name == "all":
        base_doc = _bool_doc
        desc = _all_desc
        see_also = _all_see_also
        examples = _all_examples
        kwargs = {"empty_value": "True"}
    elif name == "min":
        base_doc = _num_doc
        desc = (
            "Return the minimum of the values over the requested axis.\n\n"
            "If you want the *index* of the minimum, use ``idxmin``. This is "
            "the equivalent of the ``numpy.ndarray`` method ``argmin``."
        )
        see_also = _stat_func_see_also
        examples = _min_examples
        kwargs = {"min_count": ""}
    elif name == "max":
        base_doc = _num_doc
        desc = (
            "Return the maximum of the values over the requested axis.\n\n"
            "If you want the *index* of the maximum, use ``idxmax``. This is "
            "the equivalent of the ``numpy.ndarray`` method ``argmax``."
        )
        see_also = _stat_func_see_also
        examples = _max_examples
        kwargs = {"min_count": ""}

    elif name == "sum":
        base_doc = _sum_prod_doc
        desc = (
            "Return the sum of the values over the requested axis.\n\n"
            "This is equivalent to the method ``numpy.sum``."
        )
        see_also = _stat_func_see_also
        examples = _sum_examples
        kwargs = {"min_count": _min_count_stub}

    elif name == "prod":
        base_doc = _sum_prod_doc
        desc = "Return the product of the values over the requested axis."
        see_also = _stat_func_see_also
        examples = _prod_examples
        kwargs = {"min_count": _min_count_stub}

    elif name == "median":
        base_doc = _num_doc
        desc = "Return the median of the values over the requested axis."
        see_also = ""
        examples = """

            Examples
            --------
            >>> s = pd.Series([1, 2, 3])
            >>> s.median()
            2.0

            With a DataFrame

            >>> df = pd.DataFrame({'a': [1, 2], 'b': [2, 3]}, index=['tiger', 'zebra'])
            >>> df
                   a  b
            tiger  1  2
            zebra  2  3
            >>> df.median()
            a    1.5
            b    2.5
            dtype: float64

            Using axis=1

            >>> df.median(axis=1)
            tiger    1.5
            zebra    2.5
            dtype: float64

            In this case, `numeric_only` should be set to `True`
            to avoid getting an error.

            >>> df = pd.DataFrame({'a': [1, 2], 'b': ['T', 'Z']},
            ...                   index=['tiger', 'zebra'])
            >>> df.median(numeric_only=True)
            a    1.5
            dtype: float64"""
        kwargs = {"min_count": ""}

    elif name == "mean":
        base_doc = _num_doc
        desc = "Return the mean of the values over the requested axis."
        see_also = ""
        examples = """

            Examples
            --------
            >>> s = pd.Series([1, 2, 3])
            >>> s.mean()
            2.0

            With a DataFrame

            >>> df = pd.DataFrame({'a': [1, 2], 'b': [2, 3]}, index=['tiger', 'zebra'])
            >>> df
                   a  b
            tiger  1  2
            zebra  2  3
            >>> df.mean()
            a    1.5
            b    2.5
            dtype: float64

            Using axis=1

            >>> df.mean(axis=1)
            tiger    1.5
            zebra    2.5
            dtype: float64

            In this case, `numeric_only` should be set to `True` to avoid
            getting an error.

            >>> df = pd.DataFrame({'a': [1, 2], 'b': ['T', 'Z']},
            ...                   index=['tiger', 'zebra'])
            >>> df.mean(numeric_only=True)
            a    1.5
            dtype: float64"""
        kwargs = {"min_count": ""}

    elif name == "var":
        base_doc = _num_ddof_doc
        desc = (
            "Return unbiased variance over requested axis.\n\nNormalized by "
            "N-1 by default. This can be changed using the ddof argument."
        )
        examples = _var_examples
        see_also = ""
        kwargs = {"notes": ""}

    elif name == "std":
        base_doc = _num_ddof_doc
        desc = (
            "Return sample standard deviation over requested axis."
            "\n\nNormalized by N-1 by default. This can be changed using the "
            "ddof argument."
        )
        examples = _std_examples
        see_also = ""
        kwargs = {"notes": _std_notes}

    elif name == "sem":
        base_doc = _num_ddof_doc
        desc = (
            "Return unbiased standard error of the mean over requested "
            "axis.\n\nNormalized by N-1 by default. This can be changed "
            "using the ddof argument."
        )
        examples = """

            Examples
            --------
            >>> s = pd.Series([1, 2, 3])
            >>> s.sem().round(6)
            0.57735

            With a DataFrame

            >>> df = pd.DataFrame({'a': [1, 2], 'b': [2, 3]}, index=['tiger', 'zebra'])
            >>> df
                   a  b
            tiger  1  2
            zebra  2  3
            >>> df.sem()
            a    0.5
            b    0.5
            dtype: float64

            Using axis=1

            >>> df.sem(axis=1)
            tiger    0.5
            zebra    0.5
            dtype: float64

            In this case, `numeric_only` should be set to `True`
            to avoid getting an error.

            >>> df = pd.DataFrame({'a': [1, 2], 'b': ['T', 'Z']},
            ...                   index=['tiger', 'zebra'])
            >>> df.sem(numeric_only=True)
            a    0.5
            dtype: float64"""
        see_also = ""
        kwargs = {"notes": ""}

    elif name == "skew":
        base_doc = _num_doc
        desc = "Return unbiased skew over requested axis.\n\nNormalized by N-1."
        see_also = ""
        examples = """

            Examples
            --------
            >>> s = pd.Series([1, 2, 3])
            >>> s.skew()
            0.0

            With a DataFrame

            >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [2, 3, 4], 'c': [1, 3, 5]},
            ...                   index=['tiger', 'zebra', 'cow'])
            >>> df
                   a  b  c
            tiger  1  2  1
            zebra  2  3  3
            cow    3  4  5
            >>> df.skew()
            a    0.0
            b    0.0
            c    0.0
            dtype: float64

            Using axis=1

            >>> df.skew(axis=1)
            tiger    1.732051
            zebra   -1.732051
            cow      0.000000
            dtype: float64

            In this case, `numeric_only` should be set to `True` to avoid
            getting an error.

            >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': ['T', 'Z', 'X']},
            ...                   index=['tiger', 'zebra', 'cow'])
            >>> df.skew(numeric_only=True)
            a    0.0
            dtype: float64"""
        kwargs = {"min_count": ""}
    elif name == "kurt":
        base_doc = _num_doc
        desc = (
            "Return unbiased kurtosis over requested axis.\n\n"
            "Kurtosis obtained using Fisher's definition of\n"
            "kurtosis (kurtosis of normal == 0.0). Normalized "
            "by N-1."
        )
        see_also = ""
        examples = """

            Examples
            --------
            >>> s = pd.Series([1, 2, 2, 3], index=['cat', 'dog', 'dog', 'mouse'])
            >>> s
            cat      1
            dog      2
            dog      2
            mouse    3
            dtype: int64
            >>> s.kurt()
            1.5

            With a DataFrame

            >>> df = pd.DataFrame({'a': [1, 2, 2, 3], 'b': [3, 4, 4, 4]},
            ...                   index=['cat', 'dog', 'dog', 'mouse'])
            >>> df
                   a  b
            cat    1  3
            dog    2  4
            dog    2  4
            mouse  3  4
            >>> df.kurt()
            a    1.5
            b    4.0
            dtype: float64

            With axis=None

            >>> df.kurt(axis=None).round(6)
            -0.988693

            Using axis=1

            >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [3, 4], 'd': [1, 2]},
            ...                   index=['cat', 'dog'])
            >>> df.kurt(axis=1)
            cat   -6.0
            dog   -6.0
            dtype: float64"""
        kwargs = {"min_count": ""}

    elif name == "cumsum":
        base_doc = _cnum_doc
        desc = "sum"
        see_also = ""
        examples = _cumsum_examples
        kwargs = {"accum_func_name": "sum"}

    elif name == "cumprod":
        base_doc = _cnum_doc
        desc = "product"
        see_also = ""
        examples = _cumprod_examples
        kwargs = {"accum_func_name": "prod"}

    elif name == "cummin":
        base_doc = _cnum_doc
        desc = "minimum"
        see_also = ""
        examples = _cummin_examples
        kwargs = {"accum_func_name": "min"}

    elif name == "cummax":
        base_doc = _cnum_doc
        desc = "maximum"
        see_also = ""
        examples = _cummax_examples
        kwargs = {"accum_func_name": "max"}

    else:
        # Include the offending name so unsupported reductions fail loudly.
        raise NotImplementedError(name)

    docstr = base_doc.format(
        desc=desc,
        name=name,
        name1=name1,
        name2=name2,
        axis_descr=axis_descr,
        see_also=see_also,
        examples=examples,
        **kwargs,
    )
    return docstr
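

# Illustrative sketch (hypothetical helper, not the actual pandas wiring):
# the generated docstring is attached to the concrete reduction methods at
# class-definition time, roughly along the lines of the decorator below.
def _with_reduction_doc(name: str, ndim: int) -> Callable[[T], T]:
    """Hypothetical example: attach the docstring built by ``make_doc``."""

    def decorator(func: T) -> T:
        # ``make_doc`` returns the fully formatted numpydoc string for the
        # given reduction and dimensionality (1 -> Series, 2 -> DataFrame).
        func.__doc__ = make_doc(name, ndim)
        return func

    return decorator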