# pyright: reportPropertyTypeMismatch=false
from __future__ import annotations

import collections
import datetime as dt
from functools import partial
import gc
from json import loads
import operator
import pickle
import re
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    ClassVar,
    Hashable,
    Iterator,
    Literal,
    Mapping,
    NoReturn,
    Sequence,
    Type,
    cast,
    final,
    overload,
)
import warnings
import weakref

import numpy as np

from pandas._config import (
    config,
    using_copy_on_write,
)

from pandas._libs import lib
from pandas._libs.lib import is_range_indexer
from pandas._libs.tslibs import (
    Period,
    Tick,
    Timestamp,
    to_offset,
)
from pandas._typing import (
    AlignJoin,
    AnyArrayLike,
    ArrayLike,
    Axis,
    AxisInt,
    CompressionOptions,
    Dtype,
    DtypeArg,
    DtypeBackend,
    DtypeObj,
    FilePath,
    FillnaOptions,
    FloatFormatType,
    FormattersType,
    Frequency,
    IgnoreRaise,
    IndexKeyFunc,
    IndexLabel,
    IntervalClosedType,
    JSONSerializable,
    Level,
    Manager,
    NaPosition,
    NDFrameT,
    RandomState,
    Renamer,
    Scalar,
    SortKind,
    StorageOptions,
    Suffixes,
    T,
    TimeAmbiguous,
    TimedeltaConvertibleTypes,
    TimeNonexistent,
    TimestampConvertibleTypes,
    ValueKeyFunc,
    WriteBuffer,
    npt,
)
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.errors import (
    AbstractMethodError,
    InvalidIndexError,
    SettingWithCopyError,
    SettingWithCopyWarning,
)
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import (
    check_dtype_backend,
    validate_ascending,
    validate_bool_kwarg,
    validate_fillna_kwargs,
    validate_inclusive,
)

from pandas.core.dtypes.astype import astype_is_view
from pandas.core.dtypes.common import (
    ensure_object,
    ensure_platform_int,
    ensure_str,
    is_bool,
    is_bool_dtype,
    is_datetime64_any_dtype,
    is_datetime64tz_dtype,
    is_dict_like,
    is_dtype_equal,
    is_extension_array_dtype,
    is_float,
    is_list_like,
    is_number,
    is_numeric_dtype,
    is_re_compilable,
    is_scalar,
    is_timedelta64_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCSeries,
)
from pandas.core.dtypes.inference import (
    is_hashable,
    is_nested_list_like,
)
from pandas.core.dtypes.missing import (
    isna,
    notna,
)

from pandas.core import (
    algorithms as algos,
    arraylike,
    common,
    indexing,
    nanops,
    sample,
)
from pandas.core.array_algos.replace import should_use_regex
from pandas.core.arrays import ExtensionArray
from pandas.core.base import PandasObject
from pandas.core.construction import extract_array
from pandas.core.flags import Flags
from pandas.core.indexes.api import (
    DatetimeIndex,
    Index,
    MultiIndex,
    PeriodIndex,
    RangeIndex,
    default_index,
    ensure_index,
)
from pandas.core.internals import (
    ArrayManager,
    BlockManager,
    SingleArrayManager,
)
from pandas.core.internals.construction import (
    mgr_to_mgr,
    ndarray_to_mgr,
)
from pandas.core.methods.describe import describe_ndframe
from pandas.core.missing import (
    clean_fill_method,
    clean_reindex_fill_method,
    find_valid_index,
)
from pandas.core.ops import align_method_FRAME
from pandas.core.reshape.concat import concat
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import get_indexer_indexer
from pandas.core.window import (
    Expanding,
    ExponentialMovingWindow,
    Rolling,
    Window,
)

from pandas.io.formats.format import (
    DataFrameFormatter,
    DataFrameRenderer,
)
from pandas.io.formats.printing import pprint_thing

if TYPE_CHECKING:
    from pandas._libs.tslibs import BaseOffset

    from pandas.core.frame import DataFrame
    from pandas.core.indexers.objects import BaseIndexer
    from pandas.core.resample import Resampler
    from pandas.core.series import Series

    from pandas.io.pytables import HDFStore


# goal is to be able to define the docs close to function, while still being
# able to share
_shared_docs = {**_shared_docs}
_shared_doc_kwargs = {
    "axes": "keywords for axes",
    "klass": "Series/DataFrame",
    "axes_single_arg": "int or labels for object",
    "args_transpose": "axes to permute (int or label for object)",
    "inplace": """
    inplace : bool, default False
        If True, performs operation inplace and returns None.""",
    "optional_by": """
        by : str or list of str
            Name or list of names to sort by""",
    "replace_iloc": """
    This differs from updating with ``.loc`` or ``.iloc``, which require
    you to specify a location to update with some value.""",
}


bool_t = bool  # Need alias because NDFrame has def bool:


class NDFrame(PandasObject, indexing.IndexingMixin):
    """
    N-dimensional analogue of DataFrame. Store multi-dimensional data in a
    size-mutable, labeled data structure.

    Parameters
    ----------
    data : BlockManager
    axes : list
    copy : bool, default False
    """
237
238 _internal_names: list[str] = [
239 "_mgr",
240 "_cacher",
241 "_item_cache",
242 "_cache",
243 "_is_copy",
244 "_subtyp",
245 "_name",
246 "_default_kind",
247 "_default_fill_value",
248 "_metadata",
249 "__array_struct__",
250 "__array_interface__",
251 "_flags",
252 ]
253 _internal_names_set: set[str] = set(_internal_names)
254 _accessors: set[str] = set()
255 _hidden_attrs: frozenset[str] = frozenset([])
256 _metadata: list[str] = []
257 _is_copy: weakref.ReferenceType[NDFrame] | None = None
258 _mgr: Manager
259 _attrs: dict[Hashable, Any]
260 _typ: str
261
262 # ----------------------------------------------------------------------
263 # Constructors
264
265 def __init__(
266 self,
267 data: Manager,
268 copy: bool_t = False,
269 attrs: Mapping[Hashable, Any] | None = None,
270 ) -> None:
271 # copy kwarg is retained for mypy compat, is not used
272
273 object.__setattr__(self, "_is_copy", None)
274 object.__setattr__(self, "_mgr", data)
275 object.__setattr__(self, "_item_cache", {})
276 if attrs is None:
277 attrs = {}
278 else:
279 attrs = dict(attrs)
280 object.__setattr__(self, "_attrs", attrs)
281 object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True))
282
283 @classmethod
284 def _init_mgr(
285 cls,
286 mgr: Manager,
287 axes,
288 dtype: Dtype | None = None,
289 copy: bool_t = False,
290 ) -> Manager:
291 """passed a manager and a axes dict"""
        for a, axe in axes.items():
            if axe is not None:
                axe = ensure_index(axe)
                bm_axis = cls._get_block_manager_axis(a)
                mgr = mgr.reindex_axis(axe, axis=bm_axis)

        # make a copy if explicitly requested
        if copy:
            mgr = mgr.copy()
        if dtype is not None:
            # avoid further copies if we can
            if (
                isinstance(mgr, BlockManager)
                and len(mgr.blocks) == 1
                and is_dtype_equal(mgr.blocks[0].values.dtype, dtype)
            ):
                pass
            else:
                mgr = mgr.astype(dtype=dtype)
        return mgr

    def _as_manager(self: NDFrameT, typ: str, copy: bool_t = True) -> NDFrameT:
        """
        Private helper function to create a DataFrame with specific manager.

        Parameters
        ----------
        typ : {"block", "array"}
        copy : bool, default True
            Only controls whether the conversion from Block->ArrayManager
            copies the 1D arrays (to ensure proper/contiguous memory layout).

        Returns
        -------
        DataFrame
            New DataFrame using specified manager type. Is not guaranteed
            to be a copy or not.
        """
        new_mgr: Manager
        new_mgr = mgr_to_mgr(self._mgr, typ=typ, copy=copy)
        # fastpath of passing a manager doesn't check the option/manager class
        return self._constructor(new_mgr).__finalize__(self)

    # ----------------------------------------------------------------------
    # attrs and flags

    @property
    def attrs(self) -> dict[Hashable, Any]:
        """
        Dictionary of global attributes of this dataset.

        .. warning::

            attrs is experimental and may change without warning.

        See Also
        --------
        DataFrame.flags : Global flags applying to this object.
        """
        if self._attrs is None:
            self._attrs = {}
        return self._attrs

    @attrs.setter
    def attrs(self, value: Mapping[Hashable, Any]) -> None:
        self._attrs = dict(value)

    @final
    @property
    def flags(self) -> Flags:
        """
        Get the properties associated with this pandas object.

        The available flags are

        * :attr:`Flags.allows_duplicate_labels`

        See Also
        --------
        Flags : Flags that apply to pandas objects.
        DataFrame.attrs : Global metadata applying to this dataset.

        Notes
        -----
        "Flags" differ from "metadata". Flags reflect properties of the
        pandas object (the Series or DataFrame). Metadata refer to properties
        of the dataset, and should be stored in :attr:`DataFrame.attrs`.

        Examples
        --------
        >>> df = pd.DataFrame({"A": [1, 2]})
        >>> df.flags
        <Flags(allows_duplicate_labels=True)>

        Flags can be read or set using ``.``

        >>> df.flags.allows_duplicate_labels
        True
        >>> df.flags.allows_duplicate_labels = False

        Or by slicing with a key

        >>> df.flags["allows_duplicate_labels"]
        False
        >>> df.flags["allows_duplicate_labels"] = True
        """
        return self._flags

    @final
    def set_flags(
        self: NDFrameT,
        *,
        copy: bool_t = False,
        allows_duplicate_labels: bool_t | None = None,
    ) -> NDFrameT:
        """
        Return a new object with updated flags.

        Parameters
        ----------
        copy : bool, default False
            Specify if a copy of the object should be made.
        allows_duplicate_labels : bool, optional
            Whether the returned object allows duplicate labels.

        Returns
        -------
        Series or DataFrame
            The same type as the caller.

        See Also
        --------
        DataFrame.attrs : Global metadata applying to this dataset.
        DataFrame.flags : Global flags applying to this object.

        Notes
        -----
        This method returns a new object that's a view on the same data
        as the input. Mutating the input or the output values will be reflected
        in the other.

        This method is intended to be used in method chains.

        "Flags" differ from "metadata". Flags reflect properties of the
        pandas object (the Series or DataFrame). Metadata refer to properties
        of the dataset, and should be stored in :attr:`DataFrame.attrs`.

        Examples
        --------
        >>> df = pd.DataFrame({"A": [1, 2]})
        >>> df.flags.allows_duplicate_labels
        True
        >>> df2 = df.set_flags(allows_duplicate_labels=False)
        >>> df2.flags.allows_duplicate_labels
        False
        """
        df = self.copy(deep=copy and not using_copy_on_write())
        if allows_duplicate_labels is not None:
            df.flags["allows_duplicate_labels"] = allows_duplicate_labels
        return df

    @final
    @classmethod
    def _validate_dtype(cls, dtype) -> DtypeObj | None:
        """validate the passed dtype"""
        if dtype is not None:
            dtype = pandas_dtype(dtype)

            # a compound dtype
            if dtype.kind == "V":
                raise NotImplementedError(
                    "compound dtypes are not implemented "
                    f"in the {cls.__name__} constructor"
                )

        return dtype

    # ----------------------------------------------------------------------
    # Construction

    @property
    def _constructor(self: NDFrameT) -> Callable[..., NDFrameT]:
        """
        Used when a manipulation result has the same dimensions as the
        original.
        """
        raise AbstractMethodError(self)

    # ----------------------------------------------------------------------
    # Internals

    @final
    @property
    def _data(self):
        # GH#33054 retained because some downstream packages use this,
        # e.g. fastparquet
        return self._mgr

    # ----------------------------------------------------------------------
    # Axis
    _stat_axis_number = 0
    _stat_axis_name = "index"
    _AXIS_ORDERS: list[Literal["index", "columns"]]
    _AXIS_TO_AXIS_NUMBER: dict[Axis, AxisInt] = {0: 0, "index": 0, "rows": 0}
    _info_axis_number: int
    _info_axis_name: Literal["index", "columns"]
    _AXIS_LEN: int

    @final
    def _construct_axes_dict(self, axes: Sequence[Axis] | None = None, **kwargs):
        """Return an axes dictionary for myself."""
        d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)}
        # error: Argument 1 to "update" of "MutableMapping" has incompatible type
        # "Dict[str, Any]"; expected "SupportsKeysAndGetItem[Union[int, str], Any]"
        d.update(kwargs)  # type: ignore[arg-type]
        return d

    @final
    @classmethod
    def _get_axis_number(cls, axis: Axis) -> AxisInt:
        try:
            return cls._AXIS_TO_AXIS_NUMBER[axis]
        except KeyError:
            raise ValueError(f"No axis named {axis} for object type {cls.__name__}")

    @final
    @classmethod
    def _get_axis_name(cls, axis: Axis) -> Literal["index", "columns"]:
        axis_number = cls._get_axis_number(axis)
        return cls._AXIS_ORDERS[axis_number]

    @final
    def _get_axis(self, axis: Axis) -> Index:
        axis_number = self._get_axis_number(axis)
        assert axis_number in {0, 1}
        return self.index if axis_number == 0 else self.columns

    @final
    @classmethod
    def _get_block_manager_axis(cls, axis: Axis) -> AxisInt:
        """Map the axis to the block_manager axis."""
        axis = cls._get_axis_number(axis)
        ndim = cls._AXIS_LEN
        if ndim == 2:
            # i.e. DataFrame
            return 1 - axis
        return axis
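
    # For a DataFrame, the user-facing axes and the BlockManager axes are
    # transposed, so ``_get_block_manager_axis`` flips them. An illustrative
    # sketch (values follow from the mapping above):
    #
    #     >>> pd.DataFrame._get_block_manager_axis(0)  # rows -> BM axis 1
    #     1
    #     >>> pd.DataFrame._get_block_manager_axis("columns")
    #     0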

    @final
    def _get_axis_resolvers(self, axis: str) -> dict[str, Series | MultiIndex]:
        # index or columns
        axis_index = getattr(self, axis)
        d = {}
        prefix = axis[0]

        for i, name in enumerate(axis_index.names):
            if name is not None:
                key = level = name
            else:
                # prefix with 'i' or 'c' depending on the input axis
                # e.g., you must do ilevel_0 for the 0th level of an unnamed
                # multiindex
                key = f"{prefix}level_{i}"
                level = i

            level_values = axis_index.get_level_values(level)
            s = level_values.to_series()
            s.index = axis_index
            d[key] = s

        # put the index/columns itself in the dict
        if isinstance(axis_index, MultiIndex):
            dindex = axis_index
        else:
            dindex = axis_index.to_series()

        d[axis] = dindex
        return d

    @final
    def _get_index_resolvers(self) -> dict[Hashable, Series | MultiIndex]:
        from pandas.core.computation.parsing import clean_column_name

        d: dict[str, Series | MultiIndex] = {}
        for axis_name in self._AXIS_ORDERS:
            d.update(self._get_axis_resolvers(axis_name))

        return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)}

    @final
    def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]:
        """
        Return the special character free column resolvers of a dataframe.

        Column names with special characters are 'cleaned up' so that they can
        be referred to by backtick quoting.
        Used in :meth:`DataFrame.eval`.
        """
        from pandas.core.computation.parsing import clean_column_name

        if isinstance(self, ABCSeries):
            return {clean_column_name(self.name): self}

        return {
            clean_column_name(k): v for k, v in self.items() if not isinstance(k, int)
        }
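
    # These resolvers are what let ``DataFrame.eval``/``DataFrame.query``
    # refer to awkward column names via backtick quoting. An illustrative
    # sketch (the frame below is hypothetical):
    #
    #     >>> df = pd.DataFrame({"total sales": [10, 20], "cost": [5, 8]})
    #     >>> df.query("`total sales` > 12")
    #        total sales  cost
    #     1           20     8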

    @property
    def _info_axis(self) -> Index:
        return getattr(self, self._info_axis_name)

    @property
    def _stat_axis(self) -> Index:
        return getattr(self, self._stat_axis_name)

    @property
    def shape(self) -> tuple[int, ...]:
        """
        Return a tuple of axis dimensions.
        """
        return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS)

    @property
    def axes(self) -> list[Index]:
        """
        Return index label(s) of the internal NDFrame.
        """
        # we do it this way because if we have reversed axes, then
        # the block manager shows them reversed
        return [self._get_axis(a) for a in self._AXIS_ORDERS]

    @property
    def ndim(self) -> int:
        """
        Return an int representing the number of axes / array dimensions.

        Return 1 if Series. Otherwise return 2 if DataFrame.

        See Also
        --------
        ndarray.ndim : Number of array dimensions.

        Examples
        --------
        >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
        >>> s.ndim
        1

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.ndim
        2
        """
        return self._mgr.ndim

    @property
    def size(self) -> int:
        """
        Return an int representing the number of elements in this object.

        Return the number of rows if Series. Otherwise return the number of
        rows times number of columns if DataFrame.

        See Also
        --------
        ndarray.size : Number of elements in the array.

        Examples
        --------
        >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
        >>> s.size
        3

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.size
        4
        """
        # error: Incompatible return value type (got "signedinteger[_64Bit]",
        # expected "int") [return-value]
        return np.prod(self.shape)  # type: ignore[return-value]

    def set_axis(
        self: NDFrameT,
        labels,
        *,
        axis: Axis = 0,
        copy: bool_t | None = None,
    ) -> NDFrameT:
        """
        Assign desired index to given axis.

        Indexes for%(extended_summary_sub)s row labels can be changed by assigning
        a list-like or Index.

        Parameters
        ----------
        labels : list-like, Index
            The values for the new index.

        axis : %(axes_single_arg)s, default 0
            The axis to update. The value 0 identifies the rows. For `Series`
            this parameter is unused and defaults to 0.

        copy : bool, default True
            Whether to make a copy of the underlying data.

            .. versionadded:: 1.5.0

        Returns
        -------
        %(klass)s
            An object of type %(klass)s.

        See Also
        --------
        %(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s.
        """
        return self._set_axis_nocheck(labels, axis, inplace=False, copy=copy)
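
    # An illustrative sketch of ``set_axis`` (the frame is hypothetical):
    #
    #     >>> df = pd.DataFrame({"A": [1, 2, 3]})
    #     >>> df.set_axis(["a", "b", "c"], axis=0)
    #        A
    #     a  1
    #     b  2
    #     c  3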

    @final
    def _set_axis_nocheck(
        self, labels, axis: Axis, inplace: bool_t, copy: bool_t | None
    ):
        if inplace:
            setattr(self, self._get_axis_name(axis), labels)
        else:
            # With copy=False, we create a new object but don't copy the
            # underlying data.
            obj = self.copy(deep=copy and not using_copy_on_write())
            setattr(obj, obj._get_axis_name(axis), labels)
            return obj

    @final
    def _set_axis(self, axis: AxisInt, labels: AnyArrayLike | list) -> None:
        """
        This is called from the cython code when we set the `index` attribute
        directly, e.g. `series.index = [1, 2, 3]`.
        """
        labels = ensure_index(labels)
        self._mgr.set_axis(axis, labels)
        self._clear_item_cache()

    @final
    def swapaxes(
        self: NDFrameT, axis1: Axis, axis2: Axis, copy: bool_t | None = None
    ) -> NDFrameT:
        """
        Interchange axes and swap values appropriately.

        Returns
        -------
        same as input
        """
        i = self._get_axis_number(axis1)
        j = self._get_axis_number(axis2)

        if i == j:
            return self.copy(deep=copy and not using_copy_on_write())

        mapping = {i: j, j: i}

        new_axes = [self._get_axis(mapping.get(k, k)) for k in range(self._AXIS_LEN)]
        new_values = self._values.swapaxes(i, j)  # type: ignore[union-attr]
        if (
            using_copy_on_write()
            and self._mgr.is_single_block
            and isinstance(self._mgr, BlockManager)
        ):
            # This should only get hit in case of having a single block, otherwise a
            # copy is made, we don't have to set up references.
            new_mgr = ndarray_to_mgr(
                new_values,
                new_axes[0],
                new_axes[1],
                dtype=None,
                copy=False,
                typ="block",
            )
            assert isinstance(new_mgr, BlockManager)
            assert isinstance(self._mgr, BlockManager)
            new_mgr.blocks[0].refs = self._mgr.blocks[0].refs
            new_mgr.blocks[0].refs.add_reference(
                new_mgr.blocks[0]  # type: ignore[arg-type]
            )
            return self._constructor(new_mgr).__finalize__(self, method="swapaxes")

        elif (copy or copy is None) and self._mgr.is_single_block:
            new_values = new_values.copy()

        return self._constructor(
            new_values,
            *new_axes,
            # The no-copy case for CoW is handled above
            copy=False,
        ).__finalize__(self, method="swapaxes")
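
    # An illustrative sketch of ``swapaxes`` on a DataFrame (hypothetical
    # data; for 2-D objects this amounts to a transpose):
    #
    #     >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
    #     >>> df.swapaxes("index", "columns")
    #        0  1
    #     A  1  2
    #     B  3  4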

    @final
    @doc(klass=_shared_doc_kwargs["klass"])
    def droplevel(self: NDFrameT, level: IndexLabel, axis: Axis = 0) -> NDFrameT:
        """
        Return {klass} with requested index / column level(s) removed.

        Parameters
        ----------
        level : int, str, or list-like
            If a string is given, must be the name of a level.
            If list-like, elements must be names or positional indexes
            of levels.

        axis : {{0 or 'index', 1 or 'columns'}}, default 0
            Axis along which the level(s) is removed:

            * 0 or 'index': remove level(s) from the index.
            * 1 or 'columns': remove level(s) from the columns.

            For `Series` this parameter is unused and defaults to 0.

        Returns
        -------
        {klass}
            {klass} with requested index / column level(s) removed.

        Examples
        --------
        >>> df = pd.DataFrame([
        ...     [1, 2, 3, 4],
        ...     [5, 6, 7, 8],
        ...     [9, 10, 11, 12]
        ... ]).set_index([0, 1]).rename_axis(['a', 'b'])

        >>> df.columns = pd.MultiIndex.from_tuples([
        ...     ('c', 'e'), ('d', 'f')
        ... ], names=['level_1', 'level_2'])

        >>> df
        level_1   c   d
        level_2   e   f
        a b
        1 2      3   4
        5 6      7   8
        9 10    11  12

        >>> df.droplevel('a')
        level_1   c   d
        level_2   e   f
        b
        2        3   4
        6        7   8
        10      11  12

        >>> df.droplevel('level_2', axis=1)
        level_1   c   d
        a b
        1 2      3   4
        5 6      7   8
        9 10    11  12
        """
        labels = self._get_axis(axis)
        new_labels = labels.droplevel(level)
        return self.set_axis(new_labels, axis=axis, copy=None)

    def pop(self, item: Hashable) -> Series | Any:
        result = self[item]
        del self[item]

        return result
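
    # ``pop`` removes the item from the object and returns it, mirroring
    # ``dict.pop``. An illustrative sketch (hypothetical frame):
    #
    #     >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
    #     >>> df.pop("A")
    #     0    1
    #     1    2
    #     Name: A, dtype: int64
    #     >>> df.columns.tolist()
    #     ['B']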

    @final
    def squeeze(self, axis: Axis | None = None):
        """
        Squeeze 1 dimensional axis objects into scalars.

        Series or DataFrames with a single element are squeezed to a scalar.
        DataFrames with a single column or a single row are squeezed to a
        Series. Otherwise the object is unchanged.

        This method is most useful when you don't know if your
        object is a Series or DataFrame, but you do know it has just a single
        column. In that case you can safely call `squeeze` to ensure you have a
        Series.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns', None}, default None
            A specific axis to squeeze. By default, all length-1 axes are
            squeezed. For `Series` this parameter is unused and defaults to `None`.

        Returns
        -------
        DataFrame, Series, or scalar
            The projection after squeezing `axis` or all the axes.

        See Also
        --------
        Series.iloc : Integer-location based indexing for selecting scalars.
        DataFrame.iloc : Integer-location based indexing for selecting Series.
        Series.to_frame : Inverse of DataFrame.squeeze for a
            single-column DataFrame.

        Examples
        --------
        >>> primes = pd.Series([2, 3, 5, 7])

        Slicing might produce a Series with a single value:

        >>> even_primes = primes[primes % 2 == 0]
        >>> even_primes
        0    2
        dtype: int64

        >>> even_primes.squeeze()
        2

        Squeezing objects with more than one value in every axis does nothing:

        >>> odd_primes = primes[primes % 2 == 1]
        >>> odd_primes
        1    3
        2    5
        3    7
        dtype: int64

        >>> odd_primes.squeeze()
        1    3
        2    5
        3    7
        dtype: int64

        Squeezing is even more effective when used with DataFrames.

        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
        >>> df
           a  b
        0  1  2
        1  3  4

        Slicing a single column will produce a DataFrame with the columns
        having only one value:

        >>> df_a = df[['a']]
        >>> df_a
           a
        0  1
        1  3

        So the columns can be squeezed down, resulting in a Series:

        >>> df_a.squeeze('columns')
        0    1
        1    3
        Name: a, dtype: int64

        Slicing a single row from a single column will produce a single
        scalar DataFrame:

        >>> df_0a = df.loc[df.index < 1, ['a']]
        >>> df_0a
           a
        0  1

        Squeezing the rows produces a single scalar Series:

        >>> df_0a.squeeze('rows')
        a    1
        Name: 0, dtype: int64

        Squeezing all axes will project directly into a scalar:

        >>> df_0a.squeeze()
        1
        """
        axes = range(self._AXIS_LEN) if axis is None else (self._get_axis_number(axis),)
        return self.iloc[
            tuple(
                0 if i in axes and len(a) == 1 else slice(None)
                for i, a in enumerate(self.axes)
            )
        ]

    # ----------------------------------------------------------------------
    # Rename

    def _rename(
        self: NDFrameT,
        mapper: Renamer | None = None,
        *,
        index: Renamer | None = None,
        columns: Renamer | None = None,
        axis: Axis | None = None,
        copy: bool_t | None = None,
        inplace: bool_t = False,
        level: Level | None = None,
        errors: str = "ignore",
    ) -> NDFrameT | None:
        # called by Series.rename and DataFrame.rename

        if mapper is None and index is None and columns is None:
            raise TypeError("must pass an index to rename")

        if index is not None or columns is not None:
            if axis is not None:
                raise TypeError(
                    "Cannot specify both 'axis' and any of 'index' or 'columns'"
                )
            if mapper is not None:
                raise TypeError(
                    "Cannot specify both 'mapper' and any of 'index' or 'columns'"
                )
        else:
            # use the mapper argument
            if axis and self._get_axis_number(axis) == 1:
                columns = mapper
            else:
                index = mapper

        self._check_inplace_and_allows_duplicate_labels(inplace)
        result = self if inplace else self.copy(deep=copy and not using_copy_on_write())

        for axis_no, replacements in enumerate((index, columns)):
            if replacements is None:
                continue

            ax = self._get_axis(axis_no)
            f = common.get_rename_function(replacements)

            if level is not None:
                level = ax._get_level_number(level)

            # GH 13473
            if not callable(replacements):
                if ax._is_multi and level is not None:
                    indexer = ax.get_level_values(level).get_indexer_for(replacements)
                else:
                    indexer = ax.get_indexer_for(replacements)

                if errors == "raise" and len(indexer[indexer == -1]):
                    missing_labels = [
                        label
                        for index, label in enumerate(replacements)
                        if indexer[index] == -1
                    ]
                    raise KeyError(f"{missing_labels} not found in axis")

            new_index = ax._transform_index(f, level=level)
            result._set_axis_nocheck(new_index, axis=axis_no, inplace=True, copy=False)
            result._clear_item_cache()

        if inplace:
            self._update_inplace(result)
            return None
        else:
            return result.__finalize__(self, method="rename")

    @overload
    def rename_axis(
        self: NDFrameT,
        mapper: IndexLabel | lib.NoDefault = ...,
        *,
        index=...,
        columns=...,
        axis: Axis = ...,
        copy: bool_t | None = ...,
        inplace: Literal[False] = ...,
    ) -> NDFrameT:
        ...

    @overload
    def rename_axis(
        self,
        mapper: IndexLabel | lib.NoDefault = ...,
        *,
        index=...,
        columns=...,
        axis: Axis = ...,
        copy: bool_t | None = ...,
        inplace: Literal[True],
    ) -> None:
        ...

    @overload
    def rename_axis(
        self: NDFrameT,
        mapper: IndexLabel | lib.NoDefault = ...,
        *,
        index=...,
        columns=...,
        axis: Axis = ...,
        copy: bool_t | None = ...,
        inplace: bool_t = ...,
    ) -> NDFrameT | None:
        ...

    def rename_axis(
        self: NDFrameT,
        mapper: IndexLabel | lib.NoDefault = lib.no_default,
        *,
        index=lib.no_default,
        columns=lib.no_default,
        axis: Axis = 0,
        copy: bool_t | None = None,
        inplace: bool_t = False,
    ) -> NDFrameT | None:
        """
        Set the name of the axis for the index or columns.

        Parameters
        ----------
        mapper : scalar, list-like, optional
            Value to set the axis name attribute.
        index, columns : scalar, list-like, dict-like or function, optional
            A scalar, list-like, dict-like or function transformation to
            apply to that axis' values.
            Note that the ``columns`` parameter is not allowed if the
            object is a Series. This parameter only applies to DataFrame
            objects.

            Use either ``mapper`` and ``axis`` to
            specify the axis to target with ``mapper``, or ``index``
            and/or ``columns``.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to rename. For `Series` this parameter is unused and defaults to 0.
        copy : bool, default None
            Also copy underlying data.
        inplace : bool, default False
            Modifies the object directly, instead of creating a new Series
            or DataFrame.

        Returns
        -------
        Series, DataFrame, or None
            The same type as the caller or None if ``inplace=True``.

        See Also
        --------
        Series.rename : Alter Series index labels or name.
        DataFrame.rename : Alter DataFrame index labels or name.
        Index.rename : Set new names on index.

        Notes
        -----
        ``DataFrame.rename_axis`` supports two calling conventions

        * ``(index=index_mapper, columns=columns_mapper, ...)``
        * ``(mapper, axis={'index', 'columns'}, ...)``

        The first calling convention will only modify the names of
        the index and/or the names of the Index object that is the columns.
        In this case, the parameter ``copy`` is ignored.

        The second calling convention will modify the names of the
        corresponding index if mapper is a list or a scalar.
        However, if mapper is dict-like or a function, it will use the
        deprecated behavior of modifying the axis *labels*.

        We *highly* recommend using keyword arguments to clarify your
        intent.

        Examples
        --------
        **Series**

        >>> s = pd.Series(["dog", "cat", "monkey"])
        >>> s
        0       dog
        1       cat
        2    monkey
        dtype: object
        >>> s.rename_axis("animal")
        animal
        0       dog
        1       cat
        2    monkey
        dtype: object

        **DataFrame**

        >>> df = pd.DataFrame({"num_legs": [4, 4, 2],
        ...                    "num_arms": [0, 0, 2]},
        ...                   ["dog", "cat", "monkey"])
        >>> df
                num_legs  num_arms
        dog            4         0
        cat            4         0
        monkey         2         2
        >>> df = df.rename_axis("animal")
        >>> df
                num_legs  num_arms
        animal
        dog            4         0
        cat            4         0
        monkey         2         2
        >>> df = df.rename_axis("limbs", axis="columns")
        >>> df
        limbs   num_legs  num_arms
        animal
        dog            4         0
        cat            4         0
        monkey         2         2

        **MultiIndex**

        >>> df.index = pd.MultiIndex.from_product([['mammal'],
        ...                                        ['dog', 'cat', 'monkey']],
        ...                                       names=['type', 'name'])
        >>> df
        limbs          num_legs  num_arms
        type   name
        mammal dog            4         0
               cat            4         0
               monkey         2         2

        >>> df.rename_axis(index={'type': 'class'})
        limbs          num_legs  num_arms
        class  name
        mammal dog            4         0
               cat            4         0
               monkey         2         2

        >>> df.rename_axis(columns=str.upper)
        LIMBS          num_legs  num_arms
        type   name
        mammal dog            4         0
               cat            4         0
               monkey         2         2
        """
        axes = {"index": index, "columns": columns}

        if axis is not None:
            axis = self._get_axis_number(axis)

        inplace = validate_bool_kwarg(inplace, "inplace")

        if copy and using_copy_on_write():
            copy = False

        if mapper is not lib.no_default:
            # Use v0.23 behavior if a scalar or list
            non_mapper = is_scalar(mapper) or (
                is_list_like(mapper) and not is_dict_like(mapper)
            )
            if non_mapper:
                return self._set_axis_name(
                    mapper, axis=axis, inplace=inplace, copy=copy
                )
            else:
                raise ValueError("Use `.rename` to alter labels with a mapper.")
        else:
            # Use new behavior. Means that index and/or columns
            # is specified
            result = self if inplace else self.copy(deep=copy)

            for axis in range(self._AXIS_LEN):
                v = axes.get(self._get_axis_name(axis))
                if v is lib.no_default:
                    continue
                non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v))
                if non_mapper:
                    newnames = v
                else:
                    f = common.get_rename_function(v)
                    curnames = self._get_axis(axis).names
                    newnames = [f(name) for name in curnames]
                result._set_axis_name(newnames, axis=axis, inplace=True, copy=copy)
            if not inplace:
                return result
            return None

    @final
    def _set_axis_name(
        self, name, axis: Axis = 0, inplace: bool_t = False, copy: bool_t | None = True
    ):
        """
        Set the name(s) of the axis.

        Parameters
        ----------
        name : str or list of str
            Name(s) to set.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to set the label. The value 0 or 'index' specifies index,
            and the value 1 or 'columns' specifies columns.
        inplace : bool, default False
            If `True`, do operation inplace and return None.
        copy : bool, default True
            Whether to make a copy of the result.

        Returns
        -------
        Series, DataFrame, or None
            The same type as the caller or `None` if `inplace` is `True`.

        See Also
        --------
        DataFrame.rename : Alter the axis labels of :class:`DataFrame`.
        Series.rename : Alter the index labels or set the index name
            of :class:`Series`.
        Index.rename : Set the name of :class:`Index` or :class:`MultiIndex`.

        Examples
        --------
        >>> df = pd.DataFrame({"num_legs": [4, 4, 2]},
        ...                   ["dog", "cat", "monkey"])
        >>> df
                num_legs
        dog            4
        cat            4
        monkey         2
        >>> df._set_axis_name("animal")
                num_legs
        animal
        dog            4
        cat            4
        monkey         2
        >>> df.index = pd.MultiIndex.from_product(
        ...                [["mammal"], ['dog', 'cat', 'monkey']])
        >>> df._set_axis_name(["type", "name"])
                       num_legs
        type   name
        mammal dog            4
               cat            4
               monkey         2
        """
        axis = self._get_axis_number(axis)
        idx = self._get_axis(axis).set_names(name)

        inplace = validate_bool_kwarg(inplace, "inplace")
        renamed = self if inplace else self.copy(deep=copy)
        if axis == 0:
            renamed.index = idx
        else:
            renamed.columns = idx

        if not inplace:
            return renamed

    # ----------------------------------------------------------------------
    # Comparison Methods

    @final
    def _indexed_same(self, other) -> bool_t:
        return all(
            self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS
        )

    @final
    def equals(self, other: object) -> bool_t:
        """
        Test whether two objects contain the same elements.

        This function allows two Series or DataFrames to be compared against
        each other to see if they have the same shape and elements. NaNs in
        the same location are considered equal.

        The row/column index does not need to have the same type, as long
        as the values are considered equal. Corresponding columns must be of
        the same dtype.

        Parameters
        ----------
        other : Series or DataFrame
            The other Series or DataFrame to be compared with the first.

        Returns
        -------
        bool
            True if all elements are the same in both objects, False
            otherwise.

        See Also
        --------
        Series.eq : Compare two Series objects of the same length
            and return a Series where each element is True if the element
            in each Series is equal, False otherwise.
        DataFrame.eq : Compare two DataFrame objects of the same shape and
            return a DataFrame where each element is True if the respective
            element in each DataFrame is equal, False otherwise.
        testing.assert_series_equal : Raises an AssertionError if left and
            right are not equal. Provides an easy interface to ignore
            inequality in dtypes, indexes and precision among others.
        testing.assert_frame_equal : Like assert_series_equal, but targets
            DataFrames.
        numpy.array_equal : Return True if two arrays have the same shape
            and elements, False otherwise.

        Examples
        --------
        >>> df = pd.DataFrame({1: [10], 2: [20]})
        >>> df
            1   2
        0  10  20

        DataFrames df and exactly_equal have the same types and values for
        their elements and column labels, which will return True.

        >>> exactly_equal = pd.DataFrame({1: [10], 2: [20]})
        >>> exactly_equal
            1   2
        0  10  20
        >>> df.equals(exactly_equal)
        True

        DataFrames df and different_column_type have the same element
        types and values, but have different types for the column labels,
        which will still return True.

        >>> different_column_type = pd.DataFrame({1.0: [10], 2.0: [20]})
        >>> different_column_type
           1.0  2.0
        0   10   20
        >>> df.equals(different_column_type)
        True

        DataFrames df and different_data_type have different types for the
        same values for their elements, and will return False even though
        their column labels are the same values and types.

        >>> different_data_type = pd.DataFrame({1: [10.0], 2: [20.0]})
        >>> different_data_type
              1     2
        0  10.0  20.0
        >>> df.equals(different_data_type)
        False
        """
        if not (isinstance(other, type(self)) or isinstance(self, type(other))):
            return False
        other = cast(NDFrame, other)
        return self._mgr.equals(other._mgr)

    # -------------------------------------------------------------------------
    # Unary Methods

    @final
    def __neg__(self: NDFrameT) -> NDFrameT:
        def blk_func(values: ArrayLike):
            if is_bool_dtype(values.dtype):
                # error: Argument 1 to "inv" has incompatible type "Union
                # [ExtensionArray, ndarray[Any, Any]]"; expected
                # "_SupportsInversion[ndarray[Any, dtype[bool_]]]"
                return operator.inv(values)  # type: ignore[arg-type]
            else:
                # error: Argument 1 to "neg" has incompatible type "Union
                # [ExtensionArray, ndarray[Any, Any]]"; expected
                # "_SupportsNeg[ndarray[Any, dtype[Any]]]"
                return operator.neg(values)  # type: ignore[arg-type]

        new_data = self._mgr.apply(blk_func)
        res = self._constructor(new_data)
        return res.__finalize__(self, method="__neg__")

    @final
    def __pos__(self: NDFrameT) -> NDFrameT:
        def blk_func(values: ArrayLike):
            if is_bool_dtype(values.dtype):
                return values.copy()
            else:
                # error: Argument 1 to "pos" has incompatible type "Union
                # [ExtensionArray, ndarray[Any, Any]]"; expected
                # "_SupportsPos[ndarray[Any, dtype[Any]]]"
                return operator.pos(values)  # type: ignore[arg-type]

        new_data = self._mgr.apply(blk_func)
        res = self._constructor(new_data)
        return res.__finalize__(self, method="__pos__")

    @final
    def __invert__(self: NDFrameT) -> NDFrameT:
        if not self.size:
            # inv fails with 0 len
            return self.copy(deep=False)

        new_data = self._mgr.apply(operator.invert)
        return self._constructor(new_data).__finalize__(self, method="__invert__")

    @final
    def __nonzero__(self) -> NoReturn:
        raise ValueError(
            f"The truth value of a {type(self).__name__} is ambiguous. "
            "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
        )

    __bool__ = __nonzero__
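
    # Because ``__bool__`` always raises, using a Series or DataFrame in a
    # plain ``if``/``and``/``or`` fails fast. An illustrative sketch:
    #
    #     >>> s = pd.Series([1, 2])
    #     >>> bool(s)
    #     Traceback (most recent call last):
    #         ...
    #     ValueError: The truth value of a Series is ambiguous. Use a.empty,
    #     a.bool(), a.item(), a.any() or a.all().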

    @final
    def bool(self) -> bool_t:
        """
        Return the bool of a single element Series or DataFrame.

        This must be a boolean scalar value, either True or False. It will raise a
        ValueError if the Series or DataFrame does not have exactly 1 element, or if
        that element is not boolean (integer values 0 and 1 will also raise an
        exception).

        Returns
        -------
        bool
            The value in the Series or DataFrame.

        See Also
        --------
        Series.astype : Change the data type of a Series, including to boolean.
        DataFrame.astype : Change the data type of a DataFrame, including to boolean.
        numpy.bool_ : NumPy boolean data type, used by pandas for boolean values.

        Examples
        --------
        The method will only work for single element objects with a boolean value:

        >>> pd.Series([True]).bool()
        True
        >>> pd.Series([False]).bool()
        False

        >>> pd.DataFrame({'col': [True]}).bool()
        True
        >>> pd.DataFrame({'col': [False]}).bool()
        False
        """
        v = self.squeeze()
        if isinstance(v, (bool, np.bool_)):
            return bool(v)
        elif is_scalar(v):
            raise ValueError(
                "bool cannot act on a non-boolean single element "
                f"{type(self).__name__}"
            )

        self.__nonzero__()
        # for mypy (__nonzero__ raises)
        return True

    @final
    def abs(self: NDFrameT) -> NDFrameT:
        """
        Return a Series/DataFrame with absolute numeric value of each element.

        This function only applies to elements that are all numeric.

        Returns
        -------
        abs
            Series/DataFrame containing the absolute value of each element.

        See Also
        --------
        numpy.absolute : Calculate the absolute value element-wise.

        Notes
        -----
        For ``complex`` inputs, ``1.2 + 1j``, the absolute value is
        :math:`\\sqrt{ a^2 + b^2 }`.

        Examples
        --------
        Absolute numeric values in a Series.

        >>> s = pd.Series([-1.10, 2, -3.33, 4])
        >>> s.abs()
        0    1.10
        1    2.00
        2    3.33
        3    4.00
        dtype: float64

        Absolute numeric values in a Series with complex numbers.

        >>> s = pd.Series([1.2 + 1j])
        >>> s.abs()
        0    1.56205
        dtype: float64

        Absolute numeric values in a Series with a Timedelta element.

        >>> s = pd.Series([pd.Timedelta('1 days')])
        >>> s.abs()
        0   1 days
        dtype: timedelta64[ns]

        Select rows with data closest to certain value using argsort (from
        `StackOverflow <https://stackoverflow.com/a/17758115>`__).

        >>> df = pd.DataFrame({
        ...     'a': [4, 5, 6, 7],
        ...     'b': [10, 20, 30, 40],
        ...     'c': [100, 50, -30, -50]
        ... })
        >>> df
           a   b    c
        0  4  10  100
        1  5  20   50
        2  6  30  -30
        3  7  40  -50
        >>> df.loc[(df.c - 43).abs().argsort()]
           a   b    c
        1  5  20   50
        0  4  10  100
        2  6  30  -30
        3  7  40  -50
        """
        res_mgr = self._mgr.apply(np.abs)
        return self._constructor(res_mgr).__finalize__(self, name="abs")

    @final
    def __abs__(self: NDFrameT) -> NDFrameT:
        return self.abs()

    @final
    def __round__(self: NDFrameT, decimals: int = 0) -> NDFrameT:
        return self.round(decimals).__finalize__(self, method="__round__")

    # -------------------------------------------------------------------------
    # Label or Level Combination Helpers
    #
    # A collection of helper methods for DataFrame/Series operations that
    # accept a combination of column/index labels and levels. All such
    # operations should utilize/extend these methods when possible so that we
    # have consistent precedence and validation logic throughout the library.

    @final
    def _is_level_reference(self, key: Level, axis: Axis = 0) -> bool_t:
        """
        Test whether a key is a level reference for a given axis.

        To be considered a level reference, `key` must be a string that:
          - (axis=0): Matches the name of an index level and does NOT match
            a column label.
          - (axis=1): Matches the name of a column level and does NOT match
            an index label.

        Parameters
        ----------
        key : Hashable
            Potential level name for the given axis
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        is_level : bool
        """
        axis_int = self._get_axis_number(axis)

        return (
            key is not None
            and is_hashable(key)
            and key in self.axes[axis_int].names
            and not self._is_label_reference(key, axis=axis_int)
        )

    @final
    def _is_label_reference(self, key: Level, axis: Axis = 0) -> bool_t:
        """
        Test whether a key is a label reference for a given axis.

        To be considered a label reference, `key` must be a string that:
          - (axis=0): Matches a column label
          - (axis=1): Matches an index label

        Parameters
        ----------
        key : Hashable
            Potential label name, i.e. Index entry.
        axis : int, default 0
            Axis perpendicular to the axis that labels are associated with
            (0 means search for column labels, 1 means search for index labels)

        Returns
        -------
        is_label: bool
        """
        axis_int = self._get_axis_number(axis)
        other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis_int)

        return (
            key is not None
            and is_hashable(key)
            and any(key in self.axes[ax] for ax in other_axes)
        )

    @final
    def _is_label_or_level_reference(self, key: Level, axis: AxisInt = 0) -> bool_t:
        """
        Test whether a key is a label or level reference for a given axis.

        To be considered either a label or a level reference, `key` must be a
        string that:
          - (axis=0): Matches a column label or an index level
          - (axis=1): Matches an index label or a column level

        Parameters
        ----------
        key : Hashable
            Potential label or level name
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        bool
        """
        return self._is_level_reference(key, axis=axis) or self._is_label_reference(
            key, axis=axis
        )

    @final
    def _check_label_or_level_ambiguity(self, key: Level, axis: Axis = 0) -> None:
        """
        Check whether `key` is ambiguous.

        By ambiguous, we mean that it matches both a level of the input
        `axis` and a label of the other axis.

        Parameters
        ----------
        key : Hashable
            Label or level name.
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns).

        Raises
        ------
        ValueError: `key` is ambiguous
        """

        axis_int = self._get_axis_number(axis)
        other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis_int)

        if (
            key is not None
            and is_hashable(key)
            and key in self.axes[axis_int].names
            and any(key in self.axes[ax] for ax in other_axes)
        ):
            # Build an informative and grammatical warning
            level_article, level_type = (
                ("an", "index") if axis_int == 0 else ("a", "column")
            )

            label_article, label_type = (
                ("a", "column") if axis_int == 0 else ("an", "index")
            )

            msg = (
                f"'{key}' is both {level_article} {level_type} level and "
                f"{label_article} {label_type} label, which is ambiguous."
            )
            raise ValueError(msg)

    @final
    def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike:
        """
        Return a 1-D array of values associated with `key`, a label or level
        from the given `axis`.

        Retrieval logic:
          - (axis=0): Return column values if `key` matches a column label.
            Otherwise return index level values if `key` matches an index
            level.
          - (axis=1): Return row values if `key` matches an index label.
            Otherwise return column level values if `key` matches a column
            level.

        Parameters
        ----------
        key : Hashable
            Label or level name.
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        np.ndarray or ExtensionArray

        Raises
        ------
        KeyError
            if `key` matches neither a label nor a level
        ValueError
            if `key` matches multiple labels
        """
        axis = self._get_axis_number(axis)
        other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis]

        if self._is_label_reference(key, axis=axis):
            self._check_label_or_level_ambiguity(key, axis=axis)
            values = self.xs(key, axis=other_axes[0])._values
        elif self._is_level_reference(key, axis=axis):
            values = self.axes[axis].get_level_values(key)._values
        else:
            raise KeyError(key)

        # Check for duplicates
        if values.ndim > 1:
            if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex):
                multi_message = (
                    "\n"
                    "For a multi-index, the label must be a "
                    "tuple with elements corresponding to each level."
                )
            else:
                multi_message = ""

            label_axis_name = "column" if axis == 0 else "index"
            raise ValueError(
                f"The {label_axis_name} label '{key}' is not unique.{multi_message}"
            )

        return values
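
    # An illustrative sketch of label vs. level resolution (hypothetical
    # frame with index level "L" and column "C"):
    #
    #     >>> df = pd.DataFrame({"C": [10, 20]},
    #     ...                   index=pd.Index(["x", "y"], name="L"))
    #     >>> df._get_label_or_level_values("C")   # column label wins
    #     array([10, 20])
    #     >>> df._get_label_or_level_values("L")   # falls back to index level
    #     array(['x', 'y'], dtype=object)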

    @final
    def _drop_labels_or_levels(self, keys, axis: AxisInt = 0):
        """
        Drop labels and/or levels for the given `axis`.

        For each key in `keys`:
          - (axis=0): If key matches a column label then drop the column.
            Otherwise if key matches an index level then drop the level.
          - (axis=1): If key matches an index label then drop the row.
            Otherwise if key matches a column level then drop the level.

        Parameters
        ----------
        keys : str or list of str
            labels or levels to drop
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        dropped: DataFrame

        Raises
        ------
        ValueError
            if any `keys` match neither a label nor a level
        """
        axis = self._get_axis_number(axis)

        # Validate keys
        keys = common.maybe_make_list(keys)
        invalid_keys = [
            k for k in keys if not self._is_label_or_level_reference(k, axis=axis)
        ]

        if invalid_keys:
            raise ValueError(
                "The following keys are not valid labels or "
                f"levels for axis {axis}: {invalid_keys}"
            )

        # Compute levels and labels to drop
        levels_to_drop = [k for k in keys if self._is_level_reference(k, axis=axis)]

        labels_to_drop = [k for k in keys if not self._is_level_reference(k, axis=axis)]

        # Perform copy upfront and then use inplace operations below.
        # This ensures that we always perform exactly one copy.
        # ``copy`` and/or ``inplace`` options could be added in the future.
        dropped = self.copy(deep=False)

        if axis == 0:
            # Handle dropping index levels
            if levels_to_drop:
                dropped.reset_index(levels_to_drop, drop=True, inplace=True)

            # Handle dropping column labels
            if labels_to_drop:
                dropped.drop(labels_to_drop, axis=1, inplace=True)
        else:
            # Handle dropping column levels
            if levels_to_drop:
                if isinstance(dropped.columns, MultiIndex):
                    # Drop the specified levels from the MultiIndex
                    dropped.columns = dropped.columns.droplevel(levels_to_drop)
                else:
                    # Drop the last level of Index by replacing with
                    # a RangeIndex
                    dropped.columns = RangeIndex(dropped.columns.size)

            # Handle dropping index labels
            if labels_to_drop:
                dropped.drop(labels_to_drop, axis=0, inplace=True)

        return dropped

    # ----------------------------------------------------------------------
    # Iteration

    # https://github.com/python/typeshed/issues/2148#issuecomment-520783318
    # Incompatible types in assignment (expression has type "None", base class
    # "object" defined the type as "Callable[[object], int]")
    __hash__: ClassVar[None]  # type: ignore[assignment]

    def __iter__(self) -> Iterator:
        """
        Iterate over info axis.

        Returns
        -------
        iterator
            Info axis as iterator.
        """
        return iter(self._info_axis)

    # can we get a better explanation of this?
    def keys(self) -> Index:
        """
        Get the 'info axis' (see Indexing for more).

        This is index for Series, columns for DataFrame.

        Returns
        -------
        Index
            Info axis.
        """
        return self._info_axis

    def items(self):
        """
        Iterate over (label, values) pairs on the info axis.

        This is index for Series and columns for DataFrame.

        Returns
        -------
        Generator
        """
        for h in self._info_axis:
            yield h, self[h]
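
    # An illustrative sketch of ``items`` on a DataFrame (hypothetical data;
    # each pair is a column label and that column as a Series):
    #
    #     >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
    #     >>> [label for label, column in df.items()]
    #     ['A', 'B']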

    def __len__(self) -> int:
        """Returns length of info axis"""
        return len(self._info_axis)

    @final
    def __contains__(self, key) -> bool_t:
        """True if the key is in the info axis"""
        return key in self._info_axis

    @property
    def empty(self) -> bool_t:
        """
        Indicator whether Series/DataFrame is empty.

        True if Series/DataFrame is entirely empty (no items), meaning any of the
        axes are of length 0.

        Returns
        -------
        bool
            If Series/DataFrame is empty, return True, if not return False.

        See Also
        --------
        Series.dropna : Return series without null values.
        DataFrame.dropna : Return DataFrame with labels on given axis omitted
            where (all or any) data are missing.

        Notes
        -----
        If Series/DataFrame contains only NaNs, it is still not considered empty. See
        the example below.

        Examples
        --------
        An example of an actual empty DataFrame. Notice the index is empty:

        >>> df_empty = pd.DataFrame({'A' : []})
        >>> df_empty
        Empty DataFrame
        Columns: [A]
        Index: []
        >>> df_empty.empty
        True

        If we only have NaNs in our DataFrame, it is not considered empty! We
        will need to drop the NaNs to make the DataFrame empty:

        >>> df = pd.DataFrame({'A' : [np.nan]})
        >>> df
            A
        0 NaN
        >>> df.empty
        False
        >>> df.dropna().empty
        True

        >>> ser_empty = pd.Series({'A' : []})
        >>> ser_empty
        A    []
        dtype: object
        >>> ser_empty.empty
        False
        >>> ser_empty = pd.Series()
        >>> ser_empty.empty
        True
        """
        return any(len(self._get_axis(a)) == 0 for a in self._AXIS_ORDERS)

    # ----------------------------------------------------------------------
    # Array Interface

    # This is also set in IndexOpsMixin
    # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented
    __array_priority__: int = 1000

    def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
        values = self._values
        arr = np.asarray(values, dtype=dtype)
        if (
            astype_is_view(values.dtype, arr.dtype)
            and using_copy_on_write()
            and self._mgr.is_single_block
        ):
            # Check if both conversions can be done without a copy
            if astype_is_view(self.dtypes.iloc[0], values.dtype) and astype_is_view(
                values.dtype, arr.dtype
            ):
                arr = arr.view()
                arr.flags.writeable = False
        return arr
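
    # ``np.asarray`` routes through ``__array__``. An illustrative sketch
    # (hypothetical frame; under copy-on-write the returned view is made
    # read-only):
    #
    #     >>> df = pd.DataFrame({"A": [1, 2]})
    #     >>> np.asarray(df)
    #     array([[1],
    #            [2]])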
2011
2012 @final
2013 def __array_ufunc__(
2014 self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any
2015 ):
2016 return arraylike.array_ufunc(self, ufunc, method, *inputs, **kwargs)
2017
2018 # ----------------------------------------------------------------------
2019 # Picklability
2020
2021 @final
2022 def __getstate__(self) -> dict[str, Any]:
2023 meta = {k: getattr(self, k, None) for k in self._metadata}
2024 return {
2025 "_mgr": self._mgr,
2026 "_typ": self._typ,
2027 "_metadata": self._metadata,
2028 "attrs": self.attrs,
2029 "_flags": {k: self.flags[k] for k in self.flags._keys},
2030 **meta,
2031 }
2032
2033 @final
2034 def __setstate__(self, state) -> None:
2035 if isinstance(state, BlockManager):
2036 self._mgr = state
2037 elif isinstance(state, dict):
2038 if "_data" in state and "_mgr" not in state:
2039 # compat for older pickles
2040 state["_mgr"] = state.pop("_data")
2041 typ = state.get("_typ")
2042 if typ is not None:
2043 attrs = state.get("_attrs", {})
2044 object.__setattr__(self, "_attrs", attrs)
2045 flags = state.get("_flags", {"allows_duplicate_labels": True})
2046 object.__setattr__(self, "_flags", Flags(self, **flags))
2047
                # set attributes in the order of internal names
                # to avoid definitional recursion,
                # e.g. fill_value needing _mgr to be
                # defined first
2052 meta = set(self._internal_names + self._metadata)
2053 for k in list(meta):
2054 if k in state and k != "_flags":
2055 v = state[k]
2056 object.__setattr__(self, k, v)
2057
2058 for k, v in state.items():
2059 if k not in meta:
2060 object.__setattr__(self, k, v)
2061
2062 else:
2063 raise NotImplementedError("Pre-0.12 pickles are no longer supported")
2064 elif len(state) == 2:
2065 raise NotImplementedError("Pre-0.12 pickles are no longer supported")
2066
2067 self._item_cache: dict[Hashable, Series] = {}
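
    # Note: unpickling always starts with a fresh item cache; column Series
    # cached before pickling are intentionally not restored.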
2068
2069 # ----------------------------------------------------------------------
2070 # Rendering Methods
2071
2072 def __repr__(self) -> str:
2073 # string representation based upon iterating over self
2074 # (since, by definition, `PandasContainers` are iterable)
2075 prepr = f"[{','.join(map(pprint_thing, self))}]"
2076 return f"{type(self).__name__}({prepr})"
2077
2078 @final
2079 def _repr_latex_(self):
2080 """
2081 Returns a LaTeX representation for a particular object.
2082 Mainly for use with nbconvert (jupyter notebook conversion to pdf).
2083 """
2084 if config.get_option("styler.render.repr") == "latex":
2085 return self.to_latex()
2086 else:
2087 return None
2088
2089 @final
2090 def _repr_data_resource_(self):
2091 """
2092 Not a real Jupyter special repr method, but we use the same
2093 naming convention.
2094 """
2095 if config.get_option("display.html.table_schema"):
2096 data = self.head(config.get_option("display.max_rows"))
2097
2098 as_json = data.to_json(orient="table")
2099 as_json = cast(str, as_json)
2100 return loads(as_json, object_pairs_hook=collections.OrderedDict)
2101
2102 # ----------------------------------------------------------------------
2103 # I/O Methods
2104
2105 @final
2106 @doc(
2107 klass="object",
2108 storage_options=_shared_docs["storage_options"],
2109 storage_options_versionadded="1.2.0",
2110 )
2111 def to_excel(
2112 self,
2113 excel_writer,
2114 sheet_name: str = "Sheet1",
2115 na_rep: str = "",
2116 float_format: str | None = None,
2117 columns: Sequence[Hashable] | None = None,
2118 header: Sequence[Hashable] | bool_t = True,
2119 index: bool_t = True,
2120 index_label: IndexLabel = None,
2121 startrow: int = 0,
2122 startcol: int = 0,
2123 engine: str | None = None,
2124 merge_cells: bool_t = True,
2125 inf_rep: str = "inf",
2126 freeze_panes: tuple[int, int] | None = None,
2127 storage_options: StorageOptions = None,
2128 ) -> None:
2129 """
2130 Write {klass} to an Excel sheet.
2131
2132 To write a single {klass} to an Excel .xlsx file it is only necessary to
2133 specify a target file name. To write to multiple sheets it is necessary to
2134 create an `ExcelWriter` object with a target file name, and specify a sheet
2135 in the file to write to.
2136
        Multiple sheets may be written to by specifying a unique `sheet_name`
        for each. Once all data has been written to the file, the changes must
        be saved.
2139 Note that creating an `ExcelWriter` object with a file name that already
2140 exists will result in the contents of the existing file being erased.
2141
2142 Parameters
2143 ----------
2144 excel_writer : path-like, file-like, or ExcelWriter object
2145 File path or existing ExcelWriter.
2146 sheet_name : str, default 'Sheet1'
2147 Name of sheet which will contain DataFrame.
2148 na_rep : str, default ''
2149 Missing data representation.
2150 float_format : str, optional
2151 Format string for floating point numbers. For example
2152 ``float_format="%.2f"`` will format 0.1234 to 0.12.
2153 columns : sequence or list of str, optional
2154 Columns to write.
2155 header : bool or list of str, default True
            Write out the column names. If a list of strings is given, it is
            assumed to contain aliases for the column names.
2158 index : bool, default True
2159 Write row names (index).
2160 index_label : str or sequence, optional
2161 Column label for index column(s) if desired. If not specified, and
2162 `header` and `index` are True, then the index names are used. A
2163 sequence should be given if the DataFrame uses MultiIndex.
2164 startrow : int, default 0
2165 Upper left cell row to dump data frame.
2166 startcol : int, default 0
2167 Upper left cell column to dump data frame.
2168 engine : str, optional
2169 Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this
2170 via the options ``io.excel.xlsx.writer`` or
2171 ``io.excel.xlsm.writer``.
2172
2173 merge_cells : bool, default True
2174 Write MultiIndex and Hierarchical Rows as merged cells.
2175 inf_rep : str, default 'inf'
2176 Representation for infinity (there is no native representation for
2177 infinity in Excel).
2178 freeze_panes : tuple of int (length 2), optional
2179 Specifies the one-based bottommost row and rightmost column that
2180 is to be frozen.
2181 {storage_options}
2182
2183 .. versionadded:: {storage_options_versionadded}
2184
2185 See Also
2186 --------
2187 to_csv : Write DataFrame to a comma-separated values (csv) file.
2188 ExcelWriter : Class for writing DataFrame objects into excel sheets.
2189 read_excel : Read an Excel file into a pandas DataFrame.
2190 read_csv : Read a comma-separated values (csv) file into DataFrame.
2191 io.formats.style.Styler.to_excel : Add styles to Excel sheet.
2192
2193 Notes
2194 -----
2195 For compatibility with :meth:`~DataFrame.to_csv`,
2196 to_excel serializes lists and dicts to strings before writing.
2197
2198 Once a workbook has been saved it is not possible to write further
2199 data without rewriting the whole workbook.
2200
2201 Examples
2202 --------
        Create, write to and save a workbook:
2205
2206 >>> df1 = pd.DataFrame([['a', 'b'], ['c', 'd']],
2207 ... index=['row 1', 'row 2'],
2208 ... columns=['col 1', 'col 2'])
2209 >>> df1.to_excel("output.xlsx") # doctest: +SKIP
2210
2211 To specify the sheet name:
2212
2213 >>> df1.to_excel("output.xlsx",
2214 ... sheet_name='Sheet_name_1') # doctest: +SKIP
2215
2216 If you wish to write to more than one sheet in the workbook, it is
2217 necessary to specify an ExcelWriter object:
2218
2219 >>> df2 = df1.copy()
2220 >>> with pd.ExcelWriter('output.xlsx') as writer: # doctest: +SKIP
2221 ... df1.to_excel(writer, sheet_name='Sheet_name_1')
2222 ... df2.to_excel(writer, sheet_name='Sheet_name_2')
2223
2224 ExcelWriter can also be used to append to an existing Excel file:
2225
2226 >>> with pd.ExcelWriter('output.xlsx',
2227 ... mode='a') as writer: # doctest: +SKIP
        ...     df1.to_excel(writer, sheet_name='Sheet_name_3')
2229
2230 To set the library that is used to write the Excel file,
2231 you can pass the `engine` keyword (the default engine is
2232 automatically chosen depending on the file extension):
2233
2234 >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') # doctest: +SKIP
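
        To keep the first row and first column visible while scrolling, use
        ``freeze_panes``:

        >>> df1.to_excel('output.xlsx', freeze_panes=(1, 1))  # doctest: +SKIP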
2235 """
2236
2237 df = self if isinstance(self, ABCDataFrame) else self.to_frame()
2238
2239 from pandas.io.formats.excel import ExcelFormatter
2240
2241 formatter = ExcelFormatter(
2242 df,
2243 na_rep=na_rep,
2244 cols=columns,
2245 header=header,
2246 float_format=float_format,
2247 index=index,
2248 index_label=index_label,
2249 merge_cells=merge_cells,
2250 inf_rep=inf_rep,
2251 )
2252 formatter.write(
2253 excel_writer,
2254 sheet_name=sheet_name,
2255 startrow=startrow,
2256 startcol=startcol,
2257 freeze_panes=freeze_panes,
2258 engine=engine,
2259 storage_options=storage_options,
2260 )
2261
2262 @final
2263 @doc(
2264 storage_options=_shared_docs["storage_options"],
2265 compression_options=_shared_docs["compression_options"] % "path_or_buf",
2266 )
2267 def to_json(
2268 self,
2269 path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
2270 orient: str | None = None,
2271 date_format: str | None = None,
2272 double_precision: int = 10,
2273 force_ascii: bool_t = True,
2274 date_unit: str = "ms",
2275 default_handler: Callable[[Any], JSONSerializable] | None = None,
2276 lines: bool_t = False,
2277 compression: CompressionOptions = "infer",
2278 index: bool_t = True,
2279 indent: int | None = None,
2280 storage_options: StorageOptions = None,
2281 mode: Literal["a", "w"] = "w",
2282 ) -> str | None:
2283 """
2284 Convert the object to a JSON string.
2285
        Note that NaN's and None will be converted to null, and datetime
        objects will be converted to UNIX timestamps.
2288
2289 Parameters
2290 ----------
2291 path_or_buf : str, path object, file-like object, or None, default None
2292 String, path object (implementing os.PathLike[str]), or file-like
2293 object implementing a write() function. If None, the result is
2294 returned as a string.
2295 orient : str
2296 Indication of expected JSON string format.
2297
2298 * Series:
2299
2300 - default is 'index'
2301 - allowed values are: {{'split', 'records', 'index', 'table'}}.
2302
2303 * DataFrame:
2304
2305 - default is 'columns'
2306 - allowed values are: {{'split', 'records', 'index', 'columns',
2307 'values', 'table'}}.
2308
2309 * The format of the JSON string:
2310
2311 - 'split' : dict like {{'index' -> [index], 'columns' -> [columns],
2312 'data' -> [values]}}
2313 - 'records' : list like [{{column -> value}}, ... , {{column -> value}}]
2314 - 'index' : dict like {{index -> {{column -> value}}}}
2315 - 'columns' : dict like {{column -> {{index -> value}}}}
2316 - 'values' : just the values array
2317 - 'table' : dict like {{'schema': {{schema}}, 'data': {{data}}}}
2318
2319 Describing the data, where data component is like ``orient='records'``.
2320
2321 date_format : {{None, 'epoch', 'iso'}}
2322 Type of date conversion. 'epoch' = epoch milliseconds,
2323 'iso' = ISO8601. The default depends on the `orient`. For
2324 ``orient='table'``, the default is 'iso'. For all other orients,
2325 the default is 'epoch'.
2326 double_precision : int, default 10
2327 The number of decimal places to use when encoding
2328 floating point values.
2329 force_ascii : bool, default True
2330 Force encoded string to be ASCII.
2331 date_unit : str, default 'ms' (milliseconds)
2332 The time unit to encode to, governs timestamp and ISO8601
2333 precision. One of 's', 'ms', 'us', 'ns' for second, millisecond,
2334 microsecond, and nanosecond respectively.
2335 default_handler : callable, default None
2336 Handler to call if object cannot otherwise be converted to a
2337 suitable format for JSON. Should receive a single argument which is
2338 the object to convert and return a serialisable object.
        lines : bool, default False
            If 'orient' is 'records', write out line-delimited json format.
            Will raise ValueError with any other 'orient', since the other
            formats are not list-like.
2343 {compression_options}
2344
2345 .. versionchanged:: 1.4.0 Zstandard support.
2346
2347 index : bool, default True
2348 Whether to include the index values in the JSON string. Not
2349 including the index (``index=False``) is only supported when
2350 orient is 'split' or 'table'.
2351 indent : int, optional
2352 Length of whitespace used to indent each record.
2353
2354 {storage_options}
2355
2356 .. versionadded:: 1.2.0
2357
2358 mode : str, default 'w' (writing)
2359 Specify the IO mode for output when supplying a path_or_buf.
2360 Accepted args are 'w' (writing) and 'a' (append) only.
2361 mode='a' is only supported when lines is True and orient is 'records'.
2362
2363 Returns
2364 -------
2365 None or str
2366 If path_or_buf is None, returns the resulting json format as a
2367 string. Otherwise returns None.
2368
2369 See Also
2370 --------
2371 read_json : Convert a JSON string to pandas object.
2372
2373 Notes
2374 -----
2375 The behavior of ``indent=0`` varies from the stdlib, which does not
2376 indent the output but does insert newlines. Currently, ``indent=0``
2377 and the default ``indent=None`` are equivalent in pandas, though this
2378 may change in a future release.
2379
2380 ``orient='table'`` contains a 'pandas_version' field under 'schema'.
2381 This stores the version of `pandas` used in the latest revision of the
2382 schema.
2383
2384 Examples
2385 --------
2386 >>> from json import loads, dumps
2387 >>> df = pd.DataFrame(
2388 ... [["a", "b"], ["c", "d"]],
2389 ... index=["row 1", "row 2"],
2390 ... columns=["col 1", "col 2"],
2391 ... )
2392
2393 >>> result = df.to_json(orient="split")
2394 >>> parsed = loads(result)
2395 >>> dumps(parsed, indent=4) # doctest: +SKIP
2396 {{
2397 "columns": [
2398 "col 1",
2399 "col 2"
2400 ],
2401 "index": [
2402 "row 1",
2403 "row 2"
2404 ],
2405 "data": [
2406 [
2407 "a",
2408 "b"
2409 ],
2410 [
2411 "c",
2412 "d"
2413 ]
2414 ]
2415 }}
2416
        Encoding/decoding a DataFrame using ``'records'`` formatted JSON.
        Note that index labels are not preserved with this encoding.
2419
2420 >>> result = df.to_json(orient="records")
2421 >>> parsed = loads(result)
2422 >>> dumps(parsed, indent=4) # doctest: +SKIP
2423 [
2424 {{
2425 "col 1": "a",
2426 "col 2": "b"
2427 }},
2428 {{
2429 "col 1": "c",
2430 "col 2": "d"
2431 }}
2432 ]
2433
        Encoding/decoding a DataFrame using ``'index'`` formatted JSON:
2435
2436 >>> result = df.to_json(orient="index")
2437 >>> parsed = loads(result)
2438 >>> dumps(parsed, indent=4) # doctest: +SKIP
2439 {{
2440 "row 1": {{
2441 "col 1": "a",
2442 "col 2": "b"
2443 }},
2444 "row 2": {{
2445 "col 1": "c",
2446 "col 2": "d"
2447 }}
2448 }}
2449
        Encoding/decoding a DataFrame using ``'columns'`` formatted JSON:
2451
2452 >>> result = df.to_json(orient="columns")
2453 >>> parsed = loads(result)
2454 >>> dumps(parsed, indent=4) # doctest: +SKIP
2455 {{
2456 "col 1": {{
2457 "row 1": "a",
2458 "row 2": "c"
2459 }},
2460 "col 2": {{
2461 "row 1": "b",
2462 "row 2": "d"
2463 }}
2464 }}
2465
        Encoding/decoding a DataFrame using ``'values'`` formatted JSON:
2467
2468 >>> result = df.to_json(orient="values")
2469 >>> parsed = loads(result)
2470 >>> dumps(parsed, indent=4) # doctest: +SKIP
2471 [
2472 [
2473 "a",
2474 "b"
2475 ],
2476 [
2477 "c",
2478 "d"
2479 ]
2480 ]
2481
2482 Encoding with Table Schema:
2483
2484 >>> result = df.to_json(orient="table")
2485 >>> parsed = loads(result)
2486 >>> dumps(parsed, indent=4) # doctest: +SKIP
2487 {{
2488 "schema": {{
2489 "fields": [
2490 {{
2491 "name": "index",
2492 "type": "string"
2493 }},
2494 {{
2495 "name": "col 1",
2496 "type": "string"
2497 }},
2498 {{
2499 "name": "col 2",
2500 "type": "string"
2501 }}
2502 ],
2503 "primaryKey": [
2504 "index"
2505 ],
2506 "pandas_version": "1.4.0"
2507 }},
2508 "data": [
2509 {{
2510 "index": "row 1",
2511 "col 1": "a",
2512 "col 2": "b"
2513 }},
2514 {{
2515 "index": "row 2",
2516 "col 1": "c",
2517 "col 2": "d"
2518 }}
2519 ]
2520 }}
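
        Encoding line-delimited JSON, one record per line (only valid with
        ``orient='records'``):

        >>> result = df.to_json(orient="records", lines=True)
        >>> print(result)  # doctest: +SKIP
        {{"col 1":"a","col 2":"b"}}
        {{"col 1":"c","col 2":"d"}}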
2521 """
2522 from pandas.io import json
2523
2524 if date_format is None and orient == "table":
2525 date_format = "iso"
2526 elif date_format is None:
2527 date_format = "epoch"
2528
2529 config.is_nonnegative_int(indent)
2530 indent = indent or 0
2531
2532 return json.to_json(
2533 path_or_buf=path_or_buf,
2534 obj=self,
2535 orient=orient,
2536 date_format=date_format,
2537 double_precision=double_precision,
2538 force_ascii=force_ascii,
2539 date_unit=date_unit,
2540 default_handler=default_handler,
2541 lines=lines,
2542 compression=compression,
2543 index=index,
2544 indent=indent,
2545 storage_options=storage_options,
2546 mode=mode,
2547 )
2548
2549 @final
2550 def to_hdf(
2551 self,
2552 path_or_buf: FilePath | HDFStore,
2553 key: str,
2554 mode: str = "a",
2555 complevel: int | None = None,
2556 complib: str | None = None,
2557 append: bool_t = False,
2558 format: str | None = None,
2559 index: bool_t = True,
2560 min_itemsize: int | dict[str, int] | None = None,
2561 nan_rep=None,
2562 dropna: bool_t | None = None,
2563 data_columns: Literal[True] | list[str] | None = None,
2564 errors: str = "strict",
2565 encoding: str = "UTF-8",
2566 ) -> None:
2567 """
2568 Write the contained data to an HDF5 file using HDFStore.
2569
2570 Hierarchical Data Format (HDF) is self-describing, allowing an
2571 application to interpret the structure and contents of a file with
2572 no outside information. One HDF file can hold a mix of related objects
2573 which can be accessed as a group or as individual objects.
2574
        In order to add another DataFrame or Series to an existing HDF file
        please use append mode and a different key.
2577
2578 .. warning::
2579
2580 One can store a subclass of ``DataFrame`` or ``Series`` to HDF5,
2581 but the type of the subclass is lost upon storing.
2582
2583 For more information see the :ref:`user guide <io.hdf5>`.
2584
2585 Parameters
2586 ----------
2587 path_or_buf : str or pandas.HDFStore
2588 File path or HDFStore object.
2589 key : str
2590 Identifier for the group in the store.
2591 mode : {'a', 'w', 'r+'}, default 'a'
2592 Mode to open file:
2593
2594 - 'w': write, a new file is created (an existing file with
2595 the same name would be deleted).
2596 - 'a': append, an existing file is opened for reading and
2597 writing, and if the file does not exist it is created.
2598 - 'r+': similar to 'a', but the file must already exist.
2599 complevel : {0-9}, default None
2600 Specifies a compression level for data.
2601 A value of 0 or None disables compression.
2602 complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
2603 Specifies the compression library to be used.
2604 As of v0.20.2 these additional compressors for Blosc are supported
2605 (default if no compressor specified: 'blosc:blosclz'):
2606 {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
2607 'blosc:zlib', 'blosc:zstd'}.
2608 Specifying a compression library which is not available issues
2609 a ValueError.
        append : bool, default False
            For Table formats, append the input data to the existing table.
2612 format : {'fixed', 'table', None}, default 'fixed'
2613 Possible values:
2614
2615 - 'fixed': Fixed format. Fast writing/reading. Not-appendable,
2616 nor searchable.
2617 - 'table': Table format. Write as a PyTables Table structure
2618 which may perform worse but allow more flexible operations
2619 like searching / selecting subsets of the data.
2620 - If None, pd.get_option('io.hdf.default_format') is checked,
2621 followed by fallback to "fixed".
2622 index : bool, default True
2623 Write DataFrame index as a column.
2624 min_itemsize : dict or int, optional
2625 Map column names to minimum string sizes for columns.
2626 nan_rep : Any, optional
2627 How to represent null values as str.
2628 Not allowed with append=True.
        dropna : bool, default False
2630 Remove missing values.
2631 data_columns : list of columns or True, optional
2632 List of columns to create as indexed data columns for on-disk
2633 queries, or True to use all columns. By default only the axes
2634 of the object are indexed. See
            :ref:`Query via data columns<io.hdf5-query-data-columns>` for
            more information.
2637 Applicable only to format='table'.
2638 errors : str, default 'strict'
2639 Specifies how encoding and decoding errors are to be handled.
2640 See the errors argument for :func:`open` for a full list
2641 of options.
        encoding : str, default "UTF-8"
            Character encoding for string data.
2643
2644 See Also
2645 --------
2646 read_hdf : Read from HDF file.
2647 DataFrame.to_orc : Write a DataFrame to the binary orc format.
2648 DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
2649 DataFrame.to_sql : Write to a SQL table.
2650 DataFrame.to_feather : Write out feather-format for DataFrames.
2651 DataFrame.to_csv : Write out to a csv file.
2652
2653 Examples
2654 --------
2655 >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
2656 ... index=['a', 'b', 'c']) # doctest: +SKIP
2657 >>> df.to_hdf('data.h5', key='df', mode='w') # doctest: +SKIP
2658
2659 We can add another object to the same file:
2660
2661 >>> s = pd.Series([1, 2, 3, 4]) # doctest: +SKIP
2662 >>> s.to_hdf('data.h5', key='s') # doctest: +SKIP
2663
2664 Reading from HDF file:
2665
2666 >>> pd.read_hdf('data.h5', 'df') # doctest: +SKIP
2667 A B
2668 a 1 4
2669 b 2 5
2670 c 3 6
2671 >>> pd.read_hdf('data.h5', 's') # doctest: +SKIP
2672 0 1
2673 1 2
2674 2 3
2675 3 4
2676 dtype: int64
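
        Appending rows to an existing key requires the table format:

        >>> df.to_hdf('data.h5', key='df2', format='table')  # doctest: +SKIP
        >>> df.to_hdf('data.h5', key='df2', format='table',
        ...           append=True)  # doctest: +SKIP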
2677 """
2678 from pandas.io import pytables
2679
2680 # Argument 3 to "to_hdf" has incompatible type "NDFrame"; expected
2681 # "Union[DataFrame, Series]" [arg-type]
2682 pytables.to_hdf(
2683 path_or_buf,
2684 key,
2685 self, # type: ignore[arg-type]
2686 mode=mode,
2687 complevel=complevel,
2688 complib=complib,
2689 append=append,
2690 format=format,
2691 index=index,
2692 min_itemsize=min_itemsize,
2693 nan_rep=nan_rep,
2694 dropna=dropna,
2695 data_columns=data_columns,
2696 errors=errors,
2697 encoding=encoding,
2698 )
2699
2700 @final
2701 def to_sql(
2702 self,
2703 name: str,
2704 con,
2705 schema: str | None = None,
2706 if_exists: Literal["fail", "replace", "append"] = "fail",
2707 index: bool_t = True,
2708 index_label: IndexLabel = None,
2709 chunksize: int | None = None,
2710 dtype: DtypeArg | None = None,
2711 method: str | None = None,
2712 ) -> int | None:
2713 """
2714 Write records stored in a DataFrame to a SQL database.
2715
2716 Databases supported by SQLAlchemy [1]_ are supported. Tables can be
2717 newly created, appended to, or overwritten.
2718
2719 Parameters
2720 ----------
2721 name : str
2722 Name of SQL table.
2723 con : sqlalchemy.engine.(Engine or Connection) or sqlite3.Connection
2724 Using SQLAlchemy makes it possible to use any DB supported by that
2725 library. Legacy support is provided for sqlite3.Connection objects. The user
2726 is responsible for engine disposal and connection closure for the SQLAlchemy
2727 connectable. See `here \
2728 <https://docs.sqlalchemy.org/en/20/core/connections.html>`_.
2729 If passing a sqlalchemy.engine.Connection which is already in a transaction,
2730 the transaction will not be committed. If passing a sqlite3.Connection,
2731 it will not be possible to roll back the record insertion.
2732
2733 schema : str, optional
2734 Specify the schema (if database flavor supports this). If None, use
2735 default schema.
2736 if_exists : {'fail', 'replace', 'append'}, default 'fail'
2737 How to behave if the table already exists.
2738
2739 * fail: Raise a ValueError.
2740 * replace: Drop the table before inserting new values.
2741 * append: Insert new values to the existing table.
2742
2743 index : bool, default True
2744 Write DataFrame index as a column. Uses `index_label` as the column
2745 name in the table.
2746 index_label : str or sequence, default None
2747 Column label for index column(s). If None is given (default) and
2748 `index` is True, then the index names are used.
2749 A sequence should be given if the DataFrame uses MultiIndex.
2750 chunksize : int, optional
2751 Specify the number of rows in each batch to be written at a time.
2752 By default, all rows will be written at once.
2753 dtype : dict or scalar, optional
2754 Specifying the datatype for columns. If a dictionary is used, the
2755 keys should be the column names and the values should be the
2756 SQLAlchemy types or strings for the sqlite3 legacy mode. If a
2757 scalar is provided, it will be applied to all columns.
2758 method : {None, 'multi', callable}, optional
2759 Controls the SQL insertion clause used:
2760
2761 * None : Uses standard SQL ``INSERT`` clause (one per row).
2762 * 'multi': Pass multiple values in a single ``INSERT`` clause.
2763 * callable with signature ``(pd_table, conn, keys, data_iter)``.
2764
2765 Details and a sample callable implementation can be found in the
2766 section :ref:`insert method <io.sql.method>`.
2767
2768 Returns
2769 -------
2770 None or int
2771 Number of rows affected by to_sql. None is returned if the callable
2772 passed into ``method`` does not return an integer number of rows.
2773
            The number of returned rows affected is the sum of the ``rowcount``
            attribute of ``sqlite3.Cursor`` or SQLAlchemy connectable which may
            not reflect the exact number of written rows as stipulated in the
            `sqlite3 <https://docs.python.org/3/library/sqlite3.html#sqlite3.Cursor.rowcount>`__ or
            `SQLAlchemy <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.CursorResult.rowcount>`__ documentation.
2779
2780 .. versionadded:: 1.4.0
2781
2782 Raises
2783 ------
2784 ValueError
2785 When the table already exists and `if_exists` is 'fail' (the
2786 default).
2787
2788 See Also
2789 --------
2790 read_sql : Read a DataFrame from a table.
2791
2792 Notes
2793 -----
2794 Timezone aware datetime columns will be written as
2795 ``Timestamp with timezone`` type with SQLAlchemy if supported by the
2796 database. Otherwise, the datetimes will be stored as timezone unaware
2797 timestamps local to the original timezone.
2798
2799 References
2800 ----------
2801 .. [1] https://docs.sqlalchemy.org
2802 .. [2] https://www.python.org/dev/peps/pep-0249/
2803
2804 Examples
2805 --------
2806 Create an in-memory SQLite database.
2807
2808 >>> from sqlalchemy import create_engine
2809 >>> engine = create_engine('sqlite://', echo=False)
2810
2811 Create a table from scratch with 3 rows.
2812
2813 >>> df = pd.DataFrame({'name' : ['User 1', 'User 2', 'User 3']})
2814 >>> df
2815 name
2816 0 User 1
2817 1 User 2
2818 2 User 3
2819
2820 >>> df.to_sql('users', con=engine)
2821 3
2822 >>> from sqlalchemy import text
2823 >>> with engine.connect() as conn:
2824 ... conn.execute(text("SELECT * FROM users")).fetchall()
2825 [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')]
2826
2827 An `sqlalchemy.engine.Connection` can also be passed to `con`:
2828
2829 >>> with engine.begin() as connection:
2830 ... df1 = pd.DataFrame({'name' : ['User 4', 'User 5']})
2831 ... df1.to_sql('users', con=connection, if_exists='append')
2832 2
2833
2834 This is allowed to support operations that require that the same
2835 DBAPI connection is used for the entire operation.
2836
2837 >>> df2 = pd.DataFrame({'name' : ['User 6', 'User 7']})
2838 >>> df2.to_sql('users', con=engine, if_exists='append')
2839 2
2840 >>> with engine.connect() as conn:
2841 ... conn.execute(text("SELECT * FROM users")).fetchall()
2842 [(0, 'User 1'), (1, 'User 2'), (2, 'User 3'),
2843 (0, 'User 4'), (1, 'User 5'), (0, 'User 6'),
2844 (1, 'User 7')]
2845
2846 Overwrite the table with just ``df2``.
2847
2848 >>> df2.to_sql('users', con=engine, if_exists='replace',
2849 ... index_label='id')
2850 2
2851 >>> with engine.connect() as conn:
2852 ... conn.execute(text("SELECT * FROM users")).fetchall()
2853 [(0, 'User 6'), (1, 'User 7')]
2854
2855 Specify the dtype (especially useful for integers with missing values).
2856 Notice that while pandas is forced to store the data as floating point,
2857 the database supports nullable integers. When fetching the data with
2858 Python, we get back integer scalars.
2859
2860 >>> df = pd.DataFrame({"A": [1, None, 2]})
2861 >>> df
2862 A
2863 0 1.0
2864 1 NaN
2865 2 2.0
2866
2867 >>> from sqlalchemy.types import Integer
2868 >>> df.to_sql('integers', con=engine, index=False,
2869 ... dtype={"A": Integer()})
2870 3
2871
2872 >>> with engine.connect() as conn:
2873 ... conn.execute(text("SELECT * FROM integers")).fetchall()
2874 [(1,), (None,), (2,)]
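
        A callable can also be passed as ``method``; a minimal sketch of such
        an insert function (the name ``insert_rows`` and its body are
        illustrative, not part of the pandas API):

        >>> def insert_rows(pd_table, conn, keys, data_iter):  # doctest: +SKIP
        ...     # pd_table: pandas SQLTable; conn: SQLAlchemy connection;
        ...     # keys: column names; data_iter: iterable of row tuples.
        ...     data = [dict(zip(keys, row)) for row in data_iter]
        ...     return conn.execute(pd_table.table.insert(), data).rowcount
        >>> df2.to_sql('users', con=engine, if_exists='append',
        ...            method=insert_rows)  # doctest: +SKIP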
2875 """ # noqa:E501
2876 from pandas.io import sql
2877
2878 return sql.to_sql(
2879 self,
2880 name,
2881 con,
2882 schema=schema,
2883 if_exists=if_exists,
2884 index=index,
2885 index_label=index_label,
2886 chunksize=chunksize,
2887 dtype=dtype,
2888 method=method,
2889 )
2890
2891 @final
2892 @doc(
2893 storage_options=_shared_docs["storage_options"],
2894 compression_options=_shared_docs["compression_options"] % "path",
2895 )
2896 def to_pickle(
2897 self,
2898 path: FilePath | WriteBuffer[bytes],
2899 compression: CompressionOptions = "infer",
2900 protocol: int = pickle.HIGHEST_PROTOCOL,
2901 storage_options: StorageOptions = None,
2902 ) -> None:
2903 """
2904 Pickle (serialize) object to file.
2905
2906 Parameters
2907 ----------
2908 path : str, path object, or file-like object
2909 String, path object (implementing ``os.PathLike[str]``), or file-like
2910 object implementing a binary ``write()`` function. File path where
2911 the pickled object will be stored.
2912 {compression_options}
2913 protocol : int
2914 Int which indicates which protocol should be used by the pickler,
2915 default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible
2916 values are 0, 1, 2, 3, 4, 5. A negative value for the protocol
2917 parameter is equivalent to setting its value to HIGHEST_PROTOCOL.
2918
2919 .. [1] https://docs.python.org/3/library/pickle.html.
2920
2921 {storage_options}
2922
2923 .. versionadded:: 1.2.0
2924
2925 See Also
2926 --------
2927 read_pickle : Load pickled pandas object (or any object) from file.
2928 DataFrame.to_hdf : Write DataFrame to an HDF5 file.
2929 DataFrame.to_sql : Write DataFrame to a SQL database.
2930 DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
2931
2932 Examples
2933 --------
2934 >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) # doctest: +SKIP
2935 >>> original_df # doctest: +SKIP
2936 foo bar
2937 0 0 5
2938 1 1 6
2939 2 2 7
2940 3 3 8
2941 4 4 9
2942 >>> original_df.to_pickle("./dummy.pkl") # doctest: +SKIP
2943
2944 >>> unpickled_df = pd.read_pickle("./dummy.pkl") # doctest: +SKIP
2945 >>> unpickled_df # doctest: +SKIP
2946 foo bar
2947 0 0 5
2948 1 1 6
2949 2 2 7
2950 3 3 8
2951 4 4 9
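
        A compressed pickle can be written by passing ``compression``
        explicitly (with a recognized extension such as ``.gz`` it would be
        inferred):

        >>> original_df.to_pickle("./dummy.pkl.gz",
        ...                       compression="gzip")  # doctest: +SKIP
        >>> pd.read_pickle("./dummy.pkl.gz")  # doctest: +SKIP
           foo  bar
        0    0    5
        1    1    6
        2    2    7
        3    3    8
        4    4    9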
2952 """ # noqa: E501
2953 from pandas.io.pickle import to_pickle
2954
2955 to_pickle(
2956 self,
2957 path,
2958 compression=compression,
2959 protocol=protocol,
2960 storage_options=storage_options,
2961 )
2962
2963 @final
2964 def to_clipboard(
2965 self, excel: bool_t = True, sep: str | None = None, **kwargs
2966 ) -> None:
2967 r"""
2968 Copy object to the system clipboard.
2969
2970 Write a text representation of object to the system clipboard.
2971 This can be pasted into Excel, for example.
2972
2973 Parameters
2974 ----------
2975 excel : bool, default True
            Produce output in a csv format for easy pasting into Excel.
2977
2978 - True, use the provided separator for csv pasting.
2979 - False, write a string representation of the object to the clipboard.
2980
2981 sep : str, default ``'\t'``
2982 Field delimiter.
2983 **kwargs
2984 These parameters will be passed to DataFrame.to_csv.
2985
2986 See Also
2987 --------
2988 DataFrame.to_csv : Write a DataFrame to a comma-separated values
2989 (csv) file.
2990 read_clipboard : Read text from clipboard and pass to read_csv.
2991
2992 Notes
2993 -----
        Requirements for your platform:
2995
2996 - Linux : `xclip`, or `xsel` (with `PyQt4` modules)
2997 - Windows : none
2998 - macOS : none
2999
        This method uses the processes developed for the package `pyperclip`.
        A solution for copying any output string format is shown in the
        examples.
3002
3003 Examples
3004 --------
3005 Copy the contents of a DataFrame to the clipboard.
3006
3007 >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
3008
3009 >>> df.to_clipboard(sep=',') # doctest: +SKIP
3010 ... # Wrote the following to the system clipboard:
3011 ... # ,A,B,C
3012 ... # 0,1,2,3
3013 ... # 1,4,5,6
3014
        We can omit the index by passing the keyword `index` and setting
        it to ``False``.
3017
3018 >>> df.to_clipboard(sep=',', index=False) # doctest: +SKIP
3019 ... # Wrote the following to the system clipboard:
3020 ... # A,B,C
3021 ... # 1,2,3
3022 ... # 4,5,6
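
        We can also copy the plain string representation by setting ``excel``
        to ``False``:

        >>> df.to_clipboard(excel=False)  # doctest: +SKIP
        ... # Wrote the following to the system clipboard:
        ... #    A  B  C
        ... # 0  1  2  3
        ... # 1  4  5  6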
3023
3024 Using the original `pyperclip` package for any string output format.
3025
3026 .. code-block:: python
3027
3028 import pyperclip
3029 html = df.style.to_html()
3030 pyperclip.copy(html)
3031 """
3032 from pandas.io import clipboards
3033
3034 clipboards.to_clipboard(self, excel=excel, sep=sep, **kwargs)
3035
3036 @final
3037 def to_xarray(self):
3038 """
3039 Return an xarray object from the pandas object.
3040
3041 Returns
3042 -------
3043 xarray.DataArray or xarray.Dataset
3044 Data in the pandas structure converted to Dataset if the object is
3045 a DataFrame, or a DataArray if the object is a Series.
3046
3047 See Also
3048 --------
3049 DataFrame.to_hdf : Write DataFrame to an HDF5 file.
3050 DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
3051
3052 Notes
3053 -----
        See the `xarray docs <https://xarray.pydata.org/en/stable/>`__ for
        further details.
3055
3056 Examples
3057 --------
3058 >>> df = pd.DataFrame([('falcon', 'bird', 389.0, 2),
3059 ... ('parrot', 'bird', 24.0, 2),
3060 ... ('lion', 'mammal', 80.5, 4),
3061 ... ('monkey', 'mammal', np.nan, 4)],
3062 ... columns=['name', 'class', 'max_speed',
3063 ... 'num_legs'])
3064 >>> df
3065 name class max_speed num_legs
3066 0 falcon bird 389.0 2
3067 1 parrot bird 24.0 2
3068 2 lion mammal 80.5 4
3069 3 monkey mammal NaN 4
3070
3071 >>> df.to_xarray()
3072 <xarray.Dataset>
3073 Dimensions: (index: 4)
3074 Coordinates:
3075 * index (index) int64 0 1 2 3
3076 Data variables:
3077 name (index) object 'falcon' 'parrot' 'lion' 'monkey'
3078 class (index) object 'bird' 'bird' 'mammal' 'mammal'
3079 max_speed (index) float64 389.0 24.0 80.5 nan
3080 num_legs (index) int64 2 2 4 4
3081
3082 >>> df['max_speed'].to_xarray()
3083 <xarray.DataArray 'max_speed' (index: 4)>
3084 array([389. , 24. , 80.5, nan])
3085 Coordinates:
3086 * index (index) int64 0 1 2 3
3087
3088 >>> dates = pd.to_datetime(['2018-01-01', '2018-01-01',
3089 ... '2018-01-02', '2018-01-02'])
3090 >>> df_multiindex = pd.DataFrame({'date': dates,
3091 ... 'animal': ['falcon', 'parrot',
3092 ... 'falcon', 'parrot'],
3093 ... 'speed': [350, 18, 361, 15]})
3094 >>> df_multiindex = df_multiindex.set_index(['date', 'animal'])
3095
3096 >>> df_multiindex
3097 speed
3098 date animal
3099 2018-01-01 falcon 350
3100 parrot 18
3101 2018-01-02 falcon 361
3102 parrot 15
3103
3104 >>> df_multiindex.to_xarray()
3105 <xarray.Dataset>
3106 Dimensions: (date: 2, animal: 2)
3107 Coordinates:
3108 * date (date) datetime64[ns] 2018-01-01 2018-01-02
3109 * animal (animal) object 'falcon' 'parrot'
3110 Data variables:
3111 speed (date, animal) int64 350 18 361 15
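
        The conversion can be reversed with xarray's ``Dataset.to_dataframe``
        (shown here as a sketch of the round trip):

        >>> df_multiindex.to_xarray().to_dataframe()  # doctest: +SKIP
                           speed
        date       animal
        2018-01-01 falcon    350
                   parrot     18
        2018-01-02 falcon    361
                   parrot     15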
3112 """
3113 xarray = import_optional_dependency("xarray")
3114
3115 if self.ndim == 1:
3116 return xarray.DataArray.from_series(self)
3117 else:
3118 return xarray.Dataset.from_dataframe(self)
3119
3120 @overload
3121 def to_latex(
3122 self,
3123 buf: None = ...,
3124 columns: Sequence[Hashable] | None = ...,
3125 header: bool_t | Sequence[str] = ...,
3126 index: bool_t = ...,
3127 na_rep: str = ...,
3128 formatters: FormattersType | None = ...,
3129 float_format: FloatFormatType | None = ...,
3130 sparsify: bool_t | None = ...,
3131 index_names: bool_t = ...,
3132 bold_rows: bool_t = ...,
3133 column_format: str | None = ...,
3134 longtable: bool_t | None = ...,
3135 escape: bool_t | None = ...,
3136 encoding: str | None = ...,
3137 decimal: str = ...,
3138 multicolumn: bool_t | None = ...,
3139 multicolumn_format: str | None = ...,
3140 multirow: bool_t | None = ...,
3141 caption: str | tuple[str, str] | None = ...,
3142 label: str | None = ...,
3143 position: str | None = ...,
3144 ) -> str:
3145 ...
3146
3147 @overload
3148 def to_latex(
3149 self,
3150 buf: FilePath | WriteBuffer[str],
3151 columns: Sequence[Hashable] | None = ...,
3152 header: bool_t | Sequence[str] = ...,
3153 index: bool_t = ...,
3154 na_rep: str = ...,
3155 formatters: FormattersType | None = ...,
3156 float_format: FloatFormatType | None = ...,
3157 sparsify: bool_t | None = ...,
3158 index_names: bool_t = ...,
3159 bold_rows: bool_t = ...,
3160 column_format: str | None = ...,
3161 longtable: bool_t | None = ...,
3162 escape: bool_t | None = ...,
3163 encoding: str | None = ...,
3164 decimal: str = ...,
3165 multicolumn: bool_t | None = ...,
3166 multicolumn_format: str | None = ...,
3167 multirow: bool_t | None = ...,
3168 caption: str | tuple[str, str] | None = ...,
3169 label: str | None = ...,
3170 position: str | None = ...,
3171 ) -> None:
3172 ...
3173
3174 @final
3175 def to_latex(
3176 self,
3177 buf: FilePath | WriteBuffer[str] | None = None,
3178 columns: Sequence[Hashable] | None = None,
3179 header: bool_t | Sequence[str] = True,
3180 index: bool_t = True,
3181 na_rep: str = "NaN",
3182 formatters: FormattersType | None = None,
3183 float_format: FloatFormatType | None = None,
3184 sparsify: bool_t | None = None,
3185 index_names: bool_t = True,
3186 bold_rows: bool_t = False,
3187 column_format: str | None = None,
3188 longtable: bool_t | None = None,
3189 escape: bool_t | None = None,
3190 encoding: str | None = None,
3191 decimal: str = ".",
3192 multicolumn: bool_t | None = None,
3193 multicolumn_format: str | None = None,
3194 multirow: bool_t | None = None,
3195 caption: str | tuple[str, str] | None = None,
3196 label: str | None = None,
3197 position: str | None = None,
3198 ) -> str | None:
3199 r"""
3200 Render object to a LaTeX tabular, longtable, or nested table.
3201
3202 Requires ``\usepackage{{booktabs}}``. The output can be copy/pasted
3203 into a main LaTeX document or read from an external file
3204 with ``\input{{table.tex}}``.
3205
3206 .. versionchanged:: 1.2.0
3207 Added position argument, changed meaning of caption argument.
3208
3209 .. versionchanged:: 2.0.0
3210 Refactored to use the Styler implementation via jinja2 templating.
3211
3212 Parameters
3213 ----------
3214 buf : str, Path or StringIO-like, optional, default None
3215 Buffer to write to. If None, the output is returned as a string.
3216 columns : list of label, optional
3217 The subset of columns to write. Writes all columns by default.
3218 header : bool or list of str, default True
3219 Write out the column names. If a list of strings is given,
3220 it is assumed to be aliases for the column names.
3221 index : bool, default True
3222 Write row names (index).
3223 na_rep : str, default 'NaN'
3224 Missing data representation.
3225 formatters : list of functions or dict of {{str: function}}, optional
3226 Formatter functions to apply to columns' elements by position or
3227 name. The result of each function must be a unicode string.
3228 List must be of length equal to the number of columns.
3229 float_format : one-parameter function or str, optional, default None
3230 Formatter for floating point numbers. For example
3231 ``float_format="%.2f"`` and ``float_format="{{:0.2f}}".format`` will
3232 both result in 0.1234 being formatted as 0.12.
3233 sparsify : bool, optional
3234 Set to False for a DataFrame with a hierarchical index to print
3235 every multiindex key at each row. By default, the value will be
3236 read from the config module.
3237 index_names : bool, default True
3238 Prints the names of the indexes.
3239 bold_rows : bool, default False
3240 Make the row labels bold in the output.
3241 column_format : str, optional
3242 The columns format as specified in `LaTeX table format
3243 <https://en.wikibooks.org/wiki/LaTeX/Tables>`__ e.g. 'rcl' for 3
3244 columns. By default, 'l' will be used for all columns except
3245 columns of numbers, which default to 'r'.
3246 longtable : bool, optional
3247 Use a longtable environment instead of tabular. Requires
3248 adding a \usepackage{{longtable}} to your LaTeX preamble.
3249 By default, the value will be read from the pandas config
3250 module, and set to `True` if the option ``styler.latex.environment`` is
3251 `"longtable"`.
3252
3253 .. versionchanged:: 2.0.0
3254 The pandas option affecting this argument has changed.
        escape : bool, optional
            By default, the value will be read from the pandas config
            module and set to `True` if the option ``styler.format.escape`` is
            `"latex"`. When set to False, escaping of LaTeX special characters
            in column names is disabled.
3260
3261 .. versionchanged:: 2.0.0
3262 The pandas option affecting this argument has changed, as has the
3263 default value to `False`.
3264 encoding : str, optional
3265 A string representing the encoding to use in the output file,
3266 defaults to 'utf-8'.
3267 decimal : str, default '.'
3268 Character recognized as decimal separator, e.g. ',' in Europe.
3269 multicolumn : bool, default True
3270 Use \multicolumn to enhance MultiIndex columns.
3271 The default will be read from the config module, and is set
3272 as the option ``styler.sparse.columns``.
3273
3274 .. versionchanged:: 2.0.0
3275 The pandas option affecting this argument has changed.
        multicolumn_format : str, default 'r'
            The alignment for multicolumns, similar to `column_format`.
3278 The default will be read from the config module, and is set as the option
3279 ``styler.latex.multicol_align``.
3280
3281 .. versionchanged:: 2.0.0
3282 The pandas option affecting this argument has changed, as has the
3283 default value to "r".
3284 multirow : bool, default True
3285 Use \multirow to enhance MultiIndex rows. Requires adding a
3286 \usepackage{{multirow}} to your LaTeX preamble. Will print
3287 centered labels (instead of top-aligned) across the contained
3288 rows, separating groups via clines. The default will be read
3289 from the pandas config module, and is set as the option
3290 ``styler.sparse.index``.
3291
3292 .. versionchanged:: 2.0.0
3293 The pandas option affecting this argument has changed, as has the
3294 default value to `True`.
3295 caption : str or tuple, optional
3296 Tuple (full_caption, short_caption),
3297 which results in ``\caption[short_caption]{{full_caption}}``;
3298 if a single string is passed, no short caption will be set.
3299
3300 .. versionchanged:: 1.2.0
3301 Optionally allow caption to be a tuple ``(full_caption, short_caption)``.
3302
3303 label : str, optional
3304 The LaTeX label to be placed inside ``\label{{}}`` in the output.
3305 This is used with ``\ref{{}}`` in the main ``.tex`` file.
3306
3307 position : str, optional
3308 The LaTeX positional argument for tables, to be placed after
3309 ``\begin{{}}`` in the output.
3310
3311 .. versionadded:: 1.2.0
3312
3313 Returns
3314 -------
3315 str or None
3316 If buf is None, returns the result as a string. Otherwise returns None.
3317
3318 See Also
3319 --------
3320 io.formats.style.Styler.to_latex : Render a DataFrame to LaTeX
3321 with conditional formatting.
3322 DataFrame.to_string : Render a DataFrame to a console-friendly
3323 tabular output.
3324 DataFrame.to_html : Render a DataFrame as an HTML table.
3325
3326 Notes
3327 -----
3328 As of v2.0.0 this method has changed to use the Styler implementation as
3329 part of :meth:`.Styler.to_latex` via ``jinja2`` templating. This means
3330 that ``jinja2`` is a requirement, and needs to be installed, for this method
3331 to function. It is advised that users switch to using Styler, since that
3332 implementation is more frequently updated and contains much more
3333 flexibility with the output.
3334
3335 Examples
3336 --------
3337 Convert a general DataFrame to LaTeX with formatting:
3338
3339 >>> df = pd.DataFrame(dict(name=['Raphael', 'Donatello'],
3340 ... age=[26, 45],
3341 ... height=[181.23, 177.65]))
3342 >>> print(df.to_latex(index=False,
3343 ... formatters={"name": str.upper},
3344 ... float_format="{:.1f}".format,
3345 ... )) # doctest: +SKIP
3346 \begin{tabular}{lrr}
3347 \toprule
3348 name & age & height \\
3349 \midrule
3350 RAPHAEL & 26 & 181.2 \\
3351 DONATELLO & 45 & 177.7 \\
3352 \bottomrule
3353 \end{tabular}
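
        A caption and a referenceable label can be attached to the table (the
        strings here are placeholders):

        >>> print(df.to_latex(caption='Turtle sizes',
        ...                   label='tab:turtles'))  # doctest: +SKIP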
3354 """
3355 # Get defaults from the pandas config
3356 if self.ndim == 1:
3357 self = self.to_frame()
3358 if longtable is None:
3359 longtable = config.get_option("styler.latex.environment") == "longtable"
3360 if escape is None:
3361 escape = config.get_option("styler.format.escape") == "latex"
3362 if multicolumn is None:
3363 multicolumn = config.get_option("styler.sparse.columns")
3364 if multicolumn_format is None:
3365 multicolumn_format = config.get_option("styler.latex.multicol_align")
3366 if multirow is None:
3367 multirow = config.get_option("styler.sparse.index")
3368
3369 if column_format is not None and not isinstance(column_format, str):
3370 raise ValueError("`column_format` must be str or unicode")
3371 length = len(self.columns) if columns is None else len(columns)
3372 if isinstance(header, (list, tuple)) and len(header) != length:
3373 raise ValueError(f"Writing {length} cols but got {len(header)} aliases")
3374
3375 # Refactor formatters/float_format/decimal/na_rep/escape to Styler structure
3376 base_format_ = {
3377 "na_rep": na_rep,
3378 "escape": "latex" if escape else None,
3379 "decimal": decimal,
3380 }
3381 index_format_: dict[str, Any] = {"axis": 0, **base_format_}
3382 column_format_: dict[str, Any] = {"axis": 1, **base_format_}
3383
3384 if isinstance(float_format, str):
3385 float_format_: Callable | None = lambda x: float_format % x
3386 else:
3387 float_format_ = float_format
3388
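        # Route float/complex values through float_format_ and everything
        # else through the supplied per-column formatter.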
3389 def _wrap(x, alt_format_):
3390 if isinstance(x, (float, complex)) and float_format_ is not None:
3391 return float_format_(x)
3392 else:
3393 return alt_format_(x)
3394
3395 formatters_: list | tuple | dict | Callable | None = None
3396 if isinstance(formatters, list):
3397 formatters_ = {
3398 c: partial(_wrap, alt_format_=formatters[i])
3399 for i, c in enumerate(self.columns)
3400 }
3401 elif isinstance(formatters, dict):
3402 index_formatter = formatters.pop("__index__", None)
3403 column_formatter = formatters.pop("__columns__", None)
3404 if index_formatter is not None:
3405 index_format_.update({"formatter": index_formatter})
3406 if column_formatter is not None:
3407 column_format_.update({"formatter": column_formatter})
3408
3409 formatters_ = formatters
3410 float_columns = self.select_dtypes(include="float").columns
3411 for col in float_columns:
                if col not in formatters:
3413 formatters_.update({col: float_format_})
3414 elif formatters is None and float_format is not None:
3415 formatters_ = partial(_wrap, alt_format_=lambda v: v)
3416 format_index_ = [index_format_, column_format_]
3417
3418 # Deal with hiding indexes and relabelling column names
3419 hide_: list[dict] = []
3420 relabel_index_: list[dict] = []
3421 if columns:
3422 hide_.append(
3423 {
3424 "subset": [c for c in self.columns if c not in columns],
3425 "axis": "columns",
3426 }
3427 )
3428 if header is False:
3429 hide_.append({"axis": "columns"})
3430 elif isinstance(header, (list, tuple)):
3431 relabel_index_.append({"labels": header, "axis": "columns"})
3432 format_index_ = [index_format_] # column_format is overwritten
3433
3434 if index is False:
3435 hide_.append({"axis": "index"})
3436 if index_names is False:
3437 hide_.append({"names": True, "axis": "index"})
3438
3439 render_kwargs_ = {
3440 "hrules": True,
3441 "sparse_index": sparsify,
3442 "sparse_columns": sparsify,
3443 "environment": "longtable" if longtable else None,
3444 "multicol_align": multicolumn_format
3445 if multicolumn
3446 else f"naive-{multicolumn_format}",
3447 "multirow_align": "t" if multirow else "naive",
3448 "encoding": encoding,
3449 "caption": caption,
3450 "label": label,
3451 "position": position,
3452 "column_format": column_format,
3453 "clines": "skip-last;data"
3454 if (multirow and isinstance(self.index, MultiIndex))
3455 else None,
3456 "bold_rows": bold_rows,
3457 }
3458
3459 return self._to_latex_via_styler(
3460 buf,
3461 hide=hide_,
3462 relabel_index=relabel_index_,
3463 format={"formatter": formatters_, **base_format_},
3464 format_index=format_index_,
3465 render_kwargs=render_kwargs_,
3466 )
3467
3468 def _to_latex_via_styler(
3469 self,
3470 buf=None,
3471 *,
3472 hide: dict | list[dict] | None = None,
3473 relabel_index: dict | list[dict] | None = None,
3474 format: dict | list[dict] | None = None,
3475 format_index: dict | list[dict] | None = None,
3476 render_kwargs: dict | None = None,
3477 ):
3478 """
3479 Render object to a LaTeX tabular, longtable, or nested table.
3480
3481 Uses the ``Styler`` implementation with the following, ordered, method chaining:
3482
        .. code-block:: python

            styler = Styler(DataFrame)
3485 styler.hide(**hide)
3486 styler.relabel_index(**relabel_index)
3487 styler.format(**format)
3488 styler.format_index(**format_index)
3489 styler.to_latex(buf=buf, **render_kwargs)
3490
3491 Parameters
3492 ----------
3493 buf : str, Path or StringIO-like, optional, default None
3494 Buffer to write to. If None, the output is returned as a string.
3495 hide : dict, list of dict
3496 Keyword args to pass to the method call of ``Styler.hide``. If a list will
3497 call the method numerous times.
3498 relabel_index : dict, list of dict
3499 Keyword args to pass to the method of ``Styler.relabel_index``. If a list
3500 will call the method numerous times.
3501 format : dict, list of dict
3502 Keyword args to pass to the method call of ``Styler.format``. If a list will
3503 call the method numerous times.
3504 format_index : dict, list of dict
3505 Keyword args to pass to the method call of ``Styler.format_index``. If a
3506 list will call the method numerous times.
3507 render_kwargs : dict
3508 Keyword args to pass to the method call of ``Styler.to_latex``.
3509
3510 Returns
3511 -------
3512 str or None
3513 If buf is None, returns the result as a string. Otherwise returns None.
3514 """
3515 from pandas.io.formats.style import Styler
3516
3517 self = cast("DataFrame", self)
3518 styler = Styler(self, uuid="")
3519
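        # Apply each Styler method once per kwargs dict; a list of dicts means
        # the method is called repeatedly (e.g. hiding several subsets).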
3520 for kw_name in ["hide", "relabel_index", "format", "format_index"]:
3521 kw = vars()[kw_name]
3522 if isinstance(kw, dict):
3523 getattr(styler, kw_name)(**kw)
3524 elif isinstance(kw, list):
3525 for sub_kw in kw:
3526 getattr(styler, kw_name)(**sub_kw)
3527
3528 # bold_rows is not a direct kwarg of Styler.to_latex
3529 render_kwargs = {} if render_kwargs is None else render_kwargs
        if render_kwargs.pop("bold_rows", False):
3531 styler.applymap_index(lambda v: "textbf:--rwrap;")
3532
3533 return styler.to_latex(buf=buf, **render_kwargs)
3534
3535 @overload
3536 def to_csv(
3537 self,
3538 path_or_buf: None = ...,
3539 sep: str = ...,
3540 na_rep: str = ...,
3541 float_format: str | Callable | None = ...,
3542 columns: Sequence[Hashable] | None = ...,
3543 header: bool_t | list[str] = ...,
3544 index: bool_t = ...,
3545 index_label: IndexLabel | None = ...,
3546 mode: str = ...,
3547 encoding: str | None = ...,
3548 compression: CompressionOptions = ...,
3549 quoting: int | None = ...,
3550 quotechar: str = ...,
3551 lineterminator: str | None = ...,
3552 chunksize: int | None = ...,
3553 date_format: str | None = ...,
3554 doublequote: bool_t = ...,
3555 escapechar: str | None = ...,
3556 decimal: str = ...,
3557 errors: str = ...,
3558 storage_options: StorageOptions = ...,
3559 ) -> str:
3560 ...
3561
3562 @overload
3563 def to_csv(
3564 self,
3565 path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str],
3566 sep: str = ...,
3567 na_rep: str = ...,
3568 float_format: str | Callable | None = ...,
3569 columns: Sequence[Hashable] | None = ...,
3570 header: bool_t | list[str] = ...,
3571 index: bool_t = ...,
3572 index_label: IndexLabel | None = ...,
3573 mode: str = ...,
3574 encoding: str | None = ...,
3575 compression: CompressionOptions = ...,
3576 quoting: int | None = ...,
3577 quotechar: str = ...,
3578 lineterminator: str | None = ...,
3579 chunksize: int | None = ...,
3580 date_format: str | None = ...,
3581 doublequote: bool_t = ...,
3582 escapechar: str | None = ...,
3583 decimal: str = ...,
3584 errors: str = ...,
3585 storage_options: StorageOptions = ...,
3586 ) -> None:
3587 ...
3588
3589 @final
3590 @doc(
3591 storage_options=_shared_docs["storage_options"],
3592 compression_options=_shared_docs["compression_options"] % "path_or_buf",
3593 )
3594 def to_csv(
3595 self,
3596 path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
3597 sep: str = ",",
3598 na_rep: str = "",
3599 float_format: str | Callable | None = None,
3600 columns: Sequence[Hashable] | None = None,
3601 header: bool_t | list[str] = True,
3602 index: bool_t = True,
3603 index_label: IndexLabel | None = None,
3604 mode: str = "w",
3605 encoding: str | None = None,
3606 compression: CompressionOptions = "infer",
3607 quoting: int | None = None,
3608 quotechar: str = '"',
3609 lineterminator: str | None = None,
3610 chunksize: int | None = None,
3611 date_format: str | None = None,
3612 doublequote: bool_t = True,
3613 escapechar: str | None = None,
3614 decimal: str = ".",
3615 errors: str = "strict",
3616 storage_options: StorageOptions = None,
3617 ) -> str | None:
3618 r"""
3619 Write object to a comma-separated values (csv) file.
3620
3621 Parameters
3622 ----------
3623 path_or_buf : str, path object, file-like object, or None, default None
3624 String, path object (implementing os.PathLike[str]), or file-like
3625 object implementing a write() function. If None, the result is
3626 returned as a string. If a non-binary file object is passed, it should
3627 be opened with `newline=''`, disabling universal newlines. If a binary
3628 file object is passed, `mode` might need to contain a `'b'`.
3629
3630 .. versionchanged:: 1.2.0
3631
3632 Support for binary file objects was introduced.
3633
3634 sep : str, default ','
3635 String of length 1. Field delimiter for the output file.
3636 na_rep : str, default ''
3637 Missing data representation.
3638 float_format : str, Callable, default None
3639 Format string for floating point numbers. If a Callable is given, it takes
3640 precedence over other numeric formatting parameters, like decimal.
3641 columns : sequence, optional
3642 Columns to write.
3643 header : bool or list of str, default True
3644 Write out the column names. If a list of strings is given it is
3645 assumed to be aliases for the column names.
3646 index : bool, default True
3647 Write row names (index).
3648 index_label : str or sequence, or False, default None
3649 Column label for index column(s) if desired. If None is given, and
3650 `header` and `index` are True, then the index names are used. A
3651 sequence should be given if the object uses MultiIndex. If
3652 False do not print fields for index names. Use index_label=False
3653 for easier importing in R.
3654 mode : str, default 'w'
3655 Python write mode. The available write modes are the same as
3656 :py:func:`open`.
3657 encoding : str, optional
3658 A string representing the encoding to use in the output file,
3659 defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
3660 is a non-binary file object.
3661 {compression_options}
3662
3663 .. versionchanged:: 1.0.0
3664
3665 May now be a dict with key 'method' as compression mode
3666 and other entries as additional compression options if
3667 compression mode is 'zip'.
3668
3669 .. versionchanged:: 1.1.0
3670
3671 Passing compression options as keys in dict is
3672 supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'.
3673
3674 .. versionchanged:: 1.2.0
3675
3676 Compression is supported for binary file objects.
3677
3678 .. versionchanged:: 1.2.0
3679
3680 Previous versions forwarded dict entries for 'gzip' to
3681 `gzip.open` instead of `gzip.GzipFile` which prevented
3682 setting `mtime`.
3683
3684 quoting : optional constant from csv module
3685 Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
3686 then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
3687 will treat them as non-numeric.
3688 quotechar : str, default '\"'
3689 String of length 1. Character used to quote fields.
        lineterminator : str, optional
            The newline character or character sequence to use in the output
            file. Defaults to `os.linesep`, which depends on the OS in which
            this method is called (e.g. '\\n' for Linux, '\\r\\n' for Windows).
3694
3695 .. versionchanged:: 1.5.0
3696
3697 Previously was line_terminator, changed for consistency with
3698 read_csv and the standard library 'csv' module.
3699
3700 chunksize : int or None
3701 Rows to write at a time.
3702 date_format : str, default None
3703 Format string for datetime objects.
3704 doublequote : bool, default True
3705 Control quoting of `quotechar` inside a field.
3706 escapechar : str, default None
3707 String of length 1. Character used to escape `sep` and `quotechar`
3708 when appropriate.
3709 decimal : str, default '.'
3710 Character recognized as decimal separator. E.g. use ',' for
3711 European data.
3712 errors : str, default 'strict'
3713 Specifies how encoding and decoding errors are to be handled.
3714 See the errors argument for :func:`open` for a full list
3715 of options.
3716
3717 .. versionadded:: 1.1.0
3718
3719 {storage_options}
3720
3721 .. versionadded:: 1.2.0
3722
3723 Returns
3724 -------
3725 None or str
3726 If path_or_buf is None, returns the resulting csv format as a
3727 string. Otherwise returns None.
3728
3729 See Also
3730 --------
3731 read_csv : Load a CSV file into a DataFrame.
3732 to_excel : Write DataFrame to an Excel file.
3733
3734 Examples
3735 --------
3736 >>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'],
3737 ... 'mask': ['red', 'purple'],
3738 ... 'weapon': ['sai', 'bo staff']}})
3739 >>> df.to_csv(index=False)
3740 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'
3741
3742 Create 'out.zip' containing 'out.csv'
3743
3744 >>> compression_opts = dict(method='zip',
3745 ... archive_name='out.csv') # doctest: +SKIP
3746 >>> df.to_csv('out.zip', index=False,
3747 ... compression=compression_opts) # doctest: +SKIP
3748
        To write a CSV file to a new folder or nested folder, you will first
        need to create it using either Pathlib or os:
3751
3752 >>> from pathlib import Path # doctest: +SKIP
3753 >>> filepath = Path('folder/subfolder/out.csv') # doctest: +SKIP
3754 >>> filepath.parent.mkdir(parents=True, exist_ok=True) # doctest: +SKIP
3755 >>> df.to_csv(filepath) # doctest: +SKIP
3756
3757 >>> import os # doctest: +SKIP
3758 >>> os.makedirs('folder/subfolder', exist_ok=True) # doctest: +SKIP
3759 >>> df.to_csv('folder/subfolder/out.csv') # doctest: +SKIP
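
        A dict passed to ``compression`` can also carry options for the
        underlying compressor; for instance (an illustrative sketch), the
        1.2.0 change noted above makes it possible to fix the gzip ``mtime``:

        >>> compression_opts = {{'method': 'gzip', 'mtime': 0}}  # doctest: +SKIP
        >>> df.to_csv('out.csv.gz',
        ...           compression=compression_opts)  # doctest: +SKIP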
3760 """
3761 df = self if isinstance(self, ABCDataFrame) else self.to_frame()
3762
3763 formatter = DataFrameFormatter(
3764 frame=df,
3765 header=header,
3766 index=index,
3767 na_rep=na_rep,
3768 float_format=float_format,
3769 decimal=decimal,
3770 )
3771
3772 return DataFrameRenderer(formatter).to_csv(
3773 path_or_buf,
3774 lineterminator=lineterminator,
3775 sep=sep,
3776 encoding=encoding,
3777 errors=errors,
3778 compression=compression,
3779 quoting=quoting,
3780 columns=columns,
3781 index_label=index_label,
3782 mode=mode,
3783 chunksize=chunksize,
3784 quotechar=quotechar,
3785 date_format=date_format,
3786 doublequote=doublequote,
3787 escapechar=escapechar,
3788 storage_options=storage_options,
3789 )
3790
3791 # ----------------------------------------------------------------------
3792 # Lookup Caching
3793
3794 def _reset_cacher(self) -> None:
3795 """
3796 Reset the cacher.
3797 """
3798 raise AbstractMethodError(self)
3799
3800 def _maybe_update_cacher(
3801 self,
3802 clear: bool_t = False,
3803 verify_is_copy: bool_t = True,
3804 inplace: bool_t = False,
3805 ) -> None:
3806 """
        See if we need to update our parent cacher; if ``clear``, then clear
        our cache.

        Parameters
        ----------
        clear : bool, default False
            Clear the item cache.
        verify_is_copy : bool, default True
            Provide is_copy checks.
        inplace : bool, default False
            Whether the update happens in-place; relevant to subclass
            implementations that track a cacher.
3816 """
3817 if using_copy_on_write():
3818 return
3819
3820 if verify_is_copy:
3821 self._check_setitem_copy(t="referent")
3822
3823 if clear:
3824 self._clear_item_cache()
3825
3826 def _clear_item_cache(self) -> None:
3827 raise AbstractMethodError(self)
3828
3829 # ----------------------------------------------------------------------
3830 # Indexing Methods
3831
3832 def take(self: NDFrameT, indices, axis: Axis = 0, **kwargs) -> NDFrameT:
3833 """
3834 Return the elements in the given *positional* indices along an axis.
3835
3836 This means that we are not indexing according to actual values in
3837 the index attribute of the object. We are indexing according to the
3838 actual position of the element in the object.
3839
3840 Parameters
3841 ----------
3842 indices : array-like
3843 An array of ints indicating which positions to take.
3844 axis : {0 or 'index', 1 or 'columns', None}, default 0
3845 The axis on which to select elements. ``0`` means that we are
3846 selecting rows, ``1`` means that we are selecting columns.
3847 For `Series` this parameter is unused and defaults to 0.
3848 **kwargs
3849 For compatibility with :meth:`numpy.take`. Has no effect on the
3850 output.
3851
3852 Returns
3853 -------
3854 same type as caller
3855 An array-like containing the elements taken from the object.
3856
3857 See Also
3858 --------
3859 DataFrame.loc : Select a subset of a DataFrame by labels.
3860 DataFrame.iloc : Select a subset of a DataFrame by positions.
3861 numpy.take : Take elements from an array along an axis.
3862
3863 Examples
3864 --------
3865 >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
3866 ... ('parrot', 'bird', 24.0),
3867 ... ('lion', 'mammal', 80.5),
3868 ... ('monkey', 'mammal', np.nan)],
3869 ... columns=['name', 'class', 'max_speed'],
3870 ... index=[0, 2, 3, 1])
3871 >>> df
3872 name class max_speed
3873 0 falcon bird 389.0
3874 2 parrot bird 24.0
3875 3 lion mammal 80.5
3876 1 monkey mammal NaN
3877
3878 Take elements at positions 0 and 3 along the axis 0 (default).
3879
3880 Note how the actual indices selected (0 and 1) do not correspond to
3881 our selected indices 0 and 3. That's because we are selecting the 0th
3882 and 3rd rows, not rows whose indices equal 0 and 3.
3883
3884 >>> df.take([0, 3])
3885 name class max_speed
3886 0 falcon bird 389.0
3887 1 monkey mammal NaN
3888
3889 Take elements at indices 1 and 2 along the axis 1 (column selection).
3890
3891 >>> df.take([1, 2], axis=1)
3892 class max_speed
3893 0 bird 389.0
3894 2 bird 24.0
3895 3 mammal 80.5
3896 1 mammal NaN
3897
        We may take elements using negative integers, which count from
        the end of the object, just like with Python lists.
3900
3901 >>> df.take([-1, -2])
3902 name class max_speed
3903 1 monkey mammal NaN
3904 3 lion mammal 80.5
3905 """
3906
3907 nv.validate_take((), kwargs)
3908
3909 return self._take(indices, axis)
3910
3911 def _take(
3912 self: NDFrameT,
3913 indices,
3914 axis: Axis = 0,
3915 convert_indices: bool_t = True,
3916 ) -> NDFrameT:
3917 """
        Internal version of the `take` method allowing specification of
        additional args.
3919
3920 See the docstring of `take` for full explanation of the parameters.
3921 """
3922 if not isinstance(indices, slice):
3923 indices = np.asarray(indices, dtype=np.intp)
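            # Under copy-on-write, taking every row in its original order is
            # a no-op; return a lazy copy instead of going through the
            # manager's take machinery.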
3924 if (
3925 axis == 0
3926 and indices.ndim == 1
3927 and using_copy_on_write()
3928 and is_range_indexer(indices, len(self))
3929 ):
3930 return self.copy(deep=None)
3931
3932 new_data = self._mgr.take(
3933 indices,
3934 axis=self._get_block_manager_axis(axis),
3935 verify=True,
3936 convert_indices=convert_indices,
3937 )
3938 return self._constructor(new_data).__finalize__(self, method="take")
3939
3940 def _take_with_is_copy(self: NDFrameT, indices, axis: Axis = 0) -> NDFrameT:
3941 """
        Internal version of the `take` method that sets the `_is_copy`
        attribute to keep track of the parent dataframe (used in indexing
        to support the SettingWithCopyWarning).
3945
3946 See the docstring of `take` for full explanation of the parameters.
3947 """
3948 result = self._take(indices=indices, axis=axis)
        # Only mark the result as a copy if the take actually changed the axis.
3950 if not result._get_axis(axis).equals(self._get_axis(axis)):
3951 result._set_is_copy(self)
3952 return result
3953
3954 @final
3955 def xs(
3956 self: NDFrameT,
3957 key: IndexLabel,
3958 axis: Axis = 0,
3959 level: IndexLabel = None,
3960 drop_level: bool_t = True,
3961 ) -> NDFrameT:
3962 """
3963 Return cross-section from the Series/DataFrame.
3964
3965 This method takes a `key` argument to select data at a particular
3966 level of a MultiIndex.
3967
3968 Parameters
3969 ----------
3970 key : label or tuple of label
3971 Label contained in the index, or partially in a MultiIndex.
3972 axis : {0 or 'index', 1 or 'columns'}, default 0
3973 Axis to retrieve cross-section on.
3974 level : object, defaults to first n levels (n=1 or len(key))
3975 In case of a key partially contained in a MultiIndex, indicate
3976 which levels are used. Levels can be referred by label or position.
3977 drop_level : bool, default True
3978 If False, returns object with same levels as self.
3979
3980 Returns
3981 -------
3982 Series or DataFrame
3983 Cross-section from the original Series or DataFrame
3984 corresponding to the selected index levels.
3985
3986 See Also
3987 --------
3988 DataFrame.loc : Access a group of rows and columns
3989 by label(s) or a boolean array.
3990 DataFrame.iloc : Purely integer-location based indexing
3991 for selection by position.
3992
3993 Notes
3994 -----
        `xs` cannot be used to set values.
3996
3997 MultiIndex Slicers is a generic way to get/set values on
3998 any level or levels.
3999 It is a superset of `xs` functionality, see
4000 :ref:`MultiIndex Slicers <advanced.mi_slicers>`.
4001
4002 Examples
4003 --------
4004 >>> d = {'num_legs': [4, 4, 2, 2],
4005 ... 'num_wings': [0, 0, 2, 2],
4006 ... 'class': ['mammal', 'mammal', 'mammal', 'bird'],
4007 ... 'animal': ['cat', 'dog', 'bat', 'penguin'],
4008 ... 'locomotion': ['walks', 'walks', 'flies', 'walks']}
4009 >>> df = pd.DataFrame(data=d)
4010 >>> df = df.set_index(['class', 'animal', 'locomotion'])
4011 >>> df
4012 num_legs num_wings
4013 class animal locomotion
4014 mammal cat walks 4 0
4015 dog walks 4 0
4016 bat flies 2 2
4017 bird penguin walks 2 2
4018
4019 Get values at specified index
4020
4021 >>> df.xs('mammal')
4022 num_legs num_wings
4023 animal locomotion
4024 cat walks 4 0
4025 dog walks 4 0
4026 bat flies 2 2
4027
4028 Get values at several indexes
4029
4030 >>> df.xs(('mammal', 'dog', 'walks'))
4031 num_legs 4
4032 num_wings 0
4033 Name: (mammal, dog, walks), dtype: int64
4034
4035 Get values at specified index and level
4036
4037 >>> df.xs('cat', level=1)
4038 num_legs num_wings
4039 class locomotion
4040 mammal walks 4 0
4041
4042 Get values at several indexes and levels
4043
4044 >>> df.xs(('bird', 'walks'),
4045 ... level=[0, 'locomotion'])
4046 num_legs num_wings
4047 animal
4048 penguin 2 2
4049
4050 Get values at specified column and axis
4051
4052 >>> df.xs('num_wings', axis=1)
4053 class animal locomotion
4054 mammal cat walks 0
4055 dog walks 0
4056 bat flies 2
4057 bird penguin walks 2
4058 Name: num_wings, dtype: int64
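
        Keep all index levels with ``drop_level=False`` (output elided; the
        result retains the full MultiIndex):

        >>> df.xs('mammal', drop_level=False)  # doctest: +SKIP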
4059 """
4060 axis = self._get_axis_number(axis)
4061 labels = self._get_axis(axis)
4062
4063 if isinstance(key, list):
4064 raise TypeError("list keys are not supported in xs, pass a tuple instead")
4065
4066 if level is not None:
4067 if not isinstance(labels, MultiIndex):
4068 raise TypeError("Index must be a MultiIndex")
4069 loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level)
4070
4071 # create the tuple of the indexer
4072 _indexer = [slice(None)] * self.ndim
4073 _indexer[axis] = loc
4074 indexer = tuple(_indexer)
4075
4076 result = self.iloc[indexer]
4077 setattr(result, result._get_axis_name(axis), new_ax)
4078 return result
4079
4080 if axis == 1:
4081 if drop_level:
4082 return self[key]
4083 index = self.columns
4084 else:
4085 index = self.index
4086
4087 if isinstance(index, MultiIndex):
4088 loc, new_index = index._get_loc_level(key, level=0)
4089 if not drop_level:
4090 if lib.is_integer(loc):
4091 new_index = index[loc : loc + 1]
4092 else:
4093 new_index = index[loc]
4094 else:
4095 loc = index.get_loc(key)
4096
4097 if isinstance(loc, np.ndarray):
4098 if loc.dtype == np.bool_:
4099 (inds,) = loc.nonzero()
4100 return self._take_with_is_copy(inds, axis=axis)
4101 else:
4102 return self._take_with_is_copy(loc, axis=axis)
4103
4104 if not is_scalar(loc):
4105 new_index = index[loc]
4106
4107 if is_scalar(loc) and axis == 0:
4108 # In this case loc should be an integer
4109 if self.ndim == 1:
4110 # if we encounter an array-like and we only have 1 dim
                # that means that there are lists/ndarrays inside the Series!
4112 # so just return them (GH 6394)
4113 return self._values[loc]
4114
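            # fast_xs materializes a single row across all blocks into a new
            # single-row manager; the sliced constructor (a Series for
            # DataFrame callers) wraps it below.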
4115 new_mgr = self._mgr.fast_xs(loc)
4116
4117 result = self._constructor_sliced(
4118 new_mgr, name=self.index[loc]
4119 ).__finalize__(self)
4120 elif is_scalar(loc):
4121 result = self.iloc[:, slice(loc, loc + 1)]
4122 elif axis == 1:
4123 result = self.iloc[:, loc]
4124 else:
4125 result = self.iloc[loc]
4126 result.index = new_index
4127
4128 # this could be a view
4129 # but only in a single-dtyped view sliceable case
4130 result._set_is_copy(self, copy=not result._is_view)
4131 return result
4132
4133 def __getitem__(self, item):
4134 raise AbstractMethodError(self)
4135
4136 def _slice(self: NDFrameT, slobj: slice, axis: Axis = 0) -> NDFrameT:
4137 """
4138 Construct a slice of this container.
4139
4140 Slicing with this method is *always* positional.
4141 """
4142 assert isinstance(slobj, slice), type(slobj)
4143 axis = self._get_block_manager_axis(axis)
4144 result = self._constructor(self._mgr.get_slice(slobj, axis=axis))
4145 result = result.__finalize__(self)
4146
4147 # this could be a view
4148 # but only in a single-dtyped view sliceable case
4149 is_copy = axis != 0 or result._is_view
4150 result._set_is_copy(self, copy=is_copy)
4151 return result
4152
4153 @final
4154 def _set_is_copy(self, ref: NDFrame, copy: bool_t = True) -> None:
4155 if not copy:
4156 self._is_copy = None
4157 else:
4158 assert ref is not None
4159 self._is_copy = weakref.ref(ref)
4160
4161 def _check_is_chained_assignment_possible(self) -> bool_t:
4162 """
4163 Check if we are a view, have a cacher, and are of mixed type.
4164 If so, then force a setitem_copy check.
4165
        Should be called just prior to setting a value.

        Returns True if we are a view, have a cacher, and are single-dtyped,
        meaning that the cacher should be updated following the setting.
4171 """
4172 if self._is_copy:
4173 self._check_setitem_copy(t="referent")
4174 return False
4175
4176 @final
4177 def _check_setitem_copy(self, t: str = "setting", force: bool_t = False):
4178 """
        Validate if we are doing a setitem on a chained copy.

        Parameters
        ----------
        t : str, the type of setting error
        force : bool, default False
            If True, then force showing an error.

        Notes
        -----
        It is technically possible to figure out that we are setting on
        a copy even WITH a multi-dtyped pandas object. In other words, some
        blocks may be views while others are not. Currently _is_view will
        ALWAYS return False for multi-blocks to avoid having to handle this
        case.

        df = DataFrame(np.arange(0, 9), columns=['count'])
        df['group'] = 'b'

        # This technically need not raise SettingWithCopy if both are views
        # (which is not generally guaranteed but is usually True). However,
        # this is in general not a good practice and we recommend using .loc.
        df.iloc[0:5]['group'] = 'a'

4201 """
4202 if using_copy_on_write():
4203 return
4204
4205 # return early if the check is not needed
4206 if not (force or self._is_copy):
4207 return
4208
4209 value = config.get_option("mode.chained_assignment")
4210 if value is None:
4211 return
4212
4213 # see if the copy is not actually referred; if so, then dissolve
4214 # the copy weakref
4215 if self._is_copy is not None and not isinstance(self._is_copy, str):
4216 r = self._is_copy()
4217 if not gc.get_referents(r) or (r is not None and r.shape == self.shape):
4218 self._is_copy = None
4219 return
4220
4221 # a custom message
4222 if isinstance(self._is_copy, str):
4223 t = self._is_copy
4224
4225 elif t == "referent":
4226 t = (
4227 "\n"
4228 "A value is trying to be set on a copy of a slice from a "
4229 "DataFrame\n\n"
4230 "See the caveats in the documentation: "
4231 "https://pandas.pydata.org/pandas-docs/stable/user_guide/"
4232 "indexing.html#returning-a-view-versus-a-copy"
4233 )
4234
4235 else:
4236 t = (
4237 "\n"
4238 "A value is trying to be set on a copy of a slice from a "
4239 "DataFrame.\n"
4240 "Try using .loc[row_indexer,col_indexer] = value "
4241 "instead\n\nSee the caveats in the documentation: "
4242 "https://pandas.pydata.org/pandas-docs/stable/user_guide/"
4243 "indexing.html#returning-a-view-versus-a-copy"
4244 )
4245
4246 if value == "raise":
4247 raise SettingWithCopyError(t)
4248 if value == "warn":
4249 warnings.warn(t, SettingWithCopyWarning, stacklevel=find_stack_level())
4250
4251 def __delitem__(self, key) -> None:
4252 """
4253 Delete item
4254 """
4255 deleted = False
4256
4257 maybe_shortcut = False
4258 if self.ndim == 2 and isinstance(self.columns, MultiIndex):
4259 try:
4260 # By using engine's __contains__ we effectively
4261 # restrict to same-length tuples
4262 maybe_shortcut = key not in self.columns._engine
4263 except TypeError:
4264 pass
4265
4266 if maybe_shortcut:
4267 # Allow shorthand to delete all columns whose first len(key)
4268 # elements match key:
4269 if not isinstance(key, tuple):
4270 key = (key,)
4271 for col in self.columns:
4272 if isinstance(col, tuple) and col[: len(key)] == key:
4273 del self[col]
4274 deleted = True
4275 if not deleted:
4276 # If the above loop ran and didn't delete anything because
4277 # there was no match, this call should raise the appropriate
4278 # exception:
4279 loc = self.axes[-1].get_loc(key)
4280 self._mgr = self._mgr.idelete(loc)
4281
4282 # delete from the caches
4283 try:
4284 del self._item_cache[key]
4285 except KeyError:
4286 pass
4287
4288 # ----------------------------------------------------------------------
4289 # Unsorted
4290
4291 @final
4292 def _check_inplace_and_allows_duplicate_labels(self, inplace):
4293 if inplace and not self.flags.allows_duplicate_labels:
4294 raise ValueError(
4295 "Cannot specify 'inplace=True' when "
4296 "'self.flags.allows_duplicate_labels' is False."
4297 )
4298
4299 @final
4300 def get(self, key, default=None):
4301 """
4302 Get item from object for given key (ex: DataFrame column).
4303
4304 Returns default value if not found.
4305
        Parameters
        ----------
        key : object
            Key for which to look up a value in the object.
        default : object, default None
            Value to return if the key is not found.

        Returns
        -------
        same type as items contained in object
4313
4314 Examples
4315 --------
4316 >>> df = pd.DataFrame(
4317 ... [
4318 ... [24.3, 75.7, "high"],
4319 ... [31, 87.8, "high"],
4320 ... [22, 71.6, "medium"],
4321 ... [35, 95, "medium"],
4322 ... ],
4323 ... columns=["temp_celsius", "temp_fahrenheit", "windspeed"],
4324 ... index=pd.date_range(start="2014-02-12", end="2014-02-15", freq="D"),
4325 ... )
4326
4327 >>> df
4328 temp_celsius temp_fahrenheit windspeed
4329 2014-02-12 24.3 75.7 high
4330 2014-02-13 31.0 87.8 high
4331 2014-02-14 22.0 71.6 medium
4332 2014-02-15 35.0 95.0 medium
4333
4334 >>> df.get(["temp_celsius", "windspeed"])
4335 temp_celsius windspeed
4336 2014-02-12 24.3 high
4337 2014-02-13 31.0 high
4338 2014-02-14 22.0 medium
4339 2014-02-15 35.0 medium
4340
4341 >>> ser = df['windspeed']
4342 >>> ser.get('2014-02-13')
4343 'high'
4344
4345 If the key isn't found, the default value will be used.
4346
4347 >>> df.get(["temp_celsius", "temp_kelvin"], default="default_value")
4348 'default_value'
4349
4350 >>> ser.get('2014-02-10', '[unknown]')
4351 '[unknown]'
4352 """
4353 try:
4354 return self[key]
4355 except (KeyError, ValueError, IndexError):
4356 return default
4357
4358 @final
4359 @property
4360 def _is_view(self) -> bool_t:
4361 """Return boolean indicating if self is view of another array"""
4362 return self._mgr.is_view
4363
4364 @final
4365 def reindex_like(
4366 self: NDFrameT,
4367 other,
4368 method: Literal["backfill", "bfill", "pad", "ffill", "nearest"] | None = None,
4369 copy: bool_t | None = None,
4370 limit=None,
4371 tolerance=None,
4372 ) -> NDFrameT:
4373 """
        Return an object with indices matching those of another object.
4375
4376 Conform the object to the same index on all axes. Optional
4377 filling logic, placing NaN in locations having no value
4378 in the previous index. A new object is produced unless the
4379 new index is equivalent to the current one and copy=False.
4380
4381 Parameters
4382 ----------
4383 other : Object of the same data type
4384 Its row and column indices are used to define the new indices
4385 of this object.
4386 method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}
4387 Method to use for filling holes in reindexed DataFrame.
4388 Please note: this is only applicable to DataFrames/Series with a
4389 monotonically increasing/decreasing index.
4390
4391 * None (default): don't fill gaps
4392 * pad / ffill: propagate last valid observation forward to next
4393 valid
4394 * backfill / bfill: use next valid observation to fill gap
4395 * nearest: use nearest valid observations to fill gap.
4396
4397 copy : bool, default True
4398 Return a new object, even if the passed indexes are the same.
4399 limit : int, default None
4400 Maximum number of consecutive labels to fill for inexact matches.
4401 tolerance : optional
4402 Maximum distance between original and new labels for inexact
4403 matches. The values of the index at the matching locations must
4404 satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
4405
4406 Tolerance may be a scalar value, which applies the same tolerance
4407 to all values, or list-like, which applies variable tolerance per
4408 element. List-like includes list, tuple, array, Series, and must be
4409 the same size as the index and its dtype must exactly match the
4410 index's type.
4411
4412 Returns
4413 -------
4414 Series or DataFrame
4415 Same type as caller, but with changed indices on each axis.
4416
4417 See Also
4418 --------
4419 DataFrame.set_index : Set row labels.
4420 DataFrame.reset_index : Remove row labels or move them to new columns.
4421 DataFrame.reindex : Change to new indices or expand indices.
4422
4423 Notes
4424 -----
4425 Same as calling
4426 ``.reindex(index=other.index, columns=other.columns,...)``.
4427
4428 Examples
4429 --------
4430 >>> df1 = pd.DataFrame([[24.3, 75.7, 'high'],
4431 ... [31, 87.8, 'high'],
4432 ... [22, 71.6, 'medium'],
4433 ... [35, 95, 'medium']],
4434 ... columns=['temp_celsius', 'temp_fahrenheit',
4435 ... 'windspeed'],
4436 ... index=pd.date_range(start='2014-02-12',
4437 ... end='2014-02-15', freq='D'))
4438
4439 >>> df1
4440 temp_celsius temp_fahrenheit windspeed
4441 2014-02-12 24.3 75.7 high
4442 2014-02-13 31.0 87.8 high
4443 2014-02-14 22.0 71.6 medium
4444 2014-02-15 35.0 95.0 medium
4445
4446 >>> df2 = pd.DataFrame([[28, 'low'],
4447 ... [30, 'low'],
4448 ... [35.1, 'medium']],
4449 ... columns=['temp_celsius', 'windspeed'],
4450 ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13',
4451 ... '2014-02-15']))
4452
4453 >>> df2
4454 temp_celsius windspeed
4455 2014-02-12 28.0 low
4456 2014-02-13 30.0 low
4457 2014-02-15 35.1 medium
4458
4459 >>> df2.reindex_like(df1)
4460 temp_celsius temp_fahrenheit windspeed
4461 2014-02-12 28.0 NaN low
4462 2014-02-13 30.0 NaN low
4463 2014-02-14 NaN NaN NaN
4464 2014-02-15 35.1 NaN medium
4465 """
4466 d = other._construct_axes_dict(
4467 axes=self._AXIS_ORDERS,
4468 method=method,
4469 copy=copy,
4470 limit=limit,
4471 tolerance=tolerance,
4472 )
4473
4474 return self.reindex(**d)
4475
4476 @overload
4477 def drop(
4478 self,
4479 labels: IndexLabel = ...,
4480 *,
4481 axis: Axis = ...,
4482 index: IndexLabel = ...,
4483 columns: IndexLabel = ...,
4484 level: Level | None = ...,
4485 inplace: Literal[True],
4486 errors: IgnoreRaise = ...,
4487 ) -> None:
4488 ...
4489
4490 @overload
4491 def drop(
4492 self: NDFrameT,
4493 labels: IndexLabel = ...,
4494 *,
4495 axis: Axis = ...,
4496 index: IndexLabel = ...,
4497 columns: IndexLabel = ...,
4498 level: Level | None = ...,
4499 inplace: Literal[False] = ...,
4500 errors: IgnoreRaise = ...,
4501 ) -> NDFrameT:
4502 ...
4503
4504 @overload
4505 def drop(
4506 self: NDFrameT,
4507 labels: IndexLabel = ...,
4508 *,
4509 axis: Axis = ...,
4510 index: IndexLabel = ...,
4511 columns: IndexLabel = ...,
4512 level: Level | None = ...,
4513 inplace: bool_t = ...,
4514 errors: IgnoreRaise = ...,
4515 ) -> NDFrameT | None:
4516 ...
4517
4518 def drop(
4519 self: NDFrameT,
4520 labels: IndexLabel = None,
4521 *,
4522 axis: Axis = 0,
4523 index: IndexLabel = None,
4524 columns: IndexLabel = None,
4525 level: Level | None = None,
4526 inplace: bool_t = False,
4527 errors: IgnoreRaise = "raise",
4528 ) -> NDFrameT | None:
4529 inplace = validate_bool_kwarg(inplace, "inplace")
4530
4531 if labels is not None:
4532 if index is not None or columns is not None:
4533 raise ValueError("Cannot specify both 'labels' and 'index'/'columns'")
4534 axis_name = self._get_axis_name(axis)
4535 axes = {axis_name: labels}
4536 elif index is not None or columns is not None:
4537 axes = {"index": index}
4538 if self.ndim == 2:
4539 axes["columns"] = columns
4540 else:
4541 raise ValueError(
4542 "Need to specify at least one of 'labels', 'index' or 'columns'"
4543 )
4544
4545 obj = self
4546
4547 for axis, labels in axes.items():
4548 if labels is not None:
4549 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
4550
4551 if inplace:
4552 self._update_inplace(obj)
4553 return None
4554 else:
4555 return obj
4556
4557 @final
4558 def _drop_axis(
4559 self: NDFrameT,
4560 labels,
4561 axis,
4562 level=None,
4563 errors: IgnoreRaise = "raise",
4564 only_slice: bool_t = False,
4565 ) -> NDFrameT:
4566 """
4567 Drop labels from specified axis. Used in the ``drop`` method
4568 internally.
4569
4570 Parameters
4571 ----------
4572 labels : single label or list-like
4573 axis : int or axis name
4574 level : int or level name, default None
4575 For MultiIndex
4576 errors : {'ignore', 'raise'}, default 'raise'
4577 If 'ignore', suppress error and existing labels are dropped.
4578 only_slice : bool, default False
4579 Whether indexing along columns should be view-only.
4580
4581 """
4582 axis_num = self._get_axis_number(axis)
4583 axis = self._get_axis(axis)
4584
4585 if axis.is_unique:
4586 if level is not None:
4587 if not isinstance(axis, MultiIndex):
4588 raise AssertionError("axis must be a MultiIndex")
4589 new_axis = axis.drop(labels, level=level, errors=errors)
4590 else:
4591 new_axis = axis.drop(labels, errors=errors)
4592 indexer = axis.get_indexer(new_axis)
4593
4594 # Case for non-unique axis
4595 else:
4596 is_tuple_labels = is_nested_list_like(labels) or isinstance(labels, tuple)
4597 labels = ensure_object(common.index_labels_to_array(labels))
4598 if level is not None:
4599 if not isinstance(axis, MultiIndex):
4600 raise AssertionError("axis must be a MultiIndex")
4601 mask = ~axis.get_level_values(level).isin(labels)
4602
4603 # GH 18561 MultiIndex.drop should raise if label is absent
4604 if errors == "raise" and mask.all():
4605 raise KeyError(f"{labels} not found in axis")
4606 elif (
4607 isinstance(axis, MultiIndex)
4608 and labels.dtype == "object"
4609 and not is_tuple_labels
4610 ):
4611 # Set level to zero in case of MultiIndex and label is string,
4612 # because isin can't handle strings for MultiIndexes GH#36293
4613 # In case of tuples we get dtype object but have to use isin GH#42771
4614 mask = ~axis.get_level_values(0).isin(labels)
4615 else:
4616 mask = ~axis.isin(labels)
4617 # Check if label doesn't exist along axis
4618 labels_missing = (axis.get_indexer_for(labels) == -1).any()
4619 if errors == "raise" and labels_missing:
4620 raise KeyError(f"{labels} not found in axis")
4621
4622 if is_extension_array_dtype(mask.dtype):
4623 # GH#45860
4624 mask = mask.to_numpy(dtype=bool)
4625
4626 indexer = mask.nonzero()[0]
4627 new_axis = axis.take(indexer)
4628
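        # Translate the user-facing axis into the block-manager axis; the
        # manager stores 2D data transposed.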
4629 bm_axis = self.ndim - axis_num - 1
4630 new_mgr = self._mgr.reindex_indexer(
4631 new_axis,
4632 indexer,
4633 axis=bm_axis,
4634 allow_dups=True,
4635 copy=None,
4636 only_slice=only_slice,
4637 )
4638 result = self._constructor(new_mgr)
4639 if self.ndim == 1:
4640 result.name = self.name
4641
4642 return result.__finalize__(self)
4643
4644 @final
4645 def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None:
4646 """
4647 Replace self internals with result.
4648
4649 Parameters
4650 ----------
4651 result : same type as self
4652 verify_is_copy : bool, default True
4653 Provide is_copy checks.
4654 """
4655 # NOTE: This does *not* call __finalize__ and that's an explicit
4656 # decision that we may revisit in the future.
4657 self._reset_cache()
4658 self._clear_item_cache()
4659 self._mgr = result._mgr
4660 self._maybe_update_cacher(verify_is_copy=verify_is_copy, inplace=True)
4661
4662 @final
4663 def add_prefix(self: NDFrameT, prefix: str, axis: Axis | None = None) -> NDFrameT:
4664 """
4665 Prefix labels with string `prefix`.
4666
4667 For Series, the row labels are prefixed.
4668 For DataFrame, the column labels are prefixed.
4669
4670 Parameters
4671 ----------
4672 prefix : str
4673 The string to add before each label.
        axis : {0 or 'index', 1 or 'columns', None}, default None
            Axis to add prefix on.
4676
4677 .. versionadded:: 2.0.0
4678
4679 Returns
4680 -------
4681 Series or DataFrame
4682 New Series or DataFrame with updated labels.
4683
4684 See Also
4685 --------
4686 Series.add_suffix: Suffix row labels with string `suffix`.
4687 DataFrame.add_suffix: Suffix column labels with string `suffix`.
4688
4689 Examples
4690 --------
4691 >>> s = pd.Series([1, 2, 3, 4])
4692 >>> s
4693 0 1
4694 1 2
4695 2 3
4696 3 4
4697 dtype: int64
4698
4699 >>> s.add_prefix('item_')
4700 item_0 1
4701 item_1 2
4702 item_2 3
4703 item_3 4
4704 dtype: int64
4705
4706 >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
4707 >>> df
4708 A B
4709 0 1 3
4710 1 2 4
4711 2 3 5
4712 3 4 6
4713
4714 >>> df.add_prefix('col_')
4715 col_A col_B
4716 0 1 3
4717 1 2 4
4718 2 3 5
4719 3 4 6
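
        With ``axis``, the row labels can be prefixed instead (output elided):

        >>> df.add_prefix('row_', axis=0)  # doctest: +SKIP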
4720 """
4721 f = lambda x: f"{prefix}{x}"
4722
4723 axis_name = self._info_axis_name
4724 if axis is not None:
4725 axis_name = self._get_axis_name(axis)
4726
4727 mapper = {axis_name: f}
4728
4729 # error: Incompatible return value type (got "Optional[NDFrameT]",
4730 # expected "NDFrameT")
4731 # error: Argument 1 to "rename" of "NDFrame" has incompatible type
4732 # "**Dict[str, partial[str]]"; expected "Union[str, int, None]"
4733 # error: Keywords must be strings
4734 return self._rename(**mapper) # type: ignore[return-value, arg-type, misc]
4735
4736 @final
4737 def add_suffix(self: NDFrameT, suffix: str, axis: Axis | None = None) -> NDFrameT:
4738 """
4739 Suffix labels with string `suffix`.
4740
4741 For Series, the row labels are suffixed.
4742 For DataFrame, the column labels are suffixed.
4743
4744 Parameters
4745 ----------
4746 suffix : str
4747 The string to add after each label.
        axis : {0 or 'index', 1 or 'columns', None}, default None
            Axis to add suffix on.
4750
4751 .. versionadded:: 2.0.0
4752
4753 Returns
4754 -------
4755 Series or DataFrame
4756 New Series or DataFrame with updated labels.
4757
4758 See Also
4759 --------
4760 Series.add_prefix: Prefix row labels with string `prefix`.
4761 DataFrame.add_prefix: Prefix column labels with string `prefix`.
4762
4763 Examples
4764 --------
4765 >>> s = pd.Series([1, 2, 3, 4])
4766 >>> s
4767 0 1
4768 1 2
4769 2 3
4770 3 4
4771 dtype: int64
4772
4773 >>> s.add_suffix('_item')
4774 0_item 1
4775 1_item 2
4776 2_item 3
4777 3_item 4
4778 dtype: int64
4779
4780 >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
4781 >>> df
4782 A B
4783 0 1 3
4784 1 2 4
4785 2 3 5
4786 3 4 6
4787
4788 >>> df.add_suffix('_col')
4789 A_col B_col
4790 0 1 3
4791 1 2 4
4792 2 3 5
4793 3 4 6
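
        With ``axis``, the row labels can be suffixed instead (output elided):

        >>> df.add_suffix('_row', axis=0)  # doctest: +SKIP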
4794 """
4795 f = lambda x: f"{x}{suffix}"
4796
4797 axis_name = self._info_axis_name
4798 if axis is not None:
4799 axis_name = self._get_axis_name(axis)
4800
4801 mapper = {axis_name: f}
4802 # error: Incompatible return value type (got "Optional[NDFrameT]",
4803 # expected "NDFrameT")
4804 # error: Argument 1 to "rename" of "NDFrame" has incompatible type
4805 # "**Dict[str, partial[str]]"; expected "Union[str, int, None]"
4806 # error: Keywords must be strings
4807 return self._rename(**mapper) # type: ignore[return-value, arg-type, misc]
4808
4809 @overload
4810 def sort_values(
4811 self: NDFrameT,
4812 *,
4813 axis: Axis = ...,
4814 ascending: bool_t | Sequence[bool_t] = ...,
4815 inplace: Literal[False] = ...,
4816 kind: str = ...,
4817 na_position: str = ...,
4818 ignore_index: bool_t = ...,
4819 key: ValueKeyFunc = ...,
4820 ) -> NDFrameT:
4821 ...
4822
4823 @overload
4824 def sort_values(
4825 self,
4826 *,
4827 axis: Axis = ...,
4828 ascending: bool_t | Sequence[bool_t] = ...,
4829 inplace: Literal[True],
4830 kind: str = ...,
4831 na_position: str = ...,
4832 ignore_index: bool_t = ...,
4833 key: ValueKeyFunc = ...,
4834 ) -> None:
4835 ...
4836
4837 @overload
4838 def sort_values(
4839 self: NDFrameT,
4840 *,
4841 axis: Axis = ...,
4842 ascending: bool_t | Sequence[bool_t] = ...,
4843 inplace: bool_t = ...,
4844 kind: str = ...,
4845 na_position: str = ...,
4846 ignore_index: bool_t = ...,
4847 key: ValueKeyFunc = ...,
4848 ) -> NDFrameT | None:
4849 ...
4850
4851 def sort_values(
4852 self: NDFrameT,
4853 *,
4854 axis: Axis = 0,
4855 ascending: bool_t | Sequence[bool_t] = True,
4856 inplace: bool_t = False,
4857 kind: str = "quicksort",
4858 na_position: str = "last",
4859 ignore_index: bool_t = False,
4860 key: ValueKeyFunc = None,
4861 ) -> NDFrameT | None:
4862 """
4863 Sort by the values along either axis.
4864
4865 Parameters
4866 ----------%(optional_by)s
4867 axis : %(axes_single_arg)s, default 0
4868 Axis to be sorted.
4869 ascending : bool or list of bool, default True
4870 Sort ascending vs. descending. Specify list for multiple sort
4871 orders. If this is a list of bools, must match the length of
            ``by``.
4873 inplace : bool, default False
4874 If True, perform operation in-place.
4875 kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
4876 Choice of sorting algorithm. See also :func:`numpy.sort` for more
4877 information. `mergesort` and `stable` are the only stable algorithms. For
4878 DataFrames, this option is only applied when sorting on a single
4879 column or label.
4880 na_position : {'first', 'last'}, default 'last'
4881 Puts NaNs at the beginning if `first`; `last` puts NaNs at the
4882 end.
4883 ignore_index : bool, default False
4884 If True, the resulting axis will be labeled 0, 1, …, n - 1.
4885 key : callable, optional
4886 Apply the key function to the values
4887 before sorting. This is similar to the `key` argument in the
4888 builtin :meth:`sorted` function, with the notable difference that
4889 this `key` function should be *vectorized*. It should expect a
4890 ``Series`` and return a Series with the same shape as the input.
4891 It will be applied to each column in `by` independently.
4892
4893 .. versionadded:: 1.1.0
4894
4895 Returns
4896 -------
4897 DataFrame or None
4898 DataFrame with sorted values or None if ``inplace=True``.
4899
4900 See Also
4901 --------
4902 DataFrame.sort_index : Sort a DataFrame by the index.
4903 Series.sort_values : Similar method for a Series.
4904
4905 Examples
4906 --------
4907 >>> df = pd.DataFrame({
4908 ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'],
4909 ... 'col2': [2, 1, 9, 8, 7, 4],
4910 ... 'col3': [0, 1, 9, 4, 2, 3],
4911 ... 'col4': ['a', 'B', 'c', 'D', 'e', 'F']
4912 ... })
4913 >>> df
4914 col1 col2 col3 col4
4915 0 A 2 0 a
4916 1 A 1 1 B
4917 2 B 9 9 c
4918 3 NaN 8 4 D
4919 4 D 7 2 e
4920 5 C 4 3 F
4921
4922 Sort by col1
4923
4924 >>> df.sort_values(by=['col1'])
4925 col1 col2 col3 col4
4926 0 A 2 0 a
4927 1 A 1 1 B
4928 2 B 9 9 c
4929 5 C 4 3 F
4930 4 D 7 2 e
4931 3 NaN 8 4 D
4932
4933 Sort by multiple columns
4934
4935 >>> df.sort_values(by=['col1', 'col2'])
4936 col1 col2 col3 col4
4937 1 A 1 1 B
4938 0 A 2 0 a
4939 2 B 9 9 c
4940 5 C 4 3 F
4941 4 D 7 2 e
4942 3 NaN 8 4 D
4943
4944 Sort Descending
4945
4946 >>> df.sort_values(by='col1', ascending=False)
4947 col1 col2 col3 col4
4948 4 D 7 2 e
4949 5 C 4 3 F
4950 2 B 9 9 c
4951 0 A 2 0 a
4952 1 A 1 1 B
4953 3 NaN 8 4 D
4954
4955 Putting NAs first
4956
4957 >>> df.sort_values(by='col1', ascending=False, na_position='first')
4958 col1 col2 col3 col4
4959 3 NaN 8 4 D
4960 4 D 7 2 e
4961 5 C 4 3 F
4962 2 B 9 9 c
4963 0 A 2 0 a
4964 1 A 1 1 B
4965
4966 Sorting with a key function
4967
4968 >>> df.sort_values(by='col4', key=lambda col: col.str.lower())
4969 col1 col2 col3 col4
4970 0 A 2 0 a
4971 1 A 1 1 B
4972 2 B 9 9 c
4973 3 NaN 8 4 D
4974 4 D 7 2 e
4975 5 C 4 3 F
4976
4977 Natural sort with the key argument,
        using the `natsort <https://github.com/SethMMorton/natsort>`__ package.
4979
4980 >>> df = pd.DataFrame({
4981 ... "time": ['0hr', '128hr', '72hr', '48hr', '96hr'],
4982 ... "value": [10, 20, 30, 40, 50]
4983 ... })
4984 >>> df
4985 time value
4986 0 0hr 10
4987 1 128hr 20
4988 2 72hr 30
4989 3 48hr 40
4990 4 96hr 50
4991 >>> from natsort import index_natsorted
4992 >>> df.sort_values(
4993 ... by="time",
4994 ... key=lambda x: np.argsort(index_natsorted(df["time"]))
4995 ... )
4996 time value
4997 0 0hr 10
4998 3 48hr 40
4999 2 72hr 30
5000 4 96hr 50
5001 1 128hr 20
5002 """
5003 raise AbstractMethodError(self)
5004
5005 @overload
5006 def sort_index(
5007 self,
5008 *,
5009 axis: Axis = ...,
5010 level: IndexLabel = ...,
5011 ascending: bool_t | Sequence[bool_t] = ...,
5012 inplace: Literal[True],
5013 kind: SortKind = ...,
5014 na_position: NaPosition = ...,
5015 sort_remaining: bool_t = ...,
5016 ignore_index: bool_t = ...,
5017 key: IndexKeyFunc = ...,
5018 ) -> None:
5019 ...
5020
5021 @overload
5022 def sort_index(
5023 self: NDFrameT,
5024 *,
5025 axis: Axis = ...,
5026 level: IndexLabel = ...,
5027 ascending: bool_t | Sequence[bool_t] = ...,
5028 inplace: Literal[False] = ...,
5029 kind: SortKind = ...,
5030 na_position: NaPosition = ...,
5031 sort_remaining: bool_t = ...,
5032 ignore_index: bool_t = ...,
5033 key: IndexKeyFunc = ...,
5034 ) -> NDFrameT:
5035 ...
5036
5037 @overload
5038 def sort_index(
5039 self: NDFrameT,
5040 *,
5041 axis: Axis = ...,
5042 level: IndexLabel = ...,
5043 ascending: bool_t | Sequence[bool_t] = ...,
5044 inplace: bool_t = ...,
5045 kind: SortKind = ...,
5046 na_position: NaPosition = ...,
5047 sort_remaining: bool_t = ...,
5048 ignore_index: bool_t = ...,
5049 key: IndexKeyFunc = ...,
5050 ) -> NDFrameT | None:
5051 ...
5052
5053 def sort_index(
5054 self: NDFrameT,
5055 *,
5056 axis: Axis = 0,
5057 level: IndexLabel = None,
5058 ascending: bool_t | Sequence[bool_t] = True,
5059 inplace: bool_t = False,
5060 kind: SortKind = "quicksort",
5061 na_position: NaPosition = "last",
5062 sort_remaining: bool_t = True,
5063 ignore_index: bool_t = False,
5064 key: IndexKeyFunc = None,
5065 ) -> NDFrameT | None:
5066 inplace = validate_bool_kwarg(inplace, "inplace")
5067 axis = self._get_axis_number(axis)
5068 ascending = validate_ascending(ascending)
5069
5070 target = self._get_axis(axis)
5071
5072 indexer = get_indexer_indexer(
5073 target, level, ascending, kind, na_position, sort_remaining, key
5074 )
5075
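        # A None indexer means the axis is already sorted as requested, so
        # we can skip the take and return self or a (lazy) copy directly.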
5076 if indexer is None:
5077 if inplace:
5078 result = self
5079 else:
5080 result = self.copy(deep=None)
5081
5082 if ignore_index:
5083 result.index = default_index(len(self))
5084 if inplace:
5085 return None
5086 else:
5087 return result
5088
5089 baxis = self._get_block_manager_axis(axis)
5090 new_data = self._mgr.take(indexer, axis=baxis, verify=False)
5091
5092 # reconstruct axis if needed
5093 new_data.set_axis(baxis, new_data.axes[baxis]._sort_levels_monotonic())
5094
5095 if ignore_index:
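            # set_axis here targets the block manager, whose axes are
            # transposed for DataFrames, so the row labels live on axis 1.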
5096 axis = 1 if isinstance(self, ABCDataFrame) else 0
5097 new_data.set_axis(axis, default_index(len(indexer)))
5098
5099 result = self._constructor(new_data)
5100
5101 if inplace:
5102 return self._update_inplace(result)
5103 else:
5104 return result.__finalize__(self, method="sort_index")
5105
5106 @doc(
5107 klass=_shared_doc_kwargs["klass"],
5108 optional_reindex="",
5109 )
5110 def reindex(
5111 self: NDFrameT,
5112 labels=None,
5113 index=None,
5114 columns=None,
5115 axis: Axis | None = None,
5116 method: str | None = None,
5117 copy: bool_t | None = None,
5118 level: Level | None = None,
5119 fill_value: Scalar | None = np.nan,
5120 limit: int | None = None,
5121 tolerance=None,
5122 ) -> NDFrameT:
5123 """
5124 Conform {klass} to new index with optional filling logic.
5125
5126 Places NA/NaN in locations having no value in the previous index. A new object
5127 is produced unless the new index is equivalent to the current one and
5128 ``copy=False``.
5129
5130 Parameters
5131 ----------
5132 {optional_reindex}
5133 method : {{None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}}
5134 Method to use for filling holes in reindexed DataFrame.
5135 Please note: this is only applicable to DataFrames/Series with a
5136 monotonically increasing/decreasing index.
5137
5138 * None (default): don't fill gaps
5139 * pad / ffill: Propagate last valid observation forward to next
5140 valid.
5141 * backfill / bfill: Use next valid observation to fill gap.
5142 * nearest: Use nearest valid observations to fill gap.
5143
5144 copy : bool, default True
5145 Return a new object, even if the passed indexes are the same.
5146 level : int or name
5147 Broadcast across a level, matching Index values on the
5148 passed MultiIndex level.
5149 fill_value : scalar, default np.NaN
5150 Value to use for missing values. Defaults to NaN, but can be any
5151 "compatible" value.
5152 limit : int, default None
5153 Maximum number of consecutive elements to forward or backward fill.
5154 tolerance : optional
5155 Maximum distance between original and new labels for inexact
            matches. The values of the index at the matching locations must
5157 satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
5158
5159 Tolerance may be a scalar value, which applies the same tolerance
5160 to all values, or list-like, which applies variable tolerance per
5161 element. List-like includes list, tuple, array, Series, and must be
5162 the same size as the index and its dtype must exactly match the
5163 index's type.
5164
5165 Returns
5166 -------
5167 {klass} with changed index.
5168
5169 See Also
5170 --------
5171 DataFrame.set_index : Set row labels.
5172 DataFrame.reset_index : Remove row labels or move them to new columns.
5173 DataFrame.reindex_like : Change to same indices as other DataFrame.
5174
5175 Examples
5176 --------
5177 ``DataFrame.reindex`` supports two calling conventions
5178
5179 * ``(index=index_labels, columns=column_labels, ...)``
5180 * ``(labels, axis={{'index', 'columns'}}, ...)``
5181
5182 We *highly* recommend using keyword arguments to clarify your
5183 intent.
5184
5185 Create a dataframe with some fictional data.
5186
5187 >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
5188 >>> df = pd.DataFrame({{'http_status': [200, 200, 404, 404, 301],
5189 ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}},
5190 ... index=index)
5191 >>> df
5192 http_status response_time
5193 Firefox 200 0.04
5194 Chrome 200 0.02
5195 Safari 404 0.07
5196 IE10 404 0.08
5197 Konqueror 301 1.00
5198
5199 Create a new index and reindex the dataframe. By default
5200 values in the new index that do not have corresponding
5201 records in the dataframe are assigned ``NaN``.
5202
5203 >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10',
5204 ... 'Chrome']
5205 >>> df.reindex(new_index)
5206 http_status response_time
5207 Safari 404.0 0.07
5208 Iceweasel NaN NaN
5209 Comodo Dragon NaN NaN
5210 IE10 404.0 0.08
5211 Chrome 200.0 0.02
5212
5213 We can fill in the missing values by passing a value to
5214 the keyword ``fill_value``. Because the index is not monotonically
5215 increasing or decreasing, we cannot use arguments to the keyword
5216 ``method`` to fill the ``NaN`` values.
5217
5218 >>> df.reindex(new_index, fill_value=0)
5219 http_status response_time
5220 Safari 404 0.07
5221 Iceweasel 0 0.00
5222 Comodo Dragon 0 0.00
5223 IE10 404 0.08
5224 Chrome 200 0.02
5225
5226 >>> df.reindex(new_index, fill_value='missing')
5227 http_status response_time
5228 Safari 404 0.07
5229 Iceweasel missing missing
5230 Comodo Dragon missing missing
5231 IE10 404 0.08
5232 Chrome 200 0.02
5233
5234 We can also reindex the columns.
5235
5236 >>> df.reindex(columns=['http_status', 'user_agent'])
5237 http_status user_agent
5238 Firefox 200 NaN
5239 Chrome 200 NaN
5240 Safari 404 NaN
5241 IE10 404 NaN
5242 Konqueror 301 NaN
5243
5244 Or we can use "axis-style" keyword arguments
5245
5246 >>> df.reindex(['http_status', 'user_agent'], axis="columns")
5247 http_status user_agent
5248 Firefox 200 NaN
5249 Chrome 200 NaN
5250 Safari 404 NaN
5251 IE10 404 NaN
5252 Konqueror 301 NaN
5253
5254 To further illustrate the filling functionality in
5255 ``reindex``, we will create a dataframe with a
5256 monotonically increasing index (for example, a sequence
5257 of dates).
5258
5259 >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D')
5260 >>> df2 = pd.DataFrame({{"prices": [100, 101, np.nan, 100, 89, 88]}},
5261 ... index=date_index)
5262 >>> df2
5263 prices
5264 2010-01-01 100.0
5265 2010-01-02 101.0
5266 2010-01-03 NaN
5267 2010-01-04 100.0
5268 2010-01-05 89.0
5269 2010-01-06 88.0
5270
5271 Suppose we decide to expand the dataframe to cover a wider
5272 date range.
5273
5274 >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D')
5275 >>> df2.reindex(date_index2)
5276 prices
5277 2009-12-29 NaN
5278 2009-12-30 NaN
5279 2009-12-31 NaN
5280 2010-01-01 100.0
5281 2010-01-02 101.0
5282 2010-01-03 NaN
5283 2010-01-04 100.0
5284 2010-01-05 89.0
5285 2010-01-06 88.0
5286 2010-01-07 NaN
5287
5288 The index entries that did not have a value in the original data frame
5289 (for example, '2009-12-29') are by default filled with ``NaN``.
5290 If desired, we can fill in the missing values using one of several
5291 options.
5292
        For example, to back-fill the ``NaN`` values using the next valid
        observation, pass ``bfill`` as an argument to the ``method`` keyword.
5295
5296 >>> df2.reindex(date_index2, method='bfill')
5297 prices
5298 2009-12-29 100.0
5299 2009-12-30 100.0
5300 2009-12-31 100.0
5301 2010-01-01 100.0
5302 2010-01-02 101.0
5303 2010-01-03 NaN
5304 2010-01-04 100.0
5305 2010-01-05 89.0
5306 2010-01-06 88.0
5307 2010-01-07 NaN
5308
5309 Please note that the ``NaN`` value present in the original dataframe
5310 (at index value 2010-01-03) will not be filled by any of the
5311 value propagation schemes. This is because filling while reindexing
5312 does not look at dataframe values, but only compares the original and
5313 desired indexes. If you do want to fill in the ``NaN`` values present
5314 in the original dataframe, use the ``fillna()`` method.
5315
5316 See the :ref:`user guide <basics.reindexing>` for more.
5317 """
5318 # TODO: Decide if we care about having different examples for different
5319 # kinds
5320
5321 if index is not None and columns is not None and labels is not None:
5322 raise TypeError("Cannot specify all of 'labels', 'index', 'columns'.")
5323 elif index is not None or columns is not None:
5324 if axis is not None:
5325 raise TypeError(
5326 "Cannot specify both 'axis' and any of 'index' or 'columns'"
5327 )
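            # A bare 'labels' argument fills whichever of index/columns was
            # not passed explicitly.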
5328 if labels is not None:
5329 if index is not None:
5330 columns = labels
5331 else:
5332 index = labels
5333 else:
5334 if axis and self._get_axis_number(axis) == 1:
5335 columns = labels
5336 else:
5337 index = labels
5338 axes: dict[Literal["index", "columns"], Any] = {
5339 "index": index,
5340 "columns": columns,
5341 }
5342 method = clean_reindex_fill_method(method)
5343
        # If all the axes requested to reindex are equal, then only copy if
        # indicated; index names must be equal here as well as the values.
5346 if copy and using_copy_on_write():
5347 copy = False
5348 if all(
5349 self._get_axis(axis_name).identical(ax)
5350 for axis_name, ax in axes.items()
5351 if ax is not None
5352 ):
5353 return self.copy(deep=copy)
5354
5355 # check if we are a multi reindex
5356 if self._needs_reindex_multi(axes, method, level):
5357 return self._reindex_multi(axes, copy, fill_value)
5358
5359 # perform the reindex on the axes
5360 return self._reindex_axes(
5361 axes, level, limit, tolerance, method, fill_value, copy
5362 ).__finalize__(self, method="reindex")
5363
5364 def _reindex_axes(
5365 self: NDFrameT, axes, level, limit, tolerance, method, fill_value, copy
5366 ) -> NDFrameT:
5367 """Perform the reindex for all the axes."""
5368 obj = self
5369 for a in self._AXIS_ORDERS:
5370 labels = axes[a]
5371 if labels is None:
5372 continue
5373
5374 ax = self._get_axis(a)
5375 new_index, indexer = ax.reindex(
5376 labels, level=level, limit=limit, tolerance=tolerance, method=method
5377 )
5378
5379 axis = self._get_axis_number(a)
5380 obj = obj._reindex_with_indexers(
5381 {axis: [new_index, indexer]},
5382 fill_value=fill_value,
5383 copy=copy,
5384 allow_dups=False,
5385 )
5386 # If we've made a copy once, no need to make another one
5387 copy = False
5388
5389 return obj
5390
5391 def _needs_reindex_multi(self, axes, method, level) -> bool_t:
5392 """Check if we do need a multi reindex."""
5393 return (
5394 (common.count_not_none(*axes.values()) == self._AXIS_LEN)
5395 and method is None
5396 and level is None
5397 and not self._is_mixed_type
5398 and not (
5399 self.ndim == 2
5400 and len(self.dtypes) == 1
5401 and is_extension_array_dtype(self.dtypes.iloc[0])
5402 )
5403 )
5404
5405 def _reindex_multi(self, axes, copy, fill_value):
5406 raise AbstractMethodError(self)
5407
5408 @final
5409 def _reindex_with_indexers(
5410 self: NDFrameT,
5411 reindexers,
5412 fill_value=None,
5413 copy: bool_t | None = False,
5414 allow_dups: bool_t = False,
5415 ) -> NDFrameT:
5416 """allow_dups indicates an internal call here"""
5417 # reindex doing multiple operations on different axes if indicated
5418 new_data = self._mgr
5419 for axis in sorted(reindexers.keys()):
5420 index, indexer = reindexers[axis]
5421 baxis = self._get_block_manager_axis(axis)
5422
5423 if index is None:
5424 continue
5425
5426 index = ensure_index(index)
5427 if indexer is not None:
5428 indexer = ensure_platform_int(indexer)
5429
5430 # TODO: speed up on homogeneous DataFrame objects (see _reindex_multi)
5431 new_data = new_data.reindex_indexer(
5432 index,
5433 indexer,
5434 axis=baxis,
5435 fill_value=fill_value,
5436 allow_dups=allow_dups,
5437 copy=copy,
5438 )
5439 # If we've made a copy once, no need to make another one
5440 copy = False
5441
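        # Honor an explicit (or defaulted) copy request even when nothing
        # was reindexed; under copy-on-write a shallow copy suffices.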
5442 if (
5443 (copy or copy is None)
5444 and new_data is self._mgr
5445 and not using_copy_on_write()
5446 ):
5447 new_data = new_data.copy(deep=copy)
5448 elif using_copy_on_write() and new_data is self._mgr:
5449 new_data = new_data.copy(deep=False)
5450
5451 return self._constructor(new_data).__finalize__(self)
5452
5453 def filter(
5454 self: NDFrameT,
5455 items=None,
5456 like: str | None = None,
5457 regex: str | None = None,
5458 axis: Axis | None = None,
5459 ) -> NDFrameT:
5460 """
5461 Subset the dataframe rows or columns according to the specified index labels.
5462
5463 Note that this routine does not filter a dataframe on its
5464 contents. The filter is applied to the labels of the index.
5465
5466 Parameters
5467 ----------
5468 items : list-like
5469 Keep labels from axis which are in items.
5470 like : str
5471 Keep labels from axis for which "like in label == True".
5472 regex : str (regular expression)
5473 Keep labels from axis for which re.search(regex, label) == True.
        axis : {0 or 'index', 1 or 'columns', None}, default None
5475 The axis to filter on, expressed either as an index (int)
5476 or axis name (str). By default this is the info axis, 'columns' for
5477 DataFrame. For `Series` this parameter is unused and defaults to `None`.
5478
5479 Returns
5480 -------
5481 same type as input object
5482
5483 See Also
5484 --------
5485 DataFrame.loc : Access a group of rows and columns
5486 by label(s) or a boolean array.
5487
5488 Notes
5489 -----
5490 The ``items``, ``like``, and ``regex`` parameters are
5491 enforced to be mutually exclusive.
5492
5493 ``axis`` defaults to the info axis that is used when indexing
5494 with ``[]``.
5495
5496 Examples
5497 --------
5498 >>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])),
5499 ... index=['mouse', 'rabbit'],
5500 ... columns=['one', 'two', 'three'])
5501 >>> df
5502 one two three
5503 mouse 1 2 3
5504 rabbit 4 5 6
5505
5506 >>> # select columns by name
5507 >>> df.filter(items=['one', 'three'])
5508 one three
5509 mouse 1 3
5510 rabbit 4 6
5511
5512 >>> # select columns by regular expression
5513 >>> df.filter(regex='e$', axis=1)
5514 one three
5515 mouse 1 3
5516 rabbit 4 6
5517
5518 >>> # select rows containing 'bbi'
5519 >>> df.filter(like='bbi', axis=0)
5520 one two three
5521 rabbit 4 5 6
5522 """
5523 nkw = common.count_not_none(items, like, regex)
5524 if nkw > 1:
5525 raise TypeError(
5526 "Keyword arguments `items`, `like`, or `regex` "
5527 "are mutually exclusive"
5528 )
5529
5530 if axis is None:
5531 axis = self._info_axis_name
5532 labels = self._get_axis(axis)
5533
5534 if items is not None:
5535 name = self._get_axis_name(axis)
5536 # error: Keywords must be strings
5537 return self.reindex( # type: ignore[misc]
5538 **{name: [r for r in items if r in labels]} # type: ignore[arg-type]
5539 )
5540 elif like:
5541
5542 def f(x) -> bool_t:
5543 assert like is not None # needed for mypy
5544 return like in ensure_str(x)
5545
5546 values = labels.map(f)
5547 return self.loc(axis=axis)[values]
5548 elif regex:
5549
5550 def f(x) -> bool_t:
5551 return matcher.search(ensure_str(x)) is not None
5552
5553 matcher = re.compile(regex)
5554 values = labels.map(f)
5555 return self.loc(axis=axis)[values]
5556 else:
5557 raise TypeError("Must pass either `items`, `like`, or `regex`")
5558
5559 @final
5560 def head(self: NDFrameT, n: int = 5) -> NDFrameT:
5561 """
5562 Return the first `n` rows.
5563
5564 This function returns the first `n` rows for the object based
5565 on position. It is useful for quickly testing if your object
5566 has the right type of data in it.
5567
5568 For negative values of `n`, this function returns all rows except
5569 the last `|n|` rows, equivalent to ``df[:n]``.
5570
5571 If n is larger than the number of rows, this function returns all rows.
5572
5573 Parameters
5574 ----------
5575 n : int, default 5
5576 Number of rows to select.
5577
5578 Returns
5579 -------
5580 same type as caller
5581 The first `n` rows of the caller object.
5582
5583 See Also
5584 --------
5585 DataFrame.tail: Returns the last `n` rows.
5586
5587 Examples
5588 --------
5589 >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
5590 ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']})
5591 >>> df
5592 animal
5593 0 alligator
5594 1 bee
5595 2 falcon
5596 3 lion
5597 4 monkey
5598 5 parrot
5599 6 shark
5600 7 whale
5601 8 zebra
5602
5603 Viewing the first 5 lines
5604
5605 >>> df.head()
5606 animal
5607 0 alligator
5608 1 bee
5609 2 falcon
5610 3 lion
5611 4 monkey
5612
5613 Viewing the first `n` lines (three in this case)
5614
5615 >>> df.head(3)
5616 animal
5617 0 alligator
5618 1 bee
5619 2 falcon
5620
5621 For negative values of `n`
5622
5623 >>> df.head(-3)
5624 animal
5625 0 alligator
5626 1 bee
5627 2 falcon
5628 3 lion
5629 4 monkey
5630 5 parrot
5631 """
5632 return self.iloc[:n]
5633
5634 @final
5635 def tail(self: NDFrameT, n: int = 5) -> NDFrameT:
5636 """
5637 Return the last `n` rows.
5638
5639 This function returns last `n` rows from the object based on
5640 position. It is useful for quickly verifying data, for example,
5641 after sorting or appending rows.
5642
5643 For negative values of `n`, this function returns all rows except
5644 the first `|n|` rows, equivalent to ``df[|n|:]``.
5645
5646 If n is larger than the number of rows, this function returns all rows.
5647
5648 Parameters
5649 ----------
5650 n : int, default 5
5651 Number of rows to select.
5652
5653 Returns
5654 -------
5655 type of caller
5656 The last `n` rows of the caller object.
5657
5658 See Also
5659 --------
5660 DataFrame.head : The first `n` rows of the caller object.
5661
5662 Examples
5663 --------
5664 >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
5665 ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']})
5666 >>> df
5667 animal
5668 0 alligator
5669 1 bee
5670 2 falcon
5671 3 lion
5672 4 monkey
5673 5 parrot
5674 6 shark
5675 7 whale
5676 8 zebra
5677
5678 Viewing the last 5 lines
5679
5680 >>> df.tail()
5681 animal
5682 4 monkey
5683 5 parrot
5684 6 shark
5685 7 whale
5686 8 zebra
5687
5688 Viewing the last `n` lines (three in this case)
5689
5690 >>> df.tail(3)
5691 animal
5692 6 shark
5693 7 whale
5694 8 zebra
5695
5696 For negative values of `n`
5697
5698 >>> df.tail(-3)
5699 animal
5700 3 lion
5701 4 monkey
5702 5 parrot
5703 6 shark
5704 7 whale
5705 8 zebra
5706 """
5707 if n == 0:
5708 return self.iloc[0:0]
5709 return self.iloc[-n:]
5710
5711 @final
5712 def sample(
5713 self: NDFrameT,
5714 n: int | None = None,
5715 frac: float | None = None,
5716 replace: bool_t = False,
5717 weights=None,
5718 random_state: RandomState | None = None,
5719 axis: Axis | None = None,
5720 ignore_index: bool_t = False,
5721 ) -> NDFrameT:
5722 """
5723 Return a random sample of items from an axis of object.
5724
5725 You can use `random_state` for reproducibility.
5726
5727 Parameters
5728 ----------
5729 n : int, optional
5730 Number of items from axis to return. Cannot be used with `frac`.
5731 Default = 1 if `frac` = None.
5732 frac : float, optional
5733 Fraction of axis items to return. Cannot be used with `n`.
5734 replace : bool, default False
5735 Allow or disallow sampling of the same row more than once.
5736 weights : str or ndarray-like, optional
5737 Default 'None' results in equal probability weighting.
5738 If passed a Series, will align with target object on index. Index
5739 values in weights not found in sampled object will be ignored and
5740 index values in sampled object not in weights will be assigned
5741 weights of zero.
5742 If called on a DataFrame, will accept the name of a column
5743 when axis = 0.
5744 Unless weights are a Series, weights must be same length as axis
5745 being sampled.
5746 If weights do not sum to 1, they will be normalized to sum to 1.
5747 Missing values in the weights column will be treated as zero.
5748 Infinite values not allowed.
5749 random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional
5750 If int, array-like, or BitGenerator, seed for random number generator.
5751 If np.random.RandomState or np.random.Generator, use as given.
5752
5753 .. versionchanged:: 1.1.0
5754
                array-like and BitGenerator objects are now passed to
                np.random.RandomState() as seed
5757
5758 .. versionchanged:: 1.4.0
5759
5760 np.random.Generator objects now accepted
5761
        axis : {0 or 'index', 1 or 'columns', None}, default None
5763 Axis to sample. Accepts axis number or name. Default is stat axis
5764 for given data type. For `Series` this parameter is unused and defaults to `None`.
5765 ignore_index : bool, default False
5766 If True, the resulting index will be labeled 0, 1, …, n - 1.
5767
5768 .. versionadded:: 1.3.0
5769
5770 Returns
5771 -------
5772 Series or DataFrame
5773 A new object of same type as caller containing `n` items randomly
5774 sampled from the caller object.
5775
5776 See Also
5777 --------
5778 DataFrameGroupBy.sample: Generates random samples from each group of a
5779 DataFrame object.
5780 SeriesGroupBy.sample: Generates random samples from each group of a
5781 Series object.
5782 numpy.random.choice: Generates a random sample from a given 1-D numpy
5783 array.
5784
5785 Notes
5786 -----
        If `frac` > 1, `replace` should be set to `True`.
5788
5789 Examples
5790 --------
5791 >>> df = pd.DataFrame({'num_legs': [2, 4, 8, 0],
5792 ... 'num_wings': [2, 0, 0, 0],
5793 ... 'num_specimen_seen': [10, 2, 1, 8]},
5794 ... index=['falcon', 'dog', 'spider', 'fish'])
5795 >>> df
5796 num_legs num_wings num_specimen_seen
5797 falcon 2 2 10
5798 dog 4 0 2
5799 spider 8 0 1
5800 fish 0 0 8
5801
5802 Extract 3 random elements from the ``Series`` ``df['num_legs']``:
5803 Note that we use `random_state` to ensure the reproducibility of
5804 the examples.
5805
5806 >>> df['num_legs'].sample(n=3, random_state=1)
5807 fish 0
5808 spider 8
5809 falcon 2
5810 Name: num_legs, dtype: int64
5811
5812 A random 50% sample of the ``DataFrame`` with replacement:
5813
5814 >>> df.sample(frac=0.5, replace=True, random_state=1)
5815 num_legs num_wings num_specimen_seen
5816 dog 4 0 2
5817 fish 0 0 8
5818
        An upsampled ``DataFrame`` with replacement:
        Note that the `replace` parameter has to be `True` when `frac` > 1.
5821
5822 >>> df.sample(frac=2, replace=True, random_state=1)
5823 num_legs num_wings num_specimen_seen
5824 dog 4 0 2
5825 fish 0 0 8
5826 falcon 2 2 10
5827 falcon 2 2 10
5828 fish 0 0 8
5829 dog 4 0 2
5830 fish 0 0 8
5831 dog 4 0 2
5832
5833 Using a DataFrame column as weights. Rows with larger value in the
5834 `num_specimen_seen` column are more likely to be sampled.
5835
5836 >>> df.sample(n=2, weights='num_specimen_seen', random_state=1)
5837 num_legs num_wings num_specimen_seen
5838 falcon 2 2 10
5839 fish 0 0 8
5840 """ # noqa:E501
5841 if axis is None:
5842 axis = self._stat_axis_number
5843
5844 axis = self._get_axis_number(axis)
5845 obj_len = self.shape[axis]
5846
5847 # Process random_state argument
5848 rs = common.random_state(random_state)
5849
5850 size = sample.process_sampling_size(n, frac, replace)
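        # a None size means ``frac`` was passed instead of ``n``; translate
        # the fraction of the axis length into a concrete row count below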
5851 if size is None:
5852 assert frac is not None
5853 size = round(frac * obj_len)
5854
5855 if weights is not None:
5856 weights = sample.preprocess_weights(self, weights, axis)
5857
5858 sampled_indices = sample.sample(obj_len, size, replace, weights, rs)
5859 result = self.take(sampled_indices, axis=axis)
5860
5861 if ignore_index:
5862 result.index = default_index(len(result))
5863
5864 return result
5865
5866 @final
5867 @doc(klass=_shared_doc_kwargs["klass"])
5868 def pipe(
5869 self,
5870 func: Callable[..., T] | tuple[Callable[..., T], str],
5871 *args,
5872 **kwargs,
5873 ) -> T:
5874 r"""
5875 Apply chainable functions that expect Series or DataFrames.
5876
5877 Parameters
5878 ----------
5879 func : function
5880 Function to apply to the {klass}.
            ``args`` and ``kwargs`` are passed into ``func``.
5882 Alternatively a ``(callable, data_keyword)`` tuple where
5883 ``data_keyword`` is a string indicating the keyword of
5884 ``callable`` that expects the {klass}.
5885 args : iterable, optional
5886 Positional arguments passed into ``func``.
5887 kwargs : mapping, optional
5888 A dictionary of keyword arguments passed into ``func``.
5889
5890 Returns
5891 -------
5892 the return type of ``func``.
5893
5894 See Also
5895 --------
5896 DataFrame.apply : Apply a function along input axis of DataFrame.
5897 DataFrame.applymap : Apply a function elementwise on a whole DataFrame.
5898 Series.map : Apply a mapping correspondence on a
5899 :class:`~pandas.Series`.
5900
5901 Notes
5902 -----
5903 Use ``.pipe`` when chaining together functions that expect
5904 Series, DataFrames or GroupBy objects. Instead of writing
5905
5906 >>> func(g(h(df), arg1=a), arg2=b, arg3=c) # doctest: +SKIP
5907
5908 You can write
5909
5910 >>> (df.pipe(h)
5911 ... .pipe(g, arg1=a)
5912 ... .pipe(func, arg2=b, arg3=c)
5913 ... ) # doctest: +SKIP
5914
5915 If you have a function that takes the data as (say) the second
5916 argument, pass a tuple indicating which keyword expects the
5917 data. For example, suppose ``func`` takes its data as ``arg2``:
5918
5919 >>> (df.pipe(h)
5920 ... .pipe(g, arg1=a)
5921 ... .pipe((func, 'arg2'), arg1=a, arg3=c)
5922 ... ) # doctest: +SKIP
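
        As a minimal, self-contained sketch (``subtract`` is a made-up helper,
        not a pandas function):

        >>> def subtract(df, n):
        ...     return df - n
        >>> pd.DataFrame({'a': [1, 2]}).pipe(subtract, n=1)  # doctest: +SKIP
           a
        0  0
        1  1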
5923 """
5924 if using_copy_on_write():
5925 return common.pipe(self.copy(deep=None), func, *args, **kwargs)
5926 return common.pipe(self, func, *args, **kwargs)
5927
5928 # ----------------------------------------------------------------------
5929 # Attribute access
5930
5931 @final
5932 def __finalize__(
5933 self: NDFrameT, other, method: str | None = None, **kwargs
5934 ) -> NDFrameT:
5935 """
5936 Propagate metadata from other to self.
5937
5938 Parameters
5939 ----------
5940 other : the object from which to get the attributes that we are going
5941 to propagate
5942 method : str, optional
5943 A passed method name providing context on where ``__finalize__``
5944 was called.
5945
5946 .. warning::
5947
                The value passed as `method` is not currently considered
5949 stable across pandas releases.
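
        Examples
        --------
        A minimal sketch of the propagation (the ``attrs`` entry here is
        user-defined, not a pandas field):

        >>> df = pd.DataFrame({"a": [1, 2]})
        >>> df.attrs["source"] = "sensor"
        >>> df.iloc[:1].__finalize__(df).attrs  # doctest: +SKIP
        {'source': 'sensor'}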
5950 """
5951 if isinstance(other, NDFrame):
5952 for name in other.attrs:
5953 self.attrs[name] = other.attrs[name]
5954
5955 self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels
5956 # For subclasses using _metadata.
5957 for name in set(self._metadata) & set(other._metadata):
5958 assert isinstance(name, str)
5959 object.__setattr__(self, name, getattr(other, name, None))
5960
5961 if method == "concat":
5962 attrs = other.objs[0].attrs
5963 check_attrs = all(objs.attrs == attrs for objs in other.objs[1:])
5964 if check_attrs:
5965 for name in attrs:
5966 self.attrs[name] = attrs[name]
5967
5968 allows_duplicate_labels = all(
5969 x.flags.allows_duplicate_labels for x in other.objs
5970 )
5971 self.flags.allows_duplicate_labels = allows_duplicate_labels
5972
5973 return self
5974
5975 def __getattr__(self, name: str):
5976 """
        After regular attribute access, try looking up the name.
5978 This allows simpler access to columns for interactive use.
5979 """
5980 # Note: obj.x will always call obj.__getattribute__('x') prior to
5981 # calling obj.__getattr__('x').
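        # In practice this is what makes ``df.A`` fall through to ``df["A"]``
        # when "A" is a column label rather than an existing attribute.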
5982 if (
5983 name not in self._internal_names_set
5984 and name not in self._metadata
5985 and name not in self._accessors
5986 and self._info_axis._can_hold_identifiers_and_holds_name(name)
5987 ):
5988 return self[name]
5989 return object.__getattribute__(self, name)
5990
5991 def __setattr__(self, name: str, value) -> None:
5992 """
        After regular attribute access, try setting the name.
5994 This allows simpler access to columns for interactive use.
5995 """
5996 # first try regular attribute access via __getattribute__, so that
5997 # e.g. ``obj.x`` and ``obj.x = 4`` will always reference/modify
5998 # the same attribute.
5999
6000 try:
6001 object.__getattribute__(self, name)
6002 return object.__setattr__(self, name, value)
6003 except AttributeError:
6004 pass
6005
6006 # if this fails, go on to more involved attribute setting
6007 # (note that this matches __getattr__, above).
6008 if name in self._internal_names_set:
6009 object.__setattr__(self, name, value)
6010 elif name in self._metadata:
6011 object.__setattr__(self, name, value)
6012 else:
6013 try:
6014 existing = getattr(self, name)
6015 if isinstance(existing, Index):
6016 object.__setattr__(self, name, value)
6017 elif name in self._info_axis:
6018 self[name] = value
6019 else:
6020 object.__setattr__(self, name, value)
6021 except (AttributeError, TypeError):
6022 if isinstance(self, ABCDataFrame) and (is_list_like(value)):
6023 warnings.warn(
6024 "Pandas doesn't allow columns to be "
6025 "created via a new attribute name - see "
6026 "https://pandas.pydata.org/pandas-docs/"
6027 "stable/indexing.html#attribute-access",
6028 stacklevel=find_stack_level(),
6029 )
6030 object.__setattr__(self, name, value)
6031
6032 @final
6033 def _dir_additions(self) -> set[str]:
6034 """
6035 add the string-like attributes from the info_axis.
6036 If info_axis is a MultiIndex, its first level values are used.
6037 """
6038 additions = super()._dir_additions()
6039 if self._info_axis._can_hold_strings:
6040 additions.update(self._info_axis._dir_additions_for_owner)
6041 return additions
6042
6043 # ----------------------------------------------------------------------
6044 # Consolidation of internals
6045
6046 @final
6047 def _protect_consolidate(self, f):
6048 """
6049 Consolidate _mgr -- if the blocks have changed, then clear the
6050 cache
6051 """
6052 if isinstance(self._mgr, (ArrayManager, SingleArrayManager)):
6053 return f()
6054 blocks_before = len(self._mgr.blocks)
6055 result = f()
6056 if len(self._mgr.blocks) != blocks_before:
6057 self._clear_item_cache()
6058 return result
6059
6060 @final
6061 def _consolidate_inplace(self) -> None:
6062 """Consolidate data in place and return None"""
6063
6064 def f() -> None:
6065 self._mgr = self._mgr.consolidate()
6066
6067 self._protect_consolidate(f)
6068
6069 @final
6070 def _consolidate(self):
6071 """
6072 Compute NDFrame with "consolidated" internals (data of each dtype
6073 grouped together in a single ndarray).
6074
6075 Returns
6076 -------
6077 consolidated : same type as caller
6078 """
6079 f = lambda: self._mgr.consolidate()
6080 cons_data = self._protect_consolidate(f)
6081 return self._constructor(cons_data).__finalize__(self)
6082
6083 @property
6084 def _is_mixed_type(self) -> bool_t:
6085 if self._mgr.is_single_block:
6086 return False
6087
6088 if self._mgr.any_extension_types:
6089 # Even if they have the same dtype, we can't consolidate them,
            # so we pretend this is "mixed"
6091 return True
6092
6093 return self.dtypes.nunique() > 1
6094
6095 @final
6096 def _check_inplace_setting(self, value) -> bool_t:
6097 """check whether we allow in-place setting with this type of value"""
6098 if self._is_mixed_type and not self._mgr.is_numeric_mixed_type:
6099 # allow an actual np.nan through
            if (is_float(value) and np.isnan(value)) or value is lib.no_default:
6101 return True
6102
6103 raise TypeError(
6104 "Cannot do inplace boolean setting on "
6105 "mixed-types with a non np.nan value"
6106 )
6107
6108 return True
6109
6110 @final
6111 def _get_numeric_data(self: NDFrameT) -> NDFrameT:
6112 return self._constructor(self._mgr.get_numeric_data()).__finalize__(self)
6113
6114 @final
6115 def _get_bool_data(self):
6116 return self._constructor(self._mgr.get_bool_data()).__finalize__(self)
6117
6118 # ----------------------------------------------------------------------
6119 # Internal Interface Methods
6120
6121 @property
6122 def values(self):
6123 raise AbstractMethodError(self)
6124
6125 @property
6126 def _values(self) -> ArrayLike:
6127 """internal implementation"""
6128 raise AbstractMethodError(self)
6129
6130 @property
6131 def dtypes(self):
6132 """
6133 Return the dtypes in the DataFrame.
6134
6135 This returns a Series with the data type of each column.
6136 The result's index is the original DataFrame's columns. Columns
6137 with mixed types are stored with the ``object`` dtype. See
6138 :ref:`the User Guide <basics.dtypes>` for more.
6139
6140 Returns
6141 -------
6142 pandas.Series
6143 The data type of each column.
6144
6145 Examples
6146 --------
6147 >>> df = pd.DataFrame({'float': [1.0],
6148 ... 'int': [1],
6149 ... 'datetime': [pd.Timestamp('20180310')],
6150 ... 'string': ['foo']})
6151 >>> df.dtypes
6152 float float64
6153 int int64
6154 datetime datetime64[ns]
6155 string object
6156 dtype: object
6157 """
6158 data = self._mgr.get_dtypes()
6159 return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_)
6160
6161 def astype(
6162 self: NDFrameT, dtype, copy: bool_t | None = None, errors: IgnoreRaise = "raise"
6163 ) -> NDFrameT:
6164 """
6165 Cast a pandas object to a specified dtype ``dtype``.
6166
6167 Parameters
6168 ----------
6169 dtype : str, data type, Series or Mapping of column name -> data type
6170 Use a str, numpy.dtype, pandas.ExtensionDtype or Python type to
6171 cast entire pandas object to the same type. Alternatively, use a
6172 mapping, e.g. {col: dtype, ...}, where col is a column label and dtype is
6173 a numpy.dtype or Python type to cast one or more of the DataFrame's
6174 columns to column-specific types.
6175 copy : bool, default True
6176 Return a copy when ``copy=True`` (be very careful setting
6177 ``copy=False`` as changes to values then may propagate to other
6178 pandas objects).
6179 errors : {'raise', 'ignore'}, default 'raise'
6180 Control raising of exceptions on invalid data for provided dtype.
6181
6182 - ``raise`` : allow exceptions to be raised
6183 - ``ignore`` : suppress exceptions. On error return original object.
6184
6185 Returns
6186 -------
6187 same type as caller
6188
6189 See Also
6190 --------
6191 to_datetime : Convert argument to datetime.
6192 to_timedelta : Convert argument to timedelta.
6193 to_numeric : Convert argument to a numeric type.
6194 numpy.ndarray.astype : Cast a numpy array to a specified type.
6195
6196 Notes
6197 -----
6198 .. versionchanged:: 2.0.0
6199
6200 Using ``astype`` to convert from timezone-naive dtype to
6201 timezone-aware dtype will raise an exception.
6202 Use :meth:`Series.dt.tz_localize` instead.
6203
6204 Examples
6205 --------
6206 Create a DataFrame:
6207
6208 >>> d = {'col1': [1, 2], 'col2': [3, 4]}
6209 >>> df = pd.DataFrame(data=d)
6210 >>> df.dtypes
6211 col1 int64
6212 col2 int64
6213 dtype: object
6214
6215 Cast all columns to int32:
6216
6217 >>> df.astype('int32').dtypes
6218 col1 int32
6219 col2 int32
6220 dtype: object
6221
6222 Cast col1 to int32 using a dictionary:
6223
6224 >>> df.astype({'col1': 'int32'}).dtypes
6225 col1 int32
6226 col2 int64
6227 dtype: object
6228
6229 Create a series:
6230
6231 >>> ser = pd.Series([1, 2], dtype='int32')
6232 >>> ser
6233 0 1
6234 1 2
6235 dtype: int32
6236 >>> ser.astype('int64')
6237 0 1
6238 1 2
6239 dtype: int64
6240
6241 Convert to categorical type:
6242
6243 >>> ser.astype('category')
6244 0 1
6245 1 2
6246 dtype: category
6247 Categories (2, int32): [1, 2]
6248
6249 Convert to ordered categorical type with custom ordering:
6250
6251 >>> from pandas.api.types import CategoricalDtype
6252 >>> cat_dtype = CategoricalDtype(
6253 ... categories=[2, 1], ordered=True)
6254 >>> ser.astype(cat_dtype)
6255 0 1
6256 1 2
6257 dtype: category
6258 Categories (2, int64): [2 < 1]
6259
6260 Create a series of dates:
6261
6262 >>> ser_date = pd.Series(pd.date_range('20200101', periods=3))
6263 >>> ser_date
6264 0 2020-01-01
6265 1 2020-01-02
6266 2 2020-01-03
6267 dtype: datetime64[ns]
6268 """
6269 if copy and using_copy_on_write():
6270 copy = False
6271
6272 if is_dict_like(dtype):
6273 if self.ndim == 1: # i.e. Series
6274 if len(dtype) > 1 or self.name not in dtype:
6275 raise KeyError(
6276 "Only the Series name can be used for "
6277 "the key in Series dtype mappings."
6278 )
6279 new_type = dtype[self.name]
6280 return self.astype(new_type, copy, errors)
6281
            # GH#44417 cast to Series so we can use .iat below, which will be
            # robust in case we have duplicate column names
6284 from pandas import Series
6285
6286 dtype_ser = Series(dtype, dtype=object)
6287
6288 for col_name in dtype_ser.index:
6289 if col_name not in self:
6290 raise KeyError(
6291 "Only a column name can be used for the "
6292 "key in a dtype mappings argument. "
6293 f"'{col_name}' not found in columns."
6294 )
6295
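            # align the mapping with our columns; columns absent from the
            # mapping get fill_value=None, which the loop below treats as
            # "leave this column's dtype unchanged"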
6296 dtype_ser = dtype_ser.reindex(self.columns, fill_value=None, copy=False)
6297
6298 results = []
6299 for i, (col_name, col) in enumerate(self.items()):
6300 cdt = dtype_ser.iat[i]
6301 if isna(cdt):
6302 res_col = col.copy(deep=copy)
6303 else:
6304 try:
6305 res_col = col.astype(dtype=cdt, copy=copy, errors=errors)
6306 except ValueError as ex:
6307 ex.args = (
6308 f"{ex}: Error while type casting for column '{col_name}'",
6309 )
6310 raise
6311 results.append(res_col)
6312
6313 elif is_extension_array_dtype(dtype) and self.ndim > 1:
6314 # GH 18099/22869: columnwise conversion to extension dtype
6315 # GH 24704: use iloc to handle duplicate column names
6316 # TODO(EA2D): special case not needed with 2D EAs
6317 results = [
6318 self.iloc[:, i].astype(dtype, copy=copy)
6319 for i in range(len(self.columns))
6320 ]
6321
6322 else:
6323 # else, only a single dtype is given
6324 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
6325 return self._constructor(new_data).__finalize__(self, method="astype")
6326
6327 # GH 33113: handle empty frame or series
6328 if not results:
6329 return self.copy(deep=None)
6330
6331 # GH 19920: retain column metadata after concat
6332 result = concat(results, axis=1, copy=False)
6333 # GH#40810 retain subclass
6334 # error: Incompatible types in assignment
6335 # (expression has type "NDFrameT", variable has type "DataFrame")
6336 result = self._constructor(result) # type: ignore[assignment]
6337 result.columns = self.columns
6338 result = result.__finalize__(self, method="astype")
6339 # https://github.com/python/mypy/issues/8354
6340 return cast(NDFrameT, result)
6341
6342 @final
6343 def copy(self: NDFrameT, deep: bool_t | None = True) -> NDFrameT:
6344 """
6345 Make a copy of this object's indices and data.
6346
6347 When ``deep=True`` (default), a new object will be created with a
6348 copy of the calling object's data and indices. Modifications to
6349 the data or indices of the copy will not be reflected in the
6350 original object (see notes below).
6351
6352 When ``deep=False``, a new object will be created without copying
6353 the calling object's data or index (only references to the data
6354 and index are copied). Any changes to the data of the original
6355 will be reflected in the shallow copy (and vice versa).
6356
6357 Parameters
6358 ----------
6359 deep : bool, default True
6360 Make a deep copy, including a copy of the data and the indices.
6361 With ``deep=False`` neither the indices nor the data are copied.
6362
6363 Returns
6364 -------
6365 Series or DataFrame
6366 Object type matches caller.
6367
6368 Notes
6369 -----
6370 When ``deep=True``, data is copied but actual Python objects
6371 will not be copied recursively, only the reference to the object.
6372 This is in contrast to `copy.deepcopy` in the Standard Library,
6373 which recursively copies object data (see examples below).
6374
6375 While ``Index`` objects are copied when ``deep=True``, the underlying
6376 numpy array is not copied for performance reasons. Since ``Index`` is
6377 immutable, the underlying data can be safely shared and a copy
6378 is not needed.
6379
6380 Since pandas is not thread safe, see the
6381 :ref:`gotchas <gotchas.thread-safety>` when copying in a threading
6382 environment.
6383
6384 Examples
6385 --------
6386 >>> s = pd.Series([1, 2], index=["a", "b"])
6387 >>> s
6388 a 1
6389 b 2
6390 dtype: int64
6391
6392 >>> s_copy = s.copy()
6393 >>> s_copy
6394 a 1
6395 b 2
6396 dtype: int64
6397
6398 **Shallow copy versus default (deep) copy:**
6399
6400 >>> s = pd.Series([1, 2], index=["a", "b"])
6401 >>> deep = s.copy()
6402 >>> shallow = s.copy(deep=False)
6403
6404 Shallow copy shares data and index with original.
6405
6406 >>> s is shallow
6407 False
6408 >>> s.values is shallow.values and s.index is shallow.index
6409 True
6410
6411 Deep copy has own copy of data and index.
6412
6413 >>> s is deep
6414 False
6415 >>> s.values is deep.values or s.index is deep.index
6416 False
6417
        Updates to the data shared by shallow copy and original are reflected
6419 in both; deep copy remains unchanged.
6420
6421 >>> s[0] = 3
6422 >>> shallow[1] = 4
6423 >>> s
6424 a 3
6425 b 4
6426 dtype: int64
6427 >>> shallow
6428 a 3
6429 b 4
6430 dtype: int64
6431 >>> deep
6432 a 1
6433 b 2
6434 dtype: int64
6435
6436 Note that when copying an object containing Python objects, a deep copy
6437 will copy the data, but will not do so recursively. Updating a nested
6438 data object will be reflected in the deep copy.
6439
6440 >>> s = pd.Series([[1, 2], [3, 4]])
6441 >>> deep = s.copy()
6442 >>> s[0][0] = 10
6443 >>> s
6444 0 [10, 2]
6445 1 [3, 4]
6446 dtype: object
6447 >>> deep
6448 0 [10, 2]
6449 1 [3, 4]
6450 dtype: object
6451 """
6452 data = self._mgr.copy(deep=deep)
6453 self._clear_item_cache()
6454 return self._constructor(data).__finalize__(self, method="copy")
6455
6456 @final
6457 def __copy__(self: NDFrameT, deep: bool_t = True) -> NDFrameT:
6458 return self.copy(deep=deep)
6459
6460 @final
6461 def __deepcopy__(self: NDFrameT, memo=None) -> NDFrameT:
6462 """
6463 Parameters
6464 ----------
6465 memo, default None
6466 Standard signature. Unused
6467 """
6468 return self.copy(deep=True)
6469
6470 @final
6471 def infer_objects(self: NDFrameT, copy: bool_t | None = None) -> NDFrameT:
6472 """
6473 Attempt to infer better dtypes for object columns.
6474
6475 Attempts soft conversion of object-dtyped
6476 columns, leaving non-object and unconvertible
6477 columns unchanged. The inference rules are the
6478 same as during normal Series/DataFrame construction.
6479
6480 Parameters
6481 ----------
6482 copy : bool, default True
6483 Whether to make a copy for non-object or non-inferrable columns
6484 or Series.
6485
6486 Returns
6487 -------
6488 same type as input object
6489
6490 See Also
6491 --------
6492 to_datetime : Convert argument to datetime.
6493 to_timedelta : Convert argument to timedelta.
6494 to_numeric : Convert argument to numeric type.
6495 convert_dtypes : Convert argument to best possible dtype.
6496
6497 Examples
6498 --------
6499 >>> df = pd.DataFrame({"A": ["a", 1, 2, 3]})
6500 >>> df = df.iloc[1:]
6501 >>> df
6502 A
6503 1 1
6504 2 2
6505 3 3
6506
6507 >>> df.dtypes
6508 A object
6509 dtype: object
6510
6511 >>> df.infer_objects().dtypes
6512 A int64
6513 dtype: object
6514 """
6515 new_mgr = self._mgr.convert(copy=copy)
6516 return self._constructor(new_mgr).__finalize__(self, method="infer_objects")
6517
6518 @final
6519 def convert_dtypes(
6520 self: NDFrameT,
6521 infer_objects: bool_t = True,
6522 convert_string: bool_t = True,
6523 convert_integer: bool_t = True,
6524 convert_boolean: bool_t = True,
6525 convert_floating: bool_t = True,
6526 dtype_backend: DtypeBackend = "numpy_nullable",
6527 ) -> NDFrameT:
6528 """
6529 Convert columns to the best possible dtypes using dtypes supporting ``pd.NA``.
6530
6531 Parameters
6532 ----------
6533 infer_objects : bool, default True
6534 Whether object dtypes should be converted to the best possible types.
6535 convert_string : bool, default True
6536 Whether object dtypes should be converted to ``StringDtype()``.
6537 convert_integer : bool, default True
6538 Whether, if possible, conversion can be done to integer extension types.
        convert_boolean : bool, default True
            Whether object dtypes should be converted to ``BooleanDtypes()``.
        convert_floating : bool, default True
            Whether, if possible, conversion can be done to floating extension types.
            If `convert_integer` is also True, preference will be given to integer
            dtypes if the floats can be faithfully cast to integers.
6545
6546 .. versionadded:: 1.2.0
        dtype_backend : {"numpy_nullable", "pyarrow"}, default "numpy_nullable"
            Which dtype_backend to use: if "numpy_nullable" is set, nullable
            dtypes are used for all dtypes that have a nullable implementation;
            if "pyarrow" is set, pyarrow-backed dtypes are used for all dtypes.

            The dtype_backends are still experimental.
6554
6555 .. versionadded:: 2.0
6556
6557 Returns
6558 -------
6559 Series or DataFrame
6560 Copy of input object with new dtype.
6561
6562 See Also
6563 --------
6564 infer_objects : Infer dtypes of objects.
6565 to_datetime : Convert argument to datetime.
6566 to_timedelta : Convert argument to timedelta.
6567 to_numeric : Convert argument to a numeric type.
6568
6569 Notes
6570 -----
6571 By default, ``convert_dtypes`` will attempt to convert a Series (or each
6572 Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options
6573 ``convert_string``, ``convert_integer``, ``convert_boolean`` and
6574 ``convert_floating``, it is possible to turn off individual conversions
6575 to ``StringDtype``, the integer extension types, ``BooleanDtype``
6576 or floating extension types, respectively.
6577
6578 For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference
6579 rules as during normal Series/DataFrame construction. Then, if possible,
6580 convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer
6581 or floating extension type, otherwise leave as ``object``.
6582
6583 If the dtype is integer, convert to an appropriate integer extension type.
6584
6585 If the dtype is numeric, and consists of all integers, convert to an
6586 appropriate integer extension type. Otherwise, convert to an
6587 appropriate floating extension type.
6588
6589 .. versionchanged:: 1.2
6590 Starting with pandas 1.2, this method also converts float columns
6591 to the nullable floating extension type.
6592
6593 In the future, as new dtypes are added that support ``pd.NA``, the results
6594 of this method will change to support those new dtypes.
6595
6596 Examples
6597 --------
6598 >>> df = pd.DataFrame(
6599 ... {
6600 ... "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
6601 ... "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
6602 ... "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")),
6603 ... "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")),
6604 ... "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")),
6605 ... "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
6606 ... }
6607 ... )
6608
6609 Start with a DataFrame with default dtypes.
6610
6611 >>> df
6612 a b c d e f
6613 0 1 x True h 10.0 NaN
6614 1 2 y False i NaN 100.5
6615 2 3 z NaN NaN 20.0 200.0
6616
6617 >>> df.dtypes
6618 a int32
6619 b object
6620 c object
6621 d object
6622 e float64
6623 f float64
6624 dtype: object
6625
6626 Convert the DataFrame to use best possible dtypes.
6627
6628 >>> dfn = df.convert_dtypes()
6629 >>> dfn
6630 a b c d e f
6631 0 1 x True h 10 <NA>
6632 1 2 y False i <NA> 100.5
6633 2 3 z <NA> <NA> 20 200.0
6634
6635 >>> dfn.dtypes
6636 a Int32
6637 b string[python]
6638 c boolean
6639 d string[python]
6640 e Int64
6641 f Float64
6642 dtype: object
6643
6644 Start with a Series of strings and missing data represented by ``np.nan``.
6645
6646 >>> s = pd.Series(["a", "b", np.nan])
6647 >>> s
6648 0 a
6649 1 b
6650 2 NaN
6651 dtype: object
6652
6653 Obtain a Series with dtype ``StringDtype``.
6654
6655 >>> s.convert_dtypes()
6656 0 a
6657 1 b
6658 2 <NA>
6659 dtype: string
6660 """
6661 check_dtype_backend(dtype_backend)
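        # a Series converts directly; a DataFrame converts column by column
        # and is reassembled with concat below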
6662 if self.ndim == 1:
6663 return self._convert_dtypes(
6664 infer_objects,
6665 convert_string,
6666 convert_integer,
6667 convert_boolean,
6668 convert_floating,
6669 dtype_backend=dtype_backend,
6670 )
6671 else:
6672 results = [
6673 col._convert_dtypes(
6674 infer_objects,
6675 convert_string,
6676 convert_integer,
6677 convert_boolean,
6678 convert_floating,
6679 dtype_backend=dtype_backend,
6680 )
6681 for col_name, col in self.items()
6682 ]
6683 if len(results) > 0:
6684 result = concat(results, axis=1, copy=False, keys=self.columns)
6685 cons = cast(Type["DataFrame"], self._constructor)
6686 result = cons(result)
6687 result = result.__finalize__(self, method="convert_dtypes")
6688 # https://github.com/python/mypy/issues/8354
6689 return cast(NDFrameT, result)
6690 else:
6691 return self.copy(deep=None)
6692
6693 # ----------------------------------------------------------------------
6694 # Filling NA's
6695
6696 @overload
6697 def fillna(
6698 self: NDFrameT,
6699 value: Hashable | Mapping | Series | DataFrame = ...,
6700 *,
6701 method: FillnaOptions | None = ...,
6702 axis: Axis | None = ...,
6703 inplace: Literal[False] = ...,
6704 limit: int | None = ...,
6705 downcast: dict | None = ...,
6706 ) -> NDFrameT:
6707 ...
6708
6709 @overload
6710 def fillna(
6711 self,
6712 value: Hashable | Mapping | Series | DataFrame = ...,
6713 *,
6714 method: FillnaOptions | None = ...,
6715 axis: Axis | None = ...,
6716 inplace: Literal[True],
6717 limit: int | None = ...,
6718 downcast: dict | None = ...,
6719 ) -> None:
6720 ...
6721
6722 @overload
6723 def fillna(
6724 self: NDFrameT,
6725 value: Hashable | Mapping | Series | DataFrame = ...,
6726 *,
6727 method: FillnaOptions | None = ...,
6728 axis: Axis | None = ...,
6729 inplace: bool_t = ...,
6730 limit: int | None = ...,
6731 downcast: dict | None = ...,
6732 ) -> NDFrameT | None:
6733 ...
6734
6735 @doc(**_shared_doc_kwargs)
6736 def fillna(
6737 self: NDFrameT,
6738 value: Hashable | Mapping | Series | DataFrame = None,
6739 *,
6740 method: FillnaOptions | None = None,
6741 axis: Axis | None = None,
6742 inplace: bool_t = False,
6743 limit: int | None = None,
6744 downcast: dict | None = None,
6745 ) -> NDFrameT | None:
6746 """
6747 Fill NA/NaN values using the specified method.
6748
6749 Parameters
6750 ----------
6751 value : scalar, dict, Series, or DataFrame
6752 Value to use to fill holes (e.g. 0), alternately a
6753 dict/Series/DataFrame of values specifying which value to use for
6754 each index (for a Series) or column (for a DataFrame). Values not
6755 in the dict/Series/DataFrame will not be filled. This value cannot
6756 be a list.
6757 method : {{'backfill', 'bfill', 'ffill', None}}, default None
6758 Method to use for filling holes in reindexed Series:
6759
6760 * ffill: propagate last valid observation forward to next valid.
6761 * backfill / bfill: use next valid observation to fill gap.
6762
6763 axis : {axes_single_arg}
6764 Axis along which to fill missing values. For `Series`
6765 this parameter is unused and defaults to 0.
6766 inplace : bool, default False
6767 If True, fill in-place. Note: this will modify any
6768 other views on this object (e.g., a no-copy slice for a column in a
6769 DataFrame).
6770 limit : int, default None
6771 If method is specified, this is the maximum number of consecutive
6772 NaN values to forward/backward fill. In other words, if there is
6773 a gap with more than this number of consecutive NaNs, it will only
6774 be partially filled. If method is not specified, this is the
6775 maximum number of entries along the entire axis where NaNs will be
6776 filled. Must be greater than 0 if not None.
6777 downcast : dict, default is None
6778 A dict of item->dtype of what to downcast if possible,
6779 or the string 'infer' which will try to downcast to an appropriate
6780 equal type (e.g. float64 to int64 if possible).
6781
6782 Returns
6783 -------
6784 {klass} or None
6785 Object with missing values filled or None if ``inplace=True``.
6786
6787 See Also
6788 --------
6789 interpolate : Fill NaN values using interpolation.
6790 reindex : Conform object to new index.
6791 asfreq : Convert TimeSeries to specified frequency.
6792
6793 Examples
6794 --------
6795 >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0],
6796 ... [3, 4, np.nan, 1],
6797 ... [np.nan, np.nan, np.nan, np.nan],
6798 ... [np.nan, 3, np.nan, 4]],
6799 ... columns=list("ABCD"))
6800 >>> df
6801 A B C D
6802 0 NaN 2.0 NaN 0.0
6803 1 3.0 4.0 NaN 1.0
6804 2 NaN NaN NaN NaN
6805 3 NaN 3.0 NaN 4.0
6806
6807 Replace all NaN elements with 0s.
6808
6809 >>> df.fillna(0)
6810 A B C D
6811 0 0.0 2.0 0.0 0.0
6812 1 3.0 4.0 0.0 1.0
6813 2 0.0 0.0 0.0 0.0
6814 3 0.0 3.0 0.0 4.0
6815
6816 We can also propagate non-null values forward or backward.
6817
6818 >>> df.fillna(method="ffill")
6819 A B C D
6820 0 NaN 2.0 NaN 0.0
6821 1 3.0 4.0 NaN 1.0
6822 2 3.0 4.0 NaN 1.0
6823 3 3.0 3.0 NaN 4.0
6824
6825 Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1,
6826 2, and 3 respectively.
6827
6828 >>> values = {{"A": 0, "B": 1, "C": 2, "D": 3}}
6829 >>> df.fillna(value=values)
6830 A B C D
6831 0 0.0 2.0 2.0 0.0
6832 1 3.0 4.0 2.0 1.0
6833 2 0.0 1.0 2.0 3.0
6834 3 0.0 3.0 2.0 4.0
6835
6836 Only replace the first NaN element.
6837
6838 >>> df.fillna(value=values, limit=1)
6839 A B C D
6840 0 0.0 2.0 2.0 0.0
6841 1 3.0 4.0 NaN 1.0
6842 2 NaN 1.0 NaN 3.0
6843 3 NaN 3.0 NaN 4.0
6844
6845 When filling using a DataFrame, replacement happens along
        the same column names and same indices.
6847
6848 >>> df2 = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCE"))
6849 >>> df.fillna(df2)
6850 A B C D
6851 0 0.0 2.0 0.0 0.0
6852 1 3.0 4.0 0.0 1.0
6853 2 0.0 0.0 0.0 NaN
6854 3 0.0 3.0 0.0 4.0
6855
6856 Note that column D is not affected since it is not present in df2.
6857 """
6858 inplace = validate_bool_kwarg(inplace, "inplace")
6859 value, method = validate_fillna_kwargs(value, method)
6860
        # set the default here, so functions examining the signature
6862 # can detect if something was set (e.g. in groupby) (GH9221)
6863 if axis is None:
6864 axis = 0
6865 axis = self._get_axis_number(axis)
6866
6867 if value is None:
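            # no fill value was given, so ``method`` (ffill/bfill) drives the
            # fill; the manager implements these via its interpolate machinery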
6868 if not self._mgr.is_single_block and axis == 1:
6869 if inplace:
6870 raise NotImplementedError()
6871 result = self.T.fillna(method=method, limit=limit).T
6872
6873 return result
6874
6875 new_data = self._mgr.interpolate(
6876 method=method,
6877 axis=axis,
6878 limit=limit,
6879 inplace=inplace,
6880 downcast=downcast,
6881 )
6882 else:
6883 if self.ndim == 1:
6884 if isinstance(value, (dict, ABCSeries)):
6885 if not len(value):
6886 # test_fillna_nonscalar
6887 if inplace:
6888 return None
6889 return self.copy(deep=None)
6890 from pandas import Series
6891
6892 value = Series(value)
6893 value = value.reindex(self.index, copy=False)
6894 value = value._values
6895 elif not is_list_like(value):
6896 pass
6897 else:
6898 raise TypeError(
6899 '"value" parameter must be a scalar, dict '
6900 "or Series, but you passed a "
6901 f'"{type(value).__name__}"'
6902 )
6903
6904 new_data = self._mgr.fillna(
6905 value=value, limit=limit, inplace=inplace, downcast=downcast
6906 )
6907
6908 elif isinstance(value, (dict, ABCSeries)):
6909 if axis == 1:
6910 raise NotImplementedError(
6911 "Currently only can fill "
6912 "with dict/Series column "
6913 "by column"
6914 )
6915 if using_copy_on_write():
6916 result = self.copy(deep=None)
6917 else:
6918 result = self if inplace else self.copy()
6919 is_dict = isinstance(downcast, dict)
6920 for k, v in value.items():
6921 if k not in result:
6922 continue
6923
6924 # error: Item "None" of "Optional[Dict[Any, Any]]" has no
6925 # attribute "get"
6926 downcast_k = (
6927 downcast
6928 if not is_dict
6929 else downcast.get(k) # type: ignore[union-attr]
6930 )
6931
6932 res_k = result[k].fillna(v, limit=limit, downcast=downcast_k)
6933
6934 if not inplace:
6935 result[k] = res_k
6936 else:
6937 # We can write into our existing column(s) iff dtype
6938 # was preserved.
6939 if isinstance(res_k, ABCSeries):
6940 # i.e. 'k' only shows up once in self.columns
6941 if res_k.dtype == result[k].dtype:
6942 result.loc[:, k] = res_k
6943 else:
6944 # Different dtype -> no way to do inplace.
6945 result[k] = res_k
6946 else:
6947 # see test_fillna_dict_inplace_nonunique_columns
6948 locs = result.columns.get_loc(k)
6949 if isinstance(locs, slice):
6950 locs = np.arange(self.shape[1])[locs]
6951 elif (
6952 isinstance(locs, np.ndarray) and locs.dtype.kind == "b"
6953 ):
6954 locs = locs.nonzero()[0]
6955 elif not (
6956 isinstance(locs, np.ndarray) and locs.dtype.kind == "i"
6957 ):
6958 # Should never be reached, but let's cover our bases
6959 raise NotImplementedError(
6960 "Unexpected get_loc result, please report a bug at "
6961 "https://github.com/pandas-dev/pandas"
6962 )
6963
6964 for i, loc in enumerate(locs):
6965 res_loc = res_k.iloc[:, i]
6966 target = self.iloc[:, loc]
6967
6968 if res_loc.dtype == target.dtype:
6969 result.iloc[:, loc] = res_loc
6970 else:
6971 result.isetitem(loc, res_loc)
6972 if inplace:
6973 return self._update_inplace(result)
6974 else:
6975 return result
6976
6977 elif not is_list_like(value):
6978 if axis == 1:
6979 result = self.T.fillna(value=value, limit=limit).T
6980
6981 new_data = result
6982 else:
6983 new_data = self._mgr.fillna(
6984 value=value, limit=limit, inplace=inplace, downcast=downcast
6985 )
6986 elif isinstance(value, ABCDataFrame) and self.ndim == 2:
6987 new_data = self.where(self.notna(), value)._mgr
6988 else:
6989 raise ValueError(f"invalid fill value with a {type(value)}")
6990
6991 result = self._constructor(new_data)
6992 if inplace:
6993 return self._update_inplace(result)
6994 else:
6995 return result.__finalize__(self, method="fillna")
6996
6997 @overload
6998 def ffill(
6999 self: NDFrameT,
7000 *,
7001 axis: None | Axis = ...,
7002 inplace: Literal[False] = ...,
7003 limit: None | int = ...,
7004 downcast: dict | None = ...,
7005 ) -> NDFrameT:
7006 ...
7007
7008 @overload
7009 def ffill(
7010 self,
7011 *,
7012 axis: None | Axis = ...,
7013 inplace: Literal[True],
7014 limit: None | int = ...,
7015 downcast: dict | None = ...,
7016 ) -> None:
7017 ...
7018
7019 @overload
7020 def ffill(
7021 self: NDFrameT,
7022 *,
7023 axis: None | Axis = ...,
7024 inplace: bool_t = ...,
7025 limit: None | int = ...,
7026 downcast: dict | None = ...,
7027 ) -> NDFrameT | None:
7028 ...
7029
7030 @doc(klass=_shared_doc_kwargs["klass"])
7031 def ffill(
7032 self: NDFrameT,
7033 *,
7034 axis: None | Axis = None,
7035 inplace: bool_t = False,
7036 limit: None | int = None,
7037 downcast: dict | None = None,
7038 ) -> NDFrameT | None:
7039 """
7040 Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``.
7041
7042 Returns
7043 -------
7044 {klass} or None
7045 Object with missing values filled or None if ``inplace=True``.
7046 """
7047 return self.fillna(
7048 method="ffill", axis=axis, inplace=inplace, limit=limit, downcast=downcast
7049 )
7050
7051 @doc(klass=_shared_doc_kwargs["klass"])
7052 def pad(
7053 self: NDFrameT,
7054 *,
7055 axis: None | Axis = None,
7056 inplace: bool_t = False,
7057 limit: None | int = None,
7058 downcast: dict | None = None,
7059 ) -> NDFrameT | None:
7060 """
7061 Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``.
7062
7063 .. deprecated:: 2.0
7064
7065 {klass}.pad is deprecated. Use {klass}.ffill instead.
7066
7067 Returns
7068 -------
7069 {klass} or None
7070 Object with missing values filled or None if ``inplace=True``.
7071 """
7072 warnings.warn(
7073 "DataFrame.pad/Series.pad is deprecated. Use "
7074 "DataFrame.ffill/Series.ffill instead",
7075 FutureWarning,
7076 stacklevel=find_stack_level(),
7077 )
7078 return self.ffill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
7079
7080 @overload
7081 def bfill(
7082 self: NDFrameT,
7083 *,
7084 axis: None | Axis = ...,
7085 inplace: Literal[False] = ...,
7086 limit: None | int = ...,
7087 downcast: dict | None = ...,
7088 ) -> NDFrameT:
7089 ...
7090
7091 @overload
7092 def bfill(
7093 self,
7094 *,
7095 axis: None | Axis = ...,
7096 inplace: Literal[True],
7097 limit: None | int = ...,
7098 downcast: dict | None = ...,
7099 ) -> None:
7100 ...
7101
7102 @overload
7103 def bfill(
7104 self: NDFrameT,
7105 *,
7106 axis: None | Axis = ...,
7107 inplace: bool_t = ...,
7108 limit: None | int = ...,
7109 downcast: dict | None = ...,
7110 ) -> NDFrameT | None:
7111 ...
7112
7113 @doc(klass=_shared_doc_kwargs["klass"])
7114 def bfill(
7115 self: NDFrameT,
7116 *,
7117 axis: None | Axis = None,
7118 inplace: bool_t = False,
7119 limit: None | int = None,
7120 downcast: dict | None = None,
7121 ) -> NDFrameT | None:
7122 """
7123 Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``.
7124
7125 Returns
7126 -------
7127 {klass} or None
7128 Object with missing values filled or None if ``inplace=True``.
7129 """
7130 return self.fillna(
7131 method="bfill", axis=axis, inplace=inplace, limit=limit, downcast=downcast
7132 )
7133
7134 @doc(klass=_shared_doc_kwargs["klass"])
7135 def backfill(
7136 self: NDFrameT,
7137 *,
7138 axis: None | Axis = None,
7139 inplace: bool_t = False,
7140 limit: None | int = None,
7141 downcast: dict | None = None,
7142 ) -> NDFrameT | None:
7143 """
7144 Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``.
7145
7146 .. deprecated:: 2.0
7147
7148 {klass}.backfill is deprecated. Use {klass}.bfill instead.
7149
7150 Returns
7151 -------
7152 {klass} or None
7153 Object with missing values filled or None if ``inplace=True``.
7154 """
7155 warnings.warn(
7156 "DataFrame.backfill/Series.backfill is deprecated. Use "
7157 "DataFrame.bfill/Series.bfill instead",
7158 FutureWarning,
7159 stacklevel=find_stack_level(),
7160 )
7161 return self.bfill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
7162
7163 @overload
7164 def replace(
7165 self: NDFrameT,
7166 to_replace=...,
7167 value=...,
7168 *,
7169 inplace: Literal[False] = ...,
7170 limit: int | None = ...,
7171 regex: bool_t = ...,
7172 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
7173 ) -> NDFrameT:
7174 ...
7175
7176 @overload
7177 def replace(
7178 self,
7179 to_replace=...,
7180 value=...,
7181 *,
7182 inplace: Literal[True],
7183 limit: int | None = ...,
7184 regex: bool_t = ...,
7185 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
7186 ) -> None:
7187 ...
7188
7189 @overload
7190 def replace(
7191 self: NDFrameT,
7192 to_replace=...,
7193 value=...,
7194 *,
7195 inplace: bool_t = ...,
7196 limit: int | None = ...,
7197 regex: bool_t = ...,
7198 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
7199 ) -> NDFrameT | None:
7200 ...
7201
7202 @doc(
7203 _shared_docs["replace"],
7204 klass=_shared_doc_kwargs["klass"],
7205 inplace=_shared_doc_kwargs["inplace"],
7206 replace_iloc=_shared_doc_kwargs["replace_iloc"],
7207 )
7208 def replace(
7209 self: NDFrameT,
7210 to_replace=None,
7211 value=lib.no_default,
7212 *,
7213 inplace: bool_t = False,
7214 limit: int | None = None,
7215 regex: bool_t = False,
7216 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = lib.no_default,
7217 ) -> NDFrameT | None:
7218 if not (
7219 is_scalar(to_replace)
7220 or is_re_compilable(to_replace)
7221 or is_list_like(to_replace)
7222 ):
7223 raise TypeError(
7224 "Expecting 'to_replace' to be either a scalar, array-like, "
7225 "dict or None, got invalid type "
7226 f"{repr(type(to_replace).__name__)}"
7227 )
7228
7229 inplace = validate_bool_kwarg(inplace, "inplace")
7230 if not is_bool(regex) and to_replace is not None:
7231 raise ValueError("'to_replace' must be 'None' if 'regex' is not a bool")
7232
7233 if value is lib.no_default or method is not lib.no_default:
7234 # GH#36984 if the user explicitly passes value=None we want to
7235 # respect that. We have the corner case where the user explicitly
7236 # passes value=None *and* a method, which we interpret as meaning
7237 # they want the (documented) default behavior.
7238 if method is lib.no_default:
7239 # TODO: get this to show up as the default in the docs?
7240 method = "pad"
7241
7242 # passing a single value that is scalar like
7243 # when value is None (GH5319), for compat
7244 if not is_dict_like(to_replace) and not is_dict_like(regex):
7245 to_replace = [to_replace]
7246
7247 if isinstance(to_replace, (tuple, list)):
            # TODO: Consider copy-on-write for non-replaced columns here
7249 if isinstance(self, ABCDataFrame):
7250 from pandas import Series
7251
7252 result = self.apply(
7253 Series._replace_single,
7254 args=(to_replace, method, inplace, limit),
7255 )
7256 if inplace:
7257 return None
7258 return result
7259 return self._replace_single(to_replace, method, inplace, limit)
7260
7261 if not is_dict_like(to_replace):
7262 if not is_dict_like(regex):
7263 raise TypeError(
7264 'If "to_replace" and "value" are both None '
7265 'and "to_replace" is not a list, then '
7266 "regex must be a mapping"
7267 )
7268 to_replace = regex
7269 regex = True
7270
7271 items = list(to_replace.items())
7272 if items:
7273 keys, values = zip(*items)
7274 else:
7275 keys, values = ([], [])
7276
7277 are_mappings = [is_dict_like(v) for v in values]
7278
7279 if any(are_mappings):
7280 if not all(are_mappings):
7281 raise TypeError(
7282 "If a nested mapping is passed, all values "
7283 "of the top level mapping must be mappings"
7284 )
7285 # passed a nested dict/Series
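                # e.g. {"A": {"old": "new"}} is split into
                # to_replace={"A": ["old"]} and value={"A": ["new"]}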
7286 to_rep_dict = {}
7287 value_dict = {}
7288
7289 for k, v in items:
7290 keys, values = list(zip(*v.items())) or ([], [])
7291
7292 to_rep_dict[k] = list(keys)
7293 value_dict[k] = list(values)
7294
7295 to_replace, value = to_rep_dict, value_dict
7296 else:
7297 to_replace, value = keys, values
7298
7299 return self.replace(
7300 to_replace, value, inplace=inplace, limit=limit, regex=regex
7301 )
7302 else:
7303 # need a non-zero len on all axes
7304 if not self.size:
7305 if inplace:
7306 return None
7307 return self.copy(deep=None)
7308
7309 if is_dict_like(to_replace):
7310 if is_dict_like(value): # {'A' : NA} -> {'A' : 0}
7311 # Note: Checking below for `in foo.keys()` instead of
7312 # `in foo` is needed for when we have a Series and not dict
7313 mapping = {
7314 col: (to_replace[col], value[col])
7315 for col in to_replace.keys()
7316 if col in value.keys() and col in self
7317 }
7318 return self._replace_columnwise(mapping, inplace, regex)
7319
7320 # {'A': NA} -> 0
7321 elif not is_list_like(value):
7322 # Operate column-wise
7323 if self.ndim == 1:
7324 raise ValueError(
7325 "Series.replace cannot use dict-like to_replace "
7326 "and non-None value"
7327 )
7328 mapping = {
7329 col: (to_rep, value) for col, to_rep in to_replace.items()
7330 }
7331 return self._replace_columnwise(mapping, inplace, regex)
7332 else:
7333 raise TypeError("value argument must be scalar, dict, or Series")
7334
7335 elif is_list_like(to_replace):
7336 if not is_list_like(value):
7337 # e.g. to_replace = [NA, ''] and value is 0,
7338 # so we replace NA with 0 and then replace '' with 0
7339 value = [value] * len(to_replace)
7340
7341 # e.g. we have to_replace = [NA, ''] and value = [0, 'missing']
7342 if len(to_replace) != len(value):
7343 raise ValueError(
7344 f"Replacement lists must match in length. "
7345 f"Expecting {len(to_replace)} got {len(value)} "
7346 )
7347 new_data = self._mgr.replace_list(
7348 src_list=to_replace,
7349 dest_list=value,
7350 inplace=inplace,
7351 regex=regex,
7352 )
7353
7354 elif to_replace is None:
7355 if not (
7356 is_re_compilable(regex)
7357 or is_list_like(regex)
7358 or is_dict_like(regex)
7359 ):
7360 raise TypeError(
7361 f"'regex' must be a string or a compiled regular expression "
7362 f"or a list or dict of strings or regular expressions, "
7363 f"you passed a {repr(type(regex).__name__)}"
7364 )
7365 return self.replace(
7366 regex, value, inplace=inplace, limit=limit, regex=True
7367 )
7368 else:
7369 # dest iterable dict-like
7370 if is_dict_like(value): # NA -> {'A' : 0, 'B' : -1}
7371 # Operate column-wise
7372 if self.ndim == 1:
7373 raise ValueError(
7374 "Series.replace cannot use dict-value and "
7375 "non-None to_replace"
7376 )
7377 mapping = {col: (to_replace, val) for col, val in value.items()}
7378 return self._replace_columnwise(mapping, inplace, regex)
7379
7380 elif not is_list_like(value): # NA -> 0
7381 regex = should_use_regex(regex, to_replace)
7382 if regex:
7383 new_data = self._mgr.replace_regex(
7384 to_replace=to_replace,
7385 value=value,
7386 inplace=inplace,
7387 )
7388 else:
7389 new_data = self._mgr.replace(
7390 to_replace=to_replace, value=value, inplace=inplace
7391 )
7392 else:
7393 raise TypeError(
7394 f'Invalid "to_replace" type: {repr(type(to_replace).__name__)}'
7395 )
7396
7397 result = self._constructor(new_data)
7398 if inplace:
7399 return self._update_inplace(result)
7400 else:
7401 return result.__finalize__(self, method="replace")
7402
7403 def interpolate(
7404 self: NDFrameT,
7405 method: str = "linear",
7406 *,
7407 axis: Axis = 0,
7408 limit: int | None = None,
7409 inplace: bool_t = False,
7410 limit_direction: str | None = None,
7411 limit_area: str | None = None,
7412 downcast: str | None = None,
7413 **kwargs,
7414 ) -> NDFrameT | None:
7415 """
7416 Fill NaN values using an interpolation method.
7417
7418 Please note that only ``method='linear'`` is supported for
7419 DataFrame/Series with a MultiIndex.
7420
7421 Parameters
7422 ----------
7423 method : str, default 'linear'
7424 Interpolation technique to use. One of:
7425
7426 * 'linear': Ignore the index and treat the values as equally
7427 spaced. This is the only method supported on MultiIndexes.
7428 * 'time': Works on daily and higher resolution data to interpolate
7429 given length of interval.
7430 * 'index', 'values': use the actual numerical values of the index.
7431 * 'pad': Fill in NaNs using existing values.
7432 * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
7433 'barycentric', 'polynomial': Passed to
7434 `scipy.interpolate.interp1d`, whereas 'spline' is passed to
7435 `scipy.interpolate.UnivariateSpline`. These methods use the numerical
7436 values of the index. Both 'polynomial' and 'spline' require that
7437 you also specify an `order` (int), e.g.
              ``df.interpolate(method='polynomial', order=5)``. Note that
              the `slinear` method in pandas refers to the SciPy first-order
              `spline`, not to a pandas first-order `spline`.
7441 * 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima',
7442 'cubicspline': Wrappers around the SciPy interpolation methods of
7443 similar names. See `Notes`.
7444 * 'from_derivatives': Refers to
7445 `scipy.interpolate.BPoly.from_derivatives` which
7446 replaces 'piecewise_polynomial' interpolation method in
7447 scipy 0.18.
7448
7449 axis : {{0 or 'index', 1 or 'columns', None}}, default None
7450 Axis to interpolate along. For `Series` this parameter is unused
7451 and defaults to 0.
7452 limit : int, optional
7453 Maximum number of consecutive NaNs to fill. Must be greater than
7454 0.
7455 inplace : bool, default False
7456 Update the data in place if possible.
        limit_direction : {{'forward', 'backward', 'both'}}, optional
7458 Consecutive NaNs will be filled in this direction.
7459
7460 If limit is specified:
7461 * If 'method' is 'pad' or 'ffill', 'limit_direction' must be 'forward'.
7462 * If 'method' is 'backfill' or 'bfill', 'limit_direction' must be
              'backward'.
7464
7465 If 'limit' is not specified:
7466 * If 'method' is 'backfill' or 'bfill', the default is 'backward'
7467 * else the default is 'forward'
7468
7469 .. versionchanged:: 1.1.0
7470 raises ValueError if `limit_direction` is 'forward' or 'both' and
7471 method is 'backfill' or 'bfill'.
7472 raises ValueError if `limit_direction` is 'backward' or 'both' and
7473 method is 'pad' or 'ffill'.
7474
7475 limit_area : {{`None`, 'inside', 'outside'}}, default None
7476 If limit is specified, consecutive NaNs will be filled with this
7477 restriction.
7478
7479 * ``None``: No fill restriction.
7480 * 'inside': Only fill NaNs surrounded by valid values
7481 (interpolate).
7482 * 'outside': Only fill NaNs outside valid values (extrapolate).
7483
7484 downcast : optional, 'infer' or None, defaults to None
7485 Downcast dtypes if possible.
7486 ``**kwargs`` : optional
7487 Keyword arguments to pass on to the interpolating function.
7488
7489 Returns
7490 -------
7491 Series or DataFrame or None
7492 Returns the same object type as the caller, interpolated at
7493 some or all ``NaN`` values or None if ``inplace=True``.
7494
7495 See Also
7496 --------
7497 fillna : Fill missing values using different methods.
7498 scipy.interpolate.Akima1DInterpolator : Piecewise cubic polynomials
7499 (Akima interpolator).
7500 scipy.interpolate.BPoly.from_derivatives : Piecewise polynomial in the
7501 Bernstein basis.
7502 scipy.interpolate.interp1d : Interpolate a 1-D function.
7503 scipy.interpolate.KroghInterpolator : Interpolate polynomial (Krogh
7504 interpolator).
7505 scipy.interpolate.PchipInterpolator : PCHIP 1-d monotonic cubic
7506 interpolation.
7507 scipy.interpolate.CubicSpline : Cubic spline data interpolator.
7508
7509 Notes
7510 -----
7511 The 'krogh', 'piecewise_polynomial', 'spline', 'pchip' and 'akima'
7512 methods are wrappers around the respective SciPy implementations of
7513 similar names. These use the actual numerical values of the index.
7514 For more information on their behavior, see the
7515 `SciPy documentation
7516 <https://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation>`__.
7517
7518 Examples
7519 --------
7520 Filling in ``NaN`` in a :class:`~pandas.Series` via linear
7521 interpolation.
7522
7523 >>> s = pd.Series([0, 1, np.nan, 3])
7524 >>> s
7525 0 0.0
7526 1 1.0
7527 2 NaN
7528 3 3.0
7529 dtype: float64
7530 >>> s.interpolate()
7531 0 0.0
7532 1 1.0
7533 2 2.0
7534 3 3.0
7535 dtype: float64
7536
7537 Filling in ``NaN`` in a Series by padding, but filling at most two
7538 consecutive ``NaN`` at a time.
7539
7540 >>> s = pd.Series([np.nan, "single_one", np.nan,
7541 ... "fill_two_more", np.nan, np.nan, np.nan,
7542 ... 4.71, np.nan])
7543 >>> s
7544 0 NaN
7545 1 single_one
7546 2 NaN
7547 3 fill_two_more
7548 4 NaN
7549 5 NaN
7550 6 NaN
7551 7 4.71
7552 8 NaN
7553 dtype: object
7554 >>> s.interpolate(method='pad', limit=2)
7555 0 NaN
7556 1 single_one
7557 2 single_one
7558 3 fill_two_more
7559 4 fill_two_more
7560 5 fill_two_more
7561 6 NaN
7562 7 4.71
7563 8 4.71
7564 dtype: object
7565
7566 Filling in ``NaN`` in a Series via polynomial interpolation or splines:
7567 Both 'polynomial' and 'spline' methods require that you also specify
7568 an ``order`` (int).
7569
7570 >>> s = pd.Series([0, 2, np.nan, 8])
7571 >>> s.interpolate(method='polynomial', order=2)
7572 0 0.000000
7573 1 2.000000
7574 2 4.666667
7575 3 8.000000
7576 dtype: float64
7577
7578 Fill the DataFrame forward (that is, going down) along each column
7579 using linear interpolation.
7580
7581 Note how the last entry in column 'a' is interpolated differently,
7582 because there is no entry after it to use for interpolation.
7583 Note how the first entry in column 'b' remains ``NaN``, because there
7584 is no entry before it to use for interpolation.
7585
7586 >>> df = pd.DataFrame([(0.0, np.nan, -1.0, 1.0),
7587 ... (np.nan, 2.0, np.nan, np.nan),
7588 ... (2.0, 3.0, np.nan, 9.0),
7589 ... (np.nan, 4.0, -4.0, 16.0)],
7590 ... columns=list('abcd'))
7591 >>> df
7592 a b c d
7593 0 0.0 NaN -1.0 1.0
7594 1 NaN 2.0 NaN NaN
7595 2 2.0 3.0 NaN 9.0
7596 3 NaN 4.0 -4.0 16.0
7597 >>> df.interpolate(method='linear', limit_direction='forward', axis=0)
7598 a b c d
7599 0 0.0 NaN -1.0 1.0
7600 1 1.0 2.0 -2.0 5.0
7601 2 2.0 3.0 -3.0 9.0
7602 3 2.0 4.0 -4.0 16.0
7603
7604 Using polynomial interpolation.
7605
7606 >>> df['d'].interpolate(method='polynomial', order=2)
7607 0 1.0
7608 1 4.0
7609 2 9.0
7610 3 16.0
7611 Name: d, dtype: float64
7612 """
7613 inplace = validate_bool_kwarg(inplace, "inplace")
7614
7615 axis = self._get_axis_number(axis)
7616
7617 fillna_methods = ["ffill", "bfill", "pad", "backfill"]
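        # the fillna-style methods handle ``axis`` directly; true
        # interpolation is implemented along axis 0, so transpose for axis=1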
7618 should_transpose = axis == 1 and method not in fillna_methods
7619
7620 obj = self.T if should_transpose else self
7621
7622 if obj.empty:
7623 return self.copy()
7624
7625 if method not in fillna_methods:
7626 axis = self._info_axis_number
7627
7628 if isinstance(obj.index, MultiIndex) and method != "linear":
7629 raise ValueError(
7630 "Only `method=linear` interpolation is supported on MultiIndexes."
7631 )
7632
7633 # Set `limit_direction` depending on `method`
7634 if limit_direction is None:
7635 limit_direction = (
7636 "backward" if method in ("backfill", "bfill") else "forward"
7637 )
7638 else:
7639 if method in ("pad", "ffill") and limit_direction != "forward":
7640 raise ValueError(
7641 f"`limit_direction` must be 'forward' for method `{method}`"
7642 )
7643 if method in ("backfill", "bfill") and limit_direction != "backward":
7644 raise ValueError(
7645 f"`limit_direction` must be 'backward' for method `{method}`"
7646 )
7647
7648 if obj.ndim == 2 and np.all(obj.dtypes == np.dtype("object")):
7649 raise TypeError(
7650 "Cannot interpolate with all object-dtype columns "
7651 "in the DataFrame. Try setting at least one "
7652 "column to a numeric dtype."
7653 )
7654
7655 # create/use the index
7656 if method == "linear":
7657 # prior default
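            # "linear" ignores the actual index values and treats the
            # points as equally spaced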
7658 index = Index(np.arange(len(obj.index)))
7659 else:
7660 index = obj.index
7661 methods = {"index", "values", "nearest", "time"}
7662 is_numeric_or_datetime = (
7663 is_numeric_dtype(index.dtype)
7664 or is_datetime64_any_dtype(index.dtype)
7665 or is_timedelta64_dtype(index.dtype)
7666 )
7667 if method not in methods and not is_numeric_or_datetime:
7668 raise ValueError(
7669 "Index column must be numeric or datetime type when "
7670 f"using {method} method other than linear. "
7671 "Try setting a numeric or datetime index column before "
7672 "interpolating."
7673 )
7674
7675 if isna(index).any():
7676 raise NotImplementedError(
7677 "Interpolation with NaNs in the index "
7678 "has not been implemented. Try filling "
7679 "those NaNs before interpolating."
7680 )
7681 new_data = obj._mgr.interpolate(
7682 method=method,
7683 axis=axis,
7684 index=index,
7685 limit=limit,
7686 limit_direction=limit_direction,
7687 limit_area=limit_area,
7688 inplace=inplace,
7689 downcast=downcast,
7690 **kwargs,
7691 )
7692
7693 result = self._constructor(new_data)
7694 if should_transpose:
7695 result = result.T
7696 if inplace:
7697 return self._update_inplace(result)
7698 else:
7699 return result.__finalize__(self, method="interpolate")
7700
7701 # ----------------------------------------------------------------------
    # Timeseries methods
7703
7704 @final
7705 def asof(self, where, subset=None):
7706 """
        Return the last row(s) without any NaNs before `where`.

        The last row (for each element in `where`, if list) without any
        NaN is taken.
        In case of a :class:`~pandas.DataFrame`, the last row without NaN
        is taken considering only the subset of columns (if not `None`).

        If there is no good value, NaN is returned for a Series or
        a Series of NaN values for a DataFrame.
7716
7717 Parameters
7718 ----------
7719 where : date or array-like of dates
7720 Date(s) before which the last row(s) are returned.
7721 subset : str or array-like of str, default `None`
7722 For DataFrame, if not `None`, only use these columns to
7723 check for NaNs.
7724
7725 Returns
7726 -------
7727 scalar, Series, or DataFrame
7728
7729 The return can be:
7730
            * scalar : when `self` is a Series and `where` is a scalar
            * Series : when `self` is a Series and `where` is an array-like,
              or when `self` is a DataFrame and `where` is a scalar
            * DataFrame : when `self` is a DataFrame and `where` is an
              array-like
7738
7739 See Also
7740 --------
7741 merge_asof : Perform an asof merge. Similar to left join.
7742
7743 Notes
7744 -----
7745 Dates are assumed to be sorted. Raises if this is not the case.
7746
7747 Examples
7748 --------
7749 A Series and a scalar `where`.
7750
7751 >>> s = pd.Series([1, 2, np.nan, 4], index=[10, 20, 30, 40])
7752 >>> s
7753 10 1.0
7754 20 2.0
7755 30 NaN
7756 40 4.0
7757 dtype: float64
7758
7759 >>> s.asof(20)
7760 2.0
7761
7762 For a sequence `where`, a Series is returned. The first value is
7763 NaN, because the first element of `where` is before the first
7764 index value.
7765
7766 >>> s.asof([5, 20])
7767 5 NaN
7768 20 2.0
7769 dtype: float64
7770
7771 Missing values are not considered. The following is ``2.0``, not
7772 NaN, even though NaN is at the index location for ``30``.
7773
7774 >>> s.asof(30)
7775 2.0
7776
        Take all columns into consideration.
7778
7779 >>> df = pd.DataFrame({'a': [10, 20, 30, 40, 50],
7780 ... 'b': [None, None, None, None, 500]},
7781 ... index=pd.DatetimeIndex(['2018-02-27 09:01:00',
7782 ... '2018-02-27 09:02:00',
7783 ... '2018-02-27 09:03:00',
7784 ... '2018-02-27 09:04:00',
7785 ... '2018-02-27 09:05:00']))
7786 >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30',
7787 ... '2018-02-27 09:04:30']))
7788 a b
7789 2018-02-27 09:03:30 NaN NaN
7790 2018-02-27 09:04:30 NaN NaN
7791
        Take a single column into consideration.
7793
7794 >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30',
7795 ... '2018-02-27 09:04:30']),
7796 ... subset=['a'])
7797 a b
7798 2018-02-27 09:03:30 30 NaN
7799 2018-02-27 09:04:30 40 NaN
7800 """
7801 if isinstance(where, str):
7802 where = Timestamp(where)
7803
7804 if not self.index.is_monotonic_increasing:
7805 raise ValueError("asof requires a sorted index")
7806
7807 is_series = isinstance(self, ABCSeries)
7808 if is_series:
7809 if subset is not None:
7810 raise ValueError("subset is not valid for Series")
7811 else:
7812 if subset is None:
7813 subset = self.columns
7814 if not is_list_like(subset):
7815 subset = [subset]
7816
7817 is_list = is_list_like(where)
7818 if not is_list:
7819 start = self.index[0]
7820 if isinstance(self.index, PeriodIndex):
7821 where = Period(where, freq=self.index.freq)
7822
7823 if where < start:
7824 if not is_series:
7825 return self._constructor_sliced(
7826 index=self.columns, name=where, dtype=np.float64
7827 )
7828 return np.nan
7829
7830 # It's always much faster to use a *while* loop here for
7831 # Series than pre-computing all the NAs. However a
7832 # *while* loop is extremely expensive for DataFrame
7833 # so we later pre-compute all the NAs and use the same
7834 # code path whether *where* is a scalar or list.
7835 # See PR: https://github.com/pandas-dev/pandas/pull/14476
7836 if is_series:
7837 loc = self.index.searchsorted(where, side="right")
7838 if loc > 0:
7839 loc -= 1
7840
7841 values = self._values
7842 while loc > 0 and isna(values[loc]):
7843 loc -= 1
7844 return values[loc]
7845
7846 if not isinstance(where, Index):
7847 where = Index(where) if is_list else Index([where])
7848
7849 nulls = self.isna() if is_series else self[subset].isna().any(axis=1)
7850 if nulls.all():
7851 if is_series:
7852 self = cast("Series", self)
7853 return self._constructor(np.nan, index=where, name=self.name)
7854 elif is_list:
7855 self = cast("DataFrame", self)
7856 return self._constructor(np.nan, index=where, columns=self.columns)
7857 else:
7858 self = cast("DataFrame", self)
7859 return self._constructor_sliced(
7860 np.nan, index=self.columns, name=where[0]
7861 )
7862
7863 locs = self.index.asof_locs(where, ~(nulls._values))
7864
7865 # mask the missing
7866 missing = locs == -1
7867 data = self.take(locs)
7868 data.index = where
7869 if missing.any():
7870 # GH#16063 only do this setting when necessary, otherwise
7871 # we'd cast e.g. bools to floats
7872 data.loc[missing] = np.nan
7873 return data if is_list else data.iloc[-1]
7874
7875 # ----------------------------------------------------------------------
7876 # Action Methods
7877
7878 @doc(klass=_shared_doc_kwargs["klass"])
7879 def isna(self: NDFrameT) -> NDFrameT:
7880 """
7881 Detect missing values.
7882
7883 Return a boolean same-sized object indicating if the values are NA.
        NA values, such as None or :attr:`numpy.NaN`, get mapped to True
        values.
        Everything else gets mapped to False values. Values such as empty
        strings ``''`` or :attr:`numpy.inf` are not considered NA values
        (unless you set ``pandas.options.mode.use_inf_as_na = True``).
7889
7890 Returns
7891 -------
7892 {klass}
7893 Mask of bool values for each element in {klass} that
7894 indicates whether an element is an NA value.
7895
7896 See Also
7897 --------
7898 {klass}.isnull : Alias of isna.
7899 {klass}.notna : Boolean inverse of isna.
7900 {klass}.dropna : Omit axes labels with missing values.
7901 isna : Top-level isna.
7902
7903 Examples
7904 --------
7905 Show which entries in a DataFrame are NA.
7906
7907 >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
7908 ... born=[pd.NaT, pd.Timestamp('1939-05-27'),
7909 ... pd.Timestamp('1940-04-25')],
7910 ... name=['Alfred', 'Batman', ''],
7911 ... toy=[None, 'Batmobile', 'Joker']))
7912 >>> df
7913 age born name toy
7914 0 5.0 NaT Alfred None
7915 1 6.0 1939-05-27 Batman Batmobile
7916 2 NaN 1940-04-25 Joker
7917
7918 >>> df.isna()
7919 age born name toy
7920 0 False True False True
7921 1 False False False False
7922 2 True False False False
7923
7924 Show which entries in a Series are NA.
7925
7926 >>> ser = pd.Series([5, 6, np.NaN])
7927 >>> ser
7928 0 5.0
7929 1 6.0
7930 2 NaN
7931 dtype: float64
7932
7933 >>> ser.isna()
7934 0 False
7935 1 False
7936 2 True
7937 dtype: bool
7938 """
7939 return isna(self).__finalize__(self, method="isna")
7940
7941 @doc(isna, klass=_shared_doc_kwargs["klass"])
7942 def isnull(self: NDFrameT) -> NDFrameT:
7943 return isna(self).__finalize__(self, method="isnull")
7944
7945 @doc(klass=_shared_doc_kwargs["klass"])
7946 def notna(self: NDFrameT) -> NDFrameT:
7947 """
7948 Detect existing (non-missing) values.
7949
7950 Return a boolean same-sized object indicating if the values are not NA.
        Non-missing values get mapped to True. Values such as empty
        strings ``''`` or :attr:`numpy.inf` are not considered NA values
        (unless you set ``pandas.options.mode.use_inf_as_na = True``).
7954 NA values, such as None or :attr:`numpy.NaN`, get mapped to False
7955 values.
7956
7957 Returns
7958 -------
7959 {klass}
7960 Mask of bool values for each element in {klass} that
7961 indicates whether an element is not an NA value.
7962
7963 See Also
7964 --------
7965 {klass}.notnull : Alias of notna.
7966 {klass}.isna : Boolean inverse of notna.
7967 {klass}.dropna : Omit axes labels with missing values.
7968 notna : Top-level notna.
7969
7970 Examples
7971 --------
7972 Show which entries in a DataFrame are not NA.
7973
7974 >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
7975 ... born=[pd.NaT, pd.Timestamp('1939-05-27'),
7976 ... pd.Timestamp('1940-04-25')],
7977 ... name=['Alfred', 'Batman', ''],
7978 ... toy=[None, 'Batmobile', 'Joker']))
7979 >>> df
7980 age born name toy
7981 0 5.0 NaT Alfred None
7982 1 6.0 1939-05-27 Batman Batmobile
7983 2 NaN 1940-04-25 Joker
7984
7985 >>> df.notna()
7986 age born name toy
7987 0 True False True False
7988 1 True True True True
7989 2 False True True True
7990
7991 Show which entries in a Series are not NA.
7992
7993 >>> ser = pd.Series([5, 6, np.NaN])
7994 >>> ser
7995 0 5.0
7996 1 6.0
7997 2 NaN
7998 dtype: float64
7999
8000 >>> ser.notna()
8001 0 True
8002 1 True
8003 2 False
8004 dtype: bool
8005 """
8006 return notna(self).__finalize__(self, method="notna")
8007
8008 @doc(notna, klass=_shared_doc_kwargs["klass"])
8009 def notnull(self: NDFrameT) -> NDFrameT:
8010 return notna(self).__finalize__(self, method="notnull")
8011
8012 @final
8013 def _clip_with_scalar(self, lower, upper, inplace: bool_t = False):
8014 if (lower is not None and np.any(isna(lower))) or (
8015 upper is not None and np.any(isna(upper))
8016 ):
8017 raise ValueError("Cannot use an NA value as a clip threshold")
8018
8019 result = self
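        # remember where the original values are NA so they can be
        # restored after clipping; the threshold comparisons below
        # evaluate to False for NA and would otherwise overwrite them
        # with the bound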
8020 mask = isna(self._values)
8021
8022 with np.errstate(all="ignore"):
8023 if upper is not None:
8024 subset = self <= upper
8025 result = result.where(subset, upper, axis=None, inplace=False)
8026 if lower is not None:
8027 subset = self >= lower
8028 result = result.where(subset, lower, axis=None, inplace=False)
8029
8030 if np.any(mask):
8031 result[mask] = np.nan
8032
8033 if inplace:
8034 return self._update_inplace(result)
8035 else:
8036 return result
8037
8038 @final
8039 def _clip_with_one_bound(self, threshold, method, axis, inplace):
8040 if axis is not None:
8041 axis = self._get_axis_number(axis)
8042
8043 # method is self.le for upper bound and self.ge for lower bound
8044 if is_scalar(threshold) and is_number(threshold):
8045 if method.__name__ == "le":
8046 return self._clip_with_scalar(None, threshold, inplace=inplace)
8047 return self._clip_with_scalar(threshold, None, inplace=inplace)
8048
8049 # GH #15390
8050 # In order for where method to work, the threshold must
8051 # be transformed to NDFrame from other array like structure.
8052 if (not isinstance(threshold, ABCSeries)) and is_list_like(threshold):
8053 if isinstance(self, ABCSeries):
8054 threshold = self._constructor(threshold, index=self.index)
8055 else:
8056 threshold = align_method_FRAME(self, threshold, axis, flex=None)[1]
8057
8058 # GH 40420
8059 # Treat missing thresholds as no bounds, not clipping the values
8060 if is_list_like(threshold):
8061 fill_value = np.inf if method.__name__ == "le" else -np.inf
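            # e.g. for an upper bound (method "le") a missing threshold
            # becomes +inf, so the comparison is always True and the
            # original value is kept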
8062 threshold_inf = threshold.fillna(fill_value)
8063 else:
8064 threshold_inf = threshold
8065
8066 subset = method(threshold_inf, axis=axis) | isna(self)
8067
8068 # GH 40420
8069 return self.where(subset, threshold, axis=axis, inplace=inplace)
8070
8071 def clip(
8072 self: NDFrameT,
8073 lower=None,
8074 upper=None,
8075 *,
8076 axis: Axis | None = None,
8077 inplace: bool_t = False,
8078 **kwargs,
8079 ) -> NDFrameT | None:
8080 """
8081 Trim values at input threshold(s).
8082
        Assigns values outside boundary to boundary values. Thresholds
        can be singular values or array-like, and in the latter case
        the clipping is performed element-wise along the specified axis.
8086
8087 Parameters
8088 ----------
        lower : float or array-like, default None
            Minimum threshold value. All values below this
            threshold will be set to it. A missing
            threshold (e.g. `NA`) will not clip the value.
        upper : float or array-like, default None
            Maximum threshold value. All values above this
            threshold will be set to it. A missing
            threshold (e.g. `NA`) will not clip the value.
8097 axis : {{0 or 'index', 1 or 'columns', None}}, default None
8098 Align object with lower and upper along the given axis.
8099 For `Series` this parameter is unused and defaults to `None`.
8100 inplace : bool, default False
8101 Whether to perform the operation in place on the data.
        **kwargs
            Additional keywords have no effect but might be accepted
            for compatibility with numpy.
8105
8106 Returns
8107 -------
8108 Series or DataFrame or None
8109 Same type as calling object with the values outside the
8110 clip boundaries replaced or None if ``inplace=True``.
8111
8112 See Also
8113 --------
8114 Series.clip : Trim values at input threshold in series.
8115 DataFrame.clip : Trim values at input threshold in dataframe.
8116 numpy.clip : Clip (limit) the values in an array.
8117
8118 Examples
8119 --------
8120 >>> data = {'col_0': [9, -3, 0, -1, 5], 'col_1': [-2, -7, 6, 8, -5]}
8121 >>> df = pd.DataFrame(data)
8122 >>> df
8123 col_0 col_1
8124 0 9 -2
8125 1 -3 -7
8126 2 0 6
8127 3 -1 8
8128 4 5 -5
8129
8130 Clips per column using lower and upper thresholds:
8131
8132 >>> df.clip(-4, 6)
8133 col_0 col_1
8134 0 6 -2
8135 1 -3 -4
8136 2 0 6
8137 3 -1 6
8138 4 5 -4
8139
8140 Clips using specific lower and upper thresholds per column element:
8141
8142 >>> t = pd.Series([2, -4, -1, 6, 3])
8143 >>> t
8144 0 2
8145 1 -4
8146 2 -1
8147 3 6
8148 4 3
8149 dtype: int64
8150
8151 >>> df.clip(t, t + 4, axis=0)
8152 col_0 col_1
8153 0 6 2
8154 1 -3 -4
8155 2 0 3
8156 3 6 8
8157 4 5 3
8158
        Clips using a specific lower threshold per column element, with missing values:
8160
8161 >>> t = pd.Series([2, -4, np.NaN, 6, 3])
8162 >>> t
8163 0 2.0
8164 1 -4.0
8165 2 NaN
8166 3 6.0
8167 4 3.0
8168 dtype: float64
8169
8170 >>> df.clip(t, axis=0)
8171 col_0 col_1
8172 0 9 2
8173 1 -3 -4
8174 2 0 6
8175 3 6 8
8176 4 5 3
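
        Clips a single Series using only an upper threshold:

        >>> pd.Series([1, 5, 10]).clip(upper=6)
        0    1
        1    5
        2    6
        dtype: int64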
8177 """
8178 inplace = validate_bool_kwarg(inplace, "inplace")
8179
8180 axis = nv.validate_clip_with_axis(axis, (), kwargs)
8181 if axis is not None:
8182 axis = self._get_axis_number(axis)
8183
8184 # GH 17276
8185 # numpy doesn't like NaN as a clip value
8186 # so ignore
8187 # GH 19992
8188 # numpy doesn't drop a list-like bound containing NaN
8189 isna_lower = isna(lower)
8190 if not is_list_like(lower):
8191 if np.any(isna_lower):
8192 lower = None
8193 elif np.all(isna_lower):
8194 lower = None
8195 isna_upper = isna(upper)
8196 if not is_list_like(upper):
8197 if np.any(isna_upper):
8198 upper = None
8199 elif np.all(isna_upper):
8200 upper = None
8201
8202 # GH 2747 (arguments were reversed)
8203 if (
8204 lower is not None
8205 and upper is not None
8206 and is_scalar(lower)
8207 and is_scalar(upper)
8208 ):
8209 lower, upper = min(lower, upper), max(lower, upper)
8210
8211 # fast-path for scalars
8212 if (lower is None or (is_scalar(lower) and is_number(lower))) and (
8213 upper is None or (is_scalar(upper) and is_number(upper))
8214 ):
8215 return self._clip_with_scalar(lower, upper, inplace=inplace)
8216
8217 result = self
8218 if lower is not None:
8219 result = result._clip_with_one_bound(
8220 lower, method=self.ge, axis=axis, inplace=inplace
8221 )
8222 if upper is not None:
8223 if inplace:
8224 result = self
8225 result = result._clip_with_one_bound(
8226 upper, method=self.le, axis=axis, inplace=inplace
8227 )
8228
8229 return result
8230
8231 @doc(**_shared_doc_kwargs)
8232 def asfreq(
8233 self: NDFrameT,
8234 freq: Frequency,
8235 method: FillnaOptions | None = None,
8236 how: str | None = None,
8237 normalize: bool_t = False,
8238 fill_value: Hashable = None,
8239 ) -> NDFrameT:
8240 """
8241 Convert time series to specified frequency.
8242
8243 Returns the original data conformed to a new index with the specified
8244 frequency.
8245
8246 If the index of this {klass} is a :class:`~pandas.PeriodIndex`, the new index
8247 is the result of transforming the original index with
8248 :meth:`PeriodIndex.asfreq <pandas.PeriodIndex.asfreq>` (so the original index
8249 will map one-to-one to the new index).
8250
8251 Otherwise, the new index will be equivalent to ``pd.date_range(start, end,
8252 freq=freq)`` where ``start`` and ``end`` are, respectively, the first and
8253 last entries in the original index (see :func:`pandas.date_range`). The
8254 values corresponding to any timesteps in the new index which were not present
8255 in the original index will be null (``NaN``), unless a method for filling
8256 such unknowns is provided (see the ``method`` parameter below).
8257
8258 The :meth:`resample` method is more appropriate if an operation on each group of
8259 timesteps (such as an aggregate) is necessary to represent the data at the new
8260 frequency.
8261
8262 Parameters
8263 ----------
8264 freq : DateOffset or str
8265 Frequency DateOffset or string.
8266 method : {{'backfill'/'bfill', 'pad'/'ffill'}}, default None
8267 Method to use for filling holes in reindexed Series (note this
8268 does not fill NaNs that already were present):
8269
8270 * 'pad' / 'ffill': propagate last valid observation forward to next
8271 valid
8272 * 'backfill' / 'bfill': use NEXT valid observation to fill.
        how : {{'start', 'end'}}, default 'end'
            For PeriodIndex only (see PeriodIndex.asfreq).
8275 normalize : bool, default False
8276 Whether to reset output index to midnight.
8277 fill_value : scalar, optional
8278 Value to use for missing values, applied during upsampling (note
8279 this does not fill NaNs that already were present).
8280
8281 Returns
8282 -------
8283 {klass}
8284 {klass} object reindexed to the specified frequency.
8285
8286 See Also
8287 --------
8288 reindex : Conform DataFrame to new index with optional filling logic.
8289
8290 Notes
8291 -----
8292 To learn more about the frequency strings, please see `this link
8293 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
8294
8295 Examples
8296 --------
8297 Start by creating a series with 4 one minute timestamps.
8298
8299 >>> index = pd.date_range('1/1/2000', periods=4, freq='T')
8300 >>> series = pd.Series([0.0, None, 2.0, 3.0], index=index)
8301 >>> df = pd.DataFrame({{'s': series}})
8302 >>> df
8303 s
8304 2000-01-01 00:00:00 0.0
8305 2000-01-01 00:01:00 NaN
8306 2000-01-01 00:02:00 2.0
8307 2000-01-01 00:03:00 3.0
8308
8309 Upsample the series into 30 second bins.
8310
8311 >>> df.asfreq(freq='30S')
8312 s
8313 2000-01-01 00:00:00 0.0
8314 2000-01-01 00:00:30 NaN
8315 2000-01-01 00:01:00 NaN
8316 2000-01-01 00:01:30 NaN
8317 2000-01-01 00:02:00 2.0
8318 2000-01-01 00:02:30 NaN
8319 2000-01-01 00:03:00 3.0
8320
        Upsample again, providing a ``fill_value``.
8322
8323 >>> df.asfreq(freq='30S', fill_value=9.0)
8324 s
8325 2000-01-01 00:00:00 0.0
8326 2000-01-01 00:00:30 9.0
8327 2000-01-01 00:01:00 NaN
8328 2000-01-01 00:01:30 9.0
8329 2000-01-01 00:02:00 2.0
8330 2000-01-01 00:02:30 9.0
8331 2000-01-01 00:03:00 3.0
8332
8333 Upsample again, providing a ``method``.
8334
8335 >>> df.asfreq(freq='30S', method='bfill')
8336 s
8337 2000-01-01 00:00:00 0.0
8338 2000-01-01 00:00:30 NaN
8339 2000-01-01 00:01:00 NaN
8340 2000-01-01 00:01:30 2.0
8341 2000-01-01 00:02:00 2.0
8342 2000-01-01 00:02:30 3.0
8343 2000-01-01 00:03:00 3.0
8344 """
8345 from pandas.core.resample import asfreq
8346
8347 return asfreq(
8348 self,
8349 freq,
8350 method=method,
8351 how=how,
8352 normalize=normalize,
8353 fill_value=fill_value,
8354 )
8355
8356 @final
8357 def at_time(
8358 self: NDFrameT, time, asof: bool_t = False, axis: Axis | None = None
8359 ) -> NDFrameT:
8360 """
8361 Select values at particular time of day (e.g., 9:30AM).
8362
8363 Parameters
8364 ----------
8365 time : datetime.time or str
8366 The values to select.
8367 axis : {0 or 'index', 1 or 'columns'}, default 0
8368 For `Series` this parameter is unused and defaults to 0.
8369
8370 Returns
8371 -------
8372 Series or DataFrame
8373
8374 Raises
8375 ------
8376 TypeError
8377 If the index is not a :class:`DatetimeIndex`
8378
8379 See Also
8380 --------
8381 between_time : Select values between particular times of the day.
8382 first : Select initial periods of time series based on a date offset.
8383 last : Select final periods of time series based on a date offset.
8384 DatetimeIndex.indexer_at_time : Get just the index locations for
8385 values at particular time of the day.
8386
8387 Examples
8388 --------
8389 >>> i = pd.date_range('2018-04-09', periods=4, freq='12H')
8390 >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
8391 >>> ts
8392 A
8393 2018-04-09 00:00:00 1
8394 2018-04-09 12:00:00 2
8395 2018-04-10 00:00:00 3
8396 2018-04-10 12:00:00 4
8397
8398 >>> ts.at_time('12:00')
8399 A
8400 2018-04-09 12:00:00 2
8401 2018-04-10 12:00:00 4
8402 """
8403 if axis is None:
8404 axis = self._stat_axis_number
8405 axis = self._get_axis_number(axis)
8406
8407 index = self._get_axis(axis)
8408
8409 if not isinstance(index, DatetimeIndex):
8410 raise TypeError("Index must be DatetimeIndex")
8411
8412 indexer = index.indexer_at_time(time, asof=asof)
8413 return self._take_with_is_copy(indexer, axis=axis)
8414
8415 @final
8416 def between_time(
8417 self: NDFrameT,
8418 start_time,
8419 end_time,
8420 inclusive: IntervalClosedType = "both",
8421 axis: Axis | None = None,
8422 ) -> NDFrameT:
8423 """
8424 Select values between particular times of the day (e.g., 9:00-9:30 AM).
8425
8426 By setting ``start_time`` to be later than ``end_time``,
8427 you can get the times that are *not* between the two times.
8428
8429 Parameters
8430 ----------
8431 start_time : datetime.time or str
8432 Initial time as a time filter limit.
8433 end_time : datetime.time or str
8434 End time as a time filter limit.
8435 inclusive : {"both", "neither", "left", "right"}, default "both"
8436 Include boundaries; whether to set each bound as closed or open.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Determine the time range using the index (0) or the column
            values (1). For `Series` this parameter is unused and defaults to 0.
8440
8441 Returns
8442 -------
8443 Series or DataFrame
8444 Data from the original object filtered to the specified dates range.
8445
8446 Raises
8447 ------
8448 TypeError
8449 If the index is not a :class:`DatetimeIndex`
8450
8451 See Also
8452 --------
8453 at_time : Select values at a particular time of the day.
8454 first : Select initial periods of time series based on a date offset.
8455 last : Select final periods of time series based on a date offset.
8456 DatetimeIndex.indexer_between_time : Get just the index locations for
8457 values between particular times of the day.
8458
8459 Examples
8460 --------
8461 >>> i = pd.date_range('2018-04-09', periods=4, freq='1D20min')
8462 >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
8463 >>> ts
8464 A
8465 2018-04-09 00:00:00 1
8466 2018-04-10 00:20:00 2
8467 2018-04-11 00:40:00 3
8468 2018-04-12 01:00:00 4
8469
8470 >>> ts.between_time('0:15', '0:45')
8471 A
8472 2018-04-10 00:20:00 2
8473 2018-04-11 00:40:00 3
8474
8475 You get the times that are *not* between two times by setting
8476 ``start_time`` later than ``end_time``:
8477
8478 >>> ts.between_time('0:45', '0:15')
8479 A
8480 2018-04-09 00:00:00 1
8481 2018-04-12 01:00:00 4
8482 """
8483 if axis is None:
8484 axis = self._stat_axis_number
8485 axis = self._get_axis_number(axis)
8486
8487 index = self._get_axis(axis)
8488 if not isinstance(index, DatetimeIndex):
8489 raise TypeError("Index must be DatetimeIndex")
8490
8491 left_inclusive, right_inclusive = validate_inclusive(inclusive)
8492 indexer = index.indexer_between_time(
8493 start_time,
8494 end_time,
8495 include_start=left_inclusive,
8496 include_end=right_inclusive,
8497 )
8498 return self._take_with_is_copy(indexer, axis=axis)
8499
8500 @doc(**_shared_doc_kwargs)
8501 def resample(
8502 self,
8503 rule,
8504 axis: Axis = 0,
8505 closed: str | None = None,
8506 label: str | None = None,
8507 convention: str = "start",
8508 kind: str | None = None,
8509 on: Level = None,
8510 level: Level = None,
8511 origin: str | TimestampConvertibleTypes = "start_day",
8512 offset: TimedeltaConvertibleTypes | None = None,
8513 group_keys: bool_t = False,
8514 ) -> Resampler:
8515 """
8516 Resample time-series data.
8517
8518 Convenience method for frequency conversion and resampling of time series.
8519 The object must have a datetime-like index (`DatetimeIndex`, `PeriodIndex`,
8520 or `TimedeltaIndex`), or the caller must pass the label of a datetime-like
8521 series/index to the ``on``/``level`` keyword parameter.
8522
8523 Parameters
8524 ----------
8525 rule : DateOffset, Timedelta or str
8526 The offset string or object representing target conversion.
8527 axis : {{0 or 'index', 1 or 'columns'}}, default 0
            Which axis to use for up- or down-sampling. For `Series` this parameter
            is unused and defaults to 0. The index of the chosen axis must be a
            `DatetimeIndex`, `TimedeltaIndex` or `PeriodIndex`.
8531 closed : {{'right', 'left'}}, default None
8532 Which side of bin interval is closed. The default is 'left'
8533 for all frequency offsets except for 'M', 'A', 'Q', 'BM',
8534 'BA', 'BQ', and 'W' which all have a default of 'right'.
8535 label : {{'right', 'left'}}, default None
8536 Which bin edge label to label bucket with. The default is 'left'
8537 for all frequency offsets except for 'M', 'A', 'Q', 'BM',
8538 'BA', 'BQ', and 'W' which all have a default of 'right'.
8539 convention : {{'start', 'end', 's', 'e'}}, default 'start'
8540 For `PeriodIndex` only, controls whether to use the start or
8541 end of `rule`.
8542 kind : {{'timestamp', 'period'}}, optional, default None
8543 Pass 'timestamp' to convert the resulting index to a
8544 `DateTimeIndex` or 'period' to convert it to a `PeriodIndex`.
8545 By default the input representation is retained.
8547 on : str, optional
8548 For a DataFrame, column to use instead of index for resampling.
8549 Column must be datetime-like.
8550 level : str or int, optional
8551 For a MultiIndex, level (name or number) to use for
8552 resampling. `level` must be datetime-like.
8553 origin : Timestamp or str, default 'start_day'
8554 The timestamp on which to adjust the grouping. The timezone of origin
8555 must match the timezone of the index.
8556 If string, must be one of the following:
8557
8558 - 'epoch': `origin` is 1970-01-01
8559 - 'start': `origin` is the first value of the timeseries
8560 - 'start_day': `origin` is the first day at midnight of the timeseries
8561
8562 .. versionadded:: 1.1.0
8563
8564 - 'end': `origin` is the last value of the timeseries
8565 - 'end_day': `origin` is the ceiling midnight of the last day
8566
8567 .. versionadded:: 1.3.0
8568
        offset : Timedelta or str, default None
            An offset timedelta added to the origin.
8571
8572 .. versionadded:: 1.1.0
8573
8574 group_keys : bool, default False
8575 Whether to include the group keys in the result index when using
8576 ``.apply()`` on the resampled object.
8577
8578 .. versionadded:: 1.5.0
8579
8580 Not specifying ``group_keys`` will retain values-dependent behavior
8581 from pandas 1.4 and earlier (see :ref:`pandas 1.5.0 Release notes
8582 <whatsnew_150.enhancements.resample_group_keys>` for examples).
8583
8584 .. versionchanged:: 2.0.0
8585
8586 ``group_keys`` now defaults to ``False``.
8587
8588 Returns
8589 -------
8590 pandas.core.Resampler
8591 :class:`~pandas.core.Resampler` object.
8592
8593 See Also
8594 --------
8595 Series.resample : Resample a Series.
8596 DataFrame.resample : Resample a DataFrame.
8597 groupby : Group {klass} by mapping, function, label, or list of labels.
8598 asfreq : Reindex a {klass} with the given frequency without grouping.
8599
8600 Notes
8601 -----
8602 See the `user guide
8603 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#resampling>`__
8604 for more.
8605
8606 To learn more about the offset strings, please see `this link
8607 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects>`__.
8608
8609 Examples
8610 --------
8611 Start by creating a series with 9 one minute timestamps.
8612
8613 >>> index = pd.date_range('1/1/2000', periods=9, freq='T')
8614 >>> series = pd.Series(range(9), index=index)
8615 >>> series
8616 2000-01-01 00:00:00 0
8617 2000-01-01 00:01:00 1
8618 2000-01-01 00:02:00 2
8619 2000-01-01 00:03:00 3
8620 2000-01-01 00:04:00 4
8621 2000-01-01 00:05:00 5
8622 2000-01-01 00:06:00 6
8623 2000-01-01 00:07:00 7
8624 2000-01-01 00:08:00 8
8625 Freq: T, dtype: int64
8626
8627 Downsample the series into 3 minute bins and sum the values
8628 of the timestamps falling into a bin.
8629
8630 >>> series.resample('3T').sum()
8631 2000-01-01 00:00:00 3
8632 2000-01-01 00:03:00 12
8633 2000-01-01 00:06:00 21
8634 Freq: 3T, dtype: int64
8635
8636 Downsample the series into 3 minute bins as above, but label each
        bin using the right edge instead of the left. Please note that the
        value in the bucket used as the label is not included in the bucket
        it labels. For example, in the original series the
8640 bucket ``2000-01-01 00:03:00`` contains the value 3, but the summed
8641 value in the resampled bucket with the label ``2000-01-01 00:03:00``
8642 does not include 3 (if it did, the summed value would be 6, not 3).
8643 To include this value close the right side of the bin interval as
8644 illustrated in the example below this one.
8645
8646 >>> series.resample('3T', label='right').sum()
8647 2000-01-01 00:03:00 3
8648 2000-01-01 00:06:00 12
8649 2000-01-01 00:09:00 21
8650 Freq: 3T, dtype: int64
8651
8652 Downsample the series into 3 minute bins as above, but close the right
8653 side of the bin interval.
8654
8655 >>> series.resample('3T', label='right', closed='right').sum()
8656 2000-01-01 00:00:00 0
8657 2000-01-01 00:03:00 6
8658 2000-01-01 00:06:00 15
8659 2000-01-01 00:09:00 15
8660 Freq: 3T, dtype: int64
8661
8662 Upsample the series into 30 second bins.
8663
8664 >>> series.resample('30S').asfreq()[0:5] # Select first 5 rows
8665 2000-01-01 00:00:00 0.0
8666 2000-01-01 00:00:30 NaN
8667 2000-01-01 00:01:00 1.0
8668 2000-01-01 00:01:30 NaN
8669 2000-01-01 00:02:00 2.0
8670 Freq: 30S, dtype: float64
8671
8672 Upsample the series into 30 second bins and fill the ``NaN``
8673 values using the ``ffill`` method.
8674
8675 >>> series.resample('30S').ffill()[0:5]
8676 2000-01-01 00:00:00 0
8677 2000-01-01 00:00:30 0
8678 2000-01-01 00:01:00 1
8679 2000-01-01 00:01:30 1
8680 2000-01-01 00:02:00 2
8681 Freq: 30S, dtype: int64
8682
8683 Upsample the series into 30 second bins and fill the
8684 ``NaN`` values using the ``bfill`` method.
8685
8686 >>> series.resample('30S').bfill()[0:5]
8687 2000-01-01 00:00:00 0
8688 2000-01-01 00:00:30 1
8689 2000-01-01 00:01:00 1
8690 2000-01-01 00:01:30 2
8691 2000-01-01 00:02:00 2
8692 Freq: 30S, dtype: int64
8693
8694 Pass a custom function via ``apply``
8695
8696 >>> def custom_resampler(arraylike):
8697 ... return np.sum(arraylike) + 5
8698 ...
8699 >>> series.resample('3T').apply(custom_resampler)
8700 2000-01-01 00:00:00 8
8701 2000-01-01 00:03:00 17
8702 2000-01-01 00:06:00 26
8703 Freq: 3T, dtype: int64
8704
8705 For a Series with a PeriodIndex, the keyword `convention` can be
8706 used to control whether to use the start or end of `rule`.
8707
8708 Resample a year by quarter using 'start' `convention`. Values are
8709 assigned to the first quarter of the period.
8710
8711 >>> s = pd.Series([1, 2], index=pd.period_range('2012-01-01',
8712 ... freq='A',
8713 ... periods=2))
8714 >>> s
8715 2012 1
8716 2013 2
8717 Freq: A-DEC, dtype: int64
8718 >>> s.resample('Q', convention='start').asfreq()
8719 2012Q1 1.0
8720 2012Q2 NaN
8721 2012Q3 NaN
8722 2012Q4 NaN
8723 2013Q1 2.0
8724 2013Q2 NaN
8725 2013Q3 NaN
8726 2013Q4 NaN
8727 Freq: Q-DEC, dtype: float64
8728
8729 Resample quarters by month using 'end' `convention`. Values are
8730 assigned to the last month of the period.
8731
8732 >>> q = pd.Series([1, 2, 3, 4], index=pd.period_range('2018-01-01',
8733 ... freq='Q',
8734 ... periods=4))
8735 >>> q
8736 2018Q1 1
8737 2018Q2 2
8738 2018Q3 3
8739 2018Q4 4
8740 Freq: Q-DEC, dtype: int64
8741 >>> q.resample('M', convention='end').asfreq()
8742 2018-03 1.0
8743 2018-04 NaN
8744 2018-05 NaN
8745 2018-06 2.0
8746 2018-07 NaN
8747 2018-08 NaN
8748 2018-09 3.0
8749 2018-10 NaN
8750 2018-11 NaN
8751 2018-12 4.0
8752 Freq: M, dtype: float64
8753
8754 For DataFrame objects, the keyword `on` can be used to specify the
8755 column instead of the index for resampling.
8756
8757 >>> d = {{'price': [10, 11, 9, 13, 14, 18, 17, 19],
8758 ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}}
8759 >>> df = pd.DataFrame(d)
8760 >>> df['week_starting'] = pd.date_range('01/01/2018',
8761 ... periods=8,
8762 ... freq='W')
8763 >>> df
8764 price volume week_starting
8765 0 10 50 2018-01-07
8766 1 11 60 2018-01-14
8767 2 9 40 2018-01-21
8768 3 13 100 2018-01-28
8769 4 14 50 2018-02-04
8770 5 18 100 2018-02-11
8771 6 17 40 2018-02-18
8772 7 19 50 2018-02-25
8773 >>> df.resample('M', on='week_starting').mean()
8774 price volume
8775 week_starting
8776 2018-01-31 10.75 62.5
8777 2018-02-28 17.00 60.0
8778
8779 For a DataFrame with MultiIndex, the keyword `level` can be used to
8780 specify on which level the resampling needs to take place.
8781
8782 >>> days = pd.date_range('1/1/2000', periods=4, freq='D')
8783 >>> d2 = {{'price': [10, 11, 9, 13, 14, 18, 17, 19],
8784 ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}}
8785 >>> df2 = pd.DataFrame(
8786 ... d2,
8787 ... index=pd.MultiIndex.from_product(
8788 ... [days, ['morning', 'afternoon']]
8789 ... )
8790 ... )
8791 >>> df2
8792 price volume
8793 2000-01-01 morning 10 50
8794 afternoon 11 60
8795 2000-01-02 morning 9 40
8796 afternoon 13 100
8797 2000-01-03 morning 14 50
8798 afternoon 18 100
8799 2000-01-04 morning 17 40
8800 afternoon 19 50
8801 >>> df2.resample('D', level=0).sum()
8802 price volume
8803 2000-01-01 21 110
8804 2000-01-02 22 140
8805 2000-01-03 32 150
8806 2000-01-04 36 90
8807
8808 If you want to adjust the start of the bins based on a fixed timestamp:
8809
8810 >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00'
8811 >>> rng = pd.date_range(start, end, freq='7min')
8812 >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
8813 >>> ts
8814 2000-10-01 23:30:00 0
8815 2000-10-01 23:37:00 3
8816 2000-10-01 23:44:00 6
8817 2000-10-01 23:51:00 9
8818 2000-10-01 23:58:00 12
8819 2000-10-02 00:05:00 15
8820 2000-10-02 00:12:00 18
8821 2000-10-02 00:19:00 21
8822 2000-10-02 00:26:00 24
8823 Freq: 7T, dtype: int64
8824
8825 >>> ts.resample('17min').sum()
8826 2000-10-01 23:14:00 0
8827 2000-10-01 23:31:00 9
8828 2000-10-01 23:48:00 21
8829 2000-10-02 00:05:00 54
8830 2000-10-02 00:22:00 24
8831 Freq: 17T, dtype: int64
8832
8833 >>> ts.resample('17min', origin='epoch').sum()
8834 2000-10-01 23:18:00 0
8835 2000-10-01 23:35:00 18
8836 2000-10-01 23:52:00 27
8837 2000-10-02 00:09:00 39
8838 2000-10-02 00:26:00 24
8839 Freq: 17T, dtype: int64
8840
8841 >>> ts.resample('17min', origin='2000-01-01').sum()
8842 2000-10-01 23:24:00 3
8843 2000-10-01 23:41:00 15
8844 2000-10-01 23:58:00 45
8845 2000-10-02 00:15:00 45
8846 Freq: 17T, dtype: int64
8847
8848 If you want to adjust the start of the bins with an `offset` Timedelta, the two
8849 following lines are equivalent:
8850
8851 >>> ts.resample('17min', origin='start').sum()
8852 2000-10-01 23:30:00 9
8853 2000-10-01 23:47:00 21
8854 2000-10-02 00:04:00 54
8855 2000-10-02 00:21:00 24
8856 Freq: 17T, dtype: int64
8857
8858 >>> ts.resample('17min', offset='23h30min').sum()
8859 2000-10-01 23:30:00 9
8860 2000-10-01 23:47:00 21
8861 2000-10-02 00:04:00 54
8862 2000-10-02 00:21:00 24
8863 Freq: 17T, dtype: int64
8864
8865 If you want to take the largest Timestamp as the end of the bins:
8866
8867 >>> ts.resample('17min', origin='end').sum()
8868 2000-10-01 23:35:00 0
8869 2000-10-01 23:52:00 18
8870 2000-10-02 00:09:00 27
8871 2000-10-02 00:26:00 63
8872 Freq: 17T, dtype: int64
8873
        In contrast with `start_day`, you can use `end_day` to take the ceiling
8875 midnight of the largest Timestamp as the end of the bins and drop the bins
8876 not containing data:
8877
8878 >>> ts.resample('17min', origin='end_day').sum()
8879 2000-10-01 23:38:00 3
8880 2000-10-01 23:55:00 15
8881 2000-10-02 00:12:00 45
8882 2000-10-02 00:29:00 45
8883 Freq: 17T, dtype: int64
8884 """
8885 from pandas.core.resample import get_resampler
8886
8887 axis = self._get_axis_number(axis)
8888 return get_resampler(
8889 cast("Series | DataFrame", self),
8890 freq=rule,
8891 label=label,
8892 closed=closed,
8893 axis=axis,
8894 kind=kind,
8895 convention=convention,
8896 key=on,
8897 level=level,
8898 origin=origin,
8899 offset=offset,
8900 group_keys=group_keys,
8901 )
8902
8903 @final
8904 def first(self: NDFrameT, offset) -> NDFrameT:
8905 """
8906 Select initial periods of time series data based on a date offset.
8907
8908 For a DataFrame with a sorted DatetimeIndex, this function can
8909 select the first few rows based on a date offset.
8910
8911 Parameters
8912 ----------
8913 offset : str, DateOffset or dateutil.relativedelta
            The offset length of the data that will be selected. For instance,
            '1M' will select all the rows having their index within the first month.
8916
8917 Returns
8918 -------
8919 Series or DataFrame
8920 A subset of the caller.
8921
8922 Raises
8923 ------
8924 TypeError
8925 If the index is not a :class:`DatetimeIndex`
8926
8927 See Also
8928 --------
8929 last : Select final periods of time series based on a date offset.
8930 at_time : Select values at a particular time of the day.
8931 between_time : Select values between particular times of the day.
8932
8933 Examples
8934 --------
8935 >>> i = pd.date_range('2018-04-09', periods=4, freq='2D')
8936 >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
8937 >>> ts
8938 A
8939 2018-04-09 1
8940 2018-04-11 2
8941 2018-04-13 3
8942 2018-04-15 4
8943
8944 Get the rows for the first 3 days:
8945
8946 >>> ts.first('3D')
8947 A
8948 2018-04-09 1
8949 2018-04-11 2
8950
        Notice the data for the first 3 calendar days was returned, not the
        first 3 days observed in the dataset, and therefore data for
        2018-04-13 was not returned.
8954 """
8955 if not isinstance(self.index, DatetimeIndex):
8956 raise TypeError("'first' only supports a DatetimeIndex index")
8957
8958 if len(self.index) == 0:
8959 return self.copy(deep=False)
8960
8961 offset = to_offset(offset)
8962 if not isinstance(offset, Tick) and offset.is_on_offset(self.index[0]):
8963 # GH#29623 if first value is end of period, remove offset with n = 1
8964 # before adding the real offset
8965 end_date = end = self.index[0] - offset.base + offset
8966 else:
8967 end_date = end = self.index[0] + offset
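        # e.g. with the daily index from the docstring example and
        # offset '3D', end = 2018-04-09 + 3 days = 2018-04-12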
8968
        # Tick-like, e.g. 3 days ('3D')
8970 if isinstance(offset, Tick) and end_date in self.index:
8971 end = self.index.searchsorted(end_date, side="left")
8972 return self.iloc[:end]
8973
8974 return self.loc[:end]
8975
8976 @final
8977 def last(self: NDFrameT, offset) -> NDFrameT:
8978 """
8979 Select final periods of time series data based on a date offset.
8980
8981 For a DataFrame with a sorted DatetimeIndex, this function
8982 selects the last few rows based on a date offset.
8983
8984 Parameters
8985 ----------
        offset : str, DateOffset or dateutil.relativedelta
            The offset length of the data that will be selected. For instance,
            '3D' will select all the rows having their index within the last 3 days.
8989
8990 Returns
8991 -------
8992 Series or DataFrame
8993 A subset of the caller.
8994
8995 Raises
8996 ------
8997 TypeError
8998 If the index is not a :class:`DatetimeIndex`
8999
9000 See Also
9001 --------
9002 first : Select initial periods of time series based on a date offset.
9003 at_time : Select values at a particular time of the day.
9004 between_time : Select values between particular times of the day.
9005
9006 Examples
9007 --------
9008 >>> i = pd.date_range('2018-04-09', periods=4, freq='2D')
9009 >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
9010 >>> ts
9011 A
9012 2018-04-09 1
9013 2018-04-11 2
9014 2018-04-13 3
9015 2018-04-15 4
9016
9017 Get the rows for the last 3 days:
9018
9019 >>> ts.last('3D')
9020 A
9021 2018-04-13 3
9022 2018-04-15 4
9023
        Notice the data for the last 3 calendar days was returned, not the
        last 3 observed days in the dataset, and therefore data for
        2018-04-11 was not returned.
9027 """
9028 if not isinstance(self.index, DatetimeIndex):
9029 raise TypeError("'last' only supports a DatetimeIndex index")
9030
9031 if len(self.index) == 0:
9032 return self.copy(deep=False)
9033
9034 offset = to_offset(offset)
9035
9036 start_date = self.index[-1] - offset
9037 start = self.index.searchsorted(start_date, side="right")
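        # side="right" excludes rows whose label equals start_date
        # exactly, keeping only strictly later rows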
9038 return self.iloc[start:]
9039
9040 @final
9041 def rank(
9042 self: NDFrameT,
9043 axis: Axis = 0,
9044 method: str = "average",
9045 numeric_only: bool_t = False,
9046 na_option: str = "keep",
9047 ascending: bool_t = True,
9048 pct: bool_t = False,
9049 ) -> NDFrameT:
9050 """
9051 Compute numerical data ranks (1 through n) along axis.
9052
9053 By default, equal values are assigned a rank that is the average of the
9054 ranks of those values.
9055
9056 Parameters
9057 ----------
9058 axis : {0 or 'index', 1 or 'columns'}, default 0
9059 Index to direct ranking.
9060 For `Series` this parameter is unused and defaults to 0.
9061 method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
9062 How to rank the group of records that have the same value (i.e. ties):
9063
9064 * average: average rank of the group
9065 * min: lowest rank in the group
9066 * max: highest rank in the group
9067 * first: ranks assigned in order they appear in the array
9068 * dense: like 'min', but rank always increases by 1 between groups.
9069
9070 numeric_only : bool, default False
9071 For DataFrame objects, rank only numeric columns if set to True.
9072
9073 .. versionchanged:: 2.0.0
9074 The default value of ``numeric_only`` is now ``False``.
9075
9076 na_option : {'keep', 'top', 'bottom'}, default 'keep'
9077 How to rank NaN values:
9078
9079 * keep: assign NaN rank to NaN values
9080 * top: assign lowest rank to NaN values
9081 * bottom: assign highest rank to NaN values
9082
        ascending : bool, default True
            Whether the elements should be ranked in ascending order.
        pct : bool, default False
            Whether to return the rankings in percentile form
            (i.e. rank divided by the number of ranked values).
9088
9089 Returns
9090 -------
9091 same type as caller
9092 Return a Series or DataFrame with data ranks as values.
9093
9094 See Also
9095 --------
9096 core.groupby.DataFrameGroupBy.rank : Rank of values within each group.
9097 core.groupby.SeriesGroupBy.rank : Rank of values within each group.
9098
9099 Examples
9100 --------
9101 >>> df = pd.DataFrame(data={'Animal': ['cat', 'penguin', 'dog',
9102 ... 'spider', 'snake'],
9103 ... 'Number_legs': [4, 2, 4, 8, np.nan]})
9104 >>> df
9105 Animal Number_legs
9106 0 cat 4.0
9107 1 penguin 2.0
9108 2 dog 4.0
9109 3 spider 8.0
9110 4 snake NaN
9111
9112 Ties are assigned the mean of the ranks (by default) for the group.
9113
9114 >>> s = pd.Series(range(5), index=list("abcde"))
9115 >>> s["d"] = s["b"]
9116 >>> s.rank()
9117 a 1.0
9118 b 2.5
9119 c 4.0
9120 d 2.5
9121 e 5.0
9122 dtype: float64
9123
9124 The following example shows how the method behaves with the above
9125 parameters:
9126
9127 * default_rank: this is the default behaviour obtained without using
9128 any parameter.
        * max_rank: setting ``method = 'max'``, the records that have the
          same values are ranked using the highest rank (e.g. since 'cat'
          and 'dog' are both in the 2nd and 3rd position, rank 3 is assigned).
9132 * NA_bottom: choosing ``na_option = 'bottom'``, if there are records
9133 with NaN values they are placed at the bottom of the ranking.
9134 * pct_rank: when setting ``pct = True``, the ranking is expressed as
9135 percentile rank.
9136
9137 >>> df['default_rank'] = df['Number_legs'].rank()
9138 >>> df['max_rank'] = df['Number_legs'].rank(method='max')
9139 >>> df['NA_bottom'] = df['Number_legs'].rank(na_option='bottom')
9140 >>> df['pct_rank'] = df['Number_legs'].rank(pct=True)
9141 >>> df
9142 Animal Number_legs default_rank max_rank NA_bottom pct_rank
9143 0 cat 4.0 2.5 3.0 2.5 0.625
9144 1 penguin 2.0 1.0 1.0 1.0 0.250
9145 2 dog 4.0 2.5 3.0 2.5 0.625
9146 3 spider 8.0 4.0 4.0 4.0 1.000
9147 4 snake NaN NaN NaN 5.0 NaN
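
        Using ``method='first'`` breaks ties by order of appearance:

        >>> df['Number_legs'].rank(method='first')
        0    2.0
        1    1.0
        2    3.0
        3    4.0
        4    NaN
        Name: Number_legs, dtype: float64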
9148 """
9149 axis_int = self._get_axis_number(axis)
9150
9151 if na_option not in {"keep", "top", "bottom"}:
9152 msg = "na_option must be one of 'keep', 'top', or 'bottom'"
9153 raise ValueError(msg)
9154
9155 def ranker(data):
9156 if data.ndim == 2:
9157 # i.e. DataFrame, we cast to ndarray
9158 values = data.values
9159 else:
9160 # i.e. Series, can dispatch to EA
9161 values = data._values
9162
9163 if isinstance(values, ExtensionArray):
9164 ranks = values._rank(
9165 axis=axis_int,
9166 method=method,
9167 ascending=ascending,
9168 na_option=na_option,
9169 pct=pct,
9170 )
9171 else:
9172 ranks = algos.rank(
9173 values,
9174 axis=axis_int,
9175 method=method,
9176 ascending=ascending,
9177 na_option=na_option,
9178 pct=pct,
9179 )
9180
9181 ranks_obj = self._constructor(ranks, **data._construct_axes_dict())
9182 return ranks_obj.__finalize__(self, method="rank")
9183
9184 if numeric_only:
9185 if self.ndim == 1 and not is_numeric_dtype(self.dtype):
9186 # GH#47500
9187 raise TypeError(
9188 "Series.rank does not allow numeric_only=True with "
9189 "non-numeric dtype."
9190 )
9191 data = self._get_numeric_data()
9192 else:
9193 data = self
9194
9195 return ranker(data)
9196
9197 @doc(_shared_docs["compare"], klass=_shared_doc_kwargs["klass"])
9198 def compare(
9199 self,
9200 other,
9201 align_axis: Axis = 1,
9202 keep_shape: bool_t = False,
9203 keep_equal: bool_t = False,
9204 result_names: Suffixes = ("self", "other"),
9205 ):
9206 if type(self) is not type(other):
9207 cls_self, cls_other = type(self).__name__, type(other).__name__
9208 raise TypeError(
9209 f"can only compare '{cls_self}' (not '{cls_other}') with '{cls_self}'"
9210 )
9211
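        # True where the values differ or exactly one side is NA;
        # positions where both sides are NA count as equal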
9212 mask = ~((self == other) | (self.isna() & other.isna()))
9213 mask.fillna(True, inplace=True)
9214
9215 if not keep_equal:
9216 self = self.where(mask)
9217 other = other.where(mask)
9218
9219 if not keep_shape:
9220 if isinstance(self, ABCDataFrame):
9221 cmask = mask.any()
9222 rmask = mask.any(axis=1)
9223 self = self.loc[rmask, cmask]
9224 other = other.loc[rmask, cmask]
9225 else:
9226 self = self[mask]
9227 other = other[mask]
9228 if not isinstance(result_names, tuple):
9229 raise TypeError(
9230 f"Passing 'result_names' as a {type(result_names)} is not "
9231 "supported. Provide 'result_names' as a tuple instead."
9232 )
9233
9234 if align_axis in (1, "columns"): # This is needed for Series
9235 axis = 1
9236 else:
9237 axis = self._get_axis_number(align_axis)
9238
9239 diff = concat([self, other], axis=axis, keys=result_names)
9240
9241 if axis >= self.ndim:
9242 # No need to reorganize data if stacking on new axis
9243 # This currently applies for stacking two Series on columns
9244 return diff
9245
9246 ax = diff._get_axis(axis)
9247 ax_names = np.array(ax.names)
9248
9249 # set index names to positions to avoid confusion
9250 ax.names = np.arange(len(ax_names))
9251
9252 # bring self-other to inner level
9253 order = list(range(1, ax.nlevels)) + [0]
9254 if isinstance(diff, ABCDataFrame):
9255 diff = diff.reorder_levels(order, axis=axis)
9256 else:
9257 diff = diff.reorder_levels(order)
9258
9259 # restore the index names in order
9260 diff._get_axis(axis=axis).names = ax_names[order]
9261
9262 # reorder axis to keep things organized
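        # e.g. with 2 original labels the concatenated axis has 4
        # entries and the take order is [0, 2, 1, 3], pairing each
        # self entry with its other counterpart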
9263 indices = (
9264 np.arange(diff.shape[axis]).reshape([2, diff.shape[axis] // 2]).T.flatten()
9265 )
9266 diff = diff.take(indices, axis=axis)
9267
9268 return diff
9269
9270 @doc(**_shared_doc_kwargs)
9271 def align(
9272 self: NDFrameT,
9273 other: NDFrameT,
9274 join: AlignJoin = "outer",
9275 axis: Axis | None = None,
9276 level: Level = None,
9277 copy: bool_t | None = None,
9278 fill_value: Hashable = None,
9279 method: FillnaOptions | None = None,
9280 limit: int | None = None,
9281 fill_axis: Axis = 0,
9282 broadcast_axis: Axis | None = None,
9283 ) -> NDFrameT:
9284 """
9285 Align two objects on their axes with the specified join method.
9286
9287 Join method is specified for each axis Index.
9288
9289 Parameters
9290 ----------
9291 other : DataFrame or Series
9292 join : {{'outer', 'inner', 'left', 'right'}}, default 'outer'
9293 axis : allowed axis of the other object, default None
9294 Align on index (0), columns (1), or both (None).
9295 level : int or level name, default None
9296 Broadcast across a level, matching Index values on the
9297 passed MultiIndex level.
9298 copy : bool, default True
9299 Always returns new objects. If copy=False and no reindexing is
9300 required then original objects are returned.
9301 fill_value : scalar, default np.NaN
9302 Value to use for missing values. Defaults to NaN, but can be any
9303 "compatible" value.
9304 method : {{'backfill', 'bfill', 'pad', 'ffill', None}}, default None
9305 Method to use for filling holes in reindexed Series:
9306
9307 - pad / ffill: propagate last valid observation forward to next valid.
9308 - backfill / bfill: use NEXT valid observation to fill gap.
9309
9310 limit : int, default None
9311 If method is specified, this is the maximum number of consecutive
9312 NaN values to forward/backward fill. In other words, if there is
9313 a gap with more than this number of consecutive NaNs, it will only
9314 be partially filled. If method is not specified, this is the
9315 maximum number of entries along the entire axis where NaNs will be
9316 filled. Must be greater than 0 if not None.
        fill_axis : {axes_single_arg}, default 0
            Axis along which to fill missing values when ``method`` is specified.
9319 broadcast_axis : {axes_single_arg}, default None
9320 Broadcast values along this axis, if aligning two objects of
9321 different dimensions.
9322
9323 Returns
9324 -------
9325 tuple of ({klass}, type of other)
9326 Aligned objects.
9327
9328 Examples
9329 --------
9330 >>> df = pd.DataFrame(
9331 ... [[1, 2, 3, 4], [6, 7, 8, 9]], columns=["D", "B", "E", "A"], index=[1, 2]
9332 ... )
9333 >>> other = pd.DataFrame(
9334 ... [[10, 20, 30, 40], [60, 70, 80, 90], [600, 700, 800, 900]],
9335 ... columns=["A", "B", "C", "D"],
9336 ... index=[2, 3, 4],
9337 ... )
9338 >>> df
9339 D B E A
9340 1 1 2 3 4
9341 2 6 7 8 9
9342 >>> other
9343 A B C D
9344 2 10 20 30 40
9345 3 60 70 80 90
9346 4 600 700 800 900
9347
9348 Align on columns:
9349
9350 >>> left, right = df.align(other, join="outer", axis=1)
9351 >>> left
9352 A B C D E
9353 1 4 2 NaN 1 3
9354 2 9 7 NaN 6 8
9355 >>> right
9356 A B C D E
9357 2 10 20 30 40 NaN
9358 3 60 70 80 90 NaN
9359 4 600 700 800 900 NaN
9360
9361 We can also align on the index:
9362
9363 >>> left, right = df.align(other, join="outer", axis=0)
9364 >>> left
9365 D B E A
9366 1 1.0 2.0 3.0 4.0
9367 2 6.0 7.0 8.0 9.0
9368 3 NaN NaN NaN NaN
9369 4 NaN NaN NaN NaN
9370 >>> right
9371 A B C D
9372 1 NaN NaN NaN NaN
9373 2 10.0 20.0 30.0 40.0
9374 3 60.0 70.0 80.0 90.0
9375 4 600.0 700.0 800.0 900.0
9376
9377 Finally, the default `axis=None` will align on both index and columns:
9378
9379 >>> left, right = df.align(other, join="outer", axis=None)
9380 >>> left
9381 A B C D E
9382 1 4.0 2.0 NaN 1.0 3.0
9383 2 9.0 7.0 NaN 6.0 8.0
9384 3 NaN NaN NaN NaN NaN
9385 4 NaN NaN NaN NaN NaN
9386 >>> right
9387 A B C D E
9388 1 NaN NaN NaN NaN NaN
9389 2 10.0 20.0 30.0 40.0 NaN
9390 3 60.0 70.0 80.0 90.0 NaN
9391 4 600.0 700.0 800.0 900.0 NaN
9392 """
9393
9394 method = clean_fill_method(method)
9395
9396 if broadcast_axis == 1 and self.ndim != other.ndim:
9397 if isinstance(self, ABCSeries):
9398 # this means other is a DataFrame, and we need to broadcast
9399 # self
9400 cons = self._constructor_expanddim
9401 df = cons(
9402 {c: self for c in other.columns}, **other._construct_axes_dict()
9403 )
9404 return df._align_frame(
9405 other,
9406 join=join,
9407 axis=axis,
9408 level=level,
9409 copy=copy,
9410 fill_value=fill_value,
9411 method=method,
9412 limit=limit,
9413 fill_axis=fill_axis,
9414 )
9415 elif isinstance(other, ABCSeries):
9416 # this means self is a DataFrame, and we need to broadcast
9417 # other
9418 cons = other._constructor_expanddim
9419 df = cons(
9420 {c: other for c in self.columns}, **self._construct_axes_dict()
9421 )
9422 return self._align_frame(
9423 df,
9424 join=join,
9425 axis=axis,
9426 level=level,
9427 copy=copy,
9428 fill_value=fill_value,
9429 method=method,
9430 limit=limit,
9431 fill_axis=fill_axis,
9432 )
9433
9434 if axis is not None:
9435 axis = self._get_axis_number(axis)
9436 if isinstance(other, ABCDataFrame):
9437 return self._align_frame(
9438 other,
9439 join=join,
9440 axis=axis,
9441 level=level,
9442 copy=copy,
9443 fill_value=fill_value,
9444 method=method,
9445 limit=limit,
9446 fill_axis=fill_axis,
9447 )
9448 elif isinstance(other, ABCSeries):
9449 return self._align_series(
9450 other,
9451 join=join,
9452 axis=axis,
9453 level=level,
9454 copy=copy,
9455 fill_value=fill_value,
9456 method=method,
9457 limit=limit,
9458 fill_axis=fill_axis,
9459 )
9460 else: # pragma: no cover
9461 raise TypeError(f"unsupported type: {type(other)}")
9462
9463 @final
9464 def _align_frame(
9465 self,
9466 other,
9467 join: AlignJoin = "outer",
9468 axis: Axis | None = None,
9469 level=None,
9470 copy: bool_t | None = None,
9471 fill_value=None,
9472 method=None,
9473 limit=None,
9474 fill_axis: Axis = 0,
9475 ):
9476 # defaults
9477 join_index, join_columns = None, None
9478 ilidx, iridx = None, None
9479 clidx, cridx = None, None
9480
9481 is_series = isinstance(self, ABCSeries)
9482
9483 if (axis is None or axis == 0) and not self.index.equals(other.index):
9484 join_index, ilidx, iridx = self.index.join(
9485 other.index, how=join, level=level, return_indexers=True
9486 )
9487
9488 if (
9489 (axis is None or axis == 1)
9490 and not is_series
9491 and not self.columns.equals(other.columns)
9492 ):
9493 join_columns, clidx, cridx = self.columns.join(
9494 other.columns, how=join, level=level, return_indexers=True
9495 )
9496
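        # axes that already match keep a None join index/indexer,
        # which _reindex_with_indexers treats as a no-op for that axis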
9497 if is_series:
9498 reindexers = {0: [join_index, ilidx]}
9499 else:
9500 reindexers = {0: [join_index, ilidx], 1: [join_columns, clidx]}
9501
9502 left = self._reindex_with_indexers(
9503 reindexers, copy=copy, fill_value=fill_value, allow_dups=True
9504 )
9505 # other must be always DataFrame
9506 right = other._reindex_with_indexers(
9507 {0: [join_index, iridx], 1: [join_columns, cridx]},
9508 copy=copy,
9509 fill_value=fill_value,
9510 allow_dups=True,
9511 )
9512
9513 if method is not None:
9514 _left = left.fillna(method=method, axis=fill_axis, limit=limit)
9515 assert _left is not None # needed for mypy
9516 left = _left
9517 right = right.fillna(method=method, axis=fill_axis, limit=limit)
9518
        # if the DatetimeIndexes have different tzs, convert to UTC
9520 left, right = _align_as_utc(left, right, join_index)
9521
9522 return (
9523 left.__finalize__(self),
9524 right.__finalize__(other),
9525 )
9526
9527 @final
9528 def _align_series(
9529 self,
9530 other,
9531 join: AlignJoin = "outer",
9532 axis: Axis | None = None,
9533 level=None,
9534 copy: bool_t | None = None,
9535 fill_value=None,
9536 method=None,
9537 limit=None,
9538 fill_axis: Axis = 0,
9539 ):
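        """
        Align self (a Series or DataFrame) with ``other``, which must be a
        Series: series/series aligns on the index, while frame/series joins
        the axis selected by ``axis`` with the Series' index.
        Returns a tuple of the aligned (left, right) objects.
        """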
9540 is_series = isinstance(self, ABCSeries)
9541 if copy and using_copy_on_write():
9542 copy = False
9543
9544 if (not is_series and axis is None) or axis not in [None, 0, 1]:
9545 raise ValueError("Must specify axis=0 or 1")
9546
9547 if is_series and axis == 1:
9548 raise ValueError("cannot align series to a series other than axis 0")
9549
9550 # series/series compat, other must always be a Series
9551 if not axis:
9552 # equal
9553 if self.index.equals(other.index):
9554 join_index, lidx, ridx = None, None, None
9555 else:
9556 join_index, lidx, ridx = self.index.join(
9557 other.index, how=join, level=level, return_indexers=True
9558 )
9559
9560 if is_series:
9561 left = self._reindex_indexer(join_index, lidx, copy)
9562 elif lidx is None or join_index is None:
9563 left = self.copy(deep=copy)
9564 else:
9565 left = self._constructor(
9566 self._mgr.reindex_indexer(join_index, lidx, axis=1, copy=copy)
9567 )
9568
9569 right = other._reindex_indexer(join_index, ridx, copy)
9570
9571 else:
9572 # one has > 1 ndim
9573 fdata = self._mgr
9574 join_index = self.axes[1]
9575 lidx, ridx = None, None
9576 if not join_index.equals(other.index):
9577 join_index, lidx, ridx = join_index.join(
9578 other.index, how=join, level=level, return_indexers=True
9579 )
9580
9581 if lidx is not None:
9582 bm_axis = self._get_block_manager_axis(1)
9583 fdata = fdata.reindex_indexer(join_index, lidx, axis=bm_axis)
9584
9585 if copy and fdata is self._mgr:
9586 fdata = fdata.copy()
9587
9588 left = self._constructor(fdata)
9589
9590 if ridx is None:
9591 right = other.copy(deep=copy)
9592 else:
9593 right = other.reindex(join_index, level=level)
9594
9595 # fill
9596 fill_na = notna(fill_value) or (method is not None)
9597 if fill_na:
9598 left = left.fillna(fill_value, method=method, limit=limit, axis=fill_axis)
9599 right = right.fillna(fill_value, method=method, limit=limit)
9600
        # if the DatetimeIndexes have different tzs, convert to UTC
        if is_series or axis == 0:
9603 left, right = _align_as_utc(left, right, join_index)
9604
9605 return (
9606 left.__finalize__(self),
9607 right.__finalize__(other),
9608 )
9609
9610 @final
9611 def _where(
9612 self,
9613 cond,
9614 other=lib.no_default,
9615 inplace: bool_t = False,
9616 axis: Axis | None = None,
9617 level=None,
9618 ):
9619 """
9620 Equivalent to public method `where`, except that `other` is not
9621 applied as a function even if callable. Used in __setitem__.
9622 """
9623 inplace = validate_bool_kwarg(inplace, "inplace")
9624
9625 if axis is not None:
9626 axis = self._get_axis_number(axis)
9627
        # align cond to the same shape as self
9629 cond = common.apply_if_callable(cond, self)
9630 if isinstance(cond, NDFrame):
9631 # CoW: Make sure reference is not kept alive
9632 cond = cond.align(self, join="right", broadcast_axis=1, copy=False)[0]
9633 else:
9634 if not hasattr(cond, "shape"):
9635 cond = np.asanyarray(cond)
9636 if cond.shape != self.shape:
9637 raise ValueError("Array conditional must be same shape as self")
9638 cond = self._constructor(cond, **self._construct_axes_dict(), copy=False)
9639
9640 # make sure we are boolean
9641 fill_value = bool(inplace)
9642 cond = cond.fillna(fill_value)
9643
9644 msg = "Boolean array expected for the condition, not {dtype}"
9645
9646 if not cond.empty:
9647 if not isinstance(cond, ABCDataFrame):
9648 # This is a single-dimensional object.
9649 if not is_bool_dtype(cond):
9650 raise ValueError(msg.format(dtype=cond.dtype))
9651 else:
9652 for _dt in cond.dtypes:
9653 if not is_bool_dtype(_dt):
9654 raise ValueError(msg.format(dtype=_dt))
9655 else:
9656 # GH#21947 we have an empty DataFrame/Series, could be object-dtype
9657 cond = cond.astype(bool)
9658
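        # for the inplace path the condition is inverted below: putmask
        # replaces entries where the mask is True, whereas `where` keeps
        # entries where the condition is True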
        cond = ~cond if inplace else cond
9660 cond = cond.reindex(self._info_axis, axis=self._info_axis_number, copy=False)
9661
9662 # try to align with other
9663 if isinstance(other, NDFrame):
9664 # align with me
9665 if other.ndim <= self.ndim:
9666 # CoW: Make sure reference is not kept alive
9667 other = self.align(
9668 other,
9669 join="left",
9670 axis=axis,
9671 level=level,
9672 fill_value=None,
9673 copy=False,
9674 )[1]
9675
9676 # if we are NOT aligned, raise as we cannot where index
9677 if axis is None and not other._indexed_same(self):
9678 raise InvalidIndexError
9679
9680 if other.ndim < self.ndim:
9681 # TODO(EA2D): avoid object-dtype cast in EA case GH#38729
9682 other = other._values
9683 if axis == 0:
9684 other = np.reshape(other, (-1, 1))
9685 elif axis == 1:
9686 other = np.reshape(other, (1, -1))
9687
9688 other = np.broadcast_to(other, self.shape)
9689
9690 # slice me out of the other
9691 else:
9692 raise NotImplementedError(
9693 "cannot align with a higher dimensional NDFrame"
9694 )
9695
9696 elif not isinstance(other, (MultiIndex, NDFrame)):
9697 # mainly just catching Index here
9698 other = extract_array(other, extract_numpy=True)
9699
9700 if isinstance(other, (np.ndarray, ExtensionArray)):
9701 if other.shape != self.shape:
9702 if self.ndim != 1:
9703 # In the ndim == 1 case we may have
9704 # other length 1, which we treat as scalar (GH#2745, GH#4192)
9705 # or len(other) == icond.sum(), which we treat like
9706 # __setitem__ (GH#3235)
9707 raise ValueError(
9708 "other must be the same shape as self when an ndarray"
9709 )
9710
9711 # we are the same shape, so create an actual object for alignment
9712 else:
9713 other = self._constructor(
9714 other, **self._construct_axes_dict(), copy=False
9715 )
9716
9717 if axis is None:
9718 axis = 0
9719
9720 if self.ndim == getattr(other, "ndim", 0):
9721 align = True
9722 else:
9723 align = self._get_axis_number(axis) == 1
9724
9725 if inplace:
9726 # we may have different type blocks come out of putmask, so
9727 # reconstruct the block manager
9728
9729 self._check_inplace_setting(other)
9730 new_data = self._mgr.putmask(mask=cond, new=other, align=align)
9731 result = self._constructor(new_data)
9732 return self._update_inplace(result)
9733
9734 else:
9735 new_data = self._mgr.where(
9736 other=other,
9737 cond=cond,
9738 align=align,
9739 )
9740 result = self._constructor(new_data)
9741 return result.__finalize__(self)
9742
9743 @overload
9744 def where(
9745 self: NDFrameT,
9746 cond,
9747 other=...,
9748 *,
9749 inplace: Literal[False] = ...,
9750 axis: Axis | None = ...,
9751 level: Level = ...,
9752 ) -> NDFrameT:
9753 ...
9754
9755 @overload
9756 def where(
9757 self,
9758 cond,
9759 other=...,
9760 *,
9761 inplace: Literal[True],
9762 axis: Axis | None = ...,
9763 level: Level = ...,
9764 ) -> None:
9765 ...
9766
9767 @overload
9768 def where(
9769 self: NDFrameT,
9770 cond,
9771 other=...,
9772 *,
9773 inplace: bool_t = ...,
9774 axis: Axis | None = ...,
9775 level: Level = ...,
9776 ) -> NDFrameT | None:
9777 ...
9778
9779 @doc(
9780 klass=_shared_doc_kwargs["klass"],
9781 cond="True",
9782 cond_rev="False",
9783 name="where",
9784 name_other="mask",
9785 )
9786 def where(
9787 self: NDFrameT,
9788 cond,
9789 other=np.nan,
9790 *,
9791 inplace: bool_t = False,
9792 axis: Axis | None = None,
9793 level: Level = None,
9794 ) -> NDFrameT | None:
9795 """
9796 Replace values where the condition is {cond_rev}.
9797
9798 Parameters
9799 ----------
9800 cond : bool {klass}, array-like, or callable
9801 Where `cond` is {cond}, keep the original value. Where
9802 {cond_rev}, replace with corresponding value from `other`.
9803 If `cond` is callable, it is computed on the {klass} and
9804 should return boolean {klass} or array. The callable must
9805 not change input {klass} (though pandas doesn't check it).
9806 other : scalar, {klass}, or callable
9807 Entries where `cond` is {cond_rev} are replaced with
9808 corresponding value from `other`.
9809 If other is callable, it is computed on the {klass} and
9810 should return scalar or {klass}. The callable must not
9811 change input {klass} (though pandas doesn't check it).
9812 If not specified, entries will be filled with the corresponding
9813 NULL value (``np.nan`` for numpy dtypes, ``pd.NA`` for extension
9814 dtypes).
9815 inplace : bool, default False
9816 Whether to perform the operation in place on the data.
9817 axis : int, default None
9818 Alignment axis if needed. For `Series` this parameter is
9819 unused and defaults to 0.
9820 level : int, default None
9821 Alignment level if needed.
9822
9823 Returns
9824 -------
9825 Same type as caller or None if ``inplace=True``.
9826
9827 See Also
9828 --------
9829 :func:`DataFrame.{name_other}` : Return an object of same shape as
9830 self.
9831
9832 Notes
9833 -----
9834 The {name} method is an application of the if-then idiom. For each
9835 element in the calling DataFrame, if ``cond`` is ``{cond}`` the
9836 element is used; otherwise the corresponding element from the DataFrame
9837 ``other`` is used. If the axis of ``other`` does not align with axis of
9838 ``cond`` {klass}, the misaligned index positions will be filled with
9839 {cond_rev}.
9840
9841 The signature for :func:`DataFrame.where` differs from
9842 :func:`numpy.where`. Roughly ``df1.where(m, df2)`` is equivalent to
9843 ``np.where(m, df1, df2)``.
9844
9845 For further details and examples see the ``{name}`` documentation in
9846 :ref:`indexing <indexing.where_mask>`.
9847
        The dtype of the object takes precedence. The fill value is cast to
        the object's dtype, if this can be done losslessly.
9850
9851 Examples
9852 --------
9853 >>> s = pd.Series(range(5))
9854 >>> s.where(s > 0)
9855 0 NaN
9856 1 1.0
9857 2 2.0
9858 3 3.0
9859 4 4.0
9860 dtype: float64
9861 >>> s.mask(s > 0)
9862 0 0.0
9863 1 NaN
9864 2 NaN
9865 3 NaN
9866 4 NaN
9867 dtype: float64
9868
9869 >>> s = pd.Series(range(5))
9870 >>> t = pd.Series([True, False])
9871 >>> s.where(t, 99)
9872 0 0
9873 1 99
9874 2 99
9875 3 99
9876 4 99
9877 dtype: int64
9878 >>> s.mask(t, 99)
9879 0 99
9880 1 1
9881 2 99
9882 3 99
9883 4 99
9884 dtype: int64
9885
9886 >>> s.where(s > 1, 10)
9887 0 10
9888 1 10
9889 2 2
9890 3 3
9891 4 4
9892 dtype: int64
9893 >>> s.mask(s > 1, 10)
9894 0 0
9895 1 1
9896 2 10
9897 3 10
9898 4 10
9899 dtype: int64
9900
9901 >>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
9902 >>> df
9903 A B
9904 0 0 1
9905 1 2 3
9906 2 4 5
9907 3 6 7
9908 4 8 9
9909 >>> m = df % 3 == 0
9910 >>> df.where(m, -df)
9911 A B
9912 0 0 -1
9913 1 -2 3
9914 2 -4 -5
9915 3 6 -7
9916 4 -8 9
9917 >>> df.where(m, -df) == np.where(m, df, -df)
9918 A B
9919 0 True True
9920 1 True True
9921 2 True True
9922 3 True True
9923 4 True True
9924 >>> df.where(m, -df) == df.mask(~m, -df)
9925 A B
9926 0 True True
9927 1 True True
9928 2 True True
9929 3 True True
9930 4 True True
9931 """
9932 other = common.apply_if_callable(other, self)
9933 return self._where(cond, other, inplace, axis, level)
9934
9935 @overload
9936 def mask(
9937 self: NDFrameT,
9938 cond,
9939 other=...,
9940 *,
9941 inplace: Literal[False] = ...,
9942 axis: Axis | None = ...,
9943 level: Level = ...,
9944 ) -> NDFrameT:
9945 ...
9946
9947 @overload
9948 def mask(
9949 self,
9950 cond,
9951 other=...,
9952 *,
9953 inplace: Literal[True],
9954 axis: Axis | None = ...,
9955 level: Level = ...,
9956 ) -> None:
9957 ...
9958
9959 @overload
9960 def mask(
9961 self: NDFrameT,
9962 cond,
9963 other=...,
9964 *,
9965 inplace: bool_t = ...,
9966 axis: Axis | None = ...,
9967 level: Level = ...,
9968 ) -> NDFrameT | None:
9969 ...
9970
9971 @doc(
9972 where,
9973 klass=_shared_doc_kwargs["klass"],
9974 cond="False",
9975 cond_rev="True",
9976 name="mask",
9977 name_other="where",
9978 )
9979 def mask(
9980 self: NDFrameT,
9981 cond,
9982 other=lib.no_default,
9983 *,
9984 inplace: bool_t = False,
9985 axis: Axis | None = None,
9986 level: Level = None,
9987 ) -> NDFrameT | None:
9988 inplace = validate_bool_kwarg(inplace, "inplace")
9989 cond = common.apply_if_callable(cond, self)
9990
9991 # see gh-21891
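        # list-likes such as plain Python lists have no ``__invert__``, so
        # coerce to an ndarray before applying ``~`` below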
9992 if not hasattr(cond, "__invert__"):
9993 cond = np.array(cond)
9994
9995 return self.where(
9996 ~cond,
9997 other=other,
9998 inplace=inplace,
9999 axis=axis,
10000 level=level,
10001 )
10002
10003 @doc(klass=_shared_doc_kwargs["klass"])
10004 def shift(
10005 self: NDFrameT,
10006 periods: int = 1,
10007 freq=None,
10008 axis: Axis = 0,
10009 fill_value: Hashable = None,
10010 ) -> NDFrameT:
10011 """
10012 Shift index by desired number of periods with an optional time `freq`.
10013
10014 When `freq` is not passed, shift the index without realigning the data.
10015 If `freq` is passed (in this case, the index must be date or datetime,
10016 or it will raise a `NotImplementedError`), the index will be
        increased using the periods and the `freq`. `freq` can be inferred
        when specified as "infer", as long as either the freq or inferred_freq
        attribute is set on the index.
10020
10021 Parameters
10022 ----------
10023 periods : int
10024 Number of periods to shift. Can be positive or negative.
10025 freq : DateOffset, tseries.offsets, timedelta, or str, optional
10026 Offset to use from the tseries module or time rule (e.g. 'EOM').
10027 If `freq` is specified then the index values are shifted but the
10028 data is not realigned. That is, use `freq` if you would like to
10029 extend the index when shifting and preserve the original data.
            If `freq` is specified as "infer" then it will be inferred from
            the freq or inferred_freq attributes of the index. If neither of
            those attributes exists, a ValueError is raised.
        axis : {{0 or 'index', 1 or 'columns', None}}, default 0
10034 Shift direction. For `Series` this parameter is unused and defaults to 0.
10035 fill_value : object, optional
10036 The scalar value to use for newly introduced missing values.
            The default depends on the dtype of `self`.
10038 For numeric data, ``np.nan`` is used.
10039 For datetime, timedelta, or period data, etc. :attr:`NaT` is used.
10040 For extension dtypes, ``self.dtype.na_value`` is used.
10041
10042 .. versionchanged:: 1.1.0
10043
10044 Returns
10045 -------
10046 {klass}
10047 Copy of input object, shifted.
10048
10049 See Also
10050 --------
10051 Index.shift : Shift values of Index.
10052 DatetimeIndex.shift : Shift values of DatetimeIndex.
10053 PeriodIndex.shift : Shift values of PeriodIndex.
10054
10055 Examples
10056 --------
10057 >>> df = pd.DataFrame({{"Col1": [10, 20, 15, 30, 45],
10058 ... "Col2": [13, 23, 18, 33, 48],
10059 ... "Col3": [17, 27, 22, 37, 52]}},
10060 ... index=pd.date_range("2020-01-01", "2020-01-05"))
10061 >>> df
10062 Col1 Col2 Col3
10063 2020-01-01 10 13 17
10064 2020-01-02 20 23 27
10065 2020-01-03 15 18 22
10066 2020-01-04 30 33 37
10067 2020-01-05 45 48 52
10068
10069 >>> df.shift(periods=3)
10070 Col1 Col2 Col3
10071 2020-01-01 NaN NaN NaN
10072 2020-01-02 NaN NaN NaN
10073 2020-01-03 NaN NaN NaN
10074 2020-01-04 10.0 13.0 17.0
10075 2020-01-05 20.0 23.0 27.0
10076
10077 >>> df.shift(periods=1, axis="columns")
10078 Col1 Col2 Col3
10079 2020-01-01 NaN 10 13
10080 2020-01-02 NaN 20 23
10081 2020-01-03 NaN 15 18
10082 2020-01-04 NaN 30 33
10083 2020-01-05 NaN 45 48
10084
10085 >>> df.shift(periods=3, fill_value=0)
10086 Col1 Col2 Col3
10087 2020-01-01 0 0 0
10088 2020-01-02 0 0 0
10089 2020-01-03 0 0 0
10090 2020-01-04 10 13 17
10091 2020-01-05 20 23 27
10092
10093 >>> df.shift(periods=3, freq="D")
10094 Col1 Col2 Col3
10095 2020-01-04 10 13 17
10096 2020-01-05 20 23 27
10097 2020-01-06 15 18 22
10098 2020-01-07 30 33 37
10099 2020-01-08 45 48 52
10100
10101 >>> df.shift(periods=3, freq="infer")
10102 Col1 Col2 Col3
10103 2020-01-04 10 13 17
10104 2020-01-05 20 23 27
10105 2020-01-06 15 18 22
10106 2020-01-07 30 33 37
10107 2020-01-08 45 48 52
10108 """
10109 if periods == 0:
10110 return self.copy(deep=None)
10111
10112 if freq is None:
10113 # when freq is None, data is shifted, index is not
10114 axis = self._get_axis_number(axis)
10115 new_data = self._mgr.shift(
10116 periods=periods, axis=axis, fill_value=fill_value
10117 )
10118 return self._constructor(new_data).__finalize__(self, method="shift")
10119
10120 # when freq is given, index is shifted, data is not
10121 index = self._get_axis(axis)
10122
10123 if freq == "infer":
10124 freq = getattr(index, "freq", None)
10125
10126 if freq is None:
10127 freq = getattr(index, "inferred_freq", None)
10128
10129 if freq is None:
10130 msg = "Freq was not set in the index hence cannot be inferred"
10131 raise ValueError(msg)
10132
10133 elif isinstance(freq, str):
10134 freq = to_offset(freq)
10135
10136 if isinstance(index, PeriodIndex):
10137 orig_freq = to_offset(index.freq)
10138 if freq != orig_freq:
10139 assert orig_freq is not None # for mypy
10140 raise ValueError(
10141 f"Given freq {freq.rule_code} does not match "
10142 f"PeriodIndex freq {orig_freq.rule_code}"
10143 )
10144 new_ax = index.shift(periods)
10145 else:
10146 new_ax = index.shift(periods, freq)
10147
10148 result = self.set_axis(new_ax, axis=axis)
10149 return result.__finalize__(self, method="shift")
10150
10151 def truncate(
10152 self: NDFrameT,
10153 before=None,
10154 after=None,
10155 axis: Axis | None = None,
10156 copy: bool_t | None = None,
10157 ) -> NDFrameT:
10158 """
10159 Truncate a Series or DataFrame before and after some index value.
10160
10161 This is a useful shorthand for boolean indexing based on index
10162 values above or below certain thresholds.
10163
10164 Parameters
10165 ----------
10166 before : date, str, int
10167 Truncate all rows before this index value.
10168 after : date, str, int
10169 Truncate all rows after this index value.
10170 axis : {0 or 'index', 1 or 'columns'}, optional
10171 Axis to truncate. Truncates the index (rows) by default.
10172 For `Series` this parameter is unused and defaults to 0.
        copy : bool, default True
10174 Return a copy of the truncated section.
10175
10176 Returns
10177 -------
10178 type of caller
10179 The truncated Series or DataFrame.
10180
10181 See Also
10182 --------
10183 DataFrame.loc : Select a subset of a DataFrame by label.
10184 DataFrame.iloc : Select a subset of a DataFrame by position.
10185
10186 Notes
10187 -----
10188 If the index being truncated contains only datetime values,
10189 `before` and `after` may be specified as strings instead of
10190 Timestamps.
10191
10192 Examples
10193 --------
10194 >>> df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e'],
10195 ... 'B': ['f', 'g', 'h', 'i', 'j'],
10196 ... 'C': ['k', 'l', 'm', 'n', 'o']},
10197 ... index=[1, 2, 3, 4, 5])
10198 >>> df
10199 A B C
10200 1 a f k
10201 2 b g l
10202 3 c h m
10203 4 d i n
10204 5 e j o
10205
10206 >>> df.truncate(before=2, after=4)
10207 A B C
10208 2 b g l
10209 3 c h m
10210 4 d i n
10211
10212 The columns of a DataFrame can be truncated.
10213
10214 >>> df.truncate(before="A", after="B", axis="columns")
10215 A B
10216 1 a f
10217 2 b g
10218 3 c h
10219 4 d i
10220 5 e j
10221
10222 For Series, only rows can be truncated.
10223
10224 >>> df['A'].truncate(before=2, after=4)
10225 2 b
10226 3 c
10227 4 d
10228 Name: A, dtype: object
10229
10230 The index values in ``truncate`` can be datetimes or string
10231 dates.
10232
10233 >>> dates = pd.date_range('2016-01-01', '2016-02-01', freq='s')
10234 >>> df = pd.DataFrame(index=dates, data={'A': 1})
10235 >>> df.tail()
10236 A
10237 2016-01-31 23:59:56 1
10238 2016-01-31 23:59:57 1
10239 2016-01-31 23:59:58 1
10240 2016-01-31 23:59:59 1
10241 2016-02-01 00:00:00 1
10242
10243 >>> df.truncate(before=pd.Timestamp('2016-01-05'),
10244 ... after=pd.Timestamp('2016-01-10')).tail()
10245 A
10246 2016-01-09 23:59:56 1
10247 2016-01-09 23:59:57 1
10248 2016-01-09 23:59:58 1
10249 2016-01-09 23:59:59 1
10250 2016-01-10 00:00:00 1
10251
10252 Because the index is a DatetimeIndex containing only dates, we can
10253 specify `before` and `after` as strings. They will be coerced to
10254 Timestamps before truncation.
10255
10256 >>> df.truncate('2016-01-05', '2016-01-10').tail()
10257 A
10258 2016-01-09 23:59:56 1
10259 2016-01-09 23:59:57 1
10260 2016-01-09 23:59:58 1
10261 2016-01-09 23:59:59 1
10262 2016-01-10 00:00:00 1
10263
10264 Note that ``truncate`` assumes a 0 value for any unspecified time
10265 component (midnight). This differs from partial string slicing, which
10266 returns any partially matching dates.
10267
10268 >>> df.loc['2016-01-05':'2016-01-10', :].tail()
10269 A
10270 2016-01-10 23:59:55 1
10271 2016-01-10 23:59:56 1
10272 2016-01-10 23:59:57 1
10273 2016-01-10 23:59:58 1
10274 2016-01-10 23:59:59 1
10275 """
10276 if axis is None:
10277 axis = self._stat_axis_number
10278 axis = self._get_axis_number(axis)
10279 ax = self._get_axis(axis)
10280
10281 # GH 17935
10282 # Check that index is sorted
10283 if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing:
10284 raise ValueError("truncate requires a sorted index")
10285
10286 # if we have a date index, convert to dates, otherwise
10287 # treat like a slice
10288 if ax._is_all_dates:
10289 from pandas.core.tools.datetimes import to_datetime
10290
10291 before = to_datetime(before)
10292 after = to_datetime(after)
10293
10294 if before is not None and after is not None and before > after:
10295 raise ValueError(f"Truncate: {after} must be after {before}")
10296
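        # for a monotonically decreasing index, swap the bounds so that the
        # label-based slice below still selects the values between them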
10297 if len(ax) > 1 and ax.is_monotonic_decreasing and ax.nunique() > 1:
10298 before, after = after, before
10299
10300 slicer = [slice(None, None)] * self._AXIS_LEN
10301 slicer[axis] = slice(before, after)
10302 result = self.loc[tuple(slicer)]
10303
10304 if isinstance(ax, MultiIndex):
10305 setattr(result, self._get_axis_name(axis), ax.truncate(before, after))
10306
10307 result = result.copy(deep=copy and not using_copy_on_write())
10308
10309 return result
10310
10311 @final
10312 @doc(klass=_shared_doc_kwargs["klass"])
10313 def tz_convert(
10314 self: NDFrameT, tz, axis: Axis = 0, level=None, copy: bool_t | None = None
10315 ) -> NDFrameT:
10316 """
10317 Convert tz-aware axis to target time zone.
10318
10319 Parameters
10320 ----------
10321 tz : str or tzinfo object or None
10322 Target time zone. Passing ``None`` will convert to
10323 UTC and remove the timezone information.
10324 axis : {{0 or 'index', 1 or 'columns'}}, default 0
            The axis to convert.
10326 level : int, str, default None
10327 If axis is a MultiIndex, convert a specific level. Otherwise
10328 must be None.
10329 copy : bool, default True
10330 Also make a copy of the underlying data.
10331
10332 Returns
10333 -------
10334 {klass}
10335 Object with time zone converted axis.
10336
10337 Raises
10338 ------
10339 TypeError
10340 If the axis is tz-naive.
10341
10342 Examples
10343 --------
10344 Change to another time zone:
10345
10346 >>> s = pd.Series(
10347 ... [1],
10348 ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00']),
10349 ... )
10350 >>> s.tz_convert('Asia/Shanghai')
10351 2018-09-15 07:30:00+08:00 1
10352 dtype: int64
10353
10354 Pass None to convert to UTC and get a tz-naive index:
10355
10356 >>> s = pd.Series([1],
10357 ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00']))
10358 >>> s.tz_convert(None)
10359 2018-09-14 23:30:00 1
10360 dtype: int64
10361 """
10362 axis = self._get_axis_number(axis)
10363 ax = self._get_axis(axis)
10364
10365 def _tz_convert(ax, tz):
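            # convert a single axis; an empty axis without tz support is
            # replaced by an empty tz-aware DatetimeIndex instead of raising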
10366 if not hasattr(ax, "tz_convert"):
10367 if len(ax) > 0:
10368 ax_name = self._get_axis_name(axis)
10369 raise TypeError(
10370 f"{ax_name} is not a valid DatetimeIndex or PeriodIndex"
10371 )
10372 ax = DatetimeIndex([], tz=tz)
10373 else:
10374 ax = ax.tz_convert(tz)
10375 return ax
10376
10377 # if a level is given it must be a MultiIndex level or
10378 # equivalent to the axis name
10379 if isinstance(ax, MultiIndex):
10380 level = ax._get_level_number(level)
10381 new_level = _tz_convert(ax.levels[level], tz)
10382 ax = ax.set_levels(new_level, level=level)
10383 else:
10384 if level not in (None, 0, ax.name):
10385 raise ValueError(f"The level {level} is not valid")
10386 ax = _tz_convert(ax, tz)
10387
10388 result = self.copy(deep=copy and not using_copy_on_write())
10389 result = result.set_axis(ax, axis=axis, copy=False)
10390 return result.__finalize__(self, method="tz_convert")
10391
10392 @final
10393 @doc(klass=_shared_doc_kwargs["klass"])
10394 def tz_localize(
10395 self: NDFrameT,
10396 tz,
10397 axis: Axis = 0,
10398 level=None,
10399 copy: bool_t | None = None,
10400 ambiguous: TimeAmbiguous = "raise",
10401 nonexistent: TimeNonexistent = "raise",
10402 ) -> NDFrameT:
10403 """
10404 Localize tz-naive index of a Series or DataFrame to target time zone.
10405
10406 This operation localizes the Index. To localize the values in a
10407 timezone-naive Series, use :meth:`Series.dt.tz_localize`.
10408
10409 Parameters
10410 ----------
10411 tz : str or tzinfo or None
10412 Time zone to localize. Passing ``None`` will remove the
10413 time zone information and preserve local time.
10414 axis : {{0 or 'index', 1 or 'columns'}}, default 0
            The axis to localize.
10416 level : int, str, default None
            If axis is a MultiIndex, localize a specific level. Otherwise
10418 must be None.
10419 copy : bool, default True
10420 Also make a copy of the underlying data.
10421 ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise'
10422 When clocks moved backward due to DST, ambiguous times may arise.
10423 For example in Central European Time (UTC+01), when going from
10424 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at
10425 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the
10426 `ambiguous` parameter dictates how ambiguous times should be
10427 handled.
10428
10429 - 'infer' will attempt to infer fall dst-transition hours based on
10430 order
10431 - bool-ndarray where True signifies a DST time, False designates
10432 a non-DST time (note that this flag is only applicable for
10433 ambiguous times)
10434 - 'NaT' will return NaT where there are ambiguous times
10435 - 'raise' will raise an AmbiguousTimeError if there are ambiguous
10436 times.
10437 nonexistent : str, default 'raise'
10438 A nonexistent time does not exist in a particular timezone
10439 where clocks moved forward due to DST. Valid values are:
10440
10441 - 'shift_forward' will shift the nonexistent time forward to the
10442 closest existing time
10443 - 'shift_backward' will shift the nonexistent time backward to the
10444 closest existing time
10445 - 'NaT' will return NaT where there are nonexistent times
10446 - timedelta objects will shift nonexistent times by the timedelta
            - 'raise' will raise a NonExistentTimeError if there are
10448 nonexistent times.
10449
10450 Returns
10451 -------
10452 {klass}
10453 Same type as the input.
10454
10455 Raises
10456 ------
10457 TypeError
10458 If the TimeSeries is tz-aware and tz is not None.
10459
10460 Examples
10461 --------
10462 Localize local times:
10463
10464 >>> s = pd.Series(
10465 ... [1],
10466 ... index=pd.DatetimeIndex(['2018-09-15 01:30:00']),
10467 ... )
10468 >>> s.tz_localize('CET')
10469 2018-09-15 01:30:00+02:00 1
10470 dtype: int64
10471
10472 Pass None to convert to tz-naive index and preserve local time:
10473
10474 >>> s = pd.Series([1],
10475 ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00']))
10476 >>> s.tz_localize(None)
10477 2018-09-15 01:30:00 1
10478 dtype: int64
10479
10480 Be careful with DST changes. When there is sequential data, pandas
10481 can infer the DST time:
10482
10483 >>> s = pd.Series(range(7),
10484 ... index=pd.DatetimeIndex(['2018-10-28 01:30:00',
10485 ... '2018-10-28 02:00:00',
10486 ... '2018-10-28 02:30:00',
10487 ... '2018-10-28 02:00:00',
10488 ... '2018-10-28 02:30:00',
10489 ... '2018-10-28 03:00:00',
10490 ... '2018-10-28 03:30:00']))
10491 >>> s.tz_localize('CET', ambiguous='infer')
10492 2018-10-28 01:30:00+02:00 0
10493 2018-10-28 02:00:00+02:00 1
10494 2018-10-28 02:30:00+02:00 2
10495 2018-10-28 02:00:00+01:00 3
10496 2018-10-28 02:30:00+01:00 4
10497 2018-10-28 03:00:00+01:00 5
10498 2018-10-28 03:30:00+01:00 6
10499 dtype: int64
10500
10501 In some cases, inferring the DST is impossible. In such cases, you can
        pass an ndarray to the ambiguous parameter to set the DST explicitly.
10503
10504 >>> s = pd.Series(range(3),
10505 ... index=pd.DatetimeIndex(['2018-10-28 01:20:00',
10506 ... '2018-10-28 02:36:00',
10507 ... '2018-10-28 03:46:00']))
10508 >>> s.tz_localize('CET', ambiguous=np.array([True, True, False]))
10509 2018-10-28 01:20:00+02:00 0
10510 2018-10-28 02:36:00+02:00 1
10511 2018-10-28 03:46:00+01:00 2
10512 dtype: int64
10513
10514 If the DST transition causes nonexistent times, you can shift these
10515 dates forward or backward with a timedelta object or `'shift_forward'`
10516 or `'shift_backward'`.
10517
10518 >>> s = pd.Series(range(2),
10519 ... index=pd.DatetimeIndex(['2015-03-29 02:30:00',
10520 ... '2015-03-29 03:30:00']))
10521 >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_forward')
10522 2015-03-29 03:00:00+02:00 0
10523 2015-03-29 03:30:00+02:00 1
10524 dtype: int64
10525 >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_backward')
10526 2015-03-29 01:59:59.999999999+01:00 0
10527 2015-03-29 03:30:00+02:00 1
10528 dtype: int64
10529 >>> s.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1H'))
10530 2015-03-29 03:30:00+02:00 0
10531 2015-03-29 03:30:00+02:00 1
10532 dtype: int64
10533 """
10534 nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward")
10535 if nonexistent not in nonexistent_options and not isinstance(
10536 nonexistent, dt.timedelta
10537 ):
10538 raise ValueError(
10539 "The nonexistent argument must be one of 'raise', "
10540 "'NaT', 'shift_forward', 'shift_backward' or "
10541 "a timedelta object"
10542 )
10543
10544 axis = self._get_axis_number(axis)
10545 ax = self._get_axis(axis)
10546
10547 def _tz_localize(ax, tz, ambiguous, nonexistent):
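            # localize a single axis; an empty axis without tz support is
            # replaced by an empty tz-aware DatetimeIndex instead of raising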
10548 if not hasattr(ax, "tz_localize"):
10549 if len(ax) > 0:
10550 ax_name = self._get_axis_name(axis)
10551 raise TypeError(
10552 f"{ax_name} is not a valid DatetimeIndex or PeriodIndex"
10553 )
10554 ax = DatetimeIndex([], tz=tz)
10555 else:
10556 ax = ax.tz_localize(tz, ambiguous=ambiguous, nonexistent=nonexistent)
10557 return ax
10558
10559 # if a level is given it must be a MultiIndex level or
10560 # equivalent to the axis name
10561 if isinstance(ax, MultiIndex):
10562 level = ax._get_level_number(level)
10563 new_level = _tz_localize(ax.levels[level], tz, ambiguous, nonexistent)
10564 ax = ax.set_levels(new_level, level=level)
10565 else:
10566 if level not in (None, 0, ax.name):
10567 raise ValueError(f"The level {level} is not valid")
10568 ax = _tz_localize(ax, tz, ambiguous, nonexistent)
10569
10570 result = self.copy(deep=copy and not using_copy_on_write())
10571 result = result.set_axis(ax, axis=axis, copy=False)
10572 return result.__finalize__(self, method="tz_localize")
10573
10574 # ----------------------------------------------------------------------
10575 # Numeric Methods
10576
10577 @final
10578 def describe(
10579 self: NDFrameT,
10580 percentiles=None,
10581 include=None,
10582 exclude=None,
10583 ) -> NDFrameT:
10584 """
10585 Generate descriptive statistics.
10586
10587 Descriptive statistics include those that summarize the central
10588 tendency, dispersion and shape of a
10589 dataset's distribution, excluding ``NaN`` values.
10590
10591 Analyzes both numeric and object series, as well
10592 as ``DataFrame`` column sets of mixed data types. The output
10593 will vary depending on what is provided. Refer to the notes
10594 below for more detail.
10595
10596 Parameters
10597 ----------
10598 percentiles : list-like of numbers, optional
10599 The percentiles to include in the output. All should
10600 fall between 0 and 1. The default is
10601 ``[.25, .5, .75]``, which returns the 25th, 50th, and
10602 75th percentiles.
10603 include : 'all', list-like of dtypes or None (default), optional
            A whitelist of data types to include in the result. Ignored
10605 for ``Series``. Here are the options:
10606
10607 - 'all' : All columns of the input will be included in the output.
10608 - A list-like of dtypes : Limits the results to the
10609 provided data types.
10610 To limit the result to numeric types submit
10611 ``numpy.number``. To limit it instead to object columns submit
              the ``object`` data type. Strings
10613 can also be used in the style of
10614 ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
10615 select pandas categorical columns, use ``'category'``
10616 - None (default) : The result will include all numeric columns.
        exclude : list-like of dtypes or None (default), optional
            A blacklist of data types to omit from the result. Ignored
10619 for ``Series``. Here are the options:
10620
10621 - A list-like of dtypes : Excludes the provided data types
10622 from the result. To exclude numeric types submit
10623 ``numpy.number``. To exclude object columns submit the data
              type ``object``. Strings can also be used in the style of
10625 ``select_dtypes`` (e.g. ``df.describe(exclude=['O'])``). To
10626 exclude pandas categorical columns, use ``'category'``
10627 - None (default) : The result will exclude nothing.
10628
10629 Returns
10630 -------
10631 Series or DataFrame
10632 Summary statistics of the Series or Dataframe provided.
10633
10634 See Also
10635 --------
10636 DataFrame.count: Count number of non-NA/null observations.
10637 DataFrame.max: Maximum of the values in the object.
10638 DataFrame.min: Minimum of the values in the object.
10639 DataFrame.mean: Mean of the values.
10640 DataFrame.std: Standard deviation of the observations.
10641 DataFrame.select_dtypes: Subset of a DataFrame including/excluding
10642 columns based on their dtype.
10643
10644 Notes
10645 -----
10646 For numeric data, the result's index will include ``count``,
10647 ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and
10648 upper percentiles. By default the lower percentile is ``25`` and the
10649 upper percentile is ``75``. The ``50`` percentile is the
10650 same as the median.
10651
10652 For object data (e.g. strings or timestamps), the result's index
10653 will include ``count``, ``unique``, ``top``, and ``freq``. The ``top``
10654 is the most common value. The ``freq`` is the most common value's
10655 frequency. Timestamps also include the ``first`` and ``last`` items.
10656
        If multiple object values have the highest count, then the ``top``
        result will be arbitrarily chosen from among those with the highest
        count.
10660
10661 For mixed data types provided via a ``DataFrame``, the default is to
10662 return only an analysis of numeric columns. If the dataframe consists
10663 only of object and categorical data without any numeric columns, the
10664 default is to return an analysis of both the object and categorical
10665 columns. If ``include='all'`` is provided as an option, the result
10666 will include a union of attributes of each type.
10667
10668 The `include` and `exclude` parameters can be used to limit
10669 which columns in a ``DataFrame`` are analyzed for the output.
10670 The parameters are ignored when analyzing a ``Series``.
10671
10672 Examples
10673 --------
10674 Describing a numeric ``Series``.
10675
10676 >>> s = pd.Series([1, 2, 3])
10677 >>> s.describe()
10678 count 3.0
10679 mean 2.0
10680 std 1.0
10681 min 1.0
10682 25% 1.5
10683 50% 2.0
10684 75% 2.5
10685 max 3.0
10686 dtype: float64
10687
10688 Describing a categorical ``Series``.
10689
10690 >>> s = pd.Series(['a', 'a', 'b', 'c'])
10691 >>> s.describe()
10692 count 4
10693 unique 3
10694 top a
10695 freq 2
10696 dtype: object
10697
10698 Describing a timestamp ``Series``.
10699
10700 >>> s = pd.Series([
10701 ... np.datetime64("2000-01-01"),
10702 ... np.datetime64("2010-01-01"),
10703 ... np.datetime64("2010-01-01")
10704 ... ])
10705 >>> s.describe()
10706 count 3
10707 mean 2006-09-01 08:00:00
10708 min 2000-01-01 00:00:00
10709 25% 2004-12-31 12:00:00
10710 50% 2010-01-01 00:00:00
10711 75% 2010-01-01 00:00:00
10712 max 2010-01-01 00:00:00
10713 dtype: object
10714
10715 Describing a ``DataFrame``. By default only numeric fields
10716 are returned.
10717
10718 >>> df = pd.DataFrame({'categorical': pd.Categorical(['d','e','f']),
10719 ... 'numeric': [1, 2, 3],
10720 ... 'object': ['a', 'b', 'c']
10721 ... })
10722 >>> df.describe()
10723 numeric
10724 count 3.0
10725 mean 2.0
10726 std 1.0
10727 min 1.0
10728 25% 1.5
10729 50% 2.0
10730 75% 2.5
10731 max 3.0
10732
10733 Describing all columns of a ``DataFrame`` regardless of data type.
10734
10735 >>> df.describe(include='all') # doctest: +SKIP
10736 categorical numeric object
10737 count 3 3.0 3
10738 unique 3 NaN 3
10739 top f NaN a
10740 freq 1 NaN 1
10741 mean NaN 2.0 NaN
10742 std NaN 1.0 NaN
10743 min NaN 1.0 NaN
10744 25% NaN 1.5 NaN
10745 50% NaN 2.0 NaN
10746 75% NaN 2.5 NaN
10747 max NaN 3.0 NaN
10748
10749 Describing a column from a ``DataFrame`` by accessing it as
10750 an attribute.
10751
10752 >>> df.numeric.describe()
10753 count 3.0
10754 mean 2.0
10755 std 1.0
10756 min 1.0
10757 25% 1.5
10758 50% 2.0
10759 75% 2.5
10760 max 3.0
10761 Name: numeric, dtype: float64
10762
10763 Including only numeric columns in a ``DataFrame`` description.
10764
10765 >>> df.describe(include=[np.number])
10766 numeric
10767 count 3.0
10768 mean 2.0
10769 std 1.0
10770 min 1.0
10771 25% 1.5
10772 50% 2.0
10773 75% 2.5
10774 max 3.0
10775
10776 Including only string columns in a ``DataFrame`` description.
10777
10778 >>> df.describe(include=[object]) # doctest: +SKIP
10779 object
10780 count 3
10781 unique 3
10782 top a
10783 freq 1
10784
10785 Including only categorical columns from a ``DataFrame`` description.
10786
10787 >>> df.describe(include=['category'])
10788 categorical
10789 count 3
10790 unique 3
10791 top d
10792 freq 1
10793
10794 Excluding numeric columns from a ``DataFrame`` description.
10795
10796 >>> df.describe(exclude=[np.number]) # doctest: +SKIP
10797 categorical object
10798 count 3 3
10799 unique 3 3
10800 top f a
10801 freq 1 1
10802
10803 Excluding object columns from a ``DataFrame`` description.
10804
10805 >>> df.describe(exclude=[object]) # doctest: +SKIP
10806 categorical numeric
10807 count 3 3.0
10808 unique 3 NaN
10809 top f NaN
10810 freq 1 NaN
10811 mean NaN 2.0
10812 std NaN 1.0
10813 min NaN 1.0
10814 25% NaN 1.5
10815 50% NaN 2.0
10816 75% NaN 2.5
10817 max NaN 3.0
10818 """
10819 return describe_ndframe(
10820 obj=self,
10821 include=include,
10822 exclude=exclude,
10823 percentiles=percentiles,
10824 )
10825
10826 @final
10827 def pct_change(
10828 self: NDFrameT,
10829 periods: int = 1,
10830 fill_method: Literal["backfill", "bfill", "pad", "ffill"] | None = "pad",
10831 limit=None,
10832 freq=None,
10833 **kwargs,
10834 ) -> NDFrameT:
10835 """
10836 Percentage change between the current and a prior element.
10837
10838 Computes the percentage change from the immediately previous row by
10839 default. This is useful in comparing the percentage of change in a time
10840 series of elements.
10841
10842 Parameters
10843 ----------
10844 periods : int, default 1
10845 Periods to shift for forming percent change.
10846 fill_method : {'backfill', 'bfill', 'pad', 'ffill', None}, default 'pad'
10847 How to handle NAs **before** computing percent changes.
10848 limit : int, default None
10849 The number of consecutive NAs to fill before stopping.
10850 freq : DateOffset, timedelta, or str, optional
10851 Increment to use from time series API (e.g. 'M' or BDay()).
10852 **kwargs
10853 Additional keyword arguments are passed into
10854 `DataFrame.shift` or `Series.shift`.
10855
10856 Returns
10857 -------
10858 Series or DataFrame
10859 The same type as the calling object.
10860
10861 See Also
10862 --------
10863 Series.diff : Compute the difference of two elements in a Series.
10864 DataFrame.diff : Compute the difference of two elements in a DataFrame.
10865 Series.shift : Shift the index by some number of periods.
10866 DataFrame.shift : Shift the index by some number of periods.
10867
10868 Examples
10869 --------
10870 **Series**
10871
10872 >>> s = pd.Series([90, 91, 85])
10873 >>> s
10874 0 90
10875 1 91
10876 2 85
10877 dtype: int64
10878
10879 >>> s.pct_change()
10880 0 NaN
10881 1 0.011111
10882 2 -0.065934
10883 dtype: float64
10884
10885 >>> s.pct_change(periods=2)
10886 0 NaN
10887 1 NaN
10888 2 -0.055556
10889 dtype: float64
10890
        See the percentage change in a Series where NAs are filled with the
        last valid observation, carried forward to the next valid one.
10893
10894 >>> s = pd.Series([90, 91, None, 85])
10895 >>> s
10896 0 90.0
10897 1 91.0
10898 2 NaN
10899 3 85.0
10900 dtype: float64
10901
10902 >>> s.pct_change(fill_method='ffill')
10903 0 NaN
10904 1 0.011111
10905 2 0.000000
10906 3 -0.065934
10907 dtype: float64
10908
10909 **DataFrame**
10910
10911 Percentage change in French franc, Deutsche Mark, and Italian lira from
10912 1980-01-01 to 1980-03-01.
10913
10914 >>> df = pd.DataFrame({
10915 ... 'FR': [4.0405, 4.0963, 4.3149],
10916 ... 'GR': [1.7246, 1.7482, 1.8519],
10917 ... 'IT': [804.74, 810.01, 860.13]},
10918 ... index=['1980-01-01', '1980-02-01', '1980-03-01'])
10919 >>> df
10920 FR GR IT
10921 1980-01-01 4.0405 1.7246 804.74
10922 1980-02-01 4.0963 1.7482 810.01
10923 1980-03-01 4.3149 1.8519 860.13
10924
10925 >>> df.pct_change()
10926 FR GR IT
10927 1980-01-01 NaN NaN NaN
10928 1980-02-01 0.013810 0.013684 0.006549
10929 1980-03-01 0.053365 0.059318 0.061876
10930
        Percentage change in GOOG and APPL stock volume. Shows computing
10932 the percentage change between columns.
10933
10934 >>> df = pd.DataFrame({
10935 ... '2016': [1769950, 30586265],
10936 ... '2015': [1500923, 40912316],
10937 ... '2014': [1371819, 41403351]},
10938 ... index=['GOOG', 'APPL'])
10939 >>> df
10940 2016 2015 2014
10941 GOOG 1769950 1500923 1371819
10942 APPL 30586265 40912316 41403351
10943
10944 >>> df.pct_change(axis='columns', periods=-1)
10945 2016 2015 2014
10946 GOOG 0.179241 0.094112 NaN
10947 APPL -0.252395 -0.011860 NaN
10948 """
10949 axis = self._get_axis_number(kwargs.pop("axis", self._stat_axis_name))
10950 if fill_method is None:
10951 data = self
10952 else:
10953 _data = self.fillna(method=fill_method, axis=axis, limit=limit)
10954 assert _data is not None # needed for mypy
10955 data = _data
10956
10957 shifted = data.shift(periods=periods, freq=freq, axis=axis, **kwargs)
10958 # Unsupported left operand type for / ("NDFrameT")
10959 rs = data / shifted - 1 # type: ignore[operator]
10960 if freq is not None:
10961 # Shift method is implemented differently when freq is not None
10962 # We want to restore the original index
10963 rs = rs.loc[~rs.index.duplicated()]
10964 rs = rs.reindex_like(data)
10965 return rs.__finalize__(self, method="pct_change")
10966
10967 @final
10968 def _logical_func(
10969 self,
10970 name: str,
10971 func,
10972 axis: Axis = 0,
10973 bool_only: bool_t = False,
10974 skipna: bool_t = True,
10975 **kwargs,
10976 ) -> Series | bool_t:
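        """
        Shared implementation of boolean reductions such as ``any`` and
        ``all``: validate the kwargs, then reduce with ``func`` along
        ``axis``, optionally restricting to boolean data first.
        """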
10977 nv.validate_logical_func((), kwargs, fname=name)
10978 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
10979
10980 if self.ndim > 1 and axis is None:
10981 # Reduce along one dimension then the other, to simplify DataFrame._reduce
10982 res = self._logical_func(
10983 name, func, axis=0, bool_only=bool_only, skipna=skipna, **kwargs
10984 )
10985 return res._logical_func(name, func, skipna=skipna, **kwargs)
10986
10987 if (
10988 self.ndim > 1
10989 and axis == 1
10990 and len(self._mgr.arrays) > 1
10991 # TODO(EA2D): special-case not needed
10992 and all(x.ndim == 2 for x in self._mgr.arrays)
10993 and not kwargs
10994 ):
10995 # Fastpath avoiding potentially expensive transpose
10996 obj = self
10997 if bool_only:
10998 obj = self._get_bool_data()
10999 return obj._reduce_axis1(name, func, skipna=skipna)
11000
11001 return self._reduce(
11002 func,
11003 name=name,
11004 axis=axis,
11005 skipna=skipna,
11006 numeric_only=bool_only,
11007 filter_type="bool",
11008 )
11009
11010 def any(
11011 self,
11012 axis: Axis = 0,
11013 bool_only: bool_t = False,
11014 skipna: bool_t = True,
11015 **kwargs,
11016 ) -> DataFrame | Series | bool_t:
11017 return self._logical_func(
11018 "any", nanops.nanany, axis, bool_only, skipna, **kwargs
11019 )
11020
11021 def all(
11022 self,
11023 axis: Axis = 0,
11024 bool_only: bool_t = False,
11025 skipna: bool_t = True,
11026 **kwargs,
11027 ) -> Series | bool_t:
11028 return self._logical_func(
11029 "all", nanops.nanall, axis, bool_only, skipna, **kwargs
11030 )
11031
11032 @final
11033 def _accum_func(
11034 self,
11035 name: str,
11036 func,
11037 axis: Axis | None = None,
11038 skipna: bool_t = True,
11039 *args,
11040 **kwargs,
11041 ):
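        """
        Shared implementation of the cumulative functions (``cummin``,
        ``cummax``, ``cumsum``, ``cumprod``): apply ``func`` block-wise,
        transposing first when operating along ``axis=1``.
        """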
11042 skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name)
11043 if axis is None:
11044 axis = self._stat_axis_number
11045 else:
11046 axis = self._get_axis_number(axis)
11047
11048 if axis == 1:
11049 return self.T._accum_func(
11050 name, func, axis=0, skipna=skipna, *args, **kwargs # noqa: B026
11051 ).T
11052
        def block_accum_func(blk_values):
            # 2D block values are stored transposed relative to the frame,
            # so transpose first to accumulate along the index axis
            values = blk_values.T if hasattr(blk_values, "T") else blk_values

            result: np.ndarray | ExtensionArray
            if isinstance(values, ExtensionArray):
                # extension arrays implement their own accumulation logic
                result = values._accumulate(name, skipna=skipna, **kwargs)
            else:
                result = nanops.na_accum_func(values, func, skipna=skipna)

            # transpose back to block layout
            result = result.T if hasattr(result, "T") else result
            return result
11064
11065 result = self._mgr.apply(block_accum_func)
11066
11067 return self._constructor(result).__finalize__(self, method=name)
11068
11069 def cummax(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
11070 return self._accum_func(
11071 "cummax", np.maximum.accumulate, axis, skipna, *args, **kwargs
11072 )
11073
11074 def cummin(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
11075 return self._accum_func(
11076 "cummin", np.minimum.accumulate, axis, skipna, *args, **kwargs
11077 )
11078
11079 def cumsum(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
11080 return self._accum_func("cumsum", np.cumsum, axis, skipna, *args, **kwargs)
11081
11082 def cumprod(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
11083 return self._accum_func("cumprod", np.cumprod, axis, skipna, *args, **kwargs)
11084
11085 @final
11086 def _stat_function_ddof(
11087 self,
11088 name: str,
11089 func,
11090 axis: Axis | None = None,
11091 skipna: bool_t = True,
11092 ddof: int = 1,
11093 numeric_only: bool_t = False,
11094 **kwargs,
11095 ) -> Series | float:
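        """
        Shared implementation of the ddof-based reductions ``sem``, ``var``
        and ``std``.
        """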
11096 nv.validate_stat_ddof_func((), kwargs, fname=name)
11097 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
11098 if axis is None:
11099 axis = self._stat_axis_number
11100
11101 return self._reduce(
11102 func, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof
11103 )
11104
11105 def sem(
11106 self,
11107 axis: Axis | None = None,
11108 skipna: bool_t = True,
11109 ddof: int = 1,
11110 numeric_only: bool_t = False,
11111 **kwargs,
11112 ) -> Series | float:
11113 return self._stat_function_ddof(
11114 "sem", nanops.nansem, axis, skipna, ddof, numeric_only, **kwargs
11115 )
11116
11117 def var(
11118 self,
11119 axis: Axis | None = None,
11120 skipna: bool_t = True,
11121 ddof: int = 1,
11122 numeric_only: bool_t = False,
11123 **kwargs,
11124 ) -> Series | float:
11125 return self._stat_function_ddof(
11126 "var", nanops.nanvar, axis, skipna, ddof, numeric_only, **kwargs
11127 )
11128
11129 def std(
11130 self,
11131 axis: Axis | None = None,
11132 skipna: bool_t = True,
11133 ddof: int = 1,
11134 numeric_only: bool_t = False,
11135 **kwargs,
11136 ) -> Series | float:
11137 return self._stat_function_ddof(
11138 "std", nanops.nanstd, axis, skipna, ddof, numeric_only, **kwargs
11139 )
11140
11141 @final
11142 def _stat_function(
11143 self,
11144 name: str,
11145 func,
11146 axis: Axis | None = 0,
11147 skipna: bool_t = True,
11148 numeric_only: bool_t = False,
11149 **kwargs,
11150 ):
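        """
        Shared implementation of reductions such as ``min``, ``max``,
        ``mean`` and ``median`` that take no reduction-specific arguments.
        """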
11151 if name == "median":
11152 nv.validate_median((), kwargs)
11153 else:
11154 nv.validate_stat_func((), kwargs, fname=name)
11155
11156 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
11157
11158 return self._reduce(
11159 func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
11160 )
11161
11162 def min(
11163 self,
11164 axis: Axis | None = 0,
11165 skipna: bool_t = True,
11166 numeric_only: bool_t = False,
11167 **kwargs,
11168 ):
11169 return self._stat_function(
11170 "min",
11171 nanops.nanmin,
11172 axis,
11173 skipna,
11174 numeric_only,
11175 **kwargs,
11176 )
11177
11178 def max(
11179 self,
11180 axis: Axis | None = 0,
11181 skipna: bool_t = True,
11182 numeric_only: bool_t = False,
11183 **kwargs,
11184 ):
11185 return self._stat_function(
11186 "max",
11187 nanops.nanmax,
11188 axis,
11189 skipna,
11190 numeric_only,
11191 **kwargs,
11192 )
11193
11194 def mean(
11195 self,
11196 axis: Axis | None = 0,
11197 skipna: bool_t = True,
11198 numeric_only: bool_t = False,
11199 **kwargs,
11200 ) -> Series | float:
11201 return self._stat_function(
11202 "mean", nanops.nanmean, axis, skipna, numeric_only, **kwargs
11203 )
11204
11205 def median(
11206 self,
11207 axis: Axis | None = 0,
11208 skipna: bool_t = True,
11209 numeric_only: bool_t = False,
11210 **kwargs,
11211 ) -> Series | float:
11212 return self._stat_function(
11213 "median", nanops.nanmedian, axis, skipna, numeric_only, **kwargs
11214 )
11215
11216 def skew(
11217 self,
11218 axis: Axis | None = 0,
11219 skipna: bool_t = True,
11220 numeric_only: bool_t = False,
11221 **kwargs,
11222 ) -> Series | float:
11223 return self._stat_function(
11224 "skew", nanops.nanskew, axis, skipna, numeric_only, **kwargs
11225 )
11226
11227 def kurt(
11228 self,
11229 axis: Axis | None = 0,
11230 skipna: bool_t = True,
11231 numeric_only: bool_t = False,
11232 **kwargs,
11233 ) -> Series | float:
11234 return self._stat_function(
11235 "kurt", nanops.nankurt, axis, skipna, numeric_only, **kwargs
11236 )
11237
11238 kurtosis = kurt
11239
11240 @final
11241 def _min_count_stat_function(
11242 self,
11243 name: str,
11244 func,
11245 axis: Axis | None = None,
11246 skipna: bool_t = True,
11247 numeric_only: bool_t = False,
11248 min_count: int = 0,
11249 **kwargs,
11250 ):
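        """
        Shared implementation of ``sum`` and ``prod``, which additionally
        honor ``min_count``, the number of non-NA values required for a
        non-NA result.
        """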
11251 if name == "sum":
11252 nv.validate_sum((), kwargs)
11253 elif name == "prod":
11254 nv.validate_prod((), kwargs)
11255 else:
11256 nv.validate_stat_func((), kwargs, fname=name)
11257
11258 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
11259
11260 if axis is None:
11261 axis = self._stat_axis_number
11262
11263 return self._reduce(
11264 func,
11265 name=name,
11266 axis=axis,
11267 skipna=skipna,
11268 numeric_only=numeric_only,
11269 min_count=min_count,
11270 )
11271
11272 def sum(
11273 self,
11274 axis: Axis | None = None,
11275 skipna: bool_t = True,
11276 numeric_only: bool_t = False,
11277 min_count: int = 0,
11278 **kwargs,
11279 ):
11280 return self._min_count_stat_function(
11281 "sum", nanops.nansum, axis, skipna, numeric_only, min_count, **kwargs
11282 )
11283
11284 def prod(
11285 self,
11286 axis: Axis | None = None,
11287 skipna: bool_t = True,
11288 numeric_only: bool_t = False,
11289 min_count: int = 0,
11290 **kwargs,
11291 ):
11292 return self._min_count_stat_function(
11293 "prod",
11294 nanops.nanprod,
11295 axis,
11296 skipna,
11297 numeric_only,
11298 min_count,
11299 **kwargs,
11300 )
11301
11302 product = prod
11303
11304 @classmethod
11305 def _add_numeric_operations(cls) -> None:
        """
        Add the numeric operations to the cls; re-evaluate the docstrings
        with the class-specific names substituted in.
        """
11309 axis_descr, name1, name2 = _doc_params(cls)
11310
11311 @doc(
11312 _bool_doc,
11313 desc=_any_desc,
11314 name1=name1,
11315 name2=name2,
11316 axis_descr=axis_descr,
11317 see_also=_any_see_also,
11318 examples=_any_examples,
11319 empty_value=False,
11320 )
11321 def any(
11322 self,
11323 *,
11324 axis: Axis = 0,
11325 bool_only=None,
11326 skipna: bool_t = True,
11327 **kwargs,
11328 ):
11329 return NDFrame.any(
11330 self,
11331 axis=axis,
11332 bool_only=bool_only,
11333 skipna=skipna,
11334 **kwargs,
11335 )
11336
11337 setattr(cls, "any", any)
11338
11339 @doc(
11340 _bool_doc,
11341 desc=_all_desc,
11342 name1=name1,
11343 name2=name2,
11344 axis_descr=axis_descr,
11345 see_also=_all_see_also,
11346 examples=_all_examples,
11347 empty_value=True,
11348 )
11349 def all(
11350 self,
11351 axis: Axis = 0,
11352 bool_only=None,
11353 skipna: bool_t = True,
11354 **kwargs,
11355 ):
11356 return NDFrame.all(self, axis, bool_only, skipna, **kwargs)
11357
11358 setattr(cls, "all", all)
11359
11360 @doc(
11361 _num_ddof_doc,
11362 desc="Return unbiased standard error of the mean over requested "
11363 "axis.\n\nNormalized by N-1 by default. This can be changed "
            "using the ddof argument.",
11365 name1=name1,
11366 name2=name2,
11367 axis_descr=axis_descr,
11368 notes="",
11369 examples="",
11370 )
11371 def sem(
11372 self,
11373 axis: Axis | None = None,
11374 skipna: bool_t = True,
11375 ddof: int = 1,
11376 numeric_only: bool_t = False,
11377 **kwargs,
11378 ):
11379 return NDFrame.sem(self, axis, skipna, ddof, numeric_only, **kwargs)
11380
11381 setattr(cls, "sem", sem)

        @doc(
            _num_ddof_doc,
            desc="Return unbiased variance over requested axis.\n\nNormalized by "
            "N-1 by default. This can be changed using the ddof argument.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            notes="",
            examples=_var_examples,
        )
        def var(
            self,
            axis: Axis | None = None,
            skipna: bool_t = True,
            ddof: int = 1,
            numeric_only: bool_t = False,
            **kwargs,
        ):
            return NDFrame.var(self, axis, skipna, ddof, numeric_only, **kwargs)

        setattr(cls, "var", var)

        @doc(
            _num_ddof_doc,
            desc="Return sample standard deviation over requested axis."
            "\n\nNormalized by N-1 by default. This can be changed using the "
            "ddof argument.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            notes=_std_notes,
            examples=_std_examples,
        )
        def std(
            self,
            axis: Axis | None = None,
            skipna: bool_t = True,
            ddof: int = 1,
            numeric_only: bool_t = False,
            **kwargs,
        ):
            return NDFrame.std(self, axis, skipna, ddof, numeric_only, **kwargs)

        setattr(cls, "std", std)

        @doc(
            _cnum_doc,
            desc="minimum",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            accum_func_name="min",
            examples=_cummin_examples,
        )
        def cummin(
            self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs
        ):
            return NDFrame.cummin(self, axis, skipna, *args, **kwargs)

        setattr(cls, "cummin", cummin)

        @doc(
            _cnum_doc,
            desc="maximum",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            accum_func_name="max",
            examples=_cummax_examples,
        )
        def cummax(
            self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs
        ):
            return NDFrame.cummax(self, axis, skipna, *args, **kwargs)

        setattr(cls, "cummax", cummax)

        @doc(
            _cnum_doc,
            desc="sum",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            accum_func_name="sum",
            examples=_cumsum_examples,
        )
        def cumsum(
            self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs
        ):
            return NDFrame.cumsum(self, axis, skipna, *args, **kwargs)

        setattr(cls, "cumsum", cumsum)

        @doc(
            _cnum_doc,
            desc="product",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            accum_func_name="prod",
            examples=_cumprod_examples,
        )
        def cumprod(
            self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs
        ):
            return NDFrame.cumprod(self, axis, skipna, *args, **kwargs)

        setattr(cls, "cumprod", cumprod)

        # error: Untyped decorator makes function "sum" untyped
        @doc(  # type: ignore[misc]
            _num_doc,
            desc="Return the sum of the values over the requested axis.\n\n"
            "This is equivalent to the method ``numpy.sum``.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count=_min_count_stub,
            see_also=_stat_func_see_also,
            examples=_sum_examples,
        )
        def sum(
            self,
            axis: Axis | None = None,
            skipna: bool_t = True,
            numeric_only: bool_t = False,
            min_count: int = 0,
            **kwargs,
        ):
            return NDFrame.sum(self, axis, skipna, numeric_only, min_count, **kwargs)

        setattr(cls, "sum", sum)

        @doc(
            _num_doc,
            desc="Return the product of the values over the requested axis.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count=_min_count_stub,
            see_also=_stat_func_see_also,
            examples=_prod_examples,
        )
        def prod(
            self,
            axis: Axis | None = None,
            skipna: bool_t = True,
            numeric_only: bool_t = False,
            min_count: int = 0,
            **kwargs,
        ):
            return NDFrame.prod(self, axis, skipna, numeric_only, min_count, **kwargs)

        setattr(cls, "prod", prod)
        cls.product = prod

        @doc(
            _num_doc,
            desc="Return the mean of the values over the requested axis.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count="",
            see_also="",
            examples="",
        )
        def mean(
            self,
            axis: AxisInt | None = 0,
            skipna: bool_t = True,
            numeric_only: bool_t = False,
            **kwargs,
        ):
            return NDFrame.mean(self, axis, skipna, numeric_only, **kwargs)

        setattr(cls, "mean", mean)

        @doc(
            _num_doc,
            desc="Return unbiased skew over requested axis.\n\nNormalized by N-1.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count="",
            see_also="",
            examples="",
        )
        def skew(
            self,
            axis: AxisInt | None = 0,
            skipna: bool_t = True,
            numeric_only: bool_t = False,
            **kwargs,
        ):
            return NDFrame.skew(self, axis, skipna, numeric_only, **kwargs)

        setattr(cls, "skew", skew)

        @doc(
            _num_doc,
            desc="Return unbiased kurtosis over requested axis.\n\n"
            "Kurtosis obtained using Fisher's definition of\n"
            "kurtosis (kurtosis of normal == 0.0). Normalized "
            "by N-1.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count="",
            see_also="",
            examples="",
        )
        def kurt(
            self,
            axis: Axis | None = 0,
            skipna: bool_t = True,
            numeric_only: bool_t = False,
            **kwargs,
        ):
            return NDFrame.kurt(self, axis, skipna, numeric_only, **kwargs)

        setattr(cls, "kurt", kurt)
        cls.kurtosis = kurt

        @doc(
            _num_doc,
            desc="Return the median of the values over the requested axis.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count="",
            see_also="",
            examples="",
        )
        def median(
            self,
            axis: AxisInt | None = 0,
            skipna: bool_t = True,
            numeric_only: bool_t = False,
            **kwargs,
        ):
            return NDFrame.median(self, axis, skipna, numeric_only, **kwargs)

        setattr(cls, "median", median)

        @doc(
            _num_doc,
            desc="Return the maximum of the values over the requested axis.\n\n"
            "If you want the *index* of the maximum, use ``idxmax``. This is "
            "the equivalent of the ``numpy.ndarray`` method ``argmax``.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count="",
            see_also=_stat_func_see_also,
            examples=_max_examples,
        )
        def max(
            self,
            axis: AxisInt | None = 0,
            skipna: bool_t = True,
            numeric_only: bool_t = False,
            **kwargs,
        ):
            return NDFrame.max(self, axis, skipna, numeric_only, **kwargs)

        setattr(cls, "max", max)

        @doc(
            _num_doc,
            desc="Return the minimum of the values over the requested axis.\n\n"
            "If you want the *index* of the minimum, use ``idxmin``. This is "
            "the equivalent of the ``numpy.ndarray`` method ``argmin``.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count="",
            see_also=_stat_func_see_also,
            examples=_min_examples,
        )
        def min(
            self,
            axis: AxisInt | None = 0,
            skipna: bool_t = True,
            numeric_only: bool_t = False,
            **kwargs,
        ):
            return NDFrame.min(self, axis, skipna, numeric_only, **kwargs)

        setattr(cls, "min", min)

    @final
    @doc(Rolling)
    def rolling(
        self,
        window: int | dt.timedelta | str | BaseOffset | BaseIndexer,
        min_periods: int | None = None,
        center: bool_t = False,
        win_type: str | None = None,
        on: str | None = None,
        axis: Axis = 0,
        closed: str | None = None,
        step: int | None = None,
        method: str = "single",
    ) -> Window | Rolling:
        axis = self._get_axis_number(axis)

        if win_type is not None:
            return Window(
                self,
                window=window,
                min_periods=min_periods,
                center=center,
                win_type=win_type,
                on=on,
                axis=axis,
                closed=closed,
                step=step,
                method=method,
            )

        return Rolling(
            self,
            window=window,
            min_periods=min_periods,
            center=center,
            win_type=win_type,
            on=on,
            axis=axis,
            closed=closed,
            step=step,
            method=method,
        )
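
    # Illustrative usage (comments only): passing ``win_type`` selects the
    # scipy-backed ``Window`` variant; otherwise a plain ``Rolling`` object is
    # returned. E.g. (the second call assumes scipy is installed):
    # >>> pd.Series(range(5)).rolling(3).mean()                     # Rolling
    # >>> pd.Series(range(5)).rolling(3, win_type="triang").mean()  # Window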

    @final
    @doc(Expanding)
    def expanding(
        self,
        min_periods: int = 1,
        axis: Axis = 0,
        method: str = "single",
    ) -> Expanding:
        axis = self._get_axis_number(axis)
        return Expanding(self, min_periods=min_periods, axis=axis, method=method)

    @final
    @doc(ExponentialMovingWindow)
    def ewm(
        self,
        com: float | None = None,
        span: float | None = None,
        halflife: float | TimedeltaConvertibleTypes | None = None,
        alpha: float | None = None,
        min_periods: int | None = 0,
        adjust: bool_t = True,
        ignore_na: bool_t = False,
        axis: Axis = 0,
        times: np.ndarray | DataFrame | Series | None = None,
        method: str = "single",
    ) -> ExponentialMovingWindow:
        axis = self._get_axis_number(axis)
        return ExponentialMovingWindow(
            self,
            com=com,
            span=span,
            halflife=halflife,
            alpha=alpha,
            min_periods=min_periods,
            adjust=adjust,
            ignore_na=ignore_na,
            axis=axis,
            times=times,
            method=method,
        )
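
    # Illustrative usage (comments only): when ``times`` is not given, exactly
    # one of ``com``, ``span``, ``halflife`` or ``alpha`` fixes the decay, e.g.
    # >>> pd.Series([1, 2, 3]).ewm(com=0.5).mean()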

    # ----------------------------------------------------------------------
    # Arithmetic Methods

    @final
    def _inplace_method(self, other, op):
        """
        Wrap arithmetic method to operate inplace.
        """
        result = op(self, other)

        if (
            self.ndim == 1
            and result._indexed_same(self)
            and is_dtype_equal(result.dtype, self.dtype)
        ):
            # GH#36498 this inplace op can _actually_ be inplace.
            # Item "ArrayManager" of "Union[ArrayManager, SingleArrayManager,
            # BlockManager, SingleBlockManager]" has no attribute "setitem_inplace"
            self._mgr.setitem_inplace(  # type: ignore[union-attr]
                slice(None), result._values
            )
            return self

        # Delete cacher
        self._reset_cacher()

        # this makes sure that we are aligned like the input
        # we are updating inplace so we want to ignore is_copy
        self._update_inplace(
            result.reindex_like(self, copy=False), verify_is_copy=False
        )
        return self

    def __iadd__(self: NDFrameT, other) -> NDFrameT:
        # error: Unsupported left operand type for + ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__add__)  # type: ignore[operator]

    def __isub__(self: NDFrameT, other) -> NDFrameT:
        # error: Unsupported left operand type for - ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__sub__)  # type: ignore[operator]

    def __imul__(self: NDFrameT, other) -> NDFrameT:
        # error: Unsupported left operand type for * ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__mul__)  # type: ignore[operator]

    def __itruediv__(self: NDFrameT, other) -> NDFrameT:
        # error: Unsupported left operand type for / ("Type[NDFrame]")
        return self._inplace_method(
            other, type(self).__truediv__  # type: ignore[operator]
        )

    def __ifloordiv__(self: NDFrameT, other) -> NDFrameT:
        # error: Unsupported left operand type for // ("Type[NDFrame]")
        return self._inplace_method(
            other, type(self).__floordiv__  # type: ignore[operator]
        )

    def __imod__(self: NDFrameT, other) -> NDFrameT:
        # error: Unsupported left operand type for % ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__mod__)  # type: ignore[operator]

    def __ipow__(self: NDFrameT, other) -> NDFrameT:
        # error: Unsupported left operand type for ** ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__pow__)  # type: ignore[operator]

    def __iand__(self: NDFrameT, other) -> NDFrameT:
        # error: Unsupported left operand type for & ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__and__)  # type: ignore[operator]

    def __ior__(self: NDFrameT, other) -> NDFrameT:
        # error: Unsupported left operand type for | ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__or__)  # type: ignore[operator]

    def __ixor__(self: NDFrameT, other) -> NDFrameT:
        # error: Unsupported left operand type for ^ ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__xor__)  # type: ignore[operator]
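
    # Illustrative sketch (comments only): when alignment and dtype are
    # preserved, the augmented operators above mutate the object itself:
    # >>> s = pd.Series([1, 2, 3]); before = id(s)
    # >>> s += 1
    # >>> id(s) == before  # same object, values written in place
    # True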

    # ----------------------------------------------------------------------
    # Misc methods

    @final
    def _find_valid_index(self, *, how: str) -> Hashable | None:
        """
        Retrieves the index of the first or last valid value, depending on ``how``.

        Parameters
        ----------
        how : {'first', 'last'}
            Use this parameter to change between the first or last valid index.

        Returns
        -------
        idx_first_valid : type of index
        """
        idxpos = find_valid_index(self._values, how=how, is_valid=~isna(self._values))
        if idxpos is None:
            return None
        return self.index[idxpos]

    @final
    @doc(position="first", klass=_shared_doc_kwargs["klass"])
    def first_valid_index(self) -> Hashable | None:
        """
        Return index for {position} non-NA value, or None if no non-NA value is found.

        Returns
        -------
        type of index

        Notes
        -----
        If all elements are NA/null, returns None.
        Also returns None for empty {klass}.
        """
        return self._find_valid_index(how="first")

    @final
    @doc(first_valid_index, position="last", klass=_shared_doc_kwargs["klass"])
    def last_valid_index(self) -> Hashable | None:
        return self._find_valid_index(how="last")
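
    # Illustrative usage (comments only):
    # >>> s = pd.Series([np.nan, 3.0, np.nan, 4.0])
    # >>> s.first_valid_index(), s.last_valid_index()
    # (1, 3)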


def _doc_params(cls):
    """Return a tuple of the doc params."""
    axis_descr = (
        f"{{{', '.join([f'{a} ({i})' for i, a in enumerate(cls._AXIS_ORDERS)])}}}"
    )
    name = cls._constructor_sliced.__name__ if cls._AXIS_LEN > 1 else "scalar"
    name2 = cls.__name__
    return axis_descr, name, name2
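
# For a DataFrame, ``_doc_params`` evaluates to roughly
# ("{index (0), columns (1)}", "Series", "DataFrame"); the templates below
# interpolate these as {axis_descr}, {name1} and {name2}.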


_num_doc = """
{desc}

Parameters
----------
axis : {axis_descr}
    Axis for the function to be applied on.
    For `Series` this parameter is unused and defaults to 0.

    For DataFrames, specifying ``axis=None`` will apply the aggregation
    across both axes.

    .. versionadded:: 2.0.0

skipna : bool, default True
    Exclude NA/null values when computing the result.
numeric_only : bool, default False
    Include only float, int, boolean columns. Not implemented for Series.

{min_count}\
**kwargs
    Additional keyword arguments to be passed to the function.

Returns
-------
{name1} or scalar\
{see_also}\
{examples}
"""
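
# Sketch of how the template above is consumed (comments only): the ``@doc``
# decorator performs str.format-style substitution of the named fields,
# roughly like
# >>> _num_doc.format(desc="Return the sum...", axis_descr="{index (0)}",
# ...                 min_count="", see_also="", examples="", name1="Series")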

_num_ddof_doc = """
{desc}

Parameters
----------
axis : {axis_descr}
    For `Series` this parameter is unused and defaults to 0.
skipna : bool, default True
    Exclude NA/null values. If an entire row/column is NA, the result
    will be NA.
ddof : int, default 1
    Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
    where N represents the number of elements.
numeric_only : bool, default False
    Include only float, int, boolean columns. Not implemented for Series.

Returns
-------
{name1} or {name2} (if level specified) \
{notes}\
{examples}
"""

_std_notes = """

Notes
-----
To have the same behaviour as `numpy.std`, use `ddof=0` (instead of the
default `ddof=1`)."""

_std_examples = """

Examples
--------
>>> df = pd.DataFrame({'person_id': [0, 1, 2, 3],
...                    'age': [21, 25, 62, 43],
...                    'height': [1.61, 1.87, 1.49, 2.01]}
...                   ).set_index('person_id')
>>> df
           age  height
person_id
0           21    1.61
1           25    1.87
2           62    1.49
3           43    2.01

The standard deviation of the columns can be found as follows:

>>> df.std()
age       18.786076
height     0.237417
dtype: float64

Alternatively, `ddof=0` can be set to normalize by N instead of N-1:

>>> df.std(ddof=0)
age       16.269219
height     0.205609
dtype: float64"""

_var_examples = """

Examples
--------
>>> df = pd.DataFrame({'person_id': [0, 1, 2, 3],
...                    'age': [21, 25, 62, 43],
...                    'height': [1.61, 1.87, 1.49, 2.01]}
...                   ).set_index('person_id')
>>> df
           age  height
person_id
0           21    1.61
1           25    1.87
2           62    1.49
3           43    2.01

>>> df.var()
age       352.916667
height      0.056367
dtype: float64

Alternatively, ``ddof=0`` can be set to normalize by N instead of N-1:

>>> df.var(ddof=0)
age       264.687500
height      0.042275
dtype: float64"""

_bool_doc = """
{desc}

Parameters
----------
axis : {{0 or 'index', 1 or 'columns', None}}, default 0
    Indicate which axis or axes should be reduced. For `Series` this parameter
    is unused and defaults to 0.

    * 0 / 'index' : reduce the index, return a Series whose index is the
      original column labels.
    * 1 / 'columns' : reduce the columns, return a Series whose index is the
      original index.
    * None : reduce all axes, return a scalar.

bool_only : bool, default None
    Include only boolean columns. If None, will attempt to use everything,
    then use only boolean data. Not implemented for Series.
skipna : bool, default True
    Exclude NA/null values. If the entire row/column is NA and skipna is
    True, then the result will be {empty_value}, as for an empty row/column.
    If skipna is False, then NA are treated as True, because these are not
    equal to zero.
**kwargs : any, default None
    Additional keywords have no effect but might be accepted for
    compatibility with NumPy.

Returns
-------
{name1} or {name2}
    If level is specified, then {name2} is returned; otherwise, {name1}
    is returned.

{see_also}
{examples}"""

_all_desc = """\
Return whether all elements are True, potentially over an axis.

Returns True unless there is at least one element within a series or
along a DataFrame axis that is False or equivalent (e.g. zero or
empty)."""

_all_examples = """\
Examples
--------
**Series**

>>> pd.Series([True, True]).all()
True
>>> pd.Series([True, False]).all()
False
>>> pd.Series([], dtype="float64").all()
True
>>> pd.Series([np.nan]).all()
True
>>> pd.Series([np.nan]).all(skipna=False)
True

**DataFrames**

Create a dataframe from a dictionary.

>>> df = pd.DataFrame({'col1': [True, True], 'col2': [True, False]})
>>> df
   col1   col2
0  True   True
1  True  False

Default behaviour checks if values in each column all return True.

>>> df.all()
col1     True
col2    False
dtype: bool

Specify ``axis='columns'`` to check if values in each row all return True.

>>> df.all(axis='columns')
0     True
1    False
dtype: bool

Or ``axis=None`` for whether every value is True.

>>> df.all(axis=None)
False
"""

_all_see_also = """\
See Also
--------
Series.all : Return True if all elements are True.
DataFrame.any : Return True if one (or more) elements are True.
"""

_cnum_doc = """
Return cumulative {desc} over a DataFrame or Series axis.

Returns a DataFrame or Series of the same size containing the cumulative
{desc}.

Parameters
----------
axis : {{0 or 'index', 1 or 'columns'}}, default 0
    The index or the name of the axis. 0 is equivalent to None or 'index'.
    For `Series` this parameter is unused and defaults to 0.
skipna : bool, default True
    Exclude NA/null values. If an entire row/column is NA, the result
    will be NA.
*args, **kwargs
    Additional keywords have no effect but might be accepted for
    compatibility with NumPy.

Returns
-------
{name1} or {name2}
    Return cumulative {desc} of {name1} or {name2}.

See Also
--------
core.window.expanding.Expanding.{accum_func_name} : Similar functionality
    but ignores ``NaN`` values.
{name2}.{accum_func_name} : Return the {desc} over
    {name2} axis.
{name2}.cummax : Return cumulative maximum over {name2} axis.
{name2}.cummin : Return cumulative minimum over {name2} axis.
{name2}.cumsum : Return cumulative sum over {name2} axis.
{name2}.cumprod : Return cumulative product over {name2} axis.

{examples}"""

_cummin_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cummin()
0    2.0
1    NaN
2    2.0
3   -1.0
4   -1.0
dtype: float64

To include NA values in the operation, use ``skipna=False``.

>>> s.cummin(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the minimum
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cummin()
     A    B
0  2.0  1.0
1  2.0  NaN
2  1.0  0.0

To iterate over columns and find the minimum in each row,
use ``axis=1``.

>>> df.cummin(axis=1)
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0
"""

_cumsum_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cumsum()
0    2.0
1    NaN
2    7.0
3    6.0
4    6.0
dtype: float64

To include NA values in the operation, use ``skipna=False``.

>>> s.cumsum(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the sum
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cumsum()
     A    B
0  2.0  1.0
1  5.0  NaN
2  6.0  1.0

To iterate over columns and find the sum in each row,
use ``axis=1``.

>>> df.cumsum(axis=1)
     A    B
0  2.0  3.0
1  3.0  NaN
2  1.0  1.0
"""

_cumprod_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cumprod()
0     2.0
1     NaN
2    10.0
3   -10.0
4    -0.0
dtype: float64

To include NA values in the operation, use ``skipna=False``.

>>> s.cumprod(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the product
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cumprod()
     A    B
0  2.0  1.0
1  6.0  NaN
2  6.0  0.0

To iterate over columns and find the product in each row,
use ``axis=1``.

>>> df.cumprod(axis=1)
     A    B
0  2.0  2.0
1  3.0  NaN
2  1.0  0.0
"""

_cummax_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cummax()
0    2.0
1    NaN
2    5.0
3    5.0
4    5.0
dtype: float64

To include NA values in the operation, use ``skipna=False``.

>>> s.cummax(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the maximum
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cummax()
     A    B
0  2.0  1.0
1  3.0  NaN
2  3.0  1.0

To iterate over columns and find the maximum in each row,
use ``axis=1``.

>>> df.cummax(axis=1)
     A    B
0  2.0  2.0
1  3.0  NaN
2  1.0  1.0
"""

_any_see_also = """\
See Also
--------
numpy.any : NumPy version of this method.
Series.any : Return whether any element is True.
Series.all : Return whether all elements are True.
DataFrame.any : Return whether any element is True over requested axis.
DataFrame.all : Return whether all elements are True over requested axis.
"""

_any_desc = """\
Return whether any element is True, potentially over an axis.

Returns False unless there is at least one element within a series or
along a DataFrame axis that is True or equivalent (e.g. non-zero or
non-empty)."""

_any_examples = """\
Examples
--------
**Series**

For Series input, the output is a scalar indicating whether any element
is True.

>>> pd.Series([False, False]).any()
False
>>> pd.Series([True, False]).any()
True
>>> pd.Series([], dtype="float64").any()
False
>>> pd.Series([np.nan]).any()
False
>>> pd.Series([np.nan]).any(skipna=False)
True

**DataFrame**

Whether each column contains at least one True element (the default).

>>> df = pd.DataFrame({"A": [1, 2], "B": [0, 2], "C": [0, 0]})
>>> df
   A  B  C
0  1  0  0
1  2  2  0

>>> df.any()
A     True
B     True
C    False
dtype: bool

Aggregating over the columns.

>>> df = pd.DataFrame({"A": [True, False], "B": [1, 2]})
>>> df
       A  B
0   True  1
1  False  2

>>> df.any(axis='columns')
0    True
1    True
dtype: bool

>>> df = pd.DataFrame({"A": [True, False], "B": [1, 0]})
>>> df
       A  B
0   True  1
1  False  0

>>> df.any(axis='columns')
0    True
1    False
dtype: bool

Aggregating over the entire DataFrame with ``axis=None``.

>>> df.any(axis=None)
True

`any` for an empty DataFrame is an empty Series.

>>> pd.DataFrame([]).any()
Series([], dtype: bool)
"""

_shared_docs[
    "stat_func_example"
] = """

Examples
--------
>>> idx = pd.MultiIndex.from_arrays([
...     ['warm', 'warm', 'cold', 'cold'],
...     ['dog', 'falcon', 'fish', 'spider']],
...     names=['blooded', 'animal'])
>>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
>>> s
blooded  animal
warm     dog        4
         falcon     2
cold     fish       0
         spider     8
Name: legs, dtype: int64

>>> s.{stat_func}()
{default_output}"""

_sum_examples = _shared_docs["stat_func_example"].format(
    stat_func="sum", verb="Sum", default_output=14, level_output_0=6, level_output_1=8
)

_sum_examples += """

By default, the sum of an empty or all-NA Series is ``0``.

>>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
0.0

This can be controlled with the ``min_count`` parameter. For example, if
you'd like the sum of an empty series to be NaN, pass ``min_count=1``.

>>> pd.Series([], dtype="float64").sum(min_count=1)
nan

Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
empty series identically.

>>> pd.Series([np.nan]).sum()
0.0

>>> pd.Series([np.nan]).sum(min_count=1)
nan"""

_max_examples: str = _shared_docs["stat_func_example"].format(
    stat_func="max", verb="Max", default_output=8, level_output_0=4, level_output_1=8
)

_min_examples: str = _shared_docs["stat_func_example"].format(
    stat_func="min", verb="Min", default_output=0, level_output_0=2, level_output_1=0
)

_stat_func_see_also = """

See Also
--------
Series.sum : Return the sum.
Series.min : Return the minimum.
Series.max : Return the maximum.
Series.idxmin : Return the index of the minimum.
Series.idxmax : Return the index of the maximum.
DataFrame.sum : Return the sum over the requested axis.
DataFrame.min : Return the minimum over the requested axis.
DataFrame.max : Return the maximum over the requested axis.
DataFrame.idxmin : Return the index of the minimum over the requested axis.
DataFrame.idxmax : Return the index of the maximum over the requested axis."""

_prod_examples = """

Examples
--------
By default, the product of an empty or all-NA Series is ``1``.

>>> pd.Series([], dtype="float64").prod()
1.0

This can be controlled with the ``min_count`` parameter.

>>> pd.Series([], dtype="float64").prod(min_count=1)
nan

Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
empty series identically.

>>> pd.Series([np.nan]).prod()
1.0

>>> pd.Series([np.nan]).prod(min_count=1)
nan"""

_min_count_stub = """\
min_count : int, default 0
    The required number of valid values to perform the operation. If fewer than
    ``min_count`` non-NA values are present the result will be NA.
"""


def _align_as_utc(
    left: NDFrameT, right: NDFrameT, join_index: Index | None
) -> tuple[NDFrameT, NDFrameT]:
    """
    If we are aligning timezone-aware DatetimeIndexes and the timezones
    do not match, convert both to UTC.
    """
    if is_datetime64tz_dtype(left.index.dtype):
        if left.index.tz != right.index.tz:
            if join_index is not None:
                # GH#33671 ensure we don't change the index on
                # our original Series (NB: by default deep=False)
                left = left.copy()
                right = right.copy()
                left.index = join_index
                right.index = join_index

    return left, right
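
# Illustrative sketch (comments only): ``join_index`` is computed by the
# caller from the two tz-aware indexes, and joining mismatched zones already
# yields UTC, so assigning it above is what performs the "conversion":
# >>> i1 = pd.date_range("2020", periods=2, tz="US/Eastern")
# >>> i2 = pd.date_range("2020", periods=2, tz="Asia/Tokyo")
# >>> i1.join(i2, how="outer").tz  # UTC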