1"""
2DataFrame
3---------
4An efficient 2D container for potentially mixed-type time series or other
5labeled data series.
6
7Similar to its R counterpart, data.frame, except providing automatic data
8alignment and a host of useful data manipulation methods having to do with the
labeling information.
10"""
11from __future__ import annotations
12
13import collections
14from collections import abc
15from collections.abc import (
16 Hashable,
17 Iterable,
18 Iterator,
19 Mapping,
20 Sequence,
21)
22import functools
23from inspect import signature
24from io import StringIO
25import itertools
26import operator
27import sys
28from textwrap import dedent
29from typing import (
30 TYPE_CHECKING,
31 Any,
32 Callable,
33 Literal,
34 cast,
35 overload,
36)
37import warnings
38
39import numpy as np
40from numpy import ma
41
42from pandas._config import (
43 get_option,
44 using_copy_on_write,
45 warn_copy_on_write,
46)
47from pandas._config.config import _get_option
48
49from pandas._libs import (
50 algos as libalgos,
51 lib,
52 properties,
53)
54from pandas._libs.hashtable import duplicated
55from pandas._libs.lib import is_range_indexer
56from pandas.compat import PYPY
57from pandas.compat._constants import REF_COUNT
58from pandas.compat._optional import import_optional_dependency
59from pandas.compat.numpy import function as nv
60from pandas.errors import (
61 ChainedAssignmentError,
62 InvalidIndexError,
63 _chained_assignment_method_msg,
64 _chained_assignment_msg,
65 _chained_assignment_warning_method_msg,
66 _chained_assignment_warning_msg,
67)
68from pandas.util._decorators import (
69 Appender,
70 Substitution,
71 deprecate_nonkeyword_arguments,
72 doc,
73)
74from pandas.util._exceptions import (
75 find_stack_level,
76 rewrite_warning,
77)
78from pandas.util._validators import (
79 validate_ascending,
80 validate_bool_kwarg,
81 validate_percentile,
82)
83
84from pandas.core.dtypes.cast import (
85 LossySetitemError,
86 can_hold_element,
87 construct_1d_arraylike_from_scalar,
88 construct_2d_arraylike_from_scalar,
89 find_common_type,
90 infer_dtype_from_scalar,
91 invalidate_string_dtypes,
92 maybe_box_native,
93 maybe_downcast_to_dtype,
94)
95from pandas.core.dtypes.common import (
96 infer_dtype_from_object,
97 is_1d_only_ea_dtype,
98 is_array_like,
99 is_bool_dtype,
100 is_dataclass,
101 is_dict_like,
102 is_float,
103 is_float_dtype,
104 is_hashable,
105 is_integer,
106 is_integer_dtype,
107 is_iterator,
108 is_list_like,
109 is_scalar,
110 is_sequence,
111 needs_i8_conversion,
112 pandas_dtype,
113)
114from pandas.core.dtypes.concat import concat_compat
115from pandas.core.dtypes.dtypes import (
116 ArrowDtype,
117 BaseMaskedDtype,
118 ExtensionDtype,
119)
120from pandas.core.dtypes.missing import (
121 isna,
122 notna,
123)
124
125from pandas.core import (
126 algorithms,
127 common as com,
128 nanops,
129 ops,
130 roperator,
131)
132from pandas.core.accessor import CachedAccessor
133from pandas.core.apply import reconstruct_and_relabel_result
134from pandas.core.array_algos.take import take_2d_multi
135from pandas.core.arraylike import OpsMixin
136from pandas.core.arrays import (
137 BaseMaskedArray,
138 DatetimeArray,
139 ExtensionArray,
140 PeriodArray,
141 TimedeltaArray,
142)
143from pandas.core.arrays.sparse import SparseFrameAccessor
144from pandas.core.construction import (
145 ensure_wrapped_if_datetimelike,
146 sanitize_array,
147 sanitize_masked_array,
148)
149from pandas.core.generic import (
150 NDFrame,
151 make_doc,
152)
153from pandas.core.indexers import check_key_length
154from pandas.core.indexes.api import (
155 DatetimeIndex,
156 Index,
157 PeriodIndex,
158 default_index,
159 ensure_index,
160 ensure_index_from_sequences,
161)
162from pandas.core.indexes.multi import (
163 MultiIndex,
164 maybe_droplevels,
165)
166from pandas.core.indexing import (
167 check_bool_indexer,
168 check_dict_or_set_indexers,
169)
170from pandas.core.internals import (
171 ArrayManager,
172 BlockManager,
173)
174from pandas.core.internals.construction import (
175 arrays_to_mgr,
176 dataclasses_to_dicts,
177 dict_to_mgr,
178 mgr_to_mgr,
179 ndarray_to_mgr,
180 nested_data_to_arrays,
181 rec_array_to_mgr,
182 reorder_arrays,
183 to_arrays,
184 treat_as_nested,
185)
186from pandas.core.methods import selectn
187from pandas.core.reshape.melt import melt
188from pandas.core.series import Series
189from pandas.core.shared_docs import _shared_docs
190from pandas.core.sorting import (
191 get_group_index,
192 lexsort_indexer,
193 nargsort,
194)
195
196from pandas.io.common import get_handle
197from pandas.io.formats import (
198 console,
199 format as fmt,
200)
201from pandas.io.formats.info import (
202 INFO_DOCSTRING,
203 DataFrameInfo,
204 frame_sub_kwargs,
205)
206import pandas.plotting
207
208if TYPE_CHECKING:
209 import datetime
210
211 from pandas._libs.internals import BlockValuesRefs
212 from pandas._typing import (
213 AggFuncType,
214 AnyAll,
215 AnyArrayLike,
216 ArrayLike,
217 Axes,
218 Axis,
219 AxisInt,
220 ColspaceArgType,
221 CompressionOptions,
222 CorrelationMethod,
223 DropKeep,
224 Dtype,
225 DtypeObj,
226 FilePath,
227 FloatFormatType,
228 FormattersType,
229 Frequency,
230 FromDictOrient,
231 IgnoreRaise,
232 IndexKeyFunc,
233 IndexLabel,
234 JoinValidate,
235 Level,
236 MergeHow,
237 MergeValidate,
238 MutableMappingT,
239 NaAction,
240 NaPosition,
241 NsmallestNlargestKeep,
242 PythonFuncType,
243 QuantileInterpolation,
244 ReadBuffer,
245 ReindexMethod,
246 Renamer,
247 Scalar,
248 Self,
249 SequenceNotStr,
250 SortKind,
251 StorageOptions,
252 Suffixes,
253 ToGbqIfexist,
254 ToStataByteorder,
255 ToTimestampHow,
256 UpdateJoin,
257 ValueKeyFunc,
258 WriteBuffer,
259 XMLParsers,
260 npt,
261 )
262
263 from pandas.core.groupby.generic import DataFrameGroupBy
264 from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg
265 from pandas.core.internals import SingleDataManager
266
267 from pandas.io.formats.style import Styler
268
269# ---------------------------------------------------------------------
270# Docstring templates
271
272_shared_doc_kwargs = {
273 "axes": "index, columns",
274 "klass": "DataFrame",
275 "axes_single_arg": "{0 or 'index', 1 or 'columns'}",
276 "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0
277 If 0 or 'index': apply function to each column.
278 If 1 or 'columns': apply function to each row.""",
279 "inplace": """
280 inplace : bool, default False
281 Whether to modify the DataFrame rather than creating a new one.""",
282 "optional_by": """
283by : str or list of str
284 Name or list of names to sort by.
285
286 - if `axis` is 0 or `'index'` then `by` may contain index
287 levels and/or column labels.
288 - if `axis` is 1 or `'columns'` then `by` may contain column
289 levels and/or index labels.""",
290 "optional_reindex": """
291labels : array-like, optional
292 New labels / index to conform the axis specified by 'axis' to.
293index : array-like, optional
294 New labels for the index. Preferably an Index object to avoid
295 duplicating data.
296columns : array-like, optional
297 New labels for the columns. Preferably an Index object to avoid
298 duplicating data.
299axis : int or str, optional
300 Axis to target. Can be either the axis name ('index', 'columns')
301 or number (0, 1).""",
302}
303
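# These fragments are interpolated into method docstrings through the
# ``Substitution``/``Appender``/``doc`` decorators imported above. Rough sketch
# (illustrative only; ``some_method`` is a hypothetical example, not an API in
# this module):
#
#     @Substitution(**_shared_doc_kwargs)
#     @Appender("Apply along an axis.\n\nParameters\n----------\n%(axis)s\n%(inplace)s")
#     def some_method(self, axis=0, inplace=False):
#         ...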
304_merge_doc = """
305Merge DataFrame or named Series objects with a database-style join.
306
307A named Series object is treated as a DataFrame with a single named column.
308
309The join is done on columns or indexes. If joining columns on
310columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
311on indexes or indexes on a column or columns, the index will be passed on.
312When performing a cross merge, no column specifications to merge on are
313allowed.
314
315.. warning::
316
317 If both key columns contain rows where the key is a null value, those
318 rows will be matched against each other. This is different from usual SQL
319 join behaviour and can lead to unexpected results.
320
321Parameters
322----------%s
323right : DataFrame or named Series
324 Object to merge with.
325how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
326 Type of merge to be performed.
327
328 * left: use only keys from left frame, similar to a SQL left outer join;
329 preserve key order.
330 * right: use only keys from right frame, similar to a SQL right outer join;
331 preserve key order.
332 * outer: use union of keys from both frames, similar to a SQL full outer
333 join; sort keys lexicographically.
334 * inner: use intersection of keys from both frames, similar to a SQL inner
335 join; preserve the order of the left keys.
336 * cross: creates the cartesian product from both frames, preserves the order
337 of the left keys.
338on : label or list
339 Column or index level names to join on. These must be found in both
340 DataFrames. If `on` is None and not merging on indexes then this defaults
341 to the intersection of the columns in both DataFrames.
342left_on : label or list, or array-like
343 Column or index level names to join on in the left DataFrame. Can also
344 be an array or list of arrays of the length of the left DataFrame.
345 These arrays are treated as if they are columns.
346right_on : label or list, or array-like
347 Column or index level names to join on in the right DataFrame. Can also
348 be an array or list of arrays of the length of the right DataFrame.
349 These arrays are treated as if they are columns.
350left_index : bool, default False
351 Use the index from the left DataFrame as the join key(s). If it is a
352 MultiIndex, the number of keys in the other DataFrame (either the index
353 or a number of columns) must match the number of levels.
354right_index : bool, default False
355 Use the index from the right DataFrame as the join key. Same caveats as
356 left_index.
357sort : bool, default False
358 Sort the join keys lexicographically in the result DataFrame. If False,
359 the order of the join keys depends on the join type (how keyword).
360suffixes : list-like, default is ("_x", "_y")
361 A length-2 sequence where each element is optionally a string
362 indicating the suffix to add to overlapping column names in
363 `left` and `right` respectively. Pass a value of `None` instead
364 of a string to indicate that the column name from `left` or
365 `right` should be left as-is, with no suffix. At least one of the
366 values must not be None.
367copy : bool, default True
368 If False, avoid copy if possible.
369
370 .. note::
371 The `copy` keyword will change behavior in pandas 3.0.
372 `Copy-on-Write
373 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
374 will be enabled by default, which means that all methods with a
375 `copy` keyword will use a lazy copy mechanism to defer the copy and
376 ignore the `copy` keyword. The `copy` keyword will be removed in a
377 future version of pandas.
378
379 You can already get the future behavior and improvements through
380 enabling copy on write ``pd.options.mode.copy_on_write = True``
381indicator : bool or str, default False
382 If True, adds a column to the output DataFrame called "_merge" with
383 information on the source of each row. The column can be given a different
384 name by providing a string argument. The column will have a Categorical
385 type with the value of "left_only" for observations whose merge key only
386 appears in the left DataFrame, "right_only" for observations
387 whose merge key only appears in the right DataFrame, and "both"
388 if the observation's merge key is found in both DataFrames.
389
390validate : str, optional
391 If specified, checks if merge is of specified type.
392
393 * "one_to_one" or "1:1": check if merge keys are unique in both
394 left and right datasets.
395 * "one_to_many" or "1:m": check if merge keys are unique in left
396 dataset.
397 * "many_to_one" or "m:1": check if merge keys are unique in right
398 dataset.
399 * "many_to_many" or "m:m": allowed, but does not result in checks.
400
401Returns
402-------
403DataFrame
404 A DataFrame of the two merged objects.
405
406See Also
407--------
408merge_ordered : Merge with optional filling/interpolation.
409merge_asof : Merge on nearest keys.
410DataFrame.join : Similar method using indices.
411
412Examples
413--------
414>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
415... 'value': [1, 2, 3, 5]})
416>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
417... 'value': [5, 6, 7, 8]})
418>>> df1
419 lkey value
4200 foo 1
4211 bar 2
4222 baz 3
4233 foo 5
424>>> df2
425 rkey value
4260 foo 5
4271 bar 6
4282 baz 7
4293 foo 8
430
431Merge df1 and df2 on the lkey and rkey columns. The value columns have
432the default suffixes, _x and _y, appended.
433
434>>> df1.merge(df2, left_on='lkey', right_on='rkey')
435 lkey value_x rkey value_y
4360 foo 1 foo 5
4371 foo 1 foo 8
4382 bar 2 bar 6
4393 baz 3 baz 7
4404 foo 5 foo 5
4415 foo 5 foo 8
442
443Merge DataFrames df1 and df2 with specified left and right suffixes
444appended to any overlapping columns.
445
446>>> df1.merge(df2, left_on='lkey', right_on='rkey',
447... suffixes=('_left', '_right'))
448 lkey value_left rkey value_right
4490 foo 1 foo 5
4501 foo 1 foo 8
4512 bar 2 bar 6
4523 baz 3 baz 7
4534 foo 5 foo 5
4545 foo 5 foo 8
455
456Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
457any overlapping columns.
458
459>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
460Traceback (most recent call last):
461...
462ValueError: columns overlap but no suffix specified:
463 Index(['value'], dtype='object')
464
465>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
466>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
467>>> df1
468 a b
4690 foo 1
4701 bar 2
471>>> df2
472 a c
4730 foo 3
4741 baz 4
475
476>>> df1.merge(df2, how='inner', on='a')
477 a b c
4780 foo 1 3
479
480>>> df1.merge(df2, how='left', on='a')
481 a b c
4820 foo 1 3.0
4831 bar 2 NaN
484
485>>> df1 = pd.DataFrame({'left': ['foo', 'bar']})
486>>> df2 = pd.DataFrame({'right': [7, 8]})
487>>> df1
488 left
4890 foo
4901 bar
491>>> df2
492 right
4930 7
4941 8
495
496>>> df1.merge(df2, how='cross')
497 left right
4980 foo 7
4991 foo 8
5002 bar 7
5013 bar 8
502"""
503
504
505# -----------------------------------------------------------------------
506# DataFrame class
507
508
509class DataFrame(NDFrame, OpsMixin):
510 """
511 Two-dimensional, size-mutable, potentially heterogeneous tabular data.
512
513 Data structure also contains labeled axes (rows and columns).
514 Arithmetic operations align on both row and column labels. Can be
515 thought of as a dict-like container for Series objects. The primary
516 pandas data structure.
517
518 Parameters
519 ----------
520 data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
521 Dict can contain Series, arrays, constants, dataclass or list-like objects. If
522 data is a dict, column order follows insertion-order. If a dict contains Series
523 which have an index defined, it is aligned by its index. This alignment also
524 occurs if data is a Series or a DataFrame itself. Alignment is done on
525 Series/DataFrame inputs.
526
527 If data is a list of dicts, column order follows insertion-order.
528
529 index : Index or array-like
530 Index to use for resulting frame. Will default to RangeIndex if
        no indexing information is part of the input data and no index is provided.
532 columns : Index or array-like
533 Column labels to use for resulting frame when data does not have them,
534 defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels,
535 will perform column selection instead.
536 dtype : dtype, default None
537 Data type to force. Only a single dtype is allowed. If None, infer.
538 copy : bool or None, default None
539 Copy data from inputs.
540 For dict data, the default of None behaves like ``copy=True``. For DataFrame
541 or 2d ndarray input, the default of None behaves like ``copy=False``.
542 If data is a dict containing one or more Series (possibly of different dtypes),
543 ``copy=False`` will ensure that these inputs are not copied.
544
545 .. versionchanged:: 1.3.0
546
547 See Also
548 --------
549 DataFrame.from_records : Constructor from tuples, also record arrays.
550 DataFrame.from_dict : From dicts of Series, arrays, or dicts.
551 read_csv : Read a comma-separated values (csv) file into DataFrame.
552 read_table : Read general delimited file into DataFrame.
553 read_clipboard : Read text from clipboard into DataFrame.
554
555 Notes
556 -----
557 Please reference the :ref:`User Guide <basics.dataframe>` for more information.
558
559 Examples
560 --------
561 Constructing DataFrame from a dictionary.
562
563 >>> d = {'col1': [1, 2], 'col2': [3, 4]}
564 >>> df = pd.DataFrame(data=d)
565 >>> df
566 col1 col2
567 0 1 3
568 1 2 4
569
570 Notice that the inferred dtype is int64.
571
572 >>> df.dtypes
573 col1 int64
574 col2 int64
575 dtype: object
576
577 To enforce a single dtype:
578
579 >>> df = pd.DataFrame(data=d, dtype=np.int8)
580 >>> df.dtypes
581 col1 int8
582 col2 int8
583 dtype: object
584
585 Constructing DataFrame from a dictionary including Series:
586
587 >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
588 >>> pd.DataFrame(data=d, index=[0, 1, 2, 3])
589 col1 col2
590 0 0 NaN
591 1 1 NaN
592 2 2 2.0
593 3 3 3.0
594
595 Constructing DataFrame from numpy ndarray:
596
597 >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
598 ... columns=['a', 'b', 'c'])
599 >>> df2
600 a b c
601 0 1 2 3
602 1 4 5 6
603 2 7 8 9
604
605 Constructing DataFrame from a numpy ndarray that has labeled columns:
606
607 >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
608 ... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")])
609 >>> df3 = pd.DataFrame(data, columns=['c', 'a'])
610 ...
611 >>> df3
612 c a
613 0 3 1
614 1 6 4
615 2 9 7
616
617 Constructing DataFrame from dataclass:
618
619 >>> from dataclasses import make_dataclass
620 >>> Point = make_dataclass("Point", [("x", int), ("y", int)])
621 >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
622 x y
623 0 0 0
624 1 0 3
625 2 2 3
626
627 Constructing DataFrame from Series/DataFrame:
628
629 >>> ser = pd.Series([1, 2, 3], index=["a", "b", "c"])
630 >>> df = pd.DataFrame(data=ser, index=["a", "c"])
631 >>> df
632 0
633 a 1
634 c 3
635
636 >>> df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"], columns=["x"])
637 >>> df2 = pd.DataFrame(data=df1, index=["a", "c"])
638 >>> df2
639 x
640 a 1
641 c 3
642 """
643
644 _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set
645 _typ = "dataframe"
646 _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray)
647 _accessors: set[str] = {"sparse"}
648 _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([])
649 _mgr: BlockManager | ArrayManager
650
651 # similar to __array_priority__, positions DataFrame before Series, Index,
652 # and ExtensionArray. Should NOT be overridden by subclasses.
653 __pandas_priority__ = 4000
654
655 @property
656 def _constructor(self) -> Callable[..., DataFrame]:
657 return DataFrame
658
659 def _constructor_from_mgr(self, mgr, axes) -> DataFrame:
660 df = DataFrame._from_mgr(mgr, axes=axes)
661
662 if type(self) is DataFrame:
663 # This would also work `if self._constructor is DataFrame`, but
664 # this check is slightly faster, benefiting the most-common case.
665 return df
666
667 elif type(self).__name__ == "GeoDataFrame":
668 # Shim until geopandas can override their _constructor_from_mgr
669 # bc they have different behavior for Managers than for DataFrames
670 return self._constructor(mgr)
671
672 # We assume that the subclass __init__ knows how to handle a
673 # pd.DataFrame object.
674 return self._constructor(df)
675
676 _constructor_sliced: Callable[..., Series] = Series
677
678 def _constructor_sliced_from_mgr(self, mgr, axes) -> Series:
679 ser = Series._from_mgr(mgr, axes)
680 ser._name = None # caller is responsible for setting real name
681
682 if type(self) is DataFrame:
683 # This would also work `if self._constructor_sliced is Series`, but
684 # this check is slightly faster, benefiting the most-common case.
685 return ser
686
687 # We assume that the subclass __init__ knows how to handle a
688 # pd.Series object.
689 return self._constructor_sliced(ser)
690
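    # The ``_constructor*_from_mgr`` helpers above let DataFrame subclasses come
    # back out of internal operations with their own type. A minimal sketch of
    # the documented subclassing pattern they support (illustrative only;
    # ``SubSeries``/``SubFrame`` are hypothetical names):
    #
    #     class SubSeries(pd.Series):
    #         @property
    #         def _constructor(self):
    #             return SubSeries
    #
    #     class SubFrame(pd.DataFrame):
    #         @property
    #         def _constructor(self):
    #             return SubFrame
    #
    #         @property
    #         def _constructor_sliced(self):
    #             return SubSeries
    #
    #     type(SubFrame({"a": [1, 2]}).head())  # SubFrame
    #     type(SubFrame({"a": [1, 2]})["a"])    # SubSeries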
691 # ----------------------------------------------------------------------
692 # Constructors
693
694 def __init__(
695 self,
696 data=None,
697 index: Axes | None = None,
698 columns: Axes | None = None,
699 dtype: Dtype | None = None,
700 copy: bool | None = None,
701 ) -> None:
702 allow_mgr = False
703 if dtype is not None:
704 dtype = self._validate_dtype(dtype)
705
706 if isinstance(data, DataFrame):
707 data = data._mgr
708 allow_mgr = True
709 if not copy:
710 # if not copying data, ensure to still return a shallow copy
711 # to avoid the result sharing the same Manager
712 data = data.copy(deep=False)
713
714 if isinstance(data, (BlockManager, ArrayManager)):
715 if not allow_mgr:
716 # GH#52419
717 warnings.warn(
718 f"Passing a {type(data).__name__} to {type(self).__name__} "
719 "is deprecated and will raise in a future version. "
720 "Use public APIs instead.",
721 DeprecationWarning,
722 stacklevel=1, # bump to 2 once pyarrow 15.0 is released with fix
723 )
724
725 if using_copy_on_write():
726 data = data.copy(deep=False)
727 # first check if a Manager is passed without any other arguments
728 # -> use fastpath (without checking Manager type)
729 if index is None and columns is None and dtype is None and not copy:
730 # GH#33357 fastpath
731 NDFrame.__init__(self, data)
732 return
733
734 manager = _get_option("mode.data_manager", silent=True)
735
736 is_pandas_object = isinstance(data, (Series, Index, ExtensionArray))
737 data_dtype = getattr(data, "dtype", None)
738 original_dtype = dtype
739
740 # GH47215
741 if isinstance(index, set):
742 raise ValueError("index cannot be a set")
743 if isinstance(columns, set):
744 raise ValueError("columns cannot be a set")
745
746 if copy is None:
747 if isinstance(data, dict):
748 # retain pre-GH#38939 default behavior
749 copy = True
750 elif (
751 manager == "array"
752 and isinstance(data, (np.ndarray, ExtensionArray))
753 and data.ndim == 2
754 ):
755 # INFO(ArrayManager) by default copy the 2D input array to get
756 # contiguous 1D arrays
757 copy = True
758 elif using_copy_on_write() and not isinstance(
759 data, (Index, DataFrame, Series)
760 ):
761 copy = True
762 else:
763 copy = False
764
765 if data is None:
766 index = index if index is not None else default_index(0)
767 columns = columns if columns is not None else default_index(0)
768 dtype = dtype if dtype is not None else pandas_dtype(object)
769 data = []
770
771 if isinstance(data, (BlockManager, ArrayManager)):
772 mgr = self._init_mgr(
773 data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
774 )
775
776 elif isinstance(data, dict):
777 # GH#38939 de facto copy defaults to False only in non-dict cases
778 mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
779 elif isinstance(data, ma.MaskedArray):
780 from numpy.ma import mrecords
781
782 # masked recarray
783 if isinstance(data, mrecords.MaskedRecords):
784 raise TypeError(
785 "MaskedRecords are not supported. Pass "
786 "{name: data[name] for name in data.dtype.names} "
787 "instead"
788 )
789
790 # a masked array
791 data = sanitize_masked_array(data)
792 mgr = ndarray_to_mgr(
793 data,
794 index,
795 columns,
796 dtype=dtype,
797 copy=copy,
798 typ=manager,
799 )
800
801 elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)):
802 if data.dtype.names:
803 # i.e. numpy structured array
804 data = cast(np.ndarray, data)
805 mgr = rec_array_to_mgr(
806 data,
807 index,
808 columns,
809 dtype,
810 copy,
811 typ=manager,
812 )
813 elif getattr(data, "name", None) is not None:
814 # i.e. Series/Index with non-None name
815 _copy = copy if using_copy_on_write() else True
816 mgr = dict_to_mgr(
817 # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no
818 # attribute "name"
819 {data.name: data}, # type: ignore[union-attr]
820 index,
821 columns,
822 dtype=dtype,
823 typ=manager,
824 copy=_copy,
825 )
826 else:
827 mgr = ndarray_to_mgr(
828 data,
829 index,
830 columns,
831 dtype=dtype,
832 copy=copy,
833 typ=manager,
834 )
835
836 # For data is list-like, or Iterable (will consume into list)
837 elif is_list_like(data):
838 if not isinstance(data, abc.Sequence):
839 if hasattr(data, "__array__"):
840 # GH#44616 big perf improvement for e.g. pytorch tensor
841 data = np.asarray(data)
842 else:
843 data = list(data)
844 if len(data) > 0:
845 if is_dataclass(data[0]):
846 data = dataclasses_to_dicts(data)
847 if not isinstance(data, np.ndarray) and treat_as_nested(data):
848 # exclude ndarray as we may have cast it a few lines above
849 if columns is not None:
850 columns = ensure_index(columns)
851 arrays, columns, index = nested_data_to_arrays(
852 # error: Argument 3 to "nested_data_to_arrays" has incompatible
853 # type "Optional[Collection[Any]]"; expected "Optional[Index]"
854 data,
855 columns,
856 index, # type: ignore[arg-type]
857 dtype,
858 )
859 mgr = arrays_to_mgr(
860 arrays,
861 columns,
862 index,
863 dtype=dtype,
864 typ=manager,
865 )
866 else:
867 mgr = ndarray_to_mgr(
868 data,
869 index,
870 columns,
871 dtype=dtype,
872 copy=copy,
873 typ=manager,
874 )
875 else:
876 mgr = dict_to_mgr(
877 {},
878 index,
879 columns if columns is not None else default_index(0),
880 dtype=dtype,
881 typ=manager,
882 )
883 # For data is scalar
884 else:
885 if index is None or columns is None:
886 raise ValueError("DataFrame constructor not properly called!")
887
888 index = ensure_index(index)
889 columns = ensure_index(columns)
890
891 if not dtype:
892 dtype, _ = infer_dtype_from_scalar(data)
893
894 # For data is a scalar extension dtype
895 if isinstance(dtype, ExtensionDtype):
896 # TODO(EA2D): special case not needed with 2D EAs
897
898 values = [
899 construct_1d_arraylike_from_scalar(data, len(index), dtype)
900 for _ in range(len(columns))
901 ]
902 mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager)
903 else:
904 arr2d = construct_2d_arraylike_from_scalar(
905 data,
906 len(index),
907 len(columns),
908 dtype,
909 copy,
910 )
911
912 mgr = ndarray_to_mgr(
913 arr2d,
914 index,
915 columns,
916 dtype=arr2d.dtype,
917 copy=False,
918 typ=manager,
919 )
920
921 # ensure correct Manager type according to settings
922 mgr = mgr_to_mgr(mgr, typ=manager)
923
924 NDFrame.__init__(self, mgr)
925
926 if original_dtype is None and is_pandas_object and data_dtype == np.object_:
927 if self.dtypes.iloc[0] != data_dtype:
928 warnings.warn(
929 "Dtype inference on a pandas object "
930 "(Series, Index, ExtensionArray) is deprecated. The DataFrame "
931 "constructor will keep the original dtype in the future. "
932 "Call `infer_objects` on the result to get the old "
933 "behavior.",
934 FutureWarning,
935 stacklevel=2,
936 )
937
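    # A quick summary of the constructor dispatch above (illustrative):
    #
    #     pd.DataFrame({"a": [1, 2]})                          # dict path
    #     pd.DataFrame(np.zeros((2, 2)), columns=["x", "y"])   # 2D ndarray path
    #     pd.DataFrame([{"a": 1}, {"a": 2}])                   # nested/list-like path
    #     pd.DataFrame(0, index=[0, 1], columns=["x", "y"])    # scalar broadcast
    #     pd.DataFrame(0)                                      # ValueError: constructor
    #                                                          # not properly called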
938 # ----------------------------------------------------------------------
939
940 def __dataframe__(
941 self, nan_as_null: bool = False, allow_copy: bool = True
942 ) -> DataFrameXchg:
943 """
944 Return the dataframe interchange object implementing the interchange protocol.
945
946 Parameters
947 ----------
948 nan_as_null : bool, default False
949 `nan_as_null` is DEPRECATED and has no effect. Please avoid using
950 it; it will be removed in a future release.
951 allow_copy : bool, default True
952 Whether to allow memory copying when exporting. If set to False
953 it would cause non-zero-copy exports to fail.
954
955 Returns
956 -------
957 DataFrame interchange object
            The object which the consuming library can use to ingest the dataframe.
959
960 Notes
961 -----
962 Details on the interchange protocol:
963 https://data-apis.org/dataframe-protocol/latest/index.html
964
965 Examples
966 --------
967 >>> df_not_necessarily_pandas = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
968 >>> interchange_object = df_not_necessarily_pandas.__dataframe__()
969 >>> interchange_object.column_names()
970 Index(['A', 'B'], dtype='object')
971 >>> df_pandas = (pd.api.interchange.from_dataframe
972 ... (interchange_object.select_columns_by_name(['A'])))
973 >>> df_pandas
974 A
975 0 1
976 1 2
977
978 These methods (``column_names``, ``select_columns_by_name``) should work
979 for any dataframe library which implements the interchange protocol.
980 """
981
982 from pandas.core.interchange.dataframe import PandasDataFrameXchg
983
984 return PandasDataFrameXchg(self, allow_copy=allow_copy)
985
986 def __dataframe_consortium_standard__(
987 self, *, api_version: str | None = None
988 ) -> Any:
989 """
990 Provide entry point to the Consortium DataFrame Standard API.
991
992 This is developed and maintained outside of pandas.
993 Please report any issues to https://github.com/data-apis/dataframe-api-compat.
994 """
995 dataframe_api_compat = import_optional_dependency("dataframe_api_compat")
996 convert_to_standard_compliant_dataframe = (
997 dataframe_api_compat.pandas_standard.convert_to_standard_compliant_dataframe
998 )
999 return convert_to_standard_compliant_dataframe(self, api_version=api_version)
1000
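    # Usage sketch (illustrative; requires the optional ``dataframe_api_compat``
    # package, and the returned object exposes the Consortium Standard's own API
    # rather than pandas methods):
    #
    #     df = pd.DataFrame({"a": [1, 2, 3]})
    #     std_df = df.__dataframe_consortium_standard__()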
1001 def __arrow_c_stream__(self, requested_schema=None):
1002 """
1003 Export the pandas DataFrame as an Arrow C stream PyCapsule.
1004
1005 This relies on pyarrow to convert the pandas DataFrame to the Arrow
1006 format (and follows the default behaviour of ``pyarrow.Table.from_pandas``
1007 in its handling of the index, i.e. store the index as a column except
1008 for RangeIndex).
1009 This conversion is not necessarily zero-copy.
1010
1011 Parameters
1012 ----------
1013 requested_schema : PyCapsule, default None
            The schema to which the dataframe should be cast, passed as a
1015 PyCapsule containing a C ArrowSchema representation of the
1016 requested schema.
1017
1018 Returns
1019 -------
1020 PyCapsule
1021 """
1022 pa = import_optional_dependency("pyarrow", min_version="14.0.0")
1023 if requested_schema is not None:
1024 requested_schema = pa.Schema._import_from_c_capsule(requested_schema)
1025 table = pa.Table.from_pandas(self, schema=requested_schema)
1026 return table.__arrow_c_stream__()
1027
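    # Consumption sketch (illustrative; the capsule is only useful to Arrow-aware
    # consumers, and ``pa.table(df)`` delegating to this method assumes a pyarrow
    # version that understands the Arrow PyCapsule protocol):
    #
    #     df = pd.DataFrame({"a": [1, 2, 3]})
    #     capsule = df.__arrow_c_stream__()  # PyCapsule wrapping an ArrowArrayStream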
1028 # ----------------------------------------------------------------------
1029
1030 @property
1031 def axes(self) -> list[Index]:
1032 """
1033 Return a list representing the axes of the DataFrame.
1034
1035 It has the row axis labels and column axis labels as the only members.
1036 They are returned in that order.
1037
1038 Examples
1039 --------
1040 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
1041 >>> df.axes
1042 [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
1043 dtype='object')]
1044 """
1045 return [self.index, self.columns]
1046
1047 @property
1048 def shape(self) -> tuple[int, int]:
1049 """
1050 Return a tuple representing the dimensionality of the DataFrame.
1051
1052 See Also
1053 --------
1054 ndarray.shape : Tuple of array dimensions.
1055
1056 Examples
1057 --------
1058 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
1059 >>> df.shape
1060 (2, 2)
1061
1062 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
1063 ... 'col3': [5, 6]})
1064 >>> df.shape
1065 (2, 3)
1066 """
1067 return len(self.index), len(self.columns)
1068
1069 @property
1070 def _is_homogeneous_type(self) -> bool:
1071 """
1072 Whether all the columns in a DataFrame have the same type.
1073
1074 Returns
1075 -------
1076 bool
1077
1078 Examples
1079 --------
1080 >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
1081 True
1082 >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
1083 False
1084
1085 Items with the same type but different sizes are considered
1086 different types.
1087
1088 >>> DataFrame({
1089 ... "A": np.array([1, 2], dtype=np.int32),
1090 ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
1091 False
1092 """
1093 # The "<" part of "<=" here is for empty DataFrame cases
1094 return len({arr.dtype for arr in self._mgr.arrays}) <= 1
1095
1096 @property
1097 def _can_fast_transpose(self) -> bool:
1098 """
1099 Can we transpose this DataFrame without creating any new array objects.
1100 """
1101 if isinstance(self._mgr, ArrayManager):
1102 return False
1103 blocks = self._mgr.blocks
1104 if len(blocks) != 1:
1105 return False
1106
1107 dtype = blocks[0].dtype
1108 # TODO(EA2D) special case would be unnecessary with 2D EAs
1109 return not is_1d_only_ea_dtype(dtype)
1110
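    # For intuition (illustrative, assuming the default BlockManager):
    #
    #     pd.DataFrame(np.ones((3, 2)))._can_fast_transpose         # True: one 2D block
    #     pd.DataFrame({"a": [1], "b": ["x"]})._can_fast_transpose  # False: mixed dtypes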
1111 @property
1112 def _values(self) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray:
1113 """
1114 Analogue to ._values that may return a 2D ExtensionArray.
1115 """
1116 mgr = self._mgr
1117
1118 if isinstance(mgr, ArrayManager):
1119 if len(mgr.arrays) == 1 and not is_1d_only_ea_dtype(mgr.arrays[0].dtype):
1120 # error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]"
1121 # has no attribute "reshape"
1122 return mgr.arrays[0].reshape(-1, 1) # type: ignore[union-attr]
1123 return ensure_wrapped_if_datetimelike(self.values)
1124
1125 blocks = mgr.blocks
1126 if len(blocks) != 1:
1127 return ensure_wrapped_if_datetimelike(self.values)
1128
1129 arr = blocks[0].values
1130 if arr.ndim == 1:
1131 # non-2D ExtensionArray
1132 return self.values
1133
1134 # more generally, whatever we allow in NDArrayBackedExtensionBlock
1135 arr = cast("np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", arr)
1136 return arr.T
1137
1138 # ----------------------------------------------------------------------
1139 # Rendering Methods
1140
1141 def _repr_fits_vertical_(self) -> bool:
1142 """
1143 Check length against max_rows.
1144 """
1145 max_rows = get_option("display.max_rows")
1146 return len(self) <= max_rows
1147
1148 def _repr_fits_horizontal_(self) -> bool:
1149 """
1150 Check if full repr fits in horizontal boundaries imposed by the display
1151 options width and max_columns.
1152 """
1153 width, height = console.get_console_size()
1154 max_columns = get_option("display.max_columns")
1155 nb_columns = len(self.columns)
1156
1157 # exceed max columns
1158 if (max_columns and nb_columns > max_columns) or (
1159 width and nb_columns > (width // 2)
1160 ):
1161 return False
1162
        # used by repr_html under IPython notebook; scripts ignore terminal dims
1165 if width is None or not console.in_interactive_session():
1166 return True
1167
1168 if get_option("display.width") is not None or console.in_ipython_frontend():
1169 # check at least the column row for excessive width
1170 max_rows = 1
1171 else:
1172 max_rows = get_option("display.max_rows")
1173
1174 # when auto-detecting, so width=None and not in ipython front end
1175 # check whether repr fits horizontal by actually checking
1176 # the width of the rendered repr
1177 buf = StringIO()
1178
1179 # only care about the stuff we'll actually print out
1180 # and to_string on entire frame may be expensive
1181 d = self
1182
        if max_rows is not None:
            # min of two, where one may be None
            d = d.iloc[: min(max_rows, len(d))]
        else:
            # max_rows is None (unlimited rows): skip the width check entirely
            return True
1188
1189 d.to_string(buf=buf)
1190 value = buf.getvalue()
1191 repr_width = max(len(line) for line in value.split("\n"))
1192
1193 return repr_width < width
1194
1195 def _info_repr(self) -> bool:
1196 """
1197 True if the repr should show the info view.
1198 """
1199 info_repr_option = get_option("display.large_repr") == "info"
1200 return info_repr_option and not (
1201 self._repr_fits_horizontal_() and self._repr_fits_vertical_()
1202 )
1203
1204 def __repr__(self) -> str:
1205 """
1206 Return a string representation for a particular DataFrame.
1207 """
1208 if self._info_repr():
1209 buf = StringIO()
1210 self.info(buf=buf)
1211 return buf.getvalue()
1212
1213 repr_params = fmt.get_dataframe_repr_params()
1214 return self.to_string(**repr_params)
1215
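    # Illustration of the info-view fallback (illustrative, not a doctest): with
    # ``pd.set_option("display.large_repr", "info")``, a frame whose repr does not
    # fit ``display.max_rows``/``display.max_columns`` is rendered through
    # ``DataFrame.info()`` instead of as a truncated table.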
1216 def _repr_html_(self) -> str | None:
1217 """
1218 Return a html representation for a particular DataFrame.
1219
1220 Mainly for IPython notebook.
1221 """
1222 if self._info_repr():
1223 buf = StringIO()
1224 self.info(buf=buf)
1225 # need to escape the <class>, should be the first line.
            val = buf.getvalue().replace("<", r"&lt;", 1)
            val = val.replace(">", r"&gt;", 1)
1228 return f"<pre>{val}</pre>"
1229
1230 if get_option("display.notebook_repr_html"):
1231 max_rows = get_option("display.max_rows")
1232 min_rows = get_option("display.min_rows")
1233 max_cols = get_option("display.max_columns")
1234 show_dimensions = get_option("display.show_dimensions")
1235
1236 formatter = fmt.DataFrameFormatter(
1237 self,
1238 columns=None,
1239 col_space=None,
1240 na_rep="NaN",
1241 formatters=None,
1242 float_format=None,
1243 sparsify=None,
1244 justify=None,
1245 index_names=True,
1246 header=True,
1247 index=True,
1248 bold_rows=True,
1249 escape=True,
1250 max_rows=max_rows,
1251 min_rows=min_rows,
1252 max_cols=max_cols,
1253 show_dimensions=show_dimensions,
1254 decimal=".",
1255 )
1256 return fmt.DataFrameRenderer(formatter).to_html(notebook=True)
1257 else:
1258 return None
1259
1260 @overload
1261 def to_string(
1262 self,
1263 buf: None = ...,
1264 columns: Axes | None = ...,
1265 col_space: int | list[int] | dict[Hashable, int] | None = ...,
1266 header: bool | SequenceNotStr[str] = ...,
1267 index: bool = ...,
1268 na_rep: str = ...,
1269 formatters: fmt.FormattersType | None = ...,
1270 float_format: fmt.FloatFormatType | None = ...,
1271 sparsify: bool | None = ...,
1272 index_names: bool = ...,
1273 justify: str | None = ...,
1274 max_rows: int | None = ...,
1275 max_cols: int | None = ...,
1276 show_dimensions: bool = ...,
1277 decimal: str = ...,
1278 line_width: int | None = ...,
1279 min_rows: int | None = ...,
1280 max_colwidth: int | None = ...,
1281 encoding: str | None = ...,
1282 ) -> str:
1283 ...
1284
1285 @overload
1286 def to_string(
1287 self,
1288 buf: FilePath | WriteBuffer[str],
1289 columns: Axes | None = ...,
1290 col_space: int | list[int] | dict[Hashable, int] | None = ...,
1291 header: bool | SequenceNotStr[str] = ...,
1292 index: bool = ...,
1293 na_rep: str = ...,
1294 formatters: fmt.FormattersType | None = ...,
1295 float_format: fmt.FloatFormatType | None = ...,
1296 sparsify: bool | None = ...,
1297 index_names: bool = ...,
1298 justify: str | None = ...,
1299 max_rows: int | None = ...,
1300 max_cols: int | None = ...,
1301 show_dimensions: bool = ...,
1302 decimal: str = ...,
1303 line_width: int | None = ...,
1304 min_rows: int | None = ...,
1305 max_colwidth: int | None = ...,
1306 encoding: str | None = ...,
1307 ) -> None:
1308 ...
1309
1310 @deprecate_nonkeyword_arguments(
1311 version="3.0", allowed_args=["self", "buf"], name="to_string"
1312 )
1313 @Substitution(
1314 header_type="bool or list of str",
1315 header="Write out the column names. If a list of columns "
1316 "is given, it is assumed to be aliases for the "
1317 "column names",
1318 col_space_type="int, list or dict of int",
        col_space="The minimum width of each column. If a list of ints is given, "
        "each integer corresponds to one column. If a dict is given, the key "
1321 "references the column, while the value defines the space to use.",
1322 )
1323 @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
1324 def to_string(
1325 self,
1326 buf: FilePath | WriteBuffer[str] | None = None,
1327 columns: Axes | None = None,
1328 col_space: int | list[int] | dict[Hashable, int] | None = None,
1329 header: bool | SequenceNotStr[str] = True,
1330 index: bool = True,
1331 na_rep: str = "NaN",
1332 formatters: fmt.FormattersType | None = None,
1333 float_format: fmt.FloatFormatType | None = None,
1334 sparsify: bool | None = None,
1335 index_names: bool = True,
1336 justify: str | None = None,
1337 max_rows: int | None = None,
1338 max_cols: int | None = None,
1339 show_dimensions: bool = False,
1340 decimal: str = ".",
1341 line_width: int | None = None,
1342 min_rows: int | None = None,
1343 max_colwidth: int | None = None,
1344 encoding: str | None = None,
1345 ) -> str | None:
1346 """
1347 Render a DataFrame to a console-friendly tabular output.
1348 %(shared_params)s
1349 line_width : int, optional
1350 Width to wrap a line in characters.
1351 min_rows : int, optional
1352 The number of rows to display in the console in a truncated repr
1353 (when number of rows is above `max_rows`).
1354 max_colwidth : int, optional
1355 Max width to truncate each column in characters. By default, no limit.
1356 encoding : str, default "utf-8"
1357 Set character encoding.
1358 %(returns)s
1359 See Also
1360 --------
1361 to_html : Convert DataFrame to HTML.
1362
1363 Examples
1364 --------
1365 >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
1366 >>> df = pd.DataFrame(d)
1367 >>> print(df.to_string())
1368 col1 col2
1369 0 1 4
1370 1 2 5
1371 2 3 6
1372 """
1373 from pandas import option_context
1374
1375 with option_context("display.max_colwidth", max_colwidth):
1376 formatter = fmt.DataFrameFormatter(
1377 self,
1378 columns=columns,
1379 col_space=col_space,
1380 na_rep=na_rep,
1381 formatters=formatters,
1382 float_format=float_format,
1383 sparsify=sparsify,
1384 justify=justify,
1385 index_names=index_names,
1386 header=header,
1387 index=index,
1388 min_rows=min_rows,
1389 max_rows=max_rows,
1390 max_cols=max_cols,
1391 show_dimensions=show_dimensions,
1392 decimal=decimal,
1393 )
1394 return fmt.DataFrameRenderer(formatter).to_string(
1395 buf=buf,
1396 encoding=encoding,
1397 line_width=line_width,
1398 )
1399
1400 def _get_values_for_csv(
1401 self,
1402 *,
1403 float_format: FloatFormatType | None,
1404 date_format: str | None,
1405 decimal: str,
1406 na_rep: str,
1407 quoting, # int csv.QUOTE_FOO from stdlib
1408 ) -> Self:
1409 # helper used by to_csv
1410 mgr = self._mgr.get_values_for_csv(
1411 float_format=float_format,
1412 date_format=date_format,
1413 decimal=decimal,
1414 na_rep=na_rep,
1415 quoting=quoting,
1416 )
1417 # error: Incompatible return value type (got "DataFrame", expected "Self")
1418 return self._constructor_from_mgr(mgr, axes=mgr.axes) # type: ignore[return-value]
1419
1420 # ----------------------------------------------------------------------
1421
1422 @property
1423 def style(self) -> Styler:
1424 """
1425 Returns a Styler object.
1426
1427 Contains methods for building a styled HTML representation of the DataFrame.
1428
1429 See Also
1430 --------
1431 io.formats.style.Styler : Helps style a DataFrame or Series according to the
1432 data with HTML and CSS.
1433
1434 Examples
1435 --------
1436 >>> df = pd.DataFrame({'A': [1, 2, 3]})
1437 >>> df.style # doctest: +SKIP
1438
1439 Please see
1440 `Table Visualization <../../user_guide/style.ipynb>`_ for more examples.
1441 """
1442 from pandas.io.formats.style import Styler
1443
1444 return Styler(self)
1445
1446 _shared_docs[
1447 "items"
1448 ] = r"""
1449 Iterate over (column name, Series) pairs.
1450
1451 Iterates over the DataFrame columns, returning a tuple with
1452 the column name and the content as a Series.
1453
1454 Yields
1455 ------
1456 label : object
1457 The column names for the DataFrame being iterated over.
1458 content : Series
1459 The column entries belonging to each label, as a Series.
1460
1461 See Also
1462 --------
1463 DataFrame.iterrows : Iterate over DataFrame rows as
1464 (index, Series) pairs.
1465 DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
1466 of the values.
1467
1468 Examples
1469 --------
1470 >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
1471 ... 'population': [1864, 22000, 80000]},
1472 ... index=['panda', 'polar', 'koala'])
1473 >>> df
1474 species population
1475 panda bear 1864
1476 polar bear 22000
1477 koala marsupial 80000
1478 >>> for label, content in df.items():
1479 ... print(f'label: {label}')
1480 ... print(f'content: {content}', sep='\n')
1481 ...
1482 label: species
1483 content:
1484 panda bear
1485 polar bear
1486 koala marsupial
1487 Name: species, dtype: object
1488 label: population
1489 content:
1490 panda 1864
1491 polar 22000
1492 koala 80000
1493 Name: population, dtype: int64
1494 """
1495
1496 @Appender(_shared_docs["items"])
1497 def items(self) -> Iterable[tuple[Hashable, Series]]:
1498 if self.columns.is_unique and hasattr(self, "_item_cache"):
1499 for k in self.columns:
1500 yield k, self._get_item_cache(k)
1501 else:
1502 for i, k in enumerate(self.columns):
1503 yield k, self._ixs(i, axis=1)
1504
1505 def iterrows(self) -> Iterable[tuple[Hashable, Series]]:
1506 """
1507 Iterate over DataFrame rows as (index, Series) pairs.
1508
1509 Yields
1510 ------
1511 index : label or tuple of label
1512 The index of the row. A tuple for a `MultiIndex`.
1513 data : Series
1514 The data of the row as a Series.
1515
1516 See Also
1517 --------
1518 DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values.
1519 DataFrame.items : Iterate over (column name, Series) pairs.
1520
1521 Notes
1522 -----
1523 1. Because ``iterrows`` returns a Series for each row,
1524 it does **not** preserve dtypes across the rows (dtypes are
1525 preserved across columns for DataFrames).
1526
1527 To preserve dtypes while iterating over the rows, it is better
1528 to use :meth:`itertuples` which returns namedtuples of the values
1529 and which is generally faster than ``iterrows``.
1530
1531 2. You should **never modify** something you are iterating over.
1532 This is not guaranteed to work in all cases. Depending on the
1533 data types, the iterator returns a copy and not a view, and writing
1534 to it will have no effect.
1535
1536 Examples
1537 --------
1538
1539 >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
1540 >>> row = next(df.iterrows())[1]
1541 >>> row
1542 int 1.0
1543 float 1.5
1544 Name: 0, dtype: float64
1545 >>> print(row['int'].dtype)
1546 float64
1547 >>> print(df['int'].dtype)
1548 int64
1549 """
1550 columns = self.columns
1551 klass = self._constructor_sliced
1552 using_cow = using_copy_on_write()
1553 for k, v in zip(self.index, self.values):
1554 s = klass(v, index=columns, name=k).__finalize__(self)
1555 if using_cow and self._mgr.is_single_block:
1556 s._mgr.add_references(self._mgr) # type: ignore[arg-type]
1557 yield k, s
1558
1559 def itertuples(
1560 self, index: bool = True, name: str | None = "Pandas"
1561 ) -> Iterable[tuple[Any, ...]]:
1562 """
1563 Iterate over DataFrame rows as namedtuples.
1564
1565 Parameters
1566 ----------
1567 index : bool, default True
1568 If True, return the index as the first element of the tuple.
1569 name : str or None, default "Pandas"
1570 The name of the returned namedtuples or None to return regular
1571 tuples.
1572
1573 Returns
1574 -------
1575 iterator
1576 An object to iterate over namedtuples for each row in the
1577 DataFrame with the first field possibly being the index and
1578 following fields being the column values.
1579
1580 See Also
1581 --------
1582 DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
1583 pairs.
1584 DataFrame.items : Iterate over (column name, Series) pairs.
1585
1586 Notes
1587 -----
1588 The column names will be renamed to positional names if they are
1589 invalid Python identifiers, repeated, or start with an underscore.
1590
1591 Examples
1592 --------
1593 >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
1594 ... index=['dog', 'hawk'])
1595 >>> df
1596 num_legs num_wings
1597 dog 4 0
1598 hawk 2 2
1599 >>> for row in df.itertuples():
1600 ... print(row)
1601 ...
1602 Pandas(Index='dog', num_legs=4, num_wings=0)
1603 Pandas(Index='hawk', num_legs=2, num_wings=2)
1604
1605 By setting the `index` parameter to False we can remove the index
1606 as the first element of the tuple:
1607
1608 >>> for row in df.itertuples(index=False):
1609 ... print(row)
1610 ...
1611 Pandas(num_legs=4, num_wings=0)
1612 Pandas(num_legs=2, num_wings=2)
1613
1614 With the `name` parameter set we set a custom name for the yielded
1615 namedtuples:
1616
1617 >>> for row in df.itertuples(name='Animal'):
1618 ... print(row)
1619 ...
1620 Animal(Index='dog', num_legs=4, num_wings=0)
1621 Animal(Index='hawk', num_legs=2, num_wings=2)
1622 """
1623 arrays = []
1624 fields = list(self.columns)
1625 if index:
1626 arrays.append(self.index)
1627 fields.insert(0, "Index")
1628
1629 # use integer indexing because of possible duplicate column names
1630 arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
1631
1632 if name is not None:
1633 # https://github.com/python/mypy/issues/9046
1634 # error: namedtuple() expects a string literal as the first argument
1635 itertuple = collections.namedtuple( # type: ignore[misc]
1636 name, fields, rename=True
1637 )
1638 return map(itertuple._make, zip(*arrays))
1639
1640 # fallback to regular tuples
1641 return zip(*arrays)
1642
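    # Field renaming in action (illustrative): invalid or duplicate column names
    # fall back to positional names because ``rename=True`` is passed to
    # ``collections.namedtuple`` above:
    #
    #     df = pd.DataFrame([[1, 2]], columns=["a", "a"])
    #     next(df.itertuples())  # Pandas(Index=0, a=1, _2=2)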
1643 def __len__(self) -> int:
1644 """
1645 Returns length of info axis, but here we use the index.
1646 """
1647 return len(self.index)
1648
1649 @overload
1650 def dot(self, other: Series) -> Series:
1651 ...
1652
1653 @overload
1654 def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame:
1655 ...
1656
1657 def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
1658 """
1659 Compute the matrix multiplication between the DataFrame and other.
1660
1661 This method computes the matrix product between the DataFrame and the
        values of another Series, DataFrame or a numpy array.
1663
1664 It can also be called using ``self @ other``.
1665
1666 Parameters
1667 ----------
1668 other : Series, DataFrame or array-like
1669 The other object to compute the matrix product with.
1670
1671 Returns
1672 -------
1673 Series or DataFrame
1674 If other is a Series, return the matrix product between self and
1675 other as a Series. If other is a DataFrame or a numpy.array, return
            the matrix product of self and other as a DataFrame.
1677
1678 See Also
1679 --------
1680 Series.dot: Similar method for Series.
1681
1682 Notes
1683 -----
1684 The dimensions of DataFrame and other must be compatible in order to
1685 compute the matrix multiplication. In addition, the column names of
1686 DataFrame and the index of other must contain the same values, as they
1687 will be aligned prior to the multiplication.
1688
1689 The dot method for Series computes the inner product, instead of the
1690 matrix product here.
1691
1692 Examples
1693 --------
1694 Here we multiply a DataFrame with a Series.
1695
1696 >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
1697 >>> s = pd.Series([1, 1, 2, 1])
1698 >>> df.dot(s)
1699 0 -4
1700 1 5
1701 dtype: int64
1702
1703 Here we multiply a DataFrame with another DataFrame.
1704
1705 >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
1706 >>> df.dot(other)
1707 0 1
1708 0 1 4
1709 1 2 2
1710
        Note that the dot method gives the same result as @
1712
1713 >>> df @ other
1714 0 1
1715 0 1 4
1716 1 2 2
1717
        The dot method also works if other is a np.array.
1719
1720 >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
1721 >>> df.dot(arr)
1722 0 1
1723 0 1 4
1724 1 2 2
1725
1726 Note how shuffling of the objects does not change the result.
1727
1728 >>> s2 = s.reindex([1, 0, 2, 3])
1729 >>> df.dot(s2)
1730 0 -4
1731 1 5
1732 dtype: int64
1733 """
1734 if isinstance(other, (Series, DataFrame)):
1735 common = self.columns.union(other.index)
1736 if len(common) > len(self.columns) or len(common) > len(other.index):
1737 raise ValueError("matrices are not aligned")
1738
1739 left = self.reindex(columns=common, copy=False)
1740 right = other.reindex(index=common, copy=False)
1741 lvals = left.values
1742 rvals = right._values
1743 else:
1744 left = self
1745 lvals = self.values
1746 rvals = np.asarray(other)
1747 if lvals.shape[1] != rvals.shape[0]:
1748 raise ValueError(
1749 f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}"
1750 )
1751
1752 if isinstance(other, DataFrame):
1753 common_type = find_common_type(list(self.dtypes) + list(other.dtypes))
1754 return self._constructor(
1755 np.dot(lvals, rvals),
1756 index=left.index,
1757 columns=other.columns,
1758 copy=False,
1759 dtype=common_type,
1760 )
1761 elif isinstance(other, Series):
1762 common_type = find_common_type(list(self.dtypes) + [other.dtypes])
1763 return self._constructor_sliced(
1764 np.dot(lvals, rvals), index=left.index, copy=False, dtype=common_type
1765 )
1766 elif isinstance(rvals, (np.ndarray, Index)):
1767 result = np.dot(lvals, rvals)
1768 if result.ndim == 2:
1769 return self._constructor(result, index=left.index, copy=False)
1770 else:
1771 return self._constructor_sliced(result, index=left.index, copy=False)
1772 else: # pragma: no cover
1773 raise TypeError(f"unsupported type: {type(other)}")
1774
1775 @overload
1776 def __matmul__(self, other: Series) -> Series:
1777 ...
1778
1779 @overload
1780 def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
1781 ...
1782
1783 def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
1784 """
1785 Matrix multiplication using binary `@` operator.
1786 """
1787 return self.dot(other)
1788
1789 def __rmatmul__(self, other) -> DataFrame:
1790 """
1791 Matrix multiplication using binary `@` operator.
1792 """
1793 try:
1794 return self.T.dot(np.transpose(other)).T
1795 except ValueError as err:
1796 if "shape mismatch" not in str(err):
1797 raise
1798 # GH#21581 give exception message for original shapes
1799 msg = f"shapes {np.shape(other)} and {self.shape} not aligned"
1800 raise ValueError(msg) from err
1801
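    # The operator form (illustrative):
    #
    #     df = pd.DataFrame([[0, 1], [1, 2]])
    #     arr = np.array([[1, 0], [0, 1]])
    #     df @ arr   # same as df.dot(arr), returns a DataFrame
    #     arr @ df   # handled by __rmatmul__, also returns a DataFrame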
1802 # ----------------------------------------------------------------------
1803 # IO methods (to / from other formats)
1804
1805 @classmethod
1806 def from_dict(
1807 cls,
1808 data: dict,
1809 orient: FromDictOrient = "columns",
1810 dtype: Dtype | None = None,
1811 columns: Axes | None = None,
1812 ) -> DataFrame:
1813 """
1814 Construct DataFrame from dict of array-like or dicts.
1815
1816 Creates DataFrame object from dictionary by columns or by index
1817 allowing dtype specification.
1818
1819 Parameters
1820 ----------
1821 data : dict
1822 Of the form {field : array-like} or {field : dict}.
1823 orient : {'columns', 'index', 'tight'}, default 'columns'
1824 The "orientation" of the data. If the keys of the passed dict
1825 should be the columns of the resulting DataFrame, pass 'columns'
1826 (default). Otherwise if the keys should be rows, pass 'index'.
1827 If 'tight', assume a dict with keys ['index', 'columns', 'data',
1828 'index_names', 'column_names'].
1829
1830 .. versionadded:: 1.4.0
1831 'tight' as an allowed value for the ``orient`` argument
1832
1833 dtype : dtype, default None
1834 Data type to force after DataFrame construction, otherwise infer.
1835 columns : list, default None
1836 Column labels to use when ``orient='index'``. Raises a ValueError
1837 if used with ``orient='columns'`` or ``orient='tight'``.
1838
1839 Returns
1840 -------
1841 DataFrame
1842
1843 See Also
1844 --------
1845 DataFrame.from_records : DataFrame from structured ndarray, sequence
1846 of tuples or dicts, or DataFrame.
1847 DataFrame : DataFrame object creation using constructor.
1848 DataFrame.to_dict : Convert the DataFrame to a dictionary.
1849
1850 Examples
1851 --------
1852 By default the keys of the dict become the DataFrame columns:
1853
1854 >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
1855 >>> pd.DataFrame.from_dict(data)
1856 col_1 col_2
1857 0 3 a
1858 1 2 b
1859 2 1 c
1860 3 0 d
1861
1862 Specify ``orient='index'`` to create the DataFrame using dictionary
1863 keys as rows:
1864
1865 >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
1866 >>> pd.DataFrame.from_dict(data, orient='index')
1867 0 1 2 3
1868 row_1 3 2 1 0
1869 row_2 a b c d
1870
1871 When using the 'index' orientation, the column names can be
1872 specified manually:
1873
1874 >>> pd.DataFrame.from_dict(data, orient='index',
1875 ... columns=['A', 'B', 'C', 'D'])
1876 A B C D
1877 row_1 3 2 1 0
1878 row_2 a b c d
1879
1880 Specify ``orient='tight'`` to create the DataFrame using a 'tight'
1881 format:
1882
1883 >>> data = {'index': [('a', 'b'), ('a', 'c')],
1884 ... 'columns': [('x', 1), ('y', 2)],
1885 ... 'data': [[1, 3], [2, 4]],
1886 ... 'index_names': ['n1', 'n2'],
1887 ... 'column_names': ['z1', 'z2']}
1888 >>> pd.DataFrame.from_dict(data, orient='tight')
1889 z1 x y
1890 z2 1 2
1891 n1 n2
1892 a b 1 3
1893 c 2 4
1894 """
1895 index = None
1896 orient = orient.lower() # type: ignore[assignment]
1897 if orient == "index":
1898 if len(data) > 0:
1899 # TODO speed up Series case
1900 if isinstance(next(iter(data.values())), (Series, dict)):
1901 data = _from_nested_dict(data)
1902 else:
1903 index = list(data.keys())
1904 # error: Incompatible types in assignment (expression has type
1905 # "List[Any]", variable has type "Dict[Any, Any]")
1906 data = list(data.values()) # type: ignore[assignment]
1907 elif orient in ("columns", "tight"):
1908 if columns is not None:
1909 raise ValueError(f"cannot use columns parameter with orient='{orient}'")
1910 else: # pragma: no cover
1911 raise ValueError(
1912 f"Expected 'index', 'columns' or 'tight' for orient parameter. "
1913 f"Got '{orient}' instead"
1914 )
1915
1916 if orient != "tight":
1917 return cls(data, index=index, columns=columns, dtype=dtype)
1918 else:
1919 realdata = data["data"]
1920
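            # For 'tight' input, rebuild the row and column axes from the
            # stored label tuples and level names (a MultiIndex whenever more
            # than one level name is given), then construct from the raw data.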
1921 def create_index(indexlist, namelist):
1922 index: Index
1923 if len(namelist) > 1:
1924 index = MultiIndex.from_tuples(indexlist, names=namelist)
1925 else:
1926 index = Index(indexlist, name=namelist[0])
1927 return index
1928
1929 index = create_index(data["index"], data["index_names"])
1930 columns = create_index(data["columns"], data["column_names"])
1931 return cls(realdata, index=index, columns=columns, dtype=dtype)
1932
1933 def to_numpy(
1934 self,
1935 dtype: npt.DTypeLike | None = None,
1936 copy: bool = False,
1937 na_value: object = lib.no_default,
1938 ) -> np.ndarray:
1939 """
1940 Convert the DataFrame to a NumPy array.
1941
1942 By default, the dtype of the returned array will be the common NumPy
1943 dtype of all types in the DataFrame. For example, if the dtypes are
        ``float16`` and ``float32``, the resulting dtype will be ``float32``.
1945 This may require copying data and coercing values, which may be
1946 expensive.
1947
1948 Parameters
1949 ----------
1950 dtype : str or numpy.dtype, optional
1951 The dtype to pass to :meth:`numpy.asarray`.
1952 copy : bool, default False
1953 Whether to ensure that the returned value is not a view on
1954 another array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
1956 a copy is made, even if not strictly necessary.
1957 na_value : Any, optional
1958 The value to use for missing values. The default value depends
1959 on `dtype` and the dtypes of the DataFrame columns.
1960
1961 Returns
1962 -------
1963 numpy.ndarray
1964
1965 See Also
1966 --------
1967 Series.to_numpy : Similar method for Series.
1968
1969 Examples
1970 --------
1971 >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
1972 array([[1, 3],
1973 [2, 4]])
1974
1975 With heterogeneous data, the lowest common type will have to
1976 be used.
1977
1978 >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
1979 >>> df.to_numpy()
1980 array([[1. , 3. ],
1981 [2. , 4.5]])
1982
1983 For a mix of numeric and non-numeric types, the output array will
1984 have object dtype.
1985
1986 >>> df['C'] = pd.date_range('2000', periods=2)
1987 >>> df.to_numpy()
1988 array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
1989 [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
1990 """
1991 if dtype is not None:
1992 dtype = np.dtype(dtype)
1993 result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value)
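        # ``as_array`` may return a dtype object that is equal but not
        # identical to the request; ``np.asarray`` is a cheap no-op when the
        # dtype already matches and performs the conversion otherwise.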
1994 if result.dtype is not dtype:
1995 result = np.asarray(result, dtype=dtype)
1996
1997 return result
1998
1999 def _create_data_for_split_and_tight_to_dict(
2000 self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int]
2001 ) -> list:
2002 """
        Simple helper method to create the main output data for
        ``to_dict(orient="split")`` and ``to_dict(orient="tight")``.
2005 """
2006 if are_all_object_dtype_cols:
2007 data = [
2008 list(map(maybe_box_native, t))
2009 for t in self.itertuples(index=False, name=None)
2010 ]
2011 else:
2012 data = [list(t) for t in self.itertuples(index=False, name=None)]
2013 if object_dtype_indices:
                # If we have object_dtype_cols, apply maybe_box_native after list
2015 # comprehension for perf
2016 for row in data:
2017 for i in object_dtype_indices:
2018 row[i] = maybe_box_native(row[i])
2019 return data
2020
2021 @overload
2022 def to_dict(
2023 self,
2024 orient: Literal["dict", "list", "series", "split", "tight", "index"] = ...,
2025 *,
2026 into: type[MutableMappingT] | MutableMappingT,
2027 index: bool = ...,
2028 ) -> MutableMappingT:
2029 ...
2030
2031 @overload
2032 def to_dict(
2033 self,
2034 orient: Literal["records"],
2035 *,
2036 into: type[MutableMappingT] | MutableMappingT,
2037 index: bool = ...,
2038 ) -> list[MutableMappingT]:
2039 ...
2040
2041 @overload
2042 def to_dict(
2043 self,
2044 orient: Literal["dict", "list", "series", "split", "tight", "index"] = ...,
2045 *,
2046 into: type[dict] = ...,
2047 index: bool = ...,
2048 ) -> dict:
2049 ...
2050
2051 @overload
2052 def to_dict(
2053 self,
2054 orient: Literal["records"],
2055 *,
2056 into: type[dict] = ...,
2057 index: bool = ...,
2058 ) -> list[dict]:
2059 ...
2060
2061 # error: Incompatible default for argument "into" (default has type "type
2062 # [dict[Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT")
2063 @deprecate_nonkeyword_arguments(
2064 version="3.0", allowed_args=["self", "orient"], name="to_dict"
2065 )
2066 def to_dict(
2067 self,
2068 orient: Literal[
2069 "dict", "list", "series", "split", "tight", "records", "index"
2070 ] = "dict",
2071 into: type[MutableMappingT]
2072 | MutableMappingT = dict, # type: ignore[assignment]
2073 index: bool = True,
2074 ) -> MutableMappingT | list[MutableMappingT]:
2075 """
2076 Convert the DataFrame to a dictionary.
2077
2078 The type of the key-value pairs can be customized with the parameters
2079 (see below).
2080
2081 Parameters
2082 ----------
2083 orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'}
2084 Determines the type of the values of the dictionary.
2085
2086 - 'dict' (default) : dict like {column -> {index -> value}}
2087 - 'list' : dict like {column -> [values]}
2088 - 'series' : dict like {column -> Series(values)}
2089 - 'split' : dict like
2090 {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
2091 - 'tight' : dict like
2092 {'index' -> [index], 'columns' -> [columns], 'data' -> [values],
2093 'index_names' -> [index.names], 'column_names' -> [column.names]}
2094 - 'records' : list like
2095 [{column -> value}, ... , {column -> value}]
2096 - 'index' : dict like {index -> {column -> value}}
2097
2098 .. versionadded:: 1.4.0
2099 'tight' as an allowed value for the ``orient`` argument
2100
2101 into : class, default dict
2102 The collections.abc.MutableMapping subclass used for all Mappings
2103 in the return value. Can be the actual class or an empty
2104 instance of the mapping type you want. If you want a
2105 collections.defaultdict, you must pass it initialized.
2106
2107 index : bool, default True
2108 Whether to include the index item (and index_names item if `orient`
2109 is 'tight') in the returned dictionary. Can only be ``False``
2110 when `orient` is 'split' or 'tight'.
2111
2112 .. versionadded:: 2.0.0
2113
2114 Returns
2115 -------
2116 dict, list or collections.abc.MutableMapping
2117 Return a collections.abc.MutableMapping object representing the
2118 DataFrame. The resulting transformation depends on the `orient`
2119 parameter.
2120
2121 See Also
2122 --------
2123 DataFrame.from_dict: Create a DataFrame from a dictionary.
2124 DataFrame.to_json: Convert a DataFrame to JSON format.
2125
2126 Examples
2127 --------
2128 >>> df = pd.DataFrame({'col1': [1, 2],
2129 ... 'col2': [0.5, 0.75]},
2130 ... index=['row1', 'row2'])
2131 >>> df
2132 col1 col2
2133 row1 1 0.50
2134 row2 2 0.75
2135 >>> df.to_dict()
2136 {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
2137
2138 You can specify the return orientation.
2139
2140 >>> df.to_dict('series')
2141 {'col1': row1 1
2142 row2 2
2143 Name: col1, dtype: int64,
2144 'col2': row1 0.50
2145 row2 0.75
2146 Name: col2, dtype: float64}
2147
2148 >>> df.to_dict('split')
2149 {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
2150 'data': [[1, 0.5], [2, 0.75]]}
2151
2152 >>> df.to_dict('records')
2153 [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
2154
2155 >>> df.to_dict('index')
2156 {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
2157
2158 >>> df.to_dict('tight')
2159 {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
2160 'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]}
2161
2162 You can also specify the mapping type.
2163
2164 >>> from collections import OrderedDict, defaultdict
2165 >>> df.to_dict(into=OrderedDict)
2166 OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
2167 ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
2168
2169 If you want a `defaultdict`, you need to initialize it:
2170
2171 >>> dd = defaultdict(list)
2172 >>> df.to_dict('records', into=dd)
2173 [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
2174 defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
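
        With ``orient='split'`` or ``orient='tight'``, ``index=False`` drops
        the index information (illustrative):

        >>> df.to_dict('split', index=False)
        {'columns': ['col1', 'col2'], 'data': [[1, 0.5], [2, 0.75]]}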
2175 """
2176 from pandas.core.methods.to_dict import to_dict
2177
2178 return to_dict(self, orient, into=into, index=index)
2179
2180 @deprecate_nonkeyword_arguments(
2181 version="3.0", allowed_args=["self", "destination_table"], name="to_gbq"
2182 )
2183 def to_gbq(
2184 self,
2185 destination_table: str,
2186 project_id: str | None = None,
2187 chunksize: int | None = None,
2188 reauth: bool = False,
2189 if_exists: ToGbqIfexist = "fail",
2190 auth_local_webserver: bool = True,
2191 table_schema: list[dict[str, str]] | None = None,
2192 location: str | None = None,
2193 progress_bar: bool = True,
2194 credentials=None,
2195 ) -> None:
2196 """
2197 Write a DataFrame to a Google BigQuery table.
2198
2199 .. deprecated:: 2.2.0
2200
2201 Please use ``pandas_gbq.to_gbq`` instead.
2202
2203 This function requires the `pandas-gbq package
2204 <https://pandas-gbq.readthedocs.io>`__.
2205
2206 See the `How to authenticate with Google BigQuery
2207 <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
2208 guide for authentication instructions.
2209
2210 Parameters
2211 ----------
2212 destination_table : str
2213 Name of table to be written, in the form ``dataset.tablename``.
2214 project_id : str, optional
2215 Google BigQuery Account project ID. Optional when available from
2216 the environment.
2217 chunksize : int, optional
2218 Number of rows to be inserted in each chunk from the dataframe.
2219 Set to ``None`` to load the whole dataframe at once.
2220 reauth : bool, default False
2221 Force Google BigQuery to re-authenticate the user. This is useful
2222 if multiple accounts are used.
2223 if_exists : str, default 'fail'
2224 Behavior when the destination table exists. Value can be one of:
2225
2226 ``'fail'``
2227 If table exists raise pandas_gbq.gbq.TableCreationError.
2228 ``'replace'``
2229 If table exists, drop it, recreate it, and insert data.
2230 ``'append'``
2231 If table exists, insert data. Create if does not exist.
2232 auth_local_webserver : bool, default True
2233 Use the `local webserver flow`_ instead of the `console flow`_
2234 when getting user credentials.
2235
2236 .. _local webserver flow:
2237 https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
2238 .. _console flow:
2239 https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
2240
2241 *New in version 0.2.0 of pandas-gbq*.
2242
2243 .. versionchanged:: 1.5.0
2244 Default value is changed to ``True``. Google has deprecated the
2245 ``auth_local_webserver = False`` `"out of band" (copy-paste)
2246 flow
2247 <https://developers.googleblog.com/2022/02/making-oauth-flows-safer.html?m=1#disallowed-oob>`_.
2248 table_schema : list of dicts, optional
            List of BigQuery table fields to which the DataFrame
            columns conform, e.g. ``[{'name': 'col1', 'type':
2251 'STRING'},...]``. If schema is not provided, it will be
2252 generated according to dtypes of DataFrame columns. See
2253 BigQuery API documentation on available names of a field.
2254
2255 *New in version 0.3.1 of pandas-gbq*.
2256 location : str, optional
2257 Location where the load job should run. See the `BigQuery locations
2258 documentation
2259 <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
2260 list of available locations. The location must match that of the
2261 target dataset.
2262
2263 *New in version 0.5.0 of pandas-gbq*.
2264 progress_bar : bool, default True
2265 Use the library `tqdm` to show the progress bar for the upload,
2266 chunk by chunk.
2267
2268 *New in version 0.5.0 of pandas-gbq*.
2269 credentials : google.auth.credentials.Credentials, optional
2270 Credentials for accessing Google APIs. Use this parameter to
2271 override default credentials, such as to use Compute Engine
2272 :class:`google.auth.compute_engine.Credentials` or Service
2273 Account :class:`google.oauth2.service_account.Credentials`
2274 directly.
2275
2276 *New in version 0.8.0 of pandas-gbq*.
2277
2278 See Also
2279 --------
2280 pandas_gbq.to_gbq : This function in the pandas-gbq library.
2281 read_gbq : Read a DataFrame from Google BigQuery.
2282
2283 Examples
2284 --------
2285 Example taken from `Google BigQuery documentation
2286 <https://cloud.google.com/bigquery/docs/samples/bigquery-pandas-gbq-to-gbq-simple>`_
2287
2288 >>> project_id = "my-project"
2289 >>> table_id = 'my_dataset.my_table'
2290 >>> df = pd.DataFrame({
2291 ... "my_string": ["a", "b", "c"],
2292 ... "my_int64": [1, 2, 3],
2293 ... "my_float64": [4.0, 5.0, 6.0],
2294 ... "my_bool1": [True, False, True],
2295 ... "my_bool2": [False, True, False],
2296 ... "my_dates": pd.date_range("now", periods=3),
2297 ... }
2298 ... )
2299
2300 >>> df.to_gbq(table_id, project_id=project_id) # doctest: +SKIP
2301 """
2302 from pandas.io import gbq
2303
2304 gbq.to_gbq(
2305 self,
2306 destination_table,
2307 project_id=project_id,
2308 chunksize=chunksize,
2309 reauth=reauth,
2310 if_exists=if_exists,
2311 auth_local_webserver=auth_local_webserver,
2312 table_schema=table_schema,
2313 location=location,
2314 progress_bar=progress_bar,
2315 credentials=credentials,
2316 )
2317
2318 @classmethod
2319 def from_records(
2320 cls,
2321 data,
2322 index=None,
2323 exclude=None,
2324 columns=None,
2325 coerce_float: bool = False,
2326 nrows: int | None = None,
2327 ) -> DataFrame:
2328 """
2329 Convert structured or record ndarray to DataFrame.
2330
2331 Creates a DataFrame object from a structured ndarray, sequence of
2332 tuples or dicts, or DataFrame.
2333
2334 Parameters
2335 ----------
2336 data : structured ndarray, sequence of tuples or dicts, or DataFrame
2337 Structured input data.
2338
2339 .. deprecated:: 2.1.0
2340 Passing a DataFrame is deprecated.
2341 index : str, list of fields, array-like
2342 Field of array to use as the index, alternately a specific set of
2343 input labels to use.
2344 exclude : sequence, default None
2345 Columns or fields to exclude.
2346 columns : sequence, default None
2347 Column names to use. If the passed data do not have names
2348 associated with them, this argument provides names for the
2349 columns. Otherwise this argument indicates the order of the columns
2350 in the result (any names not found in the data will become all-NA
2351 columns).
2352 coerce_float : bool, default False
2353 Attempt to convert values of non-string, non-numeric objects (like
2354 decimal.Decimal) to floating point, useful for SQL result sets.
2355 nrows : int, default None
2356 Number of rows to read if data is an iterator.
2357
2358 Returns
2359 -------
2360 DataFrame
2361
2362 See Also
2363 --------
2364 DataFrame.from_dict : DataFrame from dict of array-like or dicts.
2365 DataFrame : DataFrame object creation using constructor.
2366
2367 Examples
2368 --------
2369 Data can be provided as a structured ndarray:
2370
2371 >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')],
2372 ... dtype=[('col_1', 'i4'), ('col_2', 'U1')])
2373 >>> pd.DataFrame.from_records(data)
2374 col_1 col_2
2375 0 3 a
2376 1 2 b
2377 2 1 c
2378 3 0 d
2379
2380 Data can be provided as a list of dicts:
2381
2382 >>> data = [{'col_1': 3, 'col_2': 'a'},
2383 ... {'col_1': 2, 'col_2': 'b'},
2384 ... {'col_1': 1, 'col_2': 'c'},
2385 ... {'col_1': 0, 'col_2': 'd'}]
2386 >>> pd.DataFrame.from_records(data)
2387 col_1 col_2
2388 0 3 a
2389 1 2 b
2390 2 1 c
2391 3 0 d
2392
2393 Data can be provided as a list of tuples with corresponding columns:
2394
2395 >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')]
2396 >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2'])
2397 col_1 col_2
2398 0 3 a
2399 1 2 b
2400 2 1 c
2401 3 0 d
2402 """
2403 if isinstance(data, DataFrame):
2404 warnings.warn(
2405 "Passing a DataFrame to DataFrame.from_records is deprecated. Use "
2406 "set_index and/or drop to modify the DataFrame instead.",
2407 FutureWarning,
2408 stacklevel=find_stack_level(),
2409 )
2410 if columns is not None:
2411 if is_scalar(columns):
2412 columns = [columns]
2413 data = data[columns]
2414 if index is not None:
2415 data = data.set_index(index)
2416 if exclude is not None:
2417 data = data.drop(columns=exclude)
2418 return data.copy(deep=False)
2419
2420 result_index = None
2421
2422 # Make a copy of the input columns so we can modify it
2423 if columns is not None:
2424 columns = ensure_index(columns)
2425
2426 def maybe_reorder(
2427 arrays: list[ArrayLike], arr_columns: Index, columns: Index, index
2428 ) -> tuple[list[ArrayLike], Index, Index | None]:
2429 """
2430 If our desired 'columns' do not match the data's pre-existing 'arr_columns',
2431 we re-order our arrays. This is like a pre-emptive (cheap) reindex.
2432 """
2433 if len(arrays):
2434 length = len(arrays[0])
2435 else:
2436 length = 0
2437
2438 result_index = None
2439 if len(arrays) == 0 and index is None and length == 0:
2440 result_index = default_index(0)
2441
2442 arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length)
2443 return arrays, arr_columns, result_index
2444
2445 if is_iterator(data):
2446 if nrows == 0:
2447 return cls()
2448
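            # Materialize the first record to sniff a structured dtype, then
            # pull the remaining records (all of them, or at most
            # ``nrows - 1`` more when ``nrows`` is given) from the iterator.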
2449 try:
2450 first_row = next(data)
2451 except StopIteration:
2452 return cls(index=index, columns=columns)
2453
2454 dtype = None
2455 if hasattr(first_row, "dtype") and first_row.dtype.names:
2456 dtype = first_row.dtype
2457
2458 values = [first_row]
2459
2460 if nrows is None:
2461 values += data
2462 else:
2463 values.extend(itertools.islice(data, nrows - 1))
2464
2465 if dtype is not None:
2466 data = np.array(values, dtype=dtype)
2467 else:
2468 data = values
2469
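        # Normalize the accepted input types (dict of arrays, structured
        # ndarray, or sequence of rows) into a list of column arrays plus a
        # matching Index of column labels.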
2470 if isinstance(data, dict):
2471 if columns is None:
2472 columns = arr_columns = ensure_index(sorted(data))
2473 arrays = [data[k] for k in columns]
2474 else:
2475 arrays = []
2476 arr_columns_list = []
2477 for k, v in data.items():
2478 if k in columns:
2479 arr_columns_list.append(k)
2480 arrays.append(v)
2481
2482 arr_columns = Index(arr_columns_list)
2483 arrays, arr_columns, result_index = maybe_reorder(
2484 arrays, arr_columns, columns, index
2485 )
2486
2487 elif isinstance(data, np.ndarray):
2488 arrays, columns = to_arrays(data, columns)
2489 arr_columns = columns
2490 else:
2491 arrays, arr_columns = to_arrays(data, columns)
2492 if coerce_float:
2493 for i, arr in enumerate(arrays):
2494 if arr.dtype == object:
2495 # error: Argument 1 to "maybe_convert_objects" has
2496 # incompatible type "Union[ExtensionArray, ndarray]";
2497 # expected "ndarray"
2498 arrays[i] = lib.maybe_convert_objects(
2499 arr, # type: ignore[arg-type]
2500 try_float=True,
2501 )
2502
2503 arr_columns = ensure_index(arr_columns)
2504 if columns is None:
2505 columns = arr_columns
2506 else:
2507 arrays, arr_columns, result_index = maybe_reorder(
2508 arrays, arr_columns, columns, index
2509 )
2510
2511 if exclude is None:
2512 exclude = set()
2513 else:
2514 exclude = set(exclude)
2515
2516 if index is not None:
2517 if isinstance(index, str) or not hasattr(index, "__iter__"):
2518 i = columns.get_loc(index)
2519 exclude.add(index)
2520 if len(arrays) > 0:
2521 result_index = Index(arrays[i], name=index)
2522 else:
2523 result_index = Index([], name=index)
2524 else:
2525 try:
2526 index_data = [arrays[arr_columns.get_loc(field)] for field in index]
2527 except (KeyError, TypeError):
2528 # raised by get_loc, see GH#29258
2529 result_index = index
2530 else:
2531 result_index = ensure_index_from_sequences(index_data, names=index)
2532 exclude.update(index)
2533
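        # Drop excluded labels (including any fields consumed as the index)
        # from both the collected arrays and the resulting columns.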
2534 if any(exclude):
2535 arr_exclude = [x for x in exclude if x in arr_columns]
2536 to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
2537 arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
2538
2539 columns = columns.drop(exclude)
2540
2541 manager = _get_option("mode.data_manager", silent=True)
2542 mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager)
2543
2544 return cls._from_mgr(mgr, axes=mgr.axes)
2545
2546 def to_records(
2547 self, index: bool = True, column_dtypes=None, index_dtypes=None
2548 ) -> np.rec.recarray:
2549 """
2550 Convert DataFrame to a NumPy record array.
2551
2552 Index will be included as the first field of the record array if
2553 requested.
2554
2555 Parameters
2556 ----------
2557 index : bool, default True
2558 Include index in resulting record array, stored in 'index'
2559 field or using the index label, if set.
2560 column_dtypes : str, type, dict, default None
2561 If a string or type, the data type to store all columns. If
2562 a dictionary, a mapping of column names and indices (zero-indexed)
2563 to specific data types.
2564 index_dtypes : str, type, dict, default None
2565 If a string or type, the data type to store all index levels. If
2566 a dictionary, a mapping of index level names and indices
2567 (zero-indexed) to specific data types.
2568
2569 This mapping is applied only if `index=True`.
2570
2571 Returns
2572 -------
2573 numpy.rec.recarray
2574 NumPy ndarray with the DataFrame labels as fields and each row
2575 of the DataFrame as entries.
2576
2577 See Also
2578 --------
2579 DataFrame.from_records: Convert structured or record ndarray
2580 to DataFrame.
2581 numpy.rec.recarray: An ndarray that allows field access using
2582 attributes, analogous to typed columns in a
2583 spreadsheet.
2584
2585 Examples
2586 --------
2587 >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
2588 ... index=['a', 'b'])
2589 >>> df
2590 A B
2591 a 1 0.50
2592 b 2 0.75
2593 >>> df.to_records()
2594 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
2595 dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
2596
2597 If the DataFrame index has no label then the recarray field name
2598 is set to 'index'. If the index has a label then this is used as the
2599 field name:
2600
2601 >>> df.index = df.index.rename("I")
2602 >>> df.to_records()
2603 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
2604 dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
2605
2606 The index can be excluded from the record array:
2607
2608 >>> df.to_records(index=False)
2609 rec.array([(1, 0.5 ), (2, 0.75)],
2610 dtype=[('A', '<i8'), ('B', '<f8')])
2611
2612 Data types can be specified for the columns:
2613
2614 >>> df.to_records(column_dtypes={"A": "int32"})
2615 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
2616 dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
2617
2618 As well as for the index:
2619
2620 >>> df.to_records(index_dtypes="<S2")
2621 rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
2622 dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
2623
2624 >>> index_dtypes = f"<S{df.index.str.len().max()}"
2625 >>> df.to_records(index_dtypes=index_dtypes)
2626 rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
2627 dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
2628 """
2629 if index:
2630 ix_vals = [
2631 np.asarray(self.index.get_level_values(i))
2632 for i in range(self.index.nlevels)
2633 ]
2634
2635 arrays = ix_vals + [
2636 np.asarray(self.iloc[:, i]) for i in range(len(self.columns))
2637 ]
2638
2639 index_names = list(self.index.names)
2640
2641 if isinstance(self.index, MultiIndex):
2642 index_names = com.fill_missing_names(index_names)
2643 elif index_names[0] is None:
2644 index_names = ["index"]
2645
2646 names = [str(name) for name in itertools.chain(index_names, self.columns)]
2647 else:
2648 arrays = [np.asarray(self.iloc[:, i]) for i in range(len(self.columns))]
2649 names = [str(c) for c in self.columns]
2650 index_names = []
2651
2652 index_len = len(index_names)
2653 formats = []
2654
2655 for i, v in enumerate(arrays):
2656 index_int = i
2657
2658 # When the names and arrays are collected, we
2659 # first collect those in the DataFrame's index,
2660 # followed by those in its columns.
2661 #
2662 # Thus, the total length of the array is:
2663 # len(index_names) + len(DataFrame.columns).
2664 #
2665 # This check allows us to see whether we are
2666 # handling a name / array in the index or column.
2667 if index_int < index_len:
2668 dtype_mapping = index_dtypes
2669 name = index_names[index_int]
2670 else:
2671 index_int -= index_len
2672 dtype_mapping = column_dtypes
2673 name = self.columns[index_int]
2674
2675 # We have a dictionary, so we get the data type
2676 # associated with the index or column (which can
2677 # be denoted by its name in the DataFrame or its
            # position in the DataFrame's array of indices or
            # columns, whichever is applicable).
2680 if is_dict_like(dtype_mapping):
2681 if name in dtype_mapping:
2682 dtype_mapping = dtype_mapping[name]
2683 elif index_int in dtype_mapping:
2684 dtype_mapping = dtype_mapping[index_int]
2685 else:
2686 dtype_mapping = None
2687
2688 # If no mapping can be found, use the array's
2689 # dtype attribute for formatting.
2690 #
2691 # A valid dtype must either be a type or
2692 # string naming a type.
2693 if dtype_mapping is None:
2694 formats.append(v.dtype)
2695 elif isinstance(dtype_mapping, (type, np.dtype, str)):
2696 # error: Argument 1 to "append" of "list" has incompatible
2697 # type "Union[type, dtype[Any], str]"; expected "dtype[Any]"
2698 formats.append(dtype_mapping) # type: ignore[arg-type]
2699 else:
2700 element = "row" if i < index_len else "column"
2701 msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}"
2702 raise ValueError(msg)
2703
2704 return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats})
2705
2706 @classmethod
2707 def _from_arrays(
2708 cls,
2709 arrays,
2710 columns,
2711 index,
2712 dtype: Dtype | None = None,
2713 verify_integrity: bool = True,
2714 ) -> Self:
2715 """
2716 Create DataFrame from a list of arrays corresponding to the columns.
2717
2718 Parameters
2719 ----------
2720 arrays : list-like of arrays
2721 Each array in the list corresponds to one column, in order.
2722 columns : list-like, Index
2723 The column names for the resulting DataFrame.
2724 index : list-like, Index
2725 The rows labels for the resulting DataFrame.
2726 dtype : dtype, optional
2727 Optional dtype to enforce for all arrays.
2728 verify_integrity : bool, default True
2729 Validate and homogenize all input. If set to False, it is assumed
            that all elements of `arrays` are actual arrays, as they will be
            stored in a block (numpy ndarray or ExtensionArray), that they
            have the same length as and are aligned with the index, and that
            `columns` and `index` are already ``Index`` objects.
2734
2735 Returns
2736 -------
2737 DataFrame
2738 """
2739 if dtype is not None:
2740 dtype = pandas_dtype(dtype)
2741
2742 manager = _get_option("mode.data_manager", silent=True)
2743 columns = ensure_index(columns)
2744 if len(columns) != len(arrays):
2745 raise ValueError("len(columns) must match len(arrays)")
2746 mgr = arrays_to_mgr(
2747 arrays,
2748 columns,
2749 index,
2750 dtype=dtype,
2751 verify_integrity=verify_integrity,
2752 typ=manager,
2753 )
2754 return cls._from_mgr(mgr, axes=mgr.axes)
2755
2756 @doc(
2757 storage_options=_shared_docs["storage_options"],
2758 compression_options=_shared_docs["compression_options"] % "path",
2759 )
2760 def to_stata(
2761 self,
2762 path: FilePath | WriteBuffer[bytes],
2763 *,
2764 convert_dates: dict[Hashable, str] | None = None,
2765 write_index: bool = True,
2766 byteorder: ToStataByteorder | None = None,
2767 time_stamp: datetime.datetime | None = None,
2768 data_label: str | None = None,
2769 variable_labels: dict[Hashable, str] | None = None,
2770 version: int | None = 114,
2771 convert_strl: Sequence[Hashable] | None = None,
2772 compression: CompressionOptions = "infer",
2773 storage_options: StorageOptions | None = None,
2774 value_labels: dict[Hashable, dict[float, str]] | None = None,
2775 ) -> None:
2776 """
2777 Export DataFrame object to Stata dta format.
2778
2779 Writes the DataFrame to a Stata dataset file.
2780 "dta" files contain a Stata dataset.
2781
2782 Parameters
2783 ----------
2784 path : str, path object, or buffer
2785 String, path object (implementing ``os.PathLike[str]``), or file-like
2786 object implementing a binary ``write()`` function.
2787
2788 convert_dates : dict
2789 Dictionary mapping columns containing datetime types to stata
2790 internal format to use when writing the dates. Options are 'tc',
2791 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
2792 or a name. Datetime columns that do not have a conversion type
2793 specified will be converted to 'tc'. Raises NotImplementedError if
2794 a datetime column has timezone information.
2795 write_index : bool
2796 Write the index to Stata dataset.
2797 byteorder : str
            Can be ">", "<", "little", or "big". Default is ``sys.byteorder``.
2799 time_stamp : datetime
2800 A datetime to use as file creation date. Default is the current
2801 time.
2802 data_label : str, optional
2803 A label for the data set. Must be 80 characters or smaller.
2804 variable_labels : dict
2805 Dictionary containing columns as keys and variable labels as
2806 values. Each label must be 80 characters or smaller.
2807 version : {{114, 117, 118, 119, None}}, default 114
2808 Version to use in the output dta file. Set to None to let pandas
2809 decide between 118 or 119 formats depending on the number of
2810 columns in the frame. Version 114 can be read by Stata 10 and
2811 later. Version 117 can be read by Stata 13 or later. Version 118
2812 is supported in Stata 14 and later. Version 119 is supported in
2813 Stata 15 and later. Version 114 limits string variables to 244
2814 characters or fewer while versions 117 and later allow strings
2815 with lengths up to 2,000,000 characters. Versions 118 and 119
2816 support Unicode characters, and version 119 supports more than
2817 32,767 variables.
2818
2819 Version 119 should usually only be used when the number of
2820 variables exceeds the capacity of dta format 118. Exporting
2821 smaller datasets in format 119 may have unintended consequences,
2822 and, as of November 2020, Stata SE cannot read version 119 files.
2823
2824 convert_strl : list, optional
2825 List of column names to convert to string columns to Stata StrL
2826 format. Only available if version is 117. Storing strings in the
2827 StrL format can produce smaller dta files if strings have more than
2828 8 characters and values are repeated.
2829 {compression_options}
2830
2831 .. versionchanged:: 1.4.0 Zstandard support.
2832
2833 {storage_options}
2834
2835 value_labels : dict of dicts
2836 Dictionary containing columns as keys and dictionaries of column value
2837 to labels as values. Labels for a single variable must be 32,000
2838 characters or smaller.
2839
2840 .. versionadded:: 1.4.0
2841
2842 Raises
2843 ------
2844 NotImplementedError
2845 * If datetimes contain timezone information
2846 * Column dtype is not representable in Stata
2847 ValueError
            * Columns listed in convert_dates are neither datetime64[ns]
              nor datetime.datetime
2850 * Column listed in convert_dates is not in DataFrame
2851 * Categorical label contains more than 32,000 characters
2852
2853 See Also
2854 --------
2855 read_stata : Import Stata data files.
2856 io.stata.StataWriter : Low-level writer for Stata data files.
2857 io.stata.StataWriter117 : Low-level writer for version 117 files.
2858
2859 Examples
2860 --------
2861 >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon',
2862 ... 'parrot'],
2863 ... 'speed': [350, 18, 361, 15]}})
2864 >>> df.to_stata('animals.dta') # doctest: +SKIP
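
        Variable labels and a newer dta version can be requested as well
        (a sketch; the label text is made up):

        >>> df.to_stata('animals.dta', version=118,
        ...             variable_labels={{'speed': 'Top speed'}})  # doctest: +SKIP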
2865 """
2866 if version not in (114, 117, 118, 119, None):
2867 raise ValueError("Only formats 114, 117, 118 and 119 are supported.")
2868 if version == 114:
2869 if convert_strl is not None:
2870 raise ValueError("strl is not supported in format 114")
2871 from pandas.io.stata import StataWriter as statawriter
2872 elif version == 117:
2873 # Incompatible import of "statawriter" (imported name has type
2874 # "Type[StataWriter117]", local name has type "Type[StataWriter]")
2875 from pandas.io.stata import ( # type: ignore[assignment]
2876 StataWriter117 as statawriter,
2877 )
2878 else: # versions 118 and 119
2879 # Incompatible import of "statawriter" (imported name has type
2880 # "Type[StataWriter117]", local name has type "Type[StataWriter]")
2881 from pandas.io.stata import ( # type: ignore[assignment]
2882 StataWriterUTF8 as statawriter,
2883 )
2884
2885 kwargs: dict[str, Any] = {}
2886 if version is None or version >= 117:
2887 # strl conversion is only supported >= 117
2888 kwargs["convert_strl"] = convert_strl
2889 if version is None or version >= 118:
2890 # Specifying the version is only supported for UTF8 (118 or 119)
2891 kwargs["version"] = version
2892
2893 writer = statawriter(
2894 path,
2895 self,
2896 convert_dates=convert_dates,
2897 byteorder=byteorder,
2898 time_stamp=time_stamp,
2899 data_label=data_label,
2900 write_index=write_index,
2901 variable_labels=variable_labels,
2902 compression=compression,
2903 storage_options=storage_options,
2904 value_labels=value_labels,
2905 **kwargs,
2906 )
2907 writer.write_file()
2908
2909 def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None:
2910 """
2911 Write a DataFrame to the binary Feather format.
2912
2913 Parameters
2914 ----------
2915 path : str, path object, file-like object
2916 String, path object (implementing ``os.PathLike[str]``), or file-like
2917 object implementing a binary ``write()`` function. If a string or a path,
2918 it will be used as Root Directory path when writing a partitioned dataset.
2919 **kwargs :
2920 Additional keywords passed to :func:`pyarrow.feather.write_feather`.
2921 This includes the `compression`, `compression_level`, `chunksize`
2922 and `version` keywords.
2923
2924 Notes
2925 -----
2926 This function writes the dataframe as a `feather file
2927 <https://arrow.apache.org/docs/python/feather.html>`_. Requires a default
        index. For saving the DataFrame with your custom index, use a method
        that supports custom indices, e.g. ``to_parquet``.
2930
2931 Examples
2932 --------
2933 >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
2934 >>> df.to_feather("file.feather") # doctest: +SKIP
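
        Keyword arguments are forwarded to :func:`pyarrow.feather.write_feather`,
        so a compression codec can be chosen, for example (sketch):

        >>> df.to_feather("file.feather", compression="zstd")  # doctest: +SKIP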
2935 """
2936 from pandas.io.feather_format import to_feather
2937
2938 to_feather(self, path, **kwargs)
2939
2940 @deprecate_nonkeyword_arguments(
2941 version="3.0", allowed_args=["self", "buf"], name="to_markdown"
2942 )
2943 @doc(
2944 Series.to_markdown,
2945 klass=_shared_doc_kwargs["klass"],
2946 storage_options=_shared_docs["storage_options"],
2947 examples="""Examples
2948 --------
2949 >>> df = pd.DataFrame(
2950 ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}
2951 ... )
2952 >>> print(df.to_markdown())
2953 | | animal_1 | animal_2 |
2954 |---:|:-----------|:-----------|
2955 | 0 | elk | dog |
2956 | 1 | pig | quetzal |
2957
2958 Output markdown with a tabulate option.
2959
2960 >>> print(df.to_markdown(tablefmt="grid"))
2961 +----+------------+------------+
2962 | | animal_1 | animal_2 |
2963 +====+============+============+
2964 | 0 | elk | dog |
2965 +----+------------+------------+
2966 | 1 | pig | quetzal |
2967 +----+------------+------------+""",
2968 )
2969 def to_markdown(
2970 self,
2971 buf: FilePath | WriteBuffer[str] | None = None,
2972 mode: str = "wt",
2973 index: bool = True,
2974 storage_options: StorageOptions | None = None,
2975 **kwargs,
2976 ) -> str | None:
2977 if "showindex" in kwargs:
            raise ValueError("Pass 'index' instead of 'showindex'")
2979
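        # Translate to tabulate's keyword names; explicit user-supplied kwargs
        # take precedence over these defaults.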
2980 kwargs.setdefault("headers", "keys")
2981 kwargs.setdefault("tablefmt", "pipe")
2982 kwargs.setdefault("showindex", index)
2983 tabulate = import_optional_dependency("tabulate")
2984 result = tabulate.tabulate(self, **kwargs)
2985 if buf is None:
2986 return result
2987
2988 with get_handle(buf, mode, storage_options=storage_options) as handles:
2989 handles.handle.write(result)
2990 return None
2991
2992 @overload
2993 def to_parquet(
2994 self,
2995 path: None = ...,
2996 engine: Literal["auto", "pyarrow", "fastparquet"] = ...,
2997 compression: str | None = ...,
2998 index: bool | None = ...,
2999 partition_cols: list[str] | None = ...,
3000 storage_options: StorageOptions = ...,
3001 **kwargs,
3002 ) -> bytes:
3003 ...
3004
3005 @overload
3006 def to_parquet(
3007 self,
3008 path: FilePath | WriteBuffer[bytes],
3009 engine: Literal["auto", "pyarrow", "fastparquet"] = ...,
3010 compression: str | None = ...,
3011 index: bool | None = ...,
3012 partition_cols: list[str] | None = ...,
3013 storage_options: StorageOptions = ...,
3014 **kwargs,
3015 ) -> None:
3016 ...
3017
3018 @deprecate_nonkeyword_arguments(
3019 version="3.0", allowed_args=["self", "path"], name="to_parquet"
3020 )
3021 @doc(storage_options=_shared_docs["storage_options"])
3022 def to_parquet(
3023 self,
3024 path: FilePath | WriteBuffer[bytes] | None = None,
3025 engine: Literal["auto", "pyarrow", "fastparquet"] = "auto",
3026 compression: str | None = "snappy",
3027 index: bool | None = None,
3028 partition_cols: list[str] | None = None,
3029 storage_options: StorageOptions | None = None,
3030 **kwargs,
3031 ) -> bytes | None:
3032 """
3033 Write a DataFrame to the binary parquet format.
3034
3035 This function writes the dataframe as a `parquet file
3036 <https://parquet.apache.org/>`_. You can choose different parquet
3037 backends, and have the option of compression. See
3038 :ref:`the user guide <io.parquet>` for more details.
3039
3040 Parameters
3041 ----------
3042 path : str, path object, file-like object, or None, default None
3043 String, path object (implementing ``os.PathLike[str]``), or file-like
3044 object implementing a binary ``write()`` function. If None, the result is
3045 returned as bytes. If a string or path, it will be used as Root Directory
3046 path when writing a partitioned dataset.
3047 engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
3048 Parquet library to use. If 'auto', then the option
3049 ``io.parquet.engine`` is used. The default ``io.parquet.engine``
3050 behavior is to try 'pyarrow', falling back to 'fastparquet' if
3051 'pyarrow' is unavailable.
3052 compression : str or None, default 'snappy'
3053 Name of the compression to use. Use ``None`` for no compression.
3054 Supported options: 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'.
3055 index : bool, default None
3056 If ``True``, include the dataframe's index(es) in the file output.
3057 If ``False``, they will not be written to the file.
            If ``None``, similar to ``True``, the dataframe's index(es)
3059 will be saved. However, instead of being saved as values,
3060 the RangeIndex will be stored as a range in the metadata so it
3061 doesn't require much space and is faster. Other indexes will
3062 be included as columns in the file output.
3063 partition_cols : list, optional, default None
3064 Column names by which to partition the dataset.
3065 Columns are partitioned in the order they are given.
3066 Must be None if path is not a string.
3067 {storage_options}
3068
3069 **kwargs
3070 Additional arguments passed to the parquet library. See
3071 :ref:`pandas io <io.parquet>` for more details.
3072
3073 Returns
3074 -------
3075 bytes if no path argument is provided else None
3076
3077 See Also
3078 --------
3079 read_parquet : Read a parquet file.
3080 DataFrame.to_orc : Write an orc file.
3081 DataFrame.to_csv : Write a csv file.
3082 DataFrame.to_sql : Write to a sql table.
3083 DataFrame.to_hdf : Write to hdf.
3084
3085 Notes
3086 -----
3087 This function requires either the `fastparquet
3088 <https://pypi.org/project/fastparquet>`_ or `pyarrow
3089 <https://arrow.apache.org/docs/python/>`_ library.
3090
3091 Examples
3092 --------
3093 >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}})
3094 >>> df.to_parquet('df.parquet.gzip',
3095 ... compression='gzip') # doctest: +SKIP
3096 >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP
3097 col1 col2
3098 0 1 3
3099 1 2 4
3100
3101 If you want to get a buffer to the parquet content you can use a io.BytesIO
3102 object, as long as you don't use partition_cols, which creates multiple files.
3103
3104 >>> import io
3105 >>> f = io.BytesIO()
3106 >>> df.to_parquet(f)
3107 >>> f.seek(0)
3108 0
3109 >>> content = f.read()
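
        When ``partition_cols`` is given, ``path`` is used as the root
        directory of a partitioned dataset, one file per partition value
        (a sketch):

        >>> df.to_parquet("dataset_root", partition_cols=["col1"])  # doctest: +SKIP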
3110 """
3111 from pandas.io.parquet import to_parquet
3112
3113 return to_parquet(
3114 self,
3115 path,
3116 engine,
3117 compression=compression,
3118 index=index,
3119 partition_cols=partition_cols,
3120 storage_options=storage_options,
3121 **kwargs,
3122 )
3123
3124 def to_orc(
3125 self,
3126 path: FilePath | WriteBuffer[bytes] | None = None,
3127 *,
3128 engine: Literal["pyarrow"] = "pyarrow",
3129 index: bool | None = None,
3130 engine_kwargs: dict[str, Any] | None = None,
3131 ) -> bytes | None:
3132 """
3133 Write a DataFrame to the ORC format.
3134
3135 .. versionadded:: 1.5.0
3136
3137 Parameters
3138 ----------
3139 path : str, file-like object or None, default None
3140 If a string, it will be used as Root Directory path
3141 when writing a partitioned dataset. By file-like object,
3142 we refer to objects with a write() method, such as a file handle
3143 (e.g. via builtin open function). If path is None,
3144 a bytes object is returned.
3145 engine : {'pyarrow'}, default 'pyarrow'
3146 ORC library to use.
3147 index : bool, optional
3148 If ``True``, include the dataframe's index(es) in the file output.
3149 If ``False``, they will not be written to the file.
            If ``None``, similar to ``infer``, the dataframe's index(es)
3151 will be saved. However, instead of being saved as values,
3152 the RangeIndex will be stored as a range in the metadata so it
3153 doesn't require much space and is faster. Other indexes will
3154 be included as columns in the file output.
3155 engine_kwargs : dict[str, Any] or None, default None
3156 Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
3157
3158 Returns
3159 -------
3160 bytes if no path argument is provided else None
3161
3162 Raises
3163 ------
3164 NotImplementedError
3165 Dtype of one or more columns is category, unsigned integers, interval,
3166 period or sparse.
3167 ValueError
3168 engine is not pyarrow.
3169
3170 See Also
3171 --------
        read_orc : Read an ORC file.
3173 DataFrame.to_parquet : Write a parquet file.
3174 DataFrame.to_csv : Write a csv file.
3175 DataFrame.to_sql : Write to a sql table.
3176 DataFrame.to_hdf : Write to hdf.
3177
3178 Notes
3179 -----
3180 * Before using this function you should read the :ref:`user guide about
3181 ORC <io.orc>` and :ref:`install optional dependencies <install.warn_orc>`.
        * This function requires the `pyarrow <https://arrow.apache.org/docs/python/>`_
          library.
3184 * For supported dtypes please refer to `supported ORC features in Arrow
3185 <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
3186 * Currently timezones in datetime columns are not preserved when a
3187 dataframe is converted into ORC files.
3188
3189 Examples
3190 --------
3191 >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]})
3192 >>> df.to_orc('df.orc') # doctest: +SKIP
3193 >>> pd.read_orc('df.orc') # doctest: +SKIP
3194 col1 col2
3195 0 1 4
3196 1 2 3
3197
3198 If you want to get a buffer to the orc content you can write it to io.BytesIO
3199
3200 >>> import io
3201 >>> b = io.BytesIO(df.to_orc()) # doctest: +SKIP
3202 >>> b.seek(0) # doctest: +SKIP
3203 0
3204 >>> content = b.read() # doctest: +SKIP
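
        Extra keywords are passed through ``engine_kwargs``; for example a
        compression codec (a sketch, assuming the installed pyarrow version
        accepts a ``compression`` keyword in :func:`pyarrow.orc.write_table`):

        >>> df.to_orc('df.orc', engine_kwargs={'compression': 'zstd'})  # doctest: +SKIP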
3205 """
3206 from pandas.io.orc import to_orc
3207
3208 return to_orc(
3209 self, path, engine=engine, index=index, engine_kwargs=engine_kwargs
3210 )
3211
3212 @overload
3213 def to_html(
3214 self,
3215 buf: FilePath | WriteBuffer[str],
3216 columns: Axes | None = ...,
3217 col_space: ColspaceArgType | None = ...,
3218 header: bool = ...,
3219 index: bool = ...,
3220 na_rep: str = ...,
3221 formatters: FormattersType | None = ...,
3222 float_format: FloatFormatType | None = ...,
3223 sparsify: bool | None = ...,
3224 index_names: bool = ...,
3225 justify: str | None = ...,
3226 max_rows: int | None = ...,
3227 max_cols: int | None = ...,
3228 show_dimensions: bool | str = ...,
3229 decimal: str = ...,
3230 bold_rows: bool = ...,
3231 classes: str | list | tuple | None = ...,
3232 escape: bool = ...,
3233 notebook: bool = ...,
3234 border: int | bool | None = ...,
3235 table_id: str | None = ...,
3236 render_links: bool = ...,
3237 encoding: str | None = ...,
3238 ) -> None:
3239 ...
3240
3241 @overload
3242 def to_html(
3243 self,
3244 buf: None = ...,
3245 columns: Axes | None = ...,
3246 col_space: ColspaceArgType | None = ...,
3247 header: bool = ...,
3248 index: bool = ...,
3249 na_rep: str = ...,
3250 formatters: FormattersType | None = ...,
3251 float_format: FloatFormatType | None = ...,
3252 sparsify: bool | None = ...,
3253 index_names: bool = ...,
3254 justify: str | None = ...,
3255 max_rows: int | None = ...,
3256 max_cols: int | None = ...,
3257 show_dimensions: bool | str = ...,
3258 decimal: str = ...,
3259 bold_rows: bool = ...,
3260 classes: str | list | tuple | None = ...,
3261 escape: bool = ...,
3262 notebook: bool = ...,
3263 border: int | bool | None = ...,
3264 table_id: str | None = ...,
3265 render_links: bool = ...,
3266 encoding: str | None = ...,
3267 ) -> str:
3268 ...
3269
3270 @deprecate_nonkeyword_arguments(
3271 version="3.0", allowed_args=["self", "buf"], name="to_html"
3272 )
3273 @Substitution(
3274 header_type="bool",
3275 header="Whether to print column labels, default True",
3276 col_space_type="str or int, list or dict of int or str",
3277 col_space="The minimum width of each column in CSS length "
3278 "units. An int is assumed to be px units.",
3279 )
3280 @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
3281 def to_html(
3282 self,
3283 buf: FilePath | WriteBuffer[str] | None = None,
3284 columns: Axes | None = None,
3285 col_space: ColspaceArgType | None = None,
3286 header: bool = True,
3287 index: bool = True,
3288 na_rep: str = "NaN",
3289 formatters: FormattersType | None = None,
3290 float_format: FloatFormatType | None = None,
3291 sparsify: bool | None = None,
3292 index_names: bool = True,
3293 justify: str | None = None,
3294 max_rows: int | None = None,
3295 max_cols: int | None = None,
3296 show_dimensions: bool | str = False,
3297 decimal: str = ".",
3298 bold_rows: bool = True,
3299 classes: str | list | tuple | None = None,
3300 escape: bool = True,
3301 notebook: bool = False,
3302 border: int | bool | None = None,
3303 table_id: str | None = None,
3304 render_links: bool = False,
3305 encoding: str | None = None,
3306 ) -> str | None:
3307 """
3308 Render a DataFrame as an HTML table.
3309 %(shared_params)s
3310 bold_rows : bool, default True
3311 Make the row labels bold in the output.
3312 classes : str or list or tuple, default None
3313 CSS class(es) to apply to the resulting html table.
3314 escape : bool, default True
3315 Convert the characters <, >, and & to HTML-safe sequences.
3316 notebook : {True, False}, default False
3317 Whether the generated HTML is for IPython Notebook.
3318 border : int
3319 A ``border=border`` attribute is included in the opening
3320 `<table>` tag. Default ``pd.options.display.html.border``.
3321 table_id : str, optional
3322 A css id is included in the opening `<table>` tag if specified.
3323 render_links : bool, default False
3324 Convert URLs to HTML links.
3325 encoding : str, default "utf-8"
3326 Set character encoding.
3327 %(returns)s
3328 See Also
3329 --------
3330 to_string : Convert DataFrame to a string.
3331
3332 Examples
3333 --------
3334 >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]})
3335 >>> html_string = '''<table border="1" class="dataframe">
3336 ... <thead>
3337 ... <tr style="text-align: right;">
3338 ... <th></th>
3339 ... <th>col1</th>
3340 ... <th>col2</th>
3341 ... </tr>
3342 ... </thead>
3343 ... <tbody>
3344 ... <tr>
3345 ... <th>0</th>
3346 ... <td>1</td>
3347 ... <td>4</td>
3348 ... </tr>
3349 ... <tr>
3350 ... <th>1</th>
3351 ... <td>2</td>
3352 ... <td>3</td>
3353 ... </tr>
3354 ... </tbody>
3355 ... </table>'''
3356 >>> assert html_string == df.to_html()
3357 """
3358 if justify is not None and justify not in fmt.VALID_JUSTIFY_PARAMETERS:
3359 raise ValueError("Invalid value for justify parameter")
3360
3361 formatter = fmt.DataFrameFormatter(
3362 self,
3363 columns=columns,
3364 col_space=col_space,
3365 na_rep=na_rep,
3366 header=header,
3367 index=index,
3368 formatters=formatters,
3369 float_format=float_format,
3370 bold_rows=bold_rows,
3371 sparsify=sparsify,
3372 justify=justify,
3373 index_names=index_names,
3374 escape=escape,
3375 decimal=decimal,
3376 max_rows=max_rows,
3377 max_cols=max_cols,
3378 show_dimensions=show_dimensions,
3379 )
        # TODO: a generic formatter would be in DataFrameFormatter
3381 return fmt.DataFrameRenderer(formatter).to_html(
3382 buf=buf,
3383 classes=classes,
3384 notebook=notebook,
3385 border=border,
3386 encoding=encoding,
3387 table_id=table_id,
3388 render_links=render_links,
3389 )
3390
3391 @overload
3392 def to_xml(
3393 self,
3394 path_or_buffer: None = ...,
3395 *,
3396 index: bool = ...,
3397 root_name: str | None = ...,
3398 row_name: str | None = ...,
3399 na_rep: str | None = ...,
3400 attr_cols: list[str] | None = ...,
3401 elem_cols: list[str] | None = ...,
3402 namespaces: dict[str | None, str] | None = ...,
3403 prefix: str | None = ...,
3404 encoding: str = ...,
3405 xml_declaration: bool | None = ...,
3406 pretty_print: bool | None = ...,
3407 parser: XMLParsers | None = ...,
3408 stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ...,
3409 compression: CompressionOptions = ...,
3410 storage_options: StorageOptions | None = ...,
3411 ) -> str:
3412 ...
3413
3414 @overload
3415 def to_xml(
3416 self,
3417 path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str],
3418 *,
3419 index: bool = ...,
3420 root_name: str | None = ...,
3421 row_name: str | None = ...,
3422 na_rep: str | None = ...,
3423 attr_cols: list[str] | None = ...,
3424 elem_cols: list[str] | None = ...,
3425 namespaces: dict[str | None, str] | None = ...,
3426 prefix: str | None = ...,
3427 encoding: str = ...,
3428 xml_declaration: bool | None = ...,
3429 pretty_print: bool | None = ...,
3430 parser: XMLParsers | None = ...,
3431 stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ...,
3432 compression: CompressionOptions = ...,
3433 storage_options: StorageOptions | None = ...,
3434 ) -> None:
3435 ...
3436
3437 @deprecate_nonkeyword_arguments(
3438 version="3.0", allowed_args=["self", "path_or_buffer"], name="to_xml"
3439 )
3440 @doc(
3441 storage_options=_shared_docs["storage_options"],
3442 compression_options=_shared_docs["compression_options"] % "path_or_buffer",
3443 )
3444 def to_xml(
3445 self,
3446 path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
3447 index: bool = True,
3448 root_name: str | None = "data",
3449 row_name: str | None = "row",
3450 na_rep: str | None = None,
3451 attr_cols: list[str] | None = None,
3452 elem_cols: list[str] | None = None,
3453 namespaces: dict[str | None, str] | None = None,
3454 prefix: str | None = None,
3455 encoding: str = "utf-8",
3456 xml_declaration: bool | None = True,
3457 pretty_print: bool | None = True,
3458 parser: XMLParsers | None = "lxml",
3459 stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None,
3460 compression: CompressionOptions = "infer",
3461 storage_options: StorageOptions | None = None,
3462 ) -> str | None:
3463 """
3464 Render a DataFrame to an XML document.
3465
3466 .. versionadded:: 1.3.0
3467
3468 Parameters
3469 ----------
3470 path_or_buffer : str, path object, file-like object, or None, default None
3471 String, path object (implementing ``os.PathLike[str]``), or file-like
3472 object implementing a ``write()`` function. If None, the result is returned
3473 as a string.
3474 index : bool, default True
3475 Whether to include index in XML document.
3476 root_name : str, default 'data'
3477 The name of root element in XML document.
3478 row_name : str, default 'row'
3479 The name of row element in XML document.
3480 na_rep : str, optional
3481 Missing data representation.
3482 attr_cols : list-like, optional
3483 List of columns to write as attributes in row element.
3484 Hierarchical columns will be flattened with underscore
3485 delimiting the different levels.
3486 elem_cols : list-like, optional
3487 List of columns to write as children in row element. By default,
3488 all columns output as children of row element. Hierarchical
3489 columns will be flattened with underscore delimiting the
3490 different levels.
3491 namespaces : dict, optional
3492 All namespaces to be defined in root element. Keys of dict
3493 should be prefix names and values of dict corresponding URIs.
3494 Default namespaces should be given empty string key. For
3495 example, ::
3496
3497 namespaces = {{"": "https://example.com"}}
3498
3499 prefix : str, optional
3500 Namespace prefix to be used for every element and/or attribute
3501 in document. This should be one of the keys in ``namespaces``
3502 dict.
3503 encoding : str, default 'utf-8'
3504 Encoding of the resulting document.
3505 xml_declaration : bool, default True
3506 Whether to include the XML declaration at start of document.
3507 pretty_print : bool, default True
3508 Whether output should be pretty printed with indentation and
3509 line breaks.
3510 parser : {{'lxml','etree'}}, default 'lxml'
3511 Parser module to use for building of tree. Only 'lxml' and
3512 'etree' are supported. With 'lxml', the ability to use XSLT
3513 stylesheet is supported.
3514 stylesheet : str, path object or file-like object, optional
3515 A URL, file-like object, or a raw string containing an XSLT
            script used to transform the raw XML output. The script should use
            the layout of elements and attributes from the original output. This
            argument requires ``lxml`` to be installed. Only XSLT 1.0
            scripts, and not later versions, are currently supported.
3520 {compression_options}
3521
3522 .. versionchanged:: 1.4.0 Zstandard support.
3523
3524 {storage_options}
3525
3526 Returns
3527 -------
3528 None or str
            If ``path_or_buffer`` is None, returns the resulting XML format as
            a string. Otherwise returns None.
3531
3532 See Also
3533 --------
3534 to_json : Convert the pandas object to a JSON string.
        to_html : Convert a DataFrame to HTML.
3536
3537 Examples
3538 --------
3539 >>> df = pd.DataFrame({{'shape': ['square', 'circle', 'triangle'],
3540 ... 'degrees': [360, 360, 180],
3541 ... 'sides': [4, np.nan, 3]}})
3542
3543 >>> df.to_xml() # doctest: +SKIP
3544 <?xml version='1.0' encoding='utf-8'?>
3545 <data>
3546 <row>
3547 <index>0</index>
3548 <shape>square</shape>
3549 <degrees>360</degrees>
3550 <sides>4.0</sides>
3551 </row>
3552 <row>
3553 <index>1</index>
3554 <shape>circle</shape>
3555 <degrees>360</degrees>
3556 <sides/>
3557 </row>
3558 <row>
3559 <index>2</index>
3560 <shape>triangle</shape>
3561 <degrees>180</degrees>
3562 <sides>3.0</sides>
3563 </row>
3564 </data>
3565
3566 >>> df.to_xml(attr_cols=[
3567 ... 'index', 'shape', 'degrees', 'sides'
3568 ... ]) # doctest: +SKIP
3569 <?xml version='1.0' encoding='utf-8'?>
3570 <data>
3571 <row index="0" shape="square" degrees="360" sides="4.0"/>
3572 <row index="1" shape="circle" degrees="360"/>
3573 <row index="2" shape="triangle" degrees="180" sides="3.0"/>
3574 </data>
3575
3576 >>> df.to_xml(namespaces={{"doc": "https://example.com"}},
3577 ... prefix="doc") # doctest: +SKIP
3578 <?xml version='1.0' encoding='utf-8'?>
3579 <doc:data xmlns:doc="https://example.com">
3580 <doc:row>
3581 <doc:index>0</doc:index>
3582 <doc:shape>square</doc:shape>
3583 <doc:degrees>360</doc:degrees>
3584 <doc:sides>4.0</doc:sides>
3585 </doc:row>
3586 <doc:row>
3587 <doc:index>1</doc:index>
3588 <doc:shape>circle</doc:shape>
3589 <doc:degrees>360</doc:degrees>
3590 <doc:sides/>
3591 </doc:row>
3592 <doc:row>
3593 <doc:index>2</doc:index>
3594 <doc:shape>triangle</doc:shape>
3595 <doc:degrees>180</doc:degrees>
3596 <doc:sides>3.0</doc:sides>
3597 </doc:row>
3598 </doc:data>
3599 """
3600
3601 from pandas.io.formats.xml import (
3602 EtreeXMLFormatter,
3603 LxmlXMLFormatter,
3604 )
3605
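# errors="ignore" makes import_optional_dependency return None when lxml is
# not installed, so we can raise a clearer error message below.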
3606 lxml = import_optional_dependency("lxml.etree", errors="ignore")
3607
3608 TreeBuilder: type[EtreeXMLFormatter | LxmlXMLFormatter]
3609
3610 if parser == "lxml":
3611 if lxml is not None:
3612 TreeBuilder = LxmlXMLFormatter
3613 else:
3614 raise ImportError(
3615 "lxml not found, please install or use the etree parser."
3616 )
3617
3618 elif parser == "etree":
3619 TreeBuilder = EtreeXMLFormatter
3620
3621 else:
3622 raise ValueError("Values for parser can only be lxml or etree.")
3623
3624 xml_formatter = TreeBuilder(
3625 self,
3626 path_or_buffer=path_or_buffer,
3627 index=index,
3628 root_name=root_name,
3629 row_name=row_name,
3630 na_rep=na_rep,
3631 attr_cols=attr_cols,
3632 elem_cols=elem_cols,
3633 namespaces=namespaces,
3634 prefix=prefix,
3635 encoding=encoding,
3636 xml_declaration=xml_declaration,
3637 pretty_print=pretty_print,
3638 stylesheet=stylesheet,
3639 compression=compression,
3640 storage_options=storage_options,
3641 )
3642
3643 return xml_formatter.write_output()
3644
3645 # ----------------------------------------------------------------------
3646 @doc(INFO_DOCSTRING, **frame_sub_kwargs)
3647 def info(
3648 self,
3649 verbose: bool | None = None,
3650 buf: WriteBuffer[str] | None = None,
3651 max_cols: int | None = None,
3652 memory_usage: bool | str | None = None,
3653 show_counts: bool | None = None,
3654 ) -> None:
3655 info = DataFrameInfo(
3656 data=self,
3657 memory_usage=memory_usage,
3658 )
3659 info.render(
3660 buf=buf,
3661 max_cols=max_cols,
3662 verbose=verbose,
3663 show_counts=show_counts,
3664 )
3665
3666 def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
3667 """
3668 Return the memory usage of each column in bytes.
3669
3670 The memory usage can optionally include the contribution of
3671 the index and elements of `object` dtype.
3672
3673 This value is displayed in `DataFrame.info` by default. This can be
3674 suppressed by setting ``pandas.options.display.memory_usage`` to False.
3675
3676 Parameters
3677 ----------
3678 index : bool, default True
3679 Specifies whether to include the memory usage of the DataFrame's
3680 index in returned Series. If ``index=True``, the memory usage of
3681 the index is the first item in the output.
3682 deep : bool, default False
3683 If True, introspect the data deeply by interrogating
3684 `object` dtypes for system-level memory consumption, and include
3685 it in the returned values.
3686
3687 Returns
3688 -------
3689 Series
A Series whose index is the original column names and whose values
are the memory usage of each column in bytes.
3692
3693 See Also
3694 --------
3695 numpy.ndarray.nbytes : Total bytes consumed by the elements of an
3696 ndarray.
3697 Series.memory_usage : Bytes consumed by a Series.
3698 Categorical : Memory-efficient array for string values with
3699 many repeated values.
3700 DataFrame.info : Concise summary of a DataFrame.
3701
3702 Notes
3703 -----
3704 See the :ref:`Frequently Asked Questions <df-memory-usage>` for more
3705 details.
3706
3707 Examples
3708 --------
3709 >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
3710 >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t))
3711 ... for t in dtypes])
3712 >>> df = pd.DataFrame(data)
3713 >>> df.head()
3714 int64 float64 complex128 object bool
3715 0 1 1.0 1.0+0.0j 1 True
3716 1 1 1.0 1.0+0.0j 1 True
3717 2 1 1.0 1.0+0.0j 1 True
3718 3 1 1.0 1.0+0.0j 1 True
3719 4 1 1.0 1.0+0.0j 1 True
3720
3721 >>> df.memory_usage()
3722 Index 128
3723 int64 40000
3724 float64 40000
3725 complex128 80000
3726 object 40000
3727 bool 5000
3728 dtype: int64
3729
3730 >>> df.memory_usage(index=False)
3731 int64 40000
3732 float64 40000
3733 complex128 80000
3734 object 40000
3735 bool 5000
3736 dtype: int64
3737
3738 The memory footprint of `object` dtype columns is ignored by default:
3739
3740 >>> df.memory_usage(deep=True)
3741 Index 128
3742 int64 40000
3743 float64 40000
3744 complex128 80000
3745 object 180000
3746 bool 5000
3747 dtype: int64
3748
3749 Use a Categorical for efficient storage of an object-dtype column with
3750 many repeated values.
3751
3752 >>> df['object'].astype('category').memory_usage(deep=True)
3753 5244
3754 """
3755 result = self._constructor_sliced(
3756 [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
3757 index=self.columns,
3758 dtype=np.intp,
3759 )
3760 if index:
3761 index_memory_usage = self._constructor_sliced(
3762 self.index.memory_usage(deep=deep), index=["Index"]
3763 )
3764 result = index_memory_usage._append(result)
3765 return result
3766
3767 def transpose(self, *args, copy: bool = False) -> DataFrame:
3768 """
3769 Transpose index and columns.
3770
3771 Reflect the DataFrame over its main diagonal by writing rows as columns
3772 and vice-versa. The property :attr:`.T` is an accessor to the method
3773 :meth:`transpose`.
3774
3775 Parameters
3776 ----------
3777 *args : tuple, optional
3778 Accepted for compatibility with NumPy.
3779 copy : bool, default False
3780 Whether to copy the data after transposing, even for DataFrames
3781 with a single dtype.
3782
3783 Note that a copy is always required for mixed dtype DataFrames,
3784 or for DataFrames with any extension types.
3785
3786 .. note::
3787 The `copy` keyword will change behavior in pandas 3.0.
3788 `Copy-on-Write
3789 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
3790 will be enabled by default, which means that all methods with a
3791 `copy` keyword will use a lazy copy mechanism to defer the copy and
3792 ignore the `copy` keyword. The `copy` keyword will be removed in a
3793 future version of pandas.
3794
You can already get the future behavior and improvements by
enabling copy-on-write: ``pd.options.mode.copy_on_write = True``.
3797
3798 Returns
3799 -------
3800 DataFrame
3801 The transposed DataFrame.
3802
3803 See Also
3804 --------
3805 numpy.transpose : Permute the dimensions of a given array.
3806
3807 Notes
3808 -----
3809 Transposing a DataFrame with mixed dtypes will result in a homogeneous
3810 DataFrame with the `object` dtype. In such a case, a copy of the data
3811 is always made.
3812
3813 Examples
3814 --------
3815 **Square DataFrame with homogeneous dtype**
3816
3817 >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
3818 >>> df1 = pd.DataFrame(data=d1)
3819 >>> df1
3820 col1 col2
3821 0 1 3
3822 1 2 4
3823
3824 >>> df1_transposed = df1.T # or df1.transpose()
3825 >>> df1_transposed
3826 0 1
3827 col1 1 2
3828 col2 3 4
3829
3830 When the dtype is homogeneous in the original DataFrame, we get a
3831 transposed DataFrame with the same dtype:
3832
3833 >>> df1.dtypes
3834 col1 int64
3835 col2 int64
3836 dtype: object
3837 >>> df1_transposed.dtypes
3838 0 int64
3839 1 int64
3840 dtype: object
3841
3842 **Non-square DataFrame with mixed dtypes**
3843
3844 >>> d2 = {'name': ['Alice', 'Bob'],
3845 ... 'score': [9.5, 8],
3846 ... 'employed': [False, True],
3847 ... 'kids': [0, 0]}
3848 >>> df2 = pd.DataFrame(data=d2)
3849 >>> df2
3850 name score employed kids
3851 0 Alice 9.5 False 0
3852 1 Bob 8.0 True 0
3853
3854 >>> df2_transposed = df2.T # or df2.transpose()
3855 >>> df2_transposed
3856 0 1
3857 name Alice Bob
3858 score 9.5 8.0
3859 employed False True
3860 kids 0 0
3861
3862 When the DataFrame has mixed dtypes, we get a transposed DataFrame with
3863 the `object` dtype:
3864
3865 >>> df2.dtypes
3866 name object
3867 score float64
3868 employed bool
3869 kids int64
3870 dtype: object
3871 >>> df2_transposed.dtypes
3872 0 object
3873 1 object
3874 dtype: object
3875 """
3876 nv.validate_transpose(args, {})
3877 # construct the args
3878
3879 dtypes = list(self.dtypes)
3880
3881 if self._can_fast_transpose:
3882 # Note: tests pass without this, but this improves perf quite a bit.
3883 new_vals = self._values.T
3884 if copy and not using_copy_on_write():
3885 new_vals = new_vals.copy()
3886
3887 result = self._constructor(
3888 new_vals,
3889 index=self.columns,
3890 columns=self.index,
3891 copy=False,
3892 dtype=new_vals.dtype,
3893 )
3894 if using_copy_on_write() and len(self) > 0:
3895 result._mgr.add_references(self._mgr) # type: ignore[arg-type]
3896
3897 elif (
3898 self._is_homogeneous_type
3899 and dtypes
3900 and isinstance(dtypes[0], ExtensionDtype)
3901 ):
3902 new_values: list
3903 if isinstance(dtypes[0], BaseMaskedDtype):
3904 # We have masked arrays with the same dtype. We can transpose faster.
3905 from pandas.core.arrays.masked import (
3906 transpose_homogeneous_masked_arrays,
3907 )
3908
3909 new_values = transpose_homogeneous_masked_arrays(
3910 cast(Sequence[BaseMaskedArray], self._iter_column_arrays())
3911 )
3912 elif isinstance(dtypes[0], ArrowDtype):
3913 # We have arrow EAs with the same dtype. We can transpose faster.
3914 from pandas.core.arrays.arrow.array import (
3915 ArrowExtensionArray,
3916 transpose_homogeneous_pyarrow,
3917 )
3918
3919 new_values = transpose_homogeneous_pyarrow(
3920 cast(Sequence[ArrowExtensionArray], self._iter_column_arrays())
3921 )
3922 else:
3923 # We have other EAs with the same dtype. We preserve dtype in transpose.
3924 dtyp = dtypes[0]
3925 arr_typ = dtyp.construct_array_type()
3926 values = self.values
3927 new_values = [arr_typ._from_sequence(row, dtype=dtyp) for row in values]
3928
3929 result = type(self)._from_arrays(
3930 new_values,
3931 index=self.columns,
3932 columns=self.index,
3933 verify_integrity=False,
3934 )
3935
3936 else:
3937 new_arr = self.values.T
3938 if copy and not using_copy_on_write():
3939 new_arr = new_arr.copy()
3940 result = self._constructor(
3941 new_arr,
3942 index=self.columns,
3943 columns=self.index,
3944 dtype=new_arr.dtype,
3945 # We already made a copy (more than one block)
3946 copy=False,
3947 )
3948
3949 return result.__finalize__(self, method="transpose")
3950
3951 @property
3952 def T(self) -> DataFrame:
3953 """
3954 The transpose of the DataFrame.
3955
3956 Returns
3957 -------
3958 DataFrame
3959 The transposed DataFrame.
3960
3961 See Also
3962 --------
3963 DataFrame.transpose : Transpose index and columns.
3964
3965 Examples
3966 --------
3967 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
3968 >>> df
3969 col1 col2
3970 0 1 3
3971 1 2 4
3972
3973 >>> df.T
3974 0 1
3975 col1 1 2
3976 col2 3 4
3977 """
3978 return self.transpose()
3979
3980 # ----------------------------------------------------------------------
3981 # Indexing Methods
3982
3983 def _ixs(self, i: int, axis: AxisInt = 0) -> Series:
3984 """
3985 Parameters
3986 ----------
3987 i : int
3988 axis : int
3989
3990 Returns
3991 -------
3992 Series
3993 """
3994 # irow
3995 if axis == 0:
3996 new_mgr = self._mgr.fast_xs(i)
3997
3998 # if we are a copy, mark as such
3999 copy = isinstance(new_mgr.array, np.ndarray) and new_mgr.array.base is None
4000 result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes)
4001 result._name = self.index[i]
4002 result = result.__finalize__(self)
4003 result._set_is_copy(self, copy=copy)
4004 return result
4005
4006 # icol
4007 else:
4008 label = self.columns[i]
4009
4010 col_mgr = self._mgr.iget(i)
4011 result = self._box_col_values(col_mgr, i)
4012
4013 # this is a cached value, mark it so
4014 result._set_as_cached(label, self)
4015 return result
4016
4017 def _get_column_array(self, i: int) -> ArrayLike:
4018 """
4019 Get the values of the i'th column (ndarray or ExtensionArray, as stored
4020 in the Block)
4021
4022 Warning! The returned array is a view but doesn't handle Copy-on-Write,
4023 so this should be used with caution (for read-only purposes).
4024 """
4025 return self._mgr.iget_values(i)
4026
4027 def _iter_column_arrays(self) -> Iterator[ArrayLike]:
4028 """
4029 Iterate over the arrays of all columns in order.
4030 This returns the values as stored in the Block (ndarray or ExtensionArray).
4031
4032 Warning! The returned array is a view but doesn't handle Copy-on-Write,
4033 so this should be used with caution (for read-only purposes).
4034 """
4035 if isinstance(self._mgr, ArrayManager):
4036 yield from self._mgr.arrays
4037 else:
4038 for i in range(len(self.columns)):
4039 yield self._get_column_array(i)
4040
4041 def _getitem_nocopy(self, key: list):
4042 """
4043 Behaves like __getitem__, but returns a view in cases where __getitem__
4044 would make a copy.
4045 """
4046 # TODO(CoW): can be removed if/when we are always Copy-on-Write
4047 indexer = self.columns._get_indexer_strict(key, "columns")[1]
4048 new_axis = self.columns[indexer]
4049
4050 new_mgr = self._mgr.reindex_indexer(
4051 new_axis,
4052 indexer,
4053 axis=0,
4054 allow_dups=True,
4055 copy=False,
4056 only_slice=True,
4057 )
4058 result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
4059 result = result.__finalize__(self)
4060 return result
4061
4062 def __getitem__(self, key):
4063 check_dict_or_set_indexers(key)
4064 key = lib.item_from_zerodim(key)
4065 key = com.apply_if_callable(key, self)
4066
4067 if is_hashable(key) and not is_iterator(key):
4068 # is_iterator to exclude generator e.g. test_getitem_listlike
4069 # shortcut if the key is in columns
4070 is_mi = isinstance(self.columns, MultiIndex)
4071 # GH#45316 Return view if key is not duplicated
4072 # Only use drop_duplicates with duplicates for performance
4073 if not is_mi and (
4074 self.columns.is_unique
4075 and key in self.columns
4076 or key in self.columns.drop_duplicates(keep=False)
4077 ):
4078 return self._get_item_cache(key)
4079
4080 elif is_mi and self.columns.is_unique and key in self.columns:
4081 return self._getitem_multilevel(key)
4082
4083 # Do we have a slicer (on rows)?
4084 if isinstance(key, slice):
4085 return self._getitem_slice(key)
4086
4087 # Do we have a (boolean) DataFrame?
4088 if isinstance(key, DataFrame):
4089 return self.where(key)
4090
4091 # Do we have a (boolean) 1d indexer?
4092 if com.is_bool_indexer(key):
4093 return self._getitem_bool_array(key)
4094
# We are left with two options: a single key, and a collection of keys.
4096 # We interpret tuples as collections only for non-MultiIndex
4097 is_single_key = isinstance(key, tuple) or not is_list_like(key)
4098
4099 if is_single_key:
4100 if self.columns.nlevels > 1:
4101 return self._getitem_multilevel(key)
4102 indexer = self.columns.get_loc(key)
4103 if is_integer(indexer):
4104 indexer = [indexer]
4105 else:
4106 if is_iterator(key):
4107 key = list(key)
4108 indexer = self.columns._get_indexer_strict(key, "columns")[1]
4109
4110 # take() does not accept boolean indexers
4111 if getattr(indexer, "dtype", None) == bool:
4112 indexer = np.where(indexer)[0]
4113
4114 if isinstance(indexer, slice):
4115 return self._slice(indexer, axis=1)
4116
4117 data = self._take_with_is_copy(indexer, axis=1)
4118
4119 if is_single_key:
4120 # What does looking for a single key in a non-unique index return?
4121 # The behavior is inconsistent. It returns a Series, except when
4122 # - the key itself is repeated (test on data.shape, #9519), or
4123 # - we have a MultiIndex on columns (test on self.columns, #21309)
4124 if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex):
4125 # GH#26490 using data[key] can cause RecursionError
4126 return data._get_item_cache(key)
4127
4128 return data
4129
4130 def _getitem_bool_array(self, key):
4131 # also raises Exception if object array with NA values
4132 # warning here just in case -- previously __setitem__ was
4133 # reindexing but __getitem__ was not; it seems more reasonable to
4134 # go with the __setitem__ behavior since that is more consistent
4135 # with all other indexing behavior
4136 if isinstance(key, Series) and not key.index.equals(self.index):
4137 warnings.warn(
4138 "Boolean Series key will be reindexed to match DataFrame index.",
4139 UserWarning,
4140 stacklevel=find_stack_level(),
4141 )
4142 elif len(key) != len(self.index):
4143 raise ValueError(
4144 f"Item wrong length {len(key)} instead of {len(self.index)}."
4145 )
4146
4147 # check_bool_indexer will throw exception if Series key cannot
4148 # be reindexed to match DataFrame rows
4149 key = check_bool_indexer(self.index, key)
4150
4151 if key.all():
4152 return self.copy(deep=None)
4153
4154 indexer = key.nonzero()[0]
4155 return self._take_with_is_copy(indexer, axis=0)
4156
4157 def _getitem_multilevel(self, key):
4158 # self.columns is a MultiIndex
4159 loc = self.columns.get_loc(key)
4160 if isinstance(loc, (slice, np.ndarray)):
4161 new_columns = self.columns[loc]
4162 result_columns = maybe_droplevels(new_columns, key)
4163 result = self.iloc[:, loc]
4164 result.columns = result_columns
4165
4166 # If there is only one column being returned, and its name is
4167 # either an empty string, or a tuple with an empty string as its
4168 # first element, then treat the empty string as a placeholder
4169 # and return the column as if the user had provided that empty
4170 # string in the key. If the result is a Series, exclude the
4171 # implied empty string from its name.
4172 if len(result.columns) == 1:
4173 # e.g. test_frame_getitem_multicolumn_empty_level,
4174 # test_frame_mixed_depth_get, test_loc_setitem_single_column_slice
4175 top = result.columns[0]
4176 if isinstance(top, tuple):
4177 top = top[0]
4178 if top == "":
4179 result = result[""]
4180 if isinstance(result, Series):
4181 result = self._constructor_sliced(
4182 result, index=self.index, name=key
4183 )
4184
4185 result._set_is_copy(self)
4186 return result
4187 else:
4188 # loc is neither a slice nor ndarray, so must be an int
4189 return self._ixs(loc, axis=1)
4190
4191 def _get_value(self, index, col, takeable: bool = False) -> Scalar:
4192 """
4193 Quickly retrieve single value at passed column and index.
4194
4195 Parameters
4196 ----------
4197 index : row label
4198 col : column label
takeable : bool, default False
    Interpret the index/col as indexers.
4200
4201 Returns
4202 -------
4203 scalar
4204
4205 Notes
4206 -----
4207 Assumes that both `self.index._index_as_unique` and
`self.columns._index_as_unique` are True; the caller is responsible for checking.
4209 """
4210 if takeable:
4211 series = self._ixs(col, axis=1)
4212 return series._values[index]
4213
4214 series = self._get_item_cache(col)
4215 engine = self.index._engine
4216
4217 if not isinstance(self.index, MultiIndex):
4218 # CategoricalIndex: Trying to use the engine fastpath may give incorrect
# results if our categories are integers that don't match our codes
4220 # IntervalIndex: IntervalTree has no get_loc
4221 row = self.index.get_loc(index)
4222 return series._values[row]
4223
4224 # For MultiIndex going through engine effectively restricts us to
4225 # same-length tuples; see test_get_set_value_no_partial_indexing
4226 loc = engine.get_loc(index)
4227 return series._values[loc]
4228
4229 def isetitem(self, loc, value) -> None:
4230 """
4231 Set the given value in the column with position `loc`.
4232
4233 This is a positional analogue to ``__setitem__``.
4234
4235 Parameters
4236 ----------
4237 loc : int or sequence of ints
4238 Index position for the column.
4239 value : scalar or arraylike
4240 Value(s) for the column.
4241
4242 Notes
4243 -----
4244 ``frame.isetitem(loc, value)`` is an in-place method as it will
4245 modify the DataFrame in place (not returning a new object). In contrast to
``frame.iloc[:, i] = value``, which will try to update the existing values in
place, ``frame.isetitem(loc, value)`` will not update the values of the column
itself in place; it will instead insert a new array.
4249
4250 In cases where ``frame.columns`` is unique, this is equivalent to
4251 ``frame[frame.columns[i]] = value``.
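
Examples
--------
A minimal illustration: set the column at position 1 by value.

>>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
>>> df.isetitem(1, [10, 20])
>>> df
   A   B
0  1  10
1  2  20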
4252 """
4253 if isinstance(value, DataFrame):
4254 if is_integer(loc):
4255 loc = [loc]
4256
4257 if len(loc) != len(value.columns):
4258 raise ValueError(
4259 f"Got {len(loc)} positions but value has {len(value.columns)} "
4260 f"columns."
4261 )
4262
4263 for i, idx in enumerate(loc):
4264 arraylike, refs = self._sanitize_column(value.iloc[:, i])
4265 self._iset_item_mgr(idx, arraylike, inplace=False, refs=refs)
4266 return
4267
4268 arraylike, refs = self._sanitize_column(value)
4269 self._iset_item_mgr(loc, arraylike, inplace=False, refs=refs)
4270
4271 def __setitem__(self, key, value) -> None:
4272 if not PYPY and using_copy_on_write():
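# Heuristic: a reference count this low suggests ``self`` is a temporary,
# e.g. the intermediate object in chained indexing like ``df["a"]["b"] = value``,
# whose assignment would be lost under Copy-on-Write.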
4273 if sys.getrefcount(self) <= 3:
4274 warnings.warn(
4275 _chained_assignment_msg, ChainedAssignmentError, stacklevel=2
4276 )
4277 elif not PYPY and not using_copy_on_write():
4278 if sys.getrefcount(self) <= 3 and (
4279 warn_copy_on_write()
4280 or (
4281 not warn_copy_on_write()
4282 and any(b.refs.has_reference() for b in self._mgr.blocks) # type: ignore[union-attr]
4283 )
4284 ):
4285 warnings.warn(
4286 _chained_assignment_warning_msg, FutureWarning, stacklevel=2
4287 )
4288
4289 key = com.apply_if_callable(key, self)
4290
4291 # see if we can slice the rows
4292 if isinstance(key, slice):
4293 slc = self.index._convert_slice_indexer(key, kind="getitem")
4294 return self._setitem_slice(slc, value)
4295
4296 if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2:
4297 self._setitem_frame(key, value)
4298 elif isinstance(key, (Series, np.ndarray, list, Index)):
4299 self._setitem_array(key, value)
4300 elif isinstance(value, DataFrame):
4301 self._set_item_frame_value(key, value)
4302 elif (
4303 is_list_like(value)
4304 and not self.columns.is_unique
4305 and 1 < len(self.columns.get_indexer_for([key])) == len(value)
4306 ):
4307 # Column to set is duplicated
4308 self._setitem_array([key], value)
4309 else:
4310 # set column
4311 self._set_item(key, value)
4312
4313 def _setitem_slice(self, key: slice, value) -> None:
4314 # NB: we can't just use self.loc[key] = value because that
4315 # operates on labels and we need to operate positional for
4316 # backwards-compat, xref GH#31469
4317 self._check_setitem_copy()
4318 self.iloc[key] = value
4319
4320 def _setitem_array(self, key, value):
4321 # also raises Exception if object array with NA values
4322 if com.is_bool_indexer(key):
4323 # bool indexer is indexing along rows
4324 if len(key) != len(self.index):
4325 raise ValueError(
4326 f"Item wrong length {len(key)} instead of {len(self.index)}!"
4327 )
4328 key = check_bool_indexer(self.index, key)
4329 indexer = key.nonzero()[0]
4330 self._check_setitem_copy()
4331 if isinstance(value, DataFrame):
4332 # GH#39931 reindex since iloc does not align
4333 value = value.reindex(self.index.take(indexer))
4334 self.iloc[indexer] = value
4335
4336 else:
4337 # Note: unlike self.iloc[:, indexer] = value, this will
4338 # never try to overwrite values inplace
4339
4340 if isinstance(value, DataFrame):
4341 check_key_length(self.columns, key, value)
4342 for k1, k2 in zip(key, value.columns):
4343 self[k1] = value[k2]
4344
4345 elif not is_list_like(value):
4346 for col in key:
4347 self[col] = value
4348
4349 elif isinstance(value, np.ndarray) and value.ndim == 2:
4350 self._iset_not_inplace(key, value)
4351
4352 elif np.ndim(value) > 1:
4353 # list of lists
4354 value = DataFrame(value).values
4355 return self._setitem_array(key, value)
4356
4357 else:
4358 self._iset_not_inplace(key, value)
4359
4360 def _iset_not_inplace(self, key, value):
4361 # GH#39510 when setting with df[key] = obj with a list-like key and
4362 # list-like value, we iterate over those listlikes and set columns
4363 # one at a time. This is different from dispatching to
4364 # `self.loc[:, key]= value` because loc.__setitem__ may overwrite
4365 # data inplace, whereas this will insert new arrays.
4366
4367 def igetitem(obj, i: int):
4368 # Note: we catch DataFrame obj before getting here, but
4369 # hypothetically would return obj.iloc[:, i]
4370 if isinstance(obj, np.ndarray):
4371 return obj[..., i]
4372 else:
4373 return obj[i]
4374
4375 if self.columns.is_unique:
4376 if np.shape(value)[-1] != len(key):
4377 raise ValueError("Columns must be same length as key")
4378
4379 for i, col in enumerate(key):
4380 self[col] = igetitem(value, i)
4381
4382 else:
4383 ilocs = self.columns.get_indexer_non_unique(key)[0]
4384 if (ilocs < 0).any():
4385 # key entries not in self.columns
4386 raise NotImplementedError
4387
4388 if np.shape(value)[-1] != len(ilocs):
4389 raise ValueError("Columns must be same length as key")
4390
4391 assert np.ndim(value) <= 2
4392
4393 orig_columns = self.columns
4394
4395 # Using self.iloc[:, i] = ... may set values inplace, which
4396 # by convention we do not do in __setitem__
4397 try:
4398 self.columns = Index(range(len(self.columns)))
4399 for i, iloc in enumerate(ilocs):
4400 self[iloc] = igetitem(value, i)
4401 finally:
4402 self.columns = orig_columns
4403
4404 def _setitem_frame(self, key, value):
4405 # support boolean setting with DataFrame input, e.g.
4406 # df[df > df2] = 0
4407 if isinstance(key, np.ndarray):
4408 if key.shape != self.shape:
4409 raise ValueError("Array conditional must be same shape as self")
4410 key = self._constructor(key, **self._construct_axes_dict(), copy=False)
4411
4412 if key.size and not all(is_bool_dtype(dtype) for dtype in key.dtypes):
4413 raise TypeError(
4414 "Must pass DataFrame or 2-d ndarray with boolean values only"
4415 )
4416
4417 self._check_setitem_copy()
4418 self._where(-key, value, inplace=True)
4419
4420 def _set_item_frame_value(self, key, value: DataFrame) -> None:
4421 self._ensure_valid_index(value)
4422
4423 # align columns
4424 if key in self.columns:
4425 loc = self.columns.get_loc(key)
4426 cols = self.columns[loc]
4427 len_cols = 1 if is_scalar(cols) or isinstance(cols, tuple) else len(cols)
4428 if len_cols != len(value.columns):
4429 raise ValueError("Columns must be same length as key")
4430
4431 # align right-hand-side columns if self.columns
4432 # is multi-index and self[key] is a sub-frame
4433 if isinstance(self.columns, MultiIndex) and isinstance(
4434 loc, (slice, Series, np.ndarray, Index)
4435 ):
4436 cols_droplevel = maybe_droplevels(cols, key)
4437 if len(cols_droplevel) and not cols_droplevel.equals(value.columns):
4438 value = value.reindex(cols_droplevel, axis=1)
4439
4440 for col, col_droplevel in zip(cols, cols_droplevel):
4441 self[col] = value[col_droplevel]
4442 return
4443
4444 if is_scalar(cols):
4445 self[cols] = value[value.columns[0]]
4446 return
4447
4448 locs: np.ndarray | list
4449 if isinstance(loc, slice):
4450 locs = np.arange(loc.start, loc.stop, loc.step)
4451 elif is_scalar(loc):
4452 locs = [loc]
4453 else:
4454 locs = loc.nonzero()[0]
4455
4456 return self.isetitem(locs, value)
4457
4458 if len(value.columns) > 1:
4459 raise ValueError(
4460 "Cannot set a DataFrame with multiple columns to the single "
4461 f"column {key}"
4462 )
4463 elif len(value.columns) == 0:
4464 raise ValueError(
4465 f"Cannot set a DataFrame without columns to the column {key}"
4466 )
4467
4468 self[key] = value[value.columns[0]]
4469
4470 def _iset_item_mgr(
4471 self,
4472 loc: int | slice | np.ndarray,
4473 value,
4474 inplace: bool = False,
4475 refs: BlockValuesRefs | None = None,
4476 ) -> None:
4477 # when called from _set_item_mgr loc can be anything returned from get_loc
4478 self._mgr.iset(loc, value, inplace=inplace, refs=refs)
4479 self._clear_item_cache()
4480
4481 def _set_item_mgr(
4482 self, key, value: ArrayLike, refs: BlockValuesRefs | None = None
4483 ) -> None:
4484 try:
4485 loc = self._info_axis.get_loc(key)
4486 except KeyError:
4487 # This item wasn't present, just insert at end
4488 self._mgr.insert(len(self._info_axis), key, value, refs)
4489 else:
4490 self._iset_item_mgr(loc, value, refs=refs)
4491
4492 # check if we are modifying a copy
4493 # try to set first as we want an invalid
4494 # value exception to occur first
4495 if len(self):
4496 self._check_setitem_copy()
4497
4498 def _iset_item(self, loc: int, value: Series, inplace: bool = True) -> None:
4499 # We are only called from _replace_columnwise which guarantees that
4500 # no reindex is necessary
4501 if using_copy_on_write():
4502 self._iset_item_mgr(
4503 loc, value._values, inplace=inplace, refs=value._references
4504 )
4505 else:
4506 self._iset_item_mgr(loc, value._values.copy(), inplace=True)
4507
4508 # check if we are modifying a copy
4509 # try to set first as we want an invalid
4510 # value exception to occur first
4511 if len(self):
4512 self._check_setitem_copy()
4513
4514 def _set_item(self, key, value) -> None:
4515 """
4516 Add series to DataFrame in specified column.
4517
If series is a numpy-array (not a Series/TimeSeries), it must be the
same length as the DataFrame's index or an error will be raised.
4520
Series/TimeSeries will be conformed to the DataFrame's index to
4522 ensure homogeneity.
4523 """
4524 value, refs = self._sanitize_column(value)
4525
4526 if (
4527 key in self.columns
4528 and value.ndim == 1
4529 and not isinstance(value.dtype, ExtensionDtype)
4530 ):
4531 # broadcast across multiple columns if necessary
4532 if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
4533 existing_piece = self[key]
4534 if isinstance(existing_piece, DataFrame):
4535 value = np.tile(value, (len(existing_piece.columns), 1)).T
4536 refs = None
4537
4538 self._set_item_mgr(key, value, refs)
4539
4540 def _set_value(
4541 self, index: IndexLabel, col, value: Scalar, takeable: bool = False
4542 ) -> None:
4543 """
4544 Put single value at passed column and index.
4545
4546 Parameters
4547 ----------
4548 index : Label
4549 row label
4550 col : Label
4551 column label
4552 value : scalar
4553 takeable : bool, default False
Sets whether or not index/col are interpreted as indexers
4555 """
4556 try:
4557 if takeable:
4558 icol = col
4559 iindex = cast(int, index)
4560 else:
4561 icol = self.columns.get_loc(col)
4562 iindex = self.index.get_loc(index)
4563 self._mgr.column_setitem(icol, iindex, value, inplace_only=True)
4564 self._clear_item_cache()
4565
4566 except (KeyError, TypeError, ValueError, LossySetitemError):
4567 # get_loc might raise a KeyError for missing labels (falling back
4568 # to (i)loc will do expansion of the index)
4569 # column_setitem will do validation that may raise TypeError,
4570 # ValueError, or LossySetitemError
4571 # set using a non-recursive method & reset the cache
4572 if takeable:
4573 self.iloc[index, col] = value
4574 else:
4575 self.loc[index, col] = value
4576 self._item_cache.pop(col, None)
4577
4578 except InvalidIndexError as ii_err:
4579 # GH48729: Seems like you are trying to assign a value to a
4580 # row when only scalar options are permitted
4581 raise InvalidIndexError(
4582 f"You can only assign a scalar value not a {type(value)}"
4583 ) from ii_err
4584
4585 def _ensure_valid_index(self, value) -> None:
4586 """
Ensure that if we don't have an index, we can create one from the
passed value.
4589 """
4590 # GH5632, make sure that we are a Series convertible
4591 if not len(self.index) and is_list_like(value) and len(value):
4592 if not isinstance(value, DataFrame):
4593 try:
4594 value = Series(value)
4595 except (ValueError, NotImplementedError, TypeError) as err:
4596 raise ValueError(
4597 "Cannot set a frame with no defined index "
4598 "and a value that cannot be converted to a Series"
4599 ) from err
4600
4601 # GH31368 preserve name of index
4602 index_copy = value.index.copy()
4603 if self.index.name is not None:
4604 index_copy.name = self.index.name
4605
4606 self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan)
4607
4608 def _box_col_values(self, values: SingleDataManager, loc: int) -> Series:
4609 """
4610 Provide boxed values for a column.
4611 """
4612 # Lookup in columns so that if e.g. a str datetime was passed
4613 # we attach the Timestamp object as the name.
4614 name = self.columns[loc]
4615 # We get index=self.index bc values is a SingleDataManager
4616 obj = self._constructor_sliced_from_mgr(values, axes=values.axes)
4617 obj._name = name
4618 return obj.__finalize__(self)
4619
4620 # ----------------------------------------------------------------------
4621 # Lookup Caching
4622
4623 def _clear_item_cache(self) -> None:
4624 self._item_cache.clear()
4625
4626 def _get_item_cache(self, item: Hashable) -> Series:
4627 """Return the cached item, item represents a label indexer."""
4628 if using_copy_on_write() or warn_copy_on_write():
4629 loc = self.columns.get_loc(item)
4630 return self._ixs(loc, axis=1)
4631
4632 cache = self._item_cache
4633 res = cache.get(item)
4634 if res is None:
4635 # All places that call _get_item_cache have unique columns,
4636 # pending resolution of GH#33047
4637
4638 loc = self.columns.get_loc(item)
4639 res = self._ixs(loc, axis=1)
4640
4641 cache[item] = res
4642
4643 # for a chain
4644 res._is_copy = self._is_copy
4645 return res
4646
4647 def _reset_cacher(self) -> None:
4648 # no-op for DataFrame
4649 pass
4650
4651 def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None:
4652 """
4653 The object has called back to us saying maybe it has changed.
4654 """
4655 loc = self._info_axis.get_loc(item)
4656 arraylike = value._values
4657
4658 old = self._ixs(loc, axis=1)
4659 if old._values is value._values and inplace:
4660 # GH#46149 avoid making unnecessary copies/block-splitting
4661 return
4662
4663 self._mgr.iset(loc, arraylike, inplace=inplace)
4664
4665 # ----------------------------------------------------------------------
4666 # Unsorted
4667
4668 @overload
4669 def query(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> DataFrame:
4670 ...
4671
4672 @overload
4673 def query(self, expr: str, *, inplace: Literal[True], **kwargs) -> None:
4674 ...
4675
4676 @overload
4677 def query(self, expr: str, *, inplace: bool = ..., **kwargs) -> DataFrame | None:
4678 ...
4679
4680 def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | None:
4681 """
4682 Query the columns of a DataFrame with a boolean expression.
4683
4684 Parameters
4685 ----------
4686 expr : str
4687 The query string to evaluate.
4688
4689 You can refer to variables
4690 in the environment by prefixing them with an '@' character like
4691 ``@a + b``.
4692
4693 You can refer to column names that are not valid Python variable names
4694 by surrounding them in backticks. Thus, column names containing spaces
or punctuation (besides underscores) or starting with digits must be
4696 surrounded by backticks. (For example, a column named "Area (cm^2)" would
4697 be referenced as ```Area (cm^2)```). Column names which are Python keywords
4698 (like "list", "for", "import", etc) cannot be used.
4699
4700 For example, if one of your columns is called ``a a`` and you want
4701 to sum it with ``b``, your query should be ```a a` + b``.
4702
inplace : bool, default False
4704 Whether to modify the DataFrame rather than creating a new one.
4705 **kwargs
4706 See the documentation for :func:`eval` for complete details
4707 on the keyword arguments accepted by :meth:`DataFrame.query`.
4708
4709 Returns
4710 -------
4711 DataFrame or None
4712 DataFrame resulting from the provided query expression or
4713 None if ``inplace=True``.
4714
4715 See Also
4716 --------
4717 eval : Evaluate a string describing operations on
4718 DataFrame columns.
4719 DataFrame.eval : Evaluate a string describing operations on
4720 DataFrame columns.
4721
4722 Notes
4723 -----
4724 The result of the evaluation of this expression is first passed to
4725 :attr:`DataFrame.loc` and if that fails because of a
4726 multidimensional key (e.g., a DataFrame) then the result will be passed
4727 to :meth:`DataFrame.__getitem__`.
4728
4729 This method uses the top-level :func:`eval` function to
4730 evaluate the passed query.
4731
4732 The :meth:`~pandas.DataFrame.query` method uses a slightly
4733 modified Python syntax by default. For example, the ``&`` and ``|``
4734 (bitwise) operators have the precedence of their boolean cousins,
:keyword:`and` and :keyword:`or`. This *is* syntactically valid Python;
however, the semantics are different.
4737
4738 You can change the semantics of the expression by passing the keyword
4739 argument ``parser='python'``. This enforces the same semantics as
4740 evaluation in Python space. Likewise, you can pass ``engine='python'``
4741 to evaluate an expression using Python itself as a backend. This is not
4742 recommended as it is inefficient compared to using ``numexpr`` as the
4743 engine.
4744
4745 The :attr:`DataFrame.index` and
4746 :attr:`DataFrame.columns` attributes of the
4747 :class:`~pandas.DataFrame` instance are placed in the query namespace
4748 by default, which allows you to treat both the index and columns of the
4749 frame as a column in the frame.
4750 The identifier ``index`` is used for the frame index; you can also
4751 use the name of the index to identify it in a query. Please note that
4752 Python keywords may not be used as identifiers.
4753
4754 For further details and examples see the ``query`` documentation in
4755 :ref:`indexing <indexing.query>`.
4756
4757 *Backtick quoted variables*
4758
4759 Backtick quoted variables are parsed as literal Python code and
4760 are converted internally to a Python valid identifier.
4761 This can lead to the following problems.
4762
4763 During parsing a number of disallowed characters inside the backtick
4764 quoted string are replaced by strings that are allowed as a Python identifier.
4765 These characters include all operators in Python, the space character, the
4766 question mark, the exclamation mark, the dollar sign, and the euro sign.
4767 For other characters that fall outside the ASCII range (U+0001..U+007F)
4768 and those that are not further specified in PEP 3131,
4769 the query parser will raise an error.
Also disallowed are whitespace other than the space character,
the hash character (as it is used for comments), and the backtick
itself (the backtick cannot be escaped).
4773
4774 In a special case, quotes that make a pair around a backtick can
4775 confuse the parser.
4776 For example, ```it's` > `that's``` will raise an error,
4777 as it forms a quoted string (``'s > `that'``) with a backtick inside.
4778
4779 See also the Python documentation about lexical analysis
4780 (https://docs.python.org/3/reference/lexical_analysis.html)
4781 in combination with the source code in :mod:`pandas.core.computation.parsing`.
4782
4783 Examples
4784 --------
4785 >>> df = pd.DataFrame({'A': range(1, 6),
4786 ... 'B': range(10, 0, -2),
4787 ... 'C C': range(10, 5, -1)})
4788 >>> df
4789 A B C C
4790 0 1 10 10
4791 1 2 8 9
4792 2 3 6 8
4793 3 4 4 7
4794 4 5 2 6
4795 >>> df.query('A > B')
4796 A B C C
4797 4 5 2 6
4798
4799 The previous expression is equivalent to
4800
4801 >>> df[df.A > df.B]
4802 A B C C
4803 4 5 2 6
4804
4805 For columns with spaces in their name, you can use backtick quoting.
4806
4807 >>> df.query('B == `C C`')
4808 A B C C
4809 0 1 10 10
4810
4811 The previous expression is equivalent to
4812
4813 >>> df[df.B == df['C C']]
4814 A B C C
4815 0 1 10 10
4816 """
4817 inplace = validate_bool_kwarg(inplace, "inplace")
4818 if not isinstance(expr, str):
4819 msg = f"expr must be a string to be evaluated, {type(expr)} given"
4820 raise ValueError(msg)
4821 kwargs["level"] = kwargs.pop("level", 0) + 1
4822 kwargs["target"] = None
4823 res = self.eval(expr, **kwargs)
4824
4825 try:
4826 result = self.loc[res]
4827 except ValueError:
4828 # when res is multi-dimensional loc raises, but this is sometimes a
4829 # valid query
4830 result = self[res]
4831
4832 if inplace:
4833 self._update_inplace(result)
4834 return None
4835 else:
4836 return result
4837
4838 @overload
4839 def eval(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> Any:
4840 ...
4841
4842 @overload
4843 def eval(self, expr: str, *, inplace: Literal[True], **kwargs) -> None:
4844 ...
4845
4846 def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
4847 """
4848 Evaluate a string describing operations on DataFrame columns.
4849
4850 Operates on columns only, not specific rows or elements. This allows
4851 `eval` to run arbitrary code, which can make you vulnerable to code
4852 injection if you pass user input to this function.
4853
4854 Parameters
4855 ----------
4856 expr : str
4857 The expression string to evaluate.
4858 inplace : bool, default False
4859 If the expression contains an assignment, whether to perform the
4860 operation inplace and mutate the existing DataFrame. Otherwise,
4861 a new DataFrame is returned.
4862 **kwargs
4863 See the documentation for :func:`eval` for complete details
4864 on the keyword arguments accepted by
4865 :meth:`~pandas.DataFrame.query`.
4866
4867 Returns
4868 -------
4869 ndarray, scalar, pandas object, or None
4870 The result of the evaluation or None if ``inplace=True``.
4871
4872 See Also
4873 --------
4874 DataFrame.query : Evaluates a boolean expression to query the columns
4875 of a frame.
4876 DataFrame.assign : Can evaluate an expression or function to create new
4877 values for a column.
4878 eval : Evaluate a Python expression as a string using various
4879 backends.
4880
4881 Notes
4882 -----
4883 For more details see the API documentation for :func:`~eval`.
4884 For detailed examples see :ref:`enhancing performance with eval
4885 <enhancingperf.eval>`.
4886
4887 Examples
4888 --------
4889 >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
4890 >>> df
4891 A B
4892 0 1 10
4893 1 2 8
4894 2 3 6
4895 3 4 4
4896 4 5 2
4897 >>> df.eval('A + B')
4898 0 11
4899 1 10
4900 2 9
4901 3 8
4902 4 7
4903 dtype: int64
4904
Assignment is allowed, though by default the original DataFrame is not
4906 modified.
4907
4908 >>> df.eval('C = A + B')
4909 A B C
4910 0 1 10 11
4911 1 2 8 10
4912 2 3 6 9
4913 3 4 4 8
4914 4 5 2 7
4915 >>> df
4916 A B
4917 0 1 10
4918 1 2 8
4919 2 3 6
4920 3 4 4
4921 4 5 2
4922
4923 Multiple columns can be assigned to using multi-line expressions:
4924
4925 >>> df.eval(
4926 ... '''
4927 ... C = A + B
4928 ... D = A - B
4929 ... '''
4930 ... )
4931 A B C D
4932 0 1 10 11 -9
4933 1 2 8 10 -6
4934 2 3 6 9 -3
4935 3 4 4 8 0
4936 4 5 2 7 3
4937 """
4938 from pandas.core.computation.eval import eval as _eval
4939
4940 inplace = validate_bool_kwarg(inplace, "inplace")
4941 kwargs["level"] = kwargs.pop("level", 0) + 1
4942 index_resolvers = self._get_index_resolvers()
4943 column_resolvers = self._get_cleaned_column_resolvers()
4944 resolvers = column_resolvers, index_resolvers
4945 if "target" not in kwargs:
4946 kwargs["target"] = self
4947 kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers
4948
4949 return _eval(expr, inplace=inplace, **kwargs)
4950
4951 def select_dtypes(self, include=None, exclude=None) -> Self:
4952 """
4953 Return a subset of the DataFrame's columns based on the column dtypes.
4954
4955 Parameters
4956 ----------
4957 include, exclude : scalar or list-like
4958 A selection of dtypes or strings to be included/excluded. At least
4959 one of these parameters must be supplied.
4960
4961 Returns
4962 -------
4963 DataFrame
4964 The subset of the frame including the dtypes in ``include`` and
4965 excluding the dtypes in ``exclude``.
4966
4967 Raises
4968 ------
4969 ValueError
4970 * If both of ``include`` and ``exclude`` are empty
4971 * If ``include`` and ``exclude`` have overlapping elements
4972 * If any kind of string dtype is passed in.
4973
4974 See Also
4975 --------
4976 DataFrame.dtypes: Return Series with the data type of each column.
4977
4978 Notes
4979 -----
4980 * To select all *numeric* types, use ``np.number`` or ``'number'``
4981 * To select strings you must use the ``object`` dtype, but note that
4982 this will return *all* object dtype columns
4983 * See the `numpy dtype hierarchy
4984 <https://numpy.org/doc/stable/reference/arrays.scalars.html>`__
4985 * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
4986 ``'datetime64'``
4987 * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
4988 ``'timedelta64'``
4989 * To select Pandas categorical dtypes, use ``'category'``
4990 * To select Pandas datetimetz dtypes, use ``'datetimetz'``
4991 or ``'datetime64[ns, tz]'``
4992
4993 Examples
4994 --------
4995 >>> df = pd.DataFrame({'a': [1, 2] * 3,
4996 ... 'b': [True, False] * 3,
4997 ... 'c': [1.0, 2.0] * 3})
4998 >>> df
4999 a b c
5000 0 1 True 1.0
5001 1 2 False 2.0
5002 2 1 True 1.0
5003 3 2 False 2.0
5004 4 1 True 1.0
5005 5 2 False 2.0
5006
5007 >>> df.select_dtypes(include='bool')
5008 b
5009 0 True
5010 1 False
5011 2 True
5012 3 False
5013 4 True
5014 5 False
5015
5016 >>> df.select_dtypes(include=['float64'])
5017 c
5018 0 1.0
5019 1 2.0
5020 2 1.0
5021 3 2.0
5022 4 1.0
5023 5 2.0
5024
5025 >>> df.select_dtypes(exclude=['int64'])
5026 b c
5027 0 True 1.0
5028 1 False 2.0
5029 2 True 1.0
5030 3 False 2.0
5031 4 True 1.0
5032 5 False 2.0
5033 """
5034 if not is_list_like(include):
5035 include = (include,) if include is not None else ()
5036 if not is_list_like(exclude):
5037 exclude = (exclude,) if exclude is not None else ()
5038
5039 selection = (frozenset(include), frozenset(exclude))
5040
5041 if not any(selection):
5042 raise ValueError("at least one of include or exclude must be nonempty")
5043
5044 # convert the myriad valid dtypes object to a single representation
5045 def check_int_infer_dtype(dtypes):
5046 converted_dtypes: list[type] = []
5047 for dtype in dtypes:
# NumPy maps int to different types (int32, int64) on Windows and Linux
5049 # see https://github.com/numpy/numpy/issues/9464
5050 if (isinstance(dtype, str) and dtype == "int") or (dtype is int):
5051 converted_dtypes.append(np.int32)
5052 converted_dtypes.append(np.int64)
5053 elif dtype == "float" or dtype is float:
5054 # GH#42452 : np.dtype("float") coerces to np.float64 from Numpy 1.20
5055 converted_dtypes.extend([np.float64, np.float32])
5056 else:
5057 converted_dtypes.append(infer_dtype_from_object(dtype))
5058 return frozenset(converted_dtypes)
5059
5060 include = check_int_infer_dtype(include)
5061 exclude = check_int_infer_dtype(exclude)
5062
5063 for dtypes in (include, exclude):
5064 invalidate_string_dtypes(dtypes)
5065
5066 # can't both include AND exclude!
5067 if not include.isdisjoint(exclude):
5068 raise ValueError(f"include and exclude overlap on {(include & exclude)}")
5069
5070 def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool:
5071 # GH 46870: BooleanDtype._is_numeric == True but should be excluded
5072 dtype = dtype if not isinstance(dtype, ArrowDtype) else dtype.numpy_dtype
5073 return issubclass(dtype.type, tuple(dtypes_set)) or (
5074 np.number in dtypes_set
5075 and getattr(dtype, "_is_numeric", False)
5076 and not is_bool_dtype(dtype)
5077 )
5078
5079 def predicate(arr: ArrayLike) -> bool:
5080 dtype = arr.dtype
5081 if include:
5082 if not dtype_predicate(dtype, include):
5083 return False
5084
5085 if exclude:
5086 if dtype_predicate(dtype, exclude):
5087 return False
5088
5089 return True
5090
5091 mgr = self._mgr._get_data_subset(predicate).copy(deep=None)
5092 # error: Incompatible return value type (got "DataFrame", expected "Self")
5093 return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self) # type: ignore[return-value]
5094
5095 def insert(
5096 self,
5097 loc: int,
5098 column: Hashable,
5099 value: Scalar | AnyArrayLike,
5100 allow_duplicates: bool | lib.NoDefault = lib.no_default,
5101 ) -> None:
5102 """
5103 Insert column into DataFrame at specified location.
5104
5105 Raises a ValueError if `column` is already contained in the DataFrame,
5106 unless `allow_duplicates` is set to True.
5107
5108 Parameters
5109 ----------
5110 loc : int
Insertion index. Must satisfy 0 <= loc <= len(columns).
5112 column : str, number, or hashable object
5113 Label of the inserted column.
5114 value : Scalar, Series, or array-like
5115 Content of the inserted column.
5116 allow_duplicates : bool, optional, default lib.no_default
5117 Allow duplicate column labels to be created.
5118
5119 See Also
5120 --------
5121 Index.insert : Insert new item by index.
5122
5123 Examples
5124 --------
5125 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
5126 >>> df
5127 col1 col2
5128 0 1 3
5129 1 2 4
5130 >>> df.insert(1, "newcol", [99, 99])
5131 >>> df
5132 col1 newcol col2
5133 0 1 99 3
5134 1 2 99 4
5135 >>> df.insert(0, "col1", [100, 100], allow_duplicates=True)
5136 >>> df
5137 col1 col1 newcol col2
5138 0 100 1 99 3
5139 1 100 2 99 4
5140
Notice that pandas uses index alignment when `value` is a `Series`:
5142
5143 >>> df.insert(0, "col0", pd.Series([5, 6], index=[1, 2]))
5144 >>> df
5145 col0 col1 col1 newcol col2
5146 0 NaN 100 1 99 3
5147 1 5.0 100 2 99 4
5148 """
5149 if allow_duplicates is lib.no_default:
5150 allow_duplicates = False
5151 if allow_duplicates and not self.flags.allows_duplicate_labels:
5152 raise ValueError(
5153 "Cannot specify 'allow_duplicates=True' when "
5154 "'self.flags.allows_duplicate_labels' is False."
5155 )
5156 if not allow_duplicates and column in self.columns:
5157 # Should this be a different kind of error??
5158 raise ValueError(f"cannot insert {column}, already exists")
5159 if not is_integer(loc):
5160 raise TypeError("loc must be int")
5161 # convert non stdlib ints to satisfy typing checks
5162 loc = int(loc)
5163 if isinstance(value, DataFrame) and len(value.columns) > 1:
5164 raise ValueError(
5165 f"Expected a one-dimensional object, got a DataFrame with "
5166 f"{len(value.columns)} columns instead."
5167 )
5168 elif isinstance(value, DataFrame):
5169 value = value.iloc[:, 0]
5170
5171 value, refs = self._sanitize_column(value)
5172 self._mgr.insert(loc, column, value, refs=refs)
5173
5174 def assign(self, **kwargs) -> DataFrame:
5175 r"""
5176 Assign new columns to a DataFrame.
5177
5178 Returns a new object with all original columns in addition to new ones.
5179 Existing columns that are re-assigned will be overwritten.
5180
5181 Parameters
5182 ----------
5183 **kwargs : dict of {str: callable or Series}
5184 The column names are keywords. If the values are
5185 callable, they are computed on the DataFrame and
5186 assigned to the new columns. The callable must not
change the input DataFrame (though pandas doesn't check it).
5188 If the values are not callable, (e.g. a Series, scalar, or array),
5189 they are simply assigned.
5190
5191 Returns
5192 -------
5193 DataFrame
5194 A new DataFrame with the new columns in addition to
5195 all the existing columns.
5196
5197 Notes
5198 -----
5199 Assigning multiple columns within the same ``assign`` is possible.
5200 Later items in '\*\*kwargs' may refer to newly created or modified
5201 columns in 'df'; items are computed and assigned into 'df' in order.
5202
5203 Examples
5204 --------
5205 >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]},
5206 ... index=['Portland', 'Berkeley'])
5207 >>> df
5208 temp_c
5209 Portland 17.0
5210 Berkeley 25.0
5211
5212 Where the value is a callable, evaluated on `df`:
5213
5214 >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
5215 temp_c temp_f
5216 Portland 17.0 62.6
5217 Berkeley 25.0 77.0
5218
5219 Alternatively, the same behavior can be achieved by directly
5220 referencing an existing Series or sequence:
5221
5222 >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
5223 temp_c temp_f
5224 Portland 17.0 62.6
5225 Berkeley 25.0 77.0
5226
5227 You can create multiple columns within the same assign where one
5228 of the columns depends on another one defined within the same assign:
5229
5230 >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
5231 ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
5232 temp_c temp_f temp_k
5233 Portland 17.0 62.6 290.15
5234 Berkeley 25.0 77.0 298.15
5235 """
5236 data = self.copy(deep=None)
5237
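# Evaluate kwargs in order so later items can refer to columns created or
# modified by earlier ones (see Notes above).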
5238 for k, v in kwargs.items():
5239 data[k] = com.apply_if_callable(v, data)
5240 return data
5241
5242 def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]:
5243 """
5244 Ensures new columns (which go into the BlockManager as new blocks) are
5245 always copied (or a reference is being tracked to them under CoW)
5246 and converted into an array.
5247
5248 Parameters
5249 ----------
5250 value : scalar, Series, or array-like
5251
5252 Returns
5253 -------
5254 tuple of numpy.ndarray or ExtensionArray and optional BlockValuesRefs
5255 """
5256 self._ensure_valid_index(value)
5257
5258 # Using a DataFrame would mean coercing values to one dtype
5259 assert not isinstance(value, DataFrame)
5260 if is_dict_like(value):
5261 if not isinstance(value, Series):
5262 value = Series(value)
5263 return _reindex_for_setitem(value, self.index)
5264
5265 if is_list_like(value):
5266 com.require_length_match(value, self.index)
5267 arr = sanitize_array(value, self.index, copy=True, allow_2d=True)
5268 if (
5269 isinstance(value, Index)
5270 and value.dtype == "object"
5271 and arr.dtype != value.dtype
):
5273 # TODO: Remove kludge in sanitize_array for string mode when enforcing
5274 # this deprecation
5275 warnings.warn(
5276 "Setting an Index with object dtype into a DataFrame will stop "
5277 "inferring another dtype in a future version. Cast the Index "
5278 "explicitly before setting it into the DataFrame.",
5279 FutureWarning,
5280 stacklevel=find_stack_level(),
5281 )
5282 return arr, None
5283
5284 @property
5285 def _series(self):
5286 return {item: self._ixs(idx, axis=1) for idx, item in enumerate(self.columns)}
5287
5288 # ----------------------------------------------------------------------
5289 # Reindexing and alignment
5290
5291 def _reindex_multi(
5292 self, axes: dict[str, Index], copy: bool, fill_value
5293 ) -> DataFrame:
5294 """
5295 We are guaranteed non-Nones in the axes.
5296 """
5297
5298 new_index, row_indexer = self.index.reindex(axes["index"])
5299 new_columns, col_indexer = self.columns.reindex(axes["columns"])
5300
5301 if row_indexer is not None and col_indexer is not None:
5302 # Fastpath. By doing two 'take's at once we avoid making an
5303 # unnecessary copy.
5304 # We only get here with `self._can_fast_transpose`, which (almost)
5305 # ensures that self.values is cheap. It may be worth making this
5306 # condition more specific.
5307 indexer = row_indexer, col_indexer
5308 new_values = take_2d_multi(self.values, indexer, fill_value=fill_value)
5309 return self._constructor(
5310 new_values, index=new_index, columns=new_columns, copy=False
5311 )
5312 else:
5313 return self._reindex_with_indexers(
5314 {0: [new_index, row_indexer], 1: [new_columns, col_indexer]},
5315 copy=copy,
5316 fill_value=fill_value,
5317 )
5318
5319 @Appender(
5320 """
5321 Examples
5322 --------
5323 >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
5324
5325 Change the row labels.
5326
5327 >>> df.set_axis(['a', 'b', 'c'], axis='index')
5328 A B
5329 a 1 4
5330 b 2 5
5331 c 3 6
5332
5333 Change the column labels.
5334
5335 >>> df.set_axis(['I', 'II'], axis='columns')
5336 I II
5337 0 1 4
5338 1 2 5
5339 2 3 6
5340 """
5341 )
5342 @Substitution(
5343 klass=_shared_doc_kwargs["klass"],
5344 axes_single_arg=_shared_doc_kwargs["axes_single_arg"],
5345 extended_summary_sub=" column or",
5346 axis_description_sub=", and 1 identifies the columns",
5347 see_also_sub=" or columns",
5348 )
5349 @Appender(NDFrame.set_axis.__doc__)
5350 def set_axis(
5351 self,
5352 labels,
5353 *,
5354 axis: Axis = 0,
5355 copy: bool | None = None,
5356 ) -> DataFrame:
5357 return super().set_axis(labels, axis=axis, copy=copy)
5358
5359 @doc(
5360 NDFrame.reindex,
5361 klass=_shared_doc_kwargs["klass"],
5362 optional_reindex=_shared_doc_kwargs["optional_reindex"],
5363 )
5364 def reindex(
5365 self,
5366 labels=None,
5367 *,
5368 index=None,
5369 columns=None,
5370 axis: Axis | None = None,
5371 method: ReindexMethod | None = None,
5372 copy: bool | None = None,
5373 level: Level | None = None,
5374 fill_value: Scalar | None = np.nan,
5375 limit: int | None = None,
5376 tolerance=None,
5377 ) -> DataFrame:
5378 return super().reindex(
5379 labels=labels,
5380 index=index,
5381 columns=columns,
5382 axis=axis,
5383 method=method,
5384 copy=copy,
5385 level=level,
5386 fill_value=fill_value,
5387 limit=limit,
5388 tolerance=tolerance,
5389 )
5390
5391 @overload
5392 def drop(
5393 self,
5394 labels: IndexLabel = ...,
5395 *,
5396 axis: Axis = ...,
5397 index: IndexLabel = ...,
5398 columns: IndexLabel = ...,
5399 level: Level = ...,
5400 inplace: Literal[True],
5401 errors: IgnoreRaise = ...,
5402 ) -> None:
5403 ...
5404
5405 @overload
5406 def drop(
5407 self,
5408 labels: IndexLabel = ...,
5409 *,
5410 axis: Axis = ...,
5411 index: IndexLabel = ...,
5412 columns: IndexLabel = ...,
5413 level: Level = ...,
5414 inplace: Literal[False] = ...,
5415 errors: IgnoreRaise = ...,
5416 ) -> DataFrame:
5417 ...
5418
5419 @overload
5420 def drop(
5421 self,
5422 labels: IndexLabel = ...,
5423 *,
5424 axis: Axis = ...,
5425 index: IndexLabel = ...,
5426 columns: IndexLabel = ...,
5427 level: Level = ...,
5428 inplace: bool = ...,
5429 errors: IgnoreRaise = ...,
5430 ) -> DataFrame | None:
5431 ...
5432
5433 def drop(
5434 self,
5435 labels: IndexLabel | None = None,
5436 *,
5437 axis: Axis = 0,
5438 index: IndexLabel | None = None,
5439 columns: IndexLabel | None = None,
5440 level: Level | None = None,
5441 inplace: bool = False,
5442 errors: IgnoreRaise = "raise",
5443 ) -> DataFrame | None:
5444 """
5445 Drop specified labels from rows or columns.
5446
5447 Remove rows or columns by specifying label names and corresponding
5448 axis, or by directly specifying index or column names. When using a
5449 multi-index, labels on different levels can be removed by specifying
5450 the level. See the :ref:`user guide <advanced.shown_levels>`
5451 for more information about the now unused levels.
5452
5453 Parameters
5454 ----------
5455 labels : single label or list-like
5456 Index or column labels to drop. A tuple will be used as a single
5457 label and not treated as a list-like.
5458 axis : {0 or 'index', 1 or 'columns'}, default 0
5459 Whether to drop labels from the index (0 or 'index') or
5460 columns (1 or 'columns').
5461 index : single label or list-like
5462 Alternative to specifying axis (``labels, axis=0``
5463 is equivalent to ``index=labels``).
5464 columns : single label or list-like
5465 Alternative to specifying axis (``labels, axis=1``
5466 is equivalent to ``columns=labels``).
5467 level : int or level name, optional
5468 For MultiIndex, level from which the labels will be removed.
5469 inplace : bool, default False
5470 If False, return a copy. Otherwise, do operation
5471 in place and return None.
5472 errors : {'ignore', 'raise'}, default 'raise'
5473 If 'ignore', suppress error and only existing labels are
5474 dropped.
5475
5476 Returns
5477 -------
5478 DataFrame or None
            DataFrame without the removed index or column labels, or
            None if ``inplace=True``.
5481
5482 Raises
5483 ------
5484 KeyError
5485 If any of the labels is not found in the selected axis.
5486
5487 See Also
5488 --------
5489 DataFrame.loc : Label-location based indexer for selection by label.
5490 DataFrame.dropna : Return DataFrame with labels on given axis omitted
5491 where (all or any) data are missing.
5492 DataFrame.drop_duplicates : Return DataFrame with duplicate rows
5493 removed, optionally only considering certain columns.
5494 Series.drop : Return Series with specified index labels removed.
5495
5496 Examples
5497 --------
5498 >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
5499 ... columns=['A', 'B', 'C', 'D'])
5500 >>> df
5501 A B C D
5502 0 0 1 2 3
5503 1 4 5 6 7
5504 2 8 9 10 11
5505
5506 Drop columns
5507
5508 >>> df.drop(['B', 'C'], axis=1)
5509 A D
5510 0 0 3
5511 1 4 7
5512 2 8 11
5513
5514 >>> df.drop(columns=['B', 'C'])
5515 A D
5516 0 0 3
5517 1 4 7
5518 2 8 11
5519
5520 Drop a row by index
5521
5522 >>> df.drop([0, 1])
5523 A B C D
5524 2 8 9 10 11
5525
5526 Drop columns and/or rows of MultiIndex DataFrame
5527
5528 >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'],
5529 ... ['speed', 'weight', 'length']],
5530 ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
5531 ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
5532 >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
5533 ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
5534 ... [250, 150], [1.5, 0.8], [320, 250],
5535 ... [1, 0.8], [0.3, 0.2]])
5536 >>> df
5537 big small
5538 llama speed 45.0 30.0
5539 weight 200.0 100.0
5540 length 1.5 1.0
5541 cow speed 30.0 20.0
5542 weight 250.0 150.0
5543 length 1.5 0.8
5544 falcon speed 320.0 250.0
5545 weight 1.0 0.8
5546 length 0.3 0.2
5547
5548 Drop a specific index combination from the MultiIndex
5549 DataFrame, i.e., drop the combination ``'falcon'`` and
5550 ``'weight'``, which deletes only the corresponding row
5551
5552 >>> df.drop(index=('falcon', 'weight'))
5553 big small
5554 llama speed 45.0 30.0
5555 weight 200.0 100.0
5556 length 1.5 1.0
5557 cow speed 30.0 20.0
5558 weight 250.0 150.0
5559 length 1.5 0.8
5560 falcon speed 320.0 250.0
5561 length 0.3 0.2
5562
5563 >>> df.drop(index='cow', columns='small')
5564 big
5565 llama speed 45.0
5566 weight 200.0
5567 length 1.5
5568 falcon speed 320.0
5569 weight 1.0
5570 length 0.3
5571
5572 >>> df.drop(index='length', level=1)
5573 big small
5574 llama speed 45.0 30.0
5575 weight 200.0 100.0
5576 cow speed 30.0 20.0
5577 weight 250.0 150.0
5578 falcon speed 320.0 250.0
5579 weight 1.0 0.8
5580 """
5581 return super().drop(
5582 labels=labels,
5583 axis=axis,
5584 index=index,
5585 columns=columns,
5586 level=level,
5587 inplace=inplace,
5588 errors=errors,
5589 )
5590
5591 @overload
5592 def rename(
5593 self,
5594 mapper: Renamer | None = ...,
5595 *,
5596 index: Renamer | None = ...,
5597 columns: Renamer | None = ...,
5598 axis: Axis | None = ...,
5599 copy: bool | None = ...,
5600 inplace: Literal[True],
5601 level: Level = ...,
5602 errors: IgnoreRaise = ...,
5603 ) -> None:
5604 ...
5605
5606 @overload
5607 def rename(
5608 self,
5609 mapper: Renamer | None = ...,
5610 *,
5611 index: Renamer | None = ...,
5612 columns: Renamer | None = ...,
5613 axis: Axis | None = ...,
5614 copy: bool | None = ...,
5615 inplace: Literal[False] = ...,
5616 level: Level = ...,
5617 errors: IgnoreRaise = ...,
5618 ) -> DataFrame:
5619 ...
5620
5621 @overload
5622 def rename(
5623 self,
5624 mapper: Renamer | None = ...,
5625 *,
5626 index: Renamer | None = ...,
5627 columns: Renamer | None = ...,
5628 axis: Axis | None = ...,
5629 copy: bool | None = ...,
5630 inplace: bool = ...,
5631 level: Level = ...,
5632 errors: IgnoreRaise = ...,
5633 ) -> DataFrame | None:
5634 ...
5635
5636 def rename(
5637 self,
5638 mapper: Renamer | None = None,
5639 *,
5640 index: Renamer | None = None,
5641 columns: Renamer | None = None,
5642 axis: Axis | None = None,
5643 copy: bool | None = None,
5644 inplace: bool = False,
5645 level: Level | None = None,
5646 errors: IgnoreRaise = "ignore",
5647 ) -> DataFrame | None:
5648 """
5649 Rename columns or index labels.
5650
5651 Function / dict values must be unique (1-to-1). Labels not contained in
5652 a dict / Series will be left as-is. Extra labels listed don't throw an
5653 error.
5654
5655 See the :ref:`user guide <basics.rename>` for more.
5656
5657 Parameters
5658 ----------
5659 mapper : dict-like or function
5660 Dict-like or function transformations to apply to
5661 that axis' values. Use either ``mapper`` and ``axis`` to
5662 specify the axis to target with ``mapper``, or ``index`` and
5663 ``columns``.
5664 index : dict-like or function
5665 Alternative to specifying axis (``mapper, axis=0``
5666 is equivalent to ``index=mapper``).
5667 columns : dict-like or function
5668 Alternative to specifying axis (``mapper, axis=1``
5669 is equivalent to ``columns=mapper``).
5670 axis : {0 or 'index', 1 or 'columns'}, default 0
5671 Axis to target with ``mapper``. Can be either the axis name
5672 ('index', 'columns') or number (0, 1). The default is 'index'.
5673 copy : bool, default True
5674 Also copy underlying data.
5675
5676 .. note::
5677 The `copy` keyword will change behavior in pandas 3.0.
5678 `Copy-on-Write
5679 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
5680 will be enabled by default, which means that all methods with a
5681 `copy` keyword will use a lazy copy mechanism to defer the copy and
5682 ignore the `copy` keyword. The `copy` keyword will be removed in a
5683 future version of pandas.
5684
5685 You can already get the future behavior and improvements through
5686 enabling copy on write ``pd.options.mode.copy_on_write = True``
5687 inplace : bool, default False
5688 Whether to modify the DataFrame rather than creating a new one.
5689 If True then value of copy is ignored.
5690 level : int or level name, default None
5691 In case of a MultiIndex, only rename labels in the specified
5692 level.
5693 errors : {'ignore', 'raise'}, default 'ignore'
5694 If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,
5695 or `columns` contains labels that are not present in the Index
5696 being transformed.
5697 If 'ignore', existing keys will be renamed and extra keys will be
5698 ignored.
5699
5700 Returns
5701 -------
5702 DataFrame or None
5703 DataFrame with the renamed axis labels or None if ``inplace=True``.
5704
5705 Raises
5706 ------
5707 KeyError
5708 If any of the labels is not found in the selected axis and
5709 "errors='raise'".
5710
5711 See Also
5712 --------
5713 DataFrame.rename_axis : Set the name of the axis.
5714
5715 Examples
5716 --------
5717 ``DataFrame.rename`` supports two calling conventions
5718
5719 * ``(index=index_mapper, columns=columns_mapper, ...)``
5720 * ``(mapper, axis={'index', 'columns'}, ...)``
5721
5722 We *highly* recommend using keyword arguments to clarify your
5723 intent.
5724
5725 Rename columns using a mapping:
5726
5727 >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
5728 >>> df.rename(columns={"A": "a", "B": "c"})
5729 a c
5730 0 1 4
5731 1 2 5
5732 2 3 6
5733
5734 Rename index using a mapping:
5735
5736 >>> df.rename(index={0: "x", 1: "y", 2: "z"})
5737 A B
5738 x 1 4
5739 y 2 5
5740 z 3 6
5741
5742 Cast index labels to a different type:
5743
5744 >>> df.index
5745 RangeIndex(start=0, stop=3, step=1)
5746 >>> df.rename(index=str).index
5747 Index(['0', '1', '2'], dtype='object')
5748
5749 >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise")
5750 Traceback (most recent call last):
5751 KeyError: ['C'] not found in axis
5752
5753 Using axis-style parameters:
5754
5755 >>> df.rename(str.lower, axis='columns')
5756 a b
5757 0 1 4
5758 1 2 5
5759 2 3 6
5760
5761 >>> df.rename({1: 2, 2: 4}, axis='index')
5762 A B
5763 0 1 4
5764 2 2 5
5765 4 3 6
5766 """
5767 return super()._rename(
5768 mapper=mapper,
5769 index=index,
5770 columns=columns,
5771 axis=axis,
5772 copy=copy,
5773 inplace=inplace,
5774 level=level,
5775 errors=errors,
5776 )
5777
5778 def pop(self, item: Hashable) -> Series:
5779 """
5780 Return item and drop from frame. Raise KeyError if not found.
5781
5782 Parameters
5783 ----------
5784 item : label
5785 Label of column to be popped.
5786
5787 Returns
5788 -------
5789 Series
5790
5791 Examples
5792 --------
5793 >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
5794 ... ('parrot', 'bird', 24.0),
5795 ... ('lion', 'mammal', 80.5),
5796 ... ('monkey', 'mammal', np.nan)],
5797 ... columns=('name', 'class', 'max_speed'))
5798 >>> df
5799 name class max_speed
5800 0 falcon bird 389.0
5801 1 parrot bird 24.0
5802 2 lion mammal 80.5
5803 3 monkey mammal NaN
5804
5805 >>> df.pop('class')
5806 0 bird
5807 1 bird
5808 2 mammal
5809 3 mammal
5810 Name: class, dtype: object
5811
5812 >>> df
5813 name max_speed
5814 0 falcon 389.0
5815 1 parrot 24.0
5816 2 lion 80.5
5817 3 monkey NaN
5818 """
5819 return super().pop(item=item)
5820
5821 def _replace_columnwise(
5822 self, mapping: dict[Hashable, tuple[Any, Any]], inplace: bool, regex
5823 ):
5824 """
5825 Dispatch to Series.replace column-wise.
5826
5827 Parameters
5828 ----------
5829 mapping : dict
5830 of the form {col: (target, value)}
5831 inplace : bool
5832 regex : bool or same types as `to_replace` in DataFrame.replace
5833
5834 Returns
5835 -------
5836 DataFrame or None
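
        Examples
        --------
        A minimal sketch of the expected ``mapping`` format; the frame and
        values here are hypothetical and shown only to illustrate the call.

        >>> df = pd.DataFrame({"a": [0, 1], "b": ["x", "y"]})
        >>> df._replace_columnwise({"a": (0, 9)}, inplace=False, regex=False)
           a  b
        0  9  x
        1  1  y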
5837 """
5838 # Operate column-wise
5839 res = self if inplace else self.copy(deep=None)
5840 ax = self.columns
5841
5842 for i, ax_value in enumerate(ax):
5843 if ax_value in mapping:
5844 ser = self.iloc[:, i]
5845
5846 target, value = mapping[ax_value]
5847 newobj = ser.replace(target, value, regex=regex)
5848
5849 res._iset_item(i, newobj, inplace=inplace)
5850
5851 if inplace:
5852 return
5853 return res.__finalize__(self)
5854
5855 @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"])
5856 def shift(
5857 self,
5858 periods: int | Sequence[int] = 1,
5859 freq: Frequency | None = None,
5860 axis: Axis = 0,
5861 fill_value: Hashable = lib.no_default,
5862 suffix: str | None = None,
5863 ) -> DataFrame:
5864 if freq is not None and fill_value is not lib.no_default:
5865 # GH#53832
5866 warnings.warn(
5867 "Passing a 'freq' together with a 'fill_value' silently ignores "
5868 "the fill_value and is deprecated. This will raise in a future "
5869 "version.",
5870 FutureWarning,
5871 stacklevel=find_stack_level(),
5872 )
5873 fill_value = lib.no_default
5874
5875 if self.empty:
5876 return self.copy()
5877
5878 axis = self._get_axis_number(axis)
5879
5880 if is_list_like(periods):
5881 periods = cast(Sequence, periods)
5882 if axis == 1:
5883 raise ValueError(
5884 "If `periods` contains multiple shifts, `axis` cannot be 1."
5885 )
5886 if len(periods) == 0:
5887 raise ValueError("If `periods` is an iterable, it cannot be empty.")
5888 from pandas.core.reshape.concat import concat
5889
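            # With periods=[1, 2], column "a" becomes "a_1" and "a_2" in the
            # concatenated result (or "a<suffix>_1", "a<suffix>_2" when a
            # suffix is given).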
5890 shifted_dataframes = []
5891 for period in periods:
5892 if not is_integer(period):
5893 raise TypeError(
5894 f"Periods must be integer, but {period} is {type(period)}."
5895 )
5896 period = cast(int, period)
5897 shifted_dataframes.append(
5898 super()
5899 .shift(periods=period, freq=freq, axis=axis, fill_value=fill_value)
5900 .add_suffix(f"{suffix}_{period}" if suffix else f"_{period}")
5901 )
5902 return concat(shifted_dataframes, axis=1)
5903 elif suffix:
5904 raise ValueError("Cannot specify `suffix` if `periods` is an int.")
5905 periods = cast(int, periods)
5906
5907 ncols = len(self.columns)
5908 arrays = self._mgr.arrays
5909 if axis == 1 and periods != 0 and ncols > 0 and freq is None:
5910 if fill_value is lib.no_default:
5911 # We will infer fill_value to match the closest column
5912
5913 # Use a column that we know is valid for our column's dtype GH#38434
5914 label = self.columns[0]
5915
5916 if periods > 0:
5917 result = self.iloc[:, :-periods]
5918 for col in range(min(ncols, abs(periods))):
5919 # TODO(EA2D): doing this in a loop unnecessary with 2D EAs
5920 # Define filler inside loop so we get a copy
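                        # Shifting by the full length produces an all-NA column
                        # compatible with this column's dtype (e.g. NaT for
                        # datetime columns).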
5921 filler = self.iloc[:, 0].shift(len(self))
5922 result.insert(0, label, filler, allow_duplicates=True)
5923 else:
5924 result = self.iloc[:, -periods:]
5925 for col in range(min(ncols, abs(periods))):
5926 # Define filler inside loop so we get a copy
5927 filler = self.iloc[:, -1].shift(len(self))
5928 result.insert(
5929 len(result.columns), label, filler, allow_duplicates=True
5930 )
5931
5932 result.columns = self.columns.copy()
5933 return result
5934 elif len(arrays) > 1 or (
5935 # If we only have one block and we know that we can't
5936 # keep the same dtype (i.e. the _can_hold_element check)
5937 # then we can go through the reindex_indexer path
5938 # (and avoid casting logic in the Block method).
5939 not can_hold_element(arrays[0], fill_value)
5940 ):
5941 # GH#35488 we need to watch out for multi-block cases
5942 # We only get here with fill_value not-lib.no_default
5943 nper = abs(periods)
5944 nper = min(nper, ncols)
5945 if periods > 0:
5946 indexer = np.array(
5947 [-1] * nper + list(range(ncols - periods)), dtype=np.intp
5948 )
5949 else:
5950 indexer = np.array(
5951 list(range(nper, ncols)) + [-1] * nper, dtype=np.intp
5952 )
5953 mgr = self._mgr.reindex_indexer(
5954 self.columns,
5955 indexer,
5956 axis=0,
5957 fill_value=fill_value,
5958 allow_dups=True,
5959 )
5960 res_df = self._constructor_from_mgr(mgr, axes=mgr.axes)
5961 return res_df.__finalize__(self, method="shift")
5962 else:
5963 return self.T.shift(periods=periods, fill_value=fill_value).T
5964
5965 return super().shift(
5966 periods=periods, freq=freq, axis=axis, fill_value=fill_value
5967 )
5968
5969 @overload
5970 def set_index(
5971 self,
5972 keys,
5973 *,
5974 drop: bool = ...,
5975 append: bool = ...,
5976 inplace: Literal[False] = ...,
5977 verify_integrity: bool = ...,
5978 ) -> DataFrame:
5979 ...
5980
5981 @overload
5982 def set_index(
5983 self,
5984 keys,
5985 *,
5986 drop: bool = ...,
5987 append: bool = ...,
5988 inplace: Literal[True],
5989 verify_integrity: bool = ...,
5990 ) -> None:
5991 ...
5992
5993 def set_index(
5994 self,
5995 keys,
5996 *,
5997 drop: bool = True,
5998 append: bool = False,
5999 inplace: bool = False,
6000 verify_integrity: bool = False,
6001 ) -> DataFrame | None:
6002 """
6003 Set the DataFrame index using existing columns.
6004
6005 Set the DataFrame index (row labels) using one or more existing
6006 columns or arrays (of the correct length). The index can replace the
6007 existing index or expand on it.
6008
6009 Parameters
6010 ----------
6011 keys : label or array-like or list of labels/arrays
6012 This parameter can be either a single column key, a single array of
6013 the same length as the calling DataFrame, or a list containing an
6014 arbitrary combination of column keys and arrays. Here, "array"
6015 encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and
6016 instances of :class:`~collections.abc.Iterator`.
6017 drop : bool, default True
6018 Delete columns to be used as the new index.
6019 append : bool, default False
6020 Whether to append columns to existing index.
6021 inplace : bool, default False
6022 Whether to modify the DataFrame rather than creating a new one.
6023 verify_integrity : bool, default False
6024 Check the new index for duplicates. Otherwise defer the check until
6025 necessary. Setting to False will improve the performance of this
6026 method.
6027
6028 Returns
6029 -------
6030 DataFrame or None
6031 Changed row labels or None if ``inplace=True``.
6032
6033 See Also
6034 --------
6035 DataFrame.reset_index : Opposite of set_index.
6036 DataFrame.reindex : Change to new indices or expand indices.
6037 DataFrame.reindex_like : Change to same indices as other DataFrame.
6038
6039 Examples
6040 --------
6041 >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
6042 ... 'year': [2012, 2014, 2013, 2014],
6043 ... 'sale': [55, 40, 84, 31]})
6044 >>> df
6045 month year sale
6046 0 1 2012 55
6047 1 4 2014 40
6048 2 7 2013 84
6049 3 10 2014 31
6050
6051 Set the index to become the 'month' column:
6052
6053 >>> df.set_index('month')
6054 year sale
6055 month
6056 1 2012 55
6057 4 2014 40
6058 7 2013 84
6059 10 2014 31
6060
6061 Create a MultiIndex using columns 'year' and 'month':
6062
6063 >>> df.set_index(['year', 'month'])
6064 sale
6065 year month
6066 2012 1 55
6067 2014 4 40
6068 2013 7 84
6069 2014 10 31
6070
6071 Create a MultiIndex using an Index and a column:
6072
6073 >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
6074 month sale
6075 year
6076 1 2012 1 55
6077 2 2014 4 40
6078 3 2013 7 84
6079 4 2014 10 31
6080
6081 Create a MultiIndex using two Series:
6082
6083 >>> s = pd.Series([1, 2, 3, 4])
6084 >>> df.set_index([s, s**2])
6085 month year sale
6086 1 1 1 2012 55
6087 2 4 4 2014 40
6088 3 9 7 2013 84
6089 4 16 10 2014 31
6090 """
6091 inplace = validate_bool_kwarg(inplace, "inplace")
6092 self._check_inplace_and_allows_duplicate_labels(inplace)
6093 if not isinstance(keys, list):
6094 keys = [keys]
6095
6096 err_msg = (
6097 'The parameter "keys" may be a column key, one-dimensional '
6098 "array, or a list containing only valid column keys and "
6099 "one-dimensional arrays."
6100 )
6101
6102 missing: list[Hashable] = []
6103 for col in keys:
6104 if isinstance(col, (Index, Series, np.ndarray, list, abc.Iterator)):
6105 # arrays are fine as long as they are one-dimensional
6106 # iterators get converted to list below
6107 if getattr(col, "ndim", 1) != 1:
6108 raise ValueError(err_msg)
6109 else:
6110 # everything else gets tried as a key; see GH 24969
6111 try:
6112 found = col in self.columns
6113 except TypeError as err:
6114 raise TypeError(
6115 f"{err_msg}. Received column of type {type(col)}"
6116 ) from err
6117 else:
6118 if not found:
6119 missing.append(col)
6120
6121 if missing:
6122 raise KeyError(f"None of {missing} are in the columns")
6123
6124 if inplace:
6125 frame = self
6126 else:
6127 # GH 49473 Use "lazy copy" with Copy-on-Write
6128 frame = self.copy(deep=None)
6129
6130 arrays: list[Index] = []
6131 names: list[Hashable] = []
6132 if append:
6133 names = list(self.index.names)
6134 if isinstance(self.index, MultiIndex):
6135 arrays.extend(
6136 self.index._get_level_values(i) for i in range(self.index.nlevels)
6137 )
6138 else:
6139 arrays.append(self.index)
6140
6141 to_remove: list[Hashable] = []
6142 for col in keys:
6143 if isinstance(col, MultiIndex):
6144 arrays.extend(col._get_level_values(n) for n in range(col.nlevels))
6145 names.extend(col.names)
6146 elif isinstance(col, (Index, Series)):
6147 # if Index then not MultiIndex (treated above)
6148
6149 # error: Argument 1 to "append" of "list" has incompatible type
6150 # "Union[Index, Series]"; expected "Index"
6151 arrays.append(col) # type: ignore[arg-type]
6152 names.append(col.name)
6153 elif isinstance(col, (list, np.ndarray)):
6154 # error: Argument 1 to "append" of "list" has incompatible type
6155 # "Union[List[Any], ndarray]"; expected "Index"
6156 arrays.append(col) # type: ignore[arg-type]
6157 names.append(None)
6158 elif isinstance(col, abc.Iterator):
6159 # error: Argument 1 to "append" of "list" has incompatible type
6160 # "List[Any]"; expected "Index"
6161 arrays.append(list(col)) # type: ignore[arg-type]
6162 names.append(None)
6163 # from here, col can only be a column label
6164 else:
6165 arrays.append(frame[col])
6166 names.append(col)
6167 if drop:
6168 to_remove.append(col)
6169
6170 if len(arrays[-1]) != len(self):
6171 # check newest element against length of calling frame, since
6172 # ensure_index_from_sequences would not raise for append=False.
6173 raise ValueError(
6174 f"Length mismatch: Expected {len(self)} rows, "
6175 f"received array of length {len(arrays[-1])}"
6176 )
6177
6178 index = ensure_index_from_sequences(arrays, names)
6179
6180 if verify_integrity and not index.is_unique:
6181 duplicates = index[index.duplicated()].unique()
6182 raise ValueError(f"Index has duplicate keys: {duplicates}")
6183
6184 # use set to handle duplicate column names gracefully in case of drop
6185 for c in set(to_remove):
6186 del frame[c]
6187
6188 # clear up memory usage
6189 index._cleanup()
6190
6191 frame.index = index
6192
6193 if not inplace:
6194 return frame
6195 return None
6196
6197 @overload
6198 def reset_index(
6199 self,
6200 level: IndexLabel = ...,
6201 *,
6202 drop: bool = ...,
6203 inplace: Literal[False] = ...,
6204 col_level: Hashable = ...,
6205 col_fill: Hashable = ...,
6206 allow_duplicates: bool | lib.NoDefault = ...,
6207 names: Hashable | Sequence[Hashable] | None = None,
6208 ) -> DataFrame:
6209 ...
6210
6211 @overload
6212 def reset_index(
6213 self,
6214 level: IndexLabel = ...,
6215 *,
6216 drop: bool = ...,
6217 inplace: Literal[True],
6218 col_level: Hashable = ...,
6219 col_fill: Hashable = ...,
6220 allow_duplicates: bool | lib.NoDefault = ...,
6221 names: Hashable | Sequence[Hashable] | None = None,
6222 ) -> None:
6223 ...
6224
6225 @overload
6226 def reset_index(
6227 self,
6228 level: IndexLabel = ...,
6229 *,
6230 drop: bool = ...,
6231 inplace: bool = ...,
6232 col_level: Hashable = ...,
6233 col_fill: Hashable = ...,
6234 allow_duplicates: bool | lib.NoDefault = ...,
6235 names: Hashable | Sequence[Hashable] | None = None,
6236 ) -> DataFrame | None:
6237 ...
6238
6239 def reset_index(
6240 self,
6241 level: IndexLabel | None = None,
6242 *,
6243 drop: bool = False,
6244 inplace: bool = False,
6245 col_level: Hashable = 0,
6246 col_fill: Hashable = "",
6247 allow_duplicates: bool | lib.NoDefault = lib.no_default,
6248 names: Hashable | Sequence[Hashable] | None = None,
6249 ) -> DataFrame | None:
6250 """
6251 Reset the index, or a level of it.
6252
6253 Reset the index of the DataFrame, and use the default one instead.
6254 If the DataFrame has a MultiIndex, this method can remove one or more
6255 levels.
6256
6257 Parameters
6258 ----------
6259 level : int, str, tuple, or list, default None
6260 Only remove the given levels from the index. Removes all levels by
6261 default.
6262 drop : bool, default False
            Do not try to insert index into DataFrame columns. This resets
6264 the index to the default integer index.
6265 inplace : bool, default False
6266 Whether to modify the DataFrame rather than creating a new one.
6267 col_level : int or str, default 0
6268 If the columns have multiple levels, determines which level the
6269 labels are inserted into. By default it is inserted into the first
6270 level.
6271 col_fill : object, default ''
6272 If the columns have multiple levels, determines how the other
6273 levels are named. If None then the index name is repeated.
6274 allow_duplicates : bool, optional, default lib.no_default
6275 Allow duplicate column labels to be created.
6276
6277 .. versionadded:: 1.5.0
6278
6279 names : int, str or 1-dimensional list, default None
6280 Using the given string, rename the DataFrame column which contains the
6281 index data. If the DataFrame has a MultiIndex, this has to be a list or
6282 tuple with length equal to the number of levels.
6283
6284 .. versionadded:: 1.5.0
6285
6286 Returns
6287 -------
6288 DataFrame or None
6289 DataFrame with the new index or None if ``inplace=True``.
6290
6291 See Also
6292 --------
6293 DataFrame.set_index : Opposite of reset_index.
6294 DataFrame.reindex : Change to new indices or expand indices.
6295 DataFrame.reindex_like : Change to same indices as other DataFrame.
6296
6297 Examples
6298 --------
6299 >>> df = pd.DataFrame([('bird', 389.0),
6300 ... ('bird', 24.0),
6301 ... ('mammal', 80.5),
6302 ... ('mammal', np.nan)],
6303 ... index=['falcon', 'parrot', 'lion', 'monkey'],
6304 ... columns=('class', 'max_speed'))
6305 >>> df
6306 class max_speed
6307 falcon bird 389.0
6308 parrot bird 24.0
6309 lion mammal 80.5
6310 monkey mammal NaN
6311
6312 When we reset the index, the old index is added as a column, and a
6313 new sequential index is used:
6314
6315 >>> df.reset_index()
6316 index class max_speed
6317 0 falcon bird 389.0
6318 1 parrot bird 24.0
6319 2 lion mammal 80.5
6320 3 monkey mammal NaN
6321
6322 We can use the `drop` parameter to avoid the old index being added as
6323 a column:
6324
6325 >>> df.reset_index(drop=True)
6326 class max_speed
6327 0 bird 389.0
6328 1 bird 24.0
6329 2 mammal 80.5
6330 3 mammal NaN
6331
6332 You can also use `reset_index` with `MultiIndex`.
6333
6334 >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
6335 ... ('bird', 'parrot'),
6336 ... ('mammal', 'lion'),
6337 ... ('mammal', 'monkey')],
6338 ... names=['class', 'name'])
6339 >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),
6340 ... ('species', 'type')])
6341 >>> df = pd.DataFrame([(389.0, 'fly'),
6342 ... (24.0, 'fly'),
6343 ... (80.5, 'run'),
6344 ... (np.nan, 'jump')],
6345 ... index=index,
6346 ... columns=columns)
6347 >>> df
6348 speed species
6349 max type
6350 class name
6351 bird falcon 389.0 fly
6352 parrot 24.0 fly
6353 mammal lion 80.5 run
6354 monkey NaN jump
6355
6356 Using the `names` parameter, choose a name for the index column:
6357
6358 >>> df.reset_index(names=['classes', 'names'])
6359 classes names speed species
6360 max type
6361 0 bird falcon 389.0 fly
6362 1 bird parrot 24.0 fly
6363 2 mammal lion 80.5 run
6364 3 mammal monkey NaN jump
6365
6366 If the index has multiple levels, we can reset a subset of them:
6367
6368 >>> df.reset_index(level='class')
6369 class speed species
6370 max type
6371 name
6372 falcon bird 389.0 fly
6373 parrot bird 24.0 fly
6374 lion mammal 80.5 run
6375 monkey mammal NaN jump
6376
6377 If we are not dropping the index, by default, it is placed in the top
6378 level. We can place it in another level:
6379
6380 >>> df.reset_index(level='class', col_level=1)
6381 speed species
6382 class max type
6383 name
6384 falcon bird 389.0 fly
6385 parrot bird 24.0 fly
6386 lion mammal 80.5 run
6387 monkey mammal NaN jump
6388
6389 When the index is inserted under another level, we can specify under
6390 which one with the parameter `col_fill`:
6391
6392 >>> df.reset_index(level='class', col_level=1, col_fill='species')
6393 species speed species
6394 class max type
6395 name
6396 falcon bird 389.0 fly
6397 parrot bird 24.0 fly
6398 lion mammal 80.5 run
6399 monkey mammal NaN jump
6400
6401 If we specify a nonexistent level for `col_fill`, it is created:
6402
6403 >>> df.reset_index(level='class', col_level=1, col_fill='genus')
6404 genus speed species
6405 class max type
6406 name
6407 falcon bird 389.0 fly
6408 parrot bird 24.0 fly
6409 lion mammal 80.5 run
6410 monkey mammal NaN jump
6411 """
6412 inplace = validate_bool_kwarg(inplace, "inplace")
6413 self._check_inplace_and_allows_duplicate_labels(inplace)
6414 if inplace:
6415 new_obj = self
6416 else:
6417 new_obj = self.copy(deep=None)
6418 if allow_duplicates is not lib.no_default:
6419 allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates")
6420
6421 new_index = default_index(len(new_obj))
6422 if level is not None:
6423 if not isinstance(level, (tuple, list)):
6424 level = [level]
6425 level = [self.index._get_level_number(lev) for lev in level]
6426 if len(level) < self.index.nlevels:
6427 new_index = self.index.droplevel(level)
6428
6429 if not drop:
6430 to_insert: Iterable[tuple[Any, Any | None]]
6431
6432 default = "index" if "index" not in self else "level_0"
6433 names = self.index._get_default_index_names(names, default)
6434
6435 if isinstance(self.index, MultiIndex):
6436 to_insert = zip(self.index.levels, self.index.codes)
6437 else:
6438 to_insert = ((self.index, None),)
6439
6440 multi_col = isinstance(self.columns, MultiIndex)
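            # Each new column is inserted at position 0, so iterate the levels
            # in reverse to keep the final column order aligned with the index
            # level order (level 0 ends up leftmost).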
6441 for i, (lev, lab) in reversed(list(enumerate(to_insert))):
6442 if level is not None and i not in level:
6443 continue
6444 name = names[i]
6445 if multi_col:
6446 col_name = list(name) if isinstance(name, tuple) else [name]
6447 if col_fill is None:
6448 if len(col_name) not in (1, self.columns.nlevels):
6449 raise ValueError(
6450 "col_fill=None is incompatible "
6451 f"with incomplete column name {name}"
6452 )
6453 col_fill = col_name[0]
6454
6455 lev_num = self.columns._get_level_number(col_level)
6456 name_lst = [col_fill] * lev_num + col_name
6457 missing = self.columns.nlevels - len(name_lst)
6458 name_lst += [col_fill] * missing
6459 name = tuple(name_lst)
6460
6461 # to ndarray and maybe infer different dtype
6462 level_values = lev._values
6463 if level_values.dtype == np.object_:
6464 level_values = lib.maybe_convert_objects(level_values)
6465
6466 if lab is not None:
6467 # if we have the codes, extract the values with a mask
6468 level_values = algorithms.take(
6469 level_values, lab, allow_fill=True, fill_value=lev._na_value
6470 )
6471
6472 new_obj.insert(
6473 0,
6474 name,
6475 level_values,
6476 allow_duplicates=allow_duplicates,
6477 )
6478
6479 new_obj.index = new_index
6480 if not inplace:
6481 return new_obj
6482
6483 return None
6484
6485 # ----------------------------------------------------------------------
6486 # Reindex-based selection methods
6487
6488 @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
6489 def isna(self) -> DataFrame:
6490 res_mgr = self._mgr.isna(func=isna)
6491 result = self._constructor_from_mgr(res_mgr, axes=res_mgr.axes)
6492 return result.__finalize__(self, method="isna")
6493
6494 @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
6495 def isnull(self) -> DataFrame:
6496 """
6497 DataFrame.isnull is an alias for DataFrame.isna.
6498 """
6499 return self.isna()
6500
6501 @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
6502 def notna(self) -> DataFrame:
6503 return ~self.isna()
6504
6505 @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
6506 def notnull(self) -> DataFrame:
6507 """
6508 DataFrame.notnull is an alias for DataFrame.notna.
6509 """
6510 return ~self.isna()
6511
6512 @overload
6513 def dropna(
6514 self,
6515 *,
6516 axis: Axis = ...,
6517 how: AnyAll | lib.NoDefault = ...,
6518 thresh: int | lib.NoDefault = ...,
6519 subset: IndexLabel = ...,
6520 inplace: Literal[False] = ...,
6521 ignore_index: bool = ...,
6522 ) -> DataFrame:
6523 ...
6524
6525 @overload
6526 def dropna(
6527 self,
6528 *,
6529 axis: Axis = ...,
6530 how: AnyAll | lib.NoDefault = ...,
6531 thresh: int | lib.NoDefault = ...,
6532 subset: IndexLabel = ...,
6533 inplace: Literal[True],
6534 ignore_index: bool = ...,
6535 ) -> None:
6536 ...
6537
6538 def dropna(
6539 self,
6540 *,
6541 axis: Axis = 0,
6542 how: AnyAll | lib.NoDefault = lib.no_default,
6543 thresh: int | lib.NoDefault = lib.no_default,
6544 subset: IndexLabel | None = None,
6545 inplace: bool = False,
6546 ignore_index: bool = False,
6547 ) -> DataFrame | None:
6548 """
6549 Remove missing values.
6550
6551 See the :ref:`User Guide <missing_data>` for more on which values are
6552 considered missing, and how to work with missing data.
6553
6554 Parameters
6555 ----------
6556 axis : {0 or 'index', 1 or 'columns'}, default 0
6557 Determine if rows or columns which contain missing values are
6558 removed.
6559
6560 * 0, or 'index' : Drop rows which contain missing values.
            * 1, or 'columns' : Drop columns which contain missing values.
6562
6563 Only a single axis is allowed.
6564
6565 how : {'any', 'all'}, default 'any'
6566 Determine if row or column is removed from DataFrame, when we have
6567 at least one NA or all NA.
6568
6569 * 'any' : If any NA values are present, drop that row or column.
6570 * 'all' : If all values are NA, drop that row or column.
6571
6572 thresh : int, optional
6573 Require that many non-NA values. Cannot be combined with how.
6574 subset : column label or sequence of labels, optional
6575 Labels along other axis to consider, e.g. if you are dropping rows
6576 these would be a list of columns to include.
6577 inplace : bool, default False
6578 Whether to modify the DataFrame rather than creating a new one.
6579 ignore_index : bool, default ``False``
6580 If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
6581
6582 .. versionadded:: 2.0.0
6583
6584 Returns
6585 -------
6586 DataFrame or None
6587 DataFrame with NA entries dropped from it or None if ``inplace=True``.
6588
6589 See Also
6590 --------
6591 DataFrame.isna: Indicate missing values.
6592 DataFrame.notna : Indicate existing (non-missing) values.
6593 DataFrame.fillna : Replace missing values.
6594 Series.dropna : Drop missing values.
6595 Index.dropna : Drop missing indices.
6596
6597 Examples
6598 --------
6599 >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
6600 ... "toy": [np.nan, 'Batmobile', 'Bullwhip'],
6601 ... "born": [pd.NaT, pd.Timestamp("1940-04-25"),
6602 ... pd.NaT]})
6603 >>> df
6604 name toy born
6605 0 Alfred NaN NaT
6606 1 Batman Batmobile 1940-04-25
6607 2 Catwoman Bullwhip NaT
6608
6609 Drop the rows where at least one element is missing.
6610
6611 >>> df.dropna()
6612 name toy born
6613 1 Batman Batmobile 1940-04-25
6614
6615 Drop the columns where at least one element is missing.
6616
6617 >>> df.dropna(axis='columns')
6618 name
6619 0 Alfred
6620 1 Batman
6621 2 Catwoman
6622
6623 Drop the rows where all elements are missing.
6624
6625 >>> df.dropna(how='all')
6626 name toy born
6627 0 Alfred NaN NaT
6628 1 Batman Batmobile 1940-04-25
6629 2 Catwoman Bullwhip NaT
6630
6631 Keep only the rows with at least 2 non-NA values.
6632
6633 >>> df.dropna(thresh=2)
6634 name toy born
6635 1 Batman Batmobile 1940-04-25
6636 2 Catwoman Bullwhip NaT
6637
6638 Define in which columns to look for missing values.
6639
6640 >>> df.dropna(subset=['name', 'toy'])
6641 name toy born
6642 1 Batman Batmobile 1940-04-25
6643 2 Catwoman Bullwhip NaT
6644 """
6645 if (how is not lib.no_default) and (thresh is not lib.no_default):
6646 raise TypeError(
6647 "You cannot set both the how and thresh arguments at the same time."
6648 )
6649
6650 if how is lib.no_default:
6651 how = "any"
6652
6653 inplace = validate_bool_kwarg(inplace, "inplace")
6654 if isinstance(axis, (tuple, list)):
6655 # GH20987
6656 raise TypeError("supplying multiple axes to axis is no longer supported.")
6657
6658 axis = self._get_axis_number(axis)
6659 agg_axis = 1 - axis
6660
6661 agg_obj = self
6662 if subset is not None:
6663 # subset needs to be list
6664 if not is_list_like(subset):
6665 subset = [subset]
6666 ax = self._get_axis(agg_axis)
6667 indices = ax.get_indexer_for(subset)
6668 check = indices == -1
6669 if check.any():
6670 raise KeyError(np.array(subset)[check].tolist())
6671 agg_obj = self.take(indices, axis=agg_axis)
6672
6673 if thresh is not lib.no_default:
6674 count = agg_obj.count(axis=agg_axis)
6675 mask = count >= thresh
6676 elif how == "any":
6677 # faster equivalent to 'agg_obj.count(agg_axis) == self.shape[agg_axis]'
6678 mask = notna(agg_obj).all(axis=agg_axis, bool_only=False)
6679 elif how == "all":
6680 # faster equivalent to 'agg_obj.count(agg_axis) > 0'
6681 mask = notna(agg_obj).any(axis=agg_axis, bool_only=False)
6682 else:
6683 raise ValueError(f"invalid how option: {how}")
6684
6685 if np.all(mask):
6686 result = self.copy(deep=None)
6687 else:
6688 result = self.loc(axis=axis)[mask]
6689
6690 if ignore_index:
6691 result.index = default_index(len(result))
6692
6693 if not inplace:
6694 return result
6695 self._update_inplace(result)
6696 return None
6697
6698 @overload
6699 def drop_duplicates(
6700 self,
6701 subset: Hashable | Sequence[Hashable] | None = ...,
6702 *,
6703 keep: DropKeep = ...,
6704 inplace: Literal[True],
6705 ignore_index: bool = ...,
6706 ) -> None:
6707 ...
6708
6709 @overload
6710 def drop_duplicates(
6711 self,
6712 subset: Hashable | Sequence[Hashable] | None = ...,
6713 *,
6714 keep: DropKeep = ...,
6715 inplace: Literal[False] = ...,
6716 ignore_index: bool = ...,
6717 ) -> DataFrame:
6718 ...
6719
6720 @overload
6721 def drop_duplicates(
6722 self,
6723 subset: Hashable | Sequence[Hashable] | None = ...,
6724 *,
6725 keep: DropKeep = ...,
6726 inplace: bool = ...,
6727 ignore_index: bool = ...,
6728 ) -> DataFrame | None:
6729 ...
6730
6731 def drop_duplicates(
6732 self,
6733 subset: Hashable | Sequence[Hashable] | None = None,
6734 *,
6735 keep: DropKeep = "first",
6736 inplace: bool = False,
6737 ignore_index: bool = False,
6738 ) -> DataFrame | None:
6739 """
6740 Return DataFrame with duplicate rows removed.
6741
        Considering certain columns is optional. Indexes, including time
        indexes, are ignored.
6744
6745 Parameters
6746 ----------
6747 subset : column label or sequence of labels, optional
6748 Only consider certain columns for identifying duplicates, by
6749 default use all of the columns.
6750 keep : {'first', 'last', ``False``}, default 'first'
6751 Determines which duplicates (if any) to keep.
6752
6753 - 'first' : Drop duplicates except for the first occurrence.
6754 - 'last' : Drop duplicates except for the last occurrence.
6755 - ``False`` : Drop all duplicates.
6756
6757 inplace : bool, default ``False``
6758 Whether to modify the DataFrame rather than creating a new one.
6759 ignore_index : bool, default ``False``
6760 If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
6761
6762 Returns
6763 -------
6764 DataFrame or None
6765 DataFrame with duplicates removed or None if ``inplace=True``.
6766
6767 See Also
6768 --------
6769 DataFrame.value_counts: Count unique combinations of columns.
6770
6771 Examples
6772 --------
6773 Consider dataset containing ramen rating.
6774
6775 >>> df = pd.DataFrame({
6776 ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
6777 ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
6778 ... 'rating': [4, 4, 3.5, 15, 5]
6779 ... })
6780 >>> df
6781 brand style rating
6782 0 Yum Yum cup 4.0
6783 1 Yum Yum cup 4.0
6784 2 Indomie cup 3.5
6785 3 Indomie pack 15.0
6786 4 Indomie pack 5.0
6787
6788 By default, it removes duplicate rows based on all columns.
6789
6790 >>> df.drop_duplicates()
6791 brand style rating
6792 0 Yum Yum cup 4.0
6793 2 Indomie cup 3.5
6794 3 Indomie pack 15.0
6795 4 Indomie pack 5.0
6796
6797 To remove duplicates on specific column(s), use ``subset``.
6798
6799 >>> df.drop_duplicates(subset=['brand'])
6800 brand style rating
6801 0 Yum Yum cup 4.0
6802 2 Indomie cup 3.5
6803
6804 To remove duplicates and keep last occurrences, use ``keep``.
6805
6806 >>> df.drop_duplicates(subset=['brand', 'style'], keep='last')
6807 brand style rating
6808 1 Yum Yum cup 4.0
6809 2 Indomie cup 3.5
6810 4 Indomie pack 5.0
6811 """
6812 if self.empty:
6813 return self.copy(deep=None)
6814
6815 inplace = validate_bool_kwarg(inplace, "inplace")
6816 ignore_index = validate_bool_kwarg(ignore_index, "ignore_index")
6817
        result = self[~self.duplicated(subset, keep=keep)]
6819 if ignore_index:
6820 result.index = default_index(len(result))
6821
6822 if inplace:
6823 self._update_inplace(result)
6824 return None
6825 else:
6826 return result
6827
6828 def duplicated(
6829 self,
6830 subset: Hashable | Sequence[Hashable] | None = None,
6831 keep: DropKeep = "first",
6832 ) -> Series:
6833 """
6834 Return boolean Series denoting duplicate rows.
6835
6836 Considering certain columns is optional.
6837
6838 Parameters
6839 ----------
6840 subset : column label or sequence of labels, optional
6841 Only consider certain columns for identifying duplicates, by
6842 default use all of the columns.
6843 keep : {'first', 'last', False}, default 'first'
6844 Determines which duplicates (if any) to mark.
6845
6846 - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
6847 - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
6848 - False : Mark all duplicates as ``True``.
6849
6850 Returns
6851 -------
6852 Series
            Boolean series indicating whether each row is a duplicate.
6854
6855 See Also
6856 --------
6857 Index.duplicated : Equivalent method on index.
6858 Series.duplicated : Equivalent method on Series.
6859 Series.drop_duplicates : Remove duplicate values from Series.
6860 DataFrame.drop_duplicates : Remove duplicate values from DataFrame.
6861
6862 Examples
6863 --------
6864 Consider dataset containing ramen rating.
6865
6866 >>> df = pd.DataFrame({
6867 ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
6868 ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
6869 ... 'rating': [4, 4, 3.5, 15, 5]
6870 ... })
6871 >>> df
6872 brand style rating
6873 0 Yum Yum cup 4.0
6874 1 Yum Yum cup 4.0
6875 2 Indomie cup 3.5
6876 3 Indomie pack 15.0
6877 4 Indomie pack 5.0
6878
6879 By default, for each set of duplicated values, the first occurrence
        is set to False and all others to True.
6881
6882 >>> df.duplicated()
6883 0 False
6884 1 True
6885 2 False
6886 3 False
6887 4 False
6888 dtype: bool
6889
6890 By using 'last', the last occurrence of each set of duplicated values
        is set to False and all others to True.
6892
6893 >>> df.duplicated(keep='last')
6894 0 True
6895 1 False
6896 2 False
6897 3 False
6898 4 False
6899 dtype: bool
6900
        By setting ``keep`` to False, all duplicates are marked True.
6902
6903 >>> df.duplicated(keep=False)
6904 0 True
6905 1 True
6906 2 False
6907 3 False
6908 4 False
6909 dtype: bool
6910
6911 To find duplicates on specific column(s), use ``subset``.
6912
6913 >>> df.duplicated(subset=['brand'])
6914 0 False
6915 1 True
6916 2 False
6917 3 True
6918 4 True
6919 dtype: bool
6920 """
6921
6922 if self.empty:
6923 return self._constructor_sliced(dtype=bool)
6924
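        # In the general case, factorize each relevant column to integer
        # codes, then combine the per-column codes into a single group id per
        # row; duplicate rows are exactly those with a duplicated group id.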
6925 def f(vals) -> tuple[np.ndarray, int]:
6926 labels, shape = algorithms.factorize(vals, size_hint=len(self))
6927 return labels.astype("i8", copy=False), len(shape)
6928
6929 if subset is None:
6930 # https://github.com/pandas-dev/pandas/issues/28770
6931 # Incompatible types in assignment (expression has type "Index", variable
6932 # has type "Sequence[Any]")
6933 subset = self.columns # type: ignore[assignment]
6934 elif (
6935 not np.iterable(subset)
6936 or isinstance(subset, str)
6937 or isinstance(subset, tuple)
6938 and subset in self.columns
6939 ):
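            # a non-iterable, a string, or a tuple that is itself a column
            # label is treated as a single column key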
6940 subset = (subset,)
6941
6942 # needed for mypy since can't narrow types using np.iterable
6943 subset = cast(Sequence, subset)
6944
6945 # Verify all columns in subset exist in the queried dataframe
6946 # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
6947 # key that doesn't exist.
6948 diff = set(subset) - set(self.columns)
6949 if diff:
6950 raise KeyError(Index(diff))
6951
6952 if len(subset) == 1 and self.columns.is_unique:
6953 # GH#45236 This is faster than get_group_index below
6954 result = self[subset[0]].duplicated(keep)
6955 result.name = None
6956 else:
6957 vals = (col.values for name, col in self.items() if name in subset)
6958 labels, shape = map(list, zip(*map(f, vals)))
6959
6960 ids = get_group_index(labels, tuple(shape), sort=False, xnull=False)
6961 result = self._constructor_sliced(duplicated(ids, keep), index=self.index)
6962 return result.__finalize__(self, method="duplicated")
6963
6964 # ----------------------------------------------------------------------
6965 # Sorting
6966 # error: Signature of "sort_values" incompatible with supertype "NDFrame"
6967 @overload # type: ignore[override]
6968 def sort_values(
6969 self,
6970 by: IndexLabel,
6971 *,
6972 axis: Axis = ...,
6973 ascending=...,
6974 inplace: Literal[False] = ...,
6975 kind: SortKind = ...,
6976 na_position: NaPosition = ...,
6977 ignore_index: bool = ...,
6978 key: ValueKeyFunc = ...,
6979 ) -> DataFrame:
6980 ...
6981
6982 @overload
6983 def sort_values(
6984 self,
6985 by: IndexLabel,
6986 *,
6987 axis: Axis = ...,
6988 ascending=...,
6989 inplace: Literal[True],
6990 kind: SortKind = ...,
6991 na_position: str = ...,
6992 ignore_index: bool = ...,
6993 key: ValueKeyFunc = ...,
6994 ) -> None:
6995 ...
6996
6997 def sort_values(
6998 self,
6999 by: IndexLabel,
7000 *,
7001 axis: Axis = 0,
7002 ascending: bool | list[bool] | tuple[bool, ...] = True,
7003 inplace: bool = False,
7004 kind: SortKind = "quicksort",
7005 na_position: str = "last",
7006 ignore_index: bool = False,
7007 key: ValueKeyFunc | None = None,
7008 ) -> DataFrame | None:
7009 """
7010 Sort by the values along either axis.
7011
7012 Parameters
7013 ----------
7014 by : str or list of str
7015 Name or list of names to sort by.
7016
7017 - if `axis` is 0 or `'index'` then `by` may contain index
7018 levels and/or column labels.
7019 - if `axis` is 1 or `'columns'` then `by` may contain column
7020 levels and/or index labels.
        axis : {0 or 'index', 1 or 'columns'}, default 0
7022 Axis to be sorted.
7023 ascending : bool or list of bool, default True
7024 Sort ascending vs. descending. Specify list for multiple sort
7025 orders. If this is a list of bools, must match the length of
            ``by``.
7027 inplace : bool, default False
7028 If True, perform operation in-place.
7029 kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
7030 Choice of sorting algorithm. See also :func:`numpy.sort` for more
7031 information. `mergesort` and `stable` are the only stable algorithms. For
7032 DataFrames, this option is only applied when sorting on a single
7033 column or label.
7034 na_position : {'first', 'last'}, default 'last'
7035 Puts NaNs at the beginning if `first`; `last` puts NaNs at the
7036 end.
7037 ignore_index : bool, default False
7038 If True, the resulting axis will be labeled 0, 1, …, n - 1.
7039 key : callable, optional
7040 Apply the key function to the values
7041 before sorting. This is similar to the `key` argument in the
7042 builtin :meth:`sorted` function, with the notable difference that
7043 this `key` function should be *vectorized*. It should expect a
7044 ``Series`` and return a Series with the same shape as the input.
7045 It will be applied to each column in `by` independently.
7046
7047 Returns
7048 -------
7049 DataFrame or None
7050 DataFrame with sorted values or None if ``inplace=True``.
7051
7052 See Also
7053 --------
7054 DataFrame.sort_index : Sort a DataFrame by the index.
7055 Series.sort_values : Similar method for a Series.
7056
7057 Examples
7058 --------
7059 >>> df = pd.DataFrame({
7060 ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'],
7061 ... 'col2': [2, 1, 9, 8, 7, 4],
7062 ... 'col3': [0, 1, 9, 4, 2, 3],
7063 ... 'col4': ['a', 'B', 'c', 'D', 'e', 'F']
7064 ... })
7065 >>> df
7066 col1 col2 col3 col4
7067 0 A 2 0 a
7068 1 A 1 1 B
7069 2 B 9 9 c
7070 3 NaN 8 4 D
7071 4 D 7 2 e
7072 5 C 4 3 F
7073
7074 Sort by col1
7075
7076 >>> df.sort_values(by=['col1'])
7077 col1 col2 col3 col4
7078 0 A 2 0 a
7079 1 A 1 1 B
7080 2 B 9 9 c
7081 5 C 4 3 F
7082 4 D 7 2 e
7083 3 NaN 8 4 D
7084
7085 Sort by multiple columns
7086
7087 >>> df.sort_values(by=['col1', 'col2'])
7088 col1 col2 col3 col4
7089 1 A 1 1 B
7090 0 A 2 0 a
7091 2 B 9 9 c
7092 5 C 4 3 F
7093 4 D 7 2 e
7094 3 NaN 8 4 D
7095
7096 Sort Descending
7097
7098 >>> df.sort_values(by='col1', ascending=False)
7099 col1 col2 col3 col4
7100 4 D 7 2 e
7101 5 C 4 3 F
7102 2 B 9 9 c
7103 0 A 2 0 a
7104 1 A 1 1 B
7105 3 NaN 8 4 D
7106
7107 Putting NAs first
7108
7109 >>> df.sort_values(by='col1', ascending=False, na_position='first')
7110 col1 col2 col3 col4
7111 3 NaN 8 4 D
7112 4 D 7 2 e
7113 5 C 4 3 F
7114 2 B 9 9 c
7115 0 A 2 0 a
7116 1 A 1 1 B
7117
7118 Sorting with a key function
7119
7120 >>> df.sort_values(by='col4', key=lambda col: col.str.lower())
7121 col1 col2 col3 col4
7122 0 A 2 0 a
7123 1 A 1 1 B
7124 2 B 9 9 c
7125 3 NaN 8 4 D
7126 4 D 7 2 e
7127 5 C 4 3 F
7128
7129 Natural sort with the key argument,
        using the `natsort <https://github.com/SethMMorton/natsort>`__ package.
7131
7132 >>> df = pd.DataFrame({
7133 ... "time": ['0hr', '128hr', '72hr', '48hr', '96hr'],
7134 ... "value": [10, 20, 30, 40, 50]
7135 ... })
7136 >>> df
7137 time value
7138 0 0hr 10
7139 1 128hr 20
7140 2 72hr 30
7141 3 48hr 40
7142 4 96hr 50
7143 >>> from natsort import index_natsorted
7144 >>> df.sort_values(
7145 ... by="time",
7146 ... key=lambda x: np.argsort(index_natsorted(df["time"]))
7147 ... )
7148 time value
7149 0 0hr 10
7150 3 48hr 40
7151 2 72hr 30
7152 4 96hr 50
7153 1 128hr 20
7154 """
7155 inplace = validate_bool_kwarg(inplace, "inplace")
7156 axis = self._get_axis_number(axis)
7157 ascending = validate_ascending(ascending)
7158 if not isinstance(by, list):
7159 by = [by]
7160 # error: Argument 1 to "len" has incompatible type "Union[bool, List[bool]]";
7161 # expected "Sized"
7162 if is_sequence(ascending) and (
7163 len(by) != len(ascending) # type: ignore[arg-type]
7164 ):
7165 # error: Argument 1 to "len" has incompatible type "Union[bool,
7166 # List[bool]]"; expected "Sized"
7167 raise ValueError(
7168 f"Length of ascending ({len(ascending)})" # type: ignore[arg-type]
7169 f" != length of by ({len(by)})"
7170 )
7171 if len(by) > 1:
7172 keys = [self._get_label_or_level_values(x, axis=axis) for x in by]
7173
7174 # need to rewrap columns in Series to apply key function
7175 if key is not None:
7176 # error: List comprehension has incompatible type List[Series];
7177 # expected List[ndarray]
7178 keys = [
7179 Series(k, name=name) # type: ignore[misc]
7180 for (k, name) in zip(keys, by)
7181 ]
7182
7183 indexer = lexsort_indexer(
7184 keys, orders=ascending, na_position=na_position, key=key
7185 )
7186 elif len(by):
7187 # len(by) == 1
7188
7189 k = self._get_label_or_level_values(by[0], axis=axis)
7190
7191 # need to rewrap column in Series to apply key function
7192 if key is not None:
7193 # error: Incompatible types in assignment (expression has type
7194 # "Series", variable has type "ndarray")
7195 k = Series(k, name=by[0]) # type: ignore[assignment]
7196
7197 if isinstance(ascending, (tuple, list)):
7198 ascending = ascending[0]
7199
7200 indexer = nargsort(
7201 k, kind=kind, ascending=ascending, na_position=na_position, key=key
7202 )
7203 else:
7204 if inplace:
7205 return self._update_inplace(self)
7206 else:
7207 return self.copy(deep=None)
7208
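        # If the indexer is already 0..n-1 the requested sort is a no-op;
        # skip the take and return a copy (lazy under Copy-on-Write).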
7209 if is_range_indexer(indexer, len(indexer)):
7210 result = self.copy(deep=(not inplace and not using_copy_on_write()))
7211 if ignore_index:
7212 result.index = default_index(len(result))
7213
7214 if inplace:
7215 return self._update_inplace(result)
7216 else:
7217 return result
7218
7219 new_data = self._mgr.take(
7220 indexer, axis=self._get_block_manager_axis(axis), verify=False
7221 )
7222
7223 if ignore_index:
7224 new_data.set_axis(
7225 self._get_block_manager_axis(axis), default_index(len(indexer))
7226 )
7227
7228 result = self._constructor_from_mgr(new_data, axes=new_data.axes)
7229 if inplace:
7230 return self._update_inplace(result)
7231 else:
7232 return result.__finalize__(self, method="sort_values")
7233
7234 @overload
7235 def sort_index(
7236 self,
7237 *,
7238 axis: Axis = ...,
7239 level: IndexLabel = ...,
7240 ascending: bool | Sequence[bool] = ...,
7241 inplace: Literal[True],
7242 kind: SortKind = ...,
7243 na_position: NaPosition = ...,
7244 sort_remaining: bool = ...,
7245 ignore_index: bool = ...,
7246 key: IndexKeyFunc = ...,
7247 ) -> None:
7248 ...
7249
7250 @overload
7251 def sort_index(
7252 self,
7253 *,
7254 axis: Axis = ...,
7255 level: IndexLabel = ...,
7256 ascending: bool | Sequence[bool] = ...,
7257 inplace: Literal[False] = ...,
7258 kind: SortKind = ...,
7259 na_position: NaPosition = ...,
7260 sort_remaining: bool = ...,
7261 ignore_index: bool = ...,
7262 key: IndexKeyFunc = ...,
7263 ) -> DataFrame:
7264 ...
7265
7266 @overload
7267 def sort_index(
7268 self,
7269 *,
7270 axis: Axis = ...,
7271 level: IndexLabel = ...,
7272 ascending: bool | Sequence[bool] = ...,
7273 inplace: bool = ...,
7274 kind: SortKind = ...,
7275 na_position: NaPosition = ...,
7276 sort_remaining: bool = ...,
7277 ignore_index: bool = ...,
7278 key: IndexKeyFunc = ...,
7279 ) -> DataFrame | None:
7280 ...
7281
7282 def sort_index(
7283 self,
7284 *,
7285 axis: Axis = 0,
7286 level: IndexLabel | None = None,
7287 ascending: bool | Sequence[bool] = True,
7288 inplace: bool = False,
7289 kind: SortKind = "quicksort",
7290 na_position: NaPosition = "last",
7291 sort_remaining: bool = True,
7292 ignore_index: bool = False,
7293 key: IndexKeyFunc | None = None,
7294 ) -> DataFrame | None:
7295 """
7296 Sort object by labels (along an axis).
7297
7298 Returns a new DataFrame sorted by label if `inplace` argument is
7299 ``False``, otherwise updates the original DataFrame and returns None.
7300
7301 Parameters
7302 ----------
7303 axis : {0 or 'index', 1 or 'columns'}, default 0
7304 The axis along which to sort. The value 0 identifies the rows,
7305 and 1 identifies the columns.
7306 level : int or level name or list of ints or list of level names
7307 If not None, sort on values in specified index level(s).
7308 ascending : bool or list-like of bools, default True
7309 Sort ascending vs. descending. When the index is a MultiIndex the
7310 sort direction can be controlled for each level individually.
7311 inplace : bool, default False
7312 Whether to modify the DataFrame rather than creating a new one.
7313 kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
7314 Choice of sorting algorithm. See also :func:`numpy.sort` for more
7315 information. `mergesort` and `stable` are the only stable algorithms. For
7316 DataFrames, this option is only applied when sorting on a single
7317 column or label.
7318 na_position : {'first', 'last'}, default 'last'
7319 Puts NaNs at the beginning if `first`; `last` puts NaNs at the end.
7320 Not implemented for MultiIndex.
7321 sort_remaining : bool, default True
7322 If True and sorting by level and index is multilevel, sort by other
7323 levels too (in order) after sorting by specified level.
7324 ignore_index : bool, default False
7325 If True, the resulting axis will be labeled 0, 1, …, n - 1.
7326 key : callable, optional
7327 If not None, apply the key function to the index values
7328 before sorting. This is similar to the `key` argument in the
7329 builtin :meth:`sorted` function, with the notable difference that
7330 this `key` function should be *vectorized*. It should expect an
7331 ``Index`` and return an ``Index`` of the same shape. For MultiIndex
7332 inputs, the key is applied *per level*.
7333
7334 Returns
7335 -------
7336 DataFrame or None
7337 The original DataFrame sorted by the labels or None if ``inplace=True``.
7338
7339 See Also
7340 --------
7341 Series.sort_index : Sort Series by the index.
7342 DataFrame.sort_values : Sort DataFrame by the value.
7343 Series.sort_values : Sort Series by the value.
7344
7345 Examples
7346 --------
7347 >>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150],
7348 ... columns=['A'])
7349 >>> df.sort_index()
7350 A
7351 1 4
7352 29 2
7353 100 1
7354 150 5
7355 234 3
7356
        By default, it sorts in ascending order; to sort in descending order,
        use ``ascending=False``.
7359
7360 >>> df.sort_index(ascending=False)
7361 A
7362 234 3
7363 150 5
7364 100 1
7365 29 2
7366 1 4
7367
7368 A key function can be specified which is applied to the index before
7369 sorting. For a ``MultiIndex`` this is applied to each level separately.
7370
7371 >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd'])
7372 >>> df.sort_index(key=lambda x: x.str.lower())
7373 a
7374 A 1
7375 b 2
7376 C 3
7377 d 4
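
        Passing ``ignore_index=True`` additionally discards the existing labels
        and relabels the sorted result 0 through n - 1, as described above:

        >>> df.sort_index(key=lambda x: x.str.lower(), ignore_index=True)
           a
        0  1
        1  2
        2  3
        3  4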
7378 """
7379 return super().sort_index(
7380 axis=axis,
7381 level=level,
7382 ascending=ascending,
7383 inplace=inplace,
7384 kind=kind,
7385 na_position=na_position,
7386 sort_remaining=sort_remaining,
7387 ignore_index=ignore_index,
7388 key=key,
7389 )
7390
7391 def value_counts(
7392 self,
7393 subset: IndexLabel | None = None,
7394 normalize: bool = False,
7395 sort: bool = True,
7396 ascending: bool = False,
7397 dropna: bool = True,
7398 ) -> Series:
7399 """
7400         Return a Series containing the frequency of each distinct row in the DataFrame.
7401
7402 Parameters
7403 ----------
7404 subset : label or list of labels, optional
7405 Columns to use when counting unique combinations.
7406 normalize : bool, default False
7407 Return proportions rather than frequencies.
7408 sort : bool, default True
7409 Sort by frequencies when True. Sort by DataFrame column values when False.
7410 ascending : bool, default False
7411 Sort in ascending order.
7412 dropna : bool, default True
7413 Don't include counts of rows that contain NA values.
7414
7415 .. versionadded:: 1.3.0
7416
7417 Returns
7418 -------
7419 Series
7420
7421 See Also
7422 --------
7423 Series.value_counts: Equivalent method on Series.
7424
7425 Notes
7426 -----
7427 The returned Series will have a MultiIndex with one level per input
7428 column but an Index (non-multi) for a single label. By default, rows
7429 that contain any NA values are omitted from the result. By default,
7430 the resulting Series will be in descending order so that the first
7431 element is the most frequently-occurring row.
7432
7433 Examples
7434 --------
7435 >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6],
7436 ... 'num_wings': [2, 0, 0, 0]},
7437 ... index=['falcon', 'dog', 'cat', 'ant'])
7438 >>> df
7439 num_legs num_wings
7440 falcon 2 2
7441 dog 4 0
7442 cat 4 0
7443 ant 6 0
7444
7445 >>> df.value_counts()
7446 num_legs num_wings
7447 4 0 2
7448 2 2 1
7449 6 0 1
7450 Name: count, dtype: int64
7451
7452 >>> df.value_counts(sort=False)
7453 num_legs num_wings
7454 2 2 1
7455 4 0 2
7456 6 0 1
7457 Name: count, dtype: int64
7458
7459 >>> df.value_counts(ascending=True)
7460 num_legs num_wings
7461 2 2 1
7462 6 0 1
7463 4 0 2
7464 Name: count, dtype: int64
7465
7466 >>> df.value_counts(normalize=True)
7467 num_legs num_wings
7468 4 0 0.50
7469 2 2 0.25
7470 6 0 0.25
7471 Name: proportion, dtype: float64
7472
7473 With `dropna` set to `False` we can also count rows with NA values.
7474
7475 >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'],
7476 ... 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']})
7477 >>> df
7478 first_name middle_name
7479 0 John Smith
7480 1 Anne <NA>
7481 2 John <NA>
7482 3 Beth Louise
7483
7484 >>> df.value_counts()
7485 first_name middle_name
7486 Beth Louise 1
7487 John Smith 1
7488 Name: count, dtype: int64
7489
7490 >>> df.value_counts(dropna=False)
7491 first_name middle_name
7492 Anne NaN 1
7493 Beth Louise 1
7494 John Smith 1
7495 NaN 1
7496 Name: count, dtype: int64
7497
7498 >>> df.value_counts("first_name")
7499 first_name
7500 John 2
7501 Anne 1
7502 Beth 1
7503 Name: count, dtype: int64
7504 """
7505 if subset is None:
7506 subset = self.columns.tolist()
7507
7508 name = "proportion" if normalize else "count"
7509 counts = self.groupby(subset, dropna=dropna, observed=False)._grouper.size()
7510 counts.name = name
7511
7512 if sort:
7513 counts = counts.sort_values(ascending=ascending)
7514 if normalize:
7515 counts /= counts.sum()
7516
7517 # Force MultiIndex for a list_like subset with a single column
7518 if is_list_like(subset) and len(subset) == 1: # type: ignore[arg-type]
7519 counts.index = MultiIndex.from_arrays(
7520 [counts.index], names=[counts.index.name]
7521 )
7522
7523 return counts
7524
7525 def nlargest(
7526 self, n: int, columns: IndexLabel, keep: NsmallestNlargestKeep = "first"
7527 ) -> DataFrame:
7528 """
7529 Return the first `n` rows ordered by `columns` in descending order.
7530
7531 Return the first `n` rows with the largest values in `columns`, in
7532 descending order. The columns that are not specified are returned as
7533 well, but not used for ordering.
7534
7535 This method is equivalent to
7536 ``df.sort_values(columns, ascending=False).head(n)``, but more
7537 performant.
7538
7539 Parameters
7540 ----------
7541 n : int
7542 Number of rows to return.
7543 columns : label or list of labels
7544 Column label(s) to order by.
7545 keep : {'first', 'last', 'all'}, default 'first'
7546 Where there are duplicate values:
7547
7548 - ``first`` : prioritize the first occurrence(s)
7549 - ``last`` : prioritize the last occurrence(s)
7550 - ``all`` : keep all the ties of the smallest item even if it means
7551 selecting more than ``n`` items.
7552
7553 Returns
7554 -------
7555 DataFrame
7556 The first `n` rows ordered by the given columns in descending
7557 order.
7558
7559 See Also
7560 --------
7561 DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
7562 ascending order.
7563 DataFrame.sort_values : Sort DataFrame by the values.
7564 DataFrame.head : Return the first `n` rows without re-ordering.
7565
7566 Notes
7567 -----
7568 This function cannot be used with all column types. For example, when
7569 specifying columns with `object` or `category` dtypes, ``TypeError`` is
7570 raised.
7571
7572 Examples
7573 --------
7574 >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
7575 ... 434000, 434000, 337000, 11300,
7576 ... 11300, 11300],
7577 ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
7578 ... 17036, 182, 38, 311],
7579 ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
7580 ... "IS", "NR", "TV", "AI"]},
7581 ... index=["Italy", "France", "Malta",
7582 ... "Maldives", "Brunei", "Iceland",
7583 ... "Nauru", "Tuvalu", "Anguilla"])
7584 >>> df
7585 population GDP alpha-2
7586 Italy 59000000 1937894 IT
7587 France 65000000 2583560 FR
7588 Malta 434000 12011 MT
7589 Maldives 434000 4520 MV
7590 Brunei 434000 12128 BN
7591 Iceland 337000 17036 IS
7592 Nauru 11300 182 NR
7593 Tuvalu 11300 38 TV
7594 Anguilla 11300 311 AI
7595
7596 In the following example, we will use ``nlargest`` to select the three
7597 rows having the largest values in column "population".
7598
7599 >>> df.nlargest(3, 'population')
7600 population GDP alpha-2
7601 France 65000000 2583560 FR
7602 Italy 59000000 1937894 IT
7603 Malta 434000 12011 MT
7604
7605 When using ``keep='last'``, ties are resolved in reverse order:
7606
7607 >>> df.nlargest(3, 'population', keep='last')
7608 population GDP alpha-2
7609 France 65000000 2583560 FR
7610 Italy 59000000 1937894 IT
7611 Brunei 434000 12128 BN
7612
7613         When using ``keep='all'``, the number of elements kept can go beyond ``n``
7614         if there are duplicate values for the smallest element; all the
7615         ties are kept:
7616
7617 >>> df.nlargest(3, 'population', keep='all')
7618 population GDP alpha-2
7619 France 65000000 2583560 FR
7620 Italy 59000000 1937894 IT
7621 Malta 434000 12011 MT
7622 Maldives 434000 4520 MV
7623 Brunei 434000 12128 BN
7624
7625 However, ``nlargest`` does not keep ``n`` distinct largest elements:
7626
7627 >>> df.nlargest(5, 'population', keep='all')
7628 population GDP alpha-2
7629 France 65000000 2583560 FR
7630 Italy 59000000 1937894 IT
7631 Malta 434000 12011 MT
7632 Maldives 434000 4520 MV
7633 Brunei 434000 12128 BN
7634
7635 To order by the largest values in column "population" and then "GDP",
7636 we can specify multiple columns like in the next example.
7637
7638 >>> df.nlargest(3, ['population', 'GDP'])
7639 population GDP alpha-2
7640 France 65000000 2583560 FR
7641 Italy 59000000 1937894 IT
7642 Brunei 434000 12128 BN
7643 """
7644 return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest()
7645
7646 def nsmallest(
7647 self, n: int, columns: IndexLabel, keep: NsmallestNlargestKeep = "first"
7648 ) -> DataFrame:
7649 """
7650 Return the first `n` rows ordered by `columns` in ascending order.
7651
7652 Return the first `n` rows with the smallest values in `columns`, in
7653 ascending order. The columns that are not specified are returned as
7654 well, but not used for ordering.
7655
7656 This method is equivalent to
7657 ``df.sort_values(columns, ascending=True).head(n)``, but more
7658 performant.
7659
7660 Parameters
7661 ----------
7662 n : int
7663 Number of items to retrieve.
7664 columns : list or str
7665 Column name or names to order by.
7666 keep : {'first', 'last', 'all'}, default 'first'
7667 Where there are duplicate values:
7668
7669 - ``first`` : take the first occurrence.
7670 - ``last`` : take the last occurrence.
7671 - ``all`` : keep all the ties of the largest item even if it means
7672 selecting more than ``n`` items.
7673
7674 Returns
7675 -------
7676 DataFrame
7677
7678 See Also
7679 --------
7680 DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
7681 descending order.
7682 DataFrame.sort_values : Sort DataFrame by the values.
7683 DataFrame.head : Return the first `n` rows without re-ordering.
7684
7685 Examples
7686 --------
7687 >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
7688 ... 434000, 434000, 337000, 337000,
7689 ... 11300, 11300],
7690 ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
7691 ... 17036, 182, 38, 311],
7692 ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
7693 ... "IS", "NR", "TV", "AI"]},
7694 ... index=["Italy", "France", "Malta",
7695 ... "Maldives", "Brunei", "Iceland",
7696 ... "Nauru", "Tuvalu", "Anguilla"])
7697 >>> df
7698 population GDP alpha-2
7699 Italy 59000000 1937894 IT
7700 France 65000000 2583560 FR
7701 Malta 434000 12011 MT
7702 Maldives 434000 4520 MV
7703 Brunei 434000 12128 BN
7704 Iceland 337000 17036 IS
7705 Nauru 337000 182 NR
7706 Tuvalu 11300 38 TV
7707 Anguilla 11300 311 AI
7708
7709 In the following example, we will use ``nsmallest`` to select the
7710 three rows having the smallest values in column "population".
7711
7712 >>> df.nsmallest(3, 'population')
7713 population GDP alpha-2
7714 Tuvalu 11300 38 TV
7715 Anguilla 11300 311 AI
7716 Iceland 337000 17036 IS
7717
7718 When using ``keep='last'``, ties are resolved in reverse order:
7719
7720 >>> df.nsmallest(3, 'population', keep='last')
7721 population GDP alpha-2
7722 Anguilla 11300 311 AI
7723 Tuvalu 11300 38 TV
7724 Nauru 337000 182 NR
7725
7726         When using ``keep='all'``, the number of elements kept can go beyond ``n``
7727         if there are duplicate values for the largest element; all the
7728         ties are kept.
7729
7730 >>> df.nsmallest(3, 'population', keep='all')
7731 population GDP alpha-2
7732 Tuvalu 11300 38 TV
7733 Anguilla 11300 311 AI
7734 Iceland 337000 17036 IS
7735 Nauru 337000 182 NR
7736
7737 However, ``nsmallest`` does not keep ``n`` distinct
7738 smallest elements:
7739
7740 >>> df.nsmallest(4, 'population', keep='all')
7741 population GDP alpha-2
7742 Tuvalu 11300 38 TV
7743 Anguilla 11300 311 AI
7744 Iceland 337000 17036 IS
7745 Nauru 337000 182 NR
7746
7747 To order by the smallest values in column "population" and then "GDP", we can
7748 specify multiple columns like in the next example.
7749
7750 >>> df.nsmallest(3, ['population', 'GDP'])
7751 population GDP alpha-2
7752 Tuvalu 11300 38 TV
7753 Anguilla 11300 311 AI
7754 Nauru 337000 182 NR
7755 """
7756 return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nsmallest()
7757
7758 @doc(
7759 Series.swaplevel,
7760 klass=_shared_doc_kwargs["klass"],
7761 extra_params=dedent(
7762 """axis : {0 or 'index', 1 or 'columns'}, default 0
7763 The axis to swap levels on. 0 or 'index' for row-wise, 1 or
7764 'columns' for column-wise."""
7765 ),
7766 examples=dedent(
7767 """\
7768 Examples
7769 --------
7770 >>> df = pd.DataFrame(
7771 ... {"Grade": ["A", "B", "A", "C"]},
7772 ... index=[
7773 ... ["Final exam", "Final exam", "Coursework", "Coursework"],
7774 ... ["History", "Geography", "History", "Geography"],
7775 ... ["January", "February", "March", "April"],
7776 ... ],
7777 ... )
7778 >>> df
7779 Grade
7780 Final exam History January A
7781 Geography February B
7782 Coursework History March A
7783 Geography April C
7784
7785         In the following example, we will swap the levels of the indices.
7786         Here, we will swap the levels row-wise, but levels can be swapped
7787         column-wise in a similar manner. Note that row-wise is the default
7788         behaviour. By not supplying any arguments for i and j, we swap the
7789         last and second-to-last indices.
7790
7791 >>> df.swaplevel()
7792 Grade
7793 Final exam January History A
7794 February Geography B
7795 Coursework March History A
7796 April Geography C
7797
7798 By supplying one argument, we can choose which index to swap the last
7799 index with. We can for example swap the first index with the last one as
7800 follows.
7801
7802 >>> df.swaplevel(0)
7803 Grade
7804 January History Final exam A
7805 February Geography Final exam B
7806 March History Coursework A
7807 April Geography Coursework C
7808
7809 We can also define explicitly which indices we want to swap by supplying values
7810 for both i and j. Here, we for example swap the first and second indices.
7811
7812 >>> df.swaplevel(0, 1)
7813 Grade
7814 History Final exam January A
7815 Geography Final exam February B
7816 History Coursework March A
7817 Geography Coursework April C"""
7818 ),
7819 )
7820 def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame:
7821 result = self.copy(deep=None)
7822
7823 axis = self._get_axis_number(axis)
7824
7825 if not isinstance(result._get_axis(axis), MultiIndex): # pragma: no cover
7826 raise TypeError("Can only swap levels on a hierarchical axis.")
7827
7828 if axis == 0:
7829 assert isinstance(result.index, MultiIndex)
7830 result.index = result.index.swaplevel(i, j)
7831 else:
7832 assert isinstance(result.columns, MultiIndex)
7833 result.columns = result.columns.swaplevel(i, j)
7834 return result
7835
7836 def reorder_levels(self, order: Sequence[int | str], axis: Axis = 0) -> DataFrame:
7837 """
7838 Rearrange index levels using input order. May not drop or duplicate levels.
7839
7840 Parameters
7841 ----------
7842 order : list of int or list of str
7843 List representing new level order. Reference level by number
7844 (position) or by key (label).
7845 axis : {0 or 'index', 1 or 'columns'}, default 0
7846 Where to reorder levels.
7847
7848 Returns
7849 -------
7850 DataFrame
7851
7852 Examples
7853 --------
7854 >>> data = {
7855 ... "class": ["Mammals", "Mammals", "Reptiles"],
7856 ... "diet": ["Omnivore", "Carnivore", "Carnivore"],
7857 ... "species": ["Humans", "Dogs", "Snakes"],
7858 ... }
7859 >>> df = pd.DataFrame(data, columns=["class", "diet", "species"])
7860 >>> df = df.set_index(["class", "diet"])
7861 >>> df
7862 species
7863 class diet
7864 Mammals Omnivore Humans
7865 Carnivore Dogs
7866 Reptiles Carnivore Snakes
7867
7868 Let's reorder the levels of the index:
7869
7870 >>> df.reorder_levels(["diet", "class"])
7871 species
7872 diet class
7873 Omnivore Mammals Humans
7874 Carnivore Mammals Dogs
7875 Reptiles Snakes
7876 """
7877 axis = self._get_axis_number(axis)
7878 if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover
7879 raise TypeError("Can only reorder levels on a hierarchical axis.")
7880
7881 result = self.copy(deep=None)
7882
7883 if axis == 0:
7884 assert isinstance(result.index, MultiIndex)
7885 result.index = result.index.reorder_levels(order)
7886 else:
7887 assert isinstance(result.columns, MultiIndex)
7888 result.columns = result.columns.reorder_levels(order)
7889 return result
7890
7891 # ----------------------------------------------------------------------
7892 # Arithmetic Methods
7893
7894 def _cmp_method(self, other, op):
7895 axis: Literal[1] = 1 # only relevant for Series other case
7896
7897 self, other = self._align_for_op(other, axis, flex=False, level=None)
7898
7899 # See GH#4537 for discussion of scalar op behavior
7900 new_data = self._dispatch_frame_op(other, op, axis=axis)
7901 return self._construct_result(new_data)
7902
7903 def _arith_method(self, other, op):
7904 if self._should_reindex_frame_op(other, op, 1, None, None):
7905 return self._arith_method_with_reindex(other, op)
7906
7907 axis: Literal[1] = 1 # only relevant for Series other case
7908 other = ops.maybe_prepare_scalar_for_op(other, (self.shape[axis],))
7909
7910 self, other = self._align_for_op(other, axis, flex=True, level=None)
7911
7912 with np.errstate(all="ignore"):
7913 new_data = self._dispatch_frame_op(other, op, axis=axis)
7914 return self._construct_result(new_data)
7915
7916 _logical_method = _arith_method
7917
7918 def _dispatch_frame_op(
7919 self, right, func: Callable, axis: AxisInt | None = None
7920 ) -> DataFrame:
7921 """
7922 Evaluate the frame operation func(left, right) by evaluating
7923 column-by-column, dispatching to the Series implementation.
7924
7925 Parameters
7926 ----------
7927 right : scalar, Series, or DataFrame
7928 func : arithmetic or comparison operator
7929 axis : {None, 0, 1}
7930
7931 Returns
7932 -------
7933 DataFrame
7934
7935 Notes
7936 -----
7937 Caller is responsible for setting np.errstate where relevant.
7938 """
7939 # Get the appropriate array-op to apply to each column/block's values.
7940 array_op = ops.get_array_op(func)
7941
7942 right = lib.item_from_zerodim(right)
7943 if not is_list_like(right):
7944 # i.e. scalar, faster than checking np.ndim(right) == 0
7945 bm = self._mgr.apply(array_op, right=right)
7946 return self._constructor_from_mgr(bm, axes=bm.axes)
7947
7948 elif isinstance(right, DataFrame):
7949 assert self.index.equals(right.index)
7950 assert self.columns.equals(right.columns)
7951 # TODO: The previous assertion `assert right._indexed_same(self)`
7952 # fails in cases with empty columns reached via
7953 # _frame_arith_method_with_reindex
7954
7955 # TODO operate_blockwise expects a manager of the same type
7956 bm = self._mgr.operate_blockwise(
7957 # error: Argument 1 to "operate_blockwise" of "ArrayManager" has
7958 # incompatible type "Union[ArrayManager, BlockManager]"; expected
7959 # "ArrayManager"
7960 # error: Argument 1 to "operate_blockwise" of "BlockManager" has
7961 # incompatible type "Union[ArrayManager, BlockManager]"; expected
7962 # "BlockManager"
7963 right._mgr, # type: ignore[arg-type]
7964 array_op,
7965 )
7966 return self._constructor_from_mgr(bm, axes=bm.axes)
7967
7968 elif isinstance(right, Series) and axis == 1:
7969 # axis=1 means we want to operate row-by-row
7970 assert right.index.equals(self.columns)
7971
7972 right = right._values
7973 # maybe_align_as_frame ensures we do not have an ndarray here
7974 assert not isinstance(right, np.ndarray)
7975
7976 arrays = [
7977 array_op(_left, _right)
7978 for _left, _right in zip(self._iter_column_arrays(), right)
7979 ]
7980
7981 elif isinstance(right, Series):
7982 assert right.index.equals(self.index)
7983 right = right._values
7984
7985 arrays = [array_op(left, right) for left in self._iter_column_arrays()]
7986
7987 else:
7988 raise NotImplementedError(right)
7989
7990 return type(self)._from_arrays(
7991 arrays, self.columns, self.index, verify_integrity=False
7992 )
7993
7994 def _combine_frame(self, other: DataFrame, func, fill_value=None):
7995 # at this point we have `self._indexed_same(other)`
7996
7997 if fill_value is None:
7998 # since _arith_op may be called in a loop, avoid function call
7999 # overhead if possible by doing this check once
8000 _arith_op = func
8001
8002 else:
8003
8004 def _arith_op(left, right):
8005 # for the mixed_type case where we iterate over columns,
8006 # _arith_op(left, right) is equivalent to
8007 # left._binop(right, func, fill_value=fill_value)
8008 left, right = ops.fill_binop(left, right, fill_value)
8009 return func(left, right)
8010
8011 new_data = self._dispatch_frame_op(other, _arith_op)
8012 return new_data
8013
8014 def _arith_method_with_reindex(self, right: DataFrame, op) -> DataFrame:
8015 """
8016 For DataFrame-with-DataFrame operations that require reindexing,
8017 operate only on shared columns, then reindex.
8018
8019 Parameters
8020 ----------
8021 right : DataFrame
8022 op : binary operator
8023
8024 Returns
8025 -------
8026 DataFrame
8027 """
8028 left = self
8029
8030 # GH#31623, only operate on shared columns
8031 cols, lcols, rcols = left.columns.join(
8032 right.columns, how="inner", level=None, return_indexers=True
8033 )
8034
8035 new_left = left.iloc[:, lcols]
8036 new_right = right.iloc[:, rcols]
8037 result = op(new_left, new_right)
8038
8039 # Do the join on the columns instead of using left._align_for_op
8040 # to avoid constructing two potentially large/sparse DataFrames
8041 join_columns, _, _ = left.columns.join(
8042 right.columns, how="outer", level=None, return_indexers=True
8043 )
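        # join_columns is the union of both column sets; the shared columns
        # carry the computed values and the remaining ones are introduced as
        # all-NaN by the reindex below.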
8044
8045 if result.columns.has_duplicates:
8046 # Avoid reindexing with a duplicate axis.
8047 # https://github.com/pandas-dev/pandas/issues/35194
8048 indexer, _ = result.columns.get_indexer_non_unique(join_columns)
8049 indexer = algorithms.unique1d(indexer)
8050 result = result._reindex_with_indexers(
8051 {1: [join_columns, indexer]}, allow_dups=True
8052 )
8053 else:
8054 result = result.reindex(join_columns, axis=1)
8055
8056 return result
8057
8058 def _should_reindex_frame_op(self, right, op, axis: int, fill_value, level) -> bool:
8059 """
8060 Check if this is an operation between DataFrames that will need to reindex.
8061 """
8062 if op is operator.pow or op is roperator.rpow:
8063 # GH#32685 pow has special semantics for operating with null values
8064 return False
8065
8066 if not isinstance(right, DataFrame):
8067 return False
8068
8069 if fill_value is None and level is None and axis == 1:
8070 # TODO: any other cases we should handle here?
8071
8072 # Intersection is always unique so we have to check the unique columns
8073 left_uniques = self.columns.unique()
8074 right_uniques = right.columns.unique()
8075 cols = left_uniques.intersection(right_uniques)
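            # e.g. left columns ['A', 'B'] vs right columns ['B', 'C'] gives
            # cols = ['B']: non-empty but smaller than either side, so the
            # reindexing path is taken and non-shared columns end up as NaN.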
8076 if len(cols) and not (
8077 len(cols) == len(left_uniques) and len(cols) == len(right_uniques)
8078 ):
8079 # TODO: is there a shortcut available when len(cols) == 0?
8080 return True
8081
8082 return False
8083
8084 def _align_for_op(
8085 self,
8086 other,
8087 axis: AxisInt,
8088 flex: bool | None = False,
8089 level: Level | None = None,
8090 ):
8091 """
8092 Convert rhs to meet lhs dims if input is list, tuple or np.ndarray.
8093
8094 Parameters
8095 ----------
8096 left : DataFrame
8097 right : Any
8098 axis : int
8099 flex : bool or None, default False
8100 Whether this is a flex op, in which case we reindex.
8101 None indicates not to check for alignment.
8102 level : int or level name, default None
8103
8104 Returns
8105 -------
8106 left : DataFrame
8107 right : Any
8108 """
8109 left, right = self, other
8110
8111 def to_series(right):
8112 msg = (
8113 "Unable to coerce to Series, "
8114 "length must be {req_len}: given {given_len}"
8115 )
8116
8117 # pass dtype to avoid doing inference, which would break consistency
8118 # with Index/Series ops
8119 dtype = None
8120 if getattr(right, "dtype", None) == object:
8121 # can't pass right.dtype unconditionally as that would break on e.g.
8122 # datetime64[h] ndarray
8123 dtype = object
8124
8125 if axis == 0:
8126 if len(left.index) != len(right):
8127 raise ValueError(
8128 msg.format(req_len=len(left.index), given_len=len(right))
8129 )
8130 right = left._constructor_sliced(right, index=left.index, dtype=dtype)
8131 else:
8132 if len(left.columns) != len(right):
8133 raise ValueError(
8134 msg.format(req_len=len(left.columns), given_len=len(right))
8135 )
8136 right = left._constructor_sliced(right, index=left.columns, dtype=dtype)
8137 return right
8138
8139 if isinstance(right, np.ndarray):
8140 if right.ndim == 1:
8141 right = to_series(right)
8142
8143 elif right.ndim == 2:
8144 # We need to pass dtype=right.dtype to retain object dtype
8145 # otherwise we lose consistency with Index and array ops
8146 dtype = None
8147 if right.dtype == object:
8148 # can't pass right.dtype unconditionally as that would break on e.g.
8149 # datetime64[h] ndarray
8150 dtype = object
8151
8152 if right.shape == left.shape:
8153 right = left._constructor(
8154 right, index=left.index, columns=left.columns, dtype=dtype
8155 )
8156
8157 elif right.shape[0] == left.shape[0] and right.shape[1] == 1:
8158 # Broadcast across columns
8159 right = np.broadcast_to(right, left.shape)
8160 right = left._constructor(
8161 right, index=left.index, columns=left.columns, dtype=dtype
8162 )
8163
8164 elif right.shape[1] == left.shape[1] and right.shape[0] == 1:
8165 # Broadcast along rows
8166 right = to_series(right[0, :])
8167
8168 else:
8169 raise ValueError(
8170 "Unable to coerce to DataFrame, shape "
8171 f"must be {left.shape}: given {right.shape}"
8172 )
8173
8174 elif right.ndim > 2:
8175 raise ValueError(
8176 "Unable to coerce to Series/DataFrame, "
8177 f"dimension must be <= 2: {right.shape}"
8178 )
8179
8180 elif is_list_like(right) and not isinstance(right, (Series, DataFrame)):
8181 # GH#36702. Raise when attempting arithmetic with list of array-like.
8182 if any(is_array_like(el) for el in right):
8183 raise ValueError(
8184 f"Unable to coerce list of {type(right[0])} to Series/DataFrame"
8185 )
8186 # GH#17901
8187 right = to_series(right)
8188
8189 if flex is not None and isinstance(right, DataFrame):
8190 if not left._indexed_same(right):
8191 if flex:
8192 left, right = left.align(
8193 right, join="outer", level=level, copy=False
8194 )
8195 else:
8196 raise ValueError(
8197 "Can only compare identically-labeled (both index and columns) "
8198 "DataFrame objects"
8199 )
8200 elif isinstance(right, Series):
8201 # axis=1 is default for DataFrame-with-Series op
8202 axis = axis if axis is not None else 1
8203 if not flex:
8204 if not left.axes[axis].equals(right.index):
8205 raise ValueError(
8206 "Operands are not aligned. Do "
8207 "`left, right = left.align(right, axis=1, copy=False)` "
8208 "before operating."
8209 )
8210
8211 left, right = left.align(
8212 right,
8213 join="outer",
8214 axis=axis,
8215 level=level,
8216 copy=False,
8217 )
8218 right = left._maybe_align_series_as_frame(right, axis)
8219
8220 return left, right
8221
8222 def _maybe_align_series_as_frame(self, series: Series, axis: AxisInt):
8223 """
8224 If the Series operand is not EA-dtype, we can broadcast to 2D and operate
8225 blockwise.
8226 """
8227 rvalues = series._values
8228 if not isinstance(rvalues, np.ndarray):
8229 # TODO(EA2D): no need to special-case with 2D EAs
8230 if rvalues.dtype in ("datetime64[ns]", "timedelta64[ns]"):
8231 # We can losslessly+cheaply cast to ndarray
8232 rvalues = np.asarray(rvalues)
8233 else:
8234 return series
8235
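        # Reshape to a single column (axis=0) or a single row (axis=1) so the
        # values can be broadcast against the frame's 2D shape below.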
8236 if axis == 0:
8237 rvalues = rvalues.reshape(-1, 1)
8238 else:
8239 rvalues = rvalues.reshape(1, -1)
8240
8241 rvalues = np.broadcast_to(rvalues, self.shape)
8242 # pass dtype to avoid doing inference
8243 return self._constructor(
8244 rvalues,
8245 index=self.index,
8246 columns=self.columns,
8247 dtype=rvalues.dtype,
8248 )
8249
8250 def _flex_arith_method(
8251 self, other, op, *, axis: Axis = "columns", level=None, fill_value=None
8252 ):
8253 axis = self._get_axis_number(axis) if axis is not None else 1
8254
8255 if self._should_reindex_frame_op(other, op, axis, fill_value, level):
8256 return self._arith_method_with_reindex(other, op)
8257
8258 if isinstance(other, Series) and fill_value is not None:
8259 # TODO: We could allow this in cases where we end up going
8260 # through the DataFrame path
8261 raise NotImplementedError(f"fill_value {fill_value} not supported.")
8262
8263 other = ops.maybe_prepare_scalar_for_op(other, self.shape)
8264 self, other = self._align_for_op(other, axis, flex=True, level=level)
8265
8266 with np.errstate(all="ignore"):
8267 if isinstance(other, DataFrame):
8268 # Another DataFrame
8269 new_data = self._combine_frame(other, op, fill_value)
8270
8271 elif isinstance(other, Series):
8272 new_data = self._dispatch_frame_op(other, op, axis=axis)
8273 else:
8274 # in this case we always have `np.ndim(other) == 0`
8275 if fill_value is not None:
8276 self = self.fillna(fill_value)
8277
8278 new_data = self._dispatch_frame_op(other, op)
8279
8280 return self._construct_result(new_data)
8281
8282 def _construct_result(self, result) -> DataFrame:
8283 """
8284 Wrap the result of an arithmetic, comparison, or logical operation.
8285
8286 Parameters
8287 ----------
8288 result : DataFrame
8289
8290 Returns
8291 -------
8292 DataFrame
8293 """
8294 out = self._constructor(result, copy=False).__finalize__(self)
8295 # Pin columns instead of passing to constructor for compat with
8296 # non-unique columns case
8297 out.columns = self.columns
8298 out.index = self.index
8299 return out
8300
8301 def __divmod__(self, other) -> tuple[DataFrame, DataFrame]:
8302 # Naive implementation, room for optimization
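        # e.g. divmod(df, 3) returns the pair (df // 3, df - (df // 3) * 3),
        # matching elementwise floor division and remainder.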
8303 div = self // other
8304 mod = self - div * other
8305 return div, mod
8306
8307 def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]:
8308 # Naive implementation, room for optimization
8309 div = other // self
8310 mod = other - div * self
8311 return div, mod
8312
8313 def _flex_cmp_method(self, other, op, *, axis: Axis = "columns", level=None):
8314 axis = self._get_axis_number(axis) if axis is not None else 1
8315
8316 self, other = self._align_for_op(other, axis, flex=True, level=level)
8317
8318 new_data = self._dispatch_frame_op(other, op, axis=axis)
8319 return self._construct_result(new_data)
8320
8321 @Appender(ops.make_flex_doc("eq", "dataframe"))
8322 def eq(self, other, axis: Axis = "columns", level=None) -> DataFrame:
8323 return self._flex_cmp_method(other, operator.eq, axis=axis, level=level)
8324
8325 @Appender(ops.make_flex_doc("ne", "dataframe"))
8326 def ne(self, other, axis: Axis = "columns", level=None) -> DataFrame:
8327 return self._flex_cmp_method(other, operator.ne, axis=axis, level=level)
8328
8329 @Appender(ops.make_flex_doc("le", "dataframe"))
8330 def le(self, other, axis: Axis = "columns", level=None) -> DataFrame:
8331 return self._flex_cmp_method(other, operator.le, axis=axis, level=level)
8332
8333 @Appender(ops.make_flex_doc("lt", "dataframe"))
8334 def lt(self, other, axis: Axis = "columns", level=None) -> DataFrame:
8335 return self._flex_cmp_method(other, operator.lt, axis=axis, level=level)
8336
8337 @Appender(ops.make_flex_doc("ge", "dataframe"))
8338 def ge(self, other, axis: Axis = "columns", level=None) -> DataFrame:
8339 return self._flex_cmp_method(other, operator.ge, axis=axis, level=level)
8340
8341 @Appender(ops.make_flex_doc("gt", "dataframe"))
8342 def gt(self, other, axis: Axis = "columns", level=None) -> DataFrame:
8343 return self._flex_cmp_method(other, operator.gt, axis=axis, level=level)
8344
8345 @Appender(ops.make_flex_doc("add", "dataframe"))
8346 def add(
8347 self, other, axis: Axis = "columns", level=None, fill_value=None
8348 ) -> DataFrame:
8349 return self._flex_arith_method(
8350 other, operator.add, level=level, fill_value=fill_value, axis=axis
8351 )
8352
8353 @Appender(ops.make_flex_doc("radd", "dataframe"))
8354 def radd(
8355 self, other, axis: Axis = "columns", level=None, fill_value=None
8356 ) -> DataFrame:
8357 return self._flex_arith_method(
8358 other, roperator.radd, level=level, fill_value=fill_value, axis=axis
8359 )
8360
8361 @Appender(ops.make_flex_doc("sub", "dataframe"))
8362 def sub(
8363 self, other, axis: Axis = "columns", level=None, fill_value=None
8364 ) -> DataFrame:
8365 return self._flex_arith_method(
8366 other, operator.sub, level=level, fill_value=fill_value, axis=axis
8367 )
8368
8369 subtract = sub
8370
8371 @Appender(ops.make_flex_doc("rsub", "dataframe"))
8372 def rsub(
8373 self, other, axis: Axis = "columns", level=None, fill_value=None
8374 ) -> DataFrame:
8375 return self._flex_arith_method(
8376 other, roperator.rsub, level=level, fill_value=fill_value, axis=axis
8377 )
8378
8379 @Appender(ops.make_flex_doc("mul", "dataframe"))
8380 def mul(
8381 self, other, axis: Axis = "columns", level=None, fill_value=None
8382 ) -> DataFrame:
8383 return self._flex_arith_method(
8384 other, operator.mul, level=level, fill_value=fill_value, axis=axis
8385 )
8386
8387 multiply = mul
8388
8389 @Appender(ops.make_flex_doc("rmul", "dataframe"))
8390 def rmul(
8391 self, other, axis: Axis = "columns", level=None, fill_value=None
8392 ) -> DataFrame:
8393 return self._flex_arith_method(
8394 other, roperator.rmul, level=level, fill_value=fill_value, axis=axis
8395 )
8396
8397 @Appender(ops.make_flex_doc("truediv", "dataframe"))
8398 def truediv(
8399 self, other, axis: Axis = "columns", level=None, fill_value=None
8400 ) -> DataFrame:
8401 return self._flex_arith_method(
8402 other, operator.truediv, level=level, fill_value=fill_value, axis=axis
8403 )
8404
8405 div = truediv
8406 divide = truediv
8407
8408 @Appender(ops.make_flex_doc("rtruediv", "dataframe"))
8409 def rtruediv(
8410 self, other, axis: Axis = "columns", level=None, fill_value=None
8411 ) -> DataFrame:
8412 return self._flex_arith_method(
8413 other, roperator.rtruediv, level=level, fill_value=fill_value, axis=axis
8414 )
8415
8416 rdiv = rtruediv
8417
8418 @Appender(ops.make_flex_doc("floordiv", "dataframe"))
8419 def floordiv(
8420 self, other, axis: Axis = "columns", level=None, fill_value=None
8421 ) -> DataFrame:
8422 return self._flex_arith_method(
8423 other, operator.floordiv, level=level, fill_value=fill_value, axis=axis
8424 )
8425
8426 @Appender(ops.make_flex_doc("rfloordiv", "dataframe"))
8427 def rfloordiv(
8428 self, other, axis: Axis = "columns", level=None, fill_value=None
8429 ) -> DataFrame:
8430 return self._flex_arith_method(
8431 other, roperator.rfloordiv, level=level, fill_value=fill_value, axis=axis
8432 )
8433
8434 @Appender(ops.make_flex_doc("mod", "dataframe"))
8435 def mod(
8436 self, other, axis: Axis = "columns", level=None, fill_value=None
8437 ) -> DataFrame:
8438 return self._flex_arith_method(
8439 other, operator.mod, level=level, fill_value=fill_value, axis=axis
8440 )
8441
8442 @Appender(ops.make_flex_doc("rmod", "dataframe"))
8443 def rmod(
8444 self, other, axis: Axis = "columns", level=None, fill_value=None
8445 ) -> DataFrame:
8446 return self._flex_arith_method(
8447 other, roperator.rmod, level=level, fill_value=fill_value, axis=axis
8448 )
8449
8450 @Appender(ops.make_flex_doc("pow", "dataframe"))
8451 def pow(
8452 self, other, axis: Axis = "columns", level=None, fill_value=None
8453 ) -> DataFrame:
8454 return self._flex_arith_method(
8455 other, operator.pow, level=level, fill_value=fill_value, axis=axis
8456 )
8457
8458 @Appender(ops.make_flex_doc("rpow", "dataframe"))
8459 def rpow(
8460 self, other, axis: Axis = "columns", level=None, fill_value=None
8461 ) -> DataFrame:
8462 return self._flex_arith_method(
8463 other, roperator.rpow, level=level, fill_value=fill_value, axis=axis
8464 )
8465
8466 # ----------------------------------------------------------------------
8467 # Combination-Related
8468
8469 @doc(
8470 _shared_docs["compare"],
8471 dedent(
8472 """
8473 Returns
8474 -------
8475 DataFrame
8476 DataFrame that shows the differences stacked side by side.
8477
8478 The resulting index will be a MultiIndex with 'self' and 'other'
8479 stacked alternately at the inner level.
8480
8481 Raises
8482 ------
8483 ValueError
8484 When the two DataFrames don't have identical labels or shape.
8485
8486 See Also
8487 --------
8488 Series.compare : Compare with another Series and show differences.
8489 DataFrame.equals : Test whether two objects contain the same elements.
8490
8491 Notes
8492 -----
8493 Matching NaNs will not appear as a difference.
8494
8495 Can only compare identically-labeled
8496         (i.e. same shape, identical row and column labels) DataFrames.
8497
8498 Examples
8499 --------
8500 >>> df = pd.DataFrame(
8501 ... {{
8502 ... "col1": ["a", "a", "b", "b", "a"],
8503 ... "col2": [1.0, 2.0, 3.0, np.nan, 5.0],
8504 ... "col3": [1.0, 2.0, 3.0, 4.0, 5.0]
8505 ... }},
8506 ... columns=["col1", "col2", "col3"],
8507 ... )
8508 >>> df
8509 col1 col2 col3
8510 0 a 1.0 1.0
8511 1 a 2.0 2.0
8512 2 b 3.0 3.0
8513 3 b NaN 4.0
8514 4 a 5.0 5.0
8515
8516 >>> df2 = df.copy()
8517 >>> df2.loc[0, 'col1'] = 'c'
8518 >>> df2.loc[2, 'col3'] = 4.0
8519 >>> df2
8520 col1 col2 col3
8521 0 c 1.0 1.0
8522 1 a 2.0 2.0
8523 2 b 3.0 4.0
8524 3 b NaN 4.0
8525 4 a 5.0 5.0
8526
8527 Align the differences on columns
8528
8529 >>> df.compare(df2)
8530 col1 col3
8531 self other self other
8532 0 a c NaN NaN
8533 2 NaN NaN 3.0 4.0
8534
8535 Assign result_names
8536
8537 >>> df.compare(df2, result_names=("left", "right"))
8538 col1 col3
8539 left right left right
8540 0 a c NaN NaN
8541 2 NaN NaN 3.0 4.0
8542
8543 Stack the differences on rows
8544
8545 >>> df.compare(df2, align_axis=0)
8546 col1 col3
8547 0 self a NaN
8548 other c NaN
8549 2 self NaN 3.0
8550 other NaN 4.0
8551
8552 Keep the equal values
8553
8554 >>> df.compare(df2, keep_equal=True)
8555 col1 col3
8556 self other self other
8557 0 a c 1.0 1.0
8558 2 b b 3.0 4.0
8559
8560 Keep all original rows and columns
8561
8562 >>> df.compare(df2, keep_shape=True)
8563 col1 col2 col3
8564 self other self other self other
8565 0 a c NaN NaN NaN NaN
8566 1 NaN NaN NaN NaN NaN NaN
8567 2 NaN NaN NaN NaN 3.0 4.0
8568 3 NaN NaN NaN NaN NaN NaN
8569 4 NaN NaN NaN NaN NaN NaN
8570
8571 Keep all original rows and columns and also all original values
8572
8573 >>> df.compare(df2, keep_shape=True, keep_equal=True)
8574 col1 col2 col3
8575 self other self other self other
8576 0 a c 1.0 1.0 1.0 1.0
8577 1 a a 2.0 2.0 2.0 2.0
8578 2 b b 3.0 3.0 3.0 4.0
8579 3 b b NaN NaN 4.0 4.0
8580 4 a a 5.0 5.0 5.0 5.0
8581 """
8582 ),
8583 klass=_shared_doc_kwargs["klass"],
8584 )
8585 def compare(
8586 self,
8587 other: DataFrame,
8588 align_axis: Axis = 1,
8589 keep_shape: bool = False,
8590 keep_equal: bool = False,
8591 result_names: Suffixes = ("self", "other"),
8592 ) -> DataFrame:
8593 return super().compare(
8594 other=other,
8595 align_axis=align_axis,
8596 keep_shape=keep_shape,
8597 keep_equal=keep_equal,
8598 result_names=result_names,
8599 )
8600
8601 def combine(
8602 self,
8603 other: DataFrame,
8604 func: Callable[[Series, Series], Series | Hashable],
8605 fill_value=None,
8606 overwrite: bool = True,
8607 ) -> DataFrame:
8608 """
8609 Perform column-wise combine with another DataFrame.
8610
8611 Combines a DataFrame with `other` DataFrame using `func`
8612 to element-wise combine columns. The row and column indexes of the
8613 resulting DataFrame will be the union of the two.
8614
8615 Parameters
8616 ----------
8617 other : DataFrame
8618 The DataFrame to merge column-wise.
8619 func : function
8620 Function that takes two series as inputs and return a Series or a
8621             Function that takes two Series as inputs and returns a Series or a
8622             scalar. Used to merge the two dataframes column by column.
8623 The value to fill NaNs with prior to passing any column to the
8624 merge func.
8625 overwrite : bool, default True
8626 If True, columns in `self` that do not exist in `other` will be
8627 overwritten with NaNs.
8628
8629 Returns
8630 -------
8631 DataFrame
8632 Combination of the provided DataFrames.
8633
8634 See Also
8635 --------
8636 DataFrame.combine_first : Combine two DataFrame objects and default to
8637 non-null values in frame calling the method.
8638
8639 Examples
8640 --------
8641 Combine using a simple function that chooses the smaller column.
8642
8643 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
8644 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
8645 >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
8646 >>> df1.combine(df2, take_smaller)
8647 A B
8648 0 0 3
8649 1 0 3
8650
8651 Example using a true element-wise combine function.
8652
8653 >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]})
8654 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
8655 >>> df1.combine(df2, np.minimum)
8656 A B
8657 0 1 2
8658 1 0 3
8659
8660 Using `fill_value` fills Nones prior to passing the column to the
8661 merge function.
8662
8663 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
8664 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
8665 >>> df1.combine(df2, take_smaller, fill_value=-5)
8666 A B
8667 0 0 -5.0
8668 1 0 4.0
8669
8670         However, if the same element in both dataframes is None, that None
8671         is preserved.
8672
8673 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
8674 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]})
8675 >>> df1.combine(df2, take_smaller, fill_value=-5)
8676 A B
8677 0 0 -5.0
8678 1 0 3.0
8679
8680         Example that demonstrates the use of `overwrite` and behavior when
8681         the axes differ between the dataframes.
8682
8683 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
8684 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2])
8685 >>> df1.combine(df2, take_smaller)
8686 A B C
8687 0 NaN NaN NaN
8688 1 NaN 3.0 -10.0
8689 2 NaN 3.0 1.0
8690
8691 >>> df1.combine(df2, take_smaller, overwrite=False)
8692 A B C
8693 0 0.0 NaN NaN
8694 1 0.0 3.0 -10.0
8695 2 NaN 3.0 1.0
8696
8697         Demonstrating the preference of the passed-in dataframe.
8698
8699 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2])
8700 >>> df2.combine(df1, take_smaller)
8701 A B C
8702 0 0.0 NaN NaN
8703 1 0.0 3.0 NaN
8704 2 NaN 3.0 NaN
8705
8706 >>> df2.combine(df1, take_smaller, overwrite=False)
8707 A B C
8708 0 0.0 NaN NaN
8709 1 0.0 3.0 1.0
8710 2 NaN 3.0 1.0
8711 """
8712 other_idxlen = len(other.index) # save for compare
8713
8714 this, other = self.align(other, copy=False)
8715 new_index = this.index
8716
8717 if other.empty and len(new_index) == len(self.index):
8718 return self.copy()
8719
8720 if self.empty and len(other) == other_idxlen:
8721 return other.copy()
8722
8723 # sorts if possible; otherwise align above ensures that these are set-equal
8724 new_columns = this.columns.union(other.columns)
8725 do_fill = fill_value is not None
8726 result = {}
8727 for col in new_columns:
8728 series = this[col]
8729 other_series = other[col]
8730
8731 this_dtype = series.dtype
8732 other_dtype = other_series.dtype
8733
8734 this_mask = isna(series)
8735 other_mask = isna(other_series)
8736
8737 # don't overwrite columns unnecessarily
8738 # DO propagate if this column is not in the intersection
8739 if not overwrite and other_mask.all():
8740 result[col] = this[col].copy()
8741 continue
8742
8743 if do_fill:
8744 series = series.copy()
8745 other_series = other_series.copy()
8746 series[this_mask] = fill_value
8747 other_series[other_mask] = fill_value
8748
8749 if col not in self.columns:
8750 # If self DataFrame does not have col in other DataFrame,
8751 # try to promote series, which is all NaN, as other_dtype.
8752 new_dtype = other_dtype
8753 try:
8754 series = series.astype(new_dtype, copy=False)
8755 except ValueError:
8756 # e.g. new_dtype is integer types
8757 pass
8758 else:
8759 # if we have different dtypes, possibly promote
8760 new_dtype = find_common_type([this_dtype, other_dtype])
8761 series = series.astype(new_dtype, copy=False)
8762 other_series = other_series.astype(new_dtype, copy=False)
8763
8764 arr = func(series, other_series)
8765 if isinstance(new_dtype, np.dtype):
8766 # if new_dtype is an EA Dtype, then `func` is expected to return
8767 # the correct dtype without any additional casting
8768 # error: No overload variant of "maybe_downcast_to_dtype" matches
8769 # argument types "Union[Series, Hashable]", "dtype[Any]"
8770 arr = maybe_downcast_to_dtype( # type: ignore[call-overload]
8771 arr, new_dtype
8772 )
8773
8774 result[col] = arr
8775
8776 # convert_objects just in case
8777 frame_result = self._constructor(result, index=new_index, columns=new_columns)
8778 return frame_result.__finalize__(self, method="combine")
8779
8780 def combine_first(self, other: DataFrame) -> DataFrame:
8781 """
8782 Update null elements with value in the same location in `other`.
8783
8784 Combine two DataFrame objects by filling null values in one DataFrame
8785         with non-null values from the other DataFrame. The row and column indexes
8786         of the resulting DataFrame will be the union of the two. When calling
8787         ``first.combine_first(second)``, the resulting dataframe keeps the
8788         values of `first` and overrides the values of `second` wherever both
8789         first.loc[index, col] and second.loc[index, col] are not missing
8790         values.
8791
8792 Parameters
8793 ----------
8794 other : DataFrame
8795 Provided DataFrame to use to fill null values.
8796
8797 Returns
8798 -------
8799 DataFrame
8800 The result of combining the provided DataFrame with the other object.
8801
8802 See Also
8803 --------
8804 DataFrame.combine : Perform series-wise operation on two DataFrames
8805 using a given function.
8806
8807 Examples
8808 --------
8809 >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
8810 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
8811 >>> df1.combine_first(df2)
8812 A B
8813 0 1.0 3.0
8814 1 0.0 4.0
8815
8816         Null values still persist if the location of that null value
8817         does not exist in `other`.
8818
8819 >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]})
8820 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
8821 >>> df1.combine_first(df2)
8822 A B C
8823 0 NaN 4.0 NaN
8824 1 0.0 3.0 1.0
8825 2 NaN 3.0 1.0
8826 """
8827 from pandas.core.computation import expressions
8828
8829 def combiner(x: Series, y: Series):
8830 mask = x.isna()._values
8831
8832 x_values = x._values
8833 y_values = y._values
8834
8835 # If the column y in other DataFrame is not in first DataFrame,
8836 # just return y_values.
8837 if y.name not in self.columns:
8838 return y_values
8839
8840 return expressions.where(mask, y_values, x_values)
8841
8842 if len(other) == 0:
8843 combined = self.reindex(
8844 self.columns.append(other.columns.difference(self.columns)), axis=1
8845 )
8846 combined = combined.astype(other.dtypes)
8847 else:
8848 combined = self.combine(other, combiner, overwrite=False)
8849
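        # Columns shared by both frames may have been upcast during the combine
        # (e.g. int to float when NaNs were introduced); cast those back to the
        # common dtype of the two inputs.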
8850 dtypes = {
8851 col: find_common_type([self.dtypes[col], other.dtypes[col]])
8852 for col in self.columns.intersection(other.columns)
8853 if combined.dtypes[col] != self.dtypes[col]
8854 }
8855
8856 if dtypes:
8857 combined = combined.astype(dtypes)
8858
8859 return combined.__finalize__(self, method="combine_first")
8860
8861 def update(
8862 self,
8863 other,
8864 join: UpdateJoin = "left",
8865 overwrite: bool = True,
8866 filter_func=None,
8867 errors: IgnoreRaise = "ignore",
8868 ) -> None:
8869 """
8870 Modify in place using non-NA values from another DataFrame.
8871
8872 Aligns on indices. There is no return value.
8873
8874 Parameters
8875 ----------
8876 other : DataFrame, or object coercible into a DataFrame
8877 Should have at least one matching index/column label
8878 with the original DataFrame. If a Series is passed,
8879 its name attribute must be set, and that will be
8880 used as the column name to align with the original DataFrame.
8881 join : {'left'}, default 'left'
8882 Only left join is implemented, keeping the index and columns of the
8883 original object.
8884 overwrite : bool, default True
8885 How to handle non-NA values for overlapping keys:
8886
8887 * True: overwrite original DataFrame's values
8888 with values from `other`.
8889 * False: only update values that are NA in
8890 the original DataFrame.
8891
8892 filter_func : callable(1d-array) -> bool 1d-array, optional
8893 Can choose to replace values other than NA. Return True for values
8894 that should be updated.
8895 errors : {'raise', 'ignore'}, default 'ignore'
8896 If 'raise', will raise a ValueError if the DataFrame and `other`
8897 both contain non-NA data in the same place.
8898
8899 Returns
8900 -------
8901 None
8902 This method directly changes calling object.
8903
8904 Raises
8905 ------
8906 ValueError
8907 * When `errors='raise'` and there's overlapping non-NA data.
8908 * When `errors` is not either `'ignore'` or `'raise'`
8909 NotImplementedError
8910 * If `join != 'left'`
8911
8912 See Also
8913 --------
8914 dict.update : Similar method for dictionaries.
8915 DataFrame.merge : For column(s)-on-column(s) operations.
8916
8917 Examples
8918 --------
8919 >>> df = pd.DataFrame({'A': [1, 2, 3],
8920 ... 'B': [400, 500, 600]})
8921 >>> new_df = pd.DataFrame({'B': [4, 5, 6],
8922 ... 'C': [7, 8, 9]})
8923 >>> df.update(new_df)
8924 >>> df
8925 A B
8926 0 1 4
8927 1 2 5
8928 2 3 6
8929
8930         The DataFrame's length does not increase as a result of the update;
8931 only values at matching index/column labels are updated.
8932
8933 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
8934 ... 'B': ['x', 'y', 'z']})
8935 >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
8936 >>> df.update(new_df)
8937 >>> df
8938 A B
8939 0 a d
8940 1 b e
8941 2 c f
8942
8943 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
8944 ... 'B': ['x', 'y', 'z']})
8945 >>> new_df = pd.DataFrame({'B': ['d', 'f']}, index=[0, 2])
8946 >>> df.update(new_df)
8947 >>> df
8948 A B
8949 0 a d
8950 1 b y
8951 2 c f
8952
8953 For Series, its name attribute must be set.
8954
8955 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
8956 ... 'B': ['x', 'y', 'z']})
8957 >>> new_column = pd.Series(['d', 'e', 'f'], name='B')
8958 >>> df.update(new_column)
8959 >>> df
8960 A B
8961 0 a d
8962 1 b e
8963 2 c f
8964
8965         If `other` contains NaNs, the corresponding values are not updated
8966 in the original dataframe.
8967
8968 >>> df = pd.DataFrame({'A': [1, 2, 3],
8969 ... 'B': [400., 500., 600.]})
8970 >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
8971 >>> df.update(new_df)
8972 >>> df
8973 A B
8974 0 1 4.0
8975 1 2 500.0
8976 2 3 6.0
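
        With ``errors='raise'``, overlapping non-NA values raise instead of
        being silently overwritten:

        >>> df = pd.DataFrame({'A': [1, 2, 3]})
        >>> new_df = pd.DataFrame({'A': [4, 5, 6]})
        >>> df.update(new_df, errors='raise')
        Traceback (most recent call last):
        ...
        ValueError: Data overlaps.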
8977 """
8978
8979 if not PYPY and using_copy_on_write():
8980 if sys.getrefcount(self) <= REF_COUNT:
8981 warnings.warn(
8982 _chained_assignment_method_msg,
8983 ChainedAssignmentError,
8984 stacklevel=2,
8985 )
8986 elif not PYPY and not using_copy_on_write() and self._is_view_after_cow_rules():
8987 if sys.getrefcount(self) <= REF_COUNT:
8988 warnings.warn(
8989 _chained_assignment_warning_method_msg,
8990 FutureWarning,
8991 stacklevel=2,
8992 )
8993
8994 # TODO: Support other joins
8995 if join != "left": # pragma: no cover
8996 raise NotImplementedError("Only left join is supported")
8997 if errors not in ["ignore", "raise"]:
8998 raise ValueError("The parameter errors must be either 'ignore' or 'raise'")
8999
9000 if not isinstance(other, DataFrame):
9001 other = DataFrame(other)
9002
9003 other = other.reindex(self.index)
9004
9005 for col in self.columns.intersection(other.columns):
9006 this = self[col]._values
9007 that = other[col]._values
9008
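            # `mask` marks positions where the original value is kept; entries
            # where it is False are replaced with `that` via Series.where below.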
9009 if filter_func is not None:
9010 mask = ~filter_func(this) | isna(that)
9011 else:
9012 if errors == "raise":
9013 mask_this = notna(that)
9014 mask_that = notna(this)
9015 if any(mask_this & mask_that):
9016 raise ValueError("Data overlaps.")
9017
9018 if overwrite:
9019 mask = isna(that)
9020 else:
9021 mask = notna(this)
9022
9023 # don't overwrite columns unnecessarily
9024 if mask.all():
9025 continue
9026
9027 with warnings.catch_warnings():
9028 warnings.filterwarnings(
9029 "ignore",
9030 message="Downcasting behavior",
9031 category=FutureWarning,
9032 )
9033 # GH#57124 - `that` might get upcasted because of NA values, and then
9034 # downcasted in where because of the mask. Ignoring the warning
9035 # is a stopgap, will replace with a new implementation of update
9036 # in 3.0.
9037 self.loc[:, col] = self[col].where(mask, that)
9038
9039 # ----------------------------------------------------------------------
9040 # Data reshaping
9041 @Appender(
9042 dedent(
9043 """
9044 Examples
9045 --------
9046 >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
9047 ... 'Parrot', 'Parrot'],
9048 ... 'Max Speed': [380., 370., 24., 26.]})
9049 >>> df
9050 Animal Max Speed
9051 0 Falcon 380.0
9052 1 Falcon 370.0
9053 2 Parrot 24.0
9054 3 Parrot 26.0
9055 >>> df.groupby(['Animal']).mean()
9056 Max Speed
9057 Animal
9058 Falcon 375.0
9059 Parrot 25.0
9060
9061 **Hierarchical Indexes**
9062
9063 We can groupby different levels of a hierarchical index
9064 using the `level` parameter:
9065
9066 >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
9067 ... ['Captive', 'Wild', 'Captive', 'Wild']]
9068 >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
9069 >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]},
9070 ... index=index)
9071 >>> df
9072 Max Speed
9073 Animal Type
9074 Falcon Captive 390.0
9075 Wild 350.0
9076 Parrot Captive 30.0
9077 Wild 20.0
9078 >>> df.groupby(level=0).mean()
9079 Max Speed
9080 Animal
9081 Falcon 370.0
9082 Parrot 25.0
9083 >>> df.groupby(level="Type").mean()
9084 Max Speed
9085 Type
9086 Captive 210.0
9087 Wild 185.0
9088
9089         We can also choose to include NA in group keys or not by setting the
9090         `dropna` parameter; the default setting is `True`.
9091
9092 >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
9093 >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
9094
9095 >>> df.groupby(by=["b"]).sum()
9096 a c
9097 b
9098 1.0 2 3
9099 2.0 2 5
9100
9101 >>> df.groupby(by=["b"], dropna=False).sum()
9102 a c
9103 b
9104 1.0 2 3
9105 2.0 2 5
9106 NaN 1 4
9107
9108 >>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
9109 >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
9110
9111 >>> df.groupby(by="a").sum()
9112 b c
9113 a
9114 a 13.0 13.0
9115 b 12.3 123.0
9116
9117 >>> df.groupby(by="a", dropna=False).sum()
9118 b c
9119 a
9120 a 13.0 13.0
9121 b 12.3 123.0
9122 NaN 12.3 33.0
9123
9124 When using ``.apply()``, use ``group_keys`` to include or exclude the
9125 group keys. The ``group_keys`` argument defaults to ``True`` (include).
9126
9127 >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
9128 ... 'Parrot', 'Parrot'],
9129 ... 'Max Speed': [380., 370., 24., 26.]})
9130 >>> df.groupby("Animal", group_keys=True)[['Max Speed']].apply(lambda x: x)
9131 Max Speed
9132 Animal
9133 Falcon 0 380.0
9134 1 370.0
9135 Parrot 2 24.0
9136 3 26.0
9137
9138 >>> df.groupby("Animal", group_keys=False)[['Max Speed']].apply(lambda x: x)
9139 Max Speed
9140 0 380.0
9141 1 370.0
9142 2 24.0
9143 3 26.0
9144 """
9145 )
9146 )
9147 @Appender(_shared_docs["groupby"] % _shared_doc_kwargs)
9148 def groupby(
9149 self,
9150 by=None,
9151 axis: Axis | lib.NoDefault = lib.no_default,
9152 level: IndexLabel | None = None,
9153 as_index: bool = True,
9154 sort: bool = True,
9155 group_keys: bool = True,
9156 observed: bool | lib.NoDefault = lib.no_default,
9157 dropna: bool = True,
9158 ) -> DataFrameGroupBy:
9159 if axis is not lib.no_default:
9160 axis = self._get_axis_number(axis)
9161 if axis == 1:
9162 warnings.warn(
9163 "DataFrame.groupby with axis=1 is deprecated. Do "
9164 "`frame.T.groupby(...)` without axis instead.",
9165 FutureWarning,
9166 stacklevel=find_stack_level(),
9167 )
9168 else:
9169 warnings.warn(
9170 "The 'axis' keyword in DataFrame.groupby is deprecated and "
9171 "will be removed in a future version.",
9172 FutureWarning,
9173 stacklevel=find_stack_level(),
9174 )
9175 else:
9176 axis = 0
9177
9178 from pandas.core.groupby.generic import DataFrameGroupBy
9179
9180 if level is None and by is None:
9181 raise TypeError("You have to supply one of 'by' and 'level'")
9182
9183 return DataFrameGroupBy(
9184 obj=self,
9185 keys=by,
9186 axis=axis,
9187 level=level,
9188 as_index=as_index,
9189 sort=sort,
9190 group_keys=group_keys,
9191 observed=observed,
9192 dropna=dropna,
9193 )
9194
9195 _shared_docs[
9196 "pivot"
9197 ] = """
9198 Return reshaped DataFrame organized by given index / column values.
9199
9200 Reshape data (produce a "pivot" table) based on column values. Uses
9201 unique values from specified `index` / `columns` to form axes of the
9202 resulting DataFrame. This function does not support data
9203     aggregation; multiple values will result in a MultiIndex in the
9204 columns. See the :ref:`User Guide <reshaping>` for more on reshaping.
9205
9206 Parameters
9207 ----------%s
9208 columns : str or object or a list of str
9209 Column to use to make new frame's columns.
9210 index : str or object or a list of str, optional
9211 Column to use to make new frame's index. If not given, uses existing index.
9212 values : str, object or a list of the previous, optional
9213 Column(s) to use for populating new frame's values. If not
9214 specified, all remaining columns will be used and the result will
9215 have hierarchically indexed columns.
9216
9217 Returns
9218 -------
9219 DataFrame
9220 Returns reshaped DataFrame.
9221
9222 Raises
9223 ------
9224 ValueError:
9225 When there are any `index`, `columns` combinations with multiple
            values. Use `DataFrame.pivot_table` when you need to aggregate.
9227
9228 See Also
9229 --------
9230 DataFrame.pivot_table : Generalization of pivot that can handle
9231 duplicate values for one index/column pair.
9232 DataFrame.unstack : Pivot based on the index values instead of a
9233 column.
9234 wide_to_long : Wide panel to long format. Less flexible but more
9235 user-friendly than melt.
9236
9237 Notes
9238 -----
9239 For finer-tuned control, see hierarchical indexing documentation along
9240 with the related stack/unstack methods.
9241
9242 Reference :ref:`the user guide <reshaping.pivot>` for more examples.
9243
9244 Examples
9245 --------
9246 >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
9247 ... 'two'],
9248 ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
9249 ... 'baz': [1, 2, 3, 4, 5, 6],
9250 ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
9251 >>> df
9252 foo bar baz zoo
9253 0 one A 1 x
9254 1 one B 2 y
9255 2 one C 3 z
9256 3 two A 4 q
9257 4 two B 5 w
9258 5 two C 6 t
9259
9260 >>> df.pivot(index='foo', columns='bar', values='baz')
9261 bar A B C
9262 foo
9263 one 1 2 3
9264 two 4 5 6
9265
9266 >>> df.pivot(index='foo', columns='bar')['baz']
9267 bar A B C
9268 foo
9269 one 1 2 3
9270 two 4 5 6
9271
9272 >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
9273 baz zoo
9274 bar A B C A B C
9275 foo
9276 one 1 2 3 x y z
9277 two 4 5 6 q w t
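
        When ``index`` is not given, the existing index is used. As a small
        sketch, making ``'foo'`` the index first gives the same result as
        passing ``index='foo'`` above:

        >>> df.set_index('foo').pivot(columns='bar', values='baz')
        bar  A  B  C
        foo
        one  1  2  3
        two  4  5  6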
9278
9279 You could also assign a list of column names or a list of index names.
9280
9281 >>> df = pd.DataFrame({
9282 ... "lev1": [1, 1, 1, 2, 2, 2],
9283 ... "lev2": [1, 1, 2, 1, 1, 2],
9284 ... "lev3": [1, 2, 1, 2, 1, 2],
9285 ... "lev4": [1, 2, 3, 4, 5, 6],
9286 ... "values": [0, 1, 2, 3, 4, 5]})
9287 >>> df
9288 lev1 lev2 lev3 lev4 values
9289 0 1 1 1 1 0
9290 1 1 1 2 2 1
9291 2 1 2 1 3 2
9292 3 2 1 2 4 3
9293 4 2 1 1 5 4
9294 5 2 2 2 6 5
9295
9296 >>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values")
9297 lev2 1 2
9298 lev3 1 2 1 2
9299 lev1
9300 1 0.0 1.0 2.0 NaN
9301 2 4.0 3.0 NaN 5.0
9302
9303 >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values")
9304 lev3 1 2
9305 lev1 lev2
9306 1 1 0.0 1.0
9307 2 2.0 NaN
9308 2 1 4.0 3.0
9309 2 NaN 5.0
9310
9311 A ValueError is raised if there are any duplicates.
9312
9313 >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
9314 ... "bar": ['A', 'A', 'B', 'C'],
9315 ... "baz": [1, 2, 3, 4]})
9316 >>> df
9317 foo bar baz
9318 0 one A 1
9319 1 one A 2
9320 2 two B 3
9321 3 two C 4
9322
9323 Notice that the first two rows are the same for our `index`
9324 and `columns` arguments.
9325
9326 >>> df.pivot(index='foo', columns='bar', values='baz')
9327 Traceback (most recent call last):
9328 ...
9329 ValueError: Index contains duplicate entries, cannot reshape
9330 """
9331
9332 @Substitution("")
9333 @Appender(_shared_docs["pivot"])
9334 def pivot(
9335 self, *, columns, index=lib.no_default, values=lib.no_default
9336 ) -> DataFrame:
9337 from pandas.core.reshape.pivot import pivot
9338
9339 return pivot(self, index=index, columns=columns, values=values)
9340
9341 _shared_docs[
9342 "pivot_table"
9343 ] = """
9344 Create a spreadsheet-style pivot table as a DataFrame.
9345
9346 The levels in the pivot table will be stored in MultiIndex objects
9347 (hierarchical indexes) on the index and columns of the result DataFrame.
9348
9349 Parameters
9350 ----------%s
9351 values : list-like or scalar, optional
9352 Column or columns to aggregate.
9353 index : column, Grouper, array, or list of the previous
9354 Keys to group by on the pivot table index. If a list is passed,
9355 it can contain any of the other types (except list). If an array is
9356 passed, it must be the same length as the data and will be used in
9357 the same manner as column values.
9358 columns : column, Grouper, array, or list of the previous
9359 Keys to group by on the pivot table column. If a list is passed,
9360 it can contain any of the other types (except list). If an array is
9361 passed, it must be the same length as the data and will be used in
9362 the same manner as column values.
9363 aggfunc : function, list of functions, dict, default "mean"
9364 If a list of functions is passed, the resulting pivot table will have
9365 hierarchical columns whose top level are the function names
9366 (inferred from the function objects themselves).
9367 If a dict is passed, the key is column to aggregate and the value is
            function or list of functions. If ``margins=True``, aggfunc will be
9369 used to calculate the partial aggregates.
9370 fill_value : scalar, default None
9371 Value to replace missing values with (in the resulting pivot table,
9372 after aggregation).
9373 margins : bool, default False
9374 If ``margins=True``, special ``All`` columns and rows
9375 will be added with partial group aggregates across the categories
9376 on the rows and columns.
9377 dropna : bool, default True
9378 Do not include columns whose entries are all NaN. If True,
9379 rows with a NaN value in any column will be omitted before
9380 computing margins.
9381 margins_name : str, default 'All'
9382 Name of the row / column that will contain the totals
9383 when margins is True.
9384 observed : bool, default False
9385 This only applies if any of the groupers are Categoricals.
9386 If True: only show observed values for categorical groupers.
9387 If False: show all values for categorical groupers.
9388
9389 .. deprecated:: 2.2.0
9390
9391 The default value of ``False`` is deprecated and will change to
9392 ``True`` in a future version of pandas.
9393
9394 sort : bool, default True
9395 Specifies if the result should be sorted.
9396
9397 .. versionadded:: 1.3.0
9398
9399 Returns
9400 -------
9401 DataFrame
9402 An Excel style pivot table.
9403
9404 See Also
9405 --------
9406 DataFrame.pivot : Pivot without aggregation that can handle
9407 non-numeric data.
        DataFrame.melt : Unpivot a DataFrame from wide to long format,
9409 optionally leaving identifiers set.
9410 wide_to_long : Wide panel to long format. Less flexible but more
9411 user-friendly than melt.
9412
9413 Notes
9414 -----
9415 Reference :ref:`the user guide <reshaping.pivot>` for more examples.
9416
9417 Examples
9418 --------
9419 >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
9420 ... "bar", "bar", "bar", "bar"],
9421 ... "B": ["one", "one", "one", "two", "two",
9422 ... "one", "one", "two", "two"],
9423 ... "C": ["small", "large", "large", "small",
9424 ... "small", "large", "small", "small",
9425 ... "large"],
9426 ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
9427 ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
9428 >>> df
9429 A B C D E
9430 0 foo one small 1 2
9431 1 foo one large 2 4
9432 2 foo one large 2 5
9433 3 foo two small 3 5
9434 4 foo two small 3 6
9435 5 bar one large 4 6
9436 6 bar one small 5 8
9437 7 bar two small 6 9
9438 8 bar two large 7 9
9439
9440 This first example aggregates values by taking the sum.
9441
9442 >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
9443 ... columns=['C'], aggfunc="sum")
9444 >>> table
9445 C large small
9446 A B
9447 bar one 4.0 5.0
9448 two 7.0 6.0
9449 foo one 4.0 1.0
9450 two NaN 6.0
9451
9452 We can also fill missing values using the `fill_value` parameter.
9453
9454 >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
9455 ... columns=['C'], aggfunc="sum", fill_value=0)
9456 >>> table
9457 C large small
9458 A B
9459 bar one 4 5
9460 two 7 6
9461 foo one 4 1
9462 two 0 6
9463
9464 The next example aggregates by taking the mean across multiple columns.
9465
9466 >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
9467 ... aggfunc={'D': "mean", 'E': "mean"})
9468 >>> table
9469 D E
9470 A C
9471 bar large 5.500000 7.500000
9472 small 5.500000 8.500000
9473 foo large 2.000000 4.500000
9474 small 2.333333 4.333333
9475
9476 We can also calculate multiple types of aggregations for any given
9477 value column.
9478
9479 >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
9480 ... aggfunc={'D': "mean",
9481 ... 'E': ["min", "max", "mean"]})
9482 >>> table
9483 D E
9484 mean max mean min
9485 A C
9486 bar large 5.500000 9 7.500000 6
9487 small 5.500000 9 8.500000 8
9488 foo large 2.000000 5 4.500000 4
9489 small 2.333333 6 4.333333 2
9490 """
9491
9492 @Substitution("")
9493 @Appender(_shared_docs["pivot_table"])
9494 def pivot_table(
9495 self,
9496 values=None,
9497 index=None,
9498 columns=None,
9499 aggfunc: AggFuncType = "mean",
9500 fill_value=None,
9501 margins: bool = False,
9502 dropna: bool = True,
9503 margins_name: Level = "All",
9504 observed: bool | lib.NoDefault = lib.no_default,
9505 sort: bool = True,
9506 ) -> DataFrame:
9507 from pandas.core.reshape.pivot import pivot_table
9508
9509 return pivot_table(
9510 self,
9511 values=values,
9512 index=index,
9513 columns=columns,
9514 aggfunc=aggfunc,
9515 fill_value=fill_value,
9516 margins=margins,
9517 dropna=dropna,
9518 margins_name=margins_name,
9519 observed=observed,
9520 sort=sort,
9521 )
9522
9523 def stack(
9524 self,
9525 level: IndexLabel = -1,
9526 dropna: bool | lib.NoDefault = lib.no_default,
9527 sort: bool | lib.NoDefault = lib.no_default,
9528 future_stack: bool = False,
9529 ):
9530 """
9531 Stack the prescribed level(s) from columns to index.
9532
9533 Return a reshaped DataFrame or Series having a multi-level
9534 index with one or more new inner-most levels compared to the current
9535 DataFrame. The new inner-most levels are created by pivoting the
9536 columns of the current dataframe:
9537
9538 - if the columns have a single level, the output is a Series;
9539 - if the columns have multiple levels, the new index
          level(s) are taken from the prescribed level(s) and
9541 the output is a DataFrame.
9542
9543 Parameters
9544 ----------
9545 level : int, str, list, default -1
9546 Level(s) to stack from the column axis onto the index
9547 axis, defined as one index or label, or a list of indices
9548 or labels.
9549 dropna : bool, default True
9550 Whether to drop rows in the resulting Frame/Series with
9551 missing values. Stacking a column level onto the index
9552 axis can create combinations of index and column values
9553 that are missing from the original dataframe. See Examples
9554 section.
9555 sort : bool, default True
9556 Whether to sort the levels of the resulting MultiIndex.
9557 future_stack : bool, default False
9558 Whether to use the new implementation that will replace the current
9559 implementation in pandas 3.0. When True, dropna and sort have no impact
9560 on the result and must remain unspecified. See :ref:`pandas 2.1.0 Release
9561 notes <whatsnew_210.enhancements.new_stack>` for more details.
9562
9563 Returns
9564 -------
9565 DataFrame or Series
9566 Stacked dataframe or series.
9567
9568 See Also
9569 --------
9570 DataFrame.unstack : Unstack prescribed level(s) from index axis
9571 onto column axis.
9572 DataFrame.pivot : Reshape dataframe from long format to wide
9573 format.
9574 DataFrame.pivot_table : Create a spreadsheet-style pivot table
9575 as a DataFrame.
9576
9577 Notes
9578 -----
9579 The function is named by analogy with a collection of books
9580 being reorganized from being side by side on a horizontal
9581 position (the columns of the dataframe) to being stacked
9582 vertically on top of each other (in the index of the
9583 dataframe).
9584
9585 Reference :ref:`the user guide <reshaping.stacking>` for more examples.
9586
9587 Examples
9588 --------
9589 **Single level columns**
9590
9591 >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
9592 ... index=['cat', 'dog'],
9593 ... columns=['weight', 'height'])
9594
9595 Stacking a dataframe with a single level column axis returns a Series:
9596
9597 >>> df_single_level_cols
9598 weight height
9599 cat 0 1
9600 dog 2 3
9601 >>> df_single_level_cols.stack(future_stack=True)
9602 cat weight 0
9603 height 1
9604 dog weight 2
9605 height 3
9606 dtype: int64
9607
9608 **Multi level columns: simple case**
9609
9610 >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
9611 ... ('weight', 'pounds')])
9612 >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],
9613 ... index=['cat', 'dog'],
9614 ... columns=multicol1)
9615
9616 Stacking a dataframe with a multi-level column axis:
9617
9618 >>> df_multi_level_cols1
9619 weight
9620 kg pounds
9621 cat 1 2
9622 dog 2 4
9623 >>> df_multi_level_cols1.stack(future_stack=True)
9624 weight
9625 cat kg 1
9626 pounds 2
9627 dog kg 2
9628 pounds 4
9629
9630 **Missing values**
9631
9632 >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
9633 ... ('height', 'm')])
9634 >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
9635 ... index=['cat', 'dog'],
9636 ... columns=multicol2)
9637
9638 It is common to have missing values when stacking a dataframe
9639 with multi-level columns, as the stacked dataframe typically
9640 has more values than the original dataframe. Missing values
9641 are filled with NaNs:
9642
9643 >>> df_multi_level_cols2
9644 weight height
9645 kg m
9646 cat 1.0 2.0
9647 dog 3.0 4.0
9648 >>> df_multi_level_cols2.stack(future_stack=True)
9649 weight height
9650 cat kg 1.0 NaN
9651 m NaN 2.0
9652 dog kg 3.0 NaN
9653 m NaN 4.0
9654
9655 **Prescribing the level(s) to be stacked**
9656
9657 The first parameter controls which level or levels are stacked:
9658
9659 >>> df_multi_level_cols2.stack(0, future_stack=True)
9660 kg m
9661 cat weight 1.0 NaN
9662 height NaN 2.0
9663 dog weight 3.0 NaN
9664 height NaN 4.0
9665 >>> df_multi_level_cols2.stack([0, 1], future_stack=True)
9666 cat weight kg 1.0
9667 height m 2.0
9668 dog weight kg 3.0
9669 height m 4.0
9670 dtype: float64
9671 """
9672 if not future_stack:
9673 from pandas.core.reshape.reshape import (
9674 stack,
9675 stack_multiple,
9676 )
9677
9678 if (
9679 dropna is not lib.no_default
9680 or sort is not lib.no_default
9681 or self.columns.nlevels > 1
9682 ):
9683 warnings.warn(
9684 "The previous implementation of stack is deprecated and will be "
9685 "removed in a future version of pandas. See the What's New notes "
9686 "for pandas 2.1.0 for details. Specify future_stack=True to adopt "
9687 "the new implementation and silence this warning.",
9688 FutureWarning,
9689 stacklevel=find_stack_level(),
9690 )
9691
9692 if dropna is lib.no_default:
9693 dropna = True
9694 if sort is lib.no_default:
9695 sort = True
9696
9697 if isinstance(level, (tuple, list)):
9698 result = stack_multiple(self, level, dropna=dropna, sort=sort)
9699 else:
9700 result = stack(self, level, dropna=dropna, sort=sort)
9701 else:
9702 from pandas.core.reshape.reshape import stack_v3
9703
9704 if dropna is not lib.no_default:
9705 raise ValueError(
9706 "dropna must be unspecified with future_stack=True as the new "
9707 "implementation does not introduce rows of NA values. This "
9708 "argument will be removed in a future version of pandas."
9709 )
9710
9711 if sort is not lib.no_default:
9712 raise ValueError(
9713 "Cannot specify sort with future_stack=True, this argument will be "
9714 "removed in a future version of pandas. Sort the result using "
9715 ".sort_index instead."
9716 )
9717
9718 if (
9719 isinstance(level, (tuple, list))
9720 and not all(lev in self.columns.names for lev in level)
9721 and not all(isinstance(lev, int) for lev in level)
9722 ):
9723 raise ValueError(
9724 "level should contain all level names or all level "
9725 "numbers, not a mixture of the two."
9726 )
9727
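            # Translate ``level`` into a list of positional level numbers
            # before dispatching to the new implementation.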
9728 if not isinstance(level, (tuple, list)):
9729 level = [level]
9730 level = [self.columns._get_level_number(lev) for lev in level]
9731 result = stack_v3(self, level)
9732
9733 return result.__finalize__(self, method="stack")
9734
9735 def explode(
9736 self,
9737 column: IndexLabel,
9738 ignore_index: bool = False,
9739 ) -> DataFrame:
9740 """
9741 Transform each element of a list-like to a row, replicating index values.
9742
9743 Parameters
9744 ----------
9745 column : IndexLabel
9746 Column(s) to explode.
            For multiple columns, specify a non-empty list in which each
            element is a str or tuple, and the list-like data in all
            specified columns must have matching lengths on each row of
            the frame.
9750
9751 .. versionadded:: 1.3.0
9752 Multi-column explode
9753
9754 ignore_index : bool, default False
9755 If True, the resulting index will be labeled 0, 1, …, n - 1.
9756
9757 Returns
9758 -------
9759 DataFrame
9760 Exploded lists to rows of the subset columns;
9761 index will be duplicated for these rows.
9762
9763 Raises
9764 ------
9765 ValueError :
9766 * If columns of the frame are not unique.
            * If the specified columns to explode form an empty list.
            * If the specified columns to explode do not have matching
              counts of elements rowwise in the frame.
9770
9771 See Also
9772 --------
9773 DataFrame.unstack : Pivot a level of the (necessarily hierarchical)
9774 index labels.
9775 DataFrame.melt : Unpivot a DataFrame from wide format to long format.
        Series.explode : Transform each element of a list-like to a row.
9777
9778 Notes
9779 -----
9780 This routine will explode list-likes including lists, tuples, sets,
9781 Series, and np.ndarray. The result dtype of the subset rows will
9782 be object. Scalars will be returned unchanged, and empty list-likes will
9783 result in a np.nan for that row. In addition, the ordering of rows in the
9784 output will be non-deterministic when exploding sets.
9785
9786 Reference :ref:`the user guide <reshaping.explode>` for more examples.
9787
9788 Examples
9789 --------
9790 >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]],
9791 ... 'B': 1,
9792 ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]})
9793 >>> df
9794 A B C
9795 0 [0, 1, 2] 1 [a, b, c]
9796 1 foo 1 NaN
9797 2 [] 1 []
9798 3 [3, 4] 1 [d, e]
9799
9800 Single-column explode.
9801
9802 >>> df.explode('A')
9803 A B C
9804 0 0 1 [a, b, c]
9805 0 1 1 [a, b, c]
9806 0 2 1 [a, b, c]
9807 1 foo 1 NaN
9808 2 NaN 1 []
9809 3 3 1 [d, e]
9810 3 4 1 [d, e]
9811
9812 Multi-column explode.
9813
9814 >>> df.explode(list('AC'))
9815 A B C
9816 0 0 1 a
9817 0 1 1 b
9818 0 2 1 c
9819 1 foo 1 NaN
9820 2 NaN 1 NaN
9821 3 3 1 d
9822 3 4 1 e
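
        With ``ignore_index=True`` the exploded rows are relabeled with a
        fresh ``RangeIndex`` (a quick sketch reusing the frame above):

        >>> df.explode(list('AC'), ignore_index=True)
             A  B    C
        0    0  1    a
        1    1  1    b
        2    2  1    c
        3  foo  1  NaN
        4  NaN  1  NaN
        5    3  1    d
        6    4  1    e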
9823 """
9824 if not self.columns.is_unique:
9825 duplicate_cols = self.columns[self.columns.duplicated()].tolist()
9826 raise ValueError(
9827 f"DataFrame columns must be unique. Duplicate columns: {duplicate_cols}"
9828 )
9829
9830 columns: list[Hashable]
9831 if is_scalar(column) or isinstance(column, tuple):
9832 columns = [column]
9833 elif isinstance(column, list) and all(
9834 is_scalar(c) or isinstance(c, tuple) for c in column
9835 ):
9836 if not column:
9837 raise ValueError("column must be nonempty")
9838 if len(column) > len(set(column)):
9839 raise ValueError("column must be unique")
9840 columns = column
9841 else:
9842 raise ValueError("column must be a scalar, tuple, or list thereof")
9843
9844 df = self.reset_index(drop=True)
9845 if len(columns) == 1:
9846 result = df[columns[0]].explode()
9847 else:
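            # For a multi-column explode, every listed column must hold
            # list-likes of the same length on each row (scalars and empty
            # list-likes count as length 1); otherwise rows cannot be aligned.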
9848 mylen = lambda x: len(x) if (is_list_like(x) and len(x) > 0) else 1
9849 counts0 = self[columns[0]].apply(mylen)
9850 for c in columns[1:]:
9851 if not all(counts0 == self[c].apply(mylen)):
9852 raise ValueError("columns must have matching element counts")
9853 result = DataFrame({c: df[c].explode() for c in columns})
9854 result = df.drop(columns, axis=1).join(result)
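        # Map the exploded positions back to the original index labels unless
        # the caller asked for a fresh RangeIndex via ``ignore_index=True``.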
9855 if ignore_index:
9856 result.index = default_index(len(result))
9857 else:
9858 result.index = self.index.take(result.index)
9859 result = result.reindex(columns=self.columns, copy=False)
9860
9861 return result.__finalize__(self, method="explode")
9862
9863 def unstack(self, level: IndexLabel = -1, fill_value=None, sort: bool = True):
9864 """
9865 Pivot a level of the (necessarily hierarchical) index labels.
9866
9867 Returns a DataFrame having a new level of column labels whose inner-most level
9868 consists of the pivoted index labels.
9869
9870 If the index is not a MultiIndex, the output will be a Series
9871 (the analogue of stack when the columns are not a MultiIndex).
9872
9873 Parameters
9874 ----------
9875 level : int, str, or list of these, default -1 (last level)
9876 Level(s) of index to unstack, can pass level name.
9877 fill_value : int, str or dict
9878 Replace NaN with this value if the unstack produces missing values.
9879 sort : bool, default True
9880 Sort the level(s) in the resulting MultiIndex columns.
9881
9882 Returns
9883 -------
9884 Series or DataFrame
9885
9886 See Also
9887 --------
9888 DataFrame.pivot : Pivot a table based on column values.
9889 DataFrame.stack : Pivot a level of the column labels (inverse operation
9890 from `unstack`).
9891
9892 Notes
9893 -----
9894 Reference :ref:`the user guide <reshaping.stacking>` for more examples.
9895
9896 Examples
9897 --------
9898 >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
9899 ... ('two', 'a'), ('two', 'b')])
9900 >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
9901 >>> s
9902 one a 1.0
9903 b 2.0
9904 two a 3.0
9905 b 4.0
9906 dtype: float64
9907
9908 >>> s.unstack(level=-1)
9909 a b
9910 one 1.0 2.0
9911 two 3.0 4.0
9912
9913 >>> s.unstack(level=0)
9914 one two
9915 a 1.0 3.0
9916 b 2.0 4.0
9917
9918 >>> df = s.unstack(level=0)
9919 >>> df.unstack()
9920 one a 1.0
9921 b 2.0
9922 two a 3.0
9923 b 4.0
9924 dtype: float64
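
        ``fill_value`` replaces the holes that the reshape would otherwise
        leave as NaN. A minimal sketch, reusing only the first three rows of
        ``s`` so that one combination is missing:

        >>> s.iloc[:3].unstack(fill_value=0.0)
               a    b
        one  1.0  2.0
        two  3.0  0.0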
9925 """
9926 from pandas.core.reshape.reshape import unstack
9927
9928 result = unstack(self, level, fill_value, sort)
9929
9930 return result.__finalize__(self, method="unstack")
9931
9932 @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"})
9933 def melt(
9934 self,
9935 id_vars=None,
9936 value_vars=None,
9937 var_name=None,
9938 value_name: Hashable = "value",
9939 col_level: Level | None = None,
9940 ignore_index: bool = True,
9941 ) -> DataFrame:
9942 return melt(
9943 self,
9944 id_vars=id_vars,
9945 value_vars=value_vars,
9946 var_name=var_name,
9947 value_name=value_name,
9948 col_level=col_level,
9949 ignore_index=ignore_index,
9950 ).__finalize__(self, method="melt")
9951
9952 # ----------------------------------------------------------------------
9953 # Time series-related
9954
9955 @doc(
9956 Series.diff,
9957 klass="DataFrame",
9958 extra_params="axis : {0 or 'index', 1 or 'columns'}, default 0\n "
9959 "Take difference over rows (0) or columns (1).\n",
9960 other_klass="Series",
9961 examples=dedent(
9962 """
9963 Difference with previous row
9964
9965 >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
9966 ... 'b': [1, 1, 2, 3, 5, 8],
9967 ... 'c': [1, 4, 9, 16, 25, 36]})
9968 >>> df
9969 a b c
9970 0 1 1 1
9971 1 2 1 4
9972 2 3 2 9
9973 3 4 3 16
9974 4 5 5 25
9975 5 6 8 36
9976
9977 >>> df.diff()
9978 a b c
9979 0 NaN NaN NaN
9980 1 1.0 0.0 3.0
9981 2 1.0 1.0 5.0
9982 3 1.0 1.0 7.0
9983 4 1.0 2.0 9.0
9984 5 1.0 3.0 11.0
9985
9986 Difference with previous column
9987
9988 >>> df.diff(axis=1)
9989 a b c
9990 0 NaN 0 0
9991 1 NaN -1 3
9992 2 NaN -1 7
9993 3 NaN -1 13
9994 4 NaN 0 20
9995 5 NaN 2 28
9996
9997 Difference with 3rd previous row
9998
9999 >>> df.diff(periods=3)
10000 a b c
10001 0 NaN NaN NaN
10002 1 NaN NaN NaN
10003 2 NaN NaN NaN
10004 3 3.0 2.0 15.0
10005 4 3.0 4.0 21.0
10006 5 3.0 6.0 27.0
10007
10008 Difference with following row
10009
10010 >>> df.diff(periods=-1)
10011 a b c
10012 0 -1.0 0.0 -3.0
10013 1 -1.0 -1.0 -5.0
10014 2 -1.0 -1.0 -7.0
10015 3 -1.0 -2.0 -9.0
10016 4 -1.0 -3.0 -11.0
10017 5 NaN NaN NaN
10018
10019 Overflow in input dtype
10020
10021 >>> df = pd.DataFrame({'a': [1, 0]}, dtype=np.uint8)
10022 >>> df.diff()
10023 a
10024 0 NaN
10025 1 255.0"""
10026 ),
10027 )
10028 def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame:
10029 if not lib.is_integer(periods):
10030 if not (is_float(periods) and periods.is_integer()):
10031 raise ValueError("periods must be an integer")
10032 periods = int(periods)
10033
10034 axis = self._get_axis_number(axis)
10035 if axis == 1:
10036 if periods != 0:
                # in the periods == 0 case, this is equivalent to a diff of 0 periods
10038 # along axis=0, and the Manager method may be somewhat more
10039 # performant, so we dispatch in that case.
10040 return self - self.shift(periods, axis=axis)
10041 # With periods=0 this is equivalent to a diff with axis=0
10042 axis = 0
10043
10044 new_data = self._mgr.diff(n=periods)
10045 res_df = self._constructor_from_mgr(new_data, axes=new_data.axes)
10046 return res_df.__finalize__(self, "diff")
10047
10048 # ----------------------------------------------------------------------
10049 # Function application
10050
10051 def _gotitem(
10052 self,
10053 key: IndexLabel,
10054 ndim: int,
10055 subset: DataFrame | Series | None = None,
10056 ) -> DataFrame | Series:
10057 """
10058 Sub-classes to define. Return a sliced object.
10059
10060 Parameters
10061 ----------
10062 key : string / list of selections
10063 ndim : {1, 2}
10064 requested ndim of result
10065 subset : object, default None
10066 subset to act on
10067 """
10068 if subset is None:
10069 subset = self
10070 elif subset.ndim == 1: # is Series
10071 return subset
10072
10073 # TODO: _shallow_copy(subset)?
10074 return subset[key]
10075
10076 _agg_see_also_doc = dedent(
10077 """
10078 See Also
10079 --------
10080 DataFrame.apply : Perform any type of operations.
10081 DataFrame.transform : Perform transformation type operations.
10082 pandas.DataFrame.groupby : Perform operations over groups.
10083 pandas.DataFrame.resample : Perform operations over resampled bins.
10084 pandas.DataFrame.rolling : Perform operations over rolling window.
10085 pandas.DataFrame.expanding : Perform operations over expanding window.
10086 pandas.core.window.ewm.ExponentialMovingWindow : Perform operation over exponential
10087 weighted window.
10088 """
10089 )
10090
10091 _agg_examples_doc = dedent(
10092 """
10093 Examples
10094 --------
10095 >>> df = pd.DataFrame([[1, 2, 3],
10096 ... [4, 5, 6],
10097 ... [7, 8, 9],
10098 ... [np.nan, np.nan, np.nan]],
10099 ... columns=['A', 'B', 'C'])
10100
10101 Aggregate these functions over the rows.
10102
10103 >>> df.agg(['sum', 'min'])
10104 A B C
10105 sum 12.0 15.0 18.0
10106 min 1.0 2.0 3.0
10107
10108 Different aggregations per column.
10109
10110 >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
10111 A B
10112 sum 12.0 NaN
10113 min 1.0 2.0
10114 max NaN 8.0
10115
10116 Aggregate different functions over the columns and rename the index of the resulting
10117 DataFrame.
10118
10119 >>> df.agg(x=('A', 'max'), y=('B', 'min'), z=('C', 'mean'))
10120 A B C
10121 x 7.0 NaN NaN
10122 y NaN 2.0 NaN
10123 z NaN NaN 6.0
10124
10125 Aggregate over the columns.
10126
10127 >>> df.agg("mean", axis="columns")
10128 0 2.0
10129 1 5.0
10130 2 8.0
10131 3 NaN
10132 dtype: float64
10133 """
10134 )
10135
10136 @doc(
10137 _shared_docs["aggregate"],
10138 klass=_shared_doc_kwargs["klass"],
10139 axis=_shared_doc_kwargs["axis"],
10140 see_also=_agg_see_also_doc,
10141 examples=_agg_examples_doc,
10142 )
10143 def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs):
10144 from pandas.core.apply import frame_apply
10145
10146 axis = self._get_axis_number(axis)
10147
10148 op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
10149 result = op.agg()
10150 result = reconstruct_and_relabel_result(result, func, **kwargs)
10151 return result
10152
10153 agg = aggregate
10154
10155 @doc(
10156 _shared_docs["transform"],
10157 klass=_shared_doc_kwargs["klass"],
10158 axis=_shared_doc_kwargs["axis"],
10159 )
10160 def transform(
10161 self, func: AggFuncType, axis: Axis = 0, *args, **kwargs
10162 ) -> DataFrame:
10163 from pandas.core.apply import frame_apply
10164
10165 op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
10166 result = op.transform()
10167 assert isinstance(result, DataFrame)
10168 return result
10169
10170 def apply(
10171 self,
10172 func: AggFuncType,
10173 axis: Axis = 0,
10174 raw: bool = False,
10175 result_type: Literal["expand", "reduce", "broadcast"] | None = None,
10176 args=(),
10177 by_row: Literal[False, "compat"] = "compat",
10178 engine: Literal["python", "numba"] = "python",
10179 engine_kwargs: dict[str, bool] | None = None,
10180 **kwargs,
10181 ):
10182 """
10183 Apply a function along an axis of the DataFrame.
10184
10185 Objects passed to the function are Series objects whose index is
10186 either the DataFrame's index (``axis=0``) or the DataFrame's columns
10187 (``axis=1``). By default (``result_type=None``), the final return type
10188 is inferred from the return type of the applied function. Otherwise,
10189 it depends on the `result_type` argument.
10190
10191 Parameters
10192 ----------
10193 func : function
10194 Function to apply to each column or row.
10195 axis : {0 or 'index', 1 or 'columns'}, default 0
10196 Axis along which the function is applied:
10197
10198 * 0 or 'index': apply function to each column.
10199 * 1 or 'columns': apply function to each row.
10200
10201 raw : bool, default False
10202 Determines if row or column is passed as a Series or ndarray object:
10203
10204 * ``False`` : passes each row or column as a Series to the
10205 function.
10206 * ``True`` : the passed function will receive ndarray objects
10207 instead.
10208 If you are just applying a NumPy reduction function this will
10209 achieve much better performance.
10210
10211 result_type : {'expand', 'reduce', 'broadcast', None}, default None
10212 These only act when ``axis=1`` (columns):
10213
10214 * 'expand' : list-like results will be turned into columns.
10215 * 'reduce' : returns a Series if possible rather than expanding
10216 list-like results. This is the opposite of 'expand'.
10217 * 'broadcast' : results will be broadcast to the original shape
10218 of the DataFrame, the original index and columns will be
10219 retained.
10220
10221 The default behaviour (None) depends on the return value of the
10222 applied function: list-like results will be returned as a Series
            of those. However, if the apply function returns a Series these
10224 are expanded to columns.
10225 args : tuple
10226 Positional arguments to pass to `func` in addition to the
10227 array/series.
10228 by_row : False or "compat", default "compat"
10229 Only has an effect when ``func`` is a listlike or dictlike of funcs
10230 and the func isn't a string.
10231 If "compat", will if possible first translate the func into pandas
10232 methods (e.g. ``Series().apply(np.sum)`` will be translated to
            ``Series().sum()``). If that doesn't work, will try to call apply
            again with ``by_row=True``; if that fails, will call apply again
            with ``by_row=False`` (backward compatible).
10236 If False, the funcs will be passed the whole Series at once.
10237
10238 .. versionadded:: 2.1.0
10239
10240 engine : {'python', 'numba'}, default 'python'
10241 Choose between the python (default) engine or the numba engine in apply.
10242
10243 The numba engine will attempt to JIT compile the passed function,
10244 which may result in speedups for large DataFrames.
10245 It also supports the following engine_kwargs :
10246
10247 - nopython (compile the function in nopython mode)
10248 - nogil (release the GIL inside the JIT compiled function)
10249 - parallel (try to apply the function in parallel over the DataFrame)
10250
10251 Note: Due to limitations within numba/how pandas interfaces with numba,
10252 you should only use this if raw=True
10253
10254 Note: The numba compiler only supports a subset of
10255 valid Python/numpy operations.
10256
10257 Please read more about the `supported python features
10258 <https://numba.pydata.org/numba-doc/dev/reference/pysupported.html>`_
10259 and `supported numpy features
10260 <https://numba.pydata.org/numba-doc/dev/reference/numpysupported.html>`_
10261 in numba to learn what you can or cannot use in the passed function.
10262
10263 .. versionadded:: 2.2.0
10264
10265 engine_kwargs : dict
10266 Pass keyword arguments to the engine.
10267 This is currently only used by the numba engine,
10268 see the documentation for the engine argument for more information.
10269 **kwargs
10270 Additional keyword arguments to pass as keywords arguments to
10271 `func`.
10272
10273 Returns
10274 -------
10275 Series or DataFrame
10276 Result of applying ``func`` along the given axis of the
10277 DataFrame.
10278
10279 See Also
10280 --------
10281 DataFrame.map: For elementwise operations.
10282 DataFrame.aggregate: Only perform aggregating type operations.
10283 DataFrame.transform: Only perform transforming type operations.
10284
10285 Notes
10286 -----
10287 Functions that mutate the passed object can produce unexpected
10288 behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
10289 for more details.
10290
10291 Examples
10292 --------
10293 >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
10294 >>> df
10295 A B
10296 0 4 9
10297 1 4 9
10298 2 4 9
10299
10300 Using a numpy universal function (in this case the same as
10301 ``np.sqrt(df)``):
10302
10303 >>> df.apply(np.sqrt)
10304 A B
10305 0 2.0 3.0
10306 1 2.0 3.0
10307 2 2.0 3.0
10308
10309 Using a reducing function on either axis
10310
10311 >>> df.apply(np.sum, axis=0)
10312 A 12
10313 B 27
10314 dtype: int64
10315
10316 >>> df.apply(np.sum, axis=1)
10317 0 13
10318 1 13
10319 2 13
10320 dtype: int64
10321
10322 Returning a list-like will result in a Series
10323
10324 >>> df.apply(lambda x: [1, 2], axis=1)
10325 0 [1, 2]
10326 1 [1, 2]
10327 2 [1, 2]
10328 dtype: object
10329
10330 Passing ``result_type='expand'`` will expand list-like results
10331 to columns of a Dataframe
10332
10333 >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
10334 0 1
10335 0 1 2
10336 1 1 2
10337 2 1 2
10338
10339 Returning a Series inside the function is similar to passing
10340 ``result_type='expand'``. The resulting column names
10341 will be the Series index.
10342
10343 >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
10344 foo bar
10345 0 1 2
10346 1 1 2
10347 2 1 2
10348
10349 Passing ``result_type='broadcast'`` will ensure the same shape
10350 result, whether list-like or scalar is returned by the function,
10351 and broadcast it along the axis. The resulting column names will
10352 be the originals.
10353
10354 >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
10355 A B
10356 0 1 2
10357 1 1 2
10358 2 1 2
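
        With ``raw=True`` each column or row is passed to ``func`` as a plain
        ndarray rather than a Series; for a NumPy reduction like this the
        result matches the non-raw call (a quick sketch):

        >>> df.apply(np.sum, axis=0, raw=True)
        A    12
        B    27
        dtype: int64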
10359 """
10360 from pandas.core.apply import frame_apply
10361
10362 op = frame_apply(
10363 self,
10364 func=func,
10365 axis=axis,
10366 raw=raw,
10367 result_type=result_type,
10368 by_row=by_row,
10369 engine=engine,
10370 engine_kwargs=engine_kwargs,
10371 args=args,
10372 kwargs=kwargs,
10373 )
10374 return op.apply().__finalize__(self, method="apply")
10375
10376 def map(
10377 self, func: PythonFuncType, na_action: str | None = None, **kwargs
10378 ) -> DataFrame:
10379 """
10380 Apply a function to a Dataframe elementwise.
10381
10382 .. versionadded:: 2.1.0
10383
10384 DataFrame.applymap was deprecated and renamed to DataFrame.map.
10385
10386 This method applies a function that accepts and returns a scalar
10387 to every element of a DataFrame.
10388
10389 Parameters
10390 ----------
10391 func : callable
10392 Python function, returns a single value from a single value.
10393 na_action : {None, 'ignore'}, default None
10394 If 'ignore', propagate NaN values, without passing them to func.
10395 **kwargs
10396 Additional keyword arguments to pass as keywords arguments to
10397 `func`.
10398
10399 Returns
10400 -------
10401 DataFrame
10402 Transformed DataFrame.
10403
10404 See Also
10405 --------
10406 DataFrame.apply : Apply a function along input axis of DataFrame.
10407 DataFrame.replace: Replace values given in `to_replace` with `value`.
10408 Series.map : Apply a function elementwise on a Series.
10409
10410 Examples
10411 --------
10412 >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
10413 >>> df
10414 0 1
10415 0 1.000 2.120
10416 1 3.356 4.567
10417
10418 >>> df.map(lambda x: len(str(x)))
10419 0 1
10420 0 3 4
10421 1 5 5
10422
10423 Like Series.map, NA values can be ignored:
10424
10425 >>> df_copy = df.copy()
10426 >>> df_copy.iloc[0, 0] = pd.NA
10427 >>> df_copy.map(lambda x: len(str(x)), na_action='ignore')
10428 0 1
10429 0 NaN 4
10430 1 5.0 5
10431
10432 It is also possible to use `map` with functions that are not
10433 `lambda` functions:
10434
10435 >>> df.map(round, ndigits=1)
10436 0 1
10437 0 1.0 2.1
10438 1 3.4 4.6
10439
10440 Note that a vectorized version of `func` often exists, which will
10441 be much faster. You could square each number elementwise.
10442
10443 >>> df.map(lambda x: x**2)
10444 0 1
10445 0 1.000000 4.494400
10446 1 11.262736 20.857489
10447
10448 But it's better to avoid map in that case.
10449
10450 >>> df ** 2
10451 0 1
10452 0 1.000000 4.494400
10453 1 11.262736 20.857489
10454 """
10455 if na_action not in {"ignore", None}:
10456 raise ValueError(
10457 f"na_action must be 'ignore' or None. Got {repr(na_action)}"
10458 )
10459
10460 if self.empty:
10461 return self.copy()
10462
10463 func = functools.partial(func, **kwargs)
10464
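        # Map each column elementwise through ``Series._map_values`` using a
        # column-wise apply.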
10465 def infer(x):
10466 return x._map_values(func, na_action=na_action)
10467
10468 return self.apply(infer).__finalize__(self, "map")
10469
10470 def applymap(
10471 self, func: PythonFuncType, na_action: NaAction | None = None, **kwargs
10472 ) -> DataFrame:
10473 """
10474 Apply a function to a Dataframe elementwise.
10475
10476 .. deprecated:: 2.1.0
10477
10478 DataFrame.applymap has been deprecated. Use DataFrame.map instead.
10479
10480 This method applies a function that accepts and returns a scalar
10481 to every element of a DataFrame.
10482
10483 Parameters
10484 ----------
10485 func : callable
10486 Python function, returns a single value from a single value.
10487 na_action : {None, 'ignore'}, default None
10488 If 'ignore', propagate NaN values, without passing them to func.
10489 **kwargs
10490 Additional keyword arguments to pass as keywords arguments to
10491 `func`.
10492
10493 Returns
10494 -------
10495 DataFrame
10496 Transformed DataFrame.
10497
10498 See Also
10499 --------
10500 DataFrame.apply : Apply a function along input axis of DataFrame.
        DataFrame.map : Apply a function to a DataFrame elementwise.
10502 DataFrame.replace: Replace values given in `to_replace` with `value`.
10503
10504 Examples
10505 --------
10506 >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
10507 >>> df
10508 0 1
10509 0 1.000 2.120
10510 1 3.356 4.567
10511
10512 >>> df.map(lambda x: len(str(x)))
10513 0 1
10514 0 3 4
10515 1 5 5
10516 """
10517 warnings.warn(
10518 "DataFrame.applymap has been deprecated. Use DataFrame.map instead.",
10519 FutureWarning,
10520 stacklevel=find_stack_level(),
10521 )
10522 return self.map(func, na_action=na_action, **kwargs)
10523
10524 # ----------------------------------------------------------------------
10525 # Merging / joining methods
10526
10527 def _append(
10528 self,
10529 other,
10530 ignore_index: bool = False,
10531 verify_integrity: bool = False,
10532 sort: bool = False,
10533 ) -> DataFrame:
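        # Normalize ``other`` (Series, dict, or list of frames) into something
        # concat can handle, then concatenate it below ``self``.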
10534 if isinstance(other, (Series, dict)):
10535 if isinstance(other, dict):
10536 if not ignore_index:
10537 raise TypeError("Can only append a dict if ignore_index=True")
10538 other = Series(other)
10539 if other.name is None and not ignore_index:
10540 raise TypeError(
10541 "Can only append a Series if ignore_index=True "
10542 "or if the Series has a name"
10543 )
10544
10545 index = Index(
10546 [other.name],
10547 name=self.index.names
10548 if isinstance(self.index, MultiIndex)
10549 else self.index.name,
10550 )
10551 row_df = other.to_frame().T
10552 # infer_objects is needed for
10553 # test_append_empty_frame_to_series_with_dateutil_tz
10554 other = row_df.infer_objects(copy=False).rename_axis(
10555 index.names, copy=False
10556 )
10557 elif isinstance(other, list):
10558 if not other:
10559 pass
10560 elif not isinstance(other[0], DataFrame):
10561 other = DataFrame(other)
10562 if self.index.name is not None and not ignore_index:
10563 other.index.name = self.index.name
10564
10565 from pandas.core.reshape.concat import concat
10566
10567 if isinstance(other, (list, tuple)):
10568 to_concat = [self, *other]
10569 else:
10570 to_concat = [self, other]
10571
10572 result = concat(
10573 to_concat,
10574 ignore_index=ignore_index,
10575 verify_integrity=verify_integrity,
10576 sort=sort,
10577 )
10578 return result.__finalize__(self, method="append")
10579
10580 def join(
10581 self,
10582 other: DataFrame | Series | Iterable[DataFrame | Series],
10583 on: IndexLabel | None = None,
10584 how: MergeHow = "left",
10585 lsuffix: str = "",
10586 rsuffix: str = "",
10587 sort: bool = False,
10588 validate: JoinValidate | None = None,
10589 ) -> DataFrame:
10590 """
10591 Join columns of another DataFrame.
10592
10593 Join columns with `other` DataFrame either on index or on a key
10594 column. Efficiently join multiple DataFrame objects by index at once by
10595 passing a list.
10596
10597 Parameters
10598 ----------
10599 other : DataFrame, Series, or a list containing any combination of them
10600 Index should be similar to one of the columns in this one. If a
10601 Series is passed, its name attribute must be set, and that will be
10602 used as the column name in the resulting joined DataFrame.
10603 on : str, list of str, or array-like, optional
10604 Column or index level name(s) in the caller to join on the index
10605 in `other`, otherwise joins index-on-index. If multiple
10606 values given, the `other` DataFrame must have a MultiIndex. Can
10607 pass an array as the join key if it is not already contained in
10608 the calling DataFrame. Like an Excel VLOOKUP operation.
10609 how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left'
10610 How to handle the operation of the two objects.
10611
10612 * left: use calling frame's index (or column if on is specified)
10613 * right: use `other`'s index.
10614 * outer: form union of calling frame's index (or column if on is
10615 specified) with `other`'s index, and sort it lexicographically.
10616 * inner: form intersection of calling frame's index (or column if
10617 on is specified) with `other`'s index, preserving the order
              of the calling frame's index.
10619 * cross: creates the cartesian product from both frames, preserves the order
10620 of the left keys.
10621 lsuffix : str, default ''
10622 Suffix to use from left frame's overlapping columns.
10623 rsuffix : str, default ''
10624 Suffix to use from right frame's overlapping columns.
10625 sort : bool, default False
10626 Order result DataFrame lexicographically by the join key. If False,
10627 the order of the join key depends on the join type (how keyword).
10628 validate : str, optional
10629 If specified, checks if join is of specified type.
10630
10631 * "one_to_one" or "1:1": check if join keys are unique in both left
10632 and right datasets.
10633 * "one_to_many" or "1:m": check if join keys are unique in left dataset.
10634 * "many_to_one" or "m:1": check if join keys are unique in right dataset.
10635 * "many_to_many" or "m:m": allowed, but does not result in checks.
10636
10637 .. versionadded:: 1.5.0
10638
10639 Returns
10640 -------
10641 DataFrame
10642 A dataframe containing columns from both the caller and `other`.
10643
10644 See Also
10645 --------
10646 DataFrame.merge : For column(s)-on-column(s) operations.
10647
10648 Notes
10649 -----
10650 Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
10651 passing a list of `DataFrame` objects.
10652
10653 Examples
10654 --------
10655 >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
10656 ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
10657
10658 >>> df
10659 key A
10660 0 K0 A0
10661 1 K1 A1
10662 2 K2 A2
10663 3 K3 A3
10664 4 K4 A4
10665 5 K5 A5
10666
10667 >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
10668 ... 'B': ['B0', 'B1', 'B2']})
10669
10670 >>> other
10671 key B
10672 0 K0 B0
10673 1 K1 B1
10674 2 K2 B2
10675
10676 Join DataFrames using their indexes.
10677
10678 >>> df.join(other, lsuffix='_caller', rsuffix='_other')
10679 key_caller A key_other B
10680 0 K0 A0 K0 B0
10681 1 K1 A1 K1 B1
10682 2 K2 A2 K2 B2
10683 3 K3 A3 NaN NaN
10684 4 K4 A4 NaN NaN
10685 5 K5 A5 NaN NaN
10686
10687 If we want to join using the key columns, we need to set key to be
10688 the index in both `df` and `other`. The joined DataFrame will have
10689 key as its index.
10690
10691 >>> df.set_index('key').join(other.set_index('key'))
10692 A B
10693 key
10694 K0 A0 B0
10695 K1 A1 B1
10696 K2 A2 B2
10697 K3 A3 NaN
10698 K4 A4 NaN
10699 K5 A5 NaN
10700
10701 Another option to join using the key columns is to use the `on`
10702 parameter. DataFrame.join always uses `other`'s index but we can use
10703 any column in `df`. This method preserves the original DataFrame's
10704 index in the result.
10705
10706 >>> df.join(other.set_index('key'), on='key')
10707 key A B
10708 0 K0 A0 B0
10709 1 K1 A1 B1
10710 2 K2 A2 B2
10711 3 K3 A3 NaN
10712 4 K4 A4 NaN
10713 5 K5 A5 NaN
10714
10715 Using non-unique key values shows how they are matched.
10716
10717 >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'],
10718 ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
10719
10720 >>> df
10721 key A
10722 0 K0 A0
10723 1 K1 A1
10724 2 K1 A2
10725 3 K3 A3
10726 4 K0 A4
10727 5 K1 A5
10728
10729 >>> df.join(other.set_index('key'), on='key', validate='m:1')
10730 key A B
10731 0 K0 A0 B0
10732 1 K1 A1 B1
10733 2 K1 A2 B1
10734 3 K3 A3 NaN
10735 4 K0 A4 B0
10736 5 K1 A5 B1
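
        A list of DataFrames can be joined on the index in a single call. A
        small sketch (``other1`` and ``other2`` are hypothetical frames;
        ``on``, ``lsuffix`` and ``rsuffix`` are not supported in this form):

        >>> df = pd.DataFrame({'A': ['A0', 'A1', 'A2']})
        >>> other1 = pd.DataFrame({'B': ['B0', 'B1']})
        >>> other2 = pd.DataFrame({'C': ['C0', 'C2']}, index=[0, 2])
        >>> df.join([other1, other2])
            A    B    C
        0  A0   B0   C0
        1  A1   B1  NaN
        2  A2  NaN   C2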
10737 """
10738 from pandas.core.reshape.concat import concat
10739 from pandas.core.reshape.merge import merge
10740
10741 if isinstance(other, Series):
10742 if other.name is None:
10743 raise ValueError("Other Series must have a name")
10744 other = DataFrame({other.name: other})
10745
10746 if isinstance(other, DataFrame):
10747 if how == "cross":
10748 return merge(
10749 self,
10750 other,
10751 how=how,
10752 on=on,
10753 suffixes=(lsuffix, rsuffix),
10754 sort=sort,
10755 validate=validate,
10756 )
10757 return merge(
10758 self,
10759 other,
10760 left_on=on,
10761 how=how,
10762 left_index=on is None,
10763 right_index=True,
10764 suffixes=(lsuffix, rsuffix),
10765 sort=sort,
10766 validate=validate,
10767 )
10768 else:
10769 if on is not None:
10770 raise ValueError(
10771 "Joining multiple DataFrames only supported for joining on index"
10772 )
10773
10774 if rsuffix or lsuffix:
10775 raise ValueError(
10776 "Suffixes not supported when joining multiple DataFrames"
10777 )
10778
10779 # Mypy thinks the RHS is a
10780 # "Union[DataFrame, Series, Iterable[Union[DataFrame, Series]]]" whereas
10781 # the LHS is an "Iterable[DataFrame]", but in reality both types are
10782 # "Iterable[Union[DataFrame, Series]]" due to the if statements
10783 frames = [cast("DataFrame | Series", self)] + list(other)
10784
10785 can_concat = all(df.index.is_unique for df in frames)
10786
10787 # join indexes only using concat
10788 if can_concat:
10789 if how == "left":
10790 res = concat(
10791 frames, axis=1, join="outer", verify_integrity=True, sort=sort
10792 )
10793 return res.reindex(self.index, copy=False)
10794 else:
10795 return concat(
10796 frames, axis=1, join=how, verify_integrity=True, sort=sort
10797 )
10798
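            # Some index has duplicate labels, so a single concat is not
            # possible; merge the frames one at a time on their indexes.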
10799 joined = frames[0]
10800
10801 for frame in frames[1:]:
10802 joined = merge(
10803 joined,
10804 frame,
10805 how=how,
10806 left_index=True,
10807 right_index=True,
10808 validate=validate,
10809 )
10810
10811 return joined
10812
10813 @Substitution("")
10814 @Appender(_merge_doc, indents=2)
10815 def merge(
10816 self,
10817 right: DataFrame | Series,
10818 how: MergeHow = "inner",
10819 on: IndexLabel | AnyArrayLike | None = None,
10820 left_on: IndexLabel | AnyArrayLike | None = None,
10821 right_on: IndexLabel | AnyArrayLike | None = None,
10822 left_index: bool = False,
10823 right_index: bool = False,
10824 sort: bool = False,
10825 suffixes: Suffixes = ("_x", "_y"),
10826 copy: bool | None = None,
10827 indicator: str | bool = False,
10828 validate: MergeValidate | None = None,
10829 ) -> DataFrame:
10830 from pandas.core.reshape.merge import merge
10831
10832 return merge(
10833 self,
10834 right,
10835 how=how,
10836 on=on,
10837 left_on=left_on,
10838 right_on=right_on,
10839 left_index=left_index,
10840 right_index=right_index,
10841 sort=sort,
10842 suffixes=suffixes,
10843 copy=copy,
10844 indicator=indicator,
10845 validate=validate,
10846 )
10847
10848 def round(
10849 self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs
10850 ) -> DataFrame:
10851 """
10852 Round a DataFrame to a variable number of decimal places.
10853
10854 Parameters
10855 ----------
10856 decimals : int, dict, Series
10857 Number of decimal places to round each column to. If an int is
10858 given, round each column to the same number of places.
10859 Otherwise dict and Series round to variable numbers of places.
10860 Column names should be in the keys if `decimals` is a
10861 dict-like, or in the index if `decimals` is a Series. Any
10862 columns not included in `decimals` will be left as is. Elements
10863 of `decimals` which are not columns of the input will be
10864 ignored.
10865 *args
            Additional positional arguments have no effect but might be
            accepted for compatibility with numpy.
10868 **kwargs
10869 Additional keywords have no effect but might be accepted for
10870 compatibility with numpy.
10871
10872 Returns
10873 -------
10874 DataFrame
10875 A DataFrame with the affected columns rounded to the specified
10876 number of decimal places.
10877
10878 See Also
10879 --------
10880 numpy.around : Round a numpy array to the given number of decimals.
10881 Series.round : Round a Series to the given number of decimals.
10882
10883 Examples
10884 --------
10885 >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)],
10886 ... columns=['dogs', 'cats'])
10887 >>> df
10888 dogs cats
10889 0 0.21 0.32
10890 1 0.01 0.67
10891 2 0.66 0.03
10892 3 0.21 0.18
10893
10894 By providing an integer each column is rounded to the same number
10895 of decimal places
10896
10897 >>> df.round(1)
10898 dogs cats
10899 0 0.2 0.3
10900 1 0.0 0.7
10901 2 0.7 0.0
10902 3 0.2 0.2
10903
10904 With a dict, the number of places for specific columns can be
10905 specified with the column names as key and the number of decimal
10906 places as value
10907
10908 >>> df.round({'dogs': 1, 'cats': 0})
10909 dogs cats
10910 0 0.2 0.0
10911 1 0.0 1.0
10912 2 0.7 0.0
10913 3 0.2 0.0
10914
10915 Using a Series, the number of places for specific columns can be
10916 specified with the column names as index and the number of
10917 decimal places as value
10918
10919 >>> decimals = pd.Series([0, 1], index=['cats', 'dogs'])
10920 >>> df.round(decimals)
10921 dogs cats
10922 0 0.2 0.0
10923 1 0.0 1.0
10924 2 0.7 0.0
10925 3 0.2 0.0
10926 """
10927 from pandas.core.reshape.concat import concat
10928
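        # For dict/Series ``decimals``, round column-by-column, leaving
        # columns without an entry (and non-numeric columns) unchanged.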
10929 def _dict_round(df: DataFrame, decimals):
10930 for col, vals in df.items():
10931 try:
10932 yield _series_round(vals, decimals[col])
10933 except KeyError:
10934 yield vals
10935
10936 def _series_round(ser: Series, decimals: int) -> Series:
10937 if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype):
10938 return ser.round(decimals)
10939 return ser
10940
10941 nv.validate_round(args, kwargs)
10942
10943 if isinstance(decimals, (dict, Series)):
10944 if isinstance(decimals, Series) and not decimals.index.is_unique:
10945 raise ValueError("Index of decimals must be unique")
10946 if is_dict_like(decimals) and not all(
10947 is_integer(value) for _, value in decimals.items()
10948 ):
10949 raise TypeError("Values in decimals must be integers")
10950 new_cols = list(_dict_round(self, decimals))
10951 elif is_integer(decimals):
10952 # Dispatch to Block.round
10953 # Argument "decimals" to "round" of "BaseBlockManager" has incompatible
10954 # type "Union[int, integer[Any]]"; expected "int"
10955 new_mgr = self._mgr.round(
10956 decimals=decimals, # type: ignore[arg-type]
10957 using_cow=using_copy_on_write(),
10958 )
10959 return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(
10960 self, method="round"
10961 )
10962 else:
10963 raise TypeError("decimals must be an integer, a dict-like or a Series")
10964
10965 if new_cols is not None and len(new_cols) > 0:
10966 return self._constructor(
10967 concat(new_cols, axis=1), index=self.index, columns=self.columns
10968 ).__finalize__(self, method="round")
10969 else:
10970 return self.copy(deep=False)
10971
10972 # ----------------------------------------------------------------------
10973 # Statistical methods, etc.
10974
10975 def corr(
10976 self,
10977 method: CorrelationMethod = "pearson",
10978 min_periods: int = 1,
10979 numeric_only: bool = False,
10980 ) -> DataFrame:
10981 """
10982 Compute pairwise correlation of columns, excluding NA/null values.
10983
10984 Parameters
10985 ----------
10986 method : {'pearson', 'kendall', 'spearman'} or callable
10987 Method of correlation:
10988
10989 * pearson : standard correlation coefficient
10990 * kendall : Kendall Tau correlation coefficient
10991 * spearman : Spearman rank correlation
10992 * callable: callable with input two 1d ndarrays
10993 and returning a float. Note that the returned matrix from corr
10994 will have 1 along the diagonals and will be symmetric
10995 regardless of the callable's behavior.
10996 min_periods : int, optional
10997 Minimum number of observations required per pair of columns
10998 to have a valid result. Currently only available for Pearson
10999 and Spearman correlation.
11000 numeric_only : bool, default False
11001 Include only `float`, `int` or `boolean` data.
11002
11003 .. versionadded:: 1.5.0
11004
11005 .. versionchanged:: 2.0.0
11006 The default value of ``numeric_only`` is now ``False``.
11007
11008 Returns
11009 -------
11010 DataFrame
11011 Correlation matrix.
11012
11013 See Also
11014 --------
11015 DataFrame.corrwith : Compute pairwise correlation with another
11016 DataFrame or Series.
11017 Series.corr : Compute the correlation between two Series.
11018
11019 Notes
11020 -----
11021 Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations.
11022
11023 * `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
11024 * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_
11025 * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_
11026
11027 Examples
11028 --------
11029 >>> def histogram_intersection(a, b):
11030 ... v = np.minimum(a, b).sum().round(decimals=1)
11031 ... return v
11032 >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
11033 ... columns=['dogs', 'cats'])
11034 >>> df.corr(method=histogram_intersection)
11035 dogs cats
11036 dogs 1.0 0.3
11037 cats 0.3 1.0
11038
11039 >>> df = pd.DataFrame([(1, 1), (2, np.nan), (np.nan, 3), (4, 4)],
11040 ... columns=['dogs', 'cats'])
11041 >>> df.corr(min_periods=3)
11042 dogs cats
11043 dogs 1.0 NaN
11044 cats NaN 1.0
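
        With ``numeric_only=True``, non-numeric columns are dropped before the
        correlations are computed (the ``name`` column below is just an
        illustrative placeholder):

        >>> df = pd.DataFrame({"dogs": [1, 2, 3],
        ...                    "cats": [3, 2, 1],
        ...                    "name": ["a", "b", "c"]})
        >>> df.corr(numeric_only=True)
              dogs  cats
        dogs   1.0  -1.0
        cats  -1.0   1.0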
11045 """ # noqa: E501
11046 data = self._get_numeric_data() if numeric_only else self
11047 cols = data.columns
11048 idx = cols.copy()
11049 mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
11050
11051 if method == "pearson":
11052 correl = libalgos.nancorr(mat, minp=min_periods)
11053 elif method == "spearman":
11054 correl = libalgos.nancorr_spearman(mat, minp=min_periods)
11055 elif method == "kendall" or callable(method):
11056 if min_periods is None:
11057 min_periods = 1
11058 mat = mat.T
11059 corrf = nanops.get_corr_func(method)
11060 K = len(cols)
11061 correl = np.empty((K, K), dtype=float)
11062 mask = np.isfinite(mat)
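            # Only the upper triangle is computed; each value is mirrored into
            # the lower triangle since the correlation matrix is symmetric.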
11063 for i, ac in enumerate(mat):
11064 for j, bc in enumerate(mat):
11065 if i > j:
11066 continue
11067
11068 valid = mask[i] & mask[j]
11069 if valid.sum() < min_periods:
11070 c = np.nan
11071 elif i == j:
11072 c = 1.0
11073 elif not valid.all():
11074 c = corrf(ac[valid], bc[valid])
11075 else:
11076 c = corrf(ac, bc)
11077 correl[i, j] = c
11078 correl[j, i] = c
11079 else:
11080 raise ValueError(
11081 "method must be either 'pearson', "
11082 "'spearman', 'kendall', or a callable, "
11083 f"'{method}' was supplied"
11084 )
11085
11086 result = self._constructor(correl, index=idx, columns=cols, copy=False)
11087 return result.__finalize__(self, method="corr")
11088
11089 def cov(
11090 self,
11091 min_periods: int | None = None,
11092 ddof: int | None = 1,
11093 numeric_only: bool = False,
11094 ) -> DataFrame:
11095 """
11096 Compute pairwise covariance of columns, excluding NA/null values.
11097
11098 Compute the pairwise covariance among the series of a DataFrame.
11099 The returned data frame is the `covariance matrix
11100 <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
11101 of the DataFrame.
11102
11103 Both NA and null values are automatically excluded from the
11104 calculation. (See the note below about bias from missing values.)
11105 A threshold can be set for the minimum number of
11106 observations for each value created. Comparisons with observations
11107 below this threshold will be returned as ``NaN``.
11108
11109 This method is generally used for the analysis of time series data to
11110 understand the relationship between different measures
11111 across time.
11112
11113 Parameters
11114 ----------
11115 min_periods : int, optional
11116 Minimum number of observations required per pair of columns
11117 to have a valid result.
11118
        ddof : int, default 1
            Delta degrees of freedom. The divisor used in calculations
            is ``N - ddof``, where ``N`` represents the number of elements.
            This argument is used only when the DataFrame contains no
            missing values (see the example below).
11123
11124 numeric_only : bool, default False
11125 Include only `float`, `int` or `boolean` data.
11126
11127 .. versionadded:: 1.5.0
11128
11129 .. versionchanged:: 2.0.0
11130 The default value of ``numeric_only`` is now ``False``.
11131
11132 Returns
11133 -------
11134 DataFrame
11135 The covariance matrix of the series of the DataFrame.
11136
11137 See Also
11138 --------
11139 Series.cov : Compute covariance with another Series.
11140 core.window.ewm.ExponentialMovingWindow.cov : Exponential weighted sample
11141 covariance.
11142 core.window.expanding.Expanding.cov : Expanding sample covariance.
11143 core.window.rolling.Rolling.cov : Rolling sample covariance.
11144
11145 Notes
11146 -----
11147 Returns the covariance matrix of the DataFrame's time series.
11148 The covariance is normalized by N-ddof.
11149
11150 For DataFrames that have Series that are missing data (assuming that
11151 data is `missing at random
11152 <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
11153 the returned covariance matrix will be an unbiased estimate
11154 of the variance and covariance between the member Series.
11155
        However, for many applications this estimate may not be acceptable
        because the estimated covariance matrix is not guaranteed to be
        positive semi-definite. This could lead to estimated correlations
        having absolute values which are greater than one, and/or a
        non-invertible covariance matrix. See `Estimation of covariance
        matrices <https://en.wikipedia.org/wiki/Estimation_of_covariance_matrices>`__
        for more details.
11163
11164 Examples
11165 --------
11166 >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
11167 ... columns=['dogs', 'cats'])
11168 >>> df.cov()
11169 dogs cats
11170 dogs 0.666667 -1.000000
11171 cats -1.000000 1.666667
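
        The normalization can be changed with ``ddof``; for instance,
        ``ddof=0`` divides by ``N`` instead of ``N - 1``. A minimal
        illustration (single entries are extracted as plain floats here
        purely for brevity):

        >>> float(df.cov().loc['dogs', 'cats'])
        -1.0
        >>> float(df.cov(ddof=0).loc['dogs', 'cats'])
        -0.75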
11172
11173 >>> np.random.seed(42)
11174 >>> df = pd.DataFrame(np.random.randn(1000, 5),
11175 ... columns=['a', 'b', 'c', 'd', 'e'])
11176 >>> df.cov()
11177 a b c d e
11178 a 0.998438 -0.020161 0.059277 -0.008943 0.014144
11179 b -0.020161 1.059352 -0.008543 -0.024738 0.009826
11180 c 0.059277 -0.008543 1.010670 -0.001486 -0.000271
11181 d -0.008943 -0.024738 -0.001486 0.921297 -0.013692
11182 e 0.014144 0.009826 -0.000271 -0.013692 0.977795
11183
11184 **Minimum number of periods**
11185
11186 This method also supports an optional ``min_periods`` keyword
11187 that specifies the required minimum number of non-NA observations for
11188 each column pair in order to have a valid result:
11189
11190 >>> np.random.seed(42)
11191 >>> df = pd.DataFrame(np.random.randn(20, 3),
11192 ... columns=['a', 'b', 'c'])
11193 >>> df.loc[df.index[:5], 'a'] = np.nan
11194 >>> df.loc[df.index[5:10], 'b'] = np.nan
11195 >>> df.cov(min_periods=12)
11196 a b c
11197 a 0.316741 NaN -0.150812
11198 b NaN 1.248003 0.191417
11199 c -0.150812 0.191417 0.895202
11200 """
11201 data = self._get_numeric_data() if numeric_only else self
11202 cols = data.columns
11203 idx = cols.copy()
11204 mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
11205
11206 if notna(mat).all():
11207 if min_periods is not None and min_periods > len(mat):
11208 base_cov = np.empty((mat.shape[1], mat.shape[1]))
11209 base_cov.fill(np.nan)
11210 else:
11211 base_cov = np.cov(mat.T, ddof=ddof)
11212 base_cov = base_cov.reshape((len(cols), len(cols)))
11213 else:
11214 base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods)
11215
11216 result = self._constructor(base_cov, index=idx, columns=cols, copy=False)
11217 return result.__finalize__(self, method="cov")
11218
11219 def corrwith(
11220 self,
11221 other: DataFrame | Series,
11222 axis: Axis = 0,
11223 drop: bool = False,
11224 method: CorrelationMethod = "pearson",
11225 numeric_only: bool = False,
11226 ) -> Series:
11227 """
11228 Compute pairwise correlation.
11229
11230 Pairwise correlation is computed between rows or columns of
11231 DataFrame with rows or columns of Series or DataFrame. DataFrames
11232 are first aligned along both axes before computing the
11233 correlations.
11234
11235 Parameters
11236 ----------
11237 other : DataFrame, Series
11238 Object with which to compute correlations.
11239 axis : {0 or 'index', 1 or 'columns'}, default 0
11240 The axis to use. 0 or 'index' to compute row-wise, 1 or 'columns' for
11241 column-wise.
        drop : bool, default False
            Drop missing indices from result (see the final example below).
11244 method : {'pearson', 'kendall', 'spearman'} or callable
11245 Method of correlation:
11246
11247 * pearson : standard correlation coefficient
11248 * kendall : Kendall Tau correlation coefficient
11249 * spearman : Spearman rank correlation
11250 * callable: callable with input two 1d ndarrays
11251 and returning a float.
11252
11253 numeric_only : bool, default False
11254 Include only `float`, `int` or `boolean` data.
11255
11256 .. versionadded:: 1.5.0
11257
11258 .. versionchanged:: 2.0.0
11259 The default value of ``numeric_only`` is now ``False``.
11260
11261 Returns
11262 -------
11263 Series
11264 Pairwise correlations.
11265
11266 See Also
11267 --------
11268 DataFrame.corr : Compute pairwise correlation of columns.
11269
11270 Examples
11271 --------
11272 >>> index = ["a", "b", "c", "d", "e"]
11273 >>> columns = ["one", "two", "three", "four"]
11274 >>> df1 = pd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns)
11275 >>> df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=index[:4], columns=columns)
11276 >>> df1.corrwith(df2)
11277 one 1.0
11278 two 1.0
11279 three 1.0
11280 four 1.0
11281 dtype: float64
11282
11283 >>> df2.corrwith(df1, axis=1)
11284 a 1.0
11285 b 1.0
11286 c 1.0
11287 d 1.0
11288 e NaN
11289 dtype: float64
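
        Passing ``drop=True`` omits the non-matching label instead of
        reporting ``NaN`` for it:

        >>> df2.corrwith(df1, axis=1, drop=True)
        a    1.0
        b    1.0
        c    1.0
        d    1.0
        dtype: float64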
11290 """ # noqa: E501
11291 axis = self._get_axis_number(axis)
11292 this = self._get_numeric_data() if numeric_only else self
11293
11294 if isinstance(other, Series):
11295 return this.apply(lambda x: other.corr(x, method=method), axis=axis)
11296
11297 if numeric_only:
11298 other = other._get_numeric_data()
11299 left, right = this.align(other, join="inner", copy=False)
11300
11301 if axis == 1:
11302 left = left.T
11303 right = right.T
11304
11305 if method == "pearson":
11306 # mask missing values
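            # adding ``other * 0`` propagates the other operand's NaNs, so both
            # sides end up sharing the same missing-value pattern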
11307 left = left + right * 0
11308 right = right + left * 0
11309
11310 # demeaned data
11311 ldem = left - left.mean(numeric_only=numeric_only)
11312 rdem = right - right.mean(numeric_only=numeric_only)
11313
11314 num = (ldem * rdem).sum()
11315 dom = (
11316 (left.count() - 1)
11317 * left.std(numeric_only=numeric_only)
11318 * right.std(numeric_only=numeric_only)
11319 )
11320
11321 correl = num / dom
11322
11323 elif method in ["kendall", "spearman"] or callable(method):
11324
11325 def c(x):
11326 return nanops.nancorr(x[0], x[1], method=method)
11327
11328 correl = self._constructor_sliced(
11329 map(c, zip(left.values.T, right.values.T)),
11330 index=left.columns,
11331 copy=False,
11332 )
11333
11334 else:
11335 raise ValueError(
11336 f"Invalid method {method} was passed, "
11337 "valid methods are: 'pearson', 'kendall', "
11338 "'spearman', or callable"
11339 )
11340
11341 if not drop:
11342 # Find non-matching labels along the given axis
11343 # and append missing correlations (GH 22375)
11344 raxis: AxisInt = 1 if axis == 0 else 0
11345 result_index = this._get_axis(raxis).union(other._get_axis(raxis))
11346 idx_diff = result_index.difference(correl.index)
11347
11348 if len(idx_diff) > 0:
11349 correl = correl._append(
11350 Series([np.nan] * len(idx_diff), index=idx_diff)
11351 )
11352
11353 return correl
11354
11355 # ----------------------------------------------------------------------
11356 # ndarray-like stats methods
11357
11358 def count(self, axis: Axis = 0, numeric_only: bool = False):
11359 """
11360 Count non-NA cells for each column or row.
11361
11362 The values `None`, `NaN`, `NaT`, ``pandas.NA`` are considered NA.
11363
11364 Parameters
11365 ----------
11366 axis : {0 or 'index', 1 or 'columns'}, default 0
11367 If 0 or 'index' counts are generated for each column.
11368 If 1 or 'columns' counts are generated for each row.
11369 numeric_only : bool, default False
11370 Include only `float`, `int` or `boolean` data.
11371
11372 Returns
11373 -------
11374 Series
11375 For each column/row the number of non-NA/null entries.
11376
11377 See Also
11378 --------
11379 Series.count: Number of non-NA elements in a Series.
11380 DataFrame.value_counts: Count unique combinations of columns.
11381 DataFrame.shape: Number of DataFrame rows and columns (including NA
11382 elements).
11383 DataFrame.isna: Boolean same-sized DataFrame showing places of NA
11384 elements.
11385
11386 Examples
11387 --------
11388 Constructing DataFrame from a dictionary:
11389
11390 >>> df = pd.DataFrame({"Person":
11391 ... ["John", "Myla", "Lewis", "John", "Myla"],
11392 ... "Age": [24., np.nan, 21., 33, 26],
11393 ... "Single": [False, True, True, True, False]})
11394 >>> df
11395 Person Age Single
11396 0 John 24.0 False
11397 1 Myla NaN True
11398 2 Lewis 21.0 True
11399 3 John 33.0 True
11400 4 Myla 26.0 False
11401
11402 Notice the uncounted NA values:
11403
11404 >>> df.count()
11405 Person 5
11406 Age 4
11407 Single 5
11408 dtype: int64
11409
11410 Counts for each **row**:
11411
11412 >>> df.count(axis='columns')
11413 0 3
11414 1 2
11415 2 3
11416 3 3
11417 4 3
11418 dtype: int64
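
        With ``numeric_only=True`` only numeric (including boolean) columns
        are counted, so the string column ``Person`` is dropped:

        >>> df.count(numeric_only=True)
        Age       4
        Single    5
        dtype: int64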
11419 """
11420 axis = self._get_axis_number(axis)
11421
11422 if numeric_only:
11423 frame = self._get_numeric_data()
11424 else:
11425 frame = self
11426
11427 # GH #423
11428 if len(frame._get_axis(axis)) == 0:
11429 result = self._constructor_sliced(0, index=frame._get_agg_axis(axis))
11430 else:
11431 result = notna(frame).sum(axis=axis)
11432
11433 return result.astype("int64", copy=False).__finalize__(self, method="count")
11434
11435 def _reduce(
11436 self,
11437 op,
11438 name: str,
11439 *,
11440 axis: Axis = 0,
11441 skipna: bool = True,
11442 numeric_only: bool = False,
11443 filter_type=None,
11444 **kwds,
11445 ):
11446 assert filter_type is None or filter_type == "bool", filter_type
11447 out_dtype = "bool" if filter_type == "bool" else None
11448
11449 if axis is not None:
11450 axis = self._get_axis_number(axis)
11451
11452 def func(values: np.ndarray):
11453 # We only use this in the case that operates on self.values
11454 return op(values, axis=axis, skipna=skipna, **kwds)
11455
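        # cache of whether each ExtensionDtype's ``_reduce`` accepts a
        # ``keepdims`` argument, so its signature is inspected at most once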
11456 dtype_has_keepdims: dict[ExtensionDtype, bool] = {}
11457
11458 def blk_func(values, axis: Axis = 1):
11459 if isinstance(values, ExtensionArray):
11460 if not is_1d_only_ea_dtype(values.dtype) and not isinstance(
11461 self._mgr, ArrayManager
11462 ):
11463 return values._reduce(name, axis=1, skipna=skipna, **kwds)
11464 has_keepdims = dtype_has_keepdims.get(values.dtype)
11465 if has_keepdims is None:
11466 sign = signature(values._reduce)
11467 has_keepdims = "keepdims" in sign.parameters
11468 dtype_has_keepdims[values.dtype] = has_keepdims
11469 if has_keepdims:
11470 return values._reduce(name, skipna=skipna, keepdims=True, **kwds)
11471 else:
11472 warnings.warn(
11473 f"{type(values)}._reduce will require a `keepdims` parameter "
11474 "in the future",
11475 FutureWarning,
11476 stacklevel=find_stack_level(),
11477 )
11478 result = values._reduce(name, skipna=skipna, **kwds)
11479 return np.array([result])
11480 else:
11481 return op(values, axis=axis, skipna=skipna, **kwds)
11482
11483 def _get_data() -> DataFrame:
11484 if filter_type is None:
11485 data = self._get_numeric_data()
11486 else:
11487 # GH#25101, GH#24434
11488 assert filter_type == "bool"
11489 data = self._get_bool_data()
11490 return data
11491
11492 # Case with EAs see GH#35881
11493 df = self
11494 if numeric_only:
11495 df = _get_data()
11496 if axis is None:
11497 dtype = find_common_type([arr.dtype for arr in df._mgr.arrays])
11498 if isinstance(dtype, ExtensionDtype):
11499 df = df.astype(dtype, copy=False)
11500 arr = concat_compat(list(df._iter_column_arrays()))
11501 return arr._reduce(name, skipna=skipna, keepdims=False, **kwds)
11502 return func(df.values)
11503 elif axis == 1:
11504 if len(df.index) == 0:
11505 # Taking a transpose would result in no columns, losing the dtype.
11506 # In the empty case, reducing along axis 0 or 1 gives the same
11507 # result dtype, so reduce with axis=0 and ignore values
11508 result = df._reduce(
11509 op,
11510 name,
11511 axis=0,
11512 skipna=skipna,
11513 numeric_only=False,
11514 filter_type=filter_type,
11515 **kwds,
11516 ).iloc[:0]
11517 result.index = df.index
11518 return result
11519
11520 # kurtosis excluded since groupby does not implement it
11521 if df.shape[1] and name != "kurt":
11522 dtype = find_common_type([arr.dtype for arr in df._mgr.arrays])
11523 if isinstance(dtype, ExtensionDtype):
11524 # GH 54341: fastpath for EA-backed axis=1 reductions
11525 # This flattens the frame into a single 1D array while keeping
11526 # track of the row and column indices of the original frame. Once
11527 # flattened, grouping by the row indices and aggregating should
11528 # be equivalent to transposing the original frame and aggregating
11529 # with axis=0.
11530 name = {"argmax": "idxmax", "argmin": "idxmin"}.get(name, name)
11531 df = df.astype(dtype, copy=False)
11532 arr = concat_compat(list(df._iter_column_arrays()))
11533 nrows, ncols = df.shape
11534 row_index = np.tile(np.arange(nrows), ncols)
11535 col_index = np.repeat(np.arange(ncols), nrows)
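                # ``arr`` holds the values column-major (all of column 0's
                # rows, then column 1's, ...), so tiling the row labels and
                # repeating the column labels recovers each value's original
                # (row, column) position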
11536 ser = Series(arr, index=col_index, copy=False)
11537 # GroupBy will raise a warning with SeriesGroupBy as the object,
11538 # likely confusing users
11539 with rewrite_warning(
11540 target_message=(
11541 f"The behavior of SeriesGroupBy.{name} with all-NA values"
11542 ),
11543 target_category=FutureWarning,
11544 new_message=(
11545 f"The behavior of {type(self).__name__}.{name} with all-NA "
11546 "values, or any-NA and skipna=False, is deprecated. In "
11547 "a future version this will raise ValueError"
11548 ),
11549 ):
11550 result = ser.groupby(row_index).agg(name, **kwds)
11551 result.index = df.index
11552 if not skipna and name not in ("any", "all"):
11553 mask = df.isna().to_numpy(dtype=np.bool_).any(axis=1)
11554 other = -1 if name in ("idxmax", "idxmin") else lib.no_default
11555 result = result.mask(mask, other)
11556 return result
11557
11558 df = df.T
11559
11560 # After possibly _get_data and transposing, we are now in the
11561 # simple case where we can use BlockManager.reduce
11562 res = df._mgr.reduce(blk_func)
11563 out = df._constructor_from_mgr(res, axes=res.axes).iloc[0]
11564 if out_dtype is not None and out.dtype != "boolean":
11565 out = out.astype(out_dtype)
11566 elif (df._mgr.get_dtypes() == object).any() and name not in ["any", "all"]:
11567 out = out.astype(object)
11568 elif len(self) == 0 and out.dtype == object and name in ("sum", "prod"):
11569 # Even if we are object dtype, follow numpy and return
11570 # float64, see test_apply_funcs_over_empty
11571 out = out.astype(np.float64)
11572
11573 return out
11574
11575 def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
11576 """
11577 Special case for _reduce to try to avoid a potentially-expensive transpose.
11578
11579 Apply the reduction block-wise along axis=1 and then reduce the resulting
11580 1D arrays.
11581 """
11582 if name == "all":
11583 result = np.ones(len(self), dtype=bool)
11584 ufunc = np.logical_and
11585 elif name == "any":
11586 result = np.zeros(len(self), dtype=bool)
11587 # error: Incompatible types in assignment
11588 # (expression has type "_UFunc_Nin2_Nout1[Literal['logical_or'],
11589 # Literal[20], Literal[False]]", variable has type
11590 # "_UFunc_Nin2_Nout1[Literal['logical_and'], Literal[20],
11591 # Literal[True]]")
11592 ufunc = np.logical_or # type: ignore[assignment]
11593 else:
11594 raise NotImplementedError(name)
11595
11596 for arr in self._mgr.arrays:
11597 middle = func(arr, axis=0, skipna=skipna)
11598 result = ufunc(result, middle)
11599
11600 res_ser = self._constructor_sliced(result, index=self.index, copy=False)
11601 return res_ser
11602
11603 @doc(make_doc("any", ndim=2))
11604 # error: Signature of "any" incompatible with supertype "NDFrame"
11605 def any( # type: ignore[override]
11606 self,
11607 *,
11608 axis: Axis | None = 0,
11609 bool_only: bool = False,
11610 skipna: bool = True,
11611 **kwargs,
11612 ) -> Series | bool:
11613 result = self._logical_func(
11614 "any", nanops.nanany, axis, bool_only, skipna, **kwargs
11615 )
11616 if isinstance(result, Series):
11617 result = result.__finalize__(self, method="any")
11618 return result
11619
11620 @doc(make_doc("all", ndim=2))
11621 def all(
11622 self,
11623 axis: Axis | None = 0,
11624 bool_only: bool = False,
11625 skipna: bool = True,
11626 **kwargs,
11627 ) -> Series | bool:
11628 result = self._logical_func(
11629 "all", nanops.nanall, axis, bool_only, skipna, **kwargs
11630 )
11631 if isinstance(result, Series):
11632 result = result.__finalize__(self, method="all")
11633 return result
11634
11635 @doc(make_doc("min", ndim=2))
11636 def min(
11637 self,
11638 axis: Axis | None = 0,
11639 skipna: bool = True,
11640 numeric_only: bool = False,
11641 **kwargs,
11642 ):
11643 result = super().min(axis, skipna, numeric_only, **kwargs)
11644 if isinstance(result, Series):
11645 result = result.__finalize__(self, method="min")
11646 return result
11647
11648 @doc(make_doc("max", ndim=2))
11649 def max(
11650 self,
11651 axis: Axis | None = 0,
11652 skipna: bool = True,
11653 numeric_only: bool = False,
11654 **kwargs,
11655 ):
11656 result = super().max(axis, skipna, numeric_only, **kwargs)
11657 if isinstance(result, Series):
11658 result = result.__finalize__(self, method="max")
11659 return result
11660
11661 @doc(make_doc("sum", ndim=2))
11662 def sum(
11663 self,
11664 axis: Axis | None = 0,
11665 skipna: bool = True,
11666 numeric_only: bool = False,
11667 min_count: int = 0,
11668 **kwargs,
11669 ):
11670 result = super().sum(axis, skipna, numeric_only, min_count, **kwargs)
11671 return result.__finalize__(self, method="sum")
11672
11673 @doc(make_doc("prod", ndim=2))
11674 def prod(
11675 self,
11676 axis: Axis | None = 0,
11677 skipna: bool = True,
11678 numeric_only: bool = False,
11679 min_count: int = 0,
11680 **kwargs,
11681 ):
11682 result = super().prod(axis, skipna, numeric_only, min_count, **kwargs)
11683 return result.__finalize__(self, method="prod")
11684
11685 @doc(make_doc("mean", ndim=2))
11686 def mean(
11687 self,
11688 axis: Axis | None = 0,
11689 skipna: bool = True,
11690 numeric_only: bool = False,
11691 **kwargs,
11692 ):
11693 result = super().mean(axis, skipna, numeric_only, **kwargs)
11694 if isinstance(result, Series):
11695 result = result.__finalize__(self, method="mean")
11696 return result
11697
11698 @doc(make_doc("median", ndim=2))
11699 def median(
11700 self,
11701 axis: Axis | None = 0,
11702 skipna: bool = True,
11703 numeric_only: bool = False,
11704 **kwargs,
11705 ):
11706 result = super().median(axis, skipna, numeric_only, **kwargs)
11707 if isinstance(result, Series):
11708 result = result.__finalize__(self, method="median")
11709 return result
11710
11711 @doc(make_doc("sem", ndim=2))
11712 def sem(
11713 self,
11714 axis: Axis | None = 0,
11715 skipna: bool = True,
11716 ddof: int = 1,
11717 numeric_only: bool = False,
11718 **kwargs,
11719 ):
11720 result = super().sem(axis, skipna, ddof, numeric_only, **kwargs)
11721 if isinstance(result, Series):
11722 result = result.__finalize__(self, method="sem")
11723 return result
11724
11725 @doc(make_doc("var", ndim=2))
11726 def var(
11727 self,
11728 axis: Axis | None = 0,
11729 skipna: bool = True,
11730 ddof: int = 1,
11731 numeric_only: bool = False,
11732 **kwargs,
11733 ):
11734 result = super().var(axis, skipna, ddof, numeric_only, **kwargs)
11735 if isinstance(result, Series):
11736 result = result.__finalize__(self, method="var")
11737 return result
11738
11739 @doc(make_doc("std", ndim=2))
11740 def std(
11741 self,
11742 axis: Axis | None = 0,
11743 skipna: bool = True,
11744 ddof: int = 1,
11745 numeric_only: bool = False,
11746 **kwargs,
11747 ):
11748 result = super().std(axis, skipna, ddof, numeric_only, **kwargs)
11749 if isinstance(result, Series):
11750 result = result.__finalize__(self, method="std")
11751 return result
11752
11753 @doc(make_doc("skew", ndim=2))
11754 def skew(
11755 self,
11756 axis: Axis | None = 0,
11757 skipna: bool = True,
11758 numeric_only: bool = False,
11759 **kwargs,
11760 ):
11761 result = super().skew(axis, skipna, numeric_only, **kwargs)
11762 if isinstance(result, Series):
11763 result = result.__finalize__(self, method="skew")
11764 return result
11765
11766 @doc(make_doc("kurt", ndim=2))
11767 def kurt(
11768 self,
11769 axis: Axis | None = 0,
11770 skipna: bool = True,
11771 numeric_only: bool = False,
11772 **kwargs,
11773 ):
11774 result = super().kurt(axis, skipna, numeric_only, **kwargs)
11775 if isinstance(result, Series):
11776 result = result.__finalize__(self, method="kurt")
11777 return result
11778
11779 kurtosis = kurt
11780 product = prod
11781
11782 @doc(make_doc("cummin", ndim=2))
11783 def cummin(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs):
11784 return NDFrame.cummin(self, axis, skipna, *args, **kwargs)
11785
11786 @doc(make_doc("cummax", ndim=2))
11787 def cummax(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs):
11788 return NDFrame.cummax(self, axis, skipna, *args, **kwargs)
11789
11790 @doc(make_doc("cumsum", ndim=2))
11791 def cumsum(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs):
11792 return NDFrame.cumsum(self, axis, skipna, *args, **kwargs)
11793
    @doc(make_doc("cumprod", ndim=2))
11795 def cumprod(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs):
11796 return NDFrame.cumprod(self, axis, skipna, *args, **kwargs)
11797
11798 def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series:
11799 """
11800 Count number of distinct elements in specified axis.
11801
11802 Return Series with number of distinct elements. Can ignore NaN
11803 values.
11804
11805 Parameters
11806 ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to use. 0 or 'index' counts distinct values in each
            column, 1 or 'columns' counts distinct values in each row.
11810 dropna : bool, default True
11811 Don't include NaN in the counts.
11812
11813 Returns
11814 -------
11815 Series
11816
11817 See Also
11818 --------
11819 Series.nunique: Method nunique for Series.
11820 DataFrame.count: Count non-NA cells for each column or row.
11821
11822 Examples
11823 --------
11824 >>> df = pd.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]})
11825 >>> df.nunique()
11826 A 3
11827 B 2
11828 dtype: int64
11829
11830 >>> df.nunique(axis=1)
11831 0 1
11832 1 2
11833 2 2
11834 dtype: int64
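
        With missing data, ``dropna=False`` counts ``NaN`` as its own
        distinct value (a small illustration with an assumed frame):

        >>> df = pd.DataFrame({'A': [4, 5, 6], 'B': [4, np.nan, np.nan]})
        >>> df.nunique()
        A    3
        B    1
        dtype: int64
        >>> df.nunique(dropna=False)
        A    3
        B    2
        dtype: int64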
11835 """
11836 return self.apply(Series.nunique, axis=axis, dropna=dropna)
11837
11838 @doc(_shared_docs["idxmin"], numeric_only_default="False")
11839 def idxmin(
11840 self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
11841 ) -> Series:
11842 axis = self._get_axis_number(axis)
11843
11844 if self.empty and len(self.axes[axis]):
11845 axis_dtype = self.axes[axis].dtype
11846 return self._constructor_sliced(dtype=axis_dtype)
11847
11848 if numeric_only:
11849 data = self._get_numeric_data()
11850 else:
11851 data = self
11852
11853 res = data._reduce(
11854 nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False
11855 )
11856 indices = res._values
        # indices will always be np.ndarray since axis is not None
11858
11859 if (indices == -1).any():
11860 warnings.warn(
11861 f"The behavior of {type(self).__name__}.idxmin with all-NA "
11862 "values, or any-NA and skipna=False, is deprecated. In a future "
11863 "version this will raise ValueError",
11864 FutureWarning,
11865 stacklevel=find_stack_level(),
11866 )
11867
11868 index = data._get_axis(axis)
11869 result = algorithms.take(
11870 index._values, indices, allow_fill=True, fill_value=index._na_value
11871 )
11872 final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
11873 return final_result.__finalize__(self, method="idxmin")
11874
11875 @doc(_shared_docs["idxmax"], numeric_only_default="False")
11876 def idxmax(
11877 self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
11878 ) -> Series:
11879 axis = self._get_axis_number(axis)
11880
11881 if self.empty and len(self.axes[axis]):
11882 axis_dtype = self.axes[axis].dtype
11883 return self._constructor_sliced(dtype=axis_dtype)
11884
11885 if numeric_only:
11886 data = self._get_numeric_data()
11887 else:
11888 data = self
11889
11890 res = data._reduce(
11891 nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False
11892 )
11893 indices = res._values
11894 # indices will always be 1d array since axis is not None
11895
11896 if (indices == -1).any():
11897 warnings.warn(
11898 f"The behavior of {type(self).__name__}.idxmax with all-NA "
11899 "values, or any-NA and skipna=False, is deprecated. In a future "
11900 "version this will raise ValueError",
11901 FutureWarning,
11902 stacklevel=find_stack_level(),
11903 )
11904
11905 index = data._get_axis(axis)
11906 result = algorithms.take(
11907 index._values, indices, allow_fill=True, fill_value=index._na_value
11908 )
11909 final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
11910 return final_result.__finalize__(self, method="idxmax")
11911
11912 def _get_agg_axis(self, axis_num: int) -> Index:
11913 """
11914 Let's be explicit about this.
11915 """
11916 if axis_num == 0:
11917 return self.columns
11918 elif axis_num == 1:
11919 return self.index
11920 else:
11921 raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})")
11922
11923 def mode(
11924 self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True
11925 ) -> DataFrame:
11926 """
11927 Get the mode(s) of each element along the selected axis.
11928
11929 The mode of a set of values is the value that appears most often.
11930 It can be multiple values.
11931
11932 Parameters
11933 ----------
11934 axis : {0 or 'index', 1 or 'columns'}, default 0
11935 The axis to iterate over while searching for the mode:
11936
11937 * 0 or 'index' : get mode of each column
11938 * 1 or 'columns' : get mode of each row.
11939
11940 numeric_only : bool, default False
11941 If True, only apply to numeric columns.
11942 dropna : bool, default True
11943 Don't consider counts of NaN/NaT.
11944
11945 Returns
11946 -------
11947 DataFrame
11948 The modes of each column or row.
11949
11950 See Also
11951 --------
11952 Series.mode : Return the highest frequency value in a Series.
11953 Series.value_counts : Return the counts of values in a Series.
11954
11955 Examples
11956 --------
11957 >>> df = pd.DataFrame([('bird', 2, 2),
11958 ... ('mammal', 4, np.nan),
11959 ... ('arthropod', 8, 0),
11960 ... ('bird', 2, np.nan)],
11961 ... index=('falcon', 'horse', 'spider', 'ostrich'),
11962 ... columns=('species', 'legs', 'wings'))
11963 >>> df
11964 species legs wings
11965 falcon bird 2 2.0
11966 horse mammal 4 NaN
11967 spider arthropod 8 0.0
11968 ostrich bird 2 NaN
11969
        By default, missing values are not considered, and the modes of
        ``wings`` are both 0.0 and 2.0. Because the resulting DataFrame has
        two rows, the second row of ``species`` and ``legs`` contains ``NaN``.
11973
11974 >>> df.mode()
11975 species legs wings
11976 0 bird 2.0 0.0
11977 1 NaN NaN 2.0
11978
        With ``dropna=False``, ``NaN`` values are considered and they can be
        the mode (as is the case for ``wings``).
11981
11982 >>> df.mode(dropna=False)
11983 species legs wings
11984 0 bird 2 NaN
11985
11986 Setting ``numeric_only=True``, only the mode of numeric columns is
11987 computed, and columns of other types are ignored.
11988
11989 >>> df.mode(numeric_only=True)
11990 legs wings
11991 0 2.0 0.0
11992 1 NaN 2.0
11993
11994 To compute the mode over columns and not rows, use the axis parameter:
11995
11996 >>> df.mode(axis='columns', numeric_only=True)
11997 0 1
11998 falcon 2.0 NaN
11999 horse 4.0 NaN
12000 spider 0.0 8.0
12001 ostrich 2.0 NaN
12002 """
12003 data = self if not numeric_only else self._get_numeric_data()
12004
12005 def f(s):
12006 return s.mode(dropna=dropna)
12007
12008 data = data.apply(f, axis=axis)
12009 # Ensure index is type stable (should always use int index)
12010 if data.empty:
12011 data.index = default_index(0)
12012
12013 return data
12014
12015 @overload
12016 def quantile(
12017 self,
12018 q: float = ...,
12019 axis: Axis = ...,
12020 numeric_only: bool = ...,
12021 interpolation: QuantileInterpolation = ...,
12022 method: Literal["single", "table"] = ...,
12023 ) -> Series:
12024 ...
12025
12026 @overload
12027 def quantile(
12028 self,
12029 q: AnyArrayLike | Sequence[float],
12030 axis: Axis = ...,
12031 numeric_only: bool = ...,
12032 interpolation: QuantileInterpolation = ...,
12033 method: Literal["single", "table"] = ...,
12034 ) -> Series | DataFrame:
12035 ...
12036
12037 @overload
12038 def quantile(
12039 self,
12040 q: float | AnyArrayLike | Sequence[float] = ...,
12041 axis: Axis = ...,
12042 numeric_only: bool = ...,
12043 interpolation: QuantileInterpolation = ...,
12044 method: Literal["single", "table"] = ...,
12045 ) -> Series | DataFrame:
12046 ...
12047
12048 def quantile(
12049 self,
12050 q: float | AnyArrayLike | Sequence[float] = 0.5,
12051 axis: Axis = 0,
12052 numeric_only: bool = False,
12053 interpolation: QuantileInterpolation = "linear",
12054 method: Literal["single", "table"] = "single",
12055 ) -> Series | DataFrame:
12056 """
12057 Return values at the given quantile over requested axis.
12058
12059 Parameters
12060 ----------
12061 q : float or array-like, default 0.5 (50% quantile)
            Value(s) between 0 and 1 giving the quantile(s) to compute.
12063 axis : {0 or 'index', 1 or 'columns'}, default 0
12064 Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
12065 numeric_only : bool, default False
12066 Include only `float`, `int` or `boolean` data.
12067
12068 .. versionchanged:: 2.0.0
12069 The default value of ``numeric_only`` is now ``False``.
12070
12071 interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
12072 This optional parameter specifies the interpolation method to use,
12073 when the desired quantile lies between two data points `i` and `j`:
12074
12075 * linear: `i + (j - i) * fraction`, where `fraction` is the
12076 fractional part of the index surrounded by `i` and `j`.
12077 * lower: `i`.
12078 * higher: `j`.
12079 * nearest: `i` or `j` whichever is nearest.
12080 * midpoint: (`i` + `j`) / 2.
12081 method : {'single', 'table'}, default 'single'
12082 Whether to compute quantiles per-column ('single') or over all columns
12083 ('table'). When 'table', the only allowed interpolation methods are
12084 'nearest', 'lower', and 'higher'.
12085
12086 Returns
12087 -------
12088 Series or DataFrame
12089
12090 If ``q`` is an array, a DataFrame will be returned where the
12091 index is ``q``, the columns are the columns of self, and the
12092 values are the quantiles.
12093 If ``q`` is a float, a Series will be returned where the
12094 index is the columns of self and the values are the quantiles.
12095
12096 See Also
12097 --------
12098 core.window.rolling.Rolling.quantile: Rolling quantile.
12099 numpy.percentile: Numpy function to compute the percentile.
12100
12101 Examples
12102 --------
12103 >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
12104 ... columns=['a', 'b'])
12105 >>> df.quantile(.1)
12106 a 1.3
12107 b 3.7
12108 Name: 0.1, dtype: float64
12109 >>> df.quantile([.1, .5])
12110 a b
12111 0.1 1.3 3.7
12112 0.5 2.5 55.0
12113
12114 Specifying `method='table'` will compute the quantile over all columns.
12115
12116 >>> df.quantile(.1, method="table", interpolation="nearest")
12117 a 1
12118 b 1
12119 Name: 0.1, dtype: int64
12120 >>> df.quantile([.1, .5], method="table", interpolation="nearest")
12121 a b
12122 0.1 1 1
12123 0.5 3 100
12124
12125 Specifying `numeric_only=False` will also compute the quantile of
12126 datetime and timedelta data.
12127
12128 >>> df = pd.DataFrame({'A': [1, 2],
12129 ... 'B': [pd.Timestamp('2010'),
12130 ... pd.Timestamp('2011')],
12131 ... 'C': [pd.Timedelta('1 days'),
12132 ... pd.Timedelta('2 days')]})
12133 >>> df.quantile(0.5, numeric_only=False)
12134 A 1.5
12135 B 2010-07-02 12:00:00
12136 C 1 days 12:00:00
12137 Name: 0.5, dtype: object
12138 """
12139 validate_percentile(q)
12140 axis = self._get_axis_number(axis)
12141
12142 if not is_list_like(q):
12143 # BlockManager.quantile expects listlike, so we wrap and unwrap here
12144 # error: List item 0 has incompatible type "float | ExtensionArray |
12145 # ndarray[Any, Any] | Index | Series | Sequence[float]"; expected "float"
12146 res_df = self.quantile(
12147 [q], # type: ignore[list-item]
12148 axis=axis,
12149 numeric_only=numeric_only,
12150 interpolation=interpolation,
12151 method=method,
12152 )
12153 if method == "single":
12154 res = res_df.iloc[0]
12155 else:
12156 # cannot directly iloc over sparse arrays
12157 res = res_df.T.iloc[:, 0]
12158 if axis == 1 and len(self) == 0:
12159 # GH#41544 try to get an appropriate dtype
12160 dtype = find_common_type(list(self.dtypes))
12161 if needs_i8_conversion(dtype):
12162 return res.astype(dtype)
12163 return res
12164
12165 q = Index(q, dtype=np.float64)
12166 data = self._get_numeric_data() if numeric_only else self
12167
12168 if axis == 1:
12169 data = data.T
12170
12171 if len(data.columns) == 0:
12172 # GH#23925 _get_numeric_data may have dropped all columns
12173 cols = Index([], name=self.columns.name)
12174
12175 dtype = np.float64
12176 if axis == 1:
12177 # GH#41544 try to get an appropriate dtype
12178 cdtype = find_common_type(list(self.dtypes))
12179 if needs_i8_conversion(cdtype):
12180 dtype = cdtype
12181
12182 res = self._constructor([], index=q, columns=cols, dtype=dtype)
12183 return res.__finalize__(self, method="quantile")
12184
12185 valid_method = {"single", "table"}
12186 if method not in valid_method:
12187 raise ValueError(
12188 f"Invalid method: {method}. Method must be in {valid_method}."
12189 )
12190 if method == "single":
12191 res = data._mgr.quantile(qs=q, interpolation=interpolation)
12192 elif method == "table":
12193 valid_interpolation = {"nearest", "lower", "higher"}
12194 if interpolation not in valid_interpolation:
12195 raise ValueError(
12196 f"Invalid interpolation: {interpolation}. "
12197 f"Interpolation must be in {valid_interpolation}"
12198 )
12199 # handle degenerate case
12200 if len(data) == 0:
12201 if data.ndim == 2:
12202 dtype = find_common_type(list(self.dtypes))
12203 else:
12204 dtype = self.dtype
12205 return self._constructor([], index=q, columns=data.columns, dtype=dtype)
12206
12207 q_idx = np.quantile(np.arange(len(data)), q, method=interpolation)
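            # integer row positions within the sorted order that correspond to
            # the requested quantiles (the allowed interpolation methods all
            # return existing positions, not interpolated ones)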
12208
12209 by = data.columns
12210 if len(by) > 1:
12211 keys = [data._get_label_or_level_values(x) for x in by]
12212 indexer = lexsort_indexer(keys)
12213 else:
12214 k = data._get_label_or_level_values(by[0])
12215 indexer = nargsort(k)
12216
12217 res = data._mgr.take(indexer[q_idx], verify=False)
12218 res.axes[1] = q
12219
12220 result = self._constructor_from_mgr(res, axes=res.axes)
12221 return result.__finalize__(self, method="quantile")
12222
12223 def to_timestamp(
12224 self,
12225 freq: Frequency | None = None,
12226 how: ToTimestampHow = "start",
12227 axis: Axis = 0,
12228 copy: bool | None = None,
12229 ) -> DataFrame:
12230 """
12231 Cast to DatetimeIndex of timestamps, at *beginning* of period.
12232
12233 Parameters
12234 ----------
12235 freq : str, default frequency of PeriodIndex
12236 Desired frequency.
12237 how : {'s', 'e', 'start', 'end'}
12238 Convention for converting period to timestamp; start of period
12239 vs. end.
12240 axis : {0 or 'index', 1 or 'columns'}, default 0
12241 The axis to convert (the index by default).
12242 copy : bool, default True
12243 If False then underlying input data is not copied.
12244
12245 .. note::
12246 The `copy` keyword will change behavior in pandas 3.0.
12247 `Copy-on-Write
12248 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
12249 will be enabled by default, which means that all methods with a
12250 `copy` keyword will use a lazy copy mechanism to defer the copy and
12251 ignore the `copy` keyword. The `copy` keyword will be removed in a
12252 future version of pandas.
12253
12254 You can already get the future behavior and improvements through
12255 enabling copy on write ``pd.options.mode.copy_on_write = True``
12256
12257 Returns
12258 -------
12259 DataFrame
12260 The DataFrame has a DatetimeIndex.
12261
12262 Examples
12263 --------
12264 >>> idx = pd.PeriodIndex(['2023', '2024'], freq='Y')
12265 >>> d = {'col1': [1, 2], 'col2': [3, 4]}
12266 >>> df1 = pd.DataFrame(data=d, index=idx)
12267 >>> df1
12268 col1 col2
12269 2023 1 3
12270 2024 2 4
12271
12272 The resulting timestamps will be at the beginning of the year in this case
12273
12274 >>> df1 = df1.to_timestamp()
12275 >>> df1
12276 col1 col2
12277 2023-01-01 1 3
12278 2024-01-01 2 4
12279 >>> df1.index
12280 DatetimeIndex(['2023-01-01', '2024-01-01'], dtype='datetime64[ns]', freq=None)
12281
12282 Using `freq` which is the offset that the Timestamps will have
12283
12284 >>> df2 = pd.DataFrame(data=d, index=idx)
12285 >>> df2 = df2.to_timestamp(freq='M')
12286 >>> df2
12287 col1 col2
12288 2023-01-31 1 3
12289 2024-01-31 2 4
12290 >>> df2.index
12291 DatetimeIndex(['2023-01-31', '2024-01-31'], dtype='datetime64[ns]', freq=None)
12292 """
12293 new_obj = self.copy(deep=copy and not using_copy_on_write())
12294
12295 axis_name = self._get_axis_name(axis)
12296 old_ax = getattr(self, axis_name)
12297 if not isinstance(old_ax, PeriodIndex):
12298 raise TypeError(f"unsupported Type {type(old_ax).__name__}")
12299
12300 new_ax = old_ax.to_timestamp(freq=freq, how=how)
12301
12302 setattr(new_obj, axis_name, new_ax)
12303 return new_obj
12304
12305 def to_period(
12306 self, freq: Frequency | None = None, axis: Axis = 0, copy: bool | None = None
12307 ) -> DataFrame:
12308 """
12309 Convert DataFrame from DatetimeIndex to PeriodIndex.
12310
12311 Convert DataFrame from DatetimeIndex to PeriodIndex with desired
12312 frequency (inferred from index if not passed).
12313
12314 Parameters
12315 ----------
        freq : str, optional
            Frequency of the PeriodIndex. Defaults to the frequency inferred
            from the index.
12318 axis : {0 or 'index', 1 or 'columns'}, default 0
12319 The axis to convert (the index by default).
12320 copy : bool, default True
12321 If False then underlying input data is not copied.
12322
12323 .. note::
12324 The `copy` keyword will change behavior in pandas 3.0.
12325 `Copy-on-Write
12326 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
12327 will be enabled by default, which means that all methods with a
12328 `copy` keyword will use a lazy copy mechanism to defer the copy and
12329 ignore the `copy` keyword. The `copy` keyword will be removed in a
12330 future version of pandas.
12331
12332 You can already get the future behavior and improvements through
12333 enabling copy on write ``pd.options.mode.copy_on_write = True``
12334
12335 Returns
12336 -------
12337 DataFrame
12338 The DataFrame has a PeriodIndex.
12339
12340 Examples
12341 --------
12342 >>> idx = pd.to_datetime(
12343 ... [
12344 ... "2001-03-31 00:00:00",
12345 ... "2002-05-31 00:00:00",
12346 ... "2003-08-31 00:00:00",
12347 ... ]
12348 ... )
12349
12350 >>> idx
12351 DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'],
12352 dtype='datetime64[ns]', freq=None)
12353
12354 >>> idx.to_period("M")
12355 PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]')
12356
12357 For the yearly frequency
12358
12359 >>> idx.to_period("Y")
12360 PeriodIndex(['2001', '2002', '2003'], dtype='period[Y-DEC]')
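
        Applied to a DataFrame, the conversion happens on the requested axis
        (a minimal sketch reusing ``idx`` from above; the column name is
        arbitrary):

        >>> df = pd.DataFrame({"y": [1, 2, 3]}, index=idx)
        >>> df.to_period("M").index
        PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]')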
12361 """
12362 new_obj = self.copy(deep=copy and not using_copy_on_write())
12363
12364 axis_name = self._get_axis_name(axis)
12365 old_ax = getattr(self, axis_name)
12366 if not isinstance(old_ax, DatetimeIndex):
12367 raise TypeError(f"unsupported Type {type(old_ax).__name__}")
12368
12369 new_ax = old_ax.to_period(freq=freq)
12370
12371 setattr(new_obj, axis_name, new_ax)
12372 return new_obj
12373
12374 def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame:
12375 """
12376 Whether each element in the DataFrame is contained in values.
12377
12378 Parameters
12379 ----------
12380 values : iterable, Series, DataFrame or dict
12381 The result will only be true at a location if all the
12382 labels match. If `values` is a Series, that's the index. If
12383 `values` is a dict, the keys must be the column names,
12384 which must match. If `values` is a DataFrame,
12385 then both the index and column labels must match.
12386
12387 Returns
12388 -------
12389 DataFrame
12390 DataFrame of booleans showing whether each element in the DataFrame
12391 is contained in values.
12392
12393 See Also
12394 --------
12395 DataFrame.eq: Equality test for DataFrame.
12396 Series.isin: Equivalent method on Series.
12397 Series.str.contains: Test if pattern or regex is contained within a
12398 string of a Series or Index.
12399
12400 Examples
12401 --------
12402 >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
12403 ... index=['falcon', 'dog'])
12404 >>> df
12405 num_legs num_wings
12406 falcon 2 2
12407 dog 4 0
12408
12409 When ``values`` is a list check whether every value in the DataFrame
12410 is present in the list (which animals have 0 or 2 legs or wings)
12411
12412 >>> df.isin([0, 2])
12413 num_legs num_wings
12414 falcon True True
12415 dog False True
12416
12417 To check if ``values`` is *not* in the DataFrame, use the ``~`` operator:
12418
12419 >>> ~df.isin([0, 2])
12420 num_legs num_wings
12421 falcon False False
12422 dog True False
12423
12424 When ``values`` is a dict, we can pass values to check for each
12425 column separately:
12426
12427 >>> df.isin({'num_wings': [0, 3]})
12428 num_legs num_wings
12429 falcon False False
12430 dog False True
12431
12432 When ``values`` is a Series or DataFrame the index and column must
12433 match. Note that 'falcon' does not match based on the number of legs
12434 in other.
12435
12436 >>> other = pd.DataFrame({'num_legs': [8, 3], 'num_wings': [0, 2]},
12437 ... index=['spider', 'falcon'])
12438 >>> df.isin(other)
12439 num_legs num_wings
12440 falcon False True
12441 dog False False
12442 """
12443 if isinstance(values, dict):
12444 from pandas.core.reshape.concat import concat
12445
12446 values = collections.defaultdict(list, values)
12447 result = concat(
12448 (
12449 self.iloc[:, [i]].isin(values[col])
12450 for i, col in enumerate(self.columns)
12451 ),
12452 axis=1,
12453 )
12454 elif isinstance(values, Series):
12455 if not values.index.is_unique:
12456 raise ValueError("cannot compute isin with a duplicate axis.")
12457 result = self.eq(values.reindex_like(self), axis="index")
12458 elif isinstance(values, DataFrame):
12459 if not (values.columns.is_unique and values.index.is_unique):
12460 raise ValueError("cannot compute isin with a duplicate axis.")
12461 result = self.eq(values.reindex_like(self))
12462 else:
12463 if not is_list_like(values):
12464 raise TypeError(
12465 "only list-like or dict-like objects are allowed "
12466 "to be passed to DataFrame.isin(), "
12467 f"you passed a '{type(values).__name__}'"
12468 )
12469
12470 def isin_(x):
12471 # error: Argument 2 to "isin" has incompatible type "Union[Series,
12472 # DataFrame, Sequence[Any], Mapping[Any, Any]]"; expected
12473 # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]], Index,
12474 # Series], List[Any], range]"
12475 result = algorithms.isin(
12476 x.ravel(),
12477 values, # type: ignore[arg-type]
12478 )
12479 return result.reshape(x.shape)
12480
12481 res_mgr = self._mgr.apply(isin_)
12482 result = self._constructor_from_mgr(
12483 res_mgr,
12484 axes=res_mgr.axes,
12485 )
12486 return result.__finalize__(self, method="isin")
12487
12488 # ----------------------------------------------------------------------
12489 # Add index and columns
12490 _AXIS_ORDERS: list[Literal["index", "columns"]] = ["index", "columns"]
12491 _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = {
12492 **NDFrame._AXIS_TO_AXIS_NUMBER,
12493 1: 1,
12494 "columns": 1,
12495 }
12496 _AXIS_LEN = len(_AXIS_ORDERS)
12497 _info_axis_number: Literal[1] = 1
12498 _info_axis_name: Literal["columns"] = "columns"
12499
12500 index = properties.AxisProperty(
12501 axis=1,
12502 doc="""
12503 The index (row labels) of the DataFrame.
12504
12505 The index of a DataFrame is a series of labels that identify each row.
12506 The labels can be integers, strings, or any other hashable type. The index
12507 is used for label-based access and alignment, and can be accessed or
12508 modified using this attribute.
12509
12510 Returns
12511 -------
12512 pandas.Index
12513 The index labels of the DataFrame.
12514
12515 See Also
12516 --------
12517 DataFrame.columns : The column labels of the DataFrame.
12518 DataFrame.to_numpy : Convert the DataFrame to a NumPy array.
12519
12520 Examples
12521 --------
12522 >>> df = pd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'],
12523 ... 'Age': [25, 30, 35],
12524 ... 'Location': ['Seattle', 'New York', 'Kona']},
12525 ... index=([10, 20, 30]))
12526 >>> df.index
12527 Index([10, 20, 30], dtype='int64')
12528
12529 In this example, we create a DataFrame with 3 rows and 3 columns,
12530 including Name, Age, and Location information. We set the index labels to
12531 be the integers 10, 20, and 30. We then access the `index` attribute of the
12532 DataFrame, which returns an `Index` object containing the index labels.
12533
12534 >>> df.index = [100, 200, 300]
12535 >>> df
12536 Name Age Location
12537 100 Alice 25 Seattle
12538 200 Bob 30 New York
12539 300 Aritra 35 Kona
12540
12541 In this example, we modify the index labels of the DataFrame by assigning
12542 a new list of labels to the `index` attribute. The DataFrame is then
12543 updated with the new labels, and the output shows the modified DataFrame.
12544 """,
12545 )
12546 columns = properties.AxisProperty(
12547 axis=0,
12548 doc=dedent(
12549 """
12550 The column labels of the DataFrame.
12551
12552 Examples
12553 --------
12554 >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
12555 >>> df
12556 A B
12557 0 1 3
12558 1 2 4
12559 >>> df.columns
12560 Index(['A', 'B'], dtype='object')
12561 """
12562 ),
12563 )
12564
12565 # ----------------------------------------------------------------------
12566 # Add plotting methods to DataFrame
12567 plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
12568 hist = pandas.plotting.hist_frame
12569 boxplot = pandas.plotting.boxplot_frame
12570 sparse = CachedAccessor("sparse", SparseFrameAccessor)
12571
12572 # ----------------------------------------------------------------------
12573 # Internal Interface Methods
12574
12575 def _to_dict_of_blocks(self):
12576 """
        Return a dict of dtype -> constructed DataFrame, where each frame
        contains only columns of that homogeneous dtype.
12579
12580 Internal ONLY - only works for BlockManager
12581 """
12582 mgr = self._mgr
12583 # convert to BlockManager if needed -> this way support ArrayManager as well
12584 mgr = cast(BlockManager, mgr_to_mgr(mgr, "block"))
12585 return {
12586 k: self._constructor_from_mgr(v, axes=v.axes).__finalize__(self)
            for k, v in mgr.to_dict().items()
12588 }
12589
12590 @property
12591 def values(self) -> np.ndarray:
12592 """
12593 Return a Numpy representation of the DataFrame.
12594
12595 .. warning::
12596
12597 We recommend using :meth:`DataFrame.to_numpy` instead.
12598
12599 Only the values in the DataFrame will be returned, the axes labels
12600 will be removed.
12601
12602 Returns
12603 -------
12604 numpy.ndarray
12605 The values of the DataFrame.
12606
12607 See Also
12608 --------
12609 DataFrame.to_numpy : Recommended alternative to this method.
12610 DataFrame.index : Retrieve the index labels.
12611 DataFrame.columns : Retrieving the column names.
12612
12613 Notes
12614 -----
12615 The dtype will be a lower-common-denominator dtype (implicit
12616 upcasting); that is to say if the dtypes (even of numeric types)
12617 are mixed, the one that accommodates all will be chosen. Use this
12618 with care if you are not dealing with the blocks.
12619
12620 e.g. If the dtypes are float16 and float32, dtype will be upcast to
12621 float32. If dtypes are int32 and uint8, dtype will be upcast to
12622 int32. By :func:`numpy.find_common_type` convention, mixing int64
12623 and uint64 will result in a float64 dtype.
12624
12625 Examples
12626 --------
12627 A DataFrame where all columns are the same type (e.g., int64) results
12628 in an array of the same type.
12629
12630 >>> df = pd.DataFrame({'age': [ 3, 29],
12631 ... 'height': [94, 170],
12632 ... 'weight': [31, 115]})
12633 >>> df
12634 age height weight
12635 0 3 94 31
12636 1 29 170 115
12637 >>> df.dtypes
12638 age int64
12639 height int64
12640 weight int64
12641 dtype: object
12642 >>> df.values
12643 array([[ 3, 94, 31],
12644 [ 29, 170, 115]])
12645
        A DataFrame with mixed type columns (e.g., str/object, int64, float32)
12647 results in an ndarray of the broadest type that accommodates these
12648 mixed types (e.g., object).
12649
12650 >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'),
12651 ... ('lion', 80.5, 1),
12652 ... ('monkey', np.nan, None)],
12653 ... columns=('name', 'max_speed', 'rank'))
12654 >>> df2.dtypes
12655 name object
12656 max_speed float64
12657 rank object
12658 dtype: object
12659 >>> df2.values
12660 array([['parrot', 24.0, 'second'],
12661 ['lion', 80.5, 1],
12662 ['monkey', nan, None]], dtype=object)
12663 """
12664 return self._mgr.as_array()
12665
12666
12667def _from_nested_dict(data) -> collections.defaultdict:
12668 new_data: collections.defaultdict = collections.defaultdict(dict)
12669 for index, s in data.items():
12670 for col, v in s.items():
12671 new_data[col][index] = v
12672 return new_data
12673
12674
12675def _reindex_for_setitem(
12676 value: DataFrame | Series, index: Index
12677) -> tuple[ArrayLike, BlockValuesRefs | None]:
12678 # reindex if necessary
12679
12680 if value.index.equals(index) or not len(index):
12681 if using_copy_on_write() and isinstance(value, Series):
12682 return value._values, value._references
12683 return value._values.copy(), None
12684
12685 # GH#4107
12686 try:
12687 reindexed_value = value.reindex(index)._values
12688 except ValueError as err:
12689 # raised in MultiIndex.from_tuples, see test_insert_error_msmgs
12690 if not value.index.is_unique:
12691 # duplicate axis
12692 raise err
12693
12694 raise TypeError(
12695 "incompatible index of inserted column with frame index"
12696 ) from err
12697 return reindexed_value, None