"""
DataFrame
---------
An efficient 2D container for potentially mixed-type time series or other
labeled data series.

Similar to its R counterpart, data.frame, except providing automatic data
alignment and a host of useful data manipulation methods having to do with the
labeling information.
"""
from __future__ import annotations

import collections
from collections import abc
import datetime
import functools
from io import StringIO
import itertools
import sys
from textwrap import dedent
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Hashable,
    Iterable,
    Iterator,
    Literal,
    Mapping,
    Sequence,
    cast,
    overload,
)
import warnings

import numpy as np
from numpy import ma

from pandas._config import (
    get_option,
    using_copy_on_write,
)

from pandas._libs import (
    algos as libalgos,
    lib,
    properties,
)
from pandas._libs.hashtable import duplicated
from pandas._libs.lib import (
    NoDefault,
    is_range_indexer,
    no_default,
)
from pandas._typing import (
    AggFuncType,
    AlignJoin,
    AnyAll,
    AnyArrayLike,
    ArrayLike,
    Axes,
    Axis,
    AxisInt,
    ColspaceArgType,
    CompressionOptions,
    CorrelationMethod,
    DropKeep,
    Dtype,
    DtypeObj,
    FilePath,
    FillnaOptions,
    FloatFormatType,
    FormattersType,
    Frequency,
    IgnoreRaise,
    IndexKeyFunc,
    IndexLabel,
    Level,
    MergeHow,
    NaPosition,
    PythonFuncType,
    QuantileInterpolation,
    ReadBuffer,
    Renamer,
    Scalar,
    SortKind,
    StorageOptions,
    Suffixes,
    TimedeltaConvertibleTypes,
    TimestampConvertibleTypes,
    ValueKeyFunc,
    WriteBuffer,
    npt,
)
from pandas.compat import PYPY
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import (
    function as nv,
    np_percentile_argname,
)
from pandas.errors import (
    ChainedAssignmentError,
    InvalidIndexError,
    _chained_assignment_msg,
)
from pandas.util._decorators import (
    Appender,
    Substitution,
    doc,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import (
    validate_ascending,
    validate_bool_kwarg,
    validate_percentile,
)

from pandas.core.dtypes.cast import (
    LossySetitemError,
    can_hold_element,
    construct_1d_arraylike_from_scalar,
    construct_2d_arraylike_from_scalar,
    find_common_type,
    infer_dtype_from_scalar,
    invalidate_string_dtypes,
    maybe_box_native,
    maybe_downcast_to_dtype,
)
from pandas.core.dtypes.common import (
    infer_dtype_from_object,
    is_1d_only_ea_dtype,
    is_bool_dtype,
    is_dataclass,
    is_dict_like,
    is_dtype_equal,
    is_extension_array_dtype,
    is_float,
    is_float_dtype,
    is_hashable,
    is_integer,
    is_integer_dtype,
    is_iterator,
    is_list_like,
    is_scalar,
    is_sequence,
    needs_i8_conversion,
    pandas_dtype,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.missing import (
    isna,
    notna,
)

from pandas.core import (
    algorithms,
    common as com,
    nanops,
    ops,
)
from pandas.core.accessor import CachedAccessor
from pandas.core.apply import (
    reconstruct_func,
    relabel_result,
)
from pandas.core.array_algos.take import take_2d_multi
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays import (
    DatetimeArray,
    ExtensionArray,
    PeriodArray,
    TimedeltaArray,
)
from pandas.core.arrays.arrow import ArrowDtype
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.construction import (
    ensure_wrapped_if_datetimelike,
    extract_array,
    sanitize_array,
    sanitize_masked_array,
)
from pandas.core.generic import NDFrame
from pandas.core.indexers import check_key_length
from pandas.core.indexes.api import (
    DatetimeIndex,
    Index,
    PeriodIndex,
    default_index,
    ensure_index,
    ensure_index_from_sequences,
)
from pandas.core.indexes.multi import (
    MultiIndex,
    maybe_droplevels,
)
from pandas.core.indexing import (
    check_bool_indexer,
    check_dict_or_set_indexers,
)
from pandas.core.internals import (
    ArrayManager,
    BlockManager,
)
from pandas.core.internals.construction import (
    arrays_to_mgr,
    dataclasses_to_dicts,
    dict_to_mgr,
    mgr_to_mgr,
    ndarray_to_mgr,
    nested_data_to_arrays,
    rec_array_to_mgr,
    reorder_arrays,
    to_arrays,
    treat_as_nested,
)
from pandas.core.methods import selectn
from pandas.core.reshape.melt import melt
from pandas.core.series import Series
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import (
    get_group_index,
    lexsort_indexer,
    nargsort,
)

from pandas.io.common import get_handle
from pandas.io.formats import (
    console,
    format as fmt,
)
from pandas.io.formats.info import (
    INFO_DOCSTRING,
    DataFrameInfo,
    frame_sub_kwargs,
)
import pandas.plotting

if TYPE_CHECKING:
    from pandas.core.groupby.generic import DataFrameGroupBy
    from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg
    from pandas.core.internals import SingleDataManager
    from pandas.core.resample import Resampler

    from pandas.io.formats.style import Styler

# ---------------------------------------------------------------------
# Docstring templates

_shared_doc_kwargs = {
    "axes": "index, columns",
    "klass": "DataFrame",
    "axes_single_arg": "{0 or 'index', 1 or 'columns'}",
    "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0
        If 0 or 'index': apply function to each column.
        If 1 or 'columns': apply function to each row.""",
    "inplace": """
    inplace : bool, default False
        Whether to modify the DataFrame rather than creating a new one.""",
    "optional_by": """
by : str or list of str
    Name or list of names to sort by.

    - if `axis` is 0 or `'index'` then `by` may contain index
      levels and/or column labels.
    - if `axis` is 1 or `'columns'` then `by` may contain column
      levels and/or index labels.""",
    "optional_reindex": """
labels : array-like, optional
    New labels / index to conform the axis specified by 'axis' to.
index : array-like, optional
    New labels for the index. Preferably an Index object to avoid
    duplicating data.
columns : array-like, optional
    New labels for the columns. Preferably an Index object to avoid
    duplicating data.
axis : int or str, optional
    Axis to target. Can be either the axis name ('index', 'columns')
    or number (0, 1).""",
    "replace_iloc": """
    This differs from updating with ``.loc`` or ``.iloc``, which require
    you to specify a location to update with some value.""",
}

_numeric_only_doc = """numeric_only : bool, default False
    Include only float, int, boolean data.
"""

_merge_doc = """
Merge DataFrame or named Series objects with a database-style join.

A named Series object is treated as a DataFrame with a single named column.

The join is done on columns or indexes. If joining columns on
columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
on indexes or indexes on a column or columns, the index will be passed on.
When performing a cross merge, no column specifications to merge on are
allowed.

.. warning::

    If both key columns contain rows where the key is a null value, those
    rows will be matched against each other. This is different from usual SQL
    join behaviour and can lead to unexpected results (see the last example
    below).

Parameters
----------%s
right : DataFrame or named Series
    Object to merge with.
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
    Type of merge to be performed.

    * left: use only keys from left frame, similar to a SQL left outer join;
      preserve key order.
    * right: use only keys from right frame, similar to a SQL right outer join;
      preserve key order.
    * outer: use union of keys from both frames, similar to a SQL full outer
      join; sort keys lexicographically.
    * inner: use intersection of keys from both frames, similar to a SQL inner
      join; preserve the order of the left keys.
    * cross: creates the cartesian product from both frames, preserves the order
      of the left keys.

      .. versionadded:: 1.2.0

on : label or list
    Column or index level names to join on. These must be found in both
    DataFrames. If `on` is None and not merging on indexes then this defaults
    to the intersection of the columns in both DataFrames.
left_on : label or list, or array-like
    Column or index level names to join on in the left DataFrame. Can also
    be an array or list of arrays of the length of the left DataFrame.
    These arrays are treated as if they are columns.
right_on : label or list, or array-like
    Column or index level names to join on in the right DataFrame. Can also
    be an array or list of arrays of the length of the right DataFrame.
    These arrays are treated as if they are columns.
left_index : bool, default False
    Use the index from the left DataFrame as the join key(s). If it is a
    MultiIndex, the number of keys in the other DataFrame (either the index
    or a number of columns) must match the number of levels.
right_index : bool, default False
    Use the index from the right DataFrame as the join key. Same caveats as
    left_index.
sort : bool, default False
    Sort the join keys lexicographically in the result DataFrame. If False,
    the order of the join keys depends on the join type (how keyword).
suffixes : list-like, default is ("_x", "_y")
    A length-2 sequence where each element is optionally a string
    indicating the suffix to add to overlapping column names in
    `left` and `right` respectively. Pass a value of `None` instead
    of a string to indicate that the column name from `left` or
    `right` should be left as-is, with no suffix. At least one of the
    values must not be None.
copy : bool, default True
    If False, avoid copy if possible.
indicator : bool or str, default False
    If True, adds a column to the output DataFrame called "_merge" with
    information on the source of each row. The column can be given a different
    name by providing a string argument. The column will have a Categorical
    type with the value of "left_only" for observations whose merge key only
    appears in the left DataFrame, "right_only" for observations
    whose merge key only appears in the right DataFrame, and "both"
    if the observation's merge key is found in both DataFrames.

validate : str, optional
    If specified, checks if merge is of specified type.

    * "one_to_one" or "1:1": check if merge keys are unique in both
      left and right datasets.
    * "one_to_many" or "1:m": check if merge keys are unique in left
      dataset.
    * "many_to_one" or "m:1": check if merge keys are unique in right
      dataset.
    * "many_to_many" or "m:m": allowed, but does not result in checks.

Returns
-------
DataFrame
    A DataFrame of the two merged objects.

See Also
--------
merge_ordered : Merge with optional filling/interpolation.
merge_asof : Merge on nearest keys.
DataFrame.join : Similar method using indices.

Notes
-----
Support for specifying index levels as the `on`, `left_on`, and
`right_on` parameters was added in version 0.23.0.
Support for merging named Series objects was added in version 0.24.0.

Examples
--------
>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [1, 2, 3, 5]})
>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [5, 6, 7, 8]})
>>> df1
  lkey  value
0  foo      1
1  bar      2
2  baz      3
3  foo      5
>>> df2
  rkey  value
0  foo      5
1  bar      6
2  baz      7
3  foo      8

Merge df1 and df2 on the lkey and rkey columns. The value columns have
the default suffixes, _x and _y, appended.

>>> df1.merge(df2, left_on='lkey', right_on='rkey')
  lkey  value_x rkey  value_y
0  foo        1  foo        5
1  foo        1  foo        8
2  foo        5  foo        5
3  foo        5  foo        8
4  bar        2  bar        6
5  baz        3  baz        7

Merge DataFrames df1 and df2 with specified left and right suffixes
appended to any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey',
...           suffixes=('_left', '_right'))
  lkey  value_left rkey  value_right
0  foo           1  foo            5
1  foo           1  foo            8
2  foo           5  foo            5
3  foo           5  foo            8
4  bar           2  bar            6
5  baz           3  baz            7

Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
Traceback (most recent call last):
...
ValueError: columns overlap but no suffix specified:
    Index(['value'], dtype='object')

>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
>>> df1
     a  b
0  foo  1
1  bar  2
>>> df2
     a  c
0  foo  3
1  baz  4

>>> df1.merge(df2, how='inner', on='a')
     a  b  c
0  foo  1  3

>>> df1.merge(df2, how='left', on='a')
     a  b    c
0  foo  1  3.0
1  bar  2  NaN

>>> df1 = pd.DataFrame({'left': ['foo', 'bar']})
>>> df2 = pd.DataFrame({'right': [7, 8]})
>>> df1
  left
0  foo
1  bar
>>> df2
   right
0      7
1      8

>>> df1.merge(df2, how='cross')
  left  right
0  foo      7
1  foo      8
2  bar      7
3  bar      8
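
As noted in the warning above, null keys match each other, unlike in SQL
joins. A minimal sketch (the printed alignment is illustrative):

>>> df1 = pd.DataFrame({'a': [1.0, np.nan], 'b': [1, 2]})
>>> df2 = pd.DataFrame({'a': [1.0, np.nan], 'c': [3, 4]})
>>> df1.merge(df2, on='a')
     a  b  c
0  1.0  1  3
1  NaN  2  4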
"""


# -----------------------------------------------------------------------
# DataFrame class


class DataFrame(NDFrame, OpsMixin):
    """
    Two-dimensional, size-mutable, potentially heterogeneous tabular data.

    Data structure also contains labeled axes (rows and columns).
    Arithmetic operations align on both row and column labels. Can be
    thought of as a dict-like container for Series objects. The primary
    pandas data structure.

    Parameters
    ----------
    data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
        Dict can contain Series, arrays, constants, dataclass or list-like objects. If
        data is a dict, column order follows insertion-order. If a dict contains Series
        which have an index defined, it is aligned by its index. This alignment also
        occurs if data is a Series or a DataFrame itself. Alignment is done on
        Series/DataFrame inputs.

        If data is a list of dicts, column order follows insertion-order.

    index : Index or array-like
        Index to use for resulting frame. Will default to RangeIndex if
        no indexing information is part of the input data and no index is
        provided.
    columns : Index or array-like
        Column labels to use for resulting frame when data does not have them,
        defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels,
        will perform column selection instead.
    dtype : dtype, default None
        Data type to force. Only a single dtype is allowed. If None, infer.
    copy : bool or None, default None
        Copy data from inputs.
        For dict data, the default of None behaves like ``copy=True``. For DataFrame
        or 2d ndarray input, the default of None behaves like ``copy=False``.
        If data is a dict containing one or more Series (possibly of different dtypes),
        ``copy=False`` will ensure that these inputs are not copied.

        .. versionchanged:: 1.3.0

    See Also
    --------
    DataFrame.from_records : Constructor from tuples, also record arrays.
    DataFrame.from_dict : From dicts of Series, arrays, or dicts.
    read_csv : Read a comma-separated values (csv) file into DataFrame.
    read_table : Read general delimited file into DataFrame.
    read_clipboard : Read text from clipboard into DataFrame.

    Notes
    -----
    Please reference the :ref:`User Guide <basics.dataframe>` for more information.

    Examples
    --------
    Constructing DataFrame from a dictionary.

    >>> d = {'col1': [1, 2], 'col2': [3, 4]}
    >>> df = pd.DataFrame(data=d)
    >>> df
       col1  col2
    0     1     3
    1     2     4

    Notice that the inferred dtype is int64.

    >>> df.dtypes
    col1    int64
    col2    int64
    dtype: object

    To enforce a single dtype:

    >>> df = pd.DataFrame(data=d, dtype=np.int8)
    >>> df.dtypes
    col1    int8
    col2    int8
    dtype: object

    Constructing DataFrame from a dictionary including Series:

    >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
    >>> pd.DataFrame(data=d, index=[0, 1, 2, 3])
       col1  col2
    0     0   NaN
    1     1   NaN
    2     2   2.0
    3     3   3.0

    Constructing DataFrame from numpy ndarray:

    >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
    ...                    columns=['a', 'b', 'c'])
    >>> df2
       a  b  c
    0  1  2  3
    1  4  5  6
    2  7  8  9

    Constructing DataFrame from a numpy ndarray that has labeled columns:

    >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
    ...                 dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")])
    >>> df3 = pd.DataFrame(data, columns=['c', 'a'])
    ...
    >>> df3
       c  a
    0  3  1
    1  6  4
    2  9  7

    Constructing DataFrame from dataclass:

    >>> from dataclasses import make_dataclass
    >>> Point = make_dataclass("Point", [("x", int), ("y", int)])
    >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
       x  y
    0  0  0
    1  0  3
    2  2  3

    Constructing DataFrame from Series/DataFrame:

    >>> ser = pd.Series([1, 2, 3], index=["a", "b", "c"])
    >>> df = pd.DataFrame(data=ser, index=["a", "c"])
    >>> df
       0
    a  1
    c  3

    >>> df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"], columns=["x"])
    >>> df2 = pd.DataFrame(data=df1, index=["a", "c"])
    >>> df2
       x
    a  1
    c  3
    """

    _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set
    _typ = "dataframe"
    _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray)
    _accessors: set[str] = {"sparse"}
    _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([])
    _mgr: BlockManager | ArrayManager

    @property
    def _constructor(self) -> Callable[..., DataFrame]:
        return DataFrame

    _constructor_sliced: Callable[..., Series] = Series

    # ----------------------------------------------------------------------
    # Constructors

    def __init__(
        self,
        data=None,
        index: Axes | None = None,
        columns: Axes | None = None,
        dtype: Dtype | None = None,
        copy: bool | None = None,
    ) -> None:
        if dtype is not None:
            dtype = self._validate_dtype(dtype)

        if isinstance(data, DataFrame):
            data = data._mgr
            if not copy:
                # if not copying data, ensure to still return a shallow copy
                # to avoid the result sharing the same Manager
                data = data.copy(deep=False)

        if isinstance(data, (BlockManager, ArrayManager)):
            if using_copy_on_write():
                data = data.copy(deep=False)
            # first check if a Manager is passed without any other arguments
            # -> use fastpath (without checking Manager type)
            if index is None and columns is None and dtype is None and not copy:
                # GH#33357 fastpath
                NDFrame.__init__(self, data)
                return

        manager = get_option("mode.data_manager")

        # GH47215
        if index is not None and isinstance(index, set):
            raise ValueError("index cannot be a set")
        if columns is not None and isinstance(columns, set):
            raise ValueError("columns cannot be a set")

        if copy is None:
            if isinstance(data, dict):
                # retain pre-GH#38939 default behavior
                copy = True
            elif (
                manager == "array"
                and isinstance(data, (np.ndarray, ExtensionArray))
                and data.ndim == 2
            ):
                # INFO(ArrayManager) by default copy the 2D input array to get
                # contiguous 1D arrays
                copy = True
            elif using_copy_on_write() and not isinstance(
                data, (Index, DataFrame, Series)
            ):
                copy = True
            else:
                copy = False

        if data is None:
            index = index if index is not None else default_index(0)
            columns = columns if columns is not None else default_index(0)
            dtype = dtype if dtype is not None else pandas_dtype(object)
            data = []

        if isinstance(data, (BlockManager, ArrayManager)):
            mgr = self._init_mgr(
                data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
            )

        elif isinstance(data, dict):
            # GH#38939 de facto copy defaults to False only in non-dict cases
            mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
        elif isinstance(data, ma.MaskedArray):
            from numpy.ma import mrecords

            # masked recarray
            if isinstance(data, mrecords.MaskedRecords):
                raise TypeError(
                    "MaskedRecords are not supported. Pass "
                    "{name: data[name] for name in data.dtype.names} "
                    "instead"
                )

            # a masked array
            data = sanitize_masked_array(data)
            mgr = ndarray_to_mgr(
                data,
                index,
                columns,
                dtype=dtype,
                copy=copy,
                typ=manager,
            )

        elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)):
            if data.dtype.names:
                # i.e. numpy structured array
                data = cast(np.ndarray, data)
                mgr = rec_array_to_mgr(
                    data,
                    index,
                    columns,
                    dtype,
                    copy,
                    typ=manager,
                )
            elif getattr(data, "name", None) is not None:
                # i.e. Series/Index with non-None name
                _copy = copy if using_copy_on_write() else True
                mgr = dict_to_mgr(
                    # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no
                    # attribute "name"
                    {data.name: data},  # type: ignore[union-attr]
                    index,
                    columns,
                    dtype=dtype,
                    typ=manager,
                    copy=_copy,
                )
            else:
                mgr = ndarray_to_mgr(
                    data,
                    index,
                    columns,
                    dtype=dtype,
                    copy=copy,
                    typ=manager,
                )

        # For data is list-like, or Iterable (will consume into list)
        elif is_list_like(data):
            if not isinstance(data, abc.Sequence):
                if hasattr(data, "__array__"):
                    # GH#44616 big perf improvement for e.g. pytorch tensor
                    data = np.asarray(data)
                else:
                    data = list(data)
            if len(data) > 0:
                if is_dataclass(data[0]):
                    data = dataclasses_to_dicts(data)
                if not isinstance(data, np.ndarray) and treat_as_nested(data):
                    # exclude ndarray as we may have cast it a few lines above
                    if columns is not None:
                        columns = ensure_index(columns)
                    arrays, columns, index = nested_data_to_arrays(
                        # error: Argument 3 to "nested_data_to_arrays" has incompatible
                        # type "Optional[Collection[Any]]"; expected "Optional[Index]"
                        data,
                        columns,
                        index,  # type: ignore[arg-type]
                        dtype,
                    )
                    mgr = arrays_to_mgr(
                        arrays,
                        columns,
                        index,
                        dtype=dtype,
                        typ=manager,
                    )
                else:
                    mgr = ndarray_to_mgr(
                        data,
                        index,
                        columns,
                        dtype=dtype,
                        copy=copy,
                        typ=manager,
                    )
            else:
                mgr = dict_to_mgr(
                    {},
                    index,
                    columns if columns is not None else default_index(0),
                    dtype=dtype,
                    typ=manager,
                )
        # For data is scalar
        else:
            if index is None or columns is None:
                raise ValueError("DataFrame constructor not properly called!")

            index = ensure_index(index)
            columns = ensure_index(columns)

            if not dtype:
                dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True)

            # For data is a scalar extension dtype
            if isinstance(dtype, ExtensionDtype):
                # TODO(EA2D): special case not needed with 2D EAs

                values = [
                    construct_1d_arraylike_from_scalar(data, len(index), dtype)
                    for _ in range(len(columns))
                ]
                mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager)
            else:
                arr2d = construct_2d_arraylike_from_scalar(
                    data,
                    len(index),
                    len(columns),
                    dtype,
                    copy,
                )

                mgr = ndarray_to_mgr(
                    arr2d,
                    index,
                    columns,
                    dtype=arr2d.dtype,
                    copy=False,
                    typ=manager,
                )

        # ensure correct Manager type according to settings
        mgr = mgr_to_mgr(mgr, typ=manager)

        NDFrame.__init__(self, mgr)

    # ----------------------------------------------------------------------
    def __dataframe__(
        self, nan_as_null: bool = False, allow_copy: bool = True
    ) -> DataFrameXchg:
        """
        Return the dataframe interchange object implementing the interchange protocol.

        Parameters
        ----------
        nan_as_null : bool, default False
            Whether to tell the DataFrame to overwrite null values in the data
            with ``NaN`` (or ``NaT``).
        allow_copy : bool, default True
            Whether to allow memory copying when exporting. If set to False
            it would cause non-zero-copy exports to fail.

        Returns
        -------
        DataFrame interchange object
            An object that a consuming library can use to ingest the dataframe.

        Notes
        -----
        Details on the interchange protocol:
        https://data-apis.org/dataframe-protocol/latest/index.html

        `nan_as_null` currently has no effect; once support for nullable extension
        dtypes is added, this value should be propagated to columns.
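
        Examples
        --------
        A minimal sketch of round-tripping through the protocol;
        ``pd.api.interchange.from_dataframe`` is the consumer-side entry
        point:

        >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
        >>> interchange_object = df.__dataframe__()
        >>> interchange_object.column_names()
        Index(['A', 'B'], dtype='object')
        >>> pd.api.interchange.from_dataframe(interchange_object)
           A  B
        0  1  3
        1  2  4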
        """

        from pandas.core.interchange.dataframe import PandasDataFrameXchg

        return PandasDataFrameXchg(self, nan_as_null, allow_copy)

    # ----------------------------------------------------------------------

    @property
    def axes(self) -> list[Index]:
        """
        Return a list representing the axes of the DataFrame.

        It has the row axis labels and column axis labels as the only members.
        They are returned in that order.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.axes
        [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
        dtype='object')]
        """
        return [self.index, self.columns]

    @property
    def shape(self) -> tuple[int, int]:
        """
        Return a tuple representing the dimensionality of the DataFrame.

        See Also
        --------
        ndarray.shape : Tuple of array dimensions.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.shape
        (2, 2)

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
        ...                    'col3': [5, 6]})
        >>> df.shape
        (2, 3)
        """
        return len(self.index), len(self.columns)

    @property
    def _is_homogeneous_type(self) -> bool:
        """
        Whether all the columns in a DataFrame have the same type.

        Returns
        -------
        bool

        See Also
        --------
        Index._is_homogeneous_type : Whether the object has a single
            dtype.
        MultiIndex._is_homogeneous_type : Whether all the levels of a
            MultiIndex have the same dtype.

        Examples
        --------
        >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
        True
        >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
        False

        Items with the same type but different sizes are considered
        different types.

        >>> DataFrame({
        ...     "A": np.array([1, 2], dtype=np.int32),
        ...     "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
        False
        """
        if isinstance(self._mgr, ArrayManager):
            return len({arr.dtype for arr in self._mgr.arrays}) == 1
        if self._mgr.any_extension_types:
            return len({block.dtype for block in self._mgr.blocks}) == 1
        else:
            return not self._is_mixed_type

    @property
    def _can_fast_transpose(self) -> bool:
        """
        Can we transpose this DataFrame without creating any new array objects.
        """
        if isinstance(self._mgr, ArrayManager):
            return False
        blocks = self._mgr.blocks
        if len(blocks) != 1:
            return False

        dtype = blocks[0].dtype
        # TODO(EA2D) special case would be unnecessary with 2D EAs
        return not is_1d_only_ea_dtype(dtype)

    @property
    def _values(self) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray:
        """
        Analogue to ._values that may return a 2D ExtensionArray.
        """
        mgr = self._mgr

        if isinstance(mgr, ArrayManager):
            if len(mgr.arrays) == 1 and not is_1d_only_ea_dtype(mgr.arrays[0].dtype):
                # error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]"
                # has no attribute "reshape"
                return mgr.arrays[0].reshape(-1, 1)  # type: ignore[union-attr]
            return ensure_wrapped_if_datetimelike(self.values)

        blocks = mgr.blocks
        if len(blocks) != 1:
            return ensure_wrapped_if_datetimelike(self.values)

        arr = blocks[0].values
        if arr.ndim == 1:
            # non-2D ExtensionArray
            return self.values

        # more generally, whatever we allow in NDArrayBackedExtensionBlock
        arr = cast("np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", arr)
        return arr.T

    # ----------------------------------------------------------------------
    # Rendering Methods

    def _repr_fits_vertical_(self) -> bool:
        """
        Check length against max_rows.
        """
        max_rows = get_option("display.max_rows")
        return len(self) <= max_rows

    def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool:
        """
        Check if full repr fits in horizontal boundaries imposed by the display
        options width and max_columns.

        In case of non-interactive session, no boundaries apply.

        `ignore_width` is here so ipynb+HTML output can behave the way
        users expect. display.max_columns remains in effect.
        GH3541, GH3573
        """
        width, height = console.get_console_size()
        max_columns = get_option("display.max_columns")
        nb_columns = len(self.columns)

        # exceed max columns
        if (max_columns and nb_columns > max_columns) or (
            (not ignore_width) and width and nb_columns > (width // 2)
        ):
            return False

        # used by repr_html under IPython notebook or scripts ignore terminal
        # dims
        if ignore_width or width is None or not console.in_interactive_session():
            return True

        if get_option("display.width") is not None or console.in_ipython_frontend():
            # check at least the column row for excessive width
            max_rows = 1
        else:
            max_rows = get_option("display.max_rows")

        # when auto-detecting, so width=None and not in ipython front end
        # check whether repr fits horizontal by actually checking
        # the width of the rendered repr
        buf = StringIO()

        # only care about the stuff we'll actually print out
        # and to_string on entire frame may be expensive
        d = self

        if max_rows is not None:
            # min of two, where one may be None
            d = d.iloc[: min(max_rows, len(d))]
        else:
            # max_rows of None means unlimited rows; rendering the entire
            # frame just to measure its width could be expensive, so
            # assume the repr fits
            return True

        d.to_string(buf=buf)
        value = buf.getvalue()
        repr_width = max(len(line) for line in value.split("\n"))

        return repr_width < width

    def _info_repr(self) -> bool:
        """
        True if the repr should show the info view.
        """
        info_repr_option = get_option("display.large_repr") == "info"
        return info_repr_option and not (
            self._repr_fits_horizontal_() and self._repr_fits_vertical_()
        )

    def __repr__(self) -> str:
        """
        Return a string representation for a particular DataFrame.
        """
        if self._info_repr():
            buf = StringIO()
            self.info(buf=buf)
            return buf.getvalue()

        repr_params = fmt.get_dataframe_repr_params()
        return self.to_string(**repr_params)

    def _repr_html_(self) -> str | None:
        """
        Return a html representation for a particular DataFrame.

        Mainly for IPython notebook.
        """
        if self._info_repr():
            buf = StringIO()
            self.info(buf=buf)
            # need to escape the <class>, should be the first line.
            val = buf.getvalue().replace("<", r"&lt;", 1)
            val = val.replace(">", r"&gt;", 1)
            return f"<pre>{val}</pre>"

        if get_option("display.notebook_repr_html"):
            max_rows = get_option("display.max_rows")
            min_rows = get_option("display.min_rows")
            max_cols = get_option("display.max_columns")
            show_dimensions = get_option("display.show_dimensions")

            formatter = fmt.DataFrameFormatter(
                self,
                columns=None,
                col_space=None,
                na_rep="NaN",
                formatters=None,
                float_format=None,
                sparsify=None,
                justify=None,
                index_names=True,
                header=True,
                index=True,
                bold_rows=True,
                escape=True,
                max_rows=max_rows,
                min_rows=min_rows,
                max_cols=max_cols,
                show_dimensions=show_dimensions,
                decimal=".",
            )
            return fmt.DataFrameRenderer(formatter).to_html(notebook=True)
        else:
            return None

    @overload
    def to_string(
        self,
        buf: None = ...,
        columns: Sequence[str] | None = ...,
        col_space: int | list[int] | dict[Hashable, int] | None = ...,
        header: bool | Sequence[str] = ...,
        index: bool = ...,
        na_rep: str = ...,
        formatters: fmt.FormattersType | None = ...,
        float_format: fmt.FloatFormatType | None = ...,
        sparsify: bool | None = ...,
        index_names: bool = ...,
        justify: str | None = ...,
        max_rows: int | None = ...,
        max_cols: int | None = ...,
        show_dimensions: bool = ...,
        decimal: str = ...,
        line_width: int | None = ...,
        min_rows: int | None = ...,
        max_colwidth: int | None = ...,
        encoding: str | None = ...,
    ) -> str:
        ...

    @overload
    def to_string(
        self,
        buf: FilePath | WriteBuffer[str],
        columns: Sequence[str] | None = ...,
        col_space: int | list[int] | dict[Hashable, int] | None = ...,
        header: bool | Sequence[str] = ...,
        index: bool = ...,
        na_rep: str = ...,
        formatters: fmt.FormattersType | None = ...,
        float_format: fmt.FloatFormatType | None = ...,
        sparsify: bool | None = ...,
        index_names: bool = ...,
        justify: str | None = ...,
        max_rows: int | None = ...,
        max_cols: int | None = ...,
        show_dimensions: bool = ...,
        decimal: str = ...,
        line_width: int | None = ...,
        min_rows: int | None = ...,
        max_colwidth: int | None = ...,
        encoding: str | None = ...,
    ) -> None:
        ...

    @Substitution(
        header_type="bool or sequence of str",
        header="Write out the column names. If a list of strings "
        "is given, it is assumed to be aliases for the "
        "column names",
        col_space_type="int, list or dict of int",
        col_space="The minimum width of each column. If a list of ints is given "
        "each integer corresponds to one column. If a dict is given, the key "
        "references the column, while the value defines the space to use.",
    )
    @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
    def to_string(
        self,
        buf: FilePath | WriteBuffer[str] | None = None,
        columns: Sequence[str] | None = None,
        col_space: int | list[int] | dict[Hashable, int] | None = None,
        header: bool | Sequence[str] = True,
        index: bool = True,
        na_rep: str = "NaN",
        formatters: fmt.FormattersType | None = None,
        float_format: fmt.FloatFormatType | None = None,
        sparsify: bool | None = None,
        index_names: bool = True,
        justify: str | None = None,
        max_rows: int | None = None,
        max_cols: int | None = None,
        show_dimensions: bool = False,
        decimal: str = ".",
        line_width: int | None = None,
        min_rows: int | None = None,
        max_colwidth: int | None = None,
        encoding: str | None = None,
    ) -> str | None:
        """
        Render a DataFrame to a console-friendly tabular output.
        %(shared_params)s
        line_width : int, optional
            Width to wrap a line in characters.
        min_rows : int, optional
            The number of rows to display in the console in a truncated repr
            (when number of rows is above `max_rows`).
        max_colwidth : int, optional
            Max width to truncate each column in characters. By default, no limit.
        encoding : str, default "utf-8"
            Set character encoding.
        %(returns)s
        See Also
        --------
        to_html : Convert DataFrame to HTML.

        Examples
        --------
        >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
        >>> df = pd.DataFrame(d)
        >>> print(df.to_string())
           col1  col2
        0     1     4
        1     2     5
        2     3     6
        """
        from pandas import option_context

        with option_context("display.max_colwidth", max_colwidth):
            formatter = fmt.DataFrameFormatter(
                self,
                columns=columns,
                col_space=col_space,
                na_rep=na_rep,
                formatters=formatters,
                float_format=float_format,
                sparsify=sparsify,
                justify=justify,
                index_names=index_names,
                header=header,
                index=index,
                min_rows=min_rows,
                max_rows=max_rows,
                max_cols=max_cols,
                show_dimensions=show_dimensions,
                decimal=decimal,
            )
            return fmt.DataFrameRenderer(formatter).to_string(
                buf=buf,
                encoding=encoding,
                line_width=line_width,
            )

    # ----------------------------------------------------------------------

    @property
    def style(self) -> Styler:
        """
        Returns a Styler object.

        Contains methods for building a styled HTML representation of the DataFrame.

        See Also
        --------
        io.formats.style.Styler : Helps style a DataFrame or Series according to the
            data with HTML and CSS.
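
        Examples
        --------
        A minimal sketch; the Styler renders as HTML, so its console repr is
        skipped here:

        >>> df = pd.DataFrame({'A': [1, 2, 3]})
        >>> df.style  # doctest: +SKIP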
        """
        from pandas.io.formats.style import Styler

        return Styler(self)

    _shared_docs[
        "items"
    ] = r"""
    Iterate over (column name, Series) pairs.

    Iterates over the DataFrame columns, returning a tuple with
    the column name and the content as a Series.

    Yields
    ------
    label : object
        The column names for the DataFrame being iterated over.
    content : Series
        The column entries belonging to each label, as a Series.

    See Also
    --------
    DataFrame.iterrows : Iterate over DataFrame rows as
        (index, Series) pairs.
    DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
        of the values.

    Examples
    --------
    >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
    ...                    'population': [1864, 22000, 80000]},
    ...                   index=['panda', 'polar', 'koala'])
    >>> df
            species   population
    panda      bear         1864
    polar      bear        22000
    koala  marsupial        80000
    >>> for label, content in df.items():
    ...     print(f'label: {label}')
    ...     print(f'content: {content}', sep='\n')
    ...
    label: species
    content:
    panda         bear
    polar         bear
    koala    marsupial
    Name: species, dtype: object
    label: population
    content:
    panda     1864
    polar    22000
    koala    80000
    Name: population, dtype: int64
    """

    @Appender(_shared_docs["items"])
    def items(self) -> Iterable[tuple[Hashable, Series]]:
        if self.columns.is_unique and hasattr(self, "_item_cache"):
            for k in self.columns:
                yield k, self._get_item_cache(k)
        else:
            for i, k in enumerate(self.columns):
                yield k, self._ixs(i, axis=1)

    def iterrows(self) -> Iterable[tuple[Hashable, Series]]:
        """
        Iterate over DataFrame rows as (index, Series) pairs.

        Yields
        ------
        index : label or tuple of label
            The index of the row. A tuple for a `MultiIndex`.
        data : Series
            The data of the row as a Series.

        See Also
        --------
        DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values.
        DataFrame.items : Iterate over (column name, Series) pairs.

        Notes
        -----
        1. Because ``iterrows`` returns a Series for each row,
           it does **not** preserve dtypes across the rows (dtypes are
           preserved across columns for DataFrames). For example,

           >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
           >>> row = next(df.iterrows())[1]
           >>> row
           int      1.0
           float    1.5
           Name: 0, dtype: float64
           >>> print(row['int'].dtype)
           float64
           >>> print(df['int'].dtype)
           int64

           To preserve dtypes while iterating over the rows, it is better
           to use :meth:`itertuples` which returns namedtuples of the values
           and which is generally faster than ``iterrows``.

        2. You should **never modify** something you are iterating over.
           This is not guaranteed to work in all cases. Depending on the
           data types, the iterator returns a copy and not a view, and writing
           to it will have no effect.
        """
        columns = self.columns
        klass = self._constructor_sliced
        using_cow = using_copy_on_write()
        for k, v in zip(self.index, self.values):
            s = klass(v, index=columns, name=k).__finalize__(self)
            if using_cow and self._mgr.is_single_block:
                s._mgr.add_references(self._mgr)  # type: ignore[arg-type]
            yield k, s

    def itertuples(
        self, index: bool = True, name: str | None = "Pandas"
    ) -> Iterable[tuple[Any, ...]]:
        """
        Iterate over DataFrame rows as namedtuples.

        Parameters
        ----------
        index : bool, default True
            If True, return the index as the first element of the tuple.
        name : str or None, default "Pandas"
            The name of the returned namedtuples or None to return regular
            tuples.

        Returns
        -------
        iterator
            An object to iterate over namedtuples for each row in the
            DataFrame with the first field possibly being the index and
            following fields being the column values.

        See Also
        --------
        DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
            pairs.
        DataFrame.items : Iterate over (column name, Series) pairs.

        Notes
        -----
        The column names will be renamed to positional names if they are
        invalid Python identifiers, repeated, or start with an underscore
        (see the last example below).

        Examples
        --------
        >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
        ...                   index=['dog', 'hawk'])
        >>> df
              num_legs  num_wings
        dog          4          0
        hawk         2          2
        >>> for row in df.itertuples():
        ...     print(row)
        ...
        Pandas(Index='dog', num_legs=4, num_wings=0)
        Pandas(Index='hawk', num_legs=2, num_wings=2)

        By setting the `index` parameter to False we can remove the index
        as the first element of the tuple:

        >>> for row in df.itertuples(index=False):
        ...     print(row)
        ...
        Pandas(num_legs=4, num_wings=0)
        Pandas(num_legs=2, num_wings=2)

        With the `name` parameter set we set a custom name for the yielded
        namedtuples:

        >>> for row in df.itertuples(name='Animal'):
        ...     print(row)
        ...
        Animal(Index='dog', num_legs=4, num_wings=0)
        Animal(Index='hawk', num_legs=2, num_wings=2)
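
        Column names that are not valid Python identifiers are renamed
        positionally; a minimal sketch (the renaming comes from
        ``collections.namedtuple(..., rename=True)``):

        >>> df2 = pd.DataFrame({'1st': [1], 'class': [2]})
        >>> next(df2.itertuples(index=False))
        Pandas(_0=1, _1=2)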
        """
        arrays = []
        fields = list(self.columns)
        if index:
            arrays.append(self.index)
            fields.insert(0, "Index")

        # use integer indexing because of possible duplicate column names
        arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))

        if name is not None:
            # https://github.com/python/mypy/issues/9046
            # error: namedtuple() expects a string literal as the first argument
            itertuple = collections.namedtuple(  # type: ignore[misc]
                name, fields, rename=True
            )
            return map(itertuple._make, zip(*arrays))

        # fallback to regular tuples
        return zip(*arrays)

    def __len__(self) -> int:
        """
        Returns length of info axis, but here we use the index.
        """
        return len(self.index)

    @overload
    def dot(self, other: Series) -> Series:
        ...

    @overload
    def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame:
        ...

    def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
        """
        Compute the matrix multiplication between the DataFrame and other.

        This method computes the matrix product between the DataFrame and the
        values of another Series, DataFrame or a numpy array.

        It can also be called using ``self @ other``.

        Parameters
        ----------
        other : Series, DataFrame or array-like
            The other object to compute the matrix product with.

        Returns
        -------
        Series or DataFrame
            If other is a Series, return the matrix product between self and
            other as a Series. If other is a DataFrame or a numpy.array, return
            the matrix product of self and other as a DataFrame.

        See Also
        --------
        Series.dot: Similar method for Series.

        Notes
        -----
        The dimensions of DataFrame and other must be compatible in order to
        compute the matrix multiplication. In addition, the column names of
        DataFrame and the index of other must contain the same values, as they
        will be aligned prior to the multiplication.

        The dot method for Series computes the inner product, instead of the
        matrix product here.

        Examples
        --------
        Here we multiply a DataFrame with a Series.

        >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
        >>> s = pd.Series([1, 1, 2, 1])
        >>> df.dot(s)
        0    -4
        1     5
        dtype: int64

        Here we multiply a DataFrame with another DataFrame.

        >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
        >>> df.dot(other)
           0  1
        0  1  4
        1  2  2

        Note that the dot method gives the same result as using the ``@``
        operator.

        >>> df @ other
           0  1
        0  1  4
        1  2  2

        The dot method also works if other is a np.array.

        >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
        >>> df.dot(arr)
           0  1
        0  1  4
        1  2  2

        Note how shuffling of the objects does not change the result.

        >>> s2 = s.reindex([1, 0, 2, 3])
        >>> df.dot(s2)
        0    -4
        1     5
        dtype: int64
        """
        if isinstance(other, (Series, DataFrame)):
            common = self.columns.union(other.index)
            if len(common) > len(self.columns) or len(common) > len(other.index):
                raise ValueError("matrices are not aligned")

            left = self.reindex(columns=common, copy=False)
            right = other.reindex(index=common, copy=False)
            lvals = left.values
            rvals = right._values
        else:
            left = self
            lvals = self.values
            rvals = np.asarray(other)
            if lvals.shape[1] != rvals.shape[0]:
                raise ValueError(
                    f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}"
                )

        if isinstance(other, DataFrame):
            return self._constructor(
                np.dot(lvals, rvals),
                index=left.index,
                columns=other.columns,
                copy=False,
            )
        elif isinstance(other, Series):
            return self._constructor_sliced(
                np.dot(lvals, rvals), index=left.index, copy=False
            )
        elif isinstance(rvals, (np.ndarray, Index)):
            result = np.dot(lvals, rvals)
            if result.ndim == 2:
                return self._constructor(result, index=left.index, copy=False)
            else:
                return self._constructor_sliced(result, index=left.index, copy=False)
        else:  # pragma: no cover
            raise TypeError(f"unsupported type: {type(other)}")

    @overload
    def __matmul__(self, other: Series) -> Series:
        ...

    @overload
    def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
        ...

    def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
        """
        Matrix multiplication using the binary ``@`` operator.
        """
        return self.dot(other)

    def __rmatmul__(self, other) -> DataFrame:
        """
        Matrix multiplication using the binary ``@`` operator.
        """
        try:
            return self.T.dot(np.transpose(other)).T
        except ValueError as err:
            if "shape mismatch" not in str(err):
                raise
            # GH#21581 give exception message for original shapes
            msg = f"shapes {np.shape(other)} and {self.shape} not aligned"
            raise ValueError(msg) from err

    # ----------------------------------------------------------------------
    # IO methods (to / from other formats)

    @classmethod
    def from_dict(
        cls,
        data: dict,
        orient: str = "columns",
        dtype: Dtype | None = None,
        columns: Axes | None = None,
    ) -> DataFrame:
        """
        Construct DataFrame from dict of array-like or dicts.

        Creates DataFrame object from dictionary by columns or by index
        allowing dtype specification.

        Parameters
        ----------
        data : dict
            Of the form {field : array-like} or {field : dict}.
        orient : {'columns', 'index', 'tight'}, default 'columns'
            The "orientation" of the data. If the keys of the passed dict
            should be the columns of the resulting DataFrame, pass 'columns'
            (default). Otherwise if the keys should be rows, pass 'index'.
            If 'tight', assume a dict with keys ['index', 'columns', 'data',
            'index_names', 'column_names'].

            .. versionadded:: 1.4.0
               'tight' as an allowed value for the ``orient`` argument

        dtype : dtype, default None
            Data type to force after DataFrame construction, otherwise infer.
        columns : list, default None
            Column labels to use when ``orient='index'``. Raises a ValueError
            if used with ``orient='columns'`` or ``orient='tight'``.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.from_records : DataFrame from structured ndarray, sequence
            of tuples or dicts, or DataFrame.
        DataFrame : DataFrame object creation using constructor.
        DataFrame.to_dict : Convert the DataFrame to a dictionary.

        Examples
        --------
        By default the keys of the dict become the DataFrame columns:

        >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
        >>> pd.DataFrame.from_dict(data)
           col_1 col_2
        0      3     a
        1      2     b
        2      1     c
        3      0     d

        Specify ``orient='index'`` to create the DataFrame using dictionary
        keys as rows:

        >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
        >>> pd.DataFrame.from_dict(data, orient='index')
               0  1  2  3
        row_1  3  2  1  0
        row_2  a  b  c  d

        When using the 'index' orientation, the column names can be
        specified manually:

        >>> pd.DataFrame.from_dict(data, orient='index',
        ...                        columns=['A', 'B', 'C', 'D'])
               A  B  C  D
        row_1  3  2  1  0
        row_2  a  b  c  d

        Specify ``orient='tight'`` to create the DataFrame using a 'tight'
        format:

        >>> data = {'index': [('a', 'b'), ('a', 'c')],
        ...         'columns': [('x', 1), ('y', 2)],
        ...         'data': [[1, 3], [2, 4]],
        ...         'index_names': ['n1', 'n2'],
        ...         'column_names': ['z1', 'z2']}
        >>> pd.DataFrame.from_dict(data, orient='tight')
        z1     x  y
        z2     1  2
        n1 n2
        a  b   1  3
           c   2  4
        """
        index = None
        orient = orient.lower()
        if orient == "index":
            if len(data) > 0:
                # TODO speed up Series case
                if isinstance(list(data.values())[0], (Series, dict)):
                    data = _from_nested_dict(data)
                else:
                    index = list(data.keys())
                    # error: Incompatible types in assignment (expression has type
                    # "List[Any]", variable has type "Dict[Any, Any]")
                    data = list(data.values())  # type: ignore[assignment]
        elif orient in ("columns", "tight"):
            if columns is not None:
                raise ValueError(f"cannot use columns parameter with orient='{orient}'")
        else:  # pragma: no cover
            raise ValueError(
                f"Expected 'index', 'columns' or 'tight' for orient parameter. "
                f"Got '{orient}' instead"
            )

        if orient != "tight":
            return cls(data, index=index, columns=columns, dtype=dtype)
        else:
            realdata = data["data"]

            def create_index(indexlist, namelist):
                index: Index
                if len(namelist) > 1:
                    index = MultiIndex.from_tuples(indexlist, names=namelist)
                else:
                    index = Index(indexlist, name=namelist[0])
                return index

            index = create_index(data["index"], data["index_names"])
            columns = create_index(data["columns"], data["column_names"])
            return cls(realdata, index=index, columns=columns, dtype=dtype)

    def to_numpy(
        self,
        dtype: npt.DTypeLike | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        """
        Convert the DataFrame to a NumPy array.

        By default, the dtype of the returned array will be the common NumPy
        dtype of all types in the DataFrame. For example, if the dtypes are
        ``float16`` and ``float32``, the resulting dtype will be ``float32``.
        This may require copying data and coercing values, which may be
        expensive.

        Parameters
        ----------
        dtype : str or numpy.dtype, optional
            The dtype to pass to :meth:`numpy.asarray`.
        copy : bool, default False
            Whether to ensure that the returned value is not a view on
            another array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
            a copy is made, even if not strictly necessary.
        na_value : Any, optional
            The value to use for missing values. The default value depends
            on `dtype` and the dtypes of the DataFrame columns.

            .. versionadded:: 1.1.0

        Returns
        -------
        numpy.ndarray

        See Also
        --------
        Series.to_numpy : Similar method for Series.

        Examples
        --------
        >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
        array([[1, 3],
               [2, 4]])

        With heterogeneous data, the lowest common type will have to
        be used.

        >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
        >>> df.to_numpy()
        array([[1. , 3. ],
               [2. , 4.5]])

        For a mix of numeric and non-numeric types, the output array will
        have object dtype.

        >>> df['C'] = pd.date_range('2000', periods=2)
        >>> df.to_numpy()
        array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
               [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
        """
        if dtype is not None:
            dtype = np.dtype(dtype)
        result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value)
        if result.dtype is not dtype:
            result = np.array(result, dtype=dtype, copy=False)

        return result

    def _create_data_for_split_and_tight_to_dict(
        self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int]
    ) -> list:
        """
        Simple helper method to create data for ``to_dict(orient="split")`` and
        ``to_dict(orient="tight")`` to create the main output data.
        """
        if are_all_object_dtype_cols:
            data = [
                list(map(maybe_box_native, t))
                for t in self.itertuples(index=False, name=None)
            ]
        else:
            data = [list(t) for t in self.itertuples(index=False, name=None)]
            if object_dtype_indices:
                # If we have object_dtype_cols, apply maybe_box_native after list
                # comprehension for perf
                for row in data:
                    for i in object_dtype_indices:
                        row[i] = maybe_box_native(row[i])
        return data

1866 @overload
1867 def to_dict(
1868 self,
1869 orient: Literal["dict", "list", "series", "split", "tight", "index"] = ...,
1870 into: type[dict] = ...,
1871 ) -> dict:
1872 ...
1873
1874 @overload
1875 def to_dict(self, orient: Literal["records"], into: type[dict] = ...) -> list[dict]:
1876 ...
1877
1878 def to_dict(
1879 self,
1880 orient: Literal[
1881 "dict", "list", "series", "split", "tight", "records", "index"
1882 ] = "dict",
1883 into: type[dict] = dict,
1884 index: bool = True,
1885 ) -> dict | list[dict]:
1886 """
1887 Convert the DataFrame to a dictionary.
1888
1889 The type of the key-value pairs can be customized with the parameters
1890 (see below).
1891
1892 Parameters
1893 ----------
1894 orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'}
1895 Determines the type of the values of the dictionary.
1896
1897 - 'dict' (default) : dict like {column -> {index -> value}}
1898 - 'list' : dict like {column -> [values]}
1899 - 'series' : dict like {column -> Series(values)}
1900 - 'split' : dict like
1901 {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
1902 - 'tight' : dict like
1903 {'index' -> [index], 'columns' -> [columns], 'data' -> [values],
1904 'index_names' -> [index.names], 'column_names' -> [column.names]}
1905 - 'records' : list like
1906 [{column -> value}, ... , {column -> value}]
1907 - 'index' : dict like {index -> {column -> value}}
1908
1909 .. versionadded:: 1.4.0
1910 'tight' as an allowed value for the ``orient`` argument
1911
1912 into : class, default dict
1913 The collections.abc.Mapping subclass used for all Mappings
1914 in the return value. Can be the actual class or an empty
1915 instance of the mapping type you want. If you want a
1916 collections.defaultdict, you must pass it initialized.
1917
1918 index : bool, default True
1919 Whether to include the index item (and index_names item if `orient`
1920 is 'tight') in the returned dictionary. Can only be ``False``
1921 when `orient` is 'split' or 'tight'.
1922
1923 .. versionadded:: 2.0.0
1924
1925 Returns
1926 -------
1927 dict, list or collections.abc.Mapping
1928 Return a collections.abc.Mapping object representing the DataFrame.
1929 The resulting transformation depends on the `orient` parameter.
1930
1931 See Also
1932 --------
1933 DataFrame.from_dict: Create a DataFrame from a dictionary.
1934 DataFrame.to_json: Convert a DataFrame to JSON format.
1935
1936 Examples
1937 --------
1938 >>> df = pd.DataFrame({'col1': [1, 2],
1939 ... 'col2': [0.5, 0.75]},
1940 ... index=['row1', 'row2'])
1941 >>> df
1942 col1 col2
1943 row1 1 0.50
1944 row2 2 0.75
1945 >>> df.to_dict()
1946 {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
1947
1948 You can specify the return orientation.
1949
1950 >>> df.to_dict('series')
1951 {'col1': row1 1
1952 row2 2
1953 Name: col1, dtype: int64,
1954 'col2': row1 0.50
1955 row2 0.75
1956 Name: col2, dtype: float64}
1957
1958 >>> df.to_dict('split')
1959 {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
1960 'data': [[1, 0.5], [2, 0.75]]}
1961
1962 >>> df.to_dict('records')
1963 [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
1964
1965 >>> df.to_dict('index')
1966 {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
1967
1968 >>> df.to_dict('tight')
1969 {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
1970 'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]}
1971
1972 You can also specify the mapping type.
1973
1974 >>> from collections import OrderedDict, defaultdict
1975 >>> df.to_dict(into=OrderedDict)
1976 OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
1977 ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
1978
1979 If you want a `defaultdict`, you need to initialize it:
1980
1981 >>> dd = defaultdict(list)
1982 >>> df.to_dict('records', into=dd)
1983 [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
1984 defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
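
        When ``orient`` is 'split' or 'tight', the index can be dropped from
        the result:

        >>> df.to_dict('split', index=False)
        {'columns': ['col1', 'col2'], 'data': [[1, 0.5], [2, 0.75]]}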
1985 """
1986 from pandas.core.methods.to_dict import to_dict
1987
1988 return to_dict(self, orient, into, index)
1989
1990 def to_gbq(
1991 self,
1992 destination_table: str,
1993 project_id: str | None = None,
1994 chunksize: int | None = None,
1995 reauth: bool = False,
1996 if_exists: str = "fail",
1997 auth_local_webserver: bool = True,
1998 table_schema: list[dict[str, str]] | None = None,
1999 location: str | None = None,
2000 progress_bar: bool = True,
2001 credentials=None,
2002 ) -> None:
2003 """
2004 Write a DataFrame to a Google BigQuery table.
2005
2006 This function requires the `pandas-gbq package
2007 <https://pandas-gbq.readthedocs.io>`__.
2008
2009 See the `How to authenticate with Google BigQuery
2010 <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
2011 guide for authentication instructions.
2012
2013 Parameters
2014 ----------
2015 destination_table : str
2016 Name of table to be written, in the form ``dataset.tablename``.
2017 project_id : str, optional
2018 Google BigQuery Account project ID. Optional when available from
2019 the environment.
2020 chunksize : int, optional
2021 Number of rows to be inserted in each chunk from the dataframe.
2022 Set to ``None`` to load the whole dataframe at once.
2023 reauth : bool, default False
2024 Force Google BigQuery to re-authenticate the user. This is useful
2025 if multiple accounts are used.
2026 if_exists : str, default 'fail'
2027 Behavior when the destination table exists. Value can be one of:
2028
            ``'fail'``
                If table exists, raise pandas_gbq.gbq.TableCreationError.
            ``'replace'``
                If table exists, drop it, recreate it, and insert data.
            ``'append'``
                If table exists, insert data. Create it if it does not exist.
2035 auth_local_webserver : bool, default True
2036 Use the `local webserver flow`_ instead of the `console flow`_
2037 when getting user credentials.
2038
2039 .. _local webserver flow:
2040 https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
2041 .. _console flow:
2042 https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
2043
2044 *New in version 0.2.0 of pandas-gbq*.
2045
2046 .. versionchanged:: 1.5.0
2047 Default value is changed to ``True``. Google has deprecated the
2048 ``auth_local_webserver = False`` `"out of band" (copy-paste)
2049 flow
2050 <https://developers.googleblog.com/2022/02/making-oauth-flows-safer.html?m=1#disallowed-oob>`_.
2051 table_schema : list of dicts, optional
            List of BigQuery table fields to which the DataFrame columns
            conform, e.g. ``[{'name': 'col1', 'type':
2054 'STRING'},...]``. If schema is not provided, it will be
2055 generated according to dtypes of DataFrame columns. See
2056 BigQuery API documentation on available names of a field.
2057
2058 *New in version 0.3.1 of pandas-gbq*.
2059 location : str, optional
2060 Location where the load job should run. See the `BigQuery locations
2061 documentation
2062 <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
2063 list of available locations. The location must match that of the
2064 target dataset.
2065
2066 *New in version 0.5.0 of pandas-gbq*.
2067 progress_bar : bool, default True
2068 Use the library `tqdm` to show the progress bar for the upload,
2069 chunk by chunk.
2070
2071 *New in version 0.5.0 of pandas-gbq*.
2072 credentials : google.auth.credentials.Credentials, optional
2073 Credentials for accessing Google APIs. Use this parameter to
2074 override default credentials, such as to use Compute Engine
2075 :class:`google.auth.compute_engine.Credentials` or Service
2076 Account :class:`google.oauth2.service_account.Credentials`
2077 directly.
2078
2079 *New in version 0.8.0 of pandas-gbq*.
2080
2081 See Also
2082 --------
2083 pandas_gbq.to_gbq : This function in the pandas-gbq library.
2084 read_gbq : Read a DataFrame from Google BigQuery.
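
        Examples
        --------
        Assuming the ``pandas-gbq`` package is installed and credentials are
        configured, a minimal upload might look like this (the table and
        project names are placeholders):

        >>> df = pd.DataFrame({'my_string': ['a', 'b', 'c']})
        >>> df.to_gbq('my_dataset.my_table',
        ...           project_id='my-project')  # doctest: +SKIP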
2085 """
2086 from pandas.io import gbq
2087
2088 gbq.to_gbq(
2089 self,
2090 destination_table,
2091 project_id=project_id,
2092 chunksize=chunksize,
2093 reauth=reauth,
2094 if_exists=if_exists,
2095 auth_local_webserver=auth_local_webserver,
2096 table_schema=table_schema,
2097 location=location,
2098 progress_bar=progress_bar,
2099 credentials=credentials,
2100 )
2101
2102 @classmethod
2103 def from_records(
2104 cls,
2105 data,
2106 index=None,
2107 exclude=None,
2108 columns=None,
2109 coerce_float: bool = False,
2110 nrows: int | None = None,
2111 ) -> DataFrame:
2112 """
2113 Convert structured or record ndarray to DataFrame.
2114
2115 Creates a DataFrame object from a structured ndarray, sequence of
2116 tuples or dicts, or DataFrame.
2117
2118 Parameters
2119 ----------
2120 data : structured ndarray, sequence of tuples or dicts, or DataFrame
2121 Structured input data.
2122 index : str, list of fields, array-like
2123 Field of array to use as the index, alternately a specific set of
2124 input labels to use.
2125 exclude : sequence, default None
2126 Columns or fields to exclude.
2127 columns : sequence, default None
2128 Column names to use. If the passed data do not have names
2129 associated with them, this argument provides names for the
2130 columns. Otherwise this argument indicates the order of the columns
2131 in the result (any names not found in the data will become all-NA
2132 columns).
2133 coerce_float : bool, default False
2134 Attempt to convert values of non-string, non-numeric objects (like
2135 decimal.Decimal) to floating point, useful for SQL result sets.
2136 nrows : int, default None
2137 Number of rows to read if data is an iterator.
2138
2139 Returns
2140 -------
2141 DataFrame
2142
2143 See Also
2144 --------
2145 DataFrame.from_dict : DataFrame from dict of array-like or dicts.
2146 DataFrame : DataFrame object creation using constructor.
2147
2148 Examples
2149 --------
2150 Data can be provided as a structured ndarray:
2151
2152 >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')],
2153 ... dtype=[('col_1', 'i4'), ('col_2', 'U1')])
2154 >>> pd.DataFrame.from_records(data)
2155 col_1 col_2
2156 0 3 a
2157 1 2 b
2158 2 1 c
2159 3 0 d
2160
2161 Data can be provided as a list of dicts:
2162
2163 >>> data = [{'col_1': 3, 'col_2': 'a'},
2164 ... {'col_1': 2, 'col_2': 'b'},
2165 ... {'col_1': 1, 'col_2': 'c'},
2166 ... {'col_1': 0, 'col_2': 'd'}]
2167 >>> pd.DataFrame.from_records(data)
2168 col_1 col_2
2169 0 3 a
2170 1 2 b
2171 2 1 c
2172 3 0 d
2173
2174 Data can be provided as a list of tuples with corresponding columns:
2175
2176 >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')]
2177 >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2'])
2178 col_1 col_2
2179 0 3 a
2180 1 2 b
2181 2 1 c
2182 3 0 d
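
        Data can also be streamed from an iterator; ``nrows`` bounds how many
        rows are read (a small generator is used here for illustration):

        >>> gen = ((i, chr(ord('a') + i)) for i in range(4))
        >>> pd.DataFrame.from_records(gen, columns=['col_1', 'col_2'], nrows=2)
           col_1 col_2
        0      0     a
        1      1     b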
2183 """
2184 if isinstance(data, DataFrame):
2185 if columns is not None:
2186 if is_scalar(columns):
2187 columns = [columns]
2188 data = data[columns]
2189 if index is not None:
2190 data = data.set_index(index)
2191 if exclude is not None:
2192 data = data.drop(columns=exclude)
2193 return data.copy(deep=False)
2194
2195 result_index = None
2196
2197 # Make a copy of the input columns so we can modify it
2198 if columns is not None:
2199 columns = ensure_index(columns)
2200
2201 def maybe_reorder(
2202 arrays: list[ArrayLike], arr_columns: Index, columns: Index, index
2203 ) -> tuple[list[ArrayLike], Index, Index | None]:
2204 """
2205 If our desired 'columns' do not match the data's pre-existing 'arr_columns',
2206 we re-order our arrays. This is like a pre-emptive (cheap) reindex.
2207 """
2208 if len(arrays):
2209 length = len(arrays[0])
2210 else:
2211 length = 0
2212
2213 result_index = None
2214 if len(arrays) == 0 and index is None and length == 0:
2215 result_index = default_index(0)
2216
2217 arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length)
2218 return arrays, arr_columns, result_index
2219
2220 if is_iterator(data):
2221 if nrows == 0:
2222 return cls()
2223
2224 try:
2225 first_row = next(data)
2226 except StopIteration:
2227 return cls(index=index, columns=columns)
2228
2229 dtype = None
2230 if hasattr(first_row, "dtype") and first_row.dtype.names:
2231 dtype = first_row.dtype
2232
2233 values = [first_row]
2234
2235 if nrows is None:
2236 values += data
2237 else:
2238 values.extend(itertools.islice(data, nrows - 1))
2239
2240 if dtype is not None:
2241 data = np.array(values, dtype=dtype)
2242 else:
2243 data = values
2244
2245 if isinstance(data, dict):
2246 if columns is None:
2247 columns = arr_columns = ensure_index(sorted(data))
2248 arrays = [data[k] for k in columns]
2249 else:
2250 arrays = []
2251 arr_columns_list = []
2252 for k, v in data.items():
2253 if k in columns:
2254 arr_columns_list.append(k)
2255 arrays.append(v)
2256
2257 arr_columns = Index(arr_columns_list)
2258 arrays, arr_columns, result_index = maybe_reorder(
2259 arrays, arr_columns, columns, index
2260 )
2261
2262 elif isinstance(data, (np.ndarray, DataFrame)):
2263 arrays, columns = to_arrays(data, columns)
2264 arr_columns = columns
2265 else:
2266 arrays, arr_columns = to_arrays(data, columns)
2267 if coerce_float:
2268 for i, arr in enumerate(arrays):
2269 if arr.dtype == object:
2270 # error: Argument 1 to "maybe_convert_objects" has
2271 # incompatible type "Union[ExtensionArray, ndarray]";
2272 # expected "ndarray"
2273 arrays[i] = lib.maybe_convert_objects(
2274 arr, # type: ignore[arg-type]
2275 try_float=True,
2276 )
2277
2278 arr_columns = ensure_index(arr_columns)
2279 if columns is None:
2280 columns = arr_columns
2281 else:
2282 arrays, arr_columns, result_index = maybe_reorder(
2283 arrays, arr_columns, columns, index
2284 )
2285
2286 if exclude is None:
2287 exclude = set()
2288 else:
2289 exclude = set(exclude)
2290
2291 if index is not None:
2292 if isinstance(index, str) or not hasattr(index, "__iter__"):
2293 i = columns.get_loc(index)
2294 exclude.add(index)
2295 if len(arrays) > 0:
2296 result_index = Index(arrays[i], name=index)
2297 else:
2298 result_index = Index([], name=index)
2299 else:
2300 try:
2301 index_data = [arrays[arr_columns.get_loc(field)] for field in index]
2302 except (KeyError, TypeError):
2303 # raised by get_loc, see GH#29258
2304 result_index = index
2305 else:
2306 result_index = ensure_index_from_sequences(index_data, names=index)
2307 exclude.update(index)
2308
2309 if any(exclude):
2310 arr_exclude = [x for x in exclude if x in arr_columns]
2311 to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
2312 arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
2313
2314 columns = columns.drop(exclude)
2315
2316 manager = get_option("mode.data_manager")
2317 mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager)
2318
2319 return cls(mgr)
2320
2321 def to_records(
2322 self, index: bool = True, column_dtypes=None, index_dtypes=None
2323 ) -> np.recarray:
2324 """
2325 Convert DataFrame to a NumPy record array.
2326
2327 Index will be included as the first field of the record array if
2328 requested.
2329
2330 Parameters
2331 ----------
2332 index : bool, default True
2333 Include index in resulting record array, stored in 'index'
2334 field or using the index label, if set.
2335 column_dtypes : str, type, dict, default None
2336 If a string or type, the data type to store all columns. If
2337 a dictionary, a mapping of column names and indices (zero-indexed)
2338 to specific data types.
2339 index_dtypes : str, type, dict, default None
2340 If a string or type, the data type to store all index levels. If
2341 a dictionary, a mapping of index level names and indices
2342 (zero-indexed) to specific data types.
2343
2344 This mapping is applied only if `index=True`.
2345
2346 Returns
2347 -------
2348 numpy.recarray
2349 NumPy ndarray with the DataFrame labels as fields and each row
2350 of the DataFrame as entries.
2351
2352 See Also
2353 --------
2354 DataFrame.from_records: Convert structured or record ndarray
2355 to DataFrame.
2356 numpy.recarray: An ndarray that allows field access using
2357 attributes, analogous to typed columns in a
2358 spreadsheet.
2359
2360 Examples
2361 --------
2362 >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
2363 ... index=['a', 'b'])
2364 >>> df
2365 A B
2366 a 1 0.50
2367 b 2 0.75
2368 >>> df.to_records()
2369 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
2370 dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
2371
2372 If the DataFrame index has no label then the recarray field name
2373 is set to 'index'. If the index has a label then this is used as the
2374 field name:
2375
2376 >>> df.index = df.index.rename("I")
2377 >>> df.to_records()
2378 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
2379 dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
2380
2381 The index can be excluded from the record array:
2382
2383 >>> df.to_records(index=False)
2384 rec.array([(1, 0.5 ), (2, 0.75)],
2385 dtype=[('A', '<i8'), ('B', '<f8')])
2386
2387 Data types can be specified for the columns:
2388
2389 >>> df.to_records(column_dtypes={"A": "int32"})
2390 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
2391 dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
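
        Data types can also be specified by column position (zero-indexed):

        >>> df.to_records(column_dtypes={0: "int32"})
        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
                  dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])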
2392
2393 As well as for the index:
2394
2395 >>> df.to_records(index_dtypes="<S2")
2396 rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
2397 dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
2398
2399 >>> index_dtypes = f"<S{df.index.str.len().max()}"
2400 >>> df.to_records(index_dtypes=index_dtypes)
2401 rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
2402 dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
2403 """
2404 if index:
2405 ix_vals = [
2406 np.asarray(self.index.get_level_values(i))
2407 for i in range(self.index.nlevels)
2408 ]
2409
2410 arrays = ix_vals + [
2411 np.asarray(self.iloc[:, i]) for i in range(len(self.columns))
2412 ]
2413
2414 index_names = list(self.index.names)
2415
2416 if isinstance(self.index, MultiIndex):
2417 index_names = com.fill_missing_names(index_names)
2418 elif index_names[0] is None:
2419 index_names = ["index"]
2420
2421 names = [str(name) for name in itertools.chain(index_names, self.columns)]
2422 else:
2423 arrays = [np.asarray(self.iloc[:, i]) for i in range(len(self.columns))]
2424 names = [str(c) for c in self.columns]
2425 index_names = []
2426
2427 index_len = len(index_names)
2428 formats = []
2429
2430 for i, v in enumerate(arrays):
2431 index_int = i
2432
2433 # When the names and arrays are collected, we
2434 # first collect those in the DataFrame's index,
2435 # followed by those in its columns.
2436 #
2437 # Thus, the total length of the array is:
2438 # len(index_names) + len(DataFrame.columns).
2439 #
2440 # This check allows us to see whether we are
2441 # handling a name / array in the index or column.
2442 if index_int < index_len:
2443 dtype_mapping = index_dtypes
2444 name = index_names[index_int]
2445 else:
2446 index_int -= index_len
2447 dtype_mapping = column_dtypes
2448 name = self.columns[index_int]
2449
2450 # We have a dictionary, so we get the data type
2451 # associated with the index or column (which can
2452 # be denoted by its name in the DataFrame or its
2453 # position in DataFrame's array of indices or
            # columns, whichever is applicable).
2455 if is_dict_like(dtype_mapping):
2456 if name in dtype_mapping:
2457 dtype_mapping = dtype_mapping[name]
2458 elif index_int in dtype_mapping:
2459 dtype_mapping = dtype_mapping[index_int]
2460 else:
2461 dtype_mapping = None
2462
2463 # If no mapping can be found, use the array's
2464 # dtype attribute for formatting.
2465 #
2466 # A valid dtype must either be a type or
2467 # string naming a type.
2468 if dtype_mapping is None:
2469 formats.append(v.dtype)
2470 elif isinstance(dtype_mapping, (type, np.dtype, str)):
2471 # error: Argument 1 to "append" of "list" has incompatible
2472 # type "Union[type, dtype[Any], str]"; expected "dtype[Any]"
2473 formats.append(dtype_mapping) # type: ignore[arg-type]
2474 else:
2475 element = "row" if i < index_len else "column"
2476 msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}"
2477 raise ValueError(msg)
2478
2479 return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats})
2480
2481 @classmethod
2482 def _from_arrays(
2483 cls,
2484 arrays,
2485 columns,
2486 index,
2487 dtype: Dtype | None = None,
2488 verify_integrity: bool = True,
2489 ) -> DataFrame:
2490 """
2491 Create DataFrame from a list of arrays corresponding to the columns.
2492
2493 Parameters
2494 ----------
2495 arrays : list-like of arrays
2496 Each array in the list corresponds to one column, in order.
2497 columns : list-like, Index
2498 The column names for the resulting DataFrame.
2499 index : list-like, Index
2500 The rows labels for the resulting DataFrame.
2501 dtype : dtype, optional
2502 Optional dtype to enforce for all arrays.
2503 verify_integrity : bool, default True
            Validate and homogenize all input. If set to False, it is assumed
            that all elements of `arrays` are actual arrays (numpy ndarray or
            ExtensionArray) as they will be stored in a block, that they have
            the same length as and are aligned with the index, and that
            `columns` and `index` are already Index objects.
2509
2510 Returns
2511 -------
2512 DataFrame
2513 """
2514 if dtype is not None:
2515 dtype = pandas_dtype(dtype)
2516
2517 manager = get_option("mode.data_manager")
2518 columns = ensure_index(columns)
2519 if len(columns) != len(arrays):
2520 raise ValueError("len(columns) must match len(arrays)")
2521 mgr = arrays_to_mgr(
2522 arrays,
2523 columns,
2524 index,
2525 dtype=dtype,
2526 verify_integrity=verify_integrity,
2527 typ=manager,
2528 )
2529 return cls(mgr)
2530
2531 @doc(
2532 storage_options=_shared_docs["storage_options"],
2533 compression_options=_shared_docs["compression_options"] % "path",
2534 )
2535 def to_stata(
2536 self,
2537 path: FilePath | WriteBuffer[bytes],
2538 *,
2539 convert_dates: dict[Hashable, str] | None = None,
2540 write_index: bool = True,
2541 byteorder: str | None = None,
2542 time_stamp: datetime.datetime | None = None,
2543 data_label: str | None = None,
2544 variable_labels: dict[Hashable, str] | None = None,
2545 version: int | None = 114,
2546 convert_strl: Sequence[Hashable] | None = None,
2547 compression: CompressionOptions = "infer",
2548 storage_options: StorageOptions = None,
2549 value_labels: dict[Hashable, dict[float, str]] | None = None,
2550 ) -> None:
2551 """
2552 Export DataFrame object to Stata dta format.
2553
2554 Writes the DataFrame to a Stata dataset file.
2555 "dta" files contain a Stata dataset.
2556
2557 Parameters
2558 ----------
2559 path : str, path object, or buffer
2560 String, path object (implementing ``os.PathLike[str]``), or file-like
2561 object implementing a binary ``write()`` function.
2562
2563 convert_dates : dict
            Dictionary mapping columns containing datetime types to Stata
            internal format to use when writing the dates. Options are 'tc',
2566 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
2567 or a name. Datetime columns that do not have a conversion type
2568 specified will be converted to 'tc'. Raises NotImplementedError if
2569 a datetime column has timezone information.
2570 write_index : bool
2571 Write the index to Stata dataset.
2572 byteorder : str
            Can be ">", "<", "little", or "big". Default is `sys.byteorder`.
2574 time_stamp : datetime
2575 A datetime to use as file creation date. Default is the current
2576 time.
2577 data_label : str, optional
2578 A label for the data set. Must be 80 characters or smaller.
2579 variable_labels : dict
2580 Dictionary containing columns as keys and variable labels as
2581 values. Each label must be 80 characters or smaller.
2582 version : {{114, 117, 118, 119, None}}, default 114
2583 Version to use in the output dta file. Set to None to let pandas
2584 decide between 118 or 119 formats depending on the number of
2585 columns in the frame. Version 114 can be read by Stata 10 and
2586 later. Version 117 can be read by Stata 13 or later. Version 118
2587 is supported in Stata 14 and later. Version 119 is supported in
2588 Stata 15 and later. Version 114 limits string variables to 244
2589 characters or fewer while versions 117 and later allow strings
2590 with lengths up to 2,000,000 characters. Versions 118 and 119
2591 support Unicode characters, and version 119 supports more than
2592 32,767 variables.
2593
2594 Version 119 should usually only be used when the number of
2595 variables exceeds the capacity of dta format 118. Exporting
2596 smaller datasets in format 119 may have unintended consequences,
2597 and, as of November 2020, Stata SE cannot read version 119 files.
2598
2599 convert_strl : list, optional
            List of column names to convert to string columns in Stata's StrL
            format. Only available if version is 117. Storing strings in the
2602 StrL format can produce smaller dta files if strings have more than
2603 8 characters and values are repeated.
2604 {compression_options}
2605
2606 .. versionadded:: 1.1.0
2607
2608 .. versionchanged:: 1.4.0 Zstandard support.
2609
2610 {storage_options}
2611
2612 .. versionadded:: 1.2.0
2613
2614 value_labels : dict of dicts
2615 Dictionary containing columns as keys and dictionaries of column value
2616 to labels as values. Labels for a single variable must be 32,000
2617 characters or smaller.
2618
2619 .. versionadded:: 1.4.0
2620
2621 Raises
2622 ------
2623 NotImplementedError
2624 * If datetimes contain timezone information
2625 * Column dtype is not representable in Stata
2626 ValueError
            * Columns listed in convert_dates are neither datetime64[ns]
              nor datetime.datetime
2629 * Column listed in convert_dates is not in DataFrame
2630 * Categorical label contains more than 32,000 characters
2631
2632 See Also
2633 --------
2634 read_stata : Import Stata data files.
2635 io.stata.StataWriter : Low-level writer for Stata data files.
2636 io.stata.StataWriter117 : Low-level writer for version 117 files.
2637
2638 Examples
2639 --------
2640 >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon',
2641 ... 'parrot'],
2642 ... 'speed': [350, 18, 361, 15]}})
2643 >>> df.to_stata('animals.dta') # doctest: +SKIP
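
        A datetime column can be written using one of Stata's internal date
        formats via ``convert_dates`` ('td' stores day-resolution dates):

        >>> df['date'] = pd.date_range('2020-01-01', periods=4)
        >>> df.to_stata('animals.dta',
        ...             convert_dates={{'date': 'td'}})  # doctest: +SKIP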
2644 """
2645 if version not in (114, 117, 118, 119, None):
2646 raise ValueError("Only formats 114, 117, 118 and 119 are supported.")
2647 if version == 114:
2648 if convert_strl is not None:
2649 raise ValueError("strl is not supported in format 114")
2650 from pandas.io.stata import StataWriter as statawriter
2651 elif version == 117:
2652 # Incompatible import of "statawriter" (imported name has type
2653 # "Type[StataWriter117]", local name has type "Type[StataWriter]")
2654 from pandas.io.stata import ( # type: ignore[assignment]
2655 StataWriter117 as statawriter,
2656 )
2657 else: # versions 118 and 119
2658 # Incompatible import of "statawriter" (imported name has type
2659 # "Type[StataWriter117]", local name has type "Type[StataWriter]")
2660 from pandas.io.stata import ( # type: ignore[assignment]
2661 StataWriterUTF8 as statawriter,
2662 )
2663
2664 kwargs: dict[str, Any] = {}
2665 if version is None or version >= 117:
2666 # strl conversion is only supported >= 117
2667 kwargs["convert_strl"] = convert_strl
2668 if version is None or version >= 118:
2669 # Specifying the version is only supported for UTF8 (118 or 119)
2670 kwargs["version"] = version
2671
2672 writer = statawriter(
2673 path,
2674 self,
2675 convert_dates=convert_dates,
2676 byteorder=byteorder,
2677 time_stamp=time_stamp,
2678 data_label=data_label,
2679 write_index=write_index,
2680 variable_labels=variable_labels,
2681 compression=compression,
2682 storage_options=storage_options,
2683 value_labels=value_labels,
2684 **kwargs,
2685 )
2686 writer.write_file()
2687
2688 def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None:
2689 """
2690 Write a DataFrame to the binary Feather format.
2691
2692 Parameters
2693 ----------
2694 path : str, path object, file-like object
2695 String, path object (implementing ``os.PathLike[str]``), or file-like
2696 object implementing a binary ``write()`` function. If a string or a path,
2697 it will be used as Root Directory path when writing a partitioned dataset.
2698 **kwargs :
2699 Additional keywords passed to :func:`pyarrow.feather.write_feather`.
2700 Starting with pyarrow 0.17, this includes the `compression`,
2701 `compression_level`, `chunksize` and `version` keywords.
2702
2703 .. versionadded:: 1.1.0
2704
2705 Notes
2706 -----
2707 This function writes the dataframe as a `feather file
2708 <https://arrow.apache.org/docs/python/feather.html>`_. Requires a default
2709 index. For saving the DataFrame with your custom index use a method that
        supports custom indices, e.g. `to_parquet`.
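
        Examples
        --------
        Assuming ``pyarrow`` is installed:

        >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
        >>> df.to_feather("file.feather")  # doctest: +SKIP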
2711 """
2712 from pandas.io.feather_format import to_feather
2713
2714 to_feather(self, path, **kwargs)
2715
2716 @doc(
2717 Series.to_markdown,
2718 klass=_shared_doc_kwargs["klass"],
2719 storage_options=_shared_docs["storage_options"],
2720 examples="""Examples
2721 --------
2722 >>> df = pd.DataFrame(
2723 ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}
2724 ... )
2725 >>> print(df.to_markdown())
2726 | | animal_1 | animal_2 |
2727 |---:|:-----------|:-----------|
2728 | 0 | elk | dog |
2729 | 1 | pig | quetzal |
2730
2731 Output markdown with a tabulate option.
2732
2733 >>> print(df.to_markdown(tablefmt="grid"))
2734 +----+------------+------------+
2735 | | animal_1 | animal_2 |
2736 +====+============+============+
2737 | 0 | elk | dog |
2738 +----+------------+------------+
2739 | 1 | pig | quetzal |
2740 +----+------------+------------+""",
2741 )
2742 def to_markdown(
2743 self,
2744 buf: FilePath | WriteBuffer[str] | None = None,
2745 mode: str = "wt",
2746 index: bool = True,
2747 storage_options: StorageOptions = None,
2748 **kwargs,
2749 ) -> str | None:
2750 if "showindex" in kwargs:
            raise ValueError("Pass 'index' instead of 'showindex'")
2752
2753 kwargs.setdefault("headers", "keys")
2754 kwargs.setdefault("tablefmt", "pipe")
2755 kwargs.setdefault("showindex", index)
2756 tabulate = import_optional_dependency("tabulate")
2757 result = tabulate.tabulate(self, **kwargs)
2758 if buf is None:
2759 return result
2760
2761 with get_handle(buf, mode, storage_options=storage_options) as handles:
2762 handles.handle.write(result)
2763 return None
2764
2765 @overload
2766 def to_parquet(
2767 self,
2768 path: None = ...,
2769 engine: str = ...,
2770 compression: str | None = ...,
2771 index: bool | None = ...,
2772 partition_cols: list[str] | None = ...,
2773 storage_options: StorageOptions = ...,
2774 **kwargs,
2775 ) -> bytes:
2776 ...
2777
2778 @overload
2779 def to_parquet(
2780 self,
2781 path: FilePath | WriteBuffer[bytes],
2782 engine: str = ...,
2783 compression: str | None = ...,
2784 index: bool | None = ...,
2785 partition_cols: list[str] | None = ...,
2786 storage_options: StorageOptions = ...,
2787 **kwargs,
2788 ) -> None:
2789 ...
2790
2791 @doc(storage_options=_shared_docs["storage_options"])
2792 def to_parquet(
2793 self,
2794 path: FilePath | WriteBuffer[bytes] | None = None,
2795 engine: str = "auto",
2796 compression: str | None = "snappy",
2797 index: bool | None = None,
2798 partition_cols: list[str] | None = None,
2799 storage_options: StorageOptions = None,
2800 **kwargs,
2801 ) -> bytes | None:
2802 """
2803 Write a DataFrame to the binary parquet format.
2804
2805 This function writes the dataframe as a `parquet file
2806 <https://parquet.apache.org/>`_. You can choose different parquet
2807 backends, and have the option of compression. See
2808 :ref:`the user guide <io.parquet>` for more details.
2809
2810 Parameters
2811 ----------
2812 path : str, path object, file-like object, or None, default None
2813 String, path object (implementing ``os.PathLike[str]``), or file-like
2814 object implementing a binary ``write()`` function. If None, the result is
2815 returned as bytes. If a string or path, it will be used as Root Directory
2816 path when writing a partitioned dataset.
2817
2818 .. versionchanged:: 1.2.0
2819
2820 Previously this was "fname"
2821
2822 engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
2823 Parquet library to use. If 'auto', then the option
2824 ``io.parquet.engine`` is used. The default ``io.parquet.engine``
2825 behavior is to try 'pyarrow', falling back to 'fastparquet' if
2826 'pyarrow' is unavailable.
2827 compression : {{'snappy', 'gzip', 'brotli', None}}, default 'snappy'
2828 Name of the compression to use. Use ``None`` for no compression.
2829 index : bool, default None
2830 If ``True``, include the dataframe's index(es) in the file output.
2831 If ``False``, they will not be written to the file.
2832 If ``None``, similar to ``True`` the dataframe's index(es)
2833 will be saved. However, instead of being saved as values,
2834 the RangeIndex will be stored as a range in the metadata so it
2835 doesn't require much space and is faster. Other indexes will
2836 be included as columns in the file output.
2837 partition_cols : list, optional, default None
2838 Column names by which to partition the dataset.
2839 Columns are partitioned in the order they are given.
2840 Must be None if path is not a string.
2841 {storage_options}
2842
2843 .. versionadded:: 1.2.0
2844
2845 **kwargs
2846 Additional arguments passed to the parquet library. See
2847 :ref:`pandas io <io.parquet>` for more details.
2848
2849 Returns
2850 -------
2851 bytes if no path argument is provided else None
2852
2853 See Also
2854 --------
2855 read_parquet : Read a parquet file.
2856 DataFrame.to_orc : Write an orc file.
2857 DataFrame.to_csv : Write a csv file.
2858 DataFrame.to_sql : Write to a sql table.
2859 DataFrame.to_hdf : Write to hdf.
2860
2861 Notes
2862 -----
2863 This function requires either the `fastparquet
2864 <https://pypi.org/project/fastparquet>`_ or `pyarrow
2865 <https://arrow.apache.org/docs/python/>`_ library.
2866
2867 Examples
2868 --------
2869 >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}})
2870 >>> df.to_parquet('df.parquet.gzip',
2871 ... compression='gzip') # doctest: +SKIP
2872 >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP
2873 col1 col2
2874 0 1 3
2875 1 2 4
2876
        If you want to get a buffer to the parquet content you can use an
        io.BytesIO object, as long as you don't use partition_cols, which
        creates multiple files.
2879
2880 >>> import io
2881 >>> f = io.BytesIO()
2882 >>> df.to_parquet(f)
2883 >>> f.seek(0)
2884 0
2885 >>> content = f.read()
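
        A dataset can also be partitioned on disk by one or more columns;
        ``path`` is then used as the root directory of the written files:

        >>> df.to_parquet('my_dataset_root',
        ...               partition_cols=['col1'])  # doctest: +SKIP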
2886 """
2887 from pandas.io.parquet import to_parquet
2888
2889 return to_parquet(
2890 self,
2891 path,
2892 engine,
2893 compression=compression,
2894 index=index,
2895 partition_cols=partition_cols,
2896 storage_options=storage_options,
2897 **kwargs,
2898 )
2899
2900 def to_orc(
2901 self,
2902 path: FilePath | WriteBuffer[bytes] | None = None,
2903 *,
2904 engine: Literal["pyarrow"] = "pyarrow",
2905 index: bool | None = None,
2906 engine_kwargs: dict[str, Any] | None = None,
2907 ) -> bytes | None:
2908 """
2909 Write a DataFrame to the ORC format.
2910
2911 .. versionadded:: 1.5.0
2912
2913 Parameters
2914 ----------
2915 path : str, file-like object or None, default None
2916 If a string, it will be used as Root Directory path
2917 when writing a partitioned dataset. By file-like object,
2918 we refer to objects with a write() method, such as a file handle
2919 (e.g. via builtin open function). If path is None,
2920 a bytes object is returned.
2921 engine : str, default 'pyarrow'
2922 ORC library to use. Pyarrow must be >= 7.0.0.
2923 index : bool, optional
2924 If ``True``, include the dataframe's index(es) in the file output.
2925 If ``False``, they will not be written to the file.
2926 If ``None``, similar to ``infer`` the dataframe's index(es)
2927 will be saved. However, instead of being saved as values,
2928 the RangeIndex will be stored as a range in the metadata so it
2929 doesn't require much space and is faster. Other indexes will
2930 be included as columns in the file output.
2931 engine_kwargs : dict[str, Any] or None, default None
2932 Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
2933
2934 Returns
2935 -------
2936 bytes if no path argument is provided else None
2937
2938 Raises
2939 ------
2940 NotImplementedError
2941 Dtype of one or more columns is category, unsigned integers, interval,
2942 period or sparse.
2943 ValueError
2944 engine is not pyarrow.
2945
2946 See Also
2947 --------
        read_orc : Read an ORC file.
2949 DataFrame.to_parquet : Write a parquet file.
2950 DataFrame.to_csv : Write a csv file.
2951 DataFrame.to_sql : Write to a sql table.
2952 DataFrame.to_hdf : Write to hdf.
2953
2954 Notes
2955 -----
2956 * Before using this function you should read the :ref:`user guide about
2957 ORC <io.orc>` and :ref:`install optional dependencies <install.warn_orc>`.
2958 * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
2959 library.
2960 * For supported dtypes please refer to `supported ORC features in Arrow
2961 <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
2962 * Currently timezones in datetime columns are not preserved when a
2963 dataframe is converted into ORC files.
2964
2965 Examples
2966 --------
2967 >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]})
2968 >>> df.to_orc('df.orc') # doctest: +SKIP
2969 >>> pd.read_orc('df.orc') # doctest: +SKIP
2970 col1 col2
2971 0 1 4
2972 1 2 3
2973
        If you want to get a buffer to the ORC content you can write it to
        io.BytesIO:

2975 >>> import io
2976 >>> b = io.BytesIO(df.to_orc()) # doctest: +SKIP
2977 >>> b.seek(0) # doctest: +SKIP
2978 0
2979 >>> content = b.read() # doctest: +SKIP
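
        Additional options can be forwarded to :func:`pyarrow.orc.write_table`
        via ``engine_kwargs`` (the compression choice here is illustrative):

        >>> df.to_orc('df.orc',
        ...           engine_kwargs={'compression': 'snappy'})  # doctest: +SKIP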
2980 """
2981 from pandas.io.orc import to_orc
2982
2983 return to_orc(
2984 self, path, engine=engine, index=index, engine_kwargs=engine_kwargs
2985 )
2986
2987 @overload
2988 def to_html(
2989 self,
2990 buf: FilePath | WriteBuffer[str],
2991 columns: Sequence[Level] | None = ...,
2992 col_space: ColspaceArgType | None = ...,
2993 header: bool | Sequence[str] = ...,
2994 index: bool = ...,
2995 na_rep: str = ...,
2996 formatters: FormattersType | None = ...,
2997 float_format: FloatFormatType | None = ...,
2998 sparsify: bool | None = ...,
2999 index_names: bool = ...,
3000 justify: str | None = ...,
3001 max_rows: int | None = ...,
3002 max_cols: int | None = ...,
3003 show_dimensions: bool | str = ...,
3004 decimal: str = ...,
3005 bold_rows: bool = ...,
3006 classes: str | list | tuple | None = ...,
3007 escape: bool = ...,
3008 notebook: bool = ...,
3009 border: int | bool | None = ...,
3010 table_id: str | None = ...,
3011 render_links: bool = ...,
3012 encoding: str | None = ...,
3013 ) -> None:
3014 ...
3015
3016 @overload
3017 def to_html(
3018 self,
3019 buf: None = ...,
3020 columns: Sequence[Level] | None = ...,
3021 col_space: ColspaceArgType | None = ...,
3022 header: bool | Sequence[str] = ...,
3023 index: bool = ...,
3024 na_rep: str = ...,
3025 formatters: FormattersType | None = ...,
3026 float_format: FloatFormatType | None = ...,
3027 sparsify: bool | None = ...,
3028 index_names: bool = ...,
3029 justify: str | None = ...,
3030 max_rows: int | None = ...,
3031 max_cols: int | None = ...,
3032 show_dimensions: bool | str = ...,
3033 decimal: str = ...,
3034 bold_rows: bool = ...,
3035 classes: str | list | tuple | None = ...,
3036 escape: bool = ...,
3037 notebook: bool = ...,
3038 border: int | bool | None = ...,
3039 table_id: str | None = ...,
3040 render_links: bool = ...,
3041 encoding: str | None = ...,
3042 ) -> str:
3043 ...
3044
3045 @Substitution(
3046 header_type="bool",
3047 header="Whether to print column labels, default True",
3048 col_space_type="str or int, list or dict of int or str",
3049 col_space="The minimum width of each column in CSS length "
3050 "units. An int is assumed to be px units.",
3051 )
3052 @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
3053 def to_html(
3054 self,
3055 buf: FilePath | WriteBuffer[str] | None = None,
3056 columns: Sequence[Level] | None = None,
3057 col_space: ColspaceArgType | None = None,
3058 header: bool | Sequence[str] = True,
3059 index: bool = True,
3060 na_rep: str = "NaN",
3061 formatters: FormattersType | None = None,
3062 float_format: FloatFormatType | None = None,
3063 sparsify: bool | None = None,
3064 index_names: bool = True,
3065 justify: str | None = None,
3066 max_rows: int | None = None,
3067 max_cols: int | None = None,
3068 show_dimensions: bool | str = False,
3069 decimal: str = ".",
3070 bold_rows: bool = True,
3071 classes: str | list | tuple | None = None,
3072 escape: bool = True,
3073 notebook: bool = False,
3074 border: int | bool | None = None,
3075 table_id: str | None = None,
3076 render_links: bool = False,
3077 encoding: str | None = None,
3078 ) -> str | None:
3079 """
3080 Render a DataFrame as an HTML table.
3081 %(shared_params)s
3082 bold_rows : bool, default True
3083 Make the row labels bold in the output.
3084 classes : str or list or tuple, default None
3085 CSS class(es) to apply to the resulting html table.
3086 escape : bool, default True
3087 Convert the characters <, >, and & to HTML-safe sequences.
3088 notebook : {True, False}, default False
3089 Whether the generated HTML is for IPython Notebook.
3090 border : int
3091 A ``border=border`` attribute is included in the opening
3092 `<table>` tag. Default ``pd.options.display.html.border``.
3093 table_id : str, optional
3094 A css id is included in the opening `<table>` tag if specified.
3095 render_links : bool, default False
3096 Convert URLs to HTML links.
3097 encoding : str, default "utf-8"
3098 Set character encoding.
3099
3100 .. versionadded:: 1.0
3101 %(returns)s
3102 See Also
3103 --------
3104 to_string : Convert DataFrame to a string.
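
        Examples
        --------
        A minimal example (the full HTML output is elided):

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> print(df.to_html())  # doctest: +SKIP
        <table border="1" class="dataframe">
        ...
        </table>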
3105 """
3106 if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS:
3107 raise ValueError("Invalid value for justify parameter")
3108
3109 formatter = fmt.DataFrameFormatter(
3110 self,
3111 columns=columns,
3112 col_space=col_space,
3113 na_rep=na_rep,
3114 header=header,
3115 index=index,
3116 formatters=formatters,
3117 float_format=float_format,
3118 bold_rows=bold_rows,
3119 sparsify=sparsify,
3120 justify=justify,
3121 index_names=index_names,
3122 escape=escape,
3123 decimal=decimal,
3124 max_rows=max_rows,
3125 max_cols=max_cols,
3126 show_dimensions=show_dimensions,
3127 )
        # TODO: a generic formatter would be in DataFrameFormatter
3129 return fmt.DataFrameRenderer(formatter).to_html(
3130 buf=buf,
3131 classes=classes,
3132 notebook=notebook,
3133 border=border,
3134 encoding=encoding,
3135 table_id=table_id,
3136 render_links=render_links,
3137 )
3138
3139 @doc(
3140 storage_options=_shared_docs["storage_options"],
3141 compression_options=_shared_docs["compression_options"] % "path_or_buffer",
3142 )
3143 def to_xml(
3144 self,
3145 path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
3146 index: bool = True,
3147 root_name: str | None = "data",
3148 row_name: str | None = "row",
3149 na_rep: str | None = None,
3150 attr_cols: list[str] | None = None,
3151 elem_cols: list[str] | None = None,
3152 namespaces: dict[str | None, str] | None = None,
3153 prefix: str | None = None,
3154 encoding: str = "utf-8",
3155 xml_declaration: bool | None = True,
3156 pretty_print: bool | None = True,
3157 parser: str | None = "lxml",
3158 stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None,
3159 compression: CompressionOptions = "infer",
3160 storage_options: StorageOptions = None,
3161 ) -> str | None:
3162 """
3163 Render a DataFrame to an XML document.
3164
3165 .. versionadded:: 1.3.0
3166
3167 Parameters
3168 ----------
3169 path_or_buffer : str, path object, file-like object, or None, default None
3170 String, path object (implementing ``os.PathLike[str]``), or file-like
3171 object implementing a ``write()`` function. If None, the result is returned
3172 as a string.
3173 index : bool, default True
3174 Whether to include index in XML document.
3175 root_name : str, default 'data'
3176 The name of root element in XML document.
3177 row_name : str, default 'row'
3178 The name of row element in XML document.
3179 na_rep : str, optional
3180 Missing data representation.
3181 attr_cols : list-like, optional
3182 List of columns to write as attributes in row element.
3183 Hierarchical columns will be flattened with underscore
3184 delimiting the different levels.
3185 elem_cols : list-like, optional
3186 List of columns to write as children in row element. By default,
3187 all columns output as children of row element. Hierarchical
3188 columns will be flattened with underscore delimiting the
3189 different levels.
3190 namespaces : dict, optional
            All namespaces to be defined in root element. Keys of dict
            should be prefix names and values of dict the corresponding URIs.
            Default namespaces should be given an empty string key. For
            example, ::
3195
3196 namespaces = {{"": "https://example.com"}}
3197
3198 prefix : str, optional
3199 Namespace prefix to be used for every element and/or attribute
3200 in document. This should be one of the keys in ``namespaces``
3201 dict.
3202 encoding : str, default 'utf-8'
3203 Encoding of the resulting document.
3204 xml_declaration : bool, default True
3205 Whether to include the XML declaration at start of document.
3206 pretty_print : bool, default True
3207 Whether output should be pretty printed with indentation and
3208 line breaks.
3209 parser : {{'lxml','etree'}}, default 'lxml'
3210 Parser module to use for building of tree. Only 'lxml' and
            'etree' are supported. With 'lxml', the ability to use an XSLT
            stylesheet is supported.
3213 stylesheet : str, path object or file-like object, optional
3214 A URL, file-like object, or a raw string containing an XSLT
3215 script used to transform the raw XML output. Script should use
            layout of elements and attributes from original output. This
            argument requires ``lxml`` to be installed. Only XSLT 1.0
            scripts are currently supported; later versions are not.
3219 {compression_options}
3220
3221 .. versionchanged:: 1.4.0 Zstandard support.
3222
3223 {storage_options}
3224
3225 Returns
3226 -------
3227 None or str
            If ``path_or_buffer`` is None, returns the resulting XML format as
            a string. Otherwise returns None.
3230
3231 See Also
3232 --------
3233 to_json : Convert the pandas object to a JSON string.
        to_html : Convert DataFrame to HTML.
3235
3236 Examples
3237 --------
3238 >>> df = pd.DataFrame({{'shape': ['square', 'circle', 'triangle'],
3239 ... 'degrees': [360, 360, 180],
3240 ... 'sides': [4, np.nan, 3]}})
3241
3242 >>> df.to_xml() # doctest: +SKIP
3243 <?xml version='1.0' encoding='utf-8'?>
3244 <data>
3245 <row>
3246 <index>0</index>
3247 <shape>square</shape>
3248 <degrees>360</degrees>
3249 <sides>4.0</sides>
3250 </row>
3251 <row>
3252 <index>1</index>
3253 <shape>circle</shape>
3254 <degrees>360</degrees>
3255 <sides/>
3256 </row>
3257 <row>
3258 <index>2</index>
3259 <shape>triangle</shape>
3260 <degrees>180</degrees>
3261 <sides>3.0</sides>
3262 </row>
3263 </data>
3264
3265 >>> df.to_xml(attr_cols=[
3266 ... 'index', 'shape', 'degrees', 'sides'
3267 ... ]) # doctest: +SKIP
3268 <?xml version='1.0' encoding='utf-8'?>
3269 <data>
3270 <row index="0" shape="square" degrees="360" sides="4.0"/>
3271 <row index="1" shape="circle" degrees="360"/>
3272 <row index="2" shape="triangle" degrees="180" sides="3.0"/>
3273 </data>
3274
3275 >>> df.to_xml(namespaces={{"doc": "https://example.com"}},
3276 ... prefix="doc") # doctest: +SKIP
3277 <?xml version='1.0' encoding='utf-8'?>
3278 <doc:data xmlns:doc="https://example.com">
3279 <doc:row>
3280 <doc:index>0</doc:index>
3281 <doc:shape>square</doc:shape>
3282 <doc:degrees>360</doc:degrees>
3283 <doc:sides>4.0</doc:sides>
3284 </doc:row>
3285 <doc:row>
3286 <doc:index>1</doc:index>
3287 <doc:shape>circle</doc:shape>
3288 <doc:degrees>360</doc:degrees>
3289 <doc:sides/>
3290 </doc:row>
3291 <doc:row>
3292 <doc:index>2</doc:index>
3293 <doc:shape>triangle</doc:shape>
3294 <doc:degrees>180</doc:degrees>
3295 <doc:sides>3.0</doc:sides>
3296 </doc:row>
3297 </doc:data>
3298 """
3299
3300 from pandas.io.formats.xml import (
3301 EtreeXMLFormatter,
3302 LxmlXMLFormatter,
3303 )
3304
3305 lxml = import_optional_dependency("lxml.etree", errors="ignore")
3306
3307 TreeBuilder: type[EtreeXMLFormatter] | type[LxmlXMLFormatter]
3308
3309 if parser == "lxml":
3310 if lxml is not None:
3311 TreeBuilder = LxmlXMLFormatter
3312 else:
3313 raise ImportError(
3314 "lxml not found, please install or use the etree parser."
3315 )
3316
3317 elif parser == "etree":
3318 TreeBuilder = EtreeXMLFormatter
3319
3320 else:
3321 raise ValueError("Values for parser can only be lxml or etree.")
3322
3323 xml_formatter = TreeBuilder(
3324 self,
3325 path_or_buffer=path_or_buffer,
3326 index=index,
3327 root_name=root_name,
3328 row_name=row_name,
3329 na_rep=na_rep,
3330 attr_cols=attr_cols,
3331 elem_cols=elem_cols,
3332 namespaces=namespaces,
3333 prefix=prefix,
3334 encoding=encoding,
3335 xml_declaration=xml_declaration,
3336 pretty_print=pretty_print,
3337 stylesheet=stylesheet,
3338 compression=compression,
3339 storage_options=storage_options,
3340 )
3341
3342 return xml_formatter.write_output()
3343
3344 # ----------------------------------------------------------------------
3345 @doc(INFO_DOCSTRING, **frame_sub_kwargs)
3346 def info(
3347 self,
3348 verbose: bool | None = None,
3349 buf: WriteBuffer[str] | None = None,
3350 max_cols: int | None = None,
3351 memory_usage: bool | str | None = None,
3352 show_counts: bool | None = None,
3353 ) -> None:
3354 info = DataFrameInfo(
3355 data=self,
3356 memory_usage=memory_usage,
3357 )
3358 info.render(
3359 buf=buf,
3360 max_cols=max_cols,
3361 verbose=verbose,
3362 show_counts=show_counts,
3363 )
3364
3365 def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
3366 """
3367 Return the memory usage of each column in bytes.
3368
3369 The memory usage can optionally include the contribution of
3370 the index and elements of `object` dtype.
3371
3372 This value is displayed in `DataFrame.info` by default. This can be
3373 suppressed by setting ``pandas.options.display.memory_usage`` to False.
3374
3375 Parameters
3376 ----------
3377 index : bool, default True
3378 Specifies whether to include the memory usage of the DataFrame's
3379 index in returned Series. If ``index=True``, the memory usage of
3380 the index is the first item in the output.
3381 deep : bool, default False
3382 If True, introspect the data deeply by interrogating
3383 `object` dtypes for system-level memory consumption, and include
3384 it in the returned values.
3385
3386 Returns
3387 -------
3388 Series
            A Series whose index is the original column names and whose values
            are the memory usage of each column in bytes.
3391
3392 See Also
3393 --------
3394 numpy.ndarray.nbytes : Total bytes consumed by the elements of an
3395 ndarray.
3396 Series.memory_usage : Bytes consumed by a Series.
3397 Categorical : Memory-efficient array for string values with
3398 many repeated values.
3399 DataFrame.info : Concise summary of a DataFrame.
3400
3401 Notes
3402 -----
3403 See the :ref:`Frequently Asked Questions <df-memory-usage>` for more
3404 details.
3405
3406 Examples
3407 --------
3408 >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
3409 >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t))
3410 ... for t in dtypes])
3411 >>> df = pd.DataFrame(data)
3412 >>> df.head()
3413 int64 float64 complex128 object bool
3414 0 1 1.0 1.0+0.0j 1 True
3415 1 1 1.0 1.0+0.0j 1 True
3416 2 1 1.0 1.0+0.0j 1 True
3417 3 1 1.0 1.0+0.0j 1 True
3418 4 1 1.0 1.0+0.0j 1 True
3419
3420 >>> df.memory_usage()
3421 Index 128
3422 int64 40000
3423 float64 40000
3424 complex128 80000
3425 object 40000
3426 bool 5000
3427 dtype: int64
3428
3429 >>> df.memory_usage(index=False)
3430 int64 40000
3431 float64 40000
3432 complex128 80000
3433 object 40000
3434 bool 5000
3435 dtype: int64
3436
3437 The memory footprint of `object` dtype columns is ignored by default:
3438
3439 >>> df.memory_usage(deep=True)
3440 Index 128
3441 int64 40000
3442 float64 40000
3443 complex128 80000
3444 object 180000
3445 bool 5000
3446 dtype: int64
3447
3448 Use a Categorical for efficient storage of an object-dtype column with
3449 many repeated values.
3450
3451 >>> df['object'].astype('category').memory_usage(deep=True)
3452 5244
3453 """
3454 result = self._constructor_sliced(
3455 [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
3456 index=self.columns,
3457 dtype=np.intp,
3458 )
3459 if index:
3460 index_memory_usage = self._constructor_sliced(
3461 self.index.memory_usage(deep=deep), index=["Index"]
3462 )
3463 result = index_memory_usage._append(result)
3464 return result
3465
3466 def transpose(self, *args, copy: bool = False) -> DataFrame:
3467 """
3468 Transpose index and columns.
3469
3470 Reflect the DataFrame over its main diagonal by writing rows as columns
3471 and vice-versa. The property :attr:`.T` is an accessor to the method
3472 :meth:`transpose`.
3473
3474 Parameters
3475 ----------
3476 *args : tuple, optional
3477 Accepted for compatibility with NumPy.
3478 copy : bool, default False
3479 Whether to copy the data after transposing, even for DataFrames
3480 with a single dtype.
3481
3482 Note that a copy is always required for mixed dtype DataFrames,
3483 or for DataFrames with any extension types.
3484
3485 Returns
3486 -------
3487 DataFrame
3488 The transposed DataFrame.
3489
3490 See Also
3491 --------
3492 numpy.transpose : Permute the dimensions of a given array.
3493
3494 Notes
3495 -----
3496 Transposing a DataFrame with mixed dtypes will result in a homogeneous
3497 DataFrame with the `object` dtype. In such a case, a copy of the data
3498 is always made.
3499
3500 Examples
3501 --------
3502 **Square DataFrame with homogeneous dtype**
3503
3504 >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
3505 >>> df1 = pd.DataFrame(data=d1)
3506 >>> df1
3507 col1 col2
3508 0 1 3
3509 1 2 4
3510
3511 >>> df1_transposed = df1.T # or df1.transpose()
3512 >>> df1_transposed
3513 0 1
3514 col1 1 2
3515 col2 3 4
3516
3517 When the dtype is homogeneous in the original DataFrame, we get a
3518 transposed DataFrame with the same dtype:
3519
3520 >>> df1.dtypes
3521 col1 int64
3522 col2 int64
3523 dtype: object
3524 >>> df1_transposed.dtypes
3525 0 int64
3526 1 int64
3527 dtype: object
3528
3529 **Non-square DataFrame with mixed dtypes**
3530
3531 >>> d2 = {'name': ['Alice', 'Bob'],
3532 ... 'score': [9.5, 8],
3533 ... 'employed': [False, True],
3534 ... 'kids': [0, 0]}
3535 >>> df2 = pd.DataFrame(data=d2)
3536 >>> df2
3537 name score employed kids
3538 0 Alice 9.5 False 0
3539 1 Bob 8.0 True 0
3540
3541 >>> df2_transposed = df2.T # or df2.transpose()
3542 >>> df2_transposed
3543 0 1
3544 name Alice Bob
3545 score 9.5 8.0
3546 employed False True
3547 kids 0 0
3548
3549 When the DataFrame has mixed dtypes, we get a transposed DataFrame with
3550 the `object` dtype:
3551
3552 >>> df2.dtypes
3553 name object
3554 score float64
3555 employed bool
3556 kids int64
3557 dtype: object
3558 >>> df2_transposed.dtypes
3559 0 object
3560 1 object
3561 dtype: object
3562 """
3563 nv.validate_transpose(args, {})
3564 # construct the args
3565
3566 dtypes = list(self.dtypes)
3567
3568 if self._can_fast_transpose:
3569 # Note: tests pass without this, but this improves perf quite a bit.
3570 new_vals = self._values.T
3571 if copy and not using_copy_on_write():
3572 new_vals = new_vals.copy()
3573
3574 result = self._constructor(
3575 new_vals, index=self.columns, columns=self.index, copy=False
3576 )
3577 if using_copy_on_write() and len(self) > 0:
3578 result._mgr.add_references(self._mgr) # type: ignore[arg-type]
3579
3580 elif (
3581 self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0])
3582 ):
3583 # We have EAs with the same dtype. We can preserve that dtype in transpose.
3584 dtype = dtypes[0]
3585 arr_type = dtype.construct_array_type()
3586 values = self.values
3587
3588 new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values]
3589 result = type(self)._from_arrays(
3590 new_values, index=self.columns, columns=self.index
3591 )
3592
3593 else:
3594 new_arr = self.values.T
3595 if copy and not using_copy_on_write():
3596 new_arr = new_arr.copy()
3597 result = self._constructor(
3598 new_arr,
3599 index=self.columns,
3600 columns=self.index,
3601 # We already made a copy (more than one block)
3602 copy=False,
3603 )
3604
3605 return result.__finalize__(self, method="transpose")
3606
3607 @property
3608 def T(self) -> DataFrame:
3609 """
3610 The transpose of the DataFrame.
3611
3612 Returns
3613 -------
3614 DataFrame
3615 The transposed DataFrame.
3616
3617 See Also
3618 --------
3619 DataFrame.transpose : Transpose index and columns.
3620
3621 Examples
3622 --------
3623 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
3624 >>> df
3625 col1 col2
3626 0 1 3
3627 1 2 4
3628
3629 >>> df.T
3630 0 1
3631 col1 1 2
3632 col2 3 4
3633 """
3634 return self.transpose()
3635
3636 # ----------------------------------------------------------------------
3637 # Indexing Methods
3638
3639 def _ixs(self, i: int, axis: AxisInt = 0) -> Series:
3640 """
3641 Parameters
3642 ----------
3643 i : int
3644 axis : int
3645
3646 Returns
3647 -------
3648 Series
3649 """
3650 # irow
3651 if axis == 0:
3652 new_mgr = self._mgr.fast_xs(i)
3653
3654 # if we are a copy, mark as such
3655 copy = isinstance(new_mgr.array, np.ndarray) and new_mgr.array.base is None
3656 result = self._constructor_sliced(new_mgr, name=self.index[i]).__finalize__(
3657 self
3658 )
3659 result._set_is_copy(self, copy=copy)
3660 return result
3661
3662 # icol
3663 else:
3664 label = self.columns[i]
3665
3666 col_mgr = self._mgr.iget(i)
3667 result = self._box_col_values(col_mgr, i)
3668
3669 # this is a cached value, mark it so
3670 result._set_as_cached(label, self)
3671 return result
3672
3673 def _get_column_array(self, i: int) -> ArrayLike:
3674 """
3675 Get the values of the i'th column (ndarray or ExtensionArray, as stored
3676 in the Block)
3677
3678 Warning! The returned array is a view but doesn't handle Copy-on-Write,
3679 so this should be used with caution (for read-only purposes).
3680 """
3681 return self._mgr.iget_values(i)
3682
3683 def _iter_column_arrays(self) -> Iterator[ArrayLike]:
3684 """
3685 Iterate over the arrays of all columns in order.
3686 This returns the values as stored in the Block (ndarray or ExtensionArray).
3687
3688 Warning! The returned array is a view but doesn't handle Copy-on-Write,
3689 so this should be used with caution (for read-only purposes).
3690 """
3691 for i in range(len(self.columns)):
3692 yield self._get_column_array(i)
3693
3694 def _getitem_nocopy(self, key: list):
3695 """
3696 Behaves like __getitem__, but returns a view in cases where __getitem__
3697 would make a copy.
3698 """
3699 # TODO(CoW): can be removed if/when we are always Copy-on-Write
3700 indexer = self.columns._get_indexer_strict(key, "columns")[1]
3701 new_axis = self.columns[indexer]
3702
3703 new_mgr = self._mgr.reindex_indexer(
3704 new_axis,
3705 indexer,
3706 axis=0,
3707 allow_dups=True,
3708 copy=False,
3709 only_slice=True,
3710 )
3711 return self._constructor(new_mgr)
3712
3713 def __getitem__(self, key):
3714 check_dict_or_set_indexers(key)
3715 key = lib.item_from_zerodim(key)
3716 key = com.apply_if_callable(key, self)
3717
3718 if is_hashable(key) and not is_iterator(key):
3719 # is_iterator to exclude generator e.g. test_getitem_listlike
3720 # shortcut if the key is in columns
3721 is_mi = isinstance(self.columns, MultiIndex)
3722 # GH#45316 Return view if key is not duplicated
3723 # Only use drop_duplicates with duplicates for performance
3724 if not is_mi and (
3725 self.columns.is_unique
3726 and key in self.columns
3727 or key in self.columns.drop_duplicates(keep=False)
3728 ):
3729 return self._get_item_cache(key)
3730
3731 elif is_mi and self.columns.is_unique and key in self.columns:
3732 return self._getitem_multilevel(key)
3733 # Do we have a slicer (on rows)?
3734 if isinstance(key, slice):
3735 indexer = self.index._convert_slice_indexer(key, kind="getitem")
3736 if isinstance(indexer, np.ndarray):
3737 # reachable with DatetimeIndex
3738 indexer = lib.maybe_indices_to_slice(
3739 indexer.astype(np.intp, copy=False), len(self)
3740 )
3741 if isinstance(indexer, np.ndarray):
3742 # GH#43223 If we can not convert, use take
3743 return self.take(indexer, axis=0)
3744 return self._slice(indexer, axis=0)
3745
3746 # Do we have a (boolean) DataFrame?
3747 if isinstance(key, DataFrame):
3748 return self.where(key)
3749
3750 # Do we have a (boolean) 1d indexer?
3751 if com.is_bool_indexer(key):
3752 return self._getitem_bool_array(key)
3753
3754 # We are left with two options: a single key, and a collection of keys,
3755 # We interpret tuples as collections only for non-MultiIndex
3756 is_single_key = isinstance(key, tuple) or not is_list_like(key)
3757
3758 if is_single_key:
3759 if self.columns.nlevels > 1:
3760 return self._getitem_multilevel(key)
3761 indexer = self.columns.get_loc(key)
3762 if is_integer(indexer):
3763 indexer = [indexer]
3764 else:
3765 if is_iterator(key):
3766 key = list(key)
3767 indexer = self.columns._get_indexer_strict(key, "columns")[1]
3768
3769 # take() does not accept boolean indexers
3770 if getattr(indexer, "dtype", None) == bool:
3771 indexer = np.where(indexer)[0]
3772
3773 data = self._take_with_is_copy(indexer, axis=1)
3774
3775 if is_single_key:
3776 # What does looking for a single key in a non-unique index return?
3777 # The behavior is inconsistent. It returns a Series, except when
3778 # - the key itself is repeated (test on data.shape, #9519), or
3779 # - we have a MultiIndex on columns (test on self.columns, #21309)
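            # e.g. with columns ["A", "A", "B"], df["A"] returns a two-column
            # DataFrame, while df["B"] returns a Series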
3780 if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex):
3781 # GH#26490 using data[key] can cause RecursionError
3782 return data._get_item_cache(key)
3783
3784 return data
3785
3786 def _getitem_bool_array(self, key):
3787 # also raises Exception if object array with NA values
3788 # warning here just in case -- previously __setitem__ was
3789 # reindexing but __getitem__ was not; it seems more reasonable to
3790 # go with the __setitem__ behavior since that is more consistent
3791 # with all other indexing behavior
3792 if isinstance(key, Series) and not key.index.equals(self.index):
3793 warnings.warn(
3794 "Boolean Series key will be reindexed to match DataFrame index.",
3795 UserWarning,
3796 stacklevel=find_stack_level(),
3797 )
3798 elif len(key) != len(self.index):
3799 raise ValueError(
3800 f"Item wrong length {len(key)} instead of {len(self.index)}."
3801 )
3802
3803 # check_bool_indexer will throw exception if Series key cannot
3804 # be reindexed to match DataFrame rows
3805 key = check_bool_indexer(self.index, key)
3806
3807 if key.all():
3808 return self.copy(deep=None)
3809
3810 indexer = key.nonzero()[0]
3811 return self._take_with_is_copy(indexer, axis=0)
3812
3813 def _getitem_multilevel(self, key):
3814 # self.columns is a MultiIndex
3815 loc = self.columns.get_loc(key)
3816 if isinstance(loc, (slice, np.ndarray)):
3817 new_columns = self.columns[loc]
3818 result_columns = maybe_droplevels(new_columns, key)
3819 result = self.iloc[:, loc]
3820 result.columns = result_columns
3821
3822 # If there is only one column being returned, and its name is
3823 # either an empty string, or a tuple with an empty string as its
3824 # first element, then treat the empty string as a placeholder
3825 # and return the column as if the user had provided that empty
3826 # string in the key. If the result is a Series, exclude the
3827 # implied empty string from its name.
3828 if len(result.columns) == 1:
3829 # e.g. test_frame_getitem_multicolumn_empty_level,
3830 # test_frame_mixed_depth_get, test_loc_setitem_single_column_slice
3831 top = result.columns[0]
3832 if isinstance(top, tuple):
3833 top = top[0]
3834 if top == "":
3835 result = result[""]
3836 if isinstance(result, Series):
3837 result = self._constructor_sliced(
3838 result, index=self.index, name=key
3839 )
3840
3841 result._set_is_copy(self)
3842 return result
3843 else:
3844 # loc is neither a slice nor ndarray, so must be an int
3845 return self._ixs(loc, axis=1)
3846
3847 def _get_value(self, index, col, takeable: bool = False) -> Scalar:
3848 """
3849 Quickly retrieve single value at passed column and index.
3850
3851 Parameters
3852 ----------
3853 index : row label
3854 col : column label
3855 takeable : interpret the index/col as indexers, default False
3856
3857 Returns
3858 -------
3859 scalar
3860
3861 Notes
3862 -----
        Assumes that both `self.index._index_as_unique` and
        `self.columns._index_as_unique` hold; the caller is responsible
        for checking.
3865 """
3866 if takeable:
3867 series = self._ixs(col, axis=1)
3868 return series._values[index]
3869
3870 series = self._get_item_cache(col)
3871 engine = self.index._engine
3872
3873 if not isinstance(self.index, MultiIndex):
3874 # CategoricalIndex: Trying to use the engine fastpath may give incorrect
            # results if our categories are integers that don't match our codes
3876 # IntervalIndex: IntervalTree has no get_loc
3877 row = self.index.get_loc(index)
3878 return series._values[row]
3879
3880 # For MultiIndex going through engine effectively restricts us to
3881 # same-length tuples; see test_get_set_value_no_partial_indexing
3882 loc = engine.get_loc(index)
3883 return series._values[loc]
3884
3885 def isetitem(self, loc, value) -> None:
3886 """
3887 Set the given value in the column with position `loc`.
3888
3889 This is a positional analogue to ``__setitem__``.
3890
3891 Parameters
3892 ----------
3893 loc : int or sequence of ints
3894 Index position for the column.
3895 value : scalar or arraylike
3896 Value(s) for the column.
3897
3898 Notes
3899 -----
        ``frame.isetitem(loc, value)`` is an in-place method: it modifies the
        DataFrame directly rather than returning a new object. However, in
        contrast to ``frame.iloc[:, i] = value``, which tries to update the
        existing values in place, ``frame.isetitem(loc, value)`` does not
        update the values of the column itself in place; it inserts a new
        array instead.
3905
3906 In cases where ``frame.columns`` is unique, this is equivalent to
3907 ``frame[frame.columns[i]] = value``.
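
        Examples
        --------
        Set the values of the second column (position 1) by position:

        >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
        >>> df.isetitem(1, [10, 20])
        >>> df
           A   B
0  1  10
1  2  20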
3908 """
3909 if isinstance(value, DataFrame):
3910 if is_scalar(loc):
3911 loc = [loc]
3912
3913 for i, idx in enumerate(loc):
3914 arraylike = self._sanitize_column(value.iloc[:, i])
3915 self._iset_item_mgr(idx, arraylike, inplace=False)
3916 return
3917
3918 arraylike = self._sanitize_column(value)
3919 self._iset_item_mgr(loc, arraylike, inplace=False)
3920
3921 def __setitem__(self, key, value):
3922 if not PYPY and using_copy_on_write():
3923 if sys.getrefcount(self) <= 3:
3924 warnings.warn(
3925 _chained_assignment_msg, ChainedAssignmentError, stacklevel=2
3926 )
3927
3928 key = com.apply_if_callable(key, self)
3929
3930 # see if we can slice the rows
3931 if isinstance(key, slice):
3932 slc = self.index._convert_slice_indexer(key, kind="getitem")
3933 return self._setitem_slice(slc, value)
3934
3935 if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2:
3936 self._setitem_frame(key, value)
3937 elif isinstance(key, (Series, np.ndarray, list, Index)):
3938 self._setitem_array(key, value)
3939 elif isinstance(value, DataFrame):
3940 self._set_item_frame_value(key, value)
3941 elif (
3942 is_list_like(value)
3943 and not self.columns.is_unique
3944 and 1 < len(self.columns.get_indexer_for([key])) == len(value)
3945 ):
3946 # Column to set is duplicated
3947 self._setitem_array([key], value)
3948 else:
3949 # set column
3950 self._set_item(key, value)
3951
3952 def _setitem_slice(self, key: slice, value) -> None:
3953 # NB: we can't just use self.loc[key] = value because that
        # operates on labels and we need to operate positionally for
        # backwards-compat, xref GH#31469
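        # e.g. with index ["a", "b", "c"], df[0:2] = 0 sets the rows at
        # positions 0 and 1, even though the labels are strings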
3956 self._check_setitem_copy()
3957 self.iloc[key] = value
3958
3959 def _setitem_array(self, key, value):
3960 # also raises Exception if object array with NA values
3961 if com.is_bool_indexer(key):
3962 # bool indexer is indexing along rows
3963 if len(key) != len(self.index):
3964 raise ValueError(
3965 f"Item wrong length {len(key)} instead of {len(self.index)}!"
3966 )
3967 key = check_bool_indexer(self.index, key)
3968 indexer = key.nonzero()[0]
3969 self._check_setitem_copy()
3970 if isinstance(value, DataFrame):
3971 # GH#39931 reindex since iloc does not align
3972 value = value.reindex(self.index.take(indexer))
3973 self.iloc[indexer] = value
3974
3975 else:
3976 # Note: unlike self.iloc[:, indexer] = value, this will
3977 # never try to overwrite values inplace
3978
3979 if isinstance(value, DataFrame):
3980 check_key_length(self.columns, key, value)
3981 for k1, k2 in zip(key, value.columns):
3982 self[k1] = value[k2]
3983
3984 elif not is_list_like(value):
3985 for col in key:
3986 self[col] = value
3987
3988 elif isinstance(value, np.ndarray) and value.ndim == 2:
3989 self._iset_not_inplace(key, value)
3990
3991 elif np.ndim(value) > 1:
3992 # list of lists
3993 value = DataFrame(value).values
3994 return self._setitem_array(key, value)
3995
3996 else:
3997 self._iset_not_inplace(key, value)
3998
3999 def _iset_not_inplace(self, key, value):
4000 # GH#39510 when setting with df[key] = obj with a list-like key and
4001 # list-like value, we iterate over those listlikes and set columns
4002 # one at a time. This is different from dispatching to
4003 # `self.loc[:, key]= value` because loc.__setitem__ may overwrite
4004 # data inplace, whereas this will insert new arrays.
4005
4006 def igetitem(obj, i: int):
4007 # Note: we catch DataFrame obj before getting here, but
4008 # hypothetically would return obj.iloc[:, i]
4009 if isinstance(obj, np.ndarray):
4010 return obj[..., i]
4011 else:
4012 return obj[i]
4013
4014 if self.columns.is_unique:
4015 if np.shape(value)[-1] != len(key):
4016 raise ValueError("Columns must be same length as key")
4017
4018 for i, col in enumerate(key):
4019 self[col] = igetitem(value, i)
4020
4021 else:
4022 ilocs = self.columns.get_indexer_non_unique(key)[0]
4023 if (ilocs < 0).any():
4024 # key entries not in self.columns
4025 raise NotImplementedError
4026
4027 if np.shape(value)[-1] != len(ilocs):
4028 raise ValueError("Columns must be same length as key")
4029
4030 assert np.ndim(value) <= 2
4031
4032 orig_columns = self.columns
4033
4034 # Using self.iloc[:, i] = ... may set values inplace, which
4035 # by convention we do not do in __setitem__
4036 try:
4037 self.columns = Index(range(len(self.columns)))
4038 for i, iloc in enumerate(ilocs):
4039 self[iloc] = igetitem(value, i)
4040 finally:
4041 self.columns = orig_columns
4042
4043 def _setitem_frame(self, key, value):
4044 # support boolean setting with DataFrame input, e.g.
4045 # df[df > df2] = 0
4046 if isinstance(key, np.ndarray):
4047 if key.shape != self.shape:
4048 raise ValueError("Array conditional must be same shape as self")
4049 key = self._constructor(key, **self._construct_axes_dict(), copy=False)
4050
4051 if key.size and not all(is_bool_dtype(dtype) for dtype in key.dtypes):
4052 raise TypeError(
4053 "Must pass DataFrame or 2-d ndarray with boolean values only"
4054 )
4055
4056 self._check_inplace_setting(value)
4057 self._check_setitem_copy()
4058 self._where(-key, value, inplace=True)
4059
4060 def _set_item_frame_value(self, key, value: DataFrame) -> None:
4061 self._ensure_valid_index(value)
4062
4063 # align columns
4064 if key in self.columns:
4065 loc = self.columns.get_loc(key)
4066 cols = self.columns[loc]
4067 len_cols = 1 if is_scalar(cols) or isinstance(cols, tuple) else len(cols)
4068 if len_cols != len(value.columns):
4069 raise ValueError("Columns must be same length as key")
4070
4071 # align right-hand-side columns if self.columns
4072 # is multi-index and self[key] is a sub-frame
4073 if isinstance(self.columns, MultiIndex) and isinstance(
4074 loc, (slice, Series, np.ndarray, Index)
4075 ):
4076 cols_droplevel = maybe_droplevels(cols, key)
4077 if len(cols_droplevel) and not cols_droplevel.equals(value.columns):
4078 value = value.reindex(cols_droplevel, axis=1)
4079
4080 for col, col_droplevel in zip(cols, cols_droplevel):
4081 self[col] = value[col_droplevel]
4082 return
4083
4084 if is_scalar(cols):
4085 self[cols] = value[value.columns[0]]
4086 return
4087
4088 # now align rows
4089 arraylike = _reindex_for_setitem(value, self.index)
4090 self._set_item_mgr(key, arraylike)
4091 return
4092
4093 if len(value.columns) != 1:
4094 raise ValueError(
4095 "Cannot set a DataFrame with multiple columns to the single "
4096 f"column {key}"
4097 )
4098
4099 self[key] = value[value.columns[0]]
4100
4101 def _iset_item_mgr(
4102 self, loc: int | slice | np.ndarray, value, inplace: bool = False
4103 ) -> None:
4104 # when called from _set_item_mgr loc can be anything returned from get_loc
4105 self._mgr.iset(loc, value, inplace=inplace)
4106 self._clear_item_cache()
4107
4108 def _set_item_mgr(self, key, value: ArrayLike) -> None:
4109 try:
4110 loc = self._info_axis.get_loc(key)
4111 except KeyError:
4112 # This item wasn't present, just insert at end
4113 self._mgr.insert(len(self._info_axis), key, value)
4114 else:
4115 self._iset_item_mgr(loc, value)
4116
4117 # check if we are modifying a copy
4118 # try to set first as we want an invalid
4119 # value exception to occur first
4120 if len(self):
4121 self._check_setitem_copy()
4122
4123 def _iset_item(self, loc: int, value) -> None:
4124 arraylike = self._sanitize_column(value)
4125 self._iset_item_mgr(loc, arraylike, inplace=True)
4126
4127 # check if we are modifying a copy
4128 # try to set first as we want an invalid
4129 # value exception to occur first
4130 if len(self):
4131 self._check_setitem_copy()
4132
4133 def _set_item(self, key, value) -> None:
4134 """
        Add a Series to the DataFrame in the specified column.

        If the value is a NumPy array (not a Series), it must be the same
        length as the DataFrame's index or an error will be raised.

        A Series will be conformed to the DataFrame's index to ensure
        homogeneity.
4142 """
4143 value = self._sanitize_column(value)
4144
4145 if (
4146 key in self.columns
4147 and value.ndim == 1
4148 and not is_extension_array_dtype(value)
4149 ):
4150 # broadcast across multiple columns if necessary
4151 if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
4152 existing_piece = self[key]
4153 if isinstance(existing_piece, DataFrame):
4154 value = np.tile(value, (len(existing_piece.columns), 1)).T
4155
4156 self._set_item_mgr(key, value)
4157
4158 def _set_value(
4159 self, index: IndexLabel, col, value: Scalar, takeable: bool = False
4160 ) -> None:
4161 """
4162 Put single value at passed column and index.
4163
4164 Parameters
4165 ----------
4166 index : Label
4167 row label
4168 col : Label
4169 column label
4170 value : scalar
        takeable : bool, default False
            Whether to interpret `index`/`col` as positional indexers.
4173 """
4174 try:
4175 if takeable:
4176 icol = col
4177 iindex = cast(int, index)
4178 else:
4179 icol = self.columns.get_loc(col)
4180 iindex = self.index.get_loc(index)
4181 self._mgr.column_setitem(icol, iindex, value, inplace_only=True)
4182 self._clear_item_cache()
4183
4184 except (KeyError, TypeError, ValueError, LossySetitemError):
4185 # get_loc might raise a KeyError for missing labels (falling back
4186 # to (i)loc will do expansion of the index)
4187 # column_setitem will do validation that may raise TypeError,
4188 # ValueError, or LossySetitemError
4189 # set using a non-recursive method & reset the cache
4190 if takeable:
4191 self.iloc[index, col] = value
4192 else:
4193 self.loc[index, col] = value
4194 self._item_cache.pop(col, None)
4195
4196 except InvalidIndexError as ii_err:
4197 # GH48729: Seems like you are trying to assign a value to a
4198 # row when only scalar options are permitted
4199 raise InvalidIndexError(
4200 f"You can only assign a scalar value not a {type(value)}"
4201 ) from ii_err
4202
4203 def _ensure_valid_index(self, value) -> None:
4204 """
        Ensure that, if we don't have an index, we can create one from the
        passed value.
4207 """
4208 # GH5632, make sure that we are a Series convertible
4209 if not len(self.index) and is_list_like(value) and len(value):
4210 if not isinstance(value, DataFrame):
4211 try:
4212 value = Series(value)
4213 except (ValueError, NotImplementedError, TypeError) as err:
4214 raise ValueError(
4215 "Cannot set a frame with no defined index "
4216 "and a value that cannot be converted to a Series"
4217 ) from err
4218
4219 # GH31368 preserve name of index
4220 index_copy = value.index.copy()
4221 if self.index.name is not None:
4222 index_copy.name = self.index.name
4223
4224 self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan)
4225
4226 def _box_col_values(self, values: SingleDataManager, loc: int) -> Series:
4227 """
4228 Provide boxed values for a column.
4229 """
4230 # Lookup in columns so that if e.g. a str datetime was passed
4231 # we attach the Timestamp object as the name.
4232 name = self.columns[loc]
4233 klass = self._constructor_sliced
4234 # We get index=self.index bc values is a SingleDataManager
4235 return klass(values, name=name, fastpath=True).__finalize__(self)
4236
4237 # ----------------------------------------------------------------------
4238 # Lookup Caching
4239
4240 def _clear_item_cache(self) -> None:
4241 self._item_cache.clear()
4242
4243 def _get_item_cache(self, item: Hashable) -> Series:
4244 """Return the cached item, item represents a label indexer."""
4245 if using_copy_on_write():
4246 loc = self.columns.get_loc(item)
4247 return self._ixs(loc, axis=1)
4248
4249 cache = self._item_cache
4250 res = cache.get(item)
4251 if res is None:
4252 # All places that call _get_item_cache have unique columns,
4253 # pending resolution of GH#33047
4254
4255 loc = self.columns.get_loc(item)
4256 res = self._ixs(loc, axis=1)
4257
4258 cache[item] = res
4259
4260 # for a chain
4261 res._is_copy = self._is_copy
4262 return res
4263
4264 def _reset_cacher(self) -> None:
4265 # no-op for DataFrame
4266 pass
4267
4268 def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None:
4269 """
4270 The object has called back to us saying maybe it has changed.
4271 """
4272 loc = self._info_axis.get_loc(item)
4273 arraylike = value._values
4274
4275 old = self._ixs(loc, axis=1)
4276 if old._values is value._values and inplace:
4277 # GH#46149 avoid making unnecessary copies/block-splitting
4278 return
4279
4280 self._mgr.iset(loc, arraylike, inplace=inplace)
4281
4282 # ----------------------------------------------------------------------
4283 # Unsorted
4284
4285 @overload
4286 def query(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> DataFrame:
4287 ...
4288
4289 @overload
4290 def query(self, expr: str, *, inplace: Literal[True], **kwargs) -> None:
4291 ...
4292
4293 @overload
4294 def query(self, expr: str, *, inplace: bool = ..., **kwargs) -> DataFrame | None:
4295 ...
4296
4297 def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | None:
4298 """
4299 Query the columns of a DataFrame with a boolean expression.
4300
4301 Parameters
4302 ----------
4303 expr : str
4304 The query string to evaluate.
4305
4306 You can refer to variables
4307 in the environment by prefixing them with an '@' character like
4308 ``@a + b``.
4309
4310 You can refer to column names that are not valid Python variable names
4311 by surrounding them in backticks. Thus, column names containing spaces
            or punctuation (besides underscores) or starting with digits must be
4313 surrounded by backticks. (For example, a column named "Area (cm^2)" would
4314 be referenced as ```Area (cm^2)```). Column names which are Python keywords
4315 (like "list", "for", "import", etc) cannot be used.
4316
4317 For example, if one of your columns is called ``a a`` and you want
4318 to sum it with ``b``, your query should be ```a a` + b``.
4319
        inplace : bool, default False
4321 Whether to modify the DataFrame rather than creating a new one.
4322 **kwargs
4323 See the documentation for :func:`eval` for complete details
4324 on the keyword arguments accepted by :meth:`DataFrame.query`.
4325
4326 Returns
4327 -------
4328 DataFrame or None
4329 DataFrame resulting from the provided query expression or
4330 None if ``inplace=True``.
4331
4332 See Also
4333 --------
4334 eval : Evaluate a string describing operations on
4335 DataFrame columns.
4336 DataFrame.eval : Evaluate a string describing operations on
4337 DataFrame columns.
4338
4339 Notes
4340 -----
4341 The result of the evaluation of this expression is first passed to
4342 :attr:`DataFrame.loc` and if that fails because of a
4343 multidimensional key (e.g., a DataFrame) then the result will be passed
4344 to :meth:`DataFrame.__getitem__`.
4345
4346 This method uses the top-level :func:`eval` function to
4347 evaluate the passed query.
4348
4349 The :meth:`~pandas.DataFrame.query` method uses a slightly
4350 modified Python syntax by default. For example, the ``&`` and ``|``
4351 (bitwise) operators have the precedence of their boolean cousins,
        :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python;
        however, the semantics are different.
4354
4355 You can change the semantics of the expression by passing the keyword
4356 argument ``parser='python'``. This enforces the same semantics as
4357 evaluation in Python space. Likewise, you can pass ``engine='python'``
4358 to evaluate an expression using Python itself as a backend. This is not
4359 recommended as it is inefficient compared to using ``numexpr`` as the
4360 engine.
4361
4362 The :attr:`DataFrame.index` and
4363 :attr:`DataFrame.columns` attributes of the
4364 :class:`~pandas.DataFrame` instance are placed in the query namespace
4365 by default, which allows you to treat both the index and columns of the
4366 frame as a column in the frame.
4367 The identifier ``index`` is used for the frame index; you can also
4368 use the name of the index to identify it in a query. Please note that
4369 Python keywords may not be used as identifiers.
4370
4371 For further details and examples see the ``query`` documentation in
4372 :ref:`indexing <indexing.query>`.
4373
4374 *Backtick quoted variables*
4375
4376 Backtick quoted variables are parsed as literal Python code and
        are converted internally to a valid Python identifier.
4378 This can lead to the following problems.
4379
4380 During parsing a number of disallowed characters inside the backtick
4381 quoted string are replaced by strings that are allowed as a Python identifier.
4382 These characters include all operators in Python, the space character, the
4383 question mark, the exclamation mark, the dollar sign, and the euro sign.
4384 For other characters that fall outside the ASCII range (U+0001..U+007F)
4385 and those that are not further specified in PEP 3131,
4386 the query parser will raise an error.
        This excludes whitespace other than the space character, as well as
        the hash character (as it is used for comments) and the backtick
        itself (the backtick also cannot be escaped).
4390
4391 In a special case, quotes that make a pair around a backtick can
4392 confuse the parser.
4393 For example, ```it's` > `that's``` will raise an error,
4394 as it forms a quoted string (``'s > `that'``) with a backtick inside.
4395
4396 See also the Python documentation about lexical analysis
4397 (https://docs.python.org/3/reference/lexical_analysis.html)
4398 in combination with the source code in :mod:`pandas.core.computation.parsing`.
4399
4400 Examples
4401 --------
4402 >>> df = pd.DataFrame({'A': range(1, 6),
4403 ... 'B': range(10, 0, -2),
4404 ... 'C C': range(10, 5, -1)})
4405 >>> df
4406 A B C C
4407 0 1 10 10
4408 1 2 8 9
4409 2 3 6 8
4410 3 4 4 7
4411 4 5 2 6
4412 >>> df.query('A > B')
4413 A B C C
4414 4 5 2 6
4415
4416 The previous expression is equivalent to
4417
4418 >>> df[df.A > df.B]
4419 A B C C
4420 4 5 2 6
4421
4422 For columns with spaces in their name, you can use backtick quoting.
4423
4424 >>> df.query('B == `C C`')
4425 A B C C
4426 0 1 10 10
4427
4428 The previous expression is equivalent to
4429
4430 >>> df[df.B == df['C C']]
4431 A B C C
4432 0 1 10 10
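
        Variables in the local scope can be referenced by prefixing them
        with ``@``:

        >>> limit = 3
        >>> df.query('A > @limit')
           A  B  C C
        3  4  4    7
        4  5  2    6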
4433 """
4434 inplace = validate_bool_kwarg(inplace, "inplace")
4435 if not isinstance(expr, str):
4436 msg = f"expr must be a string to be evaluated, {type(expr)} given"
4437 raise ValueError(msg)
4438 kwargs["level"] = kwargs.pop("level", 0) + 1
4439 kwargs["target"] = None
4440 res = self.eval(expr, **kwargs)
4441
4442 try:
4443 result = self.loc[res]
4444 except ValueError:
4445 # when res is multi-dimensional loc raises, but this is sometimes a
4446 # valid query
4447 result = self[res]
4448
4449 if inplace:
4450 self._update_inplace(result)
4451 return None
4452 else:
4453 return result
4454
4455 @overload
4456 def eval(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> Any:
4457 ...
4458
4459 @overload
4460 def eval(self, expr: str, *, inplace: Literal[True], **kwargs) -> None:
4461 ...
4462
4463 def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
4464 """
4465 Evaluate a string describing operations on DataFrame columns.
4466
4467 Operates on columns only, not specific rows or elements. This allows
4468 `eval` to run arbitrary code, which can make you vulnerable to code
4469 injection if you pass user input to this function.
4470
4471 Parameters
4472 ----------
4473 expr : str
4474 The expression string to evaluate.
4475 inplace : bool, default False
4476 If the expression contains an assignment, whether to perform the
4477 operation inplace and mutate the existing DataFrame. Otherwise,
4478 a new DataFrame is returned.
4479 **kwargs
4480 See the documentation for :func:`eval` for complete details
4481 on the keyword arguments accepted by
4482 :meth:`~pandas.DataFrame.query`.
4483
4484 Returns
4485 -------
4486 ndarray, scalar, pandas object, or None
4487 The result of the evaluation or None if ``inplace=True``.
4488
4489 See Also
4490 --------
4491 DataFrame.query : Evaluates a boolean expression to query the columns
4492 of a frame.
4493 DataFrame.assign : Can evaluate an expression or function to create new
4494 values for a column.
4495 eval : Evaluate a Python expression as a string using various
4496 backends.
4497
4498 Notes
4499 -----
4500 For more details see the API documentation for :func:`~eval`.
4501 For detailed examples see :ref:`enhancing performance with eval
4502 <enhancingperf.eval>`.
4503
4504 Examples
4505 --------
4506 >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
4507 >>> df
4508 A B
4509 0 1 10
4510 1 2 8
4511 2 3 6
4512 3 4 4
4513 4 5 2
4514 >>> df.eval('A + B')
4515 0 11
4516 1 10
4517 2 9
4518 3 8
4519 4 7
4520 dtype: int64
4521
4522 Assignment is allowed though by default the original DataFrame is not
4523 modified.
4524
4525 >>> df.eval('C = A + B')
4526 A B C
4527 0 1 10 11
4528 1 2 8 10
4529 2 3 6 9
4530 3 4 4 8
4531 4 5 2 7
4532 >>> df
4533 A B
4534 0 1 10
4535 1 2 8
4536 2 3 6
4537 3 4 4
4538 4 5 2
4539
4540 Multiple columns can be assigned to using multi-line expressions:
4541
4542 >>> df.eval(
4543 ... '''
4544 ... C = A + B
4545 ... D = A - B
4546 ... '''
4547 ... )
4548 A B C D
4549 0 1 10 11 -9
4550 1 2 8 10 -6
4551 2 3 6 9 -3
4552 3 4 4 8 0
4553 4 5 2 7 3
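
        Assignment can also be done in place with ``inplace=True``:

        >>> df.eval('C = A + B', inplace=True)
        >>> df
           A   B   C
        0  1  10  11
        1  2   8  10
        2  3   6   9
        3  4   4   8
        4  5   2   7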
4554 """
4555 from pandas.core.computation.eval import eval as _eval
4556
4557 inplace = validate_bool_kwarg(inplace, "inplace")
4558 kwargs["level"] = kwargs.pop("level", 0) + 1
4559 index_resolvers = self._get_index_resolvers()
4560 column_resolvers = self._get_cleaned_column_resolvers()
4561 resolvers = column_resolvers, index_resolvers
4562 if "target" not in kwargs:
4563 kwargs["target"] = self
4564 kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers
4565
4566 return _eval(expr, inplace=inplace, **kwargs)
4567
4568 def select_dtypes(self, include=None, exclude=None) -> DataFrame:
4569 """
4570 Return a subset of the DataFrame's columns based on the column dtypes.
4571
4572 Parameters
4573 ----------
4574 include, exclude : scalar or list-like
4575 A selection of dtypes or strings to be included/excluded. At least
4576 one of these parameters must be supplied.
4577
4578 Returns
4579 -------
4580 DataFrame
4581 The subset of the frame including the dtypes in ``include`` and
4582 excluding the dtypes in ``exclude``.
4583
4584 Raises
4585 ------
4586 ValueError
4587 * If both of ``include`` and ``exclude`` are empty
4588 * If ``include`` and ``exclude`` have overlapping elements
4589 * If any kind of string dtype is passed in.
4590
4591 See Also
4592 --------
4593 DataFrame.dtypes: Return Series with the data type of each column.
4594
4595 Notes
4596 -----
4597 * To select all *numeric* types, use ``np.number`` or ``'number'``
4598 * To select strings you must use the ``object`` dtype, but note that
4599 this will return *all* object dtype columns
4600 * See the `numpy dtype hierarchy
4601 <https://numpy.org/doc/stable/reference/arrays.scalars.html>`__
4602 * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
4603 ``'datetime64'``
4604 * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
4605 ``'timedelta64'``
        * To select pandas categorical dtypes, use ``'category'``
        * To select pandas datetimetz dtypes, use ``'datetimetz'``
          or ``'datetime64[ns, tz]'``
4609
4610 Examples
4611 --------
4612 >>> df = pd.DataFrame({'a': [1, 2] * 3,
4613 ... 'b': [True, False] * 3,
4614 ... 'c': [1.0, 2.0] * 3})
4615 >>> df
4616 a b c
4617 0 1 True 1.0
4618 1 2 False 2.0
4619 2 1 True 1.0
4620 3 2 False 2.0
4621 4 1 True 1.0
4622 5 2 False 2.0
4623
4624 >>> df.select_dtypes(include='bool')
4625 b
4626 0 True
4627 1 False
4628 2 True
4629 3 False
4630 4 True
4631 5 False
4632
4633 >>> df.select_dtypes(include=['float64'])
4634 c
4635 0 1.0
4636 1 2.0
4637 2 1.0
4638 3 2.0
4639 4 1.0
4640 5 2.0
4641
4642 >>> df.select_dtypes(exclude=['int64'])
4643 b c
4644 0 True 1.0
4645 1 False 2.0
4646 2 True 1.0
4647 3 False 2.0
4648 4 True 1.0
4649 5 False 2.0
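
        To select all numeric columns at once, pass ``np.number``:

        >>> df.select_dtypes(include=np.number)
           a    c
        0  1  1.0
        1  2  2.0
        2  1  1.0
        3  2  2.0
        4  1  1.0
        5  2  2.0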
4650 """
4651 if not is_list_like(include):
4652 include = (include,) if include is not None else ()
4653 if not is_list_like(exclude):
4654 exclude = (exclude,) if exclude is not None else ()
4655
4656 selection = (frozenset(include), frozenset(exclude))
4657
4658 if not any(selection):
4659 raise ValueError("at least one of include or exclude must be nonempty")
4660
4661 # convert the myriad valid dtypes object to a single representation
4662 def check_int_infer_dtype(dtypes):
4663 converted_dtypes: list[type] = []
4664 for dtype in dtypes:
                # NumPy maps int to different types (int32, int64) on Windows and Linux
4666 # see https://github.com/numpy/numpy/issues/9464
4667 if (isinstance(dtype, str) and dtype == "int") or (dtype is int):
4668 converted_dtypes.append(np.int32)
4669 converted_dtypes.append(np.int64)
4670 elif dtype == "float" or dtype is float:
4671 # GH#42452 : np.dtype("float") coerces to np.float64 from Numpy 1.20
4672 converted_dtypes.extend([np.float64, np.float32])
4673 else:
4674 converted_dtypes.append(infer_dtype_from_object(dtype))
4675 return frozenset(converted_dtypes)
4676
4677 include = check_int_infer_dtype(include)
4678 exclude = check_int_infer_dtype(exclude)
4679
4680 for dtypes in (include, exclude):
4681 invalidate_string_dtypes(dtypes)
4682
4683 # can't both include AND exclude!
4684 if not include.isdisjoint(exclude):
4685 raise ValueError(f"include and exclude overlap on {(include & exclude)}")
4686
4687 def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool:
4688 # GH 46870: BooleanDtype._is_numeric == True but should be excluded
4689 dtype = dtype if not isinstance(dtype, ArrowDtype) else dtype.numpy_dtype
4690 return issubclass(dtype.type, tuple(dtypes_set)) or (
4691 np.number in dtypes_set
4692 and getattr(dtype, "_is_numeric", False)
4693 and not is_bool_dtype(dtype)
4694 )
4695
4696 def predicate(arr: ArrayLike) -> bool:
4697 dtype = arr.dtype
4698 if include:
4699 if not dtype_predicate(dtype, include):
4700 return False
4701
4702 if exclude:
4703 if dtype_predicate(dtype, exclude):
4704 return False
4705
4706 return True
4707
4708 mgr = self._mgr._get_data_subset(predicate).copy(deep=None)
4709 return type(self)(mgr).__finalize__(self)
4710
4711 def insert(
4712 self,
4713 loc: int,
4714 column: Hashable,
4715 value: Scalar | AnyArrayLike,
4716 allow_duplicates: bool | lib.NoDefault = lib.no_default,
4717 ) -> None:
4718 """
4719 Insert column into DataFrame at specified location.
4720
4721 Raises a ValueError if `column` is already contained in the DataFrame,
4722 unless `allow_duplicates` is set to True.
4723
4724 Parameters
4725 ----------
4726 loc : int
            Insertion index. Must satisfy 0 <= loc <= len(columns).
4728 column : str, number, or hashable object
4729 Label of the inserted column.
4730 value : Scalar, Series, or array-like
4731 allow_duplicates : bool, optional, default lib.no_default
4732
4733 See Also
4734 --------
4735 Index.insert : Insert new item by index.
4736
4737 Examples
4738 --------
4739 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
4740 >>> df
4741 col1 col2
4742 0 1 3
4743 1 2 4
4744 >>> df.insert(1, "newcol", [99, 99])
4745 >>> df
4746 col1 newcol col2
4747 0 1 99 3
4748 1 2 99 4
4749 >>> df.insert(0, "col1", [100, 100], allow_duplicates=True)
4750 >>> df
4751 col1 col1 newcol col2
4752 0 100 1 99 3
4753 1 100 2 99 4
4754
        Notice that pandas uses index alignment when `value` is a `Series`:
4756
4757 >>> df.insert(0, "col0", pd.Series([5, 6], index=[1, 2]))
4758 >>> df
4759 col0 col1 col1 newcol col2
4760 0 NaN 100 1 99 3
4761 1 5.0 100 2 99 4
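
        A scalar `value` is broadcast to fill the entire column:

        >>> df.insert(len(df.columns), "flag", True)
        >>> df
           col0  col1  col1  newcol  col2  flag
        0   NaN   100     1      99     3  True
        1   5.0   100     2      99     4  True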
4762 """
4763 if allow_duplicates is lib.no_default:
4764 allow_duplicates = False
4765 if allow_duplicates and not self.flags.allows_duplicate_labels:
4766 raise ValueError(
4767 "Cannot specify 'allow_duplicates=True' when "
4768 "'self.flags.allows_duplicate_labels' is False."
4769 )
4770 if not allow_duplicates and column in self.columns:
4771 # Should this be a different kind of error??
4772 raise ValueError(f"cannot insert {column}, already exists")
4773 if not isinstance(loc, int):
4774 raise TypeError("loc must be int")
4775
4776 value = self._sanitize_column(value)
4777 self._mgr.insert(loc, column, value)
4778
4779 def assign(self, **kwargs) -> DataFrame:
4780 r"""
4781 Assign new columns to a DataFrame.
4782
4783 Returns a new object with all original columns in addition to new ones.
4784 Existing columns that are re-assigned will be overwritten.
4785
4786 Parameters
4787 ----------
4788 **kwargs : dict of {str: callable or Series}
4789 The column names are keywords. If the values are
4790 callable, they are computed on the DataFrame and
            assigned to the new columns. The callable must not change the
            input DataFrame (though pandas doesn't check this).
            If the values are not callable (e.g. a Series, scalar, or array),
            they are simply assigned.
4795
4796 Returns
4797 -------
4798 DataFrame
4799 A new DataFrame with the new columns in addition to
4800 all the existing columns.
4801
4802 Notes
4803 -----
4804 Assigning multiple columns within the same ``assign`` is possible.
4805 Later items in '\*\*kwargs' may refer to newly created or modified
4806 columns in 'df'; items are computed and assigned into 'df' in order.
4807
4808 Examples
4809 --------
4810 >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]},
4811 ... index=['Portland', 'Berkeley'])
4812 >>> df
4813 temp_c
4814 Portland 17.0
4815 Berkeley 25.0
4816
4817 Where the value is a callable, evaluated on `df`:
4818
4819 >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
4820 temp_c temp_f
4821 Portland 17.0 62.6
4822 Berkeley 25.0 77.0
4823
4824 Alternatively, the same behavior can be achieved by directly
4825 referencing an existing Series or sequence:
4826
4827 >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
4828 temp_c temp_f
4829 Portland 17.0 62.6
4830 Berkeley 25.0 77.0
4831
4832 You can create multiple columns within the same assign where one
4833 of the columns depends on another one defined within the same assign:
4834
4835 >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
4836 ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
4837 temp_c temp_f temp_k
4838 Portland 17.0 62.6 290.15
4839 Berkeley 25.0 77.0 298.15
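
        Re-assigning an existing column overwrites its values:

        >>> df.assign(temp_c=df['temp_c'] + 1.0)
                  temp_c
        Portland    18.0
        Berkeley    26.0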
4840 """
4841 data = self.copy(deep=None)
4842
4843 for k, v in kwargs.items():
4844 data[k] = com.apply_if_callable(v, data)
4845 return data
4846
4847 def _sanitize_column(self, value) -> ArrayLike:
4848 """
4849 Ensures new columns (which go into the BlockManager as new blocks) are
4850 always copied and converted into an array.
4851
4852 Parameters
4853 ----------
4854 value : scalar, Series, or array-like
4855
4856 Returns
4857 -------
4858 numpy.ndarray or ExtensionArray
4859 """
4860 self._ensure_valid_index(value)
4861
4862 # We can get there through isetitem with a DataFrame
4863 # or through loc single_block_path
4864 if isinstance(value, DataFrame):
4865 return _reindex_for_setitem(value, self.index)
4866 elif is_dict_like(value):
4867 return _reindex_for_setitem(Series(value), self.index)
4868
4869 if is_list_like(value):
4870 com.require_length_match(value, self.index)
4871 return sanitize_array(value, self.index, copy=True, allow_2d=True)
4872
4873 @property
4874 def _series(self):
4875 return {
4876 item: Series(
4877 self._mgr.iget(idx), index=self.index, name=item, fastpath=True
4878 )
4879 for idx, item in enumerate(self.columns)
4880 }
4881
4882 # ----------------------------------------------------------------------
4883 # Reindexing and alignment
4884
4885 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy):
4886 frame = self
4887
4888 columns = axes["columns"]
4889 if columns is not None:
4890 frame = frame._reindex_columns(
4891 columns, method, copy, level, fill_value, limit, tolerance
4892 )
4893
4894 index = axes["index"]
4895 if index is not None:
4896 frame = frame._reindex_index(
4897 index, method, copy, level, fill_value, limit, tolerance
4898 )
4899
4900 return frame
4901
4902 def _reindex_index(
4903 self,
4904 new_index,
4905 method,
4906 copy: bool,
4907 level: Level,
4908 fill_value=np.nan,
4909 limit=None,
4910 tolerance=None,
4911 ):
4912 new_index, indexer = self.index.reindex(
4913 new_index, method=method, level=level, limit=limit, tolerance=tolerance
4914 )
4915 return self._reindex_with_indexers(
4916 {0: [new_index, indexer]},
4917 copy=copy,
4918 fill_value=fill_value,
4919 allow_dups=False,
4920 )
4921
4922 def _reindex_columns(
4923 self,
4924 new_columns,
4925 method,
4926 copy: bool,
4927 level: Level,
4928 fill_value=None,
4929 limit=None,
4930 tolerance=None,
4931 ):
4932 new_columns, indexer = self.columns.reindex(
4933 new_columns, method=method, level=level, limit=limit, tolerance=tolerance
4934 )
4935 return self._reindex_with_indexers(
4936 {1: [new_columns, indexer]},
4937 copy=copy,
4938 fill_value=fill_value,
4939 allow_dups=False,
4940 )
4941
4942 def _reindex_multi(
4943 self, axes: dict[str, Index], copy: bool, fill_value
4944 ) -> DataFrame:
4945 """
4946 We are guaranteed non-Nones in the axes.
4947 """
4948
4949 new_index, row_indexer = self.index.reindex(axes["index"])
4950 new_columns, col_indexer = self.columns.reindex(axes["columns"])
4951
4952 if row_indexer is not None and col_indexer is not None:
4953 # Fastpath. By doing two 'take's at once we avoid making an
4954 # unnecessary copy.
4955 # We only get here with `not self._is_mixed_type`, which (almost)
4956 # ensures that self.values is cheap. It may be worth making this
4957 # condition more specific.
4958 indexer = row_indexer, col_indexer
4959 new_values = take_2d_multi(self.values, indexer, fill_value=fill_value)
4960 return self._constructor(
4961 new_values, index=new_index, columns=new_columns, copy=False
4962 )
4963 else:
4964 return self._reindex_with_indexers(
4965 {0: [new_index, row_indexer], 1: [new_columns, col_indexer]},
4966 copy=copy,
4967 fill_value=fill_value,
4968 )
4969
4970 @doc(NDFrame.align, **_shared_doc_kwargs)
4971 def align(
4972 self,
4973 other: DataFrame,
4974 join: AlignJoin = "outer",
4975 axis: Axis | None = None,
4976 level: Level = None,
4977 copy: bool | None = None,
4978 fill_value=None,
4979 method: FillnaOptions | None = None,
4980 limit: int | None = None,
4981 fill_axis: Axis = 0,
4982 broadcast_axis: Axis | None = None,
4983 ) -> DataFrame:
4984 return super().align(
4985 other,
4986 join=join,
4987 axis=axis,
4988 level=level,
4989 copy=copy,
4990 fill_value=fill_value,
4991 method=method,
4992 limit=limit,
4993 fill_axis=fill_axis,
4994 broadcast_axis=broadcast_axis,
4995 )
4996
4997 @Appender(
4998 """
4999 Examples
5000 --------
5001 >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
5002
5003 Change the row labels.
5004
5005 >>> df.set_axis(['a', 'b', 'c'], axis='index')
5006 A B
5007 a 1 4
5008 b 2 5
5009 c 3 6
5010
5011 Change the column labels.
5012
5013 >>> df.set_axis(['I', 'II'], axis='columns')
5014 I II
5015 0 1 4
5016 1 2 5
5017 2 3 6
5018 """
5019 )
5020 @Substitution(
5021 **_shared_doc_kwargs,
5022 extended_summary_sub=" column or",
5023 axis_description_sub=", and 1 identifies the columns",
5024 see_also_sub=" or columns",
5025 )
5026 @Appender(NDFrame.set_axis.__doc__)
5027 def set_axis(
5028 self,
5029 labels,
5030 *,
5031 axis: Axis = 0,
5032 copy: bool | None = None,
5033 ) -> DataFrame:
5034 return super().set_axis(labels, axis=axis, copy=copy)
5035
5036 @doc(
5037 NDFrame.reindex,
5038 klass=_shared_doc_kwargs["klass"],
5039 optional_reindex=_shared_doc_kwargs["optional_reindex"],
5040 )
5041 def reindex( # type: ignore[override]
5042 self,
5043 labels=None,
5044 *,
5045 index=None,
5046 columns=None,
5047 axis: Axis | None = None,
5048 method: str | None = None,
5049 copy: bool | None = None,
5050 level: Level | None = None,
5051 fill_value: Scalar | None = np.nan,
5052 limit: int | None = None,
5053 tolerance=None,
5054 ) -> DataFrame:
5055 return super().reindex(
5056 labels=labels,
5057 index=index,
5058 columns=columns,
5059 axis=axis,
5060 method=method,
5061 copy=copy,
5062 level=level,
5063 fill_value=fill_value,
5064 limit=limit,
5065 tolerance=tolerance,
5066 )
5067
5068 @overload
5069 def drop(
5070 self,
5071 labels: IndexLabel = ...,
5072 *,
5073 axis: Axis = ...,
5074 index: IndexLabel = ...,
5075 columns: IndexLabel = ...,
5076 level: Level = ...,
5077 inplace: Literal[True],
5078 errors: IgnoreRaise = ...,
5079 ) -> None:
5080 ...
5081
5082 @overload
5083 def drop(
5084 self,
5085 labels: IndexLabel = ...,
5086 *,
5087 axis: Axis = ...,
5088 index: IndexLabel = ...,
5089 columns: IndexLabel = ...,
5090 level: Level = ...,
5091 inplace: Literal[False] = ...,
5092 errors: IgnoreRaise = ...,
5093 ) -> DataFrame:
5094 ...
5095
5096 @overload
5097 def drop(
5098 self,
5099 labels: IndexLabel = ...,
5100 *,
5101 axis: Axis = ...,
5102 index: IndexLabel = ...,
5103 columns: IndexLabel = ...,
5104 level: Level = ...,
5105 inplace: bool = ...,
5106 errors: IgnoreRaise = ...,
5107 ) -> DataFrame | None:
5108 ...
5109
5110 def drop(
5111 self,
5112 labels: IndexLabel = None,
5113 *,
5114 axis: Axis = 0,
5115 index: IndexLabel = None,
5116 columns: IndexLabel = None,
5117 level: Level = None,
5118 inplace: bool = False,
5119 errors: IgnoreRaise = "raise",
5120 ) -> DataFrame | None:
5121 """
5122 Drop specified labels from rows or columns.
5123
5124 Remove rows or columns by specifying label names and corresponding
5125 axis, or by specifying directly index or column names. When using a
5126 multi-index, labels on different levels can be removed by specifying
5127 the level. See the :ref:`user guide <advanced.shown_levels>`
5128 for more information about the now unused levels.
5129
5130 Parameters
5131 ----------
5132 labels : single label or list-like
5133 Index or column labels to drop. A tuple will be used as a single
5134 label and not treated as a list-like.
5135 axis : {0 or 'index', 1 or 'columns'}, default 0
5136 Whether to drop labels from the index (0 or 'index') or
5137 columns (1 or 'columns').
5138 index : single label or list-like
5139 Alternative to specifying axis (``labels, axis=0``
5140 is equivalent to ``index=labels``).
5141 columns : single label or list-like
5142 Alternative to specifying axis (``labels, axis=1``
5143 is equivalent to ``columns=labels``).
5144 level : int or level name, optional
5145 For MultiIndex, level from which the labels will be removed.
5146 inplace : bool, default False
5147 If False, return a copy. Otherwise, do operation
5148 inplace and return None.
5149 errors : {'ignore', 'raise'}, default 'raise'
5150 If 'ignore', suppress error and only existing labels are
5151 dropped.
5152
5153 Returns
5154 -------
5155 DataFrame or None
5156 DataFrame without the removed index or column labels or
5157 None if ``inplace=True``.
5158
5159 Raises
5160 ------
5161 KeyError
5162 If any of the labels is not found in the selected axis.
5163
5164 See Also
5165 --------
5166 DataFrame.loc : Label-location based indexer for selection by label.
5167 DataFrame.dropna : Return DataFrame with labels on given axis omitted
5168 where (all or any) data are missing.
5169 DataFrame.drop_duplicates : Return DataFrame with duplicate rows
5170 removed, optionally only considering certain columns.
5171 Series.drop : Return Series with specified index labels removed.
5172
5173 Examples
5174 --------
5175 >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
5176 ... columns=['A', 'B', 'C', 'D'])
5177 >>> df
5178 A B C D
5179 0 0 1 2 3
5180 1 4 5 6 7
5181 2 8 9 10 11
5182
5183 Drop columns
5184
5185 >>> df.drop(['B', 'C'], axis=1)
5186 A D
5187 0 0 3
5188 1 4 7
5189 2 8 11
5190
5191 >>> df.drop(columns=['B', 'C'])
5192 A D
5193 0 0 3
5194 1 4 7
5195 2 8 11
5196
5197 Drop a row by index
5198
5199 >>> df.drop([0, 1])
5200 A B C D
5201 2 8 9 10 11
5202
5203 Drop columns and/or rows of MultiIndex DataFrame
5204
5205 >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
5206 ... ['speed', 'weight', 'length']],
5207 ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
5208 ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
5209 >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
5210 ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
5211 ... [250, 150], [1.5, 0.8], [320, 250],
5212 ... [1, 0.8], [0.3, 0.2]])
5213 >>> df
5214 big small
5215 lama speed 45.0 30.0
5216 weight 200.0 100.0
5217 length 1.5 1.0
5218 cow speed 30.0 20.0
5219 weight 250.0 150.0
5220 length 1.5 0.8
5221 falcon speed 320.0 250.0
5222 weight 1.0 0.8
5223 length 0.3 0.2
5224
5225 Drop a specific index combination from the MultiIndex
5226 DataFrame, i.e., drop the combination ``'falcon'`` and
5227 ``'weight'``, which deletes only the corresponding row
5228
5229 >>> df.drop(index=('falcon', 'weight'))
5230 big small
5231 lama speed 45.0 30.0
5232 weight 200.0 100.0
5233 length 1.5 1.0
5234 cow speed 30.0 20.0
5235 weight 250.0 150.0
5236 length 1.5 0.8
5237 falcon speed 320.0 250.0
5238 length 0.3 0.2
5239
5240 >>> df.drop(index='cow', columns='small')
5241 big
5242 lama speed 45.0
5243 weight 200.0
5244 length 1.5
5245 falcon speed 320.0
5246 weight 1.0
5247 length 0.3
5248
5249 >>> df.drop(index='length', level=1)
5250 big small
5251 lama speed 45.0 30.0
5252 weight 200.0 100.0
5253 cow speed 30.0 20.0
5254 weight 250.0 150.0
5255 falcon speed 320.0 250.0
5256 weight 1.0 0.8
5257 """
5258 return super().drop(
5259 labels=labels,
5260 axis=axis,
5261 index=index,
5262 columns=columns,
5263 level=level,
5264 inplace=inplace,
5265 errors=errors,
5266 )
5267
5268 @overload
5269 def rename(
5270 self,
5271 mapper: Renamer | None = ...,
5272 *,
5273 index: Renamer | None = ...,
5274 columns: Renamer | None = ...,
5275 axis: Axis | None = ...,
5276 copy: bool | None = ...,
5277 inplace: Literal[True],
5278 level: Level = ...,
5279 errors: IgnoreRaise = ...,
5280 ) -> None:
5281 ...
5282
5283 @overload
5284 def rename(
5285 self,
5286 mapper: Renamer | None = ...,
5287 *,
5288 index: Renamer | None = ...,
5289 columns: Renamer | None = ...,
5290 axis: Axis | None = ...,
5291 copy: bool | None = ...,
5292 inplace: Literal[False] = ...,
5293 level: Level = ...,
5294 errors: IgnoreRaise = ...,
5295 ) -> DataFrame:
5296 ...
5297
5298 @overload
5299 def rename(
5300 self,
5301 mapper: Renamer | None = ...,
5302 *,
5303 index: Renamer | None = ...,
5304 columns: Renamer | None = ...,
5305 axis: Axis | None = ...,
5306 copy: bool | None = ...,
5307 inplace: bool = ...,
5308 level: Level = ...,
5309 errors: IgnoreRaise = ...,
5310 ) -> DataFrame | None:
5311 ...
5312
5313 def rename(
5314 self,
5315 mapper: Renamer | None = None,
5316 *,
5317 index: Renamer | None = None,
5318 columns: Renamer | None = None,
5319 axis: Axis | None = None,
5320 copy: bool | None = None,
5321 inplace: bool = False,
5322 level: Level = None,
5323 errors: IgnoreRaise = "ignore",
5324 ) -> DataFrame | None:
5325 """
5326 Rename columns or index labels.
5327
5328 Function / dict values must be unique (1-to-1). Labels not contained in
5329 a dict / Series will be left as-is. Extra labels listed don't throw an
5330 error.
5331
5332 See the :ref:`user guide <basics.rename>` for more.
5333
5334 Parameters
5335 ----------
5336 mapper : dict-like or function
5337 Dict-like or function transformations to apply to
5338 that axis' values. Use either ``mapper`` and ``axis`` to
5339 specify the axis to target with ``mapper``, or ``index`` and
5340 ``columns``.
5341 index : dict-like or function
5342 Alternative to specifying axis (``mapper, axis=0``
5343 is equivalent to ``index=mapper``).
5344 columns : dict-like or function
5345 Alternative to specifying axis (``mapper, axis=1``
5346 is equivalent to ``columns=mapper``).
5347 axis : {0 or 'index', 1 or 'columns'}, default 0
5348 Axis to target with ``mapper``. Can be either the axis name
5349 ('index', 'columns') or number (0, 1). The default is 'index'.
5350 copy : bool, default True
5351 Also copy underlying data.
5352 inplace : bool, default False
5353 Whether to modify the DataFrame rather than creating a new one.
            If True, the value of `copy` is ignored.
5355 level : int or level name, default None
5356 In case of a MultiIndex, only rename labels in the specified
5357 level.
5358 errors : {'ignore', 'raise'}, default 'ignore'
5359 If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,
5360 or `columns` contains labels that are not present in the Index
5361 being transformed.
5362 If 'ignore', existing keys will be renamed and extra keys will be
5363 ignored.
5364
5365 Returns
5366 -------
5367 DataFrame or None
5368 DataFrame with the renamed axis labels or None if ``inplace=True``.
5369
5370 Raises
5371 ------
5372 KeyError
5373 If any of the labels is not found in the selected axis and
5374 "errors='raise'".
5375
5376 See Also
5377 --------
5378 DataFrame.rename_axis : Set the name of the axis.
5379
5380 Examples
5381 --------
5382 ``DataFrame.rename`` supports two calling conventions
5383
5384 * ``(index=index_mapper, columns=columns_mapper, ...)``
5385 * ``(mapper, axis={'index', 'columns'}, ...)``
5386
5387 We *highly* recommend using keyword arguments to clarify your
5388 intent.
5389
5390 Rename columns using a mapping:
5391
5392 >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
5393 >>> df.rename(columns={"A": "a", "B": "c"})
5394 a c
5395 0 1 4
5396 1 2 5
5397 2 3 6
5398
5399 Rename index using a mapping:
5400
5401 >>> df.rename(index={0: "x", 1: "y", 2: "z"})
5402 A B
5403 x 1 4
5404 y 2 5
5405 z 3 6
5406
5407 Cast index labels to a different type:
5408
5409 >>> df.index
5410 RangeIndex(start=0, stop=3, step=1)
5411 >>> df.rename(index=str).index
5412 Index(['0', '1', '2'], dtype='object')
5413
5414 >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise")
5415 Traceback (most recent call last):
5416 KeyError: ['C'] not found in axis
5417
5418 Using axis-style parameters:
5419
5420 >>> df.rename(str.lower, axis='columns')
5421 a b
5422 0 1 4
5423 1 2 5
5424 2 3 6
5425
5426 >>> df.rename({1: 2, 2: 4}, axis='index')
5427 A B
5428 0 1 4
5429 2 2 5
5430 4 3 6
5431 """
5432 return super()._rename(
5433 mapper=mapper,
5434 index=index,
5435 columns=columns,
5436 axis=axis,
5437 copy=copy,
5438 inplace=inplace,
5439 level=level,
5440 errors=errors,
5441 )
5442
5443 @overload
5444 def fillna(
5445 self,
5446 value: Hashable | Mapping | Series | DataFrame = ...,
5447 *,
5448 method: FillnaOptions | None = ...,
5449 axis: Axis | None = ...,
5450 inplace: Literal[False] = ...,
5451 limit: int | None = ...,
5452 downcast: dict | None = ...,
5453 ) -> DataFrame:
5454 ...
5455
5456 @overload
5457 def fillna(
5458 self,
5459 value: Hashable | Mapping | Series | DataFrame = ...,
5460 *,
5461 method: FillnaOptions | None = ...,
5462 axis: Axis | None = ...,
5463 inplace: Literal[True],
5464 limit: int | None = ...,
5465 downcast: dict | None = ...,
5466 ) -> None:
5467 ...
5468
5469 @overload
5470 def fillna(
5471 self,
5472 value: Hashable | Mapping | Series | DataFrame = ...,
5473 *,
5474 method: FillnaOptions | None = ...,
5475 axis: Axis | None = ...,
5476 inplace: bool = ...,
5477 limit: int | None = ...,
5478 downcast: dict | None = ...,
5479 ) -> DataFrame | None:
5480 ...
5481
5482 @doc(NDFrame.fillna, **_shared_doc_kwargs)
5483 def fillna(
5484 self,
5485 value: Hashable | Mapping | Series | DataFrame = None,
5486 *,
5487 method: FillnaOptions | None = None,
5488 axis: Axis | None = None,
5489 inplace: bool = False,
5490 limit: int | None = None,
5491 downcast: dict | None = None,
5492 ) -> DataFrame | None:
5493 return super().fillna(
5494 value=value,
5495 method=method,
5496 axis=axis,
5497 inplace=inplace,
5498 limit=limit,
5499 downcast=downcast,
5500 )
5501
5502 def pop(self, item: Hashable) -> Series:
5503 """
5504 Return item and drop from frame. Raise KeyError if not found.
5505
5506 Parameters
5507 ----------
5508 item : label
5509 Label of column to be popped.
5510
5511 Returns
5512 -------
5513 Series
5514
5515 Examples
5516 --------
5517 >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
5518 ... ('parrot', 'bird', 24.0),
5519 ... ('lion', 'mammal', 80.5),
5520 ... ('monkey', 'mammal', np.nan)],
5521 ... columns=('name', 'class', 'max_speed'))
5522 >>> df
5523 name class max_speed
5524 0 falcon bird 389.0
5525 1 parrot bird 24.0
5526 2 lion mammal 80.5
5527 3 monkey mammal NaN
5528
5529 >>> df.pop('class')
5530 0 bird
5531 1 bird
5532 2 mammal
5533 3 mammal
5534 Name: class, dtype: object
5535
5536 >>> df
5537 name max_speed
5538 0 falcon 389.0
5539 1 parrot 24.0
5540 2 lion 80.5
5541 3 monkey NaN
5542 """
5543 return super().pop(item=item)
5544
5545 @overload
5546 def replace(
5547 self,
5548 to_replace=...,
5549 value=...,
5550 *,
5551 inplace: Literal[False] = ...,
5552 limit: int | None = ...,
5553 regex: bool = ...,
5554 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
5555 ) -> DataFrame:
5556 ...
5557
5558 @overload
5559 def replace(
5560 self,
5561 to_replace=...,
5562 value=...,
5563 *,
5564 inplace: Literal[True],
5565 limit: int | None = ...,
5566 regex: bool = ...,
5567 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
5568 ) -> None:
5569 ...
5570
5571 @doc(NDFrame.replace, **_shared_doc_kwargs)
5572 def replace(
5573 self,
5574 to_replace=None,
5575 value=lib.no_default,
5576 *,
5577 inplace: bool = False,
5578 limit: int | None = None,
5579 regex: bool = False,
5580 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = lib.no_default,
5581 ) -> DataFrame | None:
5582 return super().replace(
5583 to_replace=to_replace,
5584 value=value,
5585 inplace=inplace,
5586 limit=limit,
5587 regex=regex,
5588 method=method,
5589 )
5590
5591 def _replace_columnwise(
5592 self, mapping: dict[Hashable, tuple[Any, Any]], inplace: bool, regex
5593 ):
5594 """
5595 Dispatch to Series.replace column-wise.
5596
5597 Parameters
5598 ----------
5599 mapping : dict
5600 of the form {col: (target, value)}
5601 inplace : bool
5602 regex : bool or same types as `to_replace` in DataFrame.replace
5603
5604 Returns
5605 -------
5606 DataFrame or None
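
        Notes
        -----
        Illustrative sketch of the expected ``mapping`` shape (hypothetical
        frame; this is an internal helper, not public API)::

            # replace 1 with 10, but only in column "a"
            df._replace_columnwise({"a": (1, 10)}, inplace=False, regex=False)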
5607 """
5608 # Operate column-wise
5609 res = self if inplace else self.copy(deep=None)
5610 ax = self.columns
5611
5612 for i, ax_value in enumerate(ax):
5613 if ax_value in mapping:
5614 ser = self.iloc[:, i]
5615
5616 target, value = mapping[ax_value]
5617 newobj = ser.replace(target, value, regex=regex)
5618
5619 res._iset_item(i, newobj)
5620
5621 if inplace:
5622 return
5623 return res.__finalize__(self)
5624
5625 @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"])
5626 def shift(
5627 self,
5628 periods: int = 1,
5629 freq: Frequency | None = None,
5630 axis: Axis = 0,
5631 fill_value: Hashable = lib.no_default,
5632 ) -> DataFrame:
5633 axis = self._get_axis_number(axis)
5634
5635 ncols = len(self.columns)
5636 if (
5637 axis == 1
5638 and periods != 0
5639 and freq is None
5640 and fill_value is lib.no_default
5641 and ncols > 0
5642 ):
5643 # We will infer fill_value to match the closest column
5644
5645 # Use a column that we know is valid for our column's dtype GH#38434
5646 label = self.columns[0]
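
            # e.g. ``df.shift(1, axis=1)`` moves every column one slot to the
            # right; each vacated slot is filled with an all-NA column made by
            # over-shifting a real column (``shift(len(self))``), so the
            # filler's dtype matches what shifting that column would produce.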
5647
5648 if periods > 0:
5649 result = self.iloc[:, :-periods]
5650 for col in range(min(ncols, abs(periods))):
5651 # TODO(EA2D): doing this in a loop unnecessary with 2D EAs
5652 # Define filler inside loop so we get a copy
5653 filler = self.iloc[:, 0].shift(len(self))
5654 result.insert(0, label, filler, allow_duplicates=True)
5655 else:
5656 result = self.iloc[:, -periods:]
5657 for col in range(min(ncols, abs(periods))):
5658 # Define filler inside loop so we get a copy
5659 filler = self.iloc[:, -1].shift(len(self))
5660 result.insert(
5661 len(result.columns), label, filler, allow_duplicates=True
5662 )
5663
5664 result.columns = self.columns.copy()
5665 return result
5666 elif (
5667 axis == 1
5668 and periods != 0
5669 and fill_value is not lib.no_default
5670 and ncols > 0
5671 ):
5672 arrays = self._mgr.arrays
5673 if len(arrays) > 1 or (
5674 # If we only have one block and we know that we can't
5675 # keep the same dtype (i.e. the _can_hold_element check)
5676 # then we can go through the reindex_indexer path
5677 # (and avoid casting logic in the Block method).
5678 not can_hold_element(arrays[0], fill_value)
5679 ):
5680 # GH#35488 we need to watch out for multi-block cases
5681 # We only get here with fill_value not-lib.no_default
5682 nper = abs(periods)
5683 nper = min(nper, ncols)
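                # Build a positional indexer over the columns in which -1
                # marks the vacated slots; reindex_indexer fills those with
                # ``fill_value`` in a single blockwise pass.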
5684 if periods > 0:
5685 indexer = np.array(
5686 [-1] * nper + list(range(ncols - periods)), dtype=np.intp
5687 )
5688 else:
5689 indexer = np.array(
5690 list(range(nper, ncols)) + [-1] * nper, dtype=np.intp
5691 )
5692 mgr = self._mgr.reindex_indexer(
5693 self.columns,
5694 indexer,
5695 axis=0,
5696 fill_value=fill_value,
5697 allow_dups=True,
5698 )
5699 res_df = self._constructor(mgr)
5700 return res_df.__finalize__(self, method="shift")
5701
5702 return super().shift(
5703 periods=periods, freq=freq, axis=axis, fill_value=fill_value
5704 )
5705
5706 @overload
5707 def set_index(
5708 self,
5709 keys,
5710 *,
5711 drop: bool = ...,
5712 append: bool = ...,
5713 inplace: Literal[False] = ...,
5714 verify_integrity: bool = ...,
5715 ) -> DataFrame:
5716 ...
5717
5718 @overload
5719 def set_index(
5720 self,
5721 keys,
5722 *,
5723 drop: bool = ...,
5724 append: bool = ...,
5725 inplace: Literal[True],
5726 verify_integrity: bool = ...,
5727 ) -> None:
5728 ...
5729
5730 def set_index(
5731 self,
5732 keys,
5733 *,
5734 drop: bool = True,
5735 append: bool = False,
5736 inplace: bool = False,
5737 verify_integrity: bool = False,
5738 ) -> DataFrame | None:
5739 """
5740 Set the DataFrame index using existing columns.
5741
5742 Set the DataFrame index (row labels) using one or more existing
5743 columns or arrays (of the correct length). The index can replace the
5744 existing index or expand on it.
5745
5746 Parameters
5747 ----------
5748 keys : label or array-like or list of labels/arrays
5749 This parameter can be either a single column key, a single array of
5750 the same length as the calling DataFrame, or a list containing an
5751 arbitrary combination of column keys and arrays. Here, "array"
5752 encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and
5753 instances of :class:`~collections.abc.Iterator`.
5754 drop : bool, default True
5755 Delete columns to be used as the new index.
5756 append : bool, default False
5757 Whether to append columns to existing index.
5758 inplace : bool, default False
5759 Whether to modify the DataFrame rather than creating a new one.
5760 verify_integrity : bool, default False
5761 Check the new index for duplicates. Otherwise defer the check until
5762 necessary. Setting to False will improve the performance of this
5763 method.
5764
5765 Returns
5766 -------
5767 DataFrame or None
5768 Changed row labels or None if ``inplace=True``.
5769
5770 See Also
5771 --------
5772 DataFrame.reset_index : Opposite of set_index.
5773 DataFrame.reindex : Change to new indices or expand indices.
5774 DataFrame.reindex_like : Change to same indices as other DataFrame.
5775
5776 Examples
5777 --------
5778 >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
5779 ... 'year': [2012, 2014, 2013, 2014],
5780 ... 'sale': [55, 40, 84, 31]})
5781 >>> df
5782 month year sale
5783 0 1 2012 55
5784 1 4 2014 40
5785 2 7 2013 84
5786 3 10 2014 31
5787
5788 Set the index to become the 'month' column:
5789
5790 >>> df.set_index('month')
5791 year sale
5792 month
5793 1 2012 55
5794 4 2014 40
5795 7 2013 84
5796 10 2014 31
5797
5798 Create a MultiIndex using columns 'year' and 'month':
5799
5800 >>> df.set_index(['year', 'month'])
5801 sale
5802 year month
5803 2012 1 55
5804 2014 4 40
5805 2013 7 84
5806 2014 10 31
5807
5808 Create a MultiIndex using an Index and a column:
5809
5810 >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
5811 month sale
5812 year
5813 1 2012 1 55
5814 2 2014 4 40
5815 3 2013 7 84
5816 4 2014 10 31
5817
5818 Create a MultiIndex using two Series:
5819
5820 >>> s = pd.Series([1, 2, 3, 4])
5821 >>> df.set_index([s, s**2])
5822 month year sale
5823 1 1 1 2012 55
5824 2 4 4 2014 40
5825 3 9 7 2013 84
5826 4 16 10 2014 31
5827 """
5828 inplace = validate_bool_kwarg(inplace, "inplace")
5829 self._check_inplace_and_allows_duplicate_labels(inplace)
5830 if not isinstance(keys, list):
5831 keys = [keys]
5832
5833 err_msg = (
5834 'The parameter "keys" may be a column key, one-dimensional '
5835 "array, or a list containing only valid column keys and "
5836 "one-dimensional arrays."
5837 )
5838
5839 missing: list[Hashable] = []
5840 for col in keys:
5841 if isinstance(col, (Index, Series, np.ndarray, list, abc.Iterator)):
5842 # arrays are fine as long as they are one-dimensional
5843 # iterators get converted to list below
5844 if getattr(col, "ndim", 1) != 1:
5845 raise ValueError(err_msg)
5846 else:
5847 # everything else gets tried as a key; see GH 24969
5848 try:
5849 found = col in self.columns
5850 except TypeError as err:
5851 raise TypeError(
5852 f"{err_msg}. Received column of type {type(col)}"
5853 ) from err
5854 else:
5855 if not found:
5856 missing.append(col)
5857
5858 if missing:
5859 raise KeyError(f"None of {missing} are in the columns")
5860
5861 if inplace:
5862 frame = self
5863 else:
5864 # GH 49473 Use "lazy copy" with Copy-on-Write
5865 frame = self.copy(deep=None)
5866
5867 arrays = []
5868 names: list[Hashable] = []
5869 if append:
5870 names = list(self.index.names)
5871 if isinstance(self.index, MultiIndex):
5872 for i in range(self.index.nlevels):
5873 arrays.append(self.index._get_level_values(i))
5874 else:
5875 arrays.append(self.index)
5876
5877 to_remove: list[Hashable] = []
5878 for col in keys:
5879 if isinstance(col, MultiIndex):
5880 for n in range(col.nlevels):
5881 arrays.append(col._get_level_values(n))
5882 names.extend(col.names)
5883 elif isinstance(col, (Index, Series)):
5884 # if Index then not MultiIndex (treated above)
5885
5886 # error: Argument 1 to "append" of "list" has incompatible type
5887 # "Union[Index, Series]"; expected "Index"
5888 arrays.append(col) # type:ignore[arg-type]
5889 names.append(col.name)
5890 elif isinstance(col, (list, np.ndarray)):
5891 # error: Argument 1 to "append" of "list" has incompatible type
5892 # "Union[List[Any], ndarray]"; expected "Index"
5893 arrays.append(col) # type: ignore[arg-type]
5894 names.append(None)
5895 elif isinstance(col, abc.Iterator):
5896 # error: Argument 1 to "append" of "list" has incompatible type
5897 # "List[Any]"; expected "Index"
5898 arrays.append(list(col)) # type: ignore[arg-type]
5899 names.append(None)
5900 # from here, col can only be a column label
5901 else:
5902 arrays.append(frame[col])
5903 names.append(col)
5904 if drop:
5905 to_remove.append(col)
5906
5907 if len(arrays[-1]) != len(self):
5908 # check newest element against length of calling frame, since
5909 # ensure_index_from_sequences would not raise for append=False.
5910 raise ValueError(
5911 f"Length mismatch: Expected {len(self)} rows, "
5912 f"received array of length {len(arrays[-1])}"
5913 )
5914
5915 index = ensure_index_from_sequences(arrays, names)
5916
5917 if verify_integrity and not index.is_unique:
5918 duplicates = index[index.duplicated()].unique()
5919 raise ValueError(f"Index has duplicate keys: {duplicates}")
5920
5921 # use set to handle duplicate column names gracefully in case of drop
5922 for c in set(to_remove):
5923 del frame[c]
5924
5925 # clear up memory usage
5926 index._cleanup()
5927
5928 frame.index = index
5929
5930 if not inplace:
5931 return frame
5932 return None
5933
5934 @overload
5935 def reset_index(
5936 self,
5937 level: IndexLabel = ...,
5938 *,
5939 drop: bool = ...,
5940 inplace: Literal[False] = ...,
5941 col_level: Hashable = ...,
5942 col_fill: Hashable = ...,
5943 allow_duplicates: bool | lib.NoDefault = ...,
        names: Hashable | Sequence[Hashable] = ...,
5945 ) -> DataFrame:
5946 ...
5947
5948 @overload
5949 def reset_index(
5950 self,
5951 level: IndexLabel = ...,
5952 *,
5953 drop: bool = ...,
5954 inplace: Literal[True],
5955 col_level: Hashable = ...,
5956 col_fill: Hashable = ...,
5957 allow_duplicates: bool | lib.NoDefault = ...,
        names: Hashable | Sequence[Hashable] = ...,
5959 ) -> None:
5960 ...
5961
5962 @overload
5963 def reset_index(
5964 self,
5965 level: IndexLabel = ...,
5966 *,
5967 drop: bool = ...,
5968 inplace: bool = ...,
5969 col_level: Hashable = ...,
5970 col_fill: Hashable = ...,
5971 allow_duplicates: bool | lib.NoDefault = ...,
        names: Hashable | Sequence[Hashable] = ...,
5973 ) -> DataFrame | None:
5974 ...
5975
5976 def reset_index(
5977 self,
5978 level: IndexLabel = None,
5979 *,
5980 drop: bool = False,
5981 inplace: bool = False,
5982 col_level: Hashable = 0,
5983 col_fill: Hashable = "",
5984 allow_duplicates: bool | lib.NoDefault = lib.no_default,
5985 names: Hashable | Sequence[Hashable] = None,
5986 ) -> DataFrame | None:
5987 """
5988 Reset the index, or a level of it.
5989
5990 Reset the index of the DataFrame, and use the default one instead.
5991 If the DataFrame has a MultiIndex, this method can remove one or more
5992 levels.
5993
5994 Parameters
5995 ----------
5996 level : int, str, tuple, or list, default None
5997 Only remove the given levels from the index. Removes all levels by
5998 default.
5999 drop : bool, default False
6000 Do not try to insert index into dataframe columns. This resets
6001 the index to the default integer index.
6002 inplace : bool, default False
6003 Whether to modify the DataFrame rather than creating a new one.
6004 col_level : int or str, default 0
6005 If the columns have multiple levels, determines which level the
6006 labels are inserted into. By default it is inserted into the first
6007 level.
6008 col_fill : object, default ''
6009 If the columns have multiple levels, determines how the other
6010 levels are named. If None then the index name is repeated.
6011 allow_duplicates : bool, optional, default lib.no_default
6012 Allow duplicate column labels to be created.
6013
6014 .. versionadded:: 1.5.0
6015
6016 names : int, str or 1-dimensional list, default None
6017 Using the given string, rename the DataFrame column which contains the
6018 index data. If the DataFrame has a MultiIndex, this has to be a list or
6019 tuple with length equal to the number of levels.
6020
6021 .. versionadded:: 1.5.0
6022
6023 Returns
6024 -------
6025 DataFrame or None
6026 DataFrame with the new index or None if ``inplace=True``.
6027
6028 See Also
6029 --------
6030 DataFrame.set_index : Opposite of reset_index.
6031 DataFrame.reindex : Change to new indices or expand indices.
6032 DataFrame.reindex_like : Change to same indices as other DataFrame.
6033
6034 Examples
6035 --------
6036 >>> df = pd.DataFrame([('bird', 389.0),
6037 ... ('bird', 24.0),
6038 ... ('mammal', 80.5),
6039 ... ('mammal', np.nan)],
6040 ... index=['falcon', 'parrot', 'lion', 'monkey'],
6041 ... columns=('class', 'max_speed'))
6042 >>> df
6043 class max_speed
6044 falcon bird 389.0
6045 parrot bird 24.0
6046 lion mammal 80.5
6047 monkey mammal NaN
6048
6049 When we reset the index, the old index is added as a column, and a
6050 new sequential index is used:
6051
6052 >>> df.reset_index()
6053 index class max_speed
6054 0 falcon bird 389.0
6055 1 parrot bird 24.0
6056 2 lion mammal 80.5
6057 3 monkey mammal NaN
6058
6059 We can use the `drop` parameter to avoid the old index being added as
6060 a column:
6061
6062 >>> df.reset_index(drop=True)
6063 class max_speed
6064 0 bird 389.0
6065 1 bird 24.0
6066 2 mammal 80.5
6067 3 mammal NaN
6068
6069 You can also use `reset_index` with `MultiIndex`.
6070
6071 >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
6072 ... ('bird', 'parrot'),
6073 ... ('mammal', 'lion'),
6074 ... ('mammal', 'monkey')],
6075 ... names=['class', 'name'])
6076 >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),
6077 ... ('species', 'type')])
6078 >>> df = pd.DataFrame([(389.0, 'fly'),
6079 ... (24.0, 'fly'),
6080 ... (80.5, 'run'),
6081 ... (np.nan, 'jump')],
6082 ... index=index,
6083 ... columns=columns)
6084 >>> df
6085 speed species
6086 max type
6087 class name
6088 bird falcon 389.0 fly
6089 parrot 24.0 fly
6090 mammal lion 80.5 run
6091 monkey NaN jump
6092
6093 Using the `names` parameter, choose a name for the index column:
6094
6095 >>> df.reset_index(names=['classes', 'names'])
6096 classes names speed species
6097 max type
6098 0 bird falcon 389.0 fly
6099 1 bird parrot 24.0 fly
6100 2 mammal lion 80.5 run
6101 3 mammal monkey NaN jump
6102
6103 If the index has multiple levels, we can reset a subset of them:
6104
6105 >>> df.reset_index(level='class')
6106 class speed species
6107 max type
6108 name
6109 falcon bird 389.0 fly
6110 parrot bird 24.0 fly
6111 lion mammal 80.5 run
6112 monkey mammal NaN jump
6113
6114 If we are not dropping the index, by default, it is placed in the top
6115 level. We can place it in another level:
6116
6117 >>> df.reset_index(level='class', col_level=1)
6118 speed species
6119 class max type
6120 name
6121 falcon bird 389.0 fly
6122 parrot bird 24.0 fly
6123 lion mammal 80.5 run
6124 monkey mammal NaN jump
6125
6126 When the index is inserted under another level, we can specify under
6127 which one with the parameter `col_fill`:
6128
6129 >>> df.reset_index(level='class', col_level=1, col_fill='species')
6130 species speed species
6131 class max type
6132 name
6133 falcon bird 389.0 fly
6134 parrot bird 24.0 fly
6135 lion mammal 80.5 run
6136 monkey mammal NaN jump
6137
6138 If we specify a nonexistent level for `col_fill`, it is created:
6139
6140 >>> df.reset_index(level='class', col_level=1, col_fill='genus')
6141 genus speed species
6142 class max type
6143 name
6144 falcon bird 389.0 fly
6145 parrot bird 24.0 fly
6146 lion mammal 80.5 run
6147 monkey mammal NaN jump
6148 """
6149 inplace = validate_bool_kwarg(inplace, "inplace")
6150 self._check_inplace_and_allows_duplicate_labels(inplace)
6151 if inplace:
6152 new_obj = self
6153 else:
6154 new_obj = self.copy(deep=None)
6155 if allow_duplicates is not lib.no_default:
6156 allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates")
6157
6158 new_index = default_index(len(new_obj))
6159 if level is not None:
6160 if not isinstance(level, (tuple, list)):
6161 level = [level]
6162 level = [self.index._get_level_number(lev) for lev in level]
6163 if len(level) < self.index.nlevels:
6164 new_index = self.index.droplevel(level)
6165
6166 if not drop:
6167 to_insert: Iterable[tuple[Any, Any | None]]
6168
6169 default = "index" if "index" not in self else "level_0"
6170 names = self.index._get_default_index_names(names, default)
6171
6172 if isinstance(self.index, MultiIndex):
6173 to_insert = zip(self.index.levels, self.index.codes)
6174 else:
6175 to_insert = ((self.index, None),)
6176
6177 multi_col = isinstance(self.columns, MultiIndex)
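            # Iterate the levels in reverse so that repeated ``insert(0, ...)``
            # calls leave the new columns in their original level order.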
6178 for i, (lev, lab) in reversed(list(enumerate(to_insert))):
6179 if level is not None and i not in level:
6180 continue
6181 name = names[i]
6182 if multi_col:
6183 col_name = list(name) if isinstance(name, tuple) else [name]
6184 if col_fill is None:
6185 if len(col_name) not in (1, self.columns.nlevels):
6186 raise ValueError(
6187 "col_fill=None is incompatible "
6188 f"with incomplete column name {name}"
6189 )
6190 col_fill = col_name[0]
6191
6192 lev_num = self.columns._get_level_number(col_level)
6193 name_lst = [col_fill] * lev_num + col_name
6194 missing = self.columns.nlevels - len(name_lst)
6195 name_lst += [col_fill] * missing
6196 name = tuple(name_lst)
6197
6198 # to ndarray and maybe infer different dtype
6199 level_values = lev._values
6200 if level_values.dtype == np.object_:
6201 level_values = lib.maybe_convert_objects(level_values)
6202
6203 if lab is not None:
6204 # if we have the codes, extract the values with a mask
6205 level_values = algorithms.take(
6206 level_values, lab, allow_fill=True, fill_value=lev._na_value
6207 )
6208
6209 new_obj.insert(
6210 0,
6211 name,
6212 level_values,
6213 allow_duplicates=allow_duplicates,
6214 )
6215
6216 new_obj.index = new_index
6217 if not inplace:
6218 return new_obj
6219
6220 return None
6221
6222 # ----------------------------------------------------------------------
6223 # Reindex-based selection methods
6224
6225 @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
6226 def isna(self) -> DataFrame:
6227 result = self._constructor(self._mgr.isna(func=isna))
6228 return result.__finalize__(self, method="isna")
6229
6230 @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
6231 def isnull(self) -> DataFrame:
6232 """
6233 DataFrame.isnull is an alias for DataFrame.isna.
6234 """
6235 return self.isna()
6236
6237 @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
6238 def notna(self) -> DataFrame:
6239 return ~self.isna()
6240
6241 @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
6242 def notnull(self) -> DataFrame:
6243 """
6244 DataFrame.notnull is an alias for DataFrame.notna.
6245 """
6246 return ~self.isna()
6247
6248 @overload
6249 def dropna(
6250 self,
6251 *,
6252 axis: Axis = ...,
6253 how: AnyAll | NoDefault = ...,
6254 thresh: int | NoDefault = ...,
6255 subset: IndexLabel = ...,
6256 inplace: Literal[False] = ...,
6257 ignore_index: bool = ...,
6258 ) -> DataFrame:
6259 ...
6260
6261 @overload
6262 def dropna(
6263 self,
6264 *,
6265 axis: Axis = ...,
6266 how: AnyAll | NoDefault = ...,
6267 thresh: int | NoDefault = ...,
6268 subset: IndexLabel = ...,
6269 inplace: Literal[True],
6270 ignore_index: bool = ...,
6271 ) -> None:
6272 ...
6273
6274 def dropna(
6275 self,
6276 *,
6277 axis: Axis = 0,
6278 how: AnyAll | NoDefault = no_default,
6279 thresh: int | NoDefault = no_default,
6280 subset: IndexLabel = None,
6281 inplace: bool = False,
6282 ignore_index: bool = False,
6283 ) -> DataFrame | None:
6284 """
6285 Remove missing values.
6286
6287 See the :ref:`User Guide <missing_data>` for more on which values are
6288 considered missing, and how to work with missing data.
6289
6290 Parameters
6291 ----------
6292 axis : {0 or 'index', 1 or 'columns'}, default 0
6293 Determine if rows or columns which contain missing values are
6294 removed.
6295
            * 0, or 'index' : Drop rows which contain missing values.
            * 1, or 'columns' : Drop columns which contain missing values.

            Only a single axis is allowed.
6301
6302 how : {'any', 'all'}, default 'any'
6303 Determine if row or column is removed from DataFrame, when we have
6304 at least one NA or all NA.
6305
6306 * 'any' : If any NA values are present, drop that row or column.
6307 * 'all' : If all values are NA, drop that row or column.
6308
6309 thresh : int, optional
6310 Require that many non-NA values. Cannot be combined with how.
6311 subset : column label or sequence of labels, optional
6312 Labels along other axis to consider, e.g. if you are dropping rows
6313 these would be a list of columns to include.
6314 inplace : bool, default False
6315 Whether to modify the DataFrame rather than creating a new one.
6316 ignore_index : bool, default ``False``
6317 If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
6318
6319 .. versionadded:: 2.0.0
6320
6321 Returns
6322 -------
6323 DataFrame or None
6324 DataFrame with NA entries dropped from it or None if ``inplace=True``.
6325
6326 See Also
6327 --------
6328 DataFrame.isna: Indicate missing values.
6329 DataFrame.notna : Indicate existing (non-missing) values.
6330 DataFrame.fillna : Replace missing values.
6331 Series.dropna : Drop missing values.
6332 Index.dropna : Drop missing indices.
6333
6334 Examples
6335 --------
6336 >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
6337 ... "toy": [np.nan, 'Batmobile', 'Bullwhip'],
6338 ... "born": [pd.NaT, pd.Timestamp("1940-04-25"),
6339 ... pd.NaT]})
6340 >>> df
6341 name toy born
6342 0 Alfred NaN NaT
6343 1 Batman Batmobile 1940-04-25
6344 2 Catwoman Bullwhip NaT
6345
6346 Drop the rows where at least one element is missing.
6347
6348 >>> df.dropna()
6349 name toy born
6350 1 Batman Batmobile 1940-04-25
6351
6352 Drop the columns where at least one element is missing.
6353
6354 >>> df.dropna(axis='columns')
6355 name
6356 0 Alfred
6357 1 Batman
6358 2 Catwoman
6359
6360 Drop the rows where all elements are missing.
6361
6362 >>> df.dropna(how='all')
6363 name toy born
6364 0 Alfred NaN NaT
6365 1 Batman Batmobile 1940-04-25
6366 2 Catwoman Bullwhip NaT
6367
6368 Keep only the rows with at least 2 non-NA values.
6369
6370 >>> df.dropna(thresh=2)
6371 name toy born
6372 1 Batman Batmobile 1940-04-25
6373 2 Catwoman Bullwhip NaT
6374
6375 Define in which columns to look for missing values.
6376
6377 >>> df.dropna(subset=['name', 'toy'])
6378 name toy born
6379 1 Batman Batmobile 1940-04-25
6380 2 Catwoman Bullwhip NaT
6381 """
6382 if (how is not no_default) and (thresh is not no_default):
6383 raise TypeError(
6384 "You cannot set both the how and thresh arguments at the same time."
6385 )
6386
6387 if how is no_default:
6388 how = "any"
6389
6390 inplace = validate_bool_kwarg(inplace, "inplace")
6391 if isinstance(axis, (tuple, list)):
6392 # GH20987
6393 raise TypeError("supplying multiple axes to axis is no longer supported.")
6394
6395 axis = self._get_axis_number(axis)
6396 agg_axis = 1 - axis
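        # NA counting/masking runs along the axis orthogonal to the one being
        # dropped (e.g. dropping rows inspects values across the columns).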
6397
6398 agg_obj = self
6399 if subset is not None:
6400 # subset needs to be list
6401 if not is_list_like(subset):
6402 subset = [subset]
6403 ax = self._get_axis(agg_axis)
6404 indices = ax.get_indexer_for(subset)
6405 check = indices == -1
6406 if check.any():
6407 raise KeyError(np.array(subset)[check].tolist())
6408 agg_obj = self.take(indices, axis=agg_axis)
6409
6410 if thresh is not no_default:
6411 count = agg_obj.count(axis=agg_axis)
6412 mask = count >= thresh
6413 elif how == "any":
6414 # faster equivalent to 'agg_obj.count(agg_axis) == self.shape[agg_axis]'
6415 mask = notna(agg_obj).all(axis=agg_axis, bool_only=False)
6416 elif how == "all":
6417 # faster equivalent to 'agg_obj.count(agg_axis) > 0'
6418 mask = notna(agg_obj).any(axis=agg_axis, bool_only=False)
6419 else:
6420 raise ValueError(f"invalid how option: {how}")
6421
6422 if np.all(mask):
6423 result = self.copy(deep=None)
6424 else:
6425 result = self.loc(axis=axis)[mask]
6426
6427 if ignore_index:
6428 result.index = default_index(len(result))
6429
6430 if not inplace:
6431 return result
6432 self._update_inplace(result)
6433 return None
6434
6435 def drop_duplicates(
6436 self,
6437 subset: Hashable | Sequence[Hashable] | None = None,
6438 *,
6439 keep: DropKeep = "first",
6440 inplace: bool = False,
6441 ignore_index: bool = False,
6442 ) -> DataFrame | None:
6443 """
6444 Return DataFrame with duplicate rows removed.
6445
        Considering certain columns is optional. Indexes, including time
        indexes, are ignored.
6448
6449 Parameters
6450 ----------
6451 subset : column label or sequence of labels, optional
6452 Only consider certain columns for identifying duplicates, by
6453 default use all of the columns.
6454 keep : {'first', 'last', ``False``}, default 'first'
6455 Determines which duplicates (if any) to keep.
6456
6457 - 'first' : Drop duplicates except for the first occurrence.
6458 - 'last' : Drop duplicates except for the last occurrence.
6459 - ``False`` : Drop all duplicates.
6460
6461 inplace : bool, default ``False``
6462 Whether to modify the DataFrame rather than creating a new one.
6463 ignore_index : bool, default ``False``
6464 If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
6465
6466 Returns
6467 -------
6468 DataFrame or None
6469 DataFrame with duplicates removed or None if ``inplace=True``.
6470
6471 See Also
6472 --------
6473 DataFrame.value_counts: Count unique combinations of columns.
6474
6475 Examples
6476 --------
6477 Consider dataset containing ramen rating.
6478
6479 >>> df = pd.DataFrame({
6480 ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
6481 ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
6482 ... 'rating': [4, 4, 3.5, 15, 5]
6483 ... })
6484 >>> df
6485 brand style rating
6486 0 Yum Yum cup 4.0
6487 1 Yum Yum cup 4.0
6488 2 Indomie cup 3.5
6489 3 Indomie pack 15.0
6490 4 Indomie pack 5.0
6491
6492 By default, it removes duplicate rows based on all columns.
6493
6494 >>> df.drop_duplicates()
6495 brand style rating
6496 0 Yum Yum cup 4.0
6497 2 Indomie cup 3.5
6498 3 Indomie pack 15.0
6499 4 Indomie pack 5.0
6500
6501 To remove duplicates on specific column(s), use ``subset``.
6502
6503 >>> df.drop_duplicates(subset=['brand'])
6504 brand style rating
6505 0 Yum Yum cup 4.0
6506 2 Indomie cup 3.5
6507
6508 To remove duplicates and keep last occurrences, use ``keep``.
6509
6510 >>> df.drop_duplicates(subset=['brand', 'style'], keep='last')
6511 brand style rating
6512 1 Yum Yum cup 4.0
6513 2 Indomie cup 3.5
6514 4 Indomie pack 5.0
6515 """
6516 if self.empty:
6517 return self.copy(deep=None)
6518
6519 inplace = validate_bool_kwarg(inplace, "inplace")
6520 ignore_index = validate_bool_kwarg(ignore_index, "ignore_index")
6521
        result = self[~self.duplicated(subset, keep=keep)]
6523 if ignore_index:
6524 result.index = default_index(len(result))
6525
6526 if inplace:
6527 self._update_inplace(result)
6528 return None
6529 else:
6530 return result
6531
6532 def duplicated(
6533 self,
6534 subset: Hashable | Sequence[Hashable] | None = None,
6535 keep: DropKeep = "first",
6536 ) -> Series:
6537 """
6538 Return boolean Series denoting duplicate rows.
6539
6540 Considering certain columns is optional.
6541
6542 Parameters
6543 ----------
6544 subset : column label or sequence of labels, optional
6545 Only consider certain columns for identifying duplicates, by
6546 default use all of the columns.
6547 keep : {'first', 'last', False}, default 'first'
6548 Determines which duplicates (if any) to mark.
6549
6550 - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
6551 - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
6552 - False : Mark all duplicates as ``True``.
6553
6554 Returns
6555 -------
6556 Series
            Boolean series indicating which rows are duplicated.
6558
6559 See Also
6560 --------
6561 Index.duplicated : Equivalent method on index.
6562 Series.duplicated : Equivalent method on Series.
6563 Series.drop_duplicates : Remove duplicate values from Series.
6564 DataFrame.drop_duplicates : Remove duplicate values from DataFrame.
6565
6566 Examples
6567 --------
6568 Consider dataset containing ramen rating.
6569
6570 >>> df = pd.DataFrame({
6571 ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
6572 ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
6573 ... 'rating': [4, 4, 3.5, 15, 5]
6574 ... })
6575 >>> df
6576 brand style rating
6577 0 Yum Yum cup 4.0
6578 1 Yum Yum cup 4.0
6579 2 Indomie cup 3.5
6580 3 Indomie pack 15.0
6581 4 Indomie pack 5.0
6582
        By default, for each set of duplicated values, the first occurrence
        is set to False and all others to True.
6585
6586 >>> df.duplicated()
6587 0 False
6588 1 True
6589 2 False
6590 3 False
6591 4 False
6592 dtype: bool
6593
        By using 'last', the last occurrence of each set of duplicated values
        is set to False and all others to True.
6596
6597 >>> df.duplicated(keep='last')
6598 0 True
6599 1 False
6600 2 False
6601 3 False
6602 4 False
6603 dtype: bool
6604
        By setting ``keep`` to False, all duplicates are True.
6606
6607 >>> df.duplicated(keep=False)
6608 0 True
6609 1 True
6610 2 False
6611 3 False
6612 4 False
6613 dtype: bool
6614
6615 To find duplicates on specific column(s), use ``subset``.
6616
6617 >>> df.duplicated(subset=['brand'])
6618 0 False
6619 1 True
6620 2 False
6621 3 True
6622 4 True
6623 dtype: bool
6624 """
6625
6626 if self.empty:
6627 return self._constructor_sliced(dtype=bool)
6628
6629 def f(vals) -> tuple[np.ndarray, int]:
6630 labels, shape = algorithms.factorize(vals, size_hint=len(self))
6631 return labels.astype("i8", copy=False), len(shape)
6632
6633 if subset is None:
6634 # https://github.com/pandas-dev/pandas/issues/28770
6635 # Incompatible types in assignment (expression has type "Index", variable
6636 # has type "Sequence[Any]")
6637 subset = self.columns # type: ignore[assignment]
        elif (
            not np.iterable(subset)
            or isinstance(subset, str)
            or (isinstance(subset, tuple) and subset in self.columns)
        ):
6644 subset = (subset,)
6645
6646 # needed for mypy since can't narrow types using np.iterable
6647 subset = cast(Sequence, subset)
6648
6649 # Verify all columns in subset exist in the queried dataframe
6650 # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
6651 # key that doesn't exist.
6652 diff = set(subset) - set(self.columns)
6653 if diff:
6654 raise KeyError(Index(diff))
6655
6656 if len(subset) == 1 and self.columns.is_unique:
6657 # GH#45236 This is faster than get_group_index below
6658 result = self[subset[0]].duplicated(keep)
6659 result.name = None
6660 else:
6661 vals = (col.values for name, col in self.items() if name in subset)
6662 labels, shape = map(list, zip(*map(f, vals)))
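
            # Combine the per-column factorized codes into one integer id per
            # row; rows that share an id are duplicates across ``subset``.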
6663
6664 ids = get_group_index(
6665 labels,
6666 # error: Argument 1 to "tuple" has incompatible type "List[_T]";
6667 # expected "Iterable[int]"
6668 tuple(shape), # type: ignore[arg-type]
6669 sort=False,
6670 xnull=False,
6671 )
6672 result = self._constructor_sliced(duplicated(ids, keep), index=self.index)
6673 return result.__finalize__(self, method="duplicated")
6674
6675 # ----------------------------------------------------------------------
    # Sorting

    # error: Signature of "sort_values" incompatible with supertype "NDFrame"
6678 @overload # type: ignore[override]
6679 def sort_values(
6680 self,
6681 by: IndexLabel,
6682 *,
6683 axis: Axis = ...,
6684 ascending=...,
6685 inplace: Literal[False] = ...,
6686 kind: str = ...,
6687 na_position: str = ...,
6688 ignore_index: bool = ...,
6689 key: ValueKeyFunc = ...,
6690 ) -> DataFrame:
6691 ...
6692
6693 @overload
6694 def sort_values(
6695 self,
6696 by: IndexLabel,
6697 *,
6698 axis: Axis = ...,
6699 ascending=...,
6700 inplace: Literal[True],
6701 kind: str = ...,
6702 na_position: str = ...,
6703 ignore_index: bool = ...,
6704 key: ValueKeyFunc = ...,
6705 ) -> None:
6706 ...
6707
6708 # TODO: Just move the sort_values doc here.
6709 @Substitution(**_shared_doc_kwargs)
6710 @Appender(NDFrame.sort_values.__doc__)
6711 def sort_values(
6712 self,
6713 by: IndexLabel,
6714 *,
6715 axis: Axis = 0,
6716 ascending: bool | list[bool] | tuple[bool, ...] = True,
6717 inplace: bool = False,
6718 kind: str = "quicksort",
6719 na_position: str = "last",
6720 ignore_index: bool = False,
6721 key: ValueKeyFunc = None,
6722 ) -> DataFrame | None:
6723 inplace = validate_bool_kwarg(inplace, "inplace")
6724 axis = self._get_axis_number(axis)
6725 ascending = validate_ascending(ascending)
6726 if not isinstance(by, list):
6727 by = [by]
6728 # error: Argument 1 to "len" has incompatible type "Union[bool, List[bool]]";
6729 # expected "Sized"
6730 if is_sequence(ascending) and (
6731 len(by) != len(ascending) # type: ignore[arg-type]
6732 ):
6733 # error: Argument 1 to "len" has incompatible type "Union[bool,
6734 # List[bool]]"; expected "Sized"
6735 raise ValueError(
6736 f"Length of ascending ({len(ascending)})" # type: ignore[arg-type]
6737 f" != length of by ({len(by)})"
6738 )
6739 if len(by) > 1:
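            # Multi-key sort (e.g. ``df.sort_values(["a", "b"])``): build one
            # key array per label and lexsort them together.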
6740 keys = [self._get_label_or_level_values(x, axis=axis) for x in by]
6741
6742 # need to rewrap columns in Series to apply key function
6743 if key is not None:
6744 # error: List comprehension has incompatible type List[Series];
6745 # expected List[ndarray]
6746 keys = [
6747 Series(k, name=name) # type: ignore[misc]
6748 for (k, name) in zip(keys, by)
6749 ]
6750
6751 indexer = lexsort_indexer(
6752 keys, orders=ascending, na_position=na_position, key=key
6753 )
6754 elif len(by):
6755 # len(by) == 1
6756
6757 by = by[0]
6758 k = self._get_label_or_level_values(by, axis=axis)
6759
6760 # need to rewrap column in Series to apply key function
6761 if key is not None:
6762 # error: Incompatible types in assignment (expression has type
6763 # "Series", variable has type "ndarray")
6764 k = Series(k, name=by) # type: ignore[assignment]
6765
6766 if isinstance(ascending, (tuple, list)):
6767 ascending = ascending[0]
6768
6769 indexer = nargsort(
6770 k, kind=kind, ascending=ascending, na_position=na_position, key=key
6771 )
6772 else:
6773 if inplace:
6774 return self._update_inplace(self)
6775 else:
6776 return self.copy(deep=None)
6777
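        # Fast path: the sort indexer is 0..n-1, i.e. the frame is already in
        # sorted order, so a (lazy) copy suffices instead of a take.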
6778 if is_range_indexer(indexer, len(indexer)):
6779 result = self.copy(deep=(not inplace and not using_copy_on_write()))
6780 if ignore_index:
6781 result.index = default_index(len(result))
6782
6783 if inplace:
6784 return self._update_inplace(result)
6785 else:
6786 return result
6787
6788 new_data = self._mgr.take(
6789 indexer, axis=self._get_block_manager_axis(axis), verify=False
6790 )
6791
6792 if ignore_index:
6793 new_data.set_axis(
6794 self._get_block_manager_axis(axis), default_index(len(indexer))
6795 )
6796
6797 result = self._constructor(new_data)
6798 if inplace:
6799 return self._update_inplace(result)
6800 else:
6801 return result.__finalize__(self, method="sort_values")
6802
6803 @overload
6804 def sort_index(
6805 self,
6806 *,
6807 axis: Axis = ...,
6808 level: IndexLabel = ...,
6809 ascending: bool | Sequence[bool] = ...,
6810 inplace: Literal[True],
6811 kind: SortKind = ...,
6812 na_position: NaPosition = ...,
6813 sort_remaining: bool = ...,
6814 ignore_index: bool = ...,
6815 key: IndexKeyFunc = ...,
6816 ) -> None:
6817 ...
6818
6819 @overload
6820 def sort_index(
6821 self,
6822 *,
6823 axis: Axis = ...,
6824 level: IndexLabel = ...,
6825 ascending: bool | Sequence[bool] = ...,
6826 inplace: Literal[False] = ...,
6827 kind: SortKind = ...,
6828 na_position: NaPosition = ...,
6829 sort_remaining: bool = ...,
6830 ignore_index: bool = ...,
6831 key: IndexKeyFunc = ...,
6832 ) -> DataFrame:
6833 ...
6834
6835 @overload
6836 def sort_index(
6837 self,
6838 *,
6839 axis: Axis = ...,
6840 level: IndexLabel = ...,
6841 ascending: bool | Sequence[bool] = ...,
6842 inplace: bool = ...,
6843 kind: SortKind = ...,
6844 na_position: NaPosition = ...,
6845 sort_remaining: bool = ...,
6846 ignore_index: bool = ...,
6847 key: IndexKeyFunc = ...,
6848 ) -> DataFrame | None:
6849 ...
6850
6851 def sort_index(
6852 self,
6853 *,
6854 axis: Axis = 0,
6855 level: IndexLabel = None,
6856 ascending: bool | Sequence[bool] = True,
6857 inplace: bool = False,
6858 kind: SortKind = "quicksort",
6859 na_position: NaPosition = "last",
6860 sort_remaining: bool = True,
6861 ignore_index: bool = False,
6862 key: IndexKeyFunc = None,
6863 ) -> DataFrame | None:
6864 """
6865 Sort object by labels (along an axis).
6866
6867 Returns a new DataFrame sorted by label if `inplace` argument is
6868 ``False``, otherwise updates the original DataFrame and returns None.
6869
6870 Parameters
6871 ----------
6872 axis : {0 or 'index', 1 or 'columns'}, default 0
6873 The axis along which to sort. The value 0 identifies the rows,
6874 and 1 identifies the columns.
6875 level : int or level name or list of ints or list of level names
6876 If not None, sort on values in specified index level(s).
6877 ascending : bool or list-like of bools, default True
6878 Sort ascending vs. descending. When the index is a MultiIndex the
6879 sort direction can be controlled for each level individually.
6880 inplace : bool, default False
6881 Whether to modify the DataFrame rather than creating a new one.
6882 kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
6883 Choice of sorting algorithm. See also :func:`numpy.sort` for more
6884 information. `mergesort` and `stable` are the only stable algorithms. For
6885 DataFrames, this option is only applied when sorting on a single
6886 column or label.
6887 na_position : {'first', 'last'}, default 'last'
6888 Puts NaNs at the beginning if `first`; `last` puts NaNs at the end.
6889 Not implemented for MultiIndex.
6890 sort_remaining : bool, default True
6891 If True and sorting by level and index is multilevel, sort by other
6892 levels too (in order) after sorting by specified level.
6893 ignore_index : bool, default False
6894 If True, the resulting axis will be labeled 0, 1, …, n - 1.
6895 key : callable, optional
6896 If not None, apply the key function to the index values
6897 before sorting. This is similar to the `key` argument in the
6898 builtin :meth:`sorted` function, with the notable difference that
6899 this `key` function should be *vectorized*. It should expect an
6900 ``Index`` and return an ``Index`` of the same shape. For MultiIndex
6901 inputs, the key is applied *per level*.
6902
6903 .. versionadded:: 1.1.0
6904
6905 Returns
6906 -------
6907 DataFrame or None
6908 The original DataFrame sorted by the labels or None if ``inplace=True``.
6909
6910 See Also
6911 --------
6912 Series.sort_index : Sort Series by the index.
6913 DataFrame.sort_values : Sort DataFrame by the value.
6914 Series.sort_values : Sort Series by the value.
6915
6916 Examples
6917 --------
6918 >>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150],
6919 ... columns=['A'])
6920 >>> df.sort_index()
6921 A
6922 1 4
6923 29 2
6924 100 1
6925 150 5
6926 234 3
6927
6928 By default, it sorts in ascending order, to sort in descending order,
6929 use ``ascending=False``
6930
6931 >>> df.sort_index(ascending=False)
6932 A
6933 234 3
6934 150 5
6935 100 1
6936 29 2
6937 1 4
6938
6939 A key function can be specified which is applied to the index before
6940 sorting. For a ``MultiIndex`` this is applied to each level separately.
6941
6942 >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd'])
6943 >>> df.sort_index(key=lambda x: x.str.lower())
6944 a
6945 A 1
6946 b 2
6947 C 3
6948 d 4
6949 """
6950 return super().sort_index(
6951 axis=axis,
6952 level=level,
6953 ascending=ascending,
6954 inplace=inplace,
6955 kind=kind,
6956 na_position=na_position,
6957 sort_remaining=sort_remaining,
6958 ignore_index=ignore_index,
6959 key=key,
6960 )
6961
6962 def value_counts(
6963 self,
6964 subset: Sequence[Hashable] | None = None,
6965 normalize: bool = False,
6966 sort: bool = True,
6967 ascending: bool = False,
6968 dropna: bool = True,
6969 ) -> Series:
6970 """
6971 Return a Series containing counts of unique rows in the DataFrame.
6972
6973 .. versionadded:: 1.1.0
6974
6975 Parameters
6976 ----------
6977 subset : label or list of labels, optional
6978 Columns to use when counting unique combinations.
6979 normalize : bool, default False
6980 Return proportions rather than frequencies.
6981 sort : bool, default True
6982 Sort by frequencies.
6983 ascending : bool, default False
6984 Sort in ascending order.
6985 dropna : bool, default True
            Don't include counts of rows that contain NA values.
6987
6988 .. versionadded:: 1.3.0
6989
6990 Returns
6991 -------
6992 Series
6993
6994 See Also
6995 --------
6996 Series.value_counts: Equivalent method on Series.
6997
6998 Notes
6999 -----
7000 The returned Series will have a MultiIndex with one level per input
7001 column but an Index (non-multi) for a single label. By default, rows
7002 that contain any NA values are omitted from the result. By default,
7003 the resulting Series will be in descending order so that the first
7004 element is the most frequently-occurring row.
7005
7006 Examples
7007 --------
7008 >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6],
7009 ... 'num_wings': [2, 0, 0, 0]},
7010 ... index=['falcon', 'dog', 'cat', 'ant'])
7011 >>> df
7012 num_legs num_wings
7013 falcon 2 2
7014 dog 4 0
7015 cat 4 0
7016 ant 6 0
7017
7018 >>> df.value_counts()
7019 num_legs num_wings
7020 4 0 2
7021 2 2 1
7022 6 0 1
7023 Name: count, dtype: int64
7024
7025 >>> df.value_counts(sort=False)
7026 num_legs num_wings
7027 2 2 1
7028 4 0 2
7029 6 0 1
7030 Name: count, dtype: int64
7031
7032 >>> df.value_counts(ascending=True)
7033 num_legs num_wings
7034 2 2 1
7035 6 0 1
7036 4 0 2
7037 Name: count, dtype: int64
7038
7039 >>> df.value_counts(normalize=True)
7040 num_legs num_wings
7041 4 0 0.50
7042 2 2 0.25
7043 6 0 0.25
7044 Name: proportion, dtype: float64
7045
7046 With `dropna` set to `False` we can also count rows with NA values.
7047
7048 >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'],
7049 ... 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']})
7050 >>> df
7051 first_name middle_name
7052 0 John Smith
7053 1 Anne <NA>
7054 2 John <NA>
7055 3 Beth Louise
7056
7057 >>> df.value_counts()
7058 first_name middle_name
7059 Beth Louise 1
7060 John Smith 1
7061 Name: count, dtype: int64
7062
7063 >>> df.value_counts(dropna=False)
7064 first_name middle_name
7065 Anne NaN 1
7066 Beth Louise 1
7067 John Smith 1
7068 NaN 1
7069 Name: count, dtype: int64
7070
7071 >>> df.value_counts("first_name")
7072 first_name
7073 John 2
7074 Anne 1
7075 Beth 1
7076 Name: count, dtype: int64
7077 """
7078 if subset is None:
7079 subset = self.columns.tolist()
7080
7081 name = "proportion" if normalize else "count"
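        # Group by the subset columns; each group's size is the row count for
        # that unique combination of values.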
7082 counts = self.groupby(subset, dropna=dropna).grouper.size()
7083 counts.name = name
7084
7085 if sort:
7086 counts = counts.sort_values(ascending=ascending)
7087 if normalize:
7088 counts /= counts.sum()
7089
7090 # Force MultiIndex for single column
7091 if is_list_like(subset) and len(subset) == 1:
7092 counts.index = MultiIndex.from_arrays(
7093 [counts.index], names=[counts.index.name]
7094 )
7095
7096 return counts
7097
7098 def nlargest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame:
7099 """
7100 Return the first `n` rows ordered by `columns` in descending order.
7101
7102 Return the first `n` rows with the largest values in `columns`, in
7103 descending order. The columns that are not specified are returned as
7104 well, but not used for ordering.
7105
7106 This method is equivalent to
7107 ``df.sort_values(columns, ascending=False).head(n)``, but more
7108 performant.
7109
7110 Parameters
7111 ----------
7112 n : int
7113 Number of rows to return.
7114 columns : label or list of labels
7115 Column label(s) to order by.
7116 keep : {'first', 'last', 'all'}, default 'first'
7117 Where there are duplicate values:
7118
7119 - ``first`` : prioritize the first occurrence(s)
7120 - ``last`` : prioritize the last occurrence(s)
            - ``all`` : do not drop any duplicates, even if it means
              selecting more than `n` items.
7123
7124 Returns
7125 -------
7126 DataFrame
7127 The first `n` rows ordered by the given columns in descending
7128 order.
7129
7130 See Also
7131 --------
7132 DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
7133 ascending order.
7134 DataFrame.sort_values : Sort DataFrame by the values.
7135 DataFrame.head : Return the first `n` rows without re-ordering.
7136
7137 Notes
7138 -----
7139 This function cannot be used with all column types. For example, when
7140 specifying columns with `object` or `category` dtypes, ``TypeError`` is
7141 raised.
7142
7143 Examples
7144 --------
7145 >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
7146 ... 434000, 434000, 337000, 11300,
7147 ... 11300, 11300],
7148 ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
7149 ... 17036, 182, 38, 311],
7150 ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
7151 ... "IS", "NR", "TV", "AI"]},
7152 ... index=["Italy", "France", "Malta",
7153 ... "Maldives", "Brunei", "Iceland",
7154 ... "Nauru", "Tuvalu", "Anguilla"])
7155 >>> df
7156 population GDP alpha-2
7157 Italy 59000000 1937894 IT
7158 France 65000000 2583560 FR
7159 Malta 434000 12011 MT
7160 Maldives 434000 4520 MV
7161 Brunei 434000 12128 BN
7162 Iceland 337000 17036 IS
7163 Nauru 11300 182 NR
7164 Tuvalu 11300 38 TV
7165 Anguilla 11300 311 AI
7166
7167 In the following example, we will use ``nlargest`` to select the three
7168 rows having the largest values in column "population".
7169
7170 >>> df.nlargest(3, 'population')
7171 population GDP alpha-2
7172 France 65000000 2583560 FR
7173 Italy 59000000 1937894 IT
7174 Malta 434000 12011 MT
7175
7176 When using ``keep='last'``, ties are resolved in reverse order:
7177
7178 >>> df.nlargest(3, 'population', keep='last')
7179 population GDP alpha-2
7180 France 65000000 2583560 FR
7181 Italy 59000000 1937894 IT
7182 Brunei 434000 12128 BN
7183
7184 When using ``keep='all'``, all duplicate items are maintained:
7185
7186 >>> df.nlargest(3, 'population', keep='all')
7187 population GDP alpha-2
7188 France 65000000 2583560 FR
7189 Italy 59000000 1937894 IT
7190 Malta 434000 12011 MT
7191 Maldives 434000 4520 MV
7192 Brunei 434000 12128 BN
7193
7194 To order by the largest values in column "population" and then "GDP",
7195 we can specify multiple columns like in the next example.
7196
7197 >>> df.nlargest(3, ['population', 'GDP'])
7198 population GDP alpha-2
7199 France 65000000 2583560 FR
7200 Italy 59000000 1937894 IT
7201 Brunei 434000 12128 BN
7202 """
7203 return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest()
7204
7205 def nsmallest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame:
7206 """
7207 Return the first `n` rows ordered by `columns` in ascending order.
7208
7209 Return the first `n` rows with the smallest values in `columns`, in
7210 ascending order. The columns that are not specified are returned as
7211 well, but not used for ordering.
7212
7213 This method is equivalent to
7214 ``df.sort_values(columns, ascending=True).head(n)``, but more
7215 performant.
7216
7217 Parameters
7218 ----------
7219 n : int
7220 Number of items to retrieve.
7221 columns : list or str
7222 Column name or names to order by.
7223 keep : {'first', 'last', 'all'}, default 'first'
7224 Where there are duplicate values:
7225
7226 - ``first`` : take the first occurrence.
7227 - ``last`` : take the last occurrence.
            - ``all`` : do not drop any duplicates, even if it means
              selecting more than `n` items.
7230
7231 Returns
7232 -------
7233 DataFrame
7234
7235 See Also
7236 --------
7237 DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
7238 descending order.
7239 DataFrame.sort_values : Sort DataFrame by the values.
7240 DataFrame.head : Return the first `n` rows without re-ordering.
7241
7242 Examples
7243 --------
7244 >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
7245 ... 434000, 434000, 337000, 337000,
7246 ... 11300, 11300],
7247 ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
7248 ... 17036, 182, 38, 311],
7249 ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
7250 ... "IS", "NR", "TV", "AI"]},
7251 ... index=["Italy", "France", "Malta",
7252 ... "Maldives", "Brunei", "Iceland",
7253 ... "Nauru", "Tuvalu", "Anguilla"])
7254 >>> df
7255 population GDP alpha-2
7256 Italy 59000000 1937894 IT
7257 France 65000000 2583560 FR
7258 Malta 434000 12011 MT
7259 Maldives 434000 4520 MV
7260 Brunei 434000 12128 BN
7261 Iceland 337000 17036 IS
7262 Nauru 337000 182 NR
7263 Tuvalu 11300 38 TV
7264 Anguilla 11300 311 AI
7265
7266 In the following example, we will use ``nsmallest`` to select the
7267 three rows having the smallest values in column "population".
7268
7269 >>> df.nsmallest(3, 'population')
7270 population GDP alpha-2
7271 Tuvalu 11300 38 TV
7272 Anguilla 11300 311 AI
7273 Iceland 337000 17036 IS
7274
7275 When using ``keep='last'``, ties are resolved in reverse order:
7276
7277 >>> df.nsmallest(3, 'population', keep='last')
7278 population GDP alpha-2
7279 Anguilla 11300 311 AI
7280 Tuvalu 11300 38 TV
7281 Nauru 337000 182 NR
7282
7283 When using ``keep='all'``, all duplicate items are maintained:
7284
7285 >>> df.nsmallest(3, 'population', keep='all')
7286 population GDP alpha-2
7287 Tuvalu 11300 38 TV
7288 Anguilla 11300 311 AI
7289 Iceland 337000 17036 IS
7290 Nauru 337000 182 NR
7291
7292 To order by the smallest values in column "population" and then "GDP", we can
7293 specify multiple columns like in the next example.
7294
7295 >>> df.nsmallest(3, ['population', 'GDP'])
7296 population GDP alpha-2
7297 Tuvalu 11300 38 TV
7298 Anguilla 11300 311 AI
7299 Nauru 337000 182 NR
7300 """
7301 return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nsmallest()
7302
7303 @doc(
7304 Series.swaplevel,
7305 klass=_shared_doc_kwargs["klass"],
7306 extra_params=dedent(
7307 """axis : {0 or 'index', 1 or 'columns'}, default 0
7308 The axis to swap levels on. 0 or 'index' for row-wise, 1 or
7309 'columns' for column-wise."""
7310 ),
7311 examples=dedent(
7312 """\
7313 Examples
7314 --------
7315 >>> df = pd.DataFrame(
7316 ... {"Grade": ["A", "B", "A", "C"]},
7317 ... index=[
7318 ... ["Final exam", "Final exam", "Coursework", "Coursework"],
7319 ... ["History", "Geography", "History", "Geography"],
7320 ... ["January", "February", "March", "April"],
7321 ... ],
7322 ... )
7323 >>> df
7324 Grade
7325 Final exam History January A
7326 Geography February B
7327 Coursework History March A
7328 Geography April C
7329
        In the following example, we will swap the levels of the index.
        Here, we swap the levels row-wise, which is the default behaviour;
        levels can be swapped column-wise in a similar manner by passing
        ``axis=1``. By not supplying any arguments for i and j, we swap the
        last and second-to-last levels.
7335
7336 >>> df.swaplevel()
7337 Grade
7338 Final exam January History A
7339 February Geography B
7340 Coursework March History A
7341 April Geography C
7342
7343 By supplying one argument, we can choose which index to swap the last
7344 index with. We can for example swap the first index with the last one as
7345 follows.
7346
7347 >>> df.swaplevel(0)
7348 Grade
7349 January History Final exam A
7350 February Geography Final exam B
7351 March History Coursework A
7352 April Geography Coursework C
7353
7354 We can also define explicitly which indices we want to swap by supplying values
7355 for both i and j. Here, we for example swap the first and second indices.
7356
7357 >>> df.swaplevel(0, 1)
7358 Grade
7359 History Final exam January A
7360 Geography Final exam February B
7361 History Coursework March A
7362 Geography Coursework April C"""
7363 ),
7364 )
7365 def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame:
7366 result = self.copy(deep=None)
7367
7368 axis = self._get_axis_number(axis)
7369
7370 if not isinstance(result._get_axis(axis), MultiIndex): # pragma: no cover
7371 raise TypeError("Can only swap levels on a hierarchical axis.")
7372
7373 if axis == 0:
7374 assert isinstance(result.index, MultiIndex)
7375 result.index = result.index.swaplevel(i, j)
7376 else:
7377 assert isinstance(result.columns, MultiIndex)
7378 result.columns = result.columns.swaplevel(i, j)
7379 return result
7380
7381 def reorder_levels(self, order: Sequence[int | str], axis: Axis = 0) -> DataFrame:
7382 """
7383 Rearrange index levels using input order. May not drop or duplicate levels.
7384
7385 Parameters
7386 ----------
7387 order : list of int or list of str
7388 List representing new level order. Reference level by number
7389 (position) or by key (label).
7390 axis : {0 or 'index', 1 or 'columns'}, default 0
7391 Where to reorder levels.
7392
7393 Returns
7394 -------
7395 DataFrame
7396
7397 Examples
7398 --------
7399 >>> data = {
7400 ... "class": ["Mammals", "Mammals", "Reptiles"],
7401 ... "diet": ["Omnivore", "Carnivore", "Carnivore"],
7402 ... "species": ["Humans", "Dogs", "Snakes"],
7403 ... }
7404 >>> df = pd.DataFrame(data, columns=["class", "diet", "species"])
7405 >>> df = df.set_index(["class", "diet"])
7406 >>> df
7407 species
7408 class diet
7409 Mammals Omnivore Humans
7410 Carnivore Dogs
7411 Reptiles Carnivore Snakes
7412
7413 Let's reorder the levels of the index:
7414
7415 >>> df.reorder_levels(["diet", "class"])
7416 species
7417 diet class
7418 Omnivore Mammals Humans
7419 Carnivore Mammals Dogs
7420 Reptiles Snakes
7421 """
7422 axis = self._get_axis_number(axis)
7423 if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover
7424 raise TypeError("Can only reorder levels on a hierarchical axis.")
7425
7426 result = self.copy(deep=None)
7427
7428 if axis == 0:
7429 assert isinstance(result.index, MultiIndex)
7430 result.index = result.index.reorder_levels(order)
7431 else:
7432 assert isinstance(result.columns, MultiIndex)
7433 result.columns = result.columns.reorder_levels(order)
7434 return result
7435
7436 # ----------------------------------------------------------------------
7437 # Arithmetic Methods
7438
7439 def _cmp_method(self, other, op):
7440 axis: Literal[1] = 1 # only relevant for Series other case
7441
7442 self, other = ops.align_method_FRAME(self, other, axis, flex=False, level=None)
7443
7444 # See GH#4537 for discussion of scalar op behavior
7445 new_data = self._dispatch_frame_op(other, op, axis=axis)
7446 return self._construct_result(new_data)
7447
7448 def _arith_method(self, other, op):
7449 if ops.should_reindex_frame_op(self, other, op, 1, None, None):
7450 return ops.frame_arith_method_with_reindex(self, other, op)
7451
7452 axis: Literal[1] = 1 # only relevant for Series other case
7453 other = ops.maybe_prepare_scalar_for_op(other, (self.shape[axis],))
7454
7455 self, other = ops.align_method_FRAME(self, other, axis, flex=True, level=None)
7456
7457 new_data = self._dispatch_frame_op(other, op, axis=axis)
7458 return self._construct_result(new_data)
7459
7460 _logical_method = _arith_method
7461
7462 def _dispatch_frame_op(self, right, func: Callable, axis: AxisInt | None = None):
7463 """
7464 Evaluate the frame operation func(left, right) by evaluating
7465 column-by-column, dispatching to the Series implementation.
7466
7467 Parameters
7468 ----------
7469 right : scalar, Series, or DataFrame
7470 func : arithmetic or comparison operator
7471 axis : {None, 0, 1}
7472
7473 Returns
7474 -------
7475 DataFrame
7476 """
7477 # Get the appropriate array-op to apply to each column/block's values.
7478 array_op = ops.get_array_op(func)
7479
7480 right = lib.item_from_zerodim(right)
7481 if not is_list_like(right):
7482 # i.e. scalar, faster than checking np.ndim(right) == 0
7483 with np.errstate(all="ignore"):
7484 bm = self._mgr.apply(array_op, right=right)
7485 return self._constructor(bm)
7486
7487 elif isinstance(right, DataFrame):
7488 assert self.index.equals(right.index)
7489 assert self.columns.equals(right.columns)
7490 # TODO: The previous assertion `assert right._indexed_same(self)`
7491 # fails in cases with empty columns reached via
7492 # _frame_arith_method_with_reindex
7493
7494 # TODO operate_blockwise expects a manager of the same type
7495 with np.errstate(all="ignore"):
7496 bm = self._mgr.operate_blockwise(
7497 # error: Argument 1 to "operate_blockwise" of "ArrayManager" has
7498 # incompatible type "Union[ArrayManager, BlockManager]"; expected
7499 # "ArrayManager"
7500 # error: Argument 1 to "operate_blockwise" of "BlockManager" has
7501 # incompatible type "Union[ArrayManager, BlockManager]"; expected
7502 # "BlockManager"
7503 right._mgr, # type: ignore[arg-type]
7504 array_op,
7505 )
7506 return self._constructor(bm)
7507
7508 elif isinstance(right, Series) and axis == 1:
7509 # axis=1 means we want to operate row-by-row
7510 assert right.index.equals(self.columns)
7511
7512 right = right._values
7513 # maybe_align_as_frame ensures we do not have an ndarray here
7514 assert not isinstance(right, np.ndarray)
7515
7516 with np.errstate(all="ignore"):
7517 arrays = [
7518 array_op(_left, _right)
7519 for _left, _right in zip(self._iter_column_arrays(), right)
7520 ]
7521
7522 elif isinstance(right, Series):
7523 assert right.index.equals(self.index) # Handle other cases later
7524 right = right._values
7525
7526 with np.errstate(all="ignore"):
7527 arrays = [array_op(left, right) for left in self._iter_column_arrays()]
7528
7529 else:
7530 # Remaining cases have less-obvious dispatch rules
7531 raise NotImplementedError(right)
7532
7533 return type(self)._from_arrays(
7534 arrays, self.columns, self.index, verify_integrity=False
7535 )
7536
7537 def _combine_frame(self, other: DataFrame, func, fill_value=None):
7538 # at this point we have `self._indexed_same(other)`
7539
7540 if fill_value is None:
7541 # since _arith_op may be called in a loop, avoid function call
7542 # overhead if possible by doing this check once
7543 _arith_op = func
7544
7545 else:
7546
7547 def _arith_op(left, right):
7548 # for the mixed_type case where we iterate over columns,
7549 # _arith_op(left, right) is equivalent to
7550 # left._binop(right, func, fill_value=fill_value)
7551 left, right = ops.fill_binop(left, right, fill_value)
7552 return func(left, right)
7553
7554 new_data = self._dispatch_frame_op(other, _arith_op)
7555 return new_data
7556
7557 def _construct_result(self, result) -> DataFrame:
7558 """
7559 Wrap the result of an arithmetic, comparison, or logical operation.
7560
7561 Parameters
7562 ----------
7563 result : DataFrame
7564
7565 Returns
7566 -------
7567 DataFrame
7568 """
7569 out = self._constructor(result, copy=False).__finalize__(self)
7570 # Pin columns instead of passing to constructor for compat with
7571 # non-unique columns case
7572 out.columns = self.columns
7573 out.index = self.index
7574 return out
7575
7576 def __divmod__(self, other) -> tuple[DataFrame, DataFrame]:
7577 # Naive implementation, room for optimization
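        # e.g. divmod(df, 3) returns (df // 3, df % 3), computed elementwise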
7578 div = self // other
7579 mod = self - div * other
7580 return div, mod
7581
7582 def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]:
7583 # Naive implementation, room for optimization
7584 div = other // self
7585 mod = other - div * self
7586 return div, mod
7587
7588 # ----------------------------------------------------------------------
7589 # Combination-Related
7590
7591 @doc(
7592 _shared_docs["compare"],
7593 """
7594Returns
7595-------
7596DataFrame
7597 DataFrame that shows the differences stacked side by side.
7598
7599 The resulting index will be a MultiIndex with 'self' and 'other'
7600 stacked alternately at the inner level.
7601
7602Raises
7603------
7604ValueError
7605 When the two DataFrames don't have identical labels or shape.
7606
7607See Also
7608--------
7609Series.compare : Compare with another Series and show differences.
7610DataFrame.equals : Test whether two objects contain the same elements.
7611
7612Notes
7613-----
7614Matching NaNs will not appear as a difference.
7615
Can only compare identically-labeled
(i.e. same shape, identical row and column labels) DataFrames.
7618
7619Examples
7620--------
7621>>> df = pd.DataFrame(
7622... {{
7623... "col1": ["a", "a", "b", "b", "a"],
7624... "col2": [1.0, 2.0, 3.0, np.nan, 5.0],
7625... "col3": [1.0, 2.0, 3.0, 4.0, 5.0]
7626... }},
7627... columns=["col1", "col2", "col3"],
7628... )
7629>>> df
7630 col1 col2 col3
76310 a 1.0 1.0
76321 a 2.0 2.0
76332 b 3.0 3.0
76343 b NaN 4.0
76354 a 5.0 5.0
7636
7637>>> df2 = df.copy()
7638>>> df2.loc[0, 'col1'] = 'c'
7639>>> df2.loc[2, 'col3'] = 4.0
7640>>> df2
7641 col1 col2 col3
76420 c 1.0 1.0
76431 a 2.0 2.0
76442 b 3.0 4.0
76453 b NaN 4.0
76464 a 5.0 5.0
7647
7648Align the differences on columns
7649
7650>>> df.compare(df2)
7651 col1 col3
7652 self other self other
76530 a c NaN NaN
76542 NaN NaN 3.0 4.0
7655
7656Assign result_names
7657
7658>>> df.compare(df2, result_names=("left", "right"))
7659 col1 col3
7660 left right left right
76610 a c NaN NaN
76622 NaN NaN 3.0 4.0
7663
7664Stack the differences on rows
7665
7666>>> df.compare(df2, align_axis=0)
7667 col1 col3
76680 self a NaN
7669 other c NaN
76702 self NaN 3.0
7671 other NaN 4.0
7672
7673Keep the equal values
7674
7675>>> df.compare(df2, keep_equal=True)
7676 col1 col3
7677 self other self other
76780 a c 1.0 1.0
76792 b b 3.0 4.0
7680
7681Keep all original rows and columns
7682
7683>>> df.compare(df2, keep_shape=True)
7684 col1 col2 col3
7685 self other self other self other
76860 a c NaN NaN NaN NaN
76871 NaN NaN NaN NaN NaN NaN
76882 NaN NaN NaN NaN 3.0 4.0
76893 NaN NaN NaN NaN NaN NaN
76904 NaN NaN NaN NaN NaN NaN
7691
7692Keep all original rows and columns and also all original values
7693
7694>>> df.compare(df2, keep_shape=True, keep_equal=True)
7695 col1 col2 col3
7696 self other self other self other
76970 a c 1.0 1.0 1.0 1.0
76981 a a 2.0 2.0 2.0 2.0
76992 b b 3.0 3.0 3.0 4.0
77003 b b NaN NaN 4.0 4.0
77014 a a 5.0 5.0 5.0 5.0
7702""",
7703 klass=_shared_doc_kwargs["klass"],
7704 )
7705 def compare(
7706 self,
7707 other: DataFrame,
7708 align_axis: Axis = 1,
7709 keep_shape: bool = False,
7710 keep_equal: bool = False,
7711 result_names: Suffixes = ("self", "other"),
7712 ) -> DataFrame:
7713 return super().compare(
7714 other=other,
7715 align_axis=align_axis,
7716 keep_shape=keep_shape,
7717 keep_equal=keep_equal,
7718 result_names=result_names,
7719 )
7720
7721 def combine(
7722 self,
7723 other: DataFrame,
7724 func: Callable[[Series, Series], Series | Hashable],
7725 fill_value=None,
7726 overwrite: bool = True,
7727 ) -> DataFrame:
7728 """
7729 Perform column-wise combine with another DataFrame.
7730
7731 Combines a DataFrame with `other` DataFrame using `func`
7732 to element-wise combine columns. The row and column indexes of the
7733 resulting DataFrame will be the union of the two.
7734
7735 Parameters
7736 ----------
7737 other : DataFrame
7738 The DataFrame to merge column-wise.
        func : function
            Function that takes two Series as inputs and returns a Series or a
            scalar. Used to merge the two dataframes column by column.
7742 fill_value : scalar value, default None
7743 The value to fill NaNs with prior to passing any column to the
7744 merge func.
7745 overwrite : bool, default True
7746 If True, columns in `self` that do not exist in `other` will be
7747 overwritten with NaNs.
7748
7749 Returns
7750 -------
7751 DataFrame
7752 Combination of the provided DataFrames.
7753
7754 See Also
7755 --------
7756 DataFrame.combine_first : Combine two DataFrame objects and default to
7757 non-null values in frame calling the method.
7758
7759 Examples
7760 --------
7761 Combine using a simple function that chooses the smaller column.
7762
7763 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
7764 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
7765 >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
7766 >>> df1.combine(df2, take_smaller)
7767 A B
7768 0 0 3
7769 1 0 3
7770
7771 Example using a true element-wise combine function.
7772
7773 >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]})
7774 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
7775 >>> df1.combine(df2, np.minimum)
7776 A B
7777 0 1 2
7778 1 0 3
7779
7780 Using `fill_value` fills Nones prior to passing the column to the
7781 merge function.
7782
7783 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
7784 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
7785 >>> df1.combine(df2, take_smaller, fill_value=-5)
7786 A B
7787 0 0 -5.0
7788 1 0 4.0
7789
        However, if the same element in both dataframes is None, that None
        is preserved.
7792
7793 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
7794 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]})
7795 >>> df1.combine(df2, take_smaller, fill_value=-5)
7796 A B
7797 0 0 -5.0
7798 1 0 3.0
7799
        Example that demonstrates the use of `overwrite` and behavior when
        the axes differ between the dataframes.
7802
7803 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
7804 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2])
7805 >>> df1.combine(df2, take_smaller)
7806 A B C
7807 0 NaN NaN NaN
7808 1 NaN 3.0 -10.0
7809 2 NaN 3.0 1.0
7810
7811 >>> df1.combine(df2, take_smaller, overwrite=False)
7812 A B C
7813 0 0.0 NaN NaN
7814 1 0.0 3.0 -10.0
7815 2 NaN 3.0 1.0
7816
        Demonstrating the preference of the passed-in dataframe.
7818
7819 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2])
7820 >>> df2.combine(df1, take_smaller)
7821 A B C
7822 0 0.0 NaN NaN
7823 1 0.0 3.0 NaN
7824 2 NaN 3.0 NaN
7825
7826 >>> df2.combine(df1, take_smaller, overwrite=False)
7827 A B C
7828 0 0.0 NaN NaN
7829 1 0.0 3.0 1.0
7830 2 NaN 3.0 1.0
7831 """
7832 other_idxlen = len(other.index) # save for compare
7833
7834 this, other = self.align(other, copy=False)
7835 new_index = this.index
7836
7837 if other.empty and len(new_index) == len(self.index):
7838 return self.copy()
7839
7840 if self.empty and len(other) == other_idxlen:
7841 return other.copy()
7842
7843 # sorts if possible; otherwise align above ensures that these are set-equal
7844 new_columns = this.columns.union(other.columns)
7845 do_fill = fill_value is not None
7846 result = {}
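        # Combine column by column over the union of columns; after the align
        # above, `this` and `other` share the same row index.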
7847 for col in new_columns:
7848 series = this[col]
7849 other_series = other[col]
7850
7851 this_dtype = series.dtype
7852 other_dtype = other_series.dtype
7853
7854 this_mask = isna(series)
7855 other_mask = isna(other_series)
7856
7857 # don't overwrite columns unnecessarily
7858 # DO propagate if this column is not in the intersection
7859 if not overwrite and other_mask.all():
7860 result[col] = this[col].copy()
7861 continue
7862
7863 if do_fill:
7864 series = series.copy()
7865 other_series = other_series.copy()
7866 series[this_mask] = fill_value
7867 other_series[other_mask] = fill_value
7868
7869 if col not in self.columns:
                # If col is only present in `other`, then `series` (taken from
                # the aligned `this`) is all-NaN, so try casting it to
                # other's dtype.
7872 new_dtype = other_dtype
7873 try:
7874 series = series.astype(new_dtype, copy=False)
7875 except ValueError:
                    # e.g. new_dtype is an integer dtype that cannot hold NaN
7877 pass
7878 else:
7879 # if we have different dtypes, possibly promote
7880 new_dtype = find_common_type([this_dtype, other_dtype])
7881 series = series.astype(new_dtype, copy=False)
7882 other_series = other_series.astype(new_dtype, copy=False)
7883
7884 arr = func(series, other_series)
7885 if isinstance(new_dtype, np.dtype):
7886 # if new_dtype is an EA Dtype, then `func` is expected to return
7887 # the correct dtype without any additional casting
7888 # error: No overload variant of "maybe_downcast_to_dtype" matches
7889 # argument types "Union[Series, Hashable]", "dtype[Any]"
7890 arr = maybe_downcast_to_dtype( # type: ignore[call-overload]
7891 arr, new_dtype
7892 )
7893
7894 result[col] = arr
7895
        # let the constructor infer dtypes for the combined result
7897 return self._constructor(result, index=new_index, columns=new_columns)
7898
7899 def combine_first(self, other: DataFrame) -> DataFrame:
7900 """
7901 Update null elements with value in the same location in `other`.
7902
        Combine two DataFrame objects by filling null values in one DataFrame
        with non-null values from the other DataFrame. The row and column
        indexes of the resulting DataFrame will be the union of the two. When
        calling ``first.combine_first(second)``, the result keeps the values
        of `first` and takes values from `second` only where the corresponding
        ``first.loc[index, col]`` value is missing.
7910
7911 Parameters
7912 ----------
7913 other : DataFrame
7914 Provided DataFrame to use to fill null values.
7915
7916 Returns
7917 -------
7918 DataFrame
7919 The result of combining the provided DataFrame with the other object.
7920
7921 See Also
7922 --------
7923 DataFrame.combine : Perform series-wise operation on two DataFrames
7924 using a given function.
7925
7926 Examples
7927 --------
7928 >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
7929 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
7930 >>> df1.combine_first(df2)
7931 A B
7932 0 1.0 3.0
7933 1 0.0 4.0
7934
        Null values still persist if the location of that null value
        does not exist in `other`.
7937
7938 >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]})
7939 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
7940 >>> df1.combine_first(df2)
7941 A B C
7942 0 NaN 4.0 NaN
7943 1 0.0 3.0 1.0
7944 2 NaN 3.0 1.0
7945 """
7946 from pandas.core.computation import expressions
7947
7948 def combiner(x, y):
7949 mask = extract_array(isna(x))
7950
7951 x_values = extract_array(x, extract_numpy=True)
7952 y_values = extract_array(y, extract_numpy=True)
7953
7954 # If the column y in other DataFrame is not in first DataFrame,
7955 # just return y_values.
7956 if y.name not in self.columns:
7957 return y_values
7958
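            # expressions.where(cond, a, b) selects `a` where cond is True,
            # so this takes y's values where x is NA and keeps x elsewhere.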
7959 return expressions.where(mask, y_values, x_values)
7960
7961 combined = self.combine(other, combiner, overwrite=False)
7962
7963 dtypes = {
7964 col: find_common_type([self.dtypes[col], other.dtypes[col]])
7965 for col in self.columns.intersection(other.columns)
7966 if not is_dtype_equal(combined.dtypes[col], self.dtypes[col])
7967 }
7968
7969 if dtypes:
7970 combined = combined.astype(dtypes)
7971
7972 return combined
7973
7974 def update(
7975 self,
7976 other,
7977 join: str = "left",
7978 overwrite: bool = True,
7979 filter_func=None,
7980 errors: str = "ignore",
7981 ) -> None:
7982 """
7983 Modify in place using non-NA values from another DataFrame.
7984
7985 Aligns on indices. There is no return value.
7986
7987 Parameters
7988 ----------
7989 other : DataFrame, or object coercible into a DataFrame
7990 Should have at least one matching index/column label
7991 with the original DataFrame. If a Series is passed,
7992 its name attribute must be set, and that will be
7993 used as the column name to align with the original DataFrame.
7994 join : {'left'}, default 'left'
7995 Only left join is implemented, keeping the index and columns of the
7996 original object.
7997 overwrite : bool, default True
7998 How to handle non-NA values for overlapping keys:
7999
8000 * True: overwrite original DataFrame's values
8001 with values from `other`.
8002 * False: only update values that are NA in
8003 the original DataFrame.
8004
8005 filter_func : callable(1d-array) -> bool 1d-array, optional
8006 Can choose to replace values other than NA. Return True for values
8007 that should be updated.
8008 errors : {'raise', 'ignore'}, default 'ignore'
8009 If 'raise', will raise a ValueError if the DataFrame and `other`
8010 both contain non-NA data in the same place.
8011
8012 Returns
8013 -------
8014 None
8015 This method directly changes calling object.
8016
8017 Raises
8018 ------
8019 ValueError
8020 * When `errors='raise'` and there's overlapping non-NA data.
8021 * When `errors` is not either `'ignore'` or `'raise'`
8022 NotImplementedError
8023 * If `join != 'left'`
8024
8025 See Also
8026 --------
8027 dict.update : Similar method for dictionaries.
8028 DataFrame.merge : For column(s)-on-column(s) operations.
8029
8030 Examples
8031 --------
8032 >>> df = pd.DataFrame({'A': [1, 2, 3],
8033 ... 'B': [400, 500, 600]})
8034 >>> new_df = pd.DataFrame({'B': [4, 5, 6],
8035 ... 'C': [7, 8, 9]})
8036 >>> df.update(new_df)
8037 >>> df
8038 A B
8039 0 1 4
8040 1 2 5
8041 2 3 6
8042
8043 The DataFrame's length does not increase as a result of the update,
8044 only values at matching index/column labels are updated.
8045
8046 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
8047 ... 'B': ['x', 'y', 'z']})
8048 >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
8049 >>> df.update(new_df)
8050 >>> df
8051 A B
8052 0 a d
8053 1 b e
8054 2 c f
8055
8056 For Series, its name attribute must be set.
8057
8058 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
8059 ... 'B': ['x', 'y', 'z']})
8060 >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2])
8061 >>> df.update(new_column)
8062 >>> df
8063 A B
8064 0 a d
8065 1 b y
8066 2 c e
8067 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
8068 ... 'B': ['x', 'y', 'z']})
8069 >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2])
8070 >>> df.update(new_df)
8071 >>> df
8072 A B
8073 0 a x
8074 1 b d
8075 2 c e
8076
8077 If `other` contains NaNs the corresponding values are not updated
8078 in the original dataframe.
8079
8080 >>> df = pd.DataFrame({'A': [1, 2, 3],
8081 ... 'B': [400, 500, 600]})
8082 >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
8083 >>> df.update(new_df)
8084 >>> df
8085 A B
8086 0 1 4
8087 1 2 500
8088 2 3 6
8089 """
8090 from pandas.core.computation import expressions
8091
8092 # TODO: Support other joins
8093 if join != "left": # pragma: no cover
8094 raise NotImplementedError("Only left join is supported")
8095 if errors not in ["ignore", "raise"]:
8096 raise ValueError("The parameter errors must be either 'ignore' or 'raise'")
8097
8098 if not isinstance(other, DataFrame):
8099 other = DataFrame(other)
8100
8101 other = other.reindex(self.index)
8102
8103 for col in self.columns.intersection(other.columns):
8104 this = self[col]._values
8105 that = other[col]._values
8106
8107 if filter_func is not None:
8108 with np.errstate(all="ignore"):
8109 mask = ~filter_func(this) | isna(that)
8110 else:
8111 if errors == "raise":
8112 mask_this = notna(that)
8113 mask_that = notna(this)
8114 if any(mask_this & mask_that):
8115 raise ValueError("Data overlaps.")
8116
8117 if overwrite:
8118 mask = isna(that)
8119 else:
8120 mask = notna(this)
8121
8122 # don't overwrite columns unnecessarily
8123 if mask.all():
8124 continue
8125
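            # `mask` is True where the existing value should be kept; values
            # from `other` are taken everywhere else.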
8126 self.loc[:, col] = expressions.where(mask, this, that)
8127
8128 # ----------------------------------------------------------------------
8129 # Data reshaping
8130 @Appender(
8131 """
8132Examples
8133--------
8134>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
8135... 'Parrot', 'Parrot'],
8136... 'Max Speed': [380., 370., 24., 26.]})
8137>>> df
8138 Animal Max Speed
81390 Falcon 380.0
81401 Falcon 370.0
81412 Parrot 24.0
81423 Parrot 26.0
8143>>> df.groupby(['Animal']).mean()
8144 Max Speed
8145Animal
8146Falcon 375.0
8147Parrot 25.0
8148
8149**Hierarchical Indexes**
8150
8151We can groupby different levels of a hierarchical index
8152using the `level` parameter:
8153
8154>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
8155... ['Captive', 'Wild', 'Captive', 'Wild']]
8156>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
8157>>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]},
8158... index=index)
8159>>> df
8160 Max Speed
8161Animal Type
8162Falcon Captive 390.0
8163 Wild 350.0
8164Parrot Captive 30.0
8165 Wild 20.0
8166>>> df.groupby(level=0).mean()
8167 Max Speed
8168Animal
8169Falcon 370.0
8170Parrot 25.0
8171>>> df.groupby(level="Type").mean()
8172 Max Speed
8173Type
8174Captive 210.0
8175Wild 185.0
8176
We can also choose to include NA in group keys or not by setting the
`dropna` parameter; the default setting is `True`.
8179
8180>>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
8181>>> df = pd.DataFrame(l, columns=["a", "b", "c"])
8182
8183>>> df.groupby(by=["b"]).sum()
8184 a c
8185b
81861.0 2 3
81872.0 2 5
8188
8189>>> df.groupby(by=["b"], dropna=False).sum()
8190 a c
8191b
81921.0 2 3
81932.0 2 5
8194NaN 1 4
8195
8196>>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
8197>>> df = pd.DataFrame(l, columns=["a", "b", "c"])
8198
8199>>> df.groupby(by="a").sum()
8200 b c
8201a
8202a 13.0 13.0
8203b 12.3 123.0
8204
8205>>> df.groupby(by="a", dropna=False).sum()
8206 b c
8207a
8208a 13.0 13.0
8209b 12.3 123.0
8210NaN 12.3 33.0
8211
8212When using ``.apply()``, use ``group_keys`` to include or exclude the group keys.
8213The ``group_keys`` argument defaults to ``True`` (include).
8214
8215>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
8216... 'Parrot', 'Parrot'],
8217... 'Max Speed': [380., 370., 24., 26.]})
8218>>> df.groupby("Animal", group_keys=True).apply(lambda x: x)
8219 Animal Max Speed
8220Animal
8221Falcon 0 Falcon 380.0
8222 1 Falcon 370.0
8223Parrot 2 Parrot 24.0
8224 3 Parrot 26.0
8225
8226>>> df.groupby("Animal", group_keys=False).apply(lambda x: x)
8227 Animal Max Speed
82280 Falcon 380.0
82291 Falcon 370.0
82302 Parrot 24.0
82313 Parrot 26.0
8232"""
8233 )
8234 @Appender(_shared_docs["groupby"] % _shared_doc_kwargs)
8235 def groupby(
8236 self,
8237 by=None,
8238 axis: Axis = 0,
8239 level: IndexLabel | None = None,
8240 as_index: bool = True,
8241 sort: bool = True,
8242 group_keys: bool = True,
8243 observed: bool = False,
8244 dropna: bool = True,
8245 ) -> DataFrameGroupBy:
8246 from pandas.core.groupby.generic import DataFrameGroupBy
8247
8248 if level is None and by is None:
8249 raise TypeError("You have to supply one of 'by' and 'level'")
8250 axis = self._get_axis_number(axis)
8251
8252 return DataFrameGroupBy(
8253 obj=self,
8254 keys=by,
8255 axis=axis,
8256 level=level,
8257 as_index=as_index,
8258 sort=sort,
8259 group_keys=group_keys,
8260 observed=observed,
8261 dropna=dropna,
8262 )
8263
8264 _shared_docs[
8265 "pivot"
8266 ] = """
8267 Return reshaped DataFrame organized by given index / column values.
8268
8269 Reshape data (produce a "pivot" table) based on column values. Uses
8270 unique values from specified `index` / `columns` to form axes of the
8271 resulting DataFrame. This function does not support data
    aggregation; multiple values will result in a MultiIndex in the
8273 columns. See the :ref:`User Guide <reshaping>` for more on reshaping.
8274
8275 Parameters
8276 ----------%s
8277 columns : str or object or a list of str
8278 Column to use to make new frame's columns.
8279
8280 .. versionchanged:: 1.1.0
           Also accept a list of column names.
8282
8283 index : str or object or a list of str, optional
8284 Column to use to make new frame's index. If not given, uses existing index.
8285
8286 .. versionchanged:: 1.1.0
           Also accept a list of index names.
8288
8289 values : str, object or a list of the previous, optional
8290 Column(s) to use for populating new frame's values. If not
8291 specified, all remaining columns will be used and the result will
8292 have hierarchically indexed columns.
8293
8294 Returns
8295 -------
8296 DataFrame
8297 Returns reshaped DataFrame.
8298
8299 Raises
8300 ------
8301 ValueError:
        When there are any `index`, `columns` combinations with multiple
        values. Use `DataFrame.pivot_table` when you need to aggregate.
8304
8305 See Also
8306 --------
8307 DataFrame.pivot_table : Generalization of pivot that can handle
8308 duplicate values for one index/column pair.
8309 DataFrame.unstack : Pivot based on the index values instead of a
8310 column.
8311 wide_to_long : Wide panel to long format. Less flexible but more
8312 user-friendly than melt.
8313
8314 Notes
8315 -----
8316 For finer-tuned control, see hierarchical indexing documentation along
8317 with the related stack/unstack methods.
8318
8319 Reference :ref:`the user guide <reshaping.pivot>` for more examples.
8320
8321 Examples
8322 --------
8323 >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
8324 ... 'two'],
8325 ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
8326 ... 'baz': [1, 2, 3, 4, 5, 6],
8327 ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
8328 >>> df
8329 foo bar baz zoo
8330 0 one A 1 x
8331 1 one B 2 y
8332 2 one C 3 z
8333 3 two A 4 q
8334 4 two B 5 w
8335 5 two C 6 t
8336
8337 >>> df.pivot(index='foo', columns='bar', values='baz')
8338 bar A B C
8339 foo
8340 one 1 2 3
8341 two 4 5 6
8342
8343 >>> df.pivot(index='foo', columns='bar')['baz']
8344 bar A B C
8345 foo
8346 one 1 2 3
8347 two 4 5 6
8348
8349 >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
8350 baz zoo
8351 bar A B C A B C
8352 foo
8353 one 1 2 3 x y z
8354 two 4 5 6 q w t
8355
8356 You could also assign a list of column names or a list of index names.
8357
8358 >>> df = pd.DataFrame({
8359 ... "lev1": [1, 1, 1, 2, 2, 2],
8360 ... "lev2": [1, 1, 2, 1, 1, 2],
8361 ... "lev3": [1, 2, 1, 2, 1, 2],
8362 ... "lev4": [1, 2, 3, 4, 5, 6],
8363 ... "values": [0, 1, 2, 3, 4, 5]})
8364 >>> df
8365 lev1 lev2 lev3 lev4 values
8366 0 1 1 1 1 0
8367 1 1 1 2 2 1
8368 2 1 2 1 3 2
8369 3 2 1 2 4 3
8370 4 2 1 1 5 4
8371 5 2 2 2 6 5
8372
8373 >>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values")
8374 lev2 1 2
8375 lev3 1 2 1 2
8376 lev1
8377 1 0.0 1.0 2.0 NaN
8378 2 4.0 3.0 NaN 5.0
8379
8380 >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values")
8381 lev3 1 2
8382 lev1 lev2
8383 1 1 0.0 1.0
8384 2 2.0 NaN
8385 2 1 4.0 3.0
8386 2 NaN 5.0
8387
8388 A ValueError is raised if there are any duplicates.
8389
8390 >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
8391 ... "bar": ['A', 'A', 'B', 'C'],
8392 ... "baz": [1, 2, 3, 4]})
8393 >>> df
8394 foo bar baz
8395 0 one A 1
8396 1 one A 2
8397 2 two B 3
8398 3 two C 4
8399
8400 Notice that the first two rows are the same for our `index`
8401 and `columns` arguments.
8402
8403 >>> df.pivot(index='foo', columns='bar', values='baz')
8404 Traceback (most recent call last):
8405 ...
8406 ValueError: Index contains duplicate entries, cannot reshape
8407 """
8408
8409 @Substitution("")
8410 @Appender(_shared_docs["pivot"])
8411 def pivot(self, *, columns, index=lib.NoDefault, values=lib.NoDefault) -> DataFrame:
8412 from pandas.core.reshape.pivot import pivot
8413
8414 return pivot(self, index=index, columns=columns, values=values)
8415
8416 _shared_docs[
8417 "pivot_table"
8418 ] = """
8419 Create a spreadsheet-style pivot table as a DataFrame.
8420
8421 The levels in the pivot table will be stored in MultiIndex objects
8422 (hierarchical indexes) on the index and columns of the result DataFrame.
8423
8424 Parameters
8425 ----------%s
8426 values : list-like or scalar, optional
8427 Column or columns to aggregate.
8428 index : column, Grouper, array, or list of the previous
8429 If an array is passed, it must be the same length as the data. The
8430 list can contain any of the other types (except list).
        Keys to group by on the pivot table index. If an array is passed,
        it is used in the same manner as column values.
8433 columns : column, Grouper, array, or list of the previous
8434 If an array is passed, it must be the same length as the data. The
8435 list can contain any of the other types (except list).
        Keys to group by on the pivot table column. If an array is passed,
        it is used in the same manner as column values.
    aggfunc : function, list of functions, dict, default "mean"
        If a list of functions is passed, the resulting pivot table will have
        hierarchical columns whose top level are the function names
        (inferred from the function objects themselves).
        If a dict is passed, the key is the column to aggregate and the value
        is the function or list of functions. If ``margins=True``,
        aggfunc will be used to calculate the partial aggregates.
8445 fill_value : scalar, default None
8446 Value to replace missing values with (in the resulting pivot table,
8447 after aggregation).
8448 margins : bool, default False
8449 If ``margins=True``, special ``All`` columns and rows
8450 will be added with partial group aggregates across the categories
8451 on the rows and columns.
8452 dropna : bool, default True
8453 Do not include columns whose entries are all NaN. If True,
8454 rows with a NaN value in any column will be omitted before
8455 computing margins.
8456 margins_name : str, default 'All'
8457 Name of the row / column that will contain the totals
8458 when margins is True.
8459 observed : bool, default False
8460 This only applies if any of the groupers are Categoricals.
8461 If True: only show observed values for categorical groupers.
8462 If False: show all values for categorical groupers.
8463
8464 sort : bool, default True
8465 Specifies if the result should be sorted.
8466
8467 .. versionadded:: 1.3.0
8468
8469 Returns
8470 -------
8471 DataFrame
8472 An Excel style pivot table.
8473
8474 See Also
8475 --------
8476 DataFrame.pivot : Pivot without aggregation that can handle
8477 non-numeric data.
8478 DataFrame.melt: Unpivot a DataFrame from wide to long format,
8479 optionally leaving identifiers set.
8480 wide_to_long : Wide panel to long format. Less flexible but more
8481 user-friendly than melt.
8482
8483 Notes
8484 -----
8485 Reference :ref:`the user guide <reshaping.pivot>` for more examples.
8486
8487 Examples
8488 --------
8489 >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
8490 ... "bar", "bar", "bar", "bar"],
8491 ... "B": ["one", "one", "one", "two", "two",
8492 ... "one", "one", "two", "two"],
8493 ... "C": ["small", "large", "large", "small",
8494 ... "small", "large", "small", "small",
8495 ... "large"],
8496 ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
8497 ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
8498 >>> df
8499 A B C D E
8500 0 foo one small 1 2
8501 1 foo one large 2 4
8502 2 foo one large 2 5
8503 3 foo two small 3 5
8504 4 foo two small 3 6
8505 5 bar one large 4 6
8506 6 bar one small 5 8
8507 7 bar two small 6 9
8508 8 bar two large 7 9
8509
8510 This first example aggregates values by taking the sum.
8511
8512 >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
8513 ... columns=['C'], aggfunc=np.sum)
8514 >>> table
8515 C large small
8516 A B
8517 bar one 4.0 5.0
8518 two 7.0 6.0
8519 foo one 4.0 1.0
8520 two NaN 6.0
8521
8522 We can also fill missing values using the `fill_value` parameter.
8523
8524 >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
8525 ... columns=['C'], aggfunc=np.sum, fill_value=0)
8526 >>> table
8527 C large small
8528 A B
8529 bar one 4 5
8530 two 7 6
8531 foo one 4 1
8532 two 0 6
8533
8534 The next example aggregates by taking the mean across multiple columns.
8535
8536 >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
8537 ... aggfunc={'D': np.mean, 'E': np.mean})
8538 >>> table
8539 D E
8540 A C
8541 bar large 5.500000 7.500000
8542 small 5.500000 8.500000
8543 foo large 2.000000 4.500000
8544 small 2.333333 4.333333
8545
8546 We can also calculate multiple types of aggregations for any given
8547 value column.
8548
8549 >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
8550 ... aggfunc={'D': np.mean,
8551 ... 'E': [min, max, np.mean]})
8552 >>> table
8553 D E
8554 mean max mean min
8555 A C
8556 bar large 5.500000 9 7.500000 6
8557 small 5.500000 9 8.500000 8
8558 foo large 2.000000 5 4.500000 4
8559 small 2.333333 6 4.333333 2
8560 """
8561
8562 @Substitution("")
8563 @Appender(_shared_docs["pivot_table"])
8564 def pivot_table(
8565 self,
8566 values=None,
8567 index=None,
8568 columns=None,
8569 aggfunc: AggFuncType = "mean",
8570 fill_value=None,
8571 margins: bool = False,
8572 dropna: bool = True,
8573 margins_name: Level = "All",
8574 observed: bool = False,
8575 sort: bool = True,
8576 ) -> DataFrame:
8577 from pandas.core.reshape.pivot import pivot_table
8578
8579 return pivot_table(
8580 self,
8581 values=values,
8582 index=index,
8583 columns=columns,
8584 aggfunc=aggfunc,
8585 fill_value=fill_value,
8586 margins=margins,
8587 dropna=dropna,
8588 margins_name=margins_name,
8589 observed=observed,
8590 sort=sort,
8591 )
8592
8593 def stack(self, level: Level = -1, dropna: bool = True):
8594 """
8595 Stack the prescribed level(s) from columns to index.
8596
8597 Return a reshaped DataFrame or Series having a multi-level
8598 index with one or more new inner-most levels compared to the current
8599 DataFrame. The new inner-most levels are created by pivoting the
8600 columns of the current dataframe:
8601
8602 - if the columns have a single level, the output is a Series;
8603 - if the columns have multiple levels, the new index
8604 level(s) is (are) taken from the prescribed level(s) and
8605 the output is a DataFrame.
8606
8607 Parameters
8608 ----------
8609 level : int, str, list, default -1
8610 Level(s) to stack from the column axis onto the index
8611 axis, defined as one index or label, or a list of indices
8612 or labels.
8613 dropna : bool, default True
8614 Whether to drop rows in the resulting Frame/Series with
8615 missing values. Stacking a column level onto the index
8616 axis can create combinations of index and column values
8617 that are missing from the original dataframe. See Examples
8618 section.
8619
8620 Returns
8621 -------
8622 DataFrame or Series
8623 Stacked dataframe or series.
8624
8625 See Also
8626 --------
8627 DataFrame.unstack : Unstack prescribed level(s) from index axis
8628 onto column axis.
8629 DataFrame.pivot : Reshape dataframe from long format to wide
8630 format.
8631 DataFrame.pivot_table : Create a spreadsheet-style pivot table
8632 as a DataFrame.
8633
8634 Notes
8635 -----
8636 The function is named by analogy with a collection of books
8637 being reorganized from being side by side on a horizontal
8638 position (the columns of the dataframe) to being stacked
8639 vertically on top of each other (in the index of the
8640 dataframe).
8641
8642 Reference :ref:`the user guide <reshaping.stacking>` for more examples.
8643
8644 Examples
8645 --------
8646 **Single level columns**
8647
8648 >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
8649 ... index=['cat', 'dog'],
8650 ... columns=['weight', 'height'])
8651
8652 Stacking a dataframe with a single level column axis returns a Series:
8653
8654 >>> df_single_level_cols
8655 weight height
8656 cat 0 1
8657 dog 2 3
8658 >>> df_single_level_cols.stack()
8659 cat weight 0
8660 height 1
8661 dog weight 2
8662 height 3
8663 dtype: int64
8664
8665 **Multi level columns: simple case**
8666
8667 >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
8668 ... ('weight', 'pounds')])
8669 >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],
8670 ... index=['cat', 'dog'],
8671 ... columns=multicol1)
8672
8673 Stacking a dataframe with a multi-level column axis:
8674
8675 >>> df_multi_level_cols1
8676 weight
8677 kg pounds
8678 cat 1 2
8679 dog 2 4
8680 >>> df_multi_level_cols1.stack()
8681 weight
8682 cat kg 1
8683 pounds 2
8684 dog kg 2
8685 pounds 4
8686
8687 **Missing values**
8688
8689 >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
8690 ... ('height', 'm')])
8691 >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
8692 ... index=['cat', 'dog'],
8693 ... columns=multicol2)
8694
8695 It is common to have missing values when stacking a dataframe
8696 with multi-level columns, as the stacked dataframe typically
8697 has more values than the original dataframe. Missing values
8698 are filled with NaNs:
8699
8700 >>> df_multi_level_cols2
8701 weight height
8702 kg m
8703 cat 1.0 2.0
8704 dog 3.0 4.0
8705 >>> df_multi_level_cols2.stack()
8706 height weight
8707 cat kg NaN 1.0
8708 m 2.0 NaN
8709 dog kg NaN 3.0
8710 m 4.0 NaN
8711
8712 **Prescribing the level(s) to be stacked**
8713
8714 The first parameter controls which level or levels are stacked:
8715
8716 >>> df_multi_level_cols2.stack(0)
8717 kg m
8718 cat height NaN 2.0
8719 weight 1.0 NaN
8720 dog height NaN 4.0
8721 weight 3.0 NaN
8722 >>> df_multi_level_cols2.stack([0, 1])
8723 cat height m 2.0
8724 weight kg 1.0
8725 dog height m 4.0
8726 weight kg 3.0
8727 dtype: float64
8728
8729 **Dropping missing values**
8730
8731 >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]],
8732 ... index=['cat', 'dog'],
8733 ... columns=multicol2)
8734
8735 Note that rows where all values are missing are dropped by
8736 default but this behaviour can be controlled via the dropna
8737 keyword parameter:
8738
8739 >>> df_multi_level_cols3
8740 weight height
8741 kg m
8742 cat NaN 1.0
8743 dog 2.0 3.0
8744 >>> df_multi_level_cols3.stack(dropna=False)
8745 height weight
8746 cat kg NaN NaN
8747 m 1.0 NaN
8748 dog kg NaN 2.0
8749 m 3.0 NaN
8750 >>> df_multi_level_cols3.stack(dropna=True)
8751 height weight
8752 cat m 1.0 NaN
8753 dog kg NaN 2.0
8754 m 3.0 NaN
8755 """
8756 from pandas.core.reshape.reshape import (
8757 stack,
8758 stack_multiple,
8759 )
8760
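        # Stacking a list/tuple of levels is delegated to stack_multiple,
        # which stacks the requested levels one after another.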
8761 if isinstance(level, (tuple, list)):
8762 result = stack_multiple(self, level, dropna=dropna)
8763 else:
8764 result = stack(self, level, dropna=dropna)
8765
8766 return result.__finalize__(self, method="stack")
8767
8768 def explode(
8769 self,
8770 column: IndexLabel,
8771 ignore_index: bool = False,
8772 ) -> DataFrame:
8773 """
8774 Transform each element of a list-like to a row, replicating index values.
8775
8776 Parameters
8777 ----------
8778 column : IndexLabel
8779 Column(s) to explode.
            For multiple columns, specify a non-empty list in which each
            element is a str or tuple; the list-like data in all specified
            columns must have matching lengths within each row of the frame.
8783
8784 .. versionadded:: 1.3.0
8785 Multi-column explode
8786
8787 ignore_index : bool, default False
8788 If True, the resulting index will be labeled 0, 1, …, n - 1.
8789
8790 .. versionadded:: 1.1.0
8791
8792 Returns
8793 -------
8794 DataFrame
8795 Exploded lists to rows of the subset columns;
8796 index will be duplicated for these rows.
8797
8798 Raises
8799 ------
        ValueError :
            * If the columns of the frame are not unique.
            * If the specified columns to explode are an empty list.
            * If the specified columns to explode do not have matching
              counts of elements rowwise in the frame.
8805
8806 See Also
8807 --------
8808 DataFrame.unstack : Pivot a level of the (necessarily hierarchical)
8809 index labels.
8810 DataFrame.melt : Unpivot a DataFrame from wide format to long format.
        Series.explode : Transform each element of a list-like to a row.
8812
8813 Notes
8814 -----
8815 This routine will explode list-likes including lists, tuples, sets,
8816 Series, and np.ndarray. The result dtype of the subset rows will
8817 be object. Scalars will be returned unchanged, and empty list-likes will
8818 result in a np.nan for that row. In addition, the ordering of rows in the
8819 output will be non-deterministic when exploding sets.
8820
8821 Reference :ref:`the user guide <reshaping.explode>` for more examples.
8822
8823 Examples
8824 --------
8825 >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]],
8826 ... 'B': 1,
8827 ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]})
8828 >>> df
8829 A B C
8830 0 [0, 1, 2] 1 [a, b, c]
8831 1 foo 1 NaN
8832 2 [] 1 []
8833 3 [3, 4] 1 [d, e]
8834
8835 Single-column explode.
8836
8837 >>> df.explode('A')
8838 A B C
8839 0 0 1 [a, b, c]
8840 0 1 1 [a, b, c]
8841 0 2 1 [a, b, c]
8842 1 foo 1 NaN
8843 2 NaN 1 []
8844 3 3 1 [d, e]
8845 3 4 1 [d, e]
8846
8847 Multi-column explode.
8848
8849 >>> df.explode(list('AC'))
8850 A B C
8851 0 0 1 a
8852 0 1 1 b
8853 0 2 1 c
8854 1 foo 1 NaN
8855 2 NaN 1 NaN
8856 3 3 1 d
8857 3 4 1 e
8858 """
8859 if not self.columns.is_unique:
8860 duplicate_cols = self.columns[self.columns.duplicated()].tolist()
8861 raise ValueError(
8862 f"DataFrame columns must be unique. Duplicate columns: {duplicate_cols}"
8863 )
8864
8865 columns: list[Hashable]
8866 if is_scalar(column) or isinstance(column, tuple):
8867 columns = [column]
8868 elif isinstance(column, list) and all(
8869 is_scalar(c) or isinstance(c, tuple) for c in column
8870 ):
8871 if not column:
8872 raise ValueError("column must be nonempty")
8873 if len(column) > len(set(column)):
8874 raise ValueError("column must be unique")
8875 columns = column
8876 else:
8877 raise ValueError("column must be a scalar, tuple, or list thereof")
8878
8879 df = self.reset_index(drop=True)
8880 if len(columns) == 1:
8881 result = df[columns[0]].explode()
8882 else:
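            # Rowwise element counts must match across all exploded columns;
            # scalars and empty list-likes count as length 1 here.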
8883 mylen = lambda x: len(x) if (is_list_like(x) and len(x) > 0) else 1
8884 counts0 = self[columns[0]].apply(mylen)
8885 for c in columns[1:]:
8886 if not all(counts0 == self[c].apply(mylen)):
8887 raise ValueError("columns must have matching element counts")
8888 result = DataFrame({c: df[c].explode() for c in columns})
8889 result = df.drop(columns, axis=1).join(result)
8890 if ignore_index:
8891 result.index = default_index(len(result))
8892 else:
8893 result.index = self.index.take(result.index)
8894 result = result.reindex(columns=self.columns, copy=False)
8895
8896 return result.__finalize__(self, method="explode")
8897
8898 def unstack(self, level: Level = -1, fill_value=None):
8899 """
8900 Pivot a level of the (necessarily hierarchical) index labels.
8901
8902 Returns a DataFrame having a new level of column labels whose inner-most level
8903 consists of the pivoted index labels.
8904
8905 If the index is not a MultiIndex, the output will be a Series
8906 (the analogue of stack when the columns are not a MultiIndex).
8907
8908 Parameters
8909 ----------
8910 level : int, str, or list of these, default -1 (last level)
8911 Level(s) of index to unstack, can pass level name.
8912 fill_value : int, str or dict
8913 Replace NaN with this value if the unstack produces missing values.
8914
8915 Returns
8916 -------
8917 Series or DataFrame
8918
8919 See Also
8920 --------
8921 DataFrame.pivot : Pivot a table based on column values.
8922 DataFrame.stack : Pivot a level of the column labels (inverse operation
8923 from `unstack`).
8924
8925 Notes
8926 -----
8927 Reference :ref:`the user guide <reshaping.stacking>` for more examples.
8928
8929 Examples
8930 --------
8931 >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
8932 ... ('two', 'a'), ('two', 'b')])
8933 >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
8934 >>> s
8935 one a 1.0
8936 b 2.0
8937 two a 3.0
8938 b 4.0
8939 dtype: float64
8940
8941 >>> s.unstack(level=-1)
8942 a b
8943 one 1.0 2.0
8944 two 3.0 4.0
8945
8946 >>> s.unstack(level=0)
8947 one two
8948 a 1.0 3.0
8949 b 2.0 4.0
8950
8951 >>> df = s.unstack(level=0)
8952 >>> df.unstack()
8953 one a 1.0
8954 b 2.0
8955 two a 3.0
8956 b 4.0
8957 dtype: float64
8958 """
8959 from pandas.core.reshape.reshape import unstack
8960
8961 result = unstack(self, level, fill_value)
8962
8963 return result.__finalize__(self, method="unstack")
8964
8965 @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"})
8966 def melt(
8967 self,
8968 id_vars=None,
8969 value_vars=None,
8970 var_name=None,
8971 value_name: Hashable = "value",
8972 col_level: Level = None,
8973 ignore_index: bool = True,
8974 ) -> DataFrame:
8975 return melt(
8976 self,
8977 id_vars=id_vars,
8978 value_vars=value_vars,
8979 var_name=var_name,
8980 value_name=value_name,
8981 col_level=col_level,
8982 ignore_index=ignore_index,
8983 ).__finalize__(self, method="melt")
8984
8985 # ----------------------------------------------------------------------
8986 # Time series-related
8987
8988 @doc(
8989 Series.diff,
8990 klass="DataFrame",
8991 extra_params="axis : {0 or 'index', 1 or 'columns'}, default 0\n "
8992 "Take difference over rows (0) or columns (1).\n",
8993 other_klass="Series",
8994 examples=dedent(
8995 """
8996 Difference with previous row
8997
8998 >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
8999 ... 'b': [1, 1, 2, 3, 5, 8],
9000 ... 'c': [1, 4, 9, 16, 25, 36]})
9001 >>> df
9002 a b c
9003 0 1 1 1
9004 1 2 1 4
9005 2 3 2 9
9006 3 4 3 16
9007 4 5 5 25
9008 5 6 8 36
9009
9010 >>> df.diff()
9011 a b c
9012 0 NaN NaN NaN
9013 1 1.0 0.0 3.0
9014 2 1.0 1.0 5.0
9015 3 1.0 1.0 7.0
9016 4 1.0 2.0 9.0
9017 5 1.0 3.0 11.0
9018
9019 Difference with previous column
9020
9021 >>> df.diff(axis=1)
9022 a b c
9023 0 NaN 0 0
9024 1 NaN -1 3
9025 2 NaN -1 7
9026 3 NaN -1 13
9027 4 NaN 0 20
9028 5 NaN 2 28
9029
9030 Difference with 3rd previous row
9031
9032 >>> df.diff(periods=3)
9033 a b c
9034 0 NaN NaN NaN
9035 1 NaN NaN NaN
9036 2 NaN NaN NaN
9037 3 3.0 2.0 15.0
9038 4 3.0 4.0 21.0
9039 5 3.0 6.0 27.0
9040
9041 Difference with following row
9042
9043 >>> df.diff(periods=-1)
9044 a b c
9045 0 -1.0 0.0 -3.0
9046 1 -1.0 -1.0 -5.0
9047 2 -1.0 -1.0 -7.0
9048 3 -1.0 -2.0 -9.0
9049 4 -1.0 -3.0 -11.0
9050 5 NaN NaN NaN
9051
9052 Overflow in input dtype
9053
9054 >>> df = pd.DataFrame({'a': [1, 0]}, dtype=np.uint8)
9055 >>> df.diff()
9056 a
9057 0 NaN
9058 1 255.0"""
9059 ),
9060 )
9061 def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame:
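        # Accept integer-valued floats for `periods` (e.g. 1.0) and coerce
        # them to int; non-integral values raise below.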
9062 if not lib.is_integer(periods):
9063 if not (
9064 is_float(periods)
9065 # error: "int" has no attribute "is_integer"
9066 and periods.is_integer() # type: ignore[attr-defined]
9067 ):
9068 raise ValueError("periods must be an integer")
9069 periods = int(periods)
9070
9071 axis = self._get_axis_number(axis)
9072 if axis == 1:
9073 if periods != 0:
                # In the periods == 0 case, this is equivalent to a diff of 0
                # periods along axis=0, and the Manager method may be somewhat
                # more performant, so we only dispatch here when periods != 0.
9077 return self - self.shift(periods, axis=axis)
9078 # With periods=0 this is equivalent to a diff with axis=0
9079 axis = 0
9080
9081 new_data = self._mgr.diff(n=periods, axis=axis)
9082 return self._constructor(new_data).__finalize__(self, "diff")
9083
9084 # ----------------------------------------------------------------------
9085 # Function application
9086
9087 def _gotitem(
9088 self,
9089 key: IndexLabel,
9090 ndim: int,
9091 subset: DataFrame | Series | None = None,
9092 ) -> DataFrame | Series:
9093 """
9094 Sub-classes to define. Return a sliced object.
9095
9096 Parameters
9097 ----------
9098 key : string / list of selections
9099 ndim : {1, 2}
9100 requested ndim of result
9101 subset : object, default None
9102 subset to act on
9103 """
9104 if subset is None:
9105 subset = self
9106 elif subset.ndim == 1: # is Series
9107 return subset
9108
9109 # TODO: _shallow_copy(subset)?
9110 return subset[key]
9111
9112 _agg_summary_and_see_also_doc = dedent(
9113 """
9114 The aggregation operations are always performed over an axis, either the
9115 index (default) or the column axis. This behavior is different from
9116 `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`,
9117 `var`), where the default is to compute the aggregation of the flattened
9118 array, e.g., ``numpy.mean(arr_2d)`` as opposed to
9119 ``numpy.mean(arr_2d, axis=0)``.
9120
9121 `agg` is an alias for `aggregate`. Use the alias.
9122
9123 See Also
9124 --------
    DataFrame.apply : Perform any type of operation.
9126 DataFrame.transform : Perform transformation type operations.
9127 core.groupby.GroupBy : Perform operations over groups.
9128 core.resample.Resampler : Perform operations over resampled bins.
9129 core.window.Rolling : Perform operations over rolling window.
9130 core.window.Expanding : Perform operations over expanding window.
9131 core.window.ExponentialMovingWindow : Perform operation over exponential weighted
9132 window.
9133 """
9134 )
9135
9136 _agg_examples_doc = dedent(
9137 """
9138 Examples
9139 --------
9140 >>> df = pd.DataFrame([[1, 2, 3],
9141 ... [4, 5, 6],
9142 ... [7, 8, 9],
9143 ... [np.nan, np.nan, np.nan]],
9144 ... columns=['A', 'B', 'C'])
9145
9146 Aggregate these functions over the rows.
9147
9148 >>> df.agg(['sum', 'min'])
9149 A B C
9150 sum 12.0 15.0 18.0
9151 min 1.0 2.0 3.0
9152
9153 Different aggregations per column.
9154
9155 >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
9156 A B
9157 sum 12.0 NaN
9158 min 1.0 2.0
9159 max NaN 8.0
9160
9161 Aggregate different functions over the columns and rename the index of the resulting
9162 DataFrame.
9163
9164 >>> df.agg(x=('A', max), y=('B', 'min'), z=('C', np.mean))
9165 A B C
9166 x 7.0 NaN NaN
9167 y NaN 2.0 NaN
9168 z NaN NaN 6.0
9169
9170 Aggregate over the columns.
9171
9172 >>> df.agg("mean", axis="columns")
9173 0 2.0
9174 1 5.0
9175 2 8.0
9176 3 NaN
9177 dtype: float64
9178 """
9179 )
9180
9181 @doc(
9182 _shared_docs["aggregate"],
9183 klass=_shared_doc_kwargs["klass"],
9184 axis=_shared_doc_kwargs["axis"],
9185 see_also=_agg_summary_and_see_also_doc,
9186 examples=_agg_examples_doc,
9187 )
9188 def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs):
9189 from pandas.core.apply import frame_apply
9190
9191 axis = self._get_axis_number(axis)
9192
9193 relabeling, func, columns, order = reconstruct_func(func, **kwargs)
9194
9195 op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
9196 result = op.agg()
9197
9198 if relabeling:
9199 # This is to keep the order to columns occurrence unchanged, and also
9200 # keep the order of new columns occurrence unchanged
9201
9202 # For the return values of reconstruct_func, if relabeling is
9203 # False, columns and order will be None.
9204 assert columns is not None
9205 assert order is not None
9206
9207 result_in_dict = relabel_result(result, func, columns, order)
9208 result = DataFrame(result_in_dict, index=columns)
9209
9210 return result
9211
9212 agg = aggregate
9213
9214 # error: Signature of "any" incompatible with supertype "NDFrame" [override]
9215 @overload # type: ignore[override]
9216 def any(
9217 self,
9218 *,
9219 axis: Axis = ...,
9220 bool_only: bool | None = ...,
9221 skipna: bool = ...,
9222 level: None = ...,
9223 **kwargs,
9224 ) -> Series:
9225 ...
9226
9227 @overload
9228 def any(
9229 self,
9230 *,
9231 axis: Axis = ...,
9232 bool_only: bool | None = ...,
9233 skipna: bool = ...,
9234 level: Level,
9235 **kwargs,
9236 ) -> DataFrame | Series:
9237 ...
9238
9239 # error: Missing return statement
9240 @doc(NDFrame.any, **_shared_doc_kwargs)
9241 def any( # type: ignore[empty-body]
9242 self,
9243 axis: Axis = 0,
9244 bool_only: bool | None = None,
9245 skipna: bool = True,
9246 level: Level = None,
9247 **kwargs,
9248 ) -> DataFrame | Series:
9249 ...
9250
9251 @doc(
9252 _shared_docs["transform"],
9253 klass=_shared_doc_kwargs["klass"],
9254 axis=_shared_doc_kwargs["axis"],
9255 )
9256 def transform(
9257 self, func: AggFuncType, axis: Axis = 0, *args, **kwargs
9258 ) -> DataFrame:
9259 from pandas.core.apply import frame_apply
9260
9261 op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
9262 result = op.transform()
9263 assert isinstance(result, DataFrame)
9264 return result
9265
9266 def apply(
9267 self,
9268 func: AggFuncType,
9269 axis: Axis = 0,
9270 raw: bool = False,
9271 result_type: Literal["expand", "reduce", "broadcast"] | None = None,
9272 args=(),
9273 **kwargs,
9274 ):
9275 """
9276 Apply a function along an axis of the DataFrame.
9277
9278 Objects passed to the function are Series objects whose index is
9279 either the DataFrame's index (``axis=0``) or the DataFrame's columns
9280 (``axis=1``). By default (``result_type=None``), the final return type
9281 is inferred from the return type of the applied function. Otherwise,
9282 it depends on the `result_type` argument.
9283
9284 Parameters
9285 ----------
9286 func : function
9287 Function to apply to each column or row.
9288 axis : {0 or 'index', 1 or 'columns'}, default 0
9289 Axis along which the function is applied:
9290
9291 * 0 or 'index': apply function to each column.
9292 * 1 or 'columns': apply function to each row.
9293
9294 raw : bool, default False
9295 Determines if row or column is passed as a Series or ndarray object:
9296
9297 * ``False`` : passes each row or column as a Series to the
9298 function.
9299 * ``True`` : the passed function will receive ndarray objects
9300 instead.
9301 If you are just applying a NumPy reduction function this will
9302 achieve much better performance.
9303
9304 result_type : {'expand', 'reduce', 'broadcast', None}, default None
9305 These only act when ``axis=1`` (columns):
9306
9307 * 'expand' : list-like results will be turned into columns.
9308 * 'reduce' : returns a Series if possible rather than expanding
9309 list-like results. This is the opposite of 'expand'.
9310 * 'broadcast' : results will be broadcast to the original shape
9311 of the DataFrame, the original index and columns will be
9312 retained.
9313
9314 The default behaviour (None) depends on the return value of the
9315 applied function: list-like results will be returned as a Series
            of those. However, if the apply function returns a Series, these
            are expanded to columns.
9318 args : tuple
9319 Positional arguments to pass to `func` in addition to the
9320 array/series.
9321 **kwargs
            Additional keyword arguments to pass as keyword arguments to
9323 `func`.
9324
9325 Returns
9326 -------
9327 Series or DataFrame
9328 Result of applying ``func`` along the given axis of the
9329 DataFrame.
9330
9331 See Also
9332 --------
9333 DataFrame.applymap: For elementwise operations.
9334 DataFrame.aggregate: Only perform aggregating type operations.
9335 DataFrame.transform: Only perform transforming type operations.
9336
9337 Notes
9338 -----
9339 Functions that mutate the passed object can produce unexpected
9340 behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
9341 for more details.
9342
9343 Examples
9344 --------
9345 >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
9346 >>> df
9347 A B
9348 0 4 9
9349 1 4 9
9350 2 4 9
9351
9352 Using a numpy universal function (in this case the same as
9353 ``np.sqrt(df)``):
9354
9355 >>> df.apply(np.sqrt)
9356 A B
9357 0 2.0 3.0
9358 1 2.0 3.0
9359 2 2.0 3.0
9360
9361 Using a reducing function on either axis
9362
9363 >>> df.apply(np.sum, axis=0)
9364 A 12
9365 B 27
9366 dtype: int64
9367
9368 >>> df.apply(np.sum, axis=1)
9369 0 13
9370 1 13
9371 2 13
9372 dtype: int64
9373
9374 Returning a list-like will result in a Series
9375
9376 >>> df.apply(lambda x: [1, 2], axis=1)
9377 0 [1, 2]
9378 1 [1, 2]
9379 2 [1, 2]
9380 dtype: object
9381
9382 Passing ``result_type='expand'`` will expand list-like results
9383 to columns of a Dataframe
9384
9385 >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
9386 0 1
9387 0 1 2
9388 1 1 2
9389 2 1 2
9390
9391 Returning a Series inside the function is similar to passing
9392 ``result_type='expand'``. The resulting column names
9393 will be the Series index.
9394
9395 >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
9396 foo bar
9397 0 1 2
9398 1 1 2
9399 2 1 2
9400
9401 Passing ``result_type='broadcast'`` will ensure the same shape
9402 result, whether list-like or scalar is returned by the function,
9403 and broadcast it along the axis. The resulting column names will
9404 be the originals.
9405
9406 >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
9407 A B
9408 0 1 2
9409 1 1 2
9410 2 1 2
9411 """
9412 from pandas.core.apply import frame_apply
9413
9414 op = frame_apply(
9415 self,
9416 func=func,
9417 axis=axis,
9418 raw=raw,
9419 result_type=result_type,
9420 args=args,
9421 kwargs=kwargs,
9422 )
9423 return op.apply().__finalize__(self, method="apply")
9424
9425 def applymap(
9426 self, func: PythonFuncType, na_action: str | None = None, **kwargs
9427 ) -> DataFrame:
9428 """
        Apply a function to a DataFrame elementwise.
9430
9431 This method applies a function that accepts and returns a scalar
9432 to every element of a DataFrame.
9433
9434 Parameters
9435 ----------
9436 func : callable
9437 Python function, returns a single value from a single value.
9438 na_action : {None, 'ignore'}, default None
            If 'ignore', propagate NaN values, without passing them to func.
9440
9441 .. versionadded:: 1.2
9442
9443 **kwargs
            Additional keyword arguments to pass as keyword arguments to
            `func`.
9446
9447 .. versionadded:: 1.3.0
9448
9449 Returns
9450 -------
9451 DataFrame
9452 Transformed DataFrame.
9453
9454 See Also
9455 --------
9456 DataFrame.apply : Apply a function along input axis of DataFrame.
9457
9458 Examples
9459 --------
9460 >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
9461 >>> df
9462 0 1
9463 0 1.000 2.120
9464 1 3.356 4.567
9465
9466 >>> df.applymap(lambda x: len(str(x)))
9467 0 1
9468 0 3 4
9469 1 5 5
9470
9471 Like Series.map, NA values can be ignored:
9472
9473 >>> df_copy = df.copy()
9474 >>> df_copy.iloc[0, 0] = pd.NA
9475 >>> df_copy.applymap(lambda x: len(str(x)), na_action='ignore')
9476 0 1
9477 0 NaN 4
9478 1 5.0 5
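
        Additional keyword arguments are forwarded to ``func``; the ``add``
        argument below is an illustrative name, not part of the API:

        >>> df.applymap(lambda x, add: x + add, add=1)
               0      1
        0  2.000  3.120
        1  4.356  5.567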
9479
9480 Note that a vectorized version of `func` often exists, which will
9481 be much faster. You could square each number elementwise.
9482
9483 >>> df.applymap(lambda x: x**2)
9484 0 1
9485 0 1.000000 4.494400
9486 1 11.262736 20.857489
9487
9488 But it's better to avoid applymap in that case.
9489
9490 >>> df ** 2
9491 0 1
9492 0 1.000000 4.494400
9493 1 11.262736 20.857489
9494 """
9495 if na_action not in {"ignore", None}:
9496 raise ValueError(
9497 f"na_action must be 'ignore' or None. Got {repr(na_action)}"
9498 )
9499 ignore_na = na_action == "ignore"
9500 func = functools.partial(func, **kwargs)
9501
9502 # if we have a dtype == 'M8[ns]', provide boxed values
9503 def infer(x):
9504 if x.empty:
9505 return lib.map_infer(x, func, ignore_na=ignore_na)
9506 return lib.map_infer(x.astype(object)._values, func, ignore_na=ignore_na)
9507
9508 return self.apply(infer).__finalize__(self, "applymap")
9509
9510 # ----------------------------------------------------------------------
9511 # Merging / joining methods
9512
9513 def _append(
9514 self,
9515 other,
9516 ignore_index: bool = False,
9517 verify_integrity: bool = False,
9518 sort: bool = False,
9519 ) -> DataFrame:
9520 if isinstance(other, (Series, dict)):
9521 if isinstance(other, dict):
9522 if not ignore_index:
9523 raise TypeError("Can only append a dict if ignore_index=True")
9524 other = Series(other)
9525 if other.name is None and not ignore_index:
9526 raise TypeError(
9527 "Can only append a Series if ignore_index=True "
9528 "or if the Series has a name"
9529 )
9530
9531 index = Index(
9532 [other.name],
9533 name=self.index.names
9534 if isinstance(self.index, MultiIndex)
9535 else self.index.name,
9536 )
9537 row_df = other.to_frame().T
9538 # infer_objects is needed for
9539 # test_append_empty_frame_to_series_with_dateutil_tz
9540 other = row_df.infer_objects(copy=False).rename_axis(
9541 index.names, copy=False
9542 )
9543 elif isinstance(other, list):
9544 if not other:
9545 pass
9546 elif not isinstance(other[0], DataFrame):
9547 other = DataFrame(other)
9548 if self.index.name is not None and not ignore_index:
9549 other.index.name = self.index.name
9550
9551 from pandas.core.reshape.concat import concat
9552
9553 if isinstance(other, (list, tuple)):
9554 to_concat = [self, *other]
9555 else:
9556 to_concat = [self, other]
9557
9558 result = concat(
9559 to_concat,
9560 ignore_index=ignore_index,
9561 verify_integrity=verify_integrity,
9562 sort=sort,
9563 )
9564 return result.__finalize__(self, method="append")
9565
9566 def join(
9567 self,
9568 other: DataFrame | Series | Iterable[DataFrame | Series],
9569 on: IndexLabel | None = None,
9570 how: MergeHow = "left",
9571 lsuffix: str = "",
9572 rsuffix: str = "",
9573 sort: bool = False,
9574 validate: str | None = None,
9575 ) -> DataFrame:
9576 """
9577 Join columns of another DataFrame.
9578
9579 Join columns with `other` DataFrame either on index or on a key
9580 column. Efficiently join multiple DataFrame objects by index at once by
9581 passing a list.
9582
9583 Parameters
9584 ----------
9585 other : DataFrame, Series, or a list containing any combination of them
9586 Index should be similar to one of the columns in this one. If a
9587 Series is passed, its name attribute must be set, and that will be
9588 used as the column name in the resulting joined DataFrame.
9589 on : str, list of str, or array-like, optional
9590 Column or index level name(s) in the caller to join on the index
9591 in `other`, otherwise joins index-on-index. If multiple
9592 values given, the `other` DataFrame must have a MultiIndex. Can
9593 pass an array as the join key if it is not already contained in
9594 the calling DataFrame. Like an Excel VLOOKUP operation.
9595 how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left'
9596 How to handle the operation of the two objects.
9597
9598 * left: use calling frame's index (or column if on is specified)
9599 * right: use `other`'s index.
            * outer: form union of calling frame's index (or column if on is
              specified) with `other`'s index, and sort it lexicographically.
9603 * inner: form intersection of calling frame's index (or column if
9604 on is specified) with `other`'s index, preserving the order
9605 of the calling's one.
9606 * cross: creates the cartesian product from both frames, preserves the order
9607 of the left keys.
9608
9609 .. versionadded:: 1.2.0
9610
9611 lsuffix : str, default ''
9612 Suffix to use from left frame's overlapping columns.
9613 rsuffix : str, default ''
9614 Suffix to use from right frame's overlapping columns.
9615 sort : bool, default False
9616 Order result DataFrame lexicographically by the join key. If False,
9617 the order of the join key depends on the join type (how keyword).
        validate : str, optional
            If specified, checks if join is of specified type.

            * "one_to_one" or "1:1": check if join keys are unique in both left
              and right datasets.
            * "one_to_many" or "1:m": check if join keys are unique in left dataset.
            * "many_to_one" or "m:1": check if join keys are unique in right dataset.
            * "many_to_many" or "m:m": allowed, but does not result in checks.

            .. versionadded:: 1.5.0
9626
9627 Returns
9628 -------
9629 DataFrame
9630 A dataframe containing columns from both the caller and `other`.
9631
9632 See Also
9633 --------
9634 DataFrame.merge : For column(s)-on-column(s) operations.
9635
9636 Notes
9637 -----
9638 Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
9639 passing a list of `DataFrame` objects.
9640
9641 Support for specifying index levels as the `on` parameter was added
9642 in version 0.23.0.
9643
9644 Examples
9645 --------
9646 >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
9647 ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
9648
9649 >>> df
9650 key A
9651 0 K0 A0
9652 1 K1 A1
9653 2 K2 A2
9654 3 K3 A3
9655 4 K4 A4
9656 5 K5 A5
9657
9658 >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
9659 ... 'B': ['B0', 'B1', 'B2']})
9660
9661 >>> other
9662 key B
9663 0 K0 B0
9664 1 K1 B1
9665 2 K2 B2
9666
9667 Join DataFrames using their indexes.
9668
9669 >>> df.join(other, lsuffix='_caller', rsuffix='_other')
9670 key_caller A key_other B
9671 0 K0 A0 K0 B0
9672 1 K1 A1 K1 B1
9673 2 K2 A2 K2 B2
9674 3 K3 A3 NaN NaN
9675 4 K4 A4 NaN NaN
9676 5 K5 A5 NaN NaN
9677
9678 If we want to join using the key columns, we need to set key to be
9679 the index in both `df` and `other`. The joined DataFrame will have
9680 key as its index.
9681
9682 >>> df.set_index('key').join(other.set_index('key'))
9683 A B
9684 key
9685 K0 A0 B0
9686 K1 A1 B1
9687 K2 A2 B2
9688 K3 A3 NaN
9689 K4 A4 NaN
9690 K5 A5 NaN
9691
9692 Another option to join using the key columns is to use the `on`
9693 parameter. DataFrame.join always uses `other`'s index but we can use
9694 any column in `df`. This method preserves the original DataFrame's
9695 index in the result.
9696
9697 >>> df.join(other.set_index('key'), on='key')
9698 key A B
9699 0 K0 A0 B0
9700 1 K1 A1 B1
9701 2 K2 A2 B2
9702 3 K3 A3 NaN
9703 4 K4 A4 NaN
9704 5 K5 A5 NaN
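
        An inner join keeps only the rows whose key is present in `other`:

        >>> df.join(other.set_index('key'), on='key', how='inner')
          key   A   B
        0  K0  A0  B0
        1  K1  A1  B1
        2  K2  A2  B2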
9705
9706 Using non-unique key values shows how they are matched.
9707
9708 >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'],
9709 ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
9710
9711 >>> df
9712 key A
9713 0 K0 A0
9714 1 K1 A1
9715 2 K1 A2
9716 3 K3 A3
9717 4 K0 A4
9718 5 K1 A5
9719
9720 >>> df.join(other.set_index('key'), on='key', validate='m:1')
9721 key A B
9722 0 K0 A0 B0
9723 1 K1 A1 B1
9724 2 K1 A2 B1
9725 3 K3 A3 NaN
9726 4 K0 A4 B0
9727 5 K1 A5 B1
9728 """
9729 return self._join_compat(
9730 other,
9731 on=on,
9732 how=how,
9733 lsuffix=lsuffix,
9734 rsuffix=rsuffix,
9735 sort=sort,
9736 validate=validate,
9737 )
9738
9739 def _join_compat(
9740 self,
9741 other: DataFrame | Series | Iterable[DataFrame | Series],
9742 on: IndexLabel | None = None,
9743 how: MergeHow = "left",
9744 lsuffix: str = "",
9745 rsuffix: str = "",
9746 sort: bool = False,
9747 validate: str | None = None,
9748 ):
9749 from pandas.core.reshape.concat import concat
9750 from pandas.core.reshape.merge import merge
9751
9752 if isinstance(other, Series):
9753 if other.name is None:
9754 raise ValueError("Other Series must have a name")
9755 other = DataFrame({other.name: other})
9756
9757 if isinstance(other, DataFrame):
9758 if how == "cross":
9759 return merge(
9760 self,
9761 other,
9762 how=how,
9763 on=on,
9764 suffixes=(lsuffix, rsuffix),
9765 sort=sort,
9766 validate=validate,
9767 )
9768 return merge(
9769 self,
9770 other,
9771 left_on=on,
9772 how=how,
9773 left_index=on is None,
9774 right_index=True,
9775 suffixes=(lsuffix, rsuffix),
9776 sort=sort,
9777 validate=validate,
9778 )
9779 else:
9780 if on is not None:
9781 raise ValueError(
9782 "Joining multiple DataFrames only supported for joining on index"
9783 )
9784
9785 if rsuffix or lsuffix:
9786 raise ValueError(
9787 "Suffixes not supported when joining multiple DataFrames"
9788 )
9789
9790 # Mypy thinks the RHS is a
9791 # "Union[DataFrame, Series, Iterable[Union[DataFrame, Series]]]" whereas
9792 # the LHS is an "Iterable[DataFrame]", but in reality both types are
9793 # "Iterable[Union[DataFrame, Series]]" due to the if statements
9794 frames = [cast("DataFrame | Series", self)] + list(other)
9795
9796 can_concat = all(df.index.is_unique for df in frames)
9797
9798 # join indexes only using concat
9799 if can_concat:
9800 if how == "left":
9801 res = concat(
9802 frames, axis=1, join="outer", verify_integrity=True, sort=sort
9803 )
9804 return res.reindex(self.index, copy=False)
9805 else:
9806 return concat(
9807 frames, axis=1, join=how, verify_integrity=True, sort=sort
9808 )
9809
9810 joined = frames[0]
9811
9812 for frame in frames[1:]:
9813 joined = merge(
9814 joined,
9815 frame,
9816 how=how,
9817 left_index=True,
9818 right_index=True,
9819 validate=validate,
9820 )
9821
9822 return joined
9823
9824 @Substitution("")
9825 @Appender(_merge_doc, indents=2)
9826 def merge(
9827 self,
9828 right: DataFrame | Series,
9829 how: MergeHow = "inner",
9830 on: IndexLabel | None = None,
9831 left_on: IndexLabel | None = None,
9832 right_on: IndexLabel | None = None,
9833 left_index: bool = False,
9834 right_index: bool = False,
9835 sort: bool = False,
9836 suffixes: Suffixes = ("_x", "_y"),
9837 copy: bool | None = None,
9838 indicator: str | bool = False,
9839 validate: str | None = None,
9840 ) -> DataFrame:
9841 from pandas.core.reshape.merge import merge
9842
9843 return merge(
9844 self,
9845 right,
9846 how=how,
9847 on=on,
9848 left_on=left_on,
9849 right_on=right_on,
9850 left_index=left_index,
9851 right_index=right_index,
9852 sort=sort,
9853 suffixes=suffixes,
9854 copy=copy,
9855 indicator=indicator,
9856 validate=validate,
9857 )
9858
9859 def round(
9860 self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs
9861 ) -> DataFrame:
9862 """
9863 Round a DataFrame to a variable number of decimal places.
9864
9865 Parameters
9866 ----------
9867 decimals : int, dict, Series
9868 Number of decimal places to round each column to. If an int is
9869 given, round each column to the same number of places.
9870 Otherwise dict and Series round to variable numbers of places.
9871 Column names should be in the keys if `decimals` is a
9872 dict-like, or in the index if `decimals` is a Series. Any
9873 columns not included in `decimals` will be left as is. Elements
9874 of `decimals` which are not columns of the input will be
9875 ignored.
9876 *args
9877 Additional keywords have no effect but might be accepted for
9878 compatibility with numpy.
9879 **kwargs
9880 Additional keywords have no effect but might be accepted for
9881 compatibility with numpy.
9882
9883 Returns
9884 -------
9885 DataFrame
9886 A DataFrame with the affected columns rounded to the specified
9887 number of decimal places.
9888
9889 See Also
9890 --------
9891 numpy.around : Round a numpy array to the given number of decimals.
9892 Series.round : Round a Series to the given number of decimals.
9893
9894 Examples
9895 --------
9896 >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)],
9897 ... columns=['dogs', 'cats'])
9898 >>> df
9899 dogs cats
9900 0 0.21 0.32
9901 1 0.01 0.67
9902 2 0.66 0.03
9903 3 0.21 0.18
9904
9905 By providing an integer each column is rounded to the same number
9906 of decimal places
9907
9908 >>> df.round(1)
9909 dogs cats
9910 0 0.2 0.3
9911 1 0.0 0.7
9912 2 0.7 0.0
9913 3 0.2 0.2
9914
9915 With a dict, the number of places for specific columns can be
9916 specified with the column names as key and the number of decimal
9917 places as value
9918
9919 >>> df.round({'dogs': 1, 'cats': 0})
9920 dogs cats
9921 0 0.2 0.0
9922 1 0.0 1.0
9923 2 0.7 0.0
9924 3 0.2 0.0
9925
9926 Using a Series, the number of places for specific columns can be
9927 specified with the column names as index and the number of
9928 decimal places as value
9929
9930 >>> decimals = pd.Series([0, 1], index=['cats', 'dogs'])
9931 >>> df.round(decimals)
9932 dogs cats
9933 0 0.2 0.0
9934 1 0.0 1.0
9935 2 0.7 0.0
9936 3 0.2 0.0
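
        Keys in `decimals` that are not columns are ignored; the
        ``'birds'`` entry below is a deliberately non-existent column and
        has no effect:

        >>> df.round({'dogs': 1, 'birds': 2})
           dogs  cats
        0   0.2  0.32
        1   0.0  0.67
        2   0.7  0.03
        3   0.2  0.18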
9937 """
9938 from pandas.core.reshape.concat import concat
9939
9940 def _dict_round(df: DataFrame, decimals):
9941 for col, vals in df.items():
9942 try:
9943 yield _series_round(vals, decimals[col])
9944 except KeyError:
9945 yield vals
9946
9947 def _series_round(ser: Series, decimals: int) -> Series:
9948 if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype):
9949 return ser.round(decimals)
9950 return ser
9951
9952 nv.validate_round(args, kwargs)
9953
9954 if isinstance(decimals, (dict, Series)):
9955 if isinstance(decimals, Series) and not decimals.index.is_unique:
9956 raise ValueError("Index of decimals must be unique")
9957 if is_dict_like(decimals) and not all(
9958 is_integer(value) for _, value in decimals.items()
9959 ):
9960 raise TypeError("Values in decimals must be integers")
9961 new_cols = list(_dict_round(self, decimals))
9962 elif is_integer(decimals):
9963 # Dispatch to Block.round
9964 return self._constructor(
9965 self._mgr.round(decimals=decimals, using_cow=using_copy_on_write()),
9966 ).__finalize__(self, method="round")
9967 else:
9968 raise TypeError("decimals must be an integer, a dict-like or a Series")
9969
9970 if new_cols is not None and len(new_cols) > 0:
9971 return self._constructor(
9972 concat(new_cols, axis=1), index=self.index, columns=self.columns
9973 ).__finalize__(self, method="round")
9974 else:
9975 return self.copy(deep=False)
9976
9977 # ----------------------------------------------------------------------
9978 # Statistical methods, etc.
9979
9980 def corr(
9981 self,
9982 method: CorrelationMethod = "pearson",
9983 min_periods: int = 1,
9984 numeric_only: bool = False,
9985 ) -> DataFrame:
9986 """
9987 Compute pairwise correlation of columns, excluding NA/null values.
9988
9989 Parameters
9990 ----------
9991 method : {'pearson', 'kendall', 'spearman'} or callable
9992 Method of correlation:
9993
9994 * pearson : standard correlation coefficient
9995 * kendall : Kendall Tau correlation coefficient
9996 * spearman : Spearman rank correlation
9997 * callable: callable with input two 1d ndarrays
9998 and returning a float. Note that the returned matrix from corr
9999 will have 1 along the diagonals and will be symmetric
10000 regardless of the callable's behavior.
10001 min_periods : int, optional
10002 Minimum number of observations required per pair of columns
10003 to have a valid result. Currently only available for Pearson
10004 and Spearman correlation.
10005 numeric_only : bool, default False
10006 Include only `float`, `int` or `boolean` data.
10007
10008 .. versionadded:: 1.5.0
10009
10010 .. versionchanged:: 2.0.0
10011 The default value of ``numeric_only`` is now ``False``.
10012
10013 Returns
10014 -------
10015 DataFrame
10016 Correlation matrix.
10017
10018 See Also
10019 --------
10020 DataFrame.corrwith : Compute pairwise correlation with another
10021 DataFrame or Series.
10022 Series.corr : Compute the correlation between two Series.
10023
10024 Notes
10025 -----
10026 Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations.
10027
10028 * `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
10029 * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_
10030 * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_
10031
10032 Examples
10033 --------
10034 >>> def histogram_intersection(a, b):
10035 ... v = np.minimum(a, b).sum().round(decimals=1)
10036 ... return v
10037 >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
10038 ... columns=['dogs', 'cats'])
10039 >>> df.corr(method=histogram_intersection)
10040 dogs cats
10041 dogs 1.0 0.3
10042 cats 0.3 1.0
10043
10044 >>> df = pd.DataFrame([(1, 1), (2, np.nan), (np.nan, 3), (4, 4)],
10045 ... columns=['dogs', 'cats'])
10046 >>> df.corr(min_periods=3)
10047 dogs cats
10048 dogs 1.0 NaN
10049 cats NaN 1.0
10050 """ # noqa:E501
10051 data = self._get_numeric_data() if numeric_only else self
10052 cols = data.columns
10053 idx = cols.copy()
10054 mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
10055
10056 if method == "pearson":
10057 correl = libalgos.nancorr(mat, minp=min_periods)
10058 elif method == "spearman":
10059 correl = libalgos.nancorr_spearman(mat, minp=min_periods)
10060 elif method == "kendall" or callable(method):
10061 if min_periods is None:
10062 min_periods = 1
10063 mat = mat.T
10064 corrf = nanops.get_corr_func(method)
10065 K = len(cols)
10066 correl = np.empty((K, K), dtype=float)
10067 mask = np.isfinite(mat)
10068 for i, ac in enumerate(mat):
10069 for j, bc in enumerate(mat):
10070 if i > j:
10071 continue
10072
10073 valid = mask[i] & mask[j]
10074 if valid.sum() < min_periods:
10075 c = np.nan
10076 elif i == j:
10077 c = 1.0
10078 elif not valid.all():
10079 c = corrf(ac[valid], bc[valid])
10080 else:
10081 c = corrf(ac, bc)
10082 correl[i, j] = c
10083 correl[j, i] = c
10084 else:
10085 raise ValueError(
10086 "method must be either 'pearson', "
10087 "'spearman', 'kendall', or a callable, "
10088 f"'{method}' was supplied"
10089 )
10090
10091 result = self._constructor(correl, index=idx, columns=cols, copy=False)
10092 return result.__finalize__(self, method="corr")
10093
10094 def cov(
10095 self,
10096 min_periods: int | None = None,
10097 ddof: int | None = 1,
10098 numeric_only: bool = False,
10099 ) -> DataFrame:
10100 """
10101 Compute pairwise covariance of columns, excluding NA/null values.
10102
10103 Compute the pairwise covariance among the series of a DataFrame.
10104 The returned data frame is the `covariance matrix
10105 <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
10106 of the DataFrame.
10107
10108 Both NA and null values are automatically excluded from the
10109 calculation. (See the note below about bias from missing values.)
        A minimum number of observations can be required per pair of columns;
        pairs with fewer valid observations than this threshold will be
        returned as ``NaN``.
10113
10114 This method is generally used for the analysis of time series data to
10115 understand the relationship between different measures
10116 across time.
10117
10118 Parameters
10119 ----------
10120 min_periods : int, optional
10121 Minimum number of observations required per pair of columns
10122 to have a valid result.
10123
10124 ddof : int, default 1
10125 Delta degrees of freedom. The divisor used in calculations
10126 is ``N - ddof``, where ``N`` represents the number of elements.
10127
10128 .. versionadded:: 1.1.0
10129
10130 numeric_only : bool, default False
10131 Include only `float`, `int` or `boolean` data.
10132
10133 .. versionadded:: 1.5.0
10134
10135 .. versionchanged:: 2.0.0
10136 The default value of ``numeric_only`` is now ``False``.
10137
10138 Returns
10139 -------
10140 DataFrame
10141 The covariance matrix of the series of the DataFrame.
10142
10143 See Also
10144 --------
10145 Series.cov : Compute covariance with another Series.
10146 core.window.ewm.ExponentialMovingWindow.cov : Exponential weighted sample
10147 covariance.
10148 core.window.expanding.Expanding.cov : Expanding sample covariance.
10149 core.window.rolling.Rolling.cov : Rolling sample covariance.
10150
10151 Notes
10152 -----
10153 Returns the covariance matrix of the DataFrame's time series.
10154 The covariance is normalized by N-ddof.
10155
10156 For DataFrames that have Series that are missing data (assuming that
10157 data is `missing at random
10158 <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
10159 the returned covariance matrix will be an unbiased estimate
10160 of the variance and covariance between the member Series.
10161
10162 However, for many applications this estimate may not be acceptable
        because the estimated covariance matrix is not guaranteed to be positive
        semi-definite. This could lead to estimated correlations having
10165 absolute values which are greater than one, and/or a non-invertible
10166 covariance matrix. See `Estimation of covariance matrices
10167 <https://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_
10168 matrices>`__ for more details.
10169
10170 Examples
10171 --------
10172 >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
10173 ... columns=['dogs', 'cats'])
10174 >>> df.cov()
10175 dogs cats
10176 dogs 0.666667 -1.000000
10177 cats -1.000000 1.666667
10178
10179 >>> np.random.seed(42)
10180 >>> df = pd.DataFrame(np.random.randn(1000, 5),
10181 ... columns=['a', 'b', 'c', 'd', 'e'])
10182 >>> df.cov()
10183 a b c d e
10184 a 0.998438 -0.020161 0.059277 -0.008943 0.014144
10185 b -0.020161 1.059352 -0.008543 -0.024738 0.009826
10186 c 0.059277 -0.008543 1.010670 -0.001486 -0.000271
10187 d -0.008943 -0.024738 -0.001486 0.921297 -0.013692
10188 e 0.014144 0.009826 -0.000271 -0.013692 0.977795
10189
10190 **Minimum number of periods**
10191
10192 This method also supports an optional ``min_periods`` keyword
10193 that specifies the required minimum number of non-NA observations for
10194 each column pair in order to have a valid result:
10195
10196 >>> np.random.seed(42)
10197 >>> df = pd.DataFrame(np.random.randn(20, 3),
10198 ... columns=['a', 'b', 'c'])
10199 >>> df.loc[df.index[:5], 'a'] = np.nan
10200 >>> df.loc[df.index[5:10], 'b'] = np.nan
10201 >>> df.cov(min_periods=12)
10202 a b c
10203 a 0.316741 NaN -0.150812
10204 b NaN 1.248003 0.191417
10205 c -0.150812 0.191417 0.895202
10206 """
10207 data = self._get_numeric_data() if numeric_only else self
10208 cols = data.columns
10209 idx = cols.copy()
10210 mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
10211
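        # Fast path: with no missing values np.cov can be used directly
        # (unless there are fewer rows than min_periods); otherwise fall
        # back to a pairwise-complete covariance computation.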
10212 if notna(mat).all():
10213 if min_periods is not None and min_periods > len(mat):
10214 base_cov = np.empty((mat.shape[1], mat.shape[1]))
10215 base_cov.fill(np.nan)
10216 else:
10217 base_cov = np.cov(mat.T, ddof=ddof)
10218 base_cov = base_cov.reshape((len(cols), len(cols)))
10219 else:
10220 base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods)
10221
10222 result = self._constructor(base_cov, index=idx, columns=cols, copy=False)
10223 return result.__finalize__(self, method="cov")
10224
10225 def corrwith(
10226 self,
10227 other: DataFrame | Series,
10228 axis: Axis = 0,
10229 drop: bool = False,
10230 method: CorrelationMethod = "pearson",
10231 numeric_only: bool = False,
10232 ) -> Series:
10233 """
10234 Compute pairwise correlation.
10235
10236 Pairwise correlation is computed between rows or columns of
10237 DataFrame with rows or columns of Series or DataFrame. DataFrames
10238 are first aligned along both axes before computing the
10239 correlations.
10240
10241 Parameters
10242 ----------
10243 other : DataFrame, Series
10244 Object with which to compute correlations.
10245 axis : {0 or 'index', 1 or 'columns'}, default 0
10246 The axis to use. 0 or 'index' to compute row-wise, 1 or 'columns' for
10247 column-wise.
10248 drop : bool, default False
10249 Drop missing indices from result.
10250 method : {'pearson', 'kendall', 'spearman'} or callable
10251 Method of correlation:
10252
10253 * pearson : standard correlation coefficient
10254 * kendall : Kendall Tau correlation coefficient
10255 * spearman : Spearman rank correlation
10256 * callable: callable with input two 1d ndarrays
10257 and returning a float.
10258
10259 numeric_only : bool, default False
10260 Include only `float`, `int` or `boolean` data.
10261
10262 .. versionadded:: 1.5.0
10263
10264 .. versionchanged:: 2.0.0
10265 The default value of ``numeric_only`` is now ``False``.
10266
10267 Returns
10268 -------
10269 Series
10270 Pairwise correlations.
10271
10272 See Also
10273 --------
10274 DataFrame.corr : Compute pairwise correlation of columns.
10275
10276 Examples
10277 --------
10278 >>> index = ["a", "b", "c", "d", "e"]
10279 >>> columns = ["one", "two", "three", "four"]
10280 >>> df1 = pd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns)
10281 >>> df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=index[:4], columns=columns)
10282 >>> df1.corrwith(df2)
10283 one 1.0
10284 two 1.0
10285 three 1.0
10286 four 1.0
10287 dtype: float64
10288
10289 >>> df2.corrwith(df1, axis=1)
10290 a 1.0
10291 b 1.0
10292 c 1.0
10293 d 1.0
10294 e NaN
10295 dtype: float64
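
        Setting ``drop=True`` removes the non-matching label instead of
        returning ``NaN`` for it:

        >>> df2.corrwith(df1, axis=1, drop=True)
        a    1.0
        b    1.0
        c    1.0
        d    1.0
        dtype: float64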
10296 """ # noqa:E501
10297 axis = self._get_axis_number(axis)
10298 this = self._get_numeric_data() if numeric_only else self
10299
10300 if isinstance(other, Series):
10301 return this.apply(lambda x: other.corr(x, method=method), axis=axis)
10302
10303 if numeric_only:
10304 other = other._get_numeric_data()
10305 left, right = this.align(other, join="inner", copy=False)
10306
10307 if axis == 1:
10308 left = left.T
10309 right = right.T
10310
10311 if method == "pearson":
10312 # mask missing values
10313 left = left + right * 0
10314 right = right + left * 0
10315
10316 # demeaned data
10317 ldem = left - left.mean(numeric_only=numeric_only)
10318 rdem = right - right.mean(numeric_only=numeric_only)
10319
10320 num = (ldem * rdem).sum()
10321 dom = (
10322 (left.count() - 1)
10323 * left.std(numeric_only=numeric_only)
10324 * right.std(numeric_only=numeric_only)
10325 )
10326
10327 correl = num / dom
10328
10329 elif method in ["kendall", "spearman"] or callable(method):
10330
10331 def c(x):
10332 return nanops.nancorr(x[0], x[1], method=method)
10333
10334 correl = self._constructor_sliced(
10335 map(c, zip(left.values.T, right.values.T)),
10336 index=left.columns,
10337 copy=False,
10338 )
10339
10340 else:
10341 raise ValueError(
10342 f"Invalid method {method} was passed, "
10343 "valid methods are: 'pearson', 'kendall', "
10344 "'spearman', or callable"
10345 )
10346
10347 if not drop:
10348 # Find non-matching labels along the given axis
10349 # and append missing correlations (GH 22375)
10350 raxis: AxisInt = 1 if axis == 0 else 0
10351 result_index = this._get_axis(raxis).union(other._get_axis(raxis))
10352 idx_diff = result_index.difference(correl.index)
10353
10354 if len(idx_diff) > 0:
10355 correl = correl._append(
10356 Series([np.nan] * len(idx_diff), index=idx_diff)
10357 )
10358
10359 return correl
10360
10361 # ----------------------------------------------------------------------
10362 # ndarray-like stats methods
10363
10364 def count(self, axis: Axis = 0, numeric_only: bool = False):
10365 """
10366 Count non-NA cells for each column or row.
10367
10368 The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending
10369 on `pandas.options.mode.use_inf_as_na`) are considered NA.
10370
10371 Parameters
10372 ----------
10373 axis : {0 or 'index', 1 or 'columns'}, default 0
10374 If 0 or 'index' counts are generated for each column.
10375 If 1 or 'columns' counts are generated for each row.
10376 numeric_only : bool, default False
10377 Include only `float`, `int` or `boolean` data.
10378
10379 Returns
10380 -------
        Series
            For each column/row the number of non-NA/null entries.
10384
10385 See Also
10386 --------
10387 Series.count: Number of non-NA elements in a Series.
10388 DataFrame.value_counts: Count unique combinations of columns.
10389 DataFrame.shape: Number of DataFrame rows and columns (including NA
10390 elements).
10391 DataFrame.isna: Boolean same-sized DataFrame showing places of NA
10392 elements.
10393
10394 Examples
10395 --------
10396 Constructing DataFrame from a dictionary:
10397
10398 >>> df = pd.DataFrame({"Person":
10399 ... ["John", "Myla", "Lewis", "John", "Myla"],
10400 ... "Age": [24., np.nan, 21., 33, 26],
10401 ... "Single": [False, True, True, True, False]})
10402 >>> df
10403 Person Age Single
10404 0 John 24.0 False
10405 1 Myla NaN True
10406 2 Lewis 21.0 True
10407 3 John 33.0 True
10408 4 Myla 26.0 False
10409
10410 Notice the uncounted NA values:
10411
10412 >>> df.count()
10413 Person 5
10414 Age 4
10415 Single 5
10416 dtype: int64
10417
10418 Counts for each **row**:
10419
10420 >>> df.count(axis='columns')
10421 0 3
10422 1 2
10423 2 3
10424 3 3
10425 4 3
10426 dtype: int64
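
        Counts restricted to numeric (including boolean) columns:

        >>> df.count(numeric_only=True)
        Age       4
        Single    5
        dtype: int64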
10427 """
10428 axis = self._get_axis_number(axis)
10429
10430 if numeric_only:
10431 frame = self._get_numeric_data()
10432 else:
10433 frame = self
10434
10435 # GH #423
10436 if len(frame._get_axis(axis)) == 0:
10437 result = self._constructor_sliced(0, index=frame._get_agg_axis(axis))
10438 else:
10439 if frame._is_mixed_type or frame._mgr.any_extension_types:
10440 # the or any_extension_types is really only hit for single-
10441 # column frames with an extension array
10442 result = notna(frame).sum(axis=axis)
10443 else:
10444 # GH13407
10445 series_counts = notna(frame).sum(axis=axis)
10446 counts = series_counts._values
10447 result = self._constructor_sliced(
10448 counts, index=frame._get_agg_axis(axis), copy=False
10449 )
10450
10451 return result.astype("int64").__finalize__(self, method="count")
10452
10453 def _reduce(
10454 self,
10455 op,
10456 name: str,
10457 *,
10458 axis: Axis = 0,
10459 skipna: bool = True,
10460 numeric_only: bool = False,
10461 filter_type=None,
10462 **kwds,
10463 ):
10464 assert filter_type is None or filter_type == "bool", filter_type
10465 out_dtype = "bool" if filter_type == "bool" else None
10466
10467 if axis is not None:
10468 axis = self._get_axis_number(axis)
10469
10470 def func(values: np.ndarray):
10471 # We only use this in the case that operates on self.values
10472 return op(values, axis=axis, skipna=skipna, **kwds)
10473
10474 def blk_func(values, axis: Axis = 1):
10475 if isinstance(values, ExtensionArray):
10476 if not is_1d_only_ea_dtype(values.dtype) and not isinstance(
10477 self._mgr, ArrayManager
10478 ):
10479 return values._reduce(name, axis=1, skipna=skipna, **kwds)
10480 return values._reduce(name, skipna=skipna, **kwds)
10481 else:
10482 return op(values, axis=axis, skipna=skipna, **kwds)
10483
10484 def _get_data() -> DataFrame:
10485 if filter_type is None:
10486 data = self._get_numeric_data()
10487 else:
10488 # GH#25101, GH#24434
10489 assert filter_type == "bool"
10490 data = self._get_bool_data()
10491 return data
10492
10493 # Case with EAs see GH#35881
10494 df = self
10495 if numeric_only:
10496 df = _get_data()
10497 if axis is None:
10498 return func(df.values)
10499 elif axis == 1:
10500 if len(df.index) == 0:
10501 # Taking a transpose would result in no columns, losing the dtype.
10502 # In the empty case, reducing along axis 0 or 1 gives the same
10503 # result dtype, so reduce with axis=0 and ignore values
10504 result = df._reduce(
10505 op,
10506 name,
10507 axis=0,
10508 skipna=skipna,
10509 numeric_only=False,
10510 filter_type=filter_type,
10511 **kwds,
10512 ).iloc[:0]
10513 result.index = df.index
10514 return result
10515 df = df.T
10516
10517 # After possibly _get_data and transposing, we are now in the
10518 # simple case where we can use BlockManager.reduce
10519 res = df._mgr.reduce(blk_func)
10520 out = df._constructor(res).iloc[0]
10521 if out_dtype is not None:
10522 out = out.astype(out_dtype)
10523 elif (df._mgr.get_dtypes() == object).any():
10524 out = out.astype(object)
10525 elif len(self) == 0 and name in ("sum", "prod"):
10526 # Even if we are object dtype, follow numpy and return
10527 # float64, see test_apply_funcs_over_empty
10528 out = out.astype(np.float64)
10529
10530 return out
10531
10532 def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
10533 """
10534 Special case for _reduce to try to avoid a potentially-expensive transpose.
10535
10536 Apply the reduction block-wise along axis=1 and then reduce the resulting
10537 1D arrays.
10538 """
10539 if name == "all":
10540 result = np.ones(len(self), dtype=bool)
10541 ufunc = np.logical_and
10542 elif name == "any":
10543 result = np.zeros(len(self), dtype=bool)
10544 # error: Incompatible types in assignment
10545 # (expression has type "_UFunc_Nin2_Nout1[Literal['logical_or'],
10546 # Literal[20], Literal[False]]", variable has type
10547 # "_UFunc_Nin2_Nout1[Literal['logical_and'], Literal[20],
10548 # Literal[True]]")
10549 ufunc = np.logical_or # type: ignore[assignment]
10550 else:
10551 raise NotImplementedError(name)
10552
10553 for arr in self._mgr.arrays:
10554 middle = func(arr, axis=0, skipna=skipna)
10555 result = ufunc(result, middle)
10556
10557 res_ser = self._constructor_sliced(result, index=self.index, copy=False)
10558 return res_ser
10559
10560 def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series:
10561 """
10562 Count number of distinct elements in specified axis.
10563
10564 Return Series with number of distinct elements. Can ignore NaN
10565 values.
10566
10567 Parameters
10568 ----------
10569 axis : {0 or 'index', 1 or 'columns'}, default 0
10570 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
10571 column-wise.
10572 dropna : bool, default True
10573 Don't include NaN in the counts.
10574
10575 Returns
10576 -------
10577 Series
10578
10579 See Also
10580 --------
10581 Series.nunique: Method nunique for Series.
10582 DataFrame.count: Count non-NA cells for each column or row.
10583
10584 Examples
10585 --------
10586 >>> df = pd.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]})
10587 >>> df.nunique()
10588 A 3
10589 B 2
10590 dtype: int64
10591
10592 >>> df.nunique(axis=1)
10593 0 1
10594 1 2
10595 2 2
10596 dtype: int64
10597 """
10598 return self.apply(Series.nunique, axis=axis, dropna=dropna)
10599
10600 @doc(_shared_docs["idxmin"], numeric_only_default="False")
10601 def idxmin(
10602 self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
10603 ) -> Series:
10604 axis = self._get_axis_number(axis)
10605 if numeric_only:
10606 data = self._get_numeric_data()
10607 else:
10608 data = self
10609
10610 res = data._reduce(
10611 nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False
10612 )
10613 indices = res._values
10614
10615 # indices will always be np.ndarray since axis is not None and
10616 # values is a 2d array for DataFrame
10617 # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
10618 assert isinstance(indices, np.ndarray) # for mypy
10619
10620 index = data._get_axis(axis)
10621 result = [index[i] if i >= 0 else np.nan for i in indices]
10622 final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
10623 return final_result.__finalize__(self, method="idxmin")
10624
10625 @doc(_shared_docs["idxmax"], numeric_only_default="False")
10626 def idxmax(
10627 self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
10628 ) -> Series:
10629 axis = self._get_axis_number(axis)
10630 if numeric_only:
10631 data = self._get_numeric_data()
10632 else:
10633 data = self
10634
10635 res = data._reduce(
10636 nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False
10637 )
10638 indices = res._values
10639
10640 # indices will always be np.ndarray since axis is not None and
10641 # values is a 2d array for DataFrame
10642 # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
10643 assert isinstance(indices, np.ndarray) # for mypy
10644
10645 index = data._get_axis(axis)
10646 result = [index[i] if i >= 0 else np.nan for i in indices]
10647 final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
10648 return final_result.__finalize__(self, method="idxmax")
10649
10650 def _get_agg_axis(self, axis_num: int) -> Index:
10651 """
10652 Let's be explicit about this.
10653 """
10654 if axis_num == 0:
10655 return self.columns
10656 elif axis_num == 1:
10657 return self.index
10658 else:
10659 raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})")
10660
10661 def mode(
10662 self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True
10663 ) -> DataFrame:
10664 """
10665 Get the mode(s) of each element along the selected axis.
10666
10667 The mode of a set of values is the value that appears most often.
10668 It can be multiple values.
10669
10670 Parameters
10671 ----------
10672 axis : {0 or 'index', 1 or 'columns'}, default 0
10673 The axis to iterate over while searching for the mode:
10674
10675 * 0 or 'index' : get mode of each column
10676 * 1 or 'columns' : get mode of each row.
10677
10678 numeric_only : bool, default False
10679 If True, only apply to numeric columns.
10680 dropna : bool, default True
10681 Don't consider counts of NaN/NaT.
10682
10683 Returns
10684 -------
10685 DataFrame
10686 The modes of each column or row.
10687
10688 See Also
10689 --------
10690 Series.mode : Return the highest frequency value in a Series.
10691 Series.value_counts : Return the counts of values in a Series.
10692
10693 Examples
10694 --------
10695 >>> df = pd.DataFrame([('bird', 2, 2),
10696 ... ('mammal', 4, np.nan),
10697 ... ('arthropod', 8, 0),
10698 ... ('bird', 2, np.nan)],
10699 ... index=('falcon', 'horse', 'spider', 'ostrich'),
10700 ... columns=('species', 'legs', 'wings'))
10701 >>> df
10702 species legs wings
10703 falcon bird 2 2.0
10704 horse mammal 4 NaN
10705 spider arthropod 8 0.0
10706 ostrich bird 2 NaN
10707
        By default, missing values are not considered, and the modes of
        ``wings`` are both 0.0 and 2.0. Because the resulting DataFrame has
        two rows, the second row of ``species`` and ``legs`` contains ``NaN``.
10711
10712 >>> df.mode()
10713 species legs wings
10714 0 bird 2.0 0.0
10715 1 NaN NaN 2.0
10716
        With ``dropna=False``, ``NaN`` values are considered and can be
        the mode (as for ``wings``).
10719
10720 >>> df.mode(dropna=False)
10721 species legs wings
10722 0 bird 2 NaN
10723
10724 Setting ``numeric_only=True``, only the mode of numeric columns is
10725 computed, and columns of other types are ignored.
10726
10727 >>> df.mode(numeric_only=True)
10728 legs wings
10729 0 2.0 0.0
10730 1 NaN 2.0
10731
10732 To compute the mode over columns and not rows, use the axis parameter:
10733
10734 >>> df.mode(axis='columns', numeric_only=True)
10735 0 1
10736 falcon 2.0 NaN
10737 horse 4.0 NaN
10738 spider 0.0 8.0
10739 ostrich 2.0 NaN
10740 """
10741 data = self if not numeric_only else self._get_numeric_data()
10742
10743 def f(s):
10744 return s.mode(dropna=dropna)
10745
10746 data = data.apply(f, axis=axis)
10747 # Ensure index is type stable (should always use int index)
10748 if data.empty:
10749 data.index = default_index(0)
10750
10751 return data
10752
10753 @overload
10754 def quantile(
10755 self,
10756 q: float = ...,
10757 axis: Axis = ...,
10758 numeric_only: bool = ...,
10759 interpolation: QuantileInterpolation = ...,
10760 ) -> Series:
10761 ...
10762
10763 @overload
10764 def quantile(
10765 self,
10766 q: AnyArrayLike | Sequence[float],
10767 axis: Axis = ...,
10768 numeric_only: bool = ...,
10769 interpolation: QuantileInterpolation = ...,
10770 ) -> Series | DataFrame:
10771 ...
10772
10773 @overload
10774 def quantile(
10775 self,
10776 q: float | AnyArrayLike | Sequence[float] = ...,
10777 axis: Axis = ...,
10778 numeric_only: bool = ...,
10779 interpolation: QuantileInterpolation = ...,
10780 ) -> Series | DataFrame:
10781 ...
10782
10783 def quantile(
10784 self,
10785 q: float | AnyArrayLike | Sequence[float] = 0.5,
10786 axis: Axis = 0,
10787 numeric_only: bool = False,
10788 interpolation: QuantileInterpolation = "linear",
10789 method: Literal["single", "table"] = "single",
10790 ) -> Series | DataFrame:
10791 """
10792 Return values at the given quantile over requested axis.
10793
10794 Parameters
10795 ----------
10796 q : float or array-like, default 0.5 (50% quantile)
            Value(s) between 0 and 1 giving the quantile(s) to compute.
10798 axis : {0 or 'index', 1 or 'columns'}, default 0
10799 Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
10800 numeric_only : bool, default False
10801 Include only `float`, `int` or `boolean` data.
10802
10803 .. versionchanged:: 2.0.0
10804 The default value of ``numeric_only`` is now ``False``.
10805
10806 interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
10807 This optional parameter specifies the interpolation method to use,
10808 when the desired quantile lies between two data points `i` and `j`:
10809
10810 * linear: `i + (j - i) * fraction`, where `fraction` is the
10811 fractional part of the index surrounded by `i` and `j`.
10812 * lower: `i`.
10813 * higher: `j`.
10814 * nearest: `i` or `j` whichever is nearest.
10815 * midpoint: (`i` + `j`) / 2.
10816 method : {'single', 'table'}, default 'single'
10817 Whether to compute quantiles per-column ('single') or over all columns
10818 ('table'). When 'table', the only allowed interpolation methods are
10819 'nearest', 'lower', and 'higher'.
10820
10821 Returns
10822 -------
10823 Series or DataFrame
10824
10825 If ``q`` is an array, a DataFrame will be returned where the
10826 index is ``q``, the columns are the columns of self, and the
10827 values are the quantiles.
10828 If ``q`` is a float, a Series will be returned where the
10829 index is the columns of self and the values are the quantiles.
10830
10831 See Also
10832 --------
10833 core.window.rolling.Rolling.quantile: Rolling quantile.
10834 numpy.percentile: Numpy function to compute the percentile.
10835
10836 Examples
10837 --------
10838 >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
10839 ... columns=['a', 'b'])
10840 >>> df.quantile(.1)
10841 a 1.3
10842 b 3.7
10843 Name: 0.1, dtype: float64
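
        With the default ``interpolation='linear'``, the 10% quantile of
        column ``a`` falls at position ``(4 - 1) * 0.1 = 0.3`` between the
        sorted values 1 and 2, giving ``1 + (2 - 1) * 0.3 = 1.3``.
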
10844 >>> df.quantile([.1, .5])
10845 a b
10846 0.1 1.3 3.7
10847 0.5 2.5 55.0
10848
10849 Specifying `method='table'` will compute the quantile over all columns.
10850
10851 >>> df.quantile(.1, method="table", interpolation="nearest")
10852 a 1
10853 b 1
10854 Name: 0.1, dtype: int64
10855 >>> df.quantile([.1, .5], method="table", interpolation="nearest")
10856 a b
10857 0.1 1 1
10858 0.5 3 100
10859
10860 Specifying `numeric_only=False` will also compute the quantile of
10861 datetime and timedelta data.
10862
10863 >>> df = pd.DataFrame({'A': [1, 2],
10864 ... 'B': [pd.Timestamp('2010'),
10865 ... pd.Timestamp('2011')],
10866 ... 'C': [pd.Timedelta('1 days'),
10867 ... pd.Timedelta('2 days')]})
10868 >>> df.quantile(0.5, numeric_only=False)
10869 A 1.5
10870 B 2010-07-02 12:00:00
10871 C 1 days 12:00:00
10872 Name: 0.5, dtype: object
10873 """
10874 validate_percentile(q)
10875 axis = self._get_axis_number(axis)
10876
10877 if not is_list_like(q):
10878 # BlockManager.quantile expects listlike, so we wrap and unwrap here
10879 # error: List item 0 has incompatible type "Union[float, Union[Union[
10880 # ExtensionArray, ndarray[Any, Any]], Index, Series], Sequence[float]]";
10881 # expected "float"
10882 res_df = self.quantile( # type: ignore[call-overload]
10883 [q],
10884 axis=axis,
10885 numeric_only=numeric_only,
10886 interpolation=interpolation,
10887 method=method,
10888 )
10889 if method == "single":
10890 res = res_df.iloc[0]
10891 else:
10892 # cannot directly iloc over sparse arrays
10893 res = res_df.T.iloc[:, 0]
10894 if axis == 1 and len(self) == 0:
10895 # GH#41544 try to get an appropriate dtype
10896 dtype = find_common_type(list(self.dtypes))
10897 if needs_i8_conversion(dtype):
10898 return res.astype(dtype)
10899 return res
10900
10901 q = Index(q, dtype=np.float64)
10902 data = self._get_numeric_data() if numeric_only else self
10903
10904 if axis == 1:
10905 data = data.T
10906
10907 if len(data.columns) == 0:
10908 # GH#23925 _get_numeric_data may have dropped all columns
10909 cols = Index([], name=self.columns.name)
10910
10911 dtype = np.float64
10912 if axis == 1:
10913 # GH#41544 try to get an appropriate dtype
10914 cdtype = find_common_type(list(self.dtypes))
10915 if needs_i8_conversion(cdtype):
10916 dtype = cdtype
10917
10918 res = self._constructor([], index=q, columns=cols, dtype=dtype)
10919 return res.__finalize__(self, method="quantile")
10920
10921 valid_method = {"single", "table"}
10922 if method not in valid_method:
10923 raise ValueError(
10924 f"Invalid method: {method}. Method must be in {valid_method}."
10925 )
10926 if method == "single":
10927 res = data._mgr.quantile(qs=q, axis=1, interpolation=interpolation)
10928 elif method == "table":
10929 valid_interpolation = {"nearest", "lower", "higher"}
10930 if interpolation not in valid_interpolation:
10931 raise ValueError(
10932 f"Invalid interpolation: {interpolation}. "
10933 f"Interpolation must be in {valid_interpolation}"
10934 )
10935 # handle degenerate case
10936 if len(data) == 0:
10937 if data.ndim == 2:
10938 dtype = find_common_type(list(self.dtypes))
10939 else:
10940 dtype = self.dtype
10941 return self._constructor([], index=q, columns=data.columns, dtype=dtype)
10942
10943 q_idx = np.quantile( # type: ignore[call-overload]
10944 np.arange(len(data)), q, **{np_percentile_argname: interpolation}
10945 )
10946
10947 by = data.columns
10948 if len(by) > 1:
10949 keys = [data._get_label_or_level_values(x) for x in by]
10950 indexer = lexsort_indexer(keys)
10951 else:
10952 by = by[0]
10953 k = data._get_label_or_level_values(by) # type: ignore[arg-type]
10954 indexer = nargsort(k)
10955
10956 res = data._mgr.take(indexer[q_idx], verify=False)
10957 res.axes[1] = q
10958
10959 result = self._constructor(res)
10960 return result.__finalize__(self, method="quantile")
10961
10962 @doc(NDFrame.asfreq, **_shared_doc_kwargs)
10963 def asfreq(
10964 self,
10965 freq: Frequency,
10966 method: FillnaOptions | None = None,
10967 how: str | None = None,
10968 normalize: bool = False,
10969 fill_value: Hashable = None,
10970 ) -> DataFrame:
10971 return super().asfreq(
10972 freq=freq,
10973 method=method,
10974 how=how,
10975 normalize=normalize,
10976 fill_value=fill_value,
10977 )
10978
10979 @doc(NDFrame.resample, **_shared_doc_kwargs)
10980 def resample(
10981 self,
10982 rule,
10983 axis: Axis = 0,
10984 closed: str | None = None,
10985 label: str | None = None,
10986 convention: str = "start",
10987 kind: str | None = None,
10988 on: Level = None,
10989 level: Level = None,
10990 origin: str | TimestampConvertibleTypes = "start_day",
10991 offset: TimedeltaConvertibleTypes | None = None,
10992 group_keys: bool = False,
10993 ) -> Resampler:
10994 return super().resample(
10995 rule=rule,
10996 axis=axis,
10997 closed=closed,
10998 label=label,
10999 convention=convention,
11000 kind=kind,
11001 on=on,
11002 level=level,
11003 origin=origin,
11004 offset=offset,
11005 group_keys=group_keys,
11006 )
11007
11008 def to_timestamp(
11009 self,
11010 freq: Frequency | None = None,
11011 how: str = "start",
11012 axis: Axis = 0,
11013 copy: bool | None = None,
11014 ) -> DataFrame:
11015 """
11016 Cast to DatetimeIndex of timestamps, at *beginning* of period.
11017
11018 Parameters
11019 ----------
11020 freq : str, default frequency of PeriodIndex
11021 Desired frequency.
11022 how : {'s', 'e', 'start', 'end'}
11023 Convention for converting period to timestamp; start of period
11024 vs. end.
11025 axis : {0 or 'index', 1 or 'columns'}, default 0
11026 The axis to convert (the index by default).
11027 copy : bool, default True
11028 If False then underlying input data is not copied.
11029
11030 Returns
11031 -------
11032 DataFrame
11033 The DataFrame has a DatetimeIndex.
11034
11035 Examples
11036 --------
11037 >>> idx = pd.PeriodIndex(['2023', '2024'], freq='Y')
11038 >>> d = {'col1': [1, 2], 'col2': [3, 4]}
11039 >>> df1 = pd.DataFrame(data=d, index=idx)
11040 >>> df1
11041 col1 col2
11042 2023 1 3
11043 2024 2 4
11044
11045 The resulting timestamps will be at the beginning of the year in this case
11046
11047 >>> df1 = df1.to_timestamp()
11048 >>> df1
11049 col1 col2
11050 2023-01-01 1 3
11051 2024-01-01 2 4
11052 >>> df1.index
11053 DatetimeIndex(['2023-01-01', '2024-01-01'], dtype='datetime64[ns]', freq=None)
11054
        Using `freq`, the offset that the resulting timestamps will have
11056
11057 >>> df2 = pd.DataFrame(data=d, index=idx)
11058 >>> df2 = df2.to_timestamp(freq='M')
11059 >>> df2
11060 col1 col2
11061 2023-01-31 1 3
11062 2024-01-31 2 4
11063 >>> df2.index
11064 DatetimeIndex(['2023-01-31', '2024-01-31'], dtype='datetime64[ns]', freq=None)
11065 """
11066 new_obj = self.copy(deep=copy and not using_copy_on_write())
11067
11068 axis_name = self._get_axis_name(axis)
11069 old_ax = getattr(self, axis_name)
11070 if not isinstance(old_ax, PeriodIndex):
11071 raise TypeError(f"unsupported Type {type(old_ax).__name__}")
11072
11073 new_ax = old_ax.to_timestamp(freq=freq, how=how)
11074
11075 setattr(new_obj, axis_name, new_ax)
11076 return new_obj
11077
11078 def to_period(
11079 self, freq: Frequency | None = None, axis: Axis = 0, copy: bool | None = None
11080 ) -> DataFrame:
11081 """
11082 Convert DataFrame from DatetimeIndex to PeriodIndex.
11083
11084 Convert DataFrame from DatetimeIndex to PeriodIndex with desired
11085 frequency (inferred from index if not passed).
11086
11087 Parameters
11088 ----------
        freq : str, optional
            Frequency of the PeriodIndex; inferred from the index if not passed.
11091 axis : {0 or 'index', 1 or 'columns'}, default 0
11092 The axis to convert (the index by default).
11093 copy : bool, default True
11094 If False then underlying input data is not copied.
11095
11096 Returns
11097 -------
11098 DataFrame
11099 The DataFrame has a PeriodIndex.
11100
11101 Examples
11102 --------
11103 >>> idx = pd.to_datetime(
11104 ... [
11105 ... "2001-03-31 00:00:00",
11106 ... "2002-05-31 00:00:00",
11107 ... "2003-08-31 00:00:00",
11108 ... ]
11109 ... )
11110
11111 >>> idx
11112 DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'],
11113 dtype='datetime64[ns]', freq=None)
11114
11115 >>> idx.to_period("M")
11116 PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]')
11117
11118 For the yearly frequency
11119
11120 >>> idx.to_period("Y")
11121 PeriodIndex(['2001', '2002', '2003'], dtype='period[A-DEC]')
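
        A DataFrame built on this index (an illustrative frame) converts the
        same way:

        >>> df = pd.DataFrame({"y": [1, 2, 3]}, index=idx)
        >>> df.to_period("M").index
        PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]')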
11122 """
11123 new_obj = self.copy(deep=copy and not using_copy_on_write())
11124
11125 axis_name = self._get_axis_name(axis)
11126 old_ax = getattr(self, axis_name)
11127 if not isinstance(old_ax, DatetimeIndex):
11128 raise TypeError(f"unsupported Type {type(old_ax).__name__}")
11129
11130 new_ax = old_ax.to_period(freq=freq)
11131
11132 setattr(new_obj, axis_name, new_ax)
11133 return new_obj
11134
11135 def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame:
11136 """
11137 Whether each element in the DataFrame is contained in values.
11138
11139 Parameters
11140 ----------
11141 values : iterable, Series, DataFrame or dict
11142 The result will only be true at a location if all the
11143 labels match. If `values` is a Series, that's the index. If
11144 `values` is a dict, the keys must be the column names,
11145 which must match. If `values` is a DataFrame,
11146 then both the index and column labels must match.
11147
11148 Returns
11149 -------
11150 DataFrame
11151 DataFrame of booleans showing whether each element in the DataFrame
11152 is contained in values.
11153
11154 See Also
11155 --------
11156 DataFrame.eq: Equality test for DataFrame.
11157 Series.isin: Equivalent method on Series.
11158 Series.str.contains: Test if pattern or regex is contained within a
11159 string of a Series or Index.
11160
11161 Examples
11162 --------
11163 >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
11164 ... index=['falcon', 'dog'])
11165 >>> df
11166 num_legs num_wings
11167 falcon 2 2
11168 dog 4 0
11169
        When ``values`` is a list, check whether every value in the DataFrame
        is present in the list (which animals have 0 or 2 legs or wings).
11172
11173 >>> df.isin([0, 2])
11174 num_legs num_wings
11175 falcon True True
11176 dog False True
11177
11178 To check if ``values`` is *not* in the DataFrame, use the ``~`` operator:
11179
11180 >>> ~df.isin([0, 2])
11181 num_legs num_wings
11182 falcon False False
11183 dog True False
11184
11185 When ``values`` is a dict, we can pass values to check for each
11186 column separately:
11187
11188 >>> df.isin({'num_wings': [0, 3]})
11189 num_legs num_wings
11190 falcon False False
11191 dog False True
11192
        When ``values`` is a Series or DataFrame, the index and column must
11194 match. Note that 'falcon' does not match based on the number of legs
11195 in other.
11196
11197 >>> other = pd.DataFrame({'num_legs': [8, 3], 'num_wings': [0, 2]},
11198 ... index=['spider', 'falcon'])
11199 >>> df.isin(other)
11200 num_legs num_wings
11201 falcon False True
11202 dog False False
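
        A Series is likewise aligned on the index, so each row is compared
        against a single value:

        >>> df.isin(pd.Series([2, 0], index=['falcon', 'dog']))
                num_legs  num_wings
        falcon      True       True
        dog        False       True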
11203 """
11204 if isinstance(values, dict):
11205 from pandas.core.reshape.concat import concat
11206
11207 values = collections.defaultdict(list, values)
11208 result = concat(
11209 (
11210 self.iloc[:, [i]].isin(values[col])
11211 for i, col in enumerate(self.columns)
11212 ),
11213 axis=1,
11214 )
11215 elif isinstance(values, Series):
11216 if not values.index.is_unique:
11217 raise ValueError("cannot compute isin with a duplicate axis.")
11218 result = self.eq(values.reindex_like(self), axis="index")
11219 elif isinstance(values, DataFrame):
11220 if not (values.columns.is_unique and values.index.is_unique):
11221 raise ValueError("cannot compute isin with a duplicate axis.")
11222 result = self.eq(values.reindex_like(self))
11223 else:
11224 if not is_list_like(values):
11225 raise TypeError(
11226 "only list-like or dict-like objects are allowed "
11227 "to be passed to DataFrame.isin(), "
11228 f"you passed a '{type(values).__name__}'"
11229 )
11230 # error: Argument 2 to "isin" has incompatible type "Union[Sequence[Any],
11231 # Mapping[Any, Any]]"; expected "Union[Union[ExtensionArray,
11232 # ndarray[Any, Any]], Index, Series]"
11233 result = self._constructor(
11234 algorithms.isin(
11235 self.values.ravel(), values # type: ignore[arg-type]
11236 ).reshape(self.shape),
11237 self.index,
11238 self.columns,
11239 copy=False,
11240 )
11241 return result.__finalize__(self, method="isin")

    # ----------------------------------------------------------------------
    # Add index and columns
    _AXIS_ORDERS: list[Literal["index", "columns"]] = ["index", "columns"]
    _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = {
        **NDFrame._AXIS_TO_AXIS_NUMBER,
        1: 1,
        "columns": 1,
    }
    _AXIS_LEN = len(_AXIS_ORDERS)
    _info_axis_number: Literal[1] = 1
    _info_axis_name: Literal["columns"] = "columns"

    index = properties.AxisProperty(
        axis=1, doc="The index (row labels) of the DataFrame."
    )
    columns = properties.AxisProperty(axis=0, doc="The column labels of the DataFrame.")

    # ----------------------------------------------------------------------
    # Add plotting methods to DataFrame
    plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
    hist = pandas.plotting.hist_frame
    boxplot = pandas.plotting.boxplot_frame
    sparse = CachedAccessor("sparse", SparseFrameAccessor)

    # ----------------------------------------------------------------------
    # Internal Interface Methods

    def _to_dict_of_blocks(self, copy: bool = True):
        """
        Return a dict of dtype -> constructor type (DataFrame), where each
        value contains only the columns of that single, homogeneous dtype.

        Internal ONLY - only works for BlockManager
        """
        mgr = self._mgr
        # convert to BlockManager if needed -> this way support ArrayManager as well
        mgr = mgr_to_mgr(mgr, "block")
        mgr = cast(BlockManager, mgr)
        return {
            k: self._constructor(v).__finalize__(self)
            for k, v in mgr.to_dict(copy=copy).items()
        }
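
    # Illustrative sketch of the return value (keys are the dtype strings
    # produced by BlockManager.to_dict; hypothetical, not executed here):
    #
    #   >>> df = pd.DataFrame({"a": [1, 2], "b": [1.5, 2.5]})
    #   >>> df._to_dict_of_blocks()
    #   {'int64': <DataFrame with column 'a'>, 'float64': <DataFrame with column 'b'>}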

    @property
    def values(self) -> np.ndarray:
        """
        Return a NumPy representation of the DataFrame.

        .. warning::

           We recommend using :meth:`DataFrame.to_numpy` instead.

        Only the values in the DataFrame will be returned; the axes labels
        will be removed.

        Returns
        -------
        numpy.ndarray
            The values of the DataFrame.

        See Also
        --------
        DataFrame.to_numpy : Recommended alternative to this method.
        DataFrame.index : Retrieve the index labels.
        DataFrame.columns : Retrieve the column names.

        Notes
        -----
        The dtype will be a lowest-common-denominator dtype (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen. Use this
        with care if you are not dealing with the blocks.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. If dtypes are int32 and uint8, dtype will be upcast to
        int32. By :func:`numpy.find_common_type` convention, mixing int64
        and uint64 will result in a float64 dtype.

        Examples
        --------
        A DataFrame where all columns are the same type (e.g., int64) results
        in an array of the same type.

        >>> df = pd.DataFrame({'age':    [ 3,  29],
        ...                    'height': [94, 170],
        ...                    'weight': [31, 115]})
        >>> df
           age  height  weight
        0    3      94      31
        1   29     170     115
        >>> df.dtypes
        age       int64
        height    int64
        weight    int64
        dtype: object
        >>> df.values
        array([[  3,  94,  31],
               [ 29, 170, 115]])

        A DataFrame with mixed-type columns (e.g., str/object, int64, float32)
        results in an ndarray of the broadest type that accommodates these
        mixed types (e.g., object).

        >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'),
        ...                     ('lion',   80.5, 1),
        ...                     ('monkey', np.nan, None)],
        ...                    columns=('name', 'max_speed', 'rank'))
        >>> df2.dtypes
        name          object
        max_speed    float64
        rank          object
        dtype: object
        >>> df2.values
        array([['parrot', 24.0, 'second'],
               ['lion', 80.5, 1],
               ['monkey', nan, None]], dtype=object)
        """
        return self._mgr.as_array()
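
    # As the docstring warns, ``DataFrame.to_numpy`` is usually preferable:
    # it exposes ``dtype`` and ``na_value`` so callers can control the
    # upcasting that ``values`` performs implicitly. Illustrative sketch:
    #
    #   >>> df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
    #   >>> df.to_numpy(dtype="float64")
    #   array([[1., 3.],
    #          [2., 4.]])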

    @overload
    def ffill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: Literal[False] = ...,
        limit: None | int = ...,
        downcast: dict | None = ...,
    ) -> DataFrame:
        ...

    @overload
    def ffill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: Literal[True],
        limit: None | int = ...,
        downcast: dict | None = ...,
    ) -> None:
        ...

    @overload
    def ffill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: bool = ...,
        limit: None | int = ...,
        downcast: dict | None = ...,
    ) -> DataFrame | None:
        ...

    def ffill(
        self,
        *,
        axis: None | Axis = None,
        inplace: bool = False,
        limit: None | int = None,
        downcast: dict | None = None,
    ) -> DataFrame | None:
        return super().ffill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
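
    # The overloads above only refine the return type for static type
    # checkers; the behaviour lives in NDFrame.ffill. Illustrative sketch of
    # forward fill:
    #
    #   >>> df = pd.DataFrame({"a": [1.0, None, 3.0]})
    #   >>> df.ffill()
    #        a
    #   0  1.0
    #   1  1.0
    #   2  3.0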

    @overload
    def bfill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: Literal[False] = ...,
        limit: None | int = ...,
        downcast=...,
    ) -> DataFrame:
        ...

    @overload
    def bfill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: Literal[True],
        limit: None | int = ...,
        downcast=...,
    ) -> None:
        ...

    @overload
    def bfill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: bool = ...,
        limit: None | int = ...,
        downcast=...,
    ) -> DataFrame | None:
        ...

    def bfill(
        self,
        *,
        axis: None | Axis = None,
        inplace: bool = False,
        limit: None | int = None,
        downcast=None,
    ) -> DataFrame | None:
        return super().bfill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
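
    # Counterpart of ffill: gaps are filled from the *next* valid observation
    # instead of the previous one. Illustrative sketch:
    #
    #   >>> df = pd.DataFrame({"a": [None, 2.0, None]})
    #   >>> df.bfill()
    #        a
    #   0  2.0
    #   1  2.0
    #   2  NaN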

    def clip(
        self: DataFrame,
        lower: float | None = None,
        upper: float | None = None,
        *,
        axis: Axis | None = None,
        inplace: bool = False,
        **kwargs,
    ) -> DataFrame | None:
        return super().clip(lower, upper, axis=axis, inplace=inplace, **kwargs)
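
    # Illustrative sketch: values below ``lower`` or above ``upper`` are
    # capped at the corresponding bound.
    #
    #   >>> df = pd.DataFrame({"a": [-1, 5, 10]})
    #   >>> df.clip(lower=0, upper=8)
    #      a
    #   0  0
    #   1  5
    #   2  8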

    def interpolate(
        self: DataFrame,
        method: str = "linear",
        *,
        axis: Axis = 0,
        limit: int | None = None,
        inplace: bool = False,
        limit_direction: str | None = None,
        limit_area: str | None = None,
        downcast: str | None = None,
        **kwargs,
    ) -> DataFrame | None:
        return super().interpolate(
            method=method,
            axis=axis,
            limit=limit,
            inplace=inplace,
            limit_direction=limit_direction,
            limit_area=limit_area,
            downcast=downcast,
            **kwargs,
        )
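
    # Illustrative sketch of the default ``method="linear"``: interior NaNs
    # are replaced by values interpolated from their neighbours.
    #
    #   >>> df = pd.DataFrame({"a": [1.0, None, 3.0]})
    #   >>> df.interpolate()
    #        a
    #   0  1.0
    #   1  2.0
    #   2  3.0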

    @overload
    def where(
        self,
        cond,
        other=...,
        *,
        inplace: Literal[False] = ...,
        axis: Axis | None = ...,
        level: Level = ...,
    ) -> DataFrame:
        ...

    @overload
    def where(
        self,
        cond,
        other=...,
        *,
        inplace: Literal[True],
        axis: Axis | None = ...,
        level: Level = ...,
    ) -> None:
        ...

    @overload
    def where(
        self,
        cond,
        other=...,
        *,
        inplace: bool = ...,
        axis: Axis | None = ...,
        level: Level = ...,
    ) -> DataFrame | None:
        ...

    def where(
        self,
        cond,
        other=lib.no_default,
        *,
        inplace: bool = False,
        axis: Axis | None = None,
        level: Level = None,
    ) -> DataFrame | None:
        return super().where(
            cond,
            other,
            inplace=inplace,
            axis=axis,
            level=level,
        )
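
    # Illustrative sketch: entries where ``cond`` is False are replaced by
    # ``other`` (NaN when ``other`` is not supplied):
    #
    #   >>> df = pd.DataFrame({"a": [-1, 2]})
    #   >>> df.where(df > 0)
    #        a
    #   0  NaN
    #   1  2.0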

    @overload
    def mask(
        self,
        cond,
        other=...,
        *,
        inplace: Literal[False] = ...,
        axis: Axis | None = ...,
        level: Level = ...,
    ) -> DataFrame:
        ...

    @overload
    def mask(
        self,
        cond,
        other=...,
        *,
        inplace: Literal[True],
        axis: Axis | None = ...,
        level: Level = ...,
    ) -> None:
        ...

    @overload
    def mask(
        self,
        cond,
        other=...,
        *,
        inplace: bool = ...,
        axis: Axis | None = ...,
        level: Level = ...,
    ) -> DataFrame | None:
        ...

    def mask(
        self,
        cond,
        other=lib.no_default,
        *,
        inplace: bool = False,
        axis: Axis | None = None,
        level: Level = None,
    ) -> DataFrame | None:
        return super().mask(
            cond,
            other,
            inplace=inplace,
            axis=axis,
            level=level,
        )
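
    # Inverse of ``where``: entries where ``cond`` is *True* are replaced.
    # Illustrative sketch:
    #
    #   >>> df = pd.DataFrame({"a": [-1, 2]})
    #   >>> df.mask(df > 0)
    #        a
    #   0 -1.0
    #   1  NaN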


DataFrame._add_numeric_operations()

ops.add_flex_arithmetic_methods(DataFrame)


def _from_nested_dict(data) -> collections.defaultdict:
    # Transpose a nested mapping of {index -> {column -> value}} into
    # {column -> {index -> value}} so a frame can be built column-wise.
    new_data: collections.defaultdict = collections.defaultdict(dict)
    for index, s in data.items():
        for col, v in s.items():
            new_data[col][index] = v
    return new_data
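
# Illustrative sketch of the transposition (hypothetical, not executed):
#
#   >>> _from_nested_dict({"row1": {"col1": 1, "col2": 2}})
#   defaultdict(<class 'dict'>, {'col1': {'row1': 1}, 'col2': {'row1': 2}})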


def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike:
    # reindex if necessary

    if value.index.equals(index) or not len(index):
        return value._values.copy()

    # GH#4107
    try:
        reindexed_value = value.reindex(index)._values
    except ValueError as err:
        # raised in MultiIndex.from_tuples, see test_insert_error_msmgs
        if not value.index.is_unique:
            # duplicate axis
            raise err

        raise TypeError(
            "incompatible index of inserted column with frame index"
        ) from err
    return reindexed_value
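
# Illustrative sketch of the alignment performed above (hypothetical, not
# executed): assigning a Series with a partially overlapping index leaves
# non-matching rows as NaN.
#
#   >>> df = pd.DataFrame({"a": [1, 2]}, index=["x", "y"])
#   >>> df["b"] = pd.Series([10], index=["x"])   # reindexed to ["x", "y"]
#   >>> df
#      a     b
#   x  1  10.0
#   y  2   NaN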