1"""
2DataFrame
3---------
4An efficient 2D container for potentially mixed-type time series or other
5labeled data series.
6
7Similar to its R counterpart, data.frame, except providing automatic data
8alignment and a host of useful data manipulation methods having to do with the
labeling information.
10"""
11from __future__ import annotations
12
13import collections
14from collections import abc
15from collections.abc import (
16 Hashable,
17 Iterable,
18 Iterator,
19 Mapping,
20 Sequence,
21)
22import functools
23from inspect import signature
24from io import StringIO
25import itertools
26import operator
27import sys
28from textwrap import dedent
29from typing import (
30 TYPE_CHECKING,
31 Any,
32 Callable,
33 Literal,
34 cast,
35 overload,
36)
37import warnings
38
39import numpy as np
40from numpy import ma
41
42from pandas._config import (
43 get_option,
44 using_copy_on_write,
45 warn_copy_on_write,
46)
47from pandas._config.config import _get_option
48
49from pandas._libs import (
50 algos as libalgos,
51 lib,
52 properties,
53)
54from pandas._libs.hashtable import duplicated
55from pandas._libs.lib import is_range_indexer
56from pandas.compat import PYPY
57from pandas.compat._constants import REF_COUNT
58from pandas.compat._optional import import_optional_dependency
59from pandas.compat.numpy import function as nv
60from pandas.errors import (
61 ChainedAssignmentError,
62 InvalidIndexError,
63 _chained_assignment_method_msg,
64 _chained_assignment_msg,
65 _chained_assignment_warning_method_msg,
66 _chained_assignment_warning_msg,
67)
68from pandas.util._decorators import (
69 Appender,
70 Substitution,
71 deprecate_nonkeyword_arguments,
72 doc,
73)
74from pandas.util._exceptions import (
75 find_stack_level,
76 rewrite_warning,
77)
78from pandas.util._validators import (
79 validate_ascending,
80 validate_bool_kwarg,
81 validate_percentile,
82)
83
84from pandas.core.dtypes.cast import (
85 LossySetitemError,
86 can_hold_element,
87 construct_1d_arraylike_from_scalar,
88 construct_2d_arraylike_from_scalar,
89 find_common_type,
90 infer_dtype_from_scalar,
91 invalidate_string_dtypes,
92 maybe_box_native,
93 maybe_downcast_to_dtype,
94)
95from pandas.core.dtypes.common import (
96 infer_dtype_from_object,
97 is_1d_only_ea_dtype,
98 is_array_like,
99 is_bool_dtype,
100 is_dataclass,
101 is_dict_like,
102 is_float,
103 is_float_dtype,
104 is_hashable,
105 is_integer,
106 is_integer_dtype,
107 is_iterator,
108 is_list_like,
109 is_scalar,
110 is_sequence,
111 needs_i8_conversion,
112 pandas_dtype,
113)
114from pandas.core.dtypes.concat import concat_compat
115from pandas.core.dtypes.dtypes import (
116 ArrowDtype,
117 BaseMaskedDtype,
118 ExtensionDtype,
119)
120from pandas.core.dtypes.missing import (
121 isna,
122 notna,
123)
124
125from pandas.core import (
126 algorithms,
127 common as com,
128 nanops,
129 ops,
130 roperator,
131)
132from pandas.core.accessor import CachedAccessor
133from pandas.core.apply import reconstruct_and_relabel_result
134from pandas.core.array_algos.take import take_2d_multi
135from pandas.core.arraylike import OpsMixin
136from pandas.core.arrays import (
137 BaseMaskedArray,
138 DatetimeArray,
139 ExtensionArray,
140 PeriodArray,
141 TimedeltaArray,
142)
143from pandas.core.arrays.sparse import SparseFrameAccessor
144from pandas.core.construction import (
145 ensure_wrapped_if_datetimelike,
146 sanitize_array,
147 sanitize_masked_array,
148)
149from pandas.core.generic import (
150 NDFrame,
151 make_doc,
152)
153from pandas.core.indexers import check_key_length
154from pandas.core.indexes.api import (
155 DatetimeIndex,
156 Index,
157 PeriodIndex,
158 default_index,
159 ensure_index,
160 ensure_index_from_sequences,
161)
162from pandas.core.indexes.multi import (
163 MultiIndex,
164 maybe_droplevels,
165)
166from pandas.core.indexing import (
167 check_bool_indexer,
168 check_dict_or_set_indexers,
169)
170from pandas.core.internals import (
171 ArrayManager,
172 BlockManager,
173)
174from pandas.core.internals.construction import (
175 arrays_to_mgr,
176 dataclasses_to_dicts,
177 dict_to_mgr,
178 mgr_to_mgr,
179 ndarray_to_mgr,
180 nested_data_to_arrays,
181 rec_array_to_mgr,
182 reorder_arrays,
183 to_arrays,
184 treat_as_nested,
185)
186from pandas.core.methods import selectn
187from pandas.core.reshape.melt import melt
188from pandas.core.series import Series
189from pandas.core.shared_docs import _shared_docs
190from pandas.core.sorting import (
191 get_group_index,
192 lexsort_indexer,
193 nargsort,
194)
195
196from pandas.io.common import get_handle
197from pandas.io.formats import (
198 console,
199 format as fmt,
200)
201from pandas.io.formats.info import (
202 INFO_DOCSTRING,
203 DataFrameInfo,
204 frame_sub_kwargs,
205)
206import pandas.plotting
207
208if TYPE_CHECKING:
209 import datetime
210
211 from pandas._libs.internals import BlockValuesRefs
212 from pandas._typing import (
213 AggFuncType,
214 AnyAll,
215 AnyArrayLike,
216 ArrayLike,
217 Axes,
218 Axis,
219 AxisInt,
220 ColspaceArgType,
221 CompressionOptions,
222 CorrelationMethod,
223 DropKeep,
224 Dtype,
225 DtypeObj,
226 FilePath,
227 FloatFormatType,
228 FormattersType,
229 Frequency,
230 FromDictOrient,
231 IgnoreRaise,
232 IndexKeyFunc,
233 IndexLabel,
234 JoinValidate,
235 Level,
236 MergeHow,
237 MergeValidate,
238 MutableMappingT,
239 NaAction,
240 NaPosition,
241 NsmallestNlargestKeep,
242 PythonFuncType,
243 QuantileInterpolation,
244 ReadBuffer,
245 ReindexMethod,
246 Renamer,
247 Scalar,
248 Self,
249 SequenceNotStr,
250 SortKind,
251 StorageOptions,
252 Suffixes,
253 ToGbqIfexist,
254 ToStataByteorder,
255 ToTimestampHow,
256 UpdateJoin,
257 ValueKeyFunc,
258 WriteBuffer,
259 XMLParsers,
260 npt,
261 )
262
263 from pandas.core.groupby.generic import DataFrameGroupBy
264 from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg
265 from pandas.core.internals import SingleDataManager
266
267 from pandas.io.formats.style import Styler
268
269# ---------------------------------------------------------------------
270# Docstring templates
271
272_shared_doc_kwargs = {
273 "axes": "index, columns",
274 "klass": "DataFrame",
275 "axes_single_arg": "{0 or 'index', 1 or 'columns'}",
276 "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0
277 If 0 or 'index': apply function to each column.
278 If 1 or 'columns': apply function to each row.""",
279 "inplace": """
280 inplace : bool, default False
281 Whether to modify the DataFrame rather than creating a new one.""",
282 "optional_by": """
283by : str or list of str
284 Name or list of names to sort by.
285
286 - if `axis` is 0 or `'index'` then `by` may contain index
287 levels and/or column labels.
288 - if `axis` is 1 or `'columns'` then `by` may contain column
289 levels and/or index labels.""",
290 "optional_reindex": """
291labels : array-like, optional
292 New labels / index to conform the axis specified by 'axis' to.
293index : array-like, optional
294 New labels for the index. Preferably an Index object to avoid
295 duplicating data.
296columns : array-like, optional
297 New labels for the columns. Preferably an Index object to avoid
298 duplicating data.
299axis : int or str, optional
300 Axis to target. Can be either the axis name ('index', 'columns')
301 or number (0, 1).""",
302}
303
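# These fragments are interpolated into method docstrings through the
# ``Substitution``/``Appender``/``doc`` decorators imported above. Rough sketch
# (illustrative only; ``some_method`` is a hypothetical example, not an API in
# this module):
#
#     @Substitution(**_shared_doc_kwargs)
#     @Appender("Apply along an axis.\n\nParameters\n----------\n%(axis)s\n%(inplace)s")
#     def some_method(self, axis=0, inplace=False):
#         ...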
304_merge_doc = """
305Merge DataFrame or named Series objects with a database-style join.
306
307A named Series object is treated as a DataFrame with a single named column.
308
309The join is done on columns or indexes. If joining columns on
310columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
311on indexes or indexes on a column or columns, the index will be passed on.
312When performing a cross merge, no column specifications to merge on are
313allowed.
314
315.. warning::
316
317 If both key columns contain rows where the key is a null value, those
318 rows will be matched against each other. This is different from usual SQL
319 join behaviour and can lead to unexpected results.
320
321Parameters
322----------%s
323right : DataFrame or named Series
324 Object to merge with.
325how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
326 Type of merge to be performed.
327
328 * left: use only keys from left frame, similar to a SQL left outer join;
329 preserve key order.
330 * right: use only keys from right frame, similar to a SQL right outer join;
331 preserve key order.
332 * outer: use union of keys from both frames, similar to a SQL full outer
333 join; sort keys lexicographically.
334 * inner: use intersection of keys from both frames, similar to a SQL inner
335 join; preserve the order of the left keys.
336 * cross: creates the cartesian product from both frames, preserves the order
337 of the left keys.
338on : label or list
339 Column or index level names to join on. These must be found in both
340 DataFrames. If `on` is None and not merging on indexes then this defaults
341 to the intersection of the columns in both DataFrames.
342left_on : label or list, or array-like
343 Column or index level names to join on in the left DataFrame. Can also
344 be an array or list of arrays of the length of the left DataFrame.
345 These arrays are treated as if they are columns.
346right_on : label or list, or array-like
347 Column or index level names to join on in the right DataFrame. Can also
348 be an array or list of arrays of the length of the right DataFrame.
349 These arrays are treated as if they are columns.
350left_index : bool, default False
351 Use the index from the left DataFrame as the join key(s). If it is a
352 MultiIndex, the number of keys in the other DataFrame (either the index
353 or a number of columns) must match the number of levels.
354right_index : bool, default False
355 Use the index from the right DataFrame as the join key. Same caveats as
356 left_index.
357sort : bool, default False
358 Sort the join keys lexicographically in the result DataFrame. If False,
359 the order of the join keys depends on the join type (how keyword).
360suffixes : list-like, default is ("_x", "_y")
361 A length-2 sequence where each element is optionally a string
362 indicating the suffix to add to overlapping column names in
363 `left` and `right` respectively. Pass a value of `None` instead
364 of a string to indicate that the column name from `left` or
365 `right` should be left as-is, with no suffix. At least one of the
366 values must not be None.
367copy : bool, default True
368 If False, avoid copy if possible.
369
370 .. note::
371 The `copy` keyword will change behavior in pandas 3.0.
372 `Copy-on-Write
373 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
374 will be enabled by default, which means that all methods with a
375 `copy` keyword will use a lazy copy mechanism to defer the copy and
376 ignore the `copy` keyword. The `copy` keyword will be removed in a
377 future version of pandas.
378
379 You can already get the future behavior and improvements through
380 enabling copy on write ``pd.options.mode.copy_on_write = True``
381indicator : bool or str, default False
382 If True, adds a column to the output DataFrame called "_merge" with
383 information on the source of each row. The column can be given a different
384 name by providing a string argument. The column will have a Categorical
385 type with the value of "left_only" for observations whose merge key only
386 appears in the left DataFrame, "right_only" for observations
387 whose merge key only appears in the right DataFrame, and "both"
388 if the observation's merge key is found in both DataFrames.
389
390validate : str, optional
391 If specified, checks if merge is of specified type.
392
393 * "one_to_one" or "1:1": check if merge keys are unique in both
394 left and right datasets.
395 * "one_to_many" or "1:m": check if merge keys are unique in left
396 dataset.
397 * "many_to_one" or "m:1": check if merge keys are unique in right
398 dataset.
399 * "many_to_many" or "m:m": allowed, but does not result in checks.
400
401Returns
402-------
403DataFrame
404 A DataFrame of the two merged objects.
405
406See Also
407--------
408merge_ordered : Merge with optional filling/interpolation.
409merge_asof : Merge on nearest keys.
410DataFrame.join : Similar method using indices.
411
412Examples
413--------
414>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
415... 'value': [1, 2, 3, 5]})
416>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
417... 'value': [5, 6, 7, 8]})
418>>> df1
419 lkey value
4200 foo 1
4211 bar 2
4222 baz 3
4233 foo 5
424>>> df2
425 rkey value
4260 foo 5
4271 bar 6
4282 baz 7
4293 foo 8
430
431Merge df1 and df2 on the lkey and rkey columns. The value columns have
432the default suffixes, _x and _y, appended.
433
434>>> df1.merge(df2, left_on='lkey', right_on='rkey')
435 lkey value_x rkey value_y
4360 foo 1 foo 5
4371 foo 1 foo 8
4382 bar 2 bar 6
4393 baz 3 baz 7
4404 foo 5 foo 5
4415 foo 5 foo 8
442
443Merge DataFrames df1 and df2 with specified left and right suffixes
444appended to any overlapping columns.
445
446>>> df1.merge(df2, left_on='lkey', right_on='rkey',
447... suffixes=('_left', '_right'))
448 lkey value_left rkey value_right
4490 foo 1 foo 5
4501 foo 1 foo 8
4512 bar 2 bar 6
4523 baz 3 baz 7
4534 foo 5 foo 5
4545 foo 5 foo 8
455
456Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
457any overlapping columns.
458
459>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
460Traceback (most recent call last):
461...
462ValueError: columns overlap but no suffix specified:
463 Index(['value'], dtype='object')
464
465>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
466>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
467>>> df1
468 a b
4690 foo 1
4701 bar 2
471>>> df2
472 a c
4730 foo 3
4741 baz 4
475
476>>> df1.merge(df2, how='inner', on='a')
477 a b c
4780 foo 1 3
479
480>>> df1.merge(df2, how='left', on='a')
481 a b c
4820 foo 1 3.0
4831 bar 2 NaN
484
485>>> df1 = pd.DataFrame({'left': ['foo', 'bar']})
486>>> df2 = pd.DataFrame({'right': [7, 8]})
487>>> df1
488 left
4890 foo
4901 bar
491>>> df2
492 right
4930 7
4941 8
495
496>>> df1.merge(df2, how='cross')
497 left right
4980 foo 7
4991 foo 8
5002 bar 7
5013 bar 8
502"""
503
504
505# -----------------------------------------------------------------------
506# DataFrame class
507
508
509class DataFrame(NDFrame, OpsMixin):
510 """
511 Two-dimensional, size-mutable, potentially heterogeneous tabular data.
512
513 Data structure also contains labeled axes (rows and columns).
514 Arithmetic operations align on both row and column labels. Can be
515 thought of as a dict-like container for Series objects. The primary
516 pandas data structure.
517
518 Parameters
519 ----------
520 data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
521 Dict can contain Series, arrays, constants, dataclass or list-like objects. If
522 data is a dict, column order follows insertion-order. If a dict contains Series
523 which have an index defined, it is aligned by its index. This alignment also
524 occurs if data is a Series or a DataFrame itself. Alignment is done on
525 Series/DataFrame inputs.
526
527 If data is a list of dicts, column order follows insertion-order.
528
529 index : Index or array-like
530 Index to use for resulting frame. Will default to RangeIndex if
        no indexing information is part of the input data and no index is provided.
532 columns : Index or array-like
533 Column labels to use for resulting frame when data does not have them,
534 defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels,
535 will perform column selection instead.
536 dtype : dtype, default None
537 Data type to force. Only a single dtype is allowed. If None, infer.
538 copy : bool or None, default None
539 Copy data from inputs.
540 For dict data, the default of None behaves like ``copy=True``. For DataFrame
541 or 2d ndarray input, the default of None behaves like ``copy=False``.
542 If data is a dict containing one or more Series (possibly of different dtypes),
543 ``copy=False`` will ensure that these inputs are not copied.
544
545 .. versionchanged:: 1.3.0
546
547 See Also
548 --------
549 DataFrame.from_records : Constructor from tuples, also record arrays.
550 DataFrame.from_dict : From dicts of Series, arrays, or dicts.
551 read_csv : Read a comma-separated values (csv) file into DataFrame.
552 read_table : Read general delimited file into DataFrame.
553 read_clipboard : Read text from clipboard into DataFrame.
554
555 Notes
556 -----
557 Please reference the :ref:`User Guide <basics.dataframe>` for more information.
558
559 Examples
560 --------
561 Constructing DataFrame from a dictionary.
562
563 >>> d = {'col1': [1, 2], 'col2': [3, 4]}
564 >>> df = pd.DataFrame(data=d)
565 >>> df
566 col1 col2
567 0 1 3
568 1 2 4
569
570 Notice that the inferred dtype is int64.
571
572 >>> df.dtypes
573 col1 int64
574 col2 int64
575 dtype: object
576
577 To enforce a single dtype:
578
579 >>> df = pd.DataFrame(data=d, dtype=np.int8)
580 >>> df.dtypes
581 col1 int8
582 col2 int8
583 dtype: object
584
585 Constructing DataFrame from a dictionary including Series:
586
587 >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
588 >>> pd.DataFrame(data=d, index=[0, 1, 2, 3])
589 col1 col2
590 0 0 NaN
591 1 1 NaN
592 2 2 2.0
593 3 3 3.0
594
595 Constructing DataFrame from numpy ndarray:
596
597 >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
598 ... columns=['a', 'b', 'c'])
599 >>> df2
600 a b c
601 0 1 2 3
602 1 4 5 6
603 2 7 8 9
604
605 Constructing DataFrame from a numpy ndarray that has labeled columns:
606
607 >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
608 ... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")])
609 >>> df3 = pd.DataFrame(data, columns=['c', 'a'])
610 ...
611 >>> df3
612 c a
613 0 3 1
614 1 6 4
615 2 9 7
616
617 Constructing DataFrame from dataclass:
618
619 >>> from dataclasses import make_dataclass
620 >>> Point = make_dataclass("Point", [("x", int), ("y", int)])
621 >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
622 x y
623 0 0 0
624 1 0 3
625 2 2 3
626
627 Constructing DataFrame from Series/DataFrame:
628
629 >>> ser = pd.Series([1, 2, 3], index=["a", "b", "c"])
630 >>> df = pd.DataFrame(data=ser, index=["a", "c"])
631 >>> df
632 0
633 a 1
634 c 3
635
636 >>> df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"], columns=["x"])
637 >>> df2 = pd.DataFrame(data=df1, index=["a", "c"])
638 >>> df2
639 x
640 a 1
641 c 3
642 """
643
644 _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set
645 _typ = "dataframe"
646 _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray)
647 _accessors: set[str] = {"sparse"}
648 _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([])
649 _mgr: BlockManager | ArrayManager
650
651 # similar to __array_priority__, positions DataFrame before Series, Index,
652 # and ExtensionArray. Should NOT be overridden by subclasses.
653 __pandas_priority__ = 4000
654
655 @property
656 def _constructor(self) -> Callable[..., DataFrame]:
657 return DataFrame
658
659 def _constructor_from_mgr(self, mgr, axes) -> DataFrame:
660 df = DataFrame._from_mgr(mgr, axes=axes)
661
662 if type(self) is DataFrame:
663 # This would also work `if self._constructor is DataFrame`, but
664 # this check is slightly faster, benefiting the most-common case.
665 return df
666
667 elif type(self).__name__ == "GeoDataFrame":
668 # Shim until geopandas can override their _constructor_from_mgr
669 # bc they have different behavior for Managers than for DataFrames
670 return self._constructor(mgr)
671
672 # We assume that the subclass __init__ knows how to handle a
673 # pd.DataFrame object.
674 return self._constructor(df)
675
676 _constructor_sliced: Callable[..., Series] = Series
677
678 def _constructor_sliced_from_mgr(self, mgr, axes) -> Series:
679 ser = Series._from_mgr(mgr, axes)
680 ser._name = None # caller is responsible for setting real name
681
682 if type(self) is DataFrame:
683 # This would also work `if self._constructor_sliced is Series`, but
684 # this check is slightly faster, benefiting the most-common case.
685 return ser
686
687 # We assume that the subclass __init__ knows how to handle a
688 # pd.Series object.
689 return self._constructor_sliced(ser)
690
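    # The ``_constructor*_from_mgr`` helpers above let DataFrame subclasses come
    # back out of internal operations with their own type. A minimal sketch of
    # the documented subclassing pattern they support (illustrative only;
    # ``SubSeries``/``SubFrame`` are hypothetical names):
    #
    #     class SubSeries(pd.Series):
    #         @property
    #         def _constructor(self):
    #             return SubSeries
    #
    #     class SubFrame(pd.DataFrame):
    #         @property
    #         def _constructor(self):
    #             return SubFrame
    #
    #         @property
    #         def _constructor_sliced(self):
    #             return SubSeries
    #
    #     type(SubFrame({"a": [1, 2]}).head())  # SubFrame
    #     type(SubFrame({"a": [1, 2]})["a"])    # SubSeries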
691 # ----------------------------------------------------------------------
692 # Constructors
693
694 def __init__(
695 self,
696 data=None,
697 index: Axes | None = None,
698 columns: Axes | None = None,
699 dtype: Dtype | None = None,
700 copy: bool | None = None,
701 ) -> None:
702 allow_mgr = False
703 if dtype is not None:
704 dtype = self._validate_dtype(dtype)
705
706 if isinstance(data, DataFrame):
707 data = data._mgr
708 allow_mgr = True
709 if not copy:
710 # if not copying data, ensure to still return a shallow copy
711 # to avoid the result sharing the same Manager
712 data = data.copy(deep=False)
713
714 if isinstance(data, (BlockManager, ArrayManager)):
715 if not allow_mgr:
716 # GH#52419
717 warnings.warn(
718 f"Passing a {type(data).__name__} to {type(self).__name__} "
719 "is deprecated and will raise in a future version. "
720 "Use public APIs instead.",
721 DeprecationWarning,
722 stacklevel=1, # bump to 2 once pyarrow 15.0 is released with fix
723 )
724
725 if using_copy_on_write():
726 data = data.copy(deep=False)
727 # first check if a Manager is passed without any other arguments
728 # -> use fastpath (without checking Manager type)
729 if index is None and columns is None and dtype is None and not copy:
730 # GH#33357 fastpath
731 NDFrame.__init__(self, data)
732 return
733
734 manager = _get_option("mode.data_manager", silent=True)
735
736 is_pandas_object = isinstance(data, (Series, Index, ExtensionArray))
737 data_dtype = getattr(data, "dtype", None)
738 original_dtype = dtype
739
740 # GH47215
741 if isinstance(index, set):
742 raise ValueError("index cannot be a set")
743 if isinstance(columns, set):
744 raise ValueError("columns cannot be a set")
745
746 if copy is None:
747 if isinstance(data, dict):
748 # retain pre-GH#38939 default behavior
749 copy = True
750 elif (
751 manager == "array"
752 and isinstance(data, (np.ndarray, ExtensionArray))
753 and data.ndim == 2
754 ):
755 # INFO(ArrayManager) by default copy the 2D input array to get
756 # contiguous 1D arrays
757 copy = True
758 elif using_copy_on_write() and not isinstance(
759 data, (Index, DataFrame, Series)
760 ):
761 copy = True
762 else:
763 copy = False
764
765 if data is None:
766 index = index if index is not None else default_index(0)
767 columns = columns if columns is not None else default_index(0)
768 dtype = dtype if dtype is not None else pandas_dtype(object)
769 data = []
770
771 if isinstance(data, (BlockManager, ArrayManager)):
772 mgr = self._init_mgr(
773 data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
774 )
775
776 elif isinstance(data, dict):
777 # GH#38939 de facto copy defaults to False only in non-dict cases
778 mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
779 elif isinstance(data, ma.MaskedArray):
780 from numpy.ma import mrecords
781
782 # masked recarray
783 if isinstance(data, mrecords.MaskedRecords):
784 raise TypeError(
785 "MaskedRecords are not supported. Pass "
786 "{name: data[name] for name in data.dtype.names} "
787 "instead"
788 )
789
790 # a masked array
791 data = sanitize_masked_array(data)
792 mgr = ndarray_to_mgr(
793 data,
794 index,
795 columns,
796 dtype=dtype,
797 copy=copy,
798 typ=manager,
799 )
800
801 elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)):
802 if data.dtype.names:
803 # i.e. numpy structured array
804 data = cast(np.ndarray, data)
805 mgr = rec_array_to_mgr(
806 data,
807 index,
808 columns,
809 dtype,
810 copy,
811 typ=manager,
812 )
813 elif getattr(data, "name", None) is not None:
814 # i.e. Series/Index with non-None name
815 _copy = copy if using_copy_on_write() else True
816 mgr = dict_to_mgr(
817 # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no
818 # attribute "name"
819 {data.name: data}, # type: ignore[union-attr]
820 index,
821 columns,
822 dtype=dtype,
823 typ=manager,
824 copy=_copy,
825 )
826 else:
827 mgr = ndarray_to_mgr(
828 data,
829 index,
830 columns,
831 dtype=dtype,
832 copy=copy,
833 typ=manager,
834 )
835
836 # For data is list-like, or Iterable (will consume into list)
837 elif is_list_like(data):
838 if not isinstance(data, abc.Sequence):
839 if hasattr(data, "__array__"):
840 # GH#44616 big perf improvement for e.g. pytorch tensor
841 data = np.asarray(data)
842 else:
843 data = list(data)
844 if len(data) > 0:
845 if is_dataclass(data[0]):
846 data = dataclasses_to_dicts(data)
847 if not isinstance(data, np.ndarray) and treat_as_nested(data):
848 # exclude ndarray as we may have cast it a few lines above
849 if columns is not None:
850 columns = ensure_index(columns)
851 arrays, columns, index = nested_data_to_arrays(
852 # error: Argument 3 to "nested_data_to_arrays" has incompatible
853 # type "Optional[Collection[Any]]"; expected "Optional[Index]"
854 data,
855 columns,
856 index, # type: ignore[arg-type]
857 dtype,
858 )
859 mgr = arrays_to_mgr(
860 arrays,
861 columns,
862 index,
863 dtype=dtype,
864 typ=manager,
865 )
866 else:
867 mgr = ndarray_to_mgr(
868 data,
869 index,
870 columns,
871 dtype=dtype,
872 copy=copy,
873 typ=manager,
874 )
875 else:
876 mgr = dict_to_mgr(
877 {},
878 index,
879 columns if columns is not None else default_index(0),
880 dtype=dtype,
881 typ=manager,
882 )
883 # For data is scalar
884 else:
885 if index is None or columns is None:
886 raise ValueError("DataFrame constructor not properly called!")
887
888 index = ensure_index(index)
889 columns = ensure_index(columns)
890
891 if not dtype:
892 dtype, _ = infer_dtype_from_scalar(data)
893
894 # For data is a scalar extension dtype
895 if isinstance(dtype, ExtensionDtype):
896 # TODO(EA2D): special case not needed with 2D EAs
897
898 values = [
899 construct_1d_arraylike_from_scalar(data, len(index), dtype)
900 for _ in range(len(columns))
901 ]
902 mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager)
903 else:
904 arr2d = construct_2d_arraylike_from_scalar(
905 data,
906 len(index),
907 len(columns),
908 dtype,
909 copy,
910 )
911
912 mgr = ndarray_to_mgr(
913 arr2d,
914 index,
915 columns,
916 dtype=arr2d.dtype,
917 copy=False,
918 typ=manager,
919 )
920
921 # ensure correct Manager type according to settings
922 mgr = mgr_to_mgr(mgr, typ=manager)
923
924 NDFrame.__init__(self, mgr)
925
926 if original_dtype is None and is_pandas_object and data_dtype == np.object_:
927 if self.dtypes.iloc[0] != data_dtype:
928 warnings.warn(
929 "Dtype inference on a pandas object "
930 "(Series, Index, ExtensionArray) is deprecated. The DataFrame "
931 "constructor will keep the original dtype in the future. "
932 "Call `infer_objects` on the result to get the old "
933 "behavior.",
934 FutureWarning,
935 stacklevel=2,
936 )
937
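    # A quick summary of the constructor dispatch above (illustrative):
    #
    #     pd.DataFrame({"a": [1, 2]})                          # dict path
    #     pd.DataFrame(np.zeros((2, 2)), columns=["x", "y"])   # 2D ndarray path
    #     pd.DataFrame([{"a": 1}, {"a": 2}])                   # nested/list-like path
    #     pd.DataFrame(0, index=[0, 1], columns=["x", "y"])    # scalar broadcast
    #     pd.DataFrame(0)                                      # ValueError: constructor
    #                                                          # not properly called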
938 # ----------------------------------------------------------------------
939
940 def __dataframe__(
941 self, nan_as_null: bool = False, allow_copy: bool = True
942 ) -> DataFrameXchg:
943 """
944 Return the dataframe interchange object implementing the interchange protocol.
945
946 Parameters
947 ----------
948 nan_as_null : bool, default False
949 `nan_as_null` is DEPRECATED and has no effect. Please avoid using
950 it; it will be removed in a future release.
951 allow_copy : bool, default True
952 Whether to allow memory copying when exporting. If set to False
953 it would cause non-zero-copy exports to fail.
954
955 Returns
956 -------
957 DataFrame interchange object
            The object which the consuming library can use to ingest the dataframe.
959
960 Notes
961 -----
962 Details on the interchange protocol:
963 https://data-apis.org/dataframe-protocol/latest/index.html
964
965 Examples
966 --------
967 >>> df_not_necessarily_pandas = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
968 >>> interchange_object = df_not_necessarily_pandas.__dataframe__()
969 >>> interchange_object.column_names()
970 Index(['A', 'B'], dtype='object')
971 >>> df_pandas = (pd.api.interchange.from_dataframe
972 ... (interchange_object.select_columns_by_name(['A'])))
973 >>> df_pandas
974 A
975 0 1
976 1 2
977
978 These methods (``column_names``, ``select_columns_by_name``) should work
979 for any dataframe library which implements the interchange protocol.
980 """
981
982 from pandas.core.interchange.dataframe import PandasDataFrameXchg
983
984 return PandasDataFrameXchg(self, allow_copy=allow_copy)
985
986 def __dataframe_consortium_standard__(
987 self, *, api_version: str | None = None
988 ) -> Any:
989 """
990 Provide entry point to the Consortium DataFrame Standard API.
991
992 This is developed and maintained outside of pandas.
993 Please report any issues to https://github.com/data-apis/dataframe-api-compat.
994 """
995 dataframe_api_compat = import_optional_dependency("dataframe_api_compat")
996 convert_to_standard_compliant_dataframe = (
997 dataframe_api_compat.pandas_standard.convert_to_standard_compliant_dataframe
998 )
999 return convert_to_standard_compliant_dataframe(self, api_version=api_version)
1000
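    # Usage sketch (illustrative; requires the optional ``dataframe_api_compat``
    # package, and the returned object exposes the Consortium Standard's own API
    # rather than pandas methods):
    #
    #     df = pd.DataFrame({"a": [1, 2, 3]})
    #     std_df = df.__dataframe_consortium_standard__()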
1001 def __arrow_c_stream__(self, requested_schema=None):
1002 """
1003 Export the pandas DataFrame as an Arrow C stream PyCapsule.
1004
1005 This relies on pyarrow to convert the pandas DataFrame to the Arrow
1006 format (and follows the default behaviour of ``pyarrow.Table.from_pandas``
1007 in its handling of the index, i.e. store the index as a column except
1008 for RangeIndex).
1009 This conversion is not necessarily zero-copy.
1010
1011 Parameters
1012 ----------
1013 requested_schema : PyCapsule, default None
            The schema to which the dataframe should be cast, passed as a
1015 PyCapsule containing a C ArrowSchema representation of the
1016 requested schema.
1017
1018 Returns
1019 -------
1020 PyCapsule
1021 """
1022 pa = import_optional_dependency("pyarrow", min_version="14.0.0")
1023 if requested_schema is not None:
1024 requested_schema = pa.Schema._import_from_c_capsule(requested_schema)
1025 table = pa.Table.from_pandas(self, schema=requested_schema)
1026 return table.__arrow_c_stream__()
1027
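    # Consumption sketch (illustrative; the capsule is only useful to Arrow-aware
    # consumers, and ``pa.table(df)`` delegating to this method assumes a pyarrow
    # version that understands the Arrow PyCapsule protocol):
    #
    #     df = pd.DataFrame({"a": [1, 2, 3]})
    #     capsule = df.__arrow_c_stream__()  # PyCapsule wrapping an ArrowArrayStream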
1028 # ----------------------------------------------------------------------
1029
1030 @property
1031 def axes(self) -> list[Index]:
1032 """
1033 Return a list representing the axes of the DataFrame.
1034
1035 It has the row axis labels and column axis labels as the only members.
1036 They are returned in that order.
1037
1038 Examples
1039 --------
1040 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
1041 >>> df.axes
1042 [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
1043 dtype='object')]
1044 """
1045 return [self.index, self.columns]
1046
1047 @property
1048 def shape(self) -> tuple[int, int]:
1049 """
1050 Return a tuple representing the dimensionality of the DataFrame.
1051
1052 See Also
1053 --------
1054 ndarray.shape : Tuple of array dimensions.
1055
1056 Examples
1057 --------
1058 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
1059 >>> df.shape
1060 (2, 2)
1061
1062 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
1063 ... 'col3': [5, 6]})
1064 >>> df.shape
1065 (2, 3)
1066 """
1067 return len(self.index), len(self.columns)
1068
1069 @property
1070 def _is_homogeneous_type(self) -> bool:
1071 """
1072 Whether all the columns in a DataFrame have the same type.
1073
1074 Returns
1075 -------
1076 bool
1077
1078 Examples
1079 --------
1080 >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
1081 True
1082 >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
1083 False
1084
1085 Items with the same type but different sizes are considered
1086 different types.
1087
1088 >>> DataFrame({
1089 ... "A": np.array([1, 2], dtype=np.int32),
1090 ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
1091 False
1092 """
1093 # The "<" part of "<=" here is for empty DataFrame cases
1094 return len({arr.dtype for arr in self._mgr.arrays}) <= 1
1095
1096 @property
1097 def _can_fast_transpose(self) -> bool:
1098 """
1099 Can we transpose this DataFrame without creating any new array objects.
1100 """
1101 if isinstance(self._mgr, ArrayManager):
1102 return False
1103 blocks = self._mgr.blocks
1104 if len(blocks) != 1:
1105 return False
1106
1107 dtype = blocks[0].dtype
1108 # TODO(EA2D) special case would be unnecessary with 2D EAs
1109 return not is_1d_only_ea_dtype(dtype)
1110
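    # For intuition (illustrative, assuming the default BlockManager):
    #
    #     pd.DataFrame(np.ones((3, 2)))._can_fast_transpose         # True: one 2D block
    #     pd.DataFrame({"a": [1], "b": ["x"]})._can_fast_transpose  # False: mixed dtypes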
1111 @property
1112 def _values(self) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray:
1113 """
1114 Analogue to ._values that may return a 2D ExtensionArray.
1115 """
1116 mgr = self._mgr
1117
1118 if isinstance(mgr, ArrayManager):
1119 if len(mgr.arrays) == 1 and not is_1d_only_ea_dtype(mgr.arrays[0].dtype):
1120 # error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]"
1121 # has no attribute "reshape"
1122 return mgr.arrays[0].reshape(-1, 1) # type: ignore[union-attr]
1123 return ensure_wrapped_if_datetimelike(self.values)
1124
1125 blocks = mgr.blocks
1126 if len(blocks) != 1:
1127 return ensure_wrapped_if_datetimelike(self.values)
1128
1129 arr = blocks[0].values
1130 if arr.ndim == 1:
1131 # non-2D ExtensionArray
1132 return self.values
1133
1134 # more generally, whatever we allow in NDArrayBackedExtensionBlock
1135 arr = cast("np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", arr)
1136 return arr.T
1137
1138 # ----------------------------------------------------------------------
1139 # Rendering Methods
1140
1141 def _repr_fits_vertical_(self) -> bool:
1142 """
1143 Check length against max_rows.
1144 """
1145 max_rows = get_option("display.max_rows")
1146 return len(self) <= max_rows
1147
1148 def _repr_fits_horizontal_(self) -> bool:
1149 """
1150 Check if full repr fits in horizontal boundaries imposed by the display
1151 options width and max_columns.
1152 """
1153 width, height = console.get_console_size()
1154 max_columns = get_option("display.max_columns")
1155 nb_columns = len(self.columns)
1156
1157 # exceed max columns
1158 if (max_columns and nb_columns > max_columns) or (
1159 width and nb_columns > (width // 2)
1160 ):
1161 return False
1162
        # used by repr_html under IPython notebook; scripts ignore terminal dims
1165 if width is None or not console.in_interactive_session():
1166 return True
1167
1168 if get_option("display.width") is not None or console.in_ipython_frontend():
1169 # check at least the column row for excessive width
1170 max_rows = 1
1171 else:
1172 max_rows = get_option("display.max_rows")
1173
1174 # when auto-detecting, so width=None and not in ipython front end
1175 # check whether repr fits horizontal by actually checking
1176 # the width of the rendered repr
1177 buf = StringIO()
1178
1179 # only care about the stuff we'll actually print out
1180 # and to_string on entire frame may be expensive
1181 d = self
1182
        if max_rows is not None:
            # min of two, where one may be None
            d = d.iloc[: min(max_rows, len(d))]
        else:
            # max_rows is None (unlimited rows): skip the width check entirely
            return True
1188
1189 d.to_string(buf=buf)
1190 value = buf.getvalue()
1191 repr_width = max(len(line) for line in value.split("\n"))
1192
1193 return repr_width < width
1194
1195 def _info_repr(self) -> bool:
1196 """
1197 True if the repr should show the info view.
1198 """
1199 info_repr_option = get_option("display.large_repr") == "info"
1200 return info_repr_option and not (
1201 self._repr_fits_horizontal_() and self._repr_fits_vertical_()
1202 )
1203
1204 def __repr__(self) -> str:
1205 """
1206 Return a string representation for a particular DataFrame.
1207 """
1208 if self._info_repr():
1209 buf = StringIO()
1210 self.info(buf=buf)
1211 return buf.getvalue()
1212
1213 repr_params = fmt.get_dataframe_repr_params()
1214 return self.to_string(**repr_params)
1215
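    # Illustration of the info-view fallback (illustrative, not a doctest): with
    # ``pd.set_option("display.large_repr", "info")``, a frame whose repr does not
    # fit ``display.max_rows``/``display.max_columns`` is rendered through
    # ``DataFrame.info()`` instead of as a truncated table.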
1216 def _repr_html_(self) -> str | None:
1217 """
1218 Return a html representation for a particular DataFrame.
1219
1220 Mainly for IPython notebook.
1221 """
1222 if self._info_repr():
1223 buf = StringIO()
1224 self.info(buf=buf)
1225 # need to escape the <class>, should be the first line.
            val = buf.getvalue().replace("<", r"&lt;", 1)
            val = val.replace(">", r"&gt;", 1)
1228 return f"<pre>{val}</pre>"
1229
1230 if get_option("display.notebook_repr_html"):
1231 max_rows = get_option("display.max_rows")
1232 min_rows = get_option("display.min_rows")
1233 max_cols = get_option("display.max_columns")
1234 show_dimensions = get_option("display.show_dimensions")
1235
1236 formatter = fmt.DataFrameFormatter(
1237 self,
1238 columns=None,
1239 col_space=None,
1240 na_rep="NaN",
1241 formatters=None,
1242 float_format=None,
1243 sparsify=None,
1244 justify=None,
1245 index_names=True,
1246 header=True,
1247 index=True,
1248 bold_rows=True,
1249 escape=True,
1250 max_rows=max_rows,
1251 min_rows=min_rows,
1252 max_cols=max_cols,
1253 show_dimensions=show_dimensions,
1254 decimal=".",
1255 )
1256 return fmt.DataFrameRenderer(formatter).to_html(notebook=True)
1257 else:
1258 return None
1259
1260 @overload
1261 def to_string(
1262 self,
1263 buf: None = ...,
1264 columns: Axes | None = ...,
1265 col_space: int | list[int] | dict[Hashable, int] | None = ...,
1266 header: bool | SequenceNotStr[str] = ...,
1267 index: bool = ...,
1268 na_rep: str = ...,
1269 formatters: fmt.FormattersType | None = ...,
1270 float_format: fmt.FloatFormatType | None = ...,
1271 sparsify: bool | None = ...,
1272 index_names: bool = ...,
1273 justify: str | None = ...,
1274 max_rows: int | None = ...,
1275 max_cols: int | None = ...,
1276 show_dimensions: bool = ...,
1277 decimal: str = ...,
1278 line_width: int | None = ...,
1279 min_rows: int | None = ...,
1280 max_colwidth: int | None = ...,
1281 encoding: str | None = ...,
1282 ) -> str:
1283 ...
1284
1285 @overload
1286 def to_string(
1287 self,
1288 buf: FilePath | WriteBuffer[str],
1289 columns: Axes | None = ...,
1290 col_space: int | list[int] | dict[Hashable, int] | None = ...,
1291 header: bool | SequenceNotStr[str] = ...,
1292 index: bool = ...,
1293 na_rep: str = ...,
1294 formatters: fmt.FormattersType | None = ...,
1295 float_format: fmt.FloatFormatType | None = ...,
1296 sparsify: bool | None = ...,
1297 index_names: bool = ...,
1298 justify: str | None = ...,
1299 max_rows: int | None = ...,
1300 max_cols: int | None = ...,
1301 show_dimensions: bool = ...,
1302 decimal: str = ...,
1303 line_width: int | None = ...,
1304 min_rows: int | None = ...,
1305 max_colwidth: int | None = ...,
1306 encoding: str | None = ...,
1307 ) -> None:
1308 ...
1309
1310 @deprecate_nonkeyword_arguments(
1311 version="3.0", allowed_args=["self", "buf"], name="to_string"
1312 )
1313 @Substitution(
1314 header_type="bool or list of str",
1315 header="Write out the column names. If a list of columns "
1316 "is given, it is assumed to be aliases for the "
1317 "column names",
1318 col_space_type="int, list or dict of int",
        col_space="The minimum width of each column. If a list of ints is given, "
        "each integer corresponds to one column. If a dict is given, the key "
1321 "references the column, while the value defines the space to use.",
1322 )
1323 @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
1324 def to_string(
1325 self,
1326 buf: FilePath | WriteBuffer[str] | None = None,
1327 columns: Axes | None = None,
1328 col_space: int | list[int] | dict[Hashable, int] | None = None,
1329 header: bool | SequenceNotStr[str] = True,
1330 index: bool = True,
1331 na_rep: str = "NaN",
1332 formatters: fmt.FormattersType | None = None,
1333 float_format: fmt.FloatFormatType | None = None,
1334 sparsify: bool | None = None,
1335 index_names: bool = True,
1336 justify: str | None = None,
1337 max_rows: int | None = None,
1338 max_cols: int | None = None,
1339 show_dimensions: bool = False,
1340 decimal: str = ".",
1341 line_width: int | None = None,
1342 min_rows: int | None = None,
1343 max_colwidth: int | None = None,
1344 encoding: str | None = None,
1345 ) -> str | None:
1346 """
1347 Render a DataFrame to a console-friendly tabular output.
1348 %(shared_params)s
1349 line_width : int, optional
1350 Width to wrap a line in characters.
1351 min_rows : int, optional
1352 The number of rows to display in the console in a truncated repr
1353 (when number of rows is above `max_rows`).
1354 max_colwidth : int, optional
1355 Max width to truncate each column in characters. By default, no limit.
1356 encoding : str, default "utf-8"
1357 Set character encoding.
1358 %(returns)s
1359 See Also
1360 --------
1361 to_html : Convert DataFrame to HTML.
1362
1363 Examples
1364 --------
1365 >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
1366 >>> df = pd.DataFrame(d)
1367 >>> print(df.to_string())
1368 col1 col2
1369 0 1 4
1370 1 2 5
1371 2 3 6
1372 """
1373 from pandas import option_context
1374
1375 with option_context("display.max_colwidth", max_colwidth):
1376 formatter = fmt.DataFrameFormatter(
1377 self,
1378 columns=columns,
1379 col_space=col_space,
1380 na_rep=na_rep,
1381 formatters=formatters,
1382 float_format=float_format,
1383 sparsify=sparsify,
1384 justify=justify,
1385 index_names=index_names,
1386 header=header,
1387 index=index,
1388 min_rows=min_rows,
1389 max_rows=max_rows,
1390 max_cols=max_cols,
1391 show_dimensions=show_dimensions,
1392 decimal=decimal,
1393 )
1394 return fmt.DataFrameRenderer(formatter).to_string(
1395 buf=buf,
1396 encoding=encoding,
1397 line_width=line_width,
1398 )
1399
1400 def _get_values_for_csv(
1401 self,
1402 *,
1403 float_format: FloatFormatType | None,
1404 date_format: str | None,
1405 decimal: str,
1406 na_rep: str,
1407 quoting, # int csv.QUOTE_FOO from stdlib
1408 ) -> Self:
1409 # helper used by to_csv
1410 mgr = self._mgr.get_values_for_csv(
1411 float_format=float_format,
1412 date_format=date_format,
1413 decimal=decimal,
1414 na_rep=na_rep,
1415 quoting=quoting,
1416 )
1417 # error: Incompatible return value type (got "DataFrame", expected "Self")
1418 return self._constructor_from_mgr(mgr, axes=mgr.axes) # type: ignore[return-value]
1419
1420 # ----------------------------------------------------------------------
1421
1422 @property
1423 def style(self) -> Styler:
1424 """
1425 Returns a Styler object.
1426
1427 Contains methods for building a styled HTML representation of the DataFrame.
1428
1429 See Also
1430 --------
1431 io.formats.style.Styler : Helps style a DataFrame or Series according to the
1432 data with HTML and CSS.
1433
1434 Examples
1435 --------
1436 >>> df = pd.DataFrame({'A': [1, 2, 3]})
1437 >>> df.style # doctest: +SKIP
1438
1439 Please see
1440 `Table Visualization <../../user_guide/style.ipynb>`_ for more examples.
1441 """
1442 from pandas.io.formats.style import Styler
1443
1444 return Styler(self)
1445
1446 _shared_docs[
1447 "items"
1448 ] = r"""
1449 Iterate over (column name, Series) pairs.
1450
1451 Iterates over the DataFrame columns, returning a tuple with
1452 the column name and the content as a Series.
1453
1454 Yields
1455 ------
1456 label : object
1457 The column names for the DataFrame being iterated over.
1458 content : Series
1459 The column entries belonging to each label, as a Series.
1460
1461 See Also
1462 --------
1463 DataFrame.iterrows : Iterate over DataFrame rows as
1464 (index, Series) pairs.
1465 DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
1466 of the values.
1467
1468 Examples
1469 --------
1470 >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
1471 ... 'population': [1864, 22000, 80000]},
1472 ... index=['panda', 'polar', 'koala'])
1473 >>> df
1474 species population
1475 panda bear 1864
1476 polar bear 22000
1477 koala marsupial 80000
1478 >>> for label, content in df.items():
1479 ... print(f'label: {label}')
1480 ... print(f'content: {content}', sep='\n')
1481 ...
1482 label: species
1483 content:
1484 panda bear
1485 polar bear
1486 koala marsupial
1487 Name: species, dtype: object
1488 label: population
1489 content:
1490 panda 1864
1491 polar 22000
1492 koala 80000
1493 Name: population, dtype: int64
1494 """
1495
1496 @Appender(_shared_docs["items"])
1497 def items(self) -> Iterable[tuple[Hashable, Series]]:
1498 if self.columns.is_unique and hasattr(self, "_item_cache"):
1499 for k in self.columns:
1500 yield k, self._get_item_cache(k)
1501 else:
1502 for i, k in enumerate(self.columns):
1503 yield k, self._ixs(i, axis=1)
1504
1505 def iterrows(self) -> Iterable[tuple[Hashable, Series]]:
1506 """
1507 Iterate over DataFrame rows as (index, Series) pairs.
1508
1509 Yields
1510 ------
1511 index : label or tuple of label
1512 The index of the row. A tuple for a `MultiIndex`.
1513 data : Series
1514 The data of the row as a Series.
1515
1516 See Also
1517 --------
1518 DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values.
1519 DataFrame.items : Iterate over (column name, Series) pairs.
1520
1521 Notes
1522 -----
1523 1. Because ``iterrows`` returns a Series for each row,
1524 it does **not** preserve dtypes across the rows (dtypes are
1525 preserved across columns for DataFrames).
1526
1527 To preserve dtypes while iterating over the rows, it is better
1528 to use :meth:`itertuples` which returns namedtuples of the values
1529 and which is generally faster than ``iterrows``.
1530
1531 2. You should **never modify** something you are iterating over.
1532 This is not guaranteed to work in all cases. Depending on the
1533 data types, the iterator returns a copy and not a view, and writing
1534 to it will have no effect.
1535
1536 Examples
1537 --------
1538
1539 >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
1540 >>> row = next(df.iterrows())[1]
1541 >>> row
1542 int 1.0
1543 float 1.5
1544 Name: 0, dtype: float64
1545 >>> print(row['int'].dtype)
1546 float64
1547 >>> print(df['int'].dtype)
1548 int64
1549 """
1550 columns = self.columns
1551 klass = self._constructor_sliced
1552 using_cow = using_copy_on_write()
1553 for k, v in zip(self.index, self.values):
1554 s = klass(v, index=columns, name=k).__finalize__(self)
1555 if using_cow and self._mgr.is_single_block:
1556 s._mgr.add_references(self._mgr) # type: ignore[arg-type]
1557 yield k, s
1558
1559 def itertuples(
1560 self, index: bool = True, name: str | None = "Pandas"
1561 ) -> Iterable[tuple[Any, ...]]:
1562 """
1563 Iterate over DataFrame rows as namedtuples.
1564
1565 Parameters
1566 ----------
1567 index : bool, default True
1568 If True, return the index as the first element of the tuple.
1569 name : str or None, default "Pandas"
1570 The name of the returned namedtuples or None to return regular
1571 tuples.
1572
1573 Returns
1574 -------
1575 iterator
1576 An object to iterate over namedtuples for each row in the
1577 DataFrame with the first field possibly being the index and
1578 following fields being the column values.
1579
1580 See Also
1581 --------
1582 DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
1583 pairs.
1584 DataFrame.items : Iterate over (column name, Series) pairs.
1585
1586 Notes
1587 -----
1588 The column names will be renamed to positional names if they are
1589 invalid Python identifiers, repeated, or start with an underscore.
1590
1591 Examples
1592 --------
1593 >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
1594 ... index=['dog', 'hawk'])
1595 >>> df
1596 num_legs num_wings
1597 dog 4 0
1598 hawk 2 2
1599 >>> for row in df.itertuples():
1600 ... print(row)
1601 ...
1602 Pandas(Index='dog', num_legs=4, num_wings=0)
1603 Pandas(Index='hawk', num_legs=2, num_wings=2)
1604
1605 By setting the `index` parameter to False we can remove the index
1606 as the first element of the tuple:
1607
1608 >>> for row in df.itertuples(index=False):
1609 ... print(row)
1610 ...
1611 Pandas(num_legs=4, num_wings=0)
1612 Pandas(num_legs=2, num_wings=2)
1613
1614 With the `name` parameter set we set a custom name for the yielded
1615 namedtuples:
1616
1617 >>> for row in df.itertuples(name='Animal'):
1618 ... print(row)
1619 ...
1620 Animal(Index='dog', num_legs=4, num_wings=0)
1621 Animal(Index='hawk', num_legs=2, num_wings=2)
1622 """
1623 arrays = []
1624 fields = list(self.columns)
1625 if index:
1626 arrays.append(self.index)
1627 fields.insert(0, "Index")
1628
1629 # use integer indexing because of possible duplicate column names
1630 arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
1631
1632 if name is not None:
1633 # https://github.com/python/mypy/issues/9046
1634 # error: namedtuple() expects a string literal as the first argument
1635 itertuple = collections.namedtuple( # type: ignore[misc]
1636 name, fields, rename=True
1637 )
1638 return map(itertuple._make, zip(*arrays))
1639
1640 # fallback to regular tuples
1641 return zip(*arrays)
1642
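    # Field renaming in action (illustrative): invalid or duplicate column names
    # fall back to positional names because ``rename=True`` is passed to
    # ``collections.namedtuple`` above:
    #
    #     df = pd.DataFrame([[1, 2]], columns=["a", "a"])
    #     next(df.itertuples())  # Pandas(Index=0, a=1, _2=2)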
1643 def __len__(self) -> int:
1644 """
1645 Returns length of info axis, but here we use the index.
1646 """
1647 return len(self.index)
1648
1649 @overload
1650 def dot(self, other: Series) -> Series:
1651 ...
1652
1653 @overload
1654 def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame:
1655 ...
1656
1657 def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
1658 """
1659 Compute the matrix multiplication between the DataFrame and other.
1660
1661 This method computes the matrix product between the DataFrame and the
        values of another Series, DataFrame or a numpy array.
1663
1664 It can also be called using ``self @ other``.
1665
1666 Parameters
1667 ----------
1668 other : Series, DataFrame or array-like
1669 The other object to compute the matrix product with.
1670
1671 Returns
1672 -------
1673 Series or DataFrame
1674 If other is a Series, return the matrix product between self and
1675 other as a Series. If other is a DataFrame or a numpy.array, return
            the matrix product of self and other as a DataFrame.
1677
1678 See Also
1679 --------
1680 Series.dot: Similar method for Series.
1681
1682 Notes
1683 -----
1684 The dimensions of DataFrame and other must be compatible in order to
1685 compute the matrix multiplication. In addition, the column names of
1686 DataFrame and the index of other must contain the same values, as they
1687 will be aligned prior to the multiplication.
1688
1689 The dot method for Series computes the inner product, instead of the
1690 matrix product here.
1691
1692 Examples
1693 --------
1694 Here we multiply a DataFrame with a Series.
1695
1696 >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
1697 >>> s = pd.Series([1, 1, 2, 1])
1698 >>> df.dot(s)
1699 0 -4
1700 1 5
1701 dtype: int64
1702
1703 Here we multiply a DataFrame with another DataFrame.
1704
1705 >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
1706 >>> df.dot(other)
1707 0 1
1708 0 1 4
1709 1 2 2
1710
        Note that the dot method gives the same result as @
1712
1713 >>> df @ other
1714 0 1
1715 0 1 4
1716 1 2 2
1717
        The dot method also works if other is a np.array.
1719
1720 >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
1721 >>> df.dot(arr)
1722 0 1
1723 0 1 4
1724 1 2 2
1725
1726 Note how shuffling of the objects does not change the result.
1727
1728 >>> s2 = s.reindex([1, 0, 2, 3])
1729 >>> df.dot(s2)
1730 0 -4
1731 1 5
1732 dtype: int64
1733 """
1734 if isinstance(other, (Series, DataFrame)):
1735 common = self.columns.union(other.index)
1736 if len(common) > len(self.columns) or len(common) > len(other.index):
1737 raise ValueError("matrices are not aligned")
1738
1739 left = self.reindex(columns=common, copy=False)
1740 right = other.reindex(index=common, copy=False)
1741 lvals = left.values
1742 rvals = right._values
1743 else:
1744 left = self
1745 lvals = self.values
1746 rvals = np.asarray(other)
1747 if lvals.shape[1] != rvals.shape[0]:
1748 raise ValueError(
1749 f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}"
1750 )
1751
1752 if isinstance(other, DataFrame):
1753 common_type = find_common_type(list(self.dtypes) + list(other.dtypes))
1754 return self._constructor(
1755 np.dot(lvals, rvals),
1756 index=left.index,
1757 columns=other.columns,
1758 copy=False,
1759 dtype=common_type,
1760 )
1761 elif isinstance(other, Series):
1762 common_type = find_common_type(list(self.dtypes) + [other.dtypes])
1763 return self._constructor_sliced(
1764 np.dot(lvals, rvals), index=left.index, copy=False, dtype=common_type
1765 )
1766 elif isinstance(rvals, (np.ndarray, Index)):
1767 result = np.dot(lvals, rvals)
1768 if result.ndim == 2:
1769 return self._constructor(result, index=left.index, copy=False)
1770 else:
1771 return self._constructor_sliced(result, index=left.index, copy=False)
1772 else: # pragma: no cover
1773 raise TypeError(f"unsupported type: {type(other)}")
1774
1775 @overload
1776 def __matmul__(self, other: Series) -> Series:
1777 ...
1778
1779 @overload
1780 def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
1781 ...
1782
1783 def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
1784 """
1785 Matrix multiplication using binary `@` operator.
1786 """
1787 return self.dot(other)
1788
1789 def __rmatmul__(self, other) -> DataFrame:
1790 """
1791 Matrix multiplication using binary `@` operator.
1792 """
1793 try:
1794 return self.T.dot(np.transpose(other)).T
1795 except ValueError as err:
1796 if "shape mismatch" not in str(err):
1797 raise
1798 # GH#21581 give exception message for original shapes
1799 msg = f"shapes {np.shape(other)} and {self.shape} not aligned"
1800 raise ValueError(msg) from err
1801
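    # The operator form (illustrative):
    #
    #     df = pd.DataFrame([[0, 1], [1, 2]])
    #     arr = np.array([[1, 0], [0, 1]])
    #     df @ arr   # same as df.dot(arr), returns a DataFrame
    #     arr @ df   # handled by __rmatmul__, also returns a DataFrame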
1802 # ----------------------------------------------------------------------
1803 # IO methods (to / from other formats)
1804
1805 @classmethod
1806 def from_dict(
1807 cls,
1808 data: dict,
1809 orient: FromDictOrient = "columns",
1810 dtype: Dtype | None = None,
1811 columns: Axes | None = None,
1812 ) -> DataFrame:
1813 """
1814 Construct DataFrame from dict of array-like or dicts.
1815
1816 Creates DataFrame object from dictionary by columns or by index
1817 allowing dtype specification.
1818
1819 Parameters
1820 ----------
1821 data : dict
1822 Of the form {field : array-like} or {field : dict}.
1823 orient : {'columns', 'index', 'tight'}, default 'columns'
1824 The "orientation" of the data. If the keys of the passed dict
1825 should be the columns of the resulting DataFrame, pass 'columns'
1826 (default). Otherwise if the keys should be rows, pass 'index'.
1827 If 'tight', assume a dict with keys ['index', 'columns', 'data',
1828 'index_names', 'column_names'].
1829
1830 .. versionadded:: 1.4.0
1831 'tight' as an allowed value for the ``orient`` argument
1832
1833 dtype : dtype, default None
1834 Data type to force after DataFrame construction, otherwise infer.
1835 columns : list, default None
1836 Column labels to use when ``orient='index'``. Raises a ValueError
1837 if used with ``orient='columns'`` or ``orient='tight'``.
1838
1839 Returns
1840 -------
1841 DataFrame
1842
1843 See Also
1844 --------
1845 DataFrame.from_records : DataFrame from structured ndarray, sequence
1846 of tuples or dicts, or DataFrame.
1847 DataFrame : DataFrame object creation using constructor.
1848 DataFrame.to_dict : Convert the DataFrame to a dictionary.
1849
1850 Examples
1851 --------
1852 By default the keys of the dict become the DataFrame columns:
1853
1854 >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
1855 >>> pd.DataFrame.from_dict(data)
1856 col_1 col_2
1857 0 3 a
1858 1 2 b
1859 2 1 c
1860 3 0 d
1861
1862 Specify ``orient='index'`` to create the DataFrame using dictionary
1863 keys as rows:
1864
1865 >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
1866 >>> pd.DataFrame.from_dict(data, orient='index')
1867 0 1 2 3
1868 row_1 3 2 1 0
1869 row_2 a b c d
1870
1871 When using the 'index' orientation, the column names can be
1872 specified manually:
1873
1874 >>> pd.DataFrame.from_dict(data, orient='index',
1875 ... columns=['A', 'B', 'C', 'D'])
1876 A B C D
1877 row_1 3 2 1 0
1878 row_2 a b c d
1879
1880 Specify ``orient='tight'`` to create the DataFrame using a 'tight'
1881 format:
1882
1883 >>> data = {'index': [('a', 'b'), ('a', 'c')],
1884 ... 'columns': [('x', 1), ('y', 2)],
1885 ... 'data': [[1, 3], [2, 4]],
1886 ... 'index_names': ['n1', 'n2'],
1887 ... 'column_names': ['z1', 'z2']}
1888 >>> pd.DataFrame.from_dict(data, orient='tight')
1889 z1 x y
1890 z2 1 2
1891 n1 n2
1892 a b 1 3
1893 c 2 4
1894 """
1895 index = None
1896 orient = orient.lower() # type: ignore[assignment]
1897 if orient == "index":
1898 if len(data) > 0:
1899 # TODO speed up Series case
1900 if isinstance(next(iter(data.values())), (Series, dict)):
1901 data = _from_nested_dict(data)
1902 else:
1903 index = list(data.keys())
1904 # error: Incompatible types in assignment (expression has type
1905 # "List[Any]", variable has type "Dict[Any, Any]")
1906 data = list(data.values()) # type: ignore[assignment]
1907 elif orient in ("columns", "tight"):
1908 if columns is not None:
1909 raise ValueError(f"cannot use columns parameter with orient='{orient}'")
1910 else: # pragma: no cover
1911 raise ValueError(
1912 f"Expected 'index', 'columns' or 'tight' for orient parameter. "
1913 f"Got '{orient}' instead"
1914 )
1915
1916 if orient != "tight":
1917 return cls(data, index=index, columns=columns, dtype=dtype)
1918 else:
1919 realdata = data["data"]
1920
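            # For 'tight' input, rebuild the row and column axes from the
            # stored label tuples and level names (a MultiIndex whenever more
            # than one level name is given), then construct from the raw data.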
1921 def create_index(indexlist, namelist):
1922 index: Index
1923 if len(namelist) > 1:
1924 index = MultiIndex.from_tuples(indexlist, names=namelist)
1925 else:
1926 index = Index(indexlist, name=namelist[0])
1927 return index
1928
1929 index = create_index(data["index"], data["index_names"])
1930 columns = create_index(data["columns"], data["column_names"])
1931 return cls(realdata, index=index, columns=columns, dtype=dtype)
1932
1933 def to_numpy(
1934 self,
1935 dtype: npt.DTypeLike | None = None,
1936 copy: bool = False,
1937 na_value: object = lib.no_default,
1938 ) -> np.ndarray:
1939 """
1940 Convert the DataFrame to a NumPy array.
1941
1942 By default, the dtype of the returned array will be the common NumPy
1943 dtype of all types in the DataFrame. For example, if the dtypes are
        ``float16`` and ``float32``, the resulting dtype will be ``float32``.
1945 This may require copying data and coercing values, which may be
1946 expensive.
1947
1948 Parameters
1949 ----------
1950 dtype : str or numpy.dtype, optional
1951 The dtype to pass to :meth:`numpy.asarray`.
1952 copy : bool, default False
1953 Whether to ensure that the returned value is not a view on
1954 another array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
1956 a copy is made, even if not strictly necessary.
1957 na_value : Any, optional
1958 The value to use for missing values. The default value depends
1959 on `dtype` and the dtypes of the DataFrame columns.
1960
1961 Returns
1962 -------
1963 numpy.ndarray
1964
1965 See Also
1966 --------
1967 Series.to_numpy : Similar method for Series.
1968
1969 Examples
1970 --------
1971 >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
1972 array([[1, 3],
1973 [2, 4]])
1974
1975 With heterogeneous data, the lowest common type will have to
1976 be used.
1977
1978 >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
1979 >>> df.to_numpy()
1980 array([[1. , 3. ],
1981 [2. , 4.5]])
1982
1983 For a mix of numeric and non-numeric types, the output array will
1984 have object dtype.
1985
1986 >>> df['C'] = pd.date_range('2000', periods=2)
1987 >>> df.to_numpy()
1988 array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
1989 [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
1990 """
1991 if dtype is not None:
1992 dtype = np.dtype(dtype)
1993 result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value)
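        # ``as_array`` may return a dtype object that is equal but not
        # identical to the request; ``np.asarray`` is a cheap no-op when the
        # dtype already matches and performs the conversion otherwise.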
1994 if result.dtype is not dtype:
1995 result = np.asarray(result, dtype=dtype)
1996
1997 return result
1998
1999 def _create_data_for_split_and_tight_to_dict(
2000 self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int]
2001 ) -> list:
2002 """
        Simple helper method to create the main output data for
        ``to_dict(orient="split")`` and ``to_dict(orient="tight")``.
2005 """
2006 if are_all_object_dtype_cols:
2007 data = [
2008 list(map(maybe_box_native, t))
2009 for t in self.itertuples(index=False, name=None)
2010 ]
2011 else:
2012 data = [list(t) for t in self.itertuples(index=False, name=None)]
2013 if object_dtype_indices:
                # If we have object_dtype_cols, apply maybe_box_native after list
2015 # comprehension for perf
2016 for row in data:
2017 for i in object_dtype_indices:
2018 row[i] = maybe_box_native(row[i])
2019 return data
2020
2021 @overload
2022 def to_dict(
2023 self,
2024 orient: Literal["dict", "list", "series", "split", "tight", "index"] = ...,
2025 *,
2026 into: type[MutableMappingT] | MutableMappingT,
2027 index: bool = ...,
2028 ) -> MutableMappingT:
2029 ...
2030
2031 @overload
2032 def to_dict(
2033 self,
2034 orient: Literal["records"],
2035 *,
2036 into: type[MutableMappingT] | MutableMappingT,
2037 index: bool = ...,
2038 ) -> list[MutableMappingT]:
2039 ...
2040
2041 @overload
2042 def to_dict(
2043 self,
2044 orient: Literal["dict", "list", "series", "split", "tight", "index"] = ...,
2045 *,
2046 into: type[dict] = ...,
2047 index: bool = ...,
2048 ) -> dict:
2049 ...
2050
2051 @overload
2052 def to_dict(
2053 self,
2054 orient: Literal["records"],
2055 *,
2056 into: type[dict] = ...,
2057 index: bool = ...,
2058 ) -> list[dict]:
2059 ...
2060
2061 # error: Incompatible default for argument "into" (default has type "type
2062 # [dict[Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT")
2063 @deprecate_nonkeyword_arguments(
2064 version="3.0", allowed_args=["self", "orient"], name="to_dict"
2065 )
2066 def to_dict(
2067 self,
2068 orient: Literal[
2069 "dict", "list", "series", "split", "tight", "records", "index"
2070 ] = "dict",
2071 into: type[MutableMappingT]
2072 | MutableMappingT = dict, # type: ignore[assignment]
2073 index: bool = True,
2074 ) -> MutableMappingT | list[MutableMappingT]:
2075 """
2076 Convert the DataFrame to a dictionary.
2077
2078 The type of the key-value pairs can be customized with the parameters
2079 (see below).
2080
2081 Parameters
2082 ----------
2083 orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'}
2084 Determines the type of the values of the dictionary.
2085
2086 - 'dict' (default) : dict like {column -> {index -> value}}
2087 - 'list' : dict like {column -> [values]}
2088 - 'series' : dict like {column -> Series(values)}
2089 - 'split' : dict like
2090 {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
2091 - 'tight' : dict like
2092 {'index' -> [index], 'columns' -> [columns], 'data' -> [values],
2093 'index_names' -> [index.names], 'column_names' -> [column.names]}
2094 - 'records' : list like
2095 [{column -> value}, ... , {column -> value}]
2096 - 'index' : dict like {index -> {column -> value}}
2097
2098 .. versionadded:: 1.4.0
2099 'tight' as an allowed value for the ``orient`` argument
2100
2101 into : class, default dict
2102 The collections.abc.MutableMapping subclass used for all Mappings
2103 in the return value. Can be the actual class or an empty
2104 instance of the mapping type you want. If you want a
2105 collections.defaultdict, you must pass it initialized.
2106
2107 index : bool, default True
2108 Whether to include the index item (and index_names item if `orient`
2109 is 'tight') in the returned dictionary. Can only be ``False``
2110 when `orient` is 'split' or 'tight'.
2111
2112 .. versionadded:: 2.0.0
2113
2114 Returns
2115 -------
2116 dict, list or collections.abc.MutableMapping
2117 Return a collections.abc.MutableMapping object representing the
2118 DataFrame. The resulting transformation depends on the `orient`
2119 parameter.
2120
2121 See Also
2122 --------
2123 DataFrame.from_dict: Create a DataFrame from a dictionary.
2124 DataFrame.to_json: Convert a DataFrame to JSON format.
2125
2126 Examples
2127 --------
2128 >>> df = pd.DataFrame({'col1': [1, 2],
2129 ... 'col2': [0.5, 0.75]},
2130 ... index=['row1', 'row2'])
2131 >>> df
2132 col1 col2
2133 row1 1 0.50
2134 row2 2 0.75
2135 >>> df.to_dict()
2136 {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
2137
2138 You can specify the return orientation.
2139
2140 >>> df.to_dict('series')
2141 {'col1': row1 1
2142 row2 2
2143 Name: col1, dtype: int64,
2144 'col2': row1 0.50
2145 row2 0.75
2146 Name: col2, dtype: float64}
2147
2148 >>> df.to_dict('split')
2149 {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
2150 'data': [[1, 0.5], [2, 0.75]]}
2151
2152 >>> df.to_dict('records')
2153 [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
2154
2155 >>> df.to_dict('index')
2156 {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
2157
2158 >>> df.to_dict('tight')
2159 {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
2160 'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]}
2161
2162 You can also specify the mapping type.
2163
2164 >>> from collections import OrderedDict, defaultdict
2165 >>> df.to_dict(into=OrderedDict)
2166 OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
2167 ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
2168
2169 If you want a `defaultdict`, you need to initialize it:
2170
2171 >>> dd = defaultdict(list)
2172 >>> df.to_dict('records', into=dd)
2173 [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
2174 defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
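
        With ``orient='split'`` or ``orient='tight'``, ``index=False`` drops
        the index information (illustrative):

        >>> df.to_dict('split', index=False)
        {'columns': ['col1', 'col2'], 'data': [[1, 0.5], [2, 0.75]]}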
2175 """
2176 from pandas.core.methods.to_dict import to_dict
2177
2178 return to_dict(self, orient, into=into, index=index)
2179
2180 @deprecate_nonkeyword_arguments(
2181 version="3.0", allowed_args=["self", "destination_table"], name="to_gbq"
2182 )
2183 def to_gbq(
2184 self,
2185 destination_table: str,
2186 project_id: str | None = None,
2187 chunksize: int | None = None,
2188 reauth: bool = False,
2189 if_exists: ToGbqIfexist = "fail",
2190 auth_local_webserver: bool = True,
2191 table_schema: list[dict[str, str]] | None = None,
2192 location: str | None = None,
2193 progress_bar: bool = True,
2194 credentials=None,
2195 ) -> None:
2196 """
2197 Write a DataFrame to a Google BigQuery table.
2198
2199 .. deprecated:: 2.2.0
2200
2201 Please use ``pandas_gbq.to_gbq`` instead.
2202
2203 This function requires the `pandas-gbq package
2204 <https://pandas-gbq.readthedocs.io>`__.
2205
2206 See the `How to authenticate with Google BigQuery
2207 <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
2208 guide for authentication instructions.
2209
2210 Parameters
2211 ----------
2212 destination_table : str
2213 Name of table to be written, in the form ``dataset.tablename``.
2214 project_id : str, optional
2215 Google BigQuery Account project ID. Optional when available from
2216 the environment.
2217 chunksize : int, optional
2218 Number of rows to be inserted in each chunk from the dataframe.
2219 Set to ``None`` to load the whole dataframe at once.
2220 reauth : bool, default False
2221 Force Google BigQuery to re-authenticate the user. This is useful
2222 if multiple accounts are used.
2223 if_exists : str, default 'fail'
2224 Behavior when the destination table exists. Value can be one of:
2225
2226 ``'fail'``
2227 If table exists raise pandas_gbq.gbq.TableCreationError.
2228 ``'replace'``
2229 If table exists, drop it, recreate it, and insert data.
2230 ``'append'``
2231 If table exists, insert data. Create if does not exist.
2232 auth_local_webserver : bool, default True
2233 Use the `local webserver flow`_ instead of the `console flow`_
2234 when getting user credentials.
2235
2236 .. _local webserver flow:
2237 https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
2238 .. _console flow:
2239 https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
2240
2241 *New in version 0.2.0 of pandas-gbq*.
2242
2243 .. versionchanged:: 1.5.0
2244 Default value is changed to ``True``. Google has deprecated the
2245 ``auth_local_webserver = False`` `"out of band" (copy-paste)
2246 flow
2247 <https://developers.googleblog.com/2022/02/making-oauth-flows-safer.html?m=1#disallowed-oob>`_.
2248 table_schema : list of dicts, optional
            List of BigQuery table fields to which the DataFrame
            columns conform, e.g. ``[{'name': 'col1', 'type':
2251 'STRING'},...]``. If schema is not provided, it will be
2252 generated according to dtypes of DataFrame columns. See
2253 BigQuery API documentation on available names of a field.
2254
2255 *New in version 0.3.1 of pandas-gbq*.
2256 location : str, optional
2257 Location where the load job should run. See the `BigQuery locations
2258 documentation
2259 <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
2260 list of available locations. The location must match that of the
2261 target dataset.
2262
2263 *New in version 0.5.0 of pandas-gbq*.
2264 progress_bar : bool, default True
2265 Use the library `tqdm` to show the progress bar for the upload,
2266 chunk by chunk.
2267
2268 *New in version 0.5.0 of pandas-gbq*.
2269 credentials : google.auth.credentials.Credentials, optional
2270 Credentials for accessing Google APIs. Use this parameter to
2271 override default credentials, such as to use Compute Engine
2272 :class:`google.auth.compute_engine.Credentials` or Service
2273 Account :class:`google.oauth2.service_account.Credentials`
2274 directly.
2275
2276 *New in version 0.8.0 of pandas-gbq*.
2277
2278 See Also
2279 --------
2280 pandas_gbq.to_gbq : This function in the pandas-gbq library.
2281 read_gbq : Read a DataFrame from Google BigQuery.
2282
2283 Examples
2284 --------
2285 Example taken from `Google BigQuery documentation
2286 <https://cloud.google.com/bigquery/docs/samples/bigquery-pandas-gbq-to-gbq-simple>`_
2287
2288 >>> project_id = "my-project"
2289 >>> table_id = 'my_dataset.my_table'
2290 >>> df = pd.DataFrame({
2291 ... "my_string": ["a", "b", "c"],
2292 ... "my_int64": [1, 2, 3],
2293 ... "my_float64": [4.0, 5.0, 6.0],
2294 ... "my_bool1": [True, False, True],
2295 ... "my_bool2": [False, True, False],
2296 ... "my_dates": pd.date_range("now", periods=3),
2297 ... }
2298 ... )
2299
2300 >>> df.to_gbq(table_id, project_id=project_id) # doctest: +SKIP
2301 """
2302 from pandas.io import gbq
2303
2304 gbq.to_gbq(
2305 self,
2306 destination_table,
2307 project_id=project_id,
2308 chunksize=chunksize,
2309 reauth=reauth,
2310 if_exists=if_exists,
2311 auth_local_webserver=auth_local_webserver,
2312 table_schema=table_schema,
2313 location=location,
2314 progress_bar=progress_bar,
2315 credentials=credentials,
2316 )
2317
2318 @classmethod
2319 def from_records(
2320 cls,
2321 data,
2322 index=None,
2323 exclude=None,
2324 columns=None,
2325 coerce_float: bool = False,
2326 nrows: int | None = None,
2327 ) -> DataFrame:
2328 """
2329 Convert structured or record ndarray to DataFrame.
2330
2331 Creates a DataFrame object from a structured ndarray, sequence of
2332 tuples or dicts, or DataFrame.
2333
2334 Parameters
2335 ----------
2336 data : structured ndarray, sequence of tuples or dicts, or DataFrame
2337 Structured input data.
2338
2339 .. deprecated:: 2.1.0
2340 Passing a DataFrame is deprecated.
2341 index : str, list of fields, array-like
2342 Field of array to use as the index, alternately a specific set of
2343 input labels to use.
2344 exclude : sequence, default None
2345 Columns or fields to exclude.
2346 columns : sequence, default None
2347 Column names to use. If the passed data do not have names
2348 associated with them, this argument provides names for the
2349 columns. Otherwise this argument indicates the order of the columns
2350 in the result (any names not found in the data will become all-NA
2351 columns).
2352 coerce_float : bool, default False
2353 Attempt to convert values of non-string, non-numeric objects (like
2354 decimal.Decimal) to floating point, useful for SQL result sets.
2355 nrows : int, default None
2356 Number of rows to read if data is an iterator.
2357
2358 Returns
2359 -------
2360 DataFrame
2361
2362 See Also
2363 --------
2364 DataFrame.from_dict : DataFrame from dict of array-like or dicts.
2365 DataFrame : DataFrame object creation using constructor.
2366
2367 Examples
2368 --------
2369 Data can be provided as a structured ndarray:
2370
2371 >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')],
2372 ... dtype=[('col_1', 'i4'), ('col_2', 'U1')])
2373 >>> pd.DataFrame.from_records(data)
2374 col_1 col_2
2375 0 3 a
2376 1 2 b
2377 2 1 c
2378 3 0 d
2379
2380 Data can be provided as a list of dicts:
2381
2382 >>> data = [{'col_1': 3, 'col_2': 'a'},
2383 ... {'col_1': 2, 'col_2': 'b'},
2384 ... {'col_1': 1, 'col_2': 'c'},
2385 ... {'col_1': 0, 'col_2': 'd'}]
2386 >>> pd.DataFrame.from_records(data)
2387 col_1 col_2
2388 0 3 a
2389 1 2 b
2390 2 1 c
2391 3 0 d
2392
2393 Data can be provided as a list of tuples with corresponding columns:
2394
2395 >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')]
2396 >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2'])
2397 col_1 col_2
2398 0 3 a
2399 1 2 b
2400 2 1 c
2401 3 0 d
2402 """
2403 if isinstance(data, DataFrame):
2404 warnings.warn(
2405 "Passing a DataFrame to DataFrame.from_records is deprecated. Use "
2406 "set_index and/or drop to modify the DataFrame instead.",
2407 FutureWarning,
2408 stacklevel=find_stack_level(),
2409 )
2410 if columns is not None:
2411 if is_scalar(columns):
2412 columns = [columns]
2413 data = data[columns]
2414 if index is not None:
2415 data = data.set_index(index)
2416 if exclude is not None:
2417 data = data.drop(columns=exclude)
2418 return data.copy(deep=False)
2419
2420 result_index = None
2421
2422 # Make a copy of the input columns so we can modify it
2423 if columns is not None:
2424 columns = ensure_index(columns)
2425
2426 def maybe_reorder(
2427 arrays: list[ArrayLike], arr_columns: Index, columns: Index, index
2428 ) -> tuple[list[ArrayLike], Index, Index | None]:
2429 """
2430 If our desired 'columns' do not match the data's pre-existing 'arr_columns',
2431 we re-order our arrays. This is like a pre-emptive (cheap) reindex.
2432 """
2433 if len(arrays):
2434 length = len(arrays[0])
2435 else:
2436 length = 0
2437
2438 result_index = None
2439 if len(arrays) == 0 and index is None and length == 0:
2440 result_index = default_index(0)
2441
2442 arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length)
2443 return arrays, arr_columns, result_index
2444
2445 if is_iterator(data):
2446 if nrows == 0:
2447 return cls()
2448
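            # Materialize the first record to sniff a structured dtype, then
            # pull the remaining records (all of them, or at most
            # ``nrows - 1`` more when ``nrows`` is given) from the iterator.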
2449 try:
2450 first_row = next(data)
2451 except StopIteration:
2452 return cls(index=index, columns=columns)
2453
2454 dtype = None
2455 if hasattr(first_row, "dtype") and first_row.dtype.names:
2456 dtype = first_row.dtype
2457
2458 values = [first_row]
2459
2460 if nrows is None:
2461 values += data
2462 else:
2463 values.extend(itertools.islice(data, nrows - 1))
2464
2465 if dtype is not None:
2466 data = np.array(values, dtype=dtype)
2467 else:
2468 data = values
2469
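        # Normalize the accepted input types (dict of arrays, structured
        # ndarray, or sequence of rows) into a list of column arrays plus a
        # matching Index of column labels.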
2470 if isinstance(data, dict):
2471 if columns is None:
2472 columns = arr_columns = ensure_index(sorted(data))
2473 arrays = [data[k] for k in columns]
2474 else:
2475 arrays = []
2476 arr_columns_list = []
2477 for k, v in data.items():
2478 if k in columns:
2479 arr_columns_list.append(k)
2480 arrays.append(v)
2481
2482 arr_columns = Index(arr_columns_list)
2483 arrays, arr_columns, result_index = maybe_reorder(
2484 arrays, arr_columns, columns, index
2485 )
2486
2487 elif isinstance(data, np.ndarray):
2488 arrays, columns = to_arrays(data, columns)
2489 arr_columns = columns
2490 else:
2491 arrays, arr_columns = to_arrays(data, columns)
2492 if coerce_float:
2493 for i, arr in enumerate(arrays):
2494 if arr.dtype == object:
2495 # error: Argument 1 to "maybe_convert_objects" has
2496 # incompatible type "Union[ExtensionArray, ndarray]";
2497 # expected "ndarray"
2498 arrays[i] = lib.maybe_convert_objects(
2499 arr, # type: ignore[arg-type]
2500 try_float=True,
2501 )
2502
2503 arr_columns = ensure_index(arr_columns)
2504 if columns is None:
2505 columns = arr_columns
2506 else:
2507 arrays, arr_columns, result_index = maybe_reorder(
2508 arrays, arr_columns, columns, index
2509 )
2510
2511 if exclude is None:
2512 exclude = set()
2513 else:
2514 exclude = set(exclude)
2515
2516 if index is not None:
2517 if isinstance(index, str) or not hasattr(index, "__iter__"):
2518 i = columns.get_loc(index)
2519 exclude.add(index)
2520 if len(arrays) > 0:
2521 result_index = Index(arrays[i], name=index)
2522 else:
2523 result_index = Index([], name=index)
2524 else:
2525 try:
2526 index_data = [arrays[arr_columns.get_loc(field)] for field in index]
2527 except (KeyError, TypeError):
2528 # raised by get_loc, see GH#29258
2529 result_index = index
2530 else:
2531 result_index = ensure_index_from_sequences(index_data, names=index)
2532 exclude.update(index)
2533
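        # Drop excluded labels (including any fields consumed as the index)
        # from both the collected arrays and the resulting columns.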
2534 if any(exclude):
2535 arr_exclude = [x for x in exclude if x in arr_columns]
2536 to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
2537 arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
2538
2539 columns = columns.drop(exclude)
2540
2541 manager = _get_option("mode.data_manager", silent=True)
2542 mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager)
2543
2544 return cls._from_mgr(mgr, axes=mgr.axes)
2545
2546 def to_records(
2547 self, index: bool = True, column_dtypes=None, index_dtypes=None
2548 ) -> np.rec.recarray:
2549 """
2550 Convert DataFrame to a NumPy record array.
2551
2552 Index will be included as the first field of the record array if
2553 requested.
2554
2555 Parameters
2556 ----------
2557 index : bool, default True
2558 Include index in resulting record array, stored in 'index'
2559 field or using the index label, if set.
2560 column_dtypes : str, type, dict, default None
2561 If a string or type, the data type to store all columns. If
2562 a dictionary, a mapping of column names and indices (zero-indexed)
2563 to specific data types.
2564 index_dtypes : str, type, dict, default None
2565 If a string or type, the data type to store all index levels. If
2566 a dictionary, a mapping of index level names and indices
2567 (zero-indexed) to specific data types.
2568
2569 This mapping is applied only if `index=True`.
2570
2571 Returns
2572 -------
2573 numpy.rec.recarray
2574 NumPy ndarray with the DataFrame labels as fields and each row
2575 of the DataFrame as entries.
2576
2577 See Also
2578 --------
2579 DataFrame.from_records: Convert structured or record ndarray
2580 to DataFrame.
2581 numpy.rec.recarray: An ndarray that allows field access using
2582 attributes, analogous to typed columns in a
2583 spreadsheet.
2584
2585 Examples
2586 --------
2587 >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
2588 ... index=['a', 'b'])
2589 >>> df
2590 A B
2591 a 1 0.50
2592 b 2 0.75
2593 >>> df.to_records()
2594 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
2595 dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
2596
2597 If the DataFrame index has no label then the recarray field name
2598 is set to 'index'. If the index has a label then this is used as the
2599 field name:
2600
2601 >>> df.index = df.index.rename("I")
2602 >>> df.to_records()
2603 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
2604 dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
2605
2606 The index can be excluded from the record array:
2607
2608 >>> df.to_records(index=False)
2609 rec.array([(1, 0.5 ), (2, 0.75)],
2610 dtype=[('A', '<i8'), ('B', '<f8')])
2611
2612 Data types can be specified for the columns:
2613
2614 >>> df.to_records(column_dtypes={"A": "int32"})
2615 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
2616 dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
2617
2618 As well as for the index:
2619
2620 >>> df.to_records(index_dtypes="<S2")
2621 rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
2622 dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
2623
2624 >>> index_dtypes = f"<S{df.index.str.len().max()}"
2625 >>> df.to_records(index_dtypes=index_dtypes)
2626 rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
2627 dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
2628 """
2629 if index:
2630 ix_vals = [
2631 np.asarray(self.index.get_level_values(i))
2632 for i in range(self.index.nlevels)
2633 ]
2634
2635 arrays = ix_vals + [
2636 np.asarray(self.iloc[:, i]) for i in range(len(self.columns))
2637 ]
2638
2639 index_names = list(self.index.names)
2640
2641 if isinstance(self.index, MultiIndex):
2642 index_names = com.fill_missing_names(index_names)
2643 elif index_names[0] is None:
2644 index_names = ["index"]
2645
2646 names = [str(name) for name in itertools.chain(index_names, self.columns)]
2647 else:
2648 arrays = [np.asarray(self.iloc[:, i]) for i in range(len(self.columns))]
2649 names = [str(c) for c in self.columns]
2650 index_names = []
2651
2652 index_len = len(index_names)
2653 formats = []
2654
2655 for i, v in enumerate(arrays):
2656 index_int = i
2657
2658 # When the names and arrays are collected, we
2659 # first collect those in the DataFrame's index,
2660 # followed by those in its columns.
2661 #
2662 # Thus, the total length of the array is:
2663 # len(index_names) + len(DataFrame.columns).
2664 #
2665 # This check allows us to see whether we are
2666 # handling a name / array in the index or column.
2667 if index_int < index_len:
2668 dtype_mapping = index_dtypes
2669 name = index_names[index_int]
2670 else:
2671 index_int -= index_len
2672 dtype_mapping = column_dtypes
2673 name = self.columns[index_int]
2674
2675 # We have a dictionary, so we get the data type
2676 # associated with the index or column (which can
2677 # be denoted by its name in the DataFrame or its
            # position in the DataFrame's array of indices or
            # columns, whichever is applicable).
2680 if is_dict_like(dtype_mapping):
2681 if name in dtype_mapping:
2682 dtype_mapping = dtype_mapping[name]
2683 elif index_int in dtype_mapping:
2684 dtype_mapping = dtype_mapping[index_int]
2685 else:
2686 dtype_mapping = None
2687
2688 # If no mapping can be found, use the array's
2689 # dtype attribute for formatting.
2690 #
2691 # A valid dtype must either be a type or
2692 # string naming a type.
2693 if dtype_mapping is None:
2694 formats.append(v.dtype)
2695 elif isinstance(dtype_mapping, (type, np.dtype, str)):
2696 # error: Argument 1 to "append" of "list" has incompatible
2697 # type "Union[type, dtype[Any], str]"; expected "dtype[Any]"
2698 formats.append(dtype_mapping) # type: ignore[arg-type]
2699 else:
2700 element = "row" if i < index_len else "column"
2701 msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}"
2702 raise ValueError(msg)
2703
2704 return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats})
2705
2706 @classmethod
2707 def _from_arrays(
2708 cls,
2709 arrays,
2710 columns,
2711 index,
2712 dtype: Dtype | None = None,
2713 verify_integrity: bool = True,
2714 ) -> Self:
2715 """
2716 Create DataFrame from a list of arrays corresponding to the columns.
2717
2718 Parameters
2719 ----------
2720 arrays : list-like of arrays
2721 Each array in the list corresponds to one column, in order.
2722 columns : list-like, Index
2723 The column names for the resulting DataFrame.
2724 index : list-like, Index
2725 The rows labels for the resulting DataFrame.
2726 dtype : dtype, optional
2727 Optional dtype to enforce for all arrays.
2728 verify_integrity : bool, default True
2729 Validate and homogenize all input. If set to False, it is assumed
            that all elements of `arrays` are actual arrays, as they will be
            stored in a block (numpy ndarray or ExtensionArray), that they
            have the same length as and are aligned with the index, and that
            `columns` and `index` are already ``Index`` objects.
2734
2735 Returns
2736 -------
2737 DataFrame
2738 """
2739 if dtype is not None:
2740 dtype = pandas_dtype(dtype)
2741
2742 manager = _get_option("mode.data_manager", silent=True)
2743 columns = ensure_index(columns)
2744 if len(columns) != len(arrays):
2745 raise ValueError("len(columns) must match len(arrays)")
2746 mgr = arrays_to_mgr(
2747 arrays,
2748 columns,
2749 index,
2750 dtype=dtype,
2751 verify_integrity=verify_integrity,
2752 typ=manager,
2753 )
2754 return cls._from_mgr(mgr, axes=mgr.axes)
2755
2756 @doc(
2757 storage_options=_shared_docs["storage_options"],
2758 compression_options=_shared_docs["compression_options"] % "path",
2759 )
2760 def to_stata(
2761 self,
2762 path: FilePath | WriteBuffer[bytes],
2763 *,
2764 convert_dates: dict[Hashable, str] | None = None,
2765 write_index: bool = True,
2766 byteorder: ToStataByteorder | None = None,
2767 time_stamp: datetime.datetime | None = None,
2768 data_label: str | None = None,
2769 variable_labels: dict[Hashable, str] | None = None,
2770 version: int | None = 114,
2771 convert_strl: Sequence[Hashable] | None = None,
2772 compression: CompressionOptions = "infer",
2773 storage_options: StorageOptions | None = None,
2774 value_labels: dict[Hashable, dict[float, str]] | None = None,
2775 ) -> None:
2776 """
2777 Export DataFrame object to Stata dta format.
2778
2779 Writes the DataFrame to a Stata dataset file.
2780 "dta" files contain a Stata dataset.
2781
2782 Parameters
2783 ----------
2784 path : str, path object, or buffer
2785 String, path object (implementing ``os.PathLike[str]``), or file-like
2786 object implementing a binary ``write()`` function.
2787
2788 convert_dates : dict
2789 Dictionary mapping columns containing datetime types to stata
2790 internal format to use when writing the dates. Options are 'tc',
2791 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
2792 or a name. Datetime columns that do not have a conversion type
2793 specified will be converted to 'tc'. Raises NotImplementedError if
2794 a datetime column has timezone information.
2795 write_index : bool
2796 Write the index to Stata dataset.
2797 byteorder : str
            Can be ">", "<", "little", or "big". Default is ``sys.byteorder``.
2799 time_stamp : datetime
2800 A datetime to use as file creation date. Default is the current
2801 time.
2802 data_label : str, optional
2803 A label for the data set. Must be 80 characters or smaller.
2804 variable_labels : dict
2805 Dictionary containing columns as keys and variable labels as
2806 values. Each label must be 80 characters or smaller.
2807 version : {{114, 117, 118, 119, None}}, default 114
2808 Version to use in the output dta file. Set to None to let pandas
2809 decide between 118 or 119 formats depending on the number of
2810 columns in the frame. Version 114 can be read by Stata 10 and
2811 later. Version 117 can be read by Stata 13 or later. Version 118
2812 is supported in Stata 14 and later. Version 119 is supported in
2813 Stata 15 and later. Version 114 limits string variables to 244
2814 characters or fewer while versions 117 and later allow strings
2815 with lengths up to 2,000,000 characters. Versions 118 and 119
2816 support Unicode characters, and version 119 supports more than
2817 32,767 variables.
2818
2819 Version 119 should usually only be used when the number of
2820 variables exceeds the capacity of dta format 118. Exporting
2821 smaller datasets in format 119 may have unintended consequences,
2822 and, as of November 2020, Stata SE cannot read version 119 files.
2823
2824 convert_strl : list, optional
2825 List of column names to convert to string columns to Stata StrL
2826 format. Only available if version is 117. Storing strings in the
2827 StrL format can produce smaller dta files if strings have more than
2828 8 characters and values are repeated.
2829 {compression_options}
2830
2831 .. versionchanged:: 1.4.0 Zstandard support.
2832
2833 {storage_options}
2834
2835 value_labels : dict of dicts
2836 Dictionary containing columns as keys and dictionaries of column value
2837 to labels as values. Labels for a single variable must be 32,000
2838 characters or smaller.
2839
2840 .. versionadded:: 1.4.0
2841
2842 Raises
2843 ------
2844 NotImplementedError
2845 * If datetimes contain timezone information
2846 * Column dtype is not representable in Stata
2847 ValueError
            * Columns listed in convert_dates are neither datetime64[ns]
              nor datetime.datetime
2850 * Column listed in convert_dates is not in DataFrame
2851 * Categorical label contains more than 32,000 characters
2852
2853 See Also
2854 --------
2855 read_stata : Import Stata data files.
2856 io.stata.StataWriter : Low-level writer for Stata data files.
2857 io.stata.StataWriter117 : Low-level writer for version 117 files.
2858
2859 Examples
2860 --------
2861 >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon',
2862 ... 'parrot'],
2863 ... 'speed': [350, 18, 361, 15]}})
2864 >>> df.to_stata('animals.dta') # doctest: +SKIP
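
        Variable labels and a newer dta version can be requested as well
        (a sketch; the label text is made up):

        >>> df.to_stata('animals.dta', version=118,
        ...             variable_labels={{'speed': 'Top speed'}})  # doctest: +SKIP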
2865 """
2866 if version not in (114, 117, 118, 119, None):
2867 raise ValueError("Only formats 114, 117, 118 and 119 are supported.")
2868 if version == 114:
2869 if convert_strl is not None:
2870 raise ValueError("strl is not supported in format 114")
2871 from pandas.io.stata import StataWriter as statawriter
2872 elif version == 117:
2873 # Incompatible import of "statawriter" (imported name has type
2874 # "Type[StataWriter117]", local name has type "Type[StataWriter]")
2875 from pandas.io.stata import ( # type: ignore[assignment]
2876 StataWriter117 as statawriter,
2877 )
2878 else: # versions 118 and 119
2879 # Incompatible import of "statawriter" (imported name has type
2880 # "Type[StataWriter117]", local name has type "Type[StataWriter]")
2881 from pandas.io.stata import ( # type: ignore[assignment]
2882 StataWriterUTF8 as statawriter,
2883 )
2884
2885 kwargs: dict[str, Any] = {}
2886 if version is None or version >= 117:
2887 # strl conversion is only supported >= 117
2888 kwargs["convert_strl"] = convert_strl
2889 if version is None or version >= 118:
2890 # Specifying the version is only supported for UTF8 (118 or 119)
2891 kwargs["version"] = version
2892
2893 writer = statawriter(
2894 path,
2895 self,
2896 convert_dates=convert_dates,
2897 byteorder=byteorder,
2898 time_stamp=time_stamp,
2899 data_label=data_label,
2900 write_index=write_index,
2901 variable_labels=variable_labels,
2902 compression=compression,
2903 storage_options=storage_options,
2904 value_labels=value_labels,
2905 **kwargs,
2906 )
2907 writer.write_file()
2908
2909 def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None:
2910 """
2911 Write a DataFrame to the binary Feather format.
2912
2913 Parameters
2914 ----------
2915 path : str, path object, file-like object
2916 String, path object (implementing ``os.PathLike[str]``), or file-like
2917 object implementing a binary ``write()`` function. If a string or a path,
2918 it will be used as Root Directory path when writing a partitioned dataset.
2919 **kwargs :
2920 Additional keywords passed to :func:`pyarrow.feather.write_feather`.
2921 This includes the `compression`, `compression_level`, `chunksize`
2922 and `version` keywords.
2923
2924 Notes
2925 -----
2926 This function writes the dataframe as a `feather file
2927 <https://arrow.apache.org/docs/python/feather.html>`_. Requires a default
        index. For saving the DataFrame with your custom index, use a method
        that supports custom indices, e.g. ``to_parquet``.
2930
2931 Examples
2932 --------
2933 >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
2934 >>> df.to_feather("file.feather") # doctest: +SKIP
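
        Keyword arguments are forwarded to :func:`pyarrow.feather.write_feather`,
        so a compression codec can be chosen, for example (sketch):

        >>> df.to_feather("file.feather", compression="zstd")  # doctest: +SKIP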
2935 """
2936 from pandas.io.feather_format import to_feather
2937
2938 to_feather(self, path, **kwargs)
2939
2940 @deprecate_nonkeyword_arguments(
2941 version="3.0", allowed_args=["self", "buf"], name="to_markdown"
2942 )
2943 @doc(
2944 Series.to_markdown,
2945 klass=_shared_doc_kwargs["klass"],
2946 storage_options=_shared_docs["storage_options"],
2947 examples="""Examples
2948 --------
2949 >>> df = pd.DataFrame(
2950 ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}
2951 ... )
2952 >>> print(df.to_markdown())
2953 | | animal_1 | animal_2 |
2954 |---:|:-----------|:-----------|
2955 | 0 | elk | dog |
2956 | 1 | pig | quetzal |
2957
2958 Output markdown with a tabulate option.
2959
2960 >>> print(df.to_markdown(tablefmt="grid"))
2961 +----+------------+------------+
2962 | | animal_1 | animal_2 |
2963 +====+============+============+
2964 | 0 | elk | dog |
2965 +----+------------+------------+
2966 | 1 | pig | quetzal |
2967 +----+------------+------------+""",
2968 )
2969 def to_markdown(
2970 self,
2971 buf: FilePath | WriteBuffer[str] | None = None,
2972 mode: str = "wt",
2973 index: bool = True,
2974 storage_options: StorageOptions | None = None,
2975 **kwargs,
2976 ) -> str | None:
2977 if "showindex" in kwargs:
            raise ValueError("Pass 'index' instead of 'showindex'")
2979
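        # Translate to tabulate's keyword names; explicit user-supplied kwargs
        # take precedence over these defaults.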
2980 kwargs.setdefault("headers", "keys")
2981 kwargs.setdefault("tablefmt", "pipe")
2982 kwargs.setdefault("showindex", index)
2983 tabulate = import_optional_dependency("tabulate")
2984 result = tabulate.tabulate(self, **kwargs)
2985 if buf is None:
2986 return result
2987
2988 with get_handle(buf, mode, storage_options=storage_options) as handles:
2989 handles.handle.write(result)
2990 return None
2991
2992 @overload
2993 def to_parquet(
2994 self,
2995 path: None = ...,
2996 engine: Literal["auto", "pyarrow", "fastparquet"] = ...,
2997 compression: str | None = ...,
2998 index: bool | None = ...,
2999 partition_cols: list[str] | None = ...,
3000 storage_options: StorageOptions = ...,
3001 **kwargs,
3002 ) -> bytes:
3003 ...
3004
3005 @overload
3006 def to_parquet(
3007 self,
3008 path: FilePath | WriteBuffer[bytes],
3009 engine: Literal["auto", "pyarrow", "fastparquet"] = ...,
3010 compression: str | None = ...,
3011 index: bool | None = ...,
3012 partition_cols: list[str] | None = ...,
3013 storage_options: StorageOptions = ...,
3014 **kwargs,
3015 ) -> None:
3016 ...
3017
3018 @deprecate_nonkeyword_arguments(
3019 version="3.0", allowed_args=["self", "path"], name="to_parquet"
3020 )
3021 @doc(storage_options=_shared_docs["storage_options"])
3022 def to_parquet(
3023 self,
3024 path: FilePath | WriteBuffer[bytes] | None = None,
3025 engine: Literal["auto", "pyarrow", "fastparquet"] = "auto",
3026 compression: str | None = "snappy",
3027 index: bool | None = None,
3028 partition_cols: list[str] | None = None,
3029 storage_options: StorageOptions | None = None,
3030 **kwargs,
3031 ) -> bytes | None:
3032 """
3033 Write a DataFrame to the binary parquet format.
3034
3035 This function writes the dataframe as a `parquet file
3036 <https://parquet.apache.org/>`_. You can choose different parquet
3037 backends, and have the option of compression. See
3038 :ref:`the user guide <io.parquet>` for more details.
3039
3040 Parameters
3041 ----------
3042 path : str, path object, file-like object, or None, default None
3043 String, path object (implementing ``os.PathLike[str]``), or file-like
3044 object implementing a binary ``write()`` function. If None, the result is
3045 returned as bytes. If a string or path, it will be used as Root Directory
3046 path when writing a partitioned dataset.
3047 engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
3048 Parquet library to use. If 'auto', then the option
3049 ``io.parquet.engine`` is used. The default ``io.parquet.engine``
3050 behavior is to try 'pyarrow', falling back to 'fastparquet' if
3051 'pyarrow' is unavailable.
3052 compression : str or None, default 'snappy'
3053 Name of the compression to use. Use ``None`` for no compression.
3054 Supported options: 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'.
3055 index : bool, default None
3056 If ``True``, include the dataframe's index(es) in the file output.
3057 If ``False``, they will not be written to the file.
            If ``None``, similar to ``True``, the dataframe's index(es)
3059 will be saved. However, instead of being saved as values,
3060 the RangeIndex will be stored as a range in the metadata so it
3061 doesn't require much space and is faster. Other indexes will
3062 be included as columns in the file output.
3063 partition_cols : list, optional, default None
3064 Column names by which to partition the dataset.
3065 Columns are partitioned in the order they are given.
3066 Must be None if path is not a string.
3067 {storage_options}
3068
3069 **kwargs
3070 Additional arguments passed to the parquet library. See
3071 :ref:`pandas io <io.parquet>` for more details.
3072
3073 Returns
3074 -------
3075 bytes if no path argument is provided else None
3076
3077 See Also
3078 --------
3079 read_parquet : Read a parquet file.
3080 DataFrame.to_orc : Write an orc file.
3081 DataFrame.to_csv : Write a csv file.
3082 DataFrame.to_sql : Write to a sql table.
3083 DataFrame.to_hdf : Write to hdf.
3084
3085 Notes
3086 -----
3087 This function requires either the `fastparquet
3088 <https://pypi.org/project/fastparquet>`_ or `pyarrow
3089 <https://arrow.apache.org/docs/python/>`_ library.
3090
3091 Examples
3092 --------
3093 >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}})
3094 >>> df.to_parquet('df.parquet.gzip',
3095 ... compression='gzip') # doctest: +SKIP
3096 >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP
3097 col1 col2
3098 0 1 3
3099 1 2 4
3100
3101 If you want to get a buffer to the parquet content you can use a io.BytesIO
3102 object, as long as you don't use partition_cols, which creates multiple files.
3103
3104 >>> import io
3105 >>> f = io.BytesIO()
3106 >>> df.to_parquet(f)
3107 >>> f.seek(0)
3108 0
3109 >>> content = f.read()
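
        When ``partition_cols`` is given, ``path`` is used as the root
        directory of a partitioned dataset, one file per partition value
        (a sketch):

        >>> df.to_parquet("dataset_root", partition_cols=["col1"])  # doctest: +SKIP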
3110 """
3111 from pandas.io.parquet import to_parquet
3112
3113 return to_parquet(
3114 self,
3115 path,
3116 engine,
3117 compression=compression,
3118 index=index,
3119 partition_cols=partition_cols,
3120 storage_options=storage_options,
3121 **kwargs,
3122 )
3123
3124 def to_orc(
3125 self,
3126 path: FilePath | WriteBuffer[bytes] | None = None,
3127 *,
3128 engine: Literal["pyarrow"] = "pyarrow",
3129 index: bool | None = None,
3130 engine_kwargs: dict[str, Any] | None = None,
3131 ) -> bytes | None:
3132 """
3133 Write a DataFrame to the ORC format.
3134
3135 .. versionadded:: 1.5.0
3136
3137 Parameters
3138 ----------
3139 path : str, file-like object or None, default None
3140 If a string, it will be used as Root Directory path
3141 when writing a partitioned dataset. By file-like object,
3142 we refer to objects with a write() method, such as a file handle
3143 (e.g. via builtin open function). If path is None,
3144 a bytes object is returned.
3145 engine : {'pyarrow'}, default 'pyarrow'
3146 ORC library to use.
3147 index : bool, optional
3148 If ``True``, include the dataframe's index(es) in the file output.
3149 If ``False``, they will not be written to the file.
            If ``None``, similar to ``infer``, the dataframe's index(es)
3151 will be saved. However, instead of being saved as values,
3152 the RangeIndex will be stored as a range in the metadata so it
3153 doesn't require much space and is faster. Other indexes will
3154 be included as columns in the file output.
3155 engine_kwargs : dict[str, Any] or None, default None
3156 Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
3157
3158 Returns
3159 -------
3160 bytes if no path argument is provided else None
3161
3162 Raises
3163 ------
3164 NotImplementedError
3165 Dtype of one or more columns is category, unsigned integers, interval,
3166 period or sparse.
3167 ValueError
3168 engine is not pyarrow.
3169
3170 See Also
3171 --------
        read_orc : Read an ORC file.
3173 DataFrame.to_parquet : Write a parquet file.
3174 DataFrame.to_csv : Write a csv file.
3175 DataFrame.to_sql : Write to a sql table.
3176 DataFrame.to_hdf : Write to hdf.
3177
3178 Notes
3179 -----
3180 * Before using this function you should read the :ref:`user guide about
3181 ORC <io.orc>` and :ref:`install optional dependencies <install.warn_orc>`.
        * This function requires the `pyarrow <https://arrow.apache.org/docs/python/>`_
          library.
3184 * For supported dtypes please refer to `supported ORC features in Arrow
3185 <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
3186 * Currently timezones in datetime columns are not preserved when a
3187 dataframe is converted into ORC files.
3188
3189 Examples
3190 --------
3191 >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]})
3192 >>> df.to_orc('df.orc') # doctest: +SKIP
3193 >>> pd.read_orc('df.orc') # doctest: +SKIP
3194 col1 col2
3195 0 1 4
3196 1 2 3
3197
3198 If you want to get a buffer to the orc content you can write it to io.BytesIO
3199
3200 >>> import io
3201 >>> b = io.BytesIO(df.to_orc()) # doctest: +SKIP
3202 >>> b.seek(0) # doctest: +SKIP
3203 0
3204 >>> content = b.read() # doctest: +SKIP
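
        Extra keywords are passed through ``engine_kwargs``; for example a
        compression codec (a sketch, assuming the installed pyarrow version
        accepts a ``compression`` keyword in :func:`pyarrow.orc.write_table`):

        >>> df.to_orc('df.orc', engine_kwargs={'compression': 'zstd'})  # doctest: +SKIP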
3205 """
3206 from pandas.io.orc import to_orc
3207
3208 return to_orc(
3209 self, path, engine=engine, index=index, engine_kwargs=engine_kwargs
3210 )
3211
3212 @overload
3213 def to_html(
3214 self,
3215 buf: FilePath | WriteBuffer[str],
3216 columns: Axes | None = ...,
3217 col_space: ColspaceArgType | None = ...,
3218 header: bool = ...,
3219 index: bool = ...,
3220 na_rep: str = ...,
3221 formatters: FormattersType | None = ...,
3222 float_format: FloatFormatType | None = ...,
3223 sparsify: bool | None = ...,
3224 index_names: bool = ...,
3225 justify: str | None = ...,
3226 max_rows: int | None = ...,
3227 max_cols: int | None = ...,
3228 show_dimensions: bool | str = ...,
3229 decimal: str = ...,
3230 bold_rows: bool = ...,
3231 classes: str | list | tuple | None = ...,
3232 escape: bool = ...,
3233 notebook: bool = ...,
3234 border: int | bool | None = ...,
3235 table_id: str | None = ...,
3236 render_links: bool = ...,
3237 encoding: str | None = ...,
3238 ) -> None:
3239 ...
3240
3241 @overload
3242 def to_html(
3243 self,
3244 buf: None = ...,
3245 columns: Axes | None = ...,
3246 col_space: ColspaceArgType | None = ...,
3247 header: bool = ...,
3248 index: bool = ...,
3249 na_rep: str = ...,
3250 formatters: FormattersType | None = ...,
3251 float_format: FloatFormatType | None = ...,
3252 sparsify: bool | None = ...,
3253 index_names: bool = ...,
3254 justify: str | None = ...,
3255 max_rows: int | None = ...,
3256 max_cols: int | None = ...,
3257 show_dimensions: bool | str = ...,
3258 decimal: str = ...,
3259 bold_rows: bool = ...,
3260 classes: str | list | tuple | None = ...,
3261 escape: bool = ...,
3262 notebook: bool = ...,
3263 border: int | bool | None = ...,
3264 table_id: str | None = ...,
3265 render_links: bool = ...,
3266 encoding: str | None = ...,
3267 ) -> str:
3268 ...
3269
3270 @deprecate_nonkeyword_arguments(
3271 version="3.0", allowed_args=["self", "buf"], name="to_html"
3272 )
3273 @Substitution(
3274 header_type="bool",
3275 header="Whether to print column labels, default True",
3276 col_space_type="str or int, list or dict of int or str",
3277 col_space="The minimum width of each column in CSS length "
3278 "units. An int is assumed to be px units.",
3279 )
3280 @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
3281 def to_html(
3282 self,
3283 buf: FilePath | WriteBuffer[str] | None = None,
3284 columns: Axes | None = None,
3285 col_space: ColspaceArgType | None = None,
3286 header: bool = True,
3287 index: bool = True,
3288 na_rep: str = "NaN",
3289 formatters: FormattersType | None = None,
3290 float_format: FloatFormatType | None = None,
3291 sparsify: bool | None = None,
3292 index_names: bool = True,
3293 justify: str | None = None,
3294 max_rows: int | None = None,
3295 max_cols: int | None = None,
3296 show_dimensions: bool | str = False,
3297 decimal: str = ".",
3298 bold_rows: bool = True,
3299 classes: str | list | tuple | None = None,
3300 escape: bool = True,
3301 notebook: bool = False,
3302 border: int | bool | None = None,
3303 table_id: str | None = None,
3304 render_links: bool = False,
3305 encoding: str | None = None,
3306 ) -> str | None:
3307 """
3308 Render a DataFrame as an HTML table.
3309 %(shared_params)s
3310 bold_rows : bool, default True
3311 Make the row labels bold in the output.
3312 classes : str or list or tuple, default None
3313 CSS class(es) to apply to the resulting html table.
3314 escape : bool, default True
3315 Convert the characters <, >, and & to HTML-safe sequences.
3316 notebook : {True, False}, default False
3317 Whether the generated HTML is for IPython Notebook.
3318 border : int
3319 A ``border=border`` attribute is included in the opening
3320 `<table>` tag. Default ``pd.options.display.html.border``.
3321 table_id : str, optional
3322 A css id is included in the opening `<table>` tag if specified.
3323 render_links : bool, default False
3324 Convert URLs to HTML links.
3325 encoding : str, default "utf-8"
3326 Set character encoding.
3327 %(returns)s
3328 See Also
3329 --------
3330 to_string : Convert DataFrame to a string.
3331
3332 Examples
3333 --------
3334 >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]})
3335 >>> html_string = '''<table border="1" class="dataframe">
3336 ... <thead>
3337 ... <tr style="text-align: right;">
3338 ... <th></th>
3339 ... <th>col1</th>
3340 ... <th>col2</th>
3341 ... </tr>
3342 ... </thead>
3343 ... <tbody>
3344 ... <tr>
3345 ... <th>0</th>
3346 ... <td>1</td>
3347 ... <td>4</td>
3348 ... </tr>
3349 ... <tr>
3350 ... <th>1</th>
3351 ... <td>2</td>
3352 ... <td>3</td>
3353 ... </tr>
3354 ... </tbody>
3355 ... </table>'''
3356 >>> assert html_string == df.to_html()
3357 """
3358 if justify is not None and justify not in fmt.VALID_JUSTIFY_PARAMETERS:
3359 raise ValueError("Invalid value for justify parameter")
3360
3361 formatter = fmt.DataFrameFormatter(
3362 self,
3363 columns=columns,
3364 col_space=col_space,
3365 na_rep=na_rep,
3366 header=header,
3367 index=index,
3368 formatters=formatters,
3369 float_format=float_format,
3370 bold_rows=bold_rows,
3371 sparsify=sparsify,
3372 justify=justify,
3373 index_names=index_names,
3374 escape=escape,
3375 decimal=decimal,
3376 max_rows=max_rows,
3377 max_cols=max_cols,
3378 show_dimensions=show_dimensions,
3379 )
        # TODO: a generic formatter would be in DataFrameFormatter
3381 return fmt.DataFrameRenderer(formatter).to_html(
3382 buf=buf,
3383 classes=classes,
3384 notebook=notebook,
3385 border=border,
3386 encoding=encoding,
3387 table_id=table_id,
3388 render_links=render_links,
3389 )
3390
3391 @overload
3392 def to_xml(
3393 self,
3394 path_or_buffer: None = ...,
3395 *,
3396 index: bool = ...,
3397 root_name: str | None = ...,
3398 row_name: str | None = ...,
3399 na_rep: str | None = ...,
3400 attr_cols: list[str] | None = ...,
3401 elem_cols: list[str] | None = ...,
3402 namespaces: dict[str | None, str] | None = ...,
3403 prefix: str | None = ...,
3404 encoding: str = ...,
3405 xml_declaration: bool | None = ...,
3406 pretty_print: bool | None = ...,
3407 parser: XMLParsers | None = ...,
3408 stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ...,
3409 compression: CompressionOptions = ...,
3410 storage_options: StorageOptions | None = ...,
3411 ) -> str:
3412 ...
3413
3414 @overload
3415 def to_xml(
3416 self,
3417 path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str],
3418 *,
3419 index: bool = ...,
3420 root_name: str | None = ...,
3421 row_name: str | None = ...,
3422 na_rep: str | None = ...,
3423 attr_cols: list[str] | None = ...,
3424 elem_cols: list[str] | None = ...,
3425 namespaces: dict[str | None, str] | None = ...,
3426 prefix: str | None = ...,
3427 encoding: str = ...,
3428 xml_declaration: bool | None = ...,
3429 pretty_print: bool | None = ...,
3430 parser: XMLParsers | None = ...,
3431 stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ...,
3432 compression: CompressionOptions = ...,
3433 storage_options: StorageOptions | None = ...,
3434 ) -> None:
3435 ...
3436
3437 @deprecate_nonkeyword_arguments(
3438 version="3.0", allowed_args=["self", "path_or_buffer"], name="to_xml"
3439 )
3440 @doc(
3441 storage_options=_shared_docs["storage_options"],
3442 compression_options=_shared_docs["compression_options"] % "path_or_buffer",
3443 )
3444 def to_xml(
3445 self,
3446 path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
3447 index: bool = True,
3448 root_name: str | None = "data",
3449 row_name: str | None = "row",
3450 na_rep: str | None = None,
3451 attr_cols: list[str] | None = None,
3452 elem_cols: list[str] | None = None,
3453 namespaces: dict[str | None, str] | None = None,
3454 prefix: str | None = None,
3455 encoding: str = "utf-8",
3456 xml_declaration: bool | None = True,
3457 pretty_print: bool | None = True,
3458 parser: XMLParsers | None = "lxml",
3459 stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None,
3460 compression: CompressionOptions = "infer",
3461 storage_options: StorageOptions | None = None,
3462 ) -> str | None:
3463 """
3464 Render a DataFrame to an XML document.
3465
3466 .. versionadded:: 1.3.0
3467
3468 Parameters
3469 ----------
3470 path_or_buffer : str, path object, file-like object, or None, default None
3471 String, path object (implementing ``os.PathLike[str]``), or file-like
3472 object implementing a ``write()`` function. If None, the result is returned
3473 as a string.
3474 index : bool, default True
3475 Whether to include index in XML document.
3476 root_name : str, default 'data'
3477 The name of root element in XML document.
3478 row_name : str, default 'row'
3479 The name of row element in XML document.
3480 na_rep : str, optional
3481 Missing data representation.
3482 attr_cols : list-like, optional
3483 List of columns to write as attributes in row element.
3484 Hierarchical columns will be flattened with underscore
3485 delimiting the different levels.
3486 elem_cols : list-like, optional
3487 List of columns to write as children in row element. By default,
3488 all columns output as children of row element. Hierarchical
3489 columns will be flattened with underscore delimiting the
3490 different levels.
3491 namespaces : dict, optional
3492 All namespaces to be defined in root element. Keys of dict
3493 should be prefix names and values of dict corresponding URIs.
3494 Default namespaces should be given empty string key. For
3495 example, ::
3496
3497 namespaces = {{"": "https://example.com"}}
3498
3499 prefix : str, optional
3500 Namespace prefix to be used for every element and/or attribute
3501 in document. This should be one of the keys in ``namespaces``
3502 dict.
3503 encoding : str, default 'utf-8'
3504 Encoding of the resulting document.
3505 xml_declaration : bool, default True
3506 Whether to include the XML declaration at start of document.
3507 pretty_print : bool, default True
3508 Whether output should be pretty printed with indentation and
3509 line breaks.
3510 parser : {{'lxml','etree'}}, default 'lxml'
3511 Parser module to use for building of tree. Only 'lxml' and
3512 'etree' are supported. With 'lxml', the ability to use XSLT
3513 stylesheet is supported.
3514 stylesheet : str, path object or file-like object, optional
3515 A URL, file-like object, or a raw string containing an XSLT
            script used to transform the raw XML output. The script should use
            the layout of elements and attributes from the original output. This
            argument requires ``lxml`` to be installed. Only XSLT 1.0
            scripts, and not later versions, are currently supported.
3520 {compression_options}
3521
3522 .. versionchanged:: 1.4.0 Zstandard support.
3523
3524 {storage_options}
3525
3526 Returns
3527 -------
3528 None or str
            If ``path_or_buffer`` is None, returns the resulting XML format as
            a string. Otherwise returns None.
3531
3532 See Also
3533 --------
3534 to_json : Convert the pandas object to a JSON string.
        to_html : Convert a DataFrame to HTML.
3536
3537 Examples
3538 --------
3539 >>> df = pd.DataFrame({{'shape': ['square', 'circle', 'triangle'],
3540 ... 'degrees': [360, 360, 180],
3541 ... 'sides': [4, np.nan, 3]}})
3542
3543 >>> df.to_xml() # doctest: +SKIP
3544 <?xml version='1.0' encoding='utf-8'?>
3545 <data>
3546 <row>
3547 <index>0</index>
3548 <shape>square</shape>
3549 <degrees>360</degrees>
3550 <sides>4.0</sides>
3551 </row>
3552 <row>
3553 <index>1</index>
3554 <shape>circle</shape>
3555 <degrees>360</degrees>
3556 <sides/>
3557 </row>
3558 <row>
3559 <index>2</index>
3560 <shape>triangle</shape>
3561 <degrees>180</degrees>
3562 <sides>3.0</sides>
3563 </row>
3564 </data>
3565
3566 >>> df.to_xml(attr_cols=[
3567 ... 'index', 'shape', 'degrees', 'sides'
3568 ... ]) # doctest: +SKIP
3569 <?xml version='1.0' encoding='utf-8'?>
3570 <data>
3571 <row index="0" shape="square" degrees="360" sides="4.0"/>
3572 <row index="1" shape="circle" degrees="360"/>
3573 <row index="2" shape="triangle" degrees="180" sides="3.0"/>
3574 </data>
3575
3576 >>> df.to_xml(namespaces={{"doc": "https://example.com"}},
3577 ... prefix="doc") # doctest: +SKIP
3578 <?xml version='1.0' encoding='utf-8'?>
3579 <doc:data xmlns:doc="https://example.com">
3580 <doc:row>
3581 <doc:index>0</doc:index>
3582 <doc:shape>square</doc:shape>
3583 <doc:degrees>360</doc:degrees>
3584 <doc:sides>4.0</doc:sides>
3585 </doc:row>
3586 <doc:row>
3587 <doc:index>1</doc:index>
3588 <doc:shape>circle</doc:shape>
3589 <doc:degrees>360</doc:degrees>
3590 <doc:sides/>
3591 </doc:row>
3592 <doc:row>
3593 <doc:index>2</doc:index>
3594 <doc:shape>triangle</doc:shape>
3595 <doc:degrees>180</doc:degrees>
3596 <doc:sides>3.0</doc:sides>
3597 </doc:row>
3598 </doc:data>
3599 """
3600
3601 from pandas.io.formats.xml import (
3602 EtreeXMLFormatter,
3603 LxmlXMLFormatter,
3604 )
3605
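# errors="ignore" makes import_optional_dependency return None when lxml is
# not installed, so we can raise a clearer error message below.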
3606 lxml = import_optional_dependency("lxml.etree", errors="ignore")
3607
3608 TreeBuilder: type[EtreeXMLFormatter | LxmlXMLFormatter]
3609
3610 if parser == "lxml":
3611 if lxml is not None:
3612 TreeBuilder = LxmlXMLFormatter
3613 else:
3614 raise ImportError(
3615 "lxml not found, please install or use the etree parser."
3616 )
3617
3618 elif parser == "etree":
3619 TreeBuilder = EtreeXMLFormatter
3620
3621 else:
3622 raise ValueError("Values for parser can only be lxml or etree.")
3623
3624 xml_formatter = TreeBuilder(
3625 self,
3626 path_or_buffer=path_or_buffer,
3627 index=index,
3628 root_name=root_name,
3629 row_name=row_name,
3630 na_rep=na_rep,
3631 attr_cols=attr_cols,
3632 elem_cols=elem_cols,
3633 namespaces=namespaces,
3634 prefix=prefix,
3635 encoding=encoding,
3636 xml_declaration=xml_declaration,
3637 pretty_print=pretty_print,
3638 stylesheet=stylesheet,
3639 compression=compression,
3640 storage_options=storage_options,
3641 )
3642
3643 return xml_formatter.write_output()
3644
3645 # ----------------------------------------------------------------------
3646 @doc(INFO_DOCSTRING, **frame_sub_kwargs)
3647 def info(
3648 self,
3649 verbose: bool | None = None,
3650 buf: WriteBuffer[str] | None = None,
3651 max_cols: int | None = None,
3652 memory_usage: bool | str | None = None,
3653 show_counts: bool | None = None,
3654 ) -> None:
3655 info = DataFrameInfo(
3656 data=self,
3657 memory_usage=memory_usage,
3658 )
3659 info.render(
3660 buf=buf,
3661 max_cols=max_cols,
3662 verbose=verbose,
3663 show_counts=show_counts,
3664 )
3665
3666 def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
3667 """
3668 Return the memory usage of each column in bytes.
3669
3670 The memory usage can optionally include the contribution of
3671 the index and elements of `object` dtype.
3672
3673 This value is displayed in `DataFrame.info` by default. This can be
3674 suppressed by setting ``pandas.options.display.memory_usage`` to False.
3675
3676 Parameters
3677 ----------
3678 index : bool, default True
3679 Specifies whether to include the memory usage of the DataFrame's
3680 index in returned Series. If ``index=True``, the memory usage of
3681 the index is the first item in the output.
3682 deep : bool, default False
3683 If True, introspect the data deeply by interrogating
3684 `object` dtypes for system-level memory consumption, and include
3685 it in the returned values.
3686
3687 Returns
3688 -------
3689 Series
A Series whose index is the original column names and whose values
are the memory usage of each column in bytes.
3692
3693 See Also
3694 --------
3695 numpy.ndarray.nbytes : Total bytes consumed by the elements of an
3696 ndarray.
3697 Series.memory_usage : Bytes consumed by a Series.
3698 Categorical : Memory-efficient array for string values with
3699 many repeated values.
3700 DataFrame.info : Concise summary of a DataFrame.
3701
3702 Notes
3703 -----
3704 See the :ref:`Frequently Asked Questions <df-memory-usage>` for more
3705 details.
3706
3707 Examples
3708 --------
3709 >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
3710 >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t))
3711 ... for t in dtypes])
3712 >>> df = pd.DataFrame(data)
3713 >>> df.head()
3714 int64 float64 complex128 object bool
3715 0 1 1.0 1.0+0.0j 1 True
3716 1 1 1.0 1.0+0.0j 1 True
3717 2 1 1.0 1.0+0.0j 1 True
3718 3 1 1.0 1.0+0.0j 1 True
3719 4 1 1.0 1.0+0.0j 1 True
3720
3721 >>> df.memory_usage()
3722 Index 128
3723 int64 40000
3724 float64 40000
3725 complex128 80000
3726 object 40000
3727 bool 5000
3728 dtype: int64
3729
3730 >>> df.memory_usage(index=False)
3731 int64 40000
3732 float64 40000
3733 complex128 80000
3734 object 40000
3735 bool 5000
3736 dtype: int64
3737
3738 The memory footprint of `object` dtype columns is ignored by default:
3739
3740 >>> df.memory_usage(deep=True)
3741 Index 128
3742 int64 40000
3743 float64 40000
3744 complex128 80000
3745 object 180000
3746 bool 5000
3747 dtype: int64
3748
3749 Use a Categorical for efficient storage of an object-dtype column with
3750 many repeated values.
3751
3752 >>> df['object'].astype('category').memory_usage(deep=True)
3753 5244
3754 """
3755 result = self._constructor_sliced(
3756 [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
3757 index=self.columns,
3758 dtype=np.intp,
3759 )
3760 if index:
3761 index_memory_usage = self._constructor_sliced(
3762 self.index.memory_usage(deep=deep), index=["Index"]
3763 )
3764 result = index_memory_usage._append(result)
3765 return result
3766
3767 def transpose(self, *args, copy: bool = False) -> DataFrame:
3768 """
3769 Transpose index and columns.
3770
3771 Reflect the DataFrame over its main diagonal by writing rows as columns
3772 and vice-versa. The property :attr:`.T` is an accessor to the method
3773 :meth:`transpose`.
3774
3775 Parameters
3776 ----------
3777 *args : tuple, optional
3778 Accepted for compatibility with NumPy.
3779 copy : bool, default False
3780 Whether to copy the data after transposing, even for DataFrames
3781 with a single dtype.
3782
3783 Note that a copy is always required for mixed dtype DataFrames,
3784 or for DataFrames with any extension types.
3785
3786 .. note::
3787 The `copy` keyword will change behavior in pandas 3.0.
3788 `Copy-on-Write
3789 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
3790 will be enabled by default, which means that all methods with a
3791 `copy` keyword will use a lazy copy mechanism to defer the copy and
3792 ignore the `copy` keyword. The `copy` keyword will be removed in a
3793 future version of pandas.
3794
You can already get the future behavior and improvements by
enabling copy-on-write: ``pd.options.mode.copy_on_write = True``.
3797
3798 Returns
3799 -------
3800 DataFrame
3801 The transposed DataFrame.
3802
3803 See Also
3804 --------
3805 numpy.transpose : Permute the dimensions of a given array.
3806
3807 Notes
3808 -----
3809 Transposing a DataFrame with mixed dtypes will result in a homogeneous
3810 DataFrame with the `object` dtype. In such a case, a copy of the data
3811 is always made.
3812
3813 Examples
3814 --------
3815 **Square DataFrame with homogeneous dtype**
3816
3817 >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
3818 >>> df1 = pd.DataFrame(data=d1)
3819 >>> df1
3820 col1 col2
3821 0 1 3
3822 1 2 4
3823
3824 >>> df1_transposed = df1.T # or df1.transpose()
3825 >>> df1_transposed
3826 0 1
3827 col1 1 2
3828 col2 3 4
3829
3830 When the dtype is homogeneous in the original DataFrame, we get a
3831 transposed DataFrame with the same dtype:
3832
3833 >>> df1.dtypes
3834 col1 int64
3835 col2 int64
3836 dtype: object
3837 >>> df1_transposed.dtypes
3838 0 int64
3839 1 int64
3840 dtype: object
3841
3842 **Non-square DataFrame with mixed dtypes**
3843
3844 >>> d2 = {'name': ['Alice', 'Bob'],
3845 ... 'score': [9.5, 8],
3846 ... 'employed': [False, True],
3847 ... 'kids': [0, 0]}
3848 >>> df2 = pd.DataFrame(data=d2)
3849 >>> df2
3850 name score employed kids
3851 0 Alice 9.5 False 0
3852 1 Bob 8.0 True 0
3853
3854 >>> df2_transposed = df2.T # or df2.transpose()
3855 >>> df2_transposed
3856 0 1
3857 name Alice Bob
3858 score 9.5 8.0
3859 employed False True
3860 kids 0 0
3861
3862 When the DataFrame has mixed dtypes, we get a transposed DataFrame with
3863 the `object` dtype:
3864
3865 >>> df2.dtypes
3866 name object
3867 score float64
3868 employed bool
3869 kids int64
3870 dtype: object
3871 >>> df2_transposed.dtypes
3872 0 object
3873 1 object
3874 dtype: object
3875 """
3876 nv.validate_transpose(args, {})
3877 # construct the args
3878
3879 dtypes = list(self.dtypes)
3880
3881 if self._can_fast_transpose:
3882 # Note: tests pass without this, but this improves perf quite a bit.
3883 new_vals = self._values.T
3884 if copy and not using_copy_on_write():
3885 new_vals = new_vals.copy()
3886
3887 result = self._constructor(
3888 new_vals,
3889 index=self.columns,
3890 columns=self.index,
3891 copy=False,
3892 dtype=new_vals.dtype,
3893 )
3894 if using_copy_on_write() and len(self) > 0:
3895 result._mgr.add_references(self._mgr) # type: ignore[arg-type]
3896
3897 elif (
3898 self._is_homogeneous_type
3899 and dtypes
3900 and isinstance(dtypes[0], ExtensionDtype)
3901 ):
3902 new_values: list
3903 if isinstance(dtypes[0], BaseMaskedDtype):
3904 # We have masked arrays with the same dtype. We can transpose faster.
3905 from pandas.core.arrays.masked import (
3906 transpose_homogeneous_masked_arrays,
3907 )
3908
3909 new_values = transpose_homogeneous_masked_arrays(
3910 cast(Sequence[BaseMaskedArray], self._iter_column_arrays())
3911 )
3912 elif isinstance(dtypes[0], ArrowDtype):
3913 # We have arrow EAs with the same dtype. We can transpose faster.
3914 from pandas.core.arrays.arrow.array import (
3915 ArrowExtensionArray,
3916 transpose_homogeneous_pyarrow,
3917 )
3918
3919 new_values = transpose_homogeneous_pyarrow(
3920 cast(Sequence[ArrowExtensionArray], self._iter_column_arrays())
3921 )
3922 else:
3923 # We have other EAs with the same dtype. We preserve dtype in transpose.
3924 dtyp = dtypes[0]
3925 arr_typ = dtyp.construct_array_type()
3926 values = self.values
3927 new_values = [arr_typ._from_sequence(row, dtype=dtyp) for row in values]
3928
3929 result = type(self)._from_arrays(
3930 new_values,
3931 index=self.columns,
3932 columns=self.index,
3933 verify_integrity=False,
3934 )
3935
3936 else:
3937 new_arr = self.values.T
3938 if copy and not using_copy_on_write():
3939 new_arr = new_arr.copy()
3940 result = self._constructor(
3941 new_arr,
3942 index=self.columns,
3943 columns=self.index,
3944 dtype=new_arr.dtype,
3945 # We already made a copy (more than one block)
3946 copy=False,
3947 )
3948
3949 return result.__finalize__(self, method="transpose")
3950
3951 @property
3952 def T(self) -> DataFrame:
3953 """
3954 The transpose of the DataFrame.
3955
3956 Returns
3957 -------
3958 DataFrame
3959 The transposed DataFrame.
3960
3961 See Also
3962 --------
3963 DataFrame.transpose : Transpose index and columns.
3964
3965 Examples
3966 --------
3967 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
3968 >>> df
3969 col1 col2
3970 0 1 3
3971 1 2 4
3972
3973 >>> df.T
3974 0 1
3975 col1 1 2
3976 col2 3 4
3977 """
3978 return self.transpose()
3979
3980 # ----------------------------------------------------------------------
3981 # Indexing Methods
3982
3983 def _ixs(self, i: int, axis: AxisInt = 0) -> Series:
3984 """
3985 Parameters
3986 ----------
3987 i : int
3988 axis : int
3989
3990 Returns
3991 -------
3992 Series
3993 """
3994 # irow
3995 if axis == 0:
3996 new_mgr = self._mgr.fast_xs(i)
3997
3998 # if we are a copy, mark as such
3999 copy = isinstance(new_mgr.array, np.ndarray) and new_mgr.array.base is None
4000 result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes)
4001 result._name = self.index[i]
4002 result = result.__finalize__(self)
4003 result._set_is_copy(self, copy=copy)
4004 return result
4005
4006 # icol
4007 else:
4008 label = self.columns[i]
4009
4010 col_mgr = self._mgr.iget(i)
4011 result = self._box_col_values(col_mgr, i)
4012
4013 # this is a cached value, mark it so
4014 result._set_as_cached(label, self)
4015 return result
4016
4017 def _get_column_array(self, i: int) -> ArrayLike:
4018 """
4019 Get the values of the i'th column (ndarray or ExtensionArray, as stored
4020 in the Block)
4021
4022 Warning! The returned array is a view but doesn't handle Copy-on-Write,
4023 so this should be used with caution (for read-only purposes).
4024 """
4025 return self._mgr.iget_values(i)
4026
4027 def _iter_column_arrays(self) -> Iterator[ArrayLike]:
4028 """
4029 Iterate over the arrays of all columns in order.
4030 This returns the values as stored in the Block (ndarray or ExtensionArray).
4031
4032 Warning! The returned array is a view but doesn't handle Copy-on-Write,
4033 so this should be used with caution (for read-only purposes).
4034 """
4035 if isinstance(self._mgr, ArrayManager):
4036 yield from self._mgr.arrays
4037 else:
4038 for i in range(len(self.columns)):
4039 yield self._get_column_array(i)
4040
4041 def _getitem_nocopy(self, key: list):
4042 """
4043 Behaves like __getitem__, but returns a view in cases where __getitem__
4044 would make a copy.
4045 """
4046 # TODO(CoW): can be removed if/when we are always Copy-on-Write
4047 indexer = self.columns._get_indexer_strict(key, "columns")[1]
4048 new_axis = self.columns[indexer]
4049
4050 new_mgr = self._mgr.reindex_indexer(
4051 new_axis,
4052 indexer,
4053 axis=0,
4054 allow_dups=True,
4055 copy=False,
4056 only_slice=True,
4057 )
4058 result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
4059 result = result.__finalize__(self)
4060 return result
4061
4062 def __getitem__(self, key):
4063 check_dict_or_set_indexers(key)
4064 key = lib.item_from_zerodim(key)
4065 key = com.apply_if_callable(key, self)
4066
4067 if is_hashable(key) and not is_iterator(key):
4068 # is_iterator to exclude generator e.g. test_getitem_listlike
4069 # shortcut if the key is in columns
4070 is_mi = isinstance(self.columns, MultiIndex)
4071 # GH#45316 Return view if key is not duplicated
4072 # Only use drop_duplicates with duplicates for performance
4073 if not is_mi and (
4074 self.columns.is_unique
4075 and key in self.columns
4076 or key in self.columns.drop_duplicates(keep=False)
4077 ):
4078 return self._get_item_cache(key)
4079
4080 elif is_mi and self.columns.is_unique and key in self.columns:
4081 return self._getitem_multilevel(key)
4082
4083 # Do we have a slicer (on rows)?
4084 if isinstance(key, slice):
4085 return self._getitem_slice(key)
4086
4087 # Do we have a (boolean) DataFrame?
4088 if isinstance(key, DataFrame):
4089 return self.where(key)
4090
4091 # Do we have a (boolean) 1d indexer?
4092 if com.is_bool_indexer(key):
4093 return self._getitem_bool_array(key)
4094
# We are left with two options: a single key, and a collection of keys.
4096 # We interpret tuples as collections only for non-MultiIndex
4097 is_single_key = isinstance(key, tuple) or not is_list_like(key)
4098
4099 if is_single_key:
4100 if self.columns.nlevels > 1:
4101 return self._getitem_multilevel(key)
4102 indexer = self.columns.get_loc(key)
4103 if is_integer(indexer):
4104 indexer = [indexer]
4105 else:
4106 if is_iterator(key):
4107 key = list(key)
4108 indexer = self.columns._get_indexer_strict(key, "columns")[1]
4109
4110 # take() does not accept boolean indexers
4111 if getattr(indexer, "dtype", None) == bool:
4112 indexer = np.where(indexer)[0]
4113
4114 if isinstance(indexer, slice):
4115 return self._slice(indexer, axis=1)
4116
4117 data = self._take_with_is_copy(indexer, axis=1)
4118
4119 if is_single_key:
4120 # What does looking for a single key in a non-unique index return?
4121 # The behavior is inconsistent. It returns a Series, except when
4122 # - the key itself is repeated (test on data.shape, #9519), or
4123 # - we have a MultiIndex on columns (test on self.columns, #21309)
4124 if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex):
4125 # GH#26490 using data[key] can cause RecursionError
4126 return data._get_item_cache(key)
4127
4128 return data
4129
4130 def _getitem_bool_array(self, key):
4131 # also raises Exception if object array with NA values
4132 # warning here just in case -- previously __setitem__ was
4133 # reindexing but __getitem__ was not; it seems more reasonable to
4134 # go with the __setitem__ behavior since that is more consistent
4135 # with all other indexing behavior
4136 if isinstance(key, Series) and not key.index.equals(self.index):
4137 warnings.warn(
4138 "Boolean Series key will be reindexed to match DataFrame index.",
4139 UserWarning,
4140 stacklevel=find_stack_level(),
4141 )
4142 elif len(key) != len(self.index):
4143 raise ValueError(
4144 f"Item wrong length {len(key)} instead of {len(self.index)}."
4145 )
4146
4147 # check_bool_indexer will throw exception if Series key cannot
4148 # be reindexed to match DataFrame rows
4149 key = check_bool_indexer(self.index, key)
4150
4151 if key.all():
4152 return self.copy(deep=None)
4153
4154 indexer = key.nonzero()[0]
4155 return self._take_with_is_copy(indexer, axis=0)
4156
4157 def _getitem_multilevel(self, key):
4158 # self.columns is a MultiIndex
4159 loc = self.columns.get_loc(key)
4160 if isinstance(loc, (slice, np.ndarray)):
4161 new_columns = self.columns[loc]
4162 result_columns = maybe_droplevels(new_columns, key)
4163 result = self.iloc[:, loc]
4164 result.columns = result_columns
4165
4166 # If there is only one column being returned, and its name is
4167 # either an empty string, or a tuple with an empty string as its
4168 # first element, then treat the empty string as a placeholder
4169 # and return the column as if the user had provided that empty
4170 # string in the key. If the result is a Series, exclude the
4171 # implied empty string from its name.
4172 if len(result.columns) == 1:
4173 # e.g. test_frame_getitem_multicolumn_empty_level,
4174 # test_frame_mixed_depth_get, test_loc_setitem_single_column_slice
4175 top = result.columns[0]
4176 if isinstance(top, tuple):
4177 top = top[0]
4178 if top == "":
4179 result = result[""]
4180 if isinstance(result, Series):
4181 result = self._constructor_sliced(
4182 result, index=self.index, name=key
4183 )
4184
4185 result._set_is_copy(self)
4186 return result
4187 else:
4188 # loc is neither a slice nor ndarray, so must be an int
4189 return self._ixs(loc, axis=1)
4190
4191 def _get_value(self, index, col, takeable: bool = False) -> Scalar:
4192 """
4193 Quickly retrieve single value at passed column and index.
4194
4195 Parameters
4196 ----------
4197 index : row label
4198 col : column label
takeable : bool, default False
    Interpret the index/col as indexers.
4200
4201 Returns
4202 -------
4203 scalar
4204
4205 Notes
4206 -----
4207 Assumes that both `self.index._index_as_unique` and
`self.columns._index_as_unique` are True; the caller is responsible for checking.
4209 """
4210 if takeable:
4211 series = self._ixs(col, axis=1)
4212 return series._values[index]
4213
4214 series = self._get_item_cache(col)
4215 engine = self.index._engine
4216
4217 if not isinstance(self.index, MultiIndex):
4218 # CategoricalIndex: Trying to use the engine fastpath may give incorrect
# results if our categories are integers that don't match our codes
4220 # IntervalIndex: IntervalTree has no get_loc
4221 row = self.index.get_loc(index)
4222 return series._values[row]
4223
4224 # For MultiIndex going through engine effectively restricts us to
4225 # same-length tuples; see test_get_set_value_no_partial_indexing
4226 loc = engine.get_loc(index)
4227 return series._values[loc]
4228
4229 def isetitem(self, loc, value) -> None:
4230 """
4231 Set the given value in the column with position `loc`.
4232
4233 This is a positional analogue to ``__setitem__``.
4234
4235 Parameters
4236 ----------
4237 loc : int or sequence of ints
4238 Index position for the column.
4239 value : scalar or arraylike
4240 Value(s) for the column.
4241
4242 Notes
4243 -----
4244 ``frame.isetitem(loc, value)`` is an in-place method as it will
4245 modify the DataFrame in place (not returning a new object). In contrast to
``frame.iloc[:, i] = value``, which will try to update the existing values in
place, ``frame.isetitem(loc, value)`` will not update the values of the column
itself in place; it will instead insert a new array.
4249
4250 In cases where ``frame.columns`` is unique, this is equivalent to
4251 ``frame[frame.columns[i]] = value``.
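
Examples
--------
A minimal illustration: set the column at position 1 by value.

>>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
>>> df.isetitem(1, [10, 20])
>>> df
   A   B
0  1  10
1  2  20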
4252 """
4253 if isinstance(value, DataFrame):
4254 if is_integer(loc):
4255 loc = [loc]
4256
4257 if len(loc) != len(value.columns):
4258 raise ValueError(
4259 f"Got {len(loc)} positions but value has {len(value.columns)} "
4260 f"columns."
4261 )
4262
4263 for i, idx in enumerate(loc):
4264 arraylike, refs = self._sanitize_column(value.iloc[:, i])
4265 self._iset_item_mgr(idx, arraylike, inplace=False, refs=refs)
4266 return
4267
4268 arraylike, refs = self._sanitize_column(value)
4269 self._iset_item_mgr(loc, arraylike, inplace=False, refs=refs)
4270
4271 def __setitem__(self, key, value) -> None:
4272 if not PYPY and using_copy_on_write():
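# Heuristic: a reference count this low suggests ``self`` is a temporary,
# e.g. the intermediate object in chained indexing like ``df["a"]["b"] = value``,
# whose assignment would be lost under Copy-on-Write.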
4273 if sys.getrefcount(self) <= 3:
4274 warnings.warn(
4275 _chained_assignment_msg, ChainedAssignmentError, stacklevel=2
4276 )
4277 elif not PYPY and not using_copy_on_write():
4278 if sys.getrefcount(self) <= 3 and (
4279 warn_copy_on_write()
4280 or (
4281 not warn_copy_on_write()
4282 and any(b.refs.has_reference() for b in self._mgr.blocks) # type: ignore[union-attr]
4283 )
4284 ):
4285 warnings.warn(
4286 _chained_assignment_warning_msg, FutureWarning, stacklevel=2
4287 )
4288
4289 key = com.apply_if_callable(key, self)
4290
4291 # see if we can slice the rows
4292 if isinstance(key, slice):
4293 slc = self.index._convert_slice_indexer(key, kind="getitem")
4294 return self._setitem_slice(slc, value)
4295
4296 if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2:
4297 self._setitem_frame(key, value)
4298 elif isinstance(key, (Series, np.ndarray, list, Index)):
4299 self._setitem_array(key, value)
4300 elif isinstance(value, DataFrame):
4301 self._set_item_frame_value(key, value)
4302 elif (
4303 is_list_like(value)
4304 and not self.columns.is_unique
4305 and 1 < len(self.columns.get_indexer_for([key])) == len(value)
4306 ):
4307 # Column to set is duplicated
4308 self._setitem_array([key], value)
4309 else:
4310 # set column
4311 self._set_item(key, value)
4312
4313 def _setitem_slice(self, key: slice, value) -> None:
4314 # NB: we can't just use self.loc[key] = value because that
4315 # operates on labels and we need to operate positional for
4316 # backwards-compat, xref GH#31469
4317 self._check_setitem_copy()
4318 self.iloc[key] = value
4319
4320 def _setitem_array(self, key, value):
4321 # also raises Exception if object array with NA values
4322 if com.is_bool_indexer(key):
4323 # bool indexer is indexing along rows
4324 if len(key) != len(self.index):
4325 raise ValueError(
4326 f"Item wrong length {len(key)} instead of {len(self.index)}!"
4327 )
4328 key = check_bool_indexer(self.index, key)
4329 indexer = key.nonzero()[0]
4330 self._check_setitem_copy()
4331 if isinstance(value, DataFrame):
4332 # GH#39931 reindex since iloc does not align
4333 value = value.reindex(self.index.take(indexer))
4334 self.iloc[indexer] = value
4335
4336 else:
4337 # Note: unlike self.iloc[:, indexer] = value, this will
4338 # never try to overwrite values inplace
4339
4340 if isinstance(value, DataFrame):
4341 check_key_length(self.columns, key, value)
4342 for k1, k2 in zip(key, value.columns):
4343 self[k1] = value[k2]
4344
4345 elif not is_list_like(value):
4346 for col in key:
4347 self[col] = value
4348
4349 elif isinstance(value, np.ndarray) and value.ndim == 2:
4350 self._iset_not_inplace(key, value)
4351
4352 elif np.ndim(value) > 1:
4353 # list of lists
4354 value = DataFrame(value).values
4355 return self._setitem_array(key, value)
4356
4357 else:
4358 self._iset_not_inplace(key, value)
4359
4360 def _iset_not_inplace(self, key, value):
4361 # GH#39510 when setting with df[key] = obj with a list-like key and
4362 # list-like value, we iterate over those listlikes and set columns
4363 # one at a time. This is different from dispatching to
4364 # `self.loc[:, key]= value` because loc.__setitem__ may overwrite
4365 # data inplace, whereas this will insert new arrays.
4366
4367 def igetitem(obj, i: int):
4368 # Note: we catch DataFrame obj before getting here, but
4369 # hypothetically would return obj.iloc[:, i]
4370 if isinstance(obj, np.ndarray):
4371 return obj[..., i]
4372 else:
4373 return obj[i]
4374
4375 if self.columns.is_unique:
4376 if np.shape(value)[-1] != len(key):
4377 raise ValueError("Columns must be same length as key")
4378
4379 for i, col in enumerate(key):
4380 self[col] = igetitem(value, i)
4381
4382 else:
4383 ilocs = self.columns.get_indexer_non_unique(key)[0]
4384 if (ilocs < 0).any():
4385 # key entries not in self.columns
4386 raise NotImplementedError
4387
4388 if np.shape(value)[-1] != len(ilocs):
4389 raise ValueError("Columns must be same length as key")
4390
4391 assert np.ndim(value) <= 2
4392
4393 orig_columns = self.columns
4394
4395 # Using self.iloc[:, i] = ... may set values inplace, which
4396 # by convention we do not do in __setitem__
4397 try:
4398 self.columns = Index(range(len(self.columns)))
4399 for i, iloc in enumerate(ilocs):
4400 self[iloc] = igetitem(value, i)
4401 finally:
4402 self.columns = orig_columns
4403
4404 def _setitem_frame(self, key, value):
4405 # support boolean setting with DataFrame input, e.g.
4406 # df[df > df2] = 0
4407 if isinstance(key, np.ndarray):
4408 if key.shape != self.shape:
4409 raise ValueError("Array conditional must be same shape as self")
4410 key = self._constructor(key, **self._construct_axes_dict(), copy=False)
4411
4412 if key.size and not all(is_bool_dtype(dtype) for dtype in key.dtypes):
4413 raise TypeError(
4414 "Must pass DataFrame or 2-d ndarray with boolean values only"
4415 )
4416
4417 self._check_setitem_copy()
4418 self._where(-key, value, inplace=True)
4419
4420 def _set_item_frame_value(self, key, value: DataFrame) -> None:
4421 self._ensure_valid_index(value)
4422
4423 # align columns
4424 if key in self.columns:
4425 loc = self.columns.get_loc(key)
4426 cols = self.columns[loc]
4427 len_cols = 1 if is_scalar(cols) or isinstance(cols, tuple) else len(cols)
4428 if len_cols != len(value.columns):
4429 raise ValueError("Columns must be same length as key")
4430
4431 # align right-hand-side columns if self.columns
4432 # is multi-index and self[key] is a sub-frame
4433 if isinstance(self.columns, MultiIndex) and isinstance(
4434 loc, (slice, Series, np.ndarray, Index)
4435 ):
4436 cols_droplevel = maybe_droplevels(cols, key)
4437 if len(cols_droplevel) and not cols_droplevel.equals(value.columns):
4438 value = value.reindex(cols_droplevel, axis=1)
4439
4440 for col, col_droplevel in zip(cols, cols_droplevel):
4441 self[col] = value[col_droplevel]
4442 return
4443
4444 if is_scalar(cols):
4445 self[cols] = value[value.columns[0]]
4446 return
4447
4448 locs: np.ndarray | list
4449 if isinstance(loc, slice):
4450 locs = np.arange(loc.start, loc.stop, loc.step)
4451 elif is_scalar(loc):
4452 locs = [loc]
4453 else:
4454 locs = loc.nonzero()[0]
4455
4456 return self.isetitem(locs, value)
4457
4458 if len(value.columns) > 1:
4459 raise ValueError(
4460 "Cannot set a DataFrame with multiple columns to the single "
4461 f"column {key}"
4462 )
4463 elif len(value.columns) == 0:
4464 raise ValueError(
4465 f"Cannot set a DataFrame without columns to the column {key}"
4466 )
4467
4468 self[key] = value[value.columns[0]]
4469
4470 def _iset_item_mgr(
4471 self,
4472 loc: int | slice | np.ndarray,
4473 value,
4474 inplace: bool = False,
4475 refs: BlockValuesRefs | None = None,
4476 ) -> None:
4477 # when called from _set_item_mgr loc can be anything returned from get_loc
4478 self._mgr.iset(loc, value, inplace=inplace, refs=refs)
4479 self._clear_item_cache()
4480
4481 def _set_item_mgr(
4482 self, key, value: ArrayLike, refs: BlockValuesRefs | None = None
4483 ) -> None:
4484 try:
4485 loc = self._info_axis.get_loc(key)
4486 except KeyError:
4487 # This item wasn't present, just insert at end
4488 self._mgr.insert(len(self._info_axis), key, value, refs)
4489 else:
4490 self._iset_item_mgr(loc, value, refs=refs)
4491
4492 # check if we are modifying a copy
4493 # try to set first as we want an invalid
4494 # value exception to occur first
4495 if len(self):
4496 self._check_setitem_copy()
4497
4498 def _iset_item(self, loc: int, value: Series, inplace: bool = True) -> None:
4499 # We are only called from _replace_columnwise which guarantees that
4500 # no reindex is necessary
4501 if using_copy_on_write():
4502 self._iset_item_mgr(
4503 loc, value._values, inplace=inplace, refs=value._references
4504 )
4505 else:
4506 self._iset_item_mgr(loc, value._values.copy(), inplace=True)
4507
4508 # check if we are modifying a copy
4509 # try to set first as we want an invalid
4510 # value exception to occur first
4511 if len(self):
4512 self._check_setitem_copy()
4513
4514 def _set_item(self, key, value) -> None:
4515 """
4516 Add series to DataFrame in specified column.
4517
If series is a numpy-array (not a Series/TimeSeries), it must be the
same length as the DataFrame's index or an error will be raised.
4520
Series/TimeSeries will be conformed to the DataFrame's index to
4522 ensure homogeneity.
4523 """
4524 value, refs = self._sanitize_column(value)
4525
4526 if (
4527 key in self.columns
4528 and value.ndim == 1
4529 and not isinstance(value.dtype, ExtensionDtype)
4530 ):
4531 # broadcast across multiple columns if necessary
4532 if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
4533 existing_piece = self[key]
4534 if isinstance(existing_piece, DataFrame):
4535 value = np.tile(value, (len(existing_piece.columns), 1)).T
4536 refs = None
4537
4538 self._set_item_mgr(key, value, refs)
4539
4540 def _set_value(
4541 self, index: IndexLabel, col, value: Scalar, takeable: bool = False
4542 ) -> None:
4543 """
4544 Put single value at passed column and index.
4545
4546 Parameters
4547 ----------
4548 index : Label
4549 row label
4550 col : Label
4551 column label
4552 value : scalar
4553 takeable : bool, default False
Sets whether or not index/col are interpreted as indexers
4555 """
4556 try:
4557 if takeable:
4558 icol = col
4559 iindex = cast(int, index)
4560 else:
4561 icol = self.columns.get_loc(col)
4562 iindex = self.index.get_loc(index)
4563 self._mgr.column_setitem(icol, iindex, value, inplace_only=True)
4564 self._clear_item_cache()
4565
4566 except (KeyError, TypeError, ValueError, LossySetitemError):
4567 # get_loc might raise a KeyError for missing labels (falling back
4568 # to (i)loc will do expansion of the index)
4569 # column_setitem will do validation that may raise TypeError,
4570 # ValueError, or LossySetitemError
4571 # set using a non-recursive method & reset the cache
4572 if takeable:
4573 self.iloc[index, col] = value
4574 else:
4575 self.loc[index, col] = value
4576 self._item_cache.pop(col, None)
4577
4578 except InvalidIndexError as ii_err:
4579 # GH48729: Seems like you are trying to assign a value to a
4580 # row when only scalar options are permitted
4581 raise InvalidIndexError(
4582 f"You can only assign a scalar value not a {type(value)}"
4583 ) from ii_err
4584
4585 def _ensure_valid_index(self, value) -> None:
4586 """
Ensure that if we don't have an index, we can create one from the
passed value.
4589 """
4590 # GH5632, make sure that we are a Series convertible
4591 if not len(self.index) and is_list_like(value) and len(value):
4592 if not isinstance(value, DataFrame):
4593 try:
4594 value = Series(value)
4595 except (ValueError, NotImplementedError, TypeError) as err:
4596 raise ValueError(
4597 "Cannot set a frame with no defined index "
4598 "and a value that cannot be converted to a Series"
4599 ) from err
4600
4601 # GH31368 preserve name of index
4602 index_copy = value.index.copy()
4603 if self.index.name is not None:
4604 index_copy.name = self.index.name
4605
4606 self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan)
4607
4608 def _box_col_values(self, values: SingleDataManager, loc: int) -> Series:
4609 """
4610 Provide boxed values for a column.
4611 """
4612 # Lookup in columns so that if e.g. a str datetime was passed
4613 # we attach the Timestamp object as the name.
4614 name = self.columns[loc]
4615 # We get index=self.index bc values is a SingleDataManager
4616 obj = self._constructor_sliced_from_mgr(values, axes=values.axes)
4617 obj._name = name
4618 return obj.__finalize__(self)
4619
4620 # ----------------------------------------------------------------------
4621 # Lookup Caching
4622
4623 def _clear_item_cache(self) -> None:
4624 self._item_cache.clear()
4625
4626 def _get_item_cache(self, item: Hashable) -> Series:
4627 """Return the cached item, item represents a label indexer."""
4628 if using_copy_on_write() or warn_copy_on_write():
4629 loc = self.columns.get_loc(item)
4630 return self._ixs(loc, axis=1)
4631
4632 cache = self._item_cache
4633 res = cache.get(item)
4634 if res is None:
4635 # All places that call _get_item_cache have unique columns,
4636 # pending resolution of GH#33047
4637
4638 loc = self.columns.get_loc(item)
4639 res = self._ixs(loc, axis=1)
4640
4641 cache[item] = res
4642
4643 # for a chain
4644 res._is_copy = self._is_copy
4645 return res
4646
4647 def _reset_cacher(self) -> None:
4648 # no-op for DataFrame
4649 pass
4650
4651 def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None:
4652 """
4653 The object has called back to us saying maybe it has changed.
4654 """
4655 loc = self._info_axis.get_loc(item)
4656 arraylike = value._values
4657
4658 old = self._ixs(loc, axis=1)
4659 if old._values is value._values and inplace:
4660 # GH#46149 avoid making unnecessary copies/block-splitting
4661 return
4662
4663 self._mgr.iset(loc, arraylike, inplace=inplace)
4664
4665 # ----------------------------------------------------------------------
4666 # Unsorted
4667
4668 @overload
4669 def query(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> DataFrame:
4670 ...
4671
4672 @overload
4673 def query(self, expr: str, *, inplace: Literal[True], **kwargs) -> None:
4674 ...
4675
4676 @overload
4677 def query(self, expr: str, *, inplace: bool = ..., **kwargs) -> DataFrame | None:
4678 ...
4679
4680 def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | None:
4681 """
4682 Query the columns of a DataFrame with a boolean expression.
4683
4684 Parameters
4685 ----------
4686 expr : str
4687 The query string to evaluate.
4688
4689 You can refer to variables
4690 in the environment by prefixing them with an '@' character like
4691 ``@a + b``.
4692
4693 You can refer to column names that are not valid Python variable names
4694 by surrounding them in backticks. Thus, column names containing spaces
or punctuation (besides underscores) or starting with digits must be
4696 surrounded by backticks. (For example, a column named "Area (cm^2)" would
4697 be referenced as ```Area (cm^2)```). Column names which are Python keywords
4698 (like "list", "for", "import", etc) cannot be used.
4699
4700 For example, if one of your columns is called ``a a`` and you want
4701 to sum it with ``b``, your query should be ```a a` + b``.
4702
inplace : bool, default False
4704 Whether to modify the DataFrame rather than creating a new one.
4705 **kwargs
4706 See the documentation for :func:`eval` for complete details
4707 on the keyword arguments accepted by :meth:`DataFrame.query`.
4708
4709 Returns
4710 -------
4711 DataFrame or None
4712 DataFrame resulting from the provided query expression or
4713 None if ``inplace=True``.
4714
4715 See Also
4716 --------
4717 eval : Evaluate a string describing operations on
4718 DataFrame columns.
4719 DataFrame.eval : Evaluate a string describing operations on
4720 DataFrame columns.
4721
4722 Notes
4723 -----
4724 The result of the evaluation of this expression is first passed to
4725 :attr:`DataFrame.loc` and if that fails because of a
4726 multidimensional key (e.g., a DataFrame) then the result will be passed
4727 to :meth:`DataFrame.__getitem__`.
4728
4729 This method uses the top-level :func:`eval` function to
4730 evaluate the passed query.
4731
4732 The :meth:`~pandas.DataFrame.query` method uses a slightly
4733 modified Python syntax by default. For example, the ``&`` and ``|``
4734 (bitwise) operators have the precedence of their boolean cousins,
:keyword:`and` and :keyword:`or`. This *is* syntactically valid Python;
however, the semantics are different.
4737
4738 You can change the semantics of the expression by passing the keyword
4739 argument ``parser='python'``. This enforces the same semantics as
4740 evaluation in Python space. Likewise, you can pass ``engine='python'``
4741 to evaluate an expression using Python itself as a backend. This is not
4742 recommended as it is inefficient compared to using ``numexpr`` as the
4743 engine.
4744
4745 The :attr:`DataFrame.index` and
4746 :attr:`DataFrame.columns` attributes of the
4747 :class:`~pandas.DataFrame` instance are placed in the query namespace
4748 by default, which allows you to treat both the index and columns of the
4749 frame as a column in the frame.
4750 The identifier ``index`` is used for the frame index; you can also
4751 use the name of the index to identify it in a query. Please note that
4752 Python keywords may not be used as identifiers.
4753
4754 For further details and examples see the ``query`` documentation in
4755 :ref:`indexing <indexing.query>`.
4756
4757 *Backtick quoted variables*
4758
4759 Backtick quoted variables are parsed as literal Python code and
4760 are converted internally to a Python valid identifier.
4761 This can lead to the following problems.
4762
4763 During parsing a number of disallowed characters inside the backtick
4764 quoted string are replaced by strings that are allowed as a Python identifier.
4765 These characters include all operators in Python, the space character, the
4766 question mark, the exclamation mark, the dollar sign, and the euro sign.
4767 For other characters that fall outside the ASCII range (U+0001..U+007F)
4768 and those that are not further specified in PEP 3131,
4769 the query parser will raise an error.
Also disallowed are whitespace other than the space character,
the hash character (as it is used for comments), and the backtick
itself (the backtick cannot be escaped).
4773
4774 In a special case, quotes that make a pair around a backtick can
4775 confuse the parser.
4776 For example, ```it's` > `that's``` will raise an error,
4777 as it forms a quoted string (``'s > `that'``) with a backtick inside.
4778
4779 See also the Python documentation about lexical analysis
4780 (https://docs.python.org/3/reference/lexical_analysis.html)
4781 in combination with the source code in :mod:`pandas.core.computation.parsing`.
4782
4783 Examples
4784 --------
4785 >>> df = pd.DataFrame({'A': range(1, 6),
4786 ... 'B': range(10, 0, -2),
4787 ... 'C C': range(10, 5, -1)})
4788 >>> df
4789 A B C C
4790 0 1 10 10
4791 1 2 8 9
4792 2 3 6 8
4793 3 4 4 7
4794 4 5 2 6
4795 >>> df.query('A > B')
4796 A B C C
4797 4 5 2 6
4798
4799 The previous expression is equivalent to
4800
4801 >>> df[df.A > df.B]
4802 A B C C
4803 4 5 2 6
4804
4805 For columns with spaces in their name, you can use backtick quoting.
4806
4807 >>> df.query('B == `C C`')
4808 A B C C
4809 0 1 10 10
4810
4811 The previous expression is equivalent to
4812
4813 >>> df[df.B == df['C C']]
4814 A B C C
4815 0 1 10 10
4816 """
4817 inplace = validate_bool_kwarg(inplace, "inplace")
4818 if not isinstance(expr, str):
4819 msg = f"expr must be a string to be evaluated, {type(expr)} given"
4820 raise ValueError(msg)
4821 kwargs["level"] = kwargs.pop("level", 0) + 1
4822 kwargs["target"] = None
4823 res = self.eval(expr, **kwargs)
4824
4825 try:
4826 result = self.loc[res]
4827 except ValueError:
4828 # when res is multi-dimensional loc raises, but this is sometimes a
4829 # valid query
4830 result = self[res]
4831
4832 if inplace:
4833 self._update_inplace(result)
4834 return None
4835 else:
4836 return result
4837
4838 @overload
4839 def eval(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> Any:
4840 ...
4841
4842 @overload
4843 def eval(self, expr: str, *, inplace: Literal[True], **kwargs) -> None:
4844 ...
4845
4846 def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
4847 """
4848 Evaluate a string describing operations on DataFrame columns.
4849
4850 Operates on columns only, not specific rows or elements. This allows
4851 `eval` to run arbitrary code, which can make you vulnerable to code
4852 injection if you pass user input to this function.
4853
4854 Parameters
4855 ----------
4856 expr : str
4857 The expression string to evaluate.
4858 inplace : bool, default False
4859 If the expression contains an assignment, whether to perform the
4860 operation inplace and mutate the existing DataFrame. Otherwise,
4861 a new DataFrame is returned.
4862 **kwargs
4863 See the documentation for :func:`eval` for complete details
4864 on the keyword arguments accepted by
4865 :meth:`~pandas.DataFrame.query`.
4866
4867 Returns
4868 -------
4869 ndarray, scalar, pandas object, or None
4870 The result of the evaluation or None if ``inplace=True``.
4871
4872 See Also
4873 --------
4874 DataFrame.query : Evaluates a boolean expression to query the columns
4875 of a frame.
4876 DataFrame.assign : Can evaluate an expression or function to create new
4877 values for a column.
4878 eval : Evaluate a Python expression as a string using various
4879 backends.
4880
4881 Notes
4882 -----
4883 For more details see the API documentation for :func:`~eval`.
4884 For detailed examples see :ref:`enhancing performance with eval
4885 <enhancingperf.eval>`.
4886
4887 Examples
4888 --------
4889 >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
4890 >>> df
4891 A B
4892 0 1 10
4893 1 2 8
4894 2 3 6
4895 3 4 4
4896 4 5 2
4897 >>> df.eval('A + B')
4898 0 11
4899 1 10
4900 2 9
4901 3 8
4902 4 7
4903 dtype: int64
4904
Assignment is allowed, though by default the original DataFrame is not
4906 modified.
4907
4908 >>> df.eval('C = A + B')
4909 A B C
4910 0 1 10 11
4911 1 2 8 10
4912 2 3 6 9
4913 3 4 4 8
4914 4 5 2 7
4915 >>> df
4916 A B
4917 0 1 10
4918 1 2 8
4919 2 3 6
4920 3 4 4
4921 4 5 2
4922
4923 Multiple columns can be assigned to using multi-line expressions:
4924
4925 >>> df.eval(
4926 ... '''
4927 ... C = A + B
4928 ... D = A - B
4929 ... '''
4930 ... )
4931 A B C D
4932 0 1 10 11 -9
4933 1 2 8 10 -6
4934 2 3 6 9 -3
4935 3 4 4 8 0
4936 4 5 2 7 3
4937 """
4938 from pandas.core.computation.eval import eval as _eval
4939
4940 inplace = validate_bool_kwarg(inplace, "inplace")
4941 kwargs["level"] = kwargs.pop("level", 0) + 1
4942 index_resolvers = self._get_index_resolvers()
4943 column_resolvers = self._get_cleaned_column_resolvers()
4944 resolvers = column_resolvers, index_resolvers
4945 if "target" not in kwargs:
4946 kwargs["target"] = self
4947 kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers
4948
4949 return _eval(expr, inplace=inplace, **kwargs)
4950
4951 def select_dtypes(self, include=None, exclude=None) -> Self:
4952 """
4953 Return a subset of the DataFrame's columns based on the column dtypes.
4954
4955 Parameters
4956 ----------
4957 include, exclude : scalar or list-like
4958 A selection of dtypes or strings to be included/excluded. At least
4959 one of these parameters must be supplied.
4960
4961 Returns
4962 -------
4963 DataFrame
4964 The subset of the frame including the dtypes in ``include`` and
4965 excluding the dtypes in ``exclude``.
4966
4967 Raises
4968 ------
4969 ValueError
4970 * If both of ``include`` and ``exclude`` are empty
4971 * If ``include`` and ``exclude`` have overlapping elements
4972 * If any kind of string dtype is passed in.
4973
4974 See Also
4975 --------
4976 DataFrame.dtypes: Return Series with the data type of each column.
4977
4978 Notes
4979 -----
4980 * To select all *numeric* types, use ``np.number`` or ``'number'``
4981 * To select strings you must use the ``object`` dtype, but note that
4982 this will return *all* object dtype columns
4983 * See the `numpy dtype hierarchy
4984 <https://numpy.org/doc/stable/reference/arrays.scalars.html>`__
4985 * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
4986 ``'datetime64'``
4987 * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
4988 ``'timedelta64'``
4989 * To select Pandas categorical dtypes, use ``'category'``
4990 * To select Pandas datetimetz dtypes, use ``'datetimetz'``
4991 or ``'datetime64[ns, tz]'``
4992
4993 Examples
4994 --------
4995 >>> df = pd.DataFrame({'a': [1, 2] * 3,
4996 ... 'b': [True, False] * 3,
4997 ... 'c': [1.0, 2.0] * 3})
4998 >>> df
4999 a b c
5000 0 1 True 1.0
5001 1 2 False 2.0
5002 2 1 True 1.0
5003 3 2 False 2.0
5004 4 1 True 1.0
5005 5 2 False 2.0
5006
5007 >>> df.select_dtypes(include='bool')
5008 b
5009 0 True
5010 1 False
5011 2 True
5012 3 False
5013 4 True
5014 5 False
5015
5016 >>> df.select_dtypes(include=['float64'])
5017 c
5018 0 1.0
5019 1 2.0
5020 2 1.0
5021 3 2.0
5022 4 1.0
5023 5 2.0
5024
5025 >>> df.select_dtypes(exclude=['int64'])
5026 b c
5027 0 True 1.0
5028 1 False 2.0
5029 2 True 1.0
5030 3 False 2.0
5031 4 True 1.0
5032 5 False 2.0
5033 """
5034 if not is_list_like(include):
5035 include = (include,) if include is not None else ()
5036 if not is_list_like(exclude):
5037 exclude = (exclude,) if exclude is not None else ()
5038
5039 selection = (frozenset(include), frozenset(exclude))
5040
5041 if not any(selection):
5042 raise ValueError("at least one of include or exclude must be nonempty")
5043
5044 # convert the myriad valid dtypes object to a single representation
5045 def check_int_infer_dtype(dtypes):
5046 converted_dtypes: list[type] = []
5047 for dtype in dtypes:
# NumPy maps int to different types (int32, int64) on Windows and Linux
5049 # see https://github.com/numpy/numpy/issues/9464
5050 if (isinstance(dtype, str) and dtype == "int") or (dtype is int):
5051 converted_dtypes.append(np.int32)
5052 converted_dtypes.append(np.int64)
5053 elif dtype == "float" or dtype is float:
5054 # GH#42452 : np.dtype("float") coerces to np.float64 from Numpy 1.20
5055 converted_dtypes.extend([np.float64, np.float32])
5056 else:
5057 converted_dtypes.append(infer_dtype_from_object(dtype))
5058 return frozenset(converted_dtypes)
5059
5060 include = check_int_infer_dtype(include)
5061 exclude = check_int_infer_dtype(exclude)
5062
5063 for dtypes in (include, exclude):
5064 invalidate_string_dtypes(dtypes)
5065
5066 # can't both include AND exclude!
5067 if not include.isdisjoint(exclude):
5068 raise ValueError(f"include and exclude overlap on {(include & exclude)}")
5069
5070 def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool:
5071 # GH 46870: BooleanDtype._is_numeric == True but should be excluded
5072 dtype = dtype if not isinstance(dtype, ArrowDtype) else dtype.numpy_dtype
5073 return issubclass(dtype.type, tuple(dtypes_set)) or (
5074 np.number in dtypes_set
5075 and getattr(dtype, "_is_numeric", False)
5076 and not is_bool_dtype(dtype)
5077 )
5078
5079 def predicate(arr: ArrayLike) -> bool:
5080 dtype = arr.dtype
5081 if include:
5082 if not dtype_predicate(dtype, include):
5083 return False
5084
5085 if exclude:
5086 if dtype_predicate(dtype, exclude):
5087 return False
5088
5089 return True
5090
5091 mgr = self._mgr._get_data_subset(predicate).copy(deep=None)
5092 # error: Incompatible return value type (got "DataFrame", expected "Self")
5093 return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self) # type: ignore[return-value]
5094
5095 def insert(
5096 self,
5097 loc: int,
5098 column: Hashable,
5099 value: Scalar | AnyArrayLike,
5100 allow_duplicates: bool | lib.NoDefault = lib.no_default,
5101 ) -> None:
5102 """
5103 Insert column into DataFrame at specified location.
5104
5105 Raises a ValueError if `column` is already contained in the DataFrame,
5106 unless `allow_duplicates` is set to True.
5107
5108 Parameters
5109 ----------
5110 loc : int
Insertion index. Must satisfy 0 <= loc <= len(columns).
5112 column : str, number, or hashable object
5113 Label of the inserted column.
5114 value : Scalar, Series, or array-like
5115 Content of the inserted column.
5116 allow_duplicates : bool, optional, default lib.no_default
5117 Allow duplicate column labels to be created.
5118
5119 See Also
5120 --------
5121 Index.insert : Insert new item by index.
5122
5123 Examples
5124 --------
5125 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
5126 >>> df
5127 col1 col2
5128 0 1 3
5129 1 2 4
5130 >>> df.insert(1, "newcol", [99, 99])
5131 >>> df
5132 col1 newcol col2
5133 0 1 99 3
5134 1 2 99 4
5135 >>> df.insert(0, "col1", [100, 100], allow_duplicates=True)
5136 >>> df
5137 col1 col1 newcol col2
5138 0 100 1 99 3
5139 1 100 2 99 4
5140
Notice that pandas uses index alignment when `value` is a `Series`:
5142
5143 >>> df.insert(0, "col0", pd.Series([5, 6], index=[1, 2]))
5144 >>> df
5145 col0 col1 col1 newcol col2
5146 0 NaN 100 1 99 3
5147 1 5.0 100 2 99 4
5148 """
5149 if allow_duplicates is lib.no_default:
5150 allow_duplicates = False
5151 if allow_duplicates and not self.flags.allows_duplicate_labels:
5152 raise ValueError(
5153 "Cannot specify 'allow_duplicates=True' when "
5154 "'self.flags.allows_duplicate_labels' is False."
5155 )
5156 if not allow_duplicates and column in self.columns:
5157 # Should this be a different kind of error??
5158 raise ValueError(f"cannot insert {column}, already exists")
5159 if not is_integer(loc):
5160 raise TypeError("loc must be int")
5161 # convert non stdlib ints to satisfy typing checks
5162 loc = int(loc)
5163 if isinstance(value, DataFrame) and len(value.columns) > 1:
5164 raise ValueError(
5165 f"Expected a one-dimensional object, got a DataFrame with "
5166 f"{len(value.columns)} columns instead."
5167 )
5168 elif isinstance(value, DataFrame):
5169 value = value.iloc[:, 0]
5170
5171 value, refs = self._sanitize_column(value)
5172 self._mgr.insert(loc, column, value, refs=refs)
5173
5174 def assign(self, **kwargs) -> DataFrame:
5175 r"""
5176 Assign new columns to a DataFrame.
5177
5178 Returns a new object with all original columns in addition to new ones.
5179 Existing columns that are re-assigned will be overwritten.
5180
5181 Parameters
5182 ----------
5183 **kwargs : dict of {str: callable or Series}
5184 The column names are keywords. If the values are
5185 callable, they are computed on the DataFrame and
5186 assigned to the new columns. The callable must not
change the input DataFrame (though pandas doesn't check it).
5188 If the values are not callable, (e.g. a Series, scalar, or array),
5189 they are simply assigned.
5190
5191 Returns
5192 -------
5193 DataFrame
5194 A new DataFrame with the new columns in addition to
5195 all the existing columns.
5196
5197 Notes
5198 -----
5199 Assigning multiple columns within the same ``assign`` is possible.
5200 Later items in '\*\*kwargs' may refer to newly created or modified
5201 columns in 'df'; items are computed and assigned into 'df' in order.
5202
5203 Examples
5204 --------
5205 >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]},
5206 ... index=['Portland', 'Berkeley'])
5207 >>> df
5208 temp_c
5209 Portland 17.0
5210 Berkeley 25.0
5211
5212 Where the value is a callable, evaluated on `df`:
5213
5214 >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
5215 temp_c temp_f
5216 Portland 17.0 62.6
5217 Berkeley 25.0 77.0
5218
5219 Alternatively, the same behavior can be achieved by directly
5220 referencing an existing Series or sequence:
5221
5222 >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
5223 temp_c temp_f
5224 Portland 17.0 62.6
5225 Berkeley 25.0 77.0
5226
5227 You can create multiple columns within the same assign where one
5228 of the columns depends on another one defined within the same assign:
5229
5230 >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
5231 ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
5232 temp_c temp_f temp_k
5233 Portland 17.0 62.6 290.15
5234 Berkeley 25.0 77.0 298.15
5235 """
5236 data = self.copy(deep=None)
5237
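# Evaluate kwargs in order so later items can refer to columns created or
# modified by earlier ones (see Notes above).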
5238 for k, v in kwargs.items():
5239 data[k] = com.apply_if_callable(v, data)
5240 return data
5241
5242 def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]:
5243 """
5244 Ensures new columns (which go into the BlockManager as new blocks) are
5245 always copied (or a reference is being tracked to them under CoW)
5246 and converted into an array.
5247
5248 Parameters
5249 ----------
5250 value : scalar, Series, or array-like
5251
5252 Returns
5253 -------
5254 tuple of numpy.ndarray or ExtensionArray and optional BlockValuesRefs
5255 """
5256 self._ensure_valid_index(value)
5257
5258 # Using a DataFrame would mean coercing values to one dtype
5259 assert not isinstance(value, DataFrame)
5260 if is_dict_like(value):
5261 if not isinstance(value, Series):
5262 value = Series(value)
5263 return _reindex_for_setitem(value, self.index)
5264
5265 if is_list_like(value):
5266 com.require_length_match(value, self.index)
5267 arr = sanitize_array(value, self.index, copy=True, allow_2d=True)
5268 if (
5269 isinstance(value, Index)
5270 and value.dtype == "object"
5271 and arr.dtype != value.dtype
):
5273 # TODO: Remove kludge in sanitize_array for string mode when enforcing
5274 # this deprecation
5275 warnings.warn(
5276 "Setting an Index with object dtype into a DataFrame will stop "
5277 "inferring another dtype in a future version. Cast the Index "
5278 "explicitly before setting it into the DataFrame.",
5279 FutureWarning,
5280 stacklevel=find_stack_level(),
5281 )
5282 return arr, None
5283
5284 @property
5285 def _series(self):
5286 return {item: self._ixs(idx, axis=1) for idx, item in enumerate(self.columns)}
5287
5288 # ----------------------------------------------------------------------
5289 # Reindexing and alignment
5290
5291 def _reindex_multi(
5292 self, axes: dict[str, Index], copy: bool, fill_value
5293 ) -> DataFrame:
5294 """
5295 We are guaranteed non-Nones in the axes.
5296 """
5297
5298 new_index, row_indexer = self.index.reindex(axes["index"])
5299 new_columns, col_indexer = self.columns.reindex(axes["columns"])
5300
5301 if row_indexer is not None and col_indexer is not None:
5302 # Fastpath. By doing two 'take's at once we avoid making an
5303 # unnecessary copy.
5304 # We only get here with `self._can_fast_transpose`, which (almost)
5305 # ensures that self.values is cheap. It may be worth making this
5306 # condition more specific.
5307 indexer = row_indexer, col_indexer
5308 new_values = take_2d_multi(self.values, indexer, fill_value=fill_value)
5309 return self._constructor(
5310 new_values, index=new_index, columns=new_columns, copy=False
5311 )
5312 else:
5313 return self._reindex_with_indexers(
5314 {0: [new_index, row_indexer], 1: [new_columns, col_indexer]},
5315 copy=copy,
5316 fill_value=fill_value,
5317 )
5318
5319 @Appender(
5320 """
5321 Examples
5322 --------
5323 >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
5324
5325 Change the row labels.
5326
5327 >>> df.set_axis(['a', 'b', 'c'], axis='index')
5328 A B
5329 a 1 4
5330 b 2 5
5331 c 3 6
5332
5333 Change the column labels.
5334
5335 >>> df.set_axis(['I', 'II'], axis='columns')
5336 I II
5337 0 1 4
5338 1 2 5
5339 2 3 6
5340 """
5341 )
5342 @Substitution(
5343 klass=_shared_doc_kwargs["klass"],
5344 axes_single_arg=_shared_doc_kwargs["axes_single_arg"],
5345 extended_summary_sub=" column or",
5346 axis_description_sub=", and 1 identifies the columns",
5347 see_also_sub=" or columns",
5348 )
5349 @Appender(NDFrame.set_axis.__doc__)
5350 def set_axis(
5351 self,
5352 labels,
5353 *,
5354 axis: Axis = 0,
5355 copy: bool | None = None,
5356 ) -> DataFrame:
5357 return super().set_axis(labels, axis=axis, copy=copy)
5358
5359 @doc(
5360 NDFrame.reindex,
5361 klass=_shared_doc_kwargs["klass"],
5362 optional_reindex=_shared_doc_kwargs["optional_reindex"],
5363 )
5364 def reindex(
5365 self,
5366 labels=None,
5367 *,
5368 index=None,
5369 columns=None,
5370 axis: Axis | None = None,
5371 method: ReindexMethod | None = None,
5372 copy: bool | None = None,
5373 level: Level | None = None,
5374 fill_value: Scalar | None = np.nan,
5375 limit: int | None = None,
5376 tolerance=None,
5377 ) -> DataFrame:
5378 return super().reindex(
5379 labels=labels,
5380 index=index,
5381 columns=columns,
5382 axis=axis,
5383 method=method,
5384 copy=copy,
5385 level=level,
5386 fill_value=fill_value,
5387 limit=limit,
5388 tolerance=tolerance,
5389 )
5390
5391 @overload
5392 def drop(
5393 self,
5394 labels: IndexLabel = ...,
5395 *,
5396 axis: Axis = ...,
5397 index: IndexLabel = ...,
5398 columns: IndexLabel = ...,
5399 level: Level = ...,
5400 inplace: Literal[True],
5401 errors: IgnoreRaise = ...,
5402 ) -> None:
5403 ...
5404
5405 @overload
5406 def drop(
5407 self,
5408 labels: IndexLabel = ...,
5409 *,
5410 axis: Axis = ...,
5411 index: IndexLabel = ...,
5412 columns: IndexLabel = ...,
5413 level: Level = ...,
5414 inplace: Literal[False] = ...,
5415 errors: IgnoreRaise = ...,
5416 ) -> DataFrame:
5417 ...
5418
5419 @overload
5420 def drop(
5421 self,
5422 labels: IndexLabel = ...,
5423 *,
5424 axis: Axis = ...,
5425 index: IndexLabel = ...,
5426 columns: IndexLabel = ...,
5427 level: Level = ...,
5428 inplace: bool = ...,
5429 errors: IgnoreRaise = ...,
5430 ) -> DataFrame | None:
5431 ...
5432
5433 def drop(
5434 self,
5435 labels: IndexLabel | None = None,
5436 *,
5437 axis: Axis = 0,
5438 index: IndexLabel | None = None,
5439 columns: IndexLabel | None = None,
5440 level: Level | None = None,
5441 inplace: bool = False,
5442 errors: IgnoreRaise = "raise",
5443 ) -> DataFrame | None:
5444 """
5445 Drop specified labels from rows or columns.
5446
5447 Remove rows or columns by specifying label names and corresponding
5448 axis, or by directly specifying index or column names. When using a
5449 multi-index, labels on different levels can be removed by specifying
5450 the level. See the :ref:`user guide <advanced.shown_levels>`
5451 for more information about the now unused levels.
5452
5453 Parameters
5454 ----------
5455 labels : single label or list-like
5456 Index or column labels to drop. A tuple will be used as a single
5457 label and not treated as a list-like.
5458 axis : {0 or 'index', 1 or 'columns'}, default 0
5459 Whether to drop labels from the index (0 or 'index') or
5460 columns (1 or 'columns').
5461 index : single label or list-like
5462 Alternative to specifying axis (``labels, axis=0``
5463 is equivalent to ``index=labels``).
5464 columns : single label or list-like
5465 Alternative to specifying axis (``labels, axis=1``
5466 is equivalent to ``columns=labels``).
5467 level : int or level name, optional
5468 For MultiIndex, level from which the labels will be removed.
5469 inplace : bool, default False
5470 If False, return a copy. Otherwise, do operation
5471 in place and return None.
5472 errors : {'ignore', 'raise'}, default 'raise'
5473 If 'ignore', suppress error and only existing labels are
5474 dropped.
5475
5476 Returns
5477 -------
5478 DataFrame or None
            DataFrame without the removed index or column labels, or
            None if ``inplace=True``.
5481
5482 Raises
5483 ------
5484 KeyError
5485 If any of the labels is not found in the selected axis.
5486
5487 See Also
5488 --------
5489 DataFrame.loc : Label-location based indexer for selection by label.
5490 DataFrame.dropna : Return DataFrame with labels on given axis omitted
5491 where (all or any) data are missing.
5492 DataFrame.drop_duplicates : Return DataFrame with duplicate rows
5493 removed, optionally only considering certain columns.
5494 Series.drop : Return Series with specified index labels removed.
5495
5496 Examples
5497 --------
5498 >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
5499 ... columns=['A', 'B', 'C', 'D'])
5500 >>> df
5501 A B C D
5502 0 0 1 2 3
5503 1 4 5 6 7
5504 2 8 9 10 11
5505
5506 Drop columns
5507
5508 >>> df.drop(['B', 'C'], axis=1)
5509 A D
5510 0 0 3
5511 1 4 7
5512 2 8 11
5513
5514 >>> df.drop(columns=['B', 'C'])
5515 A D
5516 0 0 3
5517 1 4 7
5518 2 8 11
5519
5520 Drop a row by index
5521
5522 >>> df.drop([0, 1])
5523 A B C D
5524 2 8 9 10 11
5525
5526 Drop columns and/or rows of MultiIndex DataFrame
5527
5528 >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'],
5529 ... ['speed', 'weight', 'length']],
5530 ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
5531 ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
5532 >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
5533 ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
5534 ... [250, 150], [1.5, 0.8], [320, 250],
5535 ... [1, 0.8], [0.3, 0.2]])
5536 >>> df
5537 big small
5538 llama speed 45.0 30.0
5539 weight 200.0 100.0
5540 length 1.5 1.0
5541 cow speed 30.0 20.0
5542 weight 250.0 150.0
5543 length 1.5 0.8
5544 falcon speed 320.0 250.0
5545 weight 1.0 0.8
5546 length 0.3 0.2
5547
5548 Drop a specific index combination from the MultiIndex
5549 DataFrame, i.e., drop the combination ``'falcon'`` and
5550 ``'weight'``, which deletes only the corresponding row
5551
5552 >>> df.drop(index=('falcon', 'weight'))
5553 big small
5554 llama speed 45.0 30.0
5555 weight 200.0 100.0
5556 length 1.5 1.0
5557 cow speed 30.0 20.0
5558 weight 250.0 150.0
5559 length 1.5 0.8
5560 falcon speed 320.0 250.0
5561 length 0.3 0.2
5562
5563 >>> df.drop(index='cow', columns='small')
5564 big
5565 llama speed 45.0
5566 weight 200.0
5567 length 1.5
5568 falcon speed 320.0
5569 weight 1.0
5570 length 0.3
5571
5572 >>> df.drop(index='length', level=1)
5573 big small
5574 llama speed 45.0 30.0
5575 weight 200.0 100.0
5576 cow speed 30.0 20.0
5577 weight 250.0 150.0
5578 falcon speed 320.0 250.0
5579 weight 1.0 0.8
5580 """
5581 return super().drop(
5582 labels=labels,
5583 axis=axis,
5584 index=index,
5585 columns=columns,
5586 level=level,
5587 inplace=inplace,
5588 errors=errors,
5589 )
5590
5591 @overload
5592 def rename(
5593 self,
5594 mapper: Renamer | None = ...,
5595 *,
5596 index: Renamer | None = ...,
5597 columns: Renamer | None = ...,
5598 axis: Axis | None = ...,
5599 copy: bool | None = ...,
5600 inplace: Literal[True],
5601 level: Level = ...,
5602 errors: IgnoreRaise = ...,
5603 ) -> None:
5604 ...
5605
5606 @overload
5607 def rename(
5608 self,
5609 mapper: Renamer | None = ...,
5610 *,
5611 index: Renamer | None = ...,
5612 columns: Renamer | None = ...,
5613 axis: Axis | None = ...,
5614 copy: bool | None = ...,
5615 inplace: Literal[False] = ...,
5616 level: Level = ...,
5617 errors: IgnoreRaise = ...,
5618 ) -> DataFrame:
5619 ...
5620
5621 @overload
5622 def rename(
5623 self,
5624 mapper: Renamer | None = ...,
5625 *,
5626 index: Renamer | None = ...,
5627 columns: Renamer | None = ...,
5628 axis: Axis | None = ...,
5629 copy: bool | None = ...,
5630 inplace: bool = ...,
5631 level: Level = ...,
5632 errors: IgnoreRaise = ...,
5633 ) -> DataFrame | None:
5634 ...
5635
5636 def rename(
5637 self,
5638 mapper: Renamer | None = None,
5639 *,
5640 index: Renamer | None = None,
5641 columns: Renamer | None = None,
5642 axis: Axis | None = None,
5643 copy: bool | None = None,
5644 inplace: bool = False,
5645 level: Level | None = None,
5646 errors: IgnoreRaise = "ignore",
5647 ) -> DataFrame | None:
5648 """
5649 Rename columns or index labels.
5650
5651 Function / dict values must be unique (1-to-1). Labels not contained in
5652 a dict / Series will be left as-is. Extra labels listed don't throw an
5653 error.
5654
5655 See the :ref:`user guide <basics.rename>` for more.
5656
5657 Parameters
5658 ----------
5659 mapper : dict-like or function
5660 Dict-like or function transformations to apply to
5661 that axis' values. Use either ``mapper`` and ``axis`` to
5662 specify the axis to target with ``mapper``, or ``index`` and
5663 ``columns``.
5664 index : dict-like or function
5665 Alternative to specifying axis (``mapper, axis=0``
5666 is equivalent to ``index=mapper``).
5667 columns : dict-like or function
5668 Alternative to specifying axis (``mapper, axis=1``
5669 is equivalent to ``columns=mapper``).
5670 axis : {0 or 'index', 1 or 'columns'}, default 0
5671 Axis to target with ``mapper``. Can be either the axis name
5672 ('index', 'columns') or number (0, 1). The default is 'index'.
5673 copy : bool, default True
5674 Also copy underlying data.
5675
5676 .. note::
5677 The `copy` keyword will change behavior in pandas 3.0.
5678 `Copy-on-Write
5679 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
5680 will be enabled by default, which means that all methods with a
5681 `copy` keyword will use a lazy copy mechanism to defer the copy and
5682 ignore the `copy` keyword. The `copy` keyword will be removed in a
5683 future version of pandas.
5684
5685 You can already get the future behavior and improvements through
5686 enabling copy on write ``pd.options.mode.copy_on_write = True``
5687 inplace : bool, default False
5688 Whether to modify the DataFrame rather than creating a new one.
5689 If True then value of copy is ignored.
5690 level : int or level name, default None
5691 In case of a MultiIndex, only rename labels in the specified
5692 level.
5693 errors : {'ignore', 'raise'}, default 'ignore'
5694 If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,
5695 or `columns` contains labels that are not present in the Index
5696 being transformed.
5697 If 'ignore', existing keys will be renamed and extra keys will be
5698 ignored.
5699
5700 Returns
5701 -------
5702 DataFrame or None
5703 DataFrame with the renamed axis labels or None if ``inplace=True``.
5704
5705 Raises
5706 ------
5707 KeyError
5708 If any of the labels is not found in the selected axis and
5709 "errors='raise'".
5710
5711 See Also
5712 --------
5713 DataFrame.rename_axis : Set the name of the axis.
5714
5715 Examples
5716 --------
5717 ``DataFrame.rename`` supports two calling conventions
5718
5719 * ``(index=index_mapper, columns=columns_mapper, ...)``
5720 * ``(mapper, axis={'index', 'columns'}, ...)``
5721
5722 We *highly* recommend using keyword arguments to clarify your
5723 intent.
5724
5725 Rename columns using a mapping:
5726
5727 >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
5728 >>> df.rename(columns={"A": "a", "B": "c"})
5729 a c
5730 0 1 4
5731 1 2 5
5732 2 3 6
5733
5734 Rename index using a mapping:
5735
5736 >>> df.rename(index={0: "x", 1: "y", 2: "z"})
5737 A B
5738 x 1 4
5739 y 2 5
5740 z 3 6
5741
5742 Cast index labels to a different type:
5743
5744 >>> df.index
5745 RangeIndex(start=0, stop=3, step=1)
5746 >>> df.rename(index=str).index
5747 Index(['0', '1', '2'], dtype='object')
5748
5749 >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise")
5750 Traceback (most recent call last):
5751 KeyError: ['C'] not found in axis
5752
5753 Using axis-style parameters:
5754
5755 >>> df.rename(str.lower, axis='columns')
5756 a b
5757 0 1 4
5758 1 2 5
5759 2 3 6
5760
5761 >>> df.rename({1: 2, 2: 4}, axis='index')
5762 A B
5763 0 1 4
5764 2 2 5
5765 4 3 6
5766 """
5767 return super()._rename(
5768 mapper=mapper,
5769 index=index,
5770 columns=columns,
5771 axis=axis,
5772 copy=copy,
5773 inplace=inplace,
5774 level=level,
5775 errors=errors,
5776 )
5777
5778 def pop(self, item: Hashable) -> Series:
5779 """
5780 Return item and drop from frame. Raise KeyError if not found.
5781
5782 Parameters
5783 ----------
5784 item : label
5785 Label of column to be popped.
5786
5787 Returns
5788 -------
5789 Series
5790
5791 Examples
5792 --------
5793 >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
5794 ... ('parrot', 'bird', 24.0),
5795 ... ('lion', 'mammal', 80.5),
5796 ... ('monkey', 'mammal', np.nan)],
5797 ... columns=('name', 'class', 'max_speed'))
5798 >>> df
5799 name class max_speed
5800 0 falcon bird 389.0
5801 1 parrot bird 24.0
5802 2 lion mammal 80.5
5803 3 monkey mammal NaN
5804
5805 >>> df.pop('class')
5806 0 bird
5807 1 bird
5808 2 mammal
5809 3 mammal
5810 Name: class, dtype: object
5811
5812 >>> df
5813 name max_speed
5814 0 falcon 389.0
5815 1 parrot 24.0
5816 2 lion 80.5
5817 3 monkey NaN
5818 """
5819 return super().pop(item=item)
5820
5821 def _replace_columnwise(
5822 self, mapping: dict[Hashable, tuple[Any, Any]], inplace: bool, regex
5823 ):
5824 """
5825 Dispatch to Series.replace column-wise.
5826
5827 Parameters
5828 ----------
5829 mapping : dict
5830 of the form {col: (target, value)}
5831 inplace : bool
5832 regex : bool or same types as `to_replace` in DataFrame.replace
5833
5834 Returns
5835 -------
5836 DataFrame or None
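
        Examples
        --------
        A minimal sketch of the expected ``mapping`` format; the frame and
        values here are hypothetical and shown only to illustrate the call.

        >>> df = pd.DataFrame({"a": [0, 1], "b": ["x", "y"]})
        >>> df._replace_columnwise({"a": (0, 9)}, inplace=False, regex=False)
           a  b
        0  9  x
        1  1  y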
5837 """
5838 # Operate column-wise
5839 res = self if inplace else self.copy(deep=None)
5840 ax = self.columns
5841
5842 for i, ax_value in enumerate(ax):
5843 if ax_value in mapping:
5844 ser = self.iloc[:, i]
5845
5846 target, value = mapping[ax_value]
5847 newobj = ser.replace(target, value, regex=regex)
5848
5849 res._iset_item(i, newobj, inplace=inplace)
5850
5851 if inplace:
5852 return
5853 return res.__finalize__(self)
5854
5855 @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"])
5856 def shift(
5857 self,
5858 periods: int | Sequence[int] = 1,
5859 freq: Frequency | None = None,
5860 axis: Axis = 0,
5861 fill_value: Hashable = lib.no_default,
5862 suffix: str | None = None,
5863 ) -> DataFrame:
5864 if freq is not None and fill_value is not lib.no_default:
5865 # GH#53832
5866 warnings.warn(
5867 "Passing a 'freq' together with a 'fill_value' silently ignores "
5868 "the fill_value and is deprecated. This will raise in a future "
5869 "version.",
5870 FutureWarning,
5871 stacklevel=find_stack_level(),
5872 )
5873 fill_value = lib.no_default
5874
5875 if self.empty:
5876 return self.copy()
5877
5878 axis = self._get_axis_number(axis)
5879
5880 if is_list_like(periods):
5881 periods = cast(Sequence, periods)
5882 if axis == 1:
5883 raise ValueError(
5884 "If `periods` contains multiple shifts, `axis` cannot be 1."
5885 )
5886 if len(periods) == 0:
5887 raise ValueError("If `periods` is an iterable, it cannot be empty.")
5888 from pandas.core.reshape.concat import concat
5889
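            # With periods=[1, 2], column "a" becomes "a_1" and "a_2" in the
            # concatenated result (or "a<suffix>_1", "a<suffix>_2" when a
            # suffix is given).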
5890 shifted_dataframes = []
5891 for period in periods:
5892 if not is_integer(period):
5893 raise TypeError(
5894 f"Periods must be integer, but {period} is {type(period)}."
5895 )
5896 period = cast(int, period)
5897 shifted_dataframes.append(
5898 super()
5899 .shift(periods=period, freq=freq, axis=axis, fill_value=fill_value)
5900 .add_suffix(f"{suffix}_{period}" if suffix else f"_{period}")
5901 )
5902 return concat(shifted_dataframes, axis=1)
5903 elif suffix:
5904 raise ValueError("Cannot specify `suffix` if `periods` is an int.")
5905 periods = cast(int, periods)
5906
5907 ncols = len(self.columns)
5908 arrays = self._mgr.arrays
5909 if axis == 1 and periods != 0 and ncols > 0 and freq is None:
5910 if fill_value is lib.no_default:
5911 # We will infer fill_value to match the closest column
5912
5913 # Use a column that we know is valid for our column's dtype GH#38434
5914 label = self.columns[0]
5915
5916 if periods > 0:
5917 result = self.iloc[:, :-periods]
5918 for col in range(min(ncols, abs(periods))):
5919 # TODO(EA2D): doing this in a loop unnecessary with 2D EAs
5920 # Define filler inside loop so we get a copy
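                        # Shifting by the full length produces an all-NA column
                        # compatible with this column's dtype (e.g. NaT for
                        # datetime columns).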
5921 filler = self.iloc[:, 0].shift(len(self))
5922 result.insert(0, label, filler, allow_duplicates=True)
5923 else:
5924 result = self.iloc[:, -periods:]
5925 for col in range(min(ncols, abs(periods))):
5926 # Define filler inside loop so we get a copy
5927 filler = self.iloc[:, -1].shift(len(self))
5928 result.insert(
5929 len(result.columns), label, filler, allow_duplicates=True
5930 )
5931
5932 result.columns = self.columns.copy()
5933 return result
5934 elif len(arrays) > 1 or (
5935 # If we only have one block and we know that we can't
5936 # keep the same dtype (i.e. the _can_hold_element check)
5937 # then we can go through the reindex_indexer path
5938 # (and avoid casting logic in the Block method).
5939 not can_hold_element(arrays[0], fill_value)
5940 ):
5941 # GH#35488 we need to watch out for multi-block cases
5942 # We only get here with fill_value not-lib.no_default
5943 nper = abs(periods)
5944 nper = min(nper, ncols)
5945 if periods > 0:
5946 indexer = np.array(
5947 [-1] * nper + list(range(ncols - periods)), dtype=np.intp
5948 )
5949 else:
5950 indexer = np.array(
5951 list(range(nper, ncols)) + [-1] * nper, dtype=np.intp
5952 )
5953 mgr = self._mgr.reindex_indexer(
5954 self.columns,
5955 indexer,
5956 axis=0,
5957 fill_value=fill_value,
5958 allow_dups=True,
5959 )
5960 res_df = self._constructor_from_mgr(mgr, axes=mgr.axes)
5961 return res_df.__finalize__(self, method="shift")
5962 else:
5963 return self.T.shift(periods=periods, fill_value=fill_value).T
5964
5965 return super().shift(
5966 periods=periods, freq=freq, axis=axis, fill_value=fill_value
5967 )
5968
5969 @overload
5970 def set_index(
5971 self,
5972 keys,
5973 *,
5974 drop: bool = ...,
5975 append: bool = ...,
5976 inplace: Literal[False] = ...,
5977 verify_integrity: bool = ...,
5978 ) -> DataFrame:
5979 ...
5980
5981 @overload
5982 def set_index(
5983 self,
5984 keys,
5985 *,
5986 drop: bool = ...,
5987 append: bool = ...,
5988 inplace: Literal[True],
5989 verify_integrity: bool = ...,
5990 ) -> None:
5991 ...
5992
5993 def set_index(
5994 self,
5995 keys,
5996 *,
5997 drop: bool = True,
5998 append: bool = False,
5999 inplace: bool = False,
6000 verify_integrity: bool = False,
6001 ) -> DataFrame | None:
6002 """
6003 Set the DataFrame index using existing columns.
6004
6005 Set the DataFrame index (row labels) using one or more existing
6006 columns or arrays (of the correct length). The index can replace the
6007 existing index or expand on it.
6008
6009 Parameters
6010 ----------
6011 keys : label or array-like or list of labels/arrays
6012 This parameter can be either a single column key, a single array of
6013 the same length as the calling DataFrame, or a list containing an
6014 arbitrary combination of column keys and arrays. Here, "array"
6015 encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and
6016 instances of :class:`~collections.abc.Iterator`.
6017 drop : bool, default True
6018 Delete columns to be used as the new index.
6019 append : bool, default False
6020 Whether to append columns to existing index.
6021 inplace : bool, default False
6022 Whether to modify the DataFrame rather than creating a new one.
6023 verify_integrity : bool, default False
6024 Check the new index for duplicates. Otherwise defer the check until
6025 necessary. Setting to False will improve the performance of this
6026 method.
6027
6028 Returns
6029 -------
6030 DataFrame or None
6031 Changed row labels or None if ``inplace=True``.
6032
6033 See Also
6034 --------
6035 DataFrame.reset_index : Opposite of set_index.
6036 DataFrame.reindex : Change to new indices or expand indices.
6037 DataFrame.reindex_like : Change to same indices as other DataFrame.
6038
6039 Examples
6040 --------
6041 >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
6042 ... 'year': [2012, 2014, 2013, 2014],
6043 ... 'sale': [55, 40, 84, 31]})
6044 >>> df
6045 month year sale
6046 0 1 2012 55
6047 1 4 2014 40
6048 2 7 2013 84
6049 3 10 2014 31
6050
6051 Set the index to become the 'month' column:
6052
6053 >>> df.set_index('month')
6054 year sale
6055 month
6056 1 2012 55
6057 4 2014 40
6058 7 2013 84
6059 10 2014 31
6060
6061 Create a MultiIndex using columns 'year' and 'month':
6062
6063 >>> df.set_index(['year', 'month'])
6064 sale
6065 year month
6066 2012 1 55
6067 2014 4 40
6068 2013 7 84
6069 2014 10 31
6070
6071 Create a MultiIndex using an Index and a column:
6072
6073 >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
6074 month sale
6075 year
6076 1 2012 1 55
6077 2 2014 4 40
6078 3 2013 7 84
6079 4 2014 10 31
6080
6081 Create a MultiIndex using two Series:
6082
6083 >>> s = pd.Series([1, 2, 3, 4])
6084 >>> df.set_index([s, s**2])
6085 month year sale
6086 1 1 1 2012 55
6087 2 4 4 2014 40
6088 3 9 7 2013 84
6089 4 16 10 2014 31
6090 """
6091 inplace = validate_bool_kwarg(inplace, "inplace")
6092 self._check_inplace_and_allows_duplicate_labels(inplace)
6093 if not isinstance(keys, list):
6094 keys = [keys]
6095
6096 err_msg = (
6097 'The parameter "keys" may be a column key, one-dimensional '
6098 "array, or a list containing only valid column keys and "
6099 "one-dimensional arrays."
6100 )
6101
6102 missing: list[Hashable] = []
6103 for col in keys:
6104 if isinstance(col, (Index, Series, np.ndarray, list, abc.Iterator)):
6105 # arrays are fine as long as they are one-dimensional
6106 # iterators get converted to list below
6107 if getattr(col, "ndim", 1) != 1:
6108 raise ValueError(err_msg)
6109 else:
6110 # everything else gets tried as a key; see GH 24969
6111 try:
6112 found = col in self.columns
6113 except TypeError as err:
6114 raise TypeError(
6115 f"{err_msg}. Received column of type {type(col)}"
6116 ) from err
6117 else:
6118 if not found:
6119 missing.append(col)
6120
6121 if missing:
6122 raise KeyError(f"None of {missing} are in the columns")
6123
6124 if inplace:
6125 frame = self
6126 else:
6127 # GH 49473 Use "lazy copy" with Copy-on-Write
6128 frame = self.copy(deep=None)
6129
6130 arrays: list[Index] = []
6131 names: list[Hashable] = []
6132 if append:
6133 names = list(self.index.names)
6134 if isinstance(self.index, MultiIndex):
6135 arrays.extend(
6136 self.index._get_level_values(i) for i in range(self.index.nlevels)
6137 )
6138 else:
6139 arrays.append(self.index)
6140
6141 to_remove: list[Hashable] = []
6142 for col in keys:
6143 if isinstance(col, MultiIndex):
6144 arrays.extend(col._get_level_values(n) for n in range(col.nlevels))
6145 names.extend(col.names)
6146 elif isinstance(col, (Index, Series)):
6147 # if Index then not MultiIndex (treated above)
6148
6149 # error: Argument 1 to "append" of "list" has incompatible type
6150 # "Union[Index, Series]"; expected "Index"
6151 arrays.append(col) # type: ignore[arg-type]
6152 names.append(col.name)
6153 elif isinstance(col, (list, np.ndarray)):
6154 # error: Argument 1 to "append" of "list" has incompatible type
6155 # "Union[List[Any], ndarray]"; expected "Index"
6156 arrays.append(col) # type: ignore[arg-type]
6157 names.append(None)
6158 elif isinstance(col, abc.Iterator):
6159 # error: Argument 1 to "append" of "list" has incompatible type
6160 # "List[Any]"; expected "Index"
6161 arrays.append(list(col)) # type: ignore[arg-type]
6162 names.append(None)
6163 # from here, col can only be a column label
6164 else:
6165 arrays.append(frame[col])
6166 names.append(col)
6167 if drop:
6168 to_remove.append(col)
6169
6170 if len(arrays[-1]) != len(self):
6171 # check newest element against length of calling frame, since
6172 # ensure_index_from_sequences would not raise for append=False.
6173 raise ValueError(
6174 f"Length mismatch: Expected {len(self)} rows, "
6175 f"received array of length {len(arrays[-1])}"
6176 )
6177
6178 index = ensure_index_from_sequences(arrays, names)
6179
6180 if verify_integrity and not index.is_unique:
6181 duplicates = index[index.duplicated()].unique()
6182 raise ValueError(f"Index has duplicate keys: {duplicates}")
6183
6184 # use set to handle duplicate column names gracefully in case of drop
6185 for c in set(to_remove):
6186 del frame[c]
6187
6188 # clear up memory usage
6189 index._cleanup()
6190
6191 frame.index = index
6192
6193 if not inplace:
6194 return frame
6195 return None
6196
6197 @overload
6198 def reset_index(
6199 self,
6200 level: IndexLabel = ...,
6201 *,
6202 drop: bool = ...,
6203 inplace: Literal[False] = ...,
6204 col_level: Hashable = ...,
6205 col_fill: Hashable = ...,
6206 allow_duplicates: bool | lib.NoDefault = ...,
6207 names: Hashable | Sequence[Hashable] | None = None,
6208 ) -> DataFrame:
6209 ...
6210
6211 @overload
6212 def reset_index(
6213 self,
6214 level: IndexLabel = ...,
6215 *,
6216 drop: bool = ...,
6217 inplace: Literal[True],
6218 col_level: Hashable = ...,
6219 col_fill: Hashable = ...,
6220 allow_duplicates: bool | lib.NoDefault = ...,
6221 names: Hashable | Sequence[Hashable] | None = None,
6222 ) -> None:
6223 ...
6224
6225 @overload
6226 def reset_index(
6227 self,
6228 level: IndexLabel = ...,
6229 *,
6230 drop: bool = ...,
6231 inplace: bool = ...,
6232 col_level: Hashable = ...,
6233 col_fill: Hashable = ...,
6234 allow_duplicates: bool | lib.NoDefault = ...,
6235 names: Hashable | Sequence[Hashable] | None = None,
6236 ) -> DataFrame | None:
6237 ...
6238
6239 def reset_index(
6240 self,
6241 level: IndexLabel | None = None,
6242 *,
6243 drop: bool = False,
6244 inplace: bool = False,
6245 col_level: Hashable = 0,
6246 col_fill: Hashable = "",
6247 allow_duplicates: bool | lib.NoDefault = lib.no_default,
6248 names: Hashable | Sequence[Hashable] | None = None,
6249 ) -> DataFrame | None:
6250 """
6251 Reset the index, or a level of it.
6252
6253 Reset the index of the DataFrame, and use the default one instead.
6254 If the DataFrame has a MultiIndex, this method can remove one or more
6255 levels.
6256
6257 Parameters
6258 ----------
6259 level : int, str, tuple, or list, default None
6260 Only remove the given levels from the index. Removes all levels by
6261 default.
6262 drop : bool, default False
            Do not try to insert index into DataFrame columns. This resets
6264 the index to the default integer index.
6265 inplace : bool, default False
6266 Whether to modify the DataFrame rather than creating a new one.
6267 col_level : int or str, default 0
6268 If the columns have multiple levels, determines which level the
6269 labels are inserted into. By default it is inserted into the first
6270 level.
6271 col_fill : object, default ''
6272 If the columns have multiple levels, determines how the other
6273 levels are named. If None then the index name is repeated.
6274 allow_duplicates : bool, optional, default lib.no_default
6275 Allow duplicate column labels to be created.
6276
6277 .. versionadded:: 1.5.0
6278
6279 names : int, str or 1-dimensional list, default None
6280 Using the given string, rename the DataFrame column which contains the
6281 index data. If the DataFrame has a MultiIndex, this has to be a list or
6282 tuple with length equal to the number of levels.
6283
6284 .. versionadded:: 1.5.0
6285
6286 Returns
6287 -------
6288 DataFrame or None
6289 DataFrame with the new index or None if ``inplace=True``.
6290
6291 See Also
6292 --------
6293 DataFrame.set_index : Opposite of reset_index.
6294 DataFrame.reindex : Change to new indices or expand indices.
6295 DataFrame.reindex_like : Change to same indices as other DataFrame.
6296
6297 Examples
6298 --------
6299 >>> df = pd.DataFrame([('bird', 389.0),
6300 ... ('bird', 24.0),
6301 ... ('mammal', 80.5),
6302 ... ('mammal', np.nan)],
6303 ... index=['falcon', 'parrot', 'lion', 'monkey'],
6304 ... columns=('class', 'max_speed'))
6305 >>> df
6306 class max_speed
6307 falcon bird 389.0
6308 parrot bird 24.0
6309 lion mammal 80.5
6310 monkey mammal NaN
6311
6312 When we reset the index, the old index is added as a column, and a
6313 new sequential index is used:
6314
6315 >>> df.reset_index()
6316 index class max_speed
6317 0 falcon bird 389.0
6318 1 parrot bird 24.0
6319 2 lion mammal 80.5
6320 3 monkey mammal NaN
6321
6322 We can use the `drop` parameter to avoid the old index being added as
6323 a column:
6324
6325 >>> df.reset_index(drop=True)
6326 class max_speed
6327 0 bird 389.0
6328 1 bird 24.0
6329 2 mammal 80.5
6330 3 mammal NaN
6331
6332 You can also use `reset_index` with `MultiIndex`.
6333
6334 >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
6335 ... ('bird', 'parrot'),
6336 ... ('mammal', 'lion'),
6337 ... ('mammal', 'monkey')],
6338 ... names=['class', 'name'])
6339 >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),
6340 ... ('species', 'type')])
6341 >>> df = pd.DataFrame([(389.0, 'fly'),
6342 ... (24.0, 'fly'),
6343 ... (80.5, 'run'),
6344 ... (np.nan, 'jump')],
6345 ... index=index,
6346 ... columns=columns)
6347 >>> df
6348 speed species
6349 max type
6350 class name
6351 bird falcon 389.0 fly
6352 parrot 24.0 fly
6353 mammal lion 80.5 run
6354 monkey NaN jump
6355
6356 Using the `names` parameter, choose a name for the index column:
6357
6358 >>> df.reset_index(names=['classes', 'names'])
6359 classes names speed species
6360 max type
6361 0 bird falcon 389.0 fly
6362 1 bird parrot 24.0 fly
6363 2 mammal lion 80.5 run
6364 3 mammal monkey NaN jump
6365
6366 If the index has multiple levels, we can reset a subset of them:
6367
6368 >>> df.reset_index(level='class')
6369 class speed species
6370 max type
6371 name
6372 falcon bird 389.0 fly
6373 parrot bird 24.0 fly
6374 lion mammal 80.5 run
6375 monkey mammal NaN jump
6376
6377 If we are not dropping the index, by default, it is placed in the top
6378 level. We can place it in another level:
6379
6380 >>> df.reset_index(level='class', col_level=1)
6381 speed species
6382 class max type
6383 name
6384 falcon bird 389.0 fly
6385 parrot bird 24.0 fly
6386 lion mammal 80.5 run
6387 monkey mammal NaN jump
6388
6389 When the index is inserted under another level, we can specify under
6390 which one with the parameter `col_fill`:
6391
6392 >>> df.reset_index(level='class', col_level=1, col_fill='species')
6393 species speed species
6394 class max type
6395 name
6396 falcon bird 389.0 fly
6397 parrot bird 24.0 fly
6398 lion mammal 80.5 run
6399 monkey mammal NaN jump
6400
6401 If we specify a nonexistent level for `col_fill`, it is created:
6402
6403 >>> df.reset_index(level='class', col_level=1, col_fill='genus')
6404 genus speed species
6405 class max type
6406 name
6407 falcon bird 389.0 fly
6408 parrot bird 24.0 fly
6409 lion mammal 80.5 run
6410 monkey mammal NaN jump
6411 """
6412 inplace = validate_bool_kwarg(inplace, "inplace")
6413 self._check_inplace_and_allows_duplicate_labels(inplace)
6414 if inplace:
6415 new_obj = self
6416 else:
6417 new_obj = self.copy(deep=None)
6418 if allow_duplicates is not lib.no_default:
6419 allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates")
6420
6421 new_index = default_index(len(new_obj))
6422 if level is not None:
6423 if not isinstance(level, (tuple, list)):
6424 level = [level]
6425 level = [self.index._get_level_number(lev) for lev in level]
6426 if len(level) < self.index.nlevels:
6427 new_index = self.index.droplevel(level)
6428
6429 if not drop:
6430 to_insert: Iterable[tuple[Any, Any | None]]
6431
6432 default = "index" if "index" not in self else "level_0"
6433 names = self.index._get_default_index_names(names, default)
6434
6435 if isinstance(self.index, MultiIndex):
6436 to_insert = zip(self.index.levels, self.index.codes)
6437 else:
6438 to_insert = ((self.index, None),)
6439
6440 multi_col = isinstance(self.columns, MultiIndex)
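            # Each new column is inserted at position 0, so iterate the levels
            # in reverse to keep the final column order aligned with the index
            # level order (level 0 ends up leftmost).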
6441 for i, (lev, lab) in reversed(list(enumerate(to_insert))):
6442 if level is not None and i not in level:
6443 continue
6444 name = names[i]
6445 if multi_col:
6446 col_name = list(name) if isinstance(name, tuple) else [name]
6447 if col_fill is None:
6448 if len(col_name) not in (1, self.columns.nlevels):
6449 raise ValueError(
6450 "col_fill=None is incompatible "
6451 f"with incomplete column name {name}"
6452 )
6453 col_fill = col_name[0]
6454
6455 lev_num = self.columns._get_level_number(col_level)
6456 name_lst = [col_fill] * lev_num + col_name
6457 missing = self.columns.nlevels - len(name_lst)
6458 name_lst += [col_fill] * missing
6459 name = tuple(name_lst)
6460
6461 # to ndarray and maybe infer different dtype
6462 level_values = lev._values
6463 if level_values.dtype == np.object_:
6464 level_values = lib.maybe_convert_objects(level_values)
6465
6466 if lab is not None:
6467 # if we have the codes, extract the values with a mask
6468 level_values = algorithms.take(
6469 level_values, lab, allow_fill=True, fill_value=lev._na_value
6470 )
6471
6472 new_obj.insert(
6473 0,
6474 name,
6475 level_values,
6476 allow_duplicates=allow_duplicates,
6477 )
6478
6479 new_obj.index = new_index
6480 if not inplace:
6481 return new_obj
6482
6483 return None
6484
6485 # ----------------------------------------------------------------------
6486 # Reindex-based selection methods
6487
6488 @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
6489 def isna(self) -> DataFrame:
6490 res_mgr = self._mgr.isna(func=isna)
6491 result = self._constructor_from_mgr(res_mgr, axes=res_mgr.axes)
6492 return result.__finalize__(self, method="isna")
6493
6494 @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
6495 def isnull(self) -> DataFrame:
6496 """
6497 DataFrame.isnull is an alias for DataFrame.isna.
6498 """
6499 return self.isna()
6500
6501 @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
6502 def notna(self) -> DataFrame:
6503 return ~self.isna()
6504
6505 @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
6506 def notnull(self) -> DataFrame:
6507 """
6508 DataFrame.notnull is an alias for DataFrame.notna.
6509 """
6510 return ~self.isna()
6511
6512 @overload
6513 def dropna(
6514 self,
6515 *,
6516 axis: Axis = ...,
6517 how: AnyAll | lib.NoDefault = ...,
6518 thresh: int | lib.NoDefault = ...,
6519 subset: IndexLabel = ...,
6520 inplace: Literal[False] = ...,
6521 ignore_index: bool = ...,
6522 ) -> DataFrame:
6523 ...
6524
6525 @overload
6526 def dropna(
6527 self,
6528 *,
6529 axis: Axis = ...,
6530 how: AnyAll | lib.NoDefault = ...,
6531 thresh: int | lib.NoDefault = ...,
6532 subset: IndexLabel = ...,
6533 inplace: Literal[True],
6534 ignore_index: bool = ...,
6535 ) -> None:
6536 ...
6537
6538 def dropna(
6539 self,
6540 *,
6541 axis: Axis = 0,
6542 how: AnyAll | lib.NoDefault = lib.no_default,
6543 thresh: int | lib.NoDefault = lib.no_default,
6544 subset: IndexLabel | None = None,
6545 inplace: bool = False,
6546 ignore_index: bool = False,
6547 ) -> DataFrame | None:
6548 """
6549 Remove missing values.
6550
6551 See the :ref:`User Guide <missing_data>` for more on which values are
6552 considered missing, and how to work with missing data.
6553
6554 Parameters
6555 ----------
6556 axis : {0 or 'index', 1 or 'columns'}, default 0
6557 Determine if rows or columns which contain missing values are
6558 removed.
6559
6560 * 0, or 'index' : Drop rows which contain missing values.
            * 1, or 'columns' : Drop columns which contain missing values.
6562
6563 Only a single axis is allowed.
6564
6565 how : {'any', 'all'}, default 'any'
6566 Determine if row or column is removed from DataFrame, when we have
6567 at least one NA or all NA.
6568
6569 * 'any' : If any NA values are present, drop that row or column.
6570 * 'all' : If all values are NA, drop that row or column.
6571
6572 thresh : int, optional
6573 Require that many non-NA values. Cannot be combined with how.
6574 subset : column label or sequence of labels, optional
6575 Labels along other axis to consider, e.g. if you are dropping rows
6576 these would be a list of columns to include.
6577 inplace : bool, default False
6578 Whether to modify the DataFrame rather than creating a new one.
6579 ignore_index : bool, default ``False``
6580 If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
6581
6582 .. versionadded:: 2.0.0
6583
6584 Returns
6585 -------
6586 DataFrame or None
6587 DataFrame with NA entries dropped from it or None if ``inplace=True``.
6588
6589 See Also
6590 --------
6591 DataFrame.isna: Indicate missing values.
6592 DataFrame.notna : Indicate existing (non-missing) values.
6593 DataFrame.fillna : Replace missing values.
6594 Series.dropna : Drop missing values.
6595 Index.dropna : Drop missing indices.
6596
6597 Examples
6598 --------
6599 >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
6600 ... "toy": [np.nan, 'Batmobile', 'Bullwhip'],
6601 ... "born": [pd.NaT, pd.Timestamp("1940-04-25"),
6602 ... pd.NaT]})
6603 >>> df
6604 name toy born
6605 0 Alfred NaN NaT
6606 1 Batman Batmobile 1940-04-25
6607 2 Catwoman Bullwhip NaT
6608
6609 Drop the rows where at least one element is missing.
6610
6611 >>> df.dropna()
6612 name toy born
6613 1 Batman Batmobile 1940-04-25
6614
6615 Drop the columns where at least one element is missing.
6616
6617 >>> df.dropna(axis='columns')
6618 name
6619 0 Alfred
6620 1 Batman
6621 2 Catwoman
6622
6623 Drop the rows where all elements are missing.
6624
6625 >>> df.dropna(how='all')
6626 name toy born
6627 0 Alfred NaN NaT
6628 1 Batman Batmobile 1940-04-25
6629 2 Catwoman Bullwhip NaT
6630
6631 Keep only the rows with at least 2 non-NA values.
6632
6633 >>> df.dropna(thresh=2)
6634 name toy born
6635 1 Batman Batmobile 1940-04-25
6636 2 Catwoman Bullwhip NaT
6637
6638 Define in which columns to look for missing values.
6639
6640 >>> df.dropna(subset=['name', 'toy'])
6641 name toy born
6642 1 Batman Batmobile 1940-04-25
6643 2 Catwoman Bullwhip NaT
6644 """
6645 if (how is not lib.no_default) and (thresh is not lib.no_default):
6646 raise TypeError(
6647 "You cannot set both the how and thresh arguments at the same time."
6648 )
6649
6650 if how is lib.no_default:
6651 how = "any"
6652
6653 inplace = validate_bool_kwarg(inplace, "inplace")
6654 if isinstance(axis, (tuple, list)):
6655 # GH20987
6656 raise TypeError("supplying multiple axes to axis is no longer supported.")
6657
6658 axis = self._get_axis_number(axis)
6659 agg_axis = 1 - axis
6660
6661 agg_obj = self
6662 if subset is not None:
6663 # subset needs to be list
6664 if not is_list_like(subset):
6665 subset = [subset]
6666 ax = self._get_axis(agg_axis)
6667 indices = ax.get_indexer_for(subset)
6668 check = indices == -1
6669 if check.any():
6670 raise KeyError(np.array(subset)[check].tolist())
6671 agg_obj = self.take(indices, axis=agg_axis)
6672
6673 if thresh is not lib.no_default:
6674 count = agg_obj.count(axis=agg_axis)
6675 mask = count >= thresh
6676 elif how == "any":
6677 # faster equivalent to 'agg_obj.count(agg_axis) == self.shape[agg_axis]'
6678 mask = notna(agg_obj).all(axis=agg_axis, bool_only=False)
6679 elif how == "all":
6680 # faster equivalent to 'agg_obj.count(agg_axis) > 0'
6681 mask = notna(agg_obj).any(axis=agg_axis, bool_only=False)
6682 else:
6683 raise ValueError(f"invalid how option: {how}")
6684
6685 if np.all(mask):
6686 result = self.copy(deep=None)
6687 else:
6688 result = self.loc(axis=axis)[mask]
6689
6690 if ignore_index:
6691 result.index = default_index(len(result))
6692
6693 if not inplace:
6694 return result
6695 self._update_inplace(result)
6696 return None
6697
6698 @overload
6699 def drop_duplicates(
6700 self,
6701 subset: Hashable | Sequence[Hashable] | None = ...,
6702 *,
6703 keep: DropKeep = ...,
6704 inplace: Literal[True],
6705 ignore_index: bool = ...,
6706 ) -> None:
6707 ...
6708
6709 @overload
6710 def drop_duplicates(
6711 self,
6712 subset: Hashable | Sequence[Hashable] | None = ...,
6713 *,
6714 keep: DropKeep = ...,
6715 inplace: Literal[False] = ...,
6716 ignore_index: bool = ...,
6717 ) -> DataFrame:
6718 ...
6719
6720 @overload
6721 def drop_duplicates(
6722 self,
6723 subset: Hashable | Sequence[Hashable] | None = ...,
6724 *,
6725 keep: DropKeep = ...,
6726 inplace: bool = ...,
6727 ignore_index: bool = ...,
6728 ) -> DataFrame | None:
6729 ...
6730
6731 def drop_duplicates(
6732 self,
6733 subset: Hashable | Sequence[Hashable] | None = None,
6734 *,
6735 keep: DropKeep = "first",
6736 inplace: bool = False,
6737 ignore_index: bool = False,
6738 ) -> DataFrame | None:
6739 """
6740 Return DataFrame with duplicate rows removed.
6741
        Considering certain columns is optional. Indexes, including time
        indexes, are ignored.
6744
6745 Parameters
6746 ----------
6747 subset : column label or sequence of labels, optional
6748 Only consider certain columns for identifying duplicates, by
6749 default use all of the columns.
6750 keep : {'first', 'last', ``False``}, default 'first'
6751 Determines which duplicates (if any) to keep.
6752
6753 - 'first' : Drop duplicates except for the first occurrence.
6754 - 'last' : Drop duplicates except for the last occurrence.
6755 - ``False`` : Drop all duplicates.
6756
6757 inplace : bool, default ``False``
6758 Whether to modify the DataFrame rather than creating a new one.
6759 ignore_index : bool, default ``False``
6760 If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
6761
6762 Returns
6763 -------
6764 DataFrame or None
6765 DataFrame with duplicates removed or None if ``inplace=True``.
6766
6767 See Also
6768 --------
6769 DataFrame.value_counts: Count unique combinations of columns.
6770
6771 Examples
6772 --------
6773 Consider dataset containing ramen rating.
6774
6775 >>> df = pd.DataFrame({
6776 ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
6777 ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
6778 ... 'rating': [4, 4, 3.5, 15, 5]
6779 ... })
6780 >>> df
6781 brand style rating
6782 0 Yum Yum cup 4.0
6783 1 Yum Yum cup 4.0
6784 2 Indomie cup 3.5
6785 3 Indomie pack 15.0
6786 4 Indomie pack 5.0
6787
6788 By default, it removes duplicate rows based on all columns.
6789
6790 >>> df.drop_duplicates()
6791 brand style rating
6792 0 Yum Yum cup 4.0
6793 2 Indomie cup 3.5
6794 3 Indomie pack 15.0
6795 4 Indomie pack 5.0
6796
6797 To remove duplicates on specific column(s), use ``subset``.
6798
6799 >>> df.drop_duplicates(subset=['brand'])
6800 brand style rating
6801 0 Yum Yum cup 4.0
6802 2 Indomie cup 3.5
6803
6804 To remove duplicates and keep last occurrences, use ``keep``.
6805
6806 >>> df.drop_duplicates(subset=['brand', 'style'], keep='last')
6807 brand style rating
6808 1 Yum Yum cup 4.0
6809 2 Indomie cup 3.5
6810 4 Indomie pack 5.0
6811 """
6812 if self.empty:
6813 return self.copy(deep=None)
6814
6815 inplace = validate_bool_kwarg(inplace, "inplace")
6816 ignore_index = validate_bool_kwarg(ignore_index, "ignore_index")
6817
        result = self[~self.duplicated(subset, keep=keep)]
6819 if ignore_index:
6820 result.index = default_index(len(result))
6821
6822 if inplace:
6823 self._update_inplace(result)
6824 return None
6825 else:
6826 return result
6827
6828 def duplicated(
6829 self,
6830 subset: Hashable | Sequence[Hashable] | None = None,
6831 keep: DropKeep = "first",
6832 ) -> Series:
6833 """
6834 Return boolean Series denoting duplicate rows.
6835
6836 Considering certain columns is optional.
6837
6838 Parameters
6839 ----------
6840 subset : column label or sequence of labels, optional
6841 Only consider certain columns for identifying duplicates, by
6842 default use all of the columns.
6843 keep : {'first', 'last', False}, default 'first'
6844 Determines which duplicates (if any) to mark.
6845
6846 - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
6847 - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
6848 - False : Mark all duplicates as ``True``.
6849
6850 Returns
6851 -------
6852 Series
            Boolean series indicating whether each row is a duplicate.
6854
6855 See Also
6856 --------
6857 Index.duplicated : Equivalent method on index.
6858 Series.duplicated : Equivalent method on Series.
6859 Series.drop_duplicates : Remove duplicate values from Series.
6860 DataFrame.drop_duplicates : Remove duplicate values from DataFrame.
6861
6862 Examples
6863 --------
6864 Consider dataset containing ramen rating.
6865
6866 >>> df = pd.DataFrame({
6867 ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
6868 ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
6869 ... 'rating': [4, 4, 3.5, 15, 5]
6870 ... })
6871 >>> df
6872 brand style rating
6873 0 Yum Yum cup 4.0
6874 1 Yum Yum cup 4.0
6875 2 Indomie cup 3.5
6876 3 Indomie pack 15.0
6877 4 Indomie pack 5.0
6878
6879 By default, for each set of duplicated values, the first occurrence
        is set to False and all others to True.
6881
6882 >>> df.duplicated()
6883 0 False
6884 1 True
6885 2 False
6886 3 False
6887 4 False
6888 dtype: bool
6889
6890 By using 'last', the last occurrence of each set of duplicated values
        is set to False and all others to True.
6892
6893 >>> df.duplicated(keep='last')
6894 0 True
6895 1 False
6896 2 False
6897 3 False
6898 4 False
6899 dtype: bool
6900
        By setting ``keep`` to False, all duplicates are marked True.
6902
6903 >>> df.duplicated(keep=False)
6904 0 True
6905 1 True
6906 2 False
6907 3 False
6908 4 False
6909 dtype: bool
6910
6911 To find duplicates on specific column(s), use ``subset``.
6912
6913 >>> df.duplicated(subset=['brand'])
6914 0 False
6915 1 True
6916 2 False
6917 3 True
6918 4 True
6919 dtype: bool
6920 """
6921
6922 if self.empty:
6923 return self._constructor_sliced(dtype=bool)
6924
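        # In the general case, factorize each relevant column to integer
        # codes, then combine the per-column codes into a single group id per
        # row; duplicate rows are exactly those with a duplicated group id.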
6925 def f(vals) -> tuple[np.ndarray, int]:
6926 labels, shape = algorithms.factorize(vals, size_hint=len(self))
6927 return labels.astype("i8", copy=False), len(shape)
6928
6929 if subset is None:
6930 # https://github.com/pandas-dev/pandas/issues/28770
6931 # Incompatible types in assignment (expression has type "Index", variable
6932 # has type "Sequence[Any]")
6933 subset = self.columns # type: ignore[assignment]
6934 elif (
6935 not np.iterable(subset)
6936 or isinstance(subset, str)
6937 or isinstance(subset, tuple)
6938 and subset in self.columns
6939 ):
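            # a non-iterable, a string, or a tuple that is itself a column
            # label is treated as a single column key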
6940 subset = (subset,)
6941
6942 # needed for mypy since can't narrow types using np.iterable
6943 subset = cast(Sequence, subset)
6944
6945 # Verify all columns in subset exist in the queried dataframe
6946 # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
6947 # key that doesn't exist.
6948 diff = set(subset) - set(self.columns)
6949 if diff:
6950 raise KeyError(Index(diff))
6951
6952 if len(subset) == 1 and self.columns.is_unique:
6953 # GH#45236 This is faster than get_group_index below
6954 result = self[subset[0]].duplicated(keep)
6955 result.name = None
6956 else:
6957 vals = (col.values for name, col in self.items() if name in subset)
6958 labels, shape = map(list, zip(*map(f, vals)))
6959
6960 ids = get_group_index(labels, tuple(shape), sort=False, xnull=False)
6961 result = self._constructor_sliced(duplicated(ids, keep), index=self.index)
6962 return result.__finalize__(self, method="duplicated")
6963
6964 # ----------------------------------------------------------------------
6965 # Sorting
6966 # error: Signature of "sort_values" incompatible with supertype "NDFrame"
6967 @overload # type: ignore[override]
6968 def sort_values(
6969 self,
6970 by: IndexLabel,
6971 *,
6972 axis: Axis = ...,
6973 ascending=...,
6974 inplace: Literal[False] = ...,
6975 kind: SortKind = ...,
6976 na_position: NaPosition = ...,
6977 ignore_index: bool = ...,
6978 key: ValueKeyFunc = ...,
6979 ) -> DataFrame:
6980 ...
6981
6982 @overload
6983 def sort_values(
6984 self,
6985 by: IndexLabel,
6986 *,
6987 axis: Axis = ...,
6988 ascending=...,
6989 inplace: Literal[True],
6990 kind: SortKind = ...,
6991 na_position: str = ...,
6992 ignore_index: bool = ...,
6993 key: ValueKeyFunc = ...,
6994 ) -> None:
6995 ...
6996
6997 def sort_values(
6998 self,
6999 by: IndexLabel,
7000 *,
7001 axis: Axis = 0,
7002 ascending: bool | list[bool] | tuple[bool, ...] = True,
7003 inplace: bool = False,
7004 kind: SortKind = "quicksort",
7005 na_position: str = "last",
7006 ignore_index: bool = False,
7007 key: ValueKeyFunc | None = None,
7008 ) -> DataFrame | None:
7009 """
7010 Sort by the values along either axis.
7011
7012 Parameters
7013 ----------
7014 by : str or list of str
7015 Name or list of names to sort by.
7016
7017 - if `axis` is 0 or `'index'` then `by` may contain index
7018 levels and/or column labels.
7019 - if `axis` is 1 or `'columns'` then `by` may contain column
7020 levels and/or index labels.
        axis : {0 or 'index', 1 or 'columns'}, default 0
7022 Axis to be sorted.
7023 ascending : bool or list of bool, default True
7024 Sort ascending vs. descending. Specify list for multiple sort
7025 orders. If this is a list of bools, must match the length of
            ``by``.
7027 inplace : bool, default False
7028 If True, perform operation in-place.
7029 kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
7030 Choice of sorting algorithm. See also :func:`numpy.sort` for more
7031 information. `mergesort` and `stable` are the only stable algorithms. For
7032 DataFrames, this option is only applied when sorting on a single
7033 column or label.
7034 na_position : {'first', 'last'}, default 'last'
7035 Puts NaNs at the beginning if `first`; `last` puts NaNs at the
7036 end.
7037 ignore_index : bool, default False
7038 If True, the resulting axis will be labeled 0, 1, …, n - 1.
7039 key : callable, optional
7040 Apply the key function to the values
7041 before sorting. This is similar to the `key` argument in the
7042 builtin :meth:`sorted` function, with the notable difference that
7043 this `key` function should be *vectorized*. It should expect a
7044 ``Series`` and return a Series with the same shape as the input.
7045 It will be applied to each column in `by` independently.
7046
7047 Returns
7048 -------
7049 DataFrame or None
7050 DataFrame with sorted values or None if ``inplace=True``.
7051
7052 See Also
7053 --------
7054 DataFrame.sort_index : Sort a DataFrame by the index.
7055 Series.sort_values : Similar method for a Series.
7056
7057 Examples
7058 --------
7059 >>> df = pd.DataFrame({
7060 ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'],
7061 ... 'col2': [2, 1, 9, 8, 7, 4],
7062 ... 'col3': [0, 1, 9, 4, 2, 3],
7063 ... 'col4': ['a', 'B', 'c', 'D', 'e', 'F']
7064 ... })
7065 >>> df
7066 col1 col2 col3 col4
7067 0 A 2 0 a
7068 1 A 1 1 B
7069 2 B 9 9 c
7070 3 NaN 8 4 D
7071 4 D 7 2 e
7072 5 C 4 3 F
7073
7074 Sort by col1
7075
7076 >>> df.sort_values(by=['col1'])
7077 col1 col2 col3 col4
7078 0 A 2 0 a
7079 1 A 1 1 B
7080 2 B 9 9 c
7081 5 C 4 3 F
7082 4 D 7 2 e
7083 3 NaN 8 4 D
7084
7085 Sort by multiple columns
7086
7087 >>> df.sort_values(by=['col1', 'col2'])
7088 col1 col2 col3 col4
7089 1 A 1 1 B
7090 0 A 2 0 a
7091 2 B 9 9 c
7092 5 C 4 3 F
7093 4 D 7 2 e
7094 3 NaN 8 4 D
7095
7096 Sort Descending
7097
7098 >>> df.sort_values(by='col1', ascending=False)
7099 col1 col2 col3 col4
7100 4 D 7 2 e
7101 5 C 4 3 F
7102 2 B 9 9 c
7103 0 A 2 0 a
7104 1 A 1 1 B
7105 3 NaN 8 4 D
7106
7107 Putting NAs first
7108
7109 >>> df.sort_values(by='col1', ascending=False, na_position='first')
7110 col1 col2 col3 col4
7111 3 NaN 8 4 D
7112 4 D 7 2 e
7113 5 C 4 3 F
7114 2 B 9 9 c
7115 0 A 2 0 a
7116 1 A 1 1 B
7117
7118 Sorting with a key function
7119
7120 >>> df.sort_values(by='col4', key=lambda col: col.str.lower())
7121 col1 col2 col3 col4
7122 0 A 2 0 a
7123 1 A 1 1 B
7124 2 B 9 9 c
7125 3 NaN 8 4 D
7126 4 D 7 2 e
7127 5 C 4 3 F
7128
7129 Natural sort with the key argument,
        using the `natsort <https://github.com/SethMMorton/natsort>`__ package.
7131
7132 >>> df = pd.DataFrame({
7133 ... "time": ['0hr', '128hr', '72hr', '48hr', '96hr'],
7134 ... "value": [10, 20, 30, 40, 50]
7135 ... })
7136 >>> df
7137 time value
7138 0 0hr 10
7139 1 128hr 20
7140 2 72hr 30
7141 3 48hr 40
7142 4 96hr 50
7143 >>> from natsort import index_natsorted
7144 >>> df.sort_values(
7145 ... by="time",
7146 ... key=lambda x: np.argsort(index_natsorted(df["time"]))
7147 ... )
7148 time value
7149 0 0hr 10
7150 3 48hr 40
7151 2 72hr 30
7152 4 96hr 50
7153 1 128hr 20
7154 """
7155 inplace = validate_bool_kwarg(inplace, "inplace")
7156 axis = self._get_axis_number(axis)
7157 ascending = validate_ascending(ascending)
7158 if not isinstance(by, list):
7159 by = [by]
7160 # error: Argument 1 to "len" has incompatible type "Union[bool, List[bool]]";
7161 # expected "Sized"
7162 if is_sequence(ascending) and (
7163 len(by) != len(ascending) # type: ignore[arg-type]
7164 ):
7165 # error: Argument 1 to "len" has incompatible type "Union[bool,
7166 # List[bool]]"; expected "Sized"
7167 raise ValueError(
7168 f"Length of ascending ({len(ascending)})" # type: ignore[arg-type]
7169 f" != length of by ({len(by)})"
7170 )
7171 if len(by) > 1:
7172 keys = [self._get_label_or_level_values(x, axis=axis) for x in by]
7173
7174 # need to rewrap columns in Series to apply key function
7175 if key is not None:
7176 # error: List comprehension has incompatible type List[Series];
7177 # expected List[ndarray]
7178 keys = [
7179 Series(k, name=name) # type: ignore[misc]
7180 for (k, name) in zip(keys, by)
7181 ]
7182
7183 indexer = lexsort_indexer(
7184 keys, orders=ascending, na_position=na_position, key=key
7185 )
7186 elif len(by):
7187 # len(by) == 1
7188
7189 k = self._get_label_or_level_values(by[0], axis=axis)
7190
7191 # need to rewrap column in Series to apply key function
7192 if key is not None:
7193 # error: Incompatible types in assignment (expression has type
7194 # "Series", variable has type "ndarray")
7195 k = Series(k, name=by[0]) # type: ignore[assignment]
7196
7197 if isinstance(ascending, (tuple, list)):
7198 ascending = ascending[0]
7199
7200 indexer = nargsort(
7201 k, kind=kind, ascending=ascending, na_position=na_position, key=key
7202 )
7203 else:
7204 if inplace:
7205 return self._update_inplace(self)
7206 else:
7207 return self.copy(deep=None)
7208
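        # If the indexer is already 0..n-1 the requested sort is a no-op;
        # skip the take and return a copy (lazy under Copy-on-Write).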
7209 if is_range_indexer(indexer, len(indexer)):
7210 result = self.copy(deep=(not inplace and not using_copy_on_write()))
7211 if ignore_index:
7212 result.index = default_index(len(result))
7213
7214 if inplace:
7215 return self._update_inplace(result)
7216 else:
7217 return result
7218
7219 new_data = self._mgr.take(
7220 indexer, axis=self._get_block_manager_axis(axis), verify=False
7221 )
7222
7223 if ignore_index:
7224 new_data.set_axis(
7225 self._get_block_manager_axis(axis), default_index(len(indexer))
7226 )
7227
7228 result = self._constructor_from_mgr(new_data, axes=new_data.axes)
7229 if inplace:
7230 return self._update_inplace(result)
7231 else:
7232 return result.__finalize__(self, method="sort_values")
7233
7234 @overload
7235 def sort_index(
7236 self,
7237 *,
7238 axis: Axis = ...,
7239 level: IndexLabel = ...,
7240 ascending: bool | Sequence[bool] = ...,
7241 inplace: Literal[True],
7242 kind: SortKind = ...,
7243 na_position: NaPosition = ...,
7244 sort_remaining: bool = ...,
7245 ignore_index: bool = ...,
7246 key: IndexKeyFunc = ...,
7247 ) -> None:
7248 ...
7249
7250 @overload
7251 def sort_index(
7252 self,
7253 *,
7254 axis: Axis = ...,
7255 level: IndexLabel = ...,
7256 ascending: bool | Sequence[bool] = ...,
7257 inplace: Literal[False] = ...,
7258 kind: SortKind = ...,
7259 na_position: NaPosition = ...,
7260 sort_remaining: bool = ...,
7261 ignore_index: bool = ...,
7262 key: IndexKeyFunc = ...,
7263 ) -> DataFrame:
7264 ...
7265
7266 @overload
7267 def sort_index(
7268 self,
7269 *,
7270 axis: Axis = ...,
7271 level: IndexLabel = ...,
7272 ascending: bool | Sequence[bool] = ...,
7273 inplace: bool = ...,
7274 kind: SortKind = ...,
7275 na_position: NaPosition = ...,
7276 sort_remaining: bool = ...,
7277 ignore_index: bool = ...,
7278 key: IndexKeyFunc = ...,
7279 ) -> DataFrame | None:
7280 ...
7281
7282 def sort_index(
7283 self,
7284 *,
7285 axis: Axis = 0,
7286 level: IndexLabel | None = None,
7287 ascending: bool | Sequence[bool] = True,
7288 inplace: bool = False,
7289 kind: SortKind = "quicksort",
7290 na_position: NaPosition = "last",
7291 sort_remaining: bool = True,
7292 ignore_index: bool = False,
7293 key: IndexKeyFunc | None = None,
7294 ) -> DataFrame | None:
7295 """
7296 Sort object by labels (along an axis).
7297
7298 Returns a new DataFrame sorted by label if `inplace` argument is
7299 ``False``, otherwise updates the original DataFrame and returns None.
7300
7301 Parameters
7302 ----------
7303 axis : {0 or 'index', 1 or 'columns'}, default 0
7304 The axis along which to sort. The value 0 identifies the rows,
7305 and 1 identifies the columns.
7306 level : int or level name or list of ints or list of level names
7307 If not None, sort on values in specified index level(s).
7308 ascending : bool or list-like of bools, default True
7309 Sort ascending vs. descending. When the index is a MultiIndex the
7310 sort direction can be controlled for each level individually.
7311 inplace : bool, default False
7312 Whether to modify the DataFrame rather than creating a new one.
7313 kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
7314 Choice of sorting algorithm. See also :func:`numpy.sort` for more
7315 information. `mergesort` and `stable` are the only stable algorithms. For
7316 DataFrames, this option is only applied when sorting on a single
7317 column or label.
7318 na_position : {'first', 'last'}, default 'last'
7319 Puts NaNs at the beginning if `first`; `last` puts NaNs at the end.
7320 Not implemented for MultiIndex.
7321 sort_remaining : bool, default True
7322 If True and sorting by level and index is multilevel, sort by other
7323 levels too (in order) after sorting by specified level.
7324 ignore_index : bool, default False
7325 If True, the resulting axis will be labeled 0, 1, …, n - 1.
7326 key : callable, optional
7327 If not None, apply the key function to the index values
7328 before sorting. This is similar to the `key` argument in the
7329 builtin :meth:`sorted` function, with the notable difference that
7330 this `key` function should be *vectorized*. It should expect an
7331 ``Index`` and return an ``Index`` of the same shape. For MultiIndex
7332 inputs, the key is applied *per level*.
7333
7334 Returns
7335 -------
7336 DataFrame or None
7337 The original DataFrame sorted by the labels or None if ``inplace=True``.
7338
7339 See Also
7340 --------
7341 Series.sort_index : Sort Series by the index.
7342 DataFrame.sort_values : Sort DataFrame by the value.
7343 Series.sort_values : Sort Series by the value.
7344
7345 Examples
7346 --------
7347 >>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150],
7348 ... columns=['A'])
7349 >>> df.sort_index()
7350 A
7351 1 4
7352 29 2
7353 100 1
7354 150 5
7355 234 3
7356
        By default, it sorts in ascending order; to sort in descending order,
        use ``ascending=False``.
7359
7360 >>> df.sort_index(ascending=False)
7361 A
7362 234 3
7363 150 5
7364 100 1
7365 29 2
7366 1 4
7367
7368 A key function can be specified which is applied to the index before
7369 sorting. For a ``MultiIndex`` this is applied to each level separately.
7370
7371 >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd'])
7372 >>> df.sort_index(key=lambda x: x.str.lower())
7373 a
7374 A 1
7375 b 2
7376 C 3
7377 d 4
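
        Passing ``ignore_index=True`` additionally discards the existing labels
        and relabels the sorted result 0 through n - 1, as described above:

        >>> df.sort_index(key=lambda x: x.str.lower(), ignore_index=True)
           a
        0  1
        1  2
        2  3
        3  4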
7378 """
7379 return super().sort_index(
7380 axis=axis,
7381 level=level,
7382 ascending=ascending,
7383 inplace=inplace,
7384 kind=kind,
7385 na_position=na_position,
7386 sort_remaining=sort_remaining,
7387 ignore_index=ignore_index,
7388 key=key,
7389 )
7390
7391 def value_counts(
7392 self,
7393 subset: IndexLabel | None = None,
7394 normalize: bool = False,
7395 sort: bool = True,
7396 ascending: bool = False,
7397 dropna: bool = True,
7398 ) -> Series:
7399 """
7400         Return a Series containing the frequency of each distinct row in the DataFrame.
7401
7402 Parameters
7403 ----------
7404 subset : label or list of labels, optional
7405 Columns to use when counting unique combinations.
7406 normalize : bool, default False
7407 Return proportions rather than frequencies.
7408 sort : bool, default True
7409 Sort by frequencies when True. Sort by DataFrame column values when False.
7410 ascending : bool, default False
7411 Sort in ascending order.
7412 dropna : bool, default True
7413 Don't include counts of rows that contain NA values.
7414
7415 .. versionadded:: 1.3.0
7416
7417 Returns
7418 -------
7419 Series
7420
7421 See Also
7422 --------
7423 Series.value_counts: Equivalent method on Series.
7424
7425 Notes
7426 -----
7427 The returned Series will have a MultiIndex with one level per input
7428 column but an Index (non-multi) for a single label. By default, rows
7429 that contain any NA values are omitted from the result. By default,
7430 the resulting Series will be in descending order so that the first
7431 element is the most frequently-occurring row.
7432
7433 Examples
7434 --------
7435 >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6],
7436 ... 'num_wings': [2, 0, 0, 0]},
7437 ... index=['falcon', 'dog', 'cat', 'ant'])
7438 >>> df
7439 num_legs num_wings
7440 falcon 2 2
7441 dog 4 0
7442 cat 4 0
7443 ant 6 0
7444
7445 >>> df.value_counts()
7446 num_legs num_wings
7447 4 0 2
7448 2 2 1
7449 6 0 1
7450 Name: count, dtype: int64
7451
7452 >>> df.value_counts(sort=False)
7453 num_legs num_wings
7454 2 2 1
7455 4 0 2
7456 6 0 1
7457 Name: count, dtype: int64
7458
7459 >>> df.value_counts(ascending=True)
7460 num_legs num_wings
7461 2 2 1
7462 6 0 1
7463 4 0 2
7464 Name: count, dtype: int64
7465
7466 >>> df.value_counts(normalize=True)
7467 num_legs num_wings
7468 4 0 0.50
7469 2 2 0.25
7470 6 0 0.25
7471 Name: proportion, dtype: float64
7472
7473 With `dropna` set to `False` we can also count rows with NA values.
7474
7475 >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'],
7476 ... 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']})
7477 >>> df
7478 first_name middle_name
7479 0 John Smith
7480 1 Anne <NA>
7481 2 John <NA>
7482 3 Beth Louise
7483
7484 >>> df.value_counts()
7485 first_name middle_name
7486 Beth Louise 1
7487 John Smith 1
7488 Name: count, dtype: int64
7489
7490 >>> df.value_counts(dropna=False)
7491 first_name middle_name
7492 Anne NaN 1
7493 Beth Louise 1
7494 John Smith 1
7495 NaN 1
7496 Name: count, dtype: int64
7497
7498 >>> df.value_counts("first_name")
7499 first_name
7500 John 2
7501 Anne 1
7502 Beth 1
7503 Name: count, dtype: int64
7504 """
7505 if subset is None:
7506 subset = self.columns.tolist()
7507
7508 name = "proportion" if normalize else "count"
7509 counts = self.groupby(subset, dropna=dropna, observed=False)._grouper.size()
7510 counts.name = name
7511
7512 if sort:
7513 counts = counts.sort_values(ascending=ascending)
7514 if normalize:
7515 counts /= counts.sum()
7516
7517 # Force MultiIndex for a list_like subset with a single column
7518 if is_list_like(subset) and len(subset) == 1: # type: ignore[arg-type]
7519 counts.index = MultiIndex.from_arrays(
7520 [counts.index], names=[counts.index.name]
7521 )
7522
7523 return counts
7524
7525 def nlargest(
7526 self, n: int, columns: IndexLabel, keep: NsmallestNlargestKeep = "first"
7527 ) -> DataFrame:
7528 """
7529 Return the first `n` rows ordered by `columns` in descending order.
7530
7531 Return the first `n` rows with the largest values in `columns`, in
7532 descending order. The columns that are not specified are returned as
7533 well, but not used for ordering.
7534
7535 This method is equivalent to
7536 ``df.sort_values(columns, ascending=False).head(n)``, but more
7537 performant.
7538
7539 Parameters
7540 ----------
7541 n : int
7542 Number of rows to return.
7543 columns : label or list of labels
7544 Column label(s) to order by.
7545 keep : {'first', 'last', 'all'}, default 'first'
7546 Where there are duplicate values:
7547
7548 - ``first`` : prioritize the first occurrence(s)
7549 - ``last`` : prioritize the last occurrence(s)
7550 - ``all`` : keep all the ties of the smallest item even if it means
7551 selecting more than ``n`` items.
7552
7553 Returns
7554 -------
7555 DataFrame
7556 The first `n` rows ordered by the given columns in descending
7557 order.
7558
7559 See Also
7560 --------
7561 DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
7562 ascending order.
7563 DataFrame.sort_values : Sort DataFrame by the values.
7564 DataFrame.head : Return the first `n` rows without re-ordering.
7565
7566 Notes
7567 -----
7568 This function cannot be used with all column types. For example, when
7569 specifying columns with `object` or `category` dtypes, ``TypeError`` is
7570 raised.
7571
7572 Examples
7573 --------
7574 >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
7575 ... 434000, 434000, 337000, 11300,
7576 ... 11300, 11300],
7577 ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
7578 ... 17036, 182, 38, 311],
7579 ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
7580 ... "IS", "NR", "TV", "AI"]},
7581 ... index=["Italy", "France", "Malta",
7582 ... "Maldives", "Brunei", "Iceland",
7583 ... "Nauru", "Tuvalu", "Anguilla"])
7584 >>> df
7585 population GDP alpha-2
7586 Italy 59000000 1937894 IT
7587 France 65000000 2583560 FR
7588 Malta 434000 12011 MT
7589 Maldives 434000 4520 MV
7590 Brunei 434000 12128 BN
7591 Iceland 337000 17036 IS
7592 Nauru 11300 182 NR
7593 Tuvalu 11300 38 TV
7594 Anguilla 11300 311 AI
7595
7596 In the following example, we will use ``nlargest`` to select the three
7597 rows having the largest values in column "population".
7598
7599 >>> df.nlargest(3, 'population')
7600 population GDP alpha-2
7601 France 65000000 2583560 FR
7602 Italy 59000000 1937894 IT
7603 Malta 434000 12011 MT
7604
7605 When using ``keep='last'``, ties are resolved in reverse order:
7606
7607 >>> df.nlargest(3, 'population', keep='last')
7608 population GDP alpha-2
7609 France 65000000 2583560 FR
7610 Italy 59000000 1937894 IT
7611 Brunei 434000 12128 BN
7612
7613         When using ``keep='all'``, the number of elements kept can go beyond ``n``
7614         if there are duplicate values for the smallest element; all the
7615         ties are kept:
7616
7617 >>> df.nlargest(3, 'population', keep='all')
7618 population GDP alpha-2
7619 France 65000000 2583560 FR
7620 Italy 59000000 1937894 IT
7621 Malta 434000 12011 MT
7622 Maldives 434000 4520 MV
7623 Brunei 434000 12128 BN
7624
7625 However, ``nlargest`` does not keep ``n`` distinct largest elements:
7626
7627 >>> df.nlargest(5, 'population', keep='all')
7628 population GDP alpha-2
7629 France 65000000 2583560 FR
7630 Italy 59000000 1937894 IT
7631 Malta 434000 12011 MT
7632 Maldives 434000 4520 MV
7633 Brunei 434000 12128 BN
7634
7635 To order by the largest values in column "population" and then "GDP",
7636 we can specify multiple columns like in the next example.
7637
7638 >>> df.nlargest(3, ['population', 'GDP'])
7639 population GDP alpha-2
7640 France 65000000 2583560 FR
7641 Italy 59000000 1937894 IT
7642 Brunei 434000 12128 BN
7643 """
7644 return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest()
7645
7646 def nsmallest(
7647 self, n: int, columns: IndexLabel, keep: NsmallestNlargestKeep = "first"
7648 ) -> DataFrame:
7649 """
7650 Return the first `n` rows ordered by `columns` in ascending order.
7651
7652 Return the first `n` rows with the smallest values in `columns`, in
7653 ascending order. The columns that are not specified are returned as
7654 well, but not used for ordering.
7655
7656 This method is equivalent to
7657 ``df.sort_values(columns, ascending=True).head(n)``, but more
7658 performant.
7659
7660 Parameters
7661 ----------
7662 n : int
7663 Number of items to retrieve.
7664 columns : list or str
7665 Column name or names to order by.
7666 keep : {'first', 'last', 'all'}, default 'first'
7667 Where there are duplicate values:
7668
7669 - ``first`` : take the first occurrence.
7670 - ``last`` : take the last occurrence.
7671 - ``all`` : keep all the ties of the largest item even if it means
7672 selecting more than ``n`` items.
7673
7674 Returns
7675 -------
7676 DataFrame
7677
7678 See Also
7679 --------
7680 DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
7681 descending order.
7682 DataFrame.sort_values : Sort DataFrame by the values.
7683 DataFrame.head : Return the first `n` rows without re-ordering.
7684
7685 Examples
7686 --------
7687 >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
7688 ... 434000, 434000, 337000, 337000,
7689 ... 11300, 11300],
7690 ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
7691 ... 17036, 182, 38, 311],
7692 ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
7693 ... "IS", "NR", "TV", "AI"]},
7694 ... index=["Italy", "France", "Malta",
7695 ... "Maldives", "Brunei", "Iceland",
7696 ... "Nauru", "Tuvalu", "Anguilla"])
7697 >>> df
7698 population GDP alpha-2
7699 Italy 59000000 1937894 IT
7700 France 65000000 2583560 FR
7701 Malta 434000 12011 MT
7702 Maldives 434000 4520 MV
7703 Brunei 434000 12128 BN
7704 Iceland 337000 17036 IS
7705 Nauru 337000 182 NR
7706 Tuvalu 11300 38 TV
7707 Anguilla 11300 311 AI
7708
7709 In the following example, we will use ``nsmallest`` to select the
7710 three rows having the smallest values in column "population".
7711
7712 >>> df.nsmallest(3, 'population')
7713 population GDP alpha-2
7714 Tuvalu 11300 38 TV
7715 Anguilla 11300 311 AI
7716 Iceland 337000 17036 IS
7717
7718 When using ``keep='last'``, ties are resolved in reverse order:
7719
7720 >>> df.nsmallest(3, 'population', keep='last')
7721 population GDP alpha-2
7722 Anguilla 11300 311 AI
7723 Tuvalu 11300 38 TV
7724 Nauru 337000 182 NR
7725
7726         When using ``keep='all'``, the number of elements kept can go beyond ``n``
7727         if there are duplicate values for the largest element; all the
7728         ties are kept.
7729
7730 >>> df.nsmallest(3, 'population', keep='all')
7731 population GDP alpha-2
7732 Tuvalu 11300 38 TV
7733 Anguilla 11300 311 AI
7734 Iceland 337000 17036 IS
7735 Nauru 337000 182 NR
7736
7737 However, ``nsmallest`` does not keep ``n`` distinct
7738 smallest elements:
7739
7740 >>> df.nsmallest(4, 'population', keep='all')
7741 population GDP alpha-2
7742 Tuvalu 11300 38 TV
7743 Anguilla 11300 311 AI
7744 Iceland 337000 17036 IS
7745 Nauru 337000 182 NR
7746
7747 To order by the smallest values in column "population" and then "GDP", we can
7748 specify multiple columns like in the next example.
7749
7750 >>> df.nsmallest(3, ['population', 'GDP'])
7751 population GDP alpha-2
7752 Tuvalu 11300 38 TV
7753 Anguilla 11300 311 AI
7754 Nauru 337000 182 NR
7755 """
7756 return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nsmallest()
7757
7758 @doc(
7759 Series.swaplevel,
7760 klass=_shared_doc_kwargs["klass"],
7761 extra_params=dedent(
7762 """axis : {0 or 'index', 1 or 'columns'}, default 0
7763 The axis to swap levels on. 0 or 'index' for row-wise, 1 or
7764 'columns' for column-wise."""
7765 ),
7766 examples=dedent(
7767 """\
7768 Examples
7769 --------
7770 >>> df = pd.DataFrame(
7771 ... {"Grade": ["A", "B", "A", "C"]},
7772 ... index=[
7773 ... ["Final exam", "Final exam", "Coursework", "Coursework"],
7774 ... ["History", "Geography", "History", "Geography"],
7775 ... ["January", "February", "March", "April"],
7776 ... ],
7777 ... )
7778 >>> df
7779 Grade
7780 Final exam History January A
7781 Geography February B
7782 Coursework History March A
7783 Geography April C
7784
7785         In the following example, we will swap the levels of the indices.
7786         Here, we will swap the levels row-wise, but levels can be swapped
7787         column-wise in a similar manner. Note that row-wise is the default
7788         behaviour. By not supplying any arguments for i and j, we swap the
7789         last and second-to-last indices.
7790
7791 >>> df.swaplevel()
7792 Grade
7793 Final exam January History A
7794 February Geography B
7795 Coursework March History A
7796 April Geography C
7797
7798 By supplying one argument, we can choose which index to swap the last
7799 index with. We can for example swap the first index with the last one as
7800 follows.
7801
7802 >>> df.swaplevel(0)
7803 Grade
7804 January History Final exam A
7805 February Geography Final exam B
7806 March History Coursework A
7807 April Geography Coursework C
7808
7809 We can also define explicitly which indices we want to swap by supplying values
7810 for both i and j. Here, we for example swap the first and second indices.
7811
7812 >>> df.swaplevel(0, 1)
7813 Grade
7814 History Final exam January A
7815 Geography Final exam February B
7816 History Coursework March A
7817 Geography Coursework April C"""
7818 ),
7819 )
7820 def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame:
7821 result = self.copy(deep=None)
7822
7823 axis = self._get_axis_number(axis)
7824
7825 if not isinstance(result._get_axis(axis), MultiIndex): # pragma: no cover
7826 raise TypeError("Can only swap levels on a hierarchical axis.")
7827
7828 if axis == 0:
7829 assert isinstance(result.index, MultiIndex)
7830 result.index = result.index.swaplevel(i, j)
7831 else:
7832 assert isinstance(result.columns, MultiIndex)
7833 result.columns = result.columns.swaplevel(i, j)
7834 return result
7835
7836 def reorder_levels(self, order: Sequence[int | str], axis: Axis = 0) -> DataFrame:
7837 """
7838 Rearrange index levels using input order. May not drop or duplicate levels.
7839
7840 Parameters
7841 ----------
7842 order : list of int or list of str
7843 List representing new level order. Reference level by number
7844 (position) or by key (label).
7845 axis : {0 or 'index', 1 or 'columns'}, default 0
7846 Where to reorder levels.
7847
7848 Returns
7849 -------
7850 DataFrame
7851
7852 Examples
7853 --------
7854 >>> data = {
7855 ... "class": ["Mammals", "Mammals", "Reptiles"],
7856 ... "diet": ["Omnivore", "Carnivore", "Carnivore"],
7857 ... "species": ["Humans", "Dogs", "Snakes"],
7858 ... }
7859 >>> df = pd.DataFrame(data, columns=["class", "diet", "species"])
7860 >>> df = df.set_index(["class", "diet"])
7861 >>> df
7862 species
7863 class diet
7864 Mammals Omnivore Humans
7865 Carnivore Dogs
7866 Reptiles Carnivore Snakes
7867
7868 Let's reorder the levels of the index:
7869
7870 >>> df.reorder_levels(["diet", "class"])
7871 species
7872 diet class
7873 Omnivore Mammals Humans
7874 Carnivore Mammals Dogs
7875 Reptiles Snakes
7876 """
7877 axis = self._get_axis_number(axis)
7878 if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover
7879 raise TypeError("Can only reorder levels on a hierarchical axis.")
7880
7881 result = self.copy(deep=None)
7882
7883 if axis == 0:
7884 assert isinstance(result.index, MultiIndex)
7885 result.index = result.index.reorder_levels(order)
7886 else:
7887 assert isinstance(result.columns, MultiIndex)
7888 result.columns = result.columns.reorder_levels(order)
7889 return result
7890
7891 # ----------------------------------------------------------------------
7892 # Arithmetic Methods
7893
7894 def _cmp_method(self, other, op):
7895 axis: Literal[1] = 1 # only relevant for Series other case
7896
7897 self, other = self._align_for_op(other, axis, flex=False, level=None)
7898
7899 # See GH#4537 for discussion of scalar op behavior
7900 new_data = self._dispatch_frame_op(other, op, axis=axis)
7901 return self._construct_result(new_data)
7902
7903 def _arith_method(self, other, op):
7904 if self._should_reindex_frame_op(other, op, 1, None, None):
7905 return self._arith_method_with_reindex(other, op)
7906
7907 axis: Literal[1] = 1 # only relevant for Series other case
7908 other = ops.maybe_prepare_scalar_for_op(other, (self.shape[axis],))
7909
7910 self, other = self._align_for_op(other, axis, flex=True, level=None)
7911
7912 with np.errstate(all="ignore"):
7913 new_data = self._dispatch_frame_op(other, op, axis=axis)
7914 return self._construct_result(new_data)
7915
7916 _logical_method = _arith_method
7917
7918 def _dispatch_frame_op(
7919 self, right, func: Callable, axis: AxisInt | None = None
7920 ) -> DataFrame:
7921 """
7922 Evaluate the frame operation func(left, right) by evaluating
7923 column-by-column, dispatching to the Series implementation.
7924
7925 Parameters
7926 ----------
7927 right : scalar, Series, or DataFrame
7928 func : arithmetic or comparison operator
7929 axis : {None, 0, 1}
7930
7931 Returns
7932 -------
7933 DataFrame
7934
7935 Notes
7936 -----
7937 Caller is responsible for setting np.errstate where relevant.
7938 """
7939 # Get the appropriate array-op to apply to each column/block's values.
7940 array_op = ops.get_array_op(func)
7941
7942 right = lib.item_from_zerodim(right)
7943 if not is_list_like(right):
7944 # i.e. scalar, faster than checking np.ndim(right) == 0
7945 bm = self._mgr.apply(array_op, right=right)
7946 return self._constructor_from_mgr(bm, axes=bm.axes)
7947
7948 elif isinstance(right, DataFrame):
7949 assert self.index.equals(right.index)
7950 assert self.columns.equals(right.columns)
7951 # TODO: The previous assertion `assert right._indexed_same(self)`
7952 # fails in cases with empty columns reached via
7953 # _frame_arith_method_with_reindex
7954
7955 # TODO operate_blockwise expects a manager of the same type
7956 bm = self._mgr.operate_blockwise(
7957 # error: Argument 1 to "operate_blockwise" of "ArrayManager" has
7958 # incompatible type "Union[ArrayManager, BlockManager]"; expected
7959 # "ArrayManager"
7960 # error: Argument 1 to "operate_blockwise" of "BlockManager" has
7961 # incompatible type "Union[ArrayManager, BlockManager]"; expected
7962 # "BlockManager"
7963 right._mgr, # type: ignore[arg-type]
7964 array_op,
7965 )
7966 return self._constructor_from_mgr(bm, axes=bm.axes)
7967
7968 elif isinstance(right, Series) and axis == 1:
7969 # axis=1 means we want to operate row-by-row
7970 assert right.index.equals(self.columns)
7971
7972 right = right._values
7973 # maybe_align_as_frame ensures we do not have an ndarray here
7974 assert not isinstance(right, np.ndarray)
7975
7976 arrays = [
7977 array_op(_left, _right)
7978 for _left, _right in zip(self._iter_column_arrays(), right)
7979 ]
7980
7981 elif isinstance(right, Series):
7982 assert right.index.equals(self.index)
7983 right = right._values
7984
7985 arrays = [array_op(left, right) for left in self._iter_column_arrays()]
7986
7987 else:
7988 raise NotImplementedError(right)
7989
7990 return type(self)._from_arrays(
7991 arrays, self.columns, self.index, verify_integrity=False
7992 )
7993
7994 def _combine_frame(self, other: DataFrame, func, fill_value=None):
7995 # at this point we have `self._indexed_same(other)`
7996
7997 if fill_value is None:
7998 # since _arith_op may be called in a loop, avoid function call
7999 # overhead if possible by doing this check once
8000 _arith_op = func
8001
8002 else:
8003
8004 def _arith_op(left, right):
8005 # for the mixed_type case where we iterate over columns,
8006 # _arith_op(left, right) is equivalent to
8007 # left._binop(right, func, fill_value=fill_value)
8008 left, right = ops.fill_binop(left, right, fill_value)
8009 return func(left, right)
8010
8011 new_data = self._dispatch_frame_op(other, _arith_op)
8012 return new_data
8013
8014 def _arith_method_with_reindex(self, right: DataFrame, op) -> DataFrame:
8015 """
8016 For DataFrame-with-DataFrame operations that require reindexing,
8017 operate only on shared columns, then reindex.
8018
8019 Parameters
8020 ----------
8021 right : DataFrame
8022 op : binary operator
8023
8024 Returns
8025 -------
8026 DataFrame
8027 """
8028 left = self
8029
8030 # GH#31623, only operate on shared columns
8031 cols, lcols, rcols = left.columns.join(
8032 right.columns, how="inner", level=None, return_indexers=True
8033 )
8034
8035 new_left = left.iloc[:, lcols]
8036 new_right = right.iloc[:, rcols]
8037 result = op(new_left, new_right)
8038
8039 # Do the join on the columns instead of using left._align_for_op
8040 # to avoid constructing two potentially large/sparse DataFrames
8041 join_columns, _, _ = left.columns.join(
8042 right.columns, how="outer", level=None, return_indexers=True
8043 )
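        # join_columns is the union of both column sets; the shared columns
        # carry the computed values and the remaining ones are introduced as
        # all-NaN by the reindex below.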
8044
8045 if result.columns.has_duplicates:
8046 # Avoid reindexing with a duplicate axis.
8047 # https://github.com/pandas-dev/pandas/issues/35194
8048 indexer, _ = result.columns.get_indexer_non_unique(join_columns)
8049 indexer = algorithms.unique1d(indexer)
8050 result = result._reindex_with_indexers(
8051 {1: [join_columns, indexer]}, allow_dups=True
8052 )
8053 else:
8054 result = result.reindex(join_columns, axis=1)
8055
8056 return result
8057
8058 def _should_reindex_frame_op(self, right, op, axis: int, fill_value, level) -> bool:
8059 """
8060 Check if this is an operation between DataFrames that will need to reindex.
8061 """
8062 if op is operator.pow or op is roperator.rpow:
8063 # GH#32685 pow has special semantics for operating with null values
8064 return False
8065
8066 if not isinstance(right, DataFrame):
8067 return False
8068
8069 if fill_value is None and level is None and axis == 1:
8070 # TODO: any other cases we should handle here?
8071
8072 # Intersection is always unique so we have to check the unique columns
8073 left_uniques = self.columns.unique()
8074 right_uniques = right.columns.unique()
8075 cols = left_uniques.intersection(right_uniques)
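            # e.g. left columns ['A', 'B'] vs right columns ['B', 'C'] gives
            # cols = ['B']: non-empty but smaller than either side, so the
            # reindexing path is taken and non-shared columns end up as NaN.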
8076 if len(cols) and not (
8077 len(cols) == len(left_uniques) and len(cols) == len(right_uniques)
8078 ):
8079 # TODO: is there a shortcut available when len(cols) == 0?
8080 return True
8081
8082 return False
8083
8084 def _align_for_op(
8085 self,
8086 other,
8087 axis: AxisInt,
8088 flex: bool | None = False,
8089 level: Level | None = None,
8090 ):
8091 """
8092 Convert rhs to meet lhs dims if input is list, tuple or np.ndarray.
8093
8094 Parameters
8095 ----------
8096 left : DataFrame
8097 right : Any
8098 axis : int
8099 flex : bool or None, default False
8100 Whether this is a flex op, in which case we reindex.
8101 None indicates not to check for alignment.
8102 level : int or level name, default None
8103
8104 Returns
8105 -------
8106 left : DataFrame
8107 right : Any
8108 """
8109 left, right = self, other
8110
8111 def to_series(right):
8112 msg = (
8113 "Unable to coerce to Series, "
8114 "length must be {req_len}: given {given_len}"
8115 )
8116
8117 # pass dtype to avoid doing inference, which would break consistency
8118 # with Index/Series ops
8119 dtype = None
8120 if getattr(right, "dtype", None) == object:
8121 # can't pass right.dtype unconditionally as that would break on e.g.
8122 # datetime64[h] ndarray
8123 dtype = object
8124
8125 if axis == 0:
8126 if len(left.index) != len(right):
8127 raise ValueError(
8128 msg.format(req_len=len(left.index), given_len=len(right))
8129 )
8130 right = left._constructor_sliced(right, index=left.index, dtype=dtype)
8131 else:
8132 if len(left.columns) != len(right):
8133 raise ValueError(
8134 msg.format(req_len=len(left.columns), given_len=len(right))
8135 )
8136 right = left._constructor_sliced(right, index=left.columns, dtype=dtype)
8137 return right
8138
8139 if isinstance(right, np.ndarray):
8140 if right.ndim == 1:
8141 right = to_series(right)
8142
8143 elif right.ndim == 2:
8144 # We need to pass dtype=right.dtype to retain object dtype
8145 # otherwise we lose consistency with Index and array ops
8146 dtype = None
8147 if right.dtype == object:
8148 # can't pass right.dtype unconditionally as that would break on e.g.
8149 # datetime64[h] ndarray
8150 dtype = object
8151
8152 if right.shape == left.shape:
8153 right = left._constructor(
8154 right, index=left.index, columns=left.columns, dtype=dtype
8155 )
8156
8157 elif right.shape[0] == left.shape[0] and right.shape[1] == 1:
8158 # Broadcast across columns
8159 right = np.broadcast_to(right, left.shape)
8160 right = left._constructor(
8161 right, index=left.index, columns=left.columns, dtype=dtype
8162 )
8163
8164 elif right.shape[1] == left.shape[1] and right.shape[0] == 1:
8165 # Broadcast along rows
8166 right = to_series(right[0, :])
8167
8168 else:
8169 raise ValueError(
8170 "Unable to coerce to DataFrame, shape "
8171 f"must be {left.shape}: given {right.shape}"
8172 )
8173
8174 elif right.ndim > 2:
8175 raise ValueError(
8176 "Unable to coerce to Series/DataFrame, "
8177 f"dimension must be <= 2: {right.shape}"
8178 )
8179
8180 elif is_list_like(right) and not isinstance(right, (Series, DataFrame)):
8181 # GH#36702. Raise when attempting arithmetic with list of array-like.
8182 if any(is_array_like(el) for el in right):
8183 raise ValueError(
8184 f"Unable to coerce list of {type(right[0])} to Series/DataFrame"
8185 )
8186 # GH#17901
8187 right = to_series(right)
8188
8189 if flex is not None and isinstance(right, DataFrame):
8190 if not left._indexed_same(right):
8191 if flex:
8192 left, right = left.align(
8193 right, join="outer", level=level, copy=False
8194 )
8195 else:
8196 raise ValueError(
8197 "Can only compare identically-labeled (both index and columns) "
8198 "DataFrame objects"
8199 )
8200 elif isinstance(right, Series):
8201 # axis=1 is default for DataFrame-with-Series op
8202 axis = axis if axis is not None else 1
8203 if not flex:
8204 if not left.axes[axis].equals(right.index):
8205 raise ValueError(
8206 "Operands are not aligned. Do "
8207 "`left, right = left.align(right, axis=1, copy=False)` "
8208 "before operating."
8209 )
8210
8211 left, right = left.align(
8212 right,
8213 join="outer",
8214 axis=axis,
8215 level=level,
8216 copy=False,
8217 )
8218 right = left._maybe_align_series_as_frame(right, axis)
8219
8220 return left, right
8221
8222 def _maybe_align_series_as_frame(self, series: Series, axis: AxisInt):
8223 """
8224 If the Series operand is not EA-dtype, we can broadcast to 2D and operate
8225 blockwise.
8226 """
8227 rvalues = series._values
8228 if not isinstance(rvalues, np.ndarray):
8229 # TODO(EA2D): no need to special-case with 2D EAs
8230 if rvalues.dtype in ("datetime64[ns]", "timedelta64[ns]"):
8231 # We can losslessly+cheaply cast to ndarray
8232 rvalues = np.asarray(rvalues)
8233 else:
8234 return series
8235
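        # Reshape to a single column (axis=0) or a single row (axis=1) so the
        # values can be broadcast against the frame's 2D shape below.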
8236 if axis == 0:
8237 rvalues = rvalues.reshape(-1, 1)
8238 else:
8239 rvalues = rvalues.reshape(1, -1)
8240
8241 rvalues = np.broadcast_to(rvalues, self.shape)
8242 # pass dtype to avoid doing inference
8243 return self._constructor(
8244 rvalues,
8245 index=self.index,
8246 columns=self.columns,
8247 dtype=rvalues.dtype,
8248 )
8249
8250 def _flex_arith_method(
8251 self, other, op, *, axis: Axis = "columns", level=None, fill_value=None
8252 ):
8253 axis = self._get_axis_number(axis) if axis is not None else 1
8254
8255 if self._should_reindex_frame_op(other, op, axis, fill_value, level):
8256 return self._arith_method_with_reindex(other, op)
8257
8258 if isinstance(other, Series) and fill_value is not None:
8259 # TODO: We could allow this in cases where we end up going
8260 # through the DataFrame path
8261 raise NotImplementedError(f"fill_value {fill_value} not supported.")
8262
8263 other = ops.maybe_prepare_scalar_for_op(other, self.shape)
8264 self, other = self._align_for_op(other, axis, flex=True, level=level)
8265
8266 with np.errstate(all="ignore"):
8267 if isinstance(other, DataFrame):
8268 # Another DataFrame
8269 new_data = self._combine_frame(other, op, fill_value)
8270
8271 elif isinstance(other, Series):
8272 new_data = self._dispatch_frame_op(other, op, axis=axis)
8273 else:
8274 # in this case we always have `np.ndim(other) == 0`
8275 if fill_value is not None:
8276 self = self.fillna(fill_value)
8277
8278 new_data = self._dispatch_frame_op(other, op)
8279
8280 return self._construct_result(new_data)
8281
8282 def _construct_result(self, result) -> DataFrame:
8283 """
8284 Wrap the result of an arithmetic, comparison, or logical operation.
8285
8286 Parameters
8287 ----------
8288 result : DataFrame
8289
8290 Returns
8291 -------
8292 DataFrame
8293 """
8294 out = self._constructor(result, copy=False).__finalize__(self)
8295 # Pin columns instead of passing to constructor for compat with
8296 # non-unique columns case
8297 out.columns = self.columns
8298 out.index = self.index
8299 return out
8300
8301 def __divmod__(self, other) -> tuple[DataFrame, DataFrame]:
8302 # Naive implementation, room for optimization
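        # e.g. divmod(df, 3) returns the pair (df // 3, df - (df // 3) * 3),
        # matching elementwise floor division and remainder.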
8303 div = self // other
8304 mod = self - div * other
8305 return div, mod
8306
8307 def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]:
8308 # Naive implementation, room for optimization
8309 div = other // self
8310 mod = other - div * self
8311 return div, mod
8312
8313 def _flex_cmp_method(self, other, op, *, axis: Axis = "columns", level=None):
8314 axis = self._get_axis_number(axis) if axis is not None else 1
8315
8316 self, other = self._align_for_op(other, axis, flex=True, level=level)
8317
8318 new_data = self._dispatch_frame_op(other, op, axis=axis)
8319 return self._construct_result(new_data)
8320
8321 @Appender(ops.make_flex_doc("eq", "dataframe"))
8322 def eq(self, other, axis: Axis = "columns", level=None) -> DataFrame:
8323 return self._flex_cmp_method(other, operator.eq, axis=axis, level=level)
8324
8325 @Appender(ops.make_flex_doc("ne", "dataframe"))
8326 def ne(self, other, axis: Axis = "columns", level=None) -> DataFrame:
8327 return self._flex_cmp_method(other, operator.ne, axis=axis, level=level)
8328
8329 @Appender(ops.make_flex_doc("le", "dataframe"))
8330 def le(self, other, axis: Axis = "columns", level=None) -> DataFrame:
8331 return self._flex_cmp_method(other, operator.le, axis=axis, level=level)
8332
8333 @Appender(ops.make_flex_doc("lt", "dataframe"))
8334 def lt(self, other, axis: Axis = "columns", level=None) -> DataFrame:
8335 return self._flex_cmp_method(other, operator.lt, axis=axis, level=level)
8336
8337 @Appender(ops.make_flex_doc("ge", "dataframe"))
8338 def ge(self, other, axis: Axis = "columns", level=None) -> DataFrame:
8339 return self._flex_cmp_method(other, operator.ge, axis=axis, level=level)
8340
8341 @Appender(ops.make_flex_doc("gt", "dataframe"))
8342 def gt(self, other, axis: Axis = "columns", level=None) -> DataFrame:
8343 return self._flex_cmp_method(other, operator.gt, axis=axis, level=level)
8344
8345 @Appender(ops.make_flex_doc("add", "dataframe"))
8346 def add(
8347 self, other, axis: Axis = "columns", level=None, fill_value=None
8348 ) -> DataFrame:
8349 return self._flex_arith_method(
8350 other, operator.add, level=level, fill_value=fill_value, axis=axis
8351 )
8352
8353 @Appender(ops.make_flex_doc("radd", "dataframe"))
8354 def radd(
8355 self, other, axis: Axis = "columns", level=None, fill_value=None
8356 ) -> DataFrame:
8357 return self._flex_arith_method(
8358 other, roperator.radd, level=level, fill_value=fill_value, axis=axis
8359 )
8360
8361 @Appender(ops.make_flex_doc("sub", "dataframe"))
8362 def sub(
8363 self, other, axis: Axis = "columns", level=None, fill_value=None
8364 ) -> DataFrame:
8365 return self._flex_arith_method(
8366 other, operator.sub, level=level, fill_value=fill_value, axis=axis
8367 )
8368
8369 subtract = sub
8370
8371 @Appender(ops.make_flex_doc("rsub", "dataframe"))
8372 def rsub(
8373 self, other, axis: Axis = "columns", level=None, fill_value=None
8374 ) -> DataFrame:
8375 return self._flex_arith_method(
8376 other, roperator.rsub, level=level, fill_value=fill_value, axis=axis
8377 )
8378
8379 @Appender(ops.make_flex_doc("mul", "dataframe"))
8380 def mul(
8381 self, other, axis: Axis = "columns", level=None, fill_value=None
8382 ) -> DataFrame:
8383 return self._flex_arith_method(
8384 other, operator.mul, level=level, fill_value=fill_value, axis=axis
8385 )
8386
8387 multiply = mul
8388
8389 @Appender(ops.make_flex_doc("rmul", "dataframe"))
8390 def rmul(
8391 self, other, axis: Axis = "columns", level=None, fill_value=None
8392 ) -> DataFrame:
8393 return self._flex_arith_method(
8394 other, roperator.rmul, level=level, fill_value=fill_value, axis=axis
8395 )
8396
8397 @Appender(ops.make_flex_doc("truediv", "dataframe"))
8398 def truediv(
8399 self, other, axis: Axis = "columns", level=None, fill_value=None
8400 ) -> DataFrame:
8401 return self._flex_arith_method(
8402 other, operator.truediv, level=level, fill_value=fill_value, axis=axis
8403 )
8404
8405 div = truediv
8406 divide = truediv
8407
8408 @Appender(ops.make_flex_doc("rtruediv", "dataframe"))
8409 def rtruediv(
8410 self, other, axis: Axis = "columns", level=None, fill_value=None
8411 ) -> DataFrame:
8412 return self._flex_arith_method(
8413 other, roperator.rtruediv, level=level, fill_value=fill_value, axis=axis
8414 )
8415
8416 rdiv = rtruediv
8417
8418 @Appender(ops.make_flex_doc("floordiv", "dataframe"))
8419 def floordiv(
8420 self, other, axis: Axis = "columns", level=None, fill_value=None
8421 ) -> DataFrame:
8422 return self._flex_arith_method(
8423 other, operator.floordiv, level=level, fill_value=fill_value, axis=axis
8424 )
8425
8426 @Appender(ops.make_flex_doc("rfloordiv", "dataframe"))
8427 def rfloordiv(
8428 self, other, axis: Axis = "columns", level=None, fill_value=None
8429 ) -> DataFrame:
8430 return self._flex_arith_method(
8431 other, roperator.rfloordiv, level=level, fill_value=fill_value, axis=axis
8432 )
8433
8434 @Appender(ops.make_flex_doc("mod", "dataframe"))
8435 def mod(
8436 self, other, axis: Axis = "columns", level=None, fill_value=None
8437 ) -> DataFrame:
8438 return self._flex_arith_method(
8439 other, operator.mod, level=level, fill_value=fill_value, axis=axis
8440 )
8441
8442 @Appender(ops.make_flex_doc("rmod", "dataframe"))
8443 def rmod(
8444 self, other, axis: Axis = "columns", level=None, fill_value=None
8445 ) -> DataFrame:
8446 return self._flex_arith_method(
8447 other, roperator.rmod, level=level, fill_value=fill_value, axis=axis
8448 )
8449
8450 @Appender(ops.make_flex_doc("pow", "dataframe"))
8451 def pow(
8452 self, other, axis: Axis = "columns", level=None, fill_value=None
8453 ) -> DataFrame:
8454 return self._flex_arith_method(
8455 other, operator.pow, level=level, fill_value=fill_value, axis=axis
8456 )
8457
8458 @Appender(ops.make_flex_doc("rpow", "dataframe"))
8459 def rpow(
8460 self, other, axis: Axis = "columns", level=None, fill_value=None
8461 ) -> DataFrame:
8462 return self._flex_arith_method(
8463 other, roperator.rpow, level=level, fill_value=fill_value, axis=axis
8464 )
8465
8466 # ----------------------------------------------------------------------
8467 # Combination-Related
8468
8469 @doc(
8470 _shared_docs["compare"],
8471 dedent(
8472 """
8473 Returns
8474 -------
8475 DataFrame
8476 DataFrame that shows the differences stacked side by side.
8477
8478 The resulting index will be a MultiIndex with 'self' and 'other'
8479 stacked alternately at the inner level.
8480
8481 Raises
8482 ------
8483 ValueError
8484 When the two DataFrames don't have identical labels or shape.
8485
8486 See Also
8487 --------
8488 Series.compare : Compare with another Series and show differences.
8489 DataFrame.equals : Test whether two objects contain the same elements.
8490
8491 Notes
8492 -----
8493 Matching NaNs will not appear as a difference.
8494
8495 Can only compare identically-labeled
8496         (i.e. same shape, identical row and column labels) DataFrames.
8497
8498 Examples
8499 --------
8500 >>> df = pd.DataFrame(
8501 ... {{
8502 ... "col1": ["a", "a", "b", "b", "a"],
8503 ... "col2": [1.0, 2.0, 3.0, np.nan, 5.0],
8504 ... "col3": [1.0, 2.0, 3.0, 4.0, 5.0]
8505 ... }},
8506 ... columns=["col1", "col2", "col3"],
8507 ... )
8508 >>> df
8509 col1 col2 col3
8510 0 a 1.0 1.0
8511 1 a 2.0 2.0
8512 2 b 3.0 3.0
8513 3 b NaN 4.0
8514 4 a 5.0 5.0
8515
8516 >>> df2 = df.copy()
8517 >>> df2.loc[0, 'col1'] = 'c'
8518 >>> df2.loc[2, 'col3'] = 4.0
8519 >>> df2
8520 col1 col2 col3
8521 0 c 1.0 1.0
8522 1 a 2.0 2.0
8523 2 b 3.0 4.0
8524 3 b NaN 4.0
8525 4 a 5.0 5.0
8526
8527 Align the differences on columns
8528
8529 >>> df.compare(df2)
8530 col1 col3
8531 self other self other
8532 0 a c NaN NaN
8533 2 NaN NaN 3.0 4.0
8534
8535 Assign result_names
8536
8537 >>> df.compare(df2, result_names=("left", "right"))
8538 col1 col3
8539 left right left right
8540 0 a c NaN NaN
8541 2 NaN NaN 3.0 4.0
8542
8543 Stack the differences on rows
8544
8545 >>> df.compare(df2, align_axis=0)
8546 col1 col3
8547 0 self a NaN
8548 other c NaN
8549 2 self NaN 3.0
8550 other NaN 4.0
8551
8552 Keep the equal values
8553
8554 >>> df.compare(df2, keep_equal=True)
8555 col1 col3
8556 self other self other
8557 0 a c 1.0 1.0
8558 2 b b 3.0 4.0
8559
8560 Keep all original rows and columns
8561
8562 >>> df.compare(df2, keep_shape=True)
8563 col1 col2 col3
8564 self other self other self other
8565 0 a c NaN NaN NaN NaN
8566 1 NaN NaN NaN NaN NaN NaN
8567 2 NaN NaN NaN NaN 3.0 4.0
8568 3 NaN NaN NaN NaN NaN NaN
8569 4 NaN NaN NaN NaN NaN NaN
8570
8571 Keep all original rows and columns and also all original values
8572
8573 >>> df.compare(df2, keep_shape=True, keep_equal=True)
8574 col1 col2 col3
8575 self other self other self other
8576 0 a c 1.0 1.0 1.0 1.0
8577 1 a a 2.0 2.0 2.0 2.0
8578 2 b b 3.0 3.0 3.0 4.0
8579 3 b b NaN NaN 4.0 4.0
8580 4 a a 5.0 5.0 5.0 5.0
8581 """
8582 ),
8583 klass=_shared_doc_kwargs["klass"],
8584 )
8585 def compare(
8586 self,
8587 other: DataFrame,
8588 align_axis: Axis = 1,
8589 keep_shape: bool = False,
8590 keep_equal: bool = False,
8591 result_names: Suffixes = ("self", "other"),
8592 ) -> DataFrame:
8593 return super().compare(
8594 other=other,
8595 align_axis=align_axis,
8596 keep_shape=keep_shape,
8597 keep_equal=keep_equal,
8598 result_names=result_names,
8599 )
8600
8601 def combine(
8602 self,
8603 other: DataFrame,
8604 func: Callable[[Series, Series], Series | Hashable],
8605 fill_value=None,
8606 overwrite: bool = True,
8607 ) -> DataFrame:
8608 """
8609 Perform column-wise combine with another DataFrame.
8610
8611 Combines a DataFrame with `other` DataFrame using `func`
8612 to element-wise combine columns. The row and column indexes of the
8613 resulting DataFrame will be the union of the two.
8614
8615 Parameters
8616 ----------
8617 other : DataFrame
8618 The DataFrame to merge column-wise.
8619 func : function
8620 Function that takes two series as inputs and return a Series or a
8621             Function that takes two Series as inputs and returns a Series or a
8622             scalar. Used to merge the two dataframes column by column.
8623 The value to fill NaNs with prior to passing any column to the
8624 merge func.
8625 overwrite : bool, default True
8626 If True, columns in `self` that do not exist in `other` will be
8627 overwritten with NaNs.
8628
8629 Returns
8630 -------
8631 DataFrame
8632 Combination of the provided DataFrames.
8633
8634 See Also
8635 --------
8636 DataFrame.combine_first : Combine two DataFrame objects and default to
8637 non-null values in frame calling the method.
8638
8639 Examples
8640 --------
8641 Combine using a simple function that chooses the smaller column.
8642
8643 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
8644 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
8645 >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
8646 >>> df1.combine(df2, take_smaller)
8647 A B
8648 0 0 3
8649 1 0 3
8650
8651 Example using a true element-wise combine function.
8652
8653 >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]})
8654 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
8655 >>> df1.combine(df2, np.minimum)
8656 A B
8657 0 1 2
8658 1 0 3
8659
8660 Using `fill_value` fills Nones prior to passing the column to the
8661 merge function.
8662
8663 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
8664 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
8665 >>> df1.combine(df2, take_smaller, fill_value=-5)
8666 A B
8667 0 0 -5.0
8668 1 0 4.0
8669
8670         However, if the same element in both dataframes is None, that None
8671         is preserved.
8672
8673 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
8674 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]})
8675 >>> df1.combine(df2, take_smaller, fill_value=-5)
8676 A B
8677 0 0 -5.0
8678 1 0 3.0
8679
8680         Example that demonstrates the use of `overwrite` and behavior when
8681         the axes differ between the dataframes.
8682
8683 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
8684 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2])
8685 >>> df1.combine(df2, take_smaller)
8686 A B C
8687 0 NaN NaN NaN
8688 1 NaN 3.0 -10.0
8689 2 NaN 3.0 1.0
8690
8691 >>> df1.combine(df2, take_smaller, overwrite=False)
8692 A B C
8693 0 0.0 NaN NaN
8694 1 0.0 3.0 -10.0
8695 2 NaN 3.0 1.0
8696
8697         Demonstrating the preference of the passed-in dataframe.
8698
8699 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2])
8700 >>> df2.combine(df1, take_smaller)
8701 A B C
8702 0 0.0 NaN NaN
8703 1 0.0 3.0 NaN
8704 2 NaN 3.0 NaN
8705
8706 >>> df2.combine(df1, take_smaller, overwrite=False)
8707 A B C
8708 0 0.0 NaN NaN
8709 1 0.0 3.0 1.0
8710 2 NaN 3.0 1.0
8711 """
8712 other_idxlen = len(other.index) # save for compare
8713
8714 this, other = self.align(other, copy=False)
8715 new_index = this.index
8716
8717 if other.empty and len(new_index) == len(self.index):
8718 return self.copy()
8719
8720 if self.empty and len(other) == other_idxlen:
8721 return other.copy()
8722
8723 # sorts if possible; otherwise align above ensures that these are set-equal
8724 new_columns = this.columns.union(other.columns)
8725 do_fill = fill_value is not None
8726 result = {}
8727 for col in new_columns:
8728 series = this[col]
8729 other_series = other[col]
8730
8731 this_dtype = series.dtype
8732 other_dtype = other_series.dtype
8733
8734 this_mask = isna(series)
8735 other_mask = isna(other_series)
8736
8737 # don't overwrite columns unnecessarily
8738 # DO propagate if this column is not in the intersection
8739 if not overwrite and other_mask.all():
8740 result[col] = this[col].copy()
8741 continue
8742
8743 if do_fill:
8744 series = series.copy()
8745 other_series = other_series.copy()
8746 series[this_mask] = fill_value
8747 other_series[other_mask] = fill_value
8748
8749 if col not in self.columns:
8750 # If self DataFrame does not have col in other DataFrame,
8751 # try to promote series, which is all NaN, as other_dtype.
8752 new_dtype = other_dtype
8753 try:
8754 series = series.astype(new_dtype, copy=False)
8755 except ValueError:
8756 # e.g. new_dtype is integer types
8757 pass
8758 else:
8759 # if we have different dtypes, possibly promote
8760 new_dtype = find_common_type([this_dtype, other_dtype])
8761 series = series.astype(new_dtype, copy=False)
8762 other_series = other_series.astype(new_dtype, copy=False)
8763
8764 arr = func(series, other_series)
8765 if isinstance(new_dtype, np.dtype):
8766 # if new_dtype is an EA Dtype, then `func` is expected to return
8767 # the correct dtype without any additional casting
8768 # error: No overload variant of "maybe_downcast_to_dtype" matches
8769 # argument types "Union[Series, Hashable]", "dtype[Any]"
8770 arr = maybe_downcast_to_dtype( # type: ignore[call-overload]
8771 arr, new_dtype
8772 )
8773
8774 result[col] = arr
8775
8776 # convert_objects just in case
8777 frame_result = self._constructor(result, index=new_index, columns=new_columns)
8778 return frame_result.__finalize__(self, method="combine")
8779
8780 def combine_first(self, other: DataFrame) -> DataFrame:
8781 """
8782 Update null elements with value in the same location in `other`.
8783
8784 Combine two DataFrame objects by filling null values in one DataFrame
8785         with non-null values from the other DataFrame. The row and column indexes
8786         of the resulting DataFrame will be the union of the two. When calling
8787         ``first.combine_first(second)``, the resulting dataframe keeps the
8788         values of `first` and overrides the values of `second` wherever both
8789         first.loc[index, col] and second.loc[index, col] are not missing
8790         values.
8791
8792 Parameters
8793 ----------
8794 other : DataFrame
8795 Provided DataFrame to use to fill null values.
8796
8797 Returns
8798 -------
8799 DataFrame
8800 The result of combining the provided DataFrame with the other object.
8801
8802 See Also
8803 --------
8804 DataFrame.combine : Perform series-wise operation on two DataFrames
8805 using a given function.
8806
8807 Examples
8808 --------
8809 >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
8810 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
8811 >>> df1.combine_first(df2)
8812 A B
8813 0 1.0 3.0
8814 1 0.0 4.0
8815
8816         Null values still persist if the location of that null value
8817         does not exist in `other`.
8818
8819 >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]})
8820 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
8821 >>> df1.combine_first(df2)
8822 A B C
8823 0 NaN 4.0 NaN
8824 1 0.0 3.0 1.0
8825 2 NaN 3.0 1.0
8826 """
8827 from pandas.core.computation import expressions
8828
8829 def combiner(x: Series, y: Series):
8830 mask = x.isna()._values
8831
8832 x_values = x._values
8833 y_values = y._values
8834
8835 # If the column y in other DataFrame is not in first DataFrame,
8836 # just return y_values.
8837 if y.name not in self.columns:
8838 return y_values
8839
8840 return expressions.where(mask, y_values, x_values)
8841
8842 if len(other) == 0:
8843 combined = self.reindex(
8844 self.columns.append(other.columns.difference(self.columns)), axis=1
8845 )
8846 combined = combined.astype(other.dtypes)
8847 else:
8848 combined = self.combine(other, combiner, overwrite=False)
8849
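        # Columns shared by both frames may have been upcast during the combine
        # (e.g. int to float when NaNs were introduced); cast those back to the
        # common dtype of the two inputs.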
8850 dtypes = {
8851 col: find_common_type([self.dtypes[col], other.dtypes[col]])
8852 for col in self.columns.intersection(other.columns)
8853 if combined.dtypes[col] != self.dtypes[col]
8854 }
8855
8856 if dtypes:
8857 combined = combined.astype(dtypes)
8858
8859 return combined.__finalize__(self, method="combine_first")
8860
8861 def update(
8862 self,
8863 other,
8864 join: UpdateJoin = "left",
8865 overwrite: bool = True,
8866 filter_func=None,
8867 errors: IgnoreRaise = "ignore",
8868 ) -> None:
8869 """
8870 Modify in place using non-NA values from another DataFrame.
8871
8872 Aligns on indices. There is no return value.
8873
8874 Parameters
8875 ----------
8876 other : DataFrame, or object coercible into a DataFrame
8877 Should have at least one matching index/column label
8878 with the original DataFrame. If a Series is passed,
8879 its name attribute must be set, and that will be
8880 used as the column name to align with the original DataFrame.
8881 join : {'left'}, default 'left'
8882 Only left join is implemented, keeping the index and columns of the
8883 original object.
8884 overwrite : bool, default True
8885 How to handle non-NA values for overlapping keys:
8886
8887 * True: overwrite original DataFrame's values
8888 with values from `other`.
8889 * False: only update values that are NA in
8890 the original DataFrame.
8891
8892 filter_func : callable(1d-array) -> bool 1d-array, optional
8893 Can choose to replace values other than NA. Return True for values
8894 that should be updated.
8895 errors : {'raise', 'ignore'}, default 'ignore'
8896 If 'raise', will raise a ValueError if the DataFrame and `other`
8897 both contain non-NA data in the same place.
8898
8899 Returns
8900 -------
8901 None
8902 This method directly changes calling object.
8903
8904 Raises
8905 ------
8906 ValueError
8907 * When `errors='raise'` and there's overlapping non-NA data.
8908 * When `errors` is not either `'ignore'` or `'raise'`
8909 NotImplementedError
8910 * If `join != 'left'`
8911
8912 See Also
8913 --------
8914 dict.update : Similar method for dictionaries.
8915 DataFrame.merge : For column(s)-on-column(s) operations.
8916
8917 Examples
8918 --------
8919 >>> df = pd.DataFrame({'A': [1, 2, 3],
8920 ... 'B': [400, 500, 600]})
8921 >>> new_df = pd.DataFrame({'B': [4, 5, 6],
8922 ... 'C': [7, 8, 9]})
8923 >>> df.update(new_df)
8924 >>> df
8925 A B
8926 0 1 4
8927 1 2 5
8928 2 3 6
8929
8930         The DataFrame's length does not increase as a result of the update;
8931 only values at matching index/column labels are updated.
8932
8933 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
8934 ... 'B': ['x', 'y', 'z']})
8935 >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
8936 >>> df.update(new_df)
8937 >>> df
8938 A B
8939 0 a d
8940 1 b e
8941 2 c f
8942
8943 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
8944 ... 'B': ['x', 'y', 'z']})
8945 >>> new_df = pd.DataFrame({'B': ['d', 'f']}, index=[0, 2])
8946 >>> df.update(new_df)
8947 >>> df
8948 A B
8949 0 a d
8950 1 b y
8951 2 c f
8952
8953 For Series, its name attribute must be set.
8954
8955 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
8956 ... 'B': ['x', 'y', 'z']})
8957 >>> new_column = pd.Series(['d', 'e', 'f'], name='B')
8958 >>> df.update(new_column)
8959 >>> df
8960 A B
8961 0 a d
8962 1 b e
8963 2 c f
8964
8965         If `other` contains NaNs, the corresponding values are not updated
8966 in the original dataframe.
8967
8968 >>> df = pd.DataFrame({'A': [1, 2, 3],
8969 ... 'B': [400., 500., 600.]})
8970 >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
8971 >>> df.update(new_df)
8972 >>> df
8973 A B
8974 0 1 4.0
8975 1 2 500.0
8976 2 3 6.0
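
        With ``errors='raise'``, overlapping non-NA values raise instead of
        being silently overwritten:

        >>> df = pd.DataFrame({'A': [1, 2, 3]})
        >>> new_df = pd.DataFrame({'A': [4, 5, 6]})
        >>> df.update(new_df, errors='raise')
        Traceback (most recent call last):
        ...
        ValueError: Data overlaps.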
8977 """
8978
8979 if not PYPY and using_copy_on_write():
8980 if sys.getrefcount(self) <= REF_COUNT:
8981 warnings.warn(
8982 _chained_assignment_method_msg,
8983 ChainedAssignmentError,
8984 stacklevel=2,
8985 )
8986 elif not PYPY and not using_copy_on_write() and self._is_view_after_cow_rules():
8987 if sys.getrefcount(self) <= REF_COUNT:
8988 warnings.warn(
8989 _chained_assignment_warning_method_msg,
8990 FutureWarning,
8991 stacklevel=2,
8992 )
8993
8994 # TODO: Support other joins
8995 if join != "left": # pragma: no cover
8996 raise NotImplementedError("Only left join is supported")
8997 if errors not in ["ignore", "raise"]:
8998 raise ValueError("The parameter errors must be either 'ignore' or 'raise'")
8999
9000 if not isinstance(other, DataFrame):
9001 other = DataFrame(other)
9002
9003 other = other.reindex(self.index)
9004
9005 for col in self.columns.intersection(other.columns):
9006 this = self[col]._values
9007 that = other[col]._values
9008
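            # `mask` marks positions where the original value is kept; entries
            # where it is False are replaced with `that` via Series.where below.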
9009 if filter_func is not None:
9010 mask = ~filter_func(this) | isna(that)
9011 else:
9012 if errors == "raise":
9013 mask_this = notna(that)
9014 mask_that = notna(this)
9015 if any(mask_this & mask_that):
9016 raise ValueError("Data overlaps.")
9017
9018 if overwrite:
9019 mask = isna(that)
9020 else:
9021 mask = notna(this)
9022
9023 # don't overwrite columns unnecessarily
9024 if mask.all():
9025 continue
9026
9027 with warnings.catch_warnings():
9028 warnings.filterwarnings(
9029 "ignore",
9030 message="Downcasting behavior",
9031 category=FutureWarning,
9032 )
9033 # GH#57124 - `that` might get upcasted because of NA values, and then
9034 # downcasted in where because of the mask. Ignoring the warning
9035 # is a stopgap, will replace with a new implementation of update
9036 # in 3.0.
9037 self.loc[:, col] = self[col].where(mask, that)
9038
9039 # ----------------------------------------------------------------------
9040 # Data reshaping
9041 @Appender(
9042 dedent(
9043 """
9044 Examples
9045 --------
9046 >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
9047 ... 'Parrot', 'Parrot'],
9048 ... 'Max Speed': [380., 370., 24., 26.]})
9049 >>> df
9050 Animal Max Speed
9051 0 Falcon 380.0
9052 1 Falcon 370.0
9053 2 Parrot 24.0
9054 3 Parrot 26.0
9055 >>> df.groupby(['Animal']).mean()
9056 Max Speed
9057 Animal
9058 Falcon 375.0
9059 Parrot 25.0
9060
9061 **Hierarchical Indexes**
9062
9063 We can groupby different levels of a hierarchical index
9064 using the `level` parameter:
9065
9066 >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
9067 ... ['Captive', 'Wild', 'Captive', 'Wild']]
9068 >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
9069 >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]},
9070 ... index=index)
9071 >>> df
9072 Max Speed
9073 Animal Type
9074 Falcon Captive 390.0
9075 Wild 350.0
9076 Parrot Captive 30.0
9077 Wild 20.0
9078 >>> df.groupby(level=0).mean()
9079 Max Speed
9080 Animal
9081 Falcon 370.0
9082 Parrot 25.0
9083 >>> df.groupby(level="Type").mean()
9084 Max Speed
9085 Type
9086 Captive 210.0
9087 Wild 185.0
9088
9089         We can also choose to include NA in group keys or not by setting the
9090         `dropna` parameter; the default setting is `True`.
9091
9092 >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
9093 >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
9094
9095 >>> df.groupby(by=["b"]).sum()
9096 a c
9097 b
9098 1.0 2 3
9099 2.0 2 5
9100
9101 >>> df.groupby(by=["b"], dropna=False).sum()
9102 a c
9103 b
9104 1.0 2 3
9105 2.0 2 5
9106 NaN 1 4
9107
9108 >>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
9109 >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
9110
9111 >>> df.groupby(by="a").sum()
9112 b c
9113 a
9114 a 13.0 13.0
9115 b 12.3 123.0
9116
9117 >>> df.groupby(by="a", dropna=False).sum()
9118 b c
9119 a
9120 a 13.0 13.0
9121 b 12.3 123.0
9122 NaN 12.3 33.0
9123
9124 When using ``.apply()``, use ``group_keys`` to include or exclude the
9125 group keys. The ``group_keys`` argument defaults to ``True`` (include).
9126
9127 >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
9128 ... 'Parrot', 'Parrot'],
9129 ... 'Max Speed': [380., 370., 24., 26.]})
9130 >>> df.groupby("Animal", group_keys=True)[['Max Speed']].apply(lambda x: x)
9131 Max Speed
9132 Animal
9133 Falcon 0 380.0
9134 1 370.0
9135 Parrot 2 24.0
9136 3 26.0
9137
9138 >>> df.groupby("Animal", group_keys=False)[['Max Speed']].apply(lambda x: x)
9139 Max Speed
9140 0 380.0
9141 1 370.0
9142 2 24.0
9143 3 26.0
9144 """
9145 )
9146 )
9147 @Appender(_shared_docs["groupby"] % _shared_doc_kwargs)
9148 def groupby(
9149 self,
9150 by=None,
9151 axis: Axis | lib.NoDefault = lib.no_default,
9152 level: IndexLabel | None = None,
9153 as_index: bool = True,
9154 sort: bool = True,
9155 group_keys: bool = True,
9156 observed: bool | lib.NoDefault = lib.no_default,
9157 dropna: bool = True,
9158 ) -> DataFrameGroupBy:
9159 if axis is not lib.no_default:
9160 axis = self._get_axis_number(axis)
9161 if axis == 1:
9162 warnings.warn(
9163 "DataFrame.groupby with axis=1 is deprecated. Do "
9164 "`frame.T.groupby(...)` without axis instead.",
9165 FutureWarning,
9166 stacklevel=find_stack_level(),
9167 )
9168 else:
9169 warnings.warn(
9170 "The 'axis' keyword in DataFrame.groupby is deprecated and "
9171 "will be removed in a future version.",
9172 FutureWarning,
9173 stacklevel=find_stack_level(),
9174 )
9175 else:
9176 axis = 0
9177
9178 from pandas.core.groupby.generic import DataFrameGroupBy
9179
9180 if level is None and by is None:
9181 raise TypeError("You have to supply one of 'by' and 'level'")
9182
9183 return DataFrameGroupBy(
9184 obj=self,
9185 keys=by,
9186 axis=axis,
9187 level=level,
9188 as_index=as_index,
9189 sort=sort,
9190 group_keys=group_keys,
9191 observed=observed,
9192 dropna=dropna,
9193 )
9194
9195 _shared_docs[
9196 "pivot"
9197 ] = """
9198 Return reshaped DataFrame organized by given index / column values.
9199
9200 Reshape data (produce a "pivot" table) based on column values. Uses
9201 unique values from specified `index` / `columns` to form axes of the
9202 resulting DataFrame. This function does not support data
9203     aggregation; multiple values will result in a MultiIndex in the
9204 columns. See the :ref:`User Guide <reshaping>` for more on reshaping.
9205
9206 Parameters
9207 ----------%s
9208 columns : str or object or a list of str
9209 Column to use to make new frame's columns.
9210 index : str or object or a list of str, optional
9211 Column to use to make new frame's index. If not given, uses existing index.
9212 values : str, object or a list of the previous, optional
9213 Column(s) to use for populating new frame's values. If not
9214 specified, all remaining columns will be used and the result will
9215 have hierarchically indexed columns.
9216
9217 Returns
9218 -------
9219 DataFrame
9220 Returns reshaped DataFrame.
9221
9222 Raises
9223 ------
9224 ValueError:
9225 When there are any `index`, `columns` combinations with multiple
            values. Use `DataFrame.pivot_table` when you need to aggregate.
9227
9228 See Also
9229 --------
9230 DataFrame.pivot_table : Generalization of pivot that can handle
9231 duplicate values for one index/column pair.
9232 DataFrame.unstack : Pivot based on the index values instead of a
9233 column.
9234 wide_to_long : Wide panel to long format. Less flexible but more
9235 user-friendly than melt.
9236
9237 Notes
9238 -----
9239 For finer-tuned control, see hierarchical indexing documentation along
9240 with the related stack/unstack methods.
9241
9242 Reference :ref:`the user guide <reshaping.pivot>` for more examples.
9243
9244 Examples
9245 --------
9246 >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
9247 ... 'two'],
9248 ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
9249 ... 'baz': [1, 2, 3, 4, 5, 6],
9250 ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
9251 >>> df
9252 foo bar baz zoo
9253 0 one A 1 x
9254 1 one B 2 y
9255 2 one C 3 z
9256 3 two A 4 q
9257 4 two B 5 w
9258 5 two C 6 t
9259
9260 >>> df.pivot(index='foo', columns='bar', values='baz')
9261 bar A B C
9262 foo
9263 one 1 2 3
9264 two 4 5 6
9265
9266 >>> df.pivot(index='foo', columns='bar')['baz']
9267 bar A B C
9268 foo
9269 one 1 2 3
9270 two 4 5 6
9271
9272 >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
9273 baz zoo
9274 bar A B C A B C
9275 foo
9276 one 1 2 3 x y z
9277 two 4 5 6 q w t
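
        When ``index`` is not given, the existing index is used. As a small
        sketch, making ``'foo'`` the index first gives the same result as
        passing ``index='foo'`` above:

        >>> df.set_index('foo').pivot(columns='bar', values='baz')
        bar  A  B  C
        foo
        one  1  2  3
        two  4  5  6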
9278
9279 You could also assign a list of column names or a list of index names.
9280
9281 >>> df = pd.DataFrame({
9282 ... "lev1": [1, 1, 1, 2, 2, 2],
9283 ... "lev2": [1, 1, 2, 1, 1, 2],
9284 ... "lev3": [1, 2, 1, 2, 1, 2],
9285 ... "lev4": [1, 2, 3, 4, 5, 6],
9286 ... "values": [0, 1, 2, 3, 4, 5]})
9287 >>> df
9288 lev1 lev2 lev3 lev4 values
9289 0 1 1 1 1 0
9290 1 1 1 2 2 1
9291 2 1 2 1 3 2
9292 3 2 1 2 4 3
9293 4 2 1 1 5 4
9294 5 2 2 2 6 5
9295
9296 >>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values")
9297 lev2 1 2
9298 lev3 1 2 1 2
9299 lev1
9300 1 0.0 1.0 2.0 NaN
9301 2 4.0 3.0 NaN 5.0
9302
9303 >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values")
9304 lev3 1 2
9305 lev1 lev2
9306 1 1 0.0 1.0
9307 2 2.0 NaN
9308 2 1 4.0 3.0
9309 2 NaN 5.0
9310
9311 A ValueError is raised if there are any duplicates.
9312
9313 >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
9314 ... "bar": ['A', 'A', 'B', 'C'],
9315 ... "baz": [1, 2, 3, 4]})
9316 >>> df
9317 foo bar baz
9318 0 one A 1
9319 1 one A 2
9320 2 two B 3
9321 3 two C 4
9322
9323 Notice that the first two rows are the same for our `index`
9324 and `columns` arguments.
9325
9326 >>> df.pivot(index='foo', columns='bar', values='baz')
9327 Traceback (most recent call last):
9328 ...
9329 ValueError: Index contains duplicate entries, cannot reshape
9330 """
9331
9332 @Substitution("")
9333 @Appender(_shared_docs["pivot"])
9334 def pivot(
9335 self, *, columns, index=lib.no_default, values=lib.no_default
9336 ) -> DataFrame:
9337 from pandas.core.reshape.pivot import pivot
9338
9339 return pivot(self, index=index, columns=columns, values=values)
9340
9341 _shared_docs[
9342 "pivot_table"
9343 ] = """
9344 Create a spreadsheet-style pivot table as a DataFrame.
9345
9346 The levels in the pivot table will be stored in MultiIndex objects
9347 (hierarchical indexes) on the index and columns of the result DataFrame.
9348
9349 Parameters
9350 ----------%s
9351 values : list-like or scalar, optional
9352 Column or columns to aggregate.
9353 index : column, Grouper, array, or list of the previous
9354 Keys to group by on the pivot table index. If a list is passed,
9355 it can contain any of the other types (except list). If an array is
9356 passed, it must be the same length as the data and will be used in
9357 the same manner as column values.
9358 columns : column, Grouper, array, or list of the previous
9359 Keys to group by on the pivot table column. If a list is passed,
9360 it can contain any of the other types (except list). If an array is
9361 passed, it must be the same length as the data and will be used in
9362 the same manner as column values.
9363 aggfunc : function, list of functions, dict, default "mean"
9364 If a list of functions is passed, the resulting pivot table will have
9365 hierarchical columns whose top level are the function names
9366 (inferred from the function objects themselves).
9367 If a dict is passed, the key is column to aggregate and the value is
            function or list of functions. If ``margins=True``, aggfunc will be
9369 used to calculate the partial aggregates.
9370 fill_value : scalar, default None
9371 Value to replace missing values with (in the resulting pivot table,
9372 after aggregation).
9373 margins : bool, default False
9374 If ``margins=True``, special ``All`` columns and rows
9375 will be added with partial group aggregates across the categories
9376 on the rows and columns.
9377 dropna : bool, default True
9378 Do not include columns whose entries are all NaN. If True,
9379 rows with a NaN value in any column will be omitted before
9380 computing margins.
9381 margins_name : str, default 'All'
9382 Name of the row / column that will contain the totals
9383 when margins is True.
9384 observed : bool, default False
9385 This only applies if any of the groupers are Categoricals.
9386 If True: only show observed values for categorical groupers.
9387 If False: show all values for categorical groupers.
9388
9389 .. deprecated:: 2.2.0
9390
9391 The default value of ``False`` is deprecated and will change to
9392 ``True`` in a future version of pandas.
9393
9394 sort : bool, default True
9395 Specifies if the result should be sorted.
9396
9397 .. versionadded:: 1.3.0
9398
9399 Returns
9400 -------
9401 DataFrame
9402 An Excel style pivot table.
9403
9404 See Also
9405 --------
9406 DataFrame.pivot : Pivot without aggregation that can handle
9407 non-numeric data.
        DataFrame.melt : Unpivot a DataFrame from wide to long format,
9409 optionally leaving identifiers set.
9410 wide_to_long : Wide panel to long format. Less flexible but more
9411 user-friendly than melt.
9412
9413 Notes
9414 -----
9415 Reference :ref:`the user guide <reshaping.pivot>` for more examples.
9416
9417 Examples
9418 --------
9419 >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
9420 ... "bar", "bar", "bar", "bar"],
9421 ... "B": ["one", "one", "one", "two", "two",
9422 ... "one", "one", "two", "two"],
9423 ... "C": ["small", "large", "large", "small",
9424 ... "small", "large", "small", "small",
9425 ... "large"],
9426 ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
9427 ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
9428 >>> df
9429 A B C D E
9430 0 foo one small 1 2
9431 1 foo one large 2 4
9432 2 foo one large 2 5
9433 3 foo two small 3 5
9434 4 foo two small 3 6
9435 5 bar one large 4 6
9436 6 bar one small 5 8
9437 7 bar two small 6 9
9438 8 bar two large 7 9
9439
9440 This first example aggregates values by taking the sum.
9441
9442 >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
9443 ... columns=['C'], aggfunc="sum")
9444 >>> table
9445 C large small
9446 A B
9447 bar one 4.0 5.0
9448 two 7.0 6.0
9449 foo one 4.0 1.0
9450 two NaN 6.0
9451
9452 We can also fill missing values using the `fill_value` parameter.
9453
9454 >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
9455 ... columns=['C'], aggfunc="sum", fill_value=0)
9456 >>> table
9457 C large small
9458 A B
9459 bar one 4 5
9460 two 7 6
9461 foo one 4 1
9462 two 0 6
9463
9464 The next example aggregates by taking the mean across multiple columns.
9465
9466 >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
9467 ... aggfunc={'D': "mean", 'E': "mean"})
9468 >>> table
9469 D E
9470 A C
9471 bar large 5.500000 7.500000
9472 small 5.500000 8.500000
9473 foo large 2.000000 4.500000
9474 small 2.333333 4.333333
9475
9476 We can also calculate multiple types of aggregations for any given
9477 value column.
9478
9479 >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
9480 ... aggfunc={'D': "mean",
9481 ... 'E': ["min", "max", "mean"]})
9482 >>> table
9483 D E
9484 mean max mean min
9485 A C
9486 bar large 5.500000 9 7.500000 6
9487 small 5.500000 9 8.500000 8
9488 foo large 2.000000 5 4.500000 4
9489 small 2.333333 6 4.333333 2
9490 """
9491
9492 @Substitution("")
9493 @Appender(_shared_docs["pivot_table"])
9494 def pivot_table(
9495 self,
9496 values=None,
9497 index=None,
9498 columns=None,
9499 aggfunc: AggFuncType = "mean",
9500 fill_value=None,
9501 margins: bool = False,
9502 dropna: bool = True,
9503 margins_name: Level = "All",
9504 observed: bool | lib.NoDefault = lib.no_default,
9505 sort: bool = True,
9506 ) -> DataFrame:
9507 from pandas.core.reshape.pivot import pivot_table
9508
9509 return pivot_table(
9510 self,
9511 values=values,
9512 index=index,
9513 columns=columns,
9514 aggfunc=aggfunc,
9515 fill_value=fill_value,
9516 margins=margins,
9517 dropna=dropna,
9518 margins_name=margins_name,
9519 observed=observed,
9520 sort=sort,
9521 )
9522
9523 def stack(
9524 self,
9525 level: IndexLabel = -1,
9526 dropna: bool | lib.NoDefault = lib.no_default,
9527 sort: bool | lib.NoDefault = lib.no_default,
9528 future_stack: bool = False,
9529 ):
9530 """
9531 Stack the prescribed level(s) from columns to index.
9532
9533 Return a reshaped DataFrame or Series having a multi-level
9534 index with one or more new inner-most levels compared to the current
9535 DataFrame. The new inner-most levels are created by pivoting the
9536 columns of the current dataframe:
9537
9538 - if the columns have a single level, the output is a Series;
9539 - if the columns have multiple levels, the new index
          level(s) are taken from the prescribed level(s) and
9541 the output is a DataFrame.
9542
9543 Parameters
9544 ----------
9545 level : int, str, list, default -1
9546 Level(s) to stack from the column axis onto the index
9547 axis, defined as one index or label, or a list of indices
9548 or labels.
9549 dropna : bool, default True
9550 Whether to drop rows in the resulting Frame/Series with
9551 missing values. Stacking a column level onto the index
9552 axis can create combinations of index and column values
9553 that are missing from the original dataframe. See Examples
9554 section.
9555 sort : bool, default True
9556 Whether to sort the levels of the resulting MultiIndex.
9557 future_stack : bool, default False
9558 Whether to use the new implementation that will replace the current
9559 implementation in pandas 3.0. When True, dropna and sort have no impact
9560 on the result and must remain unspecified. See :ref:`pandas 2.1.0 Release
9561 notes <whatsnew_210.enhancements.new_stack>` for more details.
9562
9563 Returns
9564 -------
9565 DataFrame or Series
9566 Stacked dataframe or series.
9567
9568 See Also
9569 --------
9570 DataFrame.unstack : Unstack prescribed level(s) from index axis
9571 onto column axis.
9572 DataFrame.pivot : Reshape dataframe from long format to wide
9573 format.
9574 DataFrame.pivot_table : Create a spreadsheet-style pivot table
9575 as a DataFrame.
9576
9577 Notes
9578 -----
9579 The function is named by analogy with a collection of books
9580 being reorganized from being side by side on a horizontal
9581 position (the columns of the dataframe) to being stacked
9582 vertically on top of each other (in the index of the
9583 dataframe).
9584
9585 Reference :ref:`the user guide <reshaping.stacking>` for more examples.
9586
9587 Examples
9588 --------
9589 **Single level columns**
9590
9591 >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
9592 ... index=['cat', 'dog'],
9593 ... columns=['weight', 'height'])
9594
9595 Stacking a dataframe with a single level column axis returns a Series:
9596
9597 >>> df_single_level_cols
9598 weight height
9599 cat 0 1
9600 dog 2 3
9601 >>> df_single_level_cols.stack(future_stack=True)
9602 cat weight 0
9603 height 1
9604 dog weight 2
9605 height 3
9606 dtype: int64
9607
9608 **Multi level columns: simple case**
9609
9610 >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
9611 ... ('weight', 'pounds')])
9612 >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],
9613 ... index=['cat', 'dog'],
9614 ... columns=multicol1)
9615
9616 Stacking a dataframe with a multi-level column axis:
9617
9618 >>> df_multi_level_cols1
9619 weight
9620 kg pounds
9621 cat 1 2
9622 dog 2 4
9623 >>> df_multi_level_cols1.stack(future_stack=True)
9624 weight
9625 cat kg 1
9626 pounds 2
9627 dog kg 2
9628 pounds 4
9629
9630 **Missing values**
9631
9632 >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
9633 ... ('height', 'm')])
9634 >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
9635 ... index=['cat', 'dog'],
9636 ... columns=multicol2)
9637
9638 It is common to have missing values when stacking a dataframe
9639 with multi-level columns, as the stacked dataframe typically
9640 has more values than the original dataframe. Missing values
9641 are filled with NaNs:
9642
9643 >>> df_multi_level_cols2
9644 weight height
9645 kg m
9646 cat 1.0 2.0
9647 dog 3.0 4.0
9648 >>> df_multi_level_cols2.stack(future_stack=True)
9649 weight height
9650 cat kg 1.0 NaN
9651 m NaN 2.0
9652 dog kg 3.0 NaN
9653 m NaN 4.0
9654
9655 **Prescribing the level(s) to be stacked**
9656
9657 The first parameter controls which level or levels are stacked:
9658
9659 >>> df_multi_level_cols2.stack(0, future_stack=True)
9660 kg m
9661 cat weight 1.0 NaN
9662 height NaN 2.0
9663 dog weight 3.0 NaN
9664 height NaN 4.0
9665 >>> df_multi_level_cols2.stack([0, 1], future_stack=True)
9666 cat weight kg 1.0
9667 height m 2.0
9668 dog weight kg 3.0
9669 height m 4.0
9670 dtype: float64
9671 """
9672 if not future_stack:
9673 from pandas.core.reshape.reshape import (
9674 stack,
9675 stack_multiple,
9676 )
9677
9678 if (
9679 dropna is not lib.no_default
9680 or sort is not lib.no_default
9681 or self.columns.nlevels > 1
9682 ):
9683 warnings.warn(
9684 "The previous implementation of stack is deprecated and will be "
9685 "removed in a future version of pandas. See the What's New notes "
9686 "for pandas 2.1.0 for details. Specify future_stack=True to adopt "
9687 "the new implementation and silence this warning.",
9688 FutureWarning,
9689 stacklevel=find_stack_level(),
9690 )
9691
9692 if dropna is lib.no_default:
9693 dropna = True
9694 if sort is lib.no_default:
9695 sort = True
9696
9697 if isinstance(level, (tuple, list)):
9698 result = stack_multiple(self, level, dropna=dropna, sort=sort)
9699 else:
9700 result = stack(self, level, dropna=dropna, sort=sort)
9701 else:
9702 from pandas.core.reshape.reshape import stack_v3
9703
9704 if dropna is not lib.no_default:
9705 raise ValueError(
9706 "dropna must be unspecified with future_stack=True as the new "
9707 "implementation does not introduce rows of NA values. This "
9708 "argument will be removed in a future version of pandas."
9709 )
9710
9711 if sort is not lib.no_default:
9712 raise ValueError(
9713 "Cannot specify sort with future_stack=True, this argument will be "
9714 "removed in a future version of pandas. Sort the result using "
9715 ".sort_index instead."
9716 )
9717
9718 if (
9719 isinstance(level, (tuple, list))
9720 and not all(lev in self.columns.names for lev in level)
9721 and not all(isinstance(lev, int) for lev in level)
9722 ):
9723 raise ValueError(
9724 "level should contain all level names or all level "
9725 "numbers, not a mixture of the two."
9726 )
9727
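            # Translate ``level`` into a list of positional level numbers
            # before dispatching to the new implementation.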
9728 if not isinstance(level, (tuple, list)):
9729 level = [level]
9730 level = [self.columns._get_level_number(lev) for lev in level]
9731 result = stack_v3(self, level)
9732
9733 return result.__finalize__(self, method="stack")
9734
9735 def explode(
9736 self,
9737 column: IndexLabel,
9738 ignore_index: bool = False,
9739 ) -> DataFrame:
9740 """
9741 Transform each element of a list-like to a row, replicating index values.
9742
9743 Parameters
9744 ----------
9745 column : IndexLabel
9746 Column(s) to explode.
            For multiple columns, specify a non-empty list in which each
            element is a str or tuple, and the list-like data in all
            specified columns must have matching lengths on each row of
            the frame.
9750
9751 .. versionadded:: 1.3.0
9752 Multi-column explode
9753
9754 ignore_index : bool, default False
9755 If True, the resulting index will be labeled 0, 1, …, n - 1.
9756
9757 Returns
9758 -------
9759 DataFrame
9760 Exploded lists to rows of the subset columns;
9761 index will be duplicated for these rows.
9762
9763 Raises
9764 ------
9765 ValueError :
9766 * If columns of the frame are not unique.
            * If the specified columns to explode form an empty list.
            * If the specified columns to explode do not have matching
              counts of elements rowwise in the frame.
9770
9771 See Also
9772 --------
9773 DataFrame.unstack : Pivot a level of the (necessarily hierarchical)
9774 index labels.
9775 DataFrame.melt : Unpivot a DataFrame from wide format to long format.
        Series.explode : Transform each element of a list-like to a row.
9777
9778 Notes
9779 -----
9780 This routine will explode list-likes including lists, tuples, sets,
9781 Series, and np.ndarray. The result dtype of the subset rows will
9782 be object. Scalars will be returned unchanged, and empty list-likes will
9783 result in a np.nan for that row. In addition, the ordering of rows in the
9784 output will be non-deterministic when exploding sets.
9785
9786 Reference :ref:`the user guide <reshaping.explode>` for more examples.
9787
9788 Examples
9789 --------
9790 >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]],
9791 ... 'B': 1,
9792 ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]})
9793 >>> df
9794 A B C
9795 0 [0, 1, 2] 1 [a, b, c]
9796 1 foo 1 NaN
9797 2 [] 1 []
9798 3 [3, 4] 1 [d, e]
9799
9800 Single-column explode.
9801
9802 >>> df.explode('A')
9803 A B C
9804 0 0 1 [a, b, c]
9805 0 1 1 [a, b, c]
9806 0 2 1 [a, b, c]
9807 1 foo 1 NaN
9808 2 NaN 1 []
9809 3 3 1 [d, e]
9810 3 4 1 [d, e]
9811
9812 Multi-column explode.
9813
9814 >>> df.explode(list('AC'))
9815 A B C
9816 0 0 1 a
9817 0 1 1 b
9818 0 2 1 c
9819 1 foo 1 NaN
9820 2 NaN 1 NaN
9821 3 3 1 d
9822 3 4 1 e
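
        With ``ignore_index=True`` the exploded rows are relabeled with a
        fresh ``RangeIndex`` (a quick sketch reusing the frame above):

        >>> df.explode(list('AC'), ignore_index=True)
             A  B    C
        0    0  1    a
        1    1  1    b
        2    2  1    c
        3  foo  1  NaN
        4  NaN  1  NaN
        5    3  1    d
        6    4  1    e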
9823 """
9824 if not self.columns.is_unique:
9825 duplicate_cols = self.columns[self.columns.duplicated()].tolist()
9826 raise ValueError(
9827 f"DataFrame columns must be unique. Duplicate columns: {duplicate_cols}"
9828 )
9829
9830 columns: list[Hashable]
9831 if is_scalar(column) or isinstance(column, tuple):
9832 columns = [column]
9833 elif isinstance(column, list) and all(
9834 is_scalar(c) or isinstance(c, tuple) for c in column
9835 ):
9836 if not column:
9837 raise ValueError("column must be nonempty")
9838 if len(column) > len(set(column)):
9839 raise ValueError("column must be unique")
9840 columns = column
9841 else:
9842 raise ValueError("column must be a scalar, tuple, or list thereof")
9843
9844 df = self.reset_index(drop=True)
9845 if len(columns) == 1:
9846 result = df[columns[0]].explode()
9847 else:
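            # For a multi-column explode, every listed column must hold
            # list-likes of the same length on each row (scalars and empty
            # list-likes count as length 1); otherwise rows cannot be aligned.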
9848 mylen = lambda x: len(x) if (is_list_like(x) and len(x) > 0) else 1
9849 counts0 = self[columns[0]].apply(mylen)
9850 for c in columns[1:]:
9851 if not all(counts0 == self[c].apply(mylen)):
9852 raise ValueError("columns must have matching element counts")
9853 result = DataFrame({c: df[c].explode() for c in columns})
9854 result = df.drop(columns, axis=1).join(result)
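        # Map the exploded positions back to the original index labels unless
        # the caller asked for a fresh RangeIndex via ``ignore_index=True``.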
9855 if ignore_index:
9856 result.index = default_index(len(result))
9857 else:
9858 result.index = self.index.take(result.index)
9859 result = result.reindex(columns=self.columns, copy=False)
9860
9861 return result.__finalize__(self, method="explode")
9862
9863 def unstack(self, level: IndexLabel = -1, fill_value=None, sort: bool = True):
9864 """
9865 Pivot a level of the (necessarily hierarchical) index labels.
9866
9867 Returns a DataFrame having a new level of column labels whose inner-most level
9868 consists of the pivoted index labels.
9869
9870 If the index is not a MultiIndex, the output will be a Series
9871 (the analogue of stack when the columns are not a MultiIndex).
9872
9873 Parameters
9874 ----------
9875 level : int, str, or list of these, default -1 (last level)
9876 Level(s) of index to unstack, can pass level name.
9877 fill_value : int, str or dict
9878 Replace NaN with this value if the unstack produces missing values.
9879 sort : bool, default True
9880 Sort the level(s) in the resulting MultiIndex columns.
9881
9882 Returns
9883 -------
9884 Series or DataFrame
9885
9886 See Also
9887 --------
9888 DataFrame.pivot : Pivot a table based on column values.
9889 DataFrame.stack : Pivot a level of the column labels (inverse operation
9890 from `unstack`).
9891
9892 Notes
9893 -----
9894 Reference :ref:`the user guide <reshaping.stacking>` for more examples.
9895
9896 Examples
9897 --------
9898 >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
9899 ... ('two', 'a'), ('two', 'b')])
9900 >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
9901 >>> s
9902 one a 1.0
9903 b 2.0
9904 two a 3.0
9905 b 4.0
9906 dtype: float64
9907
9908 >>> s.unstack(level=-1)
9909 a b
9910 one 1.0 2.0
9911 two 3.0 4.0
9912
9913 >>> s.unstack(level=0)
9914 one two
9915 a 1.0 3.0
9916 b 2.0 4.0
9917
9918 >>> df = s.unstack(level=0)
9919 >>> df.unstack()
9920 one a 1.0
9921 b 2.0
9922 two a 3.0
9923 b 4.0
9924 dtype: float64
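
        ``fill_value`` replaces the holes that the reshape would otherwise
        leave as NaN. A minimal sketch, reusing only the first three rows of
        ``s`` so that one combination is missing:

        >>> s.iloc[:3].unstack(fill_value=0.0)
               a    b
        one  1.0  2.0
        two  3.0  0.0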
9925 """
9926 from pandas.core.reshape.reshape import unstack
9927
9928 result = unstack(self, level, fill_value, sort)
9929
9930 return result.__finalize__(self, method="unstack")
9931
9932 @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"})
9933 def melt(
9934 self,
9935 id_vars=None,
9936 value_vars=None,
9937 var_name=None,
9938 value_name: Hashable = "value",
9939 col_level: Level | None = None,
9940 ignore_index: bool = True,
9941 ) -> DataFrame:
9942 return melt(
9943 self,
9944 id_vars=id_vars,
9945 value_vars=value_vars,
9946 var_name=var_name,
9947 value_name=value_name,
9948 col_level=col_level,
9949 ignore_index=ignore_index,
9950 ).__finalize__(self, method="melt")
9951
9952 # ----------------------------------------------------------------------
9953 # Time series-related
9954
9955 @doc(
9956 Series.diff,
9957 klass="DataFrame",
9958 extra_params="axis : {0 or 'index', 1 or 'columns'}, default 0\n "
9959 "Take difference over rows (0) or columns (1).\n",
9960 other_klass="Series",
9961 examples=dedent(
9962 """
9963 Difference with previous row
9964
9965 >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
9966 ... 'b': [1, 1, 2, 3, 5, 8],
9967 ... 'c': [1, 4, 9, 16, 25, 36]})
9968 >>> df
9969 a b c
9970 0 1 1 1
9971 1 2 1 4
9972 2 3 2 9
9973 3 4 3 16
9974 4 5 5 25
9975 5 6 8 36
9976
9977 >>> df.diff()
9978 a b c
9979 0 NaN NaN NaN
9980 1 1.0 0.0 3.0
9981 2 1.0 1.0 5.0
9982 3 1.0 1.0 7.0
9983 4 1.0 2.0 9.0
9984 5 1.0 3.0 11.0
9985
9986 Difference with previous column
9987
9988 >>> df.diff(axis=1)
9989 a b c
9990 0 NaN 0 0
9991 1 NaN -1 3
9992 2 NaN -1 7
9993 3 NaN -1 13
9994 4 NaN 0 20
9995 5 NaN 2 28
9996
9997 Difference with 3rd previous row
9998
9999 >>> df.diff(periods=3)
10000 a b c
10001 0 NaN NaN NaN
10002 1 NaN NaN NaN
10003 2 NaN NaN NaN
10004 3 3.0 2.0 15.0
10005 4 3.0 4.0 21.0
10006 5 3.0 6.0 27.0
10007
10008 Difference with following row
10009
10010 >>> df.diff(periods=-1)
10011 a b c
10012 0 -1.0 0.0 -3.0
10013 1 -1.0 -1.0 -5.0
10014 2 -1.0 -1.0 -7.0
10015 3 -1.0 -2.0 -9.0
10016 4 -1.0 -3.0 -11.0
10017 5 NaN NaN NaN
10018
10019 Overflow in input dtype
10020
10021 >>> df = pd.DataFrame({'a': [1, 0]}, dtype=np.uint8)
10022 >>> df.diff()
10023 a
10024 0 NaN
10025 1 255.0"""
10026 ),
10027 )
10028 def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame:
10029 if not lib.is_integer(periods):
10030 if not (is_float(periods) and periods.is_integer()):
10031 raise ValueError("periods must be an integer")
10032 periods = int(periods)
10033
10034 axis = self._get_axis_number(axis)
10035 if axis == 1:
10036 if periods != 0:
                # in the periods == 0 case, this is equivalent to a diff of 0 periods
10038 # along axis=0, and the Manager method may be somewhat more
10039 # performant, so we dispatch in that case.
10040 return self - self.shift(periods, axis=axis)
10041 # With periods=0 this is equivalent to a diff with axis=0
10042 axis = 0
10043
10044 new_data = self._mgr.diff(n=periods)
10045 res_df = self._constructor_from_mgr(new_data, axes=new_data.axes)
10046 return res_df.__finalize__(self, "diff")
10047
10048 # ----------------------------------------------------------------------
10049 # Function application
10050
10051 def _gotitem(
10052 self,
10053 key: IndexLabel,
10054 ndim: int,
10055 subset: DataFrame | Series | None = None,
10056 ) -> DataFrame | Series:
10057 """
10058 Sub-classes to define. Return a sliced object.
10059
10060 Parameters
10061 ----------
10062 key : string / list of selections
10063 ndim : {1, 2}
10064 requested ndim of result
10065 subset : object, default None
10066 subset to act on
10067 """
10068 if subset is None:
10069 subset = self
10070 elif subset.ndim == 1: # is Series
10071 return subset
10072
10073 # TODO: _shallow_copy(subset)?
10074 return subset[key]
10075
10076 _agg_see_also_doc = dedent(
10077 """
10078 See Also
10079 --------
10080 DataFrame.apply : Perform any type of operations.
10081 DataFrame.transform : Perform transformation type operations.
10082 pandas.DataFrame.groupby : Perform operations over groups.
10083 pandas.DataFrame.resample : Perform operations over resampled bins.
10084 pandas.DataFrame.rolling : Perform operations over rolling window.
10085 pandas.DataFrame.expanding : Perform operations over expanding window.
10086 pandas.core.window.ewm.ExponentialMovingWindow : Perform operation over exponential
10087 weighted window.
10088 """
10089 )
10090
10091 _agg_examples_doc = dedent(
10092 """
10093 Examples
10094 --------
10095 >>> df = pd.DataFrame([[1, 2, 3],
10096 ... [4, 5, 6],
10097 ... [7, 8, 9],
10098 ... [np.nan, np.nan, np.nan]],
10099 ... columns=['A', 'B', 'C'])
10100
10101 Aggregate these functions over the rows.
10102
10103 >>> df.agg(['sum', 'min'])
10104 A B C
10105 sum 12.0 15.0 18.0
10106 min 1.0 2.0 3.0
10107
10108 Different aggregations per column.
10109
10110 >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
10111 A B
10112 sum 12.0 NaN
10113 min 1.0 2.0
10114 max NaN 8.0
10115
10116 Aggregate different functions over the columns and rename the index of the resulting
10117 DataFrame.
10118
10119 >>> df.agg(x=('A', 'max'), y=('B', 'min'), z=('C', 'mean'))
10120 A B C
10121 x 7.0 NaN NaN
10122 y NaN 2.0 NaN
10123 z NaN NaN 6.0
10124
10125 Aggregate over the columns.
10126
10127 >>> df.agg("mean", axis="columns")
10128 0 2.0
10129 1 5.0
10130 2 8.0
10131 3 NaN
10132 dtype: float64
10133 """
10134 )
10135
10136 @doc(
10137 _shared_docs["aggregate"],
10138 klass=_shared_doc_kwargs["klass"],
10139 axis=_shared_doc_kwargs["axis"],
10140 see_also=_agg_see_also_doc,
10141 examples=_agg_examples_doc,
10142 )
10143 def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs):
10144 from pandas.core.apply import frame_apply
10145
10146 axis = self._get_axis_number(axis)
10147
10148 op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
10149 result = op.agg()
10150 result = reconstruct_and_relabel_result(result, func, **kwargs)
10151 return result
10152
10153 agg = aggregate
10154
10155 @doc(
10156 _shared_docs["transform"],
10157 klass=_shared_doc_kwargs["klass"],
10158 axis=_shared_doc_kwargs["axis"],
10159 )
10160 def transform(
10161 self, func: AggFuncType, axis: Axis = 0, *args, **kwargs
10162 ) -> DataFrame:
10163 from pandas.core.apply import frame_apply
10164
10165 op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
10166 result = op.transform()
10167 assert isinstance(result, DataFrame)
10168 return result
10169
10170 def apply(
10171 self,
10172 func: AggFuncType,
10173 axis: Axis = 0,
10174 raw: bool = False,
10175 result_type: Literal["expand", "reduce", "broadcast"] | None = None,
10176 args=(),
10177 by_row: Literal[False, "compat"] = "compat",
10178 engine: Literal["python", "numba"] = "python",
10179 engine_kwargs: dict[str, bool] | None = None,
10180 **kwargs,
10181 ):
10182 """
10183 Apply a function along an axis of the DataFrame.
10184
10185 Objects passed to the function are Series objects whose index is
10186 either the DataFrame's index (``axis=0``) or the DataFrame's columns
10187 (``axis=1``). By default (``result_type=None``), the final return type
10188 is inferred from the return type of the applied function. Otherwise,
10189 it depends on the `result_type` argument.
10190
10191 Parameters
10192 ----------
10193 func : function
10194 Function to apply to each column or row.
10195 axis : {0 or 'index', 1 or 'columns'}, default 0
10196 Axis along which the function is applied:
10197
10198 * 0 or 'index': apply function to each column.
10199 * 1 or 'columns': apply function to each row.
10200
10201 raw : bool, default False
10202 Determines if row or column is passed as a Series or ndarray object:
10203
10204 * ``False`` : passes each row or column as a Series to the
10205 function.
10206 * ``True`` : the passed function will receive ndarray objects
10207 instead.
10208 If you are just applying a NumPy reduction function this will
10209 achieve much better performance.
10210
10211 result_type : {'expand', 'reduce', 'broadcast', None}, default None
10212 These only act when ``axis=1`` (columns):
10213
10214 * 'expand' : list-like results will be turned into columns.
10215 * 'reduce' : returns a Series if possible rather than expanding
10216 list-like results. This is the opposite of 'expand'.
10217 * 'broadcast' : results will be broadcast to the original shape
10218 of the DataFrame, the original index and columns will be
10219 retained.
10220
10221 The default behaviour (None) depends on the return value of the
10222 applied function: list-like results will be returned as a Series
            of those. However, if the apply function returns a Series these
10224 are expanded to columns.
10225 args : tuple
10226 Positional arguments to pass to `func` in addition to the
10227 array/series.
10228 by_row : False or "compat", default "compat"
10229 Only has an effect when ``func`` is a listlike or dictlike of funcs
10230 and the func isn't a string.
10231 If "compat", will if possible first translate the func into pandas
10232 methods (e.g. ``Series().apply(np.sum)`` will be translated to
            ``Series().sum()``). If that doesn't work, will try to call apply
            again with ``by_row=True``; if that fails, will call apply again
            with ``by_row=False`` (backward compatible).
10236 If False, the funcs will be passed the whole Series at once.
10237
10238 .. versionadded:: 2.1.0
10239
10240 engine : {'python', 'numba'}, default 'python'
10241 Choose between the python (default) engine or the numba engine in apply.
10242
10243 The numba engine will attempt to JIT compile the passed function,
10244 which may result in speedups for large DataFrames.
10245 It also supports the following engine_kwargs :
10246
10247 - nopython (compile the function in nopython mode)
10248 - nogil (release the GIL inside the JIT compiled function)
10249 - parallel (try to apply the function in parallel over the DataFrame)
10250
10251 Note: Due to limitations within numba/how pandas interfaces with numba,
10252 you should only use this if raw=True
10253
10254 Note: The numba compiler only supports a subset of
10255 valid Python/numpy operations.
10256
10257 Please read more about the `supported python features
10258 <https://numba.pydata.org/numba-doc/dev/reference/pysupported.html>`_
10259 and `supported numpy features
10260 <https://numba.pydata.org/numba-doc/dev/reference/numpysupported.html>`_
10261 in numba to learn what you can or cannot use in the passed function.
10262
10263 .. versionadded:: 2.2.0
10264
10265 engine_kwargs : dict
10266 Pass keyword arguments to the engine.
10267 This is currently only used by the numba engine,
10268 see the documentation for the engine argument for more information.
10269 **kwargs
10270 Additional keyword arguments to pass as keywords arguments to
10271 `func`.
10272
10273 Returns
10274 -------
10275 Series or DataFrame
10276 Result of applying ``func`` along the given axis of the
10277 DataFrame.
10278
10279 See Also
10280 --------
10281 DataFrame.map: For elementwise operations.
10282 DataFrame.aggregate: Only perform aggregating type operations.
10283 DataFrame.transform: Only perform transforming type operations.
10284
10285 Notes
10286 -----
10287 Functions that mutate the passed object can produce unexpected
10288 behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
10289 for more details.
10290
10291 Examples
10292 --------
10293 >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
10294 >>> df
10295 A B
10296 0 4 9
10297 1 4 9
10298 2 4 9
10299
10300 Using a numpy universal function (in this case the same as
10301 ``np.sqrt(df)``):
10302
10303 >>> df.apply(np.sqrt)
10304 A B
10305 0 2.0 3.0
10306 1 2.0 3.0
10307 2 2.0 3.0
10308
10309 Using a reducing function on either axis
10310
10311 >>> df.apply(np.sum, axis=0)
10312 A 12
10313 B 27
10314 dtype: int64
10315
10316 >>> df.apply(np.sum, axis=1)
10317 0 13
10318 1 13
10319 2 13
10320 dtype: int64
10321
10322 Returning a list-like will result in a Series
10323
10324 >>> df.apply(lambda x: [1, 2], axis=1)
10325 0 [1, 2]
10326 1 [1, 2]
10327 2 [1, 2]
10328 dtype: object
10329
10330 Passing ``result_type='expand'`` will expand list-like results
10331 to columns of a Dataframe
10332
10333 >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
10334 0 1
10335 0 1 2
10336 1 1 2
10337 2 1 2
10338
10339 Returning a Series inside the function is similar to passing
10340 ``result_type='expand'``. The resulting column names
10341 will be the Series index.
10342
10343 >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
10344 foo bar
10345 0 1 2
10346 1 1 2
10347 2 1 2
10348
10349 Passing ``result_type='broadcast'`` will ensure the same shape
10350 result, whether list-like or scalar is returned by the function,
10351 and broadcast it along the axis. The resulting column names will
10352 be the originals.
10353
10354 >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
10355 A B
10356 0 1 2
10357 1 1 2
10358 2 1 2
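
        With ``raw=True`` each column or row is passed to ``func`` as a plain
        ndarray rather than a Series; for a NumPy reduction like this the
        result matches the non-raw call (a quick sketch):

        >>> df.apply(np.sum, axis=0, raw=True)
        A    12
        B    27
        dtype: int64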
10359 """
10360 from pandas.core.apply import frame_apply
10361
10362 op = frame_apply(
10363 self,
10364 func=func,
10365 axis=axis,
10366 raw=raw,
10367 result_type=result_type,
10368 by_row=by_row,
10369 engine=engine,
10370 engine_kwargs=engine_kwargs,
10371 args=args,
10372 kwargs=kwargs,
10373 )
10374 return op.apply().__finalize__(self, method="apply")
10375
10376 def map(
10377 self, func: PythonFuncType, na_action: str | None = None, **kwargs
10378 ) -> DataFrame:
10379 """
10380 Apply a function to a Dataframe elementwise.
10381
10382 .. versionadded:: 2.1.0
10383
10384 DataFrame.applymap was deprecated and renamed to DataFrame.map.
10385
10386 This method applies a function that accepts and returns a scalar
10387 to every element of a DataFrame.
10388
10389 Parameters
10390 ----------
10391 func : callable
10392 Python function, returns a single value from a single value.
10393 na_action : {None, 'ignore'}, default None
10394 If 'ignore', propagate NaN values, without passing them to func.
10395 **kwargs
10396 Additional keyword arguments to pass as keywords arguments to
10397 `func`.
10398
10399 Returns
10400 -------
10401 DataFrame
10402 Transformed DataFrame.
10403
10404 See Also
10405 --------
10406 DataFrame.apply : Apply a function along input axis of DataFrame.
10407 DataFrame.replace: Replace values given in `to_replace` with `value`.
10408 Series.map : Apply a function elementwise on a Series.
10409
10410 Examples
10411 --------
10412 >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
10413 >>> df
10414 0 1
10415 0 1.000 2.120
10416 1 3.356 4.567
10417
10418 >>> df.map(lambda x: len(str(x)))
10419 0 1
10420 0 3 4
10421 1 5 5
10422
10423 Like Series.map, NA values can be ignored:
10424
10425 >>> df_copy = df.copy()
10426 >>> df_copy.iloc[0, 0] = pd.NA
10427 >>> df_copy.map(lambda x: len(str(x)), na_action='ignore')
10428 0 1
10429 0 NaN 4
10430 1 5.0 5
10431
10432 It is also possible to use `map` with functions that are not
10433 `lambda` functions:
10434
10435 >>> df.map(round, ndigits=1)
10436 0 1
10437 0 1.0 2.1
10438 1 3.4 4.6
10439
10440 Note that a vectorized version of `func` often exists, which will
10441 be much faster. You could square each number elementwise.
10442
10443 >>> df.map(lambda x: x**2)
10444 0 1
10445 0 1.000000 4.494400
10446 1 11.262736 20.857489
10447
10448 But it's better to avoid map in that case.
10449
10450 >>> df ** 2
10451 0 1
10452 0 1.000000 4.494400
10453 1 11.262736 20.857489
10454 """
10455 if na_action not in {"ignore", None}:
10456 raise ValueError(
10457 f"na_action must be 'ignore' or None. Got {repr(na_action)}"
10458 )
10459
10460 if self.empty:
10461 return self.copy()
10462
10463 func = functools.partial(func, **kwargs)
10464
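        # Map each column elementwise through ``Series._map_values`` using a
        # column-wise apply.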
10465 def infer(x):
10466 return x._map_values(func, na_action=na_action)
10467
10468 return self.apply(infer).__finalize__(self, "map")
10469
10470 def applymap(
10471 self, func: PythonFuncType, na_action: NaAction | None = None, **kwargs
10472 ) -> DataFrame:
10473 """
10474 Apply a function to a Dataframe elementwise.
10475
10476 .. deprecated:: 2.1.0
10477
10478 DataFrame.applymap has been deprecated. Use DataFrame.map instead.
10479
10480 This method applies a function that accepts and returns a scalar
10481 to every element of a DataFrame.
10482
10483 Parameters
10484 ----------
10485 func : callable
10486 Python function, returns a single value from a single value.
10487 na_action : {None, 'ignore'}, default None
10488 If 'ignore', propagate NaN values, without passing them to func.
10489 **kwargs
10490 Additional keyword arguments to pass as keywords arguments to
10491 `func`.
10492
10493 Returns
10494 -------
10495 DataFrame
10496 Transformed DataFrame.
10497
10498 See Also
10499 --------
10500 DataFrame.apply : Apply a function along input axis of DataFrame.
        DataFrame.map : Apply a function to a DataFrame elementwise.
10502 DataFrame.replace: Replace values given in `to_replace` with `value`.
10503
10504 Examples
10505 --------
10506 >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
10507 >>> df
10508 0 1
10509 0 1.000 2.120
10510 1 3.356 4.567
10511
10512 >>> df.map(lambda x: len(str(x)))
10513 0 1
10514 0 3 4
10515 1 5 5
10516 """
10517 warnings.warn(
10518 "DataFrame.applymap has been deprecated. Use DataFrame.map instead.",
10519 FutureWarning,
10520 stacklevel=find_stack_level(),
10521 )
10522 return self.map(func, na_action=na_action, **kwargs)
10523
10524 # ----------------------------------------------------------------------
10525 # Merging / joining methods
10526
10527 def _append(
10528 self,
10529 other,
10530 ignore_index: bool = False,
10531 verify_integrity: bool = False,
10532 sort: bool = False,
10533 ) -> DataFrame:
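        # Normalize ``other`` (Series, dict, or list of frames) into something
        # concat can handle, then concatenate it below ``self``.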
10534 if isinstance(other, (Series, dict)):
10535 if isinstance(other, dict):
10536 if not ignore_index:
10537 raise TypeError("Can only append a dict if ignore_index=True")
10538 other = Series(other)
10539 if other.name is None and not ignore_index:
10540 raise TypeError(
10541 "Can only append a Series if ignore_index=True "
10542 "or if the Series has a name"
10543 )
10544
10545 index = Index(
10546 [other.name],
10547 name=self.index.names
10548 if isinstance(self.index, MultiIndex)
10549 else self.index.name,
10550 )
10551 row_df = other.to_frame().T
10552 # infer_objects is needed for
10553 # test_append_empty_frame_to_series_with_dateutil_tz
10554 other = row_df.infer_objects(copy=False).rename_axis(
10555 index.names, copy=False
10556 )
10557 elif isinstance(other, list):
10558 if not other:
10559 pass
10560 elif not isinstance(other[0], DataFrame):
10561 other = DataFrame(other)
10562 if self.index.name is not None and not ignore_index:
10563 other.index.name = self.index.name
10564
10565 from pandas.core.reshape.concat import concat
10566
10567 if isinstance(other, (list, tuple)):
10568 to_concat = [self, *other]
10569 else:
10570 to_concat = [self, other]
10571
10572 result = concat(
10573 to_concat,
10574 ignore_index=ignore_index,
10575 verify_integrity=verify_integrity,
10576 sort=sort,
10577 )
10578 return result.__finalize__(self, method="append")
10579
10580 def join(
10581 self,
10582 other: DataFrame | Series | Iterable[DataFrame | Series],
10583 on: IndexLabel | None = None,
10584 how: MergeHow = "left",
10585 lsuffix: str = "",
10586 rsuffix: str = "",
10587 sort: bool = False,
10588 validate: JoinValidate | None = None,
10589 ) -> DataFrame:
10590 """
10591 Join columns of another DataFrame.
10592
10593 Join columns with `other` DataFrame either on index or on a key
10594 column. Efficiently join multiple DataFrame objects by index at once by
10595 passing a list.
10596
10597 Parameters
10598 ----------
10599 other : DataFrame, Series, or a list containing any combination of them
10600 Index should be similar to one of the columns in this one. If a
10601 Series is passed, its name attribute must be set, and that will be
10602 used as the column name in the resulting joined DataFrame.
10603 on : str, list of str, or array-like, optional
10604 Column or index level name(s) in the caller to join on the index
10605 in `other`, otherwise joins index-on-index. If multiple
10606 values given, the `other` DataFrame must have a MultiIndex. Can
10607 pass an array as the join key if it is not already contained in
10608 the calling DataFrame. Like an Excel VLOOKUP operation.
10609 how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left'
10610 How to handle the operation of the two objects.
10611
10612 * left: use calling frame's index (or column if on is specified)
10613 * right: use `other`'s index.
10614 * outer: form union of calling frame's index (or column if on is
10615 specified) with `other`'s index, and sort it lexicographically.
10616 * inner: form intersection of calling frame's index (or column if
10617 on is specified) with `other`'s index, preserving the order
              of the calling frame's index.
10619 * cross: creates the cartesian product from both frames, preserves the order
10620 of the left keys.
10621 lsuffix : str, default ''
10622 Suffix to use from left frame's overlapping columns.
10623 rsuffix : str, default ''
10624 Suffix to use from right frame's overlapping columns.
10625 sort : bool, default False
10626 Order result DataFrame lexicographically by the join key. If False,
10627 the order of the join key depends on the join type (how keyword).
10628 validate : str, optional
10629 If specified, checks if join is of specified type.
10630
10631 * "one_to_one" or "1:1": check if join keys are unique in both left
10632 and right datasets.
10633 * "one_to_many" or "1:m": check if join keys are unique in left dataset.
10634 * "many_to_one" or "m:1": check if join keys are unique in right dataset.
10635 * "many_to_many" or "m:m": allowed, but does not result in checks.
10636
10637 .. versionadded:: 1.5.0
10638
10639 Returns
10640 -------
10641 DataFrame
10642 A dataframe containing columns from both the caller and `other`.
10643
10644 See Also
10645 --------
10646 DataFrame.merge : For column(s)-on-column(s) operations.
10647
10648 Notes
10649 -----
10650 Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
10651 passing a list of `DataFrame` objects.
10652
10653 Examples
10654 --------
10655 >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
10656 ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
10657
10658 >>> df
10659 key A
10660 0 K0 A0
10661 1 K1 A1
10662 2 K2 A2
10663 3 K3 A3
10664 4 K4 A4
10665 5 K5 A5
10666
10667 >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
10668 ... 'B': ['B0', 'B1', 'B2']})
10669
10670 >>> other
10671 key B
10672 0 K0 B0
10673 1 K1 B1
10674 2 K2 B2
10675
10676 Join DataFrames using their indexes.
10677
10678 >>> df.join(other, lsuffix='_caller', rsuffix='_other')
10679 key_caller A key_other B
10680 0 K0 A0 K0 B0
10681 1 K1 A1 K1 B1
10682 2 K2 A2 K2 B2
10683 3 K3 A3 NaN NaN
10684 4 K4 A4 NaN NaN
10685 5 K5 A5 NaN NaN
10686
10687 If we want to join using the key columns, we need to set key to be
10688 the index in both `df` and `other`. The joined DataFrame will have
10689 key as its index.
10690
10691 >>> df.set_index('key').join(other.set_index('key'))
10692 A B
10693 key
10694 K0 A0 B0
10695 K1 A1 B1
10696 K2 A2 B2
10697 K3 A3 NaN
10698 K4 A4 NaN
10699 K5 A5 NaN
10700
10701 Another option to join using the key columns is to use the `on`
10702 parameter. DataFrame.join always uses `other`'s index but we can use
10703 any column in `df`. This method preserves the original DataFrame's
10704 index in the result.
10705
10706 >>> df.join(other.set_index('key'), on='key')
10707 key A B
10708 0 K0 A0 B0
10709 1 K1 A1 B1
10710 2 K2 A2 B2
10711 3 K3 A3 NaN
10712 4 K4 A4 NaN
10713 5 K5 A5 NaN
10714
10715 Using non-unique key values shows how they are matched.
10716
10717 >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'],
10718 ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
10719
10720 >>> df
10721 key A
10722 0 K0 A0
10723 1 K1 A1
10724 2 K1 A2
10725 3 K3 A3
10726 4 K0 A4
10727 5 K1 A5
10728
10729 >>> df.join(other.set_index('key'), on='key', validate='m:1')
10730 key A B
10731 0 K0 A0 B0
10732 1 K1 A1 B1
10733 2 K1 A2 B1
10734 3 K3 A3 NaN
10735 4 K0 A4 B0
10736 5 K1 A5 B1
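
        A list of DataFrames can be joined on the index in a single call. A
        small sketch (``other1`` and ``other2`` are hypothetical frames;
        ``on``, ``lsuffix`` and ``rsuffix`` are not supported in this form):

        >>> df = pd.DataFrame({'A': ['A0', 'A1', 'A2']})
        >>> other1 = pd.DataFrame({'B': ['B0', 'B1']})
        >>> other2 = pd.DataFrame({'C': ['C0', 'C2']}, index=[0, 2])
        >>> df.join([other1, other2])
            A    B    C
        0  A0   B0   C0
        1  A1   B1  NaN
        2  A2  NaN   C2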
10737 """
10738 from pandas.core.reshape.concat import concat
10739 from pandas.core.reshape.merge import merge
10740
10741 if isinstance(other, Series):
10742 if other.name is None:
10743 raise ValueError("Other Series must have a name")
10744 other = DataFrame({other.name: other})
10745
10746 if isinstance(other, DataFrame):
10747 if how == "cross":
10748 return merge(
10749 self,
10750 other,
10751 how=how,
10752 on=on,
10753 suffixes=(lsuffix, rsuffix),
10754 sort=sort,
10755 validate=validate,
10756 )
10757 return merge(
10758 self,
10759 other,
10760 left_on=on,
10761 how=how,
10762 left_index=on is None,
10763 right_index=True,
10764 suffixes=(lsuffix, rsuffix),
10765 sort=sort,
10766 validate=validate,
10767 )
10768 else:
10769 if on is not None:
10770 raise ValueError(
10771 "Joining multiple DataFrames only supported for joining on index"
10772 )
10773
10774 if rsuffix or lsuffix:
10775 raise ValueError(
10776 "Suffixes not supported when joining multiple DataFrames"
10777 )
10778
10779 # Mypy thinks the RHS is a
10780 # "Union[DataFrame, Series, Iterable[Union[DataFrame, Series]]]" whereas
10781 # the LHS is an "Iterable[DataFrame]", but in reality both types are
10782 # "Iterable[Union[DataFrame, Series]]" due to the if statements
10783 frames = [cast("DataFrame | Series", self)] + list(other)
10784
10785 can_concat = all(df.index.is_unique for df in frames)
10786
10787 # join indexes only using concat
10788 if can_concat:
10789 if how == "left":
10790 res = concat(
10791 frames, axis=1, join="outer", verify_integrity=True, sort=sort
10792 )
10793 return res.reindex(self.index, copy=False)
10794 else:
10795 return concat(
10796 frames, axis=1, join=how, verify_integrity=True, sort=sort
10797 )
10798
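            # Some index has duplicate labels, so a single concat is not
            # possible; merge the frames one at a time on their indexes.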
10799 joined = frames[0]
10800
10801 for frame in frames[1:]:
10802 joined = merge(
10803 joined,
10804 frame,
10805 how=how,
10806 left_index=True,
10807 right_index=True,
10808 validate=validate,
10809 )
10810
10811 return joined
10812
10813 @Substitution("")
10814 @Appender(_merge_doc, indents=2)
10815 def merge(
10816 self,
10817 right: DataFrame | Series,
10818 how: MergeHow = "inner",
10819 on: IndexLabel | AnyArrayLike | None = None,
10820 left_on: IndexLabel | AnyArrayLike | None = None,
10821 right_on: IndexLabel | AnyArrayLike | None = None,
10822 left_index: bool = False,
10823 right_index: bool = False,
10824 sort: bool = False,
10825 suffixes: Suffixes = ("_x", "_y"),
10826 copy: bool | None = None,
10827 indicator: str | bool = False,
10828 validate: MergeValidate | None = None,
10829 ) -> DataFrame:
10830 from pandas.core.reshape.merge import merge
10831
10832 return merge(
10833 self,
10834 right,
10835 how=how,
10836 on=on,
10837 left_on=left_on,
10838 right_on=right_on,
10839 left_index=left_index,
10840 right_index=right_index,
10841 sort=sort,
10842 suffixes=suffixes,
10843 copy=copy,
10844 indicator=indicator,
10845 validate=validate,
10846 )
10847
10848 def round(
10849 self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs
10850 ) -> DataFrame:
10851 """
10852 Round a DataFrame to a variable number of decimal places.
10853
10854 Parameters
10855 ----------
10856 decimals : int, dict, Series
10857 Number of decimal places to round each column to. If an int is
10858 given, round each column to the same number of places.
10859 Otherwise dict and Series round to variable numbers of places.
10860 Column names should be in the keys if `decimals` is a
10861 dict-like, or in the index if `decimals` is a Series. Any
10862 columns not included in `decimals` will be left as is. Elements
10863 of `decimals` which are not columns of the input will be
10864 ignored.
10865 *args
            Additional positional arguments have no effect but might be
            accepted for compatibility with numpy.
10868 **kwargs
10869 Additional keywords have no effect but might be accepted for
10870 compatibility with numpy.
10871
10872 Returns
10873 -------
10874 DataFrame
10875 A DataFrame with the affected columns rounded to the specified
10876 number of decimal places.
10877
10878 See Also
10879 --------
10880 numpy.around : Round a numpy array to the given number of decimals.
10881 Series.round : Round a Series to the given number of decimals.
10882
10883 Examples
10884 --------
10885 >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)],
10886 ... columns=['dogs', 'cats'])
10887 >>> df
10888 dogs cats
10889 0 0.21 0.32
10890 1 0.01 0.67
10891 2 0.66 0.03
10892 3 0.21 0.18
10893
10894 By providing an integer each column is rounded to the same number
10895 of decimal places
10896
10897 >>> df.round(1)
10898 dogs cats
10899 0 0.2 0.3
10900 1 0.0 0.7
10901 2 0.7 0.0
10902 3 0.2 0.2
10903
10904 With a dict, the number of places for specific columns can be
10905 specified with the column names as key and the number of decimal
10906 places as value
10907
10908 >>> df.round({'dogs': 1, 'cats': 0})
10909 dogs cats
10910 0 0.2 0.0
10911 1 0.0 1.0
10912 2 0.7 0.0
10913 3 0.2 0.0
10914
10915 Using a Series, the number of places for specific columns can be
10916 specified with the column names as index and the number of
10917 decimal places as value
10918
10919 >>> decimals = pd.Series([0, 1], index=['cats', 'dogs'])
10920 >>> df.round(decimals)
10921 dogs cats
10922 0 0.2 0.0
10923 1 0.0 1.0
10924 2 0.7 0.0
10925 3 0.2 0.0
10926 """
10927 from pandas.core.reshape.concat import concat
10928
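        # For dict/Series ``decimals``, round column-by-column, leaving
        # columns without an entry (and non-numeric columns) unchanged.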
10929 def _dict_round(df: DataFrame, decimals):
10930 for col, vals in df.items():
10931 try:
10932 yield _series_round(vals, decimals[col])
10933 except KeyError:
10934 yield vals
10935
10936 def _series_round(ser: Series, decimals: int) -> Series:
10937 if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype):
10938 return ser.round(decimals)
10939 return ser
10940
10941 nv.validate_round(args, kwargs)
10942
10943 if isinstance(decimals, (dict, Series)):
10944 if isinstance(decimals, Series) and not decimals.index.is_unique:
10945 raise ValueError("Index of decimals must be unique")
10946 if is_dict_like(decimals) and not all(
10947 is_integer(value) for _, value in decimals.items()
10948 ):
10949 raise TypeError("Values in decimals must be integers")
10950 new_cols = list(_dict_round(self, decimals))
10951 elif is_integer(decimals):
10952 # Dispatch to Block.round
10953 # Argument "decimals" to "round" of "BaseBlockManager" has incompatible
10954 # type "Union[int, integer[Any]]"; expected "int"
10955 new_mgr = self._mgr.round(
10956 decimals=decimals, # type: ignore[arg-type]
10957 using_cow=using_copy_on_write(),
10958 )
10959 return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(
10960 self, method="round"
10961 )
10962 else:
10963 raise TypeError("decimals must be an integer, a dict-like or a Series")
10964
10965 if new_cols is not None and len(new_cols) > 0:
10966 return self._constructor(
10967 concat(new_cols, axis=1), index=self.index, columns=self.columns
10968 ).__finalize__(self, method="round")
10969 else:
10970 return self.copy(deep=False)
10971
10972 # ----------------------------------------------------------------------
10973 # Statistical methods, etc.
10974
10975 def corr(
10976 self,
10977 method: CorrelationMethod = "pearson",
10978 min_periods: int = 1,
10979 numeric_only: bool = False,
10980 ) -> DataFrame:
10981 """
10982 Compute pairwise correlation of columns, excluding NA/null values.
10983
10984 Parameters
10985 ----------
10986 method : {'pearson', 'kendall', 'spearman'} or callable
10987 Method of correlation:
10988
10989 * pearson : standard correlation coefficient
10990 * kendall : Kendall Tau correlation coefficient
10991 * spearman : Spearman rank correlation
10992 * callable: callable with input two 1d ndarrays
10993 and returning a float. Note that the returned matrix from corr
10994 will have 1 along the diagonals and will be symmetric
10995 regardless of the callable's behavior.
10996 min_periods : int, optional
10997 Minimum number of observations required per pair of columns
10998 to have a valid result. Currently only available for Pearson
10999 and Spearman correlation.
11000 numeric_only : bool, default False
11001 Include only `float`, `int` or `boolean` data.
11002
11003 .. versionadded:: 1.5.0
11004
11005 .. versionchanged:: 2.0.0
11006 The default value of ``numeric_only`` is now ``False``.
11007
11008 Returns
11009 -------
11010 DataFrame
11011 Correlation matrix.
11012
11013 See Also
11014 --------
11015 DataFrame.corrwith : Compute pairwise correlation with another
11016 DataFrame or Series.
11017 Series.corr : Compute the correlation between two Series.
11018
11019 Notes
11020 -----
11021 Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations.
11022
11023 * `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
11024 * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_
11025 * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_
11026
11027 Examples
11028 --------
11029 >>> def histogram_intersection(a, b):
11030 ... v = np.minimum(a, b).sum().round(decimals=1)
11031 ... return v
11032 >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
11033 ... columns=['dogs', 'cats'])
11034 >>> df.corr(method=histogram_intersection)
11035 dogs cats
11036 dogs 1.0 0.3
11037 cats 0.3 1.0
11038
11039 >>> df = pd.DataFrame([(1, 1), (2, np.nan), (np.nan, 3), (4, 4)],
11040 ... columns=['dogs', 'cats'])
11041 >>> df.corr(min_periods=3)
11042 dogs cats
11043 dogs 1.0 NaN
11044 cats NaN 1.0
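
        With ``numeric_only=True``, non-numeric columns are dropped before the
        correlations are computed (the ``name`` column below is just an
        illustrative placeholder):

        >>> df = pd.DataFrame({"dogs": [1, 2, 3],
        ...                    "cats": [3, 2, 1],
        ...                    "name": ["a", "b", "c"]})
        >>> df.corr(numeric_only=True)
              dogs  cats
        dogs   1.0  -1.0
        cats  -1.0   1.0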
11045 """ # noqa: E501
11046 data = self._get_numeric_data() if numeric_only else self
11047 cols = data.columns
11048 idx = cols.copy()
11049 mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
11050
11051 if method == "pearson":
11052 correl = libalgos.nancorr(mat, minp=min_periods)
11053 elif method == "spearman":
11054 correl = libalgos.nancorr_spearman(mat, minp=min_periods)
11055 elif method == "kendall" or callable(method):
11056 if min_periods is None:
11057 min_periods = 1
11058 mat = mat.T
11059 corrf = nanops.get_corr_func(method)
11060 K = len(cols)
11061 correl = np.empty((K, K), dtype=float)
11062 mask = np.isfinite(mat)
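            # Only the upper triangle is computed; each value is mirrored into
            # the lower triangle since the correlation matrix is symmetric.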
11063 for i, ac in enumerate(mat):
11064 for j, bc in enumerate(mat):
11065 if i > j:
11066 continue
11067
11068 valid = mask[i] & mask[j]
11069 if valid.sum() < min_periods:
11070 c = np.nan
11071 elif i == j:
11072 c = 1.0
11073 elif not valid.all():
11074 c = corrf(ac[valid], bc[valid])
11075 else:
11076 c = corrf(ac, bc)
11077 correl[i, j] = c
11078 correl[j, i] = c
11079 else:
11080 raise ValueError(
11081 "method must be either 'pearson', "
11082 "'spearman', 'kendall', or a callable, "
11083 f"'{method}' was supplied"
11084 )
11085
11086 result = self._constructor(correl, index=idx, columns=cols, copy=False)
11087 return result.__finalize__(self, method="corr")
11088
11089 def cov(
11090 self,
11091 min_periods: int | None = None,
11092 ddof: int | None = 1,
11093 numeric_only: bool = False,
11094 ) -> DataFrame:
11095 """
11096 Compute pairwise covariance of columns, excluding NA/null values.
11097
11098 Compute the pairwise covariance among the series of a DataFrame.
11099 The returned data frame is the `covariance matrix
11100 <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
11101 of the DataFrame.
11102
11103 Both NA and null values are automatically excluded from the
11104 calculation. (See the note below about bias from missing values.)
11105 A threshold can be set for the minimum number of
11106 observations for each value created. Comparisons with observations
11107 below this threshold will be returned as ``NaN``.
11108
11109 This method is generally used for the analysis of time series data to
11110 understand the relationship between different measures
11111 across time.
11112
11113 Parameters
11114 ----------
11115 min_periods : int, optional
11116 Minimum number of observations required per pair of columns
11117 to have a valid result.
11118
        ddof : int, default 1
            Delta degrees of freedom. The divisor used in calculations
            is ``N - ddof``, where ``N`` represents the number of elements.
            This argument is used only when the DataFrame contains no
            missing values (see the example below).
11123
11124 numeric_only : bool, default False
11125 Include only `float`, `int` or `boolean` data.
11126
11127 .. versionadded:: 1.5.0
11128
11129 .. versionchanged:: 2.0.0
11130 The default value of ``numeric_only`` is now ``False``.
11131
11132 Returns
11133 -------
11134 DataFrame
11135 The covariance matrix of the series of the DataFrame.
11136
11137 See Also
11138 --------
11139 Series.cov : Compute covariance with another Series.
11140 core.window.ewm.ExponentialMovingWindow.cov : Exponential weighted sample
11141 covariance.
11142 core.window.expanding.Expanding.cov : Expanding sample covariance.
11143 core.window.rolling.Rolling.cov : Rolling sample covariance.
11144
11145 Notes
11146 -----
11147 Returns the covariance matrix of the DataFrame's time series.
11148 The covariance is normalized by N-ddof.
11149
11150 For DataFrames that have Series that are missing data (assuming that
11151 data is `missing at random
11152 <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
11153 the returned covariance matrix will be an unbiased estimate
11154 of the variance and covariance between the member Series.
11155
        However, for many applications this estimate may not be acceptable
        because the estimated covariance matrix is not guaranteed to be
        positive semi-definite. This could lead to estimated correlations
        having absolute values which are greater than one, and/or a
        non-invertible covariance matrix. See `Estimation of covariance
        matrices <https://en.wikipedia.org/wiki/Estimation_of_covariance_matrices>`__
        for more details.
11163
11164 Examples
11165 --------
11166 >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
11167 ... columns=['dogs', 'cats'])
11168 >>> df.cov()
11169 dogs cats
11170 dogs 0.666667 -1.000000
11171 cats -1.000000 1.666667
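
        The normalization can be changed with ``ddof``; for instance,
        ``ddof=0`` divides by ``N`` instead of ``N - 1``. A minimal
        illustration (single entries are extracted as plain floats here
        purely for brevity):

        >>> float(df.cov().loc['dogs', 'cats'])
        -1.0
        >>> float(df.cov(ddof=0).loc['dogs', 'cats'])
        -0.75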
11172
11173 >>> np.random.seed(42)
11174 >>> df = pd.DataFrame(np.random.randn(1000, 5),
11175 ... columns=['a', 'b', 'c', 'd', 'e'])
11176 >>> df.cov()
11177 a b c d e
11178 a 0.998438 -0.020161 0.059277 -0.008943 0.014144
11179 b -0.020161 1.059352 -0.008543 -0.024738 0.009826
11180 c 0.059277 -0.008543 1.010670 -0.001486 -0.000271
11181 d -0.008943 -0.024738 -0.001486 0.921297 -0.013692
11182 e 0.014144 0.009826 -0.000271 -0.013692 0.977795
11183
11184 **Minimum number of periods**
11185
11186 This method also supports an optional ``min_periods`` keyword
11187 that specifies the required minimum number of non-NA observations for
11188 each column pair in order to have a valid result:
11189
11190 >>> np.random.seed(42)
11191 >>> df = pd.DataFrame(np.random.randn(20, 3),
11192 ... columns=['a', 'b', 'c'])
11193 >>> df.loc[df.index[:5], 'a'] = np.nan
11194 >>> df.loc[df.index[5:10], 'b'] = np.nan
11195 >>> df.cov(min_periods=12)
11196 a b c
11197 a 0.316741 NaN -0.150812
11198 b NaN 1.248003 0.191417
11199 c -0.150812 0.191417 0.895202
11200 """
11201 data = self._get_numeric_data() if numeric_only else self
11202 cols = data.columns
11203 idx = cols.copy()
11204 mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
11205
11206 if notna(mat).all():
11207 if min_periods is not None and min_periods > len(mat):
11208 base_cov = np.empty((mat.shape[1], mat.shape[1]))
11209 base_cov.fill(np.nan)
11210 else:
11211 base_cov = np.cov(mat.T, ddof=ddof)
11212 base_cov = base_cov.reshape((len(cols), len(cols)))
11213 else:
11214 base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods)
11215
11216 result = self._constructor(base_cov, index=idx, columns=cols, copy=False)
11217 return result.__finalize__(self, method="cov")
11218
11219 def corrwith(
11220 self,
11221 other: DataFrame | Series,
11222 axis: Axis = 0,
11223 drop: bool = False,
11224 method: CorrelationMethod = "pearson",
11225 numeric_only: bool = False,
11226 ) -> Series:
11227 """
11228 Compute pairwise correlation.
11229
11230 Pairwise correlation is computed between rows or columns of
11231 DataFrame with rows or columns of Series or DataFrame. DataFrames
11232 are first aligned along both axes before computing the
11233 correlations.
11234
11235 Parameters
11236 ----------
11237 other : DataFrame, Series
11238 Object with which to compute correlations.
11239 axis : {0 or 'index', 1 or 'columns'}, default 0
11240 The axis to use. 0 or 'index' to compute row-wise, 1 or 'columns' for
11241 column-wise.
        drop : bool, default False
            Drop missing indices from result (see the final example below).
11244 method : {'pearson', 'kendall', 'spearman'} or callable
11245 Method of correlation:
11246
11247 * pearson : standard correlation coefficient
11248 * kendall : Kendall Tau correlation coefficient
11249 * spearman : Spearman rank correlation
11250 * callable: callable with input two 1d ndarrays
11251 and returning a float.
11252
11253 numeric_only : bool, default False
11254 Include only `float`, `int` or `boolean` data.
11255
11256 .. versionadded:: 1.5.0
11257
11258 .. versionchanged:: 2.0.0
11259 The default value of ``numeric_only`` is now ``False``.
11260
11261 Returns
11262 -------
11263 Series
11264 Pairwise correlations.
11265
11266 See Also
11267 --------
11268 DataFrame.corr : Compute pairwise correlation of columns.
11269
11270 Examples
11271 --------
11272 >>> index = ["a", "b", "c", "d", "e"]
11273 >>> columns = ["one", "two", "three", "four"]
11274 >>> df1 = pd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns)
11275 >>> df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=index[:4], columns=columns)
11276 >>> df1.corrwith(df2)
11277 one 1.0
11278 two 1.0
11279 three 1.0
11280 four 1.0
11281 dtype: float64
11282
11283 >>> df2.corrwith(df1, axis=1)
11284 a 1.0
11285 b 1.0
11286 c 1.0
11287 d 1.0
11288 e NaN
11289 dtype: float64
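
        Passing ``drop=True`` omits the non-matching label instead of
        reporting ``NaN`` for it:

        >>> df2.corrwith(df1, axis=1, drop=True)
        a    1.0
        b    1.0
        c    1.0
        d    1.0
        dtype: float64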
11290 """ # noqa: E501
11291 axis = self._get_axis_number(axis)
11292 this = self._get_numeric_data() if numeric_only else self
11293
11294 if isinstance(other, Series):
11295 return this.apply(lambda x: other.corr(x, method=method), axis=axis)
11296
11297 if numeric_only:
11298 other = other._get_numeric_data()
11299 left, right = this.align(other, join="inner", copy=False)
11300
11301 if axis == 1:
11302 left = left.T
11303 right = right.T
11304
11305 if method == "pearson":
11306 # mask missing values
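            # adding ``other * 0`` propagates the other operand's NaNs, so both
            # sides end up sharing the same missing-value pattern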
11307 left = left + right * 0
11308 right = right + left * 0
11309
11310 # demeaned data
11311 ldem = left - left.mean(numeric_only=numeric_only)
11312 rdem = right - right.mean(numeric_only=numeric_only)
11313
11314 num = (ldem * rdem).sum()
11315 dom = (
11316 (left.count() - 1)
11317 * left.std(numeric_only=numeric_only)
11318 * right.std(numeric_only=numeric_only)
11319 )
11320
11321 correl = num / dom
11322
11323 elif method in ["kendall", "spearman"] or callable(method):
11324
11325 def c(x):
11326 return nanops.nancorr(x[0], x[1], method=method)
11327
11328 correl = self._constructor_sliced(
11329 map(c, zip(left.values.T, right.values.T)),
11330 index=left.columns,
11331 copy=False,
11332 )
11333
11334 else:
11335 raise ValueError(
11336 f"Invalid method {method} was passed, "
11337 "valid methods are: 'pearson', 'kendall', "
11338 "'spearman', or callable"
11339 )
11340
11341 if not drop:
11342 # Find non-matching labels along the given axis
11343 # and append missing correlations (GH 22375)
11344 raxis: AxisInt = 1 if axis == 0 else 0
11345 result_index = this._get_axis(raxis).union(other._get_axis(raxis))
11346 idx_diff = result_index.difference(correl.index)
11347
11348 if len(idx_diff) > 0:
11349 correl = correl._append(
11350 Series([np.nan] * len(idx_diff), index=idx_diff)
11351 )
11352
11353 return correl
11354
11355 # ----------------------------------------------------------------------
11356 # ndarray-like stats methods
11357
11358 def count(self, axis: Axis = 0, numeric_only: bool = False):
11359 """
11360 Count non-NA cells for each column or row.
11361
11362 The values `None`, `NaN`, `NaT`, ``pandas.NA`` are considered NA.
11363
11364 Parameters
11365 ----------
11366 axis : {0 or 'index', 1 or 'columns'}, default 0
11367 If 0 or 'index' counts are generated for each column.
11368 If 1 or 'columns' counts are generated for each row.
11369 numeric_only : bool, default False
11370 Include only `float`, `int` or `boolean` data.
11371
11372 Returns
11373 -------
11374 Series
11375 For each column/row the number of non-NA/null entries.
11376
11377 See Also
11378 --------
11379 Series.count: Number of non-NA elements in a Series.
11380 DataFrame.value_counts: Count unique combinations of columns.
11381 DataFrame.shape: Number of DataFrame rows and columns (including NA
11382 elements).
11383 DataFrame.isna: Boolean same-sized DataFrame showing places of NA
11384 elements.
11385
11386 Examples
11387 --------
11388 Constructing DataFrame from a dictionary:
11389
11390 >>> df = pd.DataFrame({"Person":
11391 ... ["John", "Myla", "Lewis", "John", "Myla"],
11392 ... "Age": [24., np.nan, 21., 33, 26],
11393 ... "Single": [False, True, True, True, False]})
11394 >>> df
11395 Person Age Single
11396 0 John 24.0 False
11397 1 Myla NaN True
11398 2 Lewis 21.0 True
11399 3 John 33.0 True
11400 4 Myla 26.0 False
11401
11402 Notice the uncounted NA values:
11403
11404 >>> df.count()
11405 Person 5
11406 Age 4
11407 Single 5
11408 dtype: int64
11409
11410 Counts for each **row**:
11411
11412 >>> df.count(axis='columns')
11413 0 3
11414 1 2
11415 2 3
11416 3 3
11417 4 3
11418 dtype: int64
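
        With ``numeric_only=True`` only numeric (including boolean) columns
        are counted, so the string column ``Person`` is dropped:

        >>> df.count(numeric_only=True)
        Age       4
        Single    5
        dtype: int64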
11419 """
11420 axis = self._get_axis_number(axis)
11421
11422 if numeric_only:
11423 frame = self._get_numeric_data()
11424 else:
11425 frame = self
11426
11427 # GH #423
11428 if len(frame._get_axis(axis)) == 0:
11429 result = self._constructor_sliced(0, index=frame._get_agg_axis(axis))
11430 else:
11431 result = notna(frame).sum(axis=axis)
11432
11433 return result.astype("int64", copy=False).__finalize__(self, method="count")
11434
11435 def _reduce(
11436 self,
11437 op,
11438 name: str,
11439 *,
11440 axis: Axis = 0,
11441 skipna: bool = True,
11442 numeric_only: bool = False,
11443 filter_type=None,
11444 **kwds,
11445 ):
11446 assert filter_type is None or filter_type == "bool", filter_type
11447 out_dtype = "bool" if filter_type == "bool" else None
11448
11449 if axis is not None:
11450 axis = self._get_axis_number(axis)
11451
11452 def func(values: np.ndarray):
11453 # We only use this in the case that operates on self.values
11454 return op(values, axis=axis, skipna=skipna, **kwds)
11455
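        # cache of whether each ExtensionDtype's ``_reduce`` accepts a
        # ``keepdims`` argument, so its signature is inspected at most once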
11456 dtype_has_keepdims: dict[ExtensionDtype, bool] = {}
11457
11458 def blk_func(values, axis: Axis = 1):
11459 if isinstance(values, ExtensionArray):
11460 if not is_1d_only_ea_dtype(values.dtype) and not isinstance(
11461 self._mgr, ArrayManager
11462 ):
11463 return values._reduce(name, axis=1, skipna=skipna, **kwds)
11464 has_keepdims = dtype_has_keepdims.get(values.dtype)
11465 if has_keepdims is None:
11466 sign = signature(values._reduce)
11467 has_keepdims = "keepdims" in sign.parameters
11468 dtype_has_keepdims[values.dtype] = has_keepdims
11469 if has_keepdims:
11470 return values._reduce(name, skipna=skipna, keepdims=True, **kwds)
11471 else:
11472 warnings.warn(
11473 f"{type(values)}._reduce will require a `keepdims` parameter "
11474 "in the future",
11475 FutureWarning,
11476 stacklevel=find_stack_level(),
11477 )
11478 result = values._reduce(name, skipna=skipna, **kwds)
11479 return np.array([result])
11480 else:
11481 return op(values, axis=axis, skipna=skipna, **kwds)
11482
11483 def _get_data() -> DataFrame:
11484 if filter_type is None:
11485 data = self._get_numeric_data()
11486 else:
11487 # GH#25101, GH#24434
11488 assert filter_type == "bool"
11489 data = self._get_bool_data()
11490 return data
11491
11492 # Case with EAs see GH#35881
11493 df = self
11494 if numeric_only:
11495 df = _get_data()
11496 if axis is None:
11497 dtype = find_common_type([arr.dtype for arr in df._mgr.arrays])
11498 if isinstance(dtype, ExtensionDtype):
11499 df = df.astype(dtype, copy=False)
11500 arr = concat_compat(list(df._iter_column_arrays()))
11501 return arr._reduce(name, skipna=skipna, keepdims=False, **kwds)
11502 return func(df.values)
11503 elif axis == 1:
11504 if len(df.index) == 0:
11505 # Taking a transpose would result in no columns, losing the dtype.
11506 # In the empty case, reducing along axis 0 or 1 gives the same
11507 # result dtype, so reduce with axis=0 and ignore values
11508 result = df._reduce(
11509 op,
11510 name,
11511 axis=0,
11512 skipna=skipna,
11513 numeric_only=False,
11514 filter_type=filter_type,
11515 **kwds,
11516 ).iloc[:0]
11517 result.index = df.index
11518 return result
11519
11520 # kurtosis excluded since groupby does not implement it
11521 if df.shape[1] and name != "kurt":
11522 dtype = find_common_type([arr.dtype for arr in df._mgr.arrays])
11523 if isinstance(dtype, ExtensionDtype):
11524 # GH 54341: fastpath for EA-backed axis=1 reductions
11525 # This flattens the frame into a single 1D array while keeping
11526 # track of the row and column indices of the original frame. Once
11527 # flattened, grouping by the row indices and aggregating should
11528 # be equivalent to transposing the original frame and aggregating
11529 # with axis=0.
11530 name = {"argmax": "idxmax", "argmin": "idxmin"}.get(name, name)
11531 df = df.astype(dtype, copy=False)
11532 arr = concat_compat(list(df._iter_column_arrays()))
11533 nrows, ncols = df.shape
11534 row_index = np.tile(np.arange(nrows), ncols)
11535 col_index = np.repeat(np.arange(ncols), nrows)
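                # ``arr`` holds the values column-major (all of column 0's
                # rows, then column 1's, ...), so tiling the row labels and
                # repeating the column labels recovers each value's original
                # (row, column) position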
11536 ser = Series(arr, index=col_index, copy=False)
11537 # GroupBy will raise a warning with SeriesGroupBy as the object,
11538 # likely confusing users
11539 with rewrite_warning(
11540 target_message=(
11541 f"The behavior of SeriesGroupBy.{name} with all-NA values"
11542 ),
11543 target_category=FutureWarning,
11544 new_message=(
11545 f"The behavior of {type(self).__name__}.{name} with all-NA "
11546 "values, or any-NA and skipna=False, is deprecated. In "
11547 "a future version this will raise ValueError"
11548 ),
11549 ):
11550 result = ser.groupby(row_index).agg(name, **kwds)
11551 result.index = df.index
11552 if not skipna and name not in ("any", "all"):
11553 mask = df.isna().to_numpy(dtype=np.bool_).any(axis=1)
11554 other = -1 if name in ("idxmax", "idxmin") else lib.no_default
11555 result = result.mask(mask, other)
11556 return result
11557
11558 df = df.T
11559
11560 # After possibly _get_data and transposing, we are now in the
11561 # simple case where we can use BlockManager.reduce
11562 res = df._mgr.reduce(blk_func)
11563 out = df._constructor_from_mgr(res, axes=res.axes).iloc[0]
11564 if out_dtype is not None and out.dtype != "boolean":
11565 out = out.astype(out_dtype)
11566 elif (df._mgr.get_dtypes() == object).any() and name not in ["any", "all"]:
11567 out = out.astype(object)
11568 elif len(self) == 0 and out.dtype == object and name in ("sum", "prod"):
11569 # Even if we are object dtype, follow numpy and return
11570 # float64, see test_apply_funcs_over_empty
11571 out = out.astype(np.float64)
11572
11573 return out
11574
11575 def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
11576 """
11577 Special case for _reduce to try to avoid a potentially-expensive transpose.
11578
11579 Apply the reduction block-wise along axis=1 and then reduce the resulting
11580 1D arrays.
11581 """
11582 if name == "all":
11583 result = np.ones(len(self), dtype=bool)
11584 ufunc = np.logical_and
11585 elif name == "any":
11586 result = np.zeros(len(self), dtype=bool)
11587 # error: Incompatible types in assignment
11588 # (expression has type "_UFunc_Nin2_Nout1[Literal['logical_or'],
11589 # Literal[20], Literal[False]]", variable has type
11590 # "_UFunc_Nin2_Nout1[Literal['logical_and'], Literal[20],
11591 # Literal[True]]")
11592 ufunc = np.logical_or # type: ignore[assignment]
11593 else:
11594 raise NotImplementedError(name)
11595
11596 for arr in self._mgr.arrays:
11597 middle = func(arr, axis=0, skipna=skipna)
11598 result = ufunc(result, middle)
11599
11600 res_ser = self._constructor_sliced(result, index=self.index, copy=False)
11601 return res_ser
11602
11603 @doc(make_doc("any", ndim=2))
11604 # error: Signature of "any" incompatible with supertype "NDFrame"
11605 def any( # type: ignore[override]
11606 self,
11607 *,
11608 axis: Axis | None = 0,
11609 bool_only: bool = False,
11610 skipna: bool = True,
11611 **kwargs,
11612 ) -> Series | bool:
11613 result = self._logical_func(
11614 "any", nanops.nanany, axis, bool_only, skipna, **kwargs
11615 )
11616 if isinstance(result, Series):
11617 result = result.__finalize__(self, method="any")
11618 return result
11619
11620 @doc(make_doc("all", ndim=2))
11621 def all(
11622 self,
11623 axis: Axis | None = 0,
11624 bool_only: bool = False,
11625 skipna: bool = True,
11626 **kwargs,
11627 ) -> Series | bool:
11628 result = self._logical_func(
11629 "all", nanops.nanall, axis, bool_only, skipna, **kwargs
11630 )
11631 if isinstance(result, Series):
11632 result = result.__finalize__(self, method="all")
11633 return result
11634
11635 @doc(make_doc("min", ndim=2))
11636 def min(
11637 self,
11638 axis: Axis | None = 0,
11639 skipna: bool = True,
11640 numeric_only: bool = False,
11641 **kwargs,
11642 ):
11643 result = super().min(axis, skipna, numeric_only, **kwargs)
11644 if isinstance(result, Series):
11645 result = result.__finalize__(self, method="min")
11646 return result
11647
11648 @doc(make_doc("max", ndim=2))
11649 def max(
11650 self,
11651 axis: Axis | None = 0,
11652 skipna: bool = True,
11653 numeric_only: bool = False,
11654 **kwargs,
11655 ):
11656 result = super().max(axis, skipna, numeric_only, **kwargs)
11657 if isinstance(result, Series):
11658 result = result.__finalize__(self, method="max")
11659 return result
11660
11661 @doc(make_doc("sum", ndim=2))
11662 def sum(
11663 self,
11664 axis: Axis | None = 0,
11665 skipna: bool = True,
11666 numeric_only: bool = False,
11667 min_count: int = 0,
11668 **kwargs,
11669 ):
11670 result = super().sum(axis, skipna, numeric_only, min_count, **kwargs)
11671 return result.__finalize__(self, method="sum")
11672
11673 @doc(make_doc("prod", ndim=2))
11674 def prod(
11675 self,
11676 axis: Axis | None = 0,
11677 skipna: bool = True,
11678 numeric_only: bool = False,
11679 min_count: int = 0,
11680 **kwargs,
11681 ):
11682 result = super().prod(axis, skipna, numeric_only, min_count, **kwargs)
11683 return result.__finalize__(self, method="prod")
11684
11685 @doc(make_doc("mean", ndim=2))
11686 def mean(
11687 self,
11688 axis: Axis | None = 0,
11689 skipna: bool = True,
11690 numeric_only: bool = False,
11691 **kwargs,
11692 ):
11693 result = super().mean(axis, skipna, numeric_only, **kwargs)
11694 if isinstance(result, Series):
11695 result = result.__finalize__(self, method="mean")
11696 return result
11697
11698 @doc(make_doc("median", ndim=2))
11699 def median(
11700 self,
11701 axis: Axis | None = 0,
11702 skipna: bool = True,
11703 numeric_only: bool = False,
11704 **kwargs,
11705 ):
11706 result = super().median(axis, skipna, numeric_only, **kwargs)
11707 if isinstance(result, Series):
11708 result = result.__finalize__(self, method="median")
11709 return result
11710
11711 @doc(make_doc("sem", ndim=2))
11712 def sem(
11713 self,
11714 axis: Axis | None = 0,
11715 skipna: bool = True,
11716 ddof: int = 1,
11717 numeric_only: bool = False,
11718 **kwargs,
11719 ):
11720 result = super().sem(axis, skipna, ddof, numeric_only, **kwargs)
11721 if isinstance(result, Series):
11722 result = result.__finalize__(self, method="sem")
11723 return result
11724
11725 @doc(make_doc("var", ndim=2))
11726 def var(
11727 self,
11728 axis: Axis | None = 0,
11729 skipna: bool = True,
11730 ddof: int = 1,
11731 numeric_only: bool = False,
11732 **kwargs,
11733 ):
11734 result = super().var(axis, skipna, ddof, numeric_only, **kwargs)
11735 if isinstance(result, Series):
11736 result = result.__finalize__(self, method="var")
11737 return result
11738
11739 @doc(make_doc("std", ndim=2))
11740 def std(
11741 self,
11742 axis: Axis | None = 0,
11743 skipna: bool = True,
11744 ddof: int = 1,
11745 numeric_only: bool = False,
11746 **kwargs,
11747 ):
11748 result = super().std(axis, skipna, ddof, numeric_only, **kwargs)
11749 if isinstance(result, Series):
11750 result = result.__finalize__(self, method="std")
11751 return result
11752
11753 @doc(make_doc("skew", ndim=2))
11754 def skew(
11755 self,
11756 axis: Axis | None = 0,
11757 skipna: bool = True,
11758 numeric_only: bool = False,
11759 **kwargs,
11760 ):
11761 result = super().skew(axis, skipna, numeric_only, **kwargs)
11762 if isinstance(result, Series):
11763 result = result.__finalize__(self, method="skew")
11764 return result
11765
11766 @doc(make_doc("kurt", ndim=2))
11767 def kurt(
11768 self,
11769 axis: Axis | None = 0,
11770 skipna: bool = True,
11771 numeric_only: bool = False,
11772 **kwargs,
11773 ):
11774 result = super().kurt(axis, skipna, numeric_only, **kwargs)
11775 if isinstance(result, Series):
11776 result = result.__finalize__(self, method="kurt")
11777 return result
11778
11779 kurtosis = kurt
11780 product = prod
11781
11782 @doc(make_doc("cummin", ndim=2))
11783 def cummin(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs):
11784 return NDFrame.cummin(self, axis, skipna, *args, **kwargs)
11785
11786 @doc(make_doc("cummax", ndim=2))
11787 def cummax(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs):
11788 return NDFrame.cummax(self, axis, skipna, *args, **kwargs)
11789
11790 @doc(make_doc("cumsum", ndim=2))
11791 def cumsum(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs):
11792 return NDFrame.cumsum(self, axis, skipna, *args, **kwargs)
11793
    @doc(make_doc("cumprod", ndim=2))
11795 def cumprod(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs):
11796 return NDFrame.cumprod(self, axis, skipna, *args, **kwargs)
11797
11798 def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series:
11799 """
11800 Count number of distinct elements in specified axis.
11801
11802 Return Series with number of distinct elements. Can ignore NaN
11803 values.
11804
11805 Parameters
11806 ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to use. 0 or 'index' counts distinct values in each
            column, 1 or 'columns' counts distinct values in each row.
11810 dropna : bool, default True
11811 Don't include NaN in the counts.
11812
11813 Returns
11814 -------
11815 Series
11816
11817 See Also
11818 --------
11819 Series.nunique: Method nunique for Series.
11820 DataFrame.count: Count non-NA cells for each column or row.
11821
11822 Examples
11823 --------
11824 >>> df = pd.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]})
11825 >>> df.nunique()
11826 A 3
11827 B 2
11828 dtype: int64
11829
11830 >>> df.nunique(axis=1)
11831 0 1
11832 1 2
11833 2 2
11834 dtype: int64
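
        With missing data, ``dropna=False`` counts ``NaN`` as its own
        distinct value (a small illustration with an assumed frame):

        >>> df = pd.DataFrame({'A': [4, 5, 6], 'B': [4, np.nan, np.nan]})
        >>> df.nunique()
        A    3
        B    1
        dtype: int64
        >>> df.nunique(dropna=False)
        A    3
        B    2
        dtype: int64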
11835 """
11836 return self.apply(Series.nunique, axis=axis, dropna=dropna)
11837
11838 @doc(_shared_docs["idxmin"], numeric_only_default="False")
11839 def idxmin(
11840 self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
11841 ) -> Series:
11842 axis = self._get_axis_number(axis)
11843
11844 if self.empty and len(self.axes[axis]):
11845 axis_dtype = self.axes[axis].dtype
11846 return self._constructor_sliced(dtype=axis_dtype)
11847
11848 if numeric_only:
11849 data = self._get_numeric_data()
11850 else:
11851 data = self
11852
11853 res = data._reduce(
11854 nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False
11855 )
11856 indices = res._values
        # indices will always be np.ndarray since axis is not None
11858
11859 if (indices == -1).any():
11860 warnings.warn(
11861 f"The behavior of {type(self).__name__}.idxmin with all-NA "
11862 "values, or any-NA and skipna=False, is deprecated. In a future "
11863 "version this will raise ValueError",
11864 FutureWarning,
11865 stacklevel=find_stack_level(),
11866 )
11867
11868 index = data._get_axis(axis)
11869 result = algorithms.take(
11870 index._values, indices, allow_fill=True, fill_value=index._na_value
11871 )
11872 final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
11873 return final_result.__finalize__(self, method="idxmin")
11874
11875 @doc(_shared_docs["idxmax"], numeric_only_default="False")
11876 def idxmax(
11877 self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
11878 ) -> Series:
11879 axis = self._get_axis_number(axis)
11880
11881 if self.empty and len(self.axes[axis]):
11882 axis_dtype = self.axes[axis].dtype
11883 return self._constructor_sliced(dtype=axis_dtype)
11884
11885 if numeric_only:
11886 data = self._get_numeric_data()
11887 else:
11888 data = self
11889
11890 res = data._reduce(
11891 nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False
11892 )
11893 indices = res._values
11894 # indices will always be 1d array since axis is not None
11895
11896 if (indices == -1).any():
11897 warnings.warn(
11898 f"The behavior of {type(self).__name__}.idxmax with all-NA "
11899 "values, or any-NA and skipna=False, is deprecated. In a future "
11900 "version this will raise ValueError",
11901 FutureWarning,
11902 stacklevel=find_stack_level(),
11903 )
11904
11905 index = data._get_axis(axis)
11906 result = algorithms.take(
11907 index._values, indices, allow_fill=True, fill_value=index._na_value
11908 )
11909 final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
11910 return final_result.__finalize__(self, method="idxmax")
11911
11912 def _get_agg_axis(self, axis_num: int) -> Index:
11913 """
11914 Let's be explicit about this.
11915 """
11916 if axis_num == 0:
11917 return self.columns
11918 elif axis_num == 1:
11919 return self.index
11920 else:
11921 raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})")
11922
11923 def mode(
11924 self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True
11925 ) -> DataFrame:
11926 """
11927 Get the mode(s) of each element along the selected axis.
11928
11929 The mode of a set of values is the value that appears most often.
11930 It can be multiple values.
11931
11932 Parameters
11933 ----------
11934 axis : {0 or 'index', 1 or 'columns'}, default 0
11935 The axis to iterate over while searching for the mode:
11936
11937 * 0 or 'index' : get mode of each column
11938 * 1 or 'columns' : get mode of each row.
11939
11940 numeric_only : bool, default False
11941 If True, only apply to numeric columns.
11942 dropna : bool, default True
11943 Don't consider counts of NaN/NaT.
11944
11945 Returns
11946 -------
11947 DataFrame
11948 The modes of each column or row.
11949
11950 See Also
11951 --------
11952 Series.mode : Return the highest frequency value in a Series.
11953 Series.value_counts : Return the counts of values in a Series.
11954
11955 Examples
11956 --------
11957 >>> df = pd.DataFrame([('bird', 2, 2),
11958 ... ('mammal', 4, np.nan),
11959 ... ('arthropod', 8, 0),
11960 ... ('bird', 2, np.nan)],
11961 ... index=('falcon', 'horse', 'spider', 'ostrich'),
11962 ... columns=('species', 'legs', 'wings'))
11963 >>> df
11964 species legs wings
11965 falcon bird 2 2.0
11966 horse mammal 4 NaN
11967 spider arthropod 8 0.0
11968 ostrich bird 2 NaN
11969
        By default, missing values are not considered, and the modes of
        ``wings`` are both 0.0 and 2.0. Because the resulting DataFrame has
        two rows, the second row of ``species`` and ``legs`` contains ``NaN``.
11973
11974 >>> df.mode()
11975 species legs wings
11976 0 bird 2.0 0.0
11977 1 NaN NaN 2.0
11978
        With ``dropna=False``, ``NaN`` values are considered and they can be
        the mode (as is the case for ``wings``).
11981
11982 >>> df.mode(dropna=False)
11983 species legs wings
11984 0 bird 2 NaN
11985
11986 Setting ``numeric_only=True``, only the mode of numeric columns is
11987 computed, and columns of other types are ignored.
11988
11989 >>> df.mode(numeric_only=True)
11990 legs wings
11991 0 2.0 0.0
11992 1 NaN 2.0
11993
11994 To compute the mode over columns and not rows, use the axis parameter:
11995
11996 >>> df.mode(axis='columns', numeric_only=True)
11997 0 1
11998 falcon 2.0 NaN
11999 horse 4.0 NaN
12000 spider 0.0 8.0
12001 ostrich 2.0 NaN
12002 """
12003 data = self if not numeric_only else self._get_numeric_data()
12004
12005 def f(s):
12006 return s.mode(dropna=dropna)
12007
12008 data = data.apply(f, axis=axis)
12009 # Ensure index is type stable (should always use int index)
12010 if data.empty:
12011 data.index = default_index(0)
12012
12013 return data
12014
12015 @overload
12016 def quantile(
12017 self,
12018 q: float = ...,
12019 axis: Axis = ...,
12020 numeric_only: bool = ...,
12021 interpolation: QuantileInterpolation = ...,
12022 method: Literal["single", "table"] = ...,
12023 ) -> Series:
12024 ...
12025
12026 @overload
12027 def quantile(
12028 self,
12029 q: AnyArrayLike | Sequence[float],
12030 axis: Axis = ...,
12031 numeric_only: bool = ...,
12032 interpolation: QuantileInterpolation = ...,
12033 method: Literal["single", "table"] = ...,
12034 ) -> Series | DataFrame:
12035 ...
12036
12037 @overload
12038 def quantile(
12039 self,
12040 q: float | AnyArrayLike | Sequence[float] = ...,
12041 axis: Axis = ...,
12042 numeric_only: bool = ...,
12043 interpolation: QuantileInterpolation = ...,
12044 method: Literal["single", "table"] = ...,
12045 ) -> Series | DataFrame:
12046 ...
12047
12048 def quantile(
12049 self,
12050 q: float | AnyArrayLike | Sequence[float] = 0.5,
12051 axis: Axis = 0,
12052 numeric_only: bool = False,
12053 interpolation: QuantileInterpolation = "linear",
12054 method: Literal["single", "table"] = "single",
12055 ) -> Series | DataFrame:
12056 """
12057 Return values at the given quantile over requested axis.
12058
12059 Parameters
12060 ----------
12061 q : float or array-like, default 0.5 (50% quantile)
            Value(s) between 0 and 1 giving the quantile(s) to compute.
12063 axis : {0 or 'index', 1 or 'columns'}, default 0
12064 Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
12065 numeric_only : bool, default False
12066 Include only `float`, `int` or `boolean` data.
12067
12068 .. versionchanged:: 2.0.0
12069 The default value of ``numeric_only`` is now ``False``.
12070
12071 interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
12072 This optional parameter specifies the interpolation method to use,
12073 when the desired quantile lies between two data points `i` and `j`:
12074
12075 * linear: `i + (j - i) * fraction`, where `fraction` is the
12076 fractional part of the index surrounded by `i` and `j`.
12077 * lower: `i`.
12078 * higher: `j`.
12079 * nearest: `i` or `j` whichever is nearest.
12080 * midpoint: (`i` + `j`) / 2.
12081 method : {'single', 'table'}, default 'single'
12082 Whether to compute quantiles per-column ('single') or over all columns
12083 ('table'). When 'table', the only allowed interpolation methods are
12084 'nearest', 'lower', and 'higher'.
12085
12086 Returns
12087 -------
12088 Series or DataFrame
12089
12090 If ``q`` is an array, a DataFrame will be returned where the
12091 index is ``q``, the columns are the columns of self, and the
12092 values are the quantiles.
12093 If ``q`` is a float, a Series will be returned where the
12094 index is the columns of self and the values are the quantiles.
12095
12096 See Also
12097 --------
12098 core.window.rolling.Rolling.quantile: Rolling quantile.
12099 numpy.percentile: Numpy function to compute the percentile.
12100
12101 Examples
12102 --------
12103 >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
12104 ... columns=['a', 'b'])
12105 >>> df.quantile(.1)
12106 a 1.3
12107 b 3.7
12108 Name: 0.1, dtype: float64
12109 >>> df.quantile([.1, .5])
12110 a b
12111 0.1 1.3 3.7
12112 0.5 2.5 55.0
12113
12114 Specifying `method='table'` will compute the quantile over all columns.
12115
12116 >>> df.quantile(.1, method="table", interpolation="nearest")
12117 a 1
12118 b 1
12119 Name: 0.1, dtype: int64
12120 >>> df.quantile([.1, .5], method="table", interpolation="nearest")
12121 a b
12122 0.1 1 1
12123 0.5 3 100
12124
12125 Specifying `numeric_only=False` will also compute the quantile of
12126 datetime and timedelta data.
12127
12128 >>> df = pd.DataFrame({'A': [1, 2],
12129 ... 'B': [pd.Timestamp('2010'),
12130 ... pd.Timestamp('2011')],
12131 ... 'C': [pd.Timedelta('1 days'),
12132 ... pd.Timedelta('2 days')]})
12133 >>> df.quantile(0.5, numeric_only=False)
12134 A 1.5
12135 B 2010-07-02 12:00:00
12136 C 1 days 12:00:00
12137 Name: 0.5, dtype: object
12138 """
12139 validate_percentile(q)
12140 axis = self._get_axis_number(axis)
12141
12142 if not is_list_like(q):
12143 # BlockManager.quantile expects listlike, so we wrap and unwrap here
12144 # error: List item 0 has incompatible type "float | ExtensionArray |
12145 # ndarray[Any, Any] | Index | Series | Sequence[float]"; expected "float"
12146 res_df = self.quantile(
12147 [q], # type: ignore[list-item]
12148 axis=axis,
12149 numeric_only=numeric_only,
12150 interpolation=interpolation,
12151 method=method,
12152 )
12153 if method == "single":
12154 res = res_df.iloc[0]
12155 else:
12156 # cannot directly iloc over sparse arrays
12157 res = res_df.T.iloc[:, 0]
12158 if axis == 1 and len(self) == 0:
12159 # GH#41544 try to get an appropriate dtype
12160 dtype = find_common_type(list(self.dtypes))
12161 if needs_i8_conversion(dtype):
12162 return res.astype(dtype)
12163 return res
12164
12165 q = Index(q, dtype=np.float64)
12166 data = self._get_numeric_data() if numeric_only else self
12167
12168 if axis == 1:
12169 data = data.T
12170
12171 if len(data.columns) == 0:
12172 # GH#23925 _get_numeric_data may have dropped all columns
12173 cols = Index([], name=self.columns.name)
12174
12175 dtype = np.float64
12176 if axis == 1:
12177 # GH#41544 try to get an appropriate dtype
12178 cdtype = find_common_type(list(self.dtypes))
12179 if needs_i8_conversion(cdtype):
12180 dtype = cdtype
12181
12182 res = self._constructor([], index=q, columns=cols, dtype=dtype)
12183 return res.__finalize__(self, method="quantile")
12184
12185 valid_method = {"single", "table"}
12186 if method not in valid_method:
12187 raise ValueError(
12188 f"Invalid method: {method}. Method must be in {valid_method}."
12189 )
12190 if method == "single":
12191 res = data._mgr.quantile(qs=q, interpolation=interpolation)
12192 elif method == "table":
12193 valid_interpolation = {"nearest", "lower", "higher"}
12194 if interpolation not in valid_interpolation:
12195 raise ValueError(
12196 f"Invalid interpolation: {interpolation}. "
12197 f"Interpolation must be in {valid_interpolation}"
12198 )
12199 # handle degenerate case
12200 if len(data) == 0:
12201 if data.ndim == 2:
12202 dtype = find_common_type(list(self.dtypes))
12203 else:
12204 dtype = self.dtype
12205 return self._constructor([], index=q, columns=data.columns, dtype=dtype)
12206
12207 q_idx = np.quantile(np.arange(len(data)), q, method=interpolation)
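            # integer row positions within the sorted order that correspond to
            # the requested quantiles (the allowed interpolation methods all
            # return existing positions, not interpolated ones)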
12208
12209 by = data.columns
12210 if len(by) > 1:
12211 keys = [data._get_label_or_level_values(x) for x in by]
12212 indexer = lexsort_indexer(keys)
12213 else:
12214 k = data._get_label_or_level_values(by[0])
12215 indexer = nargsort(k)
12216
12217 res = data._mgr.take(indexer[q_idx], verify=False)
12218 res.axes[1] = q
12219
12220 result = self._constructor_from_mgr(res, axes=res.axes)
12221 return result.__finalize__(self, method="quantile")
12222
12223 def to_timestamp(
12224 self,
12225 freq: Frequency | None = None,
12226 how: ToTimestampHow = "start",
12227 axis: Axis = 0,
12228 copy: bool | None = None,
12229 ) -> DataFrame:
12230 """
12231 Cast to DatetimeIndex of timestamps, at *beginning* of period.
12232
12233 Parameters
12234 ----------
12235 freq : str, default frequency of PeriodIndex
12236 Desired frequency.
12237 how : {'s', 'e', 'start', 'end'}
12238 Convention for converting period to timestamp; start of period
12239 vs. end.
12240 axis : {0 or 'index', 1 or 'columns'}, default 0
12241 The axis to convert (the index by default).
12242 copy : bool, default True
12243 If False then underlying input data is not copied.
12244
12245 .. note::
12246 The `copy` keyword will change behavior in pandas 3.0.
12247 `Copy-on-Write
12248 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
12249 will be enabled by default, which means that all methods with a
12250 `copy` keyword will use a lazy copy mechanism to defer the copy and
12251 ignore the `copy` keyword. The `copy` keyword will be removed in a
12252 future version of pandas.
12253
12254 You can already get the future behavior and improvements through
12255 enabling copy on write ``pd.options.mode.copy_on_write = True``
12256
12257 Returns
12258 -------
12259 DataFrame
12260 The DataFrame has a DatetimeIndex.
12261
12262 Examples
12263 --------
12264 >>> idx = pd.PeriodIndex(['2023', '2024'], freq='Y')
12265 >>> d = {'col1': [1, 2], 'col2': [3, 4]}
12266 >>> df1 = pd.DataFrame(data=d, index=idx)
12267 >>> df1
12268 col1 col2
12269 2023 1 3
12270 2024 2 4
12271
12272 The resulting timestamps will be at the beginning of the year in this case
12273
12274 >>> df1 = df1.to_timestamp()
12275 >>> df1
12276 col1 col2
12277 2023-01-01 1 3
12278 2024-01-01 2 4
12279 >>> df1.index
12280 DatetimeIndex(['2023-01-01', '2024-01-01'], dtype='datetime64[ns]', freq=None)
12281
12282 Using `freq` which is the offset that the Timestamps will have
12283
12284 >>> df2 = pd.DataFrame(data=d, index=idx)
12285 >>> df2 = df2.to_timestamp(freq='M')
12286 >>> df2
12287 col1 col2
12288 2023-01-31 1 3
12289 2024-01-31 2 4
12290 >>> df2.index
12291 DatetimeIndex(['2023-01-31', '2024-01-31'], dtype='datetime64[ns]', freq=None)
12292 """
12293 new_obj = self.copy(deep=copy and not using_copy_on_write())
12294
12295 axis_name = self._get_axis_name(axis)
12296 old_ax = getattr(self, axis_name)
12297 if not isinstance(old_ax, PeriodIndex):
12298 raise TypeError(f"unsupported Type {type(old_ax).__name__}")
12299
12300 new_ax = old_ax.to_timestamp(freq=freq, how=how)
12301
12302 setattr(new_obj, axis_name, new_ax)
12303 return new_obj
12304
12305 def to_period(
12306 self, freq: Frequency | None = None, axis: Axis = 0, copy: bool | None = None
12307 ) -> DataFrame:
12308 """
12309 Convert DataFrame from DatetimeIndex to PeriodIndex.
12310
12311 Convert DataFrame from DatetimeIndex to PeriodIndex with desired
12312 frequency (inferred from index if not passed).
12313
12314 Parameters
12315 ----------
        freq : str, optional
            Frequency of the PeriodIndex. Defaults to the frequency inferred
            from the index.
12318 axis : {0 or 'index', 1 or 'columns'}, default 0
12319 The axis to convert (the index by default).
12320 copy : bool, default True
12321 If False then underlying input data is not copied.
12322
12323 .. note::
12324 The `copy` keyword will change behavior in pandas 3.0.
12325 `Copy-on-Write
12326 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
12327 will be enabled by default, which means that all methods with a
12328 `copy` keyword will use a lazy copy mechanism to defer the copy and
12329 ignore the `copy` keyword. The `copy` keyword will be removed in a
12330 future version of pandas.
12331
12332 You can already get the future behavior and improvements through
12333 enabling copy on write ``pd.options.mode.copy_on_write = True``
12334
12335 Returns
12336 -------
12337 DataFrame
12338 The DataFrame has a PeriodIndex.
12339
12340 Examples
12341 --------
12342 >>> idx = pd.to_datetime(
12343 ... [
12344 ... "2001-03-31 00:00:00",
12345 ... "2002-05-31 00:00:00",
12346 ... "2003-08-31 00:00:00",
12347 ... ]
12348 ... )
12349
12350 >>> idx
12351 DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'],
12352 dtype='datetime64[ns]', freq=None)
12353
12354 >>> idx.to_period("M")
12355 PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]')
12356
12357 For the yearly frequency
12358
12359 >>> idx.to_period("Y")
12360 PeriodIndex(['2001', '2002', '2003'], dtype='period[Y-DEC]')
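
        Applied to a DataFrame, the conversion happens on the requested axis
        (a minimal sketch reusing ``idx`` from above; the column name is
        arbitrary):

        >>> df = pd.DataFrame({"y": [1, 2, 3]}, index=idx)
        >>> df.to_period("M").index
        PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]')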
12361 """
12362 new_obj = self.copy(deep=copy and not using_copy_on_write())
12363
12364 axis_name = self._get_axis_name(axis)
12365 old_ax = getattr(self, axis_name)
12366 if not isinstance(old_ax, DatetimeIndex):
12367 raise TypeError(f"unsupported Type {type(old_ax).__name__}")
12368
12369 new_ax = old_ax.to_period(freq=freq)
12370
12371 setattr(new_obj, axis_name, new_ax)
12372 return new_obj
12373
12374 def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame:
12375 """
12376 Whether each element in the DataFrame is contained in values.
12377
12378 Parameters
12379 ----------
12380 values : iterable, Series, DataFrame or dict
12381 The result will only be true at a location if all the
12382 labels match. If `values` is a Series, that's the index. If
12383 `values` is a dict, the keys must be the column names,
12384 which must match. If `values` is a DataFrame,
12385 then both the index and column labels must match.
12386
12387 Returns
12388 -------
12389 DataFrame
12390 DataFrame of booleans showing whether each element in the DataFrame
12391 is contained in values.
12392
12393 See Also
12394 --------
12395 DataFrame.eq: Equality test for DataFrame.
12396 Series.isin: Equivalent method on Series.
12397 Series.str.contains: Test if pattern or regex is contained within a
12398 string of a Series or Index.
12399
12400 Examples
12401 --------
12402 >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
12403 ... index=['falcon', 'dog'])
12404 >>> df
12405 num_legs num_wings
12406 falcon 2 2
12407 dog 4 0
12408
12409 When ``values`` is a list check whether every value in the DataFrame
12410 is present in the list (which animals have 0 or 2 legs or wings)
12411
12412 >>> df.isin([0, 2])
12413 num_legs num_wings
12414 falcon True True
12415 dog False True
12416
12417 To check if ``values`` is *not* in the DataFrame, use the ``~`` operator:
12418
12419 >>> ~df.isin([0, 2])
12420 num_legs num_wings
12421 falcon False False
12422 dog True False
12423
12424 When ``values`` is a dict, we can pass values to check for each
12425 column separately:
12426
12427 >>> df.isin({'num_wings': [0, 3]})
12428 num_legs num_wings
12429 falcon False False
12430 dog False True
12431
12432 When ``values`` is a Series or DataFrame the index and column must
12433 match. Note that 'falcon' does not match based on the number of legs
12434 in other.
12435
12436 >>> other = pd.DataFrame({'num_legs': [8, 3], 'num_wings': [0, 2]},
12437 ... index=['spider', 'falcon'])
12438 >>> df.isin(other)
12439 num_legs num_wings
12440 falcon False True
12441 dog False False
12442 """
12443 if isinstance(values, dict):
12444 from pandas.core.reshape.concat import concat
12445
12446 values = collections.defaultdict(list, values)
12447 result = concat(
12448 (
12449 self.iloc[:, [i]].isin(values[col])
12450 for i, col in enumerate(self.columns)
12451 ),
12452 axis=1,
12453 )
12454 elif isinstance(values, Series):
12455 if not values.index.is_unique:
12456 raise ValueError("cannot compute isin with a duplicate axis.")
12457 result = self.eq(values.reindex_like(self), axis="index")
12458 elif isinstance(values, DataFrame):
12459 if not (values.columns.is_unique and values.index.is_unique):
12460 raise ValueError("cannot compute isin with a duplicate axis.")
12461 result = self.eq(values.reindex_like(self))
12462 else:
12463 if not is_list_like(values):
12464 raise TypeError(
12465 "only list-like or dict-like objects are allowed "
12466 "to be passed to DataFrame.isin(), "
12467 f"you passed a '{type(values).__name__}'"
12468 )
12469
12470 def isin_(x):
12471 # error: Argument 2 to "isin" has incompatible type "Union[Series,
12472 # DataFrame, Sequence[Any], Mapping[Any, Any]]"; expected
12473 # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]], Index,
12474 # Series], List[Any], range]"
12475 result = algorithms.isin(
12476 x.ravel(),
12477 values, # type: ignore[arg-type]
12478 )
12479 return result.reshape(x.shape)
12480
12481 res_mgr = self._mgr.apply(isin_)
12482 result = self._constructor_from_mgr(
12483 res_mgr,
12484 axes=res_mgr.axes,
12485 )
12486 return result.__finalize__(self, method="isin")
12487
12488 # ----------------------------------------------------------------------
12489 # Add index and columns
12490 _AXIS_ORDERS: list[Literal["index", "columns"]] = ["index", "columns"]
12491 _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = {
12492 **NDFrame._AXIS_TO_AXIS_NUMBER,
12493 1: 1,
12494 "columns": 1,
12495 }
12496 _AXIS_LEN = len(_AXIS_ORDERS)
12497 _info_axis_number: Literal[1] = 1
12498 _info_axis_name: Literal["columns"] = "columns"
12499
12500 index = properties.AxisProperty(
12501 axis=1,
12502 doc="""
12503 The index (row labels) of the DataFrame.
12504
12505 The index of a DataFrame is a series of labels that identify each row.
12506 The labels can be integers, strings, or any other hashable type. The index
12507 is used for label-based access and alignment, and can be accessed or
12508 modified using this attribute.
12509
12510 Returns
12511 -------
12512 pandas.Index
12513 The index labels of the DataFrame.
12514
12515 See Also
12516 --------
12517 DataFrame.columns : The column labels of the DataFrame.
12518 DataFrame.to_numpy : Convert the DataFrame to a NumPy array.
12519
12520 Examples
12521 --------
12522 >>> df = pd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'],
12523 ... 'Age': [25, 30, 35],
12524 ... 'Location': ['Seattle', 'New York', 'Kona']},
12525 ... index=([10, 20, 30]))
12526 >>> df.index
12527 Index([10, 20, 30], dtype='int64')
12528
12529 In this example, we create a DataFrame with 3 rows and 3 columns,
12530 including Name, Age, and Location information. We set the index labels to
12531 be the integers 10, 20, and 30. We then access the `index` attribute of the
12532 DataFrame, which returns an `Index` object containing the index labels.
12533
12534 >>> df.index = [100, 200, 300]
12535 >>> df
12536 Name Age Location
12537 100 Alice 25 Seattle
12538 200 Bob 30 New York
12539 300 Aritra 35 Kona
12540
12541 In this example, we modify the index labels of the DataFrame by assigning
12542 a new list of labels to the `index` attribute. The DataFrame is then
12543 updated with the new labels, and the output shows the modified DataFrame.
12544 """,
12545 )
12546 columns = properties.AxisProperty(
12547 axis=0,
12548 doc=dedent(
12549 """
12550 The column labels of the DataFrame.
12551
12552 Examples
12553 --------
12554 >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
12555 >>> df
12556 A B
12557 0 1 3
12558 1 2 4
12559 >>> df.columns
12560 Index(['A', 'B'], dtype='object')
12561 """
12562 ),
12563 )
12564
12565 # ----------------------------------------------------------------------
12566 # Add plotting methods to DataFrame
12567 plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
12568 hist = pandas.plotting.hist_frame
12569 boxplot = pandas.plotting.boxplot_frame
12570 sparse = CachedAccessor("sparse", SparseFrameAccessor)
12571
12572 # ----------------------------------------------------------------------
12573 # Internal Interface Methods
12574
12575 def _to_dict_of_blocks(self):
12576 """
        Return a dict of dtype -> constructed DataFrame, where each frame
        contains only columns of that homogeneous dtype.
12579
12580 Internal ONLY - only works for BlockManager
12581 """
12582 mgr = self._mgr
12583 # convert to BlockManager if needed -> this way support ArrayManager as well
12584 mgr = cast(BlockManager, mgr_to_mgr(mgr, "block"))
12585 return {
12586 k: self._constructor_from_mgr(v, axes=v.axes).__finalize__(self)
            for k, v in mgr.to_dict().items()
12588 }
12589
12590 @property
12591 def values(self) -> np.ndarray:
12592 """
12593 Return a Numpy representation of the DataFrame.
12594
12595 .. warning::
12596
12597 We recommend using :meth:`DataFrame.to_numpy` instead.
12598
12599 Only the values in the DataFrame will be returned, the axes labels
12600 will be removed.
12601
12602 Returns
12603 -------
12604 numpy.ndarray
12605 The values of the DataFrame.
12606
12607 See Also
12608 --------
12609 DataFrame.to_numpy : Recommended alternative to this method.
12610 DataFrame.index : Retrieve the index labels.
12611 DataFrame.columns : Retrieving the column names.
12612
12613 Notes
12614 -----
12615 The dtype will be a lower-common-denominator dtype (implicit
12616 upcasting); that is to say if the dtypes (even of numeric types)
12617 are mixed, the one that accommodates all will be chosen. Use this
12618 with care if you are not dealing with the blocks.
12619
12620 e.g. If the dtypes are float16 and float32, dtype will be upcast to
12621 float32. If dtypes are int32 and uint8, dtype will be upcast to
12622 int32. By :func:`numpy.find_common_type` convention, mixing int64
12623 and uint64 will result in a float64 dtype.
12624
12625 Examples
12626 --------
12627 A DataFrame where all columns are the same type (e.g., int64) results
12628 in an array of the same type.
12629
12630 >>> df = pd.DataFrame({'age': [ 3, 29],
12631 ... 'height': [94, 170],
12632 ... 'weight': [31, 115]})
12633 >>> df
12634 age height weight
12635 0 3 94 31
12636 1 29 170 115
12637 >>> df.dtypes
12638 age int64
12639 height int64
12640 weight int64
12641 dtype: object
12642 >>> df.values
12643 array([[ 3, 94, 31],
12644 [ 29, 170, 115]])
12645
        A DataFrame with mixed type columns (e.g., str/object, int64, float32)
12647 results in an ndarray of the broadest type that accommodates these
12648 mixed types (e.g., object).
12649
12650 >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'),
12651 ... ('lion', 80.5, 1),
12652 ... ('monkey', np.nan, None)],
12653 ... columns=('name', 'max_speed', 'rank'))
12654 >>> df2.dtypes
12655 name object
12656 max_speed float64
12657 rank object
12658 dtype: object
12659 >>> df2.values
12660 array([['parrot', 24.0, 'second'],
12661 ['lion', 80.5, 1],
12662 ['monkey', nan, None]], dtype=object)
12663 """
12664 return self._mgr.as_array()
12665
12666
12667def _from_nested_dict(data) -> collections.defaultdict:
12668 new_data: collections.defaultdict = collections.defaultdict(dict)
12669 for index, s in data.items():
12670 for col, v in s.items():
12671 new_data[col][index] = v
12672 return new_data
12673
12674
12675def _reindex_for_setitem(
12676 value: DataFrame | Series, index: Index
12677) -> tuple[ArrayLike, BlockValuesRefs | None]:
12678 # reindex if necessary
12679
12680 if value.index.equals(index) or not len(index):
12681 if using_copy_on_write() and isinstance(value, Series):
12682 return value._values, value._references
12683 return value._values.copy(), None
12684
12685 # GH#4107
12686 try:
12687 reindexed_value = value.reindex(index)._values
12688 except ValueError as err:
12689 # raised in MultiIndex.from_tuples, see test_insert_error_msmgs
12690 if not value.index.is_unique:
12691 # duplicate axis
12692 raise err
12693
12694 raise TypeError(
12695 "incompatible index of inserted column with frame index"
12696 ) from err
12697 return reindexed_value, None