"""
DataFrame
---------
An efficient 2D container for potentially mixed-type time series or other
labeled data series.

Similar to its R counterpart, data.frame, except providing automatic data
alignment and a host of useful data manipulation methods having to do with the
labeling information.
"""
from __future__ import annotations

import collections
from collections import abc
import datetime
import functools
from io import StringIO
import itertools
import sys
from textwrap import dedent
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Hashable,
    Iterable,
    Iterator,
    Literal,
    Mapping,
    Sequence,
    cast,
    overload,
)
import warnings

import numpy as np
from numpy import ma

from pandas._config import (
    get_option,
    using_copy_on_write,
)

from pandas._libs import (
    algos as libalgos,
    lib,
    properties,
)
from pandas._libs.hashtable import duplicated
from pandas._libs.lib import (
    NoDefault,
    is_range_indexer,
    no_default,
)
from pandas._typing import (
    AggFuncType,
    AlignJoin,
    AnyAll,
    AnyArrayLike,
    ArrayLike,
    Axes,
    Axis,
    AxisInt,
    ColspaceArgType,
    CompressionOptions,
    CorrelationMethod,
    DropKeep,
    Dtype,
    DtypeObj,
    FilePath,
    FillnaOptions,
    FloatFormatType,
    FormattersType,
    Frequency,
    IgnoreRaise,
    IndexKeyFunc,
    IndexLabel,
    Level,
    MergeHow,
    NaPosition,
    PythonFuncType,
    QuantileInterpolation,
    ReadBuffer,
    Renamer,
    Scalar,
    SortKind,
    StorageOptions,
    Suffixes,
    TimedeltaConvertibleTypes,
    TimestampConvertibleTypes,
    ValueKeyFunc,
    WriteBuffer,
    npt,
)
from pandas.compat import PYPY
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import (
    function as nv,
    np_percentile_argname,
)
from pandas.errors import (
    ChainedAssignmentError,
    InvalidIndexError,
    _chained_assignment_msg,
)
from pandas.util._decorators import (
    Appender,
    Substitution,
    doc,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import (
    validate_ascending,
    validate_bool_kwarg,
    validate_percentile,
)

from pandas.core.dtypes.cast import (
    LossySetitemError,
    can_hold_element,
    construct_1d_arraylike_from_scalar,
    construct_2d_arraylike_from_scalar,
    find_common_type,
    infer_dtype_from_scalar,
    invalidate_string_dtypes,
    maybe_box_native,
    maybe_downcast_to_dtype,
)
from pandas.core.dtypes.common import (
    infer_dtype_from_object,
    is_1d_only_ea_dtype,
    is_bool_dtype,
    is_dataclass,
    is_dict_like,
    is_dtype_equal,
    is_extension_array_dtype,
    is_float,
    is_float_dtype,
    is_hashable,
    is_integer,
    is_integer_dtype,
    is_iterator,
    is_list_like,
    is_scalar,
    is_sequence,
    needs_i8_conversion,
    pandas_dtype,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.missing import (
    isna,
    notna,
)

from pandas.core import (
    algorithms,
    common as com,
    nanops,
    ops,
)
from pandas.core.accessor import CachedAccessor
from pandas.core.apply import (
    reconstruct_func,
    relabel_result,
)
from pandas.core.array_algos.take import take_2d_multi
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays import (
    DatetimeArray,
    ExtensionArray,
    PeriodArray,
    TimedeltaArray,
)
from pandas.core.arrays.arrow import ArrowDtype
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.construction import (
    ensure_wrapped_if_datetimelike,
    extract_array,
    sanitize_array,
    sanitize_masked_array,
)
from pandas.core.generic import NDFrame
from pandas.core.indexers import check_key_length
from pandas.core.indexes.api import (
    DatetimeIndex,
    Index,
    PeriodIndex,
    default_index,
    ensure_index,
    ensure_index_from_sequences,
)
from pandas.core.indexes.multi import (
    MultiIndex,
    maybe_droplevels,
)
from pandas.core.indexing import (
    check_bool_indexer,
    check_dict_or_set_indexers,
)
from pandas.core.internals import (
    ArrayManager,
    BlockManager,
)
from pandas.core.internals.construction import (
    arrays_to_mgr,
    dataclasses_to_dicts,
    dict_to_mgr,
    mgr_to_mgr,
    ndarray_to_mgr,
    nested_data_to_arrays,
    rec_array_to_mgr,
    reorder_arrays,
    to_arrays,
    treat_as_nested,
)
from pandas.core.methods import selectn
from pandas.core.reshape.melt import melt
from pandas.core.series import Series
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import (
    get_group_index,
    lexsort_indexer,
    nargsort,
)

from pandas.io.common import get_handle
from pandas.io.formats import (
    console,
    format as fmt,
)
from pandas.io.formats.info import (
    INFO_DOCSTRING,
    DataFrameInfo,
    frame_sub_kwargs,
)
import pandas.plotting

if TYPE_CHECKING:
    from pandas.core.groupby.generic import DataFrameGroupBy
    from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg
    from pandas.core.internals import SingleDataManager
    from pandas.core.resample import Resampler

    from pandas.io.formats.style import Styler

# ---------------------------------------------------------------------
# Docstring templates

_shared_doc_kwargs = {
    "axes": "index, columns",
    "klass": "DataFrame",
    "axes_single_arg": "{0 or 'index', 1 or 'columns'}",
    "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0
        If 0 or 'index': apply function to each column.
        If 1 or 'columns': apply function to each row.""",
    "inplace": """
    inplace : bool, default False
        Whether to modify the DataFrame rather than creating a new one.""",
    "optional_by": """
by : str or list of str
    Name or list of names to sort by.

    - if `axis` is 0 or `'index'` then `by` may contain index
      levels and/or column labels.
    - if `axis` is 1 or `'columns'` then `by` may contain column
      levels and/or index labels.""",
    "optional_reindex": """
labels : array-like, optional
    New labels / index to conform the axis specified by 'axis' to.
index : array-like, optional
    New labels for the index. Preferably an Index object to avoid
    duplicating data.
columns : array-like, optional
    New labels for the columns. Preferably an Index object to avoid
    duplicating data.
axis : int or str, optional
    Axis to target. Can be either the axis name ('index', 'columns')
    or number (0, 1).""",
    "replace_iloc": """
    This differs from updating with ``.loc`` or ``.iloc``, which require
    you to specify a location to update with some value.""",
}

_numeric_only_doc = """numeric_only : bool, default False
    Include only float, int, boolean data.
"""

_merge_doc = """
Merge DataFrame or named Series objects with a database-style join.

A named Series object is treated as a DataFrame with a single named column.

The join is done on columns or indexes. If joining columns on
columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
on indexes or indexes on a column or columns, the index will be passed on.
When performing a cross merge, no column specifications to merge on are
allowed.

.. warning::

    If both key columns contain rows where the key is a null value, those
    rows will be matched against each other. This is different from usual SQL
    join behaviour and can lead to unexpected results (see the last example
    below).

Parameters
----------%s
right : DataFrame or named Series
    Object to merge with.
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
    Type of merge to be performed.

    * left: use only keys from left frame, similar to a SQL left outer join;
      preserve key order.
    * right: use only keys from right frame, similar to a SQL right outer join;
      preserve key order.
    * outer: use union of keys from both frames, similar to a SQL full outer
      join; sort keys lexicographically.
    * inner: use intersection of keys from both frames, similar to a SQL inner
      join; preserve the order of the left keys.
    * cross: creates the cartesian product from both frames, preserves the order
      of the left keys.

      .. versionadded:: 1.2.0

on : label or list
    Column or index level names to join on. These must be found in both
    DataFrames. If `on` is None and not merging on indexes then this defaults
    to the intersection of the columns in both DataFrames.
left_on : label or list, or array-like
    Column or index level names to join on in the left DataFrame. Can also
    be an array or list of arrays of the length of the left DataFrame.
    These arrays are treated as if they are columns.
right_on : label or list, or array-like
    Column or index level names to join on in the right DataFrame. Can also
    be an array or list of arrays of the length of the right DataFrame.
    These arrays are treated as if they are columns.
left_index : bool, default False
    Use the index from the left DataFrame as the join key(s). If it is a
    MultiIndex, the number of keys in the other DataFrame (either the index
    or a number of columns) must match the number of levels.
right_index : bool, default False
    Use the index from the right DataFrame as the join key. Same caveats as
    left_index.
sort : bool, default False
    Sort the join keys lexicographically in the result DataFrame. If False,
    the order of the join keys depends on the join type (how keyword).
suffixes : list-like, default is ("_x", "_y")
    A length-2 sequence where each element is optionally a string
    indicating the suffix to add to overlapping column names in
    `left` and `right` respectively. Pass a value of `None` instead
    of a string to indicate that the column name from `left` or
    `right` should be left as-is, with no suffix. At least one of the
    values must not be None.
copy : bool, default True
    If False, avoid copy if possible.
indicator : bool or str, default False
    If True, adds a column to the output DataFrame called "_merge" with
    information on the source of each row. The column can be given a different
    name by providing a string argument. The column will have a Categorical
    type with the value of "left_only" for observations whose merge key only
    appears in the left DataFrame, "right_only" for observations
    whose merge key only appears in the right DataFrame, and "both"
    if the observation's merge key is found in both DataFrames.

validate : str, optional
    If specified, checks if merge is of specified type.

    * "one_to_one" or "1:1": check if merge keys are unique in both
      left and right datasets.
    * "one_to_many" or "1:m": check if merge keys are unique in left
      dataset.
    * "many_to_one" or "m:1": check if merge keys are unique in right
      dataset.
    * "many_to_many" or "m:m": allowed, but does not result in checks.

Returns
-------
DataFrame
    A DataFrame of the two merged objects.

See Also
--------
merge_ordered : Merge with optional filling/interpolation.
merge_asof : Merge on nearest keys.
DataFrame.join : Similar method using indices.

Notes
-----
Support for specifying index levels as the `on`, `left_on`, and
`right_on` parameters was added in version 0.23.0.
Support for merging named Series objects was added in version 0.24.0.

Examples
--------
>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [1, 2, 3, 5]})
>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [5, 6, 7, 8]})
>>> df1
  lkey  value
0  foo      1
1  bar      2
2  baz      3
3  foo      5
>>> df2
  rkey  value
0  foo      5
1  bar      6
2  baz      7
3  foo      8

Merge df1 and df2 on the lkey and rkey columns. The value columns have
the default suffixes, _x and _y, appended.

>>> df1.merge(df2, left_on='lkey', right_on='rkey')
  lkey  value_x rkey  value_y
0  foo        1  foo        5
1  foo        1  foo        8
2  foo        5  foo        5
3  foo        5  foo        8
4  bar        2  bar        6
5  baz        3  baz        7

Merge DataFrames df1 and df2 with specified left and right suffixes
appended to any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey',
...           suffixes=('_left', '_right'))
  lkey  value_left rkey  value_right
0  foo           1  foo            5
1  foo           1  foo            8
2  foo           5  foo            5
3  foo           5  foo            8
4  bar           2  bar            6
5  baz           3  baz            7

Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
Traceback (most recent call last):
...
ValueError: columns overlap but no suffix specified:
    Index(['value'], dtype='object')

>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
>>> df1
     a  b
0  foo  1
1  bar  2
>>> df2
     a  c
0  foo  3
1  baz  4

>>> df1.merge(df2, how='inner', on='a')
     a  b  c
0  foo  1  3

>>> df1.merge(df2, how='left', on='a')
     a  b    c
0  foo  1  3.0
1  bar  2  NaN

>>> df1 = pd.DataFrame({'left': ['foo', 'bar']})
>>> df2 = pd.DataFrame({'right': [7, 8]})
>>> df1
  left
0  foo
1  bar
>>> df2
   right
0      7
1      8

>>> df1.merge(df2, how='cross')
  left  right
0  foo      7
1  foo      8
2  bar      7
3  bar      8
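
As noted in the warning above, null keys match each other, unlike in SQL
joins. A minimal sketch (the printed alignment is illustrative):

>>> df1 = pd.DataFrame({'a': [1.0, np.nan], 'b': [1, 2]})
>>> df2 = pd.DataFrame({'a': [1.0, np.nan], 'c': [3, 4]})
>>> df1.merge(df2, on='a')
     a  b  c
0  1.0  1  3
1  NaN  2  4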
"""


# -----------------------------------------------------------------------
# DataFrame class


class DataFrame(NDFrame, OpsMixin):
    """
    Two-dimensional, size-mutable, potentially heterogeneous tabular data.

    Data structure also contains labeled axes (rows and columns).
    Arithmetic operations align on both row and column labels. Can be
    thought of as a dict-like container for Series objects. The primary
    pandas data structure.

    Parameters
    ----------
    data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
        Dict can contain Series, arrays, constants, dataclass or list-like objects. If
        data is a dict, column order follows insertion-order. If a dict contains Series
        which have an index defined, it is aligned by its index. This alignment also
        occurs if data is a Series or a DataFrame itself. Alignment is done on
        Series/DataFrame inputs.

        If data is a list of dicts, column order follows insertion-order.

    index : Index or array-like
        Index to use for resulting frame. Will default to RangeIndex if
        no indexing information is part of the input data and no index is
        provided.
    columns : Index or array-like
        Column labels to use for resulting frame when data does not have them,
        defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels,
        will perform column selection instead.
    dtype : dtype, default None
        Data type to force. Only a single dtype is allowed. If None, infer.
    copy : bool or None, default None
        Copy data from inputs.
        For dict data, the default of None behaves like ``copy=True``. For DataFrame
        or 2d ndarray input, the default of None behaves like ``copy=False``.
        If data is a dict containing one or more Series (possibly of different dtypes),
        ``copy=False`` will ensure that these inputs are not copied.

        .. versionchanged:: 1.3.0

    See Also
    --------
    DataFrame.from_records : Constructor from tuples, also record arrays.
    DataFrame.from_dict : From dicts of Series, arrays, or dicts.
    read_csv : Read a comma-separated values (csv) file into DataFrame.
    read_table : Read general delimited file into DataFrame.
    read_clipboard : Read text from clipboard into DataFrame.

    Notes
    -----
    Please reference the :ref:`User Guide <basics.dataframe>` for more information.

    Examples
    --------
    Constructing DataFrame from a dictionary.

    >>> d = {'col1': [1, 2], 'col2': [3, 4]}
    >>> df = pd.DataFrame(data=d)
    >>> df
       col1  col2
    0     1     3
    1     2     4

    Notice that the inferred dtype is int64.

    >>> df.dtypes
    col1    int64
    col2    int64
    dtype: object

    To enforce a single dtype:

    >>> df = pd.DataFrame(data=d, dtype=np.int8)
    >>> df.dtypes
    col1    int8
    col2    int8
    dtype: object

    Constructing DataFrame from a dictionary including Series:

    >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
    >>> pd.DataFrame(data=d, index=[0, 1, 2, 3])
       col1  col2
    0     0   NaN
    1     1   NaN
    2     2   2.0
    3     3   3.0

    Constructing DataFrame from numpy ndarray:

    >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
    ...                    columns=['a', 'b', 'c'])
    >>> df2
       a  b  c
    0  1  2  3
    1  4  5  6
    2  7  8  9

    Constructing DataFrame from a numpy ndarray that has labeled columns:

    >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
    ...                 dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")])
    >>> df3 = pd.DataFrame(data, columns=['c', 'a'])
    ...
    >>> df3
       c  a
    0  3  1
    1  6  4
    2  9  7

    Constructing DataFrame from dataclass:

    >>> from dataclasses import make_dataclass
    >>> Point = make_dataclass("Point", [("x", int), ("y", int)])
    >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
       x  y
    0  0  0
    1  0  3
    2  2  3

    Constructing DataFrame from Series/DataFrame:

    >>> ser = pd.Series([1, 2, 3], index=["a", "b", "c"])
    >>> df = pd.DataFrame(data=ser, index=["a", "c"])
    >>> df
       0
    a  1
    c  3

    >>> df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"], columns=["x"])
    >>> df2 = pd.DataFrame(data=df1, index=["a", "c"])
    >>> df2
       x
    a  1
    c  3
    """

    _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set
    _typ = "dataframe"
    _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray)
    _accessors: set[str] = {"sparse"}
    _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([])
    _mgr: BlockManager | ArrayManager

    @property
    def _constructor(self) -> Callable[..., DataFrame]:
        return DataFrame

    _constructor_sliced: Callable[..., Series] = Series

    # ----------------------------------------------------------------------
    # Constructors

    def __init__(
        self,
        data=None,
        index: Axes | None = None,
        columns: Axes | None = None,
        dtype: Dtype | None = None,
        copy: bool | None = None,
    ) -> None:
        if dtype is not None:
            dtype = self._validate_dtype(dtype)

        if isinstance(data, DataFrame):
            data = data._mgr
            if not copy:
                # if not copying data, ensure to still return a shallow copy
                # to avoid the result sharing the same Manager
                data = data.copy(deep=False)

        if isinstance(data, (BlockManager, ArrayManager)):
            if using_copy_on_write():
                data = data.copy(deep=False)
            # first check if a Manager is passed without any other arguments
            # -> use fastpath (without checking Manager type)
            if index is None and columns is None and dtype is None and not copy:
                # GH#33357 fastpath
                NDFrame.__init__(self, data)
                return

        manager = get_option("mode.data_manager")

        # GH47215
        if index is not None and isinstance(index, set):
            raise ValueError("index cannot be a set")
        if columns is not None and isinstance(columns, set):
            raise ValueError("columns cannot be a set")

        if copy is None:
            if isinstance(data, dict):
                # retain pre-GH#38939 default behavior
                copy = True
            elif (
                manager == "array"
                and isinstance(data, (np.ndarray, ExtensionArray))
                and data.ndim == 2
            ):
                # INFO(ArrayManager) by default copy the 2D input array to get
                # contiguous 1D arrays
                copy = True
            elif using_copy_on_write() and not isinstance(
                data, (Index, DataFrame, Series)
            ):
                copy = True
            else:
                copy = False

        if data is None:
            index = index if index is not None else default_index(0)
            columns = columns if columns is not None else default_index(0)
            dtype = dtype if dtype is not None else pandas_dtype(object)
            data = []

        if isinstance(data, (BlockManager, ArrayManager)):
            mgr = self._init_mgr(
                data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
            )

        elif isinstance(data, dict):
            # GH#38939 de facto copy defaults to False only in non-dict cases
            mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
        elif isinstance(data, ma.MaskedArray):
            from numpy.ma import mrecords

            # masked recarray
            if isinstance(data, mrecords.MaskedRecords):
                raise TypeError(
                    "MaskedRecords are not supported. Pass "
                    "{name: data[name] for name in data.dtype.names} "
                    "instead"
                )

            # a masked array
            data = sanitize_masked_array(data)
            mgr = ndarray_to_mgr(
                data,
                index,
                columns,
                dtype=dtype,
                copy=copy,
                typ=manager,
            )

        elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)):
            if data.dtype.names:
                # i.e. numpy structured array
                data = cast(np.ndarray, data)
                mgr = rec_array_to_mgr(
                    data,
                    index,
                    columns,
                    dtype,
                    copy,
                    typ=manager,
                )
            elif getattr(data, "name", None) is not None:
                # i.e. Series/Index with non-None name
                _copy = copy if using_copy_on_write() else True
                mgr = dict_to_mgr(
                    # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no
                    # attribute "name"
                    {data.name: data},  # type: ignore[union-attr]
                    index,
                    columns,
                    dtype=dtype,
                    typ=manager,
                    copy=_copy,
                )
            else:
                mgr = ndarray_to_mgr(
                    data,
                    index,
                    columns,
                    dtype=dtype,
                    copy=copy,
                    typ=manager,
                )

        # For data is list-like, or Iterable (will consume into list)
        elif is_list_like(data):
            if not isinstance(data, abc.Sequence):
                if hasattr(data, "__array__"):
                    # GH#44616 big perf improvement for e.g. pytorch tensor
                    data = np.asarray(data)
                else:
                    data = list(data)
            if len(data) > 0:
                if is_dataclass(data[0]):
                    data = dataclasses_to_dicts(data)
                if not isinstance(data, np.ndarray) and treat_as_nested(data):
                    # exclude ndarray as we may have cast it a few lines above
                    if columns is not None:
                        columns = ensure_index(columns)
                    arrays, columns, index = nested_data_to_arrays(
                        # error: Argument 3 to "nested_data_to_arrays" has incompatible
                        # type "Optional[Collection[Any]]"; expected "Optional[Index]"
                        data,
                        columns,
                        index,  # type: ignore[arg-type]
                        dtype,
                    )
                    mgr = arrays_to_mgr(
                        arrays,
                        columns,
                        index,
                        dtype=dtype,
                        typ=manager,
                    )
                else:
                    mgr = ndarray_to_mgr(
                        data,
                        index,
                        columns,
                        dtype=dtype,
                        copy=copy,
                        typ=manager,
                    )
            else:
                mgr = dict_to_mgr(
                    {},
                    index,
                    columns if columns is not None else default_index(0),
                    dtype=dtype,
                    typ=manager,
                )
        # For data is scalar
        else:
            if index is None or columns is None:
                raise ValueError("DataFrame constructor not properly called!")

            index = ensure_index(index)
            columns = ensure_index(columns)

            if not dtype:
                dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True)

            # For data is a scalar extension dtype
            if isinstance(dtype, ExtensionDtype):
                # TODO(EA2D): special case not needed with 2D EAs

                values = [
                    construct_1d_arraylike_from_scalar(data, len(index), dtype)
                    for _ in range(len(columns))
                ]
                mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager)
            else:
                arr2d = construct_2d_arraylike_from_scalar(
                    data,
                    len(index),
                    len(columns),
                    dtype,
                    copy,
                )

                mgr = ndarray_to_mgr(
                    arr2d,
                    index,
                    columns,
                    dtype=arr2d.dtype,
                    copy=False,
                    typ=manager,
                )

        # ensure correct Manager type according to settings
        mgr = mgr_to_mgr(mgr, typ=manager)

        NDFrame.__init__(self, mgr)

    # ----------------------------------------------------------------------
    def __dataframe__(
        self, nan_as_null: bool = False, allow_copy: bool = True
    ) -> DataFrameXchg:
        """
        Return the dataframe interchange object implementing the interchange protocol.

        Parameters
        ----------
        nan_as_null : bool, default False
            Whether to tell the DataFrame to overwrite null values in the data
            with ``NaN`` (or ``NaT``).
        allow_copy : bool, default True
            Whether to allow memory copying when exporting. If set to False
            it would cause non-zero-copy exports to fail.

        Returns
        -------
        DataFrame interchange object
            An object that a consuming library can use to ingest the dataframe.

        Notes
        -----
        Details on the interchange protocol:
        https://data-apis.org/dataframe-protocol/latest/index.html

        `nan_as_null` currently has no effect; once support for nullable extension
        dtypes is added, this value should be propagated to columns.
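
        Examples
        --------
        A minimal sketch of round-tripping through the protocol;
        ``pd.api.interchange.from_dataframe`` is the consumer-side entry
        point:

        >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
        >>> interchange_object = df.__dataframe__()
        >>> interchange_object.column_names()
        Index(['A', 'B'], dtype='object')
        >>> pd.api.interchange.from_dataframe(interchange_object)
           A  B
        0  1  3
        1  2  4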
        """

        from pandas.core.interchange.dataframe import PandasDataFrameXchg

        return PandasDataFrameXchg(self, nan_as_null, allow_copy)

    # ----------------------------------------------------------------------

    @property
    def axes(self) -> list[Index]:
        """
        Return a list representing the axes of the DataFrame.

        It has the row axis labels and column axis labels as the only members.
        They are returned in that order.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.axes
        [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
        dtype='object')]
        """
        return [self.index, self.columns]

    @property
    def shape(self) -> tuple[int, int]:
        """
        Return a tuple representing the dimensionality of the DataFrame.

        See Also
        --------
        ndarray.shape : Tuple of array dimensions.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.shape
        (2, 2)

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
        ...                    'col3': [5, 6]})
        >>> df.shape
        (2, 3)
        """
        return len(self.index), len(self.columns)

    @property
    def _is_homogeneous_type(self) -> bool:
        """
        Whether all the columns in a DataFrame have the same type.

        Returns
        -------
        bool

        See Also
        --------
        Index._is_homogeneous_type : Whether the object has a single
            dtype.
        MultiIndex._is_homogeneous_type : Whether all the levels of a
            MultiIndex have the same dtype.

        Examples
        --------
        >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
        True
        >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
        False

        Items with the same type but different sizes are considered
        different types.

        >>> DataFrame({
        ...     "A": np.array([1, 2], dtype=np.int32),
        ...     "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
        False
        """
        if isinstance(self._mgr, ArrayManager):
            return len({arr.dtype for arr in self._mgr.arrays}) == 1
        if self._mgr.any_extension_types:
            return len({block.dtype for block in self._mgr.blocks}) == 1
        else:
            return not self._is_mixed_type

    @property
    def _can_fast_transpose(self) -> bool:
        """
        Can we transpose this DataFrame without creating any new array objects.
        """
        if isinstance(self._mgr, ArrayManager):
            return False
        blocks = self._mgr.blocks
        if len(blocks) != 1:
            return False

        dtype = blocks[0].dtype
        # TODO(EA2D) special case would be unnecessary with 2D EAs
        return not is_1d_only_ea_dtype(dtype)

    @property
    def _values(self) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray:
        """
        Analogue to ._values that may return a 2D ExtensionArray.
        """
        mgr = self._mgr

        if isinstance(mgr, ArrayManager):
            if len(mgr.arrays) == 1 and not is_1d_only_ea_dtype(mgr.arrays[0].dtype):
                # error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]"
                # has no attribute "reshape"
                return mgr.arrays[0].reshape(-1, 1)  # type: ignore[union-attr]
            return ensure_wrapped_if_datetimelike(self.values)

        blocks = mgr.blocks
        if len(blocks) != 1:
            return ensure_wrapped_if_datetimelike(self.values)

        arr = blocks[0].values
        if arr.ndim == 1:
            # non-2D ExtensionArray
            return self.values

        # more generally, whatever we allow in NDArrayBackedExtensionBlock
        arr = cast("np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", arr)
        return arr.T

    # ----------------------------------------------------------------------
    # Rendering Methods

    def _repr_fits_vertical_(self) -> bool:
        """
        Check length against max_rows.
        """
        max_rows = get_option("display.max_rows")
        return len(self) <= max_rows

    def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool:
        """
        Check if full repr fits in horizontal boundaries imposed by the display
        options width and max_columns.

        In case of non-interactive session, no boundaries apply.

        `ignore_width` is here so ipynb+HTML output can behave the way
        users expect. display.max_columns remains in effect.
        GH3541, GH3573
        """
        width, height = console.get_console_size()
        max_columns = get_option("display.max_columns")
        nb_columns = len(self.columns)

        # exceed max columns
        if (max_columns and nb_columns > max_columns) or (
            (not ignore_width) and width and nb_columns > (width // 2)
        ):
            return False

        # used by repr_html under IPython notebook or scripts ignore terminal
        # dims
        if ignore_width or width is None or not console.in_interactive_session():
            return True

        if get_option("display.width") is not None or console.in_ipython_frontend():
            # check at least the column row for excessive width
            max_rows = 1
        else:
            max_rows = get_option("display.max_rows")

        # when auto-detecting, so width=None and not in ipython front end
        # check whether repr fits horizontal by actually checking
        # the width of the rendered repr
        buf = StringIO()

        # only care about the stuff we'll actually print out
        # and to_string on entire frame may be expensive
        d = self

        if max_rows is not None:
            # min of two, where one may be None
            d = d.iloc[: min(max_rows, len(d))]
        else:
            # max_rows of None means unlimited rows; rendering the entire
            # frame just to measure its width could be expensive, so
            # assume the repr fits
            return True

        d.to_string(buf=buf)
        value = buf.getvalue()
        repr_width = max(len(line) for line in value.split("\n"))

        return repr_width < width

    def _info_repr(self) -> bool:
        """
        True if the repr should show the info view.
        """
        info_repr_option = get_option("display.large_repr") == "info"
        return info_repr_option and not (
            self._repr_fits_horizontal_() and self._repr_fits_vertical_()
        )

    def __repr__(self) -> str:
        """
        Return a string representation for a particular DataFrame.
        """
        if self._info_repr():
            buf = StringIO()
            self.info(buf=buf)
            return buf.getvalue()

        repr_params = fmt.get_dataframe_repr_params()
        return self.to_string(**repr_params)

    def _repr_html_(self) -> str | None:
        """
        Return a html representation for a particular DataFrame.

        Mainly for IPython notebook.
        """
        if self._info_repr():
            buf = StringIO()
            self.info(buf=buf)
            # need to escape the <class>, should be the first line.
            val = buf.getvalue().replace("<", r"&lt;", 1)
            val = val.replace(">", r"&gt;", 1)
            return f"<pre>{val}</pre>"

        if get_option("display.notebook_repr_html"):
            max_rows = get_option("display.max_rows")
            min_rows = get_option("display.min_rows")
            max_cols = get_option("display.max_columns")
            show_dimensions = get_option("display.show_dimensions")

            formatter = fmt.DataFrameFormatter(
                self,
                columns=None,
                col_space=None,
                na_rep="NaN",
                formatters=None,
                float_format=None,
                sparsify=None,
                justify=None,
                index_names=True,
                header=True,
                index=True,
                bold_rows=True,
                escape=True,
                max_rows=max_rows,
                min_rows=min_rows,
                max_cols=max_cols,
                show_dimensions=show_dimensions,
                decimal=".",
            )
            return fmt.DataFrameRenderer(formatter).to_html(notebook=True)
        else:
            return None

    @overload
    def to_string(
        self,
        buf: None = ...,
        columns: Sequence[str] | None = ...,
        col_space: int | list[int] | dict[Hashable, int] | None = ...,
        header: bool | Sequence[str] = ...,
        index: bool = ...,
        na_rep: str = ...,
        formatters: fmt.FormattersType | None = ...,
        float_format: fmt.FloatFormatType | None = ...,
        sparsify: bool | None = ...,
        index_names: bool = ...,
        justify: str | None = ...,
        max_rows: int | None = ...,
        max_cols: int | None = ...,
        show_dimensions: bool = ...,
        decimal: str = ...,
        line_width: int | None = ...,
        min_rows: int | None = ...,
        max_colwidth: int | None = ...,
        encoding: str | None = ...,
    ) -> str:
        ...

    @overload
    def to_string(
        self,
        buf: FilePath | WriteBuffer[str],
        columns: Sequence[str] | None = ...,
        col_space: int | list[int] | dict[Hashable, int] | None = ...,
        header: bool | Sequence[str] = ...,
        index: bool = ...,
        na_rep: str = ...,
        formatters: fmt.FormattersType | None = ...,
        float_format: fmt.FloatFormatType | None = ...,
        sparsify: bool | None = ...,
        index_names: bool = ...,
        justify: str | None = ...,
        max_rows: int | None = ...,
        max_cols: int | None = ...,
        show_dimensions: bool = ...,
        decimal: str = ...,
        line_width: int | None = ...,
        min_rows: int | None = ...,
        max_colwidth: int | None = ...,
        encoding: str | None = ...,
    ) -> None:
        ...

    @Substitution(
        header_type="bool or sequence of str",
        header="Write out the column names. If a list of strings "
        "is given, it is assumed to be aliases for the "
        "column names",
        col_space_type="int, list or dict of int",
        col_space="The minimum width of each column. If a list of ints is given "
        "each integer corresponds to one column. If a dict is given, the key "
        "references the column, while the value defines the space to use.",
    )
    @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
    def to_string(
        self,
        buf: FilePath | WriteBuffer[str] | None = None,
        columns: Sequence[str] | None = None,
        col_space: int | list[int] | dict[Hashable, int] | None = None,
        header: bool | Sequence[str] = True,
        index: bool = True,
        na_rep: str = "NaN",
        formatters: fmt.FormattersType | None = None,
        float_format: fmt.FloatFormatType | None = None,
        sparsify: bool | None = None,
        index_names: bool = True,
        justify: str | None = None,
        max_rows: int | None = None,
        max_cols: int | None = None,
        show_dimensions: bool = False,
        decimal: str = ".",
        line_width: int | None = None,
        min_rows: int | None = None,
        max_colwidth: int | None = None,
        encoding: str | None = None,
    ) -> str | None:
        """
        Render a DataFrame to a console-friendly tabular output.
        %(shared_params)s
        line_width : int, optional
            Width to wrap a line in characters.
        min_rows : int, optional
            The number of rows to display in the console in a truncated repr
            (when number of rows is above `max_rows`).
        max_colwidth : int, optional
            Max width to truncate each column in characters. By default, no limit.
        encoding : str, default "utf-8"
            Set character encoding.
        %(returns)s
        See Also
        --------
        to_html : Convert DataFrame to HTML.

        Examples
        --------
        >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
        >>> df = pd.DataFrame(d)
        >>> print(df.to_string())
           col1  col2
        0     1     4
        1     2     5
        2     3     6
        """
        from pandas import option_context

        with option_context("display.max_colwidth", max_colwidth):
            formatter = fmt.DataFrameFormatter(
                self,
                columns=columns,
                col_space=col_space,
                na_rep=na_rep,
                formatters=formatters,
                float_format=float_format,
                sparsify=sparsify,
                justify=justify,
                index_names=index_names,
                header=header,
                index=index,
                min_rows=min_rows,
                max_rows=max_rows,
                max_cols=max_cols,
                show_dimensions=show_dimensions,
                decimal=decimal,
            )
            return fmt.DataFrameRenderer(formatter).to_string(
                buf=buf,
                encoding=encoding,
                line_width=line_width,
            )

    # ----------------------------------------------------------------------

    @property
    def style(self) -> Styler:
        """
        Returns a Styler object.

        Contains methods for building a styled HTML representation of the DataFrame.

        See Also
        --------
        io.formats.style.Styler : Helps style a DataFrame or Series according to the
            data with HTML and CSS.
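
        Examples
        --------
        A minimal sketch; the Styler renders as HTML, so its console repr is
        skipped here:

        >>> df = pd.DataFrame({'A': [1, 2, 3]})
        >>> df.style  # doctest: +SKIP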
        """
        from pandas.io.formats.style import Styler

        return Styler(self)

    _shared_docs[
        "items"
    ] = r"""
    Iterate over (column name, Series) pairs.

    Iterates over the DataFrame columns, returning a tuple with
    the column name and the content as a Series.

    Yields
    ------
    label : object
        The column names for the DataFrame being iterated over.
    content : Series
        The column entries belonging to each label, as a Series.

    See Also
    --------
    DataFrame.iterrows : Iterate over DataFrame rows as
        (index, Series) pairs.
    DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
        of the values.

    Examples
    --------
    >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
    ...                    'population': [1864, 22000, 80000]},
    ...                   index=['panda', 'polar', 'koala'])
    >>> df
            species   population
    panda      bear         1864
    polar      bear        22000
    koala  marsupial        80000
    >>> for label, content in df.items():
    ...     print(f'label: {label}')
    ...     print(f'content: {content}', sep='\n')
    ...
    label: species
    content:
    panda         bear
    polar         bear
    koala    marsupial
    Name: species, dtype: object
    label: population
    content:
    panda     1864
    polar    22000
    koala    80000
    Name: population, dtype: int64
    """

    @Appender(_shared_docs["items"])
    def items(self) -> Iterable[tuple[Hashable, Series]]:
        if self.columns.is_unique and hasattr(self, "_item_cache"):
            for k in self.columns:
                yield k, self._get_item_cache(k)
        else:
            for i, k in enumerate(self.columns):
                yield k, self._ixs(i, axis=1)

    def iterrows(self) -> Iterable[tuple[Hashable, Series]]:
        """
        Iterate over DataFrame rows as (index, Series) pairs.

        Yields
        ------
        index : label or tuple of label
            The index of the row. A tuple for a `MultiIndex`.
        data : Series
            The data of the row as a Series.

        See Also
        --------
        DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values.
        DataFrame.items : Iterate over (column name, Series) pairs.

        Notes
        -----
        1. Because ``iterrows`` returns a Series for each row,
           it does **not** preserve dtypes across the rows (dtypes are
           preserved across columns for DataFrames). For example,

           >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
           >>> row = next(df.iterrows())[1]
           >>> row
           int      1.0
           float    1.5
           Name: 0, dtype: float64
           >>> print(row['int'].dtype)
           float64
           >>> print(df['int'].dtype)
           int64

           To preserve dtypes while iterating over the rows, it is better
           to use :meth:`itertuples` which returns namedtuples of the values
           and which is generally faster than ``iterrows``.

        2. You should **never modify** something you are iterating over.
           This is not guaranteed to work in all cases. Depending on the
           data types, the iterator returns a copy and not a view, and writing
           to it will have no effect.
        """
        columns = self.columns
        klass = self._constructor_sliced
        using_cow = using_copy_on_write()
        for k, v in zip(self.index, self.values):
            s = klass(v, index=columns, name=k).__finalize__(self)
            if using_cow and self._mgr.is_single_block:
                s._mgr.add_references(self._mgr)  # type: ignore[arg-type]
            yield k, s

    def itertuples(
        self, index: bool = True, name: str | None = "Pandas"
    ) -> Iterable[tuple[Any, ...]]:
        """
        Iterate over DataFrame rows as namedtuples.

        Parameters
        ----------
        index : bool, default True
            If True, return the index as the first element of the tuple.
        name : str or None, default "Pandas"
            The name of the returned namedtuples or None to return regular
            tuples.

        Returns
        -------
        iterator
            An object to iterate over namedtuples for each row in the
            DataFrame with the first field possibly being the index and
            following fields being the column values.

        See Also
        --------
        DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
            pairs.
        DataFrame.items : Iterate over (column name, Series) pairs.

        Notes
        -----
        The column names will be renamed to positional names if they are
        invalid Python identifiers, repeated, or start with an underscore
        (see the last example below).

        Examples
        --------
        >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
        ...                   index=['dog', 'hawk'])
        >>> df
              num_legs  num_wings
        dog          4          0
        hawk         2          2
        >>> for row in df.itertuples():
        ...     print(row)
        ...
        Pandas(Index='dog', num_legs=4, num_wings=0)
        Pandas(Index='hawk', num_legs=2, num_wings=2)

        By setting the `index` parameter to False we can remove the index
        as the first element of the tuple:

        >>> for row in df.itertuples(index=False):
        ...     print(row)
        ...
        Pandas(num_legs=4, num_wings=0)
        Pandas(num_legs=2, num_wings=2)

        With the `name` parameter set we set a custom name for the yielded
        namedtuples:

        >>> for row in df.itertuples(name='Animal'):
        ...     print(row)
        ...
        Animal(Index='dog', num_legs=4, num_wings=0)
        Animal(Index='hawk', num_legs=2, num_wings=2)
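
        Column names that are not valid Python identifiers are renamed
        positionally; a minimal sketch (the renaming comes from
        ``collections.namedtuple(..., rename=True)``):

        >>> df2 = pd.DataFrame({'1st': [1], 'class': [2]})
        >>> next(df2.itertuples(index=False))
        Pandas(_0=1, _1=2)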
        """
        arrays = []
        fields = list(self.columns)
        if index:
            arrays.append(self.index)
            fields.insert(0, "Index")

        # use integer indexing because of possible duplicate column names
        arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))

        if name is not None:
            # https://github.com/python/mypy/issues/9046
            # error: namedtuple() expects a string literal as the first argument
            itertuple = collections.namedtuple(  # type: ignore[misc]
                name, fields, rename=True
            )
            return map(itertuple._make, zip(*arrays))

        # fallback to regular tuples
        return zip(*arrays)

    def __len__(self) -> int:
        """
        Returns length of info axis, but here we use the index.
        """
        return len(self.index)

    @overload
    def dot(self, other: Series) -> Series:
        ...

    @overload
    def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame:
        ...

    def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
        """
        Compute the matrix multiplication between the DataFrame and other.

        This method computes the matrix product between the DataFrame and the
        values of another Series, DataFrame or a numpy array.

        It can also be called using ``self @ other``.

        Parameters
        ----------
        other : Series, DataFrame or array-like
            The other object to compute the matrix product with.

        Returns
        -------
        Series or DataFrame
            If other is a Series, return the matrix product between self and
            other as a Series. If other is a DataFrame or a numpy.array, return
            the matrix product of self and other as a DataFrame.

        See Also
        --------
        Series.dot: Similar method for Series.

        Notes
        -----
        The dimensions of DataFrame and other must be compatible in order to
        compute the matrix multiplication. In addition, the column names of
        DataFrame and the index of other must contain the same values, as they
        will be aligned prior to the multiplication.

        The dot method for Series computes the inner product, instead of the
        matrix product here.

        Examples
        --------
        Here we multiply a DataFrame with a Series.

        >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
        >>> s = pd.Series([1, 1, 2, 1])
        >>> df.dot(s)
        0    -4
        1     5
        dtype: int64

        Here we multiply a DataFrame with another DataFrame.

        >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
        >>> df.dot(other)
           0  1
        0  1  4
        1  2  2

        Note that the dot method gives the same result as using the ``@``
        operator.

        >>> df @ other
           0  1
        0  1  4
        1  2  2

        The dot method also works if other is a np.array.

        >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
        >>> df.dot(arr)
           0  1
        0  1  4
        1  2  2

        Note how shuffling of the objects does not change the result.

        >>> s2 = s.reindex([1, 0, 2, 3])
        >>> df.dot(s2)
        0    -4
        1     5
        dtype: int64
        """
        if isinstance(other, (Series, DataFrame)):
            common = self.columns.union(other.index)
            if len(common) > len(self.columns) or len(common) > len(other.index):
                raise ValueError("matrices are not aligned")

            left = self.reindex(columns=common, copy=False)
            right = other.reindex(index=common, copy=False)
            lvals = left.values
            rvals = right._values
        else:
            left = self
            lvals = self.values
            rvals = np.asarray(other)
            if lvals.shape[1] != rvals.shape[0]:
                raise ValueError(
                    f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}"
                )

        if isinstance(other, DataFrame):
            return self._constructor(
                np.dot(lvals, rvals),
                index=left.index,
                columns=other.columns,
                copy=False,
            )
        elif isinstance(other, Series):
            return self._constructor_sliced(
                np.dot(lvals, rvals), index=left.index, copy=False
            )
        elif isinstance(rvals, (np.ndarray, Index)):
            result = np.dot(lvals, rvals)
            if result.ndim == 2:
                return self._constructor(result, index=left.index, copy=False)
            else:
                return self._constructor_sliced(result, index=left.index, copy=False)
        else:  # pragma: no cover
            raise TypeError(f"unsupported type: {type(other)}")

    @overload
    def __matmul__(self, other: Series) -> Series:
        ...

    @overload
    def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
        ...

    def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
        """
        Matrix multiplication using the binary ``@`` operator.
        """
        return self.dot(other)

    def __rmatmul__(self, other) -> DataFrame:
        """
        Matrix multiplication using the binary ``@`` operator.
        """
        try:
            return self.T.dot(np.transpose(other)).T
        except ValueError as err:
            if "shape mismatch" not in str(err):
                raise
            # GH#21581 give exception message for original shapes
            msg = f"shapes {np.shape(other)} and {self.shape} not aligned"
            raise ValueError(msg) from err

    # ----------------------------------------------------------------------
    # IO methods (to / from other formats)

    @classmethod
    def from_dict(
        cls,
        data: dict,
        orient: str = "columns",
        dtype: Dtype | None = None,
        columns: Axes | None = None,
    ) -> DataFrame:
        """
        Construct DataFrame from dict of array-like or dicts.

        Creates DataFrame object from dictionary by columns or by index
        allowing dtype specification.

        Parameters
        ----------
        data : dict
            Of the form {field : array-like} or {field : dict}.
        orient : {'columns', 'index', 'tight'}, default 'columns'
            The "orientation" of the data. If the keys of the passed dict
            should be the columns of the resulting DataFrame, pass 'columns'
            (default). Otherwise if the keys should be rows, pass 'index'.
            If 'tight', assume a dict with keys ['index', 'columns', 'data',
            'index_names', 'column_names'].

            .. versionadded:: 1.4.0
               'tight' as an allowed value for the ``orient`` argument

        dtype : dtype, default None
            Data type to force after DataFrame construction, otherwise infer.
        columns : list, default None
            Column labels to use when ``orient='index'``. Raises a ValueError
            if used with ``orient='columns'`` or ``orient='tight'``.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.from_records : DataFrame from structured ndarray, sequence
            of tuples or dicts, or DataFrame.
        DataFrame : DataFrame object creation using constructor.
        DataFrame.to_dict : Convert the DataFrame to a dictionary.

        Examples
        --------
        By default the keys of the dict become the DataFrame columns:

        >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
        >>> pd.DataFrame.from_dict(data)
           col_1 col_2
        0      3     a
        1      2     b
        2      1     c
        3      0     d

        Specify ``orient='index'`` to create the DataFrame using dictionary
        keys as rows:

        >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
        >>> pd.DataFrame.from_dict(data, orient='index')
               0  1  2  3
        row_1  3  2  1  0
        row_2  a  b  c  d

        When using the 'index' orientation, the column names can be
        specified manually:

        >>> pd.DataFrame.from_dict(data, orient='index',
        ...                        columns=['A', 'B', 'C', 'D'])
               A  B  C  D
        row_1  3  2  1  0
        row_2  a  b  c  d

        Specify ``orient='tight'`` to create the DataFrame using a 'tight'
        format:

        >>> data = {'index': [('a', 'b'), ('a', 'c')],
        ...         'columns': [('x', 1), ('y', 2)],
        ...         'data': [[1, 3], [2, 4]],
        ...         'index_names': ['n1', 'n2'],
        ...         'column_names': ['z1', 'z2']}
        >>> pd.DataFrame.from_dict(data, orient='tight')
        z1     x  y
        z2     1  2
        n1 n2
        a  b   1  3
           c   2  4
        """
        index = None
        orient = orient.lower()
        if orient == "index":
            if len(data) > 0:
                # TODO speed up Series case
                if isinstance(list(data.values())[0], (Series, dict)):
                    data = _from_nested_dict(data)
                else:
                    index = list(data.keys())
                    # error: Incompatible types in assignment (expression has type
                    # "List[Any]", variable has type "Dict[Any, Any]")
                    data = list(data.values())  # type: ignore[assignment]
        elif orient in ("columns", "tight"):
            if columns is not None:
                raise ValueError(f"cannot use columns parameter with orient='{orient}'")
        else:  # pragma: no cover
            raise ValueError(
                f"Expected 'index', 'columns' or 'tight' for orient parameter. "
                f"Got '{orient}' instead"
            )

        if orient != "tight":
            return cls(data, index=index, columns=columns, dtype=dtype)
        else:
            realdata = data["data"]

            def create_index(indexlist, namelist):
                index: Index
                if len(namelist) > 1:
                    index = MultiIndex.from_tuples(indexlist, names=namelist)
                else:
                    index = Index(indexlist, name=namelist[0])
                return index

            index = create_index(data["index"], data["index_names"])
            columns = create_index(data["columns"], data["column_names"])
            return cls(realdata, index=index, columns=columns, dtype=dtype)

    def to_numpy(
        self,
        dtype: npt.DTypeLike | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        """
        Convert the DataFrame to a NumPy array.

        By default, the dtype of the returned array will be the common NumPy
        dtype of all types in the DataFrame. For example, if the dtypes are
        ``float16`` and ``float32``, the resulting dtype will be ``float32``.
        This may require copying data and coercing values, which may be
        expensive.

        Parameters
        ----------
        dtype : str or numpy.dtype, optional
            The dtype to pass to :meth:`numpy.asarray`.
        copy : bool, default False
            Whether to ensure that the returned value is not a view on
            another array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
            a copy is made, even if not strictly necessary.
        na_value : Any, optional
            The value to use for missing values. The default value depends
            on `dtype` and the dtypes of the DataFrame columns.

            .. versionadded:: 1.1.0

        Returns
        -------
        numpy.ndarray

        See Also
        --------
        Series.to_numpy : Similar method for Series.

        Examples
        --------
        >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
        array([[1, 3],
               [2, 4]])

        With heterogeneous data, the lowest common type will have to
        be used.

        >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
        >>> df.to_numpy()
        array([[1. , 3. ],
               [2. , 4.5]])

        For a mix of numeric and non-numeric types, the output array will
        have object dtype.

        >>> df['C'] = pd.date_range('2000', periods=2)
        >>> df.to_numpy()
        array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
               [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
        """
        if dtype is not None:
            dtype = np.dtype(dtype)
        result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value)
        if result.dtype is not dtype:
            result = np.array(result, dtype=dtype, copy=False)

        return result

    def _create_data_for_split_and_tight_to_dict(
        self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int]
    ) -> list:
        """
        Simple helper method to create data for ``to_dict(orient="split")`` and
        ``to_dict(orient="tight")`` to create the main output data.
        """
        if are_all_object_dtype_cols:
            data = [
                list(map(maybe_box_native, t))
                for t in self.itertuples(index=False, name=None)
            ]
        else:
            data = [list(t) for t in self.itertuples(index=False, name=None)]
            if object_dtype_indices:
                # If we have object_dtype_cols, apply maybe_box_native after list
                # comprehension for perf
                for row in data:
                    for i in object_dtype_indices:
                        row[i] = maybe_box_native(row[i])
        return data

1866 @overload
1867 def to_dict(
1868 self,
1869 orient: Literal["dict", "list", "series", "split", "tight", "index"] = ...,
1870 into: type[dict] = ...,
1871 ) -> dict:
1872 ...
1873
1874 @overload
1875 def to_dict(self, orient: Literal["records"], into: type[dict] = ...) -> list[dict]:
1876 ...
1877
1878 def to_dict(
1879 self,
1880 orient: Literal[
1881 "dict", "list", "series", "split", "tight", "records", "index"
1882 ] = "dict",
1883 into: type[dict] = dict,
1884 index: bool = True,
1885 ) -> dict | list[dict]:
1886 """
1887 Convert the DataFrame to a dictionary.
1888
1889 The type of the key-value pairs can be customized with the parameters
1890 (see below).
1891
1892 Parameters
1893 ----------
1894 orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'}
1895 Determines the type of the values of the dictionary.
1896
1897 - 'dict' (default) : dict like {column -> {index -> value}}
1898 - 'list' : dict like {column -> [values]}
1899 - 'series' : dict like {column -> Series(values)}
1900 - 'split' : dict like
1901 {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
1902 - 'tight' : dict like
1903 {'index' -> [index], 'columns' -> [columns], 'data' -> [values],
1904 'index_names' -> [index.names], 'column_names' -> [column.names]}
1905 - 'records' : list like
1906 [{column -> value}, ... , {column -> value}]
1907 - 'index' : dict like {index -> {column -> value}}
1908
1909 .. versionadded:: 1.4.0
1910 'tight' as an allowed value for the ``orient`` argument
1911
1912 into : class, default dict
1913 The collections.abc.Mapping subclass used for all Mappings
1914 in the return value. Can be the actual class or an empty
1915 instance of the mapping type you want. If you want a
1916 collections.defaultdict, you must pass it initialized.
1917
1918 index : bool, default True
1919 Whether to include the index item (and index_names item if `orient`
1920 is 'tight') in the returned dictionary. Can only be ``False``
1921 when `orient` is 'split' or 'tight'.
1922
1923 .. versionadded:: 2.0.0
1924
1925 Returns
1926 -------
1927 dict, list or collections.abc.Mapping
1928 Return a collections.abc.Mapping object representing the DataFrame.
1929 The resulting transformation depends on the `orient` parameter.
1930
1931 See Also
1932 --------
1933 DataFrame.from_dict: Create a DataFrame from a dictionary.
1934 DataFrame.to_json: Convert a DataFrame to JSON format.
1935
1936 Examples
1937 --------
1938 >>> df = pd.DataFrame({'col1': [1, 2],
1939 ... 'col2': [0.5, 0.75]},
1940 ... index=['row1', 'row2'])
1941 >>> df
1942 col1 col2
1943 row1 1 0.50
1944 row2 2 0.75
1945 >>> df.to_dict()
1946 {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
1947
1948 You can specify the return orientation.
1949
1950 >>> df.to_dict('series')
1951 {'col1': row1 1
1952 row2 2
1953 Name: col1, dtype: int64,
1954 'col2': row1 0.50
1955 row2 0.75
1956 Name: col2, dtype: float64}
1957
1958 >>> df.to_dict('split')
1959 {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
1960 'data': [[1, 0.5], [2, 0.75]]}
1961
1962 >>> df.to_dict('records')
1963 [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
1964
1965 >>> df.to_dict('index')
1966 {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
1967
1968 >>> df.to_dict('tight')
1969 {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
1970 'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]}
1971
1972 You can also specify the mapping type.
1973
1974 >>> from collections import OrderedDict, defaultdict
1975 >>> df.to_dict(into=OrderedDict)
1976 OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
1977 ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
1978
1979 If you want a `defaultdict`, you need to initialize it:
1980
1981 >>> dd = defaultdict(list)
1982 >>> df.to_dict('records', into=dd)
1983 [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
1984 defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
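
        When ``orient`` is 'split' or 'tight', the index can be dropped from
        the result:

        >>> df.to_dict('split', index=False)
        {'columns': ['col1', 'col2'], 'data': [[1, 0.5], [2, 0.75]]}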
1985 """
1986 from pandas.core.methods.to_dict import to_dict
1987
1988 return to_dict(self, orient, into, index)
1989
1990 def to_gbq(
1991 self,
1992 destination_table: str,
1993 project_id: str | None = None,
1994 chunksize: int | None = None,
1995 reauth: bool = False,
1996 if_exists: str = "fail",
1997 auth_local_webserver: bool = True,
1998 table_schema: list[dict[str, str]] | None = None,
1999 location: str | None = None,
2000 progress_bar: bool = True,
2001 credentials=None,
2002 ) -> None:
2003 """
2004 Write a DataFrame to a Google BigQuery table.
2005
2006 This function requires the `pandas-gbq package
2007 <https://pandas-gbq.readthedocs.io>`__.
2008
2009 See the `How to authenticate with Google BigQuery
2010 <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
2011 guide for authentication instructions.
2012
2013 Parameters
2014 ----------
2015 destination_table : str
2016 Name of table to be written, in the form ``dataset.tablename``.
2017 project_id : str, optional
2018 Google BigQuery Account project ID. Optional when available from
2019 the environment.
2020 chunksize : int, optional
2021 Number of rows to be inserted in each chunk from the dataframe.
2022 Set to ``None`` to load the whole dataframe at once.
2023 reauth : bool, default False
2024 Force Google BigQuery to re-authenticate the user. This is useful
2025 if multiple accounts are used.
2026 if_exists : str, default 'fail'
2027 Behavior when the destination table exists. Value can be one of:
2028
            ``'fail'``
                If table exists, raise pandas_gbq.gbq.TableCreationError.
            ``'replace'``
                If table exists, drop it, recreate it, and insert data.
            ``'append'``
                If table exists, insert data. Create it if it does not exist.
2035 auth_local_webserver : bool, default True
2036 Use the `local webserver flow`_ instead of the `console flow`_
2037 when getting user credentials.
2038
2039 .. _local webserver flow:
2040 https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
2041 .. _console flow:
2042 https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
2043
2044 *New in version 0.2.0 of pandas-gbq*.
2045
2046 .. versionchanged:: 1.5.0
2047 Default value is changed to ``True``. Google has deprecated the
2048 ``auth_local_webserver = False`` `"out of band" (copy-paste)
2049 flow
2050 <https://developers.googleblog.com/2022/02/making-oauth-flows-safer.html?m=1#disallowed-oob>`_.
2051 table_schema : list of dicts, optional
            List of BigQuery table fields to which the DataFrame columns
            conform, e.g. ``[{'name': 'col1', 'type':
2054 'STRING'},...]``. If schema is not provided, it will be
2055 generated according to dtypes of DataFrame columns. See
2056 BigQuery API documentation on available names of a field.
2057
2058 *New in version 0.3.1 of pandas-gbq*.
2059 location : str, optional
2060 Location where the load job should run. See the `BigQuery locations
2061 documentation
2062 <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
2063 list of available locations. The location must match that of the
2064 target dataset.
2065
2066 *New in version 0.5.0 of pandas-gbq*.
2067 progress_bar : bool, default True
2068 Use the library `tqdm` to show the progress bar for the upload,
2069 chunk by chunk.
2070
2071 *New in version 0.5.0 of pandas-gbq*.
2072 credentials : google.auth.credentials.Credentials, optional
2073 Credentials for accessing Google APIs. Use this parameter to
2074 override default credentials, such as to use Compute Engine
2075 :class:`google.auth.compute_engine.Credentials` or Service
2076 Account :class:`google.oauth2.service_account.Credentials`
2077 directly.
2078
2079 *New in version 0.8.0 of pandas-gbq*.
2080
2081 See Also
2082 --------
2083 pandas_gbq.to_gbq : This function in the pandas-gbq library.
2084 read_gbq : Read a DataFrame from Google BigQuery.
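
        Examples
        --------
        Assuming the ``pandas-gbq`` package is installed and credentials are
        configured, a minimal upload might look like this (the table and
        project names are placeholders):

        >>> df = pd.DataFrame({'my_string': ['a', 'b', 'c']})
        >>> df.to_gbq('my_dataset.my_table',
        ...           project_id='my-project')  # doctest: +SKIP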
2085 """
2086 from pandas.io import gbq
2087
2088 gbq.to_gbq(
2089 self,
2090 destination_table,
2091 project_id=project_id,
2092 chunksize=chunksize,
2093 reauth=reauth,
2094 if_exists=if_exists,
2095 auth_local_webserver=auth_local_webserver,
2096 table_schema=table_schema,
2097 location=location,
2098 progress_bar=progress_bar,
2099 credentials=credentials,
2100 )
2101
2102 @classmethod
2103 def from_records(
2104 cls,
2105 data,
2106 index=None,
2107 exclude=None,
2108 columns=None,
2109 coerce_float: bool = False,
2110 nrows: int | None = None,
2111 ) -> DataFrame:
2112 """
2113 Convert structured or record ndarray to DataFrame.
2114
2115 Creates a DataFrame object from a structured ndarray, sequence of
2116 tuples or dicts, or DataFrame.
2117
2118 Parameters
2119 ----------
2120 data : structured ndarray, sequence of tuples or dicts, or DataFrame
2121 Structured input data.
2122 index : str, list of fields, array-like
2123 Field of array to use as the index, alternately a specific set of
2124 input labels to use.
2125 exclude : sequence, default None
2126 Columns or fields to exclude.
2127 columns : sequence, default None
2128 Column names to use. If the passed data do not have names
2129 associated with them, this argument provides names for the
2130 columns. Otherwise this argument indicates the order of the columns
2131 in the result (any names not found in the data will become all-NA
2132 columns).
2133 coerce_float : bool, default False
2134 Attempt to convert values of non-string, non-numeric objects (like
2135 decimal.Decimal) to floating point, useful for SQL result sets.
2136 nrows : int, default None
2137 Number of rows to read if data is an iterator.
2138
2139 Returns
2140 -------
2141 DataFrame
2142
2143 See Also
2144 --------
2145 DataFrame.from_dict : DataFrame from dict of array-like or dicts.
2146 DataFrame : DataFrame object creation using constructor.
2147
2148 Examples
2149 --------
2150 Data can be provided as a structured ndarray:
2151
2152 >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')],
2153 ... dtype=[('col_1', 'i4'), ('col_2', 'U1')])
2154 >>> pd.DataFrame.from_records(data)
2155 col_1 col_2
2156 0 3 a
2157 1 2 b
2158 2 1 c
2159 3 0 d
2160
2161 Data can be provided as a list of dicts:
2162
2163 >>> data = [{'col_1': 3, 'col_2': 'a'},
2164 ... {'col_1': 2, 'col_2': 'b'},
2165 ... {'col_1': 1, 'col_2': 'c'},
2166 ... {'col_1': 0, 'col_2': 'd'}]
2167 >>> pd.DataFrame.from_records(data)
2168 col_1 col_2
2169 0 3 a
2170 1 2 b
2171 2 1 c
2172 3 0 d
2173
2174 Data can be provided as a list of tuples with corresponding columns:
2175
2176 >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')]
2177 >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2'])
2178 col_1 col_2
2179 0 3 a
2180 1 2 b
2181 2 1 c
2182 3 0 d
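
        Data can also be streamed from an iterator; ``nrows`` bounds how many
        rows are read (a small generator is used here for illustration):

        >>> gen = ((i, chr(ord('a') + i)) for i in range(4))
        >>> pd.DataFrame.from_records(gen, columns=['col_1', 'col_2'], nrows=2)
           col_1 col_2
        0      0     a
        1      1     b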
2183 """
2184 if isinstance(data, DataFrame):
2185 if columns is not None:
2186 if is_scalar(columns):
2187 columns = [columns]
2188 data = data[columns]
2189 if index is not None:
2190 data = data.set_index(index)
2191 if exclude is not None:
2192 data = data.drop(columns=exclude)
2193 return data.copy(deep=False)
2194
2195 result_index = None
2196
2197 # Make a copy of the input columns so we can modify it
2198 if columns is not None:
2199 columns = ensure_index(columns)
2200
2201 def maybe_reorder(
2202 arrays: list[ArrayLike], arr_columns: Index, columns: Index, index
2203 ) -> tuple[list[ArrayLike], Index, Index | None]:
2204 """
2205 If our desired 'columns' do not match the data's pre-existing 'arr_columns',
2206 we re-order our arrays. This is like a pre-emptive (cheap) reindex.
2207 """
2208 if len(arrays):
2209 length = len(arrays[0])
2210 else:
2211 length = 0
2212
2213 result_index = None
2214 if len(arrays) == 0 and index is None and length == 0:
2215 result_index = default_index(0)
2216
2217 arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length)
2218 return arrays, arr_columns, result_index
2219
2220 if is_iterator(data):
2221 if nrows == 0:
2222 return cls()
2223
2224 try:
2225 first_row = next(data)
2226 except StopIteration:
2227 return cls(index=index, columns=columns)
2228
2229 dtype = None
2230 if hasattr(first_row, "dtype") and first_row.dtype.names:
2231 dtype = first_row.dtype
2232
2233 values = [first_row]
2234
2235 if nrows is None:
2236 values += data
2237 else:
2238 values.extend(itertools.islice(data, nrows - 1))
2239
2240 if dtype is not None:
2241 data = np.array(values, dtype=dtype)
2242 else:
2243 data = values
2244
2245 if isinstance(data, dict):
2246 if columns is None:
2247 columns = arr_columns = ensure_index(sorted(data))
2248 arrays = [data[k] for k in columns]
2249 else:
2250 arrays = []
2251 arr_columns_list = []
2252 for k, v in data.items():
2253 if k in columns:
2254 arr_columns_list.append(k)
2255 arrays.append(v)
2256
2257 arr_columns = Index(arr_columns_list)
2258 arrays, arr_columns, result_index = maybe_reorder(
2259 arrays, arr_columns, columns, index
2260 )
2261
2262 elif isinstance(data, (np.ndarray, DataFrame)):
2263 arrays, columns = to_arrays(data, columns)
2264 arr_columns = columns
2265 else:
2266 arrays, arr_columns = to_arrays(data, columns)
2267 if coerce_float:
2268 for i, arr in enumerate(arrays):
2269 if arr.dtype == object:
2270 # error: Argument 1 to "maybe_convert_objects" has
2271 # incompatible type "Union[ExtensionArray, ndarray]";
2272 # expected "ndarray"
2273 arrays[i] = lib.maybe_convert_objects(
2274 arr, # type: ignore[arg-type]
2275 try_float=True,
2276 )
2277
2278 arr_columns = ensure_index(arr_columns)
2279 if columns is None:
2280 columns = arr_columns
2281 else:
2282 arrays, arr_columns, result_index = maybe_reorder(
2283 arrays, arr_columns, columns, index
2284 )
2285
2286 if exclude is None:
2287 exclude = set()
2288 else:
2289 exclude = set(exclude)
2290
2291 if index is not None:
2292 if isinstance(index, str) or not hasattr(index, "__iter__"):
2293 i = columns.get_loc(index)
2294 exclude.add(index)
2295 if len(arrays) > 0:
2296 result_index = Index(arrays[i], name=index)
2297 else:
2298 result_index = Index([], name=index)
2299 else:
2300 try:
2301 index_data = [arrays[arr_columns.get_loc(field)] for field in index]
2302 except (KeyError, TypeError):
2303 # raised by get_loc, see GH#29258
2304 result_index = index
2305 else:
2306 result_index = ensure_index_from_sequences(index_data, names=index)
2307 exclude.update(index)
2308
2309 if any(exclude):
2310 arr_exclude = [x for x in exclude if x in arr_columns]
2311 to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
2312 arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
2313
2314 columns = columns.drop(exclude)
2315
2316 manager = get_option("mode.data_manager")
2317 mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager)
2318
2319 return cls(mgr)
2320
2321 def to_records(
2322 self, index: bool = True, column_dtypes=None, index_dtypes=None
2323 ) -> np.recarray:
2324 """
2325 Convert DataFrame to a NumPy record array.
2326
2327 Index will be included as the first field of the record array if
2328 requested.
2329
2330 Parameters
2331 ----------
2332 index : bool, default True
2333 Include index in resulting record array, stored in 'index'
2334 field or using the index label, if set.
2335 column_dtypes : str, type, dict, default None
2336 If a string or type, the data type to store all columns. If
2337 a dictionary, a mapping of column names and indices (zero-indexed)
2338 to specific data types.
2339 index_dtypes : str, type, dict, default None
2340 If a string or type, the data type to store all index levels. If
2341 a dictionary, a mapping of index level names and indices
2342 (zero-indexed) to specific data types.
2343
2344 This mapping is applied only if `index=True`.
2345
2346 Returns
2347 -------
2348 numpy.recarray
2349 NumPy ndarray with the DataFrame labels as fields and each row
2350 of the DataFrame as entries.
2351
2352 See Also
2353 --------
2354 DataFrame.from_records: Convert structured or record ndarray
2355 to DataFrame.
2356 numpy.recarray: An ndarray that allows field access using
2357 attributes, analogous to typed columns in a
2358 spreadsheet.
2359
2360 Examples
2361 --------
2362 >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
2363 ... index=['a', 'b'])
2364 >>> df
2365 A B
2366 a 1 0.50
2367 b 2 0.75
2368 >>> df.to_records()
2369 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
2370 dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
2371
2372 If the DataFrame index has no label then the recarray field name
2373 is set to 'index'. If the index has a label then this is used as the
2374 field name:
2375
2376 >>> df.index = df.index.rename("I")
2377 >>> df.to_records()
2378 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
2379 dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
2380
2381 The index can be excluded from the record array:
2382
2383 >>> df.to_records(index=False)
2384 rec.array([(1, 0.5 ), (2, 0.75)],
2385 dtype=[('A', '<i8'), ('B', '<f8')])
2386
2387 Data types can be specified for the columns:
2388
2389 >>> df.to_records(column_dtypes={"A": "int32"})
2390 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
2391 dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
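
        Data types can also be specified by column position (zero-indexed):

        >>> df.to_records(column_dtypes={0: "int32"})
        rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
                  dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])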
2392
2393 As well as for the index:
2394
2395 >>> df.to_records(index_dtypes="<S2")
2396 rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
2397 dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
2398
2399 >>> index_dtypes = f"<S{df.index.str.len().max()}"
2400 >>> df.to_records(index_dtypes=index_dtypes)
2401 rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
2402 dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
2403 """
2404 if index:
2405 ix_vals = [
2406 np.asarray(self.index.get_level_values(i))
2407 for i in range(self.index.nlevels)
2408 ]
2409
2410 arrays = ix_vals + [
2411 np.asarray(self.iloc[:, i]) for i in range(len(self.columns))
2412 ]
2413
2414 index_names = list(self.index.names)
2415
2416 if isinstance(self.index, MultiIndex):
2417 index_names = com.fill_missing_names(index_names)
2418 elif index_names[0] is None:
2419 index_names = ["index"]
2420
2421 names = [str(name) for name in itertools.chain(index_names, self.columns)]
2422 else:
2423 arrays = [np.asarray(self.iloc[:, i]) for i in range(len(self.columns))]
2424 names = [str(c) for c in self.columns]
2425 index_names = []
2426
2427 index_len = len(index_names)
2428 formats = []
2429
2430 for i, v in enumerate(arrays):
2431 index_int = i
2432
2433 # When the names and arrays are collected, we
2434 # first collect those in the DataFrame's index,
2435 # followed by those in its columns.
2436 #
2437 # Thus, the total length of the array is:
2438 # len(index_names) + len(DataFrame.columns).
2439 #
2440 # This check allows us to see whether we are
2441 # handling a name / array in the index or column.
2442 if index_int < index_len:
2443 dtype_mapping = index_dtypes
2444 name = index_names[index_int]
2445 else:
2446 index_int -= index_len
2447 dtype_mapping = column_dtypes
2448 name = self.columns[index_int]
2449
2450 # We have a dictionary, so we get the data type
2451 # associated with the index or column (which can
2452 # be denoted by its name in the DataFrame or its
2453 # position in DataFrame's array of indices or
            # columns, whichever is applicable).
2455 if is_dict_like(dtype_mapping):
2456 if name in dtype_mapping:
2457 dtype_mapping = dtype_mapping[name]
2458 elif index_int in dtype_mapping:
2459 dtype_mapping = dtype_mapping[index_int]
2460 else:
2461 dtype_mapping = None
2462
2463 # If no mapping can be found, use the array's
2464 # dtype attribute for formatting.
2465 #
2466 # A valid dtype must either be a type or
2467 # string naming a type.
2468 if dtype_mapping is None:
2469 formats.append(v.dtype)
2470 elif isinstance(dtype_mapping, (type, np.dtype, str)):
2471 # error: Argument 1 to "append" of "list" has incompatible
2472 # type "Union[type, dtype[Any], str]"; expected "dtype[Any]"
2473 formats.append(dtype_mapping) # type: ignore[arg-type]
2474 else:
2475 element = "row" if i < index_len else "column"
2476 msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}"
2477 raise ValueError(msg)
2478
2479 return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats})
2480
2481 @classmethod
2482 def _from_arrays(
2483 cls,
2484 arrays,
2485 columns,
2486 index,
2487 dtype: Dtype | None = None,
2488 verify_integrity: bool = True,
2489 ) -> DataFrame:
2490 """
2491 Create DataFrame from a list of arrays corresponding to the columns.
2492
2493 Parameters
2494 ----------
2495 arrays : list-like of arrays
2496 Each array in the list corresponds to one column, in order.
2497 columns : list-like, Index
2498 The column names for the resulting DataFrame.
2499 index : list-like, Index
2500 The rows labels for the resulting DataFrame.
2501 dtype : dtype, optional
2502 Optional dtype to enforce for all arrays.
2503 verify_integrity : bool, default True
            Validate and homogenize all input. If set to False, it is assumed
            that all elements of `arrays` are actual arrays (numpy ndarray or
            ExtensionArray) as they will be stored in a block, that they have
            the same length as and are aligned with the index, and that
            `columns` and `index` are already Index objects.
2509
2510 Returns
2511 -------
2512 DataFrame
2513 """
2514 if dtype is not None:
2515 dtype = pandas_dtype(dtype)
2516
2517 manager = get_option("mode.data_manager")
2518 columns = ensure_index(columns)
2519 if len(columns) != len(arrays):
2520 raise ValueError("len(columns) must match len(arrays)")
2521 mgr = arrays_to_mgr(
2522 arrays,
2523 columns,
2524 index,
2525 dtype=dtype,
2526 verify_integrity=verify_integrity,
2527 typ=manager,
2528 )
2529 return cls(mgr)
2530
2531 @doc(
2532 storage_options=_shared_docs["storage_options"],
2533 compression_options=_shared_docs["compression_options"] % "path",
2534 )
2535 def to_stata(
2536 self,
2537 path: FilePath | WriteBuffer[bytes],
2538 *,
2539 convert_dates: dict[Hashable, str] | None = None,
2540 write_index: bool = True,
2541 byteorder: str | None = None,
2542 time_stamp: datetime.datetime | None = None,
2543 data_label: str | None = None,
2544 variable_labels: dict[Hashable, str] | None = None,
2545 version: int | None = 114,
2546 convert_strl: Sequence[Hashable] | None = None,
2547 compression: CompressionOptions = "infer",
2548 storage_options: StorageOptions = None,
2549 value_labels: dict[Hashable, dict[float, str]] | None = None,
2550 ) -> None:
2551 """
2552 Export DataFrame object to Stata dta format.
2553
2554 Writes the DataFrame to a Stata dataset file.
2555 "dta" files contain a Stata dataset.
2556
2557 Parameters
2558 ----------
2559 path : str, path object, or buffer
2560 String, path object (implementing ``os.PathLike[str]``), or file-like
2561 object implementing a binary ``write()`` function.
2562
2563 convert_dates : dict
            Dictionary mapping columns containing datetime types to Stata
            internal format to use when writing the dates. Options are 'tc',
2566 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
2567 or a name. Datetime columns that do not have a conversion type
2568 specified will be converted to 'tc'. Raises NotImplementedError if
2569 a datetime column has timezone information.
2570 write_index : bool
2571 Write the index to Stata dataset.
2572 byteorder : str
            Can be ">", "<", "little", or "big". Default is `sys.byteorder`.
2574 time_stamp : datetime
2575 A datetime to use as file creation date. Default is the current
2576 time.
2577 data_label : str, optional
2578 A label for the data set. Must be 80 characters or smaller.
2579 variable_labels : dict
2580 Dictionary containing columns as keys and variable labels as
2581 values. Each label must be 80 characters or smaller.
2582 version : {{114, 117, 118, 119, None}}, default 114
2583 Version to use in the output dta file. Set to None to let pandas
2584 decide between 118 or 119 formats depending on the number of
2585 columns in the frame. Version 114 can be read by Stata 10 and
2586 later. Version 117 can be read by Stata 13 or later. Version 118
2587 is supported in Stata 14 and later. Version 119 is supported in
2588 Stata 15 and later. Version 114 limits string variables to 244
2589 characters or fewer while versions 117 and later allow strings
2590 with lengths up to 2,000,000 characters. Versions 118 and 119
2591 support Unicode characters, and version 119 supports more than
2592 32,767 variables.
2593
2594 Version 119 should usually only be used when the number of
2595 variables exceeds the capacity of dta format 118. Exporting
2596 smaller datasets in format 119 may have unintended consequences,
2597 and, as of November 2020, Stata SE cannot read version 119 files.
2598
2599 convert_strl : list, optional
            List of column names to convert to string columns in Stata's StrL
            format. Only available if version is 117. Storing strings in the
2602 StrL format can produce smaller dta files if strings have more than
2603 8 characters and values are repeated.
2604 {compression_options}
2605
2606 .. versionadded:: 1.1.0
2607
2608 .. versionchanged:: 1.4.0 Zstandard support.
2609
2610 {storage_options}
2611
2612 .. versionadded:: 1.2.0
2613
2614 value_labels : dict of dicts
2615 Dictionary containing columns as keys and dictionaries of column value
2616 to labels as values. Labels for a single variable must be 32,000
2617 characters or smaller.
2618
2619 .. versionadded:: 1.4.0
2620
2621 Raises
2622 ------
2623 NotImplementedError
2624 * If datetimes contain timezone information
2625 * Column dtype is not representable in Stata
2626 ValueError
            * Columns listed in convert_dates are neither datetime64[ns]
              nor datetime.datetime
2629 * Column listed in convert_dates is not in DataFrame
2630 * Categorical label contains more than 32,000 characters
2631
2632 See Also
2633 --------
2634 read_stata : Import Stata data files.
2635 io.stata.StataWriter : Low-level writer for Stata data files.
2636 io.stata.StataWriter117 : Low-level writer for version 117 files.
2637
2638 Examples
2639 --------
2640 >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon',
2641 ... 'parrot'],
2642 ... 'speed': [350, 18, 361, 15]}})
2643 >>> df.to_stata('animals.dta') # doctest: +SKIP
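
        A datetime column can be written using one of Stata's internal date
        formats via ``convert_dates`` ('td' stores day-resolution dates):

        >>> df['date'] = pd.date_range('2020-01-01', periods=4)
        >>> df.to_stata('animals.dta',
        ...             convert_dates={{'date': 'td'}})  # doctest: +SKIP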
2644 """
2645 if version not in (114, 117, 118, 119, None):
2646 raise ValueError("Only formats 114, 117, 118 and 119 are supported.")
2647 if version == 114:
2648 if convert_strl is not None:
2649 raise ValueError("strl is not supported in format 114")
2650 from pandas.io.stata import StataWriter as statawriter
2651 elif version == 117:
2652 # Incompatible import of "statawriter" (imported name has type
2653 # "Type[StataWriter117]", local name has type "Type[StataWriter]")
2654 from pandas.io.stata import ( # type: ignore[assignment]
2655 StataWriter117 as statawriter,
2656 )
2657 else: # versions 118 and 119
2658 # Incompatible import of "statawriter" (imported name has type
2659 # "Type[StataWriter117]", local name has type "Type[StataWriter]")
2660 from pandas.io.stata import ( # type: ignore[assignment]
2661 StataWriterUTF8 as statawriter,
2662 )
2663
2664 kwargs: dict[str, Any] = {}
2665 if version is None or version >= 117:
2666 # strl conversion is only supported >= 117
2667 kwargs["convert_strl"] = convert_strl
2668 if version is None or version >= 118:
2669 # Specifying the version is only supported for UTF8 (118 or 119)
2670 kwargs["version"] = version
2671
2672 writer = statawriter(
2673 path,
2674 self,
2675 convert_dates=convert_dates,
2676 byteorder=byteorder,
2677 time_stamp=time_stamp,
2678 data_label=data_label,
2679 write_index=write_index,
2680 variable_labels=variable_labels,
2681 compression=compression,
2682 storage_options=storage_options,
2683 value_labels=value_labels,
2684 **kwargs,
2685 )
2686 writer.write_file()
2687
2688 def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None:
2689 """
2690 Write a DataFrame to the binary Feather format.
2691
2692 Parameters
2693 ----------
2694 path : str, path object, file-like object
2695 String, path object (implementing ``os.PathLike[str]``), or file-like
2696 object implementing a binary ``write()`` function. If a string or a path,
2697 it will be used as Root Directory path when writing a partitioned dataset.
2698 **kwargs :
2699 Additional keywords passed to :func:`pyarrow.feather.write_feather`.
2700 Starting with pyarrow 0.17, this includes the `compression`,
2701 `compression_level`, `chunksize` and `version` keywords.
2702
2703 .. versionadded:: 1.1.0
2704
2705 Notes
2706 -----
2707 This function writes the dataframe as a `feather file
2708 <https://arrow.apache.org/docs/python/feather.html>`_. Requires a default
2709 index. For saving the DataFrame with your custom index use a method that
        supports custom indices, e.g. `to_parquet`.
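
        Examples
        --------
        Assuming ``pyarrow`` is installed:

        >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
        >>> df.to_feather("file.feather")  # doctest: +SKIP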
2711 """
2712 from pandas.io.feather_format import to_feather
2713
2714 to_feather(self, path, **kwargs)
2715
2716 @doc(
2717 Series.to_markdown,
2718 klass=_shared_doc_kwargs["klass"],
2719 storage_options=_shared_docs["storage_options"],
2720 examples="""Examples
2721 --------
2722 >>> df = pd.DataFrame(
2723 ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}
2724 ... )
2725 >>> print(df.to_markdown())
2726 | | animal_1 | animal_2 |
2727 |---:|:-----------|:-----------|
2728 | 0 | elk | dog |
2729 | 1 | pig | quetzal |
2730
2731 Output markdown with a tabulate option.
2732
2733 >>> print(df.to_markdown(tablefmt="grid"))
2734 +----+------------+------------+
2735 | | animal_1 | animal_2 |
2736 +====+============+============+
2737 | 0 | elk | dog |
2738 +----+------------+------------+
2739 | 1 | pig | quetzal |
2740 +----+------------+------------+""",
2741 )
2742 def to_markdown(
2743 self,
2744 buf: FilePath | WriteBuffer[str] | None = None,
2745 mode: str = "wt",
2746 index: bool = True,
2747 storage_options: StorageOptions = None,
2748 **kwargs,
2749 ) -> str | None:
2750 if "showindex" in kwargs:
            raise ValueError("Pass 'index' instead of 'showindex'")
2752
2753 kwargs.setdefault("headers", "keys")
2754 kwargs.setdefault("tablefmt", "pipe")
2755 kwargs.setdefault("showindex", index)
2756 tabulate = import_optional_dependency("tabulate")
2757 result = tabulate.tabulate(self, **kwargs)
2758 if buf is None:
2759 return result
2760
2761 with get_handle(buf, mode, storage_options=storage_options) as handles:
2762 handles.handle.write(result)
2763 return None
2764
2765 @overload
2766 def to_parquet(
2767 self,
2768 path: None = ...,
2769 engine: str = ...,
2770 compression: str | None = ...,
2771 index: bool | None = ...,
2772 partition_cols: list[str] | None = ...,
2773 storage_options: StorageOptions = ...,
2774 **kwargs,
2775 ) -> bytes:
2776 ...
2777
2778 @overload
2779 def to_parquet(
2780 self,
2781 path: FilePath | WriteBuffer[bytes],
2782 engine: str = ...,
2783 compression: str | None = ...,
2784 index: bool | None = ...,
2785 partition_cols: list[str] | None = ...,
2786 storage_options: StorageOptions = ...,
2787 **kwargs,
2788 ) -> None:
2789 ...
2790
2791 @doc(storage_options=_shared_docs["storage_options"])
2792 def to_parquet(
2793 self,
2794 path: FilePath | WriteBuffer[bytes] | None = None,
2795 engine: str = "auto",
2796 compression: str | None = "snappy",
2797 index: bool | None = None,
2798 partition_cols: list[str] | None = None,
2799 storage_options: StorageOptions = None,
2800 **kwargs,
2801 ) -> bytes | None:
2802 """
2803 Write a DataFrame to the binary parquet format.
2804
2805 This function writes the dataframe as a `parquet file
2806 <https://parquet.apache.org/>`_. You can choose different parquet
2807 backends, and have the option of compression. See
2808 :ref:`the user guide <io.parquet>` for more details.
2809
2810 Parameters
2811 ----------
2812 path : str, path object, file-like object, or None, default None
2813 String, path object (implementing ``os.PathLike[str]``), or file-like
2814 object implementing a binary ``write()`` function. If None, the result is
2815 returned as bytes. If a string or path, it will be used as Root Directory
2816 path when writing a partitioned dataset.
2817
2818 .. versionchanged:: 1.2.0
2819
2820 Previously this was "fname"
2821
2822 engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
2823 Parquet library to use. If 'auto', then the option
2824 ``io.parquet.engine`` is used. The default ``io.parquet.engine``
2825 behavior is to try 'pyarrow', falling back to 'fastparquet' if
2826 'pyarrow' is unavailable.
2827 compression : {{'snappy', 'gzip', 'brotli', None}}, default 'snappy'
2828 Name of the compression to use. Use ``None`` for no compression.
2829 index : bool, default None
2830 If ``True``, include the dataframe's index(es) in the file output.
2831 If ``False``, they will not be written to the file.
2832 If ``None``, similar to ``True`` the dataframe's index(es)
2833 will be saved. However, instead of being saved as values,
2834 the RangeIndex will be stored as a range in the metadata so it
2835 doesn't require much space and is faster. Other indexes will
2836 be included as columns in the file output.
2837 partition_cols : list, optional, default None
2838 Column names by which to partition the dataset.
2839 Columns are partitioned in the order they are given.
2840 Must be None if path is not a string.
2841 {storage_options}
2842
2843 .. versionadded:: 1.2.0
2844
2845 **kwargs
2846 Additional arguments passed to the parquet library. See
2847 :ref:`pandas io <io.parquet>` for more details.
2848
2849 Returns
2850 -------
2851 bytes if no path argument is provided else None
2852
2853 See Also
2854 --------
2855 read_parquet : Read a parquet file.
2856 DataFrame.to_orc : Write an orc file.
2857 DataFrame.to_csv : Write a csv file.
2858 DataFrame.to_sql : Write to a sql table.
2859 DataFrame.to_hdf : Write to hdf.
2860
2861 Notes
2862 -----
2863 This function requires either the `fastparquet
2864 <https://pypi.org/project/fastparquet>`_ or `pyarrow
2865 <https://arrow.apache.org/docs/python/>`_ library.
2866
2867 Examples
2868 --------
2869 >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}})
2870 >>> df.to_parquet('df.parquet.gzip',
2871 ... compression='gzip') # doctest: +SKIP
2872 >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP
2873 col1 col2
2874 0 1 3
2875 1 2 4
2876
        If you want to get a buffer to the parquet content you can use an
        io.BytesIO object, as long as you don't use partition_cols, which
        creates multiple files.
2879
2880 >>> import io
2881 >>> f = io.BytesIO()
2882 >>> df.to_parquet(f)
2883 >>> f.seek(0)
2884 0
2885 >>> content = f.read()
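
        A dataset can also be partitioned on disk by one or more columns;
        ``path`` is then used as the root directory of the written files:

        >>> df.to_parquet('my_dataset_root',
        ...               partition_cols=['col1'])  # doctest: +SKIP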
2886 """
2887 from pandas.io.parquet import to_parquet
2888
2889 return to_parquet(
2890 self,
2891 path,
2892 engine,
2893 compression=compression,
2894 index=index,
2895 partition_cols=partition_cols,
2896 storage_options=storage_options,
2897 **kwargs,
2898 )
2899
2900 def to_orc(
2901 self,
2902 path: FilePath | WriteBuffer[bytes] | None = None,
2903 *,
2904 engine: Literal["pyarrow"] = "pyarrow",
2905 index: bool | None = None,
2906 engine_kwargs: dict[str, Any] | None = None,
2907 ) -> bytes | None:
2908 """
2909 Write a DataFrame to the ORC format.
2910
2911 .. versionadded:: 1.5.0
2912
2913 Parameters
2914 ----------
2915 path : str, file-like object or None, default None
2916 If a string, it will be used as Root Directory path
2917 when writing a partitioned dataset. By file-like object,
2918 we refer to objects with a write() method, such as a file handle
2919 (e.g. via builtin open function). If path is None,
2920 a bytes object is returned.
2921 engine : str, default 'pyarrow'
2922 ORC library to use. Pyarrow must be >= 7.0.0.
2923 index : bool, optional
2924 If ``True``, include the dataframe's index(es) in the file output.
2925 If ``False``, they will not be written to the file.
2926 If ``None``, similar to ``infer`` the dataframe's index(es)
2927 will be saved. However, instead of being saved as values,
2928 the RangeIndex will be stored as a range in the metadata so it
2929 doesn't require much space and is faster. Other indexes will
2930 be included as columns in the file output.
2931 engine_kwargs : dict[str, Any] or None, default None
2932 Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
2933
2934 Returns
2935 -------
2936 bytes if no path argument is provided else None
2937
2938 Raises
2939 ------
2940 NotImplementedError
2941 Dtype of one or more columns is category, unsigned integers, interval,
2942 period or sparse.
2943 ValueError
2944 engine is not pyarrow.
2945
2946 See Also
2947 --------
        read_orc : Read an ORC file.
2949 DataFrame.to_parquet : Write a parquet file.
2950 DataFrame.to_csv : Write a csv file.
2951 DataFrame.to_sql : Write to a sql table.
2952 DataFrame.to_hdf : Write to hdf.
2953
2954 Notes
2955 -----
2956 * Before using this function you should read the :ref:`user guide about
2957 ORC <io.orc>` and :ref:`install optional dependencies <install.warn_orc>`.
2958 * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_
2959 library.
2960 * For supported dtypes please refer to `supported ORC features in Arrow
2961 <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
2962 * Currently timezones in datetime columns are not preserved when a
2963 dataframe is converted into ORC files.
2964
2965 Examples
2966 --------
2967 >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]})
2968 >>> df.to_orc('df.orc') # doctest: +SKIP
2969 >>> pd.read_orc('df.orc') # doctest: +SKIP
2970 col1 col2
2971 0 1 4
2972 1 2 3
2973
        If you want to get a buffer to the ORC content you can write it to
        io.BytesIO:

2975 >>> import io
2976 >>> b = io.BytesIO(df.to_orc()) # doctest: +SKIP
2977 >>> b.seek(0) # doctest: +SKIP
2978 0
2979 >>> content = b.read() # doctest: +SKIP
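
        Additional options can be forwarded to :func:`pyarrow.orc.write_table`
        via ``engine_kwargs`` (the compression choice here is illustrative):

        >>> df.to_orc('df.orc',
        ...           engine_kwargs={'compression': 'snappy'})  # doctest: +SKIP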
2980 """
2981 from pandas.io.orc import to_orc
2982
2983 return to_orc(
2984 self, path, engine=engine, index=index, engine_kwargs=engine_kwargs
2985 )
2986
2987 @overload
2988 def to_html(
2989 self,
2990 buf: FilePath | WriteBuffer[str],
2991 columns: Sequence[Level] | None = ...,
2992 col_space: ColspaceArgType | None = ...,
2993 header: bool | Sequence[str] = ...,
2994 index: bool = ...,
2995 na_rep: str = ...,
2996 formatters: FormattersType | None = ...,
2997 float_format: FloatFormatType | None = ...,
2998 sparsify: bool | None = ...,
2999 index_names: bool = ...,
3000 justify: str | None = ...,
3001 max_rows: int | None = ...,
3002 max_cols: int | None = ...,
3003 show_dimensions: bool | str = ...,
3004 decimal: str = ...,
3005 bold_rows: bool = ...,
3006 classes: str | list | tuple | None = ...,
3007 escape: bool = ...,
3008 notebook: bool = ...,
3009 border: int | bool | None = ...,
3010 table_id: str | None = ...,
3011 render_links: bool = ...,
3012 encoding: str | None = ...,
3013 ) -> None:
3014 ...
3015
3016 @overload
3017 def to_html(
3018 self,
3019 buf: None = ...,
3020 columns: Sequence[Level] | None = ...,
3021 col_space: ColspaceArgType | None = ...,
3022 header: bool | Sequence[str] = ...,
3023 index: bool = ...,
3024 na_rep: str = ...,
3025 formatters: FormattersType | None = ...,
3026 float_format: FloatFormatType | None = ...,
3027 sparsify: bool | None = ...,
3028 index_names: bool = ...,
3029 justify: str | None = ...,
3030 max_rows: int | None = ...,
3031 max_cols: int | None = ...,
3032 show_dimensions: bool | str = ...,
3033 decimal: str = ...,
3034 bold_rows: bool = ...,
3035 classes: str | list | tuple | None = ...,
3036 escape: bool = ...,
3037 notebook: bool = ...,
3038 border: int | bool | None = ...,
3039 table_id: str | None = ...,
3040 render_links: bool = ...,
3041 encoding: str | None = ...,
3042 ) -> str:
3043 ...
3044
3045 @Substitution(
3046 header_type="bool",
3047 header="Whether to print column labels, default True",
3048 col_space_type="str or int, list or dict of int or str",
3049 col_space="The minimum width of each column in CSS length "
3050 "units. An int is assumed to be px units.",
3051 )
3052 @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
3053 def to_html(
3054 self,
3055 buf: FilePath | WriteBuffer[str] | None = None,
3056 columns: Sequence[Level] | None = None,
3057 col_space: ColspaceArgType | None = None,
3058 header: bool | Sequence[str] = True,
3059 index: bool = True,
3060 na_rep: str = "NaN",
3061 formatters: FormattersType | None = None,
3062 float_format: FloatFormatType | None = None,
3063 sparsify: bool | None = None,
3064 index_names: bool = True,
3065 justify: str | None = None,
3066 max_rows: int | None = None,
3067 max_cols: int | None = None,
3068 show_dimensions: bool | str = False,
3069 decimal: str = ".",
3070 bold_rows: bool = True,
3071 classes: str | list | tuple | None = None,
3072 escape: bool = True,
3073 notebook: bool = False,
3074 border: int | bool | None = None,
3075 table_id: str | None = None,
3076 render_links: bool = False,
3077 encoding: str | None = None,
3078 ) -> str | None:
3079 """
3080 Render a DataFrame as an HTML table.
3081 %(shared_params)s
3082 bold_rows : bool, default True
3083 Make the row labels bold in the output.
3084 classes : str or list or tuple, default None
3085 CSS class(es) to apply to the resulting html table.
3086 escape : bool, default True
3087 Convert the characters <, >, and & to HTML-safe sequences.
3088 notebook : {True, False}, default False
3089 Whether the generated HTML is for IPython Notebook.
3090 border : int
3091 A ``border=border`` attribute is included in the opening
3092 `<table>` tag. Default ``pd.options.display.html.border``.
3093 table_id : str, optional
3094 A css id is included in the opening `<table>` tag if specified.
3095 render_links : bool, default False
3096 Convert URLs to HTML links.
3097 encoding : str, default "utf-8"
3098 Set character encoding.
3099
3100 .. versionadded:: 1.0
3101 %(returns)s
3102 See Also
3103 --------
3104 to_string : Convert DataFrame to a string.
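
        Examples
        --------
        A minimal example (the full HTML output is elided):

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> print(df.to_html())  # doctest: +SKIP
        <table border="1" class="dataframe">
        ...
        </table>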
3105 """
3106 if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS:
3107 raise ValueError("Invalid value for justify parameter")
3108
3109 formatter = fmt.DataFrameFormatter(
3110 self,
3111 columns=columns,
3112 col_space=col_space,
3113 na_rep=na_rep,
3114 header=header,
3115 index=index,
3116 formatters=formatters,
3117 float_format=float_format,
3118 bold_rows=bold_rows,
3119 sparsify=sparsify,
3120 justify=justify,
3121 index_names=index_names,
3122 escape=escape,
3123 decimal=decimal,
3124 max_rows=max_rows,
3125 max_cols=max_cols,
3126 show_dimensions=show_dimensions,
3127 )
        # TODO: a generic formatter would be in DataFrameFormatter
3129 return fmt.DataFrameRenderer(formatter).to_html(
3130 buf=buf,
3131 classes=classes,
3132 notebook=notebook,
3133 border=border,
3134 encoding=encoding,
3135 table_id=table_id,
3136 render_links=render_links,
3137 )
3138
3139 @doc(
3140 storage_options=_shared_docs["storage_options"],
3141 compression_options=_shared_docs["compression_options"] % "path_or_buffer",
3142 )
3143 def to_xml(
3144 self,
3145 path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
3146 index: bool = True,
3147 root_name: str | None = "data",
3148 row_name: str | None = "row",
3149 na_rep: str | None = None,
3150 attr_cols: list[str] | None = None,
3151 elem_cols: list[str] | None = None,
3152 namespaces: dict[str | None, str] | None = None,
3153 prefix: str | None = None,
3154 encoding: str = "utf-8",
3155 xml_declaration: bool | None = True,
3156 pretty_print: bool | None = True,
3157 parser: str | None = "lxml",
3158 stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None,
3159 compression: CompressionOptions = "infer",
3160 storage_options: StorageOptions = None,
3161 ) -> str | None:
3162 """
3163 Render a DataFrame to an XML document.
3164
3165 .. versionadded:: 1.3.0
3166
3167 Parameters
3168 ----------
3169 path_or_buffer : str, path object, file-like object, or None, default None
3170 String, path object (implementing ``os.PathLike[str]``), or file-like
3171 object implementing a ``write()`` function. If None, the result is returned
3172 as a string.
3173 index : bool, default True
3174 Whether to include index in XML document.
3175 root_name : str, default 'data'
3176 The name of root element in XML document.
3177 row_name : str, default 'row'
3178 The name of row element in XML document.
3179 na_rep : str, optional
3180 Missing data representation.
3181 attr_cols : list-like, optional
3182 List of columns to write as attributes in row element.
3183 Hierarchical columns will be flattened with underscore
3184 delimiting the different levels.
3185 elem_cols : list-like, optional
3186 List of columns to write as children in row element. By default,
3187 all columns output as children of row element. Hierarchical
3188 columns will be flattened with underscore delimiting the
3189 different levels.
3190 namespaces : dict, optional
            All namespaces to be defined in root element. Keys of dict
            should be prefix names and values of dict the corresponding URIs.
            Default namespaces should be given an empty string key. For
            example, ::
3195
3196 namespaces = {{"": "https://example.com"}}
3197
3198 prefix : str, optional
3199 Namespace prefix to be used for every element and/or attribute
3200 in document. This should be one of the keys in ``namespaces``
3201 dict.
3202 encoding : str, default 'utf-8'
3203 Encoding of the resulting document.
3204 xml_declaration : bool, default True
3205 Whether to include the XML declaration at start of document.
3206 pretty_print : bool, default True
3207 Whether output should be pretty printed with indentation and
3208 line breaks.
3209 parser : {{'lxml','etree'}}, default 'lxml'
3210 Parser module to use for building of tree. Only 'lxml' and
            'etree' are supported. With 'lxml', the ability to use an XSLT
            stylesheet is supported.
3213 stylesheet : str, path object or file-like object, optional
3214 A URL, file-like object, or a raw string containing an XSLT
3215 script used to transform the raw XML output. Script should use
            layout of elements and attributes from original output. This
            argument requires ``lxml`` to be installed. Only XSLT 1.0
            scripts are currently supported; later versions are not.
3219 {compression_options}
3220
3221 .. versionchanged:: 1.4.0 Zstandard support.
3222
3223 {storage_options}
3224
3225 Returns
3226 -------
3227 None or str
            If ``path_or_buffer`` is None, returns the resulting XML format as
            a string. Otherwise returns None.
3230
3231 See Also
3232 --------
3233 to_json : Convert the pandas object to a JSON string.
        to_html : Convert DataFrame to HTML.
3235
3236 Examples
3237 --------
3238 >>> df = pd.DataFrame({{'shape': ['square', 'circle', 'triangle'],
3239 ... 'degrees': [360, 360, 180],
3240 ... 'sides': [4, np.nan, 3]}})
3241
3242 >>> df.to_xml() # doctest: +SKIP
3243 <?xml version='1.0' encoding='utf-8'?>
3244 <data>
3245 <row>
3246 <index>0</index>
3247 <shape>square</shape>
3248 <degrees>360</degrees>
3249 <sides>4.0</sides>
3250 </row>
3251 <row>
3252 <index>1</index>
3253 <shape>circle</shape>
3254 <degrees>360</degrees>
3255 <sides/>
3256 </row>
3257 <row>
3258 <index>2</index>
3259 <shape>triangle</shape>
3260 <degrees>180</degrees>
3261 <sides>3.0</sides>
3262 </row>
3263 </data>
3264
3265 >>> df.to_xml(attr_cols=[
3266 ... 'index', 'shape', 'degrees', 'sides'
3267 ... ]) # doctest: +SKIP
3268 <?xml version='1.0' encoding='utf-8'?>
3269 <data>
3270 <row index="0" shape="square" degrees="360" sides="4.0"/>
3271 <row index="1" shape="circle" degrees="360"/>
3272 <row index="2" shape="triangle" degrees="180" sides="3.0"/>
3273 </data>
3274
3275 >>> df.to_xml(namespaces={{"doc": "https://example.com"}},
3276 ... prefix="doc") # doctest: +SKIP
3277 <?xml version='1.0' encoding='utf-8'?>
3278 <doc:data xmlns:doc="https://example.com">
3279 <doc:row>
3280 <doc:index>0</doc:index>
3281 <doc:shape>square</doc:shape>
3282 <doc:degrees>360</doc:degrees>
3283 <doc:sides>4.0</doc:sides>
3284 </doc:row>
3285 <doc:row>
3286 <doc:index>1</doc:index>
3287 <doc:shape>circle</doc:shape>
3288 <doc:degrees>360</doc:degrees>
3289 <doc:sides/>
3290 </doc:row>
3291 <doc:row>
3292 <doc:index>2</doc:index>
3293 <doc:shape>triangle</doc:shape>
3294 <doc:degrees>180</doc:degrees>
3295 <doc:sides>3.0</doc:sides>
3296 </doc:row>
3297 </doc:data>
3298 """
3299
3300 from pandas.io.formats.xml import (
3301 EtreeXMLFormatter,
3302 LxmlXMLFormatter,
3303 )
3304
3305 lxml = import_optional_dependency("lxml.etree", errors="ignore")
3306
3307 TreeBuilder: type[EtreeXMLFormatter] | type[LxmlXMLFormatter]
3308
3309 if parser == "lxml":
3310 if lxml is not None:
3311 TreeBuilder = LxmlXMLFormatter
3312 else:
3313 raise ImportError(
3314 "lxml not found, please install or use the etree parser."
3315 )
3316
3317 elif parser == "etree":
3318 TreeBuilder = EtreeXMLFormatter
3319
3320 else:
3321 raise ValueError("Values for parser can only be lxml or etree.")
3322
3323 xml_formatter = TreeBuilder(
3324 self,
3325 path_or_buffer=path_or_buffer,
3326 index=index,
3327 root_name=root_name,
3328 row_name=row_name,
3329 na_rep=na_rep,
3330 attr_cols=attr_cols,
3331 elem_cols=elem_cols,
3332 namespaces=namespaces,
3333 prefix=prefix,
3334 encoding=encoding,
3335 xml_declaration=xml_declaration,
3336 pretty_print=pretty_print,
3337 stylesheet=stylesheet,
3338 compression=compression,
3339 storage_options=storage_options,
3340 )
3341
3342 return xml_formatter.write_output()
3343
3344 # ----------------------------------------------------------------------
3345 @doc(INFO_DOCSTRING, **frame_sub_kwargs)
3346 def info(
3347 self,
3348 verbose: bool | None = None,
3349 buf: WriteBuffer[str] | None = None,
3350 max_cols: int | None = None,
3351 memory_usage: bool | str | None = None,
3352 show_counts: bool | None = None,
3353 ) -> None:
3354 info = DataFrameInfo(
3355 data=self,
3356 memory_usage=memory_usage,
3357 )
3358 info.render(
3359 buf=buf,
3360 max_cols=max_cols,
3361 verbose=verbose,
3362 show_counts=show_counts,
3363 )
3364
3365 def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
3366 """
3367 Return the memory usage of each column in bytes.
3368
3369 The memory usage can optionally include the contribution of
3370 the index and elements of `object` dtype.
3371
3372 This value is displayed in `DataFrame.info` by default. This can be
3373 suppressed by setting ``pandas.options.display.memory_usage`` to False.
3374
3375 Parameters
3376 ----------
3377 index : bool, default True
3378 Specifies whether to include the memory usage of the DataFrame's
3379 index in returned Series. If ``index=True``, the memory usage of
3380 the index is the first item in the output.
3381 deep : bool, default False
3382 If True, introspect the data deeply by interrogating
3383 `object` dtypes for system-level memory consumption, and include
3384 it in the returned values.
3385
3386 Returns
3387 -------
3388 Series
            A Series whose index is the original column names and whose values
            are the memory usage of each column in bytes.
3391
3392 See Also
3393 --------
3394 numpy.ndarray.nbytes : Total bytes consumed by the elements of an
3395 ndarray.
3396 Series.memory_usage : Bytes consumed by a Series.
3397 Categorical : Memory-efficient array for string values with
3398 many repeated values.
3399 DataFrame.info : Concise summary of a DataFrame.
3400
3401 Notes
3402 -----
3403 See the :ref:`Frequently Asked Questions <df-memory-usage>` for more
3404 details.
3405
3406 Examples
3407 --------
3408 >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
3409 >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t))
3410 ... for t in dtypes])
3411 >>> df = pd.DataFrame(data)
3412 >>> df.head()
3413 int64 float64 complex128 object bool
3414 0 1 1.0 1.0+0.0j 1 True
3415 1 1 1.0 1.0+0.0j 1 True
3416 2 1 1.0 1.0+0.0j 1 True
3417 3 1 1.0 1.0+0.0j 1 True
3418 4 1 1.0 1.0+0.0j 1 True
3419
3420 >>> df.memory_usage()
3421 Index 128
3422 int64 40000
3423 float64 40000
3424 complex128 80000
3425 object 40000
3426 bool 5000
3427 dtype: int64
3428
3429 >>> df.memory_usage(index=False)
3430 int64 40000
3431 float64 40000
3432 complex128 80000
3433 object 40000
3434 bool 5000
3435 dtype: int64
3436
3437 The memory footprint of `object` dtype columns is ignored by default:
3438
3439 >>> df.memory_usage(deep=True)
3440 Index 128
3441 int64 40000
3442 float64 40000
3443 complex128 80000
3444 object 180000
3445 bool 5000
3446 dtype: int64
3447
3448 Use a Categorical for efficient storage of an object-dtype column with
3449 many repeated values.
3450
3451 >>> df['object'].astype('category').memory_usage(deep=True)
3452 5244
3453 """
3454 result = self._constructor_sliced(
3455 [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
3456 index=self.columns,
3457 dtype=np.intp,
3458 )
3459 if index:
3460 index_memory_usage = self._constructor_sliced(
3461 self.index.memory_usage(deep=deep), index=["Index"]
3462 )
3463 result = index_memory_usage._append(result)
3464 return result
3465
3466 def transpose(self, *args, copy: bool = False) -> DataFrame:
3467 """
3468 Transpose index and columns.
3469
3470 Reflect the DataFrame over its main diagonal by writing rows as columns
3471 and vice-versa. The property :attr:`.T` is an accessor to the method
3472 :meth:`transpose`.
3473
3474 Parameters
3475 ----------
3476 *args : tuple, optional
3477 Accepted for compatibility with NumPy.
3478 copy : bool, default False
3479 Whether to copy the data after transposing, even for DataFrames
3480 with a single dtype.
3481
3482 Note that a copy is always required for mixed dtype DataFrames,
3483 or for DataFrames with any extension types.
3484
3485 Returns
3486 -------
3487 DataFrame
3488 The transposed DataFrame.
3489
3490 See Also
3491 --------
3492 numpy.transpose : Permute the dimensions of a given array.
3493
3494 Notes
3495 -----
3496 Transposing a DataFrame with mixed dtypes will result in a homogeneous
3497 DataFrame with the `object` dtype. In such a case, a copy of the data
3498 is always made.
3499
3500 Examples
3501 --------
3502 **Square DataFrame with homogeneous dtype**
3503
3504 >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
3505 >>> df1 = pd.DataFrame(data=d1)
3506 >>> df1
3507 col1 col2
3508 0 1 3
3509 1 2 4
3510
3511 >>> df1_transposed = df1.T # or df1.transpose()
3512 >>> df1_transposed
3513 0 1
3514 col1 1 2
3515 col2 3 4
3516
3517 When the dtype is homogeneous in the original DataFrame, we get a
3518 transposed DataFrame with the same dtype:
3519
3520 >>> df1.dtypes
3521 col1 int64
3522 col2 int64
3523 dtype: object
3524 >>> df1_transposed.dtypes
3525 0 int64
3526 1 int64
3527 dtype: object
3528
3529 **Non-square DataFrame with mixed dtypes**
3530
3531 >>> d2 = {'name': ['Alice', 'Bob'],
3532 ... 'score': [9.5, 8],
3533 ... 'employed': [False, True],
3534 ... 'kids': [0, 0]}
3535 >>> df2 = pd.DataFrame(data=d2)
3536 >>> df2
3537 name score employed kids
3538 0 Alice 9.5 False 0
3539 1 Bob 8.0 True 0
3540
3541 >>> df2_transposed = df2.T # or df2.transpose()
3542 >>> df2_transposed
3543 0 1
3544 name Alice Bob
3545 score 9.5 8.0
3546 employed False True
3547 kids 0 0
3548
3549 When the DataFrame has mixed dtypes, we get a transposed DataFrame with
3550 the `object` dtype:
3551
3552 >>> df2.dtypes
3553 name object
3554 score float64
3555 employed bool
3556 kids int64
3557 dtype: object
3558 >>> df2_transposed.dtypes
3559 0 object
3560 1 object
3561 dtype: object
3562 """
3563 nv.validate_transpose(args, {})
3564 # construct the args
3565
3566 dtypes = list(self.dtypes)
3567
3568 if self._can_fast_transpose:
3569 # Note: tests pass without this, but this improves perf quite a bit.
3570 new_vals = self._values.T
3571 if copy and not using_copy_on_write():
3572 new_vals = new_vals.copy()
3573
3574 result = self._constructor(
3575 new_vals, index=self.columns, columns=self.index, copy=False
3576 )
3577 if using_copy_on_write() and len(self) > 0:
3578 result._mgr.add_references(self._mgr) # type: ignore[arg-type]
3579
3580 elif (
3581 self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0])
3582 ):
3583 # We have EAs with the same dtype. We can preserve that dtype in transpose.
3584 dtype = dtypes[0]
3585 arr_type = dtype.construct_array_type()
3586 values = self.values
3587
3588 new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values]
3589 result = type(self)._from_arrays(
3590 new_values, index=self.columns, columns=self.index
3591 )
3592
3593 else:
3594 new_arr = self.values.T
3595 if copy and not using_copy_on_write():
3596 new_arr = new_arr.copy()
3597 result = self._constructor(
3598 new_arr,
3599 index=self.columns,
3600 columns=self.index,
3601 # We already made a copy (more than one block)
3602 copy=False,
3603 )
3604
3605 return result.__finalize__(self, method="transpose")
3606
3607 @property
3608 def T(self) -> DataFrame:
3609 """
3610 The transpose of the DataFrame.
3611
3612 Returns
3613 -------
3614 DataFrame
3615 The transposed DataFrame.
3616
3617 See Also
3618 --------
3619 DataFrame.transpose : Transpose index and columns.
3620
3621 Examples
3622 --------
3623 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
3624 >>> df
3625 col1 col2
3626 0 1 3
3627 1 2 4
3628
3629 >>> df.T
3630 0 1
3631 col1 1 2
3632 col2 3 4
3633 """
3634 return self.transpose()
3635
3636 # ----------------------------------------------------------------------
3637 # Indexing Methods
3638
3639 def _ixs(self, i: int, axis: AxisInt = 0) -> Series:
3640 """
3641 Parameters
3642 ----------
3643 i : int
3644 axis : int
3645
3646 Returns
3647 -------
3648 Series
3649 """
3650 # irow
3651 if axis == 0:
3652 new_mgr = self._mgr.fast_xs(i)
3653
3654 # if we are a copy, mark as such
3655 copy = isinstance(new_mgr.array, np.ndarray) and new_mgr.array.base is None
3656 result = self._constructor_sliced(new_mgr, name=self.index[i]).__finalize__(
3657 self
3658 )
3659 result._set_is_copy(self, copy=copy)
3660 return result
3661
3662 # icol
3663 else:
3664 label = self.columns[i]
3665
3666 col_mgr = self._mgr.iget(i)
3667 result = self._box_col_values(col_mgr, i)
3668
3669 # this is a cached value, mark it so
3670 result._set_as_cached(label, self)
3671 return result
3672
3673 def _get_column_array(self, i: int) -> ArrayLike:
3674 """
3675 Get the values of the i'th column (ndarray or ExtensionArray, as stored
3676 in the Block)
3677
3678 Warning! The returned array is a view but doesn't handle Copy-on-Write,
3679 so this should be used with caution (for read-only purposes).
3680 """
3681 return self._mgr.iget_values(i)
3682
3683 def _iter_column_arrays(self) -> Iterator[ArrayLike]:
3684 """
3685 Iterate over the arrays of all columns in order.
3686 This returns the values as stored in the Block (ndarray or ExtensionArray).
3687
3688 Warning! The returned array is a view but doesn't handle Copy-on-Write,
3689 so this should be used with caution (for read-only purposes).
3690 """
3691 for i in range(len(self.columns)):
3692 yield self._get_column_array(i)
3693
3694 def _getitem_nocopy(self, key: list):
3695 """
3696 Behaves like __getitem__, but returns a view in cases where __getitem__
3697 would make a copy.
3698 """
3699 # TODO(CoW): can be removed if/when we are always Copy-on-Write
3700 indexer = self.columns._get_indexer_strict(key, "columns")[1]
3701 new_axis = self.columns[indexer]
3702
3703 new_mgr = self._mgr.reindex_indexer(
3704 new_axis,
3705 indexer,
3706 axis=0,
3707 allow_dups=True,
3708 copy=False,
3709 only_slice=True,
3710 )
3711 return self._constructor(new_mgr)
3712
3713 def __getitem__(self, key):
3714 check_dict_or_set_indexers(key)
3715 key = lib.item_from_zerodim(key)
3716 key = com.apply_if_callable(key, self)
3717
3718 if is_hashable(key) and not is_iterator(key):
3719 # is_iterator to exclude generator e.g. test_getitem_listlike
3720 # shortcut if the key is in columns
3721 is_mi = isinstance(self.columns, MultiIndex)
3722 # GH#45316 Return view if key is not duplicated
3723 # Only use drop_duplicates with duplicates for performance
3724 if not is_mi and (
3725 self.columns.is_unique
3726 and key in self.columns
3727 or key in self.columns.drop_duplicates(keep=False)
3728 ):
3729 return self._get_item_cache(key)
3730
3731 elif is_mi and self.columns.is_unique and key in self.columns:
3732 return self._getitem_multilevel(key)
3733 # Do we have a slicer (on rows)?
3734 if isinstance(key, slice):
3735 indexer = self.index._convert_slice_indexer(key, kind="getitem")
3736 if isinstance(indexer, np.ndarray):
3737 # reachable with DatetimeIndex
3738 indexer = lib.maybe_indices_to_slice(
3739 indexer.astype(np.intp, copy=False), len(self)
3740 )
3741 if isinstance(indexer, np.ndarray):
3742 # GH#43223 If we can not convert, use take
3743 return self.take(indexer, axis=0)
3744 return self._slice(indexer, axis=0)
3745
3746 # Do we have a (boolean) DataFrame?
3747 if isinstance(key, DataFrame):
3748 return self.where(key)
3749
3750 # Do we have a (boolean) 1d indexer?
3751 if com.is_bool_indexer(key):
3752 return self._getitem_bool_array(key)
3753
3754 # We are left with two options: a single key, and a collection of keys,
3755 # We interpret tuples as collections only for non-MultiIndex
3756 is_single_key = isinstance(key, tuple) or not is_list_like(key)
3757
3758 if is_single_key:
3759 if self.columns.nlevels > 1:
3760 return self._getitem_multilevel(key)
3761 indexer = self.columns.get_loc(key)
3762 if is_integer(indexer):
3763 indexer = [indexer]
3764 else:
3765 if is_iterator(key):
3766 key = list(key)
3767 indexer = self.columns._get_indexer_strict(key, "columns")[1]
3768
3769 # take() does not accept boolean indexers
3770 if getattr(indexer, "dtype", None) == bool:
3771 indexer = np.where(indexer)[0]
3772
3773 data = self._take_with_is_copy(indexer, axis=1)
3774
3775 if is_single_key:
3776 # What does looking for a single key in a non-unique index return?
3777 # The behavior is inconsistent. It returns a Series, except when
3778 # - the key itself is repeated (test on data.shape, #9519), or
3779 # - we have a MultiIndex on columns (test on self.columns, #21309)
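            # e.g. with columns ["A", "A", "B"], df["A"] returns a two-column
            # DataFrame, while df["B"] returns a Series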
3780 if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex):
3781 # GH#26490 using data[key] can cause RecursionError
3782 return data._get_item_cache(key)
3783
3784 return data
3785
3786 def _getitem_bool_array(self, key):
3787 # also raises Exception if object array with NA values
3788 # warning here just in case -- previously __setitem__ was
3789 # reindexing but __getitem__ was not; it seems more reasonable to
3790 # go with the __setitem__ behavior since that is more consistent
3791 # with all other indexing behavior
3792 if isinstance(key, Series) and not key.index.equals(self.index):
3793 warnings.warn(
3794 "Boolean Series key will be reindexed to match DataFrame index.",
3795 UserWarning,
3796 stacklevel=find_stack_level(),
3797 )
3798 elif len(key) != len(self.index):
3799 raise ValueError(
3800 f"Item wrong length {len(key)} instead of {len(self.index)}."
3801 )
3802
3803 # check_bool_indexer will throw exception if Series key cannot
3804 # be reindexed to match DataFrame rows
3805 key = check_bool_indexer(self.index, key)
3806
3807 if key.all():
3808 return self.copy(deep=None)
3809
3810 indexer = key.nonzero()[0]
3811 return self._take_with_is_copy(indexer, axis=0)
3812
3813 def _getitem_multilevel(self, key):
3814 # self.columns is a MultiIndex
3815 loc = self.columns.get_loc(key)
3816 if isinstance(loc, (slice, np.ndarray)):
3817 new_columns = self.columns[loc]
3818 result_columns = maybe_droplevels(new_columns, key)
3819 result = self.iloc[:, loc]
3820 result.columns = result_columns
3821
3822 # If there is only one column being returned, and its name is
3823 # either an empty string, or a tuple with an empty string as its
3824 # first element, then treat the empty string as a placeholder
3825 # and return the column as if the user had provided that empty
3826 # string in the key. If the result is a Series, exclude the
3827 # implied empty string from its name.
3828 if len(result.columns) == 1:
3829 # e.g. test_frame_getitem_multicolumn_empty_level,
3830 # test_frame_mixed_depth_get, test_loc_setitem_single_column_slice
3831 top = result.columns[0]
3832 if isinstance(top, tuple):
3833 top = top[0]
3834 if top == "":
3835 result = result[""]
3836 if isinstance(result, Series):
3837 result = self._constructor_sliced(
3838 result, index=self.index, name=key
3839 )
3840
3841 result._set_is_copy(self)
3842 return result
3843 else:
3844 # loc is neither a slice nor ndarray, so must be an int
3845 return self._ixs(loc, axis=1)
3846
3847 def _get_value(self, index, col, takeable: bool = False) -> Scalar:
3848 """
3849 Quickly retrieve single value at passed column and index.
3850
3851 Parameters
3852 ----------
3853 index : row label
3854 col : column label
3855 takeable : interpret the index/col as indexers, default False
3856
3857 Returns
3858 -------
3859 scalar
3860
3861 Notes
3862 -----
        Assumes that both `self.index._index_as_unique` and
        `self.columns._index_as_unique` hold; the caller is responsible
        for checking.
3865 """
3866 if takeable:
3867 series = self._ixs(col, axis=1)
3868 return series._values[index]
3869
3870 series = self._get_item_cache(col)
3871 engine = self.index._engine
3872
3873 if not isinstance(self.index, MultiIndex):
3874 # CategoricalIndex: Trying to use the engine fastpath may give incorrect
            # results if our categories are integers that don't match our codes
3876 # IntervalIndex: IntervalTree has no get_loc
3877 row = self.index.get_loc(index)
3878 return series._values[row]
3879
3880 # For MultiIndex going through engine effectively restricts us to
3881 # same-length tuples; see test_get_set_value_no_partial_indexing
3882 loc = engine.get_loc(index)
3883 return series._values[loc]
3884
3885 def isetitem(self, loc, value) -> None:
3886 """
3887 Set the given value in the column with position `loc`.
3888
3889 This is a positional analogue to ``__setitem__``.
3890
3891 Parameters
3892 ----------
3893 loc : int or sequence of ints
3894 Index position for the column.
3895 value : scalar or arraylike
3896 Value(s) for the column.
3897
3898 Notes
3899 -----
        ``frame.isetitem(loc, value)`` is an in-place method: it modifies the
        DataFrame directly rather than returning a new object. However, in
        contrast to ``frame.iloc[:, i] = value``, which tries to update the
        existing values in place, ``frame.isetitem(loc, value)`` does not
        update the values of the column itself in place; it inserts a new
        array instead.
3905
3906 In cases where ``frame.columns`` is unique, this is equivalent to
3907 ``frame[frame.columns[i]] = value``.
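
        Examples
        --------
        Set the values of the second column (position 1) by position:

        >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
        >>> df.isetitem(1, [10, 20])
        >>> df
           A   B
0  1  10
1  2  20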
3908 """
3909 if isinstance(value, DataFrame):
3910 if is_scalar(loc):
3911 loc = [loc]
3912
3913 for i, idx in enumerate(loc):
3914 arraylike = self._sanitize_column(value.iloc[:, i])
3915 self._iset_item_mgr(idx, arraylike, inplace=False)
3916 return
3917
3918 arraylike = self._sanitize_column(value)
3919 self._iset_item_mgr(loc, arraylike, inplace=False)
3920
3921 def __setitem__(self, key, value):
3922 if not PYPY and using_copy_on_write():
3923 if sys.getrefcount(self) <= 3:
3924 warnings.warn(
3925 _chained_assignment_msg, ChainedAssignmentError, stacklevel=2
3926 )
3927
3928 key = com.apply_if_callable(key, self)
3929
3930 # see if we can slice the rows
3931 if isinstance(key, slice):
3932 slc = self.index._convert_slice_indexer(key, kind="getitem")
3933 return self._setitem_slice(slc, value)
3934
3935 if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2:
3936 self._setitem_frame(key, value)
3937 elif isinstance(key, (Series, np.ndarray, list, Index)):
3938 self._setitem_array(key, value)
3939 elif isinstance(value, DataFrame):
3940 self._set_item_frame_value(key, value)
3941 elif (
3942 is_list_like(value)
3943 and not self.columns.is_unique
3944 and 1 < len(self.columns.get_indexer_for([key])) == len(value)
3945 ):
3946 # Column to set is duplicated
3947 self._setitem_array([key], value)
3948 else:
3949 # set column
3950 self._set_item(key, value)
3951
3952 def _setitem_slice(self, key: slice, value) -> None:
3953 # NB: we can't just use self.loc[key] = value because that
        # operates on labels and we need to operate positionally for
        # backwards-compat, xref GH#31469
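        # e.g. with index ["a", "b", "c"], df[0:2] = 0 sets the rows at
        # positions 0 and 1, even though the labels are strings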
3956 self._check_setitem_copy()
3957 self.iloc[key] = value
3958
3959 def _setitem_array(self, key, value):
3960 # also raises Exception if object array with NA values
3961 if com.is_bool_indexer(key):
3962 # bool indexer is indexing along rows
3963 if len(key) != len(self.index):
3964 raise ValueError(
3965 f"Item wrong length {len(key)} instead of {len(self.index)}!"
3966 )
3967 key = check_bool_indexer(self.index, key)
3968 indexer = key.nonzero()[0]
3969 self._check_setitem_copy()
3970 if isinstance(value, DataFrame):
3971 # GH#39931 reindex since iloc does not align
3972 value = value.reindex(self.index.take(indexer))
3973 self.iloc[indexer] = value
3974
3975 else:
3976 # Note: unlike self.iloc[:, indexer] = value, this will
3977 # never try to overwrite values inplace
3978
3979 if isinstance(value, DataFrame):
3980 check_key_length(self.columns, key, value)
3981 for k1, k2 in zip(key, value.columns):
3982 self[k1] = value[k2]
3983
3984 elif not is_list_like(value):
3985 for col in key:
3986 self[col] = value
3987
3988 elif isinstance(value, np.ndarray) and value.ndim == 2:
3989 self._iset_not_inplace(key, value)
3990
3991 elif np.ndim(value) > 1:
3992 # list of lists
3993 value = DataFrame(value).values
3994 return self._setitem_array(key, value)
3995
3996 else:
3997 self._iset_not_inplace(key, value)
3998
3999 def _iset_not_inplace(self, key, value):
4000 # GH#39510 when setting with df[key] = obj with a list-like key and
4001 # list-like value, we iterate over those listlikes and set columns
4002 # one at a time. This is different from dispatching to
4003 # `self.loc[:, key]= value` because loc.__setitem__ may overwrite
4004 # data inplace, whereas this will insert new arrays.
4005
4006 def igetitem(obj, i: int):
4007 # Note: we catch DataFrame obj before getting here, but
4008 # hypothetically would return obj.iloc[:, i]
4009 if isinstance(obj, np.ndarray):
4010 return obj[..., i]
4011 else:
4012 return obj[i]
4013
4014 if self.columns.is_unique:
4015 if np.shape(value)[-1] != len(key):
4016 raise ValueError("Columns must be same length as key")
4017
4018 for i, col in enumerate(key):
4019 self[col] = igetitem(value, i)
4020
4021 else:
4022 ilocs = self.columns.get_indexer_non_unique(key)[0]
4023 if (ilocs < 0).any():
4024 # key entries not in self.columns
4025 raise NotImplementedError
4026
4027 if np.shape(value)[-1] != len(ilocs):
4028 raise ValueError("Columns must be same length as key")
4029
4030 assert np.ndim(value) <= 2
4031
4032 orig_columns = self.columns
4033
4034 # Using self.iloc[:, i] = ... may set values inplace, which
4035 # by convention we do not do in __setitem__
4036 try:
4037 self.columns = Index(range(len(self.columns)))
4038 for i, iloc in enumerate(ilocs):
4039 self[iloc] = igetitem(value, i)
4040 finally:
4041 self.columns = orig_columns
4042
4043 def _setitem_frame(self, key, value):
4044 # support boolean setting with DataFrame input, e.g.
4045 # df[df > df2] = 0
4046 if isinstance(key, np.ndarray):
4047 if key.shape != self.shape:
4048 raise ValueError("Array conditional must be same shape as self")
4049 key = self._constructor(key, **self._construct_axes_dict(), copy=False)
4050
4051 if key.size and not all(is_bool_dtype(dtype) for dtype in key.dtypes):
4052 raise TypeError(
4053 "Must pass DataFrame or 2-d ndarray with boolean values only"
4054 )
4055
4056 self._check_inplace_setting(value)
4057 self._check_setitem_copy()
4058 self._where(-key, value, inplace=True)
4059
4060 def _set_item_frame_value(self, key, value: DataFrame) -> None:
4061 self._ensure_valid_index(value)
4062
4063 # align columns
4064 if key in self.columns:
4065 loc = self.columns.get_loc(key)
4066 cols = self.columns[loc]
4067 len_cols = 1 if is_scalar(cols) or isinstance(cols, tuple) else len(cols)
4068 if len_cols != len(value.columns):
4069 raise ValueError("Columns must be same length as key")
4070
4071 # align right-hand-side columns if self.columns
4072 # is multi-index and self[key] is a sub-frame
4073 if isinstance(self.columns, MultiIndex) and isinstance(
4074 loc, (slice, Series, np.ndarray, Index)
4075 ):
4076 cols_droplevel = maybe_droplevels(cols, key)
4077 if len(cols_droplevel) and not cols_droplevel.equals(value.columns):
4078 value = value.reindex(cols_droplevel, axis=1)
4079
4080 for col, col_droplevel in zip(cols, cols_droplevel):
4081 self[col] = value[col_droplevel]
4082 return
4083
4084 if is_scalar(cols):
4085 self[cols] = value[value.columns[0]]
4086 return
4087
4088 # now align rows
4089 arraylike = _reindex_for_setitem(value, self.index)
4090 self._set_item_mgr(key, arraylike)
4091 return
4092
4093 if len(value.columns) != 1:
4094 raise ValueError(
4095 "Cannot set a DataFrame with multiple columns to the single "
4096 f"column {key}"
4097 )
4098
4099 self[key] = value[value.columns[0]]
4100
4101 def _iset_item_mgr(
4102 self, loc: int | slice | np.ndarray, value, inplace: bool = False
4103 ) -> None:
4104 # when called from _set_item_mgr loc can be anything returned from get_loc
4105 self._mgr.iset(loc, value, inplace=inplace)
4106 self._clear_item_cache()
4107
4108 def _set_item_mgr(self, key, value: ArrayLike) -> None:
4109 try:
4110 loc = self._info_axis.get_loc(key)
4111 except KeyError:
4112 # This item wasn't present, just insert at end
4113 self._mgr.insert(len(self._info_axis), key, value)
4114 else:
4115 self._iset_item_mgr(loc, value)
4116
4117 # check if we are modifying a copy
4118 # try to set first as we want an invalid
4119 # value exception to occur first
4120 if len(self):
4121 self._check_setitem_copy()
4122
4123 def _iset_item(self, loc: int, value) -> None:
4124 arraylike = self._sanitize_column(value)
4125 self._iset_item_mgr(loc, arraylike, inplace=True)
4126
4127 # check if we are modifying a copy
4128 # try to set first as we want an invalid
4129 # value exception to occur first
4130 if len(self):
4131 self._check_setitem_copy()
4132
4133 def _set_item(self, key, value) -> None:
4134 """
        Add a Series to the DataFrame in the specified column.

        If the value is a NumPy array (not a Series), it must be the same
        length as the DataFrame's index or an error will be raised.

        A Series will be conformed to the DataFrame's index to ensure
        homogeneity.
4142 """
4143 value = self._sanitize_column(value)
4144
4145 if (
4146 key in self.columns
4147 and value.ndim == 1
4148 and not is_extension_array_dtype(value)
4149 ):
4150 # broadcast across multiple columns if necessary
4151 if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
4152 existing_piece = self[key]
4153 if isinstance(existing_piece, DataFrame):
4154 value = np.tile(value, (len(existing_piece.columns), 1)).T
4155
4156 self._set_item_mgr(key, value)
4157
4158 def _set_value(
4159 self, index: IndexLabel, col, value: Scalar, takeable: bool = False
4160 ) -> None:
4161 """
4162 Put single value at passed column and index.
4163
4164 Parameters
4165 ----------
4166 index : Label
4167 row label
4168 col : Label
4169 column label
4170 value : scalar
        takeable : bool, default False
            Whether to interpret `index`/`col` as positional indexers.
4173 """
4174 try:
4175 if takeable:
4176 icol = col
4177 iindex = cast(int, index)
4178 else:
4179 icol = self.columns.get_loc(col)
4180 iindex = self.index.get_loc(index)
4181 self._mgr.column_setitem(icol, iindex, value, inplace_only=True)
4182 self._clear_item_cache()
4183
4184 except (KeyError, TypeError, ValueError, LossySetitemError):
4185 # get_loc might raise a KeyError for missing labels (falling back
4186 # to (i)loc will do expansion of the index)
4187 # column_setitem will do validation that may raise TypeError,
4188 # ValueError, or LossySetitemError
4189 # set using a non-recursive method & reset the cache
4190 if takeable:
4191 self.iloc[index, col] = value
4192 else:
4193 self.loc[index, col] = value
4194 self._item_cache.pop(col, None)
4195
4196 except InvalidIndexError as ii_err:
4197 # GH48729: Seems like you are trying to assign a value to a
4198 # row when only scalar options are permitted
4199 raise InvalidIndexError(
4200 f"You can only assign a scalar value not a {type(value)}"
4201 ) from ii_err
4202
4203 def _ensure_valid_index(self, value) -> None:
4204 """
        Ensure that, if we don't have an index, we can create one from the
        passed value.
4207 """
4208 # GH5632, make sure that we are a Series convertible
4209 if not len(self.index) and is_list_like(value) and len(value):
4210 if not isinstance(value, DataFrame):
4211 try:
4212 value = Series(value)
4213 except (ValueError, NotImplementedError, TypeError) as err:
4214 raise ValueError(
4215 "Cannot set a frame with no defined index "
4216 "and a value that cannot be converted to a Series"
4217 ) from err
4218
4219 # GH31368 preserve name of index
4220 index_copy = value.index.copy()
4221 if self.index.name is not None:
4222 index_copy.name = self.index.name
4223
4224 self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan)
4225
4226 def _box_col_values(self, values: SingleDataManager, loc: int) -> Series:
4227 """
4228 Provide boxed values for a column.
4229 """
4230 # Lookup in columns so that if e.g. a str datetime was passed
4231 # we attach the Timestamp object as the name.
4232 name = self.columns[loc]
4233 klass = self._constructor_sliced
4234 # We get index=self.index bc values is a SingleDataManager
4235 return klass(values, name=name, fastpath=True).__finalize__(self)
4236
4237 # ----------------------------------------------------------------------
4238 # Lookup Caching
4239
4240 def _clear_item_cache(self) -> None:
4241 self._item_cache.clear()
4242
4243 def _get_item_cache(self, item: Hashable) -> Series:
4244 """Return the cached item, item represents a label indexer."""
4245 if using_copy_on_write():
4246 loc = self.columns.get_loc(item)
4247 return self._ixs(loc, axis=1)
4248
4249 cache = self._item_cache
4250 res = cache.get(item)
4251 if res is None:
4252 # All places that call _get_item_cache have unique columns,
4253 # pending resolution of GH#33047
4254
4255 loc = self.columns.get_loc(item)
4256 res = self._ixs(loc, axis=1)
4257
4258 cache[item] = res
4259
4260 # for a chain
4261 res._is_copy = self._is_copy
4262 return res
4263
4264 def _reset_cacher(self) -> None:
4265 # no-op for DataFrame
4266 pass
4267
4268 def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None:
4269 """
4270 The object has called back to us saying maybe it has changed.
4271 """
4272 loc = self._info_axis.get_loc(item)
4273 arraylike = value._values
4274
4275 old = self._ixs(loc, axis=1)
4276 if old._values is value._values and inplace:
4277 # GH#46149 avoid making unnecessary copies/block-splitting
4278 return
4279
4280 self._mgr.iset(loc, arraylike, inplace=inplace)
4281
4282 # ----------------------------------------------------------------------
4283 # Unsorted
4284
4285 @overload
4286 def query(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> DataFrame:
4287 ...
4288
4289 @overload
4290 def query(self, expr: str, *, inplace: Literal[True], **kwargs) -> None:
4291 ...
4292
4293 @overload
4294 def query(self, expr: str, *, inplace: bool = ..., **kwargs) -> DataFrame | None:
4295 ...
4296
4297 def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | None:
4298 """
4299 Query the columns of a DataFrame with a boolean expression.
4300
4301 Parameters
4302 ----------
4303 expr : str
4304 The query string to evaluate.
4305
4306 You can refer to variables
4307 in the environment by prefixing them with an '@' character like
4308 ``@a + b``.
4309
4310 You can refer to column names that are not valid Python variable names
4311 by surrounding them in backticks. Thus, column names containing spaces
            or punctuation (besides underscores) or starting with digits must be
4313 surrounded by backticks. (For example, a column named "Area (cm^2)" would
4314 be referenced as ```Area (cm^2)```). Column names which are Python keywords
4315 (like "list", "for", "import", etc) cannot be used.
4316
4317 For example, if one of your columns is called ``a a`` and you want
4318 to sum it with ``b``, your query should be ```a a` + b``.
4319
        inplace : bool, default False
4321 Whether to modify the DataFrame rather than creating a new one.
4322 **kwargs
4323 See the documentation for :func:`eval` for complete details
4324 on the keyword arguments accepted by :meth:`DataFrame.query`.
4325
4326 Returns
4327 -------
4328 DataFrame or None
4329 DataFrame resulting from the provided query expression or
4330 None if ``inplace=True``.
4331
4332 See Also
4333 --------
4334 eval : Evaluate a string describing operations on
4335 DataFrame columns.
4336 DataFrame.eval : Evaluate a string describing operations on
4337 DataFrame columns.
4338
4339 Notes
4340 -----
4341 The result of the evaluation of this expression is first passed to
4342 :attr:`DataFrame.loc` and if that fails because of a
4343 multidimensional key (e.g., a DataFrame) then the result will be passed
4344 to :meth:`DataFrame.__getitem__`.
4345
4346 This method uses the top-level :func:`eval` function to
4347 evaluate the passed query.
4348
4349 The :meth:`~pandas.DataFrame.query` method uses a slightly
4350 modified Python syntax by default. For example, the ``&`` and ``|``
4351 (bitwise) operators have the precedence of their boolean cousins,
        :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python;
        however, the semantics are different.
4354
4355 You can change the semantics of the expression by passing the keyword
4356 argument ``parser='python'``. This enforces the same semantics as
4357 evaluation in Python space. Likewise, you can pass ``engine='python'``
4358 to evaluate an expression using Python itself as a backend. This is not
4359 recommended as it is inefficient compared to using ``numexpr`` as the
4360 engine.
4361
4362 The :attr:`DataFrame.index` and
4363 :attr:`DataFrame.columns` attributes of the
4364 :class:`~pandas.DataFrame` instance are placed in the query namespace
4365 by default, which allows you to treat both the index and columns of the
4366 frame as a column in the frame.
4367 The identifier ``index`` is used for the frame index; you can also
4368 use the name of the index to identify it in a query. Please note that
4369 Python keywords may not be used as identifiers.
4370
4371 For further details and examples see the ``query`` documentation in
4372 :ref:`indexing <indexing.query>`.
4373
4374 *Backtick quoted variables*
4375
4376 Backtick quoted variables are parsed as literal Python code and
        are converted internally to a valid Python identifier.
4378 This can lead to the following problems.
4379
4380 During parsing a number of disallowed characters inside the backtick
4381 quoted string are replaced by strings that are allowed as a Python identifier.
4382 These characters include all operators in Python, the space character, the
4383 question mark, the exclamation mark, the dollar sign, and the euro sign.
4384 For other characters that fall outside the ASCII range (U+0001..U+007F)
4385 and those that are not further specified in PEP 3131,
4386 the query parser will raise an error.
        This excludes whitespace other than the space character, as well as
        the hash character (as it is used for comments) and the backtick
        itself (the backtick also cannot be escaped).
4390
4391 In a special case, quotes that make a pair around a backtick can
4392 confuse the parser.
4393 For example, ```it's` > `that's``` will raise an error,
4394 as it forms a quoted string (``'s > `that'``) with a backtick inside.
4395
4396 See also the Python documentation about lexical analysis
4397 (https://docs.python.org/3/reference/lexical_analysis.html)
4398 in combination with the source code in :mod:`pandas.core.computation.parsing`.
4399
4400 Examples
4401 --------
4402 >>> df = pd.DataFrame({'A': range(1, 6),
4403 ... 'B': range(10, 0, -2),
4404 ... 'C C': range(10, 5, -1)})
4405 >>> df
4406 A B C C
4407 0 1 10 10
4408 1 2 8 9
4409 2 3 6 8
4410 3 4 4 7
4411 4 5 2 6
4412 >>> df.query('A > B')
4413 A B C C
4414 4 5 2 6
4415
4416 The previous expression is equivalent to
4417
4418 >>> df[df.A > df.B]
4419 A B C C
4420 4 5 2 6
4421
4422 For columns with spaces in their name, you can use backtick quoting.
4423
4424 >>> df.query('B == `C C`')
4425 A B C C
4426 0 1 10 10
4427
4428 The previous expression is equivalent to
4429
4430 >>> df[df.B == df['C C']]
4431 A B C C
4432 0 1 10 10
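
        Variables in the local scope can be referenced by prefixing them
        with ``@``:

        >>> limit = 3
        >>> df.query('A > @limit')
           A  B  C C
        3  4  4    7
        4  5  2    6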
4433 """
4434 inplace = validate_bool_kwarg(inplace, "inplace")
4435 if not isinstance(expr, str):
4436 msg = f"expr must be a string to be evaluated, {type(expr)} given"
4437 raise ValueError(msg)
4438 kwargs["level"] = kwargs.pop("level", 0) + 1
4439 kwargs["target"] = None
4440 res = self.eval(expr, **kwargs)
4441
4442 try:
4443 result = self.loc[res]
4444 except ValueError:
4445 # when res is multi-dimensional loc raises, but this is sometimes a
4446 # valid query
4447 result = self[res]
4448
4449 if inplace:
4450 self._update_inplace(result)
4451 return None
4452 else:
4453 return result
4454
4455 @overload
4456 def eval(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> Any:
4457 ...
4458
4459 @overload
4460 def eval(self, expr: str, *, inplace: Literal[True], **kwargs) -> None:
4461 ...
4462
4463 def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
4464 """
4465 Evaluate a string describing operations on DataFrame columns.
4466
4467 Operates on columns only, not specific rows or elements. This allows
4468 `eval` to run arbitrary code, which can make you vulnerable to code
4469 injection if you pass user input to this function.
4470
4471 Parameters
4472 ----------
4473 expr : str
4474 The expression string to evaluate.
4475 inplace : bool, default False
4476 If the expression contains an assignment, whether to perform the
4477 operation inplace and mutate the existing DataFrame. Otherwise,
4478 a new DataFrame is returned.
4479 **kwargs
4480 See the documentation for :func:`eval` for complete details
4481 on the keyword arguments accepted by
4482 :meth:`~pandas.DataFrame.query`.
4483
4484 Returns
4485 -------
4486 ndarray, scalar, pandas object, or None
4487 The result of the evaluation or None if ``inplace=True``.
4488
4489 See Also
4490 --------
4491 DataFrame.query : Evaluates a boolean expression to query the columns
4492 of a frame.
4493 DataFrame.assign : Can evaluate an expression or function to create new
4494 values for a column.
4495 eval : Evaluate a Python expression as a string using various
4496 backends.
4497
4498 Notes
4499 -----
4500 For more details see the API documentation for :func:`~eval`.
4501 For detailed examples see :ref:`enhancing performance with eval
4502 <enhancingperf.eval>`.
4503
4504 Examples
4505 --------
4506 >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
4507 >>> df
4508 A B
4509 0 1 10
4510 1 2 8
4511 2 3 6
4512 3 4 4
4513 4 5 2
4514 >>> df.eval('A + B')
4515 0 11
4516 1 10
4517 2 9
4518 3 8
4519 4 7
4520 dtype: int64
4521
4522 Assignment is allowed though by default the original DataFrame is not
4523 modified.
4524
4525 >>> df.eval('C = A + B')
4526 A B C
4527 0 1 10 11
4528 1 2 8 10
4529 2 3 6 9
4530 3 4 4 8
4531 4 5 2 7
4532 >>> df
4533 A B
4534 0 1 10
4535 1 2 8
4536 2 3 6
4537 3 4 4
4538 4 5 2
4539
4540 Multiple columns can be assigned to using multi-line expressions:
4541
4542 >>> df.eval(
4543 ... '''
4544 ... C = A + B
4545 ... D = A - B
4546 ... '''
4547 ... )
4548 A B C D
4549 0 1 10 11 -9
4550 1 2 8 10 -6
4551 2 3 6 9 -3
4552 3 4 4 8 0
4553 4 5 2 7 3
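
        Assignment can also be done in place with ``inplace=True``:

        >>> df.eval('C = A + B', inplace=True)
        >>> df
           A   B   C
        0  1  10  11
        1  2   8  10
        2  3   6   9
        3  4   4   8
        4  5   2   7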
4554 """
4555 from pandas.core.computation.eval import eval as _eval
4556
4557 inplace = validate_bool_kwarg(inplace, "inplace")
4558 kwargs["level"] = kwargs.pop("level", 0) + 1
4559 index_resolvers = self._get_index_resolvers()
4560 column_resolvers = self._get_cleaned_column_resolvers()
4561 resolvers = column_resolvers, index_resolvers
4562 if "target" not in kwargs:
4563 kwargs["target"] = self
4564 kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers
4565
4566 return _eval(expr, inplace=inplace, **kwargs)
4567
4568 def select_dtypes(self, include=None, exclude=None) -> DataFrame:
4569 """
4570 Return a subset of the DataFrame's columns based on the column dtypes.
4571
4572 Parameters
4573 ----------
4574 include, exclude : scalar or list-like
4575 A selection of dtypes or strings to be included/excluded. At least
4576 one of these parameters must be supplied.
4577
4578 Returns
4579 -------
4580 DataFrame
4581 The subset of the frame including the dtypes in ``include`` and
4582 excluding the dtypes in ``exclude``.
4583
4584 Raises
4585 ------
4586 ValueError
4587 * If both of ``include`` and ``exclude`` are empty
4588 * If ``include`` and ``exclude`` have overlapping elements
4589 * If any kind of string dtype is passed in.
4590
4591 See Also
4592 --------
4593 DataFrame.dtypes: Return Series with the data type of each column.
4594
4595 Notes
4596 -----
4597 * To select all *numeric* types, use ``np.number`` or ``'number'``
4598 * To select strings you must use the ``object`` dtype, but note that
4599 this will return *all* object dtype columns
4600 * See the `numpy dtype hierarchy
4601 <https://numpy.org/doc/stable/reference/arrays.scalars.html>`__
4602 * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
4603 ``'datetime64'``
4604 * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
4605 ``'timedelta64'``
        * To select pandas categorical dtypes, use ``'category'``
        * To select pandas datetimetz dtypes, use ``'datetimetz'``
          or ``'datetime64[ns, tz]'``
4609
4610 Examples
4611 --------
4612 >>> df = pd.DataFrame({'a': [1, 2] * 3,
4613 ... 'b': [True, False] * 3,
4614 ... 'c': [1.0, 2.0] * 3})
4615 >>> df
4616 a b c
4617 0 1 True 1.0
4618 1 2 False 2.0
4619 2 1 True 1.0
4620 3 2 False 2.0
4621 4 1 True 1.0
4622 5 2 False 2.0
4623
4624 >>> df.select_dtypes(include='bool')
4625 b
4626 0 True
4627 1 False
4628 2 True
4629 3 False
4630 4 True
4631 5 False
4632
4633 >>> df.select_dtypes(include=['float64'])
4634 c
4635 0 1.0
4636 1 2.0
4637 2 1.0
4638 3 2.0
4639 4 1.0
4640 5 2.0
4641
4642 >>> df.select_dtypes(exclude=['int64'])
4643 b c
4644 0 True 1.0
4645 1 False 2.0
4646 2 True 1.0
4647 3 False 2.0
4648 4 True 1.0
4649 5 False 2.0
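
        To select all numeric columns at once, pass ``np.number``:

        >>> df.select_dtypes(include=np.number)
           a    c
        0  1  1.0
        1  2  2.0
        2  1  1.0
        3  2  2.0
        4  1  1.0
        5  2  2.0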
4650 """
4651 if not is_list_like(include):
4652 include = (include,) if include is not None else ()
4653 if not is_list_like(exclude):
4654 exclude = (exclude,) if exclude is not None else ()
4655
4656 selection = (frozenset(include), frozenset(exclude))
4657
4658 if not any(selection):
4659 raise ValueError("at least one of include or exclude must be nonempty")
4660
4661 # convert the myriad valid dtypes object to a single representation
4662 def check_int_infer_dtype(dtypes):
4663 converted_dtypes: list[type] = []
4664 for dtype in dtypes:
                # NumPy maps int to different types (int32, int64) on Windows and Linux
4666 # see https://github.com/numpy/numpy/issues/9464
4667 if (isinstance(dtype, str) and dtype == "int") or (dtype is int):
4668 converted_dtypes.append(np.int32)
4669 converted_dtypes.append(np.int64)
4670 elif dtype == "float" or dtype is float:
4671 # GH#42452 : np.dtype("float") coerces to np.float64 from Numpy 1.20
4672 converted_dtypes.extend([np.float64, np.float32])
4673 else:
4674 converted_dtypes.append(infer_dtype_from_object(dtype))
4675 return frozenset(converted_dtypes)
4676
4677 include = check_int_infer_dtype(include)
4678 exclude = check_int_infer_dtype(exclude)
4679
4680 for dtypes in (include, exclude):
4681 invalidate_string_dtypes(dtypes)
4682
4683 # can't both include AND exclude!
4684 if not include.isdisjoint(exclude):
4685 raise ValueError(f"include and exclude overlap on {(include & exclude)}")
4686
4687 def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool:
4688 # GH 46870: BooleanDtype._is_numeric == True but should be excluded
4689 dtype = dtype if not isinstance(dtype, ArrowDtype) else dtype.numpy_dtype
4690 return issubclass(dtype.type, tuple(dtypes_set)) or (
4691 np.number in dtypes_set
4692 and getattr(dtype, "_is_numeric", False)
4693 and not is_bool_dtype(dtype)
4694 )
4695
4696 def predicate(arr: ArrayLike) -> bool:
4697 dtype = arr.dtype
4698 if include:
4699 if not dtype_predicate(dtype, include):
4700 return False
4701
4702 if exclude:
4703 if dtype_predicate(dtype, exclude):
4704 return False
4705
4706 return True
4707
4708 mgr = self._mgr._get_data_subset(predicate).copy(deep=None)
4709 return type(self)(mgr).__finalize__(self)
4710
4711 def insert(
4712 self,
4713 loc: int,
4714 column: Hashable,
4715 value: Scalar | AnyArrayLike,
4716 allow_duplicates: bool | lib.NoDefault = lib.no_default,
4717 ) -> None:
4718 """
4719 Insert column into DataFrame at specified location.
4720
4721 Raises a ValueError if `column` is already contained in the DataFrame,
4722 unless `allow_duplicates` is set to True.
4723
4724 Parameters
4725 ----------
4726 loc : int
            Insertion index. Must satisfy 0 <= loc <= len(columns).
4728 column : str, number, or hashable object
4729 Label of the inserted column.
4730 value : Scalar, Series, or array-like
4731 allow_duplicates : bool, optional, default lib.no_default
4732
4733 See Also
4734 --------
4735 Index.insert : Insert new item by index.
4736
4737 Examples
4738 --------
4739 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
4740 >>> df
4741 col1 col2
4742 0 1 3
4743 1 2 4
4744 >>> df.insert(1, "newcol", [99, 99])
4745 >>> df
4746 col1 newcol col2
4747 0 1 99 3
4748 1 2 99 4
4749 >>> df.insert(0, "col1", [100, 100], allow_duplicates=True)
4750 >>> df
4751 col1 col1 newcol col2
4752 0 100 1 99 3
4753 1 100 2 99 4
4754
        Notice that pandas uses index alignment when `value` is a `Series`:
4756
4757 >>> df.insert(0, "col0", pd.Series([5, 6], index=[1, 2]))
4758 >>> df
4759 col0 col1 col1 newcol col2
4760 0 NaN 100 1 99 3
4761 1 5.0 100 2 99 4
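
        A scalar `value` is broadcast to fill the entire column:

        >>> df.insert(len(df.columns), "flag", True)
        >>> df
           col0  col1  col1  newcol  col2  flag
        0   NaN   100     1      99     3  True
        1   5.0   100     2      99     4  True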
4762 """
4763 if allow_duplicates is lib.no_default:
4764 allow_duplicates = False
4765 if allow_duplicates and not self.flags.allows_duplicate_labels:
4766 raise ValueError(
4767 "Cannot specify 'allow_duplicates=True' when "
4768 "'self.flags.allows_duplicate_labels' is False."
4769 )
4770 if not allow_duplicates and column in self.columns:
4771 # Should this be a different kind of error??
4772 raise ValueError(f"cannot insert {column}, already exists")
4773 if not isinstance(loc, int):
4774 raise TypeError("loc must be int")
4775
4776 value = self._sanitize_column(value)
4777 self._mgr.insert(loc, column, value)
4778
4779 def assign(self, **kwargs) -> DataFrame:
4780 r"""
4781 Assign new columns to a DataFrame.
4782
4783 Returns a new object with all original columns in addition to new ones.
4784 Existing columns that are re-assigned will be overwritten.
4785
4786 Parameters
4787 ----------
4788 **kwargs : dict of {str: callable or Series}
4789 The column names are keywords. If the values are
4790 callable, they are computed on the DataFrame and
            assigned to the new columns. The callable must not change the
            input DataFrame (though pandas doesn't check this).
            If the values are not callable (e.g. a Series, scalar, or array),
            they are simply assigned.
4795
4796 Returns
4797 -------
4798 DataFrame
4799 A new DataFrame with the new columns in addition to
4800 all the existing columns.
4801
4802 Notes
4803 -----
4804 Assigning multiple columns within the same ``assign`` is possible.
4805 Later items in '\*\*kwargs' may refer to newly created or modified
4806 columns in 'df'; items are computed and assigned into 'df' in order.
4807
4808 Examples
4809 --------
4810 >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]},
4811 ... index=['Portland', 'Berkeley'])
4812 >>> df
4813 temp_c
4814 Portland 17.0
4815 Berkeley 25.0
4816
4817 Where the value is a callable, evaluated on `df`:
4818
4819 >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
4820 temp_c temp_f
4821 Portland 17.0 62.6
4822 Berkeley 25.0 77.0
4823
4824 Alternatively, the same behavior can be achieved by directly
4825 referencing an existing Series or sequence:
4826
4827 >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
4828 temp_c temp_f
4829 Portland 17.0 62.6
4830 Berkeley 25.0 77.0
4831
4832 You can create multiple columns within the same assign where one
4833 of the columns depends on another one defined within the same assign:
4834
4835 >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
4836 ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
4837 temp_c temp_f temp_k
4838 Portland 17.0 62.6 290.15
4839 Berkeley 25.0 77.0 298.15
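
        Re-assigning an existing column overwrites its values:

        >>> df.assign(temp_c=df['temp_c'] + 1.0)
                  temp_c
        Portland    18.0
        Berkeley    26.0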
4840 """
4841 data = self.copy(deep=None)
4842
4843 for k, v in kwargs.items():
4844 data[k] = com.apply_if_callable(v, data)
4845 return data
4846
4847 def _sanitize_column(self, value) -> ArrayLike:
4848 """
4849 Ensures new columns (which go into the BlockManager as new blocks) are
4850 always copied and converted into an array.
4851
4852 Parameters
4853 ----------
4854 value : scalar, Series, or array-like
4855
4856 Returns
4857 -------
4858 numpy.ndarray or ExtensionArray
4859 """
4860 self._ensure_valid_index(value)
4861
4862 # We can get there through isetitem with a DataFrame
4863 # or through loc single_block_path
4864 if isinstance(value, DataFrame):
4865 return _reindex_for_setitem(value, self.index)
4866 elif is_dict_like(value):
4867 return _reindex_for_setitem(Series(value), self.index)
4868
4869 if is_list_like(value):
4870 com.require_length_match(value, self.index)
4871 return sanitize_array(value, self.index, copy=True, allow_2d=True)
4872
4873 @property
4874 def _series(self):
4875 return {
4876 item: Series(
4877 self._mgr.iget(idx), index=self.index, name=item, fastpath=True
4878 )
4879 for idx, item in enumerate(self.columns)
4880 }
4881
4882 # ----------------------------------------------------------------------
4883 # Reindexing and alignment
4884
4885 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy):
4886 frame = self
4887
4888 columns = axes["columns"]
4889 if columns is not None:
4890 frame = frame._reindex_columns(
4891 columns, method, copy, level, fill_value, limit, tolerance
4892 )
4893
4894 index = axes["index"]
4895 if index is not None:
4896 frame = frame._reindex_index(
4897 index, method, copy, level, fill_value, limit, tolerance
4898 )
4899
4900 return frame
4901
4902 def _reindex_index(
4903 self,
4904 new_index,
4905 method,
4906 copy: bool,
4907 level: Level,
4908 fill_value=np.nan,
4909 limit=None,
4910 tolerance=None,
4911 ):
4912 new_index, indexer = self.index.reindex(
4913 new_index, method=method, level=level, limit=limit, tolerance=tolerance
4914 )
4915 return self._reindex_with_indexers(
4916 {0: [new_index, indexer]},
4917 copy=copy,
4918 fill_value=fill_value,
4919 allow_dups=False,
4920 )
4921
4922 def _reindex_columns(
4923 self,
4924 new_columns,
4925 method,
4926 copy: bool,
4927 level: Level,
4928 fill_value=None,
4929 limit=None,
4930 tolerance=None,
4931 ):
4932 new_columns, indexer = self.columns.reindex(
4933 new_columns, method=method, level=level, limit=limit, tolerance=tolerance
4934 )
4935 return self._reindex_with_indexers(
4936 {1: [new_columns, indexer]},
4937 copy=copy,
4938 fill_value=fill_value,
4939 allow_dups=False,
4940 )
4941
4942 def _reindex_multi(
4943 self, axes: dict[str, Index], copy: bool, fill_value
4944 ) -> DataFrame:
4945 """
4946 We are guaranteed non-Nones in the axes.
4947 """
4948
4949 new_index, row_indexer = self.index.reindex(axes["index"])
4950 new_columns, col_indexer = self.columns.reindex(axes["columns"])
4951
4952 if row_indexer is not None and col_indexer is not None:
4953 # Fastpath. By doing two 'take's at once we avoid making an
4954 # unnecessary copy.
4955 # We only get here with `not self._is_mixed_type`, which (almost)
4956 # ensures that self.values is cheap. It may be worth making this
4957 # condition more specific.
4958 indexer = row_indexer, col_indexer
4959 new_values = take_2d_multi(self.values, indexer, fill_value=fill_value)
4960 return self._constructor(
4961 new_values, index=new_index, columns=new_columns, copy=False
4962 )
4963 else:
4964 return self._reindex_with_indexers(
4965 {0: [new_index, row_indexer], 1: [new_columns, col_indexer]},
4966 copy=copy,
4967 fill_value=fill_value,
4968 )
4969
4970 @doc(NDFrame.align, **_shared_doc_kwargs)
4971 def align(
4972 self,
4973 other: DataFrame,
4974 join: AlignJoin = "outer",
4975 axis: Axis | None = None,
4976 level: Level = None,
4977 copy: bool | None = None,
4978 fill_value=None,
4979 method: FillnaOptions | None = None,
4980 limit: int | None = None,
4981 fill_axis: Axis = 0,
4982 broadcast_axis: Axis | None = None,
4983 ) -> DataFrame:
4984 return super().align(
4985 other,
4986 join=join,
4987 axis=axis,
4988 level=level,
4989 copy=copy,
4990 fill_value=fill_value,
4991 method=method,
4992 limit=limit,
4993 fill_axis=fill_axis,
4994 broadcast_axis=broadcast_axis,
4995 )
4996
4997 @Appender(
4998 """
4999 Examples
5000 --------
5001 >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
5002
5003 Change the row labels.
5004
5005 >>> df.set_axis(['a', 'b', 'c'], axis='index')
5006 A B
5007 a 1 4
5008 b 2 5
5009 c 3 6
5010
5011 Change the column labels.
5012
5013 >>> df.set_axis(['I', 'II'], axis='columns')
5014 I II
5015 0 1 4
5016 1 2 5
5017 2 3 6
5018 """
5019 )
5020 @Substitution(
5021 **_shared_doc_kwargs,
5022 extended_summary_sub=" column or",
5023 axis_description_sub=", and 1 identifies the columns",
5024 see_also_sub=" or columns",
5025 )
5026 @Appender(NDFrame.set_axis.__doc__)
5027 def set_axis(
5028 self,
5029 labels,
5030 *,
5031 axis: Axis = 0,
5032 copy: bool | None = None,
5033 ) -> DataFrame:
5034 return super().set_axis(labels, axis=axis, copy=copy)
5035
5036 @doc(
5037 NDFrame.reindex,
5038 klass=_shared_doc_kwargs["klass"],
5039 optional_reindex=_shared_doc_kwargs["optional_reindex"],
5040 )
5041 def reindex( # type: ignore[override]
5042 self,
5043 labels=None,
5044 *,
5045 index=None,
5046 columns=None,
5047 axis: Axis | None = None,
5048 method: str | None = None,
5049 copy: bool | None = None,
5050 level: Level | None = None,
5051 fill_value: Scalar | None = np.nan,
5052 limit: int | None = None,
5053 tolerance=None,
5054 ) -> DataFrame:
5055 return super().reindex(
5056 labels=labels,
5057 index=index,
5058 columns=columns,
5059 axis=axis,
5060 method=method,
5061 copy=copy,
5062 level=level,
5063 fill_value=fill_value,
5064 limit=limit,
5065 tolerance=tolerance,
5066 )
5067
5068 @overload
5069 def drop(
5070 self,
5071 labels: IndexLabel = ...,
5072 *,
5073 axis: Axis = ...,
5074 index: IndexLabel = ...,
5075 columns: IndexLabel = ...,
5076 level: Level = ...,
5077 inplace: Literal[True],
5078 errors: IgnoreRaise = ...,
5079 ) -> None:
5080 ...
5081
5082 @overload
5083 def drop(
5084 self,
5085 labels: IndexLabel = ...,
5086 *,
5087 axis: Axis = ...,
5088 index: IndexLabel = ...,
5089 columns: IndexLabel = ...,
5090 level: Level = ...,
5091 inplace: Literal[False] = ...,
5092 errors: IgnoreRaise = ...,
5093 ) -> DataFrame:
5094 ...
5095
5096 @overload
5097 def drop(
5098 self,
5099 labels: IndexLabel = ...,
5100 *,
5101 axis: Axis = ...,
5102 index: IndexLabel = ...,
5103 columns: IndexLabel = ...,
5104 level: Level = ...,
5105 inplace: bool = ...,
5106 errors: IgnoreRaise = ...,
5107 ) -> DataFrame | None:
5108 ...
5109
5110 def drop(
5111 self,
5112 labels: IndexLabel = None,
5113 *,
5114 axis: Axis = 0,
5115 index: IndexLabel = None,
5116 columns: IndexLabel = None,
5117 level: Level = None,
5118 inplace: bool = False,
5119 errors: IgnoreRaise = "raise",
5120 ) -> DataFrame | None:
5121 """
5122 Drop specified labels from rows or columns.
5123
5124 Remove rows or columns by specifying label names and corresponding
5125 axis, or by specifying directly index or column names. When using a
5126 multi-index, labels on different levels can be removed by specifying
5127 the level. See the :ref:`user guide <advanced.shown_levels>`
5128 for more information about the now unused levels.
5129
5130 Parameters
5131 ----------
5132 labels : single label or list-like
5133 Index or column labels to drop. A tuple will be used as a single
5134 label and not treated as a list-like.
5135 axis : {0 or 'index', 1 or 'columns'}, default 0
5136 Whether to drop labels from the index (0 or 'index') or
5137 columns (1 or 'columns').
5138 index : single label or list-like
5139 Alternative to specifying axis (``labels, axis=0``
5140 is equivalent to ``index=labels``).
5141 columns : single label or list-like
5142 Alternative to specifying axis (``labels, axis=1``
5143 is equivalent to ``columns=labels``).
5144 level : int or level name, optional
5145 For MultiIndex, level from which the labels will be removed.
5146 inplace : bool, default False
5147 If False, return a copy. Otherwise, do operation
5148 inplace and return None.
5149 errors : {'ignore', 'raise'}, default 'raise'
5150 If 'ignore', suppress error and only existing labels are
5151 dropped.
5152
5153 Returns
5154 -------
5155 DataFrame or None
5156 DataFrame without the removed index or column labels or
5157 None if ``inplace=True``.
5158
5159 Raises
5160 ------
5161 KeyError
5162 If any of the labels is not found in the selected axis.
5163
5164 See Also
5165 --------
5166 DataFrame.loc : Label-location based indexer for selection by label.
5167 DataFrame.dropna : Return DataFrame with labels on given axis omitted
5168 where (all or any) data are missing.
5169 DataFrame.drop_duplicates : Return DataFrame with duplicate rows
5170 removed, optionally only considering certain columns.
5171 Series.drop : Return Series with specified index labels removed.
5172
5173 Examples
5174 --------
5175 >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
5176 ... columns=['A', 'B', 'C', 'D'])
5177 >>> df
5178 A B C D
5179 0 0 1 2 3
5180 1 4 5 6 7
5181 2 8 9 10 11
5182
5183 Drop columns
5184
5185 >>> df.drop(['B', 'C'], axis=1)
5186 A D
5187 0 0 3
5188 1 4 7
5189 2 8 11
5190
5191 >>> df.drop(columns=['B', 'C'])
5192 A D
5193 0 0 3
5194 1 4 7
5195 2 8 11
5196
5197 Drop a row by index
5198
5199 >>> df.drop([0, 1])
5200 A B C D
5201 2 8 9 10 11
5202
5203 Drop columns and/or rows of MultiIndex DataFrame
5204
5205 >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
5206 ... ['speed', 'weight', 'length']],
5207 ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
5208 ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
5209 >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
5210 ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
5211 ... [250, 150], [1.5, 0.8], [320, 250],
5212 ... [1, 0.8], [0.3, 0.2]])
5213 >>> df
5214 big small
5215 lama speed 45.0 30.0
5216 weight 200.0 100.0
5217 length 1.5 1.0
5218 cow speed 30.0 20.0
5219 weight 250.0 150.0
5220 length 1.5 0.8
5221 falcon speed 320.0 250.0
5222 weight 1.0 0.8
5223 length 0.3 0.2
5224
5225 Drop a specific index combination from the MultiIndex
5226 DataFrame, i.e., drop the combination ``'falcon'`` and
5227 ``'weight'``, which deletes only the corresponding row
5228
5229 >>> df.drop(index=('falcon', 'weight'))
5230 big small
5231 lama speed 45.0 30.0
5232 weight 200.0 100.0
5233 length 1.5 1.0
5234 cow speed 30.0 20.0
5235 weight 250.0 150.0
5236 length 1.5 0.8
5237 falcon speed 320.0 250.0
5238 length 0.3 0.2
5239
5240 >>> df.drop(index='cow', columns='small')
5241 big
5242 lama speed 45.0
5243 weight 200.0
5244 length 1.5
5245 falcon speed 320.0
5246 weight 1.0
5247 length 0.3
5248
5249 >>> df.drop(index='length', level=1)
5250 big small
5251 lama speed 45.0 30.0
5252 weight 200.0 100.0
5253 cow speed 30.0 20.0
5254 weight 250.0 150.0
5255 falcon speed 320.0 250.0
5256 weight 1.0 0.8
5257 """
5258 return super().drop(
5259 labels=labels,
5260 axis=axis,
5261 index=index,
5262 columns=columns,
5263 level=level,
5264 inplace=inplace,
5265 errors=errors,
5266 )
5267
5268 @overload
5269 def rename(
5270 self,
5271 mapper: Renamer | None = ...,
5272 *,
5273 index: Renamer | None = ...,
5274 columns: Renamer | None = ...,
5275 axis: Axis | None = ...,
5276 copy: bool | None = ...,
5277 inplace: Literal[True],
5278 level: Level = ...,
5279 errors: IgnoreRaise = ...,
5280 ) -> None:
5281 ...
5282
5283 @overload
5284 def rename(
5285 self,
5286 mapper: Renamer | None = ...,
5287 *,
5288 index: Renamer | None = ...,
5289 columns: Renamer | None = ...,
5290 axis: Axis | None = ...,
5291 copy: bool | None = ...,
5292 inplace: Literal[False] = ...,
5293 level: Level = ...,
5294 errors: IgnoreRaise = ...,
5295 ) -> DataFrame:
5296 ...
5297
5298 @overload
5299 def rename(
5300 self,
5301 mapper: Renamer | None = ...,
5302 *,
5303 index: Renamer | None = ...,
5304 columns: Renamer | None = ...,
5305 axis: Axis | None = ...,
5306 copy: bool | None = ...,
5307 inplace: bool = ...,
5308 level: Level = ...,
5309 errors: IgnoreRaise = ...,
5310 ) -> DataFrame | None:
5311 ...
5312
5313 def rename(
5314 self,
5315 mapper: Renamer | None = None,
5316 *,
5317 index: Renamer | None = None,
5318 columns: Renamer | None = None,
5319 axis: Axis | None = None,
5320 copy: bool | None = None,
5321 inplace: bool = False,
5322 level: Level = None,
5323 errors: IgnoreRaise = "ignore",
5324 ) -> DataFrame | None:
5325 """
5326 Rename columns or index labels.
5327
5328 Function / dict values must be unique (1-to-1). Labels not contained in
5329 a dict / Series will be left as-is. Extra labels listed don't throw an
5330 error.
5331
5332 See the :ref:`user guide <basics.rename>` for more.
5333
5334 Parameters
5335 ----------
5336 mapper : dict-like or function
5337 Dict-like or function transformations to apply to
5338 that axis' values. Use either ``mapper`` and ``axis`` to
5339 specify the axis to target with ``mapper``, or ``index`` and
5340 ``columns``.
5341 index : dict-like or function
5342 Alternative to specifying axis (``mapper, axis=0``
5343 is equivalent to ``index=mapper``).
5344 columns : dict-like or function
5345 Alternative to specifying axis (``mapper, axis=1``
5346 is equivalent to ``columns=mapper``).
5347 axis : {0 or 'index', 1 or 'columns'}, default 0
5348 Axis to target with ``mapper``. Can be either the axis name
5349 ('index', 'columns') or number (0, 1). The default is 'index'.
5350 copy : bool, default True
5351 Also copy underlying data.
5352 inplace : bool, default False
5353 Whether to modify the DataFrame rather than creating a new one.
            If True, the value of `copy` is ignored.
5355 level : int or level name, default None
5356 In case of a MultiIndex, only rename labels in the specified
5357 level.
5358 errors : {'ignore', 'raise'}, default 'ignore'
5359 If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,
5360 or `columns` contains labels that are not present in the Index
5361 being transformed.
5362 If 'ignore', existing keys will be renamed and extra keys will be
5363 ignored.
5364
5365 Returns
5366 -------
5367 DataFrame or None
5368 DataFrame with the renamed axis labels or None if ``inplace=True``.
5369
5370 Raises
5371 ------
5372 KeyError
5373 If any of the labels is not found in the selected axis and
5374 "errors='raise'".
5375
5376 See Also
5377 --------
5378 DataFrame.rename_axis : Set the name of the axis.
5379
5380 Examples
5381 --------
5382 ``DataFrame.rename`` supports two calling conventions
5383
5384 * ``(index=index_mapper, columns=columns_mapper, ...)``
5385 * ``(mapper, axis={'index', 'columns'}, ...)``
5386
5387 We *highly* recommend using keyword arguments to clarify your
5388 intent.
5389
5390 Rename columns using a mapping:
5391
5392 >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
5393 >>> df.rename(columns={"A": "a", "B": "c"})
5394 a c
5395 0 1 4
5396 1 2 5
5397 2 3 6
5398
5399 Rename index using a mapping:
5400
5401 >>> df.rename(index={0: "x", 1: "y", 2: "z"})
5402 A B
5403 x 1 4
5404 y 2 5
5405 z 3 6
5406
5407 Cast index labels to a different type:
5408
5409 >>> df.index
5410 RangeIndex(start=0, stop=3, step=1)
5411 >>> df.rename(index=str).index
5412 Index(['0', '1', '2'], dtype='object')
5413
5414 >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise")
5415 Traceback (most recent call last):
5416 KeyError: ['C'] not found in axis
5417
5418 Using axis-style parameters:
5419
5420 >>> df.rename(str.lower, axis='columns')
5421 a b
5422 0 1 4
5423 1 2 5
5424 2 3 6
5425
5426 >>> df.rename({1: 2, 2: 4}, axis='index')
5427 A B
5428 0 1 4
5429 2 2 5
5430 4 3 6
5431 """
5432 return super()._rename(
5433 mapper=mapper,
5434 index=index,
5435 columns=columns,
5436 axis=axis,
5437 copy=copy,
5438 inplace=inplace,
5439 level=level,
5440 errors=errors,
5441 )
5442
5443 @overload
5444 def fillna(
5445 self,
5446 value: Hashable | Mapping | Series | DataFrame = ...,
5447 *,
5448 method: FillnaOptions | None = ...,
5449 axis: Axis | None = ...,
5450 inplace: Literal[False] = ...,
5451 limit: int | None = ...,
5452 downcast: dict | None = ...,
5453 ) -> DataFrame:
5454 ...
5455
5456 @overload
5457 def fillna(
5458 self,
5459 value: Hashable | Mapping | Series | DataFrame = ...,
5460 *,
5461 method: FillnaOptions | None = ...,
5462 axis: Axis | None = ...,
5463 inplace: Literal[True],
5464 limit: int | None = ...,
5465 downcast: dict | None = ...,
5466 ) -> None:
5467 ...
5468
5469 @overload
5470 def fillna(
5471 self,
5472 value: Hashable | Mapping | Series | DataFrame = ...,
5473 *,
5474 method: FillnaOptions | None = ...,
5475 axis: Axis | None = ...,
5476 inplace: bool = ...,
5477 limit: int | None = ...,
5478 downcast: dict | None = ...,
5479 ) -> DataFrame | None:
5480 ...
5481
5482 @doc(NDFrame.fillna, **_shared_doc_kwargs)
5483 def fillna(
5484 self,
5485 value: Hashable | Mapping | Series | DataFrame = None,
5486 *,
5487 method: FillnaOptions | None = None,
5488 axis: Axis | None = None,
5489 inplace: bool = False,
5490 limit: int | None = None,
5491 downcast: dict | None = None,
5492 ) -> DataFrame | None:
5493 return super().fillna(
5494 value=value,
5495 method=method,
5496 axis=axis,
5497 inplace=inplace,
5498 limit=limit,
5499 downcast=downcast,
5500 )
5501
5502 def pop(self, item: Hashable) -> Series:
5503 """
5504 Return item and drop from frame. Raise KeyError if not found.
5505
5506 Parameters
5507 ----------
5508 item : label
5509 Label of column to be popped.
5510
5511 Returns
5512 -------
5513 Series
5514
5515 Examples
5516 --------
5517 >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
5518 ... ('parrot', 'bird', 24.0),
5519 ... ('lion', 'mammal', 80.5),
5520 ... ('monkey', 'mammal', np.nan)],
5521 ... columns=('name', 'class', 'max_speed'))
5522 >>> df
5523 name class max_speed
5524 0 falcon bird 389.0
5525 1 parrot bird 24.0
5526 2 lion mammal 80.5
5527 3 monkey mammal NaN
5528
5529 >>> df.pop('class')
5530 0 bird
5531 1 bird
5532 2 mammal
5533 3 mammal
5534 Name: class, dtype: object
5535
5536 >>> df
5537 name max_speed
5538 0 falcon 389.0
5539 1 parrot 24.0
5540 2 lion 80.5
5541 3 monkey NaN
5542 """
5543 return super().pop(item=item)
5544
5545 @overload
5546 def replace(
5547 self,
5548 to_replace=...,
5549 value=...,
5550 *,
5551 inplace: Literal[False] = ...,
5552 limit: int | None = ...,
5553 regex: bool = ...,
5554 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
5555 ) -> DataFrame:
5556 ...
5557
5558 @overload
5559 def replace(
5560 self,
5561 to_replace=...,
5562 value=...,
5563 *,
5564 inplace: Literal[True],
5565 limit: int | None = ...,
5566 regex: bool = ...,
5567 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
5568 ) -> None:
5569 ...
5570
5571 @doc(NDFrame.replace, **_shared_doc_kwargs)
5572 def replace(
5573 self,
5574 to_replace=None,
5575 value=lib.no_default,
5576 *,
5577 inplace: bool = False,
5578 limit: int | None = None,
5579 regex: bool = False,
5580 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = lib.no_default,
5581 ) -> DataFrame | None:
5582 return super().replace(
5583 to_replace=to_replace,
5584 value=value,
5585 inplace=inplace,
5586 limit=limit,
5587 regex=regex,
5588 method=method,
5589 )
5590
5591 def _replace_columnwise(
5592 self, mapping: dict[Hashable, tuple[Any, Any]], inplace: bool, regex
5593 ):
5594 """
5595 Dispatch to Series.replace column-wise.
5596
5597 Parameters
5598 ----------
5599 mapping : dict
5600 of the form {col: (target, value)}
5601 inplace : bool
5602 regex : bool or same types as `to_replace` in DataFrame.replace
5603
5604 Returns
5605 -------
5606 DataFrame or None
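
        Notes
        -----
        Illustrative sketch of the expected ``mapping`` shape (hypothetical
        frame; this is an internal helper, not public API)::

            # replace 1 with 10, but only in column "a"
            df._replace_columnwise({"a": (1, 10)}, inplace=False, regex=False)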
5607 """
5608 # Operate column-wise
5609 res = self if inplace else self.copy(deep=None)
5610 ax = self.columns
5611
5612 for i, ax_value in enumerate(ax):
5613 if ax_value in mapping:
5614 ser = self.iloc[:, i]
5615
5616 target, value = mapping[ax_value]
5617 newobj = ser.replace(target, value, regex=regex)
5618
5619 res._iset_item(i, newobj)
5620
5621 if inplace:
5622 return
5623 return res.__finalize__(self)
5624
5625 @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"])
5626 def shift(
5627 self,
5628 periods: int = 1,
5629 freq: Frequency | None = None,
5630 axis: Axis = 0,
5631 fill_value: Hashable = lib.no_default,
5632 ) -> DataFrame:
5633 axis = self._get_axis_number(axis)
5634
5635 ncols = len(self.columns)
5636 if (
5637 axis == 1
5638 and periods != 0
5639 and freq is None
5640 and fill_value is lib.no_default
5641 and ncols > 0
5642 ):
5643 # We will infer fill_value to match the closest column
5644
5645 # Use a column that we know is valid for our column's dtype GH#38434
5646 label = self.columns[0]
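
            # e.g. ``df.shift(1, axis=1)`` moves every column one slot to the
            # right; each vacated slot is filled with an all-NA column made by
            # over-shifting a real column (``shift(len(self))``), so the
            # filler's dtype matches what shifting that column would produce.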
5647
5648 if periods > 0:
5649 result = self.iloc[:, :-periods]
5650 for col in range(min(ncols, abs(periods))):
5651 # TODO(EA2D): doing this in a loop unnecessary with 2D EAs
5652 # Define filler inside loop so we get a copy
5653 filler = self.iloc[:, 0].shift(len(self))
5654 result.insert(0, label, filler, allow_duplicates=True)
5655 else:
5656 result = self.iloc[:, -periods:]
5657 for col in range(min(ncols, abs(periods))):
5658 # Define filler inside loop so we get a copy
5659 filler = self.iloc[:, -1].shift(len(self))
5660 result.insert(
5661 len(result.columns), label, filler, allow_duplicates=True
5662 )
5663
5664 result.columns = self.columns.copy()
5665 return result
5666 elif (
5667 axis == 1
5668 and periods != 0
5669 and fill_value is not lib.no_default
5670 and ncols > 0
5671 ):
5672 arrays = self._mgr.arrays
5673 if len(arrays) > 1 or (
5674 # If we only have one block and we know that we can't
5675 # keep the same dtype (i.e. the _can_hold_element check)
5676 # then we can go through the reindex_indexer path
5677 # (and avoid casting logic in the Block method).
5678 not can_hold_element(arrays[0], fill_value)
5679 ):
5680 # GH#35488 we need to watch out for multi-block cases
5681 # We only get here with fill_value not-lib.no_default
5682 nper = abs(periods)
5683 nper = min(nper, ncols)
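                # Build a positional indexer over the columns in which -1
                # marks the vacated slots; reindex_indexer fills those with
                # ``fill_value`` in a single blockwise pass.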
5684 if periods > 0:
5685 indexer = np.array(
5686 [-1] * nper + list(range(ncols - periods)), dtype=np.intp
5687 )
5688 else:
5689 indexer = np.array(
5690 list(range(nper, ncols)) + [-1] * nper, dtype=np.intp
5691 )
5692 mgr = self._mgr.reindex_indexer(
5693 self.columns,
5694 indexer,
5695 axis=0,
5696 fill_value=fill_value,
5697 allow_dups=True,
5698 )
5699 res_df = self._constructor(mgr)
5700 return res_df.__finalize__(self, method="shift")
5701
5702 return super().shift(
5703 periods=periods, freq=freq, axis=axis, fill_value=fill_value
5704 )
5705
5706 @overload
5707 def set_index(
5708 self,
5709 keys,
5710 *,
5711 drop: bool = ...,
5712 append: bool = ...,
5713 inplace: Literal[False] = ...,
5714 verify_integrity: bool = ...,
5715 ) -> DataFrame:
5716 ...
5717
5718 @overload
5719 def set_index(
5720 self,
5721 keys,
5722 *,
5723 drop: bool = ...,
5724 append: bool = ...,
5725 inplace: Literal[True],
5726 verify_integrity: bool = ...,
5727 ) -> None:
5728 ...
5729
5730 def set_index(
5731 self,
5732 keys,
5733 *,
5734 drop: bool = True,
5735 append: bool = False,
5736 inplace: bool = False,
5737 verify_integrity: bool = False,
5738 ) -> DataFrame | None:
5739 """
5740 Set the DataFrame index using existing columns.
5741
5742 Set the DataFrame index (row labels) using one or more existing
5743 columns or arrays (of the correct length). The index can replace the
5744 existing index or expand on it.
5745
5746 Parameters
5747 ----------
5748 keys : label or array-like or list of labels/arrays
5749 This parameter can be either a single column key, a single array of
5750 the same length as the calling DataFrame, or a list containing an
5751 arbitrary combination of column keys and arrays. Here, "array"
5752 encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and
5753 instances of :class:`~collections.abc.Iterator`.
5754 drop : bool, default True
5755 Delete columns to be used as the new index.
5756 append : bool, default False
5757 Whether to append columns to existing index.
5758 inplace : bool, default False
5759 Whether to modify the DataFrame rather than creating a new one.
5760 verify_integrity : bool, default False
5761 Check the new index for duplicates. Otherwise defer the check until
5762 necessary. Setting to False will improve the performance of this
5763 method.
5764
5765 Returns
5766 -------
5767 DataFrame or None
5768 Changed row labels or None if ``inplace=True``.
5769
5770 See Also
5771 --------
5772 DataFrame.reset_index : Opposite of set_index.
5773 DataFrame.reindex : Change to new indices or expand indices.
5774 DataFrame.reindex_like : Change to same indices as other DataFrame.
5775
5776 Examples
5777 --------
5778 >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
5779 ... 'year': [2012, 2014, 2013, 2014],
5780 ... 'sale': [55, 40, 84, 31]})
5781 >>> df
5782 month year sale
5783 0 1 2012 55
5784 1 4 2014 40
5785 2 7 2013 84
5786 3 10 2014 31
5787
5788 Set the index to become the 'month' column:
5789
5790 >>> df.set_index('month')
5791 year sale
5792 month
5793 1 2012 55
5794 4 2014 40
5795 7 2013 84
5796 10 2014 31
5797
5798 Create a MultiIndex using columns 'year' and 'month':
5799
5800 >>> df.set_index(['year', 'month'])
5801 sale
5802 year month
5803 2012 1 55
5804 2014 4 40
5805 2013 7 84
5806 2014 10 31
5807
5808 Create a MultiIndex using an Index and a column:
5809
5810 >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
5811 month sale
5812 year
5813 1 2012 1 55
5814 2 2014 4 40
5815 3 2013 7 84
5816 4 2014 10 31
5817
5818 Create a MultiIndex using two Series:
5819
5820 >>> s = pd.Series([1, 2, 3, 4])
5821 >>> df.set_index([s, s**2])
5822 month year sale
5823 1 1 1 2012 55
5824 2 4 4 2014 40
5825 3 9 7 2013 84
5826 4 16 10 2014 31
5827 """
5828 inplace = validate_bool_kwarg(inplace, "inplace")
5829 self._check_inplace_and_allows_duplicate_labels(inplace)
5830 if not isinstance(keys, list):
5831 keys = [keys]
5832
5833 err_msg = (
5834 'The parameter "keys" may be a column key, one-dimensional '
5835 "array, or a list containing only valid column keys and "
5836 "one-dimensional arrays."
5837 )
5838
5839 missing: list[Hashable] = []
5840 for col in keys:
5841 if isinstance(col, (Index, Series, np.ndarray, list, abc.Iterator)):
5842 # arrays are fine as long as they are one-dimensional
5843 # iterators get converted to list below
5844 if getattr(col, "ndim", 1) != 1:
5845 raise ValueError(err_msg)
5846 else:
5847 # everything else gets tried as a key; see GH 24969
5848 try:
5849 found = col in self.columns
5850 except TypeError as err:
5851 raise TypeError(
5852 f"{err_msg}. Received column of type {type(col)}"
5853 ) from err
5854 else:
5855 if not found:
5856 missing.append(col)
5857
5858 if missing:
5859 raise KeyError(f"None of {missing} are in the columns")
5860
5861 if inplace:
5862 frame = self
5863 else:
5864 # GH 49473 Use "lazy copy" with Copy-on-Write
5865 frame = self.copy(deep=None)
5866
5867 arrays = []
5868 names: list[Hashable] = []
5869 if append:
5870 names = list(self.index.names)
5871 if isinstance(self.index, MultiIndex):
5872 for i in range(self.index.nlevels):
5873 arrays.append(self.index._get_level_values(i))
5874 else:
5875 arrays.append(self.index)
5876
5877 to_remove: list[Hashable] = []
5878 for col in keys:
5879 if isinstance(col, MultiIndex):
5880 for n in range(col.nlevels):
5881 arrays.append(col._get_level_values(n))
5882 names.extend(col.names)
5883 elif isinstance(col, (Index, Series)):
5884 # if Index then not MultiIndex (treated above)
5885
5886 # error: Argument 1 to "append" of "list" has incompatible type
5887 # "Union[Index, Series]"; expected "Index"
5888 arrays.append(col) # type:ignore[arg-type]
5889 names.append(col.name)
5890 elif isinstance(col, (list, np.ndarray)):
5891 # error: Argument 1 to "append" of "list" has incompatible type
5892 # "Union[List[Any], ndarray]"; expected "Index"
5893 arrays.append(col) # type: ignore[arg-type]
5894 names.append(None)
5895 elif isinstance(col, abc.Iterator):
5896 # error: Argument 1 to "append" of "list" has incompatible type
5897 # "List[Any]"; expected "Index"
5898 arrays.append(list(col)) # type: ignore[arg-type]
5899 names.append(None)
5900 # from here, col can only be a column label
5901 else:
5902 arrays.append(frame[col])
5903 names.append(col)
5904 if drop:
5905 to_remove.append(col)
5906
5907 if len(arrays[-1]) != len(self):
5908 # check newest element against length of calling frame, since
5909 # ensure_index_from_sequences would not raise for append=False.
5910 raise ValueError(
5911 f"Length mismatch: Expected {len(self)} rows, "
5912 f"received array of length {len(arrays[-1])}"
5913 )
5914
5915 index = ensure_index_from_sequences(arrays, names)
5916
5917 if verify_integrity and not index.is_unique:
5918 duplicates = index[index.duplicated()].unique()
5919 raise ValueError(f"Index has duplicate keys: {duplicates}")
5920
5921 # use set to handle duplicate column names gracefully in case of drop
5922 for c in set(to_remove):
5923 del frame[c]
5924
5925 # clear up memory usage
5926 index._cleanup()
5927
5928 frame.index = index
5929
5930 if not inplace:
5931 return frame
5932 return None
5933
5934 @overload
5935 def reset_index(
5936 self,
5937 level: IndexLabel = ...,
5938 *,
5939 drop: bool = ...,
5940 inplace: Literal[False] = ...,
5941 col_level: Hashable = ...,
5942 col_fill: Hashable = ...,
5943 allow_duplicates: bool | lib.NoDefault = ...,
        names: Hashable | Sequence[Hashable] = ...,
5945 ) -> DataFrame:
5946 ...
5947
5948 @overload
5949 def reset_index(
5950 self,
5951 level: IndexLabel = ...,
5952 *,
5953 drop: bool = ...,
5954 inplace: Literal[True],
5955 col_level: Hashable = ...,
5956 col_fill: Hashable = ...,
5957 allow_duplicates: bool | lib.NoDefault = ...,
        names: Hashable | Sequence[Hashable] = ...,
5959 ) -> None:
5960 ...
5961
5962 @overload
5963 def reset_index(
5964 self,
5965 level: IndexLabel = ...,
5966 *,
5967 drop: bool = ...,
5968 inplace: bool = ...,
5969 col_level: Hashable = ...,
5970 col_fill: Hashable = ...,
5971 allow_duplicates: bool | lib.NoDefault = ...,
        names: Hashable | Sequence[Hashable] = ...,
5973 ) -> DataFrame | None:
5974 ...
5975
5976 def reset_index(
5977 self,
5978 level: IndexLabel = None,
5979 *,
5980 drop: bool = False,
5981 inplace: bool = False,
5982 col_level: Hashable = 0,
5983 col_fill: Hashable = "",
5984 allow_duplicates: bool | lib.NoDefault = lib.no_default,
5985 names: Hashable | Sequence[Hashable] = None,
5986 ) -> DataFrame | None:
5987 """
5988 Reset the index, or a level of it.
5989
5990 Reset the index of the DataFrame, and use the default one instead.
5991 If the DataFrame has a MultiIndex, this method can remove one or more
5992 levels.
5993
5994 Parameters
5995 ----------
5996 level : int, str, tuple, or list, default None
5997 Only remove the given levels from the index. Removes all levels by
5998 default.
5999 drop : bool, default False
6000 Do not try to insert index into dataframe columns. This resets
6001 the index to the default integer index.
6002 inplace : bool, default False
6003 Whether to modify the DataFrame rather than creating a new one.
6004 col_level : int or str, default 0
6005 If the columns have multiple levels, determines which level the
6006 labels are inserted into. By default it is inserted into the first
6007 level.
6008 col_fill : object, default ''
6009 If the columns have multiple levels, determines how the other
6010 levels are named. If None then the index name is repeated.
6011 allow_duplicates : bool, optional, default lib.no_default
6012 Allow duplicate column labels to be created.
6013
6014 .. versionadded:: 1.5.0
6015
6016 names : int, str or 1-dimensional list, default None
6017 Using the given string, rename the DataFrame column which contains the
6018 index data. If the DataFrame has a MultiIndex, this has to be a list or
6019 tuple with length equal to the number of levels.
6020
6021 .. versionadded:: 1.5.0
6022
6023 Returns
6024 -------
6025 DataFrame or None
6026 DataFrame with the new index or None if ``inplace=True``.
6027
6028 See Also
6029 --------
6030 DataFrame.set_index : Opposite of reset_index.
6031 DataFrame.reindex : Change to new indices or expand indices.
6032 DataFrame.reindex_like : Change to same indices as other DataFrame.
6033
6034 Examples
6035 --------
6036 >>> df = pd.DataFrame([('bird', 389.0),
6037 ... ('bird', 24.0),
6038 ... ('mammal', 80.5),
6039 ... ('mammal', np.nan)],
6040 ... index=['falcon', 'parrot', 'lion', 'monkey'],
6041 ... columns=('class', 'max_speed'))
6042 >>> df
6043 class max_speed
6044 falcon bird 389.0
6045 parrot bird 24.0
6046 lion mammal 80.5
6047 monkey mammal NaN
6048
6049 When we reset the index, the old index is added as a column, and a
6050 new sequential index is used:
6051
6052 >>> df.reset_index()
6053 index class max_speed
6054 0 falcon bird 389.0
6055 1 parrot bird 24.0
6056 2 lion mammal 80.5
6057 3 monkey mammal NaN
6058
6059 We can use the `drop` parameter to avoid the old index being added as
6060 a column:
6061
6062 >>> df.reset_index(drop=True)
6063 class max_speed
6064 0 bird 389.0
6065 1 bird 24.0
6066 2 mammal 80.5
6067 3 mammal NaN
6068
6069 You can also use `reset_index` with `MultiIndex`.
6070
6071 >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
6072 ... ('bird', 'parrot'),
6073 ... ('mammal', 'lion'),
6074 ... ('mammal', 'monkey')],
6075 ... names=['class', 'name'])
6076 >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),
6077 ... ('species', 'type')])
6078 >>> df = pd.DataFrame([(389.0, 'fly'),
6079 ... (24.0, 'fly'),
6080 ... (80.5, 'run'),
6081 ... (np.nan, 'jump')],
6082 ... index=index,
6083 ... columns=columns)
6084 >>> df
6085 speed species
6086 max type
6087 class name
6088 bird falcon 389.0 fly
6089 parrot 24.0 fly
6090 mammal lion 80.5 run
6091 monkey NaN jump
6092
6093 Using the `names` parameter, choose a name for the index column:
6094
6095 >>> df.reset_index(names=['classes', 'names'])
6096 classes names speed species
6097 max type
6098 0 bird falcon 389.0 fly
6099 1 bird parrot 24.0 fly
6100 2 mammal lion 80.5 run
6101 3 mammal monkey NaN jump
6102
6103 If the index has multiple levels, we can reset a subset of them:
6104
6105 >>> df.reset_index(level='class')
6106 class speed species
6107 max type
6108 name
6109 falcon bird 389.0 fly
6110 parrot bird 24.0 fly
6111 lion mammal 80.5 run
6112 monkey mammal NaN jump
6113
6114 If we are not dropping the index, by default, it is placed in the top
6115 level. We can place it in another level:
6116
6117 >>> df.reset_index(level='class', col_level=1)
6118 speed species
6119 class max type
6120 name
6121 falcon bird 389.0 fly
6122 parrot bird 24.0 fly
6123 lion mammal 80.5 run
6124 monkey mammal NaN jump
6125
6126 When the index is inserted under another level, we can specify under
6127 which one with the parameter `col_fill`:
6128
6129 >>> df.reset_index(level='class', col_level=1, col_fill='species')
6130 species speed species
6131 class max type
6132 name
6133 falcon bird 389.0 fly
6134 parrot bird 24.0 fly
6135 lion mammal 80.5 run
6136 monkey mammal NaN jump
6137
6138 If we specify a nonexistent level for `col_fill`, it is created:
6139
6140 >>> df.reset_index(level='class', col_level=1, col_fill='genus')
6141 genus speed species
6142 class max type
6143 name
6144 falcon bird 389.0 fly
6145 parrot bird 24.0 fly
6146 lion mammal 80.5 run
6147 monkey mammal NaN jump
6148 """
6149 inplace = validate_bool_kwarg(inplace, "inplace")
6150 self._check_inplace_and_allows_duplicate_labels(inplace)
6151 if inplace:
6152 new_obj = self
6153 else:
6154 new_obj = self.copy(deep=None)
6155 if allow_duplicates is not lib.no_default:
6156 allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates")
6157
6158 new_index = default_index(len(new_obj))
6159 if level is not None:
6160 if not isinstance(level, (tuple, list)):
6161 level = [level]
6162 level = [self.index._get_level_number(lev) for lev in level]
6163 if len(level) < self.index.nlevels:
6164 new_index = self.index.droplevel(level)
6165
6166 if not drop:
6167 to_insert: Iterable[tuple[Any, Any | None]]
6168
6169 default = "index" if "index" not in self else "level_0"
6170 names = self.index._get_default_index_names(names, default)
6171
6172 if isinstance(self.index, MultiIndex):
6173 to_insert = zip(self.index.levels, self.index.codes)
6174 else:
6175 to_insert = ((self.index, None),)
6176
6177 multi_col = isinstance(self.columns, MultiIndex)
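            # Iterate the levels in reverse so that repeated ``insert(0, ...)``
            # calls leave the new columns in their original level order.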
6178 for i, (lev, lab) in reversed(list(enumerate(to_insert))):
6179 if level is not None and i not in level:
6180 continue
6181 name = names[i]
6182 if multi_col:
6183 col_name = list(name) if isinstance(name, tuple) else [name]
6184 if col_fill is None:
6185 if len(col_name) not in (1, self.columns.nlevels):
6186 raise ValueError(
6187 "col_fill=None is incompatible "
6188 f"with incomplete column name {name}"
6189 )
6190 col_fill = col_name[0]
6191
6192 lev_num = self.columns._get_level_number(col_level)
6193 name_lst = [col_fill] * lev_num + col_name
6194 missing = self.columns.nlevels - len(name_lst)
6195 name_lst += [col_fill] * missing
6196 name = tuple(name_lst)
6197
6198 # to ndarray and maybe infer different dtype
6199 level_values = lev._values
6200 if level_values.dtype == np.object_:
6201 level_values = lib.maybe_convert_objects(level_values)
6202
6203 if lab is not None:
6204 # if we have the codes, extract the values with a mask
6205 level_values = algorithms.take(
6206 level_values, lab, allow_fill=True, fill_value=lev._na_value
6207 )
6208
6209 new_obj.insert(
6210 0,
6211 name,
6212 level_values,
6213 allow_duplicates=allow_duplicates,
6214 )
6215
6216 new_obj.index = new_index
6217 if not inplace:
6218 return new_obj
6219
6220 return None
6221
6222 # ----------------------------------------------------------------------
6223 # Reindex-based selection methods
6224
6225 @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
6226 def isna(self) -> DataFrame:
6227 result = self._constructor(self._mgr.isna(func=isna))
6228 return result.__finalize__(self, method="isna")
6229
6230 @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
6231 def isnull(self) -> DataFrame:
6232 """
6233 DataFrame.isnull is an alias for DataFrame.isna.
6234 """
6235 return self.isna()
6236
6237 @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
6238 def notna(self) -> DataFrame:
6239 return ~self.isna()
6240
6241 @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
6242 def notnull(self) -> DataFrame:
6243 """
6244 DataFrame.notnull is an alias for DataFrame.notna.
6245 """
6246 return ~self.isna()
6247
6248 @overload
6249 def dropna(
6250 self,
6251 *,
6252 axis: Axis = ...,
6253 how: AnyAll | NoDefault = ...,
6254 thresh: int | NoDefault = ...,
6255 subset: IndexLabel = ...,
6256 inplace: Literal[False] = ...,
6257 ignore_index: bool = ...,
6258 ) -> DataFrame:
6259 ...
6260
6261 @overload
6262 def dropna(
6263 self,
6264 *,
6265 axis: Axis = ...,
6266 how: AnyAll | NoDefault = ...,
6267 thresh: int | NoDefault = ...,
6268 subset: IndexLabel = ...,
6269 inplace: Literal[True],
6270 ignore_index: bool = ...,
6271 ) -> None:
6272 ...
6273
6274 def dropna(
6275 self,
6276 *,
6277 axis: Axis = 0,
6278 how: AnyAll | NoDefault = no_default,
6279 thresh: int | NoDefault = no_default,
6280 subset: IndexLabel = None,
6281 inplace: bool = False,
6282 ignore_index: bool = False,
6283 ) -> DataFrame | None:
6284 """
6285 Remove missing values.
6286
6287 See the :ref:`User Guide <missing_data>` for more on which values are
6288 considered missing, and how to work with missing data.
6289
6290 Parameters
6291 ----------
6292 axis : {0 or 'index', 1 or 'columns'}, default 0
6293 Determine if rows or columns which contain missing values are
6294 removed.
6295
            * 0, or 'index' : Drop rows which contain missing values.
            * 1, or 'columns' : Drop columns which contain missing values.

            Only a single axis is allowed.
6301
6302 how : {'any', 'all'}, default 'any'
6303 Determine if row or column is removed from DataFrame, when we have
6304 at least one NA or all NA.
6305
6306 * 'any' : If any NA values are present, drop that row or column.
6307 * 'all' : If all values are NA, drop that row or column.
6308
6309 thresh : int, optional
6310 Require that many non-NA values. Cannot be combined with how.
6311 subset : column label or sequence of labels, optional
6312 Labels along other axis to consider, e.g. if you are dropping rows
6313 these would be a list of columns to include.
6314 inplace : bool, default False
6315 Whether to modify the DataFrame rather than creating a new one.
6316 ignore_index : bool, default ``False``
6317 If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
6318
6319 .. versionadded:: 2.0.0
6320
6321 Returns
6322 -------
6323 DataFrame or None
6324 DataFrame with NA entries dropped from it or None if ``inplace=True``.
6325
6326 See Also
6327 --------
6328 DataFrame.isna: Indicate missing values.
6329 DataFrame.notna : Indicate existing (non-missing) values.
6330 DataFrame.fillna : Replace missing values.
6331 Series.dropna : Drop missing values.
6332 Index.dropna : Drop missing indices.
6333
6334 Examples
6335 --------
6336 >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
6337 ... "toy": [np.nan, 'Batmobile', 'Bullwhip'],
6338 ... "born": [pd.NaT, pd.Timestamp("1940-04-25"),
6339 ... pd.NaT]})
6340 >>> df
6341 name toy born
6342 0 Alfred NaN NaT
6343 1 Batman Batmobile 1940-04-25
6344 2 Catwoman Bullwhip NaT
6345
6346 Drop the rows where at least one element is missing.
6347
6348 >>> df.dropna()
6349 name toy born
6350 1 Batman Batmobile 1940-04-25
6351
6352 Drop the columns where at least one element is missing.
6353
6354 >>> df.dropna(axis='columns')
6355 name
6356 0 Alfred
6357 1 Batman
6358 2 Catwoman
6359
6360 Drop the rows where all elements are missing.
6361
6362 >>> df.dropna(how='all')
6363 name toy born
6364 0 Alfred NaN NaT
6365 1 Batman Batmobile 1940-04-25
6366 2 Catwoman Bullwhip NaT
6367
6368 Keep only the rows with at least 2 non-NA values.
6369
6370 >>> df.dropna(thresh=2)
6371 name toy born
6372 1 Batman Batmobile 1940-04-25
6373 2 Catwoman Bullwhip NaT
6374
6375 Define in which columns to look for missing values.
6376
6377 >>> df.dropna(subset=['name', 'toy'])
6378 name toy born
6379 1 Batman Batmobile 1940-04-25
6380 2 Catwoman Bullwhip NaT
6381 """
6382 if (how is not no_default) and (thresh is not no_default):
6383 raise TypeError(
6384 "You cannot set both the how and thresh arguments at the same time."
6385 )
6386
6387 if how is no_default:
6388 how = "any"
6389
6390 inplace = validate_bool_kwarg(inplace, "inplace")
6391 if isinstance(axis, (tuple, list)):
6392 # GH20987
6393 raise TypeError("supplying multiple axes to axis is no longer supported.")
6394
6395 axis = self._get_axis_number(axis)
6396 agg_axis = 1 - axis
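        # NA counting/masking runs along the axis orthogonal to the one being
        # dropped (e.g. dropping rows inspects values across the columns).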
6397
6398 agg_obj = self
6399 if subset is not None:
6400 # subset needs to be list
6401 if not is_list_like(subset):
6402 subset = [subset]
6403 ax = self._get_axis(agg_axis)
6404 indices = ax.get_indexer_for(subset)
6405 check = indices == -1
6406 if check.any():
6407 raise KeyError(np.array(subset)[check].tolist())
6408 agg_obj = self.take(indices, axis=agg_axis)
6409
6410 if thresh is not no_default:
6411 count = agg_obj.count(axis=agg_axis)
6412 mask = count >= thresh
6413 elif how == "any":
6414 # faster equivalent to 'agg_obj.count(agg_axis) == self.shape[agg_axis]'
6415 mask = notna(agg_obj).all(axis=agg_axis, bool_only=False)
6416 elif how == "all":
6417 # faster equivalent to 'agg_obj.count(agg_axis) > 0'
6418 mask = notna(agg_obj).any(axis=agg_axis, bool_only=False)
6419 else:
6420 raise ValueError(f"invalid how option: {how}")
6421
6422 if np.all(mask):
6423 result = self.copy(deep=None)
6424 else:
6425 result = self.loc(axis=axis)[mask]
6426
6427 if ignore_index:
6428 result.index = default_index(len(result))
6429
6430 if not inplace:
6431 return result
6432 self._update_inplace(result)
6433 return None
6434
6435 def drop_duplicates(
6436 self,
6437 subset: Hashable | Sequence[Hashable] | None = None,
6438 *,
6439 keep: DropKeep = "first",
6440 inplace: bool = False,
6441 ignore_index: bool = False,
6442 ) -> DataFrame | None:
6443 """
6444 Return DataFrame with duplicate rows removed.
6445
        Considering certain columns is optional. Indexes, including time
        indexes, are ignored.
6448
6449 Parameters
6450 ----------
6451 subset : column label or sequence of labels, optional
6452 Only consider certain columns for identifying duplicates, by
6453 default use all of the columns.
6454 keep : {'first', 'last', ``False``}, default 'first'
6455 Determines which duplicates (if any) to keep.
6456
6457 - 'first' : Drop duplicates except for the first occurrence.
6458 - 'last' : Drop duplicates except for the last occurrence.
6459 - ``False`` : Drop all duplicates.
6460
6461 inplace : bool, default ``False``
6462 Whether to modify the DataFrame rather than creating a new one.
6463 ignore_index : bool, default ``False``
6464 If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
6465
6466 Returns
6467 -------
6468 DataFrame or None
6469 DataFrame with duplicates removed or None if ``inplace=True``.
6470
6471 See Also
6472 --------
6473 DataFrame.value_counts: Count unique combinations of columns.
6474
6475 Examples
6476 --------
6477 Consider dataset containing ramen rating.
6478
6479 >>> df = pd.DataFrame({
6480 ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
6481 ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
6482 ... 'rating': [4, 4, 3.5, 15, 5]
6483 ... })
6484 >>> df
6485 brand style rating
6486 0 Yum Yum cup 4.0
6487 1 Yum Yum cup 4.0
6488 2 Indomie cup 3.5
6489 3 Indomie pack 15.0
6490 4 Indomie pack 5.0
6491
6492 By default, it removes duplicate rows based on all columns.
6493
6494 >>> df.drop_duplicates()
6495 brand style rating
6496 0 Yum Yum cup 4.0
6497 2 Indomie cup 3.5
6498 3 Indomie pack 15.0
6499 4 Indomie pack 5.0
6500
6501 To remove duplicates on specific column(s), use ``subset``.
6502
6503 >>> df.drop_duplicates(subset=['brand'])
6504 brand style rating
6505 0 Yum Yum cup 4.0
6506 2 Indomie cup 3.5
6507
6508 To remove duplicates and keep last occurrences, use ``keep``.
6509
6510 >>> df.drop_duplicates(subset=['brand', 'style'], keep='last')
6511 brand style rating
6512 1 Yum Yum cup 4.0
6513 2 Indomie cup 3.5
6514 4 Indomie pack 5.0
6515 """
6516 if self.empty:
6517 return self.copy(deep=None)
6518
6519 inplace = validate_bool_kwarg(inplace, "inplace")
6520 ignore_index = validate_bool_kwarg(ignore_index, "ignore_index")
6521
        result = self[~self.duplicated(subset, keep=keep)]
6523 if ignore_index:
6524 result.index = default_index(len(result))
6525
6526 if inplace:
6527 self._update_inplace(result)
6528 return None
6529 else:
6530 return result
6531
6532 def duplicated(
6533 self,
6534 subset: Hashable | Sequence[Hashable] | None = None,
6535 keep: DropKeep = "first",
6536 ) -> Series:
6537 """
6538 Return boolean Series denoting duplicate rows.
6539
6540 Considering certain columns is optional.
6541
6542 Parameters
6543 ----------
6544 subset : column label or sequence of labels, optional
6545 Only consider certain columns for identifying duplicates, by
6546 default use all of the columns.
6547 keep : {'first', 'last', False}, default 'first'
6548 Determines which duplicates (if any) to mark.
6549
6550 - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
6551 - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
6552 - False : Mark all duplicates as ``True``.
6553
6554 Returns
6555 -------
6556 Series
            Boolean series indicating which rows are duplicated.
6558
6559 See Also
6560 --------
6561 Index.duplicated : Equivalent method on index.
6562 Series.duplicated : Equivalent method on Series.
6563 Series.drop_duplicates : Remove duplicate values from Series.
6564 DataFrame.drop_duplicates : Remove duplicate values from DataFrame.
6565
6566 Examples
6567 --------
6568 Consider dataset containing ramen rating.
6569
6570 >>> df = pd.DataFrame({
6571 ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
6572 ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
6573 ... 'rating': [4, 4, 3.5, 15, 5]
6574 ... })
6575 >>> df
6576 brand style rating
6577 0 Yum Yum cup 4.0
6578 1 Yum Yum cup 4.0
6579 2 Indomie cup 3.5
6580 3 Indomie pack 15.0
6581 4 Indomie pack 5.0
6582
        By default, for each set of duplicated values, the first occurrence
        is set to False and all others to True.
6585
6586 >>> df.duplicated()
6587 0 False
6588 1 True
6589 2 False
6590 3 False
6591 4 False
6592 dtype: bool
6593
        By using 'last', the last occurrence of each set of duplicated values
        is set to False and all others to True.
6596
6597 >>> df.duplicated(keep='last')
6598 0 True
6599 1 False
6600 2 False
6601 3 False
6602 4 False
6603 dtype: bool
6604
        By setting ``keep`` to False, all duplicates are True.
6606
6607 >>> df.duplicated(keep=False)
6608 0 True
6609 1 True
6610 2 False
6611 3 False
6612 4 False
6613 dtype: bool
6614
6615 To find duplicates on specific column(s), use ``subset``.
6616
6617 >>> df.duplicated(subset=['brand'])
6618 0 False
6619 1 True
6620 2 False
6621 3 True
6622 4 True
6623 dtype: bool
6624 """
6625
6626 if self.empty:
6627 return self._constructor_sliced(dtype=bool)
6628
6629 def f(vals) -> tuple[np.ndarray, int]:
6630 labels, shape = algorithms.factorize(vals, size_hint=len(self))
6631 return labels.astype("i8", copy=False), len(shape)
6632
6633 if subset is None:
6634 # https://github.com/pandas-dev/pandas/issues/28770
6635 # Incompatible types in assignment (expression has type "Index", variable
6636 # has type "Sequence[Any]")
6637 subset = self.columns # type: ignore[assignment]
        elif (
            not np.iterable(subset)
            or isinstance(subset, str)
            or (isinstance(subset, tuple) and subset in self.columns)
        ):
6644 subset = (subset,)
6645
6646 # needed for mypy since can't narrow types using np.iterable
6647 subset = cast(Sequence, subset)
6648
6649 # Verify all columns in subset exist in the queried dataframe
6650 # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
6651 # key that doesn't exist.
6652 diff = set(subset) - set(self.columns)
6653 if diff:
6654 raise KeyError(Index(diff))
6655
6656 if len(subset) == 1 and self.columns.is_unique:
6657 # GH#45236 This is faster than get_group_index below
6658 result = self[subset[0]].duplicated(keep)
6659 result.name = None
6660 else:
6661 vals = (col.values for name, col in self.items() if name in subset)
6662 labels, shape = map(list, zip(*map(f, vals)))
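
            # Combine the per-column factorized codes into one integer id per
            # row; rows that share an id are duplicates across ``subset``.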
6663
6664 ids = get_group_index(
6665 labels,
6666 # error: Argument 1 to "tuple" has incompatible type "List[_T]";
6667 # expected "Iterable[int]"
6668 tuple(shape), # type: ignore[arg-type]
6669 sort=False,
6670 xnull=False,
6671 )
6672 result = self._constructor_sliced(duplicated(ids, keep), index=self.index)
6673 return result.__finalize__(self, method="duplicated")
6674
6675 # ----------------------------------------------------------------------
    # Sorting

    # error: Signature of "sort_values" incompatible with supertype "NDFrame"
6678 @overload # type: ignore[override]
6679 def sort_values(
6680 self,
6681 by: IndexLabel,
6682 *,
6683 axis: Axis = ...,
6684 ascending=...,
6685 inplace: Literal[False] = ...,
6686 kind: str = ...,
6687 na_position: str = ...,
6688 ignore_index: bool = ...,
6689 key: ValueKeyFunc = ...,
6690 ) -> DataFrame:
6691 ...
6692
6693 @overload
6694 def sort_values(
6695 self,
6696 by: IndexLabel,
6697 *,
6698 axis: Axis = ...,
6699 ascending=...,
6700 inplace: Literal[True],
6701 kind: str = ...,
6702 na_position: str = ...,
6703 ignore_index: bool = ...,
6704 key: ValueKeyFunc = ...,
6705 ) -> None:
6706 ...
6707
6708 # TODO: Just move the sort_values doc here.
6709 @Substitution(**_shared_doc_kwargs)
6710 @Appender(NDFrame.sort_values.__doc__)
6711 def sort_values(
6712 self,
6713 by: IndexLabel,
6714 *,
6715 axis: Axis = 0,
6716 ascending: bool | list[bool] | tuple[bool, ...] = True,
6717 inplace: bool = False,
6718 kind: str = "quicksort",
6719 na_position: str = "last",
6720 ignore_index: bool = False,
6721 key: ValueKeyFunc = None,
6722 ) -> DataFrame | None:
6723 inplace = validate_bool_kwarg(inplace, "inplace")
6724 axis = self._get_axis_number(axis)
6725 ascending = validate_ascending(ascending)
6726 if not isinstance(by, list):
6727 by = [by]
6728 # error: Argument 1 to "len" has incompatible type "Union[bool, List[bool]]";
6729 # expected "Sized"
6730 if is_sequence(ascending) and (
6731 len(by) != len(ascending) # type: ignore[arg-type]
6732 ):
6733 # error: Argument 1 to "len" has incompatible type "Union[bool,
6734 # List[bool]]"; expected "Sized"
6735 raise ValueError(
6736 f"Length of ascending ({len(ascending)})" # type: ignore[arg-type]
6737 f" != length of by ({len(by)})"
6738 )
6739 if len(by) > 1:
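            # Multi-key sort (e.g. ``df.sort_values(["a", "b"])``): build one
            # key array per label and lexsort them together.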
6740 keys = [self._get_label_or_level_values(x, axis=axis) for x in by]
6741
6742 # need to rewrap columns in Series to apply key function
6743 if key is not None:
6744 # error: List comprehension has incompatible type List[Series];
6745 # expected List[ndarray]
6746 keys = [
6747 Series(k, name=name) # type: ignore[misc]
6748 for (k, name) in zip(keys, by)
6749 ]
6750
6751 indexer = lexsort_indexer(
6752 keys, orders=ascending, na_position=na_position, key=key
6753 )
6754 elif len(by):
6755 # len(by) == 1
6756
6757 by = by[0]
6758 k = self._get_label_or_level_values(by, axis=axis)
6759
6760 # need to rewrap column in Series to apply key function
6761 if key is not None:
6762 # error: Incompatible types in assignment (expression has type
6763 # "Series", variable has type "ndarray")
6764 k = Series(k, name=by) # type: ignore[assignment]
6765
6766 if isinstance(ascending, (tuple, list)):
6767 ascending = ascending[0]
6768
6769 indexer = nargsort(
6770 k, kind=kind, ascending=ascending, na_position=na_position, key=key
6771 )
6772 else:
6773 if inplace:
6774 return self._update_inplace(self)
6775 else:
6776 return self.copy(deep=None)
6777
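        # Fast path: the sort indexer is 0..n-1, i.e. the frame is already in
        # sorted order, so a (lazy) copy suffices instead of a take.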
6778 if is_range_indexer(indexer, len(indexer)):
6779 result = self.copy(deep=(not inplace and not using_copy_on_write()))
6780 if ignore_index:
6781 result.index = default_index(len(result))
6782
6783 if inplace:
6784 return self._update_inplace(result)
6785 else:
6786 return result
6787
6788 new_data = self._mgr.take(
6789 indexer, axis=self._get_block_manager_axis(axis), verify=False
6790 )
6791
6792 if ignore_index:
6793 new_data.set_axis(
6794 self._get_block_manager_axis(axis), default_index(len(indexer))
6795 )
6796
6797 result = self._constructor(new_data)
6798 if inplace:
6799 return self._update_inplace(result)
6800 else:
6801 return result.__finalize__(self, method="sort_values")
6802
6803 @overload
6804 def sort_index(
6805 self,
6806 *,
6807 axis: Axis = ...,
6808 level: IndexLabel = ...,
6809 ascending: bool | Sequence[bool] = ...,
6810 inplace: Literal[True],
6811 kind: SortKind = ...,
6812 na_position: NaPosition = ...,
6813 sort_remaining: bool = ...,
6814 ignore_index: bool = ...,
6815 key: IndexKeyFunc = ...,
6816 ) -> None:
6817 ...
6818
6819 @overload
6820 def sort_index(
6821 self,
6822 *,
6823 axis: Axis = ...,
6824 level: IndexLabel = ...,
6825 ascending: bool | Sequence[bool] = ...,
6826 inplace: Literal[False] = ...,
6827 kind: SortKind = ...,
6828 na_position: NaPosition = ...,
6829 sort_remaining: bool = ...,
6830 ignore_index: bool = ...,
6831 key: IndexKeyFunc = ...,
6832 ) -> DataFrame:
6833 ...
6834
6835 @overload
6836 def sort_index(
6837 self,
6838 *,
6839 axis: Axis = ...,
6840 level: IndexLabel = ...,
6841 ascending: bool | Sequence[bool] = ...,
6842 inplace: bool = ...,
6843 kind: SortKind = ...,
6844 na_position: NaPosition = ...,
6845 sort_remaining: bool = ...,
6846 ignore_index: bool = ...,
6847 key: IndexKeyFunc = ...,
6848 ) -> DataFrame | None:
6849 ...
6850
6851 def sort_index(
6852 self,
6853 *,
6854 axis: Axis = 0,
6855 level: IndexLabel = None,
6856 ascending: bool | Sequence[bool] = True,
6857 inplace: bool = False,
6858 kind: SortKind = "quicksort",
6859 na_position: NaPosition = "last",
6860 sort_remaining: bool = True,
6861 ignore_index: bool = False,
6862 key: IndexKeyFunc = None,
6863 ) -> DataFrame | None:
6864 """
6865 Sort object by labels (along an axis).
6866
6867 Returns a new DataFrame sorted by label if `inplace` argument is
6868 ``False``, otherwise updates the original DataFrame and returns None.
6869
6870 Parameters
6871 ----------
6872 axis : {0 or 'index', 1 or 'columns'}, default 0
6873 The axis along which to sort. The value 0 identifies the rows,
6874 and 1 identifies the columns.
6875 level : int or level name or list of ints or list of level names
6876 If not None, sort on values in specified index level(s).
6877 ascending : bool or list-like of bools, default True
6878 Sort ascending vs. descending. When the index is a MultiIndex the
6879 sort direction can be controlled for each level individually.
6880 inplace : bool, default False
6881 Whether to modify the DataFrame rather than creating a new one.
6882 kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
6883 Choice of sorting algorithm. See also :func:`numpy.sort` for more
6884 information. `mergesort` and `stable` are the only stable algorithms. For
6885 DataFrames, this option is only applied when sorting on a single
6886 column or label.
6887 na_position : {'first', 'last'}, default 'last'
6888 Puts NaNs at the beginning if `first`; `last` puts NaNs at the end.
6889 Not implemented for MultiIndex.
6890 sort_remaining : bool, default True
6891 If True and sorting by level and index is multilevel, sort by other
6892 levels too (in order) after sorting by specified level.
6893 ignore_index : bool, default False
6894 If True, the resulting axis will be labeled 0, 1, …, n - 1.
6895 key : callable, optional
6896 If not None, apply the key function to the index values
6897 before sorting. This is similar to the `key` argument in the
6898 builtin :meth:`sorted` function, with the notable difference that
6899 this `key` function should be *vectorized*. It should expect an
6900 ``Index`` and return an ``Index`` of the same shape. For MultiIndex
6901 inputs, the key is applied *per level*.
6902
6903 .. versionadded:: 1.1.0
6904
6905 Returns
6906 -------
6907 DataFrame or None
6908 The original DataFrame sorted by the labels or None if ``inplace=True``.
6909
6910 See Also
6911 --------
6912 Series.sort_index : Sort Series by the index.
6913 DataFrame.sort_values : Sort DataFrame by the value.
6914 Series.sort_values : Sort Series by the value.
6915
6916 Examples
6917 --------
6918 >>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150],
6919 ... columns=['A'])
6920 >>> df.sort_index()
6921 A
6922 1 4
6923 29 2
6924 100 1
6925 150 5
6926 234 3
6927
6928 By default, it sorts in ascending order, to sort in descending order,
6929 use ``ascending=False``
6930
6931 >>> df.sort_index(ascending=False)
6932 A
6933 234 3
6934 150 5
6935 100 1
6936 29 2
6937 1 4
6938
6939 A key function can be specified which is applied to the index before
6940 sorting. For a ``MultiIndex`` this is applied to each level separately.
6941
6942 >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd'])
6943 >>> df.sort_index(key=lambda x: x.str.lower())
6944 a
6945 A 1
6946 b 2
6947 C 3
6948 d 4
6949 """
6950 return super().sort_index(
6951 axis=axis,
6952 level=level,
6953 ascending=ascending,
6954 inplace=inplace,
6955 kind=kind,
6956 na_position=na_position,
6957 sort_remaining=sort_remaining,
6958 ignore_index=ignore_index,
6959 key=key,
6960 )
6961
6962 def value_counts(
6963 self,
6964 subset: Sequence[Hashable] | None = None,
6965 normalize: bool = False,
6966 sort: bool = True,
6967 ascending: bool = False,
6968 dropna: bool = True,
6969 ) -> Series:
6970 """
6971 Return a Series containing counts of unique rows in the DataFrame.
6972
6973 .. versionadded:: 1.1.0
6974
6975 Parameters
6976 ----------
6977 subset : label or list of labels, optional
6978 Columns to use when counting unique combinations.
6979 normalize : bool, default False
6980 Return proportions rather than frequencies.
6981 sort : bool, default True
6982 Sort by frequencies.
6983 ascending : bool, default False
6984 Sort in ascending order.
6985 dropna : bool, default True
            Don't include counts of rows that contain NA values.
6987
6988 .. versionadded:: 1.3.0
6989
6990 Returns
6991 -------
6992 Series
6993
6994 See Also
6995 --------
6996 Series.value_counts: Equivalent method on Series.
6997
6998 Notes
6999 -----
7000 The returned Series will have a MultiIndex with one level per input
7001 column but an Index (non-multi) for a single label. By default, rows
7002 that contain any NA values are omitted from the result. By default,
7003 the resulting Series will be in descending order so that the first
7004 element is the most frequently-occurring row.
7005
7006 Examples
7007 --------
7008 >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6],
7009 ... 'num_wings': [2, 0, 0, 0]},
7010 ... index=['falcon', 'dog', 'cat', 'ant'])
7011 >>> df
7012 num_legs num_wings
7013 falcon 2 2
7014 dog 4 0
7015 cat 4 0
7016 ant 6 0
7017
7018 >>> df.value_counts()
7019 num_legs num_wings
7020 4 0 2
7021 2 2 1
7022 6 0 1
7023 Name: count, dtype: int64
7024
7025 >>> df.value_counts(sort=False)
7026 num_legs num_wings
7027 2 2 1
7028 4 0 2
7029 6 0 1
7030 Name: count, dtype: int64
7031
7032 >>> df.value_counts(ascending=True)
7033 num_legs num_wings
7034 2 2 1
7035 6 0 1
7036 4 0 2
7037 Name: count, dtype: int64
7038
7039 >>> df.value_counts(normalize=True)
7040 num_legs num_wings
7041 4 0 0.50
7042 2 2 0.25
7043 6 0 0.25
7044 Name: proportion, dtype: float64
7045
7046 With `dropna` set to `False` we can also count rows with NA values.
7047
7048 >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'],
7049 ... 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']})
7050 >>> df
7051 first_name middle_name
7052 0 John Smith
7053 1 Anne <NA>
7054 2 John <NA>
7055 3 Beth Louise
7056
7057 >>> df.value_counts()
7058 first_name middle_name
7059 Beth Louise 1
7060 John Smith 1
7061 Name: count, dtype: int64
7062
7063 >>> df.value_counts(dropna=False)
7064 first_name middle_name
7065 Anne NaN 1
7066 Beth Louise 1
7067 John Smith 1
7068 NaN 1
7069 Name: count, dtype: int64
7070
7071 >>> df.value_counts("first_name")
7072 first_name
7073 John 2
7074 Anne 1
7075 Beth 1
7076 Name: count, dtype: int64
7077 """
7078 if subset is None:
7079 subset = self.columns.tolist()
7080
7081 name = "proportion" if normalize else "count"
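        # Group by the subset columns; each group's size is the row count for
        # that unique combination of values.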
7082 counts = self.groupby(subset, dropna=dropna).grouper.size()
7083 counts.name = name
7084
7085 if sort:
7086 counts = counts.sort_values(ascending=ascending)
7087 if normalize:
7088 counts /= counts.sum()
7089
7090 # Force MultiIndex for single column
7091 if is_list_like(subset) and len(subset) == 1:
7092 counts.index = MultiIndex.from_arrays(
7093 [counts.index], names=[counts.index.name]
7094 )
7095
7096 return counts
7097
7098 def nlargest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame:
7099 """
7100 Return the first `n` rows ordered by `columns` in descending order.
7101
7102 Return the first `n` rows with the largest values in `columns`, in
7103 descending order. The columns that are not specified are returned as
7104 well, but not used for ordering.
7105
7106 This method is equivalent to
7107 ``df.sort_values(columns, ascending=False).head(n)``, but more
7108 performant.
7109
7110 Parameters
7111 ----------
7112 n : int
7113 Number of rows to return.
7114 columns : label or list of labels
7115 Column label(s) to order by.
7116 keep : {'first', 'last', 'all'}, default 'first'
7117 Where there are duplicate values:
7118
7119 - ``first`` : prioritize the first occurrence(s)
7120 - ``last`` : prioritize the last occurrence(s)
            - ``all`` : do not drop any duplicates, even if it means
              selecting more than `n` items.
7123
7124 Returns
7125 -------
7126 DataFrame
7127 The first `n` rows ordered by the given columns in descending
7128 order.
7129
7130 See Also
7131 --------
7132 DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
7133 ascending order.
7134 DataFrame.sort_values : Sort DataFrame by the values.
7135 DataFrame.head : Return the first `n` rows without re-ordering.
7136
7137 Notes
7138 -----
7139 This function cannot be used with all column types. For example, when
7140 specifying columns with `object` or `category` dtypes, ``TypeError`` is
7141 raised.
7142
7143 Examples
7144 --------
7145 >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
7146 ... 434000, 434000, 337000, 11300,
7147 ... 11300, 11300],
7148 ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
7149 ... 17036, 182, 38, 311],
7150 ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
7151 ... "IS", "NR", "TV", "AI"]},
7152 ... index=["Italy", "France", "Malta",
7153 ... "Maldives", "Brunei", "Iceland",
7154 ... "Nauru", "Tuvalu", "Anguilla"])
7155 >>> df
7156 population GDP alpha-2
7157 Italy 59000000 1937894 IT
7158 France 65000000 2583560 FR
7159 Malta 434000 12011 MT
7160 Maldives 434000 4520 MV
7161 Brunei 434000 12128 BN
7162 Iceland 337000 17036 IS
7163 Nauru 11300 182 NR
7164 Tuvalu 11300 38 TV
7165 Anguilla 11300 311 AI
7166
7167 In the following example, we will use ``nlargest`` to select the three
7168 rows having the largest values in column "population".
7169
7170 >>> df.nlargest(3, 'population')
7171 population GDP alpha-2
7172 France 65000000 2583560 FR
7173 Italy 59000000 1937894 IT
7174 Malta 434000 12011 MT
7175
7176 When using ``keep='last'``, ties are resolved in reverse order:
7177
7178 >>> df.nlargest(3, 'population', keep='last')
7179 population GDP alpha-2
7180 France 65000000 2583560 FR
7181 Italy 59000000 1937894 IT
7182 Brunei 434000 12128 BN
7183
7184 When using ``keep='all'``, all duplicate items are maintained:
7185
7186 >>> df.nlargest(3, 'population', keep='all')
7187 population GDP alpha-2
7188 France 65000000 2583560 FR
7189 Italy 59000000 1937894 IT
7190 Malta 434000 12011 MT
7191 Maldives 434000 4520 MV
7192 Brunei 434000 12128 BN
7193
7194 To order by the largest values in column "population" and then "GDP",
7195 we can specify multiple columns like in the next example.
7196
7197 >>> df.nlargest(3, ['population', 'GDP'])
7198 population GDP alpha-2
7199 France 65000000 2583560 FR
7200 Italy 59000000 1937894 IT
7201 Brunei 434000 12128 BN
7202 """
7203 return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest()
7204
7205 def nsmallest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame:
7206 """
7207 Return the first `n` rows ordered by `columns` in ascending order.
7208
7209 Return the first `n` rows with the smallest values in `columns`, in
7210 ascending order. The columns that are not specified are returned as
7211 well, but not used for ordering.
7212
7213 This method is equivalent to
7214 ``df.sort_values(columns, ascending=True).head(n)``, but more
7215 performant.
7216
7217 Parameters
7218 ----------
7219 n : int
7220 Number of items to retrieve.
7221 columns : list or str
7222 Column name or names to order by.
7223 keep : {'first', 'last', 'all'}, default 'first'
7224 Where there are duplicate values:
7225
7226 - ``first`` : take the first occurrence.
7227 - ``last`` : take the last occurrence.
            - ``all`` : do not drop any duplicates, even if it means
              selecting more than `n` items.
7230
7231 Returns
7232 -------
7233 DataFrame
7234
7235 See Also
7236 --------
7237 DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
7238 descending order.
7239 DataFrame.sort_values : Sort DataFrame by the values.
7240 DataFrame.head : Return the first `n` rows without re-ordering.
7241
7242 Examples
7243 --------
7244 >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
7245 ... 434000, 434000, 337000, 337000,
7246 ... 11300, 11300],
7247 ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
7248 ... 17036, 182, 38, 311],
7249 ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
7250 ... "IS", "NR", "TV", "AI"]},
7251 ... index=["Italy", "France", "Malta",
7252 ... "Maldives", "Brunei", "Iceland",
7253 ... "Nauru", "Tuvalu", "Anguilla"])
7254 >>> df
7255 population GDP alpha-2
7256 Italy 59000000 1937894 IT
7257 France 65000000 2583560 FR
7258 Malta 434000 12011 MT
7259 Maldives 434000 4520 MV
7260 Brunei 434000 12128 BN
7261 Iceland 337000 17036 IS
7262 Nauru 337000 182 NR
7263 Tuvalu 11300 38 TV
7264 Anguilla 11300 311 AI
7265
7266 In the following example, we will use ``nsmallest`` to select the
7267 three rows having the smallest values in column "population".
7268
7269 >>> df.nsmallest(3, 'population')
7270 population GDP alpha-2
7271 Tuvalu 11300 38 TV
7272 Anguilla 11300 311 AI
7273 Iceland 337000 17036 IS
7274
7275 When using ``keep='last'``, ties are resolved in reverse order:
7276
7277 >>> df.nsmallest(3, 'population', keep='last')
7278 population GDP alpha-2
7279 Anguilla 11300 311 AI
7280 Tuvalu 11300 38 TV
7281 Nauru 337000 182 NR
7282
7283 When using ``keep='all'``, all duplicate items are maintained:
7284
7285 >>> df.nsmallest(3, 'population', keep='all')
7286 population GDP alpha-2
7287 Tuvalu 11300 38 TV
7288 Anguilla 11300 311 AI
7289 Iceland 337000 17036 IS
7290 Nauru 337000 182 NR
7291
7292 To order by the smallest values in column "population" and then "GDP", we can
7293 specify multiple columns like in the next example.
7294
7295 >>> df.nsmallest(3, ['population', 'GDP'])
7296 population GDP alpha-2
7297 Tuvalu 11300 38 TV
7298 Anguilla 11300 311 AI
7299 Nauru 337000 182 NR
7300 """
7301 return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nsmallest()
7302
7303 @doc(
7304 Series.swaplevel,
7305 klass=_shared_doc_kwargs["klass"],
7306 extra_params=dedent(
7307 """axis : {0 or 'index', 1 or 'columns'}, default 0
7308 The axis to swap levels on. 0 or 'index' for row-wise, 1 or
7309 'columns' for column-wise."""
7310 ),
7311 examples=dedent(
7312 """\
7313 Examples
7314 --------
7315 >>> df = pd.DataFrame(
7316 ... {"Grade": ["A", "B", "A", "C"]},
7317 ... index=[
7318 ... ["Final exam", "Final exam", "Coursework", "Coursework"],
7319 ... ["History", "Geography", "History", "Geography"],
7320 ... ["January", "February", "March", "April"],
7321 ... ],
7322 ... )
7323 >>> df
7324 Grade
7325 Final exam History January A
7326 Geography February B
7327 Coursework History March A
7328 Geography April C
7329
        In the following example, we will swap the levels of the index.
        Here, we swap the levels row-wise, which is the default behaviour;
        levels can be swapped column-wise in a similar manner by passing
        ``axis=1``. By not supplying any arguments for i and j, we swap the
        last and second-to-last levels.
7335
7336 >>> df.swaplevel()
7337 Grade
7338 Final exam January History A
7339 February Geography B
7340 Coursework March History A
7341 April Geography C
7342
7343 By supplying one argument, we can choose which index to swap the last
7344 index with. We can for example swap the first index with the last one as
7345 follows.
7346
7347 >>> df.swaplevel(0)
7348 Grade
7349 January History Final exam A
7350 February Geography Final exam B
7351 March History Coursework A
7352 April Geography Coursework C
7353
7354 We can also define explicitly which indices we want to swap by supplying values
7355 for both i and j. Here, we for example swap the first and second indices.
7356
7357 >>> df.swaplevel(0, 1)
7358 Grade
7359 History Final exam January A
7360 Geography Final exam February B
7361 History Coursework March A
7362 Geography Coursework April C"""
7363 ),
7364 )
7365 def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame:
7366 result = self.copy(deep=None)
7367
7368 axis = self._get_axis_number(axis)
7369
7370 if not isinstance(result._get_axis(axis), MultiIndex): # pragma: no cover
7371 raise TypeError("Can only swap levels on a hierarchical axis.")
7372
7373 if axis == 0:
7374 assert isinstance(result.index, MultiIndex)
7375 result.index = result.index.swaplevel(i, j)
7376 else:
7377 assert isinstance(result.columns, MultiIndex)
7378 result.columns = result.columns.swaplevel(i, j)
7379 return result
7380
7381 def reorder_levels(self, order: Sequence[int | str], axis: Axis = 0) -> DataFrame:
7382 """
7383 Rearrange index levels using input order. May not drop or duplicate levels.
7384
7385 Parameters
7386 ----------
7387 order : list of int or list of str
7388 List representing new level order. Reference level by number
7389 (position) or by key (label).
7390 axis : {0 or 'index', 1 or 'columns'}, default 0
7391 Where to reorder levels.
7392
7393 Returns
7394 -------
7395 DataFrame
7396
7397 Examples
7398 --------
7399 >>> data = {
7400 ... "class": ["Mammals", "Mammals", "Reptiles"],
7401 ... "diet": ["Omnivore", "Carnivore", "Carnivore"],
7402 ... "species": ["Humans", "Dogs", "Snakes"],
7403 ... }
7404 >>> df = pd.DataFrame(data, columns=["class", "diet", "species"])
7405 >>> df = df.set_index(["class", "diet"])
7406 >>> df
7407 species
7408 class diet
7409 Mammals Omnivore Humans
7410 Carnivore Dogs
7411 Reptiles Carnivore Snakes
7412
7413 Let's reorder the levels of the index:
7414
7415 >>> df.reorder_levels(["diet", "class"])
7416 species
7417 diet class
7418 Omnivore Mammals Humans
7419 Carnivore Mammals Dogs
7420 Reptiles Snakes
7421 """
7422 axis = self._get_axis_number(axis)
7423 if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover
7424 raise TypeError("Can only reorder levels on a hierarchical axis.")
7425
7426 result = self.copy(deep=None)
7427
7428 if axis == 0:
7429 assert isinstance(result.index, MultiIndex)
7430 result.index = result.index.reorder_levels(order)
7431 else:
7432 assert isinstance(result.columns, MultiIndex)
7433 result.columns = result.columns.reorder_levels(order)
7434 return result
7435
7436 # ----------------------------------------------------------------------
7437 # Arithmetic Methods
7438
7439 def _cmp_method(self, other, op):
7440 axis: Literal[1] = 1 # only relevant for Series other case
7441
7442 self, other = ops.align_method_FRAME(self, other, axis, flex=False, level=None)
7443
7444 # See GH#4537 for discussion of scalar op behavior
7445 new_data = self._dispatch_frame_op(other, op, axis=axis)
7446 return self._construct_result(new_data)
7447
7448 def _arith_method(self, other, op):
7449 if ops.should_reindex_frame_op(self, other, op, 1, None, None):
7450 return ops.frame_arith_method_with_reindex(self, other, op)
7451
7452 axis: Literal[1] = 1 # only relevant for Series other case
7453 other = ops.maybe_prepare_scalar_for_op(other, (self.shape[axis],))
7454
7455 self, other = ops.align_method_FRAME(self, other, axis, flex=True, level=None)
7456
7457 new_data = self._dispatch_frame_op(other, op, axis=axis)
7458 return self._construct_result(new_data)
7459
7460 _logical_method = _arith_method
7461
7462 def _dispatch_frame_op(self, right, func: Callable, axis: AxisInt | None = None):
7463 """
7464 Evaluate the frame operation func(left, right) by evaluating
7465 column-by-column, dispatching to the Series implementation.
7466
7467 Parameters
7468 ----------
7469 right : scalar, Series, or DataFrame
7470 func : arithmetic or comparison operator
7471 axis : {None, 0, 1}
7472
7473 Returns
7474 -------
7475 DataFrame
7476 """
7477 # Get the appropriate array-op to apply to each column/block's values.
7478 array_op = ops.get_array_op(func)
7479
7480 right = lib.item_from_zerodim(right)
7481 if not is_list_like(right):
7482 # i.e. scalar, faster than checking np.ndim(right) == 0
7483 with np.errstate(all="ignore"):
7484 bm = self._mgr.apply(array_op, right=right)
7485 return self._constructor(bm)
7486
7487 elif isinstance(right, DataFrame):
7488 assert self.index.equals(right.index)
7489 assert self.columns.equals(right.columns)
7490 # TODO: The previous assertion `assert right._indexed_same(self)`
7491 # fails in cases with empty columns reached via
7492 # _frame_arith_method_with_reindex
7493
7494 # TODO operate_blockwise expects a manager of the same type
7495 with np.errstate(all="ignore"):
7496 bm = self._mgr.operate_blockwise(
7497 # error: Argument 1 to "operate_blockwise" of "ArrayManager" has
7498 # incompatible type "Union[ArrayManager, BlockManager]"; expected
7499 # "ArrayManager"
7500 # error: Argument 1 to "operate_blockwise" of "BlockManager" has
7501 # incompatible type "Union[ArrayManager, BlockManager]"; expected
7502 # "BlockManager"
7503 right._mgr, # type: ignore[arg-type]
7504 array_op,
7505 )
7506 return self._constructor(bm)
7507
7508 elif isinstance(right, Series) and axis == 1:
7509 # axis=1 means we want to operate row-by-row
7510 assert right.index.equals(self.columns)
7511
7512 right = right._values
7513 # maybe_align_as_frame ensures we do not have an ndarray here
7514 assert not isinstance(right, np.ndarray)
7515
7516 with np.errstate(all="ignore"):
7517 arrays = [
7518 array_op(_left, _right)
7519 for _left, _right in zip(self._iter_column_arrays(), right)
7520 ]
7521
7522 elif isinstance(right, Series):
7523 assert right.index.equals(self.index) # Handle other cases later
7524 right = right._values
7525
7526 with np.errstate(all="ignore"):
7527 arrays = [array_op(left, right) for left in self._iter_column_arrays()]
7528
7529 else:
7530 # Remaining cases have less-obvious dispatch rules
7531 raise NotImplementedError(right)
7532
7533 return type(self)._from_arrays(
7534 arrays, self.columns, self.index, verify_integrity=False
7535 )
7536
7537 def _combine_frame(self, other: DataFrame, func, fill_value=None):
7538 # at this point we have `self._indexed_same(other)`
7539
7540 if fill_value is None:
7541 # since _arith_op may be called in a loop, avoid function call
7542 # overhead if possible by doing this check once
7543 _arith_op = func
7544
7545 else:
7546
7547 def _arith_op(left, right):
7548 # for the mixed_type case where we iterate over columns,
7549 # _arith_op(left, right) is equivalent to
7550 # left._binop(right, func, fill_value=fill_value)
7551 left, right = ops.fill_binop(left, right, fill_value)
7552 return func(left, right)
7553
7554 new_data = self._dispatch_frame_op(other, _arith_op)
7555 return new_data
7556
7557 def _construct_result(self, result) -> DataFrame:
7558 """
7559 Wrap the result of an arithmetic, comparison, or logical operation.
7560
7561 Parameters
7562 ----------
7563 result : DataFrame
7564
7565 Returns
7566 -------
7567 DataFrame
7568 """
7569 out = self._constructor(result, copy=False).__finalize__(self)
7570 # Pin columns instead of passing to constructor for compat with
7571 # non-unique columns case
7572 out.columns = self.columns
7573 out.index = self.index
7574 return out
7575
7576 def __divmod__(self, other) -> tuple[DataFrame, DataFrame]:
7577 # Naive implementation, room for optimization
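        # e.g. divmod(df, 3) returns (df // 3, df % 3), computed elementwise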
7578 div = self // other
7579 mod = self - div * other
7580 return div, mod
7581
7582 def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]:
7583 # Naive implementation, room for optimization
7584 div = other // self
7585 mod = other - div * self
7586 return div, mod
7587
7588 # ----------------------------------------------------------------------
7589 # Combination-Related
7590
7591 @doc(
7592 _shared_docs["compare"],
7593 """
7594Returns
7595-------
7596DataFrame
7597 DataFrame that shows the differences stacked side by side.
7598
7599 The resulting index will be a MultiIndex with 'self' and 'other'
7600 stacked alternately at the inner level.
7601
7602Raises
7603------
7604ValueError
7605 When the two DataFrames don't have identical labels or shape.
7606
7607See Also
7608--------
7609Series.compare : Compare with another Series and show differences.
7610DataFrame.equals : Test whether two objects contain the same elements.
7611
7612Notes
7613-----
7614Matching NaNs will not appear as a difference.
7615
Can only compare identically-labeled
(i.e. same shape, identical row and column labels) DataFrames.
7618
7619Examples
7620--------
7621>>> df = pd.DataFrame(
7622... {{
7623... "col1": ["a", "a", "b", "b", "a"],
7624... "col2": [1.0, 2.0, 3.0, np.nan, 5.0],
7625... "col3": [1.0, 2.0, 3.0, 4.0, 5.0]
7626... }},
7627... columns=["col1", "col2", "col3"],
7628... )
7629>>> df
7630 col1 col2 col3
76310 a 1.0 1.0
76321 a 2.0 2.0
76332 b 3.0 3.0
76343 b NaN 4.0
76354 a 5.0 5.0
7636
7637>>> df2 = df.copy()
7638>>> df2.loc[0, 'col1'] = 'c'
7639>>> df2.loc[2, 'col3'] = 4.0
7640>>> df2
7641 col1 col2 col3
76420 c 1.0 1.0
76431 a 2.0 2.0
76442 b 3.0 4.0
76453 b NaN 4.0
76464 a 5.0 5.0
7647
7648Align the differences on columns
7649
7650>>> df.compare(df2)
7651 col1 col3
7652 self other self other
76530 a c NaN NaN
76542 NaN NaN 3.0 4.0
7655
7656Assign result_names
7657
7658>>> df.compare(df2, result_names=("left", "right"))
7659 col1 col3
7660 left right left right
76610 a c NaN NaN
76622 NaN NaN 3.0 4.0
7663
7664Stack the differences on rows
7665
7666>>> df.compare(df2, align_axis=0)
7667 col1 col3
76680 self a NaN
7669 other c NaN
76702 self NaN 3.0
7671 other NaN 4.0
7672
7673Keep the equal values
7674
7675>>> df.compare(df2, keep_equal=True)
7676 col1 col3
7677 self other self other
76780 a c 1.0 1.0
76792 b b 3.0 4.0
7680
7681Keep all original rows and columns
7682
7683>>> df.compare(df2, keep_shape=True)
7684 col1 col2 col3
7685 self other self other self other
76860 a c NaN NaN NaN NaN
76871 NaN NaN NaN NaN NaN NaN
76882 NaN NaN NaN NaN 3.0 4.0
76893 NaN NaN NaN NaN NaN NaN
76904 NaN NaN NaN NaN NaN NaN
7691
7692Keep all original rows and columns and also all original values
7693
7694>>> df.compare(df2, keep_shape=True, keep_equal=True)
7695 col1 col2 col3
7696 self other self other self other
76970 a c 1.0 1.0 1.0 1.0
76981 a a 2.0 2.0 2.0 2.0
76992 b b 3.0 3.0 3.0 4.0
77003 b b NaN NaN 4.0 4.0
77014 a a 5.0 5.0 5.0 5.0
7702""",
7703 klass=_shared_doc_kwargs["klass"],
7704 )
7705 def compare(
7706 self,
7707 other: DataFrame,
7708 align_axis: Axis = 1,
7709 keep_shape: bool = False,
7710 keep_equal: bool = False,
7711 result_names: Suffixes = ("self", "other"),
7712 ) -> DataFrame:
7713 return super().compare(
7714 other=other,
7715 align_axis=align_axis,
7716 keep_shape=keep_shape,
7717 keep_equal=keep_equal,
7718 result_names=result_names,
7719 )
7720
7721 def combine(
7722 self,
7723 other: DataFrame,
7724 func: Callable[[Series, Series], Series | Hashable],
7725 fill_value=None,
7726 overwrite: bool = True,
7727 ) -> DataFrame:
7728 """
7729 Perform column-wise combine with another DataFrame.
7730
7731 Combines a DataFrame with `other` DataFrame using `func`
7732 to element-wise combine columns. The row and column indexes of the
7733 resulting DataFrame will be the union of the two.
7734
7735 Parameters
7736 ----------
7737 other : DataFrame
7738 The DataFrame to merge column-wise.
        func : function
            Function that takes two Series as inputs and returns a Series or a
            scalar. Used to merge the two dataframes column by column.
7742 fill_value : scalar value, default None
7743 The value to fill NaNs with prior to passing any column to the
7744 merge func.
7745 overwrite : bool, default True
7746 If True, columns in `self` that do not exist in `other` will be
7747 overwritten with NaNs.
7748
7749 Returns
7750 -------
7751 DataFrame
7752 Combination of the provided DataFrames.
7753
7754 See Also
7755 --------
7756 DataFrame.combine_first : Combine two DataFrame objects and default to
7757 non-null values in frame calling the method.
7758
7759 Examples
7760 --------
7761 Combine using a simple function that chooses the smaller column.
7762
7763 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
7764 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
7765 >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
7766 >>> df1.combine(df2, take_smaller)
7767 A B
7768 0 0 3
7769 1 0 3
7770
7771 Example using a true element-wise combine function.
7772
7773 >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]})
7774 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
7775 >>> df1.combine(df2, np.minimum)
7776 A B
7777 0 1 2
7778 1 0 3
7779
7780 Using `fill_value` fills Nones prior to passing the column to the
7781 merge function.
7782
7783 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
7784 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
7785 >>> df1.combine(df2, take_smaller, fill_value=-5)
7786 A B
7787 0 0 -5.0
7788 1 0 4.0
7789
        However, if the same element in both dataframes is None, that None
        is preserved.
7792
7793 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
7794 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]})
7795 >>> df1.combine(df2, take_smaller, fill_value=-5)
7796 A B
7797 0 0 -5.0
7798 1 0 3.0
7799
        Example that demonstrates the use of `overwrite` and behavior when
        the axes differ between the dataframes.
7802
7803 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
7804 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2])
7805 >>> df1.combine(df2, take_smaller)
7806 A B C
7807 0 NaN NaN NaN
7808 1 NaN 3.0 -10.0
7809 2 NaN 3.0 1.0
7810
7811 >>> df1.combine(df2, take_smaller, overwrite=False)
7812 A B C
7813 0 0.0 NaN NaN
7814 1 0.0 3.0 -10.0
7815 2 NaN 3.0 1.0
7816
        Demonstrating the preference of the passed-in dataframe.
7818
7819 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2])
7820 >>> df2.combine(df1, take_smaller)
7821 A B C
7822 0 0.0 NaN NaN
7823 1 0.0 3.0 NaN
7824 2 NaN 3.0 NaN
7825
7826 >>> df2.combine(df1, take_smaller, overwrite=False)
7827 A B C
7828 0 0.0 NaN NaN
7829 1 0.0 3.0 1.0
7830 2 NaN 3.0 1.0
7831 """
7832 other_idxlen = len(other.index) # save for compare
7833
7834 this, other = self.align(other, copy=False)
7835 new_index = this.index
7836
7837 if other.empty and len(new_index) == len(self.index):
7838 return self.copy()
7839
7840 if self.empty and len(other) == other_idxlen:
7841 return other.copy()
7842
7843 # sorts if possible; otherwise align above ensures that these are set-equal
7844 new_columns = this.columns.union(other.columns)
7845 do_fill = fill_value is not None
7846 result = {}
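        # Combine column by column over the union of columns; after the align
        # above, `this` and `other` share the same row index.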
7847 for col in new_columns:
7848 series = this[col]
7849 other_series = other[col]
7850
7851 this_dtype = series.dtype
7852 other_dtype = other_series.dtype
7853
7854 this_mask = isna(series)
7855 other_mask = isna(other_series)
7856
7857 # don't overwrite columns unnecessarily
7858 # DO propagate if this column is not in the intersection
7859 if not overwrite and other_mask.all():
7860 result[col] = this[col].copy()
7861 continue
7862
7863 if do_fill:
7864 series = series.copy()
7865 other_series = other_series.copy()
7866 series[this_mask] = fill_value
7867 other_series[other_mask] = fill_value
7868
7869 if col not in self.columns:
                # If col is only present in `other`, then `series` (taken from
                # the aligned `this`) is all-NaN, so try casting it to
                # other's dtype.
7872 new_dtype = other_dtype
7873 try:
7874 series = series.astype(new_dtype, copy=False)
7875 except ValueError:
                    # e.g. new_dtype is an integer dtype that cannot hold NaN
7877 pass
7878 else:
7879 # if we have different dtypes, possibly promote
7880 new_dtype = find_common_type([this_dtype, other_dtype])
7881 series = series.astype(new_dtype, copy=False)
7882 other_series = other_series.astype(new_dtype, copy=False)
7883
7884 arr = func(series, other_series)
7885 if isinstance(new_dtype, np.dtype):
7886 # if new_dtype is an EA Dtype, then `func` is expected to return
7887 # the correct dtype without any additional casting
7888 # error: No overload variant of "maybe_downcast_to_dtype" matches
7889 # argument types "Union[Series, Hashable]", "dtype[Any]"
7890 arr = maybe_downcast_to_dtype( # type: ignore[call-overload]
7891 arr, new_dtype
7892 )
7893
7894 result[col] = arr
7895
        # let the constructor infer dtypes for the combined result
7897 return self._constructor(result, index=new_index, columns=new_columns)
7898
7899 def combine_first(self, other: DataFrame) -> DataFrame:
7900 """
7901 Update null elements with value in the same location in `other`.
7902
        Combine two DataFrame objects by filling null values in one DataFrame
        with non-null values from the other DataFrame. The row and column
        indexes of the resulting DataFrame will be the union of the two. When
        calling ``first.combine_first(second)``, the result keeps the values
        of `first` and takes values from `second` only where the corresponding
        ``first.loc[index, col]`` value is missing.
7910
7911 Parameters
7912 ----------
7913 other : DataFrame
7914 Provided DataFrame to use to fill null values.
7915
7916 Returns
7917 -------
7918 DataFrame
7919 The result of combining the provided DataFrame with the other object.
7920
7921 See Also
7922 --------
7923 DataFrame.combine : Perform series-wise operation on two DataFrames
7924 using a given function.
7925
7926 Examples
7927 --------
7928 >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
7929 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
7930 >>> df1.combine_first(df2)
7931 A B
7932 0 1.0 3.0
7933 1 0.0 4.0
7934
        Null values still persist if the location of that null value
        does not exist in `other`.
7937
7938 >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]})
7939 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
7940 >>> df1.combine_first(df2)
7941 A B C
7942 0 NaN 4.0 NaN
7943 1 0.0 3.0 1.0
7944 2 NaN 3.0 1.0
7945 """
7946 from pandas.core.computation import expressions
7947
7948 def combiner(x, y):
7949 mask = extract_array(isna(x))
7950
7951 x_values = extract_array(x, extract_numpy=True)
7952 y_values = extract_array(y, extract_numpy=True)
7953
7954 # If the column y in other DataFrame is not in first DataFrame,
7955 # just return y_values.
7956 if y.name not in self.columns:
7957 return y_values
7958
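            # expressions.where(cond, a, b) selects `a` where cond is True,
            # so this takes y's values where x is NA and keeps x elsewhere.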
7959 return expressions.where(mask, y_values, x_values)
7960
7961 combined = self.combine(other, combiner, overwrite=False)
7962
7963 dtypes = {
7964 col: find_common_type([self.dtypes[col], other.dtypes[col]])
7965 for col in self.columns.intersection(other.columns)
7966 if not is_dtype_equal(combined.dtypes[col], self.dtypes[col])
7967 }
7968
7969 if dtypes:
7970 combined = combined.astype(dtypes)
7971
7972 return combined
7973
7974 def update(
7975 self,
7976 other,
7977 join: str = "left",
7978 overwrite: bool = True,
7979 filter_func=None,
7980 errors: str = "ignore",
7981 ) -> None:
7982 """
7983 Modify in place using non-NA values from another DataFrame.
7984
7985 Aligns on indices. There is no return value.
7986
7987 Parameters
7988 ----------
7989 other : DataFrame, or object coercible into a DataFrame
7990 Should have at least one matching index/column label
7991 with the original DataFrame. If a Series is passed,
7992 its name attribute must be set, and that will be
7993 used as the column name to align with the original DataFrame.
7994 join : {'left'}, default 'left'
7995 Only left join is implemented, keeping the index and columns of the
7996 original object.
7997 overwrite : bool, default True
7998 How to handle non-NA values for overlapping keys:
7999
8000 * True: overwrite original DataFrame's values
8001 with values from `other`.
8002 * False: only update values that are NA in
8003 the original DataFrame.
8004
8005 filter_func : callable(1d-array) -> bool 1d-array, optional
8006 Can choose to replace values other than NA. Return True for values
8007 that should be updated.
8008 errors : {'raise', 'ignore'}, default 'ignore'
8009 If 'raise', will raise a ValueError if the DataFrame and `other`
8010 both contain non-NA data in the same place.
8011
8012 Returns
8013 -------
8014 None
8015 This method directly changes calling object.
8016
8017 Raises
8018 ------
8019 ValueError
8020 * When `errors='raise'` and there's overlapping non-NA data.
8021 * When `errors` is not either `'ignore'` or `'raise'`
8022 NotImplementedError
8023 * If `join != 'left'`
8024
8025 See Also
8026 --------
8027 dict.update : Similar method for dictionaries.
8028 DataFrame.merge : For column(s)-on-column(s) operations.
8029
8030 Examples
8031 --------
8032 >>> df = pd.DataFrame({'A': [1, 2, 3],
8033 ... 'B': [400, 500, 600]})
8034 >>> new_df = pd.DataFrame({'B': [4, 5, 6],
8035 ... 'C': [7, 8, 9]})
8036 >>> df.update(new_df)
8037 >>> df
8038 A B
8039 0 1 4
8040 1 2 5
8041 2 3 6
8042
8043 The DataFrame's length does not increase as a result of the update,
8044 only values at matching index/column labels are updated.
8045
8046 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
8047 ... 'B': ['x', 'y', 'z']})
8048 >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
8049 >>> df.update(new_df)
8050 >>> df
8051 A B
8052 0 a d
8053 1 b e
8054 2 c f
8055
8056 For Series, its name attribute must be set.
8057
8058 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
8059 ... 'B': ['x', 'y', 'z']})
8060 >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2])
8061 >>> df.update(new_column)
8062 >>> df
8063 A B
8064 0 a d
8065 1 b y
8066 2 c e
8067 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
8068 ... 'B': ['x', 'y', 'z']})
8069 >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2])
8070 >>> df.update(new_df)
8071 >>> df
8072 A B
8073 0 a x
8074 1 b d
8075 2 c e
8076
8077 If `other` contains NaNs the corresponding values are not updated
8078 in the original dataframe.
8079
8080 >>> df = pd.DataFrame({'A': [1, 2, 3],
8081 ... 'B': [400, 500, 600]})
8082 >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
8083 >>> df.update(new_df)
8084 >>> df
8085 A B
8086 0 1 4
8087 1 2 500
8088 2 3 6
8089 """
8090 from pandas.core.computation import expressions
8091
8092 # TODO: Support other joins
8093 if join != "left": # pragma: no cover
8094 raise NotImplementedError("Only left join is supported")
8095 if errors not in ["ignore", "raise"]:
8096 raise ValueError("The parameter errors must be either 'ignore' or 'raise'")
8097
8098 if not isinstance(other, DataFrame):
8099 other = DataFrame(other)
8100
8101 other = other.reindex(self.index)
8102
8103 for col in self.columns.intersection(other.columns):
8104 this = self[col]._values
8105 that = other[col]._values
8106
8107 if filter_func is not None:
8108 with np.errstate(all="ignore"):
8109 mask = ~filter_func(this) | isna(that)
8110 else:
8111 if errors == "raise":
8112 mask_this = notna(that)
8113 mask_that = notna(this)
8114 if any(mask_this & mask_that):
8115 raise ValueError("Data overlaps.")
8116
8117 if overwrite:
8118 mask = isna(that)
8119 else:
8120 mask = notna(this)
8121
8122 # don't overwrite columns unnecessarily
8123 if mask.all():
8124 continue
8125
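            # `mask` is True where the existing value should be kept; values
            # from `other` are taken everywhere else.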
8126 self.loc[:, col] = expressions.where(mask, this, that)
8127
8128 # ----------------------------------------------------------------------
8129 # Data reshaping
8130 @Appender(
8131 """
8132Examples
8133--------
8134>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
8135... 'Parrot', 'Parrot'],
8136... 'Max Speed': [380., 370., 24., 26.]})
8137>>> df
8138 Animal Max Speed
81390 Falcon 380.0
81401 Falcon 370.0
81412 Parrot 24.0
81423 Parrot 26.0
8143>>> df.groupby(['Animal']).mean()
8144 Max Speed
8145Animal
8146Falcon 375.0
8147Parrot 25.0
8148
8149**Hierarchical Indexes**
8150
8151We can groupby different levels of a hierarchical index
8152using the `level` parameter:
8153
8154>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
8155... ['Captive', 'Wild', 'Captive', 'Wild']]
8156>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
8157>>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]},
8158... index=index)
8159>>> df
8160 Max Speed
8161Animal Type
8162Falcon Captive 390.0
8163 Wild 350.0
8164Parrot Captive 30.0
8165 Wild 20.0
8166>>> df.groupby(level=0).mean()
8167 Max Speed
8168Animal
8169Falcon 370.0
8170Parrot 25.0
8171>>> df.groupby(level="Type").mean()
8172 Max Speed
8173Type
8174Captive 210.0
8175Wild 185.0
8176
We can also choose to include NA in group keys or not by setting the
`dropna` parameter; the default setting is `True`.
8179
8180>>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
8181>>> df = pd.DataFrame(l, columns=["a", "b", "c"])
8182
8183>>> df.groupby(by=["b"]).sum()
8184 a c
8185b
81861.0 2 3
81872.0 2 5
8188
8189>>> df.groupby(by=["b"], dropna=False).sum()
8190 a c
8191b
81921.0 2 3
81932.0 2 5
8194NaN 1 4
8195
8196>>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
8197>>> df = pd.DataFrame(l, columns=["a", "b", "c"])
8198
8199>>> df.groupby(by="a").sum()
8200 b c
8201a
8202a 13.0 13.0
8203b 12.3 123.0
8204
8205>>> df.groupby(by="a", dropna=False).sum()
8206 b c
8207a
8208a 13.0 13.0
8209b 12.3 123.0
8210NaN 12.3 33.0
8211
8212When using ``.apply()``, use ``group_keys`` to include or exclude the group keys.
8213The ``group_keys`` argument defaults to ``True`` (include).
8214
8215>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
8216... 'Parrot', 'Parrot'],
8217... 'Max Speed': [380., 370., 24., 26.]})
8218>>> df.groupby("Animal", group_keys=True).apply(lambda x: x)
8219 Animal Max Speed
8220Animal
8221Falcon 0 Falcon 380.0
8222 1 Falcon 370.0
8223Parrot 2 Parrot 24.0
8224 3 Parrot 26.0
8225
8226>>> df.groupby("Animal", group_keys=False).apply(lambda x: x)
8227 Animal Max Speed
82280 Falcon 380.0
82291 Falcon 370.0
82302 Parrot 24.0
82313 Parrot 26.0
8232"""
8233 )
8234 @Appender(_shared_docs["groupby"] % _shared_doc_kwargs)
8235 def groupby(
8236 self,
8237 by=None,
8238 axis: Axis = 0,
8239 level: IndexLabel | None = None,
8240 as_index: bool = True,
8241 sort: bool = True,
8242 group_keys: bool = True,
8243 observed: bool = False,
8244 dropna: bool = True,
8245 ) -> DataFrameGroupBy:
8246 from pandas.core.groupby.generic import DataFrameGroupBy
8247
8248 if level is None and by is None:
8249 raise TypeError("You have to supply one of 'by' and 'level'")
8250 axis = self._get_axis_number(axis)
8251
8252 return DataFrameGroupBy(
8253 obj=self,
8254 keys=by,
8255 axis=axis,
8256 level=level,
8257 as_index=as_index,
8258 sort=sort,
8259 group_keys=group_keys,
8260 observed=observed,
8261 dropna=dropna,
8262 )
8263
8264 _shared_docs[
8265 "pivot"
8266 ] = """
8267 Return reshaped DataFrame organized by given index / column values.
8268
8269 Reshape data (produce a "pivot" table) based on column values. Uses
8270 unique values from specified `index` / `columns` to form axes of the
8271 resulting DataFrame. This function does not support data
    aggregation; multiple values will result in a MultiIndex in the
8273 columns. See the :ref:`User Guide <reshaping>` for more on reshaping.
8274
8275 Parameters
8276 ----------%s
8277 columns : str or object or a list of str
8278 Column to use to make new frame's columns.
8279
8280 .. versionchanged:: 1.1.0
           Also accept a list of column names.
8282
8283 index : str or object or a list of str, optional
8284 Column to use to make new frame's index. If not given, uses existing index.
8285
8286 .. versionchanged:: 1.1.0
           Also accept a list of index names.
8288
8289 values : str, object or a list of the previous, optional
8290 Column(s) to use for populating new frame's values. If not
8291 specified, all remaining columns will be used and the result will
8292 have hierarchically indexed columns.
8293
8294 Returns
8295 -------
8296 DataFrame
8297 Returns reshaped DataFrame.
8298
8299 Raises
8300 ------
8301 ValueError:
        When there are any `index`, `columns` combinations with multiple
        values. Use `DataFrame.pivot_table` when you need to aggregate.
8304
8305 See Also
8306 --------
8307 DataFrame.pivot_table : Generalization of pivot that can handle
8308 duplicate values for one index/column pair.
8309 DataFrame.unstack : Pivot based on the index values instead of a
8310 column.
8311 wide_to_long : Wide panel to long format. Less flexible but more
8312 user-friendly than melt.
8313
8314 Notes
8315 -----
8316 For finer-tuned control, see hierarchical indexing documentation along
8317 with the related stack/unstack methods.
8318
8319 Reference :ref:`the user guide <reshaping.pivot>` for more examples.
8320
8321 Examples
8322 --------
8323 >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
8324 ... 'two'],
8325 ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
8326 ... 'baz': [1, 2, 3, 4, 5, 6],
8327 ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
8328 >>> df
8329 foo bar baz zoo
8330 0 one A 1 x
8331 1 one B 2 y
8332 2 one C 3 z
8333 3 two A 4 q
8334 4 two B 5 w
8335 5 two C 6 t
8336
8337 >>> df.pivot(index='foo', columns='bar', values='baz')
8338 bar A B C
8339 foo
8340 one 1 2 3
8341 two 4 5 6
8342
8343 >>> df.pivot(index='foo', columns='bar')['baz']
8344 bar A B C
8345 foo
8346 one 1 2 3
8347 two 4 5 6
8348
8349 >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
8350 baz zoo
8351 bar A B C A B C
8352 foo
8353 one 1 2 3 x y z
8354 two 4 5 6 q w t
8355
8356 You could also assign a list of column names or a list of index names.
8357
8358 >>> df = pd.DataFrame({
8359 ... "lev1": [1, 1, 1, 2, 2, 2],
8360 ... "lev2": [1, 1, 2, 1, 1, 2],
8361 ... "lev3": [1, 2, 1, 2, 1, 2],
8362 ... "lev4": [1, 2, 3, 4, 5, 6],
8363 ... "values": [0, 1, 2, 3, 4, 5]})
8364 >>> df
8365 lev1 lev2 lev3 lev4 values
8366 0 1 1 1 1 0
8367 1 1 1 2 2 1
8368 2 1 2 1 3 2
8369 3 2 1 2 4 3
8370 4 2 1 1 5 4
8371 5 2 2 2 6 5
8372
8373 >>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values")
8374 lev2 1 2
8375 lev3 1 2 1 2
8376 lev1
8377 1 0.0 1.0 2.0 NaN
8378 2 4.0 3.0 NaN 5.0
8379
8380 >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values")
8381 lev3 1 2
8382 lev1 lev2
8383 1 1 0.0 1.0
8384 2 2.0 NaN
8385 2 1 4.0 3.0
8386 2 NaN 5.0
8387
8388 A ValueError is raised if there are any duplicates.
8389
8390 >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
8391 ... "bar": ['A', 'A', 'B', 'C'],
8392 ... "baz": [1, 2, 3, 4]})
8393 >>> df
8394 foo bar baz
8395 0 one A 1
8396 1 one A 2
8397 2 two B 3
8398 3 two C 4
8399
8400 Notice that the first two rows are the same for our `index`
8401 and `columns` arguments.
8402
8403 >>> df.pivot(index='foo', columns='bar', values='baz')
8404 Traceback (most recent call last):
8405 ...
8406 ValueError: Index contains duplicate entries, cannot reshape
8407 """
8408
8409 @Substitution("")
8410 @Appender(_shared_docs["pivot"])
8411 def pivot(self, *, columns, index=lib.NoDefault, values=lib.NoDefault) -> DataFrame:
8412 from pandas.core.reshape.pivot import pivot
8413
8414 return pivot(self, index=index, columns=columns, values=values)
8415
8416 _shared_docs[
8417 "pivot_table"
8418 ] = """
8419 Create a spreadsheet-style pivot table as a DataFrame.
8420
8421 The levels in the pivot table will be stored in MultiIndex objects
8422 (hierarchical indexes) on the index and columns of the result DataFrame.
8423
8424 Parameters
8425 ----------%s
8426 values : list-like or scalar, optional
8427 Column or columns to aggregate.
8428 index : column, Grouper, array, or list of the previous
8429 If an array is passed, it must be the same length as the data. The
8430 list can contain any of the other types (except list).
        Keys to group by on the pivot table index. If an array is passed,
        it is used in the same manner as column values.
8433 columns : column, Grouper, array, or list of the previous
8434 If an array is passed, it must be the same length as the data. The
8435 list can contain any of the other types (except list).
        Keys to group by on the pivot table column. If an array is passed,
        it is used in the same manner as column values.
    aggfunc : function, list of functions, dict, default "mean"
        If a list of functions is passed, the resulting pivot table will have
        hierarchical columns whose top level are the function names
        (inferred from the function objects themselves).
        If a dict is passed, the key is the column to aggregate and the value
        is the function or list of functions. If ``margins=True``,
        aggfunc will be used to calculate the partial aggregates.
8445 fill_value : scalar, default None
8446 Value to replace missing values with (in the resulting pivot table,
8447 after aggregation).
8448 margins : bool, default False
8449 If ``margins=True``, special ``All`` columns and rows
8450 will be added with partial group aggregates across the categories
8451 on the rows and columns.
8452 dropna : bool, default True
8453 Do not include columns whose entries are all NaN. If True,
8454 rows with a NaN value in any column will be omitted before
8455 computing margins.
8456 margins_name : str, default 'All'
8457 Name of the row / column that will contain the totals
8458 when margins is True.
8459 observed : bool, default False
8460 This only applies if any of the groupers are Categoricals.
8461 If True: only show observed values for categorical groupers.
8462 If False: show all values for categorical groupers.
8463
8464 sort : bool, default True
8465 Specifies if the result should be sorted.
8466
8467 .. versionadded:: 1.3.0
8468
8469 Returns
8470 -------
8471 DataFrame
8472 An Excel style pivot table.
8473
8474 See Also
8475 --------
8476 DataFrame.pivot : Pivot without aggregation that can handle
8477 non-numeric data.
8478 DataFrame.melt: Unpivot a DataFrame from wide to long format,
8479 optionally leaving identifiers set.
8480 wide_to_long : Wide panel to long format. Less flexible but more
8481 user-friendly than melt.
8482
8483 Notes
8484 -----
8485 Reference :ref:`the user guide <reshaping.pivot>` for more examples.
8486
8487 Examples
8488 --------
8489 >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
8490 ... "bar", "bar", "bar", "bar"],
8491 ... "B": ["one", "one", "one", "two", "two",
8492 ... "one", "one", "two", "two"],
8493 ... "C": ["small", "large", "large", "small",
8494 ... "small", "large", "small", "small",
8495 ... "large"],
8496 ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
8497 ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
8498 >>> df
8499 A B C D E
8500 0 foo one small 1 2
8501 1 foo one large 2 4
8502 2 foo one large 2 5
8503 3 foo two small 3 5
8504 4 foo two small 3 6
8505 5 bar one large 4 6
8506 6 bar one small 5 8
8507 7 bar two small 6 9
8508 8 bar two large 7 9
8509
8510 This first example aggregates values by taking the sum.
8511
8512 >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
8513 ... columns=['C'], aggfunc=np.sum)
8514 >>> table
8515 C large small
8516 A B
8517 bar one 4.0 5.0
8518 two 7.0 6.0
8519 foo one 4.0 1.0
8520 two NaN 6.0
8521
8522 We can also fill missing values using the `fill_value` parameter.
8523
8524 >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
8525 ... columns=['C'], aggfunc=np.sum, fill_value=0)
8526 >>> table
8527 C large small
8528 A B
8529 bar one 4 5
8530 two 7 6
8531 foo one 4 1
8532 two 0 6
8533
8534 The next example aggregates by taking the mean across multiple columns.
8535
8536 >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
8537 ... aggfunc={'D': np.mean, 'E': np.mean})
8538 >>> table
8539 D E
8540 A C
8541 bar large 5.500000 7.500000
8542 small 5.500000 8.500000
8543 foo large 2.000000 4.500000
8544 small 2.333333 4.333333
8545
8546 We can also calculate multiple types of aggregations for any given
8547 value column.
8548
8549 >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
8550 ... aggfunc={'D': np.mean,
8551 ... 'E': [min, max, np.mean]})
8552 >>> table
8553 D E
8554 mean max mean min
8555 A C
8556 bar large 5.500000 9 7.500000 6
8557 small 5.500000 9 8.500000 8
8558 foo large 2.000000 5 4.500000 4
8559 small 2.333333 6 4.333333 2
8560 """
8561
8562 @Substitution("")
8563 @Appender(_shared_docs["pivot_table"])
8564 def pivot_table(
8565 self,
8566 values=None,
8567 index=None,
8568 columns=None,
8569 aggfunc: AggFuncType = "mean",
8570 fill_value=None,
8571 margins: bool = False,
8572 dropna: bool = True,
8573 margins_name: Level = "All",
8574 observed: bool = False,
8575 sort: bool = True,
8576 ) -> DataFrame:
8577 from pandas.core.reshape.pivot import pivot_table
8578
8579 return pivot_table(
8580 self,
8581 values=values,
8582 index=index,
8583 columns=columns,
8584 aggfunc=aggfunc,
8585 fill_value=fill_value,
8586 margins=margins,
8587 dropna=dropna,
8588 margins_name=margins_name,
8589 observed=observed,
8590 sort=sort,
8591 )
8592
8593 def stack(self, level: Level = -1, dropna: bool = True):
8594 """
8595 Stack the prescribed level(s) from columns to index.
8596
8597 Return a reshaped DataFrame or Series having a multi-level
8598 index with one or more new inner-most levels compared to the current
8599 DataFrame. The new inner-most levels are created by pivoting the
8600 columns of the current dataframe:
8601
8602 - if the columns have a single level, the output is a Series;
8603 - if the columns have multiple levels, the new index
8604 level(s) is (are) taken from the prescribed level(s) and
8605 the output is a DataFrame.
8606
8607 Parameters
8608 ----------
8609 level : int, str, list, default -1
8610 Level(s) to stack from the column axis onto the index
8611 axis, defined as one index or label, or a list of indices
8612 or labels.
8613 dropna : bool, default True
8614 Whether to drop rows in the resulting Frame/Series with
8615 missing values. Stacking a column level onto the index
8616 axis can create combinations of index and column values
8617 that are missing from the original dataframe. See Examples
8618 section.
8619
8620 Returns
8621 -------
8622 DataFrame or Series
8623 Stacked dataframe or series.
8624
8625 See Also
8626 --------
8627 DataFrame.unstack : Unstack prescribed level(s) from index axis
8628 onto column axis.
8629 DataFrame.pivot : Reshape dataframe from long format to wide
8630 format.
8631 DataFrame.pivot_table : Create a spreadsheet-style pivot table
8632 as a DataFrame.
8633
8634 Notes
8635 -----
8636 The function is named by analogy with a collection of books
8637 being reorganized from being side by side on a horizontal
8638 position (the columns of the dataframe) to being stacked
8639 vertically on top of each other (in the index of the
8640 dataframe).
8641
8642 Reference :ref:`the user guide <reshaping.stacking>` for more examples.
8643
8644 Examples
8645 --------
8646 **Single level columns**
8647
8648 >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
8649 ... index=['cat', 'dog'],
8650 ... columns=['weight', 'height'])
8651
8652 Stacking a dataframe with a single level column axis returns a Series:
8653
8654 >>> df_single_level_cols
8655 weight height
8656 cat 0 1
8657 dog 2 3
8658 >>> df_single_level_cols.stack()
8659 cat weight 0
8660 height 1
8661 dog weight 2
8662 height 3
8663 dtype: int64
8664
8665 **Multi level columns: simple case**
8666
8667 >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
8668 ... ('weight', 'pounds')])
8669 >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],
8670 ... index=['cat', 'dog'],
8671 ... columns=multicol1)
8672
8673 Stacking a dataframe with a multi-level column axis:
8674
8675 >>> df_multi_level_cols1
8676 weight
8677 kg pounds
8678 cat 1 2
8679 dog 2 4
8680 >>> df_multi_level_cols1.stack()
8681 weight
8682 cat kg 1
8683 pounds 2
8684 dog kg 2
8685 pounds 4
8686
8687 **Missing values**
8688
8689 >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
8690 ... ('height', 'm')])
8691 >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
8692 ... index=['cat', 'dog'],
8693 ... columns=multicol2)
8694
8695 It is common to have missing values when stacking a dataframe
8696 with multi-level columns, as the stacked dataframe typically
8697 has more values than the original dataframe. Missing values
8698 are filled with NaNs:
8699
8700 >>> df_multi_level_cols2
8701 weight height
8702 kg m
8703 cat 1.0 2.0
8704 dog 3.0 4.0
8705 >>> df_multi_level_cols2.stack()
8706 height weight
8707 cat kg NaN 1.0
8708 m 2.0 NaN
8709 dog kg NaN 3.0
8710 m 4.0 NaN
8711
8712 **Prescribing the level(s) to be stacked**
8713
8714 The first parameter controls which level or levels are stacked:
8715
8716 >>> df_multi_level_cols2.stack(0)
8717 kg m
8718 cat height NaN 2.0
8719 weight 1.0 NaN
8720 dog height NaN 4.0
8721 weight 3.0 NaN
8722 >>> df_multi_level_cols2.stack([0, 1])
8723 cat height m 2.0
8724 weight kg 1.0
8725 dog height m 4.0
8726 weight kg 3.0
8727 dtype: float64
8728
8729 **Dropping missing values**
8730
8731 >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]],
8732 ... index=['cat', 'dog'],
8733 ... columns=multicol2)
8734
8735 Note that rows where all values are missing are dropped by
8736 default but this behaviour can be controlled via the dropna
8737 keyword parameter:
8738
8739 >>> df_multi_level_cols3
8740 weight height
8741 kg m
8742 cat NaN 1.0
8743 dog 2.0 3.0
8744 >>> df_multi_level_cols3.stack(dropna=False)
8745 height weight
8746 cat kg NaN NaN
8747 m 1.0 NaN
8748 dog kg NaN 2.0
8749 m 3.0 NaN
8750 >>> df_multi_level_cols3.stack(dropna=True)
8751 height weight
8752 cat m 1.0 NaN
8753 dog kg NaN 2.0
8754 m 3.0 NaN
8755 """
8756 from pandas.core.reshape.reshape import (
8757 stack,
8758 stack_multiple,
8759 )
8760
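        # Stacking a list/tuple of levels is delegated to stack_multiple,
        # which stacks the requested levels one after another.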
8761 if isinstance(level, (tuple, list)):
8762 result = stack_multiple(self, level, dropna=dropna)
8763 else:
8764 result = stack(self, level, dropna=dropna)
8765
8766 return result.__finalize__(self, method="stack")
8767
8768 def explode(
8769 self,
8770 column: IndexLabel,
8771 ignore_index: bool = False,
8772 ) -> DataFrame:
8773 """
8774 Transform each element of a list-like to a row, replicating index values.
8775
8776 Parameters
8777 ----------
8778 column : IndexLabel
8779 Column(s) to explode.
            For multiple columns, specify a non-empty list in which each
            element is a str or tuple; the list-like data in all specified
            columns must have matching lengths within each row of the frame.
8783
8784 .. versionadded:: 1.3.0
8785 Multi-column explode
8786
8787 ignore_index : bool, default False
8788 If True, the resulting index will be labeled 0, 1, …, n - 1.
8789
8790 .. versionadded:: 1.1.0
8791
8792 Returns
8793 -------
8794 DataFrame
8795 Exploded lists to rows of the subset columns;
8796 index will be duplicated for these rows.
8797
8798 Raises
8799 ------
        ValueError :
            * If the columns of the frame are not unique.
            * If the specified columns to explode are an empty list.
            * If the specified columns to explode do not have matching
              counts of elements rowwise in the frame.
8805
8806 See Also
8807 --------
8808 DataFrame.unstack : Pivot a level of the (necessarily hierarchical)
8809 index labels.
8810 DataFrame.melt : Unpivot a DataFrame from wide format to long format.
        Series.explode : Transform each element of a list-like to a row.
8812
8813 Notes
8814 -----
8815 This routine will explode list-likes including lists, tuples, sets,
8816 Series, and np.ndarray. The result dtype of the subset rows will
8817 be object. Scalars will be returned unchanged, and empty list-likes will
8818 result in a np.nan for that row. In addition, the ordering of rows in the
8819 output will be non-deterministic when exploding sets.
8820
8821 Reference :ref:`the user guide <reshaping.explode>` for more examples.
8822
8823 Examples
8824 --------
8825 >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]],
8826 ... 'B': 1,
8827 ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]})
8828 >>> df
8829 A B C
8830 0 [0, 1, 2] 1 [a, b, c]
8831 1 foo 1 NaN
8832 2 [] 1 []
8833 3 [3, 4] 1 [d, e]
8834
8835 Single-column explode.
8836
8837 >>> df.explode('A')
8838 A B C
8839 0 0 1 [a, b, c]
8840 0 1 1 [a, b, c]
8841 0 2 1 [a, b, c]
8842 1 foo 1 NaN
8843 2 NaN 1 []
8844 3 3 1 [d, e]
8845 3 4 1 [d, e]
8846
8847 Multi-column explode.
8848
8849 >>> df.explode(list('AC'))
8850 A B C
8851 0 0 1 a
8852 0 1 1 b
8853 0 2 1 c
8854 1 foo 1 NaN
8855 2 NaN 1 NaN
8856 3 3 1 d
8857 3 4 1 e
8858 """
8859 if not self.columns.is_unique:
8860 duplicate_cols = self.columns[self.columns.duplicated()].tolist()
8861 raise ValueError(
8862 f"DataFrame columns must be unique. Duplicate columns: {duplicate_cols}"
8863 )
8864
8865 columns: list[Hashable]
8866 if is_scalar(column) or isinstance(column, tuple):
8867 columns = [column]
8868 elif isinstance(column, list) and all(
8869 is_scalar(c) or isinstance(c, tuple) for c in column
8870 ):
8871 if not column:
8872 raise ValueError("column must be nonempty")
8873 if len(column) > len(set(column)):
8874 raise ValueError("column must be unique")
8875 columns = column
8876 else:
8877 raise ValueError("column must be a scalar, tuple, or list thereof")
8878
8879 df = self.reset_index(drop=True)
8880 if len(columns) == 1:
8881 result = df[columns[0]].explode()
8882 else:
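            # Rowwise element counts must match across all exploded columns;
            # scalars and empty list-likes count as length 1 here.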
8883 mylen = lambda x: len(x) if (is_list_like(x) and len(x) > 0) else 1
8884 counts0 = self[columns[0]].apply(mylen)
8885 for c in columns[1:]:
8886 if not all(counts0 == self[c].apply(mylen)):
8887 raise ValueError("columns must have matching element counts")
8888 result = DataFrame({c: df[c].explode() for c in columns})
8889 result = df.drop(columns, axis=1).join(result)
8890 if ignore_index:
8891 result.index = default_index(len(result))
8892 else:
8893 result.index = self.index.take(result.index)
8894 result = result.reindex(columns=self.columns, copy=False)
8895
8896 return result.__finalize__(self, method="explode")
8897
8898 def unstack(self, level: Level = -1, fill_value=None):
8899 """
8900 Pivot a level of the (necessarily hierarchical) index labels.
8901
8902 Returns a DataFrame having a new level of column labels whose inner-most level
8903 consists of the pivoted index labels.
8904
8905 If the index is not a MultiIndex, the output will be a Series
8906 (the analogue of stack when the columns are not a MultiIndex).
8907
8908 Parameters
8909 ----------
8910 level : int, str, or list of these, default -1 (last level)
8911 Level(s) of index to unstack, can pass level name.
8912 fill_value : int, str or dict
8913 Replace NaN with this value if the unstack produces missing values.
8914
8915 Returns
8916 -------
8917 Series or DataFrame
8918
8919 See Also
8920 --------
8921 DataFrame.pivot : Pivot a table based on column values.
8922 DataFrame.stack : Pivot a level of the column labels (inverse operation
8923 from `unstack`).
8924
8925 Notes
8926 -----
8927 Reference :ref:`the user guide <reshaping.stacking>` for more examples.
8928
8929 Examples
8930 --------
8931 >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
8932 ... ('two', 'a'), ('two', 'b')])
8933 >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
8934 >>> s
8935 one a 1.0
8936 b 2.0
8937 two a 3.0
8938 b 4.0
8939 dtype: float64
8940
8941 >>> s.unstack(level=-1)
8942 a b
8943 one 1.0 2.0
8944 two 3.0 4.0
8945
8946 >>> s.unstack(level=0)
8947 one two
8948 a 1.0 3.0
8949 b 2.0 4.0
8950
8951 >>> df = s.unstack(level=0)
8952 >>> df.unstack()
8953 one a 1.0
8954 b 2.0
8955 two a 3.0
8956 b 4.0
8957 dtype: float64
8958 """
8959 from pandas.core.reshape.reshape import unstack
8960
8961 result = unstack(self, level, fill_value)
8962
8963 return result.__finalize__(self, method="unstack")
8964
8965 @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"})
8966 def melt(
8967 self,
8968 id_vars=None,
8969 value_vars=None,
8970 var_name=None,
8971 value_name: Hashable = "value",
8972 col_level: Level = None,
8973 ignore_index: bool = True,
8974 ) -> DataFrame:
8975 return melt(
8976 self,
8977 id_vars=id_vars,
8978 value_vars=value_vars,
8979 var_name=var_name,
8980 value_name=value_name,
8981 col_level=col_level,
8982 ignore_index=ignore_index,
8983 ).__finalize__(self, method="melt")
8984
8985 # ----------------------------------------------------------------------
8986 # Time series-related
8987
8988 @doc(
8989 Series.diff,
8990 klass="DataFrame",
8991 extra_params="axis : {0 or 'index', 1 or 'columns'}, default 0\n "
8992 "Take difference over rows (0) or columns (1).\n",
8993 other_klass="Series",
8994 examples=dedent(
8995 """
8996 Difference with previous row
8997
8998 >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
8999 ... 'b': [1, 1, 2, 3, 5, 8],
9000 ... 'c': [1, 4, 9, 16, 25, 36]})
9001 >>> df
9002 a b c
9003 0 1 1 1
9004 1 2 1 4
9005 2 3 2 9
9006 3 4 3 16
9007 4 5 5 25
9008 5 6 8 36
9009
9010 >>> df.diff()
9011 a b c
9012 0 NaN NaN NaN
9013 1 1.0 0.0 3.0
9014 2 1.0 1.0 5.0
9015 3 1.0 1.0 7.0
9016 4 1.0 2.0 9.0
9017 5 1.0 3.0 11.0
9018
9019 Difference with previous column
9020
9021 >>> df.diff(axis=1)
9022 a b c
9023 0 NaN 0 0
9024 1 NaN -1 3
9025 2 NaN -1 7
9026 3 NaN -1 13
9027 4 NaN 0 20
9028 5 NaN 2 28
9029
9030 Difference with 3rd previous row
9031
9032 >>> df.diff(periods=3)
9033 a b c
9034 0 NaN NaN NaN
9035 1 NaN NaN NaN
9036 2 NaN NaN NaN
9037 3 3.0 2.0 15.0
9038 4 3.0 4.0 21.0
9039 5 3.0 6.0 27.0
9040
9041 Difference with following row
9042
9043 >>> df.diff(periods=-1)
9044 a b c
9045 0 -1.0 0.0 -3.0
9046 1 -1.0 -1.0 -5.0
9047 2 -1.0 -1.0 -7.0
9048 3 -1.0 -2.0 -9.0
9049 4 -1.0 -3.0 -11.0
9050 5 NaN NaN NaN
9051
9052 Overflow in input dtype
9053
9054 >>> df = pd.DataFrame({'a': [1, 0]}, dtype=np.uint8)
9055 >>> df.diff()
9056 a
9057 0 NaN
9058 1 255.0"""
9059 ),
9060 )
9061 def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame:
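        # Accept integer-valued floats for `periods` (e.g. 1.0) and coerce
        # them to int; non-integral values raise below.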
9062 if not lib.is_integer(periods):
9063 if not (
9064 is_float(periods)
9065 # error: "int" has no attribute "is_integer"
9066 and periods.is_integer() # type: ignore[attr-defined]
9067 ):
9068 raise ValueError("periods must be an integer")
9069 periods = int(periods)
9070
9071 axis = self._get_axis_number(axis)
9072 if axis == 1:
9073 if periods != 0:
                # In the periods == 0 case, this is equivalent to a diff of 0
                # periods along axis=0, and the Manager method may be somewhat
                # more performant, so we only dispatch here when periods != 0.
9077 return self - self.shift(periods, axis=axis)
9078 # With periods=0 this is equivalent to a diff with axis=0
9079 axis = 0
9080
9081 new_data = self._mgr.diff(n=periods, axis=axis)
9082 return self._constructor(new_data).__finalize__(self, "diff")
9083
9084 # ----------------------------------------------------------------------
9085 # Function application
9086
9087 def _gotitem(
9088 self,
9089 key: IndexLabel,
9090 ndim: int,
9091 subset: DataFrame | Series | None = None,
9092 ) -> DataFrame | Series:
9093 """
9094 Sub-classes to define. Return a sliced object.
9095
9096 Parameters
9097 ----------
9098 key : string / list of selections
9099 ndim : {1, 2}
9100 requested ndim of result
9101 subset : object, default None
9102 subset to act on
9103 """
9104 if subset is None:
9105 subset = self
9106 elif subset.ndim == 1: # is Series
9107 return subset
9108
9109 # TODO: _shallow_copy(subset)?
9110 return subset[key]
9111
9112 _agg_summary_and_see_also_doc = dedent(
9113 """
9114 The aggregation operations are always performed over an axis, either the
9115 index (default) or the column axis. This behavior is different from
9116 `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`,
9117 `var`), where the default is to compute the aggregation of the flattened
9118 array, e.g., ``numpy.mean(arr_2d)`` as opposed to
9119 ``numpy.mean(arr_2d, axis=0)``.
9120
9121 `agg` is an alias for `aggregate`. Use the alias.
9122
9123 See Also
9124 --------
    DataFrame.apply : Perform any type of operation.
9126 DataFrame.transform : Perform transformation type operations.
9127 core.groupby.GroupBy : Perform operations over groups.
9128 core.resample.Resampler : Perform operations over resampled bins.
9129 core.window.Rolling : Perform operations over rolling window.
9130 core.window.Expanding : Perform operations over expanding window.
9131 core.window.ExponentialMovingWindow : Perform operation over exponential weighted
9132 window.
9133 """
9134 )
9135
9136 _agg_examples_doc = dedent(
9137 """
9138 Examples
9139 --------
9140 >>> df = pd.DataFrame([[1, 2, 3],
9141 ... [4, 5, 6],
9142 ... [7, 8, 9],
9143 ... [np.nan, np.nan, np.nan]],
9144 ... columns=['A', 'B', 'C'])
9145
9146 Aggregate these functions over the rows.
9147
9148 >>> df.agg(['sum', 'min'])
9149 A B C
9150 sum 12.0 15.0 18.0
9151 min 1.0 2.0 3.0
9152
9153 Different aggregations per column.
9154
9155 >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
9156 A B
9157 sum 12.0 NaN
9158 min 1.0 2.0
9159 max NaN 8.0
9160
9161 Aggregate different functions over the columns and rename the index of the resulting
9162 DataFrame.
9163
9164 >>> df.agg(x=('A', max), y=('B', 'min'), z=('C', np.mean))
9165 A B C
9166 x 7.0 NaN NaN
9167 y NaN 2.0 NaN
9168 z NaN NaN 6.0
9169
9170 Aggregate over the columns.
9171
9172 >>> df.agg("mean", axis="columns")
9173 0 2.0
9174 1 5.0
9175 2 8.0
9176 3 NaN
9177 dtype: float64
9178 """
9179 )
9180
9181 @doc(
9182 _shared_docs["aggregate"],
9183 klass=_shared_doc_kwargs["klass"],
9184 axis=_shared_doc_kwargs["axis"],
9185 see_also=_agg_summary_and_see_also_doc,
9186 examples=_agg_examples_doc,
9187 )
9188 def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs):
9189 from pandas.core.apply import frame_apply
9190
9191 axis = self._get_axis_number(axis)
9192
9193 relabeling, func, columns, order = reconstruct_func(func, **kwargs)
9194
9195 op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
9196 result = op.agg()
9197
9198 if relabeling:
9199 # This is to keep the order to columns occurrence unchanged, and also
9200 # keep the order of new columns occurrence unchanged
9201
9202 # For the return values of reconstruct_func, if relabeling is
9203 # False, columns and order will be None.
9204 assert columns is not None
9205 assert order is not None
9206
9207 result_in_dict = relabel_result(result, func, columns, order)
9208 result = DataFrame(result_in_dict, index=columns)
9209
9210 return result
9211
9212 agg = aggregate
9213
9214 # error: Signature of "any" incompatible with supertype "NDFrame" [override]
9215 @overload # type: ignore[override]
9216 def any(
9217 self,
9218 *,
9219 axis: Axis = ...,
9220 bool_only: bool | None = ...,
9221 skipna: bool = ...,
9222 level: None = ...,
9223 **kwargs,
9224 ) -> Series:
9225 ...
9226
9227 @overload
9228 def any(
9229 self,
9230 *,
9231 axis: Axis = ...,
9232 bool_only: bool | None = ...,
9233 skipna: bool = ...,
9234 level: Level,
9235 **kwargs,
9236 ) -> DataFrame | Series:
9237 ...
9238
9239 # error: Missing return statement
9240 @doc(NDFrame.any, **_shared_doc_kwargs)
9241 def any( # type: ignore[empty-body]
9242 self,
9243 axis: Axis = 0,
9244 bool_only: bool | None = None,
9245 skipna: bool = True,
9246 level: Level = None,
9247 **kwargs,
9248 ) -> DataFrame | Series:
9249 ...
9250
9251 @doc(
9252 _shared_docs["transform"],
9253 klass=_shared_doc_kwargs["klass"],
9254 axis=_shared_doc_kwargs["axis"],
9255 )
9256 def transform(
9257 self, func: AggFuncType, axis: Axis = 0, *args, **kwargs
9258 ) -> DataFrame:
9259 from pandas.core.apply import frame_apply
9260
9261 op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
9262 result = op.transform()
9263 assert isinstance(result, DataFrame)
9264 return result
9265
9266 def apply(
9267 self,
9268 func: AggFuncType,
9269 axis: Axis = 0,
9270 raw: bool = False,
9271 result_type: Literal["expand", "reduce", "broadcast"] | None = None,
9272 args=(),
9273 **kwargs,
9274 ):
9275 """
9276 Apply a function along an axis of the DataFrame.
9277
9278 Objects passed to the function are Series objects whose index is
9279 either the DataFrame's index (``axis=0``) or the DataFrame's columns
9280 (``axis=1``). By default (``result_type=None``), the final return type
9281 is inferred from the return type of the applied function. Otherwise,
9282 it depends on the `result_type` argument.
9283
9284 Parameters
9285 ----------
9286 func : function
9287 Function to apply to each column or row.
9288 axis : {0 or 'index', 1 or 'columns'}, default 0
9289 Axis along which the function is applied:
9290
9291 * 0 or 'index': apply function to each column.
9292 * 1 or 'columns': apply function to each row.
9293
9294 raw : bool, default False
9295 Determines if row or column is passed as a Series or ndarray object:
9296
9297 * ``False`` : passes each row or column as a Series to the
9298 function.
9299 * ``True`` : the passed function will receive ndarray objects
9300 instead.
9301 If you are just applying a NumPy reduction function this will
9302 achieve much better performance.
9303
9304 result_type : {'expand', 'reduce', 'broadcast', None}, default None
9305 These only act when ``axis=1`` (columns):
9306
9307 * 'expand' : list-like results will be turned into columns.
9308 * 'reduce' : returns a Series if possible rather than expanding
9309 list-like results. This is the opposite of 'expand'.
9310 * 'broadcast' : results will be broadcast to the original shape
9311 of the DataFrame, the original index and columns will be
9312 retained.
9313
9314 The default behaviour (None) depends on the return value of the
9315 applied function: list-like results will be returned as a Series
            of those. However, if the apply function returns a Series, these
            are expanded to columns.
9318 args : tuple
9319 Positional arguments to pass to `func` in addition to the
9320 array/series.
9321 **kwargs
            Additional keyword arguments to pass as keyword arguments to
9323 `func`.
9324
9325 Returns
9326 -------
9327 Series or DataFrame
9328 Result of applying ``func`` along the given axis of the
9329 DataFrame.
9330
9331 See Also
9332 --------
9333 DataFrame.applymap: For elementwise operations.
9334 DataFrame.aggregate: Only perform aggregating type operations.
9335 DataFrame.transform: Only perform transforming type operations.
9336
9337 Notes
9338 -----
9339 Functions that mutate the passed object can produce unexpected
9340 behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
9341 for more details.
9342
9343 Examples
9344 --------
9345 >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
9346 >>> df
9347 A B
9348 0 4 9
9349 1 4 9
9350 2 4 9
9351
9352 Using a numpy universal function (in this case the same as
9353 ``np.sqrt(df)``):
9354
9355 >>> df.apply(np.sqrt)
9356 A B
9357 0 2.0 3.0
9358 1 2.0 3.0
9359 2 2.0 3.0
9360
9361 Using a reducing function on either axis
9362
9363 >>> df.apply(np.sum, axis=0)
9364 A 12
9365 B 27
9366 dtype: int64
9367
9368 >>> df.apply(np.sum, axis=1)
9369 0 13
9370 1 13
9371 2 13
9372 dtype: int64
9373
9374 Returning a list-like will result in a Series
9375
9376 >>> df.apply(lambda x: [1, 2], axis=1)
9377 0 [1, 2]
9378 1 [1, 2]
9379 2 [1, 2]
9380 dtype: object
9381
9382 Passing ``result_type='expand'`` will expand list-like results
9383 to columns of a Dataframe
9384
9385 >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
9386 0 1
9387 0 1 2
9388 1 1 2
9389 2 1 2
9390
9391 Returning a Series inside the function is similar to passing
9392 ``result_type='expand'``. The resulting column names
9393 will be the Series index.
9394
9395 >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
9396 foo bar
9397 0 1 2
9398 1 1 2
9399 2 1 2
9400
9401 Passing ``result_type='broadcast'`` will ensure the same shape
9402 result, whether list-like or scalar is returned by the function,
9403 and broadcast it along the axis. The resulting column names will
9404 be the originals.
9405
9406 >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
9407 A B
9408 0 1 2
9409 1 1 2
9410 2 1 2
9411 """
9412 from pandas.core.apply import frame_apply
9413
9414 op = frame_apply(
9415 self,
9416 func=func,
9417 axis=axis,
9418 raw=raw,
9419 result_type=result_type,
9420 args=args,
9421 kwargs=kwargs,
9422 )
9423 return op.apply().__finalize__(self, method="apply")
9424
9425 def applymap(
9426 self, func: PythonFuncType, na_action: str | None = None, **kwargs
9427 ) -> DataFrame:
9428 """
        Apply a function to a DataFrame elementwise.
9430
9431 This method applies a function that accepts and returns a scalar
9432 to every element of a DataFrame.
9433
9434 Parameters
9435 ----------
9436 func : callable
9437 Python function, returns a single value from a single value.
9438 na_action : {None, 'ignore'}, default None
            If 'ignore', propagate NaN values, without passing them to func.
9440
9441 .. versionadded:: 1.2
9442
9443 **kwargs
            Additional keyword arguments to pass as keyword arguments to
            `func`.
9446
9447 .. versionadded:: 1.3.0
9448
9449 Returns
9450 -------
9451 DataFrame
9452 Transformed DataFrame.
9453
9454 See Also
9455 --------
9456 DataFrame.apply : Apply a function along input axis of DataFrame.
9457
9458 Examples
9459 --------
9460 >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
9461 >>> df
9462 0 1
9463 0 1.000 2.120
9464 1 3.356 4.567
9465
9466 >>> df.applymap(lambda x: len(str(x)))
9467 0 1
9468 0 3 4
9469 1 5 5
9470
9471 Like Series.map, NA values can be ignored:
9472
9473 >>> df_copy = df.copy()
9474 >>> df_copy.iloc[0, 0] = pd.NA
9475 >>> df_copy.applymap(lambda x: len(str(x)), na_action='ignore')
9476 0 1
9477 0 NaN 4
9478 1 5.0 5
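
        Additional keyword arguments are forwarded to ``func``; the ``add``
        argument below is an illustrative name, not part of the API:

        >>> df.applymap(lambda x, add: x + add, add=1)
               0      1
        0  2.000  3.120
        1  4.356  5.567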
9479
9480 Note that a vectorized version of `func` often exists, which will
9481 be much faster. You could square each number elementwise.
9482
9483 >>> df.applymap(lambda x: x**2)
9484 0 1
9485 0 1.000000 4.494400
9486 1 11.262736 20.857489
9487
9488 But it's better to avoid applymap in that case.
9489
9490 >>> df ** 2
9491 0 1
9492 0 1.000000 4.494400
9493 1 11.262736 20.857489
9494 """
9495 if na_action not in {"ignore", None}:
9496 raise ValueError(
9497 f"na_action must be 'ignore' or None. Got {repr(na_action)}"
9498 )
9499 ignore_na = na_action == "ignore"
9500 func = functools.partial(func, **kwargs)
9501
9502 # if we have a dtype == 'M8[ns]', provide boxed values
9503 def infer(x):
9504 if x.empty:
9505 return lib.map_infer(x, func, ignore_na=ignore_na)
9506 return lib.map_infer(x.astype(object)._values, func, ignore_na=ignore_na)
9507
9508 return self.apply(infer).__finalize__(self, "applymap")
9509
9510 # ----------------------------------------------------------------------
9511 # Merging / joining methods
9512
9513 def _append(
9514 self,
9515 other,
9516 ignore_index: bool = False,
9517 verify_integrity: bool = False,
9518 sort: bool = False,
9519 ) -> DataFrame:
9520 if isinstance(other, (Series, dict)):
9521 if isinstance(other, dict):
9522 if not ignore_index:
9523 raise TypeError("Can only append a dict if ignore_index=True")
9524 other = Series(other)
9525 if other.name is None and not ignore_index:
9526 raise TypeError(
9527 "Can only append a Series if ignore_index=True "
9528 "or if the Series has a name"
9529 )
9530
9531 index = Index(
9532 [other.name],
9533 name=self.index.names
9534 if isinstance(self.index, MultiIndex)
9535 else self.index.name,
9536 )
9537 row_df = other.to_frame().T
9538 # infer_objects is needed for
9539 # test_append_empty_frame_to_series_with_dateutil_tz
9540 other = row_df.infer_objects(copy=False).rename_axis(
9541 index.names, copy=False
9542 )
9543 elif isinstance(other, list):
9544 if not other:
9545 pass
9546 elif not isinstance(other[0], DataFrame):
9547 other = DataFrame(other)
9548 if self.index.name is not None and not ignore_index:
9549 other.index.name = self.index.name
9550
9551 from pandas.core.reshape.concat import concat
9552
9553 if isinstance(other, (list, tuple)):
9554 to_concat = [self, *other]
9555 else:
9556 to_concat = [self, other]
9557
9558 result = concat(
9559 to_concat,
9560 ignore_index=ignore_index,
9561 verify_integrity=verify_integrity,
9562 sort=sort,
9563 )
9564 return result.__finalize__(self, method="append")
9565
9566 def join(
9567 self,
9568 other: DataFrame | Series | Iterable[DataFrame | Series],
9569 on: IndexLabel | None = None,
9570 how: MergeHow = "left",
9571 lsuffix: str = "",
9572 rsuffix: str = "",
9573 sort: bool = False,
9574 validate: str | None = None,
9575 ) -> DataFrame:
9576 """
9577 Join columns of another DataFrame.
9578
9579 Join columns with `other` DataFrame either on index or on a key
9580 column. Efficiently join multiple DataFrame objects by index at once by
9581 passing a list.
9582
9583 Parameters
9584 ----------
9585 other : DataFrame, Series, or a list containing any combination of them
9586 Index should be similar to one of the columns in this one. If a
9587 Series is passed, its name attribute must be set, and that will be
9588 used as the column name in the resulting joined DataFrame.
9589 on : str, list of str, or array-like, optional
9590 Column or index level name(s) in the caller to join on the index
9591 in `other`, otherwise joins index-on-index. If multiple
9592 values given, the `other` DataFrame must have a MultiIndex. Can
9593 pass an array as the join key if it is not already contained in
9594 the calling DataFrame. Like an Excel VLOOKUP operation.
9595 how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left'
9596 How to handle the operation of the two objects.
9597
9598 * left: use calling frame's index (or column if on is specified)
9599 * right: use `other`'s index.
            * outer: form union of calling frame's index (or column if on is
              specified) with `other`'s index, and sort it lexicographically.
9603 * inner: form intersection of calling frame's index (or column if
9604 on is specified) with `other`'s index, preserving the order
9605 of the calling's one.
9606 * cross: creates the cartesian product from both frames, preserves the order
9607 of the left keys.
9608
9609 .. versionadded:: 1.2.0
9610
9611 lsuffix : str, default ''
9612 Suffix to use from left frame's overlapping columns.
9613 rsuffix : str, default ''
9614 Suffix to use from right frame's overlapping columns.
9615 sort : bool, default False
9616 Order result DataFrame lexicographically by the join key. If False,
9617 the order of the join key depends on the join type (how keyword).
        validate : str, optional
            If specified, checks if join is of specified type.

            * "one_to_one" or "1:1": check if join keys are unique in both left
              and right datasets.
            * "one_to_many" or "1:m": check if join keys are unique in left dataset.
            * "many_to_one" or "m:1": check if join keys are unique in right dataset.
            * "many_to_many" or "m:m": allowed, but does not result in checks.

            .. versionadded:: 1.5.0
9626
9627 Returns
9628 -------
9629 DataFrame
9630 A dataframe containing columns from both the caller and `other`.
9631
9632 See Also
9633 --------
9634 DataFrame.merge : For column(s)-on-column(s) operations.
9635
9636 Notes
9637 -----
9638 Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
9639 passing a list of `DataFrame` objects.
9640
9641 Support for specifying index levels as the `on` parameter was added
9642 in version 0.23.0.
9643
9644 Examples
9645 --------
9646 >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
9647 ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
9648
9649 >>> df
9650 key A
9651 0 K0 A0
9652 1 K1 A1
9653 2 K2 A2
9654 3 K3 A3
9655 4 K4 A4
9656 5 K5 A5
9657
9658 >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
9659 ... 'B': ['B0', 'B1', 'B2']})
9660
9661 >>> other
9662 key B
9663 0 K0 B0
9664 1 K1 B1
9665 2 K2 B2
9666
9667 Join DataFrames using their indexes.
9668
9669 >>> df.join(other, lsuffix='_caller', rsuffix='_other')
9670 key_caller A key_other B
9671 0 K0 A0 K0 B0
9672 1 K1 A1 K1 B1
9673 2 K2 A2 K2 B2
9674 3 K3 A3 NaN NaN
9675 4 K4 A4 NaN NaN
9676 5 K5 A5 NaN NaN
9677
9678 If we want to join using the key columns, we need to set key to be
9679 the index in both `df` and `other`. The joined DataFrame will have
9680 key as its index.
9681
9682 >>> df.set_index('key').join(other.set_index('key'))
9683 A B
9684 key
9685 K0 A0 B0
9686 K1 A1 B1
9687 K2 A2 B2
9688 K3 A3 NaN
9689 K4 A4 NaN
9690 K5 A5 NaN
9691
9692 Another option to join using the key columns is to use the `on`
9693 parameter. DataFrame.join always uses `other`'s index but we can use
9694 any column in `df`. This method preserves the original DataFrame's
9695 index in the result.
9696
9697 >>> df.join(other.set_index('key'), on='key')
9698 key A B
9699 0 K0 A0 B0
9700 1 K1 A1 B1
9701 2 K2 A2 B2
9702 3 K3 A3 NaN
9703 4 K4 A4 NaN
9704 5 K5 A5 NaN
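
        An inner join keeps only the rows whose key is present in `other`:

        >>> df.join(other.set_index('key'), on='key', how='inner')
          key   A   B
        0  K0  A0  B0
        1  K1  A1  B1
        2  K2  A2  B2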
9705
9706 Using non-unique key values shows how they are matched.
9707
9708 >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'],
9709 ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
9710
9711 >>> df
9712 key A
9713 0 K0 A0
9714 1 K1 A1
9715 2 K1 A2
9716 3 K3 A3
9717 4 K0 A4
9718 5 K1 A5
9719
9720 >>> df.join(other.set_index('key'), on='key', validate='m:1')
9721 key A B
9722 0 K0 A0 B0
9723 1 K1 A1 B1
9724 2 K1 A2 B1
9725 3 K3 A3 NaN
9726 4 K0 A4 B0
9727 5 K1 A5 B1
9728 """
9729 return self._join_compat(
9730 other,
9731 on=on,
9732 how=how,
9733 lsuffix=lsuffix,
9734 rsuffix=rsuffix,
9735 sort=sort,
9736 validate=validate,
9737 )
9738
9739 def _join_compat(
9740 self,
9741 other: DataFrame | Series | Iterable[DataFrame | Series],
9742 on: IndexLabel | None = None,
9743 how: MergeHow = "left",
9744 lsuffix: str = "",
9745 rsuffix: str = "",
9746 sort: bool = False,
9747 validate: str | None = None,
9748 ):
9749 from pandas.core.reshape.concat import concat
9750 from pandas.core.reshape.merge import merge
9751
9752 if isinstance(other, Series):
9753 if other.name is None:
9754 raise ValueError("Other Series must have a name")
9755 other = DataFrame({other.name: other})
9756
9757 if isinstance(other, DataFrame):
9758 if how == "cross":
9759 return merge(
9760 self,
9761 other,
9762 how=how,
9763 on=on,
9764 suffixes=(lsuffix, rsuffix),
9765 sort=sort,
9766 validate=validate,
9767 )
9768 return merge(
9769 self,
9770 other,
9771 left_on=on,
9772 how=how,
9773 left_index=on is None,
9774 right_index=True,
9775 suffixes=(lsuffix, rsuffix),
9776 sort=sort,
9777 validate=validate,
9778 )
9779 else:
9780 if on is not None:
9781 raise ValueError(
9782 "Joining multiple DataFrames only supported for joining on index"
9783 )
9784
9785 if rsuffix or lsuffix:
9786 raise ValueError(
9787 "Suffixes not supported when joining multiple DataFrames"
9788 )
9789
9790 # Mypy thinks the RHS is a
9791 # "Union[DataFrame, Series, Iterable[Union[DataFrame, Series]]]" whereas
9792 # the LHS is an "Iterable[DataFrame]", but in reality both types are
9793 # "Iterable[Union[DataFrame, Series]]" due to the if statements
9794 frames = [cast("DataFrame | Series", self)] + list(other)
9795
9796 can_concat = all(df.index.is_unique for df in frames)
9797
9798 # join indexes only using concat
9799 if can_concat:
9800 if how == "left":
9801 res = concat(
9802 frames, axis=1, join="outer", verify_integrity=True, sort=sort
9803 )
9804 return res.reindex(self.index, copy=False)
9805 else:
9806 return concat(
9807 frames, axis=1, join=how, verify_integrity=True, sort=sort
9808 )
9809
9810 joined = frames[0]
9811
9812 for frame in frames[1:]:
9813 joined = merge(
9814 joined,
9815 frame,
9816 how=how,
9817 left_index=True,
9818 right_index=True,
9819 validate=validate,
9820 )
9821
9822 return joined
9823
9824 @Substitution("")
9825 @Appender(_merge_doc, indents=2)
9826 def merge(
9827 self,
9828 right: DataFrame | Series,
9829 how: MergeHow = "inner",
9830 on: IndexLabel | None = None,
9831 left_on: IndexLabel | None = None,
9832 right_on: IndexLabel | None = None,
9833 left_index: bool = False,
9834 right_index: bool = False,
9835 sort: bool = False,
9836 suffixes: Suffixes = ("_x", "_y"),
9837 copy: bool | None = None,
9838 indicator: str | bool = False,
9839 validate: str | None = None,
9840 ) -> DataFrame:
9841 from pandas.core.reshape.merge import merge
9842
9843 return merge(
9844 self,
9845 right,
9846 how=how,
9847 on=on,
9848 left_on=left_on,
9849 right_on=right_on,
9850 left_index=left_index,
9851 right_index=right_index,
9852 sort=sort,
9853 suffixes=suffixes,
9854 copy=copy,
9855 indicator=indicator,
9856 validate=validate,
9857 )
9858
9859 def round(
9860 self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs
9861 ) -> DataFrame:
9862 """
9863 Round a DataFrame to a variable number of decimal places.
9864
9865 Parameters
9866 ----------
9867 decimals : int, dict, Series
9868 Number of decimal places to round each column to. If an int is
9869 given, round each column to the same number of places.
9870 Otherwise dict and Series round to variable numbers of places.
9871 Column names should be in the keys if `decimals` is a
9872 dict-like, or in the index if `decimals` is a Series. Any
9873 columns not included in `decimals` will be left as is. Elements
9874 of `decimals` which are not columns of the input will be
9875 ignored.
9876 *args
9877 Additional keywords have no effect but might be accepted for
9878 compatibility with numpy.
9879 **kwargs
9880 Additional keywords have no effect but might be accepted for
9881 compatibility with numpy.
9882
9883 Returns
9884 -------
9885 DataFrame
9886 A DataFrame with the affected columns rounded to the specified
9887 number of decimal places.
9888
9889 See Also
9890 --------
9891 numpy.around : Round a numpy array to the given number of decimals.
9892 Series.round : Round a Series to the given number of decimals.
9893
9894 Examples
9895 --------
9896 >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)],
9897 ... columns=['dogs', 'cats'])
9898 >>> df
9899 dogs cats
9900 0 0.21 0.32
9901 1 0.01 0.67
9902 2 0.66 0.03
9903 3 0.21 0.18
9904
9905 By providing an integer each column is rounded to the same number
9906 of decimal places
9907
9908 >>> df.round(1)
9909 dogs cats
9910 0 0.2 0.3
9911 1 0.0 0.7
9912 2 0.7 0.0
9913 3 0.2 0.2
9914
9915 With a dict, the number of places for specific columns can be
9916 specified with the column names as key and the number of decimal
9917 places as value
9918
9919 >>> df.round({'dogs': 1, 'cats': 0})
9920 dogs cats
9921 0 0.2 0.0
9922 1 0.0 1.0
9923 2 0.7 0.0
9924 3 0.2 0.0
9925
9926 Using a Series, the number of places for specific columns can be
9927 specified with the column names as index and the number of
9928 decimal places as value
9929
9930 >>> decimals = pd.Series([0, 1], index=['cats', 'dogs'])
9931 >>> df.round(decimals)
9932 dogs cats
9933 0 0.2 0.0
9934 1 0.0 1.0
9935 2 0.7 0.0
9936 3 0.2 0.0
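
        Keys in `decimals` that are not columns are ignored; the
        ``'birds'`` entry below is a deliberately non-existent column and
        has no effect:

        >>> df.round({'dogs': 1, 'birds': 2})
           dogs  cats
        0   0.2  0.32
        1   0.0  0.67
        2   0.7  0.03
        3   0.2  0.18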
9937 """
9938 from pandas.core.reshape.concat import concat
9939
9940 def _dict_round(df: DataFrame, decimals):
9941 for col, vals in df.items():
9942 try:
9943 yield _series_round(vals, decimals[col])
9944 except KeyError:
9945 yield vals
9946
9947 def _series_round(ser: Series, decimals: int) -> Series:
9948 if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype):
9949 return ser.round(decimals)
9950 return ser
9951
9952 nv.validate_round(args, kwargs)
9953
9954 if isinstance(decimals, (dict, Series)):
9955 if isinstance(decimals, Series) and not decimals.index.is_unique:
9956 raise ValueError("Index of decimals must be unique")
9957 if is_dict_like(decimals) and not all(
9958 is_integer(value) for _, value in decimals.items()
9959 ):
9960 raise TypeError("Values in decimals must be integers")
9961 new_cols = list(_dict_round(self, decimals))
9962 elif is_integer(decimals):
9963 # Dispatch to Block.round
9964 return self._constructor(
9965 self._mgr.round(decimals=decimals, using_cow=using_copy_on_write()),
9966 ).__finalize__(self, method="round")
9967 else:
9968 raise TypeError("decimals must be an integer, a dict-like or a Series")
9969
9970 if new_cols is not None and len(new_cols) > 0:
9971 return self._constructor(
9972 concat(new_cols, axis=1), index=self.index, columns=self.columns
9973 ).__finalize__(self, method="round")
9974 else:
9975 return self.copy(deep=False)
9976
9977 # ----------------------------------------------------------------------
9978 # Statistical methods, etc.
9979
9980 def corr(
9981 self,
9982 method: CorrelationMethod = "pearson",
9983 min_periods: int = 1,
9984 numeric_only: bool = False,
9985 ) -> DataFrame:
9986 """
9987 Compute pairwise correlation of columns, excluding NA/null values.
9988
9989 Parameters
9990 ----------
9991 method : {'pearson', 'kendall', 'spearman'} or callable
9992 Method of correlation:
9993
9994 * pearson : standard correlation coefficient
9995 * kendall : Kendall Tau correlation coefficient
9996 * spearman : Spearman rank correlation
9997 * callable: callable with input two 1d ndarrays
9998 and returning a float. Note that the returned matrix from corr
9999 will have 1 along the diagonals and will be symmetric
10000 regardless of the callable's behavior.
10001 min_periods : int, optional
10002 Minimum number of observations required per pair of columns
10003 to have a valid result. Currently only available for Pearson
10004 and Spearman correlation.
10005 numeric_only : bool, default False
10006 Include only `float`, `int` or `boolean` data.
10007
10008 .. versionadded:: 1.5.0
10009
10010 .. versionchanged:: 2.0.0
10011 The default value of ``numeric_only`` is now ``False``.
10012
10013 Returns
10014 -------
10015 DataFrame
10016 Correlation matrix.
10017
10018 See Also
10019 --------
10020 DataFrame.corrwith : Compute pairwise correlation with another
10021 DataFrame or Series.
10022 Series.corr : Compute the correlation between two Series.
10023
10024 Notes
10025 -----
10026 Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations.
10027
10028 * `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
10029 * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_
10030 * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_
10031
10032 Examples
10033 --------
10034 >>> def histogram_intersection(a, b):
10035 ... v = np.minimum(a, b).sum().round(decimals=1)
10036 ... return v
10037 >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
10038 ... columns=['dogs', 'cats'])
10039 >>> df.corr(method=histogram_intersection)
10040 dogs cats
10041 dogs 1.0 0.3
10042 cats 0.3 1.0
10043
10044 >>> df = pd.DataFrame([(1, 1), (2, np.nan), (np.nan, 3), (4, 4)],
10045 ... columns=['dogs', 'cats'])
10046 >>> df.corr(min_periods=3)
10047 dogs cats
10048 dogs 1.0 NaN
10049 cats NaN 1.0
10050 """ # noqa:E501
10051 data = self._get_numeric_data() if numeric_only else self
10052 cols = data.columns
10053 idx = cols.copy()
10054 mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
10055
10056 if method == "pearson":
10057 correl = libalgos.nancorr(mat, minp=min_periods)
10058 elif method == "spearman":
10059 correl = libalgos.nancorr_spearman(mat, minp=min_periods)
10060 elif method == "kendall" or callable(method):
10061 if min_periods is None:
10062 min_periods = 1
10063 mat = mat.T
10064 corrf = nanops.get_corr_func(method)
10065 K = len(cols)
10066 correl = np.empty((K, K), dtype=float)
10067 mask = np.isfinite(mat)
10068 for i, ac in enumerate(mat):
10069 for j, bc in enumerate(mat):
10070 if i > j:
10071 continue
10072
10073 valid = mask[i] & mask[j]
10074 if valid.sum() < min_periods:
10075 c = np.nan
10076 elif i == j:
10077 c = 1.0
10078 elif not valid.all():
10079 c = corrf(ac[valid], bc[valid])
10080 else:
10081 c = corrf(ac, bc)
10082 correl[i, j] = c
10083 correl[j, i] = c
10084 else:
10085 raise ValueError(
10086 "method must be either 'pearson', "
10087 "'spearman', 'kendall', or a callable, "
10088 f"'{method}' was supplied"
10089 )
10090
10091 result = self._constructor(correl, index=idx, columns=cols, copy=False)
10092 return result.__finalize__(self, method="corr")
10093
10094 def cov(
10095 self,
10096 min_periods: int | None = None,
10097 ddof: int | None = 1,
10098 numeric_only: bool = False,
10099 ) -> DataFrame:
10100 """
10101 Compute pairwise covariance of columns, excluding NA/null values.
10102
10103 Compute the pairwise covariance among the series of a DataFrame.
10104 The returned data frame is the `covariance matrix
10105 <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
10106 of the DataFrame.
10107
10108 Both NA and null values are automatically excluded from the
10109 calculation. (See the note below about bias from missing values.)
        A minimum number of observations can be required per pair of columns;
        pairs with fewer valid observations than this threshold will be
        returned as ``NaN``.
10113
10114 This method is generally used for the analysis of time series data to
10115 understand the relationship between different measures
10116 across time.
10117
10118 Parameters
10119 ----------
10120 min_periods : int, optional
10121 Minimum number of observations required per pair of columns
10122 to have a valid result.
10123
10124 ddof : int, default 1
10125 Delta degrees of freedom. The divisor used in calculations
10126 is ``N - ddof``, where ``N`` represents the number of elements.
10127
10128 .. versionadded:: 1.1.0
10129
10130 numeric_only : bool, default False
10131 Include only `float`, `int` or `boolean` data.
10132
10133 .. versionadded:: 1.5.0
10134
10135 .. versionchanged:: 2.0.0
10136 The default value of ``numeric_only`` is now ``False``.
10137
10138 Returns
10139 -------
10140 DataFrame
10141 The covariance matrix of the series of the DataFrame.
10142
10143 See Also
10144 --------
10145 Series.cov : Compute covariance with another Series.
10146 core.window.ewm.ExponentialMovingWindow.cov : Exponential weighted sample
10147 covariance.
10148 core.window.expanding.Expanding.cov : Expanding sample covariance.
10149 core.window.rolling.Rolling.cov : Rolling sample covariance.
10150
10151 Notes
10152 -----
10153 Returns the covariance matrix of the DataFrame's time series.
10154 The covariance is normalized by N-ddof.
10155
10156 For DataFrames that have Series that are missing data (assuming that
10157 data is `missing at random
10158 <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
10159 the returned covariance matrix will be an unbiased estimate
10160 of the variance and covariance between the member Series.
10161
10162 However, for many applications this estimate may not be acceptable
        because the estimated covariance matrix is not guaranteed to be positive
        semi-definite. This could lead to estimated correlations having
10165 absolute values which are greater than one, and/or a non-invertible
10166 covariance matrix. See `Estimation of covariance matrices
10167 <https://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_
10168 matrices>`__ for more details.
10169
10170 Examples
10171 --------
10172 >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
10173 ... columns=['dogs', 'cats'])
10174 >>> df.cov()
10175 dogs cats
10176 dogs 0.666667 -1.000000
10177 cats -1.000000 1.666667
10178
10179 >>> np.random.seed(42)
10180 >>> df = pd.DataFrame(np.random.randn(1000, 5),
10181 ... columns=['a', 'b', 'c', 'd', 'e'])
10182 >>> df.cov()
10183 a b c d e
10184 a 0.998438 -0.020161 0.059277 -0.008943 0.014144
10185 b -0.020161 1.059352 -0.008543 -0.024738 0.009826
10186 c 0.059277 -0.008543 1.010670 -0.001486 -0.000271
10187 d -0.008943 -0.024738 -0.001486 0.921297 -0.013692
10188 e 0.014144 0.009826 -0.000271 -0.013692 0.977795
10189
10190 **Minimum number of periods**
10191
10192 This method also supports an optional ``min_periods`` keyword
10193 that specifies the required minimum number of non-NA observations for
10194 each column pair in order to have a valid result:
10195
10196 >>> np.random.seed(42)
10197 >>> df = pd.DataFrame(np.random.randn(20, 3),
10198 ... columns=['a', 'b', 'c'])
10199 >>> df.loc[df.index[:5], 'a'] = np.nan
10200 >>> df.loc[df.index[5:10], 'b'] = np.nan
10201 >>> df.cov(min_periods=12)
10202 a b c
10203 a 0.316741 NaN -0.150812
10204 b NaN 1.248003 0.191417
10205 c -0.150812 0.191417 0.895202
10206 """
10207 data = self._get_numeric_data() if numeric_only else self
10208 cols = data.columns
10209 idx = cols.copy()
10210 mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
10211
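        # Fast path: with no missing values np.cov can be used directly
        # (unless there are fewer rows than min_periods); otherwise fall
        # back to a pairwise-complete covariance computation.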
10212 if notna(mat).all():
10213 if min_periods is not None and min_periods > len(mat):
10214 base_cov = np.empty((mat.shape[1], mat.shape[1]))
10215 base_cov.fill(np.nan)
10216 else:
10217 base_cov = np.cov(mat.T, ddof=ddof)
10218 base_cov = base_cov.reshape((len(cols), len(cols)))
10219 else:
10220 base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods)
10221
10222 result = self._constructor(base_cov, index=idx, columns=cols, copy=False)
10223 return result.__finalize__(self, method="cov")
10224
10225 def corrwith(
10226 self,
10227 other: DataFrame | Series,
10228 axis: Axis = 0,
10229 drop: bool = False,
10230 method: CorrelationMethod = "pearson",
10231 numeric_only: bool = False,
10232 ) -> Series:
10233 """
10234 Compute pairwise correlation.
10235
10236 Pairwise correlation is computed between rows or columns of
10237 DataFrame with rows or columns of Series or DataFrame. DataFrames
10238 are first aligned along both axes before computing the
10239 correlations.
10240
10241 Parameters
10242 ----------
10243 other : DataFrame, Series
10244 Object with which to compute correlations.
10245 axis : {0 or 'index', 1 or 'columns'}, default 0
10246 The axis to use. 0 or 'index' to compute row-wise, 1 or 'columns' for
10247 column-wise.
10248 drop : bool, default False
10249 Drop missing indices from result.
10250 method : {'pearson', 'kendall', 'spearman'} or callable
10251 Method of correlation:
10252
10253 * pearson : standard correlation coefficient
10254 * kendall : Kendall Tau correlation coefficient
10255 * spearman : Spearman rank correlation
10256 * callable: callable with input two 1d ndarrays
10257 and returning a float.
10258
10259 numeric_only : bool, default False
10260 Include only `float`, `int` or `boolean` data.
10261
10262 .. versionadded:: 1.5.0
10263
10264 .. versionchanged:: 2.0.0
10265 The default value of ``numeric_only`` is now ``False``.
10266
10267 Returns
10268 -------
10269 Series
10270 Pairwise correlations.
10271
10272 See Also
10273 --------
10274 DataFrame.corr : Compute pairwise correlation of columns.
10275
10276 Examples
10277 --------
10278 >>> index = ["a", "b", "c", "d", "e"]
10279 >>> columns = ["one", "two", "three", "four"]
10280 >>> df1 = pd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns)
10281 >>> df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=index[:4], columns=columns)
10282 >>> df1.corrwith(df2)
10283 one 1.0
10284 two 1.0
10285 three 1.0
10286 four 1.0
10287 dtype: float64
10288
10289 >>> df2.corrwith(df1, axis=1)
10290 a 1.0
10291 b 1.0
10292 c 1.0
10293 d 1.0
10294 e NaN
10295 dtype: float64
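
        Setting ``drop=True`` removes the non-matching label instead of
        returning ``NaN`` for it:

        >>> df2.corrwith(df1, axis=1, drop=True)
        a    1.0
        b    1.0
        c    1.0
        d    1.0
        dtype: float64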
10296 """ # noqa:E501
10297 axis = self._get_axis_number(axis)
10298 this = self._get_numeric_data() if numeric_only else self
10299
10300 if isinstance(other, Series):
10301 return this.apply(lambda x: other.corr(x, method=method), axis=axis)
10302
10303 if numeric_only:
10304 other = other._get_numeric_data()
10305 left, right = this.align(other, join="inner", copy=False)
10306
10307 if axis == 1:
10308 left = left.T
10309 right = right.T
10310
10311 if method == "pearson":
10312 # mask missing values
10313 left = left + right * 0
10314 right = right + left * 0
10315
10316 # demeaned data
10317 ldem = left - left.mean(numeric_only=numeric_only)
10318 rdem = right - right.mean(numeric_only=numeric_only)
10319
10320 num = (ldem * rdem).sum()
10321 dom = (
10322 (left.count() - 1)
10323 * left.std(numeric_only=numeric_only)
10324 * right.std(numeric_only=numeric_only)
10325 )
10326
10327 correl = num / dom
10328
10329 elif method in ["kendall", "spearman"] or callable(method):
10330
10331 def c(x):
10332 return nanops.nancorr(x[0], x[1], method=method)
10333
10334 correl = self._constructor_sliced(
10335 map(c, zip(left.values.T, right.values.T)),
10336 index=left.columns,
10337 copy=False,
10338 )
10339
10340 else:
10341 raise ValueError(
10342 f"Invalid method {method} was passed, "
10343 "valid methods are: 'pearson', 'kendall', "
10344 "'spearman', or callable"
10345 )
10346
10347 if not drop:
10348 # Find non-matching labels along the given axis
10349 # and append missing correlations (GH 22375)
10350 raxis: AxisInt = 1 if axis == 0 else 0
10351 result_index = this._get_axis(raxis).union(other._get_axis(raxis))
10352 idx_diff = result_index.difference(correl.index)
10353
10354 if len(idx_diff) > 0:
10355 correl = correl._append(
10356 Series([np.nan] * len(idx_diff), index=idx_diff)
10357 )
10358
10359 return correl
10360
10361 # ----------------------------------------------------------------------
10362 # ndarray-like stats methods
10363
10364 def count(self, axis: Axis = 0, numeric_only: bool = False):
10365 """
10366 Count non-NA cells for each column or row.
10367
10368 The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending
10369 on `pandas.options.mode.use_inf_as_na`) are considered NA.
10370
10371 Parameters
10372 ----------
10373 axis : {0 or 'index', 1 or 'columns'}, default 0
10374 If 0 or 'index' counts are generated for each column.
10375 If 1 or 'columns' counts are generated for each row.
10376 numeric_only : bool, default False
10377 Include only `float`, `int` or `boolean` data.
10378
10379 Returns
10380 -------
        Series
            For each column/row the number of non-NA/null entries.
10384
10385 See Also
10386 --------
10387 Series.count: Number of non-NA elements in a Series.
10388 DataFrame.value_counts: Count unique combinations of columns.
10389 DataFrame.shape: Number of DataFrame rows and columns (including NA
10390 elements).
10391 DataFrame.isna: Boolean same-sized DataFrame showing places of NA
10392 elements.
10393
10394 Examples
10395 --------
10396 Constructing DataFrame from a dictionary:
10397
10398 >>> df = pd.DataFrame({"Person":
10399 ... ["John", "Myla", "Lewis", "John", "Myla"],
10400 ... "Age": [24., np.nan, 21., 33, 26],
10401 ... "Single": [False, True, True, True, False]})
10402 >>> df
10403 Person Age Single
10404 0 John 24.0 False
10405 1 Myla NaN True
10406 2 Lewis 21.0 True
10407 3 John 33.0 True
10408 4 Myla 26.0 False
10409
10410 Notice the uncounted NA values:
10411
10412 >>> df.count()
10413 Person 5
10414 Age 4
10415 Single 5
10416 dtype: int64
10417
10418 Counts for each **row**:
10419
10420 >>> df.count(axis='columns')
10421 0 3
10422 1 2
10423 2 3
10424 3 3
10425 4 3
10426 dtype: int64
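
        Counts restricted to numeric (including boolean) columns:

        >>> df.count(numeric_only=True)
        Age       4
        Single    5
        dtype: int64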
10427 """
10428 axis = self._get_axis_number(axis)
10429
10430 if numeric_only:
10431 frame = self._get_numeric_data()
10432 else:
10433 frame = self
10434
10435 # GH #423
10436 if len(frame._get_axis(axis)) == 0:
10437 result = self._constructor_sliced(0, index=frame._get_agg_axis(axis))
10438 else:
10439 if frame._is_mixed_type or frame._mgr.any_extension_types:
10440 # the or any_extension_types is really only hit for single-
10441 # column frames with an extension array
10442 result = notna(frame).sum(axis=axis)
10443 else:
10444 # GH13407
10445 series_counts = notna(frame).sum(axis=axis)
10446 counts = series_counts._values
10447 result = self._constructor_sliced(
10448 counts, index=frame._get_agg_axis(axis), copy=False
10449 )
10450
10451 return result.astype("int64").__finalize__(self, method="count")
10452
10453 def _reduce(
10454 self,
10455 op,
10456 name: str,
10457 *,
10458 axis: Axis = 0,
10459 skipna: bool = True,
10460 numeric_only: bool = False,
10461 filter_type=None,
10462 **kwds,
10463 ):
10464 assert filter_type is None or filter_type == "bool", filter_type
10465 out_dtype = "bool" if filter_type == "bool" else None
10466
10467 if axis is not None:
10468 axis = self._get_axis_number(axis)
10469
10470 def func(values: np.ndarray):
10471 # We only use this in the case that operates on self.values
10472 return op(values, axis=axis, skipna=skipna, **kwds)
10473
10474 def blk_func(values, axis: Axis = 1):
10475 if isinstance(values, ExtensionArray):
10476 if not is_1d_only_ea_dtype(values.dtype) and not isinstance(
10477 self._mgr, ArrayManager
10478 ):
10479 return values._reduce(name, axis=1, skipna=skipna, **kwds)
10480 return values._reduce(name, skipna=skipna, **kwds)
10481 else:
10482 return op(values, axis=axis, skipna=skipna, **kwds)
10483
10484 def _get_data() -> DataFrame:
10485 if filter_type is None:
10486 data = self._get_numeric_data()
10487 else:
10488 # GH#25101, GH#24434
10489 assert filter_type == "bool"
10490 data = self._get_bool_data()
10491 return data
10492
10493 # Case with EAs see GH#35881
10494 df = self
10495 if numeric_only:
10496 df = _get_data()
10497 if axis is None:
10498 return func(df.values)
10499 elif axis == 1:
10500 if len(df.index) == 0:
10501 # Taking a transpose would result in no columns, losing the dtype.
10502 # In the empty case, reducing along axis 0 or 1 gives the same
10503 # result dtype, so reduce with axis=0 and ignore values
10504 result = df._reduce(
10505 op,
10506 name,
10507 axis=0,
10508 skipna=skipna,
10509 numeric_only=False,
10510 filter_type=filter_type,
10511 **kwds,
10512 ).iloc[:0]
10513 result.index = df.index
10514 return result
10515 df = df.T
10516
10517 # After possibly _get_data and transposing, we are now in the
10518 # simple case where we can use BlockManager.reduce
10519 res = df._mgr.reduce(blk_func)
10520 out = df._constructor(res).iloc[0]
10521 if out_dtype is not None:
10522 out = out.astype(out_dtype)
10523 elif (df._mgr.get_dtypes() == object).any():
10524 out = out.astype(object)
10525 elif len(self) == 0 and name in ("sum", "prod"):
10526 # Even if we are object dtype, follow numpy and return
10527 # float64, see test_apply_funcs_over_empty
10528 out = out.astype(np.float64)
10529
10530 return out
10531
10532 def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
10533 """
10534 Special case for _reduce to try to avoid a potentially-expensive transpose.
10535
10536 Apply the reduction block-wise along axis=1 and then reduce the resulting
10537 1D arrays.
10538 """
10539 if name == "all":
10540 result = np.ones(len(self), dtype=bool)
10541 ufunc = np.logical_and
10542 elif name == "any":
10543 result = np.zeros(len(self), dtype=bool)
10544 # error: Incompatible types in assignment
10545 # (expression has type "_UFunc_Nin2_Nout1[Literal['logical_or'],
10546 # Literal[20], Literal[False]]", variable has type
10547 # "_UFunc_Nin2_Nout1[Literal['logical_and'], Literal[20],
10548 # Literal[True]]")
10549 ufunc = np.logical_or # type: ignore[assignment]
10550 else:
10551 raise NotImplementedError(name)
10552
10553 for arr in self._mgr.arrays:
10554 middle = func(arr, axis=0, skipna=skipna)
10555 result = ufunc(result, middle)
10556
10557 res_ser = self._constructor_sliced(result, index=self.index, copy=False)
10558 return res_ser
10559
10560 def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series:
10561 """
10562 Count number of distinct elements in specified axis.
10563
10564 Return Series with number of distinct elements. Can ignore NaN
10565 values.
10566
10567 Parameters
10568 ----------
10569 axis : {0 or 'index', 1 or 'columns'}, default 0
10570 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
10571 column-wise.
10572 dropna : bool, default True
10573 Don't include NaN in the counts.
10574
10575 Returns
10576 -------
10577 Series
10578
10579 See Also
10580 --------
10581 Series.nunique: Method nunique for Series.
10582 DataFrame.count: Count non-NA cells for each column or row.
10583
10584 Examples
10585 --------
10586 >>> df = pd.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]})
10587 >>> df.nunique()
10588 A 3
10589 B 2
10590 dtype: int64
10591
10592 >>> df.nunique(axis=1)
10593 0 1
10594 1 2
10595 2 2
10596 dtype: int64
10597 """
10598 return self.apply(Series.nunique, axis=axis, dropna=dropna)
10599
10600 @doc(_shared_docs["idxmin"], numeric_only_default="False")
10601 def idxmin(
10602 self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
10603 ) -> Series:
10604 axis = self._get_axis_number(axis)
10605 if numeric_only:
10606 data = self._get_numeric_data()
10607 else:
10608 data = self
10609
10610 res = data._reduce(
10611 nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False
10612 )
10613 indices = res._values
10614
10615 # indices will always be np.ndarray since axis is not None and
10616 # values is a 2d array for DataFrame
10617 # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
10618 assert isinstance(indices, np.ndarray) # for mypy
10619
10620 index = data._get_axis(axis)
10621 result = [index[i] if i >= 0 else np.nan for i in indices]
10622 final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
10623 return final_result.__finalize__(self, method="idxmin")
10624
10625 @doc(_shared_docs["idxmax"], numeric_only_default="False")
10626 def idxmax(
10627 self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
10628 ) -> Series:
10629 axis = self._get_axis_number(axis)
10630 if numeric_only:
10631 data = self._get_numeric_data()
10632 else:
10633 data = self
10634
10635 res = data._reduce(
10636 nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False
10637 )
10638 indices = res._values
10639
10640 # indices will always be np.ndarray since axis is not None and
10641 # values is a 2d array for DataFrame
10642 # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
10643 assert isinstance(indices, np.ndarray) # for mypy
10644
10645 index = data._get_axis(axis)
10646 result = [index[i] if i >= 0 else np.nan for i in indices]
10647 final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
10648 return final_result.__finalize__(self, method="idxmax")
10649
10650 def _get_agg_axis(self, axis_num: int) -> Index:
10651 """
10652 Let's be explicit about this.
10653 """
10654 if axis_num == 0:
10655 return self.columns
10656 elif axis_num == 1:
10657 return self.index
10658 else:
10659 raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})")
10660
10661 def mode(
10662 self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True
10663 ) -> DataFrame:
10664 """
10665 Get the mode(s) of each element along the selected axis.
10666
10667 The mode of a set of values is the value that appears most often.
10668 It can be multiple values.
10669
10670 Parameters
10671 ----------
10672 axis : {0 or 'index', 1 or 'columns'}, default 0
10673 The axis to iterate over while searching for the mode:
10674
10675 * 0 or 'index' : get mode of each column
10676 * 1 or 'columns' : get mode of each row.
10677
10678 numeric_only : bool, default False
10679 If True, only apply to numeric columns.
10680 dropna : bool, default True
10681 Don't consider counts of NaN/NaT.
10682
10683 Returns
10684 -------
10685 DataFrame
10686 The modes of each column or row.
10687
10688 See Also
10689 --------
10690 Series.mode : Return the highest frequency value in a Series.
10691 Series.value_counts : Return the counts of values in a Series.
10692
10693 Examples
10694 --------
10695 >>> df = pd.DataFrame([('bird', 2, 2),
10696 ... ('mammal', 4, np.nan),
10697 ... ('arthropod', 8, 0),
10698 ... ('bird', 2, np.nan)],
10699 ... index=('falcon', 'horse', 'spider', 'ostrich'),
10700 ... columns=('species', 'legs', 'wings'))
10701 >>> df
10702 species legs wings
10703 falcon bird 2 2.0
10704 horse mammal 4 NaN
10705 spider arthropod 8 0.0
10706 ostrich bird 2 NaN
10707
        By default, missing values are not considered, and the modes of
        ``wings`` are both 0.0 and 2.0. Because the resulting DataFrame has
        two rows, the second row of ``species`` and ``legs`` contains ``NaN``.
10711
10712 >>> df.mode()
10713 species legs wings
10714 0 bird 2.0 0.0
10715 1 NaN NaN 2.0
10716
        With ``dropna=False``, ``NaN`` values are considered and can be
        the mode (as for ``wings``).
10719
10720 >>> df.mode(dropna=False)
10721 species legs wings
10722 0 bird 2 NaN
10723
10724 Setting ``numeric_only=True``, only the mode of numeric columns is
10725 computed, and columns of other types are ignored.
10726
10727 >>> df.mode(numeric_only=True)
10728 legs wings
10729 0 2.0 0.0
10730 1 NaN 2.0
10731
10732 To compute the mode over columns and not rows, use the axis parameter:
10733
10734 >>> df.mode(axis='columns', numeric_only=True)
10735 0 1
10736 falcon 2.0 NaN
10737 horse 4.0 NaN
10738 spider 0.0 8.0
10739 ostrich 2.0 NaN
10740 """
10741 data = self if not numeric_only else self._get_numeric_data()
10742
10743 def f(s):
10744 return s.mode(dropna=dropna)
10745
10746 data = data.apply(f, axis=axis)
10747 # Ensure index is type stable (should always use int index)
10748 if data.empty:
10749 data.index = default_index(0)
10750
10751 return data
10752
10753 @overload
10754 def quantile(
10755 self,
10756 q: float = ...,
10757 axis: Axis = ...,
10758 numeric_only: bool = ...,
10759 interpolation: QuantileInterpolation = ...,
10760 ) -> Series:
10761 ...
10762
10763 @overload
10764 def quantile(
10765 self,
10766 q: AnyArrayLike | Sequence[float],
10767 axis: Axis = ...,
10768 numeric_only: bool = ...,
10769 interpolation: QuantileInterpolation = ...,
10770 ) -> Series | DataFrame:
10771 ...
10772
10773 @overload
10774 def quantile(
10775 self,
10776 q: float | AnyArrayLike | Sequence[float] = ...,
10777 axis: Axis = ...,
10778 numeric_only: bool = ...,
10779 interpolation: QuantileInterpolation = ...,
10780 ) -> Series | DataFrame:
10781 ...
10782
10783 def quantile(
10784 self,
10785 q: float | AnyArrayLike | Sequence[float] = 0.5,
10786 axis: Axis = 0,
10787 numeric_only: bool = False,
10788 interpolation: QuantileInterpolation = "linear",
10789 method: Literal["single", "table"] = "single",
10790 ) -> Series | DataFrame:
10791 """
10792 Return values at the given quantile over requested axis.
10793
10794 Parameters
10795 ----------
10796 q : float or array-like, default 0.5 (50% quantile)
            Value(s) between 0 and 1 giving the quantile(s) to compute.
10798 axis : {0 or 'index', 1 or 'columns'}, default 0
10799 Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
10800 numeric_only : bool, default False
10801 Include only `float`, `int` or `boolean` data.
10802
10803 .. versionchanged:: 2.0.0
10804 The default value of ``numeric_only`` is now ``False``.
10805
10806 interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
10807 This optional parameter specifies the interpolation method to use,
10808 when the desired quantile lies between two data points `i` and `j`:
10809
10810 * linear: `i + (j - i) * fraction`, where `fraction` is the
10811 fractional part of the index surrounded by `i` and `j`.
10812 * lower: `i`.
10813 * higher: `j`.
10814 * nearest: `i` or `j` whichever is nearest.
10815 * midpoint: (`i` + `j`) / 2.
10816 method : {'single', 'table'}, default 'single'
10817 Whether to compute quantiles per-column ('single') or over all columns
10818 ('table'). When 'table', the only allowed interpolation methods are
10819 'nearest', 'lower', and 'higher'.
10820
10821 Returns
10822 -------
10823 Series or DataFrame
10824
10825 If ``q`` is an array, a DataFrame will be returned where the
10826 index is ``q``, the columns are the columns of self, and the
10827 values are the quantiles.
10828 If ``q`` is a float, a Series will be returned where the
10829 index is the columns of self and the values are the quantiles.
10830
10831 See Also
10832 --------
10833 core.window.rolling.Rolling.quantile: Rolling quantile.
10834 numpy.percentile: Numpy function to compute the percentile.
10835
10836 Examples
10837 --------
10838 >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
10839 ... columns=['a', 'b'])
10840 >>> df.quantile(.1)
10841 a 1.3
10842 b 3.7
10843 Name: 0.1, dtype: float64
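
        With the default ``interpolation='linear'``, the 10% quantile of
        column ``a`` falls at position ``(4 - 1) * 0.1 = 0.3`` between the
        sorted values 1 and 2, giving ``1 + (2 - 1) * 0.3 = 1.3``.
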
10844 >>> df.quantile([.1, .5])
10845 a b
10846 0.1 1.3 3.7
10847 0.5 2.5 55.0
10848
10849 Specifying `method='table'` will compute the quantile over all columns.
10850
10851 >>> df.quantile(.1, method="table", interpolation="nearest")
10852 a 1
10853 b 1
10854 Name: 0.1, dtype: int64
10855 >>> df.quantile([.1, .5], method="table", interpolation="nearest")
10856 a b
10857 0.1 1 1
10858 0.5 3 100
10859
10860 Specifying `numeric_only=False` will also compute the quantile of
10861 datetime and timedelta data.
10862
10863 >>> df = pd.DataFrame({'A': [1, 2],
10864 ... 'B': [pd.Timestamp('2010'),
10865 ... pd.Timestamp('2011')],
10866 ... 'C': [pd.Timedelta('1 days'),
10867 ... pd.Timedelta('2 days')]})
10868 >>> df.quantile(0.5, numeric_only=False)
10869 A 1.5
10870 B 2010-07-02 12:00:00
10871 C 1 days 12:00:00
10872 Name: 0.5, dtype: object
10873 """
10874 validate_percentile(q)
10875 axis = self._get_axis_number(axis)
10876
10877 if not is_list_like(q):
10878 # BlockManager.quantile expects listlike, so we wrap and unwrap here
10879 # error: List item 0 has incompatible type "Union[float, Union[Union[
10880 # ExtensionArray, ndarray[Any, Any]], Index, Series], Sequence[float]]";
10881 # expected "float"
10882 res_df = self.quantile( # type: ignore[call-overload]
10883 [q],
10884 axis=axis,
10885 numeric_only=numeric_only,
10886 interpolation=interpolation,
10887 method=method,
10888 )
10889 if method == "single":
10890 res = res_df.iloc[0]
10891 else:
10892 # cannot directly iloc over sparse arrays
10893 res = res_df.T.iloc[:, 0]
10894 if axis == 1 and len(self) == 0:
10895 # GH#41544 try to get an appropriate dtype
10896 dtype = find_common_type(list(self.dtypes))
10897 if needs_i8_conversion(dtype):
10898 return res.astype(dtype)
10899 return res
10900
10901 q = Index(q, dtype=np.float64)
10902 data = self._get_numeric_data() if numeric_only else self
10903
10904 if axis == 1:
10905 data = data.T
10906
10907 if len(data.columns) == 0:
10908 # GH#23925 _get_numeric_data may have dropped all columns
10909 cols = Index([], name=self.columns.name)
10910
10911 dtype = np.float64
10912 if axis == 1:
10913 # GH#41544 try to get an appropriate dtype
10914 cdtype = find_common_type(list(self.dtypes))
10915 if needs_i8_conversion(cdtype):
10916 dtype = cdtype
10917
10918 res = self._constructor([], index=q, columns=cols, dtype=dtype)
10919 return res.__finalize__(self, method="quantile")
10920
10921 valid_method = {"single", "table"}
10922 if method not in valid_method:
10923 raise ValueError(
10924 f"Invalid method: {method}. Method must be in {valid_method}."
10925 )
10926 if method == "single":
10927 res = data._mgr.quantile(qs=q, axis=1, interpolation=interpolation)
10928 elif method == "table":
10929 valid_interpolation = {"nearest", "lower", "higher"}
10930 if interpolation not in valid_interpolation:
10931 raise ValueError(
10932 f"Invalid interpolation: {interpolation}. "
10933 f"Interpolation must be in {valid_interpolation}"
10934 )
10935 # handle degenerate case
10936 if len(data) == 0:
10937 if data.ndim == 2:
10938 dtype = find_common_type(list(self.dtypes))
10939 else:
10940 dtype = self.dtype
10941 return self._constructor([], index=q, columns=data.columns, dtype=dtype)
10942
10943 q_idx = np.quantile( # type: ignore[call-overload]
10944 np.arange(len(data)), q, **{np_percentile_argname: interpolation}
10945 )
10946
10947 by = data.columns
10948 if len(by) > 1:
10949 keys = [data._get_label_or_level_values(x) for x in by]
10950 indexer = lexsort_indexer(keys)
10951 else:
10952 by = by[0]
10953 k = data._get_label_or_level_values(by) # type: ignore[arg-type]
10954 indexer = nargsort(k)
10955
10956 res = data._mgr.take(indexer[q_idx], verify=False)
10957 res.axes[1] = q
10958
10959 result = self._constructor(res)
10960 return result.__finalize__(self, method="quantile")
10961
10962 @doc(NDFrame.asfreq, **_shared_doc_kwargs)
10963 def asfreq(
10964 self,
10965 freq: Frequency,
10966 method: FillnaOptions | None = None,
10967 how: str | None = None,
10968 normalize: bool = False,
10969 fill_value: Hashable = None,
10970 ) -> DataFrame:
10971 return super().asfreq(
10972 freq=freq,
10973 method=method,
10974 how=how,
10975 normalize=normalize,
10976 fill_value=fill_value,
10977 )
10978
10979 @doc(NDFrame.resample, **_shared_doc_kwargs)
10980 def resample(
10981 self,
10982 rule,
10983 axis: Axis = 0,
10984 closed: str | None = None,
10985 label: str | None = None,
10986 convention: str = "start",
10987 kind: str | None = None,
10988 on: Level = None,
10989 level: Level = None,
10990 origin: str | TimestampConvertibleTypes = "start_day",
10991 offset: TimedeltaConvertibleTypes | None = None,
10992 group_keys: bool = False,
10993 ) -> Resampler:
10994 return super().resample(
10995 rule=rule,
10996 axis=axis,
10997 closed=closed,
10998 label=label,
10999 convention=convention,
11000 kind=kind,
11001 on=on,
11002 level=level,
11003 origin=origin,
11004 offset=offset,
11005 group_keys=group_keys,
11006 )
11007
11008 def to_timestamp(
11009 self,
11010 freq: Frequency | None = None,
11011 how: str = "start",
11012 axis: Axis = 0,
11013 copy: bool | None = None,
11014 ) -> DataFrame:
11015 """
11016 Cast to DatetimeIndex of timestamps, at *beginning* of period.
11017
11018 Parameters
11019 ----------
11020 freq : str, default frequency of PeriodIndex
11021 Desired frequency.
11022 how : {'s', 'e', 'start', 'end'}
11023 Convention for converting period to timestamp; start of period
11024 vs. end.
11025 axis : {0 or 'index', 1 or 'columns'}, default 0
11026 The axis to convert (the index by default).
11027 copy : bool, default True
11028 If False then underlying input data is not copied.
11029
11030 Returns
11031 -------
11032 DataFrame
11033 The DataFrame has a DatetimeIndex.
11034
11035 Examples
11036 --------
11037 >>> idx = pd.PeriodIndex(['2023', '2024'], freq='Y')
11038 >>> d = {'col1': [1, 2], 'col2': [3, 4]}
11039 >>> df1 = pd.DataFrame(data=d, index=idx)
11040 >>> df1
11041 col1 col2
11042 2023 1 3
11043 2024 2 4
11044
11045 The resulting timestamps will be at the beginning of the year in this case
11046
11047 >>> df1 = df1.to_timestamp()
11048 >>> df1
11049 col1 col2
11050 2023-01-01 1 3
11051 2024-01-01 2 4
11052 >>> df1.index
11053 DatetimeIndex(['2023-01-01', '2024-01-01'], dtype='datetime64[ns]', freq=None)
11054
        Using `freq`, the offset that the resulting timestamps will have
11056
11057 >>> df2 = pd.DataFrame(data=d, index=idx)
11058 >>> df2 = df2.to_timestamp(freq='M')
11059 >>> df2
11060 col1 col2
11061 2023-01-31 1 3
11062 2024-01-31 2 4
11063 >>> df2.index
11064 DatetimeIndex(['2023-01-31', '2024-01-31'], dtype='datetime64[ns]', freq=None)
11065 """
11066 new_obj = self.copy(deep=copy and not using_copy_on_write())
11067
11068 axis_name = self._get_axis_name(axis)
11069 old_ax = getattr(self, axis_name)
11070 if not isinstance(old_ax, PeriodIndex):
11071 raise TypeError(f"unsupported Type {type(old_ax).__name__}")
11072
11073 new_ax = old_ax.to_timestamp(freq=freq, how=how)
11074
11075 setattr(new_obj, axis_name, new_ax)
11076 return new_obj
11077
11078 def to_period(
11079 self, freq: Frequency | None = None, axis: Axis = 0, copy: bool | None = None
11080 ) -> DataFrame:
11081 """
11082 Convert DataFrame from DatetimeIndex to PeriodIndex.
11083
11084 Convert DataFrame from DatetimeIndex to PeriodIndex with desired
11085 frequency (inferred from index if not passed).
11086
11087 Parameters
11088 ----------
        freq : str, optional
            Frequency of the PeriodIndex; inferred from the index if not passed.
11091 axis : {0 or 'index', 1 or 'columns'}, default 0
11092 The axis to convert (the index by default).
11093 copy : bool, default True
11094 If False then underlying input data is not copied.
11095
11096 Returns
11097 -------
11098 DataFrame
11099 The DataFrame has a PeriodIndex.
11100
11101 Examples
11102 --------
11103 >>> idx = pd.to_datetime(
11104 ... [
11105 ... "2001-03-31 00:00:00",
11106 ... "2002-05-31 00:00:00",
11107 ... "2003-08-31 00:00:00",
11108 ... ]
11109 ... )
11110
11111 >>> idx
11112 DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'],
11113 dtype='datetime64[ns]', freq=None)
11114
11115 >>> idx.to_period("M")
11116 PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]')
11117
11118 For the yearly frequency
11119
11120 >>> idx.to_period("Y")
11121 PeriodIndex(['2001', '2002', '2003'], dtype='period[A-DEC]')
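
        A DataFrame built on this index (an illustrative frame) converts the
        same way:

        >>> df = pd.DataFrame({"y": [1, 2, 3]}, index=idx)
        >>> df.to_period("M").index
        PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]')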
11122 """
11123 new_obj = self.copy(deep=copy and not using_copy_on_write())
11124
11125 axis_name = self._get_axis_name(axis)
11126 old_ax = getattr(self, axis_name)
11127 if not isinstance(old_ax, DatetimeIndex):
11128 raise TypeError(f"unsupported Type {type(old_ax).__name__}")
11129
11130 new_ax = old_ax.to_period(freq=freq)
11131
11132 setattr(new_obj, axis_name, new_ax)
11133 return new_obj
11134
11135 def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame:
11136 """
11137 Whether each element in the DataFrame is contained in values.
11138
11139 Parameters
11140 ----------
11141 values : iterable, Series, DataFrame or dict
11142 The result will only be true at a location if all the
11143 labels match. If `values` is a Series, that's the index. If
11144 `values` is a dict, the keys must be the column names,
11145 which must match. If `values` is a DataFrame,
11146 then both the index and column labels must match.
11147
11148 Returns
11149 -------
11150 DataFrame
11151 DataFrame of booleans showing whether each element in the DataFrame
11152 is contained in values.
11153
11154 See Also
11155 --------
11156 DataFrame.eq: Equality test for DataFrame.
11157 Series.isin: Equivalent method on Series.
11158 Series.str.contains: Test if pattern or regex is contained within a
11159 string of a Series or Index.
11160
11161 Examples
11162 --------
11163 >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
11164 ... index=['falcon', 'dog'])
11165 >>> df
11166 num_legs num_wings
11167 falcon 2 2
11168 dog 4 0
11169
        When ``values`` is a list, check whether every value in the DataFrame
        is present in the list (which animals have 0 or 2 legs or wings).
11172
11173 >>> df.isin([0, 2])
11174 num_legs num_wings
11175 falcon True True
11176 dog False True
11177
11178 To check if ``values`` is *not* in the DataFrame, use the ``~`` operator:
11179
11180 >>> ~df.isin([0, 2])
11181 num_legs num_wings
11182 falcon False False
11183 dog True False
11184
11185 When ``values`` is a dict, we can pass values to check for each
11186 column separately:
11187
11188 >>> df.isin({'num_wings': [0, 3]})
11189 num_legs num_wings
11190 falcon False False
11191 dog False True
11192
        When ``values`` is a Series or DataFrame, the index and column must
11194 match. Note that 'falcon' does not match based on the number of legs
11195 in other.
11196
11197 >>> other = pd.DataFrame({'num_legs': [8, 3], 'num_wings': [0, 2]},
11198 ... index=['spider', 'falcon'])
11199 >>> df.isin(other)
11200 num_legs num_wings
11201 falcon False True
11202 dog False False
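
        A Series is likewise aligned on the index, so each row is compared
        against a single value:

        >>> df.isin(pd.Series([2, 0], index=['falcon', 'dog']))
                num_legs  num_wings
        falcon      True       True
        dog        False       True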
11203 """
11204 if isinstance(values, dict):
11205 from pandas.core.reshape.concat import concat
11206
11207 values = collections.defaultdict(list, values)
11208 result = concat(
11209 (
11210 self.iloc[:, [i]].isin(values[col])
11211 for i, col in enumerate(self.columns)
11212 ),
11213 axis=1,
11214 )
11215 elif isinstance(values, Series):
11216 if not values.index.is_unique:
11217 raise ValueError("cannot compute isin with a duplicate axis.")
11218 result = self.eq(values.reindex_like(self), axis="index")
11219 elif isinstance(values, DataFrame):
11220 if not (values.columns.is_unique and values.index.is_unique):
11221 raise ValueError("cannot compute isin with a duplicate axis.")
11222 result = self.eq(values.reindex_like(self))
11223 else:
11224 if not is_list_like(values):
11225 raise TypeError(
11226 "only list-like or dict-like objects are allowed "
11227 "to be passed to DataFrame.isin(), "
11228 f"you passed a '{type(values).__name__}'"
11229 )
11230 # error: Argument 2 to "isin" has incompatible type "Union[Sequence[Any],
11231 # Mapping[Any, Any]]"; expected "Union[Union[ExtensionArray,
11232 # ndarray[Any, Any]], Index, Series]"
11233 result = self._constructor(
11234 algorithms.isin(
11235 self.values.ravel(), values # type: ignore[arg-type]
11236 ).reshape(self.shape),
11237 self.index,
11238 self.columns,
11239 copy=False,
11240 )
11241 return result.__finalize__(self, method="isin")

    # ----------------------------------------------------------------------
    # Add index and columns
    _AXIS_ORDERS: list[Literal["index", "columns"]] = ["index", "columns"]
    _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = {
        **NDFrame._AXIS_TO_AXIS_NUMBER,
        1: 1,
        "columns": 1,
    }
    _AXIS_LEN = len(_AXIS_ORDERS)
    _info_axis_number: Literal[1] = 1
    _info_axis_name: Literal["columns"] = "columns"

    index = properties.AxisProperty(
        axis=1, doc="The index (row labels) of the DataFrame."
    )
    columns = properties.AxisProperty(axis=0, doc="The column labels of the DataFrame.")

    # ----------------------------------------------------------------------
    # Add plotting methods to DataFrame
    plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
    hist = pandas.plotting.hist_frame
    boxplot = pandas.plotting.boxplot_frame
    sparse = CachedAccessor("sparse", SparseFrameAccessor)

    # ----------------------------------------------------------------------
    # Internal Interface Methods

    def _to_dict_of_blocks(self, copy: bool = True):
        """
        Return a dict of dtype -> constructor type (DataFrame), where each
        value contains only the columns of that single, homogeneous dtype.

        Internal ONLY - only works for BlockManager
        """
        mgr = self._mgr
        # convert to BlockManager if needed -> this way support ArrayManager as well
        mgr = mgr_to_mgr(mgr, "block")
        mgr = cast(BlockManager, mgr)
        return {
            k: self._constructor(v).__finalize__(self)
            for k, v in mgr.to_dict(copy=copy).items()
        }
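
    # Illustrative sketch of the return value (keys are the dtype strings
    # produced by BlockManager.to_dict; hypothetical, not executed here):
    #
    #   >>> df = pd.DataFrame({"a": [1, 2], "b": [1.5, 2.5]})
    #   >>> df._to_dict_of_blocks()
    #   {'int64': <DataFrame with column 'a'>, 'float64': <DataFrame with column 'b'>}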

    @property
    def values(self) -> np.ndarray:
        """
        Return a NumPy representation of the DataFrame.

        .. warning::

           We recommend using :meth:`DataFrame.to_numpy` instead.

        Only the values in the DataFrame will be returned; the axes labels
        will be removed.

        Returns
        -------
        numpy.ndarray
            The values of the DataFrame.

        See Also
        --------
        DataFrame.to_numpy : Recommended alternative to this method.
        DataFrame.index : Retrieve the index labels.
        DataFrame.columns : Retrieve the column names.

        Notes
        -----
        The dtype will be a lowest-common-denominator dtype (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen. Use this
        with care if you are not dealing with the blocks.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. If dtypes are int32 and uint8, dtype will be upcast to
        int32. By :func:`numpy.find_common_type` convention, mixing int64
        and uint64 will result in a float64 dtype.

        Examples
        --------
        A DataFrame where all columns are the same type (e.g., int64) results
        in an array of the same type.

        >>> df = pd.DataFrame({'age':    [ 3,  29],
        ...                    'height': [94, 170],
        ...                    'weight': [31, 115]})
        >>> df
           age  height  weight
        0    3      94      31
        1   29     170     115
        >>> df.dtypes
        age       int64
        height    int64
        weight    int64
        dtype: object
        >>> df.values
        array([[  3,  94,  31],
               [ 29, 170, 115]])

        A DataFrame with mixed-type columns (e.g., str/object, int64, float32)
        results in an ndarray of the broadest type that accommodates these
        mixed types (e.g., object).

        >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'),
        ...                     ('lion',   80.5, 1),
        ...                     ('monkey', np.nan, None)],
        ...                    columns=('name', 'max_speed', 'rank'))
        >>> df2.dtypes
        name          object
        max_speed    float64
        rank          object
        dtype: object
        >>> df2.values
        array([['parrot', 24.0, 'second'],
               ['lion', 80.5, 1],
               ['monkey', nan, None]], dtype=object)
        """
        return self._mgr.as_array()
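
    # As the docstring warns, ``DataFrame.to_numpy`` is usually preferable:
    # it exposes ``dtype`` and ``na_value`` so callers can control the
    # upcasting that ``values`` performs implicitly. Illustrative sketch:
    #
    #   >>> df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
    #   >>> df.to_numpy(dtype="float64")
    #   array([[1., 3.],
    #          [2., 4.]])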

    @overload
    def ffill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: Literal[False] = ...,
        limit: None | int = ...,
        downcast: dict | None = ...,
    ) -> DataFrame:
        ...

    @overload
    def ffill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: Literal[True],
        limit: None | int = ...,
        downcast: dict | None = ...,
    ) -> None:
        ...

    @overload
    def ffill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: bool = ...,
        limit: None | int = ...,
        downcast: dict | None = ...,
    ) -> DataFrame | None:
        ...

    def ffill(
        self,
        *,
        axis: None | Axis = None,
        inplace: bool = False,
        limit: None | int = None,
        downcast: dict | None = None,
    ) -> DataFrame | None:
        return super().ffill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
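
    # The overloads above only refine the return type for static type
    # checkers; the behaviour lives in NDFrame.ffill. Illustrative sketch of
    # forward fill:
    #
    #   >>> df = pd.DataFrame({"a": [1.0, None, 3.0]})
    #   >>> df.ffill()
    #        a
    #   0  1.0
    #   1  1.0
    #   2  3.0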

    @overload
    def bfill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: Literal[False] = ...,
        limit: None | int = ...,
        downcast=...,
    ) -> DataFrame:
        ...

    @overload
    def bfill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: Literal[True],
        limit: None | int = ...,
        downcast=...,
    ) -> None:
        ...

    @overload
    def bfill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: bool = ...,
        limit: None | int = ...,
        downcast=...,
    ) -> DataFrame | None:
        ...

    def bfill(
        self,
        *,
        axis: None | Axis = None,
        inplace: bool = False,
        limit: None | int = None,
        downcast=None,
    ) -> DataFrame | None:
        return super().bfill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
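
    # Counterpart of ffill: gaps are filled from the *next* valid observation
    # instead of the previous one. Illustrative sketch:
    #
    #   >>> df = pd.DataFrame({"a": [None, 2.0, None]})
    #   >>> df.bfill()
    #        a
    #   0  2.0
    #   1  2.0
    #   2  NaN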

    def clip(
        self: DataFrame,
        lower: float | None = None,
        upper: float | None = None,
        *,
        axis: Axis | None = None,
        inplace: bool = False,
        **kwargs,
    ) -> DataFrame | None:
        return super().clip(lower, upper, axis=axis, inplace=inplace, **kwargs)
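
    # Illustrative sketch: values below ``lower`` or above ``upper`` are
    # capped at the corresponding bound.
    #
    #   >>> df = pd.DataFrame({"a": [-1, 5, 10]})
    #   >>> df.clip(lower=0, upper=8)
    #      a
    #   0  0
    #   1  5
    #   2  8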

    def interpolate(
        self: DataFrame,
        method: str = "linear",
        *,
        axis: Axis = 0,
        limit: int | None = None,
        inplace: bool = False,
        limit_direction: str | None = None,
        limit_area: str | None = None,
        downcast: str | None = None,
        **kwargs,
    ) -> DataFrame | None:
        return super().interpolate(
            method=method,
            axis=axis,
            limit=limit,
            inplace=inplace,
            limit_direction=limit_direction,
            limit_area=limit_area,
            downcast=downcast,
            **kwargs,
        )
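
    # Illustrative sketch of the default ``method="linear"``: interior NaNs
    # are replaced by values interpolated from their neighbours.
    #
    #   >>> df = pd.DataFrame({"a": [1.0, None, 3.0]})
    #   >>> df.interpolate()
    #        a
    #   0  1.0
    #   1  2.0
    #   2  3.0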

    @overload
    def where(
        self,
        cond,
        other=...,
        *,
        inplace: Literal[False] = ...,
        axis: Axis | None = ...,
        level: Level = ...,
    ) -> DataFrame:
        ...

    @overload
    def where(
        self,
        cond,
        other=...,
        *,
        inplace: Literal[True],
        axis: Axis | None = ...,
        level: Level = ...,
    ) -> None:
        ...

    @overload
    def where(
        self,
        cond,
        other=...,
        *,
        inplace: bool = ...,
        axis: Axis | None = ...,
        level: Level = ...,
    ) -> DataFrame | None:
        ...

    def where(
        self,
        cond,
        other=lib.no_default,
        *,
        inplace: bool = False,
        axis: Axis | None = None,
        level: Level = None,
    ) -> DataFrame | None:
        return super().where(
            cond,
            other,
            inplace=inplace,
            axis=axis,
            level=level,
        )
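
    # Illustrative sketch: entries where ``cond`` is False are replaced by
    # ``other`` (NaN when ``other`` is not supplied):
    #
    #   >>> df = pd.DataFrame({"a": [-1, 2]})
    #   >>> df.where(df > 0)
    #        a
    #   0  NaN
    #   1  2.0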

    @overload
    def mask(
        self,
        cond,
        other=...,
        *,
        inplace: Literal[False] = ...,
        axis: Axis | None = ...,
        level: Level = ...,
    ) -> DataFrame:
        ...

    @overload
    def mask(
        self,
        cond,
        other=...,
        *,
        inplace: Literal[True],
        axis: Axis | None = ...,
        level: Level = ...,
    ) -> None:
        ...

    @overload
    def mask(
        self,
        cond,
        other=...,
        *,
        inplace: bool = ...,
        axis: Axis | None = ...,
        level: Level = ...,
    ) -> DataFrame | None:
        ...

    def mask(
        self,
        cond,
        other=lib.no_default,
        *,
        inplace: bool = False,
        axis: Axis | None = None,
        level: Level = None,
    ) -> DataFrame | None:
        return super().mask(
            cond,
            other,
            inplace=inplace,
            axis=axis,
            level=level,
        )
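
    # Inverse of ``where``: entries where ``cond`` is *True* are replaced.
    # Illustrative sketch:
    #
    #   >>> df = pd.DataFrame({"a": [-1, 2]})
    #   >>> df.mask(df > 0)
    #        a
    #   0 -1.0
    #   1  NaN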


DataFrame._add_numeric_operations()

ops.add_flex_arithmetic_methods(DataFrame)


def _from_nested_dict(data) -> collections.defaultdict:
    # Transpose a nested mapping of {index -> {column -> value}} into
    # {column -> {index -> value}} so a frame can be built column-wise.
    new_data: collections.defaultdict = collections.defaultdict(dict)
    for index, s in data.items():
        for col, v in s.items():
            new_data[col][index] = v
    return new_data
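
# Illustrative sketch of the transposition (hypothetical, not executed):
#
#   >>> _from_nested_dict({"row1": {"col1": 1, "col2": 2}})
#   defaultdict(<class 'dict'>, {'col1': {'row1': 1}, 'col2': {'row1': 2}})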


def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike:
    # reindex if necessary

    if value.index.equals(index) or not len(index):
        return value._values.copy()

    # GH#4107
    try:
        reindexed_value = value.reindex(index)._values
    except ValueError as err:
        # raised in MultiIndex.from_tuples, see test_insert_error_msmgs
        if not value.index.is_unique:
            # duplicate axis
            raise err

        raise TypeError(
            "incompatible index of inserted column with frame index"
        ) from err
    return reindexed_value
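
# Illustrative sketch of the alignment performed above (hypothetical, not
# executed): assigning a Series with a partially overlapping index leaves
# non-matching rows as NaN.
#
#   >>> df = pd.DataFrame({"a": [1, 2]}, index=["x", "y"])
#   >>> df["b"] = pd.Series([10], index=["x"])   # reindexed to ["x", "y"]
#   >>> df
#      a     b
#   x  1  10.0
#   y  2   NaN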