# pyright: reportPropertyTypeMismatch=false
from __future__ import annotations

import collections
from copy import deepcopy
import datetime as dt
from functools import partial
import gc
from json import loads
import operator
import pickle
import re
import sys
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    ClassVar,
    Literal,
    NoReturn,
    cast,
    final,
    overload,
)
import warnings
import weakref

import numpy as np

from pandas._config import (
    config,
    using_copy_on_write,
    warn_copy_on_write,
)

from pandas._libs import lib
from pandas._libs.lib import is_range_indexer
from pandas._libs.tslibs import (
    Period,
    Tick,
    Timestamp,
    to_offset,
)
from pandas._libs.tslibs.dtypes import freq_to_period_freqstr
from pandas._typing import (
    AlignJoin,
    AnyArrayLike,
    ArrayLike,
    Axes,
    Axis,
    AxisInt,
    CompressionOptions,
    DtypeArg,
    DtypeBackend,
    DtypeObj,
    FilePath,
    FillnaOptions,
    FloatFormatType,
    FormattersType,
    Frequency,
    IgnoreRaise,
    IndexKeyFunc,
    IndexLabel,
    InterpolateOptions,
    IntervalClosedType,
    JSONSerializable,
    Level,
    Manager,
    NaPosition,
    NDFrameT,
    OpenFileErrors,
    RandomState,
    ReindexMethod,
    Renamer,
    Scalar,
    Self,
    SequenceNotStr,
    SortKind,
    StorageOptions,
    Suffixes,
    T,
    TimeAmbiguous,
    TimedeltaConvertibleTypes,
    TimeNonexistent,
    TimestampConvertibleTypes,
    TimeUnit,
    ValueKeyFunc,
    WriteBuffer,
    WriteExcelBuffer,
    npt,
)
from pandas.compat import PYPY
from pandas.compat._constants import REF_COUNT
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.errors import (
    AbstractMethodError,
    ChainedAssignmentError,
    InvalidIndexError,
    SettingWithCopyError,
    SettingWithCopyWarning,
    _chained_assignment_method_msg,
    _chained_assignment_warning_method_msg,
    _check_cacher,
)
from pandas.util._decorators import (
    deprecate_nonkeyword_arguments,
    doc,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import (
    check_dtype_backend,
    validate_ascending,
    validate_bool_kwarg,
    validate_fillna_kwargs,
    validate_inclusive,
)

from pandas.core.dtypes.astype import astype_is_view
from pandas.core.dtypes.common import (
    ensure_object,
    ensure_platform_int,
    ensure_str,
    is_bool,
    is_bool_dtype,
    is_dict_like,
    is_extension_array_dtype,
    is_list_like,
    is_number,
    is_numeric_dtype,
    is_re_compilable,
    is_scalar,
    pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
    DatetimeTZDtype,
    ExtensionDtype,
)
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCSeries,
)
from pandas.core.dtypes.inference import (
    is_hashable,
    is_nested_list_like,
)
from pandas.core.dtypes.missing import (
    isna,
    notna,
)

from pandas.core import (
    algorithms as algos,
    arraylike,
    common,
    indexing,
    missing,
    nanops,
    sample,
)
from pandas.core.array_algos.replace import should_use_regex
from pandas.core.arrays import ExtensionArray
from pandas.core.base import PandasObject
from pandas.core.construction import extract_array
from pandas.core.flags import Flags
from pandas.core.indexes.api import (
    DatetimeIndex,
    Index,
    MultiIndex,
    PeriodIndex,
    RangeIndex,
    default_index,
    ensure_index,
)
from pandas.core.internals import (
    ArrayManager,
    BlockManager,
    SingleArrayManager,
)
from pandas.core.internals.construction import (
    mgr_to_mgr,
    ndarray_to_mgr,
)
from pandas.core.methods.describe import describe_ndframe
from pandas.core.missing import (
    clean_fill_method,
    clean_reindex_fill_method,
    find_valid_index,
)
from pandas.core.reshape.concat import concat
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import get_indexer_indexer
from pandas.core.window import (
    Expanding,
    ExponentialMovingWindow,
    Rolling,
    Window,
)

from pandas.io.formats.format import (
    DataFrameFormatter,
    DataFrameRenderer,
)
from pandas.io.formats.printing import pprint_thing

if TYPE_CHECKING:
    from collections.abc import (
        Hashable,
        Iterator,
        Mapping,
        Sequence,
    )

    from pandas._libs.tslibs import BaseOffset

    from pandas import (
        DataFrame,
        ExcelWriter,
        HDFStore,
        Series,
    )
    from pandas.core.indexers.objects import BaseIndexer
    from pandas.core.resample import Resampler

# goal is to be able to define the docs close to function, while still being
# able to share
_shared_docs = {**_shared_docs}
_shared_doc_kwargs = {
    "axes": "keywords for axes",
    "klass": "Series/DataFrame",
    "axes_single_arg": "{0 or 'index'} for Series, {0 or 'index', 1 or 'columns'} for DataFrame",  # noqa: E501
    "inplace": """
    inplace : bool, default False
        If True, performs operation inplace and returns None.""",
    "optional_by": """
        by : str or list of str
            Name or list of names to sort by""",
}


bool_t = bool  # Need alias because NDFrame has def bool:
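
# A brief illustration of the alias (hypothetical signature, not defined in
# this module): an annotation such as ``def copy(self, deep: bool_t = True)``
# must spell the builtin as ``bool_t``, because within the class body the name
# ``bool`` refers to the ``NDFrame.bool`` method defined below.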


class NDFrame(PandasObject, indexing.IndexingMixin):
    """
    N-dimensional analogue of DataFrame. Store multi-dimensional data in a
    size-mutable, labeled data structure.

    Parameters
    ----------
    data : BlockManager
    axes : list
    copy : bool, default False
    """

    _internal_names: list[str] = [
        "_mgr",
        "_cacher",
        "_item_cache",
        "_cache",
        "_is_copy",
        "_name",
        "_metadata",
        "_flags",
    ]
    _internal_names_set: set[str] = set(_internal_names)
    _accessors: set[str] = set()
    _hidden_attrs: frozenset[str] = frozenset([])
    _metadata: list[str] = []
    _is_copy: weakref.ReferenceType[NDFrame] | str | None = None
    _mgr: Manager
    _attrs: dict[Hashable, Any]
    _typ: str

    # ----------------------------------------------------------------------
    # Constructors

    def __init__(self, data: Manager) -> None:
        object.__setattr__(self, "_is_copy", None)
        object.__setattr__(self, "_mgr", data)
        object.__setattr__(self, "_item_cache", {})
        object.__setattr__(self, "_attrs", {})
        object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True))

    @final
    @classmethod
    def _init_mgr(
        cls,
        mgr: Manager,
        axes: dict[Literal["index", "columns"], Axes | None],
        dtype: DtypeObj | None = None,
        copy: bool_t = False,
    ) -> Manager:
        """passed a manager and an axes dict"""
        for a, axe in axes.items():
            if axe is not None:
                axe = ensure_index(axe)
                bm_axis = cls._get_block_manager_axis(a)
                mgr = mgr.reindex_axis(axe, axis=bm_axis)

        # make a copy if explicitly requested
        if copy:
            mgr = mgr.copy()
        if dtype is not None:
            # avoid further copies if we can
            if (
                isinstance(mgr, BlockManager)
                and len(mgr.blocks) == 1
                and mgr.blocks[0].values.dtype == dtype
            ):
                pass
            else:
                mgr = mgr.astype(dtype=dtype)
        return mgr
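
    # Shape sketch for the ``axes`` argument above (an assumption based on the
    # signature, for illustration only): something like
    #   {"index": Index([...]), "columns": None}
    # where a None entry leaves that axis of the manager untouched.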

    @final
    def _as_manager(self, typ: str, copy: bool_t = True) -> Self:
        """
        Private helper function to create a DataFrame with specific manager.

        Parameters
        ----------
        typ : {"block", "array"}
        copy : bool, default True
            Only controls whether the conversion from Block->ArrayManager
            copies the 1D arrays (to ensure proper/contiguous memory layout).

        Returns
        -------
        DataFrame
            New DataFrame using specified manager type. The result is not
            guaranteed to be a copy.
        """
        new_mgr: Manager
        new_mgr = mgr_to_mgr(self._mgr, typ=typ, copy=copy)
        # fastpath of passing a manager doesn't check the option/manager class
        return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self)
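
    # Minimal usage sketch of this private helper (assuming a BlockManager-backed
    # DataFrame ``df``):
    #   arr_df = df._as_manager("array")     # equivalent frame on an ArrayManager
    #   blk_df = arr_df._as_manager("block")  # and back to a BlockManager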

    @final
    @classmethod
    def _from_mgr(cls, mgr: Manager, axes: list[Index]) -> Self:
        """
        Construct a new object of this type from a Manager object and axes.

        Parameters
        ----------
        mgr : Manager
            Must have the same ndim as cls.
        axes : list[Index]

        Notes
        -----
        The axes must match mgr.axes, but are required for future-proofing
        in the event that axes are refactored out of the Manager objects.
        """
        obj = cls.__new__(cls)
        NDFrame.__init__(obj, mgr)
        return obj

    # ----------------------------------------------------------------------
    # attrs and flags

    @property
    def attrs(self) -> dict[Hashable, Any]:
        """
        Dictionary of global attributes of this dataset.

        .. warning::

           attrs is experimental and may change without warning.

        See Also
        --------
        DataFrame.flags : Global flags applying to this object.

        Notes
        -----
        Many operations that create new datasets will copy ``attrs``. Copies
        are always deep so that changing ``attrs`` will only affect the
        present dataset. ``pandas.concat`` copies ``attrs`` only if all input
        datasets have the same ``attrs``.

        Examples
        --------
        For Series:

        >>> ser = pd.Series([1, 2, 3])
        >>> ser.attrs = {"A": [10, 20, 30]}
        >>> ser.attrs
        {'A': [10, 20, 30]}

        For DataFrame:

        >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
        >>> df.attrs = {"A": [10, 20, 30]}
        >>> df.attrs
        {'A': [10, 20, 30]}
        """
        return self._attrs

    @attrs.setter
    def attrs(self, value: Mapping[Hashable, Any]) -> None:
        self._attrs = dict(value)

    @final
    @property
    def flags(self) -> Flags:
        """
        Get the properties associated with this pandas object.

        The available flags are

        * :attr:`Flags.allows_duplicate_labels`

        See Also
        --------
        Flags : Flags that apply to pandas objects.
        DataFrame.attrs : Global metadata applying to this dataset.

        Notes
        -----
        "Flags" differ from "metadata". Flags reflect properties of the
        pandas object (the Series or DataFrame). Metadata refer to properties
        of the dataset, and should be stored in :attr:`DataFrame.attrs`.

        Examples
        --------
        >>> df = pd.DataFrame({"A": [1, 2]})
        >>> df.flags
        <Flags(allows_duplicate_labels=True)>

        Flags can be read or set using ``.``

        >>> df.flags.allows_duplicate_labels
        True
        >>> df.flags.allows_duplicate_labels = False

        Or by slicing with a key

        >>> df.flags["allows_duplicate_labels"]
        False
        >>> df.flags["allows_duplicate_labels"] = True
        """
        return self._flags

    @final
    def set_flags(
        self,
        *,
        copy: bool_t = False,
        allows_duplicate_labels: bool_t | None = None,
    ) -> Self:
        """
        Return a new object with updated flags.

        Parameters
        ----------
        copy : bool, default False
            Specify if a copy of the object should be made.

            .. note::
                The `copy` keyword will change behavior in pandas 3.0.
                `Copy-on-Write
                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
                will be enabled by default, which means that all methods with a
                `copy` keyword will use a lazy copy mechanism to defer the copy and
                ignore the `copy` keyword. The `copy` keyword will be removed in a
                future version of pandas.

                You can already get the future behavior and improvements by
                enabling copy-on-write: ``pd.options.mode.copy_on_write = True``
        allows_duplicate_labels : bool, optional
            Whether the returned object allows duplicate labels.

        Returns
        -------
        Series or DataFrame
            The same type as the caller.

        See Also
        --------
        DataFrame.attrs : Global metadata applying to this dataset.
        DataFrame.flags : Global flags applying to this object.

        Notes
        -----
        This method returns a new object that's a view on the same data
        as the input. Mutating the input or the output values will be reflected
        in the other.

        This method is intended to be used in method chains.

        "Flags" differ from "metadata". Flags reflect properties of the
        pandas object (the Series or DataFrame). Metadata refer to properties
        of the dataset, and should be stored in :attr:`DataFrame.attrs`.

        Examples
        --------
        >>> df = pd.DataFrame({"A": [1, 2]})
        >>> df.flags.allows_duplicate_labels
        True
        >>> df2 = df.set_flags(allows_duplicate_labels=False)
        >>> df2.flags.allows_duplicate_labels
        False
        """
        df = self.copy(deep=copy and not using_copy_on_write())
        if allows_duplicate_labels is not None:
            df.flags["allows_duplicate_labels"] = allows_duplicate_labels
        return df

    @final
    @classmethod
    def _validate_dtype(cls, dtype) -> DtypeObj | None:
        """validate the passed dtype"""
        if dtype is not None:
            dtype = pandas_dtype(dtype)

            # a compound dtype
            if dtype.kind == "V":
                raise NotImplementedError(
                    "compound dtypes are not implemented "
                    f"in the {cls.__name__} constructor"
                )

        return dtype

    # ----------------------------------------------------------------------
    # Construction

    @property
    def _constructor(self) -> Callable[..., Self]:
        """
        Used when a manipulation result has the same dimensions as the
        original.
        """
        raise AbstractMethodError(self)

    # ----------------------------------------------------------------------
    # Internals

    @final
    @property
    def _data(self):
        # GH#33054 retained because some downstream packages use this,
        # e.g. fastparquet
        # GH#33333
        warnings.warn(
            f"{type(self).__name__}._data is deprecated and will be removed in "
            "a future version. Use public APIs instead.",
            DeprecationWarning,
            stacklevel=find_stack_level(),
        )
        return self._mgr

    # ----------------------------------------------------------------------
    # Axis
    _AXIS_ORDERS: list[Literal["index", "columns"]]
    _AXIS_TO_AXIS_NUMBER: dict[Axis, AxisInt] = {0: 0, "index": 0, "rows": 0}
    _info_axis_number: int
    _info_axis_name: Literal["index", "columns"]
    _AXIS_LEN: int

    @final
    def _construct_axes_dict(self, axes: Sequence[Axis] | None = None, **kwargs):
        """Return an axes dictionary for myself."""
        d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)}
        # error: Argument 1 to "update" of "MutableMapping" has incompatible type
        # "Dict[str, Any]"; expected "SupportsKeysAndGetItem[Union[int, str], Any]"
        d.update(kwargs)  # type: ignore[arg-type]
        return d
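
    # For a DataFrame ``df`` this returns, e.g.,
    #   {"index": df.index, "columns": df.columns}
    # with any keyword arguments merged into the same dict.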

    @final
    @classmethod
    def _get_axis_number(cls, axis: Axis) -> AxisInt:
        try:
            return cls._AXIS_TO_AXIS_NUMBER[axis]
        except KeyError:
            raise ValueError(f"No axis named {axis} for object type {cls.__name__}")

    @final
    @classmethod
    def _get_axis_name(cls, axis: Axis) -> Literal["index", "columns"]:
        axis_number = cls._get_axis_number(axis)
        return cls._AXIS_ORDERS[axis_number]

    @final
    def _get_axis(self, axis: Axis) -> Index:
        axis_number = self._get_axis_number(axis)
        assert axis_number in {0, 1}
        return self.index if axis_number == 0 else self.columns

    @final
    @classmethod
    def _get_block_manager_axis(cls, axis: Axis) -> AxisInt:
        """Map the axis to the block_manager axis."""
        axis = cls._get_axis_number(axis)
        ndim = cls._AXIS_LEN
        if ndim == 2:
            # i.e. DataFrame
            return 1 - axis
        return axis
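
    # Example of the mapping above for a DataFrame (ndim == 2), where the
    # block manager's axes are reversed relative to the user-facing axes:
    #   _get_block_manager_axis(0) -> 1          # rows
    #   _get_block_manager_axis("columns") -> 0
    # For a Series the axis number is returned unchanged.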

    @final
    def _get_axis_resolvers(self, axis: str) -> dict[str, Series | MultiIndex]:
        # index or columns
        axis_index = getattr(self, axis)
        d = {}
        prefix = axis[0]

        for i, name in enumerate(axis_index.names):
            if name is not None:
                key = level = name
            else:
                # prefix with 'i' or 'c' depending on the input axis
                # e.g., you must do ilevel_0 for the 0th level of an unnamed
                # MultiIndex
                key = f"{prefix}level_{i}"
                level = i

            level_values = axis_index.get_level_values(level)
            s = level_values.to_series()
            s.index = axis_index
            d[key] = s

        # put the index/columns itself in the dict
        if isinstance(axis_index, MultiIndex):
            dindex = axis_index
        else:
            dindex = axis_index.to_series()

        d[axis] = dindex
        return d
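
    # As a sketch: for a frame whose index is an unnamed two-level MultiIndex,
    # the dict built above contains the keys "ilevel_0", "ilevel_1" and
    # "index" (or the corresponding "clevel_*"/"columns" entries when called
    # with axis="columns"), which is what ``query``/``eval`` resolve against.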

    @final
    def _get_index_resolvers(self) -> dict[Hashable, Series | MultiIndex]:
        from pandas.core.computation.parsing import clean_column_name

        d: dict[str, Series | MultiIndex] = {}
        for axis_name in self._AXIS_ORDERS:
            d.update(self._get_axis_resolvers(axis_name))

        return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)}

    @final
    def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]:
        """
        Return the special-character-free column resolvers of a DataFrame.

        Column names with special characters are 'cleaned up' so that they can
        be referred to by backtick quoting.
        Used in :meth:`DataFrame.eval`.
        """
        from pandas.core.computation.parsing import clean_column_name
        from pandas.core.series import Series

        if isinstance(self, ABCSeries):
            return {clean_column_name(self.name): self}

        return {
            clean_column_name(k): Series(
                v, copy=False, index=self.index, name=k, dtype=self.dtypes[k]
            ).__finalize__(self)
            for k, v in zip(self.columns, self._iter_column_arrays())
            if not isinstance(k, int)
        }

    @final
    @property
    def _info_axis(self) -> Index:
        return getattr(self, self._info_axis_name)

    def _is_view_after_cow_rules(self):
        # Only to be used in cases of chained assignment checks, this is a
        # simplified check that assumes that either the whole object is a view
        # or a copy
        if len(self._mgr.blocks) == 0:  # type: ignore[union-attr]
            return False
        return self._mgr.blocks[0].refs.has_reference()  # type: ignore[union-attr]

    @property
    def shape(self) -> tuple[int, ...]:
        """
        Return a tuple of axis dimensions.
        """
        return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS)

    @property
    def axes(self) -> list[Index]:
        """
        Return index label(s) of the internal NDFrame.
        """
        # we do it this way because if we have reversed axes, then
        # the block manager shows them reversed
        return [self._get_axis(a) for a in self._AXIS_ORDERS]

    @final
    @property
    def ndim(self) -> int:
        """
        Return an int representing the number of axes / array dimensions.

        Return 1 if Series. Otherwise return 2 if DataFrame.

        See Also
        --------
        ndarray.ndim : Number of array dimensions.

        Examples
        --------
        >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
        >>> s.ndim
        1

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.ndim
        2
        """
        return self._mgr.ndim

    @final
    @property
    def size(self) -> int:
        """
        Return an int representing the number of elements in this object.

        Return the number of rows if Series. Otherwise return the number of
        rows times number of columns if DataFrame.

        See Also
        --------
        ndarray.size : Number of elements in the array.

        Examples
        --------
        >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
        >>> s.size
        3

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.size
        4
        """

        return int(np.prod(self.shape))

    def set_axis(
        self,
        labels,
        *,
        axis: Axis = 0,
        copy: bool_t | None = None,
    ) -> Self:
        """
        Assign desired index to given axis.

        Indexes for%(extended_summary_sub)s row labels can be changed by assigning
        a list-like or Index.

        Parameters
        ----------
        labels : list-like, Index
            The values for the new index.

        axis : %(axes_single_arg)s, default 0
            The axis to update. The value 0 identifies the rows. For `Series`
            this parameter is unused and defaults to 0.

        copy : bool, default True
            Whether to make a copy of the underlying data.

            .. note::
                The `copy` keyword will change behavior in pandas 3.0.
                `Copy-on-Write
                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
                will be enabled by default, which means that all methods with a
                `copy` keyword will use a lazy copy mechanism to defer the copy and
                ignore the `copy` keyword. The `copy` keyword will be removed in a
                future version of pandas.

                You can already get the future behavior and improvements by
                enabling copy-on-write: ``pd.options.mode.copy_on_write = True``

        Returns
        -------
        %(klass)s
            An object of type %(klass)s.

        See Also
        --------
        %(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s.
        """
        return self._set_axis_nocheck(labels, axis, inplace=False, copy=copy)

    @final
    def _set_axis_nocheck(
        self, labels, axis: Axis, inplace: bool_t, copy: bool_t | None
    ):
        if inplace:
            setattr(self, self._get_axis_name(axis), labels)
        else:
            # With copy=False, we create a new object but don't copy the
            # underlying data.
            obj = self.copy(deep=copy and not using_copy_on_write())
            setattr(obj, obj._get_axis_name(axis), labels)
            return obj

    @final
    def _set_axis(self, axis: AxisInt, labels: AnyArrayLike | list) -> None:
        """
        This is called from the cython code when we set the `index` attribute
        directly, e.g. `series.index = [1, 2, 3]`.
        """
        labels = ensure_index(labels)
        self._mgr.set_axis(axis, labels)
        self._clear_item_cache()

    @final
    def swapaxes(self, axis1: Axis, axis2: Axis, copy: bool_t | None = None) -> Self:
        """
        Interchange axes, swapping the underlying values appropriately.

        .. deprecated:: 2.1.0
            ``swapaxes`` is deprecated and will be removed.
            Please use ``transpose`` instead.

        Returns
        -------
        same as input

        Examples
        --------
        Please see examples for :meth:`DataFrame.transpose`.
        """
        warnings.warn(
            # GH#51946
            f"'{type(self).__name__}.swapaxes' is deprecated and "
            "will be removed in a future version. "
            f"Please use '{type(self).__name__}.transpose' instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

        i = self._get_axis_number(axis1)
        j = self._get_axis_number(axis2)

        if i == j:
            return self.copy(deep=copy and not using_copy_on_write())

        mapping = {i: j, j: i}

        new_axes = [self._get_axis(mapping.get(k, k)) for k in range(self._AXIS_LEN)]
        new_values = self._values.swapaxes(i, j)  # type: ignore[union-attr]
        if self._mgr.is_single_block and isinstance(self._mgr, BlockManager):
            # This should only get hit in case of having a single block, otherwise a
            # copy is made, we don't have to set up references.
            new_mgr = ndarray_to_mgr(
                new_values,
                new_axes[0],
                new_axes[1],
                dtype=None,
                copy=False,
                typ="block",
            )
            assert isinstance(new_mgr, BlockManager)
            assert isinstance(self._mgr, BlockManager)
            new_mgr.blocks[0].refs = self._mgr.blocks[0].refs
            new_mgr.blocks[0].refs.add_reference(new_mgr.blocks[0])
            if not using_copy_on_write() and copy is not False:
                new_mgr = new_mgr.copy(deep=True)

            out = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
            return out.__finalize__(self, method="swapaxes")

        return self._constructor(
            new_values,
            *new_axes,
            # The no-copy case for CoW is handled above
            copy=False,
        ).__finalize__(self, method="swapaxes")

    @final
    @doc(klass=_shared_doc_kwargs["klass"])
    def droplevel(self, level: IndexLabel, axis: Axis = 0) -> Self:
        """
        Return {klass} with requested index / column level(s) removed.

        Parameters
        ----------
        level : int, str, or list-like
            If a string is given, it must be the name of a level.
            If list-like, elements must be names or positional indexes
            of levels.

        axis : {{0 or 'index', 1 or 'columns'}}, default 0
            Axis along which the level(s) is removed:

            * 0 or 'index': remove the level(s) from the row index.
            * 1 or 'columns': remove the level(s) from the column index.

            For `Series` this parameter is unused and defaults to 0.

        Returns
        -------
        {klass}
            {klass} with requested index / column level(s) removed.

        Examples
        --------
        >>> df = pd.DataFrame([
        ...     [1, 2, 3, 4],
        ...     [5, 6, 7, 8],
        ...     [9, 10, 11, 12]
        ... ]).set_index([0, 1]).rename_axis(['a', 'b'])

        >>> df.columns = pd.MultiIndex.from_tuples([
        ...     ('c', 'e'), ('d', 'f')
        ... ], names=['level_1', 'level_2'])

        >>> df
        level_1   c   d
        level_2   e   f
        a b
        1 2      3   4
        5 6      7   8
        9 10    11  12

        >>> df.droplevel('a')
        level_1   c   d
        level_2   e   f
        b
        2        3   4
        6        7   8
        10      11  12

        >>> df.droplevel('level_2', axis=1)
        level_1   c   d
        a b
        1 2      3   4
        5 6      7   8
        9 10    11  12
        """
        labels = self._get_axis(axis)
        new_labels = labels.droplevel(level)
        return self.set_axis(new_labels, axis=axis, copy=None)

    def pop(self, item: Hashable) -> Series | Any:
        result = self[item]
        del self[item]

        return result
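
    # Usage sketch (the user-facing docstring lives on the subclasses):
    #   col = df.pop("b")   # removes column "b" from ``df`` and returns it
    # A missing label raises KeyError, as with ``self[item]`` itself.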

    @final
    def squeeze(self, axis: Axis | None = None):
        """
        Squeeze 1-dimensional axis objects into scalars.

        Series or DataFrames with a single element are squeezed to a scalar.
        DataFrames with a single column or a single row are squeezed to a
        Series. Otherwise the object is unchanged.

        This method is most useful when you don't know if your
        object is a Series or DataFrame, but you do know it has just a single
        column. In that case you can safely call `squeeze` to ensure you have a
        Series.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns', None}, default None
            A specific axis to squeeze. By default, all length-1 axes are
            squeezed. For `Series` this parameter is unused and defaults to `None`.

        Returns
        -------
        DataFrame, Series, or scalar
            The projection after squeezing `axis` or all the axes.

        See Also
        --------
        Series.iloc : Integer-location based indexing for selecting scalars.
        DataFrame.iloc : Integer-location based indexing for selecting Series.
        Series.to_frame : Inverse of DataFrame.squeeze for a
            single-column DataFrame.

        Examples
        --------
        >>> primes = pd.Series([2, 3, 5, 7])

        Slicing might produce a Series with a single value:

        >>> even_primes = primes[primes % 2 == 0]
        >>> even_primes
        0    2
        dtype: int64

        >>> even_primes.squeeze()
        2

        Squeezing objects with more than one value in every axis does nothing:

        >>> odd_primes = primes[primes % 2 == 1]
        >>> odd_primes
        1    3
        2    5
        3    7
        dtype: int64

        >>> odd_primes.squeeze()
        1    3
        2    5
        3    7
        dtype: int64

        Squeezing is even more effective when used with DataFrames.

        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
        >>> df
           a  b
        0  1  2
        1  3  4

        Slicing a single column will produce a DataFrame with the columns
        having only one value:

        >>> df_a = df[['a']]
        >>> df_a
           a
        0  1
        1  3

        So the columns can be squeezed down, resulting in a Series:

        >>> df_a.squeeze('columns')
        0    1
        1    3
        Name: a, dtype: int64

        Slicing a single row from a single column will produce a single
        scalar DataFrame:

        >>> df_0a = df.loc[df.index < 1, ['a']]
        >>> df_0a
           a
        0  1

        Squeezing the rows produces a single scalar Series:

        >>> df_0a.squeeze('rows')
        a    1
        Name: 0, dtype: int64

        Squeezing all axes will project directly into a scalar:

        >>> df_0a.squeeze()
        1
        """
        axes = range(self._AXIS_LEN) if axis is None else (self._get_axis_number(axis),)
        result = self.iloc[
            tuple(
                0 if i in axes and len(a) == 1 else slice(None)
                for i, a in enumerate(self.axes)
            )
        ]
        if isinstance(result, NDFrame):
            result = result.__finalize__(self, method="squeeze")
        return result

    # ----------------------------------------------------------------------
    # Rename

    @final
    def _rename(
        self,
        mapper: Renamer | None = None,
        *,
        index: Renamer | None = None,
        columns: Renamer | None = None,
        axis: Axis | None = None,
        copy: bool_t | None = None,
        inplace: bool_t = False,
        level: Level | None = None,
        errors: str = "ignore",
    ) -> Self | None:
        # called by Series.rename and DataFrame.rename

        if mapper is None and index is None and columns is None:
            raise TypeError("must pass an index to rename")

        if index is not None or columns is not None:
            if axis is not None:
                raise TypeError(
                    "Cannot specify both 'axis' and any of 'index' or 'columns'"
                )
            if mapper is not None:
                raise TypeError(
                    "Cannot specify both 'mapper' and any of 'index' or 'columns'"
                )
        else:
            # use the mapper argument
            if axis and self._get_axis_number(axis) == 1:
                columns = mapper
            else:
                index = mapper

        self._check_inplace_and_allows_duplicate_labels(inplace)
        result = self if inplace else self.copy(deep=copy and not using_copy_on_write())

        for axis_no, replacements in enumerate((index, columns)):
            if replacements is None:
                continue

            ax = self._get_axis(axis_no)
            f = common.get_rename_function(replacements)

            if level is not None:
                level = ax._get_level_number(level)

            # GH 13473
            if not callable(replacements):
                if ax._is_multi and level is not None:
                    indexer = ax.get_level_values(level).get_indexer_for(replacements)
                else:
                    indexer = ax.get_indexer_for(replacements)

                if errors == "raise" and len(indexer[indexer == -1]):
                    missing_labels = [
                        label
                        for index, label in enumerate(replacements)
                        if indexer[index] == -1
                    ]
                    raise KeyError(f"{missing_labels} not found in axis")

            new_index = ax._transform_index(f, level=level)
            result._set_axis_nocheck(new_index, axis=axis_no, inplace=True, copy=False)
            result._clear_item_cache()

        if inplace:
            self._update_inplace(result)
            return None
        else:
            return result.__finalize__(self, method="rename")

    @overload
    def rename_axis(
        self,
        mapper: IndexLabel | lib.NoDefault = ...,
        *,
        index=...,
        columns=...,
        axis: Axis = ...,
        copy: bool_t | None = ...,
        inplace: Literal[False] = ...,
    ) -> Self:
        ...

    @overload
    def rename_axis(
        self,
        mapper: IndexLabel | lib.NoDefault = ...,
        *,
        index=...,
        columns=...,
        axis: Axis = ...,
        copy: bool_t | None = ...,
        inplace: Literal[True],
    ) -> None:
        ...

    @overload
    def rename_axis(
        self,
        mapper: IndexLabel | lib.NoDefault = ...,
        *,
        index=...,
        columns=...,
        axis: Axis = ...,
        copy: bool_t | None = ...,
        inplace: bool_t = ...,
    ) -> Self | None:
        ...

    def rename_axis(
        self,
        mapper: IndexLabel | lib.NoDefault = lib.no_default,
        *,
        index=lib.no_default,
        columns=lib.no_default,
        axis: Axis = 0,
        copy: bool_t | None = None,
        inplace: bool_t = False,
    ) -> Self | None:
        """
        Set the name of the axis for the index or columns.

        Parameters
        ----------
        mapper : scalar, list-like, optional
            Value to set the axis name attribute.
        index, columns : scalar, list-like, dict-like or function, optional
            A scalar, list-like, dict-like or function transformations to
            apply to that axis' values.
            Note that the ``columns`` parameter is not allowed if the
            object is a Series. This parameter only applies to DataFrame
            objects.

            Use either ``mapper`` and ``axis`` to
            specify the axis to target with ``mapper``, or ``index``
            and/or ``columns``.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to rename. For `Series` this parameter is unused and defaults to 0.
        copy : bool, default None
            Also copy underlying data.

            .. note::
                The `copy` keyword will change behavior in pandas 3.0.
                `Copy-on-Write
                <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
                will be enabled by default, which means that all methods with a
                `copy` keyword will use a lazy copy mechanism to defer the copy and
                ignore the `copy` keyword. The `copy` keyword will be removed in a
                future version of pandas.

                You can already get the future behavior and improvements by
                enabling copy-on-write: ``pd.options.mode.copy_on_write = True``
        inplace : bool, default False
            Modifies the object directly, instead of creating a new Series
            or DataFrame.

        Returns
        -------
        Series, DataFrame, or None
            The same type as the caller or None if ``inplace=True``.

        See Also
        --------
        Series.rename : Alter Series index labels or name.
        DataFrame.rename : Alter DataFrame index labels or name.
        Index.rename : Set new names on index.

        Notes
        -----
        ``DataFrame.rename_axis`` supports two calling conventions

        * ``(index=index_mapper, columns=columns_mapper, ...)``
        * ``(mapper, axis={'index', 'columns'}, ...)``

        The first calling convention will only modify the names of
        the index and/or the names of the Index object that is the columns.
        In this case, the parameter ``copy`` is ignored.

        The second calling convention will modify the names of the
        corresponding index if mapper is a list or a scalar.
        However, if mapper is dict-like or a function, it will use the
        deprecated behavior of modifying the axis *labels*.

        We *highly* recommend using keyword arguments to clarify your
        intent.

        Examples
        --------
        **Series**

        >>> s = pd.Series(["dog", "cat", "monkey"])
        >>> s
        0       dog
        1       cat
        2    monkey
        dtype: object
        >>> s.rename_axis("animal")
        animal
        0    dog
        1    cat
        2    monkey
        dtype: object

        **DataFrame**

        >>> df = pd.DataFrame({"num_legs": [4, 4, 2],
        ...                    "num_arms": [0, 0, 2]},
        ...                   ["dog", "cat", "monkey"])
        >>> df
                num_legs  num_arms
        dog            4         0
        cat            4         0
        monkey         2         2
        >>> df = df.rename_axis("animal")
        >>> df
                num_legs  num_arms
        animal
        dog            4         0
        cat            4         0
        monkey         2         2
        >>> df = df.rename_axis("limbs", axis="columns")
        >>> df
        limbs   num_legs  num_arms
        animal
        dog            4         0
        cat            4         0
        monkey         2         2

        **MultiIndex**

        >>> df.index = pd.MultiIndex.from_product([['mammal'],
        ...                                        ['dog', 'cat', 'monkey']],
        ...                                       names=['type', 'name'])
        >>> df
        limbs          num_legs  num_arms
        type   name
        mammal dog            4         0
               cat            4         0
               monkey         2         2

        >>> df.rename_axis(index={'type': 'class'})
        limbs          num_legs  num_arms
        class  name
        mammal dog            4         0
               cat            4         0
               monkey         2         2

        >>> df.rename_axis(columns=str.upper)
        LIMBS          num_legs  num_arms
        type   name
        mammal dog            4         0
               cat            4         0
               monkey         2         2
        """
        axes = {"index": index, "columns": columns}

        if axis is not None:
            axis = self._get_axis_number(axis)

        inplace = validate_bool_kwarg(inplace, "inplace")

        if copy and using_copy_on_write():
            copy = False

        if mapper is not lib.no_default:
            # Use v0.23 behavior if a scalar or list
            non_mapper = is_scalar(mapper) or (
                is_list_like(mapper) and not is_dict_like(mapper)
            )
            if non_mapper:
                return self._set_axis_name(
                    mapper, axis=axis, inplace=inplace, copy=copy
                )
            else:
                raise ValueError("Use `.rename` to alter labels with a mapper.")
        else:
            # Use new behavior. Means that index and/or columns
            # is specified
            result = self if inplace else self.copy(deep=copy)

            for axis in range(self._AXIS_LEN):
                v = axes.get(self._get_axis_name(axis))
                if v is lib.no_default:
                    continue
                non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v))
                if non_mapper:
                    newnames = v
                else:
                    f = common.get_rename_function(v)
                    curnames = self._get_axis(axis).names
                    newnames = [f(name) for name in curnames]
                result._set_axis_name(newnames, axis=axis, inplace=True, copy=copy)
            if not inplace:
                return result
            return None

    @final
    def _set_axis_name(
        self, name, axis: Axis = 0, inplace: bool_t = False, copy: bool_t | None = True
    ):
        """
        Set the name(s) of the axis.

        Parameters
        ----------
        name : str or list of str
            Name(s) to set.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to set the label. The value 0 or 'index' specifies index,
            and the value 1 or 'columns' specifies columns.
        inplace : bool, default False
            If `True`, do operation inplace and return None.
        copy : bool, default True
            Whether to make a copy of the result.

        Returns
        -------
        Series, DataFrame, or None
            The same type as the caller or `None` if `inplace` is `True`.

        See Also
        --------
        DataFrame.rename : Alter the axis labels of :class:`DataFrame`.
        Series.rename : Alter the index labels or set the index name
            of :class:`Series`.
        Index.rename : Set the name of :class:`Index` or :class:`MultiIndex`.

        Examples
        --------
        >>> df = pd.DataFrame({"num_legs": [4, 4, 2]},
        ...                   ["dog", "cat", "monkey"])
        >>> df
                num_legs
        dog            4
        cat            4
        monkey         2
        >>> df._set_axis_name("animal")
                num_legs
        animal
        dog            4
        cat            4
        monkey         2
        >>> df.index = pd.MultiIndex.from_product(
        ...                [["mammal"], ['dog', 'cat', 'monkey']])
        >>> df._set_axis_name(["type", "name"])
                       num_legs
        type   name
        mammal dog            4
               cat            4
               monkey         2
        """
        axis = self._get_axis_number(axis)
        idx = self._get_axis(axis).set_names(name)

        inplace = validate_bool_kwarg(inplace, "inplace")
        renamed = self if inplace else self.copy(deep=copy)
        if axis == 0:
            renamed.index = idx
        else:
            renamed.columns = idx

        if not inplace:
            return renamed

    # ----------------------------------------------------------------------
    # Comparison Methods

    @final
    def _indexed_same(self, other) -> bool_t:
        return all(
            self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS
        )

    @final
    def equals(self, other: object) -> bool_t:
        """
        Test whether two objects contain the same elements.

        This function allows two Series or DataFrames to be compared against
        each other to see if they have the same shape and elements. NaNs in
        the same location are considered equal.

        The row/column indexes do not need to have the same type, as long
        as the values are considered equal. Corresponding columns and
        index must be of the same dtype.

        Parameters
        ----------
        other : Series or DataFrame
            The other Series or DataFrame to be compared with the first.

        Returns
        -------
        bool
            True if all elements are the same in both objects, False
            otherwise.

        See Also
        --------
        Series.eq : Compare two Series objects of the same length
            and return a Series where each element is True if the element
            in each Series is equal, False otherwise.
        DataFrame.eq : Compare two DataFrame objects of the same shape and
            return a DataFrame where each element is True if the respective
            element in each DataFrame is equal, False otherwise.
        testing.assert_series_equal : Raises an AssertionError if left and
            right are not equal. Provides an easy interface to ignore
            inequality in dtypes, indexes and precision among others.
        testing.assert_frame_equal : Like assert_series_equal, but targets
            DataFrames.
        numpy.array_equal : Return True if two arrays have the same shape
            and elements, False otherwise.

        Examples
        --------
        >>> df = pd.DataFrame({1: [10], 2: [20]})
        >>> df
            1   2
        0  10  20

        DataFrames df and exactly_equal have the same types and values for
        their elements and column labels, which will return True.

        >>> exactly_equal = pd.DataFrame({1: [10], 2: [20]})
        >>> exactly_equal
            1   2
        0  10  20
        >>> df.equals(exactly_equal)
        True

        DataFrames df and different_column_type have the same element
        types and values, but have different types for the column labels,
        which will still return True.

        >>> different_column_type = pd.DataFrame({1.0: [10], 2.0: [20]})
        >>> different_column_type
           1.0  2.0
        0   10   20
        >>> df.equals(different_column_type)
        True

        DataFrames df and different_data_type have different types for the
        same values for their elements, and will return False even though
        their column labels are the same values and types.

        >>> different_data_type = pd.DataFrame({1: [10.0], 2: [20.0]})
        >>> different_data_type
              1     2
        0  10.0  20.0
        >>> df.equals(different_data_type)
        False
        """
        if not (isinstance(other, type(self)) or isinstance(self, type(other))):
            return False
        other = cast(NDFrame, other)
        return self._mgr.equals(other._mgr)

    # -------------------------------------------------------------------------
    # Unary Methods

    @final
    def __neg__(self) -> Self:
        def blk_func(values: ArrayLike):
            if is_bool_dtype(values.dtype):
                # error: Argument 1 to "inv" has incompatible type "Union
                # [ExtensionArray, ndarray[Any, Any]]"; expected
                # "_SupportsInversion[ndarray[Any, dtype[bool_]]]"
                return operator.inv(values)  # type: ignore[arg-type]
            else:
                # error: Argument 1 to "neg" has incompatible type "Union
                # [ExtensionArray, ndarray[Any, Any]]"; expected
                # "_SupportsNeg[ndarray[Any, dtype[Any]]]"
                return operator.neg(values)  # type: ignore[arg-type]

        new_data = self._mgr.apply(blk_func)
        res = self._constructor_from_mgr(new_data, axes=new_data.axes)
        return res.__finalize__(self, method="__neg__")

    @final
    def __pos__(self) -> Self:
        def blk_func(values: ArrayLike):
            if is_bool_dtype(values.dtype):
                return values.copy()
            else:
                # error: Argument 1 to "pos" has incompatible type "Union
                # [ExtensionArray, ndarray[Any, Any]]"; expected
                # "_SupportsPos[ndarray[Any, dtype[Any]]]"
                return operator.pos(values)  # type: ignore[arg-type]

        new_data = self._mgr.apply(blk_func)
        res = self._constructor_from_mgr(new_data, axes=new_data.axes)
        return res.__finalize__(self, method="__pos__")

    @final
    def __invert__(self) -> Self:
        if not self.size:
            # inv fails with 0 len
            return self.copy(deep=False)

        new_data = self._mgr.apply(operator.invert)
        res = self._constructor_from_mgr(new_data, axes=new_data.axes)
        return res.__finalize__(self, method="__invert__")

    @final
    def __nonzero__(self) -> NoReturn:
        raise ValueError(
            f"The truth value of a {type(self).__name__} is ambiguous. "
            "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
        )

    __bool__ = __nonzero__
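
    # Consequently, truth-testing any Series/DataFrame raises, e.g.:
    #   if pd.Series([True]):   # ValueError: truth value ... is ambiguous
    #       ...
    # Use ``.any()``, ``.all()``, ``.empty`` or ``.item()`` instead.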

    @final
    def bool(self) -> bool_t:
        """
        Return the bool of a single element Series or DataFrame.

        .. deprecated:: 2.1.0

           bool is deprecated and will be removed in a future version of pandas.
           For ``Series`` use ``pandas.Series.item``.

        This must be a boolean scalar value, either True or False. It will raise a
        ValueError if the Series or DataFrame does not have exactly 1 element, or if
        that element is not boolean (integer values 0 and 1 will also raise an
        exception).

        Returns
        -------
        bool
            The value in the Series or DataFrame.

        See Also
        --------
        Series.astype : Change the data type of a Series, including to boolean.
        DataFrame.astype : Change the data type of a DataFrame, including to boolean.
        numpy.bool_ : NumPy boolean data type, used by pandas for boolean values.

        Examples
        --------
        The method will only work for single element objects with a boolean value:

        >>> pd.Series([True]).bool()  # doctest: +SKIP
        True
        >>> pd.Series([False]).bool()  # doctest: +SKIP
        False

        >>> pd.DataFrame({'col': [True]}).bool()  # doctest: +SKIP
        True
        >>> pd.DataFrame({'col': [False]}).bool()  # doctest: +SKIP
        False

        This is an alternative method and will only work
        for single element objects with a boolean value:

        >>> pd.Series([True]).item()  # doctest: +SKIP
        True
        >>> pd.Series([False]).item()  # doctest: +SKIP
        False
        """

        warnings.warn(
            f"{type(self).__name__}.bool is now deprecated and will be removed "
            "in a future version of pandas",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        v = self.squeeze()
        if isinstance(v, (bool, np.bool_)):
            return bool(v)
        elif is_scalar(v):
            raise ValueError(
                "bool cannot act on a non-boolean single element "
                f"{type(self).__name__}"
            )

        self.__nonzero__()
        # for mypy (__nonzero__ raises)
        return True

    @final
    def abs(self) -> Self:
        """
        Return a Series/DataFrame with absolute numeric value of each element.

        This function only applies to elements that are all numeric.

        Returns
        -------
        abs
            Series/DataFrame containing the absolute value of each element.

        See Also
        --------
        numpy.absolute : Calculate the absolute value element-wise.

        Notes
        -----
        For ``complex`` inputs, ``1.2 + 1j``, the absolute value is
        :math:`\\sqrt{ a^2 + b^2 }`.

        Examples
        --------
        Absolute numeric values in a Series.

        >>> s = pd.Series([-1.10, 2, -3.33, 4])
        >>> s.abs()
        0    1.10
        1    2.00
        2    3.33
        3    4.00
        dtype: float64

        Absolute numeric values in a Series with complex numbers.

        >>> s = pd.Series([1.2 + 1j])
        >>> s.abs()
        0    1.56205
        dtype: float64

        Absolute numeric values in a Series with a Timedelta element.

        >>> s = pd.Series([pd.Timedelta('1 days')])
        >>> s.abs()
        0   1 days
        dtype: timedelta64[ns]

        Select rows with data closest to certain value using argsort (from
        `StackOverflow <https://stackoverflow.com/a/17758115>`__).

        >>> df = pd.DataFrame({
        ...     'a': [4, 5, 6, 7],
        ...     'b': [10, 20, 30, 40],
        ...     'c': [100, 50, -30, -50]
        ... })
        >>> df
           a   b    c
        0  4  10  100
        1  5  20   50
        2  6  30  -30
        3  7  40  -50
        >>> df.loc[(df.c - 43).abs().argsort()]
           a   b    c
        1  5  20   50
        0  4  10  100
        2  6  30  -30
        3  7  40  -50
        """
        res_mgr = self._mgr.apply(np.abs)
        return self._constructor_from_mgr(res_mgr, axes=res_mgr.axes).__finalize__(
            self, name="abs"
        )

    @final
    def __abs__(self) -> Self:
        return self.abs()

    @final
    def __round__(self, decimals: int = 0) -> Self:
        return self.round(decimals).__finalize__(self, method="__round__")

    # -------------------------------------------------------------------------
    # Label or Level Combination Helpers
    #
    # A collection of helper methods for DataFrame/Series operations that
    # accept a combination of column/index labels and levels. All such
    # operations should utilize/extend these methods when possible so that we
    # have consistent precedence and validation logic throughout the library.

    @final
    def _is_level_reference(self, key: Level, axis: Axis = 0) -> bool_t:
        """
        Test whether a key is a level reference for a given axis.

        To be considered a level reference, `key` must be a string that:
          - (axis=0): Matches the name of an index level and does NOT match
            a column label.
          - (axis=1): Matches the name of a column level and does NOT match
            an index label.

        Parameters
        ----------
        key : Hashable
            Potential level name for the given axis
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        is_level : bool
        """
        axis_int = self._get_axis_number(axis)

        return (
            key is not None
            and is_hashable(key)
            and key in self.axes[axis_int].names
            and not self._is_label_reference(key, axis=axis_int)
        )

    @final
    def _is_label_reference(self, key: Level, axis: Axis = 0) -> bool_t:
        """
        Test whether a key is a label reference for a given axis.

        To be considered a label reference, `key` must be a string that:
          - (axis=0): Matches a column label
          - (axis=1): Matches an index label

        Parameters
        ----------
        key : Hashable
            Potential label name, i.e. Index entry.
        axis : int, default 0
            Axis perpendicular to the axis that labels are associated with
            (0 means search for column labels, 1 means search for index labels)

        Returns
        -------
        is_label : bool
        """
        axis_int = self._get_axis_number(axis)
        other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis_int)

        return (
            key is not None
            and is_hashable(key)
            and any(key in self.axes[ax] for ax in other_axes)
        )

    @final
    def _is_label_or_level_reference(self, key: Level, axis: AxisInt = 0) -> bool_t:
        """
        Test whether a key is a label or level reference for a given axis.

        To be considered either a label or a level reference, `key` must be a
        string that:
          - (axis=0): Matches a column label or an index level
          - (axis=1): Matches an index label or a column level

        Parameters
        ----------
        key : Hashable
            Potential label or level name
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        bool
        """
        return self._is_level_reference(key, axis=axis) or self._is_label_reference(
            key, axis=axis
        )
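
    # Sketch of the label/level distinction (hypothetical frame): given
    #   df = pd.DataFrame({"col": [1]}, index=pd.Index([10], name="lvl"))
    # ``df._is_level_reference("lvl")`` and ``df._is_label_reference("col")``
    # are both True, and ``_is_label_or_level_reference`` accepts either key.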

    @final
    def _check_label_or_level_ambiguity(self, key: Level, axis: Axis = 0) -> None:
        """
        Check whether `key` is ambiguous.

        By ambiguous, we mean that it matches both a level of the input
        `axis` and a label of the other axis.

        Parameters
        ----------
        key : Hashable
            Label or level name.
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns).

        Raises
        ------
        ValueError
            If `key` is ambiguous.
        """

        axis_int = self._get_axis_number(axis)
        other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis_int)

        if (
            key is not None
            and is_hashable(key)
            and key in self.axes[axis_int].names
            and any(key in self.axes[ax] for ax in other_axes)
        ):
            # Build an informative and grammatical warning
            level_article, level_type = (
                ("an", "index") if axis_int == 0 else ("a", "column")
            )

            label_article, label_type = (
                ("a", "column") if axis_int == 0 else ("an", "index")
            )

            msg = (
                f"'{key}' is both {level_article} {level_type} level and "
                f"{label_article} {label_type} label, which is ambiguous."
            )
            raise ValueError(msg)

    @final
    def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike:
        """
        Return a 1-D array of values associated with `key`, a label or level
        from the given `axis`.

        Retrieval logic:
          - (axis=0): Return column values if `key` matches a column label.
            Otherwise return index level values if `key` matches an index
            level.
          - (axis=1): Return row values if `key` matches an index label.
            Otherwise return column level values if `key` matches a column
            level.

        Parameters
        ----------
        key : Hashable
            Label or level name.
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        np.ndarray or ExtensionArray

        Raises
        ------
        KeyError
            if `key` matches neither a label nor a level
        ValueError
            if `key` matches multiple labels
        """
        axis = self._get_axis_number(axis)
        other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis]

        if self._is_label_reference(key, axis=axis):
            self._check_label_or_level_ambiguity(key, axis=axis)
            values = self.xs(key, axis=other_axes[0])._values
        elif self._is_level_reference(key, axis=axis):
            values = self.axes[axis].get_level_values(key)._values
        else:
            raise KeyError(key)

        # Check for duplicates
        if values.ndim > 1:
            if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex):
                multi_message = (
                    "\n"
                    "For a multi-index, the label must be a "
                    "tuple with elements corresponding to each level."
                )
            else:
                multi_message = ""

            label_axis_name = "column" if axis == 0 else "index"
            raise ValueError(
                f"The {label_axis_name} label '{key}' is not unique.{multi_message}"
            )

        return values
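
    # Continuing the sketch above: ``df._get_label_or_level_values("col")``
    # returns the column's values, while ``"lvl"`` returns the index level
    # values; a key matching neither raises KeyError.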

    @final
    def _drop_labels_or_levels(self, keys, axis: AxisInt = 0):
        """
        Drop labels and/or levels for the given `axis`.

        For each key in `keys`:
          - (axis=0): If key matches a column label then drop the column.
            Otherwise if key matches an index level then drop the level.
          - (axis=1): If key matches an index label then drop the row.
            Otherwise if key matches a column level then drop the level.

        Parameters
        ----------
        keys : str or list of str
            labels or levels to drop
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        dropped : DataFrame

        Raises
        ------
        ValueError
            if any `keys` match neither a label nor a level
        """
        axis = self._get_axis_number(axis)

        # Validate keys
        keys = common.maybe_make_list(keys)
        invalid_keys = [
            k for k in keys if not self._is_label_or_level_reference(k, axis=axis)
        ]

        if invalid_keys:
            raise ValueError(
                "The following keys are not valid labels or "
                f"levels for axis {axis}: {invalid_keys}"
            )

        # Compute levels and labels to drop
        levels_to_drop = [k for k in keys if self._is_level_reference(k, axis=axis)]

        labels_to_drop = [k for k in keys if not self._is_level_reference(k, axis=axis)]

        # Perform copy upfront and then use inplace operations below.
        # This ensures that we always perform exactly one copy.
        # ``copy`` and/or ``inplace`` options could be added in the future.
        dropped = self.copy(deep=False)

        if axis == 0:
            # Handle dropping index levels
            if levels_to_drop:
                dropped.reset_index(levels_to_drop, drop=True, inplace=True)

            # Handle dropping column labels
            if labels_to_drop:
                dropped.drop(labels_to_drop, axis=1, inplace=True)
        else:
            # Handle dropping column levels
            if levels_to_drop:
                if isinstance(dropped.columns, MultiIndex):
                    # Drop the specified levels from the MultiIndex
                    dropped.columns = dropped.columns.droplevel(levels_to_drop)
                else:
                    # Drop the last level of Index by replacing with
                    # a RangeIndex
                    dropped.columns = RangeIndex(dropped.columns.size)

            # Handle dropping index labels
            if labels_to_drop:
                dropped.drop(labels_to_drop, axis=0, inplace=True)

        return dropped
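
    # Rough usage sketch: with an index level "lvl" and a column "col",
    # ``df._drop_labels_or_levels(["lvl", "col"])`` drops the level (via
    # ``reset_index(drop=True)``) and the column on a single shallow copy.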
2006
2007 # ----------------------------------------------------------------------
2008 # Iteration
2009
2010 # https://github.com/python/typeshed/issues/2148#issuecomment-520783318
2011 # Incompatible types in assignment (expression has type "None", base class
2012 # "object" defined the type as "Callable[[object], int]")
2013 __hash__: ClassVar[None] # type: ignore[assignment]
2014
2015 def __iter__(self) -> Iterator:
2016 """
2017 Iterate over info axis.
2018
2019 Returns
2020 -------
2021 iterator
2022 Info axis as iterator.
2023
2024 Examples
2025 --------
2026 >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
2027 >>> for x in df:
2028 ... print(x)
2029 A
2030 B
2031 """
2032 return iter(self._info_axis)
2033
    # TODO: give a better explanation of the "info axis" here
2035 def keys(self) -> Index:
2036 """
2037 Get the 'info axis' (see Indexing for more).
2038
2039 This is index for Series, columns for DataFrame.
2040
2041 Returns
2042 -------
2043 Index
2044 Info axis.
2045
2046 Examples
2047 --------
2048 >>> d = pd.DataFrame(data={'A': [1, 2, 3], 'B': [0, 4, 8]},
2049 ... index=['a', 'b', 'c'])
2050 >>> d
2051 A B
2052 a 1 0
2053 b 2 4
2054 c 3 8
2055 >>> d.keys()
2056 Index(['A', 'B'], dtype='object')
2057 """
2058 return self._info_axis
2059
2060 def items(self):
2061 """
        Iterate over (label, values) pairs on the info axis.
2063
2064 This is index for Series and columns for DataFrame.
2065
2066 Returns
2067 -------
2068 Generator
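
        Examples
        --------
        Iterating over a DataFrame yields each column label together with the
        column itself as a Series:

        >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
        >>> for label, values in df.items():
        ...     print(label)
        A
        B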
2069 """
2070 for h in self._info_axis:
2071 yield h, self[h]
2072
2073 def __len__(self) -> int:
2074 """Returns length of info axis"""
2075 return len(self._info_axis)
2076
2077 @final
2078 def __contains__(self, key) -> bool_t:
2079 """True if the key is in the info axis"""
2080 return key in self._info_axis
2081
2082 @property
2083 def empty(self) -> bool_t:
2084 """
2085 Indicator whether Series/DataFrame is empty.
2086
2087 True if Series/DataFrame is entirely empty (no items), meaning any of the
2088 axes are of length 0.
2089
2090 Returns
2091 -------
2092 bool
            If Series/DataFrame is empty, return True; if not, return False.
2094
2095 See Also
2096 --------
2097 Series.dropna : Return series without null values.
2098 DataFrame.dropna : Return DataFrame with labels on given axis omitted
2099 where (all or any) data are missing.
2100
2101 Notes
2102 -----
2103 If Series/DataFrame contains only NaNs, it is still not considered empty. See
2104 the example below.
2105
2106 Examples
2107 --------
2108 An example of an actual empty DataFrame. Notice the index is empty:
2109
2110 >>> df_empty = pd.DataFrame({'A' : []})
2111 >>> df_empty
2112 Empty DataFrame
2113 Columns: [A]
2114 Index: []
2115 >>> df_empty.empty
2116 True
2117
2118 If we only have NaNs in our DataFrame, it is not considered empty! We
2119 will need to drop the NaNs to make the DataFrame empty:
2120
2121 >>> df = pd.DataFrame({'A' : [np.nan]})
2122 >>> df
2123 A
2124 0 NaN
2125 >>> df.empty
2126 False
2127 >>> df.dropna().empty
2128 True
2129
2130 >>> ser_empty = pd.Series({'A' : []})
2131 >>> ser_empty
2132 A []
2133 dtype: object
2134 >>> ser_empty.empty
2135 False
2136 >>> ser_empty = pd.Series()
2137 >>> ser_empty.empty
2138 True
2139 """
2140 return any(len(self._get_axis(a)) == 0 for a in self._AXIS_ORDERS)
2141
2142 # ----------------------------------------------------------------------
2143 # Array Interface
2144
2145 # This is also set in IndexOpsMixin
2146 # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented
2147 __array_priority__: int = 1000
2148
2149 def __array__(
2150 self, dtype: npt.DTypeLike | None = None, copy: bool_t | None = None
2151 ) -> np.ndarray:
2152 values = self._values
2153 arr = np.asarray(values, dtype=dtype)
2154 if (
2155 astype_is_view(values.dtype, arr.dtype)
2156 and using_copy_on_write()
2157 and self._mgr.is_single_block
2158 ):
2159 # Check if both conversions can be done without a copy
2160 if astype_is_view(self.dtypes.iloc[0], values.dtype) and astype_is_view(
2161 values.dtype, arr.dtype
2162 ):
2163 arr = arr.view()
2164 arr.flags.writeable = False
2165 return arr
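
    # Illustrative sketch (not part of the API surface): under copy-on-write,
    # ``np.asarray`` on a single-block frame needing no dtype conversion
    # returns a read-only view, e.g.
    #
    #   df = pd.DataFrame({"a": [1, 2]})
    #   arr = np.asarray(df)
    #   arr.flags.writeable  # False when copy-on-write is enabled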
2166
2167 @final
2168 def __array_ufunc__(
2169 self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any
2170 ):
2171 return arraylike.array_ufunc(self, ufunc, method, *inputs, **kwargs)
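
    # Illustrative note: NumPy ufuncs such as ``np.exp(ser)`` or
    # ``np.add(df, 1)`` dispatch through this hook, so results keep their
    # pandas type and alignment semantics.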
2172
2173 # ----------------------------------------------------------------------
2174 # Picklability
2175
2176 @final
2177 def __getstate__(self) -> dict[str, Any]:
2178 meta = {k: getattr(self, k, None) for k in self._metadata}
2179 return {
2180 "_mgr": self._mgr,
2181 "_typ": self._typ,
2182 "_metadata": self._metadata,
2183 "attrs": self.attrs,
2184 "_flags": {k: self.flags[k] for k in self.flags._keys},
2185 **meta,
2186 }
2187
2188 @final
2189 def __setstate__(self, state) -> None:
2190 if isinstance(state, BlockManager):
2191 self._mgr = state
2192 elif isinstance(state, dict):
2193 if "_data" in state and "_mgr" not in state:
2194 # compat for older pickles
2195 state["_mgr"] = state.pop("_data")
2196 typ = state.get("_typ")
2197 if typ is not None:
2198 attrs = state.get("_attrs", {})
2199 if attrs is None: # should not happen, but better be on the safe side
2200 attrs = {}
2201 object.__setattr__(self, "_attrs", attrs)
2202 flags = state.get("_flags", {"allows_duplicate_labels": True})
2203 object.__setattr__(self, "_flags", Flags(self, **flags))
2204
            # set in the order of internal names first, to avoid
            # definitional recursion (e.g. ``fill_value`` may need
            # ``_mgr`` to already be defined)
2209 meta = set(self._internal_names + self._metadata)
2210 for k in list(meta):
2211 if k in state and k != "_flags":
2212 v = state[k]
2213 object.__setattr__(self, k, v)
2214
2215 for k, v in state.items():
2216 if k not in meta:
2217 object.__setattr__(self, k, v)
2218
2219 else:
2220 raise NotImplementedError("Pre-0.12 pickles are no longer supported")
2221 elif len(state) == 2:
2222 raise NotImplementedError("Pre-0.12 pickles are no longer supported")
2223
2224 self._item_cache: dict[Hashable, Series] = {}
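
    # Illustrative round trip: ``pickle.loads(pickle.dumps(obj))`` exercises
    # ``__getstate__``/``__setstate__`` above, restoring the manager along
    # with ``attrs``, ``flags`` and any ``_metadata`` fields.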
2225
2226 # ----------------------------------------------------------------------
2227 # Rendering Methods
2228
2229 def __repr__(self) -> str:
2230 # string representation based upon iterating over self
2231 # (since, by definition, `PandasContainers` are iterable)
2232 prepr = f"[{','.join(map(pprint_thing, self))}]"
2233 return f"{type(self).__name__}({prepr})"
2234
2235 @final
2236 def _repr_latex_(self):
2237 """
2238 Returns a LaTeX representation for a particular object.
2239 Mainly for use with nbconvert (jupyter notebook conversion to pdf).
2240 """
2241 if config.get_option("styler.render.repr") == "latex":
2242 return self.to_latex()
2243 else:
2244 return None
2245
2246 @final
2247 def _repr_data_resource_(self):
2248 """
2249 Not a real Jupyter special repr method, but we use the same
2250 naming convention.
2251 """
2252 if config.get_option("display.html.table_schema"):
2253 data = self.head(config.get_option("display.max_rows"))
2254
2255 as_json = data.to_json(orient="table")
2256 as_json = cast(str, as_json)
2257 return loads(as_json, object_pairs_hook=collections.OrderedDict)
2258
2259 # ----------------------------------------------------------------------
2260 # I/O Methods
2261
2262 @final
2263 @deprecate_nonkeyword_arguments(
2264 version="3.0", allowed_args=["self", "excel_writer"], name="to_excel"
2265 )
2266 @doc(
2267 klass="object",
2268 storage_options=_shared_docs["storage_options"],
2269 storage_options_versionadded="1.2.0",
2270 )
2271 def to_excel(
2272 self,
2273 excel_writer: FilePath | WriteExcelBuffer | ExcelWriter,
2274 sheet_name: str = "Sheet1",
2275 na_rep: str = "",
2276 float_format: str | None = None,
2277 columns: Sequence[Hashable] | None = None,
2278 header: Sequence[Hashable] | bool_t = True,
2279 index: bool_t = True,
2280 index_label: IndexLabel | None = None,
2281 startrow: int = 0,
2282 startcol: int = 0,
2283 engine: Literal["openpyxl", "xlsxwriter"] | None = None,
2284 merge_cells: bool_t = True,
2285 inf_rep: str = "inf",
2286 freeze_panes: tuple[int, int] | None = None,
2287 storage_options: StorageOptions | None = None,
2288 engine_kwargs: dict[str, Any] | None = None,
2289 ) -> None:
2290 """
2291 Write {klass} to an Excel sheet.
2292
2293 To write a single {klass} to an Excel .xlsx file it is only necessary to
2294 specify a target file name. To write to multiple sheets it is necessary to
2295 create an `ExcelWriter` object with a target file name, and specify a sheet
2296 in the file to write to.
2297
        Multiple sheets may be written to by specifying a unique `sheet_name`
        for each one. Once all data has been written to the file, the changes
        must be saved.
2300 Note that creating an `ExcelWriter` object with a file name that already
2301 exists will result in the contents of the existing file being erased.
2302
2303 Parameters
2304 ----------
2305 excel_writer : path-like, file-like, or ExcelWriter object
2306 File path or existing ExcelWriter.
2307 sheet_name : str, default 'Sheet1'
2308 Name of sheet which will contain DataFrame.
2309 na_rep : str, default ''
2310 Missing data representation.
2311 float_format : str, optional
2312 Format string for floating point numbers. For example
2313 ``float_format="%.2f"`` will format 0.1234 to 0.12.
2314 columns : sequence or list of str, optional
2315 Columns to write.
        header : bool or list of str, default True
            Write out the column names. If a list of strings is given, it is
            assumed to be aliases for the column names.
2319 index : bool, default True
2320 Write row names (index).
2321 index_label : str or sequence, optional
2322 Column label for index column(s) if desired. If not specified, and
2323 `header` and `index` are True, then the index names are used. A
2324 sequence should be given if the DataFrame uses MultiIndex.
2325 startrow : int, default 0
2326 Upper left cell row to dump data frame.
2327 startcol : int, default 0
2328 Upper left cell column to dump data frame.
2329 engine : str, optional
2330 Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this
2331 via the options ``io.excel.xlsx.writer`` or
2332 ``io.excel.xlsm.writer``.
2333
2334 merge_cells : bool, default True
2335 Write MultiIndex and Hierarchical Rows as merged cells.
2336 inf_rep : str, default 'inf'
2337 Representation for infinity (there is no native representation for
2338 infinity in Excel).
2339 freeze_panes : tuple of int (length 2), optional
2340 Specifies the one-based bottommost row and rightmost column that
2341 is to be frozen.
2342 {storage_options}
2343
2344 .. versionadded:: {storage_options_versionadded}
2345 engine_kwargs : dict, optional
2346 Arbitrary keyword arguments passed to excel engine.
2347
2348 See Also
2349 --------
2350 to_csv : Write DataFrame to a comma-separated values (csv) file.
2351 ExcelWriter : Class for writing DataFrame objects into excel sheets.
2352 read_excel : Read an Excel file into a pandas DataFrame.
2353 read_csv : Read a comma-separated values (csv) file into DataFrame.
2354 io.formats.style.Styler.to_excel : Add styles to Excel sheet.
2355
2356 Notes
2357 -----
2358 For compatibility with :meth:`~DataFrame.to_csv`,
2359 to_excel serializes lists and dicts to strings before writing.
2360
2361 Once a workbook has been saved it is not possible to write further
2362 data without rewriting the whole workbook.
2363
2364 Examples
2365 --------
2366
2367 Create, write to and save a workbook:
2368
2369 >>> df1 = pd.DataFrame([['a', 'b'], ['c', 'd']],
2370 ... index=['row 1', 'row 2'],
2371 ... columns=['col 1', 'col 2'])
2372 >>> df1.to_excel("output.xlsx") # doctest: +SKIP
2373
2374 To specify the sheet name:
2375
2376 >>> df1.to_excel("output.xlsx",
2377 ... sheet_name='Sheet_name_1') # doctest: +SKIP
2378
2379 If you wish to write to more than one sheet in the workbook, it is
2380 necessary to specify an ExcelWriter object:
2381
2382 >>> df2 = df1.copy()
2383 >>> with pd.ExcelWriter('output.xlsx') as writer: # doctest: +SKIP
2384 ... df1.to_excel(writer, sheet_name='Sheet_name_1')
2385 ... df2.to_excel(writer, sheet_name='Sheet_name_2')
2386
2387 ExcelWriter can also be used to append to an existing Excel file:
2388
2389 >>> with pd.ExcelWriter('output.xlsx',
2390 ... mode='a') as writer: # doctest: +SKIP
2391 ... df1.to_excel(writer, sheet_name='Sheet_name_3')
2392
2393 To set the library that is used to write the Excel file,
2394 you can pass the `engine` keyword (the default engine is
2395 automatically chosen depending on the file extension):
2396
2397 >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') # doctest: +SKIP
2398 """
2399 if engine_kwargs is None:
2400 engine_kwargs = {}
2401
2402 df = self if isinstance(self, ABCDataFrame) else self.to_frame()
2403
2404 from pandas.io.formats.excel import ExcelFormatter
2405
2406 formatter = ExcelFormatter(
2407 df,
2408 na_rep=na_rep,
2409 cols=columns,
2410 header=header,
2411 float_format=float_format,
2412 index=index,
2413 index_label=index_label,
2414 merge_cells=merge_cells,
2415 inf_rep=inf_rep,
2416 )
2417 formatter.write(
2418 excel_writer,
2419 sheet_name=sheet_name,
2420 startrow=startrow,
2421 startcol=startcol,
2422 freeze_panes=freeze_panes,
2423 engine=engine,
2424 storage_options=storage_options,
2425 engine_kwargs=engine_kwargs,
2426 )
2427
2428 @final
2429 @deprecate_nonkeyword_arguments(
2430 version="3.0", allowed_args=["self", "path_or_buf"], name="to_json"
2431 )
2432 @doc(
2433 storage_options=_shared_docs["storage_options"],
2434 compression_options=_shared_docs["compression_options"] % "path_or_buf",
2435 )
2436 def to_json(
2437 self,
2438 path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
2439 orient: Literal["split", "records", "index", "table", "columns", "values"]
2440 | None = None,
2441 date_format: str | None = None,
2442 double_precision: int = 10,
2443 force_ascii: bool_t = True,
2444 date_unit: TimeUnit = "ms",
2445 default_handler: Callable[[Any], JSONSerializable] | None = None,
2446 lines: bool_t = False,
2447 compression: CompressionOptions = "infer",
2448 index: bool_t | None = None,
2449 indent: int | None = None,
2450 storage_options: StorageOptions | None = None,
2451 mode: Literal["a", "w"] = "w",
2452 ) -> str | None:
2453 """
2454 Convert the object to a JSON string.
2455
        Note that NaN values and None will be converted to null and datetime
        objects will be converted to UNIX timestamps.
2458
2459 Parameters
2460 ----------
2461 path_or_buf : str, path object, file-like object, or None, default None
2462 String, path object (implementing os.PathLike[str]), or file-like
2463 object implementing a write() function. If None, the result is
2464 returned as a string.
2465 orient : str
2466 Indication of expected JSON string format.
2467
2468 * Series:
2469
2470 - default is 'index'
2471 - allowed values are: {{'split', 'records', 'index', 'table'}}.
2472
2473 * DataFrame:
2474
2475 - default is 'columns'
2476 - allowed values are: {{'split', 'records', 'index', 'columns',
2477 'values', 'table'}}.
2478
2479 * The format of the JSON string:
2480
2481 - 'split' : dict like {{'index' -> [index], 'columns' -> [columns],
2482 'data' -> [values]}}
2483 - 'records' : list like [{{column -> value}}, ... , {{column -> value}}]
2484 - 'index' : dict like {{index -> {{column -> value}}}}
2485 - 'columns' : dict like {{column -> {{index -> value}}}}
2486 - 'values' : just the values array
2487 - 'table' : dict like {{'schema': {{schema}}, 'data': {{data}}}}
2488
2489 Describing the data, where data component is like ``orient='records'``.
2490
2491 date_format : {{None, 'epoch', 'iso'}}
2492 Type of date conversion. 'epoch' = epoch milliseconds,
2493 'iso' = ISO8601. The default depends on the `orient`. For
2494 ``orient='table'``, the default is 'iso'. For all other orients,
2495 the default is 'epoch'.
2496 double_precision : int, default 10
2497 The number of decimal places to use when encoding
2498 floating point values. The possible maximal value is 15.
2499 Passing double_precision greater than 15 will raise a ValueError.
2500 force_ascii : bool, default True
2501 Force encoded string to be ASCII.
2502 date_unit : str, default 'ms' (milliseconds)
2503 The time unit to encode to, governs timestamp and ISO8601
2504 precision. One of 's', 'ms', 'us', 'ns' for second, millisecond,
2505 microsecond, and nanosecond respectively.
2506 default_handler : callable, default None
2507 Handler to call if object cannot otherwise be converted to a
2508 suitable format for JSON. Should receive a single argument which is
2509 the object to convert and return a serialisable object.
        lines : bool, default False
            If 'orient' is 'records', write out line-delimited JSON. A
            ValueError is raised for any other 'orient', since the other
            formats are not list-like.
2514 {compression_options}
2515
2516 .. versionchanged:: 1.4.0 Zstandard support.
2517
        index : bool or None, default None
            The index is only used when 'orient' is 'split', 'index',
            'columns', or 'table'. Of these, 'index' and 'columns' do not
            support `index=False`.
2522
2523 indent : int, optional
2524 Length of whitespace used to indent each record.
2525
2526 {storage_options}
2527
2528 mode : str, default 'w' (writing)
2529 Specify the IO mode for output when supplying a path_or_buf.
2530 Accepted args are 'w' (writing) and 'a' (append) only.
2531 mode='a' is only supported when lines is True and orient is 'records'.
2532
2533 Returns
2534 -------
2535 None or str
2536 If path_or_buf is None, returns the resulting json format as a
2537 string. Otherwise returns None.
2538
2539 See Also
2540 --------
2541 read_json : Convert a JSON string to pandas object.
2542
2543 Notes
2544 -----
2545 The behavior of ``indent=0`` varies from the stdlib, which does not
2546 indent the output but does insert newlines. Currently, ``indent=0``
2547 and the default ``indent=None`` are equivalent in pandas, though this
2548 may change in a future release.
2549
2550 ``orient='table'`` contains a 'pandas_version' field under 'schema'.
2551 This stores the version of `pandas` used in the latest revision of the
2552 schema.
2553
2554 Examples
2555 --------
2556 >>> from json import loads, dumps
2557 >>> df = pd.DataFrame(
2558 ... [["a", "b"], ["c", "d"]],
2559 ... index=["row 1", "row 2"],
2560 ... columns=["col 1", "col 2"],
2561 ... )
2562
2563 >>> result = df.to_json(orient="split")
2564 >>> parsed = loads(result)
2565 >>> dumps(parsed, indent=4) # doctest: +SKIP
2566 {{
2567 "columns": [
2568 "col 1",
2569 "col 2"
2570 ],
2571 "index": [
2572 "row 1",
2573 "row 2"
2574 ],
2575 "data": [
2576 [
2577 "a",
2578 "b"
2579 ],
2580 [
2581 "c",
2582 "d"
2583 ]
2584 ]
2585 }}
2586
2587 Encoding/decoding a Dataframe using ``'records'`` formatted JSON.
2588 Note that index labels are not preserved with this encoding.
2589
2590 >>> result = df.to_json(orient="records")
2591 >>> parsed = loads(result)
2592 >>> dumps(parsed, indent=4) # doctest: +SKIP
2593 [
2594 {{
2595 "col 1": "a",
2596 "col 2": "b"
2597 }},
2598 {{
2599 "col 1": "c",
2600 "col 2": "d"
2601 }}
2602 ]
2603
2604 Encoding/decoding a Dataframe using ``'index'`` formatted JSON:
2605
2606 >>> result = df.to_json(orient="index")
2607 >>> parsed = loads(result)
2608 >>> dumps(parsed, indent=4) # doctest: +SKIP
2609 {{
2610 "row 1": {{
2611 "col 1": "a",
2612 "col 2": "b"
2613 }},
2614 "row 2": {{
2615 "col 1": "c",
2616 "col 2": "d"
2617 }}
2618 }}
2619
2620 Encoding/decoding a Dataframe using ``'columns'`` formatted JSON:
2621
2622 >>> result = df.to_json(orient="columns")
2623 >>> parsed = loads(result)
2624 >>> dumps(parsed, indent=4) # doctest: +SKIP
2625 {{
2626 "col 1": {{
2627 "row 1": "a",
2628 "row 2": "c"
2629 }},
2630 "col 2": {{
2631 "row 1": "b",
2632 "row 2": "d"
2633 }}
2634 }}
2635
2636 Encoding/decoding a Dataframe using ``'values'`` formatted JSON:
2637
2638 >>> result = df.to_json(orient="values")
2639 >>> parsed = loads(result)
2640 >>> dumps(parsed, indent=4) # doctest: +SKIP
2641 [
2642 [
2643 "a",
2644 "b"
2645 ],
2646 [
2647 "c",
2648 "d"
2649 ]
2650 ]
2651
2652 Encoding with Table Schema:
2653
2654 >>> result = df.to_json(orient="table")
2655 >>> parsed = loads(result)
2656 >>> dumps(parsed, indent=4) # doctest: +SKIP
2657 {{
2658 "schema": {{
2659 "fields": [
2660 {{
2661 "name": "index",
2662 "type": "string"
2663 }},
2664 {{
2665 "name": "col 1",
2666 "type": "string"
2667 }},
2668 {{
2669 "name": "col 2",
2670 "type": "string"
2671 }}
2672 ],
2673 "primaryKey": [
2674 "index"
2675 ],
2676 "pandas_version": "1.4.0"
2677 }},
2678 "data": [
2679 {{
2680 "index": "row 1",
2681 "col 1": "a",
2682 "col 2": "b"
2683 }},
2684 {{
2685 "index": "row 2",
2686 "col 1": "c",
2687 "col 2": "d"
2688 }}
2689 ]
2690 }}
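
        Encoding line-delimited JSON (``lines=True`` requires
        ``orient='records'``); a short sketch:

        >>> print(df.to_json(orient="records", lines=True))  # doctest: +SKIP
        {{"col 1":"a","col 2":"b"}}
        {{"col 1":"c","col 2":"d"}}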
2691 """
2692 from pandas.io import json
2693
2694 if date_format is None and orient == "table":
2695 date_format = "iso"
2696 elif date_format is None:
2697 date_format = "epoch"
2698
2699 config.is_nonnegative_int(indent)
2700 indent = indent or 0
2701
2702 return json.to_json(
2703 path_or_buf=path_or_buf,
2704 obj=self,
2705 orient=orient,
2706 date_format=date_format,
2707 double_precision=double_precision,
2708 force_ascii=force_ascii,
2709 date_unit=date_unit,
2710 default_handler=default_handler,
2711 lines=lines,
2712 compression=compression,
2713 index=index,
2714 indent=indent,
2715 storage_options=storage_options,
2716 mode=mode,
2717 )
2718
2719 @final
2720 @deprecate_nonkeyword_arguments(
2721 version="3.0", allowed_args=["self", "path_or_buf"], name="to_hdf"
2722 )
2723 def to_hdf(
2724 self,
2725 path_or_buf: FilePath | HDFStore,
2726 key: str,
2727 mode: Literal["a", "w", "r+"] = "a",
2728 complevel: int | None = None,
2729 complib: Literal["zlib", "lzo", "bzip2", "blosc"] | None = None,
2730 append: bool_t = False,
2731 format: Literal["fixed", "table"] | None = None,
2732 index: bool_t = True,
2733 min_itemsize: int | dict[str, int] | None = None,
2734 nan_rep=None,
2735 dropna: bool_t | None = None,
2736 data_columns: Literal[True] | list[str] | None = None,
2737 errors: OpenFileErrors = "strict",
2738 encoding: str = "UTF-8",
2739 ) -> None:
2740 """
2741 Write the contained data to an HDF5 file using HDFStore.
2742
2743 Hierarchical Data Format (HDF) is self-describing, allowing an
2744 application to interpret the structure and contents of a file with
2745 no outside information. One HDF file can hold a mix of related objects
2746 which can be accessed as a group or as individual objects.
2747
        To add another DataFrame or Series to an existing HDF file,
        please use append mode and a different key.
2750
2751 .. warning::
2752
2753 One can store a subclass of ``DataFrame`` or ``Series`` to HDF5,
2754 but the type of the subclass is lost upon storing.
2755
2756 For more information see the :ref:`user guide <io.hdf5>`.
2757
2758 Parameters
2759 ----------
2760 path_or_buf : str or pandas.HDFStore
2761 File path or HDFStore object.
2762 key : str
2763 Identifier for the group in the store.
2764 mode : {'a', 'w', 'r+'}, default 'a'
2765 Mode to open file:
2766
2767 - 'w': write, a new file is created (an existing file with
2768 the same name would be deleted).
2769 - 'a': append, an existing file is opened for reading and
2770 writing, and if the file does not exist it is created.
2771 - 'r+': similar to 'a', but the file must already exist.
2772 complevel : {0-9}, default None
2773 Specifies a compression level for data.
2774 A value of 0 or None disables compression.
2775 complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
2776 Specifies the compression library to be used.
2777 These additional compressors for Blosc are supported
2778 (default if no compressor specified: 'blosc:blosclz'):
2779 {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
2780 'blosc:zlib', 'blosc:zstd'}.
            Specifying a compression library which is not available raises
            a ValueError.
        append : bool, default False
            For Table formats, append the input data to the existing data.
2785 format : {'fixed', 'table', None}, default 'fixed'
2786 Possible values:
2787
2788 - 'fixed': Fixed format. Fast writing/reading. Not-appendable,
2789 nor searchable.
2790 - 'table': Table format. Write as a PyTables Table structure
2791 which may perform worse but allow more flexible operations
2792 like searching / selecting subsets of the data.
2793 - If None, pd.get_option('io.hdf.default_format') is checked,
2794 followed by fallback to "fixed".
2795 index : bool, default True
2796 Write DataFrame index as a column.
2797 min_itemsize : dict or int, optional
2798 Map column names to minimum string sizes for columns.
2799 nan_rep : Any, optional
2800 How to represent null values as str.
2801 Not allowed with append=True.
        dropna : bool, default False
2803 Remove missing values.
2804 data_columns : list of columns or True, optional
2805 List of columns to create as indexed data columns for on-disk
2806 queries, or True to use all columns. By default only the axes
2807 of the object are indexed. See
            :ref:`Query via data columns<io.hdf5-query-data-columns>` for
2809 more information.
2810 Applicable only to format='table'.
2811 errors : str, default 'strict'
2812 Specifies how encoding and decoding errors are to be handled.
2813 See the errors argument for :func:`open` for a full list
2814 of options.
        encoding : str, default "UTF-8"
            Character encoding for string data stored in the file.
2816
2817 See Also
2818 --------
2819 read_hdf : Read from HDF file.
2820 DataFrame.to_orc : Write a DataFrame to the binary orc format.
2821 DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
2822 DataFrame.to_sql : Write to a SQL table.
2823 DataFrame.to_feather : Write out feather-format for DataFrames.
2824 DataFrame.to_csv : Write out to a csv file.
2825
2826 Examples
2827 --------
2828 >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
2829 ... index=['a', 'b', 'c']) # doctest: +SKIP
2830 >>> df.to_hdf('data.h5', key='df', mode='w') # doctest: +SKIP
2831
2832 We can add another object to the same file:
2833
2834 >>> s = pd.Series([1, 2, 3, 4]) # doctest: +SKIP
2835 >>> s.to_hdf('data.h5', key='s') # doctest: +SKIP
2836
2837 Reading from HDF file:
2838
2839 >>> pd.read_hdf('data.h5', 'df') # doctest: +SKIP
2840 A B
2841 a 1 4
2842 b 2 5
2843 c 3 6
2844 >>> pd.read_hdf('data.h5', 's') # doctest: +SKIP
2845 0 1
2846 1 2
2847 2 3
2848 3 4
2849 dtype: int64
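
        Appending to an existing key requires the table format (a sketch;
        paths are illustrative):

        >>> df.to_hdf('data.h5', key='df2', format='table') # doctest: +SKIP
        >>> df.to_hdf('data.h5', key='df2', append=True,
        ...           format='table') # doctest: +SKIP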
2850 """
2851 from pandas.io import pytables
2852
2853 # Argument 3 to "to_hdf" has incompatible type "NDFrame"; expected
2854 # "Union[DataFrame, Series]" [arg-type]
2855 pytables.to_hdf(
2856 path_or_buf,
2857 key,
2858 self, # type: ignore[arg-type]
2859 mode=mode,
2860 complevel=complevel,
2861 complib=complib,
2862 append=append,
2863 format=format,
2864 index=index,
2865 min_itemsize=min_itemsize,
2866 nan_rep=nan_rep,
2867 dropna=dropna,
2868 data_columns=data_columns,
2869 errors=errors,
2870 encoding=encoding,
2871 )
2872
2873 @final
2874 @deprecate_nonkeyword_arguments(
2875 version="3.0", allowed_args=["self", "name", "con"], name="to_sql"
2876 )
2877 def to_sql(
2878 self,
2879 name: str,
2880 con,
2881 schema: str | None = None,
2882 if_exists: Literal["fail", "replace", "append"] = "fail",
2883 index: bool_t = True,
2884 index_label: IndexLabel | None = None,
2885 chunksize: int | None = None,
2886 dtype: DtypeArg | None = None,
2887 method: Literal["multi"] | Callable | None = None,
2888 ) -> int | None:
2889 """
2890 Write records stored in a DataFrame to a SQL database.
2891
2892 Databases supported by SQLAlchemy [1]_ are supported. Tables can be
2893 newly created, appended to, or overwritten.
2894
2895 Parameters
2896 ----------
2897 name : str
2898 Name of SQL table.
2899 con : sqlalchemy.engine.(Engine or Connection) or sqlite3.Connection
2900 Using SQLAlchemy makes it possible to use any DB supported by that
2901 library. Legacy support is provided for sqlite3.Connection objects. The user
2902 is responsible for engine disposal and connection closure for the SQLAlchemy
2903 connectable. See `here \
2904 <https://docs.sqlalchemy.org/en/20/core/connections.html>`_.
2905 If passing a sqlalchemy.engine.Connection which is already in a transaction,
2906 the transaction will not be committed. If passing a sqlite3.Connection,
2907 it will not be possible to roll back the record insertion.
2908
2909 schema : str, optional
2910 Specify the schema (if database flavor supports this). If None, use
2911 default schema.
2912 if_exists : {'fail', 'replace', 'append'}, default 'fail'
2913 How to behave if the table already exists.
2914
2915 * fail: Raise a ValueError.
2916 * replace: Drop the table before inserting new values.
2917 * append: Insert new values to the existing table.
2918
2919 index : bool, default True
2920 Write DataFrame index as a column. Uses `index_label` as the column
2921 name in the table. Creates a table index for this column.
2922 index_label : str or sequence, default None
2923 Column label for index column(s). If None is given (default) and
2924 `index` is True, then the index names are used.
2925 A sequence should be given if the DataFrame uses MultiIndex.
2926 chunksize : int, optional
2927 Specify the number of rows in each batch to be written at a time.
2928 By default, all rows will be written at once.
2929 dtype : dict or scalar, optional
2930 Specifying the datatype for columns. If a dictionary is used, the
2931 keys should be the column names and the values should be the
2932 SQLAlchemy types or strings for the sqlite3 legacy mode. If a
2933 scalar is provided, it will be applied to all columns.
2934 method : {None, 'multi', callable}, optional
2935 Controls the SQL insertion clause used:
2936
2937 * None : Uses standard SQL ``INSERT`` clause (one per row).
2938 * 'multi': Pass multiple values in a single ``INSERT`` clause.
2939 * callable with signature ``(pd_table, conn, keys, data_iter)``.
2940
2941 Details and a sample callable implementation can be found in the
2942 section :ref:`insert method <io.sql.method>`.
2943
2944 Returns
2945 -------
2946 None or int
2947 Number of rows affected by to_sql. None is returned if the callable
2948 passed into ``method`` does not return an integer number of rows.
2949
            The number of returned rows affected is the sum of the ``rowcount``
            attribute of ``sqlite3.Cursor`` or the SQLAlchemy connectable, which
            may not reflect the exact number of written rows as stipulated in the
            `sqlite3 <https://docs.python.org/3/library/sqlite3.html#sqlite3.Cursor.rowcount>`__ or
            `SQLAlchemy <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.CursorResult.rowcount>`__
            documentation.
2955
2956 .. versionadded:: 1.4.0
2957
2958 Raises
2959 ------
2960 ValueError
2961 When the table already exists and `if_exists` is 'fail' (the
2962 default).
2963
2964 See Also
2965 --------
2966 read_sql : Read a DataFrame from a table.
2967
2968 Notes
2969 -----
2970 Timezone aware datetime columns will be written as
2971 ``Timestamp with timezone`` type with SQLAlchemy if supported by the
2972 database. Otherwise, the datetimes will be stored as timezone unaware
2973 timestamps local to the original timezone.
2974
2975 Not all datastores support ``method="multi"``. Oracle, for example,
2976 does not support multi-value insert.
2977
2978 References
2979 ----------
2980 .. [1] https://docs.sqlalchemy.org
2981 .. [2] https://www.python.org/dev/peps/pep-0249/
2982
2983 Examples
2984 --------
2985 Create an in-memory SQLite database.
2986
2987 >>> from sqlalchemy import create_engine
2988 >>> engine = create_engine('sqlite://', echo=False)
2989
2990 Create a table from scratch with 3 rows.
2991
2992 >>> df = pd.DataFrame({'name' : ['User 1', 'User 2', 'User 3']})
2993 >>> df
2994 name
2995 0 User 1
2996 1 User 2
2997 2 User 3
2998
2999 >>> df.to_sql(name='users', con=engine)
3000 3
3001 >>> from sqlalchemy import text
3002 >>> with engine.connect() as conn:
3003 ... conn.execute(text("SELECT * FROM users")).fetchall()
3004 [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')]
3005
3006 An `sqlalchemy.engine.Connection` can also be passed to `con`:
3007
3008 >>> with engine.begin() as connection:
3009 ... df1 = pd.DataFrame({'name' : ['User 4', 'User 5']})
3010 ... df1.to_sql(name='users', con=connection, if_exists='append')
3011 2
3012
3013 This is allowed to support operations that require that the same
3014 DBAPI connection is used for the entire operation.
3015
3016 >>> df2 = pd.DataFrame({'name' : ['User 6', 'User 7']})
3017 >>> df2.to_sql(name='users', con=engine, if_exists='append')
3018 2
3019 >>> with engine.connect() as conn:
3020 ... conn.execute(text("SELECT * FROM users")).fetchall()
3021 [(0, 'User 1'), (1, 'User 2'), (2, 'User 3'),
3022 (0, 'User 4'), (1, 'User 5'), (0, 'User 6'),
3023 (1, 'User 7')]
3024
3025 Overwrite the table with just ``df2``.
3026
3027 >>> df2.to_sql(name='users', con=engine, if_exists='replace',
3028 ... index_label='id')
3029 2
3030 >>> with engine.connect() as conn:
3031 ... conn.execute(text("SELECT * FROM users")).fetchall()
3032 [(0, 'User 6'), (1, 'User 7')]
3033
3034 Use ``method`` to define a callable insertion method to do nothing
3035 if there's a primary key conflict on a table in a PostgreSQL database.
3036
3037 >>> from sqlalchemy.dialects.postgresql import insert
3038 >>> def insert_on_conflict_nothing(table, conn, keys, data_iter):
3039 ... # "a" is the primary key in "conflict_table"
3040 ... data = [dict(zip(keys, row)) for row in data_iter]
3041 ... stmt = insert(table.table).values(data).on_conflict_do_nothing(index_elements=["a"])
3042 ... result = conn.execute(stmt)
3043 ... return result.rowcount
3044 >>> df_conflict.to_sql(name="conflict_table", con=conn, if_exists="append", method=insert_on_conflict_nothing) # doctest: +SKIP
3045 0
3046
3047 For MySQL, a callable to update columns ``b`` and ``c`` if there's a conflict
3048 on a primary key.
3049
3050 >>> from sqlalchemy.dialects.mysql import insert
3051 >>> def insert_on_conflict_update(table, conn, keys, data_iter):
3052 ... # update columns "b" and "c" on primary key conflict
3053 ... data = [dict(zip(keys, row)) for row in data_iter]
3054 ... stmt = (
3055 ... insert(table.table)
3056 ... .values(data)
3057 ... )
3058 ... stmt = stmt.on_duplicate_key_update(b=stmt.inserted.b, c=stmt.inserted.c)
3059 ... result = conn.execute(stmt)
3060 ... return result.rowcount
3061 >>> df_conflict.to_sql(name="conflict_table", con=conn, if_exists="append", method=insert_on_conflict_update) # doctest: +SKIP
3062 2
3063
3064 Specify the dtype (especially useful for integers with missing values).
3065 Notice that while pandas is forced to store the data as floating point,
3066 the database supports nullable integers. When fetching the data with
3067 Python, we get back integer scalars.
3068
3069 >>> df = pd.DataFrame({"A": [1, None, 2]})
3070 >>> df
3071 A
3072 0 1.0
3073 1 NaN
3074 2 2.0
3075
3076 >>> from sqlalchemy.types import Integer
3077 >>> df.to_sql(name='integers', con=engine, index=False,
3078 ... dtype={"A": Integer()})
3079 3
3080
3081 >>> with engine.connect() as conn:
3082 ... conn.execute(text("SELECT * FROM integers")).fetchall()
3083 [(1,), (None,), (2,)]
3084 """ # noqa: E501
3085 from pandas.io import sql
3086
3087 return sql.to_sql(
3088 self,
3089 name,
3090 con,
3091 schema=schema,
3092 if_exists=if_exists,
3093 index=index,
3094 index_label=index_label,
3095 chunksize=chunksize,
3096 dtype=dtype,
3097 method=method,
3098 )
3099
3100 @final
3101 @deprecate_nonkeyword_arguments(
3102 version="3.0", allowed_args=["self", "path"], name="to_pickle"
3103 )
3104 @doc(
3105 storage_options=_shared_docs["storage_options"],
3106 compression_options=_shared_docs["compression_options"] % "path",
3107 )
3108 def to_pickle(
3109 self,
3110 path: FilePath | WriteBuffer[bytes],
3111 compression: CompressionOptions = "infer",
3112 protocol: int = pickle.HIGHEST_PROTOCOL,
3113 storage_options: StorageOptions | None = None,
3114 ) -> None:
3115 """
3116 Pickle (serialize) object to file.
3117
3118 Parameters
3119 ----------
3120 path : str, path object, or file-like object
3121 String, path object (implementing ``os.PathLike[str]``), or file-like
3122 object implementing a binary ``write()`` function. File path where
3123 the pickled object will be stored.
3124 {compression_options}
3125 protocol : int
3126 Int which indicates which protocol should be used by the pickler,
3127 default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible
3128 values are 0, 1, 2, 3, 4, 5. A negative value for the protocol
3129 parameter is equivalent to setting its value to HIGHEST_PROTOCOL.
3130
3131 .. [1] https://docs.python.org/3/library/pickle.html.
3132
3133 {storage_options}
3134
3135 See Also
3136 --------
3137 read_pickle : Load pickled pandas object (or any object) from file.
3138 DataFrame.to_hdf : Write DataFrame to an HDF5 file.
3139 DataFrame.to_sql : Write DataFrame to a SQL database.
3140 DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
3141
3142 Examples
3143 --------
3144 >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) # doctest: +SKIP
3145 >>> original_df # doctest: +SKIP
3146 foo bar
3147 0 0 5
3148 1 1 6
3149 2 2 7
3150 3 3 8
3151 4 4 9
3152 >>> original_df.to_pickle("./dummy.pkl") # doctest: +SKIP
3153
3154 >>> unpickled_df = pd.read_pickle("./dummy.pkl") # doctest: +SKIP
3155 >>> unpickled_df # doctest: +SKIP
3156 foo bar
3157 0 0 5
3158 1 1 6
3159 2 2 7
3160 3 3 8
3161 4 4 9
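
        Compression is inferred from the file extension, e.g. a ``.gz``
        suffix selects gzip (paths are illustrative):

        >>> original_df.to_pickle("./dummy.pkl.gz") # doctest: +SKIP
        >>> pd.read_pickle("./dummy.pkl.gz") # doctest: +SKIP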
3162 """ # noqa: E501
3163 from pandas.io.pickle import to_pickle
3164
3165 to_pickle(
3166 self,
3167 path,
3168 compression=compression,
3169 protocol=protocol,
3170 storage_options=storage_options,
3171 )
3172
3173 @final
3174 @deprecate_nonkeyword_arguments(
3175 version="3.0", allowed_args=["self"], name="to_clipboard"
3176 )
3177 def to_clipboard(
3178 self, excel: bool_t = True, sep: str | None = None, **kwargs
3179 ) -> None:
3180 r"""
3181 Copy object to the system clipboard.
3182
3183 Write a text representation of object to the system clipboard.
3184 This can be pasted into Excel, for example.
3185
3186 Parameters
3187 ----------
3188 excel : bool, default True
3189 Produce output in a csv format for easy pasting into excel.
3190
3191 - True, use the provided separator for csv pasting.
3192 - False, write a string representation of the object to the clipboard.
3193
3194 sep : str, default ``'\t'``
3195 Field delimiter.
3196 **kwargs
3197 These parameters will be passed to DataFrame.to_csv.
3198
3199 See Also
3200 --------
3201 DataFrame.to_csv : Write a DataFrame to a comma-separated values
3202 (csv) file.
3203 read_clipboard : Read text from clipboard and pass to read_csv.
3204
3205 Notes
3206 -----
        Requirements for your platform:
3208
3209 - Linux : `xclip`, or `xsel` (with `PyQt4` modules)
3210 - Windows : none
3211 - macOS : none
3212
        This method uses the processes developed for the `pyperclip` package.
        A solution for rendering any output string format is given in the
        examples.
3215
3216 Examples
3217 --------
3218 Copy the contents of a DataFrame to the clipboard.
3219
3220 >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
3221
3222 >>> df.to_clipboard(sep=',') # doctest: +SKIP
3223 ... # Wrote the following to the system clipboard:
3224 ... # ,A,B,C
3225 ... # 0,1,2,3
3226 ... # 1,4,5,6
3227
3228 We can omit the index by passing the keyword `index` and setting
3229 it to false.
3230
3231 >>> df.to_clipboard(sep=',', index=False) # doctest: +SKIP
3232 ... # Wrote the following to the system clipboard:
3233 ... # A,B,C
3234 ... # 1,2,3
3235 ... # 4,5,6
3236
3237 Using the original `pyperclip` package for any string output format.
3238
3239 .. code-block:: python
3240
3241 import pyperclip
3242 html = df.style.to_html()
3243 pyperclip.copy(html)
3244 """
3245 from pandas.io import clipboards
3246
3247 clipboards.to_clipboard(self, excel=excel, sep=sep, **kwargs)
3248
3249 @final
3250 def to_xarray(self):
3251 """
3252 Return an xarray object from the pandas object.
3253
3254 Returns
3255 -------
3256 xarray.DataArray or xarray.Dataset
3257 Data in the pandas structure converted to Dataset if the object is
3258 a DataFrame, or a DataArray if the object is a Series.
3259
3260 See Also
3261 --------
3262 DataFrame.to_hdf : Write DataFrame to an HDF5 file.
3263 DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
3264
3265 Notes
3266 -----
        See the `xarray docs <https://xarray.pydata.org/en/stable/>`__ for
        more information.
3268
3269 Examples
3270 --------
3271 >>> df = pd.DataFrame([('falcon', 'bird', 389.0, 2),
3272 ... ('parrot', 'bird', 24.0, 2),
3273 ... ('lion', 'mammal', 80.5, 4),
3274 ... ('monkey', 'mammal', np.nan, 4)],
3275 ... columns=['name', 'class', 'max_speed',
3276 ... 'num_legs'])
3277 >>> df
3278 name class max_speed num_legs
3279 0 falcon bird 389.0 2
3280 1 parrot bird 24.0 2
3281 2 lion mammal 80.5 4
3282 3 monkey mammal NaN 4
3283
3284 >>> df.to_xarray() # doctest: +SKIP
3285 <xarray.Dataset>
3286 Dimensions: (index: 4)
3287 Coordinates:
3288 * index (index) int64 32B 0 1 2 3
3289 Data variables:
3290 name (index) object 32B 'falcon' 'parrot' 'lion' 'monkey'
3291 class (index) object 32B 'bird' 'bird' 'mammal' 'mammal'
3292 max_speed (index) float64 32B 389.0 24.0 80.5 nan
3293 num_legs (index) int64 32B 2 2 4 4
3294
3295 >>> df['max_speed'].to_xarray() # doctest: +SKIP
3296 <xarray.DataArray 'max_speed' (index: 4)>
3297 array([389. , 24. , 80.5, nan])
3298 Coordinates:
3299 * index (index) int64 0 1 2 3
3300
3301 >>> dates = pd.to_datetime(['2018-01-01', '2018-01-01',
3302 ... '2018-01-02', '2018-01-02'])
3303 >>> df_multiindex = pd.DataFrame({'date': dates,
3304 ... 'animal': ['falcon', 'parrot',
3305 ... 'falcon', 'parrot'],
3306 ... 'speed': [350, 18, 361, 15]})
3307 >>> df_multiindex = df_multiindex.set_index(['date', 'animal'])
3308
3309 >>> df_multiindex
3310 speed
3311 date animal
3312 2018-01-01 falcon 350
3313 parrot 18
3314 2018-01-02 falcon 361
3315 parrot 15
3316
3317 >>> df_multiindex.to_xarray() # doctest: +SKIP
3318 <xarray.Dataset>
3319 Dimensions: (date: 2, animal: 2)
3320 Coordinates:
3321 * date (date) datetime64[ns] 2018-01-01 2018-01-02
3322 * animal (animal) object 'falcon' 'parrot'
3323 Data variables:
3324 speed (date, animal) int64 350 18 361 15
3325 """
3326 xarray = import_optional_dependency("xarray")
3327
3328 if self.ndim == 1:
3329 return xarray.DataArray.from_series(self)
3330 else:
3331 return xarray.Dataset.from_dataframe(self)
3332
3333 @overload
3334 def to_latex(
3335 self,
3336 buf: None = ...,
3337 columns: Sequence[Hashable] | None = ...,
3338 header: bool_t | SequenceNotStr[str] = ...,
3339 index: bool_t = ...,
3340 na_rep: str = ...,
3341 formatters: FormattersType | None = ...,
3342 float_format: FloatFormatType | None = ...,
3343 sparsify: bool_t | None = ...,
3344 index_names: bool_t = ...,
3345 bold_rows: bool_t = ...,
3346 column_format: str | None = ...,
3347 longtable: bool_t | None = ...,
3348 escape: bool_t | None = ...,
3349 encoding: str | None = ...,
3350 decimal: str = ...,
3351 multicolumn: bool_t | None = ...,
3352 multicolumn_format: str | None = ...,
3353 multirow: bool_t | None = ...,
3354 caption: str | tuple[str, str] | None = ...,
3355 label: str | None = ...,
3356 position: str | None = ...,
3357 ) -> str:
3358 ...
3359
3360 @overload
3361 def to_latex(
3362 self,
3363 buf: FilePath | WriteBuffer[str],
3364 columns: Sequence[Hashable] | None = ...,
3365 header: bool_t | SequenceNotStr[str] = ...,
3366 index: bool_t = ...,
3367 na_rep: str = ...,
3368 formatters: FormattersType | None = ...,
3369 float_format: FloatFormatType | None = ...,
3370 sparsify: bool_t | None = ...,
3371 index_names: bool_t = ...,
3372 bold_rows: bool_t = ...,
3373 column_format: str | None = ...,
3374 longtable: bool_t | None = ...,
3375 escape: bool_t | None = ...,
3376 encoding: str | None = ...,
3377 decimal: str = ...,
3378 multicolumn: bool_t | None = ...,
3379 multicolumn_format: str | None = ...,
3380 multirow: bool_t | None = ...,
3381 caption: str | tuple[str, str] | None = ...,
3382 label: str | None = ...,
3383 position: str | None = ...,
3384 ) -> None:
3385 ...
3386
3387 @final
3388 @deprecate_nonkeyword_arguments(
3389 version="3.0", allowed_args=["self", "buf"], name="to_latex"
3390 )
3391 def to_latex(
3392 self,
3393 buf: FilePath | WriteBuffer[str] | None = None,
3394 columns: Sequence[Hashable] | None = None,
3395 header: bool_t | SequenceNotStr[str] = True,
3396 index: bool_t = True,
3397 na_rep: str = "NaN",
3398 formatters: FormattersType | None = None,
3399 float_format: FloatFormatType | None = None,
3400 sparsify: bool_t | None = None,
3401 index_names: bool_t = True,
3402 bold_rows: bool_t = False,
3403 column_format: str | None = None,
3404 longtable: bool_t | None = None,
3405 escape: bool_t | None = None,
3406 encoding: str | None = None,
3407 decimal: str = ".",
3408 multicolumn: bool_t | None = None,
3409 multicolumn_format: str | None = None,
3410 multirow: bool_t | None = None,
3411 caption: str | tuple[str, str] | None = None,
3412 label: str | None = None,
3413 position: str | None = None,
3414 ) -> str | None:
3415 r"""
3416 Render object to a LaTeX tabular, longtable, or nested table.
3417
3418 Requires ``\usepackage{{booktabs}}``. The output can be copy/pasted
3419 into a main LaTeX document or read from an external file
3420 with ``\input{{table.tex}}``.
3421
3422 .. versionchanged:: 2.0.0
3423 Refactored to use the Styler implementation via jinja2 templating.
3424
3425 Parameters
3426 ----------
3427 buf : str, Path or StringIO-like, optional, default None
3428 Buffer to write to. If None, the output is returned as a string.
3429 columns : list of label, optional
3430 The subset of columns to write. Writes all columns by default.
3431 header : bool or list of str, default True
3432 Write out the column names. If a list of strings is given,
3433 it is assumed to be aliases for the column names.
3434 index : bool, default True
3435 Write row names (index).
3436 na_rep : str, default 'NaN'
3437 Missing data representation.
3438 formatters : list of functions or dict of {{str: function}}, optional
3439 Formatter functions to apply to columns' elements by position or
3440 name. The result of each function must be a unicode string.
3441 List must be of length equal to the number of columns.
3442 float_format : one-parameter function or str, optional, default None
3443 Formatter for floating point numbers. For example
3444 ``float_format="%.2f"`` and ``float_format="{{:0.2f}}".format`` will
3445 both result in 0.1234 being formatted as 0.12.
3446 sparsify : bool, optional
3447 Set to False for a DataFrame with a hierarchical index to print
3448 every multiindex key at each row. By default, the value will be
3449 read from the config module.
3450 index_names : bool, default True
3451 Prints the names of the indexes.
3452 bold_rows : bool, default False
3453 Make the row labels bold in the output.
3454 column_format : str, optional
3455 The columns format as specified in `LaTeX table format
3456 <https://en.wikibooks.org/wiki/LaTeX/Tables>`__ e.g. 'rcl' for 3
3457 columns. By default, 'l' will be used for all columns except
3458 columns of numbers, which default to 'r'.
3459 longtable : bool, optional
3460 Use a longtable environment instead of tabular. Requires
3461 adding a \usepackage{{longtable}} to your LaTeX preamble.
3462 By default, the value will be read from the pandas config
3463 module, and set to `True` if the option ``styler.latex.environment`` is
3464 `"longtable"`.
3465
3466 .. versionchanged:: 2.0.0
3467 The pandas option affecting this argument has changed.
        escape : bool, optional
            By default, the value will be read from the pandas config
            module and set to `True` if the option ``styler.format.escape`` is
            `"latex"`. When set to False, LaTeX special characters in the
            column names are not escaped.
3473
3474 .. versionchanged:: 2.0.0
3475 The pandas option affecting this argument has changed, as has the
3476 default value to `False`.
3477 encoding : str, optional
3478 A string representing the encoding to use in the output file,
3479 defaults to 'utf-8'.
3480 decimal : str, default '.'
3481 Character recognized as decimal separator, e.g. ',' in Europe.
3482 multicolumn : bool, default True
3483 Use \multicolumn to enhance MultiIndex columns.
3484 The default will be read from the config module, and is set
3485 as the option ``styler.sparse.columns``.
3486
3487 .. versionchanged:: 2.0.0
3488 The pandas option affecting this argument has changed.
3489 multicolumn_format : str, default 'r'
            The alignment for multicolumns, similar to `column_format`.
3491 The default will be read from the config module, and is set as the option
3492 ``styler.latex.multicol_align``.
3493
3494 .. versionchanged:: 2.0.0
3495 The pandas option affecting this argument has changed, as has the
3496 default value to "r".
3497 multirow : bool, default True
3498 Use \multirow to enhance MultiIndex rows. Requires adding a
3499 \usepackage{{multirow}} to your LaTeX preamble. Will print
3500 centered labels (instead of top-aligned) across the contained
3501 rows, separating groups via clines. The default will be read
3502 from the pandas config module, and is set as the option
3503 ``styler.sparse.index``.
3504
3505 .. versionchanged:: 2.0.0
3506 The pandas option affecting this argument has changed, as has the
3507 default value to `True`.
3508 caption : str or tuple, optional
3509 Tuple (full_caption, short_caption),
3510 which results in ``\caption[short_caption]{{full_caption}}``;
3511 if a single string is passed, no short caption will be set.
3512 label : str, optional
3513 The LaTeX label to be placed inside ``\label{{}}`` in the output.
3514 This is used with ``\ref{{}}`` in the main ``.tex`` file.
3515
3516 position : str, optional
3517 The LaTeX positional argument for tables, to be placed after
3518 ``\begin{{}}`` in the output.
3519
3520 Returns
3521 -------
3522 str or None
3523 If buf is None, returns the result as a string. Otherwise returns None.
3524
3525 See Also
3526 --------
3527 io.formats.style.Styler.to_latex : Render a DataFrame to LaTeX
3528 with conditional formatting.
3529 DataFrame.to_string : Render a DataFrame to a console-friendly
3530 tabular output.
3531 DataFrame.to_html : Render a DataFrame as an HTML table.
3532
3533 Notes
3534 -----
3535 As of v2.0.0 this method has changed to use the Styler implementation as
3536 part of :meth:`.Styler.to_latex` via ``jinja2`` templating. This means
3537 that ``jinja2`` is a requirement, and needs to be installed, for this method
3538 to function. It is advised that users switch to using Styler, since that
3539 implementation is more frequently updated and contains much more
3540 flexibility with the output.
3541
3542 Examples
3543 --------
3544 Convert a general DataFrame to LaTeX with formatting:
3545
3546 >>> df = pd.DataFrame(dict(name=['Raphael', 'Donatello'],
3547 ... age=[26, 45],
3548 ... height=[181.23, 177.65]))
3549 >>> print(df.to_latex(index=False,
3550 ... formatters={"name": str.upper},
3551 ... float_format="{:.1f}".format,
3552 ... )) # doctest: +SKIP
3553 \begin{tabular}{lrr}
3554 \toprule
3555 name & age & height \\
3556 \midrule
3557 RAPHAEL & 26 & 181.2 \\
3558 DONATELLO & 45 & 177.7 \\
3559 \bottomrule
3560 \end{tabular}
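
        A caption and label can be attached to the table (output omitted;
        the values are illustrative):

        >>> print(df.to_latex(caption="Turtle data",
        ...                   label="tab:turtles")) # doctest: +SKIP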
3561 """
3562 # Get defaults from the pandas config
3563 if self.ndim == 1:
3564 self = self.to_frame()
3565 if longtable is None:
3566 longtable = config.get_option("styler.latex.environment") == "longtable"
3567 if escape is None:
3568 escape = config.get_option("styler.format.escape") == "latex"
3569 if multicolumn is None:
3570 multicolumn = config.get_option("styler.sparse.columns")
3571 if multicolumn_format is None:
3572 multicolumn_format = config.get_option("styler.latex.multicol_align")
3573 if multirow is None:
3574 multirow = config.get_option("styler.sparse.index")
3575
3576 if column_format is not None and not isinstance(column_format, str):
            raise ValueError("`column_format` must be a string")
3578 length = len(self.columns) if columns is None else len(columns)
3579 if isinstance(header, (list, tuple)) and len(header) != length:
3580 raise ValueError(f"Writing {length} cols but got {len(header)} aliases")
3581
3582 # Refactor formatters/float_format/decimal/na_rep/escape to Styler structure
3583 base_format_ = {
3584 "na_rep": na_rep,
3585 "escape": "latex" if escape else None,
3586 "decimal": decimal,
3587 }
3588 index_format_: dict[str, Any] = {"axis": 0, **base_format_}
3589 column_format_: dict[str, Any] = {"axis": 1, **base_format_}
3590
3591 if isinstance(float_format, str):
3592 float_format_: Callable | None = lambda x: float_format % x
3593 else:
3594 float_format_ = float_format
3595
        def _wrap(x, alt_format_):
            # format floats/complex with ``float_format_`` when it is given,
            # deferring to the column's own formatter for everything else
            if isinstance(x, (float, complex)) and float_format_ is not None:
                return float_format_(x)
            else:
                return alt_format_(x)
3601
3602 formatters_: list | tuple | dict | Callable | None = None
3603 if isinstance(formatters, list):
3604 formatters_ = {
3605 c: partial(_wrap, alt_format_=formatters[i])
3606 for i, c in enumerate(self.columns)
3607 }
        elif isinstance(formatters, dict):
            # copy so that popping the special keys (and the float-format
            # updates below) does not mutate the caller's dict
            formatters = formatters.copy()
            index_formatter = formatters.pop("__index__", None)
            column_formatter = formatters.pop("__columns__", None)
            if index_formatter is not None:
                index_format_.update({"formatter": index_formatter})
            if column_formatter is not None:
                column_format_.update({"formatter": column_formatter})

            formatters_ = formatters
            float_columns = self.select_dtypes(include="float").columns
            for col in float_columns:
                if col not in formatters:
                    formatters_.update({col: float_format_})
3621 elif formatters is None and float_format is not None:
3622 formatters_ = partial(_wrap, alt_format_=lambda v: v)
3623 format_index_ = [index_format_, column_format_]
3624
3625 # Deal with hiding indexes and relabelling column names
3626 hide_: list[dict] = []
3627 relabel_index_: list[dict] = []
3628 if columns:
3629 hide_.append(
3630 {
3631 "subset": [c for c in self.columns if c not in columns],
3632 "axis": "columns",
3633 }
3634 )
3635 if header is False:
3636 hide_.append({"axis": "columns"})
3637 elif isinstance(header, (list, tuple)):
3638 relabel_index_.append({"labels": header, "axis": "columns"})
            format_index_ = [index_format_]  # column formatting is superseded
3640
3641 if index is False:
3642 hide_.append({"axis": "index"})
3643 if index_names is False:
3644 hide_.append({"names": True, "axis": "index"})
3645
3646 render_kwargs_ = {
3647 "hrules": True,
3648 "sparse_index": sparsify,
3649 "sparse_columns": sparsify,
3650 "environment": "longtable" if longtable else None,
3651 "multicol_align": multicolumn_format
3652 if multicolumn
3653 else f"naive-{multicolumn_format}",
3654 "multirow_align": "t" if multirow else "naive",
3655 "encoding": encoding,
3656 "caption": caption,
3657 "label": label,
3658 "position": position,
3659 "column_format": column_format,
3660 "clines": "skip-last;data"
3661 if (multirow and isinstance(self.index, MultiIndex))
3662 else None,
3663 "bold_rows": bold_rows,
3664 }
3665
3666 return self._to_latex_via_styler(
3667 buf,
3668 hide=hide_,
3669 relabel_index=relabel_index_,
3670 format={"formatter": formatters_, **base_format_},
3671 format_index=format_index_,
3672 render_kwargs=render_kwargs_,
3673 )
3674
3675 @final
3676 def _to_latex_via_styler(
3677 self,
3678 buf=None,
3679 *,
3680 hide: dict | list[dict] | None = None,
3681 relabel_index: dict | list[dict] | None = None,
3682 format: dict | list[dict] | None = None,
3683 format_index: dict | list[dict] | None = None,
3684 render_kwargs: dict | None = None,
3685 ):
3686 """
3687 Render object to a LaTeX tabular, longtable, or nested table.
3688
3689 Uses the ``Styler`` implementation with the following, ordered, method chaining:
3690
3691 .. code-block:: python
3692 styler = Styler(DataFrame)
3693 styler.hide(**hide)
3694 styler.relabel_index(**relabel_index)
3695 styler.format(**format)
3696 styler.format_index(**format_index)
3697 styler.to_latex(buf=buf, **render_kwargs)
3698
3699 Parameters
3700 ----------
3701 buf : str, Path or StringIO-like, optional, default None
3702 Buffer to write to. If None, the output is returned as a string.
3703 hide : dict, list of dict
3704 Keyword args to pass to the method call of ``Styler.hide``. If a list will
3705 call the method numerous times.
3706 relabel_index : dict, list of dict
3707 Keyword args to pass to the method of ``Styler.relabel_index``. If a list
3708 will call the method numerous times.
3709 format : dict, list of dict
3710 Keyword args to pass to the method call of ``Styler.format``. If a list will
3711 call the method numerous times.
3712 format_index : dict, list of dict
3713 Keyword args to pass to the method call of ``Styler.format_index``. If a
3714 list will call the method numerous times.
3715 render_kwargs : dict
3716 Keyword args to pass to the method call of ``Styler.to_latex``.
3717
3718 Returns
3719 -------
3720 str or None
3721 If buf is None, returns the result as a string. Otherwise returns None.
3722 """
3723 from pandas.io.formats.style import Styler
3724
3725 self = cast("DataFrame", self)
3726 styler = Styler(self, uuid="")
3727
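        # Apply each Styler method either once (a single dict of kwargs) or
        # several times (a list of dicts), preserving the documented order.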
3728 for kw_name in ["hide", "relabel_index", "format", "format_index"]:
3729 kw = vars()[kw_name]
3730 if isinstance(kw, dict):
3731 getattr(styler, kw_name)(**kw)
3732 elif isinstance(kw, list):
3733 for sub_kw in kw:
3734 getattr(styler, kw_name)(**sub_kw)
3735
3736 # bold_rows is not a direct kwarg of Styler.to_latex
3737 render_kwargs = {} if render_kwargs is None else render_kwargs
        if render_kwargs.pop("bold_rows", False):
3739 styler.map_index(lambda v: "textbf:--rwrap;")
3740
3741 return styler.to_latex(buf=buf, **render_kwargs)
3742
3743 @overload
3744 def to_csv(
3745 self,
3746 path_or_buf: None = ...,
3747 sep: str = ...,
3748 na_rep: str = ...,
3749 float_format: str | Callable | None = ...,
3750 columns: Sequence[Hashable] | None = ...,
3751 header: bool_t | list[str] = ...,
3752 index: bool_t = ...,
3753 index_label: IndexLabel | None = ...,
3754 mode: str = ...,
3755 encoding: str | None = ...,
3756 compression: CompressionOptions = ...,
3757 quoting: int | None = ...,
3758 quotechar: str = ...,
3759 lineterminator: str | None = ...,
3760 chunksize: int | None = ...,
3761 date_format: str | None = ...,
3762 doublequote: bool_t = ...,
3763 escapechar: str | None = ...,
3764 decimal: str = ...,
3765 errors: OpenFileErrors = ...,
3766 storage_options: StorageOptions = ...,
3767 ) -> str:
3768 ...
3769
3770 @overload
3771 def to_csv(
3772 self,
3773 path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str],
3774 sep: str = ...,
3775 na_rep: str = ...,
3776 float_format: str | Callable | None = ...,
3777 columns: Sequence[Hashable] | None = ...,
3778 header: bool_t | list[str] = ...,
3779 index: bool_t = ...,
3780 index_label: IndexLabel | None = ...,
3781 mode: str = ...,
3782 encoding: str | None = ...,
3783 compression: CompressionOptions = ...,
3784 quoting: int | None = ...,
3785 quotechar: str = ...,
3786 lineterminator: str | None = ...,
3787 chunksize: int | None = ...,
3788 date_format: str | None = ...,
3789 doublequote: bool_t = ...,
3790 escapechar: str | None = ...,
3791 decimal: str = ...,
3792 errors: OpenFileErrors = ...,
3793 storage_options: StorageOptions = ...,
3794 ) -> None:
3795 ...
3796
3797 @final
3798 @deprecate_nonkeyword_arguments(
3799 version="3.0", allowed_args=["self", "path_or_buf"], name="to_csv"
3800 )
3801 @doc(
3802 storage_options=_shared_docs["storage_options"],
3803 compression_options=_shared_docs["compression_options"] % "path_or_buf",
3804 )
3805 def to_csv(
3806 self,
3807 path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
3808 sep: str = ",",
3809 na_rep: str = "",
3810 float_format: str | Callable | None = None,
3811 columns: Sequence[Hashable] | None = None,
3812 header: bool_t | list[str] = True,
3813 index: bool_t = True,
3814 index_label: IndexLabel | None = None,
3815 mode: str = "w",
3816 encoding: str | None = None,
3817 compression: CompressionOptions = "infer",
3818 quoting: int | None = None,
3819 quotechar: str = '"',
3820 lineterminator: str | None = None,
3821 chunksize: int | None = None,
3822 date_format: str | None = None,
3823 doublequote: bool_t = True,
3824 escapechar: str | None = None,
3825 decimal: str = ".",
3826 errors: OpenFileErrors = "strict",
3827 storage_options: StorageOptions | None = None,
3828 ) -> str | None:
3829 r"""
3830 Write object to a comma-separated values (csv) file.
3831
3832 Parameters
3833 ----------
3834 path_or_buf : str, path object, file-like object, or None, default None
3835 String, path object (implementing os.PathLike[str]), or file-like
3836 object implementing a write() function. If None, the result is
3837 returned as a string. If a non-binary file object is passed, it should
3838 be opened with `newline=''`, disabling universal newlines. If a binary
3839 file object is passed, `mode` might need to contain a `'b'`.
3840 sep : str, default ','
3841 String of length 1. Field delimiter for the output file.
3842 na_rep : str, default ''
3843 Missing data representation.
3844 float_format : str, Callable, default None
3845 Format string for floating point numbers. If a Callable is given, it takes
3846 precedence over other numeric formatting parameters, like decimal.
3847 columns : sequence, optional
3848 Columns to write.
3849 header : bool or list of str, default True
3850 Write out the column names. If a list of strings is given it is
3851 assumed to be aliases for the column names.
3852 index : bool, default True
3853 Write row names (index).
3854 index_label : str or sequence, or False, default None
3855 Column label for index column(s) if desired. If None is given, and
3856 `header` and `index` are True, then the index names are used. A
3857 sequence should be given if the object uses MultiIndex. If
3858 False do not print fields for index names. Use index_label=False
3859 for easier importing in R.
3860 mode : {{'w', 'x', 'a'}}, default 'w'
3861 Forwarded to either `open(mode=)` or `fsspec.open(mode=)` to control
3862 the file opening. Typical values include:
3863
3864 - 'w', truncate the file first.
3865 - 'x', exclusive creation, failing if the file already exists.
3866 - 'a', append to the end of file if it exists.
3867
3868 encoding : str, optional
3869 A string representing the encoding to use in the output file,
3870 defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
3871 is a non-binary file object.
3872 {compression_options}
3873
3874 May be a dict with key 'method' as compression mode
3875 and other entries as additional compression options if
3876 compression mode is 'zip'.
3877
3878 Passing compression options as keys in dict is
3879 supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'.
3880 quoting : optional constant from csv module
3881 Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
3882 then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
3883 will treat them as non-numeric.
3884 quotechar : str, default '\"'
3885 String of length 1. Character used to quote fields.
3886 lineterminator : str, optional
3887 The newline character or character sequence to use in the output
3888 file. Defaults to `os.linesep`, which depends on the OS in which
            this method is called (e.g. '\\n' for Linux, '\\r\\n' for Windows).
3890
3891 .. versionchanged:: 1.5.0
3892
3893 Previously was line_terminator, changed for consistency with
3894 read_csv and the standard library 'csv' module.
3895
3896 chunksize : int or None
3897 Rows to write at a time.
3898 date_format : str, default None
3899 Format string for datetime objects.
3900 doublequote : bool, default True
3901 Control quoting of `quotechar` inside a field.
3902 escapechar : str, default None
3903 String of length 1. Character used to escape `sep` and `quotechar`
3904 when appropriate.
3905 decimal : str, default '.'
3906 Character recognized as decimal separator. E.g. use ',' for
3907 European data.
3908 errors : str, default 'strict'
3909 Specifies how encoding and decoding errors are to be handled.
3910 See the errors argument for :func:`open` for a full list
3911 of options.
3912
3913 {storage_options}
3914
3915 Returns
3916 -------
3917 None or str
3918 If path_or_buf is None, returns the resulting csv format as a
3919 string. Otherwise returns None.
3920
3921 See Also
3922 --------
3923 read_csv : Load a CSV file into a DataFrame.
3924 to_excel : Write DataFrame to an Excel file.
3925
3926 Examples
3927 --------
3928 Create 'out.csv' containing 'df' without indices
3929
3930 >>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'],
3931 ... 'mask': ['red', 'purple'],
3932 ... 'weapon': ['sai', 'bo staff']}})
3933 >>> df.to_csv('out.csv', index=False) # doctest: +SKIP
3934
        If `path_or_buf` is None, the CSV output is returned as a string

        >>> df.to_csv(index=False)
        'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'

        Create 'out.zip' containing 'out.csv'

3939 >>> compression_opts = dict(method='zip',
3940 ... archive_name='out.csv') # doctest: +SKIP
3941 >>> df.to_csv('out.zip', index=False,
3942 ... compression=compression_opts) # doctest: +SKIP
3943
3944 To write a csv file to a new folder or nested folder you will first
3945 need to create it using either Pathlib or os:
3946
3947 >>> from pathlib import Path # doctest: +SKIP
3948 >>> filepath = Path('folder/subfolder/out.csv') # doctest: +SKIP
3949 >>> filepath.parent.mkdir(parents=True, exist_ok=True) # doctest: +SKIP
3950 >>> df.to_csv(filepath) # doctest: +SKIP
3951
3952 >>> import os # doctest: +SKIP
3953 >>> os.makedirs('folder/subfolder', exist_ok=True) # doctest: +SKIP
3954 >>> df.to_csv('folder/subfolder/out.csv') # doctest: +SKIP
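
        Using a callable ``float_format`` (illustrative; a callable takes
        precedence over ``decimal``):

        >>> df_float = pd.DataFrame({{'price': [1.0, 2.345]}})
        >>> df_float.to_csv(float_format='{{:.2f}}'.format)  # doctest: +SKIP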
3955 """
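        # A Series is routed through a one-column DataFrame so that the CSV
        # formatting machinery only needs to handle the two-dimensional case.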
3956 df = self if isinstance(self, ABCDataFrame) else self.to_frame()
3957
3958 formatter = DataFrameFormatter(
3959 frame=df,
3960 header=header,
3961 index=index,
3962 na_rep=na_rep,
3963 float_format=float_format,
3964 decimal=decimal,
3965 )
3966
3967 return DataFrameRenderer(formatter).to_csv(
3968 path_or_buf,
3969 lineterminator=lineterminator,
3970 sep=sep,
3971 encoding=encoding,
3972 errors=errors,
3973 compression=compression,
3974 quoting=quoting,
3975 columns=columns,
3976 index_label=index_label,
3977 mode=mode,
3978 chunksize=chunksize,
3979 quotechar=quotechar,
3980 date_format=date_format,
3981 doublequote=doublequote,
3982 escapechar=escapechar,
3983 storage_options=storage_options,
3984 )
3985
3986 # ----------------------------------------------------------------------
3987 # Lookup Caching
3988
3989 def _reset_cacher(self) -> None:
3990 """
3991 Reset the cacher.
3992 """
3993 raise AbstractMethodError(self)
3994
3995 def _maybe_update_cacher(
3996 self,
3997 clear: bool_t = False,
3998 verify_is_copy: bool_t = True,
3999 inplace: bool_t = False,
4000 ) -> None:
        """
        See if we need to update our parent cacher; if ``clear``, also clear
        our item cache.
4004
4005 Parameters
4006 ----------
4007 clear : bool, default False
4008 Clear the item cache.
4009 verify_is_copy : bool, default True
4010 Provide is_copy checks.
4011 """
4012 if using_copy_on_write():
4013 return
4014
4015 if verify_is_copy:
4016 self._check_setitem_copy(t="referent")
4017
4018 if clear:
4019 self._clear_item_cache()
4020
4021 def _clear_item_cache(self) -> None:
4022 raise AbstractMethodError(self)
4023
4024 # ----------------------------------------------------------------------
4025 # Indexing Methods
4026
4027 @final
4028 def take(self, indices, axis: Axis = 0, **kwargs) -> Self:
4029 """
4030 Return the elements in the given *positional* indices along an axis.
4031
4032 This means that we are not indexing according to actual values in
4033 the index attribute of the object. We are indexing according to the
4034 actual position of the element in the object.
4035
4036 Parameters
4037 ----------
4038 indices : array-like
4039 An array of ints indicating which positions to take.
4040 axis : {0 or 'index', 1 or 'columns', None}, default 0
4041 The axis on which to select elements. ``0`` means that we are
4042 selecting rows, ``1`` means that we are selecting columns.
4043 For `Series` this parameter is unused and defaults to 0.
4044 **kwargs
4045 For compatibility with :meth:`numpy.take`. Has no effect on the
4046 output.
4047
4048 Returns
4049 -------
4050 same type as caller
4051 An array-like containing the elements taken from the object.
4052
4053 See Also
4054 --------
4055 DataFrame.loc : Select a subset of a DataFrame by labels.
4056 DataFrame.iloc : Select a subset of a DataFrame by positions.
4057 numpy.take : Take elements from an array along an axis.
4058
4059 Examples
4060 --------
4061 >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
4062 ... ('parrot', 'bird', 24.0),
4063 ... ('lion', 'mammal', 80.5),
4064 ... ('monkey', 'mammal', np.nan)],
4065 ... columns=['name', 'class', 'max_speed'],
4066 ... index=[0, 2, 3, 1])
4067 >>> df
4068 name class max_speed
4069 0 falcon bird 389.0
4070 2 parrot bird 24.0
4071 3 lion mammal 80.5
4072 1 monkey mammal NaN
4073
4074 Take elements at positions 0 and 3 along the axis 0 (default).
4075
4076 Note how the actual indices selected (0 and 1) do not correspond to
4077 our selected indices 0 and 3. That's because we are selecting the 0th
4078 and 3rd rows, not rows whose indices equal 0 and 3.
4079
4080 >>> df.take([0, 3])
4081 name class max_speed
4082 0 falcon bird 389.0
4083 1 monkey mammal NaN
4084
4085 Take elements at indices 1 and 2 along the axis 1 (column selection).
4086
4087 >>> df.take([1, 2], axis=1)
4088 class max_speed
4089 0 bird 389.0
4090 2 bird 24.0
4091 3 mammal 80.5
4092 1 mammal NaN
4093
        We may take elements using negative integers, which count from the end
        of the object, just like with Python lists.
4096
4097 >>> df.take([-1, -2])
4098 name class max_speed
4099 1 monkey mammal NaN
4100 3 lion mammal 80.5
4101 """
4102
4103 nv.validate_take((), kwargs)
4104
4105 if not isinstance(indices, slice):
4106 indices = np.asarray(indices, dtype=np.intp)
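            # Under Copy-on-Write, taking rows with a no-op indexer
            # (0, 1, ..., len(self) - 1) can return a lazy copy instead of
            # materializing a new take.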
4107 if (
4108 axis == 0
4109 and indices.ndim == 1
4110 and using_copy_on_write()
4111 and is_range_indexer(indices, len(self))
4112 ):
4113 return self.copy(deep=None)
4114 elif self.ndim == 1:
4115 raise TypeError(
4116 f"{type(self).__name__}.take requires a sequence of integers, "
4117 "not slice."
4118 )
4119 else:
4120 warnings.warn(
4121 # GH#51539
4122 f"Passing a slice to {type(self).__name__}.take is deprecated "
4123 "and will raise in a future version. Use `obj[slicer]` or pass "
4124 "a sequence of integers instead.",
4125 FutureWarning,
4126 stacklevel=find_stack_level(),
4127 )
4128 # We can get here with a slice via DataFrame.__getitem__
4129 indices = np.arange(
4130 indices.start, indices.stop, indices.step, dtype=np.intp
4131 )
4132
4133 new_data = self._mgr.take(
4134 indices,
4135 axis=self._get_block_manager_axis(axis),
4136 verify=True,
4137 )
4138 return self._constructor_from_mgr(new_data, axes=new_data.axes).__finalize__(
4139 self, method="take"
4140 )
4141
4142 @final
4143 def _take_with_is_copy(self, indices, axis: Axis = 0) -> Self:
4144 """
4145 Internal version of the `take` method that sets the `_is_copy`
        attribute to keep track of the parent dataframe (used in indexing
        for the SettingWithCopyWarning).
4148
4149 For Series this does the same as the public take (it never sets `_is_copy`).
4150
4151 See the docstring of `take` for full explanation of the parameters.
4152 """
4153 result = self.take(indices=indices, axis=axis)
4154 # Maybe set copy if we didn't actually change the index.
4155 if self.ndim == 2 and not result._get_axis(axis).equals(self._get_axis(axis)):
4156 result._set_is_copy(self)
4157 return result
4158
4159 @final
4160 def xs(
4161 self,
4162 key: IndexLabel,
4163 axis: Axis = 0,
4164 level: IndexLabel | None = None,
4165 drop_level: bool_t = True,
4166 ) -> Self:
4167 """
4168 Return cross-section from the Series/DataFrame.
4169
4170 This method takes a `key` argument to select data at a particular
4171 level of a MultiIndex.
4172
4173 Parameters
4174 ----------
4175 key : label or tuple of label
4176 Label contained in the index, or partially in a MultiIndex.
4177 axis : {0 or 'index', 1 or 'columns'}, default 0
4178 Axis to retrieve cross-section on.
4179 level : object, defaults to first n levels (n=1 or len(key))
4180 In case of a key partially contained in a MultiIndex, indicate
4181 which levels are used. Levels can be referred by label or position.
4182 drop_level : bool, default True
4183 If False, returns object with same levels as self.
4184
4185 Returns
4186 -------
4187 Series or DataFrame
4188 Cross-section from the original Series or DataFrame
4189 corresponding to the selected index levels.
4190
4191 See Also
4192 --------
4193 DataFrame.loc : Access a group of rows and columns
4194 by label(s) or a boolean array.
4195 DataFrame.iloc : Purely integer-location based indexing
4196 for selection by position.
4197
4198 Notes
4199 -----
        `xs` cannot be used to set values.

        MultiIndex slicers are a generic way to get/set values on
        any level or levels.
        They are a superset of `xs` functionality, see
        :ref:`MultiIndex Slicers <advanced.mi_slicers>`.
4206
4207 Examples
4208 --------
4209 >>> d = {'num_legs': [4, 4, 2, 2],
4210 ... 'num_wings': [0, 0, 2, 2],
4211 ... 'class': ['mammal', 'mammal', 'mammal', 'bird'],
4212 ... 'animal': ['cat', 'dog', 'bat', 'penguin'],
4213 ... 'locomotion': ['walks', 'walks', 'flies', 'walks']}
4214 >>> df = pd.DataFrame(data=d)
4215 >>> df = df.set_index(['class', 'animal', 'locomotion'])
4216 >>> df
4217 num_legs num_wings
4218 class animal locomotion
4219 mammal cat walks 4 0
4220 dog walks 4 0
4221 bat flies 2 2
4222 bird penguin walks 2 2
4223
4224 Get values at specified index
4225
4226 >>> df.xs('mammal')
4227 num_legs num_wings
4228 animal locomotion
4229 cat walks 4 0
4230 dog walks 4 0
4231 bat flies 2 2
4232
4233 Get values at several indexes
4234
4235 >>> df.xs(('mammal', 'dog', 'walks'))
4236 num_legs 4
4237 num_wings 0
4238 Name: (mammal, dog, walks), dtype: int64
4239
4240 Get values at specified index and level
4241
4242 >>> df.xs('cat', level=1)
4243 num_legs num_wings
4244 class locomotion
4245 mammal walks 4 0
4246
4247 Get values at several indexes and levels
4248
4249 >>> df.xs(('bird', 'walks'),
4250 ... level=[0, 'locomotion'])
4251 num_legs num_wings
4252 animal
4253 penguin 2 2
4254
4255 Get values at specified column and axis
4256
4257 >>> df.xs('num_wings', axis=1)
4258 class animal locomotion
4259 mammal cat walks 0
4260 dog walks 0
4261 bat flies 2
4262 bird penguin walks 2
4263 Name: num_wings, dtype: int64
4264 """
4265 axis = self._get_axis_number(axis)
4266 labels = self._get_axis(axis)
4267
4268 if isinstance(key, list):
4269 raise TypeError("list keys are not supported in xs, pass a tuple instead")
4270
4271 if level is not None:
4272 if not isinstance(labels, MultiIndex):
4273 raise TypeError("Index must be a MultiIndex")
4274 loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level)
4275
4276 # create the tuple of the indexer
4277 _indexer = [slice(None)] * self.ndim
4278 _indexer[axis] = loc
4279 indexer = tuple(_indexer)
4280
4281 result = self.iloc[indexer]
4282 setattr(result, result._get_axis_name(axis), new_ax)
4283 return result
4284
4285 if axis == 1:
4286 if drop_level:
4287 return self[key]
4288 index = self.columns
4289 else:
4290 index = self.index
4291
4292 if isinstance(index, MultiIndex):
4293 loc, new_index = index._get_loc_level(key, level=0)
4294 if not drop_level:
4295 if lib.is_integer(loc):
4296 # Slice index must be an integer or None
4297 new_index = index[loc : loc + 1]
4298 else:
4299 new_index = index[loc]
4300 else:
4301 loc = index.get_loc(key)
4302
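            # Index.get_loc may return an integer (unique label), a slice
            # (monotonic duplicates), or an ndarray mask (non-monotonic
            # duplicates); the ndarray cases go through take below.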
4303 if isinstance(loc, np.ndarray):
4304 if loc.dtype == np.bool_:
4305 (inds,) = loc.nonzero()
4306 return self._take_with_is_copy(inds, axis=axis)
4307 else:
4308 return self._take_with_is_copy(loc, axis=axis)
4309
4310 if not is_scalar(loc):
4311 new_index = index[loc]
4312
4313 if is_scalar(loc) and axis == 0:
4314 # In this case loc should be an integer
4315 if self.ndim == 1:
4316 # if we encounter an array-like and we only have 1 dim
                # that means that there are list/ndarrays inside the Series!
4318 # so just return them (GH 6394)
4319 return self._values[loc]
4320
4321 new_mgr = self._mgr.fast_xs(loc)
4322
4323 result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes)
4324 result._name = self.index[loc]
4325 result = result.__finalize__(self)
4326 elif is_scalar(loc):
4327 result = self.iloc[:, slice(loc, loc + 1)]
4328 elif axis == 1:
4329 result = self.iloc[:, loc]
4330 else:
4331 result = self.iloc[loc]
4332 result.index = new_index
4333
4334 # this could be a view
4335 # but only in a single-dtyped view sliceable case
4336 result._set_is_copy(self, copy=not result._is_view)
4337 return result
4338
4339 def __getitem__(self, item):
4340 raise AbstractMethodError(self)
4341
4342 @final
4343 def _getitem_slice(self, key: slice) -> Self:
4344 """
4345 __getitem__ for the case where the key is a slice object.
4346 """
4347 # _convert_slice_indexer to determine if this slice is positional
4348 # or label based, and if the latter, convert to positional
4349 slobj = self.index._convert_slice_indexer(key, kind="getitem")
4350 if isinstance(slobj, np.ndarray):
4351 # reachable with DatetimeIndex
4352 indexer = lib.maybe_indices_to_slice(
4353 slobj.astype(np.intp, copy=False), len(self)
4354 )
4355 if isinstance(indexer, np.ndarray):
4356 # GH#43223 If we can not convert, use take
4357 return self.take(indexer, axis=0)
4358 slobj = indexer
4359 return self._slice(slobj)
4360
4361 def _slice(self, slobj: slice, axis: AxisInt = 0) -> Self:
4362 """
4363 Construct a slice of this container.
4364
4365 Slicing with this method is *always* positional.
4366 """
4367 assert isinstance(slobj, slice), type(slobj)
4368 axis = self._get_block_manager_axis(axis)
4369 new_mgr = self._mgr.get_slice(slobj, axis=axis)
4370 result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
4371 result = result.__finalize__(self)
4372
4373 # this could be a view
4374 # but only in a single-dtyped view sliceable case
4375 is_copy = axis != 0 or result._is_view
4376 result._set_is_copy(self, copy=is_copy)
4377 return result
4378
4379 @final
4380 def _set_is_copy(self, ref: NDFrame, copy: bool_t = True) -> None:
4381 if not copy:
4382 self._is_copy = None
4383 else:
4384 assert ref is not None
4385 self._is_copy = weakref.ref(ref)
4386
4387 def _check_is_chained_assignment_possible(self) -> bool_t:
4388 """
4389 Check if we are a view, have a cacher, and are of mixed type.
4390 If so, then force a setitem_copy check.
4391
        Should be called just prior to setting a value.

        Will return True if we are a view that is cached but single-dtyped,
        meaning that the cacher should be updated following the setting.
4397 """
4398 if self._is_copy:
4399 self._check_setitem_copy(t="referent")
4400 return False
4401
4402 @final
4403 def _check_setitem_copy(self, t: str = "setting", force: bool_t = False):
        """
        Validate if we are doing a setitem on a chained copy.

        Parameters
        ----------
        t : str, the type of setting error
        force : bool, default False
            If True, then force showing an error.

        Notes
        -----
        It is technically possible to figure out that we are setting on
        a copy even WITH a multi-dtyped pandas object. In other words, some
        blocks may be views while others are not. Currently _is_view will ALWAYS
        return False for multi-blocks to avoid having to handle this case.

        df = DataFrame(np.arange(0, 9), columns=['count'])
        df['group'] = 'b'

        # This technically need not raise SettingWithCopy if both are views
        # (which is not generally guaranteed, but is usually True). However,
        # this is in general not a good practice and we recommend using .loc.
        df.iloc[0:5]['group'] = 'a'
        """
4428 if using_copy_on_write() or warn_copy_on_write():
4429 return
4430
4431 # return early if the check is not needed
4432 if not (force or self._is_copy):
4433 return
4434
4435 value = config.get_option("mode.chained_assignment")
4436 if value is None:
4437 return
4438
        # see if the copy is not actually referenced any more; if so, then
        # dissolve the copy weakref
4441 if self._is_copy is not None and not isinstance(self._is_copy, str):
4442 r = self._is_copy()
4443 if not gc.get_referents(r) or (r is not None and r.shape == self.shape):
4444 self._is_copy = None
4445 return
4446
4447 # a custom message
4448 if isinstance(self._is_copy, str):
4449 t = self._is_copy
4450
4451 elif t == "referent":
4452 t = (
4453 "\n"
4454 "A value is trying to be set on a copy of a slice from a "
4455 "DataFrame\n\n"
4456 "See the caveats in the documentation: "
4457 "https://pandas.pydata.org/pandas-docs/stable/user_guide/"
4458 "indexing.html#returning-a-view-versus-a-copy"
4459 )
4460
4461 else:
4462 t = (
4463 "\n"
4464 "A value is trying to be set on a copy of a slice from a "
4465 "DataFrame.\n"
4466 "Try using .loc[row_indexer,col_indexer] = value "
4467 "instead\n\nSee the caveats in the documentation: "
4468 "https://pandas.pydata.org/pandas-docs/stable/user_guide/"
4469 "indexing.html#returning-a-view-versus-a-copy"
4470 )
4471
4472 if value == "raise":
4473 raise SettingWithCopyError(t)
4474 if value == "warn":
4475 warnings.warn(t, SettingWithCopyWarning, stacklevel=find_stack_level())
4476
4477 @final
4478 def __delitem__(self, key) -> None:
4479 """
4480 Delete item
4481 """
4482 deleted = False
4483
4484 maybe_shortcut = False
4485 if self.ndim == 2 and isinstance(self.columns, MultiIndex):
4486 try:
4487 # By using engine's __contains__ we effectively
4488 # restrict to same-length tuples
4489 maybe_shortcut = key not in self.columns._engine
4490 except TypeError:
4491 pass
4492
4493 if maybe_shortcut:
4494 # Allow shorthand to delete all columns whose first len(key)
4495 # elements match key:
4496 if not isinstance(key, tuple):
4497 key = (key,)
4498 for col in self.columns:
4499 if isinstance(col, tuple) and col[: len(key)] == key:
4500 del self[col]
4501 deleted = True
4502 if not deleted:
4503 # If the above loop ran and didn't delete anything because
4504 # there was no match, this call should raise the appropriate
4505 # exception:
4506 loc = self.axes[-1].get_loc(key)
4507 self._mgr = self._mgr.idelete(loc)
4508
4509 # delete from the caches
4510 try:
4511 del self._item_cache[key]
4512 except KeyError:
4513 pass
4514
4515 # ----------------------------------------------------------------------
4516 # Unsorted
4517
4518 @final
4519 def _check_inplace_and_allows_duplicate_labels(self, inplace: bool_t):
4520 if inplace and not self.flags.allows_duplicate_labels:
4521 raise ValueError(
4522 "Cannot specify 'inplace=True' when "
4523 "'self.flags.allows_duplicate_labels' is False."
4524 )
4525
4526 @final
4527 def get(self, key, default=None):
4528 """
4529 Get item from object for given key (ex: DataFrame column).
4530
4531 Returns default value if not found.
4532
4533 Parameters
4534 ----------
        key : object
            Key (e.g. a column label) for which to retrieve the item.
        default : object, default None
            Value to return if the key is not found.

4537 Returns
4538 -------
4539 same type as items contained in object
4540
4541 Examples
4542 --------
4543 >>> df = pd.DataFrame(
4544 ... [
4545 ... [24.3, 75.7, "high"],
4546 ... [31, 87.8, "high"],
4547 ... [22, 71.6, "medium"],
4548 ... [35, 95, "medium"],
4549 ... ],
4550 ... columns=["temp_celsius", "temp_fahrenheit", "windspeed"],
4551 ... index=pd.date_range(start="2014-02-12", end="2014-02-15", freq="D"),
4552 ... )
4553
4554 >>> df
4555 temp_celsius temp_fahrenheit windspeed
4556 2014-02-12 24.3 75.7 high
4557 2014-02-13 31.0 87.8 high
4558 2014-02-14 22.0 71.6 medium
4559 2014-02-15 35.0 95.0 medium
4560
4561 >>> df.get(["temp_celsius", "windspeed"])
4562 temp_celsius windspeed
4563 2014-02-12 24.3 high
4564 2014-02-13 31.0 high
4565 2014-02-14 22.0 medium
4566 2014-02-15 35.0 medium
4567
4568 >>> ser = df['windspeed']
4569 >>> ser.get('2014-02-13')
4570 'high'
4571
4572 If the key isn't found, the default value will be used.
4573
4574 >>> df.get(["temp_celsius", "temp_kelvin"], default="default_value")
4575 'default_value'
4576
4577 >>> ser.get('2014-02-10', '[unknown]')
4578 '[unknown]'
4579 """
4580 try:
4581 return self[key]
4582 except (KeyError, ValueError, IndexError):
4583 return default
4584
4585 @final
4586 @property
4587 def _is_view(self) -> bool_t:
4588 """Return boolean indicating if self is view of another array"""
4589 return self._mgr.is_view
4590
4591 @final
4592 def reindex_like(
4593 self,
4594 other,
4595 method: Literal["backfill", "bfill", "pad", "ffill", "nearest"] | None = None,
4596 copy: bool_t | None = None,
4597 limit: int | None = None,
4598 tolerance=None,
4599 ) -> Self:
4600 """
        Return an object with indices matching those of another object.
4602
4603 Conform the object to the same index on all axes. Optional
4604 filling logic, placing NaN in locations having no value
4605 in the previous index. A new object is produced unless the
4606 new index is equivalent to the current one and copy=False.
4607
4608 Parameters
4609 ----------
4610 other : Object of the same data type
4611 Its row and column indices are used to define the new indices
4612 of this object.
4613 method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}
4614 Method to use for filling holes in reindexed DataFrame.
4615 Please note: this is only applicable to DataFrames/Series with a
4616 monotonically increasing/decreasing index.
4617
4618 * None (default): don't fill gaps
4619 * pad / ffill: propagate last valid observation forward to next
4620 valid
4621 * backfill / bfill: use next valid observation to fill gap
4622 * nearest: use nearest valid observations to fill gap.
4623
4624 copy : bool, default True
4625 Return a new object, even if the passed indexes are the same.
4626
4627 .. note::
4628 The `copy` keyword will change behavior in pandas 3.0.
4629 `Copy-on-Write
4630 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
4631 will be enabled by default, which means that all methods with a
4632 `copy` keyword will use a lazy copy mechanism to defer the copy and
4633 ignore the `copy` keyword. The `copy` keyword will be removed in a
4634 future version of pandas.
4635
                You can already get the future behavior and improvements by
                enabling copy-on-write: ``pd.options.mode.copy_on_write = True``.
4638 limit : int, default None
4639 Maximum number of consecutive labels to fill for inexact matches.
4640 tolerance : optional
4641 Maximum distance between original and new labels for inexact
4642 matches. The values of the index at the matching locations must
4643 satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
4644
4645 Tolerance may be a scalar value, which applies the same tolerance
4646 to all values, or list-like, which applies variable tolerance per
4647 element. List-like includes list, tuple, array, Series, and must be
4648 the same size as the index and its dtype must exactly match the
4649 index's type.
4650
4651 Returns
4652 -------
4653 Series or DataFrame
4654 Same type as caller, but with changed indices on each axis.
4655
4656 See Also
4657 --------
4658 DataFrame.set_index : Set row labels.
4659 DataFrame.reset_index : Remove row labels or move them to new columns.
4660 DataFrame.reindex : Change to new indices or expand indices.
4661
4662 Notes
4663 -----
4664 Same as calling
4665 ``.reindex(index=other.index, columns=other.columns,...)``.
4666
4667 Examples
4668 --------
4669 >>> df1 = pd.DataFrame([[24.3, 75.7, 'high'],
4670 ... [31, 87.8, 'high'],
4671 ... [22, 71.6, 'medium'],
4672 ... [35, 95, 'medium']],
4673 ... columns=['temp_celsius', 'temp_fahrenheit',
4674 ... 'windspeed'],
4675 ... index=pd.date_range(start='2014-02-12',
4676 ... end='2014-02-15', freq='D'))
4677
4678 >>> df1
4679 temp_celsius temp_fahrenheit windspeed
4680 2014-02-12 24.3 75.7 high
4681 2014-02-13 31.0 87.8 high
4682 2014-02-14 22.0 71.6 medium
4683 2014-02-15 35.0 95.0 medium
4684
4685 >>> df2 = pd.DataFrame([[28, 'low'],
4686 ... [30, 'low'],
4687 ... [35.1, 'medium']],
4688 ... columns=['temp_celsius', 'windspeed'],
4689 ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13',
4690 ... '2014-02-15']))
4691
4692 >>> df2
4693 temp_celsius windspeed
4694 2014-02-12 28.0 low
4695 2014-02-13 30.0 low
4696 2014-02-15 35.1 medium
4697
4698 >>> df2.reindex_like(df1)
4699 temp_celsius temp_fahrenheit windspeed
4700 2014-02-12 28.0 NaN low
4701 2014-02-13 30.0 NaN low
4702 2014-02-14 NaN NaN NaN
4703 2014-02-15 35.1 NaN medium
4704 """
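        # Build the reindex keyword arguments from ``other``'s axes (the
        # index, plus columns for 2-D objects) together with the fill options,
        # then delegate to ``reindex``.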
4705 d = other._construct_axes_dict(
4706 axes=self._AXIS_ORDERS,
4707 method=method,
4708 copy=copy,
4709 limit=limit,
4710 tolerance=tolerance,
4711 )
4712
4713 return self.reindex(**d)
4714
4715 @overload
4716 def drop(
4717 self,
4718 labels: IndexLabel = ...,
4719 *,
4720 axis: Axis = ...,
4721 index: IndexLabel = ...,
4722 columns: IndexLabel = ...,
4723 level: Level | None = ...,
4724 inplace: Literal[True],
4725 errors: IgnoreRaise = ...,
4726 ) -> None:
4727 ...
4728
4729 @overload
4730 def drop(
4731 self,
4732 labels: IndexLabel = ...,
4733 *,
4734 axis: Axis = ...,
4735 index: IndexLabel = ...,
4736 columns: IndexLabel = ...,
4737 level: Level | None = ...,
4738 inplace: Literal[False] = ...,
4739 errors: IgnoreRaise = ...,
4740 ) -> Self:
4741 ...
4742
4743 @overload
4744 def drop(
4745 self,
4746 labels: IndexLabel = ...,
4747 *,
4748 axis: Axis = ...,
4749 index: IndexLabel = ...,
4750 columns: IndexLabel = ...,
4751 level: Level | None = ...,
4752 inplace: bool_t = ...,
4753 errors: IgnoreRaise = ...,
4754 ) -> Self | None:
4755 ...
4756
4757 def drop(
4758 self,
4759 labels: IndexLabel | None = None,
4760 *,
4761 axis: Axis = 0,
4762 index: IndexLabel | None = None,
4763 columns: IndexLabel | None = None,
4764 level: Level | None = None,
4765 inplace: bool_t = False,
4766 errors: IgnoreRaise = "raise",
4767 ) -> Self | None:
4768 inplace = validate_bool_kwarg(inplace, "inplace")
4769
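        # Resolve the two calling conventions: positional ``labels`` combined
        # with ``axis``, or the explicit ``index``/``columns`` keywords.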
4770 if labels is not None:
4771 if index is not None or columns is not None:
4772 raise ValueError("Cannot specify both 'labels' and 'index'/'columns'")
4773 axis_name = self._get_axis_name(axis)
4774 axes = {axis_name: labels}
4775 elif index is not None or columns is not None:
4776 axes = {"index": index}
4777 if self.ndim == 2:
4778 axes["columns"] = columns
4779 else:
4780 raise ValueError(
4781 "Need to specify at least one of 'labels', 'index' or 'columns'"
4782 )
4783
4784 obj = self
4785
4786 for axis, labels in axes.items():
4787 if labels is not None:
4788 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
4789
4790 if inplace:
4791 self._update_inplace(obj)
4792 return None
4793 else:
4794 return obj
4795
4796 @final
4797 def _drop_axis(
4798 self,
4799 labels,
4800 axis,
4801 level=None,
4802 errors: IgnoreRaise = "raise",
4803 only_slice: bool_t = False,
4804 ) -> Self:
4805 """
4806 Drop labels from specified axis. Used in the ``drop`` method
4807 internally.
4808
4809 Parameters
4810 ----------
4811 labels : single label or list-like
4812 axis : int or axis name
4813 level : int or level name, default None
4814 For MultiIndex
4815 errors : {'ignore', 'raise'}, default 'raise'
            If 'ignore', suppress the error; only existing labels are dropped.
4817 only_slice : bool, default False
4818 Whether indexing along columns should be view-only.
4819
4820 """
4821 axis_num = self._get_axis_number(axis)
4822 axis = self._get_axis(axis)
4823
4824 if axis.is_unique:
4825 if level is not None:
4826 if not isinstance(axis, MultiIndex):
4827 raise AssertionError("axis must be a MultiIndex")
4828 new_axis = axis.drop(labels, level=level, errors=errors)
4829 else:
4830 new_axis = axis.drop(labels, errors=errors)
4831 indexer = axis.get_indexer(new_axis)
4832
4833 # Case for non-unique axis
4834 else:
4835 is_tuple_labels = is_nested_list_like(labels) or isinstance(labels, tuple)
4836 labels = ensure_object(common.index_labels_to_array(labels))
4837 if level is not None:
4838 if not isinstance(axis, MultiIndex):
4839 raise AssertionError("axis must be a MultiIndex")
4840 mask = ~axis.get_level_values(level).isin(labels)
4841
4842 # GH 18561 MultiIndex.drop should raise if label is absent
4843 if errors == "raise" and mask.all():
4844 raise KeyError(f"{labels} not found in axis")
4845 elif (
4846 isinstance(axis, MultiIndex)
4847 and labels.dtype == "object"
4848 and not is_tuple_labels
4849 ):
4850 # Set level to zero in case of MultiIndex and label is string,
4851 # because isin can't handle strings for MultiIndexes GH#36293
4852 # In case of tuples we get dtype object but have to use isin GH#42771
4853 mask = ~axis.get_level_values(0).isin(labels)
4854 else:
4855 mask = ~axis.isin(labels)
4856 # Check if label doesn't exist along axis
4857 labels_missing = (axis.get_indexer_for(labels) == -1).any()
4858 if errors == "raise" and labels_missing:
4859 raise KeyError(f"{labels} not found in axis")
4860
4861 if isinstance(mask.dtype, ExtensionDtype):
4862 # GH#45860
4863 mask = mask.to_numpy(dtype=bool)
4864
4865 indexer = mask.nonzero()[0]
4866 new_axis = axis.take(indexer)
4867
4868 bm_axis = self.ndim - axis_num - 1
4869 new_mgr = self._mgr.reindex_indexer(
4870 new_axis,
4871 indexer,
4872 axis=bm_axis,
4873 allow_dups=True,
4874 copy=None,
4875 only_slice=only_slice,
4876 )
4877 result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
4878 if self.ndim == 1:
4879 result._name = self.name
4880
4881 return result.__finalize__(self)
4882
4883 @final
4884 def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None:
4885 """
4886 Replace self internals with result.
4887
4888 Parameters
4889 ----------
4890 result : same type as self
4891 verify_is_copy : bool, default True
4892 Provide is_copy checks.
4893 """
4894 # NOTE: This does *not* call __finalize__ and that's an explicit
4895 # decision that we may revisit in the future.
4896 self._reset_cache()
4897 self._clear_item_cache()
4898 self._mgr = result._mgr
4899 self._maybe_update_cacher(verify_is_copy=verify_is_copy, inplace=True)
4900
4901 @final
4902 def add_prefix(self, prefix: str, axis: Axis | None = None) -> Self:
4903 """
4904 Prefix labels with string `prefix`.
4905
4906 For Series, the row labels are prefixed.
4907 For DataFrame, the column labels are prefixed.
4908
4909 Parameters
4910 ----------
4911 prefix : str
4912 The string to add before each label.
4913 axis : {0 or 'index', 1 or 'columns', None}, default None
            Axis to add prefix on.
4915
4916 .. versionadded:: 2.0.0
4917
4918 Returns
4919 -------
4920 Series or DataFrame
4921 New Series or DataFrame with updated labels.
4922
4923 See Also
4924 --------
4925 Series.add_suffix: Suffix row labels with string `suffix`.
4926 DataFrame.add_suffix: Suffix column labels with string `suffix`.
4927
4928 Examples
4929 --------
4930 >>> s = pd.Series([1, 2, 3, 4])
4931 >>> s
4932 0 1
4933 1 2
4934 2 3
4935 3 4
4936 dtype: int64
4937
4938 >>> s.add_prefix('item_')
4939 item_0 1
4940 item_1 2
4941 item_2 3
4942 item_3 4
4943 dtype: int64
4944
4945 >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
4946 >>> df
4947 A B
4948 0 1 3
4949 1 2 4
4950 2 3 5
4951 3 4 6
4952
4953 >>> df.add_prefix('col_')
4954 col_A col_B
4955 0 1 3
4956 1 2 4
4957 2 3 5
4958 3 4 6
4959 """
4960 f = lambda x: f"{prefix}{x}"
4961
4962 axis_name = self._info_axis_name
4963 if axis is not None:
4964 axis_name = self._get_axis_name(axis)
4965
4966 mapper = {axis_name: f}
4967
4968 # error: Incompatible return value type (got "Optional[Self]",
4969 # expected "Self")
4970 # error: Argument 1 to "rename" of "NDFrame" has incompatible type
4971 # "**Dict[str, partial[str]]"; expected "Union[str, int, None]"
4972 # error: Keywords must be strings
4973 return self._rename(**mapper) # type: ignore[return-value, arg-type, misc]
4974
4975 @final
4976 def add_suffix(self, suffix: str, axis: Axis | None = None) -> Self:
4977 """
4978 Suffix labels with string `suffix`.
4979
4980 For Series, the row labels are suffixed.
4981 For DataFrame, the column labels are suffixed.
4982
4983 Parameters
4984 ----------
4985 suffix : str
4986 The string to add after each label.
4987 axis : {0 or 'index', 1 or 'columns', None}, default None
            Axis to add suffix on.
4989
4990 .. versionadded:: 2.0.0
4991
4992 Returns
4993 -------
4994 Series or DataFrame
4995 New Series or DataFrame with updated labels.
4996
4997 See Also
4998 --------
4999 Series.add_prefix: Prefix row labels with string `prefix`.
5000 DataFrame.add_prefix: Prefix column labels with string `prefix`.
5001
5002 Examples
5003 --------
5004 >>> s = pd.Series([1, 2, 3, 4])
5005 >>> s
5006 0 1
5007 1 2
5008 2 3
5009 3 4
5010 dtype: int64
5011
5012 >>> s.add_suffix('_item')
5013 0_item 1
5014 1_item 2
5015 2_item 3
5016 3_item 4
5017 dtype: int64
5018
5019 >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
5020 >>> df
5021 A B
5022 0 1 3
5023 1 2 4
5024 2 3 5
5025 3 4 6
5026
5027 >>> df.add_suffix('_col')
5028 A_col B_col
5029 0 1 3
5030 1 2 4
5031 2 3 5
5032 3 4 6
5033 """
5034 f = lambda x: f"{x}{suffix}"
5035
5036 axis_name = self._info_axis_name
5037 if axis is not None:
5038 axis_name = self._get_axis_name(axis)
5039
5040 mapper = {axis_name: f}
5041 # error: Incompatible return value type (got "Optional[Self]",
5042 # expected "Self")
5043 # error: Argument 1 to "rename" of "NDFrame" has incompatible type
5044 # "**Dict[str, partial[str]]"; expected "Union[str, int, None]"
5045 # error: Keywords must be strings
5046 return self._rename(**mapper) # type: ignore[return-value, arg-type, misc]
5047
5048 @overload
5049 def sort_values(
5050 self,
5051 *,
5052 axis: Axis = ...,
5053 ascending: bool_t | Sequence[bool_t] = ...,
5054 inplace: Literal[False] = ...,
5055 kind: SortKind = ...,
5056 na_position: NaPosition = ...,
5057 ignore_index: bool_t = ...,
5058 key: ValueKeyFunc = ...,
5059 ) -> Self:
5060 ...
5061
5062 @overload
5063 def sort_values(
5064 self,
5065 *,
5066 axis: Axis = ...,
5067 ascending: bool_t | Sequence[bool_t] = ...,
5068 inplace: Literal[True],
5069 kind: SortKind = ...,
5070 na_position: NaPosition = ...,
5071 ignore_index: bool_t = ...,
5072 key: ValueKeyFunc = ...,
5073 ) -> None:
5074 ...
5075
5076 @overload
5077 def sort_values(
5078 self,
5079 *,
5080 axis: Axis = ...,
5081 ascending: bool_t | Sequence[bool_t] = ...,
5082 inplace: bool_t = ...,
5083 kind: SortKind = ...,
5084 na_position: NaPosition = ...,
5085 ignore_index: bool_t = ...,
5086 key: ValueKeyFunc = ...,
5087 ) -> Self | None:
5088 ...
5089
5090 def sort_values(
5091 self,
5092 *,
5093 axis: Axis = 0,
5094 ascending: bool_t | Sequence[bool_t] = True,
5095 inplace: bool_t = False,
5096 kind: SortKind = "quicksort",
5097 na_position: NaPosition = "last",
5098 ignore_index: bool_t = False,
5099 key: ValueKeyFunc | None = None,
5100 ) -> Self | None:
5101 """
5102 Sort by the values along either axis.
5103
5104 Parameters
5105 ----------%(optional_by)s
5106 axis : %(axes_single_arg)s, default 0
5107 Axis to be sorted.
5108 ascending : bool or list of bool, default True
5109 Sort ascending vs. descending. Specify list for multiple sort
            orders. If this is a list of bools, it must match the length of
            `by`.
5112 inplace : bool, default False
5113 If True, perform operation in-place.
5114 kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
5115 Choice of sorting algorithm. See also :func:`numpy.sort` for more
5116 information. `mergesort` and `stable` are the only stable algorithms. For
5117 DataFrames, this option is only applied when sorting on a single
5118 column or label.
5119 na_position : {'first', 'last'}, default 'last'
5120 Puts NaNs at the beginning if `first`; `last` puts NaNs at the
5121 end.
5122 ignore_index : bool, default False
5123 If True, the resulting axis will be labeled 0, 1, …, n - 1.
5124 key : callable, optional
5125 Apply the key function to the values
5126 before sorting. This is similar to the `key` argument in the
5127 builtin :meth:`sorted` function, with the notable difference that
5128 this `key` function should be *vectorized*. It should expect a
5129 ``Series`` and return a Series with the same shape as the input.
5130 It will be applied to each column in `by` independently.
5131
5132 Returns
5133 -------
5134 DataFrame or None
5135 DataFrame with sorted values or None if ``inplace=True``.
5136
5137 See Also
5138 --------
5139 DataFrame.sort_index : Sort a DataFrame by the index.
5140 Series.sort_values : Similar method for a Series.
5141
5142 Examples
5143 --------
5144 >>> df = pd.DataFrame({
5145 ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'],
5146 ... 'col2': [2, 1, 9, 8, 7, 4],
5147 ... 'col3': [0, 1, 9, 4, 2, 3],
5148 ... 'col4': ['a', 'B', 'c', 'D', 'e', 'F']
5149 ... })
5150 >>> df
5151 col1 col2 col3 col4
5152 0 A 2 0 a
5153 1 A 1 1 B
5154 2 B 9 9 c
5155 3 NaN 8 4 D
5156 4 D 7 2 e
5157 5 C 4 3 F
5158
5159 Sort by col1
5160
5161 >>> df.sort_values(by=['col1'])
5162 col1 col2 col3 col4
5163 0 A 2 0 a
5164 1 A 1 1 B
5165 2 B 9 9 c
5166 5 C 4 3 F
5167 4 D 7 2 e
5168 3 NaN 8 4 D
5169
5170 Sort by multiple columns
5171
5172 >>> df.sort_values(by=['col1', 'col2'])
5173 col1 col2 col3 col4
5174 1 A 1 1 B
5175 0 A 2 0 a
5176 2 B 9 9 c
5177 5 C 4 3 F
5178 4 D 7 2 e
5179 3 NaN 8 4 D
5180
5181 Sort Descending
5182
5183 >>> df.sort_values(by='col1', ascending=False)
5184 col1 col2 col3 col4
5185 4 D 7 2 e
5186 5 C 4 3 F
5187 2 B 9 9 c
5188 0 A 2 0 a
5189 1 A 1 1 B
5190 3 NaN 8 4 D
5191
5192 Putting NAs first
5193
5194 >>> df.sort_values(by='col1', ascending=False, na_position='first')
5195 col1 col2 col3 col4
5196 3 NaN 8 4 D
5197 4 D 7 2 e
5198 5 C 4 3 F
5199 2 B 9 9 c
5200 0 A 2 0 a
5201 1 A 1 1 B
5202
5203 Sorting with a key function
5204
5205 >>> df.sort_values(by='col4', key=lambda col: col.str.lower())
5206 col1 col2 col3 col4
5207 0 A 2 0 a
5208 1 A 1 1 B
5209 2 B 9 9 c
5210 3 NaN 8 4 D
5211 4 D 7 2 e
5212 5 C 4 3 F
5213
5214 Natural sort with the key argument,
        using the `natsort <https://github.com/SethMMorton/natsort>`__ package.
5216
5217 >>> df = pd.DataFrame({
5218 ... "time": ['0hr', '128hr', '72hr', '48hr', '96hr'],
5219 ... "value": [10, 20, 30, 40, 50]
5220 ... })
5221 >>> df
5222 time value
5223 0 0hr 10
5224 1 128hr 20
5225 2 72hr 30
5226 3 48hr 40
5227 4 96hr 50
5228 >>> from natsort import index_natsorted
5229 >>> df.sort_values(
5230 ... by="time",
5231 ... key=lambda x: np.argsort(index_natsorted(df["time"]))
5232 ... )
5233 time value
5234 0 0hr 10
5235 3 48hr 40
5236 2 72hr 30
5237 4 96hr 50
5238 1 128hr 20
5239 """
5240 raise AbstractMethodError(self)
5241
5242 @overload
5243 def sort_index(
5244 self,
5245 *,
5246 axis: Axis = ...,
5247 level: IndexLabel = ...,
5248 ascending: bool_t | Sequence[bool_t] = ...,
5249 inplace: Literal[True],
5250 kind: SortKind = ...,
5251 na_position: NaPosition = ...,
5252 sort_remaining: bool_t = ...,
5253 ignore_index: bool_t = ...,
5254 key: IndexKeyFunc = ...,
5255 ) -> None:
5256 ...
5257
5258 @overload
5259 def sort_index(
5260 self,
5261 *,
5262 axis: Axis = ...,
5263 level: IndexLabel = ...,
5264 ascending: bool_t | Sequence[bool_t] = ...,
5265 inplace: Literal[False] = ...,
5266 kind: SortKind = ...,
5267 na_position: NaPosition = ...,
5268 sort_remaining: bool_t = ...,
5269 ignore_index: bool_t = ...,
5270 key: IndexKeyFunc = ...,
5271 ) -> Self:
5272 ...
5273
5274 @overload
5275 def sort_index(
5276 self,
5277 *,
5278 axis: Axis = ...,
5279 level: IndexLabel = ...,
5280 ascending: bool_t | Sequence[bool_t] = ...,
5281 inplace: bool_t = ...,
5282 kind: SortKind = ...,
5283 na_position: NaPosition = ...,
5284 sort_remaining: bool_t = ...,
5285 ignore_index: bool_t = ...,
5286 key: IndexKeyFunc = ...,
5287 ) -> Self | None:
5288 ...
5289
5290 def sort_index(
5291 self,
5292 *,
5293 axis: Axis = 0,
5294 level: IndexLabel | None = None,
5295 ascending: bool_t | Sequence[bool_t] = True,
5296 inplace: bool_t = False,
5297 kind: SortKind = "quicksort",
5298 na_position: NaPosition = "last",
5299 sort_remaining: bool_t = True,
5300 ignore_index: bool_t = False,
5301 key: IndexKeyFunc | None = None,
5302 ) -> Self | None:
5303 inplace = validate_bool_kwarg(inplace, "inplace")
5304 axis = self._get_axis_number(axis)
5305 ascending = validate_ascending(ascending)
5306
5307 target = self._get_axis(axis)
5308
5309 indexer = get_indexer_indexer(
5310 target, level, ascending, kind, na_position, sort_remaining, key
5311 )
5312
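        # A ``None`` indexer means the axis is already sorted as requested,
        # so the take can be skipped entirely.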
5313 if indexer is None:
5314 if inplace:
5315 result = self
5316 else:
5317 result = self.copy(deep=None)
5318
5319 if ignore_index:
5320 result.index = default_index(len(self))
5321 if inplace:
5322 return None
5323 else:
5324 return result
5325
5326 baxis = self._get_block_manager_axis(axis)
5327 new_data = self._mgr.take(indexer, axis=baxis, verify=False)
5328
5329 # reconstruct axis if needed
5330 if not ignore_index:
5331 new_axis = new_data.axes[baxis]._sort_levels_monotonic()
5332 else:
5333 new_axis = default_index(len(indexer))
5334 new_data.set_axis(baxis, new_axis)
5335
5336 result = self._constructor_from_mgr(new_data, axes=new_data.axes)
5337
5338 if inplace:
5339 return self._update_inplace(result)
5340 else:
5341 return result.__finalize__(self, method="sort_index")
5342
5343 @doc(
5344 klass=_shared_doc_kwargs["klass"],
5345 optional_reindex="",
5346 )
5347 def reindex(
5348 self,
5349 labels=None,
5350 *,
5351 index=None,
5352 columns=None,
5353 axis: Axis | None = None,
5354 method: ReindexMethod | None = None,
5355 copy: bool_t | None = None,
5356 level: Level | None = None,
5357 fill_value: Scalar | None = np.nan,
5358 limit: int | None = None,
5359 tolerance=None,
5360 ) -> Self:
5361 """
5362 Conform {klass} to new index with optional filling logic.
5363
5364 Places NA/NaN in locations having no value in the previous index. A new object
5365 is produced unless the new index is equivalent to the current one and
5366 ``copy=False``.
5367
5368 Parameters
5369 ----------
5370 {optional_reindex}
5371 method : {{None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}}
5372 Method to use for filling holes in reindexed DataFrame.
5373 Please note: this is only applicable to DataFrames/Series with a
5374 monotonically increasing/decreasing index.
5375
5376 * None (default): don't fill gaps
5377 * pad / ffill: Propagate last valid observation forward to next
5378 valid.
5379 * backfill / bfill: Use next valid observation to fill gap.
5380 * nearest: Use nearest valid observations to fill gap.
5381
5382 copy : bool, default True
5383 Return a new object, even if the passed indexes are the same.
5384
5385 .. note::
5386 The `copy` keyword will change behavior in pandas 3.0.
5387 `Copy-on-Write
5388 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
5389 will be enabled by default, which means that all methods with a
5390 `copy` keyword will use a lazy copy mechanism to defer the copy and
5391 ignore the `copy` keyword. The `copy` keyword will be removed in a
5392 future version of pandas.
5393
                You can already get the future behavior and improvements by
                enabling copy-on-write: ``pd.options.mode.copy_on_write = True``.
5396 level : int or name
5397 Broadcast across a level, matching Index values on the
5398 passed MultiIndex level.
5399 fill_value : scalar, default np.nan
5400 Value to use for missing values. Defaults to NaN, but can be any
5401 "compatible" value.
5402 limit : int, default None
5403 Maximum number of consecutive elements to forward or backward fill.
5404 tolerance : optional
5405 Maximum distance between original and new labels for inexact
            matches. The values of the index at the matching locations must
5407 satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
5408
5409 Tolerance may be a scalar value, which applies the same tolerance
5410 to all values, or list-like, which applies variable tolerance per
5411 element. List-like includes list, tuple, array, Series, and must be
5412 the same size as the index and its dtype must exactly match the
5413 index's type.
5414
5415 Returns
5416 -------
5417 {klass} with changed index.
5418
5419 See Also
5420 --------
5421 DataFrame.set_index : Set row labels.
5422 DataFrame.reset_index : Remove row labels or move them to new columns.
5423 DataFrame.reindex_like : Change to same indices as other DataFrame.
5424
5425 Examples
5426 --------
5427 ``DataFrame.reindex`` supports two calling conventions
5428
5429 * ``(index=index_labels, columns=column_labels, ...)``
5430 * ``(labels, axis={{'index', 'columns'}}, ...)``
5431
5432 We *highly* recommend using keyword arguments to clarify your
5433 intent.
5434
5435 Create a dataframe with some fictional data.
5436
5437 >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
5438 >>> df = pd.DataFrame({{'http_status': [200, 200, 404, 404, 301],
5439 ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}},
5440 ... index=index)
5441 >>> df
5442 http_status response_time
5443 Firefox 200 0.04
5444 Chrome 200 0.02
5445 Safari 404 0.07
5446 IE10 404 0.08
5447 Konqueror 301 1.00
5448
5449 Create a new index and reindex the dataframe. By default
5450 values in the new index that do not have corresponding
5451 records in the dataframe are assigned ``NaN``.
5452
5453 >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10',
5454 ... 'Chrome']
5455 >>> df.reindex(new_index)
5456 http_status response_time
5457 Safari 404.0 0.07
5458 Iceweasel NaN NaN
5459 Comodo Dragon NaN NaN
5460 IE10 404.0 0.08
5461 Chrome 200.0 0.02
5462
5463 We can fill in the missing values by passing a value to
5464 the keyword ``fill_value``. Because the index is not monotonically
5465 increasing or decreasing, we cannot use arguments to the keyword
5466 ``method`` to fill the ``NaN`` values.
5467
5468 >>> df.reindex(new_index, fill_value=0)
5469 http_status response_time
5470 Safari 404 0.07
5471 Iceweasel 0 0.00
5472 Comodo Dragon 0 0.00
5473 IE10 404 0.08
5474 Chrome 200 0.02
5475
5476 >>> df.reindex(new_index, fill_value='missing')
5477 http_status response_time
5478 Safari 404 0.07
5479 Iceweasel missing missing
5480 Comodo Dragon missing missing
5481 IE10 404 0.08
5482 Chrome 200 0.02
5483
5484 We can also reindex the columns.
5485
5486 >>> df.reindex(columns=['http_status', 'user_agent'])
5487 http_status user_agent
5488 Firefox 200 NaN
5489 Chrome 200 NaN
5490 Safari 404 NaN
5491 IE10 404 NaN
5492 Konqueror 301 NaN
5493
5494 Or we can use "axis-style" keyword arguments
5495
5496 >>> df.reindex(['http_status', 'user_agent'], axis="columns")
5497 http_status user_agent
5498 Firefox 200 NaN
5499 Chrome 200 NaN
5500 Safari 404 NaN
5501 IE10 404 NaN
5502 Konqueror 301 NaN
5503
5504 To further illustrate the filling functionality in
5505 ``reindex``, we will create a dataframe with a
5506 monotonically increasing index (for example, a sequence
5507 of dates).
5508
5509 >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D')
5510 >>> df2 = pd.DataFrame({{"prices": [100, 101, np.nan, 100, 89, 88]}},
5511 ... index=date_index)
5512 >>> df2
5513 prices
5514 2010-01-01 100.0
5515 2010-01-02 101.0
5516 2010-01-03 NaN
5517 2010-01-04 100.0
5518 2010-01-05 89.0
5519 2010-01-06 88.0
5520
5521 Suppose we decide to expand the dataframe to cover a wider
5522 date range.
5523
5524 >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D')
5525 >>> df2.reindex(date_index2)
5526 prices
5527 2009-12-29 NaN
5528 2009-12-30 NaN
5529 2009-12-31 NaN
5530 2010-01-01 100.0
5531 2010-01-02 101.0
5532 2010-01-03 NaN
5533 2010-01-04 100.0
5534 2010-01-05 89.0
5535 2010-01-06 88.0
5536 2010-01-07 NaN
5537
5538 The index entries that did not have a value in the original data frame
5539 (for example, '2009-12-29') are by default filled with ``NaN``.
5540 If desired, we can fill in the missing values using one of several
5541 options.
5542
        For example, to propagate the next valid value backwards in order to
        fill the ``NaN`` values, pass ``bfill`` as an argument to the
        ``method`` keyword.
5545
5546 >>> df2.reindex(date_index2, method='bfill')
5547 prices
5548 2009-12-29 100.0
5549 2009-12-30 100.0
5550 2009-12-31 100.0
5551 2010-01-01 100.0
5552 2010-01-02 101.0
5553 2010-01-03 NaN
5554 2010-01-04 100.0
5555 2010-01-05 89.0
5556 2010-01-06 88.0
5557 2010-01-07 NaN
5558
5559 Please note that the ``NaN`` value present in the original dataframe
5560 (at index value 2010-01-03) will not be filled by any of the
5561 value propagation schemes. This is because filling while reindexing
5562 does not look at dataframe values, but only compares the original and
5563 desired indexes. If you do want to fill in the ``NaN`` values present
5564 in the original dataframe, use the ``fillna()`` method.
5565
5566 See the :ref:`user guide <basics.reindexing>` for more.
5567 """
5568 # TODO: Decide if we care about having different examples for different
5569 # kinds
5570
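        # Resolve the two calling conventions (positional ``labels`` with
        # ``axis``, or the explicit ``index``/``columns`` keywords) into a
        # single axes mapping.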
5571 if index is not None and columns is not None and labels is not None:
5572 raise TypeError("Cannot specify all of 'labels', 'index', 'columns'.")
5573 elif index is not None or columns is not None:
5574 if axis is not None:
5575 raise TypeError(
5576 "Cannot specify both 'axis' and any of 'index' or 'columns'"
5577 )
5578 if labels is not None:
5579 if index is not None:
5580 columns = labels
5581 else:
5582 index = labels
5583 else:
5584 if axis and self._get_axis_number(axis) == 1:
5585 columns = labels
5586 else:
5587 index = labels
5588 axes: dict[Literal["index", "columns"], Any] = {
5589 "index": index,
5590 "columns": columns,
5591 }
5592 method = clean_reindex_fill_method(method)
5593
        # If all axes requested to be reindexed are identical to the existing
        # axes (matching in both values and index names), then we only copy if
        # explicitly indicated.
5596 if copy and using_copy_on_write():
5597 copy = False
5598 if all(
5599 self._get_axis(axis_name).identical(ax)
5600 for axis_name, ax in axes.items()
5601 if ax is not None
5602 ):
5603 return self.copy(deep=copy)
5604
5605 # check if we are a multi reindex
5606 if self._needs_reindex_multi(axes, method, level):
5607 return self._reindex_multi(axes, copy, fill_value)
5608
5609 # perform the reindex on the axes
5610 return self._reindex_axes(
5611 axes, level, limit, tolerance, method, fill_value, copy
5612 ).__finalize__(self, method="reindex")
5613
5614 @final
5615 def _reindex_axes(
5616 self,
5617 axes,
5618 level: Level | None,
5619 limit: int | None,
5620 tolerance,
5621 method,
5622 fill_value: Scalar | None,
5623 copy: bool_t | None,
5624 ) -> Self:
5625 """Perform the reindex for all the axes."""
5626 obj = self
5627 for a in self._AXIS_ORDERS:
5628 labels = axes[a]
5629 if labels is None:
5630 continue
5631
5632 ax = self._get_axis(a)
5633 new_index, indexer = ax.reindex(
5634 labels, level=level, limit=limit, tolerance=tolerance, method=method
5635 )
5636
5637 axis = self._get_axis_number(a)
5638 obj = obj._reindex_with_indexers(
5639 {axis: [new_index, indexer]},
5640 fill_value=fill_value,
5641 copy=copy,
5642 allow_dups=False,
5643 )
5644 # If we've made a copy once, no need to make another one
5645 copy = False
5646
5647 return obj
5648
5649 def _needs_reindex_multi(self, axes, method, level: Level | None) -> bool_t:
5650 """Check if we do need a multi reindex."""
5651 return (
5652 (common.count_not_none(*axes.values()) == self._AXIS_LEN)
5653 and method is None
5654 and level is None
5655 # reindex_multi calls self.values, so we only want to go
5656 # down that path when doing so is cheap.
5657 and self._can_fast_transpose
5658 )
5659
5660 def _reindex_multi(self, axes, copy, fill_value):
5661 raise AbstractMethodError(self)
5662
5663 @final
5664 def _reindex_with_indexers(
5665 self,
5666 reindexers,
5667 fill_value=None,
5668 copy: bool_t | None = False,
5669 allow_dups: bool_t = False,
5670 ) -> Self:
5671 """allow_dups indicates an internal call here"""
5672 # reindex doing multiple operations on different axes if indicated
5673 new_data = self._mgr
5674 for axis in sorted(reindexers.keys()):
5675 index, indexer = reindexers[axis]
5676 baxis = self._get_block_manager_axis(axis)
5677
5678 if index is None:
5679 continue
5680
5681 index = ensure_index(index)
5682 if indexer is not None:
5683 indexer = ensure_platform_int(indexer)
5684
5685 # TODO: speed up on homogeneous DataFrame objects (see _reindex_multi)
5686 new_data = new_data.reindex_indexer(
5687 index,
5688 indexer,
5689 axis=baxis,
5690 fill_value=fill_value,
5691 allow_dups=allow_dups,
5692 copy=copy,
5693 )
5694 # If we've made a copy once, no need to make another one
5695 copy = False
5696
5697 if (
5698 (copy or copy is None)
5699 and new_data is self._mgr
5700 and not using_copy_on_write()
5701 ):
5702 new_data = new_data.copy(deep=copy)
5703 elif using_copy_on_write() and new_data is self._mgr:
5704 new_data = new_data.copy(deep=False)
5705
5706 return self._constructor_from_mgr(new_data, axes=new_data.axes).__finalize__(
5707 self
5708 )
5709
5710 def filter(
5711 self,
5712 items=None,
5713 like: str | None = None,
5714 regex: str | None = None,
5715 axis: Axis | None = None,
5716 ) -> Self:
5717 """
5718 Subset the dataframe rows or columns according to the specified index labels.
5719
5720 Note that this routine does not filter a dataframe on its
5721 contents. The filter is applied to the labels of the index.
5722
5723 Parameters
5724 ----------
5725 items : list-like
5726 Keep labels from axis which are in items.
5727 like : str
5728 Keep labels from axis for which "like in label == True".
5729 regex : str (regular expression)
5730 Keep labels from axis for which re.search(regex, label) == True.
5731 axis : {0 or 'index', 1 or 'columns', None}, default None
5732 The axis to filter on, expressed either as an index (int)
5733 or axis name (str). By default this is the info axis, 'columns' for
5734 DataFrame. For `Series` this parameter is unused and defaults to `None`.
5735
5736 Returns
5737 -------
5738 same type as input object
5739
5740 See Also
5741 --------
5742 DataFrame.loc : Access a group of rows and columns
5743 by label(s) or a boolean array.
5744
5745 Notes
5746 -----
5747 The ``items``, ``like``, and ``regex`` parameters are
5748 enforced to be mutually exclusive.
5749
5750 ``axis`` defaults to the info axis that is used when indexing
5751 with ``[]``.
5752
5753 Examples
5754 --------
5755 >>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])),
5756 ... index=['mouse', 'rabbit'],
5757 ... columns=['one', 'two', 'three'])
5758 >>> df
5759 one two three
5760 mouse 1 2 3
5761 rabbit 4 5 6
5762
5763 >>> # select columns by name
5764 >>> df.filter(items=['one', 'three'])
5765 one three
5766 mouse 1 3
5767 rabbit 4 6
5768
5769 >>> # select columns by regular expression
5770 >>> df.filter(regex='e$', axis=1)
5771 one three
5772 mouse 1 3
5773 rabbit 4 6
5774
5775 >>> # select rows containing 'bbi'
5776 >>> df.filter(like='bbi', axis=0)
5777 one two three
5778 rabbit 4 5 6
5779 """
5780 nkw = common.count_not_none(items, like, regex)
5781 if nkw > 1:
5782 raise TypeError(
5783 "Keyword arguments `items`, `like`, or `regex` "
5784 "are mutually exclusive"
5785 )
5786
5787 if axis is None:
5788 axis = self._info_axis_name
5789 labels = self._get_axis(axis)
5790
5791 if items is not None:
5792 name = self._get_axis_name(axis)
5793 items = Index(items).intersection(labels)
5794 if len(items) == 0:
5795 # Keep the dtype of labels when we are empty
5796 items = items.astype(labels.dtype)
5797 # error: Keywords must be strings
5798 return self.reindex(**{name: items}) # type: ignore[misc]
5799 elif like:
5800
5801 def f(x) -> bool_t:
5802 assert like is not None # needed for mypy
5803 return like in ensure_str(x)
5804
5805 values = labels.map(f)
5806 return self.loc(axis=axis)[values]
5807 elif regex:
5808
5809 def f(x) -> bool_t:
5810 return matcher.search(ensure_str(x)) is not None
5811
5812 matcher = re.compile(regex)
5813 values = labels.map(f)
5814 return self.loc(axis=axis)[values]
5815 else:
5816 raise TypeError("Must pass either `items`, `like`, or `regex`")
5817
5818 @final
5819 def head(self, n: int = 5) -> Self:
5820 """
5821 Return the first `n` rows.
5822
5823 This function returns the first `n` rows for the object based
5824 on position. It is useful for quickly testing if your object
5825 has the right type of data in it.
5826
5827 For negative values of `n`, this function returns all rows except
5828 the last `|n|` rows, equivalent to ``df[:n]``.
5829
5830 If n is larger than the number of rows, this function returns all rows.
5831
5832 Parameters
5833 ----------
5834 n : int, default 5
5835 Number of rows to select.
5836
5837 Returns
5838 -------
5839 same type as caller
5840 The first `n` rows of the caller object.
5841
5842 See Also
5843 --------
5844 DataFrame.tail: Returns the last `n` rows.
5845
5846 Examples
5847 --------
5848 >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
5849 ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']})
5850 >>> df
5851 animal
5852 0 alligator
5853 1 bee
5854 2 falcon
5855 3 lion
5856 4 monkey
5857 5 parrot
5858 6 shark
5859 7 whale
5860 8 zebra
5861
5862 Viewing the first 5 lines
5863
5864 >>> df.head()
5865 animal
5866 0 alligator
5867 1 bee
5868 2 falcon
5869 3 lion
5870 4 monkey
5871
5872 Viewing the first `n` lines (three in this case)
5873
5874 >>> df.head(3)
5875 animal
5876 0 alligator
5877 1 bee
5878 2 falcon
5879
5880 For negative values of `n`
5881
5882 >>> df.head(-3)
5883 animal
5884 0 alligator
5885 1 bee
5886 2 falcon
5887 3 lion
5888 4 monkey
5889 5 parrot
5890 """
5891 if using_copy_on_write():
5892 return self.iloc[:n].copy()
5893 return self.iloc[:n]
5894
5895 @final
5896 def tail(self, n: int = 5) -> Self:
5897 """
5898 Return the last `n` rows.
5899
This function returns the last `n` rows from the object based on
5901 position. It is useful for quickly verifying data, for example,
5902 after sorting or appending rows.
5903
5904 For negative values of `n`, this function returns all rows except
5905 the first `|n|` rows, equivalent to ``df[|n|:]``.
5906
5907 If n is larger than the number of rows, this function returns all rows.
5908
5909 Parameters
5910 ----------
5911 n : int, default 5
5912 Number of rows to select.
5913
5914 Returns
5915 -------
5916 type of caller
5917 The last `n` rows of the caller object.
5918
5919 See Also
5920 --------
5921 DataFrame.head : The first `n` rows of the caller object.
5922
5923 Examples
5924 --------
5925 >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
5926 ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']})
5927 >>> df
5928 animal
5929 0 alligator
5930 1 bee
5931 2 falcon
5932 3 lion
5933 4 monkey
5934 5 parrot
5935 6 shark
5936 7 whale
5937 8 zebra
5938
5939 Viewing the last 5 lines
5940
5941 >>> df.tail()
5942 animal
5943 4 monkey
5944 5 parrot
5945 6 shark
5946 7 whale
5947 8 zebra
5948
5949 Viewing the last `n` lines (three in this case)
5950
5951 >>> df.tail(3)
5952 animal
5953 6 shark
5954 7 whale
5955 8 zebra
5956
5957 For negative values of `n`
5958
5959 >>> df.tail(-3)
5960 animal
5961 3 lion
5962 4 monkey
5963 5 parrot
5964 6 shark
5965 7 whale
5966 8 zebra
5967 """
5968 if using_copy_on_write():
5969 if n == 0:
5970 return self.iloc[0:0].copy()
5971 return self.iloc[-n:].copy()
5972 if n == 0:
5973 return self.iloc[0:0]
5974 return self.iloc[-n:]
5975
5976 @final
5977 def sample(
5978 self,
5979 n: int | None = None,
5980 frac: float | None = None,
5981 replace: bool_t = False,
5982 weights=None,
5983 random_state: RandomState | None = None,
5984 axis: Axis | None = None,
5985 ignore_index: bool_t = False,
5986 ) -> Self:
5987 """
5988 Return a random sample of items from an axis of object.
5989
5990 You can use `random_state` for reproducibility.
5991
5992 Parameters
5993 ----------
5994 n : int, optional
5995 Number of items from axis to return. Cannot be used with `frac`.
Default is 1 if `frac` is None.
5997 frac : float, optional
5998 Fraction of axis items to return. Cannot be used with `n`.
5999 replace : bool, default False
6000 Allow or disallow sampling of the same row more than once.
6001 weights : str or ndarray-like, optional
Default ``None`` results in equal probability weighting.
6003 If passed a Series, will align with target object on index. Index
6004 values in weights not found in sampled object will be ignored and
6005 index values in sampled object not in weights will be assigned
6006 weights of zero.
6007 If called on a DataFrame, will accept the name of a column
6008 when axis = 0.
Unless weights are a Series, weights must be the same length as the
axis being sampled.
6011 If weights do not sum to 1, they will be normalized to sum to 1.
6012 Missing values in the weights column will be treated as zero.
Infinite values are not allowed.
6014 random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional
6015 If int, array-like, or BitGenerator, seed for random number generator.
6016 If np.random.RandomState or np.random.Generator, use as given.
6017
6018 .. versionchanged:: 1.4.0
6019
6020 np.random.Generator objects now accepted
6021
6022 axis : {0 or 'index', 1 or 'columns', None}, default None
Axis to sample. Accepts axis number or name. Default is the stat axis
for the given data type. For `Series` this parameter is unused and defaults to `None`.
6025 ignore_index : bool, default False
6026 If True, the resulting index will be labeled 0, 1, …, n - 1.
6027
6028 .. versionadded:: 1.3.0
6029
6030 Returns
6031 -------
6032 Series or DataFrame
6033 A new object of same type as caller containing `n` items randomly
6034 sampled from the caller object.
6035
6036 See Also
6037 --------
6038 DataFrameGroupBy.sample: Generates random samples from each group of a
6039 DataFrame object.
6040 SeriesGroupBy.sample: Generates random samples from each group of a
6041 Series object.
6042 numpy.random.choice: Generates a random sample from a given 1-D numpy
6043 array.
6044
6045 Notes
6046 -----
If `frac` > 1, `replace` should be set to `True`.
6048
6049 Examples
6050 --------
6051 >>> df = pd.DataFrame({'num_legs': [2, 4, 8, 0],
6052 ... 'num_wings': [2, 0, 0, 0],
6053 ... 'num_specimen_seen': [10, 2, 1, 8]},
6054 ... index=['falcon', 'dog', 'spider', 'fish'])
6055 >>> df
6056 num_legs num_wings num_specimen_seen
6057 falcon 2 2 10
6058 dog 4 0 2
6059 spider 8 0 1
6060 fish 0 0 8
6061
6062 Extract 3 random elements from the ``Series`` ``df['num_legs']``:
6063 Note that we use `random_state` to ensure the reproducibility of
6064 the examples.
6065
6066 >>> df['num_legs'].sample(n=3, random_state=1)
6067 fish 0
6068 spider 8
6069 falcon 2
6070 Name: num_legs, dtype: int64
6071
6072 A random 50% sample of the ``DataFrame`` with replacement:
6073
6074 >>> df.sample(frac=0.5, replace=True, random_state=1)
6075 num_legs num_wings num_specimen_seen
6076 dog 4 0 2
6077 fish 0 0 8
6078
An upsampled sample of the ``DataFrame`` with replacement:
Note that the `replace` parameter has to be `True` when `frac` > 1.
6081
6082 >>> df.sample(frac=2, replace=True, random_state=1)
6083 num_legs num_wings num_specimen_seen
6084 dog 4 0 2
6085 fish 0 0 8
6086 falcon 2 2 10
6087 falcon 2 2 10
6088 fish 0 0 8
6089 dog 4 0 2
6090 fish 0 0 8
6091 dog 4 0 2
6092
6093 Using a DataFrame column as weights. Rows with larger value in the
6094 `num_specimen_seen` column are more likely to be sampled.
6095
6096 >>> df.sample(n=2, weights='num_specimen_seen', random_state=1)
6097 num_legs num_wings num_specimen_seen
6098 falcon 2 2 10
6099 fish 0 0 8
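
Columns can be sampled instead of rows by passing ``axis=1``; the
output is omitted here because which column is drawn is a detail of
the underlying sampler:

>>> df.sample(n=1, axis=1, random_state=1)  # doctest: +SKIP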
6100 """ # noqa: E501
6101 if axis is None:
6102 axis = 0
6103
6104 axis = self._get_axis_number(axis)
6105 obj_len = self.shape[axis]
6106
6107 # Process random_state argument
6108 rs = common.random_state(random_state)
6109
6110 size = sample.process_sampling_size(n, frac, replace)
6111 if size is None:
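# process_sampling_size returned None, meaning `frac` (not `n`) was
# given; convert the fraction of the axis length into an integer count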
6112 assert frac is not None
6113 size = round(frac * obj_len)
6114
6115 if weights is not None:
6116 weights = sample.preprocess_weights(self, weights, axis)
6117
6118 sampled_indices = sample.sample(obj_len, size, replace, weights, rs)
6119 result = self.take(sampled_indices, axis=axis)
6120
6121 if ignore_index:
6122 result.index = default_index(len(result))
6123
6124 return result
6125
6126 @final
6127 @doc(klass=_shared_doc_kwargs["klass"])
6128 def pipe(
6129 self,
6130 func: Callable[..., T] | tuple[Callable[..., T], str],
6131 *args,
6132 **kwargs,
6133 ) -> T:
6134 r"""
6135 Apply chainable functions that expect Series or DataFrames.
6136
6137 Parameters
6138 ----------
6139 func : function
6140 Function to apply to the {klass}.
6141 ``args``, and ``kwargs`` are passed into ``func``.
6142 Alternatively a ``(callable, data_keyword)`` tuple where
6143 ``data_keyword`` is a string indicating the keyword of
6144 ``callable`` that expects the {klass}.
6145 *args : iterable, optional
6146 Positional arguments passed into ``func``.
6147 **kwargs : mapping, optional
6148 A dictionary of keyword arguments passed into ``func``.
6149
6150 Returns
6151 -------
6152 the return type of ``func``.
6153
6154 See Also
6155 --------
6156 DataFrame.apply : Apply a function along input axis of DataFrame.
6157 DataFrame.map : Apply a function elementwise on a whole DataFrame.
6158 Series.map : Apply a mapping correspondence on a
6159 :class:`~pandas.Series`.
6160
6161 Notes
6162 -----
6163 Use ``.pipe`` when chaining together functions that expect
6164 Series, DataFrames or GroupBy objects.
6165
6166 Examples
6167 --------
Constructing an income DataFrame from a dictionary.
6169
6170 >>> data = [[8000, 1000], [9500, np.nan], [5000, 2000]]
6171 >>> df = pd.DataFrame(data, columns=['Salary', 'Others'])
6172 >>> df
6173 Salary Others
6174 0 8000 1000.0
6175 1 9500 NaN
6176 2 5000 2000.0
6177
6178 Functions that perform tax reductions on an income DataFrame.
6179
6180 >>> def subtract_federal_tax(df):
6181 ... return df * 0.9
6182 >>> def subtract_state_tax(df, rate):
6183 ... return df * (1 - rate)
6184 >>> def subtract_national_insurance(df, rate, rate_increase):
6185 ... new_rate = rate + rate_increase
6186 ... return df * (1 - new_rate)
6187
6188 Instead of writing
6189
6190 >>> subtract_national_insurance(
6191 ... subtract_state_tax(subtract_federal_tax(df), rate=0.12),
6192 ... rate=0.05,
6193 ... rate_increase=0.02) # doctest: +SKIP
6194
6195 You can write
6196
6197 >>> (
6198 ... df.pipe(subtract_federal_tax)
6199 ... .pipe(subtract_state_tax, rate=0.12)
6200 ... .pipe(subtract_national_insurance, rate=0.05, rate_increase=0.02)
6201 ... )
6202 Salary Others
6203 0 5892.48 736.56
6204 1 6997.32 NaN
6205 2 3682.80 1473.12
6206
6207 If you have a function that takes the data as (say) the second
6208 argument, pass a tuple indicating which keyword expects the
6209 data. For example, suppose ``national_insurance`` takes its data as ``df``
6210 in the second argument:
6211
6212 >>> def subtract_national_insurance(rate, df, rate_increase):
6213 ... new_rate = rate + rate_increase
6214 ... return df * (1 - new_rate)
6215 >>> (
6216 ... df.pipe(subtract_federal_tax)
6217 ... .pipe(subtract_state_tax, rate=0.12)
6218 ... .pipe(
6219 ... (subtract_national_insurance, 'df'),
6220 ... rate=0.05,
6221 ... rate_increase=0.02
6222 ... )
6223 ... )
6224 Salary Others
6225 0 5892.48 736.56
6226 1 6997.32 NaN
6227 2 3682.80 1473.12
6228 """
6229 if using_copy_on_write():
6230 return common.pipe(self.copy(deep=None), func, *args, **kwargs)
6231 return common.pipe(self, func, *args, **kwargs)
6232
6233 # ----------------------------------------------------------------------
6234 # Attribute access
6235
6236 @final
6237 def __finalize__(self, other, method: str | None = None, **kwargs) -> Self:
6238 """
6239 Propagate metadata from other to self.
6240
6241 Parameters
6242 ----------
6243 other : the object from which to get the attributes that we are going
6244 to propagate
6245 method : str, optional
6246 A passed method name providing context on where ``__finalize__``
6247 was called.
6248
6249 .. warning::
6250
The value passed as `method` is not currently considered
stable across pandas releases.
6253 """
6254 if isinstance(other, NDFrame):
6255 if other.attrs:
6256 # We want attrs propagation to have minimal performance
6257 # impact if attrs are not used; i.e. attrs is an empty dict.
6258 # One could make the deepcopy unconditionally, but a deepcopy
6259 # of an empty dict is 50x more expensive than the empty check.
6260 self.attrs = deepcopy(other.attrs)
6261
6262 self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels
6263 # For subclasses using _metadata.
6264 for name in set(self._metadata) & set(other._metadata):
6265 assert isinstance(name, str)
6266 object.__setattr__(self, name, getattr(other, name, None))
6267
6268 if method == "concat":
6269 # propagate attrs only if all concat arguments have the same attrs
6270 if all(bool(obj.attrs) for obj in other.objs):
6271 # all concatenate arguments have non-empty attrs
6272 attrs = other.objs[0].attrs
6273 have_same_attrs = all(obj.attrs == attrs for obj in other.objs[1:])
6274 if have_same_attrs:
6275 self.attrs = deepcopy(attrs)
6276
6277 allows_duplicate_labels = all(
6278 x.flags.allows_duplicate_labels for x in other.objs
6279 )
6280 self.flags.allows_duplicate_labels = allows_duplicate_labels
6281
6282 return self
6283
6284 @final
6285 def __getattr__(self, name: str):
6286 """
After regular attribute access, try looking up the name.
This allows simpler access to columns for interactive use.
6289 """
6290 # Note: obj.x will always call obj.__getattribute__('x') prior to
6291 # calling obj.__getattr__('x').
6292 if (
6293 name not in self._internal_names_set
6294 and name not in self._metadata
6295 and name not in self._accessors
6296 and self._info_axis._can_hold_identifiers_and_holds_name(name)
6297 ):
6298 return self[name]
6299 return object.__getattribute__(self, name)
6300
6301 @final
6302 def __setattr__(self, name: str, value) -> None:
6303 """
After regular attribute access, try setting the name.
This allows simpler access to columns for interactive use.
6306 """
6307 # first try regular attribute access via __getattribute__, so that
6308 # e.g. ``obj.x`` and ``obj.x = 4`` will always reference/modify
6309 # the same attribute.
6310
6311 try:
6312 object.__getattribute__(self, name)
6313 return object.__setattr__(self, name, value)
6314 except AttributeError:
6315 pass
6316
6317 # if this fails, go on to more involved attribute setting
6318 # (note that this matches __getattr__, above).
6319 if name in self._internal_names_set:
6320 object.__setattr__(self, name, value)
6321 elif name in self._metadata:
6322 object.__setattr__(self, name, value)
6323 else:
6324 try:
6325 existing = getattr(self, name)
6326 if isinstance(existing, Index):
6327 object.__setattr__(self, name, value)
6328 elif name in self._info_axis:
6329 self[name] = value
6330 else:
6331 object.__setattr__(self, name, value)
6332 except (AttributeError, TypeError):
6333 if isinstance(self, ABCDataFrame) and (is_list_like(value)):
6334 warnings.warn(
6335 "Pandas doesn't allow columns to be "
6336 "created via a new attribute name - see "
6337 "https://pandas.pydata.org/pandas-docs/"
6338 "stable/indexing.html#attribute-access",
6339 stacklevel=find_stack_level(),
6340 )
6341 object.__setattr__(self, name, value)
6342
6343 @final
6344 def _dir_additions(self) -> set[str]:
6345 """
6346 add the string-like attributes from the info_axis.
6347 If info_axis is a MultiIndex, its first level values are used.
6348 """
6349 additions = super()._dir_additions()
6350 if self._info_axis._can_hold_strings:
6351 additions.update(self._info_axis._dir_additions_for_owner)
6352 return additions
6353
6354 # ----------------------------------------------------------------------
6355 # Consolidation of internals
6356
6357 @final
6358 def _protect_consolidate(self, f):
6359 """
6360 Consolidate _mgr -- if the blocks have changed, then clear the
6361 cache
6362 """
6363 if isinstance(self._mgr, (ArrayManager, SingleArrayManager)):
6364 return f()
6365 blocks_before = len(self._mgr.blocks)
6366 result = f()
6367 if len(self._mgr.blocks) != blocks_before:
6368 self._clear_item_cache()
6369 return result
6370
6371 @final
6372 def _consolidate_inplace(self) -> None:
6373 """Consolidate data in place and return None"""
6374
6375 def f() -> None:
6376 self._mgr = self._mgr.consolidate()
6377
6378 self._protect_consolidate(f)
6379
6380 @final
6381 def _consolidate(self):
6382 """
6383 Compute NDFrame with "consolidated" internals (data of each dtype
6384 grouped together in a single ndarray).
6385
6386 Returns
6387 -------
6388 consolidated : same type as caller
6389 """
6390 f = lambda: self._mgr.consolidate()
6391 cons_data = self._protect_consolidate(f)
6392 return self._constructor_from_mgr(cons_data, axes=cons_data.axes).__finalize__(
6393 self
6394 )
6395
6396 @final
6397 @property
6398 def _is_mixed_type(self) -> bool_t:
6399 if self._mgr.is_single_block:
6400 # Includes all Series cases
6401 return False
6402
6403 if self._mgr.any_extension_types:
6404 # Even if they have the same dtype, we can't consolidate them,
# so we pretend this is "mixed"
6406 return True
6407
6408 return self.dtypes.nunique() > 1
6409
6410 @final
6411 def _get_numeric_data(self) -> Self:
6412 new_mgr = self._mgr.get_numeric_data()
6413 return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self)
6414
6415 @final
6416 def _get_bool_data(self):
6417 new_mgr = self._mgr.get_bool_data()
6418 return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__(self)
6419
6420 # ----------------------------------------------------------------------
6421 # Internal Interface Methods
6422
6423 @property
6424 def values(self):
6425 raise AbstractMethodError(self)
6426
6427 @property
6428 def _values(self) -> ArrayLike:
6429 """internal implementation"""
6430 raise AbstractMethodError(self)
6431
6432 @property
6433 def dtypes(self):
6434 """
6435 Return the dtypes in the DataFrame.
6436
6437 This returns a Series with the data type of each column.
6438 The result's index is the original DataFrame's columns. Columns
6439 with mixed types are stored with the ``object`` dtype. See
6440 :ref:`the User Guide <basics.dtypes>` for more.
6441
6442 Returns
6443 -------
6444 pandas.Series
6445 The data type of each column.
6446
6447 Examples
6448 --------
6449 >>> df = pd.DataFrame({'float': [1.0],
6450 ... 'int': [1],
6451 ... 'datetime': [pd.Timestamp('20180310')],
6452 ... 'string': ['foo']})
6453 >>> df.dtypes
6454 float float64
6455 int int64
6456 datetime datetime64[ns]
6457 string object
6458 dtype: object
6459 """
6460 data = self._mgr.get_dtypes()
6461 return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_)
6462
6463 @final
6464 def astype(
6465 self, dtype, copy: bool_t | None = None, errors: IgnoreRaise = "raise"
6466 ) -> Self:
6467 """
6468 Cast a pandas object to a specified dtype ``dtype``.
6469
6470 Parameters
6471 ----------
6472 dtype : str, data type, Series or Mapping of column name -> data type
6473 Use a str, numpy.dtype, pandas.ExtensionDtype or Python type to
6474 cast entire pandas object to the same type. Alternatively, use a
6475 mapping, e.g. {col: dtype, ...}, where col is a column label and dtype is
6476 a numpy.dtype or Python type to cast one or more of the DataFrame's
6477 columns to column-specific types.
6478 copy : bool, default True
6479 Return a copy when ``copy=True`` (be very careful setting
6480 ``copy=False`` as changes to values then may propagate to other
6481 pandas objects).
6482
6483 .. note::
6484 The `copy` keyword will change behavior in pandas 3.0.
6485 `Copy-on-Write
6486 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
6487 will be enabled by default, which means that all methods with a
6488 `copy` keyword will use a lazy copy mechanism to defer the copy and
6489 ignore the `copy` keyword. The `copy` keyword will be removed in a
6490 future version of pandas.
6491
6492 You can already get the future behavior and improvements through
6493 enabling copy on write ``pd.options.mode.copy_on_write = True``
6494 errors : {'raise', 'ignore'}, default 'raise'
6495 Control raising of exceptions on invalid data for provided dtype.
6496
6497 - ``raise`` : allow exceptions to be raised
6498 - ``ignore`` : suppress exceptions. On error return original object.
6499
6500 Returns
6501 -------
6502 same type as caller
6503
6504 See Also
6505 --------
6506 to_datetime : Convert argument to datetime.
6507 to_timedelta : Convert argument to timedelta.
6508 to_numeric : Convert argument to a numeric type.
6509 numpy.ndarray.astype : Cast a numpy array to a specified type.
6510
6511 Notes
6512 -----
6513 .. versionchanged:: 2.0.0
6514
6515 Using ``astype`` to convert from timezone-naive dtype to
6516 timezone-aware dtype will raise an exception.
6517 Use :meth:`Series.dt.tz_localize` instead.
6518
6519 Examples
6520 --------
6521 Create a DataFrame:
6522
6523 >>> d = {'col1': [1, 2], 'col2': [3, 4]}
6524 >>> df = pd.DataFrame(data=d)
6525 >>> df.dtypes
6526 col1 int64
6527 col2 int64
6528 dtype: object
6529
6530 Cast all columns to int32:
6531
6532 >>> df.astype('int32').dtypes
6533 col1 int32
6534 col2 int32
6535 dtype: object
6536
6537 Cast col1 to int32 using a dictionary:
6538
6539 >>> df.astype({'col1': 'int32'}).dtypes
6540 col1 int32
6541 col2 int64
6542 dtype: object
6543
6544 Create a series:
6545
6546 >>> ser = pd.Series([1, 2], dtype='int32')
6547 >>> ser
6548 0 1
6549 1 2
6550 dtype: int32
6551 >>> ser.astype('int64')
6552 0 1
6553 1 2
6554 dtype: int64
6555
6556 Convert to categorical type:
6557
6558 >>> ser.astype('category')
6559 0 1
6560 1 2
6561 dtype: category
6562 Categories (2, int32): [1, 2]
6563
6564 Convert to ordered categorical type with custom ordering:
6565
6566 >>> from pandas.api.types import CategoricalDtype
6567 >>> cat_dtype = CategoricalDtype(
6568 ... categories=[2, 1], ordered=True)
6569 >>> ser.astype(cat_dtype)
6570 0 1
6571 1 2
6572 dtype: category
6573 Categories (2, int64): [2 < 1]
6574
6575 Create a series of dates:
6576
6577 >>> ser_date = pd.Series(pd.date_range('20200101', periods=3))
6578 >>> ser_date
6579 0 2020-01-01
6580 1 2020-01-02
6581 2 2020-01-03
6582 dtype: datetime64[ns]
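
With ``errors='ignore'`` a failed cast returns the original object
unchanged (a minimal sketch of the documented passthrough behavior):

>>> ser = pd.Series(['1', 'two'])
>>> ser.astype('int64', errors='ignore')  # doctest: +SKIP
0 1
1 two
dtype: object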
6583 """
6584 if copy and using_copy_on_write():
6585 copy = False
6586
6587 if is_dict_like(dtype):
6588 if self.ndim == 1: # i.e. Series
6589 if len(dtype) > 1 or self.name not in dtype:
6590 raise KeyError(
6591 "Only the Series name can be used for "
6592 "the key in Series dtype mappings."
6593 )
6594 new_type = dtype[self.name]
6595 return self.astype(new_type, copy, errors)
6596
# GH#44417 cast to Series so we can use .iat below, which will be
# robust in case we have duplicate column names
6599 from pandas import Series
6600
6601 dtype_ser = Series(dtype, dtype=object)
6602
6603 for col_name in dtype_ser.index:
6604 if col_name not in self:
6605 raise KeyError(
6606 "Only a column name can be used for the "
6607 "key in a dtype mappings argument. "
6608 f"'{col_name}' not found in columns."
6609 )
6610
6611 dtype_ser = dtype_ser.reindex(self.columns, fill_value=None, copy=False)
6612
6613 results = []
6614 for i, (col_name, col) in enumerate(self.items()):
6615 cdt = dtype_ser.iat[i]
6616 if isna(cdt):
6617 res_col = col.copy(deep=copy)
6618 else:
6619 try:
6620 res_col = col.astype(dtype=cdt, copy=copy, errors=errors)
6621 except ValueError as ex:
6622 ex.args = (
6623 f"{ex}: Error while type casting for column '{col_name}'",
6624 )
6625 raise
6626 results.append(res_col)
6627
6628 elif is_extension_array_dtype(dtype) and self.ndim > 1:
6629 # TODO(EA2D): special case not needed with 2D EAs
6630 dtype = pandas_dtype(dtype)
6631 if isinstance(dtype, ExtensionDtype) and all(
6632 arr.dtype == dtype for arr in self._mgr.arrays
6633 ):
6634 return self.copy(deep=copy)
6635 # GH 18099/22869: columnwise conversion to extension dtype
6636 # GH 24704: self.items handles duplicate column names
6637 results = [
6638 ser.astype(dtype, copy=copy, errors=errors) for _, ser in self.items()
6639 ]
6640
6641 else:
6642 # else, only a single dtype is given
6643 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
6644 res = self._constructor_from_mgr(new_data, axes=new_data.axes)
6645 return res.__finalize__(self, method="astype")
6646
6647 # GH 33113: handle empty frame or series
6648 if not results:
6649 return self.copy(deep=None)
6650
6651 # GH 19920: retain column metadata after concat
6652 result = concat(results, axis=1, copy=False)
6653 # GH#40810 retain subclass
6654 # error: Incompatible types in assignment
6655 # (expression has type "Self", variable has type "DataFrame")
6656 result = self._constructor(result) # type: ignore[assignment]
6657 result.columns = self.columns
6658 result = result.__finalize__(self, method="astype")
6659 # https://github.com/python/mypy/issues/8354
6660 return cast(Self, result)
6661
6662 @final
6663 def copy(self, deep: bool_t | None = True) -> Self:
6664 """
6665 Make a copy of this object's indices and data.
6666
6667 When ``deep=True`` (default), a new object will be created with a
6668 copy of the calling object's data and indices. Modifications to
6669 the data or indices of the copy will not be reflected in the
6670 original object (see notes below).
6671
6672 When ``deep=False``, a new object will be created without copying
6673 the calling object's data or index (only references to the data
6674 and index are copied). Any changes to the data of the original
6675 will be reflected in the shallow copy (and vice versa).
6676
6677 .. note::
6678 The ``deep=False`` behaviour as described above will change
6679 in pandas 3.0. `Copy-on-Write
6680 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
6681 will be enabled by default, which means that the "shallow" copy
that is returned with ``deep=False`` will still avoid making
6683 an eager copy, but changes to the data of the original will *no*
6684 longer be reflected in the shallow copy (or vice versa). Instead,
6685 it makes use of a lazy (deferred) copy mechanism that will copy
the data only when any changes to the original or shallow copy are
6687 made.
6688
6689 You can already get the future behavior and improvements through
6690 enabling copy on write ``pd.options.mode.copy_on_write = True``
6691
6692 Parameters
6693 ----------
6694 deep : bool, default True
6695 Make a deep copy, including a copy of the data and the indices.
6696 With ``deep=False`` neither the indices nor the data are copied.
6697
6698 Returns
6699 -------
6700 Series or DataFrame
6701 Object type matches caller.
6702
6703 Notes
6704 -----
6705 When ``deep=True``, data is copied but actual Python objects
6706 will not be copied recursively, only the reference to the object.
6707 This is in contrast to `copy.deepcopy` in the Standard Library,
6708 which recursively copies object data (see examples below).
6709
6710 While ``Index`` objects are copied when ``deep=True``, the underlying
6711 numpy array is not copied for performance reasons. Since ``Index`` is
6712 immutable, the underlying data can be safely shared and a copy
6713 is not needed.
6714
6715 Since pandas is not thread safe, see the
6716 :ref:`gotchas <gotchas.thread-safety>` when copying in a threading
6717 environment.
6718
6719 When ``copy_on_write`` in pandas config is set to ``True``, the
6720 ``copy_on_write`` config takes effect even when ``deep=False``.
6721 This means that any changes to the copied data would make a new copy
6722 of the data upon write (and vice versa). Changes made to either the
6723 original or copied variable would not be reflected in the counterpart.
6724 See :ref:`Copy_on_Write <copy_on_write>` for more information.
6725
6726 Examples
6727 --------
6728 >>> s = pd.Series([1, 2], index=["a", "b"])
6729 >>> s
6730 a 1
6731 b 2
6732 dtype: int64
6733
6734 >>> s_copy = s.copy()
6735 >>> s_copy
6736 a 1
6737 b 2
6738 dtype: int64
6739
6740 **Shallow copy versus default (deep) copy:**
6741
6742 >>> s = pd.Series([1, 2], index=["a", "b"])
6743 >>> deep = s.copy()
6744 >>> shallow = s.copy(deep=False)
6745
6746 Shallow copy shares data and index with original.
6747
6748 >>> s is shallow
6749 False
6750 >>> s.values is shallow.values and s.index is shallow.index
6751 True
6752
6753 Deep copy has own copy of data and index.
6754
6755 >>> s is deep
6756 False
6757 >>> s.values is deep.values or s.index is deep.index
6758 False
6759
Updates to the data shared by shallow copy and original are reflected
6761 in both (NOTE: this will no longer be true for pandas >= 3.0);
6762 deep copy remains unchanged.
6763
6764 >>> s.iloc[0] = 3
6765 >>> shallow.iloc[1] = 4
6766 >>> s
6767 a 3
6768 b 4
6769 dtype: int64
6770 >>> shallow
6771 a 3
6772 b 4
6773 dtype: int64
6774 >>> deep
6775 a 1
6776 b 2
6777 dtype: int64
6778
6779 Note that when copying an object containing Python objects, a deep copy
6780 will copy the data, but will not do so recursively. Updating a nested
6781 data object will be reflected in the deep copy.
6782
6783 >>> s = pd.Series([[1, 2], [3, 4]])
6784 >>> deep = s.copy()
6785 >>> s[0][0] = 10
6786 >>> s
6787 0 [10, 2]
6788 1 [3, 4]
6789 dtype: object
6790 >>> deep
6791 0 [10, 2]
6792 1 [3, 4]
6793 dtype: object
6794
**When Copy-on-Write is set to true**, the shallow copy is not modified
6796 when the original data is changed:
6797
6798 >>> with pd.option_context("mode.copy_on_write", True):
6799 ... s = pd.Series([1, 2], index=["a", "b"])
6800 ... copy = s.copy(deep=False)
6801 ... s.iloc[0] = 100
6802 ... s
6803 a 100
6804 b 2
6805 dtype: int64
6806 >>> copy
6807 a 1
6808 b 2
6809 dtype: int64
6810 """
6811 data = self._mgr.copy(deep=deep)
6812 self._clear_item_cache()
6813 return self._constructor_from_mgr(data, axes=data.axes).__finalize__(
6814 self, method="copy"
6815 )
6816
6817 @final
6818 def __copy__(self, deep: bool_t = True) -> Self:
6819 return self.copy(deep=deep)
6820
6821 @final
6822 def __deepcopy__(self, memo=None) -> Self:
6823 """
6824 Parameters
6825 ----------
6826 memo, default None
6827 Standard signature. Unused
6828 """
6829 return self.copy(deep=True)
6830
6831 @final
6832 def infer_objects(self, copy: bool_t | None = None) -> Self:
6833 """
6834 Attempt to infer better dtypes for object columns.
6835
6836 Attempts soft conversion of object-dtyped
6837 columns, leaving non-object and unconvertible
6838 columns unchanged. The inference rules are the
6839 same as during normal Series/DataFrame construction.
6840
6841 Parameters
6842 ----------
6843 copy : bool, default True
6844 Whether to make a copy for non-object or non-inferable columns
6845 or Series.
6846
6847 .. note::
6848 The `copy` keyword will change behavior in pandas 3.0.
6849 `Copy-on-Write
6850 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
6851 will be enabled by default, which means that all methods with a
6852 `copy` keyword will use a lazy copy mechanism to defer the copy and
6853 ignore the `copy` keyword. The `copy` keyword will be removed in a
6854 future version of pandas.
6855
6856 You can already get the future behavior and improvements through
6857 enabling copy on write ``pd.options.mode.copy_on_write = True``
6858
6859 Returns
6860 -------
6861 same type as input object
6862
6863 See Also
6864 --------
6865 to_datetime : Convert argument to datetime.
6866 to_timedelta : Convert argument to timedelta.
6867 to_numeric : Convert argument to numeric type.
6868 convert_dtypes : Convert argument to best possible dtype.
6869
6870 Examples
6871 --------
6872 >>> df = pd.DataFrame({"A": ["a", 1, 2, 3]})
6873 >>> df = df.iloc[1:]
6874 >>> df
6875 A
6876 1 1
6877 2 2
6878 3 3
6879
6880 >>> df.dtypes
6881 A object
6882 dtype: object
6883
6884 >>> df.infer_objects().dtypes
6885 A int64
6886 dtype: object
6887 """
6888 new_mgr = self._mgr.convert(copy=copy)
6889 res = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
6890 return res.__finalize__(self, method="infer_objects")
6891
6892 @final
6893 def convert_dtypes(
6894 self,
6895 infer_objects: bool_t = True,
6896 convert_string: bool_t = True,
6897 convert_integer: bool_t = True,
6898 convert_boolean: bool_t = True,
6899 convert_floating: bool_t = True,
6900 dtype_backend: DtypeBackend = "numpy_nullable",
6901 ) -> Self:
6902 """
6903 Convert columns to the best possible dtypes using dtypes supporting ``pd.NA``.
6904
6905 Parameters
6906 ----------
6907 infer_objects : bool, default True
6908 Whether object dtypes should be converted to the best possible types.
6909 convert_string : bool, default True
6910 Whether object dtypes should be converted to ``StringDtype()``.
6911 convert_integer : bool, default True
6912 Whether, if possible, conversion can be done to integer extension types.
convert_boolean : bool, default True
Whether object dtypes should be converted to ``BooleanDtype()``.
convert_floating : bool, default True
Whether, if possible, conversion can be done to floating extension types.
If `convert_integer` is also True, preference will be given to integer
dtypes if the floats can be faithfully cast to integers.
6919 dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
6920 Back-end data type applied to the resultant :class:`DataFrame`
6921 (still experimental). Behaviour is as follows:
6922
6923 * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
6924 (default).
6925 * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
6926 DataFrame.
6927
6928 .. versionadded:: 2.0
6929
6930 Returns
6931 -------
6932 Series or DataFrame
6933 Copy of input object with new dtype.
6934
6935 See Also
6936 --------
6937 infer_objects : Infer dtypes of objects.
6938 to_datetime : Convert argument to datetime.
6939 to_timedelta : Convert argument to timedelta.
6940 to_numeric : Convert argument to a numeric type.
6941
6942 Notes
6943 -----
6944 By default, ``convert_dtypes`` will attempt to convert a Series (or each
6945 Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options
6946 ``convert_string``, ``convert_integer``, ``convert_boolean`` and
6947 ``convert_floating``, it is possible to turn off individual conversions
6948 to ``StringDtype``, the integer extension types, ``BooleanDtype``
6949 or floating extension types, respectively.
6950
6951 For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference
6952 rules as during normal Series/DataFrame construction. Then, if possible,
6953 convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer
6954 or floating extension type, otherwise leave as ``object``.
6955
6956 If the dtype is integer, convert to an appropriate integer extension type.
6957
6958 If the dtype is numeric, and consists of all integers, convert to an
6959 appropriate integer extension type. Otherwise, convert to an
6960 appropriate floating extension type.
6961
6962 In the future, as new dtypes are added that support ``pd.NA``, the results
6963 of this method will change to support those new dtypes.
6964
6965 Examples
6966 --------
6967 >>> df = pd.DataFrame(
6968 ... {
6969 ... "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
6970 ... "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
6971 ... "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")),
6972 ... "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")),
6973 ... "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")),
6974 ... "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
6975 ... }
6976 ... )
6977
6978 Start with a DataFrame with default dtypes.
6979
6980 >>> df
6981 a b c d e f
6982 0 1 x True h 10.0 NaN
6983 1 2 y False i NaN 100.5
6984 2 3 z NaN NaN 20.0 200.0
6985
6986 >>> df.dtypes
6987 a int32
6988 b object
6989 c object
6990 d object
6991 e float64
6992 f float64
6993 dtype: object
6994
6995 Convert the DataFrame to use best possible dtypes.
6996
6997 >>> dfn = df.convert_dtypes()
6998 >>> dfn
6999 a b c d e f
7000 0 1 x True h 10 <NA>
7001 1 2 y False i <NA> 100.5
7002 2 3 z <NA> <NA> 20 200.0
7003
7004 >>> dfn.dtypes
7005 a Int32
7006 b string[python]
7007 c boolean
7008 d string[python]
7009 e Int64
7010 f Float64
7011 dtype: object
7012
7013 Start with a Series of strings and missing data represented by ``np.nan``.
7014
7015 >>> s = pd.Series(["a", "b", np.nan])
7016 >>> s
7017 0 a
7018 1 b
7019 2 NaN
7020 dtype: object
7021
7022 Obtain a Series with dtype ``StringDtype``.
7023
7024 >>> s.convert_dtypes()
7025 0 a
7026 1 b
7027 2 <NA>
7028 dtype: string
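
To get pyarrow-backed dtypes instead, pass ``dtype_backend="pyarrow"``
(a sketch only; this requires the optional ``pyarrow`` dependency):

>>> s.convert_dtypes(dtype_backend="pyarrow")  # doctest: +SKIP
0 a
1 b
2 <NA>
dtype: string[pyarrow]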
7029 """
7030 check_dtype_backend(dtype_backend)
7031 new_mgr = self._mgr.convert_dtypes( # type: ignore[union-attr]
7032 infer_objects=infer_objects,
7033 convert_string=convert_string,
7034 convert_integer=convert_integer,
7035 convert_boolean=convert_boolean,
7036 convert_floating=convert_floating,
7037 dtype_backend=dtype_backend,
7038 )
7039 res = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
7040 return res.__finalize__(self, method="convert_dtypes")
7041
7042 # ----------------------------------------------------------------------
7043 # Filling NA's
7044
7045 def _deprecate_downcast(self, downcast, method_name: str):
7046 # GH#40988
7047 if downcast is not lib.no_default:
7048 warnings.warn(
7049 f"The 'downcast' keyword in {method_name} is deprecated and "
7050 "will be removed in a future version. Use "
7051 "res.infer_objects(copy=False) to infer non-object dtype, or "
7052 "pd.to_numeric with the 'downcast' keyword to downcast numeric "
7053 "results.",
7054 FutureWarning,
7055 stacklevel=find_stack_level(),
7056 )
7057 else:
7058 downcast = None
7059 return downcast
7060
7061 @final
7062 def _pad_or_backfill(
7063 self,
7064 method: Literal["ffill", "bfill", "pad", "backfill"],
7065 *,
7066 axis: None | Axis = None,
7067 inplace: bool_t = False,
7068 limit: None | int = None,
7069 limit_area: Literal["inside", "outside"] | None = None,
7070 downcast: dict | None = None,
7071 ):
7072 if axis is None:
7073 axis = 0
7074 axis = self._get_axis_number(axis)
7075 method = clean_fill_method(method)
7076
7077 if not self._mgr.is_single_block and axis == 1:
7078 # e.g. test_align_fill_method
7079 # TODO(3.0): once downcast is removed, we can do the .T
# in all axis=1 cases, and remove the axis kwarg from mgr.pad_or_backfill.
7081 if inplace:
7082 raise NotImplementedError()
7083 result = self.T._pad_or_backfill(
7084 method=method, limit=limit, limit_area=limit_area
7085 ).T
7086
7087 return result
7088
7089 new_mgr = self._mgr.pad_or_backfill(
7090 method=method,
7091 axis=self._get_block_manager_axis(axis),
7092 limit=limit,
7093 limit_area=limit_area,
7094 inplace=inplace,
7095 downcast=downcast,
7096 )
7097 result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
7098 if inplace:
7099 return self._update_inplace(result)
7100 else:
7101 return result.__finalize__(self, method="fillna")
7102
7103 @overload
7104 def fillna(
7105 self,
7106 value: Hashable | Mapping | Series | DataFrame = ...,
7107 *,
7108 method: FillnaOptions | None = ...,
7109 axis: Axis | None = ...,
7110 inplace: Literal[False] = ...,
7111 limit: int | None = ...,
7112 downcast: dict | None = ...,
7113 ) -> Self:
7114 ...
7115
7116 @overload
7117 def fillna(
7118 self,
7119 value: Hashable | Mapping | Series | DataFrame = ...,
7120 *,
7121 method: FillnaOptions | None = ...,
7122 axis: Axis | None = ...,
7123 inplace: Literal[True],
7124 limit: int | None = ...,
7125 downcast: dict | None = ...,
7126 ) -> None:
7127 ...
7128
7129 @overload
7130 def fillna(
7131 self,
7132 value: Hashable | Mapping | Series | DataFrame = ...,
7133 *,
7134 method: FillnaOptions | None = ...,
7135 axis: Axis | None = ...,
7136 inplace: bool_t = ...,
7137 limit: int | None = ...,
7138 downcast: dict | None = ...,
7139 ) -> Self | None:
7140 ...
7141
7142 @final
7143 @doc(
7144 klass=_shared_doc_kwargs["klass"],
7145 axes_single_arg=_shared_doc_kwargs["axes_single_arg"],
7146 )
7147 def fillna(
7148 self,
7149 value: Hashable | Mapping | Series | DataFrame | None = None,
7150 *,
7151 method: FillnaOptions | None = None,
7152 axis: Axis | None = None,
7153 inplace: bool_t = False,
7154 limit: int | None = None,
7155 downcast: dict | None | lib.NoDefault = lib.no_default,
7156 ) -> Self | None:
7157 """
7158 Fill NA/NaN values using the specified method.
7159
7160 Parameters
7161 ----------
7162 value : scalar, dict, Series, or DataFrame
7163 Value to use to fill holes (e.g. 0), alternately a
7164 dict/Series/DataFrame of values specifying which value to use for
7165 each index (for a Series) or column (for a DataFrame). Values not
7166 in the dict/Series/DataFrame will not be filled. This value cannot
7167 be a list.
7168 method : {{'backfill', 'bfill', 'ffill', None}}, default None
7169 Method to use for filling holes in reindexed Series:
7170
7171 * ffill: propagate last valid observation forward to next valid.
7172 * backfill / bfill: use next valid observation to fill gap.
7173
7174 .. deprecated:: 2.1.0
7175 Use ffill or bfill instead.
7176
7177 axis : {axes_single_arg}
7178 Axis along which to fill missing values. For `Series`
7179 this parameter is unused and defaults to 0.
7180 inplace : bool, default False
7181 If True, fill in-place. Note: this will modify any
7182 other views on this object (e.g., a no-copy slice for a column in a
7183 DataFrame).
7184 limit : int, default None
7185 If method is specified, this is the maximum number of consecutive
7186 NaN values to forward/backward fill. In other words, if there is
7187 a gap with more than this number of consecutive NaNs, it will only
7188 be partially filled. If method is not specified, this is the
7189 maximum number of entries along the entire axis where NaNs will be
7190 filled. Must be greater than 0 if not None.
downcast : dict, default None
7192 A dict of item->dtype of what to downcast if possible,
7193 or the string 'infer' which will try to downcast to an appropriate
7194 equal type (e.g. float64 to int64 if possible).
7195
7196 .. deprecated:: 2.2.0
7197
7198 Returns
7199 -------
7200 {klass} or None
7201 Object with missing values filled or None if ``inplace=True``.
7202
7203 See Also
7204 --------
7205 ffill : Fill values by propagating the last valid observation to next valid.
7206 bfill : Fill values by using the next valid observation to fill the gap.
7207 interpolate : Fill NaN values using interpolation.
7208 reindex : Conform object to new index.
7209 asfreq : Convert TimeSeries to specified frequency.
7210
7211 Examples
7212 --------
7213 >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0],
7214 ... [3, 4, np.nan, 1],
7215 ... [np.nan, np.nan, np.nan, np.nan],
7216 ... [np.nan, 3, np.nan, 4]],
7217 ... columns=list("ABCD"))
7218 >>> df
7219 A B C D
7220 0 NaN 2.0 NaN 0.0
7221 1 3.0 4.0 NaN 1.0
7222 2 NaN NaN NaN NaN
7223 3 NaN 3.0 NaN 4.0
7224
7225 Replace all NaN elements with 0s.
7226
7227 >>> df.fillna(0)
7228 A B C D
7229 0 0.0 2.0 0.0 0.0
7230 1 3.0 4.0 0.0 1.0
7231 2 0.0 0.0 0.0 0.0
7232 3 0.0 3.0 0.0 4.0
7233
7234 Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1,
7235 2, and 3 respectively.
7236
7237 >>> values = {{"A": 0, "B": 1, "C": 2, "D": 3}}
7238 >>> df.fillna(value=values)
7239 A B C D
7240 0 0.0 2.0 2.0 0.0
7241 1 3.0 4.0 2.0 1.0
7242 2 0.0 1.0 2.0 3.0
7243 3 0.0 3.0 2.0 4.0
7244
7245 Only replace the first NaN element.
7246
7247 >>> df.fillna(value=values, limit=1)
7248 A B C D
7249 0 0.0 2.0 2.0 0.0
7250 1 3.0 4.0 NaN 1.0
7251 2 NaN 1.0 NaN 3.0
7252 3 NaN 3.0 NaN 4.0
7253
7254 When filling using a DataFrame, replacement happens along
the same column names and same indices.
7256
7257 >>> df2 = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCE"))
7258 >>> df.fillna(df2)
7259 A B C D
7260 0 0.0 2.0 0.0 0.0
7261 1 3.0 4.0 0.0 1.0
7262 2 0.0 0.0 0.0 NaN
7263 3 0.0 3.0 0.0 4.0
7264
7265 Note that column D is not affected since it is not present in df2.
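
For a ``Series``, a dict maps index labels to fill values; labels not
in the dict are left unfilled:

>>> ser = pd.Series([1.0, np.nan, np.nan], index=["a", "b", "c"])
>>> ser.fillna({"b": 2.0})
a 1.0
b 2.0
c NaN
dtype: float64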
7266 """
7267 inplace = validate_bool_kwarg(inplace, "inplace")
7268 if inplace:
7269 if not PYPY and using_copy_on_write():
7270 if sys.getrefcount(self) <= REF_COUNT:
7271 warnings.warn(
7272 _chained_assignment_method_msg,
7273 ChainedAssignmentError,
7274 stacklevel=2,
7275 )
7276 elif (
7277 not PYPY
7278 and not using_copy_on_write()
7279 and self._is_view_after_cow_rules()
7280 ):
7281 ctr = sys.getrefcount(self)
7282 ref_count = REF_COUNT
7283 if isinstance(self, ABCSeries) and _check_cacher(self):
7284 # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221
7285 ref_count += 1
7286 if ctr <= ref_count:
7287 warnings.warn(
7288 _chained_assignment_warning_method_msg,
7289 FutureWarning,
7290 stacklevel=2,
7291 )
7292
7293 value, method = validate_fillna_kwargs(value, method)
7294 if method is not None:
7295 warnings.warn(
7296 f"{type(self).__name__}.fillna with 'method' is deprecated and "
7297 "will raise in a future version. Use obj.ffill() or obj.bfill() "
7298 "instead.",
7299 FutureWarning,
7300 stacklevel=find_stack_level(),
7301 )
7302
7303 was_no_default = downcast is lib.no_default
7304 downcast = self._deprecate_downcast(downcast, "fillna")
7305
# set the default here, so functions examining the signature
7307 # can detect if something was set (e.g. in groupby) (GH9221)
7308 if axis is None:
7309 axis = 0
7310 axis = self._get_axis_number(axis)
7311
7312 if value is None:
7313 return self._pad_or_backfill(
7314 # error: Argument 1 to "_pad_or_backfill" of "NDFrame" has
7315 # incompatible type "Optional[Literal['backfill', 'bfill', 'ffill',
7316 # 'pad']]"; expected "Literal['ffill', 'bfill', 'pad', 'backfill']"
7317 method, # type: ignore[arg-type]
7318 axis=axis,
7319 limit=limit,
7320 inplace=inplace,
7321 # error: Argument "downcast" to "_fillna_with_method" of "NDFrame"
7322 # has incompatible type "Union[Dict[Any, Any], None,
7323 # Literal[_NoDefault.no_default]]"; expected
7324 # "Optional[Dict[Any, Any]]"
7325 downcast=downcast, # type: ignore[arg-type]
7326 )
7327 else:
7328 if self.ndim == 1:
7329 if isinstance(value, (dict, ABCSeries)):
7330 if not len(value):
7331 # test_fillna_nonscalar
7332 if inplace:
7333 return None
7334 return self.copy(deep=None)
7335 from pandas import Series
7336
7337 value = Series(value)
7338 value = value.reindex(self.index, copy=False)
7339 value = value._values
7340 elif not is_list_like(value):
7341 pass
7342 else:
7343 raise TypeError(
7344 '"value" parameter must be a scalar, dict '
7345 "or Series, but you passed a "
7346 f'"{type(value).__name__}"'
7347 )
7348
7349 new_data = self._mgr.fillna(
7350 value=value, limit=limit, inplace=inplace, downcast=downcast
7351 )
7352
7353 elif isinstance(value, (dict, ABCSeries)):
7354 if axis == 1:
7355 raise NotImplementedError(
7356 "Currently only can fill "
7357 "with dict/Series column "
7358 "by column"
7359 )
7360 if using_copy_on_write():
7361 result = self.copy(deep=None)
7362 else:
7363 result = self if inplace else self.copy()
7364 is_dict = isinstance(downcast, dict)
7365 for k, v in value.items():
7366 if k not in result:
7367 continue
7368
7369 if was_no_default:
7370 downcast_k = lib.no_default
7371 else:
7372 downcast_k = (
7373 # error: Incompatible types in assignment (expression
7374 # has type "Union[Dict[Any, Any], None,
7375 # Literal[_NoDefault.no_default], Any]", variable has
7376 # type "_NoDefault")
7377 downcast # type: ignore[assignment]
7378 if not is_dict
7379 # error: Item "None" of "Optional[Dict[Any, Any]]" has
7380 # no attribute "get"
7381 else downcast.get(k) # type: ignore[union-attr]
7382 )
7383
7384 res_k = result[k].fillna(v, limit=limit, downcast=downcast_k)
7385
7386 if not inplace:
7387 result[k] = res_k
7388 else:
7389 # We can write into our existing column(s) iff dtype
7390 # was preserved.
7391 if isinstance(res_k, ABCSeries):
7392 # i.e. 'k' only shows up once in self.columns
7393 if res_k.dtype == result[k].dtype:
7394 result.loc[:, k] = res_k
7395 else:
7396 # Different dtype -> no way to do inplace.
7397 result[k] = res_k
7398 else:
7399 # see test_fillna_dict_inplace_nonunique_columns
7400 locs = result.columns.get_loc(k)
7401 if isinstance(locs, slice):
7402 locs = np.arange(self.shape[1])[locs]
7403 elif (
7404 isinstance(locs, np.ndarray) and locs.dtype.kind == "b"
7405 ):
7406 locs = locs.nonzero()[0]
7407 elif not (
7408 isinstance(locs, np.ndarray) and locs.dtype.kind == "i"
7409 ):
7410 # Should never be reached, but let's cover our bases
7411 raise NotImplementedError(
7412 "Unexpected get_loc result, please report a bug at "
7413 "https://github.com/pandas-dev/pandas"
7414 )
7415
7416 for i, loc in enumerate(locs):
7417 res_loc = res_k.iloc[:, i]
7418 target = self.iloc[:, loc]
7419
7420 if res_loc.dtype == target.dtype:
7421 result.iloc[:, loc] = res_loc
7422 else:
7423 result.isetitem(loc, res_loc)
7424 if inplace:
7425 return self._update_inplace(result)
7426 else:
7427 return result
7428
7429 elif not is_list_like(value):
7430 if axis == 1:
7431 result = self.T.fillna(value=value, limit=limit).T
7432 new_data = result._mgr
7433 else:
7434 new_data = self._mgr.fillna(
7435 value=value, limit=limit, inplace=inplace, downcast=downcast
7436 )
7437 elif isinstance(value, ABCDataFrame) and self.ndim == 2:
7438 new_data = self.where(self.notna(), value)._mgr
7439 else:
7440 raise ValueError(f"invalid fill value with a {type(value)}")
7441
7442 result = self._constructor_from_mgr(new_data, axes=new_data.axes)
7443 if inplace:
7444 return self._update_inplace(result)
7445 else:
7446 return result.__finalize__(self, method="fillna")
7447
7448 @overload
7449 def ffill(
7450 self,
7451 *,
7452 axis: None | Axis = ...,
7453 inplace: Literal[False] = ...,
7454 limit: None | int = ...,
7455 limit_area: Literal["inside", "outside"] | None = ...,
7456 downcast: dict | None | lib.NoDefault = ...,
7457 ) -> Self:
7458 ...
7459
7460 @overload
7461 def ffill(
7462 self,
7463 *,
7464 axis: None | Axis = ...,
7465 inplace: Literal[True],
7466 limit: None | int = ...,
7467 limit_area: Literal["inside", "outside"] | None = ...,
7468 downcast: dict | None | lib.NoDefault = ...,
7469 ) -> None:
7470 ...
7471
7472 @overload
7473 def ffill(
7474 self,
7475 *,
7476 axis: None | Axis = ...,
7477 inplace: bool_t = ...,
7478 limit: None | int = ...,
7479 limit_area: Literal["inside", "outside"] | None = ...,
7480 downcast: dict | None | lib.NoDefault = ...,
7481 ) -> Self | None:
7482 ...
7483
7484 @final
7485 @doc(
7486 klass=_shared_doc_kwargs["klass"],
7487 axes_single_arg=_shared_doc_kwargs["axes_single_arg"],
7488 )
7489 def ffill(
7490 self,
7491 *,
7492 axis: None | Axis = None,
7493 inplace: bool_t = False,
7494 limit: None | int = None,
7495 limit_area: Literal["inside", "outside"] | None = None,
7496 downcast: dict | None | lib.NoDefault = lib.no_default,
7497 ) -> Self | None:
7498 """
        Fill NA/NaN values by propagating the last valid observation to the next valid.
7500
7501 Parameters
7502 ----------
7503 axis : {axes_single_arg}
7504 Axis along which to fill missing values. For `Series`
7505 this parameter is unused and defaults to 0.
7506 inplace : bool, default False
7507 If True, fill in-place. Note: this will modify any
7508 other views on this object (e.g., a no-copy slice for a column in a
7509 DataFrame).
        limit : int, default None
            Maximum number of consecutive NaN values to forward fill. In other
            words, if there is a gap with more than this number of consecutive
            NaNs, it will only be partially filled. Must be greater than 0 if
            not None.
        limit_area : {{`None`, 'inside', 'outside'}}, default None
            Consecutive NaNs will be filled with this
            restriction.
7520
7521 * ``None``: No fill restriction.
7522 * 'inside': Only fill NaNs surrounded by valid values
7523 (interpolate).
7524 * 'outside': Only fill NaNs outside valid values (extrapolate).
7525
7526 .. versionadded:: 2.2.0
7527
7528 downcast : dict, default is None
7529 A dict of item->dtype of what to downcast if possible,
7530 or the string 'infer' which will try to downcast to an appropriate
7531 equal type (e.g. float64 to int64 if possible).
7532
7533 .. deprecated:: 2.2.0
7534
7535 Returns
7536 -------
7537 {klass} or None
7538 Object with missing values filled or None if ``inplace=True``.
7539
7540 Examples
7541 --------
7542 >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0],
7543 ... [3, 4, np.nan, 1],
7544 ... [np.nan, np.nan, np.nan, np.nan],
7545 ... [np.nan, 3, np.nan, 4]],
7546 ... columns=list("ABCD"))
7547 >>> df
7548 A B C D
7549 0 NaN 2.0 NaN 0.0
7550 1 3.0 4.0 NaN 1.0
7551 2 NaN NaN NaN NaN
7552 3 NaN 3.0 NaN 4.0
7553
7554 >>> df.ffill()
7555 A B C D
7556 0 NaN 2.0 NaN 0.0
7557 1 3.0 4.0 NaN 1.0
7558 2 3.0 4.0 NaN 1.0
7559 3 3.0 3.0 NaN 4.0
7560
7561 >>> ser = pd.Series([1, np.nan, 2, 3])
7562 >>> ser.ffill()
7563 0 1.0
7564 1 1.0
7565 2 2.0
7566 3 3.0
7567 dtype: float64
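
        As a sketch of ``limit_area``, restricting the fill to gaps between
        valid values (output assumes the default float64 inference):

        >>> ser = pd.Series([np.nan, 1, np.nan, np.nan, 3, np.nan])
        >>> ser.ffill(limit_area="inside")
        0    NaN
        1    1.0
        2    1.0
        3    1.0
        4    3.0
        5    NaN
        dtype: float64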
7568 """
7569 downcast = self._deprecate_downcast(downcast, "ffill")
7570 inplace = validate_bool_kwarg(inplace, "inplace")
7571 if inplace:
7572 if not PYPY and using_copy_on_write():
7573 if sys.getrefcount(self) <= REF_COUNT:
7574 warnings.warn(
7575 _chained_assignment_method_msg,
7576 ChainedAssignmentError,
7577 stacklevel=2,
7578 )
7579 elif (
7580 not PYPY
7581 and not using_copy_on_write()
7582 and self._is_view_after_cow_rules()
7583 ):
7584 ctr = sys.getrefcount(self)
7585 ref_count = REF_COUNT
7586 if isinstance(self, ABCSeries) and _check_cacher(self):
7587 # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221
7588 ref_count += 1
7589 if ctr <= ref_count:
7590 warnings.warn(
7591 _chained_assignment_warning_method_msg,
7592 FutureWarning,
7593 stacklevel=2,
7594 )
7595
7596 return self._pad_or_backfill(
7597 "ffill",
7598 axis=axis,
7599 inplace=inplace,
7600 limit=limit,
7601 limit_area=limit_area,
7602 # error: Argument "downcast" to "_fillna_with_method" of "NDFrame"
7603 # has incompatible type "Union[Dict[Any, Any], None,
7604 # Literal[_NoDefault.no_default]]"; expected "Optional[Dict[Any, Any]]"
7605 downcast=downcast, # type: ignore[arg-type]
7606 )
7607
7608 @final
7609 @doc(klass=_shared_doc_kwargs["klass"])
7610 def pad(
7611 self,
7612 *,
7613 axis: None | Axis = None,
7614 inplace: bool_t = False,
7615 limit: None | int = None,
7616 downcast: dict | None | lib.NoDefault = lib.no_default,
7617 ) -> Self | None:
7618 """
        Fill NA/NaN values by propagating the last valid observation to the next valid.
7620
7621 .. deprecated:: 2.0
7622
7623 {klass}.pad is deprecated. Use {klass}.ffill instead.
7624
7625 Returns
7626 -------
7627 {klass} or None
7628 Object with missing values filled or None if ``inplace=True``.
7629
7630 Examples
7631 --------
7632 Please see examples for :meth:`DataFrame.ffill` or :meth:`Series.ffill`.
7633 """
7634 warnings.warn(
7635 "DataFrame.pad/Series.pad is deprecated. Use "
7636 "DataFrame.ffill/Series.ffill instead",
7637 FutureWarning,
7638 stacklevel=find_stack_level(),
7639 )
7640 return self.ffill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
7641
7642 @overload
7643 def bfill(
7644 self,
7645 *,
7646 axis: None | Axis = ...,
7647 inplace: Literal[False] = ...,
7648 limit: None | int = ...,
7649 limit_area: Literal["inside", "outside"] | None = ...,
7650 downcast: dict | None | lib.NoDefault = ...,
7651 ) -> Self:
7652 ...
7653
7654 @overload
7655 def bfill(
7656 self,
7657 *,
7658 axis: None | Axis = ...,
7659 inplace: Literal[True],
        limit: None | int = ...,
        limit_area: Literal["inside", "outside"] | None = ...,
7661 downcast: dict | None | lib.NoDefault = ...,
7662 ) -> None:
7663 ...
7664
7665 @overload
7666 def bfill(
7667 self,
7668 *,
7669 axis: None | Axis = ...,
7670 inplace: bool_t = ...,
7671 limit: None | int = ...,
7672 limit_area: Literal["inside", "outside"] | None = ...,
7673 downcast: dict | None | lib.NoDefault = ...,
7674 ) -> Self | None:
7675 ...
7676
7677 @final
7678 @doc(
7679 klass=_shared_doc_kwargs["klass"],
7680 axes_single_arg=_shared_doc_kwargs["axes_single_arg"],
7681 )
7682 def bfill(
7683 self,
7684 *,
7685 axis: None | Axis = None,
7686 inplace: bool_t = False,
7687 limit: None | int = None,
7688 limit_area: Literal["inside", "outside"] | None = None,
7689 downcast: dict | None | lib.NoDefault = lib.no_default,
7690 ) -> Self | None:
7691 """
7692 Fill NA/NaN values by using the next valid observation to fill the gap.
7693
7694 Parameters
7695 ----------
7696 axis : {axes_single_arg}
7697 Axis along which to fill missing values. For `Series`
7698 this parameter is unused and defaults to 0.
7699 inplace : bool, default False
7700 If True, fill in-place. Note: this will modify any
7701 other views on this object (e.g., a no-copy slice for a column in a
7702 DataFrame).
        limit : int, default None
            Maximum number of consecutive NaN values to backward fill. In other
            words, if there is a gap with more than this number of consecutive
            NaNs, it will only be partially filled. Must be greater than 0 if
            not None.
        limit_area : {{`None`, 'inside', 'outside'}}, default None
            Consecutive NaNs will be filled with this
            restriction.
7713
7714 * ``None``: No fill restriction.
7715 * 'inside': Only fill NaNs surrounded by valid values
7716 (interpolate).
7717 * 'outside': Only fill NaNs outside valid values (extrapolate).
7718
7719 .. versionadded:: 2.2.0
7720
7721 downcast : dict, default is None
7722 A dict of item->dtype of what to downcast if possible,
7723 or the string 'infer' which will try to downcast to an appropriate
7724 equal type (e.g. float64 to int64 if possible).
7725
7726 .. deprecated:: 2.2.0
7727
7728 Returns
7729 -------
7730 {klass} or None
7731 Object with missing values filled or None if ``inplace=True``.
7732
7733 Examples
7734 --------
7735 For Series:
7736
7737 >>> s = pd.Series([1, None, None, 2])
7738 >>> s.bfill()
7739 0 1.0
7740 1 2.0
7741 2 2.0
7742 3 2.0
7743 dtype: float64
7744 >>> s.bfill(limit=1)
7745 0 1.0
7746 1 NaN
7747 2 2.0
7748 3 2.0
7749 dtype: float64
7750
7751 With DataFrame:
7752
7753 >>> df = pd.DataFrame({{'A': [1, None, None, 4], 'B': [None, 5, None, 7]}})
7754 >>> df
7755 A B
7756 0 1.0 NaN
7757 1 NaN 5.0
7758 2 NaN NaN
7759 3 4.0 7.0
7760 >>> df.bfill()
7761 A B
7762 0 1.0 5.0
7763 1 4.0 5.0
7764 2 4.0 7.0
7765 3 4.0 7.0
7766 >>> df.bfill(limit=1)
7767 A B
7768 0 1.0 5.0
7769 1 NaN 5.0
7770 2 4.0 7.0
7771 3 4.0 7.0
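
        As a sketch of ``limit_area``, filling only outside the first and last
        valid values:

        >>> s = pd.Series([np.nan, 1, np.nan, np.nan, 3, np.nan])
        >>> s.bfill(limit_area="outside")
        0    1.0
        1    1.0
        2    NaN
        3    NaN
        4    3.0
        5    NaN
        dtype: float64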
7772 """
7773 downcast = self._deprecate_downcast(downcast, "bfill")
7774 inplace = validate_bool_kwarg(inplace, "inplace")
7775 if inplace:
7776 if not PYPY and using_copy_on_write():
7777 if sys.getrefcount(self) <= REF_COUNT:
7778 warnings.warn(
7779 _chained_assignment_method_msg,
7780 ChainedAssignmentError,
7781 stacklevel=2,
7782 )
7783 elif (
7784 not PYPY
7785 and not using_copy_on_write()
7786 and self._is_view_after_cow_rules()
7787 ):
7788 ctr = sys.getrefcount(self)
7789 ref_count = REF_COUNT
7790 if isinstance(self, ABCSeries) and _check_cacher(self):
7791 # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221
7792 ref_count += 1
7793 if ctr <= ref_count:
7794 warnings.warn(
7795 _chained_assignment_warning_method_msg,
7796 FutureWarning,
7797 stacklevel=2,
7798 )
7799
7800 return self._pad_or_backfill(
7801 "bfill",
7802 axis=axis,
7803 inplace=inplace,
7804 limit=limit,
7805 limit_area=limit_area,
7806 # error: Argument "downcast" to "_fillna_with_method" of "NDFrame"
7807 # has incompatible type "Union[Dict[Any, Any], None,
7808 # Literal[_NoDefault.no_default]]"; expected "Optional[Dict[Any, Any]]"
7809 downcast=downcast, # type: ignore[arg-type]
7810 )
7811
7812 @final
7813 @doc(klass=_shared_doc_kwargs["klass"])
7814 def backfill(
7815 self,
7816 *,
7817 axis: None | Axis = None,
7818 inplace: bool_t = False,
7819 limit: None | int = None,
7820 downcast: dict | None | lib.NoDefault = lib.no_default,
7821 ) -> Self | None:
7822 """
7823 Fill NA/NaN values by using the next valid observation to fill the gap.
7824
7825 .. deprecated:: 2.0
7826
7827 {klass}.backfill is deprecated. Use {klass}.bfill instead.
7828
7829 Returns
7830 -------
7831 {klass} or None
7832 Object with missing values filled or None if ``inplace=True``.
7833
7834 Examples
7835 --------
7836 Please see examples for :meth:`DataFrame.bfill` or :meth:`Series.bfill`.
7837 """
7838 warnings.warn(
7839 "DataFrame.backfill/Series.backfill is deprecated. Use "
7840 "DataFrame.bfill/Series.bfill instead",
7841 FutureWarning,
7842 stacklevel=find_stack_level(),
7843 )
7844 return self.bfill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
7845
7846 @overload
7847 def replace(
7848 self,
7849 to_replace=...,
7850 value=...,
7851 *,
7852 inplace: Literal[False] = ...,
7853 limit: int | None = ...,
7854 regex: bool_t = ...,
7855 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
7856 ) -> Self:
7857 ...
7858
7859 @overload
7860 def replace(
7861 self,
7862 to_replace=...,
7863 value=...,
7864 *,
7865 inplace: Literal[True],
7866 limit: int | None = ...,
7867 regex: bool_t = ...,
7868 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
7869 ) -> None:
7870 ...
7871
7872 @overload
7873 def replace(
7874 self,
7875 to_replace=...,
7876 value=...,
7877 *,
7878 inplace: bool_t = ...,
7879 limit: int | None = ...,
7880 regex: bool_t = ...,
7881 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
7882 ) -> Self | None:
7883 ...
7884
7885 @final
7886 @doc(
7887 _shared_docs["replace"],
7888 klass=_shared_doc_kwargs["klass"],
7889 inplace=_shared_doc_kwargs["inplace"],
7890 )
7891 def replace(
7892 self,
7893 to_replace=None,
7894 value=lib.no_default,
7895 *,
7896 inplace: bool_t = False,
7897 limit: int | None = None,
7898 regex: bool_t = False,
7899 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = lib.no_default,
7900 ) -> Self | None:
7901 if method is not lib.no_default:
7902 warnings.warn(
7903 # GH#33302
7904 f"The 'method' keyword in {type(self).__name__}.replace is "
7905 "deprecated and will be removed in a future version.",
7906 FutureWarning,
7907 stacklevel=find_stack_level(),
7908 )
7909 elif limit is not None:
7910 warnings.warn(
7911 # GH#33302
7912 f"The 'limit' keyword in {type(self).__name__}.replace is "
7913 "deprecated and will be removed in a future version.",
7914 FutureWarning,
7915 stacklevel=find_stack_level(),
7916 )
7917 if (
7918 value is lib.no_default
7919 and method is lib.no_default
7920 and not is_dict_like(to_replace)
7921 and regex is False
7922 ):
7923 # case that goes through _replace_single and defaults to method="pad"
7924 warnings.warn(
7925 # GH#33302
7926 f"{type(self).__name__}.replace without 'value' and with "
7927 "non-dict-like 'to_replace' is deprecated "
7928 "and will raise in a future version. "
7929 "Explicitly specify the new values instead.",
7930 FutureWarning,
7931 stacklevel=find_stack_level(),
7932 )
7933
7934 if not (
7935 is_scalar(to_replace)
7936 or is_re_compilable(to_replace)
7937 or is_list_like(to_replace)
7938 ):
7939 raise TypeError(
7940 "Expecting 'to_replace' to be either a scalar, array-like, "
7941 "dict or None, got invalid type "
7942 f"{repr(type(to_replace).__name__)}"
7943 )
7944
7945 inplace = validate_bool_kwarg(inplace, "inplace")
7946 if inplace:
7947 if not PYPY and using_copy_on_write():
7948 if sys.getrefcount(self) <= REF_COUNT:
7949 warnings.warn(
7950 _chained_assignment_method_msg,
7951 ChainedAssignmentError,
7952 stacklevel=2,
7953 )
7954 elif (
7955 not PYPY
7956 and not using_copy_on_write()
7957 and self._is_view_after_cow_rules()
7958 ):
7959 ctr = sys.getrefcount(self)
7960 ref_count = REF_COUNT
7961 if isinstance(self, ABCSeries) and _check_cacher(self):
7962 # in non-CoW mode, chained Series access will populate the
7963 # `_item_cache` which results in an increased ref count not below
7964 # the threshold, while we still need to warn. We detect this case
7965 # of a Series derived from a DataFrame through the presence of
7966 # checking the `_cacher`
7967 ref_count += 1
7968 if ctr <= ref_count:
7969 warnings.warn(
7970 _chained_assignment_warning_method_msg,
7971 FutureWarning,
7972 stacklevel=2,
7973 )
7974
7975 if not is_bool(regex) and to_replace is not None:
7976 raise ValueError("'to_replace' must be 'None' if 'regex' is not a bool")
7977
7978 if value is lib.no_default or method is not lib.no_default:
7979 # GH#36984 if the user explicitly passes value=None we want to
7980 # respect that. We have the corner case where the user explicitly
7981 # passes value=None *and* a method, which we interpret as meaning
7982 # they want the (documented) default behavior.
7983 if method is lib.no_default:
7984 # TODO: get this to show up as the default in the docs?
7985 method = "pad"
7986
7987 # passing a single value that is scalar like
7988 # when value is None (GH5319), for compat
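            # e.g. (a sketch of this deprecated path) pd.Series([1, 2, 3]).replace(2)
            # pads the previous value forward, giving [1, 1, 3]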
7989 if not is_dict_like(to_replace) and not is_dict_like(regex):
7990 to_replace = [to_replace]
7991
7992 if isinstance(to_replace, (tuple, list)):
                # TODO: Consider copy-on-write for non-replaced columns here
7994 if isinstance(self, ABCDataFrame):
7995 from pandas import Series
7996
7997 result = self.apply(
7998 Series._replace_single,
7999 args=(to_replace, method, inplace, limit),
8000 )
8001 if inplace:
8002 return None
8003 return result
8004 return self._replace_single(to_replace, method, inplace, limit)
8005
8006 if not is_dict_like(to_replace):
8007 if not is_dict_like(regex):
8008 raise TypeError(
8009 'If "to_replace" and "value" are both None '
8010 'and "to_replace" is not a list, then '
8011 "regex must be a mapping"
8012 )
8013 to_replace = regex
8014 regex = True
8015
8016 items = list(to_replace.items())
8017 if items:
8018 keys, values = zip(*items)
8019 else:
8020 # error: Incompatible types in assignment (expression has type
8021 # "list[Never]", variable has type "tuple[Any, ...]")
8022 keys, values = ([], []) # type: ignore[assignment]
8023
8024 are_mappings = [is_dict_like(v) for v in values]
8025
8026 if any(are_mappings):
8027 if not all(are_mappings):
8028 raise TypeError(
8029 "If a nested mapping is passed, all values "
8030 "of the top level mapping must be mappings"
8031 )
8032 # passed a nested dict/Series
8033 to_rep_dict = {}
8034 value_dict = {}
8035
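                # e.g. (sketch) to_replace={"A": {0: 100, 4: 400}} splits into
                # to_rep_dict={"A": [0, 4]} and value_dict={"A": [100, 400]}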
8036 for k, v in items:
8037 # error: Incompatible types in assignment (expression has type
8038 # "list[Never]", variable has type "tuple[Any, ...]")
8039 keys, values = list(zip(*v.items())) or ( # type: ignore[assignment]
8040 [],
8041 [],
8042 )
8043
8044 to_rep_dict[k] = list(keys)
8045 value_dict[k] = list(values)
8046
8047 to_replace, value = to_rep_dict, value_dict
8048 else:
8049 to_replace, value = keys, values
8050
8051 return self.replace(
8052 to_replace, value, inplace=inplace, limit=limit, regex=regex
8053 )
8054 else:
8055 # need a non-zero len on all axes
8056 if not self.size:
8057 if inplace:
8058 return None
8059 return self.copy(deep=None)
8060
8061 if is_dict_like(to_replace):
8062 if is_dict_like(value): # {'A' : NA} -> {'A' : 0}
                    # Note: Checking below for `in foo.keys()` instead of
                    # `in foo` is needed for when we have a Series and not a dict
8065 mapping = {
8066 col: (to_replace[col], value[col])
8067 for col in to_replace.keys()
8068 if col in value.keys() and col in self
8069 }
8070 return self._replace_columnwise(mapping, inplace, regex)
8071
8072 # {'A': NA} -> 0
8073 elif not is_list_like(value):
8074 # Operate column-wise
8075 if self.ndim == 1:
8076 raise ValueError(
8077 "Series.replace cannot use dict-like to_replace "
8078 "and non-None value"
8079 )
8080 mapping = {
8081 col: (to_rep, value) for col, to_rep in to_replace.items()
8082 }
8083 return self._replace_columnwise(mapping, inplace, regex)
8084 else:
8085 raise TypeError("value argument must be scalar, dict, or Series")
8086
8087 elif is_list_like(to_replace):
8088 if not is_list_like(value):
8089 # e.g. to_replace = [NA, ''] and value is 0,
8090 # so we replace NA with 0 and then replace '' with 0
8091 value = [value] * len(to_replace)
8092
8093 # e.g. we have to_replace = [NA, ''] and value = [0, 'missing']
8094 if len(to_replace) != len(value):
8095 raise ValueError(
8096 f"Replacement lists must match in length. "
8097 f"Expecting {len(to_replace)} got {len(value)} "
8098 )
8099 new_data = self._mgr.replace_list(
8100 src_list=to_replace,
8101 dest_list=value,
8102 inplace=inplace,
8103 regex=regex,
8104 )
8105
8106 elif to_replace is None:
8107 if not (
8108 is_re_compilable(regex)
8109 or is_list_like(regex)
8110 or is_dict_like(regex)
8111 ):
8112 raise TypeError(
8113 f"'regex' must be a string or a compiled regular expression "
8114 f"or a list or dict of strings or regular expressions, "
8115 f"you passed a {repr(type(regex).__name__)}"
8116 )
8117 return self.replace(
8118 regex, value, inplace=inplace, limit=limit, regex=True
8119 )
8120 else:
8121 # dest iterable dict-like
8122 if is_dict_like(value): # NA -> {'A' : 0, 'B' : -1}
8123 # Operate column-wise
8124 if self.ndim == 1:
8125 raise ValueError(
8126 "Series.replace cannot use dict-value and "
8127 "non-None to_replace"
8128 )
8129 mapping = {col: (to_replace, val) for col, val in value.items()}
8130 return self._replace_columnwise(mapping, inplace, regex)
8131
8132 elif not is_list_like(value): # NA -> 0
8133 regex = should_use_regex(regex, to_replace)
8134 if regex:
8135 new_data = self._mgr.replace_regex(
8136 to_replace=to_replace,
8137 value=value,
8138 inplace=inplace,
8139 )
8140 else:
8141 new_data = self._mgr.replace(
8142 to_replace=to_replace, value=value, inplace=inplace
8143 )
8144 else:
8145 raise TypeError(
8146 f'Invalid "to_replace" type: {repr(type(to_replace).__name__)}'
8147 )
8148
8149 result = self._constructor_from_mgr(new_data, axes=new_data.axes)
8150 if inplace:
8151 return self._update_inplace(result)
8152 else:
8153 return result.__finalize__(self, method="replace")
8154
8155 @overload
8156 def interpolate(
8157 self,
8158 method: InterpolateOptions = ...,
8159 *,
8160 axis: Axis = ...,
8161 limit: int | None = ...,
8162 inplace: Literal[False] = ...,
8163 limit_direction: Literal["forward", "backward", "both"] | None = ...,
8164 limit_area: Literal["inside", "outside"] | None = ...,
8165 downcast: Literal["infer"] | None | lib.NoDefault = ...,
8166 **kwargs,
8167 ) -> Self:
8168 ...
8169
8170 @overload
8171 def interpolate(
8172 self,
8173 method: InterpolateOptions = ...,
8174 *,
8175 axis: Axis = ...,
8176 limit: int | None = ...,
8177 inplace: Literal[True],
8178 limit_direction: Literal["forward", "backward", "both"] | None = ...,
8179 limit_area: Literal["inside", "outside"] | None = ...,
8180 downcast: Literal["infer"] | None | lib.NoDefault = ...,
8181 **kwargs,
8182 ) -> None:
8183 ...
8184
8185 @overload
8186 def interpolate(
8187 self,
8188 method: InterpolateOptions = ...,
8189 *,
8190 axis: Axis = ...,
8191 limit: int | None = ...,
8192 inplace: bool_t = ...,
8193 limit_direction: Literal["forward", "backward", "both"] | None = ...,
8194 limit_area: Literal["inside", "outside"] | None = ...,
8195 downcast: Literal["infer"] | None | lib.NoDefault = ...,
8196 **kwargs,
8197 ) -> Self | None:
8198 ...
8199
8200 @final
8201 def interpolate(
8202 self,
8203 method: InterpolateOptions = "linear",
8204 *,
8205 axis: Axis = 0,
8206 limit: int | None = None,
8207 inplace: bool_t = False,
8208 limit_direction: Literal["forward", "backward", "both"] | None = None,
8209 limit_area: Literal["inside", "outside"] | None = None,
8210 downcast: Literal["infer"] | None | lib.NoDefault = lib.no_default,
8211 **kwargs,
8212 ) -> Self | None:
8213 """
8214 Fill NaN values using an interpolation method.
8215
8216 Please note that only ``method='linear'`` is supported for
8217 DataFrame/Series with a MultiIndex.
8218
8219 Parameters
8220 ----------
8221 method : str, default 'linear'
8222 Interpolation technique to use. One of:
8223
8224 * 'linear': Ignore the index and treat the values as equally
8225 spaced. This is the only method supported on MultiIndexes.
            * 'time': Works on daily and higher resolution data to interpolate
              over a given length of interval.
8228 * 'index', 'values': use the actual numerical values of the index.
8229 * 'pad': Fill in NaNs using existing values.
8230 * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
8231 'barycentric', 'polynomial': Passed to
8232 `scipy.interpolate.interp1d`, whereas 'spline' is passed to
8233 `scipy.interpolate.UnivariateSpline`. These methods use the numerical
8234 values of the index. Both 'polynomial' and 'spline' require that
8235 you also specify an `order` (int), e.g.
              ``df.interpolate(method='polynomial', order=5)``. Note that the
              `slinear` method in pandas refers to the SciPy first order
              `spline`, not the pandas first order `spline`.
8239 * 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima',
8240 'cubicspline': Wrappers around the SciPy interpolation methods of
8241 similar names. See `Notes`.
8242 * 'from_derivatives': Refers to
8243 `scipy.interpolate.BPoly.from_derivatives`.
8244
        axis : {{0 or 'index', 1 or 'columns', None}}, default 0
            Axis to interpolate along. For `Series` this parameter is unused
            and defaults to 0.
8248 limit : int, optional
8249 Maximum number of consecutive NaNs to fill. Must be greater than
8250 0.
8251 inplace : bool, default False
8252 Update the data in place if possible.
        limit_direction : {{'forward', 'backward', 'both'}}, optional
            Consecutive NaNs will be filled in this direction.

            If limit is specified:
                * If 'method' is 'pad' or 'ffill', 'limit_direction' must be 'forward'.
                * If 'method' is 'backfill' or 'bfill', 'limit_direction' must be
                  'backward'.

            If 'limit' is not specified:
                * If 'method' is 'backfill' or 'bfill', the default is 'backward'
                * else the default is 'forward'

            Raises ValueError if `limit_direction` is 'forward' or 'both' and
            method is 'backfill' or 'bfill'.
            Raises ValueError if `limit_direction` is 'backward' or 'both' and
            method is 'pad' or 'ffill'.
8269
        limit_area : {{`None`, 'inside', 'outside'}}, default None
            Consecutive NaNs will be filled with this restriction.
8273
8274 * ``None``: No fill restriction.
8275 * 'inside': Only fill NaNs surrounded by valid values
8276 (interpolate).
8277 * 'outside': Only fill NaNs outside valid values (extrapolate).
8278
        downcast : 'infer' or None, default None
8280 Downcast dtypes if possible.
8281
8282 .. deprecated:: 2.1.0
8283
8284 ``**kwargs`` : optional
8285 Keyword arguments to pass on to the interpolating function.
8286
8287 Returns
8288 -------
8289 Series or DataFrame or None
8290 Returns the same object type as the caller, interpolated at
8291 some or all ``NaN`` values or None if ``inplace=True``.
8292
8293 See Also
8294 --------
8295 fillna : Fill missing values using different methods.
8296 scipy.interpolate.Akima1DInterpolator : Piecewise cubic polynomials
8297 (Akima interpolator).
8298 scipy.interpolate.BPoly.from_derivatives : Piecewise polynomial in the
8299 Bernstein basis.
8300 scipy.interpolate.interp1d : Interpolate a 1-D function.
8301 scipy.interpolate.KroghInterpolator : Interpolate polynomial (Krogh
8302 interpolator).
8303 scipy.interpolate.PchipInterpolator : PCHIP 1-d monotonic cubic
8304 interpolation.
8305 scipy.interpolate.CubicSpline : Cubic spline data interpolator.
8306
8307 Notes
8308 -----
8309 The 'krogh', 'piecewise_polynomial', 'spline', 'pchip' and 'akima'
8310 methods are wrappers around the respective SciPy implementations of
8311 similar names. These use the actual numerical values of the index.
8312 For more information on their behavior, see the
8313 `SciPy documentation
8314 <https://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation>`__.
8315
8316 Examples
8317 --------
8318 Filling in ``NaN`` in a :class:`~pandas.Series` via linear
8319 interpolation.
8320
8321 >>> s = pd.Series([0, 1, np.nan, 3])
8322 >>> s
8323 0 0.0
8324 1 1.0
8325 2 NaN
8326 3 3.0
8327 dtype: float64
8328 >>> s.interpolate()
8329 0 0.0
8330 1 1.0
8331 2 2.0
8332 3 3.0
8333 dtype: float64
8334
8335 Filling in ``NaN`` in a Series via polynomial interpolation or splines:
8336 Both 'polynomial' and 'spline' methods require that you also specify
8337 an ``order`` (int).
8338
8339 >>> s = pd.Series([0, 2, np.nan, 8])
8340 >>> s.interpolate(method='polynomial', order=2)
8341 0 0.000000
8342 1 2.000000
8343 2 4.666667
8344 3 8.000000
8345 dtype: float64
8346
8347 Fill the DataFrame forward (that is, going down) along each column
8348 using linear interpolation.
8349
8350 Note how the last entry in column 'a' is interpolated differently,
8351 because there is no entry after it to use for interpolation.
8352 Note how the first entry in column 'b' remains ``NaN``, because there
8353 is no entry before it to use for interpolation.
8354
8355 >>> df = pd.DataFrame([(0.0, np.nan, -1.0, 1.0),
8356 ... (np.nan, 2.0, np.nan, np.nan),
8357 ... (2.0, 3.0, np.nan, 9.0),
8358 ... (np.nan, 4.0, -4.0, 16.0)],
8359 ... columns=list('abcd'))
8360 >>> df
8361 a b c d
8362 0 0.0 NaN -1.0 1.0
8363 1 NaN 2.0 NaN NaN
8364 2 2.0 3.0 NaN 9.0
8365 3 NaN 4.0 -4.0 16.0
8366 >>> df.interpolate(method='linear', limit_direction='forward', axis=0)
8367 a b c d
8368 0 0.0 NaN -1.0 1.0
8369 1 1.0 2.0 -2.0 5.0
8370 2 2.0 3.0 -3.0 9.0
8371 3 2.0 4.0 -4.0 16.0
8372
8373 Using polynomial interpolation.
8374
8375 >>> df['d'].interpolate(method='polynomial', order=2)
8376 0 1.0
8377 1 4.0
8378 2 9.0
8379 3 16.0
8380 Name: d, dtype: float64
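
        As a sketch, ``limit`` caps how many consecutive ``NaN`` values are
        filled (here in the default forward direction):

        >>> s = pd.Series([np.nan, 1.0, np.nan, np.nan, np.nan, 5.0])
        >>> s.interpolate(limit=1)
        0    NaN
        1    1.0
        2    2.0
        3    NaN
        4    NaN
        5    5.0
        dtype: float64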
8381 """
8382 if downcast is not lib.no_default:
8383 # GH#40988
8384 warnings.warn(
8385 f"The 'downcast' keyword in {type(self).__name__}.interpolate "
8386 "is deprecated and will be removed in a future version. "
8387 "Call result.infer_objects(copy=False) on the result instead.",
8388 FutureWarning,
8389 stacklevel=find_stack_level(),
8390 )
8391 else:
8392 downcast = None
8393 if downcast is not None and downcast != "infer":
8394 raise ValueError("downcast must be either None or 'infer'")
8395
8396 inplace = validate_bool_kwarg(inplace, "inplace")
8397
8398 if inplace:
8399 if not PYPY and using_copy_on_write():
8400 if sys.getrefcount(self) <= REF_COUNT:
8401 warnings.warn(
8402 _chained_assignment_method_msg,
8403 ChainedAssignmentError,
8404 stacklevel=2,
8405 )
8406 elif (
8407 not PYPY
8408 and not using_copy_on_write()
8409 and self._is_view_after_cow_rules()
8410 ):
8411 ctr = sys.getrefcount(self)
8412 ref_count = REF_COUNT
8413 if isinstance(self, ABCSeries) and _check_cacher(self):
8414 # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221
8415 ref_count += 1
8416 if ctr <= ref_count:
8417 warnings.warn(
8418 _chained_assignment_warning_method_msg,
8419 FutureWarning,
8420 stacklevel=2,
8421 )
8422
8423 axis = self._get_axis_number(axis)
8424
8425 if self.empty:
8426 if inplace:
8427 return None
8428 return self.copy()
8429
8430 if not isinstance(method, str):
8431 raise ValueError("'method' should be a string, not None.")
8432
8433 fillna_methods = ["ffill", "bfill", "pad", "backfill"]
8434 if method.lower() in fillna_methods:
8435 # GH#53581
8436 warnings.warn(
8437 f"{type(self).__name__}.interpolate with method={method} is "
8438 "deprecated and will raise in a future version. "
8439 "Use obj.ffill() or obj.bfill() instead.",
8440 FutureWarning,
8441 stacklevel=find_stack_level(),
8442 )
8443 obj, should_transpose = self, False
8444 else:
8445 obj, should_transpose = (self.T, True) if axis == 1 else (self, False)
8446 if np.any(obj.dtypes == object):
8447 # GH#53631
8448 if not (obj.ndim == 2 and np.all(obj.dtypes == object)):
8449 # don't warn in cases that already raise
8450 warnings.warn(
8451 f"{type(self).__name__}.interpolate with object dtype is "
8452 "deprecated and will raise in a future version. Call "
8453 "obj.infer_objects(copy=False) before interpolating instead.",
8454 FutureWarning,
8455 stacklevel=find_stack_level(),
8456 )
8457
8458 if method in fillna_methods and "fill_value" in kwargs:
8459 raise ValueError(
8460 "'fill_value' is not a valid keyword for "
8461 f"{type(self).__name__}.interpolate with method from "
8462 f"{fillna_methods}"
8463 )
8464
8465 if isinstance(obj.index, MultiIndex) and method != "linear":
8466 raise ValueError(
8467 "Only `method=linear` interpolation is supported on MultiIndexes."
8468 )
8469
8470 limit_direction = missing.infer_limit_direction(limit_direction, method)
8471
8472 if obj.ndim == 2 and np.all(obj.dtypes == object):
8473 raise TypeError(
8474 "Cannot interpolate with all object-dtype columns "
8475 "in the DataFrame. Try setting at least one "
8476 "column to a numeric dtype."
8477 )
8478
8479 if method.lower() in fillna_methods:
8480 # TODO(3.0): remove this case
8481 # TODO: warn/raise on limit_direction or kwargs which are ignored?
8482 # as of 2023-06-26 no tests get here with either
8483 if not self._mgr.is_single_block and axis == 1:
8484 # GH#53898
8485 if inplace:
8486 raise NotImplementedError()
8487 obj, axis, should_transpose = self.T, 1 - axis, True
8488
8489 new_data = obj._mgr.pad_or_backfill(
8490 method=method,
8491 axis=self._get_block_manager_axis(axis),
8492 limit=limit,
8493 limit_area=limit_area,
8494 inplace=inplace,
8495 downcast=downcast,
8496 )
8497 else:
8498 index = missing.get_interp_index(method, obj.index)
8499 new_data = obj._mgr.interpolate(
8500 method=method,
8501 index=index,
8502 limit=limit,
8503 limit_direction=limit_direction,
8504 limit_area=limit_area,
8505 inplace=inplace,
8506 downcast=downcast,
8507 **kwargs,
8508 )
8509
8510 result = self._constructor_from_mgr(new_data, axes=new_data.axes)
8511 if should_transpose:
8512 result = result.T
8513 if inplace:
8514 return self._update_inplace(result)
8515 else:
8516 return result.__finalize__(self, method="interpolate")
8517
8518 # ----------------------------------------------------------------------
    # Timeseries Methods
8520
8521 @final
8522 def asof(self, where, subset=None):
8523 """
8524 Return the last row(s) without any NaNs before `where`.
8525
        The last row (for each element in `where`, if list) without any
        NaN is taken.
        In case of a :class:`~pandas.DataFrame`, the last row without NaN
        is taken, considering only the subset of columns (if not `None`).

        If there is no good value, NaN is returned for a Series or
        a Series of NaN values for a DataFrame.
8533
8534 Parameters
8535 ----------
8536 where : date or array-like of dates
8537 Date(s) before which the last row(s) are returned.
8538 subset : str or array-like of str, default `None`
8539 For DataFrame, if not `None`, only use these columns to
8540 check for NaNs.
8541
8542 Returns
8543 -------
8544 scalar, Series, or DataFrame
8545
8546 The return can be:
8547
8548 * scalar : when `self` is a Series and `where` is a scalar
8549 * Series: when `self` is a Series and `where` is an array-like,
8550 or when `self` is a DataFrame and `where` is a scalar
8551 * DataFrame : when `self` is a DataFrame and `where` is an
8552 array-like
8553
8554 See Also
8555 --------
8556 merge_asof : Perform an asof merge. Similar to left join.
8557
8558 Notes
8559 -----
8560 Dates are assumed to be sorted. Raises if this is not the case.
8561
8562 Examples
8563 --------
8564 A Series and a scalar `where`.
8565
8566 >>> s = pd.Series([1, 2, np.nan, 4], index=[10, 20, 30, 40])
8567 >>> s
8568 10 1.0
8569 20 2.0
8570 30 NaN
8571 40 4.0
8572 dtype: float64
8573
8574 >>> s.asof(20)
8575 2.0
8576
8577 For a sequence `where`, a Series is returned. The first value is
8578 NaN, because the first element of `where` is before the first
8579 index value.
8580
8581 >>> s.asof([5, 20])
8582 5 NaN
8583 20 2.0
8584 dtype: float64
8585
8586 Missing values are not considered. The following is ``2.0``, not
8587 NaN, even though NaN is at the index location for ``30``.
8588
8589 >>> s.asof(30)
8590 2.0
8591
8592 Take all columns into consideration
8593
8594 >>> df = pd.DataFrame({'a': [10., 20., 30., 40., 50.],
8595 ... 'b': [None, None, None, None, 500]},
8596 ... index=pd.DatetimeIndex(['2018-02-27 09:01:00',
8597 ... '2018-02-27 09:02:00',
8598 ... '2018-02-27 09:03:00',
8599 ... '2018-02-27 09:04:00',
8600 ... '2018-02-27 09:05:00']))
8601 >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30',
8602 ... '2018-02-27 09:04:30']))
8603 a b
8604 2018-02-27 09:03:30 NaN NaN
8605 2018-02-27 09:04:30 NaN NaN
8606
8607 Take a single column into consideration
8608
8609 >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30',
8610 ... '2018-02-27 09:04:30']),
8611 ... subset=['a'])
8612 a b
8613 2018-02-27 09:03:30 30.0 NaN
8614 2018-02-27 09:04:30 40.0 NaN
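
        As a sketch, a scalar `where` returns the row as a Series (using the
        frame above):

        >>> df.asof(pd.Timestamp('2018-02-27 09:03:30'), subset=['a'])
        a    30.0
        b     NaN
        Name: 2018-02-27 09:03:30, dtype: float64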
8615 """
8616 if isinstance(where, str):
8617 where = Timestamp(where)
8618
8619 if not self.index.is_monotonic_increasing:
8620 raise ValueError("asof requires a sorted index")
8621
8622 is_series = isinstance(self, ABCSeries)
8623 if is_series:
8624 if subset is not None:
8625 raise ValueError("subset is not valid for Series")
8626 else:
8627 if subset is None:
8628 subset = self.columns
8629 if not is_list_like(subset):
8630 subset = [subset]
8631
8632 is_list = is_list_like(where)
8633 if not is_list:
8634 start = self.index[0]
8635 if isinstance(self.index, PeriodIndex):
8636 where = Period(where, freq=self.index.freq)
8637
8638 if where < start:
8639 if not is_series:
8640 return self._constructor_sliced(
8641 index=self.columns, name=where, dtype=np.float64
8642 )
8643 return np.nan
8644
8645 # It's always much faster to use a *while* loop here for
8646 # Series than pre-computing all the NAs. However a
8647 # *while* loop is extremely expensive for DataFrame
8648 # so we later pre-compute all the NAs and use the same
8649 # code path whether *where* is a scalar or list.
8650 # See PR: https://github.com/pandas-dev/pandas/pull/14476
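        # e.g. (illustrative) index=[10, 20, 30], values=[1.0, NaN, 3.0],
        # where=25: searchsorted gives loc=2, decremented to 1; the while
        # loop below then skips the NaN at loc=1 and returns values[0] == 1.0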
8651 if is_series:
8652 loc = self.index.searchsorted(where, side="right")
8653 if loc > 0:
8654 loc -= 1
8655
8656 values = self._values
8657 while loc > 0 and isna(values[loc]):
8658 loc -= 1
8659 return values[loc]
8660
8661 if not isinstance(where, Index):
8662 where = Index(where) if is_list else Index([where])
8663
8664 nulls = self.isna() if is_series else self[subset].isna().any(axis=1)
8665 if nulls.all():
8666 if is_series:
8667 self = cast("Series", self)
8668 return self._constructor(np.nan, index=where, name=self.name)
8669 elif is_list:
8670 self = cast("DataFrame", self)
8671 return self._constructor(np.nan, index=where, columns=self.columns)
8672 else:
8673 self = cast("DataFrame", self)
8674 return self._constructor_sliced(
8675 np.nan, index=self.columns, name=where[0]
8676 )
8677
8678 locs = self.index.asof_locs(where, ~(nulls._values))
8679
8680 # mask the missing
8681 mask = locs == -1
8682 data = self.take(locs)
8683 data.index = where
8684 if mask.any():
8685 # GH#16063 only do this setting when necessary, otherwise
8686 # we'd cast e.g. bools to floats
8687 data.loc[mask] = np.nan
8688 return data if is_list else data.iloc[-1]
8689
8690 # ----------------------------------------------------------------------
8691 # Action Methods
8692
8693 @doc(klass=_shared_doc_kwargs["klass"])
8694 def isna(self) -> Self:
8695 """
8696 Detect missing values.
8697
        Return a boolean same-sized object indicating if the values are NA.
        NA values, such as None or :attr:`numpy.NaN`, get mapped to True
        values.
8701 Everything else gets mapped to False values. Characters such as empty
8702 strings ``''`` or :attr:`numpy.inf` are not considered NA values
8703 (unless you set ``pandas.options.mode.use_inf_as_na = True``).
8704
8705 Returns
8706 -------
8707 {klass}
8708 Mask of bool values for each element in {klass} that
8709 indicates whether an element is an NA value.
8710
8711 See Also
8712 --------
8713 {klass}.isnull : Alias of isna.
8714 {klass}.notna : Boolean inverse of isna.
8715 {klass}.dropna : Omit axes labels with missing values.
8716 isna : Top-level isna.
8717
8718 Examples
8719 --------
8720 Show which entries in a DataFrame are NA.
8721
8722 >>> df = pd.DataFrame(dict(age=[5, 6, np.nan],
8723 ... born=[pd.NaT, pd.Timestamp('1939-05-27'),
8724 ... pd.Timestamp('1940-04-25')],
8725 ... name=['Alfred', 'Batman', ''],
8726 ... toy=[None, 'Batmobile', 'Joker']))
8727 >>> df
8728 age born name toy
8729 0 5.0 NaT Alfred None
8730 1 6.0 1939-05-27 Batman Batmobile
8731 2 NaN 1940-04-25 Joker
8732
8733 >>> df.isna()
8734 age born name toy
8735 0 False True False True
8736 1 False False False False
8737 2 True False False False
8738
8739 Show which entries in a Series are NA.
8740
8741 >>> ser = pd.Series([5, 6, np.nan])
8742 >>> ser
8743 0 5.0
8744 1 6.0
8745 2 NaN
8746 dtype: float64
8747
8748 >>> ser.isna()
8749 0 False
8750 1 False
8751 2 True
8752 dtype: bool
8753 """
8754 return isna(self).__finalize__(self, method="isna")
8755
8756 @doc(isna, klass=_shared_doc_kwargs["klass"])
8757 def isnull(self) -> Self:
8758 return isna(self).__finalize__(self, method="isnull")
8759
8760 @doc(klass=_shared_doc_kwargs["klass"])
8761 def notna(self) -> Self:
8762 """
8763 Detect existing (non-missing) values.
8764
8765 Return a boolean same-sized object indicating if the values are not NA.
8766 Non-missing values get mapped to True. Characters such as empty
8767 strings ``''`` or :attr:`numpy.inf` are not considered NA values
8768 (unless you set ``pandas.options.mode.use_inf_as_na = True``).
8769 NA values, such as None or :attr:`numpy.NaN`, get mapped to False
8770 values.
8771
8772 Returns
8773 -------
8774 {klass}
8775 Mask of bool values for each element in {klass} that
8776 indicates whether an element is not an NA value.
8777
8778 See Also
8779 --------
8780 {klass}.notnull : Alias of notna.
8781 {klass}.isna : Boolean inverse of notna.
8782 {klass}.dropna : Omit axes labels with missing values.
8783 notna : Top-level notna.
8784
8785 Examples
8786 --------
8787 Show which entries in a DataFrame are not NA.
8788
8789 >>> df = pd.DataFrame(dict(age=[5, 6, np.nan],
8790 ... born=[pd.NaT, pd.Timestamp('1939-05-27'),
8791 ... pd.Timestamp('1940-04-25')],
8792 ... name=['Alfred', 'Batman', ''],
8793 ... toy=[None, 'Batmobile', 'Joker']))
8794 >>> df
8795 age born name toy
8796 0 5.0 NaT Alfred None
8797 1 6.0 1939-05-27 Batman Batmobile
8798 2 NaN 1940-04-25 Joker
8799
8800 >>> df.notna()
8801 age born name toy
8802 0 True False True False
8803 1 True True True True
8804 2 False True True True
8805
8806 Show which entries in a Series are not NA.
8807
8808 >>> ser = pd.Series([5, 6, np.nan])
8809 >>> ser
8810 0 5.0
8811 1 6.0
8812 2 NaN
8813 dtype: float64
8814
8815 >>> ser.notna()
8816 0 True
8817 1 True
8818 2 False
8819 dtype: bool
8820 """
8821 return notna(self).__finalize__(self, method="notna")
8822
8823 @doc(notna, klass=_shared_doc_kwargs["klass"])
8824 def notnull(self) -> Self:
8825 return notna(self).__finalize__(self, method="notnull")
8826
8827 @final
8828 def _clip_with_scalar(self, lower, upper, inplace: bool_t = False):
8829 if (lower is not None and np.any(isna(lower))) or (
8830 upper is not None and np.any(isna(upper))
8831 ):
8832 raise ValueError("Cannot use an NA value as a clip threshold")
8833
8834 result = self
8835 mask = self.isna()
8836
        if lower is not None:
            cond = mask | (self >= lower)
            result = result.where(cond, lower, inplace=inplace)  # type: ignore[assignment]
        if upper is not None:
            cond = mask | (self <= upper)
            result = self if inplace else result
            result = result.where(cond, upper, inplace=inplace)  # type: ignore[assignment]
8848
8849 return result
8850
8851 @final
8852 def _clip_with_one_bound(self, threshold, method, axis, inplace):
8853 if axis is not None:
8854 axis = self._get_axis_number(axis)
8855
8856 # method is self.le for upper bound and self.ge for lower bound
8857 if is_scalar(threshold) and is_number(threshold):
8858 if method.__name__ == "le":
8859 return self._clip_with_scalar(None, threshold, inplace=inplace)
8860 return self._clip_with_scalar(threshold, None, inplace=inplace)
8861
8862 # GH #15390
8863 # In order for where method to work, the threshold must
8864 # be transformed to NDFrame from other array like structure.
8865 if (not isinstance(threshold, ABCSeries)) and is_list_like(threshold):
8866 if isinstance(self, ABCSeries):
8867 threshold = self._constructor(threshold, index=self.index)
8868 else:
8869 threshold = self._align_for_op(threshold, axis, flex=None)[1]
8870
8871 # GH 40420
8872 # Treat missing thresholds as no bounds, not clipping the values
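        # e.g. (sketch) for a lower bound (method is self.ge), lower=[2, NaN]
        # becomes [2, -inf], so the row with the missing threshold is unclipped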
8873 if is_list_like(threshold):
8874 fill_value = np.inf if method.__name__ == "le" else -np.inf
8875 threshold_inf = threshold.fillna(fill_value)
8876 else:
8877 threshold_inf = threshold
8878
8879 subset = method(threshold_inf, axis=axis) | isna(self)
8880
8881 # GH 40420
8882 return self.where(subset, threshold, axis=axis, inplace=inplace)
8883
8884 @overload
8885 def clip(
8886 self,
8887 lower=...,
8888 upper=...,
8889 *,
8890 axis: Axis | None = ...,
8891 inplace: Literal[False] = ...,
8892 **kwargs,
8893 ) -> Self:
8894 ...
8895
8896 @overload
8897 def clip(
8898 self,
8899 lower=...,
8900 upper=...,
8901 *,
8902 axis: Axis | None = ...,
8903 inplace: Literal[True],
8904 **kwargs,
8905 ) -> None:
8906 ...
8907
8908 @overload
8909 def clip(
8910 self,
8911 lower=...,
8912 upper=...,
8913 *,
8914 axis: Axis | None = ...,
8915 inplace: bool_t = ...,
8916 **kwargs,
8917 ) -> Self | None:
8918 ...
8919
8920 @final
8921 def clip(
8922 self,
8923 lower=None,
8924 upper=None,
8925 *,
8926 axis: Axis | None = None,
8927 inplace: bool_t = False,
8928 **kwargs,
8929 ) -> Self | None:
8930 """
8931 Trim values at input threshold(s).
8932
        Assigns values outside the boundaries to the boundary values. Thresholds
        can be scalar values or array-like, and in the latter case the clipping
        is performed element-wise along the specified axis.
8936
8937 Parameters
8938 ----------
        lower : float or array-like, default None
            Minimum threshold value. All values below this
            threshold will be set to it. A missing
            threshold (e.g. `NA`) will not clip the value.
        upper : float or array-like, default None
            Maximum threshold value. All values above this
            threshold will be set to it. A missing
            threshold (e.g. `NA`) will not clip the value.
8947 axis : {{0 or 'index', 1 or 'columns', None}}, default None
8948 Align object with lower and upper along the given axis.
8949 For `Series` this parameter is unused and defaults to `None`.
8950 inplace : bool, default False
8951 Whether to perform the operation in place on the data.
        **kwargs
8953 Additional keywords have no effect but might be accepted
8954 for compatibility with numpy.
8955
8956 Returns
8957 -------
8958 Series or DataFrame or None
8959 Same type as calling object with the values outside the
8960 clip boundaries replaced or None if ``inplace=True``.
8961
8962 See Also
8963 --------
8964 Series.clip : Trim values at input threshold in series.
8965 DataFrame.clip : Trim values at input threshold in dataframe.
8966 numpy.clip : Clip (limit) the values in an array.
8967
8968 Examples
8969 --------
8970 >>> data = {'col_0': [9, -3, 0, -1, 5], 'col_1': [-2, -7, 6, 8, -5]}
8971 >>> df = pd.DataFrame(data)
8972 >>> df
8973 col_0 col_1
8974 0 9 -2
8975 1 -3 -7
8976 2 0 6
8977 3 -1 8
8978 4 5 -5
8979
8980 Clips per column using lower and upper thresholds:
8981
8982 >>> df.clip(-4, 6)
8983 col_0 col_1
8984 0 6 -2
8985 1 -3 -4
8986 2 0 6
8987 3 -1 6
8988 4 5 -4
8989
8990 Clips using specific lower and upper thresholds per column:
8991
8992 >>> df.clip([-2, -1], [4, 5])
8993 col_0 col_1
8994 0 4 -1
8995 1 -2 -1
8996 2 0 5
8997 3 -1 5
8998 4 4 -1
8999
9000 Clips using specific lower and upper thresholds per column element:
9001
9002 >>> t = pd.Series([2, -4, -1, 6, 3])
9003 >>> t
9004 0 2
9005 1 -4
9006 2 -1
9007 3 6
9008 4 3
9009 dtype: int64
9010
9011 >>> df.clip(t, t + 4, axis=0)
9012 col_0 col_1
9013 0 6 2
9014 1 -3 -4
9015 2 0 3
9016 3 6 8
9017 4 5 3
9018
9019 Clips using specific lower threshold per column element, with missing values:
9020
9021 >>> t = pd.Series([2, -4, np.nan, 6, 3])
9022 >>> t
9023 0 2.0
9024 1 -4.0
9025 2 NaN
9026 3 6.0
9027 4 3.0
9028 dtype: float64
9029
9030 >>> df.clip(t, axis=0)
9031 col_0 col_1
9032 0 9 2
9033 1 -3 -4
9034 2 0 6
9035 3 6 8
9036 4 5 3
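
        A sketch with a Series and scalar bounds (the integer dtype is
        preserved since no NaNs are introduced):

        >>> pd.Series([1, 8, -5]).clip(lower=0, upper=5)
        0    1
        1    5
        2    0
        dtype: int64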
9037 """
9038 inplace = validate_bool_kwarg(inplace, "inplace")
9039
9040 if inplace:
9041 if not PYPY and using_copy_on_write():
9042 if sys.getrefcount(self) <= REF_COUNT:
9043 warnings.warn(
9044 _chained_assignment_method_msg,
9045 ChainedAssignmentError,
9046 stacklevel=2,
9047 )
9048 elif (
9049 not PYPY
9050 and not using_copy_on_write()
9051 and self._is_view_after_cow_rules()
9052 ):
9053 ctr = sys.getrefcount(self)
9054 ref_count = REF_COUNT
9055 if isinstance(self, ABCSeries) and hasattr(self, "_cacher"):
9056 # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221
9057 ref_count += 1
9058 if ctr <= ref_count:
9059 warnings.warn(
9060 _chained_assignment_warning_method_msg,
9061 FutureWarning,
9062 stacklevel=2,
9063 )
9064
9065 axis = nv.validate_clip_with_axis(axis, (), kwargs)
9066 if axis is not None:
9067 axis = self._get_axis_number(axis)
9068
9069 # GH 17276
9070 # numpy doesn't like NaN as a clip value
9071 # so ignore
9072 # GH 19992
9073 # numpy doesn't drop a list-like bound containing NaN
9074 isna_lower = isna(lower)
9075 if not is_list_like(lower):
9076 if np.any(isna_lower):
9077 lower = None
9078 elif np.all(isna_lower):
9079 lower = None
9080 isna_upper = isna(upper)
9081 if not is_list_like(upper):
9082 if np.any(isna_upper):
9083 upper = None
9084 elif np.all(isna_upper):
9085 upper = None
9086
9087 # GH 2747 (arguments were reversed)
9088 if (
9089 lower is not None
9090 and upper is not None
9091 and is_scalar(lower)
9092 and is_scalar(upper)
9093 ):
9094 lower, upper = min(lower, upper), max(lower, upper)
9095
9096 # fast-path for scalars
9097 if (lower is None or is_number(lower)) and (upper is None or is_number(upper)):
9098 return self._clip_with_scalar(lower, upper, inplace=inplace)
9099
9100 result = self
9101 if lower is not None:
9102 result = result._clip_with_one_bound(
9103 lower, method=self.ge, axis=axis, inplace=inplace
9104 )
9105 if upper is not None:
9106 if inplace:
9107 result = self
9108 result = result._clip_with_one_bound(
9109 upper, method=self.le, axis=axis, inplace=inplace
9110 )
9111
9112 return result
9113
9114 @final
9115 @doc(klass=_shared_doc_kwargs["klass"])
9116 def asfreq(
9117 self,
9118 freq: Frequency,
9119 method: FillnaOptions | None = None,
9120 how: Literal["start", "end"] | None = None,
9121 normalize: bool_t = False,
9122 fill_value: Hashable | None = None,
9123 ) -> Self:
9124 """
9125 Convert time series to specified frequency.
9126
9127 Returns the original data conformed to a new index with the specified
9128 frequency.
9129
9130 If the index of this {klass} is a :class:`~pandas.PeriodIndex`, the new index
9131 is the result of transforming the original index with
9132 :meth:`PeriodIndex.asfreq <pandas.PeriodIndex.asfreq>` (so the original index
9133 will map one-to-one to the new index).
9134
9135 Otherwise, the new index will be equivalent to ``pd.date_range(start, end,
9136 freq=freq)`` where ``start`` and ``end`` are, respectively, the first and
9137 last entries in the original index (see :func:`pandas.date_range`). The
9138 values corresponding to any timesteps in the new index which were not present
9139 in the original index will be null (``NaN``), unless a method for filling
9140 such unknowns is provided (see the ``method`` parameter below).
9141
9142 The :meth:`resample` method is more appropriate if an operation on each group of
9143 timesteps (such as an aggregate) is necessary to represent the data at the new
9144 frequency.
9145
9146 Parameters
9147 ----------
9148 freq : DateOffset or str
9149 Frequency DateOffset or string.
9150 method : {{'backfill'/'bfill', 'pad'/'ffill'}}, default None
9151 Method to use for filling holes in reindexed Series (note this
9152 does not fill NaNs that already were present):
9153
9154 * 'pad' / 'ffill': propagate last valid observation forward to next
9155 valid
9156 * 'backfill' / 'bfill': use NEXT valid observation to fill.
        how : {{'start', 'end'}}, default 'end'
9158 For PeriodIndex only (see PeriodIndex.asfreq).
9159 normalize : bool, default False
9160 Whether to reset output index to midnight.
9161 fill_value : scalar, optional
9162 Value to use for missing values, applied during upsampling (note
9163 this does not fill NaNs that already were present).
9164
9165 Returns
9166 -------
9167 {klass}
9168 {klass} object reindexed to the specified frequency.
9169
9170 See Also
9171 --------
9172 reindex : Conform DataFrame to new index with optional filling logic.
9173
9174 Notes
9175 -----
9176 To learn more about the frequency strings, please see `this link
9177 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
9178
9179 Examples
9180 --------
9181 Start by creating a series with 4 one minute timestamps.
9182
9183 >>> index = pd.date_range('1/1/2000', periods=4, freq='min')
9184 >>> series = pd.Series([0.0, None, 2.0, 3.0], index=index)
9185 >>> df = pd.DataFrame({{'s': series}})
9186 >>> df
9187 s
9188 2000-01-01 00:00:00 0.0
9189 2000-01-01 00:01:00 NaN
9190 2000-01-01 00:02:00 2.0
9191 2000-01-01 00:03:00 3.0
9192
9193 Upsample the series into 30 second bins.
9194
9195 >>> df.asfreq(freq='30s')
9196 s
9197 2000-01-01 00:00:00 0.0
9198 2000-01-01 00:00:30 NaN
9199 2000-01-01 00:01:00 NaN
9200 2000-01-01 00:01:30 NaN
9201 2000-01-01 00:02:00 2.0
9202 2000-01-01 00:02:30 NaN
9203 2000-01-01 00:03:00 3.0
9204
        Upsample again, providing a ``fill_value``.
9206
9207 >>> df.asfreq(freq='30s', fill_value=9.0)
9208 s
9209 2000-01-01 00:00:00 0.0
9210 2000-01-01 00:00:30 9.0
9211 2000-01-01 00:01:00 NaN
9212 2000-01-01 00:01:30 9.0
9213 2000-01-01 00:02:00 2.0
9214 2000-01-01 00:02:30 9.0
9215 2000-01-01 00:03:00 3.0
9216
9217 Upsample again, providing a ``method``.
9218
9219 >>> df.asfreq(freq='30s', method='bfill')
9220 s
9221 2000-01-01 00:00:00 0.0
9222 2000-01-01 00:00:30 NaN
9223 2000-01-01 00:01:00 NaN
9224 2000-01-01 00:01:30 2.0
9225 2000-01-01 00:02:00 2.0
9226 2000-01-01 00:02:30 3.0
9227 2000-01-01 00:03:00 3.0
9228 """
9229 from pandas.core.resample import asfreq
9230
9231 return asfreq(
9232 self,
9233 freq,
9234 method=method,
9235 how=how,
9236 normalize=normalize,
9237 fill_value=fill_value,
9238 )
9239
9240 @final
9241 def at_time(self, time, asof: bool_t = False, axis: Axis | None = None) -> Self:
9242 """
        Select values at a particular time of day (e.g., 9:30 AM).
9244
9245 Parameters
9246 ----------
9247 time : datetime.time or str
9248 The values to select.
9249 axis : {0 or 'index', 1 or 'columns'}, default 0
9250 For `Series` this parameter is unused and defaults to 0.
9251
9252 Returns
9253 -------
9254 Series or DataFrame
9255
9256 Raises
9257 ------
9258 TypeError
9259 If the index is not a :class:`DatetimeIndex`
9260
9261 See Also
9262 --------
9263 between_time : Select values between particular times of the day.
9264 first : Select initial periods of time series based on a date offset.
9265 last : Select final periods of time series based on a date offset.
9266 DatetimeIndex.indexer_at_time : Get just the index locations for
9267 values at particular time of the day.
9268
9269 Examples
9270 --------
9271 >>> i = pd.date_range('2018-04-09', periods=4, freq='12h')
9272 >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
9273 >>> ts
9274 A
9275 2018-04-09 00:00:00 1
9276 2018-04-09 12:00:00 2
9277 2018-04-10 00:00:00 3
9278 2018-04-10 12:00:00 4
9279
9280 >>> ts.at_time('12:00')
9281 A
9282 2018-04-09 12:00:00 2
9283 2018-04-10 12:00:00 4
9284 """
9285 if axis is None:
9286 axis = 0
9287 axis = self._get_axis_number(axis)
9288
9289 index = self._get_axis(axis)
9290
9291 if not isinstance(index, DatetimeIndex):
9292 raise TypeError("Index must be DatetimeIndex")
9293
9294 indexer = index.indexer_at_time(time, asof=asof)
9295 return self._take_with_is_copy(indexer, axis=axis)
9296
9297 @final
9298 def between_time(
9299 self,
9300 start_time,
9301 end_time,
9302 inclusive: IntervalClosedType = "both",
9303 axis: Axis | None = None,
9304 ) -> Self:
9305 """
9306 Select values between particular times of the day (e.g., 9:00-9:30 AM).
9307
9308 By setting ``start_time`` to be later than ``end_time``,
9309 you can get the times that are *not* between the two times.
9310
9311 Parameters
9312 ----------
9313 start_time : datetime.time or str
9314 Initial time as a time filter limit.
9315 end_time : datetime.time or str
9316 End time as a time filter limit.
9317 inclusive : {"both", "neither", "left", "right"}, default "both"
9318 Include boundaries; whether to set each bound as closed or open.
9319 axis : {0 or 'index', 1 or 'columns'}, default 0
            Determine the time range on the index or columns.
9321 For `Series` this parameter is unused and defaults to 0.
9322
9323 Returns
9324 -------
9325 Series or DataFrame
9326 Data from the original object filtered to the specified dates range.
9327
9328 Raises
9329 ------
9330 TypeError
9331 If the index is not a :class:`DatetimeIndex`
9332
9333 See Also
9334 --------
9335 at_time : Select values at a particular time of the day.
9336 first : Select initial periods of time series based on a date offset.
9337 last : Select final periods of time series based on a date offset.
9338 DatetimeIndex.indexer_between_time : Get just the index locations for
9339 values between particular times of the day.
9340
9341 Examples
9342 --------
9343 >>> i = pd.date_range('2018-04-09', periods=4, freq='1D20min')
9344 >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
9345 >>> ts
9346 A
9347 2018-04-09 00:00:00 1
9348 2018-04-10 00:20:00 2
9349 2018-04-11 00:40:00 3
9350 2018-04-12 01:00:00 4
9351
9352 >>> ts.between_time('0:15', '0:45')
9353 A
9354 2018-04-10 00:20:00 2
9355 2018-04-11 00:40:00 3
9356
9357 You get the times that are *not* between two times by setting
9358 ``start_time`` later than ``end_time``:
9359
9360 >>> ts.between_time('0:45', '0:15')
9361 A
9362 2018-04-09 00:00:00 1
9363 2018-04-12 01:00:00 4
9364 """
9365 if axis is None:
9366 axis = 0
9367 axis = self._get_axis_number(axis)
9368
9369 index = self._get_axis(axis)
9370 if not isinstance(index, DatetimeIndex):
9371 raise TypeError("Index must be DatetimeIndex")
9372
9373 left_inclusive, right_inclusive = validate_inclusive(inclusive)
9374 indexer = index.indexer_between_time(
9375 start_time,
9376 end_time,
9377 include_start=left_inclusive,
9378 include_end=right_inclusive,
9379 )
9380 return self._take_with_is_copy(indexer, axis=axis)
9381
9382 @final
9383 @doc(klass=_shared_doc_kwargs["klass"])
9384 def resample(
9385 self,
9386 rule,
9387 axis: Axis | lib.NoDefault = lib.no_default,
9388 closed: Literal["right", "left"] | None = None,
9389 label: Literal["right", "left"] | None = None,
9390 convention: Literal["start", "end", "s", "e"] | lib.NoDefault = lib.no_default,
9391 kind: Literal["timestamp", "period"] | None | lib.NoDefault = lib.no_default,
9392 on: Level | None = None,
9393 level: Level | None = None,
9394 origin: str | TimestampConvertibleTypes = "start_day",
9395 offset: TimedeltaConvertibleTypes | None = None,
9396 group_keys: bool_t = False,
9397 ) -> Resampler:
9398 """
9399 Resample time-series data.
9400
9401 Convenience method for frequency conversion and resampling of time series.
9402 The object must have a datetime-like index (`DatetimeIndex`, `PeriodIndex`,
9403 or `TimedeltaIndex`), or the caller must pass the label of a datetime-like
9404 series/index to the ``on``/``level`` keyword parameter.
9405
9406 Parameters
9407 ----------
9408 rule : DateOffset, Timedelta or str
9409 The offset string or object representing target conversion.
9410 axis : {{0 or 'index', 1 or 'columns'}}, default 0
Which axis to use for up- or down-sampling. For `Series` this parameter
is unused and defaults to 0. The index of the chosen axis must be a
`DatetimeIndex`, `TimedeltaIndex` or `PeriodIndex`.
9414
9415 .. deprecated:: 2.0.0
9416 Use frame.T.resample(...) instead.
9417 closed : {{'right', 'left'}}, default None
9418 Which side of bin interval is closed. The default is 'left'
for all frequency offsets except for 'ME', 'YE', 'QE', 'BME',
'BYE', 'BQE', and 'W', which all have a default of 'right'.
9421 label : {{'right', 'left'}}, default None
9422 Which bin edge label to label bucket with. The default is 'left'
for all frequency offsets except for 'ME', 'YE', 'QE', 'BME',
'BYE', 'BQE', and 'W', which all have a default of 'right'.
9425 convention : {{'start', 'end', 's', 'e'}}, default 'start'
9426 For `PeriodIndex` only, controls whether to use the start or
9427 end of `rule`.
9428
9429 .. deprecated:: 2.2.0
9430 Convert PeriodIndex to DatetimeIndex before resampling instead.
9431 kind : {{'timestamp', 'period'}}, optional, default None
9432 Pass 'timestamp' to convert the resulting index to a
`DatetimeIndex` or 'period' to convert it to a `PeriodIndex`.
9434 By default the input representation is retained.
9435
9436 .. deprecated:: 2.2.0
9437 Convert index to desired type explicitly instead.
9438
9439 on : str, optional
9440 For a DataFrame, column to use instead of index for resampling.
9441 Column must be datetime-like.
9442 level : str or int, optional
9443 For a MultiIndex, level (name or number) to use for
9444 resampling. `level` must be datetime-like.
9445 origin : Timestamp or str, default 'start_day'
9446 The timestamp on which to adjust the grouping. The timezone of origin
9447 must match the timezone of the index.
9448 If string, must be one of the following:
9449
9450 - 'epoch': `origin` is 1970-01-01
9451 - 'start': `origin` is the first value of the timeseries
9452 - 'start_day': `origin` is the first day at midnight of the timeseries
9453
9454 - 'end': `origin` is the last value of the timeseries
9455 - 'end_day': `origin` is the ceiling midnight of the last day
9456
9457 .. versionadded:: 1.3.0
9458
9459 .. note::
9460
9461 Only takes effect for Tick-frequencies (i.e. fixed frequencies like
9462 days, hours, and minutes, rather than months or quarters).
offset : Timedelta or str, default None
9464 An offset timedelta added to the origin.
9465
9466 group_keys : bool, default False
9467 Whether to include the group keys in the result index when using
9468 ``.apply()`` on the resampled object.
9469
9470 .. versionadded:: 1.5.0
9471
9472 Not specifying ``group_keys`` will retain values-dependent behavior
9473 from pandas 1.4 and earlier (see :ref:`pandas 1.5.0 Release notes
9474 <whatsnew_150.enhancements.resample_group_keys>` for examples).
9475
9476 .. versionchanged:: 2.0.0
9477
9478 ``group_keys`` now defaults to ``False``.
9479
9480 Returns
9481 -------
9482 pandas.api.typing.Resampler
9483 :class:`~pandas.core.Resampler` object.
9484
9485 See Also
9486 --------
9487 Series.resample : Resample a Series.
9488 DataFrame.resample : Resample a DataFrame.
9489 groupby : Group {klass} by mapping, function, label, or list of labels.
9490 asfreq : Reindex a {klass} with the given frequency without grouping.
9491
9492 Notes
9493 -----
9494 See the `user guide
9495 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#resampling>`__
9496 for more.
9497
9498 To learn more about the offset strings, please see `this link
9499 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects>`__.
9500
9501 Examples
9502 --------
Start by creating a series with 9 one-minute timestamps.
9504
9505 >>> index = pd.date_range('1/1/2000', periods=9, freq='min')
9506 >>> series = pd.Series(range(9), index=index)
9507 >>> series
9508 2000-01-01 00:00:00 0
9509 2000-01-01 00:01:00 1
9510 2000-01-01 00:02:00 2
9511 2000-01-01 00:03:00 3
9512 2000-01-01 00:04:00 4
9513 2000-01-01 00:05:00 5
9514 2000-01-01 00:06:00 6
9515 2000-01-01 00:07:00 7
9516 2000-01-01 00:08:00 8
9517 Freq: min, dtype: int64
9518
9519 Downsample the series into 3 minute bins and sum the values
9520 of the timestamps falling into a bin.
9521
9522 >>> series.resample('3min').sum()
9523 2000-01-01 00:00:00 3
9524 2000-01-01 00:03:00 12
9525 2000-01-01 00:06:00 21
9526 Freq: 3min, dtype: int64
9527
Downsample the series into 3 minute bins as above, but label each
bin using the right edge instead of the left. Note that the value in
the bucket used as the label is not included in the bucket it labels.
For example, in the original series the bucket ``2000-01-01 00:03:00``
contains the value 3, but the summed value in the resampled bucket
with the label ``2000-01-01 00:03:00`` does not include 3 (if it did,
the summed value would be 6, not 3).
9535
9536 >>> series.resample('3min', label='right').sum()
9537 2000-01-01 00:03:00 3
9538 2000-01-01 00:06:00 12
9539 2000-01-01 00:09:00 21
9540 Freq: 3min, dtype: int64
9541
To include this value, close the right side of the bin interval,
as shown below.
9544
9545 >>> series.resample('3min', label='right', closed='right').sum()
9546 2000-01-01 00:00:00 0
9547 2000-01-01 00:03:00 6
9548 2000-01-01 00:06:00 15
9549 2000-01-01 00:09:00 15
9550 Freq: 3min, dtype: int64
9551
9552 Upsample the series into 30 second bins.
9553
9554 >>> series.resample('30s').asfreq()[0:5] # Select first 5 rows
9555 2000-01-01 00:00:00 0.0
9556 2000-01-01 00:00:30 NaN
9557 2000-01-01 00:01:00 1.0
9558 2000-01-01 00:01:30 NaN
9559 2000-01-01 00:02:00 2.0
9560 Freq: 30s, dtype: float64
9561
9562 Upsample the series into 30 second bins and fill the ``NaN``
9563 values using the ``ffill`` method.
9564
9565 >>> series.resample('30s').ffill()[0:5]
9566 2000-01-01 00:00:00 0
9567 2000-01-01 00:00:30 0
9568 2000-01-01 00:01:00 1
9569 2000-01-01 00:01:30 1
9570 2000-01-01 00:02:00 2
9571 Freq: 30s, dtype: int64
9572
9573 Upsample the series into 30 second bins and fill the
9574 ``NaN`` values using the ``bfill`` method.
9575
9576 >>> series.resample('30s').bfill()[0:5]
9577 2000-01-01 00:00:00 0
9578 2000-01-01 00:00:30 1
9579 2000-01-01 00:01:00 1
9580 2000-01-01 00:01:30 2
9581 2000-01-01 00:02:00 2
9582 Freq: 30s, dtype: int64
9583
Pass a custom function via ``apply``:
9585
9586 >>> def custom_resampler(arraylike):
9587 ... return np.sum(arraylike) + 5
9588 ...
9589 >>> series.resample('3min').apply(custom_resampler)
9590 2000-01-01 00:00:00 8
9591 2000-01-01 00:03:00 17
9592 2000-01-01 00:06:00 26
9593 Freq: 3min, dtype: int64
9594
9595 For DataFrame objects, the keyword `on` can be used to specify the
9596 column instead of the index for resampling.
9597
9598 >>> d = {{'price': [10, 11, 9, 13, 14, 18, 17, 19],
9599 ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}}
9600 >>> df = pd.DataFrame(d)
9601 >>> df['week_starting'] = pd.date_range('01/01/2018',
9602 ... periods=8,
9603 ... freq='W')
9604 >>> df
9605 price volume week_starting
9606 0 10 50 2018-01-07
9607 1 11 60 2018-01-14
9608 2 9 40 2018-01-21
9609 3 13 100 2018-01-28
9610 4 14 50 2018-02-04
9611 5 18 100 2018-02-11
9612 6 17 40 2018-02-18
9613 7 19 50 2018-02-25
9614 >>> df.resample('ME', on='week_starting').mean()
9615 price volume
9616 week_starting
9617 2018-01-31 10.75 62.5
9618 2018-02-28 17.00 60.0
9619
9620 For a DataFrame with MultiIndex, the keyword `level` can be used to
9621 specify on which level the resampling needs to take place.
9622
9623 >>> days = pd.date_range('1/1/2000', periods=4, freq='D')
9624 >>> d2 = {{'price': [10, 11, 9, 13, 14, 18, 17, 19],
9625 ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}}
9626 >>> df2 = pd.DataFrame(
9627 ... d2,
9628 ... index=pd.MultiIndex.from_product(
9629 ... [days, ['morning', 'afternoon']]
9630 ... )
9631 ... )
9632 >>> df2
9633 price volume
9634 2000-01-01 morning 10 50
9635 afternoon 11 60
9636 2000-01-02 morning 9 40
9637 afternoon 13 100
9638 2000-01-03 morning 14 50
9639 afternoon 18 100
9640 2000-01-04 morning 17 40
9641 afternoon 19 50
9642 >>> df2.resample('D', level=0).sum()
9643 price volume
9644 2000-01-01 21 110
9645 2000-01-02 22 140
9646 2000-01-03 32 150
9647 2000-01-04 36 90
9648
9649 If you want to adjust the start of the bins based on a fixed timestamp:
9650
9651 >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00'
9652 >>> rng = pd.date_range(start, end, freq='7min')
9653 >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
9654 >>> ts
9655 2000-10-01 23:30:00 0
9656 2000-10-01 23:37:00 3
9657 2000-10-01 23:44:00 6
9658 2000-10-01 23:51:00 9
9659 2000-10-01 23:58:00 12
9660 2000-10-02 00:05:00 15
9661 2000-10-02 00:12:00 18
9662 2000-10-02 00:19:00 21
9663 2000-10-02 00:26:00 24
9664 Freq: 7min, dtype: int64
9665
9666 >>> ts.resample('17min').sum()
9667 2000-10-01 23:14:00 0
9668 2000-10-01 23:31:00 9
9669 2000-10-01 23:48:00 21
9670 2000-10-02 00:05:00 54
9671 2000-10-02 00:22:00 24
9672 Freq: 17min, dtype: int64
9673
9674 >>> ts.resample('17min', origin='epoch').sum()
9675 2000-10-01 23:18:00 0
9676 2000-10-01 23:35:00 18
9677 2000-10-01 23:52:00 27
9678 2000-10-02 00:09:00 39
9679 2000-10-02 00:26:00 24
9680 Freq: 17min, dtype: int64
9681
9682 >>> ts.resample('17min', origin='2000-01-01').sum()
9683 2000-10-01 23:24:00 3
9684 2000-10-01 23:41:00 15
9685 2000-10-01 23:58:00 45
9686 2000-10-02 00:15:00 45
9687 Freq: 17min, dtype: int64
9688
9689 If you want to adjust the start of the bins with an `offset` Timedelta, the two
9690 following lines are equivalent:
9691
9692 >>> ts.resample('17min', origin='start').sum()
9693 2000-10-01 23:30:00 9
9694 2000-10-01 23:47:00 21
9695 2000-10-02 00:04:00 54
9696 2000-10-02 00:21:00 24
9697 Freq: 17min, dtype: int64
9698
9699 >>> ts.resample('17min', offset='23h30min').sum()
9700 2000-10-01 23:30:00 9
9701 2000-10-01 23:47:00 21
9702 2000-10-02 00:04:00 54
9703 2000-10-02 00:21:00 24
9704 Freq: 17min, dtype: int64
9705
9706 If you want to take the largest Timestamp as the end of the bins:
9707
9708 >>> ts.resample('17min', origin='end').sum()
9709 2000-10-01 23:35:00 0
9710 2000-10-01 23:52:00 18
9711 2000-10-02 00:09:00 27
9712 2000-10-02 00:26:00 63
9713 Freq: 17min, dtype: int64
9714
In contrast with `start_day`, you can use `end_day` to take the ceiling
midnight of the largest Timestamp as the end of the bins and drop the bins
not containing data:
9718
9719 >>> ts.resample('17min', origin='end_day').sum()
9720 2000-10-01 23:38:00 3
9721 2000-10-01 23:55:00 15
9722 2000-10-02 00:12:00 45
9723 2000-10-02 00:29:00 45
9724 Freq: 17min, dtype: int64
9725 """
9726 from pandas.core.resample import get_resampler
9727
9728 if axis is not lib.no_default:
9729 axis = self._get_axis_number(axis)
9730 if axis == 1:
9731 warnings.warn(
9732 "DataFrame.resample with axis=1 is deprecated. Do "
9733 "`frame.T.resample(...)` without axis instead.",
9734 FutureWarning,
9735 stacklevel=find_stack_level(),
9736 )
9737 else:
9738 warnings.warn(
9739 f"The 'axis' keyword in {type(self).__name__}.resample is "
9740 "deprecated and will be removed in a future version.",
9741 FutureWarning,
9742 stacklevel=find_stack_level(),
9743 )
9744 else:
9745 axis = 0
9746
9747 if kind is not lib.no_default:
9748 # GH#55895
9749 warnings.warn(
9750 f"The 'kind' keyword in {type(self).__name__}.resample is "
9751 "deprecated and will be removed in a future version. "
9752 "Explicitly cast the index to the desired type instead",
9753 FutureWarning,
9754 stacklevel=find_stack_level(),
9755 )
9756 else:
9757 kind = None
9758
9759 if convention is not lib.no_default:
9760 warnings.warn(
9761 f"The 'convention' keyword in {type(self).__name__}.resample is "
9762 "deprecated and will be removed in a future version. "
9763 "Explicitly cast PeriodIndex to DatetimeIndex before resampling "
9764 "instead.",
9765 FutureWarning,
9766 stacklevel=find_stack_level(),
9767 )
9768 else:
9769 convention = "start"
9770
9771 return get_resampler(
9772 cast("Series | DataFrame", self),
9773 freq=rule,
9774 label=label,
9775 closed=closed,
9776 axis=axis,
9777 kind=kind,
9778 convention=convention,
9779 key=on,
9780 level=level,
9781 origin=origin,
9782 offset=offset,
9783 group_keys=group_keys,
9784 )
9785
9786 @final
9787 def first(self, offset) -> Self:
9788 """
9789 Select initial periods of time series data based on a date offset.
9790
9791 .. deprecated:: 2.1
9792 :meth:`.first` is deprecated and will be removed in a future version.
9793 Please create a mask and filter using `.loc` instead.
9794
9795 For a DataFrame with a sorted DatetimeIndex, this function can
9796 select the first few rows based on a date offset.
9797
9798 Parameters
9799 ----------
9800 offset : str, DateOffset or dateutil.relativedelta
The offset length of the data that will be selected. For instance,
'1ME' will select all the rows whose index falls within the first month.
9803
9804 Returns
9805 -------
9806 Series or DataFrame
9807 A subset of the caller.
9808
9809 Raises
9810 ------
9811 TypeError
9812 If the index is not a :class:`DatetimeIndex`
9813
9814 See Also
9815 --------
9816 last : Select final periods of time series based on a date offset.
9817 at_time : Select values at a particular time of the day.
9818 between_time : Select values between particular times of the day.
9819
9820 Examples
9821 --------
9822 >>> i = pd.date_range('2018-04-09', periods=4, freq='2D')
9823 >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
9824 >>> ts
9825 A
9826 2018-04-09 1
9827 2018-04-11 2
9828 2018-04-13 3
9829 2018-04-15 4
9830
9831 Get the rows for the first 3 days:
9832
9833 >>> ts.first('3D')
9834 A
9835 2018-04-09 1
9836 2018-04-11 2
9837
Notice that the data for the first 3 calendar days was returned, not
the first 3 days observed in the dataset, and therefore data for
2018-04-13 was not returned.
9841 """
9842 warnings.warn(
9843 "first is deprecated and will be removed in a future version. "
9844 "Please create a mask and filter using `.loc` instead",
9845 FutureWarning,
9846 stacklevel=find_stack_level(),
9847 )
9848 if not isinstance(self.index, DatetimeIndex):
9849 raise TypeError("'first' only supports a DatetimeIndex index")
9850
9851 if len(self.index) == 0:
9852 return self.copy(deep=False)
9853
9854 offset = to_offset(offset)
9855 if not isinstance(offset, Tick) and offset.is_on_offset(self.index[0]):
9856 # GH#29623 if first value is end of period, remove offset with n = 1
9857 # before adding the real offset
9858 end_date = end = self.index[0] - offset.base + offset
9859 else:
9860 end_date = end = self.index[0] + offset
9861
9862 # Tick-like, e.g. 3 weeks
9863 if isinstance(offset, Tick) and end_date in self.index:
9864 end = self.index.searchsorted(end_date, side="left")
9865 return self.iloc[:end]
9866
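# unlike the positional .iloc slice above, label-based .loc slicing
# is inclusive of `end`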
9867 return self.loc[:end]
9868
9869 @final
9870 def last(self, offset) -> Self:
9871 """
9872 Select final periods of time series data based on a date offset.
9873
9874 .. deprecated:: 2.1
9875 :meth:`.last` is deprecated and will be removed in a future version.
9876 Please create a mask and filter using `.loc` instead.
9877
9878 For a DataFrame with a sorted DatetimeIndex, this function
9879 selects the last few rows based on a date offset.
9880
9881 Parameters
9882 ----------
9883 offset : str, DateOffset, dateutil.relativedelta
The offset length of the data that will be selected. For instance,
'3D' will select all the rows whose index falls within the last 3 days.
9886
9887 Returns
9888 -------
9889 Series or DataFrame
9890 A subset of the caller.
9891
9892 Raises
9893 ------
9894 TypeError
9895 If the index is not a :class:`DatetimeIndex`
9896
9897 See Also
9898 --------
9899 first : Select initial periods of time series based on a date offset.
9900 at_time : Select values at a particular time of the day.
9901 between_time : Select values between particular times of the day.
9902
9903 Notes
9904 -----
9905 .. deprecated:: 2.1.0
9906 Please create a mask and filter using `.loc` instead
9907
9908 Examples
9909 --------
9910 >>> i = pd.date_range('2018-04-09', periods=4, freq='2D')
9911 >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
9912 >>> ts
9913 A
9914 2018-04-09 1
9915 2018-04-11 2
9916 2018-04-13 3
9917 2018-04-15 4
9918
9919 Get the rows for the last 3 days:
9920
9921 >>> ts.last('3D') # doctest: +SKIP
9922 A
9923 2018-04-13 3
9924 2018-04-15 4
9925
Notice that the data for the last 3 calendar days was returned, not
the last 3 observed days in the dataset, and therefore data for
2018-04-11 was not returned.
9929 """
9930 warnings.warn(
9931 "last is deprecated and will be removed in a future version. "
9932 "Please create a mask and filter using `.loc` instead",
9933 FutureWarning,
9934 stacklevel=find_stack_level(),
9935 )
9936
9937 if not isinstance(self.index, DatetimeIndex):
9938 raise TypeError("'last' only supports a DatetimeIndex index")
9939
9940 if len(self.index) == 0:
9941 return self.copy(deep=False)
9942
9943 offset = to_offset(offset)
9944
9945 start_date = self.index[-1] - offset
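# side="right" excludes rows falling exactly on start_date, keeping
# only rows strictly inside the trailing window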
9946 start = self.index.searchsorted(start_date, side="right")
9947 return self.iloc[start:]
9948
9949 @final
9950 def rank(
9951 self,
9952 axis: Axis = 0,
9953 method: Literal["average", "min", "max", "first", "dense"] = "average",
9954 numeric_only: bool_t = False,
9955 na_option: Literal["keep", "top", "bottom"] = "keep",
9956 ascending: bool_t = True,
9957 pct: bool_t = False,
9958 ) -> Self:
9959 """
9960 Compute numerical data ranks (1 through n) along axis.
9961
9962 By default, equal values are assigned a rank that is the average of the
9963 ranks of those values.
9964
9965 Parameters
9966 ----------
9967 axis : {0 or 'index', 1 or 'columns'}, default 0
9968 Index to direct ranking.
9969 For `Series` this parameter is unused and defaults to 0.
9970 method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
9971 How to rank the group of records that have the same value (i.e. ties):
9972
9973 * average: average rank of the group
9974 * min: lowest rank in the group
9975 * max: highest rank in the group
9976 * first: ranks assigned in order they appear in the array
9977 * dense: like 'min', but rank always increases by 1 between groups.
9978
9979 numeric_only : bool, default False
9980 For DataFrame objects, rank only numeric columns if set to True.
9981
9982 .. versionchanged:: 2.0.0
9983 The default value of ``numeric_only`` is now ``False``.
9984
9985 na_option : {'keep', 'top', 'bottom'}, default 'keep'
9986 How to rank NaN values:
9987
9988 * keep: assign NaN rank to NaN values
9989 * top: assign lowest rank to NaN values
9990 * bottom: assign highest rank to NaN values
9991
9992 ascending : bool, default True
9993 Whether or not the elements should be ranked in ascending order.
9994 pct : bool, default False
9995 Whether or not to display the returned rankings in percentile
9996 form.
9997
9998 Returns
9999 -------
10000 same type as caller
10001 Return a Series or DataFrame with data ranks as values.
10002
10003 See Also
10004 --------
10005 core.groupby.DataFrameGroupBy.rank : Rank of values within each group.
10006 core.groupby.SeriesGroupBy.rank : Rank of values within each group.
10007
10008 Examples
10009 --------
10010 >>> df = pd.DataFrame(data={'Animal': ['cat', 'penguin', 'dog',
10011 ... 'spider', 'snake'],
10012 ... 'Number_legs': [4, 2, 4, 8, np.nan]})
10013 >>> df
10014 Animal Number_legs
10015 0 cat 4.0
10016 1 penguin 2.0
10017 2 dog 4.0
10018 3 spider 8.0
10019 4 snake NaN
10020
10021 Ties are assigned the mean of the ranks (by default) for the group.
10022
10023 >>> s = pd.Series(range(5), index=list("abcde"))
10024 >>> s["d"] = s["b"]
10025 >>> s.rank()
10026 a 1.0
10027 b 2.5
10028 c 4.0
10029 d 2.5
10030 e 5.0
10031 dtype: float64
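
With ``method='dense'``, tied values share a rank and the rank
increases by exactly 1 between groups of ties:

>>> s.rank(method='dense')
a    1.0
b    2.0
c    3.0
d    2.0
e    4.0
dtype: float64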
10032
10033 The following example shows how the method behaves with the above
10034 parameters:
10035
10036 * default_rank: this is the default behaviour obtained without using
10037 any parameter.
* max_rank: with ``method='max'``, the records that have the same
  values are ranked using the highest rank (e.g., since 'cat' and 'dog'
  are both in the 2nd and 3rd position, rank 3 is assigned).
* NA_bottom: with ``na_option='bottom'``, records with NaN values are
  placed at the bottom of the ranking.
10043 * pct_rank: when setting ``pct = True``, the ranking is expressed as
10044 percentile rank.
10045
10046 >>> df['default_rank'] = df['Number_legs'].rank()
10047 >>> df['max_rank'] = df['Number_legs'].rank(method='max')
10048 >>> df['NA_bottom'] = df['Number_legs'].rank(na_option='bottom')
10049 >>> df['pct_rank'] = df['Number_legs'].rank(pct=True)
10050 >>> df
10051 Animal Number_legs default_rank max_rank NA_bottom pct_rank
10052 0 cat 4.0 2.5 3.0 2.5 0.625
10053 1 penguin 2.0 1.0 1.0 1.0 0.250
10054 2 dog 4.0 2.5 3.0 2.5 0.625
10055 3 spider 8.0 4.0 4.0 4.0 1.000
10056 4 snake NaN NaN NaN 5.0 NaN
10057 """
10058 axis_int = self._get_axis_number(axis)
10059
10060 if na_option not in {"keep", "top", "bottom"}:
10061 msg = "na_option must be one of 'keep', 'top', or 'bottom'"
10062 raise ValueError(msg)
10063
10064 def ranker(data):
10065 if data.ndim == 2:
10066 # i.e. DataFrame, we cast to ndarray
10067 values = data.values
10068 else:
10069 # i.e. Series, can dispatch to EA
10070 values = data._values
10071
10072 if isinstance(values, ExtensionArray):
10073 ranks = values._rank(
10074 axis=axis_int,
10075 method=method,
10076 ascending=ascending,
10077 na_option=na_option,
10078 pct=pct,
10079 )
10080 else:
10081 ranks = algos.rank(
10082 values,
10083 axis=axis_int,
10084 method=method,
10085 ascending=ascending,
10086 na_option=na_option,
10087 pct=pct,
10088 )
10089
10090 ranks_obj = self._constructor(ranks, **data._construct_axes_dict())
10091 return ranks_obj.__finalize__(self, method="rank")
10092
10093 if numeric_only:
10094 if self.ndim == 1 and not is_numeric_dtype(self.dtype):
10095 # GH#47500
10096 raise TypeError(
10097 "Series.rank does not allow numeric_only=True with "
10098 "non-numeric dtype."
10099 )
10100 data = self._get_numeric_data()
10101 else:
10102 data = self
10103
10104 return ranker(data)
10105
10106 @doc(_shared_docs["compare"], klass=_shared_doc_kwargs["klass"])
10107 def compare(
10108 self,
10109 other,
10110 align_axis: Axis = 1,
10111 keep_shape: bool_t = False,
10112 keep_equal: bool_t = False,
10113 result_names: Suffixes = ("self", "other"),
10114 ):
10115 if type(self) is not type(other):
10116 cls_self, cls_other = type(self).__name__, type(other).__name__
10117 raise TypeError(
10118 f"can only compare '{cls_self}' (not '{cls_other}') with '{cls_self}'"
10119 )
10120
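# mask of positions where self and other differ; two NaNs compare
# equal here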
10121 mask = ~((self == other) | (self.isna() & other.isna()))
10122 mask.fillna(True, inplace=True)
10123
10124 if not keep_equal:
10125 self = self.where(mask)
10126 other = other.where(mask)
10127
10128 if not keep_shape:
10129 if isinstance(self, ABCDataFrame):
10130 cmask = mask.any()
10131 rmask = mask.any(axis=1)
10132 self = self.loc[rmask, cmask]
10133 other = other.loc[rmask, cmask]
10134 else:
10135 self = self[mask]
10136 other = other[mask]
10137 if not isinstance(result_names, tuple):
10138 raise TypeError(
10139 f"Passing 'result_names' as a {type(result_names)} is not "
10140 "supported. Provide 'result_names' as a tuple instead."
10141 )
10142
10143 if align_axis in (1, "columns"): # This is needed for Series
10144 axis = 1
10145 else:
10146 axis = self._get_axis_number(align_axis)
10147
10148 # error: List item 0 has incompatible type "NDFrame"; expected
10149 # "Union[Series, DataFrame]"
10150 diff = concat(
10151 [self, other], # type: ignore[list-item]
10152 axis=axis,
10153 keys=result_names,
10154 )
10155
10156 if axis >= self.ndim:
10157 # No need to reorganize data if stacking on new axis
10158 # This currently applies for stacking two Series on columns
10159 return diff
10160
10161 ax = diff._get_axis(axis)
10162 ax_names = np.array(ax.names)
10163
10164 # set index names to positions to avoid confusion
10165 ax.names = np.arange(len(ax_names))
10166
10167 # bring self-other to inner level
10168 order = list(range(1, ax.nlevels)) + [0]
10169 if isinstance(diff, ABCDataFrame):
10170 diff = diff.reorder_levels(order, axis=axis)
10171 else:
10172 diff = diff.reorder_levels(order)
10173
10174 # restore the index names in order
10175 diff._get_axis(axis=axis).names = ax_names[order]
10176
10177 # reorder axis to keep things organized
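# interleave self/other so each compared pair ends up adjacent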
10178 indices = (
10179 np.arange(diff.shape[axis]).reshape([2, diff.shape[axis] // 2]).T.flatten()
10180 )
10181 diff = diff.take(indices, axis=axis)
10182
10183 return diff
10184
10185 @final
10186 @doc(
10187 klass=_shared_doc_kwargs["klass"],
10188 axes_single_arg=_shared_doc_kwargs["axes_single_arg"],
10189 )
10190 def align(
10191 self,
10192 other: NDFrameT,
10193 join: AlignJoin = "outer",
10194 axis: Axis | None = None,
10195 level: Level | None = None,
10196 copy: bool_t | None = None,
10197 fill_value: Hashable | None = None,
10198 method: FillnaOptions | None | lib.NoDefault = lib.no_default,
10199 limit: int | None | lib.NoDefault = lib.no_default,
10200 fill_axis: Axis | lib.NoDefault = lib.no_default,
10201 broadcast_axis: Axis | None | lib.NoDefault = lib.no_default,
10202 ) -> tuple[Self, NDFrameT]:
10203 """
10204 Align two objects on their axes with the specified join method.
10205
10206 Join method is specified for each axis Index.
10207
10208 Parameters
10209 ----------
10210 other : DataFrame or Series
10211 join : {{'outer', 'inner', 'left', 'right'}}, default 'outer'
10212 Type of alignment to be performed.
10213
10214 * left: use only keys from left frame, preserve key order.
10215 * right: use only keys from right frame, preserve key order.
10216 * outer: use union of keys from both frames, sort keys lexicographically.
10217 * inner: use intersection of keys from both frames,
10218 preserve the order of the left keys.
10219
10220 axis : allowed axis of the other object, default None
10221 Align on index (0), columns (1), or both (None).
10222 level : int or level name, default None
10223 Broadcast across a level, matching Index values on the
10224 passed MultiIndex level.
10225 copy : bool, default True
10226 Always returns new objects. If copy=False and no reindexing is
10227 required then original objects are returned.
10228
10229 .. note::
10230 The `copy` keyword will change behavior in pandas 3.0.
10231 `Copy-on-Write
10232 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
10233 will be enabled by default, which means that all methods with a
10234 `copy` keyword will use a lazy copy mechanism to defer the copy and
10235 ignore the `copy` keyword. The `copy` keyword will be removed in a
10236 future version of pandas.
10237
You can already get the future behavior and improvements by
enabling copy-on-write: ``pd.options.mode.copy_on_write = True``.
10240 fill_value : scalar, default np.nan
10241 Value to use for missing values. Defaults to NaN, but can be any
10242 "compatible" value.
10243 method : {{'backfill', 'bfill', 'pad', 'ffill', None}}, default None
10244 Method to use for filling holes in reindexed Series:
10245
10246 - pad / ffill: propagate last valid observation forward to next valid.
10247 - backfill / bfill: use NEXT valid observation to fill gap.
10248
10249 .. deprecated:: 2.1
10250
10251 limit : int, default None
10252 If method is specified, this is the maximum number of consecutive
10253 NaN values to forward/backward fill. In other words, if there is
10254 a gap with more than this number of consecutive NaNs, it will only
10255 be partially filled. If method is not specified, this is the
10256 maximum number of entries along the entire axis where NaNs will be
10257 filled. Must be greater than 0 if not None.
10258
10259 .. deprecated:: 2.1
10260
10261 fill_axis : {axes_single_arg}, default 0
10262 Filling axis, method and limit.
10263
10264 .. deprecated:: 2.1
10265
10266 broadcast_axis : {axes_single_arg}, default None
10267 Broadcast values along this axis, if aligning two objects of
10268 different dimensions.
10269
10270 .. deprecated:: 2.1
10271
10272 Returns
10273 -------
10274 tuple of ({klass}, type of other)
10275 Aligned objects.
10276
10277 Examples
10278 --------
10279 >>> df = pd.DataFrame(
10280 ... [[1, 2, 3, 4], [6, 7, 8, 9]], columns=["D", "B", "E", "A"], index=[1, 2]
10281 ... )
10282 >>> other = pd.DataFrame(
10283 ... [[10, 20, 30, 40], [60, 70, 80, 90], [600, 700, 800, 900]],
10284 ... columns=["A", "B", "C", "D"],
10285 ... index=[2, 3, 4],
10286 ... )
10287 >>> df
10288 D B E A
10289 1 1 2 3 4
10290 2 6 7 8 9
10291 >>> other
10292 A B C D
10293 2 10 20 30 40
10294 3 60 70 80 90
10295 4 600 700 800 900
10296
10297 Align on columns:
10298
10299 >>> left, right = df.align(other, join="outer", axis=1)
10300 >>> left
10301 A B C D E
10302 1 4 2 NaN 1 3
10303 2 9 7 NaN 6 8
10304 >>> right
10305 A B C D E
10306 2 10 20 30 40 NaN
10307 3 60 70 80 90 NaN
10308 4 600 700 800 900 NaN
10309
10310 We can also align on the index:
10311
10312 >>> left, right = df.align(other, join="outer", axis=0)
10313 >>> left
10314 D B E A
10315 1 1.0 2.0 3.0 4.0
10316 2 6.0 7.0 8.0 9.0
10317 3 NaN NaN NaN NaN
10318 4 NaN NaN NaN NaN
10319 >>> right
10320 A B C D
10321 1 NaN NaN NaN NaN
10322 2 10.0 20.0 30.0 40.0
10323 3 60.0 70.0 80.0 90.0
10324 4 600.0 700.0 800.0 900.0
10325
10326 Finally, the default `axis=None` will align on both index and columns:
10327
10328 >>> left, right = df.align(other, join="outer", axis=None)
10329 >>> left
10330 A B C D E
10331 1 4.0 2.0 NaN 1.0 3.0
10332 2 9.0 7.0 NaN 6.0 8.0
10333 3 NaN NaN NaN NaN NaN
10334 4 NaN NaN NaN NaN NaN
10335 >>> right
10336 A B C D E
10337 1 NaN NaN NaN NaN NaN
10338 2 10.0 20.0 30.0 40.0 NaN
10339 3 60.0 70.0 80.0 90.0 NaN
10340 4 600.0 700.0 800.0 900.0 NaN
10341 """
10342 if (
10343 method is not lib.no_default
10344 or limit is not lib.no_default
10345 or fill_axis is not lib.no_default
10346 ):
10347 # GH#51856
10348 warnings.warn(
10349 "The 'method', 'limit', and 'fill_axis' keywords in "
10350 f"{type(self).__name__}.align are deprecated and will be removed "
10351 "in a future version. Call fillna directly on the returned objects "
10352 "instead.",
10353 FutureWarning,
10354 stacklevel=find_stack_level(),
10355 )
10356 if fill_axis is lib.no_default:
10357 fill_axis = 0
10358 if method is lib.no_default:
10359 method = None
10360 if limit is lib.no_default:
10361 limit = None
10362
10363 if method is not None:
10364 method = clean_fill_method(method)
10365
10366 if broadcast_axis is not lib.no_default:
10367 # GH#51856
10368 # TODO(3.0): enforcing this deprecation will close GH#13194
10369 msg = (
10370 f"The 'broadcast_axis' keyword in {type(self).__name__}.align is "
10371 "deprecated and will be removed in a future version."
10372 )
10373 if broadcast_axis is not None:
10374 if self.ndim == 1 and other.ndim == 2:
10375 msg += (
10376 " Use left = DataFrame({col: left for col in right.columns}, "
10377 "index=right.index) before calling `left.align(right)` instead."
10378 )
10379 elif self.ndim == 2 and other.ndim == 1:
10380 msg += (
10381 " Use right = DataFrame({col: right for col in left.columns}, "
10382 "index=left.index) before calling `left.align(right)` instead"
10383 )
10384 warnings.warn(msg, FutureWarning, stacklevel=find_stack_level())
10385 else:
10386 broadcast_axis = None
10387
10388 if broadcast_axis == 1 and self.ndim != other.ndim:
10389 if isinstance(self, ABCSeries):
10390 # this means other is a DataFrame, and we need to broadcast
10391 # self
10392 cons = self._constructor_expanddim
10393 df = cons(
10394 {c: self for c in other.columns}, **other._construct_axes_dict()
10395 )
10396 # error: Incompatible return value type (got "Tuple[DataFrame,
10397 # DataFrame]", expected "Tuple[Self, NDFrameT]")
10398 return df._align_frame( # type: ignore[return-value]
10399 other, # type: ignore[arg-type]
10400 join=join,
10401 axis=axis,
10402 level=level,
10403 copy=copy,
10404 fill_value=fill_value,
10405 method=method,
10406 limit=limit,
10407 fill_axis=fill_axis,
10408 )[:2]
10409 elif isinstance(other, ABCSeries):
10410 # this means self is a DataFrame, and we need to broadcast
10411 # other
10412 cons = other._constructor_expanddim
10413 df = cons(
10414 {c: other for c in self.columns}, **self._construct_axes_dict()
10415 )
10416 # error: Incompatible return value type (got "Tuple[NDFrameT,
10417 # DataFrame]", expected "Tuple[Self, NDFrameT]")
10418 return self._align_frame( # type: ignore[return-value]
10419 df,
10420 join=join,
10421 axis=axis,
10422 level=level,
10423 copy=copy,
10424 fill_value=fill_value,
10425 method=method,
10426 limit=limit,
10427 fill_axis=fill_axis,
10428 )[:2]
10429
10430 _right: DataFrame | Series
10431 if axis is not None:
10432 axis = self._get_axis_number(axis)
10433 if isinstance(other, ABCDataFrame):
10434 left, _right, join_index = self._align_frame(
10435 other,
10436 join=join,
10437 axis=axis,
10438 level=level,
10439 copy=copy,
10440 fill_value=fill_value,
10441 method=method,
10442 limit=limit,
10443 fill_axis=fill_axis,
10444 )
10445
10446 elif isinstance(other, ABCSeries):
10447 left, _right, join_index = self._align_series(
10448 other,
10449 join=join,
10450 axis=axis,
10451 level=level,
10452 copy=copy,
10453 fill_value=fill_value,
10454 method=method,
10455 limit=limit,
10456 fill_axis=fill_axis,
10457 )
10458 else: # pragma: no cover
10459 raise TypeError(f"unsupported type: {type(other)}")
10460
10461 right = cast(NDFrameT, _right)
10462 if self.ndim == 1 or axis == 0:
10463 # If we are aligning timezone-aware DatetimeIndexes and the timezones
10464 # do not match, convert both to UTC.
10465 if isinstance(left.index.dtype, DatetimeTZDtype):
10466 if left.index.tz != right.index.tz:
10467 if join_index is not None:
10468 # GH#33671 copy to ensure we don't change the index on
10469 # our original Series
10470 left = left.copy(deep=False)
10471 right = right.copy(deep=False)
10472 left.index = join_index
10473 right.index = join_index
10474
10475 left = left.__finalize__(self)
10476 right = right.__finalize__(other)
10477 return left, right
10478
10479 @final
10480 def _align_frame(
10481 self,
10482 other: DataFrame,
10483 join: AlignJoin = "outer",
10484 axis: Axis | None = None,
10485 level=None,
10486 copy: bool_t | None = None,
10487 fill_value=None,
10488 method=None,
10489 limit: int | None = None,
10490 fill_axis: Axis = 0,
10491 ) -> tuple[Self, DataFrame, Index | None]:
10492 # defaults
10493 join_index, join_columns = None, None
10494 ilidx, iridx = None, None
10495 clidx, cridx = None, None
10496
10497 is_series = isinstance(self, ABCSeries)
10498
10499 if (axis is None or axis == 0) and not self.index.equals(other.index):
10500 join_index, ilidx, iridx = self.index.join(
10501 other.index, how=join, level=level, return_indexers=True
10502 )
10503
10504 if (
10505 (axis is None or axis == 1)
10506 and not is_series
10507 and not self.columns.equals(other.columns)
10508 ):
10509 join_columns, clidx, cridx = self.columns.join(
10510 other.columns, how=join, level=level, return_indexers=True
10511 )
10512
10513 if is_series:
10514 reindexers = {0: [join_index, ilidx]}
10515 else:
10516 reindexers = {0: [join_index, ilidx], 1: [join_columns, clidx]}
10517
10518 left = self._reindex_with_indexers(
10519 reindexers, copy=copy, fill_value=fill_value, allow_dups=True
10520 )
10521 # other must be always DataFrame
10522 right = other._reindex_with_indexers(
10523 {0: [join_index, iridx], 1: [join_columns, cridx]},
10524 copy=copy,
10525 fill_value=fill_value,
10526 allow_dups=True,
10527 )
10528
10529 if method is not None:
10530 left = left._pad_or_backfill(method, axis=fill_axis, limit=limit)
10531 right = right._pad_or_backfill(method, axis=fill_axis, limit=limit)
10532
10533 return left, right, join_index
10534
10535 @final
10536 def _align_series(
10537 self,
10538 other: Series,
10539 join: AlignJoin = "outer",
10540 axis: Axis | None = None,
10541 level=None,
10542 copy: bool_t | None = None,
10543 fill_value=None,
10544 method=None,
10545 limit: int | None = None,
10546 fill_axis: Axis = 0,
10547 ) -> tuple[Self, Series, Index | None]:
10548 is_series = isinstance(self, ABCSeries)
10549 if copy and using_copy_on_write():
10550 copy = False
10551
10552 if (not is_series and axis is None) or axis not in [None, 0, 1]:
10553 raise ValueError("Must specify axis=0 or 1")
10554
10555 if is_series and axis == 1:
10556 raise ValueError("cannot align series to a series other than axis 0")
10557
10558 # series/series compat, other must always be a Series
10559 if not axis:
10560 # equal
10561 if self.index.equals(other.index):
10562 join_index, lidx, ridx = None, None, None
10563 else:
10564 join_index, lidx, ridx = self.index.join(
10565 other.index, how=join, level=level, return_indexers=True
10566 )
10567
10568 if is_series:
10569 left = self._reindex_indexer(join_index, lidx, copy)
10570 elif lidx is None or join_index is None:
10571 left = self.copy(deep=copy)
10572 else:
10573 new_mgr = self._mgr.reindex_indexer(join_index, lidx, axis=1, copy=copy)
10574 left = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes)
10575
10576 right = other._reindex_indexer(join_index, ridx, copy)
10577
10578 else:
10579 # one has > 1 ndim
10580 fdata = self._mgr
10581 join_index = self.axes[1]
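# axis=1: align this DataFrame's columns (axes[1]) with the
# Series' index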
10582 lidx, ridx = None, None
10583 if not join_index.equals(other.index):
10584 join_index, lidx, ridx = join_index.join(
10585 other.index, how=join, level=level, return_indexers=True
10586 )
10587
10588 if lidx is not None:
10589 bm_axis = self._get_block_manager_axis(1)
10590 fdata = fdata.reindex_indexer(join_index, lidx, axis=bm_axis)
10591
10592 if copy and fdata is self._mgr:
10593 fdata = fdata.copy()
10594
10595 left = self._constructor_from_mgr(fdata, axes=fdata.axes)
10596
10597 if ridx is None:
10598 right = other.copy(deep=copy)
10599 else:
10600 right = other.reindex(join_index, level=level)
10601
10602 # fill
10603 fill_na = notna(fill_value) or (method is not None)
10604 if fill_na:
10605 fill_value, method = validate_fillna_kwargs(fill_value, method)
10606 if method is not None:
10607 left = left._pad_or_backfill(method, limit=limit, axis=fill_axis)
10608 right = right._pad_or_backfill(method, limit=limit)
10609 else:
10610 left = left.fillna(fill_value, limit=limit, axis=fill_axis)
10611 right = right.fillna(fill_value, limit=limit)
10612
10613 return left, right, join_index
10614
10615 @final
10616 def _where(
10617 self,
10618 cond,
10619 other=lib.no_default,
10620 inplace: bool_t = False,
10621 axis: Axis | None = None,
10622 level=None,
10623 warn: bool_t = True,
10624 ):
10625 """
10626 Equivalent to public method `where`, except that `other` is not
10627 applied as a function even if callable. Used in __setitem__.
10628 """
10629 inplace = validate_bool_kwarg(inplace, "inplace")
10630
10631 if axis is not None:
10632 axis = self._get_axis_number(axis)
10633
10634 # align the cond to same shape as myself
10635 cond = common.apply_if_callable(cond, self)
10636 if isinstance(cond, NDFrame):
10637 # CoW: Make sure reference is not kept alive
10638 if cond.ndim == 1 and self.ndim == 2:
10639 cond = cond._constructor_expanddim(
10640 {i: cond for i in range(len(self.columns))},
10641 copy=False,
10642 )
10643 cond.columns = self.columns
10644 cond = cond.align(self, join="right", copy=False)[0]
10645 else:
10646 if not hasattr(cond, "shape"):
10647 cond = np.asanyarray(cond)
10648 if cond.shape != self.shape:
10649 raise ValueError("Array conditional must be same shape as self")
10650 cond = self._constructor(cond, **self._construct_axes_dict(), copy=False)
10651
10652 # make sure we are boolean
10653 fill_value = bool(inplace)
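# fill NA entries in cond: False for the `where` path, True for the
# inplace path (which inverts cond before calling putmask below)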
10654 with warnings.catch_warnings():
10655 warnings.filterwarnings(
10656 "ignore",
10657 "Downcasting object dtype arrays",
10658 category=FutureWarning,
10659 )
10660 cond = cond.fillna(fill_value)
10661 cond = cond.infer_objects(copy=False)
10662
10663 msg = "Boolean array expected for the condition, not {dtype}"
10664
10665 if not cond.empty:
10666 if not isinstance(cond, ABCDataFrame):
10667 # This is a single-dimensional object.
10668 if not is_bool_dtype(cond):
10669 raise ValueError(msg.format(dtype=cond.dtype))
10670 else:
10671 for _dt in cond.dtypes:
10672 if not is_bool_dtype(_dt):
10673 raise ValueError(msg.format(dtype=_dt))
10674 if cond._mgr.any_extension_types:
10675 # GH51574: avoid object ndarray conversion later on
10676 cond = cond._constructor(
10677 cond.to_numpy(dtype=bool, na_value=fill_value),
10678 **cond._construct_axes_dict(),
10679 )
10680 else:
10681 # GH#21947 we have an empty DataFrame/Series, could be object-dtype
10682 cond = cond.astype(bool)
10683
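# putmask (used for the inplace path) writes where the mask is True,
# so invert the condition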
cond = ~cond if inplace else cond
10685 cond = cond.reindex(self._info_axis, axis=self._info_axis_number, copy=False)
10686
10687 # try to align with other
10688 if isinstance(other, NDFrame):
10689 # align with me
10690 if other.ndim <= self.ndim:
10691 # CoW: Make sure reference is not kept alive
10692 other = self.align(
10693 other,
10694 join="left",
10695 axis=axis,
10696 level=level,
10697 fill_value=None,
10698 copy=False,
10699 )[1]
10700
10701 # if we are NOT aligned, raise as we cannot where index
10702 if axis is None and not other._indexed_same(self):
10703 raise InvalidIndexError
10704
10705 if other.ndim < self.ndim:
10706 # TODO(EA2D): avoid object-dtype cast in EA case GH#38729
10707 other = other._values
10708 if axis == 0:
10709 other = np.reshape(other, (-1, 1))
10710 elif axis == 1:
10711 other = np.reshape(other, (1, -1))
10712
10713 other = np.broadcast_to(other, self.shape)
10714
10715 # slice me out of the other
10716 else:
10717 raise NotImplementedError(
10718 "cannot align with a higher dimensional NDFrame"
10719 )
10720
10721 elif not isinstance(other, (MultiIndex, NDFrame)):
10722 # mainly just catching Index here
10723 other = extract_array(other, extract_numpy=True)
10724
10725 if isinstance(other, (np.ndarray, ExtensionArray)):
10726 if other.shape != self.shape:
10727 if self.ndim != 1:
10728 # In the ndim == 1 case we may have
10729 # other length 1, which we treat as scalar (GH#2745, GH#4192)
10730 # or len(other) == icond.sum(), which we treat like
10731 # __setitem__ (GH#3235)
10732 raise ValueError(
10733 "other must be the same shape as self when an ndarray"
10734 )
10735
10736 # we are the same shape, so create an actual object for alignment
10737 else:
10738 other = self._constructor(
10739 other, **self._construct_axes_dict(), copy=False
10740 )
10741
10742 if axis is None:
10743 axis = 0
10744
10745 if self.ndim == getattr(other, "ndim", 0):
10746 align = True
10747 else:
10748 align = self._get_axis_number(axis) == 1
10749
10750 if inplace:
10751 # we may have different type blocks come out of putmask, so
10752 # reconstruct the block manager
10753
10754 new_data = self._mgr.putmask(mask=cond, new=other, align=align, warn=warn)
10755 result = self._constructor_from_mgr(new_data, axes=new_data.axes)
10756 return self._update_inplace(result)
10757
10758 else:
10759 new_data = self._mgr.where(
10760 other=other,
10761 cond=cond,
10762 align=align,
10763 )
10764 result = self._constructor_from_mgr(new_data, axes=new_data.axes)
10765 return result.__finalize__(self)
10766
10767 @overload
10768 def where(
10769 self,
10770 cond,
10771 other=...,
10772 *,
10773 inplace: Literal[False] = ...,
10774 axis: Axis | None = ...,
10775 level: Level = ...,
10776 ) -> Self:
10777 ...
10778
10779 @overload
10780 def where(
10781 self,
10782 cond,
10783 other=...,
10784 *,
10785 inplace: Literal[True],
10786 axis: Axis | None = ...,
10787 level: Level = ...,
10788 ) -> None:
10789 ...
10790
10791 @overload
10792 def where(
10793 self,
10794 cond,
10795 other=...,
10796 *,
10797 inplace: bool_t = ...,
10798 axis: Axis | None = ...,
10799 level: Level = ...,
10800 ) -> Self | None:
10801 ...
10802
10803 @final
10804 @doc(
10805 klass=_shared_doc_kwargs["klass"],
10806 cond="True",
10807 cond_rev="False",
10808 name="where",
10809 name_other="mask",
10810 )
10811 def where(
10812 self,
10813 cond,
10814 other=np.nan,
10815 *,
10816 inplace: bool_t = False,
10817 axis: Axis | None = None,
10818 level: Level | None = None,
10819 ) -> Self | None:
10820 """
10821 Replace values where the condition is {cond_rev}.
10822
10823 Parameters
10824 ----------
10825 cond : bool {klass}, array-like, or callable
10826 Where `cond` is {cond}, keep the original value. Where
10827 {cond_rev}, replace with corresponding value from `other`.
10828 If `cond` is callable, it is computed on the {klass} and
10829 should return boolean {klass} or array. The callable must
10830 not change input {klass} (though pandas doesn't check it).
10831 other : scalar, {klass}, or callable
10832 Entries where `cond` is {cond_rev} are replaced with
10833 corresponding value from `other`.
10834 If other is callable, it is computed on the {klass} and
10835 should return scalar or {klass}. The callable must not
10836 change input {klass} (though pandas doesn't check it).
10837 If not specified, entries will be filled with the corresponding
10838 NULL value (``np.nan`` for numpy dtypes, ``pd.NA`` for extension
10839 dtypes).
10840 inplace : bool, default False
10841 Whether to perform the operation in place on the data.
10842 axis : int, default None
10843 Alignment axis if needed. For `Series` this parameter is
10844 unused and defaults to 0.
10845 level : int, default None
10846 Alignment level if needed.
10847
10848 Returns
10849 -------
10850 Same type as caller or None if ``inplace=True``.
10851
10852 See Also
10853 --------
10854 :func:`DataFrame.{name_other}` : Return an object of same shape as
10855 self.
10856
10857 Notes
10858 -----
10859 The {name} method is an application of the if-then idiom. For each
10860 element in the calling DataFrame, if ``cond`` is ``{cond}`` the
10861 element is used; otherwise the corresponding element from the DataFrame
10862 ``other`` is used. If the axis of ``other`` does not align with axis of
10863 ``cond`` {klass}, the misaligned index positions will be filled with
10864 {cond_rev}.
10865
10866 The signature for :func:`DataFrame.where` differs from
10867 :func:`numpy.where`. Roughly ``df1.where(m, df2)`` is equivalent to
10868 ``np.where(m, df1, df2)``.
10869
10870 For further details and examples see the ``{name}`` documentation in
10871 :ref:`indexing <indexing.where_mask>`.
10872
The dtype of the object takes precedence. The fill value is cast to
the object's dtype, if this can be done losslessly.
10875
10876 Examples
10877 --------
10878 >>> s = pd.Series(range(5))
10879 >>> s.where(s > 0)
10880 0 NaN
10881 1 1.0
10882 2 2.0
10883 3 3.0
10884 4 4.0
10885 dtype: float64
10886 >>> s.mask(s > 0)
10887 0 0.0
10888 1 NaN
10889 2 NaN
10890 3 NaN
10891 4 NaN
10892 dtype: float64
10893
10894 >>> s = pd.Series(range(5))
10895 >>> t = pd.Series([True, False])
10896 >>> s.where(t, 99)
10897 0 0
10898 1 99
10899 2 99
10900 3 99
10901 4 99
10902 dtype: int64
10903 >>> s.mask(t, 99)
10904 0 99
10905 1 1
10906 2 99
10907 3 99
10908 4 99
10909 dtype: int64
10910
10911 >>> s.where(s > 1, 10)
10912 0 10
10913 1 10
10914 2 2
10915 3 3
10916 4 4
10917 dtype: int64
10918 >>> s.mask(s > 1, 10)
10919 0 0
10920 1 1
10921 2 10
10922 3 10
10923 4 10
10924 dtype: int64
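
Both ``cond`` and ``other`` may be callables, computed on the {klass}:

>>> s.where(lambda x: x > 1, lambda x: -x)
0    0
1   -1
2    2
3    3
4    4
dtype: int64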
10925
10926 >>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
10927 >>> df
10928 A B
10929 0 0 1
10930 1 2 3
10931 2 4 5
10932 3 6 7
10933 4 8 9
10934 >>> m = df % 3 == 0
10935 >>> df.where(m, -df)
10936 A B
10937 0 0 -1
10938 1 -2 3
10939 2 -4 -5
10940 3 6 -7
10941 4 -8 9
10942 >>> df.where(m, -df) == np.where(m, df, -df)
10943 A B
10944 0 True True
10945 1 True True
10946 2 True True
10947 3 True True
10948 4 True True
10949 >>> df.where(m, -df) == df.mask(~m, -df)
10950 A B
10951 0 True True
10952 1 True True
10953 2 True True
10954 3 True True
10955 4 True True
10956 """
10957 inplace = validate_bool_kwarg(inplace, "inplace")
10958 if inplace:
10959 if not PYPY and using_copy_on_write():
10960 if sys.getrefcount(self) <= REF_COUNT:
10961 warnings.warn(
10962 _chained_assignment_method_msg,
10963 ChainedAssignmentError,
10964 stacklevel=2,
10965 )
10966 elif (
10967 not PYPY
10968 and not using_copy_on_write()
10969 and self._is_view_after_cow_rules()
10970 ):
10971 ctr = sys.getrefcount(self)
10972 ref_count = REF_COUNT
10973 if isinstance(self, ABCSeries) and hasattr(self, "_cacher"):
10974 # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221
10975 ref_count += 1
10976 if ctr <= ref_count:
10977 warnings.warn(
10978 _chained_assignment_warning_method_msg,
10979 FutureWarning,
10980 stacklevel=2,
10981 )
10982
10983 other = common.apply_if_callable(other, self)
10984 return self._where(cond, other, inplace, axis, level)
10985
10986 @overload
10987 def mask(
10988 self,
10989 cond,
10990 other=...,
10991 *,
10992 inplace: Literal[False] = ...,
10993 axis: Axis | None = ...,
10994 level: Level = ...,
10995 ) -> Self:
10996 ...
10997
10998 @overload
10999 def mask(
11000 self,
11001 cond,
11002 other=...,
11003 *,
11004 inplace: Literal[True],
11005 axis: Axis | None = ...,
11006 level: Level = ...,
11007 ) -> None:
11008 ...
11009
11010 @overload
11011 def mask(
11012 self,
11013 cond,
11014 other=...,
11015 *,
11016 inplace: bool_t = ...,
11017 axis: Axis | None = ...,
11018 level: Level = ...,
11019 ) -> Self | None:
11020 ...
11021
11022 @final
11023 @doc(
11024 where,
11025 klass=_shared_doc_kwargs["klass"],
11026 cond="False",
11027 cond_rev="True",
11028 name="mask",
11029 name_other="where",
11030 )
11031 def mask(
11032 self,
11033 cond,
11034 other=lib.no_default,
11035 *,
11036 inplace: bool_t = False,
11037 axis: Axis | None = None,
11038 level: Level | None = None,
11039 ) -> Self | None:
11040 inplace = validate_bool_kwarg(inplace, "inplace")
11041 if inplace:
11042 if not PYPY and using_copy_on_write():
11043 if sys.getrefcount(self) <= REF_COUNT:
11044 warnings.warn(
11045 _chained_assignment_method_msg,
11046 ChainedAssignmentError,
11047 stacklevel=2,
11048 )
11049 elif (
11050 not PYPY
11051 and not using_copy_on_write()
11052 and self._is_view_after_cow_rules()
11053 ):
11054 ctr = sys.getrefcount(self)
11055 ref_count = REF_COUNT
11056 if isinstance(self, ABCSeries) and hasattr(self, "_cacher"):
11057 # see https://github.com/pandas-dev/pandas/pull/56060#discussion_r1399245221
11058 ref_count += 1
11059 if ctr <= ref_count:
11060 warnings.warn(
11061 _chained_assignment_warning_method_msg,
11062 FutureWarning,
11063 stacklevel=2,
11064 )
11065
11066 cond = common.apply_if_callable(cond, self)
11067 other = common.apply_if_callable(other, self)
11068
11069 # see gh-21891
11070 if not hasattr(cond, "__invert__"):
11071 cond = np.array(cond)
11072
11073 return self._where(
11074 ~cond,
11075 other=other,
11076 inplace=inplace,
11077 axis=axis,
11078 level=level,
11079 )
11080
11081 @doc(klass=_shared_doc_kwargs["klass"])
11082 def shift(
11083 self,
11084 periods: int | Sequence[int] = 1,
11085 freq=None,
11086 axis: Axis = 0,
11087 fill_value: Hashable = lib.no_default,
11088 suffix: str | None = None,
11089 ) -> Self | DataFrame:
11090 """
11091 Shift index by desired number of periods with an optional time `freq`.
11092
When `freq` is not passed, the data is shifted and the index is left unchanged.
11094 If `freq` is passed (in this case, the index must be date or datetime,
11095 or it will raise a `NotImplementedError`), the index will be
11096 increased using the periods and the `freq`. `freq` can be inferred
11097 when specified as "infer" as long as either freq or inferred_freq
11098 attribute is set in the index.
11099
11100 Parameters
11101 ----------
11102 periods : int or Sequence
11103 Number of periods to shift. Can be positive or negative.
11104 If an iterable of ints, the data will be shifted once by each int.
11105 This is equivalent to shifting by one value at a time and
11106 concatenating all resulting frames. The resulting columns will have
11107 the shift suffixed to their column names. For multiple periods,
11108 axis must not be 1.
11109 freq : DateOffset, tseries.offsets, timedelta, or str, optional
11110 Offset to use from the tseries module or time rule (e.g. 'EOM').
11111 If `freq` is specified then the index values are shifted but the
11112 data is not realigned. That is, use `freq` if you would like to
11113 extend the index when shifting and preserve the original data.
11114 If `freq` is specified as "infer" then it will be inferred from
11115 the freq or inferred_freq attributes of the index. If neither of
11116 those attributes exist, a ValueError is thrown.
axis : {{0 or 'index', 1 or 'columns', None}}, default 0
11118 Shift direction. For `Series` this parameter is unused and defaults to 0.
11119 fill_value : object, optional
11120 The scalar value to use for newly introduced missing values.
The default depends on the dtype of `self`.
11122 For numeric data, ``np.nan`` is used.
11123 For datetime, timedelta, or period data, etc. :attr:`NaT` is used.
11124 For extension dtypes, ``self.dtype.na_value`` is used.
11125 suffix : str, optional
11126 If str and periods is an iterable, this is added after the column
11127 name and before the shift value for each shifted column name.
11128
11129 Returns
11130 -------
11131 {klass}
11132 Copy of input object, shifted.
11133
11134 See Also
11135 --------
11136 Index.shift : Shift values of Index.
11137 DatetimeIndex.shift : Shift values of DatetimeIndex.
11138 PeriodIndex.shift : Shift values of PeriodIndex.
11139
11140 Examples
11141 --------
11142 >>> df = pd.DataFrame({{"Col1": [10, 20, 15, 30, 45],
11143 ... "Col2": [13, 23, 18, 33, 48],
11144 ... "Col3": [17, 27, 22, 37, 52]}},
11145 ... index=pd.date_range("2020-01-01", "2020-01-05"))
11146 >>> df
11147 Col1 Col2 Col3
11148 2020-01-01 10 13 17
11149 2020-01-02 20 23 27
11150 2020-01-03 15 18 22
11151 2020-01-04 30 33 37
11152 2020-01-05 45 48 52
11153
11154 >>> df.shift(periods=3)
11155 Col1 Col2 Col3
11156 2020-01-01 NaN NaN NaN
11157 2020-01-02 NaN NaN NaN
11158 2020-01-03 NaN NaN NaN
11159 2020-01-04 10.0 13.0 17.0
11160 2020-01-05 20.0 23.0 27.0
11161
11162 >>> df.shift(periods=1, axis="columns")
11163 Col1 Col2 Col3
11164 2020-01-01 NaN 10 13
11165 2020-01-02 NaN 20 23
11166 2020-01-03 NaN 15 18
11167 2020-01-04 NaN 30 33
11168 2020-01-05 NaN 45 48
11169
11170 >>> df.shift(periods=3, fill_value=0)
11171 Col1 Col2 Col3
11172 2020-01-01 0 0 0
11173 2020-01-02 0 0 0
11174 2020-01-03 0 0 0
11175 2020-01-04 10 13 17
11176 2020-01-05 20 23 27
11177
11178 >>> df.shift(periods=3, freq="D")
11179 Col1 Col2 Col3
11180 2020-01-04 10 13 17
11181 2020-01-05 20 23 27
11182 2020-01-06 15 18 22
11183 2020-01-07 30 33 37
11184 2020-01-08 45 48 52
11185
11186 >>> df.shift(periods=3, freq="infer")
11187 Col1 Col2 Col3
11188 2020-01-04 10 13 17
11189 2020-01-05 20 23 27
11190 2020-01-06 15 18 22
11191 2020-01-07 30 33 37
11192 2020-01-08 45 48 52
11193
11194 >>> df['Col1'].shift(periods=[0, 1, 2])
11195 Col1_0 Col1_1 Col1_2
11196 2020-01-01 10 NaN NaN
11197 2020-01-02 20 10.0 NaN
11198 2020-01-03 15 20.0 10.0
11199 2020-01-04 30 15.0 20.0
11200 2020-01-05 45 30.0 15.0
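
If ``suffix`` is given, it is inserted between the column name and each
shift value in the resulting column names:

>>> df['Col1'].shift(periods=[0, 1, 2], suffix='_suffix')
            Col1_suffix_0  Col1_suffix_1  Col1_suffix_2
2020-01-01             10            NaN            NaN
2020-01-02             20           10.0            NaN
2020-01-03             15           20.0           10.0
2020-01-04             30           15.0           20.0
2020-01-05             45           30.0           15.0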
11201 """
11202 axis = self._get_axis_number(axis)
11203
11204 if freq is not None and fill_value is not lib.no_default:
11205 # GH#53832
11206 warnings.warn(
11207 "Passing a 'freq' together with a 'fill_value' silently ignores "
11208 "the fill_value and is deprecated. This will raise in a future "
11209 "version.",
11210 FutureWarning,
11211 stacklevel=find_stack_level(),
11212 )
11213 fill_value = lib.no_default
11214
11215 if periods == 0:
11216 return self.copy(deep=None)
11217
11218 if is_list_like(periods) and isinstance(self, ABCSeries):
11219 return self.to_frame().shift(
11220 periods=periods, freq=freq, axis=axis, fill_value=fill_value
11221 )
11222 periods = cast(int, periods)
11223
11224 if freq is None:
11225 # when freq is None, data is shifted, index is not
11226 axis = self._get_axis_number(axis)
11227 assert axis == 0 # axis == 1 cases handled in DataFrame.shift
11228 new_data = self._mgr.shift(periods=periods, fill_value=fill_value)
11229 return self._constructor_from_mgr(
11230 new_data, axes=new_data.axes
11231 ).__finalize__(self, method="shift")
11232
11233 return self._shift_with_freq(periods, axis, freq)
11234
11235 @final
11236 def _shift_with_freq(self, periods: int, axis: int, freq) -> Self:
11237 # see shift.__doc__
11238 # when freq is given, index is shifted, data is not
11239 index = self._get_axis(axis)
11240
11241 if freq == "infer":
11242 freq = getattr(index, "freq", None)
11243
11244 if freq is None:
11245 freq = getattr(index, "inferred_freq", None)
11246
11247 if freq is None:
11248 msg = "Freq was not set in the index hence cannot be inferred"
11249 raise ValueError(msg)
11250
11251 elif isinstance(freq, str):
11252 is_period = isinstance(index, PeriodIndex)
11253 freq = to_offset(freq, is_period=is_period)
11254
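        # a PeriodIndex can only be shifted in multiples of its own freq,
        # so a mismatching freq is rejected rather than silently coerced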
11255 if isinstance(index, PeriodIndex):
11256 orig_freq = to_offset(index.freq)
11257 if freq != orig_freq:
11258 assert orig_freq is not None # for mypy
11259 raise ValueError(
11260 f"Given freq {freq_to_period_freqstr(freq.n, freq.name)} "
11261 f"does not match PeriodIndex freq "
11262 f"{freq_to_period_freqstr(orig_freq.n, orig_freq.name)}"
11263 )
11264 new_ax = index.shift(periods)
11265 else:
11266 new_ax = index.shift(periods, freq)
11267
11268 result = self.set_axis(new_ax, axis=axis)
11269 return result.__finalize__(self, method="shift")
11270
11271 @final
11272 def truncate(
11273 self,
11274 before=None,
11275 after=None,
11276 axis: Axis | None = None,
11277 copy: bool_t | None = None,
11278 ) -> Self:
11279 """
11280 Truncate a Series or DataFrame before and after some index value.
11281
11282 This is a useful shorthand for boolean indexing based on index
11283 values above or below certain thresholds.
11284
11285 Parameters
11286 ----------
11287 before : date, str, int
11288 Truncate all rows before this index value.
11289 after : date, str, int
11290 Truncate all rows after this index value.
11291 axis : {0 or 'index', 1 or 'columns'}, optional
11292 Axis to truncate. Truncates the index (rows) by default.
11293 For `Series` this parameter is unused and defaults to 0.
        copy : bool, default True
11295 Return a copy of the truncated section.
11296
11297 .. note::
11298 The `copy` keyword will change behavior in pandas 3.0.
11299 `Copy-on-Write
11300 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
11301 will be enabled by default, which means that all methods with a
11302 `copy` keyword will use a lazy copy mechanism to defer the copy and
11303 ignore the `copy` keyword. The `copy` keyword will be removed in a
11304 future version of pandas.
11305
11306 You can already get the future behavior and improvements through
11307 enabling copy on write ``pd.options.mode.copy_on_write = True``
11308
11309 Returns
11310 -------
11311 type of caller
11312 The truncated Series or DataFrame.
11313
11314 See Also
11315 --------
11316 DataFrame.loc : Select a subset of a DataFrame by label.
11317 DataFrame.iloc : Select a subset of a DataFrame by position.
11318
11319 Notes
11320 -----
11321 If the index being truncated contains only datetime values,
11322 `before` and `after` may be specified as strings instead of
11323 Timestamps.
11324
11325 Examples
11326 --------
11327 >>> df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e'],
11328 ... 'B': ['f', 'g', 'h', 'i', 'j'],
11329 ... 'C': ['k', 'l', 'm', 'n', 'o']},
11330 ... index=[1, 2, 3, 4, 5])
11331 >>> df
11332 A B C
11333 1 a f k
11334 2 b g l
11335 3 c h m
11336 4 d i n
11337 5 e j o
11338
11339 >>> df.truncate(before=2, after=4)
11340 A B C
11341 2 b g l
11342 3 c h m
11343 4 d i n
11344
11345 The columns of a DataFrame can be truncated.
11346
11347 >>> df.truncate(before="A", after="B", axis="columns")
11348 A B
11349 1 a f
11350 2 b g
11351 3 c h
11352 4 d i
11353 5 e j
11354
11355 For Series, only rows can be truncated.
11356
11357 >>> df['A'].truncate(before=2, after=4)
11358 2 b
11359 3 c
11360 4 d
11361 Name: A, dtype: object
11362
11363 The index values in ``truncate`` can be datetimes or string
11364 dates.
11365
11366 >>> dates = pd.date_range('2016-01-01', '2016-02-01', freq='s')
11367 >>> df = pd.DataFrame(index=dates, data={'A': 1})
11368 >>> df.tail()
11369 A
11370 2016-01-31 23:59:56 1
11371 2016-01-31 23:59:57 1
11372 2016-01-31 23:59:58 1
11373 2016-01-31 23:59:59 1
11374 2016-02-01 00:00:00 1
11375
11376 >>> df.truncate(before=pd.Timestamp('2016-01-05'),
11377 ... after=pd.Timestamp('2016-01-10')).tail()
11378 A
11379 2016-01-09 23:59:56 1
11380 2016-01-09 23:59:57 1
11381 2016-01-09 23:59:58 1
11382 2016-01-09 23:59:59 1
11383 2016-01-10 00:00:00 1
11384
11385 Because the index is a DatetimeIndex containing only dates, we can
11386 specify `before` and `after` as strings. They will be coerced to
11387 Timestamps before truncation.
11388
11389 >>> df.truncate('2016-01-05', '2016-01-10').tail()
11390 A
11391 2016-01-09 23:59:56 1
11392 2016-01-09 23:59:57 1
11393 2016-01-09 23:59:58 1
11394 2016-01-09 23:59:59 1
11395 2016-01-10 00:00:00 1
11396
11397 Note that ``truncate`` assumes a 0 value for any unspecified time
11398 component (midnight). This differs from partial string slicing, which
11399 returns any partially matching dates.
11400
11401 >>> df.loc['2016-01-05':'2016-01-10', :].tail()
11402 A
11403 2016-01-10 23:59:55 1
11404 2016-01-10 23:59:56 1
11405 2016-01-10 23:59:57 1
11406 2016-01-10 23:59:58 1
11407 2016-01-10 23:59:59 1
11408 """
11409 if axis is None:
11410 axis = 0
11411 axis = self._get_axis_number(axis)
11412 ax = self._get_axis(axis)
11413
11414 # GH 17935
11415 # Check that index is sorted
11416 if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing:
11417 raise ValueError("truncate requires a sorted index")
11418
11419 # if we have a date index, convert to dates, otherwise
11420 # treat like a slice
11421 if ax._is_all_dates:
11422 from pandas.core.tools.datetimes import to_datetime
11423
11424 before = to_datetime(before)
11425 after = to_datetime(after)
11426
11427 if before is not None and after is not None and before > after:
11428 raise ValueError(f"Truncate: {after} must be after {before}")
11429
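        # for a monotonically decreasing index, swap the bounds so the
        # label-based slice below selects the intended window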
11430 if len(ax) > 1 and ax.is_monotonic_decreasing and ax.nunique() > 1:
11431 before, after = after, before
11432
11433 slicer = [slice(None, None)] * self._AXIS_LEN
11434 slicer[axis] = slice(before, after)
11435 result = self.loc[tuple(slicer)]
11436
11437 if isinstance(ax, MultiIndex):
11438 setattr(result, self._get_axis_name(axis), ax.truncate(before, after))
11439
11440 result = result.copy(deep=copy and not using_copy_on_write())
11441
11442 return result
11443
11444 @final
11445 @doc(klass=_shared_doc_kwargs["klass"])
11446 def tz_convert(
11447 self, tz, axis: Axis = 0, level=None, copy: bool_t | None = None
11448 ) -> Self:
11449 """
11450 Convert tz-aware axis to target time zone.
11451
11452 Parameters
11453 ----------
11454 tz : str or tzinfo object or None
11455 Target time zone. Passing ``None`` will convert to
11456 UTC and remove the timezone information.
11457 axis : {{0 or 'index', 1 or 'columns'}}, default 0
            The axis to convert.
11459 level : int, str, default None
11460 If axis is a MultiIndex, convert a specific level. Otherwise
11461 must be None.
11462 copy : bool, default True
11463 Also make a copy of the underlying data.
11464
11465 .. note::
11466 The `copy` keyword will change behavior in pandas 3.0.
11467 `Copy-on-Write
11468 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
11469 will be enabled by default, which means that all methods with a
11470 `copy` keyword will use a lazy copy mechanism to defer the copy and
11471 ignore the `copy` keyword. The `copy` keyword will be removed in a
11472 future version of pandas.
11473
11474 You can already get the future behavior and improvements through
11475 enabling copy on write ``pd.options.mode.copy_on_write = True``
11476
11477 Returns
11478 -------
11479 {klass}
11480 Object with time zone converted axis.
11481
11482 Raises
11483 ------
11484 TypeError
11485 If the axis is tz-naive.
11486
11487 Examples
11488 --------
11489 Change to another time zone:
11490
11491 >>> s = pd.Series(
11492 ... [1],
11493 ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00']),
11494 ... )
11495 >>> s.tz_convert('Asia/Shanghai')
11496 2018-09-15 07:30:00+08:00 1
11497 dtype: int64
11498
11499 Pass None to convert to UTC and get a tz-naive index:
11500
11501 >>> s = pd.Series([1],
11502 ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00']))
11503 >>> s.tz_convert(None)
11504 2018-09-14 23:30:00 1
11505 dtype: int64
11506 """
11507 axis = self._get_axis_number(axis)
11508 ax = self._get_axis(axis)
11509
11510 def _tz_convert(ax, tz):
11511 if not hasattr(ax, "tz_convert"):
11512 if len(ax) > 0:
11513 ax_name = self._get_axis_name(axis)
11514 raise TypeError(
11515 f"{ax_name} is not a valid DatetimeIndex or PeriodIndex"
11516 )
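                # empty non-datetime axis: substitute an empty DatetimeIndex
                # with the target tz instead of raising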
11517 ax = DatetimeIndex([], tz=tz)
11518 else:
11519 ax = ax.tz_convert(tz)
11520 return ax
11521
11522 # if a level is given it must be a MultiIndex level or
11523 # equivalent to the axis name
11524 if isinstance(ax, MultiIndex):
11525 level = ax._get_level_number(level)
11526 new_level = _tz_convert(ax.levels[level], tz)
11527 ax = ax.set_levels(new_level, level=level)
11528 else:
11529 if level not in (None, 0, ax.name):
11530 raise ValueError(f"The level {level} is not valid")
11531 ax = _tz_convert(ax, tz)
11532
11533 result = self.copy(deep=copy and not using_copy_on_write())
11534 result = result.set_axis(ax, axis=axis, copy=False)
11535 return result.__finalize__(self, method="tz_convert")
11536
11537 @final
11538 @doc(klass=_shared_doc_kwargs["klass"])
11539 def tz_localize(
11540 self,
11541 tz,
11542 axis: Axis = 0,
11543 level=None,
11544 copy: bool_t | None = None,
11545 ambiguous: TimeAmbiguous = "raise",
11546 nonexistent: TimeNonexistent = "raise",
11547 ) -> Self:
11548 """
11549 Localize tz-naive index of a Series or DataFrame to target time zone.
11550
11551 This operation localizes the Index. To localize the values in a
11552 timezone-naive Series, use :meth:`Series.dt.tz_localize`.
11553
11554 Parameters
11555 ----------
11556 tz : str or tzinfo or None
11557 Time zone to localize. Passing ``None`` will remove the
11558 time zone information and preserve local time.
11559 axis : {{0 or 'index', 1 or 'columns'}}, default 0
            The axis to localize.
        level : int, str, default None
            If axis is a MultiIndex, localize a specific level. Otherwise
            must be None.
11564 copy : bool, default True
11565 Also make a copy of the underlying data.
11566
11567 .. note::
11568 The `copy` keyword will change behavior in pandas 3.0.
11569 `Copy-on-Write
11570 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__
11571 will be enabled by default, which means that all methods with a
11572 `copy` keyword will use a lazy copy mechanism to defer the copy and
11573 ignore the `copy` keyword. The `copy` keyword will be removed in a
11574 future version of pandas.
11575
11576 You can already get the future behavior and improvements through
11577 enabling copy on write ``pd.options.mode.copy_on_write = True``
11578 ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise'
11579 When clocks moved backward due to DST, ambiguous times may arise.
11580 For example in Central European Time (UTC+01), when going from
11581 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at
11582 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the
11583 `ambiguous` parameter dictates how ambiguous times should be
11584 handled.
11585
11586 - 'infer' will attempt to infer fall dst-transition hours based on
11587 order
11588 - bool-ndarray where True signifies a DST time, False designates
11589 a non-DST time (note that this flag is only applicable for
11590 ambiguous times)
11591 - 'NaT' will return NaT where there are ambiguous times
11592 - 'raise' will raise an AmbiguousTimeError if there are ambiguous
11593 times.
11594 nonexistent : str, default 'raise'
11595 A nonexistent time does not exist in a particular timezone
11596 where clocks moved forward due to DST. Valid values are:
11597
11598 - 'shift_forward' will shift the nonexistent time forward to the
11599 closest existing time
11600 - 'shift_backward' will shift the nonexistent time backward to the
11601 closest existing time
11602 - 'NaT' will return NaT where there are nonexistent times
11603 - timedelta objects will shift nonexistent times by the timedelta
            - 'raise' will raise a NonExistentTimeError if there are
              nonexistent times.
11606
11607 Returns
11608 -------
11609 {klass}
11610 Same type as the input.
11611
11612 Raises
11613 ------
11614 TypeError
11615 If the TimeSeries is tz-aware and tz is not None.
11616
11617 Examples
11618 --------
11619 Localize local times:
11620
11621 >>> s = pd.Series(
11622 ... [1],
11623 ... index=pd.DatetimeIndex(['2018-09-15 01:30:00']),
11624 ... )
11625 >>> s.tz_localize('CET')
11626 2018-09-15 01:30:00+02:00 1
11627 dtype: int64
11628
11629 Pass None to convert to tz-naive index and preserve local time:
11630
11631 >>> s = pd.Series([1],
11632 ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00']))
11633 >>> s.tz_localize(None)
11634 2018-09-15 01:30:00 1
11635 dtype: int64
11636
11637 Be careful with DST changes. When there is sequential data, pandas
11638 can infer the DST time:
11639
11640 >>> s = pd.Series(range(7),
11641 ... index=pd.DatetimeIndex(['2018-10-28 01:30:00',
11642 ... '2018-10-28 02:00:00',
11643 ... '2018-10-28 02:30:00',
11644 ... '2018-10-28 02:00:00',
11645 ... '2018-10-28 02:30:00',
11646 ... '2018-10-28 03:00:00',
11647 ... '2018-10-28 03:30:00']))
11648 >>> s.tz_localize('CET', ambiguous='infer')
11649 2018-10-28 01:30:00+02:00 0
11650 2018-10-28 02:00:00+02:00 1
11651 2018-10-28 02:30:00+02:00 2
11652 2018-10-28 02:00:00+01:00 3
11653 2018-10-28 02:30:00+01:00 4
11654 2018-10-28 03:00:00+01:00 5
11655 2018-10-28 03:30:00+01:00 6
11656 dtype: int64
11657
        In some cases, inferring the DST is impossible. In such cases, you
        can pass an ndarray to the `ambiguous` parameter to set the DST
        explicitly:
11660
11661 >>> s = pd.Series(range(3),
11662 ... index=pd.DatetimeIndex(['2018-10-28 01:20:00',
11663 ... '2018-10-28 02:36:00',
11664 ... '2018-10-28 03:46:00']))
11665 >>> s.tz_localize('CET', ambiguous=np.array([True, True, False]))
11666 2018-10-28 01:20:00+02:00 0
11667 2018-10-28 02:36:00+02:00 1
11668 2018-10-28 03:46:00+01:00 2
11669 dtype: int64
11670
11671 If the DST transition causes nonexistent times, you can shift these
11672 dates forward or backward with a timedelta object or `'shift_forward'`
11673 or `'shift_backward'`.
11674
11675 >>> s = pd.Series(range(2),
11676 ... index=pd.DatetimeIndex(['2015-03-29 02:30:00',
11677 ... '2015-03-29 03:30:00']))
11678 >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_forward')
11679 2015-03-29 03:00:00+02:00 0
11680 2015-03-29 03:30:00+02:00 1
11681 dtype: int64
11682 >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_backward')
11683 2015-03-29 01:59:59.999999999+01:00 0
11684 2015-03-29 03:30:00+02:00 1
11685 dtype: int64
11686 >>> s.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1h'))
11687 2015-03-29 03:30:00+02:00 0
11688 2015-03-29 03:30:00+02:00 1
11689 dtype: int64
11690 """
11691 nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward")
11692 if nonexistent not in nonexistent_options and not isinstance(
11693 nonexistent, dt.timedelta
11694 ):
11695 raise ValueError(
11696 "The nonexistent argument must be one of 'raise', "
11697 "'NaT', 'shift_forward', 'shift_backward' or "
11698 "a timedelta object"
11699 )
11700
11701 axis = self._get_axis_number(axis)
11702 ax = self._get_axis(axis)
11703
11704 def _tz_localize(ax, tz, ambiguous, nonexistent):
11705 if not hasattr(ax, "tz_localize"):
11706 if len(ax) > 0:
11707 ax_name = self._get_axis_name(axis)
11708 raise TypeError(
11709 f"{ax_name} is not a valid DatetimeIndex or PeriodIndex"
11710 )
11711 ax = DatetimeIndex([], tz=tz)
11712 else:
11713 ax = ax.tz_localize(tz, ambiguous=ambiguous, nonexistent=nonexistent)
11714 return ax
11715
11716 # if a level is given it must be a MultiIndex level or
11717 # equivalent to the axis name
11718 if isinstance(ax, MultiIndex):
11719 level = ax._get_level_number(level)
11720 new_level = _tz_localize(ax.levels[level], tz, ambiguous, nonexistent)
11721 ax = ax.set_levels(new_level, level=level)
11722 else:
11723 if level not in (None, 0, ax.name):
11724 raise ValueError(f"The level {level} is not valid")
11725 ax = _tz_localize(ax, tz, ambiguous, nonexistent)
11726
11727 result = self.copy(deep=copy and not using_copy_on_write())
11728 result = result.set_axis(ax, axis=axis, copy=False)
11729 return result.__finalize__(self, method="tz_localize")
11730
11731 # ----------------------------------------------------------------------
11732 # Numeric Methods
11733
11734 @final
11735 def describe(
11736 self,
11737 percentiles=None,
11738 include=None,
11739 exclude=None,
11740 ) -> Self:
11741 """
11742 Generate descriptive statistics.
11743
11744 Descriptive statistics include those that summarize the central
11745 tendency, dispersion and shape of a
11746 dataset's distribution, excluding ``NaN`` values.
11747
11748 Analyzes both numeric and object series, as well
11749 as ``DataFrame`` column sets of mixed data types. The output
11750 will vary depending on what is provided. Refer to the notes
11751 below for more detail.
11752
11753 Parameters
11754 ----------
11755 percentiles : list-like of numbers, optional
11756 The percentiles to include in the output. All should
11757 fall between 0 and 1. The default is
11758 ``[.25, .5, .75]``, which returns the 25th, 50th, and
11759 75th percentiles.
11760 include : 'all', list-like of dtypes or None (default), optional
11761 A white list of data types to include in the result. Ignored
11762 for ``Series``. Here are the options:
11763
11764 - 'all' : All columns of the input will be included in the output.
11765 - A list-like of dtypes : Limits the results to the
11766 provided data types.
11767 To limit the result to numeric types submit
            ``numpy.number``. To limit it instead to object columns submit
            the ``object`` data type. Strings
11770 can also be used in the style of
11771 ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
11772 select pandas categorical columns, use ``'category'``
11773 - None (default) : The result will include all numeric columns.
        exclude : list-like of dtypes or None (default), optional
11775 A black list of data types to omit from the result. Ignored
11776 for ``Series``. Here are the options:
11777
11778 - A list-like of dtypes : Excludes the provided data types
11779 from the result. To exclude numeric types submit
            ``numpy.number``. To exclude object columns submit the data
            type ``object``. Strings can also be used in the style of
11782 ``select_dtypes`` (e.g. ``df.describe(exclude=['O'])``). To
11783 exclude pandas categorical columns, use ``'category'``
11784 - None (default) : The result will exclude nothing.
11785
11786 Returns
11787 -------
11788 Series or DataFrame
            Summary statistics of the Series or DataFrame provided.
11790
11791 See Also
11792 --------
11793 DataFrame.count: Count number of non-NA/null observations.
11794 DataFrame.max: Maximum of the values in the object.
11795 DataFrame.min: Minimum of the values in the object.
11796 DataFrame.mean: Mean of the values.
11797 DataFrame.std: Standard deviation of the observations.
11798 DataFrame.select_dtypes: Subset of a DataFrame including/excluding
11799 columns based on their dtype.
11800
11801 Notes
11802 -----
11803 For numeric data, the result's index will include ``count``,
11804 ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and
11805 upper percentiles. By default the lower percentile is ``25`` and the
11806 upper percentile is ``75``. The ``50`` percentile is the
11807 same as the median.
11808
11809 For object data (e.g. strings or timestamps), the result's index
11810 will include ``count``, ``unique``, ``top``, and ``freq``. The ``top``
11811 is the most common value. The ``freq`` is the most common value's
11812 frequency. Timestamps also include the ``first`` and ``last`` items.
11813
        If multiple object values tie for the highest count, then the
        ``top`` result will be arbitrarily chosen from among those with
        the highest count.
11817
11818 For mixed data types provided via a ``DataFrame``, the default is to
11819 return only an analysis of numeric columns. If the dataframe consists
11820 only of object and categorical data without any numeric columns, the
11821 default is to return an analysis of both the object and categorical
11822 columns. If ``include='all'`` is provided as an option, the result
11823 will include a union of attributes of each type.
11824
11825 The `include` and `exclude` parameters can be used to limit
11826 which columns in a ``DataFrame`` are analyzed for the output.
11827 The parameters are ignored when analyzing a ``Series``.
11828
11829 Examples
11830 --------
11831 Describing a numeric ``Series``.
11832
11833 >>> s = pd.Series([1, 2, 3])
11834 >>> s.describe()
11835 count 3.0
11836 mean 2.0
11837 std 1.0
11838 min 1.0
11839 25% 1.5
11840 50% 2.0
11841 75% 2.5
11842 max 3.0
11843 dtype: float64
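
        Describing a numeric ``Series`` with custom percentiles (the median
        is always included; output illustrative):

        >>> s.describe(percentiles=[0.1, 0.9])  # doctest: +SKIP
        count    3.0
        mean     2.0
        std      1.0
        min      1.0
        10%      1.2
        50%      2.0
        90%      2.8
        max      3.0
        dtype: float64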
11844
11845 Describing a categorical ``Series``.
11846
11847 >>> s = pd.Series(['a', 'a', 'b', 'c'])
11848 >>> s.describe()
11849 count 4
11850 unique 3
11851 top a
11852 freq 2
11853 dtype: object
11854
11855 Describing a timestamp ``Series``.
11856
11857 >>> s = pd.Series([
11858 ... np.datetime64("2000-01-01"),
11859 ... np.datetime64("2010-01-01"),
11860 ... np.datetime64("2010-01-01")
11861 ... ])
11862 >>> s.describe()
11863 count 3
11864 mean 2006-09-01 08:00:00
11865 min 2000-01-01 00:00:00
11866 25% 2004-12-31 12:00:00
11867 50% 2010-01-01 00:00:00
11868 75% 2010-01-01 00:00:00
11869 max 2010-01-01 00:00:00
11870 dtype: object
11871
11872 Describing a ``DataFrame``. By default only numeric fields
11873 are returned.
11874
11875 >>> df = pd.DataFrame({'categorical': pd.Categorical(['d', 'e', 'f']),
11876 ... 'numeric': [1, 2, 3],
11877 ... 'object': ['a', 'b', 'c']
11878 ... })
11879 >>> df.describe()
11880 numeric
11881 count 3.0
11882 mean 2.0
11883 std 1.0
11884 min 1.0
11885 25% 1.5
11886 50% 2.0
11887 75% 2.5
11888 max 3.0
11889
11890 Describing all columns of a ``DataFrame`` regardless of data type.
11891
11892 >>> df.describe(include='all') # doctest: +SKIP
11893 categorical numeric object
11894 count 3 3.0 3
11895 unique 3 NaN 3
11896 top f NaN a
11897 freq 1 NaN 1
11898 mean NaN 2.0 NaN
11899 std NaN 1.0 NaN
11900 min NaN 1.0 NaN
11901 25% NaN 1.5 NaN
11902 50% NaN 2.0 NaN
11903 75% NaN 2.5 NaN
11904 max NaN 3.0 NaN
11905
11906 Describing a column from a ``DataFrame`` by accessing it as
11907 an attribute.
11908
11909 >>> df.numeric.describe()
11910 count 3.0
11911 mean 2.0
11912 std 1.0
11913 min 1.0
11914 25% 1.5
11915 50% 2.0
11916 75% 2.5
11917 max 3.0
11918 Name: numeric, dtype: float64
11919
11920 Including only numeric columns in a ``DataFrame`` description.
11921
11922 >>> df.describe(include=[np.number])
11923 numeric
11924 count 3.0
11925 mean 2.0
11926 std 1.0
11927 min 1.0
11928 25% 1.5
11929 50% 2.0
11930 75% 2.5
11931 max 3.0
11932
11933 Including only string columns in a ``DataFrame`` description.
11934
11935 >>> df.describe(include=[object]) # doctest: +SKIP
11936 object
11937 count 3
11938 unique 3
11939 top a
11940 freq 1
11941
11942 Including only categorical columns from a ``DataFrame`` description.
11943
11944 >>> df.describe(include=['category'])
11945 categorical
11946 count 3
11947 unique 3
11948 top d
11949 freq 1
11950
11951 Excluding numeric columns from a ``DataFrame`` description.
11952
11953 >>> df.describe(exclude=[np.number]) # doctest: +SKIP
11954 categorical object
11955 count 3 3
11956 unique 3 3
11957 top f a
11958 freq 1 1
11959
11960 Excluding object columns from a ``DataFrame`` description.
11961
11962 >>> df.describe(exclude=[object]) # doctest: +SKIP
11963 categorical numeric
11964 count 3 3.0
11965 unique 3 NaN
11966 top f NaN
11967 freq 1 NaN
11968 mean NaN 2.0
11969 std NaN 1.0
11970 min NaN 1.0
11971 25% NaN 1.5
11972 50% NaN 2.0
11973 75% NaN 2.5
11974 max NaN 3.0
11975 """
11976 return describe_ndframe(
11977 obj=self,
11978 include=include,
11979 exclude=exclude,
11980 percentiles=percentiles,
11981 ).__finalize__(self, method="describe")
11982
11983 @final
11984 def pct_change(
11985 self,
11986 periods: int = 1,
11987 fill_method: FillnaOptions | None | lib.NoDefault = lib.no_default,
11988 limit: int | None | lib.NoDefault = lib.no_default,
11989 freq=None,
11990 **kwargs,
11991 ) -> Self:
11992 """
11993 Fractional change between the current and a prior element.
11994
11995 Computes the fractional change from the immediately previous row by
11996 default. This is useful in comparing the fraction of change in a time
11997 series of elements.
11998
11999 .. note::
12000
12001 Despite the name of this method, it calculates fractional change
12002 (also known as per unit change or relative change) and not
12003 percentage change. If you need the percentage change, multiply
12004 these values by 100.
12005
12006 Parameters
12007 ----------
12008 periods : int, default 1
12009 Periods to shift for forming percent change.
12010 fill_method : {'backfill', 'bfill', 'pad', 'ffill', None}, default 'pad'
12011 How to handle NAs **before** computing percent changes.
12012
12013 .. deprecated:: 2.1
12014 All options of `fill_method` are deprecated except `fill_method=None`.
12015
12016 limit : int, default None
12017 The number of consecutive NAs to fill before stopping.
12018
12019 .. deprecated:: 2.1
12020
12021 freq : DateOffset, timedelta, or str, optional
12022 Increment to use from time series API (e.g. 'ME' or BDay()).
12023 **kwargs
12024 Additional keyword arguments are passed into
12025 `DataFrame.shift` or `Series.shift`.
12026
12027 Returns
12028 -------
12029 Series or DataFrame
12030 The same type as the calling object.
12031
12032 See Also
12033 --------
12034 Series.diff : Compute the difference of two elements in a Series.
12035 DataFrame.diff : Compute the difference of two elements in a DataFrame.
12036 Series.shift : Shift the index by some number of periods.
12037 DataFrame.shift : Shift the index by some number of periods.
12038
12039 Examples
12040 --------
12041 **Series**
12042
12043 >>> s = pd.Series([90, 91, 85])
12044 >>> s
12045 0 90
12046 1 91
12047 2 85
12048 dtype: int64
12049
12050 >>> s.pct_change()
12051 0 NaN
12052 1 0.011111
12053 2 -0.065934
12054 dtype: float64
12055
12056 >>> s.pct_change(periods=2)
12057 0 NaN
12058 1 NaN
12059 2 -0.055556
12060 dtype: float64
12061
        Compute the percentage change in a Series after filling NAs with the
        last valid observation (forward fill).
12064
12065 >>> s = pd.Series([90, 91, None, 85])
12066 >>> s
12067 0 90.0
12068 1 91.0
12069 2 NaN
12070 3 85.0
12071 dtype: float64
12072
12073 >>> s.ffill().pct_change()
12074 0 NaN
12075 1 0.011111
12076 2 0.000000
12077 3 -0.065934
12078 dtype: float64
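
        With ``fill_method=None``, NAs are left in place and propagate into
        the result (output illustrative):

        >>> s.pct_change(fill_method=None)  # doctest: +SKIP
        0         NaN
        1    0.011111
        2         NaN
        3         NaN
        dtype: float64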
12079
12080 **DataFrame**
12081
12082 Percentage change in French franc, Deutsche Mark, and Italian lira from
12083 1980-01-01 to 1980-03-01.
12084
12085 >>> df = pd.DataFrame({
12086 ... 'FR': [4.0405, 4.0963, 4.3149],
12087 ... 'GR': [1.7246, 1.7482, 1.8519],
12088 ... 'IT': [804.74, 810.01, 860.13]},
12089 ... index=['1980-01-01', '1980-02-01', '1980-03-01'])
12090 >>> df
12091 FR GR IT
12092 1980-01-01 4.0405 1.7246 804.74
12093 1980-02-01 4.0963 1.7482 810.01
12094 1980-03-01 4.3149 1.8519 860.13
12095
12096 >>> df.pct_change()
12097 FR GR IT
12098 1980-01-01 NaN NaN NaN
12099 1980-02-01 0.013810 0.013684 0.006549
12100 1980-03-01 0.053365 0.059318 0.061876
12101
        Percentage change in GOOG and APPL stock volume. This shows how to
        compute the percentage change between columns.
12104
12105 >>> df = pd.DataFrame({
12106 ... '2016': [1769950, 30586265],
12107 ... '2015': [1500923, 40912316],
12108 ... '2014': [1371819, 41403351]},
12109 ... index=['GOOG', 'APPL'])
12110 >>> df
12111 2016 2015 2014
12112 GOOG 1769950 1500923 1371819
12113 APPL 30586265 40912316 41403351
12114
12115 >>> df.pct_change(axis='columns', periods=-1)
12116 2016 2015 2014
12117 GOOG 0.179241 0.094112 NaN
12118 APPL -0.252395 -0.011860 NaN
12119 """
12120 # GH#53491
12121 if fill_method not in (lib.no_default, None) or limit is not lib.no_default:
12122 warnings.warn(
12123 "The 'fill_method' keyword being not None and the 'limit' keyword in "
12124 f"{type(self).__name__}.pct_change are deprecated and will be removed "
12125 "in a future version. Either fill in any non-leading NA values prior "
12126 "to calling pct_change or specify 'fill_method=None' to not fill NA "
12127 "values.",
12128 FutureWarning,
12129 stacklevel=find_stack_level(),
12130 )
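        # emulate the deprecated defaults (pad fill, no limit), warning only
        # when a fillable (non-leading) NA is actually present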
12131 if fill_method is lib.no_default:
12132 if limit is lib.no_default:
12133 cols = self.items() if self.ndim == 2 else [(None, self)]
12134 for _, col in cols:
12135 if len(col) > 0:
12136 mask = col.isna().values
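                        # drop leading NAs: only NAs after the first valid
                        # value would actually be filled by the default 'pad'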
12137 mask = mask[np.argmax(~mask) :]
12138 if mask.any():
12139 warnings.warn(
12140 "The default fill_method='pad' in "
12141 f"{type(self).__name__}.pct_change is deprecated and "
12142 "will be removed in a future version. Either fill in "
12143 "any non-leading NA values prior to calling pct_change "
12144 "or specify 'fill_method=None' to not fill NA values.",
12145 FutureWarning,
12146 stacklevel=find_stack_level(),
12147 )
12148 break
12149 fill_method = "pad"
12150 if limit is lib.no_default:
12151 limit = None
12152
12153 axis = self._get_axis_number(kwargs.pop("axis", "index"))
12154 if fill_method is None:
12155 data = self
12156 else:
12157 data = self._pad_or_backfill(fill_method, axis=axis, limit=limit)
12158
12159 shifted = data.shift(periods=periods, freq=freq, axis=axis, **kwargs)
12160 # Unsupported left operand type for / ("Self")
12161 rs = data / shifted - 1 # type: ignore[operator]
12162 if freq is not None:
12163 # Shift method is implemented differently when freq is not None
12164 # We want to restore the original index
12165 rs = rs.loc[~rs.index.duplicated()]
12166 rs = rs.reindex_like(data)
12167 return rs.__finalize__(self, method="pct_change")
12168
12169 @final
12170 def _logical_func(
12171 self,
12172 name: str,
12173 func,
12174 axis: Axis | None = 0,
12175 bool_only: bool_t = False,
12176 skipna: bool_t = True,
12177 **kwargs,
12178 ) -> Series | bool_t:
12179 nv.validate_logical_func((), kwargs, fname=name)
12180 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
12181
12182 if self.ndim > 1 and axis is None:
12183 # Reduce along one dimension then the other, to simplify DataFrame._reduce
12184 res = self._logical_func(
12185 name, func, axis=0, bool_only=bool_only, skipna=skipna, **kwargs
12186 )
12187 # error: Item "bool" of "Series | bool" has no attribute "_logical_func"
12188 return res._logical_func( # type: ignore[union-attr]
12189 name, func, skipna=skipna, **kwargs
12190 )
12191 elif axis is None:
12192 axis = 0
12193
12194 if (
12195 self.ndim > 1
12196 and axis == 1
12197 and len(self._mgr.arrays) > 1
12198 # TODO(EA2D): special-case not needed
12199 and all(x.ndim == 2 for x in self._mgr.arrays)
12200 and not kwargs
12201 ):
12202 # Fastpath avoiding potentially expensive transpose
12203 obj = self
12204 if bool_only:
12205 obj = self._get_bool_data()
12206 return obj._reduce_axis1(name, func, skipna=skipna)
12207
12208 return self._reduce(
12209 func,
12210 name=name,
12211 axis=axis,
12212 skipna=skipna,
12213 numeric_only=bool_only,
12214 filter_type="bool",
12215 )
12216
12217 def any(
12218 self,
12219 axis: Axis | None = 0,
12220 bool_only: bool_t = False,
12221 skipna: bool_t = True,
12222 **kwargs,
12223 ) -> Series | bool_t:
12224 return self._logical_func(
12225 "any", nanops.nanany, axis, bool_only, skipna, **kwargs
12226 )
12227
12228 def all(
12229 self,
12230 axis: Axis = 0,
12231 bool_only: bool_t = False,
12232 skipna: bool_t = True,
12233 **kwargs,
12234 ) -> Series | bool_t:
12235 return self._logical_func(
12236 "all", nanops.nanall, axis, bool_only, skipna, **kwargs
12237 )
12238
12239 @final
12240 def _accum_func(
12241 self,
12242 name: str,
12243 func,
12244 axis: Axis | None = None,
12245 skipna: bool_t = True,
12246 *args,
12247 **kwargs,
12248 ):
12249 skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name)
12250 if axis is None:
12251 axis = 0
12252 else:
12253 axis = self._get_axis_number(axis)
12254
12255 if axis == 1:
12256 return self.T._accum_func(
12257 name, func, axis=0, skipna=skipna, *args, **kwargs # noqa: B026
12258 ).T
12259
12260 def block_accum_func(blk_values):
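            # 2D blocks store values transposed relative to the frame, so
            # flip them, accumulate along the frame's rows, and flip back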
12261 values = blk_values.T if hasattr(blk_values, "T") else blk_values
12262
12263 result: np.ndarray | ExtensionArray
12264 if isinstance(values, ExtensionArray):
12265 result = values._accumulate(name, skipna=skipna, **kwargs)
12266 else:
12267 result = nanops.na_accum_func(values, func, skipna=skipna)
12268
12269 result = result.T if hasattr(result, "T") else result
12270 return result
12271
12272 result = self._mgr.apply(block_accum_func)
12273
12274 return self._constructor_from_mgr(result, axes=result.axes).__finalize__(
12275 self, method=name
12276 )
12277
12278 def cummax(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
12279 return self._accum_func(
12280 "cummax", np.maximum.accumulate, axis, skipna, *args, **kwargs
12281 )
12282
12283 def cummin(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
12284 return self._accum_func(
12285 "cummin", np.minimum.accumulate, axis, skipna, *args, **kwargs
12286 )
12287
12288 def cumsum(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
12289 return self._accum_func("cumsum", np.cumsum, axis, skipna, *args, **kwargs)
12290
12291 def cumprod(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
12292 return self._accum_func("cumprod", np.cumprod, axis, skipna, *args, **kwargs)
12293
12294 @final
12295 def _stat_function_ddof(
12296 self,
12297 name: str,
12298 func,
12299 axis: Axis | None | lib.NoDefault = lib.no_default,
12300 skipna: bool_t = True,
12301 ddof: int = 1,
12302 numeric_only: bool_t = False,
12303 **kwargs,
12304 ) -> Series | float:
12305 nv.validate_stat_ddof_func((), kwargs, fname=name)
12306 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
12307
12308 if axis is None:
12309 if self.ndim > 1:
12310 warnings.warn(
12311 f"The behavior of {type(self).__name__}.{name} with axis=None "
12312 "is deprecated, in a future version this will reduce over both "
12313 "axes and return a scalar. To retain the old behavior, pass "
12314 "axis=0 (or do not pass axis)",
12315 FutureWarning,
12316 stacklevel=find_stack_level(),
12317 )
12318 axis = 0
12319 elif axis is lib.no_default:
12320 axis = 0
12321
12322 return self._reduce(
12323 func, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof
12324 )
12325
12326 def sem(
12327 self,
12328 axis: Axis | None = 0,
12329 skipna: bool_t = True,
12330 ddof: int = 1,
12331 numeric_only: bool_t = False,
12332 **kwargs,
12333 ) -> Series | float:
12334 return self._stat_function_ddof(
12335 "sem", nanops.nansem, axis, skipna, ddof, numeric_only, **kwargs
12336 )
12337
12338 def var(
12339 self,
12340 axis: Axis | None = 0,
12341 skipna: bool_t = True,
12342 ddof: int = 1,
12343 numeric_only: bool_t = False,
12344 **kwargs,
12345 ) -> Series | float:
12346 return self._stat_function_ddof(
12347 "var", nanops.nanvar, axis, skipna, ddof, numeric_only, **kwargs
12348 )
12349
12350 def std(
12351 self,
12352 axis: Axis | None = 0,
12353 skipna: bool_t = True,
12354 ddof: int = 1,
12355 numeric_only: bool_t = False,
12356 **kwargs,
12357 ) -> Series | float:
12358 return self._stat_function_ddof(
12359 "std", nanops.nanstd, axis, skipna, ddof, numeric_only, **kwargs
12360 )
12361
12362 @final
12363 def _stat_function(
12364 self,
12365 name: str,
12366 func,
12367 axis: Axis | None = 0,
12368 skipna: bool_t = True,
12369 numeric_only: bool_t = False,
12370 **kwargs,
12371 ):
12372 assert name in ["median", "mean", "min", "max", "kurt", "skew"], name
12373 nv.validate_func(name, (), kwargs)
12374
12375 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
12376
12377 return self._reduce(
12378 func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
12379 )
12380
12381 def min(
12382 self,
12383 axis: Axis | None = 0,
12384 skipna: bool_t = True,
12385 numeric_only: bool_t = False,
12386 **kwargs,
12387 ):
12388 return self._stat_function(
12389 "min",
12390 nanops.nanmin,
12391 axis,
12392 skipna,
12393 numeric_only,
12394 **kwargs,
12395 )
12396
12397 def max(
12398 self,
12399 axis: Axis | None = 0,
12400 skipna: bool_t = True,
12401 numeric_only: bool_t = False,
12402 **kwargs,
12403 ):
12404 return self._stat_function(
12405 "max",
12406 nanops.nanmax,
12407 axis,
12408 skipna,
12409 numeric_only,
12410 **kwargs,
12411 )
12412
12413 def mean(
12414 self,
12415 axis: Axis | None = 0,
12416 skipna: bool_t = True,
12417 numeric_only: bool_t = False,
12418 **kwargs,
12419 ) -> Series | float:
12420 return self._stat_function(
12421 "mean", nanops.nanmean, axis, skipna, numeric_only, **kwargs
12422 )
12423
12424 def median(
12425 self,
12426 axis: Axis | None = 0,
12427 skipna: bool_t = True,
12428 numeric_only: bool_t = False,
12429 **kwargs,
12430 ) -> Series | float:
12431 return self._stat_function(
12432 "median", nanops.nanmedian, axis, skipna, numeric_only, **kwargs
12433 )
12434
12435 def skew(
12436 self,
12437 axis: Axis | None = 0,
12438 skipna: bool_t = True,
12439 numeric_only: bool_t = False,
12440 **kwargs,
12441 ) -> Series | float:
12442 return self._stat_function(
12443 "skew", nanops.nanskew, axis, skipna, numeric_only, **kwargs
12444 )
12445
12446 def kurt(
12447 self,
12448 axis: Axis | None = 0,
12449 skipna: bool_t = True,
12450 numeric_only: bool_t = False,
12451 **kwargs,
12452 ) -> Series | float:
12453 return self._stat_function(
12454 "kurt", nanops.nankurt, axis, skipna, numeric_only, **kwargs
12455 )
12456
12457 kurtosis = kurt
12458
12459 @final
12460 def _min_count_stat_function(
12461 self,
12462 name: str,
12463 func,
12464 axis: Axis | None | lib.NoDefault = lib.no_default,
12465 skipna: bool_t = True,
12466 numeric_only: bool_t = False,
12467 min_count: int = 0,
12468 **kwargs,
12469 ):
12470 assert name in ["sum", "prod"], name
12471 nv.validate_func(name, (), kwargs)
12472
12473 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
12474
12475 if axis is None:
12476 if self.ndim > 1:
12477 warnings.warn(
12478 f"The behavior of {type(self).__name__}.{name} with axis=None "
12479 "is deprecated, in a future version this will reduce over both "
12480 "axes and return a scalar. To retain the old behavior, pass "
12481 "axis=0 (or do not pass axis)",
12482 FutureWarning,
12483 stacklevel=find_stack_level(),
12484 )
12485 axis = 0
12486 elif axis is lib.no_default:
12487 axis = 0
12488
12489 return self._reduce(
12490 func,
12491 name=name,
12492 axis=axis,
12493 skipna=skipna,
12494 numeric_only=numeric_only,
12495 min_count=min_count,
12496 )
12497
12498 def sum(
12499 self,
12500 axis: Axis | None = 0,
12501 skipna: bool_t = True,
12502 numeric_only: bool_t = False,
12503 min_count: int = 0,
12504 **kwargs,
12505 ):
12506 return self._min_count_stat_function(
12507 "sum", nanops.nansum, axis, skipna, numeric_only, min_count, **kwargs
12508 )
12509
12510 def prod(
12511 self,
12512 axis: Axis | None = 0,
12513 skipna: bool_t = True,
12514 numeric_only: bool_t = False,
12515 min_count: int = 0,
12516 **kwargs,
12517 ):
12518 return self._min_count_stat_function(
12519 "prod",
12520 nanops.nanprod,
12521 axis,
12522 skipna,
12523 numeric_only,
12524 min_count,
12525 **kwargs,
12526 )
12527
12528 product = prod
12529
12530 @final
12531 @doc(Rolling)
12532 def rolling(
12533 self,
12534 window: int | dt.timedelta | str | BaseOffset | BaseIndexer,
12535 min_periods: int | None = None,
12536 center: bool_t = False,
12537 win_type: str | None = None,
12538 on: str | None = None,
12539 axis: Axis | lib.NoDefault = lib.no_default,
12540 closed: IntervalClosedType | None = None,
12541 step: int | None = None,
12542 method: str = "single",
12543 ) -> Window | Rolling:
12544 if axis is not lib.no_default:
12545 axis = self._get_axis_number(axis)
12546 name = "rolling"
12547 if axis == 1:
12548 warnings.warn(
12549 f"Support for axis=1 in {type(self).__name__}.{name} is "
12550 "deprecated and will be removed in a future version. "
12551 f"Use obj.T.{name}(...) instead",
12552 FutureWarning,
12553 stacklevel=find_stack_level(),
12554 )
12555 else:
12556 warnings.warn(
12557 f"The 'axis' keyword in {type(self).__name__}.{name} is "
12558 "deprecated and will be removed in a future version. "
12559 "Call the method without the axis keyword instead.",
12560 FutureWarning,
12561 stacklevel=find_stack_level(),
12562 )
12563 else:
12564 axis = 0
12565
12566 if win_type is not None:
12567 return Window(
12568 self,
12569 window=window,
12570 min_periods=min_periods,
12571 center=center,
12572 win_type=win_type,
12573 on=on,
12574 axis=axis,
12575 closed=closed,
12576 step=step,
12577 method=method,
12578 )
12579
12580 return Rolling(
12581 self,
12582 window=window,
12583 min_periods=min_periods,
12584 center=center,
12585 win_type=win_type,
12586 on=on,
12587 axis=axis,
12588 closed=closed,
12589 step=step,
12590 method=method,
12591 )
12592
12593 @final
12594 @doc(Expanding)
12595 def expanding(
12596 self,
12597 min_periods: int = 1,
12598 axis: Axis | lib.NoDefault = lib.no_default,
12599 method: Literal["single", "table"] = "single",
12600 ) -> Expanding:
12601 if axis is not lib.no_default:
12602 axis = self._get_axis_number(axis)
12603 name = "expanding"
12604 if axis == 1:
12605 warnings.warn(
12606 f"Support for axis=1 in {type(self).__name__}.{name} is "
12607 "deprecated and will be removed in a future version. "
12608 f"Use obj.T.{name}(...) instead",
12609 FutureWarning,
12610 stacklevel=find_stack_level(),
12611 )
12612 else:
12613 warnings.warn(
12614 f"The 'axis' keyword in {type(self).__name__}.{name} is "
12615 "deprecated and will be removed in a future version. "
12616 "Call the method without the axis keyword instead.",
12617 FutureWarning,
12618 stacklevel=find_stack_level(),
12619 )
12620 else:
12621 axis = 0
12622 return Expanding(self, min_periods=min_periods, axis=axis, method=method)
12623
12624 @final
12625 @doc(ExponentialMovingWindow)
12626 def ewm(
12627 self,
12628 com: float | None = None,
12629 span: float | None = None,
12630 halflife: float | TimedeltaConvertibleTypes | None = None,
12631 alpha: float | None = None,
12632 min_periods: int | None = 0,
12633 adjust: bool_t = True,
12634 ignore_na: bool_t = False,
12635 axis: Axis | lib.NoDefault = lib.no_default,
12636 times: np.ndarray | DataFrame | Series | None = None,
12637 method: Literal["single", "table"] = "single",
12638 ) -> ExponentialMovingWindow:
12639 if axis is not lib.no_default:
12640 axis = self._get_axis_number(axis)
12641 name = "ewm"
12642 if axis == 1:
12643 warnings.warn(
12644 f"Support for axis=1 in {type(self).__name__}.{name} is "
12645 "deprecated and will be removed in a future version. "
12646 f"Use obj.T.{name}(...) instead",
12647 FutureWarning,
12648 stacklevel=find_stack_level(),
12649 )
12650 else:
12651 warnings.warn(
12652 f"The 'axis' keyword in {type(self).__name__}.{name} is "
12653 "deprecated and will be removed in a future version. "
12654 "Call the method without the axis keyword instead.",
12655 FutureWarning,
12656 stacklevel=find_stack_level(),
12657 )
12658 else:
12659 axis = 0
12660
12661 return ExponentialMovingWindow(
12662 self,
12663 com=com,
12664 span=span,
12665 halflife=halflife,
12666 alpha=alpha,
12667 min_periods=min_periods,
12668 adjust=adjust,
12669 ignore_na=ignore_na,
12670 axis=axis,
12671 times=times,
12672 method=method,
12673 )
12674
12675 # ----------------------------------------------------------------------
12676 # Arithmetic Methods
12677
12678 @final
12679 def _inplace_method(self, other, op) -> Self:
12680 """
12681 Wrap arithmetic method to operate inplace.
12682 """
12683 warn = True
12684 if not PYPY and warn_copy_on_write():
12685 if sys.getrefcount(self) <= REF_COUNT + 2:
12686 # we are probably in an inplace setitem context (e.g. df['a'] += 1)
12687 warn = False
12688
12689 result = op(self, other)
12690
12691 if (
12692 self.ndim == 1
12693 and result._indexed_same(self)
12694 and result.dtype == self.dtype
12695 and not using_copy_on_write()
12696 and not (warn_copy_on_write() and not warn)
12697 ):
12698 # GH#36498 this inplace op can _actually_ be inplace.
12699 # Item "ArrayManager" of "Union[ArrayManager, SingleArrayManager,
12700 # BlockManager, SingleBlockManager]" has no attribute "setitem_inplace"
12701 self._mgr.setitem_inplace( # type: ignore[union-attr]
12702 slice(None), result._values, warn=warn
12703 )
12704 return self
12705
12706 # Delete cacher
12707 self._reset_cacher()
12708
12709 # this makes sure that we are aligned like the input
12710 # we are updating inplace so we want to ignore is_copy
12711 self._update_inplace(
12712 result.reindex_like(self, copy=False), verify_is_copy=False
12713 )
12714 return self
12715
12716 @final
12717 def __iadd__(self, other) -> Self:
12718 # error: Unsupported left operand type for + ("Type[NDFrame]")
12719 return self._inplace_method(other, type(self).__add__) # type: ignore[operator]
12720
12721 @final
12722 def __isub__(self, other) -> Self:
12723 # error: Unsupported left operand type for - ("Type[NDFrame]")
12724 return self._inplace_method(other, type(self).__sub__) # type: ignore[operator]
12725
12726 @final
12727 def __imul__(self, other) -> Self:
12728 # error: Unsupported left operand type for * ("Type[NDFrame]")
12729 return self._inplace_method(other, type(self).__mul__) # type: ignore[operator]
12730
12731 @final
12732 def __itruediv__(self, other) -> Self:
12733 # error: Unsupported left operand type for / ("Type[NDFrame]")
12734 return self._inplace_method(
12735 other, type(self).__truediv__ # type: ignore[operator]
12736 )
12737
12738 @final
12739 def __ifloordiv__(self, other) -> Self:
12740 # error: Unsupported left operand type for // ("Type[NDFrame]")
12741 return self._inplace_method(
12742 other, type(self).__floordiv__ # type: ignore[operator]
12743 )
12744
12745 @final
12746 def __imod__(self, other) -> Self:
12747 # error: Unsupported left operand type for % ("Type[NDFrame]")
12748 return self._inplace_method(other, type(self).__mod__) # type: ignore[operator]
12749
12750 @final
12751 def __ipow__(self, other) -> Self:
12752 # error: Unsupported left operand type for ** ("Type[NDFrame]")
12753 return self._inplace_method(other, type(self).__pow__) # type: ignore[operator]
12754
12755 @final
12756 def __iand__(self, other) -> Self:
12757 # error: Unsupported left operand type for & ("Type[NDFrame]")
12758 return self._inplace_method(other, type(self).__and__) # type: ignore[operator]
12759
12760 @final
12761 def __ior__(self, other) -> Self:
12762 return self._inplace_method(other, type(self).__or__)
12763
12764 @final
12765 def __ixor__(self, other) -> Self:
12766 # error: Unsupported left operand type for ^ ("Type[NDFrame]")
12767 return self._inplace_method(other, type(self).__xor__) # type: ignore[operator]
12768
12769 # ----------------------------------------------------------------------
12770 # Misc methods
12771
12772 @final
12773 def _find_valid_index(self, *, how: str) -> Hashable | None:
12774 """
12775 Retrieves the index of the first valid value.
12776
12777 Parameters
12778 ----------
12779 how : {'first', 'last'}
12780 Use this parameter to change between the first or last valid index.
12781
12782 Returns
12783 -------
12784 idx_first_valid : type of index
12785 """
12786 is_valid = self.notna().values
12787 idxpos = find_valid_index(how=how, is_valid=is_valid)
12788 if idxpos is None:
12789 return None
12790 return self.index[idxpos]
12791
12792 @final
12793 @doc(position="first", klass=_shared_doc_kwargs["klass"])
12794 def first_valid_index(self) -> Hashable | None:
12795 """
12796 Return index for {position} non-NA value or None, if no non-NA value is found.
12797
12798 Returns
12799 -------
12800 type of index
12801
12802 Examples
12803 --------
12804 For Series:
12805
12806 >>> s = pd.Series([None, 3, 4])
12807 >>> s.first_valid_index()
12808 1
12809 >>> s.last_valid_index()
12810 2
12811
12812 >>> s = pd.Series([None, None])
12813 >>> print(s.first_valid_index())
12814 None
12815 >>> print(s.last_valid_index())
12816 None
12817
12818 If all elements in Series are NA/null, returns None.
12819
12820 >>> s = pd.Series()
12821 >>> print(s.first_valid_index())
12822 None
12823 >>> print(s.last_valid_index())
12824 None
12825
12826 If Series is empty, returns None.
12827
12828 For DataFrame:
12829
12830 >>> df = pd.DataFrame({{'A': [None, None, 2], 'B': [None, 3, 4]}})
12831 >>> df
12832 A B
12833 0 NaN NaN
12834 1 NaN 3.0
12835 2 2.0 4.0
12836 >>> df.first_valid_index()
12837 1
12838 >>> df.last_valid_index()
12839 2
12840
12841 >>> df = pd.DataFrame({{'A': [None, None, None], 'B': [None, None, None]}})
12842 >>> df
12843 A B
12844 0 None None
12845 1 None None
12846 2 None None
12847 >>> print(df.first_valid_index())
12848 None
12849 >>> print(df.last_valid_index())
12850 None
12851
12852 If all elements in DataFrame are NA/null, returns None.
12853
12854 >>> df = pd.DataFrame()
12855 >>> df
12856 Empty DataFrame
12857 Columns: []
12858 Index: []
12859 >>> print(df.first_valid_index())
12860 None
12861 >>> print(df.last_valid_index())
12862 None
12863
12864 If DataFrame is empty, returns None.
12865 """
12866 return self._find_valid_index(how="first")
12867
12868 @final
12869 @doc(first_valid_index, position="last", klass=_shared_doc_kwargs["klass"])
12870 def last_valid_index(self) -> Hashable | None:
12871 return self._find_valid_index(how="last")
12872
12873
12874_num_doc = """
12875{desc}
12876
12877Parameters
12878----------
12879axis : {axis_descr}
12880 Axis for the function to be applied on.
12881 For `Series` this parameter is unused and defaults to 0.
12882
12883 For DataFrames, specifying ``axis=None`` will apply the aggregation
12884 across both axes.
12885
12886 .. versionadded:: 2.0.0
12887
12888skipna : bool, default True
12889 Exclude NA/null values when computing the result.
12890numeric_only : bool, default False
12891 Include only float, int, boolean columns. Not implemented for Series.
12892
12893{min_count}\
12894**kwargs
12895 Additional keyword arguments to be passed to the function.
12896
12897Returns
12898-------
12899{name1} or scalar\
12900{see_also}\
12901{examples}
12902"""
12903
12904_sum_prod_doc = """
12905{desc}
12906
12907Parameters
12908----------
12909axis : {axis_descr}
12910 Axis for the function to be applied on.
12911 For `Series` this parameter is unused and defaults to 0.
12912
12913 .. warning::
12914
        The behavior of DataFrame.{name} with ``axis=None`` is deprecated;
        in a future version this will reduce over both axes and return a scalar.
        To retain the old behavior, pass axis=0 (or do not pass axis).
12918
12919 .. versionadded:: 2.0.0
12920
12921skipna : bool, default True
12922 Exclude NA/null values when computing the result.
12923numeric_only : bool, default False
12924 Include only float, int, boolean columns. Not implemented for Series.
12925
12926{min_count}\
12927**kwargs
12928 Additional keyword arguments to be passed to the function.
12929
12930Returns
12931-------
12932{name1} or scalar\
12933{see_also}\
12934{examples}
12935"""
12936
12937_num_ddof_doc = """
12938{desc}
12939
12940Parameters
12941----------
12942axis : {axis_descr}
12943 For `Series` this parameter is unused and defaults to 0.
12944
12945 .. warning::
12946
        The behavior of DataFrame.{name} with ``axis=None`` is deprecated;
        in a future version this will reduce over both axes and return a scalar.
        To retain the old behavior, pass axis=0 (or do not pass axis).
12950
12951skipna : bool, default True
12952 Exclude NA/null values. If an entire row/column is NA, the result
12953 will be NA.
12954ddof : int, default 1
12955 Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
12956 where N represents the number of elements.
12957numeric_only : bool, default False
12958 Include only float, int, boolean columns. Not implemented for Series.
12959
12960Returns
12961-------
12962{name1} or {name2} (if level specified) \
12963{notes}\
12964{examples}
12965"""
12966
12967_std_notes = """
12968
12969Notes
12970-----
To have the same behavior as `numpy.std`, use `ddof=0` (instead of the
default `ddof=1`)."""
12973
12974_std_examples = """
12975
12976Examples
12977--------
12978>>> df = pd.DataFrame({'person_id': [0, 1, 2, 3],
12979... 'age': [21, 25, 62, 43],
12980... 'height': [1.61, 1.87, 1.49, 2.01]}
12981... ).set_index('person_id')
12982>>> df
12983 age height
12984person_id
129850 21 1.61
129861 25 1.87
129872 62 1.49
129883 43 2.01
12989
12990The standard deviation of the columns can be found as follows:
12991
12992>>> df.std()
12993age 18.786076
12994height 0.237417
12995dtype: float64
12996
12997Alternatively, `ddof=0` can be set to normalize by N instead of N-1:
12998
12999>>> df.std(ddof=0)
13000age 16.269219
13001height 0.205609
13002dtype: float64"""
13003
13004_var_examples = """
13005
13006Examples
13007--------
13008>>> df = pd.DataFrame({'person_id': [0, 1, 2, 3],
13009... 'age': [21, 25, 62, 43],
13010... 'height': [1.61, 1.87, 1.49, 2.01]}
13011... ).set_index('person_id')
13012>>> df
13013 age height
13014person_id
130150 21 1.61
130161 25 1.87
130172 62 1.49
130183 43 2.01
13019
13020>>> df.var()
13021age 352.916667
13022height 0.056367
13023dtype: float64
13024
13025Alternatively, ``ddof=0`` can be set to normalize by N instead of N-1:
13026
13027>>> df.var(ddof=0)
13028age 264.687500
13029height 0.042275
13030dtype: float64"""
13031
13032_bool_doc = """
13033{desc}
13034
13035Parameters
13036----------
13037axis : {{0 or 'index', 1 or 'columns', None}}, default 0
13038 Indicate which axis or axes should be reduced. For `Series` this parameter
13039 is unused and defaults to 0.
13040
13041 * 0 / 'index' : reduce the index, return a Series whose index is the
13042 original column labels.
13043 * 1 / 'columns' : reduce the columns, return a Series whose index is the
13044 original index.
13045 * None : reduce all axes, return a scalar.
13046
13047bool_only : bool, default False
13048 Include only boolean columns. Not implemented for Series.
13049skipna : bool, default True
13050 Exclude NA/null values. If the entire row/column is NA and skipna is
13051 True, then the result will be {empty_value}, as for an empty row/column.
    If skipna is False, then NA values are treated as True, because they
    are not equal to zero.
13054**kwargs : any, default None
13055 Additional keywords have no effect but might be accepted for
13056 compatibility with NumPy.
13057
13058Returns
13059-------
13060{name1} or {name2}
    If level is specified, then {name2} is returned; otherwise, {name1}
    is returned.
13063
13064{see_also}
13065{examples}"""
13066
13067_all_desc = """\
13068Return whether all elements are True, potentially over an axis.
13069
Returns True unless there is at least one element within a Series or
along a DataFrame axis that is False or equivalent (e.g. zero or
empty)."""
13073
13074_all_examples = """\
13075Examples
13076--------
13077**Series**
13078
13079>>> pd.Series([True, True]).all()
13080True
13081>>> pd.Series([True, False]).all()
13082False
13083>>> pd.Series([], dtype="float64").all()
13084True
13085>>> pd.Series([np.nan]).all()
13086True
13087>>> pd.Series([np.nan]).all(skipna=False)
13088True
13089
13090**DataFrames**
13091
13092Create a dataframe from a dictionary.
13093
13094>>> df = pd.DataFrame({'col1': [True, True], 'col2': [True, False]})
13095>>> df
13096 col1 col2
130970 True True
130981 True False
13099
Default behavior checks if values in each column all return True.
13101
13102>>> df.all()
13103col1 True
13104col2 False
13105dtype: bool
13106
13107Specify ``axis='columns'`` to check if values in each row all return True.
13108
13109>>> df.all(axis='columns')
131100 True
131111 False
13112dtype: bool
13113
13114Or ``axis=None`` for whether every value is True.
13115
13116>>> df.all(axis=None)
13117False
13118"""

_all_see_also = """\
See Also
--------
Series.all : Return True if all elements are True.
DataFrame.any : Return True if one (or more) elements are True.
"""

_cnum_doc = """
Return cumulative {desc} over a DataFrame or Series axis.

Returns a DataFrame or Series of the same size containing the cumulative
{desc}.

Parameters
----------
axis : {{0 or 'index', 1 or 'columns'}}, default 0
    The index or the name of the axis. 0 is equivalent to None or 'index'.
    For `Series` this parameter is unused and defaults to 0.
skipna : bool, default True
    Exclude NA/null values. If an entire row/column is NA, the result
    will be NA.
*args, **kwargs
    Additional keywords have no effect but might be accepted for
    compatibility with NumPy.

Returns
-------
{name1} or {name2}
    Return cumulative {desc} of {name1} or {name2}.

See Also
--------
core.window.expanding.Expanding.{accum_func_name} : Similar functionality
    but ignores ``NaN`` values.
{name2}.{accum_func_name} : Return the {desc} over
    {name2} axis.
{name2}.cummax : Return cumulative maximum over {name2} axis.
{name2}.cummin : Return cumulative minimum over {name2} axis.
{name2}.cumsum : Return cumulative sum over {name2} axis.
{name2}.cumprod : Return cumulative product over {name2} axis.

{examples}"""
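
# Hedged illustration (comment only, mirroring how ``make_doc`` below
# consumes this template): each cumulative method substitutes its own
# ``desc`` and ``accum_func_name``, e.g.
#
#     >>> doc = make_doc("cumsum", ndim=2)
#     >>> "Return cumulative sum over a DataFrame or Series axis." in doc
#     True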

_cummin_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cummin()
0    2.0
1    NaN
2    2.0
3   -1.0
4   -1.0
dtype: float64

To include NA values in the operation, use ``skipna=False``:

>>> s.cummin(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the minimum
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cummin()
     A    B
0  2.0  1.0
1  2.0  NaN
2  1.0  0.0

To iterate over columns and find the minimum in each row,
use ``axis=1``:

>>> df.cummin(axis=1)
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0
"""

_cumsum_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cumsum()
0    2.0
1    NaN
2    7.0
3    6.0
4    6.0
dtype: float64

To include NA values in the operation, use ``skipna=False``:

>>> s.cumsum(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the sum
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cumsum()
     A    B
0  2.0  1.0
1  5.0  NaN
2  6.0  1.0

To iterate over columns and find the sum in each row,
use ``axis=1``:

>>> df.cumsum(axis=1)
     A    B
0  2.0  3.0
1  3.0  NaN
2  1.0  1.0
"""

_cumprod_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cumprod()
0     2.0
1     NaN
2    10.0
3   -10.0
4    -0.0
dtype: float64

To include NA values in the operation, use ``skipna=False``:

>>> s.cumprod(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the product
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cumprod()
     A    B
0  2.0  1.0
1  6.0  NaN
2  6.0  0.0

To iterate over columns and find the product in each row,
use ``axis=1``:

>>> df.cumprod(axis=1)
     A    B
0  2.0  2.0
1  3.0  NaN
2  1.0  0.0
"""

_cummax_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cummax()
0    2.0
1    NaN
2    5.0
3    5.0
4    5.0
dtype: float64

To include NA values in the operation, use ``skipna=False``:

>>> s.cummax(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the maximum
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cummax()
     A    B
0  2.0  1.0
1  3.0  NaN
2  3.0  1.0

To iterate over columns and find the maximum in each row,
use ``axis=1``:

>>> df.cummax(axis=1)
     A    B
0  2.0  2.0
1  3.0  NaN
2  1.0  1.0
"""

_any_see_also = """\
See Also
--------
numpy.any : NumPy version of this method.
Series.any : Return whether any element is True.
Series.all : Return whether all elements are True.
DataFrame.any : Return whether any element is True over requested axis.
DataFrame.all : Return whether all elements are True over requested axis.
"""

_any_desc = """\
Return whether any element is True, potentially over an axis.

Returns False unless there is at least one element within a series or
along a DataFrame axis that is True or equivalent (e.g. non-zero or
non-empty)."""

_any_examples = """\
Examples
--------
**Series**

For Series input, the output is a scalar indicating whether any element
is True.

>>> pd.Series([False, False]).any()
False
>>> pd.Series([True, False]).any()
True
>>> pd.Series([], dtype="float64").any()
False
>>> pd.Series([np.nan]).any()
False
>>> pd.Series([np.nan]).any(skipna=False)
True

**DataFrame**

Whether each column contains at least one True element (the default).

>>> df = pd.DataFrame({"A": [1, 2], "B": [0, 2], "C": [0, 0]})
>>> df
   A  B  C
0  1  0  0
1  2  2  0

>>> df.any()
A     True
B     True
C    False
dtype: bool

Aggregating over the columns.

>>> df = pd.DataFrame({"A": [True, False], "B": [1, 2]})
>>> df
       A  B
0   True  1
1  False  2

>>> df.any(axis='columns')
0    True
1    True
dtype: bool

>>> df = pd.DataFrame({"A": [True, False], "B": [1, 0]})
>>> df
       A  B
0   True  1
1  False  0

>>> df.any(axis='columns')
0     True
1    False
dtype: bool

Aggregating over the entire DataFrame with ``axis=None``.

>>> df.any(axis=None)
True

``any`` for an empty DataFrame is an empty Series.

>>> pd.DataFrame([]).any()
Series([], dtype: bool)
"""

_shared_docs[
    "stat_func_example"
] = """

Examples
--------
>>> idx = pd.MultiIndex.from_arrays([
...     ['warm', 'warm', 'cold', 'cold'],
...     ['dog', 'falcon', 'fish', 'spider']],
...     names=['blooded', 'animal'])
>>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
>>> s
blooded  animal
warm     dog       4
         falcon    2
cold     fish      0
         spider    8
Name: legs, dtype: int64

>>> s.{stat_func}()
{default_output}"""
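
# Hedged note (comment only): ``str.format`` silently ignores keyword
# arguments with no matching placeholder, so the extra ``verb`` and
# ``level_output_*`` keys passed below (apparently left over from the removed
# ``level`` examples) are harmless:
#
#     >>> "{stat_func}".format(stat_func="sum", verb="Sum")
#     'sum'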

_sum_examples = _shared_docs["stat_func_example"].format(
    stat_func="sum", verb="Sum", default_output=14, level_output_0=6, level_output_1=8
)

_sum_examples += """

By default, the sum of an empty or all-NA Series is ``0``.

>>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
0.0

This can be controlled with the ``min_count`` parameter. For example, if
you'd like the sum of an empty series to be NaN, pass ``min_count=1``.

>>> pd.Series([], dtype="float64").sum(min_count=1)
nan

Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
empty series identically.

>>> pd.Series([np.nan]).sum()
0.0

>>> pd.Series([np.nan]).sum(min_count=1)
nan"""
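
# Hedged aside (comment only): ``min_count`` also applies to non-empty
# input; with fewer valid values than the threshold the reduction is NA,
# e.g. in a REPL (assuming the usual ``pd``/``np`` imports):
#
#     >>> pd.Series([1.0, 2.0, np.nan]).sum(min_count=3)
#     nan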

_max_examples: str = _shared_docs["stat_func_example"].format(
    stat_func="max", verb="Max", default_output=8, level_output_0=4, level_output_1=8
)

_min_examples: str = _shared_docs["stat_func_example"].format(
    stat_func="min", verb="Min", default_output=0, level_output_0=2, level_output_1=0
)

_stat_func_see_also = """

See Also
--------
Series.sum : Return the sum.
Series.min : Return the minimum.
Series.max : Return the maximum.
Series.idxmin : Return the index of the minimum.
Series.idxmax : Return the index of the maximum.
DataFrame.sum : Return the sum over the requested axis.
DataFrame.min : Return the minimum over the requested axis.
DataFrame.max : Return the maximum over the requested axis.
DataFrame.idxmin : Return the index of the minimum over the requested axis.
DataFrame.idxmax : Return the index of the maximum over the requested axis."""

_prod_examples = """

Examples
--------
By default, the product of an empty or all-NA Series is ``1``.

>>> pd.Series([], dtype="float64").prod()
1.0

This can be controlled with the ``min_count`` parameter.

>>> pd.Series([], dtype="float64").prod(min_count=1)
nan

Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
empty series identically.

>>> pd.Series([np.nan]).prod()
1.0

>>> pd.Series([np.nan]).prod(min_count=1)
nan"""

_min_count_stub = """\
min_count : int, default 0
    The required number of valid values to perform the operation. If fewer than
    ``min_count`` non-NA values are present the result will be NA.
"""


def make_doc(name: str, ndim: int) -> str:
    """
    Generate the docstring for a Series/DataFrame reduction.
    """
    if ndim == 1:
        name1 = "scalar"
        name2 = "Series"
        axis_descr = "{index (0)}"
    else:
        name1 = "Series"
        name2 = "DataFrame"
        axis_descr = "{index (0), columns (1)}"

    if name == "any":
        base_doc = _bool_doc
        desc = _any_desc
        see_also = _any_see_also
        examples = _any_examples
        kwargs = {"empty_value": "False"}
    elif name == "all":
        base_doc = _bool_doc
        desc = _all_desc
        see_also = _all_see_also
        examples = _all_examples
        kwargs = {"empty_value": "True"}
    elif name == "min":
        base_doc = _num_doc
        desc = (
            "Return the minimum of the values over the requested axis.\n\n"
            "If you want the *index* of the minimum, use ``idxmin``. This is "
            "the equivalent of the ``numpy.ndarray`` method ``argmin``."
        )
        see_also = _stat_func_see_also
        examples = _min_examples
        kwargs = {"min_count": ""}
    elif name == "max":
        base_doc = _num_doc
        desc = (
            "Return the maximum of the values over the requested axis.\n\n"
            "If you want the *index* of the maximum, use ``idxmax``. This is "
            "the equivalent of the ``numpy.ndarray`` method ``argmax``."
        )
        see_also = _stat_func_see_also
        examples = _max_examples
        kwargs = {"min_count": ""}

    elif name == "sum":
        base_doc = _sum_prod_doc
        desc = (
            "Return the sum of the values over the requested axis.\n\n"
            "This is equivalent to the method ``numpy.sum``."
        )
        see_also = _stat_func_see_also
        examples = _sum_examples
        kwargs = {"min_count": _min_count_stub}

    elif name == "prod":
        base_doc = _sum_prod_doc
        desc = "Return the product of the values over the requested axis."
        see_also = _stat_func_see_also
        examples = _prod_examples
        kwargs = {"min_count": _min_count_stub}

    elif name == "median":
        base_doc = _num_doc
        desc = "Return the median of the values over the requested axis."
        see_also = ""
        examples = """

            Examples
            --------
            >>> s = pd.Series([1, 2, 3])
            >>> s.median()
            2.0

            With a DataFrame

            >>> df = pd.DataFrame({'a': [1, 2], 'b': [2, 3]}, index=['tiger', 'zebra'])
            >>> df
                   a  b
            tiger  1  2
            zebra  2  3
            >>> df.median()
            a    1.5
            b    2.5
            dtype: float64

            Using axis=1

            >>> df.median(axis=1)
            tiger    1.5
            zebra    2.5
            dtype: float64

            In this case, `numeric_only` should be set to `True`
            to avoid getting an error.

            >>> df = pd.DataFrame({'a': [1, 2], 'b': ['T', 'Z']},
            ...                   index=['tiger', 'zebra'])
            >>> df.median(numeric_only=True)
            a    1.5
            dtype: float64"""
        kwargs = {"min_count": ""}

    elif name == "mean":
        base_doc = _num_doc
        desc = "Return the mean of the values over the requested axis."
        see_also = ""
        examples = """

            Examples
            --------
            >>> s = pd.Series([1, 2, 3])
            >>> s.mean()
            2.0

            With a DataFrame

            >>> df = pd.DataFrame({'a': [1, 2], 'b': [2, 3]}, index=['tiger', 'zebra'])
            >>> df
                   a  b
            tiger  1  2
            zebra  2  3
            >>> df.mean()
            a    1.5
            b    2.5
            dtype: float64

            Using axis=1

            >>> df.mean(axis=1)
            tiger    1.5
            zebra    2.5
            dtype: float64

            In this case, `numeric_only` should be set to `True` to avoid
            getting an error.

            >>> df = pd.DataFrame({'a': [1, 2], 'b': ['T', 'Z']},
            ...                   index=['tiger', 'zebra'])
            >>> df.mean(numeric_only=True)
            a    1.5
            dtype: float64"""
        kwargs = {"min_count": ""}

    elif name == "var":
        base_doc = _num_ddof_doc
        desc = (
            "Return unbiased variance over requested axis.\n\nNormalized by "
            "N-1 by default. This can be changed using the ddof argument."
        )
        examples = _var_examples
        see_also = ""
        kwargs = {"notes": ""}

    elif name == "std":
        base_doc = _num_ddof_doc
        desc = (
            "Return sample standard deviation over requested axis."
            "\n\nNormalized by N-1 by default. This can be changed using the "
            "ddof argument."
        )
        examples = _std_examples
        see_also = ""
        kwargs = {"notes": _std_notes}

    elif name == "sem":
        base_doc = _num_ddof_doc
        desc = (
            "Return unbiased standard error of the mean over requested "
            "axis.\n\nNormalized by N-1 by default. This can be changed "
            "using the ddof argument."
        )
        examples = """

            Examples
            --------
            >>> s = pd.Series([1, 2, 3])
            >>> s.sem().round(6)
            0.57735

            With a DataFrame

            >>> df = pd.DataFrame({'a': [1, 2], 'b': [2, 3]}, index=['tiger', 'zebra'])
            >>> df
                   a  b
            tiger  1  2
            zebra  2  3
            >>> df.sem()
            a    0.5
            b    0.5
            dtype: float64

            Using axis=1

            >>> df.sem(axis=1)
            tiger    0.5
            zebra    0.5
            dtype: float64

            In this case, `numeric_only` should be set to `True`
            to avoid getting an error.

            >>> df = pd.DataFrame({'a': [1, 2], 'b': ['T', 'Z']},
            ...                   index=['tiger', 'zebra'])
            >>> df.sem(numeric_only=True)
            a    0.5
            dtype: float64"""
        see_also = ""
        kwargs = {"notes": ""}

    elif name == "skew":
        base_doc = _num_doc
        desc = "Return unbiased skew over requested axis.\n\nNormalized by N-1."
        see_also = ""
        examples = """

            Examples
            --------
            >>> s = pd.Series([1, 2, 3])
            >>> s.skew()
            0.0

            With a DataFrame

            >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [2, 3, 4], 'c': [1, 3, 5]},
            ...                   index=['tiger', 'zebra', 'cow'])
            >>> df
                   a  b  c
            tiger  1  2  1
            zebra  2  3  3
            cow    3  4  5
            >>> df.skew()
            a    0.0
            b    0.0
            c    0.0
            dtype: float64

            Using axis=1

            >>> df.skew(axis=1)
            tiger    1.732051
            zebra   -1.732051
            cow      0.000000
            dtype: float64

            In this case, `numeric_only` should be set to `True` to avoid
            getting an error.

            >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': ['T', 'Z', 'X']},
            ...                   index=['tiger', 'zebra', 'cow'])
            >>> df.skew(numeric_only=True)
            a    0.0
            dtype: float64"""
        kwargs = {"min_count": ""}
    elif name == "kurt":
        base_doc = _num_doc
        desc = (
            "Return unbiased kurtosis over requested axis.\n\n"
            "Kurtosis obtained using Fisher's definition of\n"
            "kurtosis (kurtosis of normal == 0.0). Normalized "
            "by N-1."
        )
        see_also = ""
        examples = """

            Examples
            --------
            >>> s = pd.Series([1, 2, 2, 3], index=['cat', 'dog', 'dog', 'mouse'])
            >>> s
            cat      1
            dog      2
            dog      2
            mouse    3
            dtype: int64
            >>> s.kurt()
            1.5

            With a DataFrame

            >>> df = pd.DataFrame({'a': [1, 2, 2, 3], 'b': [3, 4, 4, 4]},
            ...                   index=['cat', 'dog', 'dog', 'mouse'])
            >>> df
                   a  b
            cat    1  3
            dog    2  4
            dog    2  4
            mouse  3  4
            >>> df.kurt()
            a    1.5
            b    4.0
            dtype: float64

            With axis=None

            >>> df.kurt(axis=None).round(6)
            -0.988693

            Using axis=1

            >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [3, 4], 'd': [1, 2]},
            ...                   index=['cat', 'dog'])
            >>> df.kurt(axis=1)
            cat   -6.0
            dog   -6.0
            dtype: float64"""
        kwargs = {"min_count": ""}

    elif name == "cumsum":
        base_doc = _cnum_doc
        desc = "sum"
        see_also = ""
        examples = _cumsum_examples
        kwargs = {"accum_func_name": "sum"}

    elif name == "cumprod":
        base_doc = _cnum_doc
        desc = "product"
        see_also = ""
        examples = _cumprod_examples
        kwargs = {"accum_func_name": "prod"}

    elif name == "cummin":
        base_doc = _cnum_doc
        desc = "minimum"
        see_also = ""
        examples = _cummin_examples
        kwargs = {"accum_func_name": "min"}

    elif name == "cummax":
        base_doc = _cnum_doc
        desc = "maximum"
        see_also = ""
        examples = _cummax_examples
        kwargs = {"accum_func_name": "max"}

    else:
        # Include the offending name so unsupported reductions fail loudly.
        raise NotImplementedError(name)

    docstr = base_doc.format(
        desc=desc,
        name=name,
        name1=name1,
        name2=name2,
        axis_descr=axis_descr,
        see_also=see_also,
        examples=examples,
        **kwargs,
    )
    return docstr
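

# Illustrative sketch (hypothetical helper, not the actual pandas wiring):
# the generated docstring is attached to the concrete reduction methods at
# class-definition time, roughly along the lines of the decorator below.
def _with_reduction_doc(name: str, ndim: int) -> Callable[[T], T]:
    """Hypothetical example: attach the docstring built by ``make_doc``."""

    def decorator(func: T) -> T:
        # ``make_doc`` returns the fully formatted numpydoc string for the
        # given reduction and dimensionality (1 -> Series, 2 -> DataFrame).
        func.__doc__ = make_doc(name, ndim)
        return func

    return decorator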