# pyright: reportPropertyTypeMismatch=false
from __future__ import annotations

import collections
import datetime as dt
from functools import partial
import gc
from json import loads
import operator
import pickle
import re
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    ClassVar,
    Hashable,
    Iterator,
    Literal,
    Mapping,
    NoReturn,
    Sequence,
    Type,
    cast,
    final,
    overload,
)
import warnings
import weakref

import numpy as np

from pandas._config import (
    config,
    using_copy_on_write,
)

from pandas._libs import lib
from pandas._libs.lib import is_range_indexer
from pandas._libs.tslibs import (
    Period,
    Tick,
    Timestamp,
    to_offset,
)
from pandas._typing import (
    AlignJoin,
    AnyArrayLike,
    ArrayLike,
    Axis,
    AxisInt,
    CompressionOptions,
    Dtype,
    DtypeArg,
    DtypeBackend,
    DtypeObj,
    FilePath,
    FillnaOptions,
    FloatFormatType,
    FormattersType,
    Frequency,
    IgnoreRaise,
    IndexKeyFunc,
    IndexLabel,
    IntervalClosedType,
    JSONSerializable,
    Level,
    Manager,
    NaPosition,
    NDFrameT,
    RandomState,
    Renamer,
    Scalar,
    SortKind,
    StorageOptions,
    Suffixes,
    T,
    TimeAmbiguous,
    TimedeltaConvertibleTypes,
    TimeNonexistent,
    TimestampConvertibleTypes,
    ValueKeyFunc,
    WriteBuffer,
    npt,
)
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import function as nv
from pandas.errors import (
    AbstractMethodError,
    InvalidIndexError,
    SettingWithCopyError,
    SettingWithCopyWarning,
)
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import (
    check_dtype_backend,
    validate_ascending,
    validate_bool_kwarg,
    validate_fillna_kwargs,
    validate_inclusive,
)

from pandas.core.dtypes.astype import astype_is_view
from pandas.core.dtypes.common import (
    ensure_object,
    ensure_platform_int,
    ensure_str,
    is_bool,
    is_bool_dtype,
    is_datetime64_any_dtype,
    is_datetime64tz_dtype,
    is_dict_like,
    is_dtype_equal,
    is_extension_array_dtype,
    is_float,
    is_list_like,
    is_number,
    is_numeric_dtype,
    is_re_compilable,
    is_scalar,
    is_timedelta64_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCSeries,
)
from pandas.core.dtypes.inference import (
    is_hashable,
    is_nested_list_like,
)
from pandas.core.dtypes.missing import (
    isna,
    notna,
)

from pandas.core import (
    algorithms as algos,
    arraylike,
    common,
    indexing,
    nanops,
    sample,
)
from pandas.core.array_algos.replace import should_use_regex
from pandas.core.arrays import ExtensionArray
from pandas.core.base import PandasObject
from pandas.core.construction import extract_array
from pandas.core.flags import Flags
from pandas.core.indexes.api import (
    DatetimeIndex,
    Index,
    MultiIndex,
    PeriodIndex,
    RangeIndex,
    default_index,
    ensure_index,
)
from pandas.core.internals import (
    ArrayManager,
    BlockManager,
    SingleArrayManager,
)
from pandas.core.internals.construction import (
    mgr_to_mgr,
    ndarray_to_mgr,
)
from pandas.core.methods.describe import describe_ndframe
from pandas.core.missing import (
    clean_fill_method,
    clean_reindex_fill_method,
    find_valid_index,
)
from pandas.core.ops import align_method_FRAME
from pandas.core.reshape.concat import concat
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import get_indexer_indexer
from pandas.core.window import (
    Expanding,
    ExponentialMovingWindow,
    Rolling,
    Window,
)

from pandas.io.formats.format import (
    DataFrameFormatter,
    DataFrameRenderer,
)
from pandas.io.formats.printing import pprint_thing

if TYPE_CHECKING:
    from pandas._libs.tslibs import BaseOffset

    from pandas.core.frame import DataFrame
    from pandas.core.indexers.objects import BaseIndexer
    from pandas.core.resample import Resampler
    from pandas.core.series import Series

    from pandas.io.pytables import HDFStore


# goal is to be able to define the docs close to function, while still being
# able to share
_shared_docs = {**_shared_docs}
_shared_doc_kwargs = {
    "axes": "keywords for axes",
    "klass": "Series/DataFrame",
    "axes_single_arg": "int or labels for object",
    "args_transpose": "axes to permute (int or label for object)",
    "inplace": """
    inplace : bool, default False
        If True, performs operation inplace and returns None.""",
    "optional_by": """
        by : str or list of str
            Name or list of names to sort by""",
    "replace_iloc": """
    This differs from updating with ``.loc`` or ``.iloc``, which require
    you to specify a location to update with some value.""",
}


bool_t = bool  # Need alias because NDFrame has def bool:


class NDFrame(PandasObject, indexing.IndexingMixin):
    """
    N-dimensional analogue of DataFrame. Store multi-dimensional data in a
    size-mutable, labeled data structure.

    Parameters
    ----------
    data : BlockManager
    axes : list
    copy : bool, default False
    """
237
238 _internal_names: list[str] = [
239 "_mgr",
240 "_cacher",
241 "_item_cache",
242 "_cache",
243 "_is_copy",
244 "_subtyp",
245 "_name",
246 "_default_kind",
247 "_default_fill_value",
248 "_metadata",
249 "__array_struct__",
250 "__array_interface__",
251 "_flags",
252 ]
253 _internal_names_set: set[str] = set(_internal_names)
254 _accessors: set[str] = set()
255 _hidden_attrs: frozenset[str] = frozenset([])
256 _metadata: list[str] = []
257 _is_copy: weakref.ReferenceType[NDFrame] | None = None
258 _mgr: Manager
259 _attrs: dict[Hashable, Any]
260 _typ: str
261
262 # ----------------------------------------------------------------------
263 # Constructors
264
265 def __init__(
266 self,
267 data: Manager,
268 copy: bool_t = False,
269 attrs: Mapping[Hashable, Any] | None = None,
270 ) -> None:
271 # copy kwarg is retained for mypy compat, is not used
272
273 object.__setattr__(self, "_is_copy", None)
274 object.__setattr__(self, "_mgr", data)
275 object.__setattr__(self, "_item_cache", {})
276 if attrs is None:
277 attrs = {}
278 else:
279 attrs = dict(attrs)
280 object.__setattr__(self, "_attrs", attrs)
281 object.__setattr__(self, "_flags", Flags(self, allows_duplicate_labels=True))
282
283 @classmethod
284 def _init_mgr(
285 cls,
286 mgr: Manager,
287 axes,
288 dtype: Dtype | None = None,
289 copy: bool_t = False,
290 ) -> Manager:
291 """passed a manager and a axes dict"""
        for a, axe in axes.items():
            if axe is not None:
                axe = ensure_index(axe)
                bm_axis = cls._get_block_manager_axis(a)
                mgr = mgr.reindex_axis(axe, axis=bm_axis)

        # make a copy if explicitly requested
        if copy:
            mgr = mgr.copy()
        if dtype is not None:
            # avoid further copies if we can
            if (
                isinstance(mgr, BlockManager)
                and len(mgr.blocks) == 1
                and is_dtype_equal(mgr.blocks[0].values.dtype, dtype)
            ):
                pass
            else:
                mgr = mgr.astype(dtype=dtype)
        return mgr

    def _as_manager(self: NDFrameT, typ: str, copy: bool_t = True) -> NDFrameT:
        """
        Private helper function to create a DataFrame with specific manager.

        Parameters
        ----------
        typ : {"block", "array"}
        copy : bool, default True
            Only controls whether the conversion from Block->ArrayManager
            copies the 1D arrays (to ensure proper/contiguous memory layout).

        Returns
        -------
        DataFrame
            New DataFrame using specified manager type. Is not guaranteed
            to be a copy or not.
        """
        new_mgr: Manager
        new_mgr = mgr_to_mgr(self._mgr, typ=typ, copy=copy)
        # fastpath of passing a manager doesn't check the option/manager class
        return self._constructor(new_mgr).__finalize__(self)

    # ----------------------------------------------------------------------
    # attrs and flags

    @property
    def attrs(self) -> dict[Hashable, Any]:
        """
        Dictionary of global attributes of this dataset.

        .. warning::

            attrs is experimental and may change without warning.

        See Also
        --------
        DataFrame.flags : Global flags applying to this object.
        """
        if self._attrs is None:
            self._attrs = {}
        return self._attrs

    @attrs.setter
    def attrs(self, value: Mapping[Hashable, Any]) -> None:
        self._attrs = dict(value)

    @final
    @property
    def flags(self) -> Flags:
        """
        Get the properties associated with this pandas object.

        The available flags are

        * :attr:`Flags.allows_duplicate_labels`

        See Also
        --------
        Flags : Flags that apply to pandas objects.
        DataFrame.attrs : Global metadata applying to this dataset.

        Notes
        -----
        "Flags" differ from "metadata". Flags reflect properties of the
        pandas object (the Series or DataFrame). Metadata refer to properties
        of the dataset, and should be stored in :attr:`DataFrame.attrs`.

        Examples
        --------
        >>> df = pd.DataFrame({"A": [1, 2]})
        >>> df.flags
        <Flags(allows_duplicate_labels=True)>

        Flags can be read or set using ``.``

        >>> df.flags.allows_duplicate_labels
        True
        >>> df.flags.allows_duplicate_labels = False

        Or by slicing with a key

        >>> df.flags["allows_duplicate_labels"]
        False
        >>> df.flags["allows_duplicate_labels"] = True
        """
        return self._flags

    @final
    def set_flags(
        self: NDFrameT,
        *,
        copy: bool_t = False,
        allows_duplicate_labels: bool_t | None = None,
    ) -> NDFrameT:
        """
        Return a new object with updated flags.

        Parameters
        ----------
        copy : bool, default False
            Specify if a copy of the object should be made.
        allows_duplicate_labels : bool, optional
            Whether the returned object allows duplicate labels.

        Returns
        -------
        Series or DataFrame
            The same type as the caller.

        See Also
        --------
        DataFrame.attrs : Global metadata applying to this dataset.
        DataFrame.flags : Global flags applying to this object.

        Notes
        -----
        This method returns a new object that's a view on the same data
        as the input. Mutating the input or the output values will be reflected
        in the other.

        This method is intended to be used in method chains.

        "Flags" differ from "metadata". Flags reflect properties of the
        pandas object (the Series or DataFrame). Metadata refer to properties
        of the dataset, and should be stored in :attr:`DataFrame.attrs`.

        Examples
        --------
        >>> df = pd.DataFrame({"A": [1, 2]})
        >>> df.flags.allows_duplicate_labels
        True
        >>> df2 = df.set_flags(allows_duplicate_labels=False)
        >>> df2.flags.allows_duplicate_labels
        False
        """
        df = self.copy(deep=copy and not using_copy_on_write())
        if allows_duplicate_labels is not None:
            df.flags["allows_duplicate_labels"] = allows_duplicate_labels
        return df

    @final
    @classmethod
    def _validate_dtype(cls, dtype) -> DtypeObj | None:
        """validate the passed dtype"""
        if dtype is not None:
            dtype = pandas_dtype(dtype)

            # a compound dtype
            if dtype.kind == "V":
                raise NotImplementedError(
                    "compound dtypes are not implemented "
                    f"in the {cls.__name__} constructor"
                )

        return dtype

    # ----------------------------------------------------------------------
    # Construction

    @property
    def _constructor(self: NDFrameT) -> Callable[..., NDFrameT]:
        """
        Used when a manipulation result has the same dimensions as the
        original.
        """
        raise AbstractMethodError(self)

    # ----------------------------------------------------------------------
    # Internals

    @final
    @property
    def _data(self):
        # GH#33054 retained because some downstream packages use this,
        # e.g. fastparquet
        return self._mgr

    # ----------------------------------------------------------------------
    # Axis
    _stat_axis_number = 0
    _stat_axis_name = "index"
    _AXIS_ORDERS: list[Literal["index", "columns"]]
    _AXIS_TO_AXIS_NUMBER: dict[Axis, AxisInt] = {0: 0, "index": 0, "rows": 0}
    _info_axis_number: int
    _info_axis_name: Literal["index", "columns"]
    _AXIS_LEN: int

    @final
    def _construct_axes_dict(self, axes: Sequence[Axis] | None = None, **kwargs):
        """Return an axes dictionary for myself."""
        d = {a: self._get_axis(a) for a in (axes or self._AXIS_ORDERS)}
        # error: Argument 1 to "update" of "MutableMapping" has incompatible type
        # "Dict[str, Any]"; expected "SupportsKeysAndGetItem[Union[int, str], Any]"
        d.update(kwargs)  # type: ignore[arg-type]
        return d

    @final
    @classmethod
    def _get_axis_number(cls, axis: Axis) -> AxisInt:
        try:
            return cls._AXIS_TO_AXIS_NUMBER[axis]
        except KeyError:
            raise ValueError(f"No axis named {axis} for object type {cls.__name__}")

    @final
    @classmethod
    def _get_axis_name(cls, axis: Axis) -> Literal["index", "columns"]:
        axis_number = cls._get_axis_number(axis)
        return cls._AXIS_ORDERS[axis_number]

    @final
    def _get_axis(self, axis: Axis) -> Index:
        axis_number = self._get_axis_number(axis)
        assert axis_number in {0, 1}
        return self.index if axis_number == 0 else self.columns

    @final
    @classmethod
    def _get_block_manager_axis(cls, axis: Axis) -> AxisInt:
        """Map the axis to the block_manager axis."""
        axis = cls._get_axis_number(axis)
        ndim = cls._AXIS_LEN
        if ndim == 2:
            # i.e. DataFrame
            return 1 - axis
        return axis
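
    # For a DataFrame, the user-facing axes and the BlockManager axes are
    # transposed, so ``_get_block_manager_axis`` flips them. An illustrative
    # sketch (values follow from the mapping above):
    #
    #     >>> pd.DataFrame._get_block_manager_axis(0)  # rows -> BM axis 1
    #     1
    #     >>> pd.DataFrame._get_block_manager_axis("columns")
    #     0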

    @final
    def _get_axis_resolvers(self, axis: str) -> dict[str, Series | MultiIndex]:
        # index or columns
        axis_index = getattr(self, axis)
        d = {}
        prefix = axis[0]

        for i, name in enumerate(axis_index.names):
            if name is not None:
                key = level = name
            else:
                # prefix with 'i' or 'c' depending on the input axis
                # e.g., you must do ilevel_0 for the 0th level of an unnamed
                # multiindex
                key = f"{prefix}level_{i}"
                level = i

            level_values = axis_index.get_level_values(level)
            s = level_values.to_series()
            s.index = axis_index
            d[key] = s

        # put the index/columns itself in the dict
        if isinstance(axis_index, MultiIndex):
            dindex = axis_index
        else:
            dindex = axis_index.to_series()

        d[axis] = dindex
        return d

    @final
    def _get_index_resolvers(self) -> dict[Hashable, Series | MultiIndex]:
        from pandas.core.computation.parsing import clean_column_name

        d: dict[str, Series | MultiIndex] = {}
        for axis_name in self._AXIS_ORDERS:
            d.update(self._get_axis_resolvers(axis_name))

        return {clean_column_name(k): v for k, v in d.items() if not isinstance(k, int)}

    @final
    def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]:
        """
        Return the special character free column resolvers of a dataframe.

        Column names with special characters are 'cleaned up' so that they can
        be referred to by backtick quoting.
        Used in :meth:`DataFrame.eval`.
        """
        from pandas.core.computation.parsing import clean_column_name

        if isinstance(self, ABCSeries):
            return {clean_column_name(self.name): self}

        return {
            clean_column_name(k): v for k, v in self.items() if not isinstance(k, int)
        }
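
    # These resolvers are what let ``DataFrame.eval``/``DataFrame.query``
    # refer to awkward column names via backtick quoting. An illustrative
    # sketch (the frame below is hypothetical):
    #
    #     >>> df = pd.DataFrame({"total sales": [10, 20], "cost": [5, 8]})
    #     >>> df.query("`total sales` > 12")
    #        total sales  cost
    #     1           20     8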

    @property
    def _info_axis(self) -> Index:
        return getattr(self, self._info_axis_name)

    @property
    def _stat_axis(self) -> Index:
        return getattr(self, self._stat_axis_name)

    @property
    def shape(self) -> tuple[int, ...]:
        """
        Return a tuple of axis dimensions.
        """
        return tuple(len(self._get_axis(a)) for a in self._AXIS_ORDERS)

    @property
    def axes(self) -> list[Index]:
        """
        Return index label(s) of the internal NDFrame.
        """
        # we do it this way because if we have reversed axes, then
        # the block manager shows them reversed
        return [self._get_axis(a) for a in self._AXIS_ORDERS]

    @property
    def ndim(self) -> int:
        """
        Return an int representing the number of axes / array dimensions.

        Return 1 if Series. Otherwise return 2 if DataFrame.

        See Also
        --------
        ndarray.ndim : Number of array dimensions.

        Examples
        --------
        >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
        >>> s.ndim
        1

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.ndim
        2
        """
        return self._mgr.ndim

    @property
    def size(self) -> int:
        """
        Return an int representing the number of elements in this object.

        Return the number of rows if Series. Otherwise return the number of
        rows times number of columns if DataFrame.

        See Also
        --------
        ndarray.size : Number of elements in the array.

        Examples
        --------
        >>> s = pd.Series({'a': 1, 'b': 2, 'c': 3})
        >>> s.size
        3

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.size
        4
        """
        # error: Incompatible return value type (got "signedinteger[_64Bit]",
        # expected "int") [return-value]
        return np.prod(self.shape)  # type: ignore[return-value]

    def set_axis(
        self: NDFrameT,
        labels,
        *,
        axis: Axis = 0,
        copy: bool_t | None = None,
    ) -> NDFrameT:
        """
        Assign desired index to given axis.

        Indexes for%(extended_summary_sub)s row labels can be changed by assigning
        a list-like or Index.

        Parameters
        ----------
        labels : list-like, Index
            The values for the new index.

        axis : %(axes_single_arg)s, default 0
            The axis to update. The value 0 identifies the rows. For `Series`
            this parameter is unused and defaults to 0.

        copy : bool, default True
            Whether to make a copy of the underlying data.

            .. versionadded:: 1.5.0

        Returns
        -------
        %(klass)s
            An object of type %(klass)s.

        See Also
        --------
        %(klass)s.rename_axis : Alter the name of the index%(see_also_sub)s.
        """
        return self._set_axis_nocheck(labels, axis, inplace=False, copy=copy)
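
    # An illustrative sketch of ``set_axis`` (the frame is hypothetical):
    #
    #     >>> df = pd.DataFrame({"A": [1, 2, 3]})
    #     >>> df.set_axis(["a", "b", "c"], axis=0)
    #        A
    #     a  1
    #     b  2
    #     c  3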

    @final
    def _set_axis_nocheck(
        self, labels, axis: Axis, inplace: bool_t, copy: bool_t | None
    ):
        if inplace:
            setattr(self, self._get_axis_name(axis), labels)
        else:
            # With copy=False, we create a new object but don't copy the
            # underlying data.
            obj = self.copy(deep=copy and not using_copy_on_write())
            setattr(obj, obj._get_axis_name(axis), labels)
            return obj

    @final
    def _set_axis(self, axis: AxisInt, labels: AnyArrayLike | list) -> None:
        """
        This is called from the cython code when we set the `index` attribute
        directly, e.g. `series.index = [1, 2, 3]`.
        """
        labels = ensure_index(labels)
        self._mgr.set_axis(axis, labels)
        self._clear_item_cache()

    @final
    def swapaxes(
        self: NDFrameT, axis1: Axis, axis2: Axis, copy: bool_t | None = None
    ) -> NDFrameT:
        """
        Interchange axes and swap values appropriately.

        Returns
        -------
        same as input
        """
        i = self._get_axis_number(axis1)
        j = self._get_axis_number(axis2)

        if i == j:
            return self.copy(deep=copy and not using_copy_on_write())

        mapping = {i: j, j: i}

        new_axes = [self._get_axis(mapping.get(k, k)) for k in range(self._AXIS_LEN)]
        new_values = self._values.swapaxes(i, j)  # type: ignore[union-attr]
        if (
            using_copy_on_write()
            and self._mgr.is_single_block
            and isinstance(self._mgr, BlockManager)
        ):
            # This should only get hit in case of having a single block, otherwise a
            # copy is made, we don't have to set up references.
            new_mgr = ndarray_to_mgr(
                new_values,
                new_axes[0],
                new_axes[1],
                dtype=None,
                copy=False,
                typ="block",
            )
            assert isinstance(new_mgr, BlockManager)
            assert isinstance(self._mgr, BlockManager)
            new_mgr.blocks[0].refs = self._mgr.blocks[0].refs
            new_mgr.blocks[0].refs.add_reference(
                new_mgr.blocks[0]  # type: ignore[arg-type]
            )
            return self._constructor(new_mgr).__finalize__(self, method="swapaxes")

        elif (copy or copy is None) and self._mgr.is_single_block:
            new_values = new_values.copy()

        return self._constructor(
            new_values,
            *new_axes,
            # The no-copy case for CoW is handled above
            copy=False,
        ).__finalize__(self, method="swapaxes")
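
    # An illustrative sketch of ``swapaxes`` on a DataFrame (hypothetical
    # data; for 2-D objects this amounts to a transpose):
    #
    #     >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
    #     >>> df.swapaxes("index", "columns")
    #        0  1
    #     A  1  2
    #     B  3  4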

    @final
    @doc(klass=_shared_doc_kwargs["klass"])
    def droplevel(self: NDFrameT, level: IndexLabel, axis: Axis = 0) -> NDFrameT:
        """
        Return {klass} with requested index / column level(s) removed.

        Parameters
        ----------
        level : int, str, or list-like
            If a string is given, must be the name of a level.
            If list-like, elements must be names or positional indexes
            of levels.

        axis : {{0 or 'index', 1 or 'columns'}}, default 0
            Axis along which the level(s) is removed:

            * 0 or 'index': remove level(s) from the index.
            * 1 or 'columns': remove level(s) from the columns.

            For `Series` this parameter is unused and defaults to 0.

        Returns
        -------
        {klass}
            {klass} with requested index / column level(s) removed.

        Examples
        --------
        >>> df = pd.DataFrame([
        ...     [1, 2, 3, 4],
        ...     [5, 6, 7, 8],
        ...     [9, 10, 11, 12]
        ... ]).set_index([0, 1]).rename_axis(['a', 'b'])

        >>> df.columns = pd.MultiIndex.from_tuples([
        ...     ('c', 'e'), ('d', 'f')
        ... ], names=['level_1', 'level_2'])

        >>> df
        level_1   c   d
        level_2   e   f
        a b
        1 2      3   4
        5 6      7   8
        9 10    11  12

        >>> df.droplevel('a')
        level_1   c   d
        level_2   e   f
        b
        2        3   4
        6        7   8
        10      11  12

        >>> df.droplevel('level_2', axis=1)
        level_1   c   d
        a b
        1 2      3   4
        5 6      7   8
        9 10    11  12
        """
        labels = self._get_axis(axis)
        new_labels = labels.droplevel(level)
        return self.set_axis(new_labels, axis=axis, copy=None)

    def pop(self, item: Hashable) -> Series | Any:
        result = self[item]
        del self[item]

        return result
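
    # ``pop`` removes the item from the object and returns it, mirroring
    # ``dict.pop``. An illustrative sketch (hypothetical frame):
    #
    #     >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
    #     >>> df.pop("A")
    #     0    1
    #     1    2
    #     Name: A, dtype: int64
    #     >>> df.columns.tolist()
    #     ['B']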

    @final
    def squeeze(self, axis: Axis | None = None):
        """
        Squeeze 1 dimensional axis objects into scalars.

        Series or DataFrames with a single element are squeezed to a scalar.
        DataFrames with a single column or a single row are squeezed to a
        Series. Otherwise the object is unchanged.

        This method is most useful when you don't know if your
        object is a Series or DataFrame, but you do know it has just a single
        column. In that case you can safely call `squeeze` to ensure you have a
        Series.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns', None}, default None
            A specific axis to squeeze. By default, all length-1 axes are
            squeezed. For `Series` this parameter is unused and defaults to `None`.

        Returns
        -------
        DataFrame, Series, or scalar
            The projection after squeezing `axis` or all the axes.

        See Also
        --------
        Series.iloc : Integer-location based indexing for selecting scalars.
        DataFrame.iloc : Integer-location based indexing for selecting Series.
        Series.to_frame : Inverse of DataFrame.squeeze for a
            single-column DataFrame.

        Examples
        --------
        >>> primes = pd.Series([2, 3, 5, 7])

        Slicing might produce a Series with a single value:

        >>> even_primes = primes[primes % 2 == 0]
        >>> even_primes
        0    2
        dtype: int64

        >>> even_primes.squeeze()
        2

        Squeezing objects with more than one value in every axis does nothing:

        >>> odd_primes = primes[primes % 2 == 1]
        >>> odd_primes
        1    3
        2    5
        3    7
        dtype: int64

        >>> odd_primes.squeeze()
        1    3
        2    5
        3    7
        dtype: int64

        Squeezing is even more effective when used with DataFrames.

        >>> df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
        >>> df
           a  b
        0  1  2
        1  3  4

        Slicing a single column will produce a DataFrame with the columns
        having only one value:

        >>> df_a = df[['a']]
        >>> df_a
           a
        0  1
        1  3

        So the columns can be squeezed down, resulting in a Series:

        >>> df_a.squeeze('columns')
        0    1
        1    3
        Name: a, dtype: int64

        Slicing a single row from a single column will produce a single
        scalar DataFrame:

        >>> df_0a = df.loc[df.index < 1, ['a']]
        >>> df_0a
           a
        0  1

        Squeezing the rows produces a single scalar Series:

        >>> df_0a.squeeze('rows')
        a    1
        Name: 0, dtype: int64

        Squeezing all axes will project directly into a scalar:

        >>> df_0a.squeeze()
        1
        """
        axes = range(self._AXIS_LEN) if axis is None else (self._get_axis_number(axis),)
        return self.iloc[
            tuple(
                0 if i in axes and len(a) == 1 else slice(None)
                for i, a in enumerate(self.axes)
            )
        ]

    # ----------------------------------------------------------------------
    # Rename

    def _rename(
        self: NDFrameT,
        mapper: Renamer | None = None,
        *,
        index: Renamer | None = None,
        columns: Renamer | None = None,
        axis: Axis | None = None,
        copy: bool_t | None = None,
        inplace: bool_t = False,
        level: Level | None = None,
        errors: str = "ignore",
    ) -> NDFrameT | None:
        # called by Series.rename and DataFrame.rename

        if mapper is None and index is None and columns is None:
            raise TypeError("must pass an index to rename")

        if index is not None or columns is not None:
            if axis is not None:
                raise TypeError(
                    "Cannot specify both 'axis' and any of 'index' or 'columns'"
                )
            if mapper is not None:
                raise TypeError(
                    "Cannot specify both 'mapper' and any of 'index' or 'columns'"
                )
        else:
            # use the mapper argument
            if axis and self._get_axis_number(axis) == 1:
                columns = mapper
            else:
                index = mapper

        self._check_inplace_and_allows_duplicate_labels(inplace)
        result = self if inplace else self.copy(deep=copy and not using_copy_on_write())

        for axis_no, replacements in enumerate((index, columns)):
            if replacements is None:
                continue

            ax = self._get_axis(axis_no)
            f = common.get_rename_function(replacements)

            if level is not None:
                level = ax._get_level_number(level)

            # GH 13473
            if not callable(replacements):
                if ax._is_multi and level is not None:
                    indexer = ax.get_level_values(level).get_indexer_for(replacements)
                else:
                    indexer = ax.get_indexer_for(replacements)

                if errors == "raise" and len(indexer[indexer == -1]):
                    missing_labels = [
                        label
                        for index, label in enumerate(replacements)
                        if indexer[index] == -1
                    ]
                    raise KeyError(f"{missing_labels} not found in axis")

            new_index = ax._transform_index(f, level=level)
            result._set_axis_nocheck(new_index, axis=axis_no, inplace=True, copy=False)
            result._clear_item_cache()

        if inplace:
            self._update_inplace(result)
            return None
        else:
            return result.__finalize__(self, method="rename")

    @overload
    def rename_axis(
        self: NDFrameT,
        mapper: IndexLabel | lib.NoDefault = ...,
        *,
        index=...,
        columns=...,
        axis: Axis = ...,
        copy: bool_t | None = ...,
        inplace: Literal[False] = ...,
    ) -> NDFrameT:
        ...

    @overload
    def rename_axis(
        self,
        mapper: IndexLabel | lib.NoDefault = ...,
        *,
        index=...,
        columns=...,
        axis: Axis = ...,
        copy: bool_t | None = ...,
        inplace: Literal[True],
    ) -> None:
        ...

    @overload
    def rename_axis(
        self: NDFrameT,
        mapper: IndexLabel | lib.NoDefault = ...,
        *,
        index=...,
        columns=...,
        axis: Axis = ...,
        copy: bool_t | None = ...,
        inplace: bool_t = ...,
    ) -> NDFrameT | None:
        ...

    def rename_axis(
        self: NDFrameT,
        mapper: IndexLabel | lib.NoDefault = lib.no_default,
        *,
        index=lib.no_default,
        columns=lib.no_default,
        axis: Axis = 0,
        copy: bool_t | None = None,
        inplace: bool_t = False,
    ) -> NDFrameT | None:
        """
        Set the name of the axis for the index or columns.

        Parameters
        ----------
        mapper : scalar, list-like, optional
            Value to set the axis name attribute.
        index, columns : scalar, list-like, dict-like or function, optional
            A scalar, list-like, dict-like or function transformation to
            apply to that axis' values.
            Note that the ``columns`` parameter is not allowed if the
            object is a Series. This parameter only applies to DataFrame
            objects.

            Use either ``mapper`` and ``axis`` to
            specify the axis to target with ``mapper``, or ``index``
            and/or ``columns``.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to rename. For `Series` this parameter is unused and defaults to 0.
        copy : bool, default None
            Also copy underlying data.
        inplace : bool, default False
            Modifies the object directly, instead of creating a new Series
            or DataFrame.

        Returns
        -------
        Series, DataFrame, or None
            The same type as the caller or None if ``inplace=True``.

        See Also
        --------
        Series.rename : Alter Series index labels or name.
        DataFrame.rename : Alter DataFrame index labels or name.
        Index.rename : Set new names on index.

        Notes
        -----
        ``DataFrame.rename_axis`` supports two calling conventions

        * ``(index=index_mapper, columns=columns_mapper, ...)``
        * ``(mapper, axis={'index', 'columns'}, ...)``

        The first calling convention will only modify the names of
        the index and/or the names of the Index object that is the columns.
        In this case, the parameter ``copy`` is ignored.

        The second calling convention will modify the names of the
        corresponding index if mapper is a list or a scalar.
        However, if mapper is dict-like or a function, it will use the
        deprecated behavior of modifying the axis *labels*.

        We *highly* recommend using keyword arguments to clarify your
        intent.

        Examples
        --------
        **Series**

        >>> s = pd.Series(["dog", "cat", "monkey"])
        >>> s
        0       dog
        1       cat
        2    monkey
        dtype: object
        >>> s.rename_axis("animal")
        animal
        0       dog
        1       cat
        2    monkey
        dtype: object

        **DataFrame**

        >>> df = pd.DataFrame({"num_legs": [4, 4, 2],
        ...                    "num_arms": [0, 0, 2]},
        ...                   ["dog", "cat", "monkey"])
        >>> df
                num_legs  num_arms
        dog            4         0
        cat            4         0
        monkey         2         2
        >>> df = df.rename_axis("animal")
        >>> df
                num_legs  num_arms
        animal
        dog            4         0
        cat            4         0
        monkey         2         2
        >>> df = df.rename_axis("limbs", axis="columns")
        >>> df
        limbs   num_legs  num_arms
        animal
        dog            4         0
        cat            4         0
        monkey         2         2

        **MultiIndex**

        >>> df.index = pd.MultiIndex.from_product([['mammal'],
        ...                                        ['dog', 'cat', 'monkey']],
        ...                                       names=['type', 'name'])
        >>> df
        limbs          num_legs  num_arms
        type   name
        mammal dog            4         0
               cat            4         0
               monkey         2         2

        >>> df.rename_axis(index={'type': 'class'})
        limbs          num_legs  num_arms
        class  name
        mammal dog            4         0
               cat            4         0
               monkey         2         2

        >>> df.rename_axis(columns=str.upper)
        LIMBS          num_legs  num_arms
        type   name
        mammal dog            4         0
               cat            4         0
               monkey         2         2
        """
        axes = {"index": index, "columns": columns}

        if axis is not None:
            axis = self._get_axis_number(axis)

        inplace = validate_bool_kwarg(inplace, "inplace")

        if copy and using_copy_on_write():
            copy = False

        if mapper is not lib.no_default:
            # Use v0.23 behavior if a scalar or list
            non_mapper = is_scalar(mapper) or (
                is_list_like(mapper) and not is_dict_like(mapper)
            )
            if non_mapper:
                return self._set_axis_name(
                    mapper, axis=axis, inplace=inplace, copy=copy
                )
            else:
                raise ValueError("Use `.rename` to alter labels with a mapper.")
        else:
            # Use new behavior. Means that index and/or columns
            # is specified
            result = self if inplace else self.copy(deep=copy)

            for axis in range(self._AXIS_LEN):
                v = axes.get(self._get_axis_name(axis))
                if v is lib.no_default:
                    continue
                non_mapper = is_scalar(v) or (is_list_like(v) and not is_dict_like(v))
                if non_mapper:
                    newnames = v
                else:
                    f = common.get_rename_function(v)
                    curnames = self._get_axis(axis).names
                    newnames = [f(name) for name in curnames]
                result._set_axis_name(newnames, axis=axis, inplace=True, copy=copy)
            if not inplace:
                return result
            return None

    @final
    def _set_axis_name(
        self, name, axis: Axis = 0, inplace: bool_t = False, copy: bool_t | None = True
    ):
        """
        Set the name(s) of the axis.

        Parameters
        ----------
        name : str or list of str
            Name(s) to set.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to set the label. The value 0 or 'index' specifies index,
            and the value 1 or 'columns' specifies columns.
        inplace : bool, default False
            If `True`, do operation inplace and return None.
        copy : bool, default True
            Whether to make a copy of the result.

        Returns
        -------
        Series, DataFrame, or None
            The same type as the caller or `None` if `inplace` is `True`.

        See Also
        --------
        DataFrame.rename : Alter the axis labels of :class:`DataFrame`.
        Series.rename : Alter the index labels or set the index name
            of :class:`Series`.
        Index.rename : Set the name of :class:`Index` or :class:`MultiIndex`.

        Examples
        --------
        >>> df = pd.DataFrame({"num_legs": [4, 4, 2]},
        ...                   ["dog", "cat", "monkey"])
        >>> df
                num_legs
        dog            4
        cat            4
        monkey         2
        >>> df._set_axis_name("animal")
                num_legs
        animal
        dog            4
        cat            4
        monkey         2
        >>> df.index = pd.MultiIndex.from_product(
        ...                [["mammal"], ['dog', 'cat', 'monkey']])
        >>> df._set_axis_name(["type", "name"])
                       num_legs
        type   name
        mammal dog            4
               cat            4
               monkey         2
        """
        axis = self._get_axis_number(axis)
        idx = self._get_axis(axis).set_names(name)

        inplace = validate_bool_kwarg(inplace, "inplace")
        renamed = self if inplace else self.copy(deep=copy)
        if axis == 0:
            renamed.index = idx
        else:
            renamed.columns = idx

        if not inplace:
            return renamed

    # ----------------------------------------------------------------------
    # Comparison Methods

    @final
    def _indexed_same(self, other) -> bool_t:
        return all(
            self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS
        )

    @final
    def equals(self, other: object) -> bool_t:
        """
        Test whether two objects contain the same elements.

        This function allows two Series or DataFrames to be compared against
        each other to see if they have the same shape and elements. NaNs in
        the same location are considered equal.

        The row/column index does not need to have the same type, as long
        as the values are considered equal. Corresponding columns must be of
        the same dtype.

        Parameters
        ----------
        other : Series or DataFrame
            The other Series or DataFrame to be compared with the first.

        Returns
        -------
        bool
            True if all elements are the same in both objects, False
            otherwise.

        See Also
        --------
        Series.eq : Compare two Series objects of the same length
            and return a Series where each element is True if the element
            in each Series is equal, False otherwise.
        DataFrame.eq : Compare two DataFrame objects of the same shape and
            return a DataFrame where each element is True if the respective
            element in each DataFrame is equal, False otherwise.
        testing.assert_series_equal : Raises an AssertionError if left and
            right are not equal. Provides an easy interface to ignore
            inequality in dtypes, indexes and precision among others.
        testing.assert_frame_equal : Like assert_series_equal, but targets
            DataFrames.
        numpy.array_equal : Return True if two arrays have the same shape
            and elements, False otherwise.

        Examples
        --------
        >>> df = pd.DataFrame({1: [10], 2: [20]})
        >>> df
            1   2
        0  10  20

        DataFrames df and exactly_equal have the same types and values for
        their elements and column labels, which will return True.

        >>> exactly_equal = pd.DataFrame({1: [10], 2: [20]})
        >>> exactly_equal
            1   2
        0  10  20
        >>> df.equals(exactly_equal)
        True

        DataFrames df and different_column_type have the same element
        types and values, but have different types for the column labels,
        which will still return True.

        >>> different_column_type = pd.DataFrame({1.0: [10], 2.0: [20]})
        >>> different_column_type
           1.0  2.0
        0   10   20
        >>> df.equals(different_column_type)
        True

        DataFrames df and different_data_type have different types for the
        same values for their elements, and will return False even though
        their column labels are the same values and types.

        >>> different_data_type = pd.DataFrame({1: [10.0], 2: [20.0]})
        >>> different_data_type
              1     2
        0  10.0  20.0
        >>> df.equals(different_data_type)
        False
        """
        if not (isinstance(other, type(self)) or isinstance(self, type(other))):
            return False
        other = cast(NDFrame, other)
        return self._mgr.equals(other._mgr)

    # -------------------------------------------------------------------------
    # Unary Methods

    @final
    def __neg__(self: NDFrameT) -> NDFrameT:
        def blk_func(values: ArrayLike):
            if is_bool_dtype(values.dtype):
                # error: Argument 1 to "inv" has incompatible type "Union
                # [ExtensionArray, ndarray[Any, Any]]"; expected
                # "_SupportsInversion[ndarray[Any, dtype[bool_]]]"
                return operator.inv(values)  # type: ignore[arg-type]
            else:
                # error: Argument 1 to "neg" has incompatible type "Union
                # [ExtensionArray, ndarray[Any, Any]]"; expected
                # "_SupportsNeg[ndarray[Any, dtype[Any]]]"
                return operator.neg(values)  # type: ignore[arg-type]

        new_data = self._mgr.apply(blk_func)
        res = self._constructor(new_data)
        return res.__finalize__(self, method="__neg__")

    @final
    def __pos__(self: NDFrameT) -> NDFrameT:
        def blk_func(values: ArrayLike):
            if is_bool_dtype(values.dtype):
                return values.copy()
            else:
                # error: Argument 1 to "pos" has incompatible type "Union
                # [ExtensionArray, ndarray[Any, Any]]"; expected
                # "_SupportsPos[ndarray[Any, dtype[Any]]]"
                return operator.pos(values)  # type: ignore[arg-type]

        new_data = self._mgr.apply(blk_func)
        res = self._constructor(new_data)
        return res.__finalize__(self, method="__pos__")

    @final
    def __invert__(self: NDFrameT) -> NDFrameT:
        if not self.size:
            # inv fails with 0 len
            return self.copy(deep=False)

        new_data = self._mgr.apply(operator.invert)
        return self._constructor(new_data).__finalize__(self, method="__invert__")

    @final
    def __nonzero__(self) -> NoReturn:
        raise ValueError(
            f"The truth value of a {type(self).__name__} is ambiguous. "
            "Use a.empty, a.bool(), a.item(), a.any() or a.all()."
        )

    __bool__ = __nonzero__
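
    # Because ``__bool__`` always raises, using a Series or DataFrame in a
    # plain ``if``/``and``/``or`` fails fast. An illustrative sketch:
    #
    #     >>> s = pd.Series([1, 2])
    #     >>> bool(s)
    #     Traceback (most recent call last):
    #         ...
    #     ValueError: The truth value of a Series is ambiguous. Use a.empty,
    #     a.bool(), a.item(), a.any() or a.all().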

    @final
    def bool(self) -> bool_t:
        """
        Return the bool of a single element Series or DataFrame.

        This must be a boolean scalar value, either True or False. It will raise a
        ValueError if the Series or DataFrame does not have exactly 1 element, or if
        that element is not boolean (integer values 0 and 1 will also raise an
        exception).

        Returns
        -------
        bool
            The value in the Series or DataFrame.

        See Also
        --------
        Series.astype : Change the data type of a Series, including to boolean.
        DataFrame.astype : Change the data type of a DataFrame, including to boolean.
        numpy.bool_ : NumPy boolean data type, used by pandas for boolean values.

        Examples
        --------
        The method will only work for single element objects with a boolean value:

        >>> pd.Series([True]).bool()
        True
        >>> pd.Series([False]).bool()
        False

        >>> pd.DataFrame({'col': [True]}).bool()
        True
        >>> pd.DataFrame({'col': [False]}).bool()
        False
        """
        v = self.squeeze()
        if isinstance(v, (bool, np.bool_)):
            return bool(v)
        elif is_scalar(v):
            raise ValueError(
                "bool cannot act on a non-boolean single element "
                f"{type(self).__name__}"
            )

        self.__nonzero__()
        # for mypy (__nonzero__ raises)
        return True

    @final
    def abs(self: NDFrameT) -> NDFrameT:
        """
        Return a Series/DataFrame with absolute numeric value of each element.

        This function only applies to elements that are all numeric.

        Returns
        -------
        abs
            Series/DataFrame containing the absolute value of each element.

        See Also
        --------
        numpy.absolute : Calculate the absolute value element-wise.

        Notes
        -----
        For ``complex`` inputs, ``1.2 + 1j``, the absolute value is
        :math:`\\sqrt{ a^2 + b^2 }`.

        Examples
        --------
        Absolute numeric values in a Series.

        >>> s = pd.Series([-1.10, 2, -3.33, 4])
        >>> s.abs()
        0    1.10
        1    2.00
        2    3.33
        3    4.00
        dtype: float64

        Absolute numeric values in a Series with complex numbers.

        >>> s = pd.Series([1.2 + 1j])
        >>> s.abs()
        0    1.56205
        dtype: float64

        Absolute numeric values in a Series with a Timedelta element.

        >>> s = pd.Series([pd.Timedelta('1 days')])
        >>> s.abs()
        0   1 days
        dtype: timedelta64[ns]

        Select rows with data closest to certain value using argsort (from
        `StackOverflow <https://stackoverflow.com/a/17758115>`__).

        >>> df = pd.DataFrame({
        ...     'a': [4, 5, 6, 7],
        ...     'b': [10, 20, 30, 40],
        ...     'c': [100, 50, -30, -50]
        ... })
        >>> df
           a   b    c
        0  4  10  100
        1  5  20   50
        2  6  30  -30
        3  7  40  -50
        >>> df.loc[(df.c - 43).abs().argsort()]
           a   b    c
        1  5  20   50
        0  4  10  100
        2  6  30  -30
        3  7  40  -50
        """
        res_mgr = self._mgr.apply(np.abs)
        return self._constructor(res_mgr).__finalize__(self, name="abs")

    @final
    def __abs__(self: NDFrameT) -> NDFrameT:
        return self.abs()

    @final
    def __round__(self: NDFrameT, decimals: int = 0) -> NDFrameT:
        return self.round(decimals).__finalize__(self, method="__round__")

    # -------------------------------------------------------------------------
    # Label or Level Combination Helpers
    #
    # A collection of helper methods for DataFrame/Series operations that
    # accept a combination of column/index labels and levels. All such
    # operations should utilize/extend these methods when possible so that we
    # have consistent precedence and validation logic throughout the library.

    @final
    def _is_level_reference(self, key: Level, axis: Axis = 0) -> bool_t:
        """
        Test whether a key is a level reference for a given axis.

        To be considered a level reference, `key` must be a string that:
          - (axis=0): Matches the name of an index level and does NOT match
            a column label.
          - (axis=1): Matches the name of a column level and does NOT match
            an index label.

        Parameters
        ----------
        key : Hashable
            Potential level name for the given axis
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        is_level : bool
        """
        axis_int = self._get_axis_number(axis)

        return (
            key is not None
            and is_hashable(key)
            and key in self.axes[axis_int].names
            and not self._is_label_reference(key, axis=axis_int)
        )

    @final
    def _is_label_reference(self, key: Level, axis: Axis = 0) -> bool_t:
        """
        Test whether a key is a label reference for a given axis.

        To be considered a label reference, `key` must be a string that:
          - (axis=0): Matches a column label
          - (axis=1): Matches an index label

        Parameters
        ----------
        key : Hashable
            Potential label name, i.e. Index entry.
        axis : int, default 0
            Axis perpendicular to the axis that labels are associated with
            (0 means search for column labels, 1 means search for index labels)

        Returns
        -------
        is_label: bool
        """
        axis_int = self._get_axis_number(axis)
        other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis_int)

        return (
            key is not None
            and is_hashable(key)
            and any(key in self.axes[ax] for ax in other_axes)
        )

    @final
    def _is_label_or_level_reference(self, key: Level, axis: AxisInt = 0) -> bool_t:
        """
        Test whether a key is a label or level reference for a given axis.

        To be considered either a label or a level reference, `key` must be a
        string that:
          - (axis=0): Matches a column label or an index level
          - (axis=1): Matches an index label or a column level

        Parameters
        ----------
        key : Hashable
            Potential label or level name
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        bool
        """
        return self._is_level_reference(key, axis=axis) or self._is_label_reference(
            key, axis=axis
        )

    @final
    def _check_label_or_level_ambiguity(self, key: Level, axis: Axis = 0) -> None:
        """
        Check whether `key` is ambiguous.

        By ambiguous, we mean that it matches both a level of the input
        `axis` and a label of the other axis.

        Parameters
        ----------
        key : Hashable
            Label or level name.
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns).

        Raises
        ------
        ValueError: `key` is ambiguous
        """

        axis_int = self._get_axis_number(axis)
        other_axes = (ax for ax in range(self._AXIS_LEN) if ax != axis_int)

        if (
            key is not None
            and is_hashable(key)
            and key in self.axes[axis_int].names
            and any(key in self.axes[ax] for ax in other_axes)
        ):
            # Build an informative and grammatical warning
            level_article, level_type = (
                ("an", "index") if axis_int == 0 else ("a", "column")
            )

            label_article, label_type = (
                ("a", "column") if axis_int == 0 else ("an", "index")
            )

            msg = (
                f"'{key}' is both {level_article} {level_type} level and "
                f"{label_article} {label_type} label, which is ambiguous."
            )
            raise ValueError(msg)

    @final
    def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike:
        """
        Return a 1-D array of values associated with `key`, a label or level
        from the given `axis`.

        Retrieval logic:
          - (axis=0): Return column values if `key` matches a column label.
            Otherwise return index level values if `key` matches an index
            level.
          - (axis=1): Return row values if `key` matches an index label.
            Otherwise return column level values if `key` matches a column
            level.

        Parameters
        ----------
        key : Hashable
            Label or level name.
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        np.ndarray or ExtensionArray

        Raises
        ------
        KeyError
            if `key` matches neither a label nor a level
        ValueError
            if `key` matches multiple labels
        """
        axis = self._get_axis_number(axis)
        other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis]

        if self._is_label_reference(key, axis=axis):
            self._check_label_or_level_ambiguity(key, axis=axis)
            values = self.xs(key, axis=other_axes[0])._values
        elif self._is_level_reference(key, axis=axis):
            values = self.axes[axis].get_level_values(key)._values
        else:
            raise KeyError(key)

        # Check for duplicates
        if values.ndim > 1:
            if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex):
                multi_message = (
                    "\n"
                    "For a multi-index, the label must be a "
                    "tuple with elements corresponding to each level."
                )
            else:
                multi_message = ""

            label_axis_name = "column" if axis == 0 else "index"
            raise ValueError(
                f"The {label_axis_name} label '{key}' is not unique.{multi_message}"
            )

        return values
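
    # An illustrative sketch of label vs. level resolution (hypothetical
    # frame with index level "L" and column "C"):
    #
    #     >>> df = pd.DataFrame({"C": [10, 20]},
    #     ...                   index=pd.Index(["x", "y"], name="L"))
    #     >>> df._get_label_or_level_values("C")   # column label wins
    #     array([10, 20])
    #     >>> df._get_label_or_level_values("L")   # falls back to index level
    #     array(['x', 'y'], dtype=object)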

    @final
    def _drop_labels_or_levels(self, keys, axis: AxisInt = 0):
        """
        Drop labels and/or levels for the given `axis`.

        For each key in `keys`:
          - (axis=0): If key matches a column label then drop the column.
            Otherwise if key matches an index level then drop the level.
          - (axis=1): If key matches an index label then drop the row.
            Otherwise if key matches a column level then drop the level.

        Parameters
        ----------
        keys : str or list of str
            labels or levels to drop
        axis : int, default 0
            Axis that levels are associated with (0 for index, 1 for columns)

        Returns
        -------
        dropped: DataFrame

        Raises
        ------
        ValueError
            if any `keys` match neither a label nor a level
        """
        axis = self._get_axis_number(axis)

        # Validate keys
        keys = common.maybe_make_list(keys)
        invalid_keys = [
            k for k in keys if not self._is_label_or_level_reference(k, axis=axis)
        ]

        if invalid_keys:
            raise ValueError(
                "The following keys are not valid labels or "
                f"levels for axis {axis}: {invalid_keys}"
            )

        # Compute levels and labels to drop
        levels_to_drop = [k for k in keys if self._is_level_reference(k, axis=axis)]

        labels_to_drop = [k for k in keys if not self._is_level_reference(k, axis=axis)]

        # Perform copy upfront and then use inplace operations below.
        # This ensures that we always perform exactly one copy.
        # ``copy`` and/or ``inplace`` options could be added in the future.
        dropped = self.copy(deep=False)

        if axis == 0:
            # Handle dropping index levels
            if levels_to_drop:
                dropped.reset_index(levels_to_drop, drop=True, inplace=True)

            # Handle dropping column labels
            if labels_to_drop:
                dropped.drop(labels_to_drop, axis=1, inplace=True)
        else:
            # Handle dropping column levels
            if levels_to_drop:
                if isinstance(dropped.columns, MultiIndex):
                    # Drop the specified levels from the MultiIndex
                    dropped.columns = dropped.columns.droplevel(levels_to_drop)
                else:
                    # Drop the last level of Index by replacing with
                    # a RangeIndex
                    dropped.columns = RangeIndex(dropped.columns.size)

            # Handle dropping index labels
            if labels_to_drop:
                dropped.drop(labels_to_drop, axis=0, inplace=True)

        return dropped

    # ----------------------------------------------------------------------
    # Iteration

    # https://github.com/python/typeshed/issues/2148#issuecomment-520783318
    # Incompatible types in assignment (expression has type "None", base class
    # "object" defined the type as "Callable[[object], int]")
    __hash__: ClassVar[None]  # type: ignore[assignment]

    def __iter__(self) -> Iterator:
        """
        Iterate over info axis.

        Returns
        -------
        iterator
            Info axis as iterator.
        """
        return iter(self._info_axis)

    # can we get a better explanation of this?
    def keys(self) -> Index:
        """
        Get the 'info axis' (see Indexing for more).

        This is index for Series, columns for DataFrame.

        Returns
        -------
        Index
            Info axis.
        """
        return self._info_axis

    def items(self):
        """
        Iterate over (label, values) pairs on the info axis.

        This is index for Series and columns for DataFrame.

        Returns
        -------
        Generator
        """
        for h in self._info_axis:
            yield h, self[h]
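
    # An illustrative sketch of ``items`` on a DataFrame (hypothetical data;
    # each pair is a column label and that column as a Series):
    #
    #     >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
    #     >>> [label for label, column in df.items()]
    #     ['A', 'B']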

    def __len__(self) -> int:
        """Returns length of info axis"""
        return len(self._info_axis)

    @final
    def __contains__(self, key) -> bool_t:
        """True if the key is in the info axis"""
        return key in self._info_axis

    @property
    def empty(self) -> bool_t:
        """
        Indicator whether Series/DataFrame is empty.

        True if Series/DataFrame is entirely empty (no items), meaning any of the
        axes are of length 0.

        Returns
        -------
        bool
            If Series/DataFrame is empty, return True, if not return False.

        See Also
        --------
        Series.dropna : Return series without null values.
        DataFrame.dropna : Return DataFrame with labels on given axis omitted
            where (all or any) data are missing.

        Notes
        -----
        If Series/DataFrame contains only NaNs, it is still not considered empty. See
        the example below.

        Examples
        --------
        An example of an actual empty DataFrame. Notice the index is empty:

        >>> df_empty = pd.DataFrame({'A' : []})
        >>> df_empty
        Empty DataFrame
        Columns: [A]
        Index: []
        >>> df_empty.empty
        True

        If we only have NaNs in our DataFrame, it is not considered empty! We
        will need to drop the NaNs to make the DataFrame empty:

        >>> df = pd.DataFrame({'A' : [np.nan]})
        >>> df
            A
        0 NaN
        >>> df.empty
        False
        >>> df.dropna().empty
        True

        >>> ser_empty = pd.Series({'A' : []})
        >>> ser_empty
        A    []
        dtype: object
        >>> ser_empty.empty
        False
        >>> ser_empty = pd.Series()
        >>> ser_empty.empty
        True
        """
        return any(len(self._get_axis(a)) == 0 for a in self._AXIS_ORDERS)

    # ----------------------------------------------------------------------
    # Array Interface

    # This is also set in IndexOpsMixin
    # GH#23114 Ensure ndarray.__op__(DataFrame) returns NotImplemented
    __array_priority__: int = 1000

    def __array__(self, dtype: npt.DTypeLike | None = None) -> np.ndarray:
        values = self._values
        arr = np.asarray(values, dtype=dtype)
        if (
            astype_is_view(values.dtype, arr.dtype)
            and using_copy_on_write()
            and self._mgr.is_single_block
        ):
            # Check if both conversions can be done without a copy
            if astype_is_view(self.dtypes.iloc[0], values.dtype) and astype_is_view(
                values.dtype, arr.dtype
            ):
                arr = arr.view()
                arr.flags.writeable = False
        return arr
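
    # ``np.asarray`` routes through ``__array__``. An illustrative sketch
    # (hypothetical frame; under copy-on-write the returned view is made
    # read-only):
    #
    #     >>> df = pd.DataFrame({"A": [1, 2]})
    #     >>> np.asarray(df)
    #     array([[1],
    #            [2]])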
2011
2012 @final
2013 def __array_ufunc__(
2014 self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any
2015 ):
2016 return arraylike.array_ufunc(self, ufunc, method, *inputs, **kwargs)
2017
2018 # ----------------------------------------------------------------------
2019 # Picklability
2020
2021 @final
2022 def __getstate__(self) -> dict[str, Any]:
2023 meta = {k: getattr(self, k, None) for k in self._metadata}
2024 return {
2025 "_mgr": self._mgr,
2026 "_typ": self._typ,
2027 "_metadata": self._metadata,
2028 "attrs": self.attrs,
2029 "_flags": {k: self.flags[k] for k in self.flags._keys},
2030 **meta,
2031 }
2032
2033 @final
2034 def __setstate__(self, state) -> None:
2035 if isinstance(state, BlockManager):
2036 self._mgr = state
2037 elif isinstance(state, dict):
2038 if "_data" in state and "_mgr" not in state:
2039 # compat for older pickles
2040 state["_mgr"] = state.pop("_data")
2041 typ = state.get("_typ")
2042 if typ is not None:
2043 attrs = state.get("_attrs", {})
2044 object.__setattr__(self, "_attrs", attrs)
2045 flags = state.get("_flags", {"allows_duplicate_labels": True})
2046 object.__setattr__(self, "_flags", Flags(self, **flags))
2047
                # set attributes in the order of internal names
                # to avoid definitional recursion,
                # e.g. fill_value needing _mgr to be
                # defined first
2052 meta = set(self._internal_names + self._metadata)
2053 for k in list(meta):
2054 if k in state and k != "_flags":
2055 v = state[k]
2056 object.__setattr__(self, k, v)
2057
2058 for k, v in state.items():
2059 if k not in meta:
2060 object.__setattr__(self, k, v)
2061
2062 else:
2063 raise NotImplementedError("Pre-0.12 pickles are no longer supported")
2064 elif len(state) == 2:
2065 raise NotImplementedError("Pre-0.12 pickles are no longer supported")
2066
2067 self._item_cache: dict[Hashable, Series] = {}
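
    # Note: unpickling always starts with a fresh item cache; column Series
    # cached before pickling are intentionally not restored.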
2068
2069 # ----------------------------------------------------------------------
2070 # Rendering Methods
2071
2072 def __repr__(self) -> str:
2073 # string representation based upon iterating over self
2074 # (since, by definition, `PandasContainers` are iterable)
2075 prepr = f"[{','.join(map(pprint_thing, self))}]"
2076 return f"{type(self).__name__}({prepr})"
2077
2078 @final
2079 def _repr_latex_(self):
2080 """
2081 Returns a LaTeX representation for a particular object.
2082 Mainly for use with nbconvert (jupyter notebook conversion to pdf).
2083 """
2084 if config.get_option("styler.render.repr") == "latex":
2085 return self.to_latex()
2086 else:
2087 return None
2088
2089 @final
2090 def _repr_data_resource_(self):
2091 """
2092 Not a real Jupyter special repr method, but we use the same
2093 naming convention.
2094 """
2095 if config.get_option("display.html.table_schema"):
2096 data = self.head(config.get_option("display.max_rows"))
2097
2098 as_json = data.to_json(orient="table")
2099 as_json = cast(str, as_json)
2100 return loads(as_json, object_pairs_hook=collections.OrderedDict)
2101
2102 # ----------------------------------------------------------------------
2103 # I/O Methods
2104
2105 @final
2106 @doc(
2107 klass="object",
2108 storage_options=_shared_docs["storage_options"],
2109 storage_options_versionadded="1.2.0",
2110 )
2111 def to_excel(
2112 self,
2113 excel_writer,
2114 sheet_name: str = "Sheet1",
2115 na_rep: str = "",
2116 float_format: str | None = None,
2117 columns: Sequence[Hashable] | None = None,
2118 header: Sequence[Hashable] | bool_t = True,
2119 index: bool_t = True,
2120 index_label: IndexLabel = None,
2121 startrow: int = 0,
2122 startcol: int = 0,
2123 engine: str | None = None,
2124 merge_cells: bool_t = True,
2125 inf_rep: str = "inf",
2126 freeze_panes: tuple[int, int] | None = None,
2127 storage_options: StorageOptions = None,
2128 ) -> None:
2129 """
2130 Write {klass} to an Excel sheet.
2131
2132 To write a single {klass} to an Excel .xlsx file it is only necessary to
2133 specify a target file name. To write to multiple sheets it is necessary to
2134 create an `ExcelWriter` object with a target file name, and specify a sheet
2135 in the file to write to.
2136
        Multiple sheets may be written to by specifying a unique `sheet_name`
        for each. Once all data has been written to the file, the changes must
        be saved.
2139 Note that creating an `ExcelWriter` object with a file name that already
2140 exists will result in the contents of the existing file being erased.
2141
2142 Parameters
2143 ----------
2144 excel_writer : path-like, file-like, or ExcelWriter object
2145 File path or existing ExcelWriter.
2146 sheet_name : str, default 'Sheet1'
2147 Name of sheet which will contain DataFrame.
2148 na_rep : str, default ''
2149 Missing data representation.
2150 float_format : str, optional
2151 Format string for floating point numbers. For example
2152 ``float_format="%.2f"`` will format 0.1234 to 0.12.
2153 columns : sequence or list of str, optional
2154 Columns to write.
2155 header : bool or list of str, default True
            Write out the column names. If a list of strings is given, it is
            assumed to contain aliases for the column names.
2158 index : bool, default True
2159 Write row names (index).
2160 index_label : str or sequence, optional
2161 Column label for index column(s) if desired. If not specified, and
2162 `header` and `index` are True, then the index names are used. A
2163 sequence should be given if the DataFrame uses MultiIndex.
2164 startrow : int, default 0
2165 Upper left cell row to dump data frame.
2166 startcol : int, default 0
2167 Upper left cell column to dump data frame.
2168 engine : str, optional
2169 Write engine to use, 'openpyxl' or 'xlsxwriter'. You can also set this
2170 via the options ``io.excel.xlsx.writer`` or
2171 ``io.excel.xlsm.writer``.
2172
2173 merge_cells : bool, default True
2174 Write MultiIndex and Hierarchical Rows as merged cells.
2175 inf_rep : str, default 'inf'
2176 Representation for infinity (there is no native representation for
2177 infinity in Excel).
2178 freeze_panes : tuple of int (length 2), optional
2179 Specifies the one-based bottommost row and rightmost column that
2180 is to be frozen.
2181 {storage_options}
2182
2183 .. versionadded:: {storage_options_versionadded}
2184
2185 See Also
2186 --------
2187 to_csv : Write DataFrame to a comma-separated values (csv) file.
2188 ExcelWriter : Class for writing DataFrame objects into excel sheets.
2189 read_excel : Read an Excel file into a pandas DataFrame.
2190 read_csv : Read a comma-separated values (csv) file into DataFrame.
2191 io.formats.style.Styler.to_excel : Add styles to Excel sheet.
2192
2193 Notes
2194 -----
2195 For compatibility with :meth:`~DataFrame.to_csv`,
2196 to_excel serializes lists and dicts to strings before writing.
2197
2198 Once a workbook has been saved it is not possible to write further
2199 data without rewriting the whole workbook.
2200
2201 Examples
2202 --------
        Create, write to and save a workbook:
2205
2206 >>> df1 = pd.DataFrame([['a', 'b'], ['c', 'd']],
2207 ... index=['row 1', 'row 2'],
2208 ... columns=['col 1', 'col 2'])
2209 >>> df1.to_excel("output.xlsx") # doctest: +SKIP
2210
2211 To specify the sheet name:
2212
2213 >>> df1.to_excel("output.xlsx",
2214 ... sheet_name='Sheet_name_1') # doctest: +SKIP
2215
2216 If you wish to write to more than one sheet in the workbook, it is
2217 necessary to specify an ExcelWriter object:
2218
2219 >>> df2 = df1.copy()
2220 >>> with pd.ExcelWriter('output.xlsx') as writer: # doctest: +SKIP
2221 ... df1.to_excel(writer, sheet_name='Sheet_name_1')
2222 ... df2.to_excel(writer, sheet_name='Sheet_name_2')
2223
2224 ExcelWriter can also be used to append to an existing Excel file:
2225
2226 >>> with pd.ExcelWriter('output.xlsx',
2227 ... mode='a') as writer: # doctest: +SKIP
        ...     df1.to_excel(writer, sheet_name='Sheet_name_3')
2229
2230 To set the library that is used to write the Excel file,
2231 you can pass the `engine` keyword (the default engine is
2232 automatically chosen depending on the file extension):
2233
2234 >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') # doctest: +SKIP
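
        To keep the first row and first column visible while scrolling, use
        ``freeze_panes``:

        >>> df1.to_excel('output.xlsx', freeze_panes=(1, 1))  # doctest: +SKIP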
2235 """
2236
2237 df = self if isinstance(self, ABCDataFrame) else self.to_frame()
2238
2239 from pandas.io.formats.excel import ExcelFormatter
2240
2241 formatter = ExcelFormatter(
2242 df,
2243 na_rep=na_rep,
2244 cols=columns,
2245 header=header,
2246 float_format=float_format,
2247 index=index,
2248 index_label=index_label,
2249 merge_cells=merge_cells,
2250 inf_rep=inf_rep,
2251 )
2252 formatter.write(
2253 excel_writer,
2254 sheet_name=sheet_name,
2255 startrow=startrow,
2256 startcol=startcol,
2257 freeze_panes=freeze_panes,
2258 engine=engine,
2259 storage_options=storage_options,
2260 )
2261
2262 @final
2263 @doc(
2264 storage_options=_shared_docs["storage_options"],
2265 compression_options=_shared_docs["compression_options"] % "path_or_buf",
2266 )
2267 def to_json(
2268 self,
2269 path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
2270 orient: str | None = None,
2271 date_format: str | None = None,
2272 double_precision: int = 10,
2273 force_ascii: bool_t = True,
2274 date_unit: str = "ms",
2275 default_handler: Callable[[Any], JSONSerializable] | None = None,
2276 lines: bool_t = False,
2277 compression: CompressionOptions = "infer",
2278 index: bool_t = True,
2279 indent: int | None = None,
2280 storage_options: StorageOptions = None,
2281 mode: Literal["a", "w"] = "w",
2282 ) -> str | None:
2283 """
2284 Convert the object to a JSON string.
2285
        Note that NaN's and None will be converted to null, and datetime
        objects will be converted to UNIX timestamps.
2288
2289 Parameters
2290 ----------
2291 path_or_buf : str, path object, file-like object, or None, default None
2292 String, path object (implementing os.PathLike[str]), or file-like
2293 object implementing a write() function. If None, the result is
2294 returned as a string.
2295 orient : str
2296 Indication of expected JSON string format.
2297
2298 * Series:
2299
2300 - default is 'index'
2301 - allowed values are: {{'split', 'records', 'index', 'table'}}.
2302
2303 * DataFrame:
2304
2305 - default is 'columns'
2306 - allowed values are: {{'split', 'records', 'index', 'columns',
2307 'values', 'table'}}.
2308
2309 * The format of the JSON string:
2310
2311 - 'split' : dict like {{'index' -> [index], 'columns' -> [columns],
2312 'data' -> [values]}}
2313 - 'records' : list like [{{column -> value}}, ... , {{column -> value}}]
2314 - 'index' : dict like {{index -> {{column -> value}}}}
2315 - 'columns' : dict like {{column -> {{index -> value}}}}
2316 - 'values' : just the values array
2317 - 'table' : dict like {{'schema': {{schema}}, 'data': {{data}}}}
2318
2319 Describing the data, where data component is like ``orient='records'``.
2320
2321 date_format : {{None, 'epoch', 'iso'}}
2322 Type of date conversion. 'epoch' = epoch milliseconds,
2323 'iso' = ISO8601. The default depends on the `orient`. For
2324 ``orient='table'``, the default is 'iso'. For all other orients,
2325 the default is 'epoch'.
2326 double_precision : int, default 10
2327 The number of decimal places to use when encoding
2328 floating point values.
2329 force_ascii : bool, default True
2330 Force encoded string to be ASCII.
2331 date_unit : str, default 'ms' (milliseconds)
2332 The time unit to encode to, governs timestamp and ISO8601
2333 precision. One of 's', 'ms', 'us', 'ns' for second, millisecond,
2334 microsecond, and nanosecond respectively.
2335 default_handler : callable, default None
2336 Handler to call if object cannot otherwise be converted to a
2337 suitable format for JSON. Should receive a single argument which is
2338 the object to convert and return a serialisable object.
        lines : bool, default False
            If 'orient' is 'records', write out line-delimited json format.
            Will raise ValueError with any other 'orient', since the other
            formats are not list-like.
2343 {compression_options}
2344
2345 .. versionchanged:: 1.4.0 Zstandard support.
2346
2347 index : bool, default True
2348 Whether to include the index values in the JSON string. Not
2349 including the index (``index=False``) is only supported when
2350 orient is 'split' or 'table'.
2351 indent : int, optional
2352 Length of whitespace used to indent each record.
2353
2354 {storage_options}
2355
2356 .. versionadded:: 1.2.0
2357
2358 mode : str, default 'w' (writing)
2359 Specify the IO mode for output when supplying a path_or_buf.
2360 Accepted args are 'w' (writing) and 'a' (append) only.
2361 mode='a' is only supported when lines is True and orient is 'records'.
2362
2363 Returns
2364 -------
2365 None or str
2366 If path_or_buf is None, returns the resulting json format as a
2367 string. Otherwise returns None.
2368
2369 See Also
2370 --------
2371 read_json : Convert a JSON string to pandas object.
2372
2373 Notes
2374 -----
2375 The behavior of ``indent=0`` varies from the stdlib, which does not
2376 indent the output but does insert newlines. Currently, ``indent=0``
2377 and the default ``indent=None`` are equivalent in pandas, though this
2378 may change in a future release.
2379
2380 ``orient='table'`` contains a 'pandas_version' field under 'schema'.
2381 This stores the version of `pandas` used in the latest revision of the
2382 schema.
2383
2384 Examples
2385 --------
2386 >>> from json import loads, dumps
2387 >>> df = pd.DataFrame(
2388 ... [["a", "b"], ["c", "d"]],
2389 ... index=["row 1", "row 2"],
2390 ... columns=["col 1", "col 2"],
2391 ... )
2392
2393 >>> result = df.to_json(orient="split")
2394 >>> parsed = loads(result)
2395 >>> dumps(parsed, indent=4) # doctest: +SKIP
2396 {{
2397 "columns": [
2398 "col 1",
2399 "col 2"
2400 ],
2401 "index": [
2402 "row 1",
2403 "row 2"
2404 ],
2405 "data": [
2406 [
2407 "a",
2408 "b"
2409 ],
2410 [
2411 "c",
2412 "d"
2413 ]
2414 ]
2415 }}
2416
        Encoding/decoding a DataFrame using ``'records'`` formatted JSON.
        Note that index labels are not preserved with this encoding.
2419
2420 >>> result = df.to_json(orient="records")
2421 >>> parsed = loads(result)
2422 >>> dumps(parsed, indent=4) # doctest: +SKIP
2423 [
2424 {{
2425 "col 1": "a",
2426 "col 2": "b"
2427 }},
2428 {{
2429 "col 1": "c",
2430 "col 2": "d"
2431 }}
2432 ]
2433
        Encoding/decoding a DataFrame using ``'index'`` formatted JSON:
2435
2436 >>> result = df.to_json(orient="index")
2437 >>> parsed = loads(result)
2438 >>> dumps(parsed, indent=4) # doctest: +SKIP
2439 {{
2440 "row 1": {{
2441 "col 1": "a",
2442 "col 2": "b"
2443 }},
2444 "row 2": {{
2445 "col 1": "c",
2446 "col 2": "d"
2447 }}
2448 }}
2449
        Encoding/decoding a DataFrame using ``'columns'`` formatted JSON:
2451
2452 >>> result = df.to_json(orient="columns")
2453 >>> parsed = loads(result)
2454 >>> dumps(parsed, indent=4) # doctest: +SKIP
2455 {{
2456 "col 1": {{
2457 "row 1": "a",
2458 "row 2": "c"
2459 }},
2460 "col 2": {{
2461 "row 1": "b",
2462 "row 2": "d"
2463 }}
2464 }}
2465
        Encoding/decoding a DataFrame using ``'values'`` formatted JSON:
2467
2468 >>> result = df.to_json(orient="values")
2469 >>> parsed = loads(result)
2470 >>> dumps(parsed, indent=4) # doctest: +SKIP
2471 [
2472 [
2473 "a",
2474 "b"
2475 ],
2476 [
2477 "c",
2478 "d"
2479 ]
2480 ]
2481
2482 Encoding with Table Schema:
2483
2484 >>> result = df.to_json(orient="table")
2485 >>> parsed = loads(result)
2486 >>> dumps(parsed, indent=4) # doctest: +SKIP
2487 {{
2488 "schema": {{
2489 "fields": [
2490 {{
2491 "name": "index",
2492 "type": "string"
2493 }},
2494 {{
2495 "name": "col 1",
2496 "type": "string"
2497 }},
2498 {{
2499 "name": "col 2",
2500 "type": "string"
2501 }}
2502 ],
2503 "primaryKey": [
2504 "index"
2505 ],
2506 "pandas_version": "1.4.0"
2507 }},
2508 "data": [
2509 {{
2510 "index": "row 1",
2511 "col 1": "a",
2512 "col 2": "b"
2513 }},
2514 {{
2515 "index": "row 2",
2516 "col 1": "c",
2517 "col 2": "d"
2518 }}
2519 ]
2520 }}
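
        Encoding line-delimited JSON, one record per line (only valid with
        ``orient='records'``):

        >>> result = df.to_json(orient="records", lines=True)
        >>> print(result)  # doctest: +SKIP
        {{"col 1":"a","col 2":"b"}}
        {{"col 1":"c","col 2":"d"}}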
2521 """
2522 from pandas.io import json
2523
2524 if date_format is None and orient == "table":
2525 date_format = "iso"
2526 elif date_format is None:
2527 date_format = "epoch"
2528
2529 config.is_nonnegative_int(indent)
2530 indent = indent or 0
2531
2532 return json.to_json(
2533 path_or_buf=path_or_buf,
2534 obj=self,
2535 orient=orient,
2536 date_format=date_format,
2537 double_precision=double_precision,
2538 force_ascii=force_ascii,
2539 date_unit=date_unit,
2540 default_handler=default_handler,
2541 lines=lines,
2542 compression=compression,
2543 index=index,
2544 indent=indent,
2545 storage_options=storage_options,
2546 mode=mode,
2547 )
2548
2549 @final
2550 def to_hdf(
2551 self,
2552 path_or_buf: FilePath | HDFStore,
2553 key: str,
2554 mode: str = "a",
2555 complevel: int | None = None,
2556 complib: str | None = None,
2557 append: bool_t = False,
2558 format: str | None = None,
2559 index: bool_t = True,
2560 min_itemsize: int | dict[str, int] | None = None,
2561 nan_rep=None,
2562 dropna: bool_t | None = None,
2563 data_columns: Literal[True] | list[str] | None = None,
2564 errors: str = "strict",
2565 encoding: str = "UTF-8",
2566 ) -> None:
2567 """
2568 Write the contained data to an HDF5 file using HDFStore.
2569
2570 Hierarchical Data Format (HDF) is self-describing, allowing an
2571 application to interpret the structure and contents of a file with
2572 no outside information. One HDF file can hold a mix of related objects
2573 which can be accessed as a group or as individual objects.
2574
        In order to add another DataFrame or Series to an existing HDF file
        please use append mode and a different key.
2577
2578 .. warning::
2579
2580 One can store a subclass of ``DataFrame`` or ``Series`` to HDF5,
2581 but the type of the subclass is lost upon storing.
2582
2583 For more information see the :ref:`user guide <io.hdf5>`.
2584
2585 Parameters
2586 ----------
2587 path_or_buf : str or pandas.HDFStore
2588 File path or HDFStore object.
2589 key : str
2590 Identifier for the group in the store.
2591 mode : {'a', 'w', 'r+'}, default 'a'
2592 Mode to open file:
2593
2594 - 'w': write, a new file is created (an existing file with
2595 the same name would be deleted).
2596 - 'a': append, an existing file is opened for reading and
2597 writing, and if the file does not exist it is created.
2598 - 'r+': similar to 'a', but the file must already exist.
2599 complevel : {0-9}, default None
2600 Specifies a compression level for data.
2601 A value of 0 or None disables compression.
2602 complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
2603 Specifies the compression library to be used.
2604 As of v0.20.2 these additional compressors for Blosc are supported
2605 (default if no compressor specified: 'blosc:blosclz'):
2606 {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
2607 'blosc:zlib', 'blosc:zstd'}.
2608 Specifying a compression library which is not available issues
2609 a ValueError.
        append : bool, default False
            For Table formats, append the input data to the existing table.
2612 format : {'fixed', 'table', None}, default 'fixed'
2613 Possible values:
2614
2615 - 'fixed': Fixed format. Fast writing/reading. Not-appendable,
2616 nor searchable.
2617 - 'table': Table format. Write as a PyTables Table structure
2618 which may perform worse but allow more flexible operations
2619 like searching / selecting subsets of the data.
2620 - If None, pd.get_option('io.hdf.default_format') is checked,
2621 followed by fallback to "fixed".
2622 index : bool, default True
2623 Write DataFrame index as a column.
2624 min_itemsize : dict or int, optional
2625 Map column names to minimum string sizes for columns.
2626 nan_rep : Any, optional
2627 How to represent null values as str.
2628 Not allowed with append=True.
        dropna : bool, default False
2630 Remove missing values.
2631 data_columns : list of columns or True, optional
2632 List of columns to create as indexed data columns for on-disk
2633 queries, or True to use all columns. By default only the axes
2634 of the object are indexed. See
            :ref:`Query via data columns<io.hdf5-query-data-columns>` for
            more information.
2637 Applicable only to format='table'.
2638 errors : str, default 'strict'
2639 Specifies how encoding and decoding errors are to be handled.
2640 See the errors argument for :func:`open` for a full list
2641 of options.
        encoding : str, default "UTF-8"
            Character encoding for string data.
2643
2644 See Also
2645 --------
2646 read_hdf : Read from HDF file.
2647 DataFrame.to_orc : Write a DataFrame to the binary orc format.
2648 DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
2649 DataFrame.to_sql : Write to a SQL table.
2650 DataFrame.to_feather : Write out feather-format for DataFrames.
2651 DataFrame.to_csv : Write out to a csv file.
2652
2653 Examples
2654 --------
2655 >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]},
2656 ... index=['a', 'b', 'c']) # doctest: +SKIP
2657 >>> df.to_hdf('data.h5', key='df', mode='w') # doctest: +SKIP
2658
2659 We can add another object to the same file:
2660
2661 >>> s = pd.Series([1, 2, 3, 4]) # doctest: +SKIP
2662 >>> s.to_hdf('data.h5', key='s') # doctest: +SKIP
2663
2664 Reading from HDF file:
2665
2666 >>> pd.read_hdf('data.h5', 'df') # doctest: +SKIP
2667 A B
2668 a 1 4
2669 b 2 5
2670 c 3 6
2671 >>> pd.read_hdf('data.h5', 's') # doctest: +SKIP
2672 0 1
2673 1 2
2674 2 3
2675 3 4
2676 dtype: int64
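
        Appending rows to an existing key requires the table format:

        >>> df.to_hdf('data.h5', key='df2', format='table')  # doctest: +SKIP
        >>> df.to_hdf('data.h5', key='df2', format='table',
        ...           append=True)  # doctest: +SKIP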
2677 """
2678 from pandas.io import pytables
2679
2680 # Argument 3 to "to_hdf" has incompatible type "NDFrame"; expected
2681 # "Union[DataFrame, Series]" [arg-type]
2682 pytables.to_hdf(
2683 path_or_buf,
2684 key,
2685 self, # type: ignore[arg-type]
2686 mode=mode,
2687 complevel=complevel,
2688 complib=complib,
2689 append=append,
2690 format=format,
2691 index=index,
2692 min_itemsize=min_itemsize,
2693 nan_rep=nan_rep,
2694 dropna=dropna,
2695 data_columns=data_columns,
2696 errors=errors,
2697 encoding=encoding,
2698 )
2699
2700 @final
2701 def to_sql(
2702 self,
2703 name: str,
2704 con,
2705 schema: str | None = None,
2706 if_exists: Literal["fail", "replace", "append"] = "fail",
2707 index: bool_t = True,
2708 index_label: IndexLabel = None,
2709 chunksize: int | None = None,
2710 dtype: DtypeArg | None = None,
2711 method: str | None = None,
2712 ) -> int | None:
2713 """
2714 Write records stored in a DataFrame to a SQL database.
2715
2716 Databases supported by SQLAlchemy [1]_ are supported. Tables can be
2717 newly created, appended to, or overwritten.
2718
2719 Parameters
2720 ----------
2721 name : str
2722 Name of SQL table.
2723 con : sqlalchemy.engine.(Engine or Connection) or sqlite3.Connection
2724 Using SQLAlchemy makes it possible to use any DB supported by that
2725 library. Legacy support is provided for sqlite3.Connection objects. The user
2726 is responsible for engine disposal and connection closure for the SQLAlchemy
2727 connectable. See `here \
2728 <https://docs.sqlalchemy.org/en/20/core/connections.html>`_.
2729 If passing a sqlalchemy.engine.Connection which is already in a transaction,
2730 the transaction will not be committed. If passing a sqlite3.Connection,
2731 it will not be possible to roll back the record insertion.
2732
2733 schema : str, optional
2734 Specify the schema (if database flavor supports this). If None, use
2735 default schema.
2736 if_exists : {'fail', 'replace', 'append'}, default 'fail'
2737 How to behave if the table already exists.
2738
2739 * fail: Raise a ValueError.
2740 * replace: Drop the table before inserting new values.
2741 * append: Insert new values to the existing table.
2742
2743 index : bool, default True
2744 Write DataFrame index as a column. Uses `index_label` as the column
2745 name in the table.
2746 index_label : str or sequence, default None
2747 Column label for index column(s). If None is given (default) and
2748 `index` is True, then the index names are used.
2749 A sequence should be given if the DataFrame uses MultiIndex.
2750 chunksize : int, optional
2751 Specify the number of rows in each batch to be written at a time.
2752 By default, all rows will be written at once.
2753 dtype : dict or scalar, optional
2754 Specifying the datatype for columns. If a dictionary is used, the
2755 keys should be the column names and the values should be the
2756 SQLAlchemy types or strings for the sqlite3 legacy mode. If a
2757 scalar is provided, it will be applied to all columns.
2758 method : {None, 'multi', callable}, optional
2759 Controls the SQL insertion clause used:
2760
2761 * None : Uses standard SQL ``INSERT`` clause (one per row).
2762 * 'multi': Pass multiple values in a single ``INSERT`` clause.
2763 * callable with signature ``(pd_table, conn, keys, data_iter)``.
2764
2765 Details and a sample callable implementation can be found in the
2766 section :ref:`insert method <io.sql.method>`.
2767
2768 Returns
2769 -------
2770 None or int
2771 Number of rows affected by to_sql. None is returned if the callable
2772 passed into ``method`` does not return an integer number of rows.
2773
            The number of returned rows affected is the sum of the ``rowcount``
            attribute of ``sqlite3.Cursor`` or SQLAlchemy connectable which may
            not reflect the exact number of written rows as stipulated in the
            `sqlite3 <https://docs.python.org/3/library/sqlite3.html#sqlite3.Cursor.rowcount>`__ or
            `SQLAlchemy <https://docs.sqlalchemy.org/en/20/core/connections.html#sqlalchemy.engine.CursorResult.rowcount>`__ documentation.
2779
2780 .. versionadded:: 1.4.0
2781
2782 Raises
2783 ------
2784 ValueError
2785 When the table already exists and `if_exists` is 'fail' (the
2786 default).
2787
2788 See Also
2789 --------
2790 read_sql : Read a DataFrame from a table.
2791
2792 Notes
2793 -----
2794 Timezone aware datetime columns will be written as
2795 ``Timestamp with timezone`` type with SQLAlchemy if supported by the
2796 database. Otherwise, the datetimes will be stored as timezone unaware
2797 timestamps local to the original timezone.
2798
2799 References
2800 ----------
2801 .. [1] https://docs.sqlalchemy.org
2802 .. [2] https://www.python.org/dev/peps/pep-0249/
2803
2804 Examples
2805 --------
2806 Create an in-memory SQLite database.
2807
2808 >>> from sqlalchemy import create_engine
2809 >>> engine = create_engine('sqlite://', echo=False)
2810
2811 Create a table from scratch with 3 rows.
2812
2813 >>> df = pd.DataFrame({'name' : ['User 1', 'User 2', 'User 3']})
2814 >>> df
2815 name
2816 0 User 1
2817 1 User 2
2818 2 User 3
2819
2820 >>> df.to_sql('users', con=engine)
2821 3
2822 >>> from sqlalchemy import text
2823 >>> with engine.connect() as conn:
2824 ... conn.execute(text("SELECT * FROM users")).fetchall()
2825 [(0, 'User 1'), (1, 'User 2'), (2, 'User 3')]
2826
2827 An `sqlalchemy.engine.Connection` can also be passed to `con`:
2828
2829 >>> with engine.begin() as connection:
2830 ... df1 = pd.DataFrame({'name' : ['User 4', 'User 5']})
2831 ... df1.to_sql('users', con=connection, if_exists='append')
2832 2
2833
2834 This is allowed to support operations that require that the same
2835 DBAPI connection is used for the entire operation.
2836
2837 >>> df2 = pd.DataFrame({'name' : ['User 6', 'User 7']})
2838 >>> df2.to_sql('users', con=engine, if_exists='append')
2839 2
2840 >>> with engine.connect() as conn:
2841 ... conn.execute(text("SELECT * FROM users")).fetchall()
2842 [(0, 'User 1'), (1, 'User 2'), (2, 'User 3'),
2843 (0, 'User 4'), (1, 'User 5'), (0, 'User 6'),
2844 (1, 'User 7')]
2845
2846 Overwrite the table with just ``df2``.
2847
2848 >>> df2.to_sql('users', con=engine, if_exists='replace',
2849 ... index_label='id')
2850 2
2851 >>> with engine.connect() as conn:
2852 ... conn.execute(text("SELECT * FROM users")).fetchall()
2853 [(0, 'User 6'), (1, 'User 7')]
2854
2855 Specify the dtype (especially useful for integers with missing values).
2856 Notice that while pandas is forced to store the data as floating point,
2857 the database supports nullable integers. When fetching the data with
2858 Python, we get back integer scalars.
2859
2860 >>> df = pd.DataFrame({"A": [1, None, 2]})
2861 >>> df
2862 A
2863 0 1.0
2864 1 NaN
2865 2 2.0
2866
2867 >>> from sqlalchemy.types import Integer
2868 >>> df.to_sql('integers', con=engine, index=False,
2869 ... dtype={"A": Integer()})
2870 3
2871
2872 >>> with engine.connect() as conn:
2873 ... conn.execute(text("SELECT * FROM integers")).fetchall()
2874 [(1,), (None,), (2,)]
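
        A callable can also be passed as ``method``; a minimal sketch of such
        an insert function (the name ``insert_rows`` and its body are
        illustrative, not part of the pandas API):

        >>> def insert_rows(pd_table, conn, keys, data_iter):  # doctest: +SKIP
        ...     # pd_table: pandas SQLTable; conn: SQLAlchemy connection;
        ...     # keys: column names; data_iter: iterable of row tuples.
        ...     data = [dict(zip(keys, row)) for row in data_iter]
        ...     return conn.execute(pd_table.table.insert(), data).rowcount
        >>> df2.to_sql('users', con=engine, if_exists='append',
        ...            method=insert_rows)  # doctest: +SKIP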
2875 """ # noqa:E501
2876 from pandas.io import sql
2877
2878 return sql.to_sql(
2879 self,
2880 name,
2881 con,
2882 schema=schema,
2883 if_exists=if_exists,
2884 index=index,
2885 index_label=index_label,
2886 chunksize=chunksize,
2887 dtype=dtype,
2888 method=method,
2889 )
2890
2891 @final
2892 @doc(
2893 storage_options=_shared_docs["storage_options"],
2894 compression_options=_shared_docs["compression_options"] % "path",
2895 )
2896 def to_pickle(
2897 self,
2898 path: FilePath | WriteBuffer[bytes],
2899 compression: CompressionOptions = "infer",
2900 protocol: int = pickle.HIGHEST_PROTOCOL,
2901 storage_options: StorageOptions = None,
2902 ) -> None:
2903 """
2904 Pickle (serialize) object to file.
2905
2906 Parameters
2907 ----------
2908 path : str, path object, or file-like object
2909 String, path object (implementing ``os.PathLike[str]``), or file-like
2910 object implementing a binary ``write()`` function. File path where
2911 the pickled object will be stored.
2912 {compression_options}
2913 protocol : int
2914 Int which indicates which protocol should be used by the pickler,
2915 default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible
2916 values are 0, 1, 2, 3, 4, 5. A negative value for the protocol
2917 parameter is equivalent to setting its value to HIGHEST_PROTOCOL.
2918
2919 .. [1] https://docs.python.org/3/library/pickle.html.
2920
2921 {storage_options}
2922
2923 .. versionadded:: 1.2.0
2924
2925 See Also
2926 --------
2927 read_pickle : Load pickled pandas object (or any object) from file.
2928 DataFrame.to_hdf : Write DataFrame to an HDF5 file.
2929 DataFrame.to_sql : Write DataFrame to a SQL database.
2930 DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
2931
2932 Examples
2933 --------
2934 >>> original_df = pd.DataFrame({{"foo": range(5), "bar": range(5, 10)}}) # doctest: +SKIP
2935 >>> original_df # doctest: +SKIP
2936 foo bar
2937 0 0 5
2938 1 1 6
2939 2 2 7
2940 3 3 8
2941 4 4 9
2942 >>> original_df.to_pickle("./dummy.pkl") # doctest: +SKIP
2943
2944 >>> unpickled_df = pd.read_pickle("./dummy.pkl") # doctest: +SKIP
2945 >>> unpickled_df # doctest: +SKIP
2946 foo bar
2947 0 0 5
2948 1 1 6
2949 2 2 7
2950 3 3 8
2951 4 4 9
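
        A compressed pickle can be written by passing ``compression``
        explicitly (with a recognized extension such as ``.gz`` it would be
        inferred):

        >>> original_df.to_pickle("./dummy.pkl.gz",
        ...                       compression="gzip")  # doctest: +SKIP
        >>> pd.read_pickle("./dummy.pkl.gz")  # doctest: +SKIP
           foo  bar
        0    0    5
        1    1    6
        2    2    7
        3    3    8
        4    4    9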
2952 """ # noqa: E501
2953 from pandas.io.pickle import to_pickle
2954
2955 to_pickle(
2956 self,
2957 path,
2958 compression=compression,
2959 protocol=protocol,
2960 storage_options=storage_options,
2961 )
2962
2963 @final
2964 def to_clipboard(
2965 self, excel: bool_t = True, sep: str | None = None, **kwargs
2966 ) -> None:
2967 r"""
2968 Copy object to the system clipboard.
2969
2970 Write a text representation of object to the system clipboard.
2971 This can be pasted into Excel, for example.
2972
2973 Parameters
2974 ----------
2975 excel : bool, default True
            Produce output in a csv format for easy pasting into Excel.
2977
2978 - True, use the provided separator for csv pasting.
2979 - False, write a string representation of the object to the clipboard.
2980
2981 sep : str, default ``'\t'``
2982 Field delimiter.
2983 **kwargs
2984 These parameters will be passed to DataFrame.to_csv.
2985
2986 See Also
2987 --------
2988 DataFrame.to_csv : Write a DataFrame to a comma-separated values
2989 (csv) file.
2990 read_clipboard : Read text from clipboard and pass to read_csv.
2991
2992 Notes
2993 -----
        Requirements for your platform:
2995
2996 - Linux : `xclip`, or `xsel` (with `PyQt4` modules)
2997 - Windows : none
2998 - macOS : none
2999
        This method uses the processes developed for the package `pyperclip`.
        A solution for copying any output string format is shown in the
        examples.
3002
3003 Examples
3004 --------
3005 Copy the contents of a DataFrame to the clipboard.
3006
3007 >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=['A', 'B', 'C'])
3008
3009 >>> df.to_clipboard(sep=',') # doctest: +SKIP
3010 ... # Wrote the following to the system clipboard:
3011 ... # ,A,B,C
3012 ... # 0,1,2,3
3013 ... # 1,4,5,6
3014
        We can omit the index by passing the keyword `index` and setting
        it to ``False``.
3017
3018 >>> df.to_clipboard(sep=',', index=False) # doctest: +SKIP
3019 ... # Wrote the following to the system clipboard:
3020 ... # A,B,C
3021 ... # 1,2,3
3022 ... # 4,5,6
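
        We can also copy the plain string representation by setting ``excel``
        to ``False``:

        >>> df.to_clipboard(excel=False)  # doctest: +SKIP
        ... # Wrote the following to the system clipboard:
        ... #    A  B  C
        ... # 0  1  2  3
        ... # 1  4  5  6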
3023
3024 Using the original `pyperclip` package for any string output format.
3025
3026 .. code-block:: python
3027
3028 import pyperclip
3029 html = df.style.to_html()
3030 pyperclip.copy(html)
3031 """
3032 from pandas.io import clipboards
3033
3034 clipboards.to_clipboard(self, excel=excel, sep=sep, **kwargs)
3035
3036 @final
3037 def to_xarray(self):
3038 """
3039 Return an xarray object from the pandas object.
3040
3041 Returns
3042 -------
3043 xarray.DataArray or xarray.Dataset
3044 Data in the pandas structure converted to Dataset if the object is
3045 a DataFrame, or a DataArray if the object is a Series.
3046
3047 See Also
3048 --------
3049 DataFrame.to_hdf : Write DataFrame to an HDF5 file.
3050 DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
3051
3052 Notes
3053 -----
        See the `xarray docs <https://xarray.pydata.org/en/stable/>`__ for
        further details.
3055
3056 Examples
3057 --------
3058 >>> df = pd.DataFrame([('falcon', 'bird', 389.0, 2),
3059 ... ('parrot', 'bird', 24.0, 2),
3060 ... ('lion', 'mammal', 80.5, 4),
3061 ... ('monkey', 'mammal', np.nan, 4)],
3062 ... columns=['name', 'class', 'max_speed',
3063 ... 'num_legs'])
3064 >>> df
3065 name class max_speed num_legs
3066 0 falcon bird 389.0 2
3067 1 parrot bird 24.0 2
3068 2 lion mammal 80.5 4
3069 3 monkey mammal NaN 4
3070
3071 >>> df.to_xarray()
3072 <xarray.Dataset>
3073 Dimensions: (index: 4)
3074 Coordinates:
3075 * index (index) int64 0 1 2 3
3076 Data variables:
3077 name (index) object 'falcon' 'parrot' 'lion' 'monkey'
3078 class (index) object 'bird' 'bird' 'mammal' 'mammal'
3079 max_speed (index) float64 389.0 24.0 80.5 nan
3080 num_legs (index) int64 2 2 4 4
3081
3082 >>> df['max_speed'].to_xarray()
3083 <xarray.DataArray 'max_speed' (index: 4)>
3084 array([389. , 24. , 80.5, nan])
3085 Coordinates:
3086 * index (index) int64 0 1 2 3
3087
3088 >>> dates = pd.to_datetime(['2018-01-01', '2018-01-01',
3089 ... '2018-01-02', '2018-01-02'])
3090 >>> df_multiindex = pd.DataFrame({'date': dates,
3091 ... 'animal': ['falcon', 'parrot',
3092 ... 'falcon', 'parrot'],
3093 ... 'speed': [350, 18, 361, 15]})
3094 >>> df_multiindex = df_multiindex.set_index(['date', 'animal'])
3095
3096 >>> df_multiindex
3097 speed
3098 date animal
3099 2018-01-01 falcon 350
3100 parrot 18
3101 2018-01-02 falcon 361
3102 parrot 15
3103
3104 >>> df_multiindex.to_xarray()
3105 <xarray.Dataset>
3106 Dimensions: (date: 2, animal: 2)
3107 Coordinates:
3108 * date (date) datetime64[ns] 2018-01-01 2018-01-02
3109 * animal (animal) object 'falcon' 'parrot'
3110 Data variables:
3111 speed (date, animal) int64 350 18 361 15
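
        The conversion can be reversed with xarray's ``Dataset.to_dataframe``
        (shown here as a sketch of the round trip):

        >>> df_multiindex.to_xarray().to_dataframe()  # doctest: +SKIP
                           speed
        date       animal
        2018-01-01 falcon    350
                   parrot     18
        2018-01-02 falcon    361
                   parrot     15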
3112 """
3113 xarray = import_optional_dependency("xarray")
3114
3115 if self.ndim == 1:
3116 return xarray.DataArray.from_series(self)
3117 else:
3118 return xarray.Dataset.from_dataframe(self)
3119
3120 @overload
3121 def to_latex(
3122 self,
3123 buf: None = ...,
3124 columns: Sequence[Hashable] | None = ...,
3125 header: bool_t | Sequence[str] = ...,
3126 index: bool_t = ...,
3127 na_rep: str = ...,
3128 formatters: FormattersType | None = ...,
3129 float_format: FloatFormatType | None = ...,
3130 sparsify: bool_t | None = ...,
3131 index_names: bool_t = ...,
3132 bold_rows: bool_t = ...,
3133 column_format: str | None = ...,
3134 longtable: bool_t | None = ...,
3135 escape: bool_t | None = ...,
3136 encoding: str | None = ...,
3137 decimal: str = ...,
3138 multicolumn: bool_t | None = ...,
3139 multicolumn_format: str | None = ...,
3140 multirow: bool_t | None = ...,
3141 caption: str | tuple[str, str] | None = ...,
3142 label: str | None = ...,
3143 position: str | None = ...,
3144 ) -> str:
3145 ...
3146
3147 @overload
3148 def to_latex(
3149 self,
3150 buf: FilePath | WriteBuffer[str],
3151 columns: Sequence[Hashable] | None = ...,
3152 header: bool_t | Sequence[str] = ...,
3153 index: bool_t = ...,
3154 na_rep: str = ...,
3155 formatters: FormattersType | None = ...,
3156 float_format: FloatFormatType | None = ...,
3157 sparsify: bool_t | None = ...,
3158 index_names: bool_t = ...,
3159 bold_rows: bool_t = ...,
3160 column_format: str | None = ...,
3161 longtable: bool_t | None = ...,
3162 escape: bool_t | None = ...,
3163 encoding: str | None = ...,
3164 decimal: str = ...,
3165 multicolumn: bool_t | None = ...,
3166 multicolumn_format: str | None = ...,
3167 multirow: bool_t | None = ...,
3168 caption: str | tuple[str, str] | None = ...,
3169 label: str | None = ...,
3170 position: str | None = ...,
3171 ) -> None:
3172 ...
3173
3174 @final
3175 def to_latex(
3176 self,
3177 buf: FilePath | WriteBuffer[str] | None = None,
3178 columns: Sequence[Hashable] | None = None,
3179 header: bool_t | Sequence[str] = True,
3180 index: bool_t = True,
3181 na_rep: str = "NaN",
3182 formatters: FormattersType | None = None,
3183 float_format: FloatFormatType | None = None,
3184 sparsify: bool_t | None = None,
3185 index_names: bool_t = True,
3186 bold_rows: bool_t = False,
3187 column_format: str | None = None,
3188 longtable: bool_t | None = None,
3189 escape: bool_t | None = None,
3190 encoding: str | None = None,
3191 decimal: str = ".",
3192 multicolumn: bool_t | None = None,
3193 multicolumn_format: str | None = None,
3194 multirow: bool_t | None = None,
3195 caption: str | tuple[str, str] | None = None,
3196 label: str | None = None,
3197 position: str | None = None,
3198 ) -> str | None:
3199 r"""
3200 Render object to a LaTeX tabular, longtable, or nested table.
3201
3202 Requires ``\usepackage{{booktabs}}``. The output can be copy/pasted
3203 into a main LaTeX document or read from an external file
3204 with ``\input{{table.tex}}``.
3205
3206 .. versionchanged:: 1.2.0
3207 Added position argument, changed meaning of caption argument.
3208
3209 .. versionchanged:: 2.0.0
3210 Refactored to use the Styler implementation via jinja2 templating.
3211
3212 Parameters
3213 ----------
3214 buf : str, Path or StringIO-like, optional, default None
3215 Buffer to write to. If None, the output is returned as a string.
3216 columns : list of label, optional
3217 The subset of columns to write. Writes all columns by default.
3218 header : bool or list of str, default True
3219 Write out the column names. If a list of strings is given,
3220 it is assumed to be aliases for the column names.
3221 index : bool, default True
3222 Write row names (index).
3223 na_rep : str, default 'NaN'
3224 Missing data representation.
3225 formatters : list of functions or dict of {{str: function}}, optional
3226 Formatter functions to apply to columns' elements by position or
3227 name. The result of each function must be a unicode string.
3228 List must be of length equal to the number of columns.
3229 float_format : one-parameter function or str, optional, default None
3230 Formatter for floating point numbers. For example
3231 ``float_format="%.2f"`` and ``float_format="{{:0.2f}}".format`` will
3232 both result in 0.1234 being formatted as 0.12.
3233 sparsify : bool, optional
3234 Set to False for a DataFrame with a hierarchical index to print
3235 every multiindex key at each row. By default, the value will be
3236 read from the config module.
3237 index_names : bool, default True
3238 Prints the names of the indexes.
3239 bold_rows : bool, default False
3240 Make the row labels bold in the output.
3241 column_format : str, optional
3242 The columns format as specified in `LaTeX table format
3243 <https://en.wikibooks.org/wiki/LaTeX/Tables>`__ e.g. 'rcl' for 3
3244 columns. By default, 'l' will be used for all columns except
3245 columns of numbers, which default to 'r'.
3246 longtable : bool, optional
3247 Use a longtable environment instead of tabular. Requires
3248 adding a \usepackage{{longtable}} to your LaTeX preamble.
3249 By default, the value will be read from the pandas config
3250 module, and set to `True` if the option ``styler.latex.environment`` is
3251 `"longtable"`.
3252
3253 .. versionchanged:: 2.0.0
3254 The pandas option affecting this argument has changed.
        escape : bool, optional
            By default, the value will be read from the pandas config
            module and set to `True` if the option ``styler.format.escape`` is
            `"latex"`. When set to False, escaping of LaTeX special characters
            in column names is disabled.
3260
3261 .. versionchanged:: 2.0.0
3262 The pandas option affecting this argument has changed, as has the
3263 default value to `False`.
3264 encoding : str, optional
3265 A string representing the encoding to use in the output file,
3266 defaults to 'utf-8'.
3267 decimal : str, default '.'
3268 Character recognized as decimal separator, e.g. ',' in Europe.
3269 multicolumn : bool, default True
3270 Use \multicolumn to enhance MultiIndex columns.
3271 The default will be read from the config module, and is set
3272 as the option ``styler.sparse.columns``.
3273
3274 .. versionchanged:: 2.0.0
3275 The pandas option affecting this argument has changed.
        multicolumn_format : str, default 'r'
            The alignment for multicolumns, similar to `column_format`.
3278 The default will be read from the config module, and is set as the option
3279 ``styler.latex.multicol_align``.
3280
3281 .. versionchanged:: 2.0.0
3282 The pandas option affecting this argument has changed, as has the
3283 default value to "r".
3284 multirow : bool, default True
3285 Use \multirow to enhance MultiIndex rows. Requires adding a
3286 \usepackage{{multirow}} to your LaTeX preamble. Will print
3287 centered labels (instead of top-aligned) across the contained
3288 rows, separating groups via clines. The default will be read
3289 from the pandas config module, and is set as the option
3290 ``styler.sparse.index``.
3291
3292 .. versionchanged:: 2.0.0
3293 The pandas option affecting this argument has changed, as has the
3294 default value to `True`.
3295 caption : str or tuple, optional
3296 Tuple (full_caption, short_caption),
3297 which results in ``\caption[short_caption]{{full_caption}}``;
3298 if a single string is passed, no short caption will be set.
3299
3300 .. versionchanged:: 1.2.0
3301 Optionally allow caption to be a tuple ``(full_caption, short_caption)``.
3302
3303 label : str, optional
3304 The LaTeX label to be placed inside ``\label{{}}`` in the output.
3305 This is used with ``\ref{{}}`` in the main ``.tex`` file.
3306
3307 position : str, optional
3308 The LaTeX positional argument for tables, to be placed after
3309 ``\begin{{}}`` in the output.
3310
3311 .. versionadded:: 1.2.0
3312
3313 Returns
3314 -------
3315 str or None
3316 If buf is None, returns the result as a string. Otherwise returns None.
3317
3318 See Also
3319 --------
3320 io.formats.style.Styler.to_latex : Render a DataFrame to LaTeX
3321 with conditional formatting.
3322 DataFrame.to_string : Render a DataFrame to a console-friendly
3323 tabular output.
3324 DataFrame.to_html : Render a DataFrame as an HTML table.
3325
3326 Notes
3327 -----
3328 As of v2.0.0 this method has changed to use the Styler implementation as
3329 part of :meth:`.Styler.to_latex` via ``jinja2`` templating. This means
3330 that ``jinja2`` is a requirement, and needs to be installed, for this method
3331 to function. It is advised that users switch to using Styler, since that
3332 implementation is more frequently updated and contains much more
3333 flexibility with the output.
3334
3335 Examples
3336 --------
3337 Convert a general DataFrame to LaTeX with formatting:
3338
3339 >>> df = pd.DataFrame(dict(name=['Raphael', 'Donatello'],
3340 ... age=[26, 45],
3341 ... height=[181.23, 177.65]))
3342 >>> print(df.to_latex(index=False,
3343 ... formatters={"name": str.upper},
3344 ... float_format="{:.1f}".format,
3345 ... )) # doctest: +SKIP
3346 \begin{tabular}{lrr}
3347 \toprule
3348 name & age & height \\
3349 \midrule
3350 RAPHAEL & 26 & 181.2 \\
3351 DONATELLO & 45 & 177.7 \\
3352 \bottomrule
3353 \end{tabular}
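
        A caption and a referenceable label can be attached to the table (the
        strings here are placeholders):

        >>> print(df.to_latex(caption='Turtle sizes',
        ...                   label='tab:turtles'))  # doctest: +SKIP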
3354 """
3355 # Get defaults from the pandas config
3356 if self.ndim == 1:
3357 self = self.to_frame()
3358 if longtable is None:
3359 longtable = config.get_option("styler.latex.environment") == "longtable"
3360 if escape is None:
3361 escape = config.get_option("styler.format.escape") == "latex"
3362 if multicolumn is None:
3363 multicolumn = config.get_option("styler.sparse.columns")
3364 if multicolumn_format is None:
3365 multicolumn_format = config.get_option("styler.latex.multicol_align")
3366 if multirow is None:
3367 multirow = config.get_option("styler.sparse.index")
3368
3369 if column_format is not None and not isinstance(column_format, str):
3370 raise ValueError("`column_format` must be str or unicode")
3371 length = len(self.columns) if columns is None else len(columns)
3372 if isinstance(header, (list, tuple)) and len(header) != length:
3373 raise ValueError(f"Writing {length} cols but got {len(header)} aliases")
3374
3375 # Refactor formatters/float_format/decimal/na_rep/escape to Styler structure
3376 base_format_ = {
3377 "na_rep": na_rep,
3378 "escape": "latex" if escape else None,
3379 "decimal": decimal,
3380 }
3381 index_format_: dict[str, Any] = {"axis": 0, **base_format_}
3382 column_format_: dict[str, Any] = {"axis": 1, **base_format_}
3383
3384 if isinstance(float_format, str):
3385 float_format_: Callable | None = lambda x: float_format % x
3386 else:
3387 float_format_ = float_format
3388
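        # Route float/complex values through float_format_ and everything
        # else through the supplied per-column formatter.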
3389 def _wrap(x, alt_format_):
3390 if isinstance(x, (float, complex)) and float_format_ is not None:
3391 return float_format_(x)
3392 else:
3393 return alt_format_(x)
3394
3395 formatters_: list | tuple | dict | Callable | None = None
3396 if isinstance(formatters, list):
3397 formatters_ = {
3398 c: partial(_wrap, alt_format_=formatters[i])
3399 for i, c in enumerate(self.columns)
3400 }
3401 elif isinstance(formatters, dict):
3402 index_formatter = formatters.pop("__index__", None)
3403 column_formatter = formatters.pop("__columns__", None)
3404 if index_formatter is not None:
3405 index_format_.update({"formatter": index_formatter})
3406 if column_formatter is not None:
3407 column_format_.update({"formatter": column_formatter})
3408
3409 formatters_ = formatters
3410 float_columns = self.select_dtypes(include="float").columns
3411 for col in float_columns:
                if col not in formatters:
3413 formatters_.update({col: float_format_})
3414 elif formatters is None and float_format is not None:
3415 formatters_ = partial(_wrap, alt_format_=lambda v: v)
3416 format_index_ = [index_format_, column_format_]
3417
3418 # Deal with hiding indexes and relabelling column names
3419 hide_: list[dict] = []
3420 relabel_index_: list[dict] = []
3421 if columns:
3422 hide_.append(
3423 {
3424 "subset": [c for c in self.columns if c not in columns],
3425 "axis": "columns",
3426 }
3427 )
3428 if header is False:
3429 hide_.append({"axis": "columns"})
3430 elif isinstance(header, (list, tuple)):
3431 relabel_index_.append({"labels": header, "axis": "columns"})
3432 format_index_ = [index_format_] # column_format is overwritten
3433
3434 if index is False:
3435 hide_.append({"axis": "index"})
3436 if index_names is False:
3437 hide_.append({"names": True, "axis": "index"})
3438
3439 render_kwargs_ = {
3440 "hrules": True,
3441 "sparse_index": sparsify,
3442 "sparse_columns": sparsify,
3443 "environment": "longtable" if longtable else None,
3444 "multicol_align": multicolumn_format
3445 if multicolumn
3446 else f"naive-{multicolumn_format}",
3447 "multirow_align": "t" if multirow else "naive",
3448 "encoding": encoding,
3449 "caption": caption,
3450 "label": label,
3451 "position": position,
3452 "column_format": column_format,
3453 "clines": "skip-last;data"
3454 if (multirow and isinstance(self.index, MultiIndex))
3455 else None,
3456 "bold_rows": bold_rows,
3457 }
3458
3459 return self._to_latex_via_styler(
3460 buf,
3461 hide=hide_,
3462 relabel_index=relabel_index_,
3463 format={"formatter": formatters_, **base_format_},
3464 format_index=format_index_,
3465 render_kwargs=render_kwargs_,
3466 )
3467
3468 def _to_latex_via_styler(
3469 self,
3470 buf=None,
3471 *,
3472 hide: dict | list[dict] | None = None,
3473 relabel_index: dict | list[dict] | None = None,
3474 format: dict | list[dict] | None = None,
3475 format_index: dict | list[dict] | None = None,
3476 render_kwargs: dict | None = None,
3477 ):
3478 """
3479 Render object to a LaTeX tabular, longtable, or nested table.
3480
3481 Uses the ``Styler`` implementation with the following, ordered, method chaining:
3482
        .. code-block:: python

            styler = Styler(DataFrame)
3485 styler.hide(**hide)
3486 styler.relabel_index(**relabel_index)
3487 styler.format(**format)
3488 styler.format_index(**format_index)
3489 styler.to_latex(buf=buf, **render_kwargs)
3490
3491 Parameters
3492 ----------
3493 buf : str, Path or StringIO-like, optional, default None
3494 Buffer to write to. If None, the output is returned as a string.
3495 hide : dict, list of dict
3496 Keyword args to pass to the method call of ``Styler.hide``. If a list will
3497 call the method numerous times.
3498 relabel_index : dict, list of dict
3499 Keyword args to pass to the method of ``Styler.relabel_index``. If a list
3500 will call the method numerous times.
3501 format : dict, list of dict
3502 Keyword args to pass to the method call of ``Styler.format``. If a list will
3503 call the method numerous times.
3504 format_index : dict, list of dict
3505 Keyword args to pass to the method call of ``Styler.format_index``. If a
3506 list will call the method numerous times.
3507 render_kwargs : dict
3508 Keyword args to pass to the method call of ``Styler.to_latex``.
3509
3510 Returns
3511 -------
3512 str or None
3513 If buf is None, returns the result as a string. Otherwise returns None.
3514 """
3515 from pandas.io.formats.style import Styler
3516
3517 self = cast("DataFrame", self)
3518 styler = Styler(self, uuid="")
3519
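        # Apply each Styler method once per kwargs dict; a list of dicts means
        # the method is called repeatedly (e.g. hiding several subsets).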
3520 for kw_name in ["hide", "relabel_index", "format", "format_index"]:
3521 kw = vars()[kw_name]
3522 if isinstance(kw, dict):
3523 getattr(styler, kw_name)(**kw)
3524 elif isinstance(kw, list):
3525 for sub_kw in kw:
3526 getattr(styler, kw_name)(**sub_kw)
3527
3528 # bold_rows is not a direct kwarg of Styler.to_latex
3529 render_kwargs = {} if render_kwargs is None else render_kwargs
        if render_kwargs.pop("bold_rows", False):
3531 styler.applymap_index(lambda v: "textbf:--rwrap;")
3532
3533 return styler.to_latex(buf=buf, **render_kwargs)
3534
3535 @overload
3536 def to_csv(
3537 self,
3538 path_or_buf: None = ...,
3539 sep: str = ...,
3540 na_rep: str = ...,
3541 float_format: str | Callable | None = ...,
3542 columns: Sequence[Hashable] | None = ...,
3543 header: bool_t | list[str] = ...,
3544 index: bool_t = ...,
3545 index_label: IndexLabel | None = ...,
3546 mode: str = ...,
3547 encoding: str | None = ...,
3548 compression: CompressionOptions = ...,
3549 quoting: int | None = ...,
3550 quotechar: str = ...,
3551 lineterminator: str | None = ...,
3552 chunksize: int | None = ...,
3553 date_format: str | None = ...,
3554 doublequote: bool_t = ...,
3555 escapechar: str | None = ...,
3556 decimal: str = ...,
3557 errors: str = ...,
3558 storage_options: StorageOptions = ...,
3559 ) -> str:
3560 ...
3561
3562 @overload
3563 def to_csv(
3564 self,
3565 path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str],
3566 sep: str = ...,
3567 na_rep: str = ...,
3568 float_format: str | Callable | None = ...,
3569 columns: Sequence[Hashable] | None = ...,
3570 header: bool_t | list[str] = ...,
3571 index: bool_t = ...,
3572 index_label: IndexLabel | None = ...,
3573 mode: str = ...,
3574 encoding: str | None = ...,
3575 compression: CompressionOptions = ...,
3576 quoting: int | None = ...,
3577 quotechar: str = ...,
3578 lineterminator: str | None = ...,
3579 chunksize: int | None = ...,
3580 date_format: str | None = ...,
3581 doublequote: bool_t = ...,
3582 escapechar: str | None = ...,
3583 decimal: str = ...,
3584 errors: str = ...,
3585 storage_options: StorageOptions = ...,
3586 ) -> None:
3587 ...
3588
3589 @final
3590 @doc(
3591 storage_options=_shared_docs["storage_options"],
3592 compression_options=_shared_docs["compression_options"] % "path_or_buf",
3593 )
3594 def to_csv(
3595 self,
3596 path_or_buf: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
3597 sep: str = ",",
3598 na_rep: str = "",
3599 float_format: str | Callable | None = None,
3600 columns: Sequence[Hashable] | None = None,
3601 header: bool_t | list[str] = True,
3602 index: bool_t = True,
3603 index_label: IndexLabel | None = None,
3604 mode: str = "w",
3605 encoding: str | None = None,
3606 compression: CompressionOptions = "infer",
3607 quoting: int | None = None,
3608 quotechar: str = '"',
3609 lineterminator: str | None = None,
3610 chunksize: int | None = None,
3611 date_format: str | None = None,
3612 doublequote: bool_t = True,
3613 escapechar: str | None = None,
3614 decimal: str = ".",
3615 errors: str = "strict",
3616 storage_options: StorageOptions = None,
3617 ) -> str | None:
3618 r"""
3619 Write object to a comma-separated values (csv) file.
3620
3621 Parameters
3622 ----------
3623 path_or_buf : str, path object, file-like object, or None, default None
3624 String, path object (implementing os.PathLike[str]), or file-like
3625 object implementing a write() function. If None, the result is
3626 returned as a string. If a non-binary file object is passed, it should
3627 be opened with `newline=''`, disabling universal newlines. If a binary
3628 file object is passed, `mode` might need to contain a `'b'`.
3629
3630 .. versionchanged:: 1.2.0
3631
3632 Support for binary file objects was introduced.
3633
3634 sep : str, default ','
3635 String of length 1. Field delimiter for the output file.
3636 na_rep : str, default ''
3637 Missing data representation.
3638 float_format : str, Callable, default None
3639 Format string for floating point numbers. If a Callable is given, it takes
3640 precedence over other numeric formatting parameters, like decimal.
3641 columns : sequence, optional
3642 Columns to write.
3643 header : bool or list of str, default True
3644 Write out the column names. If a list of strings is given it is
3645 assumed to be aliases for the column names.
3646 index : bool, default True
3647 Write row names (index).
3648 index_label : str or sequence, or False, default None
3649 Column label for index column(s) if desired. If None is given, and
3650 `header` and `index` are True, then the index names are used. A
3651 sequence should be given if the object uses MultiIndex. If
3652 False do not print fields for index names. Use index_label=False
3653 for easier importing in R.
3654 mode : str, default 'w'
3655 Python write mode. The available write modes are the same as
3656 :py:func:`open`.
3657 encoding : str, optional
3658 A string representing the encoding to use in the output file,
3659 defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
3660 is a non-binary file object.
3661 {compression_options}
3662
3663 .. versionchanged:: 1.0.0
3664
3665 May now be a dict with key 'method' as compression mode
3666 and other entries as additional compression options if
3667 compression mode is 'zip'.
3668
3669 .. versionchanged:: 1.1.0
3670
3671 Passing compression options as keys in dict is
3672 supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'.
3673
3674 .. versionchanged:: 1.2.0
3675
3676 Compression is supported for binary file objects.
3677
3678 .. versionchanged:: 1.2.0
3679
3680 Previous versions forwarded dict entries for 'gzip' to
3681 `gzip.open` instead of `gzip.GzipFile` which prevented
3682 setting `mtime`.
3683
3684 quoting : optional constant from csv module
3685 Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
3686 then floats are converted to strings and thus csv.QUOTE_NONNUMERIC
3687 will treat them as non-numeric.
3688 quotechar : str, default '\"'
3689 String of length 1. Character used to quote fields.
        lineterminator : str, optional
            The newline character or character sequence to use in the output
            file. Defaults to `os.linesep`, which depends on the OS in which
            this method is called (e.g. '\\n' for Linux, '\\r\\n' for Windows).
3694
3695 .. versionchanged:: 1.5.0
3696
3697 Previously was line_terminator, changed for consistency with
3698 read_csv and the standard library 'csv' module.
3699
3700 chunksize : int or None
3701 Rows to write at a time.
3702 date_format : str, default None
3703 Format string for datetime objects.
3704 doublequote : bool, default True
3705 Control quoting of `quotechar` inside a field.
3706 escapechar : str, default None
3707 String of length 1. Character used to escape `sep` and `quotechar`
3708 when appropriate.
3709 decimal : str, default '.'
3710 Character recognized as decimal separator. E.g. use ',' for
3711 European data.
3712 errors : str, default 'strict'
3713 Specifies how encoding and decoding errors are to be handled.
3714 See the errors argument for :func:`open` for a full list
3715 of options.
3716
3717 .. versionadded:: 1.1.0
3718
3719 {storage_options}
3720
3721 .. versionadded:: 1.2.0
3722
3723 Returns
3724 -------
3725 None or str
3726 If path_or_buf is None, returns the resulting csv format as a
3727 string. Otherwise returns None.
3728
3729 See Also
3730 --------
3731 read_csv : Load a CSV file into a DataFrame.
3732 to_excel : Write DataFrame to an Excel file.
3733
3734 Examples
3735 --------
3736 >>> df = pd.DataFrame({{'name': ['Raphael', 'Donatello'],
3737 ... 'mask': ['red', 'purple'],
3738 ... 'weapon': ['sai', 'bo staff']}})
3739 >>> df.to_csv(index=False)
3740 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n'
3741
3742 Create 'out.zip' containing 'out.csv'
3743
3744 >>> compression_opts = dict(method='zip',
3745 ... archive_name='out.csv') # doctest: +SKIP
3746 >>> df.to_csv('out.zip', index=False,
3747 ... compression=compression_opts) # doctest: +SKIP
3748
        To write a CSV file to a new folder or nested folder, you will first
        need to create it using either Pathlib or os:
3751
3752 >>> from pathlib import Path # doctest: +SKIP
3753 >>> filepath = Path('folder/subfolder/out.csv') # doctest: +SKIP
3754 >>> filepath.parent.mkdir(parents=True, exist_ok=True) # doctest: +SKIP
3755 >>> df.to_csv(filepath) # doctest: +SKIP
3756
3757 >>> import os # doctest: +SKIP
3758 >>> os.makedirs('folder/subfolder', exist_ok=True) # doctest: +SKIP
3759 >>> df.to_csv('folder/subfolder/out.csv') # doctest: +SKIP
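
        A dict passed to ``compression`` can also carry options for the
        underlying compressor; for instance (an illustrative sketch), the
        1.2.0 change noted above makes it possible to fix the gzip ``mtime``:

        >>> compression_opts = {{'method': 'gzip', 'mtime': 0}}  # doctest: +SKIP
        >>> df.to_csv('out.csv.gz',
        ...           compression=compression_opts)  # doctest: +SKIP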
3760 """
3761 df = self if isinstance(self, ABCDataFrame) else self.to_frame()
3762
3763 formatter = DataFrameFormatter(
3764 frame=df,
3765 header=header,
3766 index=index,
3767 na_rep=na_rep,
3768 float_format=float_format,
3769 decimal=decimal,
3770 )
3771
3772 return DataFrameRenderer(formatter).to_csv(
3773 path_or_buf,
3774 lineterminator=lineterminator,
3775 sep=sep,
3776 encoding=encoding,
3777 errors=errors,
3778 compression=compression,
3779 quoting=quoting,
3780 columns=columns,
3781 index_label=index_label,
3782 mode=mode,
3783 chunksize=chunksize,
3784 quotechar=quotechar,
3785 date_format=date_format,
3786 doublequote=doublequote,
3787 escapechar=escapechar,
3788 storage_options=storage_options,
3789 )
3790
3791 # ----------------------------------------------------------------------
3792 # Lookup Caching
3793
3794 def _reset_cacher(self) -> None:
3795 """
3796 Reset the cacher.
3797 """
3798 raise AbstractMethodError(self)
3799
3800 def _maybe_update_cacher(
3801 self,
3802 clear: bool_t = False,
3803 verify_is_copy: bool_t = True,
3804 inplace: bool_t = False,
3805 ) -> None:
3806 """
        See if we need to update our parent cacher; if ``clear``, then clear
        our cache.

        Parameters
        ----------
        clear : bool, default False
            Clear the item cache.
        verify_is_copy : bool, default True
            Provide is_copy checks.
        inplace : bool, default False
            Whether the update happens in-place; relevant to subclass
            implementations that track a cacher.
3816 """
3817 if using_copy_on_write():
3818 return
3819
3820 if verify_is_copy:
3821 self._check_setitem_copy(t="referent")
3822
3823 if clear:
3824 self._clear_item_cache()
3825
3826 def _clear_item_cache(self) -> None:
3827 raise AbstractMethodError(self)
3828
3829 # ----------------------------------------------------------------------
3830 # Indexing Methods
3831
3832 def take(self: NDFrameT, indices, axis: Axis = 0, **kwargs) -> NDFrameT:
3833 """
3834 Return the elements in the given *positional* indices along an axis.
3835
3836 This means that we are not indexing according to actual values in
3837 the index attribute of the object. We are indexing according to the
3838 actual position of the element in the object.
3839
3840 Parameters
3841 ----------
3842 indices : array-like
3843 An array of ints indicating which positions to take.
3844 axis : {0 or 'index', 1 or 'columns', None}, default 0
3845 The axis on which to select elements. ``0`` means that we are
3846 selecting rows, ``1`` means that we are selecting columns.
3847 For `Series` this parameter is unused and defaults to 0.
3848 **kwargs
3849 For compatibility with :meth:`numpy.take`. Has no effect on the
3850 output.
3851
3852 Returns
3853 -------
3854 same type as caller
3855 An array-like containing the elements taken from the object.
3856
3857 See Also
3858 --------
3859 DataFrame.loc : Select a subset of a DataFrame by labels.
3860 DataFrame.iloc : Select a subset of a DataFrame by positions.
3861 numpy.take : Take elements from an array along an axis.
3862
3863 Examples
3864 --------
3865 >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
3866 ... ('parrot', 'bird', 24.0),
3867 ... ('lion', 'mammal', 80.5),
3868 ... ('monkey', 'mammal', np.nan)],
3869 ... columns=['name', 'class', 'max_speed'],
3870 ... index=[0, 2, 3, 1])
3871 >>> df
3872 name class max_speed
3873 0 falcon bird 389.0
3874 2 parrot bird 24.0
3875 3 lion mammal 80.5
3876 1 monkey mammal NaN
3877
3878 Take elements at positions 0 and 3 along the axis 0 (default).
3879
3880 Note how the actual indices selected (0 and 1) do not correspond to
3881 our selected indices 0 and 3. That's because we are selecting the 0th
3882 and 3rd rows, not rows whose indices equal 0 and 3.
3883
3884 >>> df.take([0, 3])
3885 name class max_speed
3886 0 falcon bird 389.0
3887 1 monkey mammal NaN
3888
3889 Take elements at indices 1 and 2 along the axis 1 (column selection).
3890
3891 >>> df.take([1, 2], axis=1)
3892 class max_speed
3893 0 bird 389.0
3894 2 bird 24.0
3895 3 mammal 80.5
3896 1 mammal NaN
3897
        We may take elements using negative integers, which count from
        the end of the object, just like with Python lists.
3900
3901 >>> df.take([-1, -2])
3902 name class max_speed
3903 1 monkey mammal NaN
3904 3 lion mammal 80.5
3905 """
3906
3907 nv.validate_take((), kwargs)
3908
3909 return self._take(indices, axis)
3910
3911 def _take(
3912 self: NDFrameT,
3913 indices,
3914 axis: Axis = 0,
3915 convert_indices: bool_t = True,
3916 ) -> NDFrameT:
3917 """
        Internal version of the `take` method allowing specification of
        additional args.
3919
3920 See the docstring of `take` for full explanation of the parameters.
3921 """
3922 if not isinstance(indices, slice):
3923 indices = np.asarray(indices, dtype=np.intp)
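            # Under copy-on-write, taking every row in its original order is
            # a no-op; return a lazy copy instead of going through the
            # manager's take machinery.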
3924 if (
3925 axis == 0
3926 and indices.ndim == 1
3927 and using_copy_on_write()
3928 and is_range_indexer(indices, len(self))
3929 ):
3930 return self.copy(deep=None)
3931
3932 new_data = self._mgr.take(
3933 indices,
3934 axis=self._get_block_manager_axis(axis),
3935 verify=True,
3936 convert_indices=convert_indices,
3937 )
3938 return self._constructor(new_data).__finalize__(self, method="take")
3939
3940 def _take_with_is_copy(self: NDFrameT, indices, axis: Axis = 0) -> NDFrameT:
3941 """
        Internal version of the `take` method that sets the `_is_copy`
        attribute to keep track of the parent dataframe (used in indexing
        to support the SettingWithCopyWarning).
3945
3946 See the docstring of `take` for full explanation of the parameters.
3947 """
3948 result = self._take(indices=indices, axis=axis)
        # Only mark the result as a copy if the take actually changed the axis.
3950 if not result._get_axis(axis).equals(self._get_axis(axis)):
3951 result._set_is_copy(self)
3952 return result
3953
3954 @final
3955 def xs(
3956 self: NDFrameT,
3957 key: IndexLabel,
3958 axis: Axis = 0,
3959 level: IndexLabel = None,
3960 drop_level: bool_t = True,
3961 ) -> NDFrameT:
3962 """
3963 Return cross-section from the Series/DataFrame.
3964
3965 This method takes a `key` argument to select data at a particular
3966 level of a MultiIndex.
3967
3968 Parameters
3969 ----------
3970 key : label or tuple of label
3971 Label contained in the index, or partially in a MultiIndex.
3972 axis : {0 or 'index', 1 or 'columns'}, default 0
3973 Axis to retrieve cross-section on.
3974 level : object, defaults to first n levels (n=1 or len(key))
3975 In case of a key partially contained in a MultiIndex, indicate
3976 which levels are used. Levels can be referred by label or position.
3977 drop_level : bool, default True
3978 If False, returns object with same levels as self.
3979
3980 Returns
3981 -------
3982 Series or DataFrame
3983 Cross-section from the original Series or DataFrame
3984 corresponding to the selected index levels.
3985
3986 See Also
3987 --------
3988 DataFrame.loc : Access a group of rows and columns
3989 by label(s) or a boolean array.
3990 DataFrame.iloc : Purely integer-location based indexing
3991 for selection by position.
3992
3993 Notes
3994 -----
        `xs` cannot be used to set values.
3996
3997 MultiIndex Slicers is a generic way to get/set values on
3998 any level or levels.
3999 It is a superset of `xs` functionality, see
4000 :ref:`MultiIndex Slicers <advanced.mi_slicers>`.
4001
4002 Examples
4003 --------
4004 >>> d = {'num_legs': [4, 4, 2, 2],
4005 ... 'num_wings': [0, 0, 2, 2],
4006 ... 'class': ['mammal', 'mammal', 'mammal', 'bird'],
4007 ... 'animal': ['cat', 'dog', 'bat', 'penguin'],
4008 ... 'locomotion': ['walks', 'walks', 'flies', 'walks']}
4009 >>> df = pd.DataFrame(data=d)
4010 >>> df = df.set_index(['class', 'animal', 'locomotion'])
4011 >>> df
4012 num_legs num_wings
4013 class animal locomotion
4014 mammal cat walks 4 0
4015 dog walks 4 0
4016 bat flies 2 2
4017 bird penguin walks 2 2
4018
4019 Get values at specified index
4020
4021 >>> df.xs('mammal')
4022 num_legs num_wings
4023 animal locomotion
4024 cat walks 4 0
4025 dog walks 4 0
4026 bat flies 2 2
4027
4028 Get values at several indexes
4029
4030 >>> df.xs(('mammal', 'dog', 'walks'))
4031 num_legs 4
4032 num_wings 0
4033 Name: (mammal, dog, walks), dtype: int64
4034
4035 Get values at specified index and level
4036
4037 >>> df.xs('cat', level=1)
4038 num_legs num_wings
4039 class locomotion
4040 mammal walks 4 0
4041
4042 Get values at several indexes and levels
4043
4044 >>> df.xs(('bird', 'walks'),
4045 ... level=[0, 'locomotion'])
4046 num_legs num_wings
4047 animal
4048 penguin 2 2
4049
4050 Get values at specified column and axis
4051
4052 >>> df.xs('num_wings', axis=1)
4053 class animal locomotion
4054 mammal cat walks 0
4055 dog walks 0
4056 bat flies 2
4057 bird penguin walks 2
4058 Name: num_wings, dtype: int64
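
        Keep all index levels with ``drop_level=False`` (output elided; the
        result retains the full MultiIndex):

        >>> df.xs('mammal', drop_level=False)  # doctest: +SKIP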
4059 """
4060 axis = self._get_axis_number(axis)
4061 labels = self._get_axis(axis)
4062
4063 if isinstance(key, list):
4064 raise TypeError("list keys are not supported in xs, pass a tuple instead")
4065
4066 if level is not None:
4067 if not isinstance(labels, MultiIndex):
4068 raise TypeError("Index must be a MultiIndex")
4069 loc, new_ax = labels.get_loc_level(key, level=level, drop_level=drop_level)
4070
4071 # create the tuple of the indexer
4072 _indexer = [slice(None)] * self.ndim
4073 _indexer[axis] = loc
4074 indexer = tuple(_indexer)
4075
4076 result = self.iloc[indexer]
4077 setattr(result, result._get_axis_name(axis), new_ax)
4078 return result
4079
4080 if axis == 1:
4081 if drop_level:
4082 return self[key]
4083 index = self.columns
4084 else:
4085 index = self.index
4086
4087 if isinstance(index, MultiIndex):
4088 loc, new_index = index._get_loc_level(key, level=0)
4089 if not drop_level:
4090 if lib.is_integer(loc):
4091 new_index = index[loc : loc + 1]
4092 else:
4093 new_index = index[loc]
4094 else:
4095 loc = index.get_loc(key)
4096
4097 if isinstance(loc, np.ndarray):
4098 if loc.dtype == np.bool_:
4099 (inds,) = loc.nonzero()
4100 return self._take_with_is_copy(inds, axis=axis)
4101 else:
4102 return self._take_with_is_copy(loc, axis=axis)
4103
4104 if not is_scalar(loc):
4105 new_index = index[loc]
4106
4107 if is_scalar(loc) and axis == 0:
4108 # In this case loc should be an integer
4109 if self.ndim == 1:
4110 # if we encounter an array-like and we only have 1 dim
                # that means that there are lists/ndarrays inside the Series!
4112 # so just return them (GH 6394)
4113 return self._values[loc]
4114
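            # fast_xs materializes a single row across all blocks into a new
            # single-row manager; the sliced constructor (a Series for
            # DataFrame callers) wraps it below.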
4115 new_mgr = self._mgr.fast_xs(loc)
4116
4117 result = self._constructor_sliced(
4118 new_mgr, name=self.index[loc]
4119 ).__finalize__(self)
4120 elif is_scalar(loc):
4121 result = self.iloc[:, slice(loc, loc + 1)]
4122 elif axis == 1:
4123 result = self.iloc[:, loc]
4124 else:
4125 result = self.iloc[loc]
4126 result.index = new_index
4127
4128 # this could be a view
4129 # but only in a single-dtyped view sliceable case
4130 result._set_is_copy(self, copy=not result._is_view)
4131 return result
4132
4133 def __getitem__(self, item):
4134 raise AbstractMethodError(self)
4135
4136 def _slice(self: NDFrameT, slobj: slice, axis: Axis = 0) -> NDFrameT:
4137 """
4138 Construct a slice of this container.
4139
4140 Slicing with this method is *always* positional.
4141 """
4142 assert isinstance(slobj, slice), type(slobj)
4143 axis = self._get_block_manager_axis(axis)
4144 result = self._constructor(self._mgr.get_slice(slobj, axis=axis))
4145 result = result.__finalize__(self)
4146
4147 # this could be a view
4148 # but only in a single-dtyped view sliceable case
4149 is_copy = axis != 0 or result._is_view
4150 result._set_is_copy(self, copy=is_copy)
4151 return result
4152
4153 @final
4154 def _set_is_copy(self, ref: NDFrame, copy: bool_t = True) -> None:
4155 if not copy:
4156 self._is_copy = None
4157 else:
4158 assert ref is not None
4159 self._is_copy = weakref.ref(ref)
4160
4161 def _check_is_chained_assignment_possible(self) -> bool_t:
4162 """
4163 Check if we are a view, have a cacher, and are of mixed type.
4164 If so, then force a setitem_copy check.
4165
        Should be called just prior to setting a value.

        Returns True if we are a view, have a cacher, and are single-dtyped,
        meaning that the cacher should be updated following the setting.
4171 """
4172 if self._is_copy:
4173 self._check_setitem_copy(t="referent")
4174 return False
4175
4176 @final
4177 def _check_setitem_copy(self, t: str = "setting", force: bool_t = False):
4178 """
        Validate if we are doing a setitem on a chained copy.

        Parameters
        ----------
        t : str, the type of setting error
        force : bool, default False
            If True, then force showing an error.

        Notes
        -----
        It is technically possible to figure out that we are setting on
        a copy even WITH a multi-dtyped pandas object. In other words, some
        blocks may be views while others are not. Currently _is_view will
        ALWAYS return False for multi-blocks to avoid having to handle this
        case.

        df = DataFrame(np.arange(0, 9), columns=['count'])
        df['group'] = 'b'

        # This technically need not raise SettingWithCopy if both are views
        # (which is not generally guaranteed but is usually True). However,
        # this is in general not a good practice and we recommend using .loc.
        df.iloc[0:5]['group'] = 'a'

4201 """
4202 if using_copy_on_write():
4203 return
4204
4205 # return early if the check is not needed
4206 if not (force or self._is_copy):
4207 return
4208
4209 value = config.get_option("mode.chained_assignment")
4210 if value is None:
4211 return
4212
4213 # see if the copy is not actually referred; if so, then dissolve
4214 # the copy weakref
4215 if self._is_copy is not None and not isinstance(self._is_copy, str):
4216 r = self._is_copy()
4217 if not gc.get_referents(r) or (r is not None and r.shape == self.shape):
4218 self._is_copy = None
4219 return
4220
4221 # a custom message
4222 if isinstance(self._is_copy, str):
4223 t = self._is_copy
4224
4225 elif t == "referent":
4226 t = (
4227 "\n"
4228 "A value is trying to be set on a copy of a slice from a "
4229 "DataFrame\n\n"
4230 "See the caveats in the documentation: "
4231 "https://pandas.pydata.org/pandas-docs/stable/user_guide/"
4232 "indexing.html#returning-a-view-versus-a-copy"
4233 )
4234
4235 else:
4236 t = (
4237 "\n"
4238 "A value is trying to be set on a copy of a slice from a "
4239 "DataFrame.\n"
4240 "Try using .loc[row_indexer,col_indexer] = value "
4241 "instead\n\nSee the caveats in the documentation: "
4242 "https://pandas.pydata.org/pandas-docs/stable/user_guide/"
4243 "indexing.html#returning-a-view-versus-a-copy"
4244 )
4245
4246 if value == "raise":
4247 raise SettingWithCopyError(t)
4248 if value == "warn":
4249 warnings.warn(t, SettingWithCopyWarning, stacklevel=find_stack_level())
4250
4251 def __delitem__(self, key) -> None:
4252 """
4253 Delete item
4254 """
4255 deleted = False
4256
4257 maybe_shortcut = False
4258 if self.ndim == 2 and isinstance(self.columns, MultiIndex):
4259 try:
4260 # By using engine's __contains__ we effectively
4261 # restrict to same-length tuples
4262 maybe_shortcut = key not in self.columns._engine
4263 except TypeError:
4264 pass
4265
4266 if maybe_shortcut:
4267 # Allow shorthand to delete all columns whose first len(key)
4268 # elements match key:
4269 if not isinstance(key, tuple):
4270 key = (key,)
4271 for col in self.columns:
4272 if isinstance(col, tuple) and col[: len(key)] == key:
4273 del self[col]
4274 deleted = True
4275 if not deleted:
4276 # If the above loop ran and didn't delete anything because
4277 # there was no match, this call should raise the appropriate
4278 # exception:
4279 loc = self.axes[-1].get_loc(key)
4280 self._mgr = self._mgr.idelete(loc)
4281
4282 # delete from the caches
4283 try:
4284 del self._item_cache[key]
4285 except KeyError:
4286 pass
4287
4288 # ----------------------------------------------------------------------
4289 # Unsorted
4290
4291 @final
4292 def _check_inplace_and_allows_duplicate_labels(self, inplace):
4293 if inplace and not self.flags.allows_duplicate_labels:
4294 raise ValueError(
4295 "Cannot specify 'inplace=True' when "
4296 "'self.flags.allows_duplicate_labels' is False."
4297 )
4298
4299 @final
4300 def get(self, key, default=None):
4301 """
4302 Get item from object for given key (ex: DataFrame column).
4303
4304 Returns default value if not found.
4305
        Parameters
        ----------
        key : object
            Key for which to look up a value in the object.
        default : object, default None
            Value to return if the key is not found.

        Returns
        -------
        same type as items contained in object
4313
4314 Examples
4315 --------
4316 >>> df = pd.DataFrame(
4317 ... [
4318 ... [24.3, 75.7, "high"],
4319 ... [31, 87.8, "high"],
4320 ... [22, 71.6, "medium"],
4321 ... [35, 95, "medium"],
4322 ... ],
4323 ... columns=["temp_celsius", "temp_fahrenheit", "windspeed"],
4324 ... index=pd.date_range(start="2014-02-12", end="2014-02-15", freq="D"),
4325 ... )
4326
4327 >>> df
4328 temp_celsius temp_fahrenheit windspeed
4329 2014-02-12 24.3 75.7 high
4330 2014-02-13 31.0 87.8 high
4331 2014-02-14 22.0 71.6 medium
4332 2014-02-15 35.0 95.0 medium
4333
4334 >>> df.get(["temp_celsius", "windspeed"])
4335 temp_celsius windspeed
4336 2014-02-12 24.3 high
4337 2014-02-13 31.0 high
4338 2014-02-14 22.0 medium
4339 2014-02-15 35.0 medium
4340
4341 >>> ser = df['windspeed']
4342 >>> ser.get('2014-02-13')
4343 'high'
4344
4345 If the key isn't found, the default value will be used.
4346
4347 >>> df.get(["temp_celsius", "temp_kelvin"], default="default_value")
4348 'default_value'
4349
4350 >>> ser.get('2014-02-10', '[unknown]')
4351 '[unknown]'
4352 """
4353 try:
4354 return self[key]
4355 except (KeyError, ValueError, IndexError):
4356 return default
4357
4358 @final
4359 @property
4360 def _is_view(self) -> bool_t:
4361 """Return boolean indicating if self is view of another array"""
4362 return self._mgr.is_view
4363
4364 @final
4365 def reindex_like(
4366 self: NDFrameT,
4367 other,
4368 method: Literal["backfill", "bfill", "pad", "ffill", "nearest"] | None = None,
4369 copy: bool_t | None = None,
4370 limit=None,
4371 tolerance=None,
4372 ) -> NDFrameT:
4373 """
        Return an object with indices matching those of another object.
4375
4376 Conform the object to the same index on all axes. Optional
4377 filling logic, placing NaN in locations having no value
4378 in the previous index. A new object is produced unless the
4379 new index is equivalent to the current one and copy=False.
4380
4381 Parameters
4382 ----------
4383 other : Object of the same data type
4384 Its row and column indices are used to define the new indices
4385 of this object.
4386 method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}
4387 Method to use for filling holes in reindexed DataFrame.
4388 Please note: this is only applicable to DataFrames/Series with a
4389 monotonically increasing/decreasing index.
4390
4391 * None (default): don't fill gaps
4392 * pad / ffill: propagate last valid observation forward to next
4393 valid
4394 * backfill / bfill: use next valid observation to fill gap
4395 * nearest: use nearest valid observations to fill gap.
4396
4397 copy : bool, default True
4398 Return a new object, even if the passed indexes are the same.
4399 limit : int, default None
4400 Maximum number of consecutive labels to fill for inexact matches.
4401 tolerance : optional
4402 Maximum distance between original and new labels for inexact
4403 matches. The values of the index at the matching locations must
4404 satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
4405
4406 Tolerance may be a scalar value, which applies the same tolerance
4407 to all values, or list-like, which applies variable tolerance per
4408 element. List-like includes list, tuple, array, Series, and must be
4409 the same size as the index and its dtype must exactly match the
4410 index's type.
4411
4412 Returns
4413 -------
4414 Series or DataFrame
4415 Same type as caller, but with changed indices on each axis.
4416
4417 See Also
4418 --------
4419 DataFrame.set_index : Set row labels.
4420 DataFrame.reset_index : Remove row labels or move them to new columns.
4421 DataFrame.reindex : Change to new indices or expand indices.
4422
4423 Notes
4424 -----
4425 Same as calling
4426 ``.reindex(index=other.index, columns=other.columns,...)``.
4427
4428 Examples
4429 --------
4430 >>> df1 = pd.DataFrame([[24.3, 75.7, 'high'],
4431 ... [31, 87.8, 'high'],
4432 ... [22, 71.6, 'medium'],
4433 ... [35, 95, 'medium']],
4434 ... columns=['temp_celsius', 'temp_fahrenheit',
4435 ... 'windspeed'],
4436 ... index=pd.date_range(start='2014-02-12',
4437 ... end='2014-02-15', freq='D'))
4438
4439 >>> df1
4440 temp_celsius temp_fahrenheit windspeed
4441 2014-02-12 24.3 75.7 high
4442 2014-02-13 31.0 87.8 high
4443 2014-02-14 22.0 71.6 medium
4444 2014-02-15 35.0 95.0 medium
4445
4446 >>> df2 = pd.DataFrame([[28, 'low'],
4447 ... [30, 'low'],
4448 ... [35.1, 'medium']],
4449 ... columns=['temp_celsius', 'windspeed'],
4450 ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13',
4451 ... '2014-02-15']))
4452
4453 >>> df2
4454 temp_celsius windspeed
4455 2014-02-12 28.0 low
4456 2014-02-13 30.0 low
4457 2014-02-15 35.1 medium
4458
4459 >>> df2.reindex_like(df1)
4460 temp_celsius temp_fahrenheit windspeed
4461 2014-02-12 28.0 NaN low
4462 2014-02-13 30.0 NaN low
4463 2014-02-14 NaN NaN NaN
4464 2014-02-15 35.1 NaN medium
4465 """
4466 d = other._construct_axes_dict(
4467 axes=self._AXIS_ORDERS,
4468 method=method,
4469 copy=copy,
4470 limit=limit,
4471 tolerance=tolerance,
4472 )
4473
4474 return self.reindex(**d)
4475
4476 @overload
4477 def drop(
4478 self,
4479 labels: IndexLabel = ...,
4480 *,
4481 axis: Axis = ...,
4482 index: IndexLabel = ...,
4483 columns: IndexLabel = ...,
4484 level: Level | None = ...,
4485 inplace: Literal[True],
4486 errors: IgnoreRaise = ...,
4487 ) -> None:
4488 ...
4489
4490 @overload
4491 def drop(
4492 self: NDFrameT,
4493 labels: IndexLabel = ...,
4494 *,
4495 axis: Axis = ...,
4496 index: IndexLabel = ...,
4497 columns: IndexLabel = ...,
4498 level: Level | None = ...,
4499 inplace: Literal[False] = ...,
4500 errors: IgnoreRaise = ...,
4501 ) -> NDFrameT:
4502 ...
4503
4504 @overload
4505 def drop(
4506 self: NDFrameT,
4507 labels: IndexLabel = ...,
4508 *,
4509 axis: Axis = ...,
4510 index: IndexLabel = ...,
4511 columns: IndexLabel = ...,
4512 level: Level | None = ...,
4513 inplace: bool_t = ...,
4514 errors: IgnoreRaise = ...,
4515 ) -> NDFrameT | None:
4516 ...
4517
4518 def drop(
4519 self: NDFrameT,
4520 labels: IndexLabel = None,
4521 *,
4522 axis: Axis = 0,
4523 index: IndexLabel = None,
4524 columns: IndexLabel = None,
4525 level: Level | None = None,
4526 inplace: bool_t = False,
4527 errors: IgnoreRaise = "raise",
4528 ) -> NDFrameT | None:
4529 inplace = validate_bool_kwarg(inplace, "inplace")
4530
4531 if labels is not None:
4532 if index is not None or columns is not None:
4533 raise ValueError("Cannot specify both 'labels' and 'index'/'columns'")
4534 axis_name = self._get_axis_name(axis)
4535 axes = {axis_name: labels}
4536 elif index is not None or columns is not None:
4537 axes = {"index": index}
4538 if self.ndim == 2:
4539 axes["columns"] = columns
4540 else:
4541 raise ValueError(
4542 "Need to specify at least one of 'labels', 'index' or 'columns'"
4543 )
4544
4545 obj = self
4546
4547 for axis, labels in axes.items():
4548 if labels is not None:
4549 obj = obj._drop_axis(labels, axis, level=level, errors=errors)
4550
4551 if inplace:
4552 self._update_inplace(obj)
4553 return None
4554 else:
4555 return obj
4556
4557 @final
4558 def _drop_axis(
4559 self: NDFrameT,
4560 labels,
4561 axis,
4562 level=None,
4563 errors: IgnoreRaise = "raise",
4564 only_slice: bool_t = False,
4565 ) -> NDFrameT:
4566 """
4567 Drop labels from specified axis. Used in the ``drop`` method
4568 internally.
4569
4570 Parameters
4571 ----------
4572 labels : single label or list-like
4573 axis : int or axis name
4574 level : int or level name, default None
4575 For MultiIndex
4576 errors : {'ignore', 'raise'}, default 'raise'
4577 If 'ignore', suppress error and existing labels are dropped.
4578 only_slice : bool, default False
4579 Whether indexing along columns should be view-only.
4580
4581 """
4582 axis_num = self._get_axis_number(axis)
4583 axis = self._get_axis(axis)
4584
4585 if axis.is_unique:
4586 if level is not None:
4587 if not isinstance(axis, MultiIndex):
4588 raise AssertionError("axis must be a MultiIndex")
4589 new_axis = axis.drop(labels, level=level, errors=errors)
4590 else:
4591 new_axis = axis.drop(labels, errors=errors)
4592 indexer = axis.get_indexer(new_axis)
4593
4594 # Case for non-unique axis
4595 else:
4596 is_tuple_labels = is_nested_list_like(labels) or isinstance(labels, tuple)
4597 labels = ensure_object(common.index_labels_to_array(labels))
4598 if level is not None:
4599 if not isinstance(axis, MultiIndex):
4600 raise AssertionError("axis must be a MultiIndex")
4601 mask = ~axis.get_level_values(level).isin(labels)
4602
4603 # GH 18561 MultiIndex.drop should raise if label is absent
4604 if errors == "raise" and mask.all():
4605 raise KeyError(f"{labels} not found in axis")
4606 elif (
4607 isinstance(axis, MultiIndex)
4608 and labels.dtype == "object"
4609 and not is_tuple_labels
4610 ):
4611 # Set level to zero in case of MultiIndex and label is string,
4612 # because isin can't handle strings for MultiIndexes GH#36293
4613 # In case of tuples we get dtype object but have to use isin GH#42771
4614 mask = ~axis.get_level_values(0).isin(labels)
4615 else:
4616 mask = ~axis.isin(labels)
4617 # Check if label doesn't exist along axis
4618 labels_missing = (axis.get_indexer_for(labels) == -1).any()
4619 if errors == "raise" and labels_missing:
4620 raise KeyError(f"{labels} not found in axis")
4621
4622 if is_extension_array_dtype(mask.dtype):
4623 # GH#45860
4624 mask = mask.to_numpy(dtype=bool)
4625
4626 indexer = mask.nonzero()[0]
4627 new_axis = axis.take(indexer)
4628
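        # Translate the user-facing axis into the block-manager axis; the
        # manager stores 2D data transposed.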
4629 bm_axis = self.ndim - axis_num - 1
4630 new_mgr = self._mgr.reindex_indexer(
4631 new_axis,
4632 indexer,
4633 axis=bm_axis,
4634 allow_dups=True,
4635 copy=None,
4636 only_slice=only_slice,
4637 )
4638 result = self._constructor(new_mgr)
4639 if self.ndim == 1:
4640 result.name = self.name
4641
4642 return result.__finalize__(self)
4643
4644 @final
4645 def _update_inplace(self, result, verify_is_copy: bool_t = True) -> None:
4646 """
4647 Replace self internals with result.
4648
4649 Parameters
4650 ----------
4651 result : same type as self
4652 verify_is_copy : bool, default True
4653 Provide is_copy checks.
4654 """
4655 # NOTE: This does *not* call __finalize__ and that's an explicit
4656 # decision that we may revisit in the future.
4657 self._reset_cache()
4658 self._clear_item_cache()
4659 self._mgr = result._mgr
4660 self._maybe_update_cacher(verify_is_copy=verify_is_copy, inplace=True)
4661
4662 @final
4663 def add_prefix(self: NDFrameT, prefix: str, axis: Axis | None = None) -> NDFrameT:
4664 """
4665 Prefix labels with string `prefix`.
4666
4667 For Series, the row labels are prefixed.
4668 For DataFrame, the column labels are prefixed.
4669
4670 Parameters
4671 ----------
4672 prefix : str
4673 The string to add before each label.
        axis : {0 or 'index', 1 or 'columns', None}, default None
            Axis to add prefix on.
4676
4677 .. versionadded:: 2.0.0
4678
4679 Returns
4680 -------
4681 Series or DataFrame
4682 New Series or DataFrame with updated labels.
4683
4684 See Also
4685 --------
4686 Series.add_suffix: Suffix row labels with string `suffix`.
4687 DataFrame.add_suffix: Suffix column labels with string `suffix`.
4688
4689 Examples
4690 --------
4691 >>> s = pd.Series([1, 2, 3, 4])
4692 >>> s
4693 0 1
4694 1 2
4695 2 3
4696 3 4
4697 dtype: int64
4698
4699 >>> s.add_prefix('item_')
4700 item_0 1
4701 item_1 2
4702 item_2 3
4703 item_3 4
4704 dtype: int64
4705
4706 >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
4707 >>> df
4708 A B
4709 0 1 3
4710 1 2 4
4711 2 3 5
4712 3 4 6
4713
4714 >>> df.add_prefix('col_')
4715 col_A col_B
4716 0 1 3
4717 1 2 4
4718 2 3 5
4719 3 4 6
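
        With ``axis``, the row labels can be prefixed instead (output elided):

        >>> df.add_prefix('row_', axis=0)  # doctest: +SKIP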
4720 """
4721 f = lambda x: f"{prefix}{x}"
4722
4723 axis_name = self._info_axis_name
4724 if axis is not None:
4725 axis_name = self._get_axis_name(axis)
4726
4727 mapper = {axis_name: f}
4728
4729 # error: Incompatible return value type (got "Optional[NDFrameT]",
4730 # expected "NDFrameT")
4731 # error: Argument 1 to "rename" of "NDFrame" has incompatible type
4732 # "**Dict[str, partial[str]]"; expected "Union[str, int, None]"
4733 # error: Keywords must be strings
4734 return self._rename(**mapper) # type: ignore[return-value, arg-type, misc]
4735
4736 @final
4737 def add_suffix(self: NDFrameT, suffix: str, axis: Axis | None = None) -> NDFrameT:
4738 """
4739 Suffix labels with string `suffix`.
4740
4741 For Series, the row labels are suffixed.
4742 For DataFrame, the column labels are suffixed.
4743
4744 Parameters
4745 ----------
4746 suffix : str
4747 The string to add after each label.
        axis : {0 or 'index', 1 or 'columns', None}, default None
            Axis to add suffix on.
4750
4751 .. versionadded:: 2.0.0
4752
4753 Returns
4754 -------
4755 Series or DataFrame
4756 New Series or DataFrame with updated labels.
4757
4758 See Also
4759 --------
4760 Series.add_prefix: Prefix row labels with string `prefix`.
4761 DataFrame.add_prefix: Prefix column labels with string `prefix`.
4762
4763 Examples
4764 --------
4765 >>> s = pd.Series([1, 2, 3, 4])
4766 >>> s
4767 0 1
4768 1 2
4769 2 3
4770 3 4
4771 dtype: int64
4772
4773 >>> s.add_suffix('_item')
4774 0_item 1
4775 1_item 2
4776 2_item 3
4777 3_item 4
4778 dtype: int64
4779
4780 >>> df = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [3, 4, 5, 6]})
4781 >>> df
4782 A B
4783 0 1 3
4784 1 2 4
4785 2 3 5
4786 3 4 6
4787
4788 >>> df.add_suffix('_col')
4789 A_col B_col
4790 0 1 3
4791 1 2 4
4792 2 3 5
4793 3 4 6
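
        With ``axis``, the row labels can be suffixed instead (output elided):

        >>> df.add_suffix('_row', axis=0)  # doctest: +SKIP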
4794 """
4795 f = lambda x: f"{x}{suffix}"
4796
4797 axis_name = self._info_axis_name
4798 if axis is not None:
4799 axis_name = self._get_axis_name(axis)
4800
4801 mapper = {axis_name: f}
4802 # error: Incompatible return value type (got "Optional[NDFrameT]",
4803 # expected "NDFrameT")
4804 # error: Argument 1 to "rename" of "NDFrame" has incompatible type
4805 # "**Dict[str, partial[str]]"; expected "Union[str, int, None]"
4806 # error: Keywords must be strings
4807 return self._rename(**mapper) # type: ignore[return-value, arg-type, misc]
4808
4809 @overload
4810 def sort_values(
4811 self: NDFrameT,
4812 *,
4813 axis: Axis = ...,
4814 ascending: bool_t | Sequence[bool_t] = ...,
4815 inplace: Literal[False] = ...,
4816 kind: str = ...,
4817 na_position: str = ...,
4818 ignore_index: bool_t = ...,
4819 key: ValueKeyFunc = ...,
4820 ) -> NDFrameT:
4821 ...
4822
4823 @overload
4824 def sort_values(
4825 self,
4826 *,
4827 axis: Axis = ...,
4828 ascending: bool_t | Sequence[bool_t] = ...,
4829 inplace: Literal[True],
4830 kind: str = ...,
4831 na_position: str = ...,
4832 ignore_index: bool_t = ...,
4833 key: ValueKeyFunc = ...,
4834 ) -> None:
4835 ...
4836
4837 @overload
4838 def sort_values(
4839 self: NDFrameT,
4840 *,
4841 axis: Axis = ...,
4842 ascending: bool_t | Sequence[bool_t] = ...,
4843 inplace: bool_t = ...,
4844 kind: str = ...,
4845 na_position: str = ...,
4846 ignore_index: bool_t = ...,
4847 key: ValueKeyFunc = ...,
4848 ) -> NDFrameT | None:
4849 ...
4850
4851 def sort_values(
4852 self: NDFrameT,
4853 *,
4854 axis: Axis = 0,
4855 ascending: bool_t | Sequence[bool_t] = True,
4856 inplace: bool_t = False,
4857 kind: str = "quicksort",
4858 na_position: str = "last",
4859 ignore_index: bool_t = False,
4860 key: ValueKeyFunc = None,
4861 ) -> NDFrameT | None:
4862 """
4863 Sort by the values along either axis.
4864
4865 Parameters
4866 ----------%(optional_by)s
4867 axis : %(axes_single_arg)s, default 0
4868 Axis to be sorted.
4869 ascending : bool or list of bool, default True
4870 Sort ascending vs. descending. Specify list for multiple sort
4871 orders. If this is a list of bools, must match the length of
            ``by``.
4873 inplace : bool, default False
4874 If True, perform operation in-place.
4875 kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
4876 Choice of sorting algorithm. See also :func:`numpy.sort` for more
4877 information. `mergesort` and `stable` are the only stable algorithms. For
4878 DataFrames, this option is only applied when sorting on a single
4879 column or label.
4880 na_position : {'first', 'last'}, default 'last'
4881 Puts NaNs at the beginning if `first`; `last` puts NaNs at the
4882 end.
4883 ignore_index : bool, default False
4884 If True, the resulting axis will be labeled 0, 1, …, n - 1.
4885 key : callable, optional
4886 Apply the key function to the values
4887 before sorting. This is similar to the `key` argument in the
4888 builtin :meth:`sorted` function, with the notable difference that
4889 this `key` function should be *vectorized*. It should expect a
4890 ``Series`` and return a Series with the same shape as the input.
4891 It will be applied to each column in `by` independently.
4892
4893 .. versionadded:: 1.1.0
4894
4895 Returns
4896 -------
4897 DataFrame or None
4898 DataFrame with sorted values or None if ``inplace=True``.
4899
4900 See Also
4901 --------
4902 DataFrame.sort_index : Sort a DataFrame by the index.
4903 Series.sort_values : Similar method for a Series.
4904
4905 Examples
4906 --------
4907 >>> df = pd.DataFrame({
4908 ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'],
4909 ... 'col2': [2, 1, 9, 8, 7, 4],
4910 ... 'col3': [0, 1, 9, 4, 2, 3],
4911 ... 'col4': ['a', 'B', 'c', 'D', 'e', 'F']
4912 ... })
4913 >>> df
4914 col1 col2 col3 col4
4915 0 A 2 0 a
4916 1 A 1 1 B
4917 2 B 9 9 c
4918 3 NaN 8 4 D
4919 4 D 7 2 e
4920 5 C 4 3 F
4921
4922 Sort by col1
4923
4924 >>> df.sort_values(by=['col1'])
4925 col1 col2 col3 col4
4926 0 A 2 0 a
4927 1 A 1 1 B
4928 2 B 9 9 c
4929 5 C 4 3 F
4930 4 D 7 2 e
4931 3 NaN 8 4 D
4932
4933 Sort by multiple columns
4934
4935 >>> df.sort_values(by=['col1', 'col2'])
4936 col1 col2 col3 col4
4937 1 A 1 1 B
4938 0 A 2 0 a
4939 2 B 9 9 c
4940 5 C 4 3 F
4941 4 D 7 2 e
4942 3 NaN 8 4 D
4943
4944 Sort Descending
4945
4946 >>> df.sort_values(by='col1', ascending=False)
4947 col1 col2 col3 col4
4948 4 D 7 2 e
4949 5 C 4 3 F
4950 2 B 9 9 c
4951 0 A 2 0 a
4952 1 A 1 1 B
4953 3 NaN 8 4 D
4954
4955 Putting NAs first
4956
4957 >>> df.sort_values(by='col1', ascending=False, na_position='first')
4958 col1 col2 col3 col4
4959 3 NaN 8 4 D
4960 4 D 7 2 e
4961 5 C 4 3 F
4962 2 B 9 9 c
4963 0 A 2 0 a
4964 1 A 1 1 B
4965
4966 Sorting with a key function
4967
4968 >>> df.sort_values(by='col4', key=lambda col: col.str.lower())
4969 col1 col2 col3 col4
4970 0 A 2 0 a
4971 1 A 1 1 B
4972 2 B 9 9 c
4973 3 NaN 8 4 D
4974 4 D 7 2 e
4975 5 C 4 3 F
4976
4977 Natural sort with the key argument,
        using the `natsort <https://github.com/SethMMorton/natsort>`__ package.
4979
4980 >>> df = pd.DataFrame({
4981 ... "time": ['0hr', '128hr', '72hr', '48hr', '96hr'],
4982 ... "value": [10, 20, 30, 40, 50]
4983 ... })
4984 >>> df
4985 time value
4986 0 0hr 10
4987 1 128hr 20
4988 2 72hr 30
4989 3 48hr 40
4990 4 96hr 50
4991 >>> from natsort import index_natsorted
4992 >>> df.sort_values(
4993 ... by="time",
4994 ... key=lambda x: np.argsort(index_natsorted(df["time"]))
4995 ... )
4996 time value
4997 0 0hr 10
4998 3 48hr 40
4999 2 72hr 30
5000 4 96hr 50
5001 1 128hr 20
5002 """
5003 raise AbstractMethodError(self)
5004
5005 @overload
5006 def sort_index(
5007 self,
5008 *,
5009 axis: Axis = ...,
5010 level: IndexLabel = ...,
5011 ascending: bool_t | Sequence[bool_t] = ...,
5012 inplace: Literal[True],
5013 kind: SortKind = ...,
5014 na_position: NaPosition = ...,
5015 sort_remaining: bool_t = ...,
5016 ignore_index: bool_t = ...,
5017 key: IndexKeyFunc = ...,
5018 ) -> None:
5019 ...
5020
5021 @overload
5022 def sort_index(
5023 self: NDFrameT,
5024 *,
5025 axis: Axis = ...,
5026 level: IndexLabel = ...,
5027 ascending: bool_t | Sequence[bool_t] = ...,
5028 inplace: Literal[False] = ...,
5029 kind: SortKind = ...,
5030 na_position: NaPosition = ...,
5031 sort_remaining: bool_t = ...,
5032 ignore_index: bool_t = ...,
5033 key: IndexKeyFunc = ...,
5034 ) -> NDFrameT:
5035 ...
5036
5037 @overload
5038 def sort_index(
5039 self: NDFrameT,
5040 *,
5041 axis: Axis = ...,
5042 level: IndexLabel = ...,
5043 ascending: bool_t | Sequence[bool_t] = ...,
5044 inplace: bool_t = ...,
5045 kind: SortKind = ...,
5046 na_position: NaPosition = ...,
5047 sort_remaining: bool_t = ...,
5048 ignore_index: bool_t = ...,
5049 key: IndexKeyFunc = ...,
5050 ) -> NDFrameT | None:
5051 ...
5052
5053 def sort_index(
5054 self: NDFrameT,
5055 *,
5056 axis: Axis = 0,
5057 level: IndexLabel = None,
5058 ascending: bool_t | Sequence[bool_t] = True,
5059 inplace: bool_t = False,
5060 kind: SortKind = "quicksort",
5061 na_position: NaPosition = "last",
5062 sort_remaining: bool_t = True,
5063 ignore_index: bool_t = False,
5064 key: IndexKeyFunc = None,
5065 ) -> NDFrameT | None:
5066 inplace = validate_bool_kwarg(inplace, "inplace")
5067 axis = self._get_axis_number(axis)
5068 ascending = validate_ascending(ascending)
5069
5070 target = self._get_axis(axis)
5071
5072 indexer = get_indexer_indexer(
5073 target, level, ascending, kind, na_position, sort_remaining, key
5074 )
5075
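        # A None indexer means the axis is already sorted as requested, so
        # we can skip the take and return self or a (lazy) copy directly.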
5076 if indexer is None:
5077 if inplace:
5078 result = self
5079 else:
5080 result = self.copy(deep=None)
5081
5082 if ignore_index:
5083 result.index = default_index(len(self))
5084 if inplace:
5085 return None
5086 else:
5087 return result
5088
5089 baxis = self._get_block_manager_axis(axis)
5090 new_data = self._mgr.take(indexer, axis=baxis, verify=False)
5091
5092 # reconstruct axis if needed
5093 new_data.set_axis(baxis, new_data.axes[baxis]._sort_levels_monotonic())
5094
5095 if ignore_index:
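            # set_axis here targets the block manager, whose axes are
            # transposed for DataFrames, so the row labels live on axis 1.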
5096 axis = 1 if isinstance(self, ABCDataFrame) else 0
5097 new_data.set_axis(axis, default_index(len(indexer)))
5098
5099 result = self._constructor(new_data)
5100
5101 if inplace:
5102 return self._update_inplace(result)
5103 else:
5104 return result.__finalize__(self, method="sort_index")
5105
5106 @doc(
5107 klass=_shared_doc_kwargs["klass"],
5108 optional_reindex="",
5109 )
5110 def reindex(
5111 self: NDFrameT,
5112 labels=None,
5113 index=None,
5114 columns=None,
5115 axis: Axis | None = None,
5116 method: str | None = None,
5117 copy: bool_t | None = None,
5118 level: Level | None = None,
5119 fill_value: Scalar | None = np.nan,
5120 limit: int | None = None,
5121 tolerance=None,
5122 ) -> NDFrameT:
5123 """
5124 Conform {klass} to new index with optional filling logic.
5125
5126 Places NA/NaN in locations having no value in the previous index. A new object
5127 is produced unless the new index is equivalent to the current one and
5128 ``copy=False``.
5129
5130 Parameters
5131 ----------
5132 {optional_reindex}
5133 method : {{None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}}
5134 Method to use for filling holes in reindexed DataFrame.
5135 Please note: this is only applicable to DataFrames/Series with a
5136 monotonically increasing/decreasing index.
5137
5138 * None (default): don't fill gaps
5139 * pad / ffill: Propagate last valid observation forward to next
5140 valid.
5141 * backfill / bfill: Use next valid observation to fill gap.
5142 * nearest: Use nearest valid observations to fill gap.
5143
5144 copy : bool, default True
5145 Return a new object, even if the passed indexes are the same.
5146 level : int or name
5147 Broadcast across a level, matching Index values on the
5148 passed MultiIndex level.
5149 fill_value : scalar, default np.NaN
5150 Value to use for missing values. Defaults to NaN, but can be any
5151 "compatible" value.
5152 limit : int, default None
5153 Maximum number of consecutive elements to forward or backward fill.
5154 tolerance : optional
5155 Maximum distance between original and new labels for inexact
            matches. The values of the index at the matching locations must
5157 satisfy the equation ``abs(index[indexer] - target) <= tolerance``.
5158
5159 Tolerance may be a scalar value, which applies the same tolerance
5160 to all values, or list-like, which applies variable tolerance per
5161 element. List-like includes list, tuple, array, Series, and must be
5162 the same size as the index and its dtype must exactly match the
5163 index's type.
5164
5165 Returns
5166 -------
5167 {klass} with changed index.
5168
5169 See Also
5170 --------
5171 DataFrame.set_index : Set row labels.
5172 DataFrame.reset_index : Remove row labels or move them to new columns.
5173 DataFrame.reindex_like : Change to same indices as other DataFrame.
5174
5175 Examples
5176 --------
5177 ``DataFrame.reindex`` supports two calling conventions
5178
5179 * ``(index=index_labels, columns=column_labels, ...)``
5180 * ``(labels, axis={{'index', 'columns'}}, ...)``
5181
5182 We *highly* recommend using keyword arguments to clarify your
5183 intent.
5184
5185 Create a dataframe with some fictional data.
5186
5187 >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
5188 >>> df = pd.DataFrame({{'http_status': [200, 200, 404, 404, 301],
5189 ... 'response_time': [0.04, 0.02, 0.07, 0.08, 1.0]}},
5190 ... index=index)
5191 >>> df
5192 http_status response_time
5193 Firefox 200 0.04
5194 Chrome 200 0.02
5195 Safari 404 0.07
5196 IE10 404 0.08
5197 Konqueror 301 1.00
5198
5199 Create a new index and reindex the dataframe. By default
5200 values in the new index that do not have corresponding
5201 records in the dataframe are assigned ``NaN``.
5202
5203 >>> new_index = ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10',
5204 ... 'Chrome']
5205 >>> df.reindex(new_index)
5206 http_status response_time
5207 Safari 404.0 0.07
5208 Iceweasel NaN NaN
5209 Comodo Dragon NaN NaN
5210 IE10 404.0 0.08
5211 Chrome 200.0 0.02
5212
5213 We can fill in the missing values by passing a value to
5214 the keyword ``fill_value``. Because the index is not monotonically
5215 increasing or decreasing, we cannot use arguments to the keyword
5216 ``method`` to fill the ``NaN`` values.
5217
5218 >>> df.reindex(new_index, fill_value=0)
5219 http_status response_time
5220 Safari 404 0.07
5221 Iceweasel 0 0.00
5222 Comodo Dragon 0 0.00
5223 IE10 404 0.08
5224 Chrome 200 0.02
5225
5226 >>> df.reindex(new_index, fill_value='missing')
5227 http_status response_time
5228 Safari 404 0.07
5229 Iceweasel missing missing
5230 Comodo Dragon missing missing
5231 IE10 404 0.08
5232 Chrome 200 0.02
5233
5234 We can also reindex the columns.
5235
5236 >>> df.reindex(columns=['http_status', 'user_agent'])
5237 http_status user_agent
5238 Firefox 200 NaN
5239 Chrome 200 NaN
5240 Safari 404 NaN
5241 IE10 404 NaN
5242 Konqueror 301 NaN
5243
5244 Or we can use "axis-style" keyword arguments
5245
5246 >>> df.reindex(['http_status', 'user_agent'], axis="columns")
5247 http_status user_agent
5248 Firefox 200 NaN
5249 Chrome 200 NaN
5250 Safari 404 NaN
5251 IE10 404 NaN
5252 Konqueror 301 NaN
5253
5254 To further illustrate the filling functionality in
5255 ``reindex``, we will create a dataframe with a
5256 monotonically increasing index (for example, a sequence
5257 of dates).
5258
5259 >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D')
5260 >>> df2 = pd.DataFrame({{"prices": [100, 101, np.nan, 100, 89, 88]}},
5261 ... index=date_index)
5262 >>> df2
5263 prices
5264 2010-01-01 100.0
5265 2010-01-02 101.0
5266 2010-01-03 NaN
5267 2010-01-04 100.0
5268 2010-01-05 89.0
5269 2010-01-06 88.0
5270
5271 Suppose we decide to expand the dataframe to cover a wider
5272 date range.
5273
5274 >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D')
5275 >>> df2.reindex(date_index2)
5276 prices
5277 2009-12-29 NaN
5278 2009-12-30 NaN
5279 2009-12-31 NaN
5280 2010-01-01 100.0
5281 2010-01-02 101.0
5282 2010-01-03 NaN
5283 2010-01-04 100.0
5284 2010-01-05 89.0
5285 2010-01-06 88.0
5286 2010-01-07 NaN
5287
5288 The index entries that did not have a value in the original data frame
5289 (for example, '2009-12-29') are by default filled with ``NaN``.
5290 If desired, we can fill in the missing values using one of several
5291 options.
5292
        For example, to back-fill the ``NaN`` values using the next valid
        observation, pass ``bfill`` as an argument to the ``method`` keyword.
5295
5296 >>> df2.reindex(date_index2, method='bfill')
5297 prices
5298 2009-12-29 100.0
5299 2009-12-30 100.0
5300 2009-12-31 100.0
5301 2010-01-01 100.0
5302 2010-01-02 101.0
5303 2010-01-03 NaN
5304 2010-01-04 100.0
5305 2010-01-05 89.0
5306 2010-01-06 88.0
5307 2010-01-07 NaN
5308
5309 Please note that the ``NaN`` value present in the original dataframe
5310 (at index value 2010-01-03) will not be filled by any of the
5311 value propagation schemes. This is because filling while reindexing
5312 does not look at dataframe values, but only compares the original and
5313 desired indexes. If you do want to fill in the ``NaN`` values present
5314 in the original dataframe, use the ``fillna()`` method.
5315
5316 See the :ref:`user guide <basics.reindexing>` for more.
5317 """
5318 # TODO: Decide if we care about having different examples for different
5319 # kinds
5320
5321 if index is not None and columns is not None and labels is not None:
5322 raise TypeError("Cannot specify all of 'labels', 'index', 'columns'.")
5323 elif index is not None or columns is not None:
5324 if axis is not None:
5325 raise TypeError(
5326 "Cannot specify both 'axis' and any of 'index' or 'columns'"
5327 )
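            # A bare 'labels' argument fills whichever of index/columns was
            # not passed explicitly.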
5328 if labels is not None:
5329 if index is not None:
5330 columns = labels
5331 else:
5332 index = labels
5333 else:
5334 if axis and self._get_axis_number(axis) == 1:
5335 columns = labels
5336 else:
5337 index = labels
5338 axes: dict[Literal["index", "columns"], Any] = {
5339 "index": index,
5340 "columns": columns,
5341 }
5342 method = clean_reindex_fill_method(method)
5343
        # If all the axes requested to reindex are equal, then only copy if
        # indicated; index names must be equal here as well as the values.
5346 if copy and using_copy_on_write():
5347 copy = False
5348 if all(
5349 self._get_axis(axis_name).identical(ax)
5350 for axis_name, ax in axes.items()
5351 if ax is not None
5352 ):
5353 return self.copy(deep=copy)
5354
5355 # check if we are a multi reindex
5356 if self._needs_reindex_multi(axes, method, level):
5357 return self._reindex_multi(axes, copy, fill_value)
5358
5359 # perform the reindex on the axes
5360 return self._reindex_axes(
5361 axes, level, limit, tolerance, method, fill_value, copy
5362 ).__finalize__(self, method="reindex")
5363
5364 def _reindex_axes(
5365 self: NDFrameT, axes, level, limit, tolerance, method, fill_value, copy
5366 ) -> NDFrameT:
5367 """Perform the reindex for all the axes."""
5368 obj = self
5369 for a in self._AXIS_ORDERS:
5370 labels = axes[a]
5371 if labels is None:
5372 continue
5373
5374 ax = self._get_axis(a)
5375 new_index, indexer = ax.reindex(
5376 labels, level=level, limit=limit, tolerance=tolerance, method=method
5377 )
5378
5379 axis = self._get_axis_number(a)
5380 obj = obj._reindex_with_indexers(
5381 {axis: [new_index, indexer]},
5382 fill_value=fill_value,
5383 copy=copy,
5384 allow_dups=False,
5385 )
5386 # If we've made a copy once, no need to make another one
5387 copy = False
5388
5389 return obj
5390
5391 def _needs_reindex_multi(self, axes, method, level) -> bool_t:
5392 """Check if we do need a multi reindex."""
5393 return (
5394 (common.count_not_none(*axes.values()) == self._AXIS_LEN)
5395 and method is None
5396 and level is None
5397 and not self._is_mixed_type
5398 and not (
5399 self.ndim == 2
5400 and len(self.dtypes) == 1
5401 and is_extension_array_dtype(self.dtypes.iloc[0])
5402 )
5403 )
5404
5405 def _reindex_multi(self, axes, copy, fill_value):
5406 raise AbstractMethodError(self)
5407
5408 @final
5409 def _reindex_with_indexers(
5410 self: NDFrameT,
5411 reindexers,
5412 fill_value=None,
5413 copy: bool_t | None = False,
5414 allow_dups: bool_t = False,
5415 ) -> NDFrameT:
5416 """allow_dups indicates an internal call here"""
5417 # reindex doing multiple operations on different axes if indicated
5418 new_data = self._mgr
5419 for axis in sorted(reindexers.keys()):
5420 index, indexer = reindexers[axis]
5421 baxis = self._get_block_manager_axis(axis)
5422
5423 if index is None:
5424 continue
5425
5426 index = ensure_index(index)
5427 if indexer is not None:
5428 indexer = ensure_platform_int(indexer)
5429
5430 # TODO: speed up on homogeneous DataFrame objects (see _reindex_multi)
5431 new_data = new_data.reindex_indexer(
5432 index,
5433 indexer,
5434 axis=baxis,
5435 fill_value=fill_value,
5436 allow_dups=allow_dups,
5437 copy=copy,
5438 )
5439 # If we've made a copy once, no need to make another one
5440 copy = False
5441
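        # Honor an explicit (or defaulted) copy request even when nothing
        # was reindexed; under copy-on-write a shallow copy suffices.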
5442 if (
5443 (copy or copy is None)
5444 and new_data is self._mgr
5445 and not using_copy_on_write()
5446 ):
5447 new_data = new_data.copy(deep=copy)
5448 elif using_copy_on_write() and new_data is self._mgr:
5449 new_data = new_data.copy(deep=False)
5450
5451 return self._constructor(new_data).__finalize__(self)
5452
5453 def filter(
5454 self: NDFrameT,
5455 items=None,
5456 like: str | None = None,
5457 regex: str | None = None,
5458 axis: Axis | None = None,
5459 ) -> NDFrameT:
5460 """
5461 Subset the dataframe rows or columns according to the specified index labels.
5462
5463 Note that this routine does not filter a dataframe on its
5464 contents. The filter is applied to the labels of the index.
5465
5466 Parameters
5467 ----------
5468 items : list-like
5469 Keep labels from axis which are in items.
5470 like : str
5471 Keep labels from axis for which "like in label == True".
5472 regex : str (regular expression)
5473 Keep labels from axis for which re.search(regex, label) == True.
        axis : {0 or 'index', 1 or 'columns', None}, default None
5475 The axis to filter on, expressed either as an index (int)
5476 or axis name (str). By default this is the info axis, 'columns' for
5477 DataFrame. For `Series` this parameter is unused and defaults to `None`.
5478
5479 Returns
5480 -------
5481 same type as input object
5482
5483 See Also
5484 --------
5485 DataFrame.loc : Access a group of rows and columns
5486 by label(s) or a boolean array.
5487
5488 Notes
5489 -----
5490 The ``items``, ``like``, and ``regex`` parameters are
5491 enforced to be mutually exclusive.
5492
5493 ``axis`` defaults to the info axis that is used when indexing
5494 with ``[]``.
5495
5496 Examples
5497 --------
5498 >>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6])),
5499 ... index=['mouse', 'rabbit'],
5500 ... columns=['one', 'two', 'three'])
5501 >>> df
5502 one two three
5503 mouse 1 2 3
5504 rabbit 4 5 6
5505
5506 >>> # select columns by name
5507 >>> df.filter(items=['one', 'three'])
5508 one three
5509 mouse 1 3
5510 rabbit 4 6
5511
5512 >>> # select columns by regular expression
5513 >>> df.filter(regex='e$', axis=1)
5514 one three
5515 mouse 1 3
5516 rabbit 4 6
5517
5518 >>> # select rows containing 'bbi'
5519 >>> df.filter(like='bbi', axis=0)
5520 one two three
5521 rabbit 4 5 6
5522 """
5523 nkw = common.count_not_none(items, like, regex)
5524 if nkw > 1:
5525 raise TypeError(
5526 "Keyword arguments `items`, `like`, or `regex` "
5527 "are mutually exclusive"
5528 )
5529
5530 if axis is None:
5531 axis = self._info_axis_name
5532 labels = self._get_axis(axis)
5533
5534 if items is not None:
5535 name = self._get_axis_name(axis)
5536 # error: Keywords must be strings
5537 return self.reindex( # type: ignore[misc]
5538 **{name: [r for r in items if r in labels]} # type: ignore[arg-type]
5539 )
5540 elif like:
5541
5542 def f(x) -> bool_t:
5543 assert like is not None # needed for mypy
5544 return like in ensure_str(x)
5545
5546 values = labels.map(f)
5547 return self.loc(axis=axis)[values]
5548 elif regex:
5549
5550 def f(x) -> bool_t:
5551 return matcher.search(ensure_str(x)) is not None
5552
5553 matcher = re.compile(regex)
5554 values = labels.map(f)
5555 return self.loc(axis=axis)[values]
5556 else:
5557 raise TypeError("Must pass either `items`, `like`, or `regex`")
5558
5559 @final
5560 def head(self: NDFrameT, n: int = 5) -> NDFrameT:
5561 """
5562 Return the first `n` rows.
5563
5564 This function returns the first `n` rows for the object based
5565 on position. It is useful for quickly testing if your object
5566 has the right type of data in it.
5567
5568 For negative values of `n`, this function returns all rows except
5569 the last `|n|` rows, equivalent to ``df[:n]``.
5570
5571 If n is larger than the number of rows, this function returns all rows.
5572
5573 Parameters
5574 ----------
5575 n : int, default 5
5576 Number of rows to select.
5577
5578 Returns
5579 -------
5580 same type as caller
5581 The first `n` rows of the caller object.
5582
5583 See Also
5584 --------
5585 DataFrame.tail: Returns the last `n` rows.
5586
5587 Examples
5588 --------
5589 >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
5590 ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']})
5591 >>> df
5592 animal
5593 0 alligator
5594 1 bee
5595 2 falcon
5596 3 lion
5597 4 monkey
5598 5 parrot
5599 6 shark
5600 7 whale
5601 8 zebra
5602
5603 Viewing the first 5 lines
5604
5605 >>> df.head()
5606 animal
5607 0 alligator
5608 1 bee
5609 2 falcon
5610 3 lion
5611 4 monkey
5612
5613 Viewing the first `n` lines (three in this case)
5614
5615 >>> df.head(3)
5616 animal
5617 0 alligator
5618 1 bee
5619 2 falcon
5620
5621 For negative values of `n`
5622
5623 >>> df.head(-3)
5624 animal
5625 0 alligator
5626 1 bee
5627 2 falcon
5628 3 lion
5629 4 monkey
5630 5 parrot
5631 """
5632 return self.iloc[:n]
5633
5634 @final
5635 def tail(self: NDFrameT, n: int = 5) -> NDFrameT:
5636 """
5637 Return the last `n` rows.
5638
5639 This function returns last `n` rows from the object based on
5640 position. It is useful for quickly verifying data, for example,
5641 after sorting or appending rows.
5642
5643 For negative values of `n`, this function returns all rows except
5644 the first `|n|` rows, equivalent to ``df[|n|:]``.
5645
5646 If n is larger than the number of rows, this function returns all rows.
5647
5648 Parameters
5649 ----------
5650 n : int, default 5
5651 Number of rows to select.
5652
5653 Returns
5654 -------
5655 type of caller
5656 The last `n` rows of the caller object.
5657
5658 See Also
5659 --------
5660 DataFrame.head : The first `n` rows of the caller object.
5661
5662 Examples
5663 --------
5664 >>> df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
5665 ... 'monkey', 'parrot', 'shark', 'whale', 'zebra']})
5666 >>> df
5667 animal
5668 0 alligator
5669 1 bee
5670 2 falcon
5671 3 lion
5672 4 monkey
5673 5 parrot
5674 6 shark
5675 7 whale
5676 8 zebra
5677
5678 Viewing the last 5 lines
5679
5680 >>> df.tail()
5681 animal
5682 4 monkey
5683 5 parrot
5684 6 shark
5685 7 whale
5686 8 zebra
5687
5688 Viewing the last `n` lines (three in this case)
5689
5690 >>> df.tail(3)
5691 animal
5692 6 shark
5693 7 whale
5694 8 zebra
5695
5696 For negative values of `n`
5697
5698 >>> df.tail(-3)
5699 animal
5700 3 lion
5701 4 monkey
5702 5 parrot
5703 6 shark
5704 7 whale
5705 8 zebra
5706 """
5707 if n == 0:
5708 return self.iloc[0:0]
5709 return self.iloc[-n:]
5710
5711 @final
5712 def sample(
5713 self: NDFrameT,
5714 n: int | None = None,
5715 frac: float | None = None,
5716 replace: bool_t = False,
5717 weights=None,
5718 random_state: RandomState | None = None,
5719 axis: Axis | None = None,
5720 ignore_index: bool_t = False,
5721 ) -> NDFrameT:
5722 """
5723 Return a random sample of items from an axis of object.
5724
5725 You can use `random_state` for reproducibility.
5726
5727 Parameters
5728 ----------
5729 n : int, optional
5730 Number of items from axis to return. Cannot be used with `frac`.
5731 Default = 1 if `frac` = None.
5732 frac : float, optional
5733 Fraction of axis items to return. Cannot be used with `n`.
5734 replace : bool, default False
5735 Allow or disallow sampling of the same row more than once.
5736 weights : str or ndarray-like, optional
5737 Default 'None' results in equal probability weighting.
5738 If passed a Series, will align with target object on index. Index
5739 values in weights not found in sampled object will be ignored and
5740 index values in sampled object not in weights will be assigned
5741 weights of zero.
5742 If called on a DataFrame, will accept the name of a column
5743 when axis = 0.
5744 Unless weights are a Series, weights must be same length as axis
5745 being sampled.
5746 If weights do not sum to 1, they will be normalized to sum to 1.
5747 Missing values in the weights column will be treated as zero.
5748 Infinite values not allowed.
5749 random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional
5750 If int, array-like, or BitGenerator, seed for random number generator.
5751 If np.random.RandomState or np.random.Generator, use as given.
5752
5753 .. versionchanged:: 1.1.0
5754
                array-like and BitGenerator objects are now passed to
                np.random.RandomState() as seed
5757
5758 .. versionchanged:: 1.4.0
5759
5760 np.random.Generator objects now accepted
5761
        axis : {0 or 'index', 1 or 'columns', None}, default None
5763 Axis to sample. Accepts axis number or name. Default is stat axis
5764 for given data type. For `Series` this parameter is unused and defaults to `None`.
5765 ignore_index : bool, default False
5766 If True, the resulting index will be labeled 0, 1, …, n - 1.
5767
5768 .. versionadded:: 1.3.0
5769
5770 Returns
5771 -------
5772 Series or DataFrame
5773 A new object of same type as caller containing `n` items randomly
5774 sampled from the caller object.
5775
5776 See Also
5777 --------
5778 DataFrameGroupBy.sample: Generates random samples from each group of a
5779 DataFrame object.
5780 SeriesGroupBy.sample: Generates random samples from each group of a
5781 Series object.
5782 numpy.random.choice: Generates a random sample from a given 1-D numpy
5783 array.
5784
5785 Notes
5786 -----
        If `frac` > 1, `replace` should be set to `True`.
5788
5789 Examples
5790 --------
5791 >>> df = pd.DataFrame({'num_legs': [2, 4, 8, 0],
5792 ... 'num_wings': [2, 0, 0, 0],
5793 ... 'num_specimen_seen': [10, 2, 1, 8]},
5794 ... index=['falcon', 'dog', 'spider', 'fish'])
5795 >>> df
5796 num_legs num_wings num_specimen_seen
5797 falcon 2 2 10
5798 dog 4 0 2
5799 spider 8 0 1
5800 fish 0 0 8
5801
5802 Extract 3 random elements from the ``Series`` ``df['num_legs']``:
5803 Note that we use `random_state` to ensure the reproducibility of
5804 the examples.
5805
5806 >>> df['num_legs'].sample(n=3, random_state=1)
5807 fish 0
5808 spider 8
5809 falcon 2
5810 Name: num_legs, dtype: int64
5811
5812 A random 50% sample of the ``DataFrame`` with replacement:
5813
5814 >>> df.sample(frac=0.5, replace=True, random_state=1)
5815 num_legs num_wings num_specimen_seen
5816 dog 4 0 2
5817 fish 0 0 8
5818
        An upsampled ``DataFrame`` with replacement:
        Note that the `replace` parameter has to be `True` when `frac` > 1.
5821
5822 >>> df.sample(frac=2, replace=True, random_state=1)
5823 num_legs num_wings num_specimen_seen
5824 dog 4 0 2
5825 fish 0 0 8
5826 falcon 2 2 10
5827 falcon 2 2 10
5828 fish 0 0 8
5829 dog 4 0 2
5830 fish 0 0 8
5831 dog 4 0 2
5832
5833 Using a DataFrame column as weights. Rows with larger value in the
5834 `num_specimen_seen` column are more likely to be sampled.
5835
5836 >>> df.sample(n=2, weights='num_specimen_seen', random_state=1)
5837 num_legs num_wings num_specimen_seen
5838 falcon 2 2 10
5839 fish 0 0 8
5840 """ # noqa:E501
5841 if axis is None:
5842 axis = self._stat_axis_number
5843
5844 axis = self._get_axis_number(axis)
5845 obj_len = self.shape[axis]
5846
5847 # Process random_state argument
5848 rs = common.random_state(random_state)
5849
5850 size = sample.process_sampling_size(n, frac, replace)
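        # a None size means ``frac`` was passed instead of ``n``; translate
        # the fraction of the axis length into a concrete row count below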
5851 if size is None:
5852 assert frac is not None
5853 size = round(frac * obj_len)
5854
5855 if weights is not None:
5856 weights = sample.preprocess_weights(self, weights, axis)
5857
5858 sampled_indices = sample.sample(obj_len, size, replace, weights, rs)
5859 result = self.take(sampled_indices, axis=axis)
5860
5861 if ignore_index:
5862 result.index = default_index(len(result))
5863
5864 return result
5865
5866 @final
5867 @doc(klass=_shared_doc_kwargs["klass"])
5868 def pipe(
5869 self,
5870 func: Callable[..., T] | tuple[Callable[..., T], str],
5871 *args,
5872 **kwargs,
5873 ) -> T:
5874 r"""
5875 Apply chainable functions that expect Series or DataFrames.
5876
5877 Parameters
5878 ----------
5879 func : function
5880 Function to apply to the {klass}.
            ``args`` and ``kwargs`` are passed into ``func``.
5882 Alternatively a ``(callable, data_keyword)`` tuple where
5883 ``data_keyword`` is a string indicating the keyword of
5884 ``callable`` that expects the {klass}.
5885 args : iterable, optional
5886 Positional arguments passed into ``func``.
5887 kwargs : mapping, optional
5888 A dictionary of keyword arguments passed into ``func``.
5889
5890 Returns
5891 -------
5892 the return type of ``func``.
5893
5894 See Also
5895 --------
5896 DataFrame.apply : Apply a function along input axis of DataFrame.
5897 DataFrame.applymap : Apply a function elementwise on a whole DataFrame.
5898 Series.map : Apply a mapping correspondence on a
5899 :class:`~pandas.Series`.
5900
5901 Notes
5902 -----
5903 Use ``.pipe`` when chaining together functions that expect
5904 Series, DataFrames or GroupBy objects. Instead of writing
5905
5906 >>> func(g(h(df), arg1=a), arg2=b, arg3=c) # doctest: +SKIP
5907
5908 You can write
5909
5910 >>> (df.pipe(h)
5911 ... .pipe(g, arg1=a)
5912 ... .pipe(func, arg2=b, arg3=c)
5913 ... ) # doctest: +SKIP
5914
5915 If you have a function that takes the data as (say) the second
5916 argument, pass a tuple indicating which keyword expects the
5917 data. For example, suppose ``func`` takes its data as ``arg2``:
5918
5919 >>> (df.pipe(h)
5920 ... .pipe(g, arg1=a)
5921 ... .pipe((func, 'arg2'), arg1=a, arg3=c)
5922 ... ) # doctest: +SKIP
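
        As a minimal, self-contained sketch (``subtract`` is a made-up helper,
        not a pandas function):

        >>> def subtract(df, n):
        ...     return df - n
        >>> pd.DataFrame({'a': [1, 2]}).pipe(subtract, n=1)  # doctest: +SKIP
           a
        0  0
        1  1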
5923 """
5924 if using_copy_on_write():
5925 return common.pipe(self.copy(deep=None), func, *args, **kwargs)
5926 return common.pipe(self, func, *args, **kwargs)
5927
5928 # ----------------------------------------------------------------------
5929 # Attribute access
5930
5931 @final
5932 def __finalize__(
5933 self: NDFrameT, other, method: str | None = None, **kwargs
5934 ) -> NDFrameT:
5935 """
5936 Propagate metadata from other to self.
5937
5938 Parameters
5939 ----------
5940 other : the object from which to get the attributes that we are going
5941 to propagate
5942 method : str, optional
5943 A passed method name providing context on where ``__finalize__``
5944 was called.
5945
5946 .. warning::
5947
                The value passed as `method` is not currently considered
5949 stable across pandas releases.
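
        Examples
        --------
        A minimal sketch of the propagation (the ``attrs`` entry here is
        user-defined, not a pandas field):

        >>> df = pd.DataFrame({"a": [1, 2]})
        >>> df.attrs["source"] = "sensor"
        >>> df.iloc[:1].__finalize__(df).attrs  # doctest: +SKIP
        {'source': 'sensor'}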
5950 """
5951 if isinstance(other, NDFrame):
5952 for name in other.attrs:
5953 self.attrs[name] = other.attrs[name]
5954
5955 self.flags.allows_duplicate_labels = other.flags.allows_duplicate_labels
5956 # For subclasses using _metadata.
5957 for name in set(self._metadata) & set(other._metadata):
5958 assert isinstance(name, str)
5959 object.__setattr__(self, name, getattr(other, name, None))
5960
5961 if method == "concat":
5962 attrs = other.objs[0].attrs
5963 check_attrs = all(objs.attrs == attrs for objs in other.objs[1:])
5964 if check_attrs:
5965 for name in attrs:
5966 self.attrs[name] = attrs[name]
5967
5968 allows_duplicate_labels = all(
5969 x.flags.allows_duplicate_labels for x in other.objs
5970 )
5971 self.flags.allows_duplicate_labels = allows_duplicate_labels
5972
5973 return self
5974
5975 def __getattr__(self, name: str):
5976 """
        After regular attribute access, try looking up the name.
5978 This allows simpler access to columns for interactive use.
5979 """
5980 # Note: obj.x will always call obj.__getattribute__('x') prior to
5981 # calling obj.__getattr__('x').
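        # In practice this is what makes ``df.A`` fall through to ``df["A"]``
        # when "A" is a column label rather than an existing attribute.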
5982 if (
5983 name not in self._internal_names_set
5984 and name not in self._metadata
5985 and name not in self._accessors
5986 and self._info_axis._can_hold_identifiers_and_holds_name(name)
5987 ):
5988 return self[name]
5989 return object.__getattribute__(self, name)
5990
5991 def __setattr__(self, name: str, value) -> None:
5992 """
        After regular attribute access, try setting the name.
5994 This allows simpler access to columns for interactive use.
5995 """
5996 # first try regular attribute access via __getattribute__, so that
5997 # e.g. ``obj.x`` and ``obj.x = 4`` will always reference/modify
5998 # the same attribute.
5999
6000 try:
6001 object.__getattribute__(self, name)
6002 return object.__setattr__(self, name, value)
6003 except AttributeError:
6004 pass
6005
6006 # if this fails, go on to more involved attribute setting
6007 # (note that this matches __getattr__, above).
6008 if name in self._internal_names_set:
6009 object.__setattr__(self, name, value)
6010 elif name in self._metadata:
6011 object.__setattr__(self, name, value)
6012 else:
6013 try:
6014 existing = getattr(self, name)
6015 if isinstance(existing, Index):
6016 object.__setattr__(self, name, value)
6017 elif name in self._info_axis:
6018 self[name] = value
6019 else:
6020 object.__setattr__(self, name, value)
6021 except (AttributeError, TypeError):
6022 if isinstance(self, ABCDataFrame) and (is_list_like(value)):
6023 warnings.warn(
6024 "Pandas doesn't allow columns to be "
6025 "created via a new attribute name - see "
6026 "https://pandas.pydata.org/pandas-docs/"
6027 "stable/indexing.html#attribute-access",
6028 stacklevel=find_stack_level(),
6029 )
6030 object.__setattr__(self, name, value)
6031
6032 @final
6033 def _dir_additions(self) -> set[str]:
6034 """
6035 add the string-like attributes from the info_axis.
6036 If info_axis is a MultiIndex, its first level values are used.
6037 """
6038 additions = super()._dir_additions()
6039 if self._info_axis._can_hold_strings:
6040 additions.update(self._info_axis._dir_additions_for_owner)
6041 return additions
6042
6043 # ----------------------------------------------------------------------
6044 # Consolidation of internals
6045
6046 @final
6047 def _protect_consolidate(self, f):
6048 """
6049 Consolidate _mgr -- if the blocks have changed, then clear the
6050 cache
6051 """
6052 if isinstance(self._mgr, (ArrayManager, SingleArrayManager)):
6053 return f()
6054 blocks_before = len(self._mgr.blocks)
6055 result = f()
6056 if len(self._mgr.blocks) != blocks_before:
6057 self._clear_item_cache()
6058 return result
6059
6060 @final
6061 def _consolidate_inplace(self) -> None:
6062 """Consolidate data in place and return None"""
6063
6064 def f() -> None:
6065 self._mgr = self._mgr.consolidate()
6066
6067 self._protect_consolidate(f)
6068
6069 @final
6070 def _consolidate(self):
6071 """
6072 Compute NDFrame with "consolidated" internals (data of each dtype
6073 grouped together in a single ndarray).
6074
6075 Returns
6076 -------
6077 consolidated : same type as caller
6078 """
6079 f = lambda: self._mgr.consolidate()
6080 cons_data = self._protect_consolidate(f)
6081 return self._constructor(cons_data).__finalize__(self)
6082
6083 @property
6084 def _is_mixed_type(self) -> bool_t:
6085 if self._mgr.is_single_block:
6086 return False
6087
6088 if self._mgr.any_extension_types:
6089 # Even if they have the same dtype, we can't consolidate them,
            # so we pretend this is "mixed"
6091 return True
6092
6093 return self.dtypes.nunique() > 1
6094
6095 @final
6096 def _check_inplace_setting(self, value) -> bool_t:
6097 """check whether we allow in-place setting with this type of value"""
6098 if self._is_mixed_type and not self._mgr.is_numeric_mixed_type:
6099 # allow an actual np.nan through
            if (is_float(value) and np.isnan(value)) or value is lib.no_default:
6101 return True
6102
6103 raise TypeError(
6104 "Cannot do inplace boolean setting on "
6105 "mixed-types with a non np.nan value"
6106 )
6107
6108 return True
6109
6110 @final
6111 def _get_numeric_data(self: NDFrameT) -> NDFrameT:
6112 return self._constructor(self._mgr.get_numeric_data()).__finalize__(self)
6113
6114 @final
6115 def _get_bool_data(self):
6116 return self._constructor(self._mgr.get_bool_data()).__finalize__(self)
6117
6118 # ----------------------------------------------------------------------
6119 # Internal Interface Methods
6120
6121 @property
6122 def values(self):
6123 raise AbstractMethodError(self)
6124
6125 @property
6126 def _values(self) -> ArrayLike:
6127 """internal implementation"""
6128 raise AbstractMethodError(self)
6129
6130 @property
6131 def dtypes(self):
6132 """
6133 Return the dtypes in the DataFrame.
6134
6135 This returns a Series with the data type of each column.
6136 The result's index is the original DataFrame's columns. Columns
6137 with mixed types are stored with the ``object`` dtype. See
6138 :ref:`the User Guide <basics.dtypes>` for more.
6139
6140 Returns
6141 -------
6142 pandas.Series
6143 The data type of each column.
6144
6145 Examples
6146 --------
6147 >>> df = pd.DataFrame({'float': [1.0],
6148 ... 'int': [1],
6149 ... 'datetime': [pd.Timestamp('20180310')],
6150 ... 'string': ['foo']})
6151 >>> df.dtypes
6152 float float64
6153 int int64
6154 datetime datetime64[ns]
6155 string object
6156 dtype: object
6157 """
6158 data = self._mgr.get_dtypes()
6159 return self._constructor_sliced(data, index=self._info_axis, dtype=np.object_)
6160
6161 def astype(
6162 self: NDFrameT, dtype, copy: bool_t | None = None, errors: IgnoreRaise = "raise"
6163 ) -> NDFrameT:
6164 """
6165 Cast a pandas object to a specified dtype ``dtype``.
6166
6167 Parameters
6168 ----------
6169 dtype : str, data type, Series or Mapping of column name -> data type
6170 Use a str, numpy.dtype, pandas.ExtensionDtype or Python type to
6171 cast entire pandas object to the same type. Alternatively, use a
6172 mapping, e.g. {col: dtype, ...}, where col is a column label and dtype is
6173 a numpy.dtype or Python type to cast one or more of the DataFrame's
6174 columns to column-specific types.
6175 copy : bool, default True
6176 Return a copy when ``copy=True`` (be very careful setting
6177 ``copy=False`` as changes to values then may propagate to other
6178 pandas objects).
6179 errors : {'raise', 'ignore'}, default 'raise'
6180 Control raising of exceptions on invalid data for provided dtype.
6181
6182 - ``raise`` : allow exceptions to be raised
6183 - ``ignore`` : suppress exceptions. On error return original object.
6184
6185 Returns
6186 -------
6187 same type as caller
6188
6189 See Also
6190 --------
6191 to_datetime : Convert argument to datetime.
6192 to_timedelta : Convert argument to timedelta.
6193 to_numeric : Convert argument to a numeric type.
6194 numpy.ndarray.astype : Cast a numpy array to a specified type.
6195
6196 Notes
6197 -----
6198 .. versionchanged:: 2.0.0
6199
6200 Using ``astype`` to convert from timezone-naive dtype to
6201 timezone-aware dtype will raise an exception.
6202 Use :meth:`Series.dt.tz_localize` instead.
6203
6204 Examples
6205 --------
6206 Create a DataFrame:
6207
6208 >>> d = {'col1': [1, 2], 'col2': [3, 4]}
6209 >>> df = pd.DataFrame(data=d)
6210 >>> df.dtypes
6211 col1 int64
6212 col2 int64
6213 dtype: object
6214
6215 Cast all columns to int32:
6216
6217 >>> df.astype('int32').dtypes
6218 col1 int32
6219 col2 int32
6220 dtype: object
6221
6222 Cast col1 to int32 using a dictionary:
6223
6224 >>> df.astype({'col1': 'int32'}).dtypes
6225 col1 int32
6226 col2 int64
6227 dtype: object
6228
6229 Create a series:
6230
6231 >>> ser = pd.Series([1, 2], dtype='int32')
6232 >>> ser
6233 0 1
6234 1 2
6235 dtype: int32
6236 >>> ser.astype('int64')
6237 0 1
6238 1 2
6239 dtype: int64
6240
6241 Convert to categorical type:
6242
6243 >>> ser.astype('category')
6244 0 1
6245 1 2
6246 dtype: category
6247 Categories (2, int32): [1, 2]
6248
6249 Convert to ordered categorical type with custom ordering:
6250
6251 >>> from pandas.api.types import CategoricalDtype
6252 >>> cat_dtype = CategoricalDtype(
6253 ... categories=[2, 1], ordered=True)
6254 >>> ser.astype(cat_dtype)
6255 0 1
6256 1 2
6257 dtype: category
6258 Categories (2, int64): [2 < 1]
6259
6260 Create a series of dates:
6261
6262 >>> ser_date = pd.Series(pd.date_range('20200101', periods=3))
6263 >>> ser_date
6264 0 2020-01-01
6265 1 2020-01-02
6266 2 2020-01-03
6267 dtype: datetime64[ns]
6268 """
6269 if copy and using_copy_on_write():
6270 copy = False
6271
6272 if is_dict_like(dtype):
6273 if self.ndim == 1: # i.e. Series
6274 if len(dtype) > 1 or self.name not in dtype:
6275 raise KeyError(
6276 "Only the Series name can be used for "
6277 "the key in Series dtype mappings."
6278 )
6279 new_type = dtype[self.name]
6280 return self.astype(new_type, copy, errors)
6281
            # GH#44417 cast to Series so we can use .iat below, which will be
            # robust in case we have duplicate column names
6284 from pandas import Series
6285
6286 dtype_ser = Series(dtype, dtype=object)
6287
6288 for col_name in dtype_ser.index:
6289 if col_name not in self:
6290 raise KeyError(
6291 "Only a column name can be used for the "
6292 "key in a dtype mappings argument. "
6293 f"'{col_name}' not found in columns."
6294 )
6295
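            # align the mapping with our columns; columns absent from the
            # mapping get fill_value=None, which the loop below treats as
            # "leave this column's dtype unchanged"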
6296 dtype_ser = dtype_ser.reindex(self.columns, fill_value=None, copy=False)
6297
6298 results = []
6299 for i, (col_name, col) in enumerate(self.items()):
6300 cdt = dtype_ser.iat[i]
6301 if isna(cdt):
6302 res_col = col.copy(deep=copy)
6303 else:
6304 try:
6305 res_col = col.astype(dtype=cdt, copy=copy, errors=errors)
6306 except ValueError as ex:
6307 ex.args = (
6308 f"{ex}: Error while type casting for column '{col_name}'",
6309 )
6310 raise
6311 results.append(res_col)
6312
6313 elif is_extension_array_dtype(dtype) and self.ndim > 1:
6314 # GH 18099/22869: columnwise conversion to extension dtype
6315 # GH 24704: use iloc to handle duplicate column names
6316 # TODO(EA2D): special case not needed with 2D EAs
6317 results = [
6318 self.iloc[:, i].astype(dtype, copy=copy)
6319 for i in range(len(self.columns))
6320 ]
6321
6322 else:
6323 # else, only a single dtype is given
6324 new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
6325 return self._constructor(new_data).__finalize__(self, method="astype")
6326
6327 # GH 33113: handle empty frame or series
6328 if not results:
6329 return self.copy(deep=None)
6330
6331 # GH 19920: retain column metadata after concat
6332 result = concat(results, axis=1, copy=False)
6333 # GH#40810 retain subclass
6334 # error: Incompatible types in assignment
6335 # (expression has type "NDFrameT", variable has type "DataFrame")
6336 result = self._constructor(result) # type: ignore[assignment]
6337 result.columns = self.columns
6338 result = result.__finalize__(self, method="astype")
6339 # https://github.com/python/mypy/issues/8354
6340 return cast(NDFrameT, result)
6341
6342 @final
6343 def copy(self: NDFrameT, deep: bool_t | None = True) -> NDFrameT:
6344 """
6345 Make a copy of this object's indices and data.
6346
6347 When ``deep=True`` (default), a new object will be created with a
6348 copy of the calling object's data and indices. Modifications to
6349 the data or indices of the copy will not be reflected in the
6350 original object (see notes below).
6351
6352 When ``deep=False``, a new object will be created without copying
6353 the calling object's data or index (only references to the data
6354 and index are copied). Any changes to the data of the original
6355 will be reflected in the shallow copy (and vice versa).
6356
6357 Parameters
6358 ----------
6359 deep : bool, default True
6360 Make a deep copy, including a copy of the data and the indices.
6361 With ``deep=False`` neither the indices nor the data are copied.
6362
6363 Returns
6364 -------
6365 Series or DataFrame
6366 Object type matches caller.
6367
6368 Notes
6369 -----
6370 When ``deep=True``, data is copied but actual Python objects
6371 will not be copied recursively, only the reference to the object.
6372 This is in contrast to `copy.deepcopy` in the Standard Library,
6373 which recursively copies object data (see examples below).
6374
6375 While ``Index`` objects are copied when ``deep=True``, the underlying
6376 numpy array is not copied for performance reasons. Since ``Index`` is
6377 immutable, the underlying data can be safely shared and a copy
6378 is not needed.
6379
6380 Since pandas is not thread safe, see the
6381 :ref:`gotchas <gotchas.thread-safety>` when copying in a threading
6382 environment.
6383
6384 Examples
6385 --------
6386 >>> s = pd.Series([1, 2], index=["a", "b"])
6387 >>> s
6388 a 1
6389 b 2
6390 dtype: int64
6391
6392 >>> s_copy = s.copy()
6393 >>> s_copy
6394 a 1
6395 b 2
6396 dtype: int64
6397
6398 **Shallow copy versus default (deep) copy:**
6399
6400 >>> s = pd.Series([1, 2], index=["a", "b"])
6401 >>> deep = s.copy()
6402 >>> shallow = s.copy(deep=False)
6403
6404 Shallow copy shares data and index with original.
6405
6406 >>> s is shallow
6407 False
6408 >>> s.values is shallow.values and s.index is shallow.index
6409 True
6410
6411 Deep copy has own copy of data and index.
6412
6413 >>> s is deep
6414 False
6415 >>> s.values is deep.values or s.index is deep.index
6416 False
6417
        Updates to the data shared by shallow copy and original are reflected
6419 in both; deep copy remains unchanged.
6420
6421 >>> s[0] = 3
6422 >>> shallow[1] = 4
6423 >>> s
6424 a 3
6425 b 4
6426 dtype: int64
6427 >>> shallow
6428 a 3
6429 b 4
6430 dtype: int64
6431 >>> deep
6432 a 1
6433 b 2
6434 dtype: int64
6435
6436 Note that when copying an object containing Python objects, a deep copy
6437 will copy the data, but will not do so recursively. Updating a nested
6438 data object will be reflected in the deep copy.
6439
6440 >>> s = pd.Series([[1, 2], [3, 4]])
6441 >>> deep = s.copy()
6442 >>> s[0][0] = 10
6443 >>> s
6444 0 [10, 2]
6445 1 [3, 4]
6446 dtype: object
6447 >>> deep
6448 0 [10, 2]
6449 1 [3, 4]
6450 dtype: object
6451 """
6452 data = self._mgr.copy(deep=deep)
6453 self._clear_item_cache()
6454 return self._constructor(data).__finalize__(self, method="copy")
6455
6456 @final
6457 def __copy__(self: NDFrameT, deep: bool_t = True) -> NDFrameT:
6458 return self.copy(deep=deep)
6459
6460 @final
6461 def __deepcopy__(self: NDFrameT, memo=None) -> NDFrameT:
6462 """
6463 Parameters
6464 ----------
6465 memo, default None
6466 Standard signature. Unused
6467 """
6468 return self.copy(deep=True)
6469
6470 @final
6471 def infer_objects(self: NDFrameT, copy: bool_t | None = None) -> NDFrameT:
6472 """
6473 Attempt to infer better dtypes for object columns.
6474
6475 Attempts soft conversion of object-dtyped
6476 columns, leaving non-object and unconvertible
6477 columns unchanged. The inference rules are the
6478 same as during normal Series/DataFrame construction.
6479
6480 Parameters
6481 ----------
6482 copy : bool, default True
6483 Whether to make a copy for non-object or non-inferrable columns
6484 or Series.
6485
6486 Returns
6487 -------
6488 same type as input object
6489
6490 See Also
6491 --------
6492 to_datetime : Convert argument to datetime.
6493 to_timedelta : Convert argument to timedelta.
6494 to_numeric : Convert argument to numeric type.
6495 convert_dtypes : Convert argument to best possible dtype.
6496
6497 Examples
6498 --------
6499 >>> df = pd.DataFrame({"A": ["a", 1, 2, 3]})
6500 >>> df = df.iloc[1:]
6501 >>> df
6502 A
6503 1 1
6504 2 2
6505 3 3
6506
6507 >>> df.dtypes
6508 A object
6509 dtype: object
6510
6511 >>> df.infer_objects().dtypes
6512 A int64
6513 dtype: object
6514 """
6515 new_mgr = self._mgr.convert(copy=copy)
6516 return self._constructor(new_mgr).__finalize__(self, method="infer_objects")
6517
6518 @final
6519 def convert_dtypes(
6520 self: NDFrameT,
6521 infer_objects: bool_t = True,
6522 convert_string: bool_t = True,
6523 convert_integer: bool_t = True,
6524 convert_boolean: bool_t = True,
6525 convert_floating: bool_t = True,
6526 dtype_backend: DtypeBackend = "numpy_nullable",
6527 ) -> NDFrameT:
6528 """
6529 Convert columns to the best possible dtypes using dtypes supporting ``pd.NA``.
6530
6531 Parameters
6532 ----------
6533 infer_objects : bool, default True
6534 Whether object dtypes should be converted to the best possible types.
6535 convert_string : bool, default True
6536 Whether object dtypes should be converted to ``StringDtype()``.
6537 convert_integer : bool, default True
6538 Whether, if possible, conversion can be done to integer extension types.
        convert_boolean : bool, default True
            Whether object dtypes should be converted to ``BooleanDtypes()``.
        convert_floating : bool, default True
            Whether, if possible, conversion can be done to floating extension types.
            If `convert_integer` is also True, preference will be given to integer
            dtypes if the floats can be faithfully cast to integers.
6545
6546 .. versionadded:: 1.2.0
        dtype_backend : {"numpy_nullable", "pyarrow"}, default "numpy_nullable"
            Which dtype_backend to use: if "numpy_nullable" is set, nullable
            dtypes are used for all dtypes that have a nullable implementation;
            if "pyarrow" is set, pyarrow-backed dtypes are used for all dtypes.

            The dtype_backends are still experimental.
6554
6555 .. versionadded:: 2.0
6556
6557 Returns
6558 -------
6559 Series or DataFrame
6560 Copy of input object with new dtype.
6561
6562 See Also
6563 --------
6564 infer_objects : Infer dtypes of objects.
6565 to_datetime : Convert argument to datetime.
6566 to_timedelta : Convert argument to timedelta.
6567 to_numeric : Convert argument to a numeric type.
6568
6569 Notes
6570 -----
6571 By default, ``convert_dtypes`` will attempt to convert a Series (or each
6572 Series in a DataFrame) to dtypes that support ``pd.NA``. By using the options
6573 ``convert_string``, ``convert_integer``, ``convert_boolean`` and
6574 ``convert_floating``, it is possible to turn off individual conversions
6575 to ``StringDtype``, the integer extension types, ``BooleanDtype``
6576 or floating extension types, respectively.
6577
6578 For object-dtyped columns, if ``infer_objects`` is ``True``, use the inference
6579 rules as during normal Series/DataFrame construction. Then, if possible,
6580 convert to ``StringDtype``, ``BooleanDtype`` or an appropriate integer
6581 or floating extension type, otherwise leave as ``object``.
6582
6583 If the dtype is integer, convert to an appropriate integer extension type.
6584
6585 If the dtype is numeric, and consists of all integers, convert to an
6586 appropriate integer extension type. Otherwise, convert to an
6587 appropriate floating extension type.
6588
6589 .. versionchanged:: 1.2
6590 Starting with pandas 1.2, this method also converts float columns
6591 to the nullable floating extension type.
6592
6593 In the future, as new dtypes are added that support ``pd.NA``, the results
6594 of this method will change to support those new dtypes.
6595
6596 Examples
6597 --------
6598 >>> df = pd.DataFrame(
6599 ... {
6600 ... "a": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
6601 ... "b": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
6602 ... "c": pd.Series([True, False, np.nan], dtype=np.dtype("O")),
6603 ... "d": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")),
6604 ... "e": pd.Series([10, np.nan, 20], dtype=np.dtype("float")),
6605 ... "f": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
6606 ... }
6607 ... )
6608
6609 Start with a DataFrame with default dtypes.
6610
6611 >>> df
6612 a b c d e f
6613 0 1 x True h 10.0 NaN
6614 1 2 y False i NaN 100.5
6615 2 3 z NaN NaN 20.0 200.0
6616
6617 >>> df.dtypes
6618 a int32
6619 b object
6620 c object
6621 d object
6622 e float64
6623 f float64
6624 dtype: object
6625
6626 Convert the DataFrame to use best possible dtypes.
6627
6628 >>> dfn = df.convert_dtypes()
6629 >>> dfn
6630 a b c d e f
6631 0 1 x True h 10 <NA>
6632 1 2 y False i <NA> 100.5
6633 2 3 z <NA> <NA> 20 200.0
6634
6635 >>> dfn.dtypes
6636 a Int32
6637 b string[python]
6638 c boolean
6639 d string[python]
6640 e Int64
6641 f Float64
6642 dtype: object
6643
6644 Start with a Series of strings and missing data represented by ``np.nan``.
6645
6646 >>> s = pd.Series(["a", "b", np.nan])
6647 >>> s
6648 0 a
6649 1 b
6650 2 NaN
6651 dtype: object
6652
6653 Obtain a Series with dtype ``StringDtype``.
6654
6655 >>> s.convert_dtypes()
6656 0 a
6657 1 b
6658 2 <NA>
6659 dtype: string
6660 """
6661 check_dtype_backend(dtype_backend)
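        # a Series converts directly; a DataFrame converts column by column
        # and is reassembled with concat below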
6662 if self.ndim == 1:
6663 return self._convert_dtypes(
6664 infer_objects,
6665 convert_string,
6666 convert_integer,
6667 convert_boolean,
6668 convert_floating,
6669 dtype_backend=dtype_backend,
6670 )
6671 else:
6672 results = [
6673 col._convert_dtypes(
6674 infer_objects,
6675 convert_string,
6676 convert_integer,
6677 convert_boolean,
6678 convert_floating,
6679 dtype_backend=dtype_backend,
6680 )
6681 for col_name, col in self.items()
6682 ]
6683 if len(results) > 0:
6684 result = concat(results, axis=1, copy=False, keys=self.columns)
6685 cons = cast(Type["DataFrame"], self._constructor)
6686 result = cons(result)
6687 result = result.__finalize__(self, method="convert_dtypes")
6688 # https://github.com/python/mypy/issues/8354
6689 return cast(NDFrameT, result)
6690 else:
6691 return self.copy(deep=None)
6692
6693 # ----------------------------------------------------------------------
6694 # Filling NA's
6695
6696 @overload
6697 def fillna(
6698 self: NDFrameT,
6699 value: Hashable | Mapping | Series | DataFrame = ...,
6700 *,
6701 method: FillnaOptions | None = ...,
6702 axis: Axis | None = ...,
6703 inplace: Literal[False] = ...,
6704 limit: int | None = ...,
6705 downcast: dict | None = ...,
6706 ) -> NDFrameT:
6707 ...
6708
6709 @overload
6710 def fillna(
6711 self,
6712 value: Hashable | Mapping | Series | DataFrame = ...,
6713 *,
6714 method: FillnaOptions | None = ...,
6715 axis: Axis | None = ...,
6716 inplace: Literal[True],
6717 limit: int | None = ...,
6718 downcast: dict | None = ...,
6719 ) -> None:
6720 ...
6721
6722 @overload
6723 def fillna(
6724 self: NDFrameT,
6725 value: Hashable | Mapping | Series | DataFrame = ...,
6726 *,
6727 method: FillnaOptions | None = ...,
6728 axis: Axis | None = ...,
6729 inplace: bool_t = ...,
6730 limit: int | None = ...,
6731 downcast: dict | None = ...,
6732 ) -> NDFrameT | None:
6733 ...
6734
6735 @doc(**_shared_doc_kwargs)
6736 def fillna(
6737 self: NDFrameT,
6738 value: Hashable | Mapping | Series | DataFrame = None,
6739 *,
6740 method: FillnaOptions | None = None,
6741 axis: Axis | None = None,
6742 inplace: bool_t = False,
6743 limit: int | None = None,
6744 downcast: dict | None = None,
6745 ) -> NDFrameT | None:
6746 """
6747 Fill NA/NaN values using the specified method.
6748
6749 Parameters
6750 ----------
6751 value : scalar, dict, Series, or DataFrame
6752 Value to use to fill holes (e.g. 0), alternately a
6753 dict/Series/DataFrame of values specifying which value to use for
6754 each index (for a Series) or column (for a DataFrame). Values not
6755 in the dict/Series/DataFrame will not be filled. This value cannot
6756 be a list.
6757 method : {{'backfill', 'bfill', 'ffill', None}}, default None
6758 Method to use for filling holes in reindexed Series:
6759
6760 * ffill: propagate last valid observation forward to next valid.
6761 * backfill / bfill: use next valid observation to fill gap.
6762
6763 axis : {axes_single_arg}
6764 Axis along which to fill missing values. For `Series`
6765 this parameter is unused and defaults to 0.
6766 inplace : bool, default False
6767 If True, fill in-place. Note: this will modify any
6768 other views on this object (e.g., a no-copy slice for a column in a
6769 DataFrame).
6770 limit : int, default None
6771 If method is specified, this is the maximum number of consecutive
6772 NaN values to forward/backward fill. In other words, if there is
6773 a gap with more than this number of consecutive NaNs, it will only
6774 be partially filled. If method is not specified, this is the
6775 maximum number of entries along the entire axis where NaNs will be
6776 filled. Must be greater than 0 if not None.
6777 downcast : dict, default is None
6778 A dict of item->dtype of what to downcast if possible,
6779 or the string 'infer' which will try to downcast to an appropriate
6780 equal type (e.g. float64 to int64 if possible).
6781
6782 Returns
6783 -------
6784 {klass} or None
6785 Object with missing values filled or None if ``inplace=True``.
6786
6787 See Also
6788 --------
6789 interpolate : Fill NaN values using interpolation.
6790 reindex : Conform object to new index.
6791 asfreq : Convert TimeSeries to specified frequency.
6792
6793 Examples
6794 --------
6795 >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0],
6796 ... [3, 4, np.nan, 1],
6797 ... [np.nan, np.nan, np.nan, np.nan],
6798 ... [np.nan, 3, np.nan, 4]],
6799 ... columns=list("ABCD"))
6800 >>> df
6801 A B C D
6802 0 NaN 2.0 NaN 0.0
6803 1 3.0 4.0 NaN 1.0
6804 2 NaN NaN NaN NaN
6805 3 NaN 3.0 NaN 4.0
6806
6807 Replace all NaN elements with 0s.
6808
6809 >>> df.fillna(0)
6810 A B C D
6811 0 0.0 2.0 0.0 0.0
6812 1 3.0 4.0 0.0 1.0
6813 2 0.0 0.0 0.0 0.0
6814 3 0.0 3.0 0.0 4.0
6815
6816 We can also propagate non-null values forward or backward.
6817
6818 >>> df.fillna(method="ffill")
6819 A B C D
6820 0 NaN 2.0 NaN 0.0
6821 1 3.0 4.0 NaN 1.0
6822 2 3.0 4.0 NaN 1.0
6823 3 3.0 3.0 NaN 4.0
6824
6825 Replace all NaN elements in column 'A', 'B', 'C', and 'D', with 0, 1,
6826 2, and 3 respectively.
6827
6828 >>> values = {{"A": 0, "B": 1, "C": 2, "D": 3}}
6829 >>> df.fillna(value=values)
6830 A B C D
6831 0 0.0 2.0 2.0 0.0
6832 1 3.0 4.0 2.0 1.0
6833 2 0.0 1.0 2.0 3.0
6834 3 0.0 3.0 2.0 4.0
6835
6836 Only replace the first NaN element.
6837
6838 >>> df.fillna(value=values, limit=1)
6839 A B C D
6840 0 0.0 2.0 2.0 0.0
6841 1 3.0 4.0 NaN 1.0
6842 2 NaN 1.0 NaN 3.0
6843 3 NaN 3.0 NaN 4.0
6844
6845 When filling using a DataFrame, replacement happens along
        the same column names and same indices.
6847
6848 >>> df2 = pd.DataFrame(np.zeros((4, 4)), columns=list("ABCE"))
6849 >>> df.fillna(df2)
6850 A B C D
6851 0 0.0 2.0 0.0 0.0
6852 1 3.0 4.0 0.0 1.0
6853 2 0.0 0.0 0.0 NaN
6854 3 0.0 3.0 0.0 4.0
6855
6856 Note that column D is not affected since it is not present in df2.
6857 """
6858 inplace = validate_bool_kwarg(inplace, "inplace")
6859 value, method = validate_fillna_kwargs(value, method)
6860
        # set the default here, so functions examining the signature
6862 # can detect if something was set (e.g. in groupby) (GH9221)
6863 if axis is None:
6864 axis = 0
6865 axis = self._get_axis_number(axis)
6866
6867 if value is None:
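            # no fill value was given, so ``method`` (ffill/bfill) drives the
            # fill; the manager implements these via its interpolate machinery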
6868 if not self._mgr.is_single_block and axis == 1:
6869 if inplace:
6870 raise NotImplementedError()
6871 result = self.T.fillna(method=method, limit=limit).T
6872
6873 return result
6874
6875 new_data = self._mgr.interpolate(
6876 method=method,
6877 axis=axis,
6878 limit=limit,
6879 inplace=inplace,
6880 downcast=downcast,
6881 )
6882 else:
6883 if self.ndim == 1:
6884 if isinstance(value, (dict, ABCSeries)):
6885 if not len(value):
6886 # test_fillna_nonscalar
6887 if inplace:
6888 return None
6889 return self.copy(deep=None)
6890 from pandas import Series
6891
6892 value = Series(value)
6893 value = value.reindex(self.index, copy=False)
6894 value = value._values
6895 elif not is_list_like(value):
6896 pass
6897 else:
6898 raise TypeError(
6899 '"value" parameter must be a scalar, dict '
6900 "or Series, but you passed a "
6901 f'"{type(value).__name__}"'
6902 )
6903
6904 new_data = self._mgr.fillna(
6905 value=value, limit=limit, inplace=inplace, downcast=downcast
6906 )
6907
6908 elif isinstance(value, (dict, ABCSeries)):
6909 if axis == 1:
6910 raise NotImplementedError(
6911 "Currently only can fill "
6912 "with dict/Series column "
6913 "by column"
6914 )
6915 if using_copy_on_write():
6916 result = self.copy(deep=None)
6917 else:
6918 result = self if inplace else self.copy()
6919 is_dict = isinstance(downcast, dict)
6920 for k, v in value.items():
6921 if k not in result:
6922 continue
6923
6924 # error: Item "None" of "Optional[Dict[Any, Any]]" has no
6925 # attribute "get"
6926 downcast_k = (
6927 downcast
6928 if not is_dict
6929 else downcast.get(k) # type: ignore[union-attr]
6930 )
6931
6932 res_k = result[k].fillna(v, limit=limit, downcast=downcast_k)
6933
6934 if not inplace:
6935 result[k] = res_k
6936 else:
6937 # We can write into our existing column(s) iff dtype
6938 # was preserved.
6939 if isinstance(res_k, ABCSeries):
6940 # i.e. 'k' only shows up once in self.columns
6941 if res_k.dtype == result[k].dtype:
6942 result.loc[:, k] = res_k
6943 else:
6944 # Different dtype -> no way to do inplace.
6945 result[k] = res_k
6946 else:
6947 # see test_fillna_dict_inplace_nonunique_columns
6948 locs = result.columns.get_loc(k)
6949 if isinstance(locs, slice):
6950 locs = np.arange(self.shape[1])[locs]
6951 elif (
6952 isinstance(locs, np.ndarray) and locs.dtype.kind == "b"
6953 ):
6954 locs = locs.nonzero()[0]
6955 elif not (
6956 isinstance(locs, np.ndarray) and locs.dtype.kind == "i"
6957 ):
6958 # Should never be reached, but let's cover our bases
6959 raise NotImplementedError(
6960 "Unexpected get_loc result, please report a bug at "
6961 "https://github.com/pandas-dev/pandas"
6962 )
6963
6964 for i, loc in enumerate(locs):
6965 res_loc = res_k.iloc[:, i]
6966 target = self.iloc[:, loc]
6967
6968 if res_loc.dtype == target.dtype:
6969 result.iloc[:, loc] = res_loc
6970 else:
6971 result.isetitem(loc, res_loc)
6972 if inplace:
6973 return self._update_inplace(result)
6974 else:
6975 return result
6976
6977 elif not is_list_like(value):
6978 if axis == 1:
6979 result = self.T.fillna(value=value, limit=limit).T
6980
6981 new_data = result
6982 else:
6983 new_data = self._mgr.fillna(
6984 value=value, limit=limit, inplace=inplace, downcast=downcast
6985 )
6986 elif isinstance(value, ABCDataFrame) and self.ndim == 2:
6987 new_data = self.where(self.notna(), value)._mgr
6988 else:
6989 raise ValueError(f"invalid fill value with a {type(value)}")
6990
6991 result = self._constructor(new_data)
6992 if inplace:
6993 return self._update_inplace(result)
6994 else:
6995 return result.__finalize__(self, method="fillna")
6996
6997 @overload
6998 def ffill(
6999 self: NDFrameT,
7000 *,
7001 axis: None | Axis = ...,
7002 inplace: Literal[False] = ...,
7003 limit: None | int = ...,
7004 downcast: dict | None = ...,
7005 ) -> NDFrameT:
7006 ...
7007
7008 @overload
7009 def ffill(
7010 self,
7011 *,
7012 axis: None | Axis = ...,
7013 inplace: Literal[True],
7014 limit: None | int = ...,
7015 downcast: dict | None = ...,
7016 ) -> None:
7017 ...
7018
7019 @overload
7020 def ffill(
7021 self: NDFrameT,
7022 *,
7023 axis: None | Axis = ...,
7024 inplace: bool_t = ...,
7025 limit: None | int = ...,
7026 downcast: dict | None = ...,
7027 ) -> NDFrameT | None:
7028 ...
7029
7030 @doc(klass=_shared_doc_kwargs["klass"])
7031 def ffill(
7032 self: NDFrameT,
7033 *,
7034 axis: None | Axis = None,
7035 inplace: bool_t = False,
7036 limit: None | int = None,
7037 downcast: dict | None = None,
7038 ) -> NDFrameT | None:
7039 """
7040 Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``.
7041
7042 Returns
7043 -------
7044 {klass} or None
7045 Object with missing values filled or None if ``inplace=True``.
7046 """
7047 return self.fillna(
7048 method="ffill", axis=axis, inplace=inplace, limit=limit, downcast=downcast
7049 )
7050
7051 @doc(klass=_shared_doc_kwargs["klass"])
7052 def pad(
7053 self: NDFrameT,
7054 *,
7055 axis: None | Axis = None,
7056 inplace: bool_t = False,
7057 limit: None | int = None,
7058 downcast: dict | None = None,
7059 ) -> NDFrameT | None:
7060 """
7061 Synonym for :meth:`DataFrame.fillna` with ``method='ffill'``.
7062
7063 .. deprecated:: 2.0
7064
7065 {klass}.pad is deprecated. Use {klass}.ffill instead.
7066
7067 Returns
7068 -------
7069 {klass} or None
7070 Object with missing values filled or None if ``inplace=True``.
7071 """
7072 warnings.warn(
7073 "DataFrame.pad/Series.pad is deprecated. Use "
7074 "DataFrame.ffill/Series.ffill instead",
7075 FutureWarning,
7076 stacklevel=find_stack_level(),
7077 )
7078 return self.ffill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
7079
7080 @overload
7081 def bfill(
7082 self: NDFrameT,
7083 *,
7084 axis: None | Axis = ...,
7085 inplace: Literal[False] = ...,
7086 limit: None | int = ...,
7087 downcast: dict | None = ...,
7088 ) -> NDFrameT:
7089 ...
7090
7091 @overload
7092 def bfill(
7093 self,
7094 *,
7095 axis: None | Axis = ...,
7096 inplace: Literal[True],
7097 limit: None | int = ...,
7098 downcast: dict | None = ...,
7099 ) -> None:
7100 ...
7101
7102 @overload
7103 def bfill(
7104 self: NDFrameT,
7105 *,
7106 axis: None | Axis = ...,
7107 inplace: bool_t = ...,
7108 limit: None | int = ...,
7109 downcast: dict | None = ...,
7110 ) -> NDFrameT | None:
7111 ...
7112
7113 @doc(klass=_shared_doc_kwargs["klass"])
7114 def bfill(
7115 self: NDFrameT,
7116 *,
7117 axis: None | Axis = None,
7118 inplace: bool_t = False,
7119 limit: None | int = None,
7120 downcast: dict | None = None,
7121 ) -> NDFrameT | None:
7122 """
7123 Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``.
7124
7125 Returns
7126 -------
7127 {klass} or None
7128 Object with missing values filled or None if ``inplace=True``.
7129 """
7130 return self.fillna(
7131 method="bfill", axis=axis, inplace=inplace, limit=limit, downcast=downcast
7132 )
7133
7134 @doc(klass=_shared_doc_kwargs["klass"])
7135 def backfill(
7136 self: NDFrameT,
7137 *,
7138 axis: None | Axis = None,
7139 inplace: bool_t = False,
7140 limit: None | int = None,
7141 downcast: dict | None = None,
7142 ) -> NDFrameT | None:
7143 """
7144 Synonym for :meth:`DataFrame.fillna` with ``method='bfill'``.
7145
7146 .. deprecated:: 2.0
7147
7148 {klass}.backfill is deprecated. Use {klass}.bfill instead.
7149
7150 Returns
7151 -------
7152 {klass} or None
7153 Object with missing values filled or None if ``inplace=True``.
7154 """
7155 warnings.warn(
7156 "DataFrame.backfill/Series.backfill is deprecated. Use "
7157 "DataFrame.bfill/Series.bfill instead",
7158 FutureWarning,
7159 stacklevel=find_stack_level(),
7160 )
7161 return self.bfill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
7162
7163 @overload
7164 def replace(
7165 self: NDFrameT,
7166 to_replace=...,
7167 value=...,
7168 *,
7169 inplace: Literal[False] = ...,
7170 limit: int | None = ...,
7171 regex: bool_t = ...,
7172 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
7173 ) -> NDFrameT:
7174 ...
7175
7176 @overload
7177 def replace(
7178 self,
7179 to_replace=...,
7180 value=...,
7181 *,
7182 inplace: Literal[True],
7183 limit: int | None = ...,
7184 regex: bool_t = ...,
7185 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
7186 ) -> None:
7187 ...
7188
7189 @overload
7190 def replace(
7191 self: NDFrameT,
7192 to_replace=...,
7193 value=...,
7194 *,
7195 inplace: bool_t = ...,
7196 limit: int | None = ...,
7197 regex: bool_t = ...,
7198 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
7199 ) -> NDFrameT | None:
7200 ...
7201
7202 @doc(
7203 _shared_docs["replace"],
7204 klass=_shared_doc_kwargs["klass"],
7205 inplace=_shared_doc_kwargs["inplace"],
7206 replace_iloc=_shared_doc_kwargs["replace_iloc"],
7207 )
7208 def replace(
7209 self: NDFrameT,
7210 to_replace=None,
7211 value=lib.no_default,
7212 *,
7213 inplace: bool_t = False,
7214 limit: int | None = None,
7215 regex: bool_t = False,
7216 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = lib.no_default,
7217 ) -> NDFrameT | None:
7218 if not (
7219 is_scalar(to_replace)
7220 or is_re_compilable(to_replace)
7221 or is_list_like(to_replace)
7222 ):
7223 raise TypeError(
7224 "Expecting 'to_replace' to be either a scalar, array-like, "
7225 "dict or None, got invalid type "
7226 f"{repr(type(to_replace).__name__)}"
7227 )
7228
7229 inplace = validate_bool_kwarg(inplace, "inplace")
7230 if not is_bool(regex) and to_replace is not None:
7231 raise ValueError("'to_replace' must be 'None' if 'regex' is not a bool")
7232
7233 if value is lib.no_default or method is not lib.no_default:
7234 # GH#36984 if the user explicitly passes value=None we want to
7235 # respect that. We have the corner case where the user explicitly
7236 # passes value=None *and* a method, which we interpret as meaning
7237 # they want the (documented) default behavior.
7238 if method is lib.no_default:
7239 # TODO: get this to show up as the default in the docs?
7240 method = "pad"
7241
7242 # passing a single value that is scalar like
7243 # when value is None (GH5319), for compat
7244 if not is_dict_like(to_replace) and not is_dict_like(regex):
7245 to_replace = [to_replace]
7246
7247 if isinstance(to_replace, (tuple, list)):
            # TODO: Consider copy-on-write for non-replaced columns here
7249 if isinstance(self, ABCDataFrame):
7250 from pandas import Series
7251
7252 result = self.apply(
7253 Series._replace_single,
7254 args=(to_replace, method, inplace, limit),
7255 )
7256 if inplace:
7257 return None
7258 return result
7259 return self._replace_single(to_replace, method, inplace, limit)
7260
7261 if not is_dict_like(to_replace):
7262 if not is_dict_like(regex):
7263 raise TypeError(
7264 'If "to_replace" and "value" are both None '
7265 'and "to_replace" is not a list, then '
7266 "regex must be a mapping"
7267 )
7268 to_replace = regex
7269 regex = True
7270
7271 items = list(to_replace.items())
7272 if items:
7273 keys, values = zip(*items)
7274 else:
7275 keys, values = ([], [])
7276
7277 are_mappings = [is_dict_like(v) for v in values]
7278
7279 if any(are_mappings):
7280 if not all(are_mappings):
7281 raise TypeError(
7282 "If a nested mapping is passed, all values "
7283 "of the top level mapping must be mappings"
7284 )
7285 # passed a nested dict/Series
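                # e.g. {"A": {"old": "new"}} is split into
                # to_replace={"A": ["old"]} and value={"A": ["new"]}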
7286 to_rep_dict = {}
7287 value_dict = {}
7288
7289 for k, v in items:
7290 keys, values = list(zip(*v.items())) or ([], [])
7291
7292 to_rep_dict[k] = list(keys)
7293 value_dict[k] = list(values)
7294
7295 to_replace, value = to_rep_dict, value_dict
7296 else:
7297 to_replace, value = keys, values
7298
7299 return self.replace(
7300 to_replace, value, inplace=inplace, limit=limit, regex=regex
7301 )
7302 else:
7303 # need a non-zero len on all axes
7304 if not self.size:
7305 if inplace:
7306 return None
7307 return self.copy(deep=None)
7308
7309 if is_dict_like(to_replace):
7310 if is_dict_like(value): # {'A' : NA} -> {'A' : 0}
7311 # Note: Checking below for `in foo.keys()` instead of
7312 # `in foo` is needed for when we have a Series and not dict
7313 mapping = {
7314 col: (to_replace[col], value[col])
7315 for col in to_replace.keys()
7316 if col in value.keys() and col in self
7317 }
7318 return self._replace_columnwise(mapping, inplace, regex)
7319
7320 # {'A': NA} -> 0
7321 elif not is_list_like(value):
7322 # Operate column-wise
7323 if self.ndim == 1:
7324 raise ValueError(
7325 "Series.replace cannot use dict-like to_replace "
7326 "and non-None value"
7327 )
7328 mapping = {
7329 col: (to_rep, value) for col, to_rep in to_replace.items()
7330 }
7331 return self._replace_columnwise(mapping, inplace, regex)
7332 else:
7333 raise TypeError("value argument must be scalar, dict, or Series")
7334
7335 elif is_list_like(to_replace):
7336 if not is_list_like(value):
7337 # e.g. to_replace = [NA, ''] and value is 0,
7338 # so we replace NA with 0 and then replace '' with 0
7339 value = [value] * len(to_replace)
7340
7341 # e.g. we have to_replace = [NA, ''] and value = [0, 'missing']
7342 if len(to_replace) != len(value):
7343 raise ValueError(
7344 f"Replacement lists must match in length. "
7345 f"Expecting {len(to_replace)} got {len(value)} "
7346 )
7347 new_data = self._mgr.replace_list(
7348 src_list=to_replace,
7349 dest_list=value,
7350 inplace=inplace,
7351 regex=regex,
7352 )
7353
7354 elif to_replace is None:
7355 if not (
7356 is_re_compilable(regex)
7357 or is_list_like(regex)
7358 or is_dict_like(regex)
7359 ):
7360 raise TypeError(
7361 f"'regex' must be a string or a compiled regular expression "
7362 f"or a list or dict of strings or regular expressions, "
7363 f"you passed a {repr(type(regex).__name__)}"
7364 )
7365 return self.replace(
7366 regex, value, inplace=inplace, limit=limit, regex=True
7367 )
7368 else:
7369 # dest iterable dict-like
7370 if is_dict_like(value): # NA -> {'A' : 0, 'B' : -1}
7371 # Operate column-wise
7372 if self.ndim == 1:
7373 raise ValueError(
7374 "Series.replace cannot use dict-value and "
7375 "non-None to_replace"
7376 )
7377 mapping = {col: (to_replace, val) for col, val in value.items()}
7378 return self._replace_columnwise(mapping, inplace, regex)
7379
7380 elif not is_list_like(value): # NA -> 0
7381 regex = should_use_regex(regex, to_replace)
7382 if regex:
7383 new_data = self._mgr.replace_regex(
7384 to_replace=to_replace,
7385 value=value,
7386 inplace=inplace,
7387 )
7388 else:
7389 new_data = self._mgr.replace(
7390 to_replace=to_replace, value=value, inplace=inplace
7391 )
7392 else:
7393 raise TypeError(
7394 f'Invalid "to_replace" type: {repr(type(to_replace).__name__)}'
7395 )
7396
7397 result = self._constructor(new_data)
7398 if inplace:
7399 return self._update_inplace(result)
7400 else:
7401 return result.__finalize__(self, method="replace")
7402
7403 def interpolate(
7404 self: NDFrameT,
7405 method: str = "linear",
7406 *,
7407 axis: Axis = 0,
7408 limit: int | None = None,
7409 inplace: bool_t = False,
7410 limit_direction: str | None = None,
7411 limit_area: str | None = None,
7412 downcast: str | None = None,
7413 **kwargs,
7414 ) -> NDFrameT | None:
7415 """
7416 Fill NaN values using an interpolation method.
7417
7418 Please note that only ``method='linear'`` is supported for
7419 DataFrame/Series with a MultiIndex.
7420
7421 Parameters
7422 ----------
7423 method : str, default 'linear'
7424 Interpolation technique to use. One of:
7425
7426 * 'linear': Ignore the index and treat the values as equally
7427 spaced. This is the only method supported on MultiIndexes.
7428 * 'time': Works on daily and higher resolution data to interpolate
7429 given length of interval.
7430 * 'index', 'values': use the actual numerical values of the index.
7431 * 'pad': Fill in NaNs using existing values.
7432 * 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
7433 'barycentric', 'polynomial': Passed to
7434 `scipy.interpolate.interp1d`, whereas 'spline' is passed to
7435 `scipy.interpolate.UnivariateSpline`. These methods use the numerical
7436 values of the index. Both 'polynomial' and 'spline' require that
7437 you also specify an `order` (int), e.g.
              ``df.interpolate(method='polynomial', order=5)``. Note that
              the `slinear` method in pandas refers to the SciPy first-order
              `spline`, not to a pandas first-order `spline`.
7441 * 'krogh', 'piecewise_polynomial', 'spline', 'pchip', 'akima',
7442 'cubicspline': Wrappers around the SciPy interpolation methods of
7443 similar names. See `Notes`.
7444 * 'from_derivatives': Refers to
7445 `scipy.interpolate.BPoly.from_derivatives` which
7446 replaces 'piecewise_polynomial' interpolation method in
7447 scipy 0.18.
7448
7449 axis : {{0 or 'index', 1 or 'columns', None}}, default None
7450 Axis to interpolate along. For `Series` this parameter is unused
7451 and defaults to 0.
7452 limit : int, optional
7453 Maximum number of consecutive NaNs to fill. Must be greater than
7454 0.
7455 inplace : bool, default False
7456 Update the data in place if possible.
        limit_direction : {{'forward', 'backward', 'both'}}, optional
7458 Consecutive NaNs will be filled in this direction.
7459
7460 If limit is specified:
7461 * If 'method' is 'pad' or 'ffill', 'limit_direction' must be 'forward'.
7462 * If 'method' is 'backfill' or 'bfill', 'limit_direction' must be
              'backward'.
7464
7465 If 'limit' is not specified:
7466 * If 'method' is 'backfill' or 'bfill', the default is 'backward'
7467 * else the default is 'forward'
7468
7469 .. versionchanged:: 1.1.0
7470 raises ValueError if `limit_direction` is 'forward' or 'both' and
7471 method is 'backfill' or 'bfill'.
7472 raises ValueError if `limit_direction` is 'backward' or 'both' and
7473 method is 'pad' or 'ffill'.
7474
7475 limit_area : {{`None`, 'inside', 'outside'}}, default None
7476 If limit is specified, consecutive NaNs will be filled with this
7477 restriction.
7478
7479 * ``None``: No fill restriction.
7480 * 'inside': Only fill NaNs surrounded by valid values
7481 (interpolate).
7482 * 'outside': Only fill NaNs outside valid values (extrapolate).
7483
7484 downcast : optional, 'infer' or None, defaults to None
7485 Downcast dtypes if possible.
7486 ``**kwargs`` : optional
7487 Keyword arguments to pass on to the interpolating function.
7488
7489 Returns
7490 -------
7491 Series or DataFrame or None
7492 Returns the same object type as the caller, interpolated at
7493 some or all ``NaN`` values or None if ``inplace=True``.
7494
7495 See Also
7496 --------
7497 fillna : Fill missing values using different methods.
7498 scipy.interpolate.Akima1DInterpolator : Piecewise cubic polynomials
7499 (Akima interpolator).
7500 scipy.interpolate.BPoly.from_derivatives : Piecewise polynomial in the
7501 Bernstein basis.
7502 scipy.interpolate.interp1d : Interpolate a 1-D function.
7503 scipy.interpolate.KroghInterpolator : Interpolate polynomial (Krogh
7504 interpolator).
7505 scipy.interpolate.PchipInterpolator : PCHIP 1-d monotonic cubic
7506 interpolation.
7507 scipy.interpolate.CubicSpline : Cubic spline data interpolator.
7508
7509 Notes
7510 -----
7511 The 'krogh', 'piecewise_polynomial', 'spline', 'pchip' and 'akima'
7512 methods are wrappers around the respective SciPy implementations of
7513 similar names. These use the actual numerical values of the index.
7514 For more information on their behavior, see the
7515 `SciPy documentation
7516 <https://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation>`__.
7517
7518 Examples
7519 --------
7520 Filling in ``NaN`` in a :class:`~pandas.Series` via linear
7521 interpolation.
7522
7523 >>> s = pd.Series([0, 1, np.nan, 3])
7524 >>> s
7525 0 0.0
7526 1 1.0
7527 2 NaN
7528 3 3.0
7529 dtype: float64
7530 >>> s.interpolate()
7531 0 0.0
7532 1 1.0
7533 2 2.0
7534 3 3.0
7535 dtype: float64
7536
7537 Filling in ``NaN`` in a Series by padding, but filling at most two
7538 consecutive ``NaN`` at a time.
7539
7540 >>> s = pd.Series([np.nan, "single_one", np.nan,
7541 ... "fill_two_more", np.nan, np.nan, np.nan,
7542 ... 4.71, np.nan])
7543 >>> s
7544 0 NaN
7545 1 single_one
7546 2 NaN
7547 3 fill_two_more
7548 4 NaN
7549 5 NaN
7550 6 NaN
7551 7 4.71
7552 8 NaN
7553 dtype: object
7554 >>> s.interpolate(method='pad', limit=2)
7555 0 NaN
7556 1 single_one
7557 2 single_one
7558 3 fill_two_more
7559 4 fill_two_more
7560 5 fill_two_more
7561 6 NaN
7562 7 4.71
7563 8 4.71
7564 dtype: object
7565
7566 Filling in ``NaN`` in a Series via polynomial interpolation or splines:
7567 Both 'polynomial' and 'spline' methods require that you also specify
7568 an ``order`` (int).
7569
7570 >>> s = pd.Series([0, 2, np.nan, 8])
7571 >>> s.interpolate(method='polynomial', order=2)
7572 0 0.000000
7573 1 2.000000
7574 2 4.666667
7575 3 8.000000
7576 dtype: float64
7577
7578 Fill the DataFrame forward (that is, going down) along each column
7579 using linear interpolation.
7580
7581 Note how the last entry in column 'a' is interpolated differently,
7582 because there is no entry after it to use for interpolation.
7583 Note how the first entry in column 'b' remains ``NaN``, because there
7584 is no entry before it to use for interpolation.
7585
7586 >>> df = pd.DataFrame([(0.0, np.nan, -1.0, 1.0),
7587 ... (np.nan, 2.0, np.nan, np.nan),
7588 ... (2.0, 3.0, np.nan, 9.0),
7589 ... (np.nan, 4.0, -4.0, 16.0)],
7590 ... columns=list('abcd'))
7591 >>> df
7592 a b c d
7593 0 0.0 NaN -1.0 1.0
7594 1 NaN 2.0 NaN NaN
7595 2 2.0 3.0 NaN 9.0
7596 3 NaN 4.0 -4.0 16.0
7597 >>> df.interpolate(method='linear', limit_direction='forward', axis=0)
7598 a b c d
7599 0 0.0 NaN -1.0 1.0
7600 1 1.0 2.0 -2.0 5.0
7601 2 2.0 3.0 -3.0 9.0
7602 3 2.0 4.0 -4.0 16.0
7603
7604 Using polynomial interpolation.
7605
7606 >>> df['d'].interpolate(method='polynomial', order=2)
7607 0 1.0
7608 1 4.0
7609 2 9.0
7610 3 16.0
7611 Name: d, dtype: float64
7612 """
7613 inplace = validate_bool_kwarg(inplace, "inplace")
7614
7615 axis = self._get_axis_number(axis)
7616
7617 fillna_methods = ["ffill", "bfill", "pad", "backfill"]
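        # the fillna-style methods handle ``axis`` directly; true
        # interpolation is implemented along axis 0, so transpose for axis=1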
7618 should_transpose = axis == 1 and method not in fillna_methods
7619
7620 obj = self.T if should_transpose else self
7621
7622 if obj.empty:
7623 return self.copy()
7624
7625 if method not in fillna_methods:
7626 axis = self._info_axis_number
7627
7628 if isinstance(obj.index, MultiIndex) and method != "linear":
7629 raise ValueError(
7630 "Only `method=linear` interpolation is supported on MultiIndexes."
7631 )
7632
7633 # Set `limit_direction` depending on `method`
7634 if limit_direction is None:
7635 limit_direction = (
7636 "backward" if method in ("backfill", "bfill") else "forward"
7637 )
7638 else:
7639 if method in ("pad", "ffill") and limit_direction != "forward":
7640 raise ValueError(
7641 f"`limit_direction` must be 'forward' for method `{method}`"
7642 )
7643 if method in ("backfill", "bfill") and limit_direction != "backward":
7644 raise ValueError(
7645 f"`limit_direction` must be 'backward' for method `{method}`"
7646 )
7647
7648 if obj.ndim == 2 and np.all(obj.dtypes == np.dtype("object")):
7649 raise TypeError(
7650 "Cannot interpolate with all object-dtype columns "
7651 "in the DataFrame. Try setting at least one "
7652 "column to a numeric dtype."
7653 )
7654
7655 # create/use the index
7656 if method == "linear":
7657 # prior default
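            # "linear" ignores the actual index values and treats the
            # points as equally spaced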
7658 index = Index(np.arange(len(obj.index)))
7659 else:
7660 index = obj.index
7661 methods = {"index", "values", "nearest", "time"}
7662 is_numeric_or_datetime = (
7663 is_numeric_dtype(index.dtype)
7664 or is_datetime64_any_dtype(index.dtype)
7665 or is_timedelta64_dtype(index.dtype)
7666 )
7667 if method not in methods and not is_numeric_or_datetime:
7668 raise ValueError(
7669 "Index column must be numeric or datetime type when "
7670 f"using {method} method other than linear. "
7671 "Try setting a numeric or datetime index column before "
7672 "interpolating."
7673 )
7674
7675 if isna(index).any():
7676 raise NotImplementedError(
7677 "Interpolation with NaNs in the index "
7678 "has not been implemented. Try filling "
7679 "those NaNs before interpolating."
7680 )
7681 new_data = obj._mgr.interpolate(
7682 method=method,
7683 axis=axis,
7684 index=index,
7685 limit=limit,
7686 limit_direction=limit_direction,
7687 limit_area=limit_area,
7688 inplace=inplace,
7689 downcast=downcast,
7690 **kwargs,
7691 )
7692
7693 result = self._constructor(new_data)
7694 if should_transpose:
7695 result = result.T
7696 if inplace:
7697 return self._update_inplace(result)
7698 else:
7699 return result.__finalize__(self, method="interpolate")
7700
7701 # ----------------------------------------------------------------------
    # Timeseries methods
7703
7704 @final
7705 def asof(self, where, subset=None):
7706 """
        Return the last row(s) without any NaNs before `where`.

        The last row (for each element in `where`, if list) without any
        NaN is taken.
        In case of a :class:`~pandas.DataFrame`, the last row without NaN
        is taken considering only the subset of columns (if not `None`).

        If there is no good value, NaN is returned for a Series or
        a Series of NaN values for a DataFrame.
7716
7717 Parameters
7718 ----------
7719 where : date or array-like of dates
7720 Date(s) before which the last row(s) are returned.
7721 subset : str or array-like of str, default `None`
7722 For DataFrame, if not `None`, only use these columns to
7723 check for NaNs.
7724
7725 Returns
7726 -------
7727 scalar, Series, or DataFrame
7728
7729 The return can be:
7730
            * scalar : when `self` is a Series and `where` is a scalar
            * Series : when `self` is a Series and `where` is an array-like,
              or when `self` is a DataFrame and `where` is a scalar
            * DataFrame : when `self` is a DataFrame and `where` is an
              array-like
7738
7739 See Also
7740 --------
7741 merge_asof : Perform an asof merge. Similar to left join.
7742
7743 Notes
7744 -----
7745 Dates are assumed to be sorted. Raises if this is not the case.
7746
7747 Examples
7748 --------
7749 A Series and a scalar `where`.
7750
7751 >>> s = pd.Series([1, 2, np.nan, 4], index=[10, 20, 30, 40])
7752 >>> s
7753 10 1.0
7754 20 2.0
7755 30 NaN
7756 40 4.0
7757 dtype: float64
7758
7759 >>> s.asof(20)
7760 2.0
7761
7762 For a sequence `where`, a Series is returned. The first value is
7763 NaN, because the first element of `where` is before the first
7764 index value.
7765
7766 >>> s.asof([5, 20])
7767 5 NaN
7768 20 2.0
7769 dtype: float64
7770
7771 Missing values are not considered. The following is ``2.0``, not
7772 NaN, even though NaN is at the index location for ``30``.
7773
7774 >>> s.asof(30)
7775 2.0
7776
        Take all columns into consideration.
7778
7779 >>> df = pd.DataFrame({'a': [10, 20, 30, 40, 50],
7780 ... 'b': [None, None, None, None, 500]},
7781 ... index=pd.DatetimeIndex(['2018-02-27 09:01:00',
7782 ... '2018-02-27 09:02:00',
7783 ... '2018-02-27 09:03:00',
7784 ... '2018-02-27 09:04:00',
7785 ... '2018-02-27 09:05:00']))
7786 >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30',
7787 ... '2018-02-27 09:04:30']))
7788 a b
7789 2018-02-27 09:03:30 NaN NaN
7790 2018-02-27 09:04:30 NaN NaN
7791
        Take a single column into consideration.
7793
7794 >>> df.asof(pd.DatetimeIndex(['2018-02-27 09:03:30',
7795 ... '2018-02-27 09:04:30']),
7796 ... subset=['a'])
7797 a b
7798 2018-02-27 09:03:30 30 NaN
7799 2018-02-27 09:04:30 40 NaN
7800 """
7801 if isinstance(where, str):
7802 where = Timestamp(where)
7803
7804 if not self.index.is_monotonic_increasing:
7805 raise ValueError("asof requires a sorted index")
7806
7807 is_series = isinstance(self, ABCSeries)
7808 if is_series:
7809 if subset is not None:
7810 raise ValueError("subset is not valid for Series")
7811 else:
7812 if subset is None:
7813 subset = self.columns
7814 if not is_list_like(subset):
7815 subset = [subset]
7816
7817 is_list = is_list_like(where)
7818 if not is_list:
7819 start = self.index[0]
7820 if isinstance(self.index, PeriodIndex):
7821 where = Period(where, freq=self.index.freq)
7822
7823 if where < start:
7824 if not is_series:
7825 return self._constructor_sliced(
7826 index=self.columns, name=where, dtype=np.float64
7827 )
7828 return np.nan
7829
7830 # It's always much faster to use a *while* loop here for
7831 # Series than pre-computing all the NAs. However a
7832 # *while* loop is extremely expensive for DataFrame
7833 # so we later pre-compute all the NAs and use the same
7834 # code path whether *where* is a scalar or list.
7835 # See PR: https://github.com/pandas-dev/pandas/pull/14476
7836 if is_series:
7837 loc = self.index.searchsorted(where, side="right")
7838 if loc > 0:
7839 loc -= 1
7840
7841 values = self._values
7842 while loc > 0 and isna(values[loc]):
7843 loc -= 1
7844 return values[loc]
7845
7846 if not isinstance(where, Index):
7847 where = Index(where) if is_list else Index([where])
7848
7849 nulls = self.isna() if is_series else self[subset].isna().any(axis=1)
7850 if nulls.all():
7851 if is_series:
7852 self = cast("Series", self)
7853 return self._constructor(np.nan, index=where, name=self.name)
7854 elif is_list:
7855 self = cast("DataFrame", self)
7856 return self._constructor(np.nan, index=where, columns=self.columns)
7857 else:
7858 self = cast("DataFrame", self)
7859 return self._constructor_sliced(
7860 np.nan, index=self.columns, name=where[0]
7861 )
7862
7863 locs = self.index.asof_locs(where, ~(nulls._values))
7864
7865 # mask the missing
7866 missing = locs == -1
7867 data = self.take(locs)
7868 data.index = where
7869 if missing.any():
7870 # GH#16063 only do this setting when necessary, otherwise
7871 # we'd cast e.g. bools to floats
7872 data.loc[missing] = np.nan
7873 return data if is_list else data.iloc[-1]
7874
7875 # ----------------------------------------------------------------------
7876 # Action Methods
7877
7878 @doc(klass=_shared_doc_kwargs["klass"])
7879 def isna(self: NDFrameT) -> NDFrameT:
7880 """
7881 Detect missing values.
7882
7883 Return a boolean same-sized object indicating if the values are NA.
        NA values, such as None or :attr:`numpy.NaN`, get mapped to True
        values.
        Everything else gets mapped to False values. Values such as empty
        strings ``''`` or :attr:`numpy.inf` are not considered NA values
        (unless you set ``pandas.options.mode.use_inf_as_na = True``).
7889
7890 Returns
7891 -------
7892 {klass}
7893 Mask of bool values for each element in {klass} that
7894 indicates whether an element is an NA value.
7895
7896 See Also
7897 --------
7898 {klass}.isnull : Alias of isna.
7899 {klass}.notna : Boolean inverse of isna.
7900 {klass}.dropna : Omit axes labels with missing values.
7901 isna : Top-level isna.
7902
7903 Examples
7904 --------
7905 Show which entries in a DataFrame are NA.
7906
7907 >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
7908 ... born=[pd.NaT, pd.Timestamp('1939-05-27'),
7909 ... pd.Timestamp('1940-04-25')],
7910 ... name=['Alfred', 'Batman', ''],
7911 ... toy=[None, 'Batmobile', 'Joker']))
7912 >>> df
7913 age born name toy
7914 0 5.0 NaT Alfred None
7915 1 6.0 1939-05-27 Batman Batmobile
7916 2 NaN 1940-04-25 Joker
7917
7918 >>> df.isna()
7919 age born name toy
7920 0 False True False True
7921 1 False False False False
7922 2 True False False False
7923
7924 Show which entries in a Series are NA.
7925
7926 >>> ser = pd.Series([5, 6, np.NaN])
7927 >>> ser
7928 0 5.0
7929 1 6.0
7930 2 NaN
7931 dtype: float64
7932
7933 >>> ser.isna()
7934 0 False
7935 1 False
7936 2 True
7937 dtype: bool
7938 """
7939 return isna(self).__finalize__(self, method="isna")
7940
7941 @doc(isna, klass=_shared_doc_kwargs["klass"])
7942 def isnull(self: NDFrameT) -> NDFrameT:
7943 return isna(self).__finalize__(self, method="isnull")
7944
7945 @doc(klass=_shared_doc_kwargs["klass"])
7946 def notna(self: NDFrameT) -> NDFrameT:
7947 """
7948 Detect existing (non-missing) values.
7949
7950 Return a boolean same-sized object indicating if the values are not NA.
        Non-missing values get mapped to True. Values such as empty
        strings ``''`` or :attr:`numpy.inf` are not considered NA values
        (unless you set ``pandas.options.mode.use_inf_as_na = True``).
7954 NA values, such as None or :attr:`numpy.NaN`, get mapped to False
7955 values.
7956
7957 Returns
7958 -------
7959 {klass}
7960 Mask of bool values for each element in {klass} that
7961 indicates whether an element is not an NA value.
7962
7963 See Also
7964 --------
7965 {klass}.notnull : Alias of notna.
7966 {klass}.isna : Boolean inverse of notna.
7967 {klass}.dropna : Omit axes labels with missing values.
7968 notna : Top-level notna.
7969
7970 Examples
7971 --------
7972 Show which entries in a DataFrame are not NA.
7973
7974 >>> df = pd.DataFrame(dict(age=[5, 6, np.NaN],
7975 ... born=[pd.NaT, pd.Timestamp('1939-05-27'),
7976 ... pd.Timestamp('1940-04-25')],
7977 ... name=['Alfred', 'Batman', ''],
7978 ... toy=[None, 'Batmobile', 'Joker']))
7979 >>> df
7980 age born name toy
7981 0 5.0 NaT Alfred None
7982 1 6.0 1939-05-27 Batman Batmobile
7983 2 NaN 1940-04-25 Joker
7984
7985 >>> df.notna()
7986 age born name toy
7987 0 True False True False
7988 1 True True True True
7989 2 False True True True
7990
7991 Show which entries in a Series are not NA.
7992
7993 >>> ser = pd.Series([5, 6, np.NaN])
7994 >>> ser
7995 0 5.0
7996 1 6.0
7997 2 NaN
7998 dtype: float64
7999
8000 >>> ser.notna()
8001 0 True
8002 1 True
8003 2 False
8004 dtype: bool
8005 """
8006 return notna(self).__finalize__(self, method="notna")
8007
8008 @doc(notna, klass=_shared_doc_kwargs["klass"])
8009 def notnull(self: NDFrameT) -> NDFrameT:
8010 return notna(self).__finalize__(self, method="notnull")
8011
8012 @final
8013 def _clip_with_scalar(self, lower, upper, inplace: bool_t = False):
8014 if (lower is not None and np.any(isna(lower))) or (
8015 upper is not None and np.any(isna(upper))
8016 ):
8017 raise ValueError("Cannot use an NA value as a clip threshold")
8018
8019 result = self
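        # remember where the original values are NA so they can be
        # restored after clipping; the threshold comparisons below
        # evaluate to False for NA and would otherwise overwrite them
        # with the bound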
8020 mask = isna(self._values)
8021
8022 with np.errstate(all="ignore"):
8023 if upper is not None:
8024 subset = self <= upper
8025 result = result.where(subset, upper, axis=None, inplace=False)
8026 if lower is not None:
8027 subset = self >= lower
8028 result = result.where(subset, lower, axis=None, inplace=False)
8029
8030 if np.any(mask):
8031 result[mask] = np.nan
8032
8033 if inplace:
8034 return self._update_inplace(result)
8035 else:
8036 return result
8037
8038 @final
8039 def _clip_with_one_bound(self, threshold, method, axis, inplace):
8040 if axis is not None:
8041 axis = self._get_axis_number(axis)
8042
8043 # method is self.le for upper bound and self.ge for lower bound
8044 if is_scalar(threshold) and is_number(threshold):
8045 if method.__name__ == "le":
8046 return self._clip_with_scalar(None, threshold, inplace=inplace)
8047 return self._clip_with_scalar(threshold, None, inplace=inplace)
8048
8049 # GH #15390
8050 # In order for where method to work, the threshold must
8051 # be transformed to NDFrame from other array like structure.
8052 if (not isinstance(threshold, ABCSeries)) and is_list_like(threshold):
8053 if isinstance(self, ABCSeries):
8054 threshold = self._constructor(threshold, index=self.index)
8055 else:
8056 threshold = align_method_FRAME(self, threshold, axis, flex=None)[1]
8057
8058 # GH 40420
8059 # Treat missing thresholds as no bounds, not clipping the values
8060 if is_list_like(threshold):
8061 fill_value = np.inf if method.__name__ == "le" else -np.inf
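            # e.g. for an upper bound (method "le") a missing threshold
            # becomes +inf, so the comparison is always True and the
            # original value is kept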
8062 threshold_inf = threshold.fillna(fill_value)
8063 else:
8064 threshold_inf = threshold
8065
8066 subset = method(threshold_inf, axis=axis) | isna(self)
8067
8068 # GH 40420
8069 return self.where(subset, threshold, axis=axis, inplace=inplace)
8070
8071 def clip(
8072 self: NDFrameT,
8073 lower=None,
8074 upper=None,
8075 *,
8076 axis: Axis | None = None,
8077 inplace: bool_t = False,
8078 **kwargs,
8079 ) -> NDFrameT | None:
8080 """
8081 Trim values at input threshold(s).
8082
        Assigns values outside boundary to boundary values. Thresholds
        can be singular values or array-like, and in the latter case
        the clipping is performed element-wise along the specified axis.
8086
8087 Parameters
8088 ----------
        lower : float or array-like, default None
            Minimum threshold value. All values below this
            threshold will be set to it. A missing
            threshold (e.g. `NA`) will not clip the value.
        upper : float or array-like, default None
            Maximum threshold value. All values above this
            threshold will be set to it. A missing
            threshold (e.g. `NA`) will not clip the value.
8097 axis : {{0 or 'index', 1 or 'columns', None}}, default None
8098 Align object with lower and upper along the given axis.
8099 For `Series` this parameter is unused and defaults to `None`.
8100 inplace : bool, default False
8101 Whether to perform the operation in place on the data.
        **kwargs
            Additional keywords have no effect but might be accepted
            for compatibility with numpy.
8105
8106 Returns
8107 -------
8108 Series or DataFrame or None
8109 Same type as calling object with the values outside the
8110 clip boundaries replaced or None if ``inplace=True``.
8111
8112 See Also
8113 --------
8114 Series.clip : Trim values at input threshold in series.
8115 DataFrame.clip : Trim values at input threshold in dataframe.
8116 numpy.clip : Clip (limit) the values in an array.
8117
8118 Examples
8119 --------
8120 >>> data = {'col_0': [9, -3, 0, -1, 5], 'col_1': [-2, -7, 6, 8, -5]}
8121 >>> df = pd.DataFrame(data)
8122 >>> df
8123 col_0 col_1
8124 0 9 -2
8125 1 -3 -7
8126 2 0 6
8127 3 -1 8
8128 4 5 -5
8129
8130 Clips per column using lower and upper thresholds:
8131
8132 >>> df.clip(-4, 6)
8133 col_0 col_1
8134 0 6 -2
8135 1 -3 -4
8136 2 0 6
8137 3 -1 6
8138 4 5 -4
8139
8140 Clips using specific lower and upper thresholds per column element:
8141
8142 >>> t = pd.Series([2, -4, -1, 6, 3])
8143 >>> t
8144 0 2
8145 1 -4
8146 2 -1
8147 3 6
8148 4 3
8149 dtype: int64
8150
8151 >>> df.clip(t, t + 4, axis=0)
8152 col_0 col_1
8153 0 6 2
8154 1 -3 -4
8155 2 0 3
8156 3 6 8
8157 4 5 3
8158
        Clips using a specific lower threshold per column element, with missing values:
8160
8161 >>> t = pd.Series([2, -4, np.NaN, 6, 3])
8162 >>> t
8163 0 2.0
8164 1 -4.0
8165 2 NaN
8166 3 6.0
8167 4 3.0
8168 dtype: float64
8169
8170 >>> df.clip(t, axis=0)
8171 col_0 col_1
8172 0 9 2
8173 1 -3 -4
8174 2 0 6
8175 3 6 8
8176 4 5 3
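
        Clips a single Series using only an upper threshold:

        >>> pd.Series([1, 5, 10]).clip(upper=6)
        0    1
        1    5
        2    6
        dtype: int64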
8177 """
8178 inplace = validate_bool_kwarg(inplace, "inplace")
8179
8180 axis = nv.validate_clip_with_axis(axis, (), kwargs)
8181 if axis is not None:
8182 axis = self._get_axis_number(axis)
8183
8184 # GH 17276
8185 # numpy doesn't like NaN as a clip value
8186 # so ignore
8187 # GH 19992
8188 # numpy doesn't drop a list-like bound containing NaN
8189 isna_lower = isna(lower)
8190 if not is_list_like(lower):
8191 if np.any(isna_lower):
8192 lower = None
8193 elif np.all(isna_lower):
8194 lower = None
8195 isna_upper = isna(upper)
8196 if not is_list_like(upper):
8197 if np.any(isna_upper):
8198 upper = None
8199 elif np.all(isna_upper):
8200 upper = None
8201
8202 # GH 2747 (arguments were reversed)
8203 if (
8204 lower is not None
8205 and upper is not None
8206 and is_scalar(lower)
8207 and is_scalar(upper)
8208 ):
8209 lower, upper = min(lower, upper), max(lower, upper)
8210
8211 # fast-path for scalars
8212 if (lower is None or (is_scalar(lower) and is_number(lower))) and (
8213 upper is None or (is_scalar(upper) and is_number(upper))
8214 ):
8215 return self._clip_with_scalar(lower, upper, inplace=inplace)
8216
8217 result = self
8218 if lower is not None:
8219 result = result._clip_with_one_bound(
8220 lower, method=self.ge, axis=axis, inplace=inplace
8221 )
8222 if upper is not None:
8223 if inplace:
8224 result = self
8225 result = result._clip_with_one_bound(
8226 upper, method=self.le, axis=axis, inplace=inplace
8227 )
8228
8229 return result
8230
8231 @doc(**_shared_doc_kwargs)
8232 def asfreq(
8233 self: NDFrameT,
8234 freq: Frequency,
8235 method: FillnaOptions | None = None,
8236 how: str | None = None,
8237 normalize: bool_t = False,
8238 fill_value: Hashable = None,
8239 ) -> NDFrameT:
8240 """
8241 Convert time series to specified frequency.
8242
8243 Returns the original data conformed to a new index with the specified
8244 frequency.
8245
8246 If the index of this {klass} is a :class:`~pandas.PeriodIndex`, the new index
8247 is the result of transforming the original index with
8248 :meth:`PeriodIndex.asfreq <pandas.PeriodIndex.asfreq>` (so the original index
8249 will map one-to-one to the new index).
8250
8251 Otherwise, the new index will be equivalent to ``pd.date_range(start, end,
8252 freq=freq)`` where ``start`` and ``end`` are, respectively, the first and
8253 last entries in the original index (see :func:`pandas.date_range`). The
8254 values corresponding to any timesteps in the new index which were not present
8255 in the original index will be null (``NaN``), unless a method for filling
8256 such unknowns is provided (see the ``method`` parameter below).
8257
8258 The :meth:`resample` method is more appropriate if an operation on each group of
8259 timesteps (such as an aggregate) is necessary to represent the data at the new
8260 frequency.
8261
8262 Parameters
8263 ----------
8264 freq : DateOffset or str
8265 Frequency DateOffset or string.
8266 method : {{'backfill'/'bfill', 'pad'/'ffill'}}, default None
8267 Method to use for filling holes in reindexed Series (note this
8268 does not fill NaNs that already were present):
8269
8270 * 'pad' / 'ffill': propagate last valid observation forward to next
8271 valid
8272 * 'backfill' / 'bfill': use NEXT valid observation to fill.
        how : {{'start', 'end'}}, default 'end'
            For PeriodIndex only (see PeriodIndex.asfreq).
8275 normalize : bool, default False
8276 Whether to reset output index to midnight.
8277 fill_value : scalar, optional
8278 Value to use for missing values, applied during upsampling (note
8279 this does not fill NaNs that already were present).
8280
8281 Returns
8282 -------
8283 {klass}
8284 {klass} object reindexed to the specified frequency.
8285
8286 See Also
8287 --------
8288 reindex : Conform DataFrame to new index with optional filling logic.
8289
8290 Notes
8291 -----
8292 To learn more about the frequency strings, please see `this link
8293 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
8294
8295 Examples
8296 --------
8297 Start by creating a series with 4 one minute timestamps.
8298
8299 >>> index = pd.date_range('1/1/2000', periods=4, freq='T')
8300 >>> series = pd.Series([0.0, None, 2.0, 3.0], index=index)
8301 >>> df = pd.DataFrame({{'s': series}})
8302 >>> df
8303 s
8304 2000-01-01 00:00:00 0.0
8305 2000-01-01 00:01:00 NaN
8306 2000-01-01 00:02:00 2.0
8307 2000-01-01 00:03:00 3.0
8308
8309 Upsample the series into 30 second bins.
8310
8311 >>> df.asfreq(freq='30S')
8312 s
8313 2000-01-01 00:00:00 0.0
8314 2000-01-01 00:00:30 NaN
8315 2000-01-01 00:01:00 NaN
8316 2000-01-01 00:01:30 NaN
8317 2000-01-01 00:02:00 2.0
8318 2000-01-01 00:02:30 NaN
8319 2000-01-01 00:03:00 3.0
8320
        Upsample again, providing a ``fill_value``.
8322
8323 >>> df.asfreq(freq='30S', fill_value=9.0)
8324 s
8325 2000-01-01 00:00:00 0.0
8326 2000-01-01 00:00:30 9.0
8327 2000-01-01 00:01:00 NaN
8328 2000-01-01 00:01:30 9.0
8329 2000-01-01 00:02:00 2.0
8330 2000-01-01 00:02:30 9.0
8331 2000-01-01 00:03:00 3.0
8332
8333 Upsample again, providing a ``method``.
8334
8335 >>> df.asfreq(freq='30S', method='bfill')
8336 s
8337 2000-01-01 00:00:00 0.0
8338 2000-01-01 00:00:30 NaN
8339 2000-01-01 00:01:00 NaN
8340 2000-01-01 00:01:30 2.0
8341 2000-01-01 00:02:00 2.0
8342 2000-01-01 00:02:30 3.0
8343 2000-01-01 00:03:00 3.0
8344 """
8345 from pandas.core.resample import asfreq
8346
8347 return asfreq(
8348 self,
8349 freq,
8350 method=method,
8351 how=how,
8352 normalize=normalize,
8353 fill_value=fill_value,
8354 )
8355
8356 @final
8357 def at_time(
8358 self: NDFrameT, time, asof: bool_t = False, axis: Axis | None = None
8359 ) -> NDFrameT:
8360 """
8361 Select values at particular time of day (e.g., 9:30AM).
8362
8363 Parameters
8364 ----------
8365 time : datetime.time or str
8366 The values to select.
8367 axis : {0 or 'index', 1 or 'columns'}, default 0
8368 For `Series` this parameter is unused and defaults to 0.
8369
8370 Returns
8371 -------
8372 Series or DataFrame
8373
8374 Raises
8375 ------
8376 TypeError
8377 If the index is not a :class:`DatetimeIndex`
8378
8379 See Also
8380 --------
8381 between_time : Select values between particular times of the day.
8382 first : Select initial periods of time series based on a date offset.
8383 last : Select final periods of time series based on a date offset.
8384 DatetimeIndex.indexer_at_time : Get just the index locations for
8385 values at particular time of the day.
8386
8387 Examples
8388 --------
8389 >>> i = pd.date_range('2018-04-09', periods=4, freq='12H')
8390 >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
8391 >>> ts
8392 A
8393 2018-04-09 00:00:00 1
8394 2018-04-09 12:00:00 2
8395 2018-04-10 00:00:00 3
8396 2018-04-10 12:00:00 4
8397
8398 >>> ts.at_time('12:00')
8399 A
8400 2018-04-09 12:00:00 2
8401 2018-04-10 12:00:00 4
8402 """
8403 if axis is None:
8404 axis = self._stat_axis_number
8405 axis = self._get_axis_number(axis)
8406
8407 index = self._get_axis(axis)
8408
8409 if not isinstance(index, DatetimeIndex):
8410 raise TypeError("Index must be DatetimeIndex")
8411
8412 indexer = index.indexer_at_time(time, asof=asof)
8413 return self._take_with_is_copy(indexer, axis=axis)
8414
8415 @final
8416 def between_time(
8417 self: NDFrameT,
8418 start_time,
8419 end_time,
8420 inclusive: IntervalClosedType = "both",
8421 axis: Axis | None = None,
8422 ) -> NDFrameT:
8423 """
8424 Select values between particular times of the day (e.g., 9:00-9:30 AM).
8425
8426 By setting ``start_time`` to be later than ``end_time``,
8427 you can get the times that are *not* between the two times.
8428
8429 Parameters
8430 ----------
8431 start_time : datetime.time or str
8432 Initial time as a time filter limit.
8433 end_time : datetime.time or str
8434 End time as a time filter limit.
8435 inclusive : {"both", "neither", "left", "right"}, default "both"
8436 Include boundaries; whether to set each bound as closed or open.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Determine the time range using the index (0) or the column
            values (1). For `Series` this parameter is unused and defaults to 0.
8440
8441 Returns
8442 -------
8443 Series or DataFrame
8444 Data from the original object filtered to the specified dates range.
8445
8446 Raises
8447 ------
8448 TypeError
8449 If the index is not a :class:`DatetimeIndex`
8450
8451 See Also
8452 --------
8453 at_time : Select values at a particular time of the day.
8454 first : Select initial periods of time series based on a date offset.
8455 last : Select final periods of time series based on a date offset.
8456 DatetimeIndex.indexer_between_time : Get just the index locations for
8457 values between particular times of the day.
8458
8459 Examples
8460 --------
8461 >>> i = pd.date_range('2018-04-09', periods=4, freq='1D20min')
8462 >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
8463 >>> ts
8464 A
8465 2018-04-09 00:00:00 1
8466 2018-04-10 00:20:00 2
8467 2018-04-11 00:40:00 3
8468 2018-04-12 01:00:00 4
8469
8470 >>> ts.between_time('0:15', '0:45')
8471 A
8472 2018-04-10 00:20:00 2
8473 2018-04-11 00:40:00 3
8474
8475 You get the times that are *not* between two times by setting
8476 ``start_time`` later than ``end_time``:
8477
8478 >>> ts.between_time('0:45', '0:15')
8479 A
8480 2018-04-09 00:00:00 1
8481 2018-04-12 01:00:00 4
8482 """
8483 if axis is None:
8484 axis = self._stat_axis_number
8485 axis = self._get_axis_number(axis)
8486
8487 index = self._get_axis(axis)
8488 if not isinstance(index, DatetimeIndex):
8489 raise TypeError("Index must be DatetimeIndex")
8490
8491 left_inclusive, right_inclusive = validate_inclusive(inclusive)
8492 indexer = index.indexer_between_time(
8493 start_time,
8494 end_time,
8495 include_start=left_inclusive,
8496 include_end=right_inclusive,
8497 )
8498 return self._take_with_is_copy(indexer, axis=axis)
8499
8500 @doc(**_shared_doc_kwargs)
8501 def resample(
8502 self,
8503 rule,
8504 axis: Axis = 0,
8505 closed: str | None = None,
8506 label: str | None = None,
8507 convention: str = "start",
8508 kind: str | None = None,
8509 on: Level = None,
8510 level: Level = None,
8511 origin: str | TimestampConvertibleTypes = "start_day",
8512 offset: TimedeltaConvertibleTypes | None = None,
8513 group_keys: bool_t = False,
8514 ) -> Resampler:
8515 """
8516 Resample time-series data.
8517
8518 Convenience method for frequency conversion and resampling of time series.
8519 The object must have a datetime-like index (`DatetimeIndex`, `PeriodIndex`,
8520 or `TimedeltaIndex`), or the caller must pass the label of a datetime-like
8521 series/index to the ``on``/``level`` keyword parameter.
8522
8523 Parameters
8524 ----------
8525 rule : DateOffset, Timedelta or str
8526 The offset string or object representing target conversion.
8527 axis : {{0 or 'index', 1 or 'columns'}}, default 0
            Which axis to use for up- or down-sampling. For `Series` this parameter
            is unused and defaults to 0. The index of the chosen axis must be a
            `DatetimeIndex`, `TimedeltaIndex` or `PeriodIndex`.
8531 closed : {{'right', 'left'}}, default None
8532 Which side of bin interval is closed. The default is 'left'
8533 for all frequency offsets except for 'M', 'A', 'Q', 'BM',
8534 'BA', 'BQ', and 'W' which all have a default of 'right'.
8535 label : {{'right', 'left'}}, default None
8536 Which bin edge label to label bucket with. The default is 'left'
8537 for all frequency offsets except for 'M', 'A', 'Q', 'BM',
8538 'BA', 'BQ', and 'W' which all have a default of 'right'.
8539 convention : {{'start', 'end', 's', 'e'}}, default 'start'
8540 For `PeriodIndex` only, controls whether to use the start or
8541 end of `rule`.
8542 kind : {{'timestamp', 'period'}}, optional, default None
8543 Pass 'timestamp' to convert the resulting index to a
8544 `DateTimeIndex` or 'period' to convert it to a `PeriodIndex`.
8545 By default the input representation is retained.
8547 on : str, optional
8548 For a DataFrame, column to use instead of index for resampling.
8549 Column must be datetime-like.
8550 level : str or int, optional
8551 For a MultiIndex, level (name or number) to use for
8552 resampling. `level` must be datetime-like.
8553 origin : Timestamp or str, default 'start_day'
8554 The timestamp on which to adjust the grouping. The timezone of origin
8555 must match the timezone of the index.
8556 If string, must be one of the following:
8557
8558 - 'epoch': `origin` is 1970-01-01
8559 - 'start': `origin` is the first value of the timeseries
8560 - 'start_day': `origin` is the first day at midnight of the timeseries
8561
8562 .. versionadded:: 1.1.0
8563
8564 - 'end': `origin` is the last value of the timeseries
8565 - 'end_day': `origin` is the ceiling midnight of the last day
8566
8567 .. versionadded:: 1.3.0
8568
        offset : Timedelta or str, default None
            An offset timedelta added to the origin.
8571
8572 .. versionadded:: 1.1.0
8573
8574 group_keys : bool, default False
8575 Whether to include the group keys in the result index when using
8576 ``.apply()`` on the resampled object.
8577
8578 .. versionadded:: 1.5.0
8579
8580 Not specifying ``group_keys`` will retain values-dependent behavior
8581 from pandas 1.4 and earlier (see :ref:`pandas 1.5.0 Release notes
8582 <whatsnew_150.enhancements.resample_group_keys>` for examples).
8583
8584 .. versionchanged:: 2.0.0
8585
8586 ``group_keys`` now defaults to ``False``.
8587
8588 Returns
8589 -------
8590 pandas.core.Resampler
8591 :class:`~pandas.core.Resampler` object.
8592
8593 See Also
8594 --------
8595 Series.resample : Resample a Series.
8596 DataFrame.resample : Resample a DataFrame.
8597 groupby : Group {klass} by mapping, function, label, or list of labels.
8598 asfreq : Reindex a {klass} with the given frequency without grouping.
8599
8600 Notes
8601 -----
8602 See the `user guide
8603 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#resampling>`__
8604 for more.
8605
8606 To learn more about the offset strings, please see `this link
8607 <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects>`__.
8608
8609 Examples
8610 --------
8611 Start by creating a series with 9 one minute timestamps.
8612
8613 >>> index = pd.date_range('1/1/2000', periods=9, freq='T')
8614 >>> series = pd.Series(range(9), index=index)
8615 >>> series
8616 2000-01-01 00:00:00 0
8617 2000-01-01 00:01:00 1
8618 2000-01-01 00:02:00 2
8619 2000-01-01 00:03:00 3
8620 2000-01-01 00:04:00 4
8621 2000-01-01 00:05:00 5
8622 2000-01-01 00:06:00 6
8623 2000-01-01 00:07:00 7
8624 2000-01-01 00:08:00 8
8625 Freq: T, dtype: int64
8626
8627 Downsample the series into 3 minute bins and sum the values
8628 of the timestamps falling into a bin.
8629
8630 >>> series.resample('3T').sum()
8631 2000-01-01 00:00:00 3
8632 2000-01-01 00:03:00 12
8633 2000-01-01 00:06:00 21
8634 Freq: 3T, dtype: int64
8635
8636 Downsample the series into 3 minute bins as above, but label each
        bin using the right edge instead of the left. Please note that the
        value in the bucket used as the label is not included in the bucket
        it labels. For example, in the original series the
8640 bucket ``2000-01-01 00:03:00`` contains the value 3, but the summed
8641 value in the resampled bucket with the label ``2000-01-01 00:03:00``
8642 does not include 3 (if it did, the summed value would be 6, not 3).
8643 To include this value close the right side of the bin interval as
8644 illustrated in the example below this one.
8645
8646 >>> series.resample('3T', label='right').sum()
8647 2000-01-01 00:03:00 3
8648 2000-01-01 00:06:00 12
8649 2000-01-01 00:09:00 21
8650 Freq: 3T, dtype: int64
8651
8652 Downsample the series into 3 minute bins as above, but close the right
8653 side of the bin interval.
8654
8655 >>> series.resample('3T', label='right', closed='right').sum()
8656 2000-01-01 00:00:00 0
8657 2000-01-01 00:03:00 6
8658 2000-01-01 00:06:00 15
8659 2000-01-01 00:09:00 15
8660 Freq: 3T, dtype: int64
8661
8662 Upsample the series into 30 second bins.
8663
8664 >>> series.resample('30S').asfreq()[0:5] # Select first 5 rows
8665 2000-01-01 00:00:00 0.0
8666 2000-01-01 00:00:30 NaN
8667 2000-01-01 00:01:00 1.0
8668 2000-01-01 00:01:30 NaN
8669 2000-01-01 00:02:00 2.0
8670 Freq: 30S, dtype: float64
8671
8672 Upsample the series into 30 second bins and fill the ``NaN``
8673 values using the ``ffill`` method.
8674
8675 >>> series.resample('30S').ffill()[0:5]
8676 2000-01-01 00:00:00 0
8677 2000-01-01 00:00:30 0
8678 2000-01-01 00:01:00 1
8679 2000-01-01 00:01:30 1
8680 2000-01-01 00:02:00 2
8681 Freq: 30S, dtype: int64
8682
8683 Upsample the series into 30 second bins and fill the
8684 ``NaN`` values using the ``bfill`` method.
8685
8686 >>> series.resample('30S').bfill()[0:5]
8687 2000-01-01 00:00:00 0
8688 2000-01-01 00:00:30 1
8689 2000-01-01 00:01:00 1
8690 2000-01-01 00:01:30 2
8691 2000-01-01 00:02:00 2
8692 Freq: 30S, dtype: int64
8693
8694 Pass a custom function via ``apply``
8695
8696 >>> def custom_resampler(arraylike):
8697 ... return np.sum(arraylike) + 5
8698 ...
8699 >>> series.resample('3T').apply(custom_resampler)
8700 2000-01-01 00:00:00 8
8701 2000-01-01 00:03:00 17
8702 2000-01-01 00:06:00 26
8703 Freq: 3T, dtype: int64
8704
8705 For a Series with a PeriodIndex, the keyword `convention` can be
8706 used to control whether to use the start or end of `rule`.
8707
8708 Resample a year by quarter using 'start' `convention`. Values are
8709 assigned to the first quarter of the period.
8710
8711 >>> s = pd.Series([1, 2], index=pd.period_range('2012-01-01',
8712 ... freq='A',
8713 ... periods=2))
8714 >>> s
8715 2012 1
8716 2013 2
8717 Freq: A-DEC, dtype: int64
8718 >>> s.resample('Q', convention='start').asfreq()
8719 2012Q1 1.0
8720 2012Q2 NaN
8721 2012Q3 NaN
8722 2012Q4 NaN
8723 2013Q1 2.0
8724 2013Q2 NaN
8725 2013Q3 NaN
8726 2013Q4 NaN
8727 Freq: Q-DEC, dtype: float64
8728
8729 Resample quarters by month using 'end' `convention`. Values are
8730 assigned to the last month of the period.
8731
8732 >>> q = pd.Series([1, 2, 3, 4], index=pd.period_range('2018-01-01',
8733 ... freq='Q',
8734 ... periods=4))
8735 >>> q
8736 2018Q1 1
8737 2018Q2 2
8738 2018Q3 3
8739 2018Q4 4
8740 Freq: Q-DEC, dtype: int64
8741 >>> q.resample('M', convention='end').asfreq()
8742 2018-03 1.0
8743 2018-04 NaN
8744 2018-05 NaN
8745 2018-06 2.0
8746 2018-07 NaN
8747 2018-08 NaN
8748 2018-09 3.0
8749 2018-10 NaN
8750 2018-11 NaN
8751 2018-12 4.0
8752 Freq: M, dtype: float64
8753
8754 For DataFrame objects, the keyword `on` can be used to specify the
8755 column instead of the index for resampling.
8756
8757 >>> d = {{'price': [10, 11, 9, 13, 14, 18, 17, 19],
8758 ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}}
8759 >>> df = pd.DataFrame(d)
8760 >>> df['week_starting'] = pd.date_range('01/01/2018',
8761 ... periods=8,
8762 ... freq='W')
8763 >>> df
8764 price volume week_starting
8765 0 10 50 2018-01-07
8766 1 11 60 2018-01-14
8767 2 9 40 2018-01-21
8768 3 13 100 2018-01-28
8769 4 14 50 2018-02-04
8770 5 18 100 2018-02-11
8771 6 17 40 2018-02-18
8772 7 19 50 2018-02-25
8773 >>> df.resample('M', on='week_starting').mean()
8774 price volume
8775 week_starting
8776 2018-01-31 10.75 62.5
8777 2018-02-28 17.00 60.0
8778
8779 For a DataFrame with MultiIndex, the keyword `level` can be used to
8780 specify on which level the resampling needs to take place.
8781
8782 >>> days = pd.date_range('1/1/2000', periods=4, freq='D')
8783 >>> d2 = {{'price': [10, 11, 9, 13, 14, 18, 17, 19],
8784 ... 'volume': [50, 60, 40, 100, 50, 100, 40, 50]}}
8785 >>> df2 = pd.DataFrame(
8786 ... d2,
8787 ... index=pd.MultiIndex.from_product(
8788 ... [days, ['morning', 'afternoon']]
8789 ... )
8790 ... )
8791 >>> df2
8792 price volume
8793 2000-01-01 morning 10 50
8794 afternoon 11 60
8795 2000-01-02 morning 9 40
8796 afternoon 13 100
8797 2000-01-03 morning 14 50
8798 afternoon 18 100
8799 2000-01-04 morning 17 40
8800 afternoon 19 50
8801 >>> df2.resample('D', level=0).sum()
8802 price volume
8803 2000-01-01 21 110
8804 2000-01-02 22 140
8805 2000-01-03 32 150
8806 2000-01-04 36 90
8807
8808 If you want to adjust the start of the bins based on a fixed timestamp:
8809
8810 >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00'
8811 >>> rng = pd.date_range(start, end, freq='7min')
8812 >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
8813 >>> ts
8814 2000-10-01 23:30:00 0
8815 2000-10-01 23:37:00 3
8816 2000-10-01 23:44:00 6
8817 2000-10-01 23:51:00 9
8818 2000-10-01 23:58:00 12
8819 2000-10-02 00:05:00 15
8820 2000-10-02 00:12:00 18
8821 2000-10-02 00:19:00 21
8822 2000-10-02 00:26:00 24
8823 Freq: 7T, dtype: int64
8824
8825 >>> ts.resample('17min').sum()
8826 2000-10-01 23:14:00 0
8827 2000-10-01 23:31:00 9
8828 2000-10-01 23:48:00 21
8829 2000-10-02 00:05:00 54
8830 2000-10-02 00:22:00 24
8831 Freq: 17T, dtype: int64
8832
8833 >>> ts.resample('17min', origin='epoch').sum()
8834 2000-10-01 23:18:00 0
8835 2000-10-01 23:35:00 18
8836 2000-10-01 23:52:00 27
8837 2000-10-02 00:09:00 39
8838 2000-10-02 00:26:00 24
8839 Freq: 17T, dtype: int64
8840
8841 >>> ts.resample('17min', origin='2000-01-01').sum()
8842 2000-10-01 23:24:00 3
8843 2000-10-01 23:41:00 15
8844 2000-10-01 23:58:00 45
8845 2000-10-02 00:15:00 45
8846 Freq: 17T, dtype: int64
8847
8848 If you want to adjust the start of the bins with an `offset` Timedelta, the two
8849 following lines are equivalent:
8850
8851 >>> ts.resample('17min', origin='start').sum()
8852 2000-10-01 23:30:00 9
8853 2000-10-01 23:47:00 21
8854 2000-10-02 00:04:00 54
8855 2000-10-02 00:21:00 24
8856 Freq: 17T, dtype: int64
8857
8858 >>> ts.resample('17min', offset='23h30min').sum()
8859 2000-10-01 23:30:00 9
8860 2000-10-01 23:47:00 21
8861 2000-10-02 00:04:00 54
8862 2000-10-02 00:21:00 24
8863 Freq: 17T, dtype: int64
8864
8865 If you want to take the largest Timestamp as the end of the bins:
8866
8867 >>> ts.resample('17min', origin='end').sum()
8868 2000-10-01 23:35:00 0
8869 2000-10-01 23:52:00 18
8870 2000-10-02 00:09:00 27
8871 2000-10-02 00:26:00 63
8872 Freq: 17T, dtype: int64
8873
        In contrast with `start_day`, you can use `end_day` to take the ceiling
8875 midnight of the largest Timestamp as the end of the bins and drop the bins
8876 not containing data:
8877
8878 >>> ts.resample('17min', origin='end_day').sum()
8879 2000-10-01 23:38:00 3
8880 2000-10-01 23:55:00 15
8881 2000-10-02 00:12:00 45
8882 2000-10-02 00:29:00 45
8883 Freq: 17T, dtype: int64
8884 """
8885 from pandas.core.resample import get_resampler
8886
8887 axis = self._get_axis_number(axis)
8888 return get_resampler(
8889 cast("Series | DataFrame", self),
8890 freq=rule,
8891 label=label,
8892 closed=closed,
8893 axis=axis,
8894 kind=kind,
8895 convention=convention,
8896 key=on,
8897 level=level,
8898 origin=origin,
8899 offset=offset,
8900 group_keys=group_keys,
8901 )
8902
8903 @final
8904 def first(self: NDFrameT, offset) -> NDFrameT:
8905 """
8906 Select initial periods of time series data based on a date offset.
8907
8908 For a DataFrame with a sorted DatetimeIndex, this function can
8909 select the first few rows based on a date offset.
8910
8911 Parameters
8912 ----------
8913 offset : str, DateOffset or dateutil.relativedelta
            The offset length of the data that will be selected. For instance,
            '1M' will select all the rows having their index within the first month.
8916
8917 Returns
8918 -------
8919 Series or DataFrame
8920 A subset of the caller.
8921
8922 Raises
8923 ------
8924 TypeError
8925 If the index is not a :class:`DatetimeIndex`
8926
8927 See Also
8928 --------
8929 last : Select final periods of time series based on a date offset.
8930 at_time : Select values at a particular time of the day.
8931 between_time : Select values between particular times of the day.
8932
8933 Examples
8934 --------
8935 >>> i = pd.date_range('2018-04-09', periods=4, freq='2D')
8936 >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
8937 >>> ts
8938 A
8939 2018-04-09 1
8940 2018-04-11 2
8941 2018-04-13 3
8942 2018-04-15 4
8943
8944 Get the rows for the first 3 days:
8945
8946 >>> ts.first('3D')
8947 A
8948 2018-04-09 1
8949 2018-04-11 2
8950
        Notice the data for the first 3 calendar days was returned, not the
        first 3 days observed in the dataset, and therefore data for
        2018-04-13 was not returned.
8954 """
8955 if not isinstance(self.index, DatetimeIndex):
8956 raise TypeError("'first' only supports a DatetimeIndex index")
8957
8958 if len(self.index) == 0:
8959 return self.copy(deep=False)
8960
8961 offset = to_offset(offset)
8962 if not isinstance(offset, Tick) and offset.is_on_offset(self.index[0]):
8963 # GH#29623 if first value is end of period, remove offset with n = 1
8964 # before adding the real offset
8965 end_date = end = self.index[0] - offset.base + offset
8966 else:
8967 end_date = end = self.index[0] + offset
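        # e.g. with the daily index from the docstring example and
        # offset '3D', end = 2018-04-09 + 3 days = 2018-04-12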
8968
        # Tick-like, e.g. 3 days ('3D')
8970 if isinstance(offset, Tick) and end_date in self.index:
8971 end = self.index.searchsorted(end_date, side="left")
8972 return self.iloc[:end]
8973
8974 return self.loc[:end]
8975
8976 @final
8977 def last(self: NDFrameT, offset) -> NDFrameT:
8978 """
8979 Select final periods of time series data based on a date offset.
8980
8981 For a DataFrame with a sorted DatetimeIndex, this function
8982 selects the last few rows based on a date offset.
8983
8984 Parameters
8985 ----------
        offset : str, DateOffset or dateutil.relativedelta
            The offset length of the data that will be selected. For instance,
            '3D' will select all the rows having their index within the last 3 days.
8989
8990 Returns
8991 -------
8992 Series or DataFrame
8993 A subset of the caller.
8994
8995 Raises
8996 ------
8997 TypeError
8998 If the index is not a :class:`DatetimeIndex`
8999
9000 See Also
9001 --------
9002 first : Select initial periods of time series based on a date offset.
9003 at_time : Select values at a particular time of the day.
9004 between_time : Select values between particular times of the day.
9005
9006 Examples
9007 --------
9008 >>> i = pd.date_range('2018-04-09', periods=4, freq='2D')
9009 >>> ts = pd.DataFrame({'A': [1, 2, 3, 4]}, index=i)
9010 >>> ts
9011 A
9012 2018-04-09 1
9013 2018-04-11 2
9014 2018-04-13 3
9015 2018-04-15 4
9016
9017 Get the rows for the last 3 days:
9018
9019 >>> ts.last('3D')
9020 A
9021 2018-04-13 3
9022 2018-04-15 4
9023
        Notice the data for the last 3 calendar days was returned, not the
        last 3 observed days in the dataset, and therefore data for
        2018-04-11 was not returned.
9027 """
9028 if not isinstance(self.index, DatetimeIndex):
9029 raise TypeError("'last' only supports a DatetimeIndex index")
9030
9031 if len(self.index) == 0:
9032 return self.copy(deep=False)
9033
9034 offset = to_offset(offset)
9035
9036 start_date = self.index[-1] - offset
9037 start = self.index.searchsorted(start_date, side="right")
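        # side="right" excludes rows whose label equals start_date
        # exactly, keeping only strictly later rows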
9038 return self.iloc[start:]
9039
9040 @final
9041 def rank(
9042 self: NDFrameT,
9043 axis: Axis = 0,
9044 method: str = "average",
9045 numeric_only: bool_t = False,
9046 na_option: str = "keep",
9047 ascending: bool_t = True,
9048 pct: bool_t = False,
9049 ) -> NDFrameT:
9050 """
9051 Compute numerical data ranks (1 through n) along axis.
9052
9053 By default, equal values are assigned a rank that is the average of the
9054 ranks of those values.
9055
9056 Parameters
9057 ----------
9058 axis : {0 or 'index', 1 or 'columns'}, default 0
9059 Index to direct ranking.
9060 For `Series` this parameter is unused and defaults to 0.
9061 method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
9062 How to rank the group of records that have the same value (i.e. ties):
9063
9064 * average: average rank of the group
9065 * min: lowest rank in the group
9066 * max: highest rank in the group
9067 * first: ranks assigned in order they appear in the array
9068 * dense: like 'min', but rank always increases by 1 between groups.
9069
9070 numeric_only : bool, default False
9071 For DataFrame objects, rank only numeric columns if set to True.
9072
9073 .. versionchanged:: 2.0.0
9074 The default value of ``numeric_only`` is now ``False``.
9075
9076 na_option : {'keep', 'top', 'bottom'}, default 'keep'
9077 How to rank NaN values:
9078
9079 * keep: assign NaN rank to NaN values
9080 * top: assign lowest rank to NaN values
9081 * bottom: assign highest rank to NaN values
9082
        ascending : bool, default True
            Whether the elements should be ranked in ascending order.
        pct : bool, default False
            Whether to return the rankings in percentile form
            (i.e. rank divided by the number of ranked values).
9088
9089 Returns
9090 -------
9091 same type as caller
9092 Return a Series or DataFrame with data ranks as values.
9093
9094 See Also
9095 --------
9096 core.groupby.DataFrameGroupBy.rank : Rank of values within each group.
9097 core.groupby.SeriesGroupBy.rank : Rank of values within each group.
9098
9099 Examples
9100 --------
9101 >>> df = pd.DataFrame(data={'Animal': ['cat', 'penguin', 'dog',
9102 ... 'spider', 'snake'],
9103 ... 'Number_legs': [4, 2, 4, 8, np.nan]})
9104 >>> df
9105 Animal Number_legs
9106 0 cat 4.0
9107 1 penguin 2.0
9108 2 dog 4.0
9109 3 spider 8.0
9110 4 snake NaN
9111
9112 Ties are assigned the mean of the ranks (by default) for the group.
9113
9114 >>> s = pd.Series(range(5), index=list("abcde"))
9115 >>> s["d"] = s["b"]
9116 >>> s.rank()
9117 a 1.0
9118 b 2.5
9119 c 4.0
9120 d 2.5
9121 e 5.0
9122 dtype: float64
9123
9124 The following example shows how the method behaves with the above
9125 parameters:
9126
9127 * default_rank: this is the default behaviour obtained without using
9128 any parameter.
        * max_rank: setting ``method = 'max'``, the records that have the
          same values are ranked using the highest rank (e.g. since 'cat'
          and 'dog' are both in the 2nd and 3rd position, rank 3 is assigned).
9132 * NA_bottom: choosing ``na_option = 'bottom'``, if there are records
9133 with NaN values they are placed at the bottom of the ranking.
9134 * pct_rank: when setting ``pct = True``, the ranking is expressed as
9135 percentile rank.
9136
9137 >>> df['default_rank'] = df['Number_legs'].rank()
9138 >>> df['max_rank'] = df['Number_legs'].rank(method='max')
9139 >>> df['NA_bottom'] = df['Number_legs'].rank(na_option='bottom')
9140 >>> df['pct_rank'] = df['Number_legs'].rank(pct=True)
9141 >>> df
9142 Animal Number_legs default_rank max_rank NA_bottom pct_rank
9143 0 cat 4.0 2.5 3.0 2.5 0.625
9144 1 penguin 2.0 1.0 1.0 1.0 0.250
9145 2 dog 4.0 2.5 3.0 2.5 0.625
9146 3 spider 8.0 4.0 4.0 4.0 1.000
9147 4 snake NaN NaN NaN 5.0 NaN
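
        Using ``method='first'`` breaks ties by order of appearance:

        >>> df['Number_legs'].rank(method='first')
        0    2.0
        1    1.0
        2    3.0
        3    4.0
        4    NaN
        Name: Number_legs, dtype: float64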
9148 """
9149 axis_int = self._get_axis_number(axis)
9150
9151 if na_option not in {"keep", "top", "bottom"}:
9152 msg = "na_option must be one of 'keep', 'top', or 'bottom'"
9153 raise ValueError(msg)
9154
9155 def ranker(data):
9156 if data.ndim == 2:
9157 # i.e. DataFrame, we cast to ndarray
9158 values = data.values
9159 else:
9160 # i.e. Series, can dispatch to EA
9161 values = data._values
9162
9163 if isinstance(values, ExtensionArray):
9164 ranks = values._rank(
9165 axis=axis_int,
9166 method=method,
9167 ascending=ascending,
9168 na_option=na_option,
9169 pct=pct,
9170 )
9171 else:
9172 ranks = algos.rank(
9173 values,
9174 axis=axis_int,
9175 method=method,
9176 ascending=ascending,
9177 na_option=na_option,
9178 pct=pct,
9179 )
9180
9181 ranks_obj = self._constructor(ranks, **data._construct_axes_dict())
9182 return ranks_obj.__finalize__(self, method="rank")
9183
9184 if numeric_only:
9185 if self.ndim == 1 and not is_numeric_dtype(self.dtype):
9186 # GH#47500
9187 raise TypeError(
9188 "Series.rank does not allow numeric_only=True with "
9189 "non-numeric dtype."
9190 )
9191 data = self._get_numeric_data()
9192 else:
9193 data = self
9194
9195 return ranker(data)
9196
9197 @doc(_shared_docs["compare"], klass=_shared_doc_kwargs["klass"])
9198 def compare(
9199 self,
9200 other,
9201 align_axis: Axis = 1,
9202 keep_shape: bool_t = False,
9203 keep_equal: bool_t = False,
9204 result_names: Suffixes = ("self", "other"),
9205 ):
9206 if type(self) is not type(other):
9207 cls_self, cls_other = type(self).__name__, type(other).__name__
9208 raise TypeError(
9209 f"can only compare '{cls_self}' (not '{cls_other}') with '{cls_self}'"
9210 )
9211
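        # True where the values differ or exactly one side is NA;
        # positions where both sides are NA count as equal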
9212 mask = ~((self == other) | (self.isna() & other.isna()))
9213 mask.fillna(True, inplace=True)
9214
9215 if not keep_equal:
9216 self = self.where(mask)
9217 other = other.where(mask)
9218
9219 if not keep_shape:
9220 if isinstance(self, ABCDataFrame):
9221 cmask = mask.any()
9222 rmask = mask.any(axis=1)
9223 self = self.loc[rmask, cmask]
9224 other = other.loc[rmask, cmask]
9225 else:
9226 self = self[mask]
9227 other = other[mask]
9228 if not isinstance(result_names, tuple):
9229 raise TypeError(
9230 f"Passing 'result_names' as a {type(result_names)} is not "
9231 "supported. Provide 'result_names' as a tuple instead."
9232 )
9233
9234 if align_axis in (1, "columns"): # This is needed for Series
9235 axis = 1
9236 else:
9237 axis = self._get_axis_number(align_axis)
9238
9239 diff = concat([self, other], axis=axis, keys=result_names)
9240
9241 if axis >= self.ndim:
9242 # No need to reorganize data if stacking on new axis
9243 # This currently applies for stacking two Series on columns
9244 return diff
9245
9246 ax = diff._get_axis(axis)
9247 ax_names = np.array(ax.names)
9248
9249 # set index names to positions to avoid confusion
9250 ax.names = np.arange(len(ax_names))
9251
9252 # bring self-other to inner level
9253 order = list(range(1, ax.nlevels)) + [0]
9254 if isinstance(diff, ABCDataFrame):
9255 diff = diff.reorder_levels(order, axis=axis)
9256 else:
9257 diff = diff.reorder_levels(order)
9258
9259 # restore the index names in order
9260 diff._get_axis(axis=axis).names = ax_names[order]
9261
9262 # reorder axis to keep things organized
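        # e.g. with 2 original labels the concatenated axis has 4
        # entries and the take order is [0, 2, 1, 3], pairing each
        # self entry with its other counterpart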
9263 indices = (
9264 np.arange(diff.shape[axis]).reshape([2, diff.shape[axis] // 2]).T.flatten()
9265 )
9266 diff = diff.take(indices, axis=axis)
9267
9268 return diff
9269
9270 @doc(**_shared_doc_kwargs)
9271 def align(
9272 self: NDFrameT,
9273 other: NDFrameT,
9274 join: AlignJoin = "outer",
9275 axis: Axis | None = None,
9276 level: Level = None,
9277 copy: bool_t | None = None,
9278 fill_value: Hashable = None,
9279 method: FillnaOptions | None = None,
9280 limit: int | None = None,
9281 fill_axis: Axis = 0,
9282 broadcast_axis: Axis | None = None,
9283 ) -> NDFrameT:
9284 """
9285 Align two objects on their axes with the specified join method.
9286
9287 Join method is specified for each axis Index.
9288
9289 Parameters
9290 ----------
9291 other : DataFrame or Series
9292 join : {{'outer', 'inner', 'left', 'right'}}, default 'outer'
9293 axis : allowed axis of the other object, default None
9294 Align on index (0), columns (1), or both (None).
9295 level : int or level name, default None
9296 Broadcast across a level, matching Index values on the
9297 passed MultiIndex level.
9298 copy : bool, default True
9299 Always returns new objects. If copy=False and no reindexing is
9300 required then original objects are returned.
9301 fill_value : scalar, default np.NaN
9302 Value to use for missing values. Defaults to NaN, but can be any
9303 "compatible" value.
9304 method : {{'backfill', 'bfill', 'pad', 'ffill', None}}, default None
9305 Method to use for filling holes in reindexed Series:
9306
9307 - pad / ffill: propagate last valid observation forward to next valid.
9308 - backfill / bfill: use NEXT valid observation to fill gap.
9309
9310 limit : int, default None
9311 If method is specified, this is the maximum number of consecutive
9312 NaN values to forward/backward fill. In other words, if there is
9313 a gap with more than this number of consecutive NaNs, it will only
9314 be partially filled. If method is not specified, this is the
9315 maximum number of entries along the entire axis where NaNs will be
9316 filled. Must be greater than 0 if not None.
        fill_axis : {axes_single_arg}, default 0
            Axis along which to fill missing values when ``method`` is specified.
9319 broadcast_axis : {axes_single_arg}, default None
9320 Broadcast values along this axis, if aligning two objects of
9321 different dimensions.
9322
9323 Returns
9324 -------
9325 tuple of ({klass}, type of other)
9326 Aligned objects.
9327
9328 Examples
9329 --------
9330 >>> df = pd.DataFrame(
9331 ... [[1, 2, 3, 4], [6, 7, 8, 9]], columns=["D", "B", "E", "A"], index=[1, 2]
9332 ... )
9333 >>> other = pd.DataFrame(
9334 ... [[10, 20, 30, 40], [60, 70, 80, 90], [600, 700, 800, 900]],
9335 ... columns=["A", "B", "C", "D"],
9336 ... index=[2, 3, 4],
9337 ... )
9338 >>> df
9339 D B E A
9340 1 1 2 3 4
9341 2 6 7 8 9
9342 >>> other
9343 A B C D
9344 2 10 20 30 40
9345 3 60 70 80 90
9346 4 600 700 800 900
9347
9348 Align on columns:
9349
9350 >>> left, right = df.align(other, join="outer", axis=1)
9351 >>> left
9352 A B C D E
9353 1 4 2 NaN 1 3
9354 2 9 7 NaN 6 8
9355 >>> right
9356 A B C D E
9357 2 10 20 30 40 NaN
9358 3 60 70 80 90 NaN
9359 4 600 700 800 900 NaN
9360
9361 We can also align on the index:
9362
9363 >>> left, right = df.align(other, join="outer", axis=0)
9364 >>> left
9365 D B E A
9366 1 1.0 2.0 3.0 4.0
9367 2 6.0 7.0 8.0 9.0
9368 3 NaN NaN NaN NaN
9369 4 NaN NaN NaN NaN
9370 >>> right
9371 A B C D
9372 1 NaN NaN NaN NaN
9373 2 10.0 20.0 30.0 40.0
9374 3 60.0 70.0 80.0 90.0
9375 4 600.0 700.0 800.0 900.0
9376
9377 Finally, the default `axis=None` will align on both index and columns:
9378
9379 >>> left, right = df.align(other, join="outer", axis=None)
9380 >>> left
9381 A B C D E
9382 1 4.0 2.0 NaN 1.0 3.0
9383 2 9.0 7.0 NaN 6.0 8.0
9384 3 NaN NaN NaN NaN NaN
9385 4 NaN NaN NaN NaN NaN
9386 >>> right
9387 A B C D E
9388 1 NaN NaN NaN NaN NaN
9389 2 10.0 20.0 30.0 40.0 NaN
9390 3 60.0 70.0 80.0 90.0 NaN
9391 4 600.0 700.0 800.0 900.0 NaN
9392 """
9393
9394 method = clean_fill_method(method)
9395
9396 if broadcast_axis == 1 and self.ndim != other.ndim:
9397 if isinstance(self, ABCSeries):
9398 # this means other is a DataFrame, and we need to broadcast
9399 # self
9400 cons = self._constructor_expanddim
9401 df = cons(
9402 {c: self for c in other.columns}, **other._construct_axes_dict()
9403 )
9404 return df._align_frame(
9405 other,
9406 join=join,
9407 axis=axis,
9408 level=level,
9409 copy=copy,
9410 fill_value=fill_value,
9411 method=method,
9412 limit=limit,
9413 fill_axis=fill_axis,
9414 )
9415 elif isinstance(other, ABCSeries):
9416 # this means self is a DataFrame, and we need to broadcast
9417 # other
9418 cons = other._constructor_expanddim
9419 df = cons(
9420 {c: other for c in self.columns}, **self._construct_axes_dict()
9421 )
9422 return self._align_frame(
9423 df,
9424 join=join,
9425 axis=axis,
9426 level=level,
9427 copy=copy,
9428 fill_value=fill_value,
9429 method=method,
9430 limit=limit,
9431 fill_axis=fill_axis,
9432 )
9433
9434 if axis is not None:
9435 axis = self._get_axis_number(axis)
9436 if isinstance(other, ABCDataFrame):
9437 return self._align_frame(
9438 other,
9439 join=join,
9440 axis=axis,
9441 level=level,
9442 copy=copy,
9443 fill_value=fill_value,
9444 method=method,
9445 limit=limit,
9446 fill_axis=fill_axis,
9447 )
9448 elif isinstance(other, ABCSeries):
9449 return self._align_series(
9450 other,
9451 join=join,
9452 axis=axis,
9453 level=level,
9454 copy=copy,
9455 fill_value=fill_value,
9456 method=method,
9457 limit=limit,
9458 fill_axis=fill_axis,
9459 )
9460 else: # pragma: no cover
9461 raise TypeError(f"unsupported type: {type(other)}")
9462
9463 @final
9464 def _align_frame(
9465 self,
9466 other,
9467 join: AlignJoin = "outer",
9468 axis: Axis | None = None,
9469 level=None,
9470 copy: bool_t | None = None,
9471 fill_value=None,
9472 method=None,
9473 limit=None,
9474 fill_axis: Axis = 0,
9475 ):
9476 # defaults
9477 join_index, join_columns = None, None
9478 ilidx, iridx = None, None
9479 clidx, cridx = None, None
9480
9481 is_series = isinstance(self, ABCSeries)
9482
9483 if (axis is None or axis == 0) and not self.index.equals(other.index):
9484 join_index, ilidx, iridx = self.index.join(
9485 other.index, how=join, level=level, return_indexers=True
9486 )
9487
9488 if (
9489 (axis is None or axis == 1)
9490 and not is_series
9491 and not self.columns.equals(other.columns)
9492 ):
9493 join_columns, clidx, cridx = self.columns.join(
9494 other.columns, how=join, level=level, return_indexers=True
9495 )
9496
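        # axes that already match keep a None join index/indexer,
        # which _reindex_with_indexers treats as a no-op for that axis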
9497 if is_series:
9498 reindexers = {0: [join_index, ilidx]}
9499 else:
9500 reindexers = {0: [join_index, ilidx], 1: [join_columns, clidx]}
9501
9502 left = self._reindex_with_indexers(
9503 reindexers, copy=copy, fill_value=fill_value, allow_dups=True
9504 )
9505 # other must be always DataFrame
9506 right = other._reindex_with_indexers(
9507 {0: [join_index, iridx], 1: [join_columns, cridx]},
9508 copy=copy,
9509 fill_value=fill_value,
9510 allow_dups=True,
9511 )
9512
9513 if method is not None:
9514 _left = left.fillna(method=method, axis=fill_axis, limit=limit)
9515 assert _left is not None # needed for mypy
9516 left = _left
9517 right = right.fillna(method=method, axis=fill_axis, limit=limit)
9518
        # if the DatetimeIndexes have different tzs, convert to UTC
9520 left, right = _align_as_utc(left, right, join_index)
9521
9522 return (
9523 left.__finalize__(self),
9524 right.__finalize__(other),
9525 )
9526
9527 @final
9528 def _align_series(
9529 self,
9530 other,
9531 join: AlignJoin = "outer",
9532 axis: Axis | None = None,
9533 level=None,
9534 copy: bool_t | None = None,
9535 fill_value=None,
9536 method=None,
9537 limit=None,
9538 fill_axis: Axis = 0,
9539 ):
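        """
        Align self (a Series or DataFrame) with ``other``, which must be a
        Series: series/series aligns on the index, while frame/series joins
        the axis selected by ``axis`` with the Series' index.
        Returns a tuple of the aligned (left, right) objects.
        """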
9540 is_series = isinstance(self, ABCSeries)
9541 if copy and using_copy_on_write():
9542 copy = False
9543
9544 if (not is_series and axis is None) or axis not in [None, 0, 1]:
9545 raise ValueError("Must specify axis=0 or 1")
9546
9547 if is_series and axis == 1:
9548 raise ValueError("cannot align series to a series other than axis 0")
9549
9550 # series/series compat, other must always be a Series
9551 if not axis:
9552 # equal
9553 if self.index.equals(other.index):
9554 join_index, lidx, ridx = None, None, None
9555 else:
9556 join_index, lidx, ridx = self.index.join(
9557 other.index, how=join, level=level, return_indexers=True
9558 )
9559
9560 if is_series:
9561 left = self._reindex_indexer(join_index, lidx, copy)
9562 elif lidx is None or join_index is None:
9563 left = self.copy(deep=copy)
9564 else:
9565 left = self._constructor(
9566 self._mgr.reindex_indexer(join_index, lidx, axis=1, copy=copy)
9567 )
9568
9569 right = other._reindex_indexer(join_index, ridx, copy)
9570
9571 else:
9572 # one has > 1 ndim
9573 fdata = self._mgr
9574 join_index = self.axes[1]
9575 lidx, ridx = None, None
9576 if not join_index.equals(other.index):
9577 join_index, lidx, ridx = join_index.join(
9578 other.index, how=join, level=level, return_indexers=True
9579 )
9580
9581 if lidx is not None:
9582 bm_axis = self._get_block_manager_axis(1)
9583 fdata = fdata.reindex_indexer(join_index, lidx, axis=bm_axis)
9584
9585 if copy and fdata is self._mgr:
9586 fdata = fdata.copy()
9587
9588 left = self._constructor(fdata)
9589
9590 if ridx is None:
9591 right = other.copy(deep=copy)
9592 else:
9593 right = other.reindex(join_index, level=level)
9594
9595 # fill
9596 fill_na = notna(fill_value) or (method is not None)
9597 if fill_na:
9598 left = left.fillna(fill_value, method=method, limit=limit, axis=fill_axis)
9599 right = right.fillna(fill_value, method=method, limit=limit)
9600
        # if the DatetimeIndexes have different tzs, convert to UTC
        if is_series or axis == 0:
9603 left, right = _align_as_utc(left, right, join_index)
9604
9605 return (
9606 left.__finalize__(self),
9607 right.__finalize__(other),
9608 )
9609
9610 @final
9611 def _where(
9612 self,
9613 cond,
9614 other=lib.no_default,
9615 inplace: bool_t = False,
9616 axis: Axis | None = None,
9617 level=None,
9618 ):
9619 """
9620 Equivalent to public method `where`, except that `other` is not
9621 applied as a function even if callable. Used in __setitem__.
9622 """
9623 inplace = validate_bool_kwarg(inplace, "inplace")
9624
9625 if axis is not None:
9626 axis = self._get_axis_number(axis)
9627
        # align cond to the same shape as self
9629 cond = common.apply_if_callable(cond, self)
9630 if isinstance(cond, NDFrame):
9631 # CoW: Make sure reference is not kept alive
9632 cond = cond.align(self, join="right", broadcast_axis=1, copy=False)[0]
9633 else:
9634 if not hasattr(cond, "shape"):
9635 cond = np.asanyarray(cond)
9636 if cond.shape != self.shape:
9637 raise ValueError("Array conditional must be same shape as self")
9638 cond = self._constructor(cond, **self._construct_axes_dict(), copy=False)
9639
9640 # make sure we are boolean
9641 fill_value = bool(inplace)
9642 cond = cond.fillna(fill_value)
9643
9644 msg = "Boolean array expected for the condition, not {dtype}"
9645
9646 if not cond.empty:
9647 if not isinstance(cond, ABCDataFrame):
9648 # This is a single-dimensional object.
9649 if not is_bool_dtype(cond):
9650 raise ValueError(msg.format(dtype=cond.dtype))
9651 else:
9652 for _dt in cond.dtypes:
9653 if not is_bool_dtype(_dt):
9654 raise ValueError(msg.format(dtype=_dt))
9655 else:
9656 # GH#21947 we have an empty DataFrame/Series, could be object-dtype
9657 cond = cond.astype(bool)
9658
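        # for the inplace path the condition is inverted below: putmask
        # replaces entries where the mask is True, whereas `where` keeps
        # entries where the condition is True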
        cond = ~cond if inplace else cond
9660 cond = cond.reindex(self._info_axis, axis=self._info_axis_number, copy=False)
9661
9662 # try to align with other
9663 if isinstance(other, NDFrame):
9664 # align with me
9665 if other.ndim <= self.ndim:
9666 # CoW: Make sure reference is not kept alive
9667 other = self.align(
9668 other,
9669 join="left",
9670 axis=axis,
9671 level=level,
9672 fill_value=None,
9673 copy=False,
9674 )[1]
9675
9676 # if we are NOT aligned, raise as we cannot where index
9677 if axis is None and not other._indexed_same(self):
9678 raise InvalidIndexError
9679
9680 if other.ndim < self.ndim:
9681 # TODO(EA2D): avoid object-dtype cast in EA case GH#38729
9682 other = other._values
9683 if axis == 0:
9684 other = np.reshape(other, (-1, 1))
9685 elif axis == 1:
9686 other = np.reshape(other, (1, -1))
9687
9688 other = np.broadcast_to(other, self.shape)
9689
9690 # slice me out of the other
9691 else:
9692 raise NotImplementedError(
9693 "cannot align with a higher dimensional NDFrame"
9694 )
9695
9696 elif not isinstance(other, (MultiIndex, NDFrame)):
9697 # mainly just catching Index here
9698 other = extract_array(other, extract_numpy=True)
9699
9700 if isinstance(other, (np.ndarray, ExtensionArray)):
9701 if other.shape != self.shape:
9702 if self.ndim != 1:
9703 # In the ndim == 1 case we may have
9704 # other length 1, which we treat as scalar (GH#2745, GH#4192)
9705 # or len(other) == icond.sum(), which we treat like
9706 # __setitem__ (GH#3235)
9707 raise ValueError(
9708 "other must be the same shape as self when an ndarray"
9709 )
9710
9711 # we are the same shape, so create an actual object for alignment
9712 else:
9713 other = self._constructor(
9714 other, **self._construct_axes_dict(), copy=False
9715 )
9716
9717 if axis is None:
9718 axis = 0
9719
9720 if self.ndim == getattr(other, "ndim", 0):
9721 align = True
9722 else:
9723 align = self._get_axis_number(axis) == 1
9724
9725 if inplace:
9726 # we may have different type blocks come out of putmask, so
9727 # reconstruct the block manager
9728
9729 self._check_inplace_setting(other)
9730 new_data = self._mgr.putmask(mask=cond, new=other, align=align)
9731 result = self._constructor(new_data)
9732 return self._update_inplace(result)
9733
9734 else:
9735 new_data = self._mgr.where(
9736 other=other,
9737 cond=cond,
9738 align=align,
9739 )
9740 result = self._constructor(new_data)
9741 return result.__finalize__(self)
9742
9743 @overload
9744 def where(
9745 self: NDFrameT,
9746 cond,
9747 other=...,
9748 *,
9749 inplace: Literal[False] = ...,
9750 axis: Axis | None = ...,
9751 level: Level = ...,
9752 ) -> NDFrameT:
9753 ...
9754
9755 @overload
9756 def where(
9757 self,
9758 cond,
9759 other=...,
9760 *,
9761 inplace: Literal[True],
9762 axis: Axis | None = ...,
9763 level: Level = ...,
9764 ) -> None:
9765 ...
9766
9767 @overload
9768 def where(
9769 self: NDFrameT,
9770 cond,
9771 other=...,
9772 *,
9773 inplace: bool_t = ...,
9774 axis: Axis | None = ...,
9775 level: Level = ...,
9776 ) -> NDFrameT | None:
9777 ...
9778
9779 @doc(
9780 klass=_shared_doc_kwargs["klass"],
9781 cond="True",
9782 cond_rev="False",
9783 name="where",
9784 name_other="mask",
9785 )
9786 def where(
9787 self: NDFrameT,
9788 cond,
9789 other=np.nan,
9790 *,
9791 inplace: bool_t = False,
9792 axis: Axis | None = None,
9793 level: Level = None,
9794 ) -> NDFrameT | None:
9795 """
9796 Replace values where the condition is {cond_rev}.
9797
9798 Parameters
9799 ----------
9800 cond : bool {klass}, array-like, or callable
9801 Where `cond` is {cond}, keep the original value. Where
9802 {cond_rev}, replace with corresponding value from `other`.
9803 If `cond` is callable, it is computed on the {klass} and
9804 should return boolean {klass} or array. The callable must
9805 not change input {klass} (though pandas doesn't check it).
9806 other : scalar, {klass}, or callable
9807 Entries where `cond` is {cond_rev} are replaced with
9808 corresponding value from `other`.
9809 If other is callable, it is computed on the {klass} and
9810 should return scalar or {klass}. The callable must not
9811 change input {klass} (though pandas doesn't check it).
9812 If not specified, entries will be filled with the corresponding
9813 NULL value (``np.nan`` for numpy dtypes, ``pd.NA`` for extension
9814 dtypes).
9815 inplace : bool, default False
9816 Whether to perform the operation in place on the data.
9817 axis : int, default None
9818 Alignment axis if needed. For `Series` this parameter is
9819 unused and defaults to 0.
9820 level : int, default None
9821 Alignment level if needed.
9822
9823 Returns
9824 -------
9825 Same type as caller or None if ``inplace=True``.
9826
9827 See Also
9828 --------
9829 :func:`DataFrame.{name_other}` : Return an object of same shape as
9830 self.
9831
9832 Notes
9833 -----
9834 The {name} method is an application of the if-then idiom. For each
9835 element in the calling DataFrame, if ``cond`` is ``{cond}`` the
9836 element is used; otherwise the corresponding element from the DataFrame
9837 ``other`` is used. If the axis of ``other`` does not align with axis of
9838 ``cond`` {klass}, the misaligned index positions will be filled with
9839 {cond_rev}.
9840
9841 The signature for :func:`DataFrame.where` differs from
9842 :func:`numpy.where`. Roughly ``df1.where(m, df2)`` is equivalent to
9843 ``np.where(m, df1, df2)``.
9844
9845 For further details and examples see the ``{name}`` documentation in
9846 :ref:`indexing <indexing.where_mask>`.
9847
        The dtype of the object takes precedence. The fill value is cast to
        the object's dtype, if this can be done losslessly.
9850
9851 Examples
9852 --------
9853 >>> s = pd.Series(range(5))
9854 >>> s.where(s > 0)
9855 0 NaN
9856 1 1.0
9857 2 2.0
9858 3 3.0
9859 4 4.0
9860 dtype: float64
9861 >>> s.mask(s > 0)
9862 0 0.0
9863 1 NaN
9864 2 NaN
9865 3 NaN
9866 4 NaN
9867 dtype: float64
9868
9869 >>> s = pd.Series(range(5))
9870 >>> t = pd.Series([True, False])
9871 >>> s.where(t, 99)
9872 0 0
9873 1 99
9874 2 99
9875 3 99
9876 4 99
9877 dtype: int64
9878 >>> s.mask(t, 99)
9879 0 99
9880 1 1
9881 2 99
9882 3 99
9883 4 99
9884 dtype: int64
9885
9886 >>> s.where(s > 1, 10)
9887 0 10
9888 1 10
9889 2 2
9890 3 3
9891 4 4
9892 dtype: int64
9893 >>> s.mask(s > 1, 10)
9894 0 0
9895 1 1
9896 2 10
9897 3 10
9898 4 10
9899 dtype: int64
9900
9901 >>> df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
9902 >>> df
9903 A B
9904 0 0 1
9905 1 2 3
9906 2 4 5
9907 3 6 7
9908 4 8 9
9909 >>> m = df % 3 == 0
9910 >>> df.where(m, -df)
9911 A B
9912 0 0 -1
9913 1 -2 3
9914 2 -4 -5
9915 3 6 -7
9916 4 -8 9
9917 >>> df.where(m, -df) == np.where(m, df, -df)
9918 A B
9919 0 True True
9920 1 True True
9921 2 True True
9922 3 True True
9923 4 True True
9924 >>> df.where(m, -df) == df.mask(~m, -df)
9925 A B
9926 0 True True
9927 1 True True
9928 2 True True
9929 3 True True
9930 4 True True
9931 """
9932 other = common.apply_if_callable(other, self)
9933 return self._where(cond, other, inplace, axis, level)
9934
9935 @overload
9936 def mask(
9937 self: NDFrameT,
9938 cond,
9939 other=...,
9940 *,
9941 inplace: Literal[False] = ...,
9942 axis: Axis | None = ...,
9943 level: Level = ...,
9944 ) -> NDFrameT:
9945 ...
9946
9947 @overload
9948 def mask(
9949 self,
9950 cond,
9951 other=...,
9952 *,
9953 inplace: Literal[True],
9954 axis: Axis | None = ...,
9955 level: Level = ...,
9956 ) -> None:
9957 ...
9958
9959 @overload
9960 def mask(
9961 self: NDFrameT,
9962 cond,
9963 other=...,
9964 *,
9965 inplace: bool_t = ...,
9966 axis: Axis | None = ...,
9967 level: Level = ...,
9968 ) -> NDFrameT | None:
9969 ...
9970
9971 @doc(
9972 where,
9973 klass=_shared_doc_kwargs["klass"],
9974 cond="False",
9975 cond_rev="True",
9976 name="mask",
9977 name_other="where",
9978 )
9979 def mask(
9980 self: NDFrameT,
9981 cond,
9982 other=lib.no_default,
9983 *,
9984 inplace: bool_t = False,
9985 axis: Axis | None = None,
9986 level: Level = None,
9987 ) -> NDFrameT | None:
9988 inplace = validate_bool_kwarg(inplace, "inplace")
9989 cond = common.apply_if_callable(cond, self)
9990
9991 # see gh-21891
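        # list-likes such as plain Python lists have no ``__invert__``, so
        # coerce to an ndarray before applying ``~`` below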
9992 if not hasattr(cond, "__invert__"):
9993 cond = np.array(cond)
9994
9995 return self.where(
9996 ~cond,
9997 other=other,
9998 inplace=inplace,
9999 axis=axis,
10000 level=level,
10001 )
10002
10003 @doc(klass=_shared_doc_kwargs["klass"])
10004 def shift(
10005 self: NDFrameT,
10006 periods: int = 1,
10007 freq=None,
10008 axis: Axis = 0,
10009 fill_value: Hashable = None,
10010 ) -> NDFrameT:
10011 """
10012 Shift index by desired number of periods with an optional time `freq`.
10013
10014 When `freq` is not passed, shift the index without realigning the data.
10015 If `freq` is passed (in this case, the index must be date or datetime,
10016 or it will raise a `NotImplementedError`), the index will be
        increased using the periods and the `freq`. `freq` can be inferred
        when specified as "infer", as long as either the freq or inferred_freq
        attribute is set on the index.
10020
10021 Parameters
10022 ----------
10023 periods : int
10024 Number of periods to shift. Can be positive or negative.
10025 freq : DateOffset, tseries.offsets, timedelta, or str, optional
10026 Offset to use from the tseries module or time rule (e.g. 'EOM').
10027 If `freq` is specified then the index values are shifted but the
10028 data is not realigned. That is, use `freq` if you would like to
10029 extend the index when shifting and preserve the original data.
            If `freq` is specified as "infer" then it will be inferred from
            the freq or inferred_freq attributes of the index. If neither of
            those attributes exists, a ValueError is raised.
        axis : {{0 or 'index', 1 or 'columns', None}}, default 0
10034 Shift direction. For `Series` this parameter is unused and defaults to 0.
10035 fill_value : object, optional
10036 The scalar value to use for newly introduced missing values.
            The default depends on the dtype of `self`.
10038 For numeric data, ``np.nan`` is used.
10039 For datetime, timedelta, or period data, etc. :attr:`NaT` is used.
10040 For extension dtypes, ``self.dtype.na_value`` is used.
10041
10042 .. versionchanged:: 1.1.0
10043
10044 Returns
10045 -------
10046 {klass}
10047 Copy of input object, shifted.
10048
10049 See Also
10050 --------
10051 Index.shift : Shift values of Index.
10052 DatetimeIndex.shift : Shift values of DatetimeIndex.
10053 PeriodIndex.shift : Shift values of PeriodIndex.
10054
10055 Examples
10056 --------
10057 >>> df = pd.DataFrame({{"Col1": [10, 20, 15, 30, 45],
10058 ... "Col2": [13, 23, 18, 33, 48],
10059 ... "Col3": [17, 27, 22, 37, 52]}},
10060 ... index=pd.date_range("2020-01-01", "2020-01-05"))
10061 >>> df
10062 Col1 Col2 Col3
10063 2020-01-01 10 13 17
10064 2020-01-02 20 23 27
10065 2020-01-03 15 18 22
10066 2020-01-04 30 33 37
10067 2020-01-05 45 48 52
10068
10069 >>> df.shift(periods=3)
10070 Col1 Col2 Col3
10071 2020-01-01 NaN NaN NaN
10072 2020-01-02 NaN NaN NaN
10073 2020-01-03 NaN NaN NaN
10074 2020-01-04 10.0 13.0 17.0
10075 2020-01-05 20.0 23.0 27.0
10076
10077 >>> df.shift(periods=1, axis="columns")
10078 Col1 Col2 Col3
10079 2020-01-01 NaN 10 13
10080 2020-01-02 NaN 20 23
10081 2020-01-03 NaN 15 18
10082 2020-01-04 NaN 30 33
10083 2020-01-05 NaN 45 48
10084
10085 >>> df.shift(periods=3, fill_value=0)
10086 Col1 Col2 Col3
10087 2020-01-01 0 0 0
10088 2020-01-02 0 0 0
10089 2020-01-03 0 0 0
10090 2020-01-04 10 13 17
10091 2020-01-05 20 23 27
10092
10093 >>> df.shift(periods=3, freq="D")
10094 Col1 Col2 Col3
10095 2020-01-04 10 13 17
10096 2020-01-05 20 23 27
10097 2020-01-06 15 18 22
10098 2020-01-07 30 33 37
10099 2020-01-08 45 48 52
10100
10101 >>> df.shift(periods=3, freq="infer")
10102 Col1 Col2 Col3
10103 2020-01-04 10 13 17
10104 2020-01-05 20 23 27
10105 2020-01-06 15 18 22
10106 2020-01-07 30 33 37
10107 2020-01-08 45 48 52
10108 """
10109 if periods == 0:
10110 return self.copy(deep=None)
10111
10112 if freq is None:
10113 # when freq is None, data is shifted, index is not
10114 axis = self._get_axis_number(axis)
10115 new_data = self._mgr.shift(
10116 periods=periods, axis=axis, fill_value=fill_value
10117 )
10118 return self._constructor(new_data).__finalize__(self, method="shift")
10119
10120 # when freq is given, index is shifted, data is not
10121 index = self._get_axis(axis)
10122
10123 if freq == "infer":
10124 freq = getattr(index, "freq", None)
10125
10126 if freq is None:
10127 freq = getattr(index, "inferred_freq", None)
10128
10129 if freq is None:
10130 msg = "Freq was not set in the index hence cannot be inferred"
10131 raise ValueError(msg)
10132
10133 elif isinstance(freq, str):
10134 freq = to_offset(freq)
10135
10136 if isinstance(index, PeriodIndex):
10137 orig_freq = to_offset(index.freq)
10138 if freq != orig_freq:
10139 assert orig_freq is not None # for mypy
10140 raise ValueError(
10141 f"Given freq {freq.rule_code} does not match "
10142 f"PeriodIndex freq {orig_freq.rule_code}"
10143 )
10144 new_ax = index.shift(periods)
10145 else:
10146 new_ax = index.shift(periods, freq)
10147
10148 result = self.set_axis(new_ax, axis=axis)
10149 return result.__finalize__(self, method="shift")
10150
10151 def truncate(
10152 self: NDFrameT,
10153 before=None,
10154 after=None,
10155 axis: Axis | None = None,
10156 copy: bool_t | None = None,
10157 ) -> NDFrameT:
10158 """
10159 Truncate a Series or DataFrame before and after some index value.
10160
10161 This is a useful shorthand for boolean indexing based on index
10162 values above or below certain thresholds.
10163
10164 Parameters
10165 ----------
10166 before : date, str, int
10167 Truncate all rows before this index value.
10168 after : date, str, int
10169 Truncate all rows after this index value.
10170 axis : {0 or 'index', 1 or 'columns'}, optional
10171 Axis to truncate. Truncates the index (rows) by default.
10172 For `Series` this parameter is unused and defaults to 0.
        copy : bool, default True
10174 Return a copy of the truncated section.
10175
10176 Returns
10177 -------
10178 type of caller
10179 The truncated Series or DataFrame.
10180
10181 See Also
10182 --------
10183 DataFrame.loc : Select a subset of a DataFrame by label.
10184 DataFrame.iloc : Select a subset of a DataFrame by position.
10185
10186 Notes
10187 -----
10188 If the index being truncated contains only datetime values,
10189 `before` and `after` may be specified as strings instead of
10190 Timestamps.
10191
10192 Examples
10193 --------
10194 >>> df = pd.DataFrame({'A': ['a', 'b', 'c', 'd', 'e'],
10195 ... 'B': ['f', 'g', 'h', 'i', 'j'],
10196 ... 'C': ['k', 'l', 'm', 'n', 'o']},
10197 ... index=[1, 2, 3, 4, 5])
10198 >>> df
10199 A B C
10200 1 a f k
10201 2 b g l
10202 3 c h m
10203 4 d i n
10204 5 e j o
10205
10206 >>> df.truncate(before=2, after=4)
10207 A B C
10208 2 b g l
10209 3 c h m
10210 4 d i n
10211
10212 The columns of a DataFrame can be truncated.
10213
10214 >>> df.truncate(before="A", after="B", axis="columns")
10215 A B
10216 1 a f
10217 2 b g
10218 3 c h
10219 4 d i
10220 5 e j
10221
10222 For Series, only rows can be truncated.
10223
10224 >>> df['A'].truncate(before=2, after=4)
10225 2 b
10226 3 c
10227 4 d
10228 Name: A, dtype: object
10229
10230 The index values in ``truncate`` can be datetimes or string
10231 dates.
10232
10233 >>> dates = pd.date_range('2016-01-01', '2016-02-01', freq='s')
10234 >>> df = pd.DataFrame(index=dates, data={'A': 1})
10235 >>> df.tail()
10236 A
10237 2016-01-31 23:59:56 1
10238 2016-01-31 23:59:57 1
10239 2016-01-31 23:59:58 1
10240 2016-01-31 23:59:59 1
10241 2016-02-01 00:00:00 1
10242
10243 >>> df.truncate(before=pd.Timestamp('2016-01-05'),
10244 ... after=pd.Timestamp('2016-01-10')).tail()
10245 A
10246 2016-01-09 23:59:56 1
10247 2016-01-09 23:59:57 1
10248 2016-01-09 23:59:58 1
10249 2016-01-09 23:59:59 1
10250 2016-01-10 00:00:00 1
10251
10252 Because the index is a DatetimeIndex containing only dates, we can
10253 specify `before` and `after` as strings. They will be coerced to
10254 Timestamps before truncation.
10255
10256 >>> df.truncate('2016-01-05', '2016-01-10').tail()
10257 A
10258 2016-01-09 23:59:56 1
10259 2016-01-09 23:59:57 1
10260 2016-01-09 23:59:58 1
10261 2016-01-09 23:59:59 1
10262 2016-01-10 00:00:00 1
10263
10264 Note that ``truncate`` assumes a 0 value for any unspecified time
10265 component (midnight). This differs from partial string slicing, which
10266 returns any partially matching dates.
10267
10268 >>> df.loc['2016-01-05':'2016-01-10', :].tail()
10269 A
10270 2016-01-10 23:59:55 1
10271 2016-01-10 23:59:56 1
10272 2016-01-10 23:59:57 1
10273 2016-01-10 23:59:58 1
10274 2016-01-10 23:59:59 1
10275 """
10276 if axis is None:
10277 axis = self._stat_axis_number
10278 axis = self._get_axis_number(axis)
10279 ax = self._get_axis(axis)
10280
10281 # GH 17935
10282 # Check that index is sorted
10283 if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing:
10284 raise ValueError("truncate requires a sorted index")
10285
10286 # if we have a date index, convert to dates, otherwise
10287 # treat like a slice
10288 if ax._is_all_dates:
10289 from pandas.core.tools.datetimes import to_datetime
10290
10291 before = to_datetime(before)
10292 after = to_datetime(after)
10293
10294 if before is not None and after is not None and before > after:
10295 raise ValueError(f"Truncate: {after} must be after {before}")
10296
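        # for a monotonically decreasing index, swap the bounds so that the
        # label-based slice below still selects the values between them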
10297 if len(ax) > 1 and ax.is_monotonic_decreasing and ax.nunique() > 1:
10298 before, after = after, before
10299
10300 slicer = [slice(None, None)] * self._AXIS_LEN
10301 slicer[axis] = slice(before, after)
10302 result = self.loc[tuple(slicer)]
10303
10304 if isinstance(ax, MultiIndex):
10305 setattr(result, self._get_axis_name(axis), ax.truncate(before, after))
10306
10307 result = result.copy(deep=copy and not using_copy_on_write())
10308
10309 return result
10310
10311 @final
10312 @doc(klass=_shared_doc_kwargs["klass"])
10313 def tz_convert(
10314 self: NDFrameT, tz, axis: Axis = 0, level=None, copy: bool_t | None = None
10315 ) -> NDFrameT:
10316 """
10317 Convert tz-aware axis to target time zone.
10318
10319 Parameters
10320 ----------
10321 tz : str or tzinfo object or None
10322 Target time zone. Passing ``None`` will convert to
10323 UTC and remove the timezone information.
10324 axis : {{0 or 'index', 1 or 'columns'}}, default 0
            The axis to convert.
10326 level : int, str, default None
10327 If axis is a MultiIndex, convert a specific level. Otherwise
10328 must be None.
10329 copy : bool, default True
10330 Also make a copy of the underlying data.
10331
10332 Returns
10333 -------
10334 {klass}
10335 Object with time zone converted axis.
10336
10337 Raises
10338 ------
10339 TypeError
10340 If the axis is tz-naive.
10341
10342 Examples
10343 --------
10344 Change to another time zone:
10345
10346 >>> s = pd.Series(
10347 ... [1],
10348 ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00']),
10349 ... )
10350 >>> s.tz_convert('Asia/Shanghai')
10351 2018-09-15 07:30:00+08:00 1
10352 dtype: int64
10353
10354 Pass None to convert to UTC and get a tz-naive index:
10355
10356 >>> s = pd.Series([1],
10357 ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00']))
10358 >>> s.tz_convert(None)
10359 2018-09-14 23:30:00 1
10360 dtype: int64
10361 """
10362 axis = self._get_axis_number(axis)
10363 ax = self._get_axis(axis)
10364
10365 def _tz_convert(ax, tz):
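            # convert a single axis; an empty axis without tz support is
            # replaced by an empty tz-aware DatetimeIndex instead of raising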
10366 if not hasattr(ax, "tz_convert"):
10367 if len(ax) > 0:
10368 ax_name = self._get_axis_name(axis)
10369 raise TypeError(
10370 f"{ax_name} is not a valid DatetimeIndex or PeriodIndex"
10371 )
10372 ax = DatetimeIndex([], tz=tz)
10373 else:
10374 ax = ax.tz_convert(tz)
10375 return ax
10376
10377 # if a level is given it must be a MultiIndex level or
10378 # equivalent to the axis name
10379 if isinstance(ax, MultiIndex):
10380 level = ax._get_level_number(level)
10381 new_level = _tz_convert(ax.levels[level], tz)
10382 ax = ax.set_levels(new_level, level=level)
10383 else:
10384 if level not in (None, 0, ax.name):
10385 raise ValueError(f"The level {level} is not valid")
10386 ax = _tz_convert(ax, tz)
10387
10388 result = self.copy(deep=copy and not using_copy_on_write())
10389 result = result.set_axis(ax, axis=axis, copy=False)
10390 return result.__finalize__(self, method="tz_convert")
10391
10392 @final
10393 @doc(klass=_shared_doc_kwargs["klass"])
10394 def tz_localize(
10395 self: NDFrameT,
10396 tz,
10397 axis: Axis = 0,
10398 level=None,
10399 copy: bool_t | None = None,
10400 ambiguous: TimeAmbiguous = "raise",
10401 nonexistent: TimeNonexistent = "raise",
10402 ) -> NDFrameT:
10403 """
10404 Localize tz-naive index of a Series or DataFrame to target time zone.
10405
10406 This operation localizes the Index. To localize the values in a
10407 timezone-naive Series, use :meth:`Series.dt.tz_localize`.
10408
10409 Parameters
10410 ----------
10411 tz : str or tzinfo or None
10412 Time zone to localize. Passing ``None`` will remove the
10413 time zone information and preserve local time.
10414 axis : {{0 or 'index', 1 or 'columns'}}, default 0
            The axis to localize.
10416 level : int, str, default None
            If axis is a MultiIndex, localize a specific level. Otherwise
10418 must be None.
10419 copy : bool, default True
10420 Also make a copy of the underlying data.
10421 ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise'
10422 When clocks moved backward due to DST, ambiguous times may arise.
10423 For example in Central European Time (UTC+01), when going from
10424 03:00 DST to 02:00 non-DST, 02:30:00 local time occurs both at
10425 00:30:00 UTC and at 01:30:00 UTC. In such a situation, the
10426 `ambiguous` parameter dictates how ambiguous times should be
10427 handled.
10428
10429 - 'infer' will attempt to infer fall dst-transition hours based on
10430 order
10431 - bool-ndarray where True signifies a DST time, False designates
10432 a non-DST time (note that this flag is only applicable for
10433 ambiguous times)
10434 - 'NaT' will return NaT where there are ambiguous times
10435 - 'raise' will raise an AmbiguousTimeError if there are ambiguous
10436 times.
10437 nonexistent : str, default 'raise'
10438 A nonexistent time does not exist in a particular timezone
10439 where clocks moved forward due to DST. Valid values are:
10440
10441 - 'shift_forward' will shift the nonexistent time forward to the
10442 closest existing time
10443 - 'shift_backward' will shift the nonexistent time backward to the
10444 closest existing time
10445 - 'NaT' will return NaT where there are nonexistent times
10446 - timedelta objects will shift nonexistent times by the timedelta
            - 'raise' will raise a NonExistentTimeError if there are
10448 nonexistent times.
10449
10450 Returns
10451 -------
10452 {klass}
10453 Same type as the input.
10454
10455 Raises
10456 ------
10457 TypeError
10458 If the TimeSeries is tz-aware and tz is not None.
10459
10460 Examples
10461 --------
10462 Localize local times:
10463
10464 >>> s = pd.Series(
10465 ... [1],
10466 ... index=pd.DatetimeIndex(['2018-09-15 01:30:00']),
10467 ... )
10468 >>> s.tz_localize('CET')
10469 2018-09-15 01:30:00+02:00 1
10470 dtype: int64
10471
10472 Pass None to convert to tz-naive index and preserve local time:
10473
10474 >>> s = pd.Series([1],
10475 ... index=pd.DatetimeIndex(['2018-09-15 01:30:00+02:00']))
10476 >>> s.tz_localize(None)
10477 2018-09-15 01:30:00 1
10478 dtype: int64
10479
10480 Be careful with DST changes. When there is sequential data, pandas
10481 can infer the DST time:
10482
10483 >>> s = pd.Series(range(7),
10484 ... index=pd.DatetimeIndex(['2018-10-28 01:30:00',
10485 ... '2018-10-28 02:00:00',
10486 ... '2018-10-28 02:30:00',
10487 ... '2018-10-28 02:00:00',
10488 ... '2018-10-28 02:30:00',
10489 ... '2018-10-28 03:00:00',
10490 ... '2018-10-28 03:30:00']))
10491 >>> s.tz_localize('CET', ambiguous='infer')
10492 2018-10-28 01:30:00+02:00 0
10493 2018-10-28 02:00:00+02:00 1
10494 2018-10-28 02:30:00+02:00 2
10495 2018-10-28 02:00:00+01:00 3
10496 2018-10-28 02:30:00+01:00 4
10497 2018-10-28 03:00:00+01:00 5
10498 2018-10-28 03:30:00+01:00 6
10499 dtype: int64
10500
10501 In some cases, inferring the DST is impossible. In such cases, you can
        pass an ndarray to the ambiguous parameter to set the DST explicitly.
10503
10504 >>> s = pd.Series(range(3),
10505 ... index=pd.DatetimeIndex(['2018-10-28 01:20:00',
10506 ... '2018-10-28 02:36:00',
10507 ... '2018-10-28 03:46:00']))
10508 >>> s.tz_localize('CET', ambiguous=np.array([True, True, False]))
10509 2018-10-28 01:20:00+02:00 0
10510 2018-10-28 02:36:00+02:00 1
10511 2018-10-28 03:46:00+01:00 2
10512 dtype: int64
10513
10514 If the DST transition causes nonexistent times, you can shift these
10515 dates forward or backward with a timedelta object or `'shift_forward'`
10516 or `'shift_backward'`.
10517
10518 >>> s = pd.Series(range(2),
10519 ... index=pd.DatetimeIndex(['2015-03-29 02:30:00',
10520 ... '2015-03-29 03:30:00']))
10521 >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_forward')
10522 2015-03-29 03:00:00+02:00 0
10523 2015-03-29 03:30:00+02:00 1
10524 dtype: int64
10525 >>> s.tz_localize('Europe/Warsaw', nonexistent='shift_backward')
10526 2015-03-29 01:59:59.999999999+01:00 0
10527 2015-03-29 03:30:00+02:00 1
10528 dtype: int64
10529 >>> s.tz_localize('Europe/Warsaw', nonexistent=pd.Timedelta('1H'))
10530 2015-03-29 03:30:00+02:00 0
10531 2015-03-29 03:30:00+02:00 1
10532 dtype: int64
10533 """
10534 nonexistent_options = ("raise", "NaT", "shift_forward", "shift_backward")
10535 if nonexistent not in nonexistent_options and not isinstance(
10536 nonexistent, dt.timedelta
10537 ):
10538 raise ValueError(
10539 "The nonexistent argument must be one of 'raise', "
10540 "'NaT', 'shift_forward', 'shift_backward' or "
10541 "a timedelta object"
10542 )
10543
10544 axis = self._get_axis_number(axis)
10545 ax = self._get_axis(axis)
10546
10547 def _tz_localize(ax, tz, ambiguous, nonexistent):
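            # localize a single axis; an empty axis without tz support is
            # replaced by an empty tz-aware DatetimeIndex instead of raising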
10548 if not hasattr(ax, "tz_localize"):
10549 if len(ax) > 0:
10550 ax_name = self._get_axis_name(axis)
10551 raise TypeError(
10552 f"{ax_name} is not a valid DatetimeIndex or PeriodIndex"
10553 )
10554 ax = DatetimeIndex([], tz=tz)
10555 else:
10556 ax = ax.tz_localize(tz, ambiguous=ambiguous, nonexistent=nonexistent)
10557 return ax
10558
10559 # if a level is given it must be a MultiIndex level or
10560 # equivalent to the axis name
10561 if isinstance(ax, MultiIndex):
10562 level = ax._get_level_number(level)
10563 new_level = _tz_localize(ax.levels[level], tz, ambiguous, nonexistent)
10564 ax = ax.set_levels(new_level, level=level)
10565 else:
10566 if level not in (None, 0, ax.name):
10567 raise ValueError(f"The level {level} is not valid")
10568 ax = _tz_localize(ax, tz, ambiguous, nonexistent)
10569
10570 result = self.copy(deep=copy and not using_copy_on_write())
10571 result = result.set_axis(ax, axis=axis, copy=False)
10572 return result.__finalize__(self, method="tz_localize")
10573
10574 # ----------------------------------------------------------------------
10575 # Numeric Methods
10576
10577 @final
10578 def describe(
10579 self: NDFrameT,
10580 percentiles=None,
10581 include=None,
10582 exclude=None,
10583 ) -> NDFrameT:
10584 """
10585 Generate descriptive statistics.
10586
10587 Descriptive statistics include those that summarize the central
10588 tendency, dispersion and shape of a
10589 dataset's distribution, excluding ``NaN`` values.
10590
10591 Analyzes both numeric and object series, as well
10592 as ``DataFrame`` column sets of mixed data types. The output
10593 will vary depending on what is provided. Refer to the notes
10594 below for more detail.
10595
10596 Parameters
10597 ----------
10598 percentiles : list-like of numbers, optional
10599 The percentiles to include in the output. All should
10600 fall between 0 and 1. The default is
10601 ``[.25, .5, .75]``, which returns the 25th, 50th, and
10602 75th percentiles.
10603 include : 'all', list-like of dtypes or None (default), optional
            A whitelist of data types to include in the result. Ignored
10605 for ``Series``. Here are the options:
10606
10607 - 'all' : All columns of the input will be included in the output.
10608 - A list-like of dtypes : Limits the results to the
10609 provided data types.
10610 To limit the result to numeric types submit
10611 ``numpy.number``. To limit it instead to object columns submit
              the ``object`` data type. Strings
10613 can also be used in the style of
10614 ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
10615 select pandas categorical columns, use ``'category'``
10616 - None (default) : The result will include all numeric columns.
        exclude : list-like of dtypes or None (default), optional
            A blacklist of data types to omit from the result. Ignored
10619 for ``Series``. Here are the options:
10620
10621 - A list-like of dtypes : Excludes the provided data types
10622 from the result. To exclude numeric types submit
10623 ``numpy.number``. To exclude object columns submit the data
              type ``object``. Strings can also be used in the style of
10625 ``select_dtypes`` (e.g. ``df.describe(exclude=['O'])``). To
10626 exclude pandas categorical columns, use ``'category'``
10627 - None (default) : The result will exclude nothing.
10628
10629 Returns
10630 -------
10631 Series or DataFrame
10632 Summary statistics of the Series or Dataframe provided.
10633
10634 See Also
10635 --------
10636 DataFrame.count: Count number of non-NA/null observations.
10637 DataFrame.max: Maximum of the values in the object.
10638 DataFrame.min: Minimum of the values in the object.
10639 DataFrame.mean: Mean of the values.
10640 DataFrame.std: Standard deviation of the observations.
10641 DataFrame.select_dtypes: Subset of a DataFrame including/excluding
10642 columns based on their dtype.
10643
10644 Notes
10645 -----
10646 For numeric data, the result's index will include ``count``,
10647 ``mean``, ``std``, ``min``, ``max`` as well as lower, ``50`` and
10648 upper percentiles. By default the lower percentile is ``25`` and the
10649 upper percentile is ``75``. The ``50`` percentile is the
10650 same as the median.
10651
10652 For object data (e.g. strings or timestamps), the result's index
10653 will include ``count``, ``unique``, ``top``, and ``freq``. The ``top``
10654 is the most common value. The ``freq`` is the most common value's
10655 frequency. Timestamps also include the ``first`` and ``last`` items.
10656
        If multiple object values have the highest count, then the ``top``
        result will be arbitrarily chosen from among those with the highest
        count.
10660
10661 For mixed data types provided via a ``DataFrame``, the default is to
10662 return only an analysis of numeric columns. If the dataframe consists
10663 only of object and categorical data without any numeric columns, the
10664 default is to return an analysis of both the object and categorical
10665 columns. If ``include='all'`` is provided as an option, the result
10666 will include a union of attributes of each type.
10667
10668 The `include` and `exclude` parameters can be used to limit
10669 which columns in a ``DataFrame`` are analyzed for the output.
10670 The parameters are ignored when analyzing a ``Series``.
10671
10672 Examples
10673 --------
10674 Describing a numeric ``Series``.
10675
10676 >>> s = pd.Series([1, 2, 3])
10677 >>> s.describe()
10678 count 3.0
10679 mean 2.0
10680 std 1.0
10681 min 1.0
10682 25% 1.5
10683 50% 2.0
10684 75% 2.5
10685 max 3.0
10686 dtype: float64
10687
10688 Describing a categorical ``Series``.
10689
10690 >>> s = pd.Series(['a', 'a', 'b', 'c'])
10691 >>> s.describe()
10692 count 4
10693 unique 3
10694 top a
10695 freq 2
10696 dtype: object
10697
10698 Describing a timestamp ``Series``.
10699
10700 >>> s = pd.Series([
10701 ... np.datetime64("2000-01-01"),
10702 ... np.datetime64("2010-01-01"),
10703 ... np.datetime64("2010-01-01")
10704 ... ])
10705 >>> s.describe()
10706 count 3
10707 mean 2006-09-01 08:00:00
10708 min 2000-01-01 00:00:00
10709 25% 2004-12-31 12:00:00
10710 50% 2010-01-01 00:00:00
10711 75% 2010-01-01 00:00:00
10712 max 2010-01-01 00:00:00
10713 dtype: object
10714
10715 Describing a ``DataFrame``. By default only numeric fields
10716 are returned.
10717
10718 >>> df = pd.DataFrame({'categorical': pd.Categorical(['d','e','f']),
10719 ... 'numeric': [1, 2, 3],
10720 ... 'object': ['a', 'b', 'c']
10721 ... })
10722 >>> df.describe()
10723 numeric
10724 count 3.0
10725 mean 2.0
10726 std 1.0
10727 min 1.0
10728 25% 1.5
10729 50% 2.0
10730 75% 2.5
10731 max 3.0
10732
10733 Describing all columns of a ``DataFrame`` regardless of data type.
10734
10735 >>> df.describe(include='all') # doctest: +SKIP
10736 categorical numeric object
10737 count 3 3.0 3
10738 unique 3 NaN 3
10739 top f NaN a
10740 freq 1 NaN 1
10741 mean NaN 2.0 NaN
10742 std NaN 1.0 NaN
10743 min NaN 1.0 NaN
10744 25% NaN 1.5 NaN
10745 50% NaN 2.0 NaN
10746 75% NaN 2.5 NaN
10747 max NaN 3.0 NaN
10748
10749 Describing a column from a ``DataFrame`` by accessing it as
10750 an attribute.
10751
10752 >>> df.numeric.describe()
10753 count 3.0
10754 mean 2.0
10755 std 1.0
10756 min 1.0
10757 25% 1.5
10758 50% 2.0
10759 75% 2.5
10760 max 3.0
10761 Name: numeric, dtype: float64
10762
10763 Including only numeric columns in a ``DataFrame`` description.
10764
10765 >>> df.describe(include=[np.number])
10766 numeric
10767 count 3.0
10768 mean 2.0
10769 std 1.0
10770 min 1.0
10771 25% 1.5
10772 50% 2.0
10773 75% 2.5
10774 max 3.0
10775
10776 Including only string columns in a ``DataFrame`` description.
10777
10778 >>> df.describe(include=[object]) # doctest: +SKIP
10779 object
10780 count 3
10781 unique 3
10782 top a
10783 freq 1
10784
10785 Including only categorical columns from a ``DataFrame`` description.
10786
10787 >>> df.describe(include=['category'])
10788 categorical
10789 count 3
10790 unique 3
10791 top d
10792 freq 1
10793
10794 Excluding numeric columns from a ``DataFrame`` description.
10795
10796 >>> df.describe(exclude=[np.number]) # doctest: +SKIP
10797 categorical object
10798 count 3 3
10799 unique 3 3
10800 top f a
10801 freq 1 1
10802
10803 Excluding object columns from a ``DataFrame`` description.
10804
10805 >>> df.describe(exclude=[object]) # doctest: +SKIP
10806 categorical numeric
10807 count 3 3.0
10808 unique 3 NaN
10809 top f NaN
10810 freq 1 NaN
10811 mean NaN 2.0
10812 std NaN 1.0
10813 min NaN 1.0
10814 25% NaN 1.5
10815 50% NaN 2.0
10816 75% NaN 2.5
10817 max NaN 3.0
10818 """
10819 return describe_ndframe(
10820 obj=self,
10821 include=include,
10822 exclude=exclude,
10823 percentiles=percentiles,
10824 )
10825
10826 @final
10827 def pct_change(
10828 self: NDFrameT,
10829 periods: int = 1,
10830 fill_method: Literal["backfill", "bfill", "pad", "ffill"] | None = "pad",
10831 limit=None,
10832 freq=None,
10833 **kwargs,
10834 ) -> NDFrameT:
10835 """
10836 Percentage change between the current and a prior element.
10837
10838 Computes the percentage change from the immediately previous row by
10839 default. This is useful in comparing the percentage of change in a time
10840 series of elements.
10841
10842 Parameters
10843 ----------
10844 periods : int, default 1
10845 Periods to shift for forming percent change.
10846 fill_method : {'backfill', 'bfill', 'pad', 'ffill', None}, default 'pad'
10847 How to handle NAs **before** computing percent changes.
10848 limit : int, default None
10849 The number of consecutive NAs to fill before stopping.
10850 freq : DateOffset, timedelta, or str, optional
10851 Increment to use from time series API (e.g. 'M' or BDay()).
10852 **kwargs
10853 Additional keyword arguments are passed into
10854 `DataFrame.shift` or `Series.shift`.
10855
10856 Returns
10857 -------
10858 Series or DataFrame
10859 The same type as the calling object.
10860
10861 See Also
10862 --------
10863 Series.diff : Compute the difference of two elements in a Series.
10864 DataFrame.diff : Compute the difference of two elements in a DataFrame.
10865 Series.shift : Shift the index by some number of periods.
10866 DataFrame.shift : Shift the index by some number of periods.
10867
10868 Examples
10869 --------
10870 **Series**
10871
10872 >>> s = pd.Series([90, 91, 85])
10873 >>> s
10874 0 90
10875 1 91
10876 2 85
10877 dtype: int64
10878
10879 >>> s.pct_change()
10880 0 NaN
10881 1 0.011111
10882 2 -0.065934
10883 dtype: float64
10884
10885 >>> s.pct_change(periods=2)
10886 0 NaN
10887 1 NaN
10888 2 -0.055556
10889 dtype: float64
10890
        See the percentage change in a Series where NAs are filled with the
        last valid observation, carried forward to the next valid one.
10893
10894 >>> s = pd.Series([90, 91, None, 85])
10895 >>> s
10896 0 90.0
10897 1 91.0
10898 2 NaN
10899 3 85.0
10900 dtype: float64
10901
10902 >>> s.pct_change(fill_method='ffill')
10903 0 NaN
10904 1 0.011111
10905 2 0.000000
10906 3 -0.065934
10907 dtype: float64
10908
10909 **DataFrame**
10910
10911 Percentage change in French franc, Deutsche Mark, and Italian lira from
10912 1980-01-01 to 1980-03-01.
10913
10914 >>> df = pd.DataFrame({
10915 ... 'FR': [4.0405, 4.0963, 4.3149],
10916 ... 'GR': [1.7246, 1.7482, 1.8519],
10917 ... 'IT': [804.74, 810.01, 860.13]},
10918 ... index=['1980-01-01', '1980-02-01', '1980-03-01'])
10919 >>> df
10920 FR GR IT
10921 1980-01-01 4.0405 1.7246 804.74
10922 1980-02-01 4.0963 1.7482 810.01
10923 1980-03-01 4.3149 1.8519 860.13
10924
10925 >>> df.pct_change()
10926 FR GR IT
10927 1980-01-01 NaN NaN NaN
10928 1980-02-01 0.013810 0.013684 0.006549
10929 1980-03-01 0.053365 0.059318 0.061876
10930
        Percentage change in GOOG and APPL stock volume. Shows computing
10932 the percentage change between columns.
10933
10934 >>> df = pd.DataFrame({
10935 ... '2016': [1769950, 30586265],
10936 ... '2015': [1500923, 40912316],
10937 ... '2014': [1371819, 41403351]},
10938 ... index=['GOOG', 'APPL'])
10939 >>> df
10940 2016 2015 2014
10941 GOOG 1769950 1500923 1371819
10942 APPL 30586265 40912316 41403351
10943
10944 >>> df.pct_change(axis='columns', periods=-1)
10945 2016 2015 2014
10946 GOOG 0.179241 0.094112 NaN
10947 APPL -0.252395 -0.011860 NaN
10948 """
10949 axis = self._get_axis_number(kwargs.pop("axis", self._stat_axis_name))
10950 if fill_method is None:
10951 data = self
10952 else:
10953 _data = self.fillna(method=fill_method, axis=axis, limit=limit)
10954 assert _data is not None # needed for mypy
10955 data = _data
10956
10957 shifted = data.shift(periods=periods, freq=freq, axis=axis, **kwargs)
10958 # Unsupported left operand type for / ("NDFrameT")
10959 rs = data / shifted - 1 # type: ignore[operator]
10960 if freq is not None:
10961 # Shift method is implemented differently when freq is not None
10962 # We want to restore the original index
10963 rs = rs.loc[~rs.index.duplicated()]
10964 rs = rs.reindex_like(data)
10965 return rs.__finalize__(self, method="pct_change")
10966
10967 @final
10968 def _logical_func(
10969 self,
10970 name: str,
10971 func,
10972 axis: Axis = 0,
10973 bool_only: bool_t = False,
10974 skipna: bool_t = True,
10975 **kwargs,
10976 ) -> Series | bool_t:
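        """
        Shared implementation of boolean reductions such as ``any`` and
        ``all``: validate the kwargs, then reduce with ``func`` along
        ``axis``, optionally restricting to boolean data first.
        """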
10977 nv.validate_logical_func((), kwargs, fname=name)
10978 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
10979
10980 if self.ndim > 1 and axis is None:
10981 # Reduce along one dimension then the other, to simplify DataFrame._reduce
10982 res = self._logical_func(
10983 name, func, axis=0, bool_only=bool_only, skipna=skipna, **kwargs
10984 )
10985 return res._logical_func(name, func, skipna=skipna, **kwargs)
10986
10987 if (
10988 self.ndim > 1
10989 and axis == 1
10990 and len(self._mgr.arrays) > 1
10991 # TODO(EA2D): special-case not needed
10992 and all(x.ndim == 2 for x in self._mgr.arrays)
10993 and not kwargs
10994 ):
10995 # Fastpath avoiding potentially expensive transpose
10996 obj = self
10997 if bool_only:
10998 obj = self._get_bool_data()
10999 return obj._reduce_axis1(name, func, skipna=skipna)
11000
11001 return self._reduce(
11002 func,
11003 name=name,
11004 axis=axis,
11005 skipna=skipna,
11006 numeric_only=bool_only,
11007 filter_type="bool",
11008 )
11009
11010 def any(
11011 self,
11012 axis: Axis = 0,
11013 bool_only: bool_t = False,
11014 skipna: bool_t = True,
11015 **kwargs,
11016 ) -> DataFrame | Series | bool_t:
11017 return self._logical_func(
11018 "any", nanops.nanany, axis, bool_only, skipna, **kwargs
11019 )
11020
11021 def all(
11022 self,
11023 axis: Axis = 0,
11024 bool_only: bool_t = False,
11025 skipna: bool_t = True,
11026 **kwargs,
11027 ) -> Series | bool_t:
11028 return self._logical_func(
11029 "all", nanops.nanall, axis, bool_only, skipna, **kwargs
11030 )
11031
11032 @final
11033 def _accum_func(
11034 self,
11035 name: str,
11036 func,
11037 axis: Axis | None = None,
11038 skipna: bool_t = True,
11039 *args,
11040 **kwargs,
11041 ):
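        """
        Shared implementation of the cumulative functions (``cummin``,
        ``cummax``, ``cumsum``, ``cumprod``): apply ``func`` block-wise,
        transposing first when operating along ``axis=1``.
        """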
11042 skipna = nv.validate_cum_func_with_skipna(skipna, args, kwargs, name)
11043 if axis is None:
11044 axis = self._stat_axis_number
11045 else:
11046 axis = self._get_axis_number(axis)
11047
11048 if axis == 1:
11049 return self.T._accum_func(
11050 name, func, axis=0, skipna=skipna, *args, **kwargs # noqa: B026
11051 ).T
11052
        def block_accum_func(blk_values):
            # 2D block values are stored transposed relative to the frame,
            # so transpose first to accumulate along the index axis
            values = blk_values.T if hasattr(blk_values, "T") else blk_values

            result: np.ndarray | ExtensionArray
            if isinstance(values, ExtensionArray):
                # extension arrays implement their own accumulation logic
                result = values._accumulate(name, skipna=skipna, **kwargs)
            else:
                result = nanops.na_accum_func(values, func, skipna=skipna)

            # transpose back to block layout
            result = result.T if hasattr(result, "T") else result
            return result
11064
11065 result = self._mgr.apply(block_accum_func)
11066
11067 return self._constructor(result).__finalize__(self, method=name)
11068
11069 def cummax(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
11070 return self._accum_func(
11071 "cummax", np.maximum.accumulate, axis, skipna, *args, **kwargs
11072 )
11073
11074 def cummin(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
11075 return self._accum_func(
11076 "cummin", np.minimum.accumulate, axis, skipna, *args, **kwargs
11077 )
11078
11079 def cumsum(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
11080 return self._accum_func("cumsum", np.cumsum, axis, skipna, *args, **kwargs)
11081
11082 def cumprod(self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs):
11083 return self._accum_func("cumprod", np.cumprod, axis, skipna, *args, **kwargs)
11084
11085 @final
11086 def _stat_function_ddof(
11087 self,
11088 name: str,
11089 func,
11090 axis: Axis | None = None,
11091 skipna: bool_t = True,
11092 ddof: int = 1,
11093 numeric_only: bool_t = False,
11094 **kwargs,
11095 ) -> Series | float:
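        """
        Shared implementation of the ddof-based reductions ``sem``, ``var``
        and ``std``.
        """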
11096 nv.validate_stat_ddof_func((), kwargs, fname=name)
11097 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
11098 if axis is None:
11099 axis = self._stat_axis_number
11100
11101 return self._reduce(
11102 func, name, axis=axis, numeric_only=numeric_only, skipna=skipna, ddof=ddof
11103 )
11104
11105 def sem(
11106 self,
11107 axis: Axis | None = None,
11108 skipna: bool_t = True,
11109 ddof: int = 1,
11110 numeric_only: bool_t = False,
11111 **kwargs,
11112 ) -> Series | float:
11113 return self._stat_function_ddof(
11114 "sem", nanops.nansem, axis, skipna, ddof, numeric_only, **kwargs
11115 )
11116
11117 def var(
11118 self,
11119 axis: Axis | None = None,
11120 skipna: bool_t = True,
11121 ddof: int = 1,
11122 numeric_only: bool_t = False,
11123 **kwargs,
11124 ) -> Series | float:
11125 return self._stat_function_ddof(
11126 "var", nanops.nanvar, axis, skipna, ddof, numeric_only, **kwargs
11127 )
11128
11129 def std(
11130 self,
11131 axis: Axis | None = None,
11132 skipna: bool_t = True,
11133 ddof: int = 1,
11134 numeric_only: bool_t = False,
11135 **kwargs,
11136 ) -> Series | float:
11137 return self._stat_function_ddof(
11138 "std", nanops.nanstd, axis, skipna, ddof, numeric_only, **kwargs
11139 )
11140
11141 @final
11142 def _stat_function(
11143 self,
11144 name: str,
11145 func,
11146 axis: Axis | None = 0,
11147 skipna: bool_t = True,
11148 numeric_only: bool_t = False,
11149 **kwargs,
11150 ):
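        """
        Shared implementation of reductions such as ``min``, ``max``,
        ``mean`` and ``median`` that take no reduction-specific arguments.
        """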
11151 if name == "median":
11152 nv.validate_median((), kwargs)
11153 else:
11154 nv.validate_stat_func((), kwargs, fname=name)
11155
11156 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
11157
11158 return self._reduce(
11159 func, name=name, axis=axis, skipna=skipna, numeric_only=numeric_only
11160 )
11161
11162 def min(
11163 self,
11164 axis: Axis | None = 0,
11165 skipna: bool_t = True,
11166 numeric_only: bool_t = False,
11167 **kwargs,
11168 ):
11169 return self._stat_function(
11170 "min",
11171 nanops.nanmin,
11172 axis,
11173 skipna,
11174 numeric_only,
11175 **kwargs,
11176 )
11177
11178 def max(
11179 self,
11180 axis: Axis | None = 0,
11181 skipna: bool_t = True,
11182 numeric_only: bool_t = False,
11183 **kwargs,
11184 ):
11185 return self._stat_function(
11186 "max",
11187 nanops.nanmax,
11188 axis,
11189 skipna,
11190 numeric_only,
11191 **kwargs,
11192 )
11193
11194 def mean(
11195 self,
11196 axis: Axis | None = 0,
11197 skipna: bool_t = True,
11198 numeric_only: bool_t = False,
11199 **kwargs,
11200 ) -> Series | float:
11201 return self._stat_function(
11202 "mean", nanops.nanmean, axis, skipna, numeric_only, **kwargs
11203 )
11204
11205 def median(
11206 self,
11207 axis: Axis | None = 0,
11208 skipna: bool_t = True,
11209 numeric_only: bool_t = False,
11210 **kwargs,
11211 ) -> Series | float:
11212 return self._stat_function(
11213 "median", nanops.nanmedian, axis, skipna, numeric_only, **kwargs
11214 )
11215
11216 def skew(
11217 self,
11218 axis: Axis | None = 0,
11219 skipna: bool_t = True,
11220 numeric_only: bool_t = False,
11221 **kwargs,
11222 ) -> Series | float:
11223 return self._stat_function(
11224 "skew", nanops.nanskew, axis, skipna, numeric_only, **kwargs
11225 )
11226
11227 def kurt(
11228 self,
11229 axis: Axis | None = 0,
11230 skipna: bool_t = True,
11231 numeric_only: bool_t = False,
11232 **kwargs,
11233 ) -> Series | float:
11234 return self._stat_function(
11235 "kurt", nanops.nankurt, axis, skipna, numeric_only, **kwargs
11236 )
11237
11238 kurtosis = kurt
11239
11240 @final
11241 def _min_count_stat_function(
11242 self,
11243 name: str,
11244 func,
11245 axis: Axis | None = None,
11246 skipna: bool_t = True,
11247 numeric_only: bool_t = False,
11248 min_count: int = 0,
11249 **kwargs,
11250 ):
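        """
        Shared implementation of ``sum`` and ``prod``, which additionally
        honor ``min_count``, the number of non-NA values required for a
        non-NA result.
        """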
11251 if name == "sum":
11252 nv.validate_sum((), kwargs)
11253 elif name == "prod":
11254 nv.validate_prod((), kwargs)
11255 else:
11256 nv.validate_stat_func((), kwargs, fname=name)
11257
11258 validate_bool_kwarg(skipna, "skipna", none_allowed=False)
11259
11260 if axis is None:
11261 axis = self._stat_axis_number
11262
11263 return self._reduce(
11264 func,
11265 name=name,
11266 axis=axis,
11267 skipna=skipna,
11268 numeric_only=numeric_only,
11269 min_count=min_count,
11270 )
11271
11272 def sum(
11273 self,
11274 axis: Axis | None = None,
11275 skipna: bool_t = True,
11276 numeric_only: bool_t = False,
11277 min_count: int = 0,
11278 **kwargs,
11279 ):
11280 return self._min_count_stat_function(
11281 "sum", nanops.nansum, axis, skipna, numeric_only, min_count, **kwargs
11282 )
11283
11284 def prod(
11285 self,
11286 axis: Axis | None = None,
11287 skipna: bool_t = True,
11288 numeric_only: bool_t = False,
11289 min_count: int = 0,
11290 **kwargs,
11291 ):
11292 return self._min_count_stat_function(
11293 "prod",
11294 nanops.nanprod,
11295 axis,
11296 skipna,
11297 numeric_only,
11298 min_count,
11299 **kwargs,
11300 )
11301
11302 product = prod
11303
11304 @classmethod
11305 def _add_numeric_operations(cls) -> None:
        """
        Add the numeric operations to the cls; re-evaluate the docstrings
        with the class-specific names substituted in.
        """
11309 axis_descr, name1, name2 = _doc_params(cls)
11310
11311 @doc(
11312 _bool_doc,
11313 desc=_any_desc,
11314 name1=name1,
11315 name2=name2,
11316 axis_descr=axis_descr,
11317 see_also=_any_see_also,
11318 examples=_any_examples,
11319 empty_value=False,
11320 )
11321 def any(
11322 self,
11323 *,
11324 axis: Axis = 0,
11325 bool_only=None,
11326 skipna: bool_t = True,
11327 **kwargs,
11328 ):
11329 return NDFrame.any(
11330 self,
11331 axis=axis,
11332 bool_only=bool_only,
11333 skipna=skipna,
11334 **kwargs,
11335 )
11336
11337 setattr(cls, "any", any)
11338
11339 @doc(
11340 _bool_doc,
11341 desc=_all_desc,
11342 name1=name1,
11343 name2=name2,
11344 axis_descr=axis_descr,
11345 see_also=_all_see_also,
11346 examples=_all_examples,
11347 empty_value=True,
11348 )
11349 def all(
11350 self,
11351 axis: Axis = 0,
11352 bool_only=None,
11353 skipna: bool_t = True,
11354 **kwargs,
11355 ):
11356 return NDFrame.all(self, axis, bool_only, skipna, **kwargs)
11357
11358 setattr(cls, "all", all)
11359
11360 @doc(
11361 _num_ddof_doc,
11362 desc="Return unbiased standard error of the mean over requested "
11363 "axis.\n\nNormalized by N-1 by default. This can be changed "
            "using the ddof argument.",
11365 name1=name1,
11366 name2=name2,
11367 axis_descr=axis_descr,
11368 notes="",
11369 examples="",
11370 )
11371 def sem(
11372 self,
11373 axis: Axis | None = None,
11374 skipna: bool_t = True,
11375 ddof: int = 1,
11376 numeric_only: bool_t = False,
11377 **kwargs,
11378 ):
11379 return NDFrame.sem(self, axis, skipna, ddof, numeric_only, **kwargs)
11380
11381 setattr(cls, "sem", sem)

        @doc(
            _num_ddof_doc,
            desc="Return unbiased variance over requested axis.\n\nNormalized by "
            "N-1 by default. This can be changed using the ddof argument.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            notes="",
            examples=_var_examples,
        )
        def var(
            self,
            axis: Axis | None = None,
            skipna: bool_t = True,
            ddof: int = 1,
            numeric_only: bool_t = False,
            **kwargs,
        ):
            return NDFrame.var(self, axis, skipna, ddof, numeric_only, **kwargs)

        setattr(cls, "var", var)

        @doc(
            _num_ddof_doc,
            desc="Return sample standard deviation over requested axis."
            "\n\nNormalized by N-1 by default. This can be changed using the "
            "ddof argument.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            notes=_std_notes,
            examples=_std_examples,
        )
        def std(
            self,
            axis: Axis | None = None,
            skipna: bool_t = True,
            ddof: int = 1,
            numeric_only: bool_t = False,
            **kwargs,
        ):
            return NDFrame.std(self, axis, skipna, ddof, numeric_only, **kwargs)

        setattr(cls, "std", std)

        @doc(
            _cnum_doc,
            desc="minimum",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            accum_func_name="min",
            examples=_cummin_examples,
        )
        def cummin(
            self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs
        ):
            return NDFrame.cummin(self, axis, skipna, *args, **kwargs)

        setattr(cls, "cummin", cummin)

        @doc(
            _cnum_doc,
            desc="maximum",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            accum_func_name="max",
            examples=_cummax_examples,
        )
        def cummax(
            self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs
        ):
            return NDFrame.cummax(self, axis, skipna, *args, **kwargs)

        setattr(cls, "cummax", cummax)

        @doc(
            _cnum_doc,
            desc="sum",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            accum_func_name="sum",
            examples=_cumsum_examples,
        )
        def cumsum(
            self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs
        ):
            return NDFrame.cumsum(self, axis, skipna, *args, **kwargs)

        setattr(cls, "cumsum", cumsum)

        @doc(
            _cnum_doc,
            desc="product",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            accum_func_name="prod",
            examples=_cumprod_examples,
        )
        def cumprod(
            self, axis: Axis | None = None, skipna: bool_t = True, *args, **kwargs
        ):
            return NDFrame.cumprod(self, axis, skipna, *args, **kwargs)

        setattr(cls, "cumprod", cumprod)

        # error: Untyped decorator makes function "sum" untyped
        @doc(  # type: ignore[misc]
            _num_doc,
            desc="Return the sum of the values over the requested axis.\n\n"
            "This is equivalent to the method ``numpy.sum``.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count=_min_count_stub,
            see_also=_stat_func_see_also,
            examples=_sum_examples,
        )
        def sum(
            self,
            axis: Axis | None = None,
            skipna: bool_t = True,
            numeric_only: bool_t = False,
            min_count: int = 0,
            **kwargs,
        ):
            return NDFrame.sum(self, axis, skipna, numeric_only, min_count, **kwargs)

        setattr(cls, "sum", sum)

        @doc(
            _num_doc,
            desc="Return the product of the values over the requested axis.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count=_min_count_stub,
            see_also=_stat_func_see_also,
            examples=_prod_examples,
        )
        def prod(
            self,
            axis: Axis | None = None,
            skipna: bool_t = True,
            numeric_only: bool_t = False,
            min_count: int = 0,
            **kwargs,
        ):
            return NDFrame.prod(self, axis, skipna, numeric_only, min_count, **kwargs)

        setattr(cls, "prod", prod)
        cls.product = prod

        @doc(
            _num_doc,
            desc="Return the mean of the values over the requested axis.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count="",
            see_also="",
            examples="",
        )
        def mean(
            self,
            axis: AxisInt | None = 0,
            skipna: bool_t = True,
            numeric_only: bool_t = False,
            **kwargs,
        ):
            return NDFrame.mean(self, axis, skipna, numeric_only, **kwargs)

        setattr(cls, "mean", mean)

        @doc(
            _num_doc,
            desc="Return unbiased skew over requested axis.\n\nNormalized by N-1.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count="",
            see_also="",
            examples="",
        )
        def skew(
            self,
            axis: AxisInt | None = 0,
            skipna: bool_t = True,
            numeric_only: bool_t = False,
            **kwargs,
        ):
            return NDFrame.skew(self, axis, skipna, numeric_only, **kwargs)

        setattr(cls, "skew", skew)

        @doc(
            _num_doc,
            desc="Return unbiased kurtosis over requested axis.\n\n"
            "Kurtosis obtained using Fisher's definition of\n"
            "kurtosis (kurtosis of normal == 0.0). Normalized "
            "by N-1.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count="",
            see_also="",
            examples="",
        )
        def kurt(
            self,
            axis: Axis | None = 0,
            skipna: bool_t = True,
            numeric_only: bool_t = False,
            **kwargs,
        ):
            return NDFrame.kurt(self, axis, skipna, numeric_only, **kwargs)

        setattr(cls, "kurt", kurt)
        cls.kurtosis = kurt

        @doc(
            _num_doc,
            desc="Return the median of the values over the requested axis.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count="",
            see_also="",
            examples="",
        )
        def median(
            self,
            axis: AxisInt | None = 0,
            skipna: bool_t = True,
            numeric_only: bool_t = False,
            **kwargs,
        ):
            return NDFrame.median(self, axis, skipna, numeric_only, **kwargs)

        setattr(cls, "median", median)

        @doc(
            _num_doc,
            desc="Return the maximum of the values over the requested axis.\n\n"
            "If you want the *index* of the maximum, use ``idxmax``. This is "
            "the equivalent of the ``numpy.ndarray`` method ``argmax``.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count="",
            see_also=_stat_func_see_also,
            examples=_max_examples,
        )
        def max(
            self,
            axis: AxisInt | None = 0,
            skipna: bool_t = True,
            numeric_only: bool_t = False,
            **kwargs,
        ):
            return NDFrame.max(self, axis, skipna, numeric_only, **kwargs)

        setattr(cls, "max", max)

        @doc(
            _num_doc,
            desc="Return the minimum of the values over the requested axis.\n\n"
            "If you want the *index* of the minimum, use ``idxmin``. This is "
            "the equivalent of the ``numpy.ndarray`` method ``argmin``.",
            name1=name1,
            name2=name2,
            axis_descr=axis_descr,
            min_count="",
            see_also=_stat_func_see_also,
            examples=_min_examples,
        )
        def min(
            self,
            axis: AxisInt | None = 0,
            skipna: bool_t = True,
            numeric_only: bool_t = False,
            **kwargs,
        ):
            return NDFrame.min(self, axis, skipna, numeric_only, **kwargs)

        setattr(cls, "min", min)

    @final
    @doc(Rolling)
    def rolling(
        self,
        window: int | dt.timedelta | str | BaseOffset | BaseIndexer,
        min_periods: int | None = None,
        center: bool_t = False,
        win_type: str | None = None,
        on: str | None = None,
        axis: Axis = 0,
        closed: str | None = None,
        step: int | None = None,
        method: str = "single",
    ) -> Window | Rolling:
        axis = self._get_axis_number(axis)

        if win_type is not None:
            return Window(
                self,
                window=window,
                min_periods=min_periods,
                center=center,
                win_type=win_type,
                on=on,
                axis=axis,
                closed=closed,
                step=step,
                method=method,
            )

        return Rolling(
            self,
            window=window,
            min_periods=min_periods,
            center=center,
            win_type=win_type,
            on=on,
            axis=axis,
            closed=closed,
            step=step,
            method=method,
        )
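
    # Illustrative usage (comments only): passing ``win_type`` selects the
    # scipy-backed ``Window`` variant; otherwise a plain ``Rolling`` object is
    # returned. E.g. (the second call assumes scipy is installed):
    # >>> pd.Series(range(5)).rolling(3).mean()                     # Rolling
    # >>> pd.Series(range(5)).rolling(3, win_type="triang").mean()  # Window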

    @final
    @doc(Expanding)
    def expanding(
        self,
        min_periods: int = 1,
        axis: Axis = 0,
        method: str = "single",
    ) -> Expanding:
        axis = self._get_axis_number(axis)
        return Expanding(self, min_periods=min_periods, axis=axis, method=method)

    @final
    @doc(ExponentialMovingWindow)
    def ewm(
        self,
        com: float | None = None,
        span: float | None = None,
        halflife: float | TimedeltaConvertibleTypes | None = None,
        alpha: float | None = None,
        min_periods: int | None = 0,
        adjust: bool_t = True,
        ignore_na: bool_t = False,
        axis: Axis = 0,
        times: np.ndarray | DataFrame | Series | None = None,
        method: str = "single",
    ) -> ExponentialMovingWindow:
        axis = self._get_axis_number(axis)
        return ExponentialMovingWindow(
            self,
            com=com,
            span=span,
            halflife=halflife,
            alpha=alpha,
            min_periods=min_periods,
            adjust=adjust,
            ignore_na=ignore_na,
            axis=axis,
            times=times,
            method=method,
        )
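
    # Illustrative usage (comments only): when ``times`` is not given, exactly
    # one of ``com``, ``span``, ``halflife`` or ``alpha`` fixes the decay, e.g.
    # >>> pd.Series([1, 2, 3]).ewm(com=0.5).mean()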

    # ----------------------------------------------------------------------
    # Arithmetic Methods

    @final
    def _inplace_method(self, other, op):
        """
        Wrap arithmetic method to operate inplace.
        """
        result = op(self, other)

        if (
            self.ndim == 1
            and result._indexed_same(self)
            and is_dtype_equal(result.dtype, self.dtype)
        ):
            # GH#36498 this inplace op can _actually_ be inplace.
            # Item "ArrayManager" of "Union[ArrayManager, SingleArrayManager,
            # BlockManager, SingleBlockManager]" has no attribute "setitem_inplace"
            self._mgr.setitem_inplace(  # type: ignore[union-attr]
                slice(None), result._values
            )
            return self

        # Delete cacher
        self._reset_cacher()

        # this makes sure that we are aligned like the input
        # we are updating inplace so we want to ignore is_copy
        self._update_inplace(
            result.reindex_like(self, copy=False), verify_is_copy=False
        )
        return self

    def __iadd__(self: NDFrameT, other) -> NDFrameT:
        # error: Unsupported left operand type for + ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__add__)  # type: ignore[operator]

    def __isub__(self: NDFrameT, other) -> NDFrameT:
        # error: Unsupported left operand type for - ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__sub__)  # type: ignore[operator]

    def __imul__(self: NDFrameT, other) -> NDFrameT:
        # error: Unsupported left operand type for * ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__mul__)  # type: ignore[operator]

    def __itruediv__(self: NDFrameT, other) -> NDFrameT:
        # error: Unsupported left operand type for / ("Type[NDFrame]")
        return self._inplace_method(
            other, type(self).__truediv__  # type: ignore[operator]
        )

    def __ifloordiv__(self: NDFrameT, other) -> NDFrameT:
        # error: Unsupported left operand type for // ("Type[NDFrame]")
        return self._inplace_method(
            other, type(self).__floordiv__  # type: ignore[operator]
        )

    def __imod__(self: NDFrameT, other) -> NDFrameT:
        # error: Unsupported left operand type for % ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__mod__)  # type: ignore[operator]

    def __ipow__(self: NDFrameT, other) -> NDFrameT:
        # error: Unsupported left operand type for ** ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__pow__)  # type: ignore[operator]

    def __iand__(self: NDFrameT, other) -> NDFrameT:
        # error: Unsupported left operand type for & ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__and__)  # type: ignore[operator]

    def __ior__(self: NDFrameT, other) -> NDFrameT:
        # error: Unsupported left operand type for | ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__or__)  # type: ignore[operator]

    def __ixor__(self: NDFrameT, other) -> NDFrameT:
        # error: Unsupported left operand type for ^ ("Type[NDFrame]")
        return self._inplace_method(other, type(self).__xor__)  # type: ignore[operator]
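
    # Illustrative sketch (comments only): when alignment and dtype are
    # preserved, the augmented operators above mutate the object itself:
    # >>> s = pd.Series([1, 2, 3]); before = id(s)
    # >>> s += 1
    # >>> id(s) == before  # same object, values written in place
    # True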

    # ----------------------------------------------------------------------
    # Misc methods

    @final
    def _find_valid_index(self, *, how: str) -> Hashable | None:
        """
        Retrieves the index of the first or last valid value, depending on ``how``.

        Parameters
        ----------
        how : {'first', 'last'}
            Use this parameter to change between the first or last valid index.

        Returns
        -------
        idx_first_valid : type of index
        """
        idxpos = find_valid_index(self._values, how=how, is_valid=~isna(self._values))
        if idxpos is None:
            return None
        return self.index[idxpos]

    @final
    @doc(position="first", klass=_shared_doc_kwargs["klass"])
    def first_valid_index(self) -> Hashable | None:
        """
        Return index for {position} non-NA value, or None if no non-NA value is found.

        Returns
        -------
        type of index

        Notes
        -----
        If all elements are NA/null, returns None.
        Also returns None for empty {klass}.
        """
        return self._find_valid_index(how="first")

    @final
    @doc(first_valid_index, position="last", klass=_shared_doc_kwargs["klass"])
    def last_valid_index(self) -> Hashable | None:
        return self._find_valid_index(how="last")
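
    # Illustrative usage (comments only):
    # >>> s = pd.Series([np.nan, 3.0, np.nan, 4.0])
    # >>> s.first_valid_index(), s.last_valid_index()
    # (1, 3)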


def _doc_params(cls):
    """Return a tuple of the doc params."""
    axis_descr = (
        f"{{{', '.join([f'{a} ({i})' for i, a in enumerate(cls._AXIS_ORDERS)])}}}"
    )
    name = cls._constructor_sliced.__name__ if cls._AXIS_LEN > 1 else "scalar"
    name2 = cls.__name__
    return axis_descr, name, name2
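
# For a DataFrame, ``_doc_params`` evaluates to roughly
# ("{index (0), columns (1)}", "Series", "DataFrame"); the templates below
# interpolate these as {axis_descr}, {name1} and {name2}.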


_num_doc = """
{desc}

Parameters
----------
axis : {axis_descr}
    Axis for the function to be applied on.
    For `Series` this parameter is unused and defaults to 0.

    For DataFrames, specifying ``axis=None`` will apply the aggregation
    across both axes.

    .. versionadded:: 2.0.0

skipna : bool, default True
    Exclude NA/null values when computing the result.
numeric_only : bool, default False
    Include only float, int, boolean columns. Not implemented for Series.

{min_count}\
**kwargs
    Additional keyword arguments to be passed to the function.

Returns
-------
{name1} or scalar\
{see_also}\
{examples}
"""
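
# Sketch of how the template above is consumed (comments only): the ``@doc``
# decorator performs str.format-style substitution of the named fields,
# roughly like
# >>> _num_doc.format(desc="Return the sum...", axis_descr="{index (0)}",
# ...                 min_count="", see_also="", examples="", name1="Series")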

_num_ddof_doc = """
{desc}

Parameters
----------
axis : {axis_descr}
    For `Series` this parameter is unused and defaults to 0.
skipna : bool, default True
    Exclude NA/null values. If an entire row/column is NA, the result
    will be NA.
ddof : int, default 1
    Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
    where N represents the number of elements.
numeric_only : bool, default False
    Include only float, int, boolean columns. Not implemented for Series.

Returns
-------
{name1} or {name2} (if level specified) \
{notes}\
{examples}
"""

_std_notes = """

Notes
-----
To have the same behaviour as `numpy.std`, use `ddof=0` (instead of the
default `ddof=1`)."""

_std_examples = """

Examples
--------
>>> df = pd.DataFrame({'person_id': [0, 1, 2, 3],
...                    'age': [21, 25, 62, 43],
...                    'height': [1.61, 1.87, 1.49, 2.01]}
...                   ).set_index('person_id')
>>> df
           age  height
person_id
0           21    1.61
1           25    1.87
2           62    1.49
3           43    2.01

The standard deviation of the columns can be found as follows:

>>> df.std()
age       18.786076
height     0.237417
dtype: float64

Alternatively, `ddof=0` can be set to normalize by N instead of N-1:

>>> df.std(ddof=0)
age       16.269219
height     0.205609
dtype: float64"""

_var_examples = """

Examples
--------
>>> df = pd.DataFrame({'person_id': [0, 1, 2, 3],
...                    'age': [21, 25, 62, 43],
...                    'height': [1.61, 1.87, 1.49, 2.01]}
...                   ).set_index('person_id')
>>> df
           age  height
person_id
0           21    1.61
1           25    1.87
2           62    1.49
3           43    2.01

>>> df.var()
age       352.916667
height      0.056367
dtype: float64

Alternatively, ``ddof=0`` can be set to normalize by N instead of N-1:

>>> df.var(ddof=0)
age       264.687500
height      0.042275
dtype: float64"""

_bool_doc = """
{desc}

Parameters
----------
axis : {{0 or 'index', 1 or 'columns', None}}, default 0
    Indicate which axis or axes should be reduced. For `Series` this parameter
    is unused and defaults to 0.

    * 0 / 'index' : reduce the index, return a Series whose index is the
      original column labels.
    * 1 / 'columns' : reduce the columns, return a Series whose index is the
      original index.
    * None : reduce all axes, return a scalar.

bool_only : bool, default None
    Include only boolean columns. If None, will attempt to use everything,
    then use only boolean data. Not implemented for Series.
skipna : bool, default True
    Exclude NA/null values. If the entire row/column is NA and skipna is
    True, then the result will be {empty_value}, as for an empty row/column.
    If skipna is False, then NA are treated as True, because these are not
    equal to zero.
**kwargs : any, default None
    Additional keywords have no effect but might be accepted for
    compatibility with NumPy.

Returns
-------
{name1} or {name2}
    If level is specified, then {name2} is returned; otherwise, {name1}
    is returned.

{see_also}
{examples}"""

_all_desc = """\
Return whether all elements are True, potentially over an axis.

Returns True unless there is at least one element within a series or
along a DataFrame axis that is False or equivalent (e.g. zero or
empty)."""

_all_examples = """\
Examples
--------
**Series**

>>> pd.Series([True, True]).all()
True
>>> pd.Series([True, False]).all()
False
>>> pd.Series([], dtype="float64").all()
True
>>> pd.Series([np.nan]).all()
True
>>> pd.Series([np.nan]).all(skipna=False)
True

**DataFrames**

Create a dataframe from a dictionary.

>>> df = pd.DataFrame({'col1': [True, True], 'col2': [True, False]})
>>> df
   col1   col2
0  True   True
1  True  False

Default behaviour checks if values in each column all return True.

>>> df.all()
col1     True
col2    False
dtype: bool

Specify ``axis='columns'`` to check if values in each row all return True.

>>> df.all(axis='columns')
0     True
1    False
dtype: bool

Or ``axis=None`` for whether every value is True.

>>> df.all(axis=None)
False
"""

_all_see_also = """\
See Also
--------
Series.all : Return True if all elements are True.
DataFrame.any : Return True if one (or more) elements are True.
"""

_cnum_doc = """
Return cumulative {desc} over a DataFrame or Series axis.

Returns a DataFrame or Series of the same size containing the cumulative
{desc}.

Parameters
----------
axis : {{0 or 'index', 1 or 'columns'}}, default 0
    The index or the name of the axis. 0 is equivalent to None or 'index'.
    For `Series` this parameter is unused and defaults to 0.
skipna : bool, default True
    Exclude NA/null values. If an entire row/column is NA, the result
    will be NA.
*args, **kwargs
    Additional keywords have no effect but might be accepted for
    compatibility with NumPy.

Returns
-------
{name1} or {name2}
    Return cumulative {desc} of {name1} or {name2}.

See Also
--------
core.window.expanding.Expanding.{accum_func_name} : Similar functionality
    but ignores ``NaN`` values.
{name2}.{accum_func_name} : Return the {desc} over
    {name2} axis.
{name2}.cummax : Return cumulative maximum over {name2} axis.
{name2}.cummin : Return cumulative minimum over {name2} axis.
{name2}.cumsum : Return cumulative sum over {name2} axis.
{name2}.cumprod : Return cumulative product over {name2} axis.

{examples}"""

_cummin_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cummin()
0    2.0
1    NaN
2    2.0
3   -1.0
4   -1.0
dtype: float64

To include NA values in the operation, use ``skipna=False``.

>>> s.cummin(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the minimum
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cummin()
     A    B
0  2.0  1.0
1  2.0  NaN
2  1.0  0.0

To iterate over columns and find the minimum in each row,
use ``axis=1``.

>>> df.cummin(axis=1)
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0
"""

_cumsum_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cumsum()
0    2.0
1    NaN
2    7.0
3    6.0
4    6.0
dtype: float64

To include NA values in the operation, use ``skipna=False``.

>>> s.cumsum(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the sum
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cumsum()
     A    B
0  2.0  1.0
1  5.0  NaN
2  6.0  1.0

To iterate over columns and find the sum in each row,
use ``axis=1``.

>>> df.cumsum(axis=1)
     A    B
0  2.0  3.0
1  3.0  NaN
2  1.0  1.0
"""

_cumprod_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cumprod()
0     2.0
1     NaN
2    10.0
3   -10.0
4    -0.0
dtype: float64

To include NA values in the operation, use ``skipna=False``.

>>> s.cumprod(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the product
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cumprod()
     A    B
0  2.0  1.0
1  6.0  NaN
2  6.0  0.0

To iterate over columns and find the product in each row,
use ``axis=1``.

>>> df.cumprod(axis=1)
     A    B
0  2.0  2.0
1  3.0  NaN
2  1.0  0.0
"""

_cummax_examples = """\
Examples
--------
**Series**

>>> s = pd.Series([2, np.nan, 5, -1, 0])
>>> s
0    2.0
1    NaN
2    5.0
3   -1.0
4    0.0
dtype: float64

By default, NA values are ignored.

>>> s.cummax()
0    2.0
1    NaN
2    5.0
3    5.0
4    5.0
dtype: float64

To include NA values in the operation, use ``skipna=False``.

>>> s.cummax(skipna=False)
0    2.0
1    NaN
2    NaN
3    NaN
4    NaN
dtype: float64

**DataFrame**

>>> df = pd.DataFrame([[2.0, 1.0],
...                    [3.0, np.nan],
...                    [1.0, 0.0]],
...                   columns=list('AB'))
>>> df
     A    B
0  2.0  1.0
1  3.0  NaN
2  1.0  0.0

By default, iterates over rows and finds the maximum
in each column. This is equivalent to ``axis=None`` or ``axis='index'``.

>>> df.cummax()
     A    B
0  2.0  1.0
1  3.0  NaN
2  3.0  1.0

To iterate over columns and find the maximum in each row,
use ``axis=1``.

>>> df.cummax(axis=1)
     A    B
0  2.0  2.0
1  3.0  NaN
2  1.0  1.0
"""

_any_see_also = """\
See Also
--------
numpy.any : NumPy version of this method.
Series.any : Return whether any element is True.
Series.all : Return whether all elements are True.
DataFrame.any : Return whether any element is True over requested axis.
DataFrame.all : Return whether all elements are True over requested axis.
"""

_any_desc = """\
Return whether any element is True, potentially over an axis.

Returns False unless there is at least one element within a series or
along a DataFrame axis that is True or equivalent (e.g. non-zero or
non-empty)."""

_any_examples = """\
Examples
--------
**Series**

For Series input, the output is a scalar indicating whether any element
is True.

>>> pd.Series([False, False]).any()
False
>>> pd.Series([True, False]).any()
True
>>> pd.Series([], dtype="float64").any()
False
>>> pd.Series([np.nan]).any()
False
>>> pd.Series([np.nan]).any(skipna=False)
True

**DataFrame**

Whether each column contains at least one True element (the default).

>>> df = pd.DataFrame({"A": [1, 2], "B": [0, 2], "C": [0, 0]})
>>> df
   A  B  C
0  1  0  0
1  2  2  0

>>> df.any()
A     True
B     True
C    False
dtype: bool

Aggregating over the columns.

>>> df = pd.DataFrame({"A": [True, False], "B": [1, 2]})
>>> df
       A  B
0   True  1
1  False  2

>>> df.any(axis='columns')
0    True
1    True
dtype: bool

>>> df = pd.DataFrame({"A": [True, False], "B": [1, 0]})
>>> df
       A  B
0   True  1
1  False  0

>>> df.any(axis='columns')
0    True
1    False
dtype: bool

Aggregating over the entire DataFrame with ``axis=None``.

>>> df.any(axis=None)
True

`any` for an empty DataFrame is an empty Series.

>>> pd.DataFrame([]).any()
Series([], dtype: bool)
"""

_shared_docs[
    "stat_func_example"
] = """

Examples
--------
>>> idx = pd.MultiIndex.from_arrays([
...     ['warm', 'warm', 'cold', 'cold'],
...     ['dog', 'falcon', 'fish', 'spider']],
...     names=['blooded', 'animal'])
>>> s = pd.Series([4, 2, 0, 8], name='legs', index=idx)
>>> s
blooded  animal
warm     dog        4
         falcon     2
cold     fish       0
         spider     8
Name: legs, dtype: int64

>>> s.{stat_func}()
{default_output}"""

_sum_examples = _shared_docs["stat_func_example"].format(
    stat_func="sum", verb="Sum", default_output=14, level_output_0=6, level_output_1=8
)

_sum_examples += """

By default, the sum of an empty or all-NA Series is ``0``.

>>> pd.Series([], dtype="float64").sum()  # min_count=0 is the default
0.0

This can be controlled with the ``min_count`` parameter. For example, if
you'd like the sum of an empty series to be NaN, pass ``min_count=1``.

>>> pd.Series([], dtype="float64").sum(min_count=1)
nan

Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
empty series identically.

>>> pd.Series([np.nan]).sum()
0.0

>>> pd.Series([np.nan]).sum(min_count=1)
nan"""

_max_examples: str = _shared_docs["stat_func_example"].format(
    stat_func="max", verb="Max", default_output=8, level_output_0=4, level_output_1=8
)

_min_examples: str = _shared_docs["stat_func_example"].format(
    stat_func="min", verb="Min", default_output=0, level_output_0=2, level_output_1=0
)

_stat_func_see_also = """

See Also
--------
Series.sum : Return the sum.
Series.min : Return the minimum.
Series.max : Return the maximum.
Series.idxmin : Return the index of the minimum.
Series.idxmax : Return the index of the maximum.
DataFrame.sum : Return the sum over the requested axis.
DataFrame.min : Return the minimum over the requested axis.
DataFrame.max : Return the maximum over the requested axis.
DataFrame.idxmin : Return the index of the minimum over the requested axis.
DataFrame.idxmax : Return the index of the maximum over the requested axis."""

_prod_examples = """

Examples
--------
By default, the product of an empty or all-NA Series is ``1``.

>>> pd.Series([], dtype="float64").prod()
1.0

This can be controlled with the ``min_count`` parameter.

>>> pd.Series([], dtype="float64").prod(min_count=1)
nan

Thanks to the ``skipna`` parameter, ``min_count`` handles all-NA and
empty series identically.

>>> pd.Series([np.nan]).prod()
1.0

>>> pd.Series([np.nan]).prod(min_count=1)
nan"""

_min_count_stub = """\
min_count : int, default 0
    The required number of valid values to perform the operation. If fewer than
    ``min_count`` non-NA values are present the result will be NA.
"""


def _align_as_utc(
    left: NDFrameT, right: NDFrameT, join_index: Index | None
) -> tuple[NDFrameT, NDFrameT]:
    """
    If we are aligning timezone-aware DatetimeIndexes and the timezones
    do not match, convert both to UTC.
    """
    if is_datetime64tz_dtype(left.index.dtype):
        if left.index.tz != right.index.tz:
            if join_index is not None:
                # GH#33671 ensure we don't change the index on
                # our original Series (NB: by default deep=False)
                left = left.copy()
                right = right.copy()
                left.index = join_index
                right.index = join_index

    return left, right
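
# Illustrative sketch (comments only): ``join_index`` is computed by the
# caller from the two tz-aware indexes, and joining mismatched zones already
# yields UTC, so assigning it above is what performs the "conversion":
# >>> i1 = pd.date_range("2020", periods=2, tz="US/Eastern")
# >>> i2 = pd.date_range("2020", periods=2, tz="Asia/Tokyo")
# >>> i1.join(i2, how="outer").tz  # UTC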