Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/frame.py: 30%

2512 statements  

1""" 

2DataFrame 

3--------- 

4An efficient 2D container for potentially mixed-type time series or other 

5labeled data series. 

6 

7Similar to its R counterpart, data.frame, except providing automatic data 

8alignment and a host of useful data manipulation methods having to do with the 

9labeling information 

10""" 

11from __future__ import annotations 

12 

13import collections 

14from collections import abc 

15from collections.abc import ( 

16 Hashable, 

17 Iterable, 

18 Iterator, 

19 Mapping, 

20 Sequence, 

21) 

22import functools 

23from inspect import signature 

24from io import StringIO 

25import itertools 

26import operator 

27import sys 

28from textwrap import dedent 

29from typing import ( 

30 TYPE_CHECKING, 

31 Any, 

32 Callable, 

33 Literal, 

34 cast, 

35 overload, 

36) 

37import warnings 

38 

39import numpy as np 

40from numpy import ma 

41 

42from pandas._config import ( 

43 get_option, 

44 using_copy_on_write, 

45 warn_copy_on_write, 

46) 

47from pandas._config.config import _get_option 

48 

49from pandas._libs import ( 

50 algos as libalgos, 

51 lib, 

52 properties, 

53) 

54from pandas._libs.hashtable import duplicated 

55from pandas._libs.lib import is_range_indexer 

56from pandas.compat import PYPY 

57from pandas.compat._constants import REF_COUNT 

58from pandas.compat._optional import import_optional_dependency 

59from pandas.compat.numpy import function as nv 

60from pandas.errors import ( 

61 ChainedAssignmentError, 

62 InvalidIndexError, 

63 _chained_assignment_method_msg, 

64 _chained_assignment_msg, 

65 _chained_assignment_warning_method_msg, 

66 _chained_assignment_warning_msg, 

67) 

68from pandas.util._decorators import ( 

69 Appender, 

70 Substitution, 

71 deprecate_nonkeyword_arguments, 

72 doc, 

73) 

74from pandas.util._exceptions import ( 

75 find_stack_level, 

76 rewrite_warning, 

77) 

78from pandas.util._validators import ( 

79 validate_ascending, 

80 validate_bool_kwarg, 

81 validate_percentile, 

82) 

83 

84from pandas.core.dtypes.cast import ( 

85 LossySetitemError, 

86 can_hold_element, 

87 construct_1d_arraylike_from_scalar, 

88 construct_2d_arraylike_from_scalar, 

89 find_common_type, 

90 infer_dtype_from_scalar, 

91 invalidate_string_dtypes, 

92 maybe_box_native, 

93 maybe_downcast_to_dtype, 

94) 

95from pandas.core.dtypes.common import ( 

96 infer_dtype_from_object, 

97 is_1d_only_ea_dtype, 

98 is_array_like, 

99 is_bool_dtype, 

100 is_dataclass, 

101 is_dict_like, 

102 is_float, 

103 is_float_dtype, 

104 is_hashable, 

105 is_integer, 

106 is_integer_dtype, 

107 is_iterator, 

108 is_list_like, 

109 is_scalar, 

110 is_sequence, 

111 needs_i8_conversion, 

112 pandas_dtype, 

113) 

114from pandas.core.dtypes.concat import concat_compat 

115from pandas.core.dtypes.dtypes import ( 

116 ArrowDtype, 

117 BaseMaskedDtype, 

118 ExtensionDtype, 

119) 

120from pandas.core.dtypes.missing import ( 

121 isna, 

122 notna, 

123) 

124 

125from pandas.core import ( 

126 algorithms, 

127 common as com, 

128 nanops, 

129 ops, 

130 roperator, 

131) 

132from pandas.core.accessor import CachedAccessor 

133from pandas.core.apply import reconstruct_and_relabel_result 

134from pandas.core.array_algos.take import take_2d_multi 

135from pandas.core.arraylike import OpsMixin 

136from pandas.core.arrays import ( 

137 BaseMaskedArray, 

138 DatetimeArray, 

139 ExtensionArray, 

140 PeriodArray, 

141 TimedeltaArray, 

142) 

143from pandas.core.arrays.sparse import SparseFrameAccessor 

144from pandas.core.construction import ( 

145 ensure_wrapped_if_datetimelike, 

146 sanitize_array, 

147 sanitize_masked_array, 

148) 

149from pandas.core.generic import ( 

150 NDFrame, 

151 make_doc, 

152) 

153from pandas.core.indexers import check_key_length 

154from pandas.core.indexes.api import ( 

155 DatetimeIndex, 

156 Index, 

157 PeriodIndex, 

158 default_index, 

159 ensure_index, 

160 ensure_index_from_sequences, 

161) 

162from pandas.core.indexes.multi import ( 

163 MultiIndex, 

164 maybe_droplevels, 

165) 

166from pandas.core.indexing import ( 

167 check_bool_indexer, 

168 check_dict_or_set_indexers, 

169) 

170from pandas.core.internals import ( 

171 ArrayManager, 

172 BlockManager, 

173) 

174from pandas.core.internals.construction import ( 

175 arrays_to_mgr, 

176 dataclasses_to_dicts, 

177 dict_to_mgr, 

178 mgr_to_mgr, 

179 ndarray_to_mgr, 

180 nested_data_to_arrays, 

181 rec_array_to_mgr, 

182 reorder_arrays, 

183 to_arrays, 

184 treat_as_nested, 

185) 

186from pandas.core.methods import selectn 

187from pandas.core.reshape.melt import melt 

188from pandas.core.series import Series 

189from pandas.core.shared_docs import _shared_docs 

190from pandas.core.sorting import ( 

191 get_group_index, 

192 lexsort_indexer, 

193 nargsort, 

194) 

195 

196from pandas.io.common import get_handle 

197from pandas.io.formats import ( 

198 console, 

199 format as fmt, 

200) 

201from pandas.io.formats.info import ( 

202 INFO_DOCSTRING, 

203 DataFrameInfo, 

204 frame_sub_kwargs, 

205) 

206import pandas.plotting 

207 

208if TYPE_CHECKING: 

209 import datetime 

210 

211 from pandas._libs.internals import BlockValuesRefs 

212 from pandas._typing import ( 

213 AggFuncType, 

214 AnyAll, 

215 AnyArrayLike, 

216 ArrayLike, 

217 Axes, 

218 Axis, 

219 AxisInt, 

220 ColspaceArgType, 

221 CompressionOptions, 

222 CorrelationMethod, 

223 DropKeep, 

224 Dtype, 

225 DtypeObj, 

226 FilePath, 

227 FloatFormatType, 

228 FormattersType, 

229 Frequency, 

230 FromDictOrient, 

231 IgnoreRaise, 

232 IndexKeyFunc, 

233 IndexLabel, 

234 JoinValidate, 

235 Level, 

236 MergeHow, 

237 MergeValidate, 

238 MutableMappingT, 

239 NaAction, 

240 NaPosition, 

241 NsmallestNlargestKeep, 

242 PythonFuncType, 

243 QuantileInterpolation, 

244 ReadBuffer, 

245 ReindexMethod, 

246 Renamer, 

247 Scalar, 

248 Self, 

249 SequenceNotStr, 

250 SortKind, 

251 StorageOptions, 

252 Suffixes, 

253 ToGbqIfexist, 

254 ToStataByteorder, 

255 ToTimestampHow, 

256 UpdateJoin, 

257 ValueKeyFunc, 

258 WriteBuffer, 

259 XMLParsers, 

260 npt, 

261 ) 

262 

263 from pandas.core.groupby.generic import DataFrameGroupBy 

264 from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg 

265 from pandas.core.internals import SingleDataManager 

266 

267 from pandas.io.formats.style import Styler 

268 

269# --------------------------------------------------------------------- 

270# Docstring templates 

271 

272_shared_doc_kwargs = { 

273 "axes": "index, columns", 

274 "klass": "DataFrame", 

275 "axes_single_arg": "{0 or 'index', 1 or 'columns'}", 

276 "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0 

277 If 0 or 'index': apply function to each column. 

278 If 1 or 'columns': apply function to each row.""", 

279 "inplace": """ 

280 inplace : bool, default False 

281 Whether to modify the DataFrame rather than creating a new one.""", 

282 "optional_by": """ 

283by : str or list of str 

284 Name or list of names to sort by. 

285 

286 - if `axis` is 0 or `'index'` then `by` may contain index 

287 levels and/or column labels. 

288 - if `axis` is 1 or `'columns'` then `by` may contain column 

289 levels and/or index labels.""", 

290 "optional_reindex": """ 

291labels : array-like, optional 

292 New labels / index to conform the axis specified by 'axis' to. 

293index : array-like, optional 

294 New labels for the index. Preferably an Index object to avoid 

295 duplicating data. 

296columns : array-like, optional 

297 New labels for the columns. Preferably an Index object to avoid 

298 duplicating data. 

299axis : int or str, optional 

300 Axis to target. Can be either the axis name ('index', 'columns') 

301 or number (0, 1).""", 

302} 
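The axis description above is shared across many DataFrame method docstrings. A minimal illustrative sketch (the frame and values here are invented, not part of the original source) of what "each column" versus "each row" means in practice:

>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1, 2], "b": [10, 20]})
>>> df.apply(sum, axis=0)   # the function receives each column
a     3
b    30
dtype: int64
>>> df.apply(sum, axis=1)   # the function receives each row
0    11
1    22
dtype: int64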

303 

304_merge_doc = """ 

305Merge DataFrame or named Series objects with a database-style join. 

306 

307A named Series object is treated as a DataFrame with a single named column. 

308 

309The join is done on columns or indexes. If joining columns on 

310columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes 

311on indexes or indexes on a column or columns, the index will be passed on. 

312When performing a cross merge, no column specifications to merge on are 

313allowed. 

314 

315.. warning:: 

316 

317 If both key columns contain rows where the key is a null value, those 

318 rows will be matched against each other. This is different from usual SQL 

319 join behaviour and can lead to unexpected results. 

320 

321Parameters 

322----------%s 

323right : DataFrame or named Series 

324 Object to merge with. 

325how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner' 

326 Type of merge to be performed. 

327 

328 * left: use only keys from left frame, similar to a SQL left outer join; 

329 preserve key order. 

330 * right: use only keys from right frame, similar to a SQL right outer join; 

331 preserve key order. 

332 * outer: use union of keys from both frames, similar to a SQL full outer 

333 join; sort keys lexicographically. 

334 * inner: use intersection of keys from both frames, similar to a SQL inner 

335 join; preserve the order of the left keys. 

336 * cross: creates the cartesian product from both frames, preserves the order 

337 of the left keys. 

338on : label or list 

339 Column or index level names to join on. These must be found in both 

340 DataFrames. If `on` is None and not merging on indexes then this defaults 

341 to the intersection of the columns in both DataFrames. 

342left_on : label or list, or array-like 

343 Column or index level names to join on in the left DataFrame. Can also 

344 be an array or list of arrays of the length of the left DataFrame. 

345 These arrays are treated as if they are columns. 

346right_on : label or list, or array-like 

347 Column or index level names to join on in the right DataFrame. Can also 

348 be an array or list of arrays of the length of the right DataFrame. 

349 These arrays are treated as if they are columns. 

350left_index : bool, default False 

351 Use the index from the left DataFrame as the join key(s). If it is a 

352 MultiIndex, the number of keys in the other DataFrame (either the index 

353 or a number of columns) must match the number of levels. 

354right_index : bool, default False 

355 Use the index from the right DataFrame as the join key. Same caveats as 

356 left_index. 

357sort : bool, default False 

358 Sort the join keys lexicographically in the result DataFrame. If False, 

359 the order of the join keys depends on the join type (how keyword). 

360suffixes : list-like, default is ("_x", "_y") 

361 A length-2 sequence where each element is optionally a string 

362 indicating the suffix to add to overlapping column names in 

363 `left` and `right` respectively. Pass a value of `None` instead 

364 of a string to indicate that the column name from `left` or 

365 `right` should be left as-is, with no suffix. At least one of the 

366 values must not be None. 

367copy : bool, default True 

368 If False, avoid copy if possible. 

369 

370 .. note:: 

371 The `copy` keyword will change behavior in pandas 3.0. 

372 `Copy-on-Write 

373 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__ 

374 will be enabled by default, which means that all methods with a 

375 `copy` keyword will use a lazy copy mechanism to defer the copy and 

376 ignore the `copy` keyword. The `copy` keyword will be removed in a 

377 future version of pandas. 

378 

379 You can already get the future behavior and improvements through 

380 enabling copy on write ``pd.options.mode.copy_on_write = True`` 

381indicator : bool or str, default False 

382 If True, adds a column to the output DataFrame called "_merge" with 

383 information on the source of each row. The column can be given a different 

384 name by providing a string argument. The column will have a Categorical 

385 type with the value of "left_only" for observations whose merge key only 

386 appears in the left DataFrame, "right_only" for observations 

387 whose merge key only appears in the right DataFrame, and "both" 

388 if the observation's merge key is found in both DataFrames. 

389 

390validate : str, optional 

391 If specified, checks if merge is of specified type. 

392 

393 * "one_to_one" or "1:1": check if merge keys are unique in both 

394 left and right datasets. 

395 * "one_to_many" or "1:m": check if merge keys are unique in left 

396 dataset. 

397 * "many_to_one" or "m:1": check if merge keys are unique in right 

398 dataset. 

399 * "many_to_many" or "m:m": allowed, but does not result in checks. 

400 

401Returns 

402------- 

403DataFrame 

404 A DataFrame of the two merged objects. 

405 

406See Also 

407-------- 

408merge_ordered : Merge with optional filling/interpolation. 

409merge_asof : Merge on nearest keys. 

410DataFrame.join : Similar method using indices. 

411 

412Examples 

413-------- 

414>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'], 

415... 'value': [1, 2, 3, 5]}) 

416>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'], 

417... 'value': [5, 6, 7, 8]}) 

418>>> df1 

419 lkey value 

4200 foo 1 

4211 bar 2 

4222 baz 3 

4233 foo 5 

424>>> df2 

425 rkey value 

4260 foo 5 

4271 bar 6 

4282 baz 7 

4293 foo 8 

430 

431Merge df1 and df2 on the lkey and rkey columns. The value columns have 

432the default suffixes, _x and _y, appended. 

433 

434>>> df1.merge(df2, left_on='lkey', right_on='rkey') 

435 lkey value_x rkey value_y 

4360 foo 1 foo 5 

4371 foo 1 foo 8 

4382 bar 2 bar 6 

4393 baz 3 baz 7 

4404 foo 5 foo 5 

4415 foo 5 foo 8 

442 

443Merge DataFrames df1 and df2 with specified left and right suffixes 

444appended to any overlapping columns. 

445 

446>>> df1.merge(df2, left_on='lkey', right_on='rkey', 

447... suffixes=('_left', '_right')) 

448 lkey value_left rkey value_right 

4490 foo 1 foo 5 

4501 foo 1 foo 8 

4512 bar 2 bar 6 

4523 baz 3 baz 7 

4534 foo 5 foo 5 

4545 foo 5 foo 8 

455 

456Merge DataFrames df1 and df2, but raise an exception if the DataFrames have 

457any overlapping columns. 

458 

459>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False)) 

460Traceback (most recent call last): 

461... 

462ValueError: columns overlap but no suffix specified: 

463 Index(['value'], dtype='object') 

464 

465>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]}) 

466>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]}) 

467>>> df1 

468 a b 

4690 foo 1 

4701 bar 2 

471>>> df2 

472 a c 

4730 foo 3 

4741 baz 4 

475 

476>>> df1.merge(df2, how='inner', on='a') 

477 a b c 

4780 foo 1 3 

479 

480>>> df1.merge(df2, how='left', on='a') 

481 a b c 

4820 foo 1 3.0 

4831 bar 2 NaN 

484 

485>>> df1 = pd.DataFrame({'left': ['foo', 'bar']}) 

486>>> df2 = pd.DataFrame({'right': [7, 8]}) 

487>>> df1 

488 left 

4890 foo 

4901 bar 

491>>> df2 

492 right 

4930 7 

4941 8 

495 

496>>> df1.merge(df2, how='cross') 

497 left right 

4980 foo 7 

4991 foo 8 

5002 bar 7 

5013 bar 8 

502""" 

503 

504 

505# ----------------------------------------------------------------------- 

506# DataFrame class 

507 

508 

509class DataFrame(NDFrame, OpsMixin): 

510 """ 

511 Two-dimensional, size-mutable, potentially heterogeneous tabular data. 

512 

513 Data structure also contains labeled axes (rows and columns). 

514 Arithmetic operations align on both row and column labels. Can be 

515 thought of as a dict-like container for Series objects. The primary 

516 pandas data structure. 

517 

518 Parameters 

519 ---------- 

520 data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame 

521 Dict can contain Series, arrays, constants, dataclass or list-like objects. If 

522 data is a dict, column order follows insertion-order. If a dict contains Series 

523 which have an index defined, it is aligned by its index. This alignment also 

524 occurs if data is a Series or a DataFrame itself. Alignment is done on 

525 Series/DataFrame inputs. 

526 

527 If data is a list of dicts, column order follows insertion-order. 

528 

529 index : Index or array-like 

530 Index to use for resulting frame. Will default to RangeIndex if 

531 no indexing information is part of the input data and no index is provided.

532 columns : Index or array-like 

533 Column labels to use for resulting frame when data does not have them, 

534 defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels, 

535 will perform column selection instead. 

536 dtype : dtype, default None 

537 Data type to force. Only a single dtype is allowed. If None, infer. 

538 copy : bool or None, default None 

539 Copy data from inputs. 

540 For dict data, the default of None behaves like ``copy=True``. For DataFrame 

541 or 2d ndarray input, the default of None behaves like ``copy=False``. 

542 If data is a dict containing one or more Series (possibly of different dtypes), 

543 ``copy=False`` will ensure that these inputs are not copied. 

544 

545 .. versionchanged:: 1.3.0 

546 

547 See Also 

548 -------- 

549 DataFrame.from_records : Constructor from tuples, also record arrays. 

550 DataFrame.from_dict : From dicts of Series, arrays, or dicts. 

551 read_csv : Read a comma-separated values (csv) file into DataFrame. 

552 read_table : Read general delimited file into DataFrame. 

553 read_clipboard : Read text from clipboard into DataFrame. 

554 

555 Notes 

556 ----- 

557 Please reference the :ref:`User Guide <basics.dataframe>` for more information. 

558 

559 Examples 

560 -------- 

561 Constructing DataFrame from a dictionary. 

562 

563 >>> d = {'col1': [1, 2], 'col2': [3, 4]} 

564 >>> df = pd.DataFrame(data=d) 

565 >>> df 

566 col1 col2 

567 0 1 3 

568 1 2 4 

569 

570 Notice that the inferred dtype is int64. 

571 

572 >>> df.dtypes 

573 col1 int64 

574 col2 int64 

575 dtype: object 

576 

577 To enforce a single dtype: 

578 

579 >>> df = pd.DataFrame(data=d, dtype=np.int8) 

580 >>> df.dtypes 

581 col1 int8 

582 col2 int8 

583 dtype: object 

584 

585 Constructing DataFrame from a dictionary including Series: 

586 

587 >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])} 

588 >>> pd.DataFrame(data=d, index=[0, 1, 2, 3]) 

589 col1 col2 

590 0 0 NaN 

591 1 1 NaN 

592 2 2 2.0 

593 3 3 3.0 

594 

595 Constructing DataFrame from numpy ndarray: 

596 

597 >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), 

598 ... columns=['a', 'b', 'c']) 

599 >>> df2 

600 a b c 

601 0 1 2 3 

602 1 4 5 6 

603 2 7 8 9 

604 

605 Constructing DataFrame from a numpy ndarray that has labeled columns: 

606 

607 >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)], 

608 ... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")]) 

609 >>> df3 = pd.DataFrame(data, columns=['c', 'a']) 

610 ... 

611 >>> df3 

612 c a 

613 0 3 1 

614 1 6 4 

615 2 9 7 

616 

617 Constructing DataFrame from dataclass: 

618 

619 >>> from dataclasses import make_dataclass 

620 >>> Point = make_dataclass("Point", [("x", int), ("y", int)]) 

621 >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)]) 

622 x y 

623 0 0 0 

624 1 0 3 

625 2 2 3 

626 

627 Constructing DataFrame from Series/DataFrame: 

628 

629 >>> ser = pd.Series([1, 2, 3], index=["a", "b", "c"]) 

630 >>> df = pd.DataFrame(data=ser, index=["a", "c"]) 

631 >>> df 

632 0 

633 a 1 

634 c 3 

635 

636 >>> df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"], columns=["x"]) 

637 >>> df2 = pd.DataFrame(data=df1, index=["a", "c"]) 

638 >>> df2 

639 x 

640 a 1 

641 c 3 

642 """ 

643 

644 _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set 

645 _typ = "dataframe" 

646 _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray) 

647 _accessors: set[str] = {"sparse"} 

648 _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([]) 

649 _mgr: BlockManager | ArrayManager 

650 

651 # similar to __array_priority__, positions DataFrame before Series, Index, 

652 # and ExtensionArray. Should NOT be overridden by subclasses. 

653 __pandas_priority__ = 4000 

654 

655 @property 

656 def _constructor(self) -> Callable[..., DataFrame]: 

657 return DataFrame 

658 

659 def _constructor_from_mgr(self, mgr, axes) -> DataFrame: 

660 df = DataFrame._from_mgr(mgr, axes=axes) 

661 

662 if type(self) is DataFrame: 

663 # This would also work `if self._constructor is DataFrame`, but 

664 # this check is slightly faster, benefiting the most-common case. 

665 return df 

666 

667 elif type(self).__name__ == "GeoDataFrame": 

668 # Shim until geopandas can override their _constructor_from_mgr 

669 # bc they have different behavior for Managers than for DataFrames 

670 return self._constructor(mgr) 

671 

672 # We assume that the subclass __init__ knows how to handle a 

673 # pd.DataFrame object. 

674 return self._constructor(df) 

675 

676 _constructor_sliced: Callable[..., Series] = Series 

677 

678 def _constructor_sliced_from_mgr(self, mgr, axes) -> Series: 

679 ser = Series._from_mgr(mgr, axes) 

680 ser._name = None # caller is responsible for setting real name 

681 

682 if type(self) is DataFrame: 

683 # This would also work `if self._constructor_sliced is Series`, but 

684 # this check is slightly faster, benefiting the most-common case. 

685 return ser 

686 

687 # We assume that the subclass __init__ knows how to handle a 

688 # pd.Series object. 

689 return self._constructor_sliced(ser) 

690 
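The two ``_constructor*_from_mgr`` helpers above fall back to ``self._constructor(...)`` for subclasses, assuming the subclass ``__init__`` accepts a plain DataFrame/Series. A minimal illustrative subclass (the ``TaggedFrame`` name is invented here) showing how overriding ``_constructor`` keeps results in the subclass:

>>> import pandas as pd
>>> class TaggedFrame(pd.DataFrame):
...     @property
...     def _constructor(self):
...         return TaggedFrame
...
>>> type(TaggedFrame({"a": [1, 2, 3]}).head(2)).__name__
'TaggedFrame'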

691 # ---------------------------------------------------------------------- 

692 # Constructors 

693 

694 def __init__( 

695 self, 

696 data=None, 

697 index: Axes | None = None, 

698 columns: Axes | None = None, 

699 dtype: Dtype | None = None, 

700 copy: bool | None = None, 

701 ) -> None: 

702 allow_mgr = False 

703 if dtype is not None: 

704 dtype = self._validate_dtype(dtype) 

705 

706 if isinstance(data, DataFrame): 

707 data = data._mgr 

708 allow_mgr = True 

709 if not copy: 

710 # if not copying data, ensure to still return a shallow copy 

711 # to avoid the result sharing the same Manager 

712 data = data.copy(deep=False) 

713 

714 if isinstance(data, (BlockManager, ArrayManager)): 

715 if not allow_mgr: 

716 # GH#52419 

717 warnings.warn( 

718 f"Passing a {type(data).__name__} to {type(self).__name__} " 

719 "is deprecated and will raise in a future version. " 

720 "Use public APIs instead.", 

721 DeprecationWarning, 

722 stacklevel=1, # bump to 2 once pyarrow 15.0 is released with fix 

723 ) 

724 

725 if using_copy_on_write(): 

726 data = data.copy(deep=False) 

727 # first check if a Manager is passed without any other arguments 

728 # -> use fastpath (without checking Manager type) 

729 if index is None and columns is None and dtype is None and not copy: 

730 # GH#33357 fastpath 

731 NDFrame.__init__(self, data) 

732 return 

733 

734 manager = _get_option("mode.data_manager", silent=True) 

735 

736 is_pandas_object = isinstance(data, (Series, Index, ExtensionArray)) 

737 data_dtype = getattr(data, "dtype", None) 

738 original_dtype = dtype 

739 

740 # GH47215 

741 if isinstance(index, set): 

742 raise ValueError("index cannot be a set") 

743 if isinstance(columns, set): 

744 raise ValueError("columns cannot be a set") 

745 

746 if copy is None: 

747 if isinstance(data, dict): 

748 # retain pre-GH#38939 default behavior 

749 copy = True 

750 elif ( 

751 manager == "array" 

752 and isinstance(data, (np.ndarray, ExtensionArray)) 

753 and data.ndim == 2 

754 ): 

755 # INFO(ArrayManager) by default copy the 2D input array to get 

756 # contiguous 1D arrays 

757 copy = True 

758 elif using_copy_on_write() and not isinstance( 

759 data, (Index, DataFrame, Series) 

760 ): 

761 copy = True 

762 else: 

763 copy = False 

764 

765 if data is None: 

766 index = index if index is not None else default_index(0) 

767 columns = columns if columns is not None else default_index(0) 

768 dtype = dtype if dtype is not None else pandas_dtype(object) 

769 data = [] 

770 

771 if isinstance(data, (BlockManager, ArrayManager)): 

772 mgr = self._init_mgr( 

773 data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy 

774 ) 

775 

776 elif isinstance(data, dict): 

777 # GH#38939 de facto copy defaults to False only in non-dict cases 

778 mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager) 

779 elif isinstance(data, ma.MaskedArray): 

780 from numpy.ma import mrecords 

781 

782 # masked recarray 

783 if isinstance(data, mrecords.MaskedRecords): 

784 raise TypeError( 

785 "MaskedRecords are not supported. Pass " 

786 "{name: data[name] for name in data.dtype.names} " 

787 "instead" 

788 ) 

789 

790 # a masked array 

791 data = sanitize_masked_array(data) 

792 mgr = ndarray_to_mgr( 

793 data, 

794 index, 

795 columns, 

796 dtype=dtype, 

797 copy=copy, 

798 typ=manager, 

799 ) 

800 

801 elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)): 

802 if data.dtype.names: 

803 # i.e. numpy structured array 

804 data = cast(np.ndarray, data) 

805 mgr = rec_array_to_mgr( 

806 data, 

807 index, 

808 columns, 

809 dtype, 

810 copy, 

811 typ=manager, 

812 ) 

813 elif getattr(data, "name", None) is not None: 

814 # i.e. Series/Index with non-None name 

815 _copy = copy if using_copy_on_write() else True 

816 mgr = dict_to_mgr( 

817 # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no 

818 # attribute "name" 

819 {data.name: data}, # type: ignore[union-attr] 

820 index, 

821 columns, 

822 dtype=dtype, 

823 typ=manager, 

824 copy=_copy, 

825 ) 

826 else: 

827 mgr = ndarray_to_mgr( 

828 data, 

829 index, 

830 columns, 

831 dtype=dtype, 

832 copy=copy, 

833 typ=manager, 

834 ) 

835 

836 # For data is list-like, or Iterable (will consume into list) 

837 elif is_list_like(data): 

838 if not isinstance(data, abc.Sequence): 

839 if hasattr(data, "__array__"): 

840 # GH#44616 big perf improvement for e.g. pytorch tensor 

841 data = np.asarray(data) 

842 else: 

843 data = list(data) 

844 if len(data) > 0: 

845 if is_dataclass(data[0]): 

846 data = dataclasses_to_dicts(data) 

847 if not isinstance(data, np.ndarray) and treat_as_nested(data): 

848 # exclude ndarray as we may have cast it a few lines above 

849 if columns is not None: 

850 columns = ensure_index(columns) 

851 arrays, columns, index = nested_data_to_arrays( 

852 # error: Argument 3 to "nested_data_to_arrays" has incompatible 

853 # type "Optional[Collection[Any]]"; expected "Optional[Index]" 

854 data, 

855 columns, 

856 index, # type: ignore[arg-type] 

857 dtype, 

858 ) 

859 mgr = arrays_to_mgr( 

860 arrays, 

861 columns, 

862 index, 

863 dtype=dtype, 

864 typ=manager, 

865 ) 

866 else: 

867 mgr = ndarray_to_mgr( 

868 data, 

869 index, 

870 columns, 

871 dtype=dtype, 

872 copy=copy, 

873 typ=manager, 

874 ) 

875 else: 

876 mgr = dict_to_mgr( 

877 {}, 

878 index, 

879 columns if columns is not None else default_index(0), 

880 dtype=dtype, 

881 typ=manager, 

882 ) 

883 # For data is scalar 

884 else: 

885 if index is None or columns is None: 

886 raise ValueError("DataFrame constructor not properly called!") 

887 

888 index = ensure_index(index) 

889 columns = ensure_index(columns) 

890 

891 if not dtype: 

892 dtype, _ = infer_dtype_from_scalar(data) 

893 

894 # For data is a scalar extension dtype 

895 if isinstance(dtype, ExtensionDtype): 

896 # TODO(EA2D): special case not needed with 2D EAs 

897 

898 values = [ 

899 construct_1d_arraylike_from_scalar(data, len(index), dtype) 

900 for _ in range(len(columns)) 

901 ] 

902 mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager) 

903 else: 

904 arr2d = construct_2d_arraylike_from_scalar( 

905 data, 

906 len(index), 

907 len(columns), 

908 dtype, 

909 copy, 

910 ) 

911 

912 mgr = ndarray_to_mgr( 

913 arr2d, 

914 index, 

915 columns, 

916 dtype=arr2d.dtype, 

917 copy=False, 

918 typ=manager, 

919 ) 

920 

921 # ensure correct Manager type according to settings 

922 mgr = mgr_to_mgr(mgr, typ=manager) 

923 

924 NDFrame.__init__(self, mgr) 

925 

926 if original_dtype is None and is_pandas_object and data_dtype == np.object_: 

927 if self.dtypes.iloc[0] != data_dtype: 

928 warnings.warn( 

929 "Dtype inference on a pandas object " 

930 "(Series, Index, ExtensionArray) is deprecated. The DataFrame " 

931 "constructor will keep the original dtype in the future. " 

932 "Call `infer_objects` on the result to get the old " 

933 "behavior.", 

934 FutureWarning, 

935 stacklevel=2, 

936 ) 

937 
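The final branch of ``__init__`` above broadcasts a scalar over the given axes and raises when ``index`` or ``columns`` is missing. A small sketch with invented values:

>>> import pandas as pd
>>> pd.DataFrame(0, index=["r1", "r2"], columns=["a", "b"])
    a  b
r1  0  0
r2  0  0
>>> pd.DataFrame(0)
Traceback (most recent call last):
...
ValueError: DataFrame constructor not properly called!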

938 # ---------------------------------------------------------------------- 

939 

940 def __dataframe__( 

941 self, nan_as_null: bool = False, allow_copy: bool = True 

942 ) -> DataFrameXchg: 

943 """ 

944 Return the dataframe interchange object implementing the interchange protocol. 

945 

946 Parameters 

947 ---------- 

948 nan_as_null : bool, default False 

949 `nan_as_null` is DEPRECATED and has no effect. Please avoid using 

950 it; it will be removed in a future release. 

951 allow_copy : bool, default True 

952 Whether to allow memory copying when exporting. If set to False 

953 it would cause non-zero-copy exports to fail. 

954 

955 Returns 

956 ------- 

957 DataFrame interchange object 

958 The object which the consuming library can use to ingress the dataframe.

959 

960 Notes 

961 ----- 

962 Details on the interchange protocol: 

963 https://data-apis.org/dataframe-protocol/latest/index.html 

964 

965 Examples 

966 -------- 

967 >>> df_not_necessarily_pandas = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) 

968 >>> interchange_object = df_not_necessarily_pandas.__dataframe__() 

969 >>> interchange_object.column_names() 

970 Index(['A', 'B'], dtype='object') 

971 >>> df_pandas = (pd.api.interchange.from_dataframe 

972 ... (interchange_object.select_columns_by_name(['A']))) 

973 >>> df_pandas 

974 A 

975 0 1 

976 1 2 

977 

978 These methods (``column_names``, ``select_columns_by_name``) should work 

979 for any dataframe library which implements the interchange protocol. 

980 """ 

981 

982 from pandas.core.interchange.dataframe import PandasDataFrameXchg 

983 

984 return PandasDataFrameXchg(self, allow_copy=allow_copy) 

985 
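A short sketch of inspecting the interchange object returned above; ``num_columns`` and ``num_rows`` are methods defined by the interchange protocol that the pandas implementation provides:

>>> import pandas as pd
>>> df = pd.DataFrame({"x": [1.0, 2.0], "y": [3, 4]})
>>> ixch = df.__dataframe__()
>>> ixch.num_columns(), ixch.num_rows()
(2, 2)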

986 def __dataframe_consortium_standard__( 

987 self, *, api_version: str | None = None 

988 ) -> Any: 

989 """ 

990 Provide entry point to the Consortium DataFrame Standard API. 

991 

992 This is developed and maintained outside of pandas. 

993 Please report any issues to https://github.com/data-apis/dataframe-api-compat. 

994 """ 

995 dataframe_api_compat = import_optional_dependency("dataframe_api_compat") 

996 convert_to_standard_compliant_dataframe = ( 

997 dataframe_api_compat.pandas_standard.convert_to_standard_compliant_dataframe 

998 ) 

999 return convert_to_standard_compliant_dataframe(self, api_version=api_version) 

1000 

1001 def __arrow_c_stream__(self, requested_schema=None): 

1002 """ 

1003 Export the pandas DataFrame as an Arrow C stream PyCapsule. 

1004 

1005 This relies on pyarrow to convert the pandas DataFrame to the Arrow 

1006 format (and follows the default behaviour of ``pyarrow.Table.from_pandas`` 

1007 in its handling of the index, i.e. store the index as a column except 

1008 for RangeIndex). 

1009 This conversion is not necessarily zero-copy. 

1010 

1011 Parameters 

1012 ---------- 

1013 requested_schema : PyCapsule, default None 

1014 The schema to which the dataframe should be cast, passed as a

1015 PyCapsule containing a C ArrowSchema representation of the 

1016 requested schema. 

1017 

1018 Returns 

1019 ------- 

1020 PyCapsule 

1021 """ 

1022 pa = import_optional_dependency("pyarrow", min_version="14.0.0") 

1023 if requested_schema is not None: 

1024 requested_schema = pa.Schema._import_from_c_capsule(requested_schema) 

1025 table = pa.Table.from_pandas(self, schema=requested_schema) 

1026 return table.__arrow_c_stream__() 

1027 
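A minimal sketch of the PyCapsule export above (pyarrow >= 14.0.0 must be installed, as the method checks); any Arrow-aware consumer that understands the C stream interface can import the capsule:

>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1, 2, 3]})
>>> capsule = df.__arrow_c_stream__()
>>> type(capsule).__name__
'PyCapsule'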

1028 # ---------------------------------------------------------------------- 

1029 

1030 @property 

1031 def axes(self) -> list[Index]: 

1032 """ 

1033 Return a list representing the axes of the DataFrame. 

1034 

1035 It has the row axis labels and column axis labels as the only members. 

1036 They are returned in that order. 

1037 

1038 Examples 

1039 -------- 

1040 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) 

1041 >>> df.axes 

1042 [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'], 

1043 dtype='object')] 

1044 """ 

1045 return [self.index, self.columns] 

1046 

1047 @property 

1048 def shape(self) -> tuple[int, int]: 

1049 """ 

1050 Return a tuple representing the dimensionality of the DataFrame. 

1051 

1052 See Also 

1053 -------- 

1054 ndarray.shape : Tuple of array dimensions. 

1055 

1056 Examples 

1057 -------- 

1058 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) 

1059 >>> df.shape 

1060 (2, 2) 

1061 

1062 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4], 

1063 ... 'col3': [5, 6]}) 

1064 >>> df.shape 

1065 (2, 3) 

1066 """ 

1067 return len(self.index), len(self.columns) 

1068 

1069 @property 

1070 def _is_homogeneous_type(self) -> bool: 

1071 """ 

1072 Whether all the columns in a DataFrame have the same type. 

1073 

1074 Returns 

1075 ------- 

1076 bool 

1077 

1078 Examples 

1079 -------- 

1080 >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type 

1081 True 

1082 >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type 

1083 False 

1084 

1085 Items with the same type but different sizes are considered 

1086 different types. 

1087 

1088 >>> DataFrame({ 

1089 ... "A": np.array([1, 2], dtype=np.int32), 

1090 ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type 

1091 False 

1092 """ 

1093 # The "<" part of "<=" here is for empty DataFrame cases 

1094 return len({arr.dtype for arr in self._mgr.arrays}) <= 1 

1095 

1096 @property 

1097 def _can_fast_transpose(self) -> bool: 

1098 """ 

1099 Can we transpose this DataFrame without creating any new array objects. 

1100 """ 

1101 if isinstance(self._mgr, ArrayManager): 

1102 return False 

1103 blocks = self._mgr.blocks 

1104 if len(blocks) != 1: 

1105 return False 

1106 

1107 dtype = blocks[0].dtype 

1108 # TODO(EA2D) special case would be unnecessary with 2D EAs 

1109 return not is_1d_only_ea_dtype(dtype) 

1110 

1111 @property 

1112 def _values(self) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray: 

1113 """ 

1114 Analogue to ._values that may return a 2D ExtensionArray. 

1115 """ 

1116 mgr = self._mgr 

1117 

1118 if isinstance(mgr, ArrayManager): 

1119 if len(mgr.arrays) == 1 and not is_1d_only_ea_dtype(mgr.arrays[0].dtype): 

1120 # error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]" 

1121 # has no attribute "reshape" 

1122 return mgr.arrays[0].reshape(-1, 1) # type: ignore[union-attr] 

1123 return ensure_wrapped_if_datetimelike(self.values) 

1124 

1125 blocks = mgr.blocks 

1126 if len(blocks) != 1: 

1127 return ensure_wrapped_if_datetimelike(self.values) 

1128 

1129 arr = blocks[0].values 

1130 if arr.ndim == 1: 

1131 # non-2D ExtensionArray 

1132 return self.values 

1133 

1134 # more generally, whatever we allow in NDArrayBackedExtensionBlock 

1135 arr = cast("np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", arr) 

1136 return arr.T 

1137 

1138 # ---------------------------------------------------------------------- 

1139 # Rendering Methods 

1140 

1141 def _repr_fits_vertical_(self) -> bool: 

1142 """ 

1143 Check length against max_rows. 

1144 """ 

1145 max_rows = get_option("display.max_rows") 

1146 return len(self) <= max_rows 

1147 

1148 def _repr_fits_horizontal_(self) -> bool: 

1149 """ 

1150 Check if full repr fits in horizontal boundaries imposed by the display 

1151 options width and max_columns. 

1152 """ 

1153 width, height = console.get_console_size() 

1154 max_columns = get_option("display.max_columns") 

1155 nb_columns = len(self.columns) 

1156 

1157 # exceed max columns 

1158 if (max_columns and nb_columns > max_columns) or ( 

1159 width and nb_columns > (width // 2) 

1160 ): 

1161 return False 

1162 

1163 # used by repr_html under an IPython notebook or by scripts; ignore terminal

1164 # dims

1165 if width is None or not console.in_interactive_session(): 

1166 return True 

1167 

1168 if get_option("display.width") is not None or console.in_ipython_frontend(): 

1169 # check at least the column row for excessive width 

1170 max_rows = 1 

1171 else: 

1172 max_rows = get_option("display.max_rows") 

1173 

1174 # when auto-detecting, so width=None and not in ipython front end 

1175 # check whether repr fits horizontal by actually checking 

1176 # the width of the rendered repr 

1177 buf = StringIO() 

1178 

1179 # only care about the stuff we'll actually print out 

1180 # and to_string on entire frame may be expensive 

1181 d = self 

1182 

1183 if max_rows is not None:

1184 # min of two, where one may be None

1185 d = d.iloc[: min(max_rows, len(d))]

1186 else:  # unlimited rows

1187 return True

1188 

1189 d.to_string(buf=buf) 

1190 value = buf.getvalue() 

1191 repr_width = max(len(line) for line in value.split("\n")) 

1192 

1193 return repr_width < width 

1194 

1195 def _info_repr(self) -> bool: 

1196 """ 

1197 True if the repr should show the info view. 

1198 """ 

1199 info_repr_option = get_option("display.large_repr") == "info" 

1200 return info_repr_option and not ( 

1201 self._repr_fits_horizontal_() and self._repr_fits_vertical_() 

1202 ) 

1203 
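The helpers above decide between the tabular repr and the ``info`` view based on display options. A small sketch (the option names are real pandas options; the frame is invented) forcing the info view for a frame that does not fit vertically:

>>> import numpy as np
>>> import pandas as pd
>>> big = pd.DataFrame(np.zeros((300, 4)))
>>> with pd.option_context("display.large_repr", "info", "display.max_rows", 60):
...     print(repr(big).splitlines()[0])
<class 'pandas.core.frame.DataFrame'>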

1204 def __repr__(self) -> str: 

1205 """ 

1206 Return a string representation for a particular DataFrame. 

1207 """ 

1208 if self._info_repr(): 

1209 buf = StringIO() 

1210 self.info(buf=buf) 

1211 return buf.getvalue() 

1212 

1213 repr_params = fmt.get_dataframe_repr_params() 

1214 return self.to_string(**repr_params) 

1215 

1216 def _repr_html_(self) -> str | None: 

1217 """ 

1218 Return a html representation for a particular DataFrame. 

1219 

1220 Mainly for IPython notebook. 

1221 """ 

1222 if self._info_repr(): 

1223 buf = StringIO() 

1224 self.info(buf=buf) 

1225 # need to escape the <class>, should be the first line. 

1226 val = buf.getvalue().replace("<", r"&lt;", 1) 

1227 val = val.replace(">", r"&gt;", 1) 

1228 return f"<pre>{val}</pre>" 

1229 

1230 if get_option("display.notebook_repr_html"): 

1231 max_rows = get_option("display.max_rows") 

1232 min_rows = get_option("display.min_rows") 

1233 max_cols = get_option("display.max_columns") 

1234 show_dimensions = get_option("display.show_dimensions") 

1235 

1236 formatter = fmt.DataFrameFormatter( 

1237 self, 

1238 columns=None, 

1239 col_space=None, 

1240 na_rep="NaN", 

1241 formatters=None, 

1242 float_format=None, 

1243 sparsify=None, 

1244 justify=None, 

1245 index_names=True, 

1246 header=True, 

1247 index=True, 

1248 bold_rows=True, 

1249 escape=True, 

1250 max_rows=max_rows, 

1251 min_rows=min_rows, 

1252 max_cols=max_cols, 

1253 show_dimensions=show_dimensions, 

1254 decimal=".", 

1255 ) 

1256 return fmt.DataFrameRenderer(formatter).to_html(notebook=True) 

1257 else: 

1258 return None 

1259 

1260 @overload 

1261 def to_string( 

1262 self, 

1263 buf: None = ..., 

1264 columns: Axes | None = ..., 

1265 col_space: int | list[int] | dict[Hashable, int] | None = ..., 

1266 header: bool | SequenceNotStr[str] = ..., 

1267 index: bool = ..., 

1268 na_rep: str = ..., 

1269 formatters: fmt.FormattersType | None = ..., 

1270 float_format: fmt.FloatFormatType | None = ..., 

1271 sparsify: bool | None = ..., 

1272 index_names: bool = ..., 

1273 justify: str | None = ..., 

1274 max_rows: int | None = ..., 

1275 max_cols: int | None = ..., 

1276 show_dimensions: bool = ..., 

1277 decimal: str = ..., 

1278 line_width: int | None = ..., 

1279 min_rows: int | None = ..., 

1280 max_colwidth: int | None = ..., 

1281 encoding: str | None = ..., 

1282 ) -> str: 

1283 ... 

1284 

1285 @overload 

1286 def to_string( 

1287 self, 

1288 buf: FilePath | WriteBuffer[str], 

1289 columns: Axes | None = ..., 

1290 col_space: int | list[int] | dict[Hashable, int] | None = ..., 

1291 header: bool | SequenceNotStr[str] = ..., 

1292 index: bool = ..., 

1293 na_rep: str = ..., 

1294 formatters: fmt.FormattersType | None = ..., 

1295 float_format: fmt.FloatFormatType | None = ..., 

1296 sparsify: bool | None = ..., 

1297 index_names: bool = ..., 

1298 justify: str | None = ..., 

1299 max_rows: int | None = ..., 

1300 max_cols: int | None = ..., 

1301 show_dimensions: bool = ..., 

1302 decimal: str = ..., 

1303 line_width: int | None = ..., 

1304 min_rows: int | None = ..., 

1305 max_colwidth: int | None = ..., 

1306 encoding: str | None = ..., 

1307 ) -> None: 

1308 ... 

1309 

1310 @deprecate_nonkeyword_arguments( 

1311 version="3.0", allowed_args=["self", "buf"], name="to_string" 

1312 ) 

1313 @Substitution( 

1314 header_type="bool or list of str", 

1315 header="Write out the column names. If a list of columns " 

1316 "is given, it is assumed to be aliases for the " 

1317 "column names", 

1318 col_space_type="int, list or dict of int", 

1319 col_space="The minimum width of each column. If a list of ints is given " 

1320 "every integers corresponds with one column. If a dict is given, the key " 

1321 "references the column, while the value defines the space to use.", 

1322 ) 

1323 @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) 

1324 def to_string( 

1325 self, 

1326 buf: FilePath | WriteBuffer[str] | None = None, 

1327 columns: Axes | None = None, 

1328 col_space: int | list[int] | dict[Hashable, int] | None = None, 

1329 header: bool | SequenceNotStr[str] = True, 

1330 index: bool = True, 

1331 na_rep: str = "NaN", 

1332 formatters: fmt.FormattersType | None = None, 

1333 float_format: fmt.FloatFormatType | None = None, 

1334 sparsify: bool | None = None, 

1335 index_names: bool = True, 

1336 justify: str | None = None, 

1337 max_rows: int | None = None, 

1338 max_cols: int | None = None, 

1339 show_dimensions: bool = False, 

1340 decimal: str = ".", 

1341 line_width: int | None = None, 

1342 min_rows: int | None = None, 

1343 max_colwidth: int | None = None, 

1344 encoding: str | None = None, 

1345 ) -> str | None: 

1346 """ 

1347 Render a DataFrame to a console-friendly tabular output. 

1348 %(shared_params)s 

1349 line_width : int, optional 

1350 Width to wrap a line in characters. 

1351 min_rows : int, optional 

1352 The number of rows to display in the console in a truncated repr 

1353 (when number of rows is above `max_rows`). 

1354 max_colwidth : int, optional 

1355 Max width to truncate each column in characters. By default, no limit. 

1356 encoding : str, default "utf-8" 

1357 Set character encoding. 

1358 %(returns)s 

1359 See Also 

1360 -------- 

1361 to_html : Convert DataFrame to HTML. 

1362 

1363 Examples 

1364 -------- 

1365 >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]} 

1366 >>> df = pd.DataFrame(d) 

1367 >>> print(df.to_string()) 

1368 col1 col2 

1369 0 1 4 

1370 1 2 5 

1371 2 3 6 

1372 """ 

1373 from pandas import option_context 

1374 

1375 with option_context("display.max_colwidth", max_colwidth): 

1376 formatter = fmt.DataFrameFormatter( 

1377 self, 

1378 columns=columns, 

1379 col_space=col_space, 

1380 na_rep=na_rep, 

1381 formatters=formatters, 

1382 float_format=float_format, 

1383 sparsify=sparsify, 

1384 justify=justify, 

1385 index_names=index_names, 

1386 header=header, 

1387 index=index, 

1388 min_rows=min_rows, 

1389 max_rows=max_rows, 

1390 max_cols=max_cols, 

1391 show_dimensions=show_dimensions, 

1392 decimal=decimal, 

1393 ) 

1394 return fmt.DataFrameRenderer(formatter).to_string( 

1395 buf=buf, 

1396 encoding=encoding, 

1397 line_width=line_width, 

1398 ) 

1399 
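The ``col_space`` parameter of ``to_string`` above is only described in text. A small sketch (invented frame) showing that it enforces a minimum rendered width rather than a fixed one:

>>> import pandas as pd
>>> s = pd.DataFrame({"a": [1, 2]}).to_string(col_space=10)
>>> len(s.splitlines()[0]) >= 10    # header line is padded to at least col_space
True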

1400 def _get_values_for_csv( 

1401 self, 

1402 *, 

1403 float_format: FloatFormatType | None, 

1404 date_format: str | None, 

1405 decimal: str, 

1406 na_rep: str, 

1407 quoting, # int csv.QUOTE_FOO from stdlib 

1408 ) -> Self: 

1409 # helper used by to_csv 

1410 mgr = self._mgr.get_values_for_csv( 

1411 float_format=float_format, 

1412 date_format=date_format, 

1413 decimal=decimal, 

1414 na_rep=na_rep, 

1415 quoting=quoting, 

1416 ) 

1417 # error: Incompatible return value type (got "DataFrame", expected "Self") 

1418 return self._constructor_from_mgr(mgr, axes=mgr.axes) # type: ignore[return-value] 

1419 

1420 # ---------------------------------------------------------------------- 

1421 

1422 @property 

1423 def style(self) -> Styler: 

1424 """ 

1425 Returns a Styler object. 

1426 

1427 Contains methods for building a styled HTML representation of the DataFrame. 

1428 

1429 See Also 

1430 -------- 

1431 io.formats.style.Styler : Helps style a DataFrame or Series according to the 

1432 data with HTML and CSS. 

1433 

1434 Examples 

1435 -------- 

1436 >>> df = pd.DataFrame({'A': [1, 2, 3]}) 

1437 >>> df.style # doctest: +SKIP 

1438 

1439 Please see 

1440 `Table Visualization <../../user_guide/style.ipynb>`_ for more examples. 

1441 """ 

1442 from pandas.io.formats.style import Styler 

1443 

1444 return Styler(self) 

1445 
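A tiny sketch of using the Styler returned by the property above; ``highlight_max`` and ``to_html`` are public Styler methods:

>>> import pandas as pd
>>> styler = pd.DataFrame({"A": [1, 2, 3]}).style.highlight_max()
>>> "<table" in styler.to_html()
True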

1446 _shared_docs[ 

1447 "items" 

1448 ] = r""" 

1449 Iterate over (column name, Series) pairs. 

1450 

1451 Iterates over the DataFrame columns, returning a tuple with 

1452 the column name and the content as a Series. 

1453 

1454 Yields 

1455 ------ 

1456 label : object 

1457 The column names for the DataFrame being iterated over. 

1458 content : Series 

1459 The column entries belonging to each label, as a Series. 

1460 

1461 See Also 

1462 -------- 

1463 DataFrame.iterrows : Iterate over DataFrame rows as 

1464 (index, Series) pairs. 

1465 DataFrame.itertuples : Iterate over DataFrame rows as namedtuples 

1466 of the values. 

1467 

1468 Examples 

1469 -------- 

1470 >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'], 

1471 ... 'population': [1864, 22000, 80000]}, 

1472 ... index=['panda', 'polar', 'koala']) 

1473 >>> df 

1474 species population 

1475 panda bear 1864 

1476 polar bear 22000 

1477 koala marsupial 80000 

1478 >>> for label, content in df.items(): 

1479 ... print(f'label: {label}') 

1480 ... print(f'content: {content}', sep='\n') 

1481 ... 

1482 label: species 

1483 content: 

1484 panda bear 

1485 polar bear 

1486 koala marsupial 

1487 Name: species, dtype: object 

1488 label: population 

1489 content: 

1490 panda 1864 

1491 polar 22000 

1492 koala 80000 

1493 Name: population, dtype: int64 

1494 """ 

1495 

1496 @Appender(_shared_docs["items"]) 

1497 def items(self) -> Iterable[tuple[Hashable, Series]]: 

1498 if self.columns.is_unique and hasattr(self, "_item_cache"): 

1499 for k in self.columns: 

1500 yield k, self._get_item_cache(k) 

1501 else: 

1502 for i, k in enumerate(self.columns): 

1503 yield k, self._ixs(i, axis=1) 

1504 

1505 def iterrows(self) -> Iterable[tuple[Hashable, Series]]: 

1506 """ 

1507 Iterate over DataFrame rows as (index, Series) pairs. 

1508 

1509 Yields 

1510 ------ 

1511 index : label or tuple of label 

1512 The index of the row. A tuple for a `MultiIndex`. 

1513 data : Series 

1514 The data of the row as a Series. 

1515 

1516 See Also 

1517 -------- 

1518 DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values. 

1519 DataFrame.items : Iterate over (column name, Series) pairs. 

1520 

1521 Notes 

1522 ----- 

1523 1. Because ``iterrows`` returns a Series for each row, 

1524 it does **not** preserve dtypes across the rows (dtypes are 

1525 preserved across columns for DataFrames). 

1526 

1527 To preserve dtypes while iterating over the rows, it is better 

1528 to use :meth:`itertuples` which returns namedtuples of the values 

1529 and which is generally faster than ``iterrows``. 

1530 

1531 2. You should **never modify** something you are iterating over. 

1532 This is not guaranteed to work in all cases. Depending on the 

1533 data types, the iterator returns a copy and not a view, and writing 

1534 to it will have no effect. 

1535 

1536 Examples 

1537 -------- 

1538 

1539 >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float']) 

1540 >>> row = next(df.iterrows())[1] 

1541 >>> row 

1542 int 1.0 

1543 float 1.5 

1544 Name: 0, dtype: float64 

1545 >>> print(row['int'].dtype) 

1546 float64 

1547 >>> print(df['int'].dtype) 

1548 int64 

1549 """ 

1550 columns = self.columns 

1551 klass = self._constructor_sliced 

1552 using_cow = using_copy_on_write() 

1553 for k, v in zip(self.index, self.values): 

1554 s = klass(v, index=columns, name=k).__finalize__(self) 

1555 if using_cow and self._mgr.is_single_block: 

1556 s._mgr.add_references(self._mgr) # type: ignore[arg-type] 

1557 yield k, s 

1558 
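The iterrows Notes above recommend ``itertuples`` to preserve dtypes; reusing the docstring's frame, a small sketch of the difference:

>>> import pandas as pd
>>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
>>> print(next(df.iterrows())[1]['int'])             # upcast to float by the row-wise Series
1.0
>>> print(next(df.itertuples(index=False)).int)      # integer value preserved
1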

1559 def itertuples( 

1560 self, index: bool = True, name: str | None = "Pandas" 

1561 ) -> Iterable[tuple[Any, ...]]: 

1562 """ 

1563 Iterate over DataFrame rows as namedtuples. 

1564 

1565 Parameters 

1566 ---------- 

1567 index : bool, default True 

1568 If True, return the index as the first element of the tuple. 

1569 name : str or None, default "Pandas" 

1570 The name of the returned namedtuples or None to return regular 

1571 tuples. 

1572 

1573 Returns 

1574 ------- 

1575 iterator 

1576 An object to iterate over namedtuples for each row in the 

1577 DataFrame with the first field possibly being the index and 

1578 following fields being the column values. 

1579 

1580 See Also 

1581 -------- 

1582 DataFrame.iterrows : Iterate over DataFrame rows as (index, Series) 

1583 pairs. 

1584 DataFrame.items : Iterate over (column name, Series) pairs. 

1585 

1586 Notes 

1587 ----- 

1588 The column names will be renamed to positional names if they are 

1589 invalid Python identifiers, repeated, or start with an underscore. 

1590 

1591 Examples 

1592 -------- 

1593 >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]}, 

1594 ... index=['dog', 'hawk']) 

1595 >>> df 

1596 num_legs num_wings 

1597 dog 4 0 

1598 hawk 2 2 

1599 >>> for row in df.itertuples(): 

1600 ... print(row) 

1601 ... 

1602 Pandas(Index='dog', num_legs=4, num_wings=0) 

1603 Pandas(Index='hawk', num_legs=2, num_wings=2) 

1604 

1605 By setting the `index` parameter to False we can remove the index 

1606 as the first element of the tuple: 

1607 

1608 >>> for row in df.itertuples(index=False): 

1609 ... print(row) 

1610 ... 

1611 Pandas(num_legs=4, num_wings=0) 

1612 Pandas(num_legs=2, num_wings=2) 

1613 

1614 With the `name` parameter set we set a custom name for the yielded 

1615 namedtuples: 

1616 

1617 >>> for row in df.itertuples(name='Animal'): 

1618 ... print(row) 

1619 ... 

1620 Animal(Index='dog', num_legs=4, num_wings=0) 

1621 Animal(Index='hawk', num_legs=2, num_wings=2) 

1622 """ 

1623 arrays = [] 

1624 fields = list(self.columns) 

1625 if index: 

1626 arrays.append(self.index) 

1627 fields.insert(0, "Index") 

1628 

1629 # use integer indexing because of possible duplicate column names 

1630 arrays.extend(self.iloc[:, k] for k in range(len(self.columns))) 

1631 

1632 if name is not None: 

1633 # https://github.com/python/mypy/issues/9046 

1634 # error: namedtuple() expects a string literal as the first argument 

1635 itertuple = collections.namedtuple( # type: ignore[misc] 

1636 name, fields, rename=True 

1637 ) 

1638 return map(itertuple._make, zip(*arrays)) 

1639 

1640 # fallback to regular tuples 

1641 return zip(*arrays) 

1642 
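The Notes above state that invalid or repeated column names are renamed to positional names, but the docstring has no example of it. A small sketch with invented column names:

>>> import pandas as pd
>>> df = pd.DataFrame([[1, 2, 3]], columns=["valid", "has space", "valid"])
>>> next(df.itertuples(index=False))._fields
('valid', '_1', '_2')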

1643 def __len__(self) -> int: 

1644 """ 

1645 Returns length of info axis, but here we use the index. 

1646 """ 

1647 return len(self.index) 

1648 

1649 @overload 

1650 def dot(self, other: Series) -> Series: 

1651 ... 

1652 

1653 @overload 

1654 def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame: 

1655 ... 

1656 

1657 def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: 

1658 """ 

1659 Compute the matrix multiplication between the DataFrame and other. 

1660 

1661 This method computes the matrix product between the DataFrame and the 

1662 values of another Series, DataFrame or a numpy array.

1663 

1664 It can also be called using ``self @ other``. 

1665 

1666 Parameters 

1667 ---------- 

1668 other : Series, DataFrame or array-like 

1669 The other object to compute the matrix product with. 

1670 

1671 Returns 

1672 ------- 

1673 Series or DataFrame 

1674 If other is a Series, return the matrix product between self and 

1675 other as a Series. If other is a DataFrame or a numpy.array, return 

1676 the matrix product of self and other as a DataFrame.

1677 

1678 See Also 

1679 -------- 

1680 Series.dot: Similar method for Series. 

1681 

1682 Notes 

1683 ----- 

1684 The dimensions of DataFrame and other must be compatible in order to 

1685 compute the matrix multiplication. In addition, the column names of 

1686 DataFrame and the index of other must contain the same values, as they 

1687 will be aligned prior to the multiplication. 

1688 

1689 The dot method for Series computes the inner product, instead of the 

1690 matrix product here. 

1691 

1692 Examples 

1693 -------- 

1694 Here we multiply a DataFrame with a Series. 

1695 

1696 >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]]) 

1697 >>> s = pd.Series([1, 1, 2, 1]) 

1698 >>> df.dot(s) 

1699 0 -4 

1700 1 5 

1701 dtype: int64 

1702 

1703 Here we multiply a DataFrame with another DataFrame. 

1704 

1705 >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]]) 

1706 >>> df.dot(other) 

1707 0 1 

1708 0 1 4 

1709 1 2 2 

1710 

1711 Note that the dot method gives the same result as @

1712 

1713 >>> df @ other 

1714 0 1 

1715 0 1 4 

1716 1 2 2 

1717 

1718 The dot method also works if other is an np.array.

1719 

1720 >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]]) 

1721 >>> df.dot(arr) 

1722 0 1 

1723 0 1 4 

1724 1 2 2 

1725 

1726 Note how shuffling of the objects does not change the result. 

1727 

1728 >>> s2 = s.reindex([1, 0, 2, 3]) 

1729 >>> df.dot(s2) 

1730 0 -4 

1731 1 5 

1732 dtype: int64 

1733 """ 

1734 if isinstance(other, (Series, DataFrame)): 

1735 common = self.columns.union(other.index) 

1736 if len(common) > len(self.columns) or len(common) > len(other.index): 

1737 raise ValueError("matrices are not aligned") 

1738 

1739 left = self.reindex(columns=common, copy=False) 

1740 right = other.reindex(index=common, copy=False) 

1741 lvals = left.values 

1742 rvals = right._values 

1743 else: 

1744 left = self 

1745 lvals = self.values 

1746 rvals = np.asarray(other) 

1747 if lvals.shape[1] != rvals.shape[0]: 

1748 raise ValueError( 

1749 f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}" 

1750 ) 

1751 

1752 if isinstance(other, DataFrame): 

1753 common_type = find_common_type(list(self.dtypes) + list(other.dtypes)) 

1754 return self._constructor( 

1755 np.dot(lvals, rvals), 

1756 index=left.index, 

1757 columns=other.columns, 

1758 copy=False, 

1759 dtype=common_type, 

1760 ) 

1761 elif isinstance(other, Series): 

1762 common_type = find_common_type(list(self.dtypes) + [other.dtypes]) 

1763 return self._constructor_sliced( 

1764 np.dot(lvals, rvals), index=left.index, copy=False, dtype=common_type 

1765 ) 

1766 elif isinstance(rvals, (np.ndarray, Index)): 

1767 result = np.dot(lvals, rvals) 

1768 if result.ndim == 2: 

1769 return self._constructor(result, index=left.index, copy=False) 

1770 else: 

1771 return self._constructor_sliced(result, index=left.index, copy=False) 

1772 else: # pragma: no cover 

1773 raise TypeError(f"unsupported type: {type(other)}") 

1774 
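A small sketch (invented data) of the alignment requirement in the ``dot`` Notes above; the error message comes from the check in the method body:

>>> import pandas as pd
>>> df = pd.DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
>>> s = pd.Series([1, 1], index=["a", "c"])    # "c" is not a column of df
>>> df.dot(s)
Traceback (most recent call last):
...
ValueError: matrices are not aligned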

1775 @overload 

1776 def __matmul__(self, other: Series) -> Series: 

1777 ... 

1778 

1779 @overload 

1780 def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: 

1781 ... 

1782 

1783 def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: 

1784 """ 

1785 Matrix multiplication using binary `@` operator. 

1786 """ 

1787 return self.dot(other) 

1788 

1789 def __rmatmul__(self, other) -> DataFrame: 

1790 """ 

1791 Matrix multiplication using binary `@` operator. 

1792 """ 

1793 try: 

1794 return self.T.dot(np.transpose(other)).T 

1795 except ValueError as err: 

1796 if "shape mismatch" not in str(err): 

1797 raise 

1798 # GH#21581 give exception message for original shapes 

1799 msg = f"shapes {np.shape(other)} and {self.shape} not aligned" 

1800 raise ValueError(msg) from err 

1801 
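Neither ``__matmul__`` nor ``__rmatmul__`` carries an Examples section. A small, hedged sketch (reusing the ``df`` and ``s`` objects from the ``dot`` docstring above): the ``@`` operator simply routes through ``dot``, and a left-hand NumPy array of compatible shape is handled by ``__rmatmul__``. The outputs shown follow from the alignment rules described above.

>>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
>>> s = pd.Series([1, 1, 2, 1])
>>> df @ s
0   -4
1    5
dtype: int64
>>> np.array([[1, 2]]) @ df
   0  1  2  3
0  2  3  0  1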

1802 # ---------------------------------------------------------------------- 

1803 # IO methods (to / from other formats) 

1804 

1805 @classmethod 

1806 def from_dict( 

1807 cls, 

1808 data: dict, 

1809 orient: FromDictOrient = "columns", 

1810 dtype: Dtype | None = None, 

1811 columns: Axes | None = None, 

1812 ) -> DataFrame: 

1813 """ 

1814 Construct DataFrame from dict of array-like or dicts. 

1815 

1816 Creates DataFrame object from dictionary by columns or by index 

1817 allowing dtype specification. 

1818 

1819 Parameters 

1820 ---------- 

1821 data : dict 

1822 Of the form {field : array-like} or {field : dict}. 

1823 orient : {'columns', 'index', 'tight'}, default 'columns' 

1824 The "orientation" of the data. If the keys of the passed dict 

1825 should be the columns of the resulting DataFrame, pass 'columns' 

1826 (default). Otherwise if the keys should be rows, pass 'index'. 

1827 If 'tight', assume a dict with keys ['index', 'columns', 'data', 

1828 'index_names', 'column_names']. 

1829 

1830 .. versionadded:: 1.4.0 

1831 'tight' as an allowed value for the ``orient`` argument 

1832 

1833 dtype : dtype, default None 

1834 Data type to force after DataFrame construction, otherwise infer. 

1835 columns : list, default None 

1836 Column labels to use when ``orient='index'``. Raises a ValueError 

1837 if used with ``orient='columns'`` or ``orient='tight'``. 

1838 

1839 Returns 

1840 ------- 

1841 DataFrame 

1842 

1843 See Also 

1844 -------- 

1845 DataFrame.from_records : DataFrame from structured ndarray, sequence 

1846 of tuples or dicts, or DataFrame. 

1847 DataFrame : DataFrame object creation using constructor. 

1848 DataFrame.to_dict : Convert the DataFrame to a dictionary. 

1849 

1850 Examples 

1851 -------- 

1852 By default the keys of the dict become the DataFrame columns: 

1853 

1854 >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']} 

1855 >>> pd.DataFrame.from_dict(data) 

1856 col_1 col_2 

1857 0 3 a 

1858 1 2 b 

1859 2 1 c 

1860 3 0 d 

1861 

1862 Specify ``orient='index'`` to create the DataFrame using dictionary 

1863 keys as rows: 

1864 

1865 >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']} 

1866 >>> pd.DataFrame.from_dict(data, orient='index') 

1867 0 1 2 3 

1868 row_1 3 2 1 0 

1869 row_2 a b c d 

1870 

1871 When using the 'index' orientation, the column names can be 

1872 specified manually: 

1873 

1874 >>> pd.DataFrame.from_dict(data, orient='index', 

1875 ... columns=['A', 'B', 'C', 'D']) 

1876 A B C D 

1877 row_1 3 2 1 0 

1878 row_2 a b c d 

1879 

1880 Specify ``orient='tight'`` to create the DataFrame using a 'tight' 

1881 format: 

1882 

1883 >>> data = {'index': [('a', 'b'), ('a', 'c')], 

1884 ... 'columns': [('x', 1), ('y', 2)], 

1885 ... 'data': [[1, 3], [2, 4]], 

1886 ... 'index_names': ['n1', 'n2'], 

1887 ... 'column_names': ['z1', 'z2']} 

1888 >>> pd.DataFrame.from_dict(data, orient='tight') 

1889 z1 x y 

1890 z2 1 2 

1891 n1 n2 

1892 a b 1 3 

1893 c 2 4 

1894 """ 

1895 index = None 

1896 orient = orient.lower() # type: ignore[assignment] 

1897 if orient == "index": 

1898 if len(data) > 0: 

1899 # TODO speed up Series case 

1900 if isinstance(next(iter(data.values())), (Series, dict)): 

1901 data = _from_nested_dict(data) 

1902 else: 

1903 index = list(data.keys()) 

1904 # error: Incompatible types in assignment (expression has type 

1905 # "List[Any]", variable has type "Dict[Any, Any]") 

1906 data = list(data.values()) # type: ignore[assignment] 

1907 elif orient in ("columns", "tight"): 

1908 if columns is not None: 

1909 raise ValueError(f"cannot use columns parameter with orient='{orient}'") 

1910 else: # pragma: no cover 

1911 raise ValueError( 

1912 f"Expected 'index', 'columns' or 'tight' for orient parameter. " 

1913 f"Got '{orient}' instead" 

1914 ) 

1915 

1916 if orient != "tight": 

1917 return cls(data, index=index, columns=columns, dtype=dtype) 

1918 else: 

1919 realdata = data["data"] 

1920 

1921 def create_index(indexlist, namelist): 

1922 index: Index 

1923 if len(namelist) > 1: 

1924 index = MultiIndex.from_tuples(indexlist, names=namelist) 

1925 else: 

1926 index = Index(indexlist, name=namelist[0]) 

1927 return index 

1928 

1929 index = create_index(data["index"], data["index_names"]) 

1930 columns = create_index(data["columns"], data["column_names"]) 

1931 return cls(realdata, index=index, columns=columns, dtype=dtype) 

1932 

1933 def to_numpy( 

1934 self, 

1935 dtype: npt.DTypeLike | None = None, 

1936 copy: bool = False, 

1937 na_value: object = lib.no_default, 

1938 ) -> np.ndarray: 

1939 """ 

1940 Convert the DataFrame to a NumPy array. 

1941 

1942 By default, the dtype of the returned array will be the common NumPy 

1943 dtype of all types in the DataFrame. For example, if the dtypes are 

1944 ``float16`` and ``float32``, the results dtype will be ``float32``. 

1945 This may require copying data and coercing values, which may be 

1946 expensive. 

1947 

1948 Parameters 

1949 ---------- 

1950 dtype : str or numpy.dtype, optional 

1951 The dtype to pass to :meth:`numpy.asarray`. 

1952 copy : bool, default False 

1953 Whether to ensure that the returned value is not a view on 

1954 another array. Note that ``copy=False`` does not *ensure* that 

1955 ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that 

1956 a copy is made, even if not strictly necessary. 

1957 na_value : Any, optional 

1958 The value to use for missing values. The default value depends 

1959 on `dtype` and the dtypes of the DataFrame columns. 

1960 

1961 Returns 

1962 ------- 

1963 numpy.ndarray 

1964 

1965 See Also 

1966 -------- 

1967 Series.to_numpy : Similar method for Series. 

1968 

1969 Examples 

1970 -------- 

1971 >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy() 

1972 array([[1, 3], 

1973 [2, 4]]) 

1974 

1975 With heterogeneous data, the lowest common type will have to 

1976 be used. 

1977 

1978 >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]}) 

1979 >>> df.to_numpy() 

1980 array([[1. , 3. ], 

1981 [2. , 4.5]]) 

1982 

1983 For a mix of numeric and non-numeric types, the output array will 

1984 have object dtype. 

1985 

1986 >>> df['C'] = pd.date_range('2000', periods=2) 

1987 >>> df.to_numpy() 

1988 array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], 

1989 [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) 

1990 """ 

1991 if dtype is not None: 

1992 dtype = np.dtype(dtype) 

1993 result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value) 

1994 if result.dtype is not dtype: 

1995 result = np.asarray(result, dtype=dtype) 

1996 

1997 return result 

1998 
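The ``dtype`` and ``na_value`` parameters are not exercised in the examples above. A hedged sketch of both, assuming a plain float ``NaN`` so that ``na_value`` fills the single missing entry:

>>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, np.nan]})
>>> df.to_numpy(na_value=0.0)
array([[1., 3.],
       [2., 0.]])
>>> df.to_numpy(dtype="float32").dtype
dtype('float32')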

1999 def _create_data_for_split_and_tight_to_dict( 

2000 self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int] 

2001 ) -> list: 

2002 """ 

2003 Simple helper method to create the main output data for 

2004 ``to_dict(orient="split")`` and ``to_dict(orient="tight")``. 

2005 """ 

2006 if are_all_object_dtype_cols: 

2007 data = [ 

2008 list(map(maybe_box_native, t)) 

2009 for t in self.itertuples(index=False, name=None) 

2010 ] 

2011 else: 

2012 data = [list(t) for t in self.itertuples(index=False, name=None)] 

2013 if object_dtype_indices: 

2014 # If we have object_dtype_cols, apply maybe_box_native after list 

2015 # comprehension for perf 

2016 for row in data: 

2017 for i in object_dtype_indices: 

2018 row[i] = maybe_box_native(row[i]) 

2019 return data 

2020 

2021 @overload 

2022 def to_dict( 

2023 self, 

2024 orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., 

2025 *, 

2026 into: type[MutableMappingT] | MutableMappingT, 

2027 index: bool = ..., 

2028 ) -> MutableMappingT: 

2029 ... 

2030 

2031 @overload 

2032 def to_dict( 

2033 self, 

2034 orient: Literal["records"], 

2035 *, 

2036 into: type[MutableMappingT] | MutableMappingT, 

2037 index: bool = ..., 

2038 ) -> list[MutableMappingT]: 

2039 ... 

2040 

2041 @overload 

2042 def to_dict( 

2043 self, 

2044 orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., 

2045 *, 

2046 into: type[dict] = ..., 

2047 index: bool = ..., 

2048 ) -> dict: 

2049 ... 

2050 

2051 @overload 

2052 def to_dict( 

2053 self, 

2054 orient: Literal["records"], 

2055 *, 

2056 into: type[dict] = ..., 

2057 index: bool = ..., 

2058 ) -> list[dict]: 

2059 ... 

2060 

2061 # error: Incompatible default for argument "into" (default has type "type 

2062 # [dict[Any, Any]]", argument has type "type[MutableMappingT] | MutableMappingT") 

2063 @deprecate_nonkeyword_arguments( 

2064 version="3.0", allowed_args=["self", "orient"], name="to_dict" 

2065 ) 

2066 def to_dict( 

2067 self, 

2068 orient: Literal[ 

2069 "dict", "list", "series", "split", "tight", "records", "index" 

2070 ] = "dict", 

2071 into: type[MutableMappingT] 

2072 | MutableMappingT = dict, # type: ignore[assignment] 

2073 index: bool = True, 

2074 ) -> MutableMappingT | list[MutableMappingT]: 

2075 """ 

2076 Convert the DataFrame to a dictionary. 

2077 

2078 The type of the key-value pairs can be customized with the parameters 

2079 (see below). 

2080 

2081 Parameters 

2082 ---------- 

2083 orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'} 

2084 Determines the type of the values of the dictionary. 

2085 

2086 - 'dict' (default) : dict like {column -> {index -> value}} 

2087 - 'list' : dict like {column -> [values]} 

2088 - 'series' : dict like {column -> Series(values)} 

2089 - 'split' : dict like 

2090 {'index' -> [index], 'columns' -> [columns], 'data' -> [values]} 

2091 - 'tight' : dict like 

2092 {'index' -> [index], 'columns' -> [columns], 'data' -> [values], 

2093 'index_names' -> [index.names], 'column_names' -> [column.names]} 

2094 - 'records' : list like 

2095 [{column -> value}, ... , {column -> value}] 

2096 - 'index' : dict like {index -> {column -> value}} 

2097 

2098 .. versionadded:: 1.4.0 

2099 'tight' as an allowed value for the ``orient`` argument 

2100 

2101 into : class, default dict 

2102 The collections.abc.MutableMapping subclass used for all Mappings 

2103 in the return value. Can be the actual class or an empty 

2104 instance of the mapping type you want. If you want a 

2105 collections.defaultdict, you must pass it initialized. 

2106 

2107 index : bool, default True 

2108 Whether to include the index item (and index_names item if `orient` 

2109 is 'tight') in the returned dictionary. Can only be ``False`` 

2110 when `orient` is 'split' or 'tight'. 

2111 

2112 .. versionadded:: 2.0.0 

2113 

2114 Returns 

2115 ------- 

2116 dict, list or collections.abc.MutableMapping 

2117 Return a collections.abc.MutableMapping object representing the 

2118 DataFrame. The resulting transformation depends on the `orient` 

2119 parameter. 

2120 

2121 See Also 

2122 -------- 

2123 DataFrame.from_dict: Create a DataFrame from a dictionary. 

2124 DataFrame.to_json: Convert a DataFrame to JSON format. 

2125 

2126 Examples 

2127 -------- 

2128 >>> df = pd.DataFrame({'col1': [1, 2], 

2129 ... 'col2': [0.5, 0.75]}, 

2130 ... index=['row1', 'row2']) 

2131 >>> df 

2132 col1 col2 

2133 row1 1 0.50 

2134 row2 2 0.75 

2135 >>> df.to_dict() 

2136 {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}} 

2137 

2138 You can specify the return orientation. 

2139 

2140 >>> df.to_dict('series') 

2141 {'col1': row1 1 

2142 row2 2 

2143 Name: col1, dtype: int64, 

2144 'col2': row1 0.50 

2145 row2 0.75 

2146 Name: col2, dtype: float64} 

2147 

2148 >>> df.to_dict('split') 

2149 {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], 

2150 'data': [[1, 0.5], [2, 0.75]]} 

2151 

2152 >>> df.to_dict('records') 

2153 [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}] 

2154 

2155 >>> df.to_dict('index') 

2156 {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}} 

2157 

2158 >>> df.to_dict('tight') 

2159 {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], 

2160 'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]} 

2161 

2162 You can also specify the mapping type. 

2163 

2164 >>> from collections import OrderedDict, defaultdict 

2165 >>> df.to_dict(into=OrderedDict) 

2166 OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])), 

2167 ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))]) 

2168 

2169 If you want a `defaultdict`, you need to initialize it: 

2170 

2171 >>> dd = defaultdict(list) 

2172 >>> df.to_dict('records', into=dd) 

2173 [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}), 

2174 defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})] 

2175 """ 

2176 from pandas.core.methods.to_dict import to_dict 

2177 

2178 return to_dict(self, orient, into=into, index=index) 

2179 
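The ``index`` keyword (added in 2.0.0) is not shown in the docstring examples. A hedged sketch, continuing with the same ``df`` used above; dropping the index is only valid for the 'split' and 'tight' orients:

>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [0.5, 0.75]},
...                   index=['row1', 'row2'])
>>> df.to_dict('split', index=False)
{'columns': ['col1', 'col2'], 'data': [[1, 0.5], [2, 0.75]]}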

2180 @deprecate_nonkeyword_arguments( 

2181 version="3.0", allowed_args=["self", "destination_table"], name="to_gbq" 

2182 ) 

2183 def to_gbq( 

2184 self, 

2185 destination_table: str, 

2186 project_id: str | None = None, 

2187 chunksize: int | None = None, 

2188 reauth: bool = False, 

2189 if_exists: ToGbqIfexist = "fail", 

2190 auth_local_webserver: bool = True, 

2191 table_schema: list[dict[str, str]] | None = None, 

2192 location: str | None = None, 

2193 progress_bar: bool = True, 

2194 credentials=None, 

2195 ) -> None: 

2196 """ 

2197 Write a DataFrame to a Google BigQuery table. 

2198 

2199 .. deprecated:: 2.2.0 

2200 

2201 Please use ``pandas_gbq.to_gbq`` instead. 

2202 

2203 This function requires the `pandas-gbq package 

2204 <https://pandas-gbq.readthedocs.io>`__. 

2205 

2206 See the `How to authenticate with Google BigQuery 

2207 <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__ 

2208 guide for authentication instructions. 

2209 

2210 Parameters 

2211 ---------- 

2212 destination_table : str 

2213 Name of table to be written, in the form ``dataset.tablename``. 

2214 project_id : str, optional 

2215 Google BigQuery Account project ID. Optional when available from 

2216 the environment. 

2217 chunksize : int, optional 

2218 Number of rows to be inserted in each chunk from the dataframe. 

2219 Set to ``None`` to load the whole dataframe at once. 

2220 reauth : bool, default False 

2221 Force Google BigQuery to re-authenticate the user. This is useful 

2222 if multiple accounts are used. 

2223 if_exists : str, default 'fail' 

2224 Behavior when the destination table exists. Value can be one of: 

2225 

2226 ``'fail'`` 

2227 If table exists raise pandas_gbq.gbq.TableCreationError. 

2228 ``'replace'`` 

2229 If table exists, drop it, recreate it, and insert data. 

2230 ``'append'`` 

2231 If table exists, insert data. Create if does not exist. 

2232 auth_local_webserver : bool, default True 

2233 Use the `local webserver flow`_ instead of the `console flow`_ 

2234 when getting user credentials. 

2235 

2236 .. _local webserver flow: 

2237 https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server 

2238 .. _console flow: 

2239 https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console 

2240 

2241 *New in version 0.2.0 of pandas-gbq*. 

2242 

2243 .. versionchanged:: 1.5.0 

2244 Default value is changed to ``True``. Google has deprecated the 

2245 ``auth_local_webserver = False`` `"out of band" (copy-paste) 

2246 flow 

2247 <https://developers.googleblog.com/2022/02/making-oauth-flows-safer.html?m=1#disallowed-oob>`_. 

2248 table_schema : list of dicts, optional 

2249 List of BigQuery table fields to which the DataFrame 

2250 columns conform, e.g. ``[{'name': 'col1', 'type': 

2251 'STRING'},...]``. If schema is not provided, it will be 

2252 generated according to dtypes of DataFrame columns. See 

2253 BigQuery API documentation on available names of a field. 

2254 

2255 *New in version 0.3.1 of pandas-gbq*. 

2256 location : str, optional 

2257 Location where the load job should run. See the `BigQuery locations 

2258 documentation 

2259 <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a 

2260 list of available locations. The location must match that of the 

2261 target dataset. 

2262 

2263 *New in version 0.5.0 of pandas-gbq*. 

2264 progress_bar : bool, default True 

2265 Use the library `tqdm` to show the progress bar for the upload, 

2266 chunk by chunk. 

2267 

2268 *New in version 0.5.0 of pandas-gbq*. 

2269 credentials : google.auth.credentials.Credentials, optional 

2270 Credentials for accessing Google APIs. Use this parameter to 

2271 override default credentials, such as to use Compute Engine 

2272 :class:`google.auth.compute_engine.Credentials` or Service 

2273 Account :class:`google.oauth2.service_account.Credentials` 

2274 directly. 

2275 

2276 *New in version 0.8.0 of pandas-gbq*. 

2277 

2278 See Also 

2279 -------- 

2280 pandas_gbq.to_gbq : This function in the pandas-gbq library. 

2281 read_gbq : Read a DataFrame from Google BigQuery. 

2282 

2283 Examples 

2284 -------- 

2285 Example taken from `Google BigQuery documentation 

2286 <https://cloud.google.com/bigquery/docs/samples/bigquery-pandas-gbq-to-gbq-simple>`_ 

2287 

2288 >>> project_id = "my-project" 

2289 >>> table_id = 'my_dataset.my_table' 

2290 >>> df = pd.DataFrame({ 

2291 ... "my_string": ["a", "b", "c"], 

2292 ... "my_int64": [1, 2, 3], 

2293 ... "my_float64": [4.0, 5.0, 6.0], 

2294 ... "my_bool1": [True, False, True], 

2295 ... "my_bool2": [False, True, False], 

2296 ... "my_dates": pd.date_range("now", periods=3), 

2297 ... } 

2298 ... ) 

2299 

2300 >>> df.to_gbq(table_id, project_id=project_id) # doctest: +SKIP 

2301 """ 

2302 from pandas.io import gbq 

2303 

2304 gbq.to_gbq( 

2305 self, 

2306 destination_table, 

2307 project_id=project_id, 

2308 chunksize=chunksize, 

2309 reauth=reauth, 

2310 if_exists=if_exists, 

2311 auth_local_webserver=auth_local_webserver, 

2312 table_schema=table_schema, 

2313 location=location, 

2314 progress_bar=progress_bar, 

2315 credentials=credentials, 

2316 ) 

2317 

2318 @classmethod 

2319 def from_records( 

2320 cls, 

2321 data, 

2322 index=None, 

2323 exclude=None, 

2324 columns=None, 

2325 coerce_float: bool = False, 

2326 nrows: int | None = None, 

2327 ) -> DataFrame: 

2328 """ 

2329 Convert structured or record ndarray to DataFrame. 

2330 

2331 Creates a DataFrame object from a structured ndarray, sequence of 

2332 tuples or dicts, or DataFrame. 

2333 

2334 Parameters 

2335 ---------- 

2336 data : structured ndarray, sequence of tuples or dicts, or DataFrame 

2337 Structured input data. 

2338 

2339 .. deprecated:: 2.1.0 

2340 Passing a DataFrame is deprecated. 

2341 index : str, list of fields, array-like 

2342 Field of array to use as the index, alternately a specific set of 

2343 input labels to use. 

2344 exclude : sequence, default None 

2345 Columns or fields to exclude. 

2346 columns : sequence, default None 

2347 Column names to use. If the passed data do not have names 

2348 associated with them, this argument provides names for the 

2349 columns. Otherwise this argument indicates the order of the columns 

2350 in the result (any names not found in the data will become all-NA 

2351 columns). 

2352 coerce_float : bool, default False 

2353 Attempt to convert values of non-string, non-numeric objects (like 

2354 decimal.Decimal) to floating point, useful for SQL result sets. 

2355 nrows : int, default None 

2356 Number of rows to read if data is an iterator. 

2357 

2358 Returns 

2359 ------- 

2360 DataFrame 

2361 

2362 See Also 

2363 -------- 

2364 DataFrame.from_dict : DataFrame from dict of array-like or dicts. 

2365 DataFrame : DataFrame object creation using constructor. 

2366 

2367 Examples 

2368 -------- 

2369 Data can be provided as a structured ndarray: 

2370 

2371 >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')], 

2372 ... dtype=[('col_1', 'i4'), ('col_2', 'U1')]) 

2373 >>> pd.DataFrame.from_records(data) 

2374 col_1 col_2 

2375 0 3 a 

2376 1 2 b 

2377 2 1 c 

2378 3 0 d 

2379 

2380 Data can be provided as a list of dicts: 

2381 

2382 >>> data = [{'col_1': 3, 'col_2': 'a'}, 

2383 ... {'col_1': 2, 'col_2': 'b'}, 

2384 ... {'col_1': 1, 'col_2': 'c'}, 

2385 ... {'col_1': 0, 'col_2': 'd'}] 

2386 >>> pd.DataFrame.from_records(data) 

2387 col_1 col_2 

2388 0 3 a 

2389 1 2 b 

2390 2 1 c 

2391 3 0 d 

2392 

2393 Data can be provided as a list of tuples with corresponding columns: 

2394 

2395 >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')] 

2396 >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2']) 

2397 col_1 col_2 

2398 0 3 a 

2399 1 2 b 

2400 2 1 c 

2401 3 0 d 

2402 """ 

2403 if isinstance(data, DataFrame): 

2404 warnings.warn( 

2405 "Passing a DataFrame to DataFrame.from_records is deprecated. Use " 

2406 "set_index and/or drop to modify the DataFrame instead.", 

2407 FutureWarning, 

2408 stacklevel=find_stack_level(), 

2409 ) 

2410 if columns is not None: 

2411 if is_scalar(columns): 

2412 columns = [columns] 

2413 data = data[columns] 

2414 if index is not None: 

2415 data = data.set_index(index) 

2416 if exclude is not None: 

2417 data = data.drop(columns=exclude) 

2418 return data.copy(deep=False) 

2419 

2420 result_index = None 

2421 

2422 # Make a copy of the input columns so we can modify it 

2423 if columns is not None: 

2424 columns = ensure_index(columns) 

2425 

2426 def maybe_reorder( 

2427 arrays: list[ArrayLike], arr_columns: Index, columns: Index, index 

2428 ) -> tuple[list[ArrayLike], Index, Index | None]: 

2429 """ 

2430 If our desired 'columns' do not match the data's pre-existing 'arr_columns', 

2431 we re-order our arrays. This is like a pre-emptive (cheap) reindex. 

2432 """ 

2433 if len(arrays): 

2434 length = len(arrays[0]) 

2435 else: 

2436 length = 0 

2437 

2438 result_index = None 

2439 if len(arrays) == 0 and index is None and length == 0: 

2440 result_index = default_index(0) 

2441 

2442 arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length) 

2443 return arrays, arr_columns, result_index 

2444 

2445 if is_iterator(data): 

2446 if nrows == 0: 

2447 return cls() 

2448 

2449 try: 

2450 first_row = next(data) 

2451 except StopIteration: 

2452 return cls(index=index, columns=columns) 

2453 

2454 dtype = None 

2455 if hasattr(first_row, "dtype") and first_row.dtype.names: 

2456 dtype = first_row.dtype 

2457 

2458 values = [first_row] 

2459 

2460 if nrows is None: 

2461 values += data 

2462 else: 

2463 values.extend(itertools.islice(data, nrows - 1)) 

2464 

2465 if dtype is not None: 

2466 data = np.array(values, dtype=dtype) 

2467 else: 

2468 data = values 

2469 

2470 if isinstance(data, dict): 

2471 if columns is None: 

2472 columns = arr_columns = ensure_index(sorted(data)) 

2473 arrays = [data[k] for k in columns] 

2474 else: 

2475 arrays = [] 

2476 arr_columns_list = [] 

2477 for k, v in data.items(): 

2478 if k in columns: 

2479 arr_columns_list.append(k) 

2480 arrays.append(v) 

2481 

2482 arr_columns = Index(arr_columns_list) 

2483 arrays, arr_columns, result_index = maybe_reorder( 

2484 arrays, arr_columns, columns, index 

2485 ) 

2486 

2487 elif isinstance(data, np.ndarray): 

2488 arrays, columns = to_arrays(data, columns) 

2489 arr_columns = columns 

2490 else: 

2491 arrays, arr_columns = to_arrays(data, columns) 

2492 if coerce_float: 

2493 for i, arr in enumerate(arrays): 

2494 if arr.dtype == object: 

2495 # error: Argument 1 to "maybe_convert_objects" has 

2496 # incompatible type "Union[ExtensionArray, ndarray]"; 

2497 # expected "ndarray" 

2498 arrays[i] = lib.maybe_convert_objects( 

2499 arr, # type: ignore[arg-type] 

2500 try_float=True, 

2501 ) 

2502 

2503 arr_columns = ensure_index(arr_columns) 

2504 if columns is None: 

2505 columns = arr_columns 

2506 else: 

2507 arrays, arr_columns, result_index = maybe_reorder( 

2508 arrays, arr_columns, columns, index 

2509 ) 

2510 

2511 if exclude is None: 

2512 exclude = set() 

2513 else: 

2514 exclude = set(exclude) 

2515 

2516 if index is not None: 

2517 if isinstance(index, str) or not hasattr(index, "__iter__"): 

2518 i = columns.get_loc(index) 

2519 exclude.add(index) 

2520 if len(arrays) > 0: 

2521 result_index = Index(arrays[i], name=index) 

2522 else: 

2523 result_index = Index([], name=index) 

2524 else: 

2525 try: 

2526 index_data = [arrays[arr_columns.get_loc(field)] for field in index] 

2527 except (KeyError, TypeError): 

2528 # raised by get_loc, see GH#29258 

2529 result_index = index 

2530 else: 

2531 result_index = ensure_index_from_sequences(index_data, names=index) 

2532 exclude.update(index) 

2533 

2534 if any(exclude): 

2535 arr_exclude = [x for x in exclude if x in arr_columns] 

2536 to_remove = [arr_columns.get_loc(col) for col in arr_exclude] 

2537 arrays = [v for i, v in enumerate(arrays) if i not in to_remove] 

2538 

2539 columns = columns.drop(exclude) 

2540 

2541 manager = _get_option("mode.data_manager", silent=True) 

2542 mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager) 

2543 

2544 return cls._from_mgr(mgr, axes=mgr.axes) 

2545 
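The ``index`` and ``exclude`` parameters have no example in the docstring above. A hedged sketch reusing the list-of-tuples ``data`` from those examples:

>>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')]
>>> frame = pd.DataFrame.from_records(data, columns=['col_1', 'col_2'],
...                                    index='col_1')
>>> frame.index.name, list(frame.columns)
('col_1', ['col_2'])
>>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2'],
...                           exclude=['col_2']).columns.tolist()
['col_1']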

2546 def to_records( 

2547 self, index: bool = True, column_dtypes=None, index_dtypes=None 

2548 ) -> np.rec.recarray: 

2549 """ 

2550 Convert DataFrame to a NumPy record array. 

2551 

2552 Index will be included as the first field of the record array if 

2553 requested. 

2554 

2555 Parameters 

2556 ---------- 

2557 index : bool, default True 

2558 Include index in resulting record array, stored in 'index' 

2559 field or using the index label, if set. 

2560 column_dtypes : str, type, dict, default None 

2561 If a string or type, the data type to store all columns. If 

2562 a dictionary, a mapping of column names and indices (zero-indexed) 

2563 to specific data types. 

2564 index_dtypes : str, type, dict, default None 

2565 If a string or type, the data type to store all index levels. If 

2566 a dictionary, a mapping of index level names and indices 

2567 (zero-indexed) to specific data types. 

2568 

2569 This mapping is applied only if `index=True`. 

2570 

2571 Returns 

2572 ------- 

2573 numpy.rec.recarray 

2574 NumPy ndarray with the DataFrame labels as fields and each row 

2575 of the DataFrame as entries. 

2576 

2577 See Also 

2578 -------- 

2579 DataFrame.from_records: Convert structured or record ndarray 

2580 to DataFrame. 

2581 numpy.rec.recarray: An ndarray that allows field access using 

2582 attributes, analogous to typed columns in a 

2583 spreadsheet. 

2584 

2585 Examples 

2586 -------- 

2587 >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]}, 

2588 ... index=['a', 'b']) 

2589 >>> df 

2590 A B 

2591 a 1 0.50 

2592 b 2 0.75 

2593 >>> df.to_records() 

2594 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], 

2595 dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')]) 

2596 

2597 If the DataFrame index has no label then the recarray field name 

2598 is set to 'index'. If the index has a label then this is used as the 

2599 field name: 

2600 

2601 >>> df.index = df.index.rename("I") 

2602 >>> df.to_records() 

2603 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], 

2604 dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')]) 

2605 

2606 The index can be excluded from the record array: 

2607 

2608 >>> df.to_records(index=False) 

2609 rec.array([(1, 0.5 ), (2, 0.75)], 

2610 dtype=[('A', '<i8'), ('B', '<f8')]) 

2611 

2612 Data types can be specified for the columns: 

2613 

2614 >>> df.to_records(column_dtypes={"A": "int32"}) 

2615 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], 

2616 dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')]) 

2617 

2618 As well as for the index: 

2619 

2620 >>> df.to_records(index_dtypes="<S2") 

2621 rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)], 

2622 dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')]) 

2623 

2624 >>> index_dtypes = f"<S{df.index.str.len().max()}" 

2625 >>> df.to_records(index_dtypes=index_dtypes) 

2626 rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)], 

2627 dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')]) 

2628 """ 

2629 if index: 

2630 ix_vals = [ 

2631 np.asarray(self.index.get_level_values(i)) 

2632 for i in range(self.index.nlevels) 

2633 ] 

2634 

2635 arrays = ix_vals + [ 

2636 np.asarray(self.iloc[:, i]) for i in range(len(self.columns)) 

2637 ] 

2638 

2639 index_names = list(self.index.names) 

2640 

2641 if isinstance(self.index, MultiIndex): 

2642 index_names = com.fill_missing_names(index_names) 

2643 elif index_names[0] is None: 

2644 index_names = ["index"] 

2645 

2646 names = [str(name) for name in itertools.chain(index_names, self.columns)] 

2647 else: 

2648 arrays = [np.asarray(self.iloc[:, i]) for i in range(len(self.columns))] 

2649 names = [str(c) for c in self.columns] 

2650 index_names = [] 

2651 

2652 index_len = len(index_names) 

2653 formats = [] 

2654 

2655 for i, v in enumerate(arrays): 

2656 index_int = i 

2657 

2658 # When the names and arrays are collected, we 

2659 # first collect those in the DataFrame's index, 

2660 # followed by those in its columns. 

2661 # 

2662 # Thus, the total length of the array is: 

2663 # len(index_names) + len(DataFrame.columns). 

2664 # 

2665 # This check allows us to see whether we are 

2666 # handling a name / array in the index or column. 

2667 if index_int < index_len: 

2668 dtype_mapping = index_dtypes 

2669 name = index_names[index_int] 

2670 else: 

2671 index_int -= index_len 

2672 dtype_mapping = column_dtypes 

2673 name = self.columns[index_int] 

2674 

2675 # We have a dictionary, so we get the data type 

2676 # associated with the index or column (which can 

2677 # be denoted by its name in the DataFrame or its 

2678 # position in DataFrame's array of indices or 

2679 # columns, whichever is applicable). 

2680 if is_dict_like(dtype_mapping): 

2681 if name in dtype_mapping: 

2682 dtype_mapping = dtype_mapping[name] 

2683 elif index_int in dtype_mapping: 

2684 dtype_mapping = dtype_mapping[index_int] 

2685 else: 

2686 dtype_mapping = None 

2687 

2688 # If no mapping can be found, use the array's 

2689 # dtype attribute for formatting. 

2690 # 

2691 # A valid dtype must either be a type or 

2692 # string naming a type. 

2693 if dtype_mapping is None: 

2694 formats.append(v.dtype) 

2695 elif isinstance(dtype_mapping, (type, np.dtype, str)): 

2696 # error: Argument 1 to "append" of "list" has incompatible 

2697 # type "Union[type, dtype[Any], str]"; expected "dtype[Any]" 

2698 formats.append(dtype_mapping) # type: ignore[arg-type] 

2699 else: 

2700 element = "row" if i < index_len else "column" 

2701 msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}" 

2702 raise ValueError(msg) 

2703 

2704 return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats}) 

2705 
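``column_dtypes`` also accepts positional (zero-indexed) keys, which the examples above do not show. A hedged sketch, reusing the two-column ``df`` from the docstring and checking only the resulting field dtype:

>>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]}, index=['a', 'b'])
>>> df.to_records(column_dtypes={0: "int32"})["A"].dtype
dtype('int32')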

2706 @classmethod 

2707 def _from_arrays( 

2708 cls, 

2709 arrays, 

2710 columns, 

2711 index, 

2712 dtype: Dtype | None = None, 

2713 verify_integrity: bool = True, 

2714 ) -> Self: 

2715 """ 

2716 Create DataFrame from a list of arrays corresponding to the columns. 

2717 

2718 Parameters 

2719 ---------- 

2720 arrays : list-like of arrays 

2721 Each array in the list corresponds to one column, in order. 

2722 columns : list-like, Index 

2723 The column names for the resulting DataFrame. 

2724 index : list-like, Index 

2725 The row labels for the resulting DataFrame. 

2726 dtype : dtype, optional 

2727 Optional dtype to enforce for all arrays. 

2728 verify_integrity : bool, default True 

2729 Validate and homogenize all input. If set to False, it is assumed 

2730 that all elements of `arrays` are actual arrays, as they will be 

2731 stored in a block (numpy ndarray or ExtensionArray), have the same 

2732 length as and are aligned with the index, and that `columns` and 

2733 `index` are ensured to be Index objects. 

2734 

2735 Returns 

2736 ------- 

2737 DataFrame 

2738 """ 

2739 if dtype is not None: 

2740 dtype = pandas_dtype(dtype) 

2741 

2742 manager = _get_option("mode.data_manager", silent=True) 

2743 columns = ensure_index(columns) 

2744 if len(columns) != len(arrays): 

2745 raise ValueError("len(columns) must match len(arrays)") 

2746 mgr = arrays_to_mgr( 

2747 arrays, 

2748 columns, 

2749 index, 

2750 dtype=dtype, 

2751 verify_integrity=verify_integrity, 

2752 typ=manager, 

2753 ) 

2754 return cls._from_mgr(mgr, axes=mgr.axes) 

2755 

2756 @doc( 

2757 storage_options=_shared_docs["storage_options"], 

2758 compression_options=_shared_docs["compression_options"] % "path", 

2759 ) 

2760 def to_stata( 

2761 self, 

2762 path: FilePath | WriteBuffer[bytes], 

2763 *, 

2764 convert_dates: dict[Hashable, str] | None = None, 

2765 write_index: bool = True, 

2766 byteorder: ToStataByteorder | None = None, 

2767 time_stamp: datetime.datetime | None = None, 

2768 data_label: str | None = None, 

2769 variable_labels: dict[Hashable, str] | None = None, 

2770 version: int | None = 114, 

2771 convert_strl: Sequence[Hashable] | None = None, 

2772 compression: CompressionOptions = "infer", 

2773 storage_options: StorageOptions | None = None, 

2774 value_labels: dict[Hashable, dict[float, str]] | None = None, 

2775 ) -> None: 

2776 """ 

2777 Export DataFrame object to Stata dta format. 

2778 

2779 Writes the DataFrame to a Stata dataset file. 

2780 "dta" files contain a Stata dataset. 

2781 

2782 Parameters 

2783 ---------- 

2784 path : str, path object, or buffer 

2785 String, path object (implementing ``os.PathLike[str]``), or file-like 

2786 object implementing a binary ``write()`` function. 

2787 

2788 convert_dates : dict 

2789 Dictionary mapping columns containing datetime types to stata 

2790 internal format to use when writing the dates. Options are 'tc', 

2791 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer 

2792 or a name. Datetime columns that do not have a conversion type 

2793 specified will be converted to 'tc'. Raises NotImplementedError if 

2794 a datetime column has timezone information. 

2795 write_index : bool 

2796 Write the index to Stata dataset. 

2797 byteorder : str 

2798 Can be ">", "<", "little", or "big". Default is `sys.byteorder`. 

2799 time_stamp : datetime 

2800 A datetime to use as file creation date. Default is the current 

2801 time. 

2802 data_label : str, optional 

2803 A label for the data set. Must be 80 characters or smaller. 

2804 variable_labels : dict 

2805 Dictionary containing columns as keys and variable labels as 

2806 values. Each label must be 80 characters or smaller. 

2807 version : {{114, 117, 118, 119, None}}, default 114 

2808 Version to use in the output dta file. Set to None to let pandas 

2809 decide between 118 or 119 formats depending on the number of 

2810 columns in the frame. Version 114 can be read by Stata 10 and 

2811 later. Version 117 can be read by Stata 13 or later. Version 118 

2812 is supported in Stata 14 and later. Version 119 is supported in 

2813 Stata 15 and later. Version 114 limits string variables to 244 

2814 characters or fewer while versions 117 and later allow strings 

2815 with lengths up to 2,000,000 characters. Versions 118 and 119 

2816 support Unicode characters, and version 119 supports more than 

2817 32,767 variables. 

2818 

2819 Version 119 should usually only be used when the number of 

2820 variables exceeds the capacity of dta format 118. Exporting 

2821 smaller datasets in format 119 may have unintended consequences, 

2822 and, as of November 2020, Stata SE cannot read version 119 files. 

2823 

2824 convert_strl : list, optional 

2825 List of column names to convert to string columns to Stata StrL 

2826 format. Only available if version is 117. Storing strings in the 

2827 StrL format can produce smaller dta files if strings have more than 

2828 8 characters and values are repeated. 

2829 {compression_options} 

2830 

2831 .. versionchanged:: 1.4.0 Zstandard support. 

2832 

2833 {storage_options} 

2834 

2835 value_labels : dict of dicts 

2836 Dictionary containing columns as keys and dictionaries of column value 

2837 to labels as values. Labels for a single variable must be 32,000 

2838 characters or smaller. 

2839 

2840 .. versionadded:: 1.4.0 

2841 

2842 Raises 

2843 ------ 

2844 NotImplementedError 

2845 * If datetimes contain timezone information 

2846 * Column dtype is not representable in Stata 

2847 ValueError 

2848 * Columns listed in convert_dates are neither datetime64[ns] 

2849 nor datetime.datetime 

2850 * Column listed in convert_dates is not in DataFrame 

2851 * Categorical label contains more than 32,000 characters 

2852 

2853 See Also 

2854 -------- 

2855 read_stata : Import Stata data files. 

2856 io.stata.StataWriter : Low-level writer for Stata data files. 

2857 io.stata.StataWriter117 : Low-level writer for version 117 files. 

2858 

2859 Examples 

2860 -------- 

2861 >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', 

2862 ... 'parrot'], 

2863 ... 'speed': [350, 18, 361, 15]}}) 

2864 >>> df.to_stata('animals.dta') # doctest: +SKIP 

2865 """ 

2866 if version not in (114, 117, 118, 119, None): 

2867 raise ValueError("Only formats 114, 117, 118 and 119 are supported.") 

2868 if version == 114: 

2869 if convert_strl is not None: 

2870 raise ValueError("strl is not supported in format 114") 

2871 from pandas.io.stata import StataWriter as statawriter 

2872 elif version == 117: 

2873 # Incompatible import of "statawriter" (imported name has type 

2874 # "Type[StataWriter117]", local name has type "Type[StataWriter]") 

2875 from pandas.io.stata import ( # type: ignore[assignment] 

2876 StataWriter117 as statawriter, 

2877 ) 

2878 else: # versions 118 and 119 

2879 # Incompatible import of "statawriter" (imported name has type 

2880 # "Type[StataWriter117]", local name has type "Type[StataWriter]") 

2881 from pandas.io.stata import ( # type: ignore[assignment] 

2882 StataWriterUTF8 as statawriter, 

2883 ) 

2884 

2885 kwargs: dict[str, Any] = {} 

2886 if version is None or version >= 117: 

2887 # strl conversion is only supported >= 117 

2888 kwargs["convert_strl"] = convert_strl 

2889 if version is None or version >= 118: 

2890 # Specifying the version is only supported for UTF8 (118 or 119) 

2891 kwargs["version"] = version 

2892 

2893 writer = statawriter( 

2894 path, 

2895 self, 

2896 convert_dates=convert_dates, 

2897 byteorder=byteorder, 

2898 time_stamp=time_stamp, 

2899 data_label=data_label, 

2900 write_index=write_index, 

2901 variable_labels=variable_labels, 

2902 compression=compression, 

2903 storage_options=storage_options, 

2904 value_labels=value_labels, 

2905 **kwargs, 

2906 ) 

2907 writer.write_file() 

2908 
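The date and labelling options (``convert_dates``, ``variable_labels``) have no example above. A hedged sketch of typical usage, with hypothetical column names and a placeholder file path:

>>> df = pd.DataFrame({'animal': ['falcon', 'parrot'],
...                    'speed': [350, 18],
...                    'seen': pd.to_datetime(['2020-01-01', '2020-06-01'])})
>>> df.to_stata('animals.dta',
...             convert_dates={'seen': 'td'},
...             variable_labels={'speed': 'Top speed in km/h'},
...             version=117)  # doctest: +SKIP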

2909 def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None: 

2910 """ 

2911 Write a DataFrame to the binary Feather format. 

2912 

2913 Parameters 

2914 ---------- 

2915 path : str, path object, file-like object 

2916 String, path object (implementing ``os.PathLike[str]``), or file-like 

2917 object implementing a binary ``write()`` function. If a string or a path, 

2918 it will be used as Root Directory path when writing a partitioned dataset. 

2919 **kwargs : 

2920 Additional keywords passed to :func:`pyarrow.feather.write_feather`. 

2921 This includes the `compression`, `compression_level`, `chunksize` 

2922 and `version` keywords. 

2923 

2924 Notes 

2925 ----- 

2926 This function writes the dataframe as a `feather file 

2927 <https://arrow.apache.org/docs/python/feather.html>`_. Requires a default 

2928 index. For saving the DataFrame with your custom index use a method that 

2929 supports custom indices e.g. `to_parquet`. 

2930 

2931 Examples 

2932 -------- 

2933 >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]]) 

2934 >>> df.to_feather("file.feather") # doctest: +SKIP 

2935 """ 

2936 from pandas.io.feather_format import to_feather 

2937 

2938 to_feather(self, path, **kwargs) 

2939 
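The Notes above require a default index. A hedged sketch of the usual workaround, moving a custom index into a column before writing and restoring it on read (``file.feather`` is a placeholder path):

>>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], index=['a', 'b'])
>>> df.reset_index().to_feather("file.feather")  # doctest: +SKIP
>>> pd.read_feather("file.feather").set_index("index")  # doctest: +SKIP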

2940 @deprecate_nonkeyword_arguments( 

2941 version="3.0", allowed_args=["self", "buf"], name="to_markdown" 

2942 ) 

2943 @doc( 

2944 Series.to_markdown, 

2945 klass=_shared_doc_kwargs["klass"], 

2946 storage_options=_shared_docs["storage_options"], 

2947 examples="""Examples 

2948 -------- 

2949 >>> df = pd.DataFrame( 

2950 ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]} 

2951 ... ) 

2952 >>> print(df.to_markdown()) 

2953 | | animal_1 | animal_2 | 

2954 |---:|:-----------|:-----------| 

2955 | 0 | elk | dog | 

2956 | 1 | pig | quetzal | 

2957 

2958 Output markdown with a tabulate option. 

2959 

2960 >>> print(df.to_markdown(tablefmt="grid")) 

2961 +----+------------+------------+ 

2962 | | animal_1 | animal_2 | 

2963 +====+============+============+ 

2964 | 0 | elk | dog | 

2965 +----+------------+------------+ 

2966 | 1 | pig | quetzal | 

2967 +----+------------+------------+""", 

2968 ) 

2969 def to_markdown( 

2970 self, 

2971 buf: FilePath | WriteBuffer[str] | None = None, 

2972 mode: str = "wt", 

2973 index: bool = True, 

2974 storage_options: StorageOptions | None = None, 

2975 **kwargs, 

2976 ) -> str | None: 

2977 if "showindex" in kwargs: 

2978 raise ValueError("Pass 'index' instead of 'showindex'") 

2979 

2980 kwargs.setdefault("headers", "keys") 

2981 kwargs.setdefault("tablefmt", "pipe") 

2982 kwargs.setdefault("showindex", index) 

2983 tabulate = import_optional_dependency("tabulate") 

2984 result = tabulate.tabulate(self, **kwargs) 

2985 if buf is None: 

2986 return result 

2987 

2988 with get_handle(buf, mode, storage_options=storage_options) as handles: 

2989 handles.handle.write(result) 

2990 return None 

2991 
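The shared docstring only prints the rendered table; ``buf`` also accepts a path or writable handle. A hedged sketch (``table.md`` is a placeholder path):

>>> df = pd.DataFrame({"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]})
>>> df.to_markdown("table.md", mode="wt")  # doctest: +SKIP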

2992 @overload 

2993 def to_parquet( 

2994 self, 

2995 path: None = ..., 

2996 engine: Literal["auto", "pyarrow", "fastparquet"] = ..., 

2997 compression: str | None = ..., 

2998 index: bool | None = ..., 

2999 partition_cols: list[str] | None = ..., 

3000 storage_options: StorageOptions = ..., 

3001 **kwargs, 

3002 ) -> bytes: 

3003 ... 

3004 

3005 @overload 

3006 def to_parquet( 

3007 self, 

3008 path: FilePath | WriteBuffer[bytes], 

3009 engine: Literal["auto", "pyarrow", "fastparquet"] = ..., 

3010 compression: str | None = ..., 

3011 index: bool | None = ..., 

3012 partition_cols: list[str] | None = ..., 

3013 storage_options: StorageOptions = ..., 

3014 **kwargs, 

3015 ) -> None: 

3016 ... 

3017 

3018 @deprecate_nonkeyword_arguments( 

3019 version="3.0", allowed_args=["self", "path"], name="to_parquet" 

3020 ) 

3021 @doc(storage_options=_shared_docs["storage_options"]) 

3022 def to_parquet( 

3023 self, 

3024 path: FilePath | WriteBuffer[bytes] | None = None, 

3025 engine: Literal["auto", "pyarrow", "fastparquet"] = "auto", 

3026 compression: str | None = "snappy", 

3027 index: bool | None = None, 

3028 partition_cols: list[str] | None = None, 

3029 storage_options: StorageOptions | None = None, 

3030 **kwargs, 

3031 ) -> bytes | None: 

3032 """ 

3033 Write a DataFrame to the binary parquet format. 

3034 

3035 This function writes the dataframe as a `parquet file 

3036 <https://parquet.apache.org/>`_. You can choose different parquet 

3037 backends, and have the option of compression. See 

3038 :ref:`the user guide <io.parquet>` for more details. 

3039 

3040 Parameters 

3041 ---------- 

3042 path : str, path object, file-like object, or None, default None 

3043 String, path object (implementing ``os.PathLike[str]``), or file-like 

3044 object implementing a binary ``write()`` function. If None, the result is 

3045 returned as bytes. If a string or path, it will be used as Root Directory 

3046 path when writing a partitioned dataset. 

3047 engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto' 

3048 Parquet library to use. If 'auto', then the option 

3049 ``io.parquet.engine`` is used. The default ``io.parquet.engine`` 

3050 behavior is to try 'pyarrow', falling back to 'fastparquet' if 

3051 'pyarrow' is unavailable. 

3052 compression : str or None, default 'snappy' 

3053 Name of the compression to use. Use ``None`` for no compression. 

3054 Supported options: 'snappy', 'gzip', 'brotli', 'lz4', 'zstd'. 

3055 index : bool, default None 

3056 If ``True``, include the dataframe's index(es) in the file output. 

3057 If ``False``, they will not be written to the file. 

3058 If ``None``, similar to ``True`` the dataframe's index(es) 

3059 will be saved. However, instead of being saved as values, 

3060 the RangeIndex will be stored as a range in the metadata so it 

3061 doesn't require much space and is faster. Other indexes will 

3062 be included as columns in the file output. 

3063 partition_cols : list, optional, default None 

3064 Column names by which to partition the dataset. 

3065 Columns are partitioned in the order they are given. 

3066 Must be None if path is not a string. 

3067 {storage_options} 

3068 

3069 **kwargs 

3070 Additional arguments passed to the parquet library. See 

3071 :ref:`pandas io <io.parquet>` for more details. 

3072 

3073 Returns 

3074 ------- 

3075 bytes if no path argument is provided else None 

3076 

3077 See Also 

3078 -------- 

3079 read_parquet : Read a parquet file. 

3080 DataFrame.to_orc : Write an orc file. 

3081 DataFrame.to_csv : Write a csv file. 

3082 DataFrame.to_sql : Write to a sql table. 

3083 DataFrame.to_hdf : Write to hdf. 

3084 

3085 Notes 

3086 ----- 

3087 This function requires either the `fastparquet 

3088 <https://pypi.org/project/fastparquet>`_ or `pyarrow 

3089 <https://arrow.apache.org/docs/python/>`_ library. 

3090 

3091 Examples 

3092 -------- 

3093 >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}}) 

3094 >>> df.to_parquet('df.parquet.gzip', 

3095 ... compression='gzip') # doctest: +SKIP 

3096 >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP 

3097 col1 col2 

3098 0 1 3 

3099 1 2 4 

3100 

3101 If you want to get a buffer to the parquet content you can use an io.BytesIO 

3102 object, as long as you don't use partition_cols, which creates multiple files. 

3103 

3104 >>> import io 

3105 >>> f = io.BytesIO() 

3106 >>> df.to_parquet(f) 

3107 >>> f.seek(0) 

3108 0 

3109 >>> content = f.read() 

3110 """ 

3111 from pandas.io.parquet import to_parquet 

3112 

3113 return to_parquet( 

3114 self, 

3115 path, 

3116 engine, 

3117 compression=compression, 

3118 index=index, 

3119 partition_cols=partition_cols, 

3120 storage_options=storage_options, 

3121 **kwargs, 

3122 ) 

3123 
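``partition_cols`` is documented but not exemplified. A hedged sketch writing a partitioned dataset to a directory (``dataset_root`` is a placeholder path; requires pyarrow):

>>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
>>> df.to_parquet('dataset_root', partition_cols=['col1'])  # doctest: +SKIP
>>> pd.read_parquet('dataset_root')  # doctest: +SKIP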

3124 def to_orc( 

3125 self, 

3126 path: FilePath | WriteBuffer[bytes] | None = None, 

3127 *, 

3128 engine: Literal["pyarrow"] = "pyarrow", 

3129 index: bool | None = None, 

3130 engine_kwargs: dict[str, Any] | None = None, 

3131 ) -> bytes | None: 

3132 """ 

3133 Write a DataFrame to the ORC format. 

3134 

3135 .. versionadded:: 1.5.0 

3136 

3137 Parameters 

3138 ---------- 

3139 path : str, file-like object or None, default None 

3140 If a string, it will be used as Root Directory path 

3141 when writing a partitioned dataset. By file-like object, 

3142 we refer to objects with a write() method, such as a file handle 

3143 (e.g. via builtin open function). If path is None, 

3144 a bytes object is returned. 

3145 engine : {'pyarrow'}, default 'pyarrow' 

3146 ORC library to use. 

3147 index : bool, optional 

3148 If ``True``, include the dataframe's index(es) in the file output. 

3149 If ``False``, they will not be written to the file. 

3150 If ``None``, similar to ``True`` the dataframe's index(es) 

3151 will be saved. However, instead of being saved as values, 

3152 the RangeIndex will be stored as a range in the metadata so it 

3153 doesn't require much space and is faster. Other indexes will 

3154 be included as columns in the file output. 

3155 engine_kwargs : dict[str, Any] or None, default None 

3156 Additional keyword arguments passed to :func:`pyarrow.orc.write_table`. 

3157 

3158 Returns 

3159 ------- 

3160 bytes if no path argument is provided else None 

3161 

3162 Raises 

3163 ------ 

3164 NotImplementedError 

3165 Dtype of one or more columns is category, unsigned integers, interval, 

3166 period or sparse. 

3167 ValueError 

3168 engine is not pyarrow. 

3169 

3170 See Also 

3171 -------- 

3172 read_orc : Read an ORC file. 

3173 DataFrame.to_parquet : Write a parquet file. 

3174 DataFrame.to_csv : Write a csv file. 

3175 DataFrame.to_sql : Write to a sql table. 

3176 DataFrame.to_hdf : Write to hdf. 

3177 

3178 Notes 

3179 ----- 

3180 * Before using this function you should read the :ref:`user guide about 

3181 ORC <io.orc>` and :ref:`install optional dependencies <install.warn_orc>`. 

3182 * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_ 

3183 library. 

3184 * For supported dtypes please refer to `supported ORC features in Arrow 

3185 <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__. 

3186 * Currently timezones in datetime columns are not preserved when a 

3187 dataframe is converted into ORC files. 

3188 

3189 Examples 

3190 -------- 

3191 >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]}) 

3192 >>> df.to_orc('df.orc') # doctest: +SKIP 

3193 >>> pd.read_orc('df.orc') # doctest: +SKIP 

3194 col1 col2 

3195 0 1 4 

3196 1 2 3 

3197 

3198 If you want to get a buffer to the ORC content you can write it to io.BytesIO 

3199 

3200 >>> import io 

3201 >>> b = io.BytesIO(df.to_orc()) # doctest: +SKIP 

3202 >>> b.seek(0) # doctest: +SKIP 

3203 0 

3204 >>> content = b.read() # doctest: +SKIP 

3205 """ 

3206 from pandas.io.orc import to_orc 

3207 

3208 return to_orc( 

3209 self, path, engine=engine, index=index, engine_kwargs=engine_kwargs 

3210 ) 

3211 

3212 @overload 

3213 def to_html( 

3214 self, 

3215 buf: FilePath | WriteBuffer[str], 

3216 columns: Axes | None = ..., 

3217 col_space: ColspaceArgType | None = ..., 

3218 header: bool = ..., 

3219 index: bool = ..., 

3220 na_rep: str = ..., 

3221 formatters: FormattersType | None = ..., 

3222 float_format: FloatFormatType | None = ..., 

3223 sparsify: bool | None = ..., 

3224 index_names: bool = ..., 

3225 justify: str | None = ..., 

3226 max_rows: int | None = ..., 

3227 max_cols: int | None = ..., 

3228 show_dimensions: bool | str = ..., 

3229 decimal: str = ..., 

3230 bold_rows: bool = ..., 

3231 classes: str | list | tuple | None = ..., 

3232 escape: bool = ..., 

3233 notebook: bool = ..., 

3234 border: int | bool | None = ..., 

3235 table_id: str | None = ..., 

3236 render_links: bool = ..., 

3237 encoding: str | None = ..., 

3238 ) -> None: 

3239 ... 

3240 

3241 @overload 

3242 def to_html( 

3243 self, 

3244 buf: None = ..., 

3245 columns: Axes | None = ..., 

3246 col_space: ColspaceArgType | None = ..., 

3247 header: bool = ..., 

3248 index: bool = ..., 

3249 na_rep: str = ..., 

3250 formatters: FormattersType | None = ..., 

3251 float_format: FloatFormatType | None = ..., 

3252 sparsify: bool | None = ..., 

3253 index_names: bool = ..., 

3254 justify: str | None = ..., 

3255 max_rows: int | None = ..., 

3256 max_cols: int | None = ..., 

3257 show_dimensions: bool | str = ..., 

3258 decimal: str = ..., 

3259 bold_rows: bool = ..., 

3260 classes: str | list | tuple | None = ..., 

3261 escape: bool = ..., 

3262 notebook: bool = ..., 

3263 border: int | bool | None = ..., 

3264 table_id: str | None = ..., 

3265 render_links: bool = ..., 

3266 encoding: str | None = ..., 

3267 ) -> str: 

3268 ... 

3269 

3270 @deprecate_nonkeyword_arguments( 

3271 version="3.0", allowed_args=["self", "buf"], name="to_html" 

3272 ) 

3273 @Substitution( 

3274 header_type="bool", 

3275 header="Whether to print column labels, default True", 

3276 col_space_type="str or int, list or dict of int or str", 

3277 col_space="The minimum width of each column in CSS length " 

3278 "units. An int is assumed to be px units.", 

3279 ) 

3280 @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) 

3281 def to_html( 

3282 self, 

3283 buf: FilePath | WriteBuffer[str] | None = None, 

3284 columns: Axes | None = None, 

3285 col_space: ColspaceArgType | None = None, 

3286 header: bool = True, 

3287 index: bool = True, 

3288 na_rep: str = "NaN", 

3289 formatters: FormattersType | None = None, 

3290 float_format: FloatFormatType | None = None, 

3291 sparsify: bool | None = None, 

3292 index_names: bool = True, 

3293 justify: str | None = None, 

3294 max_rows: int | None = None, 

3295 max_cols: int | None = None, 

3296 show_dimensions: bool | str = False, 

3297 decimal: str = ".", 

3298 bold_rows: bool = True, 

3299 classes: str | list | tuple | None = None, 

3300 escape: bool = True, 

3301 notebook: bool = False, 

3302 border: int | bool | None = None, 

3303 table_id: str | None = None, 

3304 render_links: bool = False, 

3305 encoding: str | None = None, 

3306 ) -> str | None: 

3307 """ 

3308 Render a DataFrame as an HTML table. 

3309 %(shared_params)s 

3310 bold_rows : bool, default True 

3311 Make the row labels bold in the output. 

3312 classes : str or list or tuple, default None 

3313 CSS class(es) to apply to the resulting html table. 

3314 escape : bool, default True 

3315 Convert the characters <, >, and & to HTML-safe sequences. 

3316 notebook : {True, False}, default False 

3317 Whether the generated HTML is for IPython Notebook. 

3318 border : int 

3319 A ``border=border`` attribute is included in the opening 

3320 `<table>` tag. Default ``pd.options.display.html.border``. 

3321 table_id : str, optional 

3322 A css id is included in the opening `<table>` tag if specified. 

3323 render_links : bool, default False 

3324 Convert URLs to HTML links. 

3325 encoding : str, default "utf-8" 

3326 Set character encoding. 

3327 %(returns)s 

3328 See Also 

3329 -------- 

3330 to_string : Convert DataFrame to a string. 

3331 

3332 Examples 

3333 -------- 

3334 >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]}) 

3335 >>> html_string = '''<table border="1" class="dataframe"> 

3336 ... <thead> 

3337 ... <tr style="text-align: right;"> 

3338 ... <th></th> 

3339 ... <th>col1</th> 

3340 ... <th>col2</th> 

3341 ... </tr> 

3342 ... </thead> 

3343 ... <tbody> 

3344 ... <tr> 

3345 ... <th>0</th> 

3346 ... <td>1</td> 

3347 ... <td>4</td> 

3348 ... </tr> 

3349 ... <tr> 

3350 ... <th>1</th> 

3351 ... <td>2</td> 

3352 ... <td>3</td> 

3353 ... </tr> 

3354 ... </tbody> 

3355 ... </table>''' 

3356 >>> assert html_string == df.to_html() 

3357 """ 

3358 if justify is not None and justify not in fmt.VALID_JUSTIFY_PARAMETERS: 

3359 raise ValueError("Invalid value for justify parameter") 

3360 

3361 formatter = fmt.DataFrameFormatter( 

3362 self, 

3363 columns=columns, 

3364 col_space=col_space, 

3365 na_rep=na_rep, 

3366 header=header, 

3367 index=index, 

3368 formatters=formatters, 

3369 float_format=float_format, 

3370 bold_rows=bold_rows, 

3371 sparsify=sparsify, 

3372 justify=justify, 

3373 index_names=index_names, 

3374 escape=escape, 

3375 decimal=decimal, 

3376 max_rows=max_rows, 

3377 max_cols=max_cols, 

3378 show_dimensions=show_dimensions, 

3379 ) 

3380 # TODO: a generic formatter would be in DataFrameFormatter 

3381 return fmt.DataFrameRenderer(formatter).to_html( 

3382 buf=buf, 

3383 classes=classes, 

3384 notebook=notebook, 

3385 border=border, 

3386 encoding=encoding, 

3387 table_id=table_id, 

3388 render_links=render_links, 

3389 ) 

3390 
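A hedged usage sketch (an editorial addition, not part of frame.py) showing how the to_html keywords documented above combine; the data and output file name are illustrative only.

import pandas as pd

# Assumed illustrative data; any small frame behaves the same way.
df = pd.DataFrame({"name": ["Alice", "Bob"],
                   "url": ["https://a.example", "https://b.example"]})

# Return the table as a string, attaching CSS classes and an id and
# rendering the URL column as clickable links.
html = df.to_html(classes=["table", "table-striped"], table_id="people",
                  render_links=True)

# Or write directly to a file-like buffer instead of returning a string.
with open("report.html", "w", encoding="utf-8") as buf:  # illustrative path
    df.to_html(buf=buf, index=False, na_rep="-")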

3391 @overload 

3392 def to_xml( 

3393 self, 

3394 path_or_buffer: None = ..., 

3395 *, 

3396 index: bool = ..., 

3397 root_name: str | None = ..., 

3398 row_name: str | None = ..., 

3399 na_rep: str | None = ..., 

3400 attr_cols: list[str] | None = ..., 

3401 elem_cols: list[str] | None = ..., 

3402 namespaces: dict[str | None, str] | None = ..., 

3403 prefix: str | None = ..., 

3404 encoding: str = ..., 

3405 xml_declaration: bool | None = ..., 

3406 pretty_print: bool | None = ..., 

3407 parser: XMLParsers | None = ..., 

3408 stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ..., 

3409 compression: CompressionOptions = ..., 

3410 storage_options: StorageOptions | None = ..., 

3411 ) -> str: 

3412 ... 

3413 

3414 @overload 

3415 def to_xml( 

3416 self, 

3417 path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str], 

3418 *, 

3419 index: bool = ..., 

3420 root_name: str | None = ..., 

3421 row_name: str | None = ..., 

3422 na_rep: str | None = ..., 

3423 attr_cols: list[str] | None = ..., 

3424 elem_cols: list[str] | None = ..., 

3425 namespaces: dict[str | None, str] | None = ..., 

3426 prefix: str | None = ..., 

3427 encoding: str = ..., 

3428 xml_declaration: bool | None = ..., 

3429 pretty_print: bool | None = ..., 

3430 parser: XMLParsers | None = ..., 

3431 stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = ..., 

3432 compression: CompressionOptions = ..., 

3433 storage_options: StorageOptions | None = ..., 

3434 ) -> None: 

3435 ... 

3436 

3437 @deprecate_nonkeyword_arguments( 

3438 version="3.0", allowed_args=["self", "path_or_buffer"], name="to_xml" 

3439 ) 

3440 @doc( 

3441 storage_options=_shared_docs["storage_options"], 

3442 compression_options=_shared_docs["compression_options"] % "path_or_buffer", 

3443 ) 

3444 def to_xml( 

3445 self, 

3446 path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, 

3447 index: bool = True, 

3448 root_name: str | None = "data", 

3449 row_name: str | None = "row", 

3450 na_rep: str | None = None, 

3451 attr_cols: list[str] | None = None, 

3452 elem_cols: list[str] | None = None, 

3453 namespaces: dict[str | None, str] | None = None, 

3454 prefix: str | None = None, 

3455 encoding: str = "utf-8", 

3456 xml_declaration: bool | None = True, 

3457 pretty_print: bool | None = True, 

3458 parser: XMLParsers | None = "lxml", 

3459 stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None, 

3460 compression: CompressionOptions = "infer", 

3461 storage_options: StorageOptions | None = None, 

3462 ) -> str | None: 

3463 """ 

3464 Render a DataFrame to an XML document. 

3465 

3466 .. versionadded:: 1.3.0 

3467 

3468 Parameters 

3469 ---------- 

3470 path_or_buffer : str, path object, file-like object, or None, default None 

3471 String, path object (implementing ``os.PathLike[str]``), or file-like 

3472 object implementing a ``write()`` function. If None, the result is returned 

3473 as a string. 

3474 index : bool, default True 

3475 Whether to include index in XML document. 

3476 root_name : str, default 'data' 

3477 The name of the root element in the XML document. 

3478 row_name : str, default 'row' 

3479 The name of the row element in the XML document. 

3480 na_rep : str, optional 

3481 Missing data representation. 

3482 attr_cols : list-like, optional 

3483 List of columns to write as attributes in row element. 

3484 Hierarchical columns will be flattened with underscore 

3485 delimiting the different levels. 

3486 elem_cols : list-like, optional 

3487 List of columns to write as children in row element. By default, 

3488 all columns are output as children of the row element. Hierarchical 

3489 columns will be flattened with underscore delimiting the 

3490 different levels. 

3491 namespaces : dict, optional 

3492 All namespaces to be defined in the root element. Keys of the dict 

3493 should be prefix names and the values the corresponding URIs. 

3494 Default namespaces should be given empty string key. For 

3495 example, :: 

3496 

3497 namespaces = {{"": "https://example.com"}} 

3498 

3499 prefix : str, optional 

3500 Namespace prefix to be used for every element and/or attribute 

3501 in document. This should be one of the keys in ``namespaces`` 

3502 dict. 

3503 encoding : str, default 'utf-8' 

3504 Encoding of the resulting document. 

3505 xml_declaration : bool, default True 

3506 Whether to include the XML declaration at start of document. 

3507 pretty_print : bool, default True 

3508 Whether output should be pretty printed with indentation and 

3509 line breaks. 

3510 parser : {{'lxml','etree'}}, default 'lxml' 

3511 Parser module to use for building the tree. Only 'lxml' and 

3512 'etree' are supported. With 'lxml', the ability to use an XSLT 

3513 stylesheet is supported. 

3514 stylesheet : str, path object or file-like object, optional 

3515 A URL, file-like object, or a raw string containing an XSLT 

3516 script used to transform the raw XML output. Script should use 

3517 layout of elements and attributes from original output. This 

3518 argument requires ``lxml`` to be installed. Only XSLT 1.0 

3519 scripts are currently supported; later versions are not. 

3520 {compression_options} 

3521 

3522 .. versionchanged:: 1.4.0 Zstandard support. 

3523 

3524 {storage_options} 

3525 

3526 Returns 

3527 ------- 

3528 None or str 

3529 If ``path_or_buffer`` is None, returns the resulting XML document as a 

3530 string. Otherwise returns None. 

3531 

3532 See Also 

3533 -------- 

3534 to_json : Convert the pandas object to a JSON string. 

3535 to_html : Convert DataFrame to HTML. 

3536 

3537 Examples 

3538 -------- 

3539 >>> df = pd.DataFrame({{'shape': ['square', 'circle', 'triangle'], 

3540 ... 'degrees': [360, 360, 180], 

3541 ... 'sides': [4, np.nan, 3]}}) 

3542 

3543 >>> df.to_xml() # doctest: +SKIP 

3544 <?xml version='1.0' encoding='utf-8'?> 

3545 <data> 

3546 <row> 

3547 <index>0</index> 

3548 <shape>square</shape> 

3549 <degrees>360</degrees> 

3550 <sides>4.0</sides> 

3551 </row> 

3552 <row> 

3553 <index>1</index> 

3554 <shape>circle</shape> 

3555 <degrees>360</degrees> 

3556 <sides/> 

3557 </row> 

3558 <row> 

3559 <index>2</index> 

3560 <shape>triangle</shape> 

3561 <degrees>180</degrees> 

3562 <sides>3.0</sides> 

3563 </row> 

3564 </data> 

3565 

3566 >>> df.to_xml(attr_cols=[ 

3567 ... 'index', 'shape', 'degrees', 'sides' 

3568 ... ]) # doctest: +SKIP 

3569 <?xml version='1.0' encoding='utf-8'?> 

3570 <data> 

3571 <row index="0" shape="square" degrees="360" sides="4.0"/> 

3572 <row index="1" shape="circle" degrees="360"/> 

3573 <row index="2" shape="triangle" degrees="180" sides="3.0"/> 

3574 </data> 

3575 

3576 >>> df.to_xml(namespaces={{"doc": "https://example.com"}}, 

3577 ... prefix="doc") # doctest: +SKIP 

3578 <?xml version='1.0' encoding='utf-8'?> 

3579 <doc:data xmlns:doc="https://example.com"> 

3580 <doc:row> 

3581 <doc:index>0</doc:index> 

3582 <doc:shape>square</doc:shape> 

3583 <doc:degrees>360</doc:degrees> 

3584 <doc:sides>4.0</doc:sides> 

3585 </doc:row> 

3586 <doc:row> 

3587 <doc:index>1</doc:index> 

3588 <doc:shape>circle</doc:shape> 

3589 <doc:degrees>360</doc:degrees> 

3590 <doc:sides/> 

3591 </doc:row> 

3592 <doc:row> 

3593 <doc:index>2</doc:index> 

3594 <doc:shape>triangle</doc:shape> 

3595 <doc:degrees>180</doc:degrees> 

3596 <doc:sides>3.0</doc:sides> 

3597 </doc:row> 

3598 </doc:data> 

3599 """ 

3600 

3601 from pandas.io.formats.xml import ( 

3602 EtreeXMLFormatter, 

3603 LxmlXMLFormatter, 

3604 ) 

3605 

3606 lxml = import_optional_dependency("lxml.etree", errors="ignore") 

3607 

3608 TreeBuilder: type[EtreeXMLFormatter | LxmlXMLFormatter] 

3609 

3610 if parser == "lxml": 

3611 if lxml is not None: 

3612 TreeBuilder = LxmlXMLFormatter 

3613 else: 

3614 raise ImportError( 

3615 "lxml not found, please install or use the etree parser." 

3616 ) 

3617 

3618 elif parser == "etree": 

3619 TreeBuilder = EtreeXMLFormatter 

3620 

3621 else: 

3622 raise ValueError("Values for parser can only be lxml or etree.") 

3623 

3624 xml_formatter = TreeBuilder( 

3625 self, 

3626 path_or_buffer=path_or_buffer, 

3627 index=index, 

3628 root_name=root_name, 

3629 row_name=row_name, 

3630 na_rep=na_rep, 

3631 attr_cols=attr_cols, 

3632 elem_cols=elem_cols, 

3633 namespaces=namespaces, 

3634 prefix=prefix, 

3635 encoding=encoding, 

3636 xml_declaration=xml_declaration, 

3637 pretty_print=pretty_print, 

3638 stylesheet=stylesheet, 

3639 compression=compression, 

3640 storage_options=storage_options, 

3641 ) 

3642 

3643 return xml_formatter.write_output() 

3644 
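A brief editorial sketch (not part of frame.py) of to_xml in practice: writing a gzip-compressed file (compression inferred from the ".gz" suffix) with the stdlib "etree" parser so the optional lxml dependency is not required; the file name is illustrative.

import numpy as np
import pandas as pd

df = pd.DataFrame({"shape": ["square", "circle"], "sides": [4, np.nan]})

# Missing values are rendered as the na_rep text; the root element is renamed.
df.to_xml("shapes.xml.gz", parser="etree", na_rep="missing", root_name="shapes")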

3645 # ---------------------------------------------------------------------- 

3646 @doc(INFO_DOCSTRING, **frame_sub_kwargs) 

3647 def info( 

3648 self, 

3649 verbose: bool | None = None, 

3650 buf: WriteBuffer[str] | None = None, 

3651 max_cols: int | None = None, 

3652 memory_usage: bool | str | None = None, 

3653 show_counts: bool | None = None, 

3654 ) -> None: 

3655 info = DataFrameInfo( 

3656 data=self, 

3657 memory_usage=memory_usage, 

3658 ) 

3659 info.render( 

3660 buf=buf, 

3661 max_cols=max_cols, 

3662 verbose=verbose, 

3663 show_counts=show_counts, 

3664 ) 

3665 
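A short editorial sketch: info() prints to sys.stdout by default, but the buf keyword lets the rendered summary be captured as a string, and memory_usage="deep" adds object-dtype introspection. The frame used here is illustrative.

from io import StringIO

import pandas as pd

df = pd.DataFrame({"a": range(1000), "b": ["x"] * 1000})
buf = StringIO()
df.info(buf=buf, memory_usage="deep", show_counts=True)
summary = buf.getvalue()  # the rendered summary as a plain string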

3666 def memory_usage(self, index: bool = True, deep: bool = False) -> Series: 

3667 """ 

3668 Return the memory usage of each column in bytes. 

3669 

3670 The memory usage can optionally include the contribution of 

3671 the index and elements of `object` dtype. 

3672 

3673 This value is displayed in `DataFrame.info` by default. This can be 

3674 suppressed by setting ``pandas.options.display.memory_usage`` to False. 

3675 

3676 Parameters 

3677 ---------- 

3678 index : bool, default True 

3679 Specifies whether to include the memory usage of the DataFrame's 

3680 index in returned Series. If ``index=True``, the memory usage of 

3681 the index is the first item in the output. 

3682 deep : bool, default False 

3683 If True, introspect the data deeply by interrogating 

3684 `object` dtypes for system-level memory consumption, and include 

3685 it in the returned values. 

3686 

3687 Returns 

3688 ------- 

3689 Series 

3690 A Series whose index is the original column names and whose values 

3691 are the memory usage of each column in bytes. 

3692 

3693 See Also 

3694 -------- 

3695 numpy.ndarray.nbytes : Total bytes consumed by the elements of an 

3696 ndarray. 

3697 Series.memory_usage : Bytes consumed by a Series. 

3698 Categorical : Memory-efficient array for string values with 

3699 many repeated values. 

3700 DataFrame.info : Concise summary of a DataFrame. 

3701 

3702 Notes 

3703 ----- 

3704 See the :ref:`Frequently Asked Questions <df-memory-usage>` for more 

3705 details. 

3706 

3707 Examples 

3708 -------- 

3709 >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool'] 

3710 >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t)) 

3711 ... for t in dtypes]) 

3712 >>> df = pd.DataFrame(data) 

3713 >>> df.head() 

3714 int64 float64 complex128 object bool 

3715 0 1 1.0 1.0+0.0j 1 True 

3716 1 1 1.0 1.0+0.0j 1 True 

3717 2 1 1.0 1.0+0.0j 1 True 

3718 3 1 1.0 1.0+0.0j 1 True 

3719 4 1 1.0 1.0+0.0j 1 True 

3720 

3721 >>> df.memory_usage() 

3722 Index 128 

3723 int64 40000 

3724 float64 40000 

3725 complex128 80000 

3726 object 40000 

3727 bool 5000 

3728 dtype: int64 

3729 

3730 >>> df.memory_usage(index=False) 

3731 int64 40000 

3732 float64 40000 

3733 complex128 80000 

3734 object 40000 

3735 bool 5000 

3736 dtype: int64 

3737 

3738 The memory footprint of `object` dtype columns is ignored by default: 

3739 

3740 >>> df.memory_usage(deep=True) 

3741 Index 128 

3742 int64 40000 

3743 float64 40000 

3744 complex128 80000 

3745 object 180000 

3746 bool 5000 

3747 dtype: int64 

3748 

3749 Use a Categorical for efficient storage of an object-dtype column with 

3750 many repeated values. 

3751 

3752 >>> df['object'].astype('category').memory_usage(deep=True) 

3753 5244 

3754 """ 

3755 result = self._constructor_sliced( 

3756 [c.memory_usage(index=False, deep=deep) for col, c in self.items()], 

3757 index=self.columns, 

3758 dtype=np.intp, 

3759 ) 

3760 if index: 

3761 index_memory_usage = self._constructor_sliced( 

3762 self.index.memory_usage(deep=deep), index=["Index"] 

3763 ) 

3764 result = index_memory_usage._append(result) 

3765 return result 

3766 
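An editorial sketch of the Series returned above: summing it gives a total footprint, and deep=True adds the payload of object-dtype columns rather than just their pointers. The data are illustrative.

import pandas as pd

df = pd.DataFrame({"ints": range(10_000), "strs": ["some text"] * 10_000})

shallow = df.memory_usage(index=True).sum()
deep = df.memory_usage(index=True, deep=True).sum()
assert deep >= shallow  # object columns only count pointers unless deep=True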

3767 def transpose(self, *args, copy: bool = False) -> DataFrame: 

3768 """ 

3769 Transpose index and columns. 

3770 

3771 Reflect the DataFrame over its main diagonal by writing rows as columns 

3772 and vice-versa. The property :attr:`.T` is an accessor to the method 

3773 :meth:`transpose`. 

3774 

3775 Parameters 

3776 ---------- 

3777 *args : tuple, optional 

3778 Accepted for compatibility with NumPy. 

3779 copy : bool, default False 

3780 Whether to copy the data after transposing, even for DataFrames 

3781 with a single dtype. 

3782 

3783 Note that a copy is always required for mixed dtype DataFrames, 

3784 or for DataFrames with any extension types. 

3785 

3786 .. note:: 

3787 The `copy` keyword will change behavior in pandas 3.0. 

3788 `Copy-on-Write 

3789 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__ 

3790 will be enabled by default, which means that all methods with a 

3791 `copy` keyword will use a lazy copy mechanism to defer the copy and 

3792 ignore the `copy` keyword. The `copy` keyword will be removed in a 

3793 future version of pandas. 

3794 

3795 You can already get the future behavior and improvements through 

3796 enabling copy on write ``pd.options.mode.copy_on_write = True`` 

3797 

3798 Returns 

3799 ------- 

3800 DataFrame 

3801 The transposed DataFrame. 

3802 

3803 See Also 

3804 -------- 

3805 numpy.transpose : Permute the dimensions of a given array. 

3806 

3807 Notes 

3808 ----- 

3809 Transposing a DataFrame with mixed dtypes will result in a homogeneous 

3810 DataFrame with the `object` dtype. In such a case, a copy of the data 

3811 is always made. 

3812 

3813 Examples 

3814 -------- 

3815 **Square DataFrame with homogeneous dtype** 

3816 

3817 >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} 

3818 >>> df1 = pd.DataFrame(data=d1) 

3819 >>> df1 

3820 col1 col2 

3821 0 1 3 

3822 1 2 4 

3823 

3824 >>> df1_transposed = df1.T # or df1.transpose() 

3825 >>> df1_transposed 

3826 0 1 

3827 col1 1 2 

3828 col2 3 4 

3829 

3830 When the dtype is homogeneous in the original DataFrame, we get a 

3831 transposed DataFrame with the same dtype: 

3832 

3833 >>> df1.dtypes 

3834 col1 int64 

3835 col2 int64 

3836 dtype: object 

3837 >>> df1_transposed.dtypes 

3838 0 int64 

3839 1 int64 

3840 dtype: object 

3841 

3842 **Non-square DataFrame with mixed dtypes** 

3843 

3844 >>> d2 = {'name': ['Alice', 'Bob'], 

3845 ... 'score': [9.5, 8], 

3846 ... 'employed': [False, True], 

3847 ... 'kids': [0, 0]} 

3848 >>> df2 = pd.DataFrame(data=d2) 

3849 >>> df2 

3850 name score employed kids 

3851 0 Alice 9.5 False 0 

3852 1 Bob 8.0 True 0 

3853 

3854 >>> df2_transposed = df2.T # or df2.transpose() 

3855 >>> df2_transposed 

3856 0 1 

3857 name Alice Bob 

3858 score 9.5 8.0 

3859 employed False True 

3860 kids 0 0 

3861 

3862 When the DataFrame has mixed dtypes, we get a transposed DataFrame with 

3863 the `object` dtype: 

3864 

3865 >>> df2.dtypes 

3866 name object 

3867 score float64 

3868 employed bool 

3869 kids int64 

3870 dtype: object 

3871 >>> df2_transposed.dtypes 

3872 0 object 

3873 1 object 

3874 dtype: object 

3875 """ 

3876 nv.validate_transpose(args, {}) 

3877 # construct the args 

3878 

3879 dtypes = list(self.dtypes) 

3880 

3881 if self._can_fast_transpose: 

3882 # Note: tests pass without this, but this improves perf quite a bit. 

3883 new_vals = self._values.T 

3884 if copy and not using_copy_on_write(): 

3885 new_vals = new_vals.copy() 

3886 

3887 result = self._constructor( 

3888 new_vals, 

3889 index=self.columns, 

3890 columns=self.index, 

3891 copy=False, 

3892 dtype=new_vals.dtype, 

3893 ) 

3894 if using_copy_on_write() and len(self) > 0: 

3895 result._mgr.add_references(self._mgr) # type: ignore[arg-type] 

3896 

3897 elif ( 

3898 self._is_homogeneous_type 

3899 and dtypes 

3900 and isinstance(dtypes[0], ExtensionDtype) 

3901 ): 

3902 new_values: list 

3903 if isinstance(dtypes[0], BaseMaskedDtype): 

3904 # We have masked arrays with the same dtype. We can transpose faster. 

3905 from pandas.core.arrays.masked import ( 

3906 transpose_homogeneous_masked_arrays, 

3907 ) 

3908 

3909 new_values = transpose_homogeneous_masked_arrays( 

3910 cast(Sequence[BaseMaskedArray], self._iter_column_arrays()) 

3911 ) 

3912 elif isinstance(dtypes[0], ArrowDtype): 

3913 # We have arrow EAs with the same dtype. We can transpose faster. 

3914 from pandas.core.arrays.arrow.array import ( 

3915 ArrowExtensionArray, 

3916 transpose_homogeneous_pyarrow, 

3917 ) 

3918 

3919 new_values = transpose_homogeneous_pyarrow( 

3920 cast(Sequence[ArrowExtensionArray], self._iter_column_arrays()) 

3921 ) 

3922 else: 

3923 # We have other EAs with the same dtype. We preserve dtype in transpose. 

3924 dtyp = dtypes[0] 

3925 arr_typ = dtyp.construct_array_type() 

3926 values = self.values 

3927 new_values = [arr_typ._from_sequence(row, dtype=dtyp) for row in values] 

3928 

3929 result = type(self)._from_arrays( 

3930 new_values, 

3931 index=self.columns, 

3932 columns=self.index, 

3933 verify_integrity=False, 

3934 ) 

3935 

3936 else: 

3937 new_arr = self.values.T 

3938 if copy and not using_copy_on_write(): 

3939 new_arr = new_arr.copy() 

3940 result = self._constructor( 

3941 new_arr, 

3942 index=self.columns, 

3943 columns=self.index, 

3944 dtype=new_arr.dtype, 

3945 # We already made a copy (more than one block) 

3946 copy=False, 

3947 ) 

3948 

3949 return result.__finalize__(self, method="transpose") 

3950 
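An editorial sketch of the dtype behaviour implemented above: a frame that is homogeneous in a single extension dtype keeps that dtype through the fast transpose paths, while mixed dtypes are upcast to object.

import pandas as pd

homogeneous = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, dtype="Int64")
assert (homogeneous.T.dtypes == "Int64").all()  # homogeneous EA path keeps dtype

mixed = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
assert (mixed.T.dtypes == object).all()  # mixed dtypes become object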

3951 @property 

3952 def T(self) -> DataFrame: 

3953 """ 

3954 The transpose of the DataFrame. 

3955 

3956 Returns 

3957 ------- 

3958 DataFrame 

3959 The transposed DataFrame. 

3960 

3961 See Also 

3962 -------- 

3963 DataFrame.transpose : Transpose index and columns. 

3964 

3965 Examples 

3966 -------- 

3967 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) 

3968 >>> df 

3969 col1 col2 

3970 0 1 3 

3971 1 2 4 

3972 

3973 >>> df.T 

3974 0 1 

3975 col1 1 2 

3976 col2 3 4 

3977 """ 

3978 return self.transpose() 

3979 

3980 # ---------------------------------------------------------------------- 

3981 # Indexing Methods 

3982 

3983 def _ixs(self, i: int, axis: AxisInt = 0) -> Series: 

3984 """ 

3985 Parameters 

3986 ---------- 

3987 i : int 

3988 axis : int 

3989 

3990 Returns 

3991 ------- 

3992 Series 

3993 """ 

3994 # irow 

3995 if axis == 0: 

3996 new_mgr = self._mgr.fast_xs(i) 

3997 

3998 # if we are a copy, mark as such 

3999 copy = isinstance(new_mgr.array, np.ndarray) and new_mgr.array.base is None 

4000 result = self._constructor_sliced_from_mgr(new_mgr, axes=new_mgr.axes) 

4001 result._name = self.index[i] 

4002 result = result.__finalize__(self) 

4003 result._set_is_copy(self, copy=copy) 

4004 return result 

4005 

4006 # icol 

4007 else: 

4008 label = self.columns[i] 

4009 

4010 col_mgr = self._mgr.iget(i) 

4011 result = self._box_col_values(col_mgr, i) 

4012 

4013 # this is a cached value, mark it so 

4014 result._set_as_cached(label, self) 

4015 return result 

4016 

4017 def _get_column_array(self, i: int) -> ArrayLike: 

4018 """ 

4019 Get the values of the i'th column (ndarray or ExtensionArray, as stored 

4020 in the Block) 

4021 

4022 Warning! The returned array is a view but doesn't handle Copy-on-Write, 

4023 so this should be used with caution (for read-only purposes). 

4024 """ 

4025 return self._mgr.iget_values(i) 

4026 

4027 def _iter_column_arrays(self) -> Iterator[ArrayLike]: 

4028 """ 

4029 Iterate over the arrays of all columns in order. 

4030 This returns the values as stored in the Block (ndarray or ExtensionArray). 

4031 

4032 Warning! The returned array is a view but doesn't handle Copy-on-Write, 

4033 so this should be used with caution (for read-only purposes). 

4034 """ 

4035 if isinstance(self._mgr, ArrayManager): 

4036 yield from self._mgr.arrays 

4037 else: 

4038 for i in range(len(self.columns)): 

4039 yield self._get_column_array(i) 

4040 

4041 def _getitem_nocopy(self, key: list): 

4042 """ 

4043 Behaves like __getitem__, but returns a view in cases where __getitem__ 

4044 would make a copy. 

4045 """ 

4046 # TODO(CoW): can be removed if/when we are always Copy-on-Write 

4047 indexer = self.columns._get_indexer_strict(key, "columns")[1] 

4048 new_axis = self.columns[indexer] 

4049 

4050 new_mgr = self._mgr.reindex_indexer( 

4051 new_axis, 

4052 indexer, 

4053 axis=0, 

4054 allow_dups=True, 

4055 copy=False, 

4056 only_slice=True, 

4057 ) 

4058 result = self._constructor_from_mgr(new_mgr, axes=new_mgr.axes) 

4059 result = result.__finalize__(self) 

4060 return result 

4061 

4062 def __getitem__(self, key): 

4063 check_dict_or_set_indexers(key) 

4064 key = lib.item_from_zerodim(key) 

4065 key = com.apply_if_callable(key, self) 

4066 

4067 if is_hashable(key) and not is_iterator(key): 

4068 # is_iterator to exclude generator e.g. test_getitem_listlike 

4069 # shortcut if the key is in columns 

4070 is_mi = isinstance(self.columns, MultiIndex) 

4071 # GH#45316 Return view if key is not duplicated 

4072 # Only use drop_duplicates with duplicates for performance 

4073 if not is_mi and ( 

4074 self.columns.is_unique 

4075 and key in self.columns 

4076 or key in self.columns.drop_duplicates(keep=False) 

4077 ): 

4078 return self._get_item_cache(key) 

4079 

4080 elif is_mi and self.columns.is_unique and key in self.columns: 

4081 return self._getitem_multilevel(key) 

4082 

4083 # Do we have a slicer (on rows)? 

4084 if isinstance(key, slice): 

4085 return self._getitem_slice(key) 

4086 

4087 # Do we have a (boolean) DataFrame? 

4088 if isinstance(key, DataFrame): 

4089 return self.where(key) 

4090 

4091 # Do we have a (boolean) 1d indexer? 

4092 if com.is_bool_indexer(key): 

4093 return self._getitem_bool_array(key) 

4094 

4095 # We are left with two options: a single key, and a collection of keys, 

4096 # We interpret tuples as collections only for non-MultiIndex 

4097 is_single_key = isinstance(key, tuple) or not is_list_like(key) 

4098 

4099 if is_single_key: 

4100 if self.columns.nlevels > 1: 

4101 return self._getitem_multilevel(key) 

4102 indexer = self.columns.get_loc(key) 

4103 if is_integer(indexer): 

4104 indexer = [indexer] 

4105 else: 

4106 if is_iterator(key): 

4107 key = list(key) 

4108 indexer = self.columns._get_indexer_strict(key, "columns")[1] 

4109 

4110 # take() does not accept boolean indexers 

4111 if getattr(indexer, "dtype", None) == bool: 

4112 indexer = np.where(indexer)[0] 

4113 

4114 if isinstance(indexer, slice): 

4115 return self._slice(indexer, axis=1) 

4116 

4117 data = self._take_with_is_copy(indexer, axis=1) 

4118 

4119 if is_single_key: 

4120 # What does looking for a single key in a non-unique index return? 

4121 # The behavior is inconsistent. It returns a Series, except when 

4122 # - the key itself is repeated (test on data.shape, #9519), or 

4123 # - we have a MultiIndex on columns (test on self.columns, #21309) 

4124 if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex): 

4125 # GH#26490 using data[key] can cause RecursionError 

4126 return data._get_item_cache(key) 

4127 

4128 return data 

4129 
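An editorial sketch of the dispatch above: a hashable key returns a (cached) Series, a list of labels returns a column-subset DataFrame, a boolean array selects rows, and a slice is handled by the row-slicing path.

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

s = df["a"]             # single hashable key -> Series
sub = df[["a", "b"]]    # list of labels -> DataFrame subset
rows = df[df["a"] > 1]  # boolean Series -> row selection
head = df[0:2]          # slice -> row slice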

4130 def _getitem_bool_array(self, key): 

4131 # also raises Exception if object array with NA values 

4132 # warning here just in case -- previously __setitem__ was 

4133 # reindexing but __getitem__ was not; it seems more reasonable to 

4134 # go with the __setitem__ behavior since that is more consistent 

4135 # with all other indexing behavior 

4136 if isinstance(key, Series) and not key.index.equals(self.index): 

4137 warnings.warn( 

4138 "Boolean Series key will be reindexed to match DataFrame index.", 

4139 UserWarning, 

4140 stacklevel=find_stack_level(), 

4141 ) 

4142 elif len(key) != len(self.index): 

4143 raise ValueError( 

4144 f"Item wrong length {len(key)} instead of {len(self.index)}." 

4145 ) 

4146 

4147 # check_bool_indexer will throw exception if Series key cannot 

4148 # be reindexed to match DataFrame rows 

4149 key = check_bool_indexer(self.index, key) 

4150 

4151 if key.all(): 

4152 return self.copy(deep=None) 

4153 

4154 indexer = key.nonzero()[0] 

4155 return self._take_with_is_copy(indexer, axis=0) 

4156 

4157 def _getitem_multilevel(self, key): 

4158 # self.columns is a MultiIndex 

4159 loc = self.columns.get_loc(key) 

4160 if isinstance(loc, (slice, np.ndarray)): 

4161 new_columns = self.columns[loc] 

4162 result_columns = maybe_droplevels(new_columns, key) 

4163 result = self.iloc[:, loc] 

4164 result.columns = result_columns 

4165 

4166 # If there is only one column being returned, and its name is 

4167 # either an empty string, or a tuple with an empty string as its 

4168 # first element, then treat the empty string as a placeholder 

4169 # and return the column as if the user had provided that empty 

4170 # string in the key. If the result is a Series, exclude the 

4171 # implied empty string from its name. 

4172 if len(result.columns) == 1: 

4173 # e.g. test_frame_getitem_multicolumn_empty_level, 

4174 # test_frame_mixed_depth_get, test_loc_setitem_single_column_slice 

4175 top = result.columns[0] 

4176 if isinstance(top, tuple): 

4177 top = top[0] 

4178 if top == "": 

4179 result = result[""] 

4180 if isinstance(result, Series): 

4181 result = self._constructor_sliced( 

4182 result, index=self.index, name=key 

4183 ) 

4184 

4185 result._set_is_copy(self) 

4186 return result 

4187 else: 

4188 # loc is neither a slice nor ndarray, so must be an int 

4189 return self._ixs(loc, axis=1) 

4190 
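An editorial sketch of the MultiIndex path above: selecting a top-level key drops that level and returns a sub-frame, while a full tuple addresses a single column.

import pandas as pd

cols = pd.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("y", "a")])
df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=cols)

sub = df["x"]            # DataFrame with columns ['a', 'b'] (level dropped)
series = df[("y", "a")]  # full tuple -> Series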

4191 def _get_value(self, index, col, takeable: bool = False) -> Scalar: 

4192 """ 

4193 Quickly retrieve single value at passed column and index. 

4194 

4195 Parameters 

4196 ---------- 

4197 index : row label 

4198 col : column label 

4199 takeable : interpret the index/col as indexers, default False 

4200 

4201 Returns 

4202 ------- 

4203 scalar 

4204 

4205 Notes 

4206 ----- 

4207 Assumes that both `self.index._index_as_unique` and 

4208 `self.columns._index_as_unique` hold; the caller is responsible for checking. 

4209 """ 

4210 if takeable: 

4211 series = self._ixs(col, axis=1) 

4212 return series._values[index] 

4213 

4214 series = self._get_item_cache(col) 

4215 engine = self.index._engine 

4216 

4217 if not isinstance(self.index, MultiIndex): 

4218 # CategoricalIndex: Trying to use the engine fastpath may give incorrect 

4219 # results if our categories are integers that don't match our codes 

4220 # IntervalIndex: IntervalTree has no get_loc 

4221 row = self.index.get_loc(index) 

4222 return series._values[row] 

4223 

4224 # For MultiIndex going through engine effectively restricts us to 

4225 # same-length tuples; see test_get_set_value_no_partial_indexing 

4226 loc = engine.get_loc(index) 

4227 return series._values[loc] 

4228 
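As an editorial aside, _get_value() is the fast scalar-lookup path behind the public .at and .iat accessors; a minimal sketch using the public API:

import pandas as pd

df = pd.DataFrame({"a": [1, 2]}, index=["r0", "r1"])

assert df.at["r1", "a"] == 2  # label-based scalar access
assert df.iat[1, 0] == 2      # positional access (the takeable=True path)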

4229 def isetitem(self, loc, value) -> None: 

4230 """ 

4231 Set the given value in the column with position `loc`. 

4232 

4233 This is a positional analogue to ``__setitem__``. 

4234 

4235 Parameters 

4236 ---------- 

4237 loc : int or sequence of ints 

4238 Index position for the column. 

4239 value : scalar or arraylike 

4240 Value(s) for the column. 

4241 

4242 Notes 

4243 ----- 

4244 ``frame.isetitem(loc, value)`` is an in-place method as it will 

4245 modify the DataFrame in place (not returning a new object). In contrast to 

4246 ``frame.iloc[:, i] = value`` which will try to update the existing values in 

4247 place, ``frame.isetitem(loc, value)`` will not update the values of the column 

4248 itself in place, it will instead insert a new array. 

4249 

4250 In cases where ``frame.columns`` is unique, this is equivalent to 

4251 ``frame[frame.columns[i]] = value``. 

4252 """ 

4253 if isinstance(value, DataFrame): 

4254 if is_integer(loc): 

4255 loc = [loc] 

4256 

4257 if len(loc) != len(value.columns): 

4258 raise ValueError( 

4259 f"Got {len(loc)} positions but value has {len(value.columns)} " 

4260 f"columns." 

4261 ) 

4262 

4263 for i, idx in enumerate(loc): 

4264 arraylike, refs = self._sanitize_column(value.iloc[:, i]) 

4265 self._iset_item_mgr(idx, arraylike, inplace=False, refs=refs) 

4266 return 

4267 

4268 arraylike, refs = self._sanitize_column(value) 

4269 self._iset_item_mgr(loc, arraylike, inplace=False, refs=refs) 

4270 
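A brief editorial sketch of isetitem(): it addresses columns by position and always inserts a new array, in contrast to iloc-based assignment which may write into the existing values.

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})

df.isetitem(1, [30, 40])  # replace the column at position 1 ("b")
# A DataFrame value must supply one column per position in `loc`.
df.isetitem([0, 1], pd.DataFrame({"x": [9, 9], "y": [8, 8]}))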

4271 def __setitem__(self, key, value) -> None: 

4272 if not PYPY and using_copy_on_write(): 

4273 if sys.getrefcount(self) <= 3: 

4274 warnings.warn( 

4275 _chained_assignment_msg, ChainedAssignmentError, stacklevel=2 

4276 ) 

4277 elif not PYPY and not using_copy_on_write(): 

4278 if sys.getrefcount(self) <= 3 and ( 

4279 warn_copy_on_write() 

4280 or ( 

4281 not warn_copy_on_write() 

4282 and any(b.refs.has_reference() for b in self._mgr.blocks) # type: ignore[union-attr] 

4283 ) 

4284 ): 

4285 warnings.warn( 

4286 _chained_assignment_warning_msg, FutureWarning, stacklevel=2 

4287 ) 

4288 

4289 key = com.apply_if_callable(key, self) 

4290 

4291 # see if we can slice the rows 

4292 if isinstance(key, slice): 

4293 slc = self.index._convert_slice_indexer(key, kind="getitem") 

4294 return self._setitem_slice(slc, value) 

4295 

4296 if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2: 

4297 self._setitem_frame(key, value) 

4298 elif isinstance(key, (Series, np.ndarray, list, Index)): 

4299 self._setitem_array(key, value) 

4300 elif isinstance(value, DataFrame): 

4301 self._set_item_frame_value(key, value) 

4302 elif ( 

4303 is_list_like(value) 

4304 and not self.columns.is_unique 

4305 and 1 < len(self.columns.get_indexer_for([key])) == len(value) 

4306 ): 

4307 # Column to set is duplicated 

4308 self._setitem_array([key], value) 

4309 else: 

4310 # set column 

4311 self._set_item(key, value) 

4312 
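An editorial sketch of the __setitem__ branches above: a hashable key sets one column, a list-like key assigns column by column, and a boolean DataFrame key masks values.

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

df["c"] = df["a"] + df["b"]      # single new column
df[["a", "b"]] = df[["b", "a"]]  # list-like key, set column by column
df[df % 2 == 0] = 0              # boolean DataFrame key -> masked assignment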

4313 def _setitem_slice(self, key: slice, value) -> None: 

4314 # NB: we can't just use self.loc[key] = value because that 

4315 # operates on labels and we need to operate positional for 

4316 # backwards-compat, xref GH#31469 

4317 self._check_setitem_copy() 

4318 self.iloc[key] = value 

4319 

4320 def _setitem_array(self, key, value): 

4321 # also raises Exception if object array with NA values 

4322 if com.is_bool_indexer(key): 

4323 # bool indexer is indexing along rows 

4324 if len(key) != len(self.index): 

4325 raise ValueError( 

4326 f"Item wrong length {len(key)} instead of {len(self.index)}!" 

4327 ) 

4328 key = check_bool_indexer(self.index, key) 

4329 indexer = key.nonzero()[0] 

4330 self._check_setitem_copy() 

4331 if isinstance(value, DataFrame): 

4332 # GH#39931 reindex since iloc does not align 

4333 value = value.reindex(self.index.take(indexer)) 

4334 self.iloc[indexer] = value 

4335 

4336 else: 

4337 # Note: unlike self.iloc[:, indexer] = value, this will 

4338 # never try to overwrite values inplace 

4339 

4340 if isinstance(value, DataFrame): 

4341 check_key_length(self.columns, key, value) 

4342 for k1, k2 in zip(key, value.columns): 

4343 self[k1] = value[k2] 

4344 

4345 elif not is_list_like(value): 

4346 for col in key: 

4347 self[col] = value 

4348 

4349 elif isinstance(value, np.ndarray) and value.ndim == 2: 

4350 self._iset_not_inplace(key, value) 

4351 

4352 elif np.ndim(value) > 1: 

4353 # list of lists 

4354 value = DataFrame(value).values 

4355 return self._setitem_array(key, value) 

4356 

4357 else: 

4358 self._iset_not_inplace(key, value) 

4359 

4360 def _iset_not_inplace(self, key, value): 

4361 # GH#39510 when setting with df[key] = obj with a list-like key and 

4362 # list-like value, we iterate over those listlikes and set columns 

4363 # one at a time. This is different from dispatching to 

4364 # `self.loc[:, key]= value` because loc.__setitem__ may overwrite 

4365 # data inplace, whereas this will insert new arrays. 

4366 

4367 def igetitem(obj, i: int): 

4368 # Note: we catch DataFrame obj before getting here, but 

4369 # hypothetically would return obj.iloc[:, i] 

4370 if isinstance(obj, np.ndarray): 

4371 return obj[..., i] 

4372 else: 

4373 return obj[i] 

4374 

4375 if self.columns.is_unique: 

4376 if np.shape(value)[-1] != len(key): 

4377 raise ValueError("Columns must be same length as key") 

4378 

4379 for i, col in enumerate(key): 

4380 self[col] = igetitem(value, i) 

4381 

4382 else: 

4383 ilocs = self.columns.get_indexer_non_unique(key)[0] 

4384 if (ilocs < 0).any(): 

4385 # key entries not in self.columns 

4386 raise NotImplementedError 

4387 

4388 if np.shape(value)[-1] != len(ilocs): 

4389 raise ValueError("Columns must be same length as key") 

4390 

4391 assert np.ndim(value) <= 2 

4392 

4393 orig_columns = self.columns 

4394 

4395 # Using self.iloc[:, i] = ... may set values inplace, which 

4396 # by convention we do not do in __setitem__ 

4397 try: 

4398 self.columns = Index(range(len(self.columns))) 

4399 for i, iloc in enumerate(ilocs): 

4400 self[iloc] = igetitem(value, i) 

4401 finally: 

4402 self.columns = orig_columns 

4403 

4404 def _setitem_frame(self, key, value): 

4405 # support boolean setting with DataFrame input, e.g. 

4406 # df[df > df2] = 0 

4407 if isinstance(key, np.ndarray): 

4408 if key.shape != self.shape: 

4409 raise ValueError("Array conditional must be same shape as self") 

4410 key = self._constructor(key, **self._construct_axes_dict(), copy=False) 

4411 

4412 if key.size and not all(is_bool_dtype(dtype) for dtype in key.dtypes): 

4413 raise TypeError( 

4414 "Must pass DataFrame or 2-d ndarray with boolean values only" 

4415 ) 

4416 

4417 self._check_setitem_copy() 

4418 self._where(-key, value, inplace=True) 

4419 

4420 def _set_item_frame_value(self, key, value: DataFrame) -> None: 

4421 self._ensure_valid_index(value) 

4422 

4423 # align columns 

4424 if key in self.columns: 

4425 loc = self.columns.get_loc(key) 

4426 cols = self.columns[loc] 

4427 len_cols = 1 if is_scalar(cols) or isinstance(cols, tuple) else len(cols) 

4428 if len_cols != len(value.columns): 

4429 raise ValueError("Columns must be same length as key") 

4430 

4431 # align right-hand-side columns if self.columns 

4432 # is multi-index and self[key] is a sub-frame 

4433 if isinstance(self.columns, MultiIndex) and isinstance( 

4434 loc, (slice, Series, np.ndarray, Index) 

4435 ): 

4436 cols_droplevel = maybe_droplevels(cols, key) 

4437 if len(cols_droplevel) and not cols_droplevel.equals(value.columns): 

4438 value = value.reindex(cols_droplevel, axis=1) 

4439 

4440 for col, col_droplevel in zip(cols, cols_droplevel): 

4441 self[col] = value[col_droplevel] 

4442 return 

4443 

4444 if is_scalar(cols): 

4445 self[cols] = value[value.columns[0]] 

4446 return 

4447 

4448 locs: np.ndarray | list 

4449 if isinstance(loc, slice): 

4450 locs = np.arange(loc.start, loc.stop, loc.step) 

4451 elif is_scalar(loc): 

4452 locs = [loc] 

4453 else: 

4454 locs = loc.nonzero()[0] 

4455 

4456 return self.isetitem(locs, value) 

4457 

4458 if len(value.columns) > 1: 

4459 raise ValueError( 

4460 "Cannot set a DataFrame with multiple columns to the single " 

4461 f"column {key}" 

4462 ) 

4463 elif len(value.columns) == 0: 

4464 raise ValueError( 

4465 f"Cannot set a DataFrame without columns to the column {key}" 

4466 ) 

4467 

4468 self[key] = value[value.columns[0]] 

4469 

4470 def _iset_item_mgr( 

4471 self, 

4472 loc: int | slice | np.ndarray, 

4473 value, 

4474 inplace: bool = False, 

4475 refs: BlockValuesRefs | None = None, 

4476 ) -> None: 

4477 # when called from _set_item_mgr loc can be anything returned from get_loc 

4478 self._mgr.iset(loc, value, inplace=inplace, refs=refs) 

4479 self._clear_item_cache() 

4480 

4481 def _set_item_mgr( 

4482 self, key, value: ArrayLike, refs: BlockValuesRefs | None = None 

4483 ) -> None: 

4484 try: 

4485 loc = self._info_axis.get_loc(key) 

4486 except KeyError: 

4487 # This item wasn't present, just insert at end 

4488 self._mgr.insert(len(self._info_axis), key, value, refs) 

4489 else: 

4490 self._iset_item_mgr(loc, value, refs=refs) 

4491 

4492 # check if we are modifying a copy 

4493 # try to set first as we want an invalid 

4494 # value exception to occur first 

4495 if len(self): 

4496 self._check_setitem_copy() 

4497 

4498 def _iset_item(self, loc: int, value: Series, inplace: bool = True) -> None: 

4499 # We are only called from _replace_columnwise which guarantees that 

4500 # no reindex is necessary 

4501 if using_copy_on_write(): 

4502 self._iset_item_mgr( 

4503 loc, value._values, inplace=inplace, refs=value._references 

4504 ) 

4505 else: 

4506 self._iset_item_mgr(loc, value._values.copy(), inplace=True) 

4507 

4508 # check if we are modifying a copy 

4509 # try to set first as we want an invalid 

4510 # value exception to occur first 

4511 if len(self): 

4512 self._check_setitem_copy() 

4513 

4514 def _set_item(self, key, value) -> None: 

4515 """ 

4516 Add series to DataFrame in specified column. 

4517 

4518 If series is a numpy-array (not a Series/TimeSeries), it must be the 

4519 same length as the DataFrame's index or an error will be thrown. 

4520 

4521 Series/TimeSeries will be conformed to the DataFrame's index to 

4522 ensure homogeneity. 

4523 """ 

4524 value, refs = self._sanitize_column(value) 

4525 

4526 if ( 

4527 key in self.columns 

4528 and value.ndim == 1 

4529 and not isinstance(value.dtype, ExtensionDtype) 

4530 ): 

4531 # broadcast across multiple columns if necessary 

4532 if not self.columns.is_unique or isinstance(self.columns, MultiIndex): 

4533 existing_piece = self[key] 

4534 if isinstance(existing_piece, DataFrame): 

4535 value = np.tile(value, (len(existing_piece.columns), 1)).T 

4536 refs = None 

4537 

4538 self._set_item_mgr(key, value, refs) 

4539 

4540 def _set_value( 

4541 self, index: IndexLabel, col, value: Scalar, takeable: bool = False 

4542 ) -> None: 

4543 """ 

4544 Put single value at passed column and index. 

4545 

4546 Parameters 

4547 ---------- 

4548 index : Label 

4549 row label 

4550 col : Label 

4551 column label 

4552 value : scalar 

4553 takeable : bool, default False 

4554 Sets whether or not index/col are interpreted as indexers 

4555 """ 

4556 try: 

4557 if takeable: 

4558 icol = col 

4559 iindex = cast(int, index) 

4560 else: 

4561 icol = self.columns.get_loc(col) 

4562 iindex = self.index.get_loc(index) 

4563 self._mgr.column_setitem(icol, iindex, value, inplace_only=True) 

4564 self._clear_item_cache() 

4565 

4566 except (KeyError, TypeError, ValueError, LossySetitemError): 

4567 # get_loc might raise a KeyError for missing labels (falling back 

4568 # to (i)loc will do expansion of the index) 

4569 # column_setitem will do validation that may raise TypeError, 

4570 # ValueError, or LossySetitemError 

4571 # set using a non-recursive method & reset the cache 

4572 if takeable: 

4573 self.iloc[index, col] = value 

4574 else: 

4575 self.loc[index, col] = value 

4576 self._item_cache.pop(col, None) 

4577 

4578 except InvalidIndexError as ii_err: 

4579 # GH48729: Seems like you are trying to assign a value to a 

4580 # row when only scalar options are permitted 

4581 raise InvalidIndexError( 

4582 f"You can only assign a scalar value not a {type(value)}" 

4583 ) from ii_err 

4584 

4585 def _ensure_valid_index(self, value) -> None: 

4586 """ 

4587 Ensure that if we don't have an index, that we can create one from the 

4588 passed value. 

4589 """ 

4590 # GH5632, make sure that we are a Series convertible 

4591 if not len(self.index) and is_list_like(value) and len(value): 

4592 if not isinstance(value, DataFrame): 

4593 try: 

4594 value = Series(value) 

4595 except (ValueError, NotImplementedError, TypeError) as err: 

4596 raise ValueError( 

4597 "Cannot set a frame with no defined index " 

4598 "and a value that cannot be converted to a Series" 

4599 ) from err 

4600 

4601 # GH31368 preserve name of index 

4602 index_copy = value.index.copy() 

4603 if self.index.name is not None: 

4604 index_copy.name = self.index.name 

4605 

4606 self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan) 

4607 

4608 def _box_col_values(self, values: SingleDataManager, loc: int) -> Series: 

4609 """ 

4610 Provide boxed values for a column. 

4611 """ 

4612 # Lookup in columns so that if e.g. a str datetime was passed 

4613 # we attach the Timestamp object as the name. 

4614 name = self.columns[loc] 

4615 # We get index=self.index because values is a SingleDataManager 

4616 obj = self._constructor_sliced_from_mgr(values, axes=values.axes) 

4617 obj._name = name 

4618 return obj.__finalize__(self) 

4619 

4620 # ---------------------------------------------------------------------- 

4621 # Lookup Caching 

4622 

4623 def _clear_item_cache(self) -> None: 

4624 self._item_cache.clear() 

4625 

4626 def _get_item_cache(self, item: Hashable) -> Series: 

4627 """Return the cached item, item represents a label indexer.""" 

4628 if using_copy_on_write() or warn_copy_on_write(): 

4629 loc = self.columns.get_loc(item) 

4630 return self._ixs(loc, axis=1) 

4631 

4632 cache = self._item_cache 

4633 res = cache.get(item) 

4634 if res is None: 

4635 # All places that call _get_item_cache have unique columns, 

4636 # pending resolution of GH#33047 

4637 

4638 loc = self.columns.get_loc(item) 

4639 res = self._ixs(loc, axis=1) 

4640 

4641 cache[item] = res 

4642 

4643 # for a chain 

4644 res._is_copy = self._is_copy 

4645 return res 

4646 

4647 def _reset_cacher(self) -> None: 

4648 # no-op for DataFrame 

4649 pass 

4650 

4651 def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None: 

4652 """ 

4653 The object has called back to us saying maybe it has changed. 

4654 """ 

4655 loc = self._info_axis.get_loc(item) 

4656 arraylike = value._values 

4657 

4658 old = self._ixs(loc, axis=1) 

4659 if old._values is value._values and inplace: 

4660 # GH#46149 avoid making unnecessary copies/block-splitting 

4661 return 

4662 

4663 self._mgr.iset(loc, arraylike, inplace=inplace) 

4664 

4665 # ---------------------------------------------------------------------- 

4666 # Unsorted 

4667 

4668 @overload 

4669 def query(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> DataFrame: 

4670 ... 

4671 

4672 @overload 

4673 def query(self, expr: str, *, inplace: Literal[True], **kwargs) -> None: 

4674 ... 

4675 

4676 @overload 

4677 def query(self, expr: str, *, inplace: bool = ..., **kwargs) -> DataFrame | None: 

4678 ... 

4679 

4680 def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | None: 

4681 """ 

4682 Query the columns of a DataFrame with a boolean expression. 

4683 

4684 Parameters 

4685 ---------- 

4686 expr : str 

4687 The query string to evaluate. 

4688 

4689 You can refer to variables 

4690 in the environment by prefixing them with an '@' character like 

4691 ``@a + b``. 

4692 

4693 You can refer to column names that are not valid Python variable names 

4694 by surrounding them in backticks. Thus, column names containing spaces 

4695 or punctuation (besides underscores) or starting with digits must be 

4696 surrounded by backticks. (For example, a column named "Area (cm^2)" would 

4697 be referenced as ```Area (cm^2)```). Column names which are Python keywords 

4698 (like "list", "for", "import", etc.) cannot be used. 

4699 

4700 For example, if one of your columns is called ``a a`` and you want 

4701 to sum it with ``b``, your query should be ```a a` + b``. 

4702 

4703 inplace : bool, default False 

4704 Whether to modify the DataFrame rather than creating a new one. 

4705 **kwargs 

4706 See the documentation for :func:`eval` for complete details 

4707 on the keyword arguments accepted by :meth:`DataFrame.query`. 

4708 

4709 Returns 

4710 ------- 

4711 DataFrame or None 

4712 DataFrame resulting from the provided query expression or 

4713 None if ``inplace=True``. 

4714 

4715 See Also 

4716 -------- 

4717 eval : Evaluate a string describing operations on 

4718 DataFrame columns. 

4719 DataFrame.eval : Evaluate a string describing operations on 

4720 DataFrame columns. 

4721 

4722 Notes 

4723 ----- 

4724 The result of the evaluation of this expression is first passed to 

4725 :attr:`DataFrame.loc` and if that fails because of a 

4726 multidimensional key (e.g., a DataFrame) then the result will be passed 

4727 to :meth:`DataFrame.__getitem__`. 

4728 

4729 This method uses the top-level :func:`eval` function to 

4730 evaluate the passed query. 

4731 

4732 The :meth:`~pandas.DataFrame.query` method uses a slightly 

4733 modified Python syntax by default. For example, the ``&`` and ``|`` 

4734 (bitwise) operators have the precedence of their boolean cousins, 

4735 :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python, 

4736 however the semantics are different. 

4737 

4738 You can change the semantics of the expression by passing the keyword 

4739 argument ``parser='python'``. This enforces the same semantics as 

4740 evaluation in Python space. Likewise, you can pass ``engine='python'`` 

4741 to evaluate an expression using Python itself as a backend. This is not 

4742 recommended as it is inefficient compared to using ``numexpr`` as the 

4743 engine. 

4744 

4745 The :attr:`DataFrame.index` and 

4746 :attr:`DataFrame.columns` attributes of the 

4747 :class:`~pandas.DataFrame` instance are placed in the query namespace 

4748 by default, which allows you to treat both the index and columns of the 

4749 frame as a column in the frame. 

4750 The identifier ``index`` is used for the frame index; you can also 

4751 use the name of the index to identify it in a query. Please note that 

4752 Python keywords may not be used as identifiers. 

4753 

4754 For further details and examples see the ``query`` documentation in 

4755 :ref:`indexing <indexing.query>`. 

4756 

4757 *Backtick quoted variables* 

4758 

4759 Backtick quoted variables are parsed as literal Python code and 

4760 are converted internally to a Python valid identifier. 

4761 This can lead to the following problems. 

4762 

4763 During parsing a number of disallowed characters inside the backtick 

4764 quoted string are replaced by strings that are allowed as a Python identifier. 

4765 These characters include all operators in Python, the space character, the 

4766 question mark, the exclamation mark, the dollar sign, and the euro sign. 

4767 For other characters that fall outside the ASCII range (U+0001..U+007F) 

4768 and those that are not further specified in PEP 3131, 

4769 the query parser will raise an error. 

4770 This excludes whitespace other than the space character, as well as 

4771 the hashtag (which is used for comments) and the backtick 

4772 itself (the backtick cannot be escaped). 

4773 

4774 In a special case, quotes that make a pair around a backtick can 

4775 confuse the parser. 

4776 For example, ```it's` > `that's``` will raise an error, 

4777 as it forms a quoted string (``'s > `that'``) with a backtick inside. 

4778 

4779 See also the Python documentation about lexical analysis 

4780 (https://docs.python.org/3/reference/lexical_analysis.html) 

4781 in combination with the source code in :mod:`pandas.core.computation.parsing`. 

4782 

4783 Examples 

4784 -------- 

4785 >>> df = pd.DataFrame({'A': range(1, 6), 

4786 ... 'B': range(10, 0, -2), 

4787 ... 'C C': range(10, 5, -1)}) 

4788 >>> df 

4789 A B C C 

4790 0 1 10 10 

4791 1 2 8 9 

4792 2 3 6 8 

4793 3 4 4 7 

4794 4 5 2 6 

4795 >>> df.query('A > B') 

4796 A B C C 

4797 4 5 2 6 

4798 

4799 The previous expression is equivalent to 

4800 

4801 >>> df[df.A > df.B] 

4802 A B C C 

4803 4 5 2 6 

4804 

4805 For columns with spaces in their name, you can use backtick quoting. 

4806 

4807 >>> df.query('B == `C C`') 

4808 A B C C 

4809 0 1 10 10 

4810 

4811 The previous expression is equivalent to 

4812 

4813 >>> df[df.B == df['C C']] 

4814 A B C C 

4815 0 1 10 10 

4816 """ 

4817 inplace = validate_bool_kwarg(inplace, "inplace") 

4818 if not isinstance(expr, str): 

4819 msg = f"expr must be a string to be evaluated, {type(expr)} given" 

4820 raise ValueError(msg) 

4821 kwargs["level"] = kwargs.pop("level", 0) + 1 

4822 kwargs["target"] = None 

4823 res = self.eval(expr, **kwargs) 

4824 

4825 try: 

4826 result = self.loc[res] 

4827 except ValueError: 

4828 # when res is multi-dimensional loc raises, but this is sometimes a 

4829 # valid query 

4830 result = self[res] 

4831 

4832 if inplace: 

4833 self._update_inplace(result) 

4834 return None 

4835 else: 

4836 return result 

4837 
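An editorial sketch complementing the docstring examples above: the '@' prefix pulls a variable from the calling scope, and inplace=True filters the frame itself.

import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3], "B": [10, 5, 1]})
threshold = 4

out = df.query("B > @threshold")  # '@' resolves `threshold` from the caller's scope
df.query("A >= 2", inplace=True)  # mutate df in place instead of returning a copy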

4838 @overload 

4839 def eval(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> Any: 

4840 ... 

4841 

4842 @overload 

4843 def eval(self, expr: str, *, inplace: Literal[True], **kwargs) -> None: 

4844 ... 

4845 

4846 def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: 

4847 """ 

4848 Evaluate a string describing operations on DataFrame columns. 

4849 

4850 Operates on columns only, not specific rows or elements. This allows 

4851 `eval` to run arbitrary code, which can make you vulnerable to code 

4852 injection if you pass user input to this function. 

4853 

4854 Parameters 

4855 ---------- 

4856 expr : str 

4857 The expression string to evaluate. 

4858 inplace : bool, default False 

4859 If the expression contains an assignment, whether to perform the 

4860 operation inplace and mutate the existing DataFrame. Otherwise, 

4861 a new DataFrame is returned. 

4862 **kwargs 

4863 See the documentation for :func:`eval` for complete details 

4864 on the keyword arguments accepted by 

4865 :meth:`~pandas.DataFrame.query`. 

4866 

4867 Returns 

4868 ------- 

4869 ndarray, scalar, pandas object, or None 

4870 The result of the evaluation or None if ``inplace=True``. 

4871 

4872 See Also 

4873 -------- 

4874 DataFrame.query : Evaluates a boolean expression to query the columns 

4875 of a frame. 

4876 DataFrame.assign : Can evaluate an expression or function to create new 

4877 values for a column. 

4878 eval : Evaluate a Python expression as a string using various 

4879 backends. 

4880 

4881 Notes 

4882 ----- 

4883 For more details see the API documentation for :func:`~eval`. 

4884 For detailed examples see :ref:`enhancing performance with eval 

4885 <enhancingperf.eval>`. 

4886 

4887 Examples 

4888 -------- 

4889 >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) 

4890 >>> df 

4891 A B 

4892 0 1 10 

4893 1 2 8 

4894 2 3 6 

4895 3 4 4 

4896 4 5 2 

4897 >>> df.eval('A + B') 

4898 0 11 

4899 1 10 

4900 2 9 

4901 3 8 

4902 4 7 

4903 dtype: int64 

4904 

4905 Assignment is allowed though by default the original DataFrame is not 

4906 modified. 

4907 

4908 >>> df.eval('C = A + B') 

4909 A B C 

4910 0 1 10 11 

4911 1 2 8 10 

4912 2 3 6 9 

4913 3 4 4 8 

4914 4 5 2 7 

4915 >>> df 

4916 A B 

4917 0 1 10 

4918 1 2 8 

4919 2 3 6 

4920 3 4 4 

4921 4 5 2 

4922 

4923 Multiple columns can be assigned to using multi-line expressions: 

4924 

4925 >>> df.eval( 

4926 ... ''' 

4927 ... C = A + B 

4928 ... D = A - B 

4929 ... ''' 

4930 ... ) 

4931 A B C D 

4932 0 1 10 11 -9 

4933 1 2 8 10 -6 

4934 2 3 6 9 -3 

4935 3 4 4 8 0 

4936 4 5 2 7 3 

4937 """ 

4938 from pandas.core.computation.eval import eval as _eval 

4939 

4940 inplace = validate_bool_kwarg(inplace, "inplace") 

4941 kwargs["level"] = kwargs.pop("level", 0) + 1 

4942 index_resolvers = self._get_index_resolvers() 

4943 column_resolvers = self._get_cleaned_column_resolvers() 

4944 resolvers = column_resolvers, index_resolvers 

4945 if "target" not in kwargs: 

4946 kwargs["target"] = self 

4947 kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers 

4948 

4949 return _eval(expr, inplace=inplace, **kwargs) 

4950 
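An editorial sketch of eval(): an assignment expression with inplace=True mutates the frame, while a plain expression returns its result.

import pandas as pd

df = pd.DataFrame({"A": [1, 2], "B": [10, 20]})

df.eval("C = A * B", inplace=True)  # adds column C to df, returns None
total = df.eval("A + B + C")        # no assignment -> returns a Series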

4951 def select_dtypes(self, include=None, exclude=None) -> Self: 

4952 """ 

4953 Return a subset of the DataFrame's columns based on the column dtypes. 

4954 

4955 Parameters 

4956 ---------- 

4957 include, exclude : scalar or list-like 

4958 A selection of dtypes or strings to be included/excluded. At least 

4959 one of these parameters must be supplied. 

4960 

4961 Returns 

4962 ------- 

4963 DataFrame 

4964 The subset of the frame including the dtypes in ``include`` and 

4965 excluding the dtypes in ``exclude``. 

4966 

4967 Raises 

4968 ------ 

4969 ValueError 

4970 * If both of ``include`` and ``exclude`` are empty 

4971 * If ``include`` and ``exclude`` have overlapping elements 

4972 * If any kind of string dtype is passed in. 

4973 

4974 See Also 

4975 -------- 

4976 DataFrame.dtypes: Return Series with the data type of each column. 

4977 

4978 Notes 

4979 ----- 

4980 * To select all *numeric* types, use ``np.number`` or ``'number'`` 

4981 * To select strings you must use the ``object`` dtype, but note that 

4982 this will return *all* object dtype columns 

4983 * See the `numpy dtype hierarchy 

4984 <https://numpy.org/doc/stable/reference/arrays.scalars.html>`__ 

4985 * To select datetimes, use ``np.datetime64``, ``'datetime'`` or 

4986 ``'datetime64'`` 

4987 * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or 

4988 ``'timedelta64'`` 

4989 * To select Pandas categorical dtypes, use ``'category'`` 

4990 * To select Pandas datetimetz dtypes, use ``'datetimetz'`` 

4991 or ``'datetime64[ns, tz]'`` 

4992 

4993 Examples 

4994 -------- 

4995 >>> df = pd.DataFrame({'a': [1, 2] * 3, 

4996 ... 'b': [True, False] * 3, 

4997 ... 'c': [1.0, 2.0] * 3}) 

4998 >>> df 

4999 a b c 

5000 0 1 True 1.0 

5001 1 2 False 2.0 

5002 2 1 True 1.0 

5003 3 2 False 2.0 

5004 4 1 True 1.0 

5005 5 2 False 2.0 

5006 

5007 >>> df.select_dtypes(include='bool') 

5008 b 

5009 0 True 

5010 1 False 

5011 2 True 

5012 3 False 

5013 4 True 

5014 5 False 

5015 

5016 >>> df.select_dtypes(include=['float64']) 

5017 c 

5018 0 1.0 

5019 1 2.0 

5020 2 1.0 

5021 3 2.0 

5022 4 1.0 

5023 5 2.0 

5024 

5025 >>> df.select_dtypes(exclude=['int64']) 

5026 b c 

5027 0 True 1.0 

5028 1 False 2.0 

5029 2 True 1.0 

5030 3 False 2.0 

5031 4 True 1.0 

5032 5 False 2.0 

5033 """ 

5034 if not is_list_like(include): 

5035 include = (include,) if include is not None else () 

5036 if not is_list_like(exclude): 

5037 exclude = (exclude,) if exclude is not None else () 

5038 

5039 selection = (frozenset(include), frozenset(exclude)) 

5040 

5041 if not any(selection): 

5042 raise ValueError("at least one of include or exclude must be nonempty") 

5043 

5044 # convert the myriad valid dtypes object to a single representation 

5045 def check_int_infer_dtype(dtypes): 

5046 converted_dtypes: list[type] = [] 

5047 for dtype in dtypes: 

5048 # Numpy maps int to different types (int32, int64) on Windows and Linux 

5049 # see https://github.com/numpy/numpy/issues/9464 

5050 if (isinstance(dtype, str) and dtype == "int") or (dtype is int): 

5051 converted_dtypes.append(np.int32) 

5052 converted_dtypes.append(np.int64) 

5053 elif dtype == "float" or dtype is float: 

5054 # GH#42452 : np.dtype("float") coerces to np.float64 from Numpy 1.20 

5055 converted_dtypes.extend([np.float64, np.float32]) 

5056 else: 

5057 converted_dtypes.append(infer_dtype_from_object(dtype)) 

5058 return frozenset(converted_dtypes) 

5059 

5060 include = check_int_infer_dtype(include) 

5061 exclude = check_int_infer_dtype(exclude) 

5062 

5063 for dtypes in (include, exclude): 

5064 invalidate_string_dtypes(dtypes) 

5065 

5066 # can't both include AND exclude! 

5067 if not include.isdisjoint(exclude): 

5068 raise ValueError(f"include and exclude overlap on {(include & exclude)}") 

5069 

5070 def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool: 

5071 # GH 46870: BooleanDtype._is_numeric == True but should be excluded 

5072 dtype = dtype if not isinstance(dtype, ArrowDtype) else dtype.numpy_dtype 

5073 return issubclass(dtype.type, tuple(dtypes_set)) or ( 

5074 np.number in dtypes_set 

5075 and getattr(dtype, "_is_numeric", False) 

5076 and not is_bool_dtype(dtype) 

5077 ) 

5078 

5079 def predicate(arr: ArrayLike) -> bool: 

5080 dtype = arr.dtype 

5081 if include: 

5082 if not dtype_predicate(dtype, include): 

5083 return False 

5084 

5085 if exclude: 

5086 if dtype_predicate(dtype, exclude): 

5087 return False 

5088 

5089 return True 

5090 

5091 mgr = self._mgr._get_data_subset(predicate).copy(deep=None) 

5092 # error: Incompatible return value type (got "DataFrame", expected "Self") 

5093 return self._constructor_from_mgr(mgr, axes=mgr.axes).__finalize__(self) # type: ignore[return-value] 

5094 
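Editorial illustration (not part of frame.py): the include/exclude normalization above expands a bare ``int`` or ``float`` into the concrete NumPy widths, so 32- and 64-bit columns both match. A minimal sketch using only public pandas/NumPy APIs:

import numpy as np
import pandas as pd

df = pd.DataFrame({"i32": np.array([1, 2], dtype="int32"),
                   "i64": np.array([3, 4], dtype="int64"),
                   "f": [1.0, 2.0]})
# 'int' is expanded to both np.int32 and np.int64 (see check_int_infer_dtype above)
print(df.select_dtypes(include="int").columns.tolist())    # ['i32', 'i64']
# 'float' likewise covers np.float32 and np.float64, so excluding it drops 'f'
print(df.select_dtypes(exclude="float").columns.tolist())  # ['i32', 'i64']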

5095 def insert( 

5096 self, 

5097 loc: int, 

5098 column: Hashable, 

5099 value: Scalar | AnyArrayLike, 

5100 allow_duplicates: bool | lib.NoDefault = lib.no_default, 

5101 ) -> None: 

5102 """ 

5103 Insert column into DataFrame at specified location. 

5104 

5105 Raises a ValueError if `column` is already contained in the DataFrame, 

5106 unless `allow_duplicates` is set to True. 

5107 

5108 Parameters 

5109 ---------- 

5110 loc : int 

5111 Insertion index. Must satisfy 0 <= loc <= len(columns). 

5112 column : str, number, or hashable object 

5113 Label of the inserted column. 

5114 value : Scalar, Series, or array-like 

5115 Content of the inserted column. 

5116 allow_duplicates : bool, optional, default lib.no_default 

5117 Allow duplicate column labels to be created. 

5118 

5119 See Also 

5120 -------- 

5121 Index.insert : Insert new item by index. 

5122 

5123 Examples 

5124 -------- 

5125 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) 

5126 >>> df 

5127 col1 col2 

5128 0 1 3 

5129 1 2 4 

5130 >>> df.insert(1, "newcol", [99, 99]) 

5131 >>> df 

5132 col1 newcol col2 

5133 0 1 99 3 

5134 1 2 99 4 

5135 >>> df.insert(0, "col1", [100, 100], allow_duplicates=True) 

5136 >>> df 

5137 col1 col1 newcol col2 

5138 0 100 1 99 3 

5139 1 100 2 99 4 

5140 

5141 Notice that pandas uses index alignment when `value` is a `Series`: 

5142 

5143 >>> df.insert(0, "col0", pd.Series([5, 6], index=[1, 2])) 

5144 >>> df 

5145 col0 col1 col1 newcol col2 

5146 0 NaN 100 1 99 3 

5147 1 5.0 100 2 99 4 

5148 """ 

5149 if allow_duplicates is lib.no_default: 

5150 allow_duplicates = False 

5151 if allow_duplicates and not self.flags.allows_duplicate_labels: 

5152 raise ValueError( 

5153 "Cannot specify 'allow_duplicates=True' when " 

5154 "'self.flags.allows_duplicate_labels' is False." 

5155 ) 

5156 if not allow_duplicates and column in self.columns: 

5157 # Should this be a different kind of error?? 

5158 raise ValueError(f"cannot insert {column}, already exists") 

5159 if not is_integer(loc): 

5160 raise TypeError("loc must be int") 

5161 # convert non stdlib ints to satisfy typing checks 

5162 loc = int(loc) 

5163 if isinstance(value, DataFrame) and len(value.columns) > 1: 

5164 raise ValueError( 

5165 f"Expected a one-dimensional object, got a DataFrame with " 

5166 f"{len(value.columns)} columns instead." 

5167 ) 

5168 elif isinstance(value, DataFrame): 

5169 value = value.iloc[:, 0] 

5170 

5171 value, refs = self._sanitize_column(value) 

5172 self._mgr.insert(loc, column, value, refs=refs) 

5173 
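Editorial illustration (not part of frame.py) of the allow_duplicates guard implemented above, using only the public DataFrame API:

import pandas as pd

df = pd.DataFrame({"a": [1, 2]})
df.insert(0, "a", [3, 4], allow_duplicates=True)  # duplicate 'a' label is permitted
print(list(df.columns))  # ['a', 'a']

locked = pd.DataFrame({"a": [1, 2]}).set_flags(allows_duplicate_labels=False)
try:
    locked.insert(0, "a", [3, 4], allow_duplicates=True)
except ValueError as exc:
    print(exc)  # raised by the flags check at the top of insert()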

5174 def assign(self, **kwargs) -> DataFrame: 

5175 r""" 

5176 Assign new columns to a DataFrame. 

5177 

5178 Returns a new object with all original columns in addition to new ones. 

5179 Existing columns that are re-assigned will be overwritten. 

5180 

5181 Parameters 

5182 ---------- 

5183 **kwargs : dict of {str: callable or Series} 

5184 The column names are keywords. If the values are 

5185 callable, they are computed on the DataFrame and 

5186 assigned to the new columns. The callable must not 

5187 change the input DataFrame (though pandas doesn't check this). 

5188 If the values are not callable (e.g. a Series, scalar, or array), 

5189 they are simply assigned. 

5190 

5191 Returns 

5192 ------- 

5193 DataFrame 

5194 A new DataFrame with the new columns in addition to 

5195 all the existing columns. 

5196 

5197 Notes 

5198 ----- 

5199 Assigning multiple columns within the same ``assign`` is possible. 

5200 Later items in '\*\*kwargs' may refer to newly created or modified 

5201 columns in 'df'; items are computed and assigned into 'df' in order. 

5202 

5203 Examples 

5204 -------- 

5205 >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]}, 

5206 ... index=['Portland', 'Berkeley']) 

5207 >>> df 

5208 temp_c 

5209 Portland 17.0 

5210 Berkeley 25.0 

5211 

5212 Where the value is a callable, evaluated on `df`: 

5213 

5214 >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32) 

5215 temp_c temp_f 

5216 Portland 17.0 62.6 

5217 Berkeley 25.0 77.0 

5218 

5219 Alternatively, the same behavior can be achieved by directly 

5220 referencing an existing Series or sequence: 

5221 

5222 >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32) 

5223 temp_c temp_f 

5224 Portland 17.0 62.6 

5225 Berkeley 25.0 77.0 

5226 

5227 You can create multiple columns within the same assign where one 

5228 of the columns depends on another one defined within the same assign: 

5229 

5230 >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32, 

5231 ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9) 

5232 temp_c temp_f temp_k 

5233 Portland 17.0 62.6 290.15 

5234 Berkeley 25.0 77.0 298.15 

5235 """ 

5236 data = self.copy(deep=None) 

5237 

5238 for k, v in kwargs.items(): 

5239 data[k] = com.apply_if_callable(v, data) 

5240 return data 

5241 

5242 def _sanitize_column(self, value) -> tuple[ArrayLike, BlockValuesRefs | None]: 

5243 """ 

5244 Ensures new columns (which go into the BlockManager as new blocks) are 

5245 always copied (or a reference is being tracked to them under CoW) 

5246 and converted into an array. 

5247 

5248 Parameters 

5249 ---------- 

5250 value : scalar, Series, or array-like 

5251 

5252 Returns 

5253 ------- 

5254 tuple of numpy.ndarray or ExtensionArray and optional BlockValuesRefs 

5255 """ 

5256 self._ensure_valid_index(value) 

5257 

5258 # Using a DataFrame would mean coercing values to one dtype 

5259 assert not isinstance(value, DataFrame) 

5260 if is_dict_like(value): 

5261 if not isinstance(value, Series): 

5262 value = Series(value) 

5263 return _reindex_for_setitem(value, self.index) 

5264 

5265 if is_list_like(value): 

5266 com.require_length_match(value, self.index) 

5267 arr = sanitize_array(value, self.index, copy=True, allow_2d=True) 

5268 if ( 

5269 isinstance(value, Index) 

5270 and value.dtype == "object" 

5271 and arr.dtype != value.dtype 

5272 ): # 

5273 # TODO: Remove kludge in sanitize_array for string mode when enforcing 

5274 # this deprecation 

5275 warnings.warn( 

5276 "Setting an Index with object dtype into a DataFrame will stop " 

5277 "inferring another dtype in a future version. Cast the Index " 

5278 "explicitly before setting it into the DataFrame.", 

5279 FutureWarning, 

5280 stacklevel=find_stack_level(), 

5281 ) 

5282 return arr, None 

5283 
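Editorial illustration: _sanitize_column reindexes dict-like and Series values to the frame's index before storing them; the user-visible effect of that alignment, sketched with public APIs only:

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]}, index=["x", "y", "z"])
df["b"] = pd.Series({"z": 30, "x": 10})  # dict-like value is aligned on the index
print(df["b"].tolist())                  # [10.0, nan, 30.0] -- 'y' has no match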

5284 @property 

5285 def _series(self): 

5286 return {item: self._ixs(idx, axis=1) for idx, item in enumerate(self.columns)} 

5287 

5288 # ---------------------------------------------------------------------- 

5289 # Reindexing and alignment 

5290 

5291 def _reindex_multi( 

5292 self, axes: dict[str, Index], copy: bool, fill_value 

5293 ) -> DataFrame: 

5294 """ 

5295 We are guaranteed non-Nones in the axes. 

5296 """ 

5297 

5298 new_index, row_indexer = self.index.reindex(axes["index"]) 

5299 new_columns, col_indexer = self.columns.reindex(axes["columns"]) 

5300 

5301 if row_indexer is not None and col_indexer is not None: 

5302 # Fastpath. By doing two 'take's at once we avoid making an 

5303 # unnecessary copy. 

5304 # We only get here with `self._can_fast_transpose`, which (almost) 

5305 # ensures that self.values is cheap. It may be worth making this 

5306 # condition more specific. 

5307 indexer = row_indexer, col_indexer 

5308 new_values = take_2d_multi(self.values, indexer, fill_value=fill_value) 

5309 return self._constructor( 

5310 new_values, index=new_index, columns=new_columns, copy=False 

5311 ) 

5312 else: 

5313 return self._reindex_with_indexers( 

5314 {0: [new_index, row_indexer], 1: [new_columns, col_indexer]}, 

5315 copy=copy, 

5316 fill_value=fill_value, 

5317 ) 

5318 
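Editorial illustration of the kind of two-axis reindex that _reindex_multi handles, sketched with public APIs only (whether the fast path is taken is an internal detail):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.arange(4).reshape(2, 2), index=["a", "b"], columns=["x", "y"])
# Reindex rows and columns together; labels missing from the frame are filled
print(df.reindex(index=["a", "c"], columns=["y", "z"], fill_value=0))
#    y  z
# a  1  0
# c  0  0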

5319 @Appender( 

5320 """ 

5321 Examples 

5322 -------- 

5323 >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) 

5324 

5325 Change the row labels. 

5326 

5327 >>> df.set_axis(['a', 'b', 'c'], axis='index') 

5328 A B 

5329 a 1 4 

5330 b 2 5 

5331 c 3 6 

5332 

5333 Change the column labels. 

5334 

5335 >>> df.set_axis(['I', 'II'], axis='columns') 

5336 I II 

5337 0 1 4 

5338 1 2 5 

5339 2 3 6 

5340 """ 

5341 ) 

5342 @Substitution( 

5343 klass=_shared_doc_kwargs["klass"], 

5344 axes_single_arg=_shared_doc_kwargs["axes_single_arg"], 

5345 extended_summary_sub=" column or", 

5346 axis_description_sub=", and 1 identifies the columns", 

5347 see_also_sub=" or columns", 

5348 ) 

5349 @Appender(NDFrame.set_axis.__doc__) 

5350 def set_axis( 

5351 self, 

5352 labels, 

5353 *, 

5354 axis: Axis = 0, 

5355 copy: bool | None = None, 

5356 ) -> DataFrame: 

5357 return super().set_axis(labels, axis=axis, copy=copy) 

5358 

5359 @doc( 

5360 NDFrame.reindex, 

5361 klass=_shared_doc_kwargs["klass"], 

5362 optional_reindex=_shared_doc_kwargs["optional_reindex"], 

5363 ) 

5364 def reindex( 

5365 self, 

5366 labels=None, 

5367 *, 

5368 index=None, 

5369 columns=None, 

5370 axis: Axis | None = None, 

5371 method: ReindexMethod | None = None, 

5372 copy: bool | None = None, 

5373 level: Level | None = None, 

5374 fill_value: Scalar | None = np.nan, 

5375 limit: int | None = None, 

5376 tolerance=None, 

5377 ) -> DataFrame: 

5378 return super().reindex( 

5379 labels=labels, 

5380 index=index, 

5381 columns=columns, 

5382 axis=axis, 

5383 method=method, 

5384 copy=copy, 

5385 level=level, 

5386 fill_value=fill_value, 

5387 limit=limit, 

5388 tolerance=tolerance, 

5389 ) 

5390 

5391 @overload 

5392 def drop( 

5393 self, 

5394 labels: IndexLabel = ..., 

5395 *, 

5396 axis: Axis = ..., 

5397 index: IndexLabel = ..., 

5398 columns: IndexLabel = ..., 

5399 level: Level = ..., 

5400 inplace: Literal[True], 

5401 errors: IgnoreRaise = ..., 

5402 ) -> None: 

5403 ... 

5404 

5405 @overload 

5406 def drop( 

5407 self, 

5408 labels: IndexLabel = ..., 

5409 *, 

5410 axis: Axis = ..., 

5411 index: IndexLabel = ..., 

5412 columns: IndexLabel = ..., 

5413 level: Level = ..., 

5414 inplace: Literal[False] = ..., 

5415 errors: IgnoreRaise = ..., 

5416 ) -> DataFrame: 

5417 ... 

5418 

5419 @overload 

5420 def drop( 

5421 self, 

5422 labels: IndexLabel = ..., 

5423 *, 

5424 axis: Axis = ..., 

5425 index: IndexLabel = ..., 

5426 columns: IndexLabel = ..., 

5427 level: Level = ..., 

5428 inplace: bool = ..., 

5429 errors: IgnoreRaise = ..., 

5430 ) -> DataFrame | None: 

5431 ... 

5432 

5433 def drop( 

5434 self, 

5435 labels: IndexLabel | None = None, 

5436 *, 

5437 axis: Axis = 0, 

5438 index: IndexLabel | None = None, 

5439 columns: IndexLabel | None = None, 

5440 level: Level | None = None, 

5441 inplace: bool = False, 

5442 errors: IgnoreRaise = "raise", 

5443 ) -> DataFrame | None: 

5444 """ 

5445 Drop specified labels from rows or columns. 

5446 

5447 Remove rows or columns by specifying label names and corresponding 

5448 axis, or by directly specifying index or column names. When using a 

5449 multi-index, labels on different levels can be removed by specifying 

5450 the level. See the :ref:`user guide <advanced.shown_levels>` 

5451 for more information about the now unused levels. 

5452 

5453 Parameters 

5454 ---------- 

5455 labels : single label or list-like 

5456 Index or column labels to drop. A tuple will be used as a single 

5457 label and not treated as a list-like. 

5458 axis : {0 or 'index', 1 or 'columns'}, default 0 

5459 Whether to drop labels from the index (0 or 'index') or 

5460 columns (1 or 'columns'). 

5461 index : single label or list-like 

5462 Alternative to specifying axis (``labels, axis=0`` 

5463 is equivalent to ``index=labels``). 

5464 columns : single label or list-like 

5465 Alternative to specifying axis (``labels, axis=1`` 

5466 is equivalent to ``columns=labels``). 

5467 level : int or level name, optional 

5468 For MultiIndex, level from which the labels will be removed. 

5469 inplace : bool, default False 

5470 If False, return a copy. Otherwise, do operation 

5471 in place and return None. 

5472 errors : {'ignore', 'raise'}, default 'raise' 

5473 If 'ignore', suppress error and only existing labels are 

5474 dropped. 

5475 

5476 Returns 

5477 ------- 

5478 DataFrame or None 

5479 DataFrame with the specified index or column labels removed, 

5480 or None if ``inplace=True``. 

5481 

5482 Raises 

5483 ------ 

5484 KeyError 

5485 If any of the labels is not found in the selected axis. 

5486 

5487 See Also 

5488 -------- 

5489 DataFrame.loc : Label-location based indexer for selection by label. 

5490 DataFrame.dropna : Return DataFrame with labels on given axis omitted 

5491 where (all or any) data are missing. 

5492 DataFrame.drop_duplicates : Return DataFrame with duplicate rows 

5493 removed, optionally only considering certain columns. 

5494 Series.drop : Return Series with specified index labels removed. 

5495 

5496 Examples 

5497 -------- 

5498 >>> df = pd.DataFrame(np.arange(12).reshape(3, 4), 

5499 ... columns=['A', 'B', 'C', 'D']) 

5500 >>> df 

5501 A B C D 

5502 0 0 1 2 3 

5503 1 4 5 6 7 

5504 2 8 9 10 11 

5505 

5506 Drop columns 

5507 

5508 >>> df.drop(['B', 'C'], axis=1) 

5509 A D 

5510 0 0 3 

5511 1 4 7 

5512 2 8 11 

5513 

5514 >>> df.drop(columns=['B', 'C']) 

5515 A D 

5516 0 0 3 

5517 1 4 7 

5518 2 8 11 

5519 

5520 Drop a row by index 

5521 

5522 >>> df.drop([0, 1]) 

5523 A B C D 

5524 2 8 9 10 11 

5525 

5526 Drop columns and/or rows of MultiIndex DataFrame 

5527 

5528 >>> midx = pd.MultiIndex(levels=[['llama', 'cow', 'falcon'], 

5529 ... ['speed', 'weight', 'length']], 

5530 ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], 

5531 ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) 

5532 >>> df = pd.DataFrame(index=midx, columns=['big', 'small'], 

5533 ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20], 

5534 ... [250, 150], [1.5, 0.8], [320, 250], 

5535 ... [1, 0.8], [0.3, 0.2]]) 

5536 >>> df 

5537 big small 

5538 llama speed 45.0 30.0 

5539 weight 200.0 100.0 

5540 length 1.5 1.0 

5541 cow speed 30.0 20.0 

5542 weight 250.0 150.0 

5543 length 1.5 0.8 

5544 falcon speed 320.0 250.0 

5545 weight 1.0 0.8 

5546 length 0.3 0.2 

5547 

5548 Drop a specific index combination from the MultiIndex 

5549 DataFrame, i.e., drop the combination ``'falcon'`` and 

5550 ``'weight'``, which deletes only the corresponding row 

5551 

5552 >>> df.drop(index=('falcon', 'weight')) 

5553 big small 

5554 llama speed 45.0 30.0 

5555 weight 200.0 100.0 

5556 length 1.5 1.0 

5557 cow speed 30.0 20.0 

5558 weight 250.0 150.0 

5559 length 1.5 0.8 

5560 falcon speed 320.0 250.0 

5561 length 0.3 0.2 

5562 

5563 >>> df.drop(index='cow', columns='small') 

5564 big 

5565 llama speed 45.0 

5566 weight 200.0 

5567 length 1.5 

5568 falcon speed 320.0 

5569 weight 1.0 

5570 length 0.3 

5571 

5572 >>> df.drop(index='length', level=1) 

5573 big small 

5574 llama speed 45.0 30.0 

5575 weight 200.0 100.0 

5576 cow speed 30.0 20.0 

5577 weight 250.0 150.0 

5578 falcon speed 320.0 250.0 

5579 weight 1.0 0.8 

5580 """ 

5581 return super().drop( 

5582 labels=labels, 

5583 axis=axis, 

5584 index=index, 

5585 columns=columns, 

5586 level=level, 

5587 inplace=inplace, 

5588 errors=errors, 

5589 ) 

5590 

5591 @overload 

5592 def rename( 

5593 self, 

5594 mapper: Renamer | None = ..., 

5595 *, 

5596 index: Renamer | None = ..., 

5597 columns: Renamer | None = ..., 

5598 axis: Axis | None = ..., 

5599 copy: bool | None = ..., 

5600 inplace: Literal[True], 

5601 level: Level = ..., 

5602 errors: IgnoreRaise = ..., 

5603 ) -> None: 

5604 ... 

5605 

5606 @overload 

5607 def rename( 

5608 self, 

5609 mapper: Renamer | None = ..., 

5610 *, 

5611 index: Renamer | None = ..., 

5612 columns: Renamer | None = ..., 

5613 axis: Axis | None = ..., 

5614 copy: bool | None = ..., 

5615 inplace: Literal[False] = ..., 

5616 level: Level = ..., 

5617 errors: IgnoreRaise = ..., 

5618 ) -> DataFrame: 

5619 ... 

5620 

5621 @overload 

5622 def rename( 

5623 self, 

5624 mapper: Renamer | None = ..., 

5625 *, 

5626 index: Renamer | None = ..., 

5627 columns: Renamer | None = ..., 

5628 axis: Axis | None = ..., 

5629 copy: bool | None = ..., 

5630 inplace: bool = ..., 

5631 level: Level = ..., 

5632 errors: IgnoreRaise = ..., 

5633 ) -> DataFrame | None: 

5634 ... 

5635 

5636 def rename( 

5637 self, 

5638 mapper: Renamer | None = None, 

5639 *, 

5640 index: Renamer | None = None, 

5641 columns: Renamer | None = None, 

5642 axis: Axis | None = None, 

5643 copy: bool | None = None, 

5644 inplace: bool = False, 

5645 level: Level | None = None, 

5646 errors: IgnoreRaise = "ignore", 

5647 ) -> DataFrame | None: 

5648 """ 

5649 Rename columns or index labels. 

5650 

5651 Function / dict values must be unique (1-to-1). Labels not contained in 

5652 a dict / Series will be left as-is. Extra labels listed don't throw an 

5653 error. 

5654 

5655 See the :ref:`user guide <basics.rename>` for more. 

5656 

5657 Parameters 

5658 ---------- 

5659 mapper : dict-like or function 

5660 Dict-like or function transformations to apply to 

5661 that axis' values. Use either ``mapper`` and ``axis`` to 

5662 specify the axis to target with ``mapper``, or ``index`` and 

5663 ``columns``. 

5664 index : dict-like or function 

5665 Alternative to specifying axis (``mapper, axis=0`` 

5666 is equivalent to ``index=mapper``). 

5667 columns : dict-like or function 

5668 Alternative to specifying axis (``mapper, axis=1`` 

5669 is equivalent to ``columns=mapper``). 

5670 axis : {0 or 'index', 1 or 'columns'}, default 0 

5671 Axis to target with ``mapper``. Can be either the axis name 

5672 ('index', 'columns') or number (0, 1). The default is 'index'. 

5673 copy : bool, default True 

5674 Also copy underlying data. 

5675 

5676 .. note:: 

5677 The `copy` keyword will change behavior in pandas 3.0. 

5678 `Copy-on-Write 

5679 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__ 

5680 will be enabled by default, which means that all methods with a 

5681 `copy` keyword will use a lazy copy mechanism to defer the copy and 

5682 ignore the `copy` keyword. The `copy` keyword will be removed in a 

5683 future version of pandas. 

5684 

5685 You can already get the future behavior and improvements through 

5686 enabling copy on write ``pd.options.mode.copy_on_write = True`` 

5687 inplace : bool, default False 

5688 Whether to modify the DataFrame rather than creating a new one. 

5689 If True then value of copy is ignored. 

5690 level : int or level name, default None 

5691 In case of a MultiIndex, only rename labels in the specified 

5692 level. 

5693 errors : {'ignore', 'raise'}, default 'ignore' 

5694 If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`, 

5695 or `columns` contains labels that are not present in the Index 

5696 being transformed. 

5697 If 'ignore', existing keys will be renamed and extra keys will be 

5698 ignored. 

5699 

5700 Returns 

5701 ------- 

5702 DataFrame or None 

5703 DataFrame with the renamed axis labels or None if ``inplace=True``. 

5704 

5705 Raises 

5706 ------ 

5707 KeyError 

5708 If any of the labels is not found in the selected axis and 

5709 "errors='raise'". 

5710 

5711 See Also 

5712 -------- 

5713 DataFrame.rename_axis : Set the name of the axis. 

5714 

5715 Examples 

5716 -------- 

5717 ``DataFrame.rename`` supports two calling conventions 

5718 

5719 * ``(index=index_mapper, columns=columns_mapper, ...)`` 

5720 * ``(mapper, axis={'index', 'columns'}, ...)`` 

5721 

5722 We *highly* recommend using keyword arguments to clarify your 

5723 intent. 

5724 

5725 Rename columns using a mapping: 

5726 

5727 >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) 

5728 >>> df.rename(columns={"A": "a", "B": "c"}) 

5729 a c 

5730 0 1 4 

5731 1 2 5 

5732 2 3 6 

5733 

5734 Rename index using a mapping: 

5735 

5736 >>> df.rename(index={0: "x", 1: "y", 2: "z"}) 

5737 A B 

5738 x 1 4 

5739 y 2 5 

5740 z 3 6 

5741 

5742 Cast index labels to a different type: 

5743 

5744 >>> df.index 

5745 RangeIndex(start=0, stop=3, step=1) 

5746 >>> df.rename(index=str).index 

5747 Index(['0', '1', '2'], dtype='object') 

5748 

5749 >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise") 

5750 Traceback (most recent call last): 

5751 KeyError: ['C'] not found in axis 

5752 

5753 Using axis-style parameters: 

5754 

5755 >>> df.rename(str.lower, axis='columns') 

5756 a b 

5757 0 1 4 

5758 1 2 5 

5759 2 3 6 

5760 

5761 >>> df.rename({1: 2, 2: 4}, axis='index') 

5762 A B 

5763 0 1 4 

5764 2 2 5 

5765 4 3 6 

5766 """ 

5767 return super()._rename( 

5768 mapper=mapper, 

5769 index=index, 

5770 columns=columns, 

5771 axis=axis, 

5772 copy=copy, 

5773 inplace=inplace, 

5774 level=level, 

5775 errors=errors, 

5776 ) 

5777 

5778 def pop(self, item: Hashable) -> Series: 

5779 """ 

5780 Return item and drop from frame. Raise KeyError if not found. 

5781 

5782 Parameters 

5783 ---------- 

5784 item : label 

5785 Label of column to be popped. 

5786 

5787 Returns 

5788 ------- 

5789 Series 

5790 

5791 Examples 

5792 -------- 

5793 >>> df = pd.DataFrame([('falcon', 'bird', 389.0), 

5794 ... ('parrot', 'bird', 24.0), 

5795 ... ('lion', 'mammal', 80.5), 

5796 ... ('monkey', 'mammal', np.nan)], 

5797 ... columns=('name', 'class', 'max_speed')) 

5798 >>> df 

5799 name class max_speed 

5800 0 falcon bird 389.0 

5801 1 parrot bird 24.0 

5802 2 lion mammal 80.5 

5803 3 monkey mammal NaN 

5804 

5805 >>> df.pop('class') 

5806 0 bird 

5807 1 bird 

5808 2 mammal 

5809 3 mammal 

5810 Name: class, dtype: object 

5811 

5812 >>> df 

5813 name max_speed 

5814 0 falcon 389.0 

5815 1 parrot 24.0 

5816 2 lion 80.5 

5817 3 monkey NaN 

5818 """ 

5819 return super().pop(item=item) 

5820 

5821 def _replace_columnwise( 

5822 self, mapping: dict[Hashable, tuple[Any, Any]], inplace: bool, regex 

5823 ): 

5824 """ 

5825 Dispatch to Series.replace column-wise. 

5826 

5827 Parameters 

5828 ---------- 

5829 mapping : dict 

5830 of the form {col: (target, value)} 

5831 inplace : bool 

5832 regex : bool or same types as `to_replace` in DataFrame.replace 

5833 

5834 Returns 

5835 ------- 

5836 DataFrame or None 

5837 """ 

5838 # Operate column-wise 

5839 res = self if inplace else self.copy(deep=None) 

5840 ax = self.columns 

5841 

5842 for i, ax_value in enumerate(ax): 

5843 if ax_value in mapping: 

5844 ser = self.iloc[:, i] 

5845 

5846 target, value = mapping[ax_value] 

5847 newobj = ser.replace(target, value, regex=regex) 

5848 

5849 res._iset_item(i, newobj, inplace=inplace) 

5850 

5851 if inplace: 

5852 return 

5853 return res.__finalize__(self) 

5854 
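Editorial illustration: one public entry point that reaches _replace_columnwise is DataFrame.replace with a dict for both to_replace and value, which builds the {col: (target, value)} mapping described above. A sketch with public APIs only:

import pandas as pd

df = pd.DataFrame({"a": [0, 1, 0], "b": [0, 1, 0]})
# column 'a' replaces 0 -> 10, column 'b' replaces 1 -> -1, each dispatched column-wise
print(df.replace(to_replace={"a": 0, "b": 1}, value={"a": 10, "b": -1}))
#     a  b
# 0  10  0
# 1   1 -1
# 2  10  0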

5855 @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) 

5856 def shift( 

5857 self, 

5858 periods: int | Sequence[int] = 1, 

5859 freq: Frequency | None = None, 

5860 axis: Axis = 0, 

5861 fill_value: Hashable = lib.no_default, 

5862 suffix: str | None = None, 

5863 ) -> DataFrame: 

5864 if freq is not None and fill_value is not lib.no_default: 

5865 # GH#53832 

5866 warnings.warn( 

5867 "Passing a 'freq' together with a 'fill_value' silently ignores " 

5868 "the fill_value and is deprecated. This will raise in a future " 

5869 "version.", 

5870 FutureWarning, 

5871 stacklevel=find_stack_level(), 

5872 ) 

5873 fill_value = lib.no_default 

5874 

5875 if self.empty: 

5876 return self.copy() 

5877 

5878 axis = self._get_axis_number(axis) 

5879 

5880 if is_list_like(periods): 

5881 periods = cast(Sequence, periods) 

5882 if axis == 1: 

5883 raise ValueError( 

5884 "If `periods` contains multiple shifts, `axis` cannot be 1." 

5885 ) 

5886 if len(periods) == 0: 

5887 raise ValueError("If `periods` is an iterable, it cannot be empty.") 

5888 from pandas.core.reshape.concat import concat 

5889 

5890 shifted_dataframes = [] 

5891 for period in periods: 

5892 if not is_integer(period): 

5893 raise TypeError( 

5894 f"Periods must be integer, but {period} is {type(period)}." 

5895 ) 

5896 period = cast(int, period) 

5897 shifted_dataframes.append( 

5898 super() 

5899 .shift(periods=period, freq=freq, axis=axis, fill_value=fill_value) 

5900 .add_suffix(f"{suffix}_{period}" if suffix else f"_{period}") 

5901 ) 

5902 return concat(shifted_dataframes, axis=1) 

5903 elif suffix: 

5904 raise ValueError("Cannot specify `suffix` if `periods` is an int.") 

5905 periods = cast(int, periods) 

5906 

5907 ncols = len(self.columns) 

5908 arrays = self._mgr.arrays 

5909 if axis == 1 and periods != 0 and ncols > 0 and freq is None: 

5910 if fill_value is lib.no_default: 

5911 # We will infer fill_value to match the closest column 

5912 

5913 # Use a column that we know is valid for our column's dtype GH#38434 

5914 label = self.columns[0] 

5915 

5916 if periods > 0: 

5917 result = self.iloc[:, :-periods] 

5918 for col in range(min(ncols, abs(periods))): 

5919 # TODO(EA2D): doing this in a loop unnecessary with 2D EAs 

5920 # Define filler inside loop so we get a copy 

5921 filler = self.iloc[:, 0].shift(len(self)) 

5922 result.insert(0, label, filler, allow_duplicates=True) 

5923 else: 

5924 result = self.iloc[:, -periods:] 

5925 for col in range(min(ncols, abs(periods))): 

5926 # Define filler inside loop so we get a copy 

5927 filler = self.iloc[:, -1].shift(len(self)) 

5928 result.insert( 

5929 len(result.columns), label, filler, allow_duplicates=True 

5930 ) 

5931 

5932 result.columns = self.columns.copy() 

5933 return result 

5934 elif len(arrays) > 1 or ( 

5935 # If we only have one block and we know that we can't 

5936 # keep the same dtype (i.e. the _can_hold_element check) 

5937 # then we can go through the reindex_indexer path 

5938 # (and avoid casting logic in the Block method). 

5939 not can_hold_element(arrays[0], fill_value) 

5940 ): 

5941 # GH#35488 we need to watch out for multi-block cases 

5942 # We only get here with fill_value not-lib.no_default 

5943 nper = abs(periods) 

5944 nper = min(nper, ncols) 

5945 if periods > 0: 

5946 indexer = np.array( 

5947 [-1] * nper + list(range(ncols - periods)), dtype=np.intp 

5948 ) 

5949 else: 

5950 indexer = np.array( 

5951 list(range(nper, ncols)) + [-1] * nper, dtype=np.intp 

5952 ) 

5953 mgr = self._mgr.reindex_indexer( 

5954 self.columns, 

5955 indexer, 

5956 axis=0, 

5957 fill_value=fill_value, 

5958 allow_dups=True, 

5959 ) 

5960 res_df = self._constructor_from_mgr(mgr, axes=mgr.axes) 

5961 return res_df.__finalize__(self, method="shift") 

5962 else: 

5963 return self.T.shift(periods=periods, fill_value=fill_value).T 

5964 

5965 return super().shift( 

5966 periods=periods, freq=freq, axis=axis, fill_value=fill_value 

5967 ) 

5968 
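Editorial illustration of the list-of-periods branch above (assuming a pandas version that supports it): each period yields one shifted copy, suffixed with the period and concatenated column-wise.

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
print(df.shift([1, 2]).columns.tolist())              # ['a_1', 'a_2']
print(df.shift([1], suffix="_lag").columns.tolist())  # ['a_lag_1']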

5969 @overload 

5970 def set_index( 

5971 self, 

5972 keys, 

5973 *, 

5974 drop: bool = ..., 

5975 append: bool = ..., 

5976 inplace: Literal[False] = ..., 

5977 verify_integrity: bool = ..., 

5978 ) -> DataFrame: 

5979 ... 

5980 

5981 @overload 

5982 def set_index( 

5983 self, 

5984 keys, 

5985 *, 

5986 drop: bool = ..., 

5987 append: bool = ..., 

5988 inplace: Literal[True], 

5989 verify_integrity: bool = ..., 

5990 ) -> None: 

5991 ... 

5992 

5993 def set_index( 

5994 self, 

5995 keys, 

5996 *, 

5997 drop: bool = True, 

5998 append: bool = False, 

5999 inplace: bool = False, 

6000 verify_integrity: bool = False, 

6001 ) -> DataFrame | None: 

6002 """ 

6003 Set the DataFrame index using existing columns. 

6004 

6005 Set the DataFrame index (row labels) using one or more existing 

6006 columns or arrays (of the correct length). The index can replace the 

6007 existing index or expand on it. 

6008 

6009 Parameters 

6010 ---------- 

6011 keys : label or array-like or list of labels/arrays 

6012 This parameter can be either a single column key, a single array of 

6013 the same length as the calling DataFrame, or a list containing an 

6014 arbitrary combination of column keys and arrays. Here, "array" 

6015 encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and 

6016 instances of :class:`~collections.abc.Iterator`. 

6017 drop : bool, default True 

6018 Delete columns to be used as the new index. 

6019 append : bool, default False 

6020 Whether to append columns to existing index. 

6021 inplace : bool, default False 

6022 Whether to modify the DataFrame rather than creating a new one. 

6023 verify_integrity : bool, default False 

6024 Check the new index for duplicates. Otherwise defer the check until 

6025 necessary. Setting to False will improve the performance of this 

6026 method. 

6027 

6028 Returns 

6029 ------- 

6030 DataFrame or None 

6031 Changed row labels or None if ``inplace=True``. 

6032 

6033 See Also 

6034 -------- 

6035 DataFrame.reset_index : Opposite of set_index. 

6036 DataFrame.reindex : Change to new indices or expand indices. 

6037 DataFrame.reindex_like : Change to same indices as other DataFrame. 

6038 

6039 Examples 

6040 -------- 

6041 >>> df = pd.DataFrame({'month': [1, 4, 7, 10], 

6042 ... 'year': [2012, 2014, 2013, 2014], 

6043 ... 'sale': [55, 40, 84, 31]}) 

6044 >>> df 

6045 month year sale 

6046 0 1 2012 55 

6047 1 4 2014 40 

6048 2 7 2013 84 

6049 3 10 2014 31 

6050 

6051 Set the index to become the 'month' column: 

6052 

6053 >>> df.set_index('month') 

6054 year sale 

6055 month 

6056 1 2012 55 

6057 4 2014 40 

6058 7 2013 84 

6059 10 2014 31 

6060 

6061 Create a MultiIndex using columns 'year' and 'month': 

6062 

6063 >>> df.set_index(['year', 'month']) 

6064 sale 

6065 year month 

6066 2012 1 55 

6067 2014 4 40 

6068 2013 7 84 

6069 2014 10 31 

6070 

6071 Create a MultiIndex using an Index and a column: 

6072 

6073 >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year']) 

6074 month sale 

6075 year 

6076 1 2012 1 55 

6077 2 2014 4 40 

6078 3 2013 7 84 

6079 4 2014 10 31 

6080 

6081 Create a MultiIndex using two Series: 

6082 

6083 >>> s = pd.Series([1, 2, 3, 4]) 

6084 >>> df.set_index([s, s**2]) 

6085 month year sale 

6086 1 1 1 2012 55 

6087 2 4 4 2014 40 

6088 3 9 7 2013 84 

6089 4 16 10 2014 31 

6090 """ 

6091 inplace = validate_bool_kwarg(inplace, "inplace") 

6092 self._check_inplace_and_allows_duplicate_labels(inplace) 

6093 if not isinstance(keys, list): 

6094 keys = [keys] 

6095 

6096 err_msg = ( 

6097 'The parameter "keys" may be a column key, one-dimensional ' 

6098 "array, or a list containing only valid column keys and " 

6099 "one-dimensional arrays." 

6100 ) 

6101 

6102 missing: list[Hashable] = [] 

6103 for col in keys: 

6104 if isinstance(col, (Index, Series, np.ndarray, list, abc.Iterator)): 

6105 # arrays are fine as long as they are one-dimensional 

6106 # iterators get converted to list below 

6107 if getattr(col, "ndim", 1) != 1: 

6108 raise ValueError(err_msg) 

6109 else: 

6110 # everything else gets tried as a key; see GH 24969 

6111 try: 

6112 found = col in self.columns 

6113 except TypeError as err: 

6114 raise TypeError( 

6115 f"{err_msg}. Received column of type {type(col)}" 

6116 ) from err 

6117 else: 

6118 if not found: 

6119 missing.append(col) 

6120 

6121 if missing: 

6122 raise KeyError(f"None of {missing} are in the columns") 

6123 

6124 if inplace: 

6125 frame = self 

6126 else: 

6127 # GH 49473 Use "lazy copy" with Copy-on-Write 

6128 frame = self.copy(deep=None) 

6129 

6130 arrays: list[Index] = [] 

6131 names: list[Hashable] = [] 

6132 if append: 

6133 names = list(self.index.names) 

6134 if isinstance(self.index, MultiIndex): 

6135 arrays.extend( 

6136 self.index._get_level_values(i) for i in range(self.index.nlevels) 

6137 ) 

6138 else: 

6139 arrays.append(self.index) 

6140 

6141 to_remove: list[Hashable] = [] 

6142 for col in keys: 

6143 if isinstance(col, MultiIndex): 

6144 arrays.extend(col._get_level_values(n) for n in range(col.nlevels)) 

6145 names.extend(col.names) 

6146 elif isinstance(col, (Index, Series)): 

6147 # if Index then not MultiIndex (treated above) 

6148 

6149 # error: Argument 1 to "append" of "list" has incompatible type 

6150 # "Union[Index, Series]"; expected "Index" 

6151 arrays.append(col) # type: ignore[arg-type] 

6152 names.append(col.name) 

6153 elif isinstance(col, (list, np.ndarray)): 

6154 # error: Argument 1 to "append" of "list" has incompatible type 

6155 # "Union[List[Any], ndarray]"; expected "Index" 

6156 arrays.append(col) # type: ignore[arg-type] 

6157 names.append(None) 

6158 elif isinstance(col, abc.Iterator): 

6159 # error: Argument 1 to "append" of "list" has incompatible type 

6160 # "List[Any]"; expected "Index" 

6161 arrays.append(list(col)) # type: ignore[arg-type] 

6162 names.append(None) 

6163 # from here, col can only be a column label 

6164 else: 

6165 arrays.append(frame[col]) 

6166 names.append(col) 

6167 if drop: 

6168 to_remove.append(col) 

6169 

6170 if len(arrays[-1]) != len(self): 

6171 # check newest element against length of calling frame, since 

6172 # ensure_index_from_sequences would not raise for append=False. 

6173 raise ValueError( 

6174 f"Length mismatch: Expected {len(self)} rows, " 

6175 f"received array of length {len(arrays[-1])}" 

6176 ) 

6177 

6178 index = ensure_index_from_sequences(arrays, names) 

6179 

6180 if verify_integrity and not index.is_unique: 

6181 duplicates = index[index.duplicated()].unique() 

6182 raise ValueError(f"Index has duplicate keys: {duplicates}") 

6183 

6184 # use set to handle duplicate column names gracefully in case of drop 

6185 for c in set(to_remove): 

6186 del frame[c] 

6187 

6188 # clear up memory usage 

6189 index._cleanup() 

6190 

6191 frame.index = index 

6192 

6193 if not inplace: 

6194 return frame 

6195 return None 

6196 
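Editorial illustration of the verify_integrity check above: duplicate keys in the prospective index raise immediately rather than deferring the check. Public-API sketch:

import pandas as pd

df = pd.DataFrame({"k": [1, 1], "v": [10, 20]})
try:
    df.set_index("k", verify_integrity=True)
except ValueError as exc:
    print(exc)  # "Index has duplicate keys: ..." -- raised before the frame is changed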

6197 @overload 

6198 def reset_index( 

6199 self, 

6200 level: IndexLabel = ..., 

6201 *, 

6202 drop: bool = ..., 

6203 inplace: Literal[False] = ..., 

6204 col_level: Hashable = ..., 

6205 col_fill: Hashable = ..., 

6206 allow_duplicates: bool | lib.NoDefault = ..., 

6207 names: Hashable | Sequence[Hashable] | None = None, 

6208 ) -> DataFrame: 

6209 ... 

6210 

6211 @overload 

6212 def reset_index( 

6213 self, 

6214 level: IndexLabel = ..., 

6215 *, 

6216 drop: bool = ..., 

6217 inplace: Literal[True], 

6218 col_level: Hashable = ..., 

6219 col_fill: Hashable = ..., 

6220 allow_duplicates: bool | lib.NoDefault = ..., 

6221 names: Hashable | Sequence[Hashable] | None = None, 

6222 ) -> None: 

6223 ... 

6224 

6225 @overload 

6226 def reset_index( 

6227 self, 

6228 level: IndexLabel = ..., 

6229 *, 

6230 drop: bool = ..., 

6231 inplace: bool = ..., 

6232 col_level: Hashable = ..., 

6233 col_fill: Hashable = ..., 

6234 allow_duplicates: bool | lib.NoDefault = ..., 

6235 names: Hashable | Sequence[Hashable] | None = None, 

6236 ) -> DataFrame | None: 

6237 ... 

6238 

6239 def reset_index( 

6240 self, 

6241 level: IndexLabel | None = None, 

6242 *, 

6243 drop: bool = False, 

6244 inplace: bool = False, 

6245 col_level: Hashable = 0, 

6246 col_fill: Hashable = "", 

6247 allow_duplicates: bool | lib.NoDefault = lib.no_default, 

6248 names: Hashable | Sequence[Hashable] | None = None, 

6249 ) -> DataFrame | None: 

6250 """ 

6251 Reset the index, or a level of it. 

6252 

6253 Reset the index of the DataFrame, and use the default one instead. 

6254 If the DataFrame has a MultiIndex, this method can remove one or more 

6255 levels. 

6256 

6257 Parameters 

6258 ---------- 

6259 level : int, str, tuple, or list, default None 

6260 Only remove the given levels from the index. Removes all levels by 

6261 default. 

6262 drop : bool, default False 

6263 Do not try to insert index into dataframe columns. This resets 

6264 the index to the default integer index. 

6265 inplace : bool, default False 

6266 Whether to modify the DataFrame rather than creating a new one. 

6267 col_level : int or str, default 0 

6268 If the columns have multiple levels, determines which level the 

6269 labels are inserted into. By default it is inserted into the first 

6270 level. 

6271 col_fill : object, default '' 

6272 If the columns have multiple levels, determines how the other 

6273 levels are named. If None then the index name is repeated. 

6274 allow_duplicates : bool, optional, default lib.no_default 

6275 Allow duplicate column labels to be created. 

6276 

6277 .. versionadded:: 1.5.0 

6278 

6279 names : int, str or 1-dimensional list, default None 

6280 Using the given string, rename the DataFrame column which contains the 

6281 index data. If the DataFrame has a MultiIndex, this has to be a list or 

6282 tuple with length equal to the number of levels. 

6283 

6284 .. versionadded:: 1.5.0 

6285 

6286 Returns 

6287 ------- 

6288 DataFrame or None 

6289 DataFrame with the new index or None if ``inplace=True``. 

6290 

6291 See Also 

6292 -------- 

6293 DataFrame.set_index : Opposite of reset_index. 

6294 DataFrame.reindex : Change to new indices or expand indices. 

6295 DataFrame.reindex_like : Change to same indices as other DataFrame. 

6296 

6297 Examples 

6298 -------- 

6299 >>> df = pd.DataFrame([('bird', 389.0), 

6300 ... ('bird', 24.0), 

6301 ... ('mammal', 80.5), 

6302 ... ('mammal', np.nan)], 

6303 ... index=['falcon', 'parrot', 'lion', 'monkey'], 

6304 ... columns=('class', 'max_speed')) 

6305 >>> df 

6306 class max_speed 

6307 falcon bird 389.0 

6308 parrot bird 24.0 

6309 lion mammal 80.5 

6310 monkey mammal NaN 

6311 

6312 When we reset the index, the old index is added as a column, and a 

6313 new sequential index is used: 

6314 

6315 >>> df.reset_index() 

6316 index class max_speed 

6317 0 falcon bird 389.0 

6318 1 parrot bird 24.0 

6319 2 lion mammal 80.5 

6320 3 monkey mammal NaN 

6321 

6322 We can use the `drop` parameter to avoid the old index being added as 

6323 a column: 

6324 

6325 >>> df.reset_index(drop=True) 

6326 class max_speed 

6327 0 bird 389.0 

6328 1 bird 24.0 

6329 2 mammal 80.5 

6330 3 mammal NaN 

6331 

6332 You can also use `reset_index` with `MultiIndex`. 

6333 

6334 >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'), 

6335 ... ('bird', 'parrot'), 

6336 ... ('mammal', 'lion'), 

6337 ... ('mammal', 'monkey')], 

6338 ... names=['class', 'name']) 

6339 >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'), 

6340 ... ('species', 'type')]) 

6341 >>> df = pd.DataFrame([(389.0, 'fly'), 

6342 ... (24.0, 'fly'), 

6343 ... (80.5, 'run'), 

6344 ... (np.nan, 'jump')], 

6345 ... index=index, 

6346 ... columns=columns) 

6347 >>> df 

6348 speed species 

6349 max type 

6350 class name 

6351 bird falcon 389.0 fly 

6352 parrot 24.0 fly 

6353 mammal lion 80.5 run 

6354 monkey NaN jump 

6355 

6356 Using the `names` parameter, choose a name for the index column: 

6357 

6358 >>> df.reset_index(names=['classes', 'names']) 

6359 classes names speed species 

6360 max type 

6361 0 bird falcon 389.0 fly 

6362 1 bird parrot 24.0 fly 

6363 2 mammal lion 80.5 run 

6364 3 mammal monkey NaN jump 

6365 

6366 If the index has multiple levels, we can reset a subset of them: 

6367 

6368 >>> df.reset_index(level='class') 

6369 class speed species 

6370 max type 

6371 name 

6372 falcon bird 389.0 fly 

6373 parrot bird 24.0 fly 

6374 lion mammal 80.5 run 

6375 monkey mammal NaN jump 

6376 

6377 If we are not dropping the index, by default, it is placed in the top 

6378 level. We can place it in another level: 

6379 

6380 >>> df.reset_index(level='class', col_level=1) 

6381 speed species 

6382 class max type 

6383 name 

6384 falcon bird 389.0 fly 

6385 parrot bird 24.0 fly 

6386 lion mammal 80.5 run 

6387 monkey mammal NaN jump 

6388 

6389 When the index is inserted under another level, we can specify under 

6390 which one with the parameter `col_fill`: 

6391 

6392 >>> df.reset_index(level='class', col_level=1, col_fill='species') 

6393 species speed species 

6394 class max type 

6395 name 

6396 falcon bird 389.0 fly 

6397 parrot bird 24.0 fly 

6398 lion mammal 80.5 run 

6399 monkey mammal NaN jump 

6400 

6401 If we specify a nonexistent level for `col_fill`, it is created: 

6402 

6403 >>> df.reset_index(level='class', col_level=1, col_fill='genus') 

6404 genus speed species 

6405 class max type 

6406 name 

6407 falcon bird 389.0 fly 

6408 parrot bird 24.0 fly 

6409 lion mammal 80.5 run 

6410 monkey mammal NaN jump 

6411 """ 

6412 inplace = validate_bool_kwarg(inplace, "inplace") 

6413 self._check_inplace_and_allows_duplicate_labels(inplace) 

6414 if inplace: 

6415 new_obj = self 

6416 else: 

6417 new_obj = self.copy(deep=None) 

6418 if allow_duplicates is not lib.no_default: 

6419 allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates") 

6420 

6421 new_index = default_index(len(new_obj)) 

6422 if level is not None: 

6423 if not isinstance(level, (tuple, list)): 

6424 level = [level] 

6425 level = [self.index._get_level_number(lev) for lev in level] 

6426 if len(level) < self.index.nlevels: 

6427 new_index = self.index.droplevel(level) 

6428 

6429 if not drop: 

6430 to_insert: Iterable[tuple[Any, Any | None]] 

6431 

6432 default = "index" if "index" not in self else "level_0" 

6433 names = self.index._get_default_index_names(names, default) 

6434 

6435 if isinstance(self.index, MultiIndex): 

6436 to_insert = zip(self.index.levels, self.index.codes) 

6437 else: 

6438 to_insert = ((self.index, None),) 

6439 

6440 multi_col = isinstance(self.columns, MultiIndex) 

6441 for i, (lev, lab) in reversed(list(enumerate(to_insert))): 

6442 if level is not None and i not in level: 

6443 continue 

6444 name = names[i] 

6445 if multi_col: 

6446 col_name = list(name) if isinstance(name, tuple) else [name] 

6447 if col_fill is None: 

6448 if len(col_name) not in (1, self.columns.nlevels): 

6449 raise ValueError( 

6450 "col_fill=None is incompatible " 

6451 f"with incomplete column name {name}" 

6452 ) 

6453 col_fill = col_name[0] 

6454 

6455 lev_num = self.columns._get_level_number(col_level) 

6456 name_lst = [col_fill] * lev_num + col_name 

6457 missing = self.columns.nlevels - len(name_lst) 

6458 name_lst += [col_fill] * missing 

6459 name = tuple(name_lst) 

6460 

6461 # to ndarray and maybe infer different dtype 

6462 level_values = lev._values 

6463 if level_values.dtype == np.object_: 

6464 level_values = lib.maybe_convert_objects(level_values) 

6465 

6466 if lab is not None: 

6467 # if we have the codes, extract the values with a mask 

6468 level_values = algorithms.take( 

6469 level_values, lab, allow_fill=True, fill_value=lev._na_value 

6470 ) 

6471 

6472 new_obj.insert( 

6473 0, 

6474 name, 

6475 level_values, 

6476 allow_duplicates=allow_duplicates, 

6477 ) 

6478 

6479 new_obj.index = new_index 

6480 if not inplace: 

6481 return new_obj 

6482 

6483 return None 

6484 

6485 # ---------------------------------------------------------------------- 

6486 # Reindex-based selection methods 

6487 

6488 @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) 

6489 def isna(self) -> DataFrame: 

6490 res_mgr = self._mgr.isna(func=isna) 

6491 result = self._constructor_from_mgr(res_mgr, axes=res_mgr.axes) 

6492 return result.__finalize__(self, method="isna") 

6493 

6494 @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) 

6495 def isnull(self) -> DataFrame: 

6496 """ 

6497 DataFrame.isnull is an alias for DataFrame.isna. 

6498 """ 

6499 return self.isna() 

6500 

6501 @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) 

6502 def notna(self) -> DataFrame: 

6503 return ~self.isna() 

6504 

6505 @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) 

6506 def notnull(self) -> DataFrame: 

6507 """ 

6508 DataFrame.notnull is an alias for DataFrame.notna. 

6509 """ 

6510 return ~self.isna() 

6511 

6512 @overload 

6513 def dropna( 

6514 self, 

6515 *, 

6516 axis: Axis = ..., 

6517 how: AnyAll | lib.NoDefault = ..., 

6518 thresh: int | lib.NoDefault = ..., 

6519 subset: IndexLabel = ..., 

6520 inplace: Literal[False] = ..., 

6521 ignore_index: bool = ..., 

6522 ) -> DataFrame: 

6523 ... 

6524 

6525 @overload 

6526 def dropna( 

6527 self, 

6528 *, 

6529 axis: Axis = ..., 

6530 how: AnyAll | lib.NoDefault = ..., 

6531 thresh: int | lib.NoDefault = ..., 

6532 subset: IndexLabel = ..., 

6533 inplace: Literal[True], 

6534 ignore_index: bool = ..., 

6535 ) -> None: 

6536 ... 

6537 

6538 def dropna( 

6539 self, 

6540 *, 

6541 axis: Axis = 0, 

6542 how: AnyAll | lib.NoDefault = lib.no_default, 

6543 thresh: int | lib.NoDefault = lib.no_default, 

6544 subset: IndexLabel | None = None, 

6545 inplace: bool = False, 

6546 ignore_index: bool = False, 

6547 ) -> DataFrame | None: 

6548 """ 

6549 Remove missing values. 

6550 

6551 See the :ref:`User Guide <missing_data>` for more on which values are 

6552 considered missing, and how to work with missing data. 

6553 

6554 Parameters 

6555 ---------- 

6556 axis : {0 or 'index', 1 or 'columns'}, default 0 

6557 Determine if rows or columns which contain missing values are 

6558 removed. 

6559 

6560 * 0, or 'index' : Drop rows which contain missing values. 

6561 * 1, or 'columns' : Drop columns which contain missing values. 

6562 

6563 Only a single axis is allowed. 

6564 

6565 how : {'any', 'all'}, default 'any' 

6566 Determine whether a row or column is removed from the DataFrame when it 

6567 has at least one NA or all NA values. 

6568 

6569 * 'any' : If any NA values are present, drop that row or column. 

6570 * 'all' : If all values are NA, drop that row or column. 

6571 

6572 thresh : int, optional 

6573 Require at least that many non-NA values. Cannot be combined with ``how``. 

6574 subset : column label or sequence of labels, optional 

6575 Labels along the other axis to consider, e.g. if you are dropping rows 

6576 these would be a list of columns to include. 

6577 inplace : bool, default False 

6578 Whether to modify the DataFrame rather than creating a new one. 

6579 ignore_index : bool, default ``False`` 

6580 If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. 

6581 

6582 .. versionadded:: 2.0.0 

6583 

6584 Returns 

6585 ------- 

6586 DataFrame or None 

6587 DataFrame with NA entries dropped from it or None if ``inplace=True``. 

6588 

6589 See Also 

6590 -------- 

6591 DataFrame.isna: Indicate missing values. 

6592 DataFrame.notna : Indicate existing (non-missing) values. 

6593 DataFrame.fillna : Replace missing values. 

6594 Series.dropna : Drop missing values. 

6595 Index.dropna : Drop missing indices. 

6596 

6597 Examples 

6598 -------- 

6599 >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], 

6600 ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], 

6601 ... "born": [pd.NaT, pd.Timestamp("1940-04-25"), 

6602 ... pd.NaT]}) 

6603 >>> df 

6604 name toy born 

6605 0 Alfred NaN NaT 

6606 1 Batman Batmobile 1940-04-25 

6607 2 Catwoman Bullwhip NaT 

6608 

6609 Drop the rows where at least one element is missing. 

6610 

6611 >>> df.dropna() 

6612 name toy born 

6613 1 Batman Batmobile 1940-04-25 

6614 

6615 Drop the columns where at least one element is missing. 

6616 

6617 >>> df.dropna(axis='columns') 

6618 name 

6619 0 Alfred 

6620 1 Batman 

6621 2 Catwoman 

6622 

6623 Drop the rows where all elements are missing. 

6624 

6625 >>> df.dropna(how='all') 

6626 name toy born 

6627 0 Alfred NaN NaT 

6628 1 Batman Batmobile 1940-04-25 

6629 2 Catwoman Bullwhip NaT 

6630 

6631 Keep only the rows with at least 2 non-NA values. 

6632 

6633 >>> df.dropna(thresh=2) 

6634 name toy born 

6635 1 Batman Batmobile 1940-04-25 

6636 2 Catwoman Bullwhip NaT 

6637 

6638 Define in which columns to look for missing values. 

6639 

6640 >>> df.dropna(subset=['name', 'toy']) 

6641 name toy born 

6642 1 Batman Batmobile 1940-04-25 

6643 2 Catwoman Bullwhip NaT 

6644 """ 

6645 if (how is not lib.no_default) and (thresh is not lib.no_default): 

6646 raise TypeError( 

6647 "You cannot set both the how and thresh arguments at the same time." 

6648 ) 

6649 

6650 if how is lib.no_default: 

6651 how = "any" 

6652 

6653 inplace = validate_bool_kwarg(inplace, "inplace") 

6654 if isinstance(axis, (tuple, list)): 

6655 # GH20987 

6656 raise TypeError("supplying multiple axes to axis is no longer supported.") 

6657 

6658 axis = self._get_axis_number(axis) 

6659 agg_axis = 1 - axis 

6660 

6661 agg_obj = self 

6662 if subset is not None: 

6663 # subset needs to be list 

6664 if not is_list_like(subset): 

6665 subset = [subset] 

6666 ax = self._get_axis(agg_axis) 

6667 indices = ax.get_indexer_for(subset) 

6668 check = indices == -1 

6669 if check.any(): 

6670 raise KeyError(np.array(subset)[check].tolist()) 

6671 agg_obj = self.take(indices, axis=agg_axis) 

6672 

6673 if thresh is not lib.no_default: 

6674 count = agg_obj.count(axis=agg_axis) 

6675 mask = count >= thresh 

6676 elif how == "any": 

6677 # faster equivalent to 'agg_obj.count(agg_axis) == self.shape[agg_axis]' 

6678 mask = notna(agg_obj).all(axis=agg_axis, bool_only=False) 

6679 elif how == "all": 

6680 # faster equivalent to 'agg_obj.count(agg_axis) > 0' 

6681 mask = notna(agg_obj).any(axis=agg_axis, bool_only=False) 

6682 else: 

6683 raise ValueError(f"invalid how option: {how}") 

6684 

6685 if np.all(mask): 

6686 result = self.copy(deep=None) 

6687 else: 

6688 result = self.loc(axis=axis)[mask] 

6689 

6690 if ignore_index: 

6691 result.index = default_index(len(result)) 

6692 

6693 if not inplace: 

6694 return result 

6695 self._update_inplace(result) 

6696 return None 

6697 
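Editorial illustration of the how/thresh handling above: the two arguments are mutually exclusive, and thresh keeps labels with at least that many non-NA values. Public-API sketch:

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1.0, np.nan, np.nan], "b": [1.0, 2.0, np.nan]})
print(df.dropna(thresh=1).index.tolist())  # [0, 1] -- row 2 has no non-NA values
try:
    df.dropna(how="all", thresh=1)
except TypeError as exc:
    print(exc)  # supplying both how and thresh raises, per the check above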

6698 @overload 

6699 def drop_duplicates( 

6700 self, 

6701 subset: Hashable | Sequence[Hashable] | None = ..., 

6702 *, 

6703 keep: DropKeep = ..., 

6704 inplace: Literal[True], 

6705 ignore_index: bool = ..., 

6706 ) -> None: 

6707 ... 

6708 

6709 @overload 

6710 def drop_duplicates( 

6711 self, 

6712 subset: Hashable | Sequence[Hashable] | None = ..., 

6713 *, 

6714 keep: DropKeep = ..., 

6715 inplace: Literal[False] = ..., 

6716 ignore_index: bool = ..., 

6717 ) -> DataFrame: 

6718 ... 

6719 

6720 @overload 

6721 def drop_duplicates( 

6722 self, 

6723 subset: Hashable | Sequence[Hashable] | None = ..., 

6724 *, 

6725 keep: DropKeep = ..., 

6726 inplace: bool = ..., 

6727 ignore_index: bool = ..., 

6728 ) -> DataFrame | None: 

6729 ... 

6730 

6731 def drop_duplicates( 

6732 self, 

6733 subset: Hashable | Sequence[Hashable] | None = None, 

6734 *, 

6735 keep: DropKeep = "first", 

6736 inplace: bool = False, 

6737 ignore_index: bool = False, 

6738 ) -> DataFrame | None: 

6739 """ 

6740 Return DataFrame with duplicate rows removed. 

6741 

6742 Considering certain columns is optional. Indexes, including time indexes, 

6743 are ignored. 

6744 

6745 Parameters 

6746 ---------- 

6747 subset : column label or sequence of labels, optional 

6748 Only consider certain columns for identifying duplicates, by 

6749 default use all of the columns. 

6750 keep : {'first', 'last', ``False``}, default 'first' 

6751 Determines which duplicates (if any) to keep. 

6752 

6753 - 'first' : Drop duplicates except for the first occurrence. 

6754 - 'last' : Drop duplicates except for the last occurrence. 

6755 - ``False`` : Drop all duplicates. 

6756 

6757 inplace : bool, default ``False`` 

6758 Whether to modify the DataFrame rather than creating a new one. 

6759 ignore_index : bool, default ``False`` 

6760 If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. 

6761 

6762 Returns 

6763 ------- 

6764 DataFrame or None 

6765 DataFrame with duplicates removed or None if ``inplace=True``. 

6766 

6767 See Also 

6768 -------- 

6769 DataFrame.value_counts: Count unique combinations of columns. 

6770 

6771 Examples 

6772 -------- 

6773 Consider a dataset containing ramen ratings. 

6774 

6775 >>> df = pd.DataFrame({ 

6776 ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], 

6777 ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], 

6778 ... 'rating': [4, 4, 3.5, 15, 5] 

6779 ... }) 

6780 >>> df 

6781 brand style rating 

6782 0 Yum Yum cup 4.0 

6783 1 Yum Yum cup 4.0 

6784 2 Indomie cup 3.5 

6785 3 Indomie pack 15.0 

6786 4 Indomie pack 5.0 

6787 

6788 By default, it removes duplicate rows based on all columns. 

6789 

6790 >>> df.drop_duplicates() 

6791 brand style rating 

6792 0 Yum Yum cup 4.0 

6793 2 Indomie cup 3.5 

6794 3 Indomie pack 15.0 

6795 4 Indomie pack 5.0 

6796 

6797 To remove duplicates on specific column(s), use ``subset``. 

6798 

6799 >>> df.drop_duplicates(subset=['brand']) 

6800 brand style rating 

6801 0 Yum Yum cup 4.0 

6802 2 Indomie cup 3.5 

6803 

6804 To remove duplicates and keep last occurrences, use ``keep``. 

6805 

6806 >>> df.drop_duplicates(subset=['brand', 'style'], keep='last') 

6807 brand style rating 

6808 1 Yum Yum cup 4.0 

6809 2 Indomie cup 3.5 

6810 4 Indomie pack 5.0 

6811 """ 

6812 if self.empty: 

6813 return self.copy(deep=None) 

6814 

6815 inplace = validate_bool_kwarg(inplace, "inplace") 

6816 ignore_index = validate_bool_kwarg(ignore_index, "ignore_index") 

6817 

6818 result = self[-self.duplicated(subset, keep=keep)] 

6819 if ignore_index: 

6820 result.index = default_index(len(result)) 

6821 

6822 if inplace: 

6823 self._update_inplace(result) 

6824 return None 

6825 else: 

6826 return result 

6827 

6828 def duplicated( 

6829 self, 

6830 subset: Hashable | Sequence[Hashable] | None = None, 

6831 keep: DropKeep = "first", 

6832 ) -> Series: 

6833 """ 

6834 Return boolean Series denoting duplicate rows. 

6835 

6836 Considering certain columns is optional. 

6837 

6838 Parameters 

6839 ---------- 

6840 subset : column label or sequence of labels, optional 

6841 Only consider certain columns for identifying duplicates, by 

6842 default use all of the columns. 

6843 keep : {'first', 'last', False}, default 'first' 

6844 Determines which duplicates (if any) to mark. 

6845 

6846 - ``first`` : Mark duplicates as ``True`` except for the first occurrence. 

6847 - ``last`` : Mark duplicates as ``True`` except for the last occurrence. 

6848 - False : Mark all duplicates as ``True``. 

6849 

6850 Returns 

6851 ------- 

6852 Series 

6853 Boolean Series indicating duplicated rows. 

6854 

6855 See Also 

6856 -------- 

6857 Index.duplicated : Equivalent method on index. 

6858 Series.duplicated : Equivalent method on Series. 

6859 Series.drop_duplicates : Remove duplicate values from Series. 

6860 DataFrame.drop_duplicates : Remove duplicate values from DataFrame. 

6861 

6862 Examples 

6863 -------- 

6864 Consider a dataset containing ramen ratings. 

6865 

6866 >>> df = pd.DataFrame({ 

6867 ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], 

6868 ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], 

6869 ... 'rating': [4, 4, 3.5, 15, 5] 

6870 ... }) 

6871 >>> df 

6872 brand style rating 

6873 0 Yum Yum cup 4.0 

6874 1 Yum Yum cup 4.0 

6875 2 Indomie cup 3.5 

6876 3 Indomie pack 15.0 

6877 4 Indomie pack 5.0 

6878 

6879 By default, for each set of duplicated values, the first occurrence 

6880 is set to False and all others to True. 

6881 

6882 >>> df.duplicated() 

6883 0 False 

6884 1 True 

6885 2 False 

6886 3 False 

6887 4 False 

6888 dtype: bool 

6889 

6890 By using 'last', the last occurrence of each set of duplicated values 

6891 is set to False and all others to True. 

6892 

6893 >>> df.duplicated(keep='last') 

6894 0 True 

6895 1 False 

6896 2 False 

6897 3 False 

6898 4 False 

6899 dtype: bool 

6900 

6901 By setting ``keep`` to False, all duplicates are marked True. 

6902 

6903 >>> df.duplicated(keep=False) 

6904 0 True 

6905 1 True 

6906 2 False 

6907 3 False 

6908 4 False 

6909 dtype: bool 

6910 

6911 To find duplicates on specific column(s), use ``subset``. 

6912 

6913 >>> df.duplicated(subset=['brand']) 

6914 0 False 

6915 1 True 

6916 2 False 

6917 3 True 

6918 4 True 

6919 dtype: bool 

6920 """ 

6921 

6922 if self.empty: 

6923 return self._constructor_sliced(dtype=bool) 

6924 

6925 def f(vals) -> tuple[np.ndarray, int]: 

6926 labels, shape = algorithms.factorize(vals, size_hint=len(self)) 

6927 return labels.astype("i8", copy=False), len(shape) 

6928 

6929 if subset is None: 

6930 # https://github.com/pandas-dev/pandas/issues/28770 

6931 # Incompatible types in assignment (expression has type "Index", variable 

6932 # has type "Sequence[Any]") 

6933 subset = self.columns # type: ignore[assignment] 

6934 elif ( 

6935 not np.iterable(subset) 

6936 or isinstance(subset, str) 

6937 or isinstance(subset, tuple) 

6938 and subset in self.columns 

6939 ): 

6940 subset = (subset,) 

6941 

6942 # needed for mypy since can't narrow types using np.iterable 

6943 subset = cast(Sequence, subset) 

6944 

6945 # Verify all columns in subset exist in the queried dataframe 

6946 # Otherwise, raise a KeyError, same as if you try to __getitem__ with a 

6947 # key that doesn't exist. 

6948 diff = set(subset) - set(self.columns) 

6949 if diff: 

6950 raise KeyError(Index(diff)) 

6951 

6952 if len(subset) == 1 and self.columns.is_unique: 

6953 # GH#45236 This is faster than get_group_index below 

6954 result = self[subset[0]].duplicated(keep) 

6955 result.name = None 

6956 else: 

6957 vals = (col.values for name, col in self.items() if name in subset) 

6958 labels, shape = map(list, zip(*map(f, vals))) 

6959 

6960 ids = get_group_index(labels, tuple(shape), sort=False, xnull=False) 

6961 result = self._constructor_sliced(duplicated(ids, keep), index=self.index) 

6962 return result.__finalize__(self, method="duplicated") 

6963 
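# --- Illustrative sketch, not part of frame.py -------------------------------
# How multi-column duplicate detection can be reduced to duplicates on a single
# integer key: factorize each column, combine the per-column codes into one
# flat group id (the role played by get_group_index above), then mark repeats.
# A rough public-API sketch; the real implementation works on internal codes.
import numpy as np
import pandas as pd

df = pd.DataFrame({"brand": ["Yum Yum", "Yum Yum", "Indomie", "Indomie", "Indomie"],
                   "style": ["cup", "cup", "cup", "pack", "pack"]})

codes, sizes = [], []
for col in df.columns:
    col_codes, uniques = pd.factorize(df[col])
    codes.append(col_codes)
    sizes.append(len(uniques))

# One flat id per row; rows sharing an id are duplicates of each other.
ids = np.ravel_multi_index(codes, dims=sizes)
mask = pd.Series(ids, index=df.index).duplicated(keep="first")
print(mask.tolist())  # [False, True, False, False, True]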

6964 # ---------------------------------------------------------------------- 

6965 # Sorting 

6966 # error: Signature of "sort_values" incompatible with supertype "NDFrame" 

6967 @overload # type: ignore[override] 

6968 def sort_values( 

6969 self, 

6970 by: IndexLabel, 

6971 *, 

6972 axis: Axis = ..., 

6973 ascending=..., 

6974 inplace: Literal[False] = ..., 

6975 kind: SortKind = ..., 

6976 na_position: NaPosition = ..., 

6977 ignore_index: bool = ..., 

6978 key: ValueKeyFunc = ..., 

6979 ) -> DataFrame: 

6980 ... 

6981 

6982 @overload 

6983 def sort_values( 

6984 self, 

6985 by: IndexLabel, 

6986 *, 

6987 axis: Axis = ..., 

6988 ascending=..., 

6989 inplace: Literal[True], 

6990 kind: SortKind = ..., 

6991 na_position: str = ..., 

6992 ignore_index: bool = ..., 

6993 key: ValueKeyFunc = ..., 

6994 ) -> None: 

6995 ... 

6996 

6997 def sort_values( 

6998 self, 

6999 by: IndexLabel, 

7000 *, 

7001 axis: Axis = 0, 

7002 ascending: bool | list[bool] | tuple[bool, ...] = True, 

7003 inplace: bool = False, 

7004 kind: SortKind = "quicksort", 

7005 na_position: str = "last", 

7006 ignore_index: bool = False, 

7007 key: ValueKeyFunc | None = None, 

7008 ) -> DataFrame | None: 

7009 """ 

7010 Sort by the values along either axis. 

7011 

7012 Parameters 

7013 ---------- 

7014 by : str or list of str 

7015 Name or list of names to sort by. 

7016 

7017 - if `axis` is 0 or `'index'` then `by` may contain index 

7018 levels and/or column labels. 

7019 - if `axis` is 1 or `'columns'` then `by` may contain column 

7020 levels and/or index labels. 

7021 axis : {0 or 'index', 1 or 'columns'}, default 0 

7022 Axis to be sorted. 

7023 ascending : bool or list of bool, default True 

7024 Sort ascending vs. descending. Specify list for multiple sort 

7025 orders. If this is a list of bools, it must match the length of 

7026 ``by``. 

7027 inplace : bool, default False 

7028 If True, perform operation in-place. 

7029 kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort' 

7030 Choice of sorting algorithm. See also :func:`numpy.sort` for more 

7031 information. `mergesort` and `stable` are the only stable algorithms. For 

7032 DataFrames, this option is only applied when sorting on a single 

7033 column or label. 

7034 na_position : {'first', 'last'}, default 'last' 

7035 Puts NaNs at the beginning if `first`; `last` puts NaNs at the 

7036 end. 

7037 ignore_index : bool, default False 

7038 If True, the resulting axis will be labeled 0, 1, …, n - 1. 

7039 key : callable, optional 

7040 Apply the key function to the values 

7041 before sorting. This is similar to the `key` argument in the 

7042 builtin :meth:`sorted` function, with the notable difference that 

7043 this `key` function should be *vectorized*. It should expect a 

7044 ``Series`` and return a Series with the same shape as the input. 

7045 It will be applied to each column in `by` independently. 

7046 

7047 Returns 

7048 ------- 

7049 DataFrame or None 

7050 DataFrame with sorted values or None if ``inplace=True``. 

7051 

7052 See Also 

7053 -------- 

7054 DataFrame.sort_index : Sort a DataFrame by the index. 

7055 Series.sort_values : Similar method for a Series. 

7056 

7057 Examples 

7058 -------- 

7059 >>> df = pd.DataFrame({ 

7060 ... 'col1': ['A', 'A', 'B', np.nan, 'D', 'C'], 

7061 ... 'col2': [2, 1, 9, 8, 7, 4], 

7062 ... 'col3': [0, 1, 9, 4, 2, 3], 

7063 ... 'col4': ['a', 'B', 'c', 'D', 'e', 'F'] 

7064 ... }) 

7065 >>> df 

7066 col1 col2 col3 col4 

7067 0 A 2 0 a 

7068 1 A 1 1 B 

7069 2 B 9 9 c 

7070 3 NaN 8 4 D 

7071 4 D 7 2 e 

7072 5 C 4 3 F 

7073 

7074 Sort by col1 

7075 

7076 >>> df.sort_values(by=['col1']) 

7077 col1 col2 col3 col4 

7078 0 A 2 0 a 

7079 1 A 1 1 B 

7080 2 B 9 9 c 

7081 5 C 4 3 F 

7082 4 D 7 2 e 

7083 3 NaN 8 4 D 

7084 

7085 Sort by multiple columns 

7086 

7087 >>> df.sort_values(by=['col1', 'col2']) 

7088 col1 col2 col3 col4 

7089 1 A 1 1 B 

7090 0 A 2 0 a 

7091 2 B 9 9 c 

7092 5 C 4 3 F 

7093 4 D 7 2 e 

7094 3 NaN 8 4 D 

7095 

7096 Sort Descending 

7097 

7098 >>> df.sort_values(by='col1', ascending=False) 

7099 col1 col2 col3 col4 

7100 4 D 7 2 e 

7101 5 C 4 3 F 

7102 2 B 9 9 c 

7103 0 A 2 0 a 

7104 1 A 1 1 B 

7105 3 NaN 8 4 D 

7106 

7107 Putting NAs first 

7108 

7109 >>> df.sort_values(by='col1', ascending=False, na_position='first') 

7110 col1 col2 col3 col4 

7111 3 NaN 8 4 D 

7112 4 D 7 2 e 

7113 5 C 4 3 F 

7114 2 B 9 9 c 

7115 0 A 2 0 a 

7116 1 A 1 1 B 

7117 

7118 Sorting with a key function 

7119 

7120 >>> df.sort_values(by='col4', key=lambda col: col.str.lower()) 

7121 col1 col2 col3 col4 

7122 0 A 2 0 a 

7123 1 A 1 1 B 

7124 2 B 9 9 c 

7125 3 NaN 8 4 D 

7126 4 D 7 2 e 

7127 5 C 4 3 F 

7128 

7129 Natural sort with the key argument, 

7130 using the `natsort <https://github.com/SethMMorton/natsort>`__ package. 

7131 

7132 >>> df = pd.DataFrame({ 

7133 ... "time": ['0hr', '128hr', '72hr', '48hr', '96hr'], 

7134 ... "value": [10, 20, 30, 40, 50] 

7135 ... }) 

7136 >>> df 

7137 time value 

7138 0 0hr 10 

7139 1 128hr 20 

7140 2 72hr 30 

7141 3 48hr 40 

7142 4 96hr 50 

7143 >>> from natsort import index_natsorted 

7144 >>> df.sort_values( 

7145 ... by="time", 

7146 ... key=lambda x: np.argsort(index_natsorted(df["time"])) 

7147 ... ) 

7148 time value 

7149 0 0hr 10 

7150 3 48hr 40 

7151 2 72hr 30 

7152 4 96hr 50 

7153 1 128hr 20 

7154 """ 

7155 inplace = validate_bool_kwarg(inplace, "inplace") 

7156 axis = self._get_axis_number(axis) 

7157 ascending = validate_ascending(ascending) 

7158 if not isinstance(by, list): 

7159 by = [by] 

7160 # error: Argument 1 to "len" has incompatible type "Union[bool, List[bool]]"; 

7161 # expected "Sized" 

7162 if is_sequence(ascending) and ( 

7163 len(by) != len(ascending) # type: ignore[arg-type] 

7164 ): 

7165 # error: Argument 1 to "len" has incompatible type "Union[bool, 

7166 # List[bool]]"; expected "Sized" 

7167 raise ValueError( 

7168 f"Length of ascending ({len(ascending)})" # type: ignore[arg-type] 

7169 f" != length of by ({len(by)})" 

7170 ) 

7171 if len(by) > 1: 

7172 keys = [self._get_label_or_level_values(x, axis=axis) for x in by] 

7173 

7174 # need to rewrap columns in Series to apply key function 

7175 if key is not None: 

7176 # error: List comprehension has incompatible type List[Series]; 

7177 # expected List[ndarray] 

7178 keys = [ 

7179 Series(k, name=name) # type: ignore[misc] 

7180 for (k, name) in zip(keys, by) 

7181 ] 

7182 

7183 indexer = lexsort_indexer( 

7184 keys, orders=ascending, na_position=na_position, key=key 

7185 ) 

7186 elif len(by): 

7187 # len(by) == 1 

7188 

7189 k = self._get_label_or_level_values(by[0], axis=axis) 

7190 

7191 # need to rewrap column in Series to apply key function 

7192 if key is not None: 

7193 # error: Incompatible types in assignment (expression has type 

7194 # "Series", variable has type "ndarray") 

7195 k = Series(k, name=by[0]) # type: ignore[assignment] 

7196 

7197 if isinstance(ascending, (tuple, list)): 

7198 ascending = ascending[0] 

7199 

7200 indexer = nargsort( 

7201 k, kind=kind, ascending=ascending, na_position=na_position, key=key 

7202 ) 

7203 else: 

7204 if inplace: 

7205 return self._update_inplace(self) 

7206 else: 

7207 return self.copy(deep=None) 

7208 

7209 if is_range_indexer(indexer, len(indexer)): 

7210 result = self.copy(deep=(not inplace and not using_copy_on_write())) 

7211 if ignore_index: 

7212 result.index = default_index(len(result)) 

7213 

7214 if inplace: 

7215 return self._update_inplace(result) 

7216 else: 

7217 return result 

7218 

7219 new_data = self._mgr.take( 

7220 indexer, axis=self._get_block_manager_axis(axis), verify=False 

7221 ) 

7222 

7223 if ignore_index: 

7224 new_data.set_axis( 

7225 self._get_block_manager_axis(axis), default_index(len(indexer)) 

7226 ) 

7227 

7228 result = self._constructor_from_mgr(new_data, axes=new_data.axes) 

7229 if inplace: 

7230 return self._update_inplace(result) 

7231 else: 

7232 return result.__finalize__(self, method="sort_values") 

7233 
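# --- Illustrative sketch, not part of frame.py -------------------------------
# Mixing sort directions per key by passing a list to ``ascending``; the list
# must have the same length as ``by`` (the length check enforced above).
import pandas as pd

df = pd.DataFrame({"col1": ["A", "A", "B", "B"], "col2": [2, 1, 9, 8]})
out = df.sort_values(by=["col1", "col2"], ascending=[True, False])
print(out["col2"].tolist())  # [2, 1, 9, 8]: col1 ascending, col2 descending within each group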

7234 @overload 

7235 def sort_index( 

7236 self, 

7237 *, 

7238 axis: Axis = ..., 

7239 level: IndexLabel = ..., 

7240 ascending: bool | Sequence[bool] = ..., 

7241 inplace: Literal[True], 

7242 kind: SortKind = ..., 

7243 na_position: NaPosition = ..., 

7244 sort_remaining: bool = ..., 

7245 ignore_index: bool = ..., 

7246 key: IndexKeyFunc = ..., 

7247 ) -> None: 

7248 ... 

7249 

7250 @overload 

7251 def sort_index( 

7252 self, 

7253 *, 

7254 axis: Axis = ..., 

7255 level: IndexLabel = ..., 

7256 ascending: bool | Sequence[bool] = ..., 

7257 inplace: Literal[False] = ..., 

7258 kind: SortKind = ..., 

7259 na_position: NaPosition = ..., 

7260 sort_remaining: bool = ..., 

7261 ignore_index: bool = ..., 

7262 key: IndexKeyFunc = ..., 

7263 ) -> DataFrame: 

7264 ... 

7265 

7266 @overload 

7267 def sort_index( 

7268 self, 

7269 *, 

7270 axis: Axis = ..., 

7271 level: IndexLabel = ..., 

7272 ascending: bool | Sequence[bool] = ..., 

7273 inplace: bool = ..., 

7274 kind: SortKind = ..., 

7275 na_position: NaPosition = ..., 

7276 sort_remaining: bool = ..., 

7277 ignore_index: bool = ..., 

7278 key: IndexKeyFunc = ..., 

7279 ) -> DataFrame | None: 

7280 ... 

7281 

7282 def sort_index( 

7283 self, 

7284 *, 

7285 axis: Axis = 0, 

7286 level: IndexLabel | None = None, 

7287 ascending: bool | Sequence[bool] = True, 

7288 inplace: bool = False, 

7289 kind: SortKind = "quicksort", 

7290 na_position: NaPosition = "last", 

7291 sort_remaining: bool = True, 

7292 ignore_index: bool = False, 

7293 key: IndexKeyFunc | None = None, 

7294 ) -> DataFrame | None: 

7295 """ 

7296 Sort object by labels (along an axis). 

7297 

7298 Returns a new DataFrame sorted by label if `inplace` argument is 

7299 ``False``, otherwise updates the original DataFrame and returns None. 

7300 

7301 Parameters 

7302 ---------- 

7303 axis : {0 or 'index', 1 or 'columns'}, default 0 

7304 The axis along which to sort. The value 0 identifies the rows, 

7305 and 1 identifies the columns. 

7306 level : int or level name or list of ints or list of level names 

7307 If not None, sort on values in specified index level(s). 

7308 ascending : bool or list-like of bools, default True 

7309 Sort ascending vs. descending. When the index is a MultiIndex the 

7310 sort direction can be controlled for each level individually. 

7311 inplace : bool, default False 

7312 Whether to modify the DataFrame rather than creating a new one. 

7313 kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort' 

7314 Choice of sorting algorithm. See also :func:`numpy.sort` for more 

7315 information. `mergesort` and `stable` are the only stable algorithms. For 

7316 DataFrames, this option is only applied when sorting on a single 

7317 column or label. 

7318 na_position : {'first', 'last'}, default 'last' 

7319 Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. 

7320 Not implemented for MultiIndex. 

7321 sort_remaining : bool, default True 

7322 If True and sorting by level and the index is a MultiIndex, sort by other 

7323 levels too (in order) after sorting by the specified level. 

7324 ignore_index : bool, default False 

7325 If True, the resulting axis will be labeled 0, 1, …, n - 1. 

7326 key : callable, optional 

7327 If not None, apply the key function to the index values 

7328 before sorting. This is similar to the `key` argument in the 

7329 builtin :meth:`sorted` function, with the notable difference that 

7330 this `key` function should be *vectorized*. It should expect an 

7331 ``Index`` and return an ``Index`` of the same shape. For MultiIndex 

7332 inputs, the key is applied *per level*. 

7333 

7334 Returns 

7335 ------- 

7336 DataFrame or None 

7337 The original DataFrame sorted by the labels or None if ``inplace=True``. 

7338 

7339 See Also 

7340 -------- 

7341 Series.sort_index : Sort Series by the index. 

7342 DataFrame.sort_values : Sort DataFrame by the value. 

7343 Series.sort_values : Sort Series by the value. 

7344 

7345 Examples 

7346 -------- 

7347 >>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150], 

7348 ... columns=['A']) 

7349 >>> df.sort_index() 

7350 A 

7351 1 4 

7352 29 2 

7353 100 1 

7354 150 5 

7355 234 3 

7356 

7357 By default, it sorts in ascending order, to sort in descending order, 

7358 use ``ascending=False`` 

7359 

7360 >>> df.sort_index(ascending=False) 

7361 A 

7362 234 3 

7363 150 5 

7364 100 1 

7365 29 2 

7366 1 4 

7367 

7368 A key function can be specified which is applied to the index before 

7369 sorting. For a ``MultiIndex`` this is applied to each level separately. 

7370 

7371 >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd']) 

7372 >>> df.sort_index(key=lambda x: x.str.lower()) 

7373 a 

7374 A 1 

7375 b 2 

7376 C 3 

7377 d 4 

7378 """ 

7379 return super().sort_index( 

7380 axis=axis, 

7381 level=level, 

7382 ascending=ascending, 

7383 inplace=inplace, 

7384 kind=kind, 

7385 na_position=na_position, 

7386 sort_remaining=sort_remaining, 

7387 ignore_index=ignore_index, 

7388 key=key, 

7389 ) 

7390 
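# --- Illustrative sketch, not part of frame.py -------------------------------
# Sorting a MultiIndex frame by a single level: with sort_remaining=False only
# the requested level determines the order, while sort_remaining=True (the
# default) also sorts the remaining levels. Uses only public pandas API.
import pandas as pd

idx = pd.MultiIndex.from_tuples([("b", 2), ("a", 2), ("a", 1)],
                                names=["outer", "inner"])
df = pd.DataFrame({"v": [1, 2, 3]}, index=idx)

only_outer = df.sort_index(level="outer", sort_remaining=False)
print(only_outer.index.get_level_values("outer").tolist())  # ['a', 'a', 'b']

both = df.sort_index(level="outer", sort_remaining=True)
print(both.index.tolist())  # [('a', 1), ('a', 2), ('b', 2)]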

7391 def value_counts( 

7392 self, 

7393 subset: IndexLabel | None = None, 

7394 normalize: bool = False, 

7395 sort: bool = True, 

7396 ascending: bool = False, 

7397 dropna: bool = True, 

7398 ) -> Series: 

7399 """ 

7400 Return a Series containing the frequency of each distinct row in the Dataframe. 

7401 

7402 Parameters 

7403 ---------- 

7404 subset : label or list of labels, optional 

7405 Columns to use when counting unique combinations. 

7406 normalize : bool, default False 

7407 Return proportions rather than frequencies. 

7408 sort : bool, default True 

7409 Sort by frequencies when True. Sort by DataFrame column values when False. 

7410 ascending : bool, default False 

7411 Sort in ascending order. 

7412 dropna : bool, default True 

7413 Don't include counts of rows that contain NA values. 

7414 

7415 .. versionadded:: 1.3.0 

7416 

7417 Returns 

7418 ------- 

7419 Series 

7420 

7421 See Also 

7422 -------- 

7423 Series.value_counts: Equivalent method on Series. 

7424 

7425 Notes 

7426 ----- 

7427 The returned Series will have a MultiIndex with one level per input 

7428 column but an Index (non-multi) for a single label. By default, rows 

7429 that contain any NA values are omitted from the result. By default, 

7430 the resulting Series will be in descending order so that the first 

7431 element is the most frequently-occurring row. 

7432 

7433 Examples 

7434 -------- 

7435 >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6], 

7436 ... 'num_wings': [2, 0, 0, 0]}, 

7437 ... index=['falcon', 'dog', 'cat', 'ant']) 

7438 >>> df 

7439 num_legs num_wings 

7440 falcon 2 2 

7441 dog 4 0 

7442 cat 4 0 

7443 ant 6 0 

7444 

7445 >>> df.value_counts() 

7446 num_legs num_wings 

7447 4 0 2 

7448 2 2 1 

7449 6 0 1 

7450 Name: count, dtype: int64 

7451 

7452 >>> df.value_counts(sort=False) 

7453 num_legs num_wings 

7454 2 2 1 

7455 4 0 2 

7456 6 0 1 

7457 Name: count, dtype: int64 

7458 

7459 >>> df.value_counts(ascending=True) 

7460 num_legs num_wings 

7461 2 2 1 

7462 6 0 1 

7463 4 0 2 

7464 Name: count, dtype: int64 

7465 

7466 >>> df.value_counts(normalize=True) 

7467 num_legs num_wings 

7468 4 0 0.50 

7469 2 2 0.25 

7470 6 0 0.25 

7471 Name: proportion, dtype: float64 

7472 

7473 With `dropna` set to `False` we can also count rows with NA values. 

7474 

7475 >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'], 

7476 ... 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']}) 

7477 >>> df 

7478 first_name middle_name 

7479 0 John Smith 

7480 1 Anne <NA> 

7481 2 John <NA> 

7482 3 Beth Louise 

7483 

7484 >>> df.value_counts() 

7485 first_name middle_name 

7486 Beth Louise 1 

7487 John Smith 1 

7488 Name: count, dtype: int64 

7489 

7490 >>> df.value_counts(dropna=False) 

7491 first_name middle_name 

7492 Anne NaN 1 

7493 Beth Louise 1 

7494 John Smith 1 

7495 NaN 1 

7496 Name: count, dtype: int64 

7497 

7498 >>> df.value_counts("first_name") 

7499 first_name 

7500 John 2 

7501 Anne 1 

7502 Beth 1 

7503 Name: count, dtype: int64 

7504 """ 

7505 if subset is None: 

7506 subset = self.columns.tolist() 

7507 

7508 name = "proportion" if normalize else "count" 

7509 counts = self.groupby(subset, dropna=dropna, observed=False)._grouper.size() 

7510 counts.name = name 

7511 

7512 if sort: 

7513 counts = counts.sort_values(ascending=ascending) 

7514 if normalize: 

7515 counts /= counts.sum() 

7516 

7517 # Force MultiIndex for a list_like subset with a single column 

7518 if is_list_like(subset) and len(subset) == 1: # type: ignore[arg-type] 

7519 counts.index = MultiIndex.from_arrays( 

7520 [counts.index], names=[counts.index.name] 

7521 ) 

7522 

7523 return counts 

7524 
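# --- Illustrative sketch, not part of frame.py -------------------------------
# DataFrame.value_counts is roughly the groupby-size, sort, normalize chain
# implemented above; a hedged public-API approximation on a tiny frame
# (dropna/observed handling omitted here).
import pandas as pd

df = pd.DataFrame({"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]})

counts = df.groupby(["num_legs", "num_wings"]).size().sort_values(ascending=False)
proportions = counts / counts.sum()

print(counts.iloc[0])              # 2 -> the (4, 0) combination occurs twice
print(float(proportions.iloc[0]))  # 0.5
print(df.value_counts().iloc[0])   # 2, same leading count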

7525 def nlargest( 

7526 self, n: int, columns: IndexLabel, keep: NsmallestNlargestKeep = "first" 

7527 ) -> DataFrame: 

7528 """ 

7529 Return the first `n` rows ordered by `columns` in descending order. 

7530 

7531 Return the first `n` rows with the largest values in `columns`, in 

7532 descending order. The columns that are not specified are returned as 

7533 well, but not used for ordering. 

7534 

7535 This method is equivalent to 

7536 ``df.sort_values(columns, ascending=False).head(n)``, but more 

7537 performant. 

7538 

7539 Parameters 

7540 ---------- 

7541 n : int 

7542 Number of rows to return. 

7543 columns : label or list of labels 

7544 Column label(s) to order by. 

7545 keep : {'first', 'last', 'all'}, default 'first' 

7546 Where there are duplicate values: 

7547 

7548 - ``first`` : prioritize the first occurrence(s) 

7549 - ``last`` : prioritize the last occurrence(s) 

7550 - ``all`` : keep all the ties of the smallest item even if it means 

7551 selecting more than ``n`` items. 

7552 

7553 Returns 

7554 ------- 

7555 DataFrame 

7556 The first `n` rows ordered by the given columns in descending 

7557 order. 

7558 

7559 See Also 

7560 -------- 

7561 DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in 

7562 ascending order. 

7563 DataFrame.sort_values : Sort DataFrame by the values. 

7564 DataFrame.head : Return the first `n` rows without re-ordering. 

7565 

7566 Notes 

7567 ----- 

7568 This function cannot be used with all column types. For example, when 

7569 specifying columns with `object` or `category` dtypes, ``TypeError`` is 

7570 raised. 

7571 

7572 Examples 

7573 -------- 

7574 >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, 

7575 ... 434000, 434000, 337000, 11300, 

7576 ... 11300, 11300], 

7577 ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, 

7578 ... 17036, 182, 38, 311], 

7579 ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", 

7580 ... "IS", "NR", "TV", "AI"]}, 

7581 ... index=["Italy", "France", "Malta", 

7582 ... "Maldives", "Brunei", "Iceland", 

7583 ... "Nauru", "Tuvalu", "Anguilla"]) 

7584 >>> df 

7585 population GDP alpha-2 

7586 Italy 59000000 1937894 IT 

7587 France 65000000 2583560 FR 

7588 Malta 434000 12011 MT 

7589 Maldives 434000 4520 MV 

7590 Brunei 434000 12128 BN 

7591 Iceland 337000 17036 IS 

7592 Nauru 11300 182 NR 

7593 Tuvalu 11300 38 TV 

7594 Anguilla 11300 311 AI 

7595 

7596 In the following example, we will use ``nlargest`` to select the three 

7597 rows having the largest values in column "population". 

7598 

7599 >>> df.nlargest(3, 'population') 

7600 population GDP alpha-2 

7601 France 65000000 2583560 FR 

7602 Italy 59000000 1937894 IT 

7603 Malta 434000 12011 MT 

7604 

7605 When using ``keep='last'``, ties are resolved in reverse order: 

7606 

7607 >>> df.nlargest(3, 'population', keep='last') 

7608 population GDP alpha-2 

7609 France 65000000 2583560 FR 

7610 Italy 59000000 1937894 IT 

7611 Brunei 434000 12128 BN 

7612 

7613 When using ``keep='all'``, the number of elements kept can go beyond ``n`` 

7614 if there are duplicate values for the smallest element; all the 

7615 ties are kept: 

7616 

7617 >>> df.nlargest(3, 'population', keep='all') 

7618 population GDP alpha-2 

7619 France 65000000 2583560 FR 

7620 Italy 59000000 1937894 IT 

7621 Malta 434000 12011 MT 

7622 Maldives 434000 4520 MV 

7623 Brunei 434000 12128 BN 

7624 

7625 However, ``nlargest`` does not keep ``n`` distinct largest elements: 

7626 

7627 >>> df.nlargest(5, 'population', keep='all') 

7628 population GDP alpha-2 

7629 France 65000000 2583560 FR 

7630 Italy 59000000 1937894 IT 

7631 Malta 434000 12011 MT 

7632 Maldives 434000 4520 MV 

7633 Brunei 434000 12128 BN 

7634 

7635 To order by the largest values in column "population" and then "GDP", 

7636 we can specify multiple columns like in the next example. 

7637 

7638 >>> df.nlargest(3, ['population', 'GDP']) 

7639 population GDP alpha-2 

7640 France 65000000 2583560 FR 

7641 Italy 59000000 1937894 IT 

7642 Brunei 434000 12128 BN 

7643 """ 

7644 return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest() 

7645 
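# --- Illustrative sketch, not part of frame.py -------------------------------
# The sort-then-head equivalence stated in the docstring, checked on a tiny
# frame (tie ordering can differ in general, so the boundary here is unique).
import pandas as pd

df = pd.DataFrame({"population": [59, 65, 4, 4, 4], "GDP": [19, 25, 12, 4, 12]})

a = df.nlargest(2, "population")
b = df.sort_values("population", ascending=False).head(2)
print(a.equals(b))  # True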

7646 def nsmallest( 

7647 self, n: int, columns: IndexLabel, keep: NsmallestNlargestKeep = "first" 

7648 ) -> DataFrame: 

7649 """ 

7650 Return the first `n` rows ordered by `columns` in ascending order. 

7651 

7652 Return the first `n` rows with the smallest values in `columns`, in 

7653 ascending order. The columns that are not specified are returned as 

7654 well, but not used for ordering. 

7655 

7656 This method is equivalent to 

7657 ``df.sort_values(columns, ascending=True).head(n)``, but more 

7658 performant. 

7659 

7660 Parameters 

7661 ---------- 

7662 n : int 

7663 Number of items to retrieve. 

7664 columns : list or str 

7665 Column name or names to order by. 

7666 keep : {'first', 'last', 'all'}, default 'first' 

7667 Where there are duplicate values: 

7668 

7669 - ``first`` : take the first occurrence. 

7670 - ``last`` : take the last occurrence. 

7671 - ``all`` : keep all the ties of the largest item even if it means 

7672 selecting more than ``n`` items. 

7673 

7674 Returns 

7675 ------- 

7676 DataFrame 

7677 

7678 See Also 

7679 -------- 

7680 DataFrame.nlargest : Return the first `n` rows ordered by `columns` in 

7681 descending order. 

7682 DataFrame.sort_values : Sort DataFrame by the values. 

7683 DataFrame.head : Return the first `n` rows without re-ordering. 

7684 

7685 Examples 

7686 -------- 

7687 >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, 

7688 ... 434000, 434000, 337000, 337000, 

7689 ... 11300, 11300], 

7690 ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, 

7691 ... 17036, 182, 38, 311], 

7692 ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", 

7693 ... "IS", "NR", "TV", "AI"]}, 

7694 ... index=["Italy", "France", "Malta", 

7695 ... "Maldives", "Brunei", "Iceland", 

7696 ... "Nauru", "Tuvalu", "Anguilla"]) 

7697 >>> df 

7698 population GDP alpha-2 

7699 Italy 59000000 1937894 IT 

7700 France 65000000 2583560 FR 

7701 Malta 434000 12011 MT 

7702 Maldives 434000 4520 MV 

7703 Brunei 434000 12128 BN 

7704 Iceland 337000 17036 IS 

7705 Nauru 337000 182 NR 

7706 Tuvalu 11300 38 TV 

7707 Anguilla 11300 311 AI 

7708 

7709 In the following example, we will use ``nsmallest`` to select the 

7710 three rows having the smallest values in column "population". 

7711 

7712 >>> df.nsmallest(3, 'population') 

7713 population GDP alpha-2 

7714 Tuvalu 11300 38 TV 

7715 Anguilla 11300 311 AI 

7716 Iceland 337000 17036 IS 

7717 

7718 When using ``keep='last'``, ties are resolved in reverse order: 

7719 

7720 >>> df.nsmallest(3, 'population', keep='last') 

7721 population GDP alpha-2 

7722 Anguilla 11300 311 AI 

7723 Tuvalu 11300 38 TV 

7724 Nauru 337000 182 NR 

7725 

7726 When using ``keep='all'``, the number of elements kept can go beyond ``n`` 

7727 if there are duplicate values for the largest element; all the 

7728 ties are kept. 

7729 

7730 >>> df.nsmallest(3, 'population', keep='all') 

7731 population GDP alpha-2 

7732 Tuvalu 11300 38 TV 

7733 Anguilla 11300 311 AI 

7734 Iceland 337000 17036 IS 

7735 Nauru 337000 182 NR 

7736 

7737 However, ``nsmallest`` does not keep ``n`` distinct 

7738 smallest elements: 

7739 

7740 >>> df.nsmallest(4, 'population', keep='all') 

7741 population GDP alpha-2 

7742 Tuvalu 11300 38 TV 

7743 Anguilla 11300 311 AI 

7744 Iceland 337000 17036 IS 

7745 Nauru 337000 182 NR 

7746 

7747 To order by the smallest values in column "population" and then "GDP", we can 

7748 specify multiple columns like in the next example. 

7749 

7750 >>> df.nsmallest(3, ['population', 'GDP']) 

7751 population GDP alpha-2 

7752 Tuvalu 11300 38 TV 

7753 Anguilla 11300 311 AI 

7754 Nauru 337000 182 NR 

7755 """ 

7756 return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nsmallest() 

7757 

7758 @doc( 

7759 Series.swaplevel, 

7760 klass=_shared_doc_kwargs["klass"], 

7761 extra_params=dedent( 

7762 """axis : {0 or 'index', 1 or 'columns'}, default 0 

7763 The axis to swap levels on. 0 or 'index' for row-wise, 1 or 

7764 'columns' for column-wise.""" 

7765 ), 

7766 examples=dedent( 

7767 """\ 

7768 Examples 

7769 -------- 

7770 >>> df = pd.DataFrame( 

7771 ... {"Grade": ["A", "B", "A", "C"]}, 

7772 ... index=[ 

7773 ... ["Final exam", "Final exam", "Coursework", "Coursework"], 

7774 ... ["History", "Geography", "History", "Geography"], 

7775 ... ["January", "February", "March", "April"], 

7776 ... ], 

7777 ... ) 

7778 >>> df 

7779 Grade 

7780 Final exam History January A 

7781 Geography February B 

7782 Coursework History March A 

7783 Geography April C 

7784 

7785 In the following example, we will swap the levels of the indices. 

7786 Here, we will swap the levels row-wise, but levels can be swapped column-wise 

7787 in a similar manner. Note that row-wise (``axis=0``) is the default behaviour. 

7788 By not supplying any arguments for i and j, we swap the last and second to 

7789 last indices. 

7790 

7791 >>> df.swaplevel() 

7792 Grade 

7793 Final exam January History A 

7794 February Geography B 

7795 Coursework March History A 

7796 April Geography C 

7797 

7798 By supplying one argument, we can choose which index to swap the last 

7799 index with. We can for example swap the first index with the last one as 

7800 follows. 

7801 

7802 >>> df.swaplevel(0) 

7803 Grade 

7804 January History Final exam A 

7805 February Geography Final exam B 

7806 March History Coursework A 

7807 April Geography Coursework C 

7808 

7809 We can also define explicitly which indices we want to swap by supplying values 

7810 for both i and j. Here, we for example swap the first and second indices. 

7811 

7812 >>> df.swaplevel(0, 1) 

7813 Grade 

7814 History Final exam January A 

7815 Geography Final exam February B 

7816 History Coursework March A 

7817 Geography Coursework April C""" 

7818 ), 

7819 ) 

7820 def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: 

7821 result = self.copy(deep=None) 

7822 

7823 axis = self._get_axis_number(axis) 

7824 

7825 if not isinstance(result._get_axis(axis), MultiIndex): # pragma: no cover 

7826 raise TypeError("Can only swap levels on a hierarchical axis.") 

7827 

7828 if axis == 0: 

7829 assert isinstance(result.index, MultiIndex) 

7830 result.index = result.index.swaplevel(i, j) 

7831 else: 

7832 assert isinstance(result.columns, MultiIndex) 

7833 result.columns = result.columns.swaplevel(i, j) 

7834 return result 

7835 

7836 def reorder_levels(self, order: Sequence[int | str], axis: Axis = 0) -> DataFrame: 

7837 """ 

7838 Rearrange index levels using input order. May not drop or duplicate levels. 

7839 

7840 Parameters 

7841 ---------- 

7842 order : list of int or list of str 

7843 List representing new level order. Reference level by number 

7844 (position) or by key (label). 

7845 axis : {0 or 'index', 1 or 'columns'}, default 0 

7846 Where to reorder levels. 

7847 

7848 Returns 

7849 ------- 

7850 DataFrame 

7851 

7852 Examples 

7853 -------- 

7854 >>> data = { 

7855 ... "class": ["Mammals", "Mammals", "Reptiles"], 

7856 ... "diet": ["Omnivore", "Carnivore", "Carnivore"], 

7857 ... "species": ["Humans", "Dogs", "Snakes"], 

7858 ... } 

7859 >>> df = pd.DataFrame(data, columns=["class", "diet", "species"]) 

7860 >>> df = df.set_index(["class", "diet"]) 

7861 >>> df 

7862 species 

7863 class diet 

7864 Mammals Omnivore Humans 

7865 Carnivore Dogs 

7866 Reptiles Carnivore Snakes 

7867 

7868 Let's reorder the levels of the index: 

7869 

7870 >>> df.reorder_levels(["diet", "class"]) 

7871 species 

7872 diet class 

7873 Omnivore Mammals Humans 

7874 Carnivore Mammals Dogs 

7875 Reptiles Snakes 

7876 """ 

7877 axis = self._get_axis_number(axis) 

7878 if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover 

7879 raise TypeError("Can only reorder levels on a hierarchical axis.") 

7880 

7881 result = self.copy(deep=None) 

7882 

7883 if axis == 0: 

7884 assert isinstance(result.index, MultiIndex) 

7885 result.index = result.index.reorder_levels(order) 

7886 else: 

7887 assert isinstance(result.columns, MultiIndex) 

7888 result.columns = result.columns.reorder_levels(order) 

7889 return result 

7890 

7891 # ---------------------------------------------------------------------- 

7892 # Arithmetic Methods 

7893 

7894 def _cmp_method(self, other, op): 

7895 axis: Literal[1] = 1 # only relevant for Series other case 

7896 

7897 self, other = self._align_for_op(other, axis, flex=False, level=None) 

7898 

7899 # See GH#4537 for discussion of scalar op behavior 

7900 new_data = self._dispatch_frame_op(other, op, axis=axis) 

7901 return self._construct_result(new_data) 

7902 

7903 def _arith_method(self, other, op): 

7904 if self._should_reindex_frame_op(other, op, 1, None, None): 

7905 return self._arith_method_with_reindex(other, op) 

7906 

7907 axis: Literal[1] = 1 # only relevant for Series other case 

7908 other = ops.maybe_prepare_scalar_for_op(other, (self.shape[axis],)) 

7909 

7910 self, other = self._align_for_op(other, axis, flex=True, level=None) 

7911 

7912 with np.errstate(all="ignore"): 

7913 new_data = self._dispatch_frame_op(other, op, axis=axis) 

7914 return self._construct_result(new_data) 

7915 

7916 _logical_method = _arith_method 

7917 

7918 def _dispatch_frame_op( 

7919 self, right, func: Callable, axis: AxisInt | None = None 

7920 ) -> DataFrame: 

7921 """ 

7922 Evaluate the frame operation func(left, right) by evaluating 

7923 column-by-column, dispatching to the Series implementation. 

7924 

7925 Parameters 

7926 ---------- 

7927 right : scalar, Series, or DataFrame 

7928 func : arithmetic or comparison operator 

7929 axis : {None, 0, 1} 

7930 

7931 Returns 

7932 ------- 

7933 DataFrame 

7934 

7935 Notes 

7936 ----- 

7937 Caller is responsible for setting np.errstate where relevant. 

7938 """ 

7939 # Get the appropriate array-op to apply to each column/block's values. 

7940 array_op = ops.get_array_op(func) 

7941 

7942 right = lib.item_from_zerodim(right) 

7943 if not is_list_like(right): 

7944 # i.e. scalar, faster than checking np.ndim(right) == 0 

7945 bm = self._mgr.apply(array_op, right=right) 

7946 return self._constructor_from_mgr(bm, axes=bm.axes) 

7947 

7948 elif isinstance(right, DataFrame): 

7949 assert self.index.equals(right.index) 

7950 assert self.columns.equals(right.columns) 

7951 # TODO: The previous assertion `assert right._indexed_same(self)` 

7952 # fails in cases with empty columns reached via 

7953 # _frame_arith_method_with_reindex 

7954 

7955 # TODO operate_blockwise expects a manager of the same type 

7956 bm = self._mgr.operate_blockwise( 

7957 # error: Argument 1 to "operate_blockwise" of "ArrayManager" has 

7958 # incompatible type "Union[ArrayManager, BlockManager]"; expected 

7959 # "ArrayManager" 

7960 # error: Argument 1 to "operate_blockwise" of "BlockManager" has 

7961 # incompatible type "Union[ArrayManager, BlockManager]"; expected 

7962 # "BlockManager" 

7963 right._mgr, # type: ignore[arg-type] 

7964 array_op, 

7965 ) 

7966 return self._constructor_from_mgr(bm, axes=bm.axes) 

7967 

7968 elif isinstance(right, Series) and axis == 1: 

7969 # axis=1 means we want to operate row-by-row 

7970 assert right.index.equals(self.columns) 

7971 

7972 right = right._values 

7973 # maybe_align_as_frame ensures we do not have an ndarray here 

7974 assert not isinstance(right, np.ndarray) 

7975 

7976 arrays = [ 

7977 array_op(_left, _right) 

7978 for _left, _right in zip(self._iter_column_arrays(), right) 

7979 ] 

7980 

7981 elif isinstance(right, Series): 

7982 assert right.index.equals(self.index) 

7983 right = right._values 

7984 

7985 arrays = [array_op(left, right) for left in self._iter_column_arrays()] 

7986 

7987 else: 

7988 raise NotImplementedError(right) 

7989 

7990 return type(self)._from_arrays( 

7991 arrays, self.columns, self.index, verify_integrity=False 

7992 ) 

7993 
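# --- Illustrative sketch, not part of frame.py -------------------------------
# The DataFrame-with-Series (axis=1) branch above pairs each column with the
# matching Series element. A rough public-API equivalent of that dispatch:
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
s = pd.Series({"a": 10, "b": 100})

manual = pd.DataFrame({col: df[col] + s[col] for col in df.columns})
print(manual.equals(df.add(s, axis=1)))  # True: column-by-column dispatch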

7994 def _combine_frame(self, other: DataFrame, func, fill_value=None): 

7995 # at this point we have `self._indexed_same(other)` 

7996 

7997 if fill_value is None: 

7998 # since _arith_op may be called in a loop, avoid function call 

7999 # overhead if possible by doing this check once 

8000 _arith_op = func 

8001 

8002 else: 

8003 

8004 def _arith_op(left, right): 

8005 # for the mixed_type case where we iterate over columns, 

8006 # _arith_op(left, right) is equivalent to 

8007 # left._binop(right, func, fill_value=fill_value) 

8008 left, right = ops.fill_binop(left, right, fill_value) 

8009 return func(left, right) 

8010 

8011 new_data = self._dispatch_frame_op(other, _arith_op) 

8012 return new_data 

8013 
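# --- Illustrative sketch, not part of frame.py -------------------------------
# What fill_binop does for a flex op with fill_value: a missing value on one
# side is filled before applying the op, but positions missing on *both* sides
# stay NaN. Hedged public-API demonstration:
import numpy as np
import pandas as pd

a = pd.DataFrame({"x": [1.0, np.nan, np.nan]})
b = pd.DataFrame({"x": [10.0, 20.0, np.nan]})

print(a.add(b, fill_value=0)["x"].tolist())  # [11.0, 20.0, nan]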

8014 def _arith_method_with_reindex(self, right: DataFrame, op) -> DataFrame: 

8015 """ 

8016 For DataFrame-with-DataFrame operations that require reindexing, 

8017 operate only on shared columns, then reindex. 

8018 

8019 Parameters 

8020 ---------- 

8021 right : DataFrame 

8022 op : binary operator 

8023 

8024 Returns 

8025 ------- 

8026 DataFrame 

8027 """ 

8028 left = self 

8029 

8030 # GH#31623, only operate on shared columns 

8031 cols, lcols, rcols = left.columns.join( 

8032 right.columns, how="inner", level=None, return_indexers=True 

8033 ) 

8034 

8035 new_left = left.iloc[:, lcols] 

8036 new_right = right.iloc[:, rcols] 

8037 result = op(new_left, new_right) 

8038 

8039 # Do the join on the columns instead of using left._align_for_op 

8040 # to avoid constructing two potentially large/sparse DataFrames 

8041 join_columns, _, _ = left.columns.join( 

8042 right.columns, how="outer", level=None, return_indexers=True 

8043 ) 

8044 

8045 if result.columns.has_duplicates: 

8046 # Avoid reindexing with a duplicate axis. 

8047 # https://github.com/pandas-dev/pandas/issues/35194 

8048 indexer, _ = result.columns.get_indexer_non_unique(join_columns) 

8049 indexer = algorithms.unique1d(indexer) 

8050 result = result._reindex_with_indexers( 

8051 {1: [join_columns, indexer]}, allow_dups=True 

8052 ) 

8053 else: 

8054 result = result.reindex(join_columns, axis=1) 

8055 

8056 return result 

8057 
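# --- Illustrative sketch, not part of frame.py -------------------------------
# Observable effect of the shared-column path above (GH#31623): arithmetic
# between frames with different columns operates on the intersection and
# reindexes the result to the union, so non-shared columns come back all-NaN.
import pandas as pd

left = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
right = pd.DataFrame({"B": [10, 20], "C": [5, 6]})

out = left + right
print(sorted(out.columns))           # ['A', 'B', 'C']
print(out["A"].isna().all())         # True: 'A' exists only in `left`
print((out["B"] == [13, 24]).all())  # True: the shared column really is added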

8058 def _should_reindex_frame_op(self, right, op, axis: int, fill_value, level) -> bool: 

8059 """ 

8060 Check if this is an operation between DataFrames that will need to reindex. 

8061 """ 

8062 if op is operator.pow or op is roperator.rpow: 

8063 # GH#32685 pow has special semantics for operating with null values 

8064 return False 

8065 

8066 if not isinstance(right, DataFrame): 

8067 return False 

8068 

8069 if fill_value is None and level is None and axis == 1: 

8070 # TODO: any other cases we should handle here? 

8071 

8072 # Intersection is always unique so we have to check the unique columns 

8073 left_uniques = self.columns.unique() 

8074 right_uniques = right.columns.unique() 

8075 cols = left_uniques.intersection(right_uniques) 

8076 if len(cols) and not ( 

8077 len(cols) == len(left_uniques) and len(cols) == len(right_uniques) 

8078 ): 

8079 # TODO: is there a shortcut available when len(cols) == 0? 

8080 return True 

8081 

8082 return False 

8083 

8084 def _align_for_op( 

8085 self, 

8086 other, 

8087 axis: AxisInt, 

8088 flex: bool | None = False, 

8089 level: Level | None = None, 

8090 ): 

8091 """ 

8092 Convert rhs to meet lhs dims if input is list, tuple or np.ndarray. 

8093 

8094 Parameters 

8095 ---------- 

8096 left : DataFrame 

8097 right : Any 

8098 axis : int 

8099 flex : bool or None, default False 

8100 Whether this is a flex op, in which case we reindex. 

8101 None indicates not to check for alignment. 

8102 level : int or level name, default None 

8103 

8104 Returns 

8105 ------- 

8106 left : DataFrame 

8107 right : Any 

8108 """ 

8109 left, right = self, other 

8110 

8111 def to_series(right): 

8112 msg = ( 

8113 "Unable to coerce to Series, " 

8114 "length must be {req_len}: given {given_len}" 

8115 ) 

8116 

8117 # pass dtype to avoid doing inference, which would break consistency 

8118 # with Index/Series ops 

8119 dtype = None 

8120 if getattr(right, "dtype", None) == object: 

8121 # can't pass right.dtype unconditionally as that would break on e.g. 

8122 # datetime64[h] ndarray 

8123 dtype = object 

8124 

8125 if axis == 0: 

8126 if len(left.index) != len(right): 

8127 raise ValueError( 

8128 msg.format(req_len=len(left.index), given_len=len(right)) 

8129 ) 

8130 right = left._constructor_sliced(right, index=left.index, dtype=dtype) 

8131 else: 

8132 if len(left.columns) != len(right): 

8133 raise ValueError( 

8134 msg.format(req_len=len(left.columns), given_len=len(right)) 

8135 ) 

8136 right = left._constructor_sliced(right, index=left.columns, dtype=dtype) 

8137 return right 

8138 

8139 if isinstance(right, np.ndarray): 

8140 if right.ndim == 1: 

8141 right = to_series(right) 

8142 

8143 elif right.ndim == 2: 

8144 # We need to pass dtype=right.dtype to retain object dtype 

8145 # otherwise we lose consistency with Index and array ops 

8146 dtype = None 

8147 if right.dtype == object: 

8148 # can't pass right.dtype unconditionally as that would break on e.g. 

8149 # datetime64[h] ndarray 

8150 dtype = object 

8151 

8152 if right.shape == left.shape: 

8153 right = left._constructor( 

8154 right, index=left.index, columns=left.columns, dtype=dtype 

8155 ) 

8156 

8157 elif right.shape[0] == left.shape[0] and right.shape[1] == 1: 

8158 # Broadcast across columns 

8159 right = np.broadcast_to(right, left.shape) 

8160 right = left._constructor( 

8161 right, index=left.index, columns=left.columns, dtype=dtype 

8162 ) 

8163 

8164 elif right.shape[1] == left.shape[1] and right.shape[0] == 1: 

8165 # Broadcast along rows 

8166 right = to_series(right[0, :]) 

8167 

8168 else: 

8169 raise ValueError( 

8170 "Unable to coerce to DataFrame, shape " 

8171 f"must be {left.shape}: given {right.shape}" 

8172 ) 

8173 

8174 elif right.ndim > 2: 

8175 raise ValueError( 

8176 "Unable to coerce to Series/DataFrame, " 

8177 f"dimension must be <= 2: {right.shape}" 

8178 ) 

8179 

8180 elif is_list_like(right) and not isinstance(right, (Series, DataFrame)): 

8181 # GH#36702. Raise when attempting arithmetic with list of array-like. 

8182 if any(is_array_like(el) for el in right): 

8183 raise ValueError( 

8184 f"Unable to coerce list of {type(right[0])} to Series/DataFrame" 

8185 ) 

8186 # GH#17901 

8187 right = to_series(right) 

8188 

8189 if flex is not None and isinstance(right, DataFrame): 

8190 if not left._indexed_same(right): 

8191 if flex: 

8192 left, right = left.align( 

8193 right, join="outer", level=level, copy=False 

8194 ) 

8195 else: 

8196 raise ValueError( 

8197 "Can only compare identically-labeled (both index and columns) " 

8198 "DataFrame objects" 

8199 ) 

8200 elif isinstance(right, Series): 

8201 # axis=1 is default for DataFrame-with-Series op 

8202 axis = axis if axis is not None else 1 

8203 if not flex: 

8204 if not left.axes[axis].equals(right.index): 

8205 raise ValueError( 

8206 "Operands are not aligned. Do " 

8207 "`left, right = left.align(right, axis=1, copy=False)` " 

8208 "before operating." 

8209 ) 

8210 

8211 left, right = left.align( 

8212 right, 

8213 join="outer", 

8214 axis=axis, 

8215 level=level, 

8216 copy=False, 

8217 ) 

8218 right = left._maybe_align_series_as_frame(right, axis) 

8219 

8220 return left, right 

8221 
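# --- Illustrative sketch, not part of frame.py -------------------------------
# The flex flag above in practice: strict comparison operators require
# identically-labeled frames, while the flex methods align on the union first.
import pandas as pd

a = pd.DataFrame({"x": [1, 2]}, index=[0, 1])
b = pd.DataFrame({"x": [1, 2]}, index=[1, 2])

try:
    _ = a == b                    # flex=False path -> raises
except ValueError as err:
    print(type(err).__name__)     # ValueError

print(a.eq(b)["x"].tolist())      # [False, False, False] after outer alignment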

8222 def _maybe_align_series_as_frame(self, series: Series, axis: AxisInt): 

8223 """ 

8224 If the Series operand is not EA-dtype, we can broadcast to 2D and operate 

8225 blockwise. 

8226 """ 

8227 rvalues = series._values 

8228 if not isinstance(rvalues, np.ndarray): 

8229 # TODO(EA2D): no need to special-case with 2D EAs 

8230 if rvalues.dtype in ("datetime64[ns]", "timedelta64[ns]"): 

8231 # We can losslessly+cheaply cast to ndarray 

8232 rvalues = np.asarray(rvalues) 

8233 else: 

8234 return series 

8235 

8236 if axis == 0: 

8237 rvalues = rvalues.reshape(-1, 1) 

8238 else: 

8239 rvalues = rvalues.reshape(1, -1) 

8240 

8241 rvalues = np.broadcast_to(rvalues, self.shape) 

8242 # pass dtype to avoid doing inference 

8243 return self._constructor( 

8244 rvalues, 

8245 index=self.index, 

8246 columns=self.columns, 

8247 dtype=rvalues.dtype, 

8248 ) 

8249 
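# --- Illustrative sketch, not part of frame.py -------------------------------
# The broadcasting step above, shown with plain numpy: a 1-D array of
# per-column values is reshaped to (1, n) and broadcast to the frame's shape so
# the operation can run blockwise.
import numpy as np

frame_values = np.arange(6).reshape(3, 2)   # pretend block values, shape (3, 2)
row_op = np.array([10, 100])                # one value per column (axis != 0 case)

broadcast = np.broadcast_to(row_op.reshape(1, -1), frame_values.shape)
print((frame_values + broadcast).tolist())  # [[10, 101], [12, 103], [14, 105]]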

8250 def _flex_arith_method( 

8251 self, other, op, *, axis: Axis = "columns", level=None, fill_value=None 

8252 ): 

8253 axis = self._get_axis_number(axis) if axis is not None else 1 

8254 

8255 if self._should_reindex_frame_op(other, op, axis, fill_value, level): 

8256 return self._arith_method_with_reindex(other, op) 

8257 

8258 if isinstance(other, Series) and fill_value is not None: 

8259 # TODO: We could allow this in cases where we end up going 

8260 # through the DataFrame path 

8261 raise NotImplementedError(f"fill_value {fill_value} not supported.") 

8262 

8263 other = ops.maybe_prepare_scalar_for_op(other, self.shape) 

8264 self, other = self._align_for_op(other, axis, flex=True, level=level) 

8265 

8266 with np.errstate(all="ignore"): 

8267 if isinstance(other, DataFrame): 

8268 # Another DataFrame 

8269 new_data = self._combine_frame(other, op, fill_value) 

8270 

8271 elif isinstance(other, Series): 

8272 new_data = self._dispatch_frame_op(other, op, axis=axis) 

8273 else: 

8274 # in this case we always have `np.ndim(other) == 0` 

8275 if fill_value is not None: 

8276 self = self.fillna(fill_value) 

8277 

8278 new_data = self._dispatch_frame_op(other, op) 

8279 

8280 return self._construct_result(new_data) 

8281 

8282 def _construct_result(self, result) -> DataFrame: 

8283 """ 

8284 Wrap the result of an arithmetic, comparison, or logical operation. 

8285 

8286 Parameters 

8287 ---------- 

8288 result : DataFrame 

8289 

8290 Returns 

8291 ------- 

8292 DataFrame 

8293 """ 

8294 out = self._constructor(result, copy=False).__finalize__(self) 

8295 # Pin columns instead of passing to constructor for compat with 

8296 # non-unique columns case 

8297 out.columns = self.columns 

8298 out.index = self.index 

8299 return out 

8300 

8301 def __divmod__(self, other) -> tuple[DataFrame, DataFrame]: 

8302 # Naive implementation, room for optimization 

8303 div = self // other 

8304 mod = self - div * other 

8305 return div, mod 

8306 

8307 def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]: 

8308 # Naive implementation, room for optimization 

8309 div = other // self 

8310 mod = other - div * self 

8311 return div, mod 

8312 
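# --- Illustrative sketch, not part of frame.py -------------------------------
# The naive __divmod__ above relies on the identity
#   self == (self // other) * other + (self % other)
# which the returned pair satisfies:
import pandas as pd

df = pd.DataFrame({"a": [7, -7, 10], "b": [9, 3, 5]})
div, mod = divmod(df, 4)
print((div * 4 + mod).equals(df))  # True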

8313 def _flex_cmp_method(self, other, op, *, axis: Axis = "columns", level=None): 

8314 axis = self._get_axis_number(axis) if axis is not None else 1 

8315 

8316 self, other = self._align_for_op(other, axis, flex=True, level=level) 

8317 

8318 new_data = self._dispatch_frame_op(other, op, axis=axis) 

8319 return self._construct_result(new_data) 

8320 

8321 @Appender(ops.make_flex_doc("eq", "dataframe")) 

8322 def eq(self, other, axis: Axis = "columns", level=None) -> DataFrame: 

8323 return self._flex_cmp_method(other, operator.eq, axis=axis, level=level) 

8324 

8325 @Appender(ops.make_flex_doc("ne", "dataframe")) 

8326 def ne(self, other, axis: Axis = "columns", level=None) -> DataFrame: 

8327 return self._flex_cmp_method(other, operator.ne, axis=axis, level=level) 

8328 

8329 @Appender(ops.make_flex_doc("le", "dataframe")) 

8330 def le(self, other, axis: Axis = "columns", level=None) -> DataFrame: 

8331 return self._flex_cmp_method(other, operator.le, axis=axis, level=level) 

8332 

8333 @Appender(ops.make_flex_doc("lt", "dataframe")) 

8334 def lt(self, other, axis: Axis = "columns", level=None) -> DataFrame: 

8335 return self._flex_cmp_method(other, operator.lt, axis=axis, level=level) 

8336 

8337 @Appender(ops.make_flex_doc("ge", "dataframe")) 

8338 def ge(self, other, axis: Axis = "columns", level=None) -> DataFrame: 

8339 return self._flex_cmp_method(other, operator.ge, axis=axis, level=level) 

8340 

8341 @Appender(ops.make_flex_doc("gt", "dataframe")) 

8342 def gt(self, other, axis: Axis = "columns", level=None) -> DataFrame: 

8343 return self._flex_cmp_method(other, operator.gt, axis=axis, level=level) 

8344 

8345 @Appender(ops.make_flex_doc("add", "dataframe")) 

8346 def add( 

8347 self, other, axis: Axis = "columns", level=None, fill_value=None 

8348 ) -> DataFrame: 

8349 return self._flex_arith_method( 

8350 other, operator.add, level=level, fill_value=fill_value, axis=axis 

8351 ) 

8352 

8353 @Appender(ops.make_flex_doc("radd", "dataframe")) 

8354 def radd( 

8355 self, other, axis: Axis = "columns", level=None, fill_value=None 

8356 ) -> DataFrame: 

8357 return self._flex_arith_method( 

8358 other, roperator.radd, level=level, fill_value=fill_value, axis=axis 

8359 ) 

8360 

8361 @Appender(ops.make_flex_doc("sub", "dataframe")) 

8362 def sub( 

8363 self, other, axis: Axis = "columns", level=None, fill_value=None 

8364 ) -> DataFrame: 

8365 return self._flex_arith_method( 

8366 other, operator.sub, level=level, fill_value=fill_value, axis=axis 

8367 ) 

8368 

8369 subtract = sub 

8370 

8371 @Appender(ops.make_flex_doc("rsub", "dataframe")) 

8372 def rsub( 

8373 self, other, axis: Axis = "columns", level=None, fill_value=None 

8374 ) -> DataFrame: 

8375 return self._flex_arith_method( 

8376 other, roperator.rsub, level=level, fill_value=fill_value, axis=axis 

8377 ) 

8378 

8379 @Appender(ops.make_flex_doc("mul", "dataframe")) 

8380 def mul( 

8381 self, other, axis: Axis = "columns", level=None, fill_value=None 

8382 ) -> DataFrame: 

8383 return self._flex_arith_method( 

8384 other, operator.mul, level=level, fill_value=fill_value, axis=axis 

8385 ) 

8386 

8387 multiply = mul 

8388 

8389 @Appender(ops.make_flex_doc("rmul", "dataframe")) 

8390 def rmul( 

8391 self, other, axis: Axis = "columns", level=None, fill_value=None 

8392 ) -> DataFrame: 

8393 return self._flex_arith_method( 

8394 other, roperator.rmul, level=level, fill_value=fill_value, axis=axis 

8395 ) 

8396 

8397 @Appender(ops.make_flex_doc("truediv", "dataframe")) 

8398 def truediv( 

8399 self, other, axis: Axis = "columns", level=None, fill_value=None 

8400 ) -> DataFrame: 

8401 return self._flex_arith_method( 

8402 other, operator.truediv, level=level, fill_value=fill_value, axis=axis 

8403 ) 

8404 

8405 div = truediv 

8406 divide = truediv 

8407 

8408 @Appender(ops.make_flex_doc("rtruediv", "dataframe")) 

8409 def rtruediv( 

8410 self, other, axis: Axis = "columns", level=None, fill_value=None 

8411 ) -> DataFrame: 

8412 return self._flex_arith_method( 

8413 other, roperator.rtruediv, level=level, fill_value=fill_value, axis=axis 

8414 ) 

8415 

8416 rdiv = rtruediv 

8417 

8418 @Appender(ops.make_flex_doc("floordiv", "dataframe")) 

8419 def floordiv( 

8420 self, other, axis: Axis = "columns", level=None, fill_value=None 

8421 ) -> DataFrame: 

8422 return self._flex_arith_method( 

8423 other, operator.floordiv, level=level, fill_value=fill_value, axis=axis 

8424 ) 

8425 

8426 @Appender(ops.make_flex_doc("rfloordiv", "dataframe")) 

8427 def rfloordiv( 

8428 self, other, axis: Axis = "columns", level=None, fill_value=None 

8429 ) -> DataFrame: 

8430 return self._flex_arith_method( 

8431 other, roperator.rfloordiv, level=level, fill_value=fill_value, axis=axis 

8432 ) 

8433 

8434 @Appender(ops.make_flex_doc("mod", "dataframe")) 

8435 def mod( 

8436 self, other, axis: Axis = "columns", level=None, fill_value=None 

8437 ) -> DataFrame: 

8438 return self._flex_arith_method( 

8439 other, operator.mod, level=level, fill_value=fill_value, axis=axis 

8440 ) 

8441 

8442 @Appender(ops.make_flex_doc("rmod", "dataframe")) 

8443 def rmod( 

8444 self, other, axis: Axis = "columns", level=None, fill_value=None 

8445 ) -> DataFrame: 

8446 return self._flex_arith_method( 

8447 other, roperator.rmod, level=level, fill_value=fill_value, axis=axis 

8448 ) 

8449 

8450 @Appender(ops.make_flex_doc("pow", "dataframe")) 

8451 def pow( 

8452 self, other, axis: Axis = "columns", level=None, fill_value=None 

8453 ) -> DataFrame: 

8454 return self._flex_arith_method( 

8455 other, operator.pow, level=level, fill_value=fill_value, axis=axis 

8456 ) 

8457 

8458 @Appender(ops.make_flex_doc("rpow", "dataframe")) 

8459 def rpow( 

8460 self, other, axis: Axis = "columns", level=None, fill_value=None 

8461 ) -> DataFrame: 

8462 return self._flex_arith_method( 

8463 other, roperator.rpow, level=level, fill_value=fill_value, axis=axis 

8464 ) 

8465 

8466 # ---------------------------------------------------------------------- 

8467 # Combination-Related 

8468 

8469 @doc( 

8470 _shared_docs["compare"], 

8471 dedent( 

8472 """ 

8473 Returns 

8474 ------- 

8475 DataFrame 

8476 DataFrame that shows the differences stacked side by side. 

8477 

8478 The resulting index will be a MultiIndex with 'self' and 'other' 

8479 stacked alternately at the inner level. 

8480 

8481 Raises 

8482 ------ 

8483 ValueError 

8484 When the two DataFrames don't have identical labels or shape. 

8485 

8486 See Also 

8487 -------- 

8488 Series.compare : Compare with another Series and show differences. 

8489 DataFrame.equals : Test whether two objects contain the same elements. 

8490 

8491 Notes 

8492 ----- 

8493 Matching NaNs will not appear as a difference. 

8494 

8495 Can only compare identically-labeled 

8496 (i.e. same shape, identical row and column labels) DataFrames 

8497 

8498 Examples 

8499 -------- 

8500 >>> df = pd.DataFrame( 

8501 ... {{ 

8502 ... "col1": ["a", "a", "b", "b", "a"], 

8503 ... "col2": [1.0, 2.0, 3.0, np.nan, 5.0], 

8504 ... "col3": [1.0, 2.0, 3.0, 4.0, 5.0] 

8505 ... }}, 

8506 ... columns=["col1", "col2", "col3"], 

8507 ... ) 

8508 >>> df 

8509 col1 col2 col3 

8510 0 a 1.0 1.0 

8511 1 a 2.0 2.0 

8512 2 b 3.0 3.0 

8513 3 b NaN 4.0 

8514 4 a 5.0 5.0 

8515 

8516 >>> df2 = df.copy() 

8517 >>> df2.loc[0, 'col1'] = 'c' 

8518 >>> df2.loc[2, 'col3'] = 4.0 

8519 >>> df2 

8520 col1 col2 col3 

8521 0 c 1.0 1.0 

8522 1 a 2.0 2.0 

8523 2 b 3.0 4.0 

8524 3 b NaN 4.0 

8525 4 a 5.0 5.0 

8526 

8527 Align the differences on columns 

8528 

8529 >>> df.compare(df2) 

8530 col1 col3 

8531 self other self other 

8532 0 a c NaN NaN 

8533 2 NaN NaN 3.0 4.0 

8534 

8535 Assign result_names 

8536 

8537 >>> df.compare(df2, result_names=("left", "right")) 

8538 col1 col3 

8539 left right left right 

8540 0 a c NaN NaN 

8541 2 NaN NaN 3.0 4.0 

8542 

8543 Stack the differences on rows 

8544 

8545 >>> df.compare(df2, align_axis=0) 

8546 col1 col3 

8547 0 self a NaN 

8548 other c NaN 

8549 2 self NaN 3.0 

8550 other NaN 4.0 

8551 

8552 Keep the equal values 

8553 

8554 >>> df.compare(df2, keep_equal=True) 

8555 col1 col3 

8556 self other self other 

8557 0 a c 1.0 1.0 

8558 2 b b 3.0 4.0 

8559 

8560 Keep all original rows and columns 

8561 

8562 >>> df.compare(df2, keep_shape=True) 

8563 col1 col2 col3 

8564 self other self other self other 

8565 0 a c NaN NaN NaN NaN 

8566 1 NaN NaN NaN NaN NaN NaN 

8567 2 NaN NaN NaN NaN 3.0 4.0 

8568 3 NaN NaN NaN NaN NaN NaN 

8569 4 NaN NaN NaN NaN NaN NaN 

8570 

8571 Keep all original rows and columns and also all original values 

8572 

8573 >>> df.compare(df2, keep_shape=True, keep_equal=True) 

8574 col1 col2 col3 

8575 self other self other self other 

8576 0 a c 1.0 1.0 1.0 1.0 

8577 1 a a 2.0 2.0 2.0 2.0 

8578 2 b b 3.0 3.0 3.0 4.0 

8579 3 b b NaN NaN 4.0 4.0 

8580 4 a a 5.0 5.0 5.0 5.0 

8581 """ 

8582 ), 

8583 klass=_shared_doc_kwargs["klass"], 

8584 ) 

8585 def compare( 

8586 self, 

8587 other: DataFrame, 

8588 align_axis: Axis = 1, 

8589 keep_shape: bool = False, 

8590 keep_equal: bool = False, 

8591 result_names: Suffixes = ("self", "other"), 

8592 ) -> DataFrame: 

8593 return super().compare( 

8594 other=other, 

8595 align_axis=align_axis, 

8596 keep_shape=keep_shape, 

8597 keep_equal=keep_equal, 

8598 result_names=result_names, 

8599 ) 

8600 

8601 def combine( 

8602 self, 

8603 other: DataFrame, 

8604 func: Callable[[Series, Series], Series | Hashable], 

8605 fill_value=None, 

8606 overwrite: bool = True, 

8607 ) -> DataFrame: 

8608 """ 

8609 Perform column-wise combine with another DataFrame. 

8610 

8611 Combines a DataFrame with `other` DataFrame using `func` 

8612 to element-wise combine columns. The row and column indexes of the 

8613 resulting DataFrame will be the union of the two. 

8614 

8615 Parameters 

8616 ---------- 

8617 other : DataFrame 

8618 The DataFrame to merge column-wise. 

8619 func : function 

8620 Function that takes two Series as inputs and returns a Series or a 

8621 scalar. Used to merge the two dataframes column by column. 

8622 fill_value : scalar value, default None 

8623 The value to fill NaNs with prior to passing any column to the 

8624 merge func. 

8625 overwrite : bool, default True 

8626 If True, columns in `self` that do not exist in `other` will be 

8627 overwritten with NaNs. 

8628 

8629 Returns 

8630 ------- 

8631 DataFrame 

8632 Combination of the provided DataFrames. 

8633 

8634 See Also 

8635 -------- 

8636 DataFrame.combine_first : Combine two DataFrame objects and default to 

8637 non-null values in frame calling the method. 

8638 

8639 Examples 

8640 -------- 

8641 Combine using a simple function that chooses the smaller column. 

8642 

8643 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) 

8644 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) 

8645 >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2 

8646 >>> df1.combine(df2, take_smaller) 

8647 A B 

8648 0 0 3 

8649 1 0 3 

8650 

8651 Example using a true element-wise combine function. 

8652 

8653 >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]}) 

8654 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) 

8655 >>> df1.combine(df2, np.minimum) 

8656 A B 

8657 0 1 2 

8658 1 0 3 

8659 

8660 Using `fill_value` fills Nones prior to passing the column to the 

8661 merge function. 

8662 

8663 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) 

8664 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) 

8665 >>> df1.combine(df2, take_smaller, fill_value=-5) 

8666 A B 

8667 0 0 -5.0 

8668 1 0 4.0 

8669 

8670 Note that if the same element in both dataframes is None, the 

8671 fill_value is still applied to both prior to the merge 

8672 

8673 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) 

8674 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]}) 

8675 >>> df1.combine(df2, take_smaller, fill_value=-5) 

8676 A B 

8677 0 0 -5.0 

8678 1 0 3.0 

8679 

8680 Example that demonstrates the use of `overwrite` and behavior when 

8681 the axis differ between the dataframes. 

8682 

8683 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) 

8684 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2]) 

8685 >>> df1.combine(df2, take_smaller) 

8686 A B C 

8687 0 NaN NaN NaN 

8688 1 NaN 3.0 -10.0 

8689 2 NaN 3.0 1.0 

8690 

8691 >>> df1.combine(df2, take_smaller, overwrite=False) 

8692 A B C 

8693 0 0.0 NaN NaN 

8694 1 0.0 3.0 -10.0 

8695 2 NaN 3.0 1.0 

8696 

8697 Demonstrating the preference of the passed in dataframe. 

8698 

8699 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2]) 

8700 >>> df2.combine(df1, take_smaller) 

8701 A B C 

8702 0 0.0 NaN NaN 

8703 1 0.0 3.0 NaN 

8704 2 NaN 3.0 NaN 

8705 

8706 >>> df2.combine(df1, take_smaller, overwrite=False) 

8707 A B C 

8708 0 0.0 NaN NaN 

8709 1 0.0 3.0 1.0 

8710 2 NaN 3.0 1.0 

8711 """ 

8712 other_idxlen = len(other.index) # save for compare 

8713 

8714 this, other = self.align(other, copy=False) 

8715 new_index = this.index 

8716 

8717 if other.empty and len(new_index) == len(self.index): 

8718 return self.copy() 

8719 

8720 if self.empty and len(other) == other_idxlen: 

8721 return other.copy() 

8722 

8723 # sorts if possible; otherwise align above ensures that these are set-equal 

8724 new_columns = this.columns.union(other.columns) 

8725 do_fill = fill_value is not None 

8726 result = {} 

8727 for col in new_columns: 

8728 series = this[col] 

8729 other_series = other[col] 

8730 

8731 this_dtype = series.dtype 

8732 other_dtype = other_series.dtype 

8733 

8734 this_mask = isna(series) 

8735 other_mask = isna(other_series) 

8736 

8737 # don't overwrite columns unnecessarily 

8738 # DO propagate if this column is not in the intersection 

8739 if not overwrite and other_mask.all(): 

8740 result[col] = this[col].copy() 

8741 continue 

8742 

8743 if do_fill: 

8744 series = series.copy() 

8745 other_series = other_series.copy() 

8746 series[this_mask] = fill_value 

8747 other_series[other_mask] = fill_value 

8748 

8749 if col not in self.columns: 

8750 # If this column exists only in `other`, the aligned `series` is 

8751 # all NaN; try to cast it to `other_dtype`. 

8752 new_dtype = other_dtype 

8753 try: 

8754 series = series.astype(new_dtype, copy=False) 

8755 except ValueError: 

8756 # e.g. new_dtype is integer types 

8757 pass 

8758 else: 

8759 # if we have different dtypes, possibly promote 

8760 new_dtype = find_common_type([this_dtype, other_dtype]) 

8761 series = series.astype(new_dtype, copy=False) 

8762 other_series = other_series.astype(new_dtype, copy=False) 

8763 

8764 arr = func(series, other_series) 

8765 if isinstance(new_dtype, np.dtype): 

8766 # if new_dtype is an EA Dtype, then `func` is expected to return 

8767 # the correct dtype without any additional casting 

8768 # error: No overload variant of "maybe_downcast_to_dtype" matches 

8769 # argument types "Union[Series, Hashable]", "dtype[Any]" 

8770 arr = maybe_downcast_to_dtype( # type: ignore[call-overload] 

8771 arr, new_dtype 

8772 ) 

8773 

8774 result[col] = arr 

8775 

8776 # convert_objects just in case 

8777 frame_result = self._constructor(result, index=new_index, columns=new_columns) 

8778 return frame_result.__finalize__(self, method="combine") 

8779 
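When the two frames carry different dtypes for a shared column, the loop above promotes both sides to a common dtype before calling ``func``. A minimal, illustrative sketch of that behaviour using only the public API (the common dtype is whatever ``find_common_type`` infers):

>>> import numpy as np
>>> import pandas as pd
>>> left = pd.DataFrame({"A": [1, 2]})           # int64 column
>>> right = pd.DataFrame({"A": [0.5, np.nan]})   # float64 column
>>> left.combine(right, np.minimum).dtypes["A"]  # both sides cast to float64 first
dtype('float64')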

8780 def combine_first(self, other: DataFrame) -> DataFrame: 

8781 """ 

8782 Update null elements with value in the same location in `other`. 

8783 

8784 Combine two DataFrame objects by filling null values in one DataFrame 

8785 with non-null values from other DataFrame. The row and column indexes 

8786 of the resulting DataFrame will be the union of the two. When calling 

8787 first.combine_first(second), the result keeps the values of the first 

8788 dataframe and only falls back to the second where the first is missing; 

8789 where both first.loc[index, col] and second.loc[index, col] are 

8790 non-missing, the value from the first dataframe is kept. 

8791 

8792 Parameters 

8793 ---------- 

8794 other : DataFrame 

8795 Provided DataFrame to use to fill null values. 

8796 

8797 Returns 

8798 ------- 

8799 DataFrame 

8800 The result of combining the provided DataFrame with the other object. 

8801 

8802 See Also 

8803 -------- 

8804 DataFrame.combine : Perform series-wise operation on two DataFrames 

8805 using a given function. 

8806 

8807 Examples 

8808 -------- 

8809 >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]}) 

8810 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) 

8811 >>> df1.combine_first(df2) 

8812 A B 

8813 0 1.0 3.0 

8814 1 0.0 4.0 

8815 

8816 Null values still persist if the location of that null value 

8817 does not exist in `other` 

8818 

8819 >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]}) 

8820 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2]) 

8821 >>> df1.combine_first(df2) 

8822 A B C 

8823 0 NaN 4.0 NaN 

8824 1 0.0 3.0 1.0 

8825 2 NaN 3.0 1.0 

8826 """ 

8827 from pandas.core.computation import expressions 

8828 

8829 def combiner(x: Series, y: Series): 

8830 mask = x.isna()._values 

8831 

8832 x_values = x._values 

8833 y_values = y._values 

8834 

8835 # If the column y in other DataFrame is not in first DataFrame, 

8836 # just return y_values. 

8837 if y.name not in self.columns: 

8838 return y_values 

8839 

8840 return expressions.where(mask, y_values, x_values) 

8841 

8842 if len(other) == 0: 

8843 combined = self.reindex( 

8844 self.columns.append(other.columns.difference(self.columns)), axis=1 

8845 ) 

8846 combined = combined.astype(other.dtypes) 

8847 else: 

8848 combined = self.combine(other, combiner, overwrite=False) 

8849 

8850 dtypes = { 

8851 col: find_common_type([self.dtypes[col], other.dtypes[col]]) 

8852 for col in self.columns.intersection(other.columns) 

8853 if combined.dtypes[col] != self.dtypes[col] 

8854 } 

8855 

8856 if dtypes: 

8857 combined = combined.astype(dtypes) 

8858 

8859 return combined.__finalize__(self, method="combine_first") 

8860 
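The ``combiner`` above is, per column, a masked selection: wherever ``self`` is NA, take the value from ``other``. A rough NumPy equivalent of that inner step (illustrative only; the real path goes through ``expressions.where`` and the ``combine`` machinery):

>>> import numpy as np
>>> import pandas as pd
>>> x = pd.Series([np.nan, 2.0, np.nan])
>>> y = pd.Series([10.0, 20.0, 30.0])
>>> pd.Series(np.where(x.isna(), y, x))   # y where x is NA, otherwise x
0    10.0
1     2.0
2    30.0
dtype: float64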

8861 def update( 

8862 self, 

8863 other, 

8864 join: UpdateJoin = "left", 

8865 overwrite: bool = True, 

8866 filter_func=None, 

8867 errors: IgnoreRaise = "ignore", 

8868 ) -> None: 

8869 """ 

8870 Modify in place using non-NA values from another DataFrame. 

8871 

8872 Aligns on indices. There is no return value. 

8873 

8874 Parameters 

8875 ---------- 

8876 other : DataFrame, or object coercible into a DataFrame 

8877 Should have at least one matching index/column label 

8878 with the original DataFrame. If a Series is passed, 

8879 its name attribute must be set, and that will be 

8880 used as the column name to align with the original DataFrame. 

8881 join : {'left'}, default 'left' 

8882 Only left join is implemented, keeping the index and columns of the 

8883 original object. 

8884 overwrite : bool, default True 

8885 How to handle non-NA values for overlapping keys: 

8886 

8887 * True: overwrite original DataFrame's values 

8888 with values from `other`. 

8889 * False: only update values that are NA in 

8890 the original DataFrame. 

8891 

8892 filter_func : callable(1d-array) -> bool 1d-array, optional 

8893 Can choose to replace values other than NA. Return True for values 

8894 that should be updated. 

8895 errors : {'raise', 'ignore'}, default 'ignore' 

8896 If 'raise', will raise a ValueError if the DataFrame and `other` 

8897 both contain non-NA data in the same place. 

8898 

8899 Returns 

8900 ------- 

8901 None 

8902 This method directly changes calling object. 

8903 

8904 Raises 

8905 ------ 

8906 ValueError 

8907 * When `errors='raise'` and there's overlapping non-NA data. 

8908 * When `errors` is neither `'ignore'` nor `'raise'` 

8909 NotImplementedError 

8910 * If `join != 'left'` 

8911 

8912 See Also 

8913 -------- 

8914 dict.update : Similar method for dictionaries. 

8915 DataFrame.merge : For column(s)-on-column(s) operations. 

8916 

8917 Examples 

8918 -------- 

8919 >>> df = pd.DataFrame({'A': [1, 2, 3], 

8920 ... 'B': [400, 500, 600]}) 

8921 >>> new_df = pd.DataFrame({'B': [4, 5, 6], 

8922 ... 'C': [7, 8, 9]}) 

8923 >>> df.update(new_df) 

8924 >>> df 

8925 A B 

8926 0 1 4 

8927 1 2 5 

8928 2 3 6 

8929 

8930 The DataFrame's length does not increase as a result of the update, 

8931 only values at matching index/column labels are updated. 

8932 

8933 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], 

8934 ... 'B': ['x', 'y', 'z']}) 

8935 >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']}) 

8936 >>> df.update(new_df) 

8937 >>> df 

8938 A B 

8939 0 a d 

8940 1 b e 

8941 2 c f 

8942 

8943 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], 

8944 ... 'B': ['x', 'y', 'z']}) 

8945 >>> new_df = pd.DataFrame({'B': ['d', 'f']}, index=[0, 2]) 

8946 >>> df.update(new_df) 

8947 >>> df 

8948 A B 

8949 0 a d 

8950 1 b y 

8951 2 c f 

8952 

8953 For Series, its name attribute must be set. 

8954 

8955 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], 

8956 ... 'B': ['x', 'y', 'z']}) 

8957 >>> new_column = pd.Series(['d', 'e', 'f'], name='B') 

8958 >>> df.update(new_column) 

8959 >>> df 

8960 A B 

8961 0 a d 

8962 1 b e 

8963 2 c f 

8964 

8965 If `other` contains NaNs, the corresponding values are not updated 

8966 in the original dataframe. 

8967 

8968 >>> df = pd.DataFrame({'A': [1, 2, 3], 

8969 ... 'B': [400., 500., 600.]}) 

8970 >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]}) 

8971 >>> df.update(new_df) 

8972 >>> df 

8973 A B 

8974 0 1 4.0 

8975 1 2 500.0 

8976 2 3 6.0 

8977 """ 

8978 

8979 if not PYPY and using_copy_on_write(): 

8980 if sys.getrefcount(self) <= REF_COUNT: 

8981 warnings.warn( 

8982 _chained_assignment_method_msg, 

8983 ChainedAssignmentError, 

8984 stacklevel=2, 

8985 ) 

8986 elif not PYPY and not using_copy_on_write() and self._is_view_after_cow_rules(): 

8987 if sys.getrefcount(self) <= REF_COUNT: 

8988 warnings.warn( 

8989 _chained_assignment_warning_method_msg, 

8990 FutureWarning, 

8991 stacklevel=2, 

8992 ) 

8993 

8994 # TODO: Support other joins 

8995 if join != "left": # pragma: no cover 

8996 raise NotImplementedError("Only left join is supported") 

8997 if errors not in ["ignore", "raise"]: 

8998 raise ValueError("The parameter errors must be either 'ignore' or 'raise'") 

8999 

9000 if not isinstance(other, DataFrame): 

9001 other = DataFrame(other) 

9002 

9003 other = other.reindex(self.index) 

9004 

9005 for col in self.columns.intersection(other.columns): 

9006 this = self[col]._values 

9007 that = other[col]._values 

9008 

9009 if filter_func is not None: 

9010 mask = ~filter_func(this) | isna(that) 

9011 else: 

9012 if errors == "raise": 

9013 mask_this = notna(that) 

9014 mask_that = notna(this) 

9015 if any(mask_this & mask_that): 

9016 raise ValueError("Data overlaps.") 

9017 

9018 if overwrite: 

9019 mask = isna(that) 

9020 else: 

9021 mask = notna(this) 

9022 

9023 # don't overwrite columns unnecessarily 

9024 if mask.all(): 

9025 continue 

9026 

9027 with warnings.catch_warnings(): 

9028 warnings.filterwarnings( 

9029 "ignore", 

9030 message="Downcasting behavior", 

9031 category=FutureWarning, 

9032 ) 

9033 # GH#57124 - `that` might get upcasted because of NA values, and then 

9034 # downcasted in where because of the mask. Ignoring the warning 

9035 # is a stopgap, will replace with a new implementation of update 

9036 # in 3.0. 

9037 self.loc[:, col] = self[col].where(mask, that) 

9038 
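The ``filter_func`` branch above only replaces positions where ``filter_func`` returns True and ``other`` is non-NA. A small illustration of that mask logic (output shown as expected under those semantics):

>>> df = pd.DataFrame({"A": [1, 2, 3], "B": [400, 500, 600]})
>>> new_df = pd.DataFrame({"B": [4, 5, 6]})
>>> df.update(new_df, filter_func=lambda arr: arr > 450)  # only replace values > 450
>>> df
   A    B
0  1  400
1  2    5
2  3    6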

9039 # ---------------------------------------------------------------------- 

9040 # Data reshaping 

9041 @Appender( 

9042 dedent( 

9043 """ 

9044 Examples 

9045 -------- 

9046 >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', 

9047 ... 'Parrot', 'Parrot'], 

9048 ... 'Max Speed': [380., 370., 24., 26.]}) 

9049 >>> df 

9050 Animal Max Speed 

9051 0 Falcon 380.0 

9052 1 Falcon 370.0 

9053 2 Parrot 24.0 

9054 3 Parrot 26.0 

9055 >>> df.groupby(['Animal']).mean() 

9056 Max Speed 

9057 Animal 

9058 Falcon 375.0 

9059 Parrot 25.0 

9060 

9061 **Hierarchical Indexes** 

9062 

9063 We can groupby different levels of a hierarchical index 

9064 using the `level` parameter: 

9065 

9066 >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], 

9067 ... ['Captive', 'Wild', 'Captive', 'Wild']] 

9068 >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) 

9069 >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, 

9070 ... index=index) 

9071 >>> df 

9072 Max Speed 

9073 Animal Type 

9074 Falcon Captive 390.0 

9075 Wild 350.0 

9076 Parrot Captive 30.0 

9077 Wild 20.0 

9078 >>> df.groupby(level=0).mean() 

9079 Max Speed 

9080 Animal 

9081 Falcon 370.0 

9082 Parrot 25.0 

9083 >>> df.groupby(level="Type").mean() 

9084 Max Speed 

9085 Type 

9086 Captive 210.0 

9087 Wild 185.0 

9088 

9089 We can also choose to include NA in group keys or not by setting the 

9090 `dropna` parameter; the default setting is `True`. 

9091 

9092 >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] 

9093 >>> df = pd.DataFrame(l, columns=["a", "b", "c"]) 

9094 

9095 >>> df.groupby(by=["b"]).sum() 

9096 a c 

9097 b 

9098 1.0 2 3 

9099 2.0 2 5 

9100 

9101 >>> df.groupby(by=["b"], dropna=False).sum() 

9102 a c 

9103 b 

9104 1.0 2 3 

9105 2.0 2 5 

9106 NaN 1 4 

9107 

9108 >>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]] 

9109 >>> df = pd.DataFrame(l, columns=["a", "b", "c"]) 

9110 

9111 >>> df.groupby(by="a").sum() 

9112 b c 

9113 a 

9114 a 13.0 13.0 

9115 b 12.3 123.0 

9116 

9117 >>> df.groupby(by="a", dropna=False).sum() 

9118 b c 

9119 a 

9120 a 13.0 13.0 

9121 b 12.3 123.0 

9122 NaN 12.3 33.0 

9123 

9124 When using ``.apply()``, use ``group_keys`` to include or exclude the 

9125 group keys. The ``group_keys`` argument defaults to ``True`` (include). 

9126 

9127 >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', 

9128 ... 'Parrot', 'Parrot'], 

9129 ... 'Max Speed': [380., 370., 24., 26.]}) 

9130 >>> df.groupby("Animal", group_keys=True)[['Max Speed']].apply(lambda x: x) 

9131 Max Speed 

9132 Animal 

9133 Falcon 0 380.0 

9134 1 370.0 

9135 Parrot 2 24.0 

9136 3 26.0 

9137 

9138 >>> df.groupby("Animal", group_keys=False)[['Max Speed']].apply(lambda x: x) 

9139 Max Speed 

9140 0 380.0 

9141 1 370.0 

9142 2 24.0 

9143 3 26.0 

9144 """ 

9145 ) 

9146 ) 

9147 @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) 

9148 def groupby( 

9149 self, 

9150 by=None, 

9151 axis: Axis | lib.NoDefault = lib.no_default, 

9152 level: IndexLabel | None = None, 

9153 as_index: bool = True, 

9154 sort: bool = True, 

9155 group_keys: bool = True, 

9156 observed: bool | lib.NoDefault = lib.no_default, 

9157 dropna: bool = True, 

9158 ) -> DataFrameGroupBy: 

9159 if axis is not lib.no_default: 

9160 axis = self._get_axis_number(axis) 

9161 if axis == 1: 

9162 warnings.warn( 

9163 "DataFrame.groupby with axis=1 is deprecated. Do " 

9164 "`frame.T.groupby(...)` without axis instead.", 

9165 FutureWarning, 

9166 stacklevel=find_stack_level(), 

9167 ) 

9168 else: 

9169 warnings.warn( 

9170 "The 'axis' keyword in DataFrame.groupby is deprecated and " 

9171 "will be removed in a future version.", 

9172 FutureWarning, 

9173 stacklevel=find_stack_level(), 

9174 ) 

9175 else: 

9176 axis = 0 

9177 

9178 from pandas.core.groupby.generic import DataFrameGroupBy 

9179 

9180 if level is None and by is None: 

9181 raise TypeError("You have to supply one of 'by' and 'level'") 

9182 

9183 return DataFrameGroupBy( 

9184 obj=self, 

9185 keys=by, 

9186 axis=axis, 

9187 level=level, 

9188 as_index=as_index, 

9189 sort=sort, 

9190 group_keys=group_keys, 

9191 observed=observed, 

9192 dropna=dropna, 

9193 ) 

9194 
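The deprecation warning above points at transposing instead of grouping along columns. A quick sketch of the suggested replacement for ``axis=1`` grouping (illustrative; sums duplicate column labels):

>>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"])
>>> df.T.groupby(level=0).sum().T   # replaces df.groupby(level=0, axis=1).sum()
   a  b
0  3  3
1  9  6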

9195 _shared_docs[ 

9196 "pivot" 

9197 ] = """ 

9198 Return reshaped DataFrame organized by given index / column values. 

9199 

9200 Reshape data (produce a "pivot" table) based on column values. Uses 

9201 unique values from specified `index` / `columns` to form axes of the 

9202 resulting DataFrame. This function does not support data 

9203 aggregation; multiple values will result in a MultiIndex in the 

9204 columns. See the :ref:`User Guide <reshaping>` for more on reshaping. 

9205 

9206 Parameters 

9207 ----------%s 

9208 columns : str or object or a list of str 

9209 Column to use to make new frame's columns. 

9210 index : str or object or a list of str, optional 

9211 Column to use to make new frame's index. If not given, uses existing index. 

9212 values : str, object or a list of the previous, optional 

9213 Column(s) to use for populating new frame's values. If not 

9214 specified, all remaining columns will be used and the result will 

9215 have hierarchically indexed columns. 

9216 

9217 Returns 

9218 ------- 

9219 DataFrame 

9220 Returns reshaped DataFrame. 

9221 

9222 Raises 

9223 ------ 

9224 ValueError: 

9225 When there are any `index`, `columns` combinations with multiple 

9226 values. Use `DataFrame.pivot_table` when you need to aggregate. 

9227 

9228 See Also 

9229 -------- 

9230 DataFrame.pivot_table : Generalization of pivot that can handle 

9231 duplicate values for one index/column pair. 

9232 DataFrame.unstack : Pivot based on the index values instead of a 

9233 column. 

9234 wide_to_long : Wide panel to long format. Less flexible but more 

9235 user-friendly than melt. 

9236 

9237 Notes 

9238 ----- 

9239 For finer-tuned control, see hierarchical indexing documentation along 

9240 with the related stack/unstack methods. 

9241 

9242 Reference :ref:`the user guide <reshaping.pivot>` for more examples. 

9243 

9244 Examples 

9245 -------- 

9246 >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 

9247 ... 'two'], 

9248 ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], 

9249 ... 'baz': [1, 2, 3, 4, 5, 6], 

9250 ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']}) 

9251 >>> df 

9252 foo bar baz zoo 

9253 0 one A 1 x 

9254 1 one B 2 y 

9255 2 one C 3 z 

9256 3 two A 4 q 

9257 4 two B 5 w 

9258 5 two C 6 t 

9259 

9260 >>> df.pivot(index='foo', columns='bar', values='baz') 

9261 bar A B C 

9262 foo 

9263 one 1 2 3 

9264 two 4 5 6 

9265 

9266 >>> df.pivot(index='foo', columns='bar')['baz'] 

9267 bar A B C 

9268 foo 

9269 one 1 2 3 

9270 two 4 5 6 

9271 

9272 >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo']) 

9273 baz zoo 

9274 bar A B C A B C 

9275 foo 

9276 one 1 2 3 x y z 

9277 two 4 5 6 q w t 

9278 

9279 You could also assign a list of column names or a list of index names. 

9280 

9281 >>> df = pd.DataFrame({ 

9282 ... "lev1": [1, 1, 1, 2, 2, 2], 

9283 ... "lev2": [1, 1, 2, 1, 1, 2], 

9284 ... "lev3": [1, 2, 1, 2, 1, 2], 

9285 ... "lev4": [1, 2, 3, 4, 5, 6], 

9286 ... "values": [0, 1, 2, 3, 4, 5]}) 

9287 >>> df 

9288 lev1 lev2 lev3 lev4 values 

9289 0 1 1 1 1 0 

9290 1 1 1 2 2 1 

9291 2 1 2 1 3 2 

9292 3 2 1 2 4 3 

9293 4 2 1 1 5 4 

9294 5 2 2 2 6 5 

9295 

9296 >>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values") 

9297 lev2 1 2 

9298 lev3 1 2 1 2 

9299 lev1 

9300 1 0.0 1.0 2.0 NaN 

9301 2 4.0 3.0 NaN 5.0 

9302 

9303 >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values") 

9304 lev3 1 2 

9305 lev1 lev2 

9306 1 1 0.0 1.0 

9307 2 2.0 NaN 

9308 2 1 4.0 3.0 

9309 2 NaN 5.0 

9310 

9311 A ValueError is raised if there are any duplicates. 

9312 

9313 >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'], 

9314 ... "bar": ['A', 'A', 'B', 'C'], 

9315 ... "baz": [1, 2, 3, 4]}) 

9316 >>> df 

9317 foo bar baz 

9318 0 one A 1 

9319 1 one A 2 

9320 2 two B 3 

9321 3 two C 4 

9322 

9323 Notice that the first two rows are the same for our `index` 

9324 and `columns` arguments. 

9325 

9326 >>> df.pivot(index='foo', columns='bar', values='baz') 

9327 Traceback (most recent call last): 

9328 ... 

9329 ValueError: Index contains duplicate entries, cannot reshape 

9330 """ 

9331 

9332 @Substitution("") 

9333 @Appender(_shared_docs["pivot"]) 

9334 def pivot( 

9335 self, *, columns, index=lib.no_default, values=lib.no_default 

9336 ) -> DataFrame: 

9337 from pandas.core.reshape.pivot import pivot 

9338 

9339 return pivot(self, index=index, columns=columns, values=values) 

9340 
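For the duplicate-entries case at the end of the pivot docstring, the documented escape hatch is ``pivot_table`` with an aggregation. A hedged sketch continuing that example (the choice of ``aggfunc`` here is arbitrary):

>>> df.pivot_table(index='foo', columns='bar', values='baz', aggfunc='mean')
bar    A    B    C
foo
one  1.5  NaN  NaN
two  NaN  3.0  4.0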

9341 _shared_docs[ 

9342 "pivot_table" 

9343 ] = """ 

9344 Create a spreadsheet-style pivot table as a DataFrame. 

9345 

9346 The levels in the pivot table will be stored in MultiIndex objects 

9347 (hierarchical indexes) on the index and columns of the result DataFrame. 

9348 

9349 Parameters 

9350 ----------%s 

9351 values : list-like or scalar, optional 

9352 Column or columns to aggregate. 

9353 index : column, Grouper, array, or list of the previous 

9354 Keys to group by on the pivot table index. If a list is passed, 

9355 it can contain any of the other types (except list). If an array is 

9356 passed, it must be the same length as the data and will be used in 

9357 the same manner as column values. 

9358 columns : column, Grouper, array, or list of the previous 

9359 Keys to group by on the pivot table column. If a list is passed, 

9360 it can contain any of the other types (except list). If an array is 

9361 passed, it must be the same length as the data and will be used in 

9362 the same manner as column values. 

9363 aggfunc : function, list of functions, dict, default "mean" 

9364 If a list of functions is passed, the resulting pivot table will have 

9365 hierarchical columns whose top level are the function names 

9366 (inferred from the function objects themselves). 

9367 If a dict is passed, the key is column to aggregate and the value is 

9368 function or list of functions. If ``margins=True``, aggfunc will be 

9369 used to calculate the partial aggregates. 

9370 fill_value : scalar, default None 

9371 Value to replace missing values with (in the resulting pivot table, 

9372 after aggregation). 

9373 margins : bool, default False 

9374 If ``margins=True``, special ``All`` columns and rows 

9375 will be added with partial group aggregates across the categories 

9376 on the rows and columns. 

9377 dropna : bool, default True 

9378 Do not include columns whose entries are all NaN. If True, 

9379 rows with a NaN value in any column will be omitted before 

9380 computing margins. 

9381 margins_name : str, default 'All' 

9382 Name of the row / column that will contain the totals 

9383 when margins is True. 

9384 observed : bool, default False 

9385 This only applies if any of the groupers are Categoricals. 

9386 If True: only show observed values for categorical groupers. 

9387 If False: show all values for categorical groupers. 

9388 

9389 .. deprecated:: 2.2.0 

9390 

9391 The default value of ``False`` is deprecated and will change to 

9392 ``True`` in a future version of pandas. 

9393 

9394 sort : bool, default True 

9395 Specifies if the result should be sorted. 

9396 

9397 .. versionadded:: 1.3.0 

9398 

9399 Returns 

9400 ------- 

9401 DataFrame 

9402 An Excel style pivot table. 

9403 

9404 See Also 

9405 -------- 

9406 DataFrame.pivot : Pivot without aggregation that can handle 

9407 non-numeric data. 

9408 DataFrame.melt: Unpivot a DataFrame from wide to long format, 

9409 optionally leaving identifiers set. 

9410 wide_to_long : Wide panel to long format. Less flexible but more 

9411 user-friendly than melt. 

9412 

9413 Notes 

9414 ----- 

9415 Reference :ref:`the user guide <reshaping.pivot>` for more examples. 

9416 

9417 Examples 

9418 -------- 

9419 >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", 

9420 ... "bar", "bar", "bar", "bar"], 

9421 ... "B": ["one", "one", "one", "two", "two", 

9422 ... "one", "one", "two", "two"], 

9423 ... "C": ["small", "large", "large", "small", 

9424 ... "small", "large", "small", "small", 

9425 ... "large"], 

9426 ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], 

9427 ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]}) 

9428 >>> df 

9429 A B C D E 

9430 0 foo one small 1 2 

9431 1 foo one large 2 4 

9432 2 foo one large 2 5 

9433 3 foo two small 3 5 

9434 4 foo two small 3 6 

9435 5 bar one large 4 6 

9436 6 bar one small 5 8 

9437 7 bar two small 6 9 

9438 8 bar two large 7 9 

9439 

9440 This first example aggregates values by taking the sum. 

9441 

9442 >>> table = pd.pivot_table(df, values='D', index=['A', 'B'], 

9443 ... columns=['C'], aggfunc="sum") 

9444 >>> table 

9445 C large small 

9446 A B 

9447 bar one 4.0 5.0 

9448 two 7.0 6.0 

9449 foo one 4.0 1.0 

9450 two NaN 6.0 

9451 

9452 We can also fill missing values using the `fill_value` parameter. 

9453 

9454 >>> table = pd.pivot_table(df, values='D', index=['A', 'B'], 

9455 ... columns=['C'], aggfunc="sum", fill_value=0) 

9456 >>> table 

9457 C large small 

9458 A B 

9459 bar one 4 5 

9460 two 7 6 

9461 foo one 4 1 

9462 two 0 6 

9463 

9464 The next example aggregates by taking the mean across multiple columns. 

9465 

9466 >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], 

9467 ... aggfunc={'D': "mean", 'E': "mean"}) 

9468 >>> table 

9469 D E 

9470 A C 

9471 bar large 5.500000 7.500000 

9472 small 5.500000 8.500000 

9473 foo large 2.000000 4.500000 

9474 small 2.333333 4.333333 

9475 

9476 We can also calculate multiple types of aggregations for any given 

9477 value column. 

9478 

9479 >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], 

9480 ... aggfunc={'D': "mean", 

9481 ... 'E': ["min", "max", "mean"]}) 

9482 >>> table 

9483 D E 

9484 mean max mean min 

9485 A C 

9486 bar large 5.500000 9 7.500000 6 

9487 small 5.500000 9 8.500000 8 

9488 foo large 2.000000 5 4.500000 4 

9489 small 2.333333 6 4.333333 2 

9490 """ 

9491 

9492 @Substitution("") 

9493 @Appender(_shared_docs["pivot_table"]) 

9494 def pivot_table( 

9495 self, 

9496 values=None, 

9497 index=None, 

9498 columns=None, 

9499 aggfunc: AggFuncType = "mean", 

9500 fill_value=None, 

9501 margins: bool = False, 

9502 dropna: bool = True, 

9503 margins_name: Level = "All", 

9504 observed: bool | lib.NoDefault = lib.no_default, 

9505 sort: bool = True, 

9506 ) -> DataFrame: 

9507 from pandas.core.reshape.pivot import pivot_table 

9508 

9509 return pivot_table( 

9510 self, 

9511 values=values, 

9512 index=index, 

9513 columns=columns, 

9514 aggfunc=aggfunc, 

9515 fill_value=fill_value, 

9516 margins=margins, 

9517 dropna=dropna, 

9518 margins_name=margins_name, 

9519 observed=observed, 

9520 sort=sort, 

9521 ) 

9522 
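``margins=True`` (described in the parameters but not exercised in the examples) appends ``All`` partial aggregates. A sketch using the same frame as the docstring examples, with the output shown as expected under ``aggfunc='sum'``:

>>> pd.pivot_table(df, values='D', index='A', columns='C',
...                aggfunc='sum', margins=True)
C    large  small  All
A
bar     11     11   22
foo      4      7   11
All     15     18   33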

9523 def stack( 

9524 self, 

9525 level: IndexLabel = -1, 

9526 dropna: bool | lib.NoDefault = lib.no_default, 

9527 sort: bool | lib.NoDefault = lib.no_default, 

9528 future_stack: bool = False, 

9529 ): 

9530 """ 

9531 Stack the prescribed level(s) from columns to index. 

9532 

9533 Return a reshaped DataFrame or Series having a multi-level 

9534 index with one or more new inner-most levels compared to the current 

9535 DataFrame. The new inner-most levels are created by pivoting the 

9536 columns of the current dataframe: 

9537 

9538 - if the columns have a single level, the output is a Series; 

9539 - if the columns have multiple levels, the new index 

9540 level(s) is (are) taken from the prescribed level(s) and 

9541 the output is a DataFrame. 

9542 

9543 Parameters 

9544 ---------- 

9545 level : int, str, list, default -1 

9546 Level(s) to stack from the column axis onto the index 

9547 axis, defined as one index or label, or a list of indices 

9548 or labels. 

9549 dropna : bool, default True 

9550 Whether to drop rows in the resulting Frame/Series with 

9551 missing values. Stacking a column level onto the index 

9552 axis can create combinations of index and column values 

9553 that are missing from the original dataframe. See Examples 

9554 section. 

9555 sort : bool, default True 

9556 Whether to sort the levels of the resulting MultiIndex. 

9557 future_stack : bool, default False 

9558 Whether to use the new implementation that will replace the current 

9559 implementation in pandas 3.0. When True, dropna and sort have no impact 

9560 on the result and must remain unspecified. See :ref:`pandas 2.1.0 Release 

9561 notes <whatsnew_210.enhancements.new_stack>` for more details. 

9562 

9563 Returns 

9564 ------- 

9565 DataFrame or Series 

9566 Stacked dataframe or series. 

9567 

9568 See Also 

9569 -------- 

9570 DataFrame.unstack : Unstack prescribed level(s) from index axis 

9571 onto column axis. 

9572 DataFrame.pivot : Reshape dataframe from long format to wide 

9573 format. 

9574 DataFrame.pivot_table : Create a spreadsheet-style pivot table 

9575 as a DataFrame. 

9576 

9577 Notes 

9578 ----- 

9579 The function is named by analogy with a collection of books 

9580 being reorganized from being side by side on a horizontal 

9581 position (the columns of the dataframe) to being stacked 

9582 vertically on top of each other (in the index of the 

9583 dataframe). 

9584 

9585 Reference :ref:`the user guide <reshaping.stacking>` for more examples. 

9586 

9587 Examples 

9588 -------- 

9589 **Single level columns** 

9590 

9591 >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]], 

9592 ... index=['cat', 'dog'], 

9593 ... columns=['weight', 'height']) 

9594 

9595 Stacking a dataframe with a single level column axis returns a Series: 

9596 

9597 >>> df_single_level_cols 

9598 weight height 

9599 cat 0 1 

9600 dog 2 3 

9601 >>> df_single_level_cols.stack(future_stack=True) 

9602 cat weight 0 

9603 height 1 

9604 dog weight 2 

9605 height 3 

9606 dtype: int64 

9607 

9608 **Multi level columns: simple case** 

9609 

9610 >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'), 

9611 ... ('weight', 'pounds')]) 

9612 >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]], 

9613 ... index=['cat', 'dog'], 

9614 ... columns=multicol1) 

9615 

9616 Stacking a dataframe with a multi-level column axis: 

9617 

9618 >>> df_multi_level_cols1 

9619 weight 

9620 kg pounds 

9621 cat 1 2 

9622 dog 2 4 

9623 >>> df_multi_level_cols1.stack(future_stack=True) 

9624 weight 

9625 cat kg 1 

9626 pounds 2 

9627 dog kg 2 

9628 pounds 4 

9629 

9630 **Missing values** 

9631 

9632 >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'), 

9633 ... ('height', 'm')]) 

9634 >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]], 

9635 ... index=['cat', 'dog'], 

9636 ... columns=multicol2) 

9637 

9638 It is common to have missing values when stacking a dataframe 

9639 with multi-level columns, as the stacked dataframe typically 

9640 has more values than the original dataframe. Missing values 

9641 are filled with NaNs: 

9642 

9643 >>> df_multi_level_cols2 

9644 weight height 

9645 kg m 

9646 cat 1.0 2.0 

9647 dog 3.0 4.0 

9648 >>> df_multi_level_cols2.stack(future_stack=True) 

9649 weight height 

9650 cat kg 1.0 NaN 

9651 m NaN 2.0 

9652 dog kg 3.0 NaN 

9653 m NaN 4.0 

9654 

9655 **Prescribing the level(s) to be stacked** 

9656 

9657 The first parameter controls which level or levels are stacked: 

9658 

9659 >>> df_multi_level_cols2.stack(0, future_stack=True) 

9660 kg m 

9661 cat weight 1.0 NaN 

9662 height NaN 2.0 

9663 dog weight 3.0 NaN 

9664 height NaN 4.0 

9665 >>> df_multi_level_cols2.stack([0, 1], future_stack=True) 

9666 cat weight kg 1.0 

9667 height m 2.0 

9668 dog weight kg 3.0 

9669 height m 4.0 

9670 dtype: float64 

9671 """ 

9672 if not future_stack: 

9673 from pandas.core.reshape.reshape import ( 

9674 stack, 

9675 stack_multiple, 

9676 ) 

9677 

9678 if ( 

9679 dropna is not lib.no_default 

9680 or sort is not lib.no_default 

9681 or self.columns.nlevels > 1 

9682 ): 

9683 warnings.warn( 

9684 "The previous implementation of stack is deprecated and will be " 

9685 "removed in a future version of pandas. See the What's New notes " 

9686 "for pandas 2.1.0 for details. Specify future_stack=True to adopt " 

9687 "the new implementation and silence this warning.", 

9688 FutureWarning, 

9689 stacklevel=find_stack_level(), 

9690 ) 

9691 

9692 if dropna is lib.no_default: 

9693 dropna = True 

9694 if sort is lib.no_default: 

9695 sort = True 

9696 

9697 if isinstance(level, (tuple, list)): 

9698 result = stack_multiple(self, level, dropna=dropna, sort=sort) 

9699 else: 

9700 result = stack(self, level, dropna=dropna, sort=sort) 

9701 else: 

9702 from pandas.core.reshape.reshape import stack_v3 

9703 

9704 if dropna is not lib.no_default: 

9705 raise ValueError( 

9706 "dropna must be unspecified with future_stack=True as the new " 

9707 "implementation does not introduce rows of NA values. This " 

9708 "argument will be removed in a future version of pandas." 

9709 ) 

9710 

9711 if sort is not lib.no_default: 

9712 raise ValueError( 

9713 "Cannot specify sort with future_stack=True, this argument will be " 

9714 "removed in a future version of pandas. Sort the result using " 

9715 ".sort_index instead." 

9716 ) 

9717 

9718 if ( 

9719 isinstance(level, (tuple, list)) 

9720 and not all(lev in self.columns.names for lev in level) 

9721 and not all(isinstance(lev, int) for lev in level) 

9722 ): 

9723 raise ValueError( 

9724 "level should contain all level names or all level " 

9725 "numbers, not a mixture of the two." 

9726 ) 

9727 

9728 if not isinstance(level, (tuple, list)): 

9729 level = [level] 

9730 level = [self.columns._get_level_number(lev) for lev in level] 

9731 result = stack_v3(self, level) 

9732 

9733 return result.__finalize__(self, method="stack") 

9734 
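The guards above make the new implementation strict about the legacy keywords: passing ``dropna`` (or ``sort``) together with ``future_stack=True`` raises rather than being silently ignored. Illustrative:

>>> df = pd.DataFrame([[0, 1], [2, 3]], columns=['a', 'b'])
>>> df.stack(future_stack=True, dropna=True)
Traceback (most recent call last):
...
ValueError: dropna must be unspecified with future_stack=True as the new implementation does not introduce rows of NA values. This argument will be removed in a future version of pandas.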

9735 def explode( 

9736 self, 

9737 column: IndexLabel, 

9738 ignore_index: bool = False, 

9739 ) -> DataFrame: 

9740 """ 

9741 Transform each element of a list-like to a row, replicating index values. 

9742 

9743 Parameters 

9744 ---------- 

9745 column : IndexLabel 

9746 Column(s) to explode. 

9747 For multiple columns, specify a non-empty list in which each element 

9748 is a str or tuple; the list-like values in all specified columns 

9749 must have matching lengths on each row of the frame. 

9750 

9751 .. versionadded:: 1.3.0 

9752 Multi-column explode 

9753 

9754 ignore_index : bool, default False 

9755 If True, the resulting index will be labeled 0, 1, …, n - 1. 

9756 

9757 Returns 

9758 ------- 

9759 DataFrame 

9760 Exploded lists to rows of the subset columns; 

9761 index will be duplicated for these rows. 

9762 

9763 Raises 

9764 ------ 

9765 ValueError : 

9766 * If columns of the frame are not unique. 

9767 * If the specified columns to explode are an empty list. 

9768 * If the specified columns to explode do not have matching counts 

9769 of elements rowwise in the frame. 

9770 

9771 See Also 

9772 -------- 

9773 DataFrame.unstack : Pivot a level of the (necessarily hierarchical) 

9774 index labels. 

9775 DataFrame.melt : Unpivot a DataFrame from wide format to long format. 

9776 Series.explode : Explode a Series from list-like values to long format. 

9777 

9778 Notes 

9779 ----- 

9780 This routine will explode list-likes including lists, tuples, sets, 

9781 Series, and np.ndarray. The result dtype of the subset rows will 

9782 be object. Scalars will be returned unchanged, and empty list-likes will 

9783 result in a np.nan for that row. In addition, the ordering of rows in the 

9784 output will be non-deterministic when exploding sets. 

9785 

9786 Reference :ref:`the user guide <reshaping.explode>` for more examples. 

9787 

9788 Examples 

9789 -------- 

9790 >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]], 

9791 ... 'B': 1, 

9792 ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]}) 

9793 >>> df 

9794 A B C 

9795 0 [0, 1, 2] 1 [a, b, c] 

9796 1 foo 1 NaN 

9797 2 [] 1 [] 

9798 3 [3, 4] 1 [d, e] 

9799 

9800 Single-column explode. 

9801 

9802 >>> df.explode('A') 

9803 A B C 

9804 0 0 1 [a, b, c] 

9805 0 1 1 [a, b, c] 

9806 0 2 1 [a, b, c] 

9807 1 foo 1 NaN 

9808 2 NaN 1 [] 

9809 3 3 1 [d, e] 

9810 3 4 1 [d, e] 

9811 

9812 Multi-column explode. 

9813 

9814 >>> df.explode(list('AC')) 

9815 A B C 

9816 0 0 1 a 

9817 0 1 1 b 

9818 0 2 1 c 

9819 1 foo 1 NaN 

9820 2 NaN 1 NaN 

9821 3 3 1 d 

9822 3 4 1 e 

9823 """ 

9824 if not self.columns.is_unique: 

9825 duplicate_cols = self.columns[self.columns.duplicated()].tolist() 

9826 raise ValueError( 

9827 f"DataFrame columns must be unique. Duplicate columns: {duplicate_cols}" 

9828 ) 

9829 

9830 columns: list[Hashable] 

9831 if is_scalar(column) or isinstance(column, tuple): 

9832 columns = [column] 

9833 elif isinstance(column, list) and all( 

9834 is_scalar(c) or isinstance(c, tuple) for c in column 

9835 ): 

9836 if not column: 

9837 raise ValueError("column must be nonempty") 

9838 if len(column) > len(set(column)): 

9839 raise ValueError("column must be unique") 

9840 columns = column 

9841 else: 

9842 raise ValueError("column must be a scalar, tuple, or list thereof") 

9843 

9844 df = self.reset_index(drop=True) 

9845 if len(columns) == 1: 

9846 result = df[columns[0]].explode() 

9847 else: 

9848 mylen = lambda x: len(x) if (is_list_like(x) and len(x) > 0) else 1 

9849 counts0 = self[columns[0]].apply(mylen) 

9850 for c in columns[1:]: 

9851 if not all(counts0 == self[c].apply(mylen)): 

9852 raise ValueError("columns must have matching element counts") 

9853 result = DataFrame({c: df[c].explode() for c in columns}) 

9854 result = df.drop(columns, axis=1).join(result) 

9855 if ignore_index: 

9856 result.index = default_index(len(result)) 

9857 else: 

9858 result.index = self.index.take(result.index) 

9859 result = result.reindex(columns=self.columns, copy=False) 

9860 

9861 return result.__finalize__(self, method="explode") 

9862 
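``ignore_index=True`` renumbers the exploded rows instead of repeating the original labels; continuing the frame from the explode examples above (output shown as expected):

>>> df.explode('A', ignore_index=True)
     A  B          C
0    0  1  [a, b, c]
1    1  1  [a, b, c]
2    2  1  [a, b, c]
3  foo  1        NaN
4  NaN  1         []
5    3  1     [d, e]
6    4  1     [d, e]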

9863 def unstack(self, level: IndexLabel = -1, fill_value=None, sort: bool = True): 

9864 """ 

9865 Pivot a level of the (necessarily hierarchical) index labels. 

9866 

9867 Returns a DataFrame having a new level of column labels whose inner-most level 

9868 consists of the pivoted index labels. 

9869 

9870 If the index is not a MultiIndex, the output will be a Series 

9871 (the analogue of stack when the columns are not a MultiIndex). 

9872 

9873 Parameters 

9874 ---------- 

9875 level : int, str, or list of these, default -1 (last level) 

9876 Level(s) of index to unstack, can pass level name. 

9877 fill_value : int, str or dict 

9878 Replace NaN with this value if the unstack produces missing values. 

9879 sort : bool, default True 

9880 Sort the level(s) in the resulting MultiIndex columns. 

9881 

9882 Returns 

9883 ------- 

9884 Series or DataFrame 

9885 

9886 See Also 

9887 -------- 

9888 DataFrame.pivot : Pivot a table based on column values. 

9889 DataFrame.stack : Pivot a level of the column labels (inverse operation 

9890 from `unstack`). 

9891 

9892 Notes 

9893 ----- 

9894 Reference :ref:`the user guide <reshaping.stacking>` for more examples. 

9895 

9896 Examples 

9897 -------- 

9898 >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), 

9899 ... ('two', 'a'), ('two', 'b')]) 

9900 >>> s = pd.Series(np.arange(1.0, 5.0), index=index) 

9901 >>> s 

9902 one a 1.0 

9903 b 2.0 

9904 two a 3.0 

9905 b 4.0 

9906 dtype: float64 

9907 

9908 >>> s.unstack(level=-1) 

9909 a b 

9910 one 1.0 2.0 

9911 two 3.0 4.0 

9912 

9913 >>> s.unstack(level=0) 

9914 one two 

9915 a 1.0 3.0 

9916 b 2.0 4.0 

9917 

9918 >>> df = s.unstack(level=0) 

9919 >>> df.unstack() 

9920 one a 1.0 

9921 b 2.0 

9922 two a 3.0 

9923 b 4.0 

9924 dtype: float64 

9925 """ 

9926 from pandas.core.reshape.reshape import unstack 

9927 

9928 result = unstack(self, level, fill_value, sort) 

9929 

9930 return result.__finalize__(self, method="unstack") 

9931 
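``fill_value`` (documented above but not shown in the examples) replaces the NaNs that unstacking would otherwise introduce. A small sketch in the same style as the docstring examples:

>>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), ('two', 'a')])
>>> s = pd.Series([1.0, 2.0, 3.0], index=index)
>>> s.unstack(fill_value=0)
       a    b
one  1.0  2.0
two  3.0  0.0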

9932 @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"}) 

9933 def melt( 

9934 self, 

9935 id_vars=None, 

9936 value_vars=None, 

9937 var_name=None, 

9938 value_name: Hashable = "value", 

9939 col_level: Level | None = None, 

9940 ignore_index: bool = True, 

9941 ) -> DataFrame: 

9942 return melt( 

9943 self, 

9944 id_vars=id_vars, 

9945 value_vars=value_vars, 

9946 var_name=var_name, 

9947 value_name=value_name, 

9948 col_level=col_level, 

9949 ignore_index=ignore_index, 

9950 ).__finalize__(self, method="melt") 

9951 

9952 # ---------------------------------------------------------------------- 

9953 # Time series-related 

9954 

9955 @doc( 

9956 Series.diff, 

9957 klass="DataFrame", 

9958 extra_params="axis : {0 or 'index', 1 or 'columns'}, default 0\n " 

9959 "Take difference over rows (0) or columns (1).\n", 

9960 other_klass="Series", 

9961 examples=dedent( 

9962 """ 

9963 Difference with previous row 

9964 

9965 >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6], 

9966 ... 'b': [1, 1, 2, 3, 5, 8], 

9967 ... 'c': [1, 4, 9, 16, 25, 36]}) 

9968 >>> df 

9969 a b c 

9970 0 1 1 1 

9971 1 2 1 4 

9972 2 3 2 9 

9973 3 4 3 16 

9974 4 5 5 25 

9975 5 6 8 36 

9976 

9977 >>> df.diff() 

9978 a b c 

9979 0 NaN NaN NaN 

9980 1 1.0 0.0 3.0 

9981 2 1.0 1.0 5.0 

9982 3 1.0 1.0 7.0 

9983 4 1.0 2.0 9.0 

9984 5 1.0 3.0 11.0 

9985 

9986 Difference with previous column 

9987 

9988 >>> df.diff(axis=1) 

9989 a b c 

9990 0 NaN 0 0 

9991 1 NaN -1 3 

9992 2 NaN -1 7 

9993 3 NaN -1 13 

9994 4 NaN 0 20 

9995 5 NaN 2 28 

9996 

9997 Difference with 3rd previous row 

9998 

9999 >>> df.diff(periods=3) 

10000 a b c 

10001 0 NaN NaN NaN 

10002 1 NaN NaN NaN 

10003 2 NaN NaN NaN 

10004 3 3.0 2.0 15.0 

10005 4 3.0 4.0 21.0 

10006 5 3.0 6.0 27.0 

10007 

10008 Difference with following row 

10009 

10010 >>> df.diff(periods=-1) 

10011 a b c 

10012 0 -1.0 0.0 -3.0 

10013 1 -1.0 -1.0 -5.0 

10014 2 -1.0 -1.0 -7.0 

10015 3 -1.0 -2.0 -9.0 

10016 4 -1.0 -3.0 -11.0 

10017 5 NaN NaN NaN 

10018 

10019 Overflow in input dtype 

10020 

10021 >>> df = pd.DataFrame({'a': [1, 0]}, dtype=np.uint8) 

10022 >>> df.diff() 

10023 a 

10024 0 NaN 

10025 1 255.0""" 

10026 ), 

10027 ) 

10028 def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame: 

10029 if not lib.is_integer(periods): 

10030 if not (is_float(periods) and periods.is_integer()): 

10031 raise ValueError("periods must be an integer") 

10032 periods = int(periods) 

10033 

10034 axis = self._get_axis_number(axis) 

10035 if axis == 1: 

10036 if periods != 0: 

10037 # in the periods == 0 case, this is equivalent diff of 0 periods 

10038 # along axis=0, and the Manager method may be somewhat more 

10039 # performant, so we dispatch in that case. 

10040 return self - self.shift(periods, axis=axis) 

10041 # With periods=0 this is equivalent to a diff with axis=0 

10042 axis = 0 

10043 

10044 new_data = self._mgr.diff(n=periods) 

10045 res_df = self._constructor_from_mgr(new_data, axes=new_data.axes) 

10046 return res_df.__finalize__(self, "diff") 

10047 
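The ``axis=1`` branch above literally dispatches to a shift-and-subtract, so the two spellings agree. A quick equivalence check (illustrative):

>>> df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
>>> df.diff(axis=1).equals(df - df.shift(1, axis=1))
True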

10048 # ---------------------------------------------------------------------- 

10049 # Function application 

10050 

10051 def _gotitem( 

10052 self, 

10053 key: IndexLabel, 

10054 ndim: int, 

10055 subset: DataFrame | Series | None = None, 

10056 ) -> DataFrame | Series: 

10057 """ 

10058 Sub-classes to define. Return a sliced object. 

10059 

10060 Parameters 

10061 ---------- 

10062 key : string / list of selections 

10063 ndim : {1, 2} 

10064 requested ndim of result 

10065 subset : object, default None 

10066 subset to act on 

10067 """ 

10068 if subset is None: 

10069 subset = self 

10070 elif subset.ndim == 1: # is Series 

10071 return subset 

10072 

10073 # TODO: _shallow_copy(subset)? 

10074 return subset[key] 

10075 

10076 _agg_see_also_doc = dedent( 

10077 """ 

10078 See Also 

10079 -------- 

10080 DataFrame.apply : Perform any type of operations. 

10081 DataFrame.transform : Perform transformation type operations. 

10082 pandas.DataFrame.groupby : Perform operations over groups. 

10083 pandas.DataFrame.resample : Perform operations over resampled bins. 

10084 pandas.DataFrame.rolling : Perform operations over rolling window. 

10085 pandas.DataFrame.expanding : Perform operations over expanding window. 

10086 pandas.core.window.ewm.ExponentialMovingWindow : Perform operation over exponential 

10087 weighted window. 

10088 """ 

10089 ) 

10090 

10091 _agg_examples_doc = dedent( 

10092 """ 

10093 Examples 

10094 -------- 

10095 >>> df = pd.DataFrame([[1, 2, 3], 

10096 ... [4, 5, 6], 

10097 ... [7, 8, 9], 

10098 ... [np.nan, np.nan, np.nan]], 

10099 ... columns=['A', 'B', 'C']) 

10100 

10101 Aggregate these functions over the rows. 

10102 

10103 >>> df.agg(['sum', 'min']) 

10104 A B C 

10105 sum 12.0 15.0 18.0 

10106 min 1.0 2.0 3.0 

10107 

10108 Different aggregations per column. 

10109 

10110 >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']}) 

10111 A B 

10112 sum 12.0 NaN 

10113 min 1.0 2.0 

10114 max NaN 8.0 

10115 

10116 Aggregate different functions over the columns and rename the index of the resulting 

10117 DataFrame. 

10118 

10119 >>> df.agg(x=('A', 'max'), y=('B', 'min'), z=('C', 'mean')) 

10120 A B C 

10121 x 7.0 NaN NaN 

10122 y NaN 2.0 NaN 

10123 z NaN NaN 6.0 

10124 

10125 Aggregate over the columns. 

10126 

10127 >>> df.agg("mean", axis="columns") 

10128 0 2.0 

10129 1 5.0 

10130 2 8.0 

10131 3 NaN 

10132 dtype: float64 

10133 """ 

10134 ) 

10135 

10136 @doc( 

10137 _shared_docs["aggregate"], 

10138 klass=_shared_doc_kwargs["klass"], 

10139 axis=_shared_doc_kwargs["axis"], 

10140 see_also=_agg_see_also_doc, 

10141 examples=_agg_examples_doc, 

10142 ) 

10143 def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs): 

10144 from pandas.core.apply import frame_apply 

10145 

10146 axis = self._get_axis_number(axis) 

10147 

10148 op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs) 

10149 result = op.agg() 

10150 result = reconstruct_and_relabel_result(result, func, **kwargs) 

10151 return result 

10152 

10153 agg = aggregate 

10154 

10155 @doc( 

10156 _shared_docs["transform"], 

10157 klass=_shared_doc_kwargs["klass"], 

10158 axis=_shared_doc_kwargs["axis"], 

10159 ) 

10160 def transform( 

10161 self, func: AggFuncType, axis: Axis = 0, *args, **kwargs 

10162 ) -> DataFrame: 

10163 from pandas.core.apply import frame_apply 

10164 

10165 op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs) 

10166 result = op.transform() 

10167 assert isinstance(result, DataFrame) 

10168 return result 

10169 

10170 def apply( 

10171 self, 

10172 func: AggFuncType, 

10173 axis: Axis = 0, 

10174 raw: bool = False, 

10175 result_type: Literal["expand", "reduce", "broadcast"] | None = None, 

10176 args=(), 

10177 by_row: Literal[False, "compat"] = "compat", 

10178 engine: Literal["python", "numba"] = "python", 

10179 engine_kwargs: dict[str, bool] | None = None, 

10180 **kwargs, 

10181 ): 

10182 """ 

10183 Apply a function along an axis of the DataFrame. 

10184 

10185 Objects passed to the function are Series objects whose index is 

10186 either the DataFrame's index (``axis=0``) or the DataFrame's columns 

10187 (``axis=1``). By default (``result_type=None``), the final return type 

10188 is inferred from the return type of the applied function. Otherwise, 

10189 it depends on the `result_type` argument. 

10190 

10191 Parameters 

10192 ---------- 

10193 func : function 

10194 Function to apply to each column or row. 

10195 axis : {0 or 'index', 1 or 'columns'}, default 0 

10196 Axis along which the function is applied: 

10197 

10198 * 0 or 'index': apply function to each column. 

10199 * 1 or 'columns': apply function to each row. 

10200 

10201 raw : bool, default False 

10202 Determines if row or column is passed as a Series or ndarray object: 

10203 

10204 * ``False`` : passes each row or column as a Series to the 

10205 function. 

10206 * ``True`` : the passed function will receive ndarray objects 

10207 instead. 

10208 If you are just applying a NumPy reduction function this will 

10209 achieve much better performance. 

10210 

10211 result_type : {'expand', 'reduce', 'broadcast', None}, default None 

10212 These only act when ``axis=1`` (columns): 

10213 

10214 * 'expand' : list-like results will be turned into columns. 

10215 * 'reduce' : returns a Series if possible rather than expanding 

10216 list-like results. This is the opposite of 'expand'. 

10217 * 'broadcast' : results will be broadcast to the original shape 

10218 of the DataFrame, the original index and columns will be 

10219 retained. 

10220 

10221 The default behaviour (None) depends on the return value of the 

10222 applied function: list-like results will be returned as a Series 

10223 of those. However if the apply function returns a Series these 

10224 are expanded to columns. 

10225 args : tuple 

10226 Positional arguments to pass to `func` in addition to the 

10227 array/series. 

10228 by_row : False or "compat", default "compat" 

10229 Only has an effect when ``func`` is a listlike or dictlike of funcs 

10230 and the func isn't a string. 

10231 If "compat", will first translate the func into pandas 

10232 methods if possible (e.g. ``Series().apply(np.sum)`` will be translated to 

10233 ``Series().sum()``). If that doesn't work, will try to call apply again with 

10234 ``by_row=True`` and, if that fails, will call apply again with 

10235 ``by_row=False`` (backward compatible). 

10236 If False, the funcs will be passed the whole Series at once. 

10237 

10238 .. versionadded:: 2.1.0 

10239 

10240 engine : {'python', 'numba'}, default 'python' 

10241 Choose between the python (default) engine or the numba engine in apply. 

10242 

10243 The numba engine will attempt to JIT compile the passed function, 

10244 which may result in speedups for large DataFrames. 

10245 It also supports the following engine_kwargs : 

10246 

10247 - nopython (compile the function in nopython mode) 

10248 - nogil (release the GIL inside the JIT compiled function) 

10249 - parallel (try to apply the function in parallel over the DataFrame) 

10250 

10251 Note: Due to limitations within numba/how pandas interfaces with numba, 

10252 you should only use this if raw=True 

10253 

10254 Note: The numba compiler only supports a subset of 

10255 valid Python/numpy operations. 

10256 

10257 Please read more about the `supported python features 

10258 <https://numba.pydata.org/numba-doc/dev/reference/pysupported.html>`_ 

10259 and `supported numpy features 

10260 <https://numba.pydata.org/numba-doc/dev/reference/numpysupported.html>`_ 

10261 in numba to learn what you can or cannot use in the passed function. 

10262 

10263 .. versionadded:: 2.2.0 

10264 

10265 engine_kwargs : dict 

10266 Pass keyword arguments to the engine. 

10267 This is currently only used by the numba engine, 

10268 see the documentation for the engine argument for more information. 

10269 **kwargs 

10270 Additional keyword arguments to pass as keywords arguments to 

10271 `func`. 

10272 

10273 Returns 

10274 ------- 

10275 Series or DataFrame 

10276 Result of applying ``func`` along the given axis of the 

10277 DataFrame. 

10278 

10279 See Also 

10280 -------- 

10281 DataFrame.map: For elementwise operations. 

10282 DataFrame.aggregate: Only perform aggregating type operations. 

10283 DataFrame.transform: Only perform transforming type operations. 

10284 

10285 Notes 

10286 ----- 

10287 Functions that mutate the passed object can produce unexpected 

10288 behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` 

10289 for more details. 

10290 

10291 Examples 

10292 -------- 

10293 >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B']) 

10294 >>> df 

10295 A B 

10296 0 4 9 

10297 1 4 9 

10298 2 4 9 

10299 

10300 Using a numpy universal function (in this case the same as 

10301 ``np.sqrt(df)``): 

10302 

10303 >>> df.apply(np.sqrt) 

10304 A B 

10305 0 2.0 3.0 

10306 1 2.0 3.0 

10307 2 2.0 3.0 

10308 

10309 Using a reducing function on either axis 

10310 

10311 >>> df.apply(np.sum, axis=0) 

10312 A 12 

10313 B 27 

10314 dtype: int64 

10315 

10316 >>> df.apply(np.sum, axis=1) 

10317 0 13 

10318 1 13 

10319 2 13 

10320 dtype: int64 

10321 

10322 Returning a list-like will result in a Series 

10323 

10324 >>> df.apply(lambda x: [1, 2], axis=1) 

10325 0 [1, 2] 

10326 1 [1, 2] 

10327 2 [1, 2] 

10328 dtype: object 

10329 

10330 Passing ``result_type='expand'`` will expand list-like results 

10331 to columns of a Dataframe 

10332 

10333 >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand') 

10334 0 1 

10335 0 1 2 

10336 1 1 2 

10337 2 1 2 

10338 

10339 Returning a Series inside the function is similar to passing 

10340 ``result_type='expand'``. The resulting column names 

10341 will be the Series index. 

10342 

10343 >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1) 

10344 foo bar 

10345 0 1 2 

10346 1 1 2 

10347 2 1 2 

10348 

10349 Passing ``result_type='broadcast'`` will ensure the same shape 

10350 result, whether list-like or scalar is returned by the function, 

10351 and broadcast it along the axis. The resulting column names will 

10352 be the originals. 

10353 

10354 >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast') 

10355 A B 

10356 0 1 2 

10357 1 1 2 

10358 2 1 2 

10359 """ 

10360 from pandas.core.apply import frame_apply 

10361 

10362 op = frame_apply( 

10363 self, 

10364 func=func, 

10365 axis=axis, 

10366 raw=raw, 

10367 result_type=result_type, 

10368 by_row=by_row, 

10369 engine=engine, 

10370 engine_kwargs=engine_kwargs, 

10371 args=args, 

10372 kwargs=kwargs, 

10373 ) 

10374 return op.apply().__finalize__(self, method="apply") 

10375 
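With ``raw=True`` each column (or row) reaches the function as a plain ndarray rather than a Series, which is what the parameter description above recommends for simple NumPy reductions and for the numba engine. A small sketch reusing the docstring frame:

>>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
>>> df.apply(lambda arr: type(arr).__name__, raw=True)
A    ndarray
B    ndarray
dtype: object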

10376 def map( 

10377 self, func: PythonFuncType, na_action: str | None = None, **kwargs 

10378 ) -> DataFrame: 

10379 """ 

10380 Apply a function to a Dataframe elementwise. 

10381 

10382 .. versionadded:: 2.1.0 

10383 

10384 DataFrame.applymap was deprecated and renamed to DataFrame.map. 

10385 

10386 This method applies a function that accepts and returns a scalar 

10387 to every element of a DataFrame. 

10388 

10389 Parameters 

10390 ---------- 

10391 func : callable 

10392 Python function, returns a single value from a single value. 

10393 na_action : {None, 'ignore'}, default None 

10394 If 'ignore', propagate NaN values, without passing them to func. 

10395 **kwargs 

10396 Additional keyword arguments to pass as keywords arguments to 

10397 `func`. 

10398 

10399 Returns 

10400 ------- 

10401 DataFrame 

10402 Transformed DataFrame. 

10403 

10404 See Also 

10405 -------- 

10406 DataFrame.apply : Apply a function along input axis of DataFrame. 

10407 DataFrame.replace: Replace values given in `to_replace` with `value`. 

10408 Series.map : Apply a function elementwise on a Series. 

10409 

10410 Examples 

10411 -------- 

10412 >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]]) 

10413 >>> df 

10414 0 1 

10415 0 1.000 2.120 

10416 1 3.356 4.567 

10417 

10418 >>> df.map(lambda x: len(str(x))) 

10419 0 1 

10420 0 3 4 

10421 1 5 5 

10422 

10423 Like Series.map, NA values can be ignored: 

10424 

10425 >>> df_copy = df.copy() 

10426 >>> df_copy.iloc[0, 0] = pd.NA 

10427 >>> df_copy.map(lambda x: len(str(x)), na_action='ignore') 

10428 0 1 

10429 0 NaN 4 

10430 1 5.0 5 

10431 

10432 It is also possible to use `map` with functions that are not 

10433 `lambda` functions: 

10434 

10435 >>> df.map(round, ndigits=1) 

10436 0 1 

10437 0 1.0 2.1 

10438 1 3.4 4.6 

10439 

10440 Note that a vectorized version of `func` often exists, which will 

10441 be much faster. You could square each number elementwise. 

10442 

10443 >>> df.map(lambda x: x**2) 

10444 0 1 

10445 0 1.000000 4.494400 

10446 1 11.262736 20.857489 

10447 

10448 But it's better to avoid map in that case. 

10449 

10450 >>> df ** 2 

10451 0 1 

10452 0 1.000000 4.494400 

10453 1 11.262736 20.857489 

10454 """ 

10455 if na_action not in {"ignore", None}: 

10456 raise ValueError( 

10457 f"na_action must be 'ignore' or None. Got {repr(na_action)}" 

10458 ) 

10459 

10460 if self.empty: 

10461 return self.copy() 

10462 

10463 func = functools.partial(func, **kwargs) 

10464 

10465 def infer(x): 

10466 return x._map_values(func, na_action=na_action) 

10467 

10468 return self.apply(infer).__finalize__(self, "map") 

10469 
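A minimal illustrative sketch (not part of the pandas source), assuming the body above: ``func`` is bound with ``functools.partial`` and mapped column by column, so ``DataFrame.map`` behaves like a per-column ``Series.map``. The frame below is illustrative.

>>> import pandas as pd
>>> df = pd.DataFrame({"x": [1.234, None], "y": [2.5, 3.75]})
>>> df.map(round, ndigits=1, na_action="ignore")          # NaN is left untouched
>>> df.apply(lambda col: col.map(lambda v: round(v, 1), na_action="ignore"))  # effectively the same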

10470 def applymap( 

10471 self, func: PythonFuncType, na_action: NaAction | None = None, **kwargs 

10472 ) -> DataFrame: 

10473 """ 

10474 Apply a function to a DataFrame elementwise. 

10475 

10476 .. deprecated:: 2.1.0 

10477 

10478 DataFrame.applymap has been deprecated. Use DataFrame.map instead. 

10479 

10480 This method applies a function that accepts and returns a scalar 

10481 to every element of a DataFrame. 

10482 

10483 Parameters 

10484 ---------- 

10485 func : callable 

10486 Python function, returns a single value from a single value. 

10487 na_action : {None, 'ignore'}, default None 

10488 If 'ignore', propagate NaN values, without passing them to func. 

10489 **kwargs 

10490 Additional keyword arguments to pass to 

10491 `func`. 

10492 

10493 Returns 

10494 ------- 

10495 DataFrame 

10496 Transformed DataFrame. 

10497 

10498 See Also 

10499 -------- 

10500 DataFrame.apply : Apply a function along input axis of DataFrame. 

10501 DataFrame.map : Apply a function to a DataFrame elementwise. 

10502 DataFrame.replace: Replace values given in `to_replace` with `value`. 

10503 

10504 Examples 

10505 -------- 

10506 >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]]) 

10507 >>> df 

10508 0 1 

10509 0 1.000 2.120 

10510 1 3.356 4.567 

10511 

10512 >>> df.map(lambda x: len(str(x))) 

10513 0 1 

10514 0 3 4 

10515 1 5 5 

10516 """ 

10517 warnings.warn( 

10518 "DataFrame.applymap has been deprecated. Use DataFrame.map instead.", 

10519 FutureWarning, 

10520 stacklevel=find_stack_level(), 

10521 ) 

10522 return self.map(func, na_action=na_action, **kwargs) 

10523 

10524 # ---------------------------------------------------------------------- 

10525 # Merging / joining methods 

10526 

10527 def _append( 

10528 self, 

10529 other, 

10530 ignore_index: bool = False, 

10531 verify_integrity: bool = False, 

10532 sort: bool = False, 

10533 ) -> DataFrame: 

10534 if isinstance(other, (Series, dict)): 

10535 if isinstance(other, dict): 

10536 if not ignore_index: 

10537 raise TypeError("Can only append a dict if ignore_index=True") 

10538 other = Series(other) 

10539 if other.name is None and not ignore_index: 

10540 raise TypeError( 

10541 "Can only append a Series if ignore_index=True " 

10542 "or if the Series has a name" 

10543 ) 

10544 

10545 index = Index( 

10546 [other.name], 

10547 name=self.index.names 

10548 if isinstance(self.index, MultiIndex) 

10549 else self.index.name, 

10550 ) 

10551 row_df = other.to_frame().T 

10552 # infer_objects is needed for 

10553 # test_append_empty_frame_to_series_with_dateutil_tz 

10554 other = row_df.infer_objects(copy=False).rename_axis( 

10555 index.names, copy=False 

10556 ) 

10557 elif isinstance(other, list): 

10558 if not other: 

10559 pass 

10560 elif not isinstance(other[0], DataFrame): 

10561 other = DataFrame(other) 

10562 if self.index.name is not None and not ignore_index: 

10563 other.index.name = self.index.name 

10564 

10565 from pandas.core.reshape.concat import concat 

10566 

10567 if isinstance(other, (list, tuple)): 

10568 to_concat = [self, *other] 

10569 else: 

10570 to_concat = [self, other] 

10571 

10572 result = concat( 

10573 to_concat, 

10574 ignore_index=ignore_index, 

10575 verify_integrity=verify_integrity, 

10576 sort=sort, 

10577 ) 

10578 return result.__finalize__(self, method="append") 

10579 
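A rough sketch (not part of the pandas source) of what the named-``Series`` branch of the private ``_append`` above reduces to in public terms; the objects are illustrative only.

>>> import pandas as pd
>>> df = pd.DataFrame({"A": [1], "B": [2]})
>>> row = pd.Series({"A": 3, "B": 4}, name=1)
>>> pd.concat([df, row.to_frame().T])        # one-row frame appended below ``df``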

10580 def join( 

10581 self, 

10582 other: DataFrame | Series | Iterable[DataFrame | Series], 

10583 on: IndexLabel | None = None, 

10584 how: MergeHow = "left", 

10585 lsuffix: str = "", 

10586 rsuffix: str = "", 

10587 sort: bool = False, 

10588 validate: JoinValidate | None = None, 

10589 ) -> DataFrame: 

10590 """ 

10591 Join columns of another DataFrame. 

10592 

10593 Join columns with `other` DataFrame either on index or on a key 

10594 column. Efficiently join multiple DataFrame objects by index at once by 

10595 passing a list. 

10596 

10597 Parameters 

10598 ---------- 

10599 other : DataFrame, Series, or a list containing any combination of them 

10600 Index should be similar to one of the columns in this one. If a 

10601 Series is passed, its name attribute must be set, and that will be 

10602 used as the column name in the resulting joined DataFrame. 

10603 on : str, list of str, or array-like, optional 

10604 Column or index level name(s) in the caller to join on the index 

10605 in `other`, otherwise joins index-on-index. If multiple 

10606 values given, the `other` DataFrame must have a MultiIndex. Can 

10607 pass an array as the join key if it is not already contained in 

10608 the calling DataFrame. Like an Excel VLOOKUP operation. 

10609 how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left' 

10610 How to handle the operation of the two objects. 

10611 

10612 * left: use calling frame's index (or column if on is specified) 

10613 * right: use `other`'s index. 

10614 * outer: form union of calling frame's index (or column if on is 

10615 specified) with `other`'s index, and sort it lexicographically. 

10616 * inner: form intersection of calling frame's index (or column if 

10617 on is specified) with `other`'s index, preserving the order 

10618 of the calling frame's index. 

10619 * cross: creates the Cartesian product from both frames, preserving the order 

10620 of the left keys. 

10621 lsuffix : str, default '' 

10622 Suffix to use from left frame's overlapping columns. 

10623 rsuffix : str, default '' 

10624 Suffix to use from right frame's overlapping columns. 

10625 sort : bool, default False 

10626 Order result DataFrame lexicographically by the join key. If False, 

10627 the order of the join key depends on the join type (how keyword). 

10628 validate : str, optional 

10629 If specified, checks if join is of specified type. 

10630 

10631 * "one_to_one" or "1:1": check if join keys are unique in both left 

10632 and right datasets. 

10633 * "one_to_many" or "1:m": check if join keys are unique in left dataset. 

10634 * "many_to_one" or "m:1": check if join keys are unique in right dataset. 

10635 * "many_to_many" or "m:m": allowed, but does not result in checks. 

10636 

10637 .. versionadded:: 1.5.0 

10638 

10639 Returns 

10640 ------- 

10641 DataFrame 

10642 A dataframe containing columns from both the caller and `other`. 

10643 

10644 See Also 

10645 -------- 

10646 DataFrame.merge : For column(s)-on-column(s) operations. 

10647 

10648 Notes 

10649 ----- 

10650 Parameters `on`, `lsuffix`, and `rsuffix` are not supported when 

10651 passing a list of `DataFrame` objects. 

10652 

10653 Examples 

10654 -------- 

10655 >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], 

10656 ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) 

10657 

10658 >>> df 

10659 key A 

10660 0 K0 A0 

10661 1 K1 A1 

10662 2 K2 A2 

10663 3 K3 A3 

10664 4 K4 A4 

10665 5 K5 A5 

10666 

10667 >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'], 

10668 ... 'B': ['B0', 'B1', 'B2']}) 

10669 

10670 >>> other 

10671 key B 

10672 0 K0 B0 

10673 1 K1 B1 

10674 2 K2 B2 

10675 

10676 Join DataFrames using their indexes. 

10677 

10678 >>> df.join(other, lsuffix='_caller', rsuffix='_other') 

10679 key_caller A key_other B 

10680 0 K0 A0 K0 B0 

10681 1 K1 A1 K1 B1 

10682 2 K2 A2 K2 B2 

10683 3 K3 A3 NaN NaN 

10684 4 K4 A4 NaN NaN 

10685 5 K5 A5 NaN NaN 

10686 

10687 If we want to join using the key columns, we need to set key to be 

10688 the index in both `df` and `other`. The joined DataFrame will have 

10689 key as its index. 

10690 

10691 >>> df.set_index('key').join(other.set_index('key')) 

10692 A B 

10693 key 

10694 K0 A0 B0 

10695 K1 A1 B1 

10696 K2 A2 B2 

10697 K3 A3 NaN 

10698 K4 A4 NaN 

10699 K5 A5 NaN 

10700 

10701 Another option to join using the key columns is to use the `on` 

10702 parameter. DataFrame.join always uses `other`'s index but we can use 

10703 any column in `df`. This method preserves the original DataFrame's 

10704 index in the result. 

10705 

10706 >>> df.join(other.set_index('key'), on='key') 

10707 key A B 

10708 0 K0 A0 B0 

10709 1 K1 A1 B1 

10710 2 K2 A2 B2 

10711 3 K3 A3 NaN 

10712 4 K4 A4 NaN 

10713 5 K5 A5 NaN 

10714 

10715 Using non-unique key values shows how they are matched. 

10716 

10717 >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'], 

10718 ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) 

10719 

10720 >>> df 

10721 key A 

10722 0 K0 A0 

10723 1 K1 A1 

10724 2 K1 A2 

10725 3 K3 A3 

10726 4 K0 A4 

10727 5 K1 A5 

10728 

10729 >>> df.join(other.set_index('key'), on='key', validate='m:1') 

10730 key A B 

10731 0 K0 A0 B0 

10732 1 K1 A1 B1 

10733 2 K1 A2 B1 

10734 3 K3 A3 NaN 

10735 4 K0 A4 B0 

10736 5 K1 A5 B1 

10737 """ 

10738 from pandas.core.reshape.concat import concat 

10739 from pandas.core.reshape.merge import merge 

10740 

10741 if isinstance(other, Series): 

10742 if other.name is None: 

10743 raise ValueError("Other Series must have a name") 

10744 other = DataFrame({other.name: other}) 

10745 

10746 if isinstance(other, DataFrame): 

10747 if how == "cross": 

10748 return merge( 

10749 self, 

10750 other, 

10751 how=how, 

10752 on=on, 

10753 suffixes=(lsuffix, rsuffix), 

10754 sort=sort, 

10755 validate=validate, 

10756 ) 

10757 return merge( 

10758 self, 

10759 other, 

10760 left_on=on, 

10761 how=how, 

10762 left_index=on is None, 

10763 right_index=True, 

10764 suffixes=(lsuffix, rsuffix), 

10765 sort=sort, 

10766 validate=validate, 

10767 ) 

10768 else: 

10769 if on is not None: 

10770 raise ValueError( 

10771 "Joining multiple DataFrames only supported for joining on index" 

10772 ) 

10773 

10774 if rsuffix or lsuffix: 

10775 raise ValueError( 

10776 "Suffixes not supported when joining multiple DataFrames" 

10777 ) 

10778 

10779 # Mypy thinks the RHS is a 

10780 # "Union[DataFrame, Series, Iterable[Union[DataFrame, Series]]]" whereas 

10781 # the LHS is an "Iterable[DataFrame]", but in reality both types are 

10782 # "Iterable[Union[DataFrame, Series]]" due to the if statements 

10783 frames = [cast("DataFrame | Series", self)] + list(other) 

10784 

10785 can_concat = all(df.index.is_unique for df in frames) 

10786 

10787 # join indexes only using concat 

10788 if can_concat: 

10789 if how == "left": 

10790 res = concat( 

10791 frames, axis=1, join="outer", verify_integrity=True, sort=sort 

10792 ) 

10793 return res.reindex(self.index, copy=False) 

10794 else: 

10795 return concat( 

10796 frames, axis=1, join=how, verify_integrity=True, sort=sort 

10797 ) 

10798 

10799 joined = frames[0] 

10800 

10801 for frame in frames[1:]: 

10802 joined = merge( 

10803 joined, 

10804 frame, 

10805 how=how, 

10806 left_index=True, 

10807 right_index=True, 

10808 validate=validate, 

10809 ) 

10810 

10811 return joined 

10812 
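A minimal illustrative sketch (not part of the pandas source) of the list branch above: several frames joined index-on-index (``on`` and suffixes are rejected there), taking the ``concat`` fast path when all indexes are unique. The frames are my own.

>>> import pandas as pd
>>> left = pd.DataFrame({"A": [1, 2]}, index=["a", "b"])
>>> mid = pd.DataFrame({"B": [3, 4]}, index=["a", "b"])
>>> right = pd.DataFrame({"C": [5, 6]}, index=["a", "c"])
>>> left.join([mid, right])                  # default how='left': result reindexed to left.index
>>> left.join([mid, right], how="inner")     # intersection of all three indexes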

10813 @Substitution("") 

10814 @Appender(_merge_doc, indents=2) 

10815 def merge( 

10816 self, 

10817 right: DataFrame | Series, 

10818 how: MergeHow = "inner", 

10819 on: IndexLabel | AnyArrayLike | None = None, 

10820 left_on: IndexLabel | AnyArrayLike | None = None, 

10821 right_on: IndexLabel | AnyArrayLike | None = None, 

10822 left_index: bool = False, 

10823 right_index: bool = False, 

10824 sort: bool = False, 

10825 suffixes: Suffixes = ("_x", "_y"), 

10826 copy: bool | None = None, 

10827 indicator: str | bool = False, 

10828 validate: MergeValidate | None = None, 

10829 ) -> DataFrame: 

10830 from pandas.core.reshape.merge import merge 

10831 

10832 return merge( 

10833 self, 

10834 right, 

10835 how=how, 

10836 on=on, 

10837 left_on=left_on, 

10838 right_on=right_on, 

10839 left_index=left_index, 

10840 right_index=right_index, 

10841 sort=sort, 

10842 suffixes=suffixes, 

10843 copy=copy, 

10844 indicator=indicator, 

10845 validate=validate, 

10846 ) 

10847 
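A minimal illustrative sketch (not part of the pandas source): ``DataFrame.merge`` simply forwards every argument to the top-level merge routine imported above. The frames and keys are illustrative.

>>> import pandas as pd
>>> left = pd.DataFrame({"key": ["a", "b"], "x": [1, 2]})
>>> right = pd.DataFrame({"key": ["a", "c"], "y": [3, 4]})
>>> left.merge(right, on="key", how="left", indicator=True, validate="1:1")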

10848 def round( 

10849 self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs 

10850 ) -> DataFrame: 

10851 """ 

10852 Round a DataFrame to a variable number of decimal places. 

10853 

10854 Parameters 

10855 ---------- 

10856 decimals : int, dict, Series 

10857 Number of decimal places to round each column to. If an int is 

10858 given, round each column to the same number of places. 

10859 Otherwise dict and Series round to variable numbers of places. 

10860 Column names should be in the keys if `decimals` is a 

10861 dict-like, or in the index if `decimals` is a Series. Any 

10862 columns not included in `decimals` will be left as is. Elements 

10863 of `decimals` which are not columns of the input will be 

10864 ignored. 

10865 *args 

10866 Additional positional arguments have no effect but might be accepted for 

10867 compatibility with numpy. 

10868 **kwargs 

10869 Additional keywords have no effect but might be accepted for 

10870 compatibility with numpy. 

10871 

10872 Returns 

10873 ------- 

10874 DataFrame 

10875 A DataFrame with the affected columns rounded to the specified 

10876 number of decimal places. 

10877 

10878 See Also 

10879 -------- 

10880 numpy.around : Round a numpy array to the given number of decimals. 

10881 Series.round : Round a Series to the given number of decimals. 

10882 

10883 Examples 

10884 -------- 

10885 >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], 

10886 ... columns=['dogs', 'cats']) 

10887 >>> df 

10888 dogs cats 

10889 0 0.21 0.32 

10890 1 0.01 0.67 

10891 2 0.66 0.03 

10892 3 0.21 0.18 

10893 

10894 By providing an integer, each column is rounded to the same number 

10895 of decimal places 

10896 

10897 >>> df.round(1) 

10898 dogs cats 

10899 0 0.2 0.3 

10900 1 0.0 0.7 

10901 2 0.7 0.0 

10902 3 0.2 0.2 

10903 

10904 With a dict, the number of places for specific columns can be 

10905 specified with the column names as key and the number of decimal 

10906 places as value 

10907 

10908 >>> df.round({'dogs': 1, 'cats': 0}) 

10909 dogs cats 

10910 0 0.2 0.0 

10911 1 0.0 1.0 

10912 2 0.7 0.0 

10913 3 0.2 0.0 

10914 

10915 Using a Series, the number of places for specific columns can be 

10916 specified with the column names as index and the number of 

10917 decimal places as value 

10918 

10919 >>> decimals = pd.Series([0, 1], index=['cats', 'dogs']) 

10920 >>> df.round(decimals) 

10921 dogs cats 

10922 0 0.2 0.0 

10923 1 0.0 1.0 

10924 2 0.7 0.0 

10925 3 0.2 0.0 

10926 """ 

10927 from pandas.core.reshape.concat import concat 

10928 

10929 def _dict_round(df: DataFrame, decimals): 

10930 for col, vals in df.items(): 

10931 try: 

10932 yield _series_round(vals, decimals[col]) 

10933 except KeyError: 

10934 yield vals 

10935 

10936 def _series_round(ser: Series, decimals: int) -> Series: 

10937 if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype): 

10938 return ser.round(decimals) 

10939 return ser 

10940 

10941 nv.validate_round(args, kwargs) 

10942 

10943 if isinstance(decimals, (dict, Series)): 

10944 if isinstance(decimals, Series) and not decimals.index.is_unique: 

10945 raise ValueError("Index of decimals must be unique") 

10946 if is_dict_like(decimals) and not all( 

10947 is_integer(value) for _, value in decimals.items() 

10948 ): 

10949 raise TypeError("Values in decimals must be integers") 

10950 new_cols = list(_dict_round(self, decimals)) 

10951 elif is_integer(decimals): 

10952 # Dispatch to Block.round 

10953 # Argument "decimals" to "round" of "BaseBlockManager" has incompatible 

10954 # type "Union[int, integer[Any]]"; expected "int" 

10955 new_mgr = self._mgr.round( 

10956 decimals=decimals, # type: ignore[arg-type] 

10957 using_cow=using_copy_on_write(), 

10958 ) 

10959 return self._constructor_from_mgr(new_mgr, axes=new_mgr.axes).__finalize__( 

10960 self, method="round" 

10961 ) 

10962 else: 

10963 raise TypeError("decimals must be an integer, a dict-like or a Series") 

10964 

10965 if new_cols is not None and len(new_cols) > 0: 

10966 return self._constructor( 

10967 concat(new_cols, axis=1), index=self.index, columns=self.columns 

10968 ).__finalize__(self, method="round") 

10969 else: 

10970 return self.copy(deep=False) 

10971 
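A minimal illustrative sketch (not part of the pandas source) of the dispatch above: an integer goes through the block-wise rounding path, while a dict or Series rounds column by column and, per ``_series_round``, leaves non-numeric columns as-is. The frame is illustrative.

>>> import pandas as pd
>>> df = pd.DataFrame({"price": [1.234, 5.678], "label": ["a", "b"]})
>>> df.round(2)                      # int: numeric blocks rounded, strings pass through
>>> df.round({"price": 1})           # dict: only the listed column is rounded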

10972 # ---------------------------------------------------------------------- 

10973 # Statistical methods, etc. 

10974 

10975 def corr( 

10976 self, 

10977 method: CorrelationMethod = "pearson", 

10978 min_periods: int = 1, 

10979 numeric_only: bool = False, 

10980 ) -> DataFrame: 

10981 """ 

10982 Compute pairwise correlation of columns, excluding NA/null values. 

10983 

10984 Parameters 

10985 ---------- 

10986 method : {'pearson', 'kendall', 'spearman'} or callable 

10987 Method of correlation: 

10988 

10989 * pearson : standard correlation coefficient 

10990 * kendall : Kendall Tau correlation coefficient 

10991 * spearman : Spearman rank correlation 

10992 * callable: callable with input two 1d ndarrays 

10993 and returning a float. Note that the returned matrix from corr 

10994 will have 1 along the diagonals and will be symmetric 

10995 regardless of the callable's behavior. 

10996 min_periods : int, optional 

10997 Minimum number of observations required per pair of columns 

10998 to have a valid result. Currently only available for Pearson 

10999 and Spearman correlation. 

11000 numeric_only : bool, default False 

11001 Include only `float`, `int` or `boolean` data. 

11002 

11003 .. versionadded:: 1.5.0 

11004 

11005 .. versionchanged:: 2.0.0 

11006 The default value of ``numeric_only`` is now ``False``. 

11007 

11008 Returns 

11009 ------- 

11010 DataFrame 

11011 Correlation matrix. 

11012 

11013 See Also 

11014 -------- 

11015 DataFrame.corrwith : Compute pairwise correlation with another 

11016 DataFrame or Series. 

11017 Series.corr : Compute the correlation between two Series. 

11018 

11019 Notes 

11020 ----- 

11021 Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations. 

11022 

11023 * `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_ 

11024 * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_ 

11025 * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_ 

11026 

11027 Examples 

11028 -------- 

11029 >>> def histogram_intersection(a, b): 

11030 ... v = np.minimum(a, b).sum().round(decimals=1) 

11031 ... return v 

11032 >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], 

11033 ... columns=['dogs', 'cats']) 

11034 >>> df.corr(method=histogram_intersection) 

11035 dogs cats 

11036 dogs 1.0 0.3 

11037 cats 0.3 1.0 

11038 

11039 >>> df = pd.DataFrame([(1, 1), (2, np.nan), (np.nan, 3), (4, 4)], 

11040 ... columns=['dogs', 'cats']) 

11041 >>> df.corr(min_periods=3) 

11042 dogs cats 

11043 dogs 1.0 NaN 

11044 cats NaN 1.0 

11045 """ # noqa: E501 

11046 data = self._get_numeric_data() if numeric_only else self 

11047 cols = data.columns 

11048 idx = cols.copy() 

11049 mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) 

11050 

11051 if method == "pearson": 

11052 correl = libalgos.nancorr(mat, minp=min_periods) 

11053 elif method == "spearman": 

11054 correl = libalgos.nancorr_spearman(mat, minp=min_periods) 

11055 elif method == "kendall" or callable(method): 

11056 if min_periods is None: 

11057 min_periods = 1 

11058 mat = mat.T 

11059 corrf = nanops.get_corr_func(method) 

11060 K = len(cols) 

11061 correl = np.empty((K, K), dtype=float) 

11062 mask = np.isfinite(mat) 

11063 for i, ac in enumerate(mat): 

11064 for j, bc in enumerate(mat): 

11065 if i > j: 

11066 continue 

11067 

11068 valid = mask[i] & mask[j] 

11069 if valid.sum() < min_periods: 

11070 c = np.nan 

11071 elif i == j: 

11072 c = 1.0 

11073 elif not valid.all(): 

11074 c = corrf(ac[valid], bc[valid]) 

11075 else: 

11076 c = corrf(ac, bc) 

11077 correl[i, j] = c 

11078 correl[j, i] = c 

11079 else: 

11080 raise ValueError( 

11081 "method must be either 'pearson', " 

11082 "'spearman', 'kendall', or a callable, " 

11083 f"'{method}' was supplied" 

11084 ) 

11085 

11086 result = self._constructor(correl, index=idx, columns=cols, copy=False) 

11087 return result.__finalize__(self, method="corr") 

11088 

11089 def cov( 

11090 self, 

11091 min_periods: int | None = None, 

11092 ddof: int | None = 1, 

11093 numeric_only: bool = False, 

11094 ) -> DataFrame: 

11095 """ 

11096 Compute pairwise covariance of columns, excluding NA/null values. 

11097 

11098 Compute the pairwise covariance among the series of a DataFrame. 

11099 The returned data frame is the `covariance matrix 

11100 <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns 

11101 of the DataFrame. 

11102 

11103 Both NA and null values are automatically excluded from the 

11104 calculation. (See the note below about bias from missing values.) 

11105 A threshold can be set for the minimum number of 

11106 observations for each value created. Comparisons with observations 

11107 below this threshold will be returned as ``NaN``. 

11108 

11109 This method is generally used for the analysis of time series data to 

11110 understand the relationship between different measures 

11111 across time. 

11112 

11113 Parameters 

11114 ---------- 

11115 min_periods : int, optional 

11116 Minimum number of observations required per pair of columns 

11117 to have a valid result. 

11118 

11119 ddof : int, default 1 

11120 Delta degrees of freedom. The divisor used in calculations 

11121 is ``N - ddof``, where ``N`` represents the number of elements. 

11122 This argument is applicable only when no ``NaN`` is in the DataFrame. 

11123 

11124 numeric_only : bool, default False 

11125 Include only `float`, `int` or `boolean` data. 

11126 

11127 .. versionadded:: 1.5.0 

11128 

11129 .. versionchanged:: 2.0.0 

11130 The default value of ``numeric_only`` is now ``False``. 

11131 

11132 Returns 

11133 ------- 

11134 DataFrame 

11135 The covariance matrix of the series of the DataFrame. 

11136 

11137 See Also 

11138 -------- 

11139 Series.cov : Compute covariance with another Series. 

11140 core.window.ewm.ExponentialMovingWindow.cov : Exponential weighted sample 

11141 covariance. 

11142 core.window.expanding.Expanding.cov : Expanding sample covariance. 

11143 core.window.rolling.Rolling.cov : Rolling sample covariance. 

11144 

11145 Notes 

11146 ----- 

11147 Returns the covariance matrix of the DataFrame's time series. 

11148 The covariance is normalized by N-ddof. 

11149 

11150 For DataFrames that have Series that are missing data (assuming that 

11151 data is `missing at random 

11152 <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__) 

11153 the returned covariance matrix will be an unbiased estimate 

11154 of the variance and covariance between the member Series. 

11155 

11156 However, for many applications this estimate may not be acceptable 

11157 because the estimated covariance matrix is not guaranteed to be positive 

11158 semi-definite. This could lead to estimate correlations having 

11159 absolute values which are greater than one, and/or a non-invertible 

11160 covariance matrix. See `Estimation of covariance matrices 

11161 <https://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_ 

11162 matrices>`__ for more details. 

11163 

11164 Examples 

11165 -------- 

11166 >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], 

11167 ... columns=['dogs', 'cats']) 

11168 >>> df.cov() 

11169 dogs cats 

11170 dogs 0.666667 -1.000000 

11171 cats -1.000000 1.666667 

11172 

11173 >>> np.random.seed(42) 

11174 >>> df = pd.DataFrame(np.random.randn(1000, 5), 

11175 ... columns=['a', 'b', 'c', 'd', 'e']) 

11176 >>> df.cov() 

11177 a b c d e 

11178 a 0.998438 -0.020161 0.059277 -0.008943 0.014144 

11179 b -0.020161 1.059352 -0.008543 -0.024738 0.009826 

11180 c 0.059277 -0.008543 1.010670 -0.001486 -0.000271 

11181 d -0.008943 -0.024738 -0.001486 0.921297 -0.013692 

11182 e 0.014144 0.009826 -0.000271 -0.013692 0.977795 

11183 

11184 **Minimum number of periods** 

11185 

11186 This method also supports an optional ``min_periods`` keyword 

11187 that specifies the required minimum number of non-NA observations for 

11188 each column pair in order to have a valid result: 

11189 

11190 >>> np.random.seed(42) 

11191 >>> df = pd.DataFrame(np.random.randn(20, 3), 

11192 ... columns=['a', 'b', 'c']) 

11193 >>> df.loc[df.index[:5], 'a'] = np.nan 

11194 >>> df.loc[df.index[5:10], 'b'] = np.nan 

11195 >>> df.cov(min_periods=12) 

11196 a b c 

11197 a 0.316741 NaN -0.150812 

11198 b NaN 1.248003 0.191417 

11199 c -0.150812 0.191417 0.895202 

11200 """ 

11201 data = self._get_numeric_data() if numeric_only else self 

11202 cols = data.columns 

11203 idx = cols.copy() 

11204 mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) 

11205 

11206 if notna(mat).all(): 

11207 if min_periods is not None and min_periods > len(mat): 

11208 base_cov = np.empty((mat.shape[1], mat.shape[1])) 

11209 base_cov.fill(np.nan) 

11210 else: 

11211 base_cov = np.cov(mat.T, ddof=ddof) 

11212 base_cov = base_cov.reshape((len(cols), len(cols))) 

11213 else: 

11214 base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods) 

11215 

11216 result = self._constructor(base_cov, index=idx, columns=cols, copy=False) 

11217 return result.__finalize__(self, method="cov") 

11218 
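A minimal illustrative sketch (not part of the pandas source): when the frame has no missing values the code above delegates to ``np.cov`` with the given ``ddof``, so the divisor is ``N - ddof``. Values are illustrative.

>>> import numpy as np
>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [2.0, 4.0, 6.0]})
>>> df.cov(ddof=1)                                   # sample covariance, divisor N - 1
>>> np.allclose(df.cov(ddof=0), np.cov(df.to_numpy().T, ddof=0))
True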

11219 def corrwith( 

11220 self, 

11221 other: DataFrame | Series, 

11222 axis: Axis = 0, 

11223 drop: bool = False, 

11224 method: CorrelationMethod = "pearson", 

11225 numeric_only: bool = False, 

11226 ) -> Series: 

11227 """ 

11228 Compute pairwise correlation. 

11229 

11230 Pairwise correlation is computed between rows or columns of 

11231 DataFrame with rows or columns of Series or DataFrame. DataFrames 

11232 are first aligned along both axes before computing the 

11233 correlations. 

11234 

11235 Parameters 

11236 ---------- 

11237 other : DataFrame, Series 

11238 Object with which to compute correlations. 

11239 axis : {0 or 'index', 1 or 'columns'}, default 0 

11240 The axis to use. 0 or 'index' to compute row-wise, 1 or 'columns' for 

11241 column-wise. 

11242 drop : bool, default False 

11243 Drop missing indices from result. 

11244 method : {'pearson', 'kendall', 'spearman'} or callable 

11245 Method of correlation: 

11246 

11247 * pearson : standard correlation coefficient 

11248 * kendall : Kendall Tau correlation coefficient 

11249 * spearman : Spearman rank correlation 

11250 * callable: callable with input two 1d ndarrays 

11251 and returning a float. 

11252 

11253 numeric_only : bool, default False 

11254 Include only `float`, `int` or `boolean` data. 

11255 

11256 .. versionadded:: 1.5.0 

11257 

11258 .. versionchanged:: 2.0.0 

11259 The default value of ``numeric_only`` is now ``False``. 

11260 

11261 Returns 

11262 ------- 

11263 Series 

11264 Pairwise correlations. 

11265 

11266 See Also 

11267 -------- 

11268 DataFrame.corr : Compute pairwise correlation of columns. 

11269 

11270 Examples 

11271 -------- 

11272 >>> index = ["a", "b", "c", "d", "e"] 

11273 >>> columns = ["one", "two", "three", "four"] 

11274 >>> df1 = pd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns) 

11275 >>> df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=index[:4], columns=columns) 

11276 >>> df1.corrwith(df2) 

11277 one 1.0 

11278 two 1.0 

11279 three 1.0 

11280 four 1.0 

11281 dtype: float64 

11282 

11283 >>> df2.corrwith(df1, axis=1) 

11284 a 1.0 

11285 b 1.0 

11286 c 1.0 

11287 d 1.0 

11288 e NaN 

11289 dtype: float64 

11290 """ # noqa: E501 

11291 axis = self._get_axis_number(axis) 

11292 this = self._get_numeric_data() if numeric_only else self 

11293 

11294 if isinstance(other, Series): 

11295 return this.apply(lambda x: other.corr(x, method=method), axis=axis) 

11296 

11297 if numeric_only: 

11298 other = other._get_numeric_data() 

11299 left, right = this.align(other, join="inner", copy=False) 

11300 

11301 if axis == 1: 

11302 left = left.T 

11303 right = right.T 

11304 

11305 if method == "pearson": 

11306 # mask missing values: adding the other operand times zero gives both sides the same NaN pattern 

11307 left = left + right * 0 

11308 right = right + left * 0 

11309 

11310 # demeaned data 

11311 ldem = left - left.mean(numeric_only=numeric_only) 

11312 rdem = right - right.mean(numeric_only=numeric_only) 

11313 

11314 num = (ldem * rdem).sum() 

11315 dom = ( 

11316 (left.count() - 1) 

11317 * left.std(numeric_only=numeric_only) 

11318 * right.std(numeric_only=numeric_only) 

11319 ) 

11320 

11321 correl = num / dom 

11322 

11323 elif method in ["kendall", "spearman"] or callable(method): 

11324 

11325 def c(x): 

11326 return nanops.nancorr(x[0], x[1], method=method) 

11327 

11328 correl = self._constructor_sliced( 

11329 map(c, zip(left.values.T, right.values.T)), 

11330 index=left.columns, 

11331 copy=False, 

11332 ) 

11333 

11334 else: 

11335 raise ValueError( 

11336 f"Invalid method {method} was passed, " 

11337 "valid methods are: 'pearson', 'kendall', " 

11338 "'spearman', or callable" 

11339 ) 

11340 

11341 if not drop: 

11342 # Find non-matching labels along the given axis 

11343 # and append missing correlations (GH 22375) 

11344 raxis: AxisInt = 1 if axis == 0 else 0 

11345 result_index = this._get_axis(raxis).union(other._get_axis(raxis)) 

11346 idx_diff = result_index.difference(correl.index) 

11347 

11348 if len(idx_diff) > 0: 

11349 correl = correl._append( 

11350 Series([np.nan] * len(idx_diff), index=idx_diff) 

11351 ) 

11352 

11353 return correl 

11354 
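A minimal illustrative sketch (not part of the pandas source) of the ``drop`` handling above (GH 22375): labels present in only one of the two objects come back as ``NaN`` unless ``drop=True``. The frames are illustrative.

>>> import pandas as pd
>>> df1 = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
>>> df2 = pd.DataFrame({"a": [1, 2, 3], "b": [6, 5, 4]})
>>> df1.corrwith(df2)                # 'c' shows up with NaN
>>> df1.corrwith(df2, drop=True)     # 'c' is dropped from the result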

11355 # ---------------------------------------------------------------------- 

11356 # ndarray-like stats methods 

11357 

11358 def count(self, axis: Axis = 0, numeric_only: bool = False): 

11359 """ 

11360 Count non-NA cells for each column or row. 

11361 

11362 The values `None`, `NaN`, `NaT`, ``pandas.NA`` are considered NA. 

11363 

11364 Parameters 

11365 ---------- 

11366 axis : {0 or 'index', 1 or 'columns'}, default 0 

11367 If 0 or 'index' counts are generated for each column. 

11368 If 1 or 'columns' counts are generated for each row. 

11369 numeric_only : bool, default False 

11370 Include only `float`, `int` or `boolean` data. 

11371 

11372 Returns 

11373 ------- 

11374 Series 

11375 For each column/row the number of non-NA/null entries. 

11376 

11377 See Also 

11378 -------- 

11379 Series.count: Number of non-NA elements in a Series. 

11380 DataFrame.value_counts: Count unique combinations of columns. 

11381 DataFrame.shape: Number of DataFrame rows and columns (including NA 

11382 elements). 

11383 DataFrame.isna: Boolean same-sized DataFrame showing places of NA 

11384 elements. 

11385 

11386 Examples 

11387 -------- 

11388 Constructing DataFrame from a dictionary: 

11389 

11390 >>> df = pd.DataFrame({"Person": 

11391 ... ["John", "Myla", "Lewis", "John", "Myla"], 

11392 ... "Age": [24., np.nan, 21., 33, 26], 

11393 ... "Single": [False, True, True, True, False]}) 

11394 >>> df 

11395 Person Age Single 

11396 0 John 24.0 False 

11397 1 Myla NaN True 

11398 2 Lewis 21.0 True 

11399 3 John 33.0 True 

11400 4 Myla 26.0 False 

11401 

11402 Notice the uncounted NA values: 

11403 

11404 >>> df.count() 

11405 Person 5 

11406 Age 4 

11407 Single 5 

11408 dtype: int64 

11409 

11410 Counts for each **row**: 

11411 

11412 >>> df.count(axis='columns') 

11413 0 3 

11414 1 2 

11415 2 3 

11416 3 3 

11417 4 3 

11418 dtype: int64 

11419 """ 

11420 axis = self._get_axis_number(axis) 

11421 

11422 if numeric_only: 

11423 frame = self._get_numeric_data() 

11424 else: 

11425 frame = self 

11426 

11427 # GH #423 

11428 if len(frame._get_axis(axis)) == 0: 

11429 result = self._constructor_sliced(0, index=frame._get_agg_axis(axis)) 

11430 else: 

11431 result = notna(frame).sum(axis=axis) 

11432 

11433 return result.astype("int64", copy=False).__finalize__(self, method="count") 

11434 

11435 def _reduce( 

11436 self, 

11437 op, 

11438 name: str, 

11439 *, 

11440 axis: Axis = 0, 

11441 skipna: bool = True, 

11442 numeric_only: bool = False, 

11443 filter_type=None, 

11444 **kwds, 

11445 ): 

11446 assert filter_type is None or filter_type == "bool", filter_type 

11447 out_dtype = "bool" if filter_type == "bool" else None 

11448 

11449 if axis is not None: 

11450 axis = self._get_axis_number(axis) 

11451 

11452 def func(values: np.ndarray): 

11453 # We only use this in the case that operates on self.values 

11454 return op(values, axis=axis, skipna=skipna, **kwds) 

11455 

11456 dtype_has_keepdims: dict[ExtensionDtype, bool] = {} 

11457 

11458 def blk_func(values, axis: Axis = 1): 

11459 if isinstance(values, ExtensionArray): 

11460 if not is_1d_only_ea_dtype(values.dtype) and not isinstance( 

11461 self._mgr, ArrayManager 

11462 ): 

11463 return values._reduce(name, axis=1, skipna=skipna, **kwds) 

11464 has_keepdims = dtype_has_keepdims.get(values.dtype) 

11465 if has_keepdims is None: 

11466 sign = signature(values._reduce) 

11467 has_keepdims = "keepdims" in sign.parameters 

11468 dtype_has_keepdims[values.dtype] = has_keepdims 

11469 if has_keepdims: 

11470 return values._reduce(name, skipna=skipna, keepdims=True, **kwds) 

11471 else: 

11472 warnings.warn( 

11473 f"{type(values)}._reduce will require a `keepdims` parameter " 

11474 "in the future", 

11475 FutureWarning, 

11476 stacklevel=find_stack_level(), 

11477 ) 

11478 result = values._reduce(name, skipna=skipna, **kwds) 

11479 return np.array([result]) 

11480 else: 

11481 return op(values, axis=axis, skipna=skipna, **kwds) 

11482 

11483 def _get_data() -> DataFrame: 

11484 if filter_type is None: 

11485 data = self._get_numeric_data() 

11486 else: 

11487 # GH#25101, GH#24434 

11488 assert filter_type == "bool" 

11489 data = self._get_bool_data() 

11490 return data 

11491 

11492 # Case with EAs see GH#35881 

11493 df = self 

11494 if numeric_only: 

11495 df = _get_data() 

11496 if axis is None: 

11497 dtype = find_common_type([arr.dtype for arr in df._mgr.arrays]) 

11498 if isinstance(dtype, ExtensionDtype): 

11499 df = df.astype(dtype, copy=False) 

11500 arr = concat_compat(list(df._iter_column_arrays())) 

11501 return arr._reduce(name, skipna=skipna, keepdims=False, **kwds) 

11502 return func(df.values) 

11503 elif axis == 1: 

11504 if len(df.index) == 0: 

11505 # Taking a transpose would result in no columns, losing the dtype. 

11506 # In the empty case, reducing along axis 0 or 1 gives the same 

11507 # result dtype, so reduce with axis=0 and ignore values 

11508 result = df._reduce( 

11509 op, 

11510 name, 

11511 axis=0, 

11512 skipna=skipna, 

11513 numeric_only=False, 

11514 filter_type=filter_type, 

11515 **kwds, 

11516 ).iloc[:0] 

11517 result.index = df.index 

11518 return result 

11519 

11520 # kurtosis excluded since groupby does not implement it 

11521 if df.shape[1] and name != "kurt": 

11522 dtype = find_common_type([arr.dtype for arr in df._mgr.arrays]) 

11523 if isinstance(dtype, ExtensionDtype): 

11524 # GH 54341: fastpath for EA-backed axis=1 reductions 

11525 # This flattens the frame into a single 1D array while keeping 

11526 # track of the row and column indices of the original frame. Once 

11527 # flattened, grouping by the row indices and aggregating should 

11528 # be equivalent to transposing the original frame and aggregating 

11529 # with axis=0. 

11530 name = {"argmax": "idxmax", "argmin": "idxmin"}.get(name, name) 

11531 df = df.astype(dtype, copy=False) 

11532 arr = concat_compat(list(df._iter_column_arrays())) 

11533 nrows, ncols = df.shape 

11534 row_index = np.tile(np.arange(nrows), ncols) 

11535 col_index = np.repeat(np.arange(ncols), nrows) 

11536 ser = Series(arr, index=col_index, copy=False) 

11537 # GroupBy will raise a warning with SeriesGroupBy as the object, 

11538 # likely confusing users 

11539 with rewrite_warning( 

11540 target_message=( 

11541 f"The behavior of SeriesGroupBy.{name} with all-NA values" 

11542 ), 

11543 target_category=FutureWarning, 

11544 new_message=( 

11545 f"The behavior of {type(self).__name__}.{name} with all-NA " 

11546 "values, or any-NA and skipna=False, is deprecated. In " 

11547 "a future version this will raise ValueError" 

11548 ), 

11549 ): 

11550 result = ser.groupby(row_index).agg(name, **kwds) 

11551 result.index = df.index 

11552 if not skipna and name not in ("any", "all"): 

11553 mask = df.isna().to_numpy(dtype=np.bool_).any(axis=1) 

11554 other = -1 if name in ("idxmax", "idxmin") else lib.no_default 

11555 result = result.mask(mask, other) 

11556 return result 

11557 

11558 df = df.T 

11559 

11560 # After possibly _get_data and transposing, we are now in the 

11561 # simple case where we can use BlockManager.reduce 

11562 res = df._mgr.reduce(blk_func) 

11563 out = df._constructor_from_mgr(res, axes=res.axes).iloc[0] 

11564 if out_dtype is not None and out.dtype != "boolean": 

11565 out = out.astype(out_dtype) 

11566 elif (df._mgr.get_dtypes() == object).any() and name not in ["any", "all"]: 

11567 out = out.astype(object) 

11568 elif len(self) == 0 and out.dtype == object and name in ("sum", "prod"): 

11569 # Even if we are object dtype, follow numpy and return 

11570 # float64, see test_apply_funcs_over_empty 

11571 out = out.astype(np.float64) 

11572 

11573 return out 

11574 
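A rough sketch (not part of the pandas source) of the GH 54341 fastpath above: an extension-dtype frame is flattened column-major, each element is tagged with its row position, and a groupby over those positions performs the reduction, which matches reducing along ``axis=1``. The names below are my own.

>>> import numpy as np
>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, dtype="Int64")      # EA-backed columns
>>> nrows, ncols = df.shape
>>> flat = pd.concat([df[c] for c in df.columns], ignore_index=True)  # column-major flatten
>>> row_index = np.tile(np.arange(nrows), ncols)
>>> flat.groupby(row_index).sum()    # same values as df.sum(axis=1)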

11575 def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: 

11576 """ 

11577 Special case for _reduce to try to avoid a potentially-expensive transpose. 

11578 

11579 Apply the reduction block-wise along axis=1 and then reduce the resulting 

11580 1D arrays. 

11581 """ 

11582 if name == "all": 

11583 result = np.ones(len(self), dtype=bool) 

11584 ufunc = np.logical_and 

11585 elif name == "any": 

11586 result = np.zeros(len(self), dtype=bool) 

11587 # error: Incompatible types in assignment 

11588 # (expression has type "_UFunc_Nin2_Nout1[Literal['logical_or'], 

11589 # Literal[20], Literal[False]]", variable has type 

11590 # "_UFunc_Nin2_Nout1[Literal['logical_and'], Literal[20], 

11591 # Literal[True]]") 

11592 ufunc = np.logical_or # type: ignore[assignment] 

11593 else: 

11594 raise NotImplementedError(name) 

11595 

11596 for arr in self._mgr.arrays: 

11597 middle = func(arr, axis=0, skipna=skipna) 

11598 result = ufunc(result, middle) 

11599 

11600 res_ser = self._constructor_sliced(result, index=self.index, copy=False) 

11601 return res_ser 

11602 
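A minimal illustrative sketch (not part of the pandas source): ``_reduce_axis1`` combines per-block row-wise reductions with the matching logical ufunc instead of transposing. Done column by column, the ``all`` case amounts to the following; the frame is illustrative.

>>> import numpy as np
>>> import pandas as pd
>>> df = pd.DataFrame({"a": [True, False], "b": [True, True]})
>>> result = np.ones(len(df), dtype=bool)                 # identity element for logical_and
>>> for col in df.columns:
...     result = np.logical_and(result, df[col].to_numpy())
>>> pd.Series(result, index=df.index)                     # same values as df.all(axis=1)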

11603 @doc(make_doc("any", ndim=2)) 

11604 # error: Signature of "any" incompatible with supertype "NDFrame" 

11605 def any( # type: ignore[override] 

11606 self, 

11607 *, 

11608 axis: Axis | None = 0, 

11609 bool_only: bool = False, 

11610 skipna: bool = True, 

11611 **kwargs, 

11612 ) -> Series | bool: 

11613 result = self._logical_func( 

11614 "any", nanops.nanany, axis, bool_only, skipna, **kwargs 

11615 ) 

11616 if isinstance(result, Series): 

11617 result = result.__finalize__(self, method="any") 

11618 return result 

11619 

11620 @doc(make_doc("all", ndim=2)) 

11621 def all( 

11622 self, 

11623 axis: Axis | None = 0, 

11624 bool_only: bool = False, 

11625 skipna: bool = True, 

11626 **kwargs, 

11627 ) -> Series | bool: 

11628 result = self._logical_func( 

11629 "all", nanops.nanall, axis, bool_only, skipna, **kwargs 

11630 ) 

11631 if isinstance(result, Series): 

11632 result = result.__finalize__(self, method="all") 

11633 return result 

11634 

11635 @doc(make_doc("min", ndim=2)) 

11636 def min( 

11637 self, 

11638 axis: Axis | None = 0, 

11639 skipna: bool = True, 

11640 numeric_only: bool = False, 

11641 **kwargs, 

11642 ): 

11643 result = super().min(axis, skipna, numeric_only, **kwargs) 

11644 if isinstance(result, Series): 

11645 result = result.__finalize__(self, method="min") 

11646 return result 

11647 

11648 @doc(make_doc("max", ndim=2)) 

11649 def max( 

11650 self, 

11651 axis: Axis | None = 0, 

11652 skipna: bool = True, 

11653 numeric_only: bool = False, 

11654 **kwargs, 

11655 ): 

11656 result = super().max(axis, skipna, numeric_only, **kwargs) 

11657 if isinstance(result, Series): 

11658 result = result.__finalize__(self, method="max") 

11659 return result 

11660 

11661 @doc(make_doc("sum", ndim=2)) 

11662 def sum( 

11663 self, 

11664 axis: Axis | None = 0, 

11665 skipna: bool = True, 

11666 numeric_only: bool = False, 

11667 min_count: int = 0, 

11668 **kwargs, 

11669 ): 

11670 result = super().sum(axis, skipna, numeric_only, min_count, **kwargs) 

11671 return result.__finalize__(self, method="sum") 

11672 

11673 @doc(make_doc("prod", ndim=2)) 

11674 def prod( 

11675 self, 

11676 axis: Axis | None = 0, 

11677 skipna: bool = True, 

11678 numeric_only: bool = False, 

11679 min_count: int = 0, 

11680 **kwargs, 

11681 ): 

11682 result = super().prod(axis, skipna, numeric_only, min_count, **kwargs) 

11683 return result.__finalize__(self, method="prod") 

11684 

11685 @doc(make_doc("mean", ndim=2)) 

11686 def mean( 

11687 self, 

11688 axis: Axis | None = 0, 

11689 skipna: bool = True, 

11690 numeric_only: bool = False, 

11691 **kwargs, 

11692 ): 

11693 result = super().mean(axis, skipna, numeric_only, **kwargs) 

11694 if isinstance(result, Series): 

11695 result = result.__finalize__(self, method="mean") 

11696 return result 

11697 

11698 @doc(make_doc("median", ndim=2)) 

11699 def median( 

11700 self, 

11701 axis: Axis | None = 0, 

11702 skipna: bool = True, 

11703 numeric_only: bool = False, 

11704 **kwargs, 

11705 ): 

11706 result = super().median(axis, skipna, numeric_only, **kwargs) 

11707 if isinstance(result, Series): 

11708 result = result.__finalize__(self, method="median") 

11709 return result 

11710 

11711 @doc(make_doc("sem", ndim=2)) 

11712 def sem( 

11713 self, 

11714 axis: Axis | None = 0, 

11715 skipna: bool = True, 

11716 ddof: int = 1, 

11717 numeric_only: bool = False, 

11718 **kwargs, 

11719 ): 

11720 result = super().sem(axis, skipna, ddof, numeric_only, **kwargs) 

11721 if isinstance(result, Series): 

11722 result = result.__finalize__(self, method="sem") 

11723 return result 

11724 

11725 @doc(make_doc("var", ndim=2)) 

11726 def var( 

11727 self, 

11728 axis: Axis | None = 0, 

11729 skipna: bool = True, 

11730 ddof: int = 1, 

11731 numeric_only: bool = False, 

11732 **kwargs, 

11733 ): 

11734 result = super().var(axis, skipna, ddof, numeric_only, **kwargs) 

11735 if isinstance(result, Series): 

11736 result = result.__finalize__(self, method="var") 

11737 return result 

11738 

11739 @doc(make_doc("std", ndim=2)) 

11740 def std( 

11741 self, 

11742 axis: Axis | None = 0, 

11743 skipna: bool = True, 

11744 ddof: int = 1, 

11745 numeric_only: bool = False, 

11746 **kwargs, 

11747 ): 

11748 result = super().std(axis, skipna, ddof, numeric_only, **kwargs) 

11749 if isinstance(result, Series): 

11750 result = result.__finalize__(self, method="std") 

11751 return result 

11752 

11753 @doc(make_doc("skew", ndim=2)) 

11754 def skew( 

11755 self, 

11756 axis: Axis | None = 0, 

11757 skipna: bool = True, 

11758 numeric_only: bool = False, 

11759 **kwargs, 

11760 ): 

11761 result = super().skew(axis, skipna, numeric_only, **kwargs) 

11762 if isinstance(result, Series): 

11763 result = result.__finalize__(self, method="skew") 

11764 return result 

11765 

11766 @doc(make_doc("kurt", ndim=2)) 

11767 def kurt( 

11768 self, 

11769 axis: Axis | None = 0, 

11770 skipna: bool = True, 

11771 numeric_only: bool = False, 

11772 **kwargs, 

11773 ): 

11774 result = super().kurt(axis, skipna, numeric_only, **kwargs) 

11775 if isinstance(result, Series): 

11776 result = result.__finalize__(self, method="kurt") 

11777 return result 

11778 

11779 kurtosis = kurt 

11780 product = prod 

11781 

11782 @doc(make_doc("cummin", ndim=2)) 

11783 def cummin(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): 

11784 return NDFrame.cummin(self, axis, skipna, *args, **kwargs) 

11785 

11786 @doc(make_doc("cummax", ndim=2)) 

11787 def cummax(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): 

11788 return NDFrame.cummax(self, axis, skipna, *args, **kwargs) 

11789 

11790 @doc(make_doc("cumsum", ndim=2)) 

11791 def cumsum(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): 

11792 return NDFrame.cumsum(self, axis, skipna, *args, **kwargs) 

11793 

11794 @doc(make_doc("cumprod", ndim=2)) 

11795 def cumprod(self, axis: Axis | None = None, skipna: bool = True, *args, **kwargs): 

11796 return NDFrame.cumprod(self, axis, skipna, *args, **kwargs) 

11797 

11798 def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: 

11799 """ 

11800 Count number of distinct elements in specified axis. 

11801 

11802 Return Series with number of distinct elements. Can ignore NaN 

11803 values. 

11804 

11805 Parameters 

11806 ---------- 

11807 axis : {0 or 'index', 1 or 'columns'}, default 0 

11808 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for 

11809 column-wise. 

11810 dropna : bool, default True 

11811 Don't include NaN in the counts. 

11812 

11813 Returns 

11814 ------- 

11815 Series 

11816 

11817 See Also 

11818 -------- 

11819 Series.nunique: Method nunique for Series. 

11820 DataFrame.count: Count non-NA cells for each column or row. 

11821 

11822 Examples 

11823 -------- 

11824 >>> df = pd.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]}) 

11825 >>> df.nunique() 

11826 A 3 

11827 B 2 

11828 dtype: int64 

11829 

11830 >>> df.nunique(axis=1) 

11831 0 1 

11832 1 2 

11833 2 2 

11834 dtype: int64 

11835 """ 

11836 return self.apply(Series.nunique, axis=axis, dropna=dropna) 

11837 

11838 @doc(_shared_docs["idxmin"], numeric_only_default="False") 

11839 def idxmin( 

11840 self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False 

11841 ) -> Series: 

11842 axis = self._get_axis_number(axis) 

11843 

11844 if self.empty and len(self.axes[axis]): 

11845 axis_dtype = self.axes[axis].dtype 

11846 return self._constructor_sliced(dtype=axis_dtype) 

11847 

11848 if numeric_only: 

11849 data = self._get_numeric_data() 

11850 else: 

11851 data = self 

11852 

11853 res = data._reduce( 

11854 nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False 

11855 ) 

11856 indices = res._values 

11857 # indices will always be np.ndarray since axis is not None 

11858 

11859 if (indices == -1).any(): 

11860 warnings.warn( 

11861 f"The behavior of {type(self).__name__}.idxmin with all-NA " 

11862 "values, or any-NA and skipna=False, is deprecated. In a future " 

11863 "version this will raise ValueError", 

11864 FutureWarning, 

11865 stacklevel=find_stack_level(), 

11866 ) 

11867 

11868 index = data._get_axis(axis) 

11869 result = algorithms.take( 

11870 index._values, indices, allow_fill=True, fill_value=index._na_value 

11871 ) 

11872 final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis)) 

11873 return final_result.__finalize__(self, method="idxmin") 

11874 
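A minimal illustrative sketch (not part of the pandas source): with ``skipna=False`` and any NA in a column, ``nanargmin`` returns the ``-1`` sentinel checked above, which currently becomes the index's NA value and a ``FutureWarning`` (a future version will raise). The frame is illustrative.

>>> import warnings
>>> import numpy as np
>>> import pandas as pd
>>> df = pd.DataFrame({"a": [1.0, np.nan], "b": [2.0, 0.5]})
>>> df.idxmin()                      # NaN skipped by default: a -> 0, b -> 1
>>> with warnings.catch_warnings():
...     warnings.simplefilter("ignore", FutureWarning)
...     _ = df.idxmin(skipna=False)  # column 'a' maps to NaN under the deprecated behavior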

11875 @doc(_shared_docs["idxmax"], numeric_only_default="False") 

11876 def idxmax( 

11877 self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False 

11878 ) -> Series: 

11879 axis = self._get_axis_number(axis) 

11880 

11881 if self.empty and len(self.axes[axis]): 

11882 axis_dtype = self.axes[axis].dtype 

11883 return self._constructor_sliced(dtype=axis_dtype) 

11884 

11885 if numeric_only: 

11886 data = self._get_numeric_data() 

11887 else: 

11888 data = self 

11889 

11890 res = data._reduce( 

11891 nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False 

11892 ) 

11893 indices = res._values 

11894 # indices will always be 1d array since axis is not None 

11895 

11896 if (indices == -1).any(): 

11897 warnings.warn( 

11898 f"The behavior of {type(self).__name__}.idxmax with all-NA " 

11899 "values, or any-NA and skipna=False, is deprecated. In a future " 

11900 "version this will raise ValueError", 

11901 FutureWarning, 

11902 stacklevel=find_stack_level(), 

11903 ) 

11904 

11905 index = data._get_axis(axis) 

11906 result = algorithms.take( 

11907 index._values, indices, allow_fill=True, fill_value=index._na_value 

11908 ) 

11909 final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis)) 

11910 return final_result.__finalize__(self, method="idxmax") 

11911 

11912 def _get_agg_axis(self, axis_num: int) -> Index: 

11913 """ 

11914 Let's be explicit about this. 

11915 """ 

11916 if axis_num == 0: 

11917 return self.columns 

11918 elif axis_num == 1: 

11919 return self.index 

11920 else: 

11921 raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})") 

11922 

11923 def mode( 

11924 self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True 

11925 ) -> DataFrame: 

11926 """ 

11927 Get the mode(s) of each element along the selected axis. 

11928 

11929 The mode of a set of values is the value that appears most often. 

11930 It can be multiple values. 

11931 

11932 Parameters 

11933 ---------- 

11934 axis : {0 or 'index', 1 or 'columns'}, default 0 

11935 The axis to iterate over while searching for the mode: 

11936 

11937 * 0 or 'index' : get mode of each column 

11938 * 1 or 'columns' : get mode of each row. 

11939 

11940 numeric_only : bool, default False 

11941 If True, only apply to numeric columns. 

11942 dropna : bool, default True 

11943 Don't consider counts of NaN/NaT. 

11944 

11945 Returns 

11946 ------- 

11947 DataFrame 

11948 The modes of each column or row. 

11949 

11950 See Also 

11951 -------- 

11952 Series.mode : Return the highest frequency value in a Series. 

11953 Series.value_counts : Return the counts of values in a Series. 

11954 

11955 Examples 

11956 -------- 

11957 >>> df = pd.DataFrame([('bird', 2, 2), 

11958 ... ('mammal', 4, np.nan), 

11959 ... ('arthropod', 8, 0), 

11960 ... ('bird', 2, np.nan)], 

11961 ... index=('falcon', 'horse', 'spider', 'ostrich'), 

11962 ... columns=('species', 'legs', 'wings')) 

11963 >>> df 

11964 species legs wings 

11965 falcon bird 2 2.0 

11966 horse mammal 4 NaN 

11967 spider arthropod 8 0.0 

11968 ostrich bird 2 NaN 

11969 

11970 By default, missing values are not considered, and the modes of wings 

11971 are both 0 and 2. Because the resulting DataFrame has two rows, 

11972 the second row of ``species`` and ``legs`` contains ``NaN``. 

11973 

11974 >>> df.mode() 

11975 species legs wings 

11976 0 bird 2.0 0.0 

11977 1 NaN NaN 2.0 

11978 

11979 Setting ``dropna=False``, ``NaN`` values are considered and they can be 

11980 the mode (like for wings). 

11981 

11982 >>> df.mode(dropna=False) 

11983 species legs wings 

11984 0 bird 2 NaN 

11985 

11986 Setting ``numeric_only=True``, only the mode of numeric columns is 

11987 computed, and columns of other types are ignored. 

11988 

11989 >>> df.mode(numeric_only=True) 

11990 legs wings 

11991 0 2.0 0.0 

11992 1 NaN 2.0 

11993 

11994 To compute the mode over columns and not rows, use the axis parameter: 

11995 

11996 >>> df.mode(axis='columns', numeric_only=True) 

11997 0 1 

11998 falcon 2.0 NaN 

11999 horse 4.0 NaN 

12000 spider 0.0 8.0 

12001 ostrich 2.0 NaN 

12002 """ 

12003 data = self if not numeric_only else self._get_numeric_data() 

12004 

12005 def f(s): 

12006 return s.mode(dropna=dropna) 

12007 

12008 data = data.apply(f, axis=axis) 

12009 # Ensure index is type stable (should always use int index) 

12010 if data.empty: 

12011 data.index = default_index(0) 

12012 

12013 return data 

12014 

12015 @overload 

12016 def quantile( 

12017 self, 

12018 q: float = ..., 

12019 axis: Axis = ..., 

12020 numeric_only: bool = ..., 

12021 interpolation: QuantileInterpolation = ..., 

12022 method: Literal["single", "table"] = ..., 

12023 ) -> Series: 

12024 ... 

12025 

12026 @overload 

12027 def quantile( 

12028 self, 

12029 q: AnyArrayLike | Sequence[float], 

12030 axis: Axis = ..., 

12031 numeric_only: bool = ..., 

12032 interpolation: QuantileInterpolation = ..., 

12033 method: Literal["single", "table"] = ..., 

12034 ) -> Series | DataFrame: 

12035 ... 

12036 

12037 @overload 

12038 def quantile( 

12039 self, 

12040 q: float | AnyArrayLike | Sequence[float] = ..., 

12041 axis: Axis = ..., 

12042 numeric_only: bool = ..., 

12043 interpolation: QuantileInterpolation = ..., 

12044 method: Literal["single", "table"] = ..., 

12045 ) -> Series | DataFrame: 

12046 ... 

12047 

12048 def quantile( 

12049 self, 

12050 q: float | AnyArrayLike | Sequence[float] = 0.5, 

12051 axis: Axis = 0, 

12052 numeric_only: bool = False, 

12053 interpolation: QuantileInterpolation = "linear", 

12054 method: Literal["single", "table"] = "single", 

12055 ) -> Series | DataFrame: 

12056 """ 

12057 Return values at the given quantile over requested axis. 

12058 

12059 Parameters 

12060 ---------- 

12061 q : float or array-like, default 0.5 (50% quantile) 

12062 Value between 0 <= q <= 1, the quantile(s) to compute. 

12063 axis : {0 or 'index', 1 or 'columns'}, default 0 

12064 Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise. 

12065 numeric_only : bool, default False 

12066 Include only `float`, `int` or `boolean` data. 

12067 

12068 .. versionchanged:: 2.0.0 

12069 The default value of ``numeric_only`` is now ``False``. 

12070 

12071 interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} 

12072 This optional parameter specifies the interpolation method to use, 

12073 when the desired quantile lies between two data points `i` and `j`: 

12074 

12075 * linear: `i + (j - i) * fraction`, where `fraction` is the 

12076 fractional part of the index surrounded by `i` and `j`. 

12077 * lower: `i`. 

12078 * higher: `j`. 

12079 * nearest: `i` or `j` whichever is nearest. 

12080 * midpoint: (`i` + `j`) / 2. 

12081 method : {'single', 'table'}, default 'single' 

12082 Whether to compute quantiles per-column ('single') or over all columns 

12083 ('table'). When 'table', the only allowed interpolation methods are 

12084 'nearest', 'lower', and 'higher'. 

12085 

12086 Returns 

12087 ------- 

12088 Series or DataFrame 

12089 

12090 If ``q`` is an array, a DataFrame will be returned where the 

12091 index is ``q``, the columns are the columns of self, and the 

12092 values are the quantiles. 

12093 If ``q`` is a float, a Series will be returned where the 

12094 index is the columns of self and the values are the quantiles. 

12095 

12096 See Also 

12097 -------- 

12098 core.window.rolling.Rolling.quantile: Rolling quantile. 

12099 numpy.percentile: Numpy function to compute the percentile. 

12100 

12101 Examples 

12102 -------- 

12103 >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), 

12104 ... columns=['a', 'b']) 

12105 >>> df.quantile(.1) 

12106 a 1.3 

12107 b 3.7 

12108 Name: 0.1, dtype: float64 

12109 >>> df.quantile([.1, .5]) 

12110 a b 

12111 0.1 1.3 3.7 

12112 0.5 2.5 55.0 

12113 

12114 Specifying `method='table'` will compute the quantile over all columns. 

12115 

12116 >>> df.quantile(.1, method="table", interpolation="nearest") 

12117 a 1 

12118 b 1 

12119 Name: 0.1, dtype: int64 

12120 >>> df.quantile([.1, .5], method="table", interpolation="nearest") 

12121 a b 

12122 0.1 1 1 

12123 0.5 3 100 

12124 

12125 Specifying `numeric_only=False` will also compute the quantile of 

12126 datetime and timedelta data. 

12127 

12128 >>> df = pd.DataFrame({'A': [1, 2], 

12129 ... 'B': [pd.Timestamp('2010'), 

12130 ... pd.Timestamp('2011')], 

12131 ... 'C': [pd.Timedelta('1 days'), 

12132 ... pd.Timedelta('2 days')]}) 

12133 >>> df.quantile(0.5, numeric_only=False) 

12134 A 1.5 

12135 B 2010-07-02 12:00:00 

12136 C 1 days 12:00:00 

12137 Name: 0.5, dtype: object 

12138 """ 

12139 validate_percentile(q) 

12140 axis = self._get_axis_number(axis) 

12141 

12142 if not is_list_like(q): 

12143 # BlockManager.quantile expects listlike, so we wrap and unwrap here 

12144 # error: List item 0 has incompatible type "float | ExtensionArray | 

12145 # ndarray[Any, Any] | Index | Series | Sequence[float]"; expected "float" 

12146 res_df = self.quantile( 

12147 [q], # type: ignore[list-item] 

12148 axis=axis, 

12149 numeric_only=numeric_only, 

12150 interpolation=interpolation, 

12151 method=method, 

12152 ) 

12153 if method == "single": 

12154 res = res_df.iloc[0] 

12155 else: 

12156 # cannot directly iloc over sparse arrays 

12157 res = res_df.T.iloc[:, 0] 

12158 if axis == 1 and len(self) == 0: 

12159 # GH#41544 try to get an appropriate dtype 

12160 dtype = find_common_type(list(self.dtypes)) 

12161 if needs_i8_conversion(dtype): 

12162 return res.astype(dtype) 

12163 return res 

12164 

12165 q = Index(q, dtype=np.float64) 

12166 data = self._get_numeric_data() if numeric_only else self 

12167 

12168 if axis == 1: 

12169 data = data.T 

12170 

12171 if len(data.columns) == 0: 

12172 # GH#23925 _get_numeric_data may have dropped all columns 

12173 cols = Index([], name=self.columns.name) 

12174 

12175 dtype = np.float64 

12176 if axis == 1: 

12177 # GH#41544 try to get an appropriate dtype 

12178 cdtype = find_common_type(list(self.dtypes)) 

12179 if needs_i8_conversion(cdtype): 

12180 dtype = cdtype 

12181 

12182 res = self._constructor([], index=q, columns=cols, dtype=dtype) 

12183 return res.__finalize__(self, method="quantile") 

12184 

12185 valid_method = {"single", "table"} 

12186 if method not in valid_method: 

12187 raise ValueError( 

12188 f"Invalid method: {method}. Method must be in {valid_method}." 

12189 ) 

12190 if method == "single": 

12191 res = data._mgr.quantile(qs=q, interpolation=interpolation) 

12192 elif method == "table": 

12193 valid_interpolation = {"nearest", "lower", "higher"} 

12194 if interpolation not in valid_interpolation: 

12195 raise ValueError( 

12196 f"Invalid interpolation: {interpolation}. " 

12197 f"Interpolation must be in {valid_interpolation}" 

12198 ) 

12199 # handle degenerate case 

12200 if len(data) == 0: 

12201 if data.ndim == 2: 

12202 dtype = find_common_type(list(self.dtypes)) 

12203 else: 

12204 dtype = self.dtype 

12205 return self._constructor([], index=q, columns=data.columns, dtype=dtype) 

12206 

12207 q_idx = np.quantile(np.arange(len(data)), q, method=interpolation) 

12208 

12209 by = data.columns 

12210 if len(by) > 1: 

12211 keys = [data._get_label_or_level_values(x) for x in by] 

12212 indexer = lexsort_indexer(keys) 

12213 else: 

12214 k = data._get_label_or_level_values(by[0]) 

12215 indexer = nargsort(k) 

12216 

12217 res = data._mgr.take(indexer[q_idx], verify=False) 

12218 res.axes[1] = q 

12219 

12220 result = self._constructor_from_mgr(res, axes=res.axes) 

12221 return result.__finalize__(self, method="quantile") 

12222 
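A rough, self-contained illustration (not part of the source) of the ``method="table"`` branch of ``quantile`` above: rows are sorted lexicographically by all columns, a positional quantile is taken over the row positions, and the matching row is returned whole, which is why only 'nearest', 'lower' and 'higher' interpolation are allowed. The data reuses the docstring example; the lexsort key order here is a simplification of ``lexsort_indexer``.

import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [1, 10, 100, 100]})

# Positional 10th percentile over the row positions 0..3, snapped to an
# existing position ("nearest"), so a whole existing row can be picked out.
q_idx = int(np.quantile(np.arange(len(df)), 0.1, method="nearest"))

# Sort rows by the columns, then take the row at that position.
order = np.lexsort([df["b"].to_numpy(), df["a"].to_numpy()])  # primary key: "a"
row = df.iloc[order[q_idx]]
# Compare with: df.quantile(0.1, method="table", interpolation="nearest")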

12223 def to_timestamp( 

12224 self, 

12225 freq: Frequency | None = None, 

12226 how: ToTimestampHow = "start", 

12227 axis: Axis = 0, 

12228 copy: bool | None = None, 

12229 ) -> DataFrame: 

12230 """ 

12231 Cast to DatetimeIndex of timestamps, at *beginning* of period. 

12232 

12233 Parameters 

12234 ---------- 

12235 freq : str, default frequency of PeriodIndex 

12236 Desired frequency. 

12237 how : {'s', 'e', 'start', 'end'} 

12238 Convention for converting period to timestamp; start of period 

12239 vs. end. 

12240 axis : {0 or 'index', 1 or 'columns'}, default 0 

12241 The axis to convert (the index by default). 

12242 copy : bool, default True 

12243 If False then underlying input data is not copied. 

12244 

12245 .. note:: 

12246 The `copy` keyword will change behavior in pandas 3.0. 

12247 `Copy-on-Write 

12248 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__ 

12249 will be enabled by default, which means that all methods with a 

12250 `copy` keyword will use a lazy copy mechanism to defer the copy and 

12251 ignore the `copy` keyword. The `copy` keyword will be removed in a 

12252 future version of pandas. 

12253 

12254 You can already get the future behavior and improvements through 

12255 enabling copy on write ``pd.options.mode.copy_on_write = True`` 

12256 

12257 Returns 

12258 ------- 

12259 DataFrame 

12260 The DataFrame has a DatetimeIndex. 

12261 

12262 Examples 

12263 -------- 

12264 >>> idx = pd.PeriodIndex(['2023', '2024'], freq='Y') 

12265 >>> d = {'col1': [1, 2], 'col2': [3, 4]} 

12266 >>> df1 = pd.DataFrame(data=d, index=idx) 

12267 >>> df1 

12268 col1 col2 

12269 2023 1 3 

12270 2024 2 4 

12271 

12272 The resulting timestamps will be at the beginning of the year in this case: 

12273 

12274 >>> df1 = df1.to_timestamp() 

12275 >>> df1 

12276 col1 col2 

12277 2023-01-01 1 3 

12278 2024-01-01 2 4 

12279 >>> df1.index 

12280 DatetimeIndex(['2023-01-01', '2024-01-01'], dtype='datetime64[ns]', freq=None) 

12281 

12282 Using `freq`, which is the offset that the Timestamps will have: 

12283 

12284 >>> df2 = pd.DataFrame(data=d, index=idx) 

12285 >>> df2 = df2.to_timestamp(freq='M') 

12286 >>> df2 

12287 col1 col2 

12288 2023-01-31 1 3 

12289 2024-01-31 2 4 

12290 >>> df2.index 

12291 DatetimeIndex(['2023-01-31', '2024-01-31'], dtype='datetime64[ns]', freq=None) 

12292 """ 

12293 new_obj = self.copy(deep=copy and not using_copy_on_write()) 

12294 

12295 axis_name = self._get_axis_name(axis) 

12296 old_ax = getattr(self, axis_name) 

12297 if not isinstance(old_ax, PeriodIndex): 

12298 raise TypeError(f"unsupported Type {type(old_ax).__name__}") 

12299 

12300 new_ax = old_ax.to_timestamp(freq=freq, how=how) 

12301 

12302 setattr(new_obj, axis_name, new_ax) 

12303 return new_obj 

12304 
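The docstring examples above only show the default ``how='start'`` behaviour; a small hedged sketch of the ``how='end'`` convention, using the same made-up yearly data:

import pandas as pd

idx = pd.PeriodIndex(["2023", "2024"], freq="Y")
df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]}, index=idx)

# how="end" anchors each timestamp at the last instant of its period
# instead of the first.
df_end = df.to_timestamp(how="end")
print(df_end.index)  # DatetimeIndex ending on 2023-12-31 / 2024-12-31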

12305 def to_period( 

12306 self, freq: Frequency | None = None, axis: Axis = 0, copy: bool | None = None 

12307 ) -> DataFrame: 

12308 """ 

12309 Convert DataFrame from DatetimeIndex to PeriodIndex. 

12310 

12311 Convert DataFrame from DatetimeIndex to PeriodIndex with desired 

12312 frequency (inferred from index if not passed). 

12313 

12314 Parameters 

12315 ---------- 

12316 freq : str, default None 

12317 Frequency of the PeriodIndex. 

12318 axis : {0 or 'index', 1 or 'columns'}, default 0 

12319 The axis to convert (the index by default). 

12320 copy : bool, default True 

12321 If False then underlying input data is not copied. 

12322 

12323 .. note:: 

12324 The `copy` keyword will change behavior in pandas 3.0. 

12325 `Copy-on-Write 

12326 <https://pandas.pydata.org/docs/dev/user_guide/copy_on_write.html>`__ 

12327 will be enabled by default, which means that all methods with a 

12328 `copy` keyword will use a lazy copy mechanism to defer the copy and 

12329 ignore the `copy` keyword. The `copy` keyword will be removed in a 

12330 future version of pandas. 

12331 

12332 You can already get the future behavior and improvements through 

12333 enabling copy on write ``pd.options.mode.copy_on_write = True`` 

12334 

12335 Returns 

12336 ------- 

12337 DataFrame 

12338 The DataFrame has a PeriodIndex. 

12339 

12340 Examples 

12341 -------- 

12342 >>> idx = pd.to_datetime( 

12343 ... [ 

12344 ... "2001-03-31 00:00:00", 

12345 ... "2002-05-31 00:00:00", 

12346 ... "2003-08-31 00:00:00", 

12347 ... ] 

12348 ... ) 

12349 

12350 >>> idx 

12351 DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'], 

12352 dtype='datetime64[ns]', freq=None) 

12353 

12354 >>> idx.to_period("M") 

12355 PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]') 

12356 

12357 For the yearly frequency 

12358 

12359 >>> idx.to_period("Y") 

12360 PeriodIndex(['2001', '2002', '2003'], dtype='period[Y-DEC]') 

12361 """ 

12362 new_obj = self.copy(deep=copy and not using_copy_on_write()) 

12363 

12364 axis_name = self._get_axis_name(axis) 

12365 old_ax = getattr(self, axis_name) 

12366 if not isinstance(old_ax, DatetimeIndex): 

12367 raise TypeError(f"unsupported Type {type(old_ax).__name__}") 

12368 

12369 new_ax = old_ax.to_period(freq=freq) 

12370 

12371 setattr(new_obj, axis_name, new_ax) 

12372 return new_obj 

12373 
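The docstring examples above demonstrate ``to_period`` on a bare DatetimeIndex only; a minimal sketch of the DataFrame method itself, assuming a monthly frequency and made-up data:

import pandas as pd

idx = pd.to_datetime(["2001-03-31", "2002-05-31", "2003-08-31"])
df = pd.DataFrame({"x": [1, 2, 3]}, index=idx)

# Convert the frame's DatetimeIndex into a monthly PeriodIndex.
monthly = df.to_period(freq="M")
print(monthly.index)
# PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]')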

12374 def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame: 

12375 """ 

12376 Whether each element in the DataFrame is contained in values. 

12377 

12378 Parameters 

12379 ---------- 

12380 values : iterable, Series, DataFrame or dict 

12381 The result will only be true at a location if all the 

12382 labels match. If `values` is a Series, the match is on its index. If 

12383 `values` is a dict, the keys must be the column names, 

12384 which must match. If `values` is a DataFrame, 

12385 then both the index and column labels must match. 

12386 

12387 Returns 

12388 ------- 

12389 DataFrame 

12390 DataFrame of booleans showing whether each element in the DataFrame 

12391 is contained in values. 

12392 

12393 See Also 

12394 -------- 

12395 DataFrame.eq: Equality test for DataFrame. 

12396 Series.isin: Equivalent method on Series. 

12397 Series.str.contains: Test if pattern or regex is contained within a 

12398 string of a Series or Index. 

12399 

12400 Examples 

12401 -------- 

12402 >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, 

12403 ... index=['falcon', 'dog']) 

12404 >>> df 

12405 num_legs num_wings 

12406 falcon 2 2 

12407 dog 4 0 

12408 

12409 When ``values`` is a list, check whether every value in the DataFrame 

12410 is present in the list (which animals have 0 or 2 legs or wings): 

12411 

12412 >>> df.isin([0, 2]) 

12413 num_legs num_wings 

12414 falcon True True 

12415 dog False True 

12416 

12417 To check if ``values`` is *not* in the DataFrame, use the ``~`` operator: 

12418 

12419 >>> ~df.isin([0, 2]) 

12420 num_legs num_wings 

12421 falcon False False 

12422 dog True False 

12423 

12424 When ``values`` is a dict, we can pass values to check for each 

12425 column separately: 

12426 

12427 >>> df.isin({'num_wings': [0, 3]}) 

12428 num_legs num_wings 

12429 falcon False False 

12430 dog False True 

12431 

12432 When ``values`` is a Series or DataFrame the index and column must 

12433 match. Note that 'falcon' does not match based on the number of legs 

12434 in other. 

12435 

12436 >>> other = pd.DataFrame({'num_legs': [8, 3], 'num_wings': [0, 2]}, 

12437 ... index=['spider', 'falcon']) 

12438 >>> df.isin(other) 

12439 num_legs num_wings 

12440 falcon False True 

12441 dog False False 

12442 """ 

12443 if isinstance(values, dict): 

12444 from pandas.core.reshape.concat import concat 

12445 

12446 values = collections.defaultdict(list, values) 

12447 result = concat( 

12448 ( 

12449 self.iloc[:, [i]].isin(values[col]) 

12450 for i, col in enumerate(self.columns) 

12451 ), 

12452 axis=1, 

12453 ) 

12454 elif isinstance(values, Series): 

12455 if not values.index.is_unique: 

12456 raise ValueError("cannot compute isin with a duplicate axis.") 

12457 result = self.eq(values.reindex_like(self), axis="index") 

12458 elif isinstance(values, DataFrame): 

12459 if not (values.columns.is_unique and values.index.is_unique): 

12460 raise ValueError("cannot compute isin with a duplicate axis.") 

12461 result = self.eq(values.reindex_like(self)) 

12462 else: 

12463 if not is_list_like(values): 

12464 raise TypeError( 

12465 "only list-like or dict-like objects are allowed " 

12466 "to be passed to DataFrame.isin(), " 

12467 f"you passed a '{type(values).__name__}'" 

12468 ) 

12469 

12470 def isin_(x): 

12471 # error: Argument 2 to "isin" has incompatible type "Union[Series, 

12472 # DataFrame, Sequence[Any], Mapping[Any, Any]]"; expected 

12473 # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]], Index, 

12474 # Series], List[Any], range]" 

12475 result = algorithms.isin( 

12476 x.ravel(), 

12477 values, # type: ignore[arg-type] 

12478 ) 

12479 return result.reshape(x.shape) 

12480 

12481 res_mgr = self._mgr.apply(isin_) 

12482 result = self._constructor_from_mgr( 

12483 res_mgr, 

12484 axes=res_mgr.axes, 

12485 ) 

12486 return result.__finalize__(self, method="isin") 

12487 
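A rough sketch (not the actual implementation) of what the dict branch of ``isin`` above amounts to: each column is tested against its own list, and columns missing from the dict are compared against an empty list (the ``defaultdict(list)``), so they come back all False. The data reuses the docstring example.

import pandas as pd

df = pd.DataFrame({"num_legs": [2, 4], "num_wings": [2, 0]},
                  index=["falcon", "dog"])
values = {"num_wings": [0, 3]}

# Per-column isin; absent columns are checked against an empty list.
parts = {col: df[col].isin(values.get(col, [])) for col in df.columns}
result = pd.DataFrame(parts)
# result matches df.isin({"num_wings": [0, 3]}) shown in the docstring.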

12488 # ---------------------------------------------------------------------- 

12489 # Add index and columns 

12490 _AXIS_ORDERS: list[Literal["index", "columns"]] = ["index", "columns"] 

12491 _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = { 

12492 **NDFrame._AXIS_TO_AXIS_NUMBER, 

12493 1: 1, 

12494 "columns": 1, 

12495 } 

12496 _AXIS_LEN = len(_AXIS_ORDERS) 

12497 _info_axis_number: Literal[1] = 1 

12498 _info_axis_name: Literal["columns"] = "columns" 

12499 

12500 index = properties.AxisProperty( 

12501 axis=1, 

12502 doc=""" 

12503 The index (row labels) of the DataFrame. 

12504 

12505 The index of a DataFrame is a series of labels that identify each row. 

12506 The labels can be integers, strings, or any other hashable type. The index 

12507 is used for label-based access and alignment, and can be accessed or 

12508 modified using this attribute. 

12509 

12510 Returns 

12511 ------- 

12512 pandas.Index 

12513 The index labels of the DataFrame. 

12514 

12515 See Also 

12516 -------- 

12517 DataFrame.columns : The column labels of the DataFrame. 

12518 DataFrame.to_numpy : Convert the DataFrame to a NumPy array. 

12519 

12520 Examples 

12521 -------- 

12522 >>> df = pd.DataFrame({'Name': ['Alice', 'Bob', 'Aritra'], 

12523 ... 'Age': [25, 30, 35], 

12524 ... 'Location': ['Seattle', 'New York', 'Kona']}, 

12525 ... index=([10, 20, 30])) 

12526 >>> df.index 

12527 Index([10, 20, 30], dtype='int64') 

12528 

12529 In this example, we create a DataFrame with 3 rows and 3 columns, 

12530 including Name, Age, and Location information. We set the index labels to 

12531 be the integers 10, 20, and 30. We then access the `index` attribute of the 

12532 DataFrame, which returns an `Index` object containing the index labels. 

12533 

12534 >>> df.index = [100, 200, 300] 

12535 >>> df 

12536 Name Age Location 

12537 100 Alice 25 Seattle 

12538 200 Bob 30 New York 

12539 300 Aritra 35 Kona 

12540 

12541 In this example, we modify the index labels of the DataFrame by assigning 

12542 a new list of labels to the `index` attribute. The DataFrame is then 

12543 updated with the new labels, and the output shows the modified DataFrame. 

12544 """, 

12545 ) 

12546 columns = properties.AxisProperty( 

12547 axis=0, 

12548 doc=dedent( 

12549 """ 

12550 The column labels of the DataFrame. 

12551 

12552 Examples 

12553 -------- 

12554 >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) 

12555 >>> df 

12556 A B 

12557 0 1 3 

12558 1 2 4 

12559 >>> df.columns 

12560 Index(['A', 'B'], dtype='object') 

12561 """ 

12562 ), 

12563 ) 

12564 

12565 # ---------------------------------------------------------------------- 

12566 # Add plotting methods to DataFrame 

12567 plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) 

12568 hist = pandas.plotting.hist_frame 

12569 boxplot = pandas.plotting.boxplot_frame 

12570 sparse = CachedAccessor("sparse", SparseFrameAccessor) 

12571 

12572 # ---------------------------------------------------------------------- 

12573 # Internal Interface Methods 

12574 

12575 def _to_dict_of_blocks(self): 

12576 """ 

12577 Return a dict of dtype -> constructor type, where each value 

12578 holds a single homogeneous dtype. 

12579 

12580 Internal ONLY - only works for BlockManager 

12581 """ 

12582 mgr = self._mgr 

12583 # convert to BlockManager if needed -> this way support ArrayManager as well 

12584 mgr = cast(BlockManager, mgr_to_mgr(mgr, "block")) 

12585 return { 

12586 k: self._constructor_from_mgr(v, axes=v.axes).__finalize__(self) 

12587 for k, v in mgr.to_dict().items() 

12588 } 

12589 

12590 @property 

12591 def values(self) -> np.ndarray: 

12592 """ 

12593 Return a Numpy representation of the DataFrame. 

12594 

12595 .. warning:: 

12596 

12597 We recommend using :meth:`DataFrame.to_numpy` instead. 

12598 

12599 Only the values in the DataFrame will be returned, the axes labels 

12600 will be removed. 

12601 

12602 Returns 

12603 ------- 

12604 numpy.ndarray 

12605 The values of the DataFrame. 

12606 

12607 See Also 

12608 -------- 

12609 DataFrame.to_numpy : Recommended alternative to this method. 

12610 DataFrame.index : Retrieve the index labels. 

12611 DataFrame.columns : Retrieve the column names. 

12612 

12613 Notes 

12614 ----- 

12615 The dtype will be a lower-common-denominator dtype (implicit 

12616 upcasting); that is to say if the dtypes (even of numeric types) 

12617 are mixed, the one that accommodates all will be chosen. Use this 

12618 with care if you are not dealing with the blocks. 

12619 

12620 e.g. If the dtypes are float16 and float32, dtype will be upcast to 

12621 float32. If dtypes are int32 and uint8, dtype will be upcast to 

12622 int32. By :func:`numpy.find_common_type` convention, mixing int64 

12623 and uint64 will result in a float64 dtype. 

12624 

12625 Examples 

12626 -------- 

12627 A DataFrame where all columns are the same type (e.g., int64) results 

12628 in an array of the same type. 

12629 

12630 >>> df = pd.DataFrame({'age': [ 3, 29], 

12631 ... 'height': [94, 170], 

12632 ... 'weight': [31, 115]}) 

12633 >>> df 

12634 age height weight 

12635 0 3 94 31 

12636 1 29 170 115 

12637 >>> df.dtypes 

12638 age int64 

12639 height int64 

12640 weight int64 

12641 dtype: object 

12642 >>> df.values 

12643 array([[ 3, 94, 31], 

12644 [ 29, 170, 115]]) 

12645 

12646 A DataFrame with mixed type columns (e.g., str/object, int64, float32) 

12647 results in an ndarray of the broadest type that accommodates these 

12648 mixed types (e.g., object). 

12649 

12650 >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'), 

12651 ... ('lion', 80.5, 1), 

12652 ... ('monkey', np.nan, None)], 

12653 ... columns=('name', 'max_speed', 'rank')) 

12654 >>> df2.dtypes 

12655 name object 

12656 max_speed float64 

12657 rank object 

12658 dtype: object 

12659 >>> df2.values 

12660 array([['parrot', 24.0, 'second'], 

12661 ['lion', 80.5, 1], 

12662 ['monkey', nan, None]], dtype=object) 

12663 """ 

12664 return self._mgr.as_array() 

12665 

12666 
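A small sketch of the upcasting rule spelled out in the Notes of the ``values`` property above for the int64/uint64 case, using made-up data:

import numpy as np
import pandas as pd

df = pd.DataFrame({"i": np.array([1, 2], dtype="int64"),
                   "u": np.array([3, 4], dtype="uint64")})

# No integer dtype can hold both int64 and uint64, so .values
# (like .to_numpy()) upcasts the interleaved array to float64.
print(df.values.dtype)  # float64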

12667def _from_nested_dict(data) -> collections.defaultdict: 

12668 new_data: collections.defaultdict = collections.defaultdict(dict) 

12669 for index, s in data.items(): 

12670 for col, v in s.items(): 

12671 new_data[col][index] = v 

12672 return new_data 

12673 

12674 
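A minimal standalone sketch of what ``_from_nested_dict`` above produces: it transposes a row-oriented nested mapping ``{index: {column: value}}`` into the column-oriented ``{column: {index: value}}`` shape the constructor expects. The keys below are made up.

import collections

data = {"row1": {"a": 1, "b": 2}, "row2": {"a": 3, "b": 4}}

new_data = collections.defaultdict(dict)
for index, s in data.items():
    for col, v in s.items():
        new_data[col][index] = v

# new_data == {"a": {"row1": 1, "row2": 3}, "b": {"row1": 2, "row2": 4}}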

12675def _reindex_for_setitem( 

12676 value: DataFrame | Series, index: Index 

12677) -> tuple[ArrayLike, BlockValuesRefs | None]: 

12678 # reindex if necessary 

12679 

12680 if value.index.equals(index) or not len(index): 

12681 if using_copy_on_write() and isinstance(value, Series): 

12682 return value._values, value._references 

12683 return value._values.copy(), None 

12684 

12685 # GH#4107 

12686 try: 

12687 reindexed_value = value.reindex(index)._values 

12688 except ValueError as err: 

12689 # raised in MultiIndex.from_tuples, see test_insert_error_msmgs 

12690 if not value.index.is_unique: 

12691 # duplicate axis 

12692 raise err 

12693 

12694 raise TypeError( 

12695 "incompatible index of inserted column with frame index" 

12696 ) from err 

12697 return reindexed_value, None
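A rough usage-level sketch of the alignment ``_reindex_for_setitem`` performs when a Series with a different index is assigned into a frame: the value is reindexed onto the frame's index, so labels with no match become NaN. Column names and data below are made up.

import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]}, index=["x", "y", "z"])
s = pd.Series([10, 30], index=["x", "z"])

# Assignment aligns the Series on df.index (what the helper above does
# internally); "y" has no match and becomes NaN.
df["b"] = s
print(df["b"].isna().tolist())  # [False, True, False]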