Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/frame.py: 19%

1""" 

2DataFrame 

3--------- 

4An efficient 2D container for potentially mixed-type time series or other 

5labeled data series. 

6 

7Similar to its R counterpart, data.frame, except providing automatic data 

8alignment and a host of useful data manipulation methods having to do with the 

9labeling information 

10""" 

from __future__ import annotations

import collections
from collections import abc
import datetime
import functools
from io import StringIO
import itertools
import sys
from textwrap import dedent
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Hashable,
    Iterable,
    Iterator,
    Literal,
    Mapping,
    Sequence,
    cast,
    overload,
)
import warnings

import numpy as np
from numpy import ma

from pandas._config import (
    get_option,
    using_copy_on_write,
)

from pandas._libs import (
    algos as libalgos,
    lib,
    properties,
)
from pandas._libs.hashtable import duplicated
from pandas._libs.lib import (
    NoDefault,
    is_range_indexer,
    no_default,
)
from pandas._typing import (
    AggFuncType,
    AlignJoin,
    AnyAll,
    AnyArrayLike,
    ArrayLike,
    Axes,
    Axis,
    AxisInt,
    ColspaceArgType,
    CompressionOptions,
    CorrelationMethod,
    DropKeep,
    Dtype,
    DtypeObj,
    FilePath,
    FillnaOptions,
    FloatFormatType,
    FormattersType,
    Frequency,
    IgnoreRaise,
    IndexKeyFunc,
    IndexLabel,
    Level,
    MergeHow,
    NaPosition,
    PythonFuncType,
    QuantileInterpolation,
    ReadBuffer,
    Renamer,
    Scalar,
    SortKind,
    StorageOptions,
    Suffixes,
    TimedeltaConvertibleTypes,
    TimestampConvertibleTypes,
    ValueKeyFunc,
    WriteBuffer,
    npt,
)
from pandas.compat import PYPY
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import (
    function as nv,
    np_percentile_argname,
)
from pandas.errors import (
    ChainedAssignmentError,
    InvalidIndexError,
    _chained_assignment_msg,
)
from pandas.util._decorators import (
    Appender,
    Substitution,
    doc,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import (
    validate_ascending,
    validate_bool_kwarg,
    validate_percentile,
)

from pandas.core.dtypes.cast import (
    LossySetitemError,
    can_hold_element,
    construct_1d_arraylike_from_scalar,
    construct_2d_arraylike_from_scalar,
    find_common_type,
    infer_dtype_from_scalar,
    invalidate_string_dtypes,
    maybe_box_native,
    maybe_downcast_to_dtype,
)
from pandas.core.dtypes.common import (
    infer_dtype_from_object,
    is_1d_only_ea_dtype,
    is_bool_dtype,
    is_dataclass,
    is_dict_like,
    is_dtype_equal,
    is_extension_array_dtype,
    is_float,
    is_float_dtype,
    is_hashable,
    is_integer,
    is_integer_dtype,
    is_iterator,
    is_list_like,
    is_scalar,
    is_sequence,
    needs_i8_conversion,
    pandas_dtype,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.missing import (
    isna,
    notna,
)

from pandas.core import (
    algorithms,
    common as com,
    nanops,
    ops,
)
from pandas.core.accessor import CachedAccessor
from pandas.core.apply import (
    reconstruct_func,
    relabel_result,
)
from pandas.core.array_algos.take import take_2d_multi
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays import (
    DatetimeArray,
    ExtensionArray,
    PeriodArray,
    TimedeltaArray,
)
from pandas.core.arrays.arrow import ArrowDtype
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.construction import (
    ensure_wrapped_if_datetimelike,
    extract_array,
    sanitize_array,
    sanitize_masked_array,
)
from pandas.core.generic import NDFrame
from pandas.core.indexers import check_key_length
from pandas.core.indexes.api import (
    DatetimeIndex,
    Index,
    PeriodIndex,
    default_index,
    ensure_index,
    ensure_index_from_sequences,
)
from pandas.core.indexes.multi import (
    MultiIndex,
    maybe_droplevels,
)
from pandas.core.indexing import (
    check_bool_indexer,
    check_dict_or_set_indexers,
)
from pandas.core.internals import (
    ArrayManager,
    BlockManager,
)
from pandas.core.internals.construction import (
    arrays_to_mgr,
    dataclasses_to_dicts,
    dict_to_mgr,
    mgr_to_mgr,
    ndarray_to_mgr,
    nested_data_to_arrays,
    rec_array_to_mgr,
    reorder_arrays,
    to_arrays,
    treat_as_nested,
)
from pandas.core.methods import selectn
from pandas.core.reshape.melt import melt
from pandas.core.series import Series
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import (
    get_group_index,
    lexsort_indexer,
    nargsort,
)

from pandas.io.common import get_handle
from pandas.io.formats import (
    console,
    format as fmt,
)
from pandas.io.formats.info import (
    INFO_DOCSTRING,
    DataFrameInfo,
    frame_sub_kwargs,
)
import pandas.plotting

if TYPE_CHECKING:
    from pandas.core.groupby.generic import DataFrameGroupBy
    from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg
    from pandas.core.internals import SingleDataManager
    from pandas.core.resample import Resampler

    from pandas.io.formats.style import Styler

# ---------------------------------------------------------------------
# Docstring templates

_shared_doc_kwargs = {
    "axes": "index, columns",
    "klass": "DataFrame",
    "axes_single_arg": "{0 or 'index', 1 or 'columns'}",
    "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0
        If 0 or 'index': apply function to each column.
        If 1 or 'columns': apply function to each row.""",
    "inplace": """
    inplace : bool, default False
        Whether to modify the DataFrame rather than creating a new one.""",
    "optional_by": """
by : str or list of str
    Name or list of names to sort by.

    - if `axis` is 0 or `'index'` then `by` may contain index
      levels and/or column labels.
    - if `axis` is 1 or `'columns'` then `by` may contain column
      levels and/or index labels.""",
    "optional_reindex": """
labels : array-like, optional
    New labels / index to conform the axis specified by 'axis' to.
index : array-like, optional
    New labels for the index. Preferably an Index object to avoid
    duplicating data.
columns : array-like, optional
    New labels for the columns. Preferably an Index object to avoid
    duplicating data.
axis : int or str, optional
    Axis to target. Can be either the axis name ('index', 'columns')
    or number (0, 1).""",
    "replace_iloc": """
    This differs from updating with ``.loc`` or ``.iloc``, which require
    you to specify a location to update with some value.""",
}

_numeric_only_doc = """numeric_only : bool, default False
    Include only float, int, boolean data.
"""

_merge_doc = """
Merge DataFrame or named Series objects with a database-style join.

A named Series object is treated as a DataFrame with a single named column.

The join is done on columns or indexes. If joining columns on
columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
on indexes or indexes on a column or columns, the index will be passed on.
When performing a cross merge, no column specifications to merge on are
allowed.

.. warning::

    If both key columns contain rows where the key is a null value, those
    rows will be matched against each other. This is different from usual SQL
    join behaviour and can lead to unexpected results.

Parameters
----------%s
right : DataFrame or named Series
    Object to merge with.
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
    Type of merge to be performed.

    * left: use only keys from left frame, similar to a SQL left outer join;
      preserve key order.
    * right: use only keys from right frame, similar to a SQL right outer join;
      preserve key order.
    * outer: use union of keys from both frames, similar to a SQL full outer
      join; sort keys lexicographically.
    * inner: use intersection of keys from both frames, similar to a SQL inner
      join; preserve the order of the left keys.
    * cross: creates the cartesian product from both frames, preserves the order
      of the left keys.

    .. versionadded:: 1.2.0

on : label or list
    Column or index level names to join on. These must be found in both
    DataFrames. If `on` is None and not merging on indexes then this defaults
    to the intersection of the columns in both DataFrames.
left_on : label or list, or array-like
    Column or index level names to join on in the left DataFrame. Can also
    be an array or list of arrays of the length of the left DataFrame.
    These arrays are treated as if they are columns.
right_on : label or list, or array-like
    Column or index level names to join on in the right DataFrame. Can also
    be an array or list of arrays of the length of the right DataFrame.
    These arrays are treated as if they are columns.
left_index : bool, default False
    Use the index from the left DataFrame as the join key(s). If it is a
    MultiIndex, the number of keys in the other DataFrame (either the index
    or a number of columns) must match the number of levels.
right_index : bool, default False
    Use the index from the right DataFrame as the join key. Same caveats as
    left_index.
sort : bool, default False
    Sort the join keys lexicographically in the result DataFrame. If False,
    the order of the join keys depends on the join type (how keyword).
suffixes : list-like, default is ("_x", "_y")
    A length-2 sequence where each element is optionally a string
    indicating the suffix to add to overlapping column names in
    `left` and `right` respectively. Pass a value of `None` instead
    of a string to indicate that the column name from `left` or
    `right` should be left as-is, with no suffix. At least one of the
    values must not be None.
copy : bool, default True
    If False, avoid copy if possible.
indicator : bool or str, default False
    If True, adds a column to the output DataFrame called "_merge" with
    information on the source of each row. The column can be given a different
    name by providing a string argument. The column will have a Categorical
    type with the value of "left_only" for observations whose merge key only
    appears in the left DataFrame, "right_only" for observations
    whose merge key only appears in the right DataFrame, and "both"
    if the observation's merge key is found in both DataFrames.

validate : str, optional
    If specified, checks if merge is of specified type.

    * "one_to_one" or "1:1": check if merge keys are unique in both
      left and right datasets.
    * "one_to_many" or "1:m": check if merge keys are unique in left
      dataset.
    * "many_to_one" or "m:1": check if merge keys are unique in right
      dataset.
    * "many_to_many" or "m:m": allowed, but does not result in checks.

Returns
-------
DataFrame
    A DataFrame of the two merged objects.

See Also
--------
merge_ordered : Merge with optional filling/interpolation.
merge_asof : Merge on nearest keys.
DataFrame.join : Similar method using indices.

Notes
-----
Support for specifying index levels as the `on`, `left_on`, and
`right_on` parameters was added in version 0.23.0.
Support for merging named Series objects was added in version 0.24.0.

Examples
--------
>>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [1, 2, 3, 5]})
>>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
...                     'value': [5, 6, 7, 8]})
>>> df1
  lkey  value
0  foo      1
1  bar      2
2  baz      3
3  foo      5
>>> df2
  rkey  value
0  foo      5
1  bar      6
2  baz      7
3  foo      8

Merge df1 and df2 on the lkey and rkey columns. The value columns have
the default suffixes, _x and _y, appended.

>>> df1.merge(df2, left_on='lkey', right_on='rkey')
  lkey  value_x rkey  value_y
0  foo        1  foo        5
1  foo        1  foo        8
2  foo        5  foo        5
3  foo        5  foo        8
4  bar        2  bar        6
5  baz        3  baz        7

Merge DataFrames df1 and df2 with specified left and right suffixes
appended to any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey',
...           suffixes=('_left', '_right'))
  lkey  value_left rkey  value_right
0  foo           1  foo            5
1  foo           1  foo            8
2  foo           5  foo            5
3  foo           5  foo            8
4  bar           2  bar            6
5  baz           3  baz            7

Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
any overlapping columns.

>>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
Traceback (most recent call last):
...
ValueError: columns overlap but no suffix specified:
    Index(['value'], dtype='object')

>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
>>> df1
     a  b
0  foo  1
1  bar  2
>>> df2
     a  c
0  foo  3
1  baz  4

>>> df1.merge(df2, how='inner', on='a')
     a  b  c
0  foo  1  3

>>> df1.merge(df2, how='left', on='a')
     a  b    c
0  foo  1  3.0
1  bar  2  NaN

>>> df1 = pd.DataFrame({'left': ['foo', 'bar']})
>>> df2 = pd.DataFrame({'right': [7, 8]})
>>> df1
  left
0  foo
1  bar
>>> df2
   right
0      7
1      8

>>> df1.merge(df2, how='cross')
  left  right
0  foo      7
1  foo      8
2  bar      7
3  bar      8
"""


# -----------------------------------------------------------------------
# DataFrame class


class DataFrame(NDFrame, OpsMixin):
    """
    Two-dimensional, size-mutable, potentially heterogeneous tabular data.

    Data structure also contains labeled axes (rows and columns).
    Arithmetic operations align on both row and column labels. Can be
    thought of as a dict-like container for Series objects. The primary
    pandas data structure.

    Parameters
    ----------
    data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
        Dict can contain Series, arrays, constants, dataclass or list-like objects. If
        data is a dict, column order follows insertion-order. If a dict contains Series
        which have an index defined, it is aligned by its index. This alignment also
        occurs if data is a Series or a DataFrame itself. Alignment is done on
        Series/DataFrame inputs.

        If data is a list of dicts, column order follows insertion-order.

    index : Index or array-like
        Index to use for resulting frame. Will default to RangeIndex if
        no indexing information is part of the input data and no index is provided.
    columns : Index or array-like
        Column labels to use for resulting frame when data does not have them,
        defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels,
        will perform column selection instead.
    dtype : dtype, default None
        Data type to force. Only a single dtype is allowed. If None, infer.
    copy : bool or None, default None
        Copy data from inputs.
        For dict data, the default of None behaves like ``copy=True``. For DataFrame
        or 2d ndarray input, the default of None behaves like ``copy=False``.
        If data is a dict containing one or more Series (possibly of different dtypes),
        ``copy=False`` will ensure that these inputs are not copied.

        .. versionchanged:: 1.3.0

    See Also
    --------
    DataFrame.from_records : Constructor from tuples, also record arrays.
    DataFrame.from_dict : From dicts of Series, arrays, or dicts.
    read_csv : Read a comma-separated values (csv) file into DataFrame.
    read_table : Read general delimited file into DataFrame.
    read_clipboard : Read text from clipboard into DataFrame.

    Notes
    -----
    Please reference the :ref:`User Guide <basics.dataframe>` for more information.

    Examples
    --------
    Constructing DataFrame from a dictionary.

    >>> d = {'col1': [1, 2], 'col2': [3, 4]}
    >>> df = pd.DataFrame(data=d)
    >>> df
       col1  col2
    0     1     3
    1     2     4

    Notice that the inferred dtype is int64.

    >>> df.dtypes
    col1    int64
    col2    int64
    dtype: object

    To enforce a single dtype:

    >>> df = pd.DataFrame(data=d, dtype=np.int8)
    >>> df.dtypes
    col1    int8
    col2    int8
    dtype: object

    Constructing DataFrame from a dictionary including Series:

    >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
    >>> pd.DataFrame(data=d, index=[0, 1, 2, 3])
       col1  col2
    0     0   NaN
    1     1   NaN
    2     2   2.0
    3     3   3.0

    Constructing DataFrame from numpy ndarray:

    >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
    ...                    columns=['a', 'b', 'c'])
    >>> df2
       a  b  c
    0  1  2  3
    1  4  5  6
    2  7  8  9

    Constructing DataFrame from a numpy ndarray that has labeled columns:

    >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
    ...                 dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")])
    >>> df3 = pd.DataFrame(data, columns=['c', 'a'])
    ...
    >>> df3
       c  a
    0  3  1
    1  6  4
    2  9  7

    Constructing DataFrame from dataclass:

    >>> from dataclasses import make_dataclass
    >>> Point = make_dataclass("Point", [("x", int), ("y", int)])
    >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
       x  y
    0  0  0
    1  0  3
    2  2  3

    Constructing DataFrame from Series/DataFrame:

    >>> ser = pd.Series([1, 2, 3], index=["a", "b", "c"])
    >>> df = pd.DataFrame(data=ser, index=["a", "c"])
    >>> df
       0
    a  1
    c  3

    >>> df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"], columns=["x"])
    >>> df2 = pd.DataFrame(data=df1, index=["a", "c"])
    >>> df2
       x
    a  1
    c  3
    """

    _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set
    _typ = "dataframe"
    _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray)
    _accessors: set[str] = {"sparse"}
    _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([])
    _mgr: BlockManager | ArrayManager

    @property
    def _constructor(self) -> Callable[..., DataFrame]:
        return DataFrame

    _constructor_sliced: Callable[..., Series] = Series

    # ----------------------------------------------------------------------
    # Constructors

    def __init__(
        self,
        data=None,
        index: Axes | None = None,
        columns: Axes | None = None,
        dtype: Dtype | None = None,
        copy: bool | None = None,
    ) -> None:
        if dtype is not None:
            dtype = self._validate_dtype(dtype)

        if isinstance(data, DataFrame):
            data = data._mgr
            if not copy:
                # if not copying data, ensure to still return a shallow copy
                # to avoid the result sharing the same Manager
                data = data.copy(deep=False)

        if isinstance(data, (BlockManager, ArrayManager)):
            if using_copy_on_write():
                data = data.copy(deep=False)
            # first check if a Manager is passed without any other arguments
            # -> use fastpath (without checking Manager type)
            if index is None and columns is None and dtype is None and not copy:
                # GH#33357 fastpath
                NDFrame.__init__(self, data)
                return

        manager = get_option("mode.data_manager")

        # GH47215
        if index is not None and isinstance(index, set):
            raise ValueError("index cannot be a set")
        if columns is not None and isinstance(columns, set):
            raise ValueError("columns cannot be a set")

        if copy is None:
            if isinstance(data, dict):
                # retain pre-GH#38939 default behavior
                copy = True
            elif (
                manager == "array"
                and isinstance(data, (np.ndarray, ExtensionArray))
                and data.ndim == 2
            ):
                # INFO(ArrayManager) by default copy the 2D input array to get
                # contiguous 1D arrays
                copy = True
            elif using_copy_on_write() and not isinstance(
                data, (Index, DataFrame, Series)
            ):
                copy = True
            else:
                copy = False

        if data is None:
            index = index if index is not None else default_index(0)
            columns = columns if columns is not None else default_index(0)
            dtype = dtype if dtype is not None else pandas_dtype(object)
            data = []

        if isinstance(data, (BlockManager, ArrayManager)):
            mgr = self._init_mgr(
                data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
            )

        elif isinstance(data, dict):
            # GH#38939 de facto copy defaults to False only in non-dict cases
            mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
        elif isinstance(data, ma.MaskedArray):
            from numpy.ma import mrecords

            # masked recarray
            if isinstance(data, mrecords.MaskedRecords):
                raise TypeError(
                    "MaskedRecords are not supported. Pass "
                    "{name: data[name] for name in data.dtype.names} "
                    "instead"
                )

            # a masked array
            data = sanitize_masked_array(data)
            mgr = ndarray_to_mgr(
                data,
                index,
                columns,
                dtype=dtype,
                copy=copy,
                typ=manager,
            )

        elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)):
            if data.dtype.names:
                # i.e. numpy structured array
                data = cast(np.ndarray, data)
                mgr = rec_array_to_mgr(
                    data,
                    index,
                    columns,
                    dtype,
                    copy,
                    typ=manager,
                )
            elif getattr(data, "name", None) is not None:
                # i.e. Series/Index with non-None name
                _copy = copy if using_copy_on_write() else True
                mgr = dict_to_mgr(
                    # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no
                    # attribute "name"
                    {data.name: data},  # type: ignore[union-attr]
                    index,
                    columns,
                    dtype=dtype,
                    typ=manager,
                    copy=_copy,
                )
            else:
                mgr = ndarray_to_mgr(
                    data,
                    index,
                    columns,
                    dtype=dtype,
                    copy=copy,
                    typ=manager,
                )

        # For data is list-like, or Iterable (will consume into list)
        elif is_list_like(data):
            if not isinstance(data, abc.Sequence):
                if hasattr(data, "__array__"):
                    # GH#44616 big perf improvement for e.g. pytorch tensor
                    data = np.asarray(data)
                else:
                    data = list(data)
            if len(data) > 0:
                if is_dataclass(data[0]):
                    data = dataclasses_to_dicts(data)
                if not isinstance(data, np.ndarray) and treat_as_nested(data):
                    # exclude ndarray as we may have cast it a few lines above
                    if columns is not None:
                        columns = ensure_index(columns)
                    arrays, columns, index = nested_data_to_arrays(
                        # error: Argument 3 to "nested_data_to_arrays" has incompatible
                        # type "Optional[Collection[Any]]"; expected "Optional[Index]"
                        data,
                        columns,
                        index,  # type: ignore[arg-type]
                        dtype,
                    )
                    mgr = arrays_to_mgr(
                        arrays,
                        columns,
                        index,
                        dtype=dtype,
                        typ=manager,
                    )
                else:
                    mgr = ndarray_to_mgr(
                        data,
                        index,
                        columns,
                        dtype=dtype,
                        copy=copy,
                        typ=manager,
                    )
            else:
                mgr = dict_to_mgr(
                    {},
                    index,
                    columns if columns is not None else default_index(0),
                    dtype=dtype,
                    typ=manager,
                )
        # For data is scalar
        else:
            if index is None or columns is None:
                raise ValueError("DataFrame constructor not properly called!")

            index = ensure_index(index)
            columns = ensure_index(columns)

            if not dtype:
                dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True)

            # For data is a scalar extension dtype
            if isinstance(dtype, ExtensionDtype):
                # TODO(EA2D): special case not needed with 2D EAs

                values = [
                    construct_1d_arraylike_from_scalar(data, len(index), dtype)
                    for _ in range(len(columns))
                ]
                mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager)
            else:
                arr2d = construct_2d_arraylike_from_scalar(
                    data,
                    len(index),
                    len(columns),
                    dtype,
                    copy,
                )

                mgr = ndarray_to_mgr(
                    arr2d,
                    index,
                    columns,
                    dtype=arr2d.dtype,
                    copy=False,
                    typ=manager,
                )

        # ensure correct Manager type according to settings
        mgr = mgr_to_mgr(mgr, typ=manager)

        NDFrame.__init__(self, mgr)
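
    # A hedged sketch (not part of the original source) of which constructor
    # branch common inputs reach; every helper named below appears in the
    # dispatch above:
    #
    #   pd.DataFrame({"a": [1, 2]})                # dict        -> dict_to_mgr
    #   pd.DataFrame(np.ones((2, 2)))              # 2D ndarray  -> ndarray_to_mgr
    #   pd.DataFrame([{"a": 1}, {"a": 2}])         # nested data -> nested_data_to_arrays
    #   pd.DataFrame(0, index=[0], columns=["a"])  # scalar broadcast via
    #                                              # construct_2d_arraylike_from_scalar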

    # ----------------------------------------------------------------------
    def __dataframe__(
        self, nan_as_null: bool = False, allow_copy: bool = True
    ) -> DataFrameXchg:
        """
        Return the dataframe interchange object implementing the interchange protocol.

        Parameters
        ----------
        nan_as_null : bool, default False
            Whether to tell the DataFrame to overwrite null values in the data
            with ``NaN`` (or ``NaT``).
        allow_copy : bool, default True
            Whether to allow memory copying when exporting. If set to False
            it would cause non-zero-copy exports to fail.

        Returns
        -------
        DataFrame interchange object
            The object that a consuming library can use to ingest the dataframe.

        Notes
        -----
        Details on the interchange protocol:
        https://data-apis.org/dataframe-protocol/latest/index.html

        `nan_as_null` currently has no effect; once support for nullable extension
        dtypes is added, this value should be propagated to columns.
        """

        from pandas.core.interchange.dataframe import PandasDataFrameXchg

        return PandasDataFrameXchg(self, nan_as_null, allow_copy)
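
    # A minimal usage sketch (not in the original source): a consumer can
    # round-trip the interchange object via the public
    # ``pd.api.interchange.from_dataframe``:
    #
    #   >>> df = pd.DataFrame({"a": [1, 2]})
    #   >>> pd.api.interchange.from_dataframe(df.__dataframe__())
    #      a
    #   0  1
    #   1  2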

    # ----------------------------------------------------------------------

    @property
    def axes(self) -> list[Index]:
        """
        Return a list representing the axes of the DataFrame.

        It has the row axis labels and column axis labels as the only members.
        They are returned in that order.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.axes
        [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
        dtype='object')]
        """
        return [self.index, self.columns]

    @property
    def shape(self) -> tuple[int, int]:
        """
        Return a tuple representing the dimensionality of the DataFrame.

        See Also
        --------
        ndarray.shape : Tuple of array dimensions.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.shape
        (2, 2)

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
        ...                    'col3': [5, 6]})
        >>> df.shape
        (2, 3)
        """
        return len(self.index), len(self.columns)

    @property
    def _is_homogeneous_type(self) -> bool:
        """
        Whether all the columns in a DataFrame have the same type.

        Returns
        -------
        bool

        See Also
        --------
        Index._is_homogeneous_type : Whether the object has a single
            dtype.
        MultiIndex._is_homogeneous_type : Whether all the levels of a
            MultiIndex have the same dtype.

        Examples
        --------
        >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
        True
        >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
        False

        Items with the same type but different sizes are considered
        different types.

        >>> DataFrame({
        ...     "A": np.array([1, 2], dtype=np.int32),
        ...     "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
        False
        """
        if isinstance(self._mgr, ArrayManager):
            return len({arr.dtype for arr in self._mgr.arrays}) == 1
        if self._mgr.any_extension_types:
            return len({block.dtype for block in self._mgr.blocks}) == 1
        else:
            return not self._is_mixed_type

    @property
    def _can_fast_transpose(self) -> bool:
        """
        Can we transpose this DataFrame without creating any new array objects.
        """
        if isinstance(self._mgr, ArrayManager):
            return False
        blocks = self._mgr.blocks
        if len(blocks) != 1:
            return False

        dtype = blocks[0].dtype
        # TODO(EA2D) special case would be unnecessary with 2D EAs
        return not is_1d_only_ea_dtype(dtype)

    @property
    def _values(self) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray:
        """
        Analogue to ._values that may return a 2D ExtensionArray.
        """
        mgr = self._mgr

        if isinstance(mgr, ArrayManager):
            if len(mgr.arrays) == 1 and not is_1d_only_ea_dtype(mgr.arrays[0].dtype):
                # error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]"
                # has no attribute "reshape"
                return mgr.arrays[0].reshape(-1, 1)  # type: ignore[union-attr]
            return ensure_wrapped_if_datetimelike(self.values)

        blocks = mgr.blocks
        if len(blocks) != 1:
            return ensure_wrapped_if_datetimelike(self.values)

        arr = blocks[0].values
        if arr.ndim == 1:
            # non-2D ExtensionArray
            return self.values

        # more generally, whatever we allow in NDArrayBackedExtensionBlock
        arr = cast("np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", arr)
        return arr.T

    # ----------------------------------------------------------------------
    # Rendering Methods

    def _repr_fits_vertical_(self) -> bool:
        """
        Check length against max_rows.
        """
        max_rows = get_option("display.max_rows")
        return len(self) <= max_rows

    def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool:
        """
        Check if full repr fits in horizontal boundaries imposed by the display
        options width and max_columns.

        In case of non-interactive session, no boundaries apply.

        `ignore_width` is here so ipynb+HTML output can behave the way
        users expect. display.max_columns remains in effect.
        GH3541, GH3573
        """
        width, height = console.get_console_size()
        max_columns = get_option("display.max_columns")
        nb_columns = len(self.columns)

        # exceed max columns
        if (max_columns and nb_columns > max_columns) or (
            (not ignore_width) and width and nb_columns > (width // 2)
        ):
            return False

        # used by repr_html under IPython notebook or scripts ignore terminal
        # dims
        if ignore_width or width is None or not console.in_interactive_session():
            return True

        if get_option("display.width") is not None or console.in_ipython_frontend():
            # check at least the column row for excessive width
            max_rows = 1
        else:
            max_rows = get_option("display.max_rows")

        # when auto-detecting, so width=None and not in ipython front end
        # check whether repr fits horizontal by actually checking
        # the width of the rendered repr
        buf = StringIO()

        # only care about the stuff we'll actually print out
        # and to_string on entire frame may be expensive
        d = self

        if max_rows is not None:  # unlimited rows
            # min of two, where one may be None
            d = d.iloc[: min(max_rows, len(d))]
        else:
            return True

        d.to_string(buf=buf)
        value = buf.getvalue()
        repr_width = max(len(line) for line in value.split("\n"))

        return repr_width < width

    def _info_repr(self) -> bool:
        """
        True if the repr should show the info view.
        """
        info_repr_option = get_option("display.large_repr") == "info"
        return info_repr_option and not (
            self._repr_fits_horizontal_() and self._repr_fits_vertical_()
        )

    def __repr__(self) -> str:
        """
        Return a string representation for a particular DataFrame.
        """
        if self._info_repr():
            buf = StringIO()
            self.info(buf=buf)
            return buf.getvalue()

        repr_params = fmt.get_dataframe_repr_params()
        return self.to_string(**repr_params)

    def _repr_html_(self) -> str | None:
        """
        Return a html representation for a particular DataFrame.

        Mainly for IPython notebook.
        """
        if self._info_repr():
            buf = StringIO()
            self.info(buf=buf)
            # need to escape the <class>, should be the first line.
            val = buf.getvalue().replace("<", r"&lt;", 1)
            val = val.replace(">", r"&gt;", 1)
            return f"<pre>{val}</pre>"

        if get_option("display.notebook_repr_html"):
            max_rows = get_option("display.max_rows")
            min_rows = get_option("display.min_rows")
            max_cols = get_option("display.max_columns")
            show_dimensions = get_option("display.show_dimensions")

            formatter = fmt.DataFrameFormatter(
                self,
                columns=None,
                col_space=None,
                na_rep="NaN",
                formatters=None,
                float_format=None,
                sparsify=None,
                justify=None,
                index_names=True,
                header=True,
                index=True,
                bold_rows=True,
                escape=True,
                max_rows=max_rows,
                min_rows=min_rows,
                max_cols=max_cols,
                show_dimensions=show_dimensions,
                decimal=".",
            )
            return fmt.DataFrameRenderer(formatter).to_html(notebook=True)
        else:
            return None

    @overload
    def to_string(
        self,
        buf: None = ...,
        columns: Sequence[str] | None = ...,
        col_space: int | list[int] | dict[Hashable, int] | None = ...,
        header: bool | Sequence[str] = ...,
        index: bool = ...,
        na_rep: str = ...,
        formatters: fmt.FormattersType | None = ...,
        float_format: fmt.FloatFormatType | None = ...,
        sparsify: bool | None = ...,
        index_names: bool = ...,
        justify: str | None = ...,
        max_rows: int | None = ...,
        max_cols: int | None = ...,
        show_dimensions: bool = ...,
        decimal: str = ...,
        line_width: int | None = ...,
        min_rows: int | None = ...,
        max_colwidth: int | None = ...,
        encoding: str | None = ...,
    ) -> str:
        ...

    @overload
    def to_string(
        self,
        buf: FilePath | WriteBuffer[str],
        columns: Sequence[str] | None = ...,
        col_space: int | list[int] | dict[Hashable, int] | None = ...,
        header: bool | Sequence[str] = ...,
        index: bool = ...,
        na_rep: str = ...,
        formatters: fmt.FormattersType | None = ...,
        float_format: fmt.FloatFormatType | None = ...,
        sparsify: bool | None = ...,
        index_names: bool = ...,
        justify: str | None = ...,
        max_rows: int | None = ...,
        max_cols: int | None = ...,
        show_dimensions: bool = ...,
        decimal: str = ...,
        line_width: int | None = ...,
        min_rows: int | None = ...,
        max_colwidth: int | None = ...,
        encoding: str | None = ...,
    ) -> None:
        ...

    @Substitution(
        header_type="bool or sequence of str",
        header="Write out the column names. If a list of strings "
        "is given, it is assumed to be aliases for the "
        "column names",
        col_space_type="int, list or dict of int",
        col_space="The minimum width of each column. If a list of ints is given "
        "every integer corresponds with one column. If a dict is given, the key "
        "references the column, while the value defines the space to use.",
    )
    @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
    def to_string(
        self,
        buf: FilePath | WriteBuffer[str] | None = None,
        columns: Sequence[str] | None = None,
        col_space: int | list[int] | dict[Hashable, int] | None = None,
        header: bool | Sequence[str] = True,
        index: bool = True,
        na_rep: str = "NaN",
        formatters: fmt.FormattersType | None = None,
        float_format: fmt.FloatFormatType | None = None,
        sparsify: bool | None = None,
        index_names: bool = True,
        justify: str | None = None,
        max_rows: int | None = None,
        max_cols: int | None = None,
        show_dimensions: bool = False,
        decimal: str = ".",
        line_width: int | None = None,
        min_rows: int | None = None,
        max_colwidth: int | None = None,
        encoding: str | None = None,
    ) -> str | None:
        """
        Render a DataFrame to a console-friendly tabular output.
        %(shared_params)s
        line_width : int, optional
            Width to wrap a line in characters.
        min_rows : int, optional
            The number of rows to display in the console in a truncated repr
            (when number of rows is above `max_rows`).
        max_colwidth : int, optional
            Max width to truncate each column in characters. By default, no limit.
        encoding : str, default "utf-8"
            Set character encoding.
        %(returns)s
        See Also
        --------
        to_html : Convert DataFrame to HTML.

        Examples
        --------
        >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
        >>> df = pd.DataFrame(d)
        >>> print(df.to_string())
           col1  col2
        0     1     4
        1     2     5
        2     3     6
        """
        from pandas import option_context

        with option_context("display.max_colwidth", max_colwidth):
            formatter = fmt.DataFrameFormatter(
                self,
                columns=columns,
                col_space=col_space,
                na_rep=na_rep,
                formatters=formatters,
                float_format=float_format,
                sparsify=sparsify,
                justify=justify,
                index_names=index_names,
                header=header,
                index=index,
                min_rows=min_rows,
                max_rows=max_rows,
                max_cols=max_cols,
                show_dimensions=show_dimensions,
                decimal=decimal,
            )
            return fmt.DataFrameRenderer(formatter).to_string(
                buf=buf,
                encoding=encoding,
                line_width=line_width,
            )

    # ----------------------------------------------------------------------

    @property
    def style(self) -> Styler:
        """
        Returns a Styler object.

        Contains methods for building a styled HTML representation of the DataFrame.

        See Also
        --------
        io.formats.style.Styler : Helps style a DataFrame or Series according to the
            data with HTML and CSS.
        """
        from pandas.io.formats.style import Styler

        return Styler(self)
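
    # A hedged usage sketch (not in the original source); ``highlight_max``
    # and ``to_html`` are public Styler methods:
    #
    #   >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    #   >>> html = df.style.highlight_max(axis=0).to_html()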

    _shared_docs[
        "items"
    ] = r"""
    Iterate over (column name, Series) pairs.

    Iterates over the DataFrame columns, returning a tuple with
    the column name and the content as a Series.

    Yields
    ------
    label : object
        The column names for the DataFrame being iterated over.
    content : Series
        The column entries belonging to each label, as a Series.

    See Also
    --------
    DataFrame.iterrows : Iterate over DataFrame rows as
        (index, Series) pairs.
    DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
        of the values.

    Examples
    --------
    >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
    ...                    'population': [1864, 22000, 80000]},
    ...                   index=['panda', 'polar', 'koala'])
    >>> df
             species  population
    panda       bear        1864
    polar       bear       22000
    koala  marsupial       80000
    >>> for label, content in df.items():
    ...     print(f'label: {label}')
    ...     print(f'content: {content}', sep='\n')
    ...
    label: species
    content:
    panda         bear
    polar         bear
    koala    marsupial
    Name: species, dtype: object
    label: population
    content:
    panda     1864
    polar    22000
    koala    80000
    Name: population, dtype: int64
    """

    @Appender(_shared_docs["items"])
    def items(self) -> Iterable[tuple[Hashable, Series]]:
        if self.columns.is_unique and hasattr(self, "_item_cache"):
            for k in self.columns:
                yield k, self._get_item_cache(k)
        else:
            for i, k in enumerate(self.columns):
                yield k, self._ixs(i, axis=1)

    def iterrows(self) -> Iterable[tuple[Hashable, Series]]:
        """
        Iterate over DataFrame rows as (index, Series) pairs.

        Yields
        ------
        index : label or tuple of label
            The index of the row. A tuple for a `MultiIndex`.
        data : Series
            The data of the row as a Series.

        See Also
        --------
        DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values.
        DataFrame.items : Iterate over (column name, Series) pairs.

        Notes
        -----
        1. Because ``iterrows`` returns a Series for each row,
           it does **not** preserve dtypes across the rows (dtypes are
           preserved across columns for DataFrames). For example,

           >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
           >>> row = next(df.iterrows())[1]
           >>> row
           int      1.0
           float    1.5
           Name: 0, dtype: float64
           >>> print(row['int'].dtype)
           float64
           >>> print(df['int'].dtype)
           int64

           To preserve dtypes while iterating over the rows, it is better
           to use :meth:`itertuples` which returns namedtuples of the values
           and which is generally faster than ``iterrows``.

        2. You should **never modify** something you are iterating over.
           This is not guaranteed to work in all cases. Depending on the
           data types, the iterator returns a copy and not a view, and writing
           to it will have no effect.
        """
        columns = self.columns
        klass = self._constructor_sliced
        using_cow = using_copy_on_write()
        for k, v in zip(self.index, self.values):
            s = klass(v, index=columns, name=k).__finalize__(self)
            if using_cow and self._mgr.is_single_block:
                s._mgr.add_references(self._mgr)  # type: ignore[arg-type]
            yield k, s

    def itertuples(
        self, index: bool = True, name: str | None = "Pandas"
    ) -> Iterable[tuple[Any, ...]]:
        """
        Iterate over DataFrame rows as namedtuples.

        Parameters
        ----------
        index : bool, default True
            If True, return the index as the first element of the tuple.
        name : str or None, default "Pandas"
            The name of the returned namedtuples or None to return regular
            tuples.

        Returns
        -------
        iterator
            An object to iterate over namedtuples for each row in the
            DataFrame with the first field possibly being the index and
            following fields being the column values.

        See Also
        --------
        DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
            pairs.
        DataFrame.items : Iterate over (column name, Series) pairs.

        Notes
        -----
        The column names will be renamed to positional names if they are
        invalid Python identifiers, repeated, or start with an underscore.

        Examples
        --------
        >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
        ...                   index=['dog', 'hawk'])
        >>> df
              num_legs  num_wings
        dog          4          0
        hawk         2          2
        >>> for row in df.itertuples():
        ...     print(row)
        ...
        Pandas(Index='dog', num_legs=4, num_wings=0)
        Pandas(Index='hawk', num_legs=2, num_wings=2)

        By setting the `index` parameter to False we can remove the index
        as the first element of the tuple:

        >>> for row in df.itertuples(index=False):
        ...     print(row)
        ...
        Pandas(num_legs=4, num_wings=0)
        Pandas(num_legs=2, num_wings=2)

        With the `name` parameter set we set a custom name for the yielded
        namedtuples:

        >>> for row in df.itertuples(name='Animal'):
        ...     print(row)
        ...
        Animal(Index='dog', num_legs=4, num_wings=0)
        Animal(Index='hawk', num_legs=2, num_wings=2)
        """
        arrays = []
        fields = list(self.columns)
        if index:
            arrays.append(self.index)
            fields.insert(0, "Index")

        # use integer indexing because of possible duplicate column names
        arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))

        if name is not None:
            # https://github.com/python/mypy/issues/9046
            # error: namedtuple() expects a string literal as the first argument
            itertuple = collections.namedtuple(  # type: ignore[misc]
                name, fields, rename=True
            )
            return map(itertuple._make, zip(*arrays))

        # fallback to regular tuples
        return zip(*arrays)

    def __len__(self) -> int:
        """
        Returns length of info axis, but here we use the index.
        """
        return len(self.index)

    @overload
    def dot(self, other: Series) -> Series:
        ...

    @overload
    def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame:
        ...

    def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
        """
        Compute the matrix multiplication between the DataFrame and other.

        This method computes the matrix product between the DataFrame and the
        values of another Series, DataFrame or a numpy array.

        It can also be called using ``self @ other`` in Python >= 3.5.

        Parameters
        ----------
        other : Series, DataFrame or array-like
            The other object to compute the matrix product with.

        Returns
        -------
        Series or DataFrame
            If other is a Series, return the matrix product between self and
            other as a Series. If other is a DataFrame or a numpy.array, return
            the matrix product of self and other in a DataFrame.

        See Also
        --------
        Series.dot : Similar method for Series.

        Notes
        -----
        The dimensions of DataFrame and other must be compatible in order to
        compute the matrix multiplication. In addition, the column names of
        DataFrame and the index of other must contain the same values, as they
        will be aligned prior to the multiplication.

        The dot method for Series computes the inner product, instead of the
        matrix product here.

        Examples
        --------
        Here we multiply a DataFrame with a Series.

        >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
        >>> s = pd.Series([1, 1, 2, 1])
        >>> df.dot(s)
        0    -4
        1     5
        dtype: int64

        Here we multiply a DataFrame with another DataFrame.

        >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
        >>> df.dot(other)
           0  1
        0  1  4
        1  2  2

        Note that the dot method gives the same result as @

        >>> df @ other
           0  1
        0  1  4
        1  2  2

        The dot method also works if other is an np.array.

        >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
        >>> df.dot(arr)
           0  1
        0  1  4
        1  2  2

        Note how shuffling of the objects does not change the result.

        >>> s2 = s.reindex([1, 0, 2, 3])
        >>> df.dot(s2)
        0    -4
        1     5
        dtype: int64
        """
        if isinstance(other, (Series, DataFrame)):
            common = self.columns.union(other.index)
            if len(common) > len(self.columns) or len(common) > len(other.index):
                raise ValueError("matrices are not aligned")

            left = self.reindex(columns=common, copy=False)
            right = other.reindex(index=common, copy=False)
            lvals = left.values
            rvals = right._values
        else:
            left = self
            lvals = self.values
            rvals = np.asarray(other)
            if lvals.shape[1] != rvals.shape[0]:
                raise ValueError(
                    f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}"
                )

        if isinstance(other, DataFrame):
            return self._constructor(
                np.dot(lvals, rvals),
                index=left.index,
                columns=other.columns,
                copy=False,
            )
        elif isinstance(other, Series):
            return self._constructor_sliced(
                np.dot(lvals, rvals), index=left.index, copy=False
            )
        elif isinstance(rvals, (np.ndarray, Index)):
            result = np.dot(lvals, rvals)
            if result.ndim == 2:
                return self._constructor(result, index=left.index, copy=False)
            else:
                return self._constructor_sliced(result, index=left.index, copy=False)
        else:  # pragma: no cover
            raise TypeError(f"unsupported type: {type(other)}")

    @overload
    def __matmul__(self, other: Series) -> Series:
        ...

    @overload
    def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
        ...

    def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
        """
        Matrix multiplication using binary `@` operator in Python>=3.5.
        """
        return self.dot(other)

    def __rmatmul__(self, other) -> DataFrame:
        """
        Matrix multiplication using binary `@` operator in Python>=3.5.
        """
        try:
            return self.T.dot(np.transpose(other)).T
        except ValueError as err:
            if "shape mismatch" not in str(err):
                raise
            # GH#21581 give exception message for original shapes
            msg = f"shapes {np.shape(other)} and {self.shape} not aligned"
            raise ValueError(msg) from err

    # ----------------------------------------------------------------------
    # IO methods (to / from other formats)

    @classmethod
    def from_dict(
        cls,
        data: dict,
        orient: str = "columns",
        dtype: Dtype | None = None,
        columns: Axes | None = None,
    ) -> DataFrame:
        """
        Construct DataFrame from dict of array-like or dicts.

        Creates DataFrame object from dictionary by columns or by index
        allowing dtype specification.

        Parameters
        ----------
        data : dict
            Of the form {field : array-like} or {field : dict}.
        orient : {'columns', 'index', 'tight'}, default 'columns'
            The "orientation" of the data. If the keys of the passed dict
            should be the columns of the resulting DataFrame, pass 'columns'
            (default). Otherwise if the keys should be rows, pass 'index'.
            If 'tight', assume a dict with keys ['index', 'columns', 'data',
            'index_names', 'column_names'].

            .. versionadded:: 1.4.0
               'tight' as an allowed value for the ``orient`` argument

        dtype : dtype, default None
            Data type to force after DataFrame construction, otherwise infer.
        columns : list, default None
            Column labels to use when ``orient='index'``. Raises a ValueError
            if used with ``orient='columns'`` or ``orient='tight'``.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.from_records : DataFrame from structured ndarray, sequence
            of tuples or dicts, or DataFrame.
        DataFrame : DataFrame object creation using constructor.
        DataFrame.to_dict : Convert the DataFrame to a dictionary.

        Examples
        --------
        By default the keys of the dict become the DataFrame columns:

        >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
        >>> pd.DataFrame.from_dict(data)
           col_1 col_2
        0      3     a
        1      2     b
        2      1     c
        3      0     d

        Specify ``orient='index'`` to create the DataFrame using dictionary
        keys as rows:

        >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
        >>> pd.DataFrame.from_dict(data, orient='index')
               0  1  2  3
        row_1  3  2  1  0
        row_2  a  b  c  d

        When using the 'index' orientation, the column names can be
        specified manually:

        >>> pd.DataFrame.from_dict(data, orient='index',
        ...                        columns=['A', 'B', 'C', 'D'])
               A  B  C  D
        row_1  3  2  1  0
        row_2  a  b  c  d

        Specify ``orient='tight'`` to create the DataFrame using a 'tight'
        format:

        >>> data = {'index': [('a', 'b'), ('a', 'c')],
        ...         'columns': [('x', 1), ('y', 2)],
        ...         'data': [[1, 3], [2, 4]],
        ...         'index_names': ['n1', 'n2'],
        ...         'column_names': ['z1', 'z2']}
        >>> pd.DataFrame.from_dict(data, orient='tight')
        z1     x  y
        z2     1  2
        n1 n2
        a  b   1  3
           c   2  4
        """
        index = None
        orient = orient.lower()
        if orient == "index":
            if len(data) > 0:
                # TODO speed up Series case
                if isinstance(list(data.values())[0], (Series, dict)):
                    data = _from_nested_dict(data)
                else:
                    index = list(data.keys())
                    # error: Incompatible types in assignment (expression has type
                    # "List[Any]", variable has type "Dict[Any, Any]")
                    data = list(data.values())  # type: ignore[assignment]
        elif orient in ("columns", "tight"):
            if columns is not None:
                raise ValueError(f"cannot use columns parameter with orient='{orient}'")
        else:  # pragma: no cover
            raise ValueError(
                f"Expected 'index', 'columns' or 'tight' for orient parameter. "
                f"Got '{orient}' instead"
            )

        if orient != "tight":
            return cls(data, index=index, columns=columns, dtype=dtype)
        else:
            realdata = data["data"]

            def create_index(indexlist, namelist):
                index: Index
                if len(namelist) > 1:
                    index = MultiIndex.from_tuples(indexlist, names=namelist)
                else:
                    index = Index(indexlist, name=namelist[0])
                return index

            index = create_index(data["index"], data["index_names"])
            columns = create_index(data["columns"], data["column_names"])
            return cls(realdata, index=index, columns=columns, dtype=dtype)

1776 def to_numpy( 

1777 self, 

1778 dtype: npt.DTypeLike | None = None, 

1779 copy: bool = False, 

1780 na_value: object = lib.no_default, 

1781 ) -> np.ndarray: 

1782 """ 

1783 Convert the DataFrame to a NumPy array. 

1784 

1785 By default, the dtype of the returned array will be the common NumPy 

1786 dtype of all types in the DataFrame. For example, if the dtypes are 

1787 ``float16`` and ``float32``, the results dtype will be ``float32``. 

1788 This may require copying data and coercing values, which may be 

1789 expensive. 

1790 

1791 Parameters 

1792 ---------- 

1793 dtype : str or numpy.dtype, optional 

1794 The dtype to pass to :meth:`numpy.asarray`. 

1795 copy : bool, default False 

1796 Whether to ensure that the returned value is not a view on 

1797 another array. Note that ``copy=False`` does not *ensure* that 

1798 ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that 

1799 a copy is made, even if not strictly necessary. 

1800 na_value : Any, optional 

1801 The value to use for missing values. The default value depends 

1802 on `dtype` and the dtypes of the DataFrame columns. 

1803 

1804 .. versionadded:: 1.1.0 

1805 

1806 Returns 

1807 ------- 

1808 numpy.ndarray 

1809 

1810 See Also 

1811 -------- 

1812 Series.to_numpy : Similar method for Series. 

1813 

1814 Examples 

1815 -------- 

1816 >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy() 

1817 array([[1, 3], 

1818 [2, 4]]) 

1819 

1820 With heterogeneous data, the lowest common type will have to 

1821 be used. 

1822 

1823 >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]}) 

1824 >>> df.to_numpy() 

1825 array([[1. , 3. ], 

1826 [2. , 4.5]]) 

1827 

1828 For a mix of numeric and non-numeric types, the output array will 

1829 have object dtype. 

1830 

1831 >>> df['C'] = pd.date_range('2000', periods=2) 

1832 >>> df.to_numpy() 

1833 array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], 

1834 [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) 

1835 """ 

1836 if dtype is not None: 

1837 dtype = np.dtype(dtype) 

1838 result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value) 

1839 if result.dtype is not dtype: 

1840 result = np.array(result, dtype=dtype, copy=False) 

1841 

1842 return result 

1843 
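# A small sketch of the ``copy`` semantics described above: with ``copy=True``
# the returned array is guaranteed to be independent of the frame (example
# values are illustrative):
# >>> df = pd.DataFrame({"A": [1, 2]})
# >>> arr = df.to_numpy(copy=True)
# >>> arr[0, 0] = 99  # mutating the copy does not touch df
# >>> int(df.loc[0, "A"])
# 1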

1844 def _create_data_for_split_and_tight_to_dict( 

1845 self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int] 

1846 ) -> list: 

1847 """ 

1848 Simple helper method to create the main output data for 

1849 ``to_dict(orient="split")`` and ``to_dict(orient="tight")``. 

1850 """ 

1851 if are_all_object_dtype_cols: 

1852 data = [ 

1853 list(map(maybe_box_native, t)) 

1854 for t in self.itertuples(index=False, name=None) 

1855 ] 

1856 else: 

1857 data = [list(t) for t in self.itertuples(index=False, name=None)] 

1858 if object_dtype_indices: 

1859 # If we have object dtype columns, apply maybe_box_native after the list 

1860 # comprehension for perf 

1861 for row in data: 

1862 for i in object_dtype_indices: 

1863 row[i] = maybe_box_native(row[i]) 

1864 return data 

1865 
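# Illustration of this private helper (hypothetical call; internal API, subject
# to change): for a frame whose object column sits at position 1, native boxing
# is applied to that position only.
# >>> df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
# >>> df._create_data_for_split_and_tight_to_dict(False, [1])  # doctest: +SKIP
# [[1, 'x'], [2, 'y']]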

1866 @overload 

1867 def to_dict( 

1868 self, 

1869 orient: Literal["dict", "list", "series", "split", "tight", "index"] = ..., 

1870 into: type[dict] = ..., 

1871 ) -> dict: 

1872 ... 

1873 

1874 @overload 

1875 def to_dict(self, orient: Literal["records"], into: type[dict] = ...) -> list[dict]: 

1876 ... 

1877 

1878 def to_dict( 

1879 self, 

1880 orient: Literal[ 

1881 "dict", "list", "series", "split", "tight", "records", "index" 

1882 ] = "dict", 

1883 into: type[dict] = dict, 

1884 index: bool = True, 

1885 ) -> dict | list[dict]: 

1886 """ 

1887 Convert the DataFrame to a dictionary. 

1888 

1889 The type of the key-value pairs can be customized with the parameters 

1890 (see below). 

1891 

1892 Parameters 

1893 ---------- 

1894 orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'} 

1895 Determines the type of the values of the dictionary. 

1896 

1897 - 'dict' (default) : dict like {column -> {index -> value}} 

1898 - 'list' : dict like {column -> [values]} 

1899 - 'series' : dict like {column -> Series(values)} 

1900 - 'split' : dict like 

1901 {'index' -> [index], 'columns' -> [columns], 'data' -> [values]} 

1902 - 'tight' : dict like 

1903 {'index' -> [index], 'columns' -> [columns], 'data' -> [values], 

1904 'index_names' -> [index.names], 'column_names' -> [column.names]} 

1905 - 'records' : list like 

1906 [{column -> value}, ... , {column -> value}] 

1907 - 'index' : dict like {index -> {column -> value}} 

1908 

1909 .. versionadded:: 1.4.0 

1910 'tight' as an allowed value for the ``orient`` argument 

1911 

1912 into : class, default dict 

1913 The collections.abc.Mapping subclass used for all Mappings 

1914 in the return value. Can be the actual class or an empty 

1915 instance of the mapping type you want. If you want a 

1916 collections.defaultdict, you must pass it initialized. 

1917 

1918 index : bool, default True 

1919 Whether to include the index item (and index_names item if `orient` 

1920 is 'tight') in the returned dictionary. Can only be ``False`` 

1921 when `orient` is 'split' or 'tight'. 

1922 

1923 .. versionadded:: 2.0.0 

1924 

1925 Returns 

1926 ------- 

1927 dict, list or collections.abc.Mapping 

1928 Return a collections.abc.Mapping object representing the DataFrame. 

1929 The resulting transformation depends on the `orient` parameter. 

1930 

1931 See Also 

1932 -------- 

1933 DataFrame.from_dict: Create a DataFrame from a dictionary. 

1934 DataFrame.to_json: Convert a DataFrame to JSON format. 

1935 

1936 Examples 

1937 -------- 

1938 >>> df = pd.DataFrame({'col1': [1, 2], 

1939 ... 'col2': [0.5, 0.75]}, 

1940 ... index=['row1', 'row2']) 

1941 >>> df 

1942 col1 col2 

1943 row1 1 0.50 

1944 row2 2 0.75 

1945 >>> df.to_dict() 

1946 {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}} 

1947 

1948 You can specify the return orientation. 

1949 

1950 >>> df.to_dict('series') 

1951 {'col1': row1 1 

1952 row2 2 

1953 Name: col1, dtype: int64, 

1954 'col2': row1 0.50 

1955 row2 0.75 

1956 Name: col2, dtype: float64} 

1957 

1958 >>> df.to_dict('split') 

1959 {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], 

1960 'data': [[1, 0.5], [2, 0.75]]} 

1961 

1962 >>> df.to_dict('records') 

1963 [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}] 

1964 

1965 >>> df.to_dict('index') 

1966 {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}} 

1967 

1968 >>> df.to_dict('tight') 

1969 {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], 

1970 'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]} 

1971 

1972 You can also specify the mapping type. 

1973 

1974 >>> from collections import OrderedDict, defaultdict 

1975 >>> df.to_dict(into=OrderedDict) 

1976 OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])), 

1977 ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))]) 

1978 

1979 If you want a `defaultdict`, you need to initialize it: 

1980 

1981 >>> dd = defaultdict(list) 

1982 >>> df.to_dict('records', into=dd) 

1983 [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}), 

1984 defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})] 

1985 """ 

1986 from pandas.core.methods.to_dict import to_dict 

1987 

1988 return to_dict(self, orient, into, index) 

1989 
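# A sketch of the ``index`` keyword added in 2.0 (reusing the ``df`` from the
# docstring above): with 'split' or 'tight', ``index=False`` drops the
# index-related keys from the result.
# >>> df.to_dict('split', index=False)  # doctest: +SKIP
# {'columns': ['col1', 'col2'], 'data': [[1, 0.5], [2, 0.75]]}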

1990 def to_gbq( 

1991 self, 

1992 destination_table: str, 

1993 project_id: str | None = None, 

1994 chunksize: int | None = None, 

1995 reauth: bool = False, 

1996 if_exists: str = "fail", 

1997 auth_local_webserver: bool = True, 

1998 table_schema: list[dict[str, str]] | None = None, 

1999 location: str | None = None, 

2000 progress_bar: bool = True, 

2001 credentials=None, 

2002 ) -> None: 

2003 """ 

2004 Write a DataFrame to a Google BigQuery table. 

2005 

2006 This function requires the `pandas-gbq package 

2007 <https://pandas-gbq.readthedocs.io>`__. 

2008 

2009 See the `How to authenticate with Google BigQuery 

2010 <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__ 

2011 guide for authentication instructions. 

2012 

2013 Parameters 

2014 ---------- 

2015 destination_table : str 

2016 Name of table to be written, in the form ``dataset.tablename``. 

2017 project_id : str, optional 

2018 Google BigQuery Account project ID. Optional when available from 

2019 the environment. 

2020 chunksize : int, optional 

2021 Number of rows to be inserted in each chunk from the dataframe. 

2022 Set to ``None`` to load the whole dataframe at once. 

2023 reauth : bool, default False 

2024 Force Google BigQuery to re-authenticate the user. This is useful 

2025 if multiple accounts are used. 

2026 if_exists : str, default 'fail' 

2027 Behavior when the destination table exists. Value can be one of: 

2028 

2029 ``'fail'`` 

2030 If table exists raise pandas_gbq.gbq.TableCreationError. 

2031 ``'replace'`` 

2032 If table exists, drop it, recreate it, and insert data. 

2033 ``'append'`` 

2034 If table exists, insert data. Create the table if it does not exist. 

2035 auth_local_webserver : bool, default True 

2036 Use the `local webserver flow`_ instead of the `console flow`_ 

2037 when getting user credentials. 

2038 

2039 .. _local webserver flow: 

2040 https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server 

2041 .. _console flow: 

2042 https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console 

2043 

2044 *New in version 0.2.0 of pandas-gbq*. 

2045 

2046 .. versionchanged:: 1.5.0 

2047 Default value is changed to ``True``. Google has deprecated the 

2048 ``auth_local_webserver = False`` `"out of band" (copy-paste) 

2049 flow 

2050 <https://developers.googleblog.com/2022/02/making-oauth-flows-safer.html?m=1#disallowed-oob>`_. 

2051 table_schema : list of dicts, optional 

2052 List of BigQuery table fields to which the corresponding DataFrame 

2053 columns conform, e.g. ``[{'name': 'col1', 'type': 

2054 'STRING'},...]``. If schema is not provided, it will be 

2055 generated according to dtypes of DataFrame columns. See 

2056 BigQuery API documentation for the available field names. 

2057 

2058 *New in version 0.3.1 of pandas-gbq*. 

2059 location : str, optional 

2060 Location where the load job should run. See the `BigQuery locations 

2061 documentation 

2062 <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a 

2063 list of available locations. The location must match that of the 

2064 target dataset. 

2065 

2066 *New in version 0.5.0 of pandas-gbq*. 

2067 progress_bar : bool, default True 

2068 Use the library `tqdm` to show the progress bar for the upload, 

2069 chunk by chunk. 

2070 

2071 *New in version 0.5.0 of pandas-gbq*. 

2072 credentials : google.auth.credentials.Credentials, optional 

2073 Credentials for accessing Google APIs. Use this parameter to 

2074 override default credentials, such as to use Compute Engine 

2075 :class:`google.auth.compute_engine.Credentials` or Service 

2076 Account :class:`google.oauth2.service_account.Credentials` 

2077 directly. 

2078 

2079 *New in version 0.8.0 of pandas-gbq*. 

2080 

2081 See Also 

2082 -------- 

2083 pandas_gbq.to_gbq : This function in the pandas-gbq library. 

2084 read_gbq : Read a DataFrame from Google BigQuery. 

2085 """ 

2086 from pandas.io import gbq 

2087 

2088 gbq.to_gbq( 

2089 self, 

2090 destination_table, 

2091 project_id=project_id, 

2092 chunksize=chunksize, 

2093 reauth=reauth, 

2094 if_exists=if_exists, 

2095 auth_local_webserver=auth_local_webserver, 

2096 table_schema=table_schema, 

2097 location=location, 

2098 progress_bar=progress_bar, 

2099 credentials=credentials, 

2100 ) 

2101 
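# A hedged sketch of a typical call; the dataset, table and project names are
# hypothetical, and the pandas-gbq package must be installed.
# >>> df.to_gbq('my_dataset.my_table',
# ...           project_id='my-project',
# ...           if_exists='append')  # doctest: +SKIP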

2102 @classmethod 

2103 def from_records( 

2104 cls, 

2105 data, 

2106 index=None, 

2107 exclude=None, 

2108 columns=None, 

2109 coerce_float: bool = False, 

2110 nrows: int | None = None, 

2111 ) -> DataFrame: 

2112 """ 

2113 Convert structured or record ndarray to DataFrame. 

2114 

2115 Creates a DataFrame object from a structured ndarray, sequence of 

2116 tuples or dicts, or DataFrame. 

2117 

2118 Parameters 

2119 ---------- 

2120 data : structured ndarray, sequence of tuples or dicts, or DataFrame 

2121 Structured input data. 

2122 index : str, list of fields, array-like 

2123 Field of array to use as the index, alternately a specific set of 

2124 input labels to use. 

2125 exclude : sequence, default None 

2126 Columns or fields to exclude. 

2127 columns : sequence, default None 

2128 Column names to use. If the passed data do not have names 

2129 associated with them, this argument provides names for the 

2130 columns. Otherwise this argument indicates the order of the columns 

2131 in the result (any names not found in the data will become all-NA 

2132 columns). 

2133 coerce_float : bool, default False 

2134 Attempt to convert values of non-string, non-numeric objects (like 

2135 decimal.Decimal) to floating point, useful for SQL result sets. 

2136 nrows : int, default None 

2137 Number of rows to read if data is an iterator. 

2138 

2139 Returns 

2140 ------- 

2141 DataFrame 

2142 

2143 See Also 

2144 -------- 

2145 DataFrame.from_dict : DataFrame from dict of array-like or dicts. 

2146 DataFrame : DataFrame object creation using constructor. 

2147 

2148 Examples 

2149 -------- 

2150 Data can be provided as a structured ndarray: 

2151 

2152 >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')], 

2153 ... dtype=[('col_1', 'i4'), ('col_2', 'U1')]) 

2154 >>> pd.DataFrame.from_records(data) 

2155 col_1 col_2 

2156 0 3 a 

2157 1 2 b 

2158 2 1 c 

2159 3 0 d 

2160 

2161 Data can be provided as a list of dicts: 

2162 

2163 >>> data = [{'col_1': 3, 'col_2': 'a'}, 

2164 ... {'col_1': 2, 'col_2': 'b'}, 

2165 ... {'col_1': 1, 'col_2': 'c'}, 

2166 ... {'col_1': 0, 'col_2': 'd'}] 

2167 >>> pd.DataFrame.from_records(data) 

2168 col_1 col_2 

2169 0 3 a 

2170 1 2 b 

2171 2 1 c 

2172 3 0 d 

2173 

2174 Data can be provided as a list of tuples with corresponding columns: 

2175 

2176 >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')] 

2177 >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2']) 

2178 col_1 col_2 

2179 0 3 a 

2180 1 2 b 

2181 2 1 c 

2182 3 0 d 

2183 """ 

2184 if isinstance(data, DataFrame): 

2185 if columns is not None: 

2186 if is_scalar(columns): 

2187 columns = [columns] 

2188 data = data[columns] 

2189 if index is not None: 

2190 data = data.set_index(index) 

2191 if exclude is not None: 

2192 data = data.drop(columns=exclude) 

2193 return data.copy(deep=False) 

2194 

2195 result_index = None 

2196 

2197 # Make a copy of the input columns so we can modify it 

2198 if columns is not None: 

2199 columns = ensure_index(columns) 

2200 

2201 def maybe_reorder( 

2202 arrays: list[ArrayLike], arr_columns: Index, columns: Index, index 

2203 ) -> tuple[list[ArrayLike], Index, Index | None]: 

2204 """ 

2205 If our desired 'columns' do not match the data's pre-existing 'arr_columns', 

2206 we re-order our arrays. This is like a pre-emptive (cheap) reindex. 

2207 """ 

2208 if len(arrays): 

2209 length = len(arrays[0]) 

2210 else: 

2211 length = 0 

2212 

2213 result_index = None 

2214 if len(arrays) == 0 and index is None and length == 0: 

2215 result_index = default_index(0) 

2216 

2217 arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length) 

2218 return arrays, arr_columns, result_index 

2219 

2220 if is_iterator(data): 

2221 if nrows == 0: 

2222 return cls() 

2223 

2224 try: 

2225 first_row = next(data) 

2226 except StopIteration: 

2227 return cls(index=index, columns=columns) 

2228 

2229 dtype = None 

2230 if hasattr(first_row, "dtype") and first_row.dtype.names: 

2231 dtype = first_row.dtype 

2232 

2233 values = [first_row] 

2234 

2235 if nrows is None: 

2236 values += data 

2237 else: 

2238 values.extend(itertools.islice(data, nrows - 1)) 

2239 

2240 if dtype is not None: 

2241 data = np.array(values, dtype=dtype) 

2242 else: 

2243 data = values 

2244 

2245 if isinstance(data, dict): 

2246 if columns is None: 

2247 columns = arr_columns = ensure_index(sorted(data)) 

2248 arrays = [data[k] for k in columns] 

2249 else: 

2250 arrays = [] 

2251 arr_columns_list = [] 

2252 for k, v in data.items(): 

2253 if k in columns: 

2254 arr_columns_list.append(k) 

2255 arrays.append(v) 

2256 

2257 arr_columns = Index(arr_columns_list) 

2258 arrays, arr_columns, result_index = maybe_reorder( 

2259 arrays, arr_columns, columns, index 

2260 ) 

2261 

2262 elif isinstance(data, (np.ndarray, DataFrame)): 

2263 arrays, columns = to_arrays(data, columns) 

2264 arr_columns = columns 

2265 else: 

2266 arrays, arr_columns = to_arrays(data, columns) 

2267 if coerce_float: 

2268 for i, arr in enumerate(arrays): 

2269 if arr.dtype == object: 

2270 # error: Argument 1 to "maybe_convert_objects" has 

2271 # incompatible type "Union[ExtensionArray, ndarray]"; 

2272 # expected "ndarray" 

2273 arrays[i] = lib.maybe_convert_objects( 

2274 arr, # type: ignore[arg-type] 

2275 try_float=True, 

2276 ) 

2277 

2278 arr_columns = ensure_index(arr_columns) 

2279 if columns is None: 

2280 columns = arr_columns 

2281 else: 

2282 arrays, arr_columns, result_index = maybe_reorder( 

2283 arrays, arr_columns, columns, index 

2284 ) 

2285 

2286 if exclude is None: 

2287 exclude = set() 

2288 else: 

2289 exclude = set(exclude) 

2290 

2291 if index is not None: 

2292 if isinstance(index, str) or not hasattr(index, "__iter__"): 

2293 i = columns.get_loc(index) 

2294 exclude.add(index) 

2295 if len(arrays) > 0: 

2296 result_index = Index(arrays[i], name=index) 

2297 else: 

2298 result_index = Index([], name=index) 

2299 else: 

2300 try: 

2301 index_data = [arrays[arr_columns.get_loc(field)] for field in index] 

2302 except (KeyError, TypeError): 

2303 # raised by get_loc, see GH#29258 

2304 result_index = index 

2305 else: 

2306 result_index = ensure_index_from_sequences(index_data, names=index) 

2307 exclude.update(index) 

2308 

2309 if exclude: 

2310 arr_exclude = [x for x in exclude if x in arr_columns] 

2311 to_remove = [arr_columns.get_loc(col) for col in arr_exclude] 

2312 arrays = [v for i, v in enumerate(arrays) if i not in to_remove] 

2313 

2314 columns = columns.drop(exclude) 

2315 

2316 manager = get_option("mode.data_manager") 

2317 mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager) 

2318 

2319 return cls(mgr) 

2320 
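# A small sketch of the ``index`` parameter: a named field can be lifted out of
# the data and used as the row labels (example values are illustrative).
# >>> data = [(3, 'a'), (2, 'b')]
# >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2'],
# ...                           index='col_1')  # doctest: +SKIP
#        col_2
# col_1
# 3          a
# 2          b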

2321 def to_records( 

2322 self, index: bool = True, column_dtypes=None, index_dtypes=None 

2323 ) -> np.recarray: 

2324 """ 

2325 Convert DataFrame to a NumPy record array. 

2326 

2327 Index will be included as the first field of the record array if 

2328 requested. 

2329 

2330 Parameters 

2331 ---------- 

2332 index : bool, default True 

2333 Include index in resulting record array, stored in 'index' 

2334 field or using the index label, if set. 

2335 column_dtypes : str, type, dict, default None 

2336 If a string or type, the data type to store all columns. If 

2337 a dictionary, a mapping of column names and indices (zero-indexed) 

2338 to specific data types. 

2339 index_dtypes : str, type, dict, default None 

2340 If a string or type, the data type to store all index levels. If 

2341 a dictionary, a mapping of index level names and indices 

2342 (zero-indexed) to specific data types. 

2343 

2344 This mapping is applied only if `index=True`. 

2345 

2346 Returns 

2347 ------- 

2348 numpy.recarray 

2349 NumPy ndarray with the DataFrame labels as fields and each row 

2350 of the DataFrame as entries. 

2351 

2352 See Also 

2353 -------- 

2354 DataFrame.from_records: Convert structured or record ndarray 

2355 to DataFrame. 

2356 numpy.recarray: An ndarray that allows field access using 

2357 attributes, analogous to typed columns in a 

2358 spreadsheet. 

2359 

2360 Examples 

2361 -------- 

2362 >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]}, 

2363 ... index=['a', 'b']) 

2364 >>> df 

2365 A B 

2366 a 1 0.50 

2367 b 2 0.75 

2368 >>> df.to_records() 

2369 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], 

2370 dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')]) 

2371 

2372 If the DataFrame index has no label then the recarray field name 

2373 is set to 'index'. If the index has a label then this is used as the 

2374 field name: 

2375 

2376 >>> df.index = df.index.rename("I") 

2377 >>> df.to_records() 

2378 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], 

2379 dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')]) 

2380 

2381 The index can be excluded from the record array: 

2382 

2383 >>> df.to_records(index=False) 

2384 rec.array([(1, 0.5 ), (2, 0.75)], 

2385 dtype=[('A', '<i8'), ('B', '<f8')]) 

2386 

2387 Data types can be specified for the columns: 

2388 

2389 >>> df.to_records(column_dtypes={"A": "int32"}) 

2390 rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)], 

2391 dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')]) 

2392 

2393 As well as for the index: 

2394 

2395 >>> df.to_records(index_dtypes="<S2") 

2396 rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)], 

2397 dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')]) 

2398 

2399 >>> index_dtypes = f"<S{df.index.str.len().max()}" 

2400 >>> df.to_records(index_dtypes=index_dtypes) 

2401 rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)], 

2402 dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')]) 

2403 """ 

2404 if index: 

2405 ix_vals = [ 

2406 np.asarray(self.index.get_level_values(i)) 

2407 for i in range(self.index.nlevels) 

2408 ] 

2409 

2410 arrays = ix_vals + [ 

2411 np.asarray(self.iloc[:, i]) for i in range(len(self.columns)) 

2412 ] 

2413 

2414 index_names = list(self.index.names) 

2415 

2416 if isinstance(self.index, MultiIndex): 

2417 index_names = com.fill_missing_names(index_names) 

2418 elif index_names[0] is None: 

2419 index_names = ["index"] 

2420 

2421 names = [str(name) for name in itertools.chain(index_names, self.columns)] 

2422 else: 

2423 arrays = [np.asarray(self.iloc[:, i]) for i in range(len(self.columns))] 

2424 names = [str(c) for c in self.columns] 

2425 index_names = [] 

2426 

2427 index_len = len(index_names) 

2428 formats = [] 

2429 

2430 for i, v in enumerate(arrays): 

2431 index_int = i 

2432 

2433 # When the names and arrays are collected, we 

2434 # first collect those in the DataFrame's index, 

2435 # followed by those in its columns. 

2436 # 

2437 # Thus, the total length of the array is: 

2438 # len(index_names) + len(DataFrame.columns). 

2439 # 

2440 # This check allows us to see whether we are 

2441 # handling a name / array in the index or column. 

2442 if index_int < index_len: 

2443 dtype_mapping = index_dtypes 

2444 name = index_names[index_int] 

2445 else: 

2446 index_int -= index_len 

2447 dtype_mapping = column_dtypes 

2448 name = self.columns[index_int] 

2449 

2450 # If we have a dictionary, we look up the data type 

2451 # associated with the index or column (which can 

2452 # be denoted by its name in the DataFrame or its 

2453 # position in the DataFrame's array of indices or 

2454 # columns), whichever is applicable. 

2455 if is_dict_like(dtype_mapping): 

2456 if name in dtype_mapping: 

2457 dtype_mapping = dtype_mapping[name] 

2458 elif index_int in dtype_mapping: 

2459 dtype_mapping = dtype_mapping[index_int] 

2460 else: 

2461 dtype_mapping = None 

2462 

2463 # If no mapping can be found, use the array's 

2464 # dtype attribute for formatting. 

2465 # 

2466 # A valid dtype must either be a type or 

2467 # string naming a type. 

2468 if dtype_mapping is None: 

2469 formats.append(v.dtype) 

2470 elif isinstance(dtype_mapping, (type, np.dtype, str)): 

2471 # error: Argument 1 to "append" of "list" has incompatible 

2472 # type "Union[type, dtype[Any], str]"; expected "dtype[Any]" 

2473 formats.append(dtype_mapping) # type: ignore[arg-type] 

2474 else: 

2475 element = "row" if i < index_len else "column" 

2476 msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}" 

2477 raise ValueError(msg) 

2478 

2479 return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats}) 

2480 
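# The dtype mappings also accept zero-indexed positions, as the docstring
# notes; a short sketch reusing the docstring's ``df`` with the index dropped:
# >>> df.to_records(index=False, column_dtypes={0: "int32"})  # doctest: +SKIP
# rec.array([(1, 0.5), (2, 0.75)],
#           dtype=[('A', '<i4'), ('B', '<f8')])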

2481 @classmethod 

2482 def _from_arrays( 

2483 cls, 

2484 arrays, 

2485 columns, 

2486 index, 

2487 dtype: Dtype | None = None, 

2488 verify_integrity: bool = True, 

2489 ) -> DataFrame: 

2490 """ 

2491 Create DataFrame from a list of arrays corresponding to the columns. 

2492 

2493 Parameters 

2494 ---------- 

2495 arrays : list-like of arrays 

2496 Each array in the list corresponds to one column, in order. 

2497 columns : list-like, Index 

2498 The column names for the resulting DataFrame. 

2499 index : list-like, Index 

2500 The row labels for the resulting DataFrame. 

2501 dtype : dtype, optional 

2502 Optional dtype to enforce for all arrays. 

2503 verify_integrity : bool, default True 

2504 Validate and homogenize all input. If set to False, it is assumed 

2505 that all elements of `arrays` are actual arrays, in the form they will 

2506 be stored in a block (numpy ndarray or ExtensionArray), that they have 

2507 the same length as and are aligned with the index, and that `columns` 

2508 and `index` are already Index objects. 

2509 

2510 Returns 

2511 ------- 

2512 DataFrame 

2513 """ 

2514 if dtype is not None: 

2515 dtype = pandas_dtype(dtype) 

2516 

2517 manager = get_option("mode.data_manager") 

2518 columns = ensure_index(columns) 

2519 if len(columns) != len(arrays): 

2520 raise ValueError("len(columns) must match len(arrays)") 

2521 mgr = arrays_to_mgr( 

2522 arrays, 

2523 columns, 

2524 index, 

2525 dtype=dtype, 

2526 verify_integrity=verify_integrity, 

2527 typ=manager, 

2528 ) 

2529 return cls(mgr) 

2530 
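# Hypothetical usage sketch of this private constructor (internal API, subject
# to change; with the default verify_integrity=True the inputs are validated):
# >>> arrs = [np.array([1, 2]), np.array(['a', 'b'], dtype=object)]
# >>> DataFrame._from_arrays(arrs, columns=['x', 'y'],
# ...                        index=[0, 1])  # doctest: +SKIP
#    x  y
# 0  1  a
# 1  2  b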

2531 @doc( 

2532 storage_options=_shared_docs["storage_options"], 

2533 compression_options=_shared_docs["compression_options"] % "path", 

2534 ) 

2535 def to_stata( 

2536 self, 

2537 path: FilePath | WriteBuffer[bytes], 

2538 *, 

2539 convert_dates: dict[Hashable, str] | None = None, 

2540 write_index: bool = True, 

2541 byteorder: str | None = None, 

2542 time_stamp: datetime.datetime | None = None, 

2543 data_label: str | None = None, 

2544 variable_labels: dict[Hashable, str] | None = None, 

2545 version: int | None = 114, 

2546 convert_strl: Sequence[Hashable] | None = None, 

2547 compression: CompressionOptions = "infer", 

2548 storage_options: StorageOptions = None, 

2549 value_labels: dict[Hashable, dict[float, str]] | None = None, 

2550 ) -> None: 

2551 """ 

2552 Export DataFrame object to Stata dta format. 

2553 

2554 Writes the DataFrame to a Stata dataset file. 

2555 "dta" files contain a Stata dataset. 

2556 

2557 Parameters 

2558 ---------- 

2559 path : str, path object, or buffer 

2560 String, path object (implementing ``os.PathLike[str]``), or file-like 

2561 object implementing a binary ``write()`` function. 

2562 

2563 convert_dates : dict 

2564 Dictionary mapping columns containing datetime types to stata 

2565 internal format to use when writing the dates. Options are 'tc', 

2566 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer 

2567 or a name. Datetime columns that do not have a conversion type 

2568 specified will be converted to 'tc'. Raises NotImplementedError if 

2569 a datetime column has timezone information. 

2570 write_index : bool 

2571 Write the index to Stata dataset. 

2572 byteorder : str 

2573 Can be ">", "<", "little", or "big". Default is `sys.byteorder`. 

2574 time_stamp : datetime 

2575 A datetime to use as file creation date. Default is the current 

2576 time. 

2577 data_label : str, optional 

2578 A label for the data set. Must be 80 characters or smaller. 

2579 variable_labels : dict 

2580 Dictionary containing columns as keys and variable labels as 

2581 values. Each label must be 80 characters or smaller. 

2582 version : {{114, 117, 118, 119, None}}, default 114 

2583 Version to use in the output dta file. Set to None to let pandas 

2584 decide between 118 or 119 formats depending on the number of 

2585 columns in the frame. Version 114 can be read by Stata 10 and 

2586 later. Version 117 can be read by Stata 13 or later. Version 118 

2587 is supported in Stata 14 and later. Version 119 is supported in 

2588 Stata 15 and later. Version 114 limits string variables to 244 

2589 characters or fewer while versions 117 and later allow strings 

2590 with lengths up to 2,000,000 characters. Versions 118 and 119 

2591 support Unicode characters, and version 119 supports more than 

2592 32,767 variables. 

2593 

2594 Version 119 should usually only be used when the number of 

2595 variables exceeds the capacity of dta format 118. Exporting 

2596 smaller datasets in format 119 may have unintended consequences, 

2597 and, as of November 2020, Stata SE cannot read version 119 files. 

2598 

2599 convert_strl : list, optional 

2600 List of column names to convert to string columns to Stata StrL 

2601 format. Only available if version is 117. Storing strings in the 

2602 StrL format can produce smaller dta files if strings have more than 

2603 8 characters and values are repeated. 

2604 {compression_options} 

2605 

2606 .. versionadded:: 1.1.0 

2607 

2608 .. versionchanged:: 1.4.0 Zstandard support. 

2609 

2610 {storage_options} 

2611 

2612 .. versionadded:: 1.2.0 

2613 

2614 value_labels : dict of dicts 

2615 Dictionary containing columns as keys and dictionaries of column value 

2616 to labels as values. Labels for a single variable must be 32,000 

2617 characters or smaller. 

2618 

2619 .. versionadded:: 1.4.0 

2620 

2621 Raises 

2622 ------ 

2623 NotImplementedError 

2624 * If datetimes contain timezone information 

2625 * Column dtype is not representable in Stata 

2626 ValueError 

2627 * Columns listed in convert_dates are neither datetime64[ns] 

2628 nor datetime.datetime 

2629 * Column listed in convert_dates is not in DataFrame 

2630 * Categorical label contains more than 32,000 characters 

2631 

2632 See Also 

2633 -------- 

2634 read_stata : Import Stata data files. 

2635 io.stata.StataWriter : Low-level writer for Stata data files. 

2636 io.stata.StataWriter117 : Low-level writer for version 117 files. 

2637 

2638 Examples 

2639 -------- 

2640 >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon', 

2641 ... 'parrot'], 

2642 ... 'speed': [350, 18, 361, 15]}}) 

2643 >>> df.to_stata('animals.dta') # doctest: +SKIP 

2644 """ 

2645 if version not in (114, 117, 118, 119, None): 

2646 raise ValueError("Only formats 114, 117, 118 and 119 are supported.") 

2647 if version == 114: 

2648 if convert_strl is not None: 

2649 raise ValueError("strl is not supported in format 114") 

2650 from pandas.io.stata import StataWriter as statawriter 

2651 elif version == 117: 

2652 # Incompatible import of "statawriter" (imported name has type 

2653 # "Type[StataWriter117]", local name has type "Type[StataWriter]") 

2654 from pandas.io.stata import ( # type: ignore[assignment] 

2655 StataWriter117 as statawriter, 

2656 ) 

2657 else: # versions 118 and 119 

2658 # Incompatible import of "statawriter" (imported name has type 

2659 # "Type[StataWriter117]", local name has type "Type[StataWriter]") 

2660 from pandas.io.stata import ( # type: ignore[assignment] 

2661 StataWriterUTF8 as statawriter, 

2662 ) 

2663 

2664 kwargs: dict[str, Any] = {} 

2665 if version is None or version >= 117: 

2666 # strl conversion is only supported >= 117 

2667 kwargs["convert_strl"] = convert_strl 

2668 if version is None or version >= 118: 

2669 # Specifying the version is only supported for UTF8 (118 or 119) 

2670 kwargs["version"] = version 

2671 

2672 writer = statawriter( 

2673 path, 

2674 self, 

2675 convert_dates=convert_dates, 

2676 byteorder=byteorder, 

2677 time_stamp=time_stamp, 

2678 data_label=data_label, 

2679 write_index=write_index, 

2680 variable_labels=variable_labels, 

2681 compression=compression, 

2682 storage_options=storage_options, 

2683 value_labels=value_labels, 

2684 **kwargs, 

2685 ) 

2686 writer.write_file() 

2687 
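# A sketch of ``convert_dates`` using the 'td' (days) format; the column name
# and file name are hypothetical.
# >>> df = pd.DataFrame({'date': pd.date_range('2020', periods=2),
# ...                    'value': [1.0, 2.0]})
# >>> df.to_stata('dated.dta', convert_dates={'date': 'td'})  # doctest: +SKIP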

2688 def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None: 

2689 """ 

2690 Write a DataFrame to the binary Feather format. 

2691 

2692 Parameters 

2693 ---------- 

2694 path : str, path object, file-like object 

2695 String, path object (implementing ``os.PathLike[str]``), or file-like 

2696 object implementing a binary ``write()`` function. If a string or a path, 

2697 it will be used as Root Directory path when writing a partitioned dataset. 

2698 **kwargs : 

2699 Additional keywords passed to :func:`pyarrow.feather.write_feather`. 

2700 Starting with pyarrow 0.17, this includes the `compression`, 

2701 `compression_level`, `chunksize` and `version` keywords. 

2702 

2703 .. versionadded:: 1.1.0 

2704 

2705 Notes 

2706 ----- 

2707 This function writes the dataframe as a `feather file 

2708 <https://arrow.apache.org/docs/python/feather.html>`_. Requires a default 

2709 index. To save a DataFrame with a custom index, use a method that 

2710 supports custom indices, e.g. `to_parquet`. 

2711 """ 

2712 from pandas.io.feather_format import to_feather 

2713 

2714 to_feather(self, path, **kwargs) 

2715 
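# Since a default index is required (see Notes above), a custom index can be
# turned into an ordinary column first; a minimal sketch with a hypothetical
# file name:
# >>> df = pd.DataFrame({'a': [1, 2]}, index=['x', 'y'])
# >>> df.reset_index().to_feather('out.feather')  # doctest: +SKIP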

2716 @doc( 

2717 Series.to_markdown, 

2718 klass=_shared_doc_kwargs["klass"], 

2719 storage_options=_shared_docs["storage_options"], 

2720 examples="""Examples 

2721 -------- 

2722 >>> df = pd.DataFrame( 

2723 ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]} 

2724 ... ) 

2725 >>> print(df.to_markdown()) 

2726 | | animal_1 | animal_2 | 

2727 |---:|:-----------|:-----------| 

2728 | 0 | elk | dog | 

2729 | 1 | pig | quetzal | 

2730 

2731 Output markdown with a tabulate option. 

2732 

2733 >>> print(df.to_markdown(tablefmt="grid")) 

2734 +----+------------+------------+ 

2735 | | animal_1 | animal_2 | 

2736 +====+============+============+ 

2737 | 0 | elk | dog | 

2738 +----+------------+------------+ 

2739 | 1 | pig | quetzal | 

2740 +----+------------+------------+""", 

2741 ) 

2742 def to_markdown( 

2743 self, 

2744 buf: FilePath | WriteBuffer[str] | None = None, 

2745 mode: str = "wt", 

2746 index: bool = True, 

2747 storage_options: StorageOptions = None, 

2748 **kwargs, 

2749 ) -> str | None: 

2750 if "showindex" in kwargs: 

2751 raise ValueError("Pass 'index' instead of 'showindex") 

2752 

2753 kwargs.setdefault("headers", "keys") 

2754 kwargs.setdefault("tablefmt", "pipe") 

2755 kwargs.setdefault("showindex", index) 

2756 tabulate = import_optional_dependency("tabulate") 

2757 result = tabulate.tabulate(self, **kwargs) 

2758 if buf is None: 

2759 return result 

2760 

2761 with get_handle(buf, mode, storage_options=storage_options) as handles: 

2762 handles.handle.write(result) 

2763 return None 

2764 
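# A sketch of writing directly to a file path instead of returning a string;
# the file name is hypothetical and the tabulate package must be installed.
# >>> df.to_markdown('table.md', index=False)  # doctest: +SKIP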

2765 @overload 

2766 def to_parquet( 

2767 self, 

2768 path: None = ..., 

2769 engine: str = ..., 

2770 compression: str | None = ..., 

2771 index: bool | None = ..., 

2772 partition_cols: list[str] | None = ..., 

2773 storage_options: StorageOptions = ..., 

2774 **kwargs, 

2775 ) -> bytes: 

2776 ... 

2777 

2778 @overload 

2779 def to_parquet( 

2780 self, 

2781 path: FilePath | WriteBuffer[bytes], 

2782 engine: str = ..., 

2783 compression: str | None = ..., 

2784 index: bool | None = ..., 

2785 partition_cols: list[str] | None = ..., 

2786 storage_options: StorageOptions = ..., 

2787 **kwargs, 

2788 ) -> None: 

2789 ... 

2790 

2791 @doc(storage_options=_shared_docs["storage_options"]) 

2792 def to_parquet( 

2793 self, 

2794 path: FilePath | WriteBuffer[bytes] | None = None, 

2795 engine: str = "auto", 

2796 compression: str | None = "snappy", 

2797 index: bool | None = None, 

2798 partition_cols: list[str] | None = None, 

2799 storage_options: StorageOptions = None, 

2800 **kwargs, 

2801 ) -> bytes | None: 

2802 """ 

2803 Write a DataFrame to the binary parquet format. 

2804 

2805 This function writes the dataframe as a `parquet file 

2806 <https://parquet.apache.org/>`_. You can choose different parquet 

2807 backends, and have the option of compression. See 

2808 :ref:`the user guide <io.parquet>` for more details. 

2809 

2810 Parameters 

2811 ---------- 

2812 path : str, path object, file-like object, or None, default None 

2813 String, path object (implementing ``os.PathLike[str]``), or file-like 

2814 object implementing a binary ``write()`` function. If None, the result is 

2815 returned as bytes. If a string or path, it will be used as Root Directory 

2816 path when writing a partitioned dataset. 

2817 

2818 .. versionchanged:: 1.2.0 

2819 

2820 Previously this was "fname" 

2821 

2822 engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto' 

2823 Parquet library to use. If 'auto', then the option 

2824 ``io.parquet.engine`` is used. The default ``io.parquet.engine`` 

2825 behavior is to try 'pyarrow', falling back to 'fastparquet' if 

2826 'pyarrow' is unavailable. 

2827 compression : {{'snappy', 'gzip', 'brotli', None}}, default 'snappy' 

2828 Name of the compression to use. Use ``None`` for no compression. 

2829 index : bool, default None 

2830 If ``True``, include the dataframe's index(es) in the file output. 

2831 If ``False``, they will not be written to the file. 

2832 If ``None``, similar to ``True`` the dataframe's index(es) 

2833 will be saved. However, instead of being saved as values, 

2834 the RangeIndex will be stored as a range in the metadata so it 

2835 doesn't require much space and is faster. Other indexes will 

2836 be included as columns in the file output. 

2837 partition_cols : list, optional, default None 

2838 Column names by which to partition the dataset. 

2839 Columns are partitioned in the order they are given. 

2840 Must be None if path is not a string. 

2841 {storage_options} 

2842 

2843 .. versionadded:: 1.2.0 

2844 

2845 **kwargs 

2846 Additional arguments passed to the parquet library. See 

2847 :ref:`pandas io <io.parquet>` for more details. 

2848 

2849 Returns 

2850 ------- 

2851 bytes if no path argument is provided else None 

2852 

2853 See Also 

2854 -------- 

2855 read_parquet : Read a parquet file. 

2856 DataFrame.to_orc : Write an orc file. 

2857 DataFrame.to_csv : Write a csv file. 

2858 DataFrame.to_sql : Write to a sql table. 

2859 DataFrame.to_hdf : Write to hdf. 

2860 

2861 Notes 

2862 ----- 

2863 This function requires either the `fastparquet 

2864 <https://pypi.org/project/fastparquet>`_ or `pyarrow 

2865 <https://arrow.apache.org/docs/python/>`_ library. 

2866 

2867 Examples 

2868 -------- 

2869 >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}}) 

2870 >>> df.to_parquet('df.parquet.gzip', 

2871 ... compression='gzip') # doctest: +SKIP 

2872 >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP 

2873 col1 col2 

2874 0 1 3 

2875 1 2 4 

2876 

2877 If you want to get a buffer to the parquet content you can use a io.BytesIO 

2878 object, as long as you don't use partition_cols, which creates multiple files. 

2879 

2880 >>> import io 

2881 >>> f = io.BytesIO() 

2882 >>> df.to_parquet(f) 

2883 >>> f.seek(0) 

2884 0 

2885 >>> content = f.read() 

2886 """ 

2887 from pandas.io.parquet import to_parquet 

2888 

2889 return to_parquet( 

2890 self, 

2891 path, 

2892 engine, 

2893 compression=compression, 

2894 index=index, 

2895 partition_cols=partition_cols, 

2896 storage_options=storage_options, 

2897 **kwargs, 

2898 ) 

2899 
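# A sketch of ``partition_cols``: the path is then treated as the root
# directory of a partitioned dataset (one subdirectory per col1 value); the
# directory name is hypothetical.
# >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
# >>> df.to_parquet('dataset_root', partition_cols=['col1'])  # doctest: +SKIP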

2900 def to_orc( 

2901 self, 

2902 path: FilePath | WriteBuffer[bytes] | None = None, 

2903 *, 

2904 engine: Literal["pyarrow"] = "pyarrow", 

2905 index: bool | None = None, 

2906 engine_kwargs: dict[str, Any] | None = None, 

2907 ) -> bytes | None: 

2908 """ 

2909 Write a DataFrame to the ORC format. 

2910 

2911 .. versionadded:: 1.5.0 

2912 

2913 Parameters 

2914 ---------- 

2915 path : str, file-like object or None, default None 

2916 If a string, it will be used as Root Directory path 

2917 when writing a partitioned dataset. By file-like object, 

2918 we refer to objects with a write() method, such as a file handle 

2919 (e.g. via builtin open function). If path is None, 

2920 a bytes object is returned. 

2921 engine : str, default 'pyarrow' 

2922 ORC library to use. Pyarrow must be >= 7.0.0. 

2923 index : bool, optional 

2924 If ``True``, include the dataframe's index(es) in the file output. 

2925 If ``False``, they will not be written to the file. 

2926 If ``None``, similar to ``infer`` the dataframe's index(es) 

2927 will be saved. However, instead of being saved as values, 

2928 the RangeIndex will be stored as a range in the metadata so it 

2929 doesn't require much space and is faster. Other indexes will 

2930 be included as columns in the file output. 

2931 engine_kwargs : dict[str, Any] or None, default None 

2932 Additional keyword arguments passed to :func:`pyarrow.orc.write_table`. 

2933 

2934 Returns 

2935 ------- 

2936 bytes if no path argument is provided else None 

2937 

2938 Raises 

2939 ------ 

2940 NotImplementedError 

2941 Dtype of one or more columns is category, unsigned integers, interval, 

2942 period or sparse. 

2943 ValueError 

2944 engine is not pyarrow. 

2945 

2946 See Also 

2947 -------- 

2948 read_orc : Read an ORC file. 

2949 DataFrame.to_parquet : Write a parquet file. 

2950 DataFrame.to_csv : Write a csv file. 

2951 DataFrame.to_sql : Write to a sql table. 

2952 DataFrame.to_hdf : Write to hdf. 

2953 

2954 Notes 

2955 ----- 

2956 * Before using this function you should read the :ref:`user guide about 

2957 ORC <io.orc>` and :ref:`install optional dependencies <install.warn_orc>`. 

2958 * This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_ 

2959 library. 

2960 * For supported dtypes please refer to `supported ORC features in Arrow 

2961 <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__. 

2962 * Currently timezones in datetime columns are not preserved when a 

2963 dataframe is converted into ORC files. 

2964 

2965 Examples 

2966 -------- 

2967 >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]}) 

2968 >>> df.to_orc('df.orc') # doctest: +SKIP 

2969 >>> pd.read_orc('df.orc') # doctest: +SKIP 

2970 col1 col2 

2971 0 1 4 

2972 1 2 3 

2973 

2974 If you want to get a buffer to the ORC content, you can write it to io.BytesIO: 

2975 >>> import io 

2976 >>> b = io.BytesIO(df.to_orc()) # doctest: +SKIP 

2977 >>> b.seek(0) # doctest: +SKIP 

2978 0 

2979 >>> content = b.read() # doctest: +SKIP 

2980 """ 

2981 from pandas.io.orc import to_orc 

2982 

2983 return to_orc( 

2984 self, path, engine=engine, index=index, engine_kwargs=engine_kwargs 

2985 ) 

2986 
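# With ``path=None`` the serialized bytes are returned directly; a minimal
# sketch round-tripping them through read_orc (requires pyarrow):
# >>> import io
# >>> raw = df.to_orc(index=False)  # doctest: +SKIP
# >>> pd.read_orc(io.BytesIO(raw))  # doctest: +SKIP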

2987 @overload 

2988 def to_html( 

2989 self, 

2990 buf: FilePath | WriteBuffer[str], 

2991 columns: Sequence[Level] | None = ..., 

2992 col_space: ColspaceArgType | None = ..., 

2993 header: bool | Sequence[str] = ..., 

2994 index: bool = ..., 

2995 na_rep: str = ..., 

2996 formatters: FormattersType | None = ..., 

2997 float_format: FloatFormatType | None = ..., 

2998 sparsify: bool | None = ..., 

2999 index_names: bool = ..., 

3000 justify: str | None = ..., 

3001 max_rows: int | None = ..., 

3002 max_cols: int | None = ..., 

3003 show_dimensions: bool | str = ..., 

3004 decimal: str = ..., 

3005 bold_rows: bool = ..., 

3006 classes: str | list | tuple | None = ..., 

3007 escape: bool = ..., 

3008 notebook: bool = ..., 

3009 border: int | bool | None = ..., 

3010 table_id: str | None = ..., 

3011 render_links: bool = ..., 

3012 encoding: str | None = ..., 

3013 ) -> None: 

3014 ... 

3015 

3016 @overload 

3017 def to_html( 

3018 self, 

3019 buf: None = ..., 

3020 columns: Sequence[Level] | None = ..., 

3021 col_space: ColspaceArgType | None = ..., 

3022 header: bool | Sequence[str] = ..., 

3023 index: bool = ..., 

3024 na_rep: str = ..., 

3025 formatters: FormattersType | None = ..., 

3026 float_format: FloatFormatType | None = ..., 

3027 sparsify: bool | None = ..., 

3028 index_names: bool = ..., 

3029 justify: str | None = ..., 

3030 max_rows: int | None = ..., 

3031 max_cols: int | None = ..., 

3032 show_dimensions: bool | str = ..., 

3033 decimal: str = ..., 

3034 bold_rows: bool = ..., 

3035 classes: str | list | tuple | None = ..., 

3036 escape: bool = ..., 

3037 notebook: bool = ..., 

3038 border: int | bool | None = ..., 

3039 table_id: str | None = ..., 

3040 render_links: bool = ..., 

3041 encoding: str | None = ..., 

3042 ) -> str: 

3043 ... 

3044 

3045 @Substitution( 

3046 header_type="bool", 

3047 header="Whether to print column labels, default True", 

3048 col_space_type="str or int, list or dict of int or str", 

3049 col_space="The minimum width of each column in CSS length " 

3050 "units. An int is assumed to be px units.", 

3051 ) 

3052 @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) 

3053 def to_html( 

3054 self, 

3055 buf: FilePath | WriteBuffer[str] | None = None, 

3056 columns: Sequence[Level] | None = None, 

3057 col_space: ColspaceArgType | None = None, 

3058 header: bool | Sequence[str] = True, 

3059 index: bool = True, 

3060 na_rep: str = "NaN", 

3061 formatters: FormattersType | None = None, 

3062 float_format: FloatFormatType | None = None, 

3063 sparsify: bool | None = None, 

3064 index_names: bool = True, 

3065 justify: str | None = None, 

3066 max_rows: int | None = None, 

3067 max_cols: int | None = None, 

3068 show_dimensions: bool | str = False, 

3069 decimal: str = ".", 

3070 bold_rows: bool = True, 

3071 classes: str | list | tuple | None = None, 

3072 escape: bool = True, 

3073 notebook: bool = False, 

3074 border: int | bool | None = None, 

3075 table_id: str | None = None, 

3076 render_links: bool = False, 

3077 encoding: str | None = None, 

3078 ) -> str | None: 

3079 """ 

3080 Render a DataFrame as an HTML table. 

3081 %(shared_params)s 

3082 bold_rows : bool, default True 

3083 Make the row labels bold in the output. 

3084 classes : str or list or tuple, default None 

3085 CSS class(es) to apply to the resulting html table. 

3086 escape : bool, default True 

3087 Convert the characters <, >, and & to HTML-safe sequences. 

3088 notebook : {True, False}, default False 

3089 Whether the generated HTML is for IPython Notebook. 

3090 border : int 

3091 A ``border=border`` attribute is included in the opening 

3092 `<table>` tag. Default ``pd.options.display.html.border``. 

3093 table_id : str, optional 

3094 A css id is included in the opening `<table>` tag if specified. 

3095 render_links : bool, default False 

3096 Convert URLs to HTML links. 

3097 encoding : str, default "utf-8" 

3098 Set character encoding. 

3099 

3100 .. versionadded:: 1.0 

3101 %(returns)s 

3102 See Also 

3103 -------- 

3104 to_string : Convert DataFrame to a string. 

3105 """ 

3106 if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS: 

3107 raise ValueError("Invalid value for justify parameter") 

3108 

3109 formatter = fmt.DataFrameFormatter( 

3110 self, 

3111 columns=columns, 

3112 col_space=col_space, 

3113 na_rep=na_rep, 

3114 header=header, 

3115 index=index, 

3116 formatters=formatters, 

3117 float_format=float_format, 

3118 bold_rows=bold_rows, 

3119 sparsify=sparsify, 

3120 justify=justify, 

3121 index_names=index_names, 

3122 escape=escape, 

3123 decimal=decimal, 

3124 max_rows=max_rows, 

3125 max_cols=max_cols, 

3126 show_dimensions=show_dimensions, 

3127 ) 

3128 # TODO: a generic formatter would belong in DataFrameFormatter 

3129 return fmt.DataFrameRenderer(formatter).to_html( 

3130 buf=buf, 

3131 classes=classes, 

3132 notebook=notebook, 

3133 border=border, 

3134 encoding=encoding, 

3135 table_id=table_id, 

3136 render_links=render_links, 

3137 ) 

3138 
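# A short sketch combining ``classes`` and ``table_id`` (class and id values
# are hypothetical); the extra classes are appended to the default "dataframe"
# class on the <table> tag.
# >>> html = df.to_html(classes='table table-striped', table_id='frame')
# >>> 'class="dataframe table table-striped"' in html  # doctest: +SKIP
# True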

3139 @doc( 

3140 storage_options=_shared_docs["storage_options"], 

3141 compression_options=_shared_docs["compression_options"] % "path_or_buffer", 

3142 ) 

3143 def to_xml( 

3144 self, 

3145 path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None, 

3146 index: bool = True, 

3147 root_name: str | None = "data", 

3148 row_name: str | None = "row", 

3149 na_rep: str | None = None, 

3150 attr_cols: list[str] | None = None, 

3151 elem_cols: list[str] | None = None, 

3152 namespaces: dict[str | None, str] | None = None, 

3153 prefix: str | None = None, 

3154 encoding: str = "utf-8", 

3155 xml_declaration: bool | None = True, 

3156 pretty_print: bool | None = True, 

3157 parser: str | None = "lxml", 

3158 stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None, 

3159 compression: CompressionOptions = "infer", 

3160 storage_options: StorageOptions = None, 

3161 ) -> str | None: 

3162 """ 

3163 Render a DataFrame to an XML document. 

3164 

3165 .. versionadded:: 1.3.0 

3166 

3167 Parameters 

3168 ---------- 

3169 path_or_buffer : str, path object, file-like object, or None, default None 

3170 String, path object (implementing ``os.PathLike[str]``), or file-like 

3171 object implementing a ``write()`` function. If None, the result is returned 

3172 as a string. 

3173 index : bool, default True 

3174 Whether to include index in XML document. 

3175 root_name : str, default 'data' 

3176 The name of root element in XML document. 

3177 row_name : str, default 'row' 

3178 The name of row element in XML document. 

3179 na_rep : str, optional 

3180 Missing data representation. 

3181 attr_cols : list-like, optional 

3182 List of columns to write as attributes in row element. 

3183 Hierarchical columns will be flattened with underscore 

3184 delimiting the different levels. 

3185 elem_cols : list-like, optional 

3186 List of columns to write as children in row element. By default, 

3187 all columns output as children of row element. Hierarchical 

3188 columns will be flattened with underscore delimiting the 

3189 different levels. 

3190 namespaces : dict, optional 

3191 All namespaces to be defined in root element. Keys of the dict 

3192 should be prefix names and values the corresponding URIs. 

3193 The default namespace should be given an empty string key. For 

3194 example, :: 

3195 

3196 namespaces = {{"": "https://example.com"}} 

3197 

3198 prefix : str, optional 

3199 Namespace prefix to be used for every element and/or attribute 

3200 in document. This should be one of the keys in ``namespaces`` 

3201 dict. 

3202 encoding : str, default 'utf-8' 

3203 Encoding of the resulting document. 

3204 xml_declaration : bool, default True 

3205 Whether to include the XML declaration at start of document. 

3206 pretty_print : bool, default True 

3207 Whether output should be pretty printed with indentation and 

3208 line breaks. 

3209 parser : {{'lxml','etree'}}, default 'lxml' 

3210 Parser module to use for building of tree. Only 'lxml' and 

3211 'etree' are supported. With 'lxml', the ability to use XSLT 

3212 stylesheet is supported. 

3213 stylesheet : str, path object or file-like object, optional 

3214 A URL, file-like object, or a raw string containing an XSLT 

3215 script used to transform the raw XML output. Script should use 

3216 layout of elements and attributes from original output. This 

3217 argument requires ``lxml`` to be installed. Only XSLT 1.0 

3218 scripts, and not later versions, are currently supported. 

3219 {compression_options} 

3220 

3221 .. versionchanged:: 1.4.0 Zstandard support. 

3222 

3223 {storage_options} 

3224 

3225 Returns 

3226 ------- 

3227 None or str 

3228 If ``io`` is None, returns the resulting XML format as a 

3229 string. Otherwise returns None. 

3230 

3231 See Also 

3232 -------- 

3233 to_json : Convert the pandas object to a JSON string. 

3234 to_html : Convert DataFrame to HTML. 

3235 

3236 Examples 

3237 -------- 

3238 >>> df = pd.DataFrame({{'shape': ['square', 'circle', 'triangle'], 

3239 ... 'degrees': [360, 360, 180], 

3240 ... 'sides': [4, np.nan, 3]}}) 

3241 

3242 >>> df.to_xml() # doctest: +SKIP 

3243 <?xml version='1.0' encoding='utf-8'?> 

3244 <data> 

3245 <row> 

3246 <index>0</index> 

3247 <shape>square</shape> 

3248 <degrees>360</degrees> 

3249 <sides>4.0</sides> 

3250 </row> 

3251 <row> 

3252 <index>1</index> 

3253 <shape>circle</shape> 

3254 <degrees>360</degrees> 

3255 <sides/> 

3256 </row> 

3257 <row> 

3258 <index>2</index> 

3259 <shape>triangle</shape> 

3260 <degrees>180</degrees> 

3261 <sides>3.0</sides> 

3262 </row> 

3263 </data> 

3264 

3265 >>> df.to_xml(attr_cols=[ 

3266 ... 'index', 'shape', 'degrees', 'sides' 

3267 ... ]) # doctest: +SKIP 

3268 <?xml version='1.0' encoding='utf-8'?> 

3269 <data> 

3270 <row index="0" shape="square" degrees="360" sides="4.0"/> 

3271 <row index="1" shape="circle" degrees="360"/> 

3272 <row index="2" shape="triangle" degrees="180" sides="3.0"/> 

3273 </data> 

3274 

3275 >>> df.to_xml(namespaces={{"doc": "https://example.com"}}, 

3276 ... prefix="doc") # doctest: +SKIP 

3277 <?xml version='1.0' encoding='utf-8'?> 

3278 <doc:data xmlns:doc="https://example.com"> 

3279 <doc:row> 

3280 <doc:index>0</doc:index> 

3281 <doc:shape>square</doc:shape> 

3282 <doc:degrees>360</doc:degrees> 

3283 <doc:sides>4.0</doc:sides> 

3284 </doc:row> 

3285 <doc:row> 

3286 <doc:index>1</doc:index> 

3287 <doc:shape>circle</doc:shape> 

3288 <doc:degrees>360</doc:degrees> 

3289 <doc:sides/> 

3290 </doc:row> 

3291 <doc:row> 

3292 <doc:index>2</doc:index> 

3293 <doc:shape>triangle</doc:shape> 

3294 <doc:degrees>180</doc:degrees> 

3295 <doc:sides>3.0</doc:sides> 

3296 </doc:row> 

3297 </doc:data> 

3298 """ 

3299 

3300 from pandas.io.formats.xml import ( 

3301 EtreeXMLFormatter, 

3302 LxmlXMLFormatter, 

3303 ) 

3304 

3305 lxml = import_optional_dependency("lxml.etree", errors="ignore") 

3306 

3307 TreeBuilder: type[EtreeXMLFormatter] | type[LxmlXMLFormatter] 

3308 

3309 if parser == "lxml": 

3310 if lxml is not None: 

3311 TreeBuilder = LxmlXMLFormatter 

3312 else: 

3313 raise ImportError( 

3314 "lxml not found, please install or use the etree parser." 

3315 ) 

3316 

3317 elif parser == "etree": 

3318 TreeBuilder = EtreeXMLFormatter 

3319 

3320 else: 

3321 raise ValueError("Values for parser can only be lxml or etree.") 

3322 

3323 xml_formatter = TreeBuilder( 

3324 self, 

3325 path_or_buffer=path_or_buffer, 

3326 index=index, 

3327 root_name=root_name, 

3328 row_name=row_name, 

3329 na_rep=na_rep, 

3330 attr_cols=attr_cols, 

3331 elem_cols=elem_cols, 

3332 namespaces=namespaces, 

3333 prefix=prefix, 

3334 encoding=encoding, 

3335 xml_declaration=xml_declaration, 

3336 pretty_print=pretty_print, 

3337 stylesheet=stylesheet, 

3338 compression=compression, 

3339 storage_options=storage_options, 

3340 ) 

3341 

3342 return xml_formatter.write_output() 

3343 

3344 # ---------------------------------------------------------------------- 

3345 @doc(INFO_DOCSTRING, **frame_sub_kwargs) 

3346 def info( 

3347 self, 

3348 verbose: bool | None = None, 

3349 buf: WriteBuffer[str] | None = None, 

3350 max_cols: int | None = None, 

3351 memory_usage: bool | str | None = None, 

3352 show_counts: bool | None = None, 

3353 ) -> None: 

3354 info = DataFrameInfo( 

3355 data=self, 

3356 memory_usage=memory_usage, 

3357 ) 

3358 info.render( 

3359 buf=buf, 

3360 max_cols=max_cols, 

3361 verbose=verbose, 

3362 show_counts=show_counts, 

3363 ) 

3364 
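# ``buf`` accepts any text writer, so the summary can be captured as a string
# instead of printed; a minimal sketch:
# >>> import io
# >>> buffer = io.StringIO()
# >>> df.info(buf=buffer)
# >>> summary = buffer.getvalue()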

3365 def memory_usage(self, index: bool = True, deep: bool = False) -> Series: 

3366 """ 

3367 Return the memory usage of each column in bytes. 

3368 

3369 The memory usage can optionally include the contribution of 

3370 the index and elements of `object` dtype. 

3371 

3372 This value is displayed in `DataFrame.info` by default. This can be 

3373 suppressed by setting ``pandas.options.display.memory_usage`` to False. 

3374 

3375 Parameters 

3376 ---------- 

3377 index : bool, default True 

3378 Specifies whether to include the memory usage of the DataFrame's 

3379 index in returned Series. If ``index=True``, the memory usage of 

3380 the index is the first item in the output. 

3381 deep : bool, default False 

3382 If True, introspect the data deeply by interrogating 

3383 `object` dtypes for system-level memory consumption, and include 

3384 it in the returned values. 

3385 

3386 Returns 

3387 ------- 

3388 Series 

3389 A Series whose index is the original column names and whose values 

3390 are the memory usage of each column in bytes. 

3391 

3392 See Also 

3393 -------- 

3394 numpy.ndarray.nbytes : Total bytes consumed by the elements of an 

3395 ndarray. 

3396 Series.memory_usage : Bytes consumed by a Series. 

3397 Categorical : Memory-efficient array for string values with 

3398 many repeated values. 

3399 DataFrame.info : Concise summary of a DataFrame. 

3400 

3401 Notes 

3402 ----- 

3403 See the :ref:`Frequently Asked Questions <df-memory-usage>` for more 

3404 details. 

3405 

3406 Examples 

3407 -------- 

3408 >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool'] 

3409 >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t)) 

3410 ... for t in dtypes]) 

3411 >>> df = pd.DataFrame(data) 

3412 >>> df.head() 

3413 int64 float64 complex128 object bool 

3414 0 1 1.0 1.0+0.0j 1 True 

3415 1 1 1.0 1.0+0.0j 1 True 

3416 2 1 1.0 1.0+0.0j 1 True 

3417 3 1 1.0 1.0+0.0j 1 True 

3418 4 1 1.0 1.0+0.0j 1 True 

3419 

3420 >>> df.memory_usage() 

3421 Index 128 

3422 int64 40000 

3423 float64 40000 

3424 complex128 80000 

3425 object 40000 

3426 bool 5000 

3427 dtype: int64 

3428 

3429 >>> df.memory_usage(index=False) 

3430 int64 40000 

3431 float64 40000 

3432 complex128 80000 

3433 object 40000 

3434 bool 5000 

3435 dtype: int64 

3436 

3437 By default, the memory footprint of `object` dtype columns is ignored; pass ``deep=True`` to include it: 

3438 

3439 >>> df.memory_usage(deep=True) 

3440 Index 128 

3441 int64 40000 

3442 float64 40000 

3443 complex128 80000 

3444 object 180000 

3445 bool 5000 

3446 dtype: int64 

3447 

3448 Use a Categorical for efficient storage of an object-dtype column with 

3449 many repeated values. 

3450 

3451 >>> df['object'].astype('category').memory_usage(deep=True) 

3452 5244 

3453 """ 

3454 result = self._constructor_sliced( 

3455 [c.memory_usage(index=False, deep=deep) for col, c in self.items()], 

3456 index=self.columns, 

3457 dtype=np.intp, 

3458 ) 

3459 if index: 

3460 index_memory_usage = self._constructor_sliced( 

3461 self.index.memory_usage(deep=deep), index=["Index"] 

3462 ) 

3463 result = index_memory_usage._append(result) 

3464 return result 

3465 
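    # Follow-up sketch: since the result is a Series, the grand total in
    # bytes is just its sum (reusing the `df` from the docstring above).
    #
    # >>> df.memory_usage(deep=True).sum()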

3466 def transpose(self, *args, copy: bool = False) -> DataFrame: 

3467 """ 

3468 Transpose index and columns. 

3469 

3470 Reflect the DataFrame over its main diagonal by writing rows as columns 

3471 and vice-versa. The property :attr:`.T` is an accessor to the method 

3472 :meth:`transpose`. 

3473 

3474 Parameters 

3475 ---------- 

3476 *args : tuple, optional 

3477 Accepted for compatibility with NumPy. 

3478 copy : bool, default False 

3479 Whether to copy the data after transposing, even for DataFrames 

3480 with a single dtype. 

3481 

3482 Note that a copy is always required for mixed dtype DataFrames, 

3483 or for DataFrames with any extension types. 

3484 

3485 Returns 

3486 ------- 

3487 DataFrame 

3488 The transposed DataFrame. 

3489 

3490 See Also 

3491 -------- 

3492 numpy.transpose : Permute the dimensions of a given array. 

3493 

3494 Notes 

3495 ----- 

3496 Transposing a DataFrame with mixed dtypes will result in a homogeneous 

3497 DataFrame with the `object` dtype. In such a case, a copy of the data 

3498 is always made. 

3499 

3500 Examples 

3501 -------- 

3502 **Square DataFrame with homogeneous dtype** 

3503 

3504 >>> d1 = {'col1': [1, 2], 'col2': [3, 4]} 

3505 >>> df1 = pd.DataFrame(data=d1) 

3506 >>> df1 

3507 col1 col2 

3508 0 1 3 

3509 1 2 4 

3510 

3511 >>> df1_transposed = df1.T # or df1.transpose() 

3512 >>> df1_transposed 

3513 0 1 

3514 col1 1 2 

3515 col2 3 4 

3516 

3517 When the dtype is homogeneous in the original DataFrame, we get a 

3518 transposed DataFrame with the same dtype: 

3519 

3520 >>> df1.dtypes 

3521 col1 int64 

3522 col2 int64 

3523 dtype: object 

3524 >>> df1_transposed.dtypes 

3525 0 int64 

3526 1 int64 

3527 dtype: object 

3528 

3529 **Non-square DataFrame with mixed dtypes** 

3530 

3531 >>> d2 = {'name': ['Alice', 'Bob'], 

3532 ... 'score': [9.5, 8], 

3533 ... 'employed': [False, True], 

3534 ... 'kids': [0, 0]} 

3535 >>> df2 = pd.DataFrame(data=d2) 

3536 >>> df2 

3537 name score employed kids 

3538 0 Alice 9.5 False 0 

3539 1 Bob 8.0 True 0 

3540 

3541 >>> df2_transposed = df2.T # or df2.transpose() 

3542 >>> df2_transposed 

3543 0 1 

3544 name Alice Bob 

3545 score 9.5 8.0 

3546 employed False True 

3547 kids 0 0 

3548 

3549 When the DataFrame has mixed dtypes, we get a transposed DataFrame with 

3550 the `object` dtype: 

3551 

3552 >>> df2.dtypes 

3553 name object 

3554 score float64 

3555 employed bool 

3556 kids int64 

3557 dtype: object 

3558 >>> df2_transposed.dtypes 

3559 0 object 

3560 1 object 

3561 dtype: object 

3562 """ 

3563 nv.validate_transpose(args, {}) 

3564 # construct the args 

3565 

3566 dtypes = list(self.dtypes) 

3567 

3568 if self._can_fast_transpose: 

3569 # Note: tests pass without this, but this improves perf quite a bit. 

3570 new_vals = self._values.T 

3571 if copy and not using_copy_on_write(): 

3572 new_vals = new_vals.copy() 

3573 

3574 result = self._constructor( 

3575 new_vals, index=self.columns, columns=self.index, copy=False 

3576 ) 

3577 if using_copy_on_write() and len(self) > 0: 

3578 result._mgr.add_references(self._mgr) # type: ignore[arg-type] 

3579 

3580 elif ( 

3581 self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0]) 

3582 ): 

3583 # We have EAs with the same dtype. We can preserve that dtype in transpose. 

3584 dtype = dtypes[0] 

3585 arr_type = dtype.construct_array_type() 

3586 values = self.values 

3587 

3588 new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values] 

3589 result = type(self)._from_arrays( 

3590 new_values, index=self.columns, columns=self.index 

3591 ) 

3592 

3593 else: 

3594 new_arr = self.values.T 

3595 if copy and not using_copy_on_write(): 

3596 new_arr = new_arr.copy() 

3597 result = self._constructor( 

3598 new_arr, 

3599 index=self.columns, 

3600 columns=self.index, 

3601 # We already made a copy (more than one block) 

3602 copy=False, 

3603 ) 

3604 

3605 return result.__finalize__(self, method="transpose") 

3606 
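    # Behavior sketch for the three branches above (hypothetical frames):
    # a single-dtype frame takes the fast ndarray path, a homogeneous
    # extension dtype is rebuilt row-by-row so the dtype survives, and
    # mixed dtypes fall through to an object-dtype transpose.
    #
    # >>> pd.DataFrame({"a": [1], "b": [2]}).T.dtypes                       # int64
    # >>> pd.DataFrame({"a": pd.array([1]), "b": pd.array([2])}).T.dtypes   # Int64
    # >>> pd.DataFrame({"a": [1], "b": ["x"]}).T.dtypes                     # object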

3607 @property 

3608 def T(self) -> DataFrame: 

3609 """ 

3610 The transpose of the DataFrame. 

3611 

3612 Returns 

3613 ------- 

3614 DataFrame 

3615 The transposed DataFrame. 

3616 

3617 See Also 

3618 -------- 

3619 DataFrame.transpose : Transpose index and columns. 

3620 

3621 Examples 

3622 -------- 

3623 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) 

3624 >>> df 

3625 col1 col2 

3626 0 1 3 

3627 1 2 4 

3628 

3629 >>> df.T 

3630 0 1 

3631 col1 1 2 

3632 col2 3 4 

3633 """ 

3634 return self.transpose() 

3635 

3636 # ---------------------------------------------------------------------- 

3637 # Indexing Methods 

3638 

3639 def _ixs(self, i: int, axis: AxisInt = 0) -> Series: 

3640 """ 

3641 Parameters 

3642 ---------- 

3643 i : int 

3644 axis : int 

3645 

3646 Returns 

3647 ------- 

3648 Series 

3649 """ 

3650 # irow 

3651 if axis == 0: 

3652 new_mgr = self._mgr.fast_xs(i) 

3653 

3654 # if we are a copy, mark as such 

3655 copy = isinstance(new_mgr.array, np.ndarray) and new_mgr.array.base is None 

3656 result = self._constructor_sliced(new_mgr, name=self.index[i]).__finalize__( 

3657 self 

3658 ) 

3659 result._set_is_copy(self, copy=copy) 

3660 return result 

3661 

3662 # icol 

3663 else: 

3664 label = self.columns[i] 

3665 

3666 col_mgr = self._mgr.iget(i) 

3667 result = self._box_col_values(col_mgr, i) 

3668 

3669 # this is a cached value, mark it so 

3670 result._set_as_cached(label, self) 

3671 return result 

3672 
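    # Equivalence sketch: `_ixs` is the positional fast path that public
    # indexing ultimately reaches, so (hypothetical `df`) these agree:
    #
    # >>> df.iloc[3]      # row path, axis=0, name taken from the index
    # >>> df.iloc[:, 3]   # column path, axis=1, result cached by label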

3673 def _get_column_array(self, i: int) -> ArrayLike: 

3674 """ 

3675 Get the values of the i'th column (ndarray or ExtensionArray, as stored 

3676 in the Block) 

3677 

3678 Warning! The returned array is a view but doesn't handle Copy-on-Write, 

3679 so this should be used with caution (for read-only purposes). 

3680 """ 

3681 return self._mgr.iget_values(i) 

3682 

3683 def _iter_column_arrays(self) -> Iterator[ArrayLike]: 

3684 """ 

3685 Iterate over the arrays of all columns in order. 

3686 This returns the values as stored in the Block (ndarray or ExtensionArray). 

3687 

3688 Warning! The returned array is a view but doesn't handle Copy-on-Write, 

3689 so this should be used with caution (for read-only purposes). 

3690 """ 

3691 for i in range(len(self.columns)): 

3692 yield self._get_column_array(i) 

3693 

3694 def _getitem_nocopy(self, key: list): 

3695 """ 

3696 Behaves like __getitem__, but returns a view in cases where __getitem__ 

3697 would make a copy. 

3698 """ 

3699 # TODO(CoW): can be removed if/when we are always Copy-on-Write 

3700 indexer = self.columns._get_indexer_strict(key, "columns")[1] 

3701 new_axis = self.columns[indexer] 

3702 

3703 new_mgr = self._mgr.reindex_indexer( 

3704 new_axis, 

3705 indexer, 

3706 axis=0, 

3707 allow_dups=True, 

3708 copy=False, 

3709 only_slice=True, 

3710 ) 

3711 return self._constructor(new_mgr) 

3712 

3713 def __getitem__(self, key): 

3714 check_dict_or_set_indexers(key) 

3715 key = lib.item_from_zerodim(key) 

3716 key = com.apply_if_callable(key, self) 

3717 

3718 if is_hashable(key) and not is_iterator(key): 

3719 # is_iterator to exclude generator e.g. test_getitem_listlike 

3720 # shortcut if the key is in columns 

3721 is_mi = isinstance(self.columns, MultiIndex) 

3722 # GH#45316 Return view if key is not duplicated 

3723 # Only use drop_duplicates with duplicates for performance 

3724 if not is_mi and ( 

3725 self.columns.is_unique 

3726 and key in self.columns 

3727 or key in self.columns.drop_duplicates(keep=False) 

3728 ): 

3729 return self._get_item_cache(key) 

3730 

3731 elif is_mi and self.columns.is_unique and key in self.columns: 

3732 return self._getitem_multilevel(key) 

3733 # Do we have a slicer (on rows)? 

3734 if isinstance(key, slice): 

3735 indexer = self.index._convert_slice_indexer(key, kind="getitem") 

3736 if isinstance(indexer, np.ndarray): 

3737 # reachable with DatetimeIndex 

3738 indexer = lib.maybe_indices_to_slice( 

3739 indexer.astype(np.intp, copy=False), len(self) 

3740 ) 

3741 if isinstance(indexer, np.ndarray): 

3742 # GH#43223 If we can not convert, use take 

3743 return self.take(indexer, axis=0) 

3744 return self._slice(indexer, axis=0) 

3745 

3746 # Do we have a (boolean) DataFrame? 

3747 if isinstance(key, DataFrame): 

3748 return self.where(key) 

3749 

3750 # Do we have a (boolean) 1d indexer? 

3751 if com.is_bool_indexer(key): 

3752 return self._getitem_bool_array(key) 

3753 

3754 # We are left with two options: a single key, and a collection of keys, 

3755 # We interpret tuples as collections only for non-MultiIndex 

3756 is_single_key = isinstance(key, tuple) or not is_list_like(key) 

3757 

3758 if is_single_key: 

3759 if self.columns.nlevels > 1: 

3760 return self._getitem_multilevel(key) 

3761 indexer = self.columns.get_loc(key) 

3762 if is_integer(indexer): 

3763 indexer = [indexer] 

3764 else: 

3765 if is_iterator(key): 

3766 key = list(key) 

3767 indexer = self.columns._get_indexer_strict(key, "columns")[1] 

3768 

3769 # take() does not accept boolean indexers 

3770 if getattr(indexer, "dtype", None) == bool: 

3771 indexer = np.where(indexer)[0] 

3772 

3773 data = self._take_with_is_copy(indexer, axis=1) 

3774 

3775 if is_single_key: 

3776 # What does looking for a single key in a non-unique index return? 

3777 # The behavior is inconsistent. It returns a Series, except when 

3778 # - the key itself is repeated (test on data.shape, #9519), or 

3779 # - we have a MultiIndex on columns (test on self.columns, #21309) 

3780 if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex): 

3781 # GH#26490 using data[key] can cause RecursionError 

3782 return data._get_item_cache(key) 

3783 

3784 return data 

3785 
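    # Dispatch sketch for the key types handled above (hypothetical `df`
    # with flat, unique columns "A" and "B"):
    #
    # >>> df["A"]            # hashable key -> cached column Series
    # >>> df[1:3]            # slice -> row slice
    # >>> df[df > 0]         # boolean DataFrame -> self.where(key)
    # >>> df[df["A"] > 0]    # boolean Series -> row mask
    # >>> df[["A", "B"]]     # list-like -> column take, returns DataFrame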

3786 def _getitem_bool_array(self, key): 

3787 # also raises Exception if object array with NA values 

3788 # warning here just in case -- previously __setitem__ was 

3789 # reindexing but __getitem__ was not; it seems more reasonable to 

3790 # go with the __setitem__ behavior since that is more consistent 

3791 # with all other indexing behavior 

3792 if isinstance(key, Series) and not key.index.equals(self.index): 

3793 warnings.warn( 

3794 "Boolean Series key will be reindexed to match DataFrame index.", 

3795 UserWarning, 

3796 stacklevel=find_stack_level(), 

3797 ) 

3798 elif len(key) != len(self.index): 

3799 raise ValueError( 

3800 f"Item wrong length {len(key)} instead of {len(self.index)}." 

3801 ) 

3802 

3803 # check_bool_indexer will throw exception if Series key cannot 

3804 # be reindexed to match DataFrame rows 

3805 key = check_bool_indexer(self.index, key) 

3806 

3807 if key.all(): 

3808 return self.copy(deep=None) 

3809 

3810 indexer = key.nonzero()[0] 

3811 return self._take_with_is_copy(indexer, axis=0) 

3812 

3813 def _getitem_multilevel(self, key): 

3814 # self.columns is a MultiIndex 

3815 loc = self.columns.get_loc(key) 

3816 if isinstance(loc, (slice, np.ndarray)): 

3817 new_columns = self.columns[loc] 

3818 result_columns = maybe_droplevels(new_columns, key) 

3819 result = self.iloc[:, loc] 

3820 result.columns = result_columns 

3821 

3822 # If there is only one column being returned, and its name is 

3823 # either an empty string, or a tuple with an empty string as its 

3824 # first element, then treat the empty string as a placeholder 

3825 # and return the column as if the user had provided that empty 

3826 # string in the key. If the result is a Series, exclude the 

3827 # implied empty string from its name. 

3828 if len(result.columns) == 1: 

3829 # e.g. test_frame_getitem_multicolumn_empty_level, 

3830 # test_frame_mixed_depth_get, test_loc_setitem_single_column_slice 

3831 top = result.columns[0] 

3832 if isinstance(top, tuple): 

3833 top = top[0] 

3834 if top == "": 

3835 result = result[""] 

3836 if isinstance(result, Series): 

3837 result = self._constructor_sliced( 

3838 result, index=self.index, name=key 

3839 ) 

3840 

3841 result._set_is_copy(self) 

3842 return result 

3843 else: 

3844 # loc is neither a slice nor ndarray, so must be an int 

3845 return self._ixs(loc, axis=1) 

3846 

3847 def _get_value(self, index, col, takeable: bool = False) -> Scalar: 

3848 """ 

3849 Quickly retrieve single value at passed column and index. 

3850 

3851 Parameters 

3852 ---------- 

3853 index : row label 

3854 col : column label 

3855 takeable : bool, default False; interpret the index/col as positional indexers 

3856 

3857 Returns 

3858 ------- 

3859 scalar 

3860 

3861 Notes 

3862 ----- 

3863 Assumes that both `self.index._index_as_unique` and 

3864 `self.columns._index_as_unique` hold; the caller is responsible for checking. 

3865 """ 

3866 if takeable: 

3867 series = self._ixs(col, axis=1) 

3868 return series._values[index] 

3869 

3870 series = self._get_item_cache(col) 

3871 engine = self.index._engine 

3872 

3873 if not isinstance(self.index, MultiIndex): 

3874 # CategoricalIndex: Trying to use the engine fastpath may give incorrect 

3875 # results if our categories are integers that dont match our codes 

3876 # IntervalIndex: IntervalTree has no get_loc 

3877 row = self.index.get_loc(index) 

3878 return series._values[row] 

3879 

3880 # For MultiIndex going through engine effectively restricts us to 

3881 # same-length tuples; see test_get_set_value_no_partial_indexing 

3882 loc = engine.get_loc(index) 

3883 return series._values[loc] 

3884 
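    # Usage sketch: `.at` is the public scalar accessor that funnels into
    # `_get_value` (hypothetical labels):
    #
    # >>> df.at["row_label", "col_label"]      # label-based, takeable=False
    # >>> df._get_value(0, 1, takeable=True)   # positional fast path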

3885 def isetitem(self, loc, value) -> None: 

3886 """ 

3887 Set the given value in the column with position `loc`. 

3888 

3889 This is a positional analogue to ``__setitem__``. 

3890 

3891 Parameters 

3892 ---------- 

3893 loc : int or sequence of ints 

3894 Index position for the column. 

3895 value : scalar or arraylike 

3896 Value(s) for the column. 

3897 

3898 Notes 

3899 ----- 

3900 ``frame.isetitem(loc, value)`` is an in-place method as it will 

3901 modify the DataFrame in place (not returning a new object). In contrast to 

3902 ``frame.iloc[:, i] = value`` which will try to update the existing values in 

3903 place, ``frame.isetitem(loc, value)`` will not update the values of the column 

3904 itself in place, it will instead insert a new array. 

3905 

3906 In cases where ``frame.columns`` is unique, this is equivalent to 

3907 ``frame[frame.columns[i]] = value``. 

3908 """ 

3909 if isinstance(value, DataFrame): 

3910 if is_scalar(loc): 

3911 loc = [loc] 

3912 

3913 for i, idx in enumerate(loc): 

3914 arraylike = self._sanitize_column(value.iloc[:, i]) 

3915 self._iset_item_mgr(idx, arraylike, inplace=False) 

3916 return 

3917 

3918 arraylike = self._sanitize_column(value) 

3919 self._iset_item_mgr(loc, arraylike, inplace=False) 

3920 
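    # Contrast sketch: `isetitem` swaps in a new column array, while
    # ``iloc[:, i] = ...`` tries to update the existing one in place.
    #
    # >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    # >>> df.isetitem(0, [1.5, 2.5])   # column "a" becomes a new float64 array
    # >>> df.iloc[:, 1] = [5, 6]       # tries to write into the existing int64 values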

3921 def __setitem__(self, key, value): 

3922 if not PYPY and using_copy_on_write(): 

3923 if sys.getrefcount(self) <= 3: 

3924 warnings.warn( 

3925 _chained_assignment_msg, ChainedAssignmentError, stacklevel=2 

3926 ) 

3927 

3928 key = com.apply_if_callable(key, self) 

3929 

3930 # see if we can slice the rows 

3931 if isinstance(key, slice): 

3932 slc = self.index._convert_slice_indexer(key, kind="getitem") 

3933 return self._setitem_slice(slc, value) 

3934 

3935 if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2: 

3936 self._setitem_frame(key, value) 

3937 elif isinstance(key, (Series, np.ndarray, list, Index)): 

3938 self._setitem_array(key, value) 

3939 elif isinstance(value, DataFrame): 

3940 self._set_item_frame_value(key, value) 

3941 elif ( 

3942 is_list_like(value) 

3943 and not self.columns.is_unique 

3944 and 1 < len(self.columns.get_indexer_for([key])) == len(value) 

3945 ): 

3946 # Column to set is duplicated 

3947 self._setitem_array([key], value) 

3948 else: 

3949 # set column 

3950 self._set_item(key, value) 

3951 
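    # Dispatch sketch mirroring the branches above (hypothetical `df`):
    #
    # >>> df[1:3] = 0             # row slice, positional for backwards-compat
    # >>> df[df > 0] = 0          # boolean frame mask -> _setitem_frame
    # >>> df[["a", "b"]] = vals   # list-like key -> _setitem_array
    # >>> df["c"] = series        # single column -> _set_item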

3952 def _setitem_slice(self, key: slice, value) -> None: 

3953 # NB: we can't just use self.loc[key] = value because that 

3954 # operates on labels and we need to operate positional for 

3955 # backwards-compat, xref GH#31469 

3956 self._check_setitem_copy() 

3957 self.iloc[key] = value 

3958 

3959 def _setitem_array(self, key, value): 

3960 # also raises Exception if object array with NA values 

3961 if com.is_bool_indexer(key): 

3962 # bool indexer is indexing along rows 

3963 if len(key) != len(self.index): 

3964 raise ValueError( 

3965 f"Item wrong length {len(key)} instead of {len(self.index)}!" 

3966 ) 

3967 key = check_bool_indexer(self.index, key) 

3968 indexer = key.nonzero()[0] 

3969 self._check_setitem_copy() 

3970 if isinstance(value, DataFrame): 

3971 # GH#39931 reindex since iloc does not align 

3972 value = value.reindex(self.index.take(indexer)) 

3973 self.iloc[indexer] = value 

3974 

3975 else: 

3976 # Note: unlike self.iloc[:, indexer] = value, this will 

3977 # never try to overwrite values inplace 

3978 

3979 if isinstance(value, DataFrame): 

3980 check_key_length(self.columns, key, value) 

3981 for k1, k2 in zip(key, value.columns): 

3982 self[k1] = value[k2] 

3983 

3984 elif not is_list_like(value): 

3985 for col in key: 

3986 self[col] = value 

3987 

3988 elif isinstance(value, np.ndarray) and value.ndim == 2: 

3989 self._iset_not_inplace(key, value) 

3990 

3991 elif np.ndim(value) > 1: 

3992 # list of lists 

3993 value = DataFrame(value).values 

3994 return self._setitem_array(key, value) 

3995 

3996 else: 

3997 self._iset_not_inplace(key, value) 

3998 
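    # Semantics sketch: with a list key and a DataFrame value, columns are
    # paired positionally after a length check, not aligned by label
    # (hypothetical frames):
    #
    # >>> left = pd.DataFrame({"a": [1], "b": [2]})
    # >>> right = pd.DataFrame({"x": [10], "y": [20]})
    # >>> left[["a", "b"]] = right   # "a" <- "x", "b" <- "y"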

3999 def _iset_not_inplace(self, key, value): 

4000 # GH#39510 when setting with df[key] = obj with a list-like key and 

4001 # list-like value, we iterate over those listlikes and set columns 

4002 # one at a time. This is different from dispatching to 

4003 # `self.loc[:, key]= value` because loc.__setitem__ may overwrite 

4004 # data inplace, whereas this will insert new arrays. 

4005 

4006 def igetitem(obj, i: int): 

4007 # Note: we catch DataFrame obj before getting here, but 

4008 # hypothetically would return obj.iloc[:, i] 

4009 if isinstance(obj, np.ndarray): 

4010 return obj[..., i] 

4011 else: 

4012 return obj[i] 

4013 

4014 if self.columns.is_unique: 

4015 if np.shape(value)[-1] != len(key): 

4016 raise ValueError("Columns must be same length as key") 

4017 

4018 for i, col in enumerate(key): 

4019 self[col] = igetitem(value, i) 

4020 

4021 else: 

4022 ilocs = self.columns.get_indexer_non_unique(key)[0] 

4023 if (ilocs < 0).any(): 

4024 # key entries not in self.columns 

4025 raise NotImplementedError 

4026 

4027 if np.shape(value)[-1] != len(ilocs): 

4028 raise ValueError("Columns must be same length as key") 

4029 

4030 assert np.ndim(value) <= 2 

4031 

4032 orig_columns = self.columns 

4033 

4034 # Using self.iloc[:, i] = ... may set values inplace, which 

4035 # by convention we do not do in __setitem__ 

4036 try: 

4037 self.columns = Index(range(len(self.columns))) 

4038 for i, iloc in enumerate(ilocs): 

4039 self[iloc] = igetitem(value, i) 

4040 finally: 

4041 self.columns = orig_columns 

4042 

4043 def _setitem_frame(self, key, value): 

4044 # support boolean setting with DataFrame input, e.g. 

4045 # df[df > df2] = 0 

4046 if isinstance(key, np.ndarray): 

4047 if key.shape != self.shape: 

4048 raise ValueError("Array conditional must be same shape as self") 

4049 key = self._constructor(key, **self._construct_axes_dict(), copy=False) 

4050 

4051 if key.size and not all(is_bool_dtype(dtype) for dtype in key.dtypes): 

4052 raise TypeError( 

4053 "Must pass DataFrame or 2-d ndarray with boolean values only" 

4054 ) 

4055 

4056 self._check_inplace_setting(value) 

4057 self._check_setitem_copy() 

4058 self._where(-key, value, inplace=True) 

4059 

4060 def _set_item_frame_value(self, key, value: DataFrame) -> None: 

4061 self._ensure_valid_index(value) 

4062 

4063 # align columns 

4064 if key in self.columns: 

4065 loc = self.columns.get_loc(key) 

4066 cols = self.columns[loc] 

4067 len_cols = 1 if is_scalar(cols) or isinstance(cols, tuple) else len(cols) 

4068 if len_cols != len(value.columns): 

4069 raise ValueError("Columns must be same length as key") 

4070 

4071 # align right-hand-side columns if self.columns 

4072 # is multi-index and self[key] is a sub-frame 

4073 if isinstance(self.columns, MultiIndex) and isinstance( 

4074 loc, (slice, Series, np.ndarray, Index) 

4075 ): 

4076 cols_droplevel = maybe_droplevels(cols, key) 

4077 if len(cols_droplevel) and not cols_droplevel.equals(value.columns): 

4078 value = value.reindex(cols_droplevel, axis=1) 

4079 

4080 for col, col_droplevel in zip(cols, cols_droplevel): 

4081 self[col] = value[col_droplevel] 

4082 return 

4083 

4084 if is_scalar(cols): 

4085 self[cols] = value[value.columns[0]] 

4086 return 

4087 

4088 # now align rows 

4089 arraylike = _reindex_for_setitem(value, self.index) 

4090 self._set_item_mgr(key, arraylike) 

4091 return 

4092 

4093 if len(value.columns) != 1: 

4094 raise ValueError( 

4095 "Cannot set a DataFrame with multiple columns to the single " 

4096 f"column {key}" 

4097 ) 

4098 

4099 self[key] = value[value.columns[0]] 

4100 

4101 def _iset_item_mgr( 

4102 self, loc: int | slice | np.ndarray, value, inplace: bool = False 

4103 ) -> None: 

4104 # when called from _set_item_mgr loc can be anything returned from get_loc 

4105 self._mgr.iset(loc, value, inplace=inplace) 

4106 self._clear_item_cache() 

4107 

4108 def _set_item_mgr(self, key, value: ArrayLike) -> None: 

4109 try: 

4110 loc = self._info_axis.get_loc(key) 

4111 except KeyError: 

4112 # This item wasn't present, just insert at end 

4113 self._mgr.insert(len(self._info_axis), key, value) 

4114 else: 

4115 self._iset_item_mgr(loc, value) 

4116 

4117 # check if we are modifying a copy 

4118 # try to set first as we want an invalid 

4119 # value exception to occur first 

4120 if len(self): 

4121 self._check_setitem_copy() 

4122 

4123 def _iset_item(self, loc: int, value) -> None: 

4124 arraylike = self._sanitize_column(value) 

4125 self._iset_item_mgr(loc, arraylike, inplace=True) 

4126 

4127 # check if we are modifying a copy 

4128 # try to set first as we want an invalid 

4129 # value exception to occur first 

4130 if len(self): 

4131 self._check_setitem_copy() 

4132 

4133 def _set_item(self, key, value) -> None: 

4134 """ 

4135 Add series to DataFrame in specified column. 

4136 

4137 If the value is a numpy array (not a Series), it must be the 

4138 same length as the DataFrame's index or an error will be raised. 

4139 

4140 A Series will be conformed to the DataFrame's index to 

4141 ensure homogeneity. 

4142 """ 

4143 value = self._sanitize_column(value) 

4144 

4145 if ( 

4146 key in self.columns 

4147 and value.ndim == 1 

4148 and not is_extension_array_dtype(value) 

4149 ): 

4150 # broadcast across multiple columns if necessary 

4151 if not self.columns.is_unique or isinstance(self.columns, MultiIndex): 

4152 existing_piece = self[key] 

4153 if isinstance(existing_piece, DataFrame): 

4154 value = np.tile(value, (len(existing_piece.columns), 1)).T 

4155 

4156 self._set_item_mgr(key, value) 

4157 

4158 def _set_value( 

4159 self, index: IndexLabel, col, value: Scalar, takeable: bool = False 

4160 ) -> None: 

4161 """ 

4162 Put single value at passed column and index. 

4163 

4164 Parameters 

4165 ---------- 

4166 index : Label 

4167 row label 

4168 col : Label 

4169 column label 

4170 value : scalar 

4171 takeable : bool, default False 

4172 Sets whether or not index/col interpreted as indexers 

4173 """ 

4174 try: 

4175 if takeable: 

4176 icol = col 

4177 iindex = cast(int, index) 

4178 else: 

4179 icol = self.columns.get_loc(col) 

4180 iindex = self.index.get_loc(index) 

4181 self._mgr.column_setitem(icol, iindex, value, inplace_only=True) 

4182 self._clear_item_cache() 

4183 

4184 except (KeyError, TypeError, ValueError, LossySetitemError): 

4185 # get_loc might raise a KeyError for missing labels (falling back 

4186 # to (i)loc will do expansion of the index) 

4187 # column_setitem will do validation that may raise TypeError, 

4188 # ValueError, or LossySetitemError 

4189 # set using a non-recursive method & reset the cache 

4190 if takeable: 

4191 self.iloc[index, col] = value 

4192 else: 

4193 self.loc[index, col] = value 

4194 self._item_cache.pop(col, None) 

4195 

4196 except InvalidIndexError as ii_err: 

4197 # GH48729: Seems like you are trying to assign a value to a 

4198 # row when only scalar options are permitted 

4199 raise InvalidIndexError( 

4200 f"You can only assign a scalar value not a {type(value)}" 

4201 ) from ii_err 

4202 
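    # Flow sketch: `.at` assignment tries the in-place manager write first
    # and falls back to (i)loc, which also handles enlargement
    # (hypothetical `df`):
    #
    # >>> df.at[0, "a"] = 5          # in-place when the dtype allows it
    # >>> df.at["new_row", "a"] = 5  # KeyError path -> loc expands the index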

4203 def _ensure_valid_index(self, value) -> None: 

4204 """ 

4205 Ensure that if we don't have an index, that we can create one from the 

4206 passed value. 

4207 """ 

4208 # GH5632, make sure that we are a Series convertible 

4209 if not len(self.index) and is_list_like(value) and len(value): 

4210 if not isinstance(value, DataFrame): 

4211 try: 

4212 value = Series(value) 

4213 except (ValueError, NotImplementedError, TypeError) as err: 

4214 raise ValueError( 

4215 "Cannot set a frame with no defined index " 

4216 "and a value that cannot be converted to a Series" 

4217 ) from err 

4218 

4219 # GH31368 preserve name of index 

4220 index_copy = value.index.copy() 

4221 if self.index.name is not None: 

4222 index_copy.name = self.index.name 

4223 

4224 self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan) 

4225 
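    # Behavior sketch: assigning a list-like to a frame with no rows first
    # adopts an index derived from the value, so the column can be set.
    #
    # >>> empty = pd.DataFrame()
    # >>> empty["a"] = pd.Series([1, 2], index=["x", "y"])
    # >>> empty.index   # Index(['x', 'y'], dtype='object')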

4226 def _box_col_values(self, values: SingleDataManager, loc: int) -> Series: 

4227 """ 

4228 Provide boxed values for a column. 

4229 """ 

4230 # Lookup in columns so that if e.g. a str datetime was passed 

4231 # we attach the Timestamp object as the name. 

4232 name = self.columns[loc] 

4233 klass = self._constructor_sliced 

4234 # We get index=self.index bc values is a SingleDataManager 

4235 return klass(values, name=name, fastpath=True).__finalize__(self) 

4236 

4237 # ---------------------------------------------------------------------- 

4238 # Lookup Caching 

4239 

4240 def _clear_item_cache(self) -> None: 

4241 self._item_cache.clear() 

4242 

4243 def _get_item_cache(self, item: Hashable) -> Series: 

4244 """Return the cached item, item represents a label indexer.""" 

4245 if using_copy_on_write(): 

4246 loc = self.columns.get_loc(item) 

4247 return self._ixs(loc, axis=1) 

4248 

4249 cache = self._item_cache 

4250 res = cache.get(item) 

4251 if res is None: 

4252 # All places that call _get_item_cache have unique columns, 

4253 # pending resolution of GH#33047 

4254 

4255 loc = self.columns.get_loc(item) 

4256 res = self._ixs(loc, axis=1) 

4257 

4258 cache[item] = res 

4259 

4260 # for a chain 

4261 res._is_copy = self._is_copy 

4262 return res 

4263 

4264 def _reset_cacher(self) -> None: 

4265 # no-op for DataFrame 

4266 pass 

4267 

4268 def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None: 

4269 """ 

4270 The object has called back to us saying maybe it has changed. 

4271 """ 

4272 loc = self._info_axis.get_loc(item) 

4273 arraylike = value._values 

4274 

4275 old = self._ixs(loc, axis=1) 

4276 if old._values is value._values and inplace: 

4277 # GH#46149 avoid making unnecessary copies/block-splitting 

4278 return 

4279 

4280 self._mgr.iset(loc, arraylike, inplace=inplace) 

4281 

4282 # ---------------------------------------------------------------------- 

4283 # Unsorted 

4284 

4285 @overload 

4286 def query(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> DataFrame: 

4287 ... 

4288 

4289 @overload 

4290 def query(self, expr: str, *, inplace: Literal[True], **kwargs) -> None: 

4291 ... 

4292 

4293 @overload 

4294 def query(self, expr: str, *, inplace: bool = ..., **kwargs) -> DataFrame | None: 

4295 ... 

4296 

4297 def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | None: 

4298 """ 

4299 Query the columns of a DataFrame with a boolean expression. 

4300 

4301 Parameters 

4302 ---------- 

4303 expr : str 

4304 The query string to evaluate. 

4305 

4306 You can refer to variables 

4307 in the environment by prefixing them with an '@' character like 

4308 ``@a + b``. 

4309 

4310 You can refer to column names that are not valid Python variable names 

4311 by surrounding them in backticks. Thus, column names containing spaces 

4312 or punctuation (besides underscores) or starting with digits must be 

4313 surrounded by backticks. (For example, a column named "Area (cm^2)" would 

4314 be referenced as ```Area (cm^2)```). Column names which are Python keywords 

4315 (like "list", "for", "import", etc) cannot be used. 

4316 

4317 For example, if one of your columns is called ``a a`` and you want 

4318 to sum it with ``b``, your query should be ```a a` + b``. 

4319 

4320 inplace : bool, default False 

4321 Whether to modify the DataFrame rather than creating a new one. 

4322 **kwargs 

4323 See the documentation for :func:`eval` for complete details 

4324 on the keyword arguments accepted by :meth:`DataFrame.query`. 

4325 

4326 Returns 

4327 ------- 

4328 DataFrame or None 

4329 DataFrame resulting from the provided query expression or 

4330 None if ``inplace=True``. 

4331 

4332 See Also 

4333 -------- 

4334 eval : Evaluate a string describing operations on 

4335 DataFrame columns. 

4336 DataFrame.eval : Evaluate a string describing operations on 

4337 DataFrame columns. 

4338 

4339 Notes 

4340 ----- 

4341 The result of the evaluation of this expression is first passed to 

4342 :attr:`DataFrame.loc` and if that fails because of a 

4343 multidimensional key (e.g., a DataFrame) then the result will be passed 

4344 to :meth:`DataFrame.__getitem__`. 

4345 

4346 This method uses the top-level :func:`eval` function to 

4347 evaluate the passed query. 

4348 

4349 The :meth:`~pandas.DataFrame.query` method uses a slightly 

4350 modified Python syntax by default. For example, the ``&`` and ``|`` 

4351 (bitwise) operators have the precedence of their boolean cousins, 

4352 :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python, 

4353 however the semantics are different. 

4354 

4355 You can change the semantics of the expression by passing the keyword 

4356 argument ``parser='python'``. This enforces the same semantics as 

4357 evaluation in Python space. Likewise, you can pass ``engine='python'`` 

4358 to evaluate an expression using Python itself as a backend. This is not 

4359 recommended as it is inefficient compared to using ``numexpr`` as the 

4360 engine. 

4361 

4362 The :attr:`DataFrame.index` and 

4363 :attr:`DataFrame.columns` attributes of the 

4364 :class:`~pandas.DataFrame` instance are placed in the query namespace 

4365 by default, which allows you to treat both the index and columns of the 

4366 frame as a column in the frame. 

4367 The identifier ``index`` is used for the frame index; you can also 

4368 use the name of the index to identify it in a query. Please note that 

4369 Python keywords may not be used as identifiers. 

4370 

4371 For further details and examples see the ``query`` documentation in 

4372 :ref:`indexing <indexing.query>`. 

4373 

4374 *Backtick quoted variables* 

4375 

4376 Backtick quoted variables are parsed as literal Python code and 

4377 are converted internally to a valid Python identifier. 

4378 This can lead to the following problems. 

4379 

4380 During parsing a number of disallowed characters inside the backtick 

4381 quoted string are replaced by strings that are allowed as a Python identifier. 

4382 These characters include all operators in Python, the space character, the 

4383 question mark, the exclamation mark, the dollar sign, and the euro sign. 

4384 For other characters that fall outside the ASCII range (U+0001..U+007F) 

4385 and those that are not further specified in PEP 3131, 

4386 the query parser will raise an error. 

4387 This excludes whitespace other than the space character, 

4388 as well as the hash sign (as it is used for comments) and the backtick 

4389 itself (the backtick cannot be escaped). 

4390 

4391 In a special case, quotes that make a pair around a backtick can 

4392 confuse the parser. 

4393 For example, ```it's` > `that's``` will raise an error, 

4394 as it forms a quoted string (``'s > `that'``) with a backtick inside. 

4395 

4396 See also the Python documentation about lexical analysis 

4397 (https://docs.python.org/3/reference/lexical_analysis.html) 

4398 in combination with the source code in :mod:`pandas.core.computation.parsing`. 

4399 

4400 Examples 

4401 -------- 

4402 >>> df = pd.DataFrame({'A': range(1, 6), 

4403 ... 'B': range(10, 0, -2), 

4404 ... 'C C': range(10, 5, -1)}) 

4405 >>> df 

4406 A B C C 

4407 0 1 10 10 

4408 1 2 8 9 

4409 2 3 6 8 

4410 3 4 4 7 

4411 4 5 2 6 

4412 >>> df.query('A > B') 

4413 A B C C 

4414 4 5 2 6 

4415 

4416 The previous expression is equivalent to 

4417 

4418 >>> df[df.A > df.B] 

4419 A B C C 

4420 4 5 2 6 

4421 

4422 For columns with spaces in their name, you can use backtick quoting. 

4423 

4424 >>> df.query('B == `C C`') 

4425 A B C C 

4426 0 1 10 10 

4427 

4428 The previous expression is equivalent to 

4429 

4430 >>> df[df.B == df['C C']] 

4431 A B C C 

4432 0 1 10 10 

4433 """ 

4434 inplace = validate_bool_kwarg(inplace, "inplace") 

4435 if not isinstance(expr, str): 

4436 msg = f"expr must be a string to be evaluated, {type(expr)} given" 

4437 raise ValueError(msg) 

4438 kwargs["level"] = kwargs.pop("level", 0) + 1 

4439 kwargs["target"] = None 

4440 res = self.eval(expr, **kwargs) 

4441 

4442 try: 

4443 result = self.loc[res] 

4444 except ValueError: 

4445 # when res is multi-dimensional loc raises, but this is sometimes a 

4446 # valid query 

4447 result = self[res] 

4448 

4449 if inplace: 

4450 self._update_inplace(result) 

4451 return None 

4452 else: 

4453 return result 

4454 
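    # Combined usage sketch with the docstring's frame: environment
    # variables via '@' and backtick-quoted column names compose freely.
    #
    # >>> threshold = 5
    # >>> df.query("A < B and `C C` > @threshold")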

4455 @overload 

4456 def eval(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> Any: 

4457 ... 

4458 

4459 @overload 

4460 def eval(self, expr: str, *, inplace: Literal[True], **kwargs) -> None: 

4461 ... 

4462 

4463 def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None: 

4464 """ 

4465 Evaluate a string describing operations on DataFrame columns. 

4466 

4467 Operates on columns only, not specific rows or elements. This allows 

4468 `eval` to run arbitrary code, which can make you vulnerable to code 

4469 injection if you pass user input to this function. 

4470 

4471 Parameters 

4472 ---------- 

4473 expr : str 

4474 The expression string to evaluate. 

4475 inplace : bool, default False 

4476 If the expression contains an assignment, whether to perform the 

4477 operation inplace and mutate the existing DataFrame. Otherwise, 

4478 a new DataFrame is returned. 

4479 **kwargs 

4480 See the documentation for :func:`eval` for complete details 

4481 on the keyword arguments accepted by 

4482 :meth:`~pandas.DataFrame.query`. 

4483 

4484 Returns 

4485 ------- 

4486 ndarray, scalar, pandas object, or None 

4487 The result of the evaluation or None if ``inplace=True``. 

4488 

4489 See Also 

4490 -------- 

4491 DataFrame.query : Evaluates a boolean expression to query the columns 

4492 of a frame. 

4493 DataFrame.assign : Can evaluate an expression or function to create new 

4494 values for a column. 

4495 eval : Evaluate a Python expression as a string using various 

4496 backends. 

4497 

4498 Notes 

4499 ----- 

4500 For more details see the API documentation for :func:`~eval`. 

4501 For detailed examples see :ref:`enhancing performance with eval 

4502 <enhancingperf.eval>`. 

4503 

4504 Examples 

4505 -------- 

4506 >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)}) 

4507 >>> df 

4508 A B 

4509 0 1 10 

4510 1 2 8 

4511 2 3 6 

4512 3 4 4 

4513 4 5 2 

4514 >>> df.eval('A + B') 

4515 0 11 

4516 1 10 

4517 2 9 

4518 3 8 

4519 4 7 

4520 dtype: int64 

4521 

4522 Assignment is allowed though by default the original DataFrame is not 

4523 modified. 

4524 

4525 >>> df.eval('C = A + B') 

4526 A B C 

4527 0 1 10 11 

4528 1 2 8 10 

4529 2 3 6 9 

4530 3 4 4 8 

4531 4 5 2 7 

4532 >>> df 

4533 A B 

4534 0 1 10 

4535 1 2 8 

4536 2 3 6 

4537 3 4 4 

4538 4 5 2 

4539 

4540 Multiple columns can be assigned to using multi-line expressions: 

4541 

4542 >>> df.eval( 

4543 ... ''' 

4544 ... C = A + B 

4545 ... D = A - B 

4546 ... ''' 

4547 ... ) 

4548 A B C D 

4549 0 1 10 11 -9 

4550 1 2 8 10 -6 

4551 2 3 6 9 -3 

4552 3 4 4 8 0 

4553 4 5 2 7 3 

4554 """ 

4555 from pandas.core.computation.eval import eval as _eval 

4556 

4557 inplace = validate_bool_kwarg(inplace, "inplace") 

4558 kwargs["level"] = kwargs.pop("level", 0) + 1 

4559 index_resolvers = self._get_index_resolvers() 

4560 column_resolvers = self._get_cleaned_column_resolvers() 

4561 resolvers = column_resolvers, index_resolvers 

4562 if "target" not in kwargs: 

4563 kwargs["target"] = self 

4564 kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers 

4565 

4566 return _eval(expr, inplace=inplace, **kwargs) 

4567 
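    # Usage sketch: with ``inplace=True`` an assignment expression mutates
    # the frame itself and the call returns None (docstring's `df`).
    #
    # >>> df.eval("C = A + B", inplace=True)
    # >>> "C" in df.columns   # True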

4568 def select_dtypes(self, include=None, exclude=None) -> DataFrame: 

4569 """ 

4570 Return a subset of the DataFrame's columns based on the column dtypes. 

4571 

4572 Parameters 

4573 ---------- 

4574 include, exclude : scalar or list-like 

4575 A selection of dtypes or strings to be included/excluded. At least 

4576 one of these parameters must be supplied. 

4577 

4578 Returns 

4579 ------- 

4580 DataFrame 

4581 The subset of the frame including the dtypes in ``include`` and 

4582 excluding the dtypes in ``exclude``. 

4583 

4584 Raises 

4585 ------ 

4586 ValueError 

4587 * If both of ``include`` and ``exclude`` are empty 

4588 * If ``include`` and ``exclude`` have overlapping elements 

4589 * If any kind of string dtype is passed in. 

4590 

4591 See Also 

4592 -------- 

4593 DataFrame.dtypes: Return Series with the data type of each column. 

4594 

4595 Notes 

4596 ----- 

4597 * To select all *numeric* types, use ``np.number`` or ``'number'`` 

4598 * To select strings you must use the ``object`` dtype, but note that 

4599 this will return *all* object dtype columns 

4600 * See the `numpy dtype hierarchy 

4601 <https://numpy.org/doc/stable/reference/arrays.scalars.html>`__ 

4602 * To select datetimes, use ``np.datetime64``, ``'datetime'`` or 

4603 ``'datetime64'`` 

4604 * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or 

4605 ``'timedelta64'`` 

4606 * To select Pandas categorical dtypes, use ``'category'`` 

4607 * To select Pandas datetimetz dtypes, use ``'datetimetz'`` or 

4608 ``'datetime64[ns, tz]'`` 

4609 

4610 Examples 

4611 -------- 

4612 >>> df = pd.DataFrame({'a': [1, 2] * 3, 

4613 ... 'b': [True, False] * 3, 

4614 ... 'c': [1.0, 2.0] * 3}) 

4615 >>> df 

4616 a b c 

4617 0 1 True 1.0 

4618 1 2 False 2.0 

4619 2 1 True 1.0 

4620 3 2 False 2.0 

4621 4 1 True 1.0 

4622 5 2 False 2.0 

4623 

4624 >>> df.select_dtypes(include='bool') 

4625 b 

4626 0 True 

4627 1 False 

4628 2 True 

4629 3 False 

4630 4 True 

4631 5 False 

4632 

4633 >>> df.select_dtypes(include=['float64']) 

4634 c 

4635 0 1.0 

4636 1 2.0 

4637 2 1.0 

4638 3 2.0 

4639 4 1.0 

4640 5 2.0 

4641 

4642 >>> df.select_dtypes(exclude=['int64']) 

4643 b c 

4644 0 True 1.0 

4645 1 False 2.0 

4646 2 True 1.0 

4647 3 False 2.0 

4648 4 True 1.0 

4649 5 False 2.0 

4650 """ 

4651 if not is_list_like(include): 

4652 include = (include,) if include is not None else () 

4653 if not is_list_like(exclude): 

4654 exclude = (exclude,) if exclude is not None else () 

4655 

4656 selection = (frozenset(include), frozenset(exclude)) 

4657 

4658 if not any(selection): 

4659 raise ValueError("at least one of include or exclude must be nonempty") 

4660 

4661 # convert the myriad valid dtypes object to a single representation 

4662 def check_int_infer_dtype(dtypes): 

4663 converted_dtypes: list[type] = [] 

4664 for dtype in dtypes: 

4665 # Numpy maps int to different types (int32, int64) on Windows and Linux 

4666 # see https://github.com/numpy/numpy/issues/9464 

4667 if (isinstance(dtype, str) and dtype == "int") or (dtype is int): 

4668 converted_dtypes.append(np.int32) 

4669 converted_dtypes.append(np.int64) 

4670 elif dtype == "float" or dtype is float: 

4671 # GH#42452 : np.dtype("float") coerces to np.float64 from Numpy 1.20 

4672 converted_dtypes.extend([np.float64, np.float32]) 

4673 else: 

4674 converted_dtypes.append(infer_dtype_from_object(dtype)) 

4675 return frozenset(converted_dtypes) 

4676 

4677 include = check_int_infer_dtype(include) 

4678 exclude = check_int_infer_dtype(exclude) 

4679 

4680 for dtypes in (include, exclude): 

4681 invalidate_string_dtypes(dtypes) 

4682 

4683 # can't both include AND exclude! 

4684 if not include.isdisjoint(exclude): 

4685 raise ValueError(f"include and exclude overlap on {(include & exclude)}") 

4686 

4687 def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool: 

4688 # GH 46870: BooleanDtype._is_numeric == True but should be excluded 

4689 dtype = dtype if not isinstance(dtype, ArrowDtype) else dtype.numpy_dtype 

4690 return issubclass(dtype.type, tuple(dtypes_set)) or ( 

4691 np.number in dtypes_set 

4692 and getattr(dtype, "_is_numeric", False) 

4693 and not is_bool_dtype(dtype) 

4694 ) 

4695 

4696 def predicate(arr: ArrayLike) -> bool: 

4697 dtype = arr.dtype 

4698 if include: 

4699 if not dtype_predicate(dtype, include): 

4700 return False 

4701 

4702 if exclude: 

4703 if dtype_predicate(dtype, exclude): 

4704 return False 

4705 

4706 return True 

4707 

4708 mgr = self._mgr._get_data_subset(predicate).copy(deep=None) 

4709 return type(self)(mgr).__finalize__(self) 

4710 
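    # Usage sketch with the docstring's frame: ``np.number`` selects every
    # numeric column at once, and include/exclude predicates combine.
    #
    # >>> df.select_dtypes(include=np.number)   # int64 and float64 columns
    # >>> df.select_dtypes(include=np.number, exclude=["float64"])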

4711 def insert( 

4712 self, 

4713 loc: int, 

4714 column: Hashable, 

4715 value: Scalar | AnyArrayLike, 

4716 allow_duplicates: bool | lib.NoDefault = lib.no_default, 

4717 ) -> None: 

4718 """ 

4719 Insert column into DataFrame at specified location. 

4720 

4721 Raises a ValueError if `column` is already contained in the DataFrame, 

4722 unless `allow_duplicates` is set to True. 

4723 

4724 Parameters 

4725 ---------- 

4726 loc : int 

4727 Insertion index. Must satisfy 0 <= loc <= len(columns). 

4728 column : str, number, or hashable object 

4729 Label of the inserted column. 

4730 value : Scalar, Series, or array-like 

4731 allow_duplicates : bool, optional, default lib.no_default 

4732 Allow duplicate column labels to be created. 

4733 See Also 

4734 -------- 

4735 Index.insert : Insert new item by index. 

4736 

4737 Examples 

4738 -------- 

4739 >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) 

4740 >>> df 

4741 col1 col2 

4742 0 1 3 

4743 1 2 4 

4744 >>> df.insert(1, "newcol", [99, 99]) 

4745 >>> df 

4746 col1 newcol col2 

4747 0 1 99 3 

4748 1 2 99 4 

4749 >>> df.insert(0, "col1", [100, 100], allow_duplicates=True) 

4750 >>> df 

4751 col1 col1 newcol col2 

4752 0 100 1 99 3 

4753 1 100 2 99 4 

4754 

4755 Notice that pandas uses index alignment when `value` is a `Series`: 

4756 

4757 >>> df.insert(0, "col0", pd.Series([5, 6], index=[1, 2])) 

4758 >>> df 

4759 col0 col1 col1 newcol col2 

4760 0 NaN 100 1 99 3 

4761 1 5.0 100 2 99 4 

4762 """ 

4763 if allow_duplicates is lib.no_default: 

4764 allow_duplicates = False 

4765 if allow_duplicates and not self.flags.allows_duplicate_labels: 

4766 raise ValueError( 

4767 "Cannot specify 'allow_duplicates=True' when " 

4768 "'self.flags.allows_duplicate_labels' is False." 

4769 ) 

4770 if not allow_duplicates and column in self.columns: 

4771 # Should this be a different kind of error?? 

4772 raise ValueError(f"cannot insert {column}, already exists") 

4773 if not isinstance(loc, int): 

4774 raise TypeError("loc must be int") 

4775 

4776 value = self._sanitize_column(value) 

4777 self._mgr.insert(loc, column, value) 

4778 
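    # Usage sketch: `insert` mutates in place and returns None, so a
    # computed column can be placed at an exact position (hypothetical df).
    #
    # >>> df = pd.DataFrame({"col1": [1, 2], "col2": [3, 4]})
    # >>> df.insert(1, "mid", df["col1"] + df["col2"])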

4779 def assign(self, **kwargs) -> DataFrame: 

4780 r""" 

4781 Assign new columns to a DataFrame. 

4782 

4783 Returns a new object with all original columns in addition to new ones. 

4784 Existing columns that are re-assigned will be overwritten. 

4785 

4786 Parameters 

4787 ---------- 

4788 **kwargs : dict of {str: callable or Series} 

4789 The column names are keywords. If the values are 

4790 callable, they are computed on the DataFrame and 

4791 assigned to the new columns. The callable must not 

4792 change input DataFrame (though pandas doesn't check it). 

4793 If the values are not callable, (e.g. a Series, scalar, or array), 

4794 they are simply assigned. 

4795 

4796 Returns 

4797 ------- 

4798 DataFrame 

4799 A new DataFrame with the new columns in addition to 

4800 all the existing columns. 

4801 

4802 Notes 

4803 ----- 

4804 Assigning multiple columns within the same ``assign`` is possible. 

4805 Later items in '\*\*kwargs' may refer to newly created or modified 

4806 columns in 'df'; items are computed and assigned into 'df' in order. 

4807 

4808 Examples 

4809 -------- 

4810 >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]}, 

4811 ... index=['Portland', 'Berkeley']) 

4812 >>> df 

4813 temp_c 

4814 Portland 17.0 

4815 Berkeley 25.0 

4816 

4817 Where the value is a callable, evaluated on `df`: 

4818 

4819 >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32) 

4820 temp_c temp_f 

4821 Portland 17.0 62.6 

4822 Berkeley 25.0 77.0 

4823 

4824 Alternatively, the same behavior can be achieved by directly 

4825 referencing an existing Series or sequence: 

4826 

4827 >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32) 

4828 temp_c temp_f 

4829 Portland 17.0 62.6 

4830 Berkeley 25.0 77.0 

4831 

4832 You can create multiple columns within the same assign where one 

4833 of the columns depends on another one defined within the same assign: 

4834 

4835 >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32, 

4836 ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9) 

4837 temp_c temp_f temp_k 

4838 Portland 17.0 62.6 290.15 

4839 Berkeley 25.0 77.0 298.15 

4840 """ 

4841 data = self.copy(deep=None) 

4842 

4843 for k, v in kwargs.items(): 

4844 data[k] = com.apply_if_callable(v, data) 

4845 return data 

4846 

4847 def _sanitize_column(self, value) -> ArrayLike: 

4848 """ 

4849 Ensures new columns (which go into the BlockManager as new blocks) are 

4850 always copied and converted into an array. 

4851 

4852 Parameters 

4853 ---------- 

4854 value : scalar, Series, or array-like 

4855 

4856 Returns 

4857 ------- 

4858 numpy.ndarray or ExtensionArray 

4859 """ 

4860 self._ensure_valid_index(value) 

4861 

4862 # We can get there through isetitem with a DataFrame 

4863 # or through loc single_block_path 

4864 if isinstance(value, DataFrame): 

4865 return _reindex_for_setitem(value, self.index) 

4866 elif is_dict_like(value): 

4867 return _reindex_for_setitem(Series(value), self.index) 

4868 

4869 if is_list_like(value): 

4870 com.require_length_match(value, self.index) 

4871 return sanitize_array(value, self.index, copy=True, allow_2d=True) 

4872 

4873 @property 

4874 def _series(self): 

4875 return { 

4876 item: Series( 

4877 self._mgr.iget(idx), index=self.index, name=item, fastpath=True 

4878 ) 

4879 for idx, item in enumerate(self.columns) 

4880 } 

4881 

4882 # ---------------------------------------------------------------------- 

4883 # Reindexing and alignment 

4884 

4885 def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy): 

4886 frame = self 

4887 

4888 columns = axes["columns"] 

4889 if columns is not None: 

4890 frame = frame._reindex_columns( 

4891 columns, method, copy, level, fill_value, limit, tolerance 

4892 ) 

4893 

4894 index = axes["index"] 

4895 if index is not None: 

4896 frame = frame._reindex_index( 

4897 index, method, copy, level, fill_value, limit, tolerance 

4898 ) 

4899 

4900 return frame 

4901 

4902 def _reindex_index( 

4903 self, 

4904 new_index, 

4905 method, 

4906 copy: bool, 

4907 level: Level, 

4908 fill_value=np.nan, 

4909 limit=None, 

4910 tolerance=None, 

4911 ): 

4912 new_index, indexer = self.index.reindex( 

4913 new_index, method=method, level=level, limit=limit, tolerance=tolerance 

4914 ) 

4915 return self._reindex_with_indexers( 

4916 {0: [new_index, indexer]}, 

4917 copy=copy, 

4918 fill_value=fill_value, 

4919 allow_dups=False, 

4920 ) 

4921 

4922 def _reindex_columns( 

4923 self, 

4924 new_columns, 

4925 method, 

4926 copy: bool, 

4927 level: Level, 

4928 fill_value=None, 

4929 limit=None, 

4930 tolerance=None, 

4931 ): 

4932 new_columns, indexer = self.columns.reindex( 

4933 new_columns, method=method, level=level, limit=limit, tolerance=tolerance 

4934 ) 

4935 return self._reindex_with_indexers( 

4936 {1: [new_columns, indexer]}, 

4937 copy=copy, 

4938 fill_value=fill_value, 

4939 allow_dups=False, 

4940 ) 

4941 

4942 def _reindex_multi( 

4943 self, axes: dict[str, Index], copy: bool, fill_value 

4944 ) -> DataFrame: 

4945 """ 

4946 We are guaranteed non-Nones in the axes. 

4947 """ 

4948 

4949 new_index, row_indexer = self.index.reindex(axes["index"]) 

4950 new_columns, col_indexer = self.columns.reindex(axes["columns"]) 

4951 

4952 if row_indexer is not None and col_indexer is not None: 

4953 # Fastpath. By doing two 'take's at once we avoid making an 

4954 # unnecessary copy. 

4955 # We only get here with `not self._is_mixed_type`, which (almost) 

4956 # ensures that self.values is cheap. It may be worth making this 

4957 # condition more specific. 

4958 indexer = row_indexer, col_indexer 

4959 new_values = take_2d_multi(self.values, indexer, fill_value=fill_value) 

4960 return self._constructor( 

4961 new_values, index=new_index, columns=new_columns, copy=False 

4962 ) 

4963 else: 

4964 return self._reindex_with_indexers( 

4965 {0: [new_index, row_indexer], 1: [new_columns, col_indexer]}, 

4966 copy=copy, 

4967 fill_value=fill_value, 

4968 ) 

4969 

4970 @doc(NDFrame.align, **_shared_doc_kwargs) 

4971 def align( 

4972 self, 

4973 other: DataFrame, 

4974 join: AlignJoin = "outer", 

4975 axis: Axis | None = None, 

4976 level: Level = None, 

4977 copy: bool | None = None, 

4978 fill_value=None, 

4979 method: FillnaOptions | None = None, 

4980 limit: int | None = None, 

4981 fill_axis: Axis = 0, 

4982 broadcast_axis: Axis | None = None, 

4983 ) -> DataFrame: 

4984 return super().align( 

4985 other, 

4986 join=join, 

4987 axis=axis, 

4988 level=level, 

4989 copy=copy, 

4990 fill_value=fill_value, 

4991 method=method, 

4992 limit=limit, 

4993 fill_axis=fill_axis, 

4994 broadcast_axis=broadcast_axis, 

4995 ) 

4996 

4997 @Appender( 

4998 """ 

4999 Examples 

5000 -------- 

5001 >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) 

5002 

5003 Change the row labels. 

5004 

5005 >>> df.set_axis(['a', 'b', 'c'], axis='index') 

5006 A B 

5007 a 1 4 

5008 b 2 5 

5009 c 3 6 

5010 

5011 Change the column labels. 

5012 

5013 >>> df.set_axis(['I', 'II'], axis='columns') 

5014 I II 

5015 0 1 4 

5016 1 2 5 

5017 2 3 6 

5018 """ 

5019 ) 

5020 @Substitution( 

5021 **_shared_doc_kwargs, 

5022 extended_summary_sub=" column or", 

5023 axis_description_sub=", and 1 identifies the columns", 

5024 see_also_sub=" or columns", 

5025 ) 

5026 @Appender(NDFrame.set_axis.__doc__) 

5027 def set_axis( 

5028 self, 

5029 labels, 

5030 *, 

5031 axis: Axis = 0, 

5032 copy: bool | None = None, 

5033 ) -> DataFrame: 

5034 return super().set_axis(labels, axis=axis, copy=copy) 

5035 

5036 @doc( 

5037 NDFrame.reindex, 

5038 klass=_shared_doc_kwargs["klass"], 

5039 optional_reindex=_shared_doc_kwargs["optional_reindex"], 

5040 ) 

5041 def reindex( # type: ignore[override] 

5042 self, 

5043 labels=None, 

5044 *, 

5045 index=None, 

5046 columns=None, 

5047 axis: Axis | None = None, 

5048 method: str | None = None, 

5049 copy: bool | None = None, 

5050 level: Level | None = None, 

5051 fill_value: Scalar | None = np.nan, 

5052 limit: int | None = None, 

5053 tolerance=None, 

5054 ) -> DataFrame: 

5055 return super().reindex( 

5056 labels=labels, 

5057 index=index, 

5058 columns=columns, 

5059 axis=axis, 

5060 method=method, 

5061 copy=copy, 

5062 level=level, 

5063 fill_value=fill_value, 

5064 limit=limit, 

5065 tolerance=tolerance, 

5066 ) 

5067 

5068 @overload 

5069 def drop( 

5070 self, 

5071 labels: IndexLabel = ..., 

5072 *, 

5073 axis: Axis = ..., 

5074 index: IndexLabel = ..., 

5075 columns: IndexLabel = ..., 

5076 level: Level = ..., 

5077 inplace: Literal[True], 

5078 errors: IgnoreRaise = ..., 

5079 ) -> None: 

5080 ... 

5081 

5082 @overload 

5083 def drop( 

5084 self, 

5085 labels: IndexLabel = ..., 

5086 *, 

5087 axis: Axis = ..., 

5088 index: IndexLabel = ..., 

5089 columns: IndexLabel = ..., 

5090 level: Level = ..., 

5091 inplace: Literal[False] = ..., 

5092 errors: IgnoreRaise = ..., 

5093 ) -> DataFrame: 

5094 ... 

5095 

5096 @overload 

5097 def drop( 

5098 self, 

5099 labels: IndexLabel = ..., 

5100 *, 

5101 axis: Axis = ..., 

5102 index: IndexLabel = ..., 

5103 columns: IndexLabel = ..., 

5104 level: Level = ..., 

5105 inplace: bool = ..., 

5106 errors: IgnoreRaise = ..., 

5107 ) -> DataFrame | None: 

5108 ... 

5109 

5110 def drop( 

5111 self, 

5112 labels: IndexLabel = None, 

5113 *, 

5114 axis: Axis = 0, 

5115 index: IndexLabel = None, 

5116 columns: IndexLabel = None, 

5117 level: Level = None, 

5118 inplace: bool = False, 

5119 errors: IgnoreRaise = "raise", 

5120 ) -> DataFrame | None: 

5121 """ 

5122 Drop specified labels from rows or columns. 

5123 

5124 Remove rows or columns by specifying label names and corresponding 

5125 axis, or by specifying directly index or column names. When using a 

5126 multi-index, labels on different levels can be removed by specifying 

5127 the level. See the :ref:`user guide <advanced.shown_levels>` 

5128 for more information about the now unused levels. 

5129 

5130 Parameters 

5131 ---------- 

5132 labels : single label or list-like 

5133 Index or column labels to drop. A tuple will be used as a single 

5134 label and not treated as a list-like. 

5135 axis : {0 or 'index', 1 or 'columns'}, default 0 

5136 Whether to drop labels from the index (0 or 'index') or 

5137 columns (1 or 'columns'). 

5138 index : single label or list-like 

5139 Alternative to specifying axis (``labels, axis=0`` 

5140 is equivalent to ``index=labels``). 

5141 columns : single label or list-like 

5142 Alternative to specifying axis (``labels, axis=1`` 

5143 is equivalent to ``columns=labels``). 

5144 level : int or level name, optional 

5145 For MultiIndex, level from which the labels will be removed. 

5146 inplace : bool, default False 

5147 If False, return a copy. Otherwise, do the operation 

5148 in place and return None. 

5149 errors : {'ignore', 'raise'}, default 'raise' 

5150 If 'ignore', suppress error and only existing labels are 

5151 dropped. 

5152 

5153 Returns 

5154 ------- 

5155 DataFrame or None 

5156 DataFrame without the removed index or column labels or 

5157 None if ``inplace=True``. 

5158 

5159 Raises 

5160 ------ 

5161 KeyError 

5162 If any of the labels is not found in the selected axis. 

5163 

5164 See Also 

5165 -------- 

5166 DataFrame.loc : Label-location based indexer for selection by label. 

5167 DataFrame.dropna : Return DataFrame with labels on given axis omitted 

5168 where (all or any) data are missing. 

5169 DataFrame.drop_duplicates : Return DataFrame with duplicate rows 

5170 removed, optionally only considering certain columns. 

5171 Series.drop : Return Series with specified index labels removed. 

5172 

5173 Examples 

5174 -------- 

5175 >>> df = pd.DataFrame(np.arange(12).reshape(3, 4), 

5176 ... columns=['A', 'B', 'C', 'D']) 

5177 >>> df 

5178 A B C D 

5179 0 0 1 2 3 

5180 1 4 5 6 7 

5181 2 8 9 10 11 

5182 

5183 Drop columns 

5184 

5185 >>> df.drop(['B', 'C'], axis=1) 

5186 A D 

5187 0 0 3 

5188 1 4 7 

5189 2 8 11 

5190 

5191 >>> df.drop(columns=['B', 'C']) 

5192 A D 

5193 0 0 3 

5194 1 4 7 

5195 2 8 11 

5196 

5197 Drop a row by index 

5198 

5199 >>> df.drop([0, 1]) 

5200 A B C D 

5201 2 8 9 10 11 

5202 

5203 Drop columns and/or rows of MultiIndex DataFrame 

5204 

5205 >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'], 

5206 ... ['speed', 'weight', 'length']], 

5207 ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], 

5208 ... [0, 1, 2, 0, 1, 2, 0, 1, 2]]) 

5209 >>> df = pd.DataFrame(index=midx, columns=['big', 'small'], 

5210 ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20], 

5211 ... [250, 150], [1.5, 0.8], [320, 250], 

5212 ... [1, 0.8], [0.3, 0.2]]) 

5213 >>> df 

5214 big small 

5215 lama speed 45.0 30.0 

5216 weight 200.0 100.0 

5217 length 1.5 1.0 

5218 cow speed 30.0 20.0 

5219 weight 250.0 150.0 

5220 length 1.5 0.8 

5221 falcon speed 320.0 250.0 

5222 weight 1.0 0.8 

5223 length 0.3 0.2 

5224 

5225 Drop a specific index combination from the MultiIndex 

5226 DataFrame, i.e., drop the combination ``'falcon'`` and 

5227 ``'weight'``, which deletes only the corresponding row 

5228 

5229 >>> df.drop(index=('falcon', 'weight')) 

5230 big small 

5231 lama speed 45.0 30.0 

5232 weight 200.0 100.0 

5233 length 1.5 1.0 

5234 cow speed 30.0 20.0 

5235 weight 250.0 150.0 

5236 length 1.5 0.8 

5237 falcon speed 320.0 250.0 

5238 length 0.3 0.2 

5239 

5240 >>> df.drop(index='cow', columns='small') 

5241 big 

5242 lama speed 45.0 

5243 weight 200.0 

5244 length 1.5 

5245 falcon speed 320.0 

5246 weight 1.0 

5247 length 0.3 

5248 

5249 >>> df.drop(index='length', level=1) 

5250 big small 

5251 lama speed 45.0 30.0 

5252 weight 200.0 100.0 

5253 cow speed 30.0 20.0 

5254 weight 250.0 150.0 

5255 falcon speed 320.0 250.0 

5256 weight 1.0 0.8 
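
If ``errors='ignore'``, labels that are not found are skipped instead of
raising a ``KeyError``; dropping the absent label ``'llama'`` therefore
returns the frame unchanged:

>>> df.drop(index='llama', errors='ignore').shape
(9, 2)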

5257 """ 

5258 return super().drop( 

5259 labels=labels, 

5260 axis=axis, 

5261 index=index, 

5262 columns=columns, 

5263 level=level, 

5264 inplace=inplace, 

5265 errors=errors, 

5266 ) 

5267 

5268 @overload 

5269 def rename( 

5270 self, 

5271 mapper: Renamer | None = ..., 

5272 *, 

5273 index: Renamer | None = ..., 

5274 columns: Renamer | None = ..., 

5275 axis: Axis | None = ..., 

5276 copy: bool | None = ..., 

5277 inplace: Literal[True], 

5278 level: Level = ..., 

5279 errors: IgnoreRaise = ..., 

5280 ) -> None: 

5281 ... 

5282 

5283 @overload 

5284 def rename( 

5285 self, 

5286 mapper: Renamer | None = ..., 

5287 *, 

5288 index: Renamer | None = ..., 

5289 columns: Renamer | None = ..., 

5290 axis: Axis | None = ..., 

5291 copy: bool | None = ..., 

5292 inplace: Literal[False] = ..., 

5293 level: Level = ..., 

5294 errors: IgnoreRaise = ..., 

5295 ) -> DataFrame: 

5296 ... 

5297 

5298 @overload 

5299 def rename( 

5300 self, 

5301 mapper: Renamer | None = ..., 

5302 *, 

5303 index: Renamer | None = ..., 

5304 columns: Renamer | None = ..., 

5305 axis: Axis | None = ..., 

5306 copy: bool | None = ..., 

5307 inplace: bool = ..., 

5308 level: Level = ..., 

5309 errors: IgnoreRaise = ..., 

5310 ) -> DataFrame | None: 

5311 ... 

5312 

5313 def rename( 

5314 self, 

5315 mapper: Renamer | None = None, 

5316 *, 

5317 index: Renamer | None = None, 

5318 columns: Renamer | None = None, 

5319 axis: Axis | None = None, 

5320 copy: bool | None = None, 

5321 inplace: bool = False, 

5322 level: Level = None, 

5323 errors: IgnoreRaise = "ignore", 

5324 ) -> DataFrame | None: 

5325 """ 

5326 Rename columns or index labels. 

5327 

5328 Function / dict values must be unique (1-to-1). Labels not contained in 

5329 a dict / Series will be left as-is. Extra labels listed don't throw an 

5330 error. 

5331 

5332 See the :ref:`user guide <basics.rename>` for more. 

5333 

5334 Parameters 

5335 ---------- 

5336 mapper : dict-like or function 

5337 Dict-like or function transformations to apply to 

5338 that axis' values. Use either ``mapper`` and ``axis`` to 

5339 specify the axis to target with ``mapper``, or ``index`` and 

5340 ``columns``. 

5341 index : dict-like or function 

5342 Alternative to specifying axis (``mapper, axis=0`` 

5343 is equivalent to ``index=mapper``). 

5344 columns : dict-like or function 

5345 Alternative to specifying axis (``mapper, axis=1`` 

5346 is equivalent to ``columns=mapper``). 

5347 axis : {0 or 'index', 1 or 'columns'}, default 0 

5348 Axis to target with ``mapper``. Can be either the axis name 

5349 ('index', 'columns') or number (0, 1). The default is 'index'. 

5350 copy : bool, default True 

5351 Also copy underlying data. 

5352 inplace : bool, default False 

5353 Whether to modify the DataFrame rather than creating a new one. 

5354 If True, the value of ``copy`` is ignored. 

5355 level : int or level name, default None 

5356 In case of a MultiIndex, only rename labels in the specified 

5357 level. 

5358 errors : {'ignore', 'raise'}, default 'ignore' 

5359 If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`, 

5360 or `columns` contains labels that are not present in the Index 

5361 being transformed. 

5362 If 'ignore', existing keys will be renamed and extra keys will be 

5363 ignored. 

5364 

5365 Returns 

5366 ------- 

5367 DataFrame or None 

5368 DataFrame with the renamed axis labels or None if ``inplace=True``. 

5369 

5370 Raises 

5371 ------ 

5372 KeyError 

5373 If any of the labels is not found in the selected axis and 

5374 ``errors='raise'`` is used. 

5375 

5376 See Also 

5377 -------- 

5378 DataFrame.rename_axis : Set the name of the axis. 

5379 

5380 Examples 

5381 -------- 

5382 ``DataFrame.rename`` supports two calling conventions 

5383 

5384 * ``(index=index_mapper, columns=columns_mapper, ...)`` 

5385 * ``(mapper, axis={'index', 'columns'}, ...)`` 

5386 

5387 We *highly* recommend using keyword arguments to clarify your 

5388 intent. 

5389 

5390 Rename columns using a mapping: 

5391 

5392 >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) 

5393 >>> df.rename(columns={"A": "a", "B": "c"}) 

5394 a c 

5395 0 1 4 

5396 1 2 5 

5397 2 3 6 

5398 

5399 Rename index using a mapping: 

5400 

5401 >>> df.rename(index={0: "x", 1: "y", 2: "z"}) 

5402 A B 

5403 x 1 4 

5404 y 2 5 

5405 z 3 6 

5406 

5407 Cast index labels to a different type: 

5408 

5409 >>> df.index 

5410 RangeIndex(start=0, stop=3, step=1) 

5411 >>> df.rename(index=str).index 

5412 Index(['0', '1', '2'], dtype='object') 

5413 

5414 >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise") 

5415 Traceback (most recent call last): 

5416 KeyError: ['C'] not found in axis 

5417 

5418 Using axis-style parameters: 

5419 

5420 >>> df.rename(str.lower, axis='columns') 

5421 a b 

5422 0 1 4 

5423 1 2 5 

5424 2 3 6 

5425 

5426 >>> df.rename({1: 2, 2: 4}, axis='index') 

5427 A B 

5428 0 1 4 

5429 2 2 5 

5430 4 3 6 

5431 """ 

5432 return super()._rename( 

5433 mapper=mapper, 

5434 index=index, 

5435 columns=columns, 

5436 axis=axis, 

5437 copy=copy, 

5438 inplace=inplace, 

5439 level=level, 

5440 errors=errors, 

5441 ) 

5442 

5443 @overload 

5444 def fillna( 

5445 self, 

5446 value: Hashable | Mapping | Series | DataFrame = ..., 

5447 *, 

5448 method: FillnaOptions | None = ..., 

5449 axis: Axis | None = ..., 

5450 inplace: Literal[False] = ..., 

5451 limit: int | None = ..., 

5452 downcast: dict | None = ..., 

5453 ) -> DataFrame: 

5454 ... 

5455 

5456 @overload 

5457 def fillna( 

5458 self, 

5459 value: Hashable | Mapping | Series | DataFrame = ..., 

5460 *, 

5461 method: FillnaOptions | None = ..., 

5462 axis: Axis | None = ..., 

5463 inplace: Literal[True], 

5464 limit: int | None = ..., 

5465 downcast: dict | None = ..., 

5466 ) -> None: 

5467 ... 

5468 

5469 @overload 

5470 def fillna( 

5471 self, 

5472 value: Hashable | Mapping | Series | DataFrame = ..., 

5473 *, 

5474 method: FillnaOptions | None = ..., 

5475 axis: Axis | None = ..., 

5476 inplace: bool = ..., 

5477 limit: int | None = ..., 

5478 downcast: dict | None = ..., 

5479 ) -> DataFrame | None: 

5480 ... 

5481 

5482 @doc(NDFrame.fillna, **_shared_doc_kwargs) 

5483 def fillna( 

5484 self, 

5485 value: Hashable | Mapping | Series | DataFrame = None, 

5486 *, 

5487 method: FillnaOptions | None = None, 

5488 axis: Axis | None = None, 

5489 inplace: bool = False, 

5490 limit: int | None = None, 

5491 downcast: dict | None = None, 

5492 ) -> DataFrame | None: 

5493 return super().fillna( 

5494 value=value, 

5495 method=method, 

5496 axis=axis, 

5497 inplace=inplace, 

5498 limit=limit, 

5499 downcast=downcast, 

5500 ) 

5501 

5502 def pop(self, item: Hashable) -> Series: 

5503 """ 

5504 Return item and drop it from the frame. Raise KeyError if not found. 

5505 

5506 Parameters 

5507 ---------- 

5508 item : label 

5509 Label of column to be popped. 

5510 

5511 Returns 

5512 ------- 

5513 Series 

5514 

5515 Examples 

5516 -------- 

5517 >>> df = pd.DataFrame([('falcon', 'bird', 389.0), 

5518 ... ('parrot', 'bird', 24.0), 

5519 ... ('lion', 'mammal', 80.5), 

5520 ... ('monkey', 'mammal', np.nan)], 

5521 ... columns=('name', 'class', 'max_speed')) 

5522 >>> df 

5523 name class max_speed 

5524 0 falcon bird 389.0 

5525 1 parrot bird 24.0 

5526 2 lion mammal 80.5 

5527 3 monkey mammal NaN 

5528 

5529 >>> df.pop('class') 

5530 0 bird 

5531 1 bird 

5532 2 mammal 

5533 3 mammal 

5534 Name: class, dtype: object 

5535 

5536 >>> df 

5537 name max_speed 

5538 0 falcon 389.0 

5539 1 parrot 24.0 

5540 2 lion 80.5 

5541 3 monkey NaN 

5542 """ 

5543 return super().pop(item=item) 

5544 

5545 @overload 

5546 def replace( 

5547 self, 

5548 to_replace=..., 

5549 value=..., 

5550 *, 

5551 inplace: Literal[False] = ..., 

5552 limit: int | None = ..., 

5553 regex: bool = ..., 

5554 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ..., 

5555 ) -> DataFrame: 

5556 ... 

5557 

5558 @overload 

5559 def replace( 

5560 self, 

5561 to_replace=..., 

5562 value=..., 

5563 *, 

5564 inplace: Literal[True], 

5565 limit: int | None = ..., 

5566 regex: bool = ..., 

5567 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ..., 

5568 ) -> None: 

5569 ... 

5570 

5571 @doc(NDFrame.replace, **_shared_doc_kwargs) 

5572 def replace( 

5573 self, 

5574 to_replace=None, 

5575 value=lib.no_default, 

5576 *, 

5577 inplace: bool = False, 

5578 limit: int | None = None, 

5579 regex: bool = False, 

5580 method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = lib.no_default, 

5581 ) -> DataFrame | None: 

5582 return super().replace( 

5583 to_replace=to_replace, 

5584 value=value, 

5585 inplace=inplace, 

5586 limit=limit, 

5587 regex=regex, 

5588 method=method, 

5589 ) 

5590 

5591 def _replace_columnwise( 

5592 self, mapping: dict[Hashable, tuple[Any, Any]], inplace: bool, regex 

5593 ): 

5594 """ 

5595 Dispatch to Series.replace column-wise. 

5596 

5597 Parameters 

5598 ---------- 

5599 mapping : dict 

5600 of the form {col: (target, value)} 

5601 inplace : bool 

5602 regex : bool or same types as `to_replace` in DataFrame.replace 

5603 

5604 Returns 

5605 ------- 

5606 DataFrame or None 
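
Examples
--------
A minimal sketch of the expected ``mapping`` shape (replace 1 with 10,
but only in column "A"):

>>> df = pd.DataFrame({"A": [1, 2], "B": [1, 2]})
>>> df._replace_columnwise({"A": (1, 10)}, inplace=False, regex=False)
A B
0 10 1
1 2 2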

5607 """ 

5608 # Operate column-wise 

5609 res = self if inplace else self.copy(deep=None) 

5610 ax = self.columns 

5611 

5612 for i, ax_value in enumerate(ax): 

5613 if ax_value in mapping: 

5614 ser = self.iloc[:, i] 

5615 

5616 target, value = mapping[ax_value] 

5617 newobj = ser.replace(target, value, regex=regex) 

5618 

5619 res._iset_item(i, newobj) 

5620 

5621 if inplace: 

5622 return 

5623 return res.__finalize__(self) 

5624 

5625 @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"]) 

5626 def shift( 

5627 self, 

5628 periods: int = 1, 

5629 freq: Frequency | None = None, 

5630 axis: Axis = 0, 

5631 fill_value: Hashable = lib.no_default, 

5632 ) -> DataFrame: 

5633 axis = self._get_axis_number(axis) 

5634 

5635 ncols = len(self.columns) 

5636 if ( 

5637 axis == 1 

5638 and periods != 0 

5639 and freq is None 

5640 and fill_value is lib.no_default 

5641 and ncols > 0 

5642 ): 

5643 # We will infer fill_value to match the closest column 

5644 

5645 # Use a column that we know is valid for our column's dtype GH#38434 

5646 label = self.columns[0] 

5647 

5648 if periods > 0: 

5649 result = self.iloc[:, :-periods] 

5650 for col in range(min(ncols, abs(periods))): 

5651 # TODO(EA2D): doing this in a loop unnecessary with 2D EAs 

5652 # Define filler inside loop so we get a copy 
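# shifting a column by the full frame length produces an all-NA
# column whose missing-value marker (NaN/NaT/NA) fits that dtype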

5653 filler = self.iloc[:, 0].shift(len(self)) 

5654 result.insert(0, label, filler, allow_duplicates=True) 

5655 else: 

5656 result = self.iloc[:, -periods:] 

5657 for col in range(min(ncols, abs(periods))): 

5658 # Define filler inside loop so we get a copy 

5659 filler = self.iloc[:, -1].shift(len(self)) 

5660 result.insert( 

5661 len(result.columns), label, filler, allow_duplicates=True 

5662 ) 

5663 

5664 result.columns = self.columns.copy() 

5665 return result 

5666 elif ( 

5667 axis == 1 

5668 and periods != 0 

5669 and fill_value is not lib.no_default 

5670 and ncols > 0 

5671 ): 

5672 arrays = self._mgr.arrays 

5673 if len(arrays) > 1 or ( 

5674 # If we only have one block and we know that we can't 

5675 # keep the same dtype (i.e. the _can_hold_element check) 

5676 # then we can go through the reindex_indexer path 

5677 # (and avoid casting logic in the Block method). 

5678 not can_hold_element(arrays[0], fill_value) 

5679 ): 

5680 # GH#35488 we need to watch out for multi-block cases 

5681 # We only get here with fill_value not-lib.no_default 

5682 nper = abs(periods) 

5683 nper = min(nper, ncols) 
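# build a column indexer in which -1 marks the positions that
# reindex_indexer fills with fill_value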

5684 if periods > 0: 

5685 indexer = np.array( 

5686 [-1] * nper + list(range(ncols - periods)), dtype=np.intp 

5687 ) 

5688 else: 

5689 indexer = np.array( 

5690 list(range(nper, ncols)) + [-1] * nper, dtype=np.intp 

5691 ) 

5692 mgr = self._mgr.reindex_indexer( 

5693 self.columns, 

5694 indexer, 

5695 axis=0, 

5696 fill_value=fill_value, 

5697 allow_dups=True, 

5698 ) 

5699 res_df = self._constructor(mgr) 

5700 return res_df.__finalize__(self, method="shift") 

5701 

5702 return super().shift( 

5703 periods=periods, freq=freq, axis=axis, fill_value=fill_value 

5704 ) 

5705 

5706 @overload 

5707 def set_index( 

5708 self, 

5709 keys, 

5710 *, 

5711 drop: bool = ..., 

5712 append: bool = ..., 

5713 inplace: Literal[False] = ..., 

5714 verify_integrity: bool = ..., 

5715 ) -> DataFrame: 

5716 ... 

5717 

5718 @overload 

5719 def set_index( 

5720 self, 

5721 keys, 

5722 *, 

5723 drop: bool = ..., 

5724 append: bool = ..., 

5725 inplace: Literal[True], 

5726 verify_integrity: bool = ..., 

5727 ) -> None: 

5728 ... 

5729 

5730 def set_index( 

5731 self, 

5732 keys, 

5733 *, 

5734 drop: bool = True, 

5735 append: bool = False, 

5736 inplace: bool = False, 

5737 verify_integrity: bool = False, 

5738 ) -> DataFrame | None: 

5739 """ 

5740 Set the DataFrame index using existing columns. 

5741 

5742 Set the DataFrame index (row labels) using one or more existing 

5743 columns or arrays (of the correct length). The index can replace the 

5744 existing index or expand on it. 

5745 

5746 Parameters 

5747 ---------- 

5748 keys : label or array-like or list of labels/arrays 

5749 This parameter can be either a single column key, a single array of 

5750 the same length as the calling DataFrame, or a list containing an 

5751 arbitrary combination of column keys and arrays. Here, "array" 

5752 encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and 

5753 instances of :class:`~collections.abc.Iterator`. 

5754 drop : bool, default True 

5755 Delete columns to be used as the new index. 

5756 append : bool, default False 

5757 Whether to append columns to existing index. 

5758 inplace : bool, default False 

5759 Whether to modify the DataFrame rather than creating a new one. 

5760 verify_integrity : bool, default False 

5761 Check the new index for duplicates. Otherwise defer the check until 

5762 necessary. Setting to False will improve the performance of this 

5763 method. 

5764 

5765 Returns 

5766 ------- 

5767 DataFrame or None 

5768 Changed row labels or None if ``inplace=True``. 

5769 

5770 See Also 

5771 -------- 

5772 DataFrame.reset_index : Opposite of set_index. 

5773 DataFrame.reindex : Change to new indices or expand indices. 

5774 DataFrame.reindex_like : Change to same indices as other DataFrame. 

5775 

5776 Examples 

5777 -------- 

5778 >>> df = pd.DataFrame({'month': [1, 4, 7, 10], 

5779 ... 'year': [2012, 2014, 2013, 2014], 

5780 ... 'sale': [55, 40, 84, 31]}) 

5781 >>> df 

5782 month year sale 

5783 0 1 2012 55 

5784 1 4 2014 40 

5785 2 7 2013 84 

5786 3 10 2014 31 

5787 

5788 Set the index to become the 'month' column: 

5789 

5790 >>> df.set_index('month') 

5791 year sale 

5792 month 

5793 1 2012 55 

5794 4 2014 40 

5795 7 2013 84 

5796 10 2014 31 

5797 

5798 Create a MultiIndex using columns 'year' and 'month': 

5799 

5800 >>> df.set_index(['year', 'month']) 

5801 sale 

5802 year month 

5803 2012 1 55 

5804 2014 4 40 

5805 2013 7 84 

5806 2014 10 31 

5807 

5808 Create a MultiIndex using an Index and a column: 

5809 

5810 >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year']) 

5811 month sale 

5812 year 

5813 1 2012 1 55 

5814 2 2014 4 40 

5815 3 2013 7 84 

5816 4 2014 10 31 

5817 

5818 Create a MultiIndex using two Series: 

5819 

5820 >>> s = pd.Series([1, 2, 3, 4]) 

5821 >>> df.set_index([s, s**2]) 

5822 month year sale 

5823 1 1 1 2012 55 

5824 2 4 4 2014 40 

5825 3 9 7 2013 84 

5826 4 16 10 2014 31 
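
Keep the existing index and add 'month' as an extra level by passing
``append=True``; the result then has a two-level MultiIndex:

>>> df.set_index('month', append=True).index.nlevels
2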

5827 """ 

5828 inplace = validate_bool_kwarg(inplace, "inplace") 

5829 self._check_inplace_and_allows_duplicate_labels(inplace) 

5830 if not isinstance(keys, list): 

5831 keys = [keys] 

5832 

5833 err_msg = ( 

5834 'The parameter "keys" may be a column key, one-dimensional ' 

5835 "array, or a list containing only valid column keys and " 

5836 "one-dimensional arrays." 

5837 ) 

5838 

5839 missing: list[Hashable] = [] 

5840 for col in keys: 

5841 if isinstance(col, (Index, Series, np.ndarray, list, abc.Iterator)): 

5842 # arrays are fine as long as they are one-dimensional 

5843 # iterators get converted to list below 

5844 if getattr(col, "ndim", 1) != 1: 

5845 raise ValueError(err_msg) 

5846 else: 

5847 # everything else gets tried as a key; see GH 24969 

5848 try: 

5849 found = col in self.columns 

5850 except TypeError as err: 

5851 raise TypeError( 

5852 f"{err_msg}. Received column of type {type(col)}" 

5853 ) from err 

5854 else: 

5855 if not found: 

5856 missing.append(col) 

5857 

5858 if missing: 

5859 raise KeyError(f"None of {missing} are in the columns") 

5860 

5861 if inplace: 

5862 frame = self 

5863 else: 

5864 # GH 49473 Use "lazy copy" with Copy-on-Write 

5865 frame = self.copy(deep=None) 

5866 

5867 arrays = [] 

5868 names: list[Hashable] = [] 

5869 if append: 

5870 names = list(self.index.names) 

5871 if isinstance(self.index, MultiIndex): 

5872 for i in range(self.index.nlevels): 

5873 arrays.append(self.index._get_level_values(i)) 

5874 else: 

5875 arrays.append(self.index) 

5876 

5877 to_remove: list[Hashable] = [] 

5878 for col in keys: 

5879 if isinstance(col, MultiIndex): 

5880 for n in range(col.nlevels): 

5881 arrays.append(col._get_level_values(n)) 

5882 names.extend(col.names) 

5883 elif isinstance(col, (Index, Series)): 

5884 # if Index then not MultiIndex (treated above) 

5885 

5886 # error: Argument 1 to "append" of "list" has incompatible type 

5887 # "Union[Index, Series]"; expected "Index" 

5888 arrays.append(col) # type:ignore[arg-type] 

5889 names.append(col.name) 

5890 elif isinstance(col, (list, np.ndarray)): 

5891 # error: Argument 1 to "append" of "list" has incompatible type 

5892 # "Union[List[Any], ndarray]"; expected "Index" 

5893 arrays.append(col) # type: ignore[arg-type] 

5894 names.append(None) 

5895 elif isinstance(col, abc.Iterator): 

5896 # error: Argument 1 to "append" of "list" has incompatible type 

5897 # "List[Any]"; expected "Index" 

5898 arrays.append(list(col)) # type: ignore[arg-type] 

5899 names.append(None) 

5900 # from here, col can only be a column label 

5901 else: 

5902 arrays.append(frame[col]) 

5903 names.append(col) 

5904 if drop: 

5905 to_remove.append(col) 

5906 

5907 if len(arrays[-1]) != len(self): 

5908 # check newest element against length of calling frame, since 

5909 # ensure_index_from_sequences would not raise for append=False. 

5910 raise ValueError( 

5911 f"Length mismatch: Expected {len(self)} rows, " 

5912 f"received array of length {len(arrays[-1])}" 

5913 ) 

5914 

5915 index = ensure_index_from_sequences(arrays, names) 

5916 

5917 if verify_integrity and not index.is_unique: 

5918 duplicates = index[index.duplicated()].unique() 

5919 raise ValueError(f"Index has duplicate keys: {duplicates}") 

5920 

5921 # use set to handle duplicate column names gracefully in case of drop 

5922 for c in set(to_remove): 

5923 del frame[c] 

5924 

5925 # clear up memory usage 

5926 index._cleanup() 

5927 

5928 frame.index = index 

5929 

5930 if not inplace: 

5931 return frame 

5932 return None 

5933 

5934 @overload 

5935 def reset_index( 

5936 self, 

5937 level: IndexLabel = ..., 

5938 *, 

5939 drop: bool = ..., 

5940 inplace: Literal[False] = ..., 

5941 col_level: Hashable = ..., 

5942 col_fill: Hashable = ..., 

5943 allow_duplicates: bool | lib.NoDefault = ..., 

5944 names: Hashable | Sequence[Hashable] = None, 

5945 ) -> DataFrame: 

5946 ... 

5947 

5948 @overload 

5949 def reset_index( 

5950 self, 

5951 level: IndexLabel = ..., 

5952 *, 

5953 drop: bool = ..., 

5954 inplace: Literal[True], 

5955 col_level: Hashable = ..., 

5956 col_fill: Hashable = ..., 

5957 allow_duplicates: bool | lib.NoDefault = ..., 

5958 names: Hashable | Sequence[Hashable] = None, 

5959 ) -> None: 

5960 ... 

5961 

5962 @overload 

5963 def reset_index( 

5964 self, 

5965 level: IndexLabel = ..., 

5966 *, 

5967 drop: bool = ..., 

5968 inplace: bool = ..., 

5969 col_level: Hashable = ..., 

5970 col_fill: Hashable = ..., 

5971 allow_duplicates: bool | lib.NoDefault = ..., 

5972 names: Hashable | Sequence[Hashable] = None, 

5973 ) -> DataFrame | None: 

5974 ... 

5975 

5976 def reset_index( 

5977 self, 

5978 level: IndexLabel = None, 

5979 *, 

5980 drop: bool = False, 

5981 inplace: bool = False, 

5982 col_level: Hashable = 0, 

5983 col_fill: Hashable = "", 

5984 allow_duplicates: bool | lib.NoDefault = lib.no_default, 

5985 names: Hashable | Sequence[Hashable] = None, 

5986 ) -> DataFrame | None: 

5987 """ 

5988 Reset the index, or a level of it. 

5989 

5990 Reset the index of the DataFrame, and use the default one instead. 

5991 If the DataFrame has a MultiIndex, this method can remove one or more 

5992 levels. 

5993 

5994 Parameters 

5995 ---------- 

5996 level : int, str, tuple, or list, default None 

5997 Only remove the given levels from the index. Removes all levels by 

5998 default. 

5999 drop : bool, default False 

6000 Do not try to insert index into dataframe columns. This resets 

6001 the index to the default integer index. 

6002 inplace : bool, default False 

6003 Whether to modify the DataFrame rather than creating a new one. 

6004 col_level : int or str, default 0 

6005 If the columns have multiple levels, determines which level the 

6006 labels are inserted into. By default it is inserted into the first 

6007 level. 

6008 col_fill : object, default '' 

6009 If the columns have multiple levels, determines how the other 

6010 levels are named. If None then the index name is repeated. 

6011 allow_duplicates : bool, optional, default lib.no_default 

6012 Allow duplicate column labels to be created. 

6013 

6014 .. versionadded:: 1.5.0 

6015 

6016 names : int, str or 1-dimensional list, default None 

6017 Using the given string, rename the DataFrame column which contains the 

6018 index data. If the DataFrame has a MultiIndex, this has to be a list or 

6019 tuple with length equal to the number of levels. 

6020 

6021 .. versionadded:: 1.5.0 

6022 

6023 Returns 

6024 ------- 

6025 DataFrame or None 

6026 DataFrame with the new index or None if ``inplace=True``. 

6027 

6028 See Also 

6029 -------- 

6030 DataFrame.set_index : Opposite of reset_index. 

6031 DataFrame.reindex : Change to new indices or expand indices. 

6032 DataFrame.reindex_like : Change to same indices as other DataFrame. 

6033 

6034 Examples 

6035 -------- 

6036 >>> df = pd.DataFrame([('bird', 389.0), 

6037 ... ('bird', 24.0), 

6038 ... ('mammal', 80.5), 

6039 ... ('mammal', np.nan)], 

6040 ... index=['falcon', 'parrot', 'lion', 'monkey'], 

6041 ... columns=('class', 'max_speed')) 

6042 >>> df 

6043 class max_speed 

6044 falcon bird 389.0 

6045 parrot bird 24.0 

6046 lion mammal 80.5 

6047 monkey mammal NaN 

6048 

6049 When we reset the index, the old index is added as a column, and a 

6050 new sequential index is used: 

6051 

6052 >>> df.reset_index() 

6053 index class max_speed 

6054 0 falcon bird 389.0 

6055 1 parrot bird 24.0 

6056 2 lion mammal 80.5 

6057 3 monkey mammal NaN 

6058 

6059 We can use the `drop` parameter to avoid the old index being added as 

6060 a column: 

6061 

6062 >>> df.reset_index(drop=True) 

6063 class max_speed 

6064 0 bird 389.0 

6065 1 bird 24.0 

6066 2 mammal 80.5 

6067 3 mammal NaN 

6068 

6069 You can also use `reset_index` with `MultiIndex`. 

6070 

6071 >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'), 

6072 ... ('bird', 'parrot'), 

6073 ... ('mammal', 'lion'), 

6074 ... ('mammal', 'monkey')], 

6075 ... names=['class', 'name']) 

6076 >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'), 

6077 ... ('species', 'type')]) 

6078 >>> df = pd.DataFrame([(389.0, 'fly'), 

6079 ... (24.0, 'fly'), 

6080 ... (80.5, 'run'), 

6081 ... (np.nan, 'jump')], 

6082 ... index=index, 

6083 ... columns=columns) 

6084 >>> df 

6085 speed species 

6086 max type 

6087 class name 

6088 bird falcon 389.0 fly 

6089 parrot 24.0 fly 

6090 mammal lion 80.5 run 

6091 monkey NaN jump 

6092 

6093 Using the `names` parameter, choose a name for the index column: 

6094 

6095 >>> df.reset_index(names=['classes', 'names']) 

6096 classes names speed species 

6097 max type 

6098 0 bird falcon 389.0 fly 

6099 1 bird parrot 24.0 fly 

6100 2 mammal lion 80.5 run 

6101 3 mammal monkey NaN jump 

6102 

6103 If the index has multiple levels, we can reset a subset of them: 

6104 

6105 >>> df.reset_index(level='class') 

6106 class speed species 

6107 max type 

6108 name 

6109 falcon bird 389.0 fly 

6110 parrot bird 24.0 fly 

6111 lion mammal 80.5 run 

6112 monkey mammal NaN jump 

6113 

6114 If we are not dropping the index, by default, it is placed in the top 

6115 level. We can place it in another level: 

6116 

6117 >>> df.reset_index(level='class', col_level=1) 

6118 speed species 

6119 class max type 

6120 name 

6121 falcon bird 389.0 fly 

6122 parrot bird 24.0 fly 

6123 lion mammal 80.5 run 

6124 monkey mammal NaN jump 

6125 

6126 When the index is inserted under another level, we can specify under 

6127 which one with the parameter `col_fill`: 

6128 

6129 >>> df.reset_index(level='class', col_level=1, col_fill='species') 

6130 species speed species 

6131 class max type 

6132 name 

6133 falcon bird 389.0 fly 

6134 parrot bird 24.0 fly 

6135 lion mammal 80.5 run 

6136 monkey mammal NaN jump 

6137 

6138 If we specify a nonexistent level for `col_fill`, it is created: 

6139 

6140 >>> df.reset_index(level='class', col_level=1, col_fill='genus') 

6141 genus speed species 

6142 class max type 

6143 name 

6144 falcon bird 389.0 fly 

6145 parrot bird 24.0 fly 

6146 lion mammal 80.5 run 

6147 monkey mammal NaN jump 

6148 """ 

6149 inplace = validate_bool_kwarg(inplace, "inplace") 

6150 self._check_inplace_and_allows_duplicate_labels(inplace) 

6151 if inplace: 

6152 new_obj = self 

6153 else: 

6154 new_obj = self.copy(deep=None) 

6155 if allow_duplicates is not lib.no_default: 

6156 allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates") 

6157 

6158 new_index = default_index(len(new_obj)) 

6159 if level is not None: 

6160 if not isinstance(level, (tuple, list)): 

6161 level = [level] 

6162 level = [self.index._get_level_number(lev) for lev in level] 

6163 if len(level) < self.index.nlevels: 

6164 new_index = self.index.droplevel(level) 

6165 

6166 if not drop: 

6167 to_insert: Iterable[tuple[Any, Any | None]] 

6168 

6169 default = "index" if "index" not in self else "level_0" 

6170 names = self.index._get_default_index_names(names, default) 

6171 

6172 if isinstance(self.index, MultiIndex): 

6173 to_insert = zip(self.index.levels, self.index.codes) 

6174 else: 

6175 to_insert = ((self.index, None),) 

6176 

6177 multi_col = isinstance(self.columns, MultiIndex) 
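# iterate in reverse: every level is inserted at position 0, so
# reversing preserves the original level order in the columns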

6178 for i, (lev, lab) in reversed(list(enumerate(to_insert))): 

6179 if level is not None and i not in level: 

6180 continue 

6181 name = names[i] 

6182 if multi_col: 

6183 col_name = list(name) if isinstance(name, tuple) else [name] 

6184 if col_fill is None: 

6185 if len(col_name) not in (1, self.columns.nlevels): 

6186 raise ValueError( 

6187 "col_fill=None is incompatible " 

6188 f"with incomplete column name {name}" 

6189 ) 

6190 col_fill = col_name[0] 

6191 

6192 lev_num = self.columns._get_level_number(col_level) 

6193 name_lst = [col_fill] * lev_num + col_name 

6194 missing = self.columns.nlevels - len(name_lst) 

6195 name_lst += [col_fill] * missing 

6196 name = tuple(name_lst) 

6197 

6198 # to ndarray and maybe infer different dtype 

6199 level_values = lev._values 

6200 if level_values.dtype == np.object_: 

6201 level_values = lib.maybe_convert_objects(level_values) 

6202 

6203 if lab is not None: 

6204 # if we have the codes, extract the values with a mask 

6205 level_values = algorithms.take( 

6206 level_values, lab, allow_fill=True, fill_value=lev._na_value 

6207 ) 

6208 

6209 new_obj.insert( 

6210 0, 

6211 name, 

6212 level_values, 

6213 allow_duplicates=allow_duplicates, 

6214 ) 

6215 

6216 new_obj.index = new_index 

6217 if not inplace: 

6218 return new_obj 

6219 

6220 return None 

6221 

6222 # ---------------------------------------------------------------------- 

6223 # Reindex-based selection methods 

6224 

6225 @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) 

6226 def isna(self) -> DataFrame: 

6227 result = self._constructor(self._mgr.isna(func=isna)) 

6228 return result.__finalize__(self, method="isna") 

6229 

6230 @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"]) 

6231 def isnull(self) -> DataFrame: 

6232 """ 

6233 DataFrame.isnull is an alias for DataFrame.isna. 

6234 """ 

6235 return self.isna() 

6236 

6237 @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) 

6238 def notna(self) -> DataFrame: 

6239 return ~self.isna() 

6240 

6241 @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"]) 

6242 def notnull(self) -> DataFrame: 

6243 """ 

6244 DataFrame.notnull is an alias for DataFrame.notna. 

6245 """ 

6246 return ~self.isna() 

6247 

6248 @overload 

6249 def dropna( 

6250 self, 

6251 *, 

6252 axis: Axis = ..., 

6253 how: AnyAll | NoDefault = ..., 

6254 thresh: int | NoDefault = ..., 

6255 subset: IndexLabel = ..., 

6256 inplace: Literal[False] = ..., 

6257 ignore_index: bool = ..., 

6258 ) -> DataFrame: 

6259 ... 

6260 

6261 @overload 

6262 def dropna( 

6263 self, 

6264 *, 

6265 axis: Axis = ..., 

6266 how: AnyAll | NoDefault = ..., 

6267 thresh: int | NoDefault = ..., 

6268 subset: IndexLabel = ..., 

6269 inplace: Literal[True], 

6270 ignore_index: bool = ..., 

6271 ) -> None: 

6272 ... 

6273 

6274 def dropna( 

6275 self, 

6276 *, 

6277 axis: Axis = 0, 

6278 how: AnyAll | NoDefault = no_default, 

6279 thresh: int | NoDefault = no_default, 

6280 subset: IndexLabel = None, 

6281 inplace: bool = False, 

6282 ignore_index: bool = False, 

6283 ) -> DataFrame | None: 

6284 """ 

6285 Remove missing values. 

6286 

6287 See the :ref:`User Guide <missing_data>` for more on which values are 

6288 considered missing, and how to work with missing data. 

6289 

6290 Parameters 

6291 ---------- 

6292 axis : {0 or 'index', 1 or 'columns'}, default 0 

6293 Determine if rows or columns which contain missing values are 

6294 removed. 

6295 

6296 * 0, or 'index' : Drop rows which contain missing values. 

6297 * 1, or 'columns' : Drop columns which contain missing values. 

6298 

6299 Passing a tuple or list to drop on multiple axes is not supported; 

6300 only a single axis is allowed. 

6301 

6302 how : {'any', 'all'}, default 'any' 

6303 Determine whether a row or column is removed from the DataFrame 

6304 when it has at least one NA or all NA values. 

6305 

6306 * 'any' : If any NA values are present, drop that row or column. 

6307 * 'all' : If all values are NA, drop that row or column. 

6308 

6309 thresh : int, optional 

6310 Require that many non-NA values. Cannot be combined with ``how``. 

6311 subset : column label or sequence of labels, optional 

6312 Labels along other axis to consider, e.g. if you are dropping rows 

6313 these would be a list of columns to include. 

6314 inplace : bool, default False 

6315 Whether to modify the DataFrame rather than creating a new one. 

6316 ignore_index : bool, default ``False`` 

6317 If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. 

6318 

6319 .. versionadded:: 2.0.0 

6320 

6321 Returns 

6322 ------- 

6323 DataFrame or None 

6324 DataFrame with NA entries dropped from it or None if ``inplace=True``. 

6325 

6326 See Also 

6327 -------- 

6328 DataFrame.isna: Indicate missing values. 

6329 DataFrame.notna : Indicate existing (non-missing) values. 

6330 DataFrame.fillna : Replace missing values. 

6331 Series.dropna : Drop missing values. 

6332 Index.dropna : Drop missing indices. 

6333 

6334 Examples 

6335 -------- 

6336 >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'], 

6337 ... "toy": [np.nan, 'Batmobile', 'Bullwhip'], 

6338 ... "born": [pd.NaT, pd.Timestamp("1940-04-25"), 

6339 ... pd.NaT]}) 

6340 >>> df 

6341 name toy born 

6342 0 Alfred NaN NaT 

6343 1 Batman Batmobile 1940-04-25 

6344 2 Catwoman Bullwhip NaT 

6345 

6346 Drop the rows where at least one element is missing. 

6347 

6348 >>> df.dropna() 

6349 name toy born 

6350 1 Batman Batmobile 1940-04-25 

6351 

6352 Drop the columns where at least one element is missing. 

6353 

6354 >>> df.dropna(axis='columns') 

6355 name 

6356 0 Alfred 

6357 1 Batman 

6358 2 Catwoman 

6359 

6360 Drop the rows where all elements are missing. 

6361 

6362 >>> df.dropna(how='all') 

6363 name toy born 

6364 0 Alfred NaN NaT 

6365 1 Batman Batmobile 1940-04-25 

6366 2 Catwoman Bullwhip NaT 

6367 

6368 Keep only the rows with at least 2 non-NA values. 

6369 

6370 >>> df.dropna(thresh=2) 

6371 name toy born 

6372 1 Batman Batmobile 1940-04-25 

6373 2 Catwoman Bullwhip NaT 

6374 

6375 Define in which columns to look for missing values. 

6376 

6377 >>> df.dropna(subset=['name', 'toy']) 

6378 name toy born 

6379 1 Batman Batmobile 1940-04-25 

6380 2 Catwoman Bullwhip NaT 
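
With ``ignore_index=True``, the surviving rows are relabeled 0, 1, ..., n - 1:

>>> df.dropna(subset=['name', 'toy'], ignore_index=True)
name toy born
0 Batman Batmobile 1940-04-25
1 Catwoman Bullwhip NaT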

6381 """ 

6382 if (how is not no_default) and (thresh is not no_default): 

6383 raise TypeError( 

6384 "You cannot set both the how and thresh arguments at the same time." 

6385 ) 

6386 

6387 if how is no_default: 

6388 how = "any" 

6389 

6390 inplace = validate_bool_kwarg(inplace, "inplace") 

6391 if isinstance(axis, (tuple, list)): 

6392 # GH20987 

6393 raise TypeError("supplying multiple axes to axis is no longer supported.") 

6394 

6395 axis = self._get_axis_number(axis) 

6396 agg_axis = 1 - axis 

6397 

6398 agg_obj = self 

6399 if subset is not None: 

6400 # subset needs to be list 

6401 if not is_list_like(subset): 

6402 subset = [subset] 

6403 ax = self._get_axis(agg_axis) 

6404 indices = ax.get_indexer_for(subset) 
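# get_indexer_for returns -1 for labels missing from the axis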

6405 check = indices == -1 

6406 if check.any(): 

6407 raise KeyError(np.array(subset)[check].tolist()) 

6408 agg_obj = self.take(indices, axis=agg_axis) 

6409 

6410 if thresh is not no_default: 

6411 count = agg_obj.count(axis=agg_axis) 

6412 mask = count >= thresh 

6413 elif how == "any": 

6414 # faster equivalent to 'agg_obj.count(agg_axis) == self.shape[agg_axis]' 

6415 mask = notna(agg_obj).all(axis=agg_axis, bool_only=False) 

6416 elif how == "all": 

6417 # faster equivalent to 'agg_obj.count(agg_axis) > 0' 

6418 mask = notna(agg_obj).any(axis=agg_axis, bool_only=False) 

6419 else: 

6420 raise ValueError(f"invalid how option: {how}") 

6421 

6422 if np.all(mask): 

6423 result = self.copy(deep=None) 

6424 else: 

6425 result = self.loc(axis=axis)[mask] 

6426 

6427 if ignore_index: 

6428 result.index = default_index(len(result)) 

6429 

6430 if not inplace: 

6431 return result 

6432 self._update_inplace(result) 

6433 return None 

6434 

6435 def drop_duplicates( 

6436 self, 

6437 subset: Hashable | Sequence[Hashable] | None = None, 

6438 *, 

6439 keep: DropKeep = "first", 

6440 inplace: bool = False, 

6441 ignore_index: bool = False, 

6442 ) -> DataFrame | None: 

6443 """ 

6444 Return DataFrame with duplicate rows removed. 

6445 

6446 Considering certain columns is optional. Indexes, including time indexes, 

6447 are ignored. 

6448 

6449 Parameters 

6450 ---------- 

6451 subset : column label or sequence of labels, optional 

6452 Only consider certain columns for identifying duplicates; by 

6453 default, all of the columns are used. 

6454 keep : {'first', 'last', ``False``}, default 'first' 

6455 Determines which duplicates (if any) to keep. 

6456 

6457 - 'first' : Drop duplicates except for the first occurrence. 

6458 - 'last' : Drop duplicates except for the last occurrence. 

6459 - ``False`` : Drop all duplicates. 

6460 

6461 inplace : bool, default ``False`` 

6462 Whether to modify the DataFrame rather than creating a new one. 

6463 ignore_index : bool, default ``False`` 

6464 If ``True``, the resulting axis will be labeled 0, 1, …, n - 1. 

6465 

6466 Returns 

6467 ------- 

6468 DataFrame or None 

6469 DataFrame with duplicates removed or None if ``inplace=True``. 

6470 

6471 See Also 

6472 -------- 

6473 DataFrame.value_counts: Count unique combinations of columns. 

6474 

6475 Examples 

6476 -------- 

6477 Consider a dataset containing ramen ratings. 

6478 

6479 >>> df = pd.DataFrame({ 

6480 ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], 

6481 ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], 

6482 ... 'rating': [4, 4, 3.5, 15, 5] 

6483 ... }) 

6484 >>> df 

6485 brand style rating 

6486 0 Yum Yum cup 4.0 

6487 1 Yum Yum cup 4.0 

6488 2 Indomie cup 3.5 

6489 3 Indomie pack 15.0 

6490 4 Indomie pack 5.0 

6491 

6492 By default, it removes duplicate rows based on all columns. 

6493 

6494 >>> df.drop_duplicates() 

6495 brand style rating 

6496 0 Yum Yum cup 4.0 

6497 2 Indomie cup 3.5 

6498 3 Indomie pack 15.0 

6499 4 Indomie pack 5.0 

6500 

6501 To remove duplicates on specific column(s), use ``subset``. 

6502 

6503 >>> df.drop_duplicates(subset=['brand']) 

6504 brand style rating 

6505 0 Yum Yum cup 4.0 

6506 2 Indomie cup 3.5 

6507 

6508 To remove duplicates and keep last occurrences, use ``keep``. 

6509 

6510 >>> df.drop_duplicates(subset=['brand', 'style'], keep='last') 

6511 brand style rating 

6512 1 Yum Yum cup 4.0 

6513 2 Indomie cup 3.5 

6514 4 Indomie pack 5.0 
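
To drop every row that has a duplicate, pass ``keep=False``:

>>> df.drop_duplicates(subset=['brand', 'style'], keep=False)
brand style rating
2 Indomie cup 3.5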

6515 """ 

6516 if self.empty: 

6517 return self.copy(deep=None) 

6518 

6519 inplace = validate_bool_kwarg(inplace, "inplace") 

6520 ignore_index = validate_bool_kwarg(ignore_index, "ignore_index") 

6521 

6522 result = self[-self.duplicated(subset, keep=keep)] 

6523 if ignore_index: 

6524 result.index = default_index(len(result)) 

6525 

6526 if inplace: 

6527 self._update_inplace(result) 

6528 return None 

6529 else: 

6530 return result 

6531 

6532 def duplicated( 

6533 self, 

6534 subset: Hashable | Sequence[Hashable] | None = None, 

6535 keep: DropKeep = "first", 

6536 ) -> Series: 

6537 """ 

6538 Return boolean Series denoting duplicate rows. 

6539 

6540 Considering certain columns is optional. 

6541 

6542 Parameters 

6543 ---------- 

6544 subset : column label or sequence of labels, optional 

6545 Only consider certain columns for identifying duplicates; by 

6546 default, all of the columns are used. 

6547 keep : {'first', 'last', False}, default 'first' 

6548 Determines which duplicates (if any) to mark. 

6549 

6550 - ``first`` : Mark duplicates as ``True`` except for the first occurrence. 

6551 - ``last`` : Mark duplicates as ``True`` except for the last occurrence. 

6552 - False : Mark all duplicates as ``True``. 

6553 

6554 Returns 

6555 ------- 

6556 Series 

6557 Boolean series indicating which rows are duplicated. 

6558 

6559 See Also 

6560 -------- 

6561 Index.duplicated : Equivalent method on index. 

6562 Series.duplicated : Equivalent method on Series. 

6563 Series.drop_duplicates : Remove duplicate values from Series. 

6564 DataFrame.drop_duplicates : Remove duplicate values from DataFrame. 

6565 

6566 Examples 

6567 -------- 

6568 Consider a dataset containing ramen ratings. 

6569 

6570 >>> df = pd.DataFrame({ 

6571 ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'], 

6572 ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'], 

6573 ... 'rating': [4, 4, 3.5, 15, 5] 

6574 ... }) 

6575 >>> df 

6576 brand style rating 

6577 0 Yum Yum cup 4.0 

6578 1 Yum Yum cup 4.0 

6579 2 Indomie cup 3.5 

6580 3 Indomie pack 15.0 

6581 4 Indomie pack 5.0 

6582 

6583 By default, for each set of duplicated values, the first occurrence 

6584 is set to False and all others to True. 

6585 

6586 >>> df.duplicated() 

6587 0 False 

6588 1 True 

6589 2 False 

6590 3 False 

6591 4 False 

6592 dtype: bool 

6593 

6594 By using 'last', the last occurrence of each set of duplicated values 

6595 is set to False and all others to True. 

6596 

6597 >>> df.duplicated(keep='last') 

6598 0 True 

6599 1 False 

6600 2 False 

6601 3 False 

6602 4 False 

6603 dtype: bool 

6604 

6605 By setting ``keep`` to ``False``, all duplicates are marked ``True``. 

6606 

6607 >>> df.duplicated(keep=False) 

6608 0 True 

6609 1 True 

6610 2 False 

6611 3 False 

6612 4 False 

6613 dtype: bool 

6614 

6615 To find duplicates on specific column(s), use ``subset``. 

6616 

6617 >>> df.duplicated(subset=['brand']) 

6618 0 False 

6619 1 True 

6620 2 False 

6621 3 True 

6622 4 True 

6623 dtype: bool 

6624 """ 

6625 

6626 if self.empty: 

6627 return self._constructor_sliced(dtype=bool) 

6628 

6629 def f(vals) -> tuple[np.ndarray, int]: 

6630 labels, shape = algorithms.factorize(vals, size_hint=len(self)) 

6631 return labels.astype("i8", copy=False), len(shape) 

6632 

6633 if subset is None: 

6634 # https://github.com/pandas-dev/pandas/issues/28770 

6635 # Incompatible types in assignment (expression has type "Index", variable 

6636 # has type "Sequence[Any]") 

6637 subset = self.columns # type: ignore[assignment] 
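# a single label (scalar, string, or a tuple that is itself a column
# key) gets wrapped in a 1-tuple below; note that ``and`` binds more
# tightly than ``or`` in the condition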

6638 elif ( 

6639 not np.iterable(subset) 

6640 or isinstance(subset, str) 

6641 or isinstance(subset, tuple) 

6642 and subset in self.columns 

6643 ): 

6644 subset = (subset,) 

6645 

6646 # needed for mypy since can't narrow types using np.iterable 

6647 subset = cast(Sequence, subset) 

6648 

6649 # Verify all columns in subset exist in the queried dataframe 

6650 # Otherwise, raise a KeyError, same as if you try to __getitem__ with a 

6651 # key that doesn't exist. 

6652 diff = set(subset) - set(self.columns) 

6653 if diff: 

6654 raise KeyError(Index(diff)) 

6655 

6656 if len(subset) == 1 and self.columns.is_unique: 

6657 # GH#45236 This is faster than get_group_index below 

6658 result = self[subset[0]].duplicated(keep) 

6659 result.name = None 

6660 else: 

6661 vals = (col.values for name, col in self.items() if name in subset) 

6662 labels, shape = map(list, zip(*map(f, vals))) 
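# combine the per-column factorized codes into a single integer id
# per row; row-level duplicated() then reduces to 1-D duplicated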

6663 

6664 ids = get_group_index( 

6665 labels, 

6666 # error: Argument 1 to "tuple" has incompatible type "List[_T]"; 

6667 # expected "Iterable[int]" 

6668 tuple(shape), # type: ignore[arg-type] 

6669 sort=False, 

6670 xnull=False, 

6671 ) 

6672 result = self._constructor_sliced(duplicated(ids, keep), index=self.index) 

6673 return result.__finalize__(self, method="duplicated") 

6674 

6675 # ---------------------------------------------------------------------- 

6676 # Sorting 

6677 # error: Signature of "sort_values" incompatible with supertype "NDFrame" 

6678 @overload # type: ignore[override] 

6679 def sort_values( 

6680 self, 

6681 by: IndexLabel, 

6682 *, 

6683 axis: Axis = ..., 

6684 ascending=..., 

6685 inplace: Literal[False] = ..., 

6686 kind: str = ..., 

6687 na_position: str = ..., 

6688 ignore_index: bool = ..., 

6689 key: ValueKeyFunc = ..., 

6690 ) -> DataFrame: 

6691 ... 

6692 

6693 @overload 

6694 def sort_values( 

6695 self, 

6696 by: IndexLabel, 

6697 *, 

6698 axis: Axis = ..., 

6699 ascending=..., 

6700 inplace: Literal[True], 

6701 kind: str = ..., 

6702 na_position: str = ..., 

6703 ignore_index: bool = ..., 

6704 key: ValueKeyFunc = ..., 

6705 ) -> None: 

6706 ... 

6707 

6708 # TODO: Just move the sort_values doc here. 

6709 @Substitution(**_shared_doc_kwargs) 

6710 @Appender(NDFrame.sort_values.__doc__) 

6711 def sort_values( 

6712 self, 

6713 by: IndexLabel, 

6714 *, 

6715 axis: Axis = 0, 

6716 ascending: bool | list[bool] | tuple[bool, ...] = True, 

6717 inplace: bool = False, 

6718 kind: str = "quicksort", 

6719 na_position: str = "last", 

6720 ignore_index: bool = False, 

6721 key: ValueKeyFunc = None, 

6722 ) -> DataFrame | None: 

6723 inplace = validate_bool_kwarg(inplace, "inplace") 

6724 axis = self._get_axis_number(axis) 

6725 ascending = validate_ascending(ascending) 

6726 if not isinstance(by, list): 

6727 by = [by] 

6728 # error: Argument 1 to "len" has incompatible type "Union[bool, List[bool]]"; 

6729 # expected "Sized" 

6730 if is_sequence(ascending) and ( 

6731 len(by) != len(ascending) # type: ignore[arg-type] 

6732 ): 

6733 # error: Argument 1 to "len" has incompatible type "Union[bool, 

6734 # List[bool]]"; expected "Sized" 

6735 raise ValueError( 

6736 f"Length of ascending ({len(ascending)})" # type: ignore[arg-type] 

6737 f" != length of by ({len(by)})" 

6738 ) 

6739 if len(by) > 1: 

6740 keys = [self._get_label_or_level_values(x, axis=axis) for x in by] 

6741 

6742 # need to rewrap columns in Series to apply key function 

6743 if key is not None: 

6744 # error: List comprehension has incompatible type List[Series]; 

6745 # expected List[ndarray] 

6746 keys = [ 

6747 Series(k, name=name) # type: ignore[misc] 

6748 for (k, name) in zip(keys, by) 

6749 ] 

6750 

6751 indexer = lexsort_indexer( 

6752 keys, orders=ascending, na_position=na_position, key=key 

6753 ) 

6754 elif len(by): 

6755 # len(by) == 1 

6756 

6757 by = by[0] 

6758 k = self._get_label_or_level_values(by, axis=axis) 

6759 

6760 # need to rewrap column in Series to apply key function 

6761 if key is not None: 

6762 # error: Incompatible types in assignment (expression has type 

6763 # "Series", variable has type "ndarray") 

6764 k = Series(k, name=by) # type: ignore[assignment] 

6765 

6766 if isinstance(ascending, (tuple, list)): 

6767 ascending = ascending[0] 

6768 

6769 indexer = nargsort( 

6770 k, kind=kind, ascending=ascending, na_position=na_position, key=key 

6771 ) 

6772 else: 

6773 if inplace: 

6774 return self._update_inplace(self) 

6775 else: 

6776 return self.copy(deep=None) 

6777 
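# fast path: the indexer is already 0..n-1, i.e. no reordering is
# needed, so copy (or reuse lazily under copy-on-write) instead of
# taking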

6778 if is_range_indexer(indexer, len(indexer)): 

6779 result = self.copy(deep=(not inplace and not using_copy_on_write())) 

6780 if ignore_index: 

6781 result.index = default_index(len(result)) 

6782 

6783 if inplace: 

6784 return self._update_inplace(result) 

6785 else: 

6786 return result 

6787 

6788 new_data = self._mgr.take( 

6789 indexer, axis=self._get_block_manager_axis(axis), verify=False 

6790 ) 

6791 

6792 if ignore_index: 

6793 new_data.set_axis( 

6794 self._get_block_manager_axis(axis), default_index(len(indexer)) 

6795 ) 

6796 

6797 result = self._constructor(new_data) 

6798 if inplace: 

6799 return self._update_inplace(result) 

6800 else: 

6801 return result.__finalize__(self, method="sort_values") 

6802 

6803 @overload 

6804 def sort_index( 

6805 self, 

6806 *, 

6807 axis: Axis = ..., 

6808 level: IndexLabel = ..., 

6809 ascending: bool | Sequence[bool] = ..., 

6810 inplace: Literal[True], 

6811 kind: SortKind = ..., 

6812 na_position: NaPosition = ..., 

6813 sort_remaining: bool = ..., 

6814 ignore_index: bool = ..., 

6815 key: IndexKeyFunc = ..., 

6816 ) -> None: 

6817 ... 

6818 

6819 @overload 

6820 def sort_index( 

6821 self, 

6822 *, 

6823 axis: Axis = ..., 

6824 level: IndexLabel = ..., 

6825 ascending: bool | Sequence[bool] = ..., 

6826 inplace: Literal[False] = ..., 

6827 kind: SortKind = ..., 

6828 na_position: NaPosition = ..., 

6829 sort_remaining: bool = ..., 

6830 ignore_index: bool = ..., 

6831 key: IndexKeyFunc = ..., 

6832 ) -> DataFrame: 

6833 ... 

6834 

6835 @overload 

6836 def sort_index( 

6837 self, 

6838 *, 

6839 axis: Axis = ..., 

6840 level: IndexLabel = ..., 

6841 ascending: bool | Sequence[bool] = ..., 

6842 inplace: bool = ..., 

6843 kind: SortKind = ..., 

6844 na_position: NaPosition = ..., 

6845 sort_remaining: bool = ..., 

6846 ignore_index: bool = ..., 

6847 key: IndexKeyFunc = ..., 

6848 ) -> DataFrame | None: 

6849 ... 

6850 

6851 def sort_index( 

6852 self, 

6853 *, 

6854 axis: Axis = 0, 

6855 level: IndexLabel = None, 

6856 ascending: bool | Sequence[bool] = True, 

6857 inplace: bool = False, 

6858 kind: SortKind = "quicksort", 

6859 na_position: NaPosition = "last", 

6860 sort_remaining: bool = True, 

6861 ignore_index: bool = False, 

6862 key: IndexKeyFunc = None, 

6863 ) -> DataFrame | None: 

6864 """ 

6865 Sort object by labels (along an axis). 

6866 

6867 Returns a new DataFrame sorted by label if `inplace` argument is 

6868 ``False``, otherwise updates the original DataFrame and returns None. 

6869 

6870 Parameters 

6871 ---------- 

6872 axis : {0 or 'index', 1 or 'columns'}, default 0 

6873 The axis along which to sort. The value 0 identifies the rows, 

6874 and 1 identifies the columns. 

6875 level : int or level name or list of ints or list of level names 

6876 If not None, sort on values in specified index level(s). 

6877 ascending : bool or list-like of bools, default True 

6878 Sort ascending vs. descending. When the index is a MultiIndex the 

6879 sort direction can be controlled for each level individually. 

6880 inplace : bool, default False 

6881 Whether to modify the DataFrame rather than creating a new one. 

6882 kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort' 

6883 Choice of sorting algorithm. See also :func:`numpy.sort` for more 

6884 information. `mergesort` and `stable` are the only stable algorithms. For 

6885 DataFrames, this option is only applied when sorting on a single 

6886 column or label. 

6887 na_position : {'first', 'last'}, default 'last' 

6888 Puts NaNs at the beginning if `first`; `last` puts NaNs at the end. 

6889 Not implemented for MultiIndex. 

6890 sort_remaining : bool, default True 

6891 If True and sorting by level and index is multilevel, sort by other 

6892 levels too (in order) after sorting by specified level. 

6893 ignore_index : bool, default False 

6894 If True, the resulting axis will be labeled 0, 1, …, n - 1. 

6895 key : callable, optional 

6896 If not None, apply the key function to the index values 

6897 before sorting. This is similar to the `key` argument in the 

6898 builtin :meth:`sorted` function, with the notable difference that 

6899 this `key` function should be *vectorized*. It should expect an 

6900 ``Index`` and return an ``Index`` of the same shape. For MultiIndex 

6901 inputs, the key is applied *per level*. 

6902 

6903 .. versionadded:: 1.1.0 

6904 

6905 Returns 

6906 ------- 

6907 DataFrame or None 

6908 The original DataFrame sorted by the labels or None if ``inplace=True``. 

6909 

6910 See Also 

6911 -------- 

6912 Series.sort_index : Sort Series by the index. 

6913 DataFrame.sort_values : Sort DataFrame by the value. 

6914 Series.sort_values : Sort Series by the value. 

6915 

6916 Examples 

6917 -------- 

6918 >>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150], 

6919 ... columns=['A']) 

6920 >>> df.sort_index() 

6921 A 

6922 1 4 

6923 29 2 

6924 100 1 

6925 150 5 

6926 234 3 

6927 

6928 By default, it sorts in ascending order; to sort in descending order, 

6929 use ``ascending=False`` 

6930 

6931 >>> df.sort_index(ascending=False) 

6932 A 

6933 234 3 

6934 150 5 

6935 100 1 

6936 29 2 

6937 1 4 

6938 

6939 A key function can be specified which is applied to the index before 

6940 sorting. For a ``MultiIndex`` this is applied to each level separately. 

6941 

6942 >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd']) 

6943 >>> df.sort_index(key=lambda x: x.str.lower()) 

6944 a 

6945 A 1 

6946 b 2 

6947 C 3 

6948 d 4 

6949 """ 

6950 return super().sort_index( 

6951 axis=axis, 

6952 level=level, 

6953 ascending=ascending, 

6954 inplace=inplace, 

6955 kind=kind, 

6956 na_position=na_position, 

6957 sort_remaining=sort_remaining, 

6958 ignore_index=ignore_index, 

6959 key=key, 

6960 ) 

6961 

6962 def value_counts( 

6963 self, 

6964 subset: Sequence[Hashable] | None = None, 

6965 normalize: bool = False, 

6966 sort: bool = True, 

6967 ascending: bool = False, 

6968 dropna: bool = True, 

6969 ) -> Series: 

6970 """ 

6971 Return a Series containing counts of unique rows in the DataFrame. 

6972 

6973 .. versionadded:: 1.1.0 

6974 

6975 Parameters 

6976 ---------- 

6977 subset : label or list of labels, optional 

6978 Columns to use when counting unique combinations. 

6979 normalize : bool, default False 

6980 Return proportions rather than frequencies. 

6981 sort : bool, default True 

6982 Sort by frequencies. 

6983 ascending : bool, default False 

6984 Sort in ascending order. 

6985 dropna : bool, default True 

6986 Don’t include counts of rows that contain NA values. 

6987 

6988 .. versionadded:: 1.3.0 

6989 

6990 Returns 

6991 ------- 

6992 Series 

6993 

6994 See Also 

6995 -------- 

6996 Series.value_counts: Equivalent method on Series. 

6997 

6998 Notes 

6999 ----- 

7000 The returned Series will have a MultiIndex with one level per input 

7001 column but an Index (non-multi) for a single label. By default, rows 

7002 that contain any NA values are omitted from the result. By default, 

7003 the resulting Series will be in descending order so that the first 

7004 element is the most frequently-occurring row. 

7005 

7006 Examples 

7007 -------- 

7008 >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6], 

7009 ... 'num_wings': [2, 0, 0, 0]}, 

7010 ... index=['falcon', 'dog', 'cat', 'ant']) 

7011 >>> df 

7012 num_legs num_wings 

7013 falcon 2 2 

7014 dog 4 0 

7015 cat 4 0 

7016 ant 6 0 

7017 

7018 >>> df.value_counts() 

7019 num_legs num_wings 

7020 4 0 2 

7021 2 2 1 

7022 6 0 1 

7023 Name: count, dtype: int64 

7024 

7025 >>> df.value_counts(sort=False) 

7026 num_legs num_wings 

7027 2 2 1 

7028 4 0 2 

7029 6 0 1 

7030 Name: count, dtype: int64 

7031 

7032 >>> df.value_counts(ascending=True) 

7033 num_legs num_wings 

7034 2 2 1 

7035 6 0 1 

7036 4 0 2 

7037 Name: count, dtype: int64 

7038 

7039 >>> df.value_counts(normalize=True) 

7040 num_legs num_wings 

7041 4 0 0.50 

7042 2 2 0.25 

7043 6 0 0.25 

7044 Name: proportion, dtype: float64 

7045 

7046 With `dropna` set to `False` we can also count rows with NA values. 

7047 

7048 >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'], 

7049 ... 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']}) 

7050 >>> df 

7051 first_name middle_name 

7052 0 John Smith 

7053 1 Anne <NA> 

7054 2 John <NA> 

7055 3 Beth Louise 

7056 

7057 >>> df.value_counts() 

7058 first_name middle_name 

7059 Beth Louise 1 

7060 John Smith 1 

7061 Name: count, dtype: int64 

7062 

7063 >>> df.value_counts(dropna=False) 

7064 first_name middle_name 

7065 Anne NaN 1 

7066 Beth Louise 1 

7067 John Smith 1 

7068 NaN 1 

7069 Name: count, dtype: int64 

7070 

7071 >>> df.value_counts("first_name") 

7072 first_name 

7073 John 2 

7074 Anne 1 

7075 Beth 1 

7076 Name: count, dtype: int64 

7077 """ 

7078 if subset is None: 

7079 subset = self.columns.tolist() 

7080 

7081 name = "proportion" if normalize else "count" 

7082 counts = self.groupby(subset, dropna=dropna).grouper.size() 

7083 counts.name = name 

7084 

7085 if sort: 

7086 counts = counts.sort_values(ascending=ascending) 

7087 if normalize: 

7088 counts /= counts.sum() 

7089 

7090 # Force MultiIndex for single column 

7091 if is_list_like(subset) and len(subset) == 1: 

7092 counts.index = MultiIndex.from_arrays( 

7093 [counts.index], names=[counts.index.name] 

7094 ) 

7095 

7096 return counts 

7097 
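# Sketch of the composition implemented above (hypothetical data): the counts
# are a grouped ``size`` over the subset columns, then optionally sorted and
# normalized, which the assertion below is expected to confirm.
import pandas as pd

_df = pd.DataFrame({"a": [1, 1, 2], "b": ["x", "x", "y"]})
_counts = _df.groupby(["a", "b"], dropna=True).size()
_counts.name = "count"
assert _counts.sort_values(ascending=False).equals(_df.value_counts())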

7098 def nlargest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame: 

7099 """ 

7100 Return the first `n` rows ordered by `columns` in descending order. 

7101 

7102 Return the first `n` rows with the largest values in `columns`, in 

7103 descending order. The columns that are not specified are returned as 

7104 well, but not used for ordering. 

7105 

7106 This method is equivalent to 

7107 ``df.sort_values(columns, ascending=False).head(n)``, but more 

7108 performant. 

7109 

7110 Parameters 

7111 ---------- 

7112 n : int 

7113 Number of rows to return. 

7114 columns : label or list of labels 

7115 Column label(s) to order by. 

7116 keep : {'first', 'last', 'all'}, default 'first' 

7117 Where there are duplicate values: 

7118 

7119 - ``first`` : prioritize the first occurrence(s) 

7120 - ``last`` : prioritize the last occurrence(s) 

7121 - ``all`` : do not drop any duplicates, even if it means 

7122 selecting more than `n` items. 

7123 

7124 Returns 

7125 ------- 

7126 DataFrame 

7127 The first `n` rows ordered by the given columns in descending 

7128 order. 

7129 

7130 See Also 

7131 -------- 

7132 DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in 

7133 ascending order. 

7134 DataFrame.sort_values : Sort DataFrame by the values. 

7135 DataFrame.head : Return the first `n` rows without re-ordering. 

7136 

7137 Notes 

7138 ----- 

7139 This function cannot be used with all column types. For example, when 

7140 specifying columns with `object` or `category` dtypes, ``TypeError`` is 

7141 raised. 

7142 

7143 Examples 

7144 -------- 

7145 >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, 

7146 ... 434000, 434000, 337000, 11300, 

7147 ... 11300, 11300], 

7148 ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, 

7149 ... 17036, 182, 38, 311], 

7150 ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", 

7151 ... "IS", "NR", "TV", "AI"]}, 

7152 ... index=["Italy", "France", "Malta", 

7153 ... "Maldives", "Brunei", "Iceland", 

7154 ... "Nauru", "Tuvalu", "Anguilla"]) 

7155 >>> df 

7156 population GDP alpha-2 

7157 Italy 59000000 1937894 IT 

7158 France 65000000 2583560 FR 

7159 Malta 434000 12011 MT 

7160 Maldives 434000 4520 MV 

7161 Brunei 434000 12128 BN 

7162 Iceland 337000 17036 IS 

7163 Nauru 11300 182 NR 

7164 Tuvalu 11300 38 TV 

7165 Anguilla 11300 311 AI 

7166 

7167 In the following example, we will use ``nlargest`` to select the three 

7168 rows having the largest values in column "population". 

7169 

7170 >>> df.nlargest(3, 'population') 

7171 population GDP alpha-2 

7172 France 65000000 2583560 FR 

7173 Italy 59000000 1937894 IT 

7174 Malta 434000 12011 MT 

7175 

7176 When using ``keep='last'``, ties are resolved in reverse order: 

7177 

7178 >>> df.nlargest(3, 'population', keep='last') 

7179 population GDP alpha-2 

7180 France 65000000 2583560 FR 

7181 Italy 59000000 1937894 IT 

7182 Brunei 434000 12128 BN 

7183 

7184 When using ``keep='all'``, all duplicate items are maintained: 

7185 

7186 >>> df.nlargest(3, 'population', keep='all') 

7187 population GDP alpha-2 

7188 France 65000000 2583560 FR 

7189 Italy 59000000 1937894 IT 

7190 Malta 434000 12011 MT 

7191 Maldives 434000 4520 MV 

7192 Brunei 434000 12128 BN 

7193 

7194 To order by the largest values in column "population" and then "GDP", 

7195 we can specify multiple columns like in the next example. 

7196 

7197 >>> df.nlargest(3, ['population', 'GDP']) 

7198 population GDP alpha-2 

7199 France 65000000 2583560 FR 

7200 Italy 59000000 1937894 IT 

7201 Brunei 434000 12128 BN 

7202 """ 

7203 return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest() 

7204 
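# Usage sketch (hypothetical data): per the equivalence stated in the
# docstring above, sort-then-head should match ``nlargest`` when no duplicate
# values compete for the cutoff.
import pandas as pd

_df = pd.DataFrame({"x": [3, 1, 2]})
assert _df.nlargest(2, "x").equals(_df.sort_values("x", ascending=False).head(2))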

7205 def nsmallest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame: 

7206 """ 

7207 Return the first `n` rows ordered by `columns` in ascending order. 

7208 

7209 Return the first `n` rows with the smallest values in `columns`, in 

7210 ascending order. The columns that are not specified are returned as 

7211 well, but not used for ordering. 

7212 

7213 This method is equivalent to 

7214 ``df.sort_values(columns, ascending=True).head(n)``, but more 

7215 performant. 

7216 

7217 Parameters 

7218 ---------- 

7219 n : int 

7220 Number of items to retrieve. 

7221 columns : list or str 

7222 Column name or names to order by. 

7223 keep : {'first', 'last', 'all'}, default 'first' 

7224 Where there are duplicate values: 

7225 

7226 - ``first`` : take the first occurrence. 

7227 - ``last`` : take the last occurrence. 

7228 - ``all`` : do not drop any duplicates, even if it means 

7229 selecting more than `n` items. 

7230 

7231 Returns 

7232 ------- 

7233 DataFrame 

7234 

7235 See Also 

7236 -------- 

7237 DataFrame.nlargest : Return the first `n` rows ordered by `columns` in 

7238 descending order. 

7239 DataFrame.sort_values : Sort DataFrame by the values. 

7240 DataFrame.head : Return the first `n` rows without re-ordering. 

7241 

7242 Examples 

7243 -------- 

7244 >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000, 

7245 ... 434000, 434000, 337000, 337000, 

7246 ... 11300, 11300], 

7247 ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128, 

7248 ... 17036, 182, 38, 311], 

7249 ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN", 

7250 ... "IS", "NR", "TV", "AI"]}, 

7251 ... index=["Italy", "France", "Malta", 

7252 ... "Maldives", "Brunei", "Iceland", 

7253 ... "Nauru", "Tuvalu", "Anguilla"]) 

7254 >>> df 

7255 population GDP alpha-2 

7256 Italy 59000000 1937894 IT 

7257 France 65000000 2583560 FR 

7258 Malta 434000 12011 MT 

7259 Maldives 434000 4520 MV 

7260 Brunei 434000 12128 BN 

7261 Iceland 337000 17036 IS 

7262 Nauru 337000 182 NR 

7263 Tuvalu 11300 38 TV 

7264 Anguilla 11300 311 AI 

7265 

7266 In the following example, we will use ``nsmallest`` to select the 

7267 three rows having the smallest values in column "population". 

7268 

7269 >>> df.nsmallest(3, 'population') 

7270 population GDP alpha-2 

7271 Tuvalu 11300 38 TV 

7272 Anguilla 11300 311 AI 

7273 Iceland 337000 17036 IS 

7274 

7275 When using ``keep='last'``, ties are resolved in reverse order: 

7276 

7277 >>> df.nsmallest(3, 'population', keep='last') 

7278 population GDP alpha-2 

7279 Anguilla 11300 311 AI 

7280 Tuvalu 11300 38 TV 

7281 Nauru 337000 182 NR 

7282 

7283 When using ``keep='all'``, all duplicate items are maintained: 

7284 

7285 >>> df.nsmallest(3, 'population', keep='all') 

7286 population GDP alpha-2 

7287 Tuvalu 11300 38 TV 

7288 Anguilla 11300 311 AI 

7289 Iceland 337000 17036 IS 

7290 Nauru 337000 182 NR 

7291 

7292 To order by the smallest values in column "population" and then "GDP", we can 

7293 specify multiple columns like in the next example. 

7294 

7295 >>> df.nsmallest(3, ['population', 'GDP']) 

7296 population GDP alpha-2 

7297 Tuvalu 11300 38 TV 

7298 Anguilla 11300 311 AI 

7299 Nauru 337000 182 NR 

7300 """ 

7301 return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nsmallest() 

7302 

7303 @doc( 

7304 Series.swaplevel, 

7305 klass=_shared_doc_kwargs["klass"], 

7306 extra_params=dedent( 

7307 """axis : {0 or 'index', 1 or 'columns'}, default 0 

7308 The axis to swap levels on. 0 or 'index' for row-wise, 1 or 

7309 'columns' for column-wise.""" 

7310 ), 

7311 examples=dedent( 

7312 """\ 

7313 Examples 

7314 -------- 

7315 >>> df = pd.DataFrame( 

7316 ... {"Grade": ["A", "B", "A", "C"]}, 

7317 ... index=[ 

7318 ... ["Final exam", "Final exam", "Coursework", "Coursework"], 

7319 ... ["History", "Geography", "History", "Geography"], 

7320 ... ["January", "February", "March", "April"], 

7321 ... ], 

7322 ... ) 

7323 >>> df 

7324 Grade 

7325 Final exam History January A 

7326 Geography February B 

7327 Coursework History March A 

7328 Geography April C 

7329 

7330 In the following example, we will swap the levels of the row index. 

7331 Here, we swap the levels row-wise, but levels can be swapped column-wise 

7332 in a similar manner by passing ``axis=1``. Note that row-wise is the default 

7333 behaviour. By not supplying any arguments for i and j, we swap the last and 

7334 second-to-last levels. 

7335 

7336 >>> df.swaplevel() 

7337 Grade 

7338 Final exam January History A 

7339 February Geography B 

7340 Coursework March History A 

7341 April Geography C 

7342 

7343 By supplying one argument, we can choose which index to swap the last 

7344 index with. We can for example swap the first index with the last one as 

7345 follows. 

7346 

7347 >>> df.swaplevel(0) 

7348 Grade 

7349 January History Final exam A 

7350 February Geography Final exam B 

7351 March History Coursework A 

7352 April Geography Coursework C 

7353 

7354 We can also define explicitly which indices we want to swap by supplying values 

7355 for both i and j. Here, we for example swap the first and second indices. 

7356 

7357 >>> df.swaplevel(0, 1) 

7358 Grade 

7359 History Final exam January A 

7360 Geography Final exam February B 

7361 History Coursework March A 

7362 Geography Coursework April C""" 

7363 ), 

7364 ) 

7365 def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: 

7366 result = self.copy(deep=None) 

7367 

7368 axis = self._get_axis_number(axis) 

7369 

7370 if not isinstance(result._get_axis(axis), MultiIndex): # pragma: no cover 

7371 raise TypeError("Can only swap levels on a hierarchical axis.") 

7372 

7373 if axis == 0: 

7374 assert isinstance(result.index, MultiIndex) 

7375 result.index = result.index.swaplevel(i, j) 

7376 else: 

7377 assert isinstance(result.columns, MultiIndex) 

7378 result.columns = result.columns.swaplevel(i, j) 

7379 return result 

7380 

7381 def reorder_levels(self, order: Sequence[int | str], axis: Axis = 0) -> DataFrame: 

7382 """ 

7383 Rearrange index levels using input order. May not drop or duplicate levels. 

7384 

7385 Parameters 

7386 ---------- 

7387 order : list of int or list of str 

7388 List representing new level order. Reference level by number 

7389 (position) or by key (label). 

7390 axis : {0 or 'index', 1 or 'columns'}, default 0 

7391 Where to reorder levels. 

7392 

7393 Returns 

7394 ------- 

7395 DataFrame 

7396 

7397 Examples 

7398 -------- 

7399 >>> data = { 

7400 ... "class": ["Mammals", "Mammals", "Reptiles"], 

7401 ... "diet": ["Omnivore", "Carnivore", "Carnivore"], 

7402 ... "species": ["Humans", "Dogs", "Snakes"], 

7403 ... } 

7404 >>> df = pd.DataFrame(data, columns=["class", "diet", "species"]) 

7405 >>> df = df.set_index(["class", "diet"]) 

7406 >>> df 

7407 species 

7408 class diet 

7409 Mammals Omnivore Humans 

7410 Carnivore Dogs 

7411 Reptiles Carnivore Snakes 

7412 

7413 Let's reorder the levels of the index: 

7414 

7415 >>> df.reorder_levels(["diet", "class"]) 

7416 species 

7417 diet class 

7418 Omnivore Mammals Humans 

7419 Carnivore Mammals Dogs 

7420 Reptiles Snakes 

7421 """ 

7422 axis = self._get_axis_number(axis) 

7423 if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover 

7424 raise TypeError("Can only reorder levels on a hierarchical axis.") 

7425 

7426 result = self.copy(deep=None) 

7427 

7428 if axis == 0: 

7429 assert isinstance(result.index, MultiIndex) 

7430 result.index = result.index.reorder_levels(order) 

7431 else: 

7432 assert isinstance(result.columns, MultiIndex) 

7433 result.columns = result.columns.reorder_levels(order) 

7434 return result 

7435 

7436 # ---------------------------------------------------------------------- 

7437 # Arithmetic Methods 

7438 

7439 def _cmp_method(self, other, op): 

7440 axis: Literal[1] = 1 # only relevant for Series other case 

7441 

7442 self, other = ops.align_method_FRAME(self, other, axis, flex=False, level=None) 

7443 

7444 # See GH#4537 for discussion of scalar op behavior 

7445 new_data = self._dispatch_frame_op(other, op, axis=axis) 

7446 return self._construct_result(new_data) 

7447 

7448 def _arith_method(self, other, op): 

7449 if ops.should_reindex_frame_op(self, other, op, 1, None, None): 

7450 return ops.frame_arith_method_with_reindex(self, other, op) 

7451 

7452 axis: Literal[1] = 1 # only relevant for Series other case 

7453 other = ops.maybe_prepare_scalar_for_op(other, (self.shape[axis],)) 

7454 

7455 self, other = ops.align_method_FRAME(self, other, axis, flex=True, level=None) 

7456 

7457 new_data = self._dispatch_frame_op(other, op, axis=axis) 

7458 return self._construct_result(new_data) 

7459 

7460 _logical_method = _arith_method 

7461 
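# Sketch (hypothetical frames): ``_arith_method`` aligns with ``flex=True``,
# so arithmetic reindexes the operands first, while ``_cmp_method`` aligns
# with ``flex=False`` and is expected to reject non-identical labels.
import pandas as pd

_a = pd.DataFrame({"x": [1, 2]})
_b = pd.DataFrame({"x": [2, 1]}, index=[1, 0])
_a + _b  # flex: aligned on the index union first, then added position-wise
try:
    _a == _b  # non-flex: expected to raise for non-identically-labeled frames
except ValueError:
    pass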

7462 def _dispatch_frame_op(self, right, func: Callable, axis: AxisInt | None = None): 

7463 """ 

7464 Evaluate the frame operation func(left, right) by evaluating 

7465 column-by-column, dispatching to the Series implementation. 

7466 

7467 Parameters 

7468 ---------- 

7469 right : scalar, Series, or DataFrame 

7470 func : arithmetic or comparison operator 

7471 axis : {None, 0, 1} 

7472 

7473 Returns 

7474 ------- 

7475 DataFrame 

7476 """ 

7477 # Get the appropriate array-op to apply to each column/block's values. 

7478 array_op = ops.get_array_op(func) 

7479 

7480 right = lib.item_from_zerodim(right) 

7481 if not is_list_like(right): 

7482 # i.e. scalar, faster than checking np.ndim(right) == 0 

7483 with np.errstate(all="ignore"): 

7484 bm = self._mgr.apply(array_op, right=right) 

7485 return self._constructor(bm) 

7486 

7487 elif isinstance(right, DataFrame): 

7488 assert self.index.equals(right.index) 

7489 assert self.columns.equals(right.columns) 

7490 # TODO: The previous assertion `assert right._indexed_same(self)` 

7491 # fails in cases with empty columns reached via 

7492 # _frame_arith_method_with_reindex 

7493 

7494 # TODO operate_blockwise expects a manager of the same type 

7495 with np.errstate(all="ignore"): 

7496 bm = self._mgr.operate_blockwise( 

7497 # error: Argument 1 to "operate_blockwise" of "ArrayManager" has 

7498 # incompatible type "Union[ArrayManager, BlockManager]"; expected 

7499 # "ArrayManager" 

7500 # error: Argument 1 to "operate_blockwise" of "BlockManager" has 

7501 # incompatible type "Union[ArrayManager, BlockManager]"; expected 

7502 # "BlockManager" 

7503 right._mgr, # type: ignore[arg-type] 

7504 array_op, 

7505 ) 

7506 return self._constructor(bm) 

7507 

7508 elif isinstance(right, Series) and axis == 1: 

7509 # axis=1 means we want to operate row-by-row 

7510 assert right.index.equals(self.columns) 

7511 

7512 right = right._values 

7513 # maybe_align_as_frame ensures we do not have an ndarray here 

7514 assert not isinstance(right, np.ndarray) 

7515 

7516 with np.errstate(all="ignore"): 

7517 arrays = [ 

7518 array_op(_left, _right) 

7519 for _left, _right in zip(self._iter_column_arrays(), right) 

7520 ] 

7521 

7522 elif isinstance(right, Series): 

7523 assert right.index.equals(self.index) # Handle other cases later 

7524 right = right._values 

7525 

7526 with np.errstate(all="ignore"): 

7527 arrays = [array_op(left, right) for left in self._iter_column_arrays()] 

7528 

7529 else: 

7530 # Remaining cases have less-obvious dispatch rules 

7531 raise NotImplementedError(right) 

7532 

7533 return type(self)._from_arrays( 

7534 arrays, self.columns, self.index, verify_integrity=False 

7535 ) 

7536 
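# Sketch (hypothetical data): the ``axis == 1`` branch above pairs one column
# array with one element of the Series, i.e. the Series is matched against the
# columns, which is the usual frame-plus-series broadcasting.
import pandas as pd

_df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
_ser = pd.Series({"a": 10, "b": 100})
_df + _ser  # column "a" gets +10, column "b" gets +100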

7537 def _combine_frame(self, other: DataFrame, func, fill_value=None): 

7538 # at this point we have `self._indexed_same(other)` 

7539 

7540 if fill_value is None: 

7541 # since _arith_op may be called in a loop, avoid function call 

7542 # overhead if possible by doing this check once 

7543 _arith_op = func 

7544 

7545 else: 

7546 

7547 def _arith_op(left, right): 

7548 # for the mixed_type case where we iterate over columns, 

7549 # _arith_op(left, right) is equivalent to 

7550 # left._binop(right, func, fill_value=fill_value) 

7551 left, right = ops.fill_binop(left, right, fill_value) 

7552 return func(left, right) 

7553 

7554 new_data = self._dispatch_frame_op(other, _arith_op) 

7555 return new_data 

7556 
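# Sketch (hypothetical data): the ``fill_value`` wrapper above fills a value
# that is missing on only one side before applying ``func``; the flex ops such
# as ``DataFrame.add`` expose this, so only positions missing in *both* inputs
# should remain NaN.
import numpy as np
import pandas as pd

_a = pd.DataFrame({"x": [1.0, np.nan]})
_b = pd.DataFrame({"x": [np.nan, np.nan]})
_a.add(_b, fill_value=0)  # row 0 -> 1.0 (filled), row 1 -> NaN (missing twice)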

7557 def _construct_result(self, result) -> DataFrame: 

7558 """ 

7559 Wrap the result of an arithmetic, comparison, or logical operation. 

7560 

7561 Parameters 

7562 ---------- 

7563 result : DataFrame 

7564 

7565 Returns 

7566 ------- 

7567 DataFrame 

7568 """ 

7569 out = self._constructor(result, copy=False).__finalize__(self) 

7570 # Pin columns instead of passing to constructor for compat with 

7571 # non-unique columns case 

7572 out.columns = self.columns 

7573 out.index = self.index 

7574 return out 

7575 

7576 def __divmod__(self, other) -> tuple[DataFrame, DataFrame]: 

7577 # Naive implementation, room for optimization 

7578 div = self // other 

7579 mod = self - div * other 

7580 return div, mod 

7581 

7582 def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]: 

7583 # Naive implementation, room for optimization 

7584 div = other // self 

7585 mod = other - div * self 

7586 return div, mod 

7587 
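# Usage sketch (hypothetical data): the naive identity above means
# ``divmod(df, k)`` returns the element-wise floordiv/remainder pair.
import pandas as pd

_q, _r = divmod(pd.DataFrame({"x": [7, 8]}), 3)  # _q: [2, 2], _r: [1, 2]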

7588 # ---------------------------------------------------------------------- 

7589 # Combination-Related 

7590 

7591 @doc( 

7592 _shared_docs["compare"], 

7593 """ 

7594Returns 

7595------- 

7596DataFrame 

7597 DataFrame that shows the differences stacked side by side. 

7598 

7599 The resulting index will be a MultiIndex with 'self' and 'other' 

7600 stacked alternately at the inner level. 

7601 

7602Raises 

7603------ 

7604ValueError 

7605 When the two DataFrames don't have identical labels or shape. 

7606 

7607See Also 

7608-------- 

7609Series.compare : Compare with another Series and show differences. 

7610DataFrame.equals : Test whether two objects contain the same elements. 

7611 

7612Notes 

7613----- 

7614Matching NaNs will not appear as a difference. 

7615 

7616Can only compare identically-labeled 

7617 (i.e. same shape, identical row and column labels) DataFrames. 

7618 

7619Examples 

7620-------- 

7621>>> df = pd.DataFrame( 

7622... {{ 

7623... "col1": ["a", "a", "b", "b", "a"], 

7624... "col2": [1.0, 2.0, 3.0, np.nan, 5.0], 

7625... "col3": [1.0, 2.0, 3.0, 4.0, 5.0] 

7626... }}, 

7627... columns=["col1", "col2", "col3"], 

7628... ) 

7629>>> df 

7630 col1 col2 col3 

76310 a 1.0 1.0 

76321 a 2.0 2.0 

76332 b 3.0 3.0 

76343 b NaN 4.0 

76354 a 5.0 5.0 

7636 

7637>>> df2 = df.copy() 

7638>>> df2.loc[0, 'col1'] = 'c' 

7639>>> df2.loc[2, 'col3'] = 4.0 

7640>>> df2 

7641 col1 col2 col3 

76420 c 1.0 1.0 

76431 a 2.0 2.0 

76442 b 3.0 4.0 

76453 b NaN 4.0 

76464 a 5.0 5.0 

7647 

7648Align the differences on columns 

7649 

7650>>> df.compare(df2) 

7651 col1 col3 

7652 self other self other 

76530 a c NaN NaN 

76542 NaN NaN 3.0 4.0 

7655 

7656Assign result_names 

7657 

7658>>> df.compare(df2, result_names=("left", "right")) 

7659 col1 col3 

7660 left right left right 

76610 a c NaN NaN 

76622 NaN NaN 3.0 4.0 

7663 

7664Stack the differences on rows 

7665 

7666>>> df.compare(df2, align_axis=0) 

7667 col1 col3 

76680 self a NaN 

7669 other c NaN 

76702 self NaN 3.0 

7671 other NaN 4.0 

7672 

7673Keep the equal values 

7674 

7675>>> df.compare(df2, keep_equal=True) 

7676 col1 col3 

7677 self other self other 

76780 a c 1.0 1.0 

76792 b b 3.0 4.0 

7680 

7681Keep all original rows and columns 

7682 

7683>>> df.compare(df2, keep_shape=True) 

7684 col1 col2 col3 

7685 self other self other self other 

76860 a c NaN NaN NaN NaN 

76871 NaN NaN NaN NaN NaN NaN 

76882 NaN NaN NaN NaN 3.0 4.0 

76893 NaN NaN NaN NaN NaN NaN 

76904 NaN NaN NaN NaN NaN NaN 

7691 

7692Keep all original rows and columns and also all original values 

7693 

7694>>> df.compare(df2, keep_shape=True, keep_equal=True) 

7695 col1 col2 col3 

7696 self other self other self other 

76970 a c 1.0 1.0 1.0 1.0 

76981 a a 2.0 2.0 2.0 2.0 

76992 b b 3.0 3.0 3.0 4.0 

77003 b b NaN NaN 4.0 4.0 

77014 a a 5.0 5.0 5.0 5.0 

7702""", 

7703 klass=_shared_doc_kwargs["klass"], 

7704 ) 

7705 def compare( 

7706 self, 

7707 other: DataFrame, 

7708 align_axis: Axis = 1, 

7709 keep_shape: bool = False, 

7710 keep_equal: bool = False, 

7711 result_names: Suffixes = ("self", "other"), 

7712 ) -> DataFrame: 

7713 return super().compare( 

7714 other=other, 

7715 align_axis=align_axis, 

7716 keep_shape=keep_shape, 

7717 keep_equal=keep_equal, 

7718 result_names=result_names, 

7719 ) 

7720 

7721 def combine( 

7722 self, 

7723 other: DataFrame, 

7724 func: Callable[[Series, Series], Series | Hashable], 

7725 fill_value=None, 

7726 overwrite: bool = True, 

7727 ) -> DataFrame: 

7728 """ 

7729 Perform column-wise combine with another DataFrame. 

7730 

7731 Combines a DataFrame with `other` DataFrame using `func` 

7732 to element-wise combine columns. The row and column indexes of the 

7733 resulting DataFrame will be the union of the two. 

7734 

7735 Parameters 

7736 ---------- 

7737 other : DataFrame 

7738 The DataFrame to merge column-wise. 

7739 func : function 

7740 Function that takes two Series as inputs and returns a Series or a 

7741 scalar. Used to merge the two DataFrames column by column. 

7742 fill_value : scalar value, default None 

7743 The value to fill NaNs with prior to passing any column to the 

7744 merge func. 

7745 overwrite : bool, default True 

7746 If True, columns in `self` that do not exist in `other` will be 

7747 overwritten with NaNs. 

7748 

7749 Returns 

7750 ------- 

7751 DataFrame 

7752 Combination of the provided DataFrames. 

7753 

7754 See Also 

7755 -------- 

7756 DataFrame.combine_first : Combine two DataFrame objects and default to 

7757 non-null values in the frame calling the method. 

7758 

7759 Examples 

7760 -------- 

7761 Combine using a simple function that chooses the smaller column. 

7762 

7763 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) 

7764 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) 

7765 >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2 

7766 >>> df1.combine(df2, take_smaller) 

7767 A B 

7768 0 0 3 

7769 1 0 3 

7770 

7771 Example using a true element-wise combine function. 

7772 

7773 >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]}) 

7774 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) 

7775 >>> df1.combine(df2, np.minimum) 

7776 A B 

7777 0 1 2 

7778 1 0 3 

7779 

7780 Using `fill_value` fills ``None`` values prior to passing the column to the 

7781 merge function. 

7782 

7783 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) 

7784 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) 

7785 >>> df1.combine(df2, take_smaller, fill_value=-5) 

7786 A B 

7787 0 0 -5.0 

7788 1 0 4.0 

7789 

7790 However, if the same element in both dataframes is None, that None 

7791 is preserved 

7792 

7793 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]}) 

7794 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]}) 

7795 >>> df1.combine(df2, take_smaller, fill_value=-5) 

7796 A B 

7797 0 0 -5.0 

7798 1 0 3.0 

7799 

7800 Example that demonstrates the use of `overwrite` and behavior when 

7801 the axes differ between the dataframes. 

7802 

7803 >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]}) 

7804 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2]) 

7805 >>> df1.combine(df2, take_smaller) 

7806 A B C 

7807 0 NaN NaN NaN 

7808 1 NaN 3.0 -10.0 

7809 2 NaN 3.0 1.0 

7810 

7811 >>> df1.combine(df2, take_smaller, overwrite=False) 

7812 A B C 

7813 0 0.0 NaN NaN 

7814 1 0.0 3.0 -10.0 

7815 2 NaN 3.0 1.0 

7816 

7817 Demonstrating the preference of the passed-in dataframe. 

7818 

7819 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2]) 

7820 >>> df2.combine(df1, take_smaller) 

7821 A B C 

7822 0 0.0 NaN NaN 

7823 1 0.0 3.0 NaN 

7824 2 NaN 3.0 NaN 

7825 

7826 >>> df2.combine(df1, take_smaller, overwrite=False) 

7827 A B C 

7828 0 0.0 NaN NaN 

7829 1 0.0 3.0 1.0 

7830 2 NaN 3.0 1.0 

7831 """ 

7832 other_idxlen = len(other.index) # save for compare 

7833 

7834 this, other = self.align(other, copy=False) 

7835 new_index = this.index 

7836 

7837 if other.empty and len(new_index) == len(self.index): 

7838 return self.copy() 

7839 

7840 if self.empty and len(other) == other_idxlen: 

7841 return other.copy() 

7842 

7843 # sorts if possible; otherwise align above ensures that these are set-equal 

7844 new_columns = this.columns.union(other.columns) 

7845 do_fill = fill_value is not None 

7846 result = {} 

7847 for col in new_columns: 

7848 series = this[col] 

7849 other_series = other[col] 

7850 

7851 this_dtype = series.dtype 

7852 other_dtype = other_series.dtype 

7853 

7854 this_mask = isna(series) 

7855 other_mask = isna(other_series) 

7856 

7857 # don't overwrite columns unnecessarily 

7858 # DO propagate if this column is not in the intersection 

7859 if not overwrite and other_mask.all(): 

7860 result[col] = this[col].copy() 

7861 continue 

7862 

7863 if do_fill: 

7864 series = series.copy() 

7865 other_series = other_series.copy() 

7866 series[this_mask] = fill_value 

7867 other_series[other_mask] = fill_value 

7868 

7869 if col not in self.columns: 

7870 # If col is not in the self DataFrame (it came from other), 

7871 # try to cast the series, which is all NaN, to other_dtype. 

7872 new_dtype = other_dtype 

7873 try: 

7874 series = series.astype(new_dtype, copy=False) 

7875 except ValueError: 

7876 # e.g. new_dtype is integer types 

7877 pass 

7878 else: 

7879 # if we have different dtypes, possibly promote 

7880 new_dtype = find_common_type([this_dtype, other_dtype]) 

7881 series = series.astype(new_dtype, copy=False) 

7882 other_series = other_series.astype(new_dtype, copy=False) 

7883 

7884 arr = func(series, other_series) 

7885 if isinstance(new_dtype, np.dtype): 

7886 # if new_dtype is an EA Dtype, then `func` is expected to return 

7887 # the correct dtype without any additional casting 

7888 # error: No overload variant of "maybe_downcast_to_dtype" matches 

7889 # argument types "Union[Series, Hashable]", "dtype[Any]" 

7890 arr = maybe_downcast_to_dtype( # type: ignore[call-overload] 

7891 arr, new_dtype 

7892 ) 

7893 

7894 result[col] = arr 

7895 

7896 # convert_objects just in case 

7897 return self._constructor(result, index=new_index, columns=new_columns) 

7898 
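# Sketch (hypothetical data): per the ``find_common_type`` promotion above,
# combining an int64 column with a float64 column should upcast both to
# float64 before ``func`` is applied.
import numpy as np
import pandas as pd

_a = pd.DataFrame({"x": [1, 2]})      # int64
_b = pd.DataFrame({"x": [0.5, 2.5]})  # float64
_a.combine(_b, np.minimum).dtypes     # expected: x -> float64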

7899 def combine_first(self, other: DataFrame) -> DataFrame: 

7900 """ 

7901 Update null elements with value in the same location in `other`. 

7902 

7903 Combine two DataFrame objects by filling null values in one DataFrame 

7904 with non-null values from the other DataFrame. The row and column indexes 

7905 of the resulting DataFrame will be the union of the two. When calling 

7906 ``first.combine_first(second)``, the resulting dataframe keeps the values 

7907 of ``first`` and overrides the values of ``second`` wherever both 

7908 ``first.loc[index, col]`` and ``second.loc[index, col]`` are 

7909 non-missing. 

7910 

7911 Parameters 

7912 ---------- 

7913 other : DataFrame 

7914 Provided DataFrame to use to fill null values. 

7915 

7916 Returns 

7917 ------- 

7918 DataFrame 

7919 The result of combining the provided DataFrame with the other object. 

7920 

7921 See Also 

7922 -------- 

7923 DataFrame.combine : Perform series-wise operation on two DataFrames 

7924 using a given function. 

7925 

7926 Examples 

7927 -------- 

7928 >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]}) 

7929 >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]}) 

7930 >>> df1.combine_first(df2) 

7931 A B 

7932 0 1.0 3.0 

7933 1 0.0 4.0 

7934 

7935 Null values still persist if the location of that null value 

7936 does not exist in `other` 

7937 

7938 >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]}) 

7939 >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2]) 

7940 >>> df1.combine_first(df2) 

7941 A B C 

7942 0 NaN 4.0 NaN 

7943 1 0.0 3.0 1.0 

7944 2 NaN 3.0 1.0 

7945 """ 

7946 from pandas.core.computation import expressions 

7947 

7948 def combiner(x, y): 

7949 mask = extract_array(isna(x)) 

7950 

7951 x_values = extract_array(x, extract_numpy=True) 

7952 y_values = extract_array(y, extract_numpy=True) 

7953 

7954 # If the column y in other DataFrame is not in first DataFrame, 

7955 # just return y_values. 

7956 if y.name not in self.columns: 

7957 return y_values 

7958 

7959 return expressions.where(mask, y_values, x_values) 

7960 

7961 combined = self.combine(other, combiner, overwrite=False) 

7962 

7963 dtypes = { 

7964 col: find_common_type([self.dtypes[col], other.dtypes[col]]) 

7965 for col in self.columns.intersection(other.columns) 

7966 if not is_dtype_equal(combined.dtypes[col], self.dtypes[col]) 

7967 } 

7968 

7969 if dtypes: 

7970 combined = combined.astype(dtypes) 

7971 

7972 return combined 

7973 

7974 def update( 

7975 self, 

7976 other, 

7977 join: str = "left", 

7978 overwrite: bool = True, 

7979 filter_func=None, 

7980 errors: str = "ignore", 

7981 ) -> None: 

7982 """ 

7983 Modify in place using non-NA values from another DataFrame. 

7984 

7985 Aligns on indices. There is no return value. 

7986 

7987 Parameters 

7988 ---------- 

7989 other : DataFrame, or object coercible into a DataFrame 

7990 Should have at least one matching index/column label 

7991 with the original DataFrame. If a Series is passed, 

7992 its name attribute must be set, and that will be 

7993 used as the column name to align with the original DataFrame. 

7994 join : {'left'}, default 'left' 

7995 Only left join is implemented, keeping the index and columns of the 

7996 original object. 

7997 overwrite : bool, default True 

7998 How to handle non-NA values for overlapping keys: 

7999 

8000 * True: overwrite original DataFrame's values 

8001 with values from `other`. 

8002 * False: only update values that are NA in 

8003 the original DataFrame. 

8004 

8005 filter_func : callable(1d-array) -> bool 1d-array, optional 

8006 Can choose to replace values other than NA. Return True for values 

8007 that should be updated. 

8008 errors : {'raise', 'ignore'}, default 'ignore' 

8009 If 'raise', will raise a ValueError if the DataFrame and `other` 

8010 both contain non-NA data in the same place. 

8011 

8012 Returns 

8013 ------- 

8014 None 

8015 This method directly changes calling object. 

8016 

8017 Raises 

8018 ------ 

8019 ValueError 

8020 * When `errors='raise'` and there's overlapping non-NA data. 

8021 * When `errors` is neither `'ignore'` nor `'raise'`. 

8022 NotImplementedError 

8023 * If `join != 'left'` 

8024 

8025 See Also 

8026 -------- 

8027 dict.update : Similar method for dictionaries. 

8028 DataFrame.merge : For column(s)-on-column(s) operations. 

8029 

8030 Examples 

8031 -------- 

8032 >>> df = pd.DataFrame({'A': [1, 2, 3], 

8033 ... 'B': [400, 500, 600]}) 

8034 >>> new_df = pd.DataFrame({'B': [4, 5, 6], 

8035 ... 'C': [7, 8, 9]}) 

8036 >>> df.update(new_df) 

8037 >>> df 

8038 A B 

8039 0 1 4 

8040 1 2 5 

8041 2 3 6 

8042 

8043 The DataFrame's length does not increase as a result of the update; 

8044 only values at matching index/column labels are updated. 

8045 

8046 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], 

8047 ... 'B': ['x', 'y', 'z']}) 

8048 >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']}) 

8049 >>> df.update(new_df) 

8050 >>> df 

8051 A B 

8052 0 a d 

8053 1 b e 

8054 2 c f 

8055 

8056 For Series, its name attribute must be set. 

8057 

8058 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], 

8059 ... 'B': ['x', 'y', 'z']}) 

8060 >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2]) 

8061 >>> df.update(new_column) 

8062 >>> df 

8063 A B 

8064 0 a d 

8065 1 b y 

8066 2 c e 

8067 >>> df = pd.DataFrame({'A': ['a', 'b', 'c'], 

8068 ... 'B': ['x', 'y', 'z']}) 

8069 >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2]) 

8070 >>> df.update(new_df) 

8071 >>> df 

8072 A B 

8073 0 a x 

8074 1 b d 

8075 2 c e 

8076 

8077 If `other` contains NaNs the corresponding values are not updated 

8078 in the original dataframe. 

8079 

8080 >>> df = pd.DataFrame({'A': [1, 2, 3], 

8081 ... 'B': [400, 500, 600]}) 

8082 >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]}) 

8083 >>> df.update(new_df) 

8084 >>> df 

8085 A B 

8086 0 1 4 

8087 1 2 500 

8088 2 3 6 

8089 """ 

8090 from pandas.core.computation import expressions 

8091 

8092 # TODO: Support other joins 

8093 if join != "left": # pragma: no cover 

8094 raise NotImplementedError("Only left join is supported") 

8095 if errors not in ["ignore", "raise"]: 

8096 raise ValueError("The parameter errors must be either 'ignore' or 'raise'") 

8097 

8098 if not isinstance(other, DataFrame): 

8099 other = DataFrame(other) 

8100 

8101 other = other.reindex(self.index) 

8102 

8103 for col in self.columns.intersection(other.columns): 

8104 this = self[col]._values 

8105 that = other[col]._values 

8106 

8107 if filter_func is not None: 

8108 with np.errstate(all="ignore"): 

8109 mask = ~filter_func(this) | isna(that) 

8110 else: 

8111 if errors == "raise": 

8112 mask_this = notna(that) 

8113 mask_that = notna(this) 

8114 if any(mask_this & mask_that): 

8115 raise ValueError("Data overlaps.") 

8116 

8117 if overwrite: 

8118 mask = isna(that) 

8119 else: 

8120 mask = notna(this) 

8121 

8122 # don't overwrite columns unnecessarily 

8123 if mask.all(): 

8124 continue 

8125 

8126 self.loc[:, col] = expressions.where(mask, this, that) 

8127 
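# Usage sketch (hypothetical data): ``filter_func`` above receives the
# existing column values and returns True where an update is allowed, so the
# value 3 below should survive the update.
import pandas as pd

_df = pd.DataFrame({"A": [1, 2, 3]})
_df.update(pd.DataFrame({"A": [10, 20, 30]}), filter_func=lambda v: v < 3)
# _df["A"] is expected to be [10, 20, 3]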

8128 # ---------------------------------------------------------------------- 

8129 # Data reshaping 

8130 @Appender( 

8131 """ 

8132Examples 

8133-------- 

8134>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', 

8135... 'Parrot', 'Parrot'], 

8136... 'Max Speed': [380., 370., 24., 26.]}) 

8137>>> df 

8138 Animal Max Speed 

81390 Falcon 380.0 

81401 Falcon 370.0 

81412 Parrot 24.0 

81423 Parrot 26.0 

8143>>> df.groupby(['Animal']).mean() 

8144 Max Speed 

8145Animal 

8146Falcon 375.0 

8147Parrot 25.0 

8148 

8149**Hierarchical Indexes** 

8150 

8151We can groupby different levels of a hierarchical index 

8152using the `level` parameter: 

8153 

8154>>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'], 

8155... ['Captive', 'Wild', 'Captive', 'Wild']] 

8156>>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type')) 

8157>>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]}, 

8158... index=index) 

8159>>> df 

8160 Max Speed 

8161Animal Type 

8162Falcon Captive 390.0 

8163 Wild 350.0 

8164Parrot Captive 30.0 

8165 Wild 20.0 

8166>>> df.groupby(level=0).mean() 

8167 Max Speed 

8168Animal 

8169Falcon 370.0 

8170Parrot 25.0 

8171>>> df.groupby(level="Type").mean() 

8172 Max Speed 

8173Type 

8174Captive 210.0 

8175Wild 185.0 

8176 

8177We can also choose to include NA in group keys or not by setting 

8178 the `dropna` parameter; the default setting is `True`. 

8179 

8180>>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]] 

8181>>> df = pd.DataFrame(l, columns=["a", "b", "c"]) 

8182 

8183>>> df.groupby(by=["b"]).sum() 

8184 a c 

8185b 

81861.0 2 3 

81872.0 2 5 

8188 

8189>>> df.groupby(by=["b"], dropna=False).sum() 

8190 a c 

8191b 

81921.0 2 3 

81932.0 2 5 

8194NaN 1 4 

8195 

8196>>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]] 

8197>>> df = pd.DataFrame(l, columns=["a", "b", "c"]) 

8198 

8199>>> df.groupby(by="a").sum() 

8200 b c 

8201a 

8202a 13.0 13.0 

8203b 12.3 123.0 

8204 

8205>>> df.groupby(by="a", dropna=False).sum() 

8206 b c 

8207a 

8208a 13.0 13.0 

8209b 12.3 123.0 

8210NaN 12.3 33.0 

8211 

8212When using ``.apply()``, use ``group_keys`` to include or exclude the group keys. 

8213The ``group_keys`` argument defaults to ``True`` (include). 

8214 

8215>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon', 

8216... 'Parrot', 'Parrot'], 

8217... 'Max Speed': [380., 370., 24., 26.]}) 

8218>>> df.groupby("Animal", group_keys=True).apply(lambda x: x) 

8219 Animal Max Speed 

8220Animal 

8221Falcon 0 Falcon 380.0 

8222 1 Falcon 370.0 

8223Parrot 2 Parrot 24.0 

8224 3 Parrot 26.0 

8225 

8226>>> df.groupby("Animal", group_keys=False).apply(lambda x: x) 

8227 Animal Max Speed 

82280 Falcon 380.0 

82291 Falcon 370.0 

82302 Parrot 24.0 

82313 Parrot 26.0 

8232""" 

8233 ) 

8234 @Appender(_shared_docs["groupby"] % _shared_doc_kwargs) 

8235 def groupby( 

8236 self, 

8237 by=None, 

8238 axis: Axis = 0, 

8239 level: IndexLabel | None = None, 

8240 as_index: bool = True, 

8241 sort: bool = True, 

8242 group_keys: bool = True, 

8243 observed: bool = False, 

8244 dropna: bool = True, 

8245 ) -> DataFrameGroupBy: 

8246 from pandas.core.groupby.generic import DataFrameGroupBy 

8247 

8248 if level is None and by is None: 

8249 raise TypeError("You have to supply one of 'by' and 'level'") 

8250 axis = self._get_axis_number(axis) 

8251 

8252 return DataFrameGroupBy( 

8253 obj=self, 

8254 keys=by, 

8255 axis=axis, 

8256 level=level, 

8257 as_index=as_index, 

8258 sort=sort, 

8259 group_keys=group_keys, 

8260 observed=observed, 

8261 dropna=dropna, 

8262 ) 

8263 

8264 _shared_docs[ 

8265 "pivot" 

8266 ] = """ 

8267 Return reshaped DataFrame organized by given index / column values. 

8268 

8269 Reshape data (produce a "pivot" table) based on column values. Uses 

8270 unique values from specified `index` / `columns` to form axes of the 

8271 resulting DataFrame. This function does not support data 

8272 aggregation; multiple values will result in a MultiIndex in the 

8273 columns. See the :ref:`User Guide <reshaping>` for more on reshaping. 

8274 

8275 Parameters 

8276 ----------%s 

8277 columns : str or object or a list of str 

8278 Column to use to make new frame's columns. 

8279 

8280 .. versionchanged:: 1.1.0 

8281 Also accept list of columns names. 

8282 

8283 index : str or object or a list of str, optional 

8284 Column to use to make new frame's index. If not given, uses existing index. 

8285 

8286 .. versionchanged:: 1.1.0 

8287 Also accept list of index names. 

8288 

8289 values : str, object or a list of the previous, optional 

8290 Column(s) to use for populating new frame's values. If not 

8291 specified, all remaining columns will be used and the result will 

8292 have hierarchically indexed columns. 

8293 

8294 Returns 

8295 ------- 

8296 DataFrame 

8297 Returns reshaped DataFrame. 

8298 

8299 Raises 

8300 ------ 

8301 ValueError: 

8302 When there are any `index`, `columns` combinations with multiple 

8303 values. Use `DataFrame.pivot_table` when you need to aggregate. 

8304 

8305 See Also 

8306 -------- 

8307 DataFrame.pivot_table : Generalization of pivot that can handle 

8308 duplicate values for one index/column pair. 

8309 DataFrame.unstack : Pivot based on the index values instead of a 

8310 column. 

8311 wide_to_long : Wide panel to long format. Less flexible but more 

8312 user-friendly than melt. 

8313 

8314 Notes 

8315 ----- 

8316 For finer-tuned control, see hierarchical indexing documentation along 

8317 with the related stack/unstack methods. 

8318 

8319 Reference :ref:`the user guide <reshaping.pivot>` for more examples. 

8320 

8321 Examples 

8322 -------- 

8323 >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two', 

8324 ... 'two'], 

8325 ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'], 

8326 ... 'baz': [1, 2, 3, 4, 5, 6], 

8327 ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']}) 

8328 >>> df 

8329 foo bar baz zoo 

8330 0 one A 1 x 

8331 1 one B 2 y 

8332 2 one C 3 z 

8333 3 two A 4 q 

8334 4 two B 5 w 

8335 5 two C 6 t 

8336 

8337 >>> df.pivot(index='foo', columns='bar', values='baz') 

8338 bar A B C 

8339 foo 

8340 one 1 2 3 

8341 two 4 5 6 

8342 

8343 >>> df.pivot(index='foo', columns='bar')['baz'] 

8344 bar A B C 

8345 foo 

8346 one 1 2 3 

8347 two 4 5 6 

8348 

8349 >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo']) 

8350 baz zoo 

8351 bar A B C A B C 

8352 foo 

8353 one 1 2 3 x y z 

8354 two 4 5 6 q w t 

8355 

8356 You could also assign a list of column names or a list of index names. 

8357 

8358 >>> df = pd.DataFrame({ 

8359 ... "lev1": [1, 1, 1, 2, 2, 2], 

8360 ... "lev2": [1, 1, 2, 1, 1, 2], 

8361 ... "lev3": [1, 2, 1, 2, 1, 2], 

8362 ... "lev4": [1, 2, 3, 4, 5, 6], 

8363 ... "values": [0, 1, 2, 3, 4, 5]}) 

8364 >>> df 

8365 lev1 lev2 lev3 lev4 values 

8366 0 1 1 1 1 0 

8367 1 1 1 2 2 1 

8368 2 1 2 1 3 2 

8369 3 2 1 2 4 3 

8370 4 2 1 1 5 4 

8371 5 2 2 2 6 5 

8372 

8373 >>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values") 

8374 lev2 1 2 

8375 lev3 1 2 1 2 

8376 lev1 

8377 1 0.0 1.0 2.0 NaN 

8378 2 4.0 3.0 NaN 5.0 

8379 

8380 >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values") 

8381 lev3 1 2 

8382 lev1 lev2 

8383 1 1 0.0 1.0 

8384 2 2.0 NaN 

8385 2 1 4.0 3.0 

8386 2 NaN 5.0 

8387 

8388 A ValueError is raised if there are any duplicates. 

8389 

8390 >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'], 

8391 ... "bar": ['A', 'A', 'B', 'C'], 

8392 ... "baz": [1, 2, 3, 4]}) 

8393 >>> df 

8394 foo bar baz 

8395 0 one A 1 

8396 1 one A 2 

8397 2 two B 3 

8398 3 two C 4 

8399 

8400 Notice that the first two rows are the same for our `index` 

8401 and `columns` arguments. 

8402 

8403 >>> df.pivot(index='foo', columns='bar', values='baz') 

8404 Traceback (most recent call last): 

8405 ... 

8406 ValueError: Index contains duplicate entries, cannot reshape 

8407 """ 

8408 

8409 @Substitution("") 

8410 @Appender(_shared_docs["pivot"]) 

8411 def pivot(self, *, columns, index=lib.NoDefault, values=lib.NoDefault) -> DataFrame: 

8412 from pandas.core.reshape.pivot import pivot 

8413 

8414 return pivot(self, index=index, columns=columns, values=values) 

8415 
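# Sketch (hypothetical data): where ``pivot`` raises on duplicate
# index/column pairs as shown above, ``pivot_table`` can aggregate them
# instead.
import pandas as pd

_df = pd.DataFrame({"foo": ["one", "one"], "bar": ["A", "A"], "baz": [1, 2]})
_df.pivot_table(index="foo", columns="bar", values="baz", aggfunc="mean")
# expected: a single cell holding 1.5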

8416 _shared_docs[ 

8417 "pivot_table" 

8418 ] = """ 

8419 Create a spreadsheet-style pivot table as a DataFrame. 

8420 

8421 The levels in the pivot table will be stored in MultiIndex objects 

8422 (hierarchical indexes) on the index and columns of the result DataFrame. 

8423 

8424 Parameters 

8425 ----------%s 

8426 values : list-like or scalar, optional 

8427 Column or columns to aggregate. 

8428 index : column, Grouper, array, or list of the previous 

8429 If an array is passed, it must be the same length as the data. The 

8430 list can contain any of the other types (except list). 

8431 Keys to group by on the pivot table index. If an array is passed, 

8432 it is used in the same manner as the column values. 

8433 columns : column, Grouper, array, or list of the previous 

8434 If an array is passed, it must be the same length as the data. The 

8435 list can contain any of the other types (except list). 

8436 Keys to group by on the pivot table column. If an array is passed, 

8437 it is used in the same manner as the column values. 

8438 aggfunc : function, list of functions, dict, default "mean" 

8439 If a list of functions is passed, the resulting pivot table will have 

8440 hierarchical columns whose top level are the function names 

8441 (inferred from the function objects themselves). 

8442 If a dict is passed, the key is the column to aggregate and the value 

8443 is a function or list of functions. If ``margins=True``, 

8444 aggfunc will be used to calculate the partial aggregates. 

8445 fill_value : scalar, default None 

8446 Value to replace missing values with (in the resulting pivot table, 

8447 after aggregation). 

8448 margins : bool, default False 

8449 If ``margins=True``, special ``All`` columns and rows 

8450 will be added with partial group aggregates across the categories 

8451 on the rows and columns. 

8452 dropna : bool, default True 

8453 Do not include columns whose entries are all NaN. If True, 

8454 rows with a NaN value in any column will be omitted before 

8455 computing margins. 

8456 margins_name : str, default 'All' 

8457 Name of the row / column that will contain the totals 

8458 when margins is True. 

8459 observed : bool, default False 

8460 This only applies if any of the groupers are Categoricals. 

8461 If True: only show observed values for categorical groupers. 

8462 If False: show all values for categorical groupers. 

8463 

8464 sort : bool, default True 

8465 Specifies if the result should be sorted. 

8466 

8467 .. versionadded:: 1.3.0 

8468 

8469 Returns 

8470 ------- 

8471 DataFrame 

8472 An Excel style pivot table. 

8473 

8474 See Also 

8475 -------- 

8476 DataFrame.pivot : Pivot without aggregation that can handle 

8477 non-numeric data. 

8478 DataFrame.melt: Unpivot a DataFrame from wide to long format, 

8479 optionally leaving identifiers set. 

8480 wide_to_long : Wide panel to long format. Less flexible but more 

8481 user-friendly than melt. 

8482 

8483 Notes 

8484 ----- 

8485 Reference :ref:`the user guide <reshaping.pivot>` for more examples. 

8486 

8487 Examples 

8488 -------- 

8489 >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", 

8490 ... "bar", "bar", "bar", "bar"], 

8491 ... "B": ["one", "one", "one", "two", "two", 

8492 ... "one", "one", "two", "two"], 

8493 ... "C": ["small", "large", "large", "small", 

8494 ... "small", "large", "small", "small", 

8495 ... "large"], 

8496 ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], 

8497 ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]}) 

8498 >>> df 

8499 A B C D E 

8500 0 foo one small 1 2 

8501 1 foo one large 2 4 

8502 2 foo one large 2 5 

8503 3 foo two small 3 5 

8504 4 foo two small 3 6 

8505 5 bar one large 4 6 

8506 6 bar one small 5 8 

8507 7 bar two small 6 9 

8508 8 bar two large 7 9 

8509 

8510 This first example aggregates values by taking the sum. 

8511 

8512 >>> table = pd.pivot_table(df, values='D', index=['A', 'B'], 

8513 ... columns=['C'], aggfunc=np.sum) 

8514 >>> table 

8515 C large small 

8516 A B 

8517 bar one 4.0 5.0 

8518 two 7.0 6.0 

8519 foo one 4.0 1.0 

8520 two NaN 6.0 

8521 

8522 We can also fill missing values using the `fill_value` parameter. 

8523 

8524 >>> table = pd.pivot_table(df, values='D', index=['A', 'B'], 

8525 ... columns=['C'], aggfunc=np.sum, fill_value=0) 

8526 >>> table 

8527 C large small 

8528 A B 

8529 bar one 4 5 

8530 two 7 6 

8531 foo one 4 1 

8532 two 0 6 

8533 

8534 The next example aggregates by taking the mean across multiple columns. 

8535 

8536 >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], 

8537 ... aggfunc={'D': np.mean, 'E': np.mean}) 

8538 >>> table 

8539 D E 

8540 A C 

8541 bar large 5.500000 7.500000 

8542 small 5.500000 8.500000 

8543 foo large 2.000000 4.500000 

8544 small 2.333333 4.333333 

8545 

8546 We can also calculate multiple types of aggregations for any given 

8547 value column. 

8548 

8549 >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'], 

8550 ... aggfunc={'D': np.mean, 

8551 ... 'E': [min, max, np.mean]}) 

8552 >>> table 

8553 D E 

8554 mean max mean min 

8555 A C 

8556 bar large 5.500000 9 7.500000 6 

8557 small 5.500000 9 8.500000 8 

8558 foo large 2.000000 5 4.500000 4 

8559 small 2.333333 6 4.333333 2 

8560 """ 

8561 

8562 @Substitution("") 

8563 @Appender(_shared_docs["pivot_table"]) 

8564 def pivot_table( 

8565 self, 

8566 values=None, 

8567 index=None, 

8568 columns=None, 

8569 aggfunc: AggFuncType = "mean", 

8570 fill_value=None, 

8571 margins: bool = False, 

8572 dropna: bool = True, 

8573 margins_name: Level = "All", 

8574 observed: bool = False, 

8575 sort: bool = True, 

8576 ) -> DataFrame: 

8577 from pandas.core.reshape.pivot import pivot_table 

8578 

8579 return pivot_table( 

8580 self, 

8581 values=values, 

8582 index=index, 

8583 columns=columns, 

8584 aggfunc=aggfunc, 

8585 fill_value=fill_value, 

8586 margins=margins, 

8587 dropna=dropna, 

8588 margins_name=margins_name, 

8589 observed=observed, 

8590 sort=sort, 

8591 ) 

8592 

8593 def stack(self, level: Level = -1, dropna: bool = True): 

8594 """ 

8595 Stack the prescribed level(s) from columns to index. 

8596 

8597 Return a reshaped DataFrame or Series having a multi-level 

8598 index with one or more new inner-most levels compared to the current 

8599 DataFrame. The new inner-most levels are created by pivoting the 

8600 columns of the current dataframe: 

8601 

8602 - if the columns have a single level, the output is a Series; 

8603 - if the columns have multiple levels, the new index 

8604 level(s) is (are) taken from the prescribed level(s) and 

8605 the output is a DataFrame. 

8606 

8607 Parameters 

8608 ---------- 

8609 level : int, str, list, default -1 

8610 Level(s) to stack from the column axis onto the index 

8611 axis, defined as one index or label, or a list of indices 

8612 or labels. 

8613 dropna : bool, default True 

8614 Whether to drop rows in the resulting Frame/Series with 

8615 missing values. Stacking a column level onto the index 

8616 axis can create combinations of index and column values 

8617 that are missing from the original dataframe. See Examples 

8618 section. 

8619 

8620 Returns 

8621 ------- 

8622 DataFrame or Series 

8623 Stacked dataframe or series. 

8624 

8625 See Also 

8626 -------- 

8627 DataFrame.unstack : Unstack prescribed level(s) from index axis 

8628 onto column axis. 

8629 DataFrame.pivot : Reshape dataframe from long format to wide 

8630 format. 

8631 DataFrame.pivot_table : Create a spreadsheet-style pivot table 

8632 as a DataFrame. 

8633 

8634 Notes 

8635 ----- 

8636 The function is named by analogy with a collection of books 

8637 being reorganized from being side by side on a horizontal 

8638 position (the columns of the dataframe) to being stacked 

8639 vertically on top of each other (in the index of the 

8640 dataframe). 

8641 

8642 Reference :ref:`the user guide <reshaping.stacking>` for more examples. 

8643 

8644 Examples 

8645 -------- 

8646 **Single level columns** 

8647 

8648 >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]], 

8649 ... index=['cat', 'dog'], 

8650 ... columns=['weight', 'height']) 

8651 

8652 Stacking a dataframe with a single level column axis returns a Series: 

8653 

8654 >>> df_single_level_cols 

8655 weight height 

8656 cat 0 1 

8657 dog 2 3 

8658 >>> df_single_level_cols.stack() 

8659 cat weight 0 

8660 height 1 

8661 dog weight 2 

8662 height 3 

8663 dtype: int64 

8664 

8665 **Multi level columns: simple case** 

8666 

8667 >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'), 

8668 ... ('weight', 'pounds')]) 

8669 >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]], 

8670 ... index=['cat', 'dog'], 

8671 ... columns=multicol1) 

8672 

8673 Stacking a dataframe with a multi-level column axis: 

8674 

8675 >>> df_multi_level_cols1 

8676 weight 

8677 kg pounds 

8678 cat 1 2 

8679 dog 2 4 

8680 >>> df_multi_level_cols1.stack() 

8681 weight 

8682 cat kg 1 

8683 pounds 2 

8684 dog kg 2 

8685 pounds 4 

8686 

8687 **Missing values** 

8688 

8689 >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'), 

8690 ... ('height', 'm')]) 

8691 >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]], 

8692 ... index=['cat', 'dog'], 

8693 ... columns=multicol2) 

8694 

8695 It is common to have missing values when stacking a dataframe 

8696 with multi-level columns, as the stacked dataframe typically 

8697 has more values than the original dataframe. Missing values 

8698 are filled with NaNs: 

8699 

8700 >>> df_multi_level_cols2 

8701 weight height 

8702 kg m 

8703 cat 1.0 2.0 

8704 dog 3.0 4.0 

8705 >>> df_multi_level_cols2.stack() 

8706 height weight 

8707 cat kg NaN 1.0 

8708 m 2.0 NaN 

8709 dog kg NaN 3.0 

8710 m 4.0 NaN 

8711 

8712 **Prescribing the level(s) to be stacked** 

8713 

8714 The first parameter controls which level or levels are stacked: 

8715 

8716 >>> df_multi_level_cols2.stack(0) 

8717 kg m 

8718 cat height NaN 2.0 

8719 weight 1.0 NaN 

8720 dog height NaN 4.0 

8721 weight 3.0 NaN 

8722 >>> df_multi_level_cols2.stack([0, 1]) 

8723 cat height m 2.0 

8724 weight kg 1.0 

8725 dog height m 4.0 

8726 weight kg 3.0 

8727 dtype: float64 

8728 

8729 **Dropping missing values** 

8730 

8731 >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]], 

8732 ... index=['cat', 'dog'], 

8733 ... columns=multicol2) 

8734 

8735 Note that rows where all values are missing are dropped by 

8736 default but this behaviour can be controlled via the dropna 

8737 keyword parameter: 

8738 

8739 >>> df_multi_level_cols3 

8740 weight height 

8741 kg m 

8742 cat NaN 1.0 

8743 dog 2.0 3.0 

8744 >>> df_multi_level_cols3.stack(dropna=False) 

8745 height weight 

8746 cat kg NaN NaN 

8747 m 1.0 NaN 

8748 dog kg NaN 2.0 

8749 m 3.0 NaN 

8750 >>> df_multi_level_cols3.stack(dropna=True) 

8751 height weight 

8752 cat m 1.0 NaN 

8753 dog kg NaN 2.0 

8754 m 3.0 NaN 

8755 """ 

8756 from pandas.core.reshape.reshape import ( 

8757 stack, 

8758 stack_multiple, 

8759 ) 

8760 

8761 if isinstance(level, (tuple, list)): 

8762 result = stack_multiple(self, level, dropna=dropna) 

8763 else: 

8764 result = stack(self, level, dropna=dropna) 

8765 

8766 return result.__finalize__(self, method="stack") 

8767 

8768 def explode( 

8769 self, 

8770 column: IndexLabel, 

8771 ignore_index: bool = False, 

8772 ) -> DataFrame: 

8773 """ 

8774 Transform each element of a list-like to a row, replicating index values. 

8775 

8776 Parameters 

8777 ---------- 

8778 column : IndexLabel 

8779 Column(s) to explode. 

8780 For multiple columns, specify a non-empty list in which each element 

8781 is a str or tuple, and the list-like data of all specified columns 

8782 must have matching lengths on the same row of the frame. 

8783 

8784 .. versionadded:: 1.3.0 

8785 Multi-column explode 

8786 

8787 ignore_index : bool, default False 

8788 If True, the resulting index will be labeled 0, 1, …, n - 1. 

8789 

8790 .. versionadded:: 1.1.0 

8791 

8792 Returns 

8793 ------- 

8794 DataFrame 

8795 Exploded lists to rows of the subset columns; 

8796 index will be duplicated for these rows. 

8797 

8798 Raises 

8799 ------ 

8800 ValueError : 

8801 * If columns of the frame are not unique. 

8802 * If the specified columns to explode form an empty list.

8803 * If the specified columns to explode do not have matching

8804 element counts row-wise in the frame.

8805 

8806 See Also 

8807 -------- 

8808 DataFrame.unstack : Pivot a level of the (necessarily hierarchical) 

8809 index labels. 

8810 DataFrame.melt : Unpivot a DataFrame from wide format to long format. 

8811 Series.explode : Explode a Series from list-like entries to long format.

8812 

8813 Notes 

8814 ----- 

8815 This routine will explode list-likes including lists, tuples, sets, 

8816 Series, and np.ndarray. The result dtype of the subset rows will 

8817 be object. Scalars will be returned unchanged, and empty list-likes will 

8818 result in a np.nan for that row. In addition, the ordering of rows in the 

8819 output will be non-deterministic when exploding sets. 

8820 

8821 Reference :ref:`the user guide <reshaping.explode>` for more examples. 

8822 

8823 Examples 

8824 -------- 

8825 >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]], 

8826 ... 'B': 1, 

8827 ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]}) 

8828 >>> df 

8829 A B C 

8830 0 [0, 1, 2] 1 [a, b, c] 

8831 1 foo 1 NaN 

8832 2 [] 1 [] 

8833 3 [3, 4] 1 [d, e] 

8834 

8835 Single-column explode. 

8836 

8837 >>> df.explode('A') 

8838 A B C 

8839 0 0 1 [a, b, c] 

8840 0 1 1 [a, b, c] 

8841 0 2 1 [a, b, c] 

8842 1 foo 1 NaN 

8843 2 NaN 1 [] 

8844 3 3 1 [d, e] 

8845 3 4 1 [d, e] 

8846 

8847 Multi-column explode. 

8848 

8849 >>> df.explode(list('AC')) 

8850 A B C 

8851 0 0 1 a 

8852 0 1 1 b 

8853 0 2 1 c 

8854 1 foo 1 NaN 

8855 2 NaN 1 NaN 

8856 3 3 1 d 

8857 3 4 1 e 

8858 """ 

8859 if not self.columns.is_unique: 

8860 duplicate_cols = self.columns[self.columns.duplicated()].tolist() 

8861 raise ValueError( 

8862 f"DataFrame columns must be unique. Duplicate columns: {duplicate_cols}" 

8863 ) 

8864 

8865 columns: list[Hashable] 

8866 if is_scalar(column) or isinstance(column, tuple): 

8867 columns = [column] 

8868 elif isinstance(column, list) and all( 

8869 is_scalar(c) or isinstance(c, tuple) for c in column 

8870 ): 

8871 if not column: 

8872 raise ValueError("column must be nonempty") 

8873 if len(column) > len(set(column)): 

8874 raise ValueError("column must be unique") 

8875 columns = column 

8876 else: 

8877 raise ValueError("column must be a scalar, tuple, or list thereof") 

8878 

8879 df = self.reset_index(drop=True) 

8880 if len(columns) == 1: 

8881 result = df[columns[0]].explode() 

8882 else: 

8883 mylen = lambda x: len(x) if (is_list_like(x) and len(x) > 0) else 1 

8884 counts0 = self[columns[0]].apply(mylen) 

8885 for c in columns[1:]: 

8886 if not all(counts0 == self[c].apply(mylen)): 

8887 raise ValueError("columns must have matching element counts") 

8888 result = DataFrame({c: df[c].explode() for c in columns}) 

8889 result = df.drop(columns, axis=1).join(result) 

8890 if ignore_index: 

8891 result.index = default_index(len(result)) 

8892 else: 

8893 result.index = self.index.take(result.index) 

8894 result = result.reindex(columns=self.columns, copy=False) 

8895 

8896 return result.__finalize__(self, method="explode") 

8897 
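# Illustrative sketch (editorial, not part of frame.py): multi-column explode
# requires matching element counts per row; scalars and empty list-likes are
# treated as length 1 by the mylen check above.
import pandas as pd

ok = pd.DataFrame({"A": [[1, 2], [3]], "B": [["x", "y"], ["z"]]})
print(ok.explode(["A", "B"]))            # elements pair up row-wise

bad = pd.DataFrame({"A": [[1, 2]], "B": [["x"]]})
try:
    bad.explode(["A", "B"])
except ValueError as err:
    print(err)                           # columns must have matching element counts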

8898 def unstack(self, level: Level = -1, fill_value=None): 

8899 """ 

8900 Pivot a level of the (necessarily hierarchical) index labels. 

8901 

8902 Returns a DataFrame having a new level of column labels whose inner-most level 

8903 consists of the pivoted index labels. 

8904 

8905 If the index is not a MultiIndex, the output will be a Series 

8906 (the analogue of stack when the columns are not a MultiIndex). 

8907 

8908 Parameters 

8909 ---------- 

8910 level : int, str, or list of these, default -1 (last level) 

8911 Level(s) of index to unstack, can pass level name. 

8912 fill_value : int, str or dict 

8913 Replace NaN with this value if the unstack produces missing values. 

8914 

8915 Returns 

8916 ------- 

8917 Series or DataFrame 

8918 

8919 See Also 

8920 -------- 

8921 DataFrame.pivot : Pivot a table based on column values. 

8922 DataFrame.stack : Pivot a level of the column labels (inverse operation 

8923 from `unstack`). 

8924 

8925 Notes 

8926 ----- 

8927 Reference :ref:`the user guide <reshaping.stacking>` for more examples. 

8928 

8929 Examples 

8930 -------- 

8931 >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'), 

8932 ... ('two', 'a'), ('two', 'b')]) 

8933 >>> s = pd.Series(np.arange(1.0, 5.0), index=index) 

8934 >>> s 

8935 one a 1.0 

8936 b 2.0 

8937 two a 3.0 

8938 b 4.0 

8939 dtype: float64 

8940 

8941 >>> s.unstack(level=-1) 

8942 a b 

8943 one 1.0 2.0 

8944 two 3.0 4.0 

8945 

8946 >>> s.unstack(level=0) 

8947 one two 

8948 a 1.0 3.0 

8949 b 2.0 4.0 

8950 

8951 >>> df = s.unstack(level=0) 

8952 >>> df.unstack() 

8953 one a 1.0 

8954 b 2.0 

8955 two a 3.0 

8956 b 4.0 

8957 dtype: float64 

8958 """ 

8959 from pandas.core.reshape.reshape import unstack 

8960 

8961 result = unstack(self, level, fill_value) 

8962 

8963 return result.__finalize__(self, method="unstack") 

8964 
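# Illustrative sketch (editorial, not part of frame.py): unstack inverts
# stack, so a stack/unstack round trip recovers the original frame.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.arange(4.0).reshape(2, 2),
                  index=["one", "two"], columns=["a", "b"])
assert df.stack().unstack().equals(df)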

8965 @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"}) 

8966 def melt( 

8967 self, 

8968 id_vars=None, 

8969 value_vars=None, 

8970 var_name=None, 

8971 value_name: Hashable = "value", 

8972 col_level: Level = None, 

8973 ignore_index: bool = True, 

8974 ) -> DataFrame: 

8975 return melt( 

8976 self, 

8977 id_vars=id_vars, 

8978 value_vars=value_vars, 

8979 var_name=var_name, 

8980 value_name=value_name, 

8981 col_level=col_level, 

8982 ignore_index=ignore_index, 

8983 ).__finalize__(self, method="melt") 

8984 

8985 # ---------------------------------------------------------------------- 

8986 # Time series-related 

8987 

8988 @doc( 

8989 Series.diff, 

8990 klass="DataFrame", 

8991 extra_params="axis : {0 or 'index', 1 or 'columns'}, default 0\n " 

8992 "Take difference over rows (0) or columns (1).\n", 

8993 other_klass="Series", 

8994 examples=dedent( 

8995 """ 

8996 Difference with previous row 

8997 

8998 >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6], 

8999 ... 'b': [1, 1, 2, 3, 5, 8], 

9000 ... 'c': [1, 4, 9, 16, 25, 36]}) 

9001 >>> df 

9002 a b c 

9003 0 1 1 1 

9004 1 2 1 4 

9005 2 3 2 9 

9006 3 4 3 16 

9007 4 5 5 25 

9008 5 6 8 36 

9009 

9010 >>> df.diff() 

9011 a b c 

9012 0 NaN NaN NaN 

9013 1 1.0 0.0 3.0 

9014 2 1.0 1.0 5.0 

9015 3 1.0 1.0 7.0 

9016 4 1.0 2.0 9.0 

9017 5 1.0 3.0 11.0 

9018 

9019 Difference with previous column 

9020 

9021 >>> df.diff(axis=1) 

9022 a b c 

9023 0 NaN 0 0 

9024 1 NaN -1 3 

9025 2 NaN -1 7 

9026 3 NaN -1 13 

9027 4 NaN 0 20 

9028 5 NaN 2 28 

9029 

9030 Difference with 3rd previous row 

9031 

9032 >>> df.diff(periods=3) 

9033 a b c 

9034 0 NaN NaN NaN 

9035 1 NaN NaN NaN 

9036 2 NaN NaN NaN 

9037 3 3.0 2.0 15.0 

9038 4 3.0 4.0 21.0 

9039 5 3.0 6.0 27.0 

9040 

9041 Difference with following row 

9042 

9043 >>> df.diff(periods=-1) 

9044 a b c 

9045 0 -1.0 0.0 -3.0 

9046 1 -1.0 -1.0 -5.0 

9047 2 -1.0 -1.0 -7.0 

9048 3 -1.0 -2.0 -9.0 

9049 4 -1.0 -3.0 -11.0 

9050 5 NaN NaN NaN 

9051 

9052 Overflow in input dtype 

9053 

9054 >>> df = pd.DataFrame({'a': [1, 0]}, dtype=np.uint8) 

9055 >>> df.diff() 

9056 a 

9057 0 NaN 

9058 1 255.0""" 

9059 ), 

9060 ) 

9061 def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame: 

9062 if not lib.is_integer(periods): 

9063 if not ( 

9064 is_float(periods) 

9065 # error: "int" has no attribute "is_integer" 

9066 and periods.is_integer() # type: ignore[attr-defined] 

9067 ): 

9068 raise ValueError("periods must be an integer") 

9069 periods = int(periods) 

9070 

9071 axis = self._get_axis_number(axis) 

9072 if axis == 1: 

9073 if periods != 0: 

9074 # in the periods == 0 case, this is equivalent to a diff of 0 periods

9075 # along axis=0, and the Manager method may be somewhat more 

9076 # performant, so we dispatch in that case. 

9077 return self - self.shift(periods, axis=axis) 

9078 # With periods=0 this is equivalent to a diff with axis=0 

9079 axis = 0 

9080 

9081 new_data = self._mgr.diff(n=periods, axis=axis) 

9082 return self._constructor(new_data).__finalize__(self, "diff") 

9083 
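# Illustrative sketch (editorial, not part of frame.py): for axis=1 and a
# nonzero period, the branch above reduces diff to subtraction of a shifted
# copy of the frame.
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [4, 8], "c": [9, 27]})
assert df.diff(axis=1).equals(df - df.shift(1, axis=1))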

9084 # ---------------------------------------------------------------------- 

9085 # Function application 

9086 

9087 def _gotitem( 

9088 self, 

9089 key: IndexLabel, 

9090 ndim: int, 

9091 subset: DataFrame | Series | None = None, 

9092 ) -> DataFrame | Series: 

9093 """ 

9094 Sub-classes to define. Return a sliced object. 

9095 

9096 Parameters 

9097 ---------- 

9098 key : string / list of selections 

9099 ndim : {1, 2} 

9100 requested ndim of result 

9101 subset : object, default None 

9102 subset to act on 

9103 """ 

9104 if subset is None: 

9105 subset = self 

9106 elif subset.ndim == 1: # is Series 

9107 return subset 

9108 

9109 # TODO: _shallow_copy(subset)? 

9110 return subset[key] 

9111 

9112 _agg_summary_and_see_also_doc = dedent( 

9113 """ 

9114 The aggregation operations are always performed over an axis, either the 

9115 index (default) or the column axis. This behavior is different from 

9116 `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`, 

9117 `var`), where the default is to compute the aggregation of the flattened 

9118 array, e.g., ``numpy.mean(arr_2d)`` as opposed to 

9119 ``numpy.mean(arr_2d, axis=0)``. 

9120 

9121 `agg` is an alias for `aggregate`. Use the alias. 

9122 

9123 See Also 

9124 -------- 

9125 DataFrame.apply : Perform any type of operations. 

9126 DataFrame.transform : Perform transformation type operations. 

9127 core.groupby.GroupBy : Perform operations over groups. 

9128 core.resample.Resampler : Perform operations over resampled bins. 

9129 core.window.Rolling : Perform operations over rolling window. 

9130 core.window.Expanding : Perform operations over expanding window. 

9131 core.window.ExponentialMovingWindow : Perform operation over exponential weighted 

9132 window. 

9133 """ 

9134 ) 

9135 

9136 _agg_examples_doc = dedent( 

9137 """ 

9138 Examples 

9139 -------- 

9140 >>> df = pd.DataFrame([[1, 2, 3], 

9141 ... [4, 5, 6], 

9142 ... [7, 8, 9], 

9143 ... [np.nan, np.nan, np.nan]], 

9144 ... columns=['A', 'B', 'C']) 

9145 

9146 Aggregate these functions over the rows. 

9147 

9148 >>> df.agg(['sum', 'min']) 

9149 A B C 

9150 sum 12.0 15.0 18.0 

9151 min 1.0 2.0 3.0 

9152 

9153 Different aggregations per column. 

9154 

9155 >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']}) 

9156 A B 

9157 sum 12.0 NaN 

9158 min 1.0 2.0 

9159 max NaN 8.0 

9160 

9161 Aggregate different functions over the columns and rename the index of the resulting 

9162 DataFrame. 

9163 

9164 >>> df.agg(x=('A', max), y=('B', 'min'), z=('C', np.mean)) 

9165 A B C 

9166 x 7.0 NaN NaN 

9167 y NaN 2.0 NaN 

9168 z NaN NaN 6.0 

9169 

9170 Aggregate over the columns. 

9171 

9172 >>> df.agg("mean", axis="columns") 

9173 0 2.0 

9174 1 5.0 

9175 2 8.0 

9176 3 NaN 

9177 dtype: float64 

9178 """ 

9179 ) 

9180 

9181 @doc( 

9182 _shared_docs["aggregate"], 

9183 klass=_shared_doc_kwargs["klass"], 

9184 axis=_shared_doc_kwargs["axis"], 

9185 see_also=_agg_summary_and_see_also_doc, 

9186 examples=_agg_examples_doc, 

9187 ) 

9188 def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs): 

9189 from pandas.core.apply import frame_apply 

9190 

9191 axis = self._get_axis_number(axis) 

9192 

9193 relabeling, func, columns, order = reconstruct_func(func, **kwargs) 

9194 

9195 op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs) 

9196 result = op.agg() 

9197 

9198 if relabeling: 

9199 # This is to keep the order to columns occurrence unchanged, and also 

9200 # keep the order of new columns occurrence unchanged 

9201 

9202 # For the return values of reconstruct_func, if relabeling is 

9203 # False, columns and order will be None. 

9204 assert columns is not None 

9205 assert order is not None 

9206 

9207 result_in_dict = relabel_result(result, func, columns, order) 

9208 result = DataFrame(result_in_dict, index=columns) 

9209 

9210 return result 

9211 

9212 agg = aggregate 

9213 
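# Illustrative sketch (editorial, not part of frame.py): the relabeling
# branch above implements named aggregation, where keyword names become the
# index of the result.
import pandas as pd

df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
print(df.agg(lo=("A", "min"), hi=("B", "max")))
# index ['lo', 'hi']; each row holds one aggregated value under its column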

9214 # error: Signature of "any" incompatible with supertype "NDFrame" [override] 

9215 @overload # type: ignore[override] 

9216 def any( 

9217 self, 

9218 *, 

9219 axis: Axis = ..., 

9220 bool_only: bool | None = ..., 

9221 skipna: bool = ..., 

9222 level: None = ..., 

9223 **kwargs, 

9224 ) -> Series: 

9225 ... 

9226 

9227 @overload 

9228 def any( 

9229 self, 

9230 *, 

9231 axis: Axis = ..., 

9232 bool_only: bool | None = ..., 

9233 skipna: bool = ..., 

9234 level: Level, 

9235 **kwargs, 

9236 ) -> DataFrame | Series: 

9237 ... 

9238 

9239 # error: Missing return statement 

9240 @doc(NDFrame.any, **_shared_doc_kwargs) 

9241 def any( # type: ignore[empty-body] 

9242 self, 

9243 axis: Axis = 0, 

9244 bool_only: bool | None = None, 

9245 skipna: bool = True, 

9246 level: Level = None, 

9247 **kwargs, 

9248 ) -> DataFrame | Series: 

9249 ... 

9250 

9251 @doc( 

9252 _shared_docs["transform"], 

9253 klass=_shared_doc_kwargs["klass"], 

9254 axis=_shared_doc_kwargs["axis"], 

9255 ) 

9256 def transform( 

9257 self, func: AggFuncType, axis: Axis = 0, *args, **kwargs 

9258 ) -> DataFrame: 

9259 from pandas.core.apply import frame_apply 

9260 

9261 op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs) 

9262 result = op.transform() 

9263 assert isinstance(result, DataFrame) 

9264 return result 

9265 
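# Illustrative sketch (editorial, not part of frame.py): transform must
# produce output with the same length as the input axis, unlike agg, which
# may reduce (a reducer such as 'sum' raises here).
import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3]})
print(df.transform(lambda s: s * 2))     # same shape as df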

9266 def apply( 

9267 self, 

9268 func: AggFuncType, 

9269 axis: Axis = 0, 

9270 raw: bool = False, 

9271 result_type: Literal["expand", "reduce", "broadcast"] | None = None, 

9272 args=(), 

9273 **kwargs, 

9274 ): 

9275 """ 

9276 Apply a function along an axis of the DataFrame. 

9277 

9278 Objects passed to the function are Series objects whose index is 

9279 either the DataFrame's index (``axis=0``) or the DataFrame's columns 

9280 (``axis=1``). By default (``result_type=None``), the final return type 

9281 is inferred from the return type of the applied function. Otherwise, 

9282 it depends on the `result_type` argument. 

9283 

9284 Parameters 

9285 ---------- 

9286 func : function 

9287 Function to apply to each column or row. 

9288 axis : {0 or 'index', 1 or 'columns'}, default 0 

9289 Axis along which the function is applied: 

9290 

9291 * 0 or 'index': apply function to each column. 

9292 * 1 or 'columns': apply function to each row. 

9293 

9294 raw : bool, default False 

9295 Determines if row or column is passed as a Series or ndarray object: 

9296 

9297 * ``False`` : passes each row or column as a Series to the 

9298 function. 

9299 * ``True`` : the passed function will receive ndarray objects 

9300 instead. 

9301 If you are just applying a NumPy reduction function this will 

9302 achieve much better performance. 

9303 

9304 result_type : {'expand', 'reduce', 'broadcast', None}, default None 

9305 These only act when ``axis=1`` (columns): 

9306 

9307 * 'expand' : list-like results will be turned into columns. 

9308 * 'reduce' : returns a Series if possible rather than expanding 

9309 list-like results. This is the opposite of 'expand'. 

9310 * 'broadcast' : results will be broadcast to the original shape 

9311 of the DataFrame, the original index and columns will be 

9312 retained. 

9313 

9314 The default behaviour (None) depends on the return value of the 

9315 applied function: list-like results will be returned as a Series 

9316 of those. However, if the applied function returns a Series these

9317 are expanded to columns. 

9318 args : tuple 

9319 Positional arguments to pass to `func` in addition to the 

9320 array/series. 

9321 **kwargs 

9322 Additional keyword arguments to pass as keyword arguments to

9323 `func`. 

9324 

9325 Returns 

9326 ------- 

9327 Series or DataFrame 

9328 Result of applying ``func`` along the given axis of the 

9329 DataFrame. 

9330 

9331 See Also 

9332 -------- 

9333 DataFrame.applymap: For elementwise operations. 

9334 DataFrame.aggregate: Only perform aggregating type operations. 

9335 DataFrame.transform: Only perform transforming type operations. 

9336 

9337 Notes 

9338 ----- 

9339 Functions that mutate the passed object can produce unexpected 

9340 behavior or errors and are not supported. See :ref:`gotchas.udf-mutation` 

9341 for more details. 

9342 

9343 Examples 

9344 -------- 

9345 >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B']) 

9346 >>> df 

9347 A B 

9348 0 4 9 

9349 1 4 9 

9350 2 4 9 

9351 

9352 Using a numpy universal function (in this case the same as 

9353 ``np.sqrt(df)``): 

9354 

9355 >>> df.apply(np.sqrt) 

9356 A B 

9357 0 2.0 3.0 

9358 1 2.0 3.0 

9359 2 2.0 3.0 

9360 

9361 Using a reducing function on either axis 

9362 

9363 >>> df.apply(np.sum, axis=0) 

9364 A 12 

9365 B 27 

9366 dtype: int64 

9367 

9368 >>> df.apply(np.sum, axis=1) 

9369 0 13 

9370 1 13 

9371 2 13 

9372 dtype: int64 

9373 

9374 Returning a list-like will result in a Series 

9375 

9376 >>> df.apply(lambda x: [1, 2], axis=1) 

9377 0 [1, 2] 

9378 1 [1, 2] 

9379 2 [1, 2] 

9380 dtype: object 

9381 

9382 Passing ``result_type='expand'`` will expand list-like results 

9383 to columns of a Dataframe 

9384 

9385 >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand') 

9386 0 1 

9387 0 1 2 

9388 1 1 2 

9389 2 1 2 

9390 

9391 Returning a Series inside the function is similar to passing 

9392 ``result_type='expand'``. The resulting column names 

9393 will be the Series index. 

9394 

9395 >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1) 

9396 foo bar 

9397 0 1 2 

9398 1 1 2 

9399 2 1 2 

9400 

9401 Passing ``result_type='broadcast'`` will ensure the same shape 

9402 result, whether list-like or scalar is returned by the function, 

9403 and broadcast it along the axis. The resulting column names will 

9404 be the originals. 

9405 

9406 >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast') 

9407 A B 

9408 0 1 2 

9409 1 1 2 

9410 2 1 2 

9411 """ 

9412 from pandas.core.apply import frame_apply 

9413 

9414 op = frame_apply( 

9415 self, 

9416 func=func, 

9417 axis=axis, 

9418 raw=raw, 

9419 result_type=result_type, 

9420 args=args, 

9421 kwargs=kwargs, 

9422 ) 

9423 return op.apply().__finalize__(self, method="apply") 

9424 
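# Illustrative sketch (editorial, not part of frame.py): raw=True hands bare
# ndarrays to func, skipping Series construction, which can be much faster
# for plain NumPy reductions while giving the same result.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.ones((3, 2)), columns=["A", "B"])
assert df.apply(np.sum, raw=True).equals(df.apply(np.sum))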

9425 def applymap( 

9426 self, func: PythonFuncType, na_action: str | None = None, **kwargs 

9427 ) -> DataFrame: 

9428 """ 

9429 Apply a function to a Dataframe elementwise. 

9430 

9431 This method applies a function that accepts and returns a scalar 

9432 to every element of a DataFrame. 

9433 

9434 Parameters 

9435 ---------- 

9436 func : callable 

9437 Python function, returns a single value from a single value. 

9438 na_action : {None, 'ignore'}, default None 

9439 If 'ignore', propagate NaN values, without passing them to func.

9440 

9441 .. versionadded:: 1.2 

9442 

9443 **kwargs 

9444 Additional keyword arguments to pass as keyword arguments to

9445 `func`. 

9446 

9447 .. versionadded:: 1.3.0 

9448 

9449 Returns 

9450 ------- 

9451 DataFrame 

9452 Transformed DataFrame. 

9453 

9454 See Also 

9455 -------- 

9456 DataFrame.apply : Apply a function along input axis of DataFrame. 

9457 

9458 Examples 

9459 -------- 

9460 >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]]) 

9461 >>> df 

9462 0 1 

9463 0 1.000 2.120 

9464 1 3.356 4.567 

9465 

9466 >>> df.applymap(lambda x: len(str(x))) 

9467 0 1 

9468 0 3 4 

9469 1 5 5 

9470 

9471 Like Series.map, NA values can be ignored: 

9472 

9473 >>> df_copy = df.copy() 

9474 >>> df_copy.iloc[0, 0] = pd.NA 

9475 >>> df_copy.applymap(lambda x: len(str(x)), na_action='ignore') 

9476 0 1 

9477 0 NaN 4 

9478 1 5.0 5 

9479 

9480 Note that a vectorized version of `func` often exists, which will 

9481 be much faster. You could square each number elementwise. 

9482 

9483 >>> df.applymap(lambda x: x**2) 

9484 0 1 

9485 0 1.000000 4.494400 

9486 1 11.262736 20.857489 

9487 

9488 But it's better to avoid applymap in that case. 

9489 

9490 >>> df ** 2 

9491 0 1 

9492 0 1.000000 4.494400 

9493 1 11.262736 20.857489 

9494 """ 

9495 if na_action not in {"ignore", None}: 

9496 raise ValueError( 

9497 f"na_action must be 'ignore' or None. Got {repr(na_action)}" 

9498 ) 

9499 ignore_na = na_action == "ignore" 

9500 func = functools.partial(func, **kwargs) 

9501 

9502 # if we have a dtype == 'M8[ns]', provide boxed values 

9503 def infer(x): 

9504 if x.empty: 

9505 return lib.map_infer(x, func, ignore_na=ignore_na) 

9506 return lib.map_infer(x.astype(object)._values, func, ignore_na=ignore_na) 

9507 

9508 return self.apply(infer).__finalize__(self, "applymap") 

9509 
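# Illustrative sketch (editorial, not part of frame.py): extra keyword
# arguments are bound to func via functools.partial before mapping, so any
# scalar function with keyword parameters works.
import pandas as pd

df = pd.DataFrame([[1.25, 2.75]])
print(df.applymap(round, ndigits=1))     # kwargs forwarded to round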

9510 # ---------------------------------------------------------------------- 

9511 # Merging / joining methods 

9512 

9513 def _append( 

9514 self, 

9515 other, 

9516 ignore_index: bool = False, 

9517 verify_integrity: bool = False, 

9518 sort: bool = False, 

9519 ) -> DataFrame: 

9520 if isinstance(other, (Series, dict)): 

9521 if isinstance(other, dict): 

9522 if not ignore_index: 

9523 raise TypeError("Can only append a dict if ignore_index=True") 

9524 other = Series(other) 

9525 if other.name is None and not ignore_index: 

9526 raise TypeError( 

9527 "Can only append a Series if ignore_index=True " 

9528 "or if the Series has a name" 

9529 ) 

9530 

9531 index = Index( 

9532 [other.name], 

9533 name=self.index.names 

9534 if isinstance(self.index, MultiIndex) 

9535 else self.index.name, 

9536 ) 

9537 row_df = other.to_frame().T 

9538 # infer_objects is needed for 

9539 # test_append_empty_frame_to_series_with_dateutil_tz 

9540 other = row_df.infer_objects(copy=False).rename_axis( 

9541 index.names, copy=False 

9542 ) 

9543 elif isinstance(other, list): 

9544 if not other: 

9545 pass 

9546 elif not isinstance(other[0], DataFrame): 

9547 other = DataFrame(other) 

9548 if self.index.name is not None and not ignore_index: 

9549 other.index.name = self.index.name 

9550 

9551 from pandas.core.reshape.concat import concat 

9552 

9553 if isinstance(other, (list, tuple)): 

9554 to_concat = [self, *other] 

9555 else: 

9556 to_concat = [self, other] 

9557 

9558 result = concat( 

9559 to_concat, 

9560 ignore_index=ignore_index, 

9561 verify_integrity=verify_integrity, 

9562 sort=sort, 

9563 ) 

9564 return result.__finalize__(self, method="append") 

9565 
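# Illustrative sketch (editorial, not part of frame.py): _append is private
# (the public DataFrame.append was deprecated and removed); the supported
# route is pd.concat, which the implementation above ultimately calls.
import pandas as pd

df = pd.DataFrame({"A": [1]})
row = pd.Series({"A": 2}, name=1)
print(pd.concat([df, row.to_frame().T]))  # the named Series becomes row 1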

9566 def join( 

9567 self, 

9568 other: DataFrame | Series | Iterable[DataFrame | Series], 

9569 on: IndexLabel | None = None, 

9570 how: MergeHow = "left", 

9571 lsuffix: str = "", 

9572 rsuffix: str = "", 

9573 sort: bool = False, 

9574 validate: str | None = None, 

9575 ) -> DataFrame: 

9576 """ 

9577 Join columns of another DataFrame. 

9578 

9579 Join columns with `other` DataFrame either on index or on a key 

9580 column. Efficiently join multiple DataFrame objects by index at once by 

9581 passing a list. 

9582 

9583 Parameters 

9584 ---------- 

9585 other : DataFrame, Series, or a list containing any combination of them 

9586 Index should be similar to one of the columns in this one. If a 

9587 Series is passed, its name attribute must be set, and that will be 

9588 used as the column name in the resulting joined DataFrame. 

9589 on : str, list of str, or array-like, optional 

9590 Column or index level name(s) in the caller to join on the index 

9591 in `other`, otherwise joins index-on-index. If multiple 

9592 values given, the `other` DataFrame must have a MultiIndex. Can 

9593 pass an array as the join key if it is not already contained in 

9594 the calling DataFrame. Like an Excel VLOOKUP operation. 

9595 how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left' 

9596 How to handle the operation of the two objects. 

9597 

9598 * left: use calling frame's index (or column if on is specified) 

9599 * right: use `other`'s index. 

9600 * outer: form union of calling frame's index (or column if on is 

9601 specified) with `other`'s index, and sort it

9602 lexicographically.

9603 * inner: form intersection of calling frame's index (or column if 

9604 on is specified) with `other`'s index, preserving the order 

9605 of the calling's one. 

9606 * cross: creates the cartesian product from both frames, preserves the order 

9607 of the left keys. 

9608 

9609 .. versionadded:: 1.2.0 

9610 

9611 lsuffix : str, default '' 

9612 Suffix to use from left frame's overlapping columns. 

9613 rsuffix : str, default '' 

9614 Suffix to use from right frame's overlapping columns. 

9615 sort : bool, default False 

9616 Order result DataFrame lexicographically by the join key. If False, 

9617 the order of the join key depends on the join type (how keyword). 

9618 validate : str, optional 

9619 If specified, checks if join is of specified type. 

9620 * "one_to_one" or "1:1": check if join keys are unique in both left 

9621 and right datasets. 

9622 * "one_to_many" or "1:m": check if join keys are unique in left dataset. 

9623 * "many_to_one" or "m:1": check if join keys are unique in right dataset. 

9624 * "many_to_many" or "m:m": allowed, but does not result in checks. 

9625 .. versionadded:: 1.5.0 

9626 

9627 Returns 

9628 ------- 

9629 DataFrame 

9630 A dataframe containing columns from both the caller and `other`. 

9631 

9632 See Also 

9633 -------- 

9634 DataFrame.merge : For column(s)-on-column(s) operations. 

9635 

9636 Notes 

9637 ----- 

9638 Parameters `on`, `lsuffix`, and `rsuffix` are not supported when 

9639 passing a list of `DataFrame` objects. 

9640 

9641 Support for specifying index levels as the `on` parameter was added 

9642 in version 0.23.0. 

9643 

9644 Examples 

9645 -------- 

9646 >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'], 

9647 ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) 

9648 

9649 >>> df 

9650 key A 

9651 0 K0 A0 

9652 1 K1 A1 

9653 2 K2 A2 

9654 3 K3 A3 

9655 4 K4 A4 

9656 5 K5 A5 

9657 

9658 >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'], 

9659 ... 'B': ['B0', 'B1', 'B2']}) 

9660 

9661 >>> other 

9662 key B 

9663 0 K0 B0 

9664 1 K1 B1 

9665 2 K2 B2 

9666 

9667 Join DataFrames using their indexes. 

9668 

9669 >>> df.join(other, lsuffix='_caller', rsuffix='_other') 

9670 key_caller A key_other B 

9671 0 K0 A0 K0 B0 

9672 1 K1 A1 K1 B1 

9673 2 K2 A2 K2 B2 

9674 3 K3 A3 NaN NaN 

9675 4 K4 A4 NaN NaN 

9676 5 K5 A5 NaN NaN 

9677 

9678 If we want to join using the key columns, we need to set key to be 

9679 the index in both `df` and `other`. The joined DataFrame will have 

9680 key as its index. 

9681 

9682 >>> df.set_index('key').join(other.set_index('key')) 

9683 A B 

9684 key 

9685 K0 A0 B0 

9686 K1 A1 B1 

9687 K2 A2 B2 

9688 K3 A3 NaN 

9689 K4 A4 NaN 

9690 K5 A5 NaN 

9691 

9692 Another option to join using the key columns is to use the `on` 

9693 parameter. DataFrame.join always uses `other`'s index but we can use 

9694 any column in `df`. This method preserves the original DataFrame's 

9695 index in the result. 

9696 

9697 >>> df.join(other.set_index('key'), on='key') 

9698 key A B 

9699 0 K0 A0 B0 

9700 1 K1 A1 B1 

9701 2 K2 A2 B2 

9702 3 K3 A3 NaN 

9703 4 K4 A4 NaN 

9704 5 K5 A5 NaN 

9705 

9706 Using non-unique key values shows how they are matched. 

9707 

9708 >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'], 

9709 ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']}) 

9710 

9711 >>> df 

9712 key A 

9713 0 K0 A0 

9714 1 K1 A1 

9715 2 K1 A2 

9716 3 K3 A3 

9717 4 K0 A4 

9718 5 K1 A5 

9719 

9720 >>> df.join(other.set_index('key'), on='key', validate='m:1') 

9721 key A B 

9722 0 K0 A0 B0 

9723 1 K1 A1 B1 

9724 2 K1 A2 B1 

9725 3 K3 A3 NaN 

9726 4 K0 A4 B0 

9727 5 K1 A5 B1 

9728 """ 

9729 return self._join_compat( 

9730 other, 

9731 on=on, 

9732 how=how, 

9733 lsuffix=lsuffix, 

9734 rsuffix=rsuffix, 

9735 sort=sort, 

9736 validate=validate, 

9737 ) 

9738 
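# Illustrative sketch (editorial, not part of frame.py): passing a list joins
# several frames on their indexes in one call; on/lsuffix/rsuffix are not
# supported in that form, as _join_compat below enforces.
import pandas as pd

a = pd.DataFrame({"A": [1, 2]})
b = pd.DataFrame({"B": [3, 4]})
c = pd.DataFrame({"C": [5, 6]})
print(a.join([b, c]))                    # columns A, B, C aligned by index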

9739 def _join_compat( 

9740 self, 

9741 other: DataFrame | Series | Iterable[DataFrame | Series], 

9742 on: IndexLabel | None = None, 

9743 how: MergeHow = "left", 

9744 lsuffix: str = "", 

9745 rsuffix: str = "", 

9746 sort: bool = False, 

9747 validate: str | None = None, 

9748 ): 

9749 from pandas.core.reshape.concat import concat 

9750 from pandas.core.reshape.merge import merge 

9751 

9752 if isinstance(other, Series): 

9753 if other.name is None: 

9754 raise ValueError("Other Series must have a name") 

9755 other = DataFrame({other.name: other}) 

9756 

9757 if isinstance(other, DataFrame): 

9758 if how == "cross": 

9759 return merge( 

9760 self, 

9761 other, 

9762 how=how, 

9763 on=on, 

9764 suffixes=(lsuffix, rsuffix), 

9765 sort=sort, 

9766 validate=validate, 

9767 ) 

9768 return merge( 

9769 self, 

9770 other, 

9771 left_on=on, 

9772 how=how, 

9773 left_index=on is None, 

9774 right_index=True, 

9775 suffixes=(lsuffix, rsuffix), 

9776 sort=sort, 

9777 validate=validate, 

9778 ) 

9779 else: 

9780 if on is not None: 

9781 raise ValueError( 

9782 "Joining multiple DataFrames only supported for joining on index" 

9783 ) 

9784 

9785 if rsuffix or lsuffix: 

9786 raise ValueError( 

9787 "Suffixes not supported when joining multiple DataFrames" 

9788 ) 

9789 

9790 # Mypy thinks the RHS is a 

9791 # "Union[DataFrame, Series, Iterable[Union[DataFrame, Series]]]" whereas 

9792 # the LHS is an "Iterable[DataFrame]", but in reality both types are 

9793 # "Iterable[Union[DataFrame, Series]]" due to the if statements 

9794 frames = [cast("DataFrame | Series", self)] + list(other) 

9795 

9796 can_concat = all(df.index.is_unique for df in frames) 

9797 

9798 # join indexes only using concat 

9799 if can_concat: 

9800 if how == "left": 

9801 res = concat( 

9802 frames, axis=1, join="outer", verify_integrity=True, sort=sort 

9803 ) 

9804 return res.reindex(self.index, copy=False) 

9805 else: 

9806 return concat( 

9807 frames, axis=1, join=how, verify_integrity=True, sort=sort 

9808 ) 

9809 

9810 joined = frames[0] 

9811 

9812 for frame in frames[1:]: 

9813 joined = merge( 

9814 joined, 

9815 frame, 

9816 how=how, 

9817 left_index=True, 

9818 right_index=True, 

9819 validate=validate, 

9820 ) 

9821 

9822 return joined 

9823 

9824 @Substitution("") 

9825 @Appender(_merge_doc, indents=2) 

9826 def merge( 

9827 self, 

9828 right: DataFrame | Series, 

9829 how: MergeHow = "inner", 

9830 on: IndexLabel | None = None, 

9831 left_on: IndexLabel | None = None, 

9832 right_on: IndexLabel | None = None, 

9833 left_index: bool = False, 

9834 right_index: bool = False, 

9835 sort: bool = False, 

9836 suffixes: Suffixes = ("_x", "_y"), 

9837 copy: bool | None = None, 

9838 indicator: str | bool = False, 

9839 validate: str | None = None, 

9840 ) -> DataFrame: 

9841 from pandas.core.reshape.merge import merge 

9842 

9843 return merge( 

9844 self, 

9845 right, 

9846 how=how, 

9847 on=on, 

9848 left_on=left_on, 

9849 right_on=right_on, 

9850 left_index=left_index, 

9851 right_index=right_index, 

9852 sort=sort, 

9853 suffixes=suffixes, 

9854 copy=copy, 

9855 indicator=indicator, 

9856 validate=validate, 

9857 ) 

9858 

9859 def round( 

9860 self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs 

9861 ) -> DataFrame: 

9862 """ 

9863 Round a DataFrame to a variable number of decimal places. 

9864 

9865 Parameters 

9866 ---------- 

9867 decimals : int, dict, Series 

9868 Number of decimal places to round each column to. If an int is 

9869 given, round each column to the same number of places. 

9870 Otherwise dict and Series round to variable numbers of places. 

9871 Column names should be in the keys if `decimals` is a 

9872 dict-like, or in the index if `decimals` is a Series. Any 

9873 columns not included in `decimals` will be left as is. Elements 

9874 of `decimals` which are not columns of the input will be 

9875 ignored. 

9876 *args 

9877 Additional positional arguments have no effect but might be accepted for

9878 compatibility with numpy. 

9879 **kwargs 

9880 Additional keywords have no effect but might be accepted for 

9881 compatibility with numpy. 

9882 

9883 Returns 

9884 ------- 

9885 DataFrame 

9886 A DataFrame with the affected columns rounded to the specified 

9887 number of decimal places. 

9888 

9889 See Also 

9890 -------- 

9891 numpy.around : Round a numpy array to the given number of decimals. 

9892 Series.round : Round a Series to the given number of decimals. 

9893 

9894 Examples 

9895 -------- 

9896 >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)], 

9897 ... columns=['dogs', 'cats']) 

9898 >>> df 

9899 dogs cats 

9900 0 0.21 0.32 

9901 1 0.01 0.67 

9902 2 0.66 0.03 

9903 3 0.21 0.18 

9904 

9905 By providing an integer each column is rounded to the same number 

9906 of decimal places 

9907 

9908 >>> df.round(1) 

9909 dogs cats 

9910 0 0.2 0.3 

9911 1 0.0 0.7 

9912 2 0.7 0.0 

9913 3 0.2 0.2 

9914 

9915 With a dict, the number of places for specific columns can be 

9916 specified with the column names as key and the number of decimal 

9917 places as value 

9918 

9919 >>> df.round({'dogs': 1, 'cats': 0}) 

9920 dogs cats 

9921 0 0.2 0.0 

9922 1 0.0 1.0 

9923 2 0.7 0.0 

9924 3 0.2 0.0 

9925 

9926 Using a Series, the number of places for specific columns can be 

9927 specified with the column names as index and the number of 

9928 decimal places as value 

9929 

9930 >>> decimals = pd.Series([0, 1], index=['cats', 'dogs']) 

9931 >>> df.round(decimals) 

9932 dogs cats 

9933 0 0.2 0.0 

9934 1 0.0 1.0 

9935 2 0.7 0.0 

9936 3 0.2 0.0 

9937 """ 

9938 from pandas.core.reshape.concat import concat 

9939 

9940 def _dict_round(df: DataFrame, decimals): 

9941 for col, vals in df.items(): 

9942 try: 

9943 yield _series_round(vals, decimals[col]) 

9944 except KeyError: 

9945 yield vals 

9946 

9947 def _series_round(ser: Series, decimals: int) -> Series: 

9948 if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype): 

9949 return ser.round(decimals) 

9950 return ser 

9951 

9952 nv.validate_round(args, kwargs) 

9953 

9954 if isinstance(decimals, (dict, Series)): 

9955 if isinstance(decimals, Series) and not decimals.index.is_unique: 

9956 raise ValueError("Index of decimals must be unique") 

9957 if is_dict_like(decimals) and not all( 

9958 is_integer(value) for _, value in decimals.items() 

9959 ): 

9960 raise TypeError("Values in decimals must be integers") 

9961 new_cols = list(_dict_round(self, decimals)) 

9962 elif is_integer(decimals): 

9963 # Dispatch to Block.round 

9964 return self._constructor( 

9965 self._mgr.round(decimals=decimals, using_cow=using_copy_on_write()), 

9966 ).__finalize__(self, method="round") 

9967 else: 

9968 raise TypeError("decimals must be an integer, a dict-like or a Series") 

9969 

9970 if new_cols is not None and len(new_cols) > 0: 

9971 return self._constructor( 

9972 concat(new_cols, axis=1), index=self.index, columns=self.columns 

9973 ).__finalize__(self, method="round") 

9974 else: 

9975 return self.copy(deep=False) 

9976 
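# Illustrative sketch (editorial, not part of frame.py): _series_round above
# only rounds integer and float columns, so non-numeric columns named in the
# decimals mapping pass through unchanged.
import pandas as pd

df = pd.DataFrame({"x": [1.234, 5.678], "label": ["a", "b"]})
print(df.round({"x": 1, "label": 2}))    # 'label' is returned as-is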

9977 # ---------------------------------------------------------------------- 

9978 # Statistical methods, etc. 

9979 

9980 def corr( 

9981 self, 

9982 method: CorrelationMethod = "pearson", 

9983 min_periods: int = 1, 

9984 numeric_only: bool = False, 

9985 ) -> DataFrame: 

9986 """ 

9987 Compute pairwise correlation of columns, excluding NA/null values. 

9988 

9989 Parameters 

9990 ---------- 

9991 method : {'pearson', 'kendall', 'spearman'} or callable 

9992 Method of correlation: 

9993 

9994 * pearson : standard correlation coefficient 

9995 * kendall : Kendall Tau correlation coefficient 

9996 * spearman : Spearman rank correlation 

9997 * callable: callable with input two 1d ndarrays 

9998 and returning a float. Note that the returned matrix from corr 

9999 will have 1 along the diagonals and will be symmetric 

10000 regardless of the callable's behavior. 

10001 min_periods : int, optional 

10002 Minimum number of observations required per pair of columns 

10003 to have a valid result. Currently only available for Pearson 

10004 and Spearman correlation. 

10005 numeric_only : bool, default False 

10006 Include only `float`, `int` or `boolean` data. 

10007 

10008 .. versionadded:: 1.5.0 

10009 

10010 .. versionchanged:: 2.0.0 

10011 The default value of ``numeric_only`` is now ``False``. 

10012 

10013 Returns 

10014 ------- 

10015 DataFrame 

10016 Correlation matrix. 

10017 

10018 See Also 

10019 -------- 

10020 DataFrame.corrwith : Compute pairwise correlation with another 

10021 DataFrame or Series. 

10022 Series.corr : Compute the correlation between two Series. 

10023 

10024 Notes 

10025 ----- 

10026 Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations. 

10027 

10028 * `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_ 

10029 * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_ 

10030 * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_ 

10031 

10032 Examples 

10033 -------- 

10034 >>> def histogram_intersection(a, b): 

10035 ... v = np.minimum(a, b).sum().round(decimals=1) 

10036 ... return v 

10037 >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)], 

10038 ... columns=['dogs', 'cats']) 

10039 >>> df.corr(method=histogram_intersection) 

10040 dogs cats 

10041 dogs 1.0 0.3 

10042 cats 0.3 1.0 

10043 

10044 >>> df = pd.DataFrame([(1, 1), (2, np.nan), (np.nan, 3), (4, 4)], 

10045 ... columns=['dogs', 'cats']) 

10046 >>> df.corr(min_periods=3) 

10047 dogs cats 

10048 dogs 1.0 NaN 

10049 cats NaN 1.0 

10050 """ # noqa:E501 

10051 data = self._get_numeric_data() if numeric_only else self 

10052 cols = data.columns 

10053 idx = cols.copy() 

10054 mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) 

10055 

10056 if method == "pearson": 

10057 correl = libalgos.nancorr(mat, minp=min_periods) 

10058 elif method == "spearman": 

10059 correl = libalgos.nancorr_spearman(mat, minp=min_periods) 

10060 elif method == "kendall" or callable(method): 

10061 if min_periods is None: 

10062 min_periods = 1 

10063 mat = mat.T 

10064 corrf = nanops.get_corr_func(method) 

10065 K = len(cols) 

10066 correl = np.empty((K, K), dtype=float) 

10067 mask = np.isfinite(mat) 

10068 for i, ac in enumerate(mat): 

10069 for j, bc in enumerate(mat): 

10070 if i > j: 

10071 continue 

10072 

10073 valid = mask[i] & mask[j] 

10074 if valid.sum() < min_periods: 

10075 c = np.nan 

10076 elif i == j: 

10077 c = 1.0 

10078 elif not valid.all(): 

10079 c = corrf(ac[valid], bc[valid]) 

10080 else: 

10081 c = corrf(ac, bc) 

10082 correl[i, j] = c 

10083 correl[j, i] = c 

10084 else: 

10085 raise ValueError( 

10086 "method must be either 'pearson', " 

10087 "'spearman', 'kendall', or a callable, " 

10088 f"'{method}' was supplied" 

10089 ) 

10090 

10091 result = self._constructor(correl, index=idx, columns=cols, copy=False) 

10092 return result.__finalize__(self, method="corr") 

10093 
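# Illustrative sketch (editorial, not part of frame.py): Spearman correlation
# is Pearson correlation applied to ranks, which the nancorr_spearman fast
# path above computes directly.
import numpy as np
import pandas as pd

df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [10, 20, 15, 40]})
assert np.allclose(df.corr(method="spearman"), df.rank().corr())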

10094 def cov( 

10095 self, 

10096 min_periods: int | None = None, 

10097 ddof: int | None = 1, 

10098 numeric_only: bool = False, 

10099 ) -> DataFrame: 

10100 """ 

10101 Compute pairwise covariance of columns, excluding NA/null values. 

10102 

10103 Compute the pairwise covariance among the series of a DataFrame. 

10104 The returned data frame is the `covariance matrix 

10105 <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns 

10106 of the DataFrame. 

10107 

10108 Both NA and null values are automatically excluded from the 

10109 calculation. (See the note below about bias from missing values.) 

10110 A threshold can be set for the minimum number of 

10111 observations for each value created. Comparisons with observations 

10112 below this threshold will be returned as ``NaN``. 

10113 

10114 This method is generally used for the analysis of time series data to 

10115 understand the relationship between different measures 

10116 across time. 

10117 

10118 Parameters 

10119 ---------- 

10120 min_periods : int, optional 

10121 Minimum number of observations required per pair of columns 

10122 to have a valid result. 

10123 

10124 ddof : int, default 1 

10125 Delta degrees of freedom. The divisor used in calculations 

10126 is ``N - ddof``, where ``N`` represents the number of elements. 

10127 

10128 .. versionadded:: 1.1.0 

10129 

10130 numeric_only : bool, default False 

10131 Include only `float`, `int` or `boolean` data. 

10132 

10133 .. versionadded:: 1.5.0 

10134 

10135 .. versionchanged:: 2.0.0 

10136 The default value of ``numeric_only`` is now ``False``. 

10137 

10138 Returns 

10139 ------- 

10140 DataFrame 

10141 The covariance matrix of the series of the DataFrame. 

10142 

10143 See Also 

10144 -------- 

10145 Series.cov : Compute covariance with another Series. 

10146 core.window.ewm.ExponentialMovingWindow.cov : Exponential weighted sample 

10147 covariance. 

10148 core.window.expanding.Expanding.cov : Expanding sample covariance. 

10149 core.window.rolling.Rolling.cov : Rolling sample covariance. 

10150 

10151 Notes 

10152 ----- 

10153 Returns the covariance matrix of the DataFrame's time series. 

10154 The covariance is normalized by N-ddof. 

10155 

10156 For DataFrames that have Series that are missing data (assuming that 

10157 data is `missing at random 

10158 <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__) 

10159 the returned covariance matrix will be an unbiased estimate 

10160 of the variance and covariance between the member Series. 

10161 

10162 However, for many applications this estimate may not be acceptable 

10163 because the estimated covariance matrix is not guaranteed to be positive

10164 semi-definite. This could lead to estimated correlations having

10165 absolute values which are greater than one, and/or a non-invertible 

10166 covariance matrix. See `Estimation of covariance matrices 

10167 <https://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_ 

10168 matrices>`__ for more details. 

10169 

10170 Examples 

10171 -------- 

10172 >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)], 

10173 ... columns=['dogs', 'cats']) 

10174 >>> df.cov() 

10175 dogs cats 

10176 dogs 0.666667 -1.000000 

10177 cats -1.000000 1.666667 

10178 

10179 >>> np.random.seed(42) 

10180 >>> df = pd.DataFrame(np.random.randn(1000, 5), 

10181 ... columns=['a', 'b', 'c', 'd', 'e']) 

10182 >>> df.cov() 

10183 a b c d e 

10184 a 0.998438 -0.020161 0.059277 -0.008943 0.014144 

10185 b -0.020161 1.059352 -0.008543 -0.024738 0.009826 

10186 c 0.059277 -0.008543 1.010670 -0.001486 -0.000271 

10187 d -0.008943 -0.024738 -0.001486 0.921297 -0.013692 

10188 e 0.014144 0.009826 -0.000271 -0.013692 0.977795 

10189 

10190 **Minimum number of periods** 

10191 

10192 This method also supports an optional ``min_periods`` keyword 

10193 that specifies the required minimum number of non-NA observations for 

10194 each column pair in order to have a valid result: 

10195 

10196 >>> np.random.seed(42) 

10197 >>> df = pd.DataFrame(np.random.randn(20, 3), 

10198 ... columns=['a', 'b', 'c']) 

10199 >>> df.loc[df.index[:5], 'a'] = np.nan 

10200 >>> df.loc[df.index[5:10], 'b'] = np.nan 

10201 >>> df.cov(min_periods=12) 

10202 a b c 

10203 a 0.316741 NaN -0.150812 

10204 b NaN 1.248003 0.191417 

10205 c -0.150812 0.191417 0.895202 

10206 """ 

10207 data = self._get_numeric_data() if numeric_only else self 

10208 cols = data.columns 

10209 idx = cols.copy() 

10210 mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False) 

10211 

10212 if notna(mat).all(): 

10213 if min_periods is not None and min_periods > len(mat): 

10214 base_cov = np.empty((mat.shape[1], mat.shape[1])) 

10215 base_cov.fill(np.nan) 

10216 else: 

10217 base_cov = np.cov(mat.T, ddof=ddof) 

10218 base_cov = base_cov.reshape((len(cols), len(cols))) 

10219 else: 

10220 base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods) 

10221 

10222 result = self._constructor(base_cov, index=idx, columns=cols, copy=False) 

10223 return result.__finalize__(self, method="cov") 

10224 
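# Illustrative sketch (editorial, not part of frame.py): with no missing
# values, the fast path above is a plain np.cov on the transposed value
# matrix with the same ddof.
import numpy as np
import pandas as pd

df = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [2.0, 1.0, 4.0]})
assert np.allclose(df.cov(), np.cov(df.to_numpy().T, ddof=1))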

10225 def corrwith( 

10226 self, 

10227 other: DataFrame | Series, 

10228 axis: Axis = 0, 

10229 drop: bool = False, 

10230 method: CorrelationMethod = "pearson", 

10231 numeric_only: bool = False, 

10232 ) -> Series: 

10233 """ 

10234 Compute pairwise correlation. 

10235 

10236 Pairwise correlation is computed between rows or columns of 

10237 DataFrame with rows or columns of Series or DataFrame. DataFrames 

10238 are first aligned along both axes before computing the 

10239 correlations. 

10240 

10241 Parameters 

10242 ---------- 

10243 other : DataFrame, Series 

10244 Object with which to compute correlations. 

10245 axis : {0 or 'index', 1 or 'columns'}, default 0 

10246 The axis to use. 0 or 'index' to compute row-wise, 1 or 'columns' for 

10247 column-wise. 

10248 drop : bool, default False 

10249 Drop missing indices from result. 

10250 method : {'pearson', 'kendall', 'spearman'} or callable 

10251 Method of correlation: 

10252 

10253 * pearson : standard correlation coefficient 

10254 * kendall : Kendall Tau correlation coefficient 

10255 * spearman : Spearman rank correlation 

10256 * callable: callable with input two 1d ndarrays 

10257 and returning a float. 

10258 

10259 numeric_only : bool, default False 

10260 Include only `float`, `int` or `boolean` data. 

10261 

10262 .. versionadded:: 1.5.0 

10263 

10264 .. versionchanged:: 2.0.0 

10265 The default value of ``numeric_only`` is now ``False``. 

10266 

10267 Returns 

10268 ------- 

10269 Series 

10270 Pairwise correlations. 

10271 

10272 See Also 

10273 -------- 

10274 DataFrame.corr : Compute pairwise correlation of columns. 

10275 

10276 Examples 

10277 -------- 

10278 >>> index = ["a", "b", "c", "d", "e"] 

10279 >>> columns = ["one", "two", "three", "four"] 

10280 >>> df1 = pd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns) 

10281 >>> df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=index[:4], columns=columns) 

10282 >>> df1.corrwith(df2) 

10283 one 1.0 

10284 two 1.0 

10285 three 1.0 

10286 four 1.0 

10287 dtype: float64 

10288 

10289 >>> df2.corrwith(df1, axis=1) 

10290 a 1.0 

10291 b 1.0 

10292 c 1.0 

10293 d 1.0 

10294 e NaN 

10295 dtype: float64 

10296 """ # noqa:E501 

10297 axis = self._get_axis_number(axis) 

10298 this = self._get_numeric_data() if numeric_only else self 

10299 

10300 if isinstance(other, Series): 

10301 return this.apply(lambda x: other.corr(x, method=method), axis=axis) 

10302 

10303 if numeric_only: 

10304 other = other._get_numeric_data() 

10305 left, right = this.align(other, join="inner", copy=False) 

10306 

10307 if axis == 1: 

10308 left = left.T 

10309 right = right.T 

10310 

10311 if method == "pearson": 

10312 # mask missing values 

10313 left = left + right * 0 

10314 right = right + left * 0 

10315 

10316 # demeaned data 

10317 ldem = left - left.mean(numeric_only=numeric_only) 

10318 rdem = right - right.mean(numeric_only=numeric_only) 

10319 

10320 num = (ldem * rdem).sum() 

10321 dom = ( 

10322 (left.count() - 1) 

10323 * left.std(numeric_only=numeric_only) 

10324 * right.std(numeric_only=numeric_only) 

10325 ) 

10326 

10327 correl = num / dom 

10328 

10329 elif method in ["kendall", "spearman"] or callable(method): 

10330 

10331 def c(x): 

10332 return nanops.nancorr(x[0], x[1], method=method) 

10333 

10334 correl = self._constructor_sliced( 

10335 map(c, zip(left.values.T, right.values.T)), 

10336 index=left.columns, 

10337 copy=False, 

10338 ) 

10339 

10340 else: 

10341 raise ValueError( 

10342 f"Invalid method {method} was passed, " 

10343 "valid methods are: 'pearson', 'kendall', " 

10344 "'spearman', or callable" 

10345 ) 

10346 

10347 if not drop: 

10348 # Find non-matching labels along the given axis 

10349 # and append missing correlations (GH 22375) 

10350 raxis: AxisInt = 1 if axis == 0 else 0 

10351 result_index = this._get_axis(raxis).union(other._get_axis(raxis)) 

10352 idx_diff = result_index.difference(correl.index) 

10353 

10354 if len(idx_diff) > 0: 

10355 correl = correl._append( 

10356 Series([np.nan] * len(idx_diff), index=idx_diff) 

10357 ) 

10358 

10359 return correl 

10360 
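# Illustrative sketch (editorial, not part of frame.py): along axis=0 the
# Pearson path matches taking Series.corr column by column after aligning
# the two frames.
import numpy as np
import pandas as pd

df1 = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [2.0, 5.0, 1.0]})
df2 = pd.DataFrame({"a": [2.0, 4.0, 6.0], "b": [1.0, 2.0, 2.0]})
by_hand = pd.Series({c: df1[c].corr(df2[c]) for c in df1.columns})
assert np.allclose(df1.corrwith(df2), by_hand)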

10361 # ---------------------------------------------------------------------- 

10362 # ndarray-like stats methods 

10363 

10364 def count(self, axis: Axis = 0, numeric_only: bool = False): 

10365 """ 

10366 Count non-NA cells for each column or row. 

10367 

10368 The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending 

10369 on `pandas.options.mode.use_inf_as_na`) are considered NA. 

10370 

10371 Parameters 

10372 ---------- 

10373 axis : {0 or 'index', 1 or 'columns'}, default 0 

10374 If 0 or 'index' counts are generated for each column. 

10375 If 1 or 'columns' counts are generated for each row. 

10376 numeric_only : bool, default False 

10377 Include only `float`, `int` or `boolean` data. 

10378 

10379 Returns 

10380 ------- 

10381 Series

10382 For each column/row the number of non-NA/null entries.

10383

10384 

10385 See Also 

10386 -------- 

10387 Series.count: Number of non-NA elements in a Series. 

10388 DataFrame.value_counts: Count unique combinations of columns. 

10389 DataFrame.shape: Number of DataFrame rows and columns (including NA 

10390 elements). 

10391 DataFrame.isna: Boolean same-sized DataFrame showing places of NA 

10392 elements. 

10393 

10394 Examples 

10395 -------- 

10396 Constructing DataFrame from a dictionary: 

10397 

10398 >>> df = pd.DataFrame({"Person": 

10399 ... ["John", "Myla", "Lewis", "John", "Myla"], 

10400 ... "Age": [24., np.nan, 21., 33, 26], 

10401 ... "Single": [False, True, True, True, False]}) 

10402 >>> df 

10403 Person Age Single 

10404 0 John 24.0 False 

10405 1 Myla NaN True 

10406 2 Lewis 21.0 True 

10407 3 John 33.0 True 

10408 4 Myla 26.0 False 

10409 

10410 Notice the uncounted NA values: 

10411 

10412 >>> df.count() 

10413 Person 5 

10414 Age 4 

10415 Single 5 

10416 dtype: int64 

10417 

10418 Counts for each **row**: 

10419 

10420 >>> df.count(axis='columns') 

10421 0 3 

10422 1 2 

10423 2 3 

10424 3 3 

10425 4 3 

10426 dtype: int64 

10427 """ 

10428 axis = self._get_axis_number(axis) 

10429 

10430 if numeric_only: 

10431 frame = self._get_numeric_data() 

10432 else: 

10433 frame = self 

10434 

10435 # GH #423 

10436 if len(frame._get_axis(axis)) == 0: 

10437 result = self._constructor_sliced(0, index=frame._get_agg_axis(axis)) 

10438 else: 

10439 if frame._is_mixed_type or frame._mgr.any_extension_types: 

10440 # the or any_extension_types is really only hit for single- 

10441 # column frames with an extension array 

10442 result = notna(frame).sum(axis=axis) 

10443 else: 

10444 # GH13407 

10445 series_counts = notna(frame).sum(axis=axis) 

10446 counts = series_counts._values 

10447 result = self._constructor_sliced( 

10448 counts, index=frame._get_agg_axis(axis), copy=False 

10449 ) 

10450 

10451 return result.astype("int64").__finalize__(self, method="count") 

10452 
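# Illustrative sketch (editorial, not part of frame.py): both branches above
# amount to notna().sum() along the requested axis.
import numpy as np
import pandas as pd

df = pd.DataFrame({"A": [1.0, np.nan], "B": [3.0, 4.0]})
assert df.count().equals(df.notna().sum())
assert df.count(axis=1).equals(df.notna().sum(axis=1))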

10453 def _reduce( 

10454 self, 

10455 op, 

10456 name: str, 

10457 *, 

10458 axis: Axis = 0, 

10459 skipna: bool = True, 

10460 numeric_only: bool = False, 

10461 filter_type=None, 

10462 **kwds, 

10463 ): 

10464 assert filter_type is None or filter_type == "bool", filter_type 

10465 out_dtype = "bool" if filter_type == "bool" else None 

10466 

10467 if axis is not None: 

10468 axis = self._get_axis_number(axis) 

10469 

10470 def func(values: np.ndarray): 

10471 # We only use this in the case that operates on self.values 

10472 return op(values, axis=axis, skipna=skipna, **kwds) 

10473 

10474 def blk_func(values, axis: Axis = 1): 

10475 if isinstance(values, ExtensionArray): 

10476 if not is_1d_only_ea_dtype(values.dtype) and not isinstance( 

10477 self._mgr, ArrayManager 

10478 ): 

10479 return values._reduce(name, axis=1, skipna=skipna, **kwds) 

10480 return values._reduce(name, skipna=skipna, **kwds) 

10481 else: 

10482 return op(values, axis=axis, skipna=skipna, **kwds) 

10483 

10484 def _get_data() -> DataFrame: 

10485 if filter_type is None: 

10486 data = self._get_numeric_data() 

10487 else: 

10488 # GH#25101, GH#24434 

10489 assert filter_type == "bool" 

10490 data = self._get_bool_data() 

10491 return data 

10492 

10493 # Case with EAs see GH#35881 

10494 df = self 

10495 if numeric_only: 

10496 df = _get_data() 

10497 if axis is None: 

10498 return func(df.values) 

10499 elif axis == 1: 

10500 if len(df.index) == 0: 

10501 # Taking a transpose would result in no columns, losing the dtype. 

10502 # In the empty case, reducing along axis 0 or 1 gives the same 

10503 # result dtype, so reduce with axis=0 and ignore values 

10504 result = df._reduce( 

10505 op, 

10506 name, 

10507 axis=0, 

10508 skipna=skipna, 

10509 numeric_only=False, 

10510 filter_type=filter_type, 

10511 **kwds, 

10512 ).iloc[:0] 

10513 result.index = df.index 

10514 return result 

10515 df = df.T 

10516 

10517 # After possibly _get_data and transposing, we are now in the 

10518 # simple case where we can use BlockManager.reduce 

10519 res = df._mgr.reduce(blk_func) 

10520 out = df._constructor(res).iloc[0] 

10521 if out_dtype is not None: 

10522 out = out.astype(out_dtype) 

10523 elif (df._mgr.get_dtypes() == object).any(): 

10524 out = out.astype(object) 

10525 elif len(self) == 0 and name in ("sum", "prod"): 

10526 # Even if we are object dtype, follow numpy and return 

10527 # float64, see test_apply_funcs_over_empty 

10528 out = out.astype(np.float64) 

10529 

10530 return out 

10531 

10532 def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: 

10533 """ 

10534 Special case for _reduce to try to avoid a potentially-expensive transpose. 

10535 

10536 Apply the reduction block-wise along axis=1 and then reduce the resulting 

10537 1D arrays. 

10538 """ 

10539 if name == "all": 

10540 result = np.ones(len(self), dtype=bool) 

10541 ufunc = np.logical_and 

10542 elif name == "any": 

10543 result = np.zeros(len(self), dtype=bool) 

10544 # error: Incompatible types in assignment 

10545 # (expression has type "_UFunc_Nin2_Nout1[Literal['logical_or'], 

10546 # Literal[20], Literal[False]]", variable has type 

10547 # "_UFunc_Nin2_Nout1[Literal['logical_and'], Literal[20], 

10548 # Literal[True]]") 

10549 ufunc = np.logical_or # type: ignore[assignment] 

10550 else: 

10551 raise NotImplementedError(name) 

10552 

10553 for arr in self._mgr.arrays: 

10554 middle = func(arr, axis=0, skipna=skipna) 

10555 result = ufunc(result, middle) 

10556 

10557 res_ser = self._constructor_sliced(result, index=self.index, copy=False) 

10558 return res_ser 

10559 
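# Illustrative sketch (editorial, not part of frame.py): the axis=1 fast path
# folds a logical ufunc across the stored arrays instead of transposing the
# whole frame; this loop over columns mimics the loop over _mgr.arrays.
import numpy as np
import pandas as pd

df = pd.DataFrame({"A": [True, False], "B": [False, False]})
acc = np.zeros(len(df), dtype=bool)          # identity for logical_or
for col in df.columns:
    acc = np.logical_or(acc, df[col].to_numpy())
assert (df.any(axis=1).to_numpy() == acc).all()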

10560 def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: 

10561 """ 

10562 Count number of distinct elements in specified axis. 

10563 

10564 Return Series with number of distinct elements. Can ignore NaN 

10565 values. 

10566 

10567 Parameters 

10568 ---------- 

10569 axis : {0 or 'index', 1 or 'columns'}, default 0 

10570 The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for 

10571 column-wise. 

10572 dropna : bool, default True 

10573 Don't include NaN in the counts. 

10574 

10575 Returns 

10576 ------- 

10577 Series 

10578 

10579 See Also 

10580 -------- 

10581 Series.nunique: Method nunique for Series. 

10582 DataFrame.count: Count non-NA cells for each column or row. 

10583 

10584 Examples 

10585 -------- 

10586 >>> df = pd.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]}) 

10587 >>> df.nunique() 

10588 A 3 

10589 B 2 

10590 dtype: int64 

10591 

10592 >>> df.nunique(axis=1) 

10593 0 1 

10594 1 2 

10595 2 2 

10596 dtype: int64 

10597 """ 

10598 return self.apply(Series.nunique, axis=axis, dropna=dropna) 

10599 

10600 @doc(_shared_docs["idxmin"], numeric_only_default="False") 

10601 def idxmin( 

10602 self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False 

10603 ) -> Series: 

10604 axis = self._get_axis_number(axis) 

10605 if numeric_only: 

10606 data = self._get_numeric_data() 

10607 else: 

10608 data = self 

10609 

10610 res = data._reduce( 

10611 nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False 

10612 ) 

10613 indices = res._values 

10614 

10615 # indices will always be np.ndarray since axis is not None and 

10616 # values is a 2d array for DataFrame 

10617 # error: Item "int" of "Union[int, Any]" has no attribute "__iter__" 

10618 assert isinstance(indices, np.ndarray) # for mypy 

10619 

10620 index = data._get_axis(axis) 

10621 result = [index[i] if i >= 0 else np.nan for i in indices] 

10622 final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis)) 

10623 return final_result.__finalize__(self, method="idxmin") 

10624 

10625 @doc(_shared_docs["idxmax"], numeric_only_default="False") 

10626 def idxmax( 

10627 self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False 

10628 ) -> Series: 

10629 axis = self._get_axis_number(axis) 

10630 if numeric_only: 

10631 data = self._get_numeric_data() 

10632 else: 

10633 data = self 

10634 

10635 res = data._reduce( 

10636 nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False 

10637 ) 

10638 indices = res._values 

10639 

10640 # indices will always be np.ndarray since axis is not None and 

10641 # values is a 2d array for DataFrame 

10642 # error: Item "int" of "Union[int, Any]" has no attribute "__iter__" 

10643 assert isinstance(indices, np.ndarray) # for mypy 

10644 

10645 index = data._get_axis(axis) 

10646 result = [index[i] if i >= 0 else np.nan for i in indices] 

10647 final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis)) 

10648 return final_result.__finalize__(self, method="idxmax") 

10649 
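# A brief sketch (hypothetical data) of the sentinel handling in idxmin and
# idxmax above: nanargmin/nanargmax return -1 for an all-NA column, and the
# ``i >= 0`` guard maps that sentinel to NaN in the result.
# >>> df = pd.DataFrame({"a": [1.0, 0.5], "b": [np.nan, np.nan]})
# >>> df.idxmin()
# a      1
# b    NaN
# dtype: object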

10650 def _get_agg_axis(self, axis_num: int) -> Index: 

10651 """ 

10652 Return the axis labels used for an aggregation result: the columns for axis 0, the index for axis 1. 

10653 """ 

10654 if axis_num == 0: 

10655 return self.columns 

10656 elif axis_num == 1: 

10657 return self.index 

10658 else: 

10659 raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})") 

10660 

10661 def mode( 

10662 self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True 

10663 ) -> DataFrame: 

10664 """ 

10665 Get the mode(s) of each element along the selected axis. 

10666 

10667 The mode of a set of values is the value that appears most often. 

10668 It can be multiple values. 

10669 

10670 Parameters 

10671 ---------- 

10672 axis : {0 or 'index', 1 or 'columns'}, default 0 

10673 The axis to iterate over while searching for the mode: 

10674 

10675 * 0 or 'index' : get mode of each column 

10676 * 1 or 'columns' : get mode of each row. 

10677 

10678 numeric_only : bool, default False 

10679 If True, only apply to numeric columns. 

10680 dropna : bool, default True 

10681 Don't consider counts of NaN/NaT. 

10682 

10683 Returns 

10684 ------- 

10685 DataFrame 

10686 The modes of each column or row. 

10687 

10688 See Also 

10689 -------- 

10690 Series.mode : Return the highest frequency value in a Series. 

10691 Series.value_counts : Return the counts of values in a Series. 

10692 

10693 Examples 

10694 -------- 

10695 >>> df = pd.DataFrame([('bird', 2, 2), 

10696 ... ('mammal', 4, np.nan), 

10697 ... ('arthropod', 8, 0), 

10698 ... ('bird', 2, np.nan)], 

10699 ... index=('falcon', 'horse', 'spider', 'ostrich'), 

10700 ... columns=('species', 'legs', 'wings')) 

10701 >>> df 

10702 species legs wings 

10703 falcon bird 2 2.0 

10704 horse mammal 4 NaN 

10705 spider arthropod 8 0.0 

10706 ostrich bird 2 NaN 

10707 

10708 By default, missing values are not considered, and the modes of wings 

10709 are both 0 and 2. Because the resulting DataFrame has two rows, 

10710 the second row of ``species`` and ``legs`` contains ``NaN``. 

10711 

10712 >>> df.mode() 

10713 species legs wings 

10714 0 bird 2.0 0.0 

10715 1 NaN NaN 2.0 

10716 

10717 Setting ``dropna=False``, ``NaN`` values are considered and they can be 

10718 the mode (like for wings). 

10719 

10720 >>> df.mode(dropna=False) 

10721 species legs wings 

10722 0 bird 2 NaN 

10723 

10724 Setting ``numeric_only=True``, only the mode of numeric columns is 

10725 computed, and columns of other types are ignored. 

10726 

10727 >>> df.mode(numeric_only=True) 

10728 legs wings 

10729 0 2.0 0.0 

10730 1 NaN 2.0 

10731 

10732 To compute the mode over columns and not rows, use the axis parameter: 

10733 

10734 >>> df.mode(axis='columns', numeric_only=True) 

10735 0 1 

10736 falcon 2.0 NaN 

10737 horse 4.0 NaN 

10738 spider 0.0 8.0 

10739 ostrich 2.0 NaN 

10740 """ 

10741 data = self if not numeric_only else self._get_numeric_data() 

10742 

10743 def f(s): 

10744 return s.mode(dropna=dropna) 

10745 

10746 data = data.apply(f, axis=axis) 

10747 # Ensure index is type stable (should always use int index) 

10748 if data.empty: 

10749 data.index = default_index(0) 

10750 

10751 return data 

10752 

10753 @overload 

10754 def quantile( 

10755 self, 

10756 q: float = ..., 

10757 axis: Axis = ..., 

10758 numeric_only: bool = ..., 

10759 interpolation: QuantileInterpolation = ..., 

10760 ) -> Series: 

10761 ... 

10762 

10763 @overload 

10764 def quantile( 

10765 self, 

10766 q: AnyArrayLike | Sequence[float], 

10767 axis: Axis = ..., 

10768 numeric_only: bool = ..., 

10769 interpolation: QuantileInterpolation = ..., 

10770 ) -> Series | DataFrame: 

10771 ... 

10772 

10773 @overload 

10774 def quantile( 

10775 self, 

10776 q: float | AnyArrayLike | Sequence[float] = ..., 

10777 axis: Axis = ..., 

10778 numeric_only: bool = ..., 

10779 interpolation: QuantileInterpolation = ..., 

10780 ) -> Series | DataFrame: 

10781 ... 

10782 

10783 def quantile( 

10784 self, 

10785 q: float | AnyArrayLike | Sequence[float] = 0.5, 

10786 axis: Axis = 0, 

10787 numeric_only: bool = False, 

10788 interpolation: QuantileInterpolation = "linear", 

10789 method: Literal["single", "table"] = "single", 

10790 ) -> Series | DataFrame: 

10791 """ 

10792 Return values at the given quantile over the requested axis. 

10793 

10794 Parameters 

10795 ---------- 

10796 q : float or array-like, default 0.5 (50% quantile) 

10797 Value(s) between 0 and 1, the quantile(s) to compute. 

10798 axis : {0 or 'index', 1 or 'columns'}, default 0 

10799 Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise. 

10800 numeric_only : bool, default False 

10801 Include only `float`, `int` or `boolean` data. 

10802 

10803 .. versionchanged:: 2.0.0 

10804 The default value of ``numeric_only`` is now ``False``. 

10805 

10806 interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} 

10807 This optional parameter specifies the interpolation method to use, 

10808 when the desired quantile lies between two data points `i` and `j`: 

10809 

10810 * linear: `i + (j - i) * fraction`, where `fraction` is the 

10811 fractional part of the index surrounded by `i` and `j`. 

10812 * lower: `i`. 

10813 * higher: `j`. 

10814 * nearest: `i` or `j` whichever is nearest. 

10815 * midpoint: (`i` + `j`) / 2. 

10816 method : {'single', 'table'}, default 'single' 

10817 Whether to compute quantiles per-column ('single') or over all columns 

10818 ('table'). When 'table', the only allowed interpolation methods are 

10819 'nearest', 'lower', and 'higher'. 

10820 

10821 Returns 

10822 ------- 

10823 Series or DataFrame 

10824 

10825 If ``q`` is an array, a DataFrame will be returned where the 

10826 index is ``q``, the columns are the columns of self, and the 

10827 values are the quantiles. 

10828 If ``q`` is a float, a Series will be returned where the 

10829 index is the columns of self and the values are the quantiles. 

10830 

10831 See Also 

10832 -------- 

10833 core.window.rolling.Rolling.quantile: Rolling quantile. 

10834 numpy.percentile: Numpy function to compute the percentile. 

10835 

10836 Examples 

10837 -------- 

10838 >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]), 

10839 ... columns=['a', 'b']) 

10840 >>> df.quantile(.1) 

10841 a 1.3 

10842 b 3.7 

10843 Name: 0.1, dtype: float64 

10844 >>> df.quantile([.1, .5]) 

10845 a b 

10846 0.1 1.3 3.7 

10847 0.5 2.5 55.0 

10848 

10849 Specifying `method='table'` will compute the quantile over all columns. 

10850 

10851 >>> df.quantile(.1, method="table", interpolation="nearest") 

10852 a 1 

10853 b 1 

10854 Name: 0.1, dtype: int64 

10855 >>> df.quantile([.1, .5], method="table", interpolation="nearest") 

10856 a b 

10857 0.1 1 1 

10858 0.5 3 100 

10859 

10860 Specifying `numeric_only=False` will also compute the quantile of 

10861 datetime and timedelta data. 

10862 

10863 >>> df = pd.DataFrame({'A': [1, 2], 

10864 ... 'B': [pd.Timestamp('2010'), 

10865 ... pd.Timestamp('2011')], 

10866 ... 'C': [pd.Timedelta('1 days'), 

10867 ... pd.Timedelta('2 days')]}) 

10868 >>> df.quantile(0.5, numeric_only=False) 

10869 A 1.5 

10870 B 2010-07-02 12:00:00 

10871 C 1 days 12:00:00 

10872 Name: 0.5, dtype: object 

10873 """ 

10874 validate_percentile(q) 

10875 axis = self._get_axis_number(axis) 

10876 

10877 if not is_list_like(q): 

10878 # BlockManager.quantile expects listlike, so we wrap and unwrap here 

10879 # error: List item 0 has incompatible type "Union[float, Union[Union[ 

10880 # ExtensionArray, ndarray[Any, Any]], Index, Series], Sequence[float]]"; 

10881 # expected "float" 

10882 res_df = self.quantile( # type: ignore[call-overload] 

10883 [q], 

10884 axis=axis, 

10885 numeric_only=numeric_only, 

10886 interpolation=interpolation, 

10887 method=method, 

10888 ) 

10889 if method == "single": 

10890 res = res_df.iloc[0] 

10891 else: 

10892 # cannot directly iloc over sparse arrays 

10893 res = res_df.T.iloc[:, 0] 

10894 if axis == 1 and len(self) == 0: 

10895 # GH#41544 try to get an appropriate dtype 

10896 dtype = find_common_type(list(self.dtypes)) 

10897 if needs_i8_conversion(dtype): 

10898 return res.astype(dtype) 

10899 return res 

10900 

10901 q = Index(q, dtype=np.float64) 

10902 data = self._get_numeric_data() if numeric_only else self 

10903 

10904 if axis == 1: 

10905 data = data.T 

10906 

10907 if len(data.columns) == 0: 

10908 # GH#23925 _get_numeric_data may have dropped all columns 

10909 cols = Index([], name=self.columns.name) 

10910 

10911 dtype = np.float64 

10912 if axis == 1: 

10913 # GH#41544 try to get an appropriate dtype 

10914 cdtype = find_common_type(list(self.dtypes)) 

10915 if needs_i8_conversion(cdtype): 

10916 dtype = cdtype 

10917 

10918 res = self._constructor([], index=q, columns=cols, dtype=dtype) 

10919 return res.__finalize__(self, method="quantile") 

10920 

10921 valid_method = {"single", "table"} 

10922 if method not in valid_method: 

10923 raise ValueError( 

10924 f"Invalid method: {method}. Method must be in {valid_method}." 

10925 ) 

10926 if method == "single": 

10927 res = data._mgr.quantile(qs=q, axis=1, interpolation=interpolation) 

10928 elif method == "table": 

10929 valid_interpolation = {"nearest", "lower", "higher"} 

10930 if interpolation not in valid_interpolation: 

10931 raise ValueError( 

10932 f"Invalid interpolation: {interpolation}. " 

10933 f"Interpolation must be in {valid_interpolation}" 

10934 ) 

10935 # handle degenerate case 

10936 if len(data) == 0: 

10937 if data.ndim == 2: 

10938 dtype = find_common_type(list(self.dtypes)) 

10939 else: 

10940 dtype = self.dtype 

10941 return self._constructor([], index=q, columns=data.columns, dtype=dtype) 

10942 

10943 q_idx = np.quantile( # type: ignore[call-overload] 

10944 np.arange(len(data)), q, **{np_percentile_argname: interpolation} 

10945 ) 

10946 

10947 by = data.columns 

10948 if len(by) > 1: 

10949 keys = [data._get_label_or_level_values(x) for x in by] 

10950 indexer = lexsort_indexer(keys) 

10951 else: 

10952 by = by[0] 

10953 k = data._get_label_or_level_values(by) # type: ignore[arg-type] 

10954 indexer = nargsort(k) 

10955 

10956 res = data._mgr.take(indexer[q_idx], verify=False) 

10957 res.axes[1] = q 

10958 

10959 result = self._constructor(res) 

10960 return result.__finalize__(self, method="quantile") 

10961 
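# A minimal sketch (hypothetical data) of what method="table" computes above:
# rows are lexsorted by all columns and the quantile indexes into that sorted
# order, so every returned row is an actual row of the data.
# >>> df = pd.DataFrame({"a": [3, 1, 2], "b": [30, 10, 20]})
# >>> df.quantile(0.5, method="table", interpolation="nearest")
# a     2
# b    20
# Name: 0.5, dtype: int64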

10962 @doc(NDFrame.asfreq, **_shared_doc_kwargs) 

10963 def asfreq( 

10964 self, 

10965 freq: Frequency, 

10966 method: FillnaOptions | None = None, 

10967 how: str | None = None, 

10968 normalize: bool = False, 

10969 fill_value: Hashable = None, 

10970 ) -> DataFrame: 

10971 return super().asfreq( 

10972 freq=freq, 

10973 method=method, 

10974 how=how, 

10975 normalize=normalize, 

10976 fill_value=fill_value, 

10977 ) 

10978 

10979 @doc(NDFrame.resample, **_shared_doc_kwargs) 

10980 def resample( 

10981 self, 

10982 rule, 

10983 axis: Axis = 0, 

10984 closed: str | None = None, 

10985 label: str | None = None, 

10986 convention: str = "start", 

10987 kind: str | None = None, 

10988 on: Level = None, 

10989 level: Level = None, 

10990 origin: str | TimestampConvertibleTypes = "start_day", 

10991 offset: TimedeltaConvertibleTypes | None = None, 

10992 group_keys: bool = False, 

10993 ) -> Resampler: 

10994 return super().resample( 

10995 rule=rule, 

10996 axis=axis, 

10997 closed=closed, 

10998 label=label, 

10999 convention=convention, 

11000 kind=kind, 

11001 on=on, 

11002 level=level, 

11003 origin=origin, 

11004 offset=offset, 

11005 group_keys=group_keys, 

11006 ) 

11007 

11008 def to_timestamp( 

11009 self, 

11010 freq: Frequency | None = None, 

11011 how: str = "start", 

11012 axis: Axis = 0, 

11013 copy: bool | None = None, 

11014 ) -> DataFrame: 

11015 """ 

11016 Cast to DatetimeIndex of timestamps, at *beginning* of period. 

11017 

11018 Parameters 

11019 ---------- 

11020 freq : str, optional 

11021 Desired frequency. Defaults to the frequency of the PeriodIndex. 

11022 how : {'s', 'e', 'start', 'end'} 

11023 Convention for converting period to timestamp; start of period 

11024 vs. end. 

11025 axis : {0 or 'index', 1 or 'columns'}, default 0 

11026 The axis to convert (the index by default). 

11027 copy : bool, default True 

11028 If False then underlying input data is not copied. 

11029 

11030 Returns 

11031 ------- 

11032 DataFrame 

11033 The DataFrame has a DatetimeIndex. 

11034 

11035 Examples 

11036 -------- 

11037 >>> idx = pd.PeriodIndex(['2023', '2024'], freq='Y') 

11038 >>> d = {'col1': [1, 2], 'col2': [3, 4]} 

11039 >>> df1 = pd.DataFrame(data=d, index=idx) 

11040 >>> df1 

11041 col1 col2 

11042 2023 1 3 

11043 2024 2 4 

11044 

11045 The resulting timestamps will be at the beginning of the year in this case. 

11046 

11047 >>> df1 = df1.to_timestamp() 

11048 >>> df1 

11049 col1 col2 

11050 2023-01-01 1 3 

11051 2024-01-01 2 4 

11052 >>> df1.index 

11053 DatetimeIndex(['2023-01-01', '2024-01-01'], dtype='datetime64[ns]', freq=None) 

11054 

11055 Using `freq`, which is the offset that the resulting Timestamps will have: 

11056 

11057 >>> df2 = pd.DataFrame(data=d, index=idx) 

11058 >>> df2 = df2.to_timestamp(freq='M') 

11059 >>> df2 

11060 col1 col2 

11061 2023-01-31 1 3 

11062 2024-01-31 2 4 

11063 >>> df2.index 

11064 DatetimeIndex(['2023-01-31', '2024-01-31'], dtype='datetime64[ns]', freq=None) 

11065 """ 

11066 new_obj = self.copy(deep=copy and not using_copy_on_write()) 

11067 

11068 axis_name = self._get_axis_name(axis) 

11069 old_ax = getattr(self, axis_name) 

11070 if not isinstance(old_ax, PeriodIndex): 

11071 raise TypeError(f"unsupported Type {type(old_ax).__name__}") 

11072 

11073 new_ax = old_ax.to_timestamp(freq=freq, how=how) 

11074 

11075 setattr(new_obj, axis_name, new_ax) 

11076 return new_obj 

11077 

11078 def to_period( 

11079 self, freq: Frequency | None = None, axis: Axis = 0, copy: bool | None = None 

11080 ) -> DataFrame: 

11081 """ 

11082 Convert DataFrame from DatetimeIndex to PeriodIndex. 

11083 

11084 Convert DataFrame from DatetimeIndex to PeriodIndex with desired 

11085 frequency (inferred from index if not passed). 

11086 

11087 Parameters 

11088 ---------- 

11089 freq : str, optional 

11090 Frequency of the PeriodIndex. 

11091 axis : {0 or 'index', 1 or 'columns'}, default 0 

11092 The axis to convert (the index by default). 

11093 copy : bool, default True 

11094 If False then underlying input data is not copied. 

11095 

11096 Returns 

11097 ------- 

11098 DataFrame 

11099 The DataFrame has a PeriodIndex. 

11100 

11101 Examples 

11102 -------- 

11103 >>> idx = pd.to_datetime( 

11104 ... [ 

11105 ... "2001-03-31 00:00:00", 

11106 ... "2002-05-31 00:00:00", 

11107 ... "2003-08-31 00:00:00", 

11108 ... ] 

11109 ... ) 

11110 

11111 >>> idx 

11112 DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'], 

11113 dtype='datetime64[ns]', freq=None) 

11114 

11115 >>> idx.to_period("M") 

11116 PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]') 

11117 

11118 For the yearly frequency: 

11119 

11120 >>> idx.to_period("Y") 

11121 PeriodIndex(['2001', '2002', '2003'], dtype='period[A-DEC]') 

11122 """ 

11123 new_obj = self.copy(deep=copy and not using_copy_on_write()) 

11124 

11125 axis_name = self._get_axis_name(axis) 

11126 old_ax = getattr(self, axis_name) 

11127 if not isinstance(old_ax, DatetimeIndex): 

11128 raise TypeError(f"unsupported Type {type(old_ax).__name__}") 

11129 

11130 new_ax = old_ax.to_period(freq=freq) 

11131 

11132 setattr(new_obj, axis_name, new_ax) 

11133 return new_obj 

11134 
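# A short frame-level sketch (hypothetical data) complementing the
# Index-level examples in the docstring above:
# >>> idx = pd.to_datetime(["2001-03-31", "2002-05-31", "2003-08-31"])
# >>> df = pd.DataFrame({"y": [1, 2, 3]}, index=idx)
# >>> df.to_period("M").index
# PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]')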

11135 def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame: 

11136 """ 

11137 Whether each element in the DataFrame is contained in values. 

11138 

11139 Parameters 

11140 ---------- 

11141 values : iterable, Series, DataFrame or dict 

11142 The result will only be true at a location if all the 

11143 labels match. If `values` is a Series, the match is on its index. If 

11144 `values` is a dict, the keys must be the column names, 

11145 which must match. If `values` is a DataFrame, 

11146 then both the index and column labels must match. 

11147 

11148 Returns 

11149 ------- 

11150 DataFrame 

11151 DataFrame of booleans showing whether each element in the DataFrame 

11152 is contained in values. 

11153 

11154 See Also 

11155 -------- 

11156 DataFrame.eq: Equality test for DataFrame. 

11157 Series.isin: Equivalent method on Series. 

11158 Series.str.contains: Test if pattern or regex is contained within a 

11159 string of a Series or Index. 

11160 

11161 Examples 

11162 -------- 

11163 >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]}, 

11164 ... index=['falcon', 'dog']) 

11165 >>> df 

11166 num_legs num_wings 

11167 falcon 2 2 

11168 dog 4 0 

11169 

11170 When ``values`` is a list, check whether every value in the DataFrame 

11171 is present in the list (which animals have 0 or 2 legs or wings) 

11172 

11173 >>> df.isin([0, 2]) 

11174 num_legs num_wings 

11175 falcon True True 

11176 dog False True 

11177 

11178 To check if ``values`` is *not* in the DataFrame, use the ``~`` operator: 

11179 

11180 >>> ~df.isin([0, 2]) 

11181 num_legs num_wings 

11182 falcon False False 

11183 dog True False 

11184 

11185 When ``values`` is a dict, we can pass values to check for each 

11186 column separately: 

11187 

11188 >>> df.isin({'num_wings': [0, 3]}) 

11189 num_legs num_wings 

11190 falcon False False 

11191 dog False True 

11192 

11193 When ``values`` is a Series or DataFrame, the index and columns must 

11194 match. Note that 'falcon' does not match based on the number of legs 

11195 in other. 

11196 

11197 >>> other = pd.DataFrame({'num_legs': [8, 3], 'num_wings': [0, 2]}, 

11198 ... index=['spider', 'falcon']) 

11199 >>> df.isin(other) 

11200 num_legs num_wings 

11201 falcon False True 

11202 dog False False 

11203 """ 

11204 if isinstance(values, dict): 

11205 from pandas.core.reshape.concat import concat 

11206 

11207 values = collections.defaultdict(list, values) 

11208 result = concat( 

11209 ( 

11210 self.iloc[:, [i]].isin(values[col]) 

11211 for i, col in enumerate(self.columns) 

11212 ), 

11213 axis=1, 

11214 ) 

11215 elif isinstance(values, Series): 

11216 if not values.index.is_unique: 

11217 raise ValueError("cannot compute isin with a duplicate axis.") 

11218 result = self.eq(values.reindex_like(self), axis="index") 

11219 elif isinstance(values, DataFrame): 

11220 if not (values.columns.is_unique and values.index.is_unique): 

11221 raise ValueError("cannot compute isin with a duplicate axis.") 

11222 result = self.eq(values.reindex_like(self)) 

11223 else: 

11224 if not is_list_like(values): 

11225 raise TypeError( 

11226 "only list-like or dict-like objects are allowed " 

11227 "to be passed to DataFrame.isin(), " 

11228 f"you passed a '{type(values).__name__}'" 

11229 ) 

11230 # error: Argument 2 to "isin" has incompatible type "Union[Sequence[Any], 

11231 # Mapping[Any, Any]]"; expected "Union[Union[ExtensionArray, 

11232 # ndarray[Any, Any]], Index, Series]" 

11233 result = self._constructor( 

11234 algorithms.isin( 

11235 self.values.ravel(), values # type: ignore[arg-type] 

11236 ).reshape(self.shape), 

11237 self.index, 

11238 self.columns, 

11239 copy=False, 

11240 ) 

11241 return result.__finalize__(self, method="isin") 

11242 
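# A minimal sketch of the dict branch above: wrapping ``values`` in a
# defaultdict(list) makes columns absent from the dict compare against an
# empty list, so they come out all-False instead of raising.
# >>> df = pd.DataFrame({"num_legs": [2, 4], "num_wings": [2, 0]})
# >>> df.isin({"num_wings": [0, 3]})
#    num_legs  num_wings
# 0     False      False
# 1     False       True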

11243 # ---------------------------------------------------------------------- 

11244 # Add index and columns 

11245 _AXIS_ORDERS: list[Literal["index", "columns"]] = ["index", "columns"] 

11246 _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = { 

11247 **NDFrame._AXIS_TO_AXIS_NUMBER, 

11248 1: 1, 

11249 "columns": 1, 

11250 } 

11251 _AXIS_LEN = len(_AXIS_ORDERS) 

11252 _info_axis_number: Literal[1] = 1 

11253 _info_axis_name: Literal["columns"] = "columns" 

11254 

11255 index = properties.AxisProperty( 

11256 axis=1, doc="The index (row labels) of the DataFrame." 

11257 ) 

11258 columns = properties.AxisProperty(axis=0, doc="The column labels of the DataFrame.") 

11259 

11260 # ---------------------------------------------------------------------- 

11261 # Add plotting methods to DataFrame 

11262 plot = CachedAccessor("plot", pandas.plotting.PlotAccessor) 

11263 hist = pandas.plotting.hist_frame 

11264 boxplot = pandas.plotting.boxplot_frame 

11265 sparse = CachedAccessor("sparse", SparseFrameAccessor) 

11266 

11267 # ---------------------------------------------------------------------- 

11268 # Internal Interface Methods 

11269 

11270 def _to_dict_of_blocks(self, copy: bool = True): 

11271 """ 

11272 Return a dict of dtype -> DataFrame, where each value holds only the 

11273 columns of that homogeneous dtype. 

11274 

11275 Internal ONLY - only works for BlockManager 

11276 """ 

11277 mgr = self._mgr 

11278 # convert to BlockManager if needed -> this way support ArrayManager as well 

11279 mgr = mgr_to_mgr(mgr, "block") 

11280 mgr = cast(BlockManager, mgr) 

11281 return { 

11282 k: self._constructor(v).__finalize__(self) 

11283 for k, v in mgr.to_dict(copy=copy).items() 

11284 } 

11285 
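# A brief sketch (hypothetical data): a frame with int64 and float64 columns
# splits into one homogeneous sub-frame per dtype, keyed by the dtype's
# string representation.
# >>> df = pd.DataFrame({"a": [1, 2], "b": [0.5, 1.5]})
# >>> sorted(df._to_dict_of_blocks())
# ['float64', 'int64']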

11286 @property 

11287 def values(self) -> np.ndarray: 

11288 """ 

11289 Return a Numpy representation of the DataFrame. 

11290 

11291 .. warning:: 

11292 

11293 We recommend using :meth:`DataFrame.to_numpy` instead. 

11294 

11295 Only the values in the DataFrame will be returned, the axes labels 

11296 will be removed. 

11297 

11298 Returns 

11299 ------- 

11300 numpy.ndarray 

11301 The values of the DataFrame. 

11302 

11303 See Also 

11304 -------- 

11305 DataFrame.to_numpy : Recommended alternative to this method. 

11306 DataFrame.index : Retrieve the index labels. 

11307 DataFrame.columns : Retrieve the column names. 

11308 

11309 Notes 

11310 ----- 

11311 The dtype will be a lower-common-denominator dtype (implicit 

11312 upcasting); that is to say if the dtypes (even of numeric types) 

11313 are mixed, the one that accommodates all will be chosen. Use this 

11314 with care if you are not dealing with the blocks. 

11315 

11316 e.g. If the dtypes are float16 and float32, dtype will be upcast to 

11317 float32. If dtypes are int32 and uint8, dtype will be upcast to 

11318 int32. By :func:`numpy.find_common_type` convention, mixing int64 

11319 and uint64 will result in a float64 dtype. 

11320 

11321 Examples 

11322 -------- 

11323 A DataFrame where all columns are the same type (e.g., int64) results 

11324 in an array of the same type. 

11325 

11326 >>> df = pd.DataFrame({'age': [ 3, 29], 

11327 ... 'height': [94, 170], 

11328 ... 'weight': [31, 115]}) 

11329 >>> df 

11330 age height weight 

11331 0 3 94 31 

11332 1 29 170 115 

11333 >>> df.dtypes 

11334 age int64 

11335 height int64 

11336 weight int64 

11337 dtype: object 

11338 >>> df.values 

11339 array([[ 3, 94, 31], 

11340 [ 29, 170, 115]]) 

11341 

11342 A DataFrame with mixed-type columns (e.g., str/object, int64, float32) 

11343 results in an ndarray of the broadest type that accommodates these 

11344 mixed types (e.g., object). 

11345 

11346 >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'), 

11347 ... ('lion', 80.5, 1), 

11348 ... ('monkey', np.nan, None)], 

11349 ... columns=('name', 'max_speed', 'rank')) 

11350 >>> df2.dtypes 

11351 name object 

11352 max_speed float64 

11353 rank object 

11354 dtype: object 

11355 >>> df2.values 

11356 array([['parrot', 24.0, 'second'], 

11357 ['lion', 80.5, 1], 

11358 ['monkey', nan, None]], dtype=object) 

11359 """ 

11360 return self._mgr.as_array() 

11361 

11362 @overload 

11363 def ffill( 

11364 self, 

11365 *, 

11366 axis: None | Axis = ..., 

11367 inplace: Literal[False] = ..., 

11368 limit: None | int = ..., 

11369 downcast: dict | None = ..., 

11370 ) -> DataFrame: 

11371 ... 

11372 

11373 @overload 

11374 def ffill( 

11375 self, 

11376 *, 

11377 axis: None | Axis = ..., 

11378 inplace: Literal[True], 

11379 limit: None | int = ..., 

11380 downcast: dict | None = ..., 

11381 ) -> None: 

11382 ... 

11383 

11384 @overload 

11385 def ffill( 

11386 self, 

11387 *, 

11388 axis: None | Axis = ..., 

11389 inplace: bool = ..., 

11390 limit: None | int = ..., 

11391 downcast: dict | None = ..., 

11392 ) -> DataFrame | None: 

11393 ... 

11394 

11395 def ffill( 

11396 self, 

11397 *, 

11398 axis: None | Axis = None, 

11399 inplace: bool = False, 

11400 limit: None | int = None, 

11401 downcast: dict | None = None, 

11402 ) -> DataFrame | None: 

11403 return super().ffill(axis=axis, inplace=inplace, limit=limit, downcast=downcast) 

11404 

11405 @overload 

11406 def bfill( 

11407 self, 

11408 *, 

11409 axis: None | Axis = ..., 

11410 inplace: Literal[False] = ..., 

11411 limit: None | int = ..., 

11412 downcast=..., 

11413 ) -> DataFrame: 

11414 ... 

11415 

11416 @overload 

11417 def bfill( 

11418 self, 

11419 *, 

11420 axis: None | Axis = ..., 

11421 inplace: Literal[True], 

11422 limit: None | int = ..., 

11423 downcast=..., 

11424 ) -> None: 

11425 ... 

11426 

11427 @overload 

11428 def bfill( 

11429 self, 

11430 *, 

11431 axis: None | Axis = ..., 

11432 inplace: bool = ..., 

11433 limit: None | int = ..., 

11434 downcast=..., 

11435 ) -> DataFrame | None: 

11436 ... 

11437 

11438 def bfill( 

11439 self, 

11440 *, 

11441 axis: None | Axis = None, 

11442 inplace: bool = False, 

11443 limit: None | int = None, 

11444 downcast=None, 

11445 ) -> DataFrame | None: 

11446 return super().bfill(axis=axis, inplace=inplace, limit=limit, downcast=downcast) 

11447 

11448 def clip( 

11449 self: DataFrame, 

11450 lower: float | None = None, 

11451 upper: float | None = None, 

11452 *, 

11453 axis: Axis | None = None, 

11454 inplace: bool = False, 

11455 **kwargs, 

11456 ) -> DataFrame | None: 

11457 return super().clip(lower, upper, axis=axis, inplace=inplace, **kwargs) 

11458 

11459 def interpolate( 

11460 self: DataFrame, 

11461 method: str = "linear", 

11462 *, 

11463 axis: Axis = 0, 

11464 limit: int | None = None, 

11465 inplace: bool = False, 

11466 limit_direction: str | None = None, 

11467 limit_area: str | None = None, 

11468 downcast: str | None = None, 

11469 **kwargs, 

11470 ) -> DataFrame | None: 

11471 return super().interpolate( 

11472 method=method, 

11473 axis=axis, 

11474 limit=limit, 

11475 inplace=inplace, 

11476 limit_direction=limit_direction, 

11477 limit_area=limit_area, 

11478 downcast=downcast, 

11479 **kwargs, 

11480 ) 

11481 

11482 @overload 

11483 def where( 

11484 self, 

11485 cond, 

11486 other=..., 

11487 *, 

11488 inplace: Literal[False] = ..., 

11489 axis: Axis | None = ..., 

11490 level: Level = ..., 

11491 ) -> DataFrame: 

11492 ... 

11493 

11494 @overload 

11495 def where( 

11496 self, 

11497 cond, 

11498 other=..., 

11499 *, 

11500 inplace: Literal[True], 

11501 axis: Axis | None = ..., 

11502 level: Level = ..., 

11503 ) -> None: 

11504 ... 

11505 

11506 @overload 

11507 def where( 

11508 self, 

11509 cond, 

11510 other=..., 

11511 *, 

11512 inplace: bool = ..., 

11513 axis: Axis | None = ..., 

11514 level: Level = ..., 

11515 ) -> DataFrame | None: 

11516 ... 

11517 

11518 def where( 

11519 self, 

11520 cond, 

11521 other=lib.no_default, 

11522 *, 

11523 inplace: bool = False, 

11524 axis: Axis | None = None, 

11525 level: Level = None, 

11526 ) -> DataFrame | None: 

11527 return super().where( 

11528 cond, 

11529 other, 

11530 inplace=inplace, 

11531 axis=axis, 

11532 level=level, 

11533 ) 

11534 

11535 @overload 

11536 def mask( 

11537 self, 

11538 cond, 

11539 other=..., 

11540 *, 

11541 inplace: Literal[False] = ..., 

11542 axis: Axis | None = ..., 

11543 level: Level = ..., 

11544 ) -> DataFrame: 

11545 ... 

11546 

11547 @overload 

11548 def mask( 

11549 self, 

11550 cond, 

11551 other=..., 

11552 *, 

11553 inplace: Literal[True], 

11554 axis: Axis | None = ..., 

11555 level: Level = ..., 

11556 ) -> None: 

11557 ... 

11558 

11559 @overload 

11560 def mask( 

11561 self, 

11562 cond, 

11563 other=..., 

11564 *, 

11565 inplace: bool = ..., 

11566 axis: Axis | None = ..., 

11567 level: Level = ..., 

11568 ) -> DataFrame | None: 

11569 ... 

11570 

11571 def mask( 

11572 self, 

11573 cond, 

11574 other=lib.no_default, 

11575 *, 

11576 inplace: bool = False, 

11577 axis: Axis | None = None, 

11578 level: Level = None, 

11579 ) -> DataFrame | None: 

11580 return super().mask( 

11581 cond, 

11582 other, 

11583 inplace=inplace, 

11584 axis=axis, 

11585 level=level, 

11586 ) 

11587 
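# A small sketch of the where/mask duality: ``mask`` replaces entries where
# the condition holds, so ``df.mask(cond, other)`` matches
# ``df.where(~cond, other)``.
# >>> df = pd.DataFrame({"a": [1, 2, 3], "b": [3, 2, 1]})
# >>> df.mask(df > 1, 0).equals(df.where(~(df > 1), 0))
# True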

11588 

11589DataFrame._add_numeric_operations() 

11590 

11591ops.add_flex_arithmetic_methods(DataFrame) 

11592 

11593 

11594def _from_nested_dict(data) -> collections.defaultdict: 

11595 new_data: collections.defaultdict = collections.defaultdict(dict) 

11596 for index, s in data.items(): 

11597 for col, v in s.items(): 

11598 new_data[col][index] = v 

11599 return new_data 

11600 
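# A quick sketch: the nested {index: {column: value}} layout is flipped to
# {column: {index: value}}, the orientation the DataFrame constructor expects.
# >>> _from_nested_dict({"row1": {"a": 1}, "row2": {"a": 2, "b": 3}})
# defaultdict(<class 'dict'>, {'a': {'row1': 1, 'row2': 2}, 'b': {'row2': 3}})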

11601 

11602def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike: 

11603 # reindex if necessary 

11604 

11605 if value.index.equals(index) or not len(index): 

11606 return value._values.copy() 

11607 

11608 # GH#4107 

11609 try: 

11610 reindexed_value = value.reindex(index)._values 

11611 except ValueError as err: 

11612 # raised in MultiIndex.from_tuples, see test_insert_error_msmgs 

11613 if not value.index.is_unique: 

11614 # duplicate axis 

11615 raise err 

11616 

11617 raise TypeError( 

11618 "incompatible index of inserted column with frame index" 

11619 ) from err 

11620 return reindexed_value
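# A closing sketch (hypothetical data) of the alignment this helper provides
# when a Series is assigned into a frame: the value is reindexed to the
# frame's index, and missing labels become NaN.
# >>> df = pd.DataFrame({"a": [1, 2]}, index=["x", "y"])
# >>> df["b"] = pd.Series([10.0], index=["y"])
# >>> df
#    a     b
# x  1   NaN
# y  2  10.0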