1"""
2Base and utility classes for pandas objects.
3"""
5from __future__ import annotations
7import textwrap
8from typing import (
9 TYPE_CHECKING,
10 Any,
11 Generic,
12 Hashable,
13 Iterator,
14 Literal,
15 TypeVar,
16 cast,
17 final,
18 overload,
19)
21import numpy as np
23from pandas._config import using_copy_on_write
25from pandas._libs import lib
26from pandas._typing import (
27 Axis,
28 AxisInt,
29 DtypeObj,
30 IndexLabel,
31 NDFrameT,
32 Shape,
33 npt,
34)
35from pandas.compat import PYPY
36from pandas.compat.numpy import function as nv
37from pandas.errors import AbstractMethodError
38from pandas.util._decorators import (
39 cache_readonly,
40 doc,
41)
43from pandas.core.dtypes.cast import can_hold_element
44from pandas.core.dtypes.common import (
45 is_categorical_dtype,
46 is_dict_like,
47 is_extension_array_dtype,
48 is_object_dtype,
49 is_scalar,
50)
51from pandas.core.dtypes.generic import (
52 ABCDataFrame,
53 ABCIndex,
54 ABCSeries,
55)
56from pandas.core.dtypes.missing import (
57 isna,
58 remove_na_arraylike,
59)
61from pandas.core import (
62 algorithms,
63 nanops,
64 ops,
65)
66from pandas.core.accessor import DirNamesMixin
67from pandas.core.arraylike import OpsMixin
68from pandas.core.arrays import ExtensionArray
69from pandas.core.construction import (
70 ensure_wrapped_if_datetimelike,
71 extract_array,
72)
74if TYPE_CHECKING:
75 from pandas._typing import (
76 DropKeep,
77 NumpySorter,
78 NumpyValueArrayLike,
79 ScalarLike_co,
80 )
82 from pandas import (
83 Categorical,
84 Index,
85 Series,
86 )
89_shared_docs: dict[str, str] = {}
90_indexops_doc_kwargs = {
91 "klass": "IndexOpsMixin",
92 "inplace": "",
93 "unique": "IndexOpsMixin",
94 "duplicated": "IndexOpsMixin",
95}
97_T = TypeVar("_T", bound="IndexOpsMixin")


class PandasObject(DirNamesMixin):
    """
    Base class for various pandas objects.
    """

    # results from calls to methods decorated with cache_readonly get added to _cache
    _cache: dict[str, Any]

    @property
    def _constructor(self):
        """
        Class constructor (for this class it's just `__class__`).
        """
        return type(self)

    def __repr__(self) -> str:
        """
        Return a string representation for a particular object.
        """
        # Should be overwritten by base classes
        return object.__repr__(self)

    def _reset_cache(self, key: str | None = None) -> None:
        """
        Reset cached properties. If ``key`` is passed, only clears that key.
        """
        if not hasattr(self, "_cache"):
            return
        if key is None:
            self._cache.clear()
        else:
            self._cache.pop(key, None)

    def __sizeof__(self) -> int:
        """
        Generates the total memory usage for an object that returns
        either a value or a Series of values.
        """
        memory_usage = getattr(self, "memory_usage", None)
        if memory_usage:
            mem = memory_usage(deep=True)  # pylint: disable=not-callable
            return int(mem if is_scalar(mem) else mem.sum())

        # no memory_usage attribute, so fall back to object's 'sizeof'
        return super().__sizeof__()
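

# Illustrative sketch, not part of the original pandas source: how results
# cached by ``cache_readonly`` land in ``_cache`` and how ``_reset_cache``
# clears them. ``_CachedDemo`` and ``_demo_reset_cache`` are hypothetical
# names used only for this example.
def _demo_reset_cache() -> None:
    class _CachedDemo(PandasObject):
        @cache_readonly
        def answer(self) -> int:
            # computed once; the result is stored in self._cache["answer"]
            return 42

    obj = _CachedDemo()
    assert obj.answer == 42
    assert "answer" in obj._cache  # populated on first access
    obj._reset_cache("answer")  # clears only this key
    assert "answer" not in obj._cache
    obj._reset_cache()  # no key: clears the whole cache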


class NoNewAttributesMixin:
    """
    Mixin which prevents adding new attributes.

    Prevents additional attributes via xxx.attribute = "something" after a
    call to `self._freeze()`. Mainly used to prevent the user from using
    wrong attributes on an accessor (`Series.cat/.str/.dt`).

    If you really want to add a new attribute at a later time, you need to use
    `object.__setattr__(self, key, value)`.
    """

    def _freeze(self) -> None:
        """
        Prevents setting additional attributes.
        """
        object.__setattr__(self, "__frozen", True)

    # prevent adding any attribute via s.xxx.new_attribute = ...
    def __setattr__(self, key: str, value) -> None:
        # _cache is used by a decorator
        # We need to check both 1.) cls.__dict__ and 2.) getattr(self, key)
        # because
        # 1.) getattr is false for attributes that raise errors
        # 2.) cls.__dict__ doesn't traverse into base classes
        if getattr(self, "__frozen", False) and not (
            key == "_cache"
            or key in type(self).__dict__
            or getattr(self, key, None) is not None
        ):
            raise AttributeError(f"You cannot add any new attribute '{key}'")
        object.__setattr__(self, key, value)
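

# Illustrative sketch, not part of the original pandas source: a minimal
# demonstration of the freeze behavior for a hypothetical subclass.
# ``_FrozenDemo`` and ``_demo_freeze`` are names used only for this example.
def _demo_freeze() -> None:
    class _FrozenDemo(NoNewAttributesMixin):
        def __init__(self) -> None:
            self.allowed = 1  # setting attributes is fine before freezing
            self._freeze()

    obj = _FrozenDemo()
    obj.allowed = 2  # existing attributes can still be reassigned
    try:
        obj.typo = 3  # new attributes raise after _freeze()
    except AttributeError:
        pass
    # the documented escape hatch still works:
    object.__setattr__(obj, "typo", 3)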


class SelectionMixin(Generic[NDFrameT]):
    """
    Mixin implementing the selection & aggregation interface on a group-like
    object. Sub-classes need to define: obj, exclusions.
    """

    obj: NDFrameT
    _selection: IndexLabel | None = None
    exclusions: frozenset[Hashable]
    _internal_names = ["_cache", "__setstate__"]
    _internal_names_set = set(_internal_names)

    @final
    @property
    def _selection_list(self):
        if not isinstance(
            self._selection, (list, tuple, ABCSeries, ABCIndex, np.ndarray)
        ):
            return [self._selection]
        return self._selection

    @cache_readonly
    def _selected_obj(self):
        if self._selection is None or isinstance(self.obj, ABCSeries):
            return self.obj
        else:
            return self.obj[self._selection]

    @final
    @cache_readonly
    def ndim(self) -> int:
        return self._selected_obj.ndim

    @final
    @cache_readonly
    def _obj_with_exclusions(self):
        if isinstance(self.obj, ABCSeries):
            return self.obj

        if self._selection is not None:
            return self.obj._getitem_nocopy(self._selection_list)

        if len(self.exclusions) > 0:
            # equivalent to `self.obj.drop(self.exclusions, axis=1)`
            # but this avoids consolidating and making a copy
            # TODO: following GH#45287 can we now use .drop directly without
            #  making a copy?
            return self.obj._drop_axis(self.exclusions, axis=1, only_slice=True)
        else:
            return self.obj

    def __getitem__(self, key):
        if self._selection is not None:
            raise IndexError(f"Column(s) {self._selection} already selected")

        if isinstance(key, (list, tuple, ABCSeries, ABCIndex, np.ndarray)):
            if len(self.obj.columns.intersection(key)) != len(set(key)):
                bad_keys = list(set(key).difference(self.obj.columns))
                raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}")
            return self._gotitem(list(key), ndim=2)

        else:
            if key not in self.obj:
                raise KeyError(f"Column not found: {key}")
            ndim = self.obj[key].ndim
            return self._gotitem(key, ndim=ndim)

    def _gotitem(self, key, ndim: int, subset=None):
        """
        Sub-classes to define. Return a sliced object.

        Parameters
        ----------
        key : str / list of selections
        ndim : {1, 2}
            requested ndim of result
        subset : object, default None
            subset to act on
        """
        raise AbstractMethodError(self)

    def aggregate(self, func, *args, **kwargs):
        raise AbstractMethodError(self)

    agg = aggregate
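

# Illustrative sketch, not part of the original pandas source: SelectionMixin
# is the machinery behind ``df.groupby(...)["col"]``-style selection. A minimal
# hypothetical subclass (``_Selector``, illustration only) needs ``obj``,
# ``exclusions`` and ``_gotitem``.
def _demo_selection() -> None:
    from pandas import DataFrame

    class _Selector(SelectionMixin):
        def __init__(self, obj) -> None:
            self.obj = obj
            self.exclusions = frozenset()

        def _gotitem(self, key, ndim: int, subset=None):
            # real subclasses return a new grouper-like object here;
            # for illustration we just echo the selection
            return key

    sel = _Selector(DataFrame({"a": [1], "b": [2]}))
    assert sel["a"] == "a"  # single column -> ndim 1
    assert sel[["a", "b"]] == ["a", "b"]  # list of columns -> ndim 2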


class IndexOpsMixin(OpsMixin):
    """
    Common ops mixin to support a unified interface / docs for Series / Index
    """

    # ndarray compatibility
    __array_priority__ = 1000
    _hidden_attrs: frozenset[str] = frozenset(
        ["tolist"]  # tolist is not deprecated, just suppressed in the __dir__
    )

    @property
    def dtype(self) -> DtypeObj:
        # must be defined here as a property for mypy
        raise AbstractMethodError(self)

    @property
    def _values(self) -> ExtensionArray | np.ndarray:
        # must be defined here as a property for mypy
        raise AbstractMethodError(self)

    @final
    def transpose(self: _T, *args, **kwargs) -> _T:
        """
        Return the transpose, which is by definition self.

        Returns
        -------
        %(klass)s
        """
        nv.validate_transpose(args, kwargs)
        return self

    T = property(
        transpose,
        doc="""
        Return the transpose, which is by definition self.
        """,
    )

    @property
    def shape(self) -> Shape:
        """
        Return a tuple of the shape of the underlying data.

        Examples
        --------
        >>> s = pd.Series([1, 2, 3])
        >>> s.shape
        (3,)
        """
        return self._values.shape

    def __len__(self) -> int:
        # We need this defined here for mypy
        raise AbstractMethodError(self)

    @property
    def ndim(self) -> Literal[1]:
        """
        Number of dimensions of the underlying data, by definition 1.
        """
        return 1

    @final
    def item(self):
        """
        Return the first element of the underlying data as a Python scalar.

        Returns
        -------
        scalar
            The first element of %(klass)s.

        Raises
        ------
        ValueError
            If the data is not length-1.
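
        Examples
        --------
        For a length-1 Series or Index, the single element is returned:

        >>> s = pd.Series([1])
        >>> s.item()
        1

        >>> idx = pd.Index([1])
        >>> idx.item()
        1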
347 """
348 if len(self) == 1:
349 return next(iter(self))
350 raise ValueError("can only convert an array of size 1 to a Python scalar")
352 @property
353 def nbytes(self) -> int:
354 """
355 Return the number of bytes in the underlying data.
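
        Examples
        --------
        For three int64 values, eight bytes each:

        >>> s = pd.Series([1, 2, 3], dtype="int64")
        >>> s.nbytes
        24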
356 """
357 return self._values.nbytes
359 @property
360 def size(self) -> int:
361 """
362 Return the number of elements in the underlying data.
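
        Examples
        --------
        >>> s = pd.Series([1, 2, 3])
        >>> s.size
        3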
363 """
364 return len(self._values)
366 @property
367 def array(self) -> ExtensionArray:
368 """
369 The ExtensionArray of the data backing this Series or Index.
371 Returns
372 -------
373 ExtensionArray
374 An ExtensionArray of the values stored within. For extension
375 types, this is the actual array. For NumPy native types, this
376 is a thin (no copy) wrapper around :class:`numpy.ndarray`.
378 ``.array`` differs ``.values`` which may require converting the
379 data to a different form.
381 See Also
382 --------
383 Index.to_numpy : Similar method that always returns a NumPy array.
384 Series.to_numpy : Similar method that always returns a NumPy array.
386 Notes
387 -----
388 This table lays out the different array types for each extension
389 dtype within pandas.
391 ================== =============================
392 dtype array type
393 ================== =============================
394 category Categorical
395 period PeriodArray
396 interval IntervalArray
397 IntegerNA IntegerArray
398 string StringArray
399 boolean BooleanArray
400 datetime64[ns, tz] DatetimeArray
401 ================== =============================
403 For any 3rd-party extension types, the array type will be an
404 ExtensionArray.
406 For all remaining dtypes ``.array`` will be a
407 :class:`arrays.NumpyExtensionArray` wrapping the actual ndarray
408 stored within. If you absolutely need a NumPy array (possibly with
409 copying / coercing data), then use :meth:`Series.to_numpy` instead.
411 Examples
412 --------
413 For regular NumPy types like int, and float, a PandasArray
414 is returned.
416 >>> pd.Series([1, 2, 3]).array
417 <PandasArray>
418 [1, 2, 3]
419 Length: 3, dtype: int64
421 For extension types, like Categorical, the actual ExtensionArray
422 is returned
424 >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
425 >>> ser.array
426 ['a', 'b', 'a']
427 Categories (2, object): ['a', 'b']
428 """
429 raise AbstractMethodError(self)

    @final
    def to_numpy(
        self,
        dtype: npt.DTypeLike | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
        **kwargs,
    ) -> np.ndarray:
        """
        A NumPy ndarray representing the values in this Series or Index.

        Parameters
        ----------
        dtype : str or numpy.dtype, optional
            The dtype to pass to :meth:`numpy.asarray`.
        copy : bool, default False
            Whether to ensure that the returned value is not a view on
            another array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
            a copy is made, even if not strictly necessary.
        na_value : Any, optional
            The value to use for missing values. The default value depends
            on `dtype` and the type of the array.
        **kwargs
            Additional keywords passed through to the ``to_numpy`` method
            of the underlying array (for extension arrays).

        Returns
        -------
        numpy.ndarray

        See Also
        --------
        Series.array : Get the actual data stored within.
        Index.array : Get the actual data stored within.
        DataFrame.to_numpy : Similar method for DataFrame.

        Notes
        -----
        The returned array will be the same up to equality (values equal
        in `self` will be equal in the returned array; likewise for values
        that are not equal). When `self` contains an ExtensionArray, the
        dtype may be different. For example, for a category-dtype Series,
        ``to_numpy()`` will return a NumPy array and the categorical dtype
        will be lost.

        For NumPy dtypes, this will be a reference to the actual data stored
        in this Series or Index (assuming ``copy=False``). Modifying the result
        in place will modify the data stored in the Series or Index (not that
        we recommend doing that).

        For extension types, ``to_numpy()`` *may* require copying data and
        coercing the result to a NumPy type (possibly object), which may be
        expensive. When you need a no-copy reference to the underlying data,
        :attr:`Series.array` should be used instead.

        This table lays out the different dtypes and default return types of
        ``to_numpy()`` for various dtypes within pandas.

        ================== ================================
        dtype              array type
        ================== ================================
        category[T]        ndarray[T] (same dtype as input)
        period             ndarray[object] (Periods)
        interval           ndarray[object] (Intervals)
        IntegerNA          ndarray[object]
        datetime64[ns]     datetime64[ns]
        datetime64[ns, tz] ndarray[object] (Timestamps)
        ================== ================================

        Examples
        --------
        >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
        >>> ser.to_numpy()
        array(['a', 'b', 'a'], dtype=object)

        Specify the `dtype` to control how datetime-aware data is represented.
        Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp`
        objects, each with the correct ``tz``.

        >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
        >>> ser.to_numpy(dtype=object)
        array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'),
               Timestamp('2000-01-02 00:00:00+0100', tz='CET')],
              dtype=object)

        Or ``dtype='datetime64[ns]'`` to return an ndarray of native
        datetime64 values. The values are converted to UTC and the timezone
        info is dropped.

        >>> ser.to_numpy(dtype="datetime64[ns]")
        ... # doctest: +ELLIPSIS
        array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
              dtype='datetime64[ns]')
        """
        if is_extension_array_dtype(self.dtype):
            return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs)
        elif kwargs:
            bad_keys = list(kwargs.keys())[0]
            raise TypeError(
                f"to_numpy() got an unexpected keyword argument '{bad_keys}'"
            )

        if na_value is not lib.no_default:
            values = self._values
            if not can_hold_element(values, na_value):
                # if we can't hold the na_value asarray either makes a copy or we
                # error before modifying values. The asarray later on thus won't make
                # another copy
                values = np.asarray(values, dtype=dtype)
            else:
                values = values.copy()

            values[np.asanyarray(self.isna())] = na_value
        else:
            values = self._values

        result = np.asarray(values, dtype=dtype)

        if (copy and na_value is lib.no_default) or (
            not copy and using_copy_on_write()
        ):
            if np.shares_memory(self._values[:2], result[:2]):
                # Take slices to improve performance of check
                if using_copy_on_write() and not copy:
                    result = result.view()
                    result.flags.writeable = False
                else:
                    result = result.copy()

        return result

    @final
    @property
    def empty(self) -> bool:
        return not self.size

    def max(self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs):
        """
        Return the maximum value of the Index.

        Parameters
        ----------
        axis : int, optional
            For compatibility with NumPy. Only 0 or None are allowed.
        skipna : bool, default True
            Exclude NA/null values when showing the result.
        *args, **kwargs
            Additional arguments and keywords for compatibility with NumPy.

        Returns
        -------
        scalar
            Maximum value.

        See Also
        --------
        Index.min : Return the minimum value in an Index.
        Series.max : Return the maximum value in a Series.
        DataFrame.max : Return the maximum values in a DataFrame.

        Examples
        --------
        >>> idx = pd.Index([3, 2, 1])
        >>> idx.max()
        3

        >>> idx = pd.Index(['c', 'b', 'a'])
        >>> idx.max()
        'c'

        For a MultiIndex, the maximum is determined lexicographically.

        >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])
        >>> idx.max()
        ('b', 2)
        """
        nv.validate_minmax_axis(axis)
        nv.validate_max(args, kwargs)
        return nanops.nanmax(self._values, skipna=skipna)

    @doc(op="max", oppose="min", value="largest")
    def argmax(
        self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs
    ) -> int:
        """
        Return int position of the {value} value in the Series.

        If the {op}imum is achieved in multiple locations,
        the first row position is returned.

        Parameters
        ----------
        axis : {{None}}
            Unused. Parameter needed for compatibility with DataFrame.
        skipna : bool, default True
            Exclude NA/null values when showing the result.
        *args, **kwargs
            Additional arguments and keywords for compatibility with NumPy.

        Returns
        -------
        int
            Row position of the {op}imum value.

        See Also
        --------
        Series.arg{op} : Return position of the {op}imum value.
        Series.arg{oppose} : Return position of the {oppose}imum value.
        numpy.ndarray.arg{op} : Equivalent method for numpy arrays.
        Series.idxmax : Return index label of the maximum values.
        Series.idxmin : Return index label of the minimum values.

        Examples
        --------
        Consider dataset containing cereal calories

        >>> s = pd.Series({{'Corn Flakes': 100.0, 'Almond Delight': 110.0,
        ...                'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0}})
        >>> s
        Corn Flakes              100.0
        Almond Delight           110.0
        Cinnamon Toast Crunch    120.0
        Cocoa Puff               110.0
        dtype: float64

        >>> s.argmax()
        2
        >>> s.argmin()
        0

        The maximum cereal calories is the third element and
        the minimum cereal calories is the first element,
        since series is zero-indexed.
        """
        delegate = self._values
        nv.validate_minmax_axis(axis)
        skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs)

        if isinstance(delegate, ExtensionArray):
            if not skipna and delegate.isna().any():
                return -1
            else:
                return delegate.argmax()
        else:
            # error: Incompatible return value type (got "Union[int, ndarray]", expected
            # "int")
            return nanops.nanargmax(  # type: ignore[return-value]
                delegate, skipna=skipna
            )

    def min(self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs):
        """
        Return the minimum value of the Index.

        Parameters
        ----------
        axis : {None}
            Dummy argument for consistency with Series.
        skipna : bool, default True
            Exclude NA/null values when showing the result.
        *args, **kwargs
            Additional arguments and keywords for compatibility with NumPy.

        Returns
        -------
        scalar
            Minimum value.

        See Also
        --------
        Index.max : Return the maximum value of the object.
        Series.min : Return the minimum value in a Series.
        DataFrame.min : Return the minimum values in a DataFrame.

        Examples
        --------
        >>> idx = pd.Index([3, 2, 1])
        >>> idx.min()
        1

        >>> idx = pd.Index(['c', 'b', 'a'])
        >>> idx.min()
        'a'

        For a MultiIndex, the minimum is determined lexicographically.

        >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])
        >>> idx.min()
        ('a', 1)
        """
        nv.validate_minmax_axis(axis)
        nv.validate_min(args, kwargs)
        return nanops.nanmin(self._values, skipna=skipna)

    @doc(argmax, op="min", oppose="max", value="smallest")
    def argmin(
        self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs
    ) -> int:
        delegate = self._values
        nv.validate_minmax_axis(axis)
        skipna = nv.validate_argmin_with_skipna(skipna, args, kwargs)

        if isinstance(delegate, ExtensionArray):
            if not skipna and delegate.isna().any():
                return -1
            else:
                return delegate.argmin()
        else:
            # error: Incompatible return value type (got "Union[int, ndarray]", expected
            # "int")
            return nanops.nanargmin(  # type: ignore[return-value]
                delegate, skipna=skipna
            )

    def tolist(self):
        """
        Return a list of the values.

        These are each a scalar type, which is a Python scalar
        (for str, int, float) or a pandas scalar
        (for Timestamp/Timedelta/Interval/Period).

        Returns
        -------
        list

        See Also
        --------
        numpy.ndarray.tolist : Return the array as an a.ndim-levels deep
            nested list of Python scalars.
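
        Examples
        --------
        >>> s = pd.Series([1, 2, 3])
        >>> s.tolist()
        [1, 2, 3]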
762 """
763 return self._values.tolist()
765 to_list = tolist
767 def __iter__(self) -> Iterator:
768 """
769 Return an iterator of the values.
771 These are each a scalar type, which is a Python scalar
772 (for str, int, float) or a pandas scalar
773 (for Timestamp/Timedelta/Interval/Period)
775 Returns
776 -------
777 iterator
778 """
779 # We are explicitly making element iterators.
780 if not isinstance(self._values, np.ndarray):
781 # Check type instead of dtype to catch DTA/TDA
782 return iter(self._values)
783 else:
784 return map(self._values.item, range(self._values.size))
786 @cache_readonly
787 def hasnans(self) -> bool:
788 """
789 Return True if there are any NaNs.
791 Enables various performance speedups.
793 Returns
794 -------
795 bool
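
        Examples
        --------
        >>> s = pd.Series([1, 2, 3, None])
        >>> s.hasnans
        True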
796 """
797 # error: Item "bool" of "Union[bool, ndarray[Any, dtype[bool_]], NDFrame]"
798 # has no attribute "any"
799 return bool(isna(self).any()) # type: ignore[union-attr]
801 def isna(self) -> npt.NDArray[np.bool_]:
802 return isna(self._values)
804 def _reduce(
805 self,
806 op,
807 name: str,
808 *,
809 axis: Axis = 0,
810 skipna: bool = True,
811 numeric_only=None,
812 filter_type=None,
813 **kwds,
814 ):
815 """
816 Perform the reduction type operation if we can.
817 """
818 func = getattr(self, name, None)
819 if func is None:
820 raise TypeError(
821 f"{type(self).__name__} cannot perform the operation {name}"
822 )
823 return func(skipna=skipna, **kwds)
825 @final
826 def _map_values(self, mapper, na_action=None):
827 """
828 An internal function that maps values using the input
829 correspondence (which can be a dict, Series, or function).
831 Parameters
832 ----------
833 mapper : function, dict, or Series
834 The input correspondence object
835 na_action : {None, 'ignore'}
836 If 'ignore', propagate NA values, without passing them to the
837 mapping function
839 Returns
840 -------
841 Union[Index, MultiIndex], inferred
842 The output of the mapping function applied to the index.
843 If the function returns a tuple with more than one element
844 a MultiIndex will be returned.
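
        Examples
        --------
        Exercised via the public ``Series.map`` wrapper; keys missing from a
        dict mapper become NaN:

        >>> pd.Series(['cat', 'dog']).map({'cat': 'kitten'})
        0    kitten
        1       NaN
        dtype: object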
845 """
846 # we can fastpath dict/Series to an efficient map
847 # as we know that we are not going to have to yield
848 # python types
849 if is_dict_like(mapper):
850 if isinstance(mapper, dict) and hasattr(mapper, "__missing__"):
851 # If a dictionary subclass defines a default value method,
852 # convert mapper to a lookup function (GH #15999).
853 dict_with_default = mapper
854 mapper = lambda x: dict_with_default[
855 np.nan if isinstance(x, float) and np.isnan(x) else x
856 ]
857 else:
858 # Dictionary does not have a default. Thus it's safe to
859 # convert to an Series for efficiency.
860 # we specify the keys here to handle the
861 # possibility that they are tuples
863 # The return value of mapping with an empty mapper is
864 # expected to be pd.Series(np.nan, ...). As np.nan is
865 # of dtype float64 the return value of this method should
866 # be float64 as well
867 from pandas import Series
869 if len(mapper) == 0:
870 mapper = Series(mapper, dtype=np.float64)
871 else:
872 mapper = Series(mapper)
874 if isinstance(mapper, ABCSeries):
875 if na_action not in (None, "ignore"):
876 msg = (
877 "na_action must either be 'ignore' or None, "
878 f"{na_action} was passed"
879 )
880 raise ValueError(msg)
882 if na_action == "ignore":
883 mapper = mapper[mapper.index.notna()]
885 # Since values were input this means we came from either
886 # a dict or a series and mapper should be an index
887 if is_categorical_dtype(self.dtype):
888 # use the built in categorical series mapper which saves
889 # time by mapping the categories instead of all values
891 cat = cast("Categorical", self._values)
892 return cat.map(mapper)
894 values = self._values
896 indexer = mapper.index.get_indexer(values)
897 new_values = algorithms.take_nd(mapper._values, indexer)
899 return new_values
901 # we must convert to python types
902 if is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"):
903 # GH#23179 some EAs do not have `map`
904 values = self._values
905 if na_action is not None:
906 raise NotImplementedError
907 map_f = lambda values, f: values.map(f)
908 else:
909 values = self._values.astype(object)
910 if na_action == "ignore":
911 map_f = lambda values, f: lib.map_infer_mask(
912 values, f, isna(values).view(np.uint8)
913 )
914 elif na_action is None:
915 map_f = lib.map_infer
916 else:
917 msg = (
918 "na_action must either be 'ignore' or None, "
919 f"{na_action} was passed"
920 )
921 raise ValueError(msg)
923 # mapper is a function
924 new_values = map_f(values, mapper)
926 return new_values

    @final
    def value_counts(
        self,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        bins=None,
        dropna: bool = True,
    ) -> Series:
        """
        Return a Series containing counts of unique values.

        The resulting object will be in descending order so that the
        first element is the most frequently-occurring element.
        Excludes NA values by default.

        Parameters
        ----------
        normalize : bool, default False
            If True then the object returned will contain the relative
            frequencies of the unique values.
        sort : bool, default True
            Sort by frequencies.
        ascending : bool, default False
            Sort in ascending order.
        bins : int, optional
            Rather than count values, group them into half-open bins,
            a convenience for ``pd.cut``, only works with numeric data.
        dropna : bool, default True
            Don't include counts of NaN.

        Returns
        -------
        Series

        See Also
        --------
        Series.count: Number of non-NA elements in a Series.
        DataFrame.count: Number of non-NA elements in a DataFrame.
        DataFrame.value_counts: Equivalent method on DataFrames.

        Examples
        --------
        >>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
        >>> index.value_counts()
        3.0    2
        1.0    1
        2.0    1
        4.0    1
        Name: count, dtype: int64

        With `normalize` set to `True`, returns the relative frequency by
        dividing all values by the sum of values.

        >>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
        >>> s.value_counts(normalize=True)
        3.0    0.4
        1.0    0.2
        2.0    0.2
        4.0    0.2
        Name: proportion, dtype: float64

        **bins**

        Bins can be useful for going from a continuous variable to a
        categorical variable; instead of counting unique
        apparitions of values, divide the index in the specified
        number of half-open bins.

        >>> s.value_counts(bins=3)
        (0.996, 2.0]    2
        (2.0, 3.0]      2
        (3.0, 4.0]      1
        Name: count, dtype: int64

        **dropna**

        With `dropna` set to `False` we can also see NaN index values.

        >>> s.value_counts(dropna=False)
        3.0    2
        1.0    1
        2.0    1
        4.0    1
        NaN    1
        Name: count, dtype: int64
        """
        return algorithms.value_counts(
            self,
            sort=sort,
            ascending=ascending,
            normalize=normalize,
            bins=bins,
            dropna=dropna,
        )

    def unique(self):
        values = self._values
        if not isinstance(values, np.ndarray):
            # i.e. ExtensionArray
            result = values.unique()
        else:
            result = algorithms.unique1d(values)
        return result

    @final
    def nunique(self, dropna: bool = True) -> int:
        """
        Return number of unique elements in the object.

        Excludes NA values by default.

        Parameters
        ----------
        dropna : bool, default True
            Don't include NaN in the count.

        Returns
        -------
        int

        See Also
        --------
        DataFrame.nunique: Method nunique for DataFrame.
        Series.count: Count non-NA/null observations in the Series.

        Examples
        --------
        >>> s = pd.Series([1, 3, 5, 7, 7])
        >>> s
        0    1
        1    3
        2    5
        3    7
        4    7
        dtype: int64

        >>> s.nunique()
        4
        """
        uniqs = self.unique()
        if dropna:
            uniqs = remove_na_arraylike(uniqs)
        return len(uniqs)

    @property
    def is_unique(self) -> bool:
        """
        Return boolean if values in the object are unique.

        Returns
        -------
        bool
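
        Examples
        --------
        >>> s = pd.Series([1, 2, 3])
        >>> s.is_unique
        True

        >>> s = pd.Series([1, 2, 3, 1])
        >>> s.is_unique
        False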
1081 """
1082 return self.nunique(dropna=False) == len(self)
1084 @property
1085 def is_monotonic_increasing(self) -> bool:
1086 """
1087 Return boolean if values in the object are monotonically increasing.
1089 Returns
1090 -------
1091 bool
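
        Examples
        --------
        Repeated values are still monotonic (the check is non-strict):

        >>> s = pd.Series([1, 2, 2])
        >>> s.is_monotonic_increasing
        True

        >>> s = pd.Series([3, 2, 1])
        >>> s.is_monotonic_increasing
        False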
1092 """
1093 from pandas import Index
1095 return Index(self).is_monotonic_increasing
1097 @property
1098 def is_monotonic_decreasing(self) -> bool:
1099 """
1100 Return boolean if values in the object are monotonically decreasing.
1102 Returns
1103 -------
1104 bool
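
        Examples
        --------
        Repeated values are still monotonic (the check is non-strict):

        >>> s = pd.Series([3, 2, 2])
        >>> s.is_monotonic_decreasing
        True

        >>> s = pd.Series([1, 2, 3])
        >>> s.is_monotonic_decreasing
        False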
1105 """
1106 from pandas import Index
1108 return Index(self).is_monotonic_decreasing
1110 @final
1111 def _memory_usage(self, deep: bool = False) -> int:
1112 """
1113 Memory usage of the values.
1115 Parameters
1116 ----------
1117 deep : bool, default False
1118 Introspect the data deeply, interrogate
1119 `object` dtypes for system-level memory consumption.
1121 Returns
1122 -------
1123 bytes used
1125 See Also
1126 --------
1127 numpy.ndarray.nbytes : Total bytes consumed by the elements of the
1128 array.
1130 Notes
1131 -----
1132 Memory usage does not include memory consumed by elements that
1133 are not components of the array if deep=False or if used on PyPy
1134 """
1135 if hasattr(self.array, "memory_usage"):
1136 return self.array.memory_usage( # pyright: ignore[reportGeneralTypeIssues]
1137 deep=deep,
1138 )
1140 v = self.array.nbytes
1141 if deep and is_object_dtype(self) and not PYPY:
1142 values = cast(np.ndarray, self._values)
1143 v += lib.memory_usage_of_objects(values)
1144 return v
1146 @doc(
1147 algorithms.factorize,
1148 values="",
1149 order="",
1150 size_hint="",
1151 sort=textwrap.dedent(
1152 """\
1153 sort : bool, default False
1154 Sort `uniques` and shuffle `codes` to maintain the
1155 relationship.
1156 """
1157 ),
1158 )
1159 def factorize(
1160 self,
1161 sort: bool = False,
1162 use_na_sentinel: bool = True,
1163 ) -> tuple[npt.NDArray[np.intp], Index]:
1164 codes, uniques = algorithms.factorize(
1165 self._values, sort=sort, use_na_sentinel=use_na_sentinel
1166 )
1167 if uniques.dtype == np.float16:
1168 uniques = uniques.astype(np.float32)
1170 if isinstance(self, ABCIndex):
1171 # preserve e.g. MultiIndex
1172 uniques = self._constructor(uniques)
1173 else:
1174 from pandas import Index
1176 uniques = Index(uniques)
1177 return codes, uniques
1179 _shared_docs[
1180 "searchsorted"
1181 ] = """
1182 Find indices where elements should be inserted to maintain order.
1184 Find the indices into a sorted {klass} `self` such that, if the
1185 corresponding elements in `value` were inserted before the indices,
1186 the order of `self` would be preserved.
1188 .. note::
1190 The {klass} *must* be monotonically sorted, otherwise
1191 wrong locations will likely be returned. Pandas does *not*
1192 check this for you.
1194 Parameters
1195 ----------
1196 value : array-like or scalar
1197 Values to insert into `self`.
1198 side : {{'left', 'right'}}, optional
1199 If 'left', the index of the first suitable location found is given.
1200 If 'right', return the last such index. If there is no suitable
1201 index, return either 0 or N (where N is the length of `self`).
1202 sorter : 1-D array-like, optional
1203 Optional array of integer indices that sort `self` into ascending
1204 order. They are typically the result of ``np.argsort``.
1206 Returns
1207 -------
1208 int or array of int
1209 A scalar or array of insertion points with the
1210 same shape as `value`.
1212 See Also
1213 --------
1214 sort_values : Sort by the values along either axis.
1215 numpy.searchsorted : Similar method from NumPy.
1217 Notes
1218 -----
1219 Binary search is used to find the required insertion points.
1221 Examples
1222 --------
1223 >>> ser = pd.Series([1, 2, 3])
1224 >>> ser
1225 0 1
1226 1 2
1227 2 3
1228 dtype: int64
1230 >>> ser.searchsorted(4)
1231 3
1233 >>> ser.searchsorted([0, 4])
1234 array([0, 3])
1236 >>> ser.searchsorted([1, 3], side='left')
1237 array([0, 2])
1239 >>> ser.searchsorted([1, 3], side='right')
1240 array([1, 3])
1242 >>> ser = pd.Series(pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000']))
1243 >>> ser
1244 0 2000-03-11
1245 1 2000-03-12
1246 2 2000-03-13
1247 dtype: datetime64[ns]
1249 >>> ser.searchsorted('3/14/2000')
1250 3
1252 >>> ser = pd.Categorical(
1253 ... ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True
1254 ... )
1255 >>> ser
1256 ['apple', 'bread', 'bread', 'cheese', 'milk']
1257 Categories (4, object): ['apple' < 'bread' < 'cheese' < 'milk']
1259 >>> ser.searchsorted('bread')
1260 1
1262 >>> ser.searchsorted(['bread'], side='right')
1263 array([3])
1265 If the values are not monotonically sorted, wrong locations
1266 may be returned:
1268 >>> ser = pd.Series([2, 1, 3])
1269 >>> ser
1270 0 2
1271 1 1
1272 2 3
1273 dtype: int64
1275 >>> ser.searchsorted(1) # doctest: +SKIP
1276 0 # wrong result, correct would be 1
1277 """
1279 # This overload is needed so that the call to searchsorted in
1280 # pandas.core.resample.TimeGrouper._get_period_bins picks the correct result
1282 @overload
1283 # The following ignore is also present in numpy/__init__.pyi
1284 # Possibly a mypy bug??
1285 # error: Overloaded function signatures 1 and 2 overlap with incompatible
1286 # return types [misc]
1287 def searchsorted( # type: ignore[misc]
1288 self,
1289 value: ScalarLike_co,
1290 side: Literal["left", "right"] = ...,
1291 sorter: NumpySorter = ...,
1292 ) -> np.intp:
1293 ...
1295 @overload
1296 def searchsorted(
1297 self,
1298 value: npt.ArrayLike | ExtensionArray,
1299 side: Literal["left", "right"] = ...,
1300 sorter: NumpySorter = ...,
1301 ) -> npt.NDArray[np.intp]:
1302 ...
1304 @doc(_shared_docs["searchsorted"], klass="Index")
1305 def searchsorted(
1306 self,
1307 value: NumpyValueArrayLike | ExtensionArray,
1308 side: Literal["left", "right"] = "left",
1309 sorter: NumpySorter = None,
1310 ) -> npt.NDArray[np.intp] | np.intp:
1311 if isinstance(value, ABCDataFrame):
1312 msg = (
1313 "Value must be 1-D array-like or scalar, "
1314 f"{type(value).__name__} is not supported"
1315 )
1316 raise ValueError(msg)
1318 values = self._values
1319 if not isinstance(values, np.ndarray):
1320 # Going through EA.searchsorted directly improves performance GH#38083
1321 return values.searchsorted(value, side=side, sorter=sorter)
1323 return algorithms.searchsorted(
1324 values,
1325 value,
1326 side=side,
1327 sorter=sorter,
1328 )
1330 def drop_duplicates(self, *, keep: DropKeep = "first"):
1331 duplicated = self._duplicated(keep=keep)
1332 # error: Value of type "IndexOpsMixin" is not indexable
1333 return self[~duplicated] # type: ignore[index]
1335 @final
1336 def _duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]:
1337 return algorithms.duplicated(self._values, keep=keep)
1339 def _arith_method(self, other, op):
1340 res_name = ops.get_op_result_name(self, other)
1342 lvalues = self._values
1343 rvalues = extract_array(other, extract_numpy=True, extract_range=True)
1344 rvalues = ops.maybe_prepare_scalar_for_op(rvalues, lvalues.shape)
1345 rvalues = ensure_wrapped_if_datetimelike(rvalues)
1347 with np.errstate(all="ignore"):
1348 result = ops.arithmetic_op(lvalues, rvalues, op)
1350 return self._construct_result(result, name=res_name)
1352 def _construct_result(self, result, name):
1353 """
1354 Construct an appropriately-wrapped result from the ArrayLike result
1355 of an arithmetic-like operation.
1356 """
1357 raise AbstractMethodError(self)