Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/arrays/categorical.py: 22%

1from __future__ import annotations

3from csv import QUOTE_NONNUMERIC

4from functools import partial

5import operator

6from shutil import get_terminal_size

7from typing import (

8 TYPE_CHECKING,

9 Hashable,

10 Iterator,

11 Literal,

12 Sequence,

13 TypeVar,

14 cast,

15 overload,

16)

18import numpy as np

20from pandas._config import get_option

22from pandas._libs import (

23 NaT,

24 algos as libalgos,

25 lib,

26)

27from pandas._libs.arrays import NDArrayBacked

28from pandas._typing import (

29 ArrayLike,

30 AstypeArg,

31 AxisInt,

32 Dtype,

33 NpDtype,

34 Ordered,

35 Shape,

36 SortKind,

37 npt,

38 type_t,

39)

40from pandas.compat.numpy import function as nv

41from pandas.util._validators import validate_bool_kwarg

43from pandas.core.dtypes.cast import (

44 coerce_indexer_dtype,

45 find_common_type,

46)

47from pandas.core.dtypes.common import (

48 ensure_int64,

49 ensure_platform_int,

50 is_any_real_numeric_dtype,

51 is_bool_dtype,

52 is_categorical_dtype,

53 is_datetime64_dtype,

54 is_dict_like,

55 is_dtype_equal,

56 is_extension_array_dtype,

57 is_hashable,

58 is_integer_dtype,

59 is_list_like,

60 is_scalar,

61 is_timedelta64_dtype,

62 needs_i8_conversion,

63 pandas_dtype,

64)

65from pandas.core.dtypes.dtypes import (

66 CategoricalDtype,

67 ExtensionDtype,

68)

69from pandas.core.dtypes.generic import (

70 ABCIndex,

71 ABCSeries,

72)

73from pandas.core.dtypes.missing import (

74 is_valid_na_for_dtype,

75 isna,

76)

78from pandas.core import (

79 algorithms,

80 arraylike,

81 ops,

82)

83from pandas.core.accessor import (

84 PandasDelegate,

85 delegate_names,

86)

87from pandas.core.algorithms import (

88 factorize,

89 take_nd,

90)

91from pandas.core.arrays._mixins import (

92 NDArrayBackedExtensionArray,

93 ravel_compat,

94)

95from pandas.core.base import (

96 ExtensionArray,

97 NoNewAttributesMixin,

98 PandasObject,

99)

100import pandas.core.common as com

101from pandas.core.construction import (

102 extract_array,

103 sanitize_array,

104)

105from pandas.core.ops.common import unpack_zerodim_and_defer

106from pandas.core.sorting import nargsort

107from pandas.core.strings.object_array import ObjectStringArrayMixin

108

109from pandas.io.formats import console

110

111if TYPE_CHECKING:

112 from pandas import (

113 DataFrame,

114 Index,

115 Series,

116 )

117

118

119CategoricalT = TypeVar("CategoricalT", bound="Categorical")

120

121

def _cat_compare_op(op):
    """
    Build a comparison dunder method for ``Categorical`` from an operator.

    Parameters
    ----------
    op : callable
        A comparison function such as ``operator.eq``; its ``__name__`` is
        used to derive the name of the generated method.

    Returns
    -------
    callable
        A function ``func(self, other)`` whose ``__name__`` is
        ``__<op.__name__>__``.
    """
    opname = f"__{op.__name__}__"
    # Value written at positions holding the missing-value code (-1):
    # True only when building ``__ne__``, False for every other operator.
    fill_value = op is operator.ne

    @unpack_zerodim_and_defer(opname)
    def func(self, other):
        hashable = is_hashable(other)
        # A hashable list-like (e.g. a tuple) may itself be a category, so
        # the length check only applies to unhashable list-likes.
        if is_list_like(other) and len(other) != len(self) and not hashable:
            raise ValueError("Lengths must match.")

        if not self.ordered and opname in ["__lt__", "__gt__", "__le__", "__ge__"]:
            raise TypeError(
                "Unordered Categoricals can only compare equality or not"
            )

        if isinstance(other, Categorical):
            # Categorical vs Categorical: the categories have to agree
            # (possibly only up to a permutation) before codes are compared.
            msg = "Categoricals can only be compared if 'categories' are the same."
            if not self._categories_match_up_to_permutation(other):
                raise TypeError(msg)

            needs_recode = not self.ordered and not self.categories.equals(
                other.categories
            )
            if needs_recode:
                # same categories in a different order: align other's codes
                rhs_codes = recode_for_categories(
                    other.codes, other.categories, self.categories, copy=False
                )
            else:
                rhs_codes = other._codes

            result = op(self._codes, rhs_codes)
            missing = (self._codes == -1) | (rhs_codes == -1)
            if missing.any():
                result[missing] = fill_value
            return result

        if hashable:
            if other not in self.categories:
                return ops.invalid_comparison(self, other, op)

            code = self._unbox_scalar(other)
            result = op(self._codes, code)

            if opname not in {"__eq__", "__ge__", "__gt__"}:
                # GH#29820 performance trick: ``code`` is always >= 0, so for
                # __eq__/__ge__/__gt__ the -1 positions already evaluate to
                # False (the fill value) and the masking step is skipped.
                result[self._codes == -1] = fill_value
            return result

        # Unhashable ``other``: only positional (in)equality against an
        # array-like is supported.
        if opname not in ["__eq__", "__ne__"]:
            raise TypeError(
                f"Cannot compare a Categorical for op {opname} with "
                f"type {type(other)}.\nIf you want to compare values, "
                "use 'np.asarray(cat) <op> other'."
            )

        if isinstance(other, ExtensionArray) and needs_i8_conversion(other.dtype):
            # We would return NotImplemented here, but that messes up
            # ExtensionIndex's wrapped methods
            return op(other, self)
        return getattr(np.array(self), opname)(np.array(other))

    func.__name__ = opname

    return func

193

194

def contains(cat, key, container) -> bool:
    """
    Check whether ``key`` is a category of ``cat`` that occurs in ``container``.

    The position of ``key`` inside ``cat.categories`` is looked up first;
    the result is True only when that position (or, when the lookup yields
    several positions, any one of them) is found in ``container``.

    Parameters
    ----------
    cat : object exposing a ``categories`` index
        For example a :class:`Categorical` or a :class:`CategoricalIndex`.
    key : hashable
        The key to check membership for.
    container : Container (e.g. list-like or mapping)
        The container searched for the position(s) of ``key``.

    Returns
    -------
    bool
        False when ``key`` is not among the categories or its position is
        not in ``container``; otherwise True.

    Raises
    ------
    TypeError
        If ``key`` is not hashable.

    Notes
    -----
    NaN values are not handled here; check for them before calling.
    """
    # Raises TypeError up front for unhashable keys.
    hash(key)

    # A failed lookup means the key is not a category at all, hence it
    # cannot be present in the container either.
    try:
        position = cat.categories.get_loc(key)
    except (KeyError, TypeError):
        return False

    # Being a category is not sufficient: the position must also be present
    # in ``container``, e.g.
    #   'b' in Categorical(['a'], categories=['a', 'b'])  # False
    if not is_scalar(position):
        # if categories is an IntervalIndex, the lookup returns an array
        return any(pos in container for pos in position)
    return position in container

244

245

246class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin):

247 """

248 Represent a categorical variable in classic R / S-plus fashion.

249

250 `Categoricals` can only take on a limited, and usually fixed, number

251 of possible values (`categories`). In contrast to statistical categorical

252 variables, a `Categorical` might have an order, but numerical operations

253 (additions, divisions, ...) are not possible.

254

255 All values of the `Categorical` are either in `categories` or `np.nan`.

256 Assigning values outside of `categories` will raise a `ValueError`. Order

257 is defined by the order of the `categories`, not lexical order of the

258 values.

259

260 Parameters

261 ----------

262 values : list-like

263 The values of the categorical. If categories are given, values not in

264 categories will be replaced with NaN.

265 categories : Index-like (unique), optional

266 The unique categories for this categorical. If not given, the

267 categories are assumed to be the unique values of `values` (sorted, if

268 possible, otherwise in the order in which they appear).

269 ordered : bool, default False

270 Whether or not this categorical is treated as an ordered categorical.

271 If True, the resulting categorical will be ordered.

272 An ordered categorical respects, when sorted, the order of its

273 `categories` attribute (which in turn is the `categories` argument, if

274 provided).

275 dtype : CategoricalDtype

276 An instance of ``CategoricalDtype`` to use for this categorical.

277

278 Attributes

279 ----------

280 categories : Index

281 The categories of this categorical

282 codes : ndarray

283 The codes (integer positions, which point to the categories) of this

284 categorical, read only.

285 ordered : bool

286 Whether or not this Categorical is ordered.

287 dtype : CategoricalDtype

288 The instance of ``CategoricalDtype`` storing the ``categories``

289 and ``ordered``.

290

291 Methods

292 -------

293 from_codes

294 __array__

295

296 Raises

297 ------

298 ValueError

299 If the categories do not validate.

300 TypeError

301 If an explicit ``ordered=True`` is given but no `categories` and the

302 `values` are not sortable.

303

304 See Also

305 --------

306 CategoricalDtype : Type for categorical data.

307 CategoricalIndex : An Index with an underlying ``Categorical``.

308

309 Notes

310 -----

311 See the `user guide

312 <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`__

313 for more.

314

315 Examples

316 --------

317 >>> pd.Categorical([1, 2, 3, 1, 2, 3])

318 [1, 2, 3, 1, 2, 3]

319 Categories (3, int64): [1, 2, 3]

320

321 >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])

322 ['a', 'b', 'c', 'a', 'b', 'c']

323 Categories (3, object): ['a', 'b', 'c']

324

325 Missing values are not included as a category.

326

327 >>> c = pd.Categorical([1, 2, 3, 1, 2, 3, np.nan])

328 >>> c

329 [1, 2, 3, 1, 2, 3, NaN]

330 Categories (3, int64): [1, 2, 3]

331

332 However, their presence is indicated in the `codes` attribute

333 by code `-1`.

334

335 >>> c.codes

336 array([ 0, 1, 2, 0, 1, 2, -1], dtype=int8)

337

338 Ordered `Categoricals` can be sorted according to the custom order

339 of the categories and can have a min and max value.

340

341 >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True,

342 ... categories=['c', 'b', 'a'])

343 >>> c

344 ['a', 'b', 'c', 'a', 'b', 'c']

345 Categories (3, object): ['c' < 'b' < 'a']

346 >>> c.min()

347 'c'

348 """

349

350 # For comparisons, so that numpy uses our implementation of the compare

351 # ops, which raise

352 __array_priority__ = 1000

353 # tolist is not actually deprecated, just suppressed in the __dir__

354 _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"])

355 _typ = "categorical"

356

357 _dtype: CategoricalDtype

358

def __init__(
    self,
    values,
    categories=None,
    ordered=None,
    dtype: Dtype | None = None,
    fastpath: bool = False,
    copy: bool = True,
) -> None:
    """
    Initialise the array from ``values`` and an optional dtype description.

    Parameters
    ----------
    values : list-like
        The data. With ``fastpath=True`` it is used directly as the codes.
    categories, ordered, dtype
        Forwarded to ``CategoricalDtype._from_values_or_dtype`` to build
        the target dtype.
    fastpath : bool, default False
        Skip all sanitising/factorising and treat ``values`` as codes.
    copy : bool, default True
        Passed to ``recode_for_categories`` when ``values`` is already
        categorical and explicit categories were supplied.

    Raises
    ------
    TypeError
        If ``values`` is not list-like, or if ordered categories have to be
        inferred from values that cannot be sorted.
    NotImplementedError
        If ``values`` is an ndarray with more than one dimension.
    """
    dtype = CategoricalDtype._from_values_or_dtype(
        values, categories, ordered, dtype
    )
    # ``dtype`` is a CategoricalDtype from here on, but ``dtype.categories``
    # may still be None; in that case they are inferred by factorizing below.

    if fastpath:
        fast_codes = coerce_indexer_dtype(values, dtype.categories)
        dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
        super().__init__(fast_codes, dtype)
        return

    if not is_list_like(values):
        # GH#38433
        raise TypeError("Categorical input must be list-like")

    # Marks missing entries that must be left out of category inference.
    # Only populated for plain list input (not arrays/ndframes).
    null_mask = np.array(False)

    # sanitize input
    if is_categorical_dtype(values):
        if dtype.categories is None:
            dtype = CategoricalDtype(values.categories, dtype.ordered)
    elif not isinstance(values, (ABCIndex, ABCSeries, ExtensionArray)):
        values = com.convert_to_list_like(values)
        if isinstance(values, list) and len(values) == 0:
            # By convention, empty lists result in object dtype:
            values = np.array([], dtype=object)
        elif isinstance(values, np.ndarray):
            if values.ndim > 1:
                # preempt sanitize_array from raising ValueError
                raise NotImplementedError(
                    "> 1 ndim Categorical are not supported at this time"
                )
            values = sanitize_array(values, None)
        else:
            # i.e. must be a list
            arr = sanitize_array(values, None)
            null_mask = isna(arr)
            if null_mask.any():
                # Drop the nulls for now; they are re-inserted as -1 codes
                # further down, grep "full_codes"
                keep_positions = np.where(~null_mask)[0]
                arr_list = [values[idx] for idx in keep_positions]

                # GH#44900 Do not cast to float if we have only missing values
                keep_inferred = arr_list or arr.dtype == "object"
                sanitize_dtype = None if keep_inferred else arr.dtype

                arr = sanitize_array(arr_list, None, dtype=sanitize_dtype)
            values = arr

    if dtype.categories is None:
        try:
            codes, categories = factorize(values, sort=True)
        except TypeError as err:
            codes, categories = factorize(values, sort=False)
            if dtype.ordered:
                # the values cannot be sorted, so an ordering has to be
                # provided explicitly through the categories argument
                raise TypeError(
                    "'values' is not ordered, please "
                    "explicitly specify the categories order "
                    "by passing in a categories argument."
                ) from err

        # we're inferring from values
        dtype = CategoricalDtype(categories, dtype.ordered)

    elif is_categorical_dtype(values.dtype):
        existing_codes = extract_array(values)._codes
        codes = recode_for_categories(
            existing_codes, values.dtype.categories, dtype.categories, copy=copy
        )

    else:
        codes = _get_codes_for_values(values, dtype.categories)

    if null_mask.any():
        # Reinsert -1 placeholders for previously removed missing values
        full_codes = -np.ones(null_mask.shape, dtype=codes.dtype)
        full_codes[~null_mask] = codes
        codes = full_codes

    dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
    backing = coerce_indexer_dtype(codes, dtype.categories)
    super().__init__(backing, dtype)

458

@property
def dtype(self) -> CategoricalDtype:
    """
    Return the :class:`~pandas.api.types.CategoricalDtype` of this instance.
    """
    own_dtype = self._dtype
    return own_dtype

465

@property
def _internal_fill_value(self) -> int:
    """
    Return -1 typed as the scalar type of the backing ``_ndarray``.
    """
    # A numpy integer of the codes' own dtype (rather than a python int) is
    # used to get the correct dtype back from _quantile in the all-NA case.
    return self._ndarray.dtype.type(-1)

472

@classmethod
def _from_sequence(
    cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
) -> Categorical:
    """
    Construct a ``Categorical`` from ``scalars``.

    ``dtype`` and ``copy`` are forwarded unchanged to the ``Categorical``
    constructor.
    """
    constructor_kwargs = {"dtype": dtype, "copy": copy}
    return Categorical(scalars, **constructor_kwargs)

478

@overload
def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
    ...

@overload
def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray:
    ...

@overload
def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike:
    ...

def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
    """
    Coerce this array to another dtype.

    Parameters
    ----------
    dtype : numpy dtype or pandas type
    copy : bool, default True
        Whether to copy. When ``copy=False`` and ``dtype`` is this array's
        own dtype object, ``self`` is returned.

    Returns
    -------
    ArrayLike

    Raises
    ------
    ValueError
        If an integer dtype is requested while missing values are present,
        or if the categories cannot be cast to ``dtype``.
    """
    dtype = pandas_dtype(dtype)

    if self.dtype is dtype:
        return self.copy() if copy else self

    if is_categorical_dtype(dtype):
        dtype = cast(CategoricalDtype, dtype)

        # GH 10696/18593/18630
        dtype = self.dtype.update_dtype(dtype)
        source = self.copy() if copy else self
        return source._set_dtype(dtype)

    if isinstance(dtype, ExtensionDtype):
        return super().astype(dtype, copy=copy)

    if is_integer_dtype(dtype) and self.isna().any():
        raise ValueError("Cannot convert float NaN to integer")

    if len(self.codes) == 0 or len(self.categories) == 0:
        return np.array(self, dtype=dtype, copy=copy)

    # GH8628 (PERF): astype category codes instead of astyping array
    converted_cats = self.categories._values

    try:
        converted_cats = converted_cats.astype(dtype=dtype, copy=copy)
        fill_value = self.categories._na_value
        if not is_valid_na_for_dtype(fill_value, dtype):
            fill_value = lib.item_from_zerodim(
                np.array(self.categories._na_value).astype(dtype)
            )
    except (
        TypeError,  # downstream error msg for CategoricalIndex is misleading
        ValueError,
    ):
        msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}"
        raise ValueError(msg)

    return take_nd(
        converted_cats, ensure_platform_int(self._codes), fill_value=fill_value
    )

551

def to_list(self):
    """
    Return the result of :meth:`tolist`; this method is an alias for it.
    """
    as_list = self.tolist()
    return as_list

557

@classmethod
def _from_inferred_categories(
    cls, inferred_categories, inferred_codes, dtype, true_values=None
):
    """
    Construct a Categorical from inferred categories and codes.

    When ``dtype`` is a ``CategoricalDtype`` carrying categories, the
    inferred categories are converted to match the kind of those categories
    (numeric, datetime, timedelta or bool) and the codes are recoded onto
    ``dtype.categories``. Otherwise the inferred categories are used
    directly, sorted first if they are not already monotonic increasing.

    Parameters
    ----------
    inferred_categories : Index
    inferred_codes : Index
    dtype : CategoricalDtype or 'category'
    true_values : list, optional
        Strings treated as True when the target categories are boolean.
        Defaults to "True", "TRUE" and "true".

    Returns
    -------
    Categorical
    """
    from pandas import Index, to_datetime, to_numeric, to_timedelta

    cats = Index(inferred_categories)
    known_categories = (
        isinstance(dtype, CategoricalDtype) and dtype.categories is not None
    )

    if known_categories:
        target = dtype.categories

        # Convert to a specialized type matching the target categories.
        if is_any_real_numeric_dtype(target):
            cats = to_numeric(inferred_categories, errors="coerce")
        elif is_datetime64_dtype(target):
            cats = to_datetime(inferred_categories, errors="coerce")
        elif is_timedelta64_dtype(target):
            cats = to_timedelta(inferred_categories, errors="coerce")
        elif is_bool_dtype(target):
            if true_values is None:
                true_values = ["True", "TRUE", "true"]

            # error: Incompatible types in assignment (expression has type
            # "ndarray", variable has type "Index")
            cats = cats.isin(true_values)  # type: ignore[assignment]

        # Recode from observation order to dtype.categories order.
        categories = dtype.categories
        codes = recode_for_categories(inferred_codes, cats, categories)
    elif cats.is_monotonic_increasing:
        # already sorted: the inferred codes can be used as they are
        dtype = CategoricalDtype(cats, ordered=False)
        codes = inferred_codes
    else:
        # Sort categories and recode for unknown categories.
        unsorted = cats.copy()
        categories = cats.sort_values()

        codes = recode_for_categories(inferred_codes, unsorted, categories)
        dtype = CategoricalDtype(categories, ordered=False)

    return cls(codes, dtype=dtype, fastpath=True)

626

@classmethod
def from_codes(
    cls, codes, categories=None, ordered=None, dtype: Dtype | None = None
) -> Categorical:
    """
    Build a Categorical directly from codes plus categories or a dtype.

    Use this constructor when codes and categories/dtype are already at
    hand, so the factorization step of the regular constructor is not
    needed.

    Parameters
    ----------
    codes : array-like of int
        Each integer is a position in ``categories`` (or
        ``dtype.categories``), or -1 for NaN.
    categories : index-like, optional
        The categories for the categorical. Items need to be unique.
        Must be supplied through `dtype` if not given here.
    ordered : bool, optional
        Whether the categorical is treated as ordered. If given neither
        here nor in `dtype`, the result is unordered.
    dtype : CategoricalDtype or "category", optional
        If :class:`CategoricalDtype`, cannot be used together with
        `categories` or `ordered`.

    Returns
    -------
    Categorical

    Raises
    ------
    ValueError
        If no categories are available, if ``codes`` contains NA values or
        non-integers, or if any code lies outside
        ``[-1, len(categories) - 1]``.

    Examples
    --------
    >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True)
    >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype)
    ['a', 'b', 'a', 'b']
    Categories (2, object): ['a' < 'b']
    """
    dtype = CategoricalDtype._from_values_or_dtype(
        categories=categories, ordered=ordered, dtype=dtype
    )
    if dtype.categories is None:
        raise ValueError(
            "The categories must be provided in 'categories' or "
            "'dtype'. Both were None."
        )

    if is_extension_array_dtype(codes) and is_integer_dtype(codes):
        # Avoid the implicit conversion of Int to object
        if isna(codes).any():
            raise ValueError("codes cannot contain NA values")
        codes = codes.to_numpy(dtype=np.int64)
    else:
        codes = np.asarray(codes)

    # Empty input skips both the dtype and the range validation.
    if len(codes):
        if not is_integer_dtype(codes):
            raise ValueError("codes need to be array-like integers")

        too_large = codes.max() >= len(dtype.categories)
        if too_large or codes.min() < -1:
            raise ValueError("codes need to be between -1 and len(categories)-1")

    return cls(codes, dtype=dtype, fastpath=True)

693

694 # ------------------------------------------------------------------

695 # Categories/Codes/Ordered

696

@property
def categories(self) -> Index:
    """
    The categories of this categorical.

    Setting assigns a new value to each category, i.e. it effectively
    renames every individual category.

    The assigned value must be list-like, all of its items must be unique,
    and it must contain exactly as many items as there are old categories.

    Raises
    ------
    ValueError
        If the new categories do not validate as categories or if their
        number differs from the number of old categories.

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.
    """
    own_dtype = self.dtype
    return own_dtype.categories

725

@property
def ordered(self) -> Ordered:
    """
    Whether the categories have an ordered relationship.
    """
    own_dtype = self.dtype
    return own_dtype.ordered

732

@property
def codes(self) -> np.ndarray:
    """
    The category codes of this categorical.

    The codes are integers giving, for each value, its position in the
    categories array.

    No setter exists; use the other categorical methods and the normal item
    setter to change values in the categorical.

    Returns
    -------
    ndarray[int]
        A non-writable view of the `codes` array.
    """
    # Hand out a view so the flag change does not affect ``self._codes``.
    read_only = self._codes.view()
    read_only.setflags(write=False)
    return read_only

752

def _set_categories(self, categories, fastpath: bool = False) -> None:
    """
    Replace the categories of this object in place, keeping the codes.

    Parameters
    ----------
    categories : Index-like
        The new categories.
    fastpath : bool, default False
        Don't perform validation of the categories for uniqueness or nulls

    Raises
    ------
    ValueError
        If ``fastpath`` is False and the number of new categories differs
        from the number of existing ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'b'])
    >>> c
    ['a', 'b']
    Categories (2, object): ['a', 'b']

    >>> c._set_categories(pd.Index(['a', 'c']))
    >>> c
    ['a', 'c']
    Categories (2, object): ['a', 'c']
    """
    if fastpath:
        new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered)
    else:
        new_dtype = CategoricalDtype(categories, ordered=self.ordered)
        # The length check is only performed on the validating path.
        if self.dtype.categories is not None and len(new_dtype.categories) != len(
            self.dtype.categories
        ):
            raise ValueError(
                "new categories need to have the same number of "
                "items as the old categories!"
            )

    super().__init__(self._ndarray, new_dtype)

789

def _set_dtype(self, dtype: CategoricalDtype) -> Categorical:
    """
    Return a new object of the same type using ``dtype``.

    The current codes are recoded from ``self.categories`` onto
    ``dtype.categories``.

    Parameters
    ----------
    dtype : CategoricalDtype

    Notes
    -----
    No validation happens here; ``dtype`` is assumed to be a (valid)
    instance of `CategoricalDtype`.
    """
    recoded = recode_for_categories(self.codes, self.categories, dtype.categories)
    klass = type(self)
    return klass(recoded, dtype=dtype, fastpath=True)

805

def set_ordered(self, value: bool) -> Categorical:
    """
    Return a copy whose ordered attribute is set to ``value``.

    Parameters
    ----------
    value : bool
        Set whether this categorical is ordered (True) or not (False).

    Returns
    -------
    Categorical
        A copy of this object with the same categories and the requested
        ``ordered`` flag.
    """
    updated_dtype = CategoricalDtype(self.categories, ordered=value)
    result = self.copy()
    # Re-initialise the copy on its own backing array with the new dtype.
    NDArrayBacked.__init__(result, result._ndarray, updated_dtype)
    return result

819

def as_ordered(self) -> Categorical:
    """
    Set the Categorical to be ordered.

    Shorthand for ``set_ordered(True)``.

    Returns
    -------
    Categorical
        Ordered Categorical.
    """
    make_ordered = True
    return self.set_ordered(make_ordered)

830

def as_unordered(self) -> Categorical:
    """
    Set the Categorical to be unordered.

    Shorthand for ``set_ordered(False)``.

    Returns
    -------
    Categorical
        Unordered Categorical.
    """
    make_ordered = False
    return self.set_ordered(make_ordered)

841

def set_categories(self, new_categories, ordered=None, rename: bool = False):
    """
    Set the categories to the specified new_categories.

    `new_categories` may add categories (which are then unused) or drop old
    ones (whose values become NaN). With `rename=True` the categories are
    simply renamed positionally: fewer new categories than old ones turn
    the surplus values into NaN, more new categories leave unused ones.

    Adding, removing and reordering can all be done in a single call, which
    is faster than chaining the more specialised methods.

    Note that this method does not check, e.g., that the old categories are
    contained in the new ones on a reorder, which can give surprising
    results, for example with special string dtypes that do not consider an
    S1 string equal to a single-character python string.

    Parameters
    ----------
    new_categories : Index-like
        The categories in new order.
    ordered : bool, optional
        Whether or not the categorical is treated as an ordered
        categorical. If not given, the ordered information is unchanged.
    rename : bool, default False
        Whether or not the new_categories should be considered as a rename
        of the old categories or as reordered categories.

    Returns
    -------
    Categorical with reordered categories.

    Raises
    ------
    ValueError
        If new_categories does not validate as categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    """
    if ordered is None:
        ordered = self.dtype.ordered
    new_dtype = CategoricalDtype(new_categories, ordered=ordered)

    cat = self.copy()
    if not rename:
        codes = recode_for_categories(
            cat.codes, cat.categories, new_dtype.categories
        )
    else:
        old_categories = cat.dtype.categories
        if old_categories is not None and len(new_dtype.categories) < len(
            old_categories
        ):
            # codes pointing past the shortened categories become -1/NaN
            out_of_range = cat._codes >= len(new_dtype.categories)
            cat._codes[out_of_range] = -1
        codes = cat._codes
    NDArrayBacked.__init__(cat, codes, new_dtype)
    return cat

909

def rename_categories(self, new_categories) -> Categorical:
    """
    Rename categories.

    Parameters
    ----------
    new_categories : list-like, dict-like or callable

        New categories which will replace old categories.

        * list-like: all items must be unique and the number of items in
          the new categories must match the existing number of categories.

        * dict-like: a mapping from old categories to new ones. Categories
          missing from the mapping are kept as they are, and mapping keys
          that are not existing categories are ignored.

        * callable : called once per old category; the return values make
          up the new categories.

    Returns
    -------
    Categorical
        Categorical with renamed categories.

    Raises
    ------
    ValueError
        If new categories are list-like and do not have the same number of
        items as the current categories or do not validate as categories

    See Also
    --------
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'a', 'b'])
    >>> c.rename_categories([0, 1])
    [0, 0, 1]
    Categories (2, int64): [0, 1]

    For dict-like ``new_categories``, extra keys are ignored and
    categories not in the dictionary are passed through

    >>> c.rename_categories({'a': 'A', 'c': 'C'})
    ['A', 'A', 'b']
    Categories (2, object): ['A', 'b']

    You may also provide a callable to create the new categories

    >>> c.rename_categories(lambda x: x.upper())
    ['A', 'A', 'B']
    Categories (2, object): ['A', 'B']
    """
    if is_dict_like(new_categories):
        # unmapped categories fall back to their current value
        renamed = [new_categories.get(old, old) for old in self.categories]
    elif callable(new_categories):
        renamed = [new_categories(old) for old in self.categories]
    else:
        renamed = new_categories

    result = self.copy()
    result._set_categories(renamed)
    return result

981

def reorder_categories(self, new_categories, ordered=None):
    """
    Reorder categories as specified in new_categories.

    `new_categories` must contain every old category and nothing else.

    Parameters
    ----------
    new_categories : Index-like
        The categories in new order.
    ordered : bool, optional
        Whether or not the categorical is treated as an ordered
        categorical. If not given, the ordered information is unchanged.

    Returns
    -------
    Categorical
        Categorical with reordered categories.

    Raises
    ------
    ValueError
        If the new categories do not contain all old category items or any
        new ones

    See Also
    --------
    rename_categories : Rename categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.
    """
    # Equal length plus "no old category missing" is the condition checked;
    # the difference is only computed when the lengths already agree.
    same_items = (
        len(self.categories) == len(new_categories)
        and self.categories.difference(new_categories).empty
    )
    if not same_items:
        raise ValueError(
            "items in new_categories are not the same as in old categories"
        )
    return self.set_categories(new_categories, ordered=ordered)

1024

def add_categories(self, new_categories) -> Categorical:
    """
    Return a Categorical with extra categories appended.

    The new categories are placed after the existing ones (the last/highest
    position) and are unused directly after this call.

    Parameters
    ----------
    new_categories : category or list-like of category
        The new categories to be included.

    Returns
    -------
    Categorical
        Categorical with new categories added.

    Raises
    ------
    ValueError
        If the new categories include old categories or do not validate as
        categories.

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['c', 'b', 'c'])
    >>> c
    ['c', 'b', 'c']
    Categories (2, object): ['b', 'c']

    >>> c.add_categories(['d', 'a'])
    ['c', 'b', 'c']
    Categories (4, object): ['b', 'c', 'd', 'a']
    """
    # A single category is accepted as shorthand for a one-element list.
    if not is_list_like(new_categories):
        new_categories = [new_categories]

    existing = self.dtype.categories
    clashes = set(new_categories) & set(existing)
    if clashes:
        raise ValueError(
            f"new categories must not include old categories: {clashes}"
        )

    if hasattr(new_categories, "dtype"):
        from pandas import Series

        # When the input carries a dtype, build the combined categories
        # with a dtype common to the old and the new values.
        common = find_common_type([existing.dtype, new_categories.dtype])
        combined = Series(list(existing) + list(new_categories), dtype=common)
    else:
        combined = list(existing) + list(new_categories)

    widened_dtype = CategoricalDtype(combined, self.ordered)
    result = self.copy()
    # Re-coerce the existing codes against the enlarged category list.
    codes = coerce_indexer_dtype(result._ndarray, widened_dtype.categories)
    NDArrayBacked.__init__(result, codes, widened_dtype)
    return result

1093

def remove_categories(self, removals):
    """
    Remove the specified categories.

    `removals` must be included in the old categories. Values which were in
    the removed categories will be set to NaN.

    For an ordered Categorical the remaining categories keep their existing
    relative order; for an unordered one they are returned in the order
    produced by ``Index.difference`` (sorted where possible).

    Parameters
    ----------
    removals : category or list of categories
        The categories which should be removed.

    Returns
    -------
    Categorical
        Categorical with removed categories.

    Raises
    ------
    ValueError
        If the removals are not contained in the categories.

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd'])
    >>> c
    ['a', 'c', 'b', 'c', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c.remove_categories(['d', 'a'])
    [NaN, 'c', 'b', 'c', NaN]
    Categories (2, object): ['b', 'c']
    """
    from pandas import Index

    # A single category is accepted as shorthand for a one-element list.
    if not is_list_like(removals):
        removals = [removals]

    removals = Index(removals).unique().dropna()
    current = self.dtype.categories

    unknown = removals.difference(current)
    if len(unknown) != 0:
        unknown = set(unknown)
        raise ValueError(f"removals must all be in old categories: {unknown}")

    if self.ordered:
        # The category order is the ordering itself, so it must not be
        # re-sorted: Index.difference sorts by default.
        remaining = current.difference(removals, sort=False)
    else:
        remaining = current.difference(removals)

    return self.set_categories(remaining, ordered=self.ordered, rename=False)

1149

def remove_unused_categories(self) -> Categorical:
    """
    Return a Categorical without the categories that no value uses.

    Returns
    -------
    Categorical
        Categorical with unused categories dropped.

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd'])
    >>> c
    ['a', 'c', 'b', 'c', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c[2] = 'a'
    >>> c[4] = 'c'
    >>> c
    ['a', 'c', 'a', 'c', 'c']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c.remove_unused_categories()
    ['a', 'c', 'a', 'c', 'c']
    Categories (2, object): ['a', 'c']
    """
    used_codes, positions = np.unique(self._codes, return_inverse=True)

    # np.unique sorts, so the -1 missing-value sentinel comes first when
    # present. Drop it and shift the inverse so missing entries map back
    # to -1.
    has_missing = used_codes.size > 0 and used_codes[0] == -1
    if has_missing:
        used_codes = used_codes[1:]
        positions = positions - 1

    kept_categories = self.dtype.categories.take(used_codes)
    trimmed_dtype = CategoricalDtype._from_fastpath(
        kept_categories, ordered=self.ordered
    )
    trimmed_codes = coerce_indexer_dtype(positions, trimmed_dtype.categories)

    result = self.copy()
    NDArrayBacked.__init__(result, trimmed_codes, trimmed_dtype)
    return result

1198

1199 # ------------------------------------------------------------------

1200

def map(self, mapper):
    """
    Map categories using an input mapping or function.

    The mapping is applied to the categories. When the correspondence is
    one-to-one the result is a :class:`~pandas.Categorical` with the same
    order property as the original; otherwise a :class:`~pandas.Index` is
    returned. NaN values are unaffected.

    If a `dict` or :class:`~pandas.Series` is used any unmapped category is
    mapped to `NaN`. Note that if this happens an :class:`~pandas.Index`
    will be returned.

    Parameters
    ----------
    mapper : function, dict, or Series
        Mapping correspondence.

    Returns
    -------
    pandas.Categorical or pandas.Index
        Mapped categorical.

    See Also
    --------
    CategoricalIndex.map : Apply a mapping correspondence on a
        :class:`~pandas.CategoricalIndex`.
    Index.map : Apply a mapping correspondence on an
        :class:`~pandas.Index`.
    Series.map : Apply a mapping correspondence on a
        :class:`~pandas.Series`.
    Series.apply : Apply more complex functions on a
        :class:`~pandas.Series`.

    Examples
    --------
    >>> cat = pd.Categorical(['a', 'b', 'c'])
    >>> cat
    ['a', 'b', 'c']
    Categories (3, object): ['a', 'b', 'c']
    >>> cat.map(lambda x: x.upper())
    ['A', 'B', 'C']
    Categories (3, object): ['A', 'B', 'C']
    >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'})
    ['first', 'second', 'third']
    Categories (3, object): ['first', 'second', 'third']

    If the mapping is one-to-one the ordering of the categories is
    preserved:

    >>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True)
    >>> cat
    ['a', 'b', 'c']
    Categories (3, object): ['a' < 'b' < 'c']
    >>> cat.map({'a': 3, 'b': 2, 'c': 1})
    [3, 2, 1]
    Categories (3, int64): [3 < 2 < 1]

    If the mapping is not one-to-one an :class:`~pandas.Index` is returned:

    >>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'})
    Index(['first', 'second', 'first'], dtype='object')

    If a `dict` is used, all unmapped categories are mapped to `NaN` and
    the result is an :class:`~pandas.Index`:

    >>> cat.map({'a': 'first', 'b': 'second'})
    Index(['first', 'second', nan], dtype='object')
    """
    mapped = self.categories.map(mapper)
    codes = self._codes
    try:
        return self.from_codes(
            codes.copy(), categories=mapped, ordered=self.ordered
        )
    except ValueError:
        # The mapped values could not serve as categories, so fall back to
        # materialising them. Missing entries have code -1, which np.take
        # resolves to the final element; append NaN so they stay missing.
        if (codes == -1).any():
            mapped = mapped.insert(len(mapped), np.nan)
        return np.take(mapped, codes)

1281

# Rich comparison operators: each one is produced by passing the matching
# function from the ``operator`` module to ``_cat_compare_op``.
__eq__ = _cat_compare_op(operator.eq)
__ne__ = _cat_compare_op(operator.ne)
__lt__ = _cat_compare_op(operator.lt)
__gt__ = _cat_compare_op(operator.gt)
__le__ = _cat_compare_op(operator.le)
__ge__ = _cat_compare_op(operator.ge)

1288

1289 # -------------------------------------------------------------

1290 # Validators; ideally these can be de-duplicated

1291

def _validate_setitem_value(self, value):
    """
    Validate a value that is about to be assigned into this array.

    Hashable values are handled by ``_validate_scalar``; everything else is
    handled by ``_validate_listlike``. The result of the chosen validator is
    returned unchanged.
    """
    if is_hashable(value):
        return self._validate_scalar(value)
    # Unhashable input is treated as a list-like of values.
    return self._validate_listlike(value)

1298

def _validate_scalar(self, fill_value):
    """
    Translate a user-facing scalar into the code stored in the backing array.

    Parameters
    ----------
    fill_value : object
        A missing-value marker valid for the categories' dtype, or one of
        the existing categories.

    Returns
    -------
    fill_value : int
        ``-1`` for a missing value, otherwise the code of the category.

    Raises
    ------
    TypeError
        If ``fill_value`` is neither a valid missing value nor an existing
        category.
    """
    if is_valid_na_for_dtype(fill_value, self.categories.dtype):
        # Missing values are stored as the -1 sentinel code.
        return -1
    if fill_value in self.categories:
        return self._unbox_scalar(fill_value)
    raise TypeError(
        "Cannot setitem on a Categorical with a new "
        f"category ({fill_value}), set the categories first"
    ) from None

1327

1328 # -------------------------------------------------------------

1329

@ravel_compat
def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
    """
    The numpy array interface.

    Parameters
    ----------
    dtype : NpDtype, optional
        Requested dtype of the result.

    Returns
    -------
    numpy.array
        A numpy array of either the specified dtype or,
        if dtype==None (default), the same dtype as
        categorical.categories.dtype.
    """
    values = take_nd(self.categories._values, self._codes)
    if not dtype or is_dtype_equal(dtype, self.categories.dtype):
        # No conversion requested. np.asarray is still applied because
        # categories backed by an ExtensionArray (e.g. Interval) must be
        # converted all the way to an ndarray.
        return np.asarray(values)
    return np.asarray(values, dtype)

1349

def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
    """
    Handle NumPy ufuncs applied to this array.

    Dispatch is attempted in order: our own dunder methods (for binary
    ops), the ``out=`` handler, and the reduction handler. Any ufunc that
    none of them handles raises.

    Raises
    ------
    TypeError
        If the ufunc is not supported for this dtype.
    """
    # Binary ops are routed to our custom dunder methods first.
    via_dunder = ops.maybe_dispatch_ufunc_to_dunder_op(
        self, ufunc, method, *inputs, **kwargs
    )
    if via_dunder is not NotImplemented:
        return via_dunder

    if "out" in kwargs:
        # e.g. test_numpy_ufuncs_out
        return arraylike.dispatch_ufunc_with_out(
            self, ufunc, method, *inputs, **kwargs
        )

    if method == "reduce":
        # e.g. TestCategoricalAnalytics::test_min_max_ordered
        reduced = arraylike.dispatch_reduction_ufunc(
            self, ufunc, method, *inputs, **kwargs
        )
        if reduced is not NotImplemented:
            return reduced

    # Everything else raises for now (similarly to what happens in
    # Series.__array_prepare__).
    raise TypeError(
        f"Object with dtype {self.dtype} cannot perform "
        f"the numpy op {ufunc.__name__}"
    )

1378

def __setstate__(self, state) -> None:
    """
    Restore the object from pickled state.

    Necessary for making this object picklable. Non-dict state is handed
    straight to the parent implementation; dict state is first brought up to
    the current attribute layout (``_dtype`` and ``_ndarray`` keys).
    """
    if not isinstance(state, dict):
        return super().__setstate__(state)

    if "_dtype" not in state:
        # State without a stored dtype: rebuild it from the separately
        # stored categories and ordered flag.
        state["_dtype"] = CategoricalDtype(state["_categories"], state["_ordered"])

    if "_codes" in state and "_ndarray" not in state:
        # backward compat, changed what is property vs attribute
        state["_ndarray"] = state.pop("_codes")

    super().__setstate__(state)

1392

@property
def nbytes(self) -> int:
    """Bytes used by the codes array plus the categories' values."""
    codes_bytes = self._codes.nbytes
    categories_bytes = self.dtype.categories.values.nbytes
    return codes_bytes + categories_bytes

1396

def memory_usage(self, deep: bool = False) -> int:
    """
    Memory usage of my values.

    The result is the size of the codes array plus the memory usage
    reported by the categories.

    Parameters
    ----------
    deep : bool
        Introspect the data deeply, interrogate
        `object` dtypes for system-level memory consumption.

    Returns
    -------
    bytes used

    Notes
    -----
    Memory usage does not include memory consumed by elements that
    are not components of the array if deep=False.

    See Also
    --------
    numpy.ndarray.nbytes
    """
    codes_bytes = self._codes.nbytes
    categories_bytes = self.dtype.categories.memory_usage(deep=deep)
    return codes_bytes + categories_bytes

1421

def isna(self) -> np.ndarray:
    """
    Detect missing values.

    An entry is missing when its code is -1.

    Returns
    -------
    np.ndarray[bool] of whether my values are null

    See Also
    --------
    isna : Top-level isna.
    isnull : Alias of isna.
    Categorical.notna : Boolean inverse of Categorical.isna.

    """
    codes = self._codes
    return codes == -1

isnull = isna

1442

def notna(self) -> np.ndarray:
    """
    Inverse of isna.

    Entries flagged as missing by ``isna`` (code -1) are reported as False;
    every other entry is True.

    Returns
    -------
    np.ndarray[bool] of whether my values are not null

    See Also
    --------
    notna : Top-level notna.
    notnull : Alias of notna.
    Categorical.isna : Boolean inverse of Categorical.notna.

    """
    missing = self.isna()
    return ~missing

notnull = notna

1464

def value_counts(self, dropna: bool = True) -> Series:
    """
    Return a Series containing counts of each category.

    Every category will have an entry, even those with a count of 0.

    Parameters
    ----------
    dropna : bool, default True
        Don't include counts of NaN.

    Returns
    -------
    counts : Series

    See Also
    --------
    Series.value_counts
    """
    from pandas import (
        CategoricalIndex,
        Series,
    )

    codes = self._codes
    n_categories = len(self.categories)
    observed = codes >= 0
    no_missing = observed.all()
    positions = np.arange(n_categories)

    if dropna or no_missing:
        kept = codes if no_missing else codes[observed]
        counts = np.bincount(kept, minlength=n_categories)
    else:
        # Missing entries are tallied in one extra trailing bin, which is
        # labelled with the -1 (missing) code in the resulting index.
        counts = np.bincount(np.where(observed, codes, n_categories))
        positions = np.append(positions, -1)

    positions = coerce_indexer_dtype(positions, self.dtype.categories)
    labels = self._from_backing_data(positions)

    return Series(
        counts,
        index=CategoricalIndex(labels),
        dtype="int64",
        name="count",
        copy=False,
    )

1506

1507 # error: Argument 2 of "_empty" is incompatible with supertype

1508 # "NDArrayBackedExtensionArray"; supertype defines the argument type as

1509 # "ExtensionDtype"

@classmethod
def _empty(  # type: ignore[override]
    cls: type_t[Categorical], shape: Shape, dtype: CategoricalDtype
) -> Categorical:
    """
    Analogous to np.empty(shape, dtype=dtype)

    Parameters
    ----------
    shape : tuple[int]
    dtype : CategoricalDtype
    """
    # An empty instance supplies the dtype of the backing codes array.
    template = cls._from_sequence([], dtype=dtype)

    # np.zeros rather than np.empty: uninitialised memory could hold codes
    # this dtype does not support, in which case repr(result) could
    # segfault.
    zero_codes = np.zeros(shape, dtype=template._ndarray.dtype)

    return template._from_backing_data(zero_codes)

1530

def _internal_get_values(self):
    """
    Return the values.

    For internal compatibility with pandas formatting.

    Returns
    -------
    np.ndarray or Index
        A numpy array of the same dtype as categorical.categories.dtype or
        Index if datetime / periods.
    """
    categories = self.categories
    if needs_i8_conversion(categories.dtype):
        # Datetime-like categories: return an Index to keep the metadata.
        return categories.take(self._codes, fill_value=NaT)
    if is_integer_dtype(categories) and -1 in self._codes:
        # Integer categories with missing entries: go through object dtype
        # so the missing entries can be filled with NaN.
        return categories.astype("object").take(self._codes, fill_value=np.nan)
    return np.array(self)

1549

def check_for_ordered(self, op) -> None:
    """
    Assert that we are ordered.

    Parameters
    ----------
    op : object
        Name of the operation being attempted; used in the error message.

    Raises
    ------
    TypeError
        If this Categorical is not ordered.
    """
    if self.ordered:
        return
    raise TypeError(
        f"Categorical is not ordered for operation {op}\n"
        "you can use .as_ordered() to change the "
        "Categorical to an ordered one\n"
    )

1558

def argsort(
    self, *, ascending: bool = True, kind: SortKind = "quicksort", **kwargs
):
    """
    Return the indices that would sort the Categorical.

    Missing values are sorted at the end.

    Parameters
    ----------
    ascending : bool, default True
        Whether the indices should result in an ascending
        or descending sort.
    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional
        Sorting algorithm.
    **kwargs:
        passed through to :func:`numpy.argsort`.

    Returns
    -------
    np.ndarray[np.intp]

    See Also
    --------
    numpy.ndarray.argsort

    Notes
    -----
    While an ordering is applied to the category values, arg-sorting
    in this context refers more to organizing and grouping together
    based on matching category values. Thus, this function can be
    called on an unordered Categorical instance unlike the functions
    'Categorical.min' and 'Categorical.max'.

    Examples
    --------
    >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort()
    array([2, 0, 1, 3])

    >>> cat = pd.Categorical(['b', 'b', 'a', 'c'],
    ...                      categories=['c', 'b', 'a'],
    ...                      ordered=True)
    >>> cat.argsort()
    array([3, 0, 1, 2])

    Missing values are placed at the end

    >>> cat = pd.Categorical([2, None, 1])
    >>> cat.argsort()
    array([2, 0, 1])
    """
    # All of the work is done by the parent class; this override only
    # carries the Categorical-specific docstring.
    return super().argsort(ascending=ascending, kind=kind, **kwargs)

1611

@overload
def sort_values(
    self,
    *,
    inplace: Literal[False] = ...,
    ascending: bool = ...,
    na_position: str = ...,
) -> Categorical:
    ...

@overload
def sort_values(
    self, *, inplace: Literal[True], ascending: bool = ..., na_position: str = ...
) -> None:
    ...

def sort_values(
    self,
    *,
    inplace: bool = False,
    ascending: bool = True,
    na_position: str = "last",
) -> Categorical | None:
    """
    Sort the Categorical by category value returning a new
    Categorical by default.

    While an ordering is applied to the category values, sorting in this
    context refers more to organizing and grouping together based on
    matching category values. Thus, this function can be called on an
    unordered Categorical instance unlike the functions 'Categorical.min'
    and 'Categorical.max'.

    Parameters
    ----------
    inplace : bool, default False
        Do operation in place.
    ascending : bool, default True
        Order ascending. Passing False orders descending. The
        ordering parameter provides the method by which the
        category values are organized.
    na_position : {'first', 'last'} (optional, default='last')
        'first' puts NaNs at the beginning
        'last' puts NaNs at the end

    Returns
    -------
    Categorical or None
        The sorted Categorical, or None when ``inplace=True``.

    Raises
    ------
    ValueError
        If ``na_position`` is neither 'first' nor 'last'.

    See Also
    --------
    Categorical.sort
    Series.sort_values

    Examples
    --------
    >>> c = pd.Categorical([1, 2, 2, 1, 5])
    >>> c
    [1, 2, 2, 1, 5]
    Categories (3, int64): [1, 2, 5]
    >>> c.sort_values()
    [1, 1, 2, 2, 5]
    Categories (3, int64): [1, 2, 5]
    >>> c.sort_values(ascending=False)
    [5, 2, 2, 1, 1]
    Categories (3, int64): [1, 2, 5]

    'sort_values' behaviour with NaNs. Note that 'na_position'
    is independent of the 'ascending' parameter:

    >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5])
    >>> c
    [NaN, 2, 2, NaN, 5]
    Categories (2, int64): [2, 5]
    >>> c.sort_values()
    [2, 2, 5, NaN, NaN]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(ascending=False)
    [5, 2, 2, NaN, NaN]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(na_position='first')
    [NaN, NaN, 2, 2, 5]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(ascending=False, na_position='first')
    [NaN, NaN, 5, 2, 2]
    Categories (2, int64): [2, 5]
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    if na_position not in ("last", "first"):
        raise ValueError(f"invalid na_position: {na_position!r}")

    order = nargsort(self, ascending=ascending, na_position=na_position)
    reordered = self._codes[order]

    if inplace:
        # Overwrite the existing codes buffer rather than rebinding it.
        self._codes[:] = reordered
        return None
    return self._from_backing_data(reordered)

1712

def _rank(
    self,
    *,
    axis: AxisInt = 0,
    method: str = "average",
    na_option: str = "keep",
    ascending: bool = True,
    pct: bool = False,
):
    """
    See Series.rank.__doc__.

    Ranking is performed on the values returned by ``_values_for_rank``.
    Only ``axis=0`` is implemented.
    """
    if axis != 0:
        raise NotImplementedError
    rank_kwargs = {
        "axis": axis,
        "method": method,
        "na_option": na_option,
        "ascending": ascending,
        "pct": pct,
    }
    return algorithms.rank(self._values_for_rank(), **rank_kwargs)

1736

def _values_for_rank(self):
    """
    For correctly ranking ordered categorical data. See GH#15420

    Ordered categorical data should be ranked on the basis of
    codes with -1 translated to NaN.

    Returns
    -------
    numpy.array

    """
    from pandas import Series

    if self.ordered:
        codes = self.codes
        missing = codes == -1
        if not missing.any():
            return codes
        # Switch to float so the -1 sentinel can be replaced by NaN.
        as_float = codes.astype("float64")
        as_float[missing] = np.nan
        return as_float

    if is_any_real_numeric_dtype(self.categories):
        return np.array(self)

    # Unordered, non-numeric categories: substitute each category with its
    # own rank so that rank can work on floats instead of being handed an
    # object array.
    category_ranks = Series(self.categories, copy=False).rank().values
    return np.array(self.rename_categories(category_ranks))

1768

1769 # ------------------------------------------------------------------

1770 # NDArrayBackedExtensionArray compat

1771

@property
def _codes(self) -> np.ndarray:
    # The codes are the backing ``_ndarray`` itself, returned as-is.
    return self._ndarray

1775

def _box_func(self, i: int):
    """
    Convert a single code into the user-facing value.

    Parameters
    ----------
    i : int
        A code from the backing array.

    Returns
    -------
    object
        ``np.nan`` for the -1 missing-value code, otherwise the category at
        position ``i``.
    """
    if i == -1:
        # np.nan, not the np.NaN alias, which was removed in NumPy 2.0.
        return np.nan
    return self.categories[i]

1780

def _unbox_scalar(self, key) -> int:
    """
    Return the code of category ``key``, typed like the backing codes array.

    The position is looked up with ``self.categories.get_loc`` and whatever
    that lookup raises for an unknown key propagates to the caller.
    """
    position = self.categories.get_loc(key)
    # searchsorted is very performance sensitive. Converting the code to
    # the same dtype as self.codes gives much faster performance.
    codes_scalar_type = self._ndarray.dtype.type
    return codes_scalar_type(position)

1787

1788 # ------------------------------------------------------------------

1789

def __iter__(self) -> Iterator:
    """
    Returns an Iterator over the values of this Categorical.
    """
    if self.ndim != 1:
        # Higher-dimensional data: yield one indexed element at a time.
        return (self[position] for position in range(len(self)))
    values = self._internal_get_values()
    return iter(values.tolist())

1798

def __contains__(self, key) -> bool:
    """
    Returns True if `key` is in this Categorical.
    """
    # A missing-value key matches when any entry is missing.
    key_is_na = is_valid_na_for_dtype(key, self.categories.dtype)
    if key_is_na:
        any_missing = self.isna().any()
        return bool(any_missing)

    return contains(self, key, container=self._codes)

1808

1809 # ------------------------------------------------------------------

1810 # Rendering Methods

1811

def _formatter(self, boxed: bool = False):
    """Return None; ``boxed`` is accepted but not used."""
    # Defer to CategoricalFormatter's formatter.
    return None

1815

def _tidy_repr(self, max_vals: int = 10, footer: bool = True) -> str:
    """
    Build a short repr that shows only ``max_vals`` values, split between
    the head and the tail, with an optional footer (shown by default).
    """
    head_count = max_vals // 2
    tail_count = max_vals - head_count

    head_repr = self[:head_count]._get_repr(length=False, footer=False)
    tail_repr = self[-tail_count:]._get_repr(length=False, footer=False)

    # Drop the last character of the head and the first of the tail (the
    # brackets enclosing each partial repr) and join them with an ellipsis.
    text = f"{head_repr[:-1]}, ..., {tail_repr[1:]}"
    if footer:
        text = f"{text}\n{self._repr_footer()}"

    return str(text)

1830

def _repr_categories(self) -> list[str]:
    """
    Return the base repr for the categories.

    When there are more categories than the ``display.max_categories``
    option allows (a value of 0 is treated as 10), only the leading and
    trailing categories are formatted, separated by ``"..."``.
    """
    limit = get_option("display.max_categories")
    if limit == 0:
        limit = 10

    from pandas.io.formats import format as fmt

    format_array = partial(
        fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC
    )
    categories = self.categories
    if len(categories) > limit:
        shown = limit // 2
        formatted = (
            format_array(categories[:shown])
            + ["..."]
            + format_array(categories[-shown:])
        )
    else:
        formatted = format_array(categories)

    # Strip all leading spaces, which format_array adds for columns...
    return [text.strip() for text in formatted]

1856

1857 def _repr_categories_info(self) -> str:

1858 """

1859 Returns a string representation of the footer.

1860 """

1861 category_strs = self._repr_categories()

1862 dtype = str(self.categories.dtype)

1863 levheader = f"Categories ({len(self.categories)}, {dtype}): "

1864 width, height = get_terminal_size()

1865 max_width = get_option("display.width") or width

1866 if console.in_ipython_frontend():

1867 # 0 = no breaks

1868 max_width = 0

1869 levstring = ""

1870 start = True

1871 cur_col_len = len(levheader) # header

1872 sep_len, sep = (3, " < ") if self.ordered else (2, ", ")

1873 linesep = f"{sep.rstrip()}\n" # remove whitespace

1874 for val in category_strs:

1875 if max_width != 0 and cur_col_len + sep_len + len(val) > max_width:

1876 levstring += linesep + (" " * (len(levheader) + 1))

1877 cur_col_len = len(levheader) + 1 # header + a whitespace

1878 elif not start:

1879 levstring += sep

1880 cur_col_len += len(val)

1881 levstring += val

1882 start = False

1883 # replace to simple save space by

1884 return f"{levheader}[{levstring.replace(' < ... < ', ' ... ')}]"

1885

def _repr_footer(self) -> str:
    """Return the footer text: a length line followed by the categories info."""
    length_line = f"Length: {len(self)}"
    categories_line = self._repr_categories_info()
    return f"{length_line}\n{categories_line}"

1889

def _get_repr(
    self, length: bool = True, na_rep: str = "NaN", footer: bool = True
) -> str:
    """
    Render this array as text with ``CategoricalFormatter``.

    ``length``, ``na_rep`` and ``footer`` are passed to the formatter
    unchanged.
    """
    from pandas.io.formats import format as fmt

    formatter = fmt.CategoricalFormatter(
        self, length=length, na_rep=na_rep, footer=footer
    )
    result = formatter.to_string()
    return str(result)

1900

def __repr__(self) -> str:
    """
    String representation.

    More than ten values are abbreviated with ``_tidy_repr``; an empty
    array is rendered as ``[]`` followed by the footer on the same line.
    """
    _maxlen = 10
    n_values = len(self._codes)

    if n_values > _maxlen:
        return self._tidy_repr(_maxlen)
    if n_values > 0:
        return self._get_repr(length=len(self) > _maxlen)

    # Empty: flatten the multi-line footer onto the same line as "[]".
    footer_text = self._get_repr(length=False, footer=True).replace("\n", ", ")
    return f"[], {footer_text}"

1915

1916 # ------------------------------------------------------------------

1917

def _validate_listlike(self, value):
    """
    Convert a list-like of values into codes for the backing array.

    Another Categorical is accepted only when its dtype equals ours.
    Other list-likes may contain only existing categories or missing
    values.

    Raises
    ------
    TypeError
        If ``value`` is a Categorical with a different dtype, or holds a
        non-missing value that is not an existing category.
    """
    # NB: here we assume scalar-like tuples have already been excluded
    value = extract_array(value, extract_numpy=True)

    if isinstance(value, Categorical):
        # require identical categories set
        if not is_dtype_equal(self.dtype, value.dtype):
            raise TypeError(
                "Cannot set a Categorical with another, "
                "without identical categories"
            )
        # is_dtype_equal implies categories_match_up_to_permutation
        recoded = self._encode_with_my_categories(value)
        return recoded._codes

    from pandas import Index

    # tupleize_cols=False for e.g. test_fillna_iterable_category GH#41914
    candidates = Index._with_infer(value, tupleize_cols=False)
    unknown = candidates.difference(self.categories)

    # Values outside the categories cannot be assigned, but setting
    # something to np.nan is always fine.
    if len(unknown) and not isna(unknown).all():
        raise TypeError(
            "Cannot setitem on a Categorical with a new "
            "category, set the categories first"
        )

    codes = self.categories.get_indexer(value)
    return codes.astype(self._ndarray.dtype, copy=False)

1950

def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]:
    """
    Compute the inverse of a categorical, returning
    a dict of categories -> indexers.

    *This is an internal function*

    Returns
    -------
    Dict[Hashable, np.ndarray[np.intp]]
        dict of categories -> indexers

    Examples
    --------
    >>> c = pd.Categorical(list('aabca'))
    >>> c
    ['a', 'a', 'b', 'c', 'a']
    Categories (3, object): ['a', 'b', 'c']
    >>> c.categories
    Index(['a', 'b', 'c'], dtype='object')
    >>> c.codes
    array([0, 0, 1, 2, 0], dtype=int8)
    >>> c._reverse_indexer()
    {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])}

    """
    categories = self.categories
    sorter, group_sizes = libalgos.groupsort_indexer(
        ensure_platform_int(self.codes), categories.size
    )
    # The running totals are slice boundaries into ``sorter``: consecutive
    # pairs delimit the positions belonging to each category in turn.
    bounds = ensure_int64(group_sizes).cumsum()
    return {
        category: sorter[start:stop]
        for category, start, stop in zip(categories, bounds, bounds[1:])
    }

1984

1985 # ------------------------------------------------------------------

1986 # Reductions

1987

def min(self, *, skipna: bool = True, **kwargs):
    """
    The minimum value of the object.

    Only ordered `Categoricals` have a minimum!

    Parameters
    ----------
    skipna : bool, default True
        Whether missing values are ignored. When False and any value is
        missing, NaN is returned.
    **kwargs
        Accepted for compatibility with NumPy and validated.

    Raises
    ------
    TypeError
        If the `Categorical` is not `ordered`.

    Returns
    -------
    min : the minimum of this `Categorical`, NA value if empty
    """
    nv.validate_minmax_axis(kwargs.get("axis", 0))
    nv.validate_min((), kwargs)
    self.check_for_ordered("min")

    codes = self._codes
    if len(codes) == 0:
        return self.dtype.na_value

    present = codes != -1
    if present.all():
        smallest = codes.min()
    elif skipna and present.any():
        smallest = codes[present].min()
    else:
        # Either every value is missing, or missing values are not skipped.
        return np.nan
    # The smallest code is converted into the result by the shared
    # reduction wrapper.
    return self._wrap_reduction_result(None, smallest)

2019

def max(self, *, skipna: bool = True, **kwargs):
    """
    The maximum value of the object.

    Only ordered `Categoricals` have a maximum!

    Parameters
    ----------
    skipna : bool, default True
        Whether missing values are ignored. When False and any value is
        missing, NaN is returned.
    **kwargs
        Accepted for compatibility with NumPy and validated.

    Raises
    ------
    TypeError
        If the `Categorical` is not `ordered`.

    Returns
    -------
    max : the maximum of this `Categorical`, NA if array is empty
    """
    nv.validate_minmax_axis(kwargs.get("axis", 0))
    nv.validate_max((), kwargs)
    self.check_for_ordered("max")

    codes = self._codes
    if len(codes) == 0:
        return self.dtype.na_value

    present = codes != -1
    if present.all():
        largest = codes.max()
    elif skipna and present.any():
        largest = codes[present].max()
    else:
        # Either every value is missing, or missing values are not skipped.
        return np.nan
    # The largest code is converted into the result by the shared
    # reduction wrapper.
    return self._wrap_reduction_result(None, largest)

2051

def _mode(self, dropna: bool = True) -> Categorical:
    """
    Return the mode(s) of the codes as a Categorical with the same dtype.

    Parameters
    ----------
    dropna : bool, default True
        When True, the missing-value mask is passed to the mode
        computation; otherwise no mask is passed.
    """
    codes = self._codes
    na_mask = self.isna() if dropna else None

    mode_codes = cast(np.ndarray, algorithms.mode(codes, mask=na_mask))
    # The result is reused as backing data, so its dtype must be unchanged.
    assert mode_codes.dtype == codes.dtype
    return self._from_backing_data(mode_codes)

2063

2064 # ------------------------------------------------------------------

2065 # ExtensionArray Interface

2066

def unique(self):
    """
    Return the unique values of this array as a ``Categorical``.

    As the examples below show, values are listed in order of first
    appearance and the full set of ``categories`` is retained.

    .. versionchanged:: 1.3.0

        Previously, unused categories were dropped from the new categories.

    Returns
    -------
    Categorical

    See Also
    --------
    pandas.unique
    CategoricalIndex.unique
    Series.unique : Return unique values of Series object.

    Examples
    --------
    >>> pd.Categorical(list("baabc")).unique()
    ['b', 'a', 'c']
    Categories (3, object): ['a', 'b', 'c']
    >>> pd.Categorical(list("baab"), categories=list("abc"), ordered=True).unique()
    ['b', 'a']
    Categories (3, object): ['a' < 'b' < 'c']
    """
    # The override exists only to attach the docstring above; the work is
    # done by the parent implementation.
    # pylint: disable=useless-parent-delegation
    return super().unique()

2097

def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
    """
    Check a quantile result and hand it back unchanged.

    Parameters
    ----------
    res_values : np.ndarray
        Result values whose dtype must equal that of ``self._ndarray``.

    Returns
    -------
    np.ndarray
        The same ``res_values`` object; no conversion is performed.
    """
    # make sure we have correct itemsize for resulting codes
    assert res_values.dtype == self._ndarray.dtype
    return res_values

2102

def equals(self, other: object) -> bool:
    """
    Return True if ``other`` is a Categorical equal to this one.

    Parameters
    ----------
    other : object
        Anything that is not a ``Categorical`` compares unequal.

    Returns
    -------
    bool
        True when the dtypes match (per
        ``_categories_match_up_to_permutation``) and, after re-encoding
        ``other`` with this array's categories, the codes are equal.
    """
    comparable = isinstance(
        other, Categorical
    ) and self._categories_match_up_to_permutation(other)
    if not comparable:
        return False

    # Bring `other` onto our category order so codes compare directly.
    recoded = self._encode_with_my_categories(other)
    return np.array_equal(self._codes, recoded._codes)

2121

2122 @classmethod

2123 def _concat_same_type(

2124 cls: type[CategoricalT], to_concat: Sequence[CategoricalT], axis: AxisInt = 0

2125 ) -> CategoricalT:

2126 from pandas.core.dtypes.concat import union_categoricals

2127

2128 first = to_concat[0]

2129 if axis >= first.ndim:

2130 raise ValueError(

2131 f"axis {axis} is out of bounds for array of dimension {first.ndim}"

2132 )

2133

2134 if axis == 1:

2135 # Flatten, concatenate then reshape

2136 if not all(x.ndim == 2 for x in to_concat):

2137 raise ValueError

2138

2139 # pass correctly-shaped to union_categoricals

2140 tc_flat = []

2141 for obj in to_concat:

2142 tc_flat.extend([obj[:, i] for i in range(obj.shape[1])])

2143

2144 res_flat = cls._concat_same_type(tc_flat, axis=0)

2145

2146 result = res_flat.reshape(len(first), -1, order="F")

2147 return result

2148

2149 result = union_categoricals(to_concat)

2150 return result

2151

2152 # ------------------------------------------------------------------

2153

2154 def _encode_with_my_categories(self, other: Categorical) -> Categorical:

2155 """

2156 Re-encode another categorical using this Categorical's categories.

2157

2158 Notes

2159 -----

2160 This assumes we have already checked

2161 self._categories_match_up_to_permutation(other).

2162 """

2163 # Indexing on codes is more efficient if categories are the same,

2164 # so we can apply some optimizations based on the degree of

2165 # dtype-matching.

2166 codes = recode_for_categories(

2167 other.codes, other.categories, self.categories, copy=False

2168 )

2169 return self._from_backing_data(codes)

2170

def _categories_match_up_to_permutation(self, other: Categorical) -> bool:
    """
    Return True if both categoricals have a matching dtype.

    The check is done by comparing ``hash(self.dtype)`` with
    ``hash(other.dtype)``; no element-wise comparison of the categories
    takes place here.

    Parameters
    ----------
    other : Categorical

    Returns
    -------
    bool
    """
    # NOTE(review): per the method name, the dtype hash is presumably
    # insensitive to category order for unordered dtypes -- confirm against
    # CategoricalDtype.__hash__.
    return hash(self.dtype) == hash(other.dtype)

2185

def describe(self) -> DataFrame:
    """
    Summarise how often each category occurs.

    Returns
    -------
    DataFrame
        Two columns, ``"counts"`` and ``"freqs"`` (each count divided by
        the sum of all counts), with the index named ``"categories"``.
        Counting is done with ``dropna=False``.
    """
    from pandas import Index
    from pandas.core.reshape.concat import concat

    tallies = self.value_counts(dropna=False)
    shares = tallies / tallies.sum()

    summary = concat([tallies, shares], axis=1)
    summary.columns = Index(["counts", "freqs"])
    summary.index.name = "categories"
    return summary

2206

def isin(self, values) -> npt.NDArray[np.bool_]:
    """
    Test each element of the Categorical for membership in `values`.

    The lookup is done on codes: `values` are translated to positions
    in ``self.categories`` and those positions are matched against
    ``self.codes``.

    Parameters
    ----------
    values : set or list-like
        The sequence of values to test. Passing in a single string will
        raise a ``TypeError``. Instead, turn a single string into a
        list of one element.

    Returns
    -------
    np.ndarray[bool]

    Raises
    ------
    TypeError
      * If `values` is not a set or list-like

    See Also
    --------
    pandas.Series.isin : Equivalent method on Series.

    Examples
    --------
    >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama',
    ...                     'hippo'])
    >>> s.isin(['cow', 'lama'])
    array([ True,  True,  True, False,  True, False])

    Passing a single string as ``s.isin('lama')`` will raise an error. Use
    a list of one element instead:

    >>> s.isin(['lama'])
    array([ True, False,  True, False,  True, False])
    """
    if not is_list_like(values):
        values_type = type(values).__name__
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a [{values_type}]"
        )

    values = sanitize_array(values, None, None)
    is_null = np.asarray(isna(values))
    positions = self.categories.get_indexer(values)

    # Keep positions of real categories, plus the -1 entries that stem
    # from null inputs so that missing values can match missing values.
    wanted = positions[is_null | (positions >= 0)]
    return algorithms.isin(self.codes, wanted)

2259

def _replace(self, *, to_replace, value, inplace: bool = False):
    """
    Replace category values, rebuilding categories and codes to match.

    Parameters
    ----------
    to_replace, value
        Forwarded to ``Series.replace`` on the categories. Entries of
        ``to_replace`` whose ``value`` is NA are removed as categories
        first.
    inplace : bool, default False
        Modify this array instead of a copy.

    Returns
    -------
    Categorical or None
        The modified copy when ``inplace`` is False; nothing is returned
        explicitly otherwise.
    """
    from pandas import Index

    inplace = validate_bool_kwarg(inplace, "inplace")
    cat = self if inplace else self.copy()

    # Replacing with NA means dropping the category: remove those that
    # actually exist and re-initialise `cat` with the reduced dtype.
    mask = isna(np.asarray(value))
    if mask.any():
        removals = np.asarray(to_replace)[mask]
        removals = cat.categories[cat.categories.isin(removals)]
        new_cat = cat.remove_categories(removals)
        NDArrayBacked.__init__(cat, new_cat.codes, new_cat.dtype)

    # Apply the replacement to the categories themselves rather than to
    # every element.
    ser = cat.categories.to_series()
    ser = ser.replace(to_replace=to_replace, value=value)

    # One entry per current category, holding its post-replacement value
    # (may contain duplicates).
    all_values = Index(ser)

    # GH51016: maintain order of existing categories
    idxr = cat.categories.get_indexer_for(all_values)
    locs = np.arange(len(ser))
    # Values that are already categories sort by their existing position;
    # new values keep the position of the category they replaced.
    locs = np.where(idxr == -1, locs, idxr)
    locs = locs.argsort()

    new_categories = ser.take(locs)
    new_categories = new_categories.drop_duplicates(keep="first")
    new_categories = Index(new_categories)
    # Map the old codes (which index into `all_values`) onto the
    # de-duplicated categories.
    new_codes = recode_for_categories(
        cat._codes, all_values, new_categories, copy=False
    )
    new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered)
    NDArrayBacked.__init__(cat, new_codes, new_dtype)

    if not inplace:
        return cat

2295

2296 # ------------------------------------------------------------------------

2297 # String methods interface

2298 def _str_map(

2299 self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True

2300 ):

2301 # Optimization to apply the callable `f` to the categories once

2302 # and rebuild the result by `take`ing from the result with the codes.

2303 # Returns the same type as the object-dtype implementation though.

2304 from pandas.core.arrays import PandasArray

2305

2306 categories = self.categories

2307 codes = self.codes

2308 result = PandasArray(categories.to_numpy())._str_map(f, na_value, dtype)

2309 return take_nd(result, codes, fill_value=na_value)

2310

2311 def _str_get_dummies(self, sep: str = "|"):

2312 # sep may not be in categories. Just bail on this.

2313 from pandas.core.arrays import PandasArray

2314

2315 return PandasArray(self.astype(str))._str_get_dummies(sep)

2316

2317

2318# The Series.cat accessor

2319

2320

@delegate_names(
    delegate=Categorical, accessors=["categories", "ordered"], typ="property"
)
@delegate_names(
    delegate=Categorical,
    accessors=[
        "rename_categories",
        "reorder_categories",
        "add_categories",
        "remove_categories",
        "remove_unused_categories",
        "set_categories",
        "as_ordered",
        "as_unordered",
    ],
    typ="method",
)
class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
    """
    Accessor object for categorical properties of the Series values.

    The properties and methods named in the ``delegate_names`` decorators
    are forwarded to the underlying ``Categorical``.

    Parameters
    ----------
    data : Series or CategoricalIndex

    Examples
    --------
    >>> s = pd.Series(list("abbccc")).astype("category")
    >>> s
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.categories
    Index(['a', 'b', 'c'], dtype='object')

    >>> s.cat.rename_categories(list("cba"))
    0    c
    1    b
    2    b
    3    a
    4    a
    5    a
    dtype: category
    Categories (3, object): ['c', 'b', 'a']

    >>> s.cat.reorder_categories(list("cba"))
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['c', 'b', 'a']

    >>> s.cat.add_categories(["d", "e"])
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (5, object): ['a', 'b', 'c', 'd', 'e']

    >>> s.cat.remove_categories(["a", "c"])
    0    NaN
    1      b
    2      b
    3    NaN
    4    NaN
    5    NaN
    dtype: category
    Categories (1, object): ['b']

    >>> s1 = s.cat.add_categories(["d", "e"])
    >>> s1.cat.remove_unused_categories()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.set_categories(list("abcde"))
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (5, object): ['a', 'b', 'c', 'd', 'e']

    >>> s.cat.as_ordered()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a' < 'b' < 'c']

    >>> s.cat.as_unordered()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']
    """

    def __init__(self, data) -> None:
        # Reject non-categorical data before storing anything.
        self._validate(data)
        # Keep the values plus the index and name needed to rebuild a
        # Series around delegated results.
        self._parent = data.values
        self._index = data.index
        self._name = data.name
        self._freeze()

    @staticmethod
    def _validate(data):
        """
        Raise AttributeError unless ``data`` has a categorical dtype.
        """
        if not is_categorical_dtype(data.dtype):
            raise AttributeError("Can only use .cat accessor with a 'category' dtype")

    def _delegate_property_get(self, name):
        """
        Read the property ``name`` from the wrapped values.
        """
        return getattr(self._parent, name)

    def _delegate_property_set(self, name, new_values):
        """
        Assign ``new_values`` to the property ``name`` on the wrapped values.
        """
        return setattr(self._parent, name, new_values)

    @property
    def codes(self) -> Series:
        """
        Return Series of codes as well as the index.

        The result carries the original index; no name is passed to the
        Series constructor.
        """
        from pandas import Series

        return Series(self._parent.codes, index=self._index)

    def _delegate_method(self, name, *args, **kwargs):
        """
        Call method ``name`` on the wrapped values and re-wrap the result.

        Returns a Series with the original index and name when the call
        yields a result; when the call returns None, nothing is returned.
        """
        from pandas import Series

        method = getattr(self._parent, name)
        res = method(*args, **kwargs)
        if res is not None:
            return Series(res, index=self._index, name=self._name)

2478

2479

2480# utility routines

2481

2482

2483def _get_codes_for_values(values, categories: Index) -> np.ndarray:

2484 """

2485 utility routine to turn values into codes given the specified categories

2486

2487 If `values` is known to be a Categorical, use recode_for_categories instead.

2488 """

2489 if values.ndim > 1:

2490 flat = values.ravel()

2491 codes = _get_codes_for_values(flat, categories)

2492 return codes.reshape(values.shape)

2493

2494 codes = categories.get_indexer_for(values)

2495 return coerce_indexer_dtype(codes, categories)

2496

2497

def recode_for_categories(
    codes: np.ndarray, old_categories, new_categories, copy: bool = True
) -> np.ndarray:
    """
    Convert a set of codes to a new set of categories.

    Parameters
    ----------
    codes : np.ndarray
    old_categories, new_categories : Index
    copy: bool, default True
        Whether to copy if the codes are unchanged.

    Returns
    -------
    new_codes : np.ndarray
        Integer codes referring to ``new_categories``; ``-1`` marks
        values whose old category is absent from ``new_categories``.

    Examples
    --------
    >>> old_cat = pd.Index(['b', 'a', 'c'])
    >>> new_cat = pd.Index(['a', 'b'])
    >>> codes = np.array([0, 1, 1, 2])
    >>> recode_for_categories(codes, old_cat, new_cat)
    array([ 1,  0,  0, -1], dtype=int8)
    """
    # Nothing to translate when there are no old categories (all codes are
    # null anyway) or when both category sets are equal.
    if len(old_categories) == 0 or new_categories.equals(old_categories):
        return codes.copy() if copy else codes

    # Position of every old category inside the new categories.
    old_to_new = coerce_indexer_dtype(
        new_categories.get_indexer(old_categories), new_categories
    )
    return take_nd(old_to_new, codes, fill_value=-1)

2539

2540

def factorize_from_iterable(values) -> tuple[np.ndarray, Index]:
    """
    Split `values` into integer `codes` and the `categories` they refer to.

    Parameters
    ----------
    values : list-like

    Returns
    -------
    codes : ndarray
    categories : Index
        If `values` has a categorical dtype, then `categories` is
        a CategoricalIndex keeping the categories and order of `values`.

    Raises
    ------
    TypeError
        If `values` is not list-like.
    """
    from pandas import CategoricalIndex

    if not is_list_like(values):
        raise TypeError("Input must be list-like")

    if not is_categorical_dtype(values):
        # `ordered` does not matter here: only the resulting categories and
        # codes are used, and their order is independent of it. See GH #15457
        plain = Categorical(values, ordered=False)
        return plain.codes, plain.categories

    values = extract_array(values)
    # Build a Categorical that lists every category exactly once, i.e.
    # with codes 0 .. n_categories - 1, and wrap it in a CategoricalIndex.
    every_code = np.arange(len(values.categories), dtype=values.codes.dtype)
    all_categories = Categorical.from_codes(every_code, dtype=values.dtype)
    return values.codes, CategoricalIndex(all_categories)

2580

2581

def factorize_from_iterables(iterables) -> tuple[list[np.ndarray], list[Index]]:
    """
    Apply `factorize_from_iterable` to every element of `iterables`.

    Parameters
    ----------
    iterables : list-like of list-likes

    Returns
    -------
    codes : list of ndarrays
    categories : list of Indexes

    Notes
    -----
    See `factorize_from_iterable` for more info.
    """
    if not len(iterables):
        # For consistency, it should return two empty lists.
        return [], []

    pairs = [factorize_from_iterable(item) for item in iterables]
    all_codes = [pair[0] for pair in pairs]
    all_categories = [pair[1] for pair in pairs]
    return all_codes, all_categories