Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/arrays/categorical.py: 26%

1from __future__ import annotations

3from csv import QUOTE_NONNUMERIC

4from functools import partial

5import operator

6from shutil import get_terminal_size

7from typing import (

8 TYPE_CHECKING,

9 Literal,

10 cast,

11 overload,

12)

13import warnings

15import numpy as np

17from pandas._config import get_option

19from pandas._libs import (

20 NaT,

21 algos as libalgos,

22 lib,

23)

24from pandas._libs.arrays import NDArrayBacked

25from pandas.compat.numpy import function as nv

26from pandas.util._exceptions import find_stack_level

27from pandas.util._validators import validate_bool_kwarg

29from pandas.core.dtypes.cast import (

30 coerce_indexer_dtype,

31 find_common_type,

32)

33from pandas.core.dtypes.common import (

34 ensure_int64,

35 ensure_platform_int,

36 is_any_real_numeric_dtype,

37 is_bool_dtype,

38 is_dict_like,

39 is_hashable,

40 is_integer_dtype,

41 is_list_like,

42 is_scalar,

43 needs_i8_conversion,

44 pandas_dtype,

45)

46from pandas.core.dtypes.dtypes import (

47 ArrowDtype,

48 CategoricalDtype,

49 CategoricalDtypeType,

50 ExtensionDtype,

51)

52from pandas.core.dtypes.generic import (

53 ABCIndex,

54 ABCSeries,

55)

56from pandas.core.dtypes.missing import (

57 is_valid_na_for_dtype,

58 isna,

59)

61from pandas.core import (

62 algorithms,

63 arraylike,

64 ops,

65)

66from pandas.core.accessor import (

67 PandasDelegate,

68 delegate_names,

69)

70from pandas.core.algorithms import (

71 factorize,

72 take_nd,

73)

74from pandas.core.arrays._mixins import (

75 NDArrayBackedExtensionArray,

76 ravel_compat,

77)

78from pandas.core.base import (

79 ExtensionArray,

80 NoNewAttributesMixin,

81 PandasObject,

82)

83import pandas.core.common as com

84from pandas.core.construction import (

85 extract_array,

86 sanitize_array,

87)

88from pandas.core.ops.common import unpack_zerodim_and_defer

89from pandas.core.sorting import nargsort

90from pandas.core.strings.object_array import ObjectStringArrayMixin

92from pandas.io.formats import console

94if TYPE_CHECKING:

95 from collections.abc import (

96 Hashable,

97 Iterator,

98 Sequence,

99 )

100

101 from pandas._typing import (

102 ArrayLike,

103 AstypeArg,

104 AxisInt,

105 Dtype,

106 DtypeObj,

107 NpDtype,

108 Ordered,

109 Self,

110 Shape,

111 SortKind,

112 npt,

113 )

114

115 from pandas import (

116 DataFrame,

117 Index,

118 Series,

119 )

120

121

122def _cat_compare_op(op):

123 opname = f"__{op.__name__}__"

124 fill_value = op is operator.ne

125

126 @unpack_zerodim_and_defer(opname)

127 def func(self, other):

128 hashable = is_hashable(other)

129 if is_list_like(other) and len(other) != len(self) and not hashable:

130 # in hashable case we may have a tuple that is itself a category

131 raise ValueError("Lengths must match.")

132

133 if not self.ordered:

134 if opname in ["__lt__", "__gt__", "__le__", "__ge__"]:

135 raise TypeError(

136 "Unordered Categoricals can only compare equality or not"

137 )

138 if isinstance(other, Categorical):

139 # Two Categoricals can only be compared if the categories are

140 # the same (maybe up to ordering, depending on ordered)

141

142 msg = "Categoricals can only be compared if 'categories' are the same."

143 if not self._categories_match_up_to_permutation(other):

144 raise TypeError(msg)

145

146 if not self.ordered and not self.categories.equals(other.categories):

147 # both unordered and different order

148 other_codes = recode_for_categories(

149 other.codes, other.categories, self.categories, copy=False

150 )

151 else:

152 other_codes = other._codes

153

154 ret = op(self._codes, other_codes)

155 mask = (self._codes == -1) | (other_codes == -1)

156 if mask.any():

157 ret[mask] = fill_value

158 return ret

159

160 if hashable:

161 if other in self.categories:

162 i = self._unbox_scalar(other)

163 ret = op(self._codes, i)

164

165 if opname not in {"__eq__", "__ge__", "__gt__"}:

166 # GH#29820 performance trick; get_loc will always give i>=0,

167 # so in the cases (__ne__, __le__, __lt__) the setting

168 # here is a no-op, so can be skipped.

169 mask = self._codes == -1

170 ret[mask] = fill_value

171 return ret

172 else:

173 return ops.invalid_comparison(self, other, op)

174 else:

175 # allow categorical vs object dtype array comparisons for equality

176 # these are only positional comparisons

177 if opname not in ["__eq__", "__ne__"]:

178 raise TypeError(

179 f"Cannot compare a Categorical for op {opname} with "

180 f"type {type(other)}.\nIf you want to compare values, "

181 "use 'np.asarray(cat) <op> other'."

182 )

183

184 if isinstance(other, ExtensionArray) and needs_i8_conversion(other.dtype):

185 # We would return NotImplemented here, but that messes up

186 # ExtensionIndex's wrapped methods

187 return op(other, self)

188 return getattr(np.array(self), opname)(np.array(other))

189

190 func.__name__ = opname

191

192 return func

193

194

def contains(cat, key, container) -> bool:
    """
    Check whether ``key`` is a category of ``cat`` whose location is in ``container``.

    Shared helper for :meth:`Categorical.__contains__` and
    :meth:`CategoricalIndex.__contains__`.

    Parameters
    ----------
    cat : :class:`Categorical` or :class:`CategoricalIndex`
        Object whose ``categories`` are searched for ``key``.
    key : a hashable object
        The key to check membership for.
    container : Container (e.g. list-like or mapping)
        The container that must hold the location of ``key``.

    Returns
    -------
    is_in : bool
        True if ``key`` is in ``cat.categories`` and the location of ``key``
        in ``categories`` is in ``container``, else False.

    Raises
    ------
    TypeError
        If ``key`` is not hashable.

    Notes
    -----
    NaN values are not handled here; check for them separately before
    calling this function.
    """
    # Fail early (TypeError) on unhashable keys.
    hash(key)

    # A key that cannot be located in the categories cannot be in the
    # container either.
    try:
        position = cat.categories.get_loc(key)
    except (KeyError, TypeError):
        return False

    # ``position`` is where ``key`` sits in the categories, which is also the
    # value that stands for ``key`` inside ``container``. A key may therefore
    # be a category and still be absent from the container, e.g.
    #   'b' in Categorical(['a'], categories=['a', 'b'])  # False
    if not is_scalar(position):
        # if categories is an IntervalIndex, the location is an array.
        return any(pos in container for pos in position)
    return position in container

244

245

246class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin):

247 """

248 Represent a categorical variable in classic R / S-plus fashion.

249

250 `Categoricals` can only take on a limited, and usually fixed, number

251 of possible values (`categories`). In contrast to statistical categorical

252 variables, a `Categorical` might have an order, but numerical operations

253 (additions, divisions, ...) are not possible.

254

255 All values of the `Categorical` are either in `categories` or `np.nan`.

256 Assigning values outside of `categories` will raise a `ValueError`. Order

257 is defined by the order of the `categories`, not lexical order of the

258 values.

259

260 Parameters

261 ----------

262 values : list-like

263 The values of the categorical. If categories are given, values not in

264 categories will be replaced with NaN.

265 categories : Index-like (unique), optional

266 The unique categories for this categorical. If not given, the

267 categories are assumed to be the unique values of `values` (sorted, if

268 possible, otherwise in the order in which they appear).

269 ordered : bool, default False

270 Whether or not this categorical is treated as an ordered categorical.

271 If True, the resulting categorical will be ordered.

272 An ordered categorical respects, when sorted, the order of its

273 `categories` attribute (which in turn is the `categories` argument, if

274 provided).

275 dtype : CategoricalDtype

276 An instance of ``CategoricalDtype`` to use for this categorical.

277

278 Attributes

279 ----------

280 categories : Index

281 The categories of this categorical.

282 codes : ndarray

283 The codes (integer positions, which point to the categories) of this

284 categorical, read only.

285 ordered : bool

286 Whether or not this Categorical is ordered.

287 dtype : CategoricalDtype

288 The instance of ``CategoricalDtype`` storing the ``categories``

289 and ``ordered``.

290

291 Methods

292 -------

293 from_codes

294 __array__

295

296 Raises

297 ------

298 ValueError

299 If the categories do not validate.

300 TypeError

301 If an explicit ``ordered=True`` is given but no `categories` and the

302 `values` are not sortable.

303

304 See Also

305 --------

306 CategoricalDtype : Type for categorical data.

307 CategoricalIndex : An Index with an underlying ``Categorical``.

308

309 Notes

310 -----

311 See the `user guide

312 <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`__

313 for more.

314

315 Examples

316 --------

317 >>> pd.Categorical([1, 2, 3, 1, 2, 3])

318 [1, 2, 3, 1, 2, 3]

319 Categories (3, int64): [1, 2, 3]

320

321 >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])

322 ['a', 'b', 'c', 'a', 'b', 'c']

323 Categories (3, object): ['a', 'b', 'c']

324

325 Missing values are not included as a category.

326

327 >>> c = pd.Categorical([1, 2, 3, 1, 2, 3, np.nan])

328 >>> c

329 [1, 2, 3, 1, 2, 3, NaN]

330 Categories (3, int64): [1, 2, 3]

331

332 However, their presence is indicated in the `codes` attribute

333 by code `-1`.

334

335 >>> c.codes

336 array([ 0, 1, 2, 0, 1, 2, -1], dtype=int8)

337

338 Ordered `Categoricals` can be sorted according to the custom order

339 of the categories and can have a min and max value.

340

341 >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True,

342 ... categories=['c', 'b', 'a'])

343 >>> c

344 ['a', 'b', 'c', 'a', 'b', 'c']

345 Categories (3, object): ['c' < 'b' < 'a']

346 >>> c.min()

347 'c'

348 """

349

350 # For comparisons, so that numpy uses our implementation of the compare

351 # ops, which raise

352 __array_priority__ = 1000

353 # tolist is not actually deprecated, just suppressed in the __dir__

354 _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"])

355 _typ = "categorical"

356

357 _dtype: CategoricalDtype

358

@classmethod
# error: Argument 2 of "_simple_new" is incompatible with supertype
# "NDArrayBacked"; supertype defines the argument type as
# "Union[dtype[Any], ExtensionDtype]"
def _simple_new(  # type: ignore[override]
    cls, codes: np.ndarray, dtype: CategoricalDtype
) -> Self:
    """
    Construct an instance from ``codes`` and ``dtype`` via the parent class.

    Parameters
    ----------
    codes : np.ndarray
        Codes, passed through ``coerce_indexer_dtype`` using
        ``dtype.categories`` before being stored.
    dtype : CategoricalDtype
        Merged onto an unordered ``CategoricalDtype`` with ``update_dtype``
        before being stored.
    """
    # NB: not _quite_ as simple as the "usual" _simple_new, because both
    # arguments are normalised before being handed to the parent.
    indexer = coerce_indexer_dtype(codes, dtype.categories)
    resolved_dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
    return super()._simple_new(indexer, resolved_dtype)

370

def __init__(
    self,
    values,
    categories=None,
    ordered=None,
    dtype: Dtype | None = None,
    fastpath: bool | lib.NoDefault = lib.no_default,
    copy: bool = True,
) -> None:
    """
    Build the codes and dtype for ``values`` and initialise the array.

    ``fastpath`` is deprecated; passing it emits a ``DeprecationWarning``.
    """
    if fastpath is lib.no_default:
        fastpath = False
    else:
        # GH#20110
        warnings.warn(
            "The 'fastpath' keyword in Categorical is deprecated and will "
            "be removed in a future version. Use Categorical.from_codes instead",
            DeprecationWarning,
            stacklevel=find_stack_level(),
        )

    # From here on ``dtype`` is always a CategoricalDtype, but its categories
    # may still be None; in that case they are inferred by the factorization
    # step further below.
    dtype = CategoricalDtype._from_values_or_dtype(
        values, categories, ordered, dtype
    )

    if fastpath:
        fast_codes = coerce_indexer_dtype(values, dtype.categories)
        fast_dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
        super().__init__(fast_codes, fast_dtype)
        return

    if not is_list_like(values):
        # GH#38433
        raise TypeError("Categorical input must be list-like")

    # null_mask flags the missing values that are excluded from inference.
    # Only missing values in list-likes (not arrays/ndframes) are tracked.
    null_mask = np.array(False)

    # sanitize input
    values_dtype = getattr(values, "dtype", None)
    if isinstance(values_dtype, CategoricalDtype):
        if dtype.categories is None:
            dtype = CategoricalDtype(values.categories, dtype.ordered)
    elif not isinstance(values, (ABCIndex, ABCSeries, ExtensionArray)):
        values = com.convert_to_list_like(values)
        if isinstance(values, list) and len(values) == 0:
            # By convention, empty lists result in object dtype:
            values = np.array([], dtype=object)
        elif isinstance(values, np.ndarray):
            if values.ndim > 1:
                # preempt sanitize_array from raising ValueError
                raise NotImplementedError(
                    "> 1 ndim Categorical are not supported at this time"
                )
            values = sanitize_array(values, None)
        else:
            # i.e. must be a list
            sanitized = sanitize_array(values, None)
            null_mask = isna(sanitized)
            if null_mask.any():
                # Null values are dropped here and re-inserted below,
                # grep "full_codes"
                non_null = [values[pos] for pos in np.where(~null_mask)[0]]

                # GH#44900 Do not cast to float if we have only missing values
                sanitize_dtype = (
                    None
                    if (non_null or sanitized.dtype == "object")
                    else sanitized.dtype
                )

                sanitized = sanitize_array(non_null, None, dtype=sanitize_dtype)
            values = sanitized

    if dtype.categories is None:
        if isinstance(values.dtype, ArrowDtype) and issubclass(
            values.dtype.type, CategoricalDtypeType
        ):
            combined = values._pa_array.combine_chunks()
            categories = combined.dictionary.to_pandas(types_mapper=ArrowDtype)
            codes = combined.indices.to_numpy()
            dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered)
        else:
            if not isinstance(values, ABCIndex):
                # in particular RangeIndex xref test_index_equal_range_categories
                values = sanitize_array(values, None)
            try:
                codes, categories = factorize(values, sort=True)
            except TypeError as err:
                codes, categories = factorize(values, sort=False)
                if dtype.ordered:
                    # The data is not sortable, so an ordered result needs
                    # the caller to state the category order explicitly.
                    raise TypeError(
                        "'values' is not ordered, please "
                        "explicitly specify the categories order "
                        "by passing in a categories argument."
                    ) from err

            # we're inferring from values
            dtype = CategoricalDtype(categories, dtype.ordered)

    elif isinstance(values.dtype, CategoricalDtype):
        previous_codes = extract_array(values)._codes
        codes = recode_for_categories(
            previous_codes, values.dtype.categories, dtype.categories, copy=copy
        )

    else:
        codes = _get_codes_for_values(values, dtype.categories)

    if null_mask.any():
        # Reinsert -1 placeholders for previously removed missing values
        full_codes = -np.ones(null_mask.shape, dtype=codes.dtype)
        full_codes[~null_mask] = codes
        codes = full_codes

    dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
    final_codes = coerce_indexer_dtype(codes, dtype.categories)

    super().__init__(final_codes, dtype)

493

@property
def dtype(self) -> CategoricalDtype:
    """
    The :class:`~pandas.api.types.CategoricalDtype` for this instance.

    Returns the dtype object stored on the instance.

    Examples
    --------
    >>> cat = pd.Categorical(['a', 'b'], ordered=True)
    >>> cat
    ['a', 'b']
    Categories (2, object): ['a' < 'b']
    >>> cat.dtype
    CategoricalDtype(categories=['a', 'b'], ordered=True, categories_dtype=object)
    """
    stored_dtype = self._dtype
    return stored_dtype

509

510 @property

511 def _internal_fill_value(self) -> int:

512 # using the specific numpy integer instead of python int to get

513 # the correct dtype back from _quantile in the all-NA case

514 dtype = self._ndarray.dtype

515 return dtype.type(-1)

516

517 @classmethod

518 def _from_sequence(

519 cls, scalars, *, dtype: Dtype | None = None, copy: bool = False

520 ) -> Self:

521 return cls(scalars, dtype=dtype, copy=copy)

522

523 @classmethod

524 def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self:

525 if dtype is None:

526 # The _from_scalars strictness doesn't make much sense in this case.

527 raise NotImplementedError

528

529 res = cls._from_sequence(scalars, dtype=dtype)

530

531 # if there are any non-category elements in scalars, these will be

532 # converted to NAs in res.

533 mask = isna(scalars)

534 if not (mask == res.isna()).all():

535 # Some non-category element in scalars got converted to NA in res.

536 raise ValueError

537 return res

538

@overload
def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
    ...

@overload
def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray:
    ...

@overload
def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike:
    ...

def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
    """
    Coerce this type to another dtype

    Parameters
    ----------
    dtype : numpy dtype or pandas type
    copy : bool, default True
        By default, astype always returns a newly allocated object.
        If copy is set to False and dtype is categorical, the original
        object is returned.

    Returns
    -------
    np.ndarray or ExtensionArray
        The converted values.

    Raises
    ------
    ValueError
        If an integer dtype is requested while missing values are present,
        or if the categories cannot be cast to ``dtype``.
    """
    dtype = pandas_dtype(dtype)
    if self.dtype is dtype:
        result = self.copy() if copy else self

    elif isinstance(dtype, CategoricalDtype):
        # GH 10696/18593/18630
        dtype = self.dtype.update_dtype(dtype)
        self = self.copy() if copy else self
        result = self._set_dtype(dtype)

    elif isinstance(dtype, ExtensionDtype):
        return super().astype(dtype, copy=copy)

    elif dtype.kind in "iu" and self.isna().any():
        raise ValueError("Cannot convert float NaN to integer")

    elif len(self.codes) == 0 or len(self.categories) == 0:
        if copy:
            result = np.array(self, dtype=dtype)
        else:
            # ``copy=False`` here means "copy only if needed". NumPy >= 2
            # instead interprets ``np.array(..., copy=False)`` as "never
            # copy" and raises when a copy is unavoidable; ``np.asarray``
            # has the intended semantics on every NumPy version.
            result = np.asarray(self, dtype=dtype)

    else:
        # GH8628 (PERF): astype category codes instead of astyping array
        new_cats = self.categories._values

        try:
            new_cats = new_cats.astype(dtype=dtype, copy=copy)
            fill_value = self.categories._na_value
            if not is_valid_na_for_dtype(fill_value, dtype):
                fill_value = lib.item_from_zerodim(
                    np.array(self.categories._na_value).astype(dtype)
                )
        except (
            TypeError,  # downstream error msg for CategoricalIndex is misleading
            ValueError,
        ) as err:
            msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}"
            # Chain explicitly so the underlying cast failure is reported as
            # the direct cause.
            raise ValueError(msg) from err

        result = take_nd(
            new_cats, ensure_platform_int(self._codes), fill_value=fill_value
        )

    return result

609

def to_list(self):
    """
    Deprecated alias for tolist.

    Emits a ``FutureWarning`` and returns ``self.tolist()``.
    """
    # GH#51254
    message = (
        "Categorical.to_list is deprecated and will be removed in a future "
        "version. Use obj.tolist() instead"
    )
    warnings.warn(message, FutureWarning, stacklevel=find_stack_level())
    return self.tolist()

622

623 @classmethod

624 def _from_inferred_categories(

625 cls, inferred_categories, inferred_codes, dtype, true_values=None

626 ) -> Self:

627 """

628 Construct a Categorical from inferred values.

629

630 For inferred categories (`dtype` is None) the categories are sorted.

631 For explicit `dtype`, the `inferred_categories` are cast to the

632 appropriate type.

633

634 Parameters

635 ----------

636 inferred_categories : Index

637 inferred_codes : Index

638 dtype : CategoricalDtype or 'category'

639 true_values : list, optional

640 If none are provided, the default ones are

641 "True", "TRUE", and "true."

642

643 Returns

644 -------

645 Categorical

646 """

647 from pandas import (

648 Index,

649 to_datetime,

650 to_numeric,

651 to_timedelta,

652 )

653

654 cats = Index(inferred_categories)

655 known_categories = (

656 isinstance(dtype, CategoricalDtype) and dtype.categories is not None

657 )

658

659 if known_categories:

660 # Convert to a specialized type with `dtype` if specified.

661 if is_any_real_numeric_dtype(dtype.categories.dtype):

662 cats = to_numeric(inferred_categories, errors="coerce")

663 elif lib.is_np_dtype(dtype.categories.dtype, "M"):

664 cats = to_datetime(inferred_categories, errors="coerce")

665 elif lib.is_np_dtype(dtype.categories.dtype, "m"):

666 cats = to_timedelta(inferred_categories, errors="coerce")

667 elif is_bool_dtype(dtype.categories.dtype):

668 if true_values is None:

669 true_values = ["True", "TRUE", "true"]

670

671 # error: Incompatible types in assignment (expression has type

672 # "ndarray", variable has type "Index")

673 cats = cats.isin(true_values) # type: ignore[assignment]

674

675 if known_categories:

676 # Recode from observation order to dtype.categories order.

677 categories = dtype.categories

678 codes = recode_for_categories(inferred_codes, cats, categories)

679 elif not cats.is_monotonic_increasing:

680 # Sort categories and recode for unknown categories.

681 unsorted = cats.copy()

682 categories = cats.sort_values()

683

684 codes = recode_for_categories(inferred_codes, unsorted, categories)

685 dtype = CategoricalDtype(categories, ordered=False)

686 else:

687 dtype = CategoricalDtype(cats, ordered=False)

688 codes = inferred_codes

689

690 return cls._simple_new(codes, dtype=dtype)

691

692 @classmethod

693 def from_codes(

694 cls,

695 codes,

696 categories=None,

697 ordered=None,

698 dtype: Dtype | None = None,

699 validate: bool = True,

700 ) -> Self:

701 """

702 Make a Categorical type from codes and categories or dtype.

703

704 This constructor is useful if you already have codes and

705 categories/dtype and so do not need the (computation intensive)

706 factorization step, which is usually done on the constructor.

707

708 If your data does not follow this convention, please use the normal

709 constructor.

710

711 Parameters

712 ----------

713 codes : array-like of int

714 An integer array, where each integer points to a category in

715 categories or dtype.categories, or else is -1 for NaN.

716 categories : index-like, optional

717 The categories for the categorical. Items need to be unique.

718 If the categories are not given here, then they must be provided

719 in `dtype`.

720 ordered : bool, optional

721 Whether or not this categorical is treated as an ordered

722 categorical. If not given here or in `dtype`, the resulting

723 categorical will be unordered.

724 dtype : CategoricalDtype or "category", optional

725 If :class:`CategoricalDtype`, cannot be used together with

726 `categories` or `ordered`.

727 validate : bool, default True

728 If True, validate that the codes are valid for the dtype.

729 If False, don't validate that the codes are valid. Be careful about skipping

730 validation, as invalid codes can lead to severe problems, such as segfaults.

731

732 .. versionadded:: 2.1.0

733

734 Returns

735 -------

736 Categorical

737

738 Examples

739 --------

740 >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True)

741 >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype)

742 ['a', 'b', 'a', 'b']

743 Categories (2, object): ['a' < 'b']

744 """

745 dtype = CategoricalDtype._from_values_or_dtype(

746 categories=categories, ordered=ordered, dtype=dtype

747 )

748 if dtype.categories is None:

749 msg = (

750 "The categories must be provided in 'categories' or "

751 "'dtype'. Both were None."

752 )

753 raise ValueError(msg)

754

755 if validate:

756 # beware: non-valid codes may segfault

757 codes = cls._validate_codes_for_dtype(codes, dtype=dtype)

758

759 return cls._simple_new(codes, dtype=dtype)

760

761 # ------------------------------------------------------------------

762 # Categories/Codes/Ordered

763

@property
def categories(self) -> Index:
    """
    The categories of this categorical.

    Returns the ``categories`` held by :attr:`dtype`. To change the
    categories, use one of the methods listed under See Also.

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    For :class:`pandas.Series`:

    >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
    >>> ser.cat.categories
    Index(['a', 'b', 'c'], dtype='object')

    >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'a'], categories=['b', 'c', 'd'])
    >>> ser = pd.Series(raw_cat)
    >>> ser.cat.categories
    Index(['b', 'c', 'd'], dtype='object')

    For :class:`pandas.Categorical`:

    >>> cat = pd.Categorical(['a', 'b'], ordered=True)
    >>> cat.categories
    Index(['a', 'b'], dtype='object')

    For :class:`pandas.CategoricalIndex`:

    >>> ci = pd.CategoricalIndex(['a', 'c', 'b', 'a', 'c', 'b'])
    >>> ci.categories
    Index(['a', 'b', 'c'], dtype='object')

    >>> ci = pd.CategoricalIndex(['a', 'c'], categories=['c', 'b', 'a'])
    >>> ci.categories
    Index(['c', 'b', 'a'], dtype='object')
    """
    own_dtype = self.dtype
    return own_dtype.categories

821

@property
def ordered(self) -> Ordered:
    """
    Whether the categories have an ordered relationship.

    Returns the ``ordered`` flag held by :attr:`dtype`.

    Examples
    --------
    For :class:`pandas.Series`:

    >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
    >>> ser.cat.ordered
    False

    >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'a'], ordered=True)
    >>> ser = pd.Series(raw_cat)
    >>> ser.cat.ordered
    True

    For :class:`pandas.Categorical`:

    >>> cat = pd.Categorical(['a', 'b'], ordered=True)
    >>> cat.ordered
    True

    >>> cat = pd.Categorical(['a', 'b'], ordered=False)
    >>> cat.ordered
    False

    For :class:`pandas.CategoricalIndex`:

    >>> ci = pd.CategoricalIndex(['a', 'b'], ordered=True)
    >>> ci.ordered
    True

    >>> ci = pd.CategoricalIndex(['a', 'b'], ordered=False)
    >>> ci.ordered
    False
    """
    own_dtype = self.dtype
    return own_dtype.ordered

861

@property
def codes(self) -> np.ndarray:
    """
    The category codes of this categorical index.

    Codes are an array of integers which are the positions of the actual
    values in the categories array.

    There is no setter, use the other categorical methods and the normal item
    setter to change values in the categorical.

    Returns
    -------
    ndarray[int]
        A non-writable view of the ``codes`` array.

    Examples
    --------
    For :class:`pandas.Categorical`:

    >>> cat = pd.Categorical(['a', 'b'], ordered=True)
    >>> cat.codes
    array([0, 1], dtype=int8)

    For :class:`pandas.CategoricalIndex`:

    >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'])
    >>> ci.codes
    array([0, 1, 2, 0, 1, 2], dtype=int8)

    >>> ci = pd.CategoricalIndex(['a', 'c'], categories=['c', 'b', 'a'])
    >>> ci.codes
    array([2, 0], dtype=int8)
    """
    # A view is returned so the write lock applies to the returned array
    # only, not to the underlying ``_codes``.
    readonly_view = self._codes.view()
    readonly_view.setflags(write=False)
    return readonly_view

899

def _set_categories(self, categories, fastpath: bool = False) -> None:
    """
    Sets new categories inplace

    Parameters
    ----------
    categories : Index-like
        The replacement categories.
    fastpath : bool, default False
        Don't perform validation of the categories for uniqueness or nulls

    Raises
    ------
    ValueError
        If ``fastpath`` is False and the number of new categories differs
        from the number of existing categories.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'b'])
    >>> c
    ['a', 'b']
    Categories (2, object): ['a', 'b']

    >>> c._set_categories(pd.Index(['a', 'c']))
    >>> c
    ['a', 'c']
    Categories (2, object): ['a', 'c']
    """
    if fastpath:
        new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered)
    else:
        new_dtype = CategoricalDtype(categories, ordered=self.ordered)
        current = self.dtype.categories
        # The length check is only performed on the validating path.
        if current is not None and len(new_dtype.categories) != len(current):
            raise ValueError(
                "new categories need to have the same number of "
                "items as the old categories!"
            )

    super().__init__(self._ndarray, new_dtype)

936

def _set_dtype(self, dtype: CategoricalDtype) -> Self:
    """
    Internal method for directly updating the CategoricalDtype

    Returns a new instance whose codes were recoded from the current
    categories to ``dtype.categories``.

    Parameters
    ----------
    dtype : CategoricalDtype

    Notes
    -----
    We don't do any validation here. It's assumed that the dtype is
    a (valid) instance of `CategoricalDtype`.
    """
    recoded = recode_for_categories(self.codes, self.categories, dtype.categories)
    klass = type(self)
    return klass._simple_new(recoded, dtype=dtype)

952

def set_ordered(self, value: bool) -> Self:
    """
    Set the ordered attribute to the boolean value.

    The change is applied to a copy; ``self`` is left untouched.

    Parameters
    ----------
    value : bool
        Set whether this categorical is ordered (True) or not (False).

    Returns
    -------
    Categorical
        A copy of ``self`` with the requested ``ordered`` flag.
    """
    updated_dtype = CategoricalDtype(self.categories, ordered=value)
    result = self.copy()
    # Re-initialise the copy in place with the same backing array and the
    # new dtype.
    NDArrayBacked.__init__(result, result._ndarray, updated_dtype)
    return result

966

def as_ordered(self) -> Self:
    """
    Set the Categorical to be ordered.

    Shorthand for ``set_ordered(True)``.

    Returns
    -------
    Categorical
        Ordered Categorical.

    Examples
    --------
    For :class:`pandas.Series`:

    >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
    >>> ser.cat.ordered
    False
    >>> ser = ser.cat.as_ordered()
    >>> ser.cat.ordered
    True

    For :class:`pandas.CategoricalIndex`:

    >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a'])
    >>> ci.ordered
    False
    >>> ci = ci.as_ordered()
    >>> ci.ordered
    True
    """
    ordered_result = self.set_ordered(True)
    return ordered_result

997

def as_unordered(self) -> Self:
    """
    Set the Categorical to be unordered.

    Shorthand for ``set_ordered(False)``.

    Returns
    -------
    Categorical
        Unordered Categorical.

    Examples
    --------
    For :class:`pandas.Series`:

    >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'a'], ordered=True)
    >>> ser = pd.Series(raw_cat)
    >>> ser.cat.ordered
    True
    >>> ser = ser.cat.as_unordered()
    >>> ser.cat.ordered
    False

    For :class:`pandas.CategoricalIndex`:

    >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a'], ordered=True)
    >>> ci.ordered
    True
    >>> ci = ci.as_unordered()
    >>> ci.ordered
    False
    """
    unordered_result = self.set_ordered(False)
    return unordered_result

1029

def set_categories(
    self, new_categories, ordered=None, rename: bool = False
) -> Self:
    """
    Set the categories to the specified new categories.

    ``new_categories`` can include new categories (which will result in
    unused categories) or remove old categories (which results in values
    set to ``NaN``). If ``rename=True``, the categories will simply be renamed
    (fewer or more items than in the old categories will result in values
    set to ``NaN`` or in unused categories respectively).

    This method can be used to perform more than one action of adding,
    removing, and reordering simultaneously and is therefore faster than
    performing the individual steps via the more specialised methods.

    On the other hand this method does not do checks (e.g., whether the
    old categories are included in the new categories on a reorder), which
    can result in surprising changes, for example when using special string
    dtypes, which do not consider a S1 string equal to a single char
    python string.

    Parameters
    ----------
    new_categories : Index-like
        The categories in new order.
    ordered : bool, default None
        Whether or not the categorical is treated as an ordered categorical.
        If not given (``None``), the ordered information is left unchanged.
    rename : bool, default False
        Whether or not the new_categories should be considered as a rename
        of the old categories or as reordered categories.

    Returns
    -------
    Categorical
        New Categorical with the requested categories; ``self`` is not
        modified.

    Raises
    ------
    ValueError
        If new_categories does not validate as categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.

    Examples
    --------
    For :class:`pandas.Series`:

    >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'A'],
    ...                          categories=['a', 'b', 'c'], ordered=True)
    >>> ser = pd.Series(raw_cat)
    >>> ser
    0      a
    1      b
    2      c
    3    NaN
    dtype: category
    Categories (3, object): ['a' < 'b' < 'c']

    >>> ser.cat.set_categories(['A', 'B', 'C'], rename=True)
    0      A
    1      B
    2      C
    3    NaN
    dtype: category
    Categories (3, object): ['A' < 'B' < 'C']

    For :class:`pandas.CategoricalIndex`:

    >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'A'],
    ...                          categories=['a', 'b', 'c'], ordered=True)
    >>> ci
    CategoricalIndex(['a', 'b', 'c', nan], categories=['a', 'b', 'c'],
                     ordered=True, dtype='category')

    >>> ci.set_categories(['A', 'b', 'c'])
    CategoricalIndex([nan, 'b', 'c', nan], categories=['A', 'b', 'c'],
                     ordered=True, dtype='category')
    >>> ci.set_categories(['A', 'b', 'c'], rename=True)
    CategoricalIndex(['A', 'b', 'c', nan], categories=['A', 'b', 'c'],
                     ordered=True, dtype='category')
    """

    if ordered is None:
        # Keep whatever ordering flag the current dtype carries.
        ordered = self.dtype.ordered
    new_dtype = CategoricalDtype(new_categories, ordered=ordered)

    # All mutation below happens on a copy so that ``self`` stays untouched.
    cat = self.copy()
    if rename:
        if cat.dtype.categories is not None and len(new_dtype.categories) < len(
            cat.dtype.categories
        ):
            # remove all _codes which are larger and set to -1/NaN
            cat._codes[cat._codes >= len(new_dtype.categories)] = -1
        codes = cat._codes
    else:
        # Positional codes must be translated to the new category layout.
        codes = recode_for_categories(
            cat.codes, cat.categories, new_dtype.categories
        )
    NDArrayBacked.__init__(cat, codes, new_dtype)
    return cat

1135

def rename_categories(self, new_categories) -> Self:
    """
    Rename categories.

    Parameters
    ----------
    new_categories : list-like, dict-like or callable

        New categories which will replace old categories.

        * list-like: all items must be unique and the number of items in
          the new categories must match the existing number of categories.

        * dict-like: specifies a mapping from
          old categories to new. Categories not contained in the mapping
          are passed through and extra categories in the mapping are
          ignored.

        * callable : a callable that is called on all items in the old
          categories and whose return values comprise the new categories.

    Returns
    -------
    Categorical
        Categorical with renamed categories.

    Raises
    ------
    ValueError
        If new categories are list-like and do not have the same number of
        items as the current categories or do not validate as categories

    See Also
    --------
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'a', 'b'])
    >>> c.rename_categories([0, 1])
    [0, 0, 1]
    Categories (2, int64): [0, 1]

    For dict-like ``new_categories``, extra keys are ignored and
    categories not in the dictionary are passed through

    >>> c.rename_categories({'a': 'A', 'c': 'C'})
    ['A', 'A', 'b']
    Categories (2, object): ['A', 'b']

    You may also provide a callable to create the new categories

    >>> c.rename_categories(lambda x: x.upper())
    ['A', 'A', 'B']
    Categories (2, object): ['A', 'B']
    """

    current = self.categories
    # Dict-likes are checked before callables, so an object that is both
    # is treated as a mapping.
    if is_dict_like(new_categories):
        lookup = new_categories.get
        replacement = [lookup(old, old) for old in current]
    elif callable(new_categories):
        replacement = [new_categories(old) for old in current]
    else:
        replacement = new_categories

    renamed = self.copy()
    renamed._set_categories(replacement)
    return renamed

1207

def reorder_categories(self, new_categories, ordered=None) -> Self:
    """
    Reorder categories as specified in new_categories.

    ``new_categories`` must contain every old category and nothing else.

    Parameters
    ----------
    new_categories : Index-like
        The categories in new order.
    ordered : bool, optional
        Whether or not the categorical is treated as an ordered categorical.
        If not given, do not change the ordered information.

    Returns
    -------
    Categorical
        Categorical with reordered categories.

    Raises
    ------
    ValueError
        If the new categories do not contain all old category items or any
        new ones

    See Also
    --------
    rename_categories : Rename categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    For :class:`pandas.Series`:

    >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
    >>> ser = ser.cat.reorder_categories(['c', 'b', 'a'], ordered=True)
    >>> ser
    0    a
    1    b
    2    c
    3    a
    dtype: category
    Categories (3, object): ['c' < 'b' < 'a']

    >>> ser.sort_values()
    2    c
    1    b
    0    a
    3    a
    dtype: category
    Categories (3, object): ['c' < 'b' < 'a']

    For :class:`pandas.CategoricalIndex`:

    >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a'])
    >>> ci
    CategoricalIndex(['a', 'b', 'c', 'a'], categories=['a', 'b', 'c'],
                     ordered=False, dtype='category')
    >>> ci.reorder_categories(['c', 'b', 'a'], ordered=True)
    CategoricalIndex(['a', 'b', 'c', 'a'], categories=['c', 'b', 'a'],
                     ordered=True, dtype='category')
    """
    old = self.categories
    same_size = len(old) == len(new_categories)
    # The set difference is only computed when the sizes already agree.
    if not same_size or not old.difference(new_categories).empty:
        raise ValueError(
            "items in new_categories are not the same as in old categories"
        )
    return self.set_categories(new_categories, ordered=ordered)

1282

def add_categories(self, new_categories) -> Self:
    """
    Add new categories.

    `new_categories` are appended after the existing categories (the
    last/highest position) and are unused directly after this call.

    Parameters
    ----------
    new_categories : category or list-like of category
        The new categories to be included.

    Returns
    -------
    Categorical
        Categorical with new categories added.

    Raises
    ------
    ValueError
        If the new categories include old categories or do not validate as
        categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['c', 'b', 'c'])
    >>> c
    ['c', 'b', 'c']
    Categories (2, object): ['b', 'c']

    >>> c.add_categories(['d', 'a'])
    ['c', 'b', 'c']
    Categories (4, object): ['b', 'c', 'd', 'a']
    """

    # A lone scalar is treated as a one-element list.
    if not is_list_like(new_categories):
        new_categories = [new_categories]

    existing = self.dtype.categories
    already_included = set(new_categories) & set(existing)
    if already_included:
        raise ValueError(
            f"new categories must not include old categories: {already_included}"
        )

    combined = list(existing) + list(new_categories)
    if hasattr(new_categories, "dtype"):
        # Typed input: build the combined categories with the common dtype
        # of the old and the new categories.
        from pandas import Series

        common = find_common_type([existing.dtype, new_categories.dtype])
        combined = Series(combined, dtype=common)

    new_dtype = CategoricalDtype(combined, self.ordered)
    result = self.copy()
    # The codes keep their values; only their integer width may need to
    # change for the larger number of categories.
    widened = coerce_indexer_dtype(result._ndarray, new_dtype.categories)
    NDArrayBacked.__init__(result, widened, new_dtype)
    return result

1351

def remove_categories(self, removals) -> Self:
    """
    Remove the specified categories.

    `removals` must be included in the old categories. Values which were in
    the removed categories will be set to NaN

    Parameters
    ----------
    removals : category or list of categories
        The categories which should be removed.

    Returns
    -------
    Categorical
        Categorical with removed categories.

    Raises
    ------
    ValueError
        If the removals are not contained in the categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd'])
    >>> c
    ['a', 'c', 'b', 'c', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c.remove_categories(['d', 'a'])
    [NaN, 'c', 'b', 'c', NaN]
    Categories (2, object): ['b', 'c']
    """
    from pandas import Index

    if not is_list_like(removals):
        removals = [removals]

    # Work with unique, non-missing removal labels only.
    to_drop = Index(removals).unique().dropna()
    existing = self.dtype.categories

    # For an ordered categorical the surviving categories must keep their
    # relative order, so the difference is taken without sorting.
    if self.dtype.ordered is True:
        kept = existing.difference(to_drop, sort=False)
    else:
        kept = existing.difference(to_drop)

    unknown = to_drop.difference(existing)
    if len(unknown) != 0:
        raise ValueError(
            f"removals must all be in old categories: {set(unknown)}"
        )

    return self.set_categories(kept, ordered=self.ordered, rename=False)

1411

def remove_unused_categories(self) -> Self:
    """
    Remove categories which are not used.

    Returns
    -------
    Categorical
        Categorical with unused categories dropped.

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd'])
    >>> c
    ['a', 'c', 'b', 'c', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c[2] = 'a'
    >>> c[4] = 'c'
    >>> c
    ['a', 'c', 'a', 'c', 'c']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c.remove_unused_categories()
    ['a', 'c', 'a', 'c', 'c']
    Categories (2, object): ['a', 'c']
    """
    # ``used`` holds the sorted distinct codes, ``positions`` maps every
    # element to its slot in ``used``.
    used, positions = np.unique(self._codes, return_inverse=True)

    if used.size and used[0] == -1:  # na sentinel
        # Drop the sentinel and shift so missing values map back to -1.
        used = used[1:]
        positions = positions - 1

    kept_categories = self.dtype.categories.take(used)
    trimmed_dtype = CategoricalDtype._from_fastpath(
        kept_categories, ordered=self.ordered
    )
    trimmed_codes = coerce_indexer_dtype(positions, trimmed_dtype.categories)

    result = self.copy()
    NDArrayBacked.__init__(result, trimmed_codes, trimmed_dtype)
    return result

1460

1461 # ------------------------------------------------------------------

1462

1463 def map(

1464 self,

1465 mapper,

1466 na_action: Literal["ignore"] | None | lib.NoDefault = lib.no_default,

1467 ):

1468 """

1469 Map categories using an input mapping or function.

1470

1471 Maps the categories to new categories. If the mapping correspondence is

1472 one-to-one the result is a :class:`~pandas.Categorical` which has the

1473 same order property as the original, otherwise a :class:`~pandas.Index`

1474 is returned. NaN values are unaffected.

1475

1476 If a `dict` or :class:`~pandas.Series` is used any unmapped category is

1477 mapped to `NaN`. Note that if this happens an :class:`~pandas.Index`

1478 will be returned.

1479

1480 Parameters

1481 ----------

1482 mapper : function, dict, or Series

1483 Mapping correspondence.

1484 na_action : {None, 'ignore'}, default 'ignore'

1485 If 'ignore', propagate NaN values, without passing them to the

1486 mapping correspondence.

1487

1488 .. deprecated:: 2.1.0

1489

1490 The default value of 'ignore' has been deprecated and will be changed to

1491 None in the future.

1492

1493 Returns

1494 -------

1495 pandas.Categorical or pandas.Index

1496 Mapped categorical.

1497

1498 See Also

1499 --------

1500 CategoricalIndex.map : Apply a mapping correspondence on a

1501 :class:`~pandas.CategoricalIndex`.

1502 Index.map : Apply a mapping correspondence on an

1503 :class:`~pandas.Index`.

1504 Series.map : Apply a mapping correspondence on a

1505 :class:`~pandas.Series`.

1506 Series.apply : Apply more complex functions on a

1507 :class:`~pandas.Series`.

1508

1509 Examples

1510 --------

1511 >>> cat = pd.Categorical(['a', 'b', 'c'])

1512 >>> cat

1513 ['a', 'b', 'c']

1514 Categories (3, object): ['a', 'b', 'c']

1515 >>> cat.map(lambda x: x.upper(), na_action=None)

1516 ['A', 'B', 'C']

1517 Categories (3, object): ['A', 'B', 'C']

1518 >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'}, na_action=None)

1519 ['first', 'second', 'third']

1520 Categories (3, object): ['first', 'second', 'third']

1521

1522 If the mapping is one-to-one the ordering of the categories is

1523 preserved:

1524

1525 >>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True)

1526 >>> cat

1527 ['a', 'b', 'c']

1528 Categories (3, object): ['a' < 'b' < 'c']

1529 >>> cat.map({'a': 3, 'b': 2, 'c': 1}, na_action=None)

1530 [3, 2, 1]

1531 Categories (3, int64): [3 < 2 < 1]

1532

1533 If the mapping is not one-to-one an :class:`~pandas.Index` is returned:

1534

1535 >>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'}, na_action=None)

1536 Index(['first', 'second', 'first'], dtype='object')

1537

1538 If a `dict` is used, all unmapped categories are mapped to `NaN` and

1539 the result is an :class:`~pandas.Index`:

1540

1541 >>> cat.map({'a': 'first', 'b': 'second'}, na_action=None)

1542 Index(['first', 'second', nan], dtype='object')

1543 """

1544 if na_action is lib.no_default:

1545 warnings.warn(

1546 "The default value of 'ignore' for the `na_action` parameter in "

1547 "pandas.Categorical.map is deprecated and will be "

1548 "changed to 'None' in a future version. Please set na_action to the "

1549 "desired value to avoid seeing this warning",

1550 FutureWarning,

1551 stacklevel=find_stack_level(),

1552 )

1553 na_action = "ignore"

1554

1555 assert callable(mapper) or is_dict_like(mapper)

1556

1557 new_categories = self.categories.map(mapper)

1558

1559 has_nans = np.any(self._codes == -1)

1560

1561 na_val = np.nan

1562 if na_action is None and has_nans:

1563 na_val = mapper(np.nan) if callable(mapper) else mapper.get(np.nan, np.nan)

1564

1565 if new_categories.is_unique and not new_categories.hasnans and na_val is np.nan:

1566 new_dtype = CategoricalDtype(new_categories, ordered=self.ordered)

1567 return self.from_codes(self._codes.copy(), dtype=new_dtype, validate=False)

1568

1569 if has_nans:

1570 new_categories = new_categories.insert(len(new_categories), na_val)

1571

1572 return np.take(new_categories, self._codes)

1573

# Rich-comparison dunders: each one is built by passing the matching
# function from the ``operator`` module to ``_cat_compare_op``.
__eq__ = _cat_compare_op(operator.eq)
__ne__ = _cat_compare_op(operator.ne)
__lt__ = _cat_compare_op(operator.lt)
__gt__ = _cat_compare_op(operator.gt)
__le__ = _cat_compare_op(operator.le)
__ge__ = _cat_compare_op(operator.ge)

1580

1581 # -------------------------------------------------------------

1582 # Validators; ideally these can be de-duplicated

1583

def _validate_setitem_value(self, value):
    """
    Validate a value that is about to be assigned into this array.

    Hashable values are handed to ``_validate_scalar``; everything else
    goes to ``_validate_listlike``. The chosen validator's result is
    returned unchanged.
    """
    if is_hashable(value):
        return self._validate_scalar(value)
    return self._validate_listlike(value)

1590

def _validate_scalar(self, fill_value):
    """
    Translate a user-facing scalar into the value stored in the backing
    ndarray, raising TypeError when the scalar is not an existing category.

    Parameters
    ----------
    fill_value : object
        A valid NA value for the categories' dtype, or one of the
        categories.

    Returns
    -------
    fill_value : int
        ``-1`` for NA values, otherwise the result of ``_unbox_scalar``.

    Raises
    ------
    TypeError
        If ``fill_value`` is neither NA nor a known category.
    """

    categories = self.categories
    if is_valid_na_for_dtype(fill_value, categories.dtype):
        return -1
    if fill_value in categories:
        return self._unbox_scalar(fill_value)
    raise TypeError(
        "Cannot setitem on a Categorical with a new "
        f"category ({fill_value}), set the categories first"
    ) from None

1619

@classmethod
def _validate_codes_for_dtype(cls, codes, *, dtype: CategoricalDtype) -> np.ndarray:
    """
    Check that ``codes`` are usable as category codes for ``dtype``.

    Parameters
    ----------
    codes : array-like of int
        Candidate codes; ``-1`` denotes a missing value.
    dtype : CategoricalDtype
        Dtype whose number of categories bounds the codes.

    Returns
    -------
    np.ndarray
        The codes as a NumPy array.

    Raises
    ------
    ValueError
        If the codes contain NA, are not integers, or fall outside
        ``[-1, len(dtype.categories) - 1]``.
    """
    if isinstance(codes, ExtensionArray) and is_integer_dtype(codes.dtype):
        # Avoid the implicit conversion of Int to object
        if isna(codes).any():
            raise ValueError("codes cannot contain NA values")
        codes = codes.to_numpy(dtype=np.int64)
    else:
        codes = np.asarray(codes)

    # Empty input passes trivially; there is nothing to range-check.
    if len(codes):
        if codes.dtype.kind not in "iu":
            raise ValueError("codes need to be array-like integers")
        too_large = codes.max() >= len(dtype.categories)
        if too_large or codes.min() < -1:
            raise ValueError("codes need to be between -1 and len(categories)-1")
    return codes

1635

1636 # -------------------------------------------------------------

1637

@ravel_compat
def __array__(
    self, dtype: NpDtype | None = None, copy: bool | None = None
) -> np.ndarray:
    """
    The numpy array interface.

    Parameters
    ----------
    dtype : NumPy dtype-like, optional
        Requested dtype of the result. ``None`` (default) keeps the dtype
        obtained from the categories.
    copy : bool, optional
        Accepted for signature compatibility; this implementation does not
        act on it.

    Returns
    -------
    numpy.array
        A numpy array of either the specified dtype or,
        if dtype==None (default), the same dtype as
        categorical.categories.dtype.

    Examples
    --------

    >>> cat = pd.Categorical(['a', 'b'], ordered=True)

    The following calls ``cat.__array__``

    >>> np.asarray(cat)
    array(['a', 'b'], dtype=object)
    """
    ret = take_nd(self.categories._values, self._codes)
    # Compare against None explicitly: relying on the truthiness of
    # ``dtype`` would silently ignore dtype objects that evaluate as falsy
    # (an unstructured ``np.dtype`` reports a length of zero).
    if dtype is not None and np.dtype(dtype) != self.categories.dtype:
        return np.asarray(ret, dtype)
    # When we're a Categorical[ExtensionArray], like Interval,
    # we need to ensure __array__ gets all the way to an
    # ndarray.
    return np.asarray(ret)

1669

def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
    """
    Handle NumPy ufuncs applied to this Categorical.

    Dispatch is attempted in order: the array's own dunder operators, the
    ``out=`` handler, then (for ``method == "reduce"``) the reduction
    dispatcher. Anything still unhandled raises.

    Raises
    ------
    TypeError
        If none of the dispatch paths handles ``ufunc``.
    """
    # for binary ops, use our custom dunder methods
    dispatched = arraylike.maybe_dispatch_ufunc_to_dunder_op(
        self, ufunc, method, *inputs, **kwargs
    )
    if dispatched is not NotImplemented:
        return dispatched

    if "out" in kwargs:
        # e.g. test_numpy_ufuncs_out
        return arraylike.dispatch_ufunc_with_out(
            self, ufunc, method, *inputs, **kwargs
        )

    if method == "reduce":
        # e.g. TestCategoricalAnalytics::test_min_max_ordered
        reduced = arraylike.dispatch_reduction_ufunc(
            self, ufunc, method, *inputs, **kwargs
        )
        if reduced is not NotImplemented:
            return reduced

    # every remaining case is rejected for now (similar to what happens in
    # Series.__array_prepare__)
    raise TypeError(
        f"Object with dtype {self.dtype} cannot perform "
        f"the numpy op {ufunc.__name__}"
    )

1698

def __setstate__(self, state) -> None:
    """
    Restore pickled state; necessary for making this object picklable.

    Non-dict states are forwarded to the parent implementation as-is.
    Dict states are first completed in place: a missing ``_dtype`` is
    rebuilt from ``_categories`` and ``_ordered``, and a ``_codes`` entry
    is moved to ``_ndarray`` when the latter is absent.
    """
    if not isinstance(state, dict):
        return super().__setstate__(state)

    dtype_missing = "_dtype" not in state
    if dtype_missing:
        categories, ordered = state["_categories"], state["_ordered"]
        state["_dtype"] = CategoricalDtype(categories, ordered)

    needs_codes_rename = "_codes" in state and "_ndarray" not in state
    if needs_codes_rename:
        # backward compat, changed what is property vs attribute
        state["_ndarray"] = state.pop("_codes")

    super().__setstate__(state)

1712

@property
def nbytes(self) -> int:
    """
    Number of bytes taken by the codes plus the categories' values.
    """
    codes_bytes = self._codes.nbytes
    categories_bytes = self.dtype.categories.values.nbytes
    return codes_bytes + categories_bytes

1716

def memory_usage(self, deep: bool = False) -> int:
    """
    Memory usage of my values

    Parameters
    ----------
    deep : bool
        Introspect the data deeply, interrogate
        `object` dtypes for system-level memory consumption

    Returns
    -------
    bytes used

    Notes
    -----
    Memory usage does not include memory consumed by elements that
    are not components of the array if deep=False

    See Also
    --------
    numpy.ndarray.nbytes
    """
    # Codes are a plain ndarray; ``deep`` only affects the categories.
    codes_bytes = self._codes.nbytes
    categories_bytes = self.dtype.categories.memory_usage(deep=deep)
    return codes_bytes + categories_bytes

1741

def isna(self) -> npt.NDArray[np.bool_]:
    """
    Detect missing values

    An element is missing when its code is -1.

    Returns
    -------
    np.ndarray[bool] of whether my values are null

    See Also
    --------
    isna : Top-level isna.
    isnull : Alias of isna.
    Categorical.notna : Boolean inverse of Categorical.isna.

    """
    na_code = -1
    return self._codes == na_code

isnull = isna

1762

def notna(self) -> npt.NDArray[np.bool_]:
    """
    Inverse of isna

    Computed as the element-wise negation of :meth:`isna`, so every
    element that ``isna`` reports as missing is reported as False here.

    Returns
    -------
    np.ndarray[bool] of whether my values are not null

    See Also
    --------
    notna : Top-level notna.
    notnull : Alias of notna.
    Categorical.isna : Boolean inverse of Categorical.notna.

    """
    missing = self.isna()
    return ~missing

notnull = notna

1784

def value_counts(self, dropna: bool = True) -> Series:
    """
    Return a Series containing counts of each category.

    Every category will have an entry, even those with a count of 0.

    Parameters
    ----------
    dropna : bool, default True
        Don't include counts of NaN.

    Returns
    -------
    counts : Series

    See Also
    --------
    Series.value_counts
    """
    from pandas import (
        CategoricalIndex,
        Series,
    )

    codes = self._codes
    n_categories = len(self.categories)
    present = codes >= 0
    no_missing = present.all()
    index_codes = np.arange(n_categories)

    if dropna or no_missing:
        observed = codes if no_missing else codes[present]
        counts = np.bincount(observed, minlength=n_categories)
    else:
        # Missing values are counted in an extra trailing bin, labelled
        # with the NA code in the resulting index.
        counts = np.bincount(np.where(present, codes, n_categories))
        index_codes = np.append(index_codes, -1)

    index_codes = coerce_indexer_dtype(index_codes, self.dtype.categories)
    index_values = self._from_backing_data(index_codes)

    return Series(
        counts,
        index=CategoricalIndex(index_values),
        dtype="int64",
        name="count",
        copy=False,
    )

1826

# error: Argument 2 of "_empty" is incompatible with supertype
# "NDArrayBackedExtensionArray"; supertype defines the argument type as
# "ExtensionDtype"
@classmethod
def _empty(  # type: ignore[override]
    cls, shape: Shape, dtype: CategoricalDtype
) -> Self:
    """
    Analogous to np.empty(shape, dtype=dtype)

    Parameters
    ----------
    shape : tuple[int]
    dtype : CategoricalDtype
    """
    # An empty array of the right dtype is built first to learn the codes'
    # dtype and to reuse ``_from_backing_data``.
    template = cls._from_sequence([], dtype=dtype)

    # We have to use np.zeros instead of np.empty otherwise the resulting
    # ndarray may contain codes not supported by this dtype, in which
    # case repr(result) could segfault.
    zero_codes = np.zeros(shape, dtype=template._ndarray.dtype)

    return template._from_backing_data(zero_codes)

1850

def _internal_get_values(self) -> ArrayLike:
    """
    Return the values.

    For internal compatibility with pandas formatting.

    Returns
    -------
    np.ndarray or ExtensionArray
        A numpy array or ExtensionArray of the same dtype as
        categorical.categories.dtype.
    """
    categories = self.categories
    codes = self._codes

    # if we are a datetime and period index, return Index to keep metadata
    if needs_i8_conversion(categories.dtype):
        return categories.take(codes, fill_value=NaT)._values

    if is_integer_dtype(categories.dtype) and -1 in codes:
        # Integer categories are cast to object before NaN is filled in
        # for the missing entries.
        as_object = categories.astype("object")
        return as_object.take(codes, fill_value=np.nan)._values

    return np.array(self)

1873

def check_for_ordered(self, op) -> None:
    """
    Raise TypeError unless this Categorical is ordered.

    Parameters
    ----------
    op : object
        Name of the operation, interpolated into the error message.
    """
    if self.ordered:
        return
    raise TypeError(
        f"Categorical is not ordered for operation {op}\n"
        "you can use .as_ordered() to change the "
        "Categorical to an ordered one\n"
    )

1882

def argsort(
    self, *, ascending: bool = True, kind: SortKind = "quicksort", **kwargs
):
    """
    Return the indices that would sort the Categorical.

    Missing values are sorted at the end.

    Parameters
    ----------
    ascending : bool, default True
        Whether the indices should result in an ascending
        or descending sort.
    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional
        Sorting algorithm.
    **kwargs:
        passed through to :func:`numpy.argsort`.

    Returns
    -------
    np.ndarray[np.intp]

    See Also
    --------
    numpy.ndarray.argsort

    Notes
    -----
    While an ordering is applied to the category values, arg-sorting
    in this context refers more to organizing and grouping together
    based on matching category values. Thus, this function can be
    called on an unordered Categorical instance unlike the functions
    'Categorical.min' and 'Categorical.max'.

    Examples
    --------
    >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort()
    array([2, 0, 1, 3])

    >>> cat = pd.Categorical(['b', 'b', 'a', 'c'],
    ...                      categories=['c', 'b', 'a'],
    ...                      ordered=True)
    >>> cat.argsort()
    array([3, 0, 1, 2])

    Missing values are placed at the end

    >>> cat = pd.Categorical([2, None, 1])
    >>> cat.argsort()
    array([2, 0, 1])
    """
    # The override only supplies the docstring; the work is done by the
    # parent implementation.
    sorter = super().argsort(ascending=ascending, kind=kind, **kwargs)
    return sorter

1935

@overload
def sort_values(
    self,
    *,
    inplace: Literal[False] = ...,
    ascending: bool = ...,
    na_position: str = ...,
) -> Self:
    ...

@overload
def sort_values(
    self, *, inplace: Literal[True], ascending: bool = ..., na_position: str = ...
) -> None:
    ...

def sort_values(
    self,
    *,
    inplace: bool = False,
    ascending: bool = True,
    na_position: str = "last",
) -> Self | None:
    """
    Sort the Categorical by category value returning a new
    Categorical by default.

    While an ordering is applied to the category values, sorting in this
    context refers more to organizing and grouping together based on
    matching category values. Thus, this function can be called on an
    unordered Categorical instance unlike the functions 'Categorical.min'
    and 'Categorical.max'.

    Parameters
    ----------
    inplace : bool, default False
        Do operation in place.
    ascending : bool, default True
        Order ascending. Passing False orders descending. The
        ordering parameter provides the method by which the
        category values are organized.
    na_position : {'first', 'last'} (optional, default='last')
        'first' puts NaNs at the beginning
        'last' puts NaNs at the end

    Returns
    -------
    Categorical or None
        The sorted Categorical, or None when ``inplace=True``.

    Raises
    ------
    ValueError
        If ``na_position`` is neither 'first' nor 'last'.

    See Also
    --------
    Categorical.argsort
    Series.sort_values

    Examples
    --------
    >>> c = pd.Categorical([1, 2, 2, 1, 5])
    >>> c
    [1, 2, 2, 1, 5]
    Categories (3, int64): [1, 2, 5]
    >>> c.sort_values()
    [1, 1, 2, 2, 5]
    Categories (3, int64): [1, 2, 5]
    >>> c.sort_values(ascending=False)
    [5, 2, 2, 1, 1]
    Categories (3, int64): [1, 2, 5]

    'sort_values' behaviour with NaNs. Note that 'na_position'
    is independent of the 'ascending' parameter:

    >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5])
    >>> c
    [NaN, 2, 2, NaN, 5]
    Categories (2, int64): [2, 5]
    >>> c.sort_values()
    [2, 2, 5, NaN, NaN]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(ascending=False)
    [5, 2, 2, NaN, NaN]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(na_position='first')
    [NaN, NaN, 2, 2, 5]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(ascending=False, na_position='first')
    [NaN, NaN, 5, 2, 2]
    Categories (2, int64): [2, 5]
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    if na_position not in ("last", "first"):
        raise ValueError(f"invalid na_position: {na_position!r}")

    order = nargsort(self, ascending=ascending, na_position=na_position)

    if inplace:
        # Overwrite the existing codes buffer rather than rebinding it.
        self._codes[:] = self._codes[order]
        return None

    reordered_codes = self._codes[order]
    return self._from_backing_data(reordered_codes)

2036

def _rank(
    self,
    *,
    axis: AxisInt = 0,
    method: str = "average",
    na_option: str = "keep",
    ascending: bool = True,
    pct: bool = False,
):
    """
    See Series.rank.__doc__.

    Only ``axis=0`` is supported; any other axis raises
    NotImplementedError.
    """
    if axis != 0:
        raise NotImplementedError

    # Ranking runs on the values prepared by ``_values_for_rank`` rather
    # than on the Categorical itself.
    rank_input = self._values_for_rank()
    rank_options = {
        "axis": axis,
        "method": method,
        "na_option": na_option,
        "ascending": ascending,
        "pct": pct,
    }
    return algorithms.rank(rank_input, **rank_options)

2060

def _values_for_rank(self) -> np.ndarray:
    """
    For correctly ranking ordered categorical data. See GH#15420

    Ordered categorical data should be ranked on the basis of
    codes with -1 translated to NaN.

    Returns
    -------
    numpy.array

    """
    if self.ordered:
        codes = self.codes
        missing = codes == -1
        if not missing.any():
            return codes
        # Integer codes cannot hold NaN, so switch to float first.
        as_float = codes.astype("float64")
        as_float[missing] = np.nan
        return as_float

    if is_any_real_numeric_dtype(self.categories.dtype):
        return np.array(self)

    from pandas import Series

    # reorder the categories (so rank can use the float codes)
    # instead of passing an object array to rank
    category_ranks = Series(self.categories, copy=False).rank().values
    return np.array(self.rename_categories(category_ranks))

2092

2093 def _hash_pandas_object(

2094 self, *, encoding: str, hash_key: str, categorize: bool

2095 ) -> npt.NDArray[np.uint64]:

2096 """

2097 Hash a Categorical by hashing its categories, and then mapping the codes

2098 to the hashes.

2099

2100 Parameters

2101 ----------

2102 encoding : str

2103 hash_key : str

2104 categorize : bool

2105 Ignored for Categorical.

2106

2107 Returns

2108 -------

2109 np.ndarray[uint64]

2110 """

2111 # Note we ignore categorize, as we are already Categorical.

2112 from pandas.core.util.hashing import hash_array

2113

2114 # Convert ExtensionArrays to ndarrays

2115 values = np.asarray(self.categories._values)

2116 hashed = hash_array(values, encoding, hash_key, categorize=False)

2117

2118 # we have uint64, as we don't directly support missing values

2119 # we don't want to use take_nd which will coerce to float

2120 # instead, directly construct the result with a

2121 # max(np.uint64) as the missing value indicator

2122 #

2123 # TODO: GH#15362

2124

2125 mask = self.isna()

2126 if len(hashed):

2127 result = hashed.take(self._codes)

2128 else:

2129 result = np.zeros(len(mask), dtype="uint64")

2130

2131 if mask.any():

2132 result[mask] = lib.u8max

2133

2134 return result

2135

2136 # ------------------------------------------------------------------

2137 # NDArrayBackedExtensionArray compat

2138

2139 @property

2140 def _codes(self) -> np.ndarray:

2141 return self._ndarray

2142

2143 def _box_func(self, i: int):

2144 if i == -1:

2145 return np.nan

2146 return self.categories[i]

2147

2148 def _unbox_scalar(self, key) -> int:

2149 # searchsorted is very performance sensitive. By converting codes

2150 # to same dtype as self.codes, we get much faster performance.

2151 code = self.categories.get_loc(key)

2152 code = self._ndarray.dtype.type(code)

2153 return code

2154

2155 # ------------------------------------------------------------------

2156

2157 def __iter__(self) -> Iterator:

2158 """

2159 Returns an Iterator over the values of this Categorical.

2160 """

2161 if self.ndim == 1:

2162 return iter(self._internal_get_values().tolist())

2163 else:

2164 return (self[n] for n in range(len(self)))

2165

2166 def __contains__(self, key) -> bool:

2167 """

2168 Returns True if `key` is in this Categorical.

2169 """

2170 # if key is a NaN, check if any NaN is in self.

2171 if is_valid_na_for_dtype(key, self.categories.dtype):

2172 return bool(self.isna().any())

2173

2174 return contains(self, key, container=self._codes)

2175

2176 # ------------------------------------------------------------------

2177 # Rendering Methods

2178

2179 def _formatter(self, boxed: bool = False):

2180 # Returning None here will cause format_array to do inference.

2181 return None

2182

2183 def _repr_categories(self) -> list[str]:

2184 """

2185 return the base repr for the categories

2186 """

2187 max_categories = (

2188 10

2189 if get_option("display.max_categories") == 0

2190 else get_option("display.max_categories")

2191 )

2192 from pandas.io.formats import format as fmt

2193

2194 format_array = partial(

2195 fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC

2196 )

2197 if len(self.categories) > max_categories:

2198 num = max_categories // 2

2199 head = format_array(self.categories[:num]._values)

2200 tail = format_array(self.categories[-num:]._values)

2201 category_strs = head + ["..."] + tail

2202 else:

2203 category_strs = format_array(self.categories._values)

2204

2205 # Strip all leading spaces, which format_array adds for columns...

2206 category_strs = [x.strip() for x in category_strs]

2207 return category_strs

2208

2209 def _get_repr_footer(self) -> str:

2210 """

2211 Returns a string representation of the footer.

2212 """

2213 category_strs = self._repr_categories()

2214 dtype = str(self.categories.dtype)

2215 levheader = f"Categories ({len(self.categories)}, {dtype}): "

2216 width, _ = get_terminal_size()

2217 max_width = get_option("display.width") or width

2218 if console.in_ipython_frontend():

2219 # 0 = no breaks

2220 max_width = 0

2221 levstring = ""

2222 start = True

2223 cur_col_len = len(levheader) # header

2224 sep_len, sep = (3, " < ") if self.ordered else (2, ", ")

2225 linesep = f"{sep.rstrip()}\n" # remove whitespace

2226 for val in category_strs:

2227 if max_width != 0 and cur_col_len + sep_len + len(val) > max_width:

2228 levstring += linesep + (" " * (len(levheader) + 1))

2229 cur_col_len = len(levheader) + 1 # header + a whitespace

2230 elif not start:

2231 levstring += sep

2232 cur_col_len += len(val)

2233 levstring += val

2234 start = False

2235 # replace to simple save space by

2236 return f"{levheader}[{levstring.replace(' < ... < ', ' ... ')}]"

2237

2238 def _get_values_repr(self) -> str:

2239 from pandas.io.formats import format as fmt

2240

2241 assert len(self) > 0

2242

2243 vals = self._internal_get_values()

2244 fmt_values = fmt.format_array(

2245 vals,

2246 None,

2247 float_format=None,

2248 na_rep="NaN",

2249 quoting=QUOTE_NONNUMERIC,

2250 )

2251

2252 fmt_values = [i.strip() for i in fmt_values]

2253 joined = ", ".join(fmt_values)

2254 result = "[" + joined + "]"

2255 return result

2256

2257 def __repr__(self) -> str:

2258 """

2259 String representation.

2260 """

2261 footer = self._get_repr_footer()

2262 length = len(self)

2263 max_len = 10

2264 if length > max_len:

2265 # In long cases we do not display all entries, so we add Length

2266 # information to the __repr__.

2267 num = max_len // 2

2268 head = self[:num]._get_values_repr()

2269 tail = self[-(max_len - num) :]._get_values_repr()

2270 body = f"{head[:-1]}, ..., {tail[1:]}"

2271 length_info = f"Length: {len(self)}"

2272 result = f"{body}\n{length_info}\n{footer}"

2273 elif length > 0:

2274 body = self._get_values_repr()

2275 result = f"{body}\n{footer}"

2276 else:

2277 # In the empty case we use a comma instead of newline to get

2278 # a more compact __repr__

2279 body = "[]"

2280 result = f"{body}, {footer}"

2281

2282 return result

2283

2284 # ------------------------------------------------------------------

2285

2286 def _validate_listlike(self, value):

2287 # NB: here we assume scalar-like tuples have already been excluded

2288 value = extract_array(value, extract_numpy=True)

2289

2290 # require identical categories set

2291 if isinstance(value, Categorical):

2292 if self.dtype != value.dtype:

2293 raise TypeError(

2294 "Cannot set a Categorical with another, "

2295 "without identical categories"

2296 )

2297 # dtype equality implies categories_match_up_to_permutation

2298 value = self._encode_with_my_categories(value)

2299 return value._codes

2300

2301 from pandas import Index

2302

2303 # tupleize_cols=False for e.g. test_fillna_iterable_category GH#41914

2304 to_add = Index._with_infer(value, tupleize_cols=False).difference(

2305 self.categories

2306 )

2307

2308 # no assignments of values not in categories, but it's always ok to set

2309 # something to np.nan

2310 if len(to_add) and not isna(to_add).all():

2311 raise TypeError(

2312 "Cannot setitem on a Categorical with a new "

2313 "category, set the categories first"

2314 )

2315

2316 codes = self.categories.get_indexer(value)

2317 return codes.astype(self._ndarray.dtype, copy=False)

2318

2319 def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]:

2320 """

2321 Compute the inverse of a categorical, returning

2322 a dict of categories -> indexers.

2323

2324 *This is an internal function*

2325

2326 Returns

2327 -------

2328 Dict[Hashable, np.ndarray[np.intp]]

2329 dict of categories -> indexers

2330

2331 Examples

2332 --------

2333 >>> c = pd.Categorical(list('aabca'))

2334 >>> c

2335 ['a', 'a', 'b', 'c', 'a']

2336 Categories (3, object): ['a', 'b', 'c']

2337 >>> c.categories

2338 Index(['a', 'b', 'c'], dtype='object')

2339 >>> c.codes

2340 array([0, 0, 1, 2, 0], dtype=int8)

2341 >>> c._reverse_indexer()

2342 {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])}

2343

2344 """

2345 categories = self.categories

2346 r, counts = libalgos.groupsort_indexer(

2347 ensure_platform_int(self.codes), categories.size

2348 )

2349 counts = ensure_int64(counts).cumsum()

2350 _result = (r[start:end] for start, end in zip(counts, counts[1:]))

2351 return dict(zip(categories, _result))

2352

2353 # ------------------------------------------------------------------

2354 # Reductions

2355

def _reduce(
    self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
):
    """
    Delegate the reduction ``name`` to the parent class.

    When ``keepdims`` is true the parent's result is re-wrapped as an
    array of this type and dtype, except for ``"argmax"``/``"argmin"``,
    whose results are returned as they are.
    """
    result = super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
    # don't wrap argmax/argmin results in Categorical!
    if keepdims and name not in ["argmax", "argmin"]:
        return type(self)(result, dtype=self.dtype)
    return result

2367

def min(self, *, skipna: bool = True, **kwargs):
    """
    The minimum value of the object.

    Only ordered `Categoricals` have a minimum!

    Raises
    ------
    TypeError
        If the `Categorical` is not `ordered`.

    Returns
    -------
    min : the minimum of this `Categorical`, NA value if empty
    """
    nv.validate_minmax_axis(kwargs.get("axis", 0))
    nv.validate_min((), kwargs)
    self.check_for_ordered("min")

    codes = self._codes
    if len(codes) == 0:
        return self.dtype.na_value

    present = codes != -1
    if present.all():
        pointer = codes.min()
    elif skipna and present.any():
        # Ignore missing entries and reduce over what remains.
        pointer = codes[present].min()
    else:
        # Either missing values must propagate or nothing is present.
        return np.nan
    return self._wrap_reduction_result(None, pointer)

2399

def max(self, *, skipna: bool = True, **kwargs):
    """
    The maximum value of the object.

    Only ordered `Categoricals` have a maximum!

    Raises
    ------
    TypeError
        If the `Categorical` is not `ordered`.

    Returns
    -------
    max : the maximum of this `Categorical`, NA if array is empty
    """
    nv.validate_minmax_axis(kwargs.get("axis", 0))
    nv.validate_max((), kwargs)
    self.check_for_ordered("max")

    codes = self._codes
    if len(codes) == 0:
        return self.dtype.na_value

    present = codes != -1
    if present.all():
        pointer = codes.max()
    elif skipna and present.any():
        # Ignore missing entries and reduce over what remains.
        pointer = codes[present].max()
    else:
        # Either missing values must propagate or nothing is present.
        return np.nan
    return self._wrap_reduction_result(None, pointer)

2431

2432 def _mode(self, dropna: bool = True) -> Categorical:

2433 codes = self._codes

2434 mask = None

2435 if dropna:

2436 mask = self.isna()

2437

2438 res_codes = algorithms.mode(codes, mask=mask)

2439 res_codes = cast(np.ndarray, res_codes)

2440 assert res_codes.dtype == codes.dtype

2441 res = self._from_backing_data(res_codes)

2442 return res

2443

2444 # ------------------------------------------------------------------

2445 # ExtensionArray Interface

2446

def unique(self) -> Self:
    """
    Return a ``Categorical`` holding each distinct value once.

    .. versionchanged:: 1.3.0

        Previously, unused categories were dropped from the new categories.

    Returns
    -------
    Categorical

    See Also
    --------
    pandas.unique
    CategoricalIndex.unique
    Series.unique : Return unique values of Series object.

    Examples
    --------
    >>> pd.Categorical(list("baabc")).unique()
    ['b', 'a', 'c']
    Categories (3, object): ['a', 'b', 'c']
    >>> pd.Categorical(list("baab"), categories=list("abc"), ordered=True).unique()
    ['b', 'a']
    Categories (3, object): ['a' < 'b' < 'c']
    """
    # The override exists only to carry the docstring above.
    # pylint: disable=useless-parent-delegation
    distinct = super().unique()
    return distinct

2477

2478 def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:

2479 # make sure we have correct itemsize for resulting codes

2480 assert res_values.dtype == self._ndarray.dtype

2481 return res_values

2482

def equals(self, other: object) -> bool:
    """
    Returns True if categorical arrays are equal.

    Parameters
    ----------
    other : `Categorical`

    Returns
    -------
    bool
    """
    comparable = isinstance(
        other, Categorical
    ) and self._categories_match_up_to_permutation(other)
    if not comparable:
        return False

    # Put both sets of codes on this array's categories before comparing.
    recoded = self._encode_with_my_categories(other)
    return np.array_equal(self._codes, recoded._codes)

2501

2502 @classmethod

2503 def _concat_same_type(cls, to_concat: Sequence[Self], axis: AxisInt = 0) -> Self:

2504 from pandas.core.dtypes.concat import union_categoricals

2505

2506 first = to_concat[0]

2507 if axis >= first.ndim:

2508 raise ValueError(

2509 f"axis {axis} is out of bounds for array of dimension {first.ndim}"

2510 )

2511

2512 if axis == 1:

2513 # Flatten, concatenate then reshape

2514 if not all(x.ndim == 2 for x in to_concat):

2515 raise ValueError

2516

2517 # pass correctly-shaped to union_categoricals

2518 tc_flat = []

2519 for obj in to_concat:

2520 tc_flat.extend([obj[:, i] for i in range(obj.shape[1])])

2521

2522 res_flat = cls._concat_same_type(tc_flat, axis=0)

2523

2524 result = res_flat.reshape(len(first), -1, order="F")

2525 return result

2526

2527 result = union_categoricals(to_concat)

2528 return result

2529

2530 # ------------------------------------------------------------------

2531

2532 def _encode_with_my_categories(self, other: Categorical) -> Categorical:

2533 """

2534 Re-encode another categorical using this Categorical's categories.

2535

2536 Notes

2537 -----

2538 This assumes we have already checked

2539 self._categories_match_up_to_permutation(other).

2540 """

2541 # Indexing on codes is more efficient if categories are the same,

2542 # so we can apply some optimizations based on the degree of

2543 # dtype-matching.

2544 codes = recode_for_categories(

2545 other.codes, other.categories, self.categories, copy=False

2546 )

2547 return self._from_backing_data(codes)

2548

2549 def _categories_match_up_to_permutation(self, other: Categorical) -> bool:

2550 """

2551 Returns True if categoricals are the same dtype

2552 same categories, and same ordered

2553

2554 Parameters

2555 ----------

2556 other : Categorical

2557

2558 Returns

2559 -------

2560 bool

2561 """

2562 return hash(self.dtype) == hash(other.dtype)

2563

def describe(self) -> DataFrame:
    """
    Describes this Categorical

    Returns
    -------
    description: `DataFrame`
        A dataframe with frequency and counts by category.
    """
    from pandas import Index
    from pandas.core.reshape.concat import concat

    # Missing values get their own row (dropna=False).
    tallies = self.value_counts(dropna=False)
    shares = tallies / tallies.sum()

    summary = concat([tallies, shares], axis=1)
    summary.columns = Index(["counts", "freqs"])
    summary.index.name = "categories"
    return summary

2584

def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
    """
    Check whether `values` are contained in Categorical.

    Return a boolean NumPy Array showing whether each element in
    the Categorical matches an element in the passed sequence of
    `values` exactly.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
        The sequence of values to test. Passing in a single string will
        raise a ``TypeError``. Instead, turn a single string into a
        list of one element.

    Returns
    -------
    np.ndarray[bool]

    Raises
    ------
    TypeError
      * If `values` is not a set or list-like

    See Also
    --------
    pandas.Series.isin : Equivalent method on Series.

    Examples
    --------
    >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama',
    ...                     'hippo'])
    >>> s.isin(['cow', 'lama'])
    array([ True,  True,  True, False,  True, False])

    Passing a single string as ``s.isin('lama')`` will raise an error. Use
    a list of one element instead:

    >>> s.isin(['lama'])
    array([ True, False,  True, False,  True, False])
    """
    is_null = np.asarray(isna(values))
    lookup = self.categories.get_indexer_for(values)
    # Keep codes of values that are actual categories, plus the -1 entries
    # that come from missing values; drop -1 entries for unknown values.
    keep = is_null | (lookup >= 0)
    return algorithms.isin(self.codes, lookup[keep])

2630

2631 def _replace(self, *, to_replace, value, inplace: bool = False):

2632 from pandas import Index

2633

2634 orig_dtype = self.dtype

2635

2636 inplace = validate_bool_kwarg(inplace, "inplace")

2637 cat = self if inplace else self.copy()

2638

2639 mask = isna(np.asarray(value))

2640 if mask.any():

2641 removals = np.asarray(to_replace)[mask]

2642 removals = cat.categories[cat.categories.isin(removals)]

2643 new_cat = cat.remove_categories(removals)

2644 NDArrayBacked.__init__(cat, new_cat.codes, new_cat.dtype)

2645

2646 ser = cat.categories.to_series()

2647 ser = ser.replace(to_replace=to_replace, value=value)

2648

2649 all_values = Index(ser)

2650

2651 # GH51016: maintain order of existing categories

2652 idxr = cat.categories.get_indexer_for(all_values)

2653 locs = np.arange(len(ser))

2654 locs = np.where(idxr == -1, locs, idxr)

2655 locs = locs.argsort()

2656

2657 new_categories = ser.take(locs)

2658 new_categories = new_categories.drop_duplicates(keep="first")

2659 new_categories = Index(new_categories)

2660 new_codes = recode_for_categories(

2661 cat._codes, all_values, new_categories, copy=False

2662 )

2663 new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered)

2664 NDArrayBacked.__init__(cat, new_codes, new_dtype)

2665

2666 if new_dtype != orig_dtype:

2667 warnings.warn(

2668 # GH#55147

2669 "The behavior of Series.replace (and DataFrame.replace) with "

2670 "CategoricalDtype is deprecated. In a future version, replace "

2671 "will only be used for cases that preserve the categories. "

2672 "To change the categories, use ser.cat.rename_categories "

2673 "instead.",

2674 FutureWarning,

2675 stacklevel=find_stack_level(),

2676 )

2677 if not inplace:

2678 return cat

2679

2680 # ------------------------------------------------------------------------

2681 # String methods interface

2682 def _str_map(

2683 self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True

2684 ):

2685 # Optimization to apply the callable `f` to the categories once

2686 # and rebuild the result by `take`ing from the result with the codes.

2687 # Returns the same type as the object-dtype implementation though.

2688 from pandas.core.arrays import NumpyExtensionArray

2689

2690 categories = self.categories

2691 codes = self.codes

2692 result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype)

2693 return take_nd(result, codes, fill_value=na_value)

2694

2695 def _str_get_dummies(self, sep: str = "|"):

2696 # sep may not be in categories. Just bail on this.

2697 from pandas.core.arrays import NumpyExtensionArray

2698

2699 return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep)

2700

2701 # ------------------------------------------------------------------------

2702 # GroupBy Methods

2703

2704 def _groupby_op(

2705 self,

2706 *,

2707 how: str,

2708 has_dropped_na: bool,

2709 min_count: int,

2710 ngroups: int,

2711 ids: npt.NDArray[np.intp],

2712 **kwargs,

2713 ):

2714 from pandas.core.groupby.ops import WrappedCythonOp

2715

2716 kind = WrappedCythonOp.get_kind_from_how(how)

2717 op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na)

2718

2719 dtype = self.dtype

2720 if how in ["sum", "prod", "cumsum", "cumprod", "skew"]:

2721 raise TypeError(f"{dtype} type does not support {how} operations")

2722 if how in ["min", "max", "rank", "idxmin", "idxmax"] and not dtype.ordered:

2723 # raise TypeError instead of NotImplementedError to ensure we

2724 # don't go down a group-by-group path, since in the empty-groups

2725 # case that would fail to raise

2726 raise TypeError(f"Cannot perform {how} with non-ordered Categorical")

2727 if how not in [

2728 "rank",

2729 "any",

2730 "all",

2731 "first",

2732 "last",

2733 "min",

2734 "max",

2735 "idxmin",

2736 "idxmax",

2737 ]:

2738 if kind == "transform":

2739 raise TypeError(f"{dtype} type does not support {how} operations")

2740 raise TypeError(f"{dtype} dtype does not support aggregation '{how}'")

2741

2742 result_mask = None

2743 mask = self.isna()

2744 if how == "rank":

2745 assert self.ordered # checked earlier

2746 npvalues = self._ndarray

2747 elif how in ["first", "last", "min", "max", "idxmin", "idxmax"]:

2748 npvalues = self._ndarray

2749 result_mask = np.zeros(ngroups, dtype=bool)

2750 else:

2751 # any/all

2752 npvalues = self.astype(bool)

2753

2754 res_values = op._cython_op_ndim_compat(

2755 npvalues,

2756 min_count=min_count,

2757 ngroups=ngroups,

2758 comp_ids=ids,

2759 mask=mask,

2760 result_mask=result_mask,

2761 **kwargs,

2762 )

2763

2764 if how in op.cast_blocklist:

2765 return res_values

2766 elif how in ["first", "last", "min", "max"]:

2767 res_values[result_mask == 1] = -1

2768 return self._from_backing_data(res_values)

2769

2770

2771# The Series.cat accessor

2772

2773

@delegate_names(
    delegate=Categorical, accessors=["categories", "ordered"], typ="property"
)
@delegate_names(
    delegate=Categorical,
    accessors=[
        "rename_categories",
        "reorder_categories",
        "add_categories",
        "remove_categories",
        "remove_unused_categories",
        "set_categories",
        "as_ordered",
        "as_unordered",
    ],
    typ="method",
)
class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
    """
    Accessor object for categorical properties of the Series values.

    Parameters
    ----------
    data : Series or CategoricalIndex

    Examples
    --------
    >>> s = pd.Series(list("abbccc")).astype("category")
    >>> s
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.categories
    Index(['a', 'b', 'c'], dtype='object')

    >>> s.cat.rename_categories(list("cba"))
    0    c
    1    b
    2    b
    3    a
    4    a
    5    a
    dtype: category
    Categories (3, object): ['c', 'b', 'a']

    >>> s.cat.reorder_categories(list("cba"))
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['c', 'b', 'a']

    >>> s.cat.add_categories(["d", "e"])
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (5, object): ['a', 'b', 'c', 'd', 'e']

    >>> s.cat.remove_categories(["a", "c"])
    0    NaN
    1      b
    2      b
    3    NaN
    4    NaN
    5    NaN
    dtype: category
    Categories (1, object): ['b']

    >>> s1 = s.cat.add_categories(["d", "e"])
    >>> s1.cat.remove_unused_categories()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.set_categories(list("abcde"))
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (5, object): ['a', 'b', 'c', 'd', 'e']

    >>> s.cat.as_ordered()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a' < 'b' < 'c']

    >>> s.cat.as_unordered()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']
    """

    def __init__(self, data) -> None:
        # Validate first so a non-categorical input fails before any state
        # is stored; freeze last so no attributes can be added afterwards.
        self._validate(data)
        self._parent = data.values
        self._index = data.index
        self._name = data.name
        self._freeze()

    @staticmethod
    def _validate(data):
        """Raise AttributeError unless ``data`` has a categorical dtype."""
        if isinstance(data.dtype, CategoricalDtype):
            return
        raise AttributeError("Can only use .cat accessor with a 'category' dtype")

    def _delegate_property_get(self, name: str):
        """Read the delegated property ``name`` from the wrapped values."""
        return getattr(self._parent, name)

    # error: Signature of "_delegate_property_set" incompatible with supertype
    # "PandasDelegate"
    def _delegate_property_set(self, name: str, new_values):  # type: ignore[override]
        """Write the delegated property ``name`` on the wrapped values."""
        return setattr(self._parent, name, new_values)

    @property
    def codes(self) -> Series:
        """
        Return Series of codes as well as the index.

        Examples
        --------
        >>> raw_cate = pd.Categorical(["a", "b", "c", "a"], categories=["a", "b"])
        >>> ser = pd.Series(raw_cate)
        >>> ser.cat.codes
        0   0
        1   1
        2  -1
        3   0
        dtype: int8
        """
        from pandas import Series

        return Series(self._parent.codes, index=self._index)

    def _delegate_method(self, name: str, *args, **kwargs):
        """
        Call the delegated method ``name`` on the wrapped values and, when
        it returns something, wrap that in a Series carrying the original
        index and name.
        """
        from pandas import Series

        outcome = getattr(self._parent, name)(*args, **kwargs)
        if outcome is None:
            return None
        return Series(outcome, index=self._index, name=self._name)

2945

2946# utility routines

2947

2948

2949def _get_codes_for_values(

2950 values: Index | Series | ExtensionArray | np.ndarray,

2951 categories: Index,

2952) -> np.ndarray:

2953 """

2954 utility routine to turn values into codes given the specified categories

2955

2956 If `values` is known to be a Categorical, use recode_for_categories instead.

2957 """

2958 codes = categories.get_indexer_for(values)

2959 return coerce_indexer_dtype(codes, categories)

2960

2961

def recode_for_categories(
    codes: np.ndarray, old_categories, new_categories, copy: bool = True
) -> np.ndarray:
    """
    Convert a set of codes to a new set of categories

    Parameters
    ----------
    codes : np.ndarray
    old_categories, new_categories : Index
    copy: bool, default True
        Whether to copy if the codes are unchanged.

    Returns
    -------
    new_codes : np.ndarray
        Codes relative to ``new_categories``; -1 marks entries whose old
        category is not among the new categories.

    Examples
    --------
    >>> old_cat = pd.Index(['b', 'a', 'c'])
    >>> new_cat = pd.Index(['a', 'b'])
    >>> codes = np.array([0, 1, 1, 2])
    >>> recode_for_categories(codes, old_cat, new_cat)
    array([ 1,  0,  0, -1], dtype=int8)
    """
    # Nothing to recode when there were no categories (all null anyway, so
    # just retain the nulls) or when the categories are the same.
    if len(old_categories) == 0 or new_categories.equals(old_categories):
        return codes.copy() if copy else codes

    # Position of every old category inside the new categories.
    old_to_new = coerce_indexer_dtype(
        new_categories.get_indexer_for(old_categories), new_categories
    )
    return take_nd(old_to_new, codes, fill_value=-1)

3003

3004

3005def factorize_from_iterable(values) -> tuple[np.ndarray, Index]:

3006 """

3007 Factorize an input `values` into `categories` and `codes`. Preserves

3008 categorical dtype in `categories`.

3009

3010 Parameters

3011 ----------

3012 values : list-like

3013

3014 Returns

3015 -------

3016 codes : ndarray

3017 categories : Index

3018 If `values` has a categorical dtype, then `categories` is

3019 a CategoricalIndex keeping the categories and order of `values`.

3020 """

3021 from pandas import CategoricalIndex

3022

3023 if not is_list_like(values):

3024 raise TypeError("Input must be list-like")

3025

3026 categories: Index

3027

3028 vdtype = getattr(values, "dtype", None)

3029 if isinstance(vdtype, CategoricalDtype):

3030 values = extract_array(values)

3031 # The Categorical we want to build has the same categories

3032 # as values but its codes are by def [0, ..., len(n_categories) - 1]

3033 cat_codes = np.arange(len(values.categories), dtype=values.codes.dtype)

3034 cat = Categorical.from_codes(cat_codes, dtype=values.dtype, validate=False)

3035

3036 categories = CategoricalIndex(cat)

3037 codes = values.codes

3038 else:

3039 # The value of ordered is irrelevant since we don't use cat as such,

3040 # but only the resulting categories, the order of which is independent

3041 # from ordered. Set ordered to False as default. See GH #15457

3042 cat = Categorical(values, ordered=False)

3043 categories = cat.categories

3044 codes = cat.codes

3045 return codes, categories

3046

3047

def factorize_from_iterables(iterables) -> tuple[list[np.ndarray], list[Index]]:
    """
    A higher-level wrapper over `factorize_from_iterable`.

    Parameters
    ----------
    iterables : list-like of list-likes

    Returns
    -------
    codes : list of ndarrays
    categories : list of Indexes

    Notes
    -----
    See `factorize_from_iterable` for more info.
    """
    if len(iterables) == 0:
        # For consistency, it should return two empty lists.
        return [], []

    # Factorize each input, then split the (codes, categories) pairs into
    # two parallel lists.
    pairs = [factorize_from_iterable(item) for item in iterables]
    all_codes = [codes for codes, _ in pairs]
    all_categories = [categories for _, categories in pairs]
    return all_codes, all_categories