1from __future__ import annotations
2
3from csv import QUOTE_NONNUMERIC
4from functools import partial
5import operator
6from shutil import get_terminal_size
7from typing import (
8 TYPE_CHECKING,
9 Literal,
10 cast,
11 overload,
12)
13import warnings
14
15import numpy as np
16
17from pandas._config import get_option
18
19from pandas._libs import (
20 NaT,
21 algos as libalgos,
22 lib,
23)
24from pandas._libs.arrays import NDArrayBacked
25from pandas.compat.numpy import function as nv
26from pandas.util._exceptions import find_stack_level
27from pandas.util._validators import validate_bool_kwarg
28
29from pandas.core.dtypes.cast import (
30 coerce_indexer_dtype,
31 find_common_type,
32)
33from pandas.core.dtypes.common import (
34 ensure_int64,
35 ensure_platform_int,
36 is_any_real_numeric_dtype,
37 is_bool_dtype,
38 is_dict_like,
39 is_hashable,
40 is_integer_dtype,
41 is_list_like,
42 is_scalar,
43 needs_i8_conversion,
44 pandas_dtype,
45)
46from pandas.core.dtypes.dtypes import (
47 ArrowDtype,
48 CategoricalDtype,
49 CategoricalDtypeType,
50 ExtensionDtype,
51)
52from pandas.core.dtypes.generic import (
53 ABCIndex,
54 ABCSeries,
55)
56from pandas.core.dtypes.missing import (
57 is_valid_na_for_dtype,
58 isna,
59)
60
61from pandas.core import (
62 algorithms,
63 arraylike,
64 ops,
65)
66from pandas.core.accessor import (
67 PandasDelegate,
68 delegate_names,
69)
70from pandas.core.algorithms import (
71 factorize,
72 take_nd,
73)
74from pandas.core.arrays._mixins import (
75 NDArrayBackedExtensionArray,
76 ravel_compat,
77)
78from pandas.core.base import (
79 ExtensionArray,
80 NoNewAttributesMixin,
81 PandasObject,
82)
83import pandas.core.common as com
84from pandas.core.construction import (
85 extract_array,
86 sanitize_array,
87)
88from pandas.core.ops.common import unpack_zerodim_and_defer
89from pandas.core.sorting import nargsort
90from pandas.core.strings.object_array import ObjectStringArrayMixin
91
92from pandas.io.formats import console
93
94if TYPE_CHECKING:
95 from collections.abc import (
96 Hashable,
97 Iterator,
98 Sequence,
99 )
100
101 from pandas._typing import (
102 ArrayLike,
103 AstypeArg,
104 AxisInt,
105 Dtype,
106 DtypeObj,
107 NpDtype,
108 Ordered,
109 Self,
110 Shape,
111 SortKind,
112 npt,
113 )
114
115 from pandas import (
116 DataFrame,
117 Index,
118 Series,
119 )
120
121
def _cat_compare_op(op):
    """
    Build the comparison dunder method for ``Categorical`` from ``op``.

    Parameters
    ----------
    op : callable
        A comparison function such as ``operator.eq`` or ``operator.lt``.
        Its ``__name__`` determines the name of the generated method.

    Returns
    -------
    function
        ``func(self, other)``, renamed to ``__<op name>__`` and wrapped with
        ``unpack_zerodim_and_defer``.
    """
    opname = f"__{op.__name__}__"
    # Value written at positions where a code is -1 (missing): True only
    # for ``!=``, False for every other operator.
    fill_value = op is operator.ne

    # These depend only on the operator, so evaluate them once up front.
    is_ordering_op = opname in ["__lt__", "__gt__", "__le__", "__ge__"]
    is_equality_op = opname in ["__eq__", "__ne__"]
    needs_na_fill = opname not in {"__eq__", "__ge__", "__gt__"}

    @unpack_zerodim_and_defer(opname)
    def func(self, other):
        hashable = is_hashable(other)
        if is_list_like(other) and len(other) != len(self) and not hashable:
            # in hashable case we may have a tuple that is itself a category
            raise ValueError("Lengths must match.")

        if not self.ordered and is_ordering_op:
            raise TypeError("Unordered Categoricals can only compare equality or not")

        if isinstance(other, Categorical):
            # Two Categoricals can only be compared if the categories are
            # the same (maybe up to ordering, depending on ordered)
            msg = "Categoricals can only be compared if 'categories' are the same."
            if not self._categories_match_up_to_permutation(other):
                raise TypeError(msg)

            if self.ordered or self.categories.equals(other.categories):
                other_codes = other._codes
            else:
                # both unordered and different order: express other's codes
                # in terms of self.categories before comparing
                other_codes = recode_for_categories(
                    other.codes, other.categories, self.categories, copy=False
                )

            result = op(self._codes, other_codes)
            missing = (self._codes == -1) | (other_codes == -1)
            if missing.any():
                result[missing] = fill_value
            return result

        if hashable:
            if other not in self.categories:
                return ops.invalid_comparison(self, other, op)

            code = self._unbox_scalar(other)
            result = op(self._codes, code)
            if needs_na_fill:
                # GH#29820 performance trick; per that note the unboxed code
                # is always >= 0, so for __eq__, __ge__ and __gt__ the -1
                # positions already compare False (== fill_value) and the
                # masking is only required for the remaining operators.
                result[self._codes == -1] = fill_value
            return result

        # allow categorical vs object dtype array comparisons for equality
        # these are only positional comparisons
        if not is_equality_op:
            raise TypeError(
                f"Cannot compare a Categorical for op {opname} with "
                f"type {type(other)}.\nIf you want to compare values, "
                "use 'np.asarray(cat) <op> other'."
            )

        if isinstance(other, ExtensionArray) and needs_i8_conversion(other.dtype):
            # We would return NotImplemented here, but that messes up
            # ExtensionIndex's wrapped methods
            return op(other, self)
        return getattr(np.array(self), opname)(np.array(other))

    func.__name__ = opname

    return func
193
194
def contains(cat, key, container) -> bool:
    """
    Check whether ``key`` is a category of ``cat`` whose position is in
    ``container``.

    Shared helper for :meth:`Categorical.__contains__` and
    :meth:`CategoricalIndex.__contains__`.

    Parameters
    ----------
    cat : :class:`Categorical` or :class:`CategoricalIndex`
        Object whose ``categories`` are searched for ``key``.
    key : a hashable object
        The key to check membership for.
    container : Container (e.g. list-like or mapping)
        Collection that is tested for the location(s) of ``key``.

    Returns
    -------
    is_in : bool
        True if ``key`` is found in ``cat.categories`` and its location in
        ``categories`` is in ``container``, else False.

    Raises
    ------
    TypeError
        If ``key`` is not hashable.

    Notes
    -----
    This method does not check for NaN values. Do that separately
    before calling this method.
    """
    # Fail early on unhashable keys.
    hash(key)

    try:
        position = cat.categories.get_loc(key)
    except (KeyError, TypeError):
        # Not a category at all, so it cannot be one of the values either.
        return False

    # ``position`` locates key inside categories; whether the key is actually
    # present depends on that position being in ``container``. Example
    # ('b' in categories, but not in values):
    #   'b' in Categorical(['a'], categories=['a', 'b'])  # False
    if not is_scalar(position):
        # if categories is an IntervalIndex, the location is an array.
        return any(item in container for item in position)
    return position in container
244
245
246class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin):
247 """
248 Represent a categorical variable in classic R / S-plus fashion.
249
250 `Categoricals` can only take on a limited, and usually fixed, number
251 of possible values (`categories`). In contrast to statistical categorical
252 variables, a `Categorical` might have an order, but numerical operations
253 (additions, divisions, ...) are not possible.
254
255 All values of the `Categorical` are either in `categories` or `np.nan`.
256 Assigning values outside of `categories` will raise a `ValueError`. Order
257 is defined by the order of the `categories`, not lexical order of the
258 values.
259
260 Parameters
261 ----------
262 values : list-like
263 The values of the categorical. If categories are given, values not in
264 categories will be replaced with NaN.
265 categories : Index-like (unique), optional
266 The unique categories for this categorical. If not given, the
267 categories are assumed to be the unique values of `values` (sorted, if
268 possible, otherwise in the order in which they appear).
269 ordered : bool, default False
        Whether or not this categorical is treated as an ordered categorical.
271 If True, the resulting categorical will be ordered.
272 An ordered categorical respects, when sorted, the order of its
273 `categories` attribute (which in turn is the `categories` argument, if
274 provided).
275 dtype : CategoricalDtype
276 An instance of ``CategoricalDtype`` to use for this categorical.
277
278 Attributes
279 ----------
280 categories : Index
281 The categories of this categorical.
282 codes : ndarray
283 The codes (integer positions, which point to the categories) of this
284 categorical, read only.
285 ordered : bool
286 Whether or not this Categorical is ordered.
287 dtype : CategoricalDtype
288 The instance of ``CategoricalDtype`` storing the ``categories``
289 and ``ordered``.
290
291 Methods
292 -------
293 from_codes
294 __array__
295
296 Raises
297 ------
298 ValueError
299 If the categories do not validate.
300 TypeError
301 If an explicit ``ordered=True`` is given but no `categories` and the
302 `values` are not sortable.
303
304 See Also
305 --------
306 CategoricalDtype : Type for categorical data.
307 CategoricalIndex : An Index with an underlying ``Categorical``.
308
309 Notes
310 -----
311 See the `user guide
312 <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`__
313 for more.
314
315 Examples
316 --------
317 >>> pd.Categorical([1, 2, 3, 1, 2, 3])
318 [1, 2, 3, 1, 2, 3]
319 Categories (3, int64): [1, 2, 3]
320
321 >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])
322 ['a', 'b', 'c', 'a', 'b', 'c']
323 Categories (3, object): ['a', 'b', 'c']
324
325 Missing values are not included as a category.
326
327 >>> c = pd.Categorical([1, 2, 3, 1, 2, 3, np.nan])
328 >>> c
329 [1, 2, 3, 1, 2, 3, NaN]
330 Categories (3, int64): [1, 2, 3]
331
332 However, their presence is indicated in the `codes` attribute
333 by code `-1`.
334
335 >>> c.codes
336 array([ 0, 1, 2, 0, 1, 2, -1], dtype=int8)
337
338 Ordered `Categoricals` can be sorted according to the custom order
339 of the categories and can have a min and max value.
340
341 >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True,
342 ... categories=['c', 'b', 'a'])
343 >>> c
344 ['a', 'b', 'c', 'a', 'b', 'c']
345 Categories (3, object): ['c' < 'b' < 'a']
346 >>> c.min()
347 'c'
348 """
349
    # For comparisons, so that numpy uses our implementation of the compare
    # ops, which raise
352 __array_priority__ = 1000
353 # tolist is not actually deprecated, just suppressed in the __dir__
354 _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"])
355 _typ = "categorical"
356
357 _dtype: CategoricalDtype
358
359 @classmethod
360 # error: Argument 2 of "_simple_new" is incompatible with supertype
361 # "NDArrayBacked"; supertype defines the argument type as
362 # "Union[dtype[Any], ExtensionDtype]"
363 def _simple_new( # type: ignore[override]
364 cls, codes: np.ndarray, dtype: CategoricalDtype
365 ) -> Self:
366 # NB: This is not _quite_ as simple as the "usual" _simple_new
367 codes = coerce_indexer_dtype(codes, dtype.categories)
368 dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
369 return super()._simple_new(codes, dtype)
370
    def __init__(
        self,
        values,
        categories=None,
        ordered=None,
        dtype: Dtype | None = None,
        fastpath: bool | lib.NoDefault = lib.no_default,
        copy: bool = True,
    ) -> None:
        """
        Compute the codes and dtype for ``values`` and initialise the array.

        Parameters
        ----------
        values : list-like
            Data to encode.
        categories : Index-like, optional
            Forwarded to ``CategoricalDtype._from_values_or_dtype``.
        ordered : bool, optional
            Forwarded to ``CategoricalDtype._from_values_or_dtype``.
        dtype : Dtype, optional
            Forwarded to ``CategoricalDtype._from_values_or_dtype``.
        fastpath : bool, deprecated
            Passing any value emits a ``DeprecationWarning``. When truthy,
            ``values`` are taken as the codes and nothing is inferred.
        copy : bool, default True
            Only used here when ``values`` is already categorical, where it
            is forwarded to ``recode_for_categories``.

        Raises
        ------
        TypeError
            If ``values`` is not list-like, or if ``factorize(values,
            sort=True)`` raises ``TypeError`` while the dtype is ordered and
            no categories were given.
        NotImplementedError
            If ``values`` is converted to an ndarray with more than one
            dimension.
        """
        if fastpath is not lib.no_default:
            # GH#20110
            warnings.warn(
                "The 'fastpath' keyword in Categorical is deprecated and will "
                "be removed in a future version. Use Categorical.from_codes instead",
                DeprecationWarning,
                stacklevel=find_stack_level(),
            )
        else:
            fastpath = False

        dtype = CategoricalDtype._from_values_or_dtype(
            values, categories, ordered, dtype
        )
        # At this point, dtype is always a CategoricalDtype, but
        # we may have dtype.categories be None, and we need to
        # infer categories in a factorization step further below

        if fastpath:
            # Deprecated shortcut: ``values`` are used as the codes directly.
            codes = coerce_indexer_dtype(values, dtype.categories)
            dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
            super().__init__(codes, dtype)
            return

        if not is_list_like(values):
            # GH#38433
            raise TypeError("Categorical input must be list-like")

        # null_mask indicates missing values we want to exclude from inference.
        # This means: only missing values in list-likes (not arrays/ndframes).
        null_mask = np.array(False)

        # sanitize input
        vdtype = getattr(values, "dtype", None)
        if isinstance(vdtype, CategoricalDtype):
            # Already categorical: reuse its categories unless the caller
            # supplied some.
            if dtype.categories is None:
                dtype = CategoricalDtype(values.categories, dtype.ordered)
        elif not isinstance(values, (ABCIndex, ABCSeries, ExtensionArray)):
            values = com.convert_to_list_like(values)
            if isinstance(values, list) and len(values) == 0:
                # By convention, empty lists result in object dtype:
                values = np.array([], dtype=object)
            elif isinstance(values, np.ndarray):
                if values.ndim > 1:
                    # preempt sanitize_array from raising ValueError
                    raise NotImplementedError(
                        "> 1 ndim Categorical are not supported at this time"
                    )
                values = sanitize_array(values, None)
            else:
                # i.e. must be a list
                arr = sanitize_array(values, None)
                null_mask = isna(arr)
                if null_mask.any():
                    # We remove null values here, then below will re-insert
                    # them, grep "full_codes"
                    arr_list = [values[idx] for idx in np.where(~null_mask)[0]]

                    # GH#44900 Do not cast to float if we have only missing values
                    if arr_list or arr.dtype == "object":
                        sanitize_dtype = None
                    else:
                        sanitize_dtype = arr.dtype

                    arr = sanitize_array(arr_list, None, dtype=sanitize_dtype)
                values = arr

        if dtype.categories is None:
            # No categories known yet: derive both codes and categories
            # from the values themselves.
            if isinstance(values.dtype, ArrowDtype) and issubclass(
                values.dtype.type, CategoricalDtypeType
            ):
                # Arrow-backed values: read the codes and categories from the
                # combined chunk's ``indices`` and ``dictionary``.
                arr = values._pa_array.combine_chunks()
                categories = arr.dictionary.to_pandas(types_mapper=ArrowDtype)
                codes = arr.indices.to_numpy()
                dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered)
            else:
                if not isinstance(values, ABCIndex):
                    # in particular RangeIndex xref test_index_equal_range_categories
                    values = sanitize_array(values, None)
                try:
                    codes, categories = factorize(values, sort=True)
                except TypeError as err:
                    # Sorting failed; fall back to order of appearance, which
                    # is only acceptable for an unordered result.
                    codes, categories = factorize(values, sort=False)
                    if dtype.ordered:
                        # raise, as we don't have a sortable data structure and so
                        # the user should give us one by specifying categories
                        raise TypeError(
                            "'values' is not ordered, please "
                            "explicitly specify the categories order "
                            "by passing in a categories argument."
                        ) from err

                # we're inferring from values
                dtype = CategoricalDtype(categories, dtype.ordered)

        elif isinstance(values.dtype, CategoricalDtype):
            # Categorical input with explicit target categories: translate
            # the existing codes to the target categories.
            old_codes = extract_array(values)._codes
            codes = recode_for_categories(
                old_codes, values.dtype.categories, dtype.categories, copy=copy
            )

        else:
            codes = _get_codes_for_values(values, dtype.categories)

        if null_mask.any():
            # Reinsert -1 placeholders for previously removed missing values
            full_codes = -np.ones(null_mask.shape, dtype=codes.dtype)
            full_codes[~null_mask] = codes
            codes = full_codes

        dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
        arr = coerce_indexer_dtype(codes, dtype.categories)
        super().__init__(arr, dtype)
493
    @property
    def dtype(self) -> CategoricalDtype:
        """
        The :class:`~pandas.api.types.CategoricalDtype` for this instance.

        Returns
        -------
        CategoricalDtype
            The dtype object stored on this array (``self._dtype``).

        Examples
        --------
        >>> cat = pd.Categorical(['a', 'b'], ordered=True)
        >>> cat
        ['a', 'b']
        Categories (2, object): ['a' < 'b']
        >>> cat.dtype
        CategoricalDtype(categories=['a', 'b'], ordered=True, categories_dtype=object)
        """
        return self._dtype
509
510 @property
511 def _internal_fill_value(self) -> int:
512 # using the specific numpy integer instead of python int to get
513 # the correct dtype back from _quantile in the all-NA case
514 dtype = self._ndarray.dtype
515 return dtype.type(-1)
516
    @classmethod
    def _from_sequence(
        cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
    ) -> Self:
        """
        Construct an instance by forwarding to the regular constructor.

        Parameters
        ----------
        scalars : list-like
            Passed to the constructor as ``values``.
        dtype : Dtype, optional
            Passed to the constructor as ``dtype``.
        copy : bool, default False
            Passed to the constructor as ``copy``.
        """
        return cls(scalars, dtype=dtype, copy=copy)
522
523 @classmethod
524 def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self:
525 if dtype is None:
526 # The _from_scalars strictness doesn't make much sense in this case.
527 raise NotImplementedError
528
529 res = cls._from_sequence(scalars, dtype=dtype)
530
531 # if there are any non-category elements in scalars, these will be
532 # converted to NAs in res.
533 mask = isna(scalars)
534 if not (mask == res.isna()).all():
535 # Some non-category element in scalars got converted to NA in res.
536 raise ValueError
537 return res
538
539 @overload
540 def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
541 ...
542
543 @overload
544 def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray:
545 ...
546
547 @overload
548 def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike:
549 ...
550
551 def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
552 """
553 Coerce this type to another dtype
554
555 Parameters
556 ----------
557 dtype : numpy dtype or pandas type
558 copy : bool, default True
559 By default, astype always returns a newly allocated object.
560 If copy is set to False and dtype is categorical, the original
561 object is returned.
562 """
563 dtype = pandas_dtype(dtype)
564 if self.dtype is dtype:
565 result = self.copy() if copy else self
566
567 elif isinstance(dtype, CategoricalDtype):
568 # GH 10696/18593/18630
569 dtype = self.dtype.update_dtype(dtype)
570 self = self.copy() if copy else self
571 result = self._set_dtype(dtype)
572
573 elif isinstance(dtype, ExtensionDtype):
574 return super().astype(dtype, copy=copy)
575
576 elif dtype.kind in "iu" and self.isna().any():
577 raise ValueError("Cannot convert float NaN to integer")
578
579 elif len(self.codes) == 0 or len(self.categories) == 0:
580 result = np.array(
581 self,
582 dtype=dtype,
583 copy=copy,
584 )
585
586 else:
587 # GH8628 (PERF): astype category codes instead of astyping array
588 new_cats = self.categories._values
589
590 try:
591 new_cats = new_cats.astype(dtype=dtype, copy=copy)
592 fill_value = self.categories._na_value
593 if not is_valid_na_for_dtype(fill_value, dtype):
594 fill_value = lib.item_from_zerodim(
595 np.array(self.categories._na_value).astype(dtype)
596 )
597 except (
598 TypeError, # downstream error msg for CategoricalIndex is misleading
599 ValueError,
600 ):
601 msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}"
602 raise ValueError(msg)
603
604 result = take_nd(
605 new_cats, ensure_platform_int(self._codes), fill_value=fill_value
606 )
607
608 return result
609
610 def to_list(self):
611 """
612 Alias for tolist.
613 """
614 # GH#51254
615 warnings.warn(
616 "Categorical.to_list is deprecated and will be removed in a future "
617 "version. Use obj.tolist() instead",
618 FutureWarning,
619 stacklevel=find_stack_level(),
620 )
621 return self.tolist()
622
623 @classmethod
624 def _from_inferred_categories(
625 cls, inferred_categories, inferred_codes, dtype, true_values=None
626 ) -> Self:
627 """
628 Construct a Categorical from inferred values.
629
630 For inferred categories (`dtype` is None) the categories are sorted.
631 For explicit `dtype`, the `inferred_categories` are cast to the
632 appropriate type.
633
634 Parameters
635 ----------
636 inferred_categories : Index
637 inferred_codes : Index
638 dtype : CategoricalDtype or 'category'
639 true_values : list, optional
640 If none are provided, the default ones are
641 "True", "TRUE", and "true."
642
643 Returns
644 -------
645 Categorical
646 """
647 from pandas import (
648 Index,
649 to_datetime,
650 to_numeric,
651 to_timedelta,
652 )
653
654 cats = Index(inferred_categories)
655 known_categories = (
656 isinstance(dtype, CategoricalDtype) and dtype.categories is not None
657 )
658
659 if known_categories:
660 # Convert to a specialized type with `dtype` if specified.
661 if is_any_real_numeric_dtype(dtype.categories.dtype):
662 cats = to_numeric(inferred_categories, errors="coerce")
663 elif lib.is_np_dtype(dtype.categories.dtype, "M"):
664 cats = to_datetime(inferred_categories, errors="coerce")
665 elif lib.is_np_dtype(dtype.categories.dtype, "m"):
666 cats = to_timedelta(inferred_categories, errors="coerce")
667 elif is_bool_dtype(dtype.categories.dtype):
668 if true_values is None:
669 true_values = ["True", "TRUE", "true"]
670
671 # error: Incompatible types in assignment (expression has type
672 # "ndarray", variable has type "Index")
673 cats = cats.isin(true_values) # type: ignore[assignment]
674
675 if known_categories:
676 # Recode from observation order to dtype.categories order.
677 categories = dtype.categories
678 codes = recode_for_categories(inferred_codes, cats, categories)
679 elif not cats.is_monotonic_increasing:
680 # Sort categories and recode for unknown categories.
681 unsorted = cats.copy()
682 categories = cats.sort_values()
683
684 codes = recode_for_categories(inferred_codes, unsorted, categories)
685 dtype = CategoricalDtype(categories, ordered=False)
686 else:
687 dtype = CategoricalDtype(cats, ordered=False)
688 codes = inferred_codes
689
690 return cls._simple_new(codes, dtype=dtype)
691
692 @classmethod
693 def from_codes(
694 cls,
695 codes,
696 categories=None,
697 ordered=None,
698 dtype: Dtype | None = None,
699 validate: bool = True,
700 ) -> Self:
701 """
702 Make a Categorical type from codes and categories or dtype.
703
704 This constructor is useful if you already have codes and
705 categories/dtype and so do not need the (computation intensive)
706 factorization step, which is usually done on the constructor.
707
708 If your data does not follow this convention, please use the normal
709 constructor.
710
711 Parameters
712 ----------
713 codes : array-like of int
714 An integer array, where each integer points to a category in
715 categories or dtype.categories, or else is -1 for NaN.
716 categories : index-like, optional
717 The categories for the categorical. Items need to be unique.
718 If the categories are not given here, then they must be provided
719 in `dtype`.
720 ordered : bool, optional
721 Whether or not this categorical is treated as an ordered
722 categorical. If not given here or in `dtype`, the resulting
723 categorical will be unordered.
724 dtype : CategoricalDtype or "category", optional
725 If :class:`CategoricalDtype`, cannot be used together with
726 `categories` or `ordered`.
727 validate : bool, default True
728 If True, validate that the codes are valid for the dtype.
729 If False, don't validate that the codes are valid. Be careful about skipping
730 validation, as invalid codes can lead to severe problems, such as segfaults.
731
732 .. versionadded:: 2.1.0
733
734 Returns
735 -------
736 Categorical
737
738 Examples
739 --------
740 >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True)
741 >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype)
742 ['a', 'b', 'a', 'b']
743 Categories (2, object): ['a' < 'b']
744 """
745 dtype = CategoricalDtype._from_values_or_dtype(
746 categories=categories, ordered=ordered, dtype=dtype
747 )
748 if dtype.categories is None:
749 msg = (
750 "The categories must be provided in 'categories' or "
751 "'dtype'. Both were None."
752 )
753 raise ValueError(msg)
754
755 if validate:
756 # beware: non-valid codes may segfault
757 codes = cls._validate_codes_for_dtype(codes, dtype=dtype)
758
759 return cls._simple_new(codes, dtype=dtype)
760
761 # ------------------------------------------------------------------
762 # Categories/Codes/Ordered
763
    @property
    def categories(self) -> Index:
        """
        The categories of this categorical.

        Setting assigns new values to each category (effectively a rename of
        each individual category).

        The assigned value has to be a list-like object. All items must be
        unique and the number of items in the new categories must be the same
        as the number of items in the old categories.

        Returns
        -------
        Index
            The ``categories`` of this array's dtype.

        Raises
        ------
        ValueError
            If the new categories do not validate as categories or if the
            number of new categories is unequal the number of old categories

        See Also
        --------
        rename_categories : Rename categories.
        reorder_categories : Reorder categories.
        add_categories : Add new categories.
        remove_categories : Remove the specified categories.
        remove_unused_categories : Remove categories which are not used.
        set_categories : Set the categories to the specified ones.

        Examples
        --------
        For :class:`pandas.Series`:

        >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
        >>> ser.cat.categories
        Index(['a', 'b', 'c'], dtype='object')

        >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'a'], categories=['b', 'c', 'd'])
        >>> ser = pd.Series(raw_cat)
        >>> ser.cat.categories
        Index(['b', 'c', 'd'], dtype='object')

        For :class:`pandas.Categorical`:

        >>> cat = pd.Categorical(['a', 'b'], ordered=True)
        >>> cat.categories
        Index(['a', 'b'], dtype='object')

        For :class:`pandas.CategoricalIndex`:

        >>> ci = pd.CategoricalIndex(['a', 'c', 'b', 'a', 'c', 'b'])
        >>> ci.categories
        Index(['a', 'b', 'c'], dtype='object')

        >>> ci = pd.CategoricalIndex(['a', 'c'], categories=['c', 'b', 'a'])
        >>> ci.categories
        Index(['c', 'b', 'a'], dtype='object')
        """
        # NOTE(review): only a getter is defined next to this property in the
        # code visible here; the "Setting ..." and "Raises" text above
        # describes assignment and may be stale -- confirm whether a setter
        # exists elsewhere.
        return self.dtype.categories
821
    @property
    def ordered(self) -> Ordered:
        """
        Whether the categories have an ordered relationship.

        Returns
        -------
        Ordered
            The ``ordered`` attribute of this array's dtype.

        Examples
        --------
        For :class:`pandas.Series`:

        >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
        >>> ser.cat.ordered
        False

        >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'a'], ordered=True)
        >>> ser = pd.Series(raw_cat)
        >>> ser.cat.ordered
        True

        For :class:`pandas.Categorical`:

        >>> cat = pd.Categorical(['a', 'b'], ordered=True)
        >>> cat.ordered
        True

        >>> cat = pd.Categorical(['a', 'b'], ordered=False)
        >>> cat.ordered
        False

        For :class:`pandas.CategoricalIndex`:

        >>> ci = pd.CategoricalIndex(['a', 'b'], ordered=True)
        >>> ci.ordered
        True

        >>> ci = pd.CategoricalIndex(['a', 'b'], ordered=False)
        >>> ci.ordered
        False
        """
        return self.dtype.ordered
861
862 @property
863 def codes(self) -> np.ndarray:
864 """
865 The category codes of this categorical index.
866
867 Codes are an array of integers which are the positions of the actual
868 values in the categories array.
869
870 There is no setter, use the other categorical methods and the normal item
871 setter to change values in the categorical.
872
873 Returns
874 -------
875 ndarray[int]
876 A non-writable view of the ``codes`` array.
877
878 Examples
879 --------
880 For :class:`pandas.Categorical`:
881
882 >>> cat = pd.Categorical(['a', 'b'], ordered=True)
883 >>> cat.codes
884 array([0, 1], dtype=int8)
885
886 For :class:`pandas.CategoricalIndex`:
887
888 >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'])
889 >>> ci.codes
890 array([0, 1, 2, 0, 1, 2], dtype=int8)
891
892 >>> ci = pd.CategoricalIndex(['a', 'c'], categories=['c', 'b', 'a'])
893 >>> ci.codes
894 array([2, 0], dtype=int8)
895 """
896 v = self._codes.view()
897 v.flags.writeable = False
898 return v
899
900 def _set_categories(self, categories, fastpath: bool = False) -> None:
901 """
902 Sets new categories inplace
903
904 Parameters
905 ----------
906 fastpath : bool, default False
907 Don't perform validation of the categories for uniqueness or nulls
908
909 Examples
910 --------
911 >>> c = pd.Categorical(['a', 'b'])
912 >>> c
913 ['a', 'b']
914 Categories (2, object): ['a', 'b']
915
916 >>> c._set_categories(pd.Index(['a', 'c']))
917 >>> c
918 ['a', 'c']
919 Categories (2, object): ['a', 'c']
920 """
921 if fastpath:
922 new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered)
923 else:
924 new_dtype = CategoricalDtype(categories, ordered=self.ordered)
925 if (
926 not fastpath
927 and self.dtype.categories is not None
928 and len(new_dtype.categories) != len(self.dtype.categories)
929 ):
930 raise ValueError(
931 "new categories need to have the same number of "
932 "items as the old categories!"
933 )
934
935 super().__init__(self._ndarray, new_dtype)
936
937 def _set_dtype(self, dtype: CategoricalDtype) -> Self:
938 """
939 Internal method for directly updating the CategoricalDtype
940
941 Parameters
942 ----------
943 dtype : CategoricalDtype
944
945 Notes
946 -----
947 We don't do any validation here. It's assumed that the dtype is
948 a (valid) instance of `CategoricalDtype`.
949 """
950 codes = recode_for_categories(self.codes, self.categories, dtype.categories)
951 return type(self)._simple_new(codes, dtype=dtype)
952
953 def set_ordered(self, value: bool) -> Self:
954 """
955 Set the ordered attribute to the boolean value.
956
957 Parameters
958 ----------
959 value : bool
960 Set whether this categorical is ordered (True) or not (False).
961 """
962 new_dtype = CategoricalDtype(self.categories, ordered=value)
963 cat = self.copy()
964 NDArrayBacked.__init__(cat, cat._ndarray, new_dtype)
965 return cat
966
    def as_ordered(self) -> Self:
        """
        Set the Categorical to be ordered.

        Shorthand for ``set_ordered(True)``.

        Returns
        -------
        Categorical
            Ordered Categorical.

        Examples
        --------
        For :class:`pandas.Series`:

        >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
        >>> ser.cat.ordered
        False
        >>> ser = ser.cat.as_ordered()
        >>> ser.cat.ordered
        True

        For :class:`pandas.CategoricalIndex`:

        >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a'])
        >>> ci.ordered
        False
        >>> ci = ci.as_ordered()
        >>> ci.ordered
        True
        """
        return self.set_ordered(True)
997
    def as_unordered(self) -> Self:
        """
        Set the Categorical to be unordered.

        Shorthand for ``set_ordered(False)``.

        Returns
        -------
        Categorical
            Unordered Categorical.

        Examples
        --------
        For :class:`pandas.Series`:

        >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'a'], ordered=True)
        >>> ser = pd.Series(raw_cat)
        >>> ser.cat.ordered
        True
        >>> ser = ser.cat.as_unordered()
        >>> ser.cat.ordered
        False

        For :class:`pandas.CategoricalIndex`:

        >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a'], ordered=True)
        >>> ci.ordered
        True
        >>> ci = ci.as_unordered()
        >>> ci.ordered
        False
        """
        return self.set_ordered(False)
1029
def set_categories(self, new_categories, ordered=None, rename: bool = False):
    """
    Set the categories to the specified new categories.

    ``new_categories`` may introduce new categories (which end up unused)
    or drop old ones (values belonging to them become ``NaN``). With
    ``rename=True`` the categories are simply relabelled by position:
    supplying fewer items than before turns the surplus values into
    ``NaN``, supplying more leaves the extra categories unused.

    Because adding, removing and reordering can all be done in a single
    call, this is faster than chaining the more specialised methods.

    On the other hand, this method performs no checks (e.g., whether the
    old categories are all present in the new ones on a reorder), which
    can lead to surprising results, for example with special string
    dtypes, which do not consider an S1 string equal to a single-char
    python string.

    Parameters
    ----------
    new_categories : Index-like
        The categories in new order.
    ordered : bool, optional
        Whether or not the categorical is treated as an ordered categorical.
        If not given, do not change the ordered information.
    rename : bool, default False
        Whether or not the new_categories should be considered as a rename
        of the old categories or as reordered categories.

    Returns
    -------
    Categorical
        New Categorical using the requested categories.

    Raises
    ------
    ValueError
        If new_categories does not validate as categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.

    Examples
    --------
    For :class:`pandas.Series`:

    >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'A'],
    ...                          categories=['a', 'b', 'c'], ordered=True)
    >>> ser = pd.Series(raw_cat)
    >>> ser
    0      a
    1      b
    2      c
    3    NaN
    dtype: category
    Categories (3, object): ['a' < 'b' < 'c']

    >>> ser.cat.set_categories(['A', 'B', 'C'], rename=True)
    0      A
    1      B
    2      C
    3    NaN
    dtype: category
    Categories (3, object): ['A' < 'B' < 'C']

    For :class:`pandas.CategoricalIndex`:

    >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'A'],
    ...                          categories=['a', 'b', 'c'], ordered=True)
    >>> ci
    CategoricalIndex(['a', 'b', 'c', nan], categories=['a', 'b', 'c'],
                     ordered=True, dtype='category')

    >>> ci.set_categories(['A', 'b', 'c'])
    CategoricalIndex([nan, 'b', 'c', nan], categories=['A', 'b', 'c'],
                     ordered=True, dtype='category')
    >>> ci.set_categories(['A', 'b', 'c'], rename=True)
    CategoricalIndex(['A', 'b', 'c', nan], categories=['A', 'b', 'c'],
                     ordered=True, dtype='category')
    """
    # Keep the current orderedness unless the caller overrides it.
    target_ordered = self.dtype.ordered if ordered is None else ordered
    new_dtype = CategoricalDtype(new_categories, ordered=target_ordered)

    result = self.copy()
    if not rename:
        # Translate every code from the old category positions to the new ones.
        codes = recode_for_categories(
            result.codes, result.categories, new_dtype.categories
        )
    else:
        n_new = len(new_dtype.categories)
        old_categories = result.dtype.categories
        if old_categories is not None and n_new < len(old_categories):
            # Codes that no longer have a label become the NaN sentinel.
            result._codes[result._codes >= n_new] = -1
        codes = result._codes
    NDArrayBacked.__init__(result, codes, new_dtype)
    return result
1135
def rename_categories(self, new_categories) -> Self:
    """
    Rename categories.

    Parameters
    ----------
    new_categories : list-like, dict-like or callable

        New categories which will replace old categories.

        * list-like: all items must be unique and the number of items in
          the new categories must match the existing number of categories.

        * dict-like: specifies a mapping from
          old categories to new. Categories not contained in the mapping
          are passed through and extra categories in the mapping are
          ignored.

        * callable : a callable that is called on all items in the old
          categories and whose return values comprise the new categories.

    Returns
    -------
    Categorical
        Categorical with renamed categories.

    Raises
    ------
    ValueError
        If new categories are list-like and do not have the same number of
        items as the current categories or do not validate as categories

    See Also
    --------
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'a', 'b'])
    >>> c.rename_categories([0, 1])
    [0, 0, 1]
    Categories (2, int64): [0, 1]

    For dict-like ``new_categories``, extra keys are ignored and
    categories not in the dictionary are passed through

    >>> c.rename_categories({'a': 'A', 'c': 'C'})
    ['A', 'A', 'b']
    Categories (2, object): ['A', 'b']

    You may also provide a callable to create the new categories

    >>> c.rename_categories(lambda x: x.upper())
    ['A', 'A', 'B']
    Categories (2, object): ['A', 'B']
    """
    if is_dict_like(new_categories):
        # Labels missing from the mapping keep their current name.
        lookup = new_categories.get
        renamed = [lookup(label, label) for label in self.categories]
    elif callable(new_categories):
        renamed = [new_categories(label) for label in self.categories]
    else:
        renamed = new_categories

    result = self.copy()
    result._set_categories(renamed)
    return result
1207
def reorder_categories(self, new_categories, ordered=None) -> Self:
    """
    Reorder categories as specified in new_categories.

    ``new_categories`` must contain every old category and nothing else.

    Parameters
    ----------
    new_categories : Index-like
        The categories in new order.
    ordered : bool, optional
        Whether or not the categorical is treated as an ordered categorical.
        If not given, do not change the ordered information.

    Returns
    -------
    Categorical
        Categorical with reordered categories.

    Raises
    ------
    ValueError
        If the new categories do not contain all old category items or any
        new ones

    See Also
    --------
    rename_categories : Rename categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    For :class:`pandas.Series`:

    >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
    >>> ser = ser.cat.reorder_categories(['c', 'b', 'a'], ordered=True)
    >>> ser
    0    a
    1    b
    2    c
    3    a
    dtype: category
    Categories (3, object): ['c' < 'b' < 'a']

    >>> ser.sort_values()
    2    c
    1    b
    0    a
    3    a
    dtype: category
    Categories (3, object): ['c' < 'b' < 'a']

    For :class:`pandas.CategoricalIndex`:

    >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a'])
    >>> ci
    CategoricalIndex(['a', 'b', 'c', 'a'], categories=['a', 'b', 'c'],
                     ordered=False, dtype='category')
    >>> ci.reorder_categories(['c', 'b', 'a'], ordered=True)
    CategoricalIndex(['a', 'b', 'c', 'a'], categories=['c', 'b', 'a'],
                     ordered=True, dtype='category')
    """
    same_length = len(self.categories) == len(new_categories)
    # The set difference is only computed when the lengths already agree.
    if not same_length or not self.categories.difference(new_categories).empty:
        raise ValueError(
            "items in new_categories are not the same as in old categories"
        )
    return self.set_categories(new_categories, ordered=ordered)
1282
def add_categories(self, new_categories) -> Self:
    """
    Add new categories.

    `new_categories` are appended after the existing categories (i.e. at
    the last/highest place) and are unused directly after this call.

    Parameters
    ----------
    new_categories : category or list-like of category
        The new categories to be included.

    Returns
    -------
    Categorical
        Categorical with new categories added.

    Raises
    ------
    ValueError
        If the new categories include old categories or do not validate as
        categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['c', 'b', 'c'])
    >>> c
    ['c', 'b', 'c']
    Categories (2, object): ['b', 'c']

    >>> c.add_categories(['d', 'a'])
    ['c', 'b', 'c']
    Categories (4, object): ['b', 'c', 'd', 'a']
    """
    if not is_list_like(new_categories):
        # A lone scalar is treated as a one-element list.
        new_categories = [new_categories]

    existing = self.dtype.categories
    already_included = set(new_categories) & set(existing)
    if already_included:
        raise ValueError(
            f"new categories must not include old categories: {already_included}"
        )

    combined = [*existing, *new_categories]
    if hasattr(new_categories, "dtype"):
        from pandas import Series

        # Typed input: build the categories with the common dtype of the
        # old and new labels.
        common = find_common_type([existing.dtype, new_categories.dtype])
        combined = Series(combined, dtype=common)

    new_dtype = CategoricalDtype(combined, self.ordered)
    result = self.copy()
    # Existing codes stay valid; only their integer width may need to change.
    codes = coerce_indexer_dtype(result._ndarray, new_dtype.categories)
    NDArrayBacked.__init__(result, codes, new_dtype)
    return result
1351
def remove_categories(self, removals) -> Self:
    """
    Remove the specified categories.

    `removals` must be included in the old categories. Values which were in
    the removed categories will be set to NaN

    Parameters
    ----------
    removals : category or list of categories
        The categories which should be removed.

    Returns
    -------
    Categorical
        Categorical with removed categories.

    Raises
    ------
    ValueError
        If the removals are not contained in the categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd'])
    >>> c
    ['a', 'c', 'b', 'c', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c.remove_categories(['d', 'a'])
    [NaN, 'c', 'b', 'c', NaN]
    Categories (2, object): ['b', 'c']
    """
    from pandas import Index

    if not is_list_like(removals):
        removals = [removals]

    # De-duplicate the request and drop missing entries before comparing.
    removals = Index(removals).unique().dropna()
    current = self.dtype.categories
    if self.dtype.ordered is True:
        # An ordered categorical must keep its category order intact.
        remaining = current.difference(removals, sort=False)
    else:
        remaining = current.difference(removals)

    unknown = removals.difference(current)
    if len(unknown) != 0:
        not_included = set(unknown)
        raise ValueError(f"removals must all be in old categories: {not_included}")

    return self.set_categories(remaining, ordered=self.ordered, rename=False)
1411
def remove_unused_categories(self) -> Self:
    """
    Remove categories which are not used.

    Returns
    -------
    Categorical
        Categorical with unused categories dropped.

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd'])
    >>> c
    ['a', 'c', 'b', 'c', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c[2] = 'a'
    >>> c[4] = 'c'
    >>> c
    ['a', 'c', 'a', 'c', 'c']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c.remove_unused_categories()
    ['a', 'c', 'a', 'c', 'c']
    Categories (2, object): ['a', 'c']
    """
    # ``used`` holds the sorted distinct codes; ``positions`` maps every
    # element to its slot within ``used``.
    used, positions = np.unique(self._codes, return_inverse=True)

    if used.size > 0 and used[0] == -1:  # na sentinel
        # Drop the sentinel and shift so that missing values map back to -1.
        used = used[1:]
        positions = positions - 1

    kept_categories = self.dtype.categories.take(used)
    new_dtype = CategoricalDtype._from_fastpath(kept_categories, ordered=self.ordered)
    new_codes = coerce_indexer_dtype(positions, new_dtype.categories)

    result = self.copy()
    NDArrayBacked.__init__(result, new_codes, new_dtype)
    return result
1460
1461 # ------------------------------------------------------------------
1462
1463 def map(
1464 self,
1465 mapper,
1466 na_action: Literal["ignore"] | None | lib.NoDefault = lib.no_default,
1467 ):
1468 """
1469 Map categories using an input mapping or function.
1470
1471 Maps the categories to new categories. If the mapping correspondence is
1472 one-to-one the result is a :class:`~pandas.Categorical` which has the
1473 same order property as the original, otherwise a :class:`~pandas.Index`
1474 is returned. NaN values are unaffected.
1475
1476 If a `dict` or :class:`~pandas.Series` is used any unmapped category is
1477 mapped to `NaN`. Note that if this happens an :class:`~pandas.Index`
1478 will be returned.
1479
1480 Parameters
1481 ----------
1482 mapper : function, dict, or Series
1483 Mapping correspondence.
1484 na_action : {None, 'ignore'}, default 'ignore'
1485 If 'ignore', propagate NaN values, without passing them to the
1486 mapping correspondence.
1487
1488 .. deprecated:: 2.1.0
1489
1490 The default value of 'ignore' has been deprecated and will be changed to
1491 None in the future.
1492
1493 Returns
1494 -------
1495 pandas.Categorical or pandas.Index
1496 Mapped categorical.
1497
1498 See Also
1499 --------
1500 CategoricalIndex.map : Apply a mapping correspondence on a
1501 :class:`~pandas.CategoricalIndex`.
1502 Index.map : Apply a mapping correspondence on an
1503 :class:`~pandas.Index`.
1504 Series.map : Apply a mapping correspondence on a
1505 :class:`~pandas.Series`.
1506 Series.apply : Apply more complex functions on a
1507 :class:`~pandas.Series`.
1508
1509 Examples
1510 --------
1511 >>> cat = pd.Categorical(['a', 'b', 'c'])
1512 >>> cat
1513 ['a', 'b', 'c']
1514 Categories (3, object): ['a', 'b', 'c']
1515 >>> cat.map(lambda x: x.upper(), na_action=None)
1516 ['A', 'B', 'C']
1517 Categories (3, object): ['A', 'B', 'C']
1518 >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'}, na_action=None)
1519 ['first', 'second', 'third']
1520 Categories (3, object): ['first', 'second', 'third']
1521
1522 If the mapping is one-to-one the ordering of the categories is
1523 preserved:
1524
1525 >>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True)
1526 >>> cat
1527 ['a', 'b', 'c']
1528 Categories (3, object): ['a' < 'b' < 'c']
1529 >>> cat.map({'a': 3, 'b': 2, 'c': 1}, na_action=None)
1530 [3, 2, 1]
1531 Categories (3, int64): [3 < 2 < 1]
1532
1533 If the mapping is not one-to-one an :class:`~pandas.Index` is returned:
1534
1535 >>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'}, na_action=None)
1536 Index(['first', 'second', 'first'], dtype='object')
1537
1538 If a `dict` is used, all unmapped categories are mapped to `NaN` and
1539 the result is an :class:`~pandas.Index`:
1540
1541 >>> cat.map({'a': 'first', 'b': 'second'}, na_action=None)
1542 Index(['first', 'second', nan], dtype='object')
1543 """
1544 if na_action is lib.no_default:
1545 warnings.warn(
1546 "The default value of 'ignore' for the `na_action` parameter in "
1547 "pandas.Categorical.map is deprecated and will be "
1548 "changed to 'None' in a future version. Please set na_action to the "
1549 "desired value to avoid seeing this warning",
1550 FutureWarning,
1551 stacklevel=find_stack_level(),
1552 )
1553 na_action = "ignore"
1554
1555 assert callable(mapper) or is_dict_like(mapper)
1556
1557 new_categories = self.categories.map(mapper)
1558
1559 has_nans = np.any(self._codes == -1)
1560
1561 na_val = np.nan
1562 if na_action is None and has_nans:
1563 na_val = mapper(np.nan) if callable(mapper) else mapper.get(np.nan, np.nan)
1564
1565 if new_categories.is_unique and not new_categories.hasnans and na_val is np.nan:
1566 new_dtype = CategoricalDtype(new_categories, ordered=self.ordered)
1567 return self.from_codes(self._codes.copy(), dtype=new_dtype, validate=False)
1568
1569 if has_nans:
1570 new_categories = new_categories.insert(len(new_categories), na_val)
1571
1572 return np.take(new_categories, self._codes)
1573
# Rich-comparison dunders: each one is produced by handing the matching
# function from the ``operator`` module to ``_cat_compare_op``.
__eq__ = _cat_compare_op(operator.eq)
__ne__ = _cat_compare_op(operator.ne)
__lt__ = _cat_compare_op(operator.lt)
__gt__ = _cat_compare_op(operator.gt)
__le__ = _cat_compare_op(operator.le)
__ge__ = _cat_compare_op(operator.ge)
1580
1581 # -------------------------------------------------------------
1582 # Validators; ideally these can be de-duplicated
1583
1584 def _validate_setitem_value(self, value):
1585 if not is_hashable(value):
1586 # wrap scalars and hashable-listlikes in list
1587 return self._validate_listlike(value)
1588 else:
1589 return self._validate_scalar(value)
1590
1591 def _validate_scalar(self, fill_value):
1592 """
1593 Convert a user-facing fill_value to a representation to use with our
1594 underlying ndarray, raising TypeError if this is not possible.
1595
1596 Parameters
1597 ----------
1598 fill_value : object
1599
1600 Returns
1601 -------
1602 fill_value : int
1603
1604 Raises
1605 ------
1606 TypeError
1607 """
1608
1609 if is_valid_na_for_dtype(fill_value, self.categories.dtype):
1610 fill_value = -1
1611 elif fill_value in self.categories:
1612 fill_value = self._unbox_scalar(fill_value)
1613 else:
1614 raise TypeError(
1615 "Cannot setitem on a Categorical with a new "
1616 f"category ({fill_value}), set the categories first"
1617 ) from None
1618 return fill_value
1619
1620 @classmethod
1621 def _validate_codes_for_dtype(cls, codes, *, dtype: CategoricalDtype) -> np.ndarray:
1622 if isinstance(codes, ExtensionArray) and is_integer_dtype(codes.dtype):
1623 # Avoid the implicit conversion of Int to object
1624 if isna(codes).any():
1625 raise ValueError("codes cannot contain NA values")
1626 codes = codes.to_numpy(dtype=np.int64)
1627 else:
1628 codes = np.asarray(codes)
1629 if len(codes) and codes.dtype.kind not in "iu":
1630 raise ValueError("codes need to be array-like integers")
1631
1632 if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1):
1633 raise ValueError("codes need to be between -1 and len(categories)-1")
1634 return codes
1635
1636 # -------------------------------------------------------------
1637
1638 @ravel_compat
1639 def __array__(
1640 self, dtype: NpDtype | None = None, copy: bool | None = None
1641 ) -> np.ndarray:
1642 """
1643 The numpy array interface.
1644
1645 Returns
1646 -------
1647 numpy.array
1648 A numpy array of either the specified dtype or,
1649 if dtype==None (default), the same dtype as
1650 categorical.categories.dtype.
1651
1652 Examples
1653 --------
1654
1655 >>> cat = pd.Categorical(['a', 'b'], ordered=True)
1656
1657 The following calls ``cat.__array__``
1658
1659 >>> np.asarray(cat)
1660 array(['a', 'b'], dtype=object)
1661 """
1662 ret = take_nd(self.categories._values, self._codes)
1663 if dtype and np.dtype(dtype) != self.categories.dtype:
1664 return np.asarray(ret, dtype)
1665 # When we're a Categorical[ExtensionArray], like Interval,
1666 # we need to ensure __array__ gets all the way to an
1667 # ndarray.
1668 return np.asarray(ret)
1669
1670 def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
1671 # for binary ops, use our custom dunder methods
1672 result = arraylike.maybe_dispatch_ufunc_to_dunder_op(
1673 self, ufunc, method, *inputs, **kwargs
1674 )
1675 if result is not NotImplemented:
1676 return result
1677
1678 if "out" in kwargs:
1679 # e.g. test_numpy_ufuncs_out
1680 return arraylike.dispatch_ufunc_with_out(
1681 self, ufunc, method, *inputs, **kwargs
1682 )
1683
1684 if method == "reduce":
1685 # e.g. TestCategoricalAnalytics::test_min_max_ordered
1686 result = arraylike.dispatch_reduction_ufunc(
1687 self, ufunc, method, *inputs, **kwargs
1688 )
1689 if result is not NotImplemented:
1690 return result
1691
1692 # for all other cases, raise for now (similarly as what happens in
1693 # Series.__array_prepare__)
1694 raise TypeError(
1695 f"Object with dtype {self.dtype} cannot perform "
1696 f"the numpy op {ufunc.__name__}"
1697 )
1698
def __setstate__(self, state) -> None:
    """
    Restore pickled state (necessary for making this object picklable).

    Dict states are patched before being passed on: a missing ``_dtype``
    is rebuilt from ``_categories``/``_ordered``, and a ``_codes`` entry is
    moved to ``_ndarray`` when the latter is absent. Non-dict states are
    forwarded to the parent class untouched.
    """
    if isinstance(state, dict):
        if "_dtype" not in state:
            state["_dtype"] = CategoricalDtype(state["_categories"], state["_ordered"])

        needs_codes_rename = "_codes" in state and "_ndarray" not in state
        if needs_codes_rename:
            # backward compat, changed what is property vs attribute
            state["_ndarray"] = state.pop("_codes")

        super().__setstate__(state)
        return None

    return super().__setstate__(state)
1712
@property
def nbytes(self) -> int:
    """Sum of the ``nbytes`` of the codes and of the categories' values."""
    codes_bytes = self._codes.nbytes
    categories_bytes = self.dtype.categories.values.nbytes
    return codes_bytes + categories_bytes
1716
def memory_usage(self, deep: bool = False) -> int:
    """
    Memory usage of my values

    Parameters
    ----------
    deep : bool
        Introspect the data deeply, interrogate
        `object` dtypes for system-level memory consumption

    Returns
    -------
    bytes used

    Notes
    -----
    Memory usage does not include memory consumed by elements that
    are not components of the array if deep=False

    See Also
    --------
    numpy.ndarray.nbytes
    """
    # ``deep`` only affects how the categories are measured.
    categories_usage = self.dtype.categories.memory_usage(deep=deep)
    return self._codes.nbytes + categories_usage
1741
def isna(self) -> npt.NDArray[np.bool_]:
    """
    Detect missing values

    Missing values (-1 in .codes) are detected.

    Returns
    -------
    np.ndarray[bool] of whether my values are null

    See Also
    --------
    isna : Top-level isna.
    isnull : Alias of isna.
    Categorical.notna : Boolean inverse of Categorical.isna.

    """
    na_sentinel = -1
    return self._codes == na_sentinel

isnull = isna
1762
def notna(self) -> npt.NDArray[np.bool_]:
    """
    Inverse of isna

    Both missing values (-1 in .codes) and NA as a category are detected as
    null.

    Returns
    -------
    np.ndarray[bool] of whether my values are not null

    See Also
    --------
    notna : Top-level notna.
    notnull : Alias of notna.
    Categorical.isna : Boolean inverse of Categorical.notna.

    """
    missing = self.isna()
    return ~missing

notnull = notna
1784
def value_counts(self, dropna: bool = True) -> Series:
    """
    Return a Series containing counts of each category.

    Every category will have an entry, even those with a count of 0.

    Parameters
    ----------
    dropna : bool, default True
        Don't include counts of NaN.

    Returns
    -------
    counts : Series

    See Also
    --------
    Series.value_counts
    """
    from pandas import (
        CategoricalIndex,
        Series,
    )

    codes = self._codes
    n_categories = len(self.categories)
    present = codes >= 0
    no_missing = present.all()
    index_codes = np.arange(n_categories)

    if dropna or no_missing:
        observed = codes if no_missing else codes[present]
        counts = np.bincount(observed, minlength=n_categories)
    else:
        # Count missing values in one extra bin placed after the last
        # category, and label that bin with the NaN code.
        counts = np.bincount(np.where(present, codes, n_categories))
        index_codes = np.append(index_codes, -1)

    index_codes = coerce_indexer_dtype(index_codes, self.dtype.categories)
    index_values = self._from_backing_data(index_codes)

    return Series(
        counts,
        index=CategoricalIndex(index_values),
        dtype="int64",
        name="count",
        copy=False,
    )
1826
1827 # error: Argument 2 of "_empty" is incompatible with supertype
1828 # "NDArrayBackedExtensionArray"; supertype defines the argument type as
1829 # "ExtensionDtype"
1830 @classmethod
1831 def _empty( # type: ignore[override]
1832 cls, shape: Shape, dtype: CategoricalDtype
1833 ) -> Self:
1834 """
1835 Analogous to np.empty(shape, dtype=dtype)
1836
1837 Parameters
1838 ----------
1839 shape : tuple[int]
1840 dtype : CategoricalDtype
1841 """
1842 arr = cls._from_sequence([], dtype=dtype)
1843
1844 # We have to use np.zeros instead of np.empty otherwise the resulting
1845 # ndarray may contain codes not supported by this dtype, in which
1846 # case repr(result) could segfault.
1847 backing = np.zeros(shape, dtype=arr._ndarray.dtype)
1848
1849 return arr._from_backing_data(backing)
1850
1851 def _internal_get_values(self) -> ArrayLike:
1852 """
1853 Return the values.
1854
1855 For internal compatibility with pandas formatting.
1856
1857 Returns
1858 -------
1859 np.ndarray or ExtensionArray
1860 A numpy array or ExtensionArray of the same dtype as
1861 categorical.categories.dtype.
1862 """
1863 # if we are a datetime and period index, return Index to keep metadata
1864 if needs_i8_conversion(self.categories.dtype):
1865 return self.categories.take(self._codes, fill_value=NaT)._values
1866 elif is_integer_dtype(self.categories.dtype) and -1 in self._codes:
1867 return (
1868 self.categories.astype("object")
1869 .take(self._codes, fill_value=np.nan)
1870 ._values
1871 )
1872 return np.array(self)
1873
def check_for_ordered(self, op) -> None:
    """Raise TypeError naming ``op`` unless this Categorical is ordered."""
    if self.ordered:
        return
    raise TypeError(
        f"Categorical is not ordered for operation {op}\n"
        "you can use .as_ordered() to change the "
        "Categorical to an ordered one\n"
    )
1882
def argsort(
    self, *, ascending: bool = True, kind: SortKind = "quicksort", **kwargs
):
    """
    Return the indices that would sort the Categorical.

    Missing values are sorted at the end.

    Parameters
    ----------
    ascending : bool, default True
        Whether the indices should result in an ascending
        or descending sort.
    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional
        Sorting algorithm.
    **kwargs:
        passed through to :func:`numpy.argsort`.

    Returns
    -------
    np.ndarray[np.intp]

    See Also
    --------
    numpy.ndarray.argsort

    Notes
    -----
    While an ordering is applied to the category values, arg-sorting
    in this context refers more to organizing and grouping together
    based on matching category values. Thus, this function can be
    called on an unordered Categorical instance unlike the functions
    'Categorical.min' and 'Categorical.max'.

    Examples
    --------
    >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort()
    array([2, 0, 1, 3])

    >>> cat = pd.Categorical(['b', 'b', 'a', 'c'],
    ...                      categories=['c', 'b', 'a'],
    ...                      ordered=True)
    >>> cat.argsort()
    array([3, 0, 1, 2])

    Missing values are placed at the end

    >>> cat = pd.Categorical([2, None, 1])
    >>> cat.argsort()
    array([2, 0, 1])
    """
    # All arguments are forwarded unchanged to the parent implementation.
    return super().argsort(ascending=ascending, kind=kind, **kwargs)
1935
@overload
def sort_values(
    self,
    *,
    inplace: Literal[False] = ...,
    ascending: bool = ...,
    na_position: str = ...,
) -> Self:
    ...

@overload
def sort_values(
    self, *, inplace: Literal[True], ascending: bool = ..., na_position: str = ...
) -> None:
    ...

def sort_values(
    self,
    *,
    inplace: bool = False,
    ascending: bool = True,
    na_position: str = "last",
) -> Self | None:
    """
    Sort the Categorical by category value returning a new
    Categorical by default.

    While an ordering is applied to the category values, sorting in this
    context refers more to organizing and grouping together based on
    matching category values. Thus, this function can be called on an
    unordered Categorical instance unlike the functions 'Categorical.min'
    and 'Categorical.max'.

    Parameters
    ----------
    inplace : bool, default False
        Do operation in place.
    ascending : bool, default True
        Order ascending. Passing False orders descending. The
        ordering parameter provides the method by which the
        category values are organized.
    na_position : {'first', 'last'} (optional, default='last')
        'first' puts NaNs at the beginning
        'last' puts NaNs at the end

    Returns
    -------
    Categorical or None
        The sorted Categorical, or None when ``inplace=True``.

    Raises
    ------
    ValueError
        If ``na_position`` is neither 'first' nor 'last'.

    See Also
    --------
    Series.sort_values

    Examples
    --------
    >>> c = pd.Categorical([1, 2, 2, 1, 5])
    >>> c
    [1, 2, 2, 1, 5]
    Categories (3, int64): [1, 2, 5]
    >>> c.sort_values()
    [1, 1, 2, 2, 5]
    Categories (3, int64): [1, 2, 5]
    >>> c.sort_values(ascending=False)
    [5, 2, 2, 1, 1]
    Categories (3, int64): [1, 2, 5]

    'sort_values' behaviour with NaNs. Note that 'na_position'
    is independent of the 'ascending' parameter:

    >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5])
    >>> c
    [NaN, 2, 2, NaN, 5]
    Categories (2, int64): [2, 5]
    >>> c.sort_values()
    [2, 2, 5, NaN, NaN]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(ascending=False)
    [5, 2, 2, NaN, NaN]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(na_position='first')
    [NaN, NaN, 2, 2, 5]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(ascending=False, na_position='first')
    [NaN, NaN, 5, 2, 2]
    Categories (2, int64): [2, 5]
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    if na_position not in ("last", "first"):
        raise ValueError(f"invalid na_position: {na_position!r}")

    order = nargsort(self, ascending=ascending, na_position=na_position)
    reordered_codes = self._codes[order]

    if inplace:
        # Overwrite the existing buffer so the object keeps its identity.
        self._codes[:] = reordered_codes
        return None
    return self._from_backing_data(reordered_codes)
2036
2037 def _rank(
2038 self,
2039 *,
2040 axis: AxisInt = 0,
2041 method: str = "average",
2042 na_option: str = "keep",
2043 ascending: bool = True,
2044 pct: bool = False,
2045 ):
2046 """
2047 See Series.rank.__doc__.
2048 """
2049 if axis != 0:
2050 raise NotImplementedError
2051 vff = self._values_for_rank()
2052 return algorithms.rank(
2053 vff,
2054 axis=axis,
2055 method=method,
2056 na_option=na_option,
2057 ascending=ascending,
2058 pct=pct,
2059 )
2060
2061 def _values_for_rank(self) -> np.ndarray:
2062 """
2063 For correctly ranking ordered categorical data. See GH#15420
2064
2065 Ordered categorical data should be ranked on the basis of
2066 codes with -1 translated to NaN.
2067
2068 Returns
2069 -------
2070 numpy.array
2071
2072 """
2073 from pandas import Series
2074
2075 if self.ordered:
2076 values = self.codes
2077 mask = values == -1
2078 if mask.any():
2079 values = values.astype("float64")
2080 values[mask] = np.nan
2081 elif is_any_real_numeric_dtype(self.categories.dtype):
2082 values = np.array(self)
2083 else:
2084 # reorder the categories (so rank can use the float codes)
2085 # instead of passing an object array to rank
2086 values = np.array(
2087 self.rename_categories(
2088 Series(self.categories, copy=False).rank().values
2089 )
2090 )
2091 return values
2092
2093 def _hash_pandas_object(
2094 self, *, encoding: str, hash_key: str, categorize: bool
2095 ) -> npt.NDArray[np.uint64]:
2096 """
2097 Hash a Categorical by hashing its categories, and then mapping the codes
2098 to the hashes.
2099
2100 Parameters
2101 ----------
2102 encoding : str
2103 hash_key : str
2104 categorize : bool
2105 Ignored for Categorical.
2106
2107 Returns
2108 -------
2109 np.ndarray[uint64]
2110 """
2111 # Note we ignore categorize, as we are already Categorical.
2112 from pandas.core.util.hashing import hash_array
2113
2114 # Convert ExtensionArrays to ndarrays
2115 values = np.asarray(self.categories._values)
2116 hashed = hash_array(values, encoding, hash_key, categorize=False)
2117
2118 # we have uint64, as we don't directly support missing values
2119 # we don't want to use take_nd which will coerce to float
2120 # instead, directly construct the result with a
2121 # max(np.uint64) as the missing value indicator
2122 #
2123 # TODO: GH#15362
2124
2125 mask = self.isna()
2126 if len(hashed):
2127 result = hashed.take(self._codes)
2128 else:
2129 result = np.zeros(len(mask), dtype="uint64")
2130
2131 if mask.any():
2132 result[mask] = lib.u8max
2133
2134 return result
2135
2136 # ------------------------------------------------------------------
2137 # NDArrayBackedExtensionArray compat
2138
@property
def _codes(self) -> np.ndarray:
    """The backing code array: ``self._ndarray`` itself, not a copy."""
    return self._ndarray
2142
2143 def _box_func(self, i: int):
2144 if i == -1:
2145 return np.nan
2146 return self.categories[i]
2147
2148 def _unbox_scalar(self, key) -> int:
2149 # searchsorted is very performance sensitive. By converting codes
2150 # to same dtype as self.codes, we get much faster performance.
2151 code = self.categories.get_loc(key)
2152 code = self._ndarray.dtype.type(code)
2153 return code
2154
2155 # ------------------------------------------------------------------
2156
2157 def __iter__(self) -> Iterator:
2158 """
2159 Returns an Iterator over the values of this Categorical.
2160 """
2161 if self.ndim == 1:
2162 return iter(self._internal_get_values().tolist())
2163 else:
2164 return (self[n] for n in range(len(self)))
2165
2166 def __contains__(self, key) -> bool:
2167 """
2168 Returns True if `key` is in this Categorical.
2169 """
2170 # if key is a NaN, check if any NaN is in self.
2171 if is_valid_na_for_dtype(key, self.categories.dtype):
2172 return bool(self.isna().any())
2173
2174 return contains(self, key, container=self._codes)
2175
2176 # ------------------------------------------------------------------
2177 # Rendering Methods
2178
2179 def _formatter(self, boxed: bool = False):
2180 # Returning None here will cause format_array to do inference.
2181 return None
2182
2183 def _repr_categories(self) -> list[str]:
2184 """
2185 return the base repr for the categories
2186 """
2187 max_categories = (
2188 10
2189 if get_option("display.max_categories") == 0
2190 else get_option("display.max_categories")
2191 )
2192 from pandas.io.formats import format as fmt
2193
2194 format_array = partial(
2195 fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC
2196 )
2197 if len(self.categories) > max_categories:
2198 num = max_categories // 2
2199 head = format_array(self.categories[:num]._values)
2200 tail = format_array(self.categories[-num:]._values)
2201 category_strs = head + ["..."] + tail
2202 else:
2203 category_strs = format_array(self.categories._values)
2204
2205 # Strip all leading spaces, which format_array adds for columns...
2206 category_strs = [x.strip() for x in category_strs]
2207 return category_strs
2208
    def _get_repr_footer(self) -> str:
        """
        Returns a string representation of the footer.

        The footer has the form ``Categories (<n>, <dtype>): [<categories>]``
        where the categories are joined with ``" < "`` when ordered and
        ``", "`` otherwise. Lines are wrapped against ``display.width``
        (falling back to the terminal width); inside an IPython frontend no
        wrapping is done.
        """
        category_strs = self._repr_categories()
        dtype = str(self.categories.dtype)
        levheader = f"Categories ({len(self.categories)}, {dtype}): "
        width, _ = get_terminal_size()
        # a falsy display.width falls back to the detected terminal width
        max_width = get_option("display.width") or width
        if console.in_ipython_frontend():
            # 0 = no breaks
            max_width = 0
        levstring = ""
        start = True
        cur_col_len = len(levheader)  # header
        sep_len, sep = (3, " < ") if self.ordered else (2, ", ")
        linesep = f"{sep.rstrip()}\n"  # remove whitespace
        for val in category_strs:
            if max_width != 0 and cur_col_len + sep_len + len(val) > max_width:
                # wrap: end the line with the bare separator and indent the
                # continuation one column past the header
                levstring += linesep + (" " * (len(levheader) + 1))
                cur_col_len = len(levheader) + 1  # header + a whitespace
            elif not start:
                levstring += sep
                # NOTE(review): the running width only grows on this branch
                # and only by len(val) (not the separator), so it
                # under-counts the real column; changing that would alter
                # existing repr output.
                cur_col_len += len(val)
            levstring += val
            start = False
        # collapse " < ... < " to " ... " to save space
        return f"{levheader}[{levstring.replace(' < ... < ', ' ... ')}]"
2237
2238 def _get_values_repr(self) -> str:
2239 from pandas.io.formats import format as fmt
2240
2241 assert len(self) > 0
2242
2243 vals = self._internal_get_values()
2244 fmt_values = fmt.format_array(
2245 vals,
2246 None,
2247 float_format=None,
2248 na_rep="NaN",
2249 quoting=QUOTE_NONNUMERIC,
2250 )
2251
2252 fmt_values = [i.strip() for i in fmt_values]
2253 joined = ", ".join(fmt_values)
2254 result = "[" + joined + "]"
2255 return result
2256
2257 def __repr__(self) -> str:
2258 """
2259 String representation.
2260 """
2261 footer = self._get_repr_footer()
2262 length = len(self)
2263 max_len = 10
2264 if length > max_len:
2265 # In long cases we do not display all entries, so we add Length
2266 # information to the __repr__.
2267 num = max_len // 2
2268 head = self[:num]._get_values_repr()
2269 tail = self[-(max_len - num) :]._get_values_repr()
2270 body = f"{head[:-1]}, ..., {tail[1:]}"
2271 length_info = f"Length: {len(self)}"
2272 result = f"{body}\n{length_info}\n{footer}"
2273 elif length > 0:
2274 body = self._get_values_repr()
2275 result = f"{body}\n{footer}"
2276 else:
2277 # In the empty case we use a comma instead of newline to get
2278 # a more compact __repr__
2279 body = "[]"
2280 result = f"{body}, {footer}"
2281
2282 return result
2283
2284 # ------------------------------------------------------------------
2285
2286 def _validate_listlike(self, value):
2287 # NB: here we assume scalar-like tuples have already been excluded
2288 value = extract_array(value, extract_numpy=True)
2289
2290 # require identical categories set
2291 if isinstance(value, Categorical):
2292 if self.dtype != value.dtype:
2293 raise TypeError(
2294 "Cannot set a Categorical with another, "
2295 "without identical categories"
2296 )
2297 # dtype equality implies categories_match_up_to_permutation
2298 value = self._encode_with_my_categories(value)
2299 return value._codes
2300
2301 from pandas import Index
2302
2303 # tupleize_cols=False for e.g. test_fillna_iterable_category GH#41914
2304 to_add = Index._with_infer(value, tupleize_cols=False).difference(
2305 self.categories
2306 )
2307
2308 # no assignments of values not in categories, but it's always ok to set
2309 # something to np.nan
2310 if len(to_add) and not isna(to_add).all():
2311 raise TypeError(
2312 "Cannot setitem on a Categorical with a new "
2313 "category, set the categories first"
2314 )
2315
2316 codes = self.categories.get_indexer(value)
2317 return codes.astype(self._ndarray.dtype, copy=False)
2318
2319 def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]:
2320 """
2321 Compute the inverse of a categorical, returning
2322 a dict of categories -> indexers.
2323
2324 *This is an internal function*
2325
2326 Returns
2327 -------
2328 Dict[Hashable, np.ndarray[np.intp]]
2329 dict of categories -> indexers
2330
2331 Examples
2332 --------
2333 >>> c = pd.Categorical(list('aabca'))
2334 >>> c
2335 ['a', 'a', 'b', 'c', 'a']
2336 Categories (3, object): ['a', 'b', 'c']
2337 >>> c.categories
2338 Index(['a', 'b', 'c'], dtype='object')
2339 >>> c.codes
2340 array([0, 0, 1, 2, 0], dtype=int8)
2341 >>> c._reverse_indexer()
2342 {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])}
2343
2344 """
2345 categories = self.categories
2346 r, counts = libalgos.groupsort_indexer(
2347 ensure_platform_int(self.codes), categories.size
2348 )
2349 counts = ensure_int64(counts).cumsum()
2350 _result = (r[start:end] for start, end in zip(counts, counts[1:]))
2351 return dict(zip(categories, _result))
2352
2353 # ------------------------------------------------------------------
2354 # Reductions
2355
2356 def _reduce(
2357 self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
2358 ):
2359 result = super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
2360 if name in ["argmax", "argmin"]:
2361 # don't wrap in Categorical!
2362 return result
2363 if keepdims:
2364 return type(self)(result, dtype=self.dtype)
2365 else:
2366 return result
2367
2368 def min(self, *, skipna: bool = True, **kwargs):
2369 """
2370 The minimum value of the object.
2371
2372 Only ordered `Categoricals` have a minimum!
2373
2374 Raises
2375 ------
2376 TypeError
2377 If the `Categorical` is not `ordered`.
2378
2379 Returns
2380 -------
2381 min : the minimum of this `Categorical`, NA value if empty
2382 """
2383 nv.validate_minmax_axis(kwargs.get("axis", 0))
2384 nv.validate_min((), kwargs)
2385 self.check_for_ordered("min")
2386
2387 if not len(self._codes):
2388 return self.dtype.na_value
2389
2390 good = self._codes != -1
2391 if not good.all():
2392 if skipna and good.any():
2393 pointer = self._codes[good].min()
2394 else:
2395 return np.nan
2396 else:
2397 pointer = self._codes.min()
2398 return self._wrap_reduction_result(None, pointer)
2399
2400 def max(self, *, skipna: bool = True, **kwargs):
2401 """
2402 The maximum value of the object.
2403
2404 Only ordered `Categoricals` have a maximum!
2405
2406 Raises
2407 ------
2408 TypeError
2409 If the `Categorical` is not `ordered`.
2410
2411 Returns
2412 -------
2413 max : the maximum of this `Categorical`, NA if array is empty
2414 """
2415 nv.validate_minmax_axis(kwargs.get("axis", 0))
2416 nv.validate_max((), kwargs)
2417 self.check_for_ordered("max")
2418
2419 if not len(self._codes):
2420 return self.dtype.na_value
2421
2422 good = self._codes != -1
2423 if not good.all():
2424 if skipna and good.any():
2425 pointer = self._codes[good].max()
2426 else:
2427 return np.nan
2428 else:
2429 pointer = self._codes.max()
2430 return self._wrap_reduction_result(None, pointer)
2431
2432 def _mode(self, dropna: bool = True) -> Categorical:
2433 codes = self._codes
2434 mask = None
2435 if dropna:
2436 mask = self.isna()
2437
2438 res_codes = algorithms.mode(codes, mask=mask)
2439 res_codes = cast(np.ndarray, res_codes)
2440 assert res_codes.dtype == codes.dtype
2441 res = self._from_backing_data(res_codes)
2442 return res
2443
2444 # ------------------------------------------------------------------
2445 # ExtensionArray Interface
2446
    def unique(self) -> Self:
        """
        Return the ``Categorical`` whose ``categories`` and ``codes`` are
        unique.

        .. versionchanged:: 1.3.0

            Previously, unused categories were dropped from the new categories.

        Returns
        -------
        Categorical

        See Also
        --------
        pandas.unique
        CategoricalIndex.unique
        Series.unique : Return unique values of Series object.

        Examples
        --------
        >>> pd.Categorical(list("baabc")).unique()
        ['b', 'a', 'c']
        Categories (3, object): ['a', 'b', 'c']
        >>> pd.Categorical(list("baab"), categories=list("abc"), ordered=True).unique()
        ['b', 'a']
        Categories (3, object): ['a' < 'b' < 'c']
        """
        # The override adds nothing beyond the parent implementation; the
        # pragma below silences the lint warning about that.
        # pylint: disable=useless-parent-delegation
        return super().unique()
2477
2478 def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
2479 # make sure we have correct itemsize for resulting codes
2480 assert res_values.dtype == self._ndarray.dtype
2481 return res_values
2482
2483 def equals(self, other: object) -> bool:
2484 """
2485 Returns True if categorical arrays are equal.
2486
2487 Parameters
2488 ----------
2489 other : `Categorical`
2490
2491 Returns
2492 -------
2493 bool
2494 """
2495 if not isinstance(other, Categorical):
2496 return False
2497 elif self._categories_match_up_to_permutation(other):
2498 other = self._encode_with_my_categories(other)
2499 return np.array_equal(self._codes, other._codes)
2500 return False
2501
2502 @classmethod
2503 def _concat_same_type(cls, to_concat: Sequence[Self], axis: AxisInt = 0) -> Self:
2504 from pandas.core.dtypes.concat import union_categoricals
2505
2506 first = to_concat[0]
2507 if axis >= first.ndim:
2508 raise ValueError(
2509 f"axis {axis} is out of bounds for array of dimension {first.ndim}"
2510 )
2511
2512 if axis == 1:
2513 # Flatten, concatenate then reshape
2514 if not all(x.ndim == 2 for x in to_concat):
2515 raise ValueError
2516
2517 # pass correctly-shaped to union_categoricals
2518 tc_flat = []
2519 for obj in to_concat:
2520 tc_flat.extend([obj[:, i] for i in range(obj.shape[1])])
2521
2522 res_flat = cls._concat_same_type(tc_flat, axis=0)
2523
2524 result = res_flat.reshape(len(first), -1, order="F")
2525 return result
2526
2527 result = union_categoricals(to_concat)
2528 return result
2529
2530 # ------------------------------------------------------------------
2531
2532 def _encode_with_my_categories(self, other: Categorical) -> Categorical:
2533 """
2534 Re-encode another categorical using this Categorical's categories.
2535
2536 Notes
2537 -----
2538 This assumes we have already checked
2539 self._categories_match_up_to_permutation(other).
2540 """
2541 # Indexing on codes is more efficient if categories are the same,
2542 # so we can apply some optimizations based on the degree of
2543 # dtype-matching.
2544 codes = recode_for_categories(
2545 other.codes, other.categories, self.categories, copy=False
2546 )
2547 return self._from_backing_data(codes)
2548
2549 def _categories_match_up_to_permutation(self, other: Categorical) -> bool:
2550 """
2551 Returns True if categoricals are the same dtype
2552 same categories, and same ordered
2553
2554 Parameters
2555 ----------
2556 other : Categorical
2557
2558 Returns
2559 -------
2560 bool
2561 """
2562 return hash(self.dtype) == hash(other.dtype)
2563
2564 def describe(self) -> DataFrame:
2565 """
2566 Describes this Categorical
2567
2568 Returns
2569 -------
2570 description: `DataFrame`
2571 A dataframe with frequency and counts by category.
2572 """
2573 counts = self.value_counts(dropna=False)
2574 freqs = counts / counts.sum()
2575
2576 from pandas import Index
2577 from pandas.core.reshape.concat import concat
2578
2579 result = concat([counts, freqs], axis=1)
2580 result.columns = Index(["counts", "freqs"])
2581 result.index.name = "categories"
2582
2583 return result
2584
2585 def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
2586 """
2587 Check whether `values` are contained in Categorical.
2588
2589 Return a boolean NumPy Array showing whether each element in
2590 the Categorical matches an element in the passed sequence of
2591 `values` exactly.
2592
2593 Parameters
2594 ----------
2595 values : np.ndarray or ExtensionArray
2596 The sequence of values to test. Passing in a single string will
2597 raise a ``TypeError``. Instead, turn a single string into a
2598 list of one element.
2599
2600 Returns
2601 -------
2602 np.ndarray[bool]
2603
2604 Raises
2605 ------
2606 TypeError
2607 * If `values` is not a set or list-like
2608
2609 See Also
2610 --------
2611 pandas.Series.isin : Equivalent method on Series.
2612
2613 Examples
2614 --------
2615 >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama',
2616 ... 'hippo'])
2617 >>> s.isin(['cow', 'lama'])
2618 array([ True, True, True, False, True, False])
2619
2620 Passing a single string as ``s.isin('lama')`` will raise an error. Use
2621 a list of one element instead:
2622
2623 >>> s.isin(['lama'])
2624 array([ True, False, True, False, True, False])
2625 """
2626 null_mask = np.asarray(isna(values))
2627 code_values = self.categories.get_indexer_for(values)
2628 code_values = code_values[null_mask | (code_values >= 0)]
2629 return algorithms.isin(self.codes, code_values)
2630
    def _replace(self, *, to_replace, value, inplace: bool = False):
        """
        Replace values by replacing them among the categories.

        The replacement is applied to the categories via ``Series.replace``
        and the codes are then recoded against the de-duplicated result.

        Parameters
        ----------
        to_replace, value
            Passed through to ``Series.replace`` on the categories.
        inplace : bool, default False
            If True, modify ``self``; otherwise work on a copy.

        Returns
        -------
        Categorical or None
            The modified copy when ``inplace`` is False, otherwise None.

        Warns
        -----
        FutureWarning
            If the resulting dtype differs from the original one (GH#55147).
        """
        from pandas import Index

        # remembered to detect a dtype change for the deprecation warning
        orig_dtype = self.dtype

        inplace = validate_bool_kwarg(inplace, "inplace")
        cat = self if inplace else self.copy()

        # Where the replacement value is NA, drop the matching categories
        # (only those entries of to_replace that are existing categories).
        mask = isna(np.asarray(value))
        if mask.any():
            removals = np.asarray(to_replace)[mask]
            removals = cat.categories[cat.categories.isin(removals)]
            new_cat = cat.remove_categories(removals)
            # re-initialise ``cat`` in place with the new codes and dtype
            NDArrayBacked.__init__(cat, new_cat.codes, new_cat.dtype)

        # apply the replacement to the categories themselves
        ser = cat.categories.to_series()
        ser = ser.replace(to_replace=to_replace, value=value)

        all_values = Index(ser)

        # GH51016: maintain order of existing categories
        idxr = cat.categories.get_indexer_for(all_values)
        locs = np.arange(len(ser))
        # replaced values that are existing categories take that category's
        # position; all others keep their own position
        locs = np.where(idxr == -1, locs, idxr)
        locs = locs.argsort()

        new_categories = ser.take(locs)
        new_categories = new_categories.drop_duplicates(keep="first")
        new_categories = Index(new_categories)
        new_codes = recode_for_categories(
            cat._codes, all_values, new_categories, copy=False
        )
        new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered)
        # re-initialise ``cat`` in place with the recoded data
        NDArrayBacked.__init__(cat, new_codes, new_dtype)

        if new_dtype != orig_dtype:
            warnings.warn(
                # GH#55147
                "The behavior of Series.replace (and DataFrame.replace) with "
                "CategoricalDtype is deprecated. In a future version, replace "
                "will only be used for cases that preserve the categories. "
                "To change the categories, use ser.cat.rename_categories "
                "instead.",
                FutureWarning,
                stacklevel=find_stack_level(),
            )
        # inplace=True falls through and returns None
        if not inplace:
            return cat
2679
2680 # ------------------------------------------------------------------------
2681 # String methods interface
2682 def _str_map(
2683 self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True
2684 ):
2685 # Optimization to apply the callable `f` to the categories once
2686 # and rebuild the result by `take`ing from the result with the codes.
2687 # Returns the same type as the object-dtype implementation though.
2688 from pandas.core.arrays import NumpyExtensionArray
2689
2690 categories = self.categories
2691 codes = self.codes
2692 result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype)
2693 return take_nd(result, codes, fill_value=na_value)
2694
2695 def _str_get_dummies(self, sep: str = "|"):
2696 # sep may not be in categories. Just bail on this.
2697 from pandas.core.arrays import NumpyExtensionArray
2698
2699 return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep)
2700
2701 # ------------------------------------------------------------------------
2702 # GroupBy Methods
2703
    def _groupby_op(
        self,
        *,
        how: str,
        has_dropped_na: bool,
        min_count: int,
        ngroups: int,
        ids: npt.NDArray[np.intp],
        **kwargs,
    ):
        """
        Run the groupby operation ``how`` on the codes of this Categorical.

        Parameters
        ----------
        how : str
            Name of the operation. Only rank, any, all, first, last, min,
            max, idxmin and idxmax are supported; min, max, rank, idxmin and
            idxmax additionally require an ordered dtype.
        has_dropped_na : bool
            Forwarded to ``WrappedCythonOp``.
        min_count : int
        ngroups : int
        ids : np.ndarray[np.intp]
            Forwarded to the Cython op as ``comp_ids``.
        **kwargs
            Forwarded to the Cython op.

        Returns
        -------
        np.ndarray or Categorical
            The raw result for operations in ``op.cast_blocklist``, otherwise
            the result codes wrapped with this dtype.

        Raises
        ------
        TypeError
            If the operation is not supported for this dtype.
        """
        from pandas.core.groupby.ops import WrappedCythonOp

        kind = WrappedCythonOp.get_kind_from_how(how)
        op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na)

        dtype = self.dtype
        if how in ["sum", "prod", "cumsum", "cumprod", "skew"]:
            raise TypeError(f"{dtype} type does not support {how} operations")
        if how in ["min", "max", "rank", "idxmin", "idxmax"] and not dtype.ordered:
            # raise TypeError instead of NotImplementedError to ensure we
            #  don't go down a group-by-group path, since in the empty-groups
            #  case that would fail to raise
            raise TypeError(f"Cannot perform {how} with non-ordered Categorical")
        if how not in [
            "rank",
            "any",
            "all",
            "first",
            "last",
            "min",
            "max",
            "idxmin",
            "idxmax",
        ]:
            # the message wording depends on whether this is a transform
            if kind == "transform":
                raise TypeError(f"{dtype} type does not support {how} operations")
            raise TypeError(f"{dtype} dtype does not support aggregation '{how}'")

        result_mask = None
        mask = self.isna()
        if how == "rank":
            assert self.ordered  # checked earlier
            npvalues = self._ndarray
        elif how in ["first", "last", "min", "max", "idxmin", "idxmax"]:
            # operate on the codes; result_mask is handed to the op together
            # with the input mask
            npvalues = self._ndarray
            result_mask = np.zeros(ngroups, dtype=bool)
        else:
            # any/all
            npvalues = self.astype(bool)

        res_values = op._cython_op_ndim_compat(
            npvalues,
            min_count=min_count,
            ngroups=ngroups,
            comp_ids=ids,
            mask=mask,
            result_mask=result_mask,
            **kwargs,
        )

        if how in op.cast_blocklist:
            # these results are not codes, so return them unwrapped
            return res_values
        elif how in ["first", "last", "min", "max"]:
            # masked results become the missing-value code
            res_values[result_mask == 1] = -1
        return self._from_backing_data(res_values)
2769
2770
2771# The Series.cat accessor
2772
2773
@delegate_names(
    delegate=Categorical, accessors=["categories", "ordered"], typ="property"
)
@delegate_names(
    delegate=Categorical,
    accessors=[
        "rename_categories",
        "reorder_categories",
        "add_categories",
        "remove_categories",
        "remove_unused_categories",
        "set_categories",
        "as_ordered",
        "as_unordered",
    ],
    typ="method",
)
class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
    """
    Accessor object for categorical properties of the Series values.

    Parameters
    ----------
    data : Series or CategoricalIndex

    Examples
    --------
    >>> s = pd.Series(list("abbccc")).astype("category")
    >>> s
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.categories
    Index(['a', 'b', 'c'], dtype='object')

    >>> s.cat.rename_categories(list("cba"))
    0    c
    1    b
    2    b
    3    a
    4    a
    5    a
    dtype: category
    Categories (3, object): ['c', 'b', 'a']

    >>> s.cat.reorder_categories(list("cba"))
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['c', 'b', 'a']

    >>> s.cat.add_categories(["d", "e"])
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (5, object): ['a', 'b', 'c', 'd', 'e']

    >>> s.cat.remove_categories(["a", "c"])
    0    NaN
    1      b
    2      b
    3    NaN
    4    NaN
    5    NaN
    dtype: category
    Categories (1, object): ['b']

    >>> s1 = s.cat.add_categories(["d", "e"])
    >>> s1.cat.remove_unused_categories()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.set_categories(list("abcde"))
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (5, object): ['a', 'b', 'c', 'd', 'e']

    >>> s.cat.as_ordered()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a' < 'b' < 'c']

    >>> s.cat.as_unordered()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']
    """

    def __init__(self, data) -> None:
        self._validate(data)
        # Keep the values plus the index and name that results are built with.
        self._parent = data.values
        self._index = data.index
        self._name = data.name
        self._freeze()

    @staticmethod
    def _validate(data):
        """Raise AttributeError unless ``data`` has a categorical dtype."""
        if isinstance(data.dtype, CategoricalDtype):
            return
        raise AttributeError("Can only use .cat accessor with a 'category' dtype")

    def _delegate_property_get(self, name: str):
        """Read the delegated property ``name`` from the parent values."""
        parent = self._parent
        return getattr(parent, name)

    # error: Signature of "_delegate_property_set" incompatible with supertype
    # "PandasDelegate"
    def _delegate_property_set(self, name: str, new_values):  # type: ignore[override]
        """Set the delegated property ``name`` on the parent values."""
        parent = self._parent
        return setattr(parent, name, new_values)

    @property
    def codes(self) -> Series:
        """
        Return Series of codes as well as the index.

        Examples
        --------
        >>> raw_cate = pd.Categorical(["a", "b", "c", "a"], categories=["a", "b"])
        >>> ser = pd.Series(raw_cate)
        >>> ser.cat.codes
        0   0
        1   1
        2  -1
        3   0
        dtype: int8
        """
        from pandas import Series

        parent_codes = self._parent.codes
        return Series(parent_codes, index=self._index)

    def _delegate_method(self, name: str, *args, **kwargs):
        """
        Call the delegated method ``name`` on the parent values.

        A non-None result is wrapped in a Series carrying the original index
        and name; a None result is passed through as None.
        """
        from pandas import Series

        res = getattr(self._parent, name)(*args, **kwargs)
        if res is None:
            return None
        return Series(res, index=self._index, name=self._name)
2944
2945
2946# utility routines
2947
2948
2949def _get_codes_for_values(
2950 values: Index | Series | ExtensionArray | np.ndarray,
2951 categories: Index,
2952) -> np.ndarray:
2953 """
2954 utility routine to turn values into codes given the specified categories
2955
2956 If `values` is known to be a Categorical, use recode_for_categories instead.
2957 """
2958 codes = categories.get_indexer_for(values)
2959 return coerce_indexer_dtype(codes, categories)
2960
2961
def recode_for_categories(
    codes: np.ndarray, old_categories, new_categories, copy: bool = True
) -> np.ndarray:
    """
    Convert a set of codes to a new set of categories.

    Parameters
    ----------
    codes : np.ndarray
    old_categories, new_categories : Index
    copy: bool, default True
        Whether to copy if the codes are unchanged.

    Returns
    -------
    new_codes : np.ndarray
        Codes referring to ``new_categories``. When nothing has to be
        recoded these are ``codes`` themselves (or a copy of them).

    Examples
    --------
    >>> old_cat = pd.Index(['b', 'a', 'c'])
    >>> new_cat = pd.Index(['a', 'b'])
    >>> codes = np.array([0, 1, 1, 2])
    >>> recode_for_categories(codes, old_cat, new_cat)
    array([ 1,  0,  0, -1], dtype=int8)
    """
    # Nothing to recode when there are no old categories (all codes are null
    # anyway, so just retain the nulls) or when both sets of categories are
    # the same.
    unchanged = len(old_categories) == 0 or new_categories.equals(old_categories)
    if unchanged:
        return codes.copy() if copy else codes

    # Position of every old category within the new categories.
    old_to_new = coerce_indexer_dtype(
        new_categories.get_indexer_for(old_categories), new_categories
    )
    return take_nd(old_to_new, codes, fill_value=-1)
3003
3004
3005def factorize_from_iterable(values) -> tuple[np.ndarray, Index]:
3006 """
3007 Factorize an input `values` into `categories` and `codes`. Preserves
3008 categorical dtype in `categories`.
3009
3010 Parameters
3011 ----------
3012 values : list-like
3013
3014 Returns
3015 -------
3016 codes : ndarray
3017 categories : Index
3018 If `values` has a categorical dtype, then `categories` is
3019 a CategoricalIndex keeping the categories and order of `values`.
3020 """
3021 from pandas import CategoricalIndex
3022
3023 if not is_list_like(values):
3024 raise TypeError("Input must be list-like")
3025
3026 categories: Index
3027
3028 vdtype = getattr(values, "dtype", None)
3029 if isinstance(vdtype, CategoricalDtype):
3030 values = extract_array(values)
3031 # The Categorical we want to build has the same categories
3032 # as values but its codes are by def [0, ..., len(n_categories) - 1]
3033 cat_codes = np.arange(len(values.categories), dtype=values.codes.dtype)
3034 cat = Categorical.from_codes(cat_codes, dtype=values.dtype, validate=False)
3035
3036 categories = CategoricalIndex(cat)
3037 codes = values.codes
3038 else:
3039 # The value of ordered is irrelevant since we don't use cat as such,
3040 # but only the resulting categories, the order of which is independent
3041 # from ordered. Set ordered to False as default. See GH #15457
3042 cat = Categorical(values, ordered=False)
3043 categories = cat.categories
3044 codes = cat.codes
3045 return codes, categories
3046
3047
def factorize_from_iterables(iterables) -> tuple[list[np.ndarray], list[Index]]:
    """
    A higher-level wrapper over `factorize_from_iterable`.

    Parameters
    ----------
    iterables : list-like of list-likes

    Returns
    -------
    codes : list of ndarrays
    categories : list of Indexes

    Notes
    -----
    See `factorize_from_iterable` for more info.
    """
    if len(iterables) == 0:
        # For consistency, it should return two empty lists.
        return [], []

    # Factorize each iterable, then split the (codes, categories) pairs
    # into two parallel lists.
    factorized = [factorize_from_iterable(it) for it in iterables]
    codes = [pair[0] for pair in factorized]
    categories = [pair[1] for pair in factorized]
    return codes, categories