1from __future__ import annotations
2
3from csv import QUOTE_NONNUMERIC
4from functools import partial
5import operator
6from shutil import get_terminal_size
7from typing import (
8 TYPE_CHECKING,
9 Hashable,
10 Iterator,
11 Literal,
12 Sequence,
13 TypeVar,
14 cast,
15 overload,
16)
17
18import numpy as np
19
20from pandas._config import get_option
21
22from pandas._libs import (
23 NaT,
24 algos as libalgos,
25 lib,
26)
27from pandas._libs.arrays import NDArrayBacked
28from pandas._typing import (
29 ArrayLike,
30 AstypeArg,
31 AxisInt,
32 Dtype,
33 NpDtype,
34 Ordered,
35 Shape,
36 SortKind,
37 npt,
38 type_t,
39)
40from pandas.compat.numpy import function as nv
41from pandas.util._validators import validate_bool_kwarg
42
43from pandas.core.dtypes.cast import (
44 coerce_indexer_dtype,
45 find_common_type,
46)
47from pandas.core.dtypes.common import (
48 ensure_int64,
49 ensure_platform_int,
50 is_any_real_numeric_dtype,
51 is_bool_dtype,
52 is_categorical_dtype,
53 is_datetime64_dtype,
54 is_dict_like,
55 is_dtype_equal,
56 is_extension_array_dtype,
57 is_hashable,
58 is_integer_dtype,
59 is_list_like,
60 is_scalar,
61 is_timedelta64_dtype,
62 needs_i8_conversion,
63 pandas_dtype,
64)
65from pandas.core.dtypes.dtypes import (
66 CategoricalDtype,
67 ExtensionDtype,
68)
69from pandas.core.dtypes.generic import (
70 ABCIndex,
71 ABCSeries,
72)
73from pandas.core.dtypes.missing import (
74 is_valid_na_for_dtype,
75 isna,
76)
77
78from pandas.core import (
79 algorithms,
80 arraylike,
81 ops,
82)
83from pandas.core.accessor import (
84 PandasDelegate,
85 delegate_names,
86)
87from pandas.core.algorithms import (
88 factorize,
89 take_nd,
90)
91from pandas.core.arrays._mixins import (
92 NDArrayBackedExtensionArray,
93 ravel_compat,
94)
95from pandas.core.base import (
96 ExtensionArray,
97 NoNewAttributesMixin,
98 PandasObject,
99)
100import pandas.core.common as com
101from pandas.core.construction import (
102 extract_array,
103 sanitize_array,
104)
105from pandas.core.ops.common import unpack_zerodim_and_defer
106from pandas.core.sorting import nargsort
107from pandas.core.strings.object_array import ObjectStringArrayMixin
108
109from pandas.io.formats import console
110
111if TYPE_CHECKING:
112 from pandas import (
113 DataFrame,
114 Index,
115 Series,
116 )
117
118
119CategoricalT = TypeVar("CategoricalT", bound="Categorical")
120
121
def _cat_compare_op(op):
    """
    Build a comparison method for ``Categorical`` from the operator ``op``.

    Parameters
    ----------
    op : callable
        A comparison function whose ``__name__`` (e.g. ``"eq"``, ``"lt"``)
        is used to derive the dunder name of the returned method.

    Returns
    -------
    callable
        A function ``func(self, other)`` named ``__<op>__`` that compares
        the codes of ``self`` against ``other``.

    Notes
    -----
    Positions holding a missing value (code ``-1``) are filled with ``True``
    for ``!=`` and with ``False`` for every other operator.
    """
    opname = f"__{op.__name__}__"
    # Result used for positions whose code is -1: True only for ``!=``.
    fill_value = op is operator.ne

    @unpack_zerodim_and_defer(opname)
    def func(self, other):
        hashable = is_hashable(other)
        if is_list_like(other) and len(other) != len(self) and not hashable:
            # in hashable case we may have a tuple that is itself a category
            raise ValueError("Lengths must match.")

        if not self.ordered:
            # Inequality comparisons are only meaningful with an ordering.
            if opname in ["__lt__", "__gt__", "__le__", "__ge__"]:
                raise TypeError(
                    "Unordered Categoricals can only compare equality or not"
                )
        if isinstance(other, Categorical):
            # Two Categoricals can only be compared if the categories are
            # the same (maybe up to ordering, depending on ordered)

            msg = "Categoricals can only be compared if 'categories' are the same."
            if not self._categories_match_up_to_permutation(other):
                raise TypeError(msg)

            if not self.ordered and not self.categories.equals(other.categories):
                # both unordered and different order: express the codes of
                # ``other`` in terms of ``self.categories`` before comparing
                other_codes = recode_for_categories(
                    other.codes, other.categories, self.categories, copy=False
                )
            else:
                other_codes = other._codes

            ret = op(self._codes, other_codes)
            # A missing value on either side decides the result on its own.
            mask = (self._codes == -1) | (other_codes == -1)
            if mask.any():
                ret[mask] = fill_value
            return ret

        if hashable:
            if other in self.categories:
                # Compare the codes against the code of the scalar.
                i = self._unbox_scalar(other)
                ret = op(self._codes, i)

                if opname not in {"__eq__", "__ge__", "__gt__"}:
                    # GH#29820 performance trick; get_loc will always give i>=0,
                    # so for (__eq__, __ge__, __gt__) comparing a code of -1
                    # against i already yields False, which equals fill_value;
                    # the setting below would be a no-op there and is skipped.
                    # It is still needed for __lt__ and __le__.
                    mask = self._codes == -1
                    ret[mask] = fill_value
                return ret
            else:
                # Scalar that is not one of the categories.
                return ops.invalid_comparison(self, other, op)
        else:
            # allow categorical vs object dtype array comparisons for equality
            # these are only positional comparisons
            if opname not in ["__eq__", "__ne__"]:
                raise TypeError(
                    f"Cannot compare a Categorical for op {opname} with "
                    f"type {type(other)}.\nIf you want to compare values, "
                    "use 'np.asarray(cat) <op> other'."
                )

            if isinstance(other, ExtensionArray) and needs_i8_conversion(other.dtype):
                # We would return NotImplemented here, but that messes up
                # ExtensionIndex's wrapped methods
                return op(other, self)
            return getattr(np.array(self), opname)(np.array(other))

    func.__name__ = opname

    return func
193
194
def contains(cat, key, container) -> bool:
    """
    Check whether ``key`` is a member of ``cat``.

    Shared helper behind :meth:`Categorical.__contains__` and
    :meth:`CategoricalIndex.__contains__`.

    ``key`` counts as a member when it is found in ``cat.categories`` *and*
    its location within the categories is present in ``container``.

    Parameters
    ----------
    cat : :class:`Categorical` or :class:`CategoricalIndex`
    key : a hashable object
        The key to check membership for.
    container : Container (e.g. list-like or mapping)
        The container to check for membership in.

    Returns
    -------
    is_in : bool
        True if ``key`` is in ``cat.categories`` and the location of
        ``key`` in ``categories`` is in ``container``, else False.

    Raises
    ------
    TypeError
        If ``key`` is not hashable.

    Notes
    -----
    This method does not check for NaN values. Do that separately
    before calling this method.
    """
    # Fail early (TypeError) for keys that cannot be hashed.
    hash(key)

    # A key that cannot be located among the categories cannot be in the
    # container either.
    try:
        position = cat.categories.get_loc(key)
    except (KeyError, TypeError):
        return False

    # ``position`` is where the key sits in the categories, which is also
    # the *value* stored for it in ``container``. Being a category is not
    # enough on its own:
    #   'b' in Categorical(['a'], categories=['a', 'b'])  # False
    if not is_scalar(position):
        # if categories is an IntervalIndex, the location is an array.
        return any(pos in container for pos in position)
    return position in container
244
245
246class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin):
247 """
248 Represent a categorical variable in classic R / S-plus fashion.
249
250 `Categoricals` can only take on a limited, and usually fixed, number
251 of possible values (`categories`). In contrast to statistical categorical
252 variables, a `Categorical` might have an order, but numerical operations
253 (additions, divisions, ...) are not possible.
254
255 All values of the `Categorical` are either in `categories` or `np.nan`.
256 Assigning values outside of `categories` will raise a `ValueError`. Order
257 is defined by the order of the `categories`, not lexical order of the
258 values.
259
260 Parameters
261 ----------
262 values : list-like
263 The values of the categorical. If categories are given, values not in
264 categories will be replaced with NaN.
265 categories : Index-like (unique), optional
266 The unique categories for this categorical. If not given, the
267 categories are assumed to be the unique values of `values` (sorted, if
268 possible, otherwise in the order in which they appear).
269 ordered : bool, default False
        Whether or not this categorical is treated as an ordered categorical.
271 If True, the resulting categorical will be ordered.
272 An ordered categorical respects, when sorted, the order of its
273 `categories` attribute (which in turn is the `categories` argument, if
274 provided).
275 dtype : CategoricalDtype
276 An instance of ``CategoricalDtype`` to use for this categorical.
277
278 Attributes
279 ----------
280 categories : Index
281 The categories of this categorical
282 codes : ndarray
283 The codes (integer positions, which point to the categories) of this
284 categorical, read only.
285 ordered : bool
286 Whether or not this Categorical is ordered.
287 dtype : CategoricalDtype
288 The instance of ``CategoricalDtype`` storing the ``categories``
289 and ``ordered``.
290
291 Methods
292 -------
293 from_codes
294 __array__
295
296 Raises
297 ------
298 ValueError
299 If the categories do not validate.
300 TypeError
301 If an explicit ``ordered=True`` is given but no `categories` and the
302 `values` are not sortable.
303
304 See Also
305 --------
306 CategoricalDtype : Type for categorical data.
307 CategoricalIndex : An Index with an underlying ``Categorical``.
308
309 Notes
310 -----
311 See the `user guide
312 <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`__
313 for more.
314
315 Examples
316 --------
317 >>> pd.Categorical([1, 2, 3, 1, 2, 3])
318 [1, 2, 3, 1, 2, 3]
319 Categories (3, int64): [1, 2, 3]
320
321 >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])
322 ['a', 'b', 'c', 'a', 'b', 'c']
323 Categories (3, object): ['a', 'b', 'c']
324
325 Missing values are not included as a category.
326
327 >>> c = pd.Categorical([1, 2, 3, 1, 2, 3, np.nan])
328 >>> c
329 [1, 2, 3, 1, 2, 3, NaN]
330 Categories (3, int64): [1, 2, 3]
331
332 However, their presence is indicated in the `codes` attribute
333 by code `-1`.
334
335 >>> c.codes
336 array([ 0, 1, 2, 0, 1, 2, -1], dtype=int8)
337
338 Ordered `Categoricals` can be sorted according to the custom order
339 of the categories and can have a min and max value.
340
341 >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True,
342 ... categories=['c', 'b', 'a'])
343 >>> c
344 ['a', 'b', 'c', 'a', 'b', 'c']
345 Categories (3, object): ['c' < 'b' < 'a']
346 >>> c.min()
347 'c'
348 """
349
    # For comparisons, so that numpy uses our implementation of the compare
    # ops, which raise
352 __array_priority__ = 1000
353 # tolist is not actually deprecated, just suppressed in the __dir__
354 _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"])
355 _typ = "categorical"
356
357 _dtype: CategoricalDtype
358
def __init__(
    self,
    values,
    categories=None,
    ordered=None,
    dtype: Dtype | None = None,
    fastpath: bool = False,
    copy: bool = True,
) -> None:
    # Build the codes array and the final CategoricalDtype from ``values``,
    # then hand both to the ndarray-backed base class.
    dtype = CategoricalDtype._from_values_or_dtype(
        values, categories, ordered, dtype
    )
    # At this point, dtype is always a CategoricalDtype, but
    # we may have dtype.categories be None, and we need to
    # infer categories in a factorization step further below

    if fastpath:
        # ``values`` are used as codes directly: no list-like check,
        # sanitizing or factorization is performed on this path.
        codes = coerce_indexer_dtype(values, dtype.categories)
        dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
        super().__init__(codes, dtype)
        return

    if not is_list_like(values):
        # GH#38433
        raise TypeError("Categorical input must be list-like")

    # null_mask indicates missing values we want to exclude from inference.
    # This means: only missing values in list-likes (not arrays/ndframes).
    null_mask = np.array(False)

    # sanitize input
    if is_categorical_dtype(values):
        # Reuse the categories of the incoming categorical when none were
        # requested explicitly.
        if dtype.categories is None:
            dtype = CategoricalDtype(values.categories, dtype.ordered)
    elif not isinstance(values, (ABCIndex, ABCSeries, ExtensionArray)):
        values = com.convert_to_list_like(values)
        if isinstance(values, list) and len(values) == 0:
            # By convention, empty lists result in object dtype:
            values = np.array([], dtype=object)
        elif isinstance(values, np.ndarray):
            if values.ndim > 1:
                # preempt sanitize_array from raising ValueError
                raise NotImplementedError(
                    "> 1 ndim Categorical are not supported at this time"
                )
            values = sanitize_array(values, None)
        else:
            # i.e. must be a list
            arr = sanitize_array(values, None)
            null_mask = isna(arr)
            if null_mask.any():
                # We remove null values here, then below will re-insert
                # them, grep "full_codes"
                arr_list = [values[idx] for idx in np.where(~null_mask)[0]]

                # GH#44900 Do not cast to float if we have only missing values
                if arr_list or arr.dtype == "object":
                    sanitize_dtype = None
                else:
                    sanitize_dtype = arr.dtype

                arr = sanitize_array(arr_list, None, dtype=sanitize_dtype)
            values = arr

    if dtype.categories is None:
        # No categories known yet: infer them from the values, sorted when
        # the values allow it and in order of appearance otherwise.
        try:
            codes, categories = factorize(values, sort=True)
        except TypeError as err:
            codes, categories = factorize(values, sort=False)
            if dtype.ordered:
                # raise, as we don't have a sortable data structure and so
                # the user should give us one by specifying categories
                raise TypeError(
                    "'values' is not ordered, please "
                    "explicitly specify the categories order "
                    "by passing in a categories argument."
                ) from err

        # we're inferring from values
        dtype = CategoricalDtype(categories, dtype.ordered)

    elif is_categorical_dtype(values.dtype):
        # Categorical input with explicit target categories: translate the
        # existing codes into the target categories.
        old_codes = extract_array(values)._codes
        codes = recode_for_categories(
            old_codes, values.dtype.categories, dtype.categories, copy=copy
        )

    else:
        codes = _get_codes_for_values(values, dtype.categories)

    if null_mask.any():
        # Reinsert -1 placeholders for previously removed missing values
        full_codes = -np.ones(null_mask.shape, dtype=codes.dtype)
        full_codes[~null_mask] = codes
        codes = full_codes

    # Start from an unordered dtype and apply ``dtype`` on top of it.
    dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
    arr = coerce_indexer_dtype(codes, dtype.categories)
    super().__init__(arr, dtype)
458
@property
def dtype(self) -> CategoricalDtype:
    """
    Return the :class:`~pandas.api.types.CategoricalDtype` of this instance.

    The dtype carries the ``categories`` and ``ordered`` information.
    """
    return self._dtype
465
@property
def _internal_fill_value(self) -> int:
    """
    Code used for missing values, typed like the backing codes array.
    """
    # Build -1 with the numpy integer type of the codes (rather than a
    # python int) so that _quantile returns the correct dtype in the
    # all-NA case.
    return self._ndarray.dtype.type(-1)
472
@classmethod
def _from_sequence(
    cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
) -> Categorical:
    """
    Construct a new array of type ``cls`` from a sequence of scalars.

    Parameters
    ----------
    scalars : list-like
        The values of the new array.
    dtype : dtype, optional
        Forwarded to the constructor as ``dtype``.
    copy : bool, default False
        Forwarded to the constructor as ``copy``.

    Returns
    -------
    Categorical
        An instance of ``cls``, so subclasses get an instance of their
        own type back.
    """
    # Use ``cls`` rather than the hard-coded base class, in line with the
    # other alternate constructors (``from_codes``).
    return cls(scalars, dtype=dtype, copy=copy)
478
@overload
def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
    ...

@overload
def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray:
    ...

@overload
def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike:
    ...

def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
    """
    Coerce this type to another dtype

    Parameters
    ----------
    dtype : numpy dtype or pandas type
    copy : bool, default True
        By default, astype always returns a newly allocated object.
        If copy is set to False and dtype is the very dtype object of
        this array, the original object is returned.

    Returns
    -------
    ndarray or ExtensionArray
        The converted values.

    Raises
    ------
    ValueError
        If an integer dtype is requested while missing values are
        present, or if the categories cannot be cast to ``dtype``.
    """
    dtype = pandas_dtype(dtype)
    if self.dtype is dtype:
        # Identity fast path: nothing to convert.
        result = self.copy() if copy else self

    elif is_categorical_dtype(dtype):
        dtype = cast(CategoricalDtype, dtype)

        # GH 10696/18593/18630
        dtype = self.dtype.update_dtype(dtype)
        self = self.copy() if copy else self
        result = self._set_dtype(dtype)

    elif isinstance(dtype, ExtensionDtype):
        # Other extension dtypes are handled by the parent implementation.
        return super().astype(dtype, copy=copy)

    elif is_integer_dtype(dtype) and self.isna().any():
        raise ValueError("Cannot convert float NaN to integer")

    elif len(self.codes) == 0 or len(self.categories) == 0:
        # Nothing to take from the categories; convert the array directly.
        result = np.array(
            self,
            dtype=dtype,
            copy=copy,
        )

    else:
        # GH8628 (PERF): astype category codes instead of astyping array
        new_cats = self.categories._values

        try:
            new_cats = new_cats.astype(dtype=dtype, copy=copy)
            fill_value = self.categories._na_value
            if not is_valid_na_for_dtype(fill_value, dtype):
                # Cast the NA value itself to the target dtype.
                fill_value = lib.item_from_zerodim(
                    np.array(self.categories._na_value).astype(dtype)
                )
        except (
            TypeError,  # downstream error msg for CategoricalIndex is misleading
            ValueError,
        ):
            msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}"
            raise ValueError(msg)

        # Expand the converted categories through the codes; positions with
        # code -1 receive ``fill_value``.
        result = take_nd(
            new_cats, ensure_platform_int(self._codes), fill_value=fill_value
        )

    return result
551
def to_list(self):
    """
    Return the values as a list; alias for ``tolist``.
    """
    return self.tolist()
557
@classmethod
def _from_inferred_categories(
    cls, inferred_categories, inferred_codes, dtype, true_values=None
):
    """
    Build a Categorical out of already inferred categories and codes.

    When ``dtype`` does not carry categories, the inferred categories are
    used in sorted order. When ``dtype`` carries categories, the
    ``inferred_categories`` are first converted to the matching type and
    the codes are expressed in terms of ``dtype.categories``.

    Parameters
    ----------
    inferred_categories : Index
    inferred_codes : Index
    dtype : CategoricalDtype or 'category'
    true_values : list, optional
        If none are provided, the default ones are
        "True", "TRUE", and "true."

    Returns
    -------
    Categorical
    """
    from pandas import (
        Index,
        to_datetime,
        to_numeric,
        to_timedelta,
    )

    cats = Index(inferred_categories)
    has_known_categories = (
        isinstance(dtype, CategoricalDtype) and dtype.categories is not None
    )

    if has_known_categories:
        target = dtype.categories

        # Convert the inferred categories to the kind of the target ones.
        if is_any_real_numeric_dtype(target):
            cats = to_numeric(inferred_categories, errors="coerce")
        elif is_datetime64_dtype(target):
            cats = to_datetime(inferred_categories, errors="coerce")
        elif is_timedelta64_dtype(target):
            cats = to_timedelta(inferred_categories, errors="coerce")
        elif is_bool_dtype(target):
            if true_values is None:
                true_values = ["True", "TRUE", "true"]

            # error: Incompatible types in assignment (expression has type
            # "ndarray", variable has type "Index")
            cats = cats.isin(true_values)  # type: ignore[assignment]

        # Recode from observation order to dtype.categories order.
        codes = recode_for_categories(inferred_codes, cats, target)
    elif cats.is_monotonic_increasing:
        # Already sorted: the inferred codes can be used unchanged.
        dtype = CategoricalDtype(cats, ordered=False)
        codes = inferred_codes
    else:
        # Sort categories and recode for unknown categories.
        unsorted = cats.copy()
        sorted_cats = cats.sort_values()

        codes = recode_for_categories(inferred_codes, unsorted, sorted_cats)
        dtype = CategoricalDtype(sorted_cats, ordered=False)

    return cls(codes, dtype=dtype, fastpath=True)
626
@classmethod
def from_codes(
    cls, codes, categories=None, ordered=None, dtype: Dtype | None = None
) -> Categorical:
    """
    Create a Categorical from codes plus categories (or a dtype).

    Use this constructor when codes and categories/dtype are already at
    hand, so that the (computation intensive) factorization step normally
    done in the constructor can be skipped.

    If your data does not follow this convention, please use the normal
    constructor.

    Parameters
    ----------
    codes : array-like of int
        An integer array, where each integer points to a category in
        categories or dtype.categories, or else is -1 for NaN.
    categories : index-like, optional
        The categories for the categorical. Items need to be unique.
        If the categories are not given here, then they must be provided
        in `dtype`.
    ordered : bool, optional
        Whether or not this categorical is treated as an ordered
        categorical. If not given here or in `dtype`, the resulting
        categorical will be unordered.
    dtype : CategoricalDtype or "category", optional
        If :class:`CategoricalDtype`, cannot be used together with
        `categories` or `ordered`.

    Returns
    -------
    Categorical

    Raises
    ------
    ValueError
        If no categories are available, if the codes contain NA values or
        are not integers, or if a code lies outside
        ``[-1, len(categories) - 1]``.

    Examples
    --------
    >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True)
    >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype)
    ['a', 'b', 'a', 'b']
    Categories (2, object): ['a' < 'b']
    """
    dtype = CategoricalDtype._from_values_or_dtype(
        categories=categories, ordered=ordered, dtype=dtype
    )
    if dtype.categories is None:
        raise ValueError(
            "The categories must be provided in 'categories' or "
            "'dtype'. Both were None."
        )

    is_nullable_int = is_extension_array_dtype(codes) and is_integer_dtype(codes)
    if not is_nullable_int:
        codes = np.asarray(codes)
    else:
        # Avoid the implicit conversion of Int to object
        if isna(codes).any():
            raise ValueError("codes cannot contain NA values")
        codes = codes.to_numpy(dtype=np.int64)

    # Empty input skips both validations below.
    if len(codes):
        if not is_integer_dtype(codes):
            raise ValueError("codes need to be array-like integers")

        n_categories = len(dtype.categories)
        if codes.max() >= n_categories or codes.min() < -1:
            raise ValueError("codes need to be between -1 and len(categories)-1")

    return cls(codes, dtype=dtype, fastpath=True)
693
694 # ------------------------------------------------------------------
695 # Categories/Codes/Ordered
696
@property
def categories(self) -> Index:
    """
    The categories of this categorical.

    Assigning to this attribute gives each category a new value, i.e. it
    effectively renames every individual category.

    The assigned value has to be a list-like object. All items must be
    unique and the number of items in the new categories must be the same
    as the number of items in the old categories.

    Raises
    ------
    ValueError
        If the new categories do not validate as categories or if the
        number of new categories is unequal to the number of old
        categories.

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.
    """
    return self.dtype.categories
725
@property
def ordered(self) -> Ordered:
    """
    Whether the categories have an ordered relationship.

    The value is read from the dtype of this categorical.
    """
    return self.dtype.ordered
732
@property
def codes(self) -> np.ndarray:
    """
    The category codes of this categorical.

    Each code is an integer giving the position of the corresponding
    value in the categories array.

    There is no setter, use the other categorical methods and the normal item
    setter to change values in the categorical.

    Returns
    -------
    ndarray[int]
        A non-writable view of the `codes` array.
    """
    # Hand out a view, so that write protection applies to the view only.
    readonly = self._codes.view()
    readonly.setflags(write=False)
    return readonly
752
def _set_categories(self, categories, fastpath: bool = False) -> None:
    """
    Replace the categories of this object in place.

    The codes are left untouched, only the dtype is swapped.

    Parameters
    ----------
    categories : Index-like
        The new categories.
    fastpath : bool, default False
        Don't perform validation of the categories for uniqueness or nulls

    Raises
    ------
    ValueError
        If ``fastpath`` is False and the number of new categories differs
        from the number of current categories.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'b'])
    >>> c
    ['a', 'b']
    Categories (2, object): ['a', 'b']

    >>> c._set_categories(pd.Index(['a', 'c']))
    >>> c
    ['a', 'c']
    Categories (2, object): ['a', 'c']
    """
    if fastpath:
        new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered)
    else:
        new_dtype = CategoricalDtype(categories, ordered=self.ordered)
        # The length check is part of the validated path only.
        current = self.dtype.categories
        if current is not None and len(new_dtype.categories) != len(current):
            raise ValueError(
                "new categories need to have the same number of "
                "items as the old categories!"
            )

    super().__init__(self._ndarray, new_dtype)
789
def _set_dtype(self, dtype: CategoricalDtype) -> Categorical:
    """
    Return a new object of the same type carrying ``dtype``.

    The codes are recoded from the current categories to
    ``dtype.categories``.

    Parameters
    ----------
    dtype : CategoricalDtype

    Notes
    -----
    We don't do any validation here. It's assumed that the dtype is
    a (valid) instance of `CategoricalDtype`.
    """
    recoded = recode_for_categories(self.codes, self.categories, dtype.categories)
    klass = type(self)
    return klass(recoded, dtype=dtype, fastpath=True)
805
def set_ordered(self, value: bool) -> Categorical:
    """
    Return a copy whose ordered attribute is set to ``value``.

    Parameters
    ----------
    value : bool
        Set whether this categorical is ordered (True) or not (False).

    Returns
    -------
    Categorical
        A copy of this object with the same categories and the requested
        ``ordered`` flag.
    """
    result = self.copy()
    target_dtype = CategoricalDtype(self.categories, ordered=value)
    # Re-initialise the copy on its own codes with the new dtype.
    NDArrayBacked.__init__(result, result._ndarray, target_dtype)
    return result
819
def as_ordered(self) -> Categorical:
    """
    Return an ordered version of this Categorical.

    Returns
    -------
    Categorical
        Ordered Categorical.
    """
    return self.set_ordered(True)
830
def as_unordered(self) -> Categorical:
    """
    Return an unordered version of this Categorical.

    Returns
    -------
    Categorical
        Unordered Categorical.
    """
    return self.set_ordered(False)
841
def set_categories(self, new_categories, ordered=None, rename: bool = False):
    """
    Set the categories to the specified new_categories.

    `new_categories` can include new categories (which will result in
    unused categories) or remove old categories (which results in values
    set to NaN). If `rename==True`, the categories will simply be renamed
    (less or more items than in old categories will result in values set to
    NaN or in unused categories respectively).

    This method can be used to perform more than one action of adding,
    removing, and reordering simultaneously and is therefore faster than
    performing the individual steps via the more specialised methods.

    On the other hand this method does not do checks (e.g., whether the
    old categories are included in the new categories on a reorder), which
    can result in surprising changes, for example when using special string
    dtypes, which do not consider a S1 string equal to a single char
    python string.

    Parameters
    ----------
    new_categories : Index-like
        The categories in new order.
    ordered : bool, optional
        Whether or not the categorical is treated as an ordered categorical.
        If not given, do not change the ordered information.
    rename : bool, default False
        Whether or not the new_categories should be considered as a rename
        of the old categories or as reordered categories.

    Returns
    -------
    Categorical with reordered categories.

    Raises
    ------
    ValueError
        If new_categories does not validate as categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    """
    target_ordered = self.dtype.ordered if ordered is None else ordered
    new_dtype = CategoricalDtype(new_categories, ordered=target_ordered)

    result = self.copy()
    if not rename:
        new_codes = recode_for_categories(
            result.codes, result.categories, new_dtype.categories
        )
    else:
        previous = result.dtype.categories
        if previous is not None and len(new_dtype.categories) < len(previous):
            # remove all _codes which are larger and set to -1/NaN
            too_large = result._codes >= len(new_dtype.categories)
            result._codes[too_large] = -1
        new_codes = result._codes
    NDArrayBacked.__init__(result, new_codes, new_dtype)
    return result
909
def rename_categories(self, new_categories) -> Categorical:
    """
    Rename categories.

    Parameters
    ----------
    new_categories : list-like, dict-like or callable

        New categories which will replace old categories.

        * list-like: all items must be unique and the number of items in
          the new categories must match the existing number of categories.

        * dict-like: specifies a mapping from
          old categories to new. Categories not contained in the mapping
          are passed through and extra categories in the mapping are
          ignored.

        * callable : a callable that is called on all items in the old
          categories and whose return values comprise the new categories.

    Returns
    -------
    Categorical
        Categorical with renamed categories.

    Raises
    ------
    ValueError
        If new categories are list-like and do not have the same number of
        items as the current categories or do not validate as categories

    See Also
    --------
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'a', 'b'])
    >>> c.rename_categories([0, 1])
    [0, 0, 1]
    Categories (2, int64): [0, 1]

    For dict-like ``new_categories``, extra keys are ignored and
    categories not in the dictionary are passed through

    >>> c.rename_categories({'a': 'A', 'c': 'C'})
    ['A', 'A', 'b']
    Categories (2, object): ['A', 'b']

    You may also provide a callable to create the new categories

    >>> c.rename_categories(lambda x: x.upper())
    ['A', 'A', 'B']
    Categories (2, object): ['A', 'B']
    """
    if is_dict_like(new_categories):
        # Unmapped categories fall back to their current value.
        lookup = new_categories.get
        renamed = [lookup(old, old) for old in self.categories]
    elif callable(new_categories):
        renamed = list(map(new_categories, self.categories))
    else:
        renamed = new_categories

    result = self.copy()
    result._set_categories(renamed)
    return result
981
def reorder_categories(self, new_categories, ordered=None):
    """
    Reorder categories as specified in new_categories.

    `new_categories` need to include all old categories and no new category
    items.

    Parameters
    ----------
    new_categories : Index-like
        The categories in new order.
    ordered : bool, optional
        Whether or not the categorical is treated as an ordered categorical.
        If not given, do not change the ordered information.

    Returns
    -------
    Categorical
        Categorical with reordered categories.

    Raises
    ------
    ValueError
        If the new categories do not contain all old category items or any
        new ones

    See Also
    --------
    rename_categories : Rename categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.
    """
    current = self.categories
    same_size = len(current) == len(new_categories)
    # Same size and nothing of the old categories missing: a pure reorder.
    if not (same_size and current.difference(new_categories).empty):
        raise ValueError(
            "items in new_categories are not the same as in old categories"
        )
    return self.set_categories(new_categories, ordered=ordered)
1024
def add_categories(self, new_categories) -> Categorical:
    """
    Add new categories.

    `new_categories` will be included at the last/highest place in the
    categories and will be unused directly after this call.

    Parameters
    ----------
    new_categories : category or list-like of category
        The new categories to be included.

    Returns
    -------
    Categorical
        Categorical with new categories added.

    Raises
    ------
    ValueError
        If the new categories include old categories or do not validate as
        categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['c', 'b', 'c'])
    >>> c
    ['c', 'b', 'c']
    Categories (2, object): ['b', 'c']

    >>> c.add_categories(['d', 'a'])
    ['c', 'b', 'c']
    Categories (4, object): ['b', 'c', 'd', 'a']
    """
    # A single category is treated as a one-element list.
    additions = new_categories if is_list_like(new_categories) else [new_categories]
    existing = self.dtype.categories

    clashes = set(additions) & set(existing)
    if clashes:
        raise ValueError(
            f"new categories must not include old categories: {clashes}"
        )

    if not hasattr(additions, "dtype"):
        combined = list(existing) + list(additions)
    else:
        from pandas import Series

        # Typed input: build the combined categories with the common dtype
        # of the old and the new categories.
        common = find_common_type([existing.dtype, additions.dtype])
        combined = Series(list(existing) + list(additions), dtype=common)

    widened_dtype = CategoricalDtype(combined, self.ordered)
    result = self.copy()
    # Fit the codes to the enlarged set of categories.
    widened_codes = coerce_indexer_dtype(result._ndarray, widened_dtype.categories)
    NDArrayBacked.__init__(result, widened_codes, widened_dtype)
    return result
1093
def remove_categories(self, removals) -> Categorical:
    """
    Remove the specified categories.

    `removals` must be included in the old categories. Values which were in
    the removed categories will be set to NaN

    Parameters
    ----------
    removals : category or list of categories
        The categories which should be removed.

    Returns
    -------
    Categorical
        Categorical with removed categories. For an ordered Categorical the
        remaining categories keep their existing relative order.

    Raises
    ------
    ValueError
        If the removals are not contained in the categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd'])
    >>> c
    ['a', 'c', 'b', 'c', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c.remove_categories(['d', 'a'])
    [NaN, 'c', 'b', 'c', NaN]
    Categories (2, object): ['b', 'c']
    """
    from pandas import Index

    if not is_list_like(removals):
        removals = [removals]

    removals = Index(removals).unique().dropna()
    current = self.dtype.categories
    if self.ordered:
        # Index.difference sorts its result by default. For an ordered
        # Categorical the category order defines the comparison semantics,
        # so it must survive the removal unchanged.
        new_categories = current.difference(removals, sort=False)
    else:
        new_categories = current.difference(removals)
    not_included = removals.difference(current)

    if len(not_included) != 0:
        not_included = set(not_included)
        raise ValueError(f"removals must all be in old categories: {not_included}")

    return self.set_categories(new_categories, ordered=self.ordered, rename=False)
1149
def remove_unused_categories(self) -> Categorical:
    """
    Drop every category that no value refers to.

    Returns
    -------
    Categorical
        Categorical with unused categories dropped.

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd'])
    >>> c
    ['a', 'c', 'b', 'c', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c[2] = 'a'
    >>> c[4] = 'c'
    >>> c
    ['a', 'c', 'a', 'c', 'c']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c.remove_unused_categories()
    ['a', 'c', 'a', 'c', 'c']
    Categories (2, object): ['a', 'c']
    """
    used_codes, recoded = np.unique(self._codes, return_inverse=True)

    # np.unique returns sorted values, so the NA sentinel (-1), when present,
    # is the first entry. Drop it and shift the inverse so that missing
    # values map back to -1.
    if used_codes.size != 0 and used_codes[0] == -1:
        used_codes = used_codes[1:]
        recoded = recoded - 1

    kept_categories = self.dtype.categories.take(used_codes)
    kept_dtype = CategoricalDtype._from_fastpath(
        kept_categories, ordered=self.ordered
    )
    kept_codes = coerce_indexer_dtype(recoded, kept_dtype.categories)

    result = self.copy()
    NDArrayBacked.__init__(result, kept_codes, kept_dtype)
    return result
1198
1199 # ------------------------------------------------------------------
1200
def map(self, mapper):
    """
    Map categories using an input mapping or function.

    The mapper is applied to the categories, not to the individual values.
    If the mapping correspondence is one-to-one the result is a
    :class:`~pandas.Categorical` which has the same order property as the
    original, otherwise a :class:`~pandas.Index` is returned. NaN values
    are unaffected.

    If a `dict` or :class:`~pandas.Series` is used any unmapped category is
    mapped to `NaN`. Note that if this happens an :class:`~pandas.Index`
    will be returned.

    Parameters
    ----------
    mapper : function, dict, or Series
        Mapping correspondence.

    Returns
    -------
    pandas.Categorical or pandas.Index
        Mapped categorical.

    See Also
    --------
    CategoricalIndex.map : Apply a mapping correspondence on a
        :class:`~pandas.CategoricalIndex`.
    Index.map : Apply a mapping correspondence on an
        :class:`~pandas.Index`.
    Series.map : Apply a mapping correspondence on a
        :class:`~pandas.Series`.
    Series.apply : Apply more complex functions on a
        :class:`~pandas.Series`.

    Examples
    --------
    >>> cat = pd.Categorical(['a', 'b', 'c'])
    >>> cat
    ['a', 'b', 'c']
    Categories (3, object): ['a', 'b', 'c']
    >>> cat.map(lambda x: x.upper())
    ['A', 'B', 'C']
    Categories (3, object): ['A', 'B', 'C']
    >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'})
    ['first', 'second', 'third']
    Categories (3, object): ['first', 'second', 'third']

    If the mapping is one-to-one the ordering of the categories is
    preserved:

    >>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True)
    >>> cat
    ['a', 'b', 'c']
    Categories (3, object): ['a' < 'b' < 'c']
    >>> cat.map({'a': 3, 'b': 2, 'c': 1})
    [3, 2, 1]
    Categories (3, int64): [3 < 2 < 1]

    If the mapping is not one-to-one an :class:`~pandas.Index` is returned:

    >>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'})
    Index(['first', 'second', 'first'], dtype='object')

    If a `dict` is used, all unmapped categories are mapped to `NaN` and
    the result is an :class:`~pandas.Index`:

    >>> cat.map({'a': 'first', 'b': 'second'})
    Index(['first', 'second', nan], dtype='object')
    """
    mapped = self.categories.map(mapper)
    codes = self._codes
    try:
        return self.from_codes(
            codes.copy(), categories=mapped, ordered=self.ordered
        )
    except ValueError:
        # The mapped values cannot serve as categories, so fall back to
        # materializing them by position. Missing values are stored as -1,
        # and np.take resolves -1 to the final element, so append NaN to
        # make that final element the right answer.
        has_missing = (codes == -1).any()
        if has_missing:
            mapped = mapped.insert(len(mapped), np.nan)
        return np.take(mapped, codes)
1281
# Rich comparison operators. Each one is produced by ``_cat_compare_op``
# (defined outside this section) from the matching ``operator`` function.
__eq__ = _cat_compare_op(operator.eq)
__ne__ = _cat_compare_op(operator.ne)
__lt__ = _cat_compare_op(operator.lt)
__gt__ = _cat_compare_op(operator.gt)
__le__ = _cat_compare_op(operator.le)
__ge__ = _cat_compare_op(operator.ge)
1288
1289 # -------------------------------------------------------------
1290 # Validators; ideally these can be de-duplicated
1291
1292 def _validate_setitem_value(self, value):
1293 if not is_hashable(value):
1294 # wrap scalars and hashable-listlikes in list
1295 return self._validate_listlike(value)
1296 else:
1297 return self._validate_scalar(value)
1298
1299 def _validate_scalar(self, fill_value):
1300 """
1301 Convert a user-facing fill_value to a representation to use with our
1302 underlying ndarray, raising TypeError if this is not possible.
1303
1304 Parameters
1305 ----------
1306 fill_value : object
1307
1308 Returns
1309 -------
1310 fill_value : int
1311
1312 Raises
1313 ------
1314 TypeError
1315 """
1316
1317 if is_valid_na_for_dtype(fill_value, self.categories.dtype):
1318 fill_value = -1
1319 elif fill_value in self.categories:
1320 fill_value = self._unbox_scalar(fill_value)
1321 else:
1322 raise TypeError(
1323 "Cannot setitem on a Categorical with a new "
1324 f"category ({fill_value}), set the categories first"
1325 ) from None
1326 return fill_value
1327
1328 # -------------------------------------------------------------
1329
1330 @ravel_compat
1331 def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
1332 """
1333 The numpy array interface.
1334
1335 Returns
1336 -------
1337 numpy.array
1338 A numpy array of either the specified dtype or,
1339 if dtype==None (default), the same dtype as
1340 categorical.categories.dtype.
1341 """
1342 ret = take_nd(self.categories._values, self._codes)
1343 if dtype and not is_dtype_equal(dtype, self.categories.dtype):
1344 return np.asarray(ret, dtype)
1345 # When we're a Categorical[ExtensionArray], like Interval,
1346 # we need to ensure __array__ gets all the way to an
1347 # ndarray.
1348 return np.asarray(ret)
1349
1350 def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
1351 # for binary ops, use our custom dunder methods
1352 result = ops.maybe_dispatch_ufunc_to_dunder_op(
1353 self, ufunc, method, *inputs, **kwargs
1354 )
1355 if result is not NotImplemented:
1356 return result
1357
1358 if "out" in kwargs:
1359 # e.g. test_numpy_ufuncs_out
1360 return arraylike.dispatch_ufunc_with_out(
1361 self, ufunc, method, *inputs, **kwargs
1362 )
1363
1364 if method == "reduce":
1365 # e.g. TestCategoricalAnalytics::test_min_max_ordered
1366 result = arraylike.dispatch_reduction_ufunc(
1367 self, ufunc, method, *inputs, **kwargs
1368 )
1369 if result is not NotImplemented:
1370 return result
1371
1372 # for all other cases, raise for now (similarly as what happens in
1373 # Series.__array_prepare__)
1374 raise TypeError(
1375 f"Object with dtype {self.dtype} cannot perform "
1376 f"the numpy op {ufunc.__name__}"
1377 )
1378
def __setstate__(self, state) -> None:
    """
    Restore pickled state.

    Dict-shaped states are upgraded in place before being handed to the
    parent class: a missing ``_dtype`` is rebuilt from ``_categories`` and
    ``_ordered``, and a legacy ``_codes`` entry is moved to ``_ndarray``.
    Any other state is passed to the parent class untouched.
    """
    if isinstance(state, dict):
        if "_dtype" not in state:
            state["_dtype"] = CategoricalDtype(
                state["_categories"], state["_ordered"]
            )

        if "_ndarray" not in state and "_codes" in state:
            # backward compat, changed what is property vs attribute
            state["_ndarray"] = state.pop("_codes")

        super().__setstate__(state)
        return None

    return super().__setstate__(state)
1392
@property
def nbytes(self) -> int:
    """
    Number of bytes taken by the codes plus the categories' values.
    """
    codes_bytes = self._codes.nbytes
    categories_bytes = self.dtype.categories.values.nbytes
    return codes_bytes + categories_bytes
1396
def memory_usage(self, deep: bool = False) -> int:
    """
    Memory usage of my values

    The result is the size of the codes plus the memory usage reported by
    the categories.

    Parameters
    ----------
    deep : bool
        Introspect the data deeply, interrogate
        `object` dtypes for system-level memory consumption

    Returns
    -------
    bytes used

    Notes
    -----
    Memory usage does not include memory consumed by elements that
    are not components of the array if deep=False

    See Also
    --------
    numpy.ndarray.nbytes
    """
    categories_usage = self.dtype.categories.memory_usage(deep=deep)
    return self._codes.nbytes + categories_usage
1421
def isna(self) -> np.ndarray:
    """
    Detect missing values

    A value is missing when its code is -1.

    Returns
    -------
    np.ndarray[bool] of whether my values are null

    See Also
    --------
    isna : Top-level isna.
    isnull : Alias of isna.
    Categorical.notna : Boolean inverse of Categorical.isna.

    """
    codes = self._codes
    return codes == -1

isnull = isna
1442
def notna(self) -> np.ndarray:
    """
    Inverse of isna

    Elementwise negation of :meth:`isna`, so values stored with code -1
    are reported as not present.

    Returns
    -------
    np.ndarray[bool] of whether my values are not null

    See Also
    --------
    notna : Top-level notna.
    notnull : Alias of notna.
    Categorical.isna : Boolean inverse of Categorical.notna.

    """
    missing = self.isna()
    return ~missing

notnull = notna
1464
def value_counts(self, dropna: bool = True) -> Series:
    """
    Return a Series containing counts of each category.

    Every category will have an entry, even those with a count of 0.

    Parameters
    ----------
    dropna : bool, default True
        Don't include counts of NaN.

    Returns
    -------
    counts : Series

    See Also
    --------
    Series.value_counts
    """
    from pandas import (
        CategoricalIndex,
        Series,
    )

    codes = self._codes
    n_categories = len(self.categories)
    present = codes >= 0
    nothing_missing = present.all()
    positions = np.arange(n_categories)

    if dropna or nothing_missing:
        observed = codes if nothing_missing else codes[present]
        counts = np.bincount(observed, minlength=n_categories)
    else:
        # Count missing values in an extra trailing bin and label that bin
        # with the NA sentinel.
        counts = np.bincount(np.where(present, codes, n_categories))
        positions = np.append(positions, -1)

    positions = coerce_indexer_dtype(positions, self.dtype.categories)
    index = CategoricalIndex(self._from_backing_data(positions))

    return Series(counts, index=index, dtype="int64", name="count", copy=False)
1506
1507 # error: Argument 2 of "_empty" is incompatible with supertype
1508 # "NDArrayBackedExtensionArray"; supertype defines the argument type as
1509 # "ExtensionDtype"
1510 @classmethod
1511 def _empty( # type: ignore[override]
1512 cls: type_t[Categorical], shape: Shape, dtype: CategoricalDtype
1513 ) -> Categorical:
1514 """
1515 Analogous to np.empty(shape, dtype=dtype)
1516
1517 Parameters
1518 ----------
1519 shape : tuple[int]
1520 dtype : CategoricalDtype
1521 """
1522 arr = cls._from_sequence([], dtype=dtype)
1523
1524 # We have to use np.zeros instead of np.empty otherwise the resulting
1525 # ndarray may contain codes not supported by this dtype, in which
1526 # case repr(result) could segfault.
1527 backing = np.zeros(shape, dtype=arr._ndarray.dtype)
1528
1529 return arr._from_backing_data(backing)
1530
1531 def _internal_get_values(self):
1532 """
1533 Return the values.
1534
1535 For internal compatibility with pandas formatting.
1536
1537 Returns
1538 -------
1539 np.ndarray or Index
1540 A numpy array of the same dtype as categorical.categories.dtype or
1541 Index if datetime / periods.
1542 """
1543 # if we are a datetime and period index, return Index to keep metadata
1544 if needs_i8_conversion(self.categories.dtype):
1545 return self.categories.take(self._codes, fill_value=NaT)
1546 elif is_integer_dtype(self.categories) and -1 in self._codes:
1547 return self.categories.astype("object").take(self._codes, fill_value=np.nan)
1548 return np.array(self)
1549
def check_for_ordered(self, op) -> None:
    """
    Raise TypeError naming `op` unless this Categorical is ordered.
    """
    if self.ordered:
        return
    raise TypeError(
        f"Categorical is not ordered for operation {op}\n"
        "you can use .as_ordered() to change the "
        "Categorical to an ordered one\n"
    )
1558
def argsort(
    self, *, ascending: bool = True, kind: SortKind = "quicksort", **kwargs
):
    """
    Return the indices that would sort the Categorical.

    Missing values are sorted at the end.

    This override only supplies the Categorical-specific documentation;
    the work is delegated unchanged to the parent class implementation.

    Parameters
    ----------
    ascending : bool, default True
        Whether the indices should result in an ascending
        or descending sort.
    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional
        Sorting algorithm.
    **kwargs:
        passed through to :func:`numpy.argsort`.

    Returns
    -------
    np.ndarray[np.intp]

    See Also
    --------
    numpy.ndarray.argsort

    Notes
    -----
    While an ordering is applied to the category values, arg-sorting
    in this context refers more to organizing and grouping together
    based on matching category values. Thus, this function can be
    called on an unordered Categorical instance unlike the functions
    'Categorical.min' and 'Categorical.max'.

    Examples
    --------
    >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort()
    array([2, 0, 1, 3])

    >>> cat = pd.Categorical(['b', 'b', 'a', 'c'],
    ...                      categories=['c', 'b', 'a'],
    ...                      ordered=True)
    >>> cat.argsort()
    array([3, 0, 1, 2])

    Missing values are placed at the end

    >>> cat = pd.Categorical([2, None, 1])
    >>> cat.argsort()
    array([2, 0, 1])
    """
    # All arguments are forwarded as given; nothing is adjusted here.
    return super().argsort(ascending=ascending, kind=kind, **kwargs)
1611
@overload
def sort_values(
    self,
    *,
    inplace: Literal[False] = ...,
    ascending: bool = ...,
    na_position: str = ...,
) -> Categorical:
    ...

@overload
def sort_values(
    self, *, inplace: Literal[True], ascending: bool = ..., na_position: str = ...
) -> None:
    ...

def sort_values(
    self,
    *,
    inplace: bool = False,
    ascending: bool = True,
    na_position: str = "last",
) -> Categorical | None:
    """
    Sort the Categorical by category value returning a new
    Categorical by default.

    While an ordering is applied to the category values, sorting in this
    context refers more to organizing and grouping together based on
    matching category values. Thus, this function can be called on an
    unordered Categorical instance unlike the functions 'Categorical.min'
    and 'Categorical.max'.

    Parameters
    ----------
    inplace : bool, default False
        Do operation in place.
    ascending : bool, default True
        Order ascending. Passing False orders descending. The
        ordering parameter provides the method by which the
        category values are organized.
    na_position : {'first', 'last'} (optional, default='last')
        'first' puts NaNs at the beginning
        'last' puts NaNs at the end

    Returns
    -------
    Categorical or None
        The sorted Categorical, or None when ``inplace=True``.

    Raises
    ------
    ValueError
        If `na_position` is neither 'first' nor 'last'.

    See Also
    --------
    Categorical.argsort
    Series.sort_values

    Examples
    --------
    >>> c = pd.Categorical([1, 2, 2, 1, 5])
    >>> c
    [1, 2, 2, 1, 5]
    Categories (3, int64): [1, 2, 5]
    >>> c.sort_values()
    [1, 1, 2, 2, 5]
    Categories (3, int64): [1, 2, 5]
    >>> c.sort_values(ascending=False)
    [5, 2, 2, 1, 1]
    Categories (3, int64): [1, 2, 5]

    'sort_values' behaviour with NaNs. Note that 'na_position'
    is independent of the 'ascending' parameter:

    >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5])
    >>> c
    [NaN, 2, 2, NaN, 5]
    Categories (2, int64): [2, 5]
    >>> c.sort_values()
    [2, 2, 5, NaN, NaN]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(ascending=False)
    [5, 2, 2, NaN, NaN]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(na_position='first')
    [NaN, NaN, 2, 2, 5]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(ascending=False, na_position='first')
    [NaN, NaN, 5, 2, 2]
    Categories (2, int64): [2, 5]
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    if na_position not in ("last", "first"):
        raise ValueError(f"invalid na_position: {na_position!r}")

    order = nargsort(self, ascending=ascending, na_position=na_position)
    reordered_codes = self._codes[order]

    if inplace:
        # overwrite the existing backing array rather than rebinding it
        self._codes[:] = reordered_codes
        return None
    return self._from_backing_data(reordered_codes)
1712
1713 def _rank(
1714 self,
1715 *,
1716 axis: AxisInt = 0,
1717 method: str = "average",
1718 na_option: str = "keep",
1719 ascending: bool = True,
1720 pct: bool = False,
1721 ):
1722 """
1723 See Series.rank.__doc__.
1724 """
1725 if axis != 0:
1726 raise NotImplementedError
1727 vff = self._values_for_rank()
1728 return algorithms.rank(
1729 vff,
1730 axis=axis,
1731 method=method,
1732 na_option=na_option,
1733 ascending=ascending,
1734 pct=pct,
1735 )
1736
1737 def _values_for_rank(self):
1738 """
1739 For correctly ranking ordered categorical data. See GH#15420
1740
1741 Ordered categorical data should be ranked on the basis of
1742 codes with -1 translated to NaN.
1743
1744 Returns
1745 -------
1746 numpy.array
1747
1748 """
1749 from pandas import Series
1750
1751 if self.ordered:
1752 values = self.codes
1753 mask = values == -1
1754 if mask.any():
1755 values = values.astype("float64")
1756 values[mask] = np.nan
1757 elif is_any_real_numeric_dtype(self.categories):
1758 values = np.array(self)
1759 else:
1760 # reorder the categories (so rank can use the float codes)
1761 # instead of passing an object array to rank
1762 values = np.array(
1763 self.rename_categories(
1764 Series(self.categories, copy=False).rank().values
1765 )
1766 )
1767 return values
1768
1769 # ------------------------------------------------------------------
1770 # NDArrayBackedExtensionArray compat
1771
1772 @property
1773 def _codes(self) -> np.ndarray:
1774 return self._ndarray
1775
1776 def _box_func(self, i: int):
1777 if i == -1:
1778 return np.NaN
1779 return self.categories[i]
1780
1781 def _unbox_scalar(self, key) -> int:
1782 # searchsorted is very performance sensitive. By converting codes
1783 # to same dtype as self.codes, we get much faster performance.
1784 code = self.categories.get_loc(key)
1785 code = self._ndarray.dtype.type(code)
1786 return code
1787
1788 # ------------------------------------------------------------------
1789
1790 def __iter__(self) -> Iterator:
1791 """
1792 Returns an Iterator over the values of this Categorical.
1793 """
1794 if self.ndim == 1:
1795 return iter(self._internal_get_values().tolist())
1796 else:
1797 return (self[n] for n in range(len(self)))
1798
1799 def __contains__(self, key) -> bool:
1800 """
1801 Returns True if `key` is in this Categorical.
1802 """
1803 # if key is a NaN, check if any NaN is in self.
1804 if is_valid_na_for_dtype(key, self.categories.dtype):
1805 return bool(self.isna().any())
1806
1807 return contains(self, key, container=self._codes)
1808
1809 # ------------------------------------------------------------------
1810 # Rendering Methods
1811
1812 def _formatter(self, boxed: bool = False):
1813 # Defer to CategoricalFormatter's formatter.
1814 return None
1815
1816 def _tidy_repr(self, max_vals: int = 10, footer: bool = True) -> str:
1817 """
1818 a short repr displaying only max_vals and an optional (but default
1819 footer)
1820 """
1821 num = max_vals // 2
1822 head = self[:num]._get_repr(length=False, footer=False)
1823 tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False)
1824
1825 result = f"{head[:-1]}, ..., {tail[1:]}"
1826 if footer:
1827 result = f"{result}\n{self._repr_footer()}"
1828
1829 return str(result)
1830
1831 def _repr_categories(self) -> list[str]:
1832 """
1833 return the base repr for the categories
1834 """
1835 max_categories = (
1836 10
1837 if get_option("display.max_categories") == 0
1838 else get_option("display.max_categories")
1839 )
1840 from pandas.io.formats import format as fmt
1841
1842 format_array = partial(
1843 fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC
1844 )
1845 if len(self.categories) > max_categories:
1846 num = max_categories // 2
1847 head = format_array(self.categories[:num])
1848 tail = format_array(self.categories[-num:])
1849 category_strs = head + ["..."] + tail
1850 else:
1851 category_strs = format_array(self.categories)
1852
1853 # Strip all leading spaces, which format_array adds for columns...
1854 category_strs = [x.strip() for x in category_strs]
1855 return category_strs
1856
1857 def _repr_categories_info(self) -> str:
1858 """
1859 Returns a string representation of the footer.
1860 """
1861 category_strs = self._repr_categories()
1862 dtype = str(self.categories.dtype)
1863 levheader = f"Categories ({len(self.categories)}, {dtype}): "
1864 width, height = get_terminal_size()
1865 max_width = get_option("display.width") or width
1866 if console.in_ipython_frontend():
1867 # 0 = no breaks
1868 max_width = 0
1869 levstring = ""
1870 start = True
1871 cur_col_len = len(levheader) # header
1872 sep_len, sep = (3, " < ") if self.ordered else (2, ", ")
1873 linesep = f"{sep.rstrip()}\n" # remove whitespace
1874 for val in category_strs:
1875 if max_width != 0 and cur_col_len + sep_len + len(val) > max_width:
1876 levstring += linesep + (" " * (len(levheader) + 1))
1877 cur_col_len = len(levheader) + 1 # header + a whitespace
1878 elif not start:
1879 levstring += sep
1880 cur_col_len += len(val)
1881 levstring += val
1882 start = False
1883 # replace to simple save space by
1884 return f"{levheader}[{levstring.replace(' < ... < ', ' ... ')}]"
1885
1886 def _repr_footer(self) -> str:
1887 info = self._repr_categories_info()
1888 return f"Length: {len(self)}\n{info}"
1889
def _get_repr(
    self, length: bool = True, na_rep: str = "NaN", footer: bool = True
) -> str:
    """
    Render this Categorical with ``CategoricalFormatter``.

    `length`, `na_rep` and `footer` are forwarded to the formatter
    unchanged.
    """
    from pandas.io.formats import format as fmt

    rendered = fmt.CategoricalFormatter(
        self, length=length, na_rep=na_rep, footer=footer
    ).to_string()
    return str(rendered)
1900
1901 def __repr__(self) -> str:
1902 """
1903 String representation.
1904 """
1905 _maxlen = 10
1906 if len(self._codes) > _maxlen:
1907 result = self._tidy_repr(_maxlen)
1908 elif len(self._codes) > 0:
1909 result = self._get_repr(length=len(self) > _maxlen)
1910 else:
1911 msg = self._get_repr(length=False, footer=True).replace("\n", ", ")
1912 result = f"[], {msg}"
1913
1914 return result
1915
1916 # ------------------------------------------------------------------
1917
1918 def _validate_listlike(self, value):
1919 # NB: here we assume scalar-like tuples have already been excluded
1920 value = extract_array(value, extract_numpy=True)
1921
1922 # require identical categories set
1923 if isinstance(value, Categorical):
1924 if not is_dtype_equal(self.dtype, value.dtype):
1925 raise TypeError(
1926 "Cannot set a Categorical with another, "
1927 "without identical categories"
1928 )
1929 # is_dtype_equal implies categories_match_up_to_permutation
1930 value = self._encode_with_my_categories(value)
1931 return value._codes
1932
1933 from pandas import Index
1934
1935 # tupleize_cols=False for e.g. test_fillna_iterable_category GH#41914
1936 to_add = Index._with_infer(value, tupleize_cols=False).difference(
1937 self.categories
1938 )
1939
1940 # no assignments of values not in categories, but it's always ok to set
1941 # something to np.nan
1942 if len(to_add) and not isna(to_add).all():
1943 raise TypeError(
1944 "Cannot setitem on a Categorical with a new "
1945 "category, set the categories first"
1946 )
1947
1948 codes = self.categories.get_indexer(value)
1949 return codes.astype(self._ndarray.dtype, copy=False)
1950
1951 def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]:
1952 """
1953 Compute the inverse of a categorical, returning
1954 a dict of categories -> indexers.
1955
1956 *This is an internal function*
1957
1958 Returns
1959 -------
1960 Dict[Hashable, np.ndarray[np.intp]]
1961 dict of categories -> indexers
1962
1963 Examples
1964 --------
1965 >>> c = pd.Categorical(list('aabca'))
1966 >>> c
1967 ['a', 'a', 'b', 'c', 'a']
1968 Categories (3, object): ['a', 'b', 'c']
1969 >>> c.categories
1970 Index(['a', 'b', 'c'], dtype='object')
1971 >>> c.codes
1972 array([0, 0, 1, 2, 0], dtype=int8)
1973 >>> c._reverse_indexer()
1974 {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])}
1975
1976 """
1977 categories = self.categories
1978 r, counts = libalgos.groupsort_indexer(
1979 ensure_platform_int(self.codes), categories.size
1980 )
1981 counts = ensure_int64(counts).cumsum()
1982 _result = (r[start:end] for start, end in zip(counts, counts[1:]))
1983 return dict(zip(categories, _result))
1984
1985 # ------------------------------------------------------------------
1986 # Reductions
1987
def min(self, *, skipna: bool = True, **kwargs):
    """
    The minimum value of the object.

    Only ordered `Categoricals` have a minimum!

    Parameters
    ----------
    skipna : bool, default True
        Ignore missing values. If False, any missing value makes the
        result NaN.
    **kwargs
        Validated for numpy compatibility.

    Returns
    -------
    min : the minimum of this `Categorical`, NA value if empty

    Raises
    ------
    TypeError
        If the `Categorical` is not `ordered`.
    """
    nv.validate_minmax_axis(kwargs.get("axis", 0))
    nv.validate_min((), kwargs)
    self.check_for_ordered("min")

    codes = self._codes
    if not len(codes):
        return self.dtype.na_value

    present = codes != -1
    if present.all():
        pointer = codes.min()
    elif skipna and present.any():
        pointer = codes[present].min()
    else:
        # a missing value was not skipped, or nothing but missing values
        return np.nan
    return self._wrap_reduction_result(None, pointer)
2019
def max(self, *, skipna: bool = True, **kwargs):
    """
    The maximum value of the object.

    Only ordered `Categoricals` have a maximum!

    Parameters
    ----------
    skipna : bool, default True
        Ignore missing values. If False, any missing value makes the
        result NaN.
    **kwargs
        Validated for numpy compatibility.

    Returns
    -------
    max : the maximum of this `Categorical`, NA if array is empty

    Raises
    ------
    TypeError
        If the `Categorical` is not `ordered`.
    """
    nv.validate_minmax_axis(kwargs.get("axis", 0))
    nv.validate_max((), kwargs)
    self.check_for_ordered("max")

    codes = self._codes
    if not len(codes):
        return self.dtype.na_value

    present = codes != -1
    if present.all():
        pointer = codes.max()
    elif skipna and present.any():
        pointer = codes[present].max()
    else:
        # a missing value was not skipped, or nothing but missing values
        return np.nan
    return self._wrap_reduction_result(None, pointer)
2051
2052 def _mode(self, dropna: bool = True) -> Categorical:
2053 codes = self._codes
2054 mask = None
2055 if dropna:
2056 mask = self.isna()
2057
2058 res_codes = algorithms.mode(codes, mask=mask)
2059 res_codes = cast(np.ndarray, res_codes)
2060 assert res_codes.dtype == codes.dtype
2061 res = self._from_backing_data(res_codes)
2062 return res
2063
2064 # ------------------------------------------------------------------
2065 # ExtensionArray Interface
2066
def unique(self):
    """
    Return the ``Categorical`` which ``categories`` and ``codes`` are
    unique.

    This override exists to carry the Categorical-specific documentation;
    the computation is delegated to the parent class implementation.

    .. versionchanged:: 1.3.0

        Previously, unused categories were dropped from the new categories.

    Returns
    -------
    Categorical

    See Also
    --------
    pandas.unique
    CategoricalIndex.unique
    Series.unique : Return unique values of Series object.

    Examples
    --------
    >>> pd.Categorical(list("baabc")).unique()
    ['b', 'a', 'c']
    Categories (3, object): ['a', 'b', 'c']
    >>> pd.Categorical(list("baab"), categories=list("abc"), ordered=True).unique()
    ['b', 'a']
    Categories (3, object): ['a' < 'b' < 'c']
    """
    # The delegation looks redundant to pylint but is kept on purpose.
    # pylint: disable=useless-parent-delegation
    return super().unique()
2097
2098 def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
2099 # make sure we have correct itemsize for resulting codes
2100 assert res_values.dtype == self._ndarray.dtype
2101 return res_values
2102
def equals(self, other: object) -> bool:
    """
    Returns True if categorical arrays are equal.

    `other` must be a Categorical whose categories match ours up to a
    permutation; it is re-encoded with our categories and the codes are
    then compared.

    Parameters
    ----------
    other : `Categorical`

    Returns
    -------
    bool
    """
    if not isinstance(other, Categorical):
        return False
    if not self._categories_match_up_to_permutation(other):
        return False

    recoded = self._encode_with_my_categories(other)
    return np.array_equal(self._codes, recoded._codes)
2121
@classmethod
def _concat_same_type(
    cls: type[CategoricalT], to_concat: Sequence[CategoricalT], axis: AxisInt = 0
) -> CategoricalT:
    """
    Concatenate several Categoricals into a single one.

    Parameters
    ----------
    to_concat : sequence of Categorical
        Arrays to join. The first element is inspected to validate
        ``axis``, so the sequence is expected to be non-empty.
    axis : int, default 0
        Axis to concatenate along. ``axis=1`` requires every array to be
        2-dimensional.

    Returns
    -------
    Categorical
        For ``axis=1`` the result is 2-dimensional with ``len(to_concat[0])``
        rows; otherwise it is whatever ``union_categoricals`` returns.

    Raises
    ------
    ValueError
        If ``axis`` is not smaller than the number of dimensions of the
        first array, or if ``axis=1`` and some array is not 2-dimensional.
    """
    from pandas.core.dtypes.concat import union_categoricals

    first = to_concat[0]
    if axis >= first.ndim:
        raise ValueError(
            f"axis {axis} is out of bounds for array of dimension {first.ndim}"
        )

    if axis == 1:
        # Flatten, concatenate then reshape
        if not all(x.ndim == 2 for x in to_concat):
            raise ValueError(
                "all arrays must be 2-dimensional to concatenate along axis=1"
            )

        # Hand union_categoricals correctly-shaped pieces: every column of
        # every input becomes one separate array, in left-to-right order.
        columns = [obj[:, i] for obj in to_concat for i in range(obj.shape[1])]
        flat = cls._concat_same_type(columns, axis=0)

        # Column-major reshape turns each run of len(first) values back
        # into one column of the result.
        return flat.reshape(len(first), -1, order="F")

    return union_categoricals(to_concat)
2151
2152 # ------------------------------------------------------------------
2153
def _encode_with_my_categories(self, other: Categorical) -> Categorical:
    """
    Express ``other`` using the categories of this Categorical.

    Parameters
    ----------
    other : Categorical

    Returns
    -------
    Categorical
        Built from the recoded codes via ``self._from_backing_data``.

    Notes
    -----
    Callers are expected to have verified
    ``self._categories_match_up_to_permutation(other)`` beforehand.
    """
    # copy=False lets recode_for_categories return the codes of ``other``
    # as they are when both sides already use the same categories.
    my_categories = self.categories
    recoded = recode_for_categories(
        other.codes, other.categories, my_categories, copy=False
    )
    return self._from_backing_data(recoded)
2170
def _categories_match_up_to_permutation(self, other: Categorical) -> bool:
    """
    Tell whether ``other`` has a dtype equivalent to ours.

    Equivalent means the same categories and the same ``ordered`` setting.
    The decision is made by comparing the hashes of the two dtypes.

    Parameters
    ----------
    other : Categorical

    Returns
    -------
    bool
    """
    own_hash = hash(self.dtype)
    other_hash = hash(other.dtype)
    return own_hash == other_hash
2185
def describe(self) -> DataFrame:
    """
    Summarize how often each category occurs.

    Returns
    -------
    DataFrame
        Two columns, ``"counts"`` and ``"freqs"`` (each count divided by
        the sum of all counts), with an index named ``"categories"``.
    """
    from pandas import Index
    from pandas.core.reshape.concat import concat

    # dropna=False: missing values take part in the tally as well
    tally = self.value_counts(dropna=False)
    shares = tally / tally.sum()

    summary = concat([tally, shares], axis=1)
    summary.columns = Index(["counts", "freqs"])
    summary.index.name = "categories"
    return summary
2206
2207 def isin(self, values) -> npt.NDArray[np.bool_]:
2208 """
2209 Check whether `values` are contained in Categorical.
2210
2211 Return a boolean NumPy Array showing whether each element in
2212 the Categorical matches an element in the passed sequence of
2213 `values` exactly.
2214
2215 Parameters
2216 ----------
2217 values : set or list-like
2218 The sequence of values to test. Passing in a single string will
2219 raise a ``TypeError``. Instead, turn a single string into a
2220 list of one element.
2221
2222 Returns
2223 -------
2224 np.ndarray[bool]
2225
2226 Raises
2227 ------
2228 TypeError
2229 * If `values` is not a set or list-like
2230
2231 See Also
2232 --------
2233 pandas.Series.isin : Equivalent method on Series.
2234
2235 Examples
2236 --------
2237 >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama',
2238 ... 'hippo'])
2239 >>> s.isin(['cow', 'lama'])
2240 array([ True, True, True, False, True, False])
2241
2242 Passing a single string as ``s.isin('lama')`` will raise an error. Use
2243 a list of one element instead:
2244
2245 >>> s.isin(['lama'])
2246 array([ True, False, True, False, True, False])
2247 """
2248 if not is_list_like(values):
2249 values_type = type(values).__name__
2250 raise TypeError(
2251 "only list-like objects are allowed to be passed "
2252 f"to isin(), you passed a [{values_type}]"
2253 )
2254 values = sanitize_array(values, None, None)
2255 null_mask = np.asarray(isna(values))
2256 code_values = self.categories.get_indexer(values)
2257 code_values = code_values[null_mask | (code_values >= 0)]
2258 return algorithms.isin(self.codes, code_values)
2259
def _replace(self, *, to_replace, value, inplace: bool = False):
    """
    Replace values by rewriting the categories of this Categorical.

    Parameters
    ----------
    to_replace, value
        Forwarded to ``Series.replace``, which is applied to the categories.
        Entries of ``to_replace`` whose ``value`` is missing are removed
        from the categories first.
    inplace : bool, default False
        Whether to modify this object rather than a copy of it.

    Returns
    -------
    Categorical or None
        The modified copy, or ``None`` when ``inplace=True``.
    """
    from pandas import Index

    inplace = validate_bool_kwarg(inplace, "inplace")
    target = self if inplace else self.copy()

    # Where the replacement is a missing value, drop the category outright.
    na_replacement = isna(np.asarray(value))
    if na_replacement.any():
        doomed = np.asarray(to_replace)[na_replacement]
        doomed = target.categories[target.categories.isin(doomed)]
        pruned = target.remove_categories(doomed)
        NDArrayBacked.__init__(target, pruned.codes, pruned.dtype)

    # One entry per (remaining) category, holding its replaced value.
    replaced = target.categories.to_series()
    replaced = replaced.replace(to_replace=to_replace, value=value)
    all_values = Index(replaced)

    # GH51016: maintain order of existing categories
    existing_pos = target.categories.get_indexer_for(all_values)
    order = np.arange(len(replaced))
    order = np.where(existing_pos == -1, order, existing_pos)
    order = order.argsort()

    # Several old categories may now share a value; keep the first of each.
    deduplicated = replaced.take(order).drop_duplicates(keep="first")
    new_categories = Index(deduplicated)
    new_codes = recode_for_categories(
        target._codes, all_values, new_categories, copy=False
    )
    new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered)
    NDArrayBacked.__init__(target, new_codes, new_dtype)

    if inplace:
        return None
    return target
2295
2296 # ------------------------------------------------------------------------
2297 # String methods interface
def _str_map(
    self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True
):
    """
    Apply the callable ``f`` to the values via the categories.

    Optimization: ``f`` is applied to the categories once and the result is
    rebuilt by taking from that with the codes. Returns the same type as
    the object-dtype implementation though.

    Parameters
    ----------
    f : callable
    na_value : scalar, default np.nan
        Forwarded to the object-dtype implementation and also used as the
        fill value when taking with the codes.
    dtype : np.dtype, default object
        Forwarded to the object-dtype implementation.
    convert : bool, default True
        Accepted but not used by this implementation.
    """
    from pandas.core.arrays import PandasArray

    per_category = PandasArray(self.categories.to_numpy())._str_map(
        f, na_value, dtype
    )
    return take_nd(per_category, self.codes, fill_value=na_value)
2310
def _str_get_dummies(self, sep: str = "|"):
    """
    Compute dummies through the ``PandasArray`` string implementation.

    Parameters
    ----------
    sep : str, default "|"
        Separator forwarded to ``PandasArray._str_get_dummies``.
    """
    from pandas.core.arrays import PandasArray

    # sep may not be in categories. Just bail on this: no per-category
    # shortcut is attempted, the values are converted to ``str`` instead.
    as_strings = self.astype(str)
    return PandasArray(as_strings)._str_get_dummies(sep)
2316
2317
2318# The Series.cat accessor
2319
2320
@delegate_names(
    delegate=Categorical, accessors=["categories", "ordered"], typ="property"
)
@delegate_names(
    delegate=Categorical,
    accessors=[
        "rename_categories",
        "reorder_categories",
        "add_categories",
        "remove_categories",
        "remove_unused_categories",
        "set_categories",
        "as_ordered",
        "as_unordered",
    ],
    typ="method",
)
class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
    """
    Accessor object for categorical properties of the Series values.

    Property reads and method calls are forwarded to the underlying
    ``Categorical``; non-``None`` method results are wrapped in a ``Series``
    that carries the index and name of the original data.

    Parameters
    ----------
    data : Series or CategoricalIndex

    Examples
    --------
    >>> s = pd.Series(list("abbccc")).astype("category")
    >>> s
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.categories
    Index(['a', 'b', 'c'], dtype='object')

    >>> s.cat.rename_categories(list("cba"))
    0    c
    1    b
    2    b
    3    a
    4    a
    5    a
    dtype: category
    Categories (3, object): ['c', 'b', 'a']

    >>> s.cat.reorder_categories(list("cba"))
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['c', 'b', 'a']

    >>> s.cat.add_categories(["d", "e"])
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (5, object): ['a', 'b', 'c', 'd', 'e']

    >>> s.cat.remove_categories(["a", "c"])
    0    NaN
    1      b
    2      b
    3    NaN
    4    NaN
    5    NaN
    dtype: category
    Categories (1, object): ['b']

    >>> s1 = s.cat.add_categories(["d", "e"])
    >>> s1.cat.remove_unused_categories()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.set_categories(list("abcde"))
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (5, object): ['a', 'b', 'c', 'd', 'e']

    >>> s.cat.as_ordered()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a' < 'b' < 'c']

    >>> s.cat.as_unordered()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']
    """

    def __init__(self, data) -> None:
        # Validation comes first and freezing comes last: every attribute
        # has to be assigned before ``_freeze`` is called.
        self._validate(data)
        self._parent = data.values
        self._index = data.index
        self._name = data.name
        self._freeze()

    @staticmethod
    def _validate(data):
        """Raise ``AttributeError`` unless ``data`` has a categorical dtype."""
        if is_categorical_dtype(data.dtype):
            return
        raise AttributeError("Can only use .cat accessor with a 'category' dtype")

    def _delegate_property_get(self, name):
        """Read attribute ``name`` from the underlying Categorical."""
        categorical = self._parent
        return getattr(categorical, name)

    def _delegate_property_set(self, name, new_values):
        """Assign ``new_values`` to attribute ``name`` of the Categorical."""
        categorical = self._parent
        return setattr(categorical, name, new_values)

    @property
    def codes(self) -> Series:
        """
        Return Series of codes as well as the index.
        """
        from pandas import Series

        raw_codes = self._parent.codes
        return Series(raw_codes, index=self._index)

    def _delegate_method(self, name, *args, **kwargs):
        """
        Call method ``name`` of the underlying Categorical.

        A non-``None`` result is wrapped in a Series with the original index
        and name; a ``None`` result is passed through as ``None``.
        """
        from pandas import Series

        bound = getattr(self._parent, name)
        outcome = bound(*args, **kwargs)
        if outcome is None:
            return None
        return Series(outcome, index=self._index, name=self._name)
2479
2480# utility routines
2481
2482
def _get_codes_for_values(values, categories: Index) -> np.ndarray:
    """
    Utility routine to turn values into codes given the specified categories.

    If `values` is known to be a Categorical, use recode_for_categories instead.

    Parameters
    ----------
    values : array-like exposing ``ndim``; ``ravel`` and ``shape`` are also
        used when it has more than one dimension
    categories : Index

    Returns
    -------
    np.ndarray
        Result of ``categories.get_indexer_for`` passed through
        ``coerce_indexer_dtype``, with the same shape as ``values``.
    """
    if values.ndim <= 1:
        positions = categories.get_indexer_for(values)
        return coerce_indexer_dtype(positions, categories)

    # More than one dimension: look the values up on a flattened version
    # and restore the original shape afterwards.
    flat_codes = _get_codes_for_values(values.ravel(), categories)
    return flat_codes.reshape(values.shape)
2496
2497
def recode_for_categories(
    codes: np.ndarray, old_categories, new_categories, copy: bool = True
) -> np.ndarray:
    """
    Convert a set of codes to a new set of categories.

    Parameters
    ----------
    codes : np.ndarray
    old_categories, new_categories : Index
    copy: bool, default True
        Whether to copy if the codes are unchanged.

    Returns
    -------
    new_codes : np.ndarray
        When the codes are unchanged (``old_categories`` is empty or equal
        to ``new_categories``) this is ``codes`` itself or a copy of it.
        Otherwise it holds the positions within ``new_categories``, using
        -1 where there is no corresponding new category.

    Examples
    --------
    >>> old_cat = pd.Index(['b', 'a', 'c'])
    >>> new_cat = pd.Index(['a', 'b'])
    >>> codes = np.array([0, 1, 1, 2])
    >>> recode_for_categories(codes, old_cat, new_cat)
    array([ 1,  0,  0, -1], dtype=int8)
    """
    # Two situations need no recoding: with no old categories everything is
    # null anyway, and with identical categories the codes already fit.
    nothing_to_do = len(old_categories) == 0 or new_categories.equals(
        old_categories
    )
    if nothing_to_do:
        return codes.copy() if copy else codes

    # old_to_new[i] is the position of old category i among the new ones.
    old_to_new = coerce_indexer_dtype(
        new_categories.get_indexer(old_categories), new_categories
    )
    return take_nd(old_to_new, codes, fill_value=-1)
2539
2540
def factorize_from_iterable(values) -> tuple[np.ndarray, Index]:
    """
    Factorize an input `values` into `categories` and `codes`. Preserves
    categorical dtype in `categories`.

    Parameters
    ----------
    values : list-like

    Returns
    -------
    codes : ndarray
    categories : Index
        If `values` has a categorical dtype, then `categories` is
        a CategoricalIndex keeping the categories and order of `values`.

    Raises
    ------
    TypeError
        If `values` is not list-like.
    """
    from pandas import CategoricalIndex

    if not is_list_like(values):
        raise TypeError("Input must be list-like")

    if not is_categorical_dtype(values):
        # The value of ordered is irrelevant since we don't use the
        # Categorical as such, but only the resulting categories, the order
        # of which is independent from ordered. Set ordered to False as
        # default. See GH #15457
        plain = Categorical(values, ordered=False)
        return plain.codes, plain.categories

    values = extract_array(values)
    # The Categorical we want to build has the same categories as values,
    # but its codes are by definition [0, ..., n_categories - 1].
    n_categories = len(values.categories)
    identity_codes = np.arange(n_categories, dtype=values.codes.dtype)
    one_of_each = Categorical.from_codes(identity_codes, dtype=values.dtype)
    return values.codes, CategoricalIndex(one_of_each)
2580
2581
def factorize_from_iterables(iterables) -> tuple[list[np.ndarray], list[Index]]:
    """
    Apply `factorize_from_iterable` to each of several list-likes.

    Parameters
    ----------
    iterables : list-like of list-likes

    Returns
    -------
    codes : list of ndarrays
    categories : list of Indexes

    Notes
    -----
    See `factorize_from_iterable` for more info.
    """
    if len(iterables) == 0:
        # For consistency, it should return two empty lists.
        return [], []

    # Each call yields a (codes, categories) pair; transpose the pairs into
    # one list of all codes and one list of all categories.
    pairs = (factorize_from_iterable(item) for item in iterables)
    codes, categories = map(list, zip(*pairs))
    return codes, categories