1from __future__ import annotations
2
3from typing import (
4 TYPE_CHECKING,
5 Any,
6 Literal,
7 cast,
8)
9
10import numpy as np
11
12from pandas._libs import index as libindex
13from pandas.util._decorators import (
14 cache_readonly,
15 doc,
16)
17
18from pandas.core.dtypes.common import is_scalar
19from pandas.core.dtypes.concat import concat_compat
20from pandas.core.dtypes.dtypes import CategoricalDtype
21from pandas.core.dtypes.missing import (
22 is_valid_na_for_dtype,
23 isna,
24)
25
26from pandas.core.arrays.categorical import (
27 Categorical,
28 contains,
29)
30from pandas.core.construction import extract_array
31from pandas.core.indexes.base import (
32 Index,
33 maybe_extract_name,
34)
35from pandas.core.indexes.extension import (
36 NDArrayBackedExtensionIndex,
37 inherit_names,
38)
39
40if TYPE_CHECKING:
41 from collections.abc import Hashable
42
43 from pandas._typing import (
44 Dtype,
45 DtypeObj,
46 Self,
47 npt,
48 )
49
50
51@inherit_names(
52 [
53 "argsort",
54 "tolist",
55 "codes",
56 "categories",
57 "ordered",
58 "_reverse_indexer",
59 "searchsorted",
60 "min",
61 "max",
62 ],
63 Categorical,
64)
65@inherit_names(
66 [
67 "rename_categories",
68 "reorder_categories",
69 "add_categories",
70 "remove_categories",
71 "remove_unused_categories",
72 "set_categories",
73 "as_ordered",
74 "as_unordered",
75 ],
76 Categorical,
77 wrap=True,
78)
79class CategoricalIndex(NDArrayBackedExtensionIndex):
80 """
81 Index based on an underlying :class:`Categorical`.
82
83 CategoricalIndex, like Categorical, can only take on a limited,
84 and usually fixed, number of possible values (`categories`). Also,
85 like Categorical, it might have an order, but numerical operations
86 (additions, divisions, ...) are not possible.
87
88 Parameters
89 ----------
90 data : array-like (1-dimensional)
91 The values of the categorical. If `categories` are given, values not in
92 `categories` will be replaced with NaN.
93 categories : index-like, optional
94 The categories for the categorical. Items need to be unique.
95 If the categories are not given here (and also not in `dtype`), they
96 will be inferred from the `data`.
97 ordered : bool, optional
98 Whether or not this categorical is treated as an ordered
99 categorical. If not given here or in `dtype`, the resulting
100 categorical will be unordered.
101 dtype : CategoricalDtype or "category", optional
102 If :class:`CategoricalDtype`, cannot be used together with
103 `categories` or `ordered`.
104 copy : bool, default False
105 Make a copy of input ndarray.
106 name : object, optional
107 Name to be stored in the index.
108
109 Attributes
110 ----------
111 codes
112 categories
113 ordered
114
115 Methods
116 -------
117 rename_categories
118 reorder_categories
119 add_categories
120 remove_categories
121 remove_unused_categories
122 set_categories
123 as_ordered
124 as_unordered
125 map
126
127 Raises
128 ------
129 ValueError
130 If the categories do not validate.
131 TypeError
        If an explicit ``ordered=True`` is given but no `categories` and the
        `data` are not sortable.
134
135 See Also
136 --------
137 Index : The base pandas Index type.
138 Categorical : A categorical array.
139 CategoricalDtype : Type for categorical data.
140
141 Notes
142 -----
143 See the `user guide
144 <https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#categoricalindex>`__
145 for more.
146
147 Examples
148 --------
149 >>> pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"])
150 CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],
151 categories=['a', 'b', 'c'], ordered=False, dtype='category')
152
153 ``CategoricalIndex`` can also be instantiated from a ``Categorical``:
154
155 >>> c = pd.Categorical(["a", "b", "c", "a", "b", "c"])
156 >>> pd.CategoricalIndex(c)
157 CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],
158 categories=['a', 'b', 'c'], ordered=False, dtype='category')
159
160 Ordered ``CategoricalIndex`` can have a min and max value.
161
162 >>> ci = pd.CategoricalIndex(
163 ... ["a", "b", "c", "a", "b", "c"], ordered=True, categories=["c", "b", "a"]
164 ... )
165 >>> ci
166 CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],
167 categories=['c', 'b', 'a'], ordered=True, dtype='category')
168 >>> ci.min()
169 'c'
170 """
171
172 _typ = "categoricalindex"
173 _data_cls = Categorical
174
175 @property
176 def _can_hold_strings(self):
177 return self.categories._can_hold_strings
178
179 @cache_readonly
180 def _should_fallback_to_positional(self) -> bool:
181 return self.categories._should_fallback_to_positional
182
183 codes: np.ndarray
184 categories: Index
185 ordered: bool | None
186 _data: Categorical
187 _values: Categorical
188
189 @property
190 def _engine_type(self) -> type[libindex.IndexEngine]:
191 # self.codes can have dtype int8, int16, int32 or int64, so we need
192 # to return the corresponding engine type (libindex.Int8Engine, etc.).
193 return {
194 np.int8: libindex.Int8Engine,
195 np.int16: libindex.Int16Engine,
196 np.int32: libindex.Int32Engine,
197 np.int64: libindex.Int64Engine,
198 }[self.codes.dtype.type]
199
200 # --------------------------------------------------------------------
201 # Constructors
202
203 def __new__(
204 cls,
205 data=None,
206 categories=None,
207 ordered=None,
208 dtype: Dtype | None = None,
209 copy: bool = False,
210 name: Hashable | None = None,
211 ) -> Self:
212 name = maybe_extract_name(name, data, cls)
213
214 if is_scalar(data):
215 # GH#38944 include None here, which pre-2.0 subbed in []
216 cls._raise_scalar_data_error(data)
217
218 data = Categorical(
219 data, categories=categories, ordered=ordered, dtype=dtype, copy=copy
220 )
221
222 return cls._simple_new(data, name=name)
223
224 # --------------------------------------------------------------------
225
226 def _is_dtype_compat(self, other: Index) -> Categorical:
227 """
228 *this is an internal non-public method*
229
230 provide a comparison between the dtype of self and other (coercing if
231 needed)
232
233 Parameters
234 ----------
235 other : Index
236
237 Returns
238 -------
239 Categorical
240
241 Raises
242 ------
243 TypeError if the dtypes are not compatible
244 """
245 if isinstance(other.dtype, CategoricalDtype):
246 cat = extract_array(other)
247 cat = cast(Categorical, cat)
248 if not cat._categories_match_up_to_permutation(self._values):
249 raise TypeError(
250 "categories must match existing categories when appending"
251 )
252
253 elif other._is_multi:
254 # preempt raising NotImplementedError in isna call
255 raise TypeError("MultiIndex is not dtype-compatible with CategoricalIndex")
256 else:
257 values = other
258
259 cat = Categorical(other, dtype=self.dtype)
260 other = CategoricalIndex(cat)
261 if not other.isin(values).all():
262 raise TypeError(
263 "cannot append a non-category item to a CategoricalIndex"
264 )
265 cat = other._values
266
267 if not ((cat == values) | (isna(cat) & isna(values))).all():
268 # GH#37667 see test_equals_non_category
269 raise TypeError(
270 "categories must match existing categories when appending"
271 )
272
273 return cat
274
275 def equals(self, other: object) -> bool:
276 """
277 Determine if two CategoricalIndex objects contain the same elements.
278
279 Returns
280 -------
281 bool
282 ``True`` if two :class:`pandas.CategoricalIndex` objects have equal
283 elements, ``False`` otherwise.
284
285 Examples
286 --------
287 >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'])
288 >>> ci2 = pd.CategoricalIndex(pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c']))
289 >>> ci.equals(ci2)
290 True
291
292 The order of elements matters.
293
294 >>> ci3 = pd.CategoricalIndex(['c', 'b', 'a', 'a', 'b', 'c'])
295 >>> ci.equals(ci3)
296 False
297
298 The orderedness also matters.
299
300 >>> ci4 = ci.as_ordered()
301 >>> ci.equals(ci4)
302 False
303
304 The categories matter, but the order of the categories matters only when
305 ``ordered=True``.
306
307 >>> ci5 = ci.set_categories(['a', 'b', 'c', 'd'])
308 >>> ci.equals(ci5)
309 False
310
311 >>> ci6 = ci.set_categories(['b', 'c', 'a'])
312 >>> ci.equals(ci6)
313 True
314 >>> ci_ordered = pd.CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],
315 ... ordered=True)
316 >>> ci2_ordered = ci_ordered.set_categories(['b', 'c', 'a'])
317 >>> ci_ordered.equals(ci2_ordered)
318 False
319 """
320 if self.is_(other):
321 return True
322
323 if not isinstance(other, Index):
324 return False
325
326 try:
327 other = self._is_dtype_compat(other)
328 except (TypeError, ValueError):
329 return False
330
331 return self._data.equals(other)
332
333 # --------------------------------------------------------------------
334 # Rendering Methods
335
336 @property
337 def _formatter_func(self):
338 return self.categories._formatter_func
339
340 def _format_attrs(self):
341 """
342 Return a list of tuples of the (attr,formatted_value)
343 """
344 attrs: list[tuple[str, str | int | bool | None]]
345
346 attrs = [
347 (
348 "categories",
349 f"[{', '.join(self._data._repr_categories())}]",
350 ),
351 ("ordered", self.ordered),
352 ]
353 extra = super()._format_attrs()
354 return attrs + extra
355
356 # --------------------------------------------------------------------
357
358 @property
359 def inferred_type(self) -> str:
360 return "categorical"
361
362 @doc(Index.__contains__)
363 def __contains__(self, key: Any) -> bool:
364 # if key is a NaN, check if any NaN is in self.
365 if is_valid_na_for_dtype(key, self.categories.dtype):
366 return self.hasnans
367
368 return contains(self, key, container=self._engine)
369
370 def reindex(
371 self, target, method=None, level=None, limit: int | None = None, tolerance=None
372 ) -> tuple[Index, npt.NDArray[np.intp] | None]:
373 """
374 Create index with target's values (move/add/delete values as necessary)
375
376 Returns
377 -------
378 new_index : pd.Index
379 Resulting index
380 indexer : np.ndarray[np.intp] or None
381 Indices of output values in original index
382
383 """
384 if method is not None:
385 raise NotImplementedError(
386 "argument method is not implemented for CategoricalIndex.reindex"
387 )
388 if level is not None:
389 raise NotImplementedError(
390 "argument level is not implemented for CategoricalIndex.reindex"
391 )
392 if limit is not None:
393 raise NotImplementedError(
394 "argument limit is not implemented for CategoricalIndex.reindex"
395 )
396 return super().reindex(target)
397
398 # --------------------------------------------------------------------
399 # Indexing Methods
400
401 def _maybe_cast_indexer(self, key) -> int:
402 # GH#41933: we have to do this instead of self._data._validate_scalar
403 # because this will correctly get partial-indexing on Interval categories
404 try:
405 return self._data._unbox_scalar(key)
406 except KeyError:
407 if is_valid_na_for_dtype(key, self.categories.dtype):
408 return -1
409 raise
410
411 def _maybe_cast_listlike_indexer(self, values) -> CategoricalIndex:
412 if isinstance(values, CategoricalIndex):
413 values = values._data
414 if isinstance(values, Categorical):
415 # Indexing on codes is more efficient if categories are the same,
416 # so we can apply some optimizations based on the degree of
417 # dtype-matching.
418 cat = self._data._encode_with_my_categories(values)
419 codes = cat._codes
420 else:
421 codes = self.categories.get_indexer(values)
422 codes = codes.astype(self.codes.dtype, copy=False)
423 cat = self._data._from_backing_data(codes)
424 return type(self)._simple_new(cat)
425
426 # --------------------------------------------------------------------
427
428 def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
429 return self.categories._is_comparable_dtype(dtype)
430
431 def map(self, mapper, na_action: Literal["ignore"] | None = None):
432 """
433 Map values using input an input mapping or function.
434
435 Maps the values (their categories, not the codes) of the index to new
436 categories. If the mapping correspondence is one-to-one the result is a
437 :class:`~pandas.CategoricalIndex` which has the same order property as
438 the original, otherwise an :class:`~pandas.Index` is returned.
439
440 If a `dict` or :class:`~pandas.Series` is used any unmapped category is
441 mapped to `NaN`. Note that if this happens an :class:`~pandas.Index`
442 will be returned.
443
444 Parameters
445 ----------
446 mapper : function, dict, or Series
447 Mapping correspondence.
448
449 Returns
450 -------
451 pandas.CategoricalIndex or pandas.Index
452 Mapped index.
453
454 See Also
455 --------
456 Index.map : Apply a mapping correspondence on an
457 :class:`~pandas.Index`.
458 Series.map : Apply a mapping correspondence on a
459 :class:`~pandas.Series`.
460 Series.apply : Apply more complex functions on a
461 :class:`~pandas.Series`.
462
463 Examples
464 --------
465 >>> idx = pd.CategoricalIndex(['a', 'b', 'c'])
466 >>> idx
467 CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'],
468 ordered=False, dtype='category')
469 >>> idx.map(lambda x: x.upper())
470 CategoricalIndex(['A', 'B', 'C'], categories=['A', 'B', 'C'],
471 ordered=False, dtype='category')
472 >>> idx.map({'a': 'first', 'b': 'second', 'c': 'third'})
473 CategoricalIndex(['first', 'second', 'third'], categories=['first',
474 'second', 'third'], ordered=False, dtype='category')
475
476 If the mapping is one-to-one the ordering of the categories is
477 preserved:
478
479 >>> idx = pd.CategoricalIndex(['a', 'b', 'c'], ordered=True)
480 >>> idx
481 CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'],
482 ordered=True, dtype='category')
483 >>> idx.map({'a': 3, 'b': 2, 'c': 1})
484 CategoricalIndex([3, 2, 1], categories=[3, 2, 1], ordered=True,
485 dtype='category')
486
487 If the mapping is not one-to-one an :class:`~pandas.Index` is returned:
488
489 >>> idx.map({'a': 'first', 'b': 'second', 'c': 'first'})
490 Index(['first', 'second', 'first'], dtype='object')
491
492 If a `dict` is used, all unmapped categories are mapped to `NaN` and
493 the result is an :class:`~pandas.Index`:
494
495 >>> idx.map({'a': 'first', 'b': 'second'})
496 Index(['first', 'second', nan], dtype='object')
497 """
498 mapped = self._values.map(mapper, na_action=na_action)
499 return Index(mapped, name=self.name)
500
501 def _concat(self, to_concat: list[Index], name: Hashable) -> Index:
502 # if calling index is category, don't check dtype of others
503 try:
504 cat = Categorical._concat_same_type(
505 [self._is_dtype_compat(c) for c in to_concat]
506 )
507 except TypeError:
508 # not all to_concat elements are among our categories (or NA)
509
510 res = concat_compat([x._values for x in to_concat])
511 return Index(res, name=name)
512 else:
513 return type(self)._simple_new(cat, name=name)