1from __future__ import annotations
2
3from typing import (
4 Any,
5 Hashable,
6)
7
8import numpy as np
9
10from pandas._libs import index as libindex
11from pandas._typing import (
12 Dtype,
13 DtypeObj,
14 npt,
15)
16from pandas.util._decorators import (
17 cache_readonly,
18 doc,
19)
20
21from pandas.core.dtypes.common import (
22 is_categorical_dtype,
23 is_scalar,
24)
25from pandas.core.dtypes.missing import (
26 is_valid_na_for_dtype,
27 isna,
28 notna,
29)
30
31from pandas.core.arrays.categorical import (
32 Categorical,
33 contains,
34)
35from pandas.core.construction import extract_array
36import pandas.core.indexes.base as ibase
37from pandas.core.indexes.base import (
38 Index,
39 maybe_extract_name,
40)
41from pandas.core.indexes.extension import (
42 NDArrayBackedExtensionIndex,
43 inherit_names,
44)
45
46from pandas.io.formats.printing import pprint_thing
47
# Module-private copy of the base Index docstring substitutions, so that
# overriding ``target_klass`` here never mutates the shared base mapping.
_index_doc_kwargs: dict[str, str] = dict(ibase._index_doc_kwargs)
_index_doc_kwargs["target_klass"] = "CategoricalIndex"
50
51
@inherit_names(
    [
        "argsort",
        "tolist",
        "codes",
        "categories",
        "ordered",
        "_reverse_indexer",
        "searchsorted",
        "min",
        "max",
    ],
    Categorical,
)
@inherit_names(
    [
        "rename_categories",
        "reorder_categories",
        "add_categories",
        "remove_categories",
        "remove_unused_categories",
        "set_categories",
        "as_ordered",
        "as_unordered",
    ],
    Categorical,
    wrap=True,
)
class CategoricalIndex(NDArrayBackedExtensionIndex):
    """
    Index based on an underlying :class:`Categorical`.

    CategoricalIndex, like Categorical, can only take on a limited,
    and usually fixed, number of possible values (`categories`). Also,
    like Categorical, it might have an order, but numerical operations
    (additions, divisions, ...) are not possible.

    Parameters
    ----------
    data : array-like (1-dimensional)
        The values of the categorical. If `categories` are given, values not in
        `categories` will be replaced with NaN.
    categories : index-like, optional
        The categories for the categorical. Items need to be unique.
        If the categories are not given here (and also not in `dtype`), they
        will be inferred from the `data`.
    ordered : bool, optional
        Whether or not this categorical is treated as an ordered
        categorical. If not given here or in `dtype`, the resulting
        categorical will be unordered.
    dtype : CategoricalDtype or "category", optional
        If :class:`CategoricalDtype`, cannot be used together with
        `categories` or `ordered`.
    copy : bool, default False
        Make a copy of input ndarray.
    name : object, optional
        Name to be stored in the index.

    Attributes
    ----------
    codes
    categories
    ordered

    Methods
    -------
    rename_categories
    reorder_categories
    add_categories
    remove_categories
    remove_unused_categories
    set_categories
    as_ordered
    as_unordered
    map

    Raises
    ------
    ValueError
        If the categories do not validate.
    TypeError
        If an explicit ``ordered=True`` is given but no `categories` and the
        `data` are not sortable.

    See Also
    --------
    Index : The base pandas Index type.
    Categorical : A categorical array.
    CategoricalDtype : Type for categorical data.

    Notes
    -----
    See the `user guide
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#categoricalindex>`__
    for more.

    Examples
    --------
    >>> pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"])
    CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],
                     categories=['a', 'b', 'c'], ordered=False, dtype='category')

    ``CategoricalIndex`` can also be instantiated from a ``Categorical``:

    >>> c = pd.Categorical(["a", "b", "c", "a", "b", "c"])
    >>> pd.CategoricalIndex(c)
    CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],
                     categories=['a', 'b', 'c'], ordered=False, dtype='category')

    Ordered ``CategoricalIndex`` can have a min and max value.

    >>> ci = pd.CategoricalIndex(
    ...     ["a", "b", "c", "a", "b", "c"], ordered=True, categories=["c", "b", "a"]
    ... )
    >>> ci
    CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'],
                     categories=['c', 'b', 'a'], ordered=True, dtype='category')
    >>> ci.min()
    'c'
    """

    _typ = "categoricalindex"
    _data_cls = Categorical

    @property
    def _can_hold_strings(self):
        # Delegated: whatever the categories Index reports.
        return self.categories._can_hold_strings

    @cache_readonly
    def _should_fallback_to_positional(self) -> bool:
        # Delegated: whatever the categories Index reports.
        return self.categories._should_fallback_to_positional

    # Annotation-only declarations (no values are assigned here).
    # ``codes``, ``categories`` and ``ordered`` are among the names delegated
    # to Categorical by the ``inherit_names`` decorators on this class.
    codes: np.ndarray
    categories: Index
    ordered: bool | None
    _data: Categorical
    _values: Categorical

    @property
    def _engine_type(self) -> type[libindex.IndexEngine]:
        # self.codes can have dtype int8, int16, int32 or int64, so we need
        # to return the corresponding engine type (libindex.Int8Engine, etc.).
        return {
            np.int8: libindex.Int8Engine,
            np.int16: libindex.Int16Engine,
            np.int32: libindex.Int32Engine,
            np.int64: libindex.Int64Engine,
        }[self.codes.dtype.type]

    # --------------------------------------------------------------------
    # Constructors

    def __new__(
        cls,
        data=None,
        categories=None,
        ordered=None,
        dtype: Dtype | None = None,
        copy: bool = False,
        name: Hashable | None = None,
    ) -> CategoricalIndex:
        # Resolve the name first, while ``data`` is still the caller's object.
        name = maybe_extract_name(name, data, cls)

        if is_scalar(data):
            # GH#38944 include None here, which pre-2.0 subbed in []
            cls._raise_scalar_data_error(data)

        data = Categorical(
            data, categories=categories, ordered=ordered, dtype=dtype, copy=copy
        )

        return cls._simple_new(data, name=name)

    # --------------------------------------------------------------------

    def _is_dtype_compat(self, other) -> Categorical:
        """
        *this is an internal non-public method*

        Provide a comparison between the dtype of self and other (coercing if
        needed).

        Parameters
        ----------
        other : Index

        Returns
        -------
        Categorical
            ``other`` as a Categorical: the extracted array when ``other`` is
            already categorical, otherwise ``other`` cast to ``self.dtype``.

        Raises
        ------
        TypeError
            If the dtypes are not compatible: mismatched categories, a
            MultiIndex, or values that do not survive the cast to
            ``self.dtype`` unchanged.
        """
        if is_categorical_dtype(other):
            other = extract_array(other)
            if not other._categories_match_up_to_permutation(self):
                raise TypeError(
                    "categories must match existing categories when appending"
                )

        elif other._is_multi:
            # preempt raising NotImplementedError in isna call
            raise TypeError("MultiIndex is not dtype-compatible with CategoricalIndex")
        else:
            # Keep the un-cast values so the cast result can be checked
            # against them below.
            values = other

            cat = Categorical(other, dtype=self.dtype)
            other = CategoricalIndex(cat)
            if not other.isin(values).all():
                raise TypeError(
                    "cannot append a non-category item to a CategoricalIndex"
                )
            other = other._values

            # Every position must either compare equal or be NA on both sides.
            if not ((other == values) | (isna(other) & isna(values))).all():
                # GH#37667 see test_equals_non_category
                raise TypeError(
                    "categories must match existing categories when appending"
                )

        return other

    def equals(self, other: object) -> bool:
        """
        Determine if two CategoricalIndex objects contain the same elements.

        Parameters
        ----------
        other : object
            The object to compare against. Anything that is not an
            :class:`Index` compares unequal.

        Returns
        -------
        bool
            If two CategoricalIndex objects have equal elements True,
            otherwise False.
        """
        if self.is_(other):
            return True

        if not isinstance(other, Index):
            return False

        try:
            other = self._is_dtype_compat(other)
        except (TypeError, ValueError):
            # Incompatible dtype / categories means "not equal", not an error.
            return False

        return self._data.equals(other)

    # --------------------------------------------------------------------
    # Rendering Methods

    @property
    def _formatter_func(self):
        return self.categories._formatter_func

    def _format_attrs(self):
        """
        Return a list of tuples of the (attr,formatted_value)
        """
        attrs: list[tuple[str, str | int | bool | None]]

        attrs = [
            (
                "categories",
                f"[{', '.join(self._data._repr_categories())}]",
            ),
            ("ordered", self.ordered),
        ]
        # Category-specific attributes come first, then the parent's.
        extra = super()._format_attrs()
        return attrs + extra

    def _format_with_header(self, header: list[str], na_rep: str) -> list[str]:
        """
        Return ``header`` followed by one formatted string per value.

        Missing values are rendered as ``na_rep``; everything else goes
        through ``pprint_thing`` with tab, carriage return and newline escaped.
        """
        result = [
            pprint_thing(x, escape_chars=("\t", "\r", "\n")) if notna(x) else na_rep
            for x in self._values
        ]
        return header + result

    # --------------------------------------------------------------------

    @property
    def inferred_type(self) -> str:
        return "categorical"

    # No docstring here on purpose: ``@doc`` supplies it from Index.
    @doc(Index.__contains__)
    def __contains__(self, key: Any) -> bool:
        # if key is a NaN, check if any NaN is in self.
        if is_valid_na_for_dtype(key, self.categories.dtype):
            return self.hasnans

        return contains(self, key, container=self._engine)

    def reindex(
        self, target, method=None, level=None, limit=None, tolerance=None
    ) -> tuple[Index, npt.NDArray[np.intp] | None]:
        """
        Create index with target's values (move/add/delete values as necessary)

        Parameters
        ----------
        target : an iterable
            Passed through to the parent class's ``reindex``.
        method, level, limit : None
            Not implemented for CategoricalIndex; must be left as None.
        tolerance : optional
            Accepted in the signature but not forwarded to the parent
            implementation.

        Returns
        -------
        new_index : pd.Index
            Resulting index
        indexer : np.ndarray[np.intp] or None
            Indices of output values in original index

        Raises
        ------
        NotImplementedError
            If `method`, `level` or `limit` is not None.
        """
        # Checked in this order so the first offending argument is reported.
        unsupported = (("method", method), ("level", level), ("limit", limit))
        for arg_name, arg_value in unsupported:
            if arg_value is not None:
                raise NotImplementedError(
                    f"argument {arg_name} is not implemented for "
                    "CategoricalIndex.reindex"
                )
        return super().reindex(target)

    # --------------------------------------------------------------------
    # Indexing Methods

    def _maybe_cast_indexer(self, key) -> int:
        """
        Unbox a scalar ``key`` via the underlying Categorical.

        Returns -1 when unboxing raises ``KeyError`` and ``key`` is a valid NA
        for the categories' dtype; any other ``KeyError`` is re-raised.
        """
        # GH#41933: we have to do this instead of self._data._validate_scalar
        #  because this will correctly get partial-indexing on Interval categories
        try:
            return self._data._unbox_scalar(key)
        except KeyError:
            if is_valid_na_for_dtype(key, self.categories.dtype):
                return -1
            raise

    def _maybe_cast_listlike_indexer(self, values) -> CategoricalIndex:
        """
        Wrap list-like ``values`` in an index of this type.

        Categorical input (or a CategoricalIndex, via its ``_data``) is
        re-encoded with ``_encode_with_my_categories``. Anything else is looked
        up in ``self.categories`` with ``get_indexer`` and the resulting codes
        are cast to this index's codes dtype.
        """
        if isinstance(values, CategoricalIndex):
            values = values._data
        if isinstance(values, Categorical):
            # Indexing on codes is more efficient if categories are the same,
            #  so we can apply some optimizations based on the degree of
            #  dtype-matching.
            cat = self._data._encode_with_my_categories(values)
            codes = cat._codes
        else:
            codes = self.categories.get_indexer(values)
            codes = codes.astype(self.codes.dtype, copy=False)
            cat = self._data._from_backing_data(codes)
        return type(self)._simple_new(cat)

    # --------------------------------------------------------------------

    def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
        """Defer the comparability check for ``dtype`` to the categories."""
        return self.categories._is_comparable_dtype(dtype)

    def map(self, mapper):
        """
        Map values using an input mapping or function.

        Maps the values (their categories, not the codes) of the index to new
        categories. If the mapping correspondence is one-to-one the result is a
        :class:`~pandas.CategoricalIndex` which has the same order property as
        the original, otherwise an :class:`~pandas.Index` is returned.

        If a `dict` or :class:`~pandas.Series` is used any unmapped category is
        mapped to `NaN`. Note that if this happens an :class:`~pandas.Index`
        will be returned.

        Parameters
        ----------
        mapper : function, dict, or Series
            Mapping correspondence.

        Returns
        -------
        pandas.CategoricalIndex or pandas.Index
            Mapped index.

        See Also
        --------
        Index.map : Apply a mapping correspondence on an
            :class:`~pandas.Index`.
        Series.map : Apply a mapping correspondence on a
            :class:`~pandas.Series`.
        Series.apply : Apply more complex functions on a
            :class:`~pandas.Series`.

        Examples
        --------
        >>> idx = pd.CategoricalIndex(['a', 'b', 'c'])
        >>> idx
        CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'],
                          ordered=False, dtype='category')
        >>> idx.map(lambda x: x.upper())
        CategoricalIndex(['A', 'B', 'C'], categories=['A', 'B', 'C'],
                         ordered=False, dtype='category')
        >>> idx.map({'a': 'first', 'b': 'second', 'c': 'third'})
        CategoricalIndex(['first', 'second', 'third'], categories=['first',
                         'second', 'third'], ordered=False, dtype='category')

        If the mapping is one-to-one the ordering of the categories is
        preserved:

        >>> idx = pd.CategoricalIndex(['a', 'b', 'c'], ordered=True)
        >>> idx
        CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'],
                         ordered=True, dtype='category')
        >>> idx.map({'a': 3, 'b': 2, 'c': 1})
        CategoricalIndex([3, 2, 1], categories=[3, 2, 1], ordered=True,
                         dtype='category')

        If the mapping is not one-to-one an :class:`~pandas.Index` is returned:

        >>> idx.map({'a': 'first', 'b': 'second', 'c': 'first'})
        Index(['first', 'second', 'first'], dtype='object')

        If a `dict` is used, all unmapped categories are mapped to `NaN` and
        the result is an :class:`~pandas.Index`:

        >>> idx.map({'a': 'first', 'b': 'second'})
        Index(['first', 'second', nan], dtype='object')
        """
        mapped = self._values.map(mapper)
        # Let the Index constructor choose the resulting index type.
        return Index(mapped, name=self.name)

    def _concat(self, to_concat: list[Index], name: Hashable) -> Index:
        """
        Concatenate ``to_concat`` into a single index named ``name``.

        Returns an index of this type when every element passes
        ``_is_dtype_compat``; on ``TypeError`` falls back to ``concat_compat``
        over the raw values and wraps the result in a plain ``Index`` call.
        """
        # if calling index is category, don't check dtype of others
        try:
            cat = Categorical._concat_same_type(
                [self._is_dtype_compat(c) for c in to_concat]
            )
        except TypeError:
            # not all to_concat elements are among our categories (or NA)
            from pandas.core.dtypes.concat import concat_compat

            res = concat_compat([x._values for x in to_concat])
            return Index(res, name=name)
        else:
            return type(self)._simple_new(cat, name=name)