Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.9/dist-packages/pandas/core/arrays/categorical.py: 26%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

808 statements  

1from __future__ import annotations 

2 

3from csv import QUOTE_NONNUMERIC 

4from functools import partial 

5import operator 

6from shutil import get_terminal_size 

7from typing import ( 

8 TYPE_CHECKING, 

9 Literal, 

10 cast, 

11 overload, 

12) 

13import warnings 

14 

15import numpy as np 

16 

17from pandas._config import get_option 

18 

19from pandas._libs import ( 

20 NaT, 

21 algos as libalgos, 

22 lib, 

23) 

24from pandas._libs.arrays import NDArrayBacked 

25from pandas.compat.numpy import function as nv 

26from pandas.util._exceptions import find_stack_level 

27from pandas.util._validators import validate_bool_kwarg 

28 

29from pandas.core.dtypes.cast import ( 

30 coerce_indexer_dtype, 

31 find_common_type, 

32) 

33from pandas.core.dtypes.common import ( 

34 ensure_int64, 

35 ensure_platform_int, 

36 is_any_real_numeric_dtype, 

37 is_bool_dtype, 

38 is_dict_like, 

39 is_hashable, 

40 is_integer_dtype, 

41 is_list_like, 

42 is_scalar, 

43 needs_i8_conversion, 

44 pandas_dtype, 

45) 

46from pandas.core.dtypes.dtypes import ( 

47 ArrowDtype, 

48 CategoricalDtype, 

49 CategoricalDtypeType, 

50 ExtensionDtype, 

51) 

52from pandas.core.dtypes.generic import ( 

53 ABCIndex, 

54 ABCSeries, 

55) 

56from pandas.core.dtypes.missing import ( 

57 is_valid_na_for_dtype, 

58 isna, 

59) 

60 

61from pandas.core import ( 

62 algorithms, 

63 arraylike, 

64 ops, 

65) 

66from pandas.core.accessor import ( 

67 PandasDelegate, 

68 delegate_names, 

69) 

70from pandas.core.algorithms import ( 

71 factorize, 

72 take_nd, 

73) 

74from pandas.core.arrays._mixins import ( 

75 NDArrayBackedExtensionArray, 

76 ravel_compat, 

77) 

78from pandas.core.base import ( 

79 ExtensionArray, 

80 NoNewAttributesMixin, 

81 PandasObject, 

82) 

83import pandas.core.common as com 

84from pandas.core.construction import ( 

85 extract_array, 

86 sanitize_array, 

87) 

88from pandas.core.ops.common import unpack_zerodim_and_defer 

89from pandas.core.sorting import nargsort 

90from pandas.core.strings.object_array import ObjectStringArrayMixin 

91 

92from pandas.io.formats import console 

93 

94if TYPE_CHECKING: 

95 from collections.abc import ( 

96 Hashable, 

97 Iterator, 

98 Sequence, 

99 ) 

100 

101 from pandas._typing import ( 

102 ArrayLike, 

103 AstypeArg, 

104 AxisInt, 

105 Dtype, 

106 DtypeObj, 

107 NpDtype, 

108 Ordered, 

109 Self, 

110 Shape, 

111 SortKind, 

112 npt, 

113 ) 

114 

115 from pandas import ( 

116 DataFrame, 

117 Index, 

118 Series, 

119 ) 

120 

121 

def _cat_compare_op(op):
    """
    Build a comparison dunder for ``Categorical`` from an ``operator`` function.

    Parameters
    ----------
    op : callable
        A comparison function such as ``operator.eq``; its ``__name__``
        determines the name of the generated method (``__eq__``, ...).

    Returns
    -------
    callable
        ``func(self, other)`` wrapped with ``unpack_zerodim_and_defer`` and
        renamed to the dunder name.
    """
    method_name = f"__{op.__name__}__"
    # Value written at positions whose code is -1 (missing): True only
    # for ``!=``, False for every other comparison.
    na_result = op is operator.ne

    @unpack_zerodim_and_defer(method_name)
    def func(self, other):
        other_is_hashable = is_hashable(other)
        if is_list_like(other) and len(other) != len(self) and not other_is_hashable:
            # A hashable list-like (e.g. a tuple) may itself be a category,
            # so only unhashable list-likes are length-checked.
            raise ValueError("Lengths must match.")

        if not self.ordered and method_name in ["__lt__", "__gt__", "__le__", "__ge__"]:
            raise TypeError(
                "Unordered Categoricals can only compare equality or not"
            )

        if isinstance(other, Categorical):
            # Categorical vs Categorical: the categories must be the same,
            # possibly in a different order.
            if not self._categories_match_up_to_permutation(other):
                raise TypeError(
                    "Categoricals can only be compared if 'categories' are the same."
                )

            needs_recode = not self.ordered and not self.categories.equals(
                other.categories
            )
            if needs_recode:
                # same categories in a different order: express the other
                # side's codes in terms of our categories
                rhs_codes = recode_for_categories(
                    other.codes, other.categories, self.categories, copy=False
                )
            else:
                rhs_codes = other._codes

            result = op(self._codes, rhs_codes)
            missing = (self._codes == -1) | (rhs_codes == -1)
            if missing.any():
                result[missing] = na_result
            return result

        if not other_is_hashable:
            # Unhashable, non-Categorical operand: only positional
            # equality / inequality comparisons are allowed.
            if method_name not in ["__eq__", "__ne__"]:
                raise TypeError(
                    f"Cannot compare a Categorical for op {method_name} with "
                    f"type {type(other)}.\nIf you want to compare values, "
                    "use 'np.asarray(cat) <op> other'."
                )

            if isinstance(other, ExtensionArray) and needs_i8_conversion(other.dtype):
                # Delegate to the other operand rather than returning
                # NotImplemented, which would interfere with
                # ExtensionIndex's wrapped methods.
                return op(other, self)
            return getattr(np.array(self), method_name)(np.array(other))

        if other not in self.categories:
            return ops.invalid_comparison(self, other, op)

        code = self._unbox_scalar(other)
        result = op(self._codes, code)
        if method_name not in {"__eq__", "__ge__", "__gt__"}:
            # GH#29820 performance trick: a category's code is >= 0, so
            # for ==, >= and > the -1 positions already compare False and
            # need no fix-up; the remaining operators are filled here.
            result[self._codes == -1] = na_result
        return result

    func.__name__ = method_name

    return func

193 

194 

def contains(cat, key, container) -> bool:
    """
    Check whether ``key`` is a category of ``cat`` whose position is in ``container``.

    Helper shared by ``Categorical.__contains__`` and
    ``CategoricalIndex.__contains__``.

    Parameters
    ----------
    cat : Categorical or CategoricalIndex
        Object whose ``categories`` are searched for ``key``.
    key : hashable
        The key to check membership for.
    container : Container (e.g. list-like or mapping)
        Collection that must contain the position of ``key`` within
        ``cat.categories``.

    Returns
    -------
    bool
        True if ``key`` is found in ``cat.categories`` and its location
        (or any of its locations) is in ``container``; False otherwise.

    Notes
    -----
    NaN values are not handled here; check for them before calling.
    """
    # Fail early with TypeError when the key is unhashable.
    hash(key)

    # A key that cannot be located in the categories cannot be present.
    try:
        position = cat.categories.get_loc(key)
    except (KeyError, TypeError):
        return False

    # Being a category is not enough: the category's position must also
    # occur in ``container``, e.g.
    #   'b' in Categorical(['a'], categories=['a', 'b'])  # False
    if not is_scalar(position):
        # get_loc may return an array of positions (e.g. when categories
        # is an IntervalIndex); any match counts.
        return any(pos in container for pos in position)
    return position in container

244 

245 

246class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin): 

247 """ 

248 Represent a categorical variable in classic R / S-plus fashion. 

249 

250 `Categoricals` can only take on a limited, and usually fixed, number 

251 of possible values (`categories`). In contrast to statistical categorical 

252 variables, a `Categorical` might have an order, but numerical operations 

253 (additions, divisions, ...) are not possible. 

254 

255 All values of the `Categorical` are either in `categories` or `np.nan`. 

256 Assigning values outside of `categories` will raise a `ValueError`. Order 

257 is defined by the order of the `categories`, not lexical order of the 

258 values. 

259 

260 Parameters 

261 ---------- 

262 values : list-like 

263 The values of the categorical. If categories are given, values not in 

264 categories will be replaced with NaN. 

265 categories : Index-like (unique), optional 

266 The unique categories for this categorical. If not given, the 

267 categories are assumed to be the unique values of `values` (sorted, if 

268 possible, otherwise in the order in which they appear). 

269 ordered : bool, default False 

270 Whether or not this categorical is treated as a ordered categorical. 

271 If True, the resulting categorical will be ordered. 

272 An ordered categorical respects, when sorted, the order of its 

273 `categories` attribute (which in turn is the `categories` argument, if 

274 provided). 

275 dtype : CategoricalDtype 

276 An instance of ``CategoricalDtype`` to use for this categorical. 

277 

278 Attributes 

279 ---------- 

280 categories : Index 

281 The categories of this categorical. 

282 codes : ndarray 

283 The codes (integer positions, which point to the categories) of this 

284 categorical, read only. 

285 ordered : bool 

286 Whether or not this Categorical is ordered. 

287 dtype : CategoricalDtype 

288 The instance of ``CategoricalDtype`` storing the ``categories`` 

289 and ``ordered``. 

290 

291 Methods 

292 ------- 

293 from_codes 

294 __array__ 

295 

296 Raises 

297 ------ 

298 ValueError 

299 If the categories do not validate. 

300 TypeError 

301 If an explicit ``ordered=True`` is given but no `categories` and the 

302 `values` are not sortable. 

303 

304 See Also 

305 -------- 

306 CategoricalDtype : Type for categorical data. 

307 CategoricalIndex : An Index with an underlying ``Categorical``. 

308 

309 Notes 

310 ----- 

311 See the `user guide 

312 <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`__ 

313 for more. 

314 

315 Examples 

316 -------- 

317 >>> pd.Categorical([1, 2, 3, 1, 2, 3]) 

318 [1, 2, 3, 1, 2, 3] 

319 Categories (3, int64): [1, 2, 3] 

320 

321 >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c']) 

322 ['a', 'b', 'c', 'a', 'b', 'c'] 

323 Categories (3, object): ['a', 'b', 'c'] 

324 

325 Missing values are not included as a category. 

326 

327 >>> c = pd.Categorical([1, 2, 3, 1, 2, 3, np.nan]) 

328 >>> c 

329 [1, 2, 3, 1, 2, 3, NaN] 

330 Categories (3, int64): [1, 2, 3] 

331 

332 However, their presence is indicated in the `codes` attribute 

333 by code `-1`. 

334 

335 >>> c.codes 

336 array([ 0, 1, 2, 0, 1, 2, -1], dtype=int8) 

337 

338 Ordered `Categoricals` can be sorted according to the custom order 

339 of the categories and can have a min and max value. 

340 

341 >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True, 

342 ... categories=['c', 'b', 'a']) 

343 >>> c 

344 ['a', 'b', 'c', 'a', 'b', 'c'] 

345 Categories (3, object): ['c' < 'b' < 'a'] 

346 >>> c.min() 

347 'c' 

348 """ 

349 

350 # For comparisons, so that numpy uses our implementation if the compare 

351 # ops, which raise 

352 __array_priority__ = 1000 

353 # tolist is not actually deprecated, just suppressed in the __dir__ 

354 _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"]) 

355 _typ = "categorical" 

356 

357 _dtype: CategoricalDtype 

358 

@classmethod
# error: Argument 2 of "_simple_new" is incompatible with supertype
# "NDArrayBacked"; supertype defines the argument type as
# "Union[dtype[Any], ExtensionDtype]"
def _simple_new(  # type: ignore[override]
    cls, codes: np.ndarray, dtype: CategoricalDtype
) -> Self:
    """
    Construct an instance from codes and a dtype.

    Unlike the usual ``_simple_new``, the codes are first passed through
    ``coerce_indexer_dtype`` for the given categories, and the dtype is
    normalised via ``CategoricalDtype(ordered=False).update_dtype``.
    """
    coerced_codes = coerce_indexer_dtype(codes, dtype.categories)
    resolved_dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
    return super()._simple_new(coerced_codes, resolved_dtype)

370 

def __init__(
    self,
    values,
    categories=None,
    ordered=None,
    dtype: Dtype | None = None,
    fastpath: bool | lib.NoDefault = lib.no_default,
    copy: bool = True,
) -> None:
    """
    Compute integer codes and a ``CategoricalDtype`` from ``values``.

    Parameters
    ----------
    values : list-like
        Input data. Anything that is not list-like raises ``TypeError``.
    categories, ordered, dtype
        Forwarded, together with ``values``, to
        ``CategoricalDtype._from_values_or_dtype`` to resolve the dtype.
    fastpath : bool, deprecated
        Passing any value emits a ``DeprecationWarning``. When truthy,
        ``values`` are treated as codes and all inference is skipped.
    copy : bool, default True
        Forwarded to ``recode_for_categories`` when ``values`` is already
        categorical and explicit categories were resolved.

    Raises
    ------
    TypeError
        If ``values`` is not list-like, or if an ordered categorical is
        requested without categories and ``values`` cannot be sorted.
    NotImplementedError
        If ``values`` converts to an ndarray with more than one dimension.
    """
    if fastpath is not lib.no_default:
        # GH#20110
        warnings.warn(
            "The 'fastpath' keyword in Categorical is deprecated and will "
            "be removed in a future version. Use Categorical.from_codes instead",
            DeprecationWarning,
            stacklevel=find_stack_level(),
        )
    else:
        # keyword not supplied: behave as the non-fastpath constructor
        fastpath = False

    dtype = CategoricalDtype._from_values_or_dtype(
        values, categories, ordered, dtype
    )
    # At this point, dtype is always a CategoricalDtype, but
    # we may have dtype.categories be None, and we need to
    # infer categories in a factorization step further below

    if fastpath:
        # values are taken to be codes already; no validation or inference
        codes = coerce_indexer_dtype(values, dtype.categories)
        dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
        super().__init__(codes, dtype)
        return

    if not is_list_like(values):
        # GH#38433
        raise TypeError("Categorical input must be list-like")

    # null_mask indicates missing values we want to exclude from inference.
    # This means: only missing values in list-likes (not arrays/ndframes).
    null_mask = np.array(False)

    # sanitize input
    vdtype = getattr(values, "dtype", None)
    if isinstance(vdtype, CategoricalDtype):
        if dtype.categories is None:
            # no categories requested: reuse those of the categorical input
            dtype = CategoricalDtype(values.categories, dtype.ordered)
    elif not isinstance(values, (ABCIndex, ABCSeries, ExtensionArray)):
        values = com.convert_to_list_like(values)
        if isinstance(values, list) and len(values) == 0:
            # By convention, empty lists result in object dtype:
            values = np.array([], dtype=object)
        elif isinstance(values, np.ndarray):
            if values.ndim > 1:
                # preempt sanitize_array from raising ValueError
                raise NotImplementedError(
                    "> 1 ndim Categorical are not supported at this time"
                )
            values = sanitize_array(values, None)
        else:
            # i.e. must be a list
            arr = sanitize_array(values, None)
            null_mask = isna(arr)
            if null_mask.any():
                # We remove null values here, then below will re-insert
                # them, grep "full_codes"
                arr_list = [values[idx] for idx in np.where(~null_mask)[0]]

                # GH#44900 Do not cast to float if we have only missing values
                if arr_list or arr.dtype == "object":
                    sanitize_dtype = None
                else:
                    sanitize_dtype = arr.dtype

                arr = sanitize_array(arr_list, None, dtype=sanitize_dtype)
            values = arr

    if dtype.categories is None:
        # categories were not supplied and must be derived from values
        if isinstance(values.dtype, ArrowDtype) and issubclass(
            values.dtype.type, CategoricalDtypeType
        ):
            # Arrow-backed categorical input: take categories, codes and
            # orderedness from the pyarrow array and its dtype
            arr = values._pa_array.combine_chunks()
            categories = arr.dictionary.to_pandas(types_mapper=ArrowDtype)
            codes = arr.indices.to_numpy()
            dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered)
        else:
            if not isinstance(values, ABCIndex):
                # in particular RangeIndex xref test_index_equal_range_categories
                values = sanitize_array(values, None)
            try:
                codes, categories = factorize(values, sort=True)
            except TypeError as err:
                # sorted factorization failed: fall back to unsorted
                codes, categories = factorize(values, sort=False)
                if dtype.ordered:
                    # raise, as we don't have a sortable data structure and so
                    # the user should give us one by specifying categories
                    raise TypeError(
                        "'values' is not ordered, please "
                        "explicitly specify the categories order "
                        "by passing in a categories argument."
                    ) from err

            # we're inferring from values
            dtype = CategoricalDtype(categories, dtype.ordered)

    elif isinstance(values.dtype, CategoricalDtype):
        # categorical input with explicit target categories: translate the
        # existing codes into the target categories
        old_codes = extract_array(values)._codes
        codes = recode_for_categories(
            old_codes, values.dtype.categories, dtype.categories, copy=copy
        )

    else:
        # plain values with explicit categories
        codes = _get_codes_for_values(values, dtype.categories)

    if null_mask.any():
        # Reinsert -1 placeholders for previously removed missing values
        full_codes = -np.ones(null_mask.shape, dtype=codes.dtype)
        full_codes[~null_mask] = codes
        codes = full_codes

    dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
    arr = coerce_indexer_dtype(codes, dtype.categories)
    super().__init__(arr, dtype)

493 

@property
def dtype(self) -> CategoricalDtype:
    """
    Return the :class:`~pandas.api.types.CategoricalDtype` of this array.

    Examples
    --------
    >>> cat = pd.Categorical(['a', 'b'], ordered=True)
    >>> cat
    ['a', 'b']
    Categories (2, object): ['a' < 'b']
    >>> cat.dtype
    CategoricalDtype(categories=['a', 'b'], ordered=True, categories_dtype=object)
    """
    stored_dtype = self._dtype
    return stored_dtype

509 

@property
def _internal_fill_value(self) -> int:
    """
    Return -1 as a scalar of the same integer type as the codes array.
    """
    # A numpy scalar of the codes' own type (not a Python int) is used so
    # that _quantile gives back the correct dtype in the all-NA case.
    codes_scalar_type = self._ndarray.dtype.type
    return codes_scalar_type(-1)

516 

@classmethod
def _from_sequence(
    cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
) -> Self:
    """
    Construct an instance by passing ``scalars`` to the regular constructor.

    ``dtype`` and ``copy`` are forwarded unchanged as keyword arguments.
    """
    instance = cls(scalars, dtype=dtype, copy=copy)
    return instance

522 

@classmethod
def _from_scalars(cls, scalars, *, dtype: DtypeObj) -> Self:
    """
    Strict constructor: fail if any non-missing scalar is not a category.

    Raises
    ------
    NotImplementedError
        If ``dtype`` is None, where the strict check is not meaningful.
    ValueError
        If the missing-value mask of the result differs from that of
        ``scalars``, i.e. some value was turned into NA during construction.
    """
    if dtype is None:
        # The _from_scalars strictness doesn't make much sense in this case.
        raise NotImplementedError

    result = cls._from_sequence(scalars, dtype=dtype)

    # Elements of ``scalars`` that are not categories become NA in
    # ``result``; compare the two NA masks to detect that.
    input_na = isna(scalars)
    if not (input_na == result.isna()).all():
        raise ValueError
    return result

538 

@overload
def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
    ...

@overload
def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray:
    ...

@overload
def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike:
    ...

def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
    """
    Cast this categorical to another dtype.

    Parameters
    ----------
    dtype : numpy dtype or pandas type
        Target dtype.
    copy : bool, default True
        When True a newly allocated object is returned. When False and
        ``dtype`` is this array's own dtype object, ``self`` is returned.

    Raises
    ------
    ValueError
        If an integer dtype is requested while missing values are present,
        or if the categories cannot be cast to ``dtype``.
    """
    dtype = pandas_dtype(dtype)

    if self.dtype is dtype:
        return self.copy() if copy else self

    if isinstance(dtype, CategoricalDtype):
        # GH 10696/18593/18630
        dtype = self.dtype.update_dtype(dtype)
        source = self.copy() if copy else self
        return source._set_dtype(dtype)

    if isinstance(dtype, ExtensionDtype):
        return super().astype(dtype, copy=copy)

    if dtype.kind in "iu" and self.isna().any():
        raise ValueError("Cannot convert float NaN to integer")

    if len(self.codes) == 0 or len(self.categories) == 0:
        return np.array(
            self,
            dtype=dtype,
            copy=copy,
        )

    # GH8628 (PERF): cast the categories once and take by codes, instead
    # of casting the full array of values.
    cat_values = self.categories._values

    try:
        cat_values = cat_values.astype(dtype=dtype, copy=copy)
        na_fill = self.categories._na_value
        if not is_valid_na_for_dtype(na_fill, dtype):
            na_fill = lib.item_from_zerodim(
                np.array(self.categories._na_value).astype(dtype)
            )
    except (
        TypeError,  # downstream error msg for CategoricalIndex is misleading
        ValueError,
    ):
        msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}"
        raise ValueError(msg)

    return take_nd(cat_values, ensure_platform_int(self._codes), fill_value=na_fill)

609 

def to_list(self):
    """
    Deprecated alias for ``tolist``.

    Emits a ``FutureWarning`` and returns the result of ``self.tolist()``.
    """
    # GH#51254
    deprecation_msg = (
        "Categorical.to_list is deprecated and will be removed in a future "
        "version. Use obj.tolist() instead"
    )
    warnings.warn(
        deprecation_msg,
        FutureWarning,
        stacklevel=find_stack_level(),
    )
    return self.tolist()

622 

@classmethod
def _from_inferred_categories(
    cls, inferred_categories, inferred_codes, dtype, true_values=None
) -> Self:
    """
    Build a Categorical from categories and codes inferred elsewhere.

    When ``dtype`` is a ``CategoricalDtype`` with categories, the inferred
    categories are converted to the type of ``dtype.categories`` and the
    codes are recoded to ``dtype.categories`` order. Otherwise the inferred
    categories are used, sorted first if they are not already monotonic
    increasing.

    Parameters
    ----------
    inferred_categories : Index
    inferred_codes : Index
    dtype : CategoricalDtype or 'category'
    true_values : list, optional
        Strings regarded as True when the target categories are boolean.
        Defaults to "True", "TRUE" and "true".

    Returns
    -------
    Categorical
    """
    from pandas import (
        Index,
        to_datetime,
        to_numeric,
        to_timedelta,
    )

    cats = Index(inferred_categories)
    known_categories = (
        isinstance(dtype, CategoricalDtype) and dtype.categories is not None
    )

    if known_categories:
        # Convert the inferred categories to the type of the requested ones.
        target = dtype.categories.dtype
        if is_any_real_numeric_dtype(target):
            cats = to_numeric(inferred_categories, errors="coerce")
        elif lib.is_np_dtype(target, "M"):
            cats = to_datetime(inferred_categories, errors="coerce")
        elif lib.is_np_dtype(target, "m"):
            cats = to_timedelta(inferred_categories, errors="coerce")
        elif is_bool_dtype(target):
            if true_values is None:
                true_values = ["True", "TRUE", "true"]

            # error: Incompatible types in assignment (expression has type
            # "ndarray", variable has type "Index")
            cats = cats.isin(true_values)  # type: ignore[assignment]

        # Recode from observation order to dtype.categories order.
        categories = dtype.categories
        codes = recode_for_categories(inferred_codes, cats, categories)
    elif not cats.is_monotonic_increasing:
        # Unknown categories that are out of order: sort and recode.
        observed_order = cats.copy()
        categories = cats.sort_values()

        codes = recode_for_categories(inferred_codes, observed_order, categories)
        dtype = CategoricalDtype(categories, ordered=False)
    else:
        # Already sorted: the inferred codes can be used directly.
        dtype = CategoricalDtype(cats, ordered=False)
        codes = inferred_codes

    return cls._simple_new(codes, dtype=dtype)

691 

692 @classmethod 

693 def from_codes( 

694 cls, 

695 codes, 

696 categories=None, 

697 ordered=None, 

698 dtype: Dtype | None = None, 

699 validate: bool = True, 

700 ) -> Self: 

701 """ 

702 Make a Categorical type from codes and categories or dtype. 

703 

704 This constructor is useful if you already have codes and 

705 categories/dtype and so do not need the (computation intensive) 

706 factorization step, which is usually done on the constructor. 

707 

708 If your data does not follow this convention, please use the normal 

709 constructor. 

710 

711 Parameters 

712 ---------- 

713 codes : array-like of int 

714 An integer array, where each integer points to a category in 

715 categories or dtype.categories, or else is -1 for NaN. 

716 categories : index-like, optional 

717 The categories for the categorical. Items need to be unique. 

718 If the categories are not given here, then they must be provided 

719 in `dtype`. 

720 ordered : bool, optional 

721 Whether or not this categorical is treated as an ordered 

722 categorical. If not given here or in `dtype`, the resulting 

723 categorical will be unordered. 

724 dtype : CategoricalDtype or "category", optional 

725 If :class:`CategoricalDtype`, cannot be used together with 

726 `categories` or `ordered`. 

727 validate : bool, default True 

728 If True, validate that the codes are valid for the dtype. 

729 If False, don't validate that the codes are valid. Be careful about skipping 

730 validation, as invalid codes can lead to severe problems, such as segfaults. 

731 

732 .. versionadded:: 2.1.0 

733 

734 Returns 

735 ------- 

736 Categorical 

737 

738 Examples 

739 -------- 

740 >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True) 

741 >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype) 

742 ['a', 'b', 'a', 'b'] 

743 Categories (2, object): ['a' < 'b'] 

744 """ 

745 dtype = CategoricalDtype._from_values_or_dtype( 

746 categories=categories, ordered=ordered, dtype=dtype 

747 ) 

748 if dtype.categories is None: 

749 msg = ( 

750 "The categories must be provided in 'categories' or " 

751 "'dtype'. Both were None." 

752 ) 

753 raise ValueError(msg) 

754 

755 if validate: 

756 # beware: non-valid codes may segfault 

757 codes = cls._validate_codes_for_dtype(codes, dtype=dtype) 

758 

759 return cls._simple_new(codes, dtype=dtype) 

760 

761 # ------------------------------------------------------------------ 

762 # Categories/Codes/Ordered 

763 

@property
def categories(self) -> Index:
    """
    Return the categories of this categorical.

    The value is read from this array's dtype.

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    For :class:`pandas.Series`:

    >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
    >>> ser.cat.categories
    Index(['a', 'b', 'c'], dtype='object')

    >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'a'], categories=['b', 'c', 'd'])
    >>> ser = pd.Series(raw_cat)
    >>> ser.cat.categories
    Index(['b', 'c', 'd'], dtype='object')

    For :class:`pandas.Categorical`:

    >>> cat = pd.Categorical(['a', 'b'], ordered=True)
    >>> cat.categories
    Index(['a', 'b'], dtype='object')

    For :class:`pandas.CategoricalIndex`:

    >>> ci = pd.CategoricalIndex(['a', 'c', 'b', 'a', 'c', 'b'])
    >>> ci.categories
    Index(['a', 'b', 'c'], dtype='object')

    >>> ci = pd.CategoricalIndex(['a', 'c'], categories=['c', 'b', 'a'])
    >>> ci.categories
    Index(['c', 'b', 'a'], dtype='object')
    """
    own_dtype = self.dtype
    return own_dtype.categories

821 

@property
def ordered(self) -> Ordered:
    """
    Return whether the categories have an ordered relationship.

    The value is read from this array's dtype.

    Examples
    --------
    For :class:`pandas.Series`:

    >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
    >>> ser.cat.ordered
    False

    >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'a'], ordered=True)
    >>> ser = pd.Series(raw_cat)
    >>> ser.cat.ordered
    True

    For :class:`pandas.Categorical`:

    >>> cat = pd.Categorical(['a', 'b'], ordered=True)
    >>> cat.ordered
    True

    >>> cat = pd.Categorical(['a', 'b'], ordered=False)
    >>> cat.ordered
    False

    For :class:`pandas.CategoricalIndex`:

    >>> ci = pd.CategoricalIndex(['a', 'b'], ordered=True)
    >>> ci.ordered
    True

    >>> ci = pd.CategoricalIndex(['a', 'b'], ordered=False)
    >>> ci.ordered
    False
    """
    own_dtype = self.dtype
    return own_dtype.ordered

861 

@property
def codes(self) -> np.ndarray:
    """
    Return the category codes of this categorical.

    Codes are integers giving the position of each value within the
    categories array.

    There is no setter; use the other categorical methods and the normal
    item setter to change values in the categorical.

    Returns
    -------
    ndarray[int]
        A non-writable view of the ``codes`` array.

    Examples
    --------
    For :class:`pandas.Categorical`:

    >>> cat = pd.Categorical(['a', 'b'], ordered=True)
    >>> cat.codes
    array([0, 1], dtype=int8)

    For :class:`pandas.CategoricalIndex`:

    >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'])
    >>> ci.codes
    array([0, 1, 2, 0, 1, 2], dtype=int8)

    >>> ci = pd.CategoricalIndex(['a', 'c'], categories=['c', 'b', 'a'])
    >>> ci.codes
    array([2, 0], dtype=int8)
    """
    # Hand out a read-only view so callers cannot modify the stored codes
    # through the returned array.
    readonly_view = self._codes.view()
    readonly_view.setflags(write=False)
    return readonly_view

899 

def _set_categories(self, categories, fastpath: bool = False) -> None:
    """
    Replace the categories of this object in place.

    Parameters
    ----------
    categories : Index-like
        The new categories.
    fastpath : bool, default False
        Don't perform validation of the categories for uniqueness or nulls

    Raises
    ------
    ValueError
        If ``fastpath`` is False and the number of new categories differs
        from the number of existing ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'b'])
    >>> c
    ['a', 'b']
    Categories (2, object): ['a', 'b']

    >>> c._set_categories(pd.Index(['a', 'c']))
    >>> c
    ['a', 'c']
    Categories (2, object): ['a', 'c']
    """
    if fastpath:
        updated_dtype = CategoricalDtype._from_fastpath(categories, self.ordered)
    else:
        updated_dtype = CategoricalDtype(categories, ordered=self.ordered)
        current = self.dtype.categories
        if current is not None and len(updated_dtype.categories) != len(current):
            raise ValueError(
                "new categories need to have the same number of "
                "items as the old categories!"
            )

    # Re-initialise with the same codes and the new dtype.
    super().__init__(self._ndarray, updated_dtype)

936 

def _set_dtype(self, dtype: CategoricalDtype) -> Self:
    """
    Return a new object of the same type that uses ``dtype``.

    The current codes are recoded from this object's categories to
    ``dtype.categories``.

    Parameters
    ----------
    dtype : CategoricalDtype

    Notes
    -----
    No validation is done here; ``dtype`` is assumed to be a valid
    instance of `CategoricalDtype`.
    """
    recoded = recode_for_categories(self.codes, self.categories, dtype.categories)
    klass = type(self)
    return klass._simple_new(recoded, dtype=dtype)

952 

def set_ordered(self, value: bool) -> Self:
    """
    Return a copy whose ``ordered`` attribute is set to ``value``.

    Parameters
    ----------
    value : bool
        Set whether this categorical is ordered (True) or not (False).
    """
    target_dtype = CategoricalDtype(self.categories, ordered=value)
    result = self.copy()
    # Re-initialise the copy with its own codes and the new dtype.
    NDArrayBacked.__init__(result, result._ndarray, target_dtype)
    return result

966 

def as_ordered(self) -> Self:
    """
    Return an ordered version of this Categorical.

    Equivalent to ``set_ordered(True)``.

    Returns
    -------
    Categorical
        Ordered Categorical.

    Examples
    --------
    For :class:`pandas.Series`:

    >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
    >>> ser.cat.ordered
    False
    >>> ser = ser.cat.as_ordered()
    >>> ser.cat.ordered
    True

    For :class:`pandas.CategoricalIndex`:

    >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a'])
    >>> ci.ordered
    False
    >>> ci = ci.as_ordered()
    >>> ci.ordered
    True
    """
    ordered_result = self.set_ordered(True)
    return ordered_result

997 

def as_unordered(self) -> Self:
    """
    Return an unordered version of this Categorical.

    This is a shorthand for ``set_ordered(False)``.

    Returns
    -------
    Categorical
        Unordered Categorical.

    Examples
    --------
    For :class:`pandas.Series`:

    >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'a'], ordered=True)
    >>> ser = pd.Series(raw_cat)
    >>> ser.cat.ordered
    True
    >>> ser = ser.cat.as_unordered()
    >>> ser.cat.ordered
    False

    For :class:`pandas.CategoricalIndex`:

    >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a'], ordered=True)
    >>> ci.ordered
    True
    >>> ci = ci.as_unordered()
    >>> ci.ordered
    False
    """
    unordered_result = self.set_ordered(False)
    return unordered_result

1029 

def set_categories(
    self, new_categories, ordered=None, rename: bool = False
) -> Self:
    """
    Set the categories to the specified new categories.

    ``new_categories`` can include new categories (which will result in
    unused categories) or remove old categories (which results in values
    set to ``NaN``). If ``rename=True``, the categories will simply be renamed
    (fewer or more items than in the old categories will result in values
    set to ``NaN`` or in unused categories respectively).

    This method can be used to perform more than one action of adding,
    removing, and reordering simultaneously and is therefore faster than
    performing the individual steps via the more specialised methods.

    On the other hand, this method does not do checks (e.g., whether the
    old categories are included in the new categories on a reorder), which
    can result in surprising changes, for example when using special string
    dtypes, which do not consider an S1 string equal to a single-char
    Python string.

    Parameters
    ----------
    new_categories : Index-like
        The categories in new order.
    ordered : bool, default None
        Whether or not the categorical is treated as an ordered categorical.
        If not given (None), the ordered information is left unchanged.
    rename : bool, default False
        Whether or not the new_categories should be considered as a rename
        of the old categories or as reordered categories.

    Returns
    -------
    Categorical
        New Categorical with the given categories; ``self`` is not modified.

    Raises
    ------
    ValueError
        If new_categories does not validate as categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.

    Examples
    --------
    For :class:`pandas.Series`:

    >>> raw_cat = pd.Categorical(['a', 'b', 'c', 'A'],
    ...                          categories=['a', 'b', 'c'], ordered=True)
    >>> ser = pd.Series(raw_cat)
    >>> ser
    0      a
    1      b
    2      c
    3    NaN
    dtype: category
    Categories (3, object): ['a' < 'b' < 'c']

    >>> ser.cat.set_categories(['A', 'B', 'C'], rename=True)
    0      A
    1      B
    2      C
    3    NaN
    dtype: category
    Categories (3, object): ['A' < 'B' < 'C']

    For :class:`pandas.CategoricalIndex`:

    >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'A'],
    ...                          categories=['a', 'b', 'c'], ordered=True)
    >>> ci
    CategoricalIndex(['a', 'b', 'c', nan], categories=['a', 'b', 'c'],
                     ordered=True, dtype='category')

    >>> ci.set_categories(['A', 'b', 'c'])
    CategoricalIndex([nan, 'b', 'c', nan], categories=['A', 'b', 'c'],
                     ordered=True, dtype='category')
    >>> ci.set_categories(['A', 'b', 'c'], rename=True)
    CategoricalIndex(['A', 'b', 'c', nan], categories=['A', 'b', 'c'],
                     ordered=True, dtype='category')
    """

    if ordered is None:
        # Keep the current ordered flag when the caller does not pass one.
        ordered = self.dtype.ordered
    new_dtype = CategoricalDtype(new_categories, ordered=ordered)

    # All mutation below happens on the copy, never on ``self``.
    cat = self.copy()
    if rename:
        if cat.dtype.categories is not None and len(new_dtype.categories) < len(
            cat.dtype.categories
        ):
            # remove all _codes which are larger and set to -1/NaN
            cat._codes[cat._codes >= len(new_dtype.categories)] = -1
        codes = cat._codes
    else:
        # Not a rename: map each value onto its position in the new categories.
        codes = recode_for_categories(
            cat.codes, cat.categories, new_dtype.categories
        )
    NDArrayBacked.__init__(cat, codes, new_dtype)
    return cat

1135 

def rename_categories(self, new_categories) -> Self:
    """
    Return a copy with the categories replaced by new labels.

    Parameters
    ----------
    new_categories : list-like, dict-like or callable

        The replacement for the existing categories.

        * list-like: items must be unique and there must be exactly as
          many of them as there are existing categories.

        * dict-like: a mapping from old category to new category. Old
          categories missing from the mapping are kept as they are; keys
          that are not existing categories are ignored.

        * callable : invoked once per existing category; the return
          values become the new categories.

    Returns
    -------
    Categorical
        Categorical with renamed categories.

    Raises
    ------
    ValueError
        If new categories are list-like and do not have the same number of
        items as the current categories, or do not validate as categories

    See Also
    --------
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'a', 'b'])
    >>> c.rename_categories([0, 1])
    [0, 0, 1]
    Categories (2, int64): [0, 1]

    With a dict-like, unknown keys are skipped and unmapped categories
    are passed through

    >>> c.rename_categories({'a': 'A', 'c': 'C'})
    ['A', 'A', 'b']
    Categories (2, object): ['A', 'b']

    A callable is applied to every existing category

    >>> c.rename_categories(lambda x: x.upper())
    ['A', 'A', 'B']
    Categories (2, object): ['A', 'B']
    """
    if is_dict_like(new_categories):
        lookup = new_categories.get
        # Fall back to the old label when the mapping has no entry for it.
        renamed = [lookup(old, old) for old in self.categories]
    elif callable(new_categories):
        renamed = [new_categories(old) for old in self.categories]
    else:
        renamed = new_categories

    result = self.copy()
    result._set_categories(renamed)
    return result

1207 

def reorder_categories(self, new_categories, ordered=None) -> Self:
    """
    Put the existing categories into the order given by new_categories.

    ``new_categories`` must contain every old category and nothing else.

    Parameters
    ----------
    new_categories : Index-like
        The categories in new order.
    ordered : bool, optional
        Whether or not the categorical is treated as an ordered categorical.
        If not given, the ordered information is left unchanged.

    Returns
    -------
    Categorical
        Categorical with reordered categories.

    Raises
    ------
    ValueError
        If the new categories do not contain all old category items or any
        new ones

    See Also
    --------
    rename_categories : Rename categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    For :class:`pandas.Series`:

    >>> ser = pd.Series(['a', 'b', 'c', 'a'], dtype='category')
    >>> ser = ser.cat.reorder_categories(['c', 'b', 'a'], ordered=True)
    >>> ser
    0    a
    1    b
    2    c
    3    a
    dtype: category
    Categories (3, object): ['c' < 'b' < 'a']

    >>> ser.sort_values()
    2    c
    1    b
    0    a
    3    a
    dtype: category
    Categories (3, object): ['c' < 'b' < 'a']

    For :class:`pandas.CategoricalIndex`:

    >>> ci = pd.CategoricalIndex(['a', 'b', 'c', 'a'])
    >>> ci
    CategoricalIndex(['a', 'b', 'c', 'a'], categories=['a', 'b', 'c'],
                     ordered=False, dtype='category')
    >>> ci.reorder_categories(['c', 'b', 'a'], ordered=True)
    CategoricalIndex(['a', 'b', 'c', 'a'], categories=['c', 'b', 'a'],
                     ordered=True, dtype='category')
    """
    same_size = len(self.categories) == len(new_categories)
    # The difference is only computed when the sizes already agree.
    if not (same_size and self.categories.difference(new_categories).empty):
        raise ValueError(
            "items in new_categories are not the same as in old categories"
        )
    return self.set_categories(new_categories, ordered=ordered)

1282 

def add_categories(self, new_categories) -> Self:
    """
    Return a copy with extra categories appended.

    The additions are placed after the existing categories (the
    last/highest position) and are unused in the returned object.

    Parameters
    ----------
    new_categories : category or list-like of category
        The new categories to be included.

    Returns
    -------
    Categorical
        Categorical with new categories added.

    Raises
    ------
    ValueError
        If the new categories include old categories or do not validate as
        categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['c', 'b', 'c'])
    >>> c
    ['c', 'b', 'c']
    Categories (2, object): ['b', 'c']

    >>> c.add_categories(['d', 'a'])
    ['c', 'b', 'c']
    Categories (4, object): ['b', 'c', 'd', 'a']
    """
    if not is_list_like(new_categories):
        new_categories = [new_categories]

    existing = self.dtype.categories
    overlap = set(new_categories) & set(existing)
    if overlap:
        raise ValueError(
            f"new categories must not include old categories: {overlap}"
        )

    combined = list(existing) + list(new_categories)
    if hasattr(new_categories, "dtype"):
        from pandas import Series

        # Typed input: build the combined categories with the common dtype
        # of the old and the added categories.
        common = find_common_type([existing.dtype, new_categories.dtype])
        combined = Series(combined, dtype=common)

    extended_dtype = CategoricalDtype(combined, self.ordered)
    result = self.copy()
    widened = coerce_indexer_dtype(result._ndarray, extended_dtype.categories)
    NDArrayBacked.__init__(result, widened, extended_dtype)
    return result

1351 

def remove_categories(self, removals) -> Self:
    """
    Return a copy with the given categories dropped.

    Every entry of `removals` has to be an existing category. Values that
    belonged to a removed category become NaN.

    Parameters
    ----------
    removals : category or list of categories
        The categories which should be removed.

    Returns
    -------
    Categorical
        Categorical with removed categories.

    Raises
    ------
    ValueError
        If the removals are not contained in the categories

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd'])
    >>> c
    ['a', 'c', 'b', 'c', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c.remove_categories(['d', 'a'])
    [NaN, 'c', 'b', 'c', NaN]
    Categories (2, object): ['b', 'c']
    """
    from pandas import Index

    if not is_list_like(removals):
        removals = [removals]

    # Deduplicate and drop missing entries before comparing with categories.
    removals = Index(removals).unique().dropna()
    current = self.dtype.categories
    if self.dtype.ordered is True:
        # Ordered: keep the remaining categories in their existing order.
        kept = current.difference(removals, sort=False)
    else:
        kept = current.difference(removals)

    unknown = removals.difference(current)
    if len(unknown) != 0:
        raise ValueError(
            f"removals must all be in old categories: {set(unknown)}"
        )

    return self.set_categories(kept, ordered=self.ordered, rename=False)

1411 

def remove_unused_categories(self) -> Self:
    """
    Return a copy that keeps only the categories that occur in the data.

    Returns
    -------
    Categorical
        Categorical with unused categories dropped.

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd'])
    >>> c
    ['a', 'c', 'b', 'c', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c[2] = 'a'
    >>> c[4] = 'c'
    >>> c
    ['a', 'c', 'a', 'c', 'c']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c.remove_unused_categories()
    ['a', 'c', 'a', 'c', 'c']
    Categories (2, object): ['a', 'c']
    """
    used_codes, positions = np.unique(self._codes, return_inverse=True)

    has_na_sentinel = used_codes.size != 0 and used_codes[0] == -1
    if has_na_sentinel:
        # Unique codes are sorted, so -1 (missing) can only be first.
        # Dropping it shifts every position down by one, which also sends
        # the missing entries back to -1.
        used_codes = used_codes[1:]
        positions = positions - 1

    kept_categories = self.dtype.categories.take(used_codes)
    trimmed_dtype = CategoricalDtype._from_fastpath(
        kept_categories, ordered=self.ordered
    )
    compact_codes = coerce_indexer_dtype(positions, trimmed_dtype.categories)

    result = self.copy()
    NDArrayBacked.__init__(result, compact_codes, trimmed_dtype)
    return result

1460 

1461 # ------------------------------------------------------------------ 

1462 

1463 def map( 

1464 self, 

1465 mapper, 

1466 na_action: Literal["ignore"] | None | lib.NoDefault = lib.no_default, 

1467 ): 

1468 """ 

1469 Map categories using an input mapping or function. 

1470 

1471 Maps the categories to new categories. If the mapping correspondence is 

1472 one-to-one the result is a :class:`~pandas.Categorical` which has the 

1473 same order property as the original, otherwise a :class:`~pandas.Index` 

1474 is returned. NaN values are unaffected. 

1475 

1476 If a `dict` or :class:`~pandas.Series` is used any unmapped category is 

1477 mapped to `NaN`. Note that if this happens an :class:`~pandas.Index` 

1478 will be returned. 

1479 

1480 Parameters 

1481 ---------- 

1482 mapper : function, dict, or Series 

1483 Mapping correspondence. 

1484 na_action : {None, 'ignore'}, default 'ignore' 

1485 If 'ignore', propagate NaN values, without passing them to the 

1486 mapping correspondence. 

1487 

1488 .. deprecated:: 2.1.0 

1489 

1490 The default value of 'ignore' has been deprecated and will be changed to 

1491 None in the future. 

1492 

1493 Returns 

1494 ------- 

1495 pandas.Categorical or pandas.Index 

1496 Mapped categorical. 

1497 

1498 See Also 

1499 -------- 

1500 CategoricalIndex.map : Apply a mapping correspondence on a 

1501 :class:`~pandas.CategoricalIndex`. 

1502 Index.map : Apply a mapping correspondence on an 

1503 :class:`~pandas.Index`. 

1504 Series.map : Apply a mapping correspondence on a 

1505 :class:`~pandas.Series`. 

1506 Series.apply : Apply more complex functions on a 

1507 :class:`~pandas.Series`. 

1508 

1509 Examples 

1510 -------- 

1511 >>> cat = pd.Categorical(['a', 'b', 'c']) 

1512 >>> cat 

1513 ['a', 'b', 'c'] 

1514 Categories (3, object): ['a', 'b', 'c'] 

1515 >>> cat.map(lambda x: x.upper(), na_action=None) 

1516 ['A', 'B', 'C'] 

1517 Categories (3, object): ['A', 'B', 'C'] 

1518 >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'}, na_action=None) 

1519 ['first', 'second', 'third'] 

1520 Categories (3, object): ['first', 'second', 'third'] 

1521 

1522 If the mapping is one-to-one the ordering of the categories is 

1523 preserved: 

1524 

1525 >>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True) 

1526 >>> cat 

1527 ['a', 'b', 'c'] 

1528 Categories (3, object): ['a' < 'b' < 'c'] 

1529 >>> cat.map({'a': 3, 'b': 2, 'c': 1}, na_action=None) 

1530 [3, 2, 1] 

1531 Categories (3, int64): [3 < 2 < 1] 

1532 

1533 If the mapping is not one-to-one an :class:`~pandas.Index` is returned: 

1534 

1535 >>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'}, na_action=None) 

1536 Index(['first', 'second', 'first'], dtype='object') 

1537 

1538 If a `dict` is used, all unmapped categories are mapped to `NaN` and 

1539 the result is an :class:`~pandas.Index`: 

1540 

1541 >>> cat.map({'a': 'first', 'b': 'second'}, na_action=None) 

1542 Index(['first', 'second', nan], dtype='object') 

1543 """ 

1544 if na_action is lib.no_default: 

1545 warnings.warn( 

1546 "The default value of 'ignore' for the `na_action` parameter in " 

1547 "pandas.Categorical.map is deprecated and will be " 

1548 "changed to 'None' in a future version. Please set na_action to the " 

1549 "desired value to avoid seeing this warning", 

1550 FutureWarning, 

1551 stacklevel=find_stack_level(), 

1552 ) 

1553 na_action = "ignore" 

1554 

1555 assert callable(mapper) or is_dict_like(mapper) 

1556 

1557 new_categories = self.categories.map(mapper) 

1558 

1559 has_nans = np.any(self._codes == -1) 

1560 

1561 na_val = np.nan 

1562 if na_action is None and has_nans: 

1563 na_val = mapper(np.nan) if callable(mapper) else mapper.get(np.nan, np.nan) 

1564 

1565 if new_categories.is_unique and not new_categories.hasnans and na_val is np.nan: 

1566 new_dtype = CategoricalDtype(new_categories, ordered=self.ordered) 

1567 return self.from_codes(self._codes.copy(), dtype=new_dtype, validate=False) 

1568 

1569 if has_nans: 

1570 new_categories = new_categories.insert(len(new_categories), na_val) 

1571 

1572 return np.take(new_categories, self._codes) 

1573 

# Rich-comparison dunders: each one is bound to whatever ``_cat_compare_op``
# returns for the matching function from the ``operator`` module.
# NOTE(review): ``_cat_compare_op`` is defined outside this view; presumably
# it builds the comparison method for this class -- confirm at its definition.
__eq__ = _cat_compare_op(operator.eq)
__ne__ = _cat_compare_op(operator.ne)
__lt__ = _cat_compare_op(operator.lt)
__gt__ = _cat_compare_op(operator.gt)
__le__ = _cat_compare_op(operator.le)
__ge__ = _cat_compare_op(operator.ge)

1580 

1581 # ------------------------------------------------------------- 

1582 # Validators; ideally these can be de-duplicated 

1583 

def _validate_setitem_value(self, value):
    """
    Route a value being assigned to the matching validator.

    Hashable values are handled by ``_validate_scalar``; everything else
    goes to ``_validate_listlike``. The validator's result is returned
    unchanged.
    """
    if is_hashable(value):
        return self._validate_scalar(value)
    # Not hashable, so it cannot be a single category: treat as list-like.
    return self._validate_listlike(value)

1590 

def _validate_scalar(self, fill_value):
    """
    Translate a user-facing scalar into the value stored in the codes.

    Parameters
    ----------
    fill_value : object
        A missing-value marker valid for the categories' dtype, or one of
        the existing categories.

    Returns
    -------
    fill_value : int
        -1 for a missing value, otherwise the result of
        ``_unbox_scalar`` for the category.

    Raises
    ------
    TypeError
        If ``fill_value`` is neither a valid NA nor an existing category.
    """
    if is_valid_na_for_dtype(fill_value, self.categories.dtype):
        return -1
    if fill_value in self.categories:
        return self._unbox_scalar(fill_value)
    raise TypeError(
        "Cannot setitem on a Categorical with a new "
        f"category ({fill_value}), set the categories first"
    ) from None

1619 

@classmethod
def _validate_codes_for_dtype(cls, codes, *, dtype: CategoricalDtype) -> np.ndarray:
    """
    Convert ``codes`` to an ndarray and check it against ``dtype``.

    Parameters
    ----------
    codes : array-like
        Candidate codes. An integer-dtype ExtensionArray is converted to
        int64; anything else goes through ``np.asarray``.
    dtype : CategoricalDtype
        Dtype whose number of categories bounds the valid codes.

    Returns
    -------
    np.ndarray

    Raises
    ------
    ValueError
        If an integer ExtensionArray contains NA, if non-empty codes are
        not of a signed or unsigned integer kind, or if any code falls
        outside ``-1 .. len(categories) - 1``.
    """
    is_ea_integer = isinstance(codes, ExtensionArray) and is_integer_dtype(
        codes.dtype
    )
    if is_ea_integer:
        # Avoid the implicit conversion of Int to object
        if isna(codes).any():
            raise ValueError("codes cannot contain NA values")
        codes = codes.to_numpy(dtype=np.int64)
    else:
        codes = np.asarray(codes)

    # Empty input skips both the dtype-kind and the range check.
    if not len(codes):
        return codes

    if codes.dtype.kind not in "iu":
        raise ValueError("codes need to be array-like integers")

    n_categories = len(dtype.categories)
    if codes.max() >= n_categories or codes.min() < -1:
        raise ValueError("codes need to be between -1 and len(categories)-1")
    return codes

1635 

1636 # ------------------------------------------------------------- 

1637 

@ravel_compat
def __array__(
    self, dtype: NpDtype | None = None, copy: bool | None = None
) -> np.ndarray:
    """
    The numpy array interface.

    The categories' values are taken at the positions given by the codes
    and the result is returned as an ndarray.

    Returns
    -------
    numpy.array
        An array of the requested dtype, or, when ``dtype`` is None
        (the default), whatever ``np.asarray`` produces from the taken
        category values.

    Examples
    --------

    >>> cat = pd.Categorical(['a', 'b'], ordered=True)

    The following calls ``cat.__array__``

    >>> np.asarray(cat)
    array(['a', 'b'], dtype=object)
    """
    values = take_nd(self.categories._values, self._codes)
    needs_cast = dtype and np.dtype(dtype) != self.categories.dtype
    if needs_cast:
        return np.asarray(values, dtype)
    # When we're a Categorical[ExtensionArray], like Interval, the taken
    # values may not be an ndarray yet, so always pass through np.asarray.
    return np.asarray(values)

1669 

def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
    """
    Handle numpy ufuncs applied to this array.

    Three dispatch routes are tried in order: the dunder-operator
    dispatch, the ``out=`` dispatch (when ``out`` is passed), and the
    reduction dispatch (when ``method`` is ``"reduce"``). If none of them
    yields a result, a TypeError is raised.

    Raises
    ------
    TypeError
        If the ufunc is not handled by any of the dispatch routes.
    """
    # for binary ops, use our custom dunder methods
    via_dunder = arraylike.maybe_dispatch_ufunc_to_dunder_op(
        self, ufunc, method, *inputs, **kwargs
    )
    if via_dunder is not NotImplemented:
        return via_dunder

    if "out" in kwargs:
        # e.g. test_numpy_ufuncs_out
        return arraylike.dispatch_ufunc_with_out(
            self, ufunc, method, *inputs, **kwargs
        )

    if method == "reduce":
        # e.g. TestCategoricalAnalytics::test_min_max_ordered
        reduced = arraylike.dispatch_reduction_ufunc(
            self, ufunc, method, *inputs, **kwargs
        )
        if reduced is not NotImplemented:
            return reduced

    # for all other cases, raise for now (similarly as what happens in
    # Series.__array_prepare__)
    raise TypeError(
        f"Object with dtype {self.dtype} cannot perform "
        f"the numpy op {ufunc.__name__}"
    )

1698 

def __setstate__(self, state) -> None:
    """
    Restore pickled state.

    A non-dict ``state`` is passed straight to the parent implementation.
    A dict ``state`` is first brought to the current layout, in place: a
    missing ``_dtype`` is built from ``_categories`` and ``_ordered``, and
    a ``_codes`` entry is moved to ``_ndarray`` when the latter is absent.
    """
    if not isinstance(state, dict):
        return super().__setstate__(state)

    if "_dtype" not in state:
        categories = state["_categories"]
        ordered = state["_ordered"]
        state["_dtype"] = CategoricalDtype(categories, ordered)

    has_only_codes = "_codes" in state and "_ndarray" not in state
    if has_only_codes:
        # backward compat, changed what is property vs attribute
        state["_ndarray"] = state.pop("_codes")

    super().__setstate__(state)

1712 

@property
def nbytes(self) -> int:
    """Bytes used by the codes plus bytes used by the categories' values."""
    codes_bytes = self._codes.nbytes
    categories_bytes = self.dtype.categories.values.nbytes
    return codes_bytes + categories_bytes

1716 

def memory_usage(self, deep: bool = False) -> int:
    """
    Memory usage of my values

    The result is the byte size of the codes plus the memory usage
    reported by the categories.

    Parameters
    ----------
    deep : bool
        Introspect the data deeply, interrogate
        `object` dtypes for system-level memory consumption

    Returns
    -------
    bytes used

    Notes
    -----
    Memory usage does not include memory consumed by elements that
    are not components of the array if deep=False

    See Also
    --------
    numpy.ndarray.nbytes
    """
    codes_bytes = self._codes.nbytes
    categories_bytes = self.dtype.categories.memory_usage(deep=deep)
    return codes_bytes + categories_bytes

1741 

def isna(self) -> npt.NDArray[np.bool_]:
    """
    Detect missing values

    An element is missing when its code is -1.

    Returns
    -------
    np.ndarray[bool] of whether my values are null

    See Also
    --------
    isna : Top-level isna.
    isnull : Alias of isna.
    Categorical.notna : Boolean inverse of Categorical.isna.

    """
    missing_mask = self._codes == -1
    return missing_mask

isnull = isna

1762 

def notna(self) -> npt.NDArray[np.bool_]:
    """
    Inverse of isna

    The result is the elementwise negation of ``self.isna()``.

    Returns
    -------
    np.ndarray[bool] of whether my values are not null

    See Also
    --------
    notna : Top-level notna.
    notnull : Alias of notna.
    Categorical.isna : Boolean inverse of Categorical.notna.

    """
    missing_mask = self.isna()
    return ~missing_mask

notnull = notna

1784 

def value_counts(self, dropna: bool = True) -> Series:
    """
    Return a Series containing counts of each category.

    Every category gets an entry, including categories that never occur
    (their count is 0).

    Parameters
    ----------
    dropna : bool, default True
        Don't include counts of NaN.

    Returns
    -------
    counts : Series

    See Also
    --------
    Series.value_counts
    """
    from pandas import (
        CategoricalIndex,
        Series,
    )

    codes = self._codes
    n_categories = len(self.categories)
    is_valid = codes >= 0
    no_missing = is_valid.all()
    positions = np.arange(n_categories)

    if dropna or no_missing:
        observed = codes if no_missing else codes[is_valid]
        counts = np.bincount(observed, minlength=n_categories)
    else:
        # Send missing entries to one extra bin after the last category
        # and label that bin with the missing-value code -1.
        counts = np.bincount(np.where(is_valid, codes, n_categories))
        positions = np.append(positions, -1)

    positions = coerce_indexer_dtype(positions, self.dtype.categories)
    index_values = self._from_backing_data(positions)

    return Series(
        counts,
        index=CategoricalIndex(index_values),
        dtype="int64",
        name="count",
        copy=False,
    )

1826 

# error: Argument 2 of "_empty" is incompatible with supertype
# "NDArrayBackedExtensionArray"; supertype defines the argument type as
# "ExtensionDtype"
@classmethod
def _empty(  # type: ignore[override]
    cls, shape: Shape, dtype: CategoricalDtype
) -> Self:
    """
    Analogous to np.empty(shape, dtype=dtype)

    The backing codes are zero-filled rather than left uninitialised.

    Parameters
    ----------
    shape : tuple[int]
    dtype : CategoricalDtype
    """
    template = cls._from_sequence([], dtype=dtype)

    # np.zeros, not np.empty: uninitialised memory could hold codes that
    # this dtype does not support, in which case repr(result) could
    # segfault.
    zero_codes = np.zeros(shape, dtype=template._ndarray.dtype)

    return template._from_backing_data(zero_codes)

1850 

def _internal_get_values(self) -> ArrayLike:
    """
    Return the values.

    For internal compatibility with pandas formatting.

    Returns
    -------
    np.ndarray or ExtensionArray
        A numpy array or ExtensionArray of the same dtype as
        categorical.categories.dtype. Integer categories with missing
        entries are returned as object dtype so that NaN can be held.
    """
    categories = self.categories

    # if we are a datetime and period index, return Index to keep metadata
    if needs_i8_conversion(categories.dtype):
        return categories.take(self._codes, fill_value=NaT)._values

    if is_integer_dtype(categories.dtype) and -1 in self._codes:
        as_object = categories.astype("object")
        return as_object.take(self._codes, fill_value=np.nan)._values

    return np.array(self)

1873 

def check_for_ordered(self, op) -> None:
    """Raise TypeError naming ``op`` unless this Categorical is ordered."""
    if self.ordered:
        return None
    raise TypeError(
        f"Categorical is not ordered for operation {op}\n"
        "you can use .as_ordered() to change the "
        "Categorical to an ordered one\n"
    )

1882 

def argsort(
    self, *, ascending: bool = True, kind: SortKind = "quicksort", **kwargs
):
    """
    Return the indices that would sort the Categorical.

    Missing values are sorted at the end.

    Parameters
    ----------
    ascending : bool, default True
        Whether the indices should result in an ascending
        or descending sort.
    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional
        Sorting algorithm.
    **kwargs:
        passed through to :func:`numpy.argsort`.

    Returns
    -------
    np.ndarray[np.intp]

    See Also
    --------
    numpy.ndarray.argsort

    Notes
    -----
    Although the categories define an order for the values, arg-sorting
    here is mainly a way of grouping equal category values together.
    For that reason it also works on an unordered Categorical, unlike
    'Categorical.min' and 'Categorical.max'.

    Examples
    --------
    >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort()
    array([2, 0, 1, 3])

    >>> cat = pd.Categorical(['b', 'b', 'a', 'c'],
    ...                      categories=['c', 'b', 'a'],
    ...                      ordered=True)
    >>> cat.argsort()
    array([3, 0, 1, 2])

    Missing values are placed at the end

    >>> cat = pd.Categorical([2, None, 1])
    >>> cat.argsort()
    array([2, 0, 1])
    """
    # Pure delegation to the parent implementation.
    sorter = super().argsort(ascending=ascending, kind=kind, **kwargs)
    return sorter

1935 

@overload
def sort_values(
    self,
    *,
    inplace: Literal[False] = ...,
    ascending: bool = ...,
    na_position: str = ...,
) -> Self:
    ...

@overload
def sort_values(
    self, *, inplace: Literal[True], ascending: bool = ..., na_position: str = ...
) -> None:
    ...

def sort_values(
    self,
    *,
    inplace: bool = False,
    ascending: bool = True,
    na_position: str = "last",
) -> Self | None:
    """
    Sort the Categorical by category value returning a new
    Categorical by default.

    Although the categories define an order for the values, sorting here
    is mainly a way of grouping equal category values together. For that
    reason it also works on an unordered Categorical, unlike
    'Categorical.min' and 'Categorical.max'.

    Parameters
    ----------
    inplace : bool, default False
        Do operation in place.
    ascending : bool, default True
        Order ascending. Passing False orders descending. The
        ordering parameter provides the method by which the
        category values are organized.
    na_position : {'first', 'last'} (optional, default='last')
        'first' puts NaNs at the beginning
        'last' puts NaNs at the end

    Returns
    -------
    Categorical or None
        The sorted Categorical, or None when ``inplace=True``.

    Raises
    ------
    ValueError
        If ``na_position`` is neither 'first' nor 'last'.

    See Also
    --------
    Series.sort_values

    Examples
    --------
    >>> c = pd.Categorical([1, 2, 2, 1, 5])
    >>> c
    [1, 2, 2, 1, 5]
    Categories (3, int64): [1, 2, 5]
    >>> c.sort_values()
    [1, 1, 2, 2, 5]
    Categories (3, int64): [1, 2, 5]
    >>> c.sort_values(ascending=False)
    [5, 2, 2, 1, 1]
    Categories (3, int64): [1, 2, 5]

    'sort_values' behaviour with NaNs. Note that 'na_position'
    is independent of the 'ascending' parameter:

    >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5])
    >>> c
    [NaN, 2, 2, NaN, 5]
    Categories (2, int64): [2, 5]
    >>> c.sort_values()
    [2, 2, 5, NaN, NaN]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(ascending=False)
    [5, 2, 2, NaN, NaN]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(na_position='first')
    [NaN, NaN, 2, 2, 5]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(ascending=False, na_position='first')
    [NaN, NaN, 5, 2, 2]
    Categories (2, int64): [2, 5]
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    if na_position not in ("last", "first"):
        raise ValueError(f"invalid na_position: {na_position!r}")

    order = nargsort(self, ascending=ascending, na_position=na_position)
    reordered_codes = self._codes[order]

    if inplace:
        # Overwrite the existing codes buffer instead of rebinding it.
        self._codes[:] = reordered_codes
        return None
    return self._from_backing_data(reordered_codes)

2036 

2037 def _rank( 

2038 self, 

2039 *, 

2040 axis: AxisInt = 0, 

2041 method: str = "average", 

2042 na_option: str = "keep", 

2043 ascending: bool = True, 

2044 pct: bool = False, 

2045 ): 

2046 """ 

2047 See Series.rank.__doc__. 

2048 """ 

2049 if axis != 0: 

2050 raise NotImplementedError 

2051 vff = self._values_for_rank() 

2052 return algorithms.rank( 

2053 vff, 

2054 axis=axis, 

2055 method=method, 

2056 na_option=na_option, 

2057 ascending=ascending, 

2058 pct=pct, 

2059 ) 

2060 

2061 def _values_for_rank(self) -> np.ndarray: 

2062 """ 

2063 For correctly ranking ordered categorical data. See GH#15420 

2064 

2065 Ordered categorical data should be ranked on the basis of 

2066 codes with -1 translated to NaN. 

2067 

2068 Returns 

2069 ------- 

2070 numpy.array 

2071 

2072 """ 

2073 from pandas import Series 

2074 

2075 if self.ordered: 

2076 values = self.codes 

2077 mask = values == -1 

2078 if mask.any(): 

2079 values = values.astype("float64") 

2080 values[mask] = np.nan 

2081 elif is_any_real_numeric_dtype(self.categories.dtype): 

2082 values = np.array(self) 

2083 else: 

2084 # reorder the categories (so rank can use the float codes) 

2085 # instead of passing an object array to rank 

2086 values = np.array( 

2087 self.rename_categories( 

2088 Series(self.categories, copy=False).rank().values 

2089 ) 

2090 ) 

2091 return values 

2092 

2093 def _hash_pandas_object( 

2094 self, *, encoding: str, hash_key: str, categorize: bool 

2095 ) -> npt.NDArray[np.uint64]: 

2096 """ 

2097 Hash a Categorical by hashing its categories, and then mapping the codes 

2098 to the hashes. 

2099 

2100 Parameters 

2101 ---------- 

2102 encoding : str 

2103 hash_key : str 

2104 categorize : bool 

2105 Ignored for Categorical. 

2106 

2107 Returns 

2108 ------- 

2109 np.ndarray[uint64] 

2110 """ 

2111 # Note we ignore categorize, as we are already Categorical. 

2112 from pandas.core.util.hashing import hash_array 

2113 

2114 # Convert ExtensionArrays to ndarrays 

2115 values = np.asarray(self.categories._values) 

2116 hashed = hash_array(values, encoding, hash_key, categorize=False) 

2117 

2118 # we have uint64, as we don't directly support missing values 

2119 # we don't want to use take_nd which will coerce to float 

2120 # instead, directly construct the result with a 

2121 # max(np.uint64) as the missing value indicator 

2122 # 

2123 # TODO: GH#15362 

2124 

2125 mask = self.isna() 

2126 if len(hashed): 

2127 result = hashed.take(self._codes) 

2128 else: 

2129 result = np.zeros(len(mask), dtype="uint64") 

2130 

2131 if mask.any(): 

2132 result[mask] = lib.u8max 

2133 

2134 return result 

2135 

2136 # ------------------------------------------------------------------ 

2137 # NDArrayBackedExtensionArray compat 

2138 

2139 @property 

2140 def _codes(self) -> np.ndarray: 

2141 return self._ndarray 

2142 

2143 def _box_func(self, i: int): 

2144 if i == -1: 

2145 return np.nan 

2146 return self.categories[i] 

2147 

2148 def _unbox_scalar(self, key) -> int: 

2149 # searchsorted is very performance sensitive. By converting codes 

2150 # to same dtype as self.codes, we get much faster performance. 

2151 code = self.categories.get_loc(key) 

2152 code = self._ndarray.dtype.type(code) 

2153 return code 

2154 

2155 # ------------------------------------------------------------------ 

2156 

2157 def __iter__(self) -> Iterator: 

2158 """ 

2159 Returns an Iterator over the values of this Categorical. 

2160 """ 

2161 if self.ndim == 1: 

2162 return iter(self._internal_get_values().tolist()) 

2163 else: 

2164 return (self[n] for n in range(len(self))) 

2165 

2166 def __contains__(self, key) -> bool: 

2167 """ 

2168 Returns True if `key` is in this Categorical. 

2169 """ 

2170 # if key is a NaN, check if any NaN is in self. 

2171 if is_valid_na_for_dtype(key, self.categories.dtype): 

2172 return bool(self.isna().any()) 

2173 

2174 return contains(self, key, container=self._codes) 

2175 

2176 # ------------------------------------------------------------------ 

2177 # Rendering Methods 

2178 

2179 def _formatter(self, boxed: bool = False): 

2180 # Returning None here will cause format_array to do inference. 

2181 return None 

2182 

2183 def _repr_categories(self) -> list[str]: 

2184 """ 

2185 return the base repr for the categories 

2186 """ 

2187 max_categories = ( 

2188 10 

2189 if get_option("display.max_categories") == 0 

2190 else get_option("display.max_categories") 

2191 ) 

2192 from pandas.io.formats import format as fmt 

2193 

2194 format_array = partial( 

2195 fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC 

2196 ) 

2197 if len(self.categories) > max_categories: 

2198 num = max_categories // 2 

2199 head = format_array(self.categories[:num]._values) 

2200 tail = format_array(self.categories[-num:]._values) 

2201 category_strs = head + ["..."] + tail 

2202 else: 

2203 category_strs = format_array(self.categories._values) 

2204 

2205 # Strip all leading spaces, which format_array adds for columns... 

2206 category_strs = [x.strip() for x in category_strs] 

2207 return category_strs 

2208 

2209 def _get_repr_footer(self) -> str: 

2210 """ 

2211 Returns a string representation of the footer. 

2212 """ 

2213 category_strs = self._repr_categories() 

2214 dtype = str(self.categories.dtype) 

2215 levheader = f"Categories ({len(self.categories)}, {dtype}): " 

2216 width, _ = get_terminal_size() 

2217 max_width = get_option("display.width") or width 

2218 if console.in_ipython_frontend(): 

2219 # 0 = no breaks 

2220 max_width = 0 

2221 levstring = "" 

2222 start = True 

2223 cur_col_len = len(levheader) # header 

2224 sep_len, sep = (3, " < ") if self.ordered else (2, ", ") 

2225 linesep = f"{sep.rstrip()}\n" # remove whitespace 

2226 for val in category_strs: 

2227 if max_width != 0 and cur_col_len + sep_len + len(val) > max_width: 

2228 levstring += linesep + (" " * (len(levheader) + 1)) 

2229 cur_col_len = len(levheader) + 1 # header + a whitespace 

2230 elif not start: 

2231 levstring += sep 

2232 cur_col_len += len(val) 

2233 levstring += val 

2234 start = False 

2235 # replace to simple save space by 

2236 return f"{levheader}[{levstring.replace(' < ... < ', ' ... ')}]" 

2237 

2238 def _get_values_repr(self) -> str: 

2239 from pandas.io.formats import format as fmt 

2240 

2241 assert len(self) > 0 

2242 

2243 vals = self._internal_get_values() 

2244 fmt_values = fmt.format_array( 

2245 vals, 

2246 None, 

2247 float_format=None, 

2248 na_rep="NaN", 

2249 quoting=QUOTE_NONNUMERIC, 

2250 ) 

2251 

2252 fmt_values = [i.strip() for i in fmt_values] 

2253 joined = ", ".join(fmt_values) 

2254 result = "[" + joined + "]" 

2255 return result 

2256 

2257 def __repr__(self) -> str: 

2258 """ 

2259 String representation. 

2260 """ 

2261 footer = self._get_repr_footer() 

2262 length = len(self) 

2263 max_len = 10 

2264 if length > max_len: 

2265 # In long cases we do not display all entries, so we add Length 

2266 # information to the __repr__. 

2267 num = max_len // 2 

2268 head = self[:num]._get_values_repr() 

2269 tail = self[-(max_len - num) :]._get_values_repr() 

2270 body = f"{head[:-1]}, ..., {tail[1:]}" 

2271 length_info = f"Length: {len(self)}" 

2272 result = f"{body}\n{length_info}\n{footer}" 

2273 elif length > 0: 

2274 body = self._get_values_repr() 

2275 result = f"{body}\n{footer}" 

2276 else: 

2277 # In the empty case we use a comma instead of newline to get 

2278 # a more compact __repr__ 

2279 body = "[]" 

2280 result = f"{body}, {footer}" 

2281 

2282 return result 

2283 

2284 # ------------------------------------------------------------------ 

2285 

2286 def _validate_listlike(self, value): 

2287 # NB: here we assume scalar-like tuples have already been excluded 

2288 value = extract_array(value, extract_numpy=True) 

2289 

2290 # require identical categories set 

2291 if isinstance(value, Categorical): 

2292 if self.dtype != value.dtype: 

2293 raise TypeError( 

2294 "Cannot set a Categorical with another, " 

2295 "without identical categories" 

2296 ) 

2297 # dtype equality implies categories_match_up_to_permutation 

2298 value = self._encode_with_my_categories(value) 

2299 return value._codes 

2300 

2301 from pandas import Index 

2302 

2303 # tupleize_cols=False for e.g. test_fillna_iterable_category GH#41914 

2304 to_add = Index._with_infer(value, tupleize_cols=False).difference( 

2305 self.categories 

2306 ) 

2307 

2308 # no assignments of values not in categories, but it's always ok to set 

2309 # something to np.nan 

2310 if len(to_add) and not isna(to_add).all(): 

2311 raise TypeError( 

2312 "Cannot setitem on a Categorical with a new " 

2313 "category, set the categories first" 

2314 ) 

2315 

2316 codes = self.categories.get_indexer(value) 

2317 return codes.astype(self._ndarray.dtype, copy=False) 

2318 

2319 def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]: 

2320 """ 

2321 Compute the inverse of a categorical, returning 

2322 a dict of categories -> indexers. 

2323 

2324 *This is an internal function* 

2325 

2326 Returns 

2327 ------- 

2328 Dict[Hashable, np.ndarray[np.intp]] 

2329 dict of categories -> indexers 

2330 

2331 Examples 

2332 -------- 

2333 >>> c = pd.Categorical(list('aabca')) 

2334 >>> c 

2335 ['a', 'a', 'b', 'c', 'a'] 

2336 Categories (3, object): ['a', 'b', 'c'] 

2337 >>> c.categories 

2338 Index(['a', 'b', 'c'], dtype='object') 

2339 >>> c.codes 

2340 array([0, 0, 1, 2, 0], dtype=int8) 

2341 >>> c._reverse_indexer() 

2342 {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])} 

2343 

2344 """ 

2345 categories = self.categories 

2346 r, counts = libalgos.groupsort_indexer( 

2347 ensure_platform_int(self.codes), categories.size 

2348 ) 

2349 counts = ensure_int64(counts).cumsum() 

2350 _result = (r[start:end] for start, end in zip(counts, counts[1:])) 

2351 return dict(zip(categories, _result)) 

2352 

2353 # ------------------------------------------------------------------ 

2354 # Reductions 

2355 

def _reduce(
    self, name: str, *, skipna: bool = True, keepdims: bool = False, **kwargs
):
    """
    Run the parent class reduction, re-wrapping the result when needed.

    With ``keepdims=True`` the result is wrapped in a new array of this
    type and dtype, except for "argmax" and "argmin", whose results are
    returned as they come from the parent implementation.
    """
    result = super()._reduce(name, skipna=skipna, keepdims=keepdims, **kwargs)
    # argmax/argmin results are not category values: don't wrap in Categorical!
    if keepdims and name not in ("argmax", "argmin"):
        return type(self)(result, dtype=self.dtype)
    return result

2367 

def min(self, *, skipna: bool = True, **kwargs):
    """
    The minimum value of the object.

    Only ordered `Categoricals` have a minimum!

    Parameters
    ----------
    skipna : bool, default True
        If False and a missing value is present, NaN is returned.
    **kwargs
        Checked by the numpy-compat validators; ``axis`` defaults to 0.

    Raises
    ------
    TypeError
        If the `Categorical` is not `ordered`.

    Returns
    -------
    min : the minimum of this `Categorical`, NA value if empty
    """
    nv.validate_minmax_axis(kwargs.get("axis", 0))
    nv.validate_min((), kwargs)
    self.check_for_ordered("min")

    codes = self._codes
    if len(codes) == 0:
        return self.dtype.na_value

    valid = codes != -1
    if valid.all():
        pointer = codes.min()
    elif skipna and valid.any():
        # Reduce over the non-missing codes only.
        pointer = codes[valid].min()
    else:
        return np.nan
    return self._wrap_reduction_result(None, pointer)

2399 

def max(self, *, skipna: bool = True, **kwargs):
    """
    The maximum value of the object.

    Only ordered `Categoricals` have a maximum!

    Parameters
    ----------
    skipna : bool, default True
        If False and a missing value is present, NaN is returned.
    **kwargs
        Checked by the numpy-compat validators; ``axis`` defaults to 0.

    Raises
    ------
    TypeError
        If the `Categorical` is not `ordered`.

    Returns
    -------
    max : the maximum of this `Categorical`, NA if array is empty
    """
    nv.validate_minmax_axis(kwargs.get("axis", 0))
    nv.validate_max((), kwargs)
    self.check_for_ordered("max")

    codes = self._codes
    if len(codes) == 0:
        return self.dtype.na_value

    valid = codes != -1
    if valid.all():
        pointer = codes.max()
    elif skipna and valid.any():
        # Reduce over the non-missing codes only.
        pointer = codes[valid].max()
    else:
        return np.nan
    return self._wrap_reduction_result(None, pointer)

2431 

2432 def _mode(self, dropna: bool = True) -> Categorical: 

2433 codes = self._codes 

2434 mask = None 

2435 if dropna: 

2436 mask = self.isna() 

2437 

2438 res_codes = algorithms.mode(codes, mask=mask) 

2439 res_codes = cast(np.ndarray, res_codes) 

2440 assert res_codes.dtype == codes.dtype 

2441 res = self._from_backing_data(res_codes) 

2442 return res 

2443 

2444 # ------------------------------------------------------------------ 

2445 # ExtensionArray Interface 

2446 

def unique(self) -> Self:
    """
    Return the unique values as a ``Categorical``.

    The work is delegated to the parent class implementation.

    .. versionchanged:: 1.3.0

        Previously, unused categories were dropped from the new categories.

    Returns
    -------
    Categorical

    See Also
    --------
    pandas.unique
    CategoricalIndex.unique
    Series.unique : Return unique values of Series object.

    Examples
    --------
    >>> pd.Categorical(list("baabc")).unique()
    ['b', 'a', 'c']
    Categories (3, object): ['a', 'b', 'c']
    >>> pd.Categorical(list("baab"), categories=list("abc"), ordered=True).unique()
    ['b', 'a']
    Categories (3, object): ['a' < 'b' < 'c']
    """
    # This override exists only to carry the docstring above.
    # pylint: disable=useless-parent-delegation
    uniques = super().unique()
    return uniques

2477 

2478 def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray: 

2479 # make sure we have correct itemsize for resulting codes 

2480 assert res_values.dtype == self._ndarray.dtype 

2481 return res_values 

2482 

def equals(self, other: object) -> bool:
    """
    Returns True if categorical arrays are equal.

    ``other`` must be a Categorical whose dtype matches this one's under
    ``_categories_match_up_to_permutation``, and whose codes, re-encoded
    against this object's categories, equal this object's codes.

    Parameters
    ----------
    other : `Categorical`

    Returns
    -------
    bool
    """
    if not isinstance(other, Categorical):
        return False
    if not self._categories_match_up_to_permutation(other):
        return False

    recoded = self._encode_with_my_categories(other)
    return np.array_equal(self._codes, recoded._codes)

2501 

2502 @classmethod 

2503 def _concat_same_type(cls, to_concat: Sequence[Self], axis: AxisInt = 0) -> Self: 

2504 from pandas.core.dtypes.concat import union_categoricals 

2505 

2506 first = to_concat[0] 

2507 if axis >= first.ndim: 

2508 raise ValueError( 

2509 f"axis {axis} is out of bounds for array of dimension {first.ndim}" 

2510 ) 

2511 

2512 if axis == 1: 

2513 # Flatten, concatenate then reshape 

2514 if not all(x.ndim == 2 for x in to_concat): 

2515 raise ValueError 

2516 

2517 # pass correctly-shaped to union_categoricals 

2518 tc_flat = [] 

2519 for obj in to_concat: 

2520 tc_flat.extend([obj[:, i] for i in range(obj.shape[1])]) 

2521 

2522 res_flat = cls._concat_same_type(tc_flat, axis=0) 

2523 

2524 result = res_flat.reshape(len(first), -1, order="F") 

2525 return result 

2526 

2527 result = union_categoricals(to_concat) 

2528 return result 

2529 

2530 # ------------------------------------------------------------------ 

2531 

2532 def _encode_with_my_categories(self, other: Categorical) -> Categorical: 

2533 """ 

2534 Re-encode another categorical using this Categorical's categories. 

2535 

2536 Notes 

2537 ----- 

2538 This assumes we have already checked 

2539 self._categories_match_up_to_permutation(other). 

2540 """ 

2541 # Indexing on codes is more efficient if categories are the same, 

2542 # so we can apply some optimizations based on the degree of 

2543 # dtype-matching. 

2544 codes = recode_for_categories( 

2545 other.codes, other.categories, self.categories, copy=False 

2546 ) 

2547 return self._from_backing_data(codes) 

2548 

2549 def _categories_match_up_to_permutation(self, other: Categorical) -> bool: 

2550 """ 

2551 Returns True if categoricals are the same dtype 

2552 same categories, and same ordered 

2553 

2554 Parameters 

2555 ---------- 

2556 other : Categorical 

2557 

2558 Returns 

2559 ------- 

2560 bool 

2561 """ 

2562 return hash(self.dtype) == hash(other.dtype) 

2563 

def describe(self) -> DataFrame:
    """
    Describes this Categorical

    Returns
    -------
    description: `DataFrame`
        A dataframe with frequency and counts by category. The columns
        are "counts" and "freqs" and the index is named "categories".
    """
    from pandas import Index
    from pandas.core.reshape.concat import concat

    # Missing values are counted too (dropna=False).
    counts = self.value_counts(dropna=False)
    freqs = counts / counts.sum()

    summary = concat([counts, freqs], axis=1)
    summary.columns = Index(["counts", "freqs"])
    summary.index.name = "categories"
    return summary

2584 

def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
    """
    Check whether `values` are contained in Categorical.

    Return a boolean NumPy Array showing whether each element in
    the Categorical matches an element in the passed sequence of
    `values` exactly.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
        The sequence of values to test. Passing in a single string will
        raise a ``TypeError``. Instead, turn a single string into a
        list of one element.

    Returns
    -------
    np.ndarray[bool]

    Raises
    ------
    TypeError
      * If `values` is not a set or list-like

    See Also
    --------
    pandas.Series.isin : Equivalent method on Series.

    Examples
    --------
    >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama',
    ...                     'hippo'])
    >>> s.isin(['cow', 'lama'])
    array([ True,  True,  True, False,  True, False])

    Passing a single string as ``s.isin('lama')`` will raise an error. Use
    a list of one element instead:

    >>> s.isin(['lama'])
    array([ True, False,  True, False,  True, False])
    """
    is_null = np.asarray(isna(values))
    locs = self.categories.get_indexer_for(values)
    # Keep the codes of known categories, plus the -1 produced by null
    # inputs so that missing entries can match them.
    wanted = is_null | (locs >= 0)
    return algorithms.isin(self.codes, locs[wanted])

2630 

2631 def _replace(self, *, to_replace, value, inplace: bool = False): 

2632 from pandas import Index 

2633 

2634 orig_dtype = self.dtype 

2635 

2636 inplace = validate_bool_kwarg(inplace, "inplace") 

2637 cat = self if inplace else self.copy() 

2638 

2639 mask = isna(np.asarray(value)) 

2640 if mask.any(): 

2641 removals = np.asarray(to_replace)[mask] 

2642 removals = cat.categories[cat.categories.isin(removals)] 

2643 new_cat = cat.remove_categories(removals) 

2644 NDArrayBacked.__init__(cat, new_cat.codes, new_cat.dtype) 

2645 

2646 ser = cat.categories.to_series() 

2647 ser = ser.replace(to_replace=to_replace, value=value) 

2648 

2649 all_values = Index(ser) 

2650 

2651 # GH51016: maintain order of existing categories 

2652 idxr = cat.categories.get_indexer_for(all_values) 

2653 locs = np.arange(len(ser)) 

2654 locs = np.where(idxr == -1, locs, idxr) 

2655 locs = locs.argsort() 

2656 

2657 new_categories = ser.take(locs) 

2658 new_categories = new_categories.drop_duplicates(keep="first") 

2659 new_categories = Index(new_categories) 

2660 new_codes = recode_for_categories( 

2661 cat._codes, all_values, new_categories, copy=False 

2662 ) 

2663 new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered) 

2664 NDArrayBacked.__init__(cat, new_codes, new_dtype) 

2665 

2666 if new_dtype != orig_dtype: 

2667 warnings.warn( 

2668 # GH#55147 

2669 "The behavior of Series.replace (and DataFrame.replace) with " 

2670 "CategoricalDtype is deprecated. In a future version, replace " 

2671 "will only be used for cases that preserve the categories. " 

2672 "To change the categories, use ser.cat.rename_categories " 

2673 "instead.", 

2674 FutureWarning, 

2675 stacklevel=find_stack_level(), 

2676 ) 

2677 if not inplace: 

2678 return cat 

2679 

2680 # ------------------------------------------------------------------------ 

2681 # String methods interface 

2682 def _str_map( 

2683 self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True 

2684 ): 

2685 # Optimization to apply the callable `f` to the categories once 

2686 # and rebuild the result by `take`ing from the result with the codes. 

2687 # Returns the same type as the object-dtype implementation though. 

2688 from pandas.core.arrays import NumpyExtensionArray 

2689 

2690 categories = self.categories 

2691 codes = self.codes 

2692 result = NumpyExtensionArray(categories.to_numpy())._str_map(f, na_value, dtype) 

2693 return take_nd(result, codes, fill_value=na_value) 

2694 

2695 def _str_get_dummies(self, sep: str = "|"): 

2696 # sep may not be in categories. Just bail on this. 

2697 from pandas.core.arrays import NumpyExtensionArray 

2698 

2699 return NumpyExtensionArray(self.astype(str))._str_get_dummies(sep) 

2700 

2701 # ------------------------------------------------------------------------ 

2702 # GroupBy Methods 

2703 

2704 def _groupby_op( 

2705 self, 

2706 *, 

2707 how: str, 

2708 has_dropped_na: bool, 

2709 min_count: int, 

2710 ngroups: int, 

2711 ids: npt.NDArray[np.intp], 

2712 **kwargs, 

2713 ): 

2714 from pandas.core.groupby.ops import WrappedCythonOp 

2715 

2716 kind = WrappedCythonOp.get_kind_from_how(how) 

2717 op = WrappedCythonOp(how=how, kind=kind, has_dropped_na=has_dropped_na) 

2718 

2719 dtype = self.dtype 

2720 if how in ["sum", "prod", "cumsum", "cumprod", "skew"]: 

2721 raise TypeError(f"{dtype} type does not support {how} operations") 

2722 if how in ["min", "max", "rank", "idxmin", "idxmax"] and not dtype.ordered: 

2723 # raise TypeError instead of NotImplementedError to ensure we 

2724 # don't go down a group-by-group path, since in the empty-groups 

2725 # case that would fail to raise 

2726 raise TypeError(f"Cannot perform {how} with non-ordered Categorical") 

2727 if how not in [ 

2728 "rank", 

2729 "any", 

2730 "all", 

2731 "first", 

2732 "last", 

2733 "min", 

2734 "max", 

2735 "idxmin", 

2736 "idxmax", 

2737 ]: 

2738 if kind == "transform": 

2739 raise TypeError(f"{dtype} type does not support {how} operations") 

2740 raise TypeError(f"{dtype} dtype does not support aggregation '{how}'") 

2741 

2742 result_mask = None 

2743 mask = self.isna() 

2744 if how == "rank": 

2745 assert self.ordered # checked earlier 

2746 npvalues = self._ndarray 

2747 elif how in ["first", "last", "min", "max", "idxmin", "idxmax"]: 

2748 npvalues = self._ndarray 

2749 result_mask = np.zeros(ngroups, dtype=bool) 

2750 else: 

2751 # any/all 

2752 npvalues = self.astype(bool) 

2753 

2754 res_values = op._cython_op_ndim_compat( 

2755 npvalues, 

2756 min_count=min_count, 

2757 ngroups=ngroups, 

2758 comp_ids=ids, 

2759 mask=mask, 

2760 result_mask=result_mask, 

2761 **kwargs, 

2762 ) 

2763 

2764 if how in op.cast_blocklist: 

2765 return res_values 

2766 elif how in ["first", "last", "min", "max"]: 

2767 res_values[result_mask == 1] = -1 

2768 return self._from_backing_data(res_values) 

2769 

2770 

2771# The Series.cat accessor 

2772 

2773 

@delegate_names(
    delegate=Categorical, accessors=["categories", "ordered"], typ="property"
)
@delegate_names(
    delegate=Categorical,
    accessors=[
        "rename_categories",
        "reorder_categories",
        "add_categories",
        "remove_categories",
        "remove_unused_categories",
        "set_categories",
        "as_ordered",
        "as_unordered",
    ],
    typ="method",
)
class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
    """
    Accessor object for categorical properties of the Series values.

    Property and method access is forwarded to the underlying
    Categorical. Non-None method results are wrapped in a Series that
    reuses the parent's index and name.

    Parameters
    ----------
    data : Series or CategoricalIndex

    Examples
    --------
    >>> s = pd.Series(list("abbccc")).astype("category")
    >>> s
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.categories
    Index(['a', 'b', 'c'], dtype='object')

    >>> s.cat.rename_categories(list("cba"))
    0    c
    1    b
    2    b
    3    a
    4    a
    5    a
    dtype: category
    Categories (3, object): ['c', 'b', 'a']

    >>> s.cat.reorder_categories(list("cba"))
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['c', 'b', 'a']

    >>> s.cat.add_categories(["d", "e"])
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (5, object): ['a', 'b', 'c', 'd', 'e']

    >>> s.cat.remove_categories(["a", "c"])
    0    NaN
    1      b
    2      b
    3    NaN
    4    NaN
    5    NaN
    dtype: category
    Categories (1, object): ['b']

    >>> s1 = s.cat.add_categories(["d", "e"])
    >>> s1.cat.remove_unused_categories()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.set_categories(list("abcde"))
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (5, object): ['a', 'b', 'c', 'd', 'e']

    >>> s.cat.as_ordered()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a' < 'b' < 'c']

    >>> s.cat.as_unordered()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']
    """

    def __init__(self, data) -> None:
        self._validate(data)
        # Keep the values plus the index/name needed to re-wrap results.
        self._parent = data.values
        self._index = data.index
        self._name = data.name
        self._freeze()

    @staticmethod
    def _validate(data):
        """Raise AttributeError unless ``data`` has a categorical dtype."""
        if isinstance(data.dtype, CategoricalDtype):
            return
        raise AttributeError("Can only use .cat accessor with a 'category' dtype")

    def _delegate_property_get(self, name: str):
        """Read attribute ``name`` from the underlying values."""
        return getattr(self._parent, name)

    # error: Signature of "_delegate_property_set" incompatible with supertype
    # "PandasDelegate"
    def _delegate_property_set(self, name: str, new_values):  # type: ignore[override]
        """Set attribute ``name`` on the underlying values."""
        return setattr(self._parent, name, new_values)

    @property
    def codes(self) -> Series:
        """
        Return Series of codes as well as the index.

        Examples
        --------
        >>> raw_cate = pd.Categorical(["a", "b", "c", "a"], categories=["a", "b"])
        >>> ser = pd.Series(raw_cate)
        >>> ser.cat.codes
        0   0
        1   1
        2  -1
        3   0
        dtype: int8
        """
        from pandas import Series

        return Series(self._parent.codes, index=self._index)

    def _delegate_method(self, name: str, *args, **kwargs):
        """
        Call method ``name`` on the underlying values.

        A non-None result is returned as a Series carrying the parent's
        index and name; a None result is returned as None.
        """
        from pandas import Series

        outcome = getattr(self._parent, name)(*args, **kwargs)
        if outcome is None:
            return None
        return Series(outcome, index=self._index, name=self._name)

2944 

2945 

2946# utility routines 

2947 

2948 

2949def _get_codes_for_values( 

2950 values: Index | Series | ExtensionArray | np.ndarray, 

2951 categories: Index, 

2952) -> np.ndarray: 

2953 """ 

2954 utility routine to turn values into codes given the specified categories 

2955 

2956 If `values` is known to be a Categorical, use recode_for_categories instead. 

2957 """ 

2958 codes = categories.get_indexer_for(values) 

2959 return coerce_indexer_dtype(codes, categories) 

2960 

2961 

def recode_for_categories(
    codes: np.ndarray, old_categories, new_categories, copy: bool = True
) -> np.ndarray:
    """
    Convert a set of codes to a new set of categories.

    Parameters
    ----------
    codes : np.ndarray
    old_categories, new_categories : Index
    copy: bool, default True
        Whether to copy if the codes are unchanged.

    Returns
    -------
    new_codes : np.ndarray
        Codes that refer to ``new_categories``; values whose category is
        absent from ``new_categories`` become -1. When recoding takes
        place the dtype comes from ``coerce_indexer_dtype`` (int8 in the
        example below).

    Examples
    --------
    >>> old_cat = pd.Index(['b', 'a', 'c'])
    >>> new_cat = pd.Index(['a', 'b'])
    >>> codes = np.array([0, 1, 1, 2])
    >>> recode_for_categories(codes, old_cat, new_cat)
    array([ 1,  0,  0, -1], dtype=int8)
    """
    # Nothing to recode when there were no categories (all codes are null
    # anyway) or when the categories are unchanged.
    unchanged = len(old_categories) == 0 or new_categories.equals(old_categories)
    if unchanged:
        return codes.copy() if copy else codes

    # Position of each old category within the new categories (-1 if absent).
    old_to_new = coerce_indexer_dtype(
        new_categories.get_indexer_for(old_categories), new_categories
    )
    return take_nd(old_to_new, codes, fill_value=-1)

3003 

3004 

3005def factorize_from_iterable(values) -> tuple[np.ndarray, Index]: 

3006 """ 

3007 Factorize an input `values` into `categories` and `codes`. Preserves 

3008 categorical dtype in `categories`. 

3009 

3010 Parameters 

3011 ---------- 

3012 values : list-like 

3013 

3014 Returns 

3015 ------- 

3016 codes : ndarray 

3017 categories : Index 

3018 If `values` has a categorical dtype, then `categories` is 

3019 a CategoricalIndex keeping the categories and order of `values`. 

3020 """ 

3021 from pandas import CategoricalIndex 

3022 

3023 if not is_list_like(values): 

3024 raise TypeError("Input must be list-like") 

3025 

3026 categories: Index 

3027 

3028 vdtype = getattr(values, "dtype", None) 

3029 if isinstance(vdtype, CategoricalDtype): 

3030 values = extract_array(values) 

3031 # The Categorical we want to build has the same categories 

3032 # as values but its codes are by def [0, ..., len(n_categories) - 1] 

3033 cat_codes = np.arange(len(values.categories), dtype=values.codes.dtype) 

3034 cat = Categorical.from_codes(cat_codes, dtype=values.dtype, validate=False) 

3035 

3036 categories = CategoricalIndex(cat) 

3037 codes = values.codes 

3038 else: 

3039 # The value of ordered is irrelevant since we don't use cat as such, 

3040 # but only the resulting categories, the order of which is independent 

3041 # from ordered. Set ordered to False as default. See GH #15457 

3042 cat = Categorical(values, ordered=False) 

3043 categories = cat.categories 

3044 codes = cat.codes 

3045 return codes, categories 

3046 

3047 

def factorize_from_iterables(iterables) -> tuple[list[np.ndarray], list[Index]]:
    """
    A higher-level wrapper over `factorize_from_iterable`.

    Parameters
    ----------
    iterables : list-like of list-likes

    Returns
    -------
    codes : list of ndarrays
    categories : list of Indexes

    Notes
    -----
    See `factorize_from_iterable` for more info.
    """
    if len(iterables) == 0:
        # For consistency, it should return two empty lists.
        return [], []

    # Factorize each input, then split the (codes, categories) pairs apart.
    pairs = [factorize_from_iterable(it) for it in iterables]
    codes, categories = zip(*pairs)
    return list(codes), list(categories)