Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/arrays/categorical.py: 22%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

725 statements  

1from __future__ import annotations 

2 

3from csv import QUOTE_NONNUMERIC 

4from functools import partial 

5import operator 

6from shutil import get_terminal_size 

7from typing import ( 

8 TYPE_CHECKING, 

9 Hashable, 

10 Iterator, 

11 Literal, 

12 Sequence, 

13 TypeVar, 

14 cast, 

15 overload, 

16) 

17 

18import numpy as np 

19 

20from pandas._config import get_option 

21 

22from pandas._libs import ( 

23 NaT, 

24 algos as libalgos, 

25 lib, 

26) 

27from pandas._libs.arrays import NDArrayBacked 

28from pandas._typing import ( 

29 ArrayLike, 

30 AstypeArg, 

31 AxisInt, 

32 Dtype, 

33 NpDtype, 

34 Ordered, 

35 Shape, 

36 SortKind, 

37 npt, 

38 type_t, 

39) 

40from pandas.compat.numpy import function as nv 

41from pandas.util._validators import validate_bool_kwarg 

42 

43from pandas.core.dtypes.cast import ( 

44 coerce_indexer_dtype, 

45 find_common_type, 

46) 

47from pandas.core.dtypes.common import ( 

48 ensure_int64, 

49 ensure_platform_int, 

50 is_any_real_numeric_dtype, 

51 is_bool_dtype, 

52 is_categorical_dtype, 

53 is_datetime64_dtype, 

54 is_dict_like, 

55 is_dtype_equal, 

56 is_extension_array_dtype, 

57 is_hashable, 

58 is_integer_dtype, 

59 is_list_like, 

60 is_scalar, 

61 is_timedelta64_dtype, 

62 needs_i8_conversion, 

63 pandas_dtype, 

64) 

65from pandas.core.dtypes.dtypes import ( 

66 CategoricalDtype, 

67 ExtensionDtype, 

68) 

69from pandas.core.dtypes.generic import ( 

70 ABCIndex, 

71 ABCSeries, 

72) 

73from pandas.core.dtypes.missing import ( 

74 is_valid_na_for_dtype, 

75 isna, 

76) 

77 

78from pandas.core import ( 

79 algorithms, 

80 arraylike, 

81 ops, 

82) 

83from pandas.core.accessor import ( 

84 PandasDelegate, 

85 delegate_names, 

86) 

87from pandas.core.algorithms import ( 

88 factorize, 

89 take_nd, 

90) 

91from pandas.core.arrays._mixins import ( 

92 NDArrayBackedExtensionArray, 

93 ravel_compat, 

94) 

95from pandas.core.base import ( 

96 ExtensionArray, 

97 NoNewAttributesMixin, 

98 PandasObject, 

99) 

100import pandas.core.common as com 

101from pandas.core.construction import ( 

102 extract_array, 

103 sanitize_array, 

104) 

105from pandas.core.ops.common import unpack_zerodim_and_defer 

106from pandas.core.sorting import nargsort 

107from pandas.core.strings.object_array import ObjectStringArrayMixin 

108 

109from pandas.io.formats import console 

110 

111if TYPE_CHECKING: 

112 from pandas import ( 

113 DataFrame, 

114 Index, 

115 Series, 

116 ) 

117 

118 

119CategoricalT = TypeVar("CategoricalT", bound="Categorical") 

120 

121 

def _cat_compare_op(op):
    """
    Build the comparison method ``__<op.__name__>__`` for ``Categorical``.

    Parameters
    ----------
    op : callable
        A binary comparison function exposing ``__name__`` (for example
        ``operator.eq``); its name determines the dunder name of the result.

    Returns
    -------
    function
        A method ``func(self, other)`` whose ``__name__`` is set to the
        dunder name and which is wrapped by ``unpack_zerodim_and_defer``.
    """
    opname = f"__{op.__name__}__"
    # Value written wherever a code is -1 (missing): True only for ``!=``.
    fill_value = op is operator.ne

    @unpack_zerodim_and_defer(opname)
    def func(self, other):
        hashable = is_hashable(other)
        if is_list_like(other) and len(other) != len(self) and not hashable:
            # in hashable case we may have a tuple that is itself a category
            raise ValueError("Lengths must match.")

        if not self.ordered:
            if opname in ["__lt__", "__gt__", "__le__", "__ge__"]:
                raise TypeError(
                    "Unordered Categoricals can only compare equality or not"
                )
        if isinstance(other, Categorical):
            # Two Categoricals can only be compared if the categories are
            # the same (maybe up to ordering, depending on ordered)

            msg = "Categoricals can only be compared if 'categories' are the same."
            if not self._categories_match_up_to_permutation(other):
                raise TypeError(msg)

            if not self.ordered and not self.categories.equals(other.categories):
                # both unordered and different order: express ``other``'s codes
                # in terms of ``self.categories`` before comparing codes
                other_codes = recode_for_categories(
                    other.codes, other.categories, self.categories, copy=False
                )
            else:
                other_codes = other._codes

            ret = op(self._codes, other_codes)
            # A -1 code on either side marks a missing value at that position.
            mask = (self._codes == -1) | (other_codes == -1)
            if mask.any():
                ret[mask] = fill_value
            return ret

        if hashable:
            if other in self.categories:
                i = self._unbox_scalar(other)
                ret = op(self._codes, i)

                if opname not in {"__eq__", "__ge__", "__gt__"}:
                    # GH#29820 performance trick; per the original note the
                    # code ``i`` of a present category is always >= 0, so for
                    # __eq__, __ge__ and __gt__ comparing a -1 code against it
                    # already yields False (== fill_value) and the masking
                    # below would be a no-op. It is therefore only applied
                    # for the remaining ops.
                    mask = self._codes == -1
                    ret[mask] = fill_value
                return ret
            else:
                return ops.invalid_comparison(self, other, op)
        else:
            # allow categorical vs object dtype array comparisons for equality
            # these are only positional comparisons
            if opname not in ["__eq__", "__ne__"]:
                raise TypeError(
                    f"Cannot compare a Categorical for op {opname} with "
                    f"type {type(other)}.\nIf you want to compare values, "
                    "use 'np.asarray(cat) <op> other'."
                )

            if isinstance(other, ExtensionArray) and needs_i8_conversion(other.dtype):
                # We would return NotImplemented here, but that messes up
                #  ExtensionIndex's wrapped methods
                return op(other, self)
            return getattr(np.array(self), opname)(np.array(other))

    func.__name__ = opname

    return func

193 

194 

def contains(cat, key, container) -> bool:
    """
    Check whether ``key`` is a category of ``cat`` that occurs in ``container``.

    Shared helper behind :meth:`Categorical.__contains__` and
    :meth:`CategoricalIndex.__contains__`.

    Parameters
    ----------
    cat : :class:`Categorical` or :class:`CategoricalIndex`
        Object whose ``categories`` are searched for ``key``.
    key : a hashable object
        The key to check membership for.
    container : Container (e.g. list-like or mapping)
        The container that must hold the location of ``key``.

    Returns
    -------
    is_in : bool
        True if ``key`` is in ``cat.categories`` and the location of
        ``key`` in ``categories`` is in ``container``, else False.

    Raises
    ------
    TypeError
        If ``key`` is not hashable.

    Notes
    -----
    This helper does not check for NaN values. Do that separately
    before calling it.
    """
    # Unhashable keys are rejected up front with the TypeError from hash().
    hash(key)

    # A failed lookup means the key is not a category at all, hence it
    # cannot be present in the container either.
    try:
        position = cat.categories.get_loc(key)
    except (KeyError, TypeError):
        return False

    # ``position`` is where the key sits in ``categories``, which is also the
    # *value* stored for it in ``container``. A key can therefore be a known
    # category and still be absent from the container, e.g.
    #   'b' in Categorical(['a'], categories=['a', 'b'])  # False
    if not is_scalar(position):
        # if categories is an IntervalIndex, the lookup yields an array.
        for candidate in position:
            if candidate in container:
                return True
        return False
    return position in container

244 

245 

246class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin): 

247 """ 

248 Represent a categorical variable in classic R / S-plus fashion. 

249 

250 `Categoricals` can only take on a limited, and usually fixed, number 

251 of possible values (`categories`). In contrast to statistical categorical 

252 variables, a `Categorical` might have an order, but numerical operations 

253 (additions, divisions, ...) are not possible. 

254 

255 All values of the `Categorical` are either in `categories` or `np.nan`. 

256 Assigning values outside of `categories` will raise a `ValueError`. Order 

257 is defined by the order of the `categories`, not lexical order of the 

258 values. 

259 

260 Parameters 

261 ---------- 

262 values : list-like 

263 The values of the categorical. If categories are given, values not in 

264 categories will be replaced with NaN. 

265 categories : Index-like (unique), optional 

266 The unique categories for this categorical. If not given, the 

267 categories are assumed to be the unique values of `values` (sorted, if 

268 possible, otherwise in the order in which they appear). 

269 ordered : bool, default False 

270 Whether or not this categorical is treated as a ordered categorical. 

271 If True, the resulting categorical will be ordered. 

272 An ordered categorical respects, when sorted, the order of its 

273 `categories` attribute (which in turn is the `categories` argument, if 

274 provided). 

275 dtype : CategoricalDtype 

276 An instance of ``CategoricalDtype`` to use for this categorical. 

277 

278 Attributes 

279 ---------- 

280 categories : Index 

281 The categories of this categorical 

282 codes : ndarray 

283 The codes (integer positions, which point to the categories) of this 

284 categorical, read only. 

285 ordered : bool 

286 Whether or not this Categorical is ordered. 

287 dtype : CategoricalDtype 

288 The instance of ``CategoricalDtype`` storing the ``categories`` 

289 and ``ordered``. 

290 

291 Methods 

292 ------- 

293 from_codes 

294 __array__ 

295 

296 Raises 

297 ------ 

298 ValueError 

299 If the categories do not validate. 

300 TypeError 

301 If an explicit ``ordered=True`` is given but no `categories` and the 

302 `values` are not sortable. 

303 

304 See Also 

305 -------- 

306 CategoricalDtype : Type for categorical data. 

307 CategoricalIndex : An Index with an underlying ``Categorical``. 

308 

309 Notes 

310 ----- 

311 See the `user guide 

312 <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`__ 

313 for more. 

314 

315 Examples 

316 -------- 

317 >>> pd.Categorical([1, 2, 3, 1, 2, 3]) 

318 [1, 2, 3, 1, 2, 3] 

319 Categories (3, int64): [1, 2, 3] 

320 

321 >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c']) 

322 ['a', 'b', 'c', 'a', 'b', 'c'] 

323 Categories (3, object): ['a', 'b', 'c'] 

324 

325 Missing values are not included as a category. 

326 

327 >>> c = pd.Categorical([1, 2, 3, 1, 2, 3, np.nan]) 

328 >>> c 

329 [1, 2, 3, 1, 2, 3, NaN] 

330 Categories (3, int64): [1, 2, 3] 

331 

332 However, their presence is indicated in the `codes` attribute 

333 by code `-1`. 

334 

335 >>> c.codes 

336 array([ 0, 1, 2, 0, 1, 2, -1], dtype=int8) 

337 

338 Ordered `Categoricals` can be sorted according to the custom order 

339 of the categories and can have a min and max value. 

340 

341 >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True, 

342 ... categories=['c', 'b', 'a']) 

343 >>> c 

344 ['a', 'b', 'c', 'a', 'b', 'c'] 

345 Categories (3, object): ['c' < 'b' < 'a'] 

346 >>> c.min() 

347 'c' 

348 """ 

349 

350 # For comparisons, so that numpy uses our implementation if the compare 

351 # ops, which raise 

352 __array_priority__ = 1000 

353 # tolist is not actually deprecated, just suppressed in the __dir__ 

354 _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"]) 

355 _typ = "categorical" 

356 

357 _dtype: CategoricalDtype 

358 

    def __init__(
        self,
        values,
        categories=None,
        ordered=None,
        dtype: Dtype | None = None,
        fastpath: bool = False,
        copy: bool = True,
    ) -> None:
        """
        Build the codes and dtype for this Categorical from ``values``.

        Parameters
        ----------
        values : list-like
            Values to encode. When ``fastpath`` is True they are used as the
            codes themselves.
        categories : optional
            Forwarded to ``CategoricalDtype._from_values_or_dtype``.
        ordered : optional
            Forwarded to ``CategoricalDtype._from_values_or_dtype``.
        dtype : optional
            Forwarded to ``CategoricalDtype._from_values_or_dtype``.
        fastpath : bool, default False
            Skip sanitizing and factorizing; ``values`` are coerced to an
            indexer dtype and stored directly.
        copy : bool, default True
            Passed to ``recode_for_categories`` when ``values`` is already
            categorical and explicit categories are known.

        Raises
        ------
        TypeError
            If ``values`` is not list-like, or if ordering was requested
            without categories and the values cannot be sorted.
        NotImplementedError
            If ``values`` is an ndarray with more than one dimension.
        """
        dtype = CategoricalDtype._from_values_or_dtype(
            values, categories, ordered, dtype
        )
        # At this point, dtype is always a CategoricalDtype, but
        # we may have dtype.categories be None, and we need to
        # infer categories in a factorization step further below

        if fastpath:
            # ``values`` already are codes: only fix up their integer dtype.
            codes = coerce_indexer_dtype(values, dtype.categories)
            dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
            super().__init__(codes, dtype)
            return

        if not is_list_like(values):
            # GH#38433
            raise TypeError("Categorical input must be list-like")

        # null_mask indicates missing values we want to exclude from inference.
        # This means: only missing values in list-likes (not arrays/ndframes).
        null_mask = np.array(False)

        # sanitize input
        if is_categorical_dtype(values):
            if dtype.categories is None:
                # No categories requested: reuse those of the input.
                dtype = CategoricalDtype(values.categories, dtype.ordered)
        elif not isinstance(values, (ABCIndex, ABCSeries, ExtensionArray)):
            values = com.convert_to_list_like(values)
            if isinstance(values, list) and len(values) == 0:
                # By convention, empty lists result in object dtype:
                values = np.array([], dtype=object)
            elif isinstance(values, np.ndarray):
                if values.ndim > 1:
                    # preempt sanitize_array from raising ValueError
                    raise NotImplementedError(
                        "> 1 ndim Categorical are not supported at this time"
                    )
                values = sanitize_array(values, None)
            else:
                # i.e. must be a list
                arr = sanitize_array(values, None)
                null_mask = isna(arr)
                if null_mask.any():
                    # We remove null values here, then below will re-insert
                    #  them, grep "full_codes"
                    arr_list = [values[idx] for idx in np.where(~null_mask)[0]]

                    # GH#44900 Do not cast to float if we have only missing values
                    if arr_list or arr.dtype == "object":
                        sanitize_dtype = None
                    else:
                        sanitize_dtype = arr.dtype

                    arr = sanitize_array(arr_list, None, dtype=sanitize_dtype)
                values = arr

        if dtype.categories is None:
            # Infer categories: try a sorted factorization first and fall
            # back to order of appearance when the values are not sortable.
            try:
                codes, categories = factorize(values, sort=True)
            except TypeError as err:
                codes, categories = factorize(values, sort=False)
                if dtype.ordered:
                    # raise, as we don't have a sortable data structure and so
                    # the user should give us one by specifying categories
                    raise TypeError(
                        "'values' is not ordered, please "
                        "explicitly specify the categories order "
                        "by passing in a categories argument."
                    ) from err

            # we're inferring from values
            dtype = CategoricalDtype(categories, dtype.ordered)

        elif is_categorical_dtype(values.dtype):
            # Categorical input with explicit target categories: translate
            # the existing codes instead of re-encoding the values.
            old_codes = extract_array(values)._codes
            codes = recode_for_categories(
                old_codes, values.dtype.categories, dtype.categories, copy=copy
            )

        else:
            codes = _get_codes_for_values(values, dtype.categories)

        if null_mask.any():
            # Reinsert -1 placeholders for previously removed missing values
            full_codes = -np.ones(null_mask.shape, dtype=codes.dtype)
            full_codes[~null_mask] = codes
            codes = full_codes

        dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
        arr = coerce_indexer_dtype(codes, dtype.categories)
        super().__init__(arr, dtype)

458 

459 @property 

460 def dtype(self) -> CategoricalDtype: 

461 """ 

462 The :class:`~pandas.api.types.CategoricalDtype` for this instance. 

463 """ 

464 return self._dtype 

465 

466 @property 

467 def _internal_fill_value(self) -> int: 

468 # using the specific numpy integer instead of python int to get 

469 # the correct dtype back from _quantile in the all-NA case 

470 dtype = self._ndarray.dtype 

471 return dtype.type(-1) 

472 

473 @classmethod 

474 def _from_sequence( 

475 cls, scalars, *, dtype: Dtype | None = None, copy: bool = False 

476 ) -> Categorical: 

477 return Categorical(scalars, dtype=dtype, copy=copy) 

478 

    # Typing-only overloads: the return type depends on the kind of ``dtype``.
    @overload
    def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
        ...

    @overload
    def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray:
        ...

    @overload
    def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike:
        ...

    def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
        """
        Coerce this type to another dtype

        Parameters
        ----------
        dtype : numpy dtype or pandas type
        copy : bool, default True
            By default, astype always returns a newly allocated object.
            If copy is set to False and ``dtype`` is this array's own dtype
            object, the original object is returned.

        Returns
        -------
        ArrayLike
            The converted values.

        Raises
        ------
        ValueError
            If an integer dtype is requested while missing values are
            present, or if the categories cannot be cast to ``dtype``.
        """
        dtype = pandas_dtype(dtype)
        if self.dtype is dtype:
            # Identical dtype object: nothing to convert.
            result = self.copy() if copy else self

        elif is_categorical_dtype(dtype):
            dtype = cast(CategoricalDtype, dtype)

            # GH 10696/18593/18630
            dtype = self.dtype.update_dtype(dtype)
            self = self.copy() if copy else self
            result = self._set_dtype(dtype)

        elif isinstance(dtype, ExtensionDtype):
            return super().astype(dtype, copy=copy)

        elif is_integer_dtype(dtype) and self.isna().any():
            raise ValueError("Cannot convert float NaN to integer")

        elif len(self.codes) == 0 or len(self.categories) == 0:
            # No values or no categories: convert through np.array directly.
            result = np.array(
                self,
                dtype=dtype,
                copy=copy,
            )

        else:
            # GH8628 (PERF): astype category codes instead of astyping array
            new_cats = self.categories._values

            try:
                new_cats = new_cats.astype(dtype=dtype, copy=copy)
                fill_value = self.categories._na_value
                if not is_valid_na_for_dtype(fill_value, dtype):
                    # The categories' NA marker is not valid for the target
                    # dtype, so cast the marker itself to that dtype.
                    fill_value = lib.item_from_zerodim(
                        np.array(self.categories._na_value).astype(dtype)
                    )
            except (
                TypeError,  # downstream error msg for CategoricalIndex is misleading
                ValueError,
            ):
                msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}"
                raise ValueError(msg)

            # Expand the converted categories back out through the codes;
            # ``fill_value`` is handed to take_nd for the missing positions.
            result = take_nd(
                new_cats, ensure_platform_int(self._codes), fill_value=fill_value
            )

        return result

551 

552 def to_list(self): 

553 """ 

554 Alias for tolist. 

555 """ 

556 return self.tolist() 

557 

    @classmethod
    def _from_inferred_categories(
        cls, inferred_categories, inferred_codes, dtype, true_values=None
    ):
        """
        Construct a Categorical from inferred values.

        For inferred categories (`dtype` is None) the categories are sorted.
        For explicit `dtype`, the `inferred_categories` are cast to the
        appropriate type.

        Parameters
        ----------
        inferred_categories : Index
        inferred_codes : Index
        dtype : CategoricalDtype or 'category'
        true_values : list, optional
            Only used when ``dtype.categories`` is boolean. If none are
            provided, the default ones are "True", "TRUE", and "true".

        Returns
        -------
        Categorical
        """
        # Local import, kept inside the method as in the original code.
        from pandas import (
            Index,
            to_datetime,
            to_numeric,
            to_timedelta,
        )

        cats = Index(inferred_categories)
        known_categories = (
            isinstance(dtype, CategoricalDtype) and dtype.categories is not None
        )

        if known_categories:
            # Convert to a specialized type with `dtype` if specified.
            if is_any_real_numeric_dtype(dtype.categories):
                cats = to_numeric(inferred_categories, errors="coerce")
            elif is_datetime64_dtype(dtype.categories):
                cats = to_datetime(inferred_categories, errors="coerce")
            elif is_timedelta64_dtype(dtype.categories):
                cats = to_timedelta(inferred_categories, errors="coerce")
            elif is_bool_dtype(dtype.categories):
                if true_values is None:
                    true_values = ["True", "TRUE", "true"]

                # error: Incompatible types in assignment (expression has type
                # "ndarray", variable has type "Index")
                cats = cats.isin(true_values)  # type: ignore[assignment]

        if known_categories:
            # Recode from observation order to dtype.categories order.
            categories = dtype.categories
            codes = recode_for_categories(inferred_codes, cats, categories)
        elif not cats.is_monotonic_increasing:
            # Sort categories and recode for unknown categories.
            unsorted = cats.copy()
            categories = cats.sort_values()

            codes = recode_for_categories(inferred_codes, unsorted, categories)
            dtype = CategoricalDtype(categories, ordered=False)
        else:
            # Already monotonic increasing: the inferred codes are usable as is.
            dtype = CategoricalDtype(cats, ordered=False)
            codes = inferred_codes

        return cls(codes, dtype=dtype, fastpath=True)

626 

627 @classmethod 

628 def from_codes( 

629 cls, codes, categories=None, ordered=None, dtype: Dtype | None = None 

630 ) -> Categorical: 

631 """ 

632 Make a Categorical type from codes and categories or dtype. 

633 

634 This constructor is useful if you already have codes and 

635 categories/dtype and so do not need the (computation intensive) 

636 factorization step, which is usually done on the constructor. 

637 

638 If your data does not follow this convention, please use the normal 

639 constructor. 

640 

641 Parameters 

642 ---------- 

643 codes : array-like of int 

644 An integer array, where each integer points to a category in 

645 categories or dtype.categories, or else is -1 for NaN. 

646 categories : index-like, optional 

647 The categories for the categorical. Items need to be unique. 

648 If the categories are not given here, then they must be provided 

649 in `dtype`. 

650 ordered : bool, optional 

651 Whether or not this categorical is treated as an ordered 

652 categorical. If not given here or in `dtype`, the resulting 

653 categorical will be unordered. 

654 dtype : CategoricalDtype or "category", optional 

655 If :class:`CategoricalDtype`, cannot be used together with 

656 `categories` or `ordered`. 

657 

658 Returns 

659 ------- 

660 Categorical 

661 

662 Examples 

663 -------- 

664 >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True) 

665 >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype) 

666 ['a', 'b', 'a', 'b'] 

667 Categories (2, object): ['a' < 'b'] 

668 """ 

669 dtype = CategoricalDtype._from_values_or_dtype( 

670 categories=categories, ordered=ordered, dtype=dtype 

671 ) 

672 if dtype.categories is None: 

673 msg = ( 

674 "The categories must be provided in 'categories' or " 

675 "'dtype'. Both were None." 

676 ) 

677 raise ValueError(msg) 

678 

679 if is_extension_array_dtype(codes) and is_integer_dtype(codes): 

680 # Avoid the implicit conversion of Int to object 

681 if isna(codes).any(): 

682 raise ValueError("codes cannot contain NA values") 

683 codes = codes.to_numpy(dtype=np.int64) 

684 else: 

685 codes = np.asarray(codes) 

686 if len(codes) and not is_integer_dtype(codes): 

687 raise ValueError("codes need to be array-like integers") 

688 

689 if len(codes) and (codes.max() >= len(dtype.categories) or codes.min() < -1): 

690 raise ValueError("codes need to be between -1 and len(categories)-1") 

691 

692 return cls(codes, dtype=dtype, fastpath=True) 

693 

694 # ------------------------------------------------------------------ 

695 # Categories/Codes/Ordered 

696 

697 @property 

698 def categories(self) -> Index: 

699 """ 

700 The categories of this categorical. 

701 

702 Setting assigns new values to each category (effectively a rename of 

703 each individual category). 

704 

705 The assigned value has to be a list-like object. All items must be 

706 unique and the number of items in the new categories must be the same 

707 as the number of items in the old categories. 

708 

709 Raises 

710 ------ 

711 ValueError 

712 If the new categories do not validate as categories or if the 

713 number of new categories is unequal the number of old categories 

714 

715 See Also 

716 -------- 

717 rename_categories : Rename categories. 

718 reorder_categories : Reorder categories. 

719 add_categories : Add new categories. 

720 remove_categories : Remove the specified categories. 

721 remove_unused_categories : Remove categories which are not used. 

722 set_categories : Set the categories to the specified ones. 

723 """ 

724 return self.dtype.categories 

725 

726 @property 

727 def ordered(self) -> Ordered: 

728 """ 

729 Whether the categories have an ordered relationship. 

730 """ 

731 return self.dtype.ordered 

732 

733 @property 

734 def codes(self) -> np.ndarray: 

735 """ 

736 The category codes of this categorical. 

737 

738 Codes are an array of integers which are the positions of the actual 

739 values in the categories array. 

740 

741 There is no setter, use the other categorical methods and the normal item 

742 setter to change values in the categorical. 

743 

744 Returns 

745 ------- 

746 ndarray[int] 

747 A non-writable view of the `codes` array. 

748 """ 

749 v = self._codes.view() 

750 v.flags.writeable = False 

751 return v 

752 

753 def _set_categories(self, categories, fastpath: bool = False) -> None: 

754 """ 

755 Sets new categories inplace 

756 

757 Parameters 

758 ---------- 

759 fastpath : bool, default False 

760 Don't perform validation of the categories for uniqueness or nulls 

761 

762 Examples 

763 -------- 

764 >>> c = pd.Categorical(['a', 'b']) 

765 >>> c 

766 ['a', 'b'] 

767 Categories (2, object): ['a', 'b'] 

768 

769 >>> c._set_categories(pd.Index(['a', 'c'])) 

770 >>> c 

771 ['a', 'c'] 

772 Categories (2, object): ['a', 'c'] 

773 """ 

774 if fastpath: 

775 new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered) 

776 else: 

777 new_dtype = CategoricalDtype(categories, ordered=self.ordered) 

778 if ( 

779 not fastpath 

780 and self.dtype.categories is not None 

781 and len(new_dtype.categories) != len(self.dtype.categories) 

782 ): 

783 raise ValueError( 

784 "new categories need to have the same number of " 

785 "items as the old categories!" 

786 ) 

787 

788 super().__init__(self._ndarray, new_dtype) 

789 

790 def _set_dtype(self, dtype: CategoricalDtype) -> Categorical: 

791 """ 

792 Internal method for directly updating the CategoricalDtype 

793 

794 Parameters 

795 ---------- 

796 dtype : CategoricalDtype 

797 

798 Notes 

799 ----- 

800 We don't do any validation here. It's assumed that the dtype is 

801 a (valid) instance of `CategoricalDtype`. 

802 """ 

803 codes = recode_for_categories(self.codes, self.categories, dtype.categories) 

804 return type(self)(codes, dtype=dtype, fastpath=True) 

805 

806 def set_ordered(self, value: bool) -> Categorical: 

807 """ 

808 Set the ordered attribute to the boolean value. 

809 

810 Parameters 

811 ---------- 

812 value : bool 

813 Set whether this categorical is ordered (True) or not (False). 

814 """ 

815 new_dtype = CategoricalDtype(self.categories, ordered=value) 

816 cat = self.copy() 

817 NDArrayBacked.__init__(cat, cat._ndarray, new_dtype) 

818 return cat 

819 

820 def as_ordered(self) -> Categorical: 

821 """ 

822 Set the Categorical to be ordered. 

823 

824 Returns 

825 ------- 

826 Categorical 

827 Ordered Categorical. 

828 """ 

829 return self.set_ordered(True) 

830 

831 def as_unordered(self) -> Categorical: 

832 """ 

833 Set the Categorical to be unordered. 

834 

835 Returns 

836 ------- 

837 Categorical 

838 Unordered Categorical. 

839 """ 

840 return self.set_ordered(False) 

841 

842 def set_categories(self, new_categories, ordered=None, rename: bool = False): 

843 """ 

844 Set the categories to the specified new_categories. 

845 

846 `new_categories` can include new categories (which will result in 

847 unused categories) or remove old categories (which results in values 

848 set to NaN). If `rename==True`, the categories will simple be renamed 

849 (less or more items than in old categories will result in values set to 

850 NaN or in unused categories respectively). 

851 

852 This method can be used to perform more than one action of adding, 

853 removing, and reordering simultaneously and is therefore faster than 

854 performing the individual steps via the more specialised methods. 

855 

856 On the other hand this methods does not do checks (e.g., whether the 

857 old categories are included in the new categories on a reorder), which 

858 can result in surprising changes, for example when using special string 

859 dtypes, which does not considers a S1 string equal to a single char 

860 python string. 

861 

862 Parameters 

863 ---------- 

864 new_categories : Index-like 

865 The categories in new order. 

866 ordered : bool, default False 

867 Whether or not the categorical is treated as a ordered categorical. 

868 If not given, do not change the ordered information. 

869 rename : bool, default False 

870 Whether or not the new_categories should be considered as a rename 

871 of the old categories or as reordered categories. 

872 

873 Returns 

874 ------- 

875 Categorical with reordered categories. 

876 

877 Raises 

878 ------ 

879 ValueError 

880 If new_categories does not validate as categories 

881 

882 See Also 

883 -------- 

884 rename_categories : Rename categories. 

885 reorder_categories : Reorder categories. 

886 add_categories : Add new categories. 

887 remove_categories : Remove the specified categories. 

888 remove_unused_categories : Remove categories which are not used. 

889 """ 

890 

891 if ordered is None: 

892 ordered = self.dtype.ordered 

893 new_dtype = CategoricalDtype(new_categories, ordered=ordered) 

894 

895 cat = self.copy() 

896 if rename: 

897 if cat.dtype.categories is not None and len(new_dtype.categories) < len( 

898 cat.dtype.categories 

899 ): 

900 # remove all _codes which are larger and set to -1/NaN 

901 cat._codes[cat._codes >= len(new_dtype.categories)] = -1 

902 codes = cat._codes 

903 else: 

904 codes = recode_for_categories( 

905 cat.codes, cat.categories, new_dtype.categories 

906 ) 

907 NDArrayBacked.__init__(cat, codes, new_dtype) 

908 return cat 

909 

910 def rename_categories(self, new_categories) -> Categorical: 

911 """ 

912 Rename categories. 

913 

914 Parameters 

915 ---------- 

916 new_categories : list-like, dict-like or callable 

917 

918 New categories which will replace old categories. 

919 

920 * list-like: all items must be unique and the number of items in 

921 the new categories must match the existing number of categories. 

922 

923 * dict-like: specifies a mapping from 

924 old categories to new. Categories not contained in the mapping 

925 are passed through and extra categories in the mapping are 

926 ignored. 

927 

928 * callable : a callable that is called on all items in the old 

929 categories and whose return values comprise the new categories. 

930 

931 Returns 

932 ------- 

933 Categorical 

934 Categorical with renamed categories. 

935 

936 Raises 

937 ------ 

938 ValueError 

939 If new categories are list-like and do not have the same number of 

940 items than the current categories or do not validate as categories 

941 

942 See Also 

943 -------- 

944 reorder_categories : Reorder categories. 

945 add_categories : Add new categories. 

946 remove_categories : Remove the specified categories. 

947 remove_unused_categories : Remove categories which are not used. 

948 set_categories : Set the categories to the specified ones. 

949 

950 Examples 

951 -------- 

952 >>> c = pd.Categorical(['a', 'a', 'b']) 

953 >>> c.rename_categories([0, 1]) 

954 [0, 0, 1] 

955 Categories (2, int64): [0, 1] 

956 

957 For dict-like ``new_categories``, extra keys are ignored and 

958 categories not in the dictionary are passed through 

959 

960 >>> c.rename_categories({'a': 'A', 'c': 'C'}) 

961 ['A', 'A', 'b'] 

962 Categories (2, object): ['A', 'b'] 

963 

964 You may also provide a callable to create the new categories 

965 

966 >>> c.rename_categories(lambda x: x.upper()) 

967 ['A', 'A', 'B'] 

968 Categories (2, object): ['A', 'B'] 

969 """ 

970 

971 if is_dict_like(new_categories): 

972 new_categories = [ 

973 new_categories.get(item, item) for item in self.categories 

974 ] 

975 elif callable(new_categories): 

976 new_categories = [new_categories(item) for item in self.categories] 

977 

978 cat = self.copy() 

979 cat._set_categories(new_categories) 

980 return cat 

981 

def reorder_categories(self, new_categories, ordered=None):
    """
    Return a Categorical whose categories are arranged in a new order.

    ``new_categories`` has to be a permutation of the current categories:
    every existing category must be listed and nothing new may be added.

    Parameters
    ----------
    new_categories : Index-like
        The existing categories, listed in the desired order.
    ordered : bool, optional
        Whether the result is treated as an ordered categorical. When
        omitted, the ordered information is left unchanged.

    Returns
    -------
    Categorical
        Categorical with reordered categories.

    Raises
    ------
    ValueError
        If ``new_categories`` has a different number of items than the
        current categories, or leaves any current category out.

    See Also
    --------
    rename_categories : Rename categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.
    """
    current = self.categories
    same_length = len(current) == len(new_categories)
    # ``and`` short-circuits, so the set difference is only computed once the
    # lengths are known to agree.
    if not (same_length and current.difference(new_categories).empty):
        raise ValueError(
            "items in new_categories are not the same as in old categories"
        )
    return self.set_categories(new_categories, ordered=ordered)

1024 

def add_categories(self, new_categories) -> Categorical:
    """
    Return a Categorical with extra categories appended.

    The additions are placed after the existing categories (i.e. at the
    last/highest position) and no value uses them right after this call.

    Parameters
    ----------
    new_categories : category or list-like of category
        The new categories to be included.

    Returns
    -------
    Categorical
        Categorical with new categories added.

    Raises
    ------
    ValueError
        If any of the new categories is already a category, or if the
        combined categories do not validate as categories.

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    remove_categories : Remove the specified categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['c', 'b', 'c'])
    >>> c
    ['c', 'b', 'c']
    Categories (2, object): ['b', 'c']

    >>> c.add_categories(['d', 'a'])
    ['c', 'b', 'c']
    Categories (4, object): ['b', 'c', 'd', 'a']
    """
    # A lone scalar is treated as a one-element list.
    if not is_list_like(new_categories):
        new_categories = [new_categories]

    existing = self.dtype.categories
    clashes = set(new_categories) & set(existing)
    if clashes:
        raise ValueError(
            f"new categories must not include old categories: {clashes}"
        )

    combined = list(existing) + list(new_categories)
    if hasattr(new_categories, "dtype"):
        # Typed input: keep a dtype that can hold both old and new categories
        # instead of letting a plain list be re-inferred.
        from pandas import Series

        common = find_common_type([existing.dtype, new_categories.dtype])
        combined = Series(combined, dtype=common)

    new_dtype = CategoricalDtype(combined, self.ordered)
    result = self.copy()
    # More categories may need a wider integer type for the codes.
    widened = coerce_indexer_dtype(result._ndarray, new_dtype.categories)
    NDArrayBacked.__init__(result, widened, new_dtype)
    return result

1093 

def remove_categories(self, removals) -> Categorical:
    """
    Remove the specified categories.

    `removals` must be included in the old categories. Values which were in
    the removed categories will be set to NaN.

    Parameters
    ----------
    removals : category or list of categories
        The categories which should be removed.

    Returns
    -------
    Categorical
        Categorical with removed categories. For an ordered Categorical the
        remaining categories keep their existing relative order.

    Raises
    ------
    ValueError
        If the removals are not contained in the categories.

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_unused_categories : Remove categories which are not used.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd'])
    >>> c
    ['a', 'c', 'b', 'c', 'd']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c.remove_categories(['d', 'a'])
    [NaN, 'c', 'b', 'c', NaN]
    Categories (2, object): ['b', 'c']
    """
    from pandas import Index

    if not is_list_like(removals):
        removals = [removals]

    # Duplicates and missing values in the request are ignored.
    removals = Index(removals).unique().dropna()
    categories = self.dtype.categories

    not_included = removals.difference(categories)
    if len(not_included) != 0:
        not_included = set(not_included)
        raise ValueError(f"removals must all be in old categories: {not_included}")

    if self.ordered:
        # ``Index.difference`` sorts its result by default, which would
        # silently replace the user-defined category order (and therefore the
        # meaning of ``<``/``>``) with lexical order. Keep the existing order.
        new_categories = categories.difference(removals, sort=False)
    else:
        new_categories = categories.difference(removals)

    return self.set_categories(new_categories, ordered=self.ordered, rename=False)

1149 

def remove_unused_categories(self) -> Categorical:
    """
    Return a Categorical that only keeps categories which are in use.

    Returns
    -------
    Categorical
        Categorical with unused categories dropped.

    See Also
    --------
    rename_categories : Rename categories.
    reorder_categories : Reorder categories.
    add_categories : Add new categories.
    remove_categories : Remove the specified categories.
    set_categories : Set the categories to the specified ones.

    Examples
    --------
    >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd'])
    >>> c[2] = 'a'
    >>> c[4] = 'c'
    >>> c
    ['a', 'c', 'a', 'c', 'c']
    Categories (4, object): ['a', 'b', 'c', 'd']

    >>> c.remove_unused_categories()
    ['a', 'c', 'a', 'c', 'c']
    Categories (2, object): ['a', 'c']
    """
    # ``used_codes`` holds the sorted distinct codes; ``recoded`` maps each
    # element to its position within ``used_codes``.
    used_codes, recoded = np.unique(self._codes, return_inverse=True)

    starts_with_na = used_codes.size != 0 and used_codes[0] == -1
    if starts_with_na:
        # The NA sentinel (-1) sorts first. Drop it from the kept codes and
        # shift positions down so missing values map back to -1.
        used_codes = used_codes[1:]
        recoded = recoded - 1

    kept_categories = self.dtype.categories.take(used_codes)
    new_dtype = CategoricalDtype._from_fastpath(
        kept_categories, ordered=self.ordered
    )

    result = self.copy()
    NDArrayBacked.__init__(
        result, coerce_indexer_dtype(recoded, new_dtype.categories), new_dtype
    )
    return result

1198 

1199 # ------------------------------------------------------------------ 

1200 

def map(self, mapper):
    """
    Map categories using an input mapping or function.

    The mapping is applied to the categories. When the result is one-to-one
    a :class:`~pandas.Categorical` with the same order property as the
    original is returned; otherwise an :class:`~pandas.Index` is returned.
    NaN values are unaffected.

    With a `dict` or :class:`~pandas.Series`, any category missing from the
    mapper is mapped to `NaN`; in that case an :class:`~pandas.Index` is
    returned.

    Parameters
    ----------
    mapper : function, dict, or Series
        Mapping correspondence.

    Returns
    -------
    pandas.Categorical or pandas.Index
        Mapped categorical.

    See Also
    --------
    CategoricalIndex.map : Apply a mapping correspondence on a
        :class:`~pandas.CategoricalIndex`.
    Index.map : Apply a mapping correspondence on an
        :class:`~pandas.Index`.
    Series.map : Apply a mapping correspondence on a
        :class:`~pandas.Series`.
    Series.apply : Apply more complex functions on a
        :class:`~pandas.Series`.

    Examples
    --------
    >>> cat = pd.Categorical(['a', 'b', 'c'])
    >>> cat.map(lambda x: x.upper())
    ['A', 'B', 'C']
    Categories (3, object): ['A', 'B', 'C']
    >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'})
    ['first', 'second', 'third']
    Categories (3, object): ['first', 'second', 'third']

    A one-to-one mapping preserves the ordering of the categories:

    >>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True)
    >>> cat.map({'a': 3, 'b': 2, 'c': 1})
    [3, 2, 1]
    Categories (3, int64): [3 < 2 < 1]

    A mapping that is not one-to-one yields an :class:`~pandas.Index`:

    >>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'})
    Index(['first', 'second', 'first'], dtype='object')

    Unmapped categories become `NaN` and the result is an
    :class:`~pandas.Index`:

    >>> cat.map({'a': 'first', 'b': 'second'})
    Index(['first', 'second', nan], dtype='object')
    """
    mapped = self.categories.map(mapper)
    codes = self._codes
    try:
        result = self.from_codes(
            codes.copy(), categories=mapped, ordered=self.ordered
        )
    except ValueError:
        # The mapped values cannot serve as categories, so fall back to
        # materialising the values as an Index.
        # NA is stored as code -1, and ``take`` with -1 picks the LAST
        # element, so append a NaN for those positions to land on.
        if (codes == -1).any():
            mapped = mapped.insert(len(mapped), np.nan)
        result = np.take(mapped, codes)
    return result

1281 

# Rich comparison operators. Each dunder is built by passing the matching
# function from the ``operator`` module to the ``_cat_compare_op`` factory
# (defined outside this section).
__eq__ = _cat_compare_op(operator.eq)
__ne__ = _cat_compare_op(operator.ne)
__lt__ = _cat_compare_op(operator.lt)
__gt__ = _cat_compare_op(operator.gt)
__le__ = _cat_compare_op(operator.le)
__ge__ = _cat_compare_op(operator.ge)

1288 

1289 # ------------------------------------------------------------- 

1290 # Validators; ideally these can be de-duplicated 

1291 

1292 def _validate_setitem_value(self, value): 

1293 if not is_hashable(value): 

1294 # wrap scalars and hashable-listlikes in list 

1295 return self._validate_listlike(value) 

1296 else: 

1297 return self._validate_scalar(value) 

1298 

1299 def _validate_scalar(self, fill_value): 

1300 """ 

1301 Convert a user-facing fill_value to a representation to use with our 

1302 underlying ndarray, raising TypeError if this is not possible. 

1303 

1304 Parameters 

1305 ---------- 

1306 fill_value : object 

1307 

1308 Returns 

1309 ------- 

1310 fill_value : int 

1311 

1312 Raises 

1313 ------ 

1314 TypeError 

1315 """ 

1316 

1317 if is_valid_na_for_dtype(fill_value, self.categories.dtype): 

1318 fill_value = -1 

1319 elif fill_value in self.categories: 

1320 fill_value = self._unbox_scalar(fill_value) 

1321 else: 

1322 raise TypeError( 

1323 "Cannot setitem on a Categorical with a new " 

1324 f"category ({fill_value}), set the categories first" 

1325 ) from None 

1326 return fill_value 

1327 

1328 # ------------------------------------------------------------- 

1329 

1330 @ravel_compat 

1331 def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: 

1332 """ 

1333 The numpy array interface. 

1334 

1335 Returns 

1336 ------- 

1337 numpy.array 

1338 A numpy array of either the specified dtype or, 

1339 if dtype==None (default), the same dtype as 

1340 categorical.categories.dtype. 

1341 """ 

1342 ret = take_nd(self.categories._values, self._codes) 

1343 if dtype and not is_dtype_equal(dtype, self.categories.dtype): 

1344 return np.asarray(ret, dtype) 

1345 # When we're a Categorical[ExtensionArray], like Interval, 

1346 # we need to ensure __array__ gets all the way to an 

1347 # ndarray. 

1348 return np.asarray(ret) 

1349 

1350 def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): 

1351 # for binary ops, use our custom dunder methods 

1352 result = ops.maybe_dispatch_ufunc_to_dunder_op( 

1353 self, ufunc, method, *inputs, **kwargs 

1354 ) 

1355 if result is not NotImplemented: 

1356 return result 

1357 

1358 if "out" in kwargs: 

1359 # e.g. test_numpy_ufuncs_out 

1360 return arraylike.dispatch_ufunc_with_out( 

1361 self, ufunc, method, *inputs, **kwargs 

1362 ) 

1363 

1364 if method == "reduce": 

1365 # e.g. TestCategoricalAnalytics::test_min_max_ordered 

1366 result = arraylike.dispatch_reduction_ufunc( 

1367 self, ufunc, method, *inputs, **kwargs 

1368 ) 

1369 if result is not NotImplemented: 

1370 return result 

1371 

1372 # for all other cases, raise for now (similarly as what happens in 

1373 # Series.__array_prepare__) 

1374 raise TypeError( 

1375 f"Object with dtype {self.dtype} cannot perform " 

1376 f"the numpy op {ufunc.__name__}" 

1377 ) 

1378 

def __setstate__(self, state) -> None:
    """
    Restore pickled state, upgrading older dict layouts first.

    A state that is not a dict is handed to the parent class untouched.
    """
    if isinstance(state, dict):
        if "_dtype" not in state:
            # Dict layout that stores categories and ordered separately.
            state["_dtype"] = CategoricalDtype(
                state["_categories"], state["_ordered"]
            )

        stores_codes_only = "_codes" in state and "_ndarray" not in state
        if stores_codes_only:
            # backward compat, changed what is property vs attribute
            state["_ndarray"] = state.pop("_codes")

        super().__setstate__(state)
        return None

    return super().__setstate__(state)

1392 

@property
def nbytes(self) -> int:
    """Bytes held by the codes plus bytes held by the categories' values."""
    codes_bytes = self._codes.nbytes
    categories_bytes = self.dtype.categories.values.nbytes
    return codes_bytes + categories_bytes

1396 

def memory_usage(self, deep: bool = False) -> int:
    """
    Memory usage of my values.

    Parameters
    ----------
    deep : bool
        Introspect the data deeply, interrogate
        `object` dtypes for system-level memory consumption.

    Returns
    -------
    bytes used

    Notes
    -----
    Memory usage does not include memory consumed by elements that
    are not components of the array if deep=False.

    See Also
    --------
    numpy.ndarray.nbytes
    """
    # ``deep`` is only forwarded to the categories.
    categories_bytes = self.dtype.categories.memory_usage(deep=deep)
    return self._codes.nbytes + categories_bytes

1421 

def isna(self) -> np.ndarray:
    """
    Detect missing values.

    An element is missing when its code is -1.

    Returns
    -------
    np.ndarray[bool] of whether my values are null

    See Also
    --------
    isna : Top-level isna.
    isnull : Alias of isna.
    Categorical.notna : Boolean inverse of Categorical.isna.

    """
    na_code = -1
    return self._codes == na_code


isnull = isna

1442 

def notna(self) -> np.ndarray:
    """
    Inverse of isna.

    The result is the element-wise negation of :meth:`isna`.

    Returns
    -------
    np.ndarray[bool] of whether my values are not null

    See Also
    --------
    notna : Top-level notna.
    notnull : Alias of notna.
    Categorical.isna : Boolean inverse of Categorical.notna.

    """
    missing = self.isna()
    return ~missing


notnull = notna

1464 

def value_counts(self, dropna: bool = True) -> Series:
    """
    Return a Series containing counts of each category.

    Every category will have an entry, even those with a count of 0.

    Parameters
    ----------
    dropna : bool, default True
        Don't include counts of NaN.

    Returns
    -------
    counts : Series

    See Also
    --------
    Series.value_counts
    """
    from pandas import (
        CategoricalIndex,
        Series,
    )

    codes = self._codes
    n_categories = len(self.categories)
    present = codes >= 0
    no_missing = present.all()
    index_codes = np.arange(n_categories)

    if dropna or no_missing:
        observed = codes if no_missing else codes[present]
        counts = np.bincount(observed, minlength=n_categories)
    else:
        # Count missing values in one extra bin placed after the last
        # category, and label that bin with the NA code.
        counts = np.bincount(np.where(present, codes, n_categories))
        index_codes = np.append(index_codes, -1)

    index_codes = coerce_indexer_dtype(index_codes, self.dtype.categories)
    index_values = self._from_backing_data(index_codes)

    return Series(
        counts,
        index=CategoricalIndex(index_values),
        dtype="int64",
        name="count",
        copy=False,
    )

1506 

1507 # error: Argument 2 of "_empty" is incompatible with supertype 

1508 # "NDArrayBackedExtensionArray"; supertype defines the argument type as 

1509 # "ExtensionDtype" 

1510 @classmethod 

1511 def _empty( # type: ignore[override] 

1512 cls: type_t[Categorical], shape: Shape, dtype: CategoricalDtype 

1513 ) -> Categorical: 

1514 """ 

1515 Analogous to np.empty(shape, dtype=dtype) 

1516 

1517 Parameters 

1518 ---------- 

1519 shape : tuple[int] 

1520 dtype : CategoricalDtype 

1521 """ 

1522 arr = cls._from_sequence([], dtype=dtype) 

1523 

1524 # We have to use np.zeros instead of np.empty otherwise the resulting 

1525 # ndarray may contain codes not supported by this dtype, in which 

1526 # case repr(result) could segfault. 

1527 backing = np.zeros(shape, dtype=arr._ndarray.dtype) 

1528 

1529 return arr._from_backing_data(backing) 

1530 

1531 def _internal_get_values(self): 

1532 """ 

1533 Return the values. 

1534 

1535 For internal compatibility with pandas formatting. 

1536 

1537 Returns 

1538 ------- 

1539 np.ndarray or Index 

1540 A numpy array of the same dtype as categorical.categories.dtype or 

1541 Index if datetime / periods. 

1542 """ 

1543 # if we are a datetime and period index, return Index to keep metadata 

1544 if needs_i8_conversion(self.categories.dtype): 

1545 return self.categories.take(self._codes, fill_value=NaT) 

1546 elif is_integer_dtype(self.categories) and -1 in self._codes: 

1547 return self.categories.astype("object").take(self._codes, fill_value=np.nan) 

1548 return np.array(self) 

1549 

def check_for_ordered(self, op) -> None:
    """
    Assert that we are ordered.

    Raises a ``TypeError`` naming the operation ``op`` when this
    Categorical is not ordered.
    """
    if self.ordered:
        return
    raise TypeError(
        f"Categorical is not ordered for operation {op}\n"
        "you can use .as_ordered() to change the "
        "Categorical to an ordered one\n"
    )

1558 

def argsort(
    self, *, ascending: bool = True, kind: SortKind = "quicksort", **kwargs
):
    """
    Return the indices that would sort the Categorical.

    Missing values are sorted at the end.

    Parameters
    ----------
    ascending : bool, default True
        Whether the indices should result in an ascending
        or descending sort.
    kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional
        Sorting algorithm.
    **kwargs
        Passed through to :func:`numpy.argsort`.

    Returns
    -------
    np.ndarray[np.intp]

    See Also
    --------
    numpy.ndarray.argsort

    Notes
    -----
    While an ordering is applied to the category values, arg-sorting
    in this context refers more to organizing and grouping together
    based on matching category values. Thus, this function can be
    called on an unordered Categorical instance unlike the functions
    'Categorical.min' and 'Categorical.max'.

    Examples
    --------
    >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort()
    array([2, 0, 1, 3])

    >>> cat = pd.Categorical(['b', 'b', 'a', 'c'],
    ...                      categories=['c', 'b', 'a'],
    ...                      ordered=True)
    >>> cat.argsort()
    array([3, 0, 1, 2])

    Missing values are placed at the end

    >>> cat = pd.Categorical([2, None, 1])
    >>> cat.argsort()
    array([2, 0, 1])
    """
    # The sorting itself is done by the parent class; this override only
    # forwards its arguments.
    return super().argsort(ascending=ascending, kind=kind, **kwargs)

1611 

@overload
def sort_values(
    self,
    *,
    inplace: Literal[False] = ...,
    ascending: bool = ...,
    na_position: str = ...,
) -> Categorical:
    ...


@overload
def sort_values(
    self, *, inplace: Literal[True], ascending: bool = ..., na_position: str = ...
) -> None:
    ...


def sort_values(
    self,
    *,
    inplace: bool = False,
    ascending: bool = True,
    na_position: str = "last",
) -> Categorical | None:
    """
    Sort the Categorical by category value, returning a new Categorical by
    default.

    While an ordering is applied to the category values, sorting in this
    context refers more to organizing and grouping together based on
    matching category values. Thus, this function can be called on an
    unordered Categorical instance unlike the functions 'Categorical.min'
    and 'Categorical.max'.

    Parameters
    ----------
    inplace : bool, default False
        Do operation in place.
    ascending : bool, default True
        Order ascending. Passing False orders descending. The
        ordering parameter provides the method by which the
        category values are organized.
    na_position : {'first', 'last'} (optional, default='last')
        'first' puts NaNs at the beginning
        'last' puts NaNs at the end

    Returns
    -------
    Categorical or None
        The sorted Categorical, or None when ``inplace=True``.

    Raises
    ------
    ValueError
        If ``na_position`` is neither 'first' nor 'last'.

    See Also
    --------
    Series.sort_values

    Examples
    --------
    >>> c = pd.Categorical([1, 2, 2, 1, 5])
    >>> c.sort_values()
    [1, 1, 2, 2, 5]
    Categories (3, int64): [1, 2, 5]
    >>> c.sort_values(ascending=False)
    [5, 2, 2, 1, 1]
    Categories (3, int64): [1, 2, 5]

    'sort_values' behaviour with NaNs. Note that 'na_position'
    is independent of the 'ascending' parameter:

    >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5])
    >>> c.sort_values()
    [2, 2, 5, NaN, NaN]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(ascending=False)
    [5, 2, 2, NaN, NaN]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(na_position='first')
    [NaN, NaN, 2, 2, 5]
    Categories (2, int64): [2, 5]
    >>> c.sort_values(ascending=False, na_position='first')
    [NaN, NaN, 5, 2, 2]
    Categories (2, int64): [2, 5]
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    if na_position not in ("first", "last"):
        raise ValueError(f"invalid na_position: {na_position!r}")

    order = nargsort(self, ascending=ascending, na_position=na_position)
    reordered_codes = self._codes[order]

    if inplace:
        # Write into the existing codes buffer so the object is mutated.
        self._codes[:] = reordered_codes
        return None
    return self._from_backing_data(reordered_codes)

1712 

1713 def _rank( 

1714 self, 

1715 *, 

1716 axis: AxisInt = 0, 

1717 method: str = "average", 

1718 na_option: str = "keep", 

1719 ascending: bool = True, 

1720 pct: bool = False, 

1721 ): 

1722 """ 

1723 See Series.rank.__doc__. 

1724 """ 

1725 if axis != 0: 

1726 raise NotImplementedError 

1727 vff = self._values_for_rank() 

1728 return algorithms.rank( 

1729 vff, 

1730 axis=axis, 

1731 method=method, 

1732 na_option=na_option, 

1733 ascending=ascending, 

1734 pct=pct, 

1735 ) 

1736 

1737 def _values_for_rank(self): 

1738 """ 

1739 For correctly ranking ordered categorical data. See GH#15420 

1740 

1741 Ordered categorical data should be ranked on the basis of 

1742 codes with -1 translated to NaN. 

1743 

1744 Returns 

1745 ------- 

1746 numpy.array 

1747 

1748 """ 

1749 from pandas import Series 

1750 

1751 if self.ordered: 

1752 values = self.codes 

1753 mask = values == -1 

1754 if mask.any(): 

1755 values = values.astype("float64") 

1756 values[mask] = np.nan 

1757 elif is_any_real_numeric_dtype(self.categories): 

1758 values = np.array(self) 

1759 else: 

1760 # reorder the categories (so rank can use the float codes) 

1761 # instead of passing an object array to rank 

1762 values = np.array( 

1763 self.rename_categories( 

1764 Series(self.categories, copy=False).rank().values 

1765 ) 

1766 ) 

1767 return values 

1768 

1769 # ------------------------------------------------------------------ 

1770 # NDArrayBackedExtensionArray compat 

1771 

@property
def _codes(self) -> np.ndarray:
    """Return the backing ``_ndarray`` attribute under the name ``_codes``."""
    return self._ndarray

1775 

1776 def _box_func(self, i: int): 

1777 if i == -1: 

1778 return np.NaN 

1779 return self.categories[i] 

1780 

1781 def _unbox_scalar(self, key) -> int: 

1782 # searchsorted is very performance sensitive. By converting codes 

1783 # to same dtype as self.codes, we get much faster performance. 

1784 code = self.categories.get_loc(key) 

1785 code = self._ndarray.dtype.type(code) 

1786 return code 

1787 

1788 # ------------------------------------------------------------------ 

1789 

1790 def __iter__(self) -> Iterator: 

1791 """ 

1792 Returns an Iterator over the values of this Categorical. 

1793 """ 

1794 if self.ndim == 1: 

1795 return iter(self._internal_get_values().tolist()) 

1796 else: 

1797 return (self[n] for n in range(len(self))) 

1798 

1799 def __contains__(self, key) -> bool: 

1800 """ 

1801 Returns True if `key` is in this Categorical. 

1802 """ 

1803 # if key is a NaN, check if any NaN is in self. 

1804 if is_valid_na_for_dtype(key, self.categories.dtype): 

1805 return bool(self.isna().any()) 

1806 

1807 return contains(self, key, container=self._codes) 

1808 

1809 # ------------------------------------------------------------------ 

1810 # Rendering Methods 

1811 

def _formatter(self, boxed: bool = False):
    """
    Return the per-element formatter; this implementation always returns
    ``None``.

    ``boxed`` is accepted but not used here.
    """
    # Defer to CategoricalFormatter's formatter.
    return None

1815 

1816 def _tidy_repr(self, max_vals: int = 10, footer: bool = True) -> str: 

1817 """ 

1818 a short repr displaying only max_vals and an optional (but default 

1819 footer) 

1820 """ 

1821 num = max_vals // 2 

1822 head = self[:num]._get_repr(length=False, footer=False) 

1823 tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False) 

1824 

1825 result = f"{head[:-1]}, ..., {tail[1:]}" 

1826 if footer: 

1827 result = f"{result}\n{self._repr_footer()}" 

1828 

1829 return str(result) 

1830 

1831 def _repr_categories(self) -> list[str]: 

1832 """ 

1833 return the base repr for the categories 

1834 """ 

1835 max_categories = ( 

1836 10 

1837 if get_option("display.max_categories") == 0 

1838 else get_option("display.max_categories") 

1839 ) 

1840 from pandas.io.formats import format as fmt 

1841 

1842 format_array = partial( 

1843 fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC 

1844 ) 

1845 if len(self.categories) > max_categories: 

1846 num = max_categories // 2 

1847 head = format_array(self.categories[:num]) 

1848 tail = format_array(self.categories[-num:]) 

1849 category_strs = head + ["..."] + tail 

1850 else: 

1851 category_strs = format_array(self.categories) 

1852 

1853 # Strip all leading spaces, which format_array adds for columns... 

1854 category_strs = [x.strip() for x in category_strs] 

1855 return category_strs 

1856 

1857 def _repr_categories_info(self) -> str: 

1858 """ 

1859 Returns a string representation of the footer. 

1860 """ 

1861 category_strs = self._repr_categories() 

1862 dtype = str(self.categories.dtype) 

1863 levheader = f"Categories ({len(self.categories)}, {dtype}): " 

1864 width, height = get_terminal_size() 

1865 max_width = get_option("display.width") or width 

1866 if console.in_ipython_frontend(): 

1867 # 0 = no breaks 

1868 max_width = 0 

1869 levstring = "" 

1870 start = True 

1871 cur_col_len = len(levheader) # header 

1872 sep_len, sep = (3, " < ") if self.ordered else (2, ", ") 

1873 linesep = f"{sep.rstrip()}\n" # remove whitespace 

1874 for val in category_strs: 

1875 if max_width != 0 and cur_col_len + sep_len + len(val) > max_width: 

1876 levstring += linesep + (" " * (len(levheader) + 1)) 

1877 cur_col_len = len(levheader) + 1 # header + a whitespace 

1878 elif not start: 

1879 levstring += sep 

1880 cur_col_len += len(val) 

1881 levstring += val 

1882 start = False 

1883 # replace to simple save space by 

1884 return f"{levheader}[{levstring.replace(' < ... < ', ' ... ')}]" 

1885 

1886 def _repr_footer(self) -> str: 

1887 info = self._repr_categories_info() 

1888 return f"Length: {len(self)}\n{info}" 

1889 

1890 def _get_repr( 

1891 self, length: bool = True, na_rep: str = "NaN", footer: bool = True 

1892 ) -> str: 

1893 from pandas.io.formats import format as fmt 

1894 

1895 formatter = fmt.CategoricalFormatter( 

1896 self, length=length, na_rep=na_rep, footer=footer 

1897 ) 

1898 result = formatter.to_string() 

1899 return str(result) 

1900 

1901 def __repr__(self) -> str: 

1902 """ 

1903 String representation. 

1904 """ 

1905 _maxlen = 10 

1906 if len(self._codes) > _maxlen: 

1907 result = self._tidy_repr(_maxlen) 

1908 elif len(self._codes) > 0: 

1909 result = self._get_repr(length=len(self) > _maxlen) 

1910 else: 

1911 msg = self._get_repr(length=False, footer=True).replace("\n", ", ") 

1912 result = f"[], {msg}" 

1913 

1914 return result 

1915 

1916 # ------------------------------------------------------------------ 

1917 

1918 def _validate_listlike(self, value): 

1919 # NB: here we assume scalar-like tuples have already been excluded 

1920 value = extract_array(value, extract_numpy=True) 

1921 

1922 # require identical categories set 

1923 if isinstance(value, Categorical): 

1924 if not is_dtype_equal(self.dtype, value.dtype): 

1925 raise TypeError( 

1926 "Cannot set a Categorical with another, " 

1927 "without identical categories" 

1928 ) 

1929 # is_dtype_equal implies categories_match_up_to_permutation 

1930 value = self._encode_with_my_categories(value) 

1931 return value._codes 

1932 

1933 from pandas import Index 

1934 

1935 # tupleize_cols=False for e.g. test_fillna_iterable_category GH#41914 

1936 to_add = Index._with_infer(value, tupleize_cols=False).difference( 

1937 self.categories 

1938 ) 

1939 

1940 # no assignments of values not in categories, but it's always ok to set 

1941 # something to np.nan 

1942 if len(to_add) and not isna(to_add).all(): 

1943 raise TypeError( 

1944 "Cannot setitem on a Categorical with a new " 

1945 "category, set the categories first" 

1946 ) 

1947 

1948 codes = self.categories.get_indexer(value) 

1949 return codes.astype(self._ndarray.dtype, copy=False) 

1950 

def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]:
    """
    Build a mapping from each category to the positions where it occurs.

    *This is an internal function*

    Returns
    -------
    Dict[Hashable, np.ndarray[np.intp]]
        dict of categories -> indexers

    Examples
    --------
    >>> c = pd.Categorical(list('aabca'))
    >>> c
    ['a', 'a', 'b', 'c', 'a']
    Categories (3, object): ['a', 'b', 'c']
    >>> c.categories
    Index(['a', 'b', 'c'], dtype='object')
    >>> c.codes
    array([0, 0, 1, 2, 0], dtype=int8)
    >>> c._reverse_indexer()
    {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])}

    """
    cats = self.categories
    sorter, group_sizes = libalgos.groupsort_indexer(
        ensure_platform_int(self.codes), cats.size
    )
    # Consecutive cumulative sizes delimit one slice of ``sorter`` per
    # category.  The first slice starts at bounds[0], so the leading
    # group_sizes[0] entries are skipped (presumably the missing-value
    # group -- confirm against groupsort_indexer).
    bounds = ensure_int64(group_sizes).cumsum()
    return {
        category: sorter[lo:hi]
        for category, lo, hi in zip(cats, bounds, bounds[1:])
    }

1984 

1985 # ------------------------------------------------------------------ 

1986 # Reductions 

1987 

def min(self, *, skipna: bool = True, **kwargs):
    """
    The minimum value of the object.

    Only ordered `Categoricals` have a minimum!

    Parameters
    ----------
    skipna : bool, default True
        If True, missing entries (code ``-1``) are ignored as long as at
        least one non-missing entry exists.
    **kwargs
        Only checked by the numpy-compat validators; an ``axis`` entry,
        if present, is validated.

    Raises
    ------
    TypeError
        If the `Categorical` is not `ordered`.

    Returns
    -------
    min : the minimum of this `Categorical`, NA value if empty
    """
    nv.validate_minmax_axis(kwargs.get("axis", 0))
    nv.validate_min((), kwargs)
    self.check_for_ordered("min")

    codes = self._codes
    if len(codes) == 0:
        return self.dtype.na_value

    valid = codes != -1
    if valid.all():
        lowest = codes.min()
    elif skipna and valid.any():
        # some entries are missing: reduce over the valid codes only
        lowest = codes[valid].min()
    else:
        # either skipna=False with missing entries, or everything is missing
        return np.nan
    return self._wrap_reduction_result(None, lowest)

2019 

def max(self, *, skipna: bool = True, **kwargs):
    """
    The maximum value of the object.

    Only ordered `Categoricals` have a maximum!

    Parameters
    ----------
    skipna : bool, default True
        If True, missing entries (code ``-1``) are ignored as long as at
        least one non-missing entry exists.
    **kwargs
        Only checked by the numpy-compat validators; an ``axis`` entry,
        if present, is validated.

    Raises
    ------
    TypeError
        If the `Categorical` is not `ordered`.

    Returns
    -------
    max : the maximum of this `Categorical`, NA if array is empty
    """
    nv.validate_minmax_axis(kwargs.get("axis", 0))
    nv.validate_max((), kwargs)
    self.check_for_ordered("max")

    codes = self._codes
    if len(codes) == 0:
        return self.dtype.na_value

    valid = codes != -1
    if valid.all():
        highest = codes.max()
    elif skipna and valid.any():
        # some entries are missing: reduce over the valid codes only
        highest = codes[valid].max()
    else:
        # either skipna=False with missing entries, or everything is missing
        return np.nan
    return self._wrap_reduction_result(None, highest)

2051 

def _mode(self, dropna: bool = True) -> Categorical:
    """
    Compute the mode on the integer codes and wrap it as a Categorical.

    Parameters
    ----------
    dropna : bool, default True
        When True, the missing-value mask from ``self.isna()`` is passed
        to ``algorithms.mode``; otherwise no mask is passed.

    Returns
    -------
    Categorical
        Built from the resulting codes via ``_from_backing_data``.
    """
    codes = self._codes
    na_mask = self.isna() if dropna else None

    mode_codes = cast(np.ndarray, algorithms.mode(codes, mask=na_mask))
    # the backing data must keep the itemsize of the original codes
    assert mode_codes.dtype == codes.dtype
    return self._from_backing_data(mode_codes)

2063 

2064 # ------------------------------------------------------------------ 

2065 # ExtensionArray Interface 

2066 

def unique(self):
    """
    Return a ``Categorical`` holding each distinct value once.

    The work is delegated unchanged to the parent class implementation;
    this override exists to carry Categorical-specific documentation.

    .. versionchanged:: 1.3.0

        Previously, unused categories were dropped from the new categories.

    Returns
    -------
    Categorical

    See Also
    --------
    pandas.unique
    CategoricalIndex.unique
    Series.unique : Return unique values of Series object.

    Examples
    --------
    >>> pd.Categorical(list("baabc")).unique()
    ['b', 'a', 'c']
    Categories (3, object): ['a', 'b', 'c']
    >>> pd.Categorical(list("baab"), categories=list("abc"), ordered=True).unique()
    ['b', 'a']
    Categories (3, object): ['a' < 'b' < 'c']
    """
    # pylint: disable=useless-parent-delegation
    return super().unique()

2097 

def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
    """
    Return ``res_values`` unchanged after checking its dtype.

    The quantile result holds codes, so it must already have the same
    dtype (and therefore itemsize) as this array's backing ``_ndarray``.
    """
    backing_dtype = self._ndarray.dtype
    assert res_values.dtype == backing_dtype
    return res_values

2102 

def equals(self, other: object) -> bool:
    """
    Returns True if categorical arrays are equal.

    Two arrays are equal when ``other`` is a `Categorical`, its categories
    match this one's (as decided by
    ``_categories_match_up_to_permutation``), and, once ``other`` is
    re-encoded with this array's categories, the codes are identical.

    Parameters
    ----------
    other : `Categorical`

    Returns
    -------
    bool
    """
    if not isinstance(other, Categorical):
        return False
    if not self._categories_match_up_to_permutation(other):
        return False

    recoded = self._encode_with_my_categories(other)
    return np.array_equal(self._codes, recoded._codes)

2121 

@classmethod
def _concat_same_type(
    cls: type[CategoricalT], to_concat: Sequence[CategoricalT], axis: AxisInt = 0
) -> CategoricalT:
    """
    Concatenate several Categoricals into one.

    Parameters
    ----------
    to_concat : sequence of Categorical
        Arrays to combine; the first element determines the dimension
        that ``axis`` is validated against.
    axis : int, default 0
        Axis to concatenate along.  ``axis=1`` requires every input to
        be 2-dimensional.

    Returns
    -------
    Categorical

    Raises
    ------
    ValueError
        If ``axis`` is out of bounds for the first array, or if
        ``axis == 1`` and any input is not 2-dimensional.
    """
    from pandas.core.dtypes.concat import union_categoricals

    first = to_concat[0]
    if axis >= first.ndim:
        raise ValueError(
            f"axis {axis} is out of bounds for array of dimension {first.ndim}"
        )

    if axis == 1:
        # Flatten, concatenate then reshape
        if not all(x.ndim == 2 for x in to_concat):
            raise ValueError(
                "all arrays to concatenate along axis 1 must be 2-dimensional"
            )

        # pass correctly-shaped (1-D column) pieces to union_categoricals
        tc_flat = [obj[:, i] for obj in to_concat for i in range(obj.shape[1])]
        res_flat = cls._concat_same_type(tc_flat, axis=0)

        # columns were laid out one after another, hence Fortran order
        return res_flat.reshape(len(first), -1, order="F")

    return union_categoricals(to_concat)

2151 

2152 # ------------------------------------------------------------------ 

2153 

def _encode_with_my_categories(self, other: Categorical) -> Categorical:
    """
    Express ``other``'s values in terms of this Categorical's categories.

    Notes
    -----
    The caller is expected to have verified
    self._categories_match_up_to_permutation(other) beforehand; no check
    is made here.
    """
    # ``copy=False``: when the categories already agree, recoding can hand
    # back the existing codes without copying them.
    recoded = recode_for_categories(
        codes=other.codes,
        old_categories=other.categories,
        new_categories=self.categories,
        copy=False,
    )
    return self._from_backing_data(recoded)

2170 

def _categories_match_up_to_permutation(self, other: Categorical) -> bool:
    """
    Report whether ``other`` has a dtype equivalent to this one's.

    The decision is made by comparing ``hash(self.dtype)`` with
    ``hash(other.dtype)``; the method name suggests the dtype hash treats
    categories as matching regardless of their order where ordering is
    not significant (defined by the dtype's ``__hash__``, not here).

    Parameters
    ----------
    other : Categorical

    Returns
    -------
    bool
    """
    own_hash = hash(self.dtype)
    other_hash = hash(other.dtype)
    return own_hash == other_hash

2185 

def describe(self) -> DataFrame:
    """
    Describes this Categorical

    Returns
    -------
    description: `DataFrame`
        A dataframe with frequency and counts by category.  Its columns
        are ``"counts"`` and ``"freqs"`` and its index is named
        ``"categories"``.
    """
    from pandas import Index
    from pandas.core.reshape.concat import concat

    # missing values are counted too (dropna=False)
    counts = self.value_counts(dropna=False)
    freqs = counts / counts.sum()

    summary = concat([counts, freqs], axis=1)
    summary.columns = Index(["counts", "freqs"])
    summary.index.name = "categories"
    return summary

2206 

def isin(self, values) -> npt.NDArray[np.bool_]:
    """
    Check whether `values` are contained in Categorical.

    Return a boolean NumPy Array showing whether each element in
    the Categorical matches an element in the passed sequence of
    `values` exactly.

    Parameters
    ----------
    values : set or list-like
        The sequence of values to test. Passing in a single string will
        raise a ``TypeError``. Instead, turn a single string into a
        list of one element.

    Returns
    -------
    np.ndarray[bool]

    Raises
    ------
    TypeError
      * If `values` is not a set or list-like

    See Also
    --------
    pandas.Series.isin : Equivalent method on Series.

    Examples
    --------
    >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama',
    ...                'hippo'])
    >>> s.isin(['cow', 'lama'])
    array([ True,  True,  True, False,  True, False])

    Passing a single string as ``s.isin('lama')`` will raise an error. Use
    a list of one element instead:

    >>> s.isin(['lama'])
    array([ True, False,  True, False,  True, False])
    """
    if not is_list_like(values):
        values_type = type(values).__name__
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a [{values_type}]"
        )

    candidates = sanitize_array(values, None, None)
    is_null = np.asarray(isna(candidates))
    positions = self.categories.get_indexer(candidates)
    # Keep the codes of candidates found among the categories, plus the
    # -1 produced by null candidates so that missing entries can match.
    wanted = is_null | (positions >= 0)
    return algorithms.isin(self.codes, positions[wanted])

2259 

def _replace(self, *, to_replace, value, inplace: bool = False):
    """
    Replace category values, rebuilding categories and codes to match.

    Parameters
    ----------
    to_replace, value
        Forwarded to ``Series.replace`` on the categories.  Entries of
        ``to_replace`` whose replacement ``value`` is missing are first
        removed from the categories.
    inplace : bool, default False
        Modify this array instead of a copy.

    Returns
    -------
    Categorical or None
        The modified copy, or None when ``inplace`` is True.
    """
    from pandas import Index

    inplace = validate_bool_kwarg(inplace, "inplace")
    target = self if inplace else self.copy()

    na_replacement = isna(np.asarray(value))
    if na_replacement.any():
        # replacing with a missing value == dropping those categories
        doomed = np.asarray(to_replace)[na_replacement]
        doomed = target.categories[target.categories.isin(doomed)]
        trimmed = target.remove_categories(doomed)
        NDArrayBacked.__init__(target, trimmed.codes, trimmed.dtype)

    replaced = target.categories.to_series().replace(
        to_replace=to_replace, value=value
    )
    all_values = Index(replaced)

    # GH51016: maintain order of existing categories
    existing_pos = target.categories.get_indexer_for(all_values)
    own_pos = np.arange(len(replaced))
    order = np.where(existing_pos == -1, own_pos, existing_pos).argsort()

    new_categories = Index(replaced.take(order).drop_duplicates(keep="first"))
    new_codes = recode_for_categories(
        target._codes, all_values, new_categories, copy=False
    )
    new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered)
    NDArrayBacked.__init__(target, new_codes, new_dtype)

    if inplace:
        return None
    return target

2295 

2296 # ------------------------------------------------------------------------ 

2297 # String methods interface 

def _str_map(
    self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True
):
    """
    Apply ``f`` to the categories and expand the result through the codes.

    ``f`` is evaluated once per category rather than once per element;
    the per-element result is then obtained by ``take``-ing with the
    codes, using ``na_value`` as the fill value.  The return type is the
    same as for the object-dtype implementation.  ``convert`` is accepted
    but not used here.
    """
    from pandas.core.arrays import PandasArray

    per_category = PandasArray(self.categories.to_numpy())._str_map(
        f, na_value, dtype
    )
    return take_nd(per_category, self.codes, fill_value=na_value)

2310 

def _str_get_dummies(self, sep: str = "|"):
    """
    Compute string dummies by falling back to the object-dtype path.

    ``sep`` may not appear in the categories, so no category-level
    shortcut is attempted: the values are cast to ``str`` and handed to
    ``PandasArray._str_get_dummies``.
    """
    from pandas.core.arrays import PandasArray

    as_strings = self.astype(str)
    return PandasArray(as_strings)._str_get_dummies(sep)

2316 

2317 

2318# The Series.cat accessor 

2319 

2320 

@delegate_names(
    delegate=Categorical, accessors=["categories", "ordered"], typ="property"
)
@delegate_names(
    delegate=Categorical,
    accessors=[
        "rename_categories",
        "reorder_categories",
        "add_categories",
        "remove_categories",
        "remove_unused_categories",
        "set_categories",
        "as_ordered",
        "as_unordered",
    ],
    typ="method",
)
class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
    """
    Accessor object for categorical properties of the Series values.

    Property and method access is forwarded to the underlying
    ``Categorical``; non-None method results are wrapped back into a
    Series that keeps the original index and name.

    Parameters
    ----------
    data : Series or CategoricalIndex

    Examples
    --------
    >>> s = pd.Series(list("abbccc")).astype("category")
    >>> s
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.categories
    Index(['a', 'b', 'c'], dtype='object')

    >>> s.cat.rename_categories(list("cba"))
    0    c
    1    b
    2    b
    3    a
    4    a
    5    a
    dtype: category
    Categories (3, object): ['c', 'b', 'a']

    >>> s.cat.reorder_categories(list("cba"))
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['c', 'b', 'a']

    >>> s.cat.add_categories(["d", "e"])
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (5, object): ['a', 'b', 'c', 'd', 'e']

    >>> s.cat.remove_categories(["a", "c"])
    0    NaN
    1      b
    2      b
    3    NaN
    4    NaN
    5    NaN
    dtype: category
    Categories (1, object): ['b']

    >>> s1 = s.cat.add_categories(["d", "e"])
    >>> s1.cat.remove_unused_categories()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']

    >>> s.cat.set_categories(list("abcde"))
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (5, object): ['a', 'b', 'c', 'd', 'e']

    >>> s.cat.as_ordered()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a' < 'b' < 'c']

    >>> s.cat.as_unordered()
    0    a
    1    b
    2    b
    3    c
    4    c
    5    c
    dtype: category
    Categories (3, object): ['a', 'b', 'c']
    """

    def __init__(self, data) -> None:
        self._validate(data)
        # keep the backing values plus what is needed to rebuild a Series
        self._parent = data.values
        self._index = data.index
        self._name = data.name
        self._freeze()

    @staticmethod
    def _validate(data):
        """Raise AttributeError unless ``data`` has a categorical dtype."""
        if not is_categorical_dtype(data.dtype):
            raise AttributeError("Can only use .cat accessor with a 'category' dtype")

    def _delegate_property_get(self, name):
        """Read property ``name`` from the underlying values."""
        return getattr(self._parent, name)

    def _delegate_property_set(self, name, new_values):
        """Assign ``new_values`` to property ``name`` on the underlying values."""
        return setattr(self._parent, name, new_values)

    @property
    def codes(self) -> Series:
        """
        Return Series of codes as well as the index.
        """
        from pandas import Series

        return Series(self._parent.codes, index=self._index)

    def _delegate_method(self, name, *args, **kwargs):
        """
        Call method ``name`` on the underlying values.

        A non-None result is returned as a Series with the original index
        and name; a None result is passed through as None.
        """
        from pandas import Series

        outcome = getattr(self._parent, name)(*args, **kwargs)
        if outcome is None:
            return None
        return Series(outcome, index=self._index, name=self._name)

2478 

2479 

2480# utility routines 

2481 

2482 

def _get_codes_for_values(values, categories: Index) -> np.ndarray:
    """
    Translate ``values`` into integer codes relative to ``categories``.

    Inputs with more than one dimension are flattened, encoded, and the
    codes reshaped back to ``values.shape``.

    If `values` is known to be a Categorical, use recode_for_categories instead.
    """
    if values.ndim > 1:
        flat_codes = _get_codes_for_values(values.ravel(), categories)
        return flat_codes.reshape(values.shape)

    positions = categories.get_indexer_for(values)
    return coerce_indexer_dtype(positions, categories)

2496 

2497 

def recode_for_categories(
    codes: np.ndarray, old_categories, new_categories, copy: bool = True
) -> np.ndarray:
    """
    Convert a set of codes to a new set of categories

    Parameters
    ----------
    codes : np.ndarray
    old_categories, new_categories : Index
    copy: bool, default True
        Whether to copy if the codes are unchanged.

    Returns
    -------
    new_codes : np.ndarray
        ``codes`` itself (or a copy of it) when no recoding is needed;
        otherwise codes into ``new_categories``, with ``-1`` for entries
        that have no counterpart there.

    Examples
    --------
    >>> old_cat = pd.Index(['b', 'a', 'c'])
    >>> new_cat = pd.Index(['a', 'b'])
    >>> codes = np.array([0, 1, 1, 2])
    >>> recode_for_categories(codes, old_cat, new_cat)
    array([ 1,  0,  0, -1], dtype=int8)
    """
    # No old categories means every code is already null; identical
    # categories mean the codes are already right.  Either way the codes
    # are returned untouched (modulo ``copy``).
    nothing_to_do = len(old_categories) == 0 or new_categories.equals(
        old_categories
    )
    if nothing_to_do:
        return codes.copy() if copy else codes

    # position of every old category inside the new categories (-1 if gone)
    old_to_new = coerce_indexer_dtype(
        new_categories.get_indexer(old_categories), new_categories
    )
    return take_nd(old_to_new, codes, fill_value=-1)

2539 

2540 

def factorize_from_iterable(values) -> tuple[np.ndarray, Index]:
    """
    Factorize an input `values` into `categories` and `codes`. Preserves
    categorical dtype in `categories`.

    Parameters
    ----------
    values : list-like

    Returns
    -------
    codes : ndarray
    categories : Index
        If `values` has a categorical dtype, then `categories` is
        a CategoricalIndex keeping the categories and order of `values`.

    Raises
    ------
    TypeError
        If `values` is not list-like.
    """
    from pandas import CategoricalIndex

    if not is_list_like(values):
        raise TypeError("Input must be list-like")

    categories: Index
    if not is_categorical_dtype(values):
        # The value of ordered is irrelevant since we don't use cat as such,
        # but only the resulting categories, the order of which is independent
        # from ordered. Set ordered to False as default. See GH #15457
        cat = Categorical(values, ordered=False)
        return cat.codes, cat.categories

    values = extract_array(values)
    # Build a Categorical with the same dtype as ``values`` whose codes
    # are simply 0, 1, ..., n_categories - 1, i.e. each category once.
    n_categories = len(values.categories)
    cat_codes = np.arange(n_categories, dtype=values.codes.dtype)
    cat = Categorical.from_codes(cat_codes, dtype=values.dtype)

    categories = CategoricalIndex(cat)
    return values.codes, categories

2580 

2581 

def factorize_from_iterables(iterables) -> tuple[list[np.ndarray], list[Index]]:
    """
    A higher-level wrapper over `factorize_from_iterable`.

    Parameters
    ----------
    iterables : list-like of list-likes
        Any iterable is accepted, including generators and other
        iterators that do not support ``len()``.

    Returns
    -------
    codes : list of ndarrays
    categories : list of Indexes

    Notes
    -----
    See `factorize_from_iterable` for more info.
    """
    # Materialise first: this avoids calling len() on the input (which
    # fails for generators) and lets the empty case be detected reliably.
    pairs = [factorize_from_iterable(it) for it in iterables]
    if not pairs:
        # For consistency, it should return two empty lists.
        return [], []

    codes, categories = zip(*pairs)
    return list(codes), list(categories)