1from __future__ import annotations
2
3import numpy as np
4
5from pandas.core.algorithms import unique1d
6from pandas.core.arrays.categorical import (
7 Categorical,
8 CategoricalDtype,
9 recode_for_categories,
10)
11
12
def recode_for_groupby(
    c: Categorical, sort: bool, observed: bool
) -> tuple[Categorical, Categorical | None]:
    """
    Code the categories to ensure we can groupby for categoricals.

    If observed=True, we return a new Categorical with the observed
    categories only.

    If sort=False, return a copy of self, coded with categories in the
    order their codes first appear in the data, followed by any categories
    not appearing in the data. If sort=True, return self.

    This method is needed solely to ensure the categorical index of the
    GroupBy result has categories in the order of appearance in the data
    (GH-8868).

    Parameters
    ----------
    c : Categorical
    sort : bool
        The value of the sort parameter groupby was called with.
    observed : bool
        Account only for the observed values

    Returns
    -------
    Categorical
        If observed=True, a new Categorical whose categories are only the
        ones present in the data: in ascending code order when sort=True,
        otherwise in order of appearance in codes.
        If observed=False and sort=True, ``c`` itself, unchanged.
        If observed=False and sort=False, the new categories are set to
        the order of appearance in codes, followed by any unrepresented
        categories in the original order.
    Categorical or None
        If we are observed, return the original categorical, otherwise None
    """
    # we only care about observed values
    if observed:
        # NOTE(review): with sort=True this looks equivalent to
        #  return c.remove_unused_categories(), c
        # -- confirm before relying on it.

        unique_codes = unique1d(c.codes)

        # -1 is the missing-value sentinel; it never maps to a category
        take_codes = unique_codes[unique_codes != -1]
        if sort:
            take_codes = np.sort(take_codes)

        # we recode according to the uniques
        categories = c.categories.take(take_codes)
        codes = recode_for_categories(c.codes, c.categories, categories)

        # return a new categorical that maps our new codes
        # and categories
        dtype = CategoricalDtype(categories, ordered=c.ordered)
        return Categorical(codes, dtype=dtype, fastpath=True), c

    # Already sorted according to c.categories; all is fine
    if sort:
        return c, None

    # From here on sort is always False: groups must come out in
    # as-encountered order (GH-8868), so the codes are never sorted.

    # xref GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories
    all_codes = np.arange(c.categories.nunique())
    # GH 38140: exclude nan from indexer for categories
    unique_notnan_codes = unique1d(c.codes[c.codes != -1])
    if len(all_codes) > len(unique_notnan_codes):
        # GH 13179: All categories need to be present, even if missing from the data
        missing_codes = np.setdiff1d(all_codes, unique_notnan_codes, assume_unique=True)
        take_codes = np.concatenate((unique_notnan_codes, missing_codes))
    else:
        take_codes = unique_notnan_codes

    return Categorical(c, c.unique().categories.take(take_codes)), None