Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/groupby/categorical.py: 20%

1from __future__ import annotations

3import numpy as np

5from pandas.core.algorithms import unique1d

6from pandas.core.arrays.categorical import (

7 Categorical,

8 CategoricalDtype,

9 recode_for_categories,

10)

def recode_for_groupby(
    c: Categorical, sort: bool, observed: bool
) -> tuple[Categorical, Categorical | None]:
    """
    Code the categories to ensure we can groupby for categoricals.

    If observed=True, we return a new Categorical with the observed
    categories only.

    If sort=False, return a copy of self, coded with categories in the
    order of appearance in the data, followed by any categories not
    appearing in the data. If sort=True, return self.

    This method is needed solely to ensure the categorical index of the
    GroupBy result has categories in the order of appearance in the data
    (GH-8868).

    Parameters
    ----------
    c : Categorical
    sort : bool
        The value of the sort parameter groupby was called with.
    observed : bool
        Account only for the observed values

    Returns
    -------
    Categorical
        If observed=True, a new Categorical whose categories are only
        those present in the data: ordered by their original code when
        sort=True, otherwise in order of appearance.
        If observed=False and sort=True, ``c`` itself.
        If observed=False and sort=False, the new categories are set to
        the order of appearance in codes, followed by any unrepresented
        categories in the original order.
    Categorical or None
        If we are observed, return the original categorical, otherwise None
    """
    # we only care about observed values
    if observed:
        # In cases with c.ordered, this is equivalent to
        #  return c.remove_unused_categories(), c

        unique_codes = unique1d(c.codes)

        # -1 is the code used for missing values; it is not a category
        take_codes = unique_codes[unique_codes != -1]
        if sort:
            take_codes = np.sort(take_codes)

        # we recode according to the uniques
        categories = c.categories.take(take_codes)
        codes = recode_for_categories(c.codes, c.categories, categories)

        # return a new categorical that maps our new codes
        # and categories
        dtype = CategoricalDtype(categories, ordered=c.ordered)
        return Categorical(codes, dtype=dtype, fastpath=True), c

    # Already sorted according to c.categories; all is fine
    if sort:
        return c, None

    # From here on sort is necessarily False, so the codes are never sorted:
    # sort=False should order groups in as-encountered order (GH-8868)

    # xref GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories
    all_codes = np.arange(c.categories.nunique())
    # GH 38140: exclude nan from indexer for categories
    unique_notnan_codes = unique1d(c.codes[c.codes != -1])
    if len(all_codes) > len(unique_notnan_codes):
        # GH 13179: All categories need to be present, even if missing from the data
        missing_codes = np.setdiff1d(all_codes, unique_notnan_codes, assume_unique=True)
        take_codes = np.concatenate((unique_notnan_codes, missing_codes))
    else:
        take_codes = unique_notnan_codes

    return Categorical(c, c.unique().categories.take(take_codes)), None