Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/pandas/core/groupby/categorical.py: 20%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

25 statements  

1from __future__ import annotations 

2 

3import numpy as np 

4 

5from pandas.core.algorithms import unique1d 

6from pandas.core.arrays.categorical import ( 

7 Categorical, 

8 CategoricalDtype, 

9 recode_for_categories, 

10) 

11 

12 

13def recode_for_groupby( 

14 c: Categorical, sort: bool, observed: bool 

15) -> tuple[Categorical, Categorical | None]: 

16 """ 

17 Code the categories to ensure we can groupby for categoricals. 

18 

19 If observed=True, we return a new Categorical with the observed 

20 categories only. 

21 

22 If sort=False, return a copy of self, coded with categories as 

23 returned by .unique(), followed by any categories not appearing in 

24 the data. If sort=True, return self. 

25 

26 This method is needed solely to ensure the categorical index of the 

27 GroupBy result has categories in the order of appearance in the data 

28 (GH-8868). 

29 

30 Parameters 

31 ---------- 

32 c : Categorical 

33 sort : bool 

34 The value of the sort parameter groupby was called with. 

35 observed : bool 

36 Account only for the observed values 

37 

38 Returns 

39 ------- 

40 Categorical 

41 If sort=False, the new categories are set to the order of 

42 appearance in codes (unless ordered=True, in which case the 

43 original order is preserved), followed by any unrepresented 

44 categories in the original order. 

45 Categorical or None 

46 If we are observed, return the original categorical, otherwise None 

47 """ 

48 # we only care about observed values 

49 if observed: 

50 # In cases with c.ordered, this is equivalent to 

51 # return c.remove_unused_categories(), c 

52 

53 unique_codes = unique1d(c.codes) 

54 

55 take_codes = unique_codes[unique_codes != -1] 

56 if sort: 

57 take_codes = np.sort(take_codes) 

58 

59 # we recode according to the uniques 

60 categories = c.categories.take(take_codes) 

61 codes = recode_for_categories(c.codes, c.categories, categories) 

62 

63 # return a new categorical that maps our new codes 

64 # and categories 

65 dtype = CategoricalDtype(categories, ordered=c.ordered) 

66 return Categorical(codes, dtype=dtype, fastpath=True), c 

67 

68 # Already sorted according to c.categories; all is fine 

69 if sort: 

70 return c, None 

71 

72 # sort=False should order groups in as-encountered order (GH-8868) 

73 

74 # xref GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories 

75 all_codes = np.arange(c.categories.nunique()) 

76 # GH 38140: exclude nan from indexer for categories 

77 unique_notnan_codes = unique1d(c.codes[c.codes != -1]) 

78 if sort: 

79 unique_notnan_codes = np.sort(unique_notnan_codes) 

80 if len(all_codes) > len(unique_notnan_codes): 

81 # GH 13179: All categories need to be present, even if missing from the data 

82 missing_codes = np.setdiff1d(all_codes, unique_notnan_codes, assume_unique=True) 

83 take_codes = np.concatenate((unique_notnan_codes, missing_codes)) 

84 else: 

85 take_codes = unique_notnan_codes 

86 

87 return Categorical(c, c.unique().categories.take(take_codes)), None