1"""
2Provide user facing operators for doing the split part of the
3split-apply-combine paradigm.
4"""
5from __future__ import annotations
6
7from typing import (
8 TYPE_CHECKING,
9 Hashable,
10 Iterator,
11 final,
12)
13import warnings
14
15import numpy as np
16
17from pandas._config import using_copy_on_write
18
19from pandas._typing import (
20 ArrayLike,
21 Axis,
22 NDFrameT,
23 npt,
24)
25from pandas.errors import InvalidIndexError
26from pandas.util._decorators import cache_readonly
27from pandas.util._exceptions import find_stack_level
28
29from pandas.core.dtypes.common import (
30 is_categorical_dtype,
31 is_list_like,
32 is_scalar,
33)
34
35from pandas.core import algorithms
36from pandas.core.arrays import (
37 Categorical,
38 ExtensionArray,
39)
40import pandas.core.common as com
41from pandas.core.frame import DataFrame
42from pandas.core.groupby import ops
43from pandas.core.groupby.categorical import recode_for_groupby
44from pandas.core.indexes.api import (
45 CategoricalIndex,
46 Index,
47 MultiIndex,
48)
49from pandas.core.series import Series
50
51from pandas.io.formats.printing import pprint_thing
52
53if TYPE_CHECKING:
54 from pandas.core.generic import NDFrame
55
56
class Grouper:
    """
    A Grouper allows the user to specify a groupby instruction for an object.

    This specification will select a column via the key parameter, or if the
    level and/or axis parameters are given, a level of the index of the target
    object.

    If `axis` and/or `level` are passed as keywords to both `Grouper` and
    `groupby`, the values passed to `Grouper` take precedence.

    Parameters
    ----------
    key : str, defaults to None
        Groupby key, which selects the grouping column of the target.
    level : name/number, defaults to None
        The level for the target index.
    freq : str / frequency object, defaults to None
        This will groupby the specified frequency if the target selection
        (via key or level) is a datetime-like object. For full specification
        of available frequencies, please see `here
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_.
    axis : str, int, defaults to 0
        Number/name of the axis.
    sort : bool, default False
        Whether to sort the resulting labels.
    closed : {'left' or 'right'}
        Closed end of interval. Only when `freq` parameter is passed.
    label : {'left' or 'right'}
        Interval boundary to use for labeling.
        Only when `freq` parameter is passed.
    convention : {'start', 'end', 'e', 's'}
        If grouper is PeriodIndex and `freq` parameter is passed.

    origin : Timestamp or str, default 'start_day'
        The timestamp on which to adjust the grouping. The timezone of origin must
        match the timezone of the index.
        If string, must be one of the following:

        - 'epoch': `origin` is 1970-01-01
        - 'start': `origin` is the first value of the timeseries
        - 'start_day': `origin` is the first day at midnight of the timeseries

        .. versionadded:: 1.1.0

        - 'end': `origin` is the last value of the timeseries
        - 'end_day': `origin` is the ceiling midnight of the last day

        .. versionadded:: 1.3.0

    offset : Timedelta or str, default is None
        An offset timedelta added to the origin.

        .. versionadded:: 1.1.0

    dropna : bool, default True
        If True, and if group keys contain NA values, NA values together with
        row/column will be dropped. If False, NA values will also be treated as
        the key in groups.

        .. versionadded:: 1.2.0

    Returns
    -------
    A specification for a groupby instruction

    Examples
    --------
    Syntactic sugar for ``df.groupby('A')``

    >>> df = pd.DataFrame(
    ...     {
    ...         "Animal": ["Falcon", "Parrot", "Falcon", "Falcon", "Parrot"],
    ...         "Speed": [100, 5, 200, 300, 15],
    ...     }
    ... )
    >>> df
       Animal  Speed
    0  Falcon    100
    1  Parrot      5
    2  Falcon    200
    3  Falcon    300
    4  Parrot     15
    >>> df.groupby(pd.Grouper(key="Animal")).mean()
            Speed
    Animal
    Falcon  200.0
    Parrot   10.0

    Specify a resample operation on the column 'Publish date'

    >>> df = pd.DataFrame(
    ...    {
    ...        "Publish date": [
    ...             pd.Timestamp("2000-01-02"),
    ...             pd.Timestamp("2000-01-02"),
    ...             pd.Timestamp("2000-01-09"),
    ...             pd.Timestamp("2000-01-16")
    ...         ],
    ...         "ID": [0, 1, 2, 3],
    ...         "Price": [10, 20, 30, 40]
    ...     }
    ... )
    >>> df
      Publish date  ID  Price
    0   2000-01-02   0     10
    1   2000-01-02   1     20
    2   2000-01-09   2     30
    3   2000-01-16   3     40
    >>> df.groupby(pd.Grouper(key="Publish date", freq="1W")).mean()
                   ID  Price
    Publish date
    2000-01-02    0.5   15.0
    2000-01-09    2.0   30.0
    2000-01-16    3.0   40.0

    If you want to adjust the start of the bins based on a fixed timestamp:

    >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00'
    >>> rng = pd.date_range(start, end, freq='7min')
    >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
    >>> ts
    2000-10-01 23:30:00     0
    2000-10-01 23:37:00     3
    2000-10-01 23:44:00     6
    2000-10-01 23:51:00     9
    2000-10-01 23:58:00    12
    2000-10-02 00:05:00    15
    2000-10-02 00:12:00    18
    2000-10-02 00:19:00    21
    2000-10-02 00:26:00    24
    Freq: 7T, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min')).sum()
    2000-10-01 23:14:00     0
    2000-10-01 23:31:00     9
    2000-10-01 23:48:00    21
    2000-10-02 00:05:00    54
    2000-10-02 00:22:00    24
    Freq: 17T, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min', origin='epoch')).sum()
    2000-10-01 23:18:00     0
    2000-10-01 23:35:00    18
    2000-10-01 23:52:00    27
    2000-10-02 00:09:00    39
    2000-10-02 00:26:00    24
    Freq: 17T, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum()
    2000-10-01 23:24:00     3
    2000-10-01 23:41:00    15
    2000-10-01 23:58:00    45
    2000-10-02 00:15:00    45
    Freq: 17T, dtype: int64

    If you want to adjust the start of the bins with an `offset` Timedelta, the two
    following lines are equivalent:

    >>> ts.groupby(pd.Grouper(freq='17min', origin='start')).sum()
    2000-10-01 23:30:00     9
    2000-10-01 23:47:00    21
    2000-10-02 00:04:00    54
    2000-10-02 00:21:00    24
    Freq: 17T, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min', offset='23h30min')).sum()
    2000-10-01 23:30:00     9
    2000-10-01 23:47:00    21
    2000-10-02 00:04:00    54
    2000-10-02 00:21:00    24
    Freq: 17T, dtype: int64

    To replace the use of the deprecated `base` argument, you can now use `offset`,
    in this example it is equivalent to have `base=2`:

    >>> ts.groupby(pd.Grouper(freq='17min', offset='2min')).sum()
    2000-10-01 23:16:00     0
    2000-10-01 23:33:00     9
    2000-10-01 23:50:00    36
    2000-10-02 00:07:00    39
    2000-10-02 00:24:00    24
    Freq: 17T, dtype: int64
    """

    sort: bool
    dropna: bool
    _gpr_index: Index | None
    _grouper: Index | None

    # Attributes shown in __repr__; attributes whose value is None are omitted.
    _attributes: tuple[str, ...] = ("key", "level", "freq", "axis", "sort", "dropna")

    def __new__(cls, *args, **kwargs):
        # When a frequency is requested, construction is dispatched to the
        # resample-specific TimeGrouper subclass.  The import is deferred to
        # avoid a circular import with pandas.core.resample.
        if kwargs.get("freq") is not None:
            from pandas.core.resample import TimeGrouper

            cls = TimeGrouper
        return super().__new__(cls)

    def __init__(
        self,
        key=None,
        level=None,
        freq=None,
        axis: Axis = 0,
        sort: bool = False,
        dropna: bool = True,
    ) -> None:
        self.key = key
        self.level = level
        self.freq = freq
        self.axis = axis
        self.sort = sort
        self.dropna = dropna

        # Internal state populated by _get_grouper / _set_grouper.  The
        # *_deprecated attributes back the deprecated public properties
        # (.grouper, .indexer, .obj, .groups) defined further down.
        self._grouper_deprecated = None
        self._indexer_deprecated = None
        self._obj_deprecated = None
        self._gpr_index = None
        self.binner = None
        self._grouper = None
        self._indexer = None

    def _get_grouper(
        self, obj: NDFrameT, validate: bool = True
    ) -> tuple[ops.BaseGrouper, NDFrameT]:
        """
        Parameters
        ----------
        obj : Series or DataFrame
        validate : bool, default True
            if True, validate the grouper

        Returns
        -------
        a tuple of grouper, obj (possibly sorted)
        """
        # First resolve our own key/level/sort against obj, then delegate to
        # the module-level get_grouper to build the BaseGrouper.
        obj, _, _ = self._set_grouper(obj)
        grouper, _, obj = get_grouper(
            obj,
            [self.key],
            axis=self.axis,
            level=self.level,
            sort=self.sort,
            validate=validate,
            dropna=self.dropna,
        )
        # Without setting this, subsequent lookups to .groups raise
        # error: Incompatible types in assignment (expression has type "BaseGrouper",
        # variable has type "None")
        self._grouper_deprecated = grouper  # type: ignore[assignment]

        return grouper, obj

    @final
    def _set_grouper(
        self, obj: NDFrame, sort: bool = False, *, gpr_index: Index | None = None
    ):
        """
        given an object and the specifications, setup the internal grouper
        for this particular specification

        Parameters
        ----------
        obj : Series or DataFrame
        sort : bool, default False
            whether the resulting grouper should be sorted
        gpr_index : Index or None, default None

        Returns
        -------
        NDFrame
        Index
        np.ndarray[np.intp] | None
        """
        assert obj is not None

        indexer = None

        if self.key is not None and self.level is not None:
            raise ValueError("The Grouper cannot specify both a key and a level!")

        # Keep self._grouper value before overriding
        if self._grouper is None:
            # TODO: What are we assuming about subsequent calls?
            self._grouper = gpr_index
            self._indexer = self._indexer_deprecated

        # the key must be a valid info item
        if self.key is not None:
            key = self.key
            # The 'on' is already defined
            if getattr(gpr_index, "name", None) == key and isinstance(obj, Series):
                # Sometimes self._grouper will have been resorted while
                # obj has not. In this case there is a mismatch when we
                # call self._grouper.take(obj.index) so we need to undo the sorting
                # before we call _grouper.take.
                assert self._grouper is not None
                if self._indexer is not None:
                    reverse_indexer = self._indexer.argsort()
                    unsorted_ax = self._grouper.take(reverse_indexer)
                    ax = unsorted_ax.take(obj.index)
                else:
                    ax = self._grouper.take(obj.index)
            else:
                if key not in obj._info_axis:
                    raise KeyError(f"The grouper name {key} is not found")
                ax = Index(obj[key], name=key)

        else:
            # No key: group on the axis itself, optionally narrowed to one level.
            ax = obj._get_axis(self.axis)
            if self.level is not None:
                level = self.level

                # if a level is given it must be a mi level or
                # equivalent to the axis name
                if isinstance(ax, MultiIndex):
                    level = ax._get_level_number(level)
                    ax = Index(ax._get_level_values(level), name=ax.names[level])

                else:
                    if level not in (0, ax.name):
                        raise ValueError(f"The level {level} is not valid")

        # possibly sort; obj is re-sorted alongside ax and the sorting
        # indexer is returned so callers can undo it later.
        if (self.sort or sort) and not ax.is_monotonic_increasing:
            # use stable sort to support first, last, nth
            # TODO: why does putting na_position="first" fix datetimelike cases?
            indexer = self._indexer_deprecated = ax.array.argsort(
                kind="mergesort", na_position="first"
            )
            ax = ax.take(indexer)
            obj = obj.take(indexer, axis=self.axis)

        # error: Incompatible types in assignment (expression has type
        # "NDFrameT", variable has type "None")
        self._obj_deprecated = obj  # type: ignore[assignment]
        self._gpr_index = ax
        return obj, ax, indexer

    @final
    @property
    def ax(self) -> Index:
        # Deprecated accessor for the grouping axis computed by _set_grouper.
        warnings.warn(
            f"{type(self).__name__}.ax is deprecated and will be removed in a "
            "future version. Use Resampler.ax instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        index = self._gpr_index
        if index is None:
            raise ValueError("_set_grouper must be called before ax is accessed")
        return index

    @final
    @property
    def indexer(self):
        # Deprecated accessor for the sorting indexer computed by _set_grouper.
        warnings.warn(
            f"{type(self).__name__}.indexer is deprecated and will be removed "
            "in a future version. Use Resampler.indexer instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._indexer_deprecated

    @final
    @property
    def obj(self):
        # Deprecated accessor for the (possibly re-sorted) grouped object.
        warnings.warn(
            f"{type(self).__name__}.obj is deprecated and will be removed "
            "in a future version. Use GroupBy.indexer instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._obj_deprecated

    @final
    @property
    def grouper(self):
        # Deprecated accessor for the BaseGrouper built by _get_grouper.
        warnings.warn(
            f"{type(self).__name__}.grouper is deprecated and will be removed "
            "in a future version. Use GroupBy.grouper instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._grouper_deprecated

    @final
    @property
    def groups(self):
        # Deprecated accessor; requires _get_grouper to have run first.
        warnings.warn(
            f"{type(self).__name__}.groups is deprecated and will be removed "
            "in a future version. Use GroupBy.groups instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        # error: "None" has no attribute "groups"
        return self._grouper_deprecated.groups  # type: ignore[attr-defined]

    @final
    def __repr__(self) -> str:
        # Show only the attributes from _attributes that are set (not None).
        attrs_list = (
            f"{attr_name}={repr(getattr(self, attr_name))}"
            for attr_name in self._attributes
            if getattr(self, attr_name) is not None
        )
        attrs = ", ".join(attrs_list)
        cls_name = type(self).__name__
        return f"{cls_name}({attrs})"
466
467
@final
class Grouping:
    """
    Holds the grouping information for a single key

    Parameters
    ----------
    index : Index
    grouper :
    obj : DataFrame or Series
    name : Label
    level :
    observed : bool, default False
        If we are a Categorical, use the observed values
    in_axis : if the Grouping is a column in self.obj and hence among
        Groupby.exclusions list
    dropna : bool, default True
        Whether to drop NA groups.
    uniques : Array-like, optional
        When specified, will be used for unique values. Enables including empty groups
        in the result for a BinGrouper. Must not contain duplicates.

    Attributes
    ----------
    indices : dict
        Mapping of {group -> index_list}
    codes : ndarray
        Group codes
    group_index : Index or None
        unique groups
    groups : dict
        Mapping of {group -> label_list}
    """

    _codes: npt.NDArray[np.signedinteger] | None = None
    _group_index: Index | None = None
    _all_grouper: Categorical | None
    _orig_cats: Index | None
    _index: Index

    def __init__(
        self,
        index: Index,
        grouper=None,
        obj: NDFrame | None = None,
        level=None,
        sort: bool = True,
        observed: bool = False,
        in_axis: bool = False,
        dropna: bool = True,
        uniques: ArrayLike | None = None,
    ) -> None:
        self.level = level
        self._orig_grouper = grouper
        # Normalize the raw grouper (dict, Series, list, ...) into an
        # array-like / callable aligned with ``index``.
        grouping_vector = _convert_grouper(index, grouper)
        self._all_grouper = None
        self._orig_cats = None
        self._index = index
        self._sort = sort
        self.obj = obj
        self._observed = observed
        self.in_axis = in_axis
        self._dropna = dropna
        self._uniques = uniques

        # we have a single grouper which may be a myriad of things,
        # some of which are dependent on the passing in level

        ilevel = self._ilevel
        if ilevel is not None:
            # In extant tests, the new self.grouping_vector matches
            # `index.get_level_values(ilevel)` whenever
            # mapper is None and isinstance(index, MultiIndex)
            if isinstance(index, MultiIndex):
                index_level = index.get_level_values(ilevel)
            else:
                index_level = index

            if grouping_vector is None:
                grouping_vector = index_level
            else:
                # A mapper was passed along with a level: apply it to the
                # level values.
                mapper = grouping_vector
                grouping_vector = index_level.map(mapper)

        # a passed Grouper like, directly get the grouper in the same way
        # as single grouper groupby, use the group_info to get codes
        elif isinstance(grouping_vector, Grouper):
            # get the new grouper; we already have disambiguated
            # what key/level refer to exactly, don't need to
            # check again as we have by this point converted these
            # to an actual value (rather than a pd.Grouper)
            assert self.obj is not None  # for mypy
            newgrouper, newobj = grouping_vector._get_grouper(self.obj, validate=False)
            self.obj = newobj

            if isinstance(newgrouper, ops.BinGrouper):
                # TODO: can we unwrap this and get a tighter typing
                #  for self.grouping_vector?
                grouping_vector = newgrouper
            else:
                # ops.BaseGrouper
                # TODO: 2023-02-03 no test cases with len(newgrouper.groupings) > 1.
                #  If that were to occur, would we be throwing out information?
                # error: Cannot determine type of "grouping_vector"  [has-type]
                ng = newgrouper.groupings[0].grouping_vector  # type: ignore[has-type]
                # use Index instead of ndarray so we can recover the name
                grouping_vector = Index(ng, name=newgrouper.result_index.name)

        elif not isinstance(
            grouping_vector, (Series, Index, ExtensionArray, np.ndarray)
        ):
            # no level passed
            if getattr(grouping_vector, "ndim", 1) != 1:
                t = str(type(grouping_vector))
                raise ValueError(f"Grouper for '{t}' not 1-dimensional")

            # Treat anything else (e.g. a callable or dict .get) as a mapper
            # over the index.
            grouping_vector = index.map(grouping_vector)

            if not (
                hasattr(grouping_vector, "__len__")
                and len(grouping_vector) == len(index)
            ):
                grper = pprint_thing(grouping_vector)
                errmsg = (
                    "Grouper result violates len(labels) == "
                    f"len(data)\nresult: {grper}"
                )
                raise AssertionError(errmsg)

        if isinstance(grouping_vector, np.ndarray):
            if grouping_vector.dtype.kind in ["m", "M"]:
                # if we have a date/time-like grouper, make sure that we have
                # Timestamps like
                # TODO 2022-10-08 we only have one test that gets here and
                #  values are already in nanoseconds in that case.
                grouping_vector = Series(grouping_vector).to_numpy()
        elif is_categorical_dtype(grouping_vector):
            # a passed Categorical
            self._orig_cats = grouping_vector.categories
            grouping_vector, self._all_grouper = recode_for_groupby(
                grouping_vector, sort, observed
            )

        self.grouping_vector = grouping_vector

    def __repr__(self) -> str:
        return f"Grouping({self.name})"

    def __iter__(self) -> Iterator:
        return iter(self.indices)

    @cache_readonly
    def _passed_categorical(self) -> bool:
        # True when the (possibly recoded) grouping vector is categorical.
        return is_categorical_dtype(self.grouping_vector)

    @cache_readonly
    def name(self) -> Hashable:
        """Best-effort name for this grouping, used e.g. in result indexes."""
        ilevel = self._ilevel
        if ilevel is not None:
            return self._index.names[ilevel]

        if isinstance(self._orig_grouper, (Index, Series)):
            return self._orig_grouper.name

        elif isinstance(self.grouping_vector, ops.BaseGrouper):
            return self.grouping_vector.result_index.name

        elif isinstance(self.grouping_vector, Index):
            return self.grouping_vector.name

        # otherwise we have ndarray or ExtensionArray -> no name
        return None

    @cache_readonly
    def _ilevel(self) -> int | None:
        """
        If necessary, converted index level name to index level position.
        """
        level = self.level
        if level is None:
            return None
        if not isinstance(level, int):
            index = self._index
            if level not in index.names:
                raise AssertionError(f"Level {level} not in index")
            return index.names.index(level)
        return level

    @property
    def ngroups(self) -> int:
        # Number of distinct groups, including unobserved categories when
        # group_index retains them.
        return len(self.group_index)

    @cache_readonly
    def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
        # we have a list of groupers
        if isinstance(self.grouping_vector, ops.BaseGrouper):
            return self.grouping_vector.indices

        values = Categorical(self.grouping_vector)
        return values._reverse_indexer()

    @property
    def codes(self) -> npt.NDArray[np.signedinteger]:
        return self._codes_and_uniques[0]

    @cache_readonly
    def group_arraylike(self) -> ArrayLike:
        """
        Analogous to result_index, but holding an ArrayLike to ensure
        we can retain ExtensionDtypes.
        """
        if self._all_grouper is not None:
            # retain dtype for categories, including unobserved ones
            return self.result_index._values

        elif self._passed_categorical:
            return self.group_index._values

        return self._codes_and_uniques[1]

    @cache_readonly
    def result_index(self) -> Index:
        # result_index retains dtype for categories, including unobserved ones,
        #  which group_index does not
        if self._all_grouper is not None:
            group_idx = self.group_index
            assert isinstance(group_idx, CategoricalIndex)
            cats = self._orig_cats
            # set_categories is dynamically added
            return group_idx.set_categories(cats)  # type: ignore[attr-defined]
        return self.group_index

    @cache_readonly
    def group_index(self) -> Index:
        codes, uniques = self._codes_and_uniques
        if not self._dropna and self._passed_categorical:
            assert isinstance(uniques, Categorical)
            if self._sort and (codes == len(uniques)).any():
                # Add NA value on the end when sorting
                uniques = Categorical.from_codes(
                    np.append(uniques.codes, [-1]), uniques.categories
                )
            elif len(codes) > 0:
                # Need to determine proper placement of NA value when not sorting
                cat = self.grouping_vector
                na_idx = (cat.codes < 0).argmax()
                if cat.codes[na_idx] < 0:
                    # count number of unique codes that comes before the nan value
                    na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx])
                    uniques = Categorical.from_codes(
                        np.insert(uniques.codes, na_unique_idx, -1), uniques.categories
                    )
        return Index._with_infer(uniques, name=self.name)

    @cache_readonly
    def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
        """Factorize the grouping vector into (codes, unique values)."""
        uniques: ArrayLike
        if self._passed_categorical:
            # we make a CategoricalIndex out of the cat grouper
            # preserving the categories / ordered attributes;
            # doesn't (yet - GH#46909) handle dropna=False
            cat = self.grouping_vector
            categories = cat.categories

            if self._observed:
                ucodes = algorithms.unique1d(cat.codes)
                ucodes = ucodes[ucodes != -1]
                if self._sort:
                    ucodes = np.sort(ucodes)
            else:
                ucodes = np.arange(len(categories))

            uniques = Categorical.from_codes(
                codes=ucodes, categories=categories, ordered=cat.ordered
            )

            codes = cat.codes
            if not self._dropna:
                na_mask = codes < 0
                if np.any(na_mask):
                    if self._sort:
                        # Replace NA codes with `largest code + 1`
                        na_code = len(categories)
                        codes = np.where(na_mask, na_code, codes)
                    else:
                        # Insert NA code into the codes based on first appearance
                        # A negative code must exist, no need to check codes[na_idx] < 0
                        na_idx = na_mask.argmax()
                        # count number of unique codes that comes before the nan value
                        na_code = algorithms.nunique_ints(codes[:na_idx])
                        # Shift codes >= na_code up by one to make room for the
                        # NA code, then assign it to the NA positions.
                        codes = np.where(codes >= na_code, codes + 1, codes)
                        codes = np.where(na_mask, na_code, codes)

            if not self._observed:
                uniques = uniques.reorder_categories(self._orig_cats)

            return codes, uniques

        elif isinstance(self.grouping_vector, ops.BaseGrouper):
            # we have a list of groupers
            codes = self.grouping_vector.codes_info
            uniques = self.grouping_vector.result_index._values
        elif self._uniques is not None:
            # GH#50486 Code grouping_vector using _uniques; allows
            # including uniques that are not present in grouping_vector.
            cat = Categorical(self.grouping_vector, categories=self._uniques)
            codes = cat.codes
            uniques = self._uniques
        else:
            # GH35667, replace dropna=False with use_na_sentinel=False
            # error: Incompatible types in assignment (expression has type "Union[
            # ndarray[Any, Any], Index]", variable has type "Categorical")
            codes, uniques = algorithms.factorize(  # type: ignore[assignment]
                self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna
            )
        return codes, uniques

    @cache_readonly
    def groups(self) -> dict[Hashable, np.ndarray]:
        return self._index.groupby(Categorical.from_codes(self.codes, self.group_index))
788
789
def get_grouper(
    obj: NDFrameT,
    key=None,
    axis: Axis = 0,
    level=None,
    sort: bool = True,
    observed: bool = False,
    validate: bool = True,
    dropna: bool = True,
) -> tuple[ops.BaseGrouper, frozenset[Hashable], NDFrameT]:
    """
    Create and return a BaseGrouper, which is an internal
    mapping of how to create the grouper indexers.
    This may be composed of multiple Grouping objects, indicating
    multiple groupers

    Groupers are ultimately index mappings. They can originate as:
    index mappings, keys to columns, functions, or Groupers

    Groupers enable local references to axis,level,sort, while
    the passed in axis, level, and sort are 'global'.

    This routine tries to figure out what the passing in references
    are and then creates a Grouping for each one, combined into
    a BaseGrouper.

    If observed & we have a categorical grouper, only show the observed
    values.

    If validate, then check for key/level overlaps.

    Parameters
    ----------
    obj : Series or DataFrame
        The object being grouped.
    key : label, list of labels, Grouper, BaseGrouper, array-like, or None
        The groupby specification.
    axis : int or str, default 0
        Axis of ``obj`` to group along.
    level : level name/number or list thereof, default None
        Index level(s) to group by.
    sort : bool, default True
        Whether the resulting groupings should be sorted.
    observed : bool, default False
        For categorical groupers, only show observed values.
    validate : bool, default True
        Whether to check for key/level ambiguity.
    dropna : bool, default True
        Whether NA group keys are dropped.

    Returns
    -------
    tuple
        (BaseGrouper, frozenset of excluded labels, obj) where ``obj`` may
        have been replaced/re-sorted by a passed Grouper.
    """
    group_axis = obj._get_axis(axis)

    # validate that the passed single level is compatible with the passed
    # axis of the object
    if level is not None:
        # TODO: These if-block and else-block are almost same.
        #  MultiIndex instance check is removable, but it seems that there are
        #  some processes only for non-MultiIndex in else-block,
        #  eg. `obj.index.name != level`. We have to consider carefully whether
        #  these are applicable for MultiIndex. Even if these are applicable,
        #  we need to check if it makes no side effect to subsequent processes
        #  on the outside of this condition.
        #  (GH 17621)
        if isinstance(group_axis, MultiIndex):
            if is_list_like(level) and len(level) == 1:
                level = level[0]

            if key is None and is_scalar(level):
                # Get the level values from group_axis
                key = group_axis.get_level_values(level)
                level = None

        else:
            # allow level to be a length-one list-like object
            # (e.g., level=[0])
            # GH 13901
            if is_list_like(level):
                nlevels = len(level)
                if nlevels == 1:
                    level = level[0]
                elif nlevels == 0:
                    raise ValueError("No group keys passed!")
                else:
                    raise ValueError("multiple levels only valid with MultiIndex")

            if isinstance(level, str):
                if obj._get_axis(axis).name != level:
                    raise ValueError(
                        f"level name {level} is not the name "
                        f"of the {obj._get_axis_name(axis)}"
                    )
            elif level > 0 or level < -1:
                raise ValueError("level > 0 or level < -1 only valid with MultiIndex")

            # NOTE: `group_axis` and `group_axis.get_level_values(level)`
            # are same in this section.
            level = None
            key = group_axis

    # a passed-in Grouper, directly convert
    if isinstance(key, Grouper):
        grouper, obj = key._get_grouper(obj, validate=False)
        if key.key is None:
            return grouper, frozenset(), obj
        else:
            return grouper, frozenset({key.key}), obj

    # already have a BaseGrouper, just return it
    elif isinstance(key, ops.BaseGrouper):
        return key, frozenset(), obj

    if not isinstance(key, list):
        keys = [key]
        match_axis_length = False
    else:
        keys = key
        match_axis_length = len(keys) == len(group_axis)

    # what are we after, exactly?
    any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
    any_groupers = any(isinstance(g, (Grouper, Grouping)) for g in keys)
    any_arraylike = any(
        isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys
    )

    # is this an index replacement?
    if (
        not any_callable
        and not any_arraylike
        and not any_groupers
        and match_axis_length
        and level is None
    ):
        if isinstance(obj, DataFrame):
            all_in_columns_index = all(
                g in obj.columns or g in obj.index.names for g in keys
            )
        else:
            assert isinstance(obj, Series)
            all_in_columns_index = all(g in obj.index.names for g in keys)

        if not all_in_columns_index:
            # Treat the list itself as a single array-like grouper rather
            # than a list of keys.
            keys = [com.asarray_tuplesafe(keys)]

    # Pair each key with a level so they can be zipped below.
    if isinstance(level, (tuple, list)):
        if key is None:
            keys = [None] * len(level)
        levels = level
    else:
        levels = [level] * len(keys)

    groupings: list[Grouping] = []
    exclusions: set[Hashable] = set()

    # if the actual grouper should be obj[key]
    def is_in_axis(key) -> bool:
        if not _is_label_like(key):
            if obj.ndim == 1:
                return False

            # items -> .columns for DataFrame, .index for Series
            items = obj.axes[-1]
            try:
                items.get_loc(key)
            except (KeyError, TypeError, InvalidIndexError):
                # TypeError shows up here if we pass e.g. an Index
                return False

        return True

    # if the grouper is obj[name]
    def is_in_obj(gpr) -> bool:
        if not hasattr(gpr, "name"):
            return False
        if using_copy_on_write():
            # For the CoW case, we check the references to determine if the
            # series is part of the object
            try:
                obj_gpr_column = obj[gpr.name]
            except (KeyError, IndexError, InvalidIndexError):
                return False
            if isinstance(gpr, Series) and isinstance(obj_gpr_column, Series):
                return gpr._mgr.references_same_values(  # type: ignore[union-attr]
                    obj_gpr_column._mgr, 0  # type: ignore[arg-type]
                )
            return False
        try:
            return gpr is obj[gpr.name]
        except (KeyError, IndexError, InvalidIndexError):
            # IndexError reached in e.g. test_skip_group_keys when we pass
            #  lambda here
            # InvalidIndexError raised on key-types inappropriate for index,
            #  e.g. DatetimeIndex.get_loc(tuple())
            return False

    for gpr, level in zip(keys, levels):
        if is_in_obj(gpr):  # df.groupby(df['name'])
            in_axis = True
            exclusions.add(gpr.name)

        elif is_in_axis(gpr):  # df.groupby('name')
            if obj.ndim != 1 and gpr in obj:
                if validate:
                    obj._check_label_or_level_ambiguity(gpr, axis=axis)
                in_axis, name, gpr = True, gpr, obj[gpr]
                if gpr.ndim != 1:
                    # non-unique columns; raise here to get the name in the
                    # exception message
                    raise ValueError(f"Grouper for '{name}' not 1-dimensional")
                exclusions.add(name)
            elif obj._is_level_reference(gpr, axis=axis):
                # The label refers to an index level, not a column.
                in_axis, level, gpr = False, gpr, None
            else:
                raise KeyError(gpr)
        elif isinstance(gpr, Grouper) and gpr.key is not None:
            # Add key to exclusions
            exclusions.add(gpr.key)
            in_axis = True
        else:
            in_axis = False

        # create the Grouping
        # allow us to passing the actual Grouping as the gpr
        ping = (
            Grouping(
                group_axis,
                gpr,
                obj=obj,
                level=level,
                sort=sort,
                observed=observed,
                in_axis=in_axis,
                dropna=dropna,
            )
            if not isinstance(gpr, Grouping)
            else gpr
        )

        groupings.append(ping)

    if len(groupings) == 0 and len(obj):
        raise ValueError("No group keys passed!")
    if len(groupings) == 0:
        # Grouping an empty object: use a single empty Grouping.
        groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp)))

    # create the internals grouper
    grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, dropna=dropna)
    return grouper, frozenset(exclusions), obj
1020
1021
1022def _is_label_like(val) -> bool:
1023 return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val))
1024
1025
1026def _convert_grouper(axis: Index, grouper):
1027 if isinstance(grouper, dict):
1028 return grouper.get
1029 elif isinstance(grouper, Series):
1030 if grouper.index.equals(axis):
1031 return grouper._values
1032 else:
1033 return grouper.reindex(axis)._values
1034 elif isinstance(grouper, MultiIndex):
1035 return grouper._values
1036 elif isinstance(grouper, (list, tuple, Index, Categorical, np.ndarray)):
1037 if len(grouper) != len(axis):
1038 raise ValueError("Grouper and axis must be same length")
1039
1040 if isinstance(grouper, (list, tuple)):
1041 grouper = com.asarray_tuplesafe(grouper)
1042 return grouper
1043 else:
1044 return grouper