1"""
2Provide user facing operators for doing the split part of the
3split-apply-combine paradigm.
4"""
from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    final,
)
import warnings

import numpy as np

from pandas._config import (
    using_copy_on_write,
    warn_copy_on_write,
)

from pandas._libs import lib
from pandas._libs.tslibs import OutOfBoundsDatetime
from pandas.errors import InvalidIndexError
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import (
    is_list_like,
    is_scalar,
)
from pandas.core.dtypes.dtypes import CategoricalDtype

from pandas.core import algorithms
from pandas.core.arrays import (
    Categorical,
    ExtensionArray,
)
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.groupby import ops
from pandas.core.groupby.categorical import recode_for_groupby
from pandas.core.indexes.api import (
    CategoricalIndex,
    Index,
    MultiIndex,
)
from pandas.core.series import Series

from pandas.io.formats.printing import pprint_thing

if TYPE_CHECKING:
    from collections.abc import (
        Hashable,
        Iterator,
    )

    from pandas._typing import (
        ArrayLike,
        Axis,
        NDFrameT,
        npt,
    )

    from pandas.core.generic import NDFrame


class Grouper:
    """
    A Grouper allows the user to specify a groupby instruction for an object.

    This specification will select a column via the key parameter, or if the
    level and/or axis parameters are given, a level of the index of the target
    object.

    If `axis` and/or `level` are passed as keywords to both `Grouper` and
    `groupby`, the values passed to `Grouper` take precedence.

    Parameters
    ----------
    key : str, defaults to None
        Groupby key, which selects the grouping column of the target.
    level : name/number, defaults to None
        The level for the target index.
    freq : str / frequency object, defaults to None
        This will group by the specified frequency if the target selection
        (via key or level) is a datetime-like object. For full specification
        of available frequencies, please see `here
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_.
    axis : str, int, defaults to 0
        Number/name of the axis.
    sort : bool, default False
        Whether to sort the resulting labels.
    closed : {'left', 'right'}
        Closed end of interval. Only applies when the `freq` parameter is
        passed.
    label : {'left', 'right'}
        Interval boundary to use for labeling. Only applies when the `freq`
        parameter is passed.
    convention : {'start', 'end', 'e', 's'}
        Only applies if the grouper is a PeriodIndex and the `freq` parameter
        is passed.

    origin : Timestamp or str, default 'start_day'
        The timestamp on which to adjust the grouping. The timezone of origin must
        match the timezone of the index.
        If string, must be one of the following:

        - 'epoch': `origin` is 1970-01-01
        - 'start': `origin` is the first value of the timeseries
        - 'start_day': `origin` is the first day at midnight of the timeseries

        - 'end': `origin` is the last value of the timeseries
        - 'end_day': `origin` is the ceiling midnight of the last day

        .. versionadded:: 1.3.0

    offset : Timedelta or str, default None
        An offset timedelta added to the origin.

    dropna : bool, default True
        If True, and if group keys contain NA values, NA values together with
        row/column will be dropped. If False, NA values will also be treated as
        the key in groups.

    Returns
    -------
    Grouper or pandas.api.typing.TimeGrouper
        A TimeGrouper is returned if ``freq`` is not ``None``. Otherwise, a Grouper
        is returned.

    Examples
    --------
    ``df.groupby(pd.Grouper(key="Animal"))`` is equivalent to ``df.groupby('Animal')``

    >>> df = pd.DataFrame(
    ...     {
    ...         "Animal": ["Falcon", "Parrot", "Falcon", "Falcon", "Parrot"],
    ...         "Speed": [100, 5, 200, 300, 15],
    ...     }
    ... )
    >>> df
       Animal  Speed
    0  Falcon    100
    1  Parrot      5
    2  Falcon    200
    3  Falcon    300
    4  Parrot     15
    >>> df.groupby(pd.Grouper(key="Animal")).mean()
            Speed
    Animal
    Falcon  200.0
    Parrot   10.0

    Specify a resample operation on the column 'Publish date'

    >>> df = pd.DataFrame(
    ...     {
    ...         "Publish date": [
    ...             pd.Timestamp("2000-01-02"),
    ...             pd.Timestamp("2000-01-02"),
    ...             pd.Timestamp("2000-01-09"),
    ...             pd.Timestamp("2000-01-16")
    ...         ],
    ...         "ID": [0, 1, 2, 3],
    ...         "Price": [10, 20, 30, 40]
    ...     }
    ... )
    >>> df
      Publish date  ID  Price
    0   2000-01-02   0     10
    1   2000-01-02   1     20
    2   2000-01-09   2     30
    3   2000-01-16   3     40
    >>> df.groupby(pd.Grouper(key="Publish date", freq="1W")).mean()
                   ID  Price
    Publish date
    2000-01-02    0.5   15.0
    2000-01-09    2.0   30.0
    2000-01-16    3.0   40.0

    If you want to adjust the start of the bins based on a fixed timestamp:

    >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00'
    >>> rng = pd.date_range(start, end, freq='7min')
    >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
    >>> ts
    2000-10-01 23:30:00     0
    2000-10-01 23:37:00     3
    2000-10-01 23:44:00     6
    2000-10-01 23:51:00     9
    2000-10-01 23:58:00    12
    2000-10-02 00:05:00    15
    2000-10-02 00:12:00    18
    2000-10-02 00:19:00    21
    2000-10-02 00:26:00    24
    Freq: 7min, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min')).sum()
    2000-10-01 23:14:00     0
    2000-10-01 23:31:00     9
    2000-10-01 23:48:00    21
    2000-10-02 00:05:00    54
    2000-10-02 00:22:00    24
    Freq: 17min, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min', origin='epoch')).sum()
    2000-10-01 23:18:00     0
    2000-10-01 23:35:00    18
    2000-10-01 23:52:00    27
    2000-10-02 00:09:00    39
    2000-10-02 00:26:00    24
    Freq: 17min, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum()
    2000-10-01 23:24:00     3
    2000-10-01 23:41:00    15
    2000-10-01 23:58:00    45
    2000-10-02 00:15:00    45
    Freq: 17min, dtype: int64

    If you want to adjust the start of the bins with an `offset` Timedelta, the two
    following lines are equivalent:

    >>> ts.groupby(pd.Grouper(freq='17min', origin='start')).sum()
    2000-10-01 23:30:00     9
    2000-10-01 23:47:00    21
    2000-10-02 00:04:00    54
    2000-10-02 00:21:00    24
    Freq: 17min, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min', offset='23h30min')).sum()
    2000-10-01 23:30:00     9
    2000-10-01 23:47:00    21
    2000-10-02 00:04:00    54
    2000-10-02 00:21:00    24
    Freq: 17min, dtype: int64
    To replace the use of the deprecated `base` argument, you can now use `offset`;
    in this example it is equivalent to having `base=2`:

    >>> ts.groupby(pd.Grouper(freq='17min', offset='2min')).sum()
    2000-10-01 23:16:00     0
    2000-10-01 23:33:00     9
    2000-10-01 23:50:00    36
    2000-10-02 00:07:00    39
    2000-10-02 00:24:00    24
    Freq: 17min, dtype: int64
    """

    sort: bool
    dropna: bool
    _gpr_index: Index | None
    _grouper: Index | None

    _attributes: tuple[str, ...] = ("key", "level", "freq", "axis", "sort", "dropna")

    def __new__(cls, *args, **kwargs):
        if kwargs.get("freq") is not None:
            from pandas.core.resample import TimeGrouper

            cls = TimeGrouper
        return super().__new__(cls)
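
    # Hedged illustration (comments only; nothing here runs at import time):
    # when ``freq`` is given, construction is rerouted to the TimeGrouper
    # subclass, per the Returns section of the class docstring, e.g.
    #
    #   >>> type(pd.Grouper(freq="1D")).__name__   # doctest: +SKIP
    #   'TimeGrouper'
    #   >>> type(pd.Grouper(key="A")).__name__     # doctest: +SKIP
    #   'Grouper'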

    def __init__(
        self,
        key=None,
        level=None,
        freq=None,
        axis: Axis | lib.NoDefault = lib.no_default,
        sort: bool = False,
        dropna: bool = True,
    ) -> None:
        if type(self) is Grouper:
            # i.e. not TimeGrouper
            if axis is not lib.no_default:
                warnings.warn(
                    "Grouper axis keyword is deprecated and will be removed in a "
                    "future version. To group on axis=1, use obj.T.groupby(...) "
                    "instead",
                    FutureWarning,
                    stacklevel=find_stack_level(),
                )
            else:
                axis = 0
        if axis is lib.no_default:
            axis = 0

        self.key = key
        self.level = level
        self.freq = freq
        self.axis = axis
        self.sort = sort
        self.dropna = dropna

        self._grouper_deprecated = None
        self._indexer_deprecated: npt.NDArray[np.intp] | None = None
        self._obj_deprecated = None
        self._gpr_index = None
        self.binner = None
        self._grouper = None
        self._indexer: npt.NDArray[np.intp] | None = None

    def _get_grouper(
        self, obj: NDFrameT, validate: bool = True
    ) -> tuple[ops.BaseGrouper, NDFrameT]:
        """
        Parameters
        ----------
        obj : Series or DataFrame
        validate : bool, default True
            if True, validate the grouper

        Returns
        -------
        a tuple of grouper, obj (possibly sorted)
        """
        obj, _, _ = self._set_grouper(obj)
        grouper, _, obj = get_grouper(
            obj,
            [self.key],
            axis=self.axis,
            level=self.level,
            sort=self.sort,
            validate=validate,
            dropna=self.dropna,
        )
        # Without setting this, subsequent lookups to .groups raise
        # error: Incompatible types in assignment (expression has type "BaseGrouper",
        # variable has type "None")
        self._grouper_deprecated = grouper  # type: ignore[assignment]

        return grouper, obj

    def _set_grouper(
        self, obj: NDFrameT, sort: bool = False, *, gpr_index: Index | None = None
    ) -> tuple[NDFrameT, Index, npt.NDArray[np.intp] | None]:
        """
        Given an object and the specifications, set up the internal grouper
        for this particular specification.

        Parameters
        ----------
        obj : Series or DataFrame
        sort : bool, default False
            whether the resulting grouper should be sorted
        gpr_index : Index or None, default None

        Returns
        -------
        NDFrame
        Index
        np.ndarray[np.intp] | None
        """
        assert obj is not None

        if self.key is not None and self.level is not None:
            raise ValueError("The Grouper cannot specify both a key and a level!")

        # Keep self._grouper value before overriding
        if self._grouper is None:
            # TODO: What are we assuming about subsequent calls?
            self._grouper = gpr_index
            self._indexer = self._indexer_deprecated

        # the key must be a valid info item
        if self.key is not None:
            key = self.key
            # The 'on' is already defined
            if getattr(gpr_index, "name", None) == key and isinstance(obj, Series):
                # Sometimes self._grouper will have been resorted while
                # obj has not. In this case there is a mismatch when we
                # call self._grouper.take(obj.index) so we need to undo the sorting
                # before we call _grouper.take.
                assert self._grouper is not None
                if self._indexer is not None:
                    reverse_indexer = self._indexer.argsort()
                    unsorted_ax = self._grouper.take(reverse_indexer)
                    ax = unsorted_ax.take(obj.index)
                else:
                    ax = self._grouper.take(obj.index)
            else:
                if key not in obj._info_axis:
                    raise KeyError(f"The grouper name {key} is not found")
                ax = Index(obj[key], name=key)

        else:
            ax = obj._get_axis(self.axis)
            if self.level is not None:
                level = self.level

                # if a level is given it must be a mi level or
                # equivalent to the axis name
                if isinstance(ax, MultiIndex):
                    level = ax._get_level_number(level)
                    ax = Index(ax._get_level_values(level), name=ax.names[level])

                else:
                    if level not in (0, ax.name):
                        raise ValueError(f"The level {level} is not valid")

        # possibly sort
        indexer: npt.NDArray[np.intp] | None = None
        if (self.sort or sort) and not ax.is_monotonic_increasing:
            # use stable sort to support first, last, nth
            # TODO: why does putting na_position="first" fix datetimelike cases?
            indexer = self._indexer_deprecated = ax.array.argsort(
                kind="mergesort", na_position="first"
            )
            ax = ax.take(indexer)
            obj = obj.take(indexer, axis=self.axis)

        # error: Incompatible types in assignment (expression has type
        # "NDFrameT", variable has type "None")
        self._obj_deprecated = obj  # type: ignore[assignment]
        self._gpr_index = ax
        return obj, ax, indexer

    @final
    @property
    def ax(self) -> Index:
        warnings.warn(
            f"{type(self).__name__}.ax is deprecated and will be removed in a "
            "future version. Use Resampler.ax instead",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        index = self._gpr_index
        if index is None:
            raise ValueError("_set_grouper must be called before ax is accessed")
        return index

    @final
    @property
    def indexer(self):
        warnings.warn(
            f"{type(self).__name__}.indexer is deprecated and will be removed "
            "in a future version. Use Resampler.indexer instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._indexer_deprecated

    @final
    @property
    def obj(self):
        # TODO(3.0): enforcing these deprecations on Grouper should close
        # GH#25564, GH#41930
        warnings.warn(
            f"{type(self).__name__}.obj is deprecated and will be removed "
            "in a future version. Use GroupBy.obj instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._obj_deprecated

    @final
    @property
    def grouper(self):
        warnings.warn(
            f"{type(self).__name__}.grouper is deprecated and will be removed "
            "in a future version. Use GroupBy.grouper instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._grouper_deprecated

    @final
    @property
    def groups(self):
        warnings.warn(
            f"{type(self).__name__}.groups is deprecated and will be removed "
            "in a future version. Use GroupBy.groups instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        # error: "None" has no attribute "groups"
        return self._grouper_deprecated.groups  # type: ignore[attr-defined]

    @final
    def __repr__(self) -> str:
        attrs_list = (
            f"{attr_name}={repr(getattr(self, attr_name))}"
            for attr_name in self._attributes
            if getattr(self, attr_name) is not None
        )
        attrs = ", ".join(attrs_list)
        cls_name = type(self).__name__
        return f"{cls_name}({attrs})"


@final
class Grouping:
    """
    Holds the grouping information for a single key.

    Parameters
    ----------
    index : Index
    grouper :
        The grouping object; may be a label, mapping, function, Series,
        Index, array-like, or Grouper.
    obj : DataFrame or Series
    name : Label
    level :
        If the grouper refers to an index level, the name or number of
        that level.
    observed : bool, default False
        If we are a Categorical, use the observed values.
    in_axis : bool
        Whether the Grouping is a column in self.obj and hence among
        GroupBy.exclusions.
    dropna : bool, default True
        Whether to drop NA groups.
    uniques : Array-like, optional
        When specified, will be used for unique values. Enables including empty groups
        in the result for a BinGrouper. Must not contain duplicates.

    Attributes
    ----------
    indices : dict
        Mapping of {group -> index_list}
    codes : ndarray
        Group codes
    group_index : Index or None
        Unique groups
    groups : dict
        Mapping of {group -> label_list}
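
    Examples
    --------
    A minimal, hedged sketch of the (internal) behavior; ``Grouping`` is not
    public API and the exact outputs are implementation details, hence the
    skips:

    >>> idx = pd.Index(["a", "b", "a"], name="key")
    >>> ping = Grouping(idx, idx)
    >>> ping.name  # doctest: +SKIP
    'key'
    >>> ping.indices  # doctest: +SKIP
    {'a': array([0, 2]), 'b': array([1])}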
520 """
521
522 _codes: npt.NDArray[np.signedinteger] | None = None
523 _all_grouper: Categorical | None
524 _orig_cats: Index | None
525 _index: Index
526
527 def __init__(
528 self,
529 index: Index,
530 grouper=None,
531 obj: NDFrame | None = None,
532 level=None,
533 sort: bool = True,
534 observed: bool = False,
535 in_axis: bool = False,
536 dropna: bool = True,
537 uniques: ArrayLike | None = None,
538 ) -> None:
539 self.level = level
540 self._orig_grouper = grouper
541 grouping_vector = _convert_grouper(index, grouper)
542 self._all_grouper = None
543 self._orig_cats = None
544 self._index = index
545 self._sort = sort
546 self.obj = obj
547 self._observed = observed
548 self.in_axis = in_axis
549 self._dropna = dropna
550 self._uniques = uniques
551
552 # we have a single grouper which may be a myriad of things,
553 # some of which are dependent on the passing in level

        ilevel = self._ilevel
        if ilevel is not None:
            # In extant tests, the new self.grouping_vector matches
            # `index.get_level_values(ilevel)` whenever
            # mapper is None and isinstance(index, MultiIndex)
            if isinstance(index, MultiIndex):
                index_level = index.get_level_values(ilevel)
            else:
                index_level = index

            if grouping_vector is None:
                grouping_vector = index_level
            else:
                mapper = grouping_vector
                grouping_vector = index_level.map(mapper)

        # a passed Grouper-like: get the grouper directly, in the same way as
        # a single-grouper groupby; use the group_info to get codes
        elif isinstance(grouping_vector, Grouper):
            # get the new grouper; we already have disambiguated
            # what key/level refer to exactly, don't need to
            # check again as we have by this point converted these
            # to an actual value (rather than a pd.Grouper)
            assert self.obj is not None  # for mypy
            newgrouper, newobj = grouping_vector._get_grouper(self.obj, validate=False)
            self.obj = newobj

            if isinstance(newgrouper, ops.BinGrouper):
                # TODO: can we unwrap this and get a tighter typing
                # for self.grouping_vector?
                grouping_vector = newgrouper
            else:
                # ops.BaseGrouper
                # TODO: 2023-02-03 no test cases with len(newgrouper.groupings) > 1.
                # If that were to occur, would we be throwing out information?
                # error: Cannot determine type of "grouping_vector" [has-type]
                ng = newgrouper.groupings[0].grouping_vector  # type: ignore[has-type]
                # use Index instead of ndarray so we can recover the name
                grouping_vector = Index(ng, name=newgrouper.result_index.name)

        elif not isinstance(
            grouping_vector, (Series, Index, ExtensionArray, np.ndarray)
        ):
            # no level passed
            if getattr(grouping_vector, "ndim", 1) != 1:
                t = str(type(grouping_vector))
                raise ValueError(f"Grouper for '{t}' not 1-dimensional")

            grouping_vector = index.map(grouping_vector)

            if not (
                hasattr(grouping_vector, "__len__")
                and len(grouping_vector) == len(index)
            ):
                grper = pprint_thing(grouping_vector)
                errmsg = (
                    "Grouper result violates len(labels) == "
                    f"len(data)\nresult: {grper}"
                )
                raise AssertionError(errmsg)

        if isinstance(grouping_vector, np.ndarray):
            if grouping_vector.dtype.kind in "mM":
                # if we have a date/time-like grouper, make sure that we have
                # Timestamp-like values
                # TODO 2022-10-08 we only have one test that gets here and
                # values are already in nanoseconds in that case.
                grouping_vector = Series(grouping_vector).to_numpy()
        elif isinstance(getattr(grouping_vector, "dtype", None), CategoricalDtype):
            # a passed Categorical
            self._orig_cats = grouping_vector.categories
            grouping_vector, self._all_grouper = recode_for_groupby(
                grouping_vector, sort, observed
            )

        self.grouping_vector = grouping_vector

    def __repr__(self) -> str:
        return f"Grouping({self.name})"

    def __iter__(self) -> Iterator:
        return iter(self.indices)

    @cache_readonly
    def _passed_categorical(self) -> bool:
        dtype = getattr(self.grouping_vector, "dtype", None)
        return isinstance(dtype, CategoricalDtype)

    @cache_readonly
    def name(self) -> Hashable:
        ilevel = self._ilevel
        if ilevel is not None:
            return self._index.names[ilevel]

        if isinstance(self._orig_grouper, (Index, Series)):
            return self._orig_grouper.name

        elif isinstance(self.grouping_vector, ops.BaseGrouper):
            return self.grouping_vector.result_index.name

        elif isinstance(self.grouping_vector, Index):
            return self.grouping_vector.name

        # otherwise we have ndarray or ExtensionArray -> no name
        return None

    @cache_readonly
    def _ilevel(self) -> int | None:
        """
        If necessary, convert index level name to index level position.
665 """
666 level = self.level
667 if level is None:
668 return None
669 if not isinstance(level, int):
670 index = self._index
671 if level not in index.names:
672 raise AssertionError(f"Level {level} not in index")
673 return index.names.index(level)
674 return level
675
676 @property
677 def ngroups(self) -> int:
678 return len(self._group_index)
679
680 @cache_readonly
681 def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
682 # we have a list of groupers
683 if isinstance(self.grouping_vector, ops.BaseGrouper):
684 return self.grouping_vector.indices
685
686 values = Categorical(self.grouping_vector)
687 return values._reverse_indexer()
688
689 @property
690 def codes(self) -> npt.NDArray[np.signedinteger]:
691 return self._codes_and_uniques[0]
692
693 @cache_readonly
694 def _group_arraylike(self) -> ArrayLike:
695 """
696 Analogous to result_index, but holding an ArrayLike to ensure
697 we can retain ExtensionDtypes.
698 """
699 if self._all_grouper is not None:
700 # retain dtype for categories, including unobserved ones
701 return self._result_index._values
702
703 elif self._passed_categorical:
704 return self._group_index._values
705
706 return self._codes_and_uniques[1]
707
708 @property
709 def group_arraylike(self) -> ArrayLike:
710 """
711 Analogous to result_index, but holding an ArrayLike to ensure
712 we can retain ExtensionDtypes.
713 """
714 warnings.warn(
715 "group_arraylike is deprecated and will be removed in a future "
716 "version of pandas",
717 category=FutureWarning,
718 stacklevel=find_stack_level(),
719 )
720 return self._group_arraylike
721
722 @cache_readonly
723 def _result_index(self) -> Index:
724 # result_index retains dtype for categories, including unobserved ones,
725 # which group_index does not
726 if self._all_grouper is not None:
727 group_idx = self._group_index
728 assert isinstance(group_idx, CategoricalIndex)
729 cats = self._orig_cats
730 # set_categories is dynamically added
731 return group_idx.set_categories(cats) # type: ignore[attr-defined]
732 return self._group_index
733
734 @property
735 def result_index(self) -> Index:
736 warnings.warn(
737 "result_index is deprecated and will be removed in a future "
738 "version of pandas",
739 category=FutureWarning,
740 stacklevel=find_stack_level(),
741 )
742 return self._result_index
743
744 @cache_readonly
745 def _group_index(self) -> Index:
746 codes, uniques = self._codes_and_uniques
747 if not self._dropna and self._passed_categorical:
748 assert isinstance(uniques, Categorical)
749 if self._sort and (codes == len(uniques)).any():
750 # Add NA value on the end when sorting
751 uniques = Categorical.from_codes(
752 np.append(uniques.codes, [-1]), uniques.categories, validate=False
753 )
754 elif len(codes) > 0:
755 # Need to determine proper placement of NA value when not sorting
756 cat = self.grouping_vector
757 na_idx = (cat.codes < 0).argmax()
758 if cat.codes[na_idx] < 0:
                    # count number of unique codes that come before the nan value
                    na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx])
                    new_codes = np.insert(uniques.codes, na_unique_idx, -1)
                    uniques = Categorical.from_codes(
                        new_codes, uniques.categories, validate=False
                    )
        return Index._with_infer(uniques, name=self.name)

    @property
    def group_index(self) -> Index:
        warnings.warn(
            "group_index is deprecated and will be removed in a future "
            "version of pandas",
            category=FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._group_index

    @cache_readonly
    def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
        uniques: ArrayLike
        if self._passed_categorical:
            # we make a CategoricalIndex out of the cat grouper
            # preserving the categories / ordered attributes;
            # doesn't (yet - GH#46909) handle dropna=False
            cat = self.grouping_vector
            categories = cat.categories

            if self._observed:
                ucodes = algorithms.unique1d(cat.codes)
                ucodes = ucodes[ucodes != -1]
                if self._sort:
                    ucodes = np.sort(ucodes)
            else:
                ucodes = np.arange(len(categories))

            uniques = Categorical.from_codes(
                codes=ucodes, categories=categories, ordered=cat.ordered, validate=False
            )

            codes = cat.codes
            if not self._dropna:
                na_mask = codes < 0
                if np.any(na_mask):
                    if self._sort:
                        # Replace NA codes with `largest code + 1`
                        na_code = len(categories)
                        codes = np.where(na_mask, na_code, codes)
                    else:
                        # Insert NA code into the codes based on first appearance
                        # A negative code must exist, no need to check codes[na_idx] < 0
                        na_idx = na_mask.argmax()
                        # count number of unique codes that come before the nan value
                        na_code = algorithms.nunique_ints(codes[:na_idx])
                        codes = np.where(codes >= na_code, codes + 1, codes)
                        codes = np.where(na_mask, na_code, codes)
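                        # Worked example (hedged sketch, not taken from the
                        # tests): with 2 categories and codes [1, -1, 0],
                        # na_idx = 1 and na_code = nunique_ints([1]) = 1;
                        # shifting codes >= 1 gives [2, -1, 0] and filling the
                        # NA slot gives [2, 1, 0], so the NA group's code
                        # reflects its first appearance.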

            if not self._observed:
                uniques = uniques.reorder_categories(self._orig_cats)

            return codes, uniques

        elif isinstance(self.grouping_vector, ops.BaseGrouper):
            # we have a list of groupers
            codes = self.grouping_vector.codes_info
            uniques = self.grouping_vector.result_index._values
        elif self._uniques is not None:
            # GH#50486 Code grouping_vector using _uniques; allows
            # including uniques that are not present in grouping_vector.
            cat = Categorical(self.grouping_vector, categories=self._uniques)
            codes = cat.codes
            uniques = self._uniques
        else:
            # GH35667, replace dropna=False with use_na_sentinel=False
            # error: Incompatible types in assignment (expression has type "Union[
            # ndarray[Any, Any], Index]", variable has type "Categorical")
            codes, uniques = algorithms.factorize(  # type: ignore[assignment]
                self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna
            )
        return codes, uniques

    @cache_readonly
    def groups(self) -> dict[Hashable, np.ndarray]:
        cats = Categorical.from_codes(self.codes, self._group_index, validate=False)
        return self._index.groupby(cats)


def get_grouper(
    obj: NDFrameT,
    key=None,
    axis: Axis = 0,
    level=None,
    sort: bool = True,
    observed: bool = False,
    validate: bool = True,
    dropna: bool = True,
) -> tuple[ops.BaseGrouper, frozenset[Hashable], NDFrameT]:
856 """
857 Create and return a BaseGrouper, which is an internal
858 mapping of how to create the grouper indexers.
859 This may be composed of multiple Grouping objects, indicating
860 multiple groupers
861
862 Groupers are ultimately index mappings. They can originate as:
863 index mappings, keys to columns, functions, or Groupers
864
865 Groupers enable local references to axis,level,sort, while
866 the passed in axis, level, and sort are 'global'.
867
868 This routine tries to figure out what the passing in references
869 are and then creates a Grouping for each one, combined into
870 a BaseGrouper.
871
872 If observed & we have a categorical grouper, only show the observed
873 values.
874
875 If validate, then check for key/level overlaps.
876
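
    Examples
    --------
    A hedged sketch of this internal helper's contract (not public API; the
    exact outputs are implementation details, hence the skips):

    >>> df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
    >>> grouper, exclusions, obj = get_grouper(df, key="a")
    >>> type(grouper).__name__  # doctest: +SKIP
    'BaseGrouper'
    >>> exclusions  # doctest: +SKIP
    frozenset({'a'})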
877 """
878 group_axis = obj._get_axis(axis)
879
880 # validate that the passed single level is compatible with the passed
881 # axis of the object
882 if level is not None:
        # TODO: This if-block and the else-block are almost the same.
        # The MultiIndex instance check is removable, but it seems that there
        # are some processes only for non-MultiIndex in the else-block,
        # e.g. `obj.index.name != level`. We have to consider carefully whether
        # these are applicable to MultiIndex. Even if they are applicable,
        # we need to check that they have no side effects on the subsequent
        # processes outside of this condition.
        # (GH 17621)
        if isinstance(group_axis, MultiIndex):
            if is_list_like(level) and len(level) == 1:
                level = level[0]

            if key is None and is_scalar(level):
                # Get the level values from group_axis
                key = group_axis.get_level_values(level)
                level = None

        else:
            # allow level to be a length-one list-like object
            # (e.g., level=[0])
            # GH 13901
            if is_list_like(level):
                nlevels = len(level)
                if nlevels == 1:
                    level = level[0]
                elif nlevels == 0:
                    raise ValueError("No group keys passed!")
                else:
                    raise ValueError("multiple levels only valid with MultiIndex")

            if isinstance(level, str):
                if obj._get_axis(axis).name != level:
                    raise ValueError(
                        f"level name {level} is not the name "
                        f"of the {obj._get_axis_name(axis)}"
                    )
            elif level > 0 or level < -1:
                raise ValueError("level > 0 or level < -1 only valid with MultiIndex")

            # NOTE: `group_axis` and `group_axis.get_level_values(level)`
            # are same in this section.
            level = None
            key = group_axis

    # a passed-in Grouper, directly convert
    if isinstance(key, Grouper):
        grouper, obj = key._get_grouper(obj, validate=False)
        if key.key is None:
            return grouper, frozenset(), obj
        else:
            return grouper, frozenset({key.key}), obj

    # already have a BaseGrouper, just return it
    elif isinstance(key, ops.BaseGrouper):
        return key, frozenset(), obj

    if not isinstance(key, list):
        keys = [key]
        match_axis_length = False
    else:
        keys = key
        match_axis_length = len(keys) == len(group_axis)

    # what are we after, exactly?
    any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
    any_groupers = any(isinstance(g, (Grouper, Grouping)) for g in keys)
    any_arraylike = any(
        isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys
    )

    # is this an index replacement?
    if (
        not any_callable
        and not any_arraylike
        and not any_groupers
        and match_axis_length
        and level is None
    ):
        if isinstance(obj, DataFrame):
            all_in_columns_index = all(
                g in obj.columns or g in obj.index.names for g in keys
            )
        else:
            assert isinstance(obj, Series)
            all_in_columns_index = all(g in obj.index.names for g in keys)

        if not all_in_columns_index:
            keys = [com.asarray_tuplesafe(keys)]
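
        # Hedged example of the branch above: for a 3-row DataFrame,
        # ``df.groupby(["x", "y", "z"])`` where none of "x", "y", "z" are
        # column or index-level names treats the list as a single array-like
        # key of three row labels, rather than as three separate keys.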

    if isinstance(level, (tuple, list)):
        if key is None:
            keys = [None] * len(level)
        levels = level
    else:
        levels = [level] * len(keys)

    groupings: list[Grouping] = []
    exclusions: set[Hashable] = set()

    # if the actual grouper should be obj[key]
    def is_in_axis(key) -> bool:
        if not _is_label_like(key):
            if obj.ndim == 1:
                return False

            # items -> .columns for DataFrame, .index for Series
            items = obj.axes[-1]
            try:
                items.get_loc(key)
            except (KeyError, TypeError, InvalidIndexError):
                # TypeError shows up here if we pass e.g. an Index
                return False

        return True

    # if the grouper is obj[name]
    def is_in_obj(gpr) -> bool:
        if not hasattr(gpr, "name"):
            return False
        if using_copy_on_write() or warn_copy_on_write():
            # For the CoW case, we check the references to determine if the
            # series is part of the object
            try:
                obj_gpr_column = obj[gpr.name]
            except (KeyError, IndexError, InvalidIndexError, OutOfBoundsDatetime):
                return False
            if isinstance(gpr, Series) and isinstance(obj_gpr_column, Series):
                return gpr._mgr.references_same_values(  # type: ignore[union-attr]
                    obj_gpr_column._mgr, 0  # type: ignore[arg-type]
                )
            return False
        try:
            return gpr is obj[gpr.name]
        except (KeyError, IndexError, InvalidIndexError, OutOfBoundsDatetime):
            # IndexError reached in e.g. test_skip_group_keys when we pass
            # lambda here
            # InvalidIndexError raised on key-types inappropriate for index,
            # e.g. DatetimeIndex.get_loc(tuple())
            # OutOfBoundsDatetime raised when obj is a Series with DatetimeIndex
            # and gpr.name is month str
            return False

    for gpr, level in zip(keys, levels):
        if is_in_obj(gpr):  # df.groupby(df['name'])
            in_axis = True
            exclusions.add(gpr.name)

        elif is_in_axis(gpr):  # df.groupby('name')
            if obj.ndim != 1 and gpr in obj:
                if validate:
                    obj._check_label_or_level_ambiguity(gpr, axis=axis)
                in_axis, name, gpr = True, gpr, obj[gpr]
                if gpr.ndim != 1:
                    # non-unique columns; raise here to get the name in the
                    # exception message
                    raise ValueError(f"Grouper for '{name}' not 1-dimensional")
                exclusions.add(name)
            elif obj._is_level_reference(gpr, axis=axis):
                in_axis, level, gpr = False, gpr, None
            else:
                raise KeyError(gpr)
        elif isinstance(gpr, Grouper) and gpr.key is not None:
            # Add key to exclusions
            exclusions.add(gpr.key)
            in_axis = True
        else:
            in_axis = False

        # create the Grouping
        # allow passing the actual Grouping as the gpr
        ping = (
            Grouping(
                group_axis,
                gpr,
                obj=obj,
                level=level,
                sort=sort,
                observed=observed,
                in_axis=in_axis,
                dropna=dropna,
            )
            if not isinstance(gpr, Grouping)
            else gpr
        )

        groupings.append(ping)

    if len(groupings) == 0 and len(obj):
        raise ValueError("No group keys passed!")
    if len(groupings) == 0:
        groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp)))

    # create the internals grouper
    grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, dropna=dropna)
    return grouper, frozenset(exclusions), obj


def _is_label_like(val) -> bool:
    return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val))
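
# Hedged illustration (comment only): _is_label_like("a") and
# _is_label_like(("a", 1)) are True, while _is_label_like(None) and
# _is_label_like([1]) are False.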


def _convert_grouper(axis: Index, grouper):
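    # Hedged sketch of the dispatch below (comment only): a dict becomes its
    # ``.get`` method, used as a mapper; a Series contributes its values,
    # reindexed to ``axis`` when the indexes differ; list/tuple inputs are
    # converted via ``com.asarray_tuplesafe``. For example,
    # _convert_grouper(pd.Index([0, 1]), {0: "x", 1: "y"}) returns the bound
    # method ``{0: 'x', 1: 'y'}.get``.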
    if isinstance(grouper, dict):
        return grouper.get
    elif isinstance(grouper, Series):
        if grouper.index.equals(axis):
            return grouper._values
        else:
            return grouper.reindex(axis)._values
    elif isinstance(grouper, MultiIndex):
        return grouper._values
    elif isinstance(grouper, (list, tuple, Index, Categorical, np.ndarray)):
        if len(grouper) != len(axis):
            raise ValueError("Grouper and axis must be same length")

        if isinstance(grouper, (list, tuple)):
            grouper = com.asarray_tuplesafe(grouper)
        return grouper
    else:
        return grouper